Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
68 commits
Select commit Hold shift + click to select a range
5850463
feat(generation): add Dynamo URL-forwarder backend
jthomson04 May 6, 2026
3498c7c
feat(nrl-k8s): ingest DynamoGraphDeployment manifests
jthomson04 May 6, 2026
bf94213
feat(nrl-k8s): cascade DGD teardown via owner refs
jthomson04 May 6, 2026
968f882
fix(nrl-k8s): propagate python exit code past tee in entrypoint
jthomson04 May 6, 2026
9d8bb7c
fix(nrl-k8s): follow RayJob.status.rayClusterName in status
jthomson04 May 6, 2026
2e119b7
fix(nrl-k8s): don't clobber dynamo operator's per-DGD service account
jthomson04 May 6, 2026
d808850
feat(generation): support dynamo backend through nemo-gym + perf metrics
jthomson04 May 6, 2026
89f605c
chore(nemo-gym): downgrade token-contiguity assert to a warning
jthomson04 May 6, 2026
8417f02
feat(nrl-k8s): add workplace_assistant + dynamo smoke example
jthomson04 May 6, 2026
6fc9661
fix(nemo-gym): pin token_id/logprob tensor dtypes for Dynamo path
jthomson04 May 7, 2026
c89601a
fix(nemo-gym): restore strict token-contiguity assert + bump gym subm…
jthomson04 May 9, 2026
aa83745
fix(nrl-k8s): gate wait_for_dgd_ready on pods Ready + frontend serving
jthomson04 May 11, 2026
2a54f19
feat(nrl-k8s): add Nemotron-3-Nano-V3 + planner load smokes
jthomson04 May 11, 2026
982be4f
fix(nrl-k8s): apply DGDs before RayJob to close startup race
jthomson04 May 11, 2026
2190607
chore(nemo-gym): bump gym submodule to scoped required_prefix_token_i…
jthomson04 May 11, 2026
d9e4de3
feat: ModelExpress + NIXL RDMA weight-sync (v2 path) for non-colocate…
KavinKrishnan May 7, 2026
82fdff7
feat(generation/dynamo): wire ModelExpress v2 refit path
jthomson04 May 18, 2026
4da7a93
feat(nrl-k8s): ModelExpress v2 smoke recipe + DGD/infra YAMLs
jthomson04 May 18, 2026
8610812
feat(generation/dynamo): synchronous-RPC mx refit via /engine/<route>
jthomson04 May 24, 2026
e689732
chore(nrl-k8s/dynamo_mx): retire bootstrap_mx.sh; bake worker image i…
jthomson04 May 24, 2026
d1437b3
chore(nrl-k8s/dynamo_mx): Phase B UCX/dma-buf env + trainer-infra knobs
jthomson04 May 24, 2026
49aeabb
chore(nrl-k8s/dynamo_mx): drop trainer-side workarounds; bump nemo-rl…
jthomson04 May 24, 2026
09c8790
fix(nrl-k8s): make `run --rayjob` reliable end-to-end
jthomson04 May 25, 2026
acef424
chore(nrl-k8s/dynamo_mx): rebuild overlay for nemo-rl:nightly + cu13 …
jthomson04 May 25, 2026
fa577dd
fix(generation/dynamo): use master_config attribute access in our ref…
jthomson04 May 25, 2026
53d1577
chore(nrl-k8s/dynamo_mx): bump trainer image to nemo-rl-mx:05-25-v3
jthomson04 May 25, 2026
1f02ea4
chore(gym): bump submodule to post-rebase identical SHA
jthomson04 May 27, 2026
3914a79
Delete infra/nrl_k8s/dynamo_mx/DEBUGGING_POSTMORTEM.md
jthomson04 May 27, 2026
56b0953
chore(nrl-k8s/dynamo_smoke): bump image to nemo-rl-mx:05-25-v3
jthomson04 May 27, 2026
1cc4e6f
fix(generation/dynamo): support multi-GPU/multi-worker MX refit
jthomson04 Jun 2, 2026
e1b9245
feat(nrl-k8s/dynamo_mx): 16xTP2 MX weight-transfer benchmark
jthomson04 Jun 2, 2026
dba03d0
docs: add Dynamo integration design doc
jthomson04 Jun 2, 2026
f2a5734
feat(nrl-k8s/dynamo_mx): enable v2-aware MX loader for autoscaling
jthomson04 Jun 2, 2026
b869c37
Add hierarchical GlobalPlanner + GlobalRouter MX smoke (+ refit/readi…
jthomson04 Jun 3, 2026
d90e248
feat(nrl-k8s/swe2): GB300 DTensor+Dynamo SWE2 recipe + bake singulari…
jthomson04 Jun 5, 2026
bcb78e8
build(nrl-k8s/dynamo_mx): bump modelexpress MX_REF to e45a14f
jthomson04 Jun 5, 2026
59bc42d
feat(nrl-k8s/dynamo_mx): wave-parallel MX refit + 2-worker peer-pull …
jthomson04 Jun 5, 2026
21d8841
build(nrl-k8s/dynamo_mx): bump MX_REF to 3fbf7d0 for tree fan-out
jthomson04 Jun 5, 2026
d5bc95b
feat(dynamo/swe2): Dynamo→wandb metrics bridge + gen-benchmark & grac…
jthomson04 Jun 9, 2026
0440798
feat(megatron): MX v2 publisher path — stream_weights_via_mx + role c…
KavinKrishnan Jun 5, 2026
14745af
refactor(megatron-mx): use Megatron-Bridge AutoMapping registry for r…
KavinKrishnan Jun 5, 2026
a187cf1
feat(megatron-mx): publish transformer_config + Megatron→HF name map …
KavinKrishnan Jun 5, 2026
9f776c4
feat(megatron-mx): receiver-side Megatron-MX path on VllmInternalWork…
KavinKrishnan Jun 5, 2026
ff65ce4
fix(megatron-mx): use bulk receive_weights + pre_assembled_buffers in…
KavinKrishnan Jun 5, 2026
609b0dc
Megatron-MX: fix Bridge API name + normalize module. prefix
KavinKrishnan Jun 8, 2026
43692e6
Megatron-MX: implement mixed-TP receiver path (v0)
KavinKrishnan Jun 8, 2026
ee65362
Megatron-MX: classify grouped-MoE per-expert tensors
KavinKrishnan Jun 8, 2026
487f249
fix(grpo): commit wandb metric history on async path + SWE2 24-worker…
jthomson04 Jun 10, 2026
efa0631
fix(megatron-mx): classifier needs prefixed name for model-walk
KavinKrishnan Jun 10, 2026
c077eeb
feat(megatron-mx): wire v1 sliced-pull into mixed-TP receiver path
KavinKrishnan Jun 10, 2026
6abe752
feat(megatron-mx): GRPO recipe with cluster.weight_sync.method=mx
KavinKrishnan Jun 10, 2026
79e9f9e
chore(nrl-k8s): keep swe2 dynamo mx debug config
jthomson04 Jun 11, 2026
28034dc
feat(megatron-mx): GRPO recipe — Qwen3-4B Megatron + Dynamo + MX
KavinKrishnan Jun 11, 2026
45b0fc1
feat(dynamo): add direct generation k8s exemplars
jthomson04 Jun 12, 2026
1a2d3e7
Merge remote-tracking branch 'jthomson04/pr-2' into jthomson04/megatr…
jthomson04 Jun 12, 2026
aed46c2
feat(dynamo): add megatron mx k8s exemplar
jthomson04 Jun 12, 2026
1a97432
fix(dynamo): use local device for megatron mx
jthomson04 Jun 12, 2026
9283957
feat(dynamo): add nemotron nano v2 mx exemplar
jthomson04 Jun 12, 2026
f9ecad4
docs: update dynamo integration design doc
jthomson04 Jun 12, 2026
1724407
chore(dynamo): update v5 nemotron k8s images
jthomson04 Jun 12, 2026
d5e1999
chore(k8s): remove v5 dynamo overlays
jthomson04 Jun 13, 2026
51f7f29
chore(k8s): refresh v1-v3 dynamo exemplars
jthomson04 Jun 13, 2026
233cbb4
feat(dynamo): add qwen3 fp8 kv-cache mx recipe
jthomson04 Jun 21, 2026
a3c6b6f
Merge pull request #3 from jthomson04/v6-dynamo-fp8-kvcache
jthomson04 Jun 23, 2026
bbe0979
feat(k8s): add EAGLE specdec MX refit support
jthomson04 Jun 23, 2026
b25a3cb
chore(k8s): use Dynamo EAGLE refit handler
jthomson04 Jun 23, 2026
408d725
chore(k8s): keep existing Dynamo MX overlay
jthomson04 Jun 23, 2026
786aed2
Merge pull request #5 from jthomson04/qwen3-eagle3-dynamo-mx
jthomson04 Jun 23, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion 3rdparty/Gym-workspace/Gym
Submodule Gym updated 1334 files
382 changes: 382 additions & 0 deletions docs/design-docs/dynamo-integration.md

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions docs/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -294,6 +294,7 @@ design-docs/training-backends.md
design-docs/sequence-packing-and-dynamic-batching.md
design-docs/env-vars.md
design-docs/nemo-gym-integration.md
design-docs/dynamo-integration.md
```

```{toctree}
Expand Down
60 changes: 60 additions & 0 deletions examples/configs/grpo_math_1B_megatron_mx.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# GRPO Algorithm Configuration — Megatron-Core + ModelExpress v2 refit
#
# Inherits from grpo_math_1B_megatron.yaml (Qwen2.5-1.5B Megatron TP=1 EP=1
# PP=1, vLLM generation backend) and flips cluster.weight_sync.method to
# "mx" so refit cycles flow through the cluster-validated Megatron-MX
# data plane:
#
# trainer: collect_megatron_publish_set -> stream_weights_via_mx
# (publisher emits per-rank native shards + sidecar with
# transformer_config + Megatron->HF name map)
# refit: MX server catalogs the source, ready=True
# receiver: VllmInternalWorkerExtension._update_weights_via_mx_megatron
# -> v1 sliced-pull (axis-0 roles) / v0 scratch+copy (axis-1)
# -> translate Megatron->HF via vendored Bridge helpers
# -> vllm.model.load_weights()
#
# Smallest viable Megatron + MX GRPO recipe. End-to-end byte-identity of
# the Megatron path is independently validated on Qwen3-MoE-30B-A3B
# (18 867 / 18 867 HF tensors byte-identical — see PR #429 + #2). This
# recipe exercises the path inside a running GRPO loop.
#
# Companion infra YAML: infra/nrl_k8s/examples/grpo_math_1B_megatron_mx.gb200.infra.yaml

defaults: "grpo_math_1B_megatron.yaml"

grpo:
# Short bring-up run. First step: validate that one refit cycle completes
# successfully and vLLM accepts the loaded HF state-dict. Raise
# max_num_steps after that lands.
# NOTE(review): val_period equals max_num_steps and val_at_start is off,
# so validation presumably runs at most once, at the last step — confirm.
max_num_steps: 5
val_period: 5
val_at_start: false

policy:
# Megatron path stays as-is from the parent (TP=1, EP=1, PP=1).
# Both backend switches are set explicitly here: Megatron on, DTensor off.
megatron_cfg:
enabled: true
dtensor_cfg:
enabled: false

cluster:
# Flip the weight-sync method to "mx" so refit cycles go through the
# Megatron-MX data plane described in the header of this file.
weight_sync:
method: "mx"
mx_config:
enabled: true
# GB200 / kavin namespace MX server. Point at modelexpress-server.<ns>
# for other clusters.
mx_server_url: "modelexpress-server.kavin.svc.cluster.local:8001"
timeout_seconds: 300.0
# Required on multi-subnet RDMA fabrics — also fine on single-subnet
# because the trainer always publishes for its own rank.
same_rank_only: true
# Enable so each DGD receiver republishes itself as a v2
# inference_replica after refit; later-arriving replicas can pull
# from peers instead of contending on the trainer's NIC.
tree_scale_out: true
# Qwen2.5-1.5B is dense, so the MoE expert filter would be a no-op;
# it is left disabled here.
moe_expert_filter: false
# NIXL retry budget for transient NIXL_ERR_NOT_ALLOWED.
nixl_retry_budget: 3
138 changes: 138 additions & 0 deletions examples/nemo_gym/grpo_mini_swe_nemotron_v3.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
# Proof-of-life smoke: mini_swe_agent + Nemotron-3-Nano-30B-A3B-BF16 via Dynamo.
#
# Mirrors grpo_mini_swe_smoke.yaml (Qwen3-4B-Thinking) but:
# * Inherits MoE/Mamba knobs from the Nemotron-Nano-V2 9B recipe
# (workplace_assistant_nemotron_nano_v2_9b) — that's the
# canonical Nemotron-family config in the repo. V3 30B-A3B is
# also a Mamba hybrid MoE so the same knobs apply.
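# * policy.model_name is the canonical HF ID (it must match the name
# the DGD serves). NOTE(review): this bullet previously said the
# model is pinned to a local Lustre snapshot to avoid HF downloads
# on the policy worker; no such path is set in this file — confirm
# whether an overlay supplies it.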
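# * Tool/reasoning round-trip disabled (parser-side strips
# `<think>` blocks on replay, same as V2). Thinking is switched off
# via chat_template_kwargs.enable_thinking=false (mini_swe prompts
# carry no /no_think directive), which keeps reasoning out of the
# wire form so the contiguity assert doesn't fire on multi-turn
# rollouts.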
#
# Smoke shape: 1 step × 4 prompts × 2 generations = 8 rollouts.
# Minimal mini_swe load (step_limit=2, concurrency=16). Goal is
# end-to-end pipeline validation, not training quality.

defaults: grpo_workplace_assistant_nemotron_nano_v2_9b.yaml

grpo:
# Smoke shape: 4 prompts × 2 generations per step, a single step, one
# rollout turn.
num_prompts_per_step: 4
num_generations_per_prompt: 2
max_rollout_turns: 1
max_num_steps: 1
# Validation is effectively off: the period is far beyond max_num_steps
# and both the start and end validation passes are disabled.
val_period: 999999
val_at_start: false
val_at_end: false

policy:
# The DGD serves under the canonical HF ID (via --served-model-name).
# The gym uses `model_name` verbatim as the `model:` field in chat
# completion requests, so it must match the served name — otherwise
# the frontend returns 404 "model or resource not found".
# tokenizer.name interpolates ${policy.model_name}, so it resolves to
# the same HF ID.
# NOTE(review): this comment previously said tokenizer.name points at a
# local Lustre dir (HF transformers `from_pretrained` accepts a
# filesystem path) to avoid a network round-trip on the policy worker;
# that does not match the value below — confirm which is intended.
model_name: "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16"
tokenizer:
name: ${policy.model_name}
chat_template_kwargs: null
train_global_batch_size: ${mul:${grpo.num_prompts_per_step}, ${grpo.num_generations_per_prompt}}
train_micro_batch_size: 1
logprob_batch_size: 1
max_total_sequence_length: 8192
# Required when TP > 1 + sequence_parallel: pads seq len to a
# multiple of TP so the activation tensor splits cleanly across
# TP ranks. Inherited default from the parent recipe is 1 (the
# parent uses TP=2 SP=false). With TP=4 here we need 4.
make_sequence_length_divisible_by: 4

megatron_cfg:
enabled: true
# 30B MoE BF16 on one GB300 node (4 GPUs):
# * TP=4 shards both attention and expert weights → ~15GB
# weights per GPU.
# * use_distributed_optimizer=true (parent recipe) shards
# AdamW state across the 4 TP ranks → ~90GB per GPU.
# * Total per-GPU footprint ~130-150GB; GB300 has 282GB HBM,
# so we don't need CPU offload — which was the actual cause
# of the previous host-OOM: 360GB optimizer state on the
# pod's 850GiB CPU budget plus model load + Ray overhead.
# NOTE(review): the cluster section of this file sets num_nodes: 2
# (TP=4 × DP=2 = 8 GPUs), so the single-node / 4-rank figures above
# may be stale — confirm.
tensor_model_parallel_size: 4
sequence_parallel: true
expert_tensor_parallel_size: 1
expert_model_parallel_size: 1
# MegatronPolicyWorker.train reads this key at line 459 of
# megatron_policy_worker.py — not present in the V2 9B parent
# recipe, so we must set it explicitly here.
moe_per_layer_logging: false
context_parallel_size: 1
activation_checkpointing: true
empty_unused_memory_level: 2
defer_fp32_logits: true
optimizer:
# CPU offload is off, per the HBM-headroom reasoning above.
optimizer_cpu_offload: false
optimizer_offload_fraction: 0.0

generation:
# Generation is served by the Dynamo backend and is not colocated
# with training (colocated.enabled is false below).
backend: "dynamo"
max_new_tokens: 8192
dynamo_cfg:
tool_call_parser: "nemotron_deci"
reasoning_parser: "nemotron_nano"
colocated:
enabled: false

env:
nemo_gym:
# Config files layered in by path: the vllm_model training config and
# the mini_swe_agent config.
config_paths:
- responses_api_models/vllm_model/configs/vllm_model_for_training.yaml
- resources_servers/mini_swe_agent/configs/mini_swe_agent.yaml
mini_swe_simple_agent:
responses_api_agents:
mini_swe_agent:
# Minimal load for the smoke run (see the header of this file).
step_limit: 2
concurrency: 16
# NOTE(review): user-specific absolute path to per-{instance_id} .sif
# files; override for other environments.
cache_dir_template: "/mnt/rl-workspace/jothomson/swebench_containers/swebench_sweb.eval.arm64.{instance_id}.sif"
run_golden: false
policy_model:
responses_api_models:
vllm_model:
# Nemotron's chat template strips <think> blocks on
# replay; disable reasoning round-trip on the gym side
# to avoid tripping the contiguity assert. `/no_think`
# in the workplace_assistant system prompt disables
# thinking end-to-end — for mini_swe we lean on
# parser-side stripping since mini_swe's prompts don't
# ship a /no_think directive.
uses_reasoning_parser: false
extra_body:
chat_template_kwargs:
enable_thinking: false

data:
# Train and validation point at the same JSONL file.
# NOTE(review): user-specific absolute path; override for other
# environments.
train:
data_path: /mnt/rl-workspace/jothomson/swebench_containers/swebench_verified_arm64_mini_swe.jsonl
validation:
data_path: /mnt/rl-workspace/jothomson/swebench_containers/swebench_verified_arm64_mini_swe.jsonl

checkpointing:
# Smoke run: checkpointing is turned off.
enabled: false

logger:
# Every tracker flag below (wandb, tensorboard, mlflow, swanlab) and GPU
# monitoring is off for the smoke run; only log_dir is set.
log_dir: "logs/grpo-mini-swe-nemotron-v3"
num_val_samples_to_print: 0
wandb_enabled: false
tensorboard_enabled: false
mlflow_enabled: false
swanlab_enabled: false
monitor_gpus: false

cluster:
# 4 GPUs per node × 2 nodes = 8 GPUs total.
gpus_per_node: 4
# 2 training pods (TP=4 × DP=2 = 8 GPUs) so use_distributed_optimizer
# shards the AdamW state across 8 ranks instead of 4. Fits in HBM
# without CPU offload.
num_nodes: 2
Loading
Loading