NVIDIA-NeMo · jthomson04 · May 6, 2026 · May 6, 2026 · May 6, 2026 · May 6, 2026
@@ -294,6 +294,7 @@ design-docs/training-backends.md
 design-docs/sequence-packing-and-dynamic-batching.md
 design-docs/env-vars.md
 design-docs/nemo-gym-integration.md
+design-docs/dynamo-integration.md
 ```
 
 ```{toctree}

@@ -0,0 +1,60 @@
+# GRPO Algorithm Configuration — Megatron-Core + ModelExpress v2 refit
+#
+# Inherits from grpo_math_1B_megatron.yaml (Qwen2.5-1.5B Megatron TP=1 EP=1
+# PP=1, vLLM generation backend) and flips cluster.weight_sync.method to
+# "mx" so refit cycles flow through the cluster-validated Megatron-MX
+# data plane:
+#
+#   trainer:  collect_megatron_publish_set -> stream_weights_via_mx
+#               (publisher emits per-rank native shards + sidecar with
+#                transformer_config + Megatron->HF name map)
+#   refit:    MX server catalogs the source, ready=True
+#   receiver: VllmInternalWorkerExtension._update_weights_via_mx_megatron
+#               -> v1 sliced-pull (axis-0 roles) / v0 scratch+copy (axis-1)
+#               -> translate Megatron->HF via vendored Bridge helpers
+#               -> vllm.model.load_weights()
+#
+# Smallest viable Megatron + MX GRPO recipe. End-to-end byte-identity of
+# the Megatron path is independently validated on Qwen3-MoE-30B-A3B
+# (18 867 / 18 867 HF tensors byte-identical — see PR #429 + #2). This
+# recipe exercises the path inside a running GRPO loop.
+#
+# Companion infra YAML: infra/nrl_k8s/examples/grpo_math_1B_megatron_mx.gb200.infra.yaml
+
+defaults: "grpo_math_1B_megatron.yaml"
+
+grpo:
+  # Deliberately short: the first goal is only to confirm that a refit
+  # cycle completes and vLLM accepts the loaded HF state-dict. Raise later.
+  val_at_start: false
+  val_period: 5
+  max_num_steps: 5
+
+policy:
+  dtensor_cfg:
+    enabled: false
+  # Keep the parent's Megatron layout untouched (TP=1, EP=1, PP=1).
+  megatron_cfg:
+    enabled: true
+
+cluster:
+  weight_sync:
+    method: "mx"
+    mx_config:
+      enabled: true
+      # GB200 / kavin namespace MX server. Point at modelexpress-server.<ns>
+      # for other clusters.
+      mx_server_url: "modelexpress-server.kavin.svc.cluster.local:8001"
+      timeout_seconds: 300.0
+      # Required on multi-subnet RDMA fabrics — also fine on single-subnet
+      # because the trainer always publishes for its own rank.
+      same_rank_only: true
+      # Enable so each DGD receiver republishes itself as a v2
+      # inference_replica after refit; later-arriving replicas can pull
+      # from peers instead of contending on the trainer's NIC.
+      tree_scale_out: true
+      # Qwen2.5-1.5B is dense, so the MoE expert filter would be a no-op;
+      # it is left off here (enabling it would be harmless).
+      moe_expert_filter: false
+      # NIXL retry budget for transient NIXL_ERR_NOT_ALLOWED.
+      nixl_retry_budget: 3
@@ -0,0 +1,138 @@
+# Proof-of-life smoke: mini_swe_agent + Nemotron-3-Nano-30B-A3B-BF16 via Dynamo.
+#
+# Mirrors grpo_mini_swe_smoke.yaml (Qwen3-4B-Thinking) but:
+#   * Inherits MoE/Mamba knobs from the Nemotron-Nano-V2 9B recipe
+#     (workplace_assistant_nemotron_nano_v2_9b) — that's the
+#     canonical Nemotron-family config in the repo. V3 30B-A3B is
+#     also a Mamba hybrid MoE so the same knobs apply.
+#   * Model pinned to the local Lustre snapshot to avoid HF
+#     downloads on the policy worker.
+#   * Tool/reasoning round-trip disabled (parser-side strips
+#     `<think>` blocks on replay, same as V2). mini_swe prompts
+#     carry no /no_think, so `enable_thinking: false` keeps reasoning
+#     out of the wire form and the multi-turn contiguity assert quiet.
+#
+# Smoke shape: 1 step × 4 prompts × 2 generations = 8 rollouts.
+# Minimal mini_swe load (step_limit=2, concurrency=16). Goal is
+# end-to-end pipeline validation, not training quality.
+
+defaults: grpo_workplace_assistant_nemotron_nano_v2_9b.yaml
+
+grpo:
+  max_num_steps: 1
+  num_prompts_per_step: 4
+  num_generations_per_prompt: 2
+  max_rollout_turns: 1
+  val_at_start: false
+  val_at_end: false
+  val_period: 999999
+
+policy:
+  # The DGD serves under the canonical HF ID (via --served-model-name).
+  # The gym uses `model_name` verbatim as the `model:` field in chat
+  # completion requests, so it must match the served name — otherwise
+  # the frontend returns 404 "model or resource not found".
+  # NOTE(review): tokenizer.name below interpolates this same HF ID, so
+  # it is NOT a local Lustre path; presumably `from_pretrained` resolves
+  # it via the HF hub/cache. Set an explicit directory to avoid that.
+  model_name: "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16"
+  tokenizer:
+    name: ${policy.model_name}
+    chat_template_kwargs: null
+  train_global_batch_size: ${mul:${grpo.num_prompts_per_step}, ${grpo.num_generations_per_prompt}}
+  train_micro_batch_size: 1
+  logprob_batch_size: 1
+  max_total_sequence_length: 8192
+  # Required when TP > 1 + sequence_parallel: pads seq len to a
+  # multiple of TP so the activation tensor splits cleanly across
+  # TP ranks. Inherited default from the parent recipe is 1 (the
+  # parent uses TP=2 SP=false). With TP=4 here we need 4.
+  make_sequence_length_divisible_by: 4
+
+  megatron_cfg:
+    enabled: true
+    # 30B MoE BF16, sized per GB300 node (4 GPUs; cluster.num_nodes is 2):
+    #   * TP=4 is meant to shard attention and expert weights → ~15GB
+    #     weights per GPU. NOTE(review): ETP=1 below — confirm experts.
+    #   * use_distributed_optimizer=true (parent recipe) shards
+    #     AdamW state across ranks → ~90GB per GPU on 4 ranks, less on 8.
+    #   * Total per-GPU footprint ~130-150GB; GB300 has 282GB HBM,
+    #     so we don't need CPU offload — which was the actual cause
+    #     of the previous host-OOM: 360GB optimizer state on the
+    #     pod's 850GiB CPU budget plus model load + Ray overhead.
+    tensor_model_parallel_size: 4
+    sequence_parallel: true
+    expert_tensor_parallel_size: 1
+    expert_model_parallel_size: 1
+    # MegatronPolicyWorker.train reads this key at line 459 of
+    # megatron_policy_worker.py — not present in the V2 9B parent
+    # recipe, so we must set it explicitly here.
+    moe_per_layer_logging: false
+    context_parallel_size: 1
+    activation_checkpointing: true
+    empty_unused_memory_level: 2
+    defer_fp32_logits: true
+    optimizer:
+      optimizer_cpu_offload: false
+      optimizer_offload_fraction: 0.0
+
+  generation:
+    backend: "dynamo"
+    max_new_tokens: 8192
+    dynamo_cfg:
+      tool_call_parser: "nemotron_deci"
+      reasoning_parser: "nemotron_nano"
+    colocated:
+      enabled: false
+
+env:
+  nemo_gym:
+    config_paths:
+      - responses_api_models/vllm_model/configs/vllm_model_for_training.yaml
+      - resources_servers/mini_swe_agent/configs/mini_swe_agent.yaml
+    mini_swe_simple_agent:
+      responses_api_agents:
+        mini_swe_agent:
+          run_golden: false
+          cache_dir_template: "/mnt/rl-workspace/jothomson/swebench_containers/swebench_sweb.eval.arm64.{instance_id}.sif"
+          concurrency: 16
+          step_limit: 2
+    policy_model:
+      responses_api_models:
+        vllm_model:
+          # Reasoning round-trip stays off on the gym side: Nemotron's
+          # chat template drops <think> blocks on replay, which would
+          # otherwise trip the contiguity assert. The
+          # workplace_assistant system prompt uses `/no_think` to turn
+          # thinking off end-to-end; mini_swe prompts carry no such
+          # directive, so this recipe relies on parser-side stripping
+          # instead.
+          extra_body:
+            chat_template_kwargs:
+              enable_thinking: false
+          uses_reasoning_parser: false
+
+data:
+  validation:
+    data_path: "/mnt/rl-workspace/jothomson/swebench_containers/swebench_verified_arm64_mini_swe.jsonl"
+  train:
+    data_path: "/mnt/rl-workspace/jothomson/swebench_containers/swebench_verified_arm64_mini_swe.jsonl"
+
+checkpointing:
+  enabled: false  # smoke run: checkpointing is switched off
+
+logger:
+  monitor_gpus: false
+  wandb_enabled: false
+  tensorboard_enabled: false
+  mlflow_enabled: false
+  swanlab_enabled: false
+  num_val_samples_to_print: 0
+  log_dir: "logs/grpo-mini-swe-nemotron-v3"
+
+cluster:
+  # Two training pods give TP=4 × DP=2 = 8 GPUs, letting
+  # use_distributed_optimizer spread the AdamW state over 8 ranks rather
+  # than 4, which fits in HBM with no CPU offload.
+  num_nodes: 2
+  gpus_per_node: 4