From b4a21a5f0f015c202e343771e185d979f99f8217 Mon Sep 17 00:00:00 2001
From: Rohit Pujar Nagraj <rpujarnagraj@nvidia.com>
Date: Tue, 30 Jun 2026 13:12:16 -0700
Subject: [PATCH 1/5] perf: update DSR1 B200 FP4 SGLang MTP config (image +
 low-latency search space)

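Bump the dsr1-fp4-b200-sglang-mtp image from lmsysorg/sglang:v0.5.12-cu130 to
lmsysorg/sglang:v0.5.12.post1 and split the MTP search space into a
low-latency TP4/EP1 lane plus a TP4/EP4 lane:
  1k/1k: TP4/EP1 c1-32 + TP4/EP4 c64-256
  8k/1k: TP4/EP1 c1-32 + TP4/EP4 with DP-attention c64-256

Update dsr1_fp4_b200_mtp.sh to:
  - add a DP-attention path, selected by DP_ATTENTION=true (defaults to
    false when unset);
  - set SGLANG_RADIX_FORCE_MISS=1, remove --disable-radix-cache and
    --enable-symm-mem, and pass --disable-piecewise-cuda-graph;
  - align runtime settings with the non-MTP aggregate (cuda-graph-max-bs and
    max-running-requests 256, mem-fraction-static 0.85, stream-interval 10,
    no explicit --max-prefill-tokens);
  - drop the TP=8-only restriction.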
---
 .github/configs/nvidia-master.yaml            |  8 ++-
 .../fixed_seq_len/dsr1_fp4_b200_mtp.sh        | 55 ++++++++++++++-----
 perf-changelog.yaml                           | 11 ++++
 3 files changed, 57 insertions(+), 17 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index aaa88c634f..46e2bf7b73 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -1700,7 +1700,7 @@ dsr1-fp4-b200-sglang:
     #   - { tp: 8, ep: 8, offloading: none, conc-list: [1, 2, 4, 8, 12, 16, 32, 64, 128, 256, 512] }
 
 dsr1-fp4-b200-sglang-mtp:
-  image: lmsysorg/sglang:v0.5.12-cu130
+  image: lmsysorg/sglang:v0.5.12.post1
   model: nvidia/DeepSeek-R1-0528-FP4-V2
   model-prefix: dsr1
   runner: b200
@@ -1712,11 +1712,13 @@ dsr1-fp4-b200-sglang-mtp:
     - isl: 1024
       osl: 1024
       search-space:
-      - { tp: 8, ep: 1, conc-start: 4, conc-end: 512, spec-decoding: mtp }
+      - { tp: 4, ep: 1, conc-start: 1, conc-end: 32, spec-decoding: mtp }
+      - { tp: 4, ep: 4, conc-start: 64, conc-end: 256, spec-decoding: mtp }
     - isl: 8192
       osl: 1024
       search-space:
-      - { tp: 8, ep: 1, conc-start: 4, conc-end: 512, spec-decoding: mtp }
+      - { tp: 4, ep: 1, conc-start: 1, conc-end: 32, spec-decoding: mtp }
+      - { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 256, spec-decoding: mtp }
 
 dsv4-fp4-b200-sglang:
   image: lmsysorg/sglang:deepseek-v4-blackwell@sha256:df18bfc4aa9ecf59451002b49ba00cae58042de9e2a96378bbd21b404dd62c7b
diff --git a/benchmarks/single_node/fixed_seq_len/dsr1_fp4_b200_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsr1_fp4_b200_mtp.sh
index 4a76a82d45..df08e52e98 100755
--- a/benchmarks/single_node/fixed_seq_len/dsr1_fp4_b200_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsr1_fp4_b200_mtp.sh
@@ -6,6 +6,8 @@
 
 source "$(dirname "$0")/../../benchmark_lib.sh"
 
+DP_ATTENTION="${DP_ATTENTION:-false}"
+
 check_env_vars \
     MODEL \
     TP \
@@ -14,7 +16,13 @@ check_env_vars \
     OSL \
     RANDOM_RANGE_RATIO \
     RESULT_FILENAME \
-    EP_SIZE
+    EP_SIZE \
+    DP_ATTENTION
+
+if [[ "$DP_ATTENTION" != "true" && "$DP_ATTENTION" != "false" ]]; then
+    echo "DP_ATTENTION must be true or false; got '$DP_ATTENTION'" >&2
+    exit 1
+fi
 
 if [[ -n "$SLURM_JOB_ID" ]]; then
   echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
@@ -24,12 +32,6 @@ if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
 nvidia-smi
 
-# MTP only supports TP=8 for now (matching dsr1_fp8_b200_mtp.sh)
-if [[ $TP -ne 8 ]]; then
-  echo "MTP only supports TP=8, got TP=$TP!"
-  exit 1
-fi
-
 SERVER_LOG=/workspace/server.log
 
 if [[ $CONC -ge 16 ]]; then
@@ -37,7 +39,32 @@ if [[ $CONC -ge 16 ]]; then
 else
   SCHEDULER_RECV_INTERVAL=10
 fi
-echo "SCHEDULER_RECV_INTERVAL: $SCHEDULER_RECV_INTERVAL, CONC: $CONC, ISL: $ISL, OSL: $OSL"
+
+CHUNKED_PREFILL_SIZE=16384
+SGLANG_PARALLEL_ARGS=(
+    --tensor-parallel-size="$TP"
+    --data-parallel-size=1
+)
+SGLANG_DPA_ARGS=()
+
+if [[ "$DP_ATTENTION" == "true" ]]; then
+    SCHEDULER_RECV_INTERVAL=1
+    CHUNKED_PREFILL_SIZE=32768
+    SGLANG_PARALLEL_ARGS=(
+        --tensor-parallel-size="$TP"
+        --data-parallel-size="$TP"
+        --enable-dp-attention
+        --enable-dp-attention-local-control-broadcast
+        --enable-dp-lm-head
+    )
+    SGLANG_DPA_ARGS=(
+        --schedule-conservativeness 3.33
+        --enable-prefill-delayer
+    )
+fi
+
+echo "TP: $TP, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION, CONC: $CONC, ISL: $ISL, OSL: $OSL"
+echo "SCHEDULER_RECV_INTERVAL: $SCHEDULER_RECV_INTERVAL, CHUNKED_PREFILL_SIZE: $CHUNKED_PREFILL_SIZE"
 
 # MTP (Multi-Token Prediction) Config - EAGLE speculative decoding
 SPECULATIVE_NUM_STEPS=2
@@ -54,17 +81,17 @@ fi
 start_gpu_monitor
 
 set -x
-PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL --host 0.0.0.0 --port $PORT --trust-remote-code \
---tensor-parallel-size=$TP --data-parallel-size=1 \
---cuda-graph-max-bs 512 --max-running-requests 512 --mem-fraction-static 0.82 --kv-cache-dtype fp8_e4m3 \
---chunked-prefill-size 16384 --max-prefill-tokens 16384 \
+SGLANG_RADIX_FORCE_MISS=1 PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL --host 0.0.0.0 --port $PORT --trust-remote-code \
+"${SGLANG_PARALLEL_ARGS[@]}" \
+--cuda-graph-max-bs 256 --max-running-requests 256 --mem-fraction-static 0.85 --kv-cache-dtype fp8_e4m3 \
+--chunked-prefill-size "$CHUNKED_PREFILL_SIZE" \
 --ep-size $EP_SIZE --quantization modelopt_fp4 --enable-flashinfer-allreduce-fusion --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \
---enable-symm-mem --disable-radix-cache --attention-backend trtllm_mla --moe-runner-backend flashinfer_trtllm --stream-interval 30 \
+--disable-piecewise-cuda-graph --attention-backend trtllm_mla --moe-runner-backend flashinfer_trtllm --stream-interval 10 \
 --speculative-algorithm EAGLE \
 --speculative-num-steps $SPECULATIVE_NUM_STEPS \
 --speculative-num-draft-tokens $SPECULATIVE_DRAFT_TOKENS \
 --speculative-eagle-topk $SPECULATIVE_EAGLE_TOPK \
-$EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
+"${SGLANG_DPA_ARGS[@]}" $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
 
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index c318f2a2ac..b9e61609a0 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -4343,3 +4343,14 @@
     - "Use nvidia/MiniMax-M3-NVFP4 from /scratch/models/MiniMax-M3-NVFP4 with vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-8b00f41, which includes vllm-project/vllm PR #46380; no runtime patch needed"
     - "Reuse the existing MXFP8 B300 topology and concurrency matrix across 15 srt-slurm recipes, while dropping the FP8-only Marlin override from TP4 decode"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1931
+
+- config-keys:
+    - dsr1-fp4-b200-sglang-mtp
+  description:
+    - "Update the B200 FP4 SGLang MTP image to lmsysorg/sglang:v0.5.12.post1."
+    - "Update the MTP search space to include the low-latency TP4/EP1 lane with MTP speculative decoding enabled."
+    - "Use 1k/1k TP4/EP1 c1-c32 and TP4/EP4 c64-c256; use 8k/1k TP4/EP1 c1-c32 and TP4/EP4 DP-attention c64-c256."
+    - "Enable the MTP DP-attention path with local-control broadcast, DP LM head, prefill delayer, chunked prefill size 32768, scheduler recv interval 1, and schedule conservativeness 3.33."
+    - "Use SGLANG_RADIX_FORCE_MISS=1, remove --disable-radix-cache and --enable-symm-mem, and explicitly pass --disable-piecewise-cuda-graph."
+    - "Align MTP runtime settings with the non-MTP aggregate: cuda graph max batch size 256, max running requests 256, mem fraction static 0.85, stream interval 10, and no explicit max-prefill-tokens."
+  pr-link: XXX

From c46cf4f2ad2170c74c13ac733752355459435f81 Mon Sep 17 00:00:00 2001
From: Rohit Pujar Nagraj <rpujarnagraj@nvidia.com>
Date: Tue, 30 Jun 2026 13:12:36 -0700
Subject: [PATCH 2/5] Update perf-changelog pr-link for #1962

---
 perf-changelog.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index b9e61609a0..ab10b78176 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -4353,4 +4353,4 @@
     - "Enable the MTP DP-attention path with local-control broadcast, DP LM head, prefill delayer, chunked prefill size 32768, scheduler recv interval 1, and schedule conservativeness 3.33."
     - "Use SGLANG_RADIX_FORCE_MISS=1, remove --disable-radix-cache and --enable-symm-mem, and explicitly pass --disable-piecewise-cuda-graph."
     - "Align MTP runtime settings with the non-MTP aggregate: cuda graph max batch size 256, max running requests 256, mem fraction static 0.85, stream interval 10, and no explicit max-prefill-tokens."
-  pr-link: XXX
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1962

From a028a702e3b29a666b98e89373c90f3f7cc8f0be Mon Sep 17 00:00:00 2001
From: Rohit Pujar Nagraj <rpujarnagraj@nvidia.com>
Date: Tue, 30 Jun 2026 16:26:28 -0700
Subject: [PATCH 3/5] fix: point dsr1-fp4 b200-dgxc MODEL_PATH at
 DeepSeek-R1-0528-NVFP4

The dsr1/fp4 weights are staged at /scratch/fsw/models/DeepSeek-R1-0528-NVFP4
on the b200-dgxc cluster; the previous /lustre/fsw/models/dsr1-0528-nvfp4-v2
path no longer exists, so the container mount failed at launch.
---
 runners/launch_b200-dgxc.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/runners/launch_b200-dgxc.sh b/runners/launch_b200-dgxc.sh
index f10e0f4ea4..dad5745ca7 100644
--- a/runners/launch_b200-dgxc.sh
+++ b/runners/launch_b200-dgxc.sh
@@ -14,7 +14,7 @@ set -x
 # NOTE: per-node /raid/models/* would be faster but is only populated on a
 # subset of dgxc nodes today, so we use Lustre for reliability.
 if [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp4" ]]; then
-    export MODEL_PATH="/lustre/fsw/models/dsr1-0528-nvfp4-v2"
+    export MODEL_PATH="/scratch/fsw/models/DeepSeek-R1-0528-NVFP4"
     export SRT_SLURM_MODEL_PREFIX="dsr1"
 elif [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp8" ]]; then
     export MODEL_PATH="/lustre/fsw/models/dsr1-0528-fp8"

From 131d22cd01fea49167e75a09a588e1e1a819cec5 Mon Sep 17 00:00:00 2001
From: Rohit Pujar Nagraj <rpujarnagraj@nvidia.com>
Date: Tue, 30 Jun 2026 18:54:00 -0700
Subject: [PATCH 4/5] fix: move dsr1-fp4 b200-dgxc MODEL_PATH from /scratch
 to /lustre

---
 runners/launch_b200-dgxc.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/runners/launch_b200-dgxc.sh b/runners/launch_b200-dgxc.sh
index dad5745ca7..432d0bae29 100644
--- a/runners/launch_b200-dgxc.sh
+++ b/runners/launch_b200-dgxc.sh
@@ -14,7 +14,7 @@ set -x
 # NOTE: per-node /raid/models/* would be faster but is only populated on a
 # subset of dgxc nodes today, so we use Lustre for reliability.
 if [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp4" ]]; then
-    export MODEL_PATH="/scratch/fsw/models/DeepSeek-R1-0528-NVFP4"
+    export MODEL_PATH="/lustre/fsw/models/DeepSeek-R1-0528-NVFP4"
     export SRT_SLURM_MODEL_PREFIX="dsr1"
 elif [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp8" ]]; then
     export MODEL_PATH="/lustre/fsw/models/dsr1-0528-fp8"

From 806dc285aa6abfe2ece5191543ade5e6853fcce7 Mon Sep 17 00:00:00 2001
From: Rohit Pujar Nagraj <rpujarnagraj@nvidia.com>
Date: Wed, 1 Jul 2026 10:41:36 -0700
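Subject: [PATCH 5/5] fix: correct dsr1-fp4 b200-dgxc MODEL_PATH to
 DeepSeek-R1-0528-NVFP4-v2

Replace /lustre/fsw/models/DeepSeek-R1-0528-NVFP4 with
/scratch/fsw/models/DeepSeek-R1-0528-NVFP4-v2. Besides the -v2 directory
name, this also moves the dsr1/fp4 weights path from /lustre to /scratch, so
the "we use Lustre" note above this branch no longer describes it.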
---
 runners/launch_b200-dgxc.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/runners/launch_b200-dgxc.sh b/runners/launch_b200-dgxc.sh
index 432d0bae29..733ad0824c 100644
--- a/runners/launch_b200-dgxc.sh
+++ b/runners/launch_b200-dgxc.sh
@@ -14,7 +14,7 @@ set -x
 # NOTE: per-node /raid/models/* would be faster but is only populated on a
 # subset of dgxc nodes today, so we use Lustre for reliability.
 if [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp4" ]]; then
-    export MODEL_PATH="/lustre/fsw/models/DeepSeek-R1-0528-NVFP4"
+    export MODEL_PATH="/scratch/fsw/models/DeepSeek-R1-0528-NVFP4-v2"
     export SRT_SLURM_MODEL_PREFIX="dsr1"
 elif [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp8" ]]; then
     export MODEL_PATH="/lustre/fsw/models/dsr1-0528-fp8"