From b4a21a5f0f015c202e343771e185d979f99f8217 Mon Sep 17 00:00:00 2001 From: Rohit Pujar Nagraj Date: Tue, 30 Jun 2026 13:12:16 -0700 Subject: [PATCH 1/5] perf: update DSR1 B200 FP4 SGLang MTP config (image + low-latency search space) Bump the dsr1-fp4-b200-sglang-mtp image to lmsysorg/sglang:v0.5.12.post1 and reshape the MTP search space into a low-latency TP4/EP1 lane plus TP4/EP4 (1k/1k: TP4/EP1 c1-32 + TP4/EP4 c64-256; 8k/1k: TP4/EP1 c1-32 + TP4/EP4 DP-attention c64-256). Update dsr1_fp4_b200_mtp.sh with the DP-attention path, SGLANG_RADIX_FORCE_MISS=1, --disable-piecewise-cuda-graph, aligned runtime settings, and drop the TP=8-only restriction. --- .github/configs/nvidia-master.yaml | 8 ++- .../fixed_seq_len/dsr1_fp4_b200_mtp.sh | 55 ++++++++++++++----- perf-changelog.yaml | 11 ++++ 3 files changed, 57 insertions(+), 17 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index aaa88c634f..46e2bf7b73 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1700,7 +1700,7 @@ dsr1-fp4-b200-sglang: # - { tp: 8, ep: 8, offloading: none, conc-list: [1, 2, 4, 8, 12, 16, 32, 64, 128, 256, 512] } dsr1-fp4-b200-sglang-mtp: - image: lmsysorg/sglang:v0.5.12-cu130 + image: lmsysorg/sglang:v0.5.12.post1 model: nvidia/DeepSeek-R1-0528-FP4-V2 model-prefix: dsr1 runner: b200 @@ -1712,11 +1712,13 @@ dsr1-fp4-b200-sglang-mtp: - isl: 1024 osl: 1024 search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 512, spec-decoding: mtp } + - { tp: 4, ep: 1, conc-start: 1, conc-end: 32, spec-decoding: mtp } + - { tp: 4, ep: 4, conc-start: 64, conc-end: 256, spec-decoding: mtp } - isl: 8192 osl: 1024 search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 512, spec-decoding: mtp } + - { tp: 4, ep: 1, conc-start: 1, conc-end: 32, spec-decoding: mtp } + - { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 256, spec-decoding: mtp } dsv4-fp4-b200-sglang: image: lmsysorg/sglang:deepseek-v4-blackwell@sha256:df18bfc4aa9ecf59451002b49ba00cae58042de9e2a96378bbd21b404dd62c7b diff --git a/benchmarks/single_node/fixed_seq_len/dsr1_fp4_b200_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsr1_fp4_b200_mtp.sh index 4a76a82d45..df08e52e98 100755 --- a/benchmarks/single_node/fixed_seq_len/dsr1_fp4_b200_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/dsr1_fp4_b200_mtp.sh @@ -6,6 +6,8 @@ source "$(dirname "$0")/../../benchmark_lib.sh" +DP_ATTENTION="${DP_ATTENTION:-false}" + check_env_vars \ MODEL \ TP \ @@ -14,7 +16,13 @@ check_env_vars \ OSL \ RANDOM_RANGE_RATIO \ RESULT_FILENAME \ - EP_SIZE + EP_SIZE \ + DP_ATTENTION + +if [[ "$DP_ATTENTION" != "true" && "$DP_ATTENTION" != "false" ]]; then + echo "DP_ATTENTION must be true or false; got '$DP_ATTENTION'" >&2 + exit 1 +fi if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" @@ -24,12 +32,6 @@ if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi nvidia-smi -# MTP only supports TP=8 for now (matching dsr1_fp8_b200_mtp.sh) -if [[ $TP -ne 8 ]]; then - echo "MTP only supports TP=8, got TP=$TP!" - exit 1 -fi - SERVER_LOG=/workspace/server.log if [[ $CONC -ge 16 ]]; then @@ -37,7 +39,32 @@ if [[ $CONC -ge 16 ]]; then else SCHEDULER_RECV_INTERVAL=10 fi -echo "SCHEDULER_RECV_INTERVAL: $SCHEDULER_RECV_INTERVAL, CONC: $CONC, ISL: $ISL, OSL: $OSL" + +CHUNKED_PREFILL_SIZE=16384 +SGLANG_PARALLEL_ARGS=( + --tensor-parallel-size="$TP" + --data-parallel-size=1 +) +SGLANG_DPA_ARGS=() + +if [[ "$DP_ATTENTION" == "true" ]]; then + SCHEDULER_RECV_INTERVAL=1 + CHUNKED_PREFILL_SIZE=32768 + SGLANG_PARALLEL_ARGS=( + --tensor-parallel-size="$TP" + --data-parallel-size="$TP" + --enable-dp-attention + --enable-dp-attention-local-control-broadcast + --enable-dp-lm-head + ) + SGLANG_DPA_ARGS=( + --schedule-conservativeness 3.33 + --enable-prefill-delayer + ) +fi + +echo "TP: $TP, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION, CONC: $CONC, ISL: $ISL, OSL: $OSL" +echo "SCHEDULER_RECV_INTERVAL: $SCHEDULER_RECV_INTERVAL, CHUNKED_PREFILL_SIZE: $CHUNKED_PREFILL_SIZE" # MTP (Multi-Token Prediction) Config - EAGLE speculative decoding SPECULATIVE_NUM_STEPS=2 @@ -54,17 +81,17 @@ fi start_gpu_monitor set -x -PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL --host 0.0.0.0 --port $PORT --trust-remote-code \ ---tensor-parallel-size=$TP --data-parallel-size=1 \ ---cuda-graph-max-bs 512 --max-running-requests 512 --mem-fraction-static 0.82 --kv-cache-dtype fp8_e4m3 \ ---chunked-prefill-size 16384 --max-prefill-tokens 16384 \ +SGLANG_RADIX_FORCE_MISS=1 PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL --host 0.0.0.0 --port $PORT --trust-remote-code \ +"${SGLANG_PARALLEL_ARGS[@]}" \ +--cuda-graph-max-bs 256 --max-running-requests 256 --mem-fraction-static 0.85 --kv-cache-dtype fp8_e4m3 \ +--chunked-prefill-size "$CHUNKED_PREFILL_SIZE" \ --ep-size $EP_SIZE --quantization modelopt_fp4 --enable-flashinfer-allreduce-fusion --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \ ---enable-symm-mem --disable-radix-cache --attention-backend trtllm_mla --moe-runner-backend flashinfer_trtllm --stream-interval 30 \ +--disable-piecewise-cuda-graph --attention-backend trtllm_mla --moe-runner-backend flashinfer_trtllm --stream-interval 10 \ --speculative-algorithm EAGLE \ --speculative-num-steps $SPECULATIVE_NUM_STEPS \ --speculative-num-draft-tokens $SPECULATIVE_DRAFT_TOKENS \ --speculative-eagle-topk $SPECULATIVE_EAGLE_TOPK \ -$EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & +"${SGLANG_DPA_ARGS[@]}" $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! diff --git a/perf-changelog.yaml b/perf-changelog.yaml index c318f2a2ac..b9e61609a0 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -4343,3 +4343,14 @@ - "Use nvidia/MiniMax-M3-NVFP4 from /scratch/models/MiniMax-M3-NVFP4 with vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-8b00f41, which includes vllm-project/vllm PR #46380; no runtime patch needed" - "Reuse the existing MXFP8 B300 topology and concurrency matrix across 15 srt-slurm recipes, while dropping the FP8-only Marlin override from TP4 decode" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1931 + +- config-keys: + - dsr1-fp4-b200-sglang-mtp + description: + - "Update the B200 FP4 SGLang MTP image to lmsysorg/sglang:v0.5.12.post1." + - "Update the MTP search space to include the low-latency TP4/EP1 lane with MTP speculative decoding enabled." + - "Use 1k/1k TP4/EP1 c1-c32 and TP4/EP4 c64-c256; use 8k/1k TP4/EP1 c1-c32 and TP4/EP4 DP-attention c64-c256." + - "Enable the MTP DP-attention path with local-control broadcast, DP LM head, prefill delayer, chunked prefill size 32768, scheduler recv interval 1, and schedule conservativeness 3.33." + - "Use SGLANG_RADIX_FORCE_MISS=1, remove --disable-radix-cache and --enable-symm-mem, and explicitly pass --disable-piecewise-cuda-graph." + - "Align MTP runtime settings with the non-MTP aggregate: cuda graph max batch size 256, max running requests 256, mem fraction static 0.85, stream interval 10, and no explicit max-prefill-tokens." + pr-link: XXX From c46cf4f2ad2170c74c13ac733752355459435f81 Mon Sep 17 00:00:00 2001 From: Rohit Pujar Nagraj Date: Tue, 30 Jun 2026 13:12:36 -0700 Subject: [PATCH 2/5] Update perf-changelog pr-link for #1962 --- perf-changelog.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index b9e61609a0..ab10b78176 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -4353,4 +4353,4 @@ - "Enable the MTP DP-attention path with local-control broadcast, DP LM head, prefill delayer, chunked prefill size 32768, scheduler recv interval 1, and schedule conservativeness 3.33." - "Use SGLANG_RADIX_FORCE_MISS=1, remove --disable-radix-cache and --enable-symm-mem, and explicitly pass --disable-piecewise-cuda-graph." - "Align MTP runtime settings with the non-MTP aggregate: cuda graph max batch size 256, max running requests 256, mem fraction static 0.85, stream interval 10, and no explicit max-prefill-tokens." - pr-link: XXX + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1962 From a028a702e3b29a666b98e89373c90f3f7cc8f0be Mon Sep 17 00:00:00 2001 From: Rohit Pujar Nagraj Date: Tue, 30 Jun 2026 16:26:28 -0700 Subject: [PATCH 3/5] fix: point dsr1-fp4 b200-dgxc MODEL_PATH at DeepSeek-R1-0528-NVFP4 The dsr1/fp4 weights are staged at /scratch/fsw/models/DeepSeek-R1-0528-NVFP4 on the b200-dgxc cluster; the previous /lustre/fsw/models/dsr1-0528-nvfp4-v2 path no longer exists, so the container mount failed at launch. --- runners/launch_b200-dgxc.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runners/launch_b200-dgxc.sh b/runners/launch_b200-dgxc.sh index f10e0f4ea4..dad5745ca7 100644 --- a/runners/launch_b200-dgxc.sh +++ b/runners/launch_b200-dgxc.sh @@ -14,7 +14,7 @@ set -x # NOTE: per-node /raid/models/* would be faster but is only populated on a # subset of dgxc nodes today, so we use Lustre for reliability. if [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp4" ]]; then - export MODEL_PATH="/lustre/fsw/models/dsr1-0528-nvfp4-v2" + export MODEL_PATH="/scratch/fsw/models/DeepSeek-R1-0528-NVFP4" export SRT_SLURM_MODEL_PREFIX="dsr1" elif [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp8" ]]; then export MODEL_PATH="/lustre/fsw/models/dsr1-0528-fp8" From 131d22cd01fea49167e75a09a588e1e1a819cec5 Mon Sep 17 00:00:00 2001 From: Rohit Pujar Nagraj Date: Tue, 30 Jun 2026 18:54:00 -0700 Subject: [PATCH 4/5] Updated model path --- runners/launch_b200-dgxc.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runners/launch_b200-dgxc.sh b/runners/launch_b200-dgxc.sh index dad5745ca7..432d0bae29 100644 --- a/runners/launch_b200-dgxc.sh +++ b/runners/launch_b200-dgxc.sh @@ -14,7 +14,7 @@ set -x # NOTE: per-node /raid/models/* would be faster but is only populated on a # subset of dgxc nodes today, so we use Lustre for reliability. if [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp4" ]]; then - export MODEL_PATH="/scratch/fsw/models/DeepSeek-R1-0528-NVFP4" + export MODEL_PATH="/lustre/fsw/models/DeepSeek-R1-0528-NVFP4" export SRT_SLURM_MODEL_PREFIX="dsr1" elif [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp8" ]]; then export MODEL_PATH="/lustre/fsw/models/dsr1-0528-fp8" From 806dc285aa6abfe2ece5191543ade5e6853fcce7 Mon Sep 17 00:00:00 2001 From: Rohit Pujar Nagraj Date: Wed, 1 Jul 2026 10:41:36 -0700 Subject: [PATCH 5/5] fix: correct dsr1-fp4 b200-dgxc MODEL_PATH to DeepSeek-R1-0528-NVFP4-v2 --- runners/launch_b200-dgxc.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runners/launch_b200-dgxc.sh b/runners/launch_b200-dgxc.sh index 432d0bae29..733ad0824c 100644 --- a/runners/launch_b200-dgxc.sh +++ b/runners/launch_b200-dgxc.sh @@ -14,7 +14,7 @@ set -x # NOTE: per-node /raid/models/* would be faster but is only populated on a # subset of dgxc nodes today, so we use Lustre for reliability. if [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp4" ]]; then - export MODEL_PATH="/lustre/fsw/models/DeepSeek-R1-0528-NVFP4" + export MODEL_PATH="/scratch/fsw/models/DeepSeek-R1-0528-NVFP4-v2" export SRT_SLURM_MODEL_PREFIX="dsr1" elif [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp8" ]]; then export MODEL_PATH="/lustre/fsw/models/dsr1-0528-fp8"