Skip to content
8 changes: 5 additions & 3 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1700,7 +1700,7 @@ dsr1-fp4-b200-sglang:
# - { tp: 8, ep: 8, offloading: none, conc-list: [1, 2, 4, 8, 12, 16, 32, 64, 128, 256, 512] }

dsr1-fp4-b200-sglang-mtp:
image: lmsysorg/sglang:v0.5.12-cu130
image: lmsysorg/sglang:v0.5.12.post1
model: nvidia/DeepSeek-R1-0528-FP4-V2
model-prefix: dsr1
runner: b200
Expand All @@ -1712,11 +1712,13 @@ dsr1-fp4-b200-sglang-mtp:
- isl: 1024
osl: 1024
search-space:
- { tp: 8, ep: 1, conc-start: 4, conc-end: 512, spec-decoding: mtp }
- { tp: 4, ep: 1, conc-start: 1, conc-end: 32, spec-decoding: mtp }
- { tp: 4, ep: 4, conc-start: 64, conc-end: 256, spec-decoding: mtp }
- isl: 8192
osl: 1024
search-space:
- { tp: 8, ep: 1, conc-start: 4, conc-end: 512, spec-decoding: mtp }
- { tp: 4, ep: 1, conc-start: 1, conc-end: 32, spec-decoding: mtp }
- { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 256, spec-decoding: mtp }

dsv4-fp4-b200-sglang:
image: lmsysorg/sglang:nightly-dev-cu13-20260628-da802ddc
Expand Down
55 changes: 41 additions & 14 deletions benchmarks/single_node/fixed_seq_len/dsr1_fp4_b200_mtp.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@

source "$(dirname "$0")/../../benchmark_lib.sh"

DP_ATTENTION="${DP_ATTENTION:-false}"

check_env_vars \
MODEL \
TP \
Expand All @@ -14,7 +16,13 @@ check_env_vars \
OSL \
RANDOM_RANGE_RATIO \
RESULT_FILENAME \
EP_SIZE
EP_SIZE \
DP_ATTENTION

# Fail fast unless DP_ATTENTION is exactly the string "true" or "false".
# Any other value prints a diagnostic to stderr and aborts the script with
# exit status 1 before the server is launched.
if [[ "$DP_ATTENTION" != "true" && "$DP_ATTENTION" != "false" ]]; then
echo "DP_ATTENTION must be true or false; got '$DP_ATTENTION'" >&2
exit 1
fi

if [[ -n "$SLURM_JOB_ID" ]]; then
echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
Expand All @@ -24,20 +32,39 @@ if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi

nvidia-smi

# MTP only supports TP=8 for now (matching dsr1_fp8_b200_mtp.sh)
if [[ $TP -ne 8 ]]; then
echo "MTP only supports TP=8, got TP=$TP!"
exit 1
fi

SERVER_LOG=/workspace/server.log

if [[ $CONC -ge 16 ]]; then
SCHEDULER_RECV_INTERVAL=30
else
SCHEDULER_RECV_INTERVAL=10
fi
echo "SCHEDULER_RECV_INTERVAL: $SCHEDULER_RECV_INTERVAL, CONC: $CONC, ISL: $ISL, OSL: $OSL"

# Default launch settings, used when DP_ATTENTION is not "true":
# chunked prefill size 16384, tensor-parallel size $TP, data-parallel size 1.
CHUNKED_PREFILL_SIZE=16384
SGLANG_PARALLEL_ARGS=(
--tensor-parallel-size="$TP"
--data-parallel-size=1
)
# Extra server flags for the DP-attention path; stays empty otherwise.
SGLANG_DPA_ARGS=()

# DP-attention overrides: replace the recv interval chosen from CONC earlier in
# the script with 1, raise the chunked prefill size to 32768, and set the
# data-parallel size equal to the tensor-parallel size.
if [[ "$DP_ATTENTION" == "true" ]]; then
SCHEDULER_RECV_INTERVAL=1
CHUNKED_PREFILL_SIZE=32768
SGLANG_PARALLEL_ARGS=(
--tensor-parallel-size="$TP"
--data-parallel-size="$TP"
--enable-dp-attention
--enable-dp-attention-local-control-broadcast
--enable-dp-lm-head
)
# Scheduler tuning flags passed only in DP-attention mode.
SGLANG_DPA_ARGS=(
--schedule-conservativeness 3.33
--enable-prefill-delayer
)
fi

echo "TP: $TP, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION, CONC: $CONC, ISL: $ISL, OSL: $OSL"
echo "SCHEDULER_RECV_INTERVAL: $SCHEDULER_RECV_INTERVAL, CHUNKED_PREFILL_SIZE: $CHUNKED_PREFILL_SIZE"

# MTP (Multi-Token Prediction) Config - EAGLE speculative decoding
SPECULATIVE_NUM_STEPS=2
Expand All @@ -54,17 +81,17 @@ fi
start_gpu_monitor

set -x
PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL --host 0.0.0.0 --port $PORT --trust-remote-code \
--tensor-parallel-size=$TP --data-parallel-size=1 \
--cuda-graph-max-bs 512 --max-running-requests 512 --mem-fraction-static 0.82 --kv-cache-dtype fp8_e4m3 \
--chunked-prefill-size 16384 --max-prefill-tokens 16384 \
SGLANG_RADIX_FORCE_MISS=1 PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL --host 0.0.0.0 --port $PORT --trust-remote-code \
"${SGLANG_PARALLEL_ARGS[@]}" \
--cuda-graph-max-bs 256 --max-running-requests 256 --mem-fraction-static 0.85 --kv-cache-dtype fp8_e4m3 \
--chunked-prefill-size "$CHUNKED_PREFILL_SIZE" \
--ep-size $EP_SIZE --quantization modelopt_fp4 --enable-flashinfer-allreduce-fusion --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \
--enable-symm-mem --disable-radix-cache --attention-backend trtllm_mla --moe-runner-backend flashinfer_trtllm --stream-interval 30 \
--disable-piecewise-cuda-graph --attention-backend trtllm_mla --moe-runner-backend flashinfer_trtllm --stream-interval 10 \
--speculative-algorithm EAGLE \
--speculative-num-steps $SPECULATIVE_NUM_STEPS \
--speculative-num-draft-tokens $SPECULATIVE_DRAFT_TOKENS \
--speculative-eagle-topk $SPECULATIVE_EAGLE_TOPK \
$EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
"${SGLANG_DPA_ARGS[@]}" $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &

SERVER_PID=$!

Expand Down
11 changes: 11 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4407,3 +4407,14 @@
description:
- "Bump SGLang image from lmsysorg/sglang:deepseek-v4-blackwell (digest sha256:df18bfc4...) to mainline nightly lmsysorg/sglang:nightly-dev-cu13-20260628-da802ddc."
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1923

- config-keys:
- dsr1-fp4-b200-sglang-mtp
description:
- "Update the B200 FP4 SGLang MTP image to lmsysorg/sglang:v0.5.12.post1."
- "Update the MTP search space to include the low-latency TP4/EP1 lane with MTP speculative decoding enabled."
- "Use 1k/1k TP4/EP1 c1-c32 and TP4/EP4 c64-c256; use 8k/1k TP4/EP1 c1-c32 and TP4/EP4 DP-attention c64-c256."
- "Enable the MTP DP-attention path with local-control broadcast, DP LM head, prefill delayer, chunked prefill size 32768, scheduler recv interval 1, and schedule conservativeness 3.33."
- "Use SGLANG_RADIX_FORCE_MISS=1, remove --disable-radix-cache and --enable-symm-mem, and explicitly pass --disable-piecewise-cuda-graph."
- "Align MTP runtime settings with the non-MTP aggregate: cuda graph max batch size 256, max running requests 256, mem fraction static 0.85, stream interval 10, and no explicit max-prefill-tokens."
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1962
2 changes: 1 addition & 1 deletion runners/launch_b200-dgxc.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ set -x
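# NOTE: per-node /raid/models/* would be faster but is only populated on a
# subset of dgxc nodes today, so we use Lustre for reliability.
# NOTE(review): the dsr1 fp4 MODEL_PATH below points at /scratch/fsw rather
# than /lustre/fsw — confirm the rationale above still applies to that path.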
if [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp4" ]]; then
export MODEL_PATH="/lustre/fsw/models/dsr1-0528-nvfp4-v2"
export MODEL_PATH="/scratch/fsw/models/DeepSeek-R1-0528-NVFP4-v2"
export SRT_SLURM_MODEL_PREFIX="dsr1"
elif [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp8" ]]; then
export MODEL_PATH="/lustre/fsw/models/dsr1-0528-fp8"
Expand Down