Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12362,14 +12362,14 @@ minimaxm3-fp8-gb300-dynamo-vllm:
osl: 1024
search-space:
# 1p1d DEP2+DEP8, 3n: conc 256, 512
- conc-list: [256]
- conc-list: [256, 512]
prefill:
num-worker: 1
tp: 2
ep: 2
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/8k1k/1p1d-dep2-dep8-8k1k.yaml"
- "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/8k1k/1p1d-dep2-dep8-8k1k-healthcheck.yaml"
decode:
num-worker: 1
tp: 8
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ dynamo:
version: 1.3.0.dev20260614

health_check:
max_attempts: 720
max_attempts: 840
interval_seconds: 10

sbatch_directives:
Expand Down Expand Up @@ -103,5 +103,5 @@ benchmark:
type: "sa-bench"
isl: 8192
osl: 1024
concurrencies: "256"
concurrencies: "256x512"
req_rate: "inf"
6 changes: 6 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4343,3 +4343,9 @@
- "Use nvidia/MiniMax-M3-NVFP4 from /scratch/models/MiniMax-M3-NVFP4 with vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-8b00f41, which includes vllm-project/vllm PR #46380; no runtime patch needed"
- "Reuse the existing MXFP8 B300 topology and concurrency matrix across 15 srt-slurm recipes, while dropping the FP8-only Marlin override from TP4 decode"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1931

- config-keys:
- minimaxm3-fp8-gb300-dynamo-vllm
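description:
- "Rename 1p1d-dep2-dep8-8k1k.yaml to 1p1d-dep2-dep8-8k1k-healthcheck.yaml and update the nvidia-master.yaml CONFIG_FILE pointer accordingly; increase health_check max_attempts from 720 to 840 (7200s → 8400s at interval_seconds 10) to accommodate longer GB300 model load times; extend the benchmark concurrency sweep from 256 to 256 and 512 (conc-list [256, 512], recipe concurrencies \"256x512\")."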
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1961
Loading