From 88feffcc3dad1de058453cb92dd50e36ea58f5c2 Mon Sep 17 00:00:00 2001 From: richardhuo-nv Date: Tue, 30 Jun 2026 12:14:41 -0700 Subject: [PATCH 1/5] fix --- .../vllm/minimax-m3-gb300-fp8/8k1k/1p1d-dep2-dep8-8k1k.yaml | 2 +- perf-changelog.yaml | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/1p1d-dep2-dep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/1p1d-dep2-dep8-8k1k.yaml index d990d661b..65996854a 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/1p1d-dep2-dep8-8k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/1p1d-dep2-dep8-8k1k.yaml @@ -15,7 +15,7 @@ dynamo: version: 1.3.0.dev20260614 health_check: - max_attempts: 720 + max_attempts: 840 interval_seconds: 10 sbatch_directives: diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 3ef8c37db..7e25dd5fc 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -4323,3 +4323,9 @@ - "Enable AITER MoE on MiniMax-M3 MXFP4 MI355X single-node vLLM STP: export VLLM_ROCM_USE_AITER=1, VLLM_ROCM_USE_AITER_MOE=1, and VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=1; pass --moe-backend aiter." - "Pin vllm/vllm-openai-rocm:nightly-4559c43a9526597c00cbcc4f59979496500268d1 (from nightly-3f5a1e1733200760169ff31ebe60a271072b199e) for AITER MoE and shared-expert fusion support (vllm-project/vllm#46419, vllm-project/vllm#46545)." pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1954 + +- config-keys: + - minimaxm3-fp8-gb300-dynamo-vllm + description: + - "Increase health_check max_attempts from 720 to 840 (7200s → 8400s) for the 1p1d-dep2-dep8-8k1k GB300 recipe to accommodate longer model load times." + pr-link: TODO From b7c65f38ae4a3dc9dddae7eec1beb2ce7d9bda65 Mon Sep 17 00:00:00 2001 From: richardhuo-nv Date: Tue, 30 Jun 2026 12:18:59 -0700 Subject: [PATCH 2/5] resolve conflict --- perf-changelog.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 1efd670f6..da6f0c0b9 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -4324,12 +4324,6 @@ - "Pin vllm/vllm-openai-rocm:nightly-4559c43a9526597c00cbcc4f59979496500268d1 (from nightly-3f5a1e1733200760169ff31ebe60a271072b199e) for AITER MoE and shared-expert fusion support (vllm-project/vllm#46419, vllm-project/vllm#46545)." pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1954 -- config-keys: - - minimaxm3-fp8-gb300-dynamo-vllm - description: - - "Increase health_check max_attempts from 720 to 840 (7200s → 8400s) for the 1p1d-dep2-dep8-8k1k GB300 recipe to accommodate longer model load times." - pr-link: TODO - - config-keys: - minimaxm3-fp8-mi355x-vllm - minimaxm3-fp8-mi355x-vllm-mtp @@ -4349,3 +4343,9 @@ - "Use nvidia/MiniMax-M3-NVFP4 from /scratch/models/MiniMax-M3-NVFP4 with vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-8b00f41, which includes vllm-project/vllm PR #46380; no runtime patch needed" - "Reuse the existing MXFP8 B300 topology and concurrency matrix across 15 srt-slurm recipes, while dropping the FP8-only Marlin override from TP4 decode" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1931 + +- config-keys: + - minimaxm3-fp8-gb300-dynamo-vllm + description: + - "Increase health_check max_attempts from 720 to 840 (7200s → 8400s) for the 1p1d-dep2-dep8-8k1k GB300 recipe to accommodate longer model load times." + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1961 \ No newline at end of file From c292fcca4e0abb063a03e1343ff1bdff9df5a2d4 Mon Sep 17 00:00:00 2001 From: richardhuo-nv Date: Tue, 30 Jun 2026 12:20:25 -0700 Subject: [PATCH 3/5] fix newline --- perf-changelog.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index da6f0c0b9..a59b5b842 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -4348,4 +4348,4 @@ - minimaxm3-fp8-gb300-dynamo-vllm description: - "Increase health_check max_attempts from 720 to 840 (7200s → 8400s) for the 1p1d-dep2-dep8-8k1k GB300 recipe to accommodate longer model load times." - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1961 \ No newline at end of file + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1961 From 56f730d5df926839e83052dbcb77dc7ebb68b90a Mon Sep 17 00:00:00 2001 From: richardhuo-nv Date: Tue, 30 Jun 2026 12:42:34 -0700 Subject: [PATCH 4/5] fix newline --- .github/configs/nvidia-master.yaml | 2 +- ...dep2-dep8-8k1k.yaml => 1p1d-dep2-dep8-8k1k-healthcheck.yaml} | 0 perf-changelog.yaml | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/{1p1d-dep2-dep8-8k1k.yaml => 1p1d-dep2-dep8-8k1k-healthcheck.yaml} (100%) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index aaa88c634..29a026743 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -12369,7 +12369,7 @@ minimaxm3-fp8-gb300-dynamo-vllm: ep: 2 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/8k1k/1p1d-dep2-dep8-8k1k.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/8k1k/1p1d-dep2-dep8-8k1k-healthcheck.yaml" decode: num-worker: 1 tp: 8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/1p1d-dep2-dep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/1p1d-dep2-dep8-8k1k-healthcheck.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/1p1d-dep2-dep8-8k1k.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/1p1d-dep2-dep8-8k1k-healthcheck.yaml diff --git a/perf-changelog.yaml b/perf-changelog.yaml index a59b5b842..40b8b23c4 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -4347,5 +4347,5 @@ - config-keys: - minimaxm3-fp8-gb300-dynamo-vllm description: - - "Increase health_check max_attempts from 720 to 840 (7200s → 8400s) for the 1p1d-dep2-dep8-8k1k GB300 recipe to accommodate longer model load times." + - "Rename 1p1d-dep2-dep8-8k1k.yaml to 1p1d-dep2-dep8-8k1k-healthcheck.yaml and update the nvidia-master.yaml CONFIG_FILE pointer accordingly; increase health_check max_attempts from 720 to 840 (7200s → 8400s) to accommodate longer GB300 model load times." pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1961 From d339fced4099df73a386e9f5bfe03c4206e5f1c1 Mon Sep 17 00:00:00 2001 From: richardhuo-nv Date: Tue, 30 Jun 2026 12:46:21 -0700 Subject: [PATCH 5/5] resolve conflict --- .github/configs/nvidia-master.yaml | 2 +- .../8k1k/1p1d-dep2-dep8-8k1k-healthcheck.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 29a026743..a92d44431 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -12362,7 +12362,7 @@ minimaxm3-fp8-gb300-dynamo-vllm: osl: 1024 search-space: # 1p1d DEP2+DEP8, 3n: conc 256 - - conc-list: [256] + - conc-list: [256, 512] prefill: num-worker: 1 tp: 2 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/1p1d-dep2-dep8-8k1k-healthcheck.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/1p1d-dep2-dep8-8k1k-healthcheck.yaml index 65996854a..f6889705e 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/1p1d-dep2-dep8-8k1k-healthcheck.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/1p1d-dep2-dep8-8k1k-healthcheck.yaml @@ -103,5 +103,5 @@ benchmark: type: "sa-bench" isl: 8192 osl: 1024 - concurrencies: "256" + concurrencies: "256x512" req_rate: "inf"