Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 50 additions & 0 deletions .github/configs/amd-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1366,6 +1366,56 @@ kimik2.5-fp4-mi355x-vllm-disagg:
additional-settings:
- "DECODE_NODES=2"

# Agentic multinode 1P1D (one prefill node, one decode node) bring-up:
# Mooncake (TCP) carries the current-request prefill->decode KV transfer;
# LMCacheMP is enabled only on the prefill engine for local host-DRAM L2
# prefix reuse. Decode intentionally uses Mooncake only, so that decode-side
# LMCache lookup/retrieve cannot race the remote-prefill load.
# Keep the vLLM GPU memory target (--gpu-memory-utilization) aligned with the
# known-working Mooncake+LMCacheMP recipe. Lower values leave too little KV
# capacity for 262k-token agentic traces, so even low-concurrency warmup
# collapses into capacity waits.
kimik2.5-fp4-mi355x-vllm-disagg-agentic:
image: yukiozzz/kimi-lmc-mc-rocm:dmabuf

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we eliminate the private image?

model: amd/Kimi-K2.5-MXFP4
model-prefix: kimik2.5
runner: mi355x-disagg
precision: fp4
framework: vllm-disagg
multinode: true
disagg: true
scenarios:
agentic-coding:
- duration: 1800
search-space:
- spec-decoding: "none"
conc-list: [ 8, 16, 32 ]
prefill:
num-worker: 1
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "PREFILL_NODES=1"
- "ROUTER_TYPE=mc-proxy"
- "PREFILL_KV_CONNECTOR=mooncake-lmcachemp"
- "DECODE_KV_CONNECTOR=mooncake"
- "MC_PROTOCOL=tcp"
- "ENABLE_PREFIX_CACHING=1"
- "MAX_MODEL_LEN=262144"
- "WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060826_256k"
- "LMCACHE_L1_SIZE_GB=1200"

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How about moving to env.sh?

- "LMCACHE_L1_INIT_SIZE_GB=20"
- "LMCACHE_L1_READ_TTL_SECONDS=7200"
- "LMCACHE_CHUNK_SIZE=256"
- "LMCACHE_MAX_WORKERS=8"
- "LMCACHE_MP_MQ_TIMEOUT=1200"
decode:
num-worker: 1
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "DECODE_NODES=1"

dsr1-fp4-mi355x-sglang-disagg:
image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260519
model: amd/DeepSeek-R1-0528-MXFP4-v2
Expand Down
86 changes: 86 additions & 0 deletions .github/workflows/benchmark-multinode-tmpl.yml
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,13 @@ jobs:
done
fi

- name: Workspace cleanup (pre-checkout)
  run: |
    # Remove artifacts left in the workspace by a previous run before
    # checkout. sudo is used because these paths may not be owned by the
    # runner user; every command is best-effort (|| true) so a missing
    # path never fails the job.
    sudo rm -rf "$GITHUB_WORKSPACE/benchmark_logs" || true
    sudo rm -f "$GITHUB_WORKSPACE"/samples*.jsonl || true
    sudo rm -f "$GITHUB_WORKSPACE"/meta_env.json || true
    # Hand the workspace back to the runner user. Numeric uid:gid is used
    # instead of "$USER":"$USER" because a group named after the user is
    # not guaranteed to exist (e.g. directory-service accounts); in that
    # case chown would fail, be masked by "|| true", and leave root-owned
    # files behind for checkout to trip over. The substitutions run in the
    # calling shell, so they resolve to the runner user, not root.
    sudo chown -R "$(id -u):$(id -g)" "$GITHUB_WORKSPACE" || true

- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
token: ${{ secrets.REPO_PAT }}
Expand Down Expand Up @@ -271,6 +278,85 @@ jobs:
name: bmk_${{ env.RESULT_FILENAME }}
path: agg_${{ env.RESULT_FILENAME }}_*.json

- name: Package debug logs
  if: always()
  run: |
    # Best-effort debug bundle, built even when earlier steps failed
    # (if: always()). Nothing in here may fail the job, hence "set +e" and
    # the "|| true" guards. Output: multinode_server_logs.tar.gz containing
    # the multinode_debug_logs/ directory.
    set +e
    DEBUG_DIR="multinode_debug_logs"
    rm -rf "$DEBUG_DIR" multinode_server_logs.tar.gz
    mkdir -p "$DEBUG_DIR"

    # context.txt: which matrix entry / run produced this bundle.
    {
      echo "result_filename=${RESULT_FILENAME:-unset}"
      echo "exp_name=${EXP_NAME:-unset}"
      echo "runner=${{ runner.name }}"
      echo "runner_type=${{ inputs.runner }}"
      echo "framework=${FRAMEWORK:-unset}"
      echo "scenario_type=${SCENARIO_TYPE:-unset}"
      echo "conc_list=${CONC_LIST:-unset}"
      echo "conc=${CONC:-unset}"
      echo "duration=${DURATION:-unset}"
      echo "model=${MODEL:-unset}"
      echo "model_prefix=${MODEL_PREFIX:-unset}"
      echo "precision=${PRECISION:-unset}"
      echo "github_run_id=${GITHUB_RUN_ID:-unset}"
      echo "github_sha=${GITHUB_SHA:-unset}"
      date -u +"utc_time=%Y-%m-%dT%H:%M:%SZ"
    } > "$DEBUG_DIR/context.txt"

    # Archive the raw log trees when present. sudo: the trees may contain
    # files the runner user cannot read.
    if [ -d benchmark_logs ]; then
      sudo tar -czf "$DEBUG_DIR/benchmark_logs.tar.gz" benchmark_logs 2>/dev/null || true
    fi

    if [ -d LOGS ]; then
      sudo tar -czf "$DEBUG_DIR/LOGS.tar.gz" LOGS 2>/dev/null || true
    fi

    # Copy loose result/log files from the workspace root (not recursive).
    mkdir -p "$DEBUG_DIR/top_level"
    find . -maxdepth 1 -type f \( \
      -name '*.json' -o \
      -name '*.jsonl' -o \
      -name '*.log' -o \
      -name '*.out' -o \
      -name '*.err' -o \
      -name 'meta_env.json' -o \
      -name 'samples*.jsonl' \
      \) -exec cp -f {} "$DEBUG_DIR/top_level/" \; 2>/dev/null || true

    # file_manifest.txt: scheduler/container state plus a size listing of
    # the log trees. Tools that are not installed are skipped silently.
    {
      echo "=== container/slurm/process snapshot ==="
      command -v squeue >/dev/null 2>&1 && squeue -u "$USER" || true
      command -v docker >/dev/null 2>&1 && docker ps -a --format '{{.Names}} {{.Status}} {{.Image}}' || true
      echo
      echo "=== benchmark_logs files ==="
      find benchmark_logs -maxdepth 4 -type f -printf '%p %s bytes\n' 2>/dev/null | sort || true
      echo
      echo "=== LOGS files ==="
      find LOGS -maxdepth 5 -type f -printf '%p %s bytes\n' 2>/dev/null | sort || true
    } > "$DEBUG_DIR/file_manifest.txt"

    # error_summary.txt: for every text-like log file, the last 200 lines
    # matching ERROR_PAT (with line numbers).
    ERROR_PAT='ERROR|Error|Traceback|Exception|AssertionError|RuntimeError|ValueError|Internal Server Error|No available memory|No HIP GPUs|Engine core initialization failed|Run failed|server internal error|PD_BENCH.*rc=[1-9]|GSM8K_.*rc=[1-9]'
    {
      echo "Error summary generated from benchmark_logs, LOGS, and top-level logs."
      echo "Pattern: $ERROR_PAT"
      echo
      # Read paths line by line instead of "for f in $(find ...)": the
      # unquoted command substitution word-splits and glob-expands, so any
      # path containing whitespace would be broken into pieces and skipped.
      find benchmark_logs LOGS "$DEBUG_DIR/top_level" -type f 2>/dev/null | sort |
        while IFS= read -r f; do
          case "$f" in
            *.log|*.out|*.err|*.txt|*.json|*.jsonl|*.csv)
              if sudo grep -E -q "$ERROR_PAT" "$f" 2>/dev/null; then
                echo "===== $f ====="
                sudo grep -E -n "$ERROR_PAT" "$f" 2>/dev/null | tail -200
                echo
              fi
              ;;
          esac
        done
    } > "$DEBUG_DIR/error_summary.txt"

    # The sudo tar calls above leave root-owned archives in DEBUG_DIR; give
    # them back to the runner user before the unprivileged tar below.
    # Numeric uid:gid avoids assuming a group named after the user exists.
    sudo chown -R "$(id -u):$(id -g)" "$DEBUG_DIR" 2>/dev/null || true
    tar -czf multinode_server_logs.tar.gz "$DEBUG_DIR" 2>/dev/null || true
    ls -lh multinode_server_logs.tar.gz || true

- name: Upload server logs
if: always()
uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
Expand Down
13 changes: 11 additions & 2 deletions benchmarks/benchmark_lib.sh
Original file line number Diff line number Diff line change
Expand Up @@ -1108,8 +1108,14 @@ resolve_trace_source() {
semianalysis_cc_traces_weka_with_subagents_256k)
dataset="semianalysisai/cc-traces-weka-with-subagents-052726-256k"
;;
semianalysis_cc_traces_weka_with_subagents_060826)
dataset="semianalysisai/cc-traces-weka-with-subagents-060826"
;;
semianalysis_cc_traces_weka_with_subagents_060826_256k)
dataset="semianalysisai/cc-traces-weka-with-subagents-060826-256k"
;;
*)
echo "Error: unknown WEKA_LOADER_OVERRIDE='$loader'. Allowed: semianalysis_cc_traces_weka_with_subagents, semianalysis_cc_traces_weka_with_subagents_256k" >&2
echo "Error: unknown WEKA_LOADER_OVERRIDE='$loader'. Allowed: semianalysis_cc_traces_weka_with_subagents, semianalysis_cc_traces_weka_with_subagents_256k, semianalysis_cc_traces_weka_with_subagents_060826, semianalysis_cc_traces_weka_with_subagents_060826_256k" >&2
exit 1
;;
esac
Expand Down Expand Up @@ -1183,7 +1189,10 @@ build_replay_cmd() {
REPLAY_CMD+=" --endpoint /v1/chat/completions"
REPLAY_CMD+=" --endpoint-type chat"
REPLAY_CMD+=" --streaming"
REPLAY_CMD+=" --model $MODEL"
REPLAY_CMD+=" --model ${AIPERF_MODEL:-$MODEL}"
if [ -n "${AIPERF_MODEL:-}" ] && [ "${AIPERF_MODEL}" != "$MODEL" ]; then
REPLAY_CMD+=" --tokenizer $MODEL"
fi
REPLAY_CMD+=" --concurrency $CONC"
REPLAY_CMD+=" --benchmark-duration $duration"
REPLAY_CMD+=" --random-seed 42"
Expand Down
39 changes: 39 additions & 0 deletions benchmarks/multi_node/amd_utils/bench.sh
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,45 @@ source "$(dirname "$0")/../../benchmark_lib.sh"

REPO_ROOT="$(cd "$(dirname "$0")/../../.." && pwd)"

# Agentic trace-replay mode. When IS_AGENTIC=1 this branch runs one replay per
# entry of chosen_concurrencies and then exits, so the standard benchmark loop
# that follows this block is never reached.
if [[ "${IS_AGENTIC:-0}" == "1" ]]; then
# PORT mirrors ROUTER_PORT.
# NOTE(review): presumably so the replay client targets the router - confirm
# against the helpers in benchmark_lib.sh.
export PORT="${ROUTER_PORT}"
# Fall back to BENCH_MODEL when MODEL is unset or empty.
export MODEL="${MODEL:-${BENCH_MODEL}}"
if [[ "$ENGINE" == "vllm-disagg" ]]; then
# vLLM disagg serves --served-model-name=$MODEL_NAME. The workflow's
# MODEL env is the HF repo id (e.g. amd/Kimi-K2.5-MXFP4), which vLLM's
# OpenAI endpoint rejects unless it matches the served model name. Keep
# MODEL as result metadata and use AIPERF_MODEL only for the request body.
export AIPERF_MODEL="${AIPERF_MODEL:-${BENCH_MODEL}}"
fi
# Defaults for standalone runs; values already present in the environment win.
export DURATION="${DURATION:-1800}"
export INFMAX_CONTAINER_WORKSPACE="${INFMAX_CONTAINER_WORKSPACE:-/workspace}"
export AGENTIC_OUTPUT_DIR="${AGENTIC_OUTPUT_DIR:-/workspace}"
export RESULT_FILENAME="${RESULT_FILENAME:-agentic_bench}"

# RESULT_DIR is not exported; it is passed explicitly to the helpers below.
RESULT_DIR="${RESULT_DIR:-/workspace/LOGS/agentic}"
mkdir -p "$RESULT_DIR"

# Shell functions expected from the sourced benchmark_lib.sh.
resolve_trace_source
install_agentic_deps

# Multinode agentic matrix entries carry a single concurrency, but keep
# the loop so local one-off runs can pass a small x-separated list.
# A failed replay is recorded in replay_failed but does not stop the loop.
replay_failed=0
for max_concurrency in "${chosen_concurrencies[@]}"; do
export CONC="$max_concurrency"
export USERS="$max_concurrency"
build_replay_cmd "$RESULT_DIR"
run_agentic_replay_and_write_outputs "$RESULT_DIR" || replay_failed=1

# The cooldown runs after every iteration, including the last one.
if [[ "$ENGINE" == "vllm-disagg" ]]; then
echo "[BENCH] Cooldown: waiting 10s for idle KV block reaper..."
sleep 10
fi
done

# Exit status is 1 if any replay failed, otherwise 0.
exit "$replay_failed"
fi

for max_concurrency in "${chosen_concurrencies[@]}"; do

export_file="${profile_folder}/concurrency_${max_concurrency}_req_rate_${chosen_req_rate}_gpus_$((prefill_gpus+decode_gpus))_ctx_${prefill_gpus}_gen_${decode_gpus}"
Expand Down
45 changes: 45 additions & 0 deletions benchmarks/multi_node/amd_utils/job.slurm
Original file line number Diff line number Diff line change
Expand Up @@ -312,6 +312,29 @@ export RESULT_FILENAME="${RESULT_FILENAME:-}"
export SPEC_DECODING="${SPEC_DECODING:-}"
export IS_MULTINODE="${IS_MULTINODE:-false}"

# Agentic / custom vLLM-disagg connector knobs (threaded from submit.sh)
# Each line re-exports the variable, substituting the value after ':-' when the
# incoming value is unset or empty. An empty default ("${VAR:-}") only
# guarantees that the name is defined and exported.
# NOTE(review): MAX_NUM_SEQS is forwarded to the container in DOCKER_ENV_COMMON
# but has no default in this group - confirm it is exported elsewhere.
export IS_AGENTIC="${IS_AGENTIC:-0}"
export DURATION="${DURATION:-1800}"
export MODEL="${MODEL:-}"
# Router selection and port.
export ROUTER_TYPE="${ROUTER_TYPE:-vllm-router}"
export ROUTER_PORT="${ROUTER_PORT:-30000}"
# Optional overrides; exported empty when the caller does not set them.
# NOTE(review): presumably empty means "use the engine default" downstream -
# confirm against the server launch script.
export ENABLE_PREFIX_CACHING="${ENABLE_PREFIX_CACHING:-}"
export MAX_MODEL_LEN="${MAX_MODEL_LEN:-}"
export WEKA_LOADER_OVERRIDE="${WEKA_LOADER_OVERRIDE:-}"
export VLLM_BIND_IP="${VLLM_BIND_IP:-}"
# KV connector per role; both roles default to moriio.
export PREFILL_KV_CONNECTOR="${PREFILL_KV_CONNECTOR:-moriio}"
export DECODE_KV_CONNECTOR="${DECODE_KV_CONNECTOR:-moriio}"
export MC_PROTOCOL="${MC_PROTOCOL:-tcp}"
# LMCache host/ports and tuning values.
export LMCACHE_HOST="${LMCACHE_HOST:-127.0.0.1}"
export LMCACHE_PORT="${LMCACHE_PORT:-5555}"
export LMCACHE_HTTP_PORT="${LMCACHE_HTTP_PORT:-8080}"
export LMCACHE_L1_SIZE_GB="${LMCACHE_L1_SIZE_GB:-2500}"
export LMCACHE_L1_INIT_SIZE_GB="${LMCACHE_L1_INIT_SIZE_GB:-20}"
export LMCACHE_L1_READ_TTL_SECONDS="${LMCACHE_L1_READ_TTL_SECONDS:-3600}"
export LMCACHE_CHUNK_SIZE="${LMCACHE_CHUNK_SIZE:-256}"
export LMCACHE_MAX_WORKERS="${LMCACHE_MAX_WORKERS:-8}"
export LMCACHE_MP_MQ_TIMEOUT="${LMCACHE_MP_MQ_TIMEOUT:-1200}"

SANITIZED_USER=$(echo "$USER_NAME" | tr -c 'a-zA-Z0-9_.-' '_')
export DOCKER_CONT_NAME="container_${ENGINE}_${SANITIZED_USER}_${MODEL_NAME}_${SLURM_JOB_ID}"

Expand Down Expand Up @@ -385,6 +408,28 @@ DOCKER_ENV_COMMON=(
-e DECODE_ENABLE_DP=\$DECODE_ENABLE_DP
-e DECODE_MTP_SIZE=\$DECODE_MTP_SIZE
-e IS_MULTINODE=\$IS_MULTINODE
-e IS_AGENTIC=\$IS_AGENTIC
-e DURATION=\$DURATION
-e MODEL=\$MODEL
-e ROUTER_TYPE=\$ROUTER_TYPE
-e ROUTER_PORT=\$ROUTER_PORT
-e ENABLE_PREFIX_CACHING=\$ENABLE_PREFIX_CACHING
-e MAX_MODEL_LEN=\$MAX_MODEL_LEN
-e MAX_NUM_SEQS=\$MAX_NUM_SEQS
-e WEKA_LOADER_OVERRIDE=\$WEKA_LOADER_OVERRIDE
-e VLLM_BIND_IP=\$VLLM_BIND_IP
-e PREFILL_KV_CONNECTOR=\$PREFILL_KV_CONNECTOR
-e DECODE_KV_CONNECTOR=\$DECODE_KV_CONNECTOR
-e MC_PROTOCOL=\$MC_PROTOCOL
-e LMCACHE_HOST=\$LMCACHE_HOST
-e LMCACHE_PORT=\$LMCACHE_PORT
-e LMCACHE_HTTP_PORT=\$LMCACHE_HTTP_PORT
-e LMCACHE_L1_SIZE_GB=\$LMCACHE_L1_SIZE_GB
-e LMCACHE_L1_INIT_SIZE_GB=\$LMCACHE_L1_INIT_SIZE_GB
-e LMCACHE_L1_READ_TTL_SECONDS=\$LMCACHE_L1_READ_TTL_SECONDS
-e LMCACHE_CHUNK_SIZE=\$LMCACHE_CHUNK_SIZE
-e LMCACHE_MAX_WORKERS=\$LMCACHE_MAX_WORKERS
-e LMCACHE_MP_MQ_TIMEOUT=\$LMCACHE_MP_MQ_TIMEOUT
)

# Engine-specific env vars
Expand Down
6 changes: 3 additions & 3 deletions benchmarks/multi_node/amd_utils/models_vllm.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,9 @@ amd-Llama-3.3-70B-Instruct-FP8-KV:
env: "VLLM_USE_V1=1 VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1 AMDGCN_USE_BUFFER_OPS=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_ROPE=1 TRITON_HIP_ASYNC_COPY_BYPASS_PERMUTE=1 TRITON_HIP_USE_ASYNC_COPY=1 TRITON_HIP_USE_BLOCK_PINGPONG=1 TRITON_HIP_ASYNC_FAST_SWIZZLE=1"

Kimi-K2.5-MXFP4:
prefill_flags: "--tensor-parallel-size 8 --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1 --gpu-memory-utilization 0.90 --mm-encoder-tp-mode data"
decode_flags: "--tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1 --gpu-memory-utilization 0.90 --mm-encoder-tp-mode data"
env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_PAGED_ATTN=0 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_SILU_MUL=0 VLLM_ENGINE_READY_TIMEOUT_S=3600"
prefill_flags: "--tensor-parallel-size 8 --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --kv-cache-dtype fp8 --block-size 1 --gpu-memory-utilization 0.90 --mm-encoder-tp-mode data"
decode_flags: "--tensor-parallel-size 8 --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --kv-cache-dtype fp8 --block-size 1 --gpu-memory-utilization 0.90 --mm-encoder-tp-mode data"
env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_PAGED_ATTN=0 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_SILU_MUL=0 VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 VLLM_ENGINE_READY_TIMEOUT_S=3600 VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS=1200"
hf_dir: "models--amd--Kimi-K2.5-MXFP4"

MiniMax-M2.5:
Expand Down
Loading
Loading