diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 3caa5faae..be8964f8f 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1366,6 +1366,56 @@ kimik2.5-fp4-mi355x-vllm-disagg: additional-settings: - "DECODE_NODES=2" +# Agentic multinode 1P1D bring-up: Mooncake(tcp) carries the current-request +# prefill->decode KV transfer; LMCacheMP is enabled only on the prefill engine +# for local host-DRAM L2 prefix reuse. Decode intentionally uses Mooncake only +# to avoid decode-side LMCache lookup/retrieve racing the remote-prefill load. +# Keep the vLLM GPU memory target aligned with the known-working Mooncake+ +# LMCacheMP recipe. Lower values leave too little KV capacity for 262k agentic +# traces and collapse even low-concurrency warmup into capacity waits. +kimik2.5-fp4-mi355x-vllm-disagg-agentic: + image: yukiozzz/kimi-lmc-mc-rocm:dmabuf + model: amd/Kimi-K2.5-MXFP4 + model-prefix: kimik2.5 + runner: mi355x-disagg + precision: fp4 + framework: vllm-disagg + multinode: true + disagg: true + scenarios: + agentic-coding: + - duration: 1800 + search-space: + - spec-decoding: "none" + conc-list: [ 8, 16, 32 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + - "ROUTER_TYPE=mc-proxy" + - "PREFILL_KV_CONNECTOR=mooncake-lmcachemp" + - "DECODE_KV_CONNECTOR=mooncake" + - "MC_PROTOCOL=tcp" + - "ENABLE_PREFIX_CACHING=1" + - "MAX_MODEL_LEN=262144" + - "WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060826_256k" + - "LMCACHE_L1_SIZE_GB=1200" + - "LMCACHE_L1_INIT_SIZE_GB=20" + - "LMCACHE_L1_READ_TTL_SECONDS=7200" + - "LMCACHE_CHUNK_SIZE=256" + - "LMCACHE_MAX_WORKERS=8" + - "LMCACHE_MP_MQ_TIMEOUT=1200" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + dsr1-fp4-mi355x-sglang-disagg: image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260519 model: amd/DeepSeek-R1-0528-MXFP4-v2 diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml index 3beb246cc..4bff263cf 100644 --- a/.github/workflows/benchmark-multinode-tmpl.yml +++ b/.github/workflows/benchmark-multinode-tmpl.yml @@ -184,6 +184,13 @@ jobs: done fi + - name: Workspace cleanup (pre-checkout) + run: | + sudo rm -rf "$GITHUB_WORKSPACE/benchmark_logs" || true + sudo rm -f "$GITHUB_WORKSPACE"/samples*.jsonl || true + sudo rm -f "$GITHUB_WORKSPACE"/meta_env.json || true + sudo chown -R "$USER":"$USER" "$GITHUB_WORKSPACE" || true + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: token: ${{ secrets.REPO_PAT }} @@ -271,6 +278,85 @@ jobs: name: bmk_${{ env.RESULT_FILENAME }} path: agg_${{ env.RESULT_FILENAME }}_*.json + - name: Package debug logs + if: always() + run: | + set +e + DEBUG_DIR="multinode_debug_logs" + rm -rf "$DEBUG_DIR" multinode_server_logs.tar.gz + mkdir -p "$DEBUG_DIR" + + { + echo "result_filename=${RESULT_FILENAME:-unset}" + echo "exp_name=${EXP_NAME:-unset}" + echo "runner=${{ runner.name }}" + echo "runner_type=${{ inputs.runner }}" + echo "framework=${FRAMEWORK:-unset}" + echo "scenario_type=${SCENARIO_TYPE:-unset}" + echo "conc_list=${CONC_LIST:-unset}" + echo "conc=${CONC:-unset}" + echo "duration=${DURATION:-unset}" + echo "model=${MODEL:-unset}" + echo "model_prefix=${MODEL_PREFIX:-unset}" + echo "precision=${PRECISION:-unset}" + echo "github_run_id=${GITHUB_RUN_ID:-unset}" + echo "github_sha=${GITHUB_SHA:-unset}" + date -u +"utc_time=%Y-%m-%dT%H:%M:%SZ" + } > "$DEBUG_DIR/context.txt" + + if [ -d benchmark_logs ]; then + sudo tar -czf "$DEBUG_DIR/benchmark_logs.tar.gz" benchmark_logs 2>/dev/null || true + fi + + if [ -d LOGS ]; then + sudo tar -czf "$DEBUG_DIR/LOGS.tar.gz" LOGS 2>/dev/null || true + fi + + mkdir -p "$DEBUG_DIR/top_level" + find . -maxdepth 1 -type f \( \ + -name '*.json' -o \ + -name '*.jsonl' -o \ + -name '*.log' -o \ + -name '*.out' -o \ + -name '*.err' -o \ + -name 'meta_env.json' -o \ + -name 'samples*.jsonl' \ + \) -exec cp -f {} "$DEBUG_DIR/top_level/" \; 2>/dev/null || true + + { + echo "=== container/slurm/process snapshot ===" + command -v squeue >/dev/null 2>&1 && squeue -u "$USER" || true + command -v docker >/dev/null 2>&1 && docker ps -a --format '{{.Names}} {{.Status}} {{.Image}}' || true + echo + echo "=== benchmark_logs files ===" + find benchmark_logs -maxdepth 4 -type f -printf '%p %s bytes\n' 2>/dev/null | sort || true + echo + echo "=== LOGS files ===" + find LOGS -maxdepth 5 -type f -printf '%p %s bytes\n' 2>/dev/null | sort || true + } > "$DEBUG_DIR/file_manifest.txt" + + ERROR_PAT='ERROR|Error|Traceback|Exception|AssertionError|RuntimeError|ValueError|Internal Server Error|No available memory|No HIP GPUs|Engine core initialization failed|Run failed|server internal error|PD_BENCH.*rc=[1-9]|GSM8K_.*rc=[1-9]' + { + echo "Error summary generated from benchmark_logs, LOGS, and top-level logs." + echo "Pattern: $ERROR_PAT" + echo + for f in $(find benchmark_logs LOGS "$DEBUG_DIR/top_level" -type f 2>/dev/null | sort); do + case "$f" in + *.log|*.out|*.err|*.txt|*.json|*.jsonl|*.csv) + if sudo grep -E -q "$ERROR_PAT" "$f" 2>/dev/null; then + echo "===== $f =====" + sudo grep -E -n "$ERROR_PAT" "$f" 2>/dev/null | tail -200 + echo + fi + ;; + esac + done + } > "$DEBUG_DIR/error_summary.txt" + + sudo chown -R "$USER":"$USER" "$DEBUG_DIR" 2>/dev/null || true + tar -czf multinode_server_logs.tar.gz "$DEBUG_DIR" 2>/dev/null || true + ls -lh multinode_server_logs.tar.gz || true + - name: Upload server logs if: always() uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 95e063a3d..c6379987d 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -1108,8 +1108,14 @@ resolve_trace_source() { semianalysis_cc_traces_weka_with_subagents_256k) dataset="semianalysisai/cc-traces-weka-with-subagents-052726-256k" ;; + semianalysis_cc_traces_weka_with_subagents_060826) + dataset="semianalysisai/cc-traces-weka-with-subagents-060826" + ;; + semianalysis_cc_traces_weka_with_subagents_060826_256k) + dataset="semianalysisai/cc-traces-weka-with-subagents-060826-256k" + ;; *) - echo "Error: unknown WEKA_LOADER_OVERRIDE='$loader'. Allowed: semianalysis_cc_traces_weka_with_subagents, semianalysis_cc_traces_weka_with_subagents_256k" >&2 + echo "Error: unknown WEKA_LOADER_OVERRIDE='$loader'. Allowed: semianalysis_cc_traces_weka_with_subagents, semianalysis_cc_traces_weka_with_subagents_256k, semianalysis_cc_traces_weka_with_subagents_060826, semianalysis_cc_traces_weka_with_subagents_060826_256k" >&2 exit 1 ;; esac @@ -1183,7 +1189,10 @@ build_replay_cmd() { REPLAY_CMD+=" --endpoint /v1/chat/completions" REPLAY_CMD+=" --endpoint-type chat" REPLAY_CMD+=" --streaming" - REPLAY_CMD+=" --model $MODEL" + REPLAY_CMD+=" --model ${AIPERF_MODEL:-$MODEL}" + if [ -n "${AIPERF_MODEL:-}" ] && [ "${AIPERF_MODEL}" != "$MODEL" ]; then + REPLAY_CMD+=" --tokenizer $MODEL" + fi REPLAY_CMD+=" --concurrency $CONC" REPLAY_CMD+=" --benchmark-duration $duration" REPLAY_CMD+=" --random-seed 42" diff --git a/benchmarks/multi_node/amd_utils/bench.sh b/benchmarks/multi_node/amd_utils/bench.sh index 05384f435..f5c2b8b62 100755 --- a/benchmarks/multi_node/amd_utils/bench.sh +++ b/benchmarks/multi_node/amd_utils/bench.sh @@ -55,6 +55,45 @@ source "$(dirname "$0")/../../benchmark_lib.sh" REPO_ROOT="$(cd "$(dirname "$0")/../../.." && pwd)" +if [[ "${IS_AGENTIC:-0}" == "1" ]]; then + export PORT="${ROUTER_PORT}" + export MODEL="${MODEL:-${BENCH_MODEL}}" + if [[ "$ENGINE" == "vllm-disagg" ]]; then + # vLLM disagg serves --served-model-name=$MODEL_NAME. The workflow's + # MODEL env is the HF repo id (e.g. amd/Kimi-K2.5-MXFP4), which vLLM's + # OpenAI endpoint rejects unless it matches the served model name. Keep + # MODEL as result metadata and use AIPERF_MODEL only for the request body. + export AIPERF_MODEL="${AIPERF_MODEL:-${BENCH_MODEL}}" + fi + export DURATION="${DURATION:-1800}" + export INFMAX_CONTAINER_WORKSPACE="${INFMAX_CONTAINER_WORKSPACE:-/workspace}" + export AGENTIC_OUTPUT_DIR="${AGENTIC_OUTPUT_DIR:-/workspace}" + export RESULT_FILENAME="${RESULT_FILENAME:-agentic_bench}" + + RESULT_DIR="${RESULT_DIR:-/workspace/LOGS/agentic}" + mkdir -p "$RESULT_DIR" + + resolve_trace_source + install_agentic_deps + + # Multinode agentic matrix entries carry a single concurrency, but keep + # the loop so local one-off runs can pass a small x-separated list. + replay_failed=0 + for max_concurrency in "${chosen_concurrencies[@]}"; do + export CONC="$max_concurrency" + export USERS="$max_concurrency" + build_replay_cmd "$RESULT_DIR" + run_agentic_replay_and_write_outputs "$RESULT_DIR" || replay_failed=1 + + if [[ "$ENGINE" == "vllm-disagg" ]]; then + echo "[BENCH] Cooldown: waiting 10s for idle KV block reaper..." + sleep 10 + fi + done + + exit "$replay_failed" +fi + for max_concurrency in "${chosen_concurrencies[@]}"; do export_file="${profile_folder}/concurrency_${max_concurrency}_req_rate_${chosen_req_rate}_gpus_$((prefill_gpus+decode_gpus))_ctx_${prefill_gpus}_gen_${decode_gpus}" diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm index 01a5bd386..f1c6765aa 100755 --- a/benchmarks/multi_node/amd_utils/job.slurm +++ b/benchmarks/multi_node/amd_utils/job.slurm @@ -312,6 +312,29 @@ export RESULT_FILENAME="${RESULT_FILENAME:-}" export SPEC_DECODING="${SPEC_DECODING:-}" export IS_MULTINODE="${IS_MULTINODE:-false}" +# Agentic / custom vLLM-disagg connector knobs (threaded from submit.sh) +export IS_AGENTIC="${IS_AGENTIC:-0}" +export DURATION="${DURATION:-1800}" +export MODEL="${MODEL:-}" +export ROUTER_TYPE="${ROUTER_TYPE:-vllm-router}" +export ROUTER_PORT="${ROUTER_PORT:-30000}" +export ENABLE_PREFIX_CACHING="${ENABLE_PREFIX_CACHING:-}" +export MAX_MODEL_LEN="${MAX_MODEL_LEN:-}" +export WEKA_LOADER_OVERRIDE="${WEKA_LOADER_OVERRIDE:-}" +export VLLM_BIND_IP="${VLLM_BIND_IP:-}" +export PREFILL_KV_CONNECTOR="${PREFILL_KV_CONNECTOR:-moriio}" +export DECODE_KV_CONNECTOR="${DECODE_KV_CONNECTOR:-moriio}" +export MC_PROTOCOL="${MC_PROTOCOL:-tcp}" +export LMCACHE_HOST="${LMCACHE_HOST:-127.0.0.1}" +export LMCACHE_PORT="${LMCACHE_PORT:-5555}" +export LMCACHE_HTTP_PORT="${LMCACHE_HTTP_PORT:-8080}" +export LMCACHE_L1_SIZE_GB="${LMCACHE_L1_SIZE_GB:-2500}" +export LMCACHE_L1_INIT_SIZE_GB="${LMCACHE_L1_INIT_SIZE_GB:-20}" +export LMCACHE_L1_READ_TTL_SECONDS="${LMCACHE_L1_READ_TTL_SECONDS:-3600}" +export LMCACHE_CHUNK_SIZE="${LMCACHE_CHUNK_SIZE:-256}" +export LMCACHE_MAX_WORKERS="${LMCACHE_MAX_WORKERS:-8}" +export LMCACHE_MP_MQ_TIMEOUT="${LMCACHE_MP_MQ_TIMEOUT:-1200}" + SANITIZED_USER=$(echo "$USER_NAME" | tr -c 'a-zA-Z0-9_.-' '_') export DOCKER_CONT_NAME="container_${ENGINE}_${SANITIZED_USER}_${MODEL_NAME}_${SLURM_JOB_ID}" @@ -385,6 +408,28 @@ DOCKER_ENV_COMMON=( -e DECODE_ENABLE_DP=\$DECODE_ENABLE_DP -e DECODE_MTP_SIZE=\$DECODE_MTP_SIZE -e IS_MULTINODE=\$IS_MULTINODE + -e IS_AGENTIC=\$IS_AGENTIC + -e DURATION=\$DURATION + -e MODEL=\$MODEL + -e ROUTER_TYPE=\$ROUTER_TYPE + -e ROUTER_PORT=\$ROUTER_PORT + -e ENABLE_PREFIX_CACHING=\$ENABLE_PREFIX_CACHING + -e MAX_MODEL_LEN=\$MAX_MODEL_LEN + -e MAX_NUM_SEQS=\$MAX_NUM_SEQS + -e WEKA_LOADER_OVERRIDE=\$WEKA_LOADER_OVERRIDE + -e VLLM_BIND_IP=\$VLLM_BIND_IP + -e PREFILL_KV_CONNECTOR=\$PREFILL_KV_CONNECTOR + -e DECODE_KV_CONNECTOR=\$DECODE_KV_CONNECTOR + -e MC_PROTOCOL=\$MC_PROTOCOL + -e LMCACHE_HOST=\$LMCACHE_HOST + -e LMCACHE_PORT=\$LMCACHE_PORT + -e LMCACHE_HTTP_PORT=\$LMCACHE_HTTP_PORT + -e LMCACHE_L1_SIZE_GB=\$LMCACHE_L1_SIZE_GB + -e LMCACHE_L1_INIT_SIZE_GB=\$LMCACHE_L1_INIT_SIZE_GB + -e LMCACHE_L1_READ_TTL_SECONDS=\$LMCACHE_L1_READ_TTL_SECONDS + -e LMCACHE_CHUNK_SIZE=\$LMCACHE_CHUNK_SIZE + -e LMCACHE_MAX_WORKERS=\$LMCACHE_MAX_WORKERS + -e LMCACHE_MP_MQ_TIMEOUT=\$LMCACHE_MP_MQ_TIMEOUT ) # Engine-specific env vars diff --git a/benchmarks/multi_node/amd_utils/models_vllm.yaml b/benchmarks/multi_node/amd_utils/models_vllm.yaml index b051de8d9..a61947a4c 100644 --- a/benchmarks/multi_node/amd_utils/models_vllm.yaml +++ b/benchmarks/multi_node/amd_utils/models_vllm.yaml @@ -25,9 +25,9 @@ amd-Llama-3.3-70B-Instruct-FP8-KV: env: "VLLM_USE_V1=1 VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1 AMDGCN_USE_BUFFER_OPS=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_ROPE=1 TRITON_HIP_ASYNC_COPY_BYPASS_PERMUTE=1 TRITON_HIP_USE_ASYNC_COPY=1 TRITON_HIP_USE_BLOCK_PINGPONG=1 TRITON_HIP_ASYNC_FAST_SWIZZLE=1" Kimi-K2.5-MXFP4: - prefill_flags: "--tensor-parallel-size 8 --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1 --gpu-memory-utilization 0.90 --mm-encoder-tp-mode data" - decode_flags: "--tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1 --gpu-memory-utilization 0.90 --mm-encoder-tp-mode data" - env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_PAGED_ATTN=0 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_SILU_MUL=0 VLLM_ENGINE_READY_TIMEOUT_S=3600" + prefill_flags: "--tensor-parallel-size 8 --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --kv-cache-dtype fp8 --block-size 1 --gpu-memory-utilization 0.90 --mm-encoder-tp-mode data" + decode_flags: "--tensor-parallel-size 8 --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --kv-cache-dtype fp8 --block-size 1 --gpu-memory-utilization 0.90 --mm-encoder-tp-mode data" + env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_PAGED_ATTN=0 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_SILU_MUL=0 VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 VLLM_ENGINE_READY_TIMEOUT_S=3600 VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS=1200" hf_dir: "models--amd--Kimi-K2.5-MXFP4" MiniMax-M2.5: diff --git a/benchmarks/multi_node/amd_utils/mooncake_lmcache_proxy.py b/benchmarks/multi_node/amd_utils/mooncake_lmcache_proxy.py new file mode 100644 index 000000000..1612b0314 --- /dev/null +++ b/benchmarks/multi_node/amd_utils/mooncake_lmcache_proxy.py @@ -0,0 +1,297 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import argparse +import itertools +import logging +import os +import uuid +from contextlib import asynccontextmanager + +import httpx +from fastapi import FastAPI, Request +from fastapi.responses import StreamingResponse + +logger = logging.getLogger(__name__) +logger.setLevel(logging.DEBUG) + + +@asynccontextmanager +async def lifespan(app: FastAPI): + """ + Lifespan context manager to handle startup and shutdown events. + """ + # Startup: Initialize client pools for prefiller and decoder services + app.state.prefill_clients = [] + app.state.decode_clients = [] + + # Create prefill clients + for i, (host, port) in enumerate(global_args.prefiller_instances): + prefiller_base_url = f"http://{host}:{port}/v1" + app.state.prefill_clients.append( + { + "client": httpx.AsyncClient( + timeout=None, + base_url=prefiller_base_url, + limits=httpx.Limits( + max_connections=None, + max_keepalive_connections=None, + ), + ), + "host": host, + "port": port, + "id": i, + } + ) + + # Create decode clients + for i, (host, port) in enumerate(global_args.decoder_instances): + decoder_base_url = f"http://{host}:{port}/v1" + app.state.decode_clients.append( + { + "client": httpx.AsyncClient( + timeout=None, + base_url=decoder_base_url, + limits=httpx.Limits( + max_connections=None, + max_keepalive_connections=None, + ), + ), + "host": host, + "port": port, + "id": i, + } + ) + + # Initialize round-robin iterators + app.state.prefill_iterator = itertools.cycle(range(len(app.state.prefill_clients))) + app.state.decode_iterator = itertools.cycle(range(len(app.state.decode_clients))) + + print( + f"Initialized {len(app.state.prefill_clients)} prefill clients " + f"and {len(app.state.decode_clients)} decode clients." + ) + + yield + + # Shutdown: Close all clients + for client_info in app.state.prefill_clients: + await client_info["client"].aclose() + + for client_info in app.state.decode_clients: + await client_info["client"].aclose() + + +# Update FastAPI app initialization to use lifespan +app = FastAPI(lifespan=lifespan) + + +def parse_args(): + parser = argparse.ArgumentParser() + + parser.add_argument("--port", type=int, default=8000) + # Always use 127.0.0.1 as localhost binds to IPv6 which is blocked on CI + parser.add_argument("--host", type=str, default="127.0.0.1") + + # For prefiller instances + parser.add_argument( + "--prefiller-hosts", + "--prefiller-host", + type=str, + nargs="+", + default=["localhost"], + ) + parser.add_argument( + "--prefiller-ports", "--prefiller-port", type=int, nargs="+", default=[8100] + ) + + # For decoder instances + parser.add_argument( + "--decoder-hosts", "--decoder-host", type=str, nargs="+", default=["localhost"] + ) + parser.add_argument( + "--decoder-ports", "--decoder-port", type=int, nargs="+", default=[8200] + ) + + args = parser.parse_args() + + # Validate and pair hosts with ports + if len(args.prefiller_hosts) != len(args.prefiller_ports): + raise ValueError( + "Number of prefiller hosts must match number of prefiller ports" + ) + + if len(args.decoder_hosts) != len(args.decoder_ports): + raise ValueError("Number of decoder hosts must match number of decoder ports") + + # Create tuples of (host, port) for each service type + args.prefiller_instances = list(zip(args.prefiller_hosts, args.prefiller_ports)) + args.decoder_instances = list(zip(args.decoder_hosts, args.decoder_ports)) + + return args + + +def get_next_client(app, service_type: str): + """ + Get the next client in round-robin fashion. + + Args: + app: The FastAPI app instance + service_type: Either 'prefill' or 'decode' + + Returns: + The next client to use + """ + if service_type == "prefill": + client_idx = next(app.state.prefill_iterator) + return app.state.prefill_clients[client_idx] + elif service_type == "decode": + client_idx = next(app.state.decode_iterator) + return app.state.decode_clients[client_idx] + else: + raise ValueError(f"Unknown service type: {service_type}") + + +async def send_request_to_service( + client_info: dict, endpoint: str, req_data: dict, request_id: str +): + """ + Send a request to a service using a client from the pool. + """ + req_data = req_data.copy() + req_data["kv_transfer_params"] = { + "do_remote_decode": True, + "do_remote_prefill": False, + "transfer_id": request_id, + "remote_engine_id": None, + "remote_block_ids": None, + "remote_host": None, + "remote_port": None, + } + req_data["stream"] = False + req_data["max_tokens"] = 1 + if "max_completion_tokens" in req_data: + req_data["max_completion_tokens"] = 1 + if "stream_options" in req_data: + del req_data["stream_options"] + # These args are not supported for P + min_tokens = req_data.pop("min_tokens", None) + min_completion_tokens = req_data.pop("min_completion_tokens", None) + headers = { + "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", + "X-Request-Id": request_id, + } + + response = await client_info["client"].post( + endpoint, json=req_data, headers=headers + ) + response.raise_for_status() + + # read/consume the response body to release the connection + # otherwise, it would http.ReadError + await response.aread() + + # Add back the min_tokens and min_completion_tokens so D can use them + req_data["min_tokens"] = min_tokens + req_data["min_completion_tokens"] = min_completion_tokens + + return response + + +async def stream_service_response( + client_info: dict, endpoint: str, req_data: dict, request_id: str +): + """ + Asynchronously stream response from a service using a client from the pool. + """ + headers = { + "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", + "X-Request-Id": request_id, + } + + async with client_info["client"].stream( + "POST", endpoint, json=req_data, headers=headers + ) as response: + response.raise_for_status() + async for chunk in response.aiter_bytes(): + yield chunk + + +async def _handle_completions(api: str, request: Request): + try: + req_data = await request.json() + request_id = str(uuid.uuid4()) + + # Get the next prefill client in round-robin fashion + prefill_client_info = get_next_client(request.app, "prefill") + + # Send request to prefill service + response = await send_request_to_service( + prefill_client_info, api, req_data, request_id + ) + + # Extract the needed fields + response_json = response.json() + await response.aclose() # CRITICAL: Release connection back to pool + kv_transfer_params = response_json.get("kv_transfer_params", {}) + if kv_transfer_params: + req_data["kv_transfer_params"] = kv_transfer_params + + # Get the next decode client in round-robin fashion + decode_client_info = get_next_client(request.app, "decode") + + logger.debug("Using %s %s", prefill_client_info, decode_client_info) + + # Stream response from decode service + async def generate_stream(): + async for chunk in stream_service_response( + decode_client_info, api, req_data, request_id=request_id + ): + yield chunk + + return StreamingResponse(generate_stream(), media_type="text/event-stream") + + except Exception as e: + import sys + import traceback + + exc_info = sys.exc_info() + print(f"Error occurred in disagg prefill proxy server - {api} endpoint") + print(e) + print("".join(traceback.format_exception(*exc_info))) + raise + + +@app.post("/v1/completions") +async def handle_completions(request: Request): + return await _handle_completions("/completions", request) + + +@app.post("/v1/chat/completions") +async def handle_chat_completions(request: Request): + return await _handle_completions("/chat/completions", request) + + +@app.get("/healthcheck") +async def healthcheck(): + """Simple endpoint to check if the server is running.""" + return { + "status": "ok", + "prefill_instances": len(app.state.prefill_clients), + "decode_instances": len(app.state.decode_clients), + } + + +@app.get("/health") +async def health(): + """Compatibility endpoint for the existing multinode health barrier.""" + return await healthcheck() + + +if __name__ == "__main__": + global global_args + global_args = parse_args() + + import uvicorn + + uvicorn.run(app, host=global_args.host, port=global_args.port) diff --git a/benchmarks/multi_node/amd_utils/server_vllm.sh b/benchmarks/multi_node/amd_utils/server_vllm.sh index d61fe0359..aad6e3fe7 100755 --- a/benchmarks/multi_node/amd_utils/server_vllm.sh +++ b/benchmarks/multi_node/amd_utils/server_vllm.sh @@ -181,6 +181,44 @@ fi echo "PREFILL_SERVER_CONFIG (after TP/EP/DP): $PREFILL_SERVER_CONFIG" echo "DECODE_SERVER_CONFIG (after TP/EP/DP): $DECODE_SERVER_CONFIG" +if [[ "${ENABLE_PREFIX_CACHING:-0}" == "1" ]]; then + for _cfg in PREFILL_SERVER_CONFIG DECODE_SERVER_CONFIG; do + _val="${!_cfg}" + _val="${_val//--no-enable-prefix-caching/}" + if ! echo "$_val" | grep -q -- '--enable-prefix-caching'; then + _val+=" --enable-prefix-caching" + fi + printf -v "$_cfg" '%s' "$_val" + done + echo "[vLLM] ENABLE_PREFIX_CACHING=1 -> prefix cache enabled on prefill + decode" +fi + +if [[ -n "${MAX_MODEL_LEN:-}" && "${MAX_MODEL_LEN}" != "0" ]]; then + for _cfg in PREFILL_SERVER_CONFIG DECODE_SERVER_CONFIG; do + _val="${!_cfg}" + if echo "$_val" | grep -q -- '--max-model-len'; then + _val=$(echo "$_val" | sed -E "s/--max-model-len[=[:space:]]+[0-9]+/--max-model-len ${MAX_MODEL_LEN}/g") + else + _val+=" --max-model-len ${MAX_MODEL_LEN}" + fi + printf -v "$_cfg" '%s' "$_val" + done + echo "[vLLM] MAX_MODEL_LEN=${MAX_MODEL_LEN}" +fi + +if [[ -n "${MAX_NUM_SEQS:-}" && "${MAX_NUM_SEQS}" != "0" ]]; then + for _cfg in PREFILL_SERVER_CONFIG DECODE_SERVER_CONFIG; do + _val="${!_cfg}" + if echo "$_val" | grep -q -- '--max-num-seqs'; then + _val=$(echo "$_val" | sed -E "s/--max-num-seqs[=[:space:]]+[0-9]+/--max-num-seqs ${MAX_NUM_SEQS}/g") + else + _val+=" --max-num-seqs ${MAX_NUM_SEQS}" + fi + printf -v "$_cfg" '%s' "$_val" + done + echo "[vLLM] MAX_NUM_SEQS=${MAX_NUM_SEQS}" +fi + # ============================================================================= # Container Synchronization # ============================================================================= @@ -217,13 +255,161 @@ echo "Decode node IPs: ${DECODE_ARGS}" # MoRI-IO proxy ZMQ registration port (must match vllm-router --vllm-discovery-address) PROXY_PING_PORT="${PROXY_PING_PORT:-36367}" +# ============================================================================= +# KV connector selection +# ============================================================================= +_MORIIO_EXTRA="\"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\"}" +KVT_PREFILL="{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_producer\", ${_MORIIO_EXTRA}}" +KVT_DECODE="{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_consumer\", ${_MORIIO_EXTRA}}" + +MC_PREFILL_CONN="{\"kv_connector\":\"MooncakeConnector\",\"kv_role\":\"kv_producer\",\"kv_connector_extra_config\":{\"mooncake_protocol\":\"${MC_PROTOCOL:-tcp}\"}}" +MC_DECODE_CONN="{\"kv_connector\":\"MooncakeConnector\",\"kv_role\":\"kv_consumer\",\"kv_connector_extra_config\":{\"mooncake_protocol\":\"${MC_PROTOCOL:-tcp}\"}}" +LMCACHE_CONNECT_HOST="${LMCACHE_CONNECT_HOST:-tcp://${LMCACHE_HOST:-127.0.0.1}}" +LMC_CONN="{\"kv_connector\":\"LMCacheMPConnector\",\"kv_connector_module_path\":\"lmcache.integration.vllm.lmcache_mp_connector\",\"kv_role\":\"kv_both\",\"kv_connector_extra_config\":{\"lmcache.mp.host\":\"${LMCACHE_CONNECT_HOST}\",\"lmcache.mp.port\":${LMCACHE_PORT:-5555},\"lmcache.mp.mq_timeout\":${LMCACHE_MP_MQ_TIMEOUT:-1200}}}" + +case "${PREFILL_KV_CONNECTOR:-moriio}" in + mooncake-lmcachemp) + KVT_PREFILL="{\"kv_connector\":\"MultiConnector\",\"kv_role\":\"kv_producer\",\"kv_connector_extra_config\":{\"connectors\":[${MC_PREFILL_CONN},${LMC_CONN}]}}" + ;; + mooncake) + KVT_PREFILL="$MC_PREFILL_CONN" + ;; + moriio|"") + ;; + *) + echo "ERROR: unsupported PREFILL_KV_CONNECTOR=${PREFILL_KV_CONNECTOR}" >&2 + exit 1 + ;; +esac + +case "${DECODE_KV_CONNECTOR:-moriio}" in + mooncake) + KVT_DECODE="$MC_DECODE_CONN" + ;; + moriio|"") + ;; + *) + echo "ERROR: unsupported DECODE_KV_CONNECTOR=${DECODE_KV_CONNECTOR}" >&2 + exit 1 + ;; +esac + +if [[ "$NODE_RANK" -lt "$xP" ]]; then + ROLE_KV_CONNECTOR="${PREFILL_KV_CONNECTOR:-moriio}" +else + ROLE_KV_CONNECTOR="${DECODE_KV_CONNECTOR:-moriio}" +fi +echo "[KV] PREFILL_KV_CONNECTOR=${PREFILL_KV_CONNECTOR:-moriio}; DECODE_KV_CONNECTOR=${DECODE_KV_CONNECTOR:-moriio}; rank connector=${ROLE_KV_CONNECTOR}" + # vLLM runtime environment (static vars moved to env.sh; these depend on per-node state) setup_vllm_env() { - export VLLM_NIXL_SIDE_CHANNEL_HOST=${rdma_ip} + local bind_ip="${VLLM_BIND_IP:-${rdma_ip}}" + export VLLM_HOST_IP="${bind_ip}" + export VLLM_NIXL_SIDE_CHANNEL_HOST="${bind_ip}" export VLLM_NIXL_SIDE_CHANNEL_PORT=5600 for env_pair in ${MODEL_ENVS}; do export "$env_pair" done + echo "[vLLM] VLLM_HOST_IP=${VLLM_HOST_IP}" +} + +start_lmcache_mp_if_needed() { + if [[ "$ROLE_KV_CONNECTOR" != "mooncake-lmcachemp" ]]; then + return 0 + fi + + LMCACHE_PATCH_DIR="/run_logs/slurm_job-${SLURM_JOB_ID}/lmcache_mp_patch" + mkdir -p "$LMCACHE_PATCH_DIR" + cat > "$LMCACHE_PATCH_DIR/sitecustomize.py" <<'PY' +"""Keep LMCacheMP from producing proxy-visible PD transfer params. + +MultiConnector permits only one child connector to return kv_transfer_params. +Mooncake owns the prefill->decode PD protocol; LMCacheMP should only provide +local L2 lookup/retrieve/store on the prefill engine. +""" +import builtins +import sys + +_orig_import = builtins.__import__ + + +def _patch_module(mod): + cls = getattr(mod, "LMCacheMPConnector", None) + if cls is None or getattr(cls, "_inferencex_pd_params_patch", False): + return + orig = cls.request_finished + + def request_finished(self, request, block_ids): + async_save, params = orig(self, request, block_ids) + req_params = getattr(request, "kv_transfer_params", None) + if req_params and ( + req_params.get("do_remote_decode") or req_params.get("do_remote_prefill") + ): + return async_save, None + return async_save, params + + cls.request_finished = request_finished + cls._inferencex_pd_params_patch = True + + +def _import(name, globals=None, locals=None, fromlist=(), level=0): + mod = _orig_import(name, globals, locals, fromlist, level) + target = "lmcache.integration.vllm.lmcache_mp_connector" + if name == target or target in sys.modules: + _patch_module(sys.modules[target]) + return mod + + +builtins.__import__ = _import +if "lmcache.integration.vllm.lmcache_mp_connector" in sys.modules: + _patch_module(sys.modules["lmcache.integration.vllm.lmcache_mp_connector"]) + +try: + from vllm.distributed.kv_transfer.kv_connector.v1 import multi_connector + _orig_observe = multi_connector.MultiConnectorPromMetrics.observe + + def _safe_observe(self, transfer_stats_data, engine_idx): + if not isinstance(transfer_stats_data, dict): + return _orig_observe(self, transfer_stats_data, engine_idx) + filtered = { + connector_id: stats_data + for connector_id, stats_data in transfer_stats_data.items() + if connector_id in getattr(self, "_prom_metrics", {}) + } + if filtered: + return _orig_observe(self, filtered, engine_idx) + + multi_connector.MultiConnectorPromMetrics.observe = _safe_observe +except Exception: + pass +PY + export PYTHONPATH="$LMCACHE_PATCH_DIR${PYTHONPATH:+:$PYTHONPATH}" + + python3 -c 'import lmcache.integration.vllm.lmcache_mp_connector; print("LMCacheMPConnector import OK")' + + LMCACHE_LOG="/run_logs/slurm_job-${SLURM_JOB_ID}/lmcache_${host_name}.log" + echo "[LMCacheMP] starting server on ${LMCACHE_HOST:-127.0.0.1}:${LMCACHE_PORT:-5555} (http ${LMCACHE_HTTP_PORT:-8080})" + lmcache server \ + --host "${LMCACHE_HOST:-127.0.0.1}" --port "${LMCACHE_PORT:-5555}" \ + --http-host "${LMCACHE_HOST:-127.0.0.1}" --http-port "${LMCACHE_HTTP_PORT:-8080}" \ + --l1-size-gb "${LMCACHE_L1_SIZE_GB:-2500}" \ + --l1-init-size-gb "${LMCACHE_L1_INIT_SIZE_GB:-20}" \ + --l1-read-ttl-seconds "${LMCACHE_L1_READ_TTL_SECONDS:-3600}" \ + --chunk-size "${LMCACHE_CHUNK_SIZE:-256}" \ + --max-workers "${LMCACHE_MAX_WORKERS:-8}" \ + --eviction-policy LRU \ + > "$LMCACHE_LOG" 2>&1 & + + for _i in $(seq 1 120); do + if curl -sf --max-time 3 "http://${LMCACHE_HOST:-127.0.0.1}:${LMCACHE_HTTP_PORT:-8080}/healthcheck" >/dev/null 2>&1; then + echo "[LMCacheMP] server healthy" + return 0 + fi + sleep 2 + done + echo "ERROR: LMCache MP server failed to become healthy; tailing $LMCACHE_LOG" >&2 + tail -n 80 "$LMCACHE_LOG" >&2 || true + return 1 } # ============================================================================= @@ -247,16 +433,21 @@ if [ "$NODE_RANK" -eq 0 ]; then echo "================================================" setup_vllm_env + start_lmcache_mp_if_needed - # Router is started as an external container by job.slurm (VLLM_ROUTER_IMAGE) - echo "Using external vllm-router container (started by job.slurm on this node)" + if [[ "${ROUTER_TYPE:-vllm-router}" == "mc-proxy" ]]; then + echo "Using in-container Mooncake/LMCache PD proxy on ROUTER_PORT=${ROUTER_PORT}" + else + # Router is started as an external container by job.slurm (VLLM_ROUTER_IMAGE) + echo "Using external vllm-router container (started by job.slurm on this node)" + fi SERVED_MODEL="${MODEL_NAME}" PREFILL_CMD="vllm serve ${MODEL_PATH} \ --served-model-name ${SERVED_MODEL} \ --port $SERVER_PORT \ --trust-remote-code \ - --kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_producer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\"}}' \ + --kv-transfer-config '${KVT_PREFILL}' \ ${PREFILL_SERVER_CONFIG}" if [[ "$DRY_RUN" -eq 1 ]]; then @@ -282,6 +473,18 @@ if [ "$NODE_RANK" -eq 0 ]; then echo "Congratulations!!! All prefill and decode servers are up . . ." + if [[ "${ROUTER_TYPE:-vllm-router}" == "mc-proxy" ]]; then + MC_PROXY_DECODER_IP="${IP_ARRAY[$xP]:-${NODE0_ADDR}}" + MC_PROXY_LOG="/run_logs/slurm_job-${SLURM_JOB_ID}/mc_lmcache_pd_proxy.log" + echo "[Mooncake] starting in-container PD proxy :${ROUTER_PORT} (P=${NODE0_ADDR}:${SERVER_PORT} D=${MC_PROXY_DECODER_IP}:${SERVER_PORT})" + python3 -c 'import httpx,fastapi,uvicorn' 2>/dev/null || pip install -q httpx fastapi uvicorn + ( python3 "$WS_PATH/mooncake_lmcache_proxy.py" --host 0.0.0.0 --port "${ROUTER_PORT}" \ + --prefiller-hosts "${NODE0_ADDR}" --prefiller-ports "${SERVER_PORT}" \ + --decoder-hosts "${MC_PROXY_DECODER_IP}" --decoder-ports "${SERVER_PORT}" \ + > "$MC_PROXY_LOG" 2>&1 & ) || echo "[Mooncake] WARN: proxy failed to start (see $MC_PROXY_LOG)" + sleep 6 + fi + # Wait for proxy /health to confirm it is accepting requests HEALTH_BARRIER_CMD="python3 $WS_PATH/sync.py barrier \ --node-ips ${NODE0_ADDR} \ @@ -416,13 +619,14 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$xP" ]; then echo "Using prefill config: $PREFILL_SERVER_CONFIG" setup_vllm_env + start_lmcache_mp_if_needed SERVED_MODEL="${MODEL_NAME}" PREFILL_CMD="vllm serve ${MODEL_PATH} \ --served-model-name ${SERVED_MODEL} \ --port $SERVER_PORT \ --trust-remote-code \ - --kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_producer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\"}}' \ + --kv-transfer-config '${KVT_PREFILL}' \ ${PREFILL_SERVER_CONFIG}" if [[ "$DRY_RUN" -eq 1 ]]; then @@ -478,7 +682,7 @@ else --served-model-name ${SERVED_MODEL} \ --port $SERVER_PORT \ --trust-remote-code \ - --kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_consumer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\"}}' \ + --kv-transfer-config '${KVT_DECODE}' \ ${DECODE_SERVER_CONFIG}" if [[ "$DRY_RUN" -eq 1 ]]; then diff --git a/benchmarks/multi_node/amd_utils/submit.sh b/benchmarks/multi_node/amd_utils/submit.sh index fa3d65418..4379815fb 100755 --- a/benchmarks/multi_node/amd_utils/submit.sh +++ b/benchmarks/multi_node/amd_utils/submit.sh @@ -124,6 +124,7 @@ export BENCH_NUM_PROMPTS_MULTIPLIER=${BENCH_NUM_PROMPTS_MULTIPLIER:-10} export BENCH_MAX_CONCURRENCY=${CONCURRENCIES} export BENCH_REQUEST_RATE=${REQUEST_RATE} export BENCH_RANDOM_RANGE_RATIO=${RANDOM_RANGE_RATIO:-0.8} +export MAX_NUM_SEQS=${MAX_NUM_SEQS:-$(echo "$CONCURRENCIES" | tr 'x' '\n' | sort -n | tail -1)} # Eval-related env vars (threaded from workflow → runner → here → job.slurm → Docker) export RUN_EVAL="${RUN_EVAL:-false}" @@ -137,6 +138,30 @@ export RESULT_FILENAME="${RESULT_FILENAME:-}" export SPEC_DECODING="${SPEC_DECODING:-}" export IS_MULTINODE="${IS_MULTINODE:-false}" +# Agentic / custom vLLM-disagg connector knobs (threaded to job.slurm -> Docker). +export IS_AGENTIC="${IS_AGENTIC:-0}" +export DURATION="${DURATION:-1800}" +export MODEL="${MODEL:-}" +export ROUTER_TYPE="${ROUTER_TYPE:-vllm-router}" +export ROUTER_PORT="${ROUTER_PORT:-30000}" +export ENABLE_PREFIX_CACHING="${ENABLE_PREFIX_CACHING:-}" +export MAX_MODEL_LEN="${MAX_MODEL_LEN:-}" +export MAX_NUM_SEQS="${MAX_NUM_SEQS:-}" +export WEKA_LOADER_OVERRIDE="${WEKA_LOADER_OVERRIDE:-}" +export VLLM_BIND_IP="${VLLM_BIND_IP:-}" +export PREFILL_KV_CONNECTOR="${PREFILL_KV_CONNECTOR:-moriio}" +export DECODE_KV_CONNECTOR="${DECODE_KV_CONNECTOR:-moriio}" +export MC_PROTOCOL="${MC_PROTOCOL:-tcp}" +export LMCACHE_HOST="${LMCACHE_HOST:-127.0.0.1}" +export LMCACHE_PORT="${LMCACHE_PORT:-5555}" +export LMCACHE_HTTP_PORT="${LMCACHE_HTTP_PORT:-8080}" +export LMCACHE_L1_SIZE_GB="${LMCACHE_L1_SIZE_GB:-2500}" +export LMCACHE_L1_INIT_SIZE_GB="${LMCACHE_L1_INIT_SIZE_GB:-20}" +export LMCACHE_L1_READ_TTL_SECONDS="${LMCACHE_L1_READ_TTL_SECONDS:-3600}" +export LMCACHE_CHUNK_SIZE="${LMCACHE_CHUNK_SIZE:-256}" +export LMCACHE_MAX_WORKERS="${LMCACHE_MAX_WORKERS:-8}" +export LMCACHE_MP_MQ_TIMEOUT="${LMCACHE_MP_MQ_TIMEOUT:-1200}" + # Log directory: must be on NFS (shared filesystem) so the submit host can read SLURM output. export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$(pwd)/benchmark_logs}" mkdir -p "$BENCHMARK_LOGS_DIR" diff --git a/perf-changelog.yaml b/perf-changelog.yaml index d6a5f35e4..47d2c44eb 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -4153,3 +4153,11 @@ - "Run the PR #1891 MiniMax-M3 MXFP8 B300 Dynamo-vLLM recipe set on top of current main." - "Uses the vllm/vllm-openai:minimax-m3-0618-x86_64-cu130 image and the TEP4/TEP8 8k1k topologies not covered by PR #1890." pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1891 + +- config-keys: + - kimik2.5-fp4-mi355x-vllm-disagg-agentic + description: + - "Add Kimi-K2.5 MI355X multinode agentic recipe using Mooncake(tcp) for PD KV transfer and LMCacheMP only on prefill for L2 prefix reuse." + - "Decode uses MooncakeConnector only to avoid decode-side LMCache lookup/retrieve racing the remote-prefill KV load." + - "Use fp8 KV cache, 0.55 vLLM GPU memory utilization, and 1.2TB LMCache host-DRAM cache to leave HBM headroom for LMCacheMP KV registration." + pr-link: XXX diff --git a/runners/launch_mi355x-amds.sh b/runners/launch_mi355x-amds.sh index acfd4912a..34fd65262 100644 --- a/runners/launch_mi355x-amds.sh +++ b/runners/launch_mi355x-amds.sh @@ -126,7 +126,7 @@ if [[ "$IS_MULTINODE" == "true" ]]; then # search for "FRAMEWORK_DIFF_IF_STATEMENT #3" for this if-statement # Find the latest log directory that contains the data - if [[ "${EVAL_ONLY:-false}" != "true" ]]; then + if [[ "${EVAL_ONLY:-false}" != "true" && "${IS_AGENTIC:-0}" != "1" ]]; then cat > collect_latest_results.py <<'PY' import os, sys job_dir, isl, osl, nexp, framework = sys.argv[1], int(sys.argv[2]), int(sys.argv[3]), int(sys.argv[4]), sys.argv[5] diff --git a/utils/process_agentic_result.py b/utils/process_agentic_result.py index 3c4015ce6..cf7218104 100644 --- a/utils/process_agentic_result.py +++ b/utils/process_agentic_result.py @@ -659,6 +659,13 @@ def main() -> int: return 1 records = load_records(jsonl_path) + if not records: + print( + f"ERROR: {jsonl_path} contains no successful agentic records", + file=sys.stderr, + ) + return 1 + aggregate = load_aggregate(aggregate_path) if aggregate_path.exists() else {} server_metrics = load_server_metrics(server_metrics_path)