From a878654a23dfa8e49ef792c55126a30c0242baa2 Mon Sep 17 00:00:00 2001
From: adibarra <93070681+adibarra@users.noreply.github.com>
Date: Thu, 2 Jul 2026 12:12:11 -0500
Subject: [PATCH 1/4] feat(evals): add SWE-bench Lite eval (lm-eval generation
 + swebench harness scoring, Modal-capable)

Add a SWE-bench Lite accuracy eval that generates patches via the lm-eval
harness and scores them with the official swebench evaluation harness.

- utils/evals/swebench_lite.yaml: lm-eval task config for SWE-bench Lite
  generation (prompt/doc-to-text, generation kwargs, dataset wiring).
- utils/evals/swebench_score.py: post-processing + scoring. Extracts model
  patches from lm-eval output, feeds them to the swebench harness, and emits
  a "resolved" rate. Supports running the harness locally or on Modal via
  SWEBENCH_USE_MODAL (Modal pass-through so scoring can run off-box).
- utils/collect_eval_results.py: extract_lm_metrics learns a "resolved"
  filter branch so the swebench resolved metric is collected alongside the
  existing lm-eval metrics.
- utils/evals/thresholds.json: add the swebench_lite threshold entry.
- utils/evals/EVALS.md: document the SWE-bench Lite eval and how scoring works.
- benchmarks/benchmark_lib.sh: add run_swebench_eval, _install_swebench_deps,
  maybe_run_eval, and Modal pass-through. run_eval now picks a per-scenario
  default framework (agentic-coding -> swebench, fixed-seq-len -> lm-eval);
  an explicit EVAL_FRAMEWORK env var or --framework arg overrides the default.
  EVAL_TASKS_DIR selects the task yaml.
- utils/evals/test_swebench_eval.py, utils/evals/test_run_eval_dispatch.py:
  tests for the scorer and the scenario/framework dispatch precedence.
---
 benchmarks/benchmark_lib.sh           | 141 ++++++++-
 utils/collect_eval_results.py         |   4 +-
 utils/evals/EVALS.md                  |  27 ++
 utils/evals/swebench_lite.yaml        |  56 ++++
 utils/evals/swebench_score.py         | 397 ++++++++++++++++++++++++++
 utils/evals/test_run_eval_dispatch.py |  67 +++++
 utils/evals/test_swebench_eval.py     | 235 +++++++++++++++
 utils/evals/thresholds.json           |   3 +-
 8 files changed, 926 insertions(+), 4 deletions(-)
 create mode 100644 utils/evals/swebench_lite.yaml
 create mode 100644 utils/evals/swebench_score.py
 create mode 100644 utils/evals/test_run_eval_dispatch.py
 create mode 100644 utils/evals/test_swebench_eval.py
diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index 95e063a3d..1e59aa1b4 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -966,21 +966,157 @@ META
     echo "Moved eval artifacts to: $(pwd)"
 }
 
+# ------------------------------
+# SWE-bench eval helpers
+# ------------------------------
+
+# Run the SWE-bench Lite eval: generate patches with lm-eval, then score them
+# with the official swebench Docker harness. lm-eval cannot score SWE-bench
+# itself (no repo-level test executor), so we reuse it only for generation and
+# emit an lm-eval-shaped results JSON from swebench_score.py so the rest of the
+# pipeline (append_lm_eval_summary / collect / validate) is unchanged.
+#
+# Env knobs:
+#   SWEBENCH_TASK_NAME     (default swebench_lite) selects utils/evals/<name>.yaml
+#   SWEBENCH_DATASET       optional; must equal the YAML's dataset_path (the
+#                          scoring dataset is derived from the YAML so generation
+#                          and scoring never diverge) -- mismatch fails fast
+#   SWEBENCH_MAX_WORKERS   (default 4) harness workers / Modal parallelism
+#   SWEBENCH_USE_MODAL     "true" => score on Modal remote sandboxes instead of
+#                          local Docker (no Docker needed on the node; requires a
+#                          Modal account + ~/.modal.toml or MODAL_TOKEN_* creds)
+#   SWEBENCH_NAMESPACE     local-Docker only: pass "" on arm/Mac to build locally
+#   SWEBENCH_SKIP_SCORE    "true" => generate + stage predictions only, no scoring
+#                          (score elsewhere)
+_install_swebench_deps() {
+    # Best-effort (mirrors _install_lm_eval_deps); a real failure surfaces at scoring.
+    python3 -m pip install -q --no-cache-dir --break-system-packages swebench || true
+    if [ "${SWEBENCH_USE_MODAL:-false}" = "true" ]; then
+        python3 -m pip install -q --no-cache-dir --break-system-packages modal || true
+    fi
+}
+
+# Run the eval and then append_lm_eval_summary, but only when RUN_EVAL is
+# "true" (unset counts as false). run_eval auto-selects the framework by scenario
+# (agentic -> swebench, fixed-seq-len -> lm-eval). $1 = port (default $PORT/8888).
+maybe_run_eval() {
+    local port="${1:-${PORT:-8888}}"
+    if [ "${RUN_EVAL:-false}" = "true" ]; then
+        run_eval --port "$port"
+        append_lm_eval_summary
+    fi
+}
+
+run_swebench_eval() {
+    local out_dir="${EVAL_RESULT_DIR:-$(mktemp -d /tmp/eval_out-XXXXXX)}"
+    local task_name="${SWEBENCH_TASK_NAME:-swebench_lite}"
+    local gen_dir
+    gen_dir=$(mktemp -d /tmp/swebench_gen-XXXXXX)
+
+    # Keep the scoring dataset in lockstep with the generation YAML: the harness
+    # must score against the same instance set lm-eval generated patches for, or
+    # the instance IDs won't match. Derive it from the task YAML; if
+    # SWEBENCH_DATASET is set it must agree (fail-fast rather than mis-score).
+    local yaml_path="${EVAL_TASKS_DIR:-utils/evals/${task_name}.yaml}"
+    local dataset
+    dataset=$(awk '/^dataset_path:[[:space:]]/{print $2; exit}' "$yaml_path" 2>/dev/null)
+    if [ -z "$dataset" ]; then
+        echo "ERROR: could not read dataset_path from ${yaml_path}" >&2
+        rm -rf "$gen_dir" 2>/dev/null || true
+        return 1
+    fi
+    if [ -n "${SWEBENCH_DATASET:-}" ] && [ "${SWEBENCH_DATASET}" != "$dataset" ]; then
+        echo "ERROR: SWEBENCH_DATASET='${SWEBENCH_DATASET}' disagrees with ${yaml_path} dataset_path='${dataset}'." >&2
+        echo "       Generation and scoring must use the same dataset; edit the YAML or unset SWEBENCH_DATASET." >&2
+        rm -rf "$gen_dir" 2>/dev/null || true
+        return 1
+    fi
+
+    # 1. Generation via lm-eval (reuses endpoint wiring, _patch_lm_eval, etc.).
+    #    run_lm_eval already passes --log_samples, which is what we consume.
+    local prev_tasks_dir="${EVAL_TASKS_DIR:-}"
+    export EVAL_TASKS_DIR="$yaml_path"
+    local gen_rc=0
+    run_lm_eval "$@" --results-dir "$gen_dir" || gen_rc=$?
+    export EVAL_TASKS_DIR="$prev_tasks_dir"
+    if [ "$gen_rc" -ne 0 ]; then
+        echo "ERROR: swebench generation (lm-eval) failed with $gen_rc" >&2
+        rm -rf "$gen_dir" 2>/dev/null || true
+        return "$gen_rc"
+    fi
+
+    # Preserve generations as artifacts alongside the scored results.
+    mkdir -p "$out_dir"
+    find "$gen_dir" -name 'samples_*.jsonl' -exec cp -f {} "$out_dir"/ \; 2>/dev/null || true
+    export EVAL_RESULT_DIR="$out_dir"
+
+    local lm_eval_version
+    lm_eval_version=$(python3 -c 'import lm_eval; print(lm_eval.__version__)' 2>/dev/null || echo unknown)
+
+    if [ "${SWEBENCH_SKIP_SCORE:-false}" = "true" ]; then
+        # Generation-only mode: emit predictions, defer Docker scoring elsewhere.
+        # TODO(alec): wire the separate scoring job (Modal / sb-cli / CPU runner).
+        local skip_rc=0
+        python3 utils/evals/swebench_score.py \
+            --samples-dir "$gen_dir" --out-dir "$out_dir" \
+            --model-name "${MODEL_NAME:-$MODEL}" --task-name "$task_name" \
+            --predictions-only || skip_rc=$?
+        echo "SWEBENCH_SKIP_SCORE=true: staged predictions only (no resolved-rate)." >&2
+        rm -rf "$gen_dir" 2>/dev/null || true
+        return "$skip_rc"
+    fi
+
+    # 2. Score with the official swebench harness and emit the lm-eval-shaped JSON.
+    #    --modal is passed only when SWEBENCH_USE_MODAL is exactly "true" (not "false").
+    if [ "${INFERENCEX_SWEBENCH_RUNTIME_READY:-false}" != "true" ]; then
+        _install_swebench_deps
+        export INFERENCEX_SWEBENCH_RUNTIME_READY=true
+    fi
+    local score_rc=0
+    python3 utils/evals/swebench_score.py \
+        --samples-dir "$gen_dir" \
+        --out-dir "$out_dir" \
+        --model-name "${MODEL_NAME:-$MODEL}" \
+        --task-name "$task_name" \
+        --dataset-name "$dataset" \
+        --max-workers "${SWEBENCH_MAX_WORKERS:-4}" \
+        --lm-eval-version "$lm_eval_version" \
+        $([ "${SWEBENCH_USE_MODAL:-false}" = "true" ] && echo --modal) \
+        ${SWEBENCH_NAMESPACE+--namespace "$SWEBENCH_NAMESPACE"} \
+        || score_rc=$?
+    rm -rf "$gen_dir" 2>/dev/null || true
+    if [ "$score_rc" -ne 0 ]; then
+        echo "ERROR: swebench scoring failed with $score_rc" >&2
+        return "$score_rc"
+    fi
+}
+
 # ------------------------------
 # Unified eval entrypoint
 # ------------------------------
 
 run_eval() {
-    local framework="${EVAL_FRAMEWORK:-lm-eval}"
+    local cli_framework=""
     local forwarded=()
 
     while [[ $# -gt 0 ]]; do
         case "$1" in
-            --framework) framework="$2"; shift 2 ;;
+            --framework) cli_framework="$2"; shift 2 ;;
             *)           forwarded+=("$1"); shift ;;
         esac
     done
 
+    # Eval framework defaults by SCENARIO: agentic-coding configs run swebench;
+    # fixed-seq-len (8k1k/1k1k) run lm-eval/gsm8k. Explicit choices override that
+    # default, in precedence order: EVAL_FRAMEWORK env (forced override), then
+    # the --framework arg (e.g. the recipes' `--framework lm-eval`), then the
+    # scenario default. Note the env var wins over the CLI arg.
+    local scenario_default="lm-eval"
+    if [ "${IS_AGENTIC:-0}" = "1" ] || [ "${SCENARIO_TYPE:-}" = "agentic-coding" ]; then
+        scenario_default="swebench"
+    fi
+    local framework="${EVAL_FRAMEWORK:-${cli_framework:-$scenario_default}}"
+
     # Compute EVAL_MAX_MODEL_LEN if not already set by the calling script
     if [ -z "${EVAL_MAX_MODEL_LEN:-}" ]; then
         compute_eval_context_length "$MODEL" "${MAX_MODEL_LEN:-0}" > /dev/null
@@ -1052,6 +1188,7 @@ run_eval() {
     local eval_rc=0
     case "$framework" in
         lm-eval|lm_eval) run_lm_eval "${forwarded[@]}" || eval_rc=$? ;;
+        swebench)        run_swebench_eval "${forwarded[@]}" || eval_rc=$? ;;
         *)               echo "Unknown framework '${framework}'"; eval_rc=1 ;;
     esac
 
diff --git a/utils/collect_eval_results.py b/utils/collect_eval_results.py
index 333146566..0da10b71e 100644
--- a/utils/collect_eval_results.py
+++ b/utils/collect_eval_results.py
@@ -165,7 +165,9 @@ def get_val_se(filter_name: str) -> Tuple[Optional[float], Optional[float]]:
             # Extract metrics for each filter
             for f in filter_list:
                 fname = f['name']
-                if 'strict' in fname:
+                # 'resolved' is SWE-bench's resolved-rate (swebench_score.py);
+                # treat it as the primary/strict score so it populates `score`.
+                if 'strict' in fname or 'resolved' in fname:
                     strict_val, strict_se = get_val_se(fname)
                 elif 'flex' in fname or 'extract' in fname:
                     flex_val, flex_se = get_val_se(fname)
diff --git a/utils/evals/EVALS.md b/utils/evals/EVALS.md
index 7ff878dce..a7738defc 100644
--- a/utils/evals/EVALS.md
+++ b/utils/evals/EVALS.md
@@ -169,7 +169,34 @@ The codebase patches lm-eval compatibility via `_patch_lm_eval`:
 1. Reasoning token handling: extracts `reasoning_content` when `message.content` is empty.
 2. TRT compatibility: avoids injecting `{"type": "text"}` for non-HF tokenizers.
 
+### SWE-bench Lite (`--framework swebench`)
+
+SWE-bench is **not** a `generate_until` QA task — it requires applying the model's
+patch to a repo and running tests in Docker, which lm-eval cannot do. So it runs
+through a dedicated framework that reuses lm-eval for *generation* only, then scores
+with the official `swebench` harness and emits an lm-eval-shaped results JSON
+(metric `exact_match,resolved` = resolved-rate) so collect/validate work unchanged.
+
+```bash
+run_eval --framework swebench --port "$PORT"   # generation (lm-eval) -> scoring (swebench)
+append_lm_eval_summary
+```
+
+- Task: `utils/evals/swebench_lite.yaml` (generation) — SWE-bench Lite, the ~300-instance curated
+  quick-eval subset (no difficulty filter needed; Lite is already the lightweight set).
+- Scoring: `utils/evals/swebench_score.py` (diff extraction → `predictions.jsonl` →
+  `python -m swebench.harness.run_evaluation` → resolved-rate → results JSON). Offline
+  `--report` mode skips Docker for testing.
+- Knobs: `SWEBENCH_TASK_NAME` (selects the YAML), `SWEBENCH_MAX_WORKERS`, `SWEBENCH_USE_MODAL=true`
+  (score on Modal instead of local Docker), `SWEBENCH_NAMESPACE` (local Docker only; pass `""` on
+  arm/Mac), `SWEBENCH_SKIP_SCORE=true` (generate-only). The scoring dataset is derived from the YAML's
+  `dataset_path` so generation and scoring can't diverge; `SWEBENCH_DATASET`, if set, must match it (mismatch fails fast).
+- **Local scoring requires Docker + ~120 GB disk on the scoring host** (Modal scoring,
+  `SWEBENCH_USE_MODAL=true`, needs no Docker on the node). This is an MVP; the single-shot prompt and diff
+  extraction still need tuning to reach published resolved-rates, and the `thresholds.json` entry needs calibration from a baseline run.
+
 ## Task files
 The following files are task definitions from lm-eval; more information on changes lives within the files:
 - `utils/evals/gsm8k.yaml`
 - `utils/evals/gpqa_diamond.yaml`
+- `utils/evals/swebench_lite.yaml` (generation only; scored by `swebench_score.py`)
diff --git a/utils/evals/swebench_lite.yaml b/utils/evals/swebench_lite.yaml
new file mode 100644
index 000000000..4633af462
--- /dev/null
+++ b/utils/evals/swebench_lite.yaml
@@ -0,0 +1,56 @@
+# SWE-bench Lite -- GENERATION ONLY.
+#
+# Lite is the ~300-instance curated subset for quick evals (no difficulty labels;
+# it's already the lightweight set, so no filtering is needed -- unlike Verified,
+# which carries a `difficulty` field).
+#
+# lm-eval is used purely to drive the served OpenAI-compatible endpoint and dump
+# one candidate patch per instance via --log_samples. The metric below is a
+# PLACEHOLDER that lm-eval computes but we ignore: the real resolved-rate comes
+# from utils/evals/swebench_score.py running the official `swebench` harness,
+# which then emits an lm-eval-shaped results JSON for collect/validate.
+#
+# Run it through the dedicated framework, not bare lm-eval:
+#   run_eval --framework swebench --port "$PORT"
+# which wires generation -> scoring. Bare `--tasks swebench_lite.yaml` would
+# produce only the meaningless placeholder metric.
+task: swebench_lite
+dataset_path: princeton-nlp/SWE-bench_Lite  # also mirrored at SWE-bench/SWE-bench_Lite
+output_type: generate_until
+test_split: test
+
+doc_to_text: |
+  You are an expert software engineer fixing a real GitHub issue in the
+  repository `{{repo}}` at commit {{base_commit}}.
+
+  <issue>
+  {{problem_statement}}
+  </issue>
+
+  Respond with ONLY a unified diff (a git patch) that resolves the issue, using
+  real repository file paths. Do not include explanations. Wrap the patch in a
+  single fenced block exactly like:
+
+  ```diff
+  diff --git a/path/to/file.py b/path/to/file.py
+  --- a/path/to/file.py
+  +++ b/path/to/file.py
+  @@ ... @@
+  ```
+# The gold patch is the nominal target. lm-eval's exact_match against it is
+# meaningless for patches (overwritten by the harness score); it only exists so
+# generate_until has a target + a metric and does not error.
+doc_to_target: "{{patch}}"
+
+generation_kwargs:
+  until: []
+  do_sample: false
+  temperature: 0.0
+
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+
+metadata:
+  version: 0.1
diff --git a/utils/evals/swebench_score.py b/utils/evals/swebench_score.py
new file mode 100644
index 000000000..813a0e811
--- /dev/null
+++ b/utils/evals/swebench_score.py
@@ -0,0 +1,397 @@
+#!/usr/bin/env python3
+"""Score SWE-bench patches generated by lm-eval and emit an lm-eval-shaped result.
+
+Pipeline:
+
+  1. Read lm-eval ``--log_samples`` output (samples_*.jsonl): one candidate per
+     SWE-bench instance.
+  2. Extract a unified diff from each model generation.
+  3. Write a ``predictions.jsonl`` in the format the official ``swebench`` harness
+     expects: ``{instance_id, model_name_or_path, model_patch}``.
+  4. Run ``python -m swebench.harness.run_evaluation`` (Docker) to get the
+     resolved-rate -- unless ``--no-run``/``--report`` is given (offline/testing).
+  5. Emit a results JSON shaped like an lm-eval result so the existing
+     ``collect_eval_results.py`` / ``validate_scores.py`` ingest it unchanged.
+     The metric is published as ``exact_match,resolved`` = resolved-rate.
+
+The harness needs Docker + lots of disk and is NOT runnable on this dev Mac, so
+the harness step is isolated behind ``--no-run`` (with ``--report``) for local testing.
+TODO(alec): exercise the real harness path (the default, without ``--no-run``) on a runner.
+"""
+
+import argparse
+import json
+import math
+import re
+import subprocess
+import sys
+from pathlib import Path
+from typing import Iterator, Optional
+
+DEFAULT_DATASET = "princeton-nlp/SWE-bench_Lite"
+DEFAULT_TASK = "swebench_lite"
+
+# A unified diff, optionally inside a ```diff / ```patch fence. We try fenced
+# first (what the prompt asks for), then a bare ``diff --git`` slice.
+_FENCED_DIFF_RE = re.compile(
+    r"```(?:diff|patch)?\s*\n(?P<body>.*?)```",
+    re.DOTALL | re.IGNORECASE,
+)
+_DIFF_GIT_RE = re.compile(r"(?:^|\n)(diff --git .*)", re.DOTALL)
+
+# Line prefixes that belong to a (git) unified-diff body. Anything else marks
+# the end of the patch.
+_DIFF_LINE_PREFIXES = (
+    "diff ", "index ", "--- ", "+++ ", "@@", "+", "-", " ", "\\",
+    "old mode ", "new mode ", "new file mode ", "deleted file mode ",
+    "rename ", "copy ", "similarity ", "dissimilarity ",
+    "Binary files ", "GIT binary patch",
+)
+
+
+def _trim_to_diff_body(text: str) -> str:
+    """Keep only the leading run of diff-shaped lines, dropping trailing prose.
+
+    Models frequently emit a bare patch followed by an explanation ("Notes:",
+    "This fixes #123."). With no terminator that tail gets glued onto the patch
+    and rejected by ``git apply``, scoring the instance unresolved. Blank lines
+    are kept only when the diff resumes after them; a blank line followed by
+    non-diff text ends the patch.
+    """
+    lines = text.splitlines()
+    out: list[str] = []
+    i, n = 0, len(lines)
+    while i < n:
+        if lines[i].startswith(_DIFF_LINE_PREFIXES):
+            out.append(lines[i])
+            i += 1
+            continue
+        if lines[i] == "":
+            j = i
+            while j < n and lines[j] == "":
+                j += 1
+            if j < n and lines[j].startswith(_DIFF_LINE_PREFIXES):
+                out.extend(lines[i:j])  # interior blank line(s); diff resumes
+                i = j
+                continue
+        break  # trailing blank(s)+prose, or any other non-diff line
+    return "\n".join(out)
+
+
+def extract_patch(text: str) -> str:
+    """Pull a unified diff out of a model generation.
+
+    Best-effort and deliberately conservative -- a wrong extraction just means
+    that instance is unresolved, never a crash. Diff-extraction quality is a
+    primary tuning lever (TODO(alec)): bad fences here directly suppress the
+    resolved-rate.
+    """
+    if not text:
+        return ""
+
+    def _finish(body: str) -> str:
+        body = _trim_to_diff_body(body).strip("\n")
+        return body + "\n" if body else ""
+
+    # 1. Prefer a fenced block that actually looks like a diff.
+    for match in _FENCED_DIFF_RE.finditer(text):
+        body = match.group("body")
+        if "diff --git" in body or body.lstrip().startswith(("--- ", "+++ ")):
+            return _finish(body)
+    # 2. Fall back to a bare ``diff --git``, trimmed to the diff body so
+    #    trailing prose can't corrupt the patch.
+    git_match = _DIFF_GIT_RE.search(text)
+    if git_match:
+        trimmed = _finish(git_match.group(1))
+        if trimmed:
+            return trimmed
+    # 3. Last resort: a lone fenced block (fence-bounded), or the raw text.
+    lone = _FENCED_DIFF_RE.search(text)
+    if lone:
+        body = lone.group("body").strip("\n")
+        return body + "\n" if body else ""
+    return text.strip("\n") + "\n" if text.strip() else ""
+
+
+def _response_text(record: dict) -> str:
+    """Extract the model's text from one lm-eval sample record.
+
+    lm-eval's sample schema has drifted across versions; be tolerant.
+    TODO(alec): confirm against the pinned harness's real samples_*.jsonl.
+    """
+    for key in ("filtered_resps", "resps"):
+        val = record.get(key)
+        while isinstance(val, (list, tuple)) and val:
+            val = val[0]
+        if isinstance(val, str) and val.strip():
+            return val
+    return ""
+
+
+def _instance_id(record: dict) -> Optional[str]:
+    """Return the record's SWE-bench instance id, or None when none is present."""
+    doc = record.get("doc")
+    candidates = []
+    if isinstance(doc, dict):
+        candidates = [doc.get(name) for name in ("instance_id", "instance", "id")]
+    # Some versions hoist doc fields to the top level, so try that last.
+    candidates.append(record.get("instance_id"))
+    hits = [c for c in candidates if isinstance(c, str) and c]
+    return hits[0] if hits else None
+
+
+def iter_samples(samples_dir: Path) -> Iterator[dict]:
+    """Yield JSON records from every samples_*.jsonl under ``samples_dir`` (read as UTF-8)."""
+    files = sorted(samples_dir.rglob("samples_*.jsonl"))
+    if not files:
+        raise FileNotFoundError(
+            f"no samples_*.jsonl found under {samples_dir} -- did lm-eval run "
+            "with --log_samples?"
+        )
+    for path in files:
+        with path.open(encoding="utf-8") as fh:  # don't depend on the locale encoding
+            for line in fh:
+                line = line.strip()
+                if line:
+                    yield json.loads(line)
+
+
+def build_predictions(samples_dir: Path, model_name: str) -> list[dict]:
+    """Turn lm-eval samples into swebench prediction rows (dedup by instance)."""
+    by_instance: dict[str, dict] = {}
+    skipped = 0
+    for record in iter_samples(samples_dir):
+        instance_id = _instance_id(record)
+        if not instance_id:
+            skipped += 1
+            continue
+        patch = extract_patch(_response_text(record))
+        # Last write wins; SWE-bench is single-attempt so there should be one
+        # record per instance anyway.
+        by_instance[instance_id] = {
+            "instance_id": instance_id,
+            "model_name_or_path": model_name,
+            "model_patch": patch,
+        }
+    if skipped:
+        print(f"WARN: skipped {skipped} sample(s) with no instance_id", file=sys.stderr)
+    if not by_instance:
+        raise ValueError("no usable predictions extracted from samples")
+    return list(by_instance.values())
+
+
+def write_predictions(predictions: list[dict], out_path: Path) -> None:
+    """Serialize each prediction as one JSON line (predictions.jsonl layout)."""
+    with out_path.open("w") as sink:
+        sink.writelines(json.dumps(entry) + "\n" for entry in predictions)
+
+
+def run_harness(
+    predictions_path: Path,
+    dataset_name: str,
+    run_id: str,
+    work_dir: Path,
+    max_workers: int,
+    namespace: Optional[str],
+    modal: bool = False,
+) -> None:
+    """Invoke the official swebench harness (local Docker, or Modal sandboxes) in work_dir."""
+    cmd = [
+        sys.executable, "-m", "swebench.harness.run_evaluation",
+        "--dataset_name", dataset_name,
+        "--predictions_path", str(predictions_path.resolve()),  # absolute: child cwd is work_dir
+        "--run_id", run_id,
+    ]
+    if modal:
+        # Modal remote sandboxes instead of local Docker (no Docker on the node).
+        # --parallelism replaces --max_workers; --namespace is local-Docker only.
+        cmd += ["--modal", "true", "--parallelism", str(max_workers)]
+    else:
+        cmd += ["--max_workers", str(max_workers)]
+        if namespace is not None:
+            # On arm/Mac (and to force local image builds) pass --namespace ''.
+            cmd += ["--namespace", namespace]
+    print(f"[swebench] running: {' '.join(cmd)}", flush=True)
+    subprocess.run(cmd, cwd=str(work_dir), check=True)
+
+
+def find_report(work_dir: Path, model_name: str, run_id: str) -> Path:
+    """Locate the harness report JSON, tolerant to known layout variants."""
+    sanitized = model_name.replace("/", "__")
+    candidates = [
+        work_dir / f"{sanitized}.{run_id}.json",          # classic: <model>.<run_id>.json
+        work_dir / f"{model_name}.{run_id}.json",
+        work_dir / "evaluation_results" / "results.json",  # newer layout
+    ]
+    for path in candidates:
+        if path.exists():
+            return path
+    # Broad fallback: first *.json (sorted) with top-level resolved_instances/resolved_ids.
+    for path in sorted(work_dir.rglob("*.json")):
+        try:
+            data = json.loads(path.read_text())
+        except (json.JSONDecodeError, OSError):
+            continue
+        if isinstance(data, dict) and (
+            "resolved_instances" in data or "resolved_ids" in data
+        ):
+            return path
+    raise FileNotFoundError(
+        f"could not locate a swebench report under {work_dir} "
+        f"(looked for {[str(c) for c in candidates]})"
+    )
+
+
+def parse_resolved(report: dict) -> tuple[int, int]:
+    """Return (resolved, total) from a harness report, tolerant to key variants.
+
+    Denominator prefers ``total_instances`` (leaderboard convention: resolved /
+    total); completed/submitted counts are fallbacks used only when it is absent.
+    """
+    resolved: Optional[int] = None
+    for key in ("resolved_instances", "resolved", "num_resolved"):
+        if isinstance(report.get(key), int):
+            resolved = report[key]
+            break
+    if resolved is None and isinstance(report.get("resolved_ids"), list):
+        resolved = len(report["resolved_ids"])
+
+    total: Optional[int] = None
+    for key in ("total_instances", "completed_instances", "submitted_instances"):
+        val = report.get(key)
+        if isinstance(val, int) and val > 0:
+            total = val
+            break
+    if total is None:
+        for key in ("completed_ids", "submitted_ids"):
+            if isinstance(report.get(key), list) and report[key]:
+                total = len(report[key])
+                break
+
+    if resolved is None or total is None or total <= 0:
+        raise ValueError(
+            f"could not parse resolved/total from report keys {sorted(report)}"
+        )
+    return resolved, total
+
+
+def build_results_json(
+    task: str,
+    resolved: int,
+    total: int,
+    model_name: str,
+    lm_eval_version: str,
+    report: Optional[dict],
+) -> dict:
+    """Shape the resolved-rate as an lm-eval result (rate and stderr are 0.0 if total is 0).
+
+    Published as ``exact_match,resolved`` so validate_scores (prefix
+    ``exact_match,``) gates it and collect_eval_results surfaces it as ``score``.
+    """
+    rate = resolved / total if total else 0.0  # guard before dividing, not after
+    stderr = math.sqrt(rate * (1.0 - rate) / total) if total else 0.0
+    return {
+        "lm_eval_version": lm_eval_version,
+        "model_name": model_name,
+        "results": {
+            task: {
+                "alias": task,
+                "exact_match,resolved": rate,
+                "exact_match_stderr,resolved": stderr,
+            }
+        },
+        "configs": {
+            task: {
+                "metric_list": [{"metric": "exact_match"}],
+                "filter_list": [{"name": "resolved"}],
+            }
+        },
+        "n-samples": {task: {"effective": total, "original": total}},
+        # Debugging passthrough; ignored by collectors (no lm_eval_version here).
+        "swebench": {
+            "resolved": resolved,
+            "total": total,
+            "resolved_rate": rate,
+            "report": report,
+        },
+    }
+
+
+def main(argv: Optional[list[str]] = None) -> int:
+    parser = argparse.ArgumentParser(description="Score SWE-bench patches from lm-eval samples")
+    parser.add_argument("--samples-dir", required=True, help="dir containing lm-eval samples_*.jsonl")
+    parser.add_argument("--out-dir", required=True, help="dir to write predictions + results JSON")
+    parser.add_argument("--model-name", required=True, help="served model name (model_name_or_path)")
+    parser.add_argument("--dataset-name", default=DEFAULT_DATASET)
+    parser.add_argument("--task-name", default=DEFAULT_TASK)
+    parser.add_argument("--run-id", default=None, help="harness run id (default: task name)")
+    parser.add_argument("--max-workers", type=int, default=4)
+    parser.add_argument(
+        "--namespace", default=None,
+        help="local-Docker --namespace value (pass '' on arm/Mac to build images locally)",
+    )
+    parser.add_argument(
+        "--modal", action="store_true",
+        help="score on Modal remote sandboxes instead of local Docker (needs modal creds)",
+    )
+    parser.add_argument("--lm-eval-version", default="unknown")
+    parser.add_argument(
+        "--predictions-only", action="store_true",
+        help="write predictions.jsonl and stop (no scoring; score elsewhere)",
+    )
+    parser.add_argument(
+        "--no-run", action="store_true",
+        help="skip the Docker harness; requires --report (offline/testing)",
+    )
+    parser.add_argument(
+        "--report", default=None,
+        help="path to a pre-computed harness report JSON (implies --no-run)",
+    )
+    args = parser.parse_args(argv)
+
+    samples_dir = Path(args.samples_dir)
+    out_dir = Path(args.out_dir)
+    out_dir.mkdir(parents=True, exist_ok=True)
+    run_id = args.run_id or args.task_name
+
+    # 1-3. samples -> predictions.jsonl
+    predictions = build_predictions(samples_dir, args.model_name)
+    predictions_path = out_dir / "predictions.jsonl"
+    write_predictions(predictions, predictions_path)
+    print(f"[swebench] wrote {len(predictions)} predictions -> {predictions_path}")
+
+    if args.predictions_only:
+        print("[swebench] predictions-only: skipping scoring (score elsewhere)")
+        return 0
+
+    # 4. score (Docker) or load an existing report
+    if args.report:
+        report = json.loads(Path(args.report).read_text())
+    elif args.no_run:
+        print("ERROR: --no-run requires --report", file=sys.stderr)
+        return 1
+    else:
+        run_harness(
+            predictions_path, args.dataset_name, run_id,
+            out_dir, args.max_workers, args.namespace, modal=args.modal,
+        )
+        report = json.loads(find_report(out_dir, args.model_name, run_id).read_text())
+
+    resolved, total = parse_resolved(report)
+
+    # 5. emit lm-eval-shaped results
+    results = build_results_json(
+        args.task_name, resolved, total, args.model_name,
+        args.lm_eval_version, report,
+    )
+    results_path = out_dir / f"results_{args.task_name}.json"
+    results_path.write_text(json.dumps(results, indent=2))
+    print(
+        f"[swebench] {args.task_name}: resolved {resolved}/{total} "
+        f"= {resolved / total:.4f} -> {results_path}"
+    )
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/utils/evals/test_run_eval_dispatch.py b/utils/evals/test_run_eval_dispatch.py
new file mode 100644
index 000000000..bef379bf8
--- /dev/null
+++ b/utils/evals/test_run_eval_dispatch.py
@@ -0,0 +1,67 @@
+"""run_eval framework dispatch.
+
+Scenario picks the default framework (agentic-coding -> swebench, fixed-seq-len
+-> lm-eval); an explicit EVAL_FRAMEWORK env or --framework arg overrides it.
+"""
+
+import os
+import subprocess
+from pathlib import Path
+
+BENCHMARK_LIB = Path(__file__).resolve().parents[2] / "benchmarks" / "benchmark_lib.sh"
+
+# Stub the framework runners so dispatch is observable without a server/Docker,
+# and pin EVAL_MAX_MODEL_LEN so run_eval skips context computation. CLI_FW is
+# only forwarded as --framework when set (so we can test the no-arg path).
+_SCRIPT = r'''
+source "$BENCHMARK_LIB"
+run_lm_eval()       { echo "DISPATCH=lm-eval"; }
+run_swebench_eval() { echo "DISPATCH=swebench"; }
+export EVAL_MAX_MODEL_LEN=16384
+unset EVAL_CONCURRENT_REQUESTS
+run_eval ${CLI_FW:+--framework "$CLI_FW"} --port 8888
+'''
+
+
+def _dispatch(*, is_agentic: str = "0", cli_fw=None, env_fw=None) -> str:
+    """Run run_eval in bash with the stubbed runners and return its stdout."""
+    overrides = {"CLI_FW": cli_fw, "EVAL_FRAMEWORK": env_fw}
+    # Drop inherited values first so only the explicit arguments take effect.
+    child_env = {k: v for k, v in os.environ.items() if k not in overrides}
+    child_env.update({k: v for k, v in overrides.items() if v is not None})
+    child_env["BENCHMARK_LIB"] = str(BENCHMARK_LIB)
+    child_env["IS_AGENTIC"] = is_agentic
+    proc = subprocess.run(
+        ["bash", "-c", _SCRIPT], env=child_env, text=True, capture_output=True, check=True
+    )
+    return proc.stdout
+
+
+# --- scenario default ------------------------------------------------------
+
+def test_agentic_scenario_defaults_to_swebench():
+    assert "DISPATCH=swebench" in _dispatch(is_agentic="1")
+
+
+def test_fixed_seqlen_scenario_defaults_to_lm_eval():
+    assert "DISPATCH=lm-eval" in _dispatch(is_agentic="0")
+
+
+# --- explicit overrides win over the scenario default ----------------------
+
+def test_explicit_framework_arg_overrides_scenario():
+    # agentic, but recipe passed --framework lm-eval -> lm-eval.
+    assert "DISPATCH=lm-eval" in _dispatch(is_agentic="1", cli_fw="lm-eval")
+
+
+def test_env_framework_overrides_scenario():
+    assert "DISPATCH=lm-eval" in _dispatch(is_agentic="1", env_fw="lm-eval")
+
+
+def test_env_can_force_swebench_on_fixed_seqlen():
+    assert "DISPATCH=swebench" in _dispatch(is_agentic="0", env_fw="swebench")
+
+
+def test_recipe_lm_eval_arg_still_lm_eval_on_fixed_seqlen():
+    # The existing fixed-seq-len recipes call `run_eval --framework lm-eval`.
+    assert "DISPATCH=lm-eval" in _dispatch(is_agentic="0", cli_fw="lm-eval")
diff --git a/utils/evals/test_swebench_eval.py b/utils/evals/test_swebench_eval.py
new file mode 100644
index 000000000..72dba9b32
--- /dev/null
+++ b/utils/evals/test_swebench_eval.py
@@ -0,0 +1,235 @@
+"""Tests for the SWE-bench Lite eval MVP (generation -> scoring -> lm-eval shape).
+
+Pure-stdlib paths (extract_patch, predictions, report parsing, results shape)
+run on any interpreter. The dataset filter and the collect/validate integration
+guard on optional deps / interpreter version so the file imports cleanly even on
+the macOS system python 3.9 used for local spot-checks.
+"""
+
+import json
+import sys
+from pathlib import Path
+
+import pytest
+
+sys.path.insert(0, str(Path(__file__).resolve().parent))          # utils/evals
+sys.path.insert(0, str(Path(__file__).resolve().parents[1]))      # utils
+
+import swebench_score as sbs
+
+
+# --- diff extraction -------------------------------------------------------
+
+def test_extract_patch_from_diff_fence():
+    text = (
+        "Here is the fix:\n\n```diff\n"
+        "diff --git a/f.py b/f.py\n--- a/f.py\n+++ b/f.py\n"
+        "@@ -1 +1 @@\n-old\n+new\n```\nDone."
+    )
+    patch = sbs.extract_patch(text)
+    assert patch.startswith("diff --git a/f.py b/f.py")
+    assert patch.endswith("\n")
+    assert "Here is the fix" not in patch
+    assert "Done." not in patch
+
+
+def test_extract_patch_bare_diff_git():
+    text = "no fence\ndiff --git a/x b/x\n@@ @@\n-a\n+b\n"
+    patch = sbs.extract_patch(text)
+    assert patch.startswith("diff --git a/x b/x")
+    assert "no fence" not in patch
+
+
+def test_extract_patch_bare_diff_strips_trailing_prose():
+    # A bare diff followed by an explanation must not glue the prose onto the
+    # patch (git apply would reject it -> instance scored unresolved).
+    text = (
+        "diff --git a/x b/x\n--- a/x\n+++ b/x\n@@ -1 +1 @@\n-old\n+new\n"
+        "\nNotes:\nThis fixes #123.\n"
+    )
+    patch = sbs.extract_patch(text)
+    assert patch.rstrip().endswith("+new")
+    assert "Notes:" not in patch
+    assert "This fixes" not in patch
+
+
+def test_extract_patch_keeps_multi_file_and_interior_context():
+    # Multiple files + a blank context line (represented as " ") stay intact.
+    text = (
+        "```diff\n"
+        "diff --git a/a b/a\n@@ -1,2 +1,2 @@\n context\n-x\n+y\n"
+        "diff --git a/b b/b\n@@ -1 +1 @@\n-p\n+q\n"
+        "```\nthanks!"
+    )
+    patch = sbs.extract_patch(text)
+    assert "diff --git a/a b/a" in patch
+    assert "diff --git a/b b/b" in patch
+    assert "thanks" not in patch
+
+
+def test_extract_patch_empty_when_no_diff():
+    assert sbs.extract_patch("") == ""
+    # Prose with no diff markers falls back to the raw text (harness will reject).
+    assert sbs.extract_patch("just words").strip() == "just words"
+
+
+# --- samples -> predictions ------------------------------------------------
+
+def _write_samples(dirpath: Path, records: list[dict]) -> None:
+    # JSONL fixture: one JSON object per line, each terminated by a newline.
+    payload = "".join(json.dumps(record) + "\n" for record in records)
+    (dirpath / "samples_swebench_lite_2026.jsonl").write_text(payload)
+
+
+def test_build_predictions_extracts_instance_and_patch(tmp_path):
+    _write_samples(tmp_path, [
+        {
+            "doc": {"instance_id": "repo__proj-1"},
+            "filtered_resps": ["```diff\ndiff --git a/a b/a\n+x\n```"],
+        },
+        {
+            "doc": {"instance_id": "repo__proj-2"},
+            "resps": [["diff --git a/b b/b\n+y\n"]],
+        },
+    ])
+    preds = sbs.build_predictions(tmp_path, "my-model")
+    by_id = {p["instance_id"]: p for p in preds}
+    assert set(by_id) == {"repo__proj-1", "repo__proj-2"}
+    assert by_id["repo__proj-1"]["model_name_or_path"] == "my-model"
+    assert by_id["repo__proj-1"]["model_patch"].startswith("diff --git a/a b/a")
+    assert by_id["repo__proj-2"]["model_patch"].startswith("diff --git a/b b/b")
+
+
+def test_build_predictions_raises_without_samples(tmp_path):
+    with pytest.raises(FileNotFoundError):
+        sbs.build_predictions(tmp_path, "m")
+
+
+# --- report parsing --------------------------------------------------------
+
+def test_parse_resolved_classic_counts():
+    assert sbs.parse_resolved(
+        {"resolved_instances": 80, "total_instances": 196}
+    ) == (80, 196)
+
+
+def test_parse_resolved_from_id_lists():
+    report = {"resolved_ids": ["a", "b", "c"], "completed_ids": ["a", "b", "c", "d"]}
+    # no total_instances -> falls back to completed_ids length
+    assert sbs.parse_resolved(report) == (3, 4)
+
+
+def test_parse_resolved_raises_on_garbage():
+    with pytest.raises(ValueError):
+        sbs.parse_resolved({"nope": 1})
+
+
+# --- harness command construction (Docker vs Modal) ------------------------
+
+def _captured_harness_cmd(monkeypatch, tmp_path, *, modal, namespace):
+    captured = {}
+    monkeypatch.setattr(sbs.subprocess, "run", lambda cmd, **kw: captured.setdefault("cmd", cmd))
+    sbs.run_harness(
+        tmp_path / "predictions.jsonl", "princeton-nlp/SWE-bench_Lite", "rid",
+        tmp_path, 8, namespace, modal=modal,
+    )
+    return captured["cmd"]
+
+
+def test_run_harness_modal_uses_modal_and_parallelism(monkeypatch, tmp_path):
+    cmd = _captured_harness_cmd(monkeypatch, tmp_path, modal=True, namespace="")
+    assert "--modal" in cmd and "--parallelism" in cmd
+    assert "--namespace" not in cmd  # Docker-only
+    assert "--max_workers" not in cmd
+
+
+def test_run_harness_docker_uses_max_workers_and_namespace(monkeypatch, tmp_path):
+    cmd = _captured_harness_cmd(monkeypatch, tmp_path, modal=False, namespace="")
+    assert "--max_workers" in cmd
+    assert "--namespace" in cmd
+    assert "--modal" not in cmd
+
+
+# --- lm-eval-shaped results ------------------------------------------------
+
+def test_build_results_json_is_lm_eval_shaped():
+    res = sbs.build_results_json(
+        "swebench_lite", 49, 196, "m", "0.4.12", {"resolved_instances": 49}
+    )
+    assert "lm_eval_version" in res  # detection key for collect_eval_results
+    task = res["results"]["swebench_lite"]
+    assert task["exact_match,resolved"] == pytest.approx(0.25)
+    cfg = res["configs"]["swebench_lite"]
+    assert cfg["filter_list"] == [{"name": "resolved"}]
+    assert res["n-samples"]["swebench_lite"]["effective"] == 196
+
+
+def test_score_offline_end_to_end(tmp_path):
+    """--report path: samples -> predictions + results JSON, no Docker."""
+    samples = tmp_path / "gen"
+    samples.mkdir()
+    _write_samples(samples, [
+        {"doc": {"instance_id": "r__p-1"}, "filtered_resps": ["```diff\ndiff --git a/a b/a\n+x\n```"]},
+    ])
+    report = tmp_path / "report.json"
+    report.write_text(json.dumps({"resolved_instances": 1, "total_instances": 1}))
+    out = tmp_path / "out"
+    rc = sbs.main([
+        "--samples-dir", str(samples), "--out-dir", str(out),
+        "--model-name", "m", "--report", str(report),
+    ])
+    assert rc == 0
+    assert (out / "predictions.jsonl").exists()
+    results = json.loads((out / "results_swebench_lite.json").read_text())
+    assert results["results"]["swebench_lite"]["exact_match,resolved"] == 1.0
+
+
+def test_predictions_only_writes_predictions_no_results(tmp_path):
+    """SWEBENCH_SKIP_SCORE path: predictions only, no Docker, no results JSON."""
+    samples = tmp_path / "gen"
+    samples.mkdir()
+    _write_samples(samples, [
+        {"doc": {"instance_id": "r__p-1"}, "filtered_resps": ["```diff\ndiff --git a/a b/a\n+x\n```"]},
+    ])
+    out = tmp_path / "out"
+    rc = sbs.main([
+        "--samples-dir", str(samples), "--out-dir", str(out),
+        "--model-name", "m", "--predictions-only",
+    ])
+    assert rc == 0
+    assert (out / "predictions.jsonl").exists()
+    assert not (out / "results_swebench_lite.json").exists()
+
+
+# --- integration with the existing pipeline (needs tabulate + py3.10+) -----
+
+@pytest.mark.skipif(sys.version_info < (3, 10), reason="repo modules use py3.10 syntax")
+def test_results_json_flows_through_collect_and_validate(tmp_path, monkeypatch):
+    pytest.importorskip("tabulate")
+    import collect_eval_results as cer
+    import validate_scores as vs
+
+    art = tmp_path / "eval"
+    art.mkdir()
+    (art / "meta_env.json").write_text(json.dumps({
+        "infmax_model_prefix": "dsr1", "hw": "b200", "framework": "sglang",
+        "precision": "fp8", "isl": 8192, "osl": 1024,
+    }))
+    res = sbs.build_results_json(
+        "swebench_lite", 150, 300, "dsr1", "0.4.12", None
+    )
+    (art / "results_swebench_lite.json").write_text(json.dumps(res))
+
+    # collect surfaces the resolved-rate as the unified `score`.
+    rows = cer.collect_eval_rows(tmp_path)
+    assert len(rows) == 1
+    assert rows[0]["task"] == "swebench_lite"
+    assert rows[0]["score"] == pytest.approx(0.5)
+
+    # validate_scores gates exact_match,resolved against thresholds.json (0.10).
+    monkeypatch.chdir(art)
+    monkeypatch.setattr(sys, "argv", [
+        "validate_scores.py",
+        "--results-glob", "results_swebench_lite.json",
+    ])
+    assert vs.main() == 0  # 0.5 >= 0.10 default threshold
diff --git a/utils/evals/thresholds.json b/utils/evals/thresholds.json
index d6c091152..cbbe65105 100644
--- a/utils/evals/thresholds.json
+++ b/utils/evals/thresholds.json
@@ -1,7 +1,8 @@
 {
   "default": {
     "gsm8k": 0.90,
-    "gpqa_diamond_cot_n_shot": 0.30
+    "gpqa_diamond_cot_n_shot": 0.30,
+    "swebench_lite": 0.10
   },
   "models": {
     "dsr1": {

From 8f9f3ee4b3303ee1fb252cac931cddb5931f62a7 Mon Sep 17 00:00:00 2001
From: adibarra <93070681+adibarra@users.noreply.github.com>
Date: Thu, 2 Jul 2026 12:12:34 -0500
Subject: [PATCH 2/4] feat(evals): agentic-scenario eval selection + routing
 (swebench on agentic configs)

Wire the SWE-bench Lite eval into the sweep matrix so it runs on agentic
coding configs, and route it through e2e-tests.

- utils/matrix_logic/generate_sweep_configs.py: extend mark_eval_entries and
  mark_all_eval_entries to cover agentic configs. mark_eval_entries marks
  exactly one eval entry per (model, runner, framework, precision) group at
  the highest concurrency, single-node only, so each unique agentic config
  gets one swebench eval run rather than one per concurrency point;
  mark_all_eval_entries marks every single-node agentic entry.
- utils/matrix_logic/test_generate_sweep_configs.py: add
  test_marks_agentic_entry_for_swebench and update TestMarkAllEvalEntries
  to cover the agentic marking behavior.
- .github/workflows/e2e-tests.yml: add the agentic-eval-config bucket, a
  test-sweep-agentic-evals job, and make collect-evals depend on it. The
  AGENTIC_EVAL filter (agentic + no prefill + run-eval) selects the eval
  entries; the throughput AGENTIC filter (agentic + not run-eval) excludes
  them so throughput and eval runs don't collide.
- benchmarks/single_node/agentic/kimik2.5_fp4_b300.sh: add the eval hook so
  the recipe triggers the agentic swebench eval.
---
 .github/workflows/e2e-tests.yml               | 44 +++++++++++++++++--
 .../single_node/agentic/kimik2.5_fp4_b300.sh  | 13 ++++--
 utils/matrix_logic/generate_sweep_configs.py  | 31 +++++++++++--
 .../test_generate_sweep_configs.py            | 30 ++++++++++++-
 4 files changed, 106 insertions(+), 12 deletions(-)

diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml
index 1b83a798a..58c6fe34d 100644
--- a/.github/workflows/e2e-tests.yml
+++ b/.github/workflows/e2e-tests.yml
@@ -50,6 +50,7 @@ jobs:
             eval-config: ${{ steps.get-jobs.outputs.eval-config }}
             multi-node-eval-config: ${{ steps.get-jobs.outputs.multi-node-eval-config }}
             agentic-config: ${{ steps.get-jobs.outputs.agentic-config }}
+            agentic-eval-config: ${{ steps.get-jobs.outputs.agentic-eval-config }}
             multi-node-agentic-config: ${{ steps.get-jobs.outputs.multi-node-agentic-config }}
         steps:
             - name: Checkout code (ref)
@@ -69,13 +70,15 @@ jobs:
                   pip install pydantic
                   CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py \
                     ${{ inputs.generate-cli-command || github.event.inputs.generate-cli-command }})
-                  AGENTIC=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if x.get('scenario-type') == 'agentic-coding' and 'prefill' not in x]))")
+                  AGENTIC=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if x.get('scenario-type') == 'agentic-coding' and 'prefill' not in x and not x.get('run-eval', False)]))")
+                  AGENTIC_EVAL=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if x.get('scenario-type') == 'agentic-coding' and 'prefill' not in x and x.get('run-eval', False)]))")
                   MULTI_AGENTIC=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if x.get('scenario-type') == 'agentic-coding' and 'prefill' in x]))")
                   SINGLE=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' not in x and x.get('scenario-type') != 'agentic-coding' and not x.get('eval-only', False)]))")
                   MULTI=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' in x and x.get('scenario-type') != 'agentic-coding' and not x.get('eval-only', False)]))")
                   EVALS=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' not in x and x.get('scenario-type') != 'agentic-coding' and x.get('run-eval', False)]))")
                   MULTI_EVAL=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' in x and x.get('run-eval', False)]))")
                   echo "agentic-config=$AGENTIC" >> $GITHUB_OUTPUT
+                  echo "agentic-eval-config=$AGENTIC_EVAL" >> $GITHUB_OUTPUT
                   echo "multi-node-agentic-config=$MULTI_AGENTIC" >> $GITHUB_OUTPUT
                   echo "single-node-config=$SINGLE" >> $GITHUB_OUTPUT
                   echo "multi-node-config=$MULTI" >> $GITHUB_OUTPUT
@@ -195,6 +198,41 @@ jobs:
             scenario-type: agentic-coding
             ref: ${{ inputs.ref }}
 
+    test-sweep-agentic-evals:
+        needs: get-jobs
+        if: ${{ needs.get-jobs.outputs.agentic-eval-config != '[]' }}
+        uses: ./.github/workflows/benchmark-tmpl.yml
+        name: agentic eval /
+        strategy:
+            fail-fast: false
+            matrix:
+                config: ${{ fromJson(needs.get-jobs.outputs.agentic-eval-config) }}
+        secrets: inherit
+        with:
+            exp-name: ${{ matrix.config.exp-name }}
+            runner: ${{ matrix.config.runner }}
+            image: ${{ matrix.config.image }}
+            model: ${{ matrix.config.model }}
+            model-prefix: ${{ matrix.config.model-prefix }}
+            framework: ${{ matrix.config.framework }}
+            precision: ${{ matrix.config.precision }}
+            tp: ${{ matrix.config.tp }}
+            ep: ${{ matrix.config.ep }}
+            dp-attn: ${{ matrix.config.dp-attn }}
+            conc: ${{ matrix.config.conc }}
+            offloading: ${{ matrix.config.offloading }}
+            duration: ${{ inputs.duration-override != '' && inputs.duration-override || matrix.config.duration }}
+            isl: '0'
+            osl: '0'
+            max-model-len: '0'
+            spec-decoding: 'none'
+            disagg: 'false'
+            # scenario-type agentic-coding => run_eval auto-selects swebench.
+            run-eval: true
+            eval-only: true
+            scenario-type: agentic-coding
+            ref: ${{ inputs.ref }}
+
     test-sweep-multi-node-agentic:
         needs: get-jobs
         if: ${{ needs.get-jobs.outputs.multi-node-agentic-config != '[]' }}
@@ -305,8 +343,8 @@ jobs:
             result-prefix: "bmk"
 
     collect-evals:
-        needs: [test-sweep-evals, test-sweep-multi-node-evals]
-        if: ${{ always() && (needs.test-sweep-evals.result != 'skipped' || needs.test-sweep-multi-node-evals.result != 'skipped') }}
+        needs: [test-sweep-evals, test-sweep-multi-node-evals, test-sweep-agentic-evals]
+        if: ${{ always() && (needs.test-sweep-evals.result != 'skipped' || needs.test-sweep-multi-node-evals.result != 'skipped' || needs.test-sweep-agentic-evals.result != 'skipped') }}
         uses: ./.github/workflows/collect-evals.yml
         secrets: inherit
 
diff --git a/benchmarks/single_node/agentic/kimik2.5_fp4_b300.sh b/benchmarks/single_node/agentic/kimik2.5_fp4_b300.sh
index 8cebe4f20..af0678246 100755
--- a/benchmarks/single_node/agentic/kimik2.5_fp4_b300.sh
+++ b/benchmarks/single_node/agentic/kimik2.5_fp4_b300.sh
@@ -109,7 +109,12 @@ echo "Server PID: $SERVER_PID"
 
 wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
 
-# ---- Run benchmark ----------------------------------------------------------
-build_replay_cmd "$RESULT_DIR"
-
-run_agentic_replay_and_write_outputs "$RESULT_DIR"
+if [ "${EVAL_ONLY:-false}" = "true" ]; then
+    # Eval-only: skip the multi-turn agentic replay and run the eval against the
+    # live server. EVAL_ONLY defaults to false so an unset value is safe under set -u.
+    maybe_run_eval "$PORT"
+else
+    # ---- Run benchmark ------------------------------------------------------
+    build_replay_cmd "$RESULT_DIR"
+    run_agentic_replay_and_write_outputs "$RESULT_DIR"
+fi
diff --git a/utils/matrix_logic/generate_sweep_configs.py b/utils/matrix_logic/generate_sweep_configs.py
index af3e5053a..2512175d1 100644
--- a/utils/matrix_logic/generate_sweep_configs.py
+++ b/utils/matrix_logic/generate_sweep_configs.py
@@ -137,10 +137,30 @@ def _eligible_eval_concs(entry):
         eval_indices.add(best_idx)
         mn_eval_conc[best_idx] = best_eval_conc
 
-    # Mark the selected entries (skip agentic entries which don't support evals)
+    # Agentic-coding entries run swebench (single-shot), not gsm8k, and have no
+    # isl/osl so the 8k1k logic above never selects them. Single-node only for
+    # now: mark one entry per (model, runner, framework, precision) at its
+    # highest concurrency. (Multi-node agentic eval is a future extension.)
+    ag_sn_groups = defaultdict(list)
     for i, entry in enumerate(matrix_values):
-        if entry.get(Fields.SCENARIO_TYPE.value) == 'agentic-coding':
+        if entry.get(Fields.SCENARIO_TYPE.value) != 'agentic-coding':
+            continue
+        if Fields.PREFILL.value in entry:  # multi-node agentic: not yet
             continue
+        conc = entry[Fields.CONC.value]
+        conc_val = max(conc) if isinstance(conc, list) else conc
+        key = (
+            entry[Fields.MODEL.value],
+            entry[Fields.RUNNER.value],
+            entry[Fields.FRAMEWORK.value],
+            entry[Fields.PRECISION.value],
+        )
+        ag_sn_groups[key].append((i, conc_val))
+    for entries in ag_sn_groups.values():
+        eval_indices.add(max(entries, key=lambda item: item[1])[0])
+
+    # Mark the selected entries (agentic-coding entries run swebench; see above).
+    for i, entry in enumerate(matrix_values):
         entry[Fields.RUN_EVAL.value] = i in eval_indices
         if i in mn_eval_conc:
             entry[Fields.EVAL_CONC.value] = mn_eval_conc[i]
@@ -154,7 +174,8 @@ def mark_all_eval_entries(matrix_values: list[dict]) -> list[dict]:
     Evals only run at 8k1k (matching mark_eval_entries), so entries at other
     sequence lengths (e.g. 1k1k) are passed through untouched rather than
     expanded into eval rows.
-    Agentic entries are left untouched because they do not support lm-eval.
+    Single-node agentic entries run swebench (single-shot) and are marked for
+    eval (multi-node agentic eval is a future extension).
     Multi-node rows with the same engine topology are merged into one eval row
     whose full concurrency list is run sequentially against the same engine.
     """
@@ -165,6 +186,10 @@ def mark_all_eval_entries(matrix_values: list[dict]) -> list[dict]:
 
     for entry in matrix_values:
         if entry.get(Fields.SCENARIO_TYPE.value) == 'agentic-coding':
+            # Agentic runs swebench (single-shot); mark single-node agentic for
+            # eval. Multi-node agentic eval is a future extension.
+            if Fields.PREFILL.value not in entry:
+                entry[Fields.RUN_EVAL.value] = True
             expanded_entries.append(entry)
             continue
 
diff --git a/utils/matrix_logic/test_generate_sweep_configs.py b/utils/matrix_logic/test_generate_sweep_configs.py
index 72717eab6..da9d3b5db 100644
--- a/utils/matrix_logic/test_generate_sweep_configs.py
+++ b/utils/matrix_logic/test_generate_sweep_configs.py
@@ -196,6 +196,29 @@ def test_unknown_sequence_lengths(self):
 class TestMarkEvalEntries:
     """Tests for eval matrix selection policy."""
 
+    def test_marks_agentic_entry_for_swebench(self):
+        """Agentic-coding configs (no isl/osl) are marked run-eval for swebench:
+        one per (model, runner, framework, precision) at highest conc."""
+        matrix_values = [
+            {
+                "scenario-type": "agentic-coding",
+                "model": "m", "runner": "b300", "framework": "vllm",
+                "precision": "fp4", "tp": 8, "conc": 32,
+            },
+            {
+                "scenario-type": "agentic-coding",
+                "model": "m", "runner": "b300", "framework": "vllm",
+                "precision": "fp4", "tp": 8, "conc": 64,
+            },
+        ]
+
+        result = mark_eval_entries(matrix_values)
+
+        # One eval per config, at the highest concurrency (64, not 32).
+        marked = [e for e in result if e.get("run-eval")]
+        assert len(marked) == 1
+        assert marked[0]["conc"] == 64
+
     def test_single_node_skips_eval_entries_below_min_conc(self):
         """Single-node eval selection should ignore conc values below MIN_EVAL_CONC."""
         matrix_values = [
@@ -605,7 +628,10 @@ def test_excludes_1k1k_multinode_entries_from_expansion(self):
         assert eight_k['eval-all-concs'] is True
         assert eight_k['conc'] == [8, 32]
 
-    def test_skips_agentic_entries(self):
+    def test_marks_agentic_entries_for_swebench(self):
+        # Agentic configs now run swebench (single-shot), so --all-evals marks
+        # every single-node agentic entry run-eval=True -- but without the
+        # fixed-seq-len multi-node batching fields (eval-conc / eval-all-concs).
         entries = [
             {
                 'scenario-type': 'agentic-coding',
@@ -617,7 +643,7 @@ def test_skips_agentic_entries(self):
 
         result = mark_all_eval_entries(entries)
 
-        assert 'run-eval' not in result[0]
+        assert result[0]['run-eval'] is True
         assert 'eval-conc' not in result[0]
         assert 'eval-all-concs' not in result[0]
 

From 13bf8e8c5a6a2270142200140005c5c5bbbdb1e0 Mon Sep 17 00:00:00 2001
From: adibarra <93070681+adibarra@users.noreply.github.com>
Date: Thu, 2 Jul 2026 12:22:53 -0500
Subject: [PATCH 3/4] fix(evals): swebench Modal uses --max_workers (no
 --parallelism in 4.1.0) + bootstrap Modal creds from env

swebench 4.1.0 exposes --max_workers in both Docker and Modal modes; --parallelism
does not exist. Fix run_harness() to emit --max_workers in the Modal branch.

Add _ensure_modal_credentials() to benchmark_lib.sh: swebench's credential
check only looks for ~/.modal.toml, but CI supplies MODAL_TOKEN_ID/
MODAL_TOKEN_SECRET env vars (GitHub secret). The helper bootstraps the file
from the env vars when the file is absent, so the harness check passes. Called
in run_swebench_eval() right after _install_swebench_deps, scoring path only.

Update the Modal test name and assertions, the run_swebench_eval docstring,
and the EVALS.md knobs bullet to document the credential bootstrapping.
---
 benchmarks/benchmark_lib.sh       | 23 ++++++++++++++++++++++-
 utils/evals/EVALS.md              |  9 ++++++---
 utils/evals/swebench_score.py     |  5 +++--
 utils/evals/test_swebench_eval.py |  7 ++++---
 4 files changed, 35 insertions(+), 9 deletions(-)

diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index 1e59aa1b4..474c846fc 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -984,7 +984,10 @@ META
 #   SWEBENCH_MAX_WORKERS   (default 4) harness workers / Modal parallelism
 #   SWEBENCH_USE_MODAL     "true" => score on Modal remote sandboxes instead of
 #                          local Docker (no Docker needed on the node; requires a
-#                          Modal account + ~/.modal.toml or MODAL_TOKEN_* creds)
+#                          Modal account — credentials from ~/.modal.toml or from
+#                          MODAL_TOKEN_ID/MODAL_TOKEN_SECRET env vars, e.g. a
+#                          GitHub secret; the env vars are bootstrapped into
+#                          ~/.modal.toml automatically if the file is absent)
 #   SWEBENCH_NAMESPACE     local-Docker only: pass "" on arm/Mac to build locally
 #   SWEBENCH_SKIP_SCORE    "true" => generate + stage predictions only, no scoring
 #                          (score elsewhere)
@@ -996,6 +999,23 @@ _install_swebench_deps() {
     fi
 }
 
+# swebench's validate_modal_credentials() only checks that ~/.modal.toml exists,
+# but CI supplies MODAL_TOKEN_ID/MODAL_TOKEN_SECRET env vars (GitHub secret).
+# Bootstrap a minimal file from the env so that check passes; never overwrite an
+# existing file. It is created under umask 077 so the secret is never exposed.
+_ensure_modal_credentials() {
+    if [ "${SWEBENCH_USE_MODAL:-false}" != "true" ]; then return 0; fi
+    if [ -f "$HOME/.modal.toml" ]; then return 0; fi
+    if [ -n "${MODAL_TOKEN_ID:-}" ] && [ -n "${MODAL_TOKEN_SECRET:-}" ]; then
+        ( umask 077; printf '[default]\ntoken_id = "%s"\ntoken_secret = "%s"\nactive = true\n' \
+            "$MODAL_TOKEN_ID" "$MODAL_TOKEN_SECRET" > "$HOME/.modal.toml" )
+        chmod 600 "$HOME/.modal.toml"
+        echo "[swebench] wrote ~/.modal.toml from MODAL_TOKEN_ID/MODAL_TOKEN_SECRET env"
+    else
+        echo "WARN: SWEBENCH_USE_MODAL=true but no ~/.modal.toml and no MODAL_TOKEN_ID/MODAL_TOKEN_SECRET env; Modal scoring will fail credential validation" >&2
+    fi
+}
+
 # Run the configured eval and stage its artifacts when RUN_EVAL is enabled.
 # run_eval auto-selects the framework by scenario (agentic -> swebench,
 # fixed-seq-len -> lm-eval), so recipes call this without naming a framework.
@@ -1072,6 +1092,7 @@ run_swebench_eval() {
         _install_swebench_deps
         export INFERENCEX_SWEBENCH_RUNTIME_READY=true
     fi
+    _ensure_modal_credentials
     local score_rc=0
     python3 utils/evals/swebench_score.py \
         --samples-dir "$gen_dir" \
diff --git a/utils/evals/EVALS.md b/utils/evals/EVALS.md
index a7738defc..e518dcdb8 100644
--- a/utils/evals/EVALS.md
+++ b/utils/evals/EVALS.md
@@ -188,9 +188,12 @@ append_lm_eval_summary
   `python -m swebench.harness.run_evaluation` → resolved-rate → results JSON). Offline
   `--report` mode skips Docker for testing.
 - Knobs: `SWEBENCH_TASK_NAME` (selects the YAML), `SWEBENCH_MAX_WORKERS`,
-  `SWEBENCH_NAMESPACE` (pass `""` on arm/Mac), `SWEBENCH_SKIP_SCORE=true` (generate-only). The
-  scoring dataset is derived from the YAML's `dataset_path` so generation and scoring can't diverge;
-  `SWEBENCH_DATASET`, if set, must match it (mismatch fails fast).
+  `SWEBENCH_NAMESPACE` (pass `""` on arm/Mac), `SWEBENCH_SKIP_SCORE=true` (generate-only),
+  `SWEBENCH_USE_MODAL=true` (score on Modal remote sandboxes instead of local Docker). Modal
+  credentials: set `MODAL_TOKEN_ID`/`MODAL_TOKEN_SECRET` (e.g. from a GitHub secret) or provide
+  `~/.modal.toml`; if the file is absent the env vars are bootstrapped into it automatically.
+  The scoring dataset is derived from the YAML's `dataset_path` so generation and scoring can't
+  diverge; `SWEBENCH_DATASET`, if set, must match it (mismatch fails fast).
 - **Requires Docker + ~120 GB disk on the scoring host.** This is an MVP; the single-shot prompt and
   diff extraction still need tuning to reach published resolved-rates, and the `thresholds.json` entry
   needs calibration from a baseline run.
diff --git a/utils/evals/swebench_score.py b/utils/evals/swebench_score.py
index 813a0e811..8492cf00b 100644
--- a/utils/evals/swebench_score.py
+++ b/utils/evals/swebench_score.py
@@ -204,8 +204,9 @@ def run_harness(
     ]
     if modal:
         # Modal remote sandboxes instead of local Docker (no Docker on the node).
-        # --parallelism replaces --max_workers; --namespace is local-Docker only.
-        cmd += ["--modal", "true", "--parallelism", str(max_workers)]
+        # swebench 4.1.0 uses --max_workers in both modal and Docker modes;
+        # --namespace is local-Docker only and is still omitted for modal.
+        cmd += ["--modal", "true", "--max_workers", str(max_workers)]
     else:
         cmd += ["--max_workers", str(max_workers)]
         if namespace is not None:
diff --git a/utils/evals/test_swebench_eval.py b/utils/evals/test_swebench_eval.py
index 72dba9b32..6df711956 100644
--- a/utils/evals/test_swebench_eval.py
+++ b/utils/evals/test_swebench_eval.py
@@ -136,11 +136,12 @@ def _captured_harness_cmd(monkeypatch, tmp_path, *, modal, namespace):
     return captured["cmd"]
 
 
-def test_run_harness_modal_uses_modal_and_parallelism(monkeypatch, tmp_path):
+def test_run_harness_modal_uses_modal_flag(monkeypatch, tmp_path):
     cmd = _captured_harness_cmd(monkeypatch, tmp_path, modal=True, namespace="")
-    assert "--modal" in cmd and "--parallelism" in cmd
+    assert "--modal" in cmd
+    assert "--max_workers" in cmd
+    assert "--parallelism" not in cmd
     assert "--namespace" not in cmd  # Docker-only
-    assert "--max_workers" not in cmd
 
 
 def test_run_harness_docker_uses_max_workers_and_namespace(monkeypatch, tmp_path):

From e34035d55e8b97bf532637b3d12b5f132524a24c Mon Sep 17 00:00:00 2001
From: adibarra <93070681+adibarra@users.noreply.github.com>
Date: Thu, 2 Jul 2026 12:27:25 -0500
Subject: [PATCH 4/4] feat(evals): eval-only gating for all single-node agentic
 recipes

Apply the EVAL_ONLY=true if/else gating pattern (already present in
kimik2.5_fp4_b300.sh) to the remaining 24 single-node agentic recipes in
benchmarks/single_node/agentic/. In eval-only mode each recipe skips the
multi-turn agentic replay and calls maybe_run_eval "$PORT" against the live
server; run_eval auto-selects swebench for the agentic-coding scenario.
The deprecated/ subdirectory was not touched.
---
 benchmarks/single_node/agentic/dsr1_fp4_b200.sh     | 13 +++++++++----
 benchmarks/single_node/agentic/dsr1_fp4_mi355x.sh   | 13 +++++++++----
 .../single_node/agentic/dsv4_fp4_b200_vllm.sh       | 13 +++++++++----
 .../single_node/agentic/dsv4_fp4_b300_vllm.sh       | 13 +++++++++----
 .../single_node/agentic/dsv4_fp4_mi355x_sglang.sh   | 13 +++++++++----
 benchmarks/single_node/agentic/dsv4_fp8_h200.sh     | 13 +++++++++----
 benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh | 13 +++++++++----
 benchmarks/single_node/agentic/glm5_fp8_b200.sh     | 13 +++++++++----
 benchmarks/single_node/agentic/gptoss_fp4_b200.sh   | 13 +++++++++----
 benchmarks/single_node/agentic/gptoss_fp4_h100.sh   | 13 +++++++++----
 benchmarks/single_node/agentic/gptoss_fp4_h200.sh   | 13 +++++++++----
 benchmarks/single_node/agentic/gptoss_fp4_mi300x.sh | 13 +++++++++----
 benchmarks/single_node/agentic/gptoss_fp4_mi325x.sh | 13 +++++++++----
 benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh | 13 +++++++++----
 .../single_node/agentic/kimik2.5_fp4_mi355x.sh      | 13 +++++++++----
 .../single_node/agentic/kimik2.5_int4_b200.sh       | 13 +++++++++----
 .../single_node/agentic/kimik2.5_int4_h100.sh       | 13 +++++++++----
 .../single_node/agentic/kimik2.5_int4_h200.sh       | 13 +++++++++----
 benchmarks/single_node/agentic/qwen3.5_bf16_b200.sh | 13 +++++++++----
 benchmarks/single_node/agentic/qwen3.5_fp8_b200.sh  | 13 +++++++++----
 .../single_node/agentic/qwen3.5_fp8_b300_sglang.sh  | 13 +++++++++----
 benchmarks/single_node/agentic/qwen3.5_fp8_h100.sh  | 13 +++++++++----
 .../single_node/agentic/qwen3.5_fp8_mi355x.sh       | 13 +++++++++----
 .../agentic/qwen3.5_fp8_mi355x_sglang.sh            | 13 +++++++++----
 24 files changed, 216 insertions(+), 96 deletions(-)

diff --git a/benchmarks/single_node/agentic/dsr1_fp4_b200.sh b/benchmarks/single_node/agentic/dsr1_fp4_b200.sh
index f9955adc7..7a46c136f 100755
--- a/benchmarks/single_node/agentic/dsr1_fp4_b200.sh
+++ b/benchmarks/single_node/agentic/dsr1_fp4_b200.sh
@@ -58,7 +58,12 @@ echo "Server PID: $SERVER_PID"
 
 wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
 
-# ---- Run benchmark ----------------------------------------------------------
-build_replay_cmd "$RESULT_DIR"
-
-run_agentic_replay_and_write_outputs "$RESULT_DIR"
+if [ "${EVAL_ONLY}" = "true" ]; then
+    # Eval-only: skip the multi-turn agentic replay and run the eval against the
+    # live server. run_eval auto-selects swebench for the agentic-coding scenario.
+    maybe_run_eval "$PORT"
+else
+    # ---- Run benchmark ------------------------------------------------------
+    build_replay_cmd "$RESULT_DIR"
+    run_agentic_replay_and_write_outputs "$RESULT_DIR"
+fi
diff --git a/benchmarks/single_node/agentic/dsr1_fp4_mi355x.sh b/benchmarks/single_node/agentic/dsr1_fp4_mi355x.sh
index ff76b768d..38dc01922 100755
--- a/benchmarks/single_node/agentic/dsr1_fp4_mi355x.sh
+++ b/benchmarks/single_node/agentic/dsr1_fp4_mi355x.sh
@@ -52,7 +52,12 @@ echo "Server PID: $SERVER_PID"
 
 wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
 
-# ---- Run benchmark ----------------------------------------------------------
-build_replay_cmd "$RESULT_DIR"
-
-run_agentic_replay_and_write_outputs "$RESULT_DIR"
+if [ "${EVAL_ONLY}" = "true" ]; then
+    # Eval-only: skip the multi-turn agentic replay and run the eval against the
+    # live server. run_eval auto-selects swebench for the agentic-coding scenario.
+    maybe_run_eval "$PORT"
+else
+    # ---- Run benchmark ------------------------------------------------------
+    build_replay_cmd "$RESULT_DIR"
+    run_agentic_replay_and_write_outputs "$RESULT_DIR"
+fi
diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh
index 108347479..f663b659f 100755
--- a/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh
+++ b/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh
@@ -249,7 +249,12 @@ echo "Server PID: $SERVER_PID"
 
 wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
 
-# ---- Run benchmark ----------------------------------------------------------
-build_replay_cmd "$RESULT_DIR"
-
-run_agentic_replay_and_write_outputs "$RESULT_DIR"
+if [ "${EVAL_ONLY}" = "true" ]; then
+    # Eval-only: skip the multi-turn agentic replay and run the eval against the
+    # live server. run_eval auto-selects swebench for the agentic-coding scenario.
+    maybe_run_eval "$PORT"
+else
+    # ---- Run benchmark ------------------------------------------------------
+    build_replay_cmd "$RESULT_DIR"
+    run_agentic_replay_and_write_outputs "$RESULT_DIR"
+fi
diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh
index f6748a5f8..92e1dd2ba 100755
--- a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh
+++ b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh
@@ -137,7 +137,12 @@ echo "Server PID: $SERVER_PID"
 
 wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
 
-# ---- Run benchmark ----------------------------------------------------------
-build_replay_cmd "$RESULT_DIR"
-
-run_agentic_replay_and_write_outputs "$RESULT_DIR"
+if [ "${EVAL_ONLY}" = "true" ]; then
+    # Eval-only: skip the multi-turn agentic replay and run the eval against the
+    # live server. run_eval auto-selects swebench for the agentic-coding scenario.
+    maybe_run_eval "$PORT"
+else
+    # ---- Run benchmark ------------------------------------------------------
+    build_replay_cmd "$RESULT_DIR"
+    run_agentic_replay_and_write_outputs "$RESULT_DIR"
+fi
diff --git a/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh b/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh
index 99aec25fe..04c297f4f 100755
--- a/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh
+++ b/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh
@@ -156,7 +156,12 @@ echo "Server PID: $SERVER_PID"
 
 wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
 
-# ---- Run benchmark ----------------------------------------------------------
-build_replay_cmd "$RESULT_DIR"
-
-run_agentic_replay_and_write_outputs "$RESULT_DIR"
+if [ "${EVAL_ONLY}" = "true" ]; then
+    # Eval-only: skip the multi-turn agentic replay and run the eval against the
+    # live server. run_eval auto-selects swebench for the agentic-coding scenario.
+    maybe_run_eval "$PORT"
+else
+    # ---- Run benchmark ------------------------------------------------------
+    build_replay_cmd "$RESULT_DIR"
+    run_agentic_replay_and_write_outputs "$RESULT_DIR"
+fi
diff --git a/benchmarks/single_node/agentic/dsv4_fp8_h200.sh b/benchmarks/single_node/agentic/dsv4_fp8_h200.sh
index 0a0177983..71fd2b331 100755
--- a/benchmarks/single_node/agentic/dsv4_fp8_h200.sh
+++ b/benchmarks/single_node/agentic/dsv4_fp8_h200.sh
@@ -63,7 +63,12 @@ echo "Server PID: $SERVER_PID"
 
 wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
 
-# ---- Run benchmark ----------------------------------------------------------
-build_replay_cmd "$RESULT_DIR"
-
-run_agentic_replay_and_write_outputs "$RESULT_DIR"
+if [ "${EVAL_ONLY}" = "true" ]; then
+    # Eval-only: skip the multi-turn agentic replay and run the eval against the
+    # live server. run_eval auto-selects swebench for the agentic-coding scenario.
+    maybe_run_eval "$PORT"
+else
+    # ---- Run benchmark ------------------------------------------------------
+    build_replay_cmd "$RESULT_DIR"
+    run_agentic_replay_and_write_outputs "$RESULT_DIR"
+fi
diff --git a/benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh b/benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh
index 500b456f5..b731d81ce 100755
--- a/benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh
+++ b/benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh
@@ -64,7 +64,12 @@ echo "Server PID: $SERVER_PID"
 
 wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
 
-# ---- Run benchmark ----------------------------------------------------------
-build_replay_cmd "$RESULT_DIR"
-
-run_agentic_replay_and_write_outputs "$RESULT_DIR"
+if [ "${EVAL_ONLY}" = "true" ]; then
+    # Eval-only: skip the multi-turn agentic replay and run the eval against the
+    # live server. run_eval auto-selects swebench for the agentic-coding scenario.
+    maybe_run_eval "$PORT"
+else
+    # ---- Run benchmark ------------------------------------------------------
+    build_replay_cmd "$RESULT_DIR"
+    run_agentic_replay_and_write_outputs "$RESULT_DIR"
+fi
diff --git a/benchmarks/single_node/agentic/glm5_fp8_b200.sh b/benchmarks/single_node/agentic/glm5_fp8_b200.sh
index 259c19586..26e09e382 100755
--- a/benchmarks/single_node/agentic/glm5_fp8_b200.sh
+++ b/benchmarks/single_node/agentic/glm5_fp8_b200.sh
@@ -69,7 +69,12 @@ echo "Server PID: $SERVER_PID"
 
 wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
 
-# ---- Run benchmark ----------------------------------------------------------
-build_replay_cmd "$RESULT_DIR"
-
-run_agentic_replay_and_write_outputs "$RESULT_DIR"
+if [ "${EVAL_ONLY}" = "true" ]; then
+    # Eval-only: skip the multi-turn agentic replay and run the eval against the
+    # live server. run_eval auto-selects swebench for the agentic-coding scenario.
+    maybe_run_eval "$PORT"
+else
+    # ---- Run benchmark ------------------------------------------------------
+    build_replay_cmd "$RESULT_DIR"
+    run_agentic_replay_and_write_outputs "$RESULT_DIR"
+fi
diff --git a/benchmarks/single_node/agentic/gptoss_fp4_b200.sh b/benchmarks/single_node/agentic/gptoss_fp4_b200.sh
index 6e921db58..ddcbc7937 100755
--- a/benchmarks/single_node/agentic/gptoss_fp4_b200.sh
+++ b/benchmarks/single_node/agentic/gptoss_fp4_b200.sh
@@ -66,7 +66,12 @@ echo "Server PID: $SERVER_PID"
 
 wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
 
-# ---- Run benchmark ----------------------------------------------------------
-build_replay_cmd "$RESULT_DIR"
-
-run_agentic_replay_and_write_outputs "$RESULT_DIR"
+if [ "${EVAL_ONLY}" = "true" ]; then
+    # Eval-only: skip the multi-turn agentic replay and run the eval against the
+    # live server. run_eval auto-selects swebench for the agentic-coding scenario.
+    maybe_run_eval "$PORT"
+else
+    # ---- Run benchmark ------------------------------------------------------
+    build_replay_cmd "$RESULT_DIR"
+    run_agentic_replay_and_write_outputs "$RESULT_DIR"
+fi
diff --git a/benchmarks/single_node/agentic/gptoss_fp4_h100.sh b/benchmarks/single_node/agentic/gptoss_fp4_h100.sh
index 557986b0d..68856a75c 100755
--- a/benchmarks/single_node/agentic/gptoss_fp4_h100.sh
+++ b/benchmarks/single_node/agentic/gptoss_fp4_h100.sh
@@ -70,7 +70,12 @@ echo "Server PID: $SERVER_PID"
 
 wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
 
-# ---- Run benchmark ----------------------------------------------------------
-build_replay_cmd "$RESULT_DIR"
-
-run_agentic_replay_and_write_outputs "$RESULT_DIR"
+if [ "${EVAL_ONLY}" = "true" ]; then
+    # Eval-only: skip the multi-turn agentic replay and run the eval against the
+    # live server. run_eval auto-selects swebench for the agentic-coding scenario.
+    maybe_run_eval "$PORT"
+else
+    # ---- Run benchmark ------------------------------------------------------
+    build_replay_cmd "$RESULT_DIR"
+    run_agentic_replay_and_write_outputs "$RESULT_DIR"
+fi
diff --git a/benchmarks/single_node/agentic/gptoss_fp4_h200.sh b/benchmarks/single_node/agentic/gptoss_fp4_h200.sh
index 1592a8d5c..360aed712 100755
--- a/benchmarks/single_node/agentic/gptoss_fp4_h200.sh
+++ b/benchmarks/single_node/agentic/gptoss_fp4_h200.sh
@@ -70,7 +70,12 @@ echo "Server PID: $SERVER_PID"
 
 wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
 
-# ---- Run benchmark ----------------------------------------------------------
-build_replay_cmd "$RESULT_DIR"
-
-run_agentic_replay_and_write_outputs "$RESULT_DIR"
+if [ "${EVAL_ONLY}" = "true" ]; then
+    # Eval-only: skip the multi-turn agentic replay and run the eval against the
+    # live server. run_eval auto-selects swebench for the agentic-coding scenario.
+    maybe_run_eval "$PORT"
+else
+    # ---- Run benchmark ------------------------------------------------------
+    build_replay_cmd "$RESULT_DIR"
+    run_agentic_replay_and_write_outputs "$RESULT_DIR"
+fi
diff --git a/benchmarks/single_node/agentic/gptoss_fp4_mi300x.sh b/benchmarks/single_node/agentic/gptoss_fp4_mi300x.sh
index eb1883ff1..1fc45d47d 100755
--- a/benchmarks/single_node/agentic/gptoss_fp4_mi300x.sh
+++ b/benchmarks/single_node/agentic/gptoss_fp4_mi300x.sh
@@ -83,7 +83,12 @@ echo "Server PID: $SERVER_PID"
 
 wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
 
-# ---- Run benchmark ----------------------------------------------------------
-build_replay_cmd "$RESULT_DIR"
-
-run_agentic_replay_and_write_outputs "$RESULT_DIR"
+if [ "${EVAL_ONLY}" = "true" ]; then
+    # Eval-only: skip the multi-turn agentic replay and run the eval against the
+    # live server. run_eval auto-selects swebench for the agentic-coding scenario.
+    maybe_run_eval "$PORT"
+else
+    # ---- Run benchmark ------------------------------------------------------
+    build_replay_cmd "$RESULT_DIR"
+    run_agentic_replay_and_write_outputs "$RESULT_DIR"
+fi
diff --git a/benchmarks/single_node/agentic/gptoss_fp4_mi325x.sh b/benchmarks/single_node/agentic/gptoss_fp4_mi325x.sh
index 99e29c819..c1bde7465 100755
--- a/benchmarks/single_node/agentic/gptoss_fp4_mi325x.sh
+++ b/benchmarks/single_node/agentic/gptoss_fp4_mi325x.sh
@@ -82,7 +82,12 @@ echo "Server PID: $SERVER_PID"
 
 wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
 
-# ---- Run benchmark ----------------------------------------------------------
-build_replay_cmd "$RESULT_DIR"
-
-run_agentic_replay_and_write_outputs "$RESULT_DIR"
+if [ "${EVAL_ONLY}" = "true" ]; then
+    # Eval-only: skip the multi-turn agentic replay and run the eval against the
+    # live server. run_eval auto-selects swebench for the agentic-coding scenario.
+    maybe_run_eval "$PORT"
+else
+    # ---- Run benchmark ------------------------------------------------------
+    build_replay_cmd "$RESULT_DIR"
+    run_agentic_replay_and_write_outputs "$RESULT_DIR"
+fi
diff --git a/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh b/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh
index ad0b4495a..791bd549c 100755
--- a/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh
+++ b/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh
@@ -202,7 +202,12 @@ echo "Server PID: $SERVER_PID"
 
 wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
 
-# ---- Run benchmark ----------------------------------------------------------
-build_replay_cmd "$RESULT_DIR"
-
-run_agentic_replay_and_write_outputs "$RESULT_DIR"
+if [ "${EVAL_ONLY}" = "true" ]; then
+    # Eval-only: skip the multi-turn agentic replay and run the eval against the
+    # live server. run_eval auto-selects swebench for the agentic-coding scenario.
+    maybe_run_eval "$PORT"
+else
+    # ---- Run benchmark ------------------------------------------------------
+    build_replay_cmd "$RESULT_DIR"
+    run_agentic_replay_and_write_outputs "$RESULT_DIR"
+fi
diff --git a/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh b/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh
index fd0ce3677..4b7e285ee 100755
--- a/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh
+++ b/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh
@@ -808,7 +808,12 @@ echo "Server PID: $SERVER_PID"
 
 wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
 
-# ---- Run benchmark ----------------------------------------------------------
-build_replay_cmd "$RESULT_DIR"
-
-run_agentic_replay_and_write_outputs "$RESULT_DIR"
+if [ "${EVAL_ONLY}" = "true" ]; then
+    # Eval-only: skip the multi-turn agentic replay and run the eval against the
+    # live server. run_eval auto-selects swebench for the agentic-coding scenario.
+    maybe_run_eval "$PORT"
+else
+    # ---- Run benchmark ------------------------------------------------------
+    build_replay_cmd "$RESULT_DIR"
+    run_agentic_replay_and_write_outputs "$RESULT_DIR"
+fi
diff --git a/benchmarks/single_node/agentic/kimik2.5_int4_b200.sh b/benchmarks/single_node/agentic/kimik2.5_int4_b200.sh
index 697d3fa45..f40572563 100755
--- a/benchmarks/single_node/agentic/kimik2.5_int4_b200.sh
+++ b/benchmarks/single_node/agentic/kimik2.5_int4_b200.sh
@@ -61,7 +61,12 @@ echo "Server PID: $SERVER_PID"
 
 wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
 
-# ---- Run benchmark ----------------------------------------------------------
-build_replay_cmd "$RESULT_DIR"
-
-run_agentic_replay_and_write_outputs "$RESULT_DIR"
+if [ "${EVAL_ONLY}" = "true" ]; then
+    # Eval-only: skip the multi-turn agentic replay and run the eval against the
+    # live server. run_eval auto-selects swebench for the agentic-coding scenario.
+    maybe_run_eval "$PORT"
+else
+    # ---- Run benchmark ------------------------------------------------------
+    build_replay_cmd "$RESULT_DIR"
+    run_agentic_replay_and_write_outputs "$RESULT_DIR"
+fi
diff --git a/benchmarks/single_node/agentic/kimik2.5_int4_h100.sh b/benchmarks/single_node/agentic/kimik2.5_int4_h100.sh
index 2fd3b381c..a11462bc2 100755
--- a/benchmarks/single_node/agentic/kimik2.5_int4_h100.sh
+++ b/benchmarks/single_node/agentic/kimik2.5_int4_h100.sh
@@ -62,7 +62,12 @@ echo "Server PID: $SERVER_PID"
 
 wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
 
-# ---- Run benchmark ----------------------------------------------------------
-build_replay_cmd "$RESULT_DIR"
-
-run_agentic_replay_and_write_outputs "$RESULT_DIR"
+if [ "${EVAL_ONLY}" = "true" ]; then
+    # Eval-only: skip the multi-turn agentic replay and run the eval against the
+    # live server. run_eval auto-selects swebench for the agentic-coding scenario.
+    maybe_run_eval "$PORT"
+else
+    # ---- Run benchmark ------------------------------------------------------
+    build_replay_cmd "$RESULT_DIR"
+    run_agentic_replay_and_write_outputs "$RESULT_DIR"
+fi
diff --git a/benchmarks/single_node/agentic/kimik2.5_int4_h200.sh b/benchmarks/single_node/agentic/kimik2.5_int4_h200.sh
index 97929e43e..81fba6cfd 100755
--- a/benchmarks/single_node/agentic/kimik2.5_int4_h200.sh
+++ b/benchmarks/single_node/agentic/kimik2.5_int4_h200.sh
@@ -72,7 +72,12 @@ echo "Server PID: $SERVER_PID"
 
 wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
 
-# ---- Run benchmark ----------------------------------------------------------
-build_replay_cmd "$RESULT_DIR"
-
-run_agentic_replay_and_write_outputs "$RESULT_DIR"
+if [ "${EVAL_ONLY}" = "true" ]; then
+    # Eval-only: skip the multi-turn agentic replay and run the eval against the
+    # live server. run_eval auto-selects swebench for the agentic-coding scenario.
+    maybe_run_eval "$PORT"
+else
+    # ---- Run benchmark ------------------------------------------------------
+    build_replay_cmd "$RESULT_DIR"
+    run_agentic_replay_and_write_outputs "$RESULT_DIR"
+fi
diff --git a/benchmarks/single_node/agentic/qwen3.5_bf16_b200.sh b/benchmarks/single_node/agentic/qwen3.5_bf16_b200.sh
index 4ba87976b..80591bbce 100755
--- a/benchmarks/single_node/agentic/qwen3.5_bf16_b200.sh
+++ b/benchmarks/single_node/agentic/qwen3.5_bf16_b200.sh
@@ -65,7 +65,12 @@ echo "Server PID: $SERVER_PID"
 
 wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
 
-# ---- Run benchmark ----------------------------------------------------------
-build_replay_cmd "$RESULT_DIR"
-
-run_agentic_replay_and_write_outputs "$RESULT_DIR"
+if [ "${EVAL_ONLY}" = "true" ]; then
+    # Eval-only: skip the multi-turn agentic replay and run the eval against the
+    # live server. run_eval auto-selects swebench for the agentic-coding scenario.
+    maybe_run_eval "$PORT"
+else
+    # ---- Run benchmark ------------------------------------------------------
+    build_replay_cmd "$RESULT_DIR"
+    run_agentic_replay_and_write_outputs "$RESULT_DIR"
+fi
diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_b200.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_b200.sh
index 3432af5c9..daee15f20 100755
--- a/benchmarks/single_node/agentic/qwen3.5_fp8_b200.sh
+++ b/benchmarks/single_node/agentic/qwen3.5_fp8_b200.sh
@@ -65,7 +65,12 @@ echo "Server PID: $SERVER_PID"
 
 wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
 
-# ---- Run benchmark ----------------------------------------------------------
-build_replay_cmd "$RESULT_DIR"
-
-run_agentic_replay_and_write_outputs "$RESULT_DIR"
+if [ "${EVAL_ONLY}" = "true" ]; then
+    # Eval-only: skip the multi-turn agentic replay and run the eval against the
+    # live server. run_eval auto-selects swebench for the agentic-coding scenario.
+    maybe_run_eval "$PORT"
+else
+    # ---- Run benchmark ------------------------------------------------------
+    build_replay_cmd "$RESULT_DIR"
+    run_agentic_replay_and_write_outputs "$RESULT_DIR"
+fi
diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_b300_sglang.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_b300_sglang.sh
index 9d9c1d7d5..a49d925dc 100755
--- a/benchmarks/single_node/agentic/qwen3.5_fp8_b300_sglang.sh
+++ b/benchmarks/single_node/agentic/qwen3.5_fp8_b300_sglang.sh
@@ -120,7 +120,12 @@ echo "Server PID: $SERVER_PID"
 
 wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
 
-# ---- Run benchmark ----------------------------------------------------------
-build_replay_cmd "$RESULT_DIR"
-
-run_agentic_replay_and_write_outputs "$RESULT_DIR"
+if [ "${EVAL_ONLY}" = "true" ]; then
+    # Eval-only: skip the multi-turn agentic replay and run the eval against the
+    # live server. run_eval auto-selects swebench for the agentic-coding scenario.
+    maybe_run_eval "$PORT"
+else
+    # ---- Run benchmark ------------------------------------------------------
+    build_replay_cmd "$RESULT_DIR"
+    run_agentic_replay_and_write_outputs "$RESULT_DIR"
+fi
diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_h100.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_h100.sh
index 95f0397a0..2cc3d1526 100755
--- a/benchmarks/single_node/agentic/qwen3.5_fp8_h100.sh
+++ b/benchmarks/single_node/agentic/qwen3.5_fp8_h100.sh
@@ -131,7 +131,12 @@ echo "Server PID: $SERVER_PID"
 
 wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
 
-# ---- Run benchmark ----------------------------------------------------------
-build_replay_cmd "$RESULT_DIR"
-
-run_agentic_replay_and_write_outputs "$RESULT_DIR"
+if [ "${EVAL_ONLY}" = "true" ]; then
+    # Eval-only: skip the multi-turn agentic replay and run the eval against the
+    # live server. run_eval auto-selects swebench for the agentic-coding scenario.
+    maybe_run_eval "$PORT"
+else
+    # ---- Run benchmark ------------------------------------------------------
+    build_replay_cmd "$RESULT_DIR"
+    run_agentic_replay_and_write_outputs "$RESULT_DIR"
+fi
diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh
index aef9650ca..84002aeb8 100755
--- a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh
+++ b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh
@@ -56,7 +56,12 @@ echo "Server PID: $SERVER_PID"
 
 wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
 
-# ---- Run benchmark ----------------------------------------------------------
-build_replay_cmd "$RESULT_DIR"
-
-run_agentic_replay_and_write_outputs "$RESULT_DIR"
+if [ "${EVAL_ONLY}" = "true" ]; then
+    # Eval-only: skip the multi-turn agentic replay and run the eval against the
+    # live server. run_eval auto-selects swebench for the agentic-coding scenario.
+    maybe_run_eval "$PORT"
+else
+    # ---- Run benchmark ------------------------------------------------------
+    build_replay_cmd "$RESULT_DIR"
+    run_agentic_replay_and_write_outputs "$RESULT_DIR"
+fi
diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh
index 5427d0d31..e25afa775 100755
--- a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh
+++ b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh
@@ -136,7 +136,12 @@ echo "Server PID: $SERVER_PID"
 
 wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
 
-# ---- Run benchmark ----------------------------------------------------------
-build_replay_cmd "$RESULT_DIR"
-
-run_agentic_replay_and_write_outputs "$RESULT_DIR"
+if [ "${EVAL_ONLY}" = "true" ]; then
+    # Eval-only: skip the multi-turn agentic replay and run the eval against the
+    # live server. run_eval auto-selects swebench for the agentic-coding scenario.
+    maybe_run_eval "$PORT"
+else
+    # ---- Run benchmark ------------------------------------------------------
+    build_replay_cmd "$RESULT_DIR"
+    run_agentic_replay_and_write_outputs "$RESULT_DIR"
+fi