From a878654a23dfa8e49ef792c55126a30c0242baa2 Mon Sep 17 00:00:00 2001 From: adibarra <93070681+adibarra@users.noreply.github.com> Date: Thu, 2 Jul 2026 12:12:11 -0500 Subject: [PATCH 1/4] feat(evals): add SWE-bench Lite eval (lm-eval generation + swebench harness scoring, Modal-capable) Add a SWE-bench Lite accuracy eval that generates patches via the lm-eval harness and scores them with the official swebench evaluation harness. - utils/evals/swebench_lite.yaml: lm-eval task config for SWE-bench Lite generation (prompt/doc-to-text, generation kwargs, dataset wiring). - utils/evals/swebench_score.py: post-processing + scoring. Extracts model patches from lm-eval output, feeds them to the swebench harness, and emits a "resolved" rate. Supports running the harness locally or on Modal via SWEBENCH_USE_MODAL (Modal pass-through so scoring can run off-box). - utils/collect_eval_results.py: extract_lm_metrics learns a "resolved" filter branch so the swebench resolved metric is collected alongside the existing lm-eval metrics. - utils/evals/thresholds.json: add the swebench_lite threshold entry. - utils/evals/EVALS.md: document the SWE-bench Lite eval and how scoring works. - benchmarks/benchmark_lib.sh: add run_swebench_eval, _install_swebench_deps, maybe_run_eval, and Modal pass-through. run_eval now picks a per-scenario default framework (agentic-coding -> swebench, fixed-seq-len -> lm-eval); an explicit EVAL_FRAMEWORK env var or --framework arg overrides the default. EVAL_TASKS_DIR selects the task yaml. - utils/evals/test_swebench_eval.py, utils/evals/test_run_eval_dispatch.py: tests for the scorer and the scenario/framework dispatch precedence. 
--- benchmarks/benchmark_lib.sh | 141 ++++++++- utils/collect_eval_results.py | 4 +- utils/evals/EVALS.md | 27 ++ utils/evals/swebench_lite.yaml | 56 ++++ utils/evals/swebench_score.py | 397 ++++++++++++++++++++++++++ utils/evals/test_run_eval_dispatch.py | 67 +++++ utils/evals/test_swebench_eval.py | 235 +++++++++++++++ utils/evals/thresholds.json | 3 +- 8 files changed, 926 insertions(+), 4 deletions(-) create mode 100644 utils/evals/swebench_lite.yaml create mode 100644 utils/evals/swebench_score.py create mode 100644 utils/evals/test_run_eval_dispatch.py create mode 100644 utils/evals/test_swebench_eval.py diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 95e063a3d..1e59aa1b4 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -966,21 +966,157 @@ META echo "Moved eval artifacts to: $(pwd)" } +# ------------------------------ +# SWE-bench eval helpers +# ------------------------------ + +# Run the SWE-bench Lite eval: generate patches with lm-eval, then score them +# with the official swebench Docker harness. lm-eval cannot score SWE-bench +# itself (no repo-level test executor), so we reuse it only for generation and +# emit an lm-eval-shaped results JSON from swebench_score.py so the rest of the +# pipeline (append_lm_eval_summary / collect / validate) is unchanged. 
#
# Env knobs:
#   SWEBENCH_TASK_NAME   (default swebench_lite) selects utils/evals/<task_name>.yaml
#                        (only when EVAL_TASKS_DIR is not already set)
#   SWEBENCH_DATASET     optional; must equal the YAML's dataset_path (the
#                        scoring dataset is derived from the YAML so generation
#                        and scoring never diverge) -- mismatch fails fast
#   SWEBENCH_MAX_WORKERS (default 4) harness workers / Modal parallelism
#   SWEBENCH_USE_MODAL   "true" => score on Modal remote sandboxes instead of
#                        local Docker (no Docker needed on the node; requires a
#                        Modal account + ~/.modal.toml or MODAL_TOKEN_* creds).
#                        Any other value (including "false") means local Docker.
#   SWEBENCH_NAMESPACE   local-Docker only: pass "" on arm/Mac to build locally
#   SWEBENCH_SKIP_SCORE  "true" => generate + stage predictions only, no scoring
#                        (score elsewhere)

# Install the scoring dependencies (swebench, plus modal when Modal scoring is
# requested). Best-effort (mirrors _install_lm_eval_deps); a real failure
# surfaces at scoring.
_install_swebench_deps() {
    python3 -m pip install -q --no-cache-dir --break-system-packages swebench || true
    if [ "${SWEBENCH_USE_MODAL:-false}" = "true" ]; then
        python3 -m pip install -q --no-cache-dir --break-system-packages modal || true
    fi
}

# Run the configured eval and stage its artifacts when RUN_EVAL is enabled.
# run_eval auto-selects the framework by scenario (agentic -> swebench,
# fixed-seq-len -> lm-eval), so recipes call this without naming a framework.
# Arguments: $1 - server port (default: $PORT, else 8888)
maybe_run_eval() {
    local port="${1:-${PORT:-8888}}"
    # Default RUN_EVAL so an unset variable is a no-op instead of an
    # "unbound variable" abort in callers running under `set -u`.
    if [ "${RUN_EVAL:-false}" = "true" ]; then
        run_eval --port "$port"
        append_lm_eval_summary
    fi
}

# Generate SWE-bench patches with lm-eval, then score them with the swebench
# harness via utils/evals/swebench_score.py.
# Globals:   EVAL_RESULT_DIR (read, then exported), EVAL_TASKS_DIR (temporarily
#            overridden, restored afterwards), MODEL_NAME / MODEL (read),
#            SWEBENCH_* knobs listed above (read),
#            INFERENCEX_SWEBENCH_RUNTIME_READY (read, exported after install)
# Arguments: forwarded verbatim to run_lm_eval (e.g. --port N)
# Returns:   0 on success; the failing step's exit status otherwise
run_swebench_eval() {
    local out_dir="${EVAL_RESULT_DIR:-$(mktemp -d /tmp/eval_out-XXXXXX)}"
    local task_name="${SWEBENCH_TASK_NAME:-swebench_lite}"
    local gen_dir
    gen_dir=$(mktemp -d /tmp/swebench_gen-XXXXXX) || {
        echo "ERROR: could not create a temp dir for swebench generations" >&2
        return 1
    }

    # Keep the scoring dataset in lockstep with the generation YAML: the harness
    # must score against the same instance set lm-eval generated patches for, or
    # the instance IDs won't match. Derive it from the task YAML; if
    # SWEBENCH_DATASET is set it must agree (fail-fast rather than mis-score).
    local yaml_path="${EVAL_TASKS_DIR:-utils/evals/${task_name}.yaml}"
    local dataset
    dataset=$(awk '/^dataset_path:[[:space:]]/{print $2; exit}' "$yaml_path" 2>/dev/null)
    # YAML allows the scalar to be quoted; drop one matching pair of quotes so
    # the value equals what a YAML parser would hand to lm-eval.
    case "$dataset" in
        \"*\") dataset=${dataset#\"}; dataset=${dataset%\"} ;;
        \'*\') dataset=${dataset#\'}; dataset=${dataset%\'} ;;
    esac
    if [ -z "$dataset" ]; then
        echo "ERROR: could not read dataset_path from ${yaml_path}" >&2
        rm -rf "$gen_dir" 2>/dev/null || true
        return 1
    fi
    if [ -n "${SWEBENCH_DATASET:-}" ] && [ "${SWEBENCH_DATASET}" != "$dataset" ]; then
        echo "ERROR: SWEBENCH_DATASET='${SWEBENCH_DATASET}' disagrees with ${yaml_path} dataset_path='${dataset}'." >&2
        echo "       Generation and scoring must use the same dataset; edit the YAML or unset SWEBENCH_DATASET." >&2
        rm -rf "$gen_dir" 2>/dev/null || true
        return 1
    fi

    # 1. Generation via lm-eval (reuses endpoint wiring, _patch_lm_eval, etc.).
    #    run_lm_eval already passes --log_samples, which is what we consume.
    #    Remember whether EVAL_TASKS_DIR was set at all so it can be restored
    #    exactly (unset stays unset instead of becoming an exported "").
    local had_tasks_dir="${EVAL_TASKS_DIR+set}"
    local prev_tasks_dir="${EVAL_TASKS_DIR:-}"
    export EVAL_TASKS_DIR="$yaml_path"
    local gen_rc=0
    run_lm_eval "$@" --results-dir "$gen_dir" || gen_rc=$?
    if [ -n "$had_tasks_dir" ]; then
        export EVAL_TASKS_DIR="$prev_tasks_dir"
    else
        unset EVAL_TASKS_DIR
    fi
    if [ "$gen_rc" -ne 0 ]; then
        echo "ERROR: swebench generation (lm-eval) failed with $gen_rc" >&2
        rm -rf "$gen_dir" 2>/dev/null || true
        return "$gen_rc"
    fi

    # Preserve generations as artifacts alongside the scored results.
    mkdir -p "$out_dir"
    find "$gen_dir" -name 'samples_*.jsonl' -exec cp -f {} "$out_dir"/ \; 2>/dev/null || true
    export EVAL_RESULT_DIR="$out_dir"

    local lm_eval_version
    lm_eval_version=$(python3 -c 'import lm_eval; print(lm_eval.__version__)' 2>/dev/null || echo unknown)

    if [ "${SWEBENCH_SKIP_SCORE:-false}" = "true" ]; then
        # Generation-only mode: emit predictions, defer Docker scoring elsewhere.
        # TODO(alec): wire the separate scoring job (Modal / sb-cli / CPU runner).
        local skip_rc=0
        python3 utils/evals/swebench_score.py \
            --samples-dir "$gen_dir" --out-dir "$out_dir" \
            --model-name "${MODEL_NAME:-${MODEL:-}}" --task-name "$task_name" \
            --predictions-only || skip_rc=$?
        echo "SWEBENCH_SKIP_SCORE=true: staged predictions only (no resolved-rate)." >&2
        rm -rf "$gen_dir" 2>/dev/null || true
        return "$skip_rc"
    fi

    # 2. Score with the official swebench harness (local Docker, or Modal remote
    #    sandboxes when SWEBENCH_USE_MODAL=true) and emit the lm-eval-shaped JSON.
    if [ "${INFERENCEX_SWEBENCH_RUNTIME_READY:-false}" != "true" ]; then
        _install_swebench_deps
        export INFERENCEX_SWEBENCH_RUNTIME_READY=true
    fi

    # Build the argv as an array so optional flags stay correctly quoted.
    local -a score_args=(
        --samples-dir "$gen_dir"
        --out-dir "$out_dir"
        --model-name "${MODEL_NAME:-${MODEL:-}}"
        --task-name "$task_name"
        --dataset-name "$dataset"
        --max-workers "${SWEBENCH_MAX_WORKERS:-4}"
        --lm-eval-version "$lm_eval_version"
    )
    # Only the literal "true" enables Modal, matching _install_swebench_deps;
    # SWEBENCH_USE_MODAL=false must not silently switch scoring to Modal.
    if [ "${SWEBENCH_USE_MODAL:-false}" = "true" ]; then
        score_args+=(--modal)
    fi
    # Forward the namespace whenever the variable is set, even to "", because
    # an empty namespace is the documented way to force local image builds.
    if [ -n "${SWEBENCH_NAMESPACE+set}" ]; then
        score_args+=(--namespace "$SWEBENCH_NAMESPACE")
    fi

    local score_rc=0
    python3 utils/evals/swebench_score.py "${score_args[@]}" || score_rc=$?
    rm -rf "$gen_dir" 2>/dev/null || true
    if [ "$score_rc" -ne 0 ]; then
        echo "ERROR: swebench scoring failed with $score_rc" >&2
        return "$score_rc"
    fi
}
+ local scenario_default="lm-eval" + if [ "${IS_AGENTIC:-0}" = "1" ] || [ "${SCENARIO_TYPE:-}" = "agentic-coding" ]; then + scenario_default="swebench" + fi + local framework="${EVAL_FRAMEWORK:-${cli_framework:-$scenario_default}}" + # Compute EVAL_MAX_MODEL_LEN if not already set by the calling script if [ -z "${EVAL_MAX_MODEL_LEN:-}" ]; then compute_eval_context_length "$MODEL" "${MAX_MODEL_LEN:-0}" > /dev/null @@ -1052,6 +1188,7 @@ run_eval() { local eval_rc=0 case "$framework" in lm-eval|lm_eval) run_lm_eval "${forwarded[@]}" || eval_rc=$? ;; + swebench) run_swebench_eval "${forwarded[@]}" || eval_rc=$? ;; *) echo "Unknown framework '${framework}'"; eval_rc=1 ;; esac diff --git a/utils/collect_eval_results.py b/utils/collect_eval_results.py index 333146566..0da10b71e 100644 --- a/utils/collect_eval_results.py +++ b/utils/collect_eval_results.py @@ -165,7 +165,9 @@ def get_val_se(filter_name: str) -> Tuple[Optional[float], Optional[float]]: # Extract metrics for each filter for f in filter_list: fname = f['name'] - if 'strict' in fname: + # 'resolved' is SWE-bench's resolved-rate (swebench_score.py); + # treat it as the primary/strict score so it populates `score`. + if 'strict' in fname or 'resolved' in fname: strict_val, strict_se = get_val_se(fname) elif 'flex' in fname or 'extract' in fname: flex_val, flex_se = get_val_se(fname) diff --git a/utils/evals/EVALS.md b/utils/evals/EVALS.md index 7ff878dce..a7738defc 100644 --- a/utils/evals/EVALS.md +++ b/utils/evals/EVALS.md @@ -169,7 +169,34 @@ The codebase patches lm-eval compatibility via `_patch_lm_eval`: 1. Reasoning token handling: extracts `reasoning_content` when `message.content` is empty. 2. TRT compatibility: avoids injecting `{"type": "text"}` for non-HF tokenizers. +### SWE-bench Lite (`--framework swebench`) + +SWE-bench is **not** a `generate_until` QA task — it requires applying the model's +patch to a repo and running tests in Docker, which lm-eval cannot do. 
So it runs +through a dedicated framework that reuses lm-eval for *generation* only, then scores +with the official `swebench` harness and emits an lm-eval-shaped results JSON +(metric `exact_match,resolved` = resolved-rate) so collect/validate work unchanged. + +```bash +run_eval --framework swebench --port "$PORT" # generation (lm-eval) -> scoring (swebench) +append_lm_eval_summary +``` + +- Task: `utils/evals/swebench_lite.yaml` (generation) — SWE-bench Lite, the ~300-instance curated + quick-eval subset (no difficulty filter needed; Lite is already the lightweight set). +- Scoring: `utils/evals/swebench_score.py` (diff extraction → `predictions.jsonl` → + `python -m swebench.harness.run_evaluation` → resolved-rate → results JSON). Offline + `--report` mode skips Docker for testing. +- Knobs: `SWEBENCH_TASK_NAME` (selects the YAML), `SWEBENCH_MAX_WORKERS`, `SWEBENCH_USE_MODAL=true` (score on Modal remote sandboxes instead of local Docker), + `SWEBENCH_NAMESPACE` (local Docker only; pass `""` on arm/Mac), `SWEBENCH_SKIP_SCORE=true` (generate-only). The + scoring dataset is derived from the YAML's `dataset_path` so generation and scoring can't diverge; + `SWEBENCH_DATASET`, if set, must match it (mismatch fails fast). +- **Local-Docker scoring requires Docker + ~120 GB disk on the scoring host**; with `SWEBENCH_USE_MODAL=true` no Docker is needed on the node, but Modal credentials are. This is an MVP; the single-shot prompt and + diff extraction still need tuning to reach published resolved-rates, and the `thresholds.json` entry + needs calibration from a baseline run. + ## Task files The following files are task definitions from lm-eval; more information on changes lives within the files: - `utils/evals/gsm8k.yaml` - `utils/evals/gpqa_diamond.yaml` +- `utils/evals/swebench_lite.yaml` (generation only; scored by `swebench_score.py`) diff --git a/utils/evals/swebench_lite.yaml b/utils/evals/swebench_lite.yaml new file mode 100644 index 000000000..4633af462 --- /dev/null +++ b/utils/evals/swebench_lite.yaml @@ -0,0 +1,56 @@ +# SWE-bench Lite -- GENERATION ONLY. 
+# +# Lite is the ~300-instance curated subset for quick evals (no difficulty labels; +# it's already the lightweight set, so no filtering is needed -- unlike Verified, +# which carries a `difficulty` field). +# +# lm-eval is used purely to drive the served OpenAI-compatible endpoint and dump +# one candidate patch per instance via --log_samples. The metric below is a +# PLACEHOLDER that lm-eval computes but we ignore: the real resolved-rate comes +# from utils/evals/swebench_score.py running the official `swebench` harness, +# which then emits an lm-eval-shaped results JSON for collect/validate. +# +# Run it through the dedicated framework, not bare lm-eval: +# run_eval --framework swebench --port "$PORT" +# which wires generation -> scoring. Bare `--tasks swebench_lite.yaml` would +# produce only the meaningless placeholder metric. +task: swebench_lite +dataset_path: princeton-nlp/SWE-bench_Lite # also mirrored at SWE-bench/SWE-bench_Lite +output_type: generate_until +test_split: test + +doc_to_text: | + You are an expert software engineer fixing a real GitHub issue in the + repository `{{repo}}` at commit {{base_commit}}. + + + {{problem_statement}} + + + Respond with ONLY a unified diff (a git patch) that resolves the issue, using + real repository file paths. Do not include explanations. Wrap the patch in a + single fenced block exactly like: + + ```diff + diff --git a/path/to/file.py b/path/to/file.py + --- a/path/to/file.py + +++ b/path/to/file.py + @@ ... @@ + ``` +# The gold patch is the nominal target. lm-eval's exact_match against it is +# meaningless for patches (overwritten by the harness score); it only exists so +# generate_until has a target + a metric and does not error. 
+doc_to_target: "{{patch}}" + +generation_kwargs: + until: [] + do_sample: false + temperature: 0.0 + +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + +metadata: + version: 0.1 diff --git a/utils/evals/swebench_score.py b/utils/evals/swebench_score.py new file mode 100644 index 000000000..813a0e811 --- /dev/null +++ b/utils/evals/swebench_score.py @@ -0,0 +1,397 @@ +#!/usr/bin/env python3 +"""Score SWE-bench patches generated by lm-eval and emit an lm-eval-shaped result. + +Pipeline: + + 1. Read lm-eval ``--log_samples`` output (samples_*.jsonl): one candidate per + SWE-bench instance. + 2. Extract a unified diff from each model generation. + 3. Write a ``predictions.jsonl`` in the format the official ``swebench`` harness + expects: ``{instance_id, model_name_or_path, model_patch}``. + 4. Run ``python -m swebench.harness.run_evaluation`` (Docker) to get the + resolved-rate -- unless ``--no-run``/``--report`` is given (offline/testing). + 5. Emit a results JSON shaped like an lm-eval result so the existing + ``collect_eval_results.py`` / ``validate_scores.py`` ingest it unchanged. + The metric is published as ``exact_match,resolved`` = resolved-rate. + +The harness needs Docker + lots of disk and is NOT runnable on this dev Mac, so +the Docker step is isolated behind ``--no-run`` for local testing. TODO(alec): +exercise the real ``--run`` path on a runner. +""" + +import argparse +import json +import math +import re +import subprocess +import sys +from pathlib import Path +from typing import Iterator, Optional + +DEFAULT_DATASET = "princeton-nlp/SWE-bench_Lite" +DEFAULT_TASK = "swebench_lite" + +# A unified diff, optionally inside a ```diff / ```patch fence. We try fenced +# first (what the prompt asks for), then a bare ``diff --git`` slice. 
_FENCED_DIFF_RE = re.compile(
    # The named group ``body`` is what extract_patch reads via group("body").
    r"```(?:diff|patch)?\s*\n(?P<body>.*?)```",
    re.DOTALL | re.IGNORECASE,
)
_DIFF_GIT_RE = re.compile(r"(?:^|\n)(diff --git .*)", re.DOTALL)

# Line prefixes that belong to a (git) unified-diff body. Anything else marks
# the end of the patch.
_DIFF_LINE_PREFIXES = (
    "diff ", "index ", "--- ", "+++ ", "@@", "+", "-", " ", "\\",
    "old mode ", "new mode ", "new file mode ", "deleted file mode ",
    "rename ", "copy ", "similarity ", "dissimilarity ",
    "Binary files ", "GIT binary patch",
)


def _trim_to_diff_body(text: str) -> str:
    """Keep only the leading run of diff-shaped lines, dropping trailing prose.

    Models frequently emit a bare patch followed by an explanation ("Notes:",
    "This fixes #123."). With no terminator that tail gets glued onto the patch
    and rejected by ``git apply``, scoring the instance unresolved. Blank lines
    are kept only when the diff resumes after them; a blank line followed by
    non-diff text ends the patch.

    Args:
        text: candidate patch text, expected to start at the diff.

    Returns:
        The leading diff-shaped lines joined with ``"\\n"`` (no trailing newline).
    """
    lines = text.splitlines()
    out: list[str] = []
    i, n = 0, len(lines)
    while i < n:
        if lines[i].startswith(_DIFF_LINE_PREFIXES):
            out.append(lines[i])
            i += 1
            continue
        if lines[i] == "":
            # Look past the run of blank lines to see whether the diff resumes.
            j = i
            while j < n and lines[j] == "":
                j += 1
            if j < n and lines[j].startswith(_DIFF_LINE_PREFIXES):
                out.extend(lines[i:j])  # interior blank line(s); diff resumes
                i = j
                continue
        break  # trailing blank(s)+prose, or any other non-diff line
    return "\n".join(out)


def extract_patch(text: str) -> str:
    """Pull a unified diff out of a model generation.

    Best-effort and deliberately conservative -- a wrong extraction just means
    that instance is unresolved, never a crash. Diff-extraction quality is a
    primary tuning lever (TODO(alec)): bad fences here directly suppress the
    resolved-rate.

    Args:
        text: raw model output (may be empty).

    Returns:
        The extracted patch terminated by exactly one newline, or ``""`` when
        ``text`` is empty/blank. Text without any diff markers is returned
        as-is (newline-terminated) so the harness rejects it.
    """
    if not text:
        return ""

    def _finish(body: str) -> str:
        body = _trim_to_diff_body(body).strip("\n")
        return body + "\n" if body else ""

    # 1. Prefer a fenced block that actually looks like a diff.
    for match in _FENCED_DIFF_RE.finditer(text):
        body = match.group("body")
        if "diff --git" in body or body.lstrip().startswith(("--- ", "+++ ")):
            return _finish(body)
    # 2. Fall back to a bare ``diff --git``, trimmed to the diff body so
    #    trailing prose can't corrupt the patch.
    git_match = _DIFF_GIT_RE.search(text)
    if git_match:
        trimmed = _finish(git_match.group(1))
        if trimmed:
            return trimmed
    # 3. Last resort: a lone fenced block (fence-bounded), or the raw text.
    lone = _FENCED_DIFF_RE.search(text)
    if lone:
        body = lone.group("body").strip("\n")
        return body + "\n" if body else ""
    return text.strip("\n") + "\n" if text.strip() else ""


def _response_text(record: dict) -> str:
    """Extract the model's text from one lm-eval sample record.

    lm-eval's sample schema has drifted across versions; be tolerant.
    TODO(alec): confirm against the pinned harness's real samples_*.jsonl.

    Returns the first non-blank string found by unwrapping ``filtered_resps``
    and then ``resps`` (each may be arbitrarily nested lists), else ``""``.
    """
    for key in ("filtered_resps", "resps"):
        val = record.get(key)
        # Unwrap [[...]] nesting down to the first leaf.
        while isinstance(val, (list, tuple)) and val:
            val = val[0]
        if isinstance(val, str) and val.strip():
            return val
    return ""


def _instance_id(record: dict) -> Optional[str]:
    """Return the SWE-bench instance id of a sample record, or ``None``."""
    doc = record.get("doc")
    if isinstance(doc, dict):
        for key in ("instance_id", "instance", "id"):
            val = doc.get(key)
            if isinstance(val, str) and val:
                return val
    # Some versions hoist doc fields to the top level.
    val = record.get("instance_id")
    return val if isinstance(val, str) and val else None


def iter_samples(samples_dir: Path) -> Iterator[dict]:
    """Yield JSON records from every samples_*.jsonl under ``samples_dir``.

    Raises:
        FileNotFoundError: no ``samples_*.jsonl`` exists under ``samples_dir``.
    """
    files = sorted(samples_dir.rglob("samples_*.jsonl"))
    if not files:
        raise FileNotFoundError(
            f"no samples_*.jsonl found under {samples_dir} -- did lm-eval run "
            "with --log_samples?"
        )
    for path in files:
        # Explicit UTF-8: generations may contain non-ASCII text and must not
        # depend on the host locale.
        with path.open(encoding="utf-8") as fh:
            for line in fh:
                line = line.strip()
                if line:
                    yield json.loads(line)


def build_predictions(samples_dir: Path, model_name: str) -> list[dict]:
    """Turn lm-eval samples into swebench prediction rows (dedup by instance).

    Raises:
        FileNotFoundError: propagated from iter_samples when no samples exist.
        ValueError: samples exist but none carries an instance id.
    """
    by_instance: dict[str, dict] = {}
    skipped = 0
    for record in iter_samples(samples_dir):
        instance_id = _instance_id(record)
        if not instance_id:
            skipped += 1
            continue
        patch = extract_patch(_response_text(record))
        # Last write wins; SWE-bench is single-attempt so there should be one
        # record per instance anyway.
        by_instance[instance_id] = {
            "instance_id": instance_id,
            "model_name_or_path": model_name,
            "model_patch": patch,
        }
    if skipped:
        print(f"WARN: skipped {skipped} sample(s) with no instance_id", file=sys.stderr)
    if not by_instance:
        raise ValueError("no usable predictions extracted from samples")
    return list(by_instance.values())


def write_predictions(predictions: list[dict], out_path: Path) -> None:
    """Write one JSON object per line (the harness's predictions format)."""
    with out_path.open("w", encoding="utf-8") as fh:
        for row in predictions:
            fh.write(json.dumps(row) + "\n")


def run_harness(
    predictions_path: Path,
    dataset_name: str,
    run_id: str,
    work_dir: Path,
    max_workers: int,
    namespace: Optional[str],
    modal: bool = False,
) -> None:
    """Invoke the official swebench harness (local Docker, or Modal sandboxes).

    Args:
        predictions_path: predictions.jsonl to score.
        dataset_name: dataset passed to ``--dataset_name``.
        run_id: harness run id.
        work_dir: directory the harness runs in (its reports land here).
        max_workers: ``--max_workers`` (Docker) or ``--parallelism`` (Modal).
        namespace: ``--namespace`` value for local Docker; ``None`` omits the
            flag, ``""`` is forwarded as an empty value.
        modal: score on Modal instead of local Docker.

    Raises:
        subprocess.CalledProcessError: the harness exited non-zero.
    """
    cmd = [
        sys.executable, "-m", "swebench.harness.run_evaluation",
        "--dataset_name", dataset_name,
        # The child runs with cwd=work_dir, so a relative path would be
        # re-resolved against work_dir and miss the file; pass it absolute.
        "--predictions_path", str(Path(predictions_path).resolve()),
        "--run_id", run_id,
    ]
    if modal:
        # Modal remote sandboxes instead of local Docker (no Docker on the node).
        # --parallelism replaces --max_workers; --namespace is local-Docker only.
        cmd += ["--modal", "true", "--parallelism", str(max_workers)]
    else:
        cmd += ["--max_workers", str(max_workers)]
        if namespace is not None:
            # On arm/Mac (and to force local image builds) pass --namespace ''.
            cmd += ["--namespace", namespace]
    print(f"[swebench] running: {' '.join(cmd)}", flush=True)
    subprocess.run(cmd, cwd=str(work_dir), check=True)


def find_report(work_dir: Path, model_name: str, run_id: str) -> Path:
    """Locate the harness report JSON, tolerant to known layout variants.

    Raises:
        FileNotFoundError: no candidate or fallback report exists under
            ``work_dir``.
    """
    sanitized = model_name.replace("/", "__")
    candidates = [
        work_dir / f"{sanitized}.{run_id}.json",  # classic: <model>.<run_id>.json
        work_dir / f"{model_name}.{run_id}.json",
        work_dir / "evaluation_results" / "results.json",  # newer layout
    ]
    for path in candidates:
        if path.exists():
            return path
    # Broad fallback: any *.json mentioning resolved/total at the top level.
    for path in sorted(work_dir.rglob("*.json")):
        try:
            data = json.loads(path.read_text())
        except (json.JSONDecodeError, OSError):
            continue
        if isinstance(data, dict) and (
            "resolved_instances" in data or "resolved_ids" in data
        ):
            return path
    raise FileNotFoundError(
        f"could not locate a swebench report under {work_dir} "
        f"(looked for {[str(c) for c in candidates]})"
    )


def parse_resolved(report: dict) -> tuple[int, int]:
    """Return (resolved, total) from a harness report, tolerant to key variants.

    Denominator is the full instance count (leaderboard convention:
    resolved / total), not just completed instances. When the full count is
    absent, the submitted count is preferred over the completed count so that
    errored/empty predictions still count against the score.

    Raises:
        ValueError: neither a resolved count nor a positive total can be found.
    """
    resolved: Optional[int] = None
    for key in ("resolved_instances", "resolved", "num_resolved"):
        if isinstance(report.get(key), int):
            resolved = report[key]
            break
    if resolved is None and isinstance(report.get("resolved_ids"), list):
        resolved = len(report["resolved_ids"])

    total: Optional[int] = None
    for key in ("total_instances", "submitted_instances", "completed_instances"):
        val = report.get(key)
        if isinstance(val, int) and val > 0:
            total = val
            break
    if total is None:
        for key in ("submitted_ids", "completed_ids"):
            if isinstance(report.get(key), list) and report[key]:
                total = len(report[key])
                break

    if resolved is None or total is None or total <= 0:
        raise ValueError(
            f"could not parse resolved/total from report keys {sorted(report)}"
        )
    return resolved, total


def build_results_json(
    task: str,
    resolved: int,
    total: int,
    model_name: str,
    lm_eval_version: str,
    report: Optional[dict],
) -> dict:
    """Shape the resolved-rate as an lm-eval result.

    Published as ``exact_match,resolved`` so validate_scores (prefix
    ``exact_match,``) gates it and collect_eval_results surfaces it as ``score``.
    A ``total`` of 0 yields a rate and stderr of 0.0 rather than raising.
    """
    # Guard the division itself; previously only the stderr was guarded, so
    # total == 0 raised ZeroDivisionError before the guard was reached.
    rate = resolved / total if total else 0.0
    stderr = math.sqrt(rate * (1.0 - rate) / total) if total else 0.0
    return {
        "lm_eval_version": lm_eval_version,
        "model_name": model_name,
        "results": {
            task: {
                "alias": task,
                "exact_match,resolved": rate,
                "exact_match_stderr,resolved": stderr,
            }
        },
        "configs": {
            task: {
                "metric_list": [{"metric": "exact_match"}],
                "filter_list": [{"name": "resolved"}],
            }
        },
        "n-samples": {task: {"effective": total, "original": total}},
        # Debugging passthrough; ignored by collectors (no lm_eval_version here).
        "swebench": {
            "resolved": resolved,
            "total": total,
            "resolved_rate": rate,
            "report": report,
        },
    }


def main(argv: Optional[list[str]] = None) -> int:
    """CLI entry point: samples -> predictions.jsonl -> score -> results JSON.

    Returns 0 on success and 1 when ``--no-run`` is given without ``--report``.
    Failures in extraction, the harness, or report parsing propagate as
    exceptions.
    """
    parser = argparse.ArgumentParser(description="Score SWE-bench patches from lm-eval samples")
    parser.add_argument("--samples-dir", required=True, help="dir containing lm-eval samples_*.jsonl")
    parser.add_argument("--out-dir", required=True, help="dir to write predictions + results JSON")
    parser.add_argument("--model-name", required=True, help="served model name (model_name_or_path)")
    parser.add_argument("--dataset-name", default=DEFAULT_DATASET)
    parser.add_argument("--task-name", default=DEFAULT_TASK)
    parser.add_argument("--run-id", default=None, help="harness run id (default: task name)")
    parser.add_argument("--max-workers", type=int, default=4)
    parser.add_argument(
        "--namespace", default=None,
        help="local-Docker --namespace value (pass '' on arm/Mac to build images locally)",
    )
    parser.add_argument(
        "--modal", action="store_true",
        help="score on Modal remote sandboxes instead of local Docker (needs modal creds)",
    )
    parser.add_argument("--lm-eval-version", default="unknown")
    parser.add_argument(
        "--predictions-only", action="store_true",
        help="write predictions.jsonl and stop (no scoring; score elsewhere)",
    )
    parser.add_argument(
        "--no-run", action="store_true",
        help="skip the Docker harness; requires --report (offline/testing)",
    )
    parser.add_argument(
        "--report", default=None,
        help="path to a pre-computed harness report JSON (implies --no-run)",
    )
    args = parser.parse_args(argv)

    samples_dir = Path(args.samples_dir)
    out_dir = Path(args.out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    run_id = args.run_id or args.task_name

    # 1-3. samples -> predictions.jsonl
    predictions = build_predictions(samples_dir, args.model_name)
    predictions_path = out_dir / "predictions.jsonl"
    write_predictions(predictions, predictions_path)
    print(f"[swebench] wrote {len(predictions)} predictions -> {predictions_path}")

    if args.predictions_only:
        print("[swebench] predictions-only: skipping scoring (score elsewhere)")
        return 0

    # 4. score (Docker) or load an existing report
    if args.report:
        report = json.loads(Path(args.report).read_text())
    elif args.no_run:
        print("ERROR: --no-run requires --report", file=sys.stderr)
        return 1
    else:
        run_harness(
            predictions_path, args.dataset_name, run_id,
            out_dir, args.max_workers, args.namespace, modal=args.modal,
        )
        report = json.loads(find_report(out_dir, args.model_name, run_id).read_text())

    resolved, total = parse_resolved(report)

    # 5. emit lm-eval-shaped results
    results = build_results_json(
        args.task_name, resolved, total, args.model_name,
        args.lm_eval_version, report,
    )
    results_path = out_dir / f"results_{args.task_name}.json"
    results_path.write_text(json.dumps(results, indent=2))
    print(
        f"[swebench] {args.task_name}: resolved {resolved}/{total} "
        f"= {resolved / total:.4f} -> {results_path}"
    )
    return 0


if __name__ == "__main__":
    sys.exit(main())
+""" + +import os +import subprocess +from pathlib import Path + +BENCHMARK_LIB = Path(__file__).resolve().parents[2] / "benchmarks" / "benchmark_lib.sh" + +# Stub the framework runners so dispatch is observable without a server/Docker, +# and pin EVAL_MAX_MODEL_LEN so run_eval skips context computation. CLI_FW is +# only forwarded as --framework when set (so we can test the no-arg path). +_SCRIPT = r''' +source "$BENCHMARK_LIB" +run_lm_eval() { echo "DISPATCH=lm-eval"; } +run_swebench_eval() { echo "DISPATCH=swebench"; } +export EVAL_MAX_MODEL_LEN=16384 +unset EVAL_CONCURRENT_REQUESTS +run_eval ${CLI_FW:+--framework "$CLI_FW"} --port 8888 +''' + + +def _dispatch(*, is_agentic: str = "0", cli_fw=None, env_fw=None) -> str: + env = {**os.environ, "BENCHMARK_LIB": str(BENCHMARK_LIB), "IS_AGENTIC": is_agentic} + env.pop("EVAL_FRAMEWORK", None) + env.pop("CLI_FW", None) + if cli_fw is not None: + env["CLI_FW"] = cli_fw + if env_fw is not None: + env["EVAL_FRAMEWORK"] = env_fw + res = subprocess.run( + ["bash", "-c", _SCRIPT], env=env, text=True, capture_output=True, check=True + ) + return res.stdout + + +# --- scenario default ------------------------------------------------------ + +def test_agentic_scenario_defaults_to_swebench(): + assert "DISPATCH=swebench" in _dispatch(is_agentic="1") + + +def test_fixed_seqlen_scenario_defaults_to_lm_eval(): + assert "DISPATCH=lm-eval" in _dispatch(is_agentic="0") + + +# --- explicit overrides win over the scenario default ---------------------- + +def test_explicit_framework_arg_overrides_scenario(): + # agentic, but recipe passed --framework lm-eval -> lm-eval. 
+ assert "DISPATCH=lm-eval" in _dispatch(is_agentic="1", cli_fw="lm-eval") + + +def test_env_framework_overrides_scenario(): + assert "DISPATCH=lm-eval" in _dispatch(is_agentic="1", env_fw="lm-eval") + + +def test_env_can_force_swebench_on_fixed_seqlen(): + assert "DISPATCH=swebench" in _dispatch(is_agentic="0", env_fw="swebench") + + +def test_recipe_lm_eval_arg_still_lm_eval_on_fixed_seqlen(): + # The existing fixed-seq-len recipes call `run_eval --framework lm-eval`. + assert "DISPATCH=lm-eval" in _dispatch(is_agentic="0", cli_fw="lm-eval") diff --git a/utils/evals/test_swebench_eval.py b/utils/evals/test_swebench_eval.py new file mode 100644 index 000000000..72dba9b32 --- /dev/null +++ b/utils/evals/test_swebench_eval.py @@ -0,0 +1,235 @@ +"""Tests for the SWE-bench Lite eval MVP (generation -> scoring -> lm-eval shape). + +Pure-stdlib paths (extract_patch, predictions, report parsing, results shape) +run on any interpreter. The dataset filter and the collect/validate integration +guard on optional deps / interpreter version so the file imports cleanly even on +the macOS system python 3.9 used for local spot-checks. +""" + +import json +import sys +from pathlib import Path + +import pytest + +sys.path.insert(0, str(Path(__file__).resolve().parent)) # utils/evals +sys.path.insert(0, str(Path(__file__).resolve().parents[1])) # utils + +import swebench_score as sbs + + +# --- diff extraction ------------------------------------------------------- + +def test_extract_patch_from_diff_fence(): + text = ( + "Here is the fix:\n\n```diff\n" + "diff --git a/f.py b/f.py\n--- a/f.py\n+++ b/f.py\n" + "@@ -1 +1 @@\n-old\n+new\n```\nDone." + ) + patch = sbs.extract_patch(text) + assert patch.startswith("diff --git a/f.py b/f.py") + assert patch.endswith("\n") + assert "Here is the fix" not in patch + assert "Done." 
not in patch + + +def test_extract_patch_bare_diff_git(): + text = "no fence\ndiff --git a/x b/x\n@@ @@\n-a\n+b\n" + patch = sbs.extract_patch(text) + assert patch.startswith("diff --git a/x b/x") + assert "no fence" not in patch + + +def test_extract_patch_bare_diff_strips_trailing_prose(): + # A bare diff followed by an explanation must not glue the prose onto the + # patch (git apply would reject it -> instance scored unresolved). + text = ( + "diff --git a/x b/x\n--- a/x\n+++ b/x\n@@ -1 +1 @@\n-old\n+new\n" + "\nNotes:\nThis fixes #123.\n" + ) + patch = sbs.extract_patch(text) + assert patch.rstrip().endswith("+new") + assert "Notes:" not in patch + assert "This fixes" not in patch + + +def test_extract_patch_keeps_multi_file_and_interior_context(): + # Multiple files + a blank context line (represented as " ") stay intact. + text = ( + "```diff\n" + "diff --git a/a b/a\n@@ -1,2 +1,2 @@\n context\n-x\n+y\n" + "diff --git a/b b/b\n@@ -1 +1 @@\n-p\n+q\n" + "```\nthanks!" + ) + patch = sbs.extract_patch(text) + assert "diff --git a/a b/a" in patch + assert "diff --git a/b b/b" in patch + assert "thanks" not in patch + + +def test_extract_patch_empty_when_no_diff(): + assert sbs.extract_patch("") == "" + # Prose with no diff markers falls back to the raw text (harness will reject). 
+ assert sbs.extract_patch("just words").strip() == "just words" + + +# --- samples -> predictions ------------------------------------------------ + +def _write_samples(dirpath: Path, records: list[dict]) -> None: + with (dirpath / "samples_swebench_lite_2026.jsonl").open("w") as fh: + for rec in records: + fh.write(json.dumps(rec) + "\n") + + +def test_build_predictions_extracts_instance_and_patch(tmp_path): + _write_samples(tmp_path, [ + { + "doc": {"instance_id": "repo__proj-1"}, + "filtered_resps": ["```diff\ndiff --git a/a b/a\n+x\n```"], + }, + { + "doc": {"instance_id": "repo__proj-2"}, + "resps": [["diff --git a/b b/b\n+y\n"]], + }, + ]) + preds = sbs.build_predictions(tmp_path, "my-model") + by_id = {p["instance_id"]: p for p in preds} + assert set(by_id) == {"repo__proj-1", "repo__proj-2"} + assert by_id["repo__proj-1"]["model_name_or_path"] == "my-model" + assert by_id["repo__proj-1"]["model_patch"].startswith("diff --git a/a b/a") + assert by_id["repo__proj-2"]["model_patch"].startswith("diff --git a/b b/b") + + +def test_build_predictions_raises_without_samples(tmp_path): + with pytest.raises(FileNotFoundError): + sbs.build_predictions(tmp_path, "m") + + +# --- report parsing -------------------------------------------------------- + +def test_parse_resolved_classic_counts(): + assert sbs.parse_resolved( + {"resolved_instances": 80, "total_instances": 196} + ) == (80, 196) + + +def test_parse_resolved_from_id_lists(): + report = {"resolved_ids": ["a", "b", "c"], "completed_ids": ["a", "b", "c", "d"]} + # no total_instances -> falls back to completed_ids length + assert sbs.parse_resolved(report) == (3, 4) + + +def test_parse_resolved_raises_on_garbage(): + with pytest.raises(ValueError): + sbs.parse_resolved({"nope": 1}) + + +# --- harness command construction (Docker vs Modal) ------------------------ + +def _captured_harness_cmd(monkeypatch, tmp_path, *, modal, namespace): + captured = {} + monkeypatch.setattr(sbs.subprocess, "run", lambda cmd, 
**kw: captured.setdefault("cmd", cmd)) + sbs.run_harness( + tmp_path / "predictions.jsonl", "princeton-nlp/SWE-bench_Lite", "rid", + tmp_path, 8, namespace, modal=modal, + ) + return captured["cmd"] + + +def test_run_harness_modal_uses_modal_and_parallelism(monkeypatch, tmp_path): + cmd = _captured_harness_cmd(monkeypatch, tmp_path, modal=True, namespace="") + assert "--modal" in cmd and "--parallelism" in cmd + assert "--namespace" not in cmd # Docker-only + assert "--max_workers" not in cmd + + +def test_run_harness_docker_uses_max_workers_and_namespace(monkeypatch, tmp_path): + cmd = _captured_harness_cmd(monkeypatch, tmp_path, modal=False, namespace="") + assert "--max_workers" in cmd + assert "--namespace" in cmd + assert "--modal" not in cmd + + +# --- lm-eval-shaped results ------------------------------------------------ + +def test_build_results_json_is_lm_eval_shaped(): + res = sbs.build_results_json( + "swebench_lite", 49, 196, "m", "0.4.12", {"resolved_instances": 49} + ) + assert "lm_eval_version" in res # detection key for collect_eval_results + task = res["results"]["swebench_lite"] + assert task["exact_match,resolved"] == pytest.approx(0.25) + cfg = res["configs"]["swebench_lite"] + assert cfg["filter_list"] == [{"name": "resolved"}] + assert res["n-samples"]["swebench_lite"]["effective"] == 196 + + +def test_score_offline_end_to_end(tmp_path): + """--report path: samples -> predictions + results JSON, no Docker.""" + samples = tmp_path / "gen" + samples.mkdir() + _write_samples(samples, [ + {"doc": {"instance_id": "r__p-1"}, "filtered_resps": ["```diff\ndiff --git a/a b/a\n+x\n```"]}, + ]) + report = tmp_path / "report.json" + report.write_text(json.dumps({"resolved_instances": 1, "total_instances": 1})) + out = tmp_path / "out" + rc = sbs.main([ + "--samples-dir", str(samples), "--out-dir", str(out), + "--model-name", "m", "--report", str(report), + ]) + assert rc == 0 + assert (out / "predictions.jsonl").exists() + results = json.loads((out / 
"results_swebench_lite.json").read_text()) + assert results["results"]["swebench_lite"]["exact_match,resolved"] == 1.0 + + +def test_predictions_only_writes_predictions_no_results(tmp_path): + """SWEBENCH_SKIP_SCORE path: predictions only, no Docker, no results JSON.""" + samples = tmp_path / "gen" + samples.mkdir() + _write_samples(samples, [ + {"doc": {"instance_id": "r__p-1"}, "filtered_resps": ["```diff\ndiff --git a/a b/a\n+x\n```"]}, + ]) + out = tmp_path / "out" + rc = sbs.main([ + "--samples-dir", str(samples), "--out-dir", str(out), + "--model-name", "m", "--predictions-only", + ]) + assert rc == 0 + assert (out / "predictions.jsonl").exists() + assert not (out / "results_swebench_lite.json").exists() + + +# --- integration with the existing pipeline (needs tabulate + py3.10+) ----- + +@pytest.mark.skipif(sys.version_info < (3, 10), reason="repo modules use py3.10 syntax") +def test_results_json_flows_through_collect_and_validate(tmp_path, monkeypatch): + pytest.importorskip("tabulate") + import collect_eval_results as cer + import validate_scores as vs + + art = tmp_path / "eval" + art.mkdir() + (art / "meta_env.json").write_text(json.dumps({ + "infmax_model_prefix": "dsr1", "hw": "b200", "framework": "sglang", + "precision": "fp8", "isl": 8192, "osl": 1024, + })) + res = sbs.build_results_json( + "swebench_lite", 150, 300, "dsr1", "0.4.12", None + ) + (art / "results_swebench_lite.json").write_text(json.dumps(res)) + + # collect surfaces the resolved-rate as the unified `score`. + rows = cer.collect_eval_rows(tmp_path) + assert len(rows) == 1 + assert rows[0]["task"] == "swebench_lite" + assert rows[0]["score"] == pytest.approx(0.5) + + # validate_scores gates exact_match,resolved against thresholds.json (0.10). 
+ monkeypatch.chdir(art) + monkeypatch.setattr(sys, "argv", [ + "validate_scores.py", + "--results-glob", "results_swebench_lite.json", + ]) + assert vs.main() == 0 # 0.5 >= 0.10 default threshold diff --git a/utils/evals/thresholds.json b/utils/evals/thresholds.json index d6c091152..cbbe65105 100644 --- a/utils/evals/thresholds.json +++ b/utils/evals/thresholds.json @@ -1,7 +1,8 @@ { "default": { "gsm8k": 0.90, - "gpqa_diamond_cot_n_shot": 0.30 + "gpqa_diamond_cot_n_shot": 0.30, + "swebench_lite": 0.10 }, "models": { "dsr1": { From 8f9f3ee4b3303ee1fb252cac931cddb5931f62a7 Mon Sep 17 00:00:00 2001 From: adibarra <93070681+adibarra@users.noreply.github.com> Date: Thu, 2 Jul 2026 12:12:34 -0500 Subject: [PATCH 2/4] feat(evals): agentic-scenario eval selection + routing (swebench on agentic configs) Wire the SWE-bench Lite eval into the sweep matrix so it runs on agentic coding configs, and route it through e2e-tests. - utils/matrix_logic/generate_sweep_configs.py: extend mark_eval_entries and mark_all_eval_entries to cover agentic-coding configs. mark_eval_entries marks exactly one eval entry per (model, runner, framework, precision) group at the highest concurrency, single-node only, so each unique agentic config gets one swebench eval run rather than one per concurrency point; mark_all_eval_entries (--all-evals) marks every single-node agentic entry. - utils/matrix_logic/test_generate_sweep_configs.py: add test_marks_agentic_entry_for_swebench and update TestMarkAllEvalEntries to cover the agentic marking behavior. - .github/workflows/e2e-tests.yml: add the agentic-eval-config bucket, a test-sweep-agentic-evals job, and make collect-evals depend on it. The AGENTIC_EVAL filter (agentic + no prefill + run-eval) selects the eval entries; the throughput AGENTIC filter (agentic + not run-eval) excludes them so throughput and eval runs don't collide. - benchmarks/single_node/agentic/kimik2.5_fp4_b300.sh: add the eval hook so the recipe triggers the agentic swebench eval.
--- .github/workflows/e2e-tests.yml | 44 +++++++++++++++++-- .../single_node/agentic/kimik2.5_fp4_b300.sh | 13 ++++-- utils/matrix_logic/generate_sweep_configs.py | 31 +++++++++++-- .../test_generate_sweep_configs.py | 30 ++++++++++++- 4 files changed, 106 insertions(+), 12 deletions(-) diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml index 1b83a798a..58c6fe34d 100644 --- a/.github/workflows/e2e-tests.yml +++ b/.github/workflows/e2e-tests.yml @@ -50,6 +50,7 @@ jobs: eval-config: ${{ steps.get-jobs.outputs.eval-config }} multi-node-eval-config: ${{ steps.get-jobs.outputs.multi-node-eval-config }} agentic-config: ${{ steps.get-jobs.outputs.agentic-config }} + agentic-eval-config: ${{ steps.get-jobs.outputs.agentic-eval-config }} multi-node-agentic-config: ${{ steps.get-jobs.outputs.multi-node-agentic-config }} steps: - name: Checkout code (ref) @@ -69,13 +70,15 @@ jobs: pip install pydantic CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py \ ${{ inputs.generate-cli-command || github.event.inputs.generate-cli-command }}) - AGENTIC=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if x.get('scenario-type') == 'agentic-coding' and 'prefill' not in x]))") + AGENTIC=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if x.get('scenario-type') == 'agentic-coding' and 'prefill' not in x and not x.get('run-eval', False)]))") + AGENTIC_EVAL=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if x.get('scenario-type') == 'agentic-coding' and 'prefill' not in x and x.get('run-eval', False)]))") MULTI_AGENTIC=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if x.get('scenario-type') == 'agentic-coding' and 'prefill' in x]))") SINGLE=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; 
d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' not in x and x.get('scenario-type') != 'agentic-coding' and not x.get('eval-only', False)]))") MULTI=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' in x and x.get('scenario-type') != 'agentic-coding' and not x.get('eval-only', False)]))") EVALS=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' not in x and x.get('scenario-type') != 'agentic-coding' and x.get('run-eval', False)]))") MULTI_EVAL=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' in x and x.get('run-eval', False)]))") echo "agentic-config=$AGENTIC" >> $GITHUB_OUTPUT + echo "agentic-eval-config=$AGENTIC_EVAL" >> $GITHUB_OUTPUT echo "multi-node-agentic-config=$MULTI_AGENTIC" >> $GITHUB_OUTPUT echo "single-node-config=$SINGLE" >> $GITHUB_OUTPUT echo "multi-node-config=$MULTI" >> $GITHUB_OUTPUT @@ -195,6 +198,41 @@ jobs: scenario-type: agentic-coding ref: ${{ inputs.ref }} + test-sweep-agentic-evals: + needs: get-jobs + if: ${{ needs.get-jobs.outputs.agentic-eval-config != '[]' }} + uses: ./.github/workflows/benchmark-tmpl.yml + name: agentic eval / + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-jobs.outputs.agentic-eval-config) }} + secrets: inherit + with: + exp-name: ${{ matrix.config.exp-name }} + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + model-prefix: ${{ matrix.config.model-prefix }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep }} + dp-attn: ${{ matrix.config.dp-attn }} + conc: ${{ matrix.config.conc }} + offloading: ${{ matrix.config.offloading }} + duration: ${{ inputs.duration-override != '' && inputs.duration-override || 
matrix.config.duration }} + isl: '0' + osl: '0' + max-model-len: '0' + spec-decoding: 'none' + disagg: 'false' + # scenario-type agentic-coding => run_eval auto-selects swebench. + run-eval: true + eval-only: true + scenario-type: agentic-coding + ref: ${{ inputs.ref }} + test-sweep-multi-node-agentic: needs: get-jobs if: ${{ needs.get-jobs.outputs.multi-node-agentic-config != '[]' }} @@ -305,8 +343,8 @@ jobs: result-prefix: "bmk" collect-evals: - needs: [test-sweep-evals, test-sweep-multi-node-evals] - if: ${{ always() && (needs.test-sweep-evals.result != 'skipped' || needs.test-sweep-multi-node-evals.result != 'skipped') }} + needs: [test-sweep-evals, test-sweep-multi-node-evals, test-sweep-agentic-evals] + if: ${{ always() && (needs.test-sweep-evals.result != 'skipped' || needs.test-sweep-multi-node-evals.result != 'skipped' || needs.test-sweep-agentic-evals.result != 'skipped') }} uses: ./.github/workflows/collect-evals.yml secrets: inherit diff --git a/benchmarks/single_node/agentic/kimik2.5_fp4_b300.sh b/benchmarks/single_node/agentic/kimik2.5_fp4_b300.sh index 8cebe4f20..af0678246 100755 --- a/benchmarks/single_node/agentic/kimik2.5_fp4_b300.sh +++ b/benchmarks/single_node/agentic/kimik2.5_fp4_b300.sh @@ -109,7 +109,12 @@ echo "Server PID: $SERVER_PID" wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -# ---- Run benchmark ---------------------------------------------------------- -build_replay_cmd "$RESULT_DIR" - -run_agentic_replay_and_write_outputs "$RESULT_DIR" +if [ "${EVAL_ONLY}" = "true" ]; then + # Eval-only: skip the multi-turn agentic replay and run the eval against the + # live server. run_eval auto-selects swebench for agentic-coding scenario. 
+ maybe_run_eval "$PORT" +else + # ---- Run benchmark ------------------------------------------------------ + build_replay_cmd "$RESULT_DIR" + run_agentic_replay_and_write_outputs "$RESULT_DIR" +fi diff --git a/utils/matrix_logic/generate_sweep_configs.py b/utils/matrix_logic/generate_sweep_configs.py index af3e5053a..2512175d1 100644 --- a/utils/matrix_logic/generate_sweep_configs.py +++ b/utils/matrix_logic/generate_sweep_configs.py @@ -137,10 +137,30 @@ def _eligible_eval_concs(entry): eval_indices.add(best_idx) mn_eval_conc[best_idx] = best_eval_conc - # Mark the selected entries (skip agentic entries which don't support evals) + # Agentic-coding entries run swebench (single-shot), not gsm8k, and have no + # isl/osl so the 8k1k logic above never selects them. Single-node only for + # now: mark one entry per (model, runner, framework, precision) at its + # highest concurrency. (Multi-node agentic eval is a future extension.) + ag_sn_groups = defaultdict(list) for i, entry in enumerate(matrix_values): - if entry.get(Fields.SCENARIO_TYPE.value) == 'agentic-coding': + if entry.get(Fields.SCENARIO_TYPE.value) != 'agentic-coding': + continue + if Fields.PREFILL.value in entry: # multi-node agentic: not yet continue + conc = entry[Fields.CONC.value] + conc_val = max(conc) if isinstance(conc, list) else conc + key = ( + entry[Fields.MODEL.value], + entry[Fields.RUNNER.value], + entry[Fields.FRAMEWORK.value], + entry[Fields.PRECISION.value], + ) + ag_sn_groups[key].append((i, conc_val)) + for entries in ag_sn_groups.values(): + eval_indices.add(max(entries, key=lambda item: item[1])[0]) + + # Mark the selected entries (agentic-coding entries run swebench; see above). 
+ for i, entry in enumerate(matrix_values): entry[Fields.RUN_EVAL.value] = i in eval_indices if i in mn_eval_conc: entry[Fields.EVAL_CONC.value] = mn_eval_conc[i] @@ -154,7 +174,8 @@ def mark_all_eval_entries(matrix_values: list[dict]) -> list[dict]: Evals only run at 8k1k (matching mark_eval_entries), so entries at other sequence lengths (e.g. 1k1k) are passed through untouched rather than expanded into eval rows. - Agentic entries are left untouched because they do not support lm-eval. + Single-node agentic entries run swebench (single-shot) and are marked for + eval (multi-node agentic eval is a future extension). Multi-node rows with the same engine topology are merged into one eval row whose full concurrency list is run sequentially against the same engine. """ @@ -165,6 +186,10 @@ def mark_all_eval_entries(matrix_values: list[dict]) -> list[dict]: for entry in matrix_values: if entry.get(Fields.SCENARIO_TYPE.value) == 'agentic-coding': + # Agentic runs swebench (single-shot); mark single-node agentic for + # eval. Multi-node agentic eval is a future extension. 
+ if Fields.PREFILL.value not in entry: + entry[Fields.RUN_EVAL.value] = True expanded_entries.append(entry) continue diff --git a/utils/matrix_logic/test_generate_sweep_configs.py b/utils/matrix_logic/test_generate_sweep_configs.py index 72717eab6..da9d3b5db 100644 --- a/utils/matrix_logic/test_generate_sweep_configs.py +++ b/utils/matrix_logic/test_generate_sweep_configs.py @@ -196,6 +196,29 @@ def test_unknown_sequence_lengths(self): class TestMarkEvalEntries: """Tests for eval matrix selection policy.""" + def test_marks_agentic_entry_for_swebench(self): + """Agentic-coding configs (no isl/osl) are marked run-eval for swebench: + one per (model, runner, framework, precision) at highest conc.""" + matrix_values = [ + { + "scenario-type": "agentic-coding", + "model": "m", "runner": "b300", "framework": "vllm", + "precision": "fp4", "tp": 8, "conc": 32, + }, + { + "scenario-type": "agentic-coding", + "model": "m", "runner": "b300", "framework": "vllm", + "precision": "fp4", "tp": 8, "conc": 64, + }, + ] + + result = mark_eval_entries(matrix_values) + + # One eval per config, at the highest concurrency (64, not 32). + marked = [e for e in result if e.get("run-eval")] + assert len(marked) == 1 + assert marked[0]["conc"] == 64 + def test_single_node_skips_eval_entries_below_min_conc(self): """Single-node eval selection should ignore conc values below MIN_EVAL_CONC.""" matrix_values = [ @@ -605,7 +628,10 @@ def test_excludes_1k1k_multinode_entries_from_expansion(self): assert eight_k['eval-all-concs'] is True assert eight_k['conc'] == [8, 32] - def test_skips_agentic_entries(self): + def test_marks_agentic_entries_for_swebench(self): + # Agentic configs now run swebench (single-shot), so --all-evals marks + # every agentic entry run-eval=True -- but without the fixed-seq-len + # multi-node batching fields (eval-conc / eval-all-concs). 
entries = [ { 'scenario-type': 'agentic-coding', @@ -617,7 +643,7 @@ def test_skips_agentic_entries(self): result = mark_all_eval_entries(entries) - assert 'run-eval' not in result[0] + assert result[0]['run-eval'] is True assert 'eval-conc' not in result[0] assert 'eval-all-concs' not in result[0] From 13bf8e8c5a6a2270142200140005c5c5bbbdb1e0 Mon Sep 17 00:00:00 2001 From: adibarra <93070681+adibarra@users.noreply.github.com> Date: Thu, 2 Jul 2026 12:22:53 -0500 Subject: [PATCH 3/4] fix(evals): swebench Modal uses --max_workers (no --parallelism in 4.1.0) + bootstrap Modal creds from env swebench 4.1.0 exposes --max_workers in both Docker and Modal modes; --parallelism does not exist. Fix run_harness() to emit --max_workers in the Modal branch. Add _ensure_modal_credentials() to benchmark_lib.sh: swebench's credential check only looks for ~/.modal.toml, but CI supplies MODAL_TOKEN_ID/ MODAL_TOKEN_SECRET env vars (GitHub secret). The helper bootstraps the file from the env vars when the file is absent, so the harness check passes. Called in run_swebench_eval() right after _install_swebench_deps, scoring path only. Update the Modal test name and assertions, the run_swebench_eval docstring, and the EVALS.md knobs bullet to document the credential bootstrapping. 
--- benchmarks/benchmark_lib.sh | 23 ++++++++++++++++++++++- utils/evals/EVALS.md | 9 ++++++--- utils/evals/swebench_score.py | 5 +++-- utils/evals/test_swebench_eval.py | 7 ++++--- 4 files changed, 35 insertions(+), 9 deletions(-) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 1e59aa1b4..474c846fc 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -984,7 +984,10 @@ META # SWEBENCH_MAX_WORKERS (default 4) harness workers / Modal parallelism # SWEBENCH_USE_MODAL "true" => score on Modal remote sandboxes instead of # local Docker (no Docker needed on the node; requires a -# Modal account + ~/.modal.toml or MODAL_TOKEN_* creds) +# Modal account — credentials from ~/.modal.toml or from +# MODAL_TOKEN_ID/MODAL_TOKEN_SECRET env vars, e.g. a +# GitHub secret; the env vars are bootstrapped into +# ~/.modal.toml automatically if the file is absent) # SWEBENCH_NAMESPACE local-Docker only: pass "" on arm/Mac to build locally # SWEBENCH_SKIP_SCORE "true" => generate + stage predictions only, no scoring # (score elsewhere) @@ -996,6 +999,23 @@ _install_swebench_deps() { fi } +# swebench's validate_modal_credentials() only checks that ~/.modal.toml +# exists; the modal package itself prefers MODAL_TOKEN_ID/MODAL_TOKEN_SECRET +# env vars (how CI passes the GitHub secret). Bootstrap a minimal file from +# the env so the harness's check passes. Never overwrite an existing file. 
+_ensure_modal_credentials() { + if [ "${SWEBENCH_USE_MODAL:-false}" != "true" ]; then return 0; fi + if [ -f "$HOME/.modal.toml" ]; then return 0; fi + if [ -n "${MODAL_TOKEN_ID:-}" ] && [ -n "${MODAL_TOKEN_SECRET:-}" ]; then + ( umask 077 && printf '[default]\ntoken_id = "%s"\ntoken_secret = "%s"\nactive = true\n' \ + "$MODAL_TOKEN_ID" "$MODAL_TOKEN_SECRET" > "$HOME/.modal.toml" ) # subshell umask: the secret file is created 0600, never briefly world-readable + chmod 600 "$HOME/.modal.toml" # belt-and-braces; the umask above already yields 0600 + echo "[swebench] wrote ~/.modal.toml from MODAL_TOKEN_ID/MODAL_TOKEN_SECRET env" + else + echo "WARN: SWEBENCH_USE_MODAL=true but no ~/.modal.toml and no MODAL_TOKEN_ID/MODAL_TOKEN_SECRET env; Modal scoring will fail credential validation" >&2 + fi +} + # Run the configured eval and stage its artifacts when RUN_EVAL is enabled. # run_eval auto-selects the framework by scenario (agentic -> swebench, # fixed-seq-len -> lm-eval), so recipes call this without naming a framework. @@ -1072,6 +1092,7 @@ run_swebench_eval() { _install_swebench_deps export INFERENCEX_SWEBENCH_RUNTIME_READY=true fi + _ensure_modal_credentials local score_rc=0 python3 utils/evals/swebench_score.py \ --samples-dir "$gen_dir" \ diff --git a/utils/evals/EVALS.md b/utils/evals/EVALS.md index a7738defc..e518dcdb8 100644 --- a/utils/evals/EVALS.md +++ b/utils/evals/EVALS.md @@ -188,9 +188,12 @@ append_lm_eval_summary `python -m swebench.harness.run_evaluation` → resolved-rate → results JSON). Offline `--report` mode skips Docker for testing. - Knobs: `SWEBENCH_TASK_NAME` (selects the YAML), `SWEBENCH_MAX_WORKERS`, - `SWEBENCH_NAMESPACE` (pass `""` on arm/Mac), `SWEBENCH_SKIP_SCORE=true` (generate-only). The - scoring dataset is derived from the YAML's `dataset_path` so generation and scoring can't diverge; - `SWEBENCH_DATASET`, if set, must match it (mismatch fails fast). + `SWEBENCH_NAMESPACE` (pass `""` on arm/Mac), `SWEBENCH_SKIP_SCORE=true` (generate-only), + `SWEBENCH_USE_MODAL=true` (score on Modal remote sandboxes instead of local Docker).
Modal + credentials: set `MODAL_TOKEN_ID`/`MODAL_TOKEN_SECRET` (e.g. from a GitHub secret) or provide + `~/.modal.toml`; if the file is absent the env vars are bootstrapped into it automatically. + The scoring dataset is derived from the YAML's `dataset_path` so generation and scoring can't + diverge; `SWEBENCH_DATASET`, if set, must match it (mismatch fails fast). - **Requires Docker + ~120 GB disk on the scoring host.** This is an MVP; the single-shot prompt and diff extraction still need tuning to reach published resolved-rates, and the `thresholds.json` entry needs calibration from a baseline run. diff --git a/utils/evals/swebench_score.py b/utils/evals/swebench_score.py index 813a0e811..8492cf00b 100644 --- a/utils/evals/swebench_score.py +++ b/utils/evals/swebench_score.py @@ -204,8 +204,9 @@ def run_harness( ] if modal: # Modal remote sandboxes instead of local Docker (no Docker on the node). - # --parallelism replaces --max_workers; --namespace is local-Docker only. - cmd += ["--modal", "true", "--parallelism", str(max_workers)] + # swebench 4.1.0 uses --max_workers in both modal and Docker modes; + # --namespace is local-Docker only and is still omitted for modal. 
+ cmd += ["--modal", "true", "--max_workers", str(max_workers)] else: cmd += ["--max_workers", str(max_workers)] if namespace is not None: diff --git a/utils/evals/test_swebench_eval.py b/utils/evals/test_swebench_eval.py index 72dba9b32..6df711956 100644 --- a/utils/evals/test_swebench_eval.py +++ b/utils/evals/test_swebench_eval.py @@ -136,11 +136,12 @@ def _captured_harness_cmd(monkeypatch, tmp_path, *, modal, namespace): return captured["cmd"] -def test_run_harness_modal_uses_modal_and_parallelism(monkeypatch, tmp_path): +def test_run_harness_modal_uses_modal_flag(monkeypatch, tmp_path): cmd = _captured_harness_cmd(monkeypatch, tmp_path, modal=True, namespace="") - assert "--modal" in cmd and "--parallelism" in cmd + assert "--modal" in cmd + assert "--max_workers" in cmd + assert "--parallelism" not in cmd assert "--namespace" not in cmd # Docker-only - assert "--max_workers" not in cmd def test_run_harness_docker_uses_max_workers_and_namespace(monkeypatch, tmp_path): From e34035d55e8b97bf532637b3d12b5f132524a24c Mon Sep 17 00:00:00 2001 From: adibarra <93070681+adibarra@users.noreply.github.com> Date: Thu, 2 Jul 2026 12:27:25 -0500 Subject: [PATCH 4/4] feat(evals): eval-only gating for all single-node agentic recipes Apply the EVAL_ONLY=true if/else gating pattern (already present in kimik2.5_fp4_b300.sh) to the remaining 24 single-node agentic recipes in benchmarks/single_node/agentic/. In eval-only mode each recipe skips the multi-turn agentic replay and calls maybe_run_eval "$PORT" against the live server; run_eval auto-selects swebench for the agentic-coding scenario. The deprecated/ subdirectory was not touched. 
--- benchmarks/single_node/agentic/dsr1_fp4_b200.sh | 13 +++++++++---- benchmarks/single_node/agentic/dsr1_fp4_mi355x.sh | 13 +++++++++---- .../single_node/agentic/dsv4_fp4_b200_vllm.sh | 13 +++++++++---- .../single_node/agentic/dsv4_fp4_b300_vllm.sh | 13 +++++++++---- .../single_node/agentic/dsv4_fp4_mi355x_sglang.sh | 13 +++++++++---- benchmarks/single_node/agentic/dsv4_fp8_h200.sh | 13 +++++++++---- benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh | 13 +++++++++---- benchmarks/single_node/agentic/glm5_fp8_b200.sh | 13 +++++++++---- benchmarks/single_node/agentic/gptoss_fp4_b200.sh | 13 +++++++++---- benchmarks/single_node/agentic/gptoss_fp4_h100.sh | 13 +++++++++---- benchmarks/single_node/agentic/gptoss_fp4_h200.sh | 13 +++++++++---- benchmarks/single_node/agentic/gptoss_fp4_mi300x.sh | 13 +++++++++---- benchmarks/single_node/agentic/gptoss_fp4_mi325x.sh | 13 +++++++++---- benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh | 13 +++++++++---- .../single_node/agentic/kimik2.5_fp4_mi355x.sh | 13 +++++++++---- .../single_node/agentic/kimik2.5_int4_b200.sh | 13 +++++++++---- .../single_node/agentic/kimik2.5_int4_h100.sh | 13 +++++++++---- .../single_node/agentic/kimik2.5_int4_h200.sh | 13 +++++++++---- benchmarks/single_node/agentic/qwen3.5_bf16_b200.sh | 13 +++++++++---- benchmarks/single_node/agentic/qwen3.5_fp8_b200.sh | 13 +++++++++---- .../single_node/agentic/qwen3.5_fp8_b300_sglang.sh | 13 +++++++++---- benchmarks/single_node/agentic/qwen3.5_fp8_h100.sh | 13 +++++++++---- .../single_node/agentic/qwen3.5_fp8_mi355x.sh | 13 +++++++++---- .../agentic/qwen3.5_fp8_mi355x_sglang.sh | 13 +++++++++---- 24 files changed, 216 insertions(+), 96 deletions(-) diff --git a/benchmarks/single_node/agentic/dsr1_fp4_b200.sh b/benchmarks/single_node/agentic/dsr1_fp4_b200.sh index f9955adc7..7a46c136f 100755 --- a/benchmarks/single_node/agentic/dsr1_fp4_b200.sh +++ b/benchmarks/single_node/agentic/dsr1_fp4_b200.sh @@ -58,7 +58,12 @@ echo "Server PID: $SERVER_PID" 
wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -# ---- Run benchmark ---------------------------------------------------------- -build_replay_cmd "$RESULT_DIR" - -run_agentic_replay_and_write_outputs "$RESULT_DIR" +if [ "${EVAL_ONLY}" = "true" ]; then + # Eval-only: skip the multi-turn agentic replay and run the eval against the + # live server. run_eval auto-selects swebench for agentic-coding scenario. + maybe_run_eval "$PORT" +else + # ---- Run benchmark ------------------------------------------------------ + build_replay_cmd "$RESULT_DIR" + run_agentic_replay_and_write_outputs "$RESULT_DIR" +fi diff --git a/benchmarks/single_node/agentic/dsr1_fp4_mi355x.sh b/benchmarks/single_node/agentic/dsr1_fp4_mi355x.sh index ff76b768d..38dc01922 100755 --- a/benchmarks/single_node/agentic/dsr1_fp4_mi355x.sh +++ b/benchmarks/single_node/agentic/dsr1_fp4_mi355x.sh @@ -52,7 +52,12 @@ echo "Server PID: $SERVER_PID" wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -# ---- Run benchmark ---------------------------------------------------------- -build_replay_cmd "$RESULT_DIR" - -run_agentic_replay_and_write_outputs "$RESULT_DIR" +if [ "${EVAL_ONLY}" = "true" ]; then + # Eval-only: skip the multi-turn agentic replay and run the eval against the + # live server. run_eval auto-selects swebench for agentic-coding scenario. 
+ maybe_run_eval "$PORT" +else + # ---- Run benchmark ------------------------------------------------------ + build_replay_cmd "$RESULT_DIR" + run_agentic_replay_and_write_outputs "$RESULT_DIR" +fi diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh index 108347479..f663b659f 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh @@ -249,7 +249,12 @@ echo "Server PID: $SERVER_PID" wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -# ---- Run benchmark ---------------------------------------------------------- -build_replay_cmd "$RESULT_DIR" - -run_agentic_replay_and_write_outputs "$RESULT_DIR" +if [ "${EVAL_ONLY}" = "true" ]; then + # Eval-only: skip the multi-turn agentic replay and run the eval against the + # live server. run_eval auto-selects swebench for agentic-coding scenario. + maybe_run_eval "$PORT" +else + # ---- Run benchmark ------------------------------------------------------ + build_replay_cmd "$RESULT_DIR" + run_agentic_replay_and_write_outputs "$RESULT_DIR" +fi diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh index f6748a5f8..92e1dd2ba 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh @@ -137,7 +137,12 @@ echo "Server PID: $SERVER_PID" wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -# ---- Run benchmark ---------------------------------------------------------- -build_replay_cmd "$RESULT_DIR" - -run_agentic_replay_and_write_outputs "$RESULT_DIR" +if [ "${EVAL_ONLY}" = "true" ]; then + # Eval-only: skip the multi-turn agentic replay and run the eval against the + # live server. run_eval auto-selects swebench for agentic-coding scenario. 
+ maybe_run_eval "$PORT" +else + # ---- Run benchmark ------------------------------------------------------ + build_replay_cmd "$RESULT_DIR" + run_agentic_replay_and_write_outputs "$RESULT_DIR" +fi diff --git a/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh b/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh index 99aec25fe..04c297f4f 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh @@ -156,7 +156,12 @@ echo "Server PID: $SERVER_PID" wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -# ---- Run benchmark ---------------------------------------------------------- -build_replay_cmd "$RESULT_DIR" - -run_agentic_replay_and_write_outputs "$RESULT_DIR" +if [ "${EVAL_ONLY}" = "true" ]; then + # Eval-only: skip the multi-turn agentic replay and run the eval against the + # live server. run_eval auto-selects swebench for agentic-coding scenario. + maybe_run_eval "$PORT" +else + # ---- Run benchmark ------------------------------------------------------ + build_replay_cmd "$RESULT_DIR" + run_agentic_replay_and_write_outputs "$RESULT_DIR" +fi diff --git a/benchmarks/single_node/agentic/dsv4_fp8_h200.sh b/benchmarks/single_node/agentic/dsv4_fp8_h200.sh index 0a0177983..71fd2b331 100755 --- a/benchmarks/single_node/agentic/dsv4_fp8_h200.sh +++ b/benchmarks/single_node/agentic/dsv4_fp8_h200.sh @@ -63,7 +63,12 @@ echo "Server PID: $SERVER_PID" wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -# ---- Run benchmark ---------------------------------------------------------- -build_replay_cmd "$RESULT_DIR" - -run_agentic_replay_and_write_outputs "$RESULT_DIR" +if [ "${EVAL_ONLY}" = "true" ]; then + # Eval-only: skip the multi-turn agentic replay and run the eval against the + # live server. run_eval auto-selects swebench for agentic-coding scenario. 
+ maybe_run_eval "$PORT" +else + # ---- Run benchmark ------------------------------------------------------ + build_replay_cmd "$RESULT_DIR" + run_agentic_replay_and_write_outputs "$RESULT_DIR" +fi diff --git a/benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh b/benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh index 500b456f5..b731d81ce 100755 --- a/benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh +++ b/benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh @@ -64,7 +64,12 @@ echo "Server PID: $SERVER_PID" wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -# ---- Run benchmark ---------------------------------------------------------- -build_replay_cmd "$RESULT_DIR" - -run_agentic_replay_and_write_outputs "$RESULT_DIR" +if [ "${EVAL_ONLY}" = "true" ]; then + # Eval-only: skip the multi-turn agentic replay and run the eval against the + # live server. run_eval auto-selects swebench for agentic-coding scenario. + maybe_run_eval "$PORT" +else + # ---- Run benchmark ------------------------------------------------------ + build_replay_cmd "$RESULT_DIR" + run_agentic_replay_and_write_outputs "$RESULT_DIR" +fi diff --git a/benchmarks/single_node/agentic/glm5_fp8_b200.sh b/benchmarks/single_node/agentic/glm5_fp8_b200.sh index 259c19586..26e09e382 100755 --- a/benchmarks/single_node/agentic/glm5_fp8_b200.sh +++ b/benchmarks/single_node/agentic/glm5_fp8_b200.sh @@ -69,7 +69,12 @@ echo "Server PID: $SERVER_PID" wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -# ---- Run benchmark ---------------------------------------------------------- -build_replay_cmd "$RESULT_DIR" - -run_agentic_replay_and_write_outputs "$RESULT_DIR" +if [ "${EVAL_ONLY}" = "true" ]; then + # Eval-only: skip the multi-turn agentic replay and run the eval against the + # live server. run_eval auto-selects swebench for agentic-coding scenario. 
+ maybe_run_eval "$PORT" +else + # ---- Run benchmark ------------------------------------------------------ + build_replay_cmd "$RESULT_DIR" + run_agentic_replay_and_write_outputs "$RESULT_DIR" +fi diff --git a/benchmarks/single_node/agentic/gptoss_fp4_b200.sh b/benchmarks/single_node/agentic/gptoss_fp4_b200.sh index 6e921db58..ddcbc7937 100755 --- a/benchmarks/single_node/agentic/gptoss_fp4_b200.sh +++ b/benchmarks/single_node/agentic/gptoss_fp4_b200.sh @@ -66,7 +66,12 @@ echo "Server PID: $SERVER_PID" wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -# ---- Run benchmark ---------------------------------------------------------- -build_replay_cmd "$RESULT_DIR" - -run_agentic_replay_and_write_outputs "$RESULT_DIR" +if [ "${EVAL_ONLY}" = "true" ]; then + # Eval-only: skip the multi-turn agentic replay and run the eval against the + # live server. run_eval auto-selects swebench for agentic-coding scenario. + maybe_run_eval "$PORT" +else + # ---- Run benchmark ------------------------------------------------------ + build_replay_cmd "$RESULT_DIR" + run_agentic_replay_and_write_outputs "$RESULT_DIR" +fi diff --git a/benchmarks/single_node/agentic/gptoss_fp4_h100.sh b/benchmarks/single_node/agentic/gptoss_fp4_h100.sh index 557986b0d..68856a75c 100755 --- a/benchmarks/single_node/agentic/gptoss_fp4_h100.sh +++ b/benchmarks/single_node/agentic/gptoss_fp4_h100.sh @@ -70,7 +70,12 @@ echo "Server PID: $SERVER_PID" wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -# ---- Run benchmark ---------------------------------------------------------- -build_replay_cmd "$RESULT_DIR" - -run_agentic_replay_and_write_outputs "$RESULT_DIR" +if [ "${EVAL_ONLY}" = "true" ]; then + # Eval-only: skip the multi-turn agentic replay and run the eval against the + # live server. run_eval auto-selects swebench for agentic-coding scenario. 
+ maybe_run_eval "$PORT" +else + # ---- Run benchmark ------------------------------------------------------ + build_replay_cmd "$RESULT_DIR" + run_agentic_replay_and_write_outputs "$RESULT_DIR" +fi diff --git a/benchmarks/single_node/agentic/gptoss_fp4_h200.sh b/benchmarks/single_node/agentic/gptoss_fp4_h200.sh index 1592a8d5c..360aed712 100755 --- a/benchmarks/single_node/agentic/gptoss_fp4_h200.sh +++ b/benchmarks/single_node/agentic/gptoss_fp4_h200.sh @@ -70,7 +70,12 @@ echo "Server PID: $SERVER_PID" wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -# ---- Run benchmark ---------------------------------------------------------- -build_replay_cmd "$RESULT_DIR" - -run_agentic_replay_and_write_outputs "$RESULT_DIR" +if [ "${EVAL_ONLY}" = "true" ]; then + # Eval-only: skip the multi-turn agentic replay and run the eval against the + # live server. run_eval auto-selects swebench for agentic-coding scenario. + maybe_run_eval "$PORT" +else + # ---- Run benchmark ------------------------------------------------------ + build_replay_cmd "$RESULT_DIR" + run_agentic_replay_and_write_outputs "$RESULT_DIR" +fi diff --git a/benchmarks/single_node/agentic/gptoss_fp4_mi300x.sh b/benchmarks/single_node/agentic/gptoss_fp4_mi300x.sh index eb1883ff1..1fc45d47d 100755 --- a/benchmarks/single_node/agentic/gptoss_fp4_mi300x.sh +++ b/benchmarks/single_node/agentic/gptoss_fp4_mi300x.sh @@ -83,7 +83,12 @@ echo "Server PID: $SERVER_PID" wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -# ---- Run benchmark ---------------------------------------------------------- -build_replay_cmd "$RESULT_DIR" - -run_agentic_replay_and_write_outputs "$RESULT_DIR" +if [ "${EVAL_ONLY}" = "true" ]; then + # Eval-only: skip the multi-turn agentic replay and run the eval against the + # live server. run_eval auto-selects swebench for agentic-coding scenario. 
+ maybe_run_eval "$PORT" +else + # ---- Run benchmark ------------------------------------------------------ + build_replay_cmd "$RESULT_DIR" + run_agentic_replay_and_write_outputs "$RESULT_DIR" +fi diff --git a/benchmarks/single_node/agentic/gptoss_fp4_mi325x.sh b/benchmarks/single_node/agentic/gptoss_fp4_mi325x.sh index 99e29c819..c1bde7465 100755 --- a/benchmarks/single_node/agentic/gptoss_fp4_mi325x.sh +++ b/benchmarks/single_node/agentic/gptoss_fp4_mi325x.sh @@ -82,7 +82,12 @@ echo "Server PID: $SERVER_PID" wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -# ---- Run benchmark ---------------------------------------------------------- -build_replay_cmd "$RESULT_DIR" - -run_agentic_replay_and_write_outputs "$RESULT_DIR" +if [ "${EVAL_ONLY}" = "true" ]; then + # Eval-only: skip the multi-turn agentic replay and run the eval against the + # live server. run_eval auto-selects swebench for agentic-coding scenario. + maybe_run_eval "$PORT" +else + # ---- Run benchmark ------------------------------------------------------ + build_replay_cmd "$RESULT_DIR" + run_agentic_replay_and_write_outputs "$RESULT_DIR" +fi diff --git a/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh b/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh index ad0b4495a..791bd549c 100755 --- a/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh +++ b/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh @@ -202,7 +202,12 @@ echo "Server PID: $SERVER_PID" wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -# ---- Run benchmark ---------------------------------------------------------- -build_replay_cmd "$RESULT_DIR" - -run_agentic_replay_and_write_outputs "$RESULT_DIR" +if [ "${EVAL_ONLY}" = "true" ]; then + # Eval-only: skip the multi-turn agentic replay and run the eval against the + # live server. run_eval auto-selects swebench for agentic-coding scenario. 
+ maybe_run_eval "$PORT" +else + # ---- Run benchmark ------------------------------------------------------ + build_replay_cmd "$RESULT_DIR" + run_agentic_replay_and_write_outputs "$RESULT_DIR" +fi diff --git a/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh b/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh index fd0ce3677..4b7e285ee 100755 --- a/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh +++ b/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh @@ -808,7 +808,12 @@ echo "Server PID: $SERVER_PID" wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -# ---- Run benchmark ---------------------------------------------------------- -build_replay_cmd "$RESULT_DIR" - -run_agentic_replay_and_write_outputs "$RESULT_DIR" +if [ "${EVAL_ONLY}" = "true" ]; then + # Eval-only: skip the multi-turn agentic replay and run the eval against the + # live server. run_eval auto-selects swebench for agentic-coding scenario. + maybe_run_eval "$PORT" +else + # ---- Run benchmark ------------------------------------------------------ + build_replay_cmd "$RESULT_DIR" + run_agentic_replay_and_write_outputs "$RESULT_DIR" +fi diff --git a/benchmarks/single_node/agentic/kimik2.5_int4_b200.sh b/benchmarks/single_node/agentic/kimik2.5_int4_b200.sh index 697d3fa45..f40572563 100755 --- a/benchmarks/single_node/agentic/kimik2.5_int4_b200.sh +++ b/benchmarks/single_node/agentic/kimik2.5_int4_b200.sh @@ -61,7 +61,12 @@ echo "Server PID: $SERVER_PID" wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -# ---- Run benchmark ---------------------------------------------------------- -build_replay_cmd "$RESULT_DIR" - -run_agentic_replay_and_write_outputs "$RESULT_DIR" +if [ "${EVAL_ONLY}" = "true" ]; then + # Eval-only: skip the multi-turn agentic replay and run the eval against the + # live server. run_eval auto-selects swebench for agentic-coding scenario. 
+ maybe_run_eval "$PORT" +else + # ---- Run benchmark ------------------------------------------------------ + build_replay_cmd "$RESULT_DIR" + run_agentic_replay_and_write_outputs "$RESULT_DIR" +fi diff --git a/benchmarks/single_node/agentic/kimik2.5_int4_h100.sh b/benchmarks/single_node/agentic/kimik2.5_int4_h100.sh index 2fd3b381c..a11462bc2 100755 --- a/benchmarks/single_node/agentic/kimik2.5_int4_h100.sh +++ b/benchmarks/single_node/agentic/kimik2.5_int4_h100.sh @@ -62,7 +62,12 @@ echo "Server PID: $SERVER_PID" wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -# ---- Run benchmark ---------------------------------------------------------- -build_replay_cmd "$RESULT_DIR" - -run_agentic_replay_and_write_outputs "$RESULT_DIR" +if [ "${EVAL_ONLY}" = "true" ]; then + # Eval-only: skip the multi-turn agentic replay and run the eval against the + # live server. run_eval auto-selects swebench for agentic-coding scenario. + maybe_run_eval "$PORT" +else + # ---- Run benchmark ------------------------------------------------------ + build_replay_cmd "$RESULT_DIR" + run_agentic_replay_and_write_outputs "$RESULT_DIR" +fi diff --git a/benchmarks/single_node/agentic/kimik2.5_int4_h200.sh b/benchmarks/single_node/agentic/kimik2.5_int4_h200.sh index 97929e43e..81fba6cfd 100755 --- a/benchmarks/single_node/agentic/kimik2.5_int4_h200.sh +++ b/benchmarks/single_node/agentic/kimik2.5_int4_h200.sh @@ -72,7 +72,12 @@ echo "Server PID: $SERVER_PID" wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -# ---- Run benchmark ---------------------------------------------------------- -build_replay_cmd "$RESULT_DIR" - -run_agentic_replay_and_write_outputs "$RESULT_DIR" +if [ "${EVAL_ONLY}" = "true" ]; then + # Eval-only: skip the multi-turn agentic replay and run the eval against the + # live server. run_eval auto-selects swebench for agentic-coding scenario. 
+ maybe_run_eval "$PORT" +else + # ---- Run benchmark ------------------------------------------------------ + build_replay_cmd "$RESULT_DIR" + run_agentic_replay_and_write_outputs "$RESULT_DIR" +fi diff --git a/benchmarks/single_node/agentic/qwen3.5_bf16_b200.sh b/benchmarks/single_node/agentic/qwen3.5_bf16_b200.sh index 4ba87976b..80591bbce 100755 --- a/benchmarks/single_node/agentic/qwen3.5_bf16_b200.sh +++ b/benchmarks/single_node/agentic/qwen3.5_bf16_b200.sh @@ -65,7 +65,12 @@ echo "Server PID: $SERVER_PID" wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -# ---- Run benchmark ---------------------------------------------------------- -build_replay_cmd "$RESULT_DIR" - -run_agentic_replay_and_write_outputs "$RESULT_DIR" +if [ "${EVAL_ONLY}" = "true" ]; then + # Eval-only: skip the multi-turn agentic replay and run the eval against the + # live server. run_eval auto-selects swebench for agentic-coding scenario. + maybe_run_eval "$PORT" +else + # ---- Run benchmark ------------------------------------------------------ + build_replay_cmd "$RESULT_DIR" + run_agentic_replay_and_write_outputs "$RESULT_DIR" +fi diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_b200.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_b200.sh index 3432af5c9..daee15f20 100755 --- a/benchmarks/single_node/agentic/qwen3.5_fp8_b200.sh +++ b/benchmarks/single_node/agentic/qwen3.5_fp8_b200.sh @@ -65,7 +65,12 @@ echo "Server PID: $SERVER_PID" wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -# ---- Run benchmark ---------------------------------------------------------- -build_replay_cmd "$RESULT_DIR" - -run_agentic_replay_and_write_outputs "$RESULT_DIR" +if [ "${EVAL_ONLY}" = "true" ]; then + # Eval-only: skip the multi-turn agentic replay and run the eval against the + # live server. run_eval auto-selects swebench for agentic-coding scenario. 
+ maybe_run_eval "$PORT" +else + # ---- Run benchmark ------------------------------------------------------ + build_replay_cmd "$RESULT_DIR" + run_agentic_replay_and_write_outputs "$RESULT_DIR" +fi diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_b300_sglang.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_b300_sglang.sh index 9d9c1d7d5..a49d925dc 100755 --- a/benchmarks/single_node/agentic/qwen3.5_fp8_b300_sglang.sh +++ b/benchmarks/single_node/agentic/qwen3.5_fp8_b300_sglang.sh @@ -120,7 +120,12 @@ echo "Server PID: $SERVER_PID" wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -# ---- Run benchmark ---------------------------------------------------------- -build_replay_cmd "$RESULT_DIR" - -run_agentic_replay_and_write_outputs "$RESULT_DIR" +if [ "${EVAL_ONLY}" = "true" ]; then + # Eval-only: skip the multi-turn agentic replay and run the eval against the + # live server. run_eval auto-selects swebench for agentic-coding scenario. + maybe_run_eval "$PORT" +else + # ---- Run benchmark ------------------------------------------------------ + build_replay_cmd "$RESULT_DIR" + run_agentic_replay_and_write_outputs "$RESULT_DIR" +fi diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_h100.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_h100.sh index 95f0397a0..2cc3d1526 100755 --- a/benchmarks/single_node/agentic/qwen3.5_fp8_h100.sh +++ b/benchmarks/single_node/agentic/qwen3.5_fp8_h100.sh @@ -131,7 +131,12 @@ echo "Server PID: $SERVER_PID" wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -# ---- Run benchmark ---------------------------------------------------------- -build_replay_cmd "$RESULT_DIR" - -run_agentic_replay_and_write_outputs "$RESULT_DIR" +if [ "${EVAL_ONLY}" = "true" ]; then + # Eval-only: skip the multi-turn agentic replay and run the eval against the + # live server. run_eval auto-selects swebench for agentic-coding scenario. 
+ maybe_run_eval "$PORT" +else + # ---- Run benchmark ------------------------------------------------------ + build_replay_cmd "$RESULT_DIR" + run_agentic_replay_and_write_outputs "$RESULT_DIR" +fi diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh index aef9650ca..84002aeb8 100755 --- a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh +++ b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh @@ -56,7 +56,12 @@ echo "Server PID: $SERVER_PID" wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -# ---- Run benchmark ---------------------------------------------------------- -build_replay_cmd "$RESULT_DIR" - -run_agentic_replay_and_write_outputs "$RESULT_DIR" +if [ "${EVAL_ONLY}" = "true" ]; then + # Eval-only: skip the multi-turn agentic replay and run the eval against the + # live server. run_eval auto-selects swebench for agentic-coding scenario. + maybe_run_eval "$PORT" +else + # ---- Run benchmark ------------------------------------------------------ + build_replay_cmd "$RESULT_DIR" + run_agentic_replay_and_write_outputs "$RESULT_DIR" +fi diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh index 5427d0d31..e25afa775 100755 --- a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh +++ b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh @@ -136,7 +136,12 @@ echo "Server PID: $SERVER_PID" wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -# ---- Run benchmark ---------------------------------------------------------- -build_replay_cmd "$RESULT_DIR" - -run_agentic_replay_and_write_outputs "$RESULT_DIR" +if [ "${EVAL_ONLY}" = "true" ]; then + # Eval-only: skip the multi-turn agentic replay and run the eval against the + # live server. run_eval auto-selects swebench for agentic-coding scenario. 
+ maybe_run_eval "$PORT" +else + # ---- Run benchmark ------------------------------------------------------ + build_replay_cmd "$RESULT_DIR" + run_agentic_replay_and_write_outputs "$RESULT_DIR" +fi