diff --git a/README.md b/README.md
index 62352216..b63b30cd 100644
--- a/README.md
+++ b/README.md
@@ -96,14 +96,14 @@ alphajudge PATH [PATH ...] \
 - **-o / --summary**: Write an aggregated CSV across all processed runs
 - **--cores**: Number of processes to use across run directories (0 = all available cores)
 - **--report / --no-report**: Write an RCSB-style `report.pdf` next to each per-run `interfaces.csv`. Default is on for single-run scoring and off when `--summary` is used, so benchmark aggregations stay fast.
-- **--aggregate_report AGGREGATE.pdf**: After scoring, build a multi-page validation PDF from the `--summary` CSV with one slider page per interface ranked by meta score (requires `--summary`).
+- **--aggregate_report AGGREGATE.pdf**: After scoring, build a multi-page validation PDF from the `--summary` CSV with one slider page per interface ranked by meta score, followed by a "Per-complex evidence" section with the per-complex confidence sliders and PAE heatmap for each of the top-N complexes (requires `--summary`).
 
 Outputs:
 - Always writes `interfaces.csv` inside each processed run directory.
 - For each processed model, also writes a PAE heatmap PNG `pae_<model>.png` next to `interfaces.csv`.
-- If `--report` is on, also writes `report.pdf` next to `interfaces.csv` -- an RCSB-style validation report with a percentile slider panel for every detected interface.
+- If `--report` is on, also writes `report.pdf` next to `interfaces.csv` -- an RCSB-style validation report with a percentile slider panel for every detected interface, followed by a "Complex-level confidence & PAE" page that combines the best model's per-complex scalars (confidence score, pDockQ/mpDockQ) with its PAE heatmap; any pages for non-best models come after it.
 - If `--summary` is provided, also writes a union-header CSV at the given path containing rows from all runs.
-- If `--aggregate_report` is provided, also writes a multi-page PDF with one slider page per interface across the whole cohort, plus a cover with the meta-score histogram, summary statistics, and a top-N interfaces table.
+- If `--aggregate_report` is provided, also writes a multi-page PDF: cover with the meta-score histogram, summary statistics, and a top-N interfaces table; one slider page per interface across the whole cohort; then a "Per-complex evidence" section with one page per top-N complex (per-complex confidence sliders plus PAE heatmap).
 
 Report generation is backend-agnostic: AF2, AF3, and Boltz-2 runs all flow through the same scoring path, so `--report` and `--aggregate_report` work identically for any mix of supported predictions in one cohort. Multimers contribute one slider page per detected chain pair; dimers contribute one.
 
diff --git a/src/alphajudge/report.py b/src/alphajudge/report.py
index 476974ff..f2df8af0 100644
--- a/src/alphajudge/report.py
+++ b/src/alphajudge/report.py
@@ -106,22 +106,31 @@
 }
 
 # Metric grouping for the slider panel. Lines are drawn only WITHIN each group
-# (AF-derived vs. biophysical); the Q-score is kept separate and never joined
-# to a polyline.
+# (AF-derived vs. biophysical); the Meta-score row stays separate and is never
+# joined to a polyline.
+#
+# Per-interface vs. complex-level: features that are scalars per predicted
+# complex (not per chain pair) are pulled out of the per-interface slider
+# panel and shown together with the PAE on a dedicated end-of-report page.
+# In AF3 iptm is per chain pair (chain_pair_iptm), so it stays in the
+# AF-derived group; confidence_score and pDockQ/mpDockQ are global to the
+# complex and live in _COMPLEX_LEVEL_FEATURES.
 _AF_DERIVED_FEATURES = (
     "interface_LIS",
     "interface_ipSAE",
     "interface_pDockQ2",
     "iptm",
-    "confidence_score",
     "average_interface_pae",
-    "pDockQ/mpDockQ",
 )
 _BIOPHYSICAL_FEATURES = (
     "interface_sc",
     "interface_hb",
     "interface_solv_en",
 )
+_COMPLEX_LEVEL_FEATURES = (
+    "confidence_score",
+    "pDockQ/mpDockQ",
+)
 
 
 # ---------------------------------------------------------------------------
@@ -590,14 +599,23 @@ def _metric_rows_for_slider_panel(
     row: Mapping[str, Any],
     *,
     include_overall: bool,
+    groups: Sequence[tuple[str, Sequence[str]]] | None = None,
 ) -> list[tuple[str, float | None, float | None, str, str]]:
     """Return (label, raw, percentile, units, group) per slider row.
 
-    Group is one of "overall" (the Q-score header row), "af" (AlphaFold-
-    derived confidence features) or "biophys" (biophysical features). The
-    grouping is used by ``_draw_slider_panel`` to add vertical spacing
-    between groups and to draw polylines only within a group.
+    Group is one of "overall" (the Meta-score row), "af" (AlphaFold-
+    derived confidence features), "biophys" (biophysical features), or
+    "complex" (per-complex scalars). The grouping is used by
+    ``_draw_slider_panel`` to add vertical spacing between groups and to
+    draw polylines only within a group.
+
+    ``groups`` lets callers swap the per-interface feature list for a
+    different set (e.g. just complex-level metrics on the end-of-report
+    PAE page); when ``None`` the per-interface layout is used.
     """
+    if groups is None:
+        groups = (("af", _AF_DERIVED_FEATURES), ("biophys", _BIOPHYSICAL_FEATURES))
+
     rows: list[tuple[str, float | None, float | None, str, str]] = []
 
     if include_overall:
@@ -605,28 +623,25 @@ def _metric_rows_for_slider_panel(
         rows.append(("Meta score", score, score, "", "overall"))
 
     fv = _feature_view(row)
-    for feat in _AF_DERIVED_FEATURES:
-        raw, pct = fv[feat]
-        rows.append(
-            (
-                _FEATURE_DISPLAY.get(feat, feat),
-                raw,
-                pct,
-                _FEATURE_UNITS.get(feat, ""),
-                "af",
-            )
-        )
-    for feat in _BIOPHYSICAL_FEATURES:
-        raw, pct = fv[feat]
-        rows.append(
-            (
-                _FEATURE_DISPLAY.get(feat, feat),
-                raw,
-                pct,
-                _FEATURE_UNITS.get(feat, ""),
-                "biophys",
+    for group_tag, features in groups:
+        for feat in features:
+            if feat in fv:
+                raw, pct = fv[feat]
+            else:
+                # Compute percentile even when feature isn't in METASCORE
+                # (e.g. complex-level features were dropped from the metascore
+                # but still need a slider bar).
+                raw = _safe_float(row.get(feat))
+                pct = calibrated_feature_percentile(feat, raw) if raw is not None else None
+            rows.append(
+                (
+                    _FEATURE_DISPLAY.get(feat, feat),
+                    raw,
+                    pct,
+                    _FEATURE_UNITS.get(feat, ""),
+                    group_tag,
+                )
             )
-        )
 
     return rows
 
@@ -674,17 +689,21 @@ def _draw_slider_panel(
     height: float,
     row: Mapping[str, Any],
     include_overall: bool = True,
+    groups: Sequence[tuple[str, Sequence[str]]] | None = None,
 ) -> float:
     """Draw a compact wwPDB-style percentile graphic.
 
-    The Q-score row (if included) is rendered first and visually offset
-    from the rest. AlphaFold-derived confidence features and biophysical
-    features are drawn as two separate groups, each connected by its own
-    polyline; lines never cross the Q-score or the group boundary.
+    The Meta-score row (if included) is rendered first and visually offset
+    from the rest. Each group passed in ``groups`` is rendered as its own
+    block, with its own connecting polyline; lines never cross the
+    Meta-score row or a group boundary. When ``groups`` is ``None`` the
+    standard per-interface layout (AF-derived + biophysical) is used.
 
     Returns the bottom y coordinate of the graphic.
     """
-    rows = _metric_rows_for_slider_panel(row, include_overall=include_overall)
+    rows = _metric_rows_for_slider_panel(
+        row, include_overall=include_overall, groups=groups
+    )
     n_rows = len(rows)
     if n_rows == 0:
         return top
@@ -796,11 +815,13 @@ def _draw_slider_panel(
     def _row_y(idx: int) -> float:
         return centers[idx]
 
-    # Polyline segments per metric group (skip "overall" - no line through Q-score).
-    by_group: dict[str, list[tuple[float, float]]] = {"af": [], "biophys": []}
+    # Polyline segments per metric group (skip "overall" - the Meta-score row
+    # is intentionally not connected to any feature row).
+    by_group: dict[str, list[tuple[float, float]]] = {}
     for idx, pct, group in pct_positions:
-        if group in by_group:
-            by_group[group].append((pct, _row_y(idx)))
+        if group == "overall":
+            continue
+        by_group.setdefault(group, []).append((pct, _row_y(idx)))
 
     for points in by_group.values():
         if len(points) >= 2:
@@ -1290,18 +1311,27 @@ def render_pae_png(
     return out_path
 
 
-def _pae_page(
+def _complex_evidence_page(
     pdf: PdfPages,
     *,
     title: str,
     entry_id: str,
     section_no: str,
-    pae_path: Path,
+    row: Mapping[str, Any] | None,
+    pae_path: Path | None,
     model_label: str,
     page_no: int,
     total: int,
     last: bool = False,
+    complex_label: str | None = None,
 ) -> None:
+    """One end-of-report page that combines:
+
+    - Complex-level slider rows (confidence_score, pDockQ/mpDockQ).
+      These are scalars per predicted complex/model rather than per chain
+      pair, so showing them on every interface page was misleading.
+    - The PAE heatmap for the same model (when a PNG is available).
+    """
     fig = _new_figure()
     _add_page_header(fig, page_no=page_no, total=total, title=title, entry=entry_id)
 
@@ -1312,30 +1342,59 @@ def _pae_page(
         w=0.86,
         h=0.045,
         number=section_no,
-        title="Predicted aligned error (PAE)",
+        title="Complex-level confidence & PAE",
         show_info=False,
     )
 
+    sub_bits: list[str] = []
+    if complex_label:
+        sub_bits.append(complex_label)
     if model_label:
+        sub_bits.append(f"Model {model_label}")
+    if sub_bits:
         sub_ax = fig.add_axes((0.10, 0.855, 0.80, 0.030))
         sub_ax.axis("off")
         sub_ax.text(
             0.5,
             0.5,
-            f"Model: {model_label}",
+            "  •  ".join(sub_bits),
             ha="center",
             va="center",
-            fontsize=9,
-            color="#555555",
+            fontsize=10,
+            color="#1f1f1f",
             transform=sub_ax.transAxes,
         )
 
-    img_ax = fig.add_axes((0.10, 0.105, 0.80, 0.730))
-    try:
-        img = mpimg.imread(str(pae_path))
-        img_ax.imshow(img)
-    except Exception as e:
-        img_ax.text(0.5, 0.5, f"PAE image unavailable\n({e})", ha="center", va="center")
+    # Top half: complex-level slider mini-panel.
+    if row is not None:
+        _draw_slider_panel(
+            fig,
+            top=0.815,
+            height=0.180,
+            row=row,
+            include_overall=False,
+            groups=[("complex", _COMPLEX_LEVEL_FEATURES)],
+        )
+
+    # Bottom half: PAE heatmap, or a small inline note if no PNG was found.
+    img_ax = fig.add_axes((0.10, 0.075, 0.80, 0.530))
+    if pae_path is not None and Path(pae_path).exists():
+        try:
+            img = mpimg.imread(str(pae_path))
+            img_ax.imshow(img)
+        except Exception as e:
+            img_ax.text(0.5, 0.5, f"PAE image unavailable\n({e})",
+                        ha="center", va="center", fontsize=10, color="#666")
+    else:
+        img_ax.text(
+            0.5,
+            0.5,
+            "No PAE heatmap available for this model.",
+            ha="center",
+            va="center",
+            fontsize=10,
+            color="#666",
+        )
     img_ax.set_xticks([])
     img_ax.set_yticks([])
     for spine in img_ax.spines.values():
@@ -1620,7 +1679,7 @@ def generate_per_run_report(
         1  # cover
         + (1 if show_interface_table else 0)  # overview table
         + len(interface_rows)  # one slider page per interface
-        + (1 if pae_path else 0)  # PAE heatmap
+        + 1  # complex-level confidence + PAE evidence
         + len(other_models)  # non-best-model appendix
     )
 
@@ -1709,20 +1768,21 @@ def generate_per_run_report(
             )
         next_section = quality_section_no + 1
 
-        if pae_path is not None:
-            page_no += 1
-            _pae_page(
-                pdf,
-                title=_REPORT_TITLE,
-                entry_id=entry_id,
-                section_no=str(next_section),
-                pae_path=pae_path,
-                model_label=best_model,
-                page_no=page_no,
-                total=total,
-                last=(page_no == total),
-            )
-            next_section += 1
+        page_no += 1
+        _complex_evidence_page(
+            pdf,
+            title=_REPORT_TITLE,
+            entry_id=entry_id,
+            section_no=str(next_section),
+            row=best,
+            pae_path=pae_path,
+            model_label=best_model,
+            page_no=page_no,
+            total=total,
+            last=(page_no == total),
+            complex_label=run_dir.name,
+        )
+        next_section += 1
 
         for m in other_models:
             m_rows = by_model[m]
@@ -1790,7 +1850,21 @@ def generate_aggregate_report(
     ranked.sort(key=lambda t: t[3], reverse=True)
 
     top_rows = [(label, score, r) for label, _, _, score, r in ranked[:top_n]]
-    ranked_per_page = ranked if max_complexes is None else ranked[:max_complexes]
+    if max_complexes is None:
+        ranked_per_page = ranked
+    else:
+        # Cap the number of DISTINCT complexes (not raw interface rows).
+        # Walk metascore-sorted; keep every interface row whose complex is
+        # among the first `max_complexes` complexes encountered.
+        ranked_per_page = []
+        seen_complex: set[str] = set()
+        for entry in ranked:
+            cname = entry[1]
+            if cname in seen_complex:
+                ranked_per_page.append(entry)
+            elif len(seen_complex) < max_complexes:
+                seen_complex.add(cname)
+                ranked_per_page.append(entry)
 
     # Backends counted per complex (so a multimer doesn't multi-count).
     seen_backend: dict[str, str] = {}
@@ -1804,7 +1878,23 @@ def generate_aggregate_report(
     scores = [s for _, _, _, s, _ in ranked]
     n_complexes = len(seen_backend)
     n_interfaces = len(ranked)
-    total = 1 + len(ranked_per_page)
+
+    # Pick the best-scoring row per complex for the "Per-complex evidence"
+    # section (one PAE + complex-level slider page per complex). Cap it at
+    # the cover table's top_n, or max_complexes if smaller, to bound the PDF.
+    best_per_complex: "OrderedDict[str, tuple[float, Mapping[str, Any]]]" = OrderedDict()
+    for _label, cname, _iface, score, r in ranked:
+        cur = best_per_complex.get(cname)
+        if cur is None or score > cur[0]:
+            best_per_complex[cname] = (score, r)
+    evidence_cap = top_n if max_complexes is None else min(top_n, max_complexes)
+    complex_evidence = sorted(
+        best_per_complex.items(),
+        key=lambda kv: kv[1][0],
+        reverse=True,
+    )[:evidence_cap]
+
+    total = 1 + len(ranked_per_page) + len(complex_evidence)
 
     out_pdf = Path(out_pdf)
     out_pdf.parent.mkdir(parents=True, exist_ok=True)
@@ -1829,7 +1919,29 @@ def generate_aggregate_report(
                 cohort_position=(rank, len(ranked_per_page)),
                 page_no=1 + rank,
                 total=total,
-                last=(rank == len(ranked_per_page)),
+                last=False,  # not last; complex evidence pages follow
+            )
+
+        ev_page = 1 + len(ranked_per_page)
+        for ev_rank, (cname, (cscore, crow)) in enumerate(complex_evidence, start=1):
+            ev_page += 1
+            source_dir = str(crow.get("source_dir") or "")
+            model_label = str(crow.get("model_used") or "")
+            pae_path = None
+            if source_dir:
+                pae_path = _find_pae_png(Path(source_dir), model_label)
+            _complex_evidence_page(
+                pdf,
+                title=_REPORT_TITLE,
+                entry_id=_truncate(cname, 40),
+                section_no=f"{ev_rank}",
+                row=crow,
+                pae_path=pae_path,
+                model_label=model_label,
+                page_no=ev_page,
+                total=total,
+                last=(ev_rank == len(complex_evidence)),
+                complex_label=cname,
             )
 
     logger.info("wrote %s", out_pdf)
diff --git a/src/alphajudge/runner.py b/src/alphajudge/runner.py
index 65087173..dae92731 100644
--- a/src/alphajudge/runner.py
+++ b/src/alphajudge/runner.py
@@ -218,6 +218,15 @@ def _process_one_run(
 
     want_summary = summary_csv is not None
 
+    source_dir = str(d.resolve())
+
+    def _stamp(rows: list[dict]) -> list[dict]:
+        """Add an absolute ``source_dir`` to every row so the aggregate report
+        can locate per-run side files (PAE PNGs, etc.) from the summary CSV."""
+        for r in rows:
+            r.setdefault("source_dir", source_dir)
+        return rows
+
     # When building a summary, prefer reusing precomputed interfaces.csv
     if want_summary and existing_csv.exists() and not force_recompute:
         try:
@@ -226,7 +235,7 @@ def _process_one_run(
                 logger.info(f"reused existing {existing_csv} for aggregation")
                 if write_per_run_report:
                     _safe_write_per_run_report(d, csv_name=per_run_csv_name)
-                return (d_str, rows)
+                return (d_str, _stamp(rows))
             logger.info(f"existing {existing_csv} is empty; recomputing")
         except Exception as e:
             logger.warning(f"could not reuse {existing_csv}; recomputing: {e}")
@@ -254,7 +263,7 @@ def _process_one_run(
 
     if want_summary and out_path is not None:
         try:
-            return (d_str, _read_csv_rows(Path(out_path)))
+            return (d_str, _stamp(_read_csv_rows(Path(out_path))))
         except Exception as e:
             logger.error(f"failed reading {out_path} for aggregation: {e}")
 
diff --git a/test/test_report.py b/test/test_report.py
index 8015100d..6100923f 100644
--- a/test/test_report.py
+++ b/test/test_report.py
@@ -100,8 +100,9 @@ def test_aggregate_report_writes_cover_plus_one_page_per_interface(tmp_path: Pat
     result = generate_aggregate_report(summary, out_pdf=out)
     assert result == out
     assert out.exists() and out.stat().st_size > 0
-    # cover + one page per scorable interface row (3 here)
-    assert _pdf_page_count(out) == 4
+    # cover + one page per scorable interface (3) + one complex-evidence
+    # page per unique complex (2 unique complexes in this fixture).
+    assert _pdf_page_count(out) == 6
 
 
 def test_aggregate_report_handles_missing_meta_score_via_recompute(tmp_path: Path) -> None:
@@ -119,4 +120,5 @@ def test_aggregate_report_handles_missing_meta_score_via_recompute(tmp_path: Pat
     result = generate_aggregate_report(summary, out_pdf=out)
     assert result is not None
     assert out.exists()
-    assert _pdf_page_count(out) == 3  # cover + 2 interfaces
+    # cover + 2 interface pages + 2 complex-evidence pages
+    assert _pdf_page_count(out) == 5