From 57edcb5e1b8e7ca586a0b75715703467e8b8a462 Mon Sep 17 00:00:00 2001
From: Josh Mabry <31560031+mabry1985@users.noreply.github.com>
Date: Fri, 5 Jun 2026 20:42:13 -0700
Subject: [PATCH 01/20] docs: tighten for server/ package + config-driven
 identity (post #569-#572) (#579)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Sweep the docs + README for staleness left by the ADR-0023 server/ decomposition
and the #570-#572 operator-fork seams:

- New config-reference sections: a2a (skills + description, #570) and security
  (callback_allowlist, #572); plugins guide gains register_a2a_skill +
  register_thread_id_resolver.
- agent-card reference + add-a-skill guide + TEMPLATE.md rewritten to the
  config-driven card (declare a2a.skills / register_a2a_skill — don't edit
  server/a2a.py). Fixed the add-a-skill test example (was importing the gone
  _SKILL_SPECS).
- server.py -> server/ package across every living guide/reference (module paths
  to server/agent_init.py / server/chat.py / server/a2a.py); fixed the BROKEN
  'python server.py' launch command in TEMPLATE.md -> 'python -m server'.
- a2a_handler.py (deleted in the A2A 1.0 migration) re-pointed to its real homes:
  a2a_auth.py (bearer/token), a2a_stores.py (webhook SSRF), a2a_executor.py
  (worldstate-delta), server/chat.py (caller-trace); diagram boxes relabeled to
  the conceptual 'A2A handler'.

Docs build green; no code touched.

Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
---
 README.md                          | 20 ++++++------
 TEMPLATE.md                        | 39 +++++++++++++++---------
 docs/explanation/a2a-protocol.md   |  2 +-
 docs/explanation/architecture.md   | 12 ++++----
 docs/explanation/cost-and-trace.md |  2 +-
 docs/guides/add-a-skill.md         | 49 ++++++++++++------------------
 docs/guides/discord.md             |  2 +-
 docs/guides/fork-the-template.md   |  2 +-
 docs/guides/goal-mode.md           |  2 +-
 docs/guides/plugins.md             |  8 +++--
 docs/guides/react-tauri-ui.md      |  4 +--
 docs/guides/scheduler.md           |  2 +-
 docs/guides/subagents.md           |  2 +-
 docs/reference/agent-card.md       | 15 ++++-----
 docs/reference/configuration.md    | 38 +++++++++++++++++++++++
 docs/reference/extensions.md       | 14 ++++-----
 docs/reference/starter-tools.md    |  4 +--
 17 files changed, 130 insertions(+), 87 deletions(-)

diff --git a/README.md b/README.md
index f9a5710..d3a0323 100644
--- a/README.md
+++ b/README.md
@@ -31,8 +31,8 @@ rename / release-pipeline wiring.
 
 | Concern | Where it lives | What it does |
 |---|---|---|
-| A2A server | `a2a_handler.py` | JSON-RPC 2.0 over `/a2a`, SSE streaming, `tasks/*` lifecycle, push notifications, well-known agent card, dual token-shape parsing |
-| Agent runtime | `graph/agent.py`, `server.py` | LangGraph `create_agent()` wired to the A2A handler, with streaming token capture for cost-v1 |
+| A2A server | `server/a2a.py`, `a2a_executor.py` | JSON-RPC 2.0 over `/a2a`, SSE streaming, `tasks/*` lifecycle, push notifications, well-known agent card, dual token-shape parsing |
+| Agent runtime | `graph/agent.py`, `server/` | LangGraph `create_agent()` wired to the A2A handler, with streaming token capture for cost-v1 |
 | LLM gateway | `graph/llm.py` | OpenAI-compatible client pointed at LiteLLM — swap models by editing the gateway config, not the fork |
 | Subagents | `graph/subagents/config.py` | DeerFlow-pattern delegation via a `task()` tool; one worked example ships — a `researcher` (web + memory, plan→search→synthesize→cite) |
 | Starter tools | `tools/lg_tools.py`, `tools/github_tools.py` | Default-on set: 4 keyless general (`current_time`, `calculator` safe AST eval, `web_search` via DuckDuckGo, `fetch_url`) + 2 HITL (`ask_human`, `request_user_input`) + 4 GitHub read tools over the `gh` CLI + 4 notes + 5 memory + 3 scheduler + 4 beads + inbox/peer (conditional). Drop any via `tools.disabled`; add via a plugin. See [Starter tools](./docs/reference/starter-tools.md) |
@@ -76,7 +76,7 @@ own GHCR: [Customize & deploy](./docs/guides/customize-and-deploy.md).
 
 ```
 ┌──────────────┐     A2A JSON-RPC + SSE      ┌─────────────────┐
-│   Consumer   │ ──────────────────────────▶ │  a2a_handler    │
+│   Consumer   │ ──────────────────────────▶ │  A2A handler    │
 │  (any A2A    │                             │  (FastAPI)      │
 │   client)    │ ◀──── cost-v1 DataPart ─────│                 │
 └──────────────┘                             └────────┬────────┘
@@ -108,9 +108,9 @@ subagent `task()` delegation, and the structured-output protocol.
 | `a2a.trace` propagation | No (it's a protocol convention, not a card extension) | Yes — reads caller's Langfuse trace context from `params.metadata["a2a.trace"]` and nests this agent's trace under it |
 
 Declare additional extensions on the card in
-`server.py::_build_agent_card` when your agent's skills actually
-mutate shared state (see `effect-domain-v1` in the Workstacean
-docs for when this applies).
+`server/a2a.py::_build_agent_card_proto` when your agent's skills
+actually mutate shared state (see `effect-domain-v1` in the
+Workstacean docs for when this applies).
 
 ## Push notification support
 
@@ -130,7 +130,7 @@ The A2A handler supports both token shapes the spec permits:
 Both produce `Authorization: Bearer shared-secret` on outgoing
 webhooks. If your fork is getting 401s on callbacks, check which
 shape the consumer is sending before changing anything —
-`_extract_push_token` in `a2a_handler.py` reads both and the
+the dual-token parser in `a2a_auth.py` reads both and the
 test suite covers both.
 
 ## Observability
@@ -212,6 +212,6 @@ complete end-to-end example and cron setup.
 ## Contributing
 
 This is a template repo — bugs and improvements to the shared
-runtime (`a2a_handler.py`, `graph/agent.py`, extension support,
-release pipeline) land here. Domain-specific agent logic lives
-in the fork, not here.
+runtime (the `server/` package, `graph/agent.py`, extension
+support, release pipeline) land here. Domain-specific agent logic
+lives in the fork, not here.
diff --git a/TEMPLATE.md b/TEMPLATE.md
index 07dc324..5434295 100644
--- a/TEMPLATE.md
+++ b/TEMPLATE.md
@@ -120,20 +120,29 @@ its own recursion budget.
 
 If your agent doesn't need the subagent pattern at all, delete
 the registry entry and call `create_agent_graph(config,
-include_subagents=False)` in `server.py`.
-
-## 6. Rewrite the agent card
-
-`server.py::_build_agent_card` has a placeholder card. Replace:
+include_subagents=False)` in `server/agent_init.py`.
+
+## 6. Declare the agent card
+
+Don't edit the card builder — its identity is config-driven
+(#570). Declare your `description` + `skills` in the `a2a:`
+section of `config/langgraph-config.yaml` (or contribute skills
+from a plugin via `register_a2a_skill`):
+
+```yaml
+a2a:
+  description: "What your agent does, in one line."
+  skills:
+    - id: my_skill        # what A2A callers dispatch to
+      name: My Skill
+      description: ...
+```
 
-- `name` and `description` with the agent's real surface
-- `skills` — each skill is what A2A callers dispatch to. IDs
-  should match what your tools can actually accomplish.
-- `capabilities.extensions` — declare any A2A extensions your
-  agent implements. `cost-v1` is declared by default because
-  the runtime emits it automatically; add `effect-domain-v1`
-  if your skills mutate shared state you want Workstacean's
-  planner to be aware of.
+- `name` already follows `identity.name` (the setup wizard).
+- `capabilities.extensions` — `cost-v1` is declared by default
+  (the runtime emits it automatically); add `effect-domain-v1`
+  in `server/a2a.py::_build_agent_card_proto` if your skills
+  mutate shared state Workstacean's planner should know about.
 
 ## 7. Set up the model
 
@@ -197,12 +206,12 @@ sqlite poller or a Workstacean adapter, selected at startup via env:
 
 ```bash
 # Default: local sqlite, persists at /sandbox/scheduler/<agent_name>/jobs.db
-python server.py
+python -m server
 
 # Workstacean: set both and restart
 export WORKSTACEAN_API_BASE=http://your-workstacean:3000
 export WORKSTACEAN_API_KEY=...
-python server.py
+python -m server
 ```
 
 Multi-fork safety: every job is namespaced by `AGENT_NAME`, so
diff --git a/docs/explanation/a2a-protocol.md b/docs/explanation/a2a-protocol.md
index 0cadfba..eead9af 100644
--- a/docs/explanation/a2a-protocol.md
+++ b/docs/explanation/a2a-protocol.md
@@ -72,7 +72,7 @@ A webhook URL is an outbound HTTP call this agent makes with a shared secret att
 
 ...the agent would happily POST task payloads (potentially with `Authorization: Bearer <secret>`) to any of them.
 
-`_is_safe_webhook_url` in `a2a_handler.py` resolves the URL's hostname once and rejects anything that lands in a private range. It's not a full DNS-rebinding defence, but it closes the "just use an RFC1918 literal" vector. Operator allowlists (`PUSH_NOTIFICATION_ALLOWED_HOSTS`) bypass the check for trusted docker-network targets that would otherwise fail.
+`is_safe_webhook_url` in `a2a_stores.py` resolves the URL's hostname once and rejects anything that lands in a private range. It's not a full DNS-rebinding defence, but it closes the "just use an RFC1918 literal" vector. Operator allowlists (`PUSH_NOTIFICATION_ALLOWED_HOSTS`) bypass the check for trusted docker-network targets that would otherwise fail.
 
 ## Task lifecycle
 
diff --git a/docs/explanation/architecture.md b/docs/explanation/architecture.md
index 8b7f671..5b8d691 100644
--- a/docs/explanation/architecture.md
+++ b/docs/explanation/architecture.md
@@ -4,14 +4,14 @@
 
 ```
 ┌──────────────┐     A2A JSON-RPC + SSE      ┌──────────────────┐
-│   Consumer   │ ──────────────────────────▶ │  a2a_handler.py  │
+│   Consumer   │ ──────────────────────────▶ │  A2A handler     │
 │  (any A2A    │                             │  (FastAPI app)   │
 │   client)    │ ◀──── cost-v1 DataPart ─────│                  │
 └──────────────┘                             └────────┬─────────┘
                                                       │ submits message
                                                       ▼
                                             ┌──────────────────┐
-                                            │  server.py       │
+                                            │  server/chat.py  │
                                             │  _chat_langgraph │
                                             │  _stream         │
                                             └────────┬─────────┘
@@ -47,7 +47,7 @@ A2A is a protocol, not a library. The handler owns:
 The LangGraph runtime has no idea any of this exists. It sees a message, runs a tool loop, produces output. That means:
 
 - If LangGraph's API changes, the A2A handler doesn't break.
-- If A2A's spec changes, only this file changes.
+- If A2A's spec changes, only this layer changes (the `server/a2a.py` + `a2a_executor.py` + `a2a_stores.py` modules).
 - Tests for the protocol are isolated from tests for the agent.
 
 ## Why LangGraph owns the tool loop
@@ -82,7 +82,7 @@ Memory is **enabled by default** (`middleware.memory: true` in `langgraph-config
 
 Three independent layers defend the A2A surface. Each can be enabled or left open for local dev, but production forks should enable all three.
 
-**Bearer authentication** — `a2a_handler.py` reads `A2A_AUTH_TOKEN` at startup. When set, every A2A route (`/a2a`, `message/send`, `tasks/*`, and SSE streaming endpoints) requires `Authorization: Bearer <token>`. Comparison uses `hmac.compare_digest` so timing analysis can't leak the token. When set, the agent card advertises `securitySchemes.bearer` so consumers know to present credentials.
+**Bearer authentication** — `a2a_auth.py` reads `A2A_AUTH_TOKEN` at startup. When set, every A2A route (`/a2a`, `message/send`, `tasks/*`, and SSE streaming endpoints) requires `Authorization: Bearer <token>`. Comparison uses `hmac.compare_digest` so timing analysis can't leak the token. When set, the agent card advertises `securitySchemes.bearer` so consumers know to present credentials.
 
 **Audit redaction** — `graph/middleware/redaction.py` scrubs credentials before anything is written to `audit.jsonl` or emitted as a Langfuse span attribute. Patterns covered: `Authorization: Bearer ...`, OpenAI-style `sk-...` keys, generic `api_key=...` forms, and nested dicts keyed by well-known env var names (`OPENAI_API_KEY`, `LANGFUSE_SECRET_KEY`, `A2A_AUTH_TOKEN`, etc.). This closes the class of bugs where a tool returns a secret in its payload and it leaks into the audit trail or trace.
 
@@ -115,7 +115,7 @@ Beyond the shipped tools, three opt-in seams add capability to a *running* agent
 
 - **Tools enter via one list.** `create_agent_graph` assembles `get_all_tools()` (built-in) plus an `extra_tools` argument, then hands the combined set to the LangGraph loop. Both external sources below feed `extra_tools`, so they're indistinguishable to the model and inherit the same Audit/Langfuse middleware.
 - **MCP** (`tools/mcp_tools.py`) — configured [Model Context Protocol](/guides/mcp) servers (stdio / streamable-HTTP) are connected via `langchain-mcp-adapters`; their tools are discovered at graph-build time, namespaced `<server>__<tool>`, and appended to `extra_tools`. The client is stateless (a fresh session per call), so discovery happens once and tools are event-loop-agnostic.
-- **Plugins** (`graph/plugins/`: `loader`, `registry`, `manifest`, `host`, `pconfig`) — drop-in packages (`protoagent.plugin.yaml` + `register(registry)`) that contribute, via the registry, **tools** (→ `extra_tools`), bundled **`SKILL.md`** dirs (→ the skill index), FastAPI **routers** (mounted under `/plugins/<id>`), background **surfaces** (lifecycle-managed ingress like the Discord gateway), **subagents** (→ `SUBAGENT_REGISTRY`), and managed **MCP servers** (→ `mcp.servers[]` factory), plus their own **config / secrets / Settings** claimed as a top-level YAML section (ADR 0018/0019). A surface/route reaches the agent + event bus + live config via the plugin **host** (`registry.host`: `invoke` / `publish` / `subscribe` / `config` / `apply_settings`). They run **in-process** with the agent's privileges, so a third-party plugin is disabled by default. The first-party **Discord** ingress (`plugins/discord`, a surface + route + tools) and **Google** Gmail/Calendar integration (`plugins/google`, a managed MCP server + route) ship this way — they are **not** wired in `server.py`. See [Plugins](/guides/plugins).
+- **Plugins** (`graph/plugins/`: `loader`, `registry`, `manifest`, `host`, `pconfig`) — drop-in packages (`protoagent.plugin.yaml` + `register(registry)`) that contribute, via the registry, **tools** (→ `extra_tools`), bundled **`SKILL.md`** dirs (→ the skill index), FastAPI **routers** (mounted under `/plugins/<id>`), background **surfaces** (lifecycle-managed ingress like the Discord gateway), **subagents** (→ `SUBAGENT_REGISTRY`), and managed **MCP servers** (→ `mcp.servers[]` factory), plus their own **config / secrets / Settings** claimed as a top-level YAML section (ADR 0018/0019). A surface/route reaches the agent + event bus + live config via the plugin **host** (`registry.host`: `invoke` / `publish` / `subscribe` / `config` / `apply_settings`). They run **in-process** with the agent's privileges, so a third-party plugin is disabled by default. The first-party **Discord** ingress (`plugins/discord`, a surface + route + tools) and **Google** Gmail/Calendar integration (`plugins/google`, a managed MCP server + route) ship this way — they are **not** wired into the core `server/` package. See [Plugins](/guides/plugins).
 
 All are surfaced in `GET /api/runtime/status` (`skills`, `mcp`, `plugins` — with per-plugin route/surface/subagent counts) and load best-effort — a bad skill/server/plugin is logged and skipped, never fatal. Untrusted third-party tools belong on MCP (out-of-process) rather than in-process plugins.
 
@@ -125,7 +125,7 @@ See [LiteLLM gateway](/explanation/litellm-gateway) for the full rationale. The
 
 ## Why streaming specifically this way
 
-`_chat_langgraph_stream` in `server.py` consumes `astream_events(v2)` and yields structured frames: `tool_start`, `tool_end`, `usage`, `done`. The A2A handler then translates those into A2A SSE frames.
+`_chat_langgraph_stream` in `server/chat.py` consumes `astream_events(v2)` and yields structured frames: `tool_start`, `tool_end`, `usage`, `done`. The A2A executor (`a2a_executor.py`) then translates those into A2A SSE frames.
 
 This extra layer of indirection exists because:
 
diff --git a/docs/explanation/cost-and-trace.md b/docs/explanation/cost-and-trace.md
index 1d20dcc..b2cef77 100644
--- a/docs/explanation/cost-and-trace.md
+++ b/docs/explanation/cost-and-trace.md
@@ -75,7 +75,7 @@ Agent A stamps its current Langfuse context into the outbound A2A request:
 }
 ```
 
-Agent B reads this in `a2a_handler.py`, opens its own trace, and stamps `caller_trace_id=abc123`, `caller_span_id=def456` into that trace's metadata.
+Agent B reads this in `server/chat.py`, opens its own trace, and stamps `caller_trace_id=abc123`, `caller_span_id=def456` into that trace's metadata.
 
 An operator looking at Agent A's Langfuse trace can copy the trace ID and search Agent B (and C, D, …) by `metadata.caller_trace_id == abc123` to find every downstream trace that branched off.
 
diff --git a/docs/guides/add-a-skill.md b/docs/guides/add-a-skill.md
index 935f65f..27cb38d 100644
--- a/docs/guides/add-a-skill.md
+++ b/docs/guides/add-a-skill.md
@@ -10,32 +10,23 @@ A *skill* is a named capability A2A callers can dispatch to. Skills live on the
 
 You don't need a skill for "things the chat UI does" — the Gradio chat is already covered by the default dispatch path.
 
-## 1. Add to the card
-
-Edit the `_SKILL_SPECS` list in `server.py` (the card builder `_build_agent_card_proto` turns it into the card's `skills` via `_agent_skills()`). Each spec is a dict — this is where entries go:
-
-```python
-"skills": [
-    {
-        "id": "chat",
-        "name": "Chat",
-        "description": "General-purpose chat interface.",
-        "tags": ["template"],
-        "examples": ["hello", "what can you do?"],
-    },
-    # ↓ new skill
-    {
-        "id": "summarize_pr",
-        "name": "Summarize Pull Request",
-        "description": "Fetch a GitHub PR and return a three-bullet summary of what it changes and why.",
-        "tags": ["github", "summarization"],
-        "examples": [
-            "summarize https://github.com/protoLabsAI/protoAgent/pull/64",
-        ],
-    },
-],
+## 1. Declare it in config
+
+Card skills are config-driven ([#570](/reference/configuration#a2a)) — declare them in the `a2a:` section of `config/langgraph-config.yaml`, **not** by editing `server/a2a.py`. Each entry is a spec:
+
+```yaml
+a2a:
+  skills:
+    - id: summarize_pr
+      name: Summarize Pull Request
+      description: Fetch a GitHub PR and return a three-bullet summary of what it changes and why.
+      tags: [github, summarization]
+      examples:
+        - "summarize https://github.com/protoLabsAI/protoAgent/pull/64"
 ```
 
+The server merges these (plus any contributed by plugins via `register_a2a_skill`) and falls back to the template's `chat` placeholder when none are set. A plugin can ship card skills the same way — `registry.register_a2a_skill(spec)` — so a distributable extension carries its own advertised capabilities.
+
 **IDs are sticky**. `cost-v1` samples, `effect-domain-v1` declarations, and Workstacean's routing all key off the `id`. Pick one and don't rename later.
 
 ## 2. (Optional) Declare an effect
@@ -114,13 +105,13 @@ curl -X POST http://localhost:7870/a2a \
 
 ## 5. Test it
 
-Add to `tests/test_a2a_integration.py`:
+Assert your config declares the skill (it flows to the card via `_resolved_skill_specs`):
 
 ```python
-def test_agent_card_includes_summarize_pr_skill():
-    from server import _SKILL_SPECS
-    skill_ids = {s["id"] for s in _SKILL_SPECS}
-    assert "summarize_pr" in skill_ids
+def test_config_advertises_summarize_pr():
+    from graph.config import LangGraphConfig
+    cfg = LangGraphConfig.from_yaml("config/langgraph-config.yaml")
+    assert "summarize_pr" in {s["id"] for s in cfg.a2a_skills}
 ```
 
 ## Related
diff --git a/docs/guides/discord.md b/docs/guides/discord.md
index db11d20..1f2c91f 100644
--- a/docs/guides/discord.md
+++ b/docs/guides/discord.md
@@ -85,7 +85,7 @@ GUILD_MESSAGE_REACTIONS | DIRECT_MESSAGES | MESSAGE_CONTENT`.
 Discord ships as a **first-party plugin** (`plugins/discord/`, ADR 0018/0019) —
 the gateway, the `test-discord` route, the outbound tools, and this `discord`
 config section are all declared by `plugins/discord/protoagent.plugin.yaml`, not
-wired in `server.py`. Disable it entirely with `plugins: { disabled: [discord] }`,
+wired into the core `server/` package. Disable it entirely with `plugins: { disabled: [discord] }`,
 or replace it with your own ingress plugin — no core edit. See [Plugins](./plugins.md).
 
 The token, admin list, and on/off toggle are set **in the app** (Settings →
diff --git a/docs/guides/fork-the-template.md b/docs/guides/fork-the-template.md
index 400ee39..96f85e1 100644
--- a/docs/guides/fork-the-template.md
+++ b/docs/guides/fork-the-template.md
@@ -67,7 +67,7 @@ See the [starter tools reference](/reference/starter-tools) for the shapes of th
 `graph/subagents/config.py` ships with one `researcher`. Either:
 
 - Add more by registering `SubagentConfig` instances in `SUBAGENT_REGISTRY` and matching fields in `graph/config.py::LangGraphConfig`, or
-- Call `create_agent_graph(config, include_subagents=False)` in `server.py::_init_langgraph_agent()` to skip subagents entirely.
+- Call `create_agent_graph(config, include_subagents=False)` in `server/agent_init.py::_init_langgraph_agent()` to skip subagents entirely.
 
 See [Configure subagents](/guides/subagents) for the full pattern.
 
diff --git a/docs/guides/goal-mode.md b/docs/guides/goal-mode.md
index b662c00..84fa1c6 100644
--- a/docs/guides/goal-mode.md
+++ b/docs/guides/goal-mode.md
@@ -19,7 +19,7 @@ It's modelled on protocli's goal system but deliberately more rigorous for a lon
 4. **Not met** → the controller extracts/refreshes the agent's `<goal_plan>` checklist, then re-invokes the agent on the same thread (history preserved) with a continuation prompt that includes the verifier's reason + evidence and the current plan.
 5. This repeats until met, the **iteration budget** (`goal.max_iterations`) is spent (`exhausted`), the verifier returns the **same evidence too many times** (`goal.no_progress_limit` → `unachievable`), or the agent itself emits `<goal_unachievable reason="…"/>` (`unachievable`).
 
-The loop wraps graph invocation in `server.py` (both the A2A streaming path and the non-streaming chat path); the graph itself is unchanged.
+The loop wraps graph invocation in `server/chat.py` (both the A2A streaming path and the non-streaming chat path); the graph itself is unchanged.
 
 ## Setting a goal
 
diff --git a/docs/guides/plugins.md b/docs/guides/plugins.md
index 8b812c5..ed38a86 100644
--- a/docs/guides/plugins.md
+++ b/docs/guides/plugins.md
@@ -59,24 +59,28 @@ def register(registry):
 ```
 
 `register` is called once at load. The registry accepts **six** contribution
-types — a fork adds any of them as a plugin, never editing core `server.py`:
+types — a fork adds any of them as a plugin, never editing the core `server/` package:
 
 | Method | Contributes | Lifecycle |
 |---|---|---|
 | `register_tool(tool)` / `register_tools(iter)` | A LangChain tool | graph build (live-reloads) |
-| `register_skill_dir(path)` | A `SKILL.md` directory | graph build |
+| `register_skill_dir(path)` | A `SKILL.md` directory (procedural memory) | graph build |
+| `register_a2a_skill(spec)` | An A2A **card** skill (what the card advertises; optional structured output) | agent-card build |
 | `register_router(router, prefix=None)` | A FastAPI `APIRouter` | **mounted once** at init (default prefix `/plugins/<id>`) |
 | `register_surface(start, stop=None, name=None, reload=None)` | A background surface (a Discord-style gateway) | `start` in startup, `stop` in shutdown, `reload(cfg)` on config save |
 | `register_subagent(config)` | A `SubagentConfig` (a delegate) | added to `SUBAGENT_REGISTRY` |
 | `register_mcp_server(factory)` | A **managed MCP server** the agent connects to | `factory(config)` called at each graph build → entry dict or `None` |
+| `register_thread_id_resolver(fn)` | A `(request_metadata, session_id) → str` checkpointer-scope resolver (e.g. per-project memory) | each turn; one wins (last plugin) |
 
 ```python
 def register(registry):
     registry.register_tool(hello)
+    registry.register_a2a_skill({"id": "greet", "name": "Greet", "description": "..."})
     registry.register_router(_build_router())        # → GET /plugins/<id>/...
     registry.register_surface(_start, stop=_stop, name="my-surface")
     registry.register_subagent(_build_subagent())    # delegate via task/task_batch
     registry.register_mcp_server(_server_factory)    # a managed MCP server (e.g. Google)
+    registry.register_thread_id_resolver(lambda md, sid: f"proj:{md.get('project')}:{sid}")
 ```
 
 ### Managed MCP servers — `register_mcp_server`
diff --git a/docs/guides/react-tauri-ui.md b/docs/guides/react-tauri-ui.md
index ec0ae17..d835e29 100644
--- a/docs/guides/react-tauri-ui.md
+++ b/docs/guides/react-tauri-ui.md
@@ -100,7 +100,7 @@ apps/web/                     React + Vite operator console
   src/subagents/              manual subagent launcher
   src/lib/api.ts              FastAPI/A2A client
 
-server.py                     FastAPI API + static React asset serving
+server/                       FastAPI API + static React asset serving
 graph/agent.py                LangGraph lead + subagent runtime
 src-tauri/                    Tauri shell after web app works
 ```
@@ -117,7 +117,7 @@ npm run web:build
 npm run web:preview
 ```
 
-The built app lives under `apps/web/dist/`. `server.py` serves it at `/app`
+The built app lives under `apps/web/dist/`. the `server/` package serves it at `/app`
 when that directory contains `index.html`; otherwise the server boots without
 mounting the React surface.
 
diff --git a/docs/guides/scheduler.md b/docs/guides/scheduler.md
index 4a687a6..b4818a4 100644
--- a/docs/guides/scheduler.md
+++ b/docs/guides/scheduler.md
@@ -32,7 +32,7 @@ not "do that thing we discussed").
 
 ## Backend selection
 
-`server.py::_build_scheduler` picks at startup:
+`server/agent_init.py::_build_scheduler` picks at startup:
 
 1. `middleware.scheduler: false` in YAML → no scheduler. The three
    tools don't ship. (Symmetric with `middleware.knowledge` /
diff --git a/docs/guides/subagents.md b/docs/guides/subagents.md
index 8226e0c..53a2d03 100644
--- a/docs/guides/subagents.md
+++ b/docs/guides/subagents.md
@@ -130,7 +130,7 @@ question. Handle short factual queries yourself.
 
 ## 5. Turn subagents off entirely
 
-If your agent is simple enough that subagents are pure overhead, flip `include_subagents=False` when the graph is built. In `server.py::_init_langgraph_agent`:
+If your agent is simple enough that subagents are pure overhead, flip `include_subagents=False` when the graph is built. In `server/agent_init.py::_init_langgraph_agent`:
 
 ```python
 _graph = create_agent_graph(
diff --git a/docs/reference/agent-card.md b/docs/reference/agent-card.md
index 5d63aee..cef1c67 100644
--- a/docs/reference/agent-card.md
+++ b/docs/reference/agent-card.md
@@ -1,6 +1,6 @@
 # Agent card
 
-Served at `/.well-known/agent-card.json` and `/.well-known/agent.json`. Built by `server.py::_build_agent_card_proto` (which assembles it via `protolabs_a2a.build_agent_card`; skills come from `_SKILL_SPECS` / `_agent_skills()`).
+Served at `/.well-known/agent-card.json` and `/.well-known/agent.json`. Built by `server/a2a.py::_build_agent_card_proto` (which assembles it via `protolabs_a2a.build_agent_card`). Its identity is **config/plugin-driven** ([#570](configuration.md#a2a)) — `name` from `identity.name`, `description` + `skills` from the `a2a:` config section or `register_a2a_skill`, so a fork declares its card without editing the package.
 
 ## Full shape
 
@@ -57,8 +57,9 @@ The card is the **A2A 1.0** (`a2a-sdk` proto) shape, assembled by
 
 > The `provider` block, the four `capabilities.extensions`, and the
 > `securitySchemes` / `securityRequirements` shapes are owned by
-> `protolabs_a2a` (not editable per-fork in `server.py`). Fork the `name`,
-> `description`, `version`, and `skills`.
+> `protolabs_a2a` (not editable per-fork). Customize `name` (`identity.name`),
+> `description` + `skills` (the [`a2a:`](configuration.md#a2a) config section or
+> `register_a2a_skill`), and `version` (your `pyproject`).
 
 ## Field reference
 
@@ -133,12 +134,12 @@ card *also* declares a `bearer` scheme (`{"httpAuthSecurityScheme": {"scheme":
 consumer reading the card learns bearer is accepted, not just `apiKey`. (Both
 shapes come from `protolabs_a2a.security_schemes(bearer=…)`.)
 
-## Fork this file
+## Customize (no core edit)
 
-The card lives in `server.py::_build_agent_card_proto`. The template declares four custom extensions by default — **cost** / **confidence** / **worldstate-delta** / **tool-call** (the URIs come from `protolabs_a2a`; see [Extensions](/reference/extensions)). At a minimum, every fork should replace:
+The card is assembled in `server/a2a.py::_build_agent_card_proto`, but you **don't edit it** — identity is config/plugin-driven ([#570](configuration.md#a2a)). The template declares four custom extensions by default — **cost** / **confidence** / **worldstate-delta** / **tool-call** (the URIs come from `protolabs_a2a`; see [Extensions](/reference/extensions)). At a minimum, every fork sets:
 
-- `name` + `description`
-- `skills` (sourced from `_SKILL_SPECS` / `_agent_skills()` — add your real ones)
+- `name` → `identity.name` (the setup wizard sets it)
+- `description` + `skills` → the [`a2a:`](configuration.md#a2a) config section (or a plugin's `register_a2a_skill`)
 
 ## Related
 
diff --git a/docs/reference/configuration.md b/docs/reference/configuration.md
index ad22da5..3f7ccca 100644
--- a/docs/reference/configuration.md
+++ b/docs/reference/configuration.md
@@ -281,6 +281,21 @@ egress:
 
 Covers `fetch_url` only; `execute_code`/`run_command` process-level egress is fenced by running under OpenShell (see [Sandboxing & egress](../guides/sandboxing.md)).
 
+## `security`
+
+Opt-in CIDR allowlist for the outbound A2A destinations the agent POSTs to — push-notification **callbacks** (caller-supplied webhook URLs) and **`peer_consult`** (`PEER_<HANDLE>_URL`). Empty/unset = today's behavior: callbacks keep their default private-IP denylist (`a2a_stores`), `peer_consult` is unrestricted.
+
+```yaml
+security:
+  callback_allowlist:
+    - 100.64.0.0/10   # tailnet
+    - 10.0.0.0/8      # private fleet
+```
+
+| Key | Default | What |
+|---|---|---|
+| `callback_allowlist` | `[]` | CIDRs an outbound callback / peer destination may resolve into. **Empty = off.** When set it becomes the policy: a destination is allowed iff **every** resolved IP is inside a listed range (overrides the default callback denylist, so you can permit a specific internal/tailnet range; everything else is rejected). Hot-reloads. |
+
 ## `routing`
 
 Wires langchain's `ModelFallbackMiddleware`: on a primary-model error, retry on each fallback model (same gateway) in order. Opt-in (empty = no fallback). `aux_model` is a separate, optional cheap/fast alias for non-reasoning calls.
@@ -356,6 +371,29 @@ Human-authored skills in the AgentSkills [`SKILL.md`](../guides/skills.md) forma
 
 Skills load from two roots — bundled (`config/skills/`, shipped) and writable (`<config-dir>/skills/`, your drop-ins); live skills override bundled ones by `name`. `GET /api/runtime/status` reports `skills.count`. See the [Skills guide](../guides/skills.md) for authoring.
 
+## `a2a`
+
+Your fork's **A2A [agent card](agent-card.md) identity** — the advertised skills and description a caller sees. Declare them here (or contribute card skills from a plugin via `register_a2a_skill`) instead of editing `server/a2a.py`. Distinct from `skills` above: those are disk `SKILL.md` *procedural memory* retrieved at inference; these are what the *card* advertises. Omit both keys and the template ships one free-text `chat` placeholder so a fresh clone stays callable. The card `name` follows `identity.name` / `AGENT_NAME`.
+
+```yaml
+a2a:
+  description: "Acme Bot — turns support tickets into triaged, drafted replies."
+  skills:
+    - id: triage_ticket
+      name: Triage Ticket
+      description: Classify a support ticket and draft a reply.
+      tags: [support]
+      examples: ["triage ticket #1234"]
+      # Optional structured output — enforced + emitted as a typed DataPart (#476):
+      # result_mime: application/vnd.protolabs.triage-v1+json
+      # output_schema: { type: object, properties: { ... }, required: [ ... ] }
+```
+
+| Key | Default | What |
+|---|---|---|
+| `description` | template placeholder | The agent card's `description`. |
+| `skills` | one `chat` placeholder | Advertised `AgentSkill`s (`id`/`name`/`description` + optional `tags`/`examples`). A skill declaring `result_mime` + `output_schema` returns schema-enforced structured output as a typed DataPart ([#476](agent-card.md)); the MIME is advertised in its `output_modes`. |
+
 ## `mcp`
 
 Connect external [Model Context Protocol](../guides/mcp.md) servers; their tools become agent tools (namespaced `<server>__<tool>`). Off by default — adding a server is the opt-in. Built on `langchain-mcp-adapters`.
diff --git a/docs/reference/extensions.md b/docs/reference/extensions.md
index 330225c..f307178 100644
--- a/docs/reference/extensions.md
+++ b/docs/reference/extensions.md
@@ -67,7 +67,7 @@ Declares the **scope of effect** of each skill so a consumer can gate higher-imp
 }
 ```
 
-Purely declarative — keep the declared radius honest with what each skill handler actually does. Uncomment the stanza in `server.py::_build_agent_card` and use your real skill IDs.
+Purely declarative — keep the declared radius honest with what each skill handler actually does. Uncomment the stanza in `server/a2a.py::_build_agent_card_proto` and use your real skill IDs.
 
 ## `hitl-mode-v1`
 
@@ -121,7 +121,7 @@ Fields:
 
 Only declare effects that actually mutate shared state. Over-declaring confuses the planner into routing your agent for goals it can't move.
 
-**Pair with runtime emission**: if you declare an effect, emit a matching `worldstate-delta-v1` DataPart when the tool succeeds at runtime (see `a2a_handler.py::TaskRecord.world_deltas`). Divergence between declared and observed mutations breaks the planner's scoring model.
+**Pair with runtime emission**: if you declare an effect, emit a matching `worldstate-delta-v1` DataPart when the tool succeeds at runtime — yield a `delta` event from the tool and the executor (`a2a_executor.py`) accumulates them onto the terminal artifact. Divergence between declared and observed mutations breaks the planner's scoring model.
 
 See `docs/extensions/effect-domain-v1` in the [protoWorkstacean repo](https://github.com/protoLabsAI/protoWorkstacean) for the full spec.
 
@@ -147,7 +147,7 @@ Emitted as a DataPart on the terminal artifact:
 }
 ```
 
-The template doesn't emit this by default because the shipped tools don't mutate anything. See `a2a_handler.py::TaskRecord.add_delta` for where to hook in.
+The template doesn't emit this by default because the shipped tools don't mutate anything. To hook in, yield a `("delta", {domain, path, op, value})` event from your tool; `a2a_executor.py` collects them into the artifact.
 
 ## `tool-call-v1`
 
@@ -180,7 +180,7 @@ Fields:
 | `input` | Truncated preview of the tool input (on `start`). Structured inputs (dict/list) are rendered as **compact JSON** so the console can pretty-print them; everything else is stringified. |
 | `output` | Truncated preview of the tool result (on `end`). Unwrapped from langchain's `ToolMessage` to its `.content` — the message repr would otherwise leak `name=`/`tool_call_id=` noise into the card. |
 
-**Producer** — `server.py::_run_turn_stream` yields structured `("tool_start"|"tool_end", {...})` tuples from langchain's `astream_events`. Inputs/outputs are coerced via `_coerce_tool_value` / `_coerce_tool_output` (JSON for structured values, `.content` for `ToolMessage`s) and truncated to `_TOOL_PREVIEW_CHARS`. The runner stores the latest on `TaskRecord.last_tool_event` and `_build_status_event` attaches it alongside the existing text status part (`🔧 name: input` / `✅ name → output`), so text-only consumers still see progress — the DataPart is purely additive and backward-compatible.
+**Producer** — `server/chat.py::_run_turn_stream` yields structured `("tool_start"|"tool_end", {...})` tuples from langchain's `astream_events`. Inputs/outputs are coerced via `_coerce_tool_value` / `_coerce_tool_output` (JSON for structured values, `.content` for `ToolMessage`s) and truncated to `_TOOL_PREVIEW_CHARS`. The runner stores the latest on `TaskRecord.last_tool_event` and `_build_status_event` attaches it alongside the existing text status part (`🔧 name: input` / `✅ name → output`), so text-only consumers still see progress — the DataPart is purely additive and backward-compatible.
 
 **Coalescing caveat** — the SSE watcher (`_watch_task`) coalesces bursts of updates, so a tool that starts and ends within a single event-loop tick may only surface one frame. Real tools are slow enough (network, I/O) that `start` and `end` land on separate frames. Consumers must tolerate a missing `start` (render the `end` as a completed card) and dedupe by `(id, phase)`.
 
@@ -220,7 +220,7 @@ Fields:
 | `created_at` | UTC ISO timestamp |
 | `source_session_id` | Provenance — which session produced the artifact |
 
-**Collection** — `a2a_handler.py` reads skills from the `_pending_skills` ContextVar at task completion and appends them as DataParts. Agents and middleware never access the ContextVar directly; they use `emit_skill_artifact()` to add and `get_pending_skills()` to read.
+**Collection** — an emitted skill is serialized by `graph/extensions/skills.py` and surfaced to A2A consumers as a DataPart on the terminal artifact (alongside being persisted to the local index). The mimeType is the contract.
 
 **Indexing** — protoAgent's own `SkillsIndex` (`graph/skills/index.py`) at `/sandbox/skills.db` picks these up on the next sweep and makes them retrievable by `KnowledgeMiddleware.load_skills(query)`. Consumers running their own skill registries can index the DataParts from the A2A stream directly — the mimeType is the contract.
 
@@ -251,11 +251,11 @@ When the caller stamps their trace context:
 }
 ```
 
-The agent reads it in `a2a_handler.py` and stamps `caller_trace_id` + `caller_span_id` into its own Langfuse trace metadata. Operators can then filter Langfuse by `metadata.caller_trace_id` to find every agent trace spawned from a single dispatch.
+The agent reads it in `server/chat.py` and stamps `caller_trace_id` + `caller_span_id` into its own Langfuse trace metadata. Operators can then filter Langfuse by `metadata.caller_trace_id` to find every agent trace spawned from a single dispatch.
 
 ## Adding a new extension
 
-1. Emit or parse in `a2a_handler.py` / `server.py`.
+1. Emit or parse in `a2a_executor.py` / `server/chat.py`.
 2. Declare on the card under `capabilities.extensions` with a URI consumers agree on.
 3. Document the shape in this file.
 4. Add a test to `tests/test_a2a_integration.py` asserting the declaration is present on the card.
diff --git a/docs/reference/starter-tools.md b/docs/reference/starter-tools.md
index 090ad6c..1a64823 100644
--- a/docs/reference/starter-tools.md
+++ b/docs/reference/starter-tools.md
@@ -8,11 +8,11 @@ The default tool set (from `tools/lg_tools.py::get_all_tools`):
 - Four **notes tools** — `notes_list`, `notes_read`, `notes_write`, `notes_revert` — bridge the operator console's Notes panel tabs to the agent (one agent-global notebook), gated per-tab by the operator's Agent-read/write toggles; writes are versioned (undoable) and surface live in the panel.
 - Five **memory tools** — `memory_ingest`, `memory_recall`, `memory_list`, `memory_stats`, `daily_log` — bound to the bundled `KnowledgeStore` (sqlite + FTS5, see [Configuration](/reference/configuration#knowledge)). Omitted when no store.
 - Three **scheduler tools** — `schedule_task`, `list_schedules`, `cancel_schedule` — bound to the bundled scheduler backend (local sqlite or the Workstacean adapter, see [Schedule future work](/guides/scheduler)). Omitted when no scheduler.
-- Four **beads tools** — `beads_create`, `beads_list`, `beads_update`, `beads_close` — the agent's in-process planning board, bridged to the console Beads panel. Bound when a beads store is present (default in `server.py`).
+- Four **beads tools** — `beads_create`, `beads_list`, `beads_update`, `beads_close` — the agent's in-process planning board, bridged to the console Beads panel. Bound when a beads store is present (default in `server/agent_init.py`).
 - One **inbox tool** — `check_inbox` — bound to the durable inbound inbox (ADR 0003) when configured; pulls stimuli pushed to `POST /api/inbox`.
 - **Peer-consult tools** (`peer_list` / `peer_consult`) — added only when at least one `PEER_<HANDLE>_URL` is set (A2A federation).
 
-`get_all_tools(knowledge_store=None, scheduler=None, inbox_store=None, beads_store=None)` is the registry; the conditional groups above are included only when their backend is passed (all are constructed by default in `server.py`; opt out via `middleware.knowledge: false` / `middleware.scheduler: false`). To **drop** a core tool without editing this function, list it in `tools.disabled`; to **add** tools, ship a [plugin](/guides/plugins) (`register_tools`) — editing `get_all_tools` is the legacy core-edit path that conflicts on re-sync.
+`get_all_tools(knowledge_store=None, scheduler=None, inbox_store=None, beads_store=None)` is the registry; the conditional groups above are included only when their backend is passed (all are constructed by default in `server/agent_init.py`; opt out via `middleware.knowledge: false` / `middleware.scheduler: false`). To **drop** a core tool without editing this function, list it in `tools.disabled`; to **add** tools, ship a [plugin](/guides/plugins) (`register_tools`) — editing `get_all_tools` is the legacy core-edit path that conflicts on re-sync.
 
 ## `current_time`
 

From 57c626e4cd3332032aacad90d89e04c18712bf80 Mon Sep 17 00:00:00 2001
From: Josh Mabry <31560031+mabry1985@users.noreply.github.com>
Date: Fri, 5 Jun 2026 21:19:23 -0700
Subject: [PATCH 02/20] fix(deploy): launch the server/ package, not the
 deleted server.py (+ CI guard) (#580)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

ADR 0023 promoted server.py → the server/ package (launch: python -m server),
but the k8s manifest and the OpenShell sandbox-create script still ran
'python server.py' — which exits immediately ('can't open file server.py'),
i.e. CrashLoopBackOff on k8s. PYTHONPATH=/opt/protoagent is baked as an image
ENV so it survives the command override; switch both to 'python -m server'.

Add a CI guard (checks.yml) that fails on any 'python server.py' invocation in
scripts/manifests/Dockerfiles so this can't regress.

Prod-readiness audit P0 (batch A).
---
 .github/workflows/checks.yml                  | 13 +++++++++++++
 deploy/openshell/create-protoagent-sandbox.sh |  2 +-
 deploy/openshell/k8s/protoagent-sandbox.yaml  |  4 +++-
 3 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml
index 8b8523b..01b5a77 100644
--- a/.github/workflows/checks.yml
+++ b/.github/workflows/checks.yml
@@ -34,6 +34,19 @@ jobs:
           git clone --depth 1 \
             https://github.com/protoLabsAI/release-tools /tmp/release-tools
           node /tmp/release-tools/bin/verify-workspace-config.mjs --root "$GITHUB_WORKSPACE"
+      - name: Guard against the deleted server.py entrypoint
+        # ADR 0023 promoted the monolith to the server/ package (launch:
+        # `python -m server`). A deploy file launching the old single-file binary
+        # CrashLoopBackOffs. Fail CI if any deploy artifact still does. Scoped to
+        # deploy surfaces (excludes .github so this guard can't match itself).
+        run: |
+          if grep -rnE 'python[0-9]* +server\.py' \
+               --include='*.sh' --include='*.yaml' --include='*.yml' \
+               --include='Dockerfile*' --exclude-dir=.github . ; then
+            echo "::error::Stale single-file launch found — use 'python -m server' (ADR 0023)."
+            exit 1
+          fi
+          echo "ok: no stale single-file launches in deploy artifacts"
 
   tests:
     name: Python tests
diff --git a/deploy/openshell/create-protoagent-sandbox.sh b/deploy/openshell/create-protoagent-sandbox.sh
index ba27c63..54fe9cf 100755
--- a/deploy/openshell/create-protoagent-sandbox.sh
+++ b/deploy/openshell/create-protoagent-sandbox.sh
@@ -34,6 +34,6 @@ openshell sandbox create \
   --image "$IMAGE" \
   --publish "127.0.0.1:${PORT}:7870" \
   --mount "$CONFIG:/sandbox/config/langgraph-config.yaml:ro" \
-  -- python server.py --port 7870 --ui none
+  -- python -m server --port 7870 --ui none   # server.py is now the server/ package (ADR 0023); PYTHONPATH baked in the image
 
 echo "✓ protoAgent sandbox created. Inspect: openshell sandbox list"
diff --git a/deploy/openshell/k8s/protoagent-sandbox.yaml b/deploy/openshell/k8s/protoagent-sandbox.yaml
index f96fba0..ae733ae 100644
--- a/deploy/openshell/k8s/protoagent-sandbox.yaml
+++ b/deploy/openshell/k8s/protoagent-sandbox.yaml
@@ -27,7 +27,9 @@ spec:
       containers:
         - name: protoagent
           image: ghcr.io/protolabsai/protoagent:latest
-          command: ["python", "server.py", "--port", "7870", "--ui", "none"]  # lean headless tier (ADR 0010)
+          # server.py is now the server/ package (ADR 0023) — launch as a module.
+          # PYTHONPATH=/opt/protoagent is baked as an image ENV so it survives this override.
+          command: ["python", "-m", "server", "--port", "7870", "--ui", "none"]  # lean headless tier (ADR 0010)
           ports:
             - containerPort: 7870
           env:

From 095da22775b742b9c1cd0224d4cfd669ccd5d321 Mon Sep 17 00:00:00 2001
From: Josh Mabry <31560031+mabry1985@users.noreply.github.com>
Date: Fri, 5 Jun 2026 21:24:59 -0700
Subject: [PATCH 03/20] security: default-bind 127.0.0.1; containers expose
 0.0.0.0 explicitly (#581)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Prod-readiness audit P0 (network-boundary half). The server hardcoded
host=0.0.0.0, so a local/desktop run exposed the operator/console + OpenAI-compat
API (/api/*, /api/chat, /v1/*) — which are NOT auth-gated — to anything that
could reach the port.

- New --host arg / PROTOAGENT_HOST env, defaulting to 127.0.0.1 (loopback). Local
  + desktop-sidecar runs are now loopback-only by default.
- Containers bind 0.0.0.0 explicitly: entrypoint.sh passes --host 0.0.0.0, and the
  k8s + OpenShell command-overrides (which bypass the entrypoint) pass it too.
  Their boundary is the published port + network policy, not the in-container bind.
- Loud startup WARNING when binding non-loopback with no A2A auth token configured.

Verified: boot smoke binds 127.0.0.1 by default (Uvicorn running on
http://127.0.0.1); 1047 tests green. The credential-boundary half (bearer-gate
/api/* + /v1/*, which needs console-token plumbing) follows in a separate PR.

Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
---
 deploy/openshell/create-protoagent-sandbox.sh |  2 +-
 deploy/openshell/k8s/protoagent-sandbox.yaml  |  2 +-
 docs/reference/environment-variables.md       |  1 +
 entrypoint.sh                                 |  7 +++++-
 server/__init__.py                            | 24 ++++++++++++++++---
 5 files changed, 30 insertions(+), 6 deletions(-)

diff --git a/deploy/openshell/create-protoagent-sandbox.sh b/deploy/openshell/create-protoagent-sandbox.sh
index 54fe9cf..1f947ec 100755
--- a/deploy/openshell/create-protoagent-sandbox.sh
+++ b/deploy/openshell/create-protoagent-sandbox.sh
@@ -34,6 +34,6 @@ openshell sandbox create \
   --image "$IMAGE" \
   --publish "127.0.0.1:${PORT}:7870" \
   --mount "$CONFIG:/sandbox/config/langgraph-config.yaml:ro" \
-  -- python -m server --port 7870 --ui none   # server.py is now the server/ package (ADR 0023); PYTHONPATH baked in the image
+  -- python -m server --host 0.0.0.0 --port 7870 --ui none   # server/ package (ADR 0023); --host 0.0.0.0 since this override bypasses entrypoint.sh (server defaults to loopback)
 
 echo "✓ protoAgent sandbox created. Inspect: openshell sandbox list"
diff --git a/deploy/openshell/k8s/protoagent-sandbox.yaml b/deploy/openshell/k8s/protoagent-sandbox.yaml
index ae733ae..977d8c2 100644
--- a/deploy/openshell/k8s/protoagent-sandbox.yaml
+++ b/deploy/openshell/k8s/protoagent-sandbox.yaml
@@ -29,7 +29,7 @@ spec:
           image: ghcr.io/protolabsai/protoagent:latest
           # server.py is now the server/ package (ADR 0023) — launch as a module.
           # PYTHONPATH=/opt/protoagent is baked as an image ENV so it survives this override.
-          command: ["python", "-m", "server", "--port", "7870", "--ui", "none"]  # lean headless tier (ADR 0010)
+          command: ["python", "-m", "server", "--host", "0.0.0.0", "--port", "7870", "--ui", "none"]  # lean headless tier (ADR 0010); --host 0.0.0.0 because this override bypasses entrypoint.sh (the server now defaults to loopback)
           ports:
             - containerPort: 7870
           env:
diff --git a/docs/reference/environment-variables.md b/docs/reference/environment-variables.md
index 83d48b8..7930eef 100644
--- a/docs/reference/environment-variables.md
+++ b/docs/reference/environment-variables.md
@@ -29,6 +29,7 @@ Every env var the template reads at runtime.
 | Variable | Default | What |
 |---|---|---|
 | `PROTOAGENT_UI` | `full` | UI deployment tier (or `--ui`): `full` (Gradio + React console + API/A2A), `console` (console + API/A2A, no Gradio), `none` (API + A2A + `/metrics` only — the lean headless stack). The Docker image defaults to `none`. |
+| `PROTOAGENT_HOST` | `127.0.0.1` | Bind address (or `--host`). **Defaults to loopback** so a local/desktop run isn't exposed on all interfaces — the operator/console API (`/api/*`, `/api/chat`, `/v1/*`) is otherwise reachable by anything that can hit the port. The container entrypoint + deploy manifests set `0.0.0.0` because their boundary is the published port + network policy, not the in-container bind. Binding non-loopback **without** an A2A auth token logs a security warning at startup. |
 | `PROTOAGENT_HEADLESS` | (unset) | **Deprecated** alias for `PROTOAGENT_UI=console` (or `--headless`). |
 | `PROTOAGENT_HEADLESS_SETUP` | (unset) | Set `1`/`true` to auto-complete setup from a validated config even outside the `none` tier (no wizard). The `none` tier implies this. |
 
diff --git a/entrypoint.sh b/entrypoint.sh
index a0b5db3..e14c12b 100755
--- a/entrypoint.sh
+++ b/entrypoint.sh
@@ -24,4 +24,9 @@ fi
 # module with the install dir on PYTHONPATH so the package (and its sibling
 # top-level modules: paths, events, graph, …) resolve, while keeping the
 # agent's workspace (/sandbox) as the working directory.
-exec env PYTHONPATH="/opt/protoagent${PYTHONPATH:+:$PYTHONPATH}" python -m server
+#
+# Bind all interfaces inside the container — the boundary is the published port
+# + network policy, not the in-container bind. (The server defaults to loopback
+# for local/desktop runs; PROTOAGENT_HOST overrides either way.)
+exec env PYTHONPATH="/opt/protoagent${PYTHONPATH:+:$PYTHONPATH}" \
+    python -m server --host "${PROTOAGENT_HOST:-0.0.0.0}"
diff --git a/server/__init__.py b/server/__init__.py
index b8d8110..4c0baa5 100644
--- a/server/__init__.py
+++ b/server/__init__.py
@@ -290,6 +290,12 @@ def _main():
         description=f"{AGENT_NAME_ENV} — protoAgent server",
     )
     parser.add_argument("--port", type=int, default=7870)
+    # Bind host. Defaults to loopback so a local/desktop run is NOT exposed on
+    # all interfaces (prod-readiness: the console + operator API are otherwise
+    # reachable by anything that can reach the port). Containers set
+    # PROTOAGENT_HOST=0.0.0.0 (entrypoint / deploy manifests) because their
+    # boundary is the network policy + published port, not the in-container bind.
+    parser.add_argument("--host", type=str, default=os.environ.get("PROTOAGENT_HOST", "127.0.0.1"))
     parser.add_argument("--config", type=str, default=None)
     parser.add_argument(
         "--ui",
@@ -685,10 +691,22 @@ async def _serve_sw() -> FileResponse:
             footer_links=[],
             favicon_path=str(static_dir / "favicon.svg") if (static_dir / "favicon.svg").exists() else None,
         )
-        log.info("Starting %s (ui=full) on http://0.0.0.0:%d", agent_name(), args.port)
+        log.info("Starting %s (ui=full) on http://%s:%d", agent_name(), args.host, args.port)
     else:
         app = fastapi_app
-        log.info("Starting %s (ui=%s) on http://0.0.0.0:%d", agent_name(), ui, args.port)
+        log.info("Starting %s (ui=%s) on http://%s:%d", agent_name(), ui, args.host, args.port)
+
+    # Loud warning when exposed on all interfaces with no A2A auth token: the
+    # operator/console API (/api/*, /api/chat, /v1/*) is reachable by untrusted
+    # callers. Loopback (the default) is safe; a container behind a network
+    # policy is the intended 0.0.0.0 case.
+    if args.host not in ("127.0.0.1", "localhost", "::1") and not _bearer_configured():
+        log.warning(
+            "[security] binding %s with NO A2A auth token — the agent + operator "
+            "API are open to anything that can reach this port. Set auth.token / "
+            "A2A_AUTH_TOKEN, or bind 127.0.0.1 (the default), or front it with a "
+            "network policy.", args.host,
+        )
 
     # Don't outlive the launcher. When run as a desktop sidecar the Tauri shell
     # sets PROTOAGENT_PARENT_PID; a PyInstaller-frozen onefile runs as a
@@ -696,7 +714,7 @@ async def _serve_sw() -> FileResponse:
     # server orphaned (holding its port). Poll the launcher and exit if it dies.
     _install_parent_death_watchdog()
 
-    uvicorn.run(app, host="0.0.0.0", port=args.port)
+    uvicorn.run(app, host=args.host, port=args.port)
 
 
 if __name__ == "__main__":

From 00cc56df6805a0c40db0318e6e3f153cbb06bc4c Mon Sep 17 00:00:00 2001
From: Josh Mabry <31560031+mabry1985@users.noreply.github.com>
Date: Fri, 5 Jun 2026 21:29:07 -0700
Subject: [PATCH 04/20] reliability: bound the LLM call (timeout + retries) +
 close the push client (#582)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Prod-readiness audit P1 (resilience).

- The model client had NO timeout — a hung/slow LiteLLM gateway blocked the turn
  (and the A2A task / SSE stream) indefinitely; under load these pile up. Add
  model.request_timeout (default 120s) + model.max_retries (default 2), passed as
  timeout=/max_retries= to ChatOpenAI in graph/llm.py.
- _a2a_push_client (httpx.AsyncClient) was created but never closed — connection
  pool leak on shutdown/reload (bites the desktop-sidecar restart loop). Close it
  best-effort in the shutdown hook.

Verified: 1047+ tests green (new test_llm asserts the bounds); boot + clean
SIGTERM shutdown smoke shows no push-client errors.

Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
---
 config/langgraph-config.example.yaml |  2 ++
 graph/config.py                      |  8 ++++++++
 graph/llm.py                         |  4 ++++
 server/__init__.py                   |  8 ++++++++
 tests/test_llm.py                    | 10 ++++++++++
 5 files changed, 32 insertions(+)

diff --git a/config/langgraph-config.example.yaml b/config/langgraph-config.example.yaml
index ab93299..ba16ac1 100644
--- a/config/langgraph-config.example.yaml
+++ b/config/langgraph-config.example.yaml
@@ -18,6 +18,8 @@ model:
   temperature: 0.2
   max_tokens: 32768  # 32k — required headroom for the Qwen models we run
   max_iterations: 50
+  request_timeout: 120   # per-call timeout (s) on the model client — bounds a hung/slow gateway
+  max_retries: 2         # transient-error retries inside the client
   # Advanced sampling — all optional; omit to use the gateway / model-card
   # defaults. top_p + presence_penalty are standard OpenAI params; top_k +
   # repetition_penalty + chat_template_kwargs ride extra_body for vLLM-style
diff --git a/graph/config.py b/graph/config.py
index 5003696..4d272cf 100644
--- a/graph/config.py
+++ b/graph/config.py
@@ -87,6 +87,12 @@ class LangGraphConfig:
     max_tokens: int = 32768  # 32k — required headroom for the Qwen models we run
     max_iterations: int = 50
 
+    # Per-call timeout (seconds) on the model client + transient-retry cap. Bounds
+    # a hung/slow gateway so a turn surfaces a clean error instead of blocking the
+    # A2A task / SSE stream indefinitely (prod-readiness). 0/None ⇒ SDK default.
+    request_timeout: float = 120.0
+    llm_max_retries: int = 2
+
     # Advanced sampling — all opt-in. ``None`` (or a negative top_k) means
     # "let the gateway / model card decide". top_p and presence_penalty are
     # standard OpenAI params; top_k and repetition_penalty aren't, so they
@@ -472,6 +478,8 @@ def from_yaml(cls, path: str | Path) -> "LangGraphConfig":
             temperature=model.get("temperature", cls.temperature),
             max_tokens=model.get("max_tokens", cls.max_tokens),
             max_iterations=model.get("max_iterations", cls.max_iterations),
+            request_timeout=model.get("request_timeout", cls.request_timeout),
+            llm_max_retries=model.get("max_retries", cls.llm_max_retries),
             top_p=model.get("top_p", cls.top_p),
             top_k=model.get("top_k", cls.top_k),
             presence_penalty=model.get("presence_penalty", cls.presence_penalty),
diff --git a/graph/llm.py b/graph/llm.py
index 5a6412c..a9aa38a 100644
--- a/graph/llm.py
+++ b/graph/llm.py
@@ -25,6 +25,10 @@ def _build_llm_kwargs(config: LangGraphConfig) -> dict:
         "model": config.model_name,
         "temperature": config.temperature,
         "max_tokens": config.max_tokens,
+        # Bound a hung/slow gateway: a per-call timeout + transient-retry cap so a
+        # turn fails cleanly instead of hanging the A2A task / SSE stream forever.
+        "timeout": config.request_timeout,
+        "max_retries": config.llm_max_retries,
         # Forces token-usage info onto the final streaming chunk so
         # `astream_events(v2)` populates `output.usage_metadata` on
         # `on_chat_model_end`. Without this, streaming chunks arrive as
diff --git a/server/__init__.py b/server/__init__.py
index 4c0baa5..e45278f 100644
--- a/server/__init__.py
+++ b/server/__init__.py
@@ -544,6 +544,14 @@ async def _scheduler_shutdown() -> None:
             log.exception("[discord] shutdown failed")
         if STATE.checkpoint_prune_task is not None:
             STATE.checkpoint_prune_task.cancel()
+        # Close the long-lived A2A push-notification client (created below in
+        # _main) so its connection pool doesn't leak on shutdown/reload — matters
+        # in the desktop-sidecar restart loop. Best-effort; NameError if boot
+        # never reached its construction is swallowed.
+        try:
+            await _a2a_push_client.aclose()
+        except Exception:
+            log.exception("[a2a] push client close failed")
 
     # Chat / goal / health / OpenAI-compat HTTP surface. Extracted to
     # operator_api/chat_routes.py (ADR 0023 phase 3); ``ui`` is passed in
diff --git a/tests/test_llm.py b/tests/test_llm.py
index 42d31b9..074396f 100644
--- a/tests/test_llm.py
+++ b/tests/test_llm.py
@@ -16,6 +16,16 @@ def test_defaults_omit_optional_sampling_params():
     assert "extra_body" not in kwargs
 
 
+def test_request_timeout_and_max_retries_bound_the_gateway():
+    # Prod-readiness: the client must carry a per-call timeout + retry cap so a
+    # hung/slow gateway can't block a turn (and the A2A task) indefinitely.
+    kwargs = _build_llm_kwargs(LangGraphConfig())
+    assert kwargs["timeout"] == 120.0
+    assert kwargs["max_retries"] == 2
+    custom = _build_llm_kwargs(LangGraphConfig(request_timeout=45.0, llm_max_retries=0))
+    assert custom["timeout"] == 45.0 and custom["max_retries"] == 0
+
+
 def test_standard_openai_params_passed_directly():
     cfg = LangGraphConfig(top_p=0.95, presence_penalty=0.5)
     kwargs = _build_llm_kwargs(cfg)

From fb7cf97d07208948d911a9c2bc49b1a071540f8a Mon Sep 17 00:00:00 2001
From: Josh Mabry <31560031+mabry1985@users.noreply.github.com>
Date: Fri, 5 Jun 2026 21:33:31 -0700
Subject: [PATCH 05/20] security: default-on SSRF denylist for fetch_url +
 re-check redirects (#583)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Prod-readiness audit P1. egress.check_url was a host allowlist that was
permissive (off) by default — so with no allowlist set, the model could
fetch_url an internal service or http://169.254.169.254/ (cloud metadata). And
fetch_url used follow_redirects=True, so a public URL 30x-redirecting to an
internal host bypassed the check entirely.

- egress.check_url now applies a default-on private-IP denylist when no allowlist
  is set: resolve the host, block loopback/link-local/private/multicast/reserved/
  metadata (+ unresolvable). Public hosts still work with no config. An allowlist,
  when set, is the explicit-trust path and bypasses the denylist (mirrors
  a2a_stores.is_safe_webhook_url).
- fetch_url disables auto-redirects and follows manually, re-checking each hop's
  host against egress (max 5 hops; refuses non-http(s) redirects).

Verified: test_egress updated for the new default (public allowed, private/metadata
blocked, allowlist bypass); 1047+ tests green.

Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
---
 egress.py            | 83 +++++++++++++++++++++++++++++++++++---------
 tests/test_egress.py | 18 ++++++++--
 tools/lg_tools.py    | 15 +++++++-
 3 files changed, 96 insertions(+), 20 deletions(-)

diff --git a/egress.py b/egress.py
index c2a511d..f4be42a 100644
--- a/egress.py
+++ b/egress.py
@@ -1,10 +1,13 @@
 """Egress allowlist for outbound HTTP from agent tools (ADR 0008).
 
-Deny-by-default host allowlist enforced in ``fetch_url`` — the tool where the
-model picks an arbitrary host, i.e. the main in-process exfiltration / SSRF
-vector. An **empty allowlist is permissive** (off), so existing deployments are
-unchanged until they opt in. This is also the single source of truth the
-OpenShell network policy is generated from (``scripts/gen_openshell_policy.py``).
+Enforced in ``fetch_url`` — the tool where the model picks an arbitrary host,
+i.e. the main in-process exfiltration / SSRF vector. Two layers: an optional
+host **allowlist** (deny-by-default when set; the single source of truth the
+OpenShell network policy is generated from, ``scripts/gen_openshell_policy.py``),
+and — when no allowlist is set — a **default-on private-IP denylist** so the
+model can't reach an internal service or cloud-metadata (``169.254.169.254``)
+out of the box. Public hosts still work with no allowlist; allowlisting a host
+explicitly trusts it (bypasses the denylist).
 
 Mirrors the ``PUSH_NOTIFICATION_ALLOWED_HOSTS`` SSRF-guard pattern in
 ``a2a_stores``. Wildcards: a leading ``*.`` matches any subdomain
@@ -17,6 +20,8 @@
 
 from __future__ import annotations
 
+import ipaddress
+import socket
 from urllib.parse import urlparse
 
 _allowed: list[str] = []  # lowercased host patterns; empty = permissive (off)
@@ -50,20 +55,64 @@ def _host_allowed(host: str) -> bool:
     return False
 
 
-def check_url(url: str) -> str | None:
-    """Return an error string if the URL's host is not allowed, else ``None``.
+def _blocked_ip(host: str) -> str | None:
+    """Resolve ``host`` and return the first address that is a private/internal
+    SSRF target (loopback / link-local / private / multicast / reserved /
+    unspecified), or the literal ``"unresolvable"`` when DNS fails (treated as
+    unsafe, matching ``a2a_stores``). ``None`` ⇒ the host resolves only to
+    globally-routable addresses. One-shot resolution — not a DNS-rebinding
+    defence, but closes the trivial literal/redirect-to-internal vector."""
+    try:
+        ipaddress.ip_address(host)
+        candidates = [host]
+    except ValueError:
+        try:
+            candidates = [info[4][0] for info in socket.getaddrinfo(host, None)]
+        except socket.gaierror:
+            return "unresolvable"
+    for addr in candidates:
+        try:
+            ip = ipaddress.ip_address(addr)
+        except ValueError:
+            return addr
+        if (ip.is_loopback or ip.is_link_local or ip.is_private
+                or ip.is_multicast or ip.is_reserved or ip.is_unspecified):
+            return addr
+    return None
 
-    Permissive (returns ``None``) when no allowlist is configured.
+
+def check_url(url: str) -> str | None:
+    """Return an error string if the URL's host is not permitted, else ``None``.
+
+    Two layers:
+    - **Allowlist set** → only allowlisted hosts pass (wildcards supported). An
+      allowlisted host is explicitly trusted and bypasses the IP denylist below
+      (you may allowlist an internal host on purpose).
+    - **No allowlist (default)** → a host is permitted unless it resolves to a
+      private / loopback / link-local / cloud-metadata / reserved address. This
+      default-on SSRF guard stops the model `fetch_url`-ing an internal service
+      or `169.254.169.254` even when no allowlist is configured.
     """
-    if not _allowed:
-        return None
     try:
-        host = urlparse(url).hostname or ""
+        host = (urlparse(url).hostname or "").lower()
     except ValueError:
         return f"Error: malformed URL: {url!r}"
-    if _host_allowed(host):
-        return None
-    return (
-        f"Error: egress to {host or url!r} is blocked — not in the egress "
-        f"allowlist ({', '.join(_allowed)}). Set egress.allowed_hosts to permit it."
-    )
+    if not host:
+        return f"Error: no host in URL: {url!r}"
+    if _allowed:
+        if _host_allowed(host):
+            return None
+        return (
+            f"Error: egress to {host} is blocked — not in the egress allowlist "
+            f"({', '.join(_allowed)}). Set egress.allowed_hosts to permit it."
+        )
+    bad = _blocked_ip(host)
+    if bad == "unresolvable":
+        return f"Error: egress to {host} is blocked — host did not resolve."
+    if bad:
+        return (
+            f"Error: egress to {host} ({bad}) is blocked — it resolves to a "
+            f"private/internal address (SSRF guard). Allowlist it via "
+            f"egress.allowed_hosts if this is intentional."
+        )
+    return None
diff --git a/tests/test_egress.py b/tests/test_egress.py
index 38c6b56..e1cb048 100644
--- a/tests/test_egress.py
+++ b/tests/test_egress.py
@@ -20,9 +20,23 @@ def _reset():
 # ── egress allowlist ───────────────────────────────────────────────────────────
 
 
-def test_permissive_when_unset():
+def test_unset_allows_public_blocks_private():
+    # No allowlist → public IPs pass, but the default-on SSRF denylist blocks
+    # private/loopback/link-local/metadata even without an allowlist. (IP
+    # literals so the test doesn't depend on DNS.)
     assert egress.is_enabled() is False
-    assert egress.check_url("http://anything.example/x") is None
+    assert egress.check_url("http://8.8.8.8/x") is None           # public
+    for bad in ("http://127.0.0.1/", "http://10.0.0.1/", "http://192.168.1.1/",
+                "http://169.254.169.254/latest/meta-data/", "http://[::1]/"):
+        assert egress.check_url(bad) is not None, bad
+
+
+def test_allowlisted_host_bypasses_ip_denylist():
+    # An operator can intentionally allowlist an internal host — the allowlist is
+    # the explicit-trust path and bypasses the private-IP denylist.
+    egress.set_allowed_hosts(["internal.svc"])
+    assert egress.check_url("http://internal.svc/x") is None
+    egress.set_allowed_hosts([])
 
 
 def test_exact_host_allow_and_deny():
diff --git a/tools/lg_tools.py b/tools/lg_tools.py
index f6bb3f7..c147bf9 100644
--- a/tools/lg_tools.py
+++ b/tools/lg_tools.py
@@ -278,12 +278,25 @@ async def fetch_url(url: str, max_chars: int = _MAX_OUTPUT_CHARS) -> str:
         return "Error: httpx not installed — cannot fetch URLs."
 
     try:
+        # Disable auto-redirects and follow manually so each hop's host is
+        # re-checked against egress — otherwise a public URL that 30x-redirects
+        # to http://169.254.169.254/ would bypass the SSRF guard above.
         async with httpx.AsyncClient(
-            follow_redirects=True, timeout=15, headers={
+            follow_redirects=False, timeout=15, headers={
                 "User-Agent": "protoAgent/0.1 (+https://github.com/protoLabsAI/protoAgent)",
             },
         ) as client:
             resp = await client.get(url)
+            hops = 0
+            while resp.is_redirect and hops < 5:
+                nxt = str(resp.url.join(resp.headers.get("location") or ""))
+                if not (nxt.startswith("http://") or nxt.startswith("https://")):
+                    return f"Error: refusing non-http(s) redirect to {nxt!r}"
+                blocked = egress.check_url(nxt)
+                if blocked:
+                    return blocked
+                resp = await client.get(nxt)
+                hops += 1
     except httpx.HTTPError as e:
         return f"Error: fetch failed: {e}"
 

From 0beba052091696078a13087e43fc5d1c2c75d99a Mon Sep 17 00:00:00 2001
From: Josh Mabry <31560031+mabry1985@users.noreply.github.com>
Date: Fri, 5 Jun 2026 21:35:03 -0700
Subject: [PATCH 06/20] deploy: wire the existing /healthz into a HEALTHCHECK +
 k8s probes (#584)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Prod-readiness audit P1. The app exposes a correct readiness endpoint (/healthz
returns 200 only when STATE.graph is compiled, 503 during cold start) but nothing
used it — compose restart:unless-stopped couldn't detect a hung-but-alive
process, and k8s wouldn't gate traffic on readiness (requests hit the pod during
the model-cold-start window).

- Dockerfile HEALTHCHECK (curl /healthz; start-period 60s for first-compile).
- docker-compose healthcheck block.
- k8s readinessProbe (gates traffic) + livenessProbe (generous: initialDelay 90s
  + failureThreshold 6 so a slow first compile can't trigger a restart loop).

curl is already in the image. Both YAML files parse.

Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
---
 Dockerfile                                   |  7 +++++++
 deploy/openshell/k8s/protoagent-sandbox.yaml | 13 +++++++++++++
 docker-compose.yml                           | 10 ++++++++++
 3 files changed, 30 insertions(+)

diff --git a/Dockerfile b/Dockerfile
index 6e20326..6f04b51 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -104,4 +104,11 @@ USER sandbox
 WORKDIR /sandbox
 
 EXPOSE 7870
+
+# Readiness/health: /healthz returns 200 only once the agent graph is compiled
+# (503 during the model-cold-start window). start-period covers the
+# frozen-sidecar / first-compile boot so a slow start isn't marked unhealthy.
+HEALTHCHECK --interval=30s --timeout=3s --start-period=60s --retries=3 \
+    CMD curl -fsS http://localhost:7870/healthz || exit 1
+
 CMD ["/opt/protoagent/entrypoint.sh"]
diff --git a/deploy/openshell/k8s/protoagent-sandbox.yaml b/deploy/openshell/k8s/protoagent-sandbox.yaml
index 977d8c2..4b7a6d9 100644
--- a/deploy/openshell/k8s/protoagent-sandbox.yaml
+++ b/deploy/openshell/k8s/protoagent-sandbox.yaml
@@ -32,6 +32,19 @@ spec:
           command: ["python", "-m", "server", "--host", "0.0.0.0", "--port", "7870", "--ui", "none"]  # lean headless tier (ADR 0010); --host 0.0.0.0 because this override bypasses entrypoint.sh (the server now defaults to loopback)
           ports:
             - containerPort: 7870
+          # /healthz returns 200 only once the agent graph is compiled (503 during
+          # cold start). Readiness gates traffic until ready; liveness is generous
+          # so a slow first compile (model warm-up) can't trigger a restart loop.
+          readinessProbe:
+            httpGet: { path: /healthz, port: 7870 }
+            initialDelaySeconds: 10
+            periodSeconds: 10
+            failureThreshold: 3
+          livenessProbe:
+            httpGet: { path: /healthz, port: 7870 }
+            initialDelaySeconds: 90
+            periodSeconds: 30
+            failureThreshold: 6
           env:
             - name: PROTOAGENT_INSTANCE         # ADR 0004 — isolate this deployment's stores
               value: "k8s"
diff --git a/docker-compose.yml b/docker-compose.yml
index fd059bd..1938c37 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -17,6 +17,16 @@ services:
     container_name: protoagent
     restart: unless-stopped
 
+    # Readiness — /healthz is 200 only once the agent graph is ready (503 during
+    # cold start). With restart:unless-stopped this lets the daemon detect a
+    # hung-but-alive process. start_period covers the first-compile boot.
+    healthcheck:
+      test: ["CMD", "curl", "-fsS", "http://localhost:7870/healthz"]
+      interval: 30s
+      timeout: 3s
+      start_period: 60s
+      retries: 3
+
     # ── Hardening ──────────────────────────────────────────────────────────
     security_opt:
       - no-new-privileges:true

From a1c13962730f1fda3c49a146839a6ecd8f11466c Mon Sep 17 00:00:00 2001
From: Josh Mabry <31560031+mabry1985@users.noreply.github.com>
Date: Fri, 5 Jun 2026 22:11:14 -0700
Subject: [PATCH 07/20] ci(release): make build-provenance attestation opt-in
 (fork-friendly) (#585)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The "Attest build provenance" step calls actions/attest-build-provenance,
which needs the GitHub attestations feature — unavailable on private repos
without a paid plan. A fork that cuts a release therefore gets a noisy red
error on a step it can't use (the step is already continue-on-error, so the
release still publishes, but the failed annotation is misleading).

Gate it behind an opt-in `ATTESTATIONS_ENABLED` repo variable, mirroring the
existing `RELEASE_ENABLED` guard: forks leave it unset and the step is skipped
cleanly; repos with the feature set the variable to `true` — no workflow edit,
so it doesn't conflict when forks re-sync from upstream. continue-on-error is
kept as a belt-and-suspenders.

Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
---
 .github/workflows/release.yml | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 3e44990..8fde307 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -99,7 +99,15 @@ jobs:
           build-args: |
             VERSION=${{ steps.version.outputs.version }}
 
+      # Build-provenance attestation needs the GitHub attestations feature, which
+      # is unavailable on private repos without a paid plan — so a fork would get
+      # a noisy red error here for a step it can't use. Opt-in (fork-friendly):
+      # set the `ATTESTATIONS_ENABLED` repo variable to `true` to run it; forks
+      # leave it unset and the step is skipped. Enable without editing this file
+      # so upstream changes don't conflict on re-sync (same pattern as
+      # `RELEASE_ENABLED`). `continue-on-error` stays as a belt-and-suspenders.
       - name: Attest build provenance
+        if: vars.ATTESTATIONS_ENABLED == 'true'
         continue-on-error: true
         uses: actions/attest-build-provenance@v1
         with:

From 6794d11b6481f1679a8a6293e44c564bafc19f75 Mon Sep 17 00:00:00 2001
From: Josh Mabry <31560031+mabry1985@users.noreply.github.com>
Date: Fri, 5 Jun 2026 22:16:39 -0700
Subject: [PATCH 08/20] =?UTF-8?q?ci:=20A2A=20live-smoke=20job=20=E2=80=94?=
 =?UTF-8?q?=20boot=20the=20real=20server=20vs=20a=20fake=20model,=20drive?=
 =?UTF-8?q?=20a=20turn=20(#586)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Prod-readiness audit P1 (the durable fix the memory note keeps flagging). Every
wire bug this repo shipped (CRLF SSE #563, A2A 404s / eval rot #524, lean-image
FastAPI gap #426) was unit-green but wire-broken because nothing booted the real
server and exercised the actual transport.

- scripts/fake_openai_server.py — a zero-dep OpenAI-compat endpoint that streams a
  canned completion, so a real A2A turn reaches a terminal state without a gateway.
- scripts/live_smoke.py — boots `python -m server --ui none` against the fake model,
  waits /healthz, fetches the agent card, POSTs a real SendStreamingMessage turn,
  and asserts the SSE frames decode + reach a terminal frame.
- checks.yml: new 'A2A live smoke (lean tier)' job that installs requirements-core.txt
  (the production image's deps — also guards the lean-import gap class) and runs it.

Verified locally: lean server boots, card serves, streaming turn completes
end-to-end against the fake model.

Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
---
 .github/workflows/checks.yml  |  27 ++++++++
 scripts/fake_openai_server.py | 103 +++++++++++++++++++++++++++++
 scripts/live_smoke.py         | 119 ++++++++++++++++++++++++++++++++++
 3 files changed, 249 insertions(+)
 create mode 100644 scripts/fake_openai_server.py
 create mode 100644 scripts/live_smoke.py

diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml
index 01b5a77..620572d 100644
--- a/.github/workflows/checks.yml
+++ b/.github/workflows/checks.yml
@@ -71,6 +71,33 @@ jobs:
           . .venv/bin/activate
           python -m pytest tests/ -q
 
+  live-smoke:
+    name: A2A live smoke (lean tier)
+    runs-on: namespace-profile-protolabs-linux
+    timeout-minutes: 10
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+          cache: pip
+      - name: Install lean (core) deps
+        # requirements-core.txt is the PRODUCTION image's dependency set (the
+        # `--ui none` tier). Installing it here — not the full requirements.txt —
+        # also guards the lean-image import gap class (e.g. the FastAPI-missing
+        # bug that only push-to-main caught, #426).
+        run: |
+          python -m venv .venv
+          . .venv/bin/activate
+          python -m pip install --upgrade pip
+          pip install -r requirements-core.txt
+      - name: Boot the real server vs a fake model + drive a real A2A turn
+        # Catches the green-but-wire-broken class unit/mock tests miss — CRLF SSE
+        # framing, A2A routing + version negotiation, agent-card build, lean boot.
+        run: |
+          . .venv/bin/activate
+          python scripts/live_smoke.py
+
   web-e2e:
     name: Web E2E smoke
     runs-on: namespace-profile-protolabs-linux
diff --git a/scripts/fake_openai_server.py b/scripts/fake_openai_server.py
new file mode 100644
index 0000000..abc6204
--- /dev/null
+++ b/scripts/fake_openai_server.py
@@ -0,0 +1,103 @@
+#!/usr/bin/env python3
+"""A zero-dependency fake OpenAI-compatible endpoint for the CI live-smoke.
+
+Boots the REAL protoAgent server against this so a live A2A turn exercises the
+real wire path (executor → chat stream → SSE framing → a2a-sdk handler) without
+a real model or gateway. The model just needs to return a valid completion so
+the turn reaches a terminal state and emits artifact/COMPLETED frames — that's
+what catches the green-but-wire-broken class (CRLF SSE, A2A routing/version,
+lean-image boot) that unit/mock tests miss.
+
+Serves:
+  GET  /v1/models               → a one-model list (drawer/validation)
+  POST /v1/chat/completions     → a streaming (or non-streaming) chat completion
+
+Run: python scripts/fake_openai_server.py <port>
+"""
+
+from __future__ import annotations
+
+import json
+import sys
+from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
+
+# The agent wraps its answer in <output>…</output> (graph/output_format.py); emit
+# that shape so the parser extracts a clean final answer.
+_ANSWER = "<output>live smoke ok</output>"
+_MODEL = "protolabs/reasoning"
+
+
+def _chunk(delta: dict, finish=None, usage=None) -> bytes:
+    obj = {
+        "id": "smoke-1",
+        "object": "chat.completion.chunk",
+        "model": _MODEL,
+        "choices": [{"index": 0, "delta": delta, "finish_reason": finish}],
+    }
+    if usage is not None:
+        obj["usage"] = usage
+    return b"data: " + json.dumps(obj).encode() + b"\n\n"
+
+
+class _Handler(BaseHTTPRequestHandler):
+    def log_message(self, *a):  # quiet
+        pass
+
+    def do_GET(self):
+        if self.path.rstrip("/").endswith("/models"):
+            body = json.dumps({"object": "list", "data": [{"id": _MODEL, "object": "model"}]}).encode()
+            self.send_response(200)
+            self.send_header("Content-Type", "application/json")
+            self.send_header("Content-Length", str(len(body)))
+            self.end_headers()
+            self.wfile.write(body)
+        else:
+            self.send_error(404)
+
+    def do_POST(self):
+        length = int(self.headers.get("Content-Length", 0) or 0)
+        raw = self.rfile.read(length) if length else b"{}"
+        try:
+            req = json.loads(raw or b"{}")
+        except ValueError:
+            req = {}
+        streaming = bool(req.get("stream"))
+        usage = {"prompt_tokens": 7, "completion_tokens": 4, "total_tokens": 11}
+
+        if not self.path.rstrip("/").endswith("/chat/completions"):
+            self.send_error(404)
+            return
+
+        if streaming:
+            self.send_response(200)
+            self.send_header("Content-Type", "text/event-stream")
+            self.end_headers()
+            # role chunk, content chunk, finish chunk (with usage), [DONE].
+            self.wfile.write(_chunk({"role": "assistant", "content": ""}))
+            self.wfile.write(_chunk({"content": _ANSWER}))
+            self.wfile.write(_chunk({}, finish="stop", usage=usage))
+            self.wfile.write(b"data: [DONE]\n\n")
+            self.wfile.flush()
+        else:
+            body = json.dumps({
+                "id": "smoke-1", "object": "chat.completion", "model": _MODEL,
+                "choices": [{"index": 0, "message": {"role": "assistant", "content": _ANSWER},
+                             "finish_reason": "stop"}],
+                "usage": usage,
+            }).encode()
+            self.send_response(200)
+            self.send_header("Content-Type", "application/json")
+            self.send_header("Content-Length", str(len(body)))
+            self.end_headers()
+            self.wfile.write(body)
+
+
+def main() -> None:
+    port = int(sys.argv[1]) if len(sys.argv) > 1 else 8900
+    srv = ThreadingHTTPServer(("127.0.0.1", port), _Handler)
+    print(f"[fake-openai] listening on 127.0.0.1:{port}", flush=True)
+    srv.serve_forever()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/live_smoke.py b/scripts/live_smoke.py
new file mode 100644
index 0000000..a10130e
--- /dev/null
+++ b/scripts/live_smoke.py
@@ -0,0 +1,119 @@
+#!/usr/bin/env python3
+"""CI live-smoke: boot the REAL server (lean `--ui none` tier) against a fake
+OpenAI endpoint and drive a real A2A turn end-to-end.
+
+This catches the green-but-wire-broken class that unit/mock tests miss — CRLF SSE
+framing, A2A routing + version negotiation, the agent-card build, and lean-image
+import gaps — by exercising the actual transport, not a mock. The fake model
+(scripts/fake_openai_server.py) returns a canned completion so the turn reaches a
+terminal state without a real gateway.
+
+Exit 0 on success, non-zero (with a diagnostic) on any failure.
+"""
+
+from __future__ import annotations
+
+import json
+import os
+import socket
+import subprocess
+import sys
+import tempfile
+import time
+import urllib.request
+from pathlib import Path
+
+ROOT = Path(__file__).resolve().parent.parent
+
+
+def _free_port() -> int:
+    s = socket.socket()
+    s.bind(("127.0.0.1", 0))
+    port = s.getsockname()[1]
+    s.close()
+    return port
+
+
+def _wait_healthz(port: int, timeout: float = 90.0) -> bool:
+    end = time.time() + timeout
+    url = f"http://127.0.0.1:{port}/healthz"
+    while time.time() < end:
+        try:
+            with urllib.request.urlopen(url, timeout=2) as r:
+                if r.status == 200:
+                    return True
+        except Exception:
+            pass
+        time.sleep(1.0)
+    return False
+
+
+def main() -> int:
+    fake_port, agent_port = _free_port(), _free_port()
+    cfg_dir = Path(tempfile.mkdtemp(prefix="smoke-cfg-"))
+    (cfg_dir / "langgraph-config.yaml").write_text(
+        "model:\n"
+        "  name: protolabs/reasoning\n"
+        f"  api_base: http://127.0.0.1:{fake_port}/v1\n"
+        "middleware:\n  knowledge: false\n  scheduler: false\n"
+    )
+    env = {
+        **os.environ,
+        "OPENAI_API_KEY": "fake-smoke-key",
+        "PROTOAGENT_CONFIG_DIR": str(cfg_dir),
+        "PROTOAGENT_INSTANCE": "cismoke",
+        "PROTOAGENT_HEADLESS_SETUP": "1",
+        "PYTHONPATH": str(ROOT),
+    }
+
+    fake = subprocess.Popen([sys.executable, str(ROOT / "scripts" / "fake_openai_server.py"), str(fake_port)])
+    agent = subprocess.Popen(
+        [sys.executable, "-m", "server", "--ui", "none", "--port", str(agent_port)],
+        cwd=str(ROOT), env=env,
+    )
+    try:
+        if not _wait_healthz(agent_port):
+            print("FAIL: /healthz never returned 200 (server did not become ready)")
+            return 1
+        print("ok: /healthz 200 (lean server booted + graph compiled)")
+
+        # Agent card serves + has identity.
+        with urllib.request.urlopen(f"http://127.0.0.1:{agent_port}/.well-known/agent-card.json", timeout=5) as r:
+            card = json.loads(r.read())
+        assert card.get("name"), "agent card has no name"
+        assert card.get("skills"), "agent card has no skills"
+        print(f"ok: agent card serves (name={card['name']}, skills={[s.get('id') for s in card['skills']]})")
+
+        # Real A2A streaming turn over the actual transport.
+        body = json.dumps({
+            "jsonrpc": "2.0", "id": "smoke", "method": "SendStreamingMessage",
+            "params": {"message": {"role": "ROLE_USER", "parts": [{"text": "ping"}],
+                                   "messageId": "m1", "contextId": "smoke"}},
+        }).encode()
+        req = urllib.request.Request(
+            f"http://127.0.0.1:{agent_port}/a2a", data=body,
+            headers={"A2A-Version": "1.0", "Content-Type": "application/json"},
+        )
+        with urllib.request.urlopen(req, timeout=60) as r:
+            raw = r.read().decode("utf-8", "replace")
+
+        assert "data:" in raw, f"no SSE data frames in response: {raw[:300]!r}"
+        terminal = ("COMPLETED" in raw or "live smoke ok" in raw or '"artifact' in raw.lower())
+        assert terminal, f"no terminal/answer frame; first 600 chars: {raw[:600]!r}"
+        print("ok: A2A SendStreamingMessage turn decoded + reached a terminal frame")
+        print("\nLIVE SMOKE PASSED ✓")
+        return 0
+    except Exception as e:  # noqa: BLE001 — smoke must report, not traceback-crash
+        print(f"FAIL: {type(e).__name__}: {e}")
+        return 1
+    finally:
+        for p in (agent, fake):
+            p.terminate()
+            try:
+                p.wait(timeout=5)
+            except Exception:
+                p.kill()
+
+
+if __name__ == "__main__":
+    sys.exit(main())

From 39f840741f0a6e1526e138e19a6a535308eda8ef Mon Sep 17 00:00:00 2001
From: Josh Mabry <31560031+mabry1985@users.noreply.github.com>
Date: Fri, 5 Jun 2026 22:22:20 -0700
Subject: [PATCH 09/20] ci: enforce ruff + clean dead imports/vars (lint gate)
 (#587)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Prod-readiness audit P1 — no static analysis was enforced (no ruff/mypy in
pyproject.toml or CI), and the untyped wire boundaries are where the recent
CRLF/tool-call bugs lived.

- pyproject.toml [tool.ruff]: select E/F/W; ignore the stylistic rules the
  codebase intentionally uses (E402 lazy imports, E501 long comment lines,
  E702/E731 semicolon/lambda style); per-file-ignore F401 in **/__init__.py
  (the re-export surfaces, esp. server/__init__.py).
- Cleaned the real signals it surfaced: 45 unused imports auto-removed across the
  tree + 4 dead assignments (incl. a stale old_config the Discord-reconnect
  detection never used). One F841 false-positive (parenthesized multi-with)
  noqa'd with a note.
- checks.yml: new 'Lint (ruff)' job, ruff pinned so a release can't fail the gate.

ruff check . is clean; 1049 tests green.

Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
---
 .github/workflows/checks.yml                  | 16 ++++++++++++
 audit.py                                      |  1 -
 chat_ui.py                                    |  2 +-
 evals/client.py                               |  1 -
 evals/verify.py                               |  1 -
 graph/config_io.py                            |  1 -
 graph/middleware/audit.py                     |  2 --
 graph/middleware/knowledge.py                 |  1 -
 graph/middleware/memory.py                    |  1 -
 graph/middleware/message_capture.py           |  1 -
 graph/skills/curator.py                       |  2 --
 pyproject.toml                                | 26 +++++++++++++++++++
 scheduler/workstacean.py                      |  1 -
 server/agent_init.py                          |  3 ---
 server/chat.py                                |  1 -
 .../test_audit_redaction_integration.py       |  8 +++---
 tests/middleware/test_redaction.py            |  1 -
 tests/test_a2a_handler.py                     |  1 -
 tests/test_cache_warmer.py                    |  1 -
 tests/test_config_io.py                       |  2 +-
 tests/test_discord_gateway.py                 |  1 -
 tests/test_eval_sweep.py                      |  2 --
 tests/test_eval_webhook.py                    |  1 -
 tests/test_hot_memory.py                      |  1 -
 tests/test_hybrid_store.py                    |  1 -
 tests/test_memory_load.py                     |  5 +---
 tests/test_memory_persistence.py              |  4 ---
 tests/test_model_validation.py                |  1 -
 tests/test_shell.py                           |  2 +-
 tests/test_skill_curator.py                   |  5 ----
 tests/test_skill_emission.py                  |  2 +-
 tests/test_skill_index.py                     |  2 --
 tests/utils/check_audit_clean.py              |  1 -
 33 files changed, 50 insertions(+), 51 deletions(-)

diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml
index 620572d..314a48a 100644
--- a/.github/workflows/checks.yml
+++ b/.github/workflows/checks.yml
@@ -48,6 +48,22 @@ jobs:
           fi
           echo "ok: no stale single-file launches in deploy artifacts"
 
+  lint:
+    name: Lint (ruff)
+    runs-on: namespace-profile-protolabs-linux
+    timeout-minutes: 5
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+      - name: Ruff check
+        # Pinned so a new ruff release can't add a rule that fails the gate
+        # out from under a PR. Config + ignores live in pyproject.toml.
+        run: |
+          pip install ruff==0.15.10
+          ruff check .
+
   tests:
     name: Python tests
     runs-on: namespace-profile-protolabs-linux
diff --git a/audit.py b/audit.py
index 420d3ce..d2a0c07 100644
--- a/audit.py
+++ b/audit.py
@@ -5,7 +5,6 @@
 """
 
 import json
-import os
 from datetime import datetime, timezone
 from pathlib import Path
 from typing import Any
diff --git a/chat_ui.py b/chat_ui.py
index ba832e3..56414b2 100644
--- a/chat_ui.py
+++ b/chat_ui.py
@@ -476,7 +476,7 @@ def _build() -> gr.Blocks:
                                 label="Launch this agent automatically on login",
                                 interactive=True,
                             )
-                            autostart_drawer_status = gr.Markdown("")
+                            gr.Markdown("")
 
                         with gr.Accordion("Persona (SOUL.md)", open=False):
                             soul_in = gr.Textbox(
diff --git a/evals/client.py b/evals/client.py
index effe8f3..b01d5d7 100644
--- a/evals/client.py
+++ b/evals/client.py
@@ -34,7 +34,6 @@
 import time
 import uuid
 from dataclasses import dataclass, field
-from typing import Any
 
 import httpx
 
diff --git a/evals/verify.py b/evals/verify.py
index 447aadc..94cc153 100644
--- a/evals/verify.py
+++ b/evals/verify.py
@@ -25,7 +25,6 @@
 import os
 from datetime import datetime, timezone
 from pathlib import Path
-from typing import Any
 
 log = logging.getLogger(__name__)
 
diff --git a/graph/config_io.py b/graph/config_io.py
index 1bbe20b..7fb1424 100644
--- a/graph/config_io.py
+++ b/graph/config_io.py
@@ -27,7 +27,6 @@
 
 import logging
 import os
-from io import StringIO
 from pathlib import Path
 from typing import Any
 
diff --git a/graph/middleware/audit.py b/graph/middleware/audit.py
index 272156c..0e52aff 100644
--- a/graph/middleware/audit.py
+++ b/graph/middleware/audit.py
@@ -5,11 +5,9 @@
 """
 
 import time
-from typing import Any, Callable
 
 from langchain.agents.middleware import AgentMiddleware
 
-from langgraph.prebuilt.chat_agent_executor import AgentState
 
 from graph.middleware.redaction import redact
 
diff --git a/graph/middleware/knowledge.py b/graph/middleware/knowledge.py
index 40a411d..6184dc7 100644
--- a/graph/middleware/knowledge.py
+++ b/graph/middleware/knowledge.py
@@ -16,7 +16,6 @@
 from langchain.agents.middleware import AgentMiddleware
 from langchain_core.messages import AIMessage, HumanMessage
 
-from langgraph.prebuilt.chat_agent_executor import AgentState
 
 if TYPE_CHECKING:
     from graph.skills.index import SkillRecord, SkillsIndex
diff --git a/graph/middleware/memory.py b/graph/middleware/memory.py
index 5ddc5b5..140b795 100644
--- a/graph/middleware/memory.py
+++ b/graph/middleware/memory.py
@@ -17,7 +17,6 @@
 from langchain.agents.middleware import AgentMiddleware
 from langchain_core.messages import AIMessage, HumanMessage, ToolMessage
 
-from langgraph.prebuilt.chat_agent_executor import AgentState
 
 
 log = logging.getLogger(__name__)
diff --git a/graph/middleware/message_capture.py b/graph/middleware/message_capture.py
index 8300da4..0ce7e74 100644
--- a/graph/middleware/message_capture.py
+++ b/graph/middleware/message_capture.py
@@ -6,7 +6,6 @@
 """
 
 from langchain.agents.middleware import AgentMiddleware
-from langchain_core.messages import ToolMessage
 
 class MessageCaptureMiddleware(AgentMiddleware):
     """Intercept message() tool calls and capture their content."""
diff --git a/graph/skills/curator.py b/graph/skills/curator.py
index 51bddc8..042fd9b 100644
--- a/graph/skills/curator.py
+++ b/graph/skills/curator.py
@@ -74,7 +74,6 @@
 import os
 import uuid
 from datetime import datetime, timezone
-from typing import Any
 
 log = logging.getLogger(__name__)
 
@@ -333,7 +332,6 @@ def _build_similarity_matrix(
             texts = [_skill_text(s) for s in skills]
             embeddings = model.encode(texts, normalize_embeddings=True)
             # Cosine similarity via dot product (embeddings are normalised)
-            import numpy as np  # type: ignore
 
             matrix = (embeddings @ embeddings.T).tolist()
             log.debug("[curator] similarity matrix built with sentence-transformers")
diff --git a/pyproject.toml b/pyproject.toml
index 1b32861..fc6b748 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -8,3 +8,29 @@ requires-python = ">=3.11"
 asyncio_mode = "auto"
 testpaths = ["tests"]
 pythonpath = ["."]
+
+[tool.ruff]
+line-length = 120
+target-version = "py311"
+extend-exclude = [".venv", "apps", "build", "dist", "*.egg-info"]
+
+[tool.ruff.lint]
+# pyflakes (F) catches the real bugs — unused/undefined names; pycodestyle (E/W)
+# + isort (I) for hygiene. The stylistic rules the codebase intentionally and
+# pervasively uses are ignored rather than churned: lazy/late imports (E402),
+# semicolon/lambda style (E702/E731), and the bare-except guard form (E722 is
+# kept; bare `except:` is genuinely banned here).
+select = ["E", "F", "W"]
+ignore = [
+  "E402",  # module-level import not at top — late/lazy imports are idiomatic here
+  "E501",  # line-too-long — long, readable comment lines are intentional
+  "E702",  # multiple statements on one line (semicolon) — used deliberately
+  "E731",  # lambda assignment — used deliberately in a few places
+  "E741",  # ambiguous var name (l/O/I) — occasional, harmless
+]
+
+[tool.ruff.lint.per-file-ignores]
+# Package __init__ files re-export submodule symbols (esp. server/__init__.py,
+# which re-exports the whole public surface for the test suite) — F401 there is
+# intentional, not dead code.
+"**/__init__.py" = ["F401"]
diff --git a/scheduler/workstacean.py b/scheduler/workstacean.py
index 56df684..454a23e 100644
--- a/scheduler/workstacean.py
+++ b/scheduler/workstacean.py
@@ -26,7 +26,6 @@
 from __future__ import annotations
 
 import logging
-import os
 import uuid
 from typing import Any
 
diff --git a/server/agent_init.py b/server/agent_init.py
index 61695a4..3dc4212 100644
--- a/server/agent_init.py
+++ b/server/agent_init.py
@@ -841,9 +841,6 @@ def _reload_langgraph_agent() -> tuple[bool, str]:
         new_workflow_registry = None
         new_inbox_store = None
 
-    # Capture the outgoing config before the swap so we can tell whether the
-    # Discord surface needs a live reconnect (token/admin/enabled changed).
-    old_config = STATE.graph_config
     # Commit: config → A2A bearer → graph. All three reference the
     # same ``new_config`` so they stay consistent.
     STATE.graph_config = new_config
diff --git a/server/chat.py b/server/chat.py
index 90676b9..0613eee 100644
--- a/server/chat.py
+++ b/server/chat.py
@@ -428,7 +428,6 @@ async def _chat_langgraph_stream(
     it (e.g. per-project working memory) without editing this file.
     """
     import tracing
-    from langchain_core.messages import HumanMessage
 
     from graph.goals.goal_turn import goal_turn
 
diff --git a/tests/middleware/test_audit_redaction_integration.py b/tests/middleware/test_audit_redaction_integration.py
index b9b31c4..b378464 100644
--- a/tests/middleware/test_audit_redaction_integration.py
+++ b/tests/middleware/test_audit_redaction_integration.py
@@ -1,9 +1,7 @@
 """Integration tests: verify redaction is applied in AuditMiddleware flow."""
 
 import json
-import tempfile
-from pathlib import Path
-from unittest.mock import AsyncMock, MagicMock, patch
+from unittest.mock import MagicMock, patch
 
 import pytest
 
@@ -107,7 +105,7 @@ def fake_trace(tool_name, args, result, duration_ms, success, session_id):
             captured_trace_args.update(args)
 
         with (
-            patch("audit.audit_logger") as fake_audit,
+            patch("audit.audit_logger") as fake_audit,  # noqa: F841 (used below; ruff misreads the parenthesized with)
             patch("tracing.current_session_id", return_value="sess-3"),
             patch("tracing.trace_tool_call", side_effect=fake_trace),
             patch("metrics.record_tool_call"),
@@ -200,7 +198,7 @@ async def raising_handler(r):
             raise ValueError("tool failure")
 
         with (
-            patch("audit.audit_logger") as fake_audit,
+            patch("audit.audit_logger") as fake_audit,  # noqa: F841 (used below; ruff misreads the parenthesized with)
             patch("tracing.current_session_id", return_value="sess-exc"),
             patch("tracing.trace_tool_call"),
             patch("metrics.record_tool_call"),
diff --git a/tests/middleware/test_redaction.py b/tests/middleware/test_redaction.py
index 15895c9..dc60a52 100644
--- a/tests/middleware/test_redaction.py
+++ b/tests/middleware/test_redaction.py
@@ -1,6 +1,5 @@
 """Unit tests for graph.middleware.redaction module."""
 
-import pytest
 
 from graph.middleware.redaction import redact, PATTERNS
 
diff --git a/tests/test_a2a_handler.py b/tests/test_a2a_handler.py
index 3cec36c..c2b000e 100644
--- a/tests/test_a2a_handler.py
+++ b/tests/test_a2a_handler.py
@@ -27,7 +27,6 @@
 
 import httpx
 import pytest
-from a2a.server.context import ServerCallContext
 from a2a.server.request_handlers import DefaultRequestHandler
 from a2a.server.routes.agent_card_routes import create_agent_card_routes
 from a2a.server.routes.fastapi_routes import add_a2a_routes_to_fastapi
diff --git a/tests/test_cache_warmer.py b/tests/test_cache_warmer.py
index 70c5b37..8ec1f4e 100644
--- a/tests/test_cache_warmer.py
+++ b/tests/test_cache_warmer.py
@@ -9,7 +9,6 @@
 
 import pytest
 
-import graph.cache_warmer as cw
 from graph.cache_warmer import CacheWarmer
 from graph.config import LangGraphConfig
 
diff --git a/tests/test_config_io.py b/tests/test_config_io.py
index 6358787..f459b09 100644
--- a/tests/test_config_io.py
+++ b/tests/test_config_io.py
@@ -17,7 +17,7 @@
 from __future__ import annotations
 
 from pathlib import Path
-from unittest.mock import MagicMock, patch
+from unittest.mock import MagicMock
 
 import httpx
 import pytest
diff --git a/tests/test_discord_gateway.py b/tests/test_discord_gateway.py
index 0a48c52..1fdb0f4 100644
--- a/tests/test_discord_gateway.py
+++ b/tests/test_discord_gateway.py
@@ -9,7 +9,6 @@
 
 from __future__ import annotations
 
-import asyncio
 
 import pytest
 
diff --git a/tests/test_eval_sweep.py b/tests/test_eval_sweep.py
index ba62ad6..cd3b50f 100644
--- a/tests/test_eval_sweep.py
+++ b/tests/test_eval_sweep.py
@@ -8,9 +8,7 @@
 from __future__ import annotations
 
 import json
-import os
 
-import pytest
 
 from evals.report import build_report
 from evals.runner import CaseResult, _save_report
diff --git a/tests/test_eval_webhook.py b/tests/test_eval_webhook.py
index f206274..494891f 100644
--- a/tests/test_eval_webhook.py
+++ b/tests/test_eval_webhook.py
@@ -1,6 +1,5 @@
 """Tests for the eval webhook listener."""
 
-import json
 
 import httpx
 import pytest
diff --git a/tests/test_hot_memory.py b/tests/test_hot_memory.py
index d533e61..4073757 100644
--- a/tests/test_hot_memory.py
+++ b/tests/test_hot_memory.py
@@ -1,6 +1,5 @@
 """Tests for hot-memory (always-on domain='hot' facts)."""
 
-from unittest.mock import MagicMock
 
 from langchain_core.messages import HumanMessage
 
diff --git a/tests/test_hybrid_store.py b/tests/test_hybrid_store.py
index 8f4064d..d26328c 100644
--- a/tests/test_hybrid_store.py
+++ b/tests/test_hybrid_store.py
@@ -2,7 +2,6 @@
 
 import sqlite3
 
-import pytest
 
 from knowledge.hybrid_store import HybridKnowledgeStore
 
diff --git a/tests/test_memory_load.py b/tests/test_memory_load.py
index c42d081..9173eb9 100644
--- a/tests/test_memory_load.py
+++ b/tests/test_memory_load.py
@@ -15,11 +15,8 @@
 
 import json
 import os
-import tempfile
-from pathlib import Path
-from unittest.mock import MagicMock, patch
+from unittest.mock import MagicMock
 
-import pytest
 
 
 # ---------------------------------------------------------------------------
diff --git a/tests/test_memory_persistence.py b/tests/test_memory_persistence.py
index 6aab5da..ab55c97 100644
--- a/tests/test_memory_persistence.py
+++ b/tests/test_memory_persistence.py
@@ -13,12 +13,9 @@
 
 from __future__ import annotations
 
-import importlib
 import json
 import os
 import sys
-import tempfile
-from pathlib import Path
 from unittest.mock import MagicMock, patch
 
 import pytest
@@ -160,7 +157,6 @@ def test_atomic_write_no_partial_file_on_error(tmp_path):
     state = _make_state("atomic-test")
     dest = tmp_path / "atomic-test.json"
 
-    original_json_dump = json.dump
 
     def bad_dump(*args, **kwargs):
         raise OSError("simulated write error")
diff --git a/tests/test_model_validation.py b/tests/test_model_validation.py
index 6a40864..d58c33c 100644
--- a/tests/test_model_validation.py
+++ b/tests/test_model_validation.py
@@ -7,7 +7,6 @@
 
 from __future__ import annotations
 
-import contextlib
 
 from graph import config_io
 
diff --git a/tests/test_shell.py b/tests/test_shell.py
index b55cf61..3048bf7 100644
--- a/tests/test_shell.py
+++ b/tests/test_shell.py
@@ -2,7 +2,7 @@
 
 import pytest
 
-from tools.shell import ShellResult, run_command
+from tools.shell import run_command
 
 
 @pytest.mark.asyncio
diff --git a/tests/test_skill_curator.py b/tests/test_skill_curator.py
index 8c2fb43..e07e4b5 100644
--- a/tests/test_skill_curator.py
+++ b/tests/test_skill_curator.py
@@ -11,22 +11,17 @@
 from __future__ import annotations
 
 import json
-import math
 import os
-import tempfile
 import uuid
 from datetime import datetime, timedelta, timezone
 
-import pytest
 
 from graph.skills.curator import (
-    HALF_LIFE_DAYS,
     PRUNE_THRESHOLD,
     SIMILARITY_THRESHOLD,
     SkillCurator,
     _clamp,
     _jaccard,
-    _parse_iso,
 )
 
 
diff --git a/tests/test_skill_emission.py b/tests/test_skill_emission.py
index 8a5e775..865d195 100644
--- a/tests/test_skill_emission.py
+++ b/tests/test_skill_emission.py
@@ -14,7 +14,7 @@
 
 import logging
 from datetime import datetime, timezone
-from unittest.mock import AsyncMock, MagicMock, patch
+from unittest.mock import MagicMock
 
 import pytest
 from langchain_core.messages import AIMessage
diff --git a/tests/test_skill_index.py b/tests/test_skill_index.py
index b87b790..01984e9 100644
--- a/tests/test_skill_index.py
+++ b/tests/test_skill_index.py
@@ -16,10 +16,8 @@
 
 import os
 import sqlite3
-import tempfile
 from dataclasses import dataclass, field
 from datetime import datetime, timezone
-from typing import Any
 from unittest.mock import MagicMock
 
 import pytest
diff --git a/tests/utils/check_audit_clean.py b/tests/utils/check_audit_clean.py
index 0c145ea..fcad09e 100644
--- a/tests/utils/check_audit_clean.py
+++ b/tests/utils/check_audit_clean.py
@@ -10,7 +10,6 @@
 
 from __future__ import annotations
 
-import json
 import re
 import sys
 from pathlib import Path

From a6cf03b066ddce6fa5ca628975eb100c2e045b8f Mon Sep 17 00:00:00 2001
From: Josh Mabry <31560031+mabry1985@users.noreply.github.com>
Date: Fri, 5 Jun 2026 22:29:36 -0700
Subject: [PATCH 10/20] obs: audit log rotation + instance-scoping + tail
 reads; broaden redaction (#588)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Prod-readiness audit P1/P2 (observability + data integrity).

audit.py:
- Instance-scope the path via paths.scope_leaf (ADR 0004) — two instances on a
  shared FS no longer interleave records into one file. Path resolves LAZILY so
  it picks up PROTOAGENT_INSTANCE (seeded after import).
- Rotate at a 50MB cap (keep one .1 backup) — the log can't fill the disk.
- get_recent reads only the last 512KB (tail), not the whole file → no OOM on a
  large log.
- Cap _session_stats (OrderedDict, evict oldest) — no slow memory leak.

redaction.py:
- Add provider token shapes that a tool error could echo into the audit log:
  Discord bot, Google OAuth (ya29.) + API key (AIza), GitHub (gh?_), Slack (xox?-),
  AWS (AKIA), client_secret; + DISCORD_BOT_TOKEN/GATEWAY_API_KEY/GH_PAT/refresh_token
  to the env-key + assignment lists. Wired into _redact_string_simple.

Verified: benign text untouched (no over-redaction); 1054 tests green; ruff clean.

Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
---
 audit.py                           | 122 ++++++++++++++++++++++-------
 graph/middleware/redaction.py      |  33 +++++++-
 tests/middleware/test_redaction.py |  23 ++++++
 tests/test_audit_hardening.py      |  51 ++++++++++++
 4 files changed, 199 insertions(+), 30 deletions(-)
 create mode 100644 tests/test_audit_hardening.py

diff --git a/audit.py b/audit.py
index d2a0c07..e5e5326 100644
--- a/audit.py
+++ b/audit.py
@@ -1,32 +1,71 @@
 """Audit logging for protoAgent tool executions.
 
-Writes JSONL entries to /sandbox/audit/audit.jsonl with tool call metadata.
-Enhanced with Langfuse trace context for cross-referencing.
+Append-only JSONL of tool-call metadata, enriched with Langfuse trace context for
+cross-referencing. The path is **instance-scoped** (ADR 0004) and resolved lazily
+so it picks up ``PROTOAGENT_INSTANCE`` (seeded during boot, after this module is
+imported). The file **rotates** at a size cap so a busy agent can't fill the disk,
+and ``get_recent`` reads only a bounded tail so a large log can't OOM a read.
 """
 
 import json
+import os
+from collections import OrderedDict
 from datetime import datetime, timezone
 from pathlib import Path
 from typing import Any
 
+# Rotate the JSONL when it crosses this; keep one ``.1`` backup. Reads only ever
+# touch the live file's tail.
+_MAX_BYTES = 50 * 1024 * 1024      # 50 MB
+_TAIL_BYTES = 512 * 1024           # get_recent reads at most the last 512 KB
+_MAX_SESSIONS = 1000               # cap the in-memory per-session stats dict
+
+_DEFAULT_LEAF = Path("/sandbox") / "audit" / "audit.jsonl"
 
-class AuditLogger:
-    """Append-only JSONL audit log for tool executions."""
 
-    def __init__(self, path: str | Path = "/sandbox/audit/audit.jsonl"):
-        self.path = Path(path)
+class AuditLogger:
+    """Append-only JSONL audit log for tool executions (rotating, instance-scoped)."""
+
+    def __init__(self, path: str | Path | None = None):
+        # Configured base path; the real (instance-scoped, writable) path is
+        # resolved on first use so PROTOAGENT_INSTANCE is already seeded.
+        self._base = Path(path) if path else _DEFAULT_LEAF
+        self._resolved: Path | None = None
+        self._session_stats: "OrderedDict[str, dict]" = OrderedDict()
+
+    def _ensure_path(self) -> Path | None:
+        """Resolve + create the instance-scoped path, with the standard
+        ``/sandbox`` → ``~/.protoagent`` writable fallback. Memoized. ``None`` if
+        nothing is writable (audit then degrades to a no-op)."""
+        if self._resolved is not None:
+            return self._resolved
+        from paths import scope_leaf  # ADR 0004 — per-instance scoping (no-op when unset)
+
+        candidate = scope_leaf(self._base)
         try:
-            self.path.parent.mkdir(parents=True, exist_ok=True)
+            candidate.parent.mkdir(parents=True, exist_ok=True)
+            self._resolved = candidate
+            return candidate
         except OSError:
-            # /sandbox isn't writable (non-root CI runner, local `python
-            # server.py`, etc.) — fall back to a per-user path so audit
-            # logging degrades gracefully instead of crashing at import.
-            self.path = Path.home() / ".protoagent" / "audit" / self.path.name
+            fb = scope_leaf(Path.home() / ".protoagent" / "audit" / candidate.name)
             try:
-                self.path.parent.mkdir(parents=True, exist_ok=True)
+                fb.parent.mkdir(parents=True, exist_ok=True)
             except OSError:
-                pass
-        self._session_stats: dict[str, dict] = {}
+                return None
+            self._resolved = fb
+            return fb
+
+    @property
+    def path(self) -> Path:
+        """The resolved on-disk path (for callers/tests that read it directly)."""
+        return self._ensure_path() or scope_leaf_safe(self._base)
+
+    def _maybe_rotate(self, path: Path) -> None:
+        try:
+            if path.exists() and path.stat().st_size > _MAX_BYTES:
+                path.replace(path.with_suffix(path.suffix + ".1"))  # overwrite the single backup
+        except OSError:
+            pass
 
     def log(
         self,
@@ -57,32 +96,48 @@ def log(
         if trace_id:
             entry["trace_id"] = trace_id
 
-        try:
-            with self.path.open("a") as f:
-                f.write(json.dumps(entry, default=str) + "\n")
-        except OSError:
-            pass
+        path = self._ensure_path()
+        if path is not None:
+            self._maybe_rotate(path)
+            try:
+                with path.open("a") as f:
+                    f.write(json.dumps(entry, default=str) + "\n")
+            except OSError:
+                pass
 
-        stats = self._session_stats.setdefault(session_id, {
-            "tool_calls": 0, "successes": 0, "failures": 0, "total_ms": 0,
-            "tools_used": set(),
-        })
+        stats = self._session_stats.get(session_id)
+        if stats is None:
+            stats = {"tool_calls": 0, "successes": 0, "failures": 0, "total_ms": 0, "tools_used": set()}
+            self._session_stats[session_id] = stats
+            # Cap the dict — evict the oldest sessions so a long-lived process
+            # serving many sessions doesn't leak memory.
+            while len(self._session_stats) > _MAX_SESSIONS:
+                self._session_stats.popitem(last=False)
+        else:
+            self._session_stats.move_to_end(session_id)
         stats["tool_calls"] += 1
         stats["successes" if success else "failures"] += 1
         stats["total_ms"] += duration_ms
         stats["tools_used"].add(tool)
 
-    def get_recent(
-        self, n: int = 20, session_id: str | None = None
-    ) -> list[dict[str, Any]]:
-        if not self.path.exists():
+    def get_recent(self, n: int = 20, session_id: str | None = None) -> list[dict[str, Any]]:
+        path = self._ensure_path()
+        if path is None or not path.exists():
             return []
         try:
-            lines = self.path.read_text().strip().splitlines()
+            with path.open("rb") as f:
+                f.seek(0, os.SEEK_END)
+                size = f.tell()
+                f.seek(max(0, size - _TAIL_BYTES))
+                blob = f.read()
         except OSError:
             return []
+        # Drop a possibly-partial first line when we didn't read from the start.
+        lines = blob.decode("utf-8", "replace").splitlines()
+        if size > _TAIL_BYTES and lines:
+            lines = lines[1:]
 
-        entries = []
+        entries: list[dict[str, Any]] = []
         for line in reversed(lines):
             if not line.strip():
                 continue
@@ -112,6 +167,15 @@ def get_session_stats(self, session_id: str) -> dict[str, Any]:
         }
 
 
+def scope_leaf_safe(p: Path) -> Path:
+    """``scope_leaf`` without raising — used only for the ``.path`` fallback."""
+    try:
+        from paths import scope_leaf
+        return scope_leaf(p)
+    except Exception:
+        return p
+
+
 def _sanitize_args(args: dict[str, Any]) -> dict[str, Any]:
     sanitized = {}
     for k, v in args.items():
diff --git a/graph/middleware/redaction.py b/graph/middleware/redaction.py
index 33e2b46..5a08e50 100644
--- a/graph/middleware/redaction.py
+++ b/graph/middleware/redaction.py
@@ -23,10 +23,21 @@
     "generic_api_key": re.compile(
         r"(?i)(?:api[_\-]?key|apikey)(?:[\"'\s:=]+)([A-Za-z0-9_\-]{16,})",
     ),
+    # Provider token shapes a tool error / payload might echo into the audit log.
+    "google_oauth": re.compile(r"\bya29\.[A-Za-z0-9_\-]+"),
+    "google_api_key": re.compile(r"\bAIza[A-Za-z0-9_\-]{20,}\b"),
+    "github_token": re.compile(r"\bgh[pousr]_[A-Za-z0-9]{30,}\b"),
+    "slack_token": re.compile(r"\bxox[baprs]-[A-Za-z0-9\-]{10,}\b"),
+    "aws_access_key": re.compile(r"\bAKIA[0-9A-Z]{16}\b"),
+    "discord_token": re.compile(r"\b[MNO][A-Za-z0-9_\-]{23}\.[A-Za-z0-9_\-]{6}\.[A-Za-z0-9_\-]{27,}\b"),
+    "client_secret": re.compile(
+        r"(?i)client[_\-]?secret(?:[\"'\s:=]+)([A-Za-z0-9_\-]{12,})",
+    ),
     "env_var_assignment": re.compile(
         r"(?i)\b(OPENAI_API_KEY|LANGFUSE_SECRET_KEY|LANGFUSE_PUBLIC_KEY|"
         r"A2A_AUTH_TOKEN|API_KEY|SECRET_KEY|AUTH_TOKEN|ACCESS_TOKEN|"
-        r"PRIVATE_KEY)\s*[=:]\s*\S+",
+        r"PRIVATE_KEY|DISCORD_BOT_TOKEN|BOT_TOKEN|GATEWAY_API_KEY|GH_PAT|"
+        r"CLIENT_SECRET)\s*[=:]\s*\S+",
     ),
 }
 
@@ -49,6 +60,14 @@
     "auth_token",
     "access_token",
     "private_key",
+    "DISCORD_BOT_TOKEN",
+    "bot_token",
+    "GATEWAY_API_KEY",
+    "GH_PAT",
+    "client_secret",
+    "CLIENT_SECRET",
+    "refresh_token",
+    "REFRESH_TOKEN",
 })
 
 _PLACEHOLDER = "[REDACTED]"
@@ -80,6 +99,18 @@ def _replace_env_var(m: re.Match) -> str:
         return key + _PLACEHOLDER
 
     value = PATTERNS["env_var_assignment"].sub(_replace_env_var, value)
+
+    # Provider token shapes — full-match redaction.
+    for _name in ("google_oauth", "google_api_key", "github_token",
+                  "slack_token", "aws_access_key", "discord_token"):
+        value = PATTERNS[_name].sub(_PLACEHOLDER, value)
+
+    # client_secret — redact the captured credential value group.
+    def _replace_client_secret(m: re.Match) -> str:
+        full, cred = m.group(0), m.group(1)
+        return full[: len(full) - len(cred)] + _PLACEHOLDER
+
+    value = PATTERNS["client_secret"].sub(_replace_client_secret, value)
     return value
 
 
diff --git a/tests/middleware/test_redaction.py b/tests/middleware/test_redaction.py
index dc60a52..91f72cf 100644
--- a/tests/middleware/test_redaction.py
+++ b/tests/middleware/test_redaction.py
@@ -296,3 +296,26 @@ def test_langfuse_redaction():
     assert result["input"]["query"] == "get config"
     assert "mytoken1234567890" not in result["output"]
     assert "[REDACTED]" in result["output"]
+
+
+def test_provider_token_shapes_redacted():
+    """Discord / Google / GitHub / Slack / AWS / client_secret tokens that a tool
+    error could echo must not survive into the audit log (prod-readiness)."""
+    from graph.middleware.redaction import redact
+
+    # Obviously-fake values that still match the regex SHAPES (so they exercise
+    # the patterns) but aren't real tokens — avoids tripping secret scanning.
+    samples = [
+        "M" + "A" * 23 + ".AAAAAA." + "A" * 30,        # discord shape
+        "ghp_" + "A" * 36,                              # github shape (fails real checksum)
+        "ya29." + "A" * 30,                             # google oauth shape
+        "AIza" + "A" * 30,                              # google api-key shape
+        "AKIA" + "A" * 16,                              # aws shape
+        "xoxb-0000000000-" + "a" * 12,                  # slack shape
+    ]
+    for s in samples:
+        assert "[REDACTED]" in redact(s), s
+    # captured-value shape
+    assert "[REDACTED]" in redact("client_secret: " + "A" * 16)
+    # benign text is left intact (no over-redaction)
+    assert redact("the quick brown fox AKIA jumps") == "the quick brown fox AKIA jumps"
diff --git a/tests/test_audit_hardening.py b/tests/test_audit_hardening.py
new file mode 100644
index 0000000..337882e
--- /dev/null
+++ b/tests/test_audit_hardening.py
@@ -0,0 +1,51 @@
+"""Prod-readiness: the audit log is instance-scoped, rotates at a size cap, and
+get_recent reads only a bounded tail (no unbounded growth / full-file OOM)."""
+
+import json
+
+from audit import AuditLogger
+
+
+def _log_n(a: AuditLogger, n: int):
+    for i in range(n):
+        a.log(session_id="s", tool=f"t{i}", args={}, result_summary="ok", duration_ms=1, success=True)
+
+
+def test_writes_and_get_recent_tail(tmp_path):
+    a = AuditLogger(path=tmp_path / "audit.jsonl")
+    _log_n(a, 50)
+    recent = a.get_recent(n=10)
+    assert len(recent) == 10
+    assert recent[-1]["tool"] == "t49"  # newest last, chronological
+    assert all(set(e) >= {"ts", "tool", "session_id"} for e in recent)
+
+
+def test_rotates_at_size_cap(tmp_path, monkeypatch):
+    import audit
+    monkeypatch.setattr(audit, "_MAX_BYTES", 2_000)  # tiny cap to force a rotation
+    a = AuditLogger(path=tmp_path / "audit.jsonl")
+    _log_n(a, 200)
+    live = tmp_path / "audit.jsonl"
+    backup = tmp_path / "audit.jsonl.1"
+    assert live.exists()
+    assert backup.exists(), "should have rotated a .1 backup"
+    # the live file stays bounded (≈ one cap worth), not the full 200 lines
+    assert live.stat().st_size <= 2_000 + 4_000
+
+
+def test_session_stats_capped(tmp_path, monkeypatch):
+    import audit
+    monkeypatch.setattr(audit, "_MAX_SESSIONS", 5)
+    a = AuditLogger(path=tmp_path / "audit.jsonl")
+    for i in range(20):
+        a.log(session_id=f"sess-{i}", tool="t", args={}, result_summary="", duration_ms=1, success=True)
+    assert len(a._session_stats) <= 5  # oldest evicted
+
+
+def test_instance_scoping(tmp_path, monkeypatch):
+    # PROTOAGENT_INSTANCE namespaces the path under an instance segment.
+    monkeypatch.setenv("PROTOAGENT_INSTANCE", "inst-a")
+    a = AuditLogger(path=tmp_path / "audit.jsonl")
+    a.log(session_id="s", tool="t", args={}, result_summary="", duration_ms=1, success=True)
+    assert "inst-a" in str(a.path)
+    assert json.loads(a.path.read_text().splitlines()[0])["tool"] == "t"

From e2cdcfeb53dae51227e74d3e0175d62250019137 Mon Sep 17 00:00:00 2001
From: Josh Mabry <31560031+mabry1985@users.noreply.github.com>
Date: Fri, 5 Jun 2026 22:31:33 -0700
Subject: [PATCH 11/20] data: WAL + busy_timeout on the
 telemetry/activity/inbox/beads stores (#589)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Prod-readiness audit P2 (data integrity). These four sqlite stores opened
connections with no PRAGMAs, unlike the checkpointer + knowledge store — so under
concurrent access (scheduler tick + console read + agent write) they threw
'database is locked' instead of waiting, and got no WAL concurrency. Add
journal_mode=WAL + busy_timeout=5000 to each connect (matching graph/checkpointer).
busy_timeout especially helps beads' single shared connection.

Verified: PRAGMAs apply (journal_mode=wal, busy_timeout=5000); 65 store tests green; ruff clean.

Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
---
 activity/store.py  | 2 ++
 beads/store.py     | 2 ++
 inbox/store.py     | 2 ++
 telemetry_store.py | 2 ++
 4 files changed, 8 insertions(+)

diff --git a/activity/store.py b/activity/store.py
index b75bf18..d813c8f 100644
--- a/activity/store.py
+++ b/activity/store.py
@@ -56,6 +56,8 @@ def __init__(self, db_path: str) -> None:
 
     def _connect(self) -> sqlite3.Connection:
         db = sqlite3.connect(self.path)
+        db.execute("PRAGMA journal_mode=WAL")   # concurrent reads during writes
+        db.execute("PRAGMA busy_timeout=5000")  # wait (don't error) on lock contention
         db.row_factory = sqlite3.Row
         return db
 
diff --git a/beads/store.py b/beads/store.py
index 695cee0..cf4f1e6 100644
--- a/beads/store.py
+++ b/beads/store.py
@@ -53,6 +53,8 @@ def __init__(self, db_path: str | None = None):
         self.path.parent.mkdir(parents=True, exist_ok=True)
         self._lock = threading.Lock()
         self._conn = sqlite3.connect(str(self.path), check_same_thread=False)
+        self._conn.execute("PRAGMA journal_mode=WAL")   # concurrent reads during writes
+        self._conn.execute("PRAGMA busy_timeout=5000")  # wait (don't error) on lock contention
         self._conn.row_factory = sqlite3.Row
         self._init_schema()
 
diff --git a/inbox/store.py b/inbox/store.py
index 3ef2822..9daaae6 100644
--- a/inbox/store.py
+++ b/inbox/store.py
@@ -38,6 +38,8 @@ def __init__(self, db_path: str, *, dedup_window_s: int = 300) -> None:
 
     def _connect(self) -> sqlite3.Connection:
         db = sqlite3.connect(self.path)
+        db.execute("PRAGMA journal_mode=WAL")   # concurrent reads during writes
+        db.execute("PRAGMA busy_timeout=5000")  # wait (don't error) on lock contention
         db.row_factory = sqlite3.Row
         return db
 
diff --git a/telemetry_store.py b/telemetry_store.py
index 89d8e8b..00c4f55 100644
--- a/telemetry_store.py
+++ b/telemetry_store.py
@@ -39,6 +39,8 @@ def __init__(self, db_path: str) -> None:
 
     def _connect(self) -> sqlite3.Connection:
         db = sqlite3.connect(self.path)
+        db.execute("PRAGMA journal_mode=WAL")   # concurrent reads during writes
+        db.execute("PRAGMA busy_timeout=5000")  # wait (don't error) on lock contention
         db.row_factory = sqlite3.Row
         return db
 

From b80798f54e70c161ed9562625dfb9ccfba170dd8 Mon Sep 17 00:00:00 2001
From: Josh Mabry <31560031+mabry1985@users.noreply.github.com>
Date: Fri, 5 Jun 2026 22:37:53 -0700
Subject: [PATCH 12/20] security: bearer-gate the operator/console +
 OpenAI-compat APIs (P0) (#591)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Prod-readiness audit P0 (credential boundary — completes the console-auth fix
after the localhost-default bind). The A2A guard only covered /a2a, so /api/*
(run subagents, rewrite config/SOUL, schedule jobs), /api/chat, and /v1/* were
unauthenticated even when an A2A token was configured.

- a2a_auth: guard /a2a + /api/ + /v1/ (engages only when a token is set — the
  no-token default stays open so the local console works). Exempt the read-only
  /api/events SSE stream (EventSource can't send a bearer; it exposes no action).
  /healthz, /.well-known, /metrics, static /app are outside the prefixes → public.
- console (api.ts): send Authorization: Bearer from localStorage
  protoagent.authToken on every fetch-based API + A2A call (blank ⇒ none), so a
  token-protected deployment's console authenticates. Default (no token) unchanged.
- docs: env-vars auth section documents the scope + the console token.

Verified: 11 auth tests (incl. /api+/v1 guarded when token set, /api/events +
/healthz public, all-open when no token); web build + chat e2e green; LIVE-smoked
a booted server with A2A_AUTH_TOKEN — /api/runtime/status 401 without / 200 with,
/a2a 401 without, /healthz 200.

Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
---
 a2a_auth.py                             | 19 +++++++++--
 apps/web/src/lib/api.ts                 | 25 ++++++++++++--
 docs/reference/environment-variables.md |  8 +++--
 tests/test_a2a_auth.py                  | 45 +++++++++++++++++++++++++
 4 files changed, 89 insertions(+), 8 deletions(-)

diff --git a/a2a_auth.py b/a2a_auth.py
index 09ede00..76b05c0 100644
--- a/a2a_auth.py
+++ b/a2a_auth.py
@@ -35,8 +35,18 @@
 # Allowed origins: None = verification disabled; list = allowlist.
 _ALLOWED_ORIGINS: list[list[str] | None] = [None]
 
-# Path prefix the guard applies to. The agent card + health are public.
-_GUARDED_PREFIX = "/a2a"
+# Path prefixes the guard applies to: the A2A JSON-RPC surface plus the operator
+# console + OpenAI-compat APIs (which drive subagents, rewrite config/SOUL,
+# schedule jobs, and run turns). The agent card, /healthz, /metrics, and the
+# static console assets live OUTSIDE these prefixes and stay public.
+_GUARDED_PREFIXES = ("/a2a", "/api/", "/v1/")
+
+# Exempt from the guard: the read-only Server-Sent-Events stream. Browsers'
+# EventSource cannot set an Authorization header, so a bearer can't be presented
+# here — and it only exposes activity/inbox events, not any action. The
+# agent-driving endpoints (/api/subagents/run, /api/config, /api/chat, /a2a, …)
+# stay guarded.
+_GUARD_EXEMPT = ("/api/events",)
 
 
 def set_bearer_token(token: str | None) -> None:
@@ -84,7 +94,10 @@ class A2AAuthMiddleware(BaseHTTPMiddleware):
     """Enforces bearer / X-API-Key / origin on the guarded A2A path."""
 
     async def dispatch(self, request: Request, call_next):
-        if not request.url.path.startswith(_GUARDED_PREFIX):
+        path = request.url.path
+        if not any(path.startswith(p) for p in _GUARDED_PREFIXES):
+            return await call_next(request)
+        if any(path.startswith(p) for p in _GUARD_EXEMPT):
             return await call_next(request)
 
         # X-API-Key (legacy) — enforced only when configured.
diff --git a/apps/web/src/lib/api.ts b/apps/web/src/lib/api.ts
index fabacbc..731103c 100644
--- a/apps/web/src/lib/api.ts
+++ b/apps/web/src/lib/api.ts
@@ -132,8 +132,27 @@ export function isDesktopWebview(): boolean {
   }
 }
 
+/** Operator bearer token, set in localStorage (`protoagent.authToken`). Sent on
+ * every fetch-based API + A2A call so a token-configured deployment's console
+ * authenticates against the server guard. Blank ⇒ no header — the default
+ * local/desktop case (no token) stays open. (The `/api/events` EventSource is
+ * exempt server-side since EventSource can't set headers.) */
+export function authToken(): string {
+  try {
+    return window.localStorage.getItem("protoagent.authToken") || "";
+  } catch {
+    return "";
+  }
+}
+
+function applyAuth(headers: Headers): Headers {
+  const t = authToken();
+  if (t) headers.set("Authorization", `Bearer ${t}`);
+  return headers;
+}
+
 async function request<T>(path: string, options: RequestOptions = {}): Promise<T> {
-  const headers = new Headers(options.headers);
+  const headers = applyAuth(new Headers(options.headers));
   let body: BodyInit | undefined;
   if (options.body !== undefined) {
     headers.set("Content-Type", "application/json");
@@ -578,7 +597,7 @@ export const api = {
       try {
         const res = await fetch(apiUrl("/api/chat"), {
           method: "POST",
-          headers: { "Content-Type": "application/json" },
+          headers: applyAuth(new Headers({ "Content-Type": "application/json" })),
           signal: handlers.signal,
           body: JSON.stringify({ message, session_id: sessionId }),
         });
@@ -608,7 +627,7 @@ export const api = {
     const rpcId = `web-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`;
     const response = await fetch(apiUrl("/a2a"), {
       method: "POST",
-      headers: { "Content-Type": "application/json", "A2A-Version": "1.0" },
+      headers: applyAuth(new Headers({ "Content-Type": "application/json", "A2A-Version": "1.0" })),
       signal: handlers.signal,
       // A2A 1.0 (a2a-sdk): the streaming RPC is `SendStreamingMessage` (0.3's
       // `message/stream` is gone → -32601 Method not found, the cause of a
diff --git a/docs/reference/environment-variables.md b/docs/reference/environment-variables.md
index 7930eef..269ba3c 100644
--- a/docs/reference/environment-variables.md
+++ b/docs/reference/environment-variables.md
@@ -39,9 +39,13 @@ Setup without the wizard: `python -m server --setup` validates the live config (
 
 | Variable | Default | What |
 |---|---|---|
-| `A2A_AUTH_TOKEN` | (unset — open mode) | Required bearer token for all A2A routes (POST `/a2a`, `message/send`, `tasks/*`, SSE streaming). When set, requests without `Authorization: Bearer <token>` get 401. Token comparison uses `hmac.compare_digest` (constant-time). |
+| `A2A_AUTH_TOKEN` | (unset — open mode) | Required bearer token for the A2A routes **and** the operator/console + OpenAI-compat APIs — `/a2a`, `/api/*`, `/api/chat`, `/v1/*`. When set, requests without `Authorization: Bearer <token>` get 401. Constant-time comparison (`hmac.compare_digest`). |
 
-When unset, the handler logs a WARNING at startup (`"A2A auth token not configured — endpoint is open"`) and accepts all traffic — appropriate for local development, not production. When set, the agent card advertises `securitySchemes.bearer` so A2A consumers know to present credentials.
+When unset, startup logs a WARNING (`"A2A auth token not configured — endpoint is open"`) and accepts all traffic — fine for local dev (and safe behind the loopback-default bind, see `PROTOAGENT_HOST`), not for an exposed deployment. When set, the agent card advertises `securitySchemes.bearer`.
+
+**Scope.** The guard covers everything that can drive the agent: `/a2a`, the operator API (`/api/*` — run subagents, rewrite config/SOUL, schedule jobs), `/api/chat`, and `/v1/*`. **Public (never guarded):** `/healthz`, `/.well-known/agent-card.json`, `/metrics`, the static console at `/app`, and the read-only `/api/events` SSE stream (browsers' `EventSource` can't send an `Authorization` header; it exposes only activity/inbox events, no action).
+
+**Console.** When a token is set, the React console reads it from `localStorage["protoagent.authToken"]` and sends it as a bearer on every API + A2A call. Set it once in the browser (`localStorage.setItem("protoagent.authToken", "<token>")`) for a token-protected deployment; local/desktop runs (no token) need nothing.
 
 ## A2A agent-card endpoint
 
diff --git a/tests/test_a2a_auth.py b/tests/test_a2a_auth.py
index 07a9cef..ce05122 100644
--- a/tests/test_a2a_auth.py
+++ b/tests/test_a2a_auth.py
@@ -96,3 +96,48 @@ def test_origin_guard_rejects_unlisted_origin():
 def test_origin_guard_disabled_when_unset():
     a2a_auth.configure(bearer_token="", api_key="", allowed_origins_raw="")
     assert _client().post("/a2a", headers={"Origin": "https://anything.example"}).status_code == 200
+
+
+# ── 3. guard covers the console + OpenAI-compat APIs (prod-readiness) ──────────
+
+
+def _client_multi() -> TestClient:
+    routes = [
+        Route(p, lambda r: PlainTextResponse("ok"), methods=["GET", "POST"])
+        for p in ("/a2a", "/api/config", "/api/events", "/v1/chat/completions", "/healthz")
+    ]
+    app = Starlette(routes=routes)
+    app.add_middleware(a2a_auth.A2AAuthMiddleware)
+    return TestClient(app)
+
+
+def test_api_and_v1_are_guarded_when_token_set(monkeypatch):
+    monkeypatch.delenv("A2A_AUTH_TOKEN", raising=False)
+    a2a_auth.configure(bearer_token="secret", api_key="", allowed_origins_raw="")
+    c = _client_multi()
+    # operator API + OpenAI-compat now require the bearer (the P0 gap)
+    assert c.post("/api/config").status_code == 401
+    assert c.post("/v1/chat/completions").status_code == 401
+    assert c.post("/a2a").status_code == 401
+    hdr = {"Authorization": "Bearer secret"}
+    assert c.post("/api/config", headers=hdr).status_code == 200
+    assert c.post("/v1/chat/completions", headers=hdr).status_code == 200
+
+
+def test_events_stream_and_healthz_stay_public_when_token_set(monkeypatch):
+    monkeypatch.delenv("A2A_AUTH_TOKEN", raising=False)
+    a2a_auth.configure(bearer_token="secret", api_key="", allowed_origins_raw="")
+    c = _client_multi()
+    # EventSource can't send a bearer; the read-only event stream is exempt.
+    assert c.get("/api/events").status_code == 200
+    # /healthz is outside the guarded prefixes (probes/scrapers stay open).
+    assert c.get("/healthz").status_code == 200
+
+
+def test_apis_open_when_no_token(monkeypatch):
+    monkeypatch.delenv("A2A_AUTH_TOKEN", raising=False)
+    a2a_auth.configure(bearer_token=None, api_key="", allowed_origins_raw="")
+    c = _client_multi()
+    # default (no token) → everything open (local console keeps working)
+    for p in ("/a2a", "/api/config", "/v1/chat/completions", "/api/events", "/healthz"):
+        assert c.post(p).status_code in (200, 405)  # 405 only if method not allowed

From 1011a4c847d670e1ab32091e03a009bc7c4f6610 Mon Sep 17 00:00:00 2001
From: Josh Mabry <31560031+mabry1985@users.noreply.github.com>
Date: Fri, 5 Jun 2026 22:41:24 -0700
Subject: [PATCH 13/20] obs: A2A turn-outcome metrics for /metrics alerting
 (#592)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Prod-readiness audit P2. metrics.py covered LLM/tool/cost/sessions but had no
A2A-turn signal — so 'turns are failing' or 'turns backing up' couldn't be
alerted from /metrics (the richest signal lived only in the telemetry SQL store).

- metrics.py: a2a_turns_total{state} counter (completed/failed/canceled — low
  cardinality) + a2a_turn_seconds latency histogram + record_a2a_turn(state, dur).
- server/a2a.py: emit it from _record_a2a_telemetry, independent of the SQL store,
  best-effort so it never affects a turn.
- tests/test_metrics.py (the audit flagged none existed).

1059 tests green; ruff clean.

Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
---
 metrics.py            | 27 +++++++++++++++++++++++++++
 server/a2a.py         |  8 ++++++++
 tests/test_metrics.py | 29 +++++++++++++++++++++++++++++
 3 files changed, 64 insertions(+)
 create mode 100644 tests/test_metrics.py

diff --git a/metrics.py b/metrics.py
index 4547c45..914babb 100644
--- a/metrics.py
+++ b/metrics.py
@@ -22,6 +22,8 @@
 _tool_calls = None
 _tool_latency = None
 _active_sessions = None
+_a2a_turns = None
+_a2a_turn_latency = None
 
 
 def _prefix() -> str:
@@ -32,6 +34,7 @@ def _prefix() -> str:
 def init():
     global _enabled, _llm_calls, _llm_latency, _llm_tokens, _llm_cache_tokens, _llm_cost
     global _tools_deferred, _compactions, _tool_calls, _tool_latency, _active_sessions
+    global _a2a_turns, _a2a_turn_latency
 
     try:
         from prometheus_client import Counter, Histogram, Gauge
@@ -77,6 +80,17 @@ def init():
         _active_sessions = Gauge(
             f"{p}_active_sessions", "Active chat sessions",
         )
+        # A2A turn outcomes — the durable signal for "turns are failing" alerting
+        # straight off /metrics (state ∈ completed|failed|canceled|…). Low
+        # cardinality. Paired latency histogram for turn duration.
+        _a2a_turns = Counter(
+            f"{p}_a2a_turns_total", "A2A turn outcomes by terminal state",
+            ["state"],
+        )
+        _a2a_turn_latency = Histogram(
+            f"{p}_a2a_turn_seconds", "A2A turn wall-clock duration",
+            buckets=[0.5, 1, 2, 5, 10, 20, 30, 60, 120, 300],
+        )
         _enabled = True
         print(f"[metrics] Prometheus metrics initialized (prefix={p}_)")
     except ImportError:
@@ -123,6 +137,19 @@ def record_compaction():
         _compactions.inc()
 
 
+def record_a2a_turn(state: str, duration_s: float | None = None):
+    """Count one terminal A2A turn by ``state`` (completed/failed/canceled) and,
+    when known, observe its wall-clock duration. Emitted from
+    ``server/a2a.py::_record_a2a_telemetry`` so /metrics can alert on a failing
+    or backed-up agent without scraping the telemetry SQL store."""
+    if not _enabled:
+        return
+    if _a2a_turns is not None:
+        _a2a_turns.labels(state=str(state or "unknown")).inc()
+    if _a2a_turn_latency is not None and duration_s is not None and duration_s >= 0:
+        _a2a_turn_latency.observe(duration_s)
+
+
 def record_tool_call(tool_name: str, success: bool, latency_s: float):
     if not _enabled:
         return
diff --git a/server/a2a.py b/server/a2a.py
index 3be2a9d..493a927 100644
--- a/server/a2a.py
+++ b/server/a2a.py
@@ -198,6 +198,14 @@ def _record_a2a_telemetry(outcome) -> None:
     """Write one per-turn telemetry row from an executor ``TurnOutcome``
     (ADR 0006 Slice 2). No-op when the telemetry store is off; best-effort so a
     failure never affects the turn."""
+    # Prometheus turn counter (independent of the SQL telemetry store) — lets
+    # /metrics alert on a failing/backed-up agent. Best-effort.
+    try:
+        import metrics
+        metrics.record_a2a_turn(outcome.state, (outcome.duration_ms or 0) / 1000.0)
+    except Exception:
+        pass
+
     store = STATE.telemetry_store
     if store is None:
         return
diff --git a/tests/test_metrics.py b/tests/test_metrics.py
new file mode 100644
index 0000000..1a947ee
--- /dev/null
+++ b/tests/test_metrics.py
@@ -0,0 +1,29 @@
+"""The A2A-turn metrics emit safely (prod-readiness: /metrics can alert on a
+failing/backed-up agent) and never break a turn."""
+
+import metrics
+
+
+def test_record_a2a_turn_safe_when_disabled():
+    # No init() — must be a no-op, never raise (metrics are optional).
+    metrics.record_a2a_turn("completed", 1.5)
+    metrics.record_a2a_turn("failed")
+    metrics.record_a2a_turn("", None)
+
+
+def test_record_a2a_turn_increments_when_enabled():
+    if metrics.is_enabled() or _try_init():
+        from prometheus_client import REGISTRY
+        p = metrics._prefix()
+        before = REGISTRY.get_sample_value(f"{p}_a2a_turns_total", {"state": "completed"}) or 0.0
+        metrics.record_a2a_turn("completed", 2.0)
+        after = REGISTRY.get_sample_value(f"{p}_a2a_turns_total", {"state": "completed"}) or 0.0
+        assert after == before + 1.0
+
+
+def _try_init() -> bool:
+    try:
+        metrics.init()
+        return metrics.is_enabled()
+    except Exception:
+        return False  # already-registered or prometheus missing — skip

From d1c7605848c3035203683c87ab6bb2b247d7c945 Mon Sep 17 00:00:00 2001
From: Josh Mabry <31560031+mabry1985@users.noreply.github.com>
Date: Fri, 5 Jun 2026 23:40:01 -0700
Subject: [PATCH 14/20] =?UTF-8?q?docs:=20fork=20upstream-sync=20guide=20?=
 =?UTF-8?q?=E2=80=94=20merge-not-squash=20+=20CHANGELOG=20merge=3Dours=20(?=
 =?UTF-8?q?closes=20#590)=20(#593)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two footguns bit every fork sync this cycle (gina/jon/protoTrader):
1. Squash-merging an upstream-sync PR breaks the fork's merge base — the behind
   count stays inflated and every later sync re-conflicts on integrated code.
2. The inherited CHANGELOG merge=union splices upstream's whole changelog into
   the fork on each sync.

Promote the roxy-sync dev note into a fork-agnostic guide
(docs/guides/upstream-sync.md): the procedure (fetch both, branch off origin/main,
real merge), the merge-NOT-squash rule (+ the protoTrader before/after evidence),
the CHANGELOG merge=ours fork switch (+ the merge.ours.driver caveat), and the
now-tiny conflict surface (the operator-fork contract means edits → file a seam).

Register it in the sidebar; add a FORKS note to the template's .gitattributes
pointing at it (template keeps merge=union for its internal flow); mark the old
roxy note superseded. Docs build green.

Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
---
 .gitattributes                       |   4 +
 docs/.vitepress/config.mts           |   1 +
 docs/dev/notes/roxy-upstream-sync.md |   5 ++
 docs/guides/upstream-sync.md         | 112 +++++++++++++++++++++++++++
 4 files changed, 122 insertions(+)
 create mode 100644 docs/guides/upstream-sync.md

diff --git a/.gitattributes b/.gitattributes
index 326b519..dfe702b 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -2,4 +2,8 @@
 # both sides add entries at the top. `merge=union` keeps BOTH sides instead of
 # raising a conflict — a fork's entries and upstream's coexist (re-order/dedupe
 # by hand at release time if needed). Confirmed pain across every downstream port.
+#
+# FORKS: switch this to `CHANGELOG.md merge=ours` (+ `git config merge.ours.driver
+# true` per clone) so an upstream sync keeps YOUR curated changelog instead of
+# splicing upstream's whole changelog back in. See docs/guides/upstream-sync.md.
 CHANGELOG.md merge=union
diff --git a/docs/.vitepress/config.mts b/docs/.vitepress/config.mts
index 9910808..ea3c8d4 100644
--- a/docs/.vitepress/config.mts
+++ b/docs/.vitepress/config.mts
@@ -65,6 +65,7 @@ export default defineConfig({
             { text: "Deploy via GHCR", link: "/guides/deploy" },
             { text: "Releasing", link: "/guides/releasing" },
             { text: "Build an operator fork (Roxy)", link: "/guides/operator-fork" },
+            { text: "Sync a fork from upstream", link: "/guides/upstream-sync" },
             { text: "Sandboxing & egress", link: "/guides/sandboxing" },
             { text: "Eval your fork", link: "/guides/evals" },
           ],
diff --git a/docs/dev/notes/roxy-upstream-sync.md b/docs/dev/notes/roxy-upstream-sync.md
index 76ade43..5533396 100644
--- a/docs/dev/notes/roxy-upstream-sync.md
+++ b/docs/dev/notes/roxy-upstream-sync.md
@@ -1,5 +1,10 @@
 # Note: syncing roxy from upstream protoAgent
 
+> **Superseded by the fork-agnostic guide:
+> [docs/guides/upstream-sync.md](../../guides/upstream-sync.md)** — it adds the two
+> lessons this note predates: **merge (never squash)** upstream syncs, and switch
+> the fork's `CHANGELOG.md` to `merge=ours`. Kept for the roxy-specific context below.
+
 **Topic:** operational playbook for porting protoAgent work into the roxy fork.
 **Status:** current as of 2026-06-02 (roxy caught up through #476 via roxy#27).
 
diff --git a/docs/guides/upstream-sync.md b/docs/guides/upstream-sync.md
new file mode 100644
index 0000000..49e1800
--- /dev/null
+++ b/docs/guides/upstream-sync.md
@@ -0,0 +1,112 @@
+# Sync a fork from upstream
+
+A fork of this template (roxy, protoTrader, gina, …) pulls fixes + features down
+from `upstream/main` with `git merge`. Two avoidable footguns bite that flow on
+almost every fork — both fixed by the rules below. Bake them in once and every
+sync after is near-trivial.
+
+> **The one rule that matters most:** sync with a **real merge commit, never a
+> squash.** Squashing breaks the fork's merge base — the "behind" count stays
+> permanently inflated and every later sync re-conflicts on code already
+> integrated.
+
+## Setup (once per fork)
+
+Two remotes — `origin` (your fork) and `upstream` (this template):
+
+```sh
+git remote add upstream https://github.com/protoLabsAI/protoAgent.git
+```
+
+And switch the changelog to fork-owned (see [CHANGELOG](#changelog-stop-the-duplicates)):
+
+```sh
+# in the fork's .gitattributes
+CHANGELOG.md merge=ours
+# + per clone / in CI (the driver isn't carried by .gitattributes alone):
+git config merge.ours.driver true
+```
+
+## The sync
+
+```sh
+# 1. Fetch BOTH — the local clone goes stale; a stale local main makes a
+#    divergent merge you can't push.
+git fetch upstream && git fetch origin
+
+# 2. See the real delta (should be a short list of genuinely-new commits).
+git log --oneline origin/main..upstream/main
+
+# 3. Branch off origin/main (NOT local main) and merge upstream.
+git checkout -b chore/sync-upstream-$(date +%Y%m%d) origin/main
+git merge upstream/main            # resolve: identity=ours, code=theirs (see hotspots)
+
+# 4. Run tests, open a PR, and merge it as a MERGE COMMIT:
+gh pr merge --merge                # NOT --squash
+```
+
+Fork **feature** PRs can still squash — they don't touch the upstream base. Only
+the *upstream-sync* PR must be a merge commit.
+
+## Why merge, not squash
+
+Squash collapses upstream's N commits into one *new* SHA on your fork. Git never
+sees upstream's commit SHAs in your ancestry, so **the merge base never advances**
+→ permanent "behind" inflation + recurring re-conflicts on already-integrated
+code. A real merge commit makes `upstream/main` an ancestor, so the base tracks
+and the next sync shows only genuinely-new commits.
+
+Real example (protoTrader, after two squash-syncs):
+
+| | Squash-synced | Re-done as a merge commit |
+|---|---|---|
+| Merge base | original fork point (pre-v0.15.0) | real upstream HEAD |
+| Behind count | 57 (mostly phantom) | 3 (all genuinely new) |
+
+If your fork is already in the squash-broken state, fix it once: do the next sync
+as a true merge commit (resolving the now-phantom conflicts in upstream's favor)
+and merge with `--merge`. The base re-anchors and the inflation clears.
+
+## CHANGELOG: stop the duplicates
+
+The template ships `.gitattributes` with `CHANGELOG.md merge=union` — correct for
+the template's *internal* feature-branch flow (distinct new entries coexist), but
+wrong for an upstream→fork sync: the two changelogs share long history your fork
+has curated, so `union` splices upstream's **whole** changelog back in (recurring
+duplicate `## [X.Y.Z]` sections to hand-dedupe).
+
+Forks own their changelog narrative — switch it to `merge=ours` (above) so a sync
+keeps your changelog and ignores upstream's. Don't import upstream's `## [X.Y.Z]`
+version headers; if you want a specific upstream entry, copy it into your own
+`[Unreleased]` by hand.
+
+## Conflict hotspots
+
+Thanks to the **[operator-fork contract](customize-and-deploy.md#the-operator-fork-contract)**,
+a clean fork's conflict surface is now tiny — fork identity & behavior are
+config/plugin-driven, so the files you *edit* (and therefore conflict on) should
+be almost none:
+
+- **`pyproject.toml` version line** — the one expected trivial conflict; keep your fork's version.
+- **`CHANGELOG.md`** — resolved automatically by `merge=ours` (above).
+- **Config / persona / plugins** (`config/`, `plugins/`, `SOUL.md`) — fork-owned (`ours`); these are *adds*, not edits, so they rarely conflict.
+
+If you're resolving a conflict in a core `.py` (e.g. `server/a2a.py`, `server/chat.py`),
+that usually means you edited a core file instead of using a seam — the card is
+config-driven (`a2a.skills`/`a2a.description`), the thread-id is a plugin resolver,
+the SSRF allowlist is config. [File the missing seam](https://github.com/protoLabsAI/protoAgent/issues)
+rather than re-porting the edit every sync.
+
+## Running the fork's tests
+
+A fork usually shares deps with the template; the template venv works:
+
+```sh
+PYTHONPATH=~/dev/<fork> ~/dev/protoAgent/.venv/bin/python -m pytest -q
+```
+
+## Related
+
+- [The operator-fork contract](customize-and-deploy.md#the-operator-fork-contract) — the seams that keep the conflict surface tiny
+- [Build an operator fork (Roxy)](operator-fork.md)
+- [Customize & deploy](customize-and-deploy.md)

From 9cfbce381d26961baae9fd604e56572672f9e06b Mon Sep 17 00:00:00 2001
From: Josh Mabry <31560031+mabry1985@users.noreply.github.com>
Date: Sat, 6 Jun 2026 00:21:53 -0700
Subject: [PATCH 15/20] =?UTF-8?q?feat(plugins):=20spawn=20CLI=20coding=20a?=
 =?UTF-8?q?gents=20over=20ACP=20(code=5Fwith)=20=E2=80=94=20ADR=200024=20(?=
 =?UTF-8?q?#596)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Bring ORBIS's ACP-client pattern into protoAgent so the lead agent can hand a
real, repo-scoped coding job to a purpose-built CLI coding agent (protoCLI
`proto`, Claude Code, Codex, Gemini CLI) and get the result back.

New opt-in `coding_agent` plugin adds one tool, `code_with(agent, task)`, backed
by an ACP client (port of ORBIS's acp/client.py): launch the agent as a
subprocess, drive one session over JSON-RPC 2.0 on its stdio (initialize →
session/new → session/prompt), accumulate agent_message_chunk as the answer,
auto-allow session/request_permission. One client (subprocess + session) cached
per agent so follow-up calls continue the thread; a per-agent lock serializes
turns.

Security posture (PR1, ADR 0024): ships DISABLED with an empty agent list. Each
agent's workdir is config-pinned (the tool takes only agent+task, never a path)
and auto-allowed — the coding agent self-governs within its sandbox dir. HITL
gating + live A2A narration land in later PRs.

- plugins/coding_agent/{__init__.py,acp_client.py,protoagent.plugin.yaml}
- tests/test_coding_agent_plugin.py — config normalization, tool wiring, and a
  real ACP wire exchange against a fake agent subprocess (9 cases)
- ADR 0024 + index; docs/guides/coding-agents.md + sidebar; plugins guide note;
  CHANGELOG [Unreleased]

Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
---
 CHANGELOG.md                                 |  18 ++
 docs/.vitepress/config.mts                   |   1 +
 docs/adr/0024-spawn-cli-coding-agents-acp.md | 137 +++++++++
 docs/adr/index.md                            |   1 +
 docs/guides/coding-agents.md                 | 126 ++++++++
 docs/guides/plugins.md                       |   4 +-
 plugins/coding_agent/__init__.py             | 162 ++++++++++
 plugins/coding_agent/acp_client.py           | 304 +++++++++++++++++++
 plugins/coding_agent/protoagent.plugin.yaml  |  45 +++
 tests/test_coding_agent_plugin.py            | 173 +++++++++++
 10 files changed, 970 insertions(+), 1 deletion(-)
 create mode 100644 docs/adr/0024-spawn-cli-coding-agents-acp.md
 create mode 100644 docs/guides/coding-agents.md
 create mode 100644 plugins/coding_agent/__init__.py
 create mode 100644 plugins/coding_agent/acp_client.py
 create mode 100644 plugins/coding_agent/protoagent.plugin.yaml
 create mode 100644 tests/test_coding_agent_plugin.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index e6590a6..1e96b73 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,6 +11,24 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Added
+- **Spawn CLI coding agents over ACP** — a new opt-in `coding_agent` plugin
+  (ADR 0024) adds a `code_with(agent, task)` tool that hands a real, repo-scoped
+  coding job to a purpose-built CLI coding agent (protoCLI `proto`, Claude Code,
+  Codex, Gemini CLI) and returns its result. protoAgent is the
+  [ACP](https://agentclientprotocol.com) *client* — it launches the agent as a
+  subprocess and drives one session over JSON-RPC 2.0 on its stdio
+  (`initialize` → `session/new` → `session/prompt`), accumulating the agent's
+  message as the answer. The ACP client is a port of ORBIS's canonical
+  implementation. Ships **disabled with no agents configured** — each agent gets
+  file + shell access in its (config-pinned, auto-allowed) workdir, so it's a
+  deliberate opt-in; enable with `plugins: { enabled: [coding_agent] }` and
+  declare agents under the `coding_agent` config section. One client (subprocess +
+  session) is cached per agent so follow-up calls continue the same thread.
+  PR1 is synchronous (final answer returned; `tool_call` titles logged); live
+  narration onto A2A working frames + HITL permission policy land next.
+  See [the guide](docs/guides/coding-agents.md).
+
 ## [0.15.1] - 2026-06-05
 
 ### Fixed
diff --git a/docs/.vitepress/config.mts b/docs/.vitepress/config.mts
index ea3c8d4..11208a9 100644
--- a/docs/.vitepress/config.mts
+++ b/docs/.vitepress/config.mts
@@ -59,6 +59,7 @@ export default defineConfig({
             { text: "Scheduler", link: "/guides/scheduler" },
             { text: "Discord surface", link: "/guides/discord" },
             { text: "Google (Gmail + Calendar)", link: "/guides/google" },
+            { text: "Spawn CLI coding agents (ACP)", link: "/guides/coding-agents" },
             { text: "Operator console (React/Tauri)", link: "/guides/react-tauri-ui" },
             { text: "Wire Langfuse + Prometheus", link: "/guides/observability" },
             { text: "Run multiple instances", link: "/guides/multi-instance" },
diff --git a/docs/adr/0024-spawn-cli-coding-agents-acp.md b/docs/adr/0024-spawn-cli-coding-agents-acp.md
new file mode 100644
index 0000000..c0ed7d6
--- /dev/null
+++ b/docs/adr/0024-spawn-cli-coding-agents-acp.md
@@ -0,0 +1,137 @@
+# 0024 — Spawn CLI coding agents over ACP (`code_with`)
+
+Status: **Accepted** (PR1 — thin vertical)
+
+## Context
+
+protoAgent's lead agent already delegates in two ways: to **in-process LLM
+subagents** via `task()` (DeerFlow pattern, `graph/subagents/`) and to **remote
+A2A peers** via `peer_consult` (`tools/peer_tools.py`). Both are *talk to another
+model* delegations — neither can pick up a repo, edit files, run the test suite,
+and hand back a diff.
+
+There is a whole class of work that wants exactly that: "add a `/healthz` route
+and run the tests", "fix the failing import in `server/chat.py`". A purpose-built
+**CLI coding agent** — protoCLI (`proto`), Claude Code, Codex, Gemini CLI — does
+this far better than a generic tool loop, because it carries its own file access,
+shell, repo-map, edit/verify harness, and approval UX.
+
+The protoLabs companion stack already solved this. **ORBIS** (the Python voice
+companion) is an **ACP client**: it launches a coding agent as a subprocess and
+drives one session over the [Agent Client Protocol](https://agentclientprotocol.com)
+— JSON-RPC 2.0, newline-delimited, on the child's stdin/stdout. The same
+`delegate_to` registry routes `a2a` / `openai` / `acp` delegate types. protoCLI,
+on the other side, speaks the matching server role (`proto --acp`).
+
+This ADR brings the **ACP-client leg** into protoAgent so our agents can spawn
+CLI coding agents. We do **not** port ORBIS's whole `delegate_to` registry —
+protoAgent already covers the `a2a` and `openai` legs differently (peers +
+subagents + the LiteLLM gateway). The new capability is just the ACP one.
+
+## Decision
+
+Ship ACP-client support as a **first-party, opt-in plugin** (`plugins/coding_agent`),
+not a core tool. It contributes one tool — `code_with(agent, task)` — backed by
+a small ACP client (a port of ORBIS's `acp/client.py`).
+
+### Why a plugin, not core
+
+- The plugin seam already gives config + secrets + Settings + enable/disable for
+  free, with **zero core edits** — matching the operator-fork contract (ADR 0019)
+  and the Discord/Google precedent (ADR 0018).
+- Spawning a coding agent with file + shell access in a workdir is a real
+  authority delegation. It should be **off by default** and explicitly opted into,
+  exactly like the shipped `hello` example plugin (`enabled: false`).
+- Not every fork wants this. A plugin keeps the default tool surface lean
+  (ADR 0005, tool pollution).
+
+### Shape
+
+```
+plugins/coding_agent/
+  protoagent.plugin.yaml   # config_section: coding_agent; agents: []; enabled: false
+  acp_client.py            # AcpClient — JSON-RPC 2.0 over the child's stdio
+  __init__.py              # register(): builds code_with from configured agents
+```
+
+Config (a top-level `coding_agent` section, ADR 0019):
+
+```yaml
+coding_agent:
+  default_timeout_s: 600        # coding is slow; per-agent override available
+  agents:
+    - name: proto              # the name the LLM passes to code_with(agent=…)
+      command: proto           # binary on PATH
+      args: ["--acp"]          # ACP server mode
+      workdir: ~/dev/my-repo   # session cwd — the confinement boundary
+      # env: { FOO: bar }      # optional extra env (merged over the process env)
+      # timeout_s: 900         # optional per-agent override
+```
+
+The tool the lead agent sees:
+
+```
+code_with(agent="proto", task="add a /healthz route and run the tests")
+  → the agent's final message text (the work happens in its own session)
+```
+
+### Confinement & permission posture (PR1)
+
+- **Workdir is config-pinned.** `code_with` takes only `agent` + `task` — never a
+  caller-chosen path. The cwd comes from the matched config entry, so the LLM
+  cannot point a coding agent at an arbitrary directory. Workdirs must be listed
+  in config; an unknown `agent` returns an error listing the configured ones.
+- **Auto-allow within the workdir.** The client advertises no client-served
+  `fs`/`terminal` capability, so the coding agent uses its *own* file/shell
+  access, scoped to the session cwd. Inbound `session/request_permission` is
+  auto-approved (first `allow` option), mirroring ORBIS's PR1 policy. The coding
+  agent self-governs inside its sandbox dir.
+- The subprocess inherits the server's env (plus any per-agent `env`). Run
+  protoAgent under an account whose ambient credentials you're willing to lend
+  the coding agent — or scope its `workdir` to a throwaway checkout.
+
+### Wire protocol (ACP, client side)
+
+```
+→ initialize        {protocolVersion: 1, clientCapabilities: {fs:{…false}, terminal:false}}
+→ session/new       {cwd, mcpServers: []}                       ← {sessionId}
+→ session/prompt    {sessionId, prompt: [{type:text, text}]}    ← {stopReason}
+← session/update    {update:{sessionUpdate:"agent_message_chunk", content:{text}}}   (accumulated → answer)
+← session/update    {update:{sessionUpdate:"tool_call", title}}                       (narration → logged)
+← session/request_permission  {options:[…]}  → auto-allow
+```
+
+One `AcpClient` owns one subprocess + one session, **cached per agent** so
+follow-up `code_with` calls continue the same thread (mirrors the A2A peer's
+sticky `contextId`). A per-agent lock serializes turns (a session is a single
+conversation; `task_batch` must not interleave two prompts on one session).
+
+## Scope
+
+**PR1 (this ADR):** ACP client + `code_with` + config + auto-allow + tests +
+docs. Synchronous — the final answer is returned; `tool_call` titles are logged,
+not yet streamed to callers.
+
+**Later PRs:** live narration of `tool_call` titles onto A2A working-status
+frames (so an operator watching a turn sees "Editing app.py"); richer permission
+policy (HITL gating via `ask_human`, allow-by-kind); more shipped agent recipes
+(claude-code-acp, codex, gemini); an eval case.
+
+## Consequences
+
+- protoAgent gains a third delegation altitude: **hand a real coding job to a
+  purpose-built CLI agent** and get the result back — without forking.
+- The security surface is explicit and opt-in: disabled by default, empty agent
+  list by default, workdir-confined, documented.
+- We take a dependency on the target agent being installed and on PATH; a missing
+  binary returns a clear error string, not a crash.
+
+## Alternatives considered
+
+- **Core tool module** (`tools/acp_tools.py`) — always present like `peer_tools`.
+  Rejected: edits core for a capability most forks won't use, and loses the
+  free config/Settings/enable-disable plumbing.
+- **Full `delegate_to` registry port** (a2a + openai + acp) — most faithful to
+  ORBIS. Rejected for now: largest blast radius, and it overlaps protoAgent's
+  existing peer + subagent + gateway seams. The ACP leg is the only genuinely
+  missing one.
diff --git a/docs/adr/index.md b/docs/adr/index.md
index 8e6e1c8..3f7a5db 100644
--- a/docs/adr/index.md
+++ b/docs/adr/index.md
@@ -32,3 +32,4 @@ decision, numbered, never deleted (supersede instead).
 | [0021](./0021-agent-memory-architecture.md) | Agent memory: extract, don't dump | Accepted |
 | [0022](./0022-activity-provenance-feed.md) | Activity is a provenance feed, not a second chat | Accepted |
 | [0023](./0023-server-decomposition.md) | Decompose server.py: AppState + composition root | Accepted |
+| [0024](./0024-spawn-cli-coding-agents-acp.md) | Spawn CLI coding agents over ACP (`code_with`) | Accepted (PR1) |
diff --git a/docs/guides/coding-agents.md b/docs/guides/coding-agents.md
new file mode 100644
index 0000000..e831db3
--- /dev/null
+++ b/docs/guides/coding-agents.md
@@ -0,0 +1,126 @@
+# Spawn CLI coding agents (ACP)
+
+An **optional, opt-in plugin** ([ADR 0024](/adr/0024-spawn-cli-coding-agents-acp))
+that lets the lead agent hand a real coding job to a purpose-built **CLI coding
+agent** — protoCLI (`proto`), Claude Code, Codex, Gemini CLI — and get the result
+back.
+
+Where `task()` delegates to an in-process LLM subagent and `peer_consult` talks
+to a remote A2A peer, **`code_with(agent, task)`** spawns a coding agent that
+carries its own file access, shell, repo-map, and edit/verify loop — so it can
+read/edit/run code in a repo far better than a generic tool loop. It drives the
+coding agent over the [Agent Client Protocol](https://agentclientprotocol.com)
+(ACP): JSON-RPC 2.0 over the child's stdin/stdout. protoAgent is the ACP
+*client*; `proto --acp` is the matching server.
+
+> **Security:** a configured coding agent gets **file + shell access in its
+> workdir** (auto-allowed, confined to that directory — see
+> [Permission posture](#permission-posture)). The plugin therefore ships
+> **disabled with no agents** — you enable it *and* declare agents explicitly.
+
+## Enable it
+
+The coding agent runs as a local subprocess, so this is configured in YAML, not
+the in-app Settings (each agent grants local authority and deserves a deliberate
+edit):
+
+```yaml
+# config/langgraph-config.yaml
+plugins:
+  enabled: [coding_agent]
+
+coding_agent:
+  default_timeout_s: 600          # coding is slow; per-agent override below
+  agents:
+    - name: proto                 # the name the LLM passes to code_with(agent=…)
+      command: proto              # binary on PATH
+      args: ["--acp"]             # ACP server mode
+      workdir: ~/dev/my-repo      # session cwd — the confinement boundary
+      # env: { SOME_KEY: value }  # optional extra env, merged over the process env
+      # timeout_s: 900            # optional per-agent override (seconds)
+```
+
+Enabling plugins needs a **restart** (plugin tools wire once at process init).
+On boot you'll see `[coding_agent] registered code_with for N agent(s)`.
+
+### Other coding agents
+
+Any agent that speaks ACP works — just point `command`/`args` at it:
+
+```yaml
+  agents:
+    - name: proto
+      command: proto
+      args: ["--acp"]
+      workdir: ~/dev/my-repo
+    - name: claude-code
+      command: npx
+      args: ["@zed-industries/claude-code-acp"]
+      workdir: ~/dev/my-repo
+```
+
+The binary must be installed and on the `PATH` of the process running protoAgent.
+A missing binary returns a clear error string to the agent (it doesn't crash).
+
+## Use it
+
+The lead agent calls the tool; the configured agent names appear in the tool's
+description so the model knows what it can pass:
+
+```
+code_with(agent="proto", task="Add a GET /healthz route to server/, wire it
+into the app, and run the tests. Report what you changed.")
+```
+
+Notes for whoever writes the `task`:
+
+- The coding agent **does not see this conversation** — make `task` a
+  self-contained brief: the goal, the relevant files if known, and the
+  definition of done ("run the tests", "and lint").
+- You **cannot** choose the directory — each agent works in its pre-configured
+  `workdir`. To work in a different repo, configure another agent.
+- The call **blocks** until the turn finishes (coding is slow). The default
+  timeout is `default_timeout_s` (600s) unless the agent overrides it.
+- **Follow-up calls to the same agent continue the same session** — so you can
+  iterate: `code_with(agent="proto", task="now also add a test for it")`.
+
+## Permission posture
+
+PR1 (the current cut) auto-allows the coding agent's actions, confined to its
+workdir:
+
+- **Workdir is config-pinned.** `code_with` takes only `agent` + `task` — never a
+  path. The cwd comes from config, so the model can't aim a coding agent at an
+  arbitrary directory.
+- **Auto-allow within the workdir.** protoAgent advertises no client-served
+  `fs`/`terminal` capability, so the coding agent uses its *own* file/shell
+  access, scoped to the session cwd. Inbound `session/request_permission` is
+  auto-approved. The coding agent self-governs inside its sandbox dir.
+- The subprocess **inherits protoAgent's environment** (plus any per-agent
+  `env`). Run protoAgent under an account whose ambient credentials you're
+  willing to lend the coding agent, or scope the `workdir` to a throwaway
+  checkout.
+
+Coming in later PRs (ADR 0024): live narration of the coding agent's progress
+("Editing app.py") onto A2A working-status frames; HITL permission gating via
+`ask_human`; allow-by-kind policy.
+
+## How it works
+
+```
+code_with(agent, task)
+  → AcpClient (plugins/coding_agent/acp_client.py)
+      → spawn `command args` in workdir, JSON-RPC 2.0 over its stdio:
+        initialize → session/new(cwd) → session/prompt(task)
+      ← session/update {agent_message_chunk}   → accumulated into the answer
+      ← session/update {tool_call, title}        → narrated (logged)
+      ← session/request_permission               → auto-allowed
+  → returns the agent's final message text
+```
+
+One `AcpClient` (subprocess + session) is **cached per agent** so follow-up calls
+continue the thread; a per-agent lock serializes turns (a session is a single
+conversation — `task_batch` won't interleave two prompts on one).
+
+See [Plugins](/guides/plugins) for the plugin model in general, and
+[ADR 0024](/adr/0024-spawn-cli-coding-agents-acp) for the design rationale.
diff --git a/docs/guides/plugins.md b/docs/guides/plugins.md
index ed38a86..6377ea0 100644
--- a/docs/guides/plugins.md
+++ b/docs/guides/plugins.md
@@ -10,7 +10,9 @@ explicitly — only enable plugins you trust.
 
 > The first-party **Discord** and **Google** integrations ship as plugins
 > (`plugins/discord/`, `plugins/google/`) — disable either with
-> `plugins: { disabled: [discord] }` / `[google]`, no core edit.
+> `plugins: { disabled: [discord] }` / `[google]`, no core edit. The opt-in
+> **coding_agent** plugin (`plugins/coding_agent/`) adds `code_with` to spawn a
+> CLI coding agent over ACP — see [Spawn CLI coding agents](/guides/coding-agents).
 
 > **Trust model.** This is the in-process / trusted model (matching Hermes): an
 > enabled plugin's `register()` runs as the agent. Don't enable code you
diff --git a/plugins/coding_agent/__init__.py b/plugins/coding_agent/__init__.py
new file mode 100644
index 0000000..3711c02
--- /dev/null
+++ b/plugins/coding_agent/__init__.py
@@ -0,0 +1,162 @@
+"""CLI coding-agent plugin — spawn a coding agent over ACP (ADR 0024).
+
+Contributes one tool, ``code_with(agent, task)``, that hands a coding job to a
+configured CLI coding agent (protoCLI ``proto``, Claude Code, Codex, Gemini CLI)
+and returns its result. The agent is driven over the Agent Client Protocol
+(JSON-RPC 2.0 over the child's stdio) by ``acp_client.AcpClient``.
+
+The plugin ships disabled with an empty agent list — each configured agent gets
+file + shell access in its workdir (auto-allowed, confined to that dir), so it's
+a deliberate opt-in. Enable with ``plugins: { enabled: [coding_agent] }`` and add
+agents under the ``coding_agent`` config section. See docs/guides/coding-agents.md.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import logging
+
+from langchain_core.tools import tool
+
+from tools.fallbacks import with_fallback
+
+from .acp_client import AcpClient, AcpError
+
+log = logging.getLogger("protoagent.plugins.coding_agent")
+
+# One client (subprocess + session) per agent, keyed by its launch signature so a
+# config change spins up a fresh client. Module-global so the session persists
+# across graph builds / turns; a per-agent lock serializes turns (a session is a
+# single conversation — ``task_batch`` must not interleave two prompts on one).
+_CLIENTS: dict[tuple, AcpClient] = {}
+_LOCKS: dict[str, asyncio.Lock] = {}
+
+
+def _normalize_agents(raw) -> dict[str, dict]:
+    """Validate the configured ``agents`` list → {name: spec}. Drops bad entries
+    (logged) rather than raising, so one typo can't break the plugin."""
+    agents: dict[str, dict] = {}
+    for entry in raw or []:
+        if not isinstance(entry, dict):
+            log.warning("[coding_agent] ignoring non-mapping agent entry: %r", entry)
+            continue
+        name = str(entry.get("name", "")).strip()
+        command = str(entry.get("command", "")).strip()
+        workdir = str(entry.get("workdir", "")).strip()
+        if not (name and command and workdir):
+            log.warning("[coding_agent] agent entry needs name+command+workdir: %r", entry)
+            continue
+        if name in agents:
+            log.warning("[coding_agent] duplicate agent name %r — keeping first", name)
+            continue
+        args = entry.get("args") or []
+        if not isinstance(args, (list, tuple)):
+            log.warning("[coding_agent] %s: args must be a list — ignoring", name)
+            args = []
+        env = entry.get("env") if isinstance(entry.get("env"), dict) else None
+        agents[name] = {
+            "name": name,
+            "command": command,
+            "args": [str(a) for a in args],
+            "workdir": workdir,
+            "env": {str(k): str(v) for k, v in env.items()} if env else None,
+            "timeout_s": entry.get("timeout_s"),
+        }
+    return agents
+
+
+def _client_for(spec: dict) -> AcpClient:
+    """Get-or-create the cached client for an agent spec."""
+    key = (spec["name"], spec["command"], tuple(spec["args"]), spec["workdir"])
+    client = _CLIENTS.get(key)
+    if client is None:
+        client = AcpClient(
+            spec["command"],
+            spec["args"],
+            cwd=spec["workdir"],
+            env=spec["env"],
+            name=spec["name"],
+        )
+        _CLIENTS[key] = client
+    return client
+
+
+def _build_code_with(agents: dict[str, dict], default_timeout_s: float):
+    """Build the ``code_with`` tool, closing over the configured agents."""
+    listing = ", ".join(
+        f"`{name}` (in `{spec['workdir']}`)" for name, spec in agents.items()
+    )
+
+    @tool
+    @with_fallback("The coding agent did not return a result.")
+    async def code_with(agent: str, task: str) -> str:
+        """Delegate a coding task to a CLI coding agent and return its result.
+
+        Use this to hand a real, repo-scoped coding job — read/edit/run code,
+        fix a failing test, add an endpoint — to a purpose-built coding agent
+        that has its own file access, shell, and edit/verify loop. Prefer this
+        over doing multi-file code edits inline.
+
+        Args:
+            agent: which configured coding agent to use (see the available list
+                in this tool's description).
+            task: the full, self-contained instruction (the coding agent does
+                not see this conversation — restate the goal, the relevant files
+                if known, and the definition of done, e.g. "run the tests").
+
+        Each agent works in its own pre-configured directory; you cannot point it
+        elsewhere. The call blocks until the agent finishes the turn (coding is
+        slow) and returns its final message. Follow-up calls to the same agent
+        continue the same session, so you can iterate ("now also …").
+        """
+        spec = agents.get(agent)
+        if spec is None:
+            return (
+                f"Error: unknown coding agent {agent!r}. "
+                f"Configured agents: {', '.join(agents) or '(none)'}."
+            )
+        if not str(task).strip():
+            return "Error: `task` is empty — give the coding agent a concrete instruction."
+
+        lock = _LOCKS.setdefault(agent, asyncio.Lock())
+        timeout = float(spec.get("timeout_s") or default_timeout_s)
+        client = _client_for(spec)
+
+        async def _narrate(title: str) -> None:
+            # PR1: log narration. A later PR streams these onto A2A working frames.
+            log.info("[coding_agent/%s] %s", agent, title)
+
+        async with lock:
+            try:
+                answer = await client.prompt(
+                    task, progress_callback=_narrate, timeout=timeout
+                )
+            except AcpError as exc:
+                # Drop the cached client so the next call relaunches cleanly.
+                _CLIENTS.pop((spec["name"], spec["command"], tuple(spec["args"]), spec["workdir"]), None)
+                return f"Error: {agent} (coding agent) failed: {exc}"
+        return answer or f"{agent} finished but returned no text."
+
+    # The configured agent names belong in the LLM-facing description so the model
+    # knows what it can pass as `agent` (the docstring can't interpolate them).
+    code_with.description = f"{code_with.description}\n\nAvailable agents: {listing}."
+    return code_with
+
+
+def register(registry) -> None:
+    """Entry point — called once at load with a PluginRegistry."""
+    cfg = registry.config or {}
+    agents = _normalize_agents(cfg.get("agents"))
+    if not agents:
+        log.warning(
+            "[coding_agent] enabled but no agents configured — add entries under "
+            "`coding_agent.agents` (see docs/guides/coding-agents.md). No tool registered."
+        )
+        return
+    try:
+        default_timeout_s = float(cfg.get("default_timeout_s") or 600)
+    except (TypeError, ValueError):
+        default_timeout_s = 600.0
+    registry.register_tool(_build_code_with(agents, default_timeout_s))
+    log.info("[coding_agent] registered code_with for %d agent(s): %s",
+             len(agents), ", ".join(agents))
diff --git a/plugins/coding_agent/acp_client.py b/plugins/coding_agent/acp_client.py
new file mode 100644
index 0000000..579c35b
--- /dev/null
+++ b/plugins/coding_agent/acp_client.py
@@ -0,0 +1,304 @@
+"""ACP client — launch a CLI coding agent and drive one session.
+
+protoAgent is the ACP *client*: one ``AcpClient`` owns one agent subprocess and
+one session, cached per configured agent so follow-up ``code_with`` calls
+continue the same thread (mirrors the A2A peer's sticky ``contextId``). Transport
+is JSON-RPC 2.0, newline-delimited, over the child's stdin/stdout. The matching
+server side is e.g. ``proto --acp``. Spec: https://agentclientprotocol.com.
+
+Ported from ORBIS's ``acp/client.py`` (the canonical protoLabs ACP client).
+ADR 0024.
+
+PR1 scope (the thin vertical):
+  * handshake: ``initialize`` → ``session/new`` (cwd = the agent's config workdir)
+  * one turn: ``session/prompt`` → accumulate ``agent_message_chunk`` text as the
+    answer; narrate ``tool_call`` titles via ``progress_callback`` ("Editing
+    app.py", "Running pytest")
+  * auto-allow ``session/request_permission`` (the coding agent self-governs,
+    scoped to the session cwd — see ADR 0024). Policy + HITL gating land next.
+  * ``fs/*`` and ``terminal/*`` are NOT advertised — the coding agent uses its own
+    file access, confined to the session ``cwd``.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import logging
+import os
+from pathlib import Path
+from typing import Awaitable, Callable
+
+logger = logging.getLogger("protoagent.plugins.coding_agent")
+
+ProgressCallback = Callable[[str], Awaitable[None]]
+
+# ACP protocol version protoAgent speaks. Negotiated in `initialize`.
+PROTOCOL_VERSION = 1
+
+
+class AcpError(Exception):
+    """Any ACP transport / protocol failure. The caller speaks the message."""
+
+
+class AcpClient:
+    """Drive a single ACP agent subprocess + session.
+
+    Construct once per configured agent and reuse: the process + session persist
+    across turns. Not safe for concurrent prompts on one instance (a session is a
+    single conversation); callers serialize turns with a per-agent lock.
+    """
+
+    def __init__(
+        self,
+        command: str,
+        args: list[str] | None = None,
+        *,
+        cwd: str,
+        env: dict[str, str] | None = None,
+        name: str = "acp",
+    ) -> None:
+        self.command = command
+        self.args = list(args or [])
+        self.cwd = str(Path(cwd).expanduser())
+        self.env = env
+        self.name = name
+
+        self._proc: asyncio.subprocess.Process | None = None
+        self._session_id: str | None = None
+        self._next_id = 0
+        self._pending: dict[int, asyncio.Future] = {}
+        self._reader_task: asyncio.Task | None = None
+        self._start_lock = asyncio.Lock()
+
+        # Per-turn state (one turn at a time).
+        self._answer = ""
+        self._progress: ProgressCallback | None = None
+
+    # -- lifecycle -----------------------------------------------------------
+
+    async def _ensure_started(self) -> None:
+        async with self._start_lock:
+            if self._proc is not None and self._proc.returncode is None:
+                return
+            await self._start()
+
+    async def _start(self) -> None:
+        if not Path(self.cwd).is_dir():
+            raise AcpError(f"workdir does not exist: {self.cwd}")
+        try:
+            self._proc = await asyncio.create_subprocess_exec(
+                self.command,
+                *self.args,
+                cwd=self.cwd,
+                stdin=asyncio.subprocess.PIPE,
+                stdout=asyncio.subprocess.PIPE,
+                stderr=asyncio.subprocess.PIPE,
+                env={**os.environ, **(self.env or {})},
+            )
+        except FileNotFoundError as exc:
+            raise AcpError(
+                f"agent binary not found: {self.command!r} (is it installed and on PATH?)"
+            ) from exc
+
+        self._reader_task = asyncio.create_task(self._read_loop())
+        asyncio.create_task(self._drain_stderr())
+        await self._initialize()
+        await self._new_session()
+        logger.info(
+            "[acp/%s] up (pid=%s, session=%s, cwd=%s)",
+            self.name,
+            self._proc.pid,
+            self._session_id,
+            self.cwd,
+        )
+
+    async def close(self) -> None:
+        if self._reader_task and not self._reader_task.done():
+            self._reader_task.cancel()
+        if self._proc and self._proc.returncode is None:
+            try:
+                self._proc.terminate()
+            except ProcessLookupError:
+                pass
+
+    # -- I/O loops -----------------------------------------------------------
+
+    async def _drain_stderr(self) -> None:
+        assert self._proc and self._proc.stderr
+        async for raw in self._proc.stderr:
+            line = raw.decode(errors="replace").rstrip()
+            if line:
+                logger.debug("[acp/%s/stderr] %s", self.name, line)
+
+    async def _read_loop(self) -> None:
+        assert self._proc and self._proc.stdout
+        try:
+            async for raw in self._proc.stdout:
+                line = raw.decode(errors="replace").strip()
+                if not line:
+                    continue
+                try:
+                    msg = json.loads(line)
+                except json.JSONDecodeError:
+                    logger.warning("[acp/%s] non-JSON line: %.200s", self.name, line)
+                    continue
+                await self._handle(msg)
+        except asyncio.CancelledError:
+            raise
+        finally:
+            # Fail any in-flight requests if the process dies mid-turn.
+            for fut in self._pending.values():
+                if not fut.done():
+                    fut.set_exception(AcpError(f"{self.name} agent exited"))
+            self._pending.clear()
+
+    async def _handle(self, msg: dict) -> None:
+        # 1) Response to one of our outbound requests.
+        if "id" in msg and ("result" in msg or "error" in msg):
+            fut = self._pending.pop(msg["id"], None)
+            if fut and not fut.done():
+                if "error" in msg:
+                    fut.set_exception(AcpError(str(msg["error"])))
+                else:
+                    fut.set_result(msg.get("result"))
+            return
+        method = msg.get("method")
+        # 2) Inbound request from the agent (has id) — we must respond.
+        if method and "id" in msg:
+            await self._handle_request(msg)
+            return
+        # 3) Notification (no id).
+        if method == "session/update":
+            await self._handle_update(msg.get("params") or {})
+
+    # -- inbound updates + requests -----------------------------------------
+
+    async def _handle_update(self, params: dict) -> None:
+        update = params.get("update") or {}
+        kind = update.get("sessionUpdate")
+        if kind == "agent_message_chunk":
+            text = (update.get("content") or {}).get("text", "")
+            if text:
+                self._answer += text
+        elif kind == "tool_call":
+            # Narrate the tool's human title ("Editing app.py", "Running pytest")
+            # — this is progress, not the answer text.
+            title = update.get("title") or update.get("kind") or "working"
+            await self._narrate(str(title))
+
+    async def _handle_request(self, msg: dict) -> None:
+        method = msg.get("method")
+        rid = msg.get("id")
+        if method == "session/request_permission":
+            option_id = self._auto_allow(msg.get("params") or {})
+            outcome = (
+                {"outcome": "selected", "optionId": option_id}
+                if option_id
+                else {"outcome": "cancelled"}
+            )
+            await self._respond(rid, {"outcome": outcome})
+        else:
+            # We didn't advertise fs/terminal; decline anything else cleanly so
+            # the agent falls back to its own capabilities instead of hanging.
+            await self._respond_error(rid, -32601, f"method not supported: {method}")
+
+    @staticmethod
+    def _auto_allow(params: dict) -> str | None:
+        """PR1 permission policy: pick the first 'allow' option (else the first
+        option). The HITL / deny-policy layer replaces this (ADR 0024, later PR)."""
+        options = params.get("options") or []
+        for opt in options:
+            if str(opt.get("kind", "")).startswith("allow"):
+                return opt.get("optionId")
+        return options[0].get("optionId") if options else None
+
+    async def _narrate(self, text: str) -> None:
+        if self._progress and text:
+            try:
+                await self._progress(text)
+            except Exception as exc:  # progress is best-effort
+                logger.warning("[acp/%s] progress_callback raised: %s", self.name, exc)
+
+    # -- JSON-RPC primitives -------------------------------------------------
+
+    async def _send(self, obj: dict) -> None:
+        if not (self._proc and self._proc.stdin):
+            raise AcpError("agent not started")
+        self._proc.stdin.write((json.dumps(obj) + "\n").encode())
+        await self._proc.stdin.drain()
+
+    async def _request(self, method: str, params: dict, *, timeout: float = 120.0):
+        self._next_id += 1
+        rid = self._next_id
+        fut: asyncio.Future = asyncio.get_running_loop().create_future()
+        self._pending[rid] = fut
+        await self._send({"jsonrpc": "2.0", "id": rid, "method": method, "params": params})
+        try:
+            return await asyncio.wait_for(fut, timeout)
+        except asyncio.TimeoutError as exc:
+            self._pending.pop(rid, None)
+            raise AcpError(f"{method} timed out after {timeout}s") from exc
+
+    async def _respond(self, rid, result: dict) -> None:
+        await self._send({"jsonrpc": "2.0", "id": rid, "result": result})
+
+    async def _respond_error(self, rid, code: int, message: str) -> None:
+        await self._send({"jsonrpc": "2.0", "id": rid, "error": {"code": code, "message": message}})
+
+    # -- handshake -----------------------------------------------------------
+
+    async def _initialize(self) -> None:
+        await self._request(
+            "initialize",
+            {
+                "protocolVersion": PROTOCOL_VERSION,
+                # PR1: no client-served fs/terminal — the coding agent uses its own,
+                # confined to the session cwd.
+                "clientCapabilities": {
+                    "fs": {"readTextFile": False, "writeTextFile": False},
+                    "terminal": False,
+                },
+            },
+            timeout=30.0,
+        )
+
+    async def _new_session(self) -> None:
+        result = await self._request(
+            "session/new", {"cwd": self.cwd, "mcpServers": []}, timeout=30.0
+        )
+        self._session_id = (result or {}).get("sessionId")
+        if not self._session_id:
+            raise AcpError("session/new returned no sessionId")
+
+    # -- public: one turn ----------------------------------------------------
+
+    async def prompt(
+        self,
+        text: str,
+        *,
+        progress_callback: ProgressCallback | None = None,
+        timeout: float = 600.0,
+    ) -> str:
+        """Send one user turn; return the agent's accumulated message text.
+
+        Streams ``tool_call`` titles to ``progress_callback`` for narration while
+        the agent works. Raises ``AcpError`` on transport/protocol failure.
+        """
+        await self._ensure_started()
+        self._answer = ""
+        self._progress = progress_callback
+        try:
+            result = await self._request(
+                "session/prompt",
+                {
+                    "sessionId": self._session_id,
+                    "prompt": [{"type": "text", "text": text}],
+                },
+                timeout=timeout,
+            )
+        finally:
+            self._progress = None
+        stop = (result or {}).get("stopReason")
+        logger.info("[acp/%s] turn complete (stopReason=%s)", self.name, stop)
+        return self._answer.strip()
diff --git a/plugins/coding_agent/protoagent.plugin.yaml b/plugins/coding_agent/protoagent.plugin.yaml
new file mode 100644
index 0000000..90e83ce
--- /dev/null
+++ b/plugins/coding_agent/protoagent.plugin.yaml
@@ -0,0 +1,45 @@
+id: coding_agent
+name: CLI Coding Agent (ACP)
+version: 0.1.0
+description: >-
+  Let the lead agent spawn a CLI coding agent — protoCLI (`proto`), Claude Code,
+  Codex, Gemini CLI — and hand it a real coding job (read/edit/run code in a
+  repo), then get the result back. Adds one tool, `code_with(agent, task)`, that
+  drives the coding agent over ACP (Agent Client Protocol: JSON-RPC 2.0 over the
+  child's stdio). See ADR 0024 and docs/guides/coding-agents.md.
+
+  Ships DISABLED with an EMPTY agent list — enable it AND configure agents
+  explicitly, because each agent gets file + shell access in its workdir
+  (auto-allowed, confined to that dir). Enable with
+  `plugins: { enabled: [coding_agent] }`.
+enabled: false
+
+# Declarative only (not yet enforced) — surfaced in the console for transparency.
+# A spawned coding agent does real local I/O within its configured workdir.
+capabilities:
+  network: ["*"]            # the coding agent may fetch/install as it works
+  filesystem: workdir       # confined to each agent's configured workdir
+
+# Plugin config (ADR 0019) — claims the top-level `coding_agent` section.
+# `agents` is a list (round-trips through YAML); it is intentionally NOT a
+# Settings field — coding agents are declared in config, not the Settings UI,
+# because each one grants local file/shell authority and deserves a deliberate
+# edit. Only `default_timeout_s` is exposed as a scalar Setting.
+config_section: coding_agent
+config:
+  # Default per-turn timeout (seconds). Coding is slow; override per agent.
+  default_timeout_s: 600
+  # No agents by default — you MUST add your own (see the guide). Example:
+  #   agents:
+  #     - name: proto             # the name the LLM passes to code_with(agent=…)
+  #       command: proto          # binary on PATH
+  #       args: ["--acp"]         # ACP server mode
+  #       workdir: ~/dev/my-repo  # session cwd — the confinement boundary
+  #       # env: { SOME_KEY: value }   # optional extra env, merged over process env
+  #       # timeout_s: 900             # optional per-agent override
+  agents: []
+settings:
+  - key: default_timeout_s
+    label: "Default coding turn timeout (s)"
+    type: number
+    description: "Per-turn timeout for a coding agent. Coding is slow — 600s+ is typical."
diff --git a/tests/test_coding_agent_plugin.py b/tests/test_coding_agent_plugin.py
new file mode 100644
index 0000000..9c28a83
--- /dev/null
+++ b/tests/test_coding_agent_plugin.py
@@ -0,0 +1,173 @@
+"""Tests for the coding_agent plugin (ADR 0024).
+
+Covers config normalization + the `code_with` tool wiring (unit), and a real
+ACP wire exchange: AcpClient drives a fake ACP agent subprocess through
+initialize → session/new → session/prompt, accumulating agent_message_chunk
+text and auto-allowing a session/request_permission.
+"""
+
+from __future__ import annotations
+
+import sys
+
+import pytest
+
+from plugins.coding_agent import _normalize_agents, register
+from plugins.coding_agent.acp_client import AcpClient, AcpError
+
+# ── a minimal ACP "agent" (server side) we can drive over stdio ───────────────
+# Speaks just enough of the protocol: handshakes, opens a session, and on a
+# prompt emits a tool_call narration, asks one permission (server→client
+# request), then streams two agent_message_chunks — echoing the chosen option
+# id so the test can prove auto-allow picked the `allow` option.
+_FAKE_AGENT = r'''
+import sys, json
+
+def send(obj):
+    sys.stdout.write(json.dumps(obj) + "\n")
+    sys.stdout.flush()
+
+while True:
+    line = sys.stdin.readline()
+    if not line:
+        break
+    line = line.strip()
+    if not line:
+        continue
+    msg = json.loads(line)
+    method, mid = msg.get("method"), msg.get("id")
+    if method == "initialize":
+        send({"jsonrpc": "2.0", "id": mid, "result": {"protocolVersion": 1}})
+    elif method == "session/new":
+        send({"jsonrpc": "2.0", "id": mid, "result": {"sessionId": "s1"}})
+    elif method == "session/prompt":
+        send({"jsonrpc": "2.0", "method": "session/update", "params": {
+            "sessionId": "s1",
+            "update": {"sessionUpdate": "tool_call", "title": "Editing app.py"}}})
+        # Ask permission; the client must respond before we continue.
+        send({"jsonrpc": "2.0", "id": 999, "method": "session/request_permission",
+              "params": {"sessionId": "s1", "options": [
+                  {"optionId": "no", "kind": "deny"},
+                  {"optionId": "yes", "kind": "allow_once"}]}})
+        resp = json.loads(sys.stdin.readline().strip())
+        chosen = resp.get("result", {}).get("outcome", {}).get("optionId")
+        for chunk in ("Hello ", "world [" + str(chosen) + "]"):
+            send({"jsonrpc": "2.0", "method": "session/update", "params": {
+                "sessionId": "s1",
+                "update": {"sessionUpdate": "agent_message_chunk",
+                           "content": {"type": "text", "text": chunk}}}})
+        send({"jsonrpc": "2.0", "id": mid, "result": {"stopReason": "end_turn"}})
+'''
+
+
+# ── config normalization ──────────────────────────────────────────────────────
+
+
+def test_normalize_agents_keeps_valid_and_drops_bad():
+    agents = _normalize_agents([
+        {"name": "proto", "command": "proto", "args": ["--acp"], "workdir": "/tmp"},
+        {"name": "nofields"},                       # missing command/workdir
+        {"name": "x", "command": "x"},              # missing workdir
+        "not-a-dict",                                # wrong type
+        {"name": "proto", "command": "dup", "workdir": "/tmp"},  # duplicate name
+    ])
+    assert set(agents) == {"proto"}
+    assert agents["proto"]["command"] == "proto"     # first wins over the dup
+    assert agents["proto"]["args"] == ["--acp"]
+
+
+def test_normalize_agents_coerces_env_and_args():
+    agents = _normalize_agents([
+        {"name": "a", "command": "c", "workdir": "/tmp",
+         "args": "oops", "env": {"K": 1}},
+    ])
+    assert agents["a"]["args"] == []                 # non-list args ignored
+    assert agents["a"]["env"] == {"K": "1"}          # env values stringified
+
+
+# ── register() wiring ─────────────────────────────────────────────────────────
+
+
+class _StubRegistry:
+    def __init__(self, config):
+        self.config = config
+        self.tools = []
+
+    def register_tool(self, tool):
+        self.tools.append(tool)
+
+
+def test_register_no_agents_registers_nothing():
+    reg = _StubRegistry({"agents": []})
+    register(reg)
+    assert reg.tools == []
+
+
+def test_register_with_agents_exposes_code_with():
+    reg = _StubRegistry({"agents": [
+        {"name": "proto", "command": "proto", "args": ["--acp"], "workdir": "/tmp"},
+    ]})
+    register(reg)
+    assert [t.name for t in reg.tools] == ["code_with"]
+    # Configured agent names are surfaced in the LLM-facing description.
+    assert "proto" in reg.tools[0].description
+
+
+async def test_code_with_unknown_agent_returns_error():
+    reg = _StubRegistry({"agents": [
+        {"name": "proto", "command": "proto", "args": ["--acp"], "workdir": "/tmp"},
+    ]})
+    register(reg)
+    code_with = reg.tools[0]
+    out = await code_with.ainvoke({"agent": "nope", "task": "do it"})
+    assert "unknown coding agent" in out and "proto" in out
+
+
+async def test_code_with_empty_task_returns_error():
+    reg = _StubRegistry({"agents": [
+        {"name": "proto", "command": "proto", "args": ["--acp"], "workdir": "/tmp"},
+    ]})
+    register(reg)
+    code_with = reg.tools[0]
+    out = await code_with.ainvoke({"agent": "proto", "task": "   "})
+    assert "empty" in out.lower()
+
+
+# ── ACP wire exchange against the fake agent ──────────────────────────────────
+
+
+@pytest.fixture
+def fake_agent(tmp_path):
+    script = tmp_path / "fake_acp_agent.py"
+    script.write_text(_FAKE_AGENT, encoding="utf-8")
+    return script
+
+
+async def test_acp_client_drives_a_turn(fake_agent, tmp_path):
+    narrations: list[str] = []
+
+    async def on_progress(title: str) -> None:
+        narrations.append(title)
+
+    client = AcpClient(sys.executable, [str(fake_agent)], cwd=str(tmp_path), name="fake")
+    try:
+        answer = await client.prompt("add a healthz route", progress_callback=on_progress, timeout=30.0)
+    finally:
+        await client.close()
+
+    # agent_message_chunks accumulated; auto-allow picked the `allow_once` option.
+    assert answer == "Hello world [yes]"
+    # tool_call title narrated via the progress callback.
+    assert "Editing app.py" in narrations
+
+
+async def test_acp_client_missing_binary_raises_acp_error(tmp_path):
+    client = AcpClient("definitely-not-a-real-binary-xyz", [], cwd=str(tmp_path))
+    with pytest.raises(AcpError):
+        await client.prompt("hi", timeout=10.0)
+
+
+async def test_acp_client_bad_workdir_raises_acp_error():
+    client = AcpClient(sys.executable, [], cwd="/no/such/dir/anywhere")
+    with pytest.raises(AcpError):
+        await client.prompt("hi", timeout=10.0)

From 048b9f325948e688746534cc2d823fa4037f3c54 Mon Sep 17 00:00:00 2001
From: Josh Mabry <31560031+mabry1985@users.noreply.github.com>
Date: Sat, 6 Jun 2026 00:38:47 -0700
Subject: [PATCH 16/20] =?UTF-8?q?feat(coding=5Fagent):=20by-kind=20permiss?=
 =?UTF-8?q?ion=20policy=20+=20per-call=20consent=20gate=20=E2=80=94=20ADR?=
 =?UTF-8?q?=200024=20(PR3)=20(#599)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds safety controls to the coding_agent plugin (#596 shipped the base):

- Per-agent by-kind permission policy applied to the coding agent's
  session/request_permission requests (keyed on toolCall.kind):
    * auto (allow all — default, unchanged from PR1)
    * allowlist (allow all but execute/delete)
    * readonly (allow only read-like kinds)
  Overridable with allow_kinds / deny_kinds. AcpClient gains a pluggable
  permission resolver (default stays auto-allow, so PR1 behaviour is unchanged).
- Per-call consent gate: `confirm: true` makes code_with ask the operator via
  ask_human before each call (runs before any side effect, so LangGraph resume
  re-execution is idempotent). code_with drops @with_fallback so the interrupt
  control-flow exception propagates; the I/O is guarded with a local net.
- Agent recipes for protoCLI, Claude Code, Codex, Gemini CLI (docs + manifest).

Per-action live HITL is documented as deferred: pausing a blocking subprocess
session mid-turn is incompatible with interrupt()'s checkpoint/re-run model
(same coupling that blocks live narration). readonly/allowlist give
deterministic per-action control; confirm gives a per-call human gate.

- 17 new/updated tests incl. a wire test that a readonly policy rejects the
  fake agent's `edit` permission request (36 pass total).
- ADR 0024 scope + posture updated; guide permission section rewritten;
  CHANGELOG.

Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
---
 CHANGELOG.md                                 |  13 +-
 docs/adr/0024-spawn-cli-coding-agents-acp.md |  39 ++++--
 docs/adr/index.md                            |   2 +-
 docs/guides/coding-agents.md                 |  81 ++++++++---
 plugins/coding_agent/__init__.py             | 135 ++++++++++++++++---
 plugins/coding_agent/acp_client.py           |  12 +-
 plugins/coding_agent/protoagent.plugin.yaml  |  18 ++-
 tests/test_coding_agent_plugin.py            | 122 ++++++++++++++++-
 8 files changed, 359 insertions(+), 63 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1e96b73..b54a9b2 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -25,9 +25,18 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   deliberate opt-in; enable with `plugins: { enabled: [coding_agent] }` and
   declare agents under the `coding_agent` config section. One client (subprocess +
   session) is cached per agent so follow-up calls continue the same thread.
-  PR1 is synchronous (final answer returned; `tool_call` titles logged); live
-  narration onto A2A working frames + HITL permission policy land next.
+  Synchronous (final answer returned; `tool_call` titles logged).
   See [the guide](docs/guides/coding-agents.md).
+- **Coding-agent permission controls** (ADR 0024) — each configured agent takes a
+  by-kind permission policy applied to the coding agent's `session/request_permission`
+  requests: `auto` (allow all, default), `allowlist` (allow all but
+  `execute`/`delete`), or `readonly` (read-like kinds only) — overridable with
+  `allow_kinds` / `deny_kinds`. Plus a per-call consent gate (`confirm: true`)
+  that asks the operator via `ask_human` before each `code_with` call. Ships
+  agent recipes for protoCLI, Claude Code, Codex, and Gemini CLI. (Per-action
+  live HITL is deferred — pausing a blocking subprocess session mid-turn is
+  incompatible with LangGraph's resume model; use `readonly`/`allowlist` for
+  deterministic per-action control.)
 
 ## [0.15.1] - 2026-06-05
 
diff --git a/docs/adr/0024-spawn-cli-coding-agents-acp.md b/docs/adr/0024-spawn-cli-coding-agents-acp.md
index c0ed7d6..0fc329e 100644
--- a/docs/adr/0024-spawn-cli-coding-agents-acp.md
+++ b/docs/adr/0024-spawn-cli-coding-agents-acp.md
@@ -75,17 +75,30 @@ code_with(agent="proto", task="add a /healthz route and run the tests")
   → the agent's final message text (the work happens in its own session)
 ```
 
-### Confinement & permission posture (PR1)
+### Confinement & permission posture
 
 - **Workdir is config-pinned.** `code_with` takes only `agent` + `task` — never a
   caller-chosen path. The cwd comes from the matched config entry, so the LLM
   cannot point a coding agent at an arbitrary directory. Workdirs must be listed
   in config; an unknown `agent` returns an error listing the configured ones.
-- **Auto-allow within the workdir.** The client advertises no client-served
+- **By-kind permission policy (PR3).** The client advertises no client-served
   `fs`/`terminal` capability, so the coding agent uses its *own* file/shell
   access, scoped to the session cwd. Inbound `session/request_permission` is
-  auto-approved (first `allow` option), mirroring ORBIS's PR1 policy. The coding
-  agent self-governs inside its sandbox dir.
+  answered by the agent's `permissions` policy, keyed on the request's
+  `toolCall.kind`: `auto` (allow all — the PR1 default, mirroring ORBIS), `allowlist`
+  (deny `execute`/`delete`, allow the rest), or `readonly` (allow only read-like
+  kinds). Overridable per agent with `allow_kinds` / `deny_kinds`.
+- **Per-call consent gate (PR3).** `confirm: true` makes `code_with` ask the
+  operator (via `ask_human` → `input-required`) to approve *before each call* to
+  that agent. The gate runs before any side effect, so the LangGraph resume
+  re-execution is idempotent.
+- **Per-action live HITL is deferred** — approving each individual edit/shell
+  command mid-turn would require pausing a *blocking subprocess session* while
+  asking the operator. LangGraph's `interrupt()` checkpoints and **re-runs the
+  node** on resume, which can't resume a half-finished ACP session awaiting a
+  specific permission response. `readonly`/`allowlist` give deterministic
+  per-action control; `confirm` gives a per-call human gate. (Same coupling blocks
+  live narration — see Scope.)
 - The subprocess inherits the server's env (plus any per-agent `env`). Run
   protoAgent under an account whose ambient credentials you're willing to lend
   the coding agent — or scope its `workdir` to a throwaway checkout.
@@ -108,14 +121,20 @@ conversation; `task_batch` must not interleave two prompts on one session).
 
 ## Scope
 
-**PR1 (this ADR):** ACP client + `code_with` + config + auto-allow + tests +
-docs. Synchronous — the final answer is returned; `tool_call` titles are logged,
-not yet streamed to callers.
+**PR1 (#596):** ACP client + `code_with` + config + auto-allow + tests + docs.
+Synchronous — the final answer is returned; `tool_call` titles are logged.
+
+**PR3 (this update):** by-kind permission policy (`auto`/`allowlist`/`readonly` +
+`allow_kinds`/`deny_kinds`); per-call consent gate (`confirm`); shipped agent
+recipes (claude-code-acp, codex, gemini). Per-action live HITL is documented as
+deferred (the blocking-session constraint above).
 
 **Later PRs:** live narration of `tool_call` titles onto A2A working-status
-frames (so an operator watching a turn sees "Editing app.py"); richer permission
-policy (HITL gating via `ask_human`, allow-by-kind); more shipped agent recipes
-(claude-code-acp, codex, gemini); an eval case.
+frames (so an operator watching a turn sees "Editing app.py") — blocked on the
+same mid-session channel as per-action HITL, so it wants a general progress
+mechanism (event-bus ride-along or a stream-factory `progress` event), not a
+one-off; a gated eval case (needs an eval-runner skip mechanism so it doesn't
+require a coding agent in the default run).
 
 ## Consequences
 
diff --git a/docs/adr/index.md b/docs/adr/index.md
index 3f7a5db..81f07eb 100644
--- a/docs/adr/index.md
+++ b/docs/adr/index.md
@@ -32,4 +32,4 @@ decision, numbered, never deleted (supersede instead).
 | [0021](./0021-agent-memory-architecture.md) | Agent memory: extract, don't dump | Accepted |
 | [0022](./0022-activity-provenance-feed.md) | Activity is a provenance feed, not a second chat | Accepted |
 | [0023](./0023-server-decomposition.md) | Decompose server.py: AppState + composition root | Accepted |
-| [0024](./0024-spawn-cli-coding-agents-acp.md) | Spawn CLI coding agents over ACP (`code_with`) | Accepted (PR1) |
+| [0024](./0024-spawn-cli-coding-agents-acp.md) | Spawn CLI coding agents over ACP (`code_with`) | Accepted (PR1 + PR3) |
diff --git a/docs/guides/coding-agents.md b/docs/guides/coding-agents.md
index e831db3..95b3d98 100644
--- a/docs/guides/coding-agents.md
+++ b/docs/guides/coding-agents.md
@@ -38,6 +38,8 @@ coding_agent:
       workdir: ~/dev/my-repo      # session cwd — the confinement boundary
       # env: { SOME_KEY: value }  # optional extra env, merged over the process env
       # timeout_s: 900            # optional per-agent override (seconds)
+      # permissions: allowlist    # auto (default) | allowlist | readonly
+      # confirm: true             # ask the operator before each code_with call
 ```
 
 Enabling plugins needs a **restart** (plugin tools wire once at process init).
@@ -57,6 +59,14 @@ Any agent that speaks ACP works — just point `command`/`args` at it:
       command: npx
       args: ["@zed-industries/claude-code-acp"]
       workdir: ~/dev/my-repo
+    - name: codex
+      command: codex
+      args: ["acp"]
+      workdir: ~/dev/my-repo
+    - name: gemini
+      command: gemini
+      args: ["--experimental-acp"]
+      workdir: ~/dev/my-repo
 ```
 
 The binary must be installed and on the `PATH` of the process running protoAgent.
@@ -86,24 +96,59 @@ Notes for whoever writes the `task`:
 
 ## Permission posture
 
-PR1 (the current cut) auto-allows the coding agent's actions, confined to its
-workdir:
-
-- **Workdir is config-pinned.** `code_with` takes only `agent` + `task` — never a
-  path. The cwd comes from config, so the model can't aim a coding agent at an
-  arbitrary directory.
-- **Auto-allow within the workdir.** protoAgent advertises no client-served
-  `fs`/`terminal` capability, so the coding agent uses its *own* file/shell
-  access, scoped to the session cwd. Inbound `session/request_permission` is
-  auto-approved. The coding agent self-governs inside its sandbox dir.
-- The subprocess **inherits protoAgent's environment** (plus any per-agent
-  `env`). Run protoAgent under an account whose ambient credentials you're
-  willing to lend the coding agent, or scope the `workdir` to a throwaway
-  checkout.
-
-Coming in later PRs (ADR 0024): live narration of the coding agent's progress
-("Editing app.py") onto A2A working-status frames; HITL permission gating via
-`ask_human`; allow-by-kind policy.
+A coding agent works in its **config-pinned workdir** (`code_with` takes only
+`agent` + `task`, never a path — the model can't aim it elsewhere) and uses its
+*own* file/shell access there: protoAgent advertises no client-served
+`fs`/`terminal` capability. When the coding agent asks to do something risky it
+sends a `session/request_permission`, which protoAgent answers with the agent's
+**permission policy**:
+
+| `permissions` | Behaviour |
+|---|---|
+| `auto` *(default)* | Allow everything — the agent self-governs within its workdir. |
+| `allowlist` | Allow all action kinds **except** `execute` and `delete` (override with `allow_kinds` / `deny_kinds`). |
+| `readonly` | Allow only read-like kinds (`read`, `search`, `fetch`, …); deny edits, shell, and deletes. |
+
+Action kinds come from the ACP request (`toolCall.kind`: `read` / `edit` /
+`execute` / `delete` / `fetch` / `move` / `search` / …). Tune a policy per agent:
+
+```yaml
+    - name: proto
+      command: proto
+      args: ["--acp"]
+      workdir: ~/dev/my-repo
+      permissions: allowlist
+      deny_kinds: [execute, delete]   # the allowlist default, shown explicitly
+```
+
+### Per-call consent gate
+
+Set `confirm: true` on an agent and `code_with` asks the operator to approve
+**before each call** to that agent (via `ask_human` — the turn parks as
+`input-required` until you reply `yes`):
+
+```yaml
+    - name: proto
+      command: proto
+      args: ["--acp"]
+      workdir: ~/dev/my-repo
+      confirm: true
+```
+
+> **Per-action** live HITL (approve each individual edit/shell command as the
+> coding agent works) is **not** available: it would require pausing a blocking
+> subprocess session mid-turn, which LangGraph's checkpoint/resume model can't do
+> without re-running the tool. Use `permissions: readonly`/`allowlist` for
+> deterministic per-action control, and `confirm` for a per-call human gate.
+
+### Environment
+
+The subprocess **inherits protoAgent's environment** (plus any per-agent `env`).
+Run protoAgent under an account whose ambient credentials you're willing to lend
+the coding agent, or scope the `workdir` to a throwaway checkout.
+
+Coming in a later PR (ADR 0024): live narration of the coding agent's progress
+("Editing app.py") onto A2A working-status frames.
 
 ## How it works
 
diff --git a/plugins/coding_agent/__init__.py b/plugins/coding_agent/__init__.py
index 3711c02..e22b9d6 100644
--- a/plugins/coding_agent/__init__.py
+++ b/plugins/coding_agent/__init__.py
@@ -6,31 +6,49 @@
 (JSON-RPC 2.0 over the child's stdio) by ``acp_client.AcpClient``.
 
 The plugin ships disabled with an empty agent list — each configured agent gets
-file + shell access in its workdir (auto-allowed, confined to that dir), so it's
-a deliberate opt-in. Enable with ``plugins: { enabled: [coding_agent] }`` and add
-agents under the ``coding_agent`` config section. See docs/guides/coding-agents.md.
+file + shell access in its workdir, so it's a deliberate opt-in. Enable with
+``plugins: { enabled: [coding_agent] }`` and add agents under the ``coding_agent``
+config section. See docs/guides/coding-agents.md.
+
+Per-agent safety controls (ADR 0024):
+- ``permissions`` — by-kind permission policy the client applies to the coding
+  agent's ``session/request_permission`` requests: ``auto`` (allow all, default),
+  ``allowlist`` (allow all but deny ``execute``/``delete``), or ``readonly``
+  (allow only read-like kinds). Overridable with ``allow_kinds`` / ``deny_kinds``.
+- ``confirm`` — when true, ``code_with`` asks the operator (``ask_human``) to
+  approve *before* each call to that agent (a per-call consent gate). Per-action
+  live HITL is deferred — it would need to pause a blocking subprocess session,
+  which LangGraph's resume model can't do mid-tool.
 """
 
 from __future__ import annotations
 
 import asyncio
 import logging
+from typing import Callable
 
 from langchain_core.tools import tool
 
-from tools.fallbacks import with_fallback
-
 from .acp_client import AcpClient, AcpError
 
 log = logging.getLogger("protoagent.plugins.coding_agent")
 
-# One client (subprocess + session) per agent, keyed by its launch signature so a
-# config change spins up a fresh client. Module-global so the session persists
-# across graph builds / turns; a per-agent lock serializes turns (a session is a
-# single conversation — ``task_batch`` must not interleave two prompts on one).
+# One client (subprocess + session) per agent, keyed by its launch + policy
+# signature so a config change spins up a fresh client. Module-global so the
+# session persists across graph builds / turns; a per-agent lock serializes turns
+# (a session is a single conversation — ``task_batch`` must not interleave two
+# prompts on one).
 _CLIENTS: dict[tuple, AcpClient] = {}
 _LOCKS: dict[str, asyncio.Lock] = {}
 
+_VALID_POLICIES = {"auto", "allowlist", "readonly"}
+# ACP tool-call kinds treated as read-only (safe under ``readonly``).
+_READONLY_KINDS = {"read", "search", "fetch", "think", "glob", "grep", "list"}
+# Risky kinds denied by ``allowlist`` unless explicitly allowed.
+_DEFAULT_DENY = {"execute", "delete"}
+# Resume values that count as approval for the ``confirm`` gate.
+_APPROVALS = {"y", "yes", "approve", "approved", "ok", "okay", "allow", "proceed", "go"}
+
 
 def _normalize_agents(raw) -> dict[str, dict]:
     """Validate the configured ``agents`` list → {name: spec}. Drops bad entries
@@ -54,6 +72,10 @@ def _normalize_agents(raw) -> dict[str, dict]:
             log.warning("[coding_agent] %s: args must be a list — ignoring", name)
             args = []
         env = entry.get("env") if isinstance(entry.get("env"), dict) else None
+        policy = str(entry.get("permissions", "auto")).strip().lower() or "auto"
+        if policy not in _VALID_POLICIES:
+            log.warning("[coding_agent] %s: unknown permissions %r — using 'auto'", name, policy)
+            policy = "auto"
         agents[name] = {
             "name": name,
             "command": command,
@@ -61,13 +83,59 @@ def _normalize_agents(raw) -> dict[str, dict]:
             "workdir": workdir,
             "env": {str(k): str(v) for k, v in env.items()} if env else None,
             "timeout_s": entry.get("timeout_s"),
+            "permissions": policy,
+            "allow_kinds": [str(k).lower() for k in (entry.get("allow_kinds") or [])],
+            "deny_kinds": [str(k).lower() for k in (entry.get("deny_kinds") or [])],
+            "confirm": bool(entry.get("confirm", False)),
         }
     return agents
 
 
+def _make_permission(spec: dict) -> Callable[[dict], str | None]:
+    """Build the ACP permission resolver for an agent: given a request's params,
+    return the optionId to select (or None to cancel/deny). Decides per the
+    agent's ``permissions`` policy, using the request's ``toolCall.kind``."""
+    policy = spec["permissions"]
+    allow_set = set(spec["allow_kinds"])
+    deny_set = set(spec["deny_kinds"])
+
+    def _allowed(kind: str) -> bool:
+        if policy == "readonly":
+            return kind in (allow_set or _READONLY_KINDS)
+        if policy == "allowlist":
+            if kind in (deny_set or _DEFAULT_DENY):
+                return False
+            return kind in allow_set if allow_set else True
+        return True  # auto
+
+    def resolver(params: dict) -> str | None:
+        options = params.get("options") or []
+        kind = str(((params.get("toolCall") or {}).get("kind") or "")).lower()
+        allow = _allowed(kind)
+        prefix = "allow" if allow else "reject"
+        for opt in options:
+            if str(opt.get("kind", "")).startswith(prefix):
+                return opt.get("optionId")
+        # No option of the desired kind: allow ⇒ fall back to the first option;
+        # deny ⇒ cancel (None).
+        if allow:
+            return options[0].get("optionId") if options else None
+        log.info("[coding_agent/%s] denied %r action (policy=%s)", spec["name"], kind or "?", policy)
+        return None
+
+    return resolver
+
+
+def _cache_key(spec: dict) -> tuple:
+    return (
+        spec["name"], spec["command"], tuple(spec["args"]), spec["workdir"],
+        spec["permissions"], tuple(sorted(spec["allow_kinds"])), tuple(sorted(spec["deny_kinds"])),
+    )
+
+
 def _client_for(spec: dict) -> AcpClient:
     """Get-or-create the cached client for an agent spec."""
-    key = (spec["name"], spec["command"], tuple(spec["args"]), spec["workdir"])
+    key = _cache_key(spec)
     client = _CLIENTS.get(key)
     if client is None:
         client = AcpClient(
@@ -76,19 +144,28 @@ def _client_for(spec: dict) -> AcpClient:
             cwd=spec["workdir"],
             env=spec["env"],
             name=spec["name"],
+            permission=_make_permission(spec),
         )
         _CLIENTS[key] = client
     return client
 
 
+def _approved(decision) -> bool:
+    return str(decision).strip().lower() in _APPROVALS
+
+
 def _build_code_with(agents: dict[str, dict], default_timeout_s: float):
-    """Build the ``code_with`` tool, closing over the configured agents."""
+    """Build the ``code_with`` tool, closing over the configured agents.
+
+    Not wrapped in ``with_fallback``: the ``confirm`` gate calls ``interrupt()``,
+    whose control-flow exception that wrapper would swallow. Expected failures
+    return error strings; the I/O is guarded locally as the equivalent net.
+    """
     listing = ", ".join(
         f"`{name}` (in `{spec['workdir']}`)" for name, spec in agents.items()
     )
 
     @tool
-    @with_fallback("The coding agent did not return a result.")
     async def code_with(agent: str, task: str) -> str:
         """Delegate a coding task to a CLI coding agent and return its result.
 
@@ -118,23 +195,37 @@ async def code_with(agent: str, task: str) -> str:
         if not str(task).strip():
             return "Error: `task` is empty — give the coding agent a concrete instruction."
 
+        # Per-call consent gate (before any side effect, so re-execution on resume
+        # is idempotent). interrupt() parks the turn as input-required.
+        if spec["confirm"]:
+            from langgraph.types import interrupt
+
+            decision = interrupt({
+                "question": (
+                    f"Allow coding agent '{agent}' to work in {spec['workdir']}?\n\n"
+                    f"Task: {task}\n\nReply 'yes' to proceed, anything else to decline."
+                )
+            })
+            if not _approved(decision):
+                return f"Declined: the operator did not approve running '{agent}' on this task."
+
         lock = _LOCKS.setdefault(agent, asyncio.Lock())
         timeout = float(spec.get("timeout_s") or default_timeout_s)
         client = _client_for(spec)
 
         async def _narrate(title: str) -> None:
-            # PR1: log narration. A later PR streams these onto A2A working frames.
+            # Log narration. A later PR streams these onto A2A working frames.
             log.info("[coding_agent/%s] %s", agent, title)
 
-        async with lock:
-            try:
-                answer = await client.prompt(
-                    task, progress_callback=_narrate, timeout=timeout
-                )
-            except AcpError as exc:
-                # Drop the cached client so the next call relaunches cleanly.
-                _CLIENTS.pop((spec["name"], spec["command"], tuple(spec["args"]), spec["workdir"]), None)
-                return f"Error: {agent} (coding agent) failed: {exc}"
+        try:
+            async with lock:
+                answer = await client.prompt(task, progress_callback=_narrate, timeout=timeout)
+        except AcpError as exc:
+            _CLIENTS.pop(_cache_key(spec), None)  # drop so the next call relaunches
+            return f"Error: {agent} (coding agent) failed: {exc}"
+        except Exception as exc:  # noqa: BLE001 — local safety net (with_fallback is dropped)
+            log.warning("[coding_agent/%s] unexpected failure: %s", agent, exc)
+            return f"Error (partial result): {agent} could not complete: {type(exc).__name__}: {exc}"
         return answer or f"{agent} finished but returned no text."
 
     # The configured agent names belong in the LLM-facing description so the model
diff --git a/plugins/coding_agent/acp_client.py b/plugins/coding_agent/acp_client.py
index 579c35b..433f95a 100644
--- a/plugins/coding_agent/acp_client.py
+++ b/plugins/coding_agent/acp_client.py
@@ -57,12 +57,17 @@ def __init__(
         cwd: str,
         env: dict[str, str] | None = None,
         name: str = "acp",
+        permission: Callable[[dict], str | None] | None = None,
     ) -> None:
         self.command = command
         self.args = list(args or [])
         self.cwd = str(Path(cwd).expanduser())
         self.env = env
         self.name = name
+        # Permission resolver: ``(request_params) -> optionId | None`` (None ⇒
+        # cancel/deny). Defaults to ``_auto_allow`` — the coding agent self-governs
+        # within its workdir. The plugin injects a by-kind policy here (ADR 0024).
+        self._permission = permission
 
         self._proc: asyncio.subprocess.Process | None = None
         self._session_id: str | None = None
@@ -191,7 +196,8 @@ async def _handle_request(self, msg: dict) -> None:
         method = msg.get("method")
         rid = msg.get("id")
         if method == "session/request_permission":
-            option_id = self._auto_allow(msg.get("params") or {})
+            resolver = self._permission or self._auto_allow
+            option_id = resolver(msg.get("params") or {})
             outcome = (
                 {"outcome": "selected", "optionId": option_id}
                 if option_id
@@ -205,8 +211,8 @@ async def _handle_request(self, msg: dict) -> None:
 
     @staticmethod
     def _auto_allow(params: dict) -> str | None:
-        """PR1 permission policy: pick the first 'allow' option (else the first
-        option). The HITL / deny-policy layer replaces this (ADR 0024, later PR)."""
+        """Default permission policy: pick the first 'allow' option (else the
+        first option). The plugin's by-kind policy (ADR 0024) overrides this."""
         options = params.get("options") or []
         for opt in options:
             if str(opt.get("kind", "")).startswith("allow"):
diff --git a/plugins/coding_agent/protoagent.plugin.yaml b/plugins/coding_agent/protoagent.plugin.yaml
index 90e83ce..b5feff0 100644
--- a/plugins/coding_agent/protoagent.plugin.yaml
+++ b/plugins/coding_agent/protoagent.plugin.yaml
@@ -29,7 +29,7 @@ config_section: coding_agent
 config:
   # Default per-turn timeout (seconds). Coding is slow; override per agent.
   default_timeout_s: 600
-  # No agents by default — you MUST add your own (see the guide). Example:
+  # No agents by default — you MUST add your own (see the guide). Examples:
   #   agents:
   #     - name: proto             # the name the LLM passes to code_with(agent=…)
   #       command: proto          # binary on PATH
@@ -37,6 +37,22 @@ config:
   #       workdir: ~/dev/my-repo  # session cwd — the confinement boundary
   #       # env: { SOME_KEY: value }   # optional extra env, merged over process env
   #       # timeout_s: 900             # optional per-agent override
+  #       # permissions: allowlist     # auto (default) | allowlist | readonly
+  #       # allow_kinds: []            # override: kinds to allow (readonly/allowlist)
+  #       # deny_kinds: [execute, delete]  # override: kinds to deny (allowlist)
+  #       # confirm: true              # ask the operator before each code_with call
+  #     - name: claude-code
+  #       command: npx
+  #       args: ["@zed-industries/claude-code-acp"]
+  #       workdir: ~/dev/my-repo
+  #     - name: codex
+  #       command: codex
+  #       args: ["acp"]
+  #       workdir: ~/dev/my-repo
+  #     - name: gemini
+  #       command: gemini
+  #       args: ["--experimental-acp"]
+  #       workdir: ~/dev/my-repo
   agents: []
 settings:
   - key: default_timeout_s
diff --git a/tests/test_coding_agent_plugin.py b/tests/test_coding_agent_plugin.py
index 9c28a83..916e6be 100644
--- a/tests/test_coding_agent_plugin.py
+++ b/tests/test_coding_agent_plugin.py
@@ -12,7 +12,8 @@
 
 import pytest
 
-from plugins.coding_agent import _normalize_agents, register
+import plugins.coding_agent as P
+from plugins.coding_agent import _make_permission, _normalize_agents, register
 from plugins.coding_agent.acp_client import AcpClient, AcpError
 
 # ── a minimal ACP "agent" (server side) we can drive over stdio ───────────────
@@ -46,9 +47,11 @@ def send(obj):
             "update": {"sessionUpdate": "tool_call", "title": "Editing app.py"}}})
         # Ask permission; the client must respond before we continue.
         send({"jsonrpc": "2.0", "id": 999, "method": "session/request_permission",
-              "params": {"sessionId": "s1", "options": [
-                  {"optionId": "no", "kind": "deny"},
-                  {"optionId": "yes", "kind": "allow_once"}]}})
+              "params": {"sessionId": "s1",
+                         "toolCall": {"toolCallId": "t1", "kind": "edit"},
+                         "options": [
+                             {"optionId": "reject", "kind": "reject_once"},
+                             {"optionId": "ok", "kind": "allow_once"}]}})
         resp = json.loads(sys.stdin.readline().strip())
         chosen = resp.get("result", {}).get("outcome", {}).get("optionId")
         for chunk in ("Hello ", "world [" + str(chosen) + "]"):
@@ -155,12 +158,27 @@ async def on_progress(title: str) -> None:
     finally:
         await client.close()
 
-    # agent_message_chunks accumulated; auto-allow picked the `allow_once` option.
-    assert answer == "Hello world [yes]"
+    # agent_message_chunks accumulated; default auto-allow picked the allow option.
+    assert answer == "Hello world [ok]"
     # tool_call title narrated via the progress callback.
     assert "Editing app.py" in narrations
 
 
+async def test_acp_client_readonly_policy_denies_edit(fake_agent, tmp_path):
+    # A readonly policy must reject the fake's `edit` permission request — the
+    # client picks the reject_once option, which the fake echoes back.
+    spec = {"name": "ro", "permissions": "readonly", "allow_kinds": [], "deny_kinds": []}
+    client = AcpClient(
+        sys.executable, [str(fake_agent)], cwd=str(tmp_path),
+        permission=_make_permission(spec),
+    )
+    try:
+        answer = await client.prompt("edit a file", timeout=30.0)
+    finally:
+        await client.close()
+    assert answer == "Hello world [reject]"
+
+
 async def test_acp_client_missing_binary_raises_acp_error(tmp_path):
     client = AcpClient("definitely-not-a-real-binary-xyz", [], cwd=str(tmp_path))
     with pytest.raises(AcpError):
@@ -171,3 +189,95 @@ async def test_acp_client_bad_workdir_raises_acp_error():
     client = AcpClient(sys.executable, [], cwd="/no/such/dir/anywhere")
     with pytest.raises(AcpError):
         await client.prompt("hi", timeout=10.0)
+
+
+# ── permission policy ─────────────────────────────────────────────────────────
+
+_OPTS = [{"optionId": "a", "kind": "allow_once"}, {"optionId": "r", "kind": "reject_once"}]
+
+
+def _perm(policy, kind, options=None, allow=None, deny=None):
+    spec = {
+        "name": "x", "permissions": policy,
+        "allow_kinds": [k.lower() for k in (allow or [])],
+        "deny_kinds": [k.lower() for k in (deny or [])],
+    }
+    return _make_permission(spec)({"toolCall": {"kind": kind}, "options": options or _OPTS})
+
+
+def test_policy_auto_allows_everything():
+    assert _perm("auto", "execute") == "a"
+    assert _perm("auto", "delete") == "a"
+    assert _perm("auto", "edit") == "a"
+
+
+def test_policy_allowlist_denies_risky_allows_safe():
+    assert _perm("allowlist", "edit") == "a"
+    assert _perm("allowlist", "read") == "a"
+    assert _perm("allowlist", "execute") == "r"      # risky → reject option
+    assert _perm("allowlist", "delete") == "r"
+
+
+def test_policy_readonly_allows_read_denies_writes():
+    assert _perm("readonly", "read") == "a"
+    assert _perm("readonly", "search") == "a"
+    assert _perm("readonly", "edit") == "r"
+    assert _perm("readonly", "execute") == "r"
+
+
+def test_policy_deny_cancels_when_no_reject_option():
+    only_allow = [{"optionId": "a", "kind": "allow_once"}]
+    assert _perm("readonly", "edit", options=only_allow) is None
+
+
+def test_policy_custom_allow_deny_kinds():
+    assert _perm("allowlist", "edit", deny=["edit"]) == "r"        # explicitly denied
+    assert _perm("readonly", "edit", allow=["read", "edit"]) == "a"  # explicitly allowed
+
+
+def test_normalize_agents_parses_safety_fields():
+    a = _normalize_agents([{
+        "name": "p", "command": "c", "workdir": "/tmp",
+        "permissions": "READONLY", "confirm": True,
+        "allow_kinds": ["Read"], "deny_kinds": ["Execute"],
+    }])["p"]
+    assert a["permissions"] == "readonly"          # lower-cased
+    assert a["confirm"] is True
+    assert a["allow_kinds"] == ["read"] and a["deny_kinds"] == ["execute"]
+
+
+def test_normalize_agents_bad_policy_falls_back_to_auto():
+    a = _normalize_agents([{"name": "p", "command": "c", "workdir": "/tmp", "permissions": "yolo"}])["p"]
+    assert a["permissions"] == "auto"
+    assert a["confirm"] is False                    # default
+
+
+# ── per-call consent gate (confirm) ───────────────────────────────────────────
+
+
+async def test_confirm_gate_declines(monkeypatch):
+    import langgraph.types as lt
+    monkeypatch.setattr(lt, "interrupt", lambda payload: "no")
+    reg = _StubRegistry({"agents": [
+        {"name": "proto", "command": "proto", "workdir": "/tmp", "confirm": True},
+    ]})
+    register(reg)
+    out = await reg.tools[0].ainvoke({"agent": "proto", "task": "do it"})
+    assert "Declined" in out
+
+
+async def test_confirm_gate_approves_then_runs(monkeypatch):
+    import langgraph.types as lt
+    monkeypatch.setattr(lt, "interrupt", lambda payload: "yes")
+
+    class _StubClient:
+        async def prompt(self, task, progress_callback=None, timeout=600.0):
+            return "did the work"
+
+    monkeypatch.setattr(P, "_client_for", lambda spec: _StubClient())
+    reg = _StubRegistry({"agents": [
+        {"name": "proto", "command": "proto", "workdir": "/tmp", "confirm": True},
+    ]})
+    register(reg)
+    out = await reg.tools[0].ainvoke({"agent": "proto", "task": "do it"})
+    assert out == "did the work"

From 50ed5848be17ddc6b8f8d403a77cf1b52c59bc27 Mon Sep 17 00:00:00 2001
From: Josh Mabry <31560031+mabry1985@users.noreply.github.com>
Date: Sat, 6 Jun 2026 00:44:26 -0700
Subject: [PATCH 17/20] =?UTF-8?q?feat(evals):=20requires=5Fenv=20skip=20+?=
 =?UTF-8?q?=20gated=20code=5Fwith=20delegation=20eval=20=E2=80=94=20ADR=20?=
 =?UTF-8?q?0024=20(PR4)=20(#600)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Completes the ADR 0024 PR plan with the coding-agent eval, plus the runner
mechanism it needs:

- eval runner: a case may declare `requires_env: [VAR, …]`; when any is unset
  the case is SKIPPED (new CaseResult.skipped; shown SKIP, excluded from the
  pass/fail tally and from the non-zero exit) instead of run. Generally useful —
  any case needing an optional integration can gate on it without breaking the
  default board.
- tasks.json: `code_with_delegation` (kind=ask, gated on EVAL_CODING_AGENT) —
  drives a live A2A turn that asks the agent to use code_with and asserts (audit
  channel) the tool fired. Skips by default.
- tests: _requirements_unmet, board counts skipped separately, case present +
  gated (test_eval_coverage, 22 pass; full eval suite 36 pass).
- docs: evals guide (requires_env), coding-agents guide (Eval it), ADR 0024
  scope, CHANGELOG.

Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
---
 CHANGELOG.md                                 |  7 ++++
 docs/adr/0024-spawn-cli-coding-agents-acp.md |  8 +++--
 docs/guides/coding-agents.md                 | 14 ++++++++
 docs/guides/evals.md                         | 23 ++++++++++++
 evals/runner.py                              | 37 ++++++++++++++++----
 evals/tasks.json                             | 10 ++++++
 tests/test_eval_coverage.py                  | 37 ++++++++++++++++++++
 7 files changed, 128 insertions(+), 8 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index b54a9b2..39318df 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,6 +12,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased]
 
 ### Added
+- **Eval-case gating (`requires_env`)** — an eval case can now declare
+  `requires_env: [VAR, …]`; when any is unset the case is **skipped** (shown
+  `SKIP`, excluded from the pass/fail tally) instead of run, so a case needing an
+  optional integration doesn't break the default board. Uses it to ship a gated
+  `code_with_delegation` case (ADR 0024) that verifies end-to-end coding-agent
+  delegation over a live A2A turn — run it with `EVAL_CODING_AGENT=1` once a
+  coding agent is configured. See [Eval your fork](docs/guides/evals.md).
 - **Spawn CLI coding agents over ACP** — a new opt-in `coding_agent` plugin
   (ADR 0024) adds a `code_with(agent, task)` tool that hands a real, repo-scoped
   coding job to a purpose-built CLI coding agent (protoCLI `proto`, Claude Code,
diff --git a/docs/adr/0024-spawn-cli-coding-agents-acp.md b/docs/adr/0024-spawn-cli-coding-agents-acp.md
index 0fc329e..4fe01f7 100644
--- a/docs/adr/0024-spawn-cli-coding-agents-acp.md
+++ b/docs/adr/0024-spawn-cli-coding-agents-acp.md
@@ -129,12 +129,16 @@ Synchronous — the final answer is returned; `tool_call` titles are logged.
 recipes (claude-code-acp, codex, gemini). Per-action live HITL is documented as
 deferred (the blocking-session constraint above).
 
+**PR4:** a gated eval case (`code_with_delegation`) verifying end-to-end
+delegation over a live A2A turn, plus a `requires_env` skip mechanism in the eval
+runner so the case (and any future case needing an optional integration) is
+skipped — not failed — when its prerequisite env var is unset.
+
 **Later PRs:** live narration of `tool_call` titles onto A2A working-status
 frames (so an operator watching a turn sees "Editing app.py") — blocked on the
 same mid-session channel as per-action HITL, so it wants a general progress
 mechanism (event-bus ride-along or a stream-factory `progress` event), not a
-one-off; a gated eval case (needs an eval-runner skip mechanism so it doesn't
-require a coding agent in the default run).
+one-off.
 
 ## Consequences
 
diff --git a/docs/guides/coding-agents.md b/docs/guides/coding-agents.md
index 95b3d98..2a0d3c5 100644
--- a/docs/guides/coding-agents.md
+++ b/docs/guides/coding-agents.md
@@ -167,5 +167,19 @@ One `AcpClient` (subprocess + session) is **cached per agent** so follow-up call
 continue the thread; a per-agent lock serializes turns (a session is a single
 conversation — `task_batch` won't interleave two prompts on one).
 
+## Eval it
+
+A gated eval case (`code_with_delegation`) verifies end-to-end delegation against
+a live agent. It's skipped unless you opt in — configure an agent, then:
+
+```bash
+export EVAL_CODING_AGENT=1
+python -m evals.runner --tasks code_with_delegation
+```
+
+It drives a real A2A turn that asks the agent to use `code_with`, and asserts
+(via the audit channel) that the tool fired. Without `EVAL_CODING_AGENT` set it
+`SKIP`s, so it never breaks the default board. See [Eval your fork](/guides/evals).
+
 See [Plugins](/guides/plugins) for the plugin model in general, and
 [ADR 0024](/adr/0024-spawn-cli-coding-agents-acp) for the design rationale.
diff --git a/docs/guides/evals.md b/docs/guides/evals.md
index 4518751..64a5229 100644
--- a/docs/guides/evals.md
+++ b/docs/guides/evals.md
@@ -154,6 +154,29 @@ The case `kind`s that ship:
   and assert on its synthesized output (patterns + rubric). Used to track the
   subagent workflows (research-and-brief, deep-research).
 
+### Gating a case on prerequisites — `requires_env`
+
+A case can declare `requires_env: [VAR, …]`. If any of those env vars is unset
+the case is **skipped** (shown `SKIP`, not counted as pass or fail) instead of
+run — so a case that needs an optional integration doesn't break the default
+board. For example the `code_with_delegation` case needs a live CLI coding agent
+([Spawn CLI coding agents](/guides/coding-agents)) configured on the instance, so
+it's gated:
+
+```json
+{
+  "id": "code_with_delegation",
+  "category": "tool",
+  "kind": "ask",
+  "requires_env": ["EVAL_CODING_AGENT"],
+  "prompt": "Use your code_with tool to have a coding agent run `pwd` …",
+  "expected_tools": ["code_with"]
+}
+```
+
+Configure the plugin + an agent, export `EVAL_CODING_AGENT=1`, and run
+`python -m evals.runner --tasks code_with_delegation`.
+
 ## Asserting the agent layer (subagents & workflows)
 
 Beyond single-tool selection, the suite tracks the layers recent work has been
diff --git a/evals/runner.py b/evals/runner.py
index 349c81d..52a988d 100644
--- a/evals/runner.py
+++ b/evals/runner.py
@@ -33,6 +33,7 @@
 import argparse
 import asyncio
 import json
+import os
 import sys
 import time
 from dataclasses import asdict, dataclass, field
@@ -58,6 +59,19 @@ class CaseResult:
     duration_ms: int = 0
     tokens: int = 0
     raw: dict = field(default_factory=dict)
+    skipped: bool = False
+
+
+def _requirements_unmet(case: dict) -> str | None:
+    """A reason to skip the case, or None to run it. A case declares
+    ``requires_env: [VAR, …]`` for prerequisites that aren't always present —
+    e.g. an optional integration the operator opts into. Skipped (not failed)
+    when any is unset, so a case needing a live coding agent doesn't break the
+    default board."""
+    for var in case.get("requires_env") or []:
+        if not os.environ.get(var):
+            return f"requires_env {var}"
+    return None
 
 
 # ── case runners ────────────────────────────────────────────────────────────
@@ -473,8 +487,8 @@ def _print_board(results: list[CaseResult]) -> None:
     print("-" * 90)
     pass_count = 0
     for r in results:
-        mark = "PASS" if r.passed else "FAIL"
-        if r.passed:
+        mark = "SKIP" if r.skipped else ("PASS" if r.passed else "FAIL")
+        if r.passed and not r.skipped:
             pass_count += 1
         time_s = f"{r.duration_ms}ms".rjust(6)
         tokens = str(r.tokens).rjust(6) if r.tokens else "  -   "
@@ -483,7 +497,9 @@ def _print_board(results: list[CaseResult]) -> None:
             f"{mark}    {time_s}  {tokens}  {r.detail[:80]}"
         )
     print("-" * 90)
-    print(f"\n{pass_count}/{len(results)} passed")
+    skip_count = sum(1 for r in results if r.skipped)
+    ran = len(results) - skip_count
+    print(f"\n{pass_count}/{ran} passed" + (f" ({skip_count} skipped)" if skip_count else ""))
 
 
 def _save_report(results: list[CaseResult], path: Path, *, model: str = "", base_url: str = "") -> None:
@@ -496,7 +512,8 @@ def _save_report(results: list[CaseResult], path: Path, *, model: str = "", base
         "model": model,
         "base_url": base_url,
         "total": len(results),
-        "passed": sum(1 for r in results if r.passed),
+        "passed": sum(1 for r in results if r.passed and not r.skipped),
+        "skipped": sum(1 for r in results if r.skipped),
         "results": [asdict(r) for r in results],
     }
     path.write_text(json.dumps(payload, indent=2))
@@ -544,8 +561,16 @@ async def main():
     for case in cases:
         sys.stdout.write(f"  {case['id']}... ")
         sys.stdout.flush()
-        result = await run_one(client, case)
-        sys.stdout.write(f"{'PASS' if result.passed else 'FAIL'}  {result.detail[:60]}\n")
+        skip = _requirements_unmet(case)
+        if skip:
+            result = CaseResult(
+                case["id"], case.get("category", "?"), case.get("name", "?"),
+                passed=True, detail=f"skipped: {skip}", skipped=True,
+            )
+        else:
+            result = await run_one(client, case)
+        mark = "SKIP" if result.skipped else ("PASS" if result.passed else "FAIL")
+        sys.stdout.write(f"{mark}  {result.detail[:60]}\n")
         results.append(result)
 
     _print_board(results)
diff --git a/evals/tasks.json b/evals/tasks.json
index 0882046..82d98c6 100644
--- a/evals/tasks.json
+++ b/evals/tasks.json
@@ -56,6 +56,16 @@
     "expected_tools": ["current_time"],
     "expected_patterns": ["UTC"]
   },
+  {
+    "id": "code_with_delegation",
+    "category": "tool",
+    "kind": "ask",
+    "name": "Delegates a coding task via code_with (ACP)",
+    "requires_env": ["EVAL_CODING_AGENT"],
+    "timeout_s": 240,
+    "prompt": "Use your code_with tool to have a coding agent run `pwd` and report back the absolute working directory it printed.",
+    "expected_tools": ["code_with"]
+  },
   {
     "id": "calculator_intent",
     "category": "tool",
diff --git a/tests/test_eval_coverage.py b/tests/test_eval_coverage.py
index 74fb9b3..dd847b9 100644
--- a/tests/test_eval_coverage.py
+++ b/tests/test_eval_coverage.py
@@ -188,3 +188,40 @@ def test_workflow_cases_reference_real_recipes():
     for c in TASKS:
         if c.get("kind") == "workflow":
             assert c["workflow"] in bundled, f"{c['id']} → unknown recipe {c['workflow']!r}"
+
+
+# ── requires_env skip mechanism (ADR 0024 coding-agent eval) ──────────────────
+
+
+def test_requirements_unmet_skips_when_env_missing(monkeypatch):
+    monkeypatch.delenv("EVAL_X", raising=False)
+    assert runner._requirements_unmet({"requires_env": ["EVAL_X"]}) == "requires_env EVAL_X"
+
+
+def test_requirements_met_runs_when_env_set(monkeypatch):
+    monkeypatch.setenv("EVAL_X", "1")
+    assert runner._requirements_unmet({"requires_env": ["EVAL_X"]}) is None
+
+
+def test_no_requirements_runs():
+    assert runner._requirements_unmet({}) is None
+    assert runner._requirements_unmet({"requires_env": []}) is None
+
+
+def test_code_with_eval_case_present_and_gated():
+    case = {c["id"]: c for c in TASKS}.get("code_with_delegation")
+    assert case is not None, "code_with_delegation case missing"
+    assert case["requires_env"] == ["EVAL_CODING_AGENT"]   # skips by default
+    assert case["expected_tools"] == ["code_with"]
+    assert case["kind"] == "ask"
+
+
+def test_board_counts_skipped_separately(capsys):
+    results = [
+        runner.CaseResult("a", "tool", "A", passed=True, detail="ok"),
+        runner.CaseResult("b", "tool", "B", passed=True, detail="skipped: requires_env X", skipped=True),
+    ]
+    runner._print_board(results)
+    out = capsys.readouterr().out
+    assert "1/1 passed (1 skipped)" in out
+    assert "SKIP" in out

From bada5c28a6fbcfd696d267399f6dbffb6366bec8 Mon Sep 17 00:00:00 2001
From: Josh Mabry <31560031+mabry1985@users.noreply.github.com>
Date: Sat, 6 Jun 2026 01:05:08 -0700
Subject: [PATCH 18/20] fix(mcp): a tool error degrades to a recoverable
 result, not a failed turn (#594) (#595)

MCP tools were appended in build_mcp_tools without handle_tool_error, so when
langchain-mcp-adapters raises ToolException on an isError result (e.g. a server
404 from a stale id), it propagates out of the ToolNode and fails the whole A2A
turn. A single recoverable tool error (stale arg, transient 4xx) shouldn't kill
an otherwise-fine turn.

Set handle_tool_error on each kept MCP tool so the exception is caught inside
BaseTool.arun and returned to the model as a tool result it can act on (retry,
skip, or adapt). Every fork that configures MCP servers inherits this.

Tests: ToolException degrades via handle_tool_error; build_mcp_tools wires the
handler onto kept tools; handler returns a non-fatal, actionable message.

Closes #594.

Co-authored-by: Josh (via Claude) <josh@protolabs.studio>
Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 tests/test_mcp_tool_error_handling.py | 76 +++++++++++++++++++++++++++
 tools/mcp_tools.py                    | 24 +++++++++
 2 files changed, 100 insertions(+)
 create mode 100644 tests/test_mcp_tool_error_handling.py

diff --git a/tests/test_mcp_tool_error_handling.py b/tests/test_mcp_tool_error_handling.py
new file mode 100644
index 0000000..f7a1408
--- /dev/null
+++ b/tests/test_mcp_tool_error_handling.py
@@ -0,0 +1,76 @@
+"""roxy #58 — an MCP tool error must degrade into a recoverable tool result,
+not fail the whole A2A turn.
+
+`langchain-mcp-adapters` raises `ToolException` when an MCP server returns an
+error (e.g. a board `404 Feature not found` from a stale id). `build_mcp_tools`
+sets `_mcp_tool_error_handler` as each MCP tool's `handle_tool_error`, so the
+exception is caught *inside* the tool (BaseTool.arun) and returned to the model —
+never propagating out of the ToolNode to fail the turn.
+"""
+
+from __future__ import annotations
+
+import pytest
+from langchain_core.tools import StructuredTool, ToolException
+
+from tools.mcp_tools import _mcp_tool_error_handler
+
+
+def _raising_tool() -> StructuredTool:
+    """A stand-in for an MCP tool that errors the way the adapter does."""
+
+    async def _boom(**kwargs) -> str:
+        raise ToolException('API error 404: {"success":false,"error":"Feature not found"}')
+
+    return StructuredTool.from_function(
+        coroutine=_boom, name="automaker__get_feature", description="stub"
+    )
+
+
+def test_handler_returns_recoverable_message():
+    msg = _mcp_tool_error_handler(ToolException("API error 404: Feature not found"))
+    assert "Tool error:" in msg
+    assert "Feature not found" in msg
+    assert "fatal" in msg.lower()  # explicitly tells the model not to give up
+
+
+@pytest.mark.asyncio
+async def test_tool_error_degrades_instead_of_raising():
+    tool = _raising_tool()
+    # Without the handler, ainvoke would raise ToolException (the turn-killing path).
+    with pytest.raises(ToolException):
+        await tool.ainvoke({})
+
+    # With the handler set (as build_mcp_tools does), the error becomes a result.
+    tool.handle_tool_error = _mcp_tool_error_handler
+    out = await tool.ainvoke({})
+    assert isinstance(out, str)
+    assert "Tool error:" in out and "Feature not found" in out
+
+
+def test_build_mcp_tools_wires_the_handler(monkeypatch):
+    """The kept MCP tools come back with handle_tool_error set."""
+    import tools.mcp_tools as mt
+
+    kept = _raising_tool()
+
+    class _FakeClient:
+        def get_tools(self):  # awaited via _run_blocking
+            async def _coro():
+                return [kept]
+            return _coro()
+
+    monkeypatch.setattr(mt, "MultiServerMCPClient", lambda *a, **k: _FakeClient(), raising=False)
+    # patch the imported symbol path used inside build_mcp_tools
+    import langchain_mcp_adapters.client as mcp_client
+    monkeypatch.setattr(mcp_client, "MultiServerMCPClient", lambda *a, **k: _FakeClient())
+
+    class _Cfg:
+        mcp_enabled = True
+        mcp_timeout_seconds = 5.0
+        mcp_denylist = []
+        mcp_servers = [{"name": "automaker", "command": "x", "args": []}]
+
+    _clients, tools_out, _meta = mt.build_mcp_tools(_Cfg())
+    assert tools_out, "expected the stub MCP tool to be kept"
+    assert tools_out[0].handle_tool_error is _mcp_tool_error_handler
diff --git a/tools/mcp_tools.py b/tools/mcp_tools.py
index 1cd5db8..df79827 100644
--- a/tools/mcp_tools.py
+++ b/tools/mcp_tools.py
@@ -20,6 +20,24 @@
 log = logging.getLogger("protoagent.mcp")
 
 
+def _mcp_tool_error_handler(exc: Exception) -> str:
+    """Turn an MCP tool failure into a recoverable tool result (roxy #58).
+
+    ``langchain-mcp-adapters`` raises ``ToolException`` when the server returns an
+    error (e.g. a board ``404 Feature not found`` from a stale id). Left unhandled
+    that propagates out of the ToolNode and fails the WHOLE A2A turn. Setting this
+    as each MCP tool's ``handle_tool_error`` makes the tool return this string to
+    the model instead — so a single recoverable tool error (stale arg, transient
+    4xx) degrades into something the model can adapt to, not a dead turn.
+    """
+    return (
+        f"Tool error: {exc}. The tool call failed — commonly a stale/invalid "
+        "argument (e.g. an id that no longer exists) or a transient error. Do NOT "
+        "treat this as fatal: adjust the arguments and retry, try a different "
+        "approach, or continue without this tool's result."
+    )
+
+
 def _server_connection(server: dict) -> dict | None:
     """Map a config ``mcp.servers[]`` entry to a langchain-mcp-adapters
     connection dict. Returns ``None`` for an entry missing its essential fields
@@ -226,6 +244,12 @@ def build_mcp_tools(config, *, plugin_servers=None) -> tuple[list, list, list[di
             if tool.name in core_names:
                 log.warning("[mcp] %s: %s collides with a core tool — skipped", name, tool.name)
                 continue
+            # roxy #58: a tool error (e.g. board 404) must degrade into a tool
+            # result the model can recover from, not fail the whole turn.
+            try:
+                tool.handle_tool_error = _mcp_tool_error_handler
+            except Exception:  # noqa: BLE001 — best-effort; never block tool registration
+                log.debug("[mcp] %s: could not set handle_tool_error on %s", name, tool.name)
             kept.append(tool)
 
         clients.append(client)

From da0740df0335759dba2f57e8038536316d7b87e0 Mon Sep 17 00:00:00 2001
From: Josh Mabry <31560031+mabry1985@users.noreply.github.com>
Date: Sat, 6 Jun 2026 01:05:12 -0700
Subject: [PATCH 19/20] =?UTF-8?q?feat(a2a):=20opt-in=20deploy-time=20guard?=
 =?UTF-8?q?=20=E2=80=94=20refuse=20to=20start=20on=20a=20loopback=20card?=
 =?UTF-8?q?=20URL=20(#598)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* feat(a2a): opt-in guard — refuse to start on a loopback card URL (https://github.com/protoLabsAI/protoAgent/issues/597)

A deployed agent that advertises http://127.0.0.1:.../a2a (e.g. A2A_PUBLIC_URL
unset after a redeploy) is silently unreachable to remote consumers — a config
regression no test catches, surfacing only at first cross-host dispatch.

Add a2a.require_routable_url (default false). When set, assert_routable_card_url()
runs at startup: if the resolved card URL host is loopback (127.0.0.1/localhost/
::1/0.0.0.0) it logs a clear error pointing at A2A_PUBLIC_URL and exits non-zero.
Off by default so local + desktop runs still advertise loopback correctly.

Tests cover each loopback form, off-by-default, and the routable-passes case.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

* fix(lint): drop unused `import types` in test_routable_card_url

The ruff gate (ruff==0.15.10) flagged F401 — `types` is imported but never
used. Removes it so the Lint check passes.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>

---------

Co-authored-by: Josh (via Claude) <josh@protolabs.studio>
Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 config/langgraph-config.example.yaml |  5 +++
 graph/config.py                      |  7 ++++
 server/__init__.py                   |  5 +++
 server/a2a.py                        | 36 +++++++++++++++++
 tests/test_routable_card_url.py      | 58 ++++++++++++++++++++++++++++
 5 files changed, 111 insertions(+)
 create mode 100644 tests/test_routable_card_url.py

diff --git a/config/langgraph-config.example.yaml b/config/langgraph-config.example.yaml
index ba16ac1..a0e7f06 100644
--- a/config/langgraph-config.example.yaml
+++ b/config/langgraph-config.example.yaml
@@ -91,6 +91,11 @@ skills:
 # free-text "chat" placeholder so a fresh clone stays callable. The card `name`
 # already follows identity.name / AGENT_NAME.
 a2a:
+  # Deploy-time guard: refuse to start if the card would advertise a loopback URL
+  # (e.g. A2A_PUBLIC_URL unset on a deployed agent → http://127.0.0.1:.../a2a,
+  # unreachable to remote consumers). Leave off for local/desktop runs (loopback
+  # is correct same-host); turn ON for any agent other agents dial over the network.
+  # require_routable_url: true
   # description: "Acme Bot — turns support tickets into triaged, drafted replies."
   # skills:
   #   - id: triage_ticket
diff --git a/graph/config.py b/graph/config.py
index 4d272cf..7c74512 100644
--- a/graph/config.py
+++ b/graph/config.py
@@ -357,6 +357,12 @@ class LangGraphConfig:
     # agent_name()).
     a2a_skills: list[dict] = field(default_factory=list)
     a2a_description: str = ""
+    # When true, refuse to start if the card would advertise a loopback URL
+    # (e.g. A2A_PUBLIC_URL unset on a deployed agent → http://127.0.0.1:.../a2a,
+    # silently unreachable to remote consumers). Off by default — local/desktop
+    # runs SHOULD advertise loopback (the client is same-host). Enforced by
+    # server/a2a.py::assert_routable_card_url() at startup.
+    a2a_require_routable_url: bool = False
 
     # Instance id for multi-instance data scoping (ADR 0004). When set, every
     # store nests under <base>/<id>/ so several instances can share one
@@ -552,6 +558,7 @@ def from_yaml(cls, path: str | Path) -> "LangGraphConfig":
             identity_operator=identity.get("operator", cls.identity_operator),
             a2a_skills=list(a2a.get("skills", []) or []),
             a2a_description=a2a.get("description", "") or "",
+            a2a_require_routable_url=bool(a2a.get("require_routable_url", False)),
             instance_id=data.get("instance", {}).get("id", "") or data.get("instance_id", cls.instance_id),
             auth_token=secret_auth_token or auth.get("token", cls.auth_token),
             autostart_on_boot=runtime.get("autostart_on_boot", cls.autostart_on_boot),
diff --git a/server/__init__.py b/server/__init__.py
index e45278f..4d7297e 100644
--- a/server/__init__.py
+++ b/server/__init__.py
@@ -224,6 +224,7 @@ def agent_name() -> str:
     _agent_skills,
     _bearer_configured,
     _build_agent_card_proto,
+    assert_routable_card_url,
     _package_version,
     _record_a2a_telemetry,
     structured_skill_schema,
@@ -615,6 +616,10 @@ async def _scheduler_shutdown() -> None:
     )
 
     a2a_card = _build_agent_card_proto()
+    # Deploy-time guard (opt-in): refuse to start if the card would advertise a
+    # loopback URL — a deployed agent that does so is silently unreachable to
+    # remote consumers. No-op unless a2a.require_routable_url is set.
+    assert_routable_card_url()
 
     # Durable SQLite-backed task + push-config stores (survive restart; 24h TTL
     # sweep on tasks). The push-config store rejects SSRF callback URLs at
diff --git a/server/a2a.py b/server/a2a.py
index 493a927..69e7b3a 100644
--- a/server/a2a.py
+++ b/server/a2a.py
@@ -154,6 +154,42 @@ def _a2a_card_url() -> str:
     return f"{base}/a2a"
 
 
+_LOOPBACK_HOSTS = {"127.0.0.1", "localhost", "::1", "0.0.0.0", ""}
+
+
+def assert_routable_card_url() -> None:
+    """Fail fast at startup if the card would advertise a loopback URL (opt-in).
+
+    A *deployed* agent that advertises ``http://127.0.0.1:.../a2a`` — e.g. because
+    ``A2A_PUBLIC_URL`` wasn't set after a redeploy — is silently unreachable to any
+    remote consumer that dials the card's interface URL. That's a deployment-config
+    regression no test catches; it surfaces only at first cross-host dispatch.
+
+    When ``a2a.require_routable_url`` is set, refuse to start (exit non-zero) rather
+    than discover it later. **Off by default** — local + desktop runs *should*
+    advertise loopback (the client is same-host, and the desktop port is dynamic).
+    Deployed agents opt in via config.
+    """
+    cfg = STATE.graph_config
+    if not (cfg and getattr(cfg, "a2a_require_routable_url", False)):
+        return
+    from urllib.parse import urlparse
+
+    url = _a2a_card_url()
+    host = (urlparse(url).hostname or "").lower()
+    if host in _LOOPBACK_HOSTS:
+        log.critical(
+            "[a2a] refusing to start: the agent card would advertise a loopback "
+            "URL %r (host=%r), unreachable to remote consumers. "
+            "a2a.require_routable_url is set — a deployed agent must advertise its "
+            "externally-reachable address. Set A2A_PUBLIC_URL to the host other "
+            "agents reach (e.g. http://roxy:7870).",
+            url, host or "<empty>",
+        )
+        raise SystemExit(1)
+    log.info("[a2a] card URL %s is routable (require_routable_url check passed)", url)
+
+
 # Template default card description — used when a fork sets no ``a2a.description``
 # in config (#570). Forks override via config, not by editing this file.
 _DEFAULT_CARD_DESCRIPTION = (
diff --git a/tests/test_routable_card_url.py b/tests/test_routable_card_url.py
new file mode 100644
index 0000000..c261a82
--- /dev/null
+++ b/tests/test_routable_card_url.py
@@ -0,0 +1,58 @@
+"""assert_routable_card_url — deploy-time guard against a loopback agent card.
+
+A deployed agent that advertises `http://127.0.0.1:.../a2a` (e.g. A2A_PUBLIC_URL
+unset after a redeploy) is silently unreachable to remote consumers. When
+`a2a.require_routable_url` is set, the agent refuses to start. Off by default so
+local/desktop runs still advertise loopback correctly.
+"""
+
+from __future__ import annotations
+
+import pytest
+
+import server.a2a as a2a
+from runtime.state import STATE
+
+
+class _Cfg:
+    def __init__(self, require: bool):
+        self.a2a_require_routable_url = require
+
+
+def _set(monkeypatch, *, require: bool, public_url: str | None, port: int = 7870):
+    monkeypatch.setattr(STATE, "graph_config", _Cfg(require), raising=False)
+    monkeypatch.setattr(STATE, "active_port", port, raising=False)
+    if public_url is None:
+        monkeypatch.delenv("A2A_PUBLIC_URL", raising=False)
+    else:
+        monkeypatch.setenv("A2A_PUBLIC_URL", public_url)
+
+
+def test_noop_when_flag_off(monkeypatch):
+    # Loopback URL, but the guard is off → must NOT exit (local/desktop default).
+    _set(monkeypatch, require=False, public_url=None)
+    a2a.assert_routable_card_url()  # no raise
+
+
+def test_exits_on_loopback_when_required(monkeypatch):
+    _set(monkeypatch, require=True, public_url=None)  # falls back to 127.0.0.1
+    with pytest.raises(SystemExit) as ei:
+        a2a.assert_routable_card_url()
+    assert ei.value.code == 1
+
+
+@pytest.mark.parametrize("bad", [
+    "http://127.0.0.1:7870",
+    "http://localhost:7870",
+    "http://[::1]:7870",
+    "http://0.0.0.0:7870",
+])
+def test_exits_on_each_loopback_form(monkeypatch, bad):
+    _set(monkeypatch, require=True, public_url=bad)
+    with pytest.raises(SystemExit):
+        a2a.assert_routable_card_url()
+
+
+def test_passes_on_routable_host(monkeypatch):
+    _set(monkeypatch, require=True, public_url="http://roxy:7870")
+    a2a.assert_routable_card_url()  # routable → no raise

From 4afd97d4d83ce61603a4d7225b5e43a0aea88e9e Mon Sep 17 00:00:00 2001
From: Josh Mabry <31560031+mabry1985@users.noreply.github.com>
Date: Sat, 6 Jun 2026 01:15:44 -0700
Subject: [PATCH 20/20] chore: release v0.16.0 (#601)

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
---
 CHANGELOG.md   | 2 ++
 pyproject.toml | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 39318df..24bc9fa 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,6 +11,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.16.0] - 2026-06-06
+
 ### Added
 - **Eval-case gating (`requires_env`)** — an eval case can now declare
   `requires_env: [VAR, …]`; when any is unset the case is **skipped** (shown
diff --git a/pyproject.toml b/pyproject.toml
index fc6b748..9e495f9 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "protoagent"
-version = "0.15.1"
+version = "0.16.0"
 description = "protoAgent — LangGraph + A2A template for spawning protoLabs agents"
 requires-python = ">=3.11"