Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 28 additions & 41 deletions src/whygraph/cli/commands/scan.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from rich.table import Table
from rich.text import Text

from whygraph.scan import Crawler, GitCrawler, GitHubCrawler
from whygraph.scan import CodeGraphCrawler, Crawler, GitCrawler, GitHubCrawler

from ..console import console

Expand Down Expand Up @@ -44,12 +44,12 @@
"refresh_codegraph",
default=True,
help=(
"Refresh the CodeGraph index before crawling — `codegraph sync` when "
"an index exists, `codegraph init -i` on first run. Uses the local "
"`codegraph` binary if present, else runs it inside the WhyGraph "
"Docker image. The crawl itself doesn't need CodeGraph (only the MCP "
"rationale/evidence tools do), so a failure here warns rather than "
"aborting. Default: on."
"Refresh the CodeGraph index concurrently with the crawl — "
"`codegraph sync` when an index exists, `codegraph init -i` on first "
"run. Uses the local `codegraph` binary if present, else runs it "
"inside the WhyGraph Docker image. The crawl itself doesn't need "
"CodeGraph (only the MCP rationale/evidence tools do), so a failure "
"here warns rather than aborting. Default: on."
),
)
@click.option(
Expand Down Expand Up @@ -122,13 +122,19 @@ def scan_cmd(
remote_enabled=remote,
)

# CodeGraph refresh runs before the crawl and outside the Progress
# live-display (it streams its own output on first index). It's
# best-effort: the crawl doesn't depend on it.
if refresh_codegraph:
_refresh_codegraph(repository.root, image=codegraph_image)

with Progress() as progress:
# CodeGraph refresh — runs concurrently as its own crawler. It
# writes .codegraph/ and has no data dependency on the WhyGraph DB,
# so it overlaps the entire crawl (started with phase 1, joined
# last). Best-effort: CodeGraph bootstrap failures land on .warning;
# any other exception still surfaces through .error like other crawlers.
codegraph_crawler = (
CodeGraphCrawler(
progress, project_root=repository.root, image=codegraph_image
)
if refresh_codegraph
else None
)

# Phase 1 — source crawlers, run concurrently.
phase1: list[Crawler] = [GitCrawler(progress, repository=repository)]
if github_client is not None:
Expand All @@ -148,6 +154,8 @@ def scan_cmd(
)
)

if codegraph_crawler is not None:
codegraph_crawler.start()
for c in phase1:
c.start()
for c in phase1:
Expand All @@ -156,8 +164,15 @@ def scan_cmd(
c.start()
for c in phase2:
c.join()
if codegraph_crawler is not None:
codegraph_crawler.join()

if codegraph_crawler is not None and codegraph_crawler.warning is not None:
console.print(Text(codegraph_crawler.warning, style="yellow"))

crawlers = phase1 + phase2
if codegraph_crawler is not None:
crawlers.append(codegraph_crawler)
failed = [c for c in crawlers if c.error is not None]
for c in failed:
click.echo(f"crawler {c.name!r} failed: {c.error}", err=True)
Expand Down Expand Up @@ -196,34 +211,6 @@ def _apply_github_token(config: "Config") -> None:
os.environ["GH_TOKEN"] = token


def _refresh_codegraph(project_root: Path, *, image: str | None) -> None:
    """Refresh the CodeGraph index ahead of the crawl, tolerating failure.

    The refresh is best-effort. The git / GitHub / analyze crawl does not
    read the CodeGraph index (only the MCP ``whygraph_rationale_brief``
    and ``whygraph_evidence_for`` tools do), so a
    ``CodeGraphBootstrapError`` is printed as a yellow warning and the
    function returns normally instead of aborting the scan.

    Parameters
    ----------
    project_root : Path
        Repository root whose ``.codegraph/`` index is refreshed.
    image : str or None
        Docker image override for the fallback path; ``None`` uses the
        pinned default. Ignored when a local ``codegraph`` binary is found.
    """
    # Imported at call time, inside the function, rather than at module
    # import time.
    from whygraph.services.codegraph import (
        CodeGraphBootstrapError,
        refresh_codegraph_index,
    )

    console.print("Refreshing CodeGraph index…")
    try:
        refresh_codegraph_index(project_root, image=image)
    except CodeGraphBootstrapError as exc:
        # Downgrade the failure to a visible notice; the caller carries on.
        notice = Text(f"CodeGraph refresh skipped — {exc}", style="yellow")
        console.print(notice)


def _select_github_client(
provider: str, repository: "Repository"
) -> "GitHubClient | None":
Expand Down
17 changes: 13 additions & 4 deletions src/whygraph/scan/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,24 @@

Exposes :class:`Crawler` (the threaded base class) and the concrete
crawlers — :class:`GitCrawler` for local git history,
:class:`GitHubCrawler` for GitHub pull requests and issues, and
:class:`GitHubCrawler` for GitHub pull requests and issues,
:class:`CodeGraphCrawler` which refreshes the CodeGraph index, and
:class:`AnalyzeCrawler` which describes each commit's diff with an LLM
(run after :class:`GitCrawler`). The CLI runs the source crawlers
concurrently, then the analyzer, against the shared SQLite database.
(run after :class:`GitCrawler`). The CLI runs the source crawlers (and
CodeGraph) concurrently, then the analyzer, against the shared SQLite
database.
"""

from whygraph.scan.analyze_crawler import AnalyzeCrawler
from whygraph.scan.codegraph_crawler import CodeGraphCrawler
from whygraph.scan.crawler import Crawler
from whygraph.scan.git_crawler import GitCrawler
from whygraph.scan.github_crawler import GitHubCrawler

__all__ = ["AnalyzeCrawler", "Crawler", "GitCrawler", "GitHubCrawler"]
__all__ = [
"AnalyzeCrawler",
"CodeGraphCrawler",
"Crawler",
"GitCrawler",
"GitHubCrawler",
]
84 changes: 84 additions & 0 deletions src/whygraph/scan/codegraph_crawler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
"""CodeGraphCrawler — refresh the CodeGraph index alongside the crawl.

Runs ``codegraph init -i`` (first index) or ``codegraph sync -q``
(incremental) as one more :class:`Crawler` thread, so the index builds
concurrently with the git / GitHub / analyze crawlers instead of blocking
before them. CodeGraph writes ``.codegraph/`` and has no data dependency
on the WhyGraph DB, so it can safely overlap the entire scan.

The refresh is **best-effort**: only the MCP rationale / evidence tools
read the index, not the crawl. A :class:`CodeGraphBootstrapError` (tool
missing, non-zero exit) is therefore swallowed into :attr:`warning`
rather than failing the scan; any other exception propagates into the
base class's :attr:`Crawler.error` and surfaces as a real failure.

Subprocess output is captured (``capture=True``) rather than streamed so
it cannot corrupt the shared :class:`rich.progress.Progress` display; the
captured tail is folded into :attr:`warning` on failure.
"""

from __future__ import annotations

from pathlib import Path

from rich.progress import Progress

from whygraph.services.codegraph import (
CodeGraphBootstrapError,
refresh_codegraph_index,
)
from whygraph.services.codegraph.paths import CODEGRAPH_DB_RELPATH

from .crawler import Crawler


class CodeGraphCrawler(Crawler):
    """Crawler that refreshes the CodeGraph index while the scan runs.

    The index lives at ``<project_root>/.codegraph/codegraph.db``.
    CodeGraph reports no granular progress, so the task is registered with
    ``total=None`` (an indeterminate, pulsing bar) and is only given a
    concrete total — and completed — once the refresh has succeeded.

    The refresh is best-effort: a :class:`CodeGraphBootstrapError` is
    stored on :attr:`warning` instead of being raised, so it does not end
    up on :attr:`Crawler.error`. Any other exception is not caught here.

    Parameters
    ----------
    progress : rich.progress.Progress
        Shared Progress instance owned by the orchestrator.
    project_root : Path
        Repository root whose ``.codegraph/`` index is refreshed.
    image : str or None
        Docker image override for the CodeGraph fallback path; ``None``
        uses the pinned default. Ignored when a local ``codegraph`` binary
        is found.

    Attributes
    ----------
    warning : str or None
        Message describing a swallowed :class:`CodeGraphBootstrapError`,
        for the orchestrator to surface after the crawl. ``None`` on
        success.
    """

    def __init__(
        self, progress: Progress, *, project_root: Path, image: str | None
    ) -> None:
        # total=None: no known step count until the refresh has finished.
        super().__init__("codegraph", progress, total=None)
        self.warning: str | None = None
        self._image = image
        self._project_root = project_root

    def work(self) -> None:
        """Label the task, run the refresh, and complete the bar on success."""
        # Pick the label from whether an index database already exists.
        index_db = self._project_root / CODEGRAPH_DB_RELPATH
        if index_db.exists():
            action = "sync"
        else:
            action = "init -i"
        self.advance(0, description=f"codegraph {action}")

        try:
            # capture=True keeps subprocess output off the terminal so it
            # cannot corrupt the shared progress display.
            refresh_codegraph_index(
                self._project_root, image=self._image, capture=True
            )
        except CodeGraphBootstrapError as exc:
            # Best-effort contract: remember the failure, do not raise.
            self.warning = f"CodeGraph refresh skipped — {exc}"
        else:
            # No granular progress is available, so give the pulsing bar
            # a one-step total and finish it in a single advance.
            self.set_total(1)
            self.advance(1)
31 changes: 27 additions & 4 deletions src/whygraph/services/codegraph/bootstrap.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ def ensure_codegraph_db(
project_root: Path,
*,
image: str | None = None,
capture: bool = False,
) -> Path:
"""Idempotently materialize ``<project_root>/.codegraph/codegraph.db``.

Expand All @@ -61,6 +62,11 @@ def ensure_codegraph_db(
image : str, optional
Docker image tag for the fallback path. Defaults to
:data:`DEFAULT_CODEGRAPH_IMAGE`.
capture : bool, optional
When ``True``, capture the subprocess output instead of letting it
stream to the terminal (so it can't corrupt a concurrent progress
display). The captured tail is folded into the error message on
failure. Default ``False`` (stream live).

Returns
-------
Expand All @@ -78,7 +84,7 @@ def ensure_codegraph_db(
if db_path.exists():
return db_path

_run_codegraph(project_root, ["init", "-i"], image=image)
_run_codegraph(project_root, ["init", "-i"], image=image, capture=capture)

if not db_path.exists():
raise CodeGraphBootstrapError(
Expand All @@ -92,6 +98,7 @@ def refresh_codegraph_index(
project_root: Path,
*,
image: str | None = None,
capture: bool = False,
) -> Path:
"""Bring ``<project_root>/.codegraph/codegraph.db`` up to date.

Expand All @@ -107,6 +114,11 @@ def refresh_codegraph_index(
image : str, optional
Docker image tag for the fallback path. Defaults to
:data:`DEFAULT_CODEGRAPH_IMAGE`.
capture : bool, optional
When ``True``, capture the subprocess output instead of streaming
it (see :func:`ensure_codegraph_db`). ``whygraph scan`` passes this
so the refresh can run concurrently under a live progress display.
Default ``False``.

Returns
-------
Expand All @@ -122,9 +134,9 @@ def refresh_codegraph_index(
project_root = project_root.resolve()
db_path = project_root / CODEGRAPH_DB_RELPATH
if not db_path.exists():
return ensure_codegraph_db(project_root, image=image)
return ensure_codegraph_db(project_root, image=image, capture=capture)

_run_codegraph(project_root, ["sync", "-q"], image=image)
_run_codegraph(project_root, ["sync", "-q"], image=image, capture=capture)
return db_path


Expand All @@ -133,6 +145,7 @@ def _run_codegraph(
args: list[str],
*,
image: str | None,
capture: bool = False,
) -> None:
"""Run a ``codegraph`` subcommand against ``project_root``.

Expand All @@ -150,6 +163,10 @@ def _run_codegraph(
image : str or None
Docker image tag for the fallback path; ``None`` uses
:data:`DEFAULT_CODEGRAPH_IMAGE`.
capture : bool, optional
When ``True``, capture stdout/stderr rather than streaming them to
the terminal, and fold the captured tail into the error message on
failure. Default ``False`` (stream live).

Raises
------
Expand Down Expand Up @@ -187,8 +204,14 @@ def _run_codegraph(
)

try:
subprocess.run(cmd, check=True, cwd=cwd)
subprocess.run(cmd, check=True, cwd=cwd, capture_output=capture, text=capture)
except subprocess.CalledProcessError as exc:
if capture:
tail = (exc.stderr or exc.stdout or "").strip()
detail = f"\n{tail}" if tail else ""
raise CodeGraphBootstrapError(
f"`codegraph {label}` failed (exit {exc.returncode}){detail}"
) from exc
raise CodeGraphBootstrapError(
f"`codegraph {label}` failed (exit {exc.returncode}) — see output above"
) from exc
Expand Down
Loading
Loading