Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
380 changes: 294 additions & 86 deletions .claude/skills/uts-to-kotlin/SKILL.md

Large diffs are not rendered by default.

628 changes: 628 additions & 0 deletions .claude/skills/uts-to-kotlin/references/objects-mapping.md

Large diffs are not rendered by default.

242 changes: 242 additions & 0 deletions .claude/skills/uts-to-kotlin/scripts/audit_translation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,242 @@
#!/usr/bin/env python3
"""
audit_translation.py β€” deterministic faithfulness audit of a UTS spec against its
generated Kotlin test, so the per-spec review (SKILL.md Step 7) reconciles a concrete
extracted ledger instead of eyeballing two files.

Usage:
python3 audit_translation.py <spec-file.md> <GeneratedTest.kt>

It does ZERO semantic judgement β€” only mechanical extraction with regex, so the same
inputs always give the same report:

1. Test-ID coverage. Every `**Test ID**: \`<id>\`` in the spec vs every `@UTS <id>`
in the Kotlin file.
- missingInKotlin: a spec Test ID with no matching @UTS tag β†’ a whole test
case is absent; implement it (or consciously exclude it and explain why).
- orphanInKotlin: an @UTS tag with no matching spec Test ID β†’ a stale or
hand-edited tag. Investigate.

2. Per-test line ledger. Within each spec test block (from one Test ID to the next),
every non-blank, non-comment code line inside the ```pseudo fences is extracted
verbatim, grouped by its section (Setup / Test Steps / Assertions / …) β€” so setup,
operations AND assertions are all enumerated, nothing escapes the ledger. Each line
is tagged: "assert" (ASSERT*), "await" (AWAIT* / EXPECT), or "step" (everything else
β€” setup, mock construction, operations). For convenience the ASSERT* and AWAIT*
lines are also surfaced flat as specAsserts / specAwaits.
Alongside them, the count of assertion / await / poll calls in the matching Kotlin
method is reported as a tripwire: Kotlin assertions < spec ASSERTs for a test is a
strong signal an assertion was silently dropped.

Robustness contract: this tool must never crash mid-run on any spec/test pair. It is a
review aid β€” if it can't extract something it degrades to "couldn't verify" (fewer lines
in the ledger) rather than throwing. Whatever happens it emits ONE parseable JSON object,
never a traceback, so a caller can always rely on the output shape.

Exit status:
0 β€” clean audit (no missing/orphan Test IDs)
2 β€” audit ran, but there are missing/orphan Test IDs (gateable)
64 β€” could not run: bad usage, unreadable file, or an internal error (JSON carries `error`)
"""

import json
import re
import sys
from pathlib import Path

# Spec markers -------------------------------------------------------------
TEST_ID_RE = re.compile(r"\*\*Test ID\*\*:\s*`([^`]+)`")
HEADING_RE = re.compile(r"^#{1,4}\s+(.*\S)\s*$")
FENCE_RE = re.compile(r"^\s*```")
# Imperative spec keywords. Order matters: longest / most specific first so AWAIT_STATE
# is classified before the bare AWAIT.
DIRECTIVE_RE = re.compile(r"\b(ASSERT_[A-Z_]+|ASSERT|AWAIT_STATE|AWAIT_ERROR|AWAIT_ALL|AWAIT|EXPECT)\b")

# Kotlin markers -----------------------------------------------------------
UTS_TAG_RE = re.compile(r"@UTS\s+(\S+)")
KOTLIN_ASSERT_RE = re.compile(
r"\b(assertEquals|assertNotEquals|assertNull|assertNotNull|assertTrue|assertFalse|"
r"assertIs|assertIsNot|assertContains|assertFailsWith|assertFails|assertSame|"
r"assertNotSame|awaitState|awaitChannelState|pollUntil)\b"
)


def fail(code, message, status=64, **extra):
"""Emit a structured error (same JSON shape as resolve_uts.py) and exit. `status`
defaults to 64 β€” "couldn't run" β€” kept distinct from the audit's own 0 (clean) and
2 (missing/orphan IDs) outcomes so callers can tell the two apart."""
print(json.dumps({"ok": False, "error": code, "message": message, **extra}, indent=2))
sys.exit(status)


def read_lines(path):
"""Read a file into lines, tolerant of encoding issues β€” a stray non-UTF-8 byte in a
spec must never crash the audit, so undecodable bytes are replaced, not raised on."""
return Path(path).read_text(encoding="utf-8", errors="replace").splitlines()


def classify(keyword):
if keyword.startswith("ASSERT"):
return "assert"
if keyword.startswith("AWAIT") or keyword == "EXPECT":
return "await"
return "other"


def line_kind(stripped):
"""Tag a pseudocode line: assert / await / step."""
m = DIRECTIVE_RE.search(stripped)
if m:
k = classify(m.group(1))
if k in ("assert", "await"):
return k
return "step"


def parse_spec(path):
"""Return an ordered list of test-id dicts, each:
{testId, title, sections:[{heading, lines:[{text, kind}]}], asserts[], awaits[], codeLineTotal}."""
lines = read_lines(path)

# locate each Test ID line and the nearest preceding heading (its human title)
tests = []
last_heading = ""
for i, line in enumerate(lines):
h = HEADING_RE.match(line)
if h:
last_heading = h.group(1)
m = TEST_ID_RE.search(line)
if m:
tests.append({"testId": m.group(1), "title": last_heading, "_start": i})

# block boundaries: each test runs to the next test's start (or EOF)
for idx, t in enumerate(tests):
start = t["_start"]
end = tests[idx + 1]["_start"] if idx + 1 < len(tests) else len(lines)

sections = [] # [{heading, lines:[{text, kind}]}]
cur = None # current section being filled
in_fence = False
for line in lines[start:end]:
h = HEADING_RE.match(line)
if h and not in_fence:
cur = {"heading": h.group(1), "lines": []}
sections.append(cur)
continue
if FENCE_RE.match(line):
in_fence = not in_fence
continue
if not in_fence:
continue
stripped = line.strip()
if not stripped or stripped.startswith("#"): # blank / pseudocode comment
continue
if cur is None: # code before any heading in the block (rare)
cur = {"heading": "(preamble)", "lines": []}
sections.append(cur)
cur["lines"].append({"text": stripped, "kind": line_kind(stripped)})

# drop sections with no code (prose-only headings, requirement tables, the title line)
sections = [s for s in sections if s["lines"]]
asserts = [ln["text"] for s in sections for ln in s["lines"] if ln["kind"] == "assert"]
awaits = [ln["text"] for s in sections for ln in s["lines"] if ln["kind"] == "await"]
t["sections"] = sections
t["asserts"] = asserts
t["awaits"] = awaits
t["codeLineTotal"] = sum(len(s["lines"]) for s in sections)
del t["_start"]
return tests


def parse_kotlin(path):
"""Return dict: uts-id -> {assertionCalls[], assertionCount}. Method block for a tag
runs from its @UTS line to the next @UTS line (or EOF)."""
lines = read_lines(path)
tags = [(i, m.group(1)) for i, line in enumerate(lines) for m in [UTS_TAG_RE.search(line)] if m]
out = {}
for idx, (start, tag) in enumerate(tags):
end = tags[idx + 1][0] if idx + 1 < len(tags) else len(lines)
calls = []
for line in lines[start:end]:
for m in KOTLIN_ASSERT_RE.finditer(line):
calls.append(m.group(1))
out[tag] = {"assertionCalls": calls, "assertionCount": len(calls)}
return out


def main():
if len(sys.argv) != 3:
fail("USAGE", "Usage: audit_translation.py <spec-file.md> <GeneratedTest.kt>")

spec_path, kt_path = sys.argv[1], sys.argv[2]
for p in (spec_path, kt_path):
if not Path(p).is_file():
fail("FILE_NOT_FOUND", f"{p!r} not found.")

# Robustness backstop: never let an unexpected parsing/IO error surface as a traceback.
# fail() emits structured JSON and exits 64 (distinct from the 0/2 audit outcomes) so
# callers and the model can tell "couldn't run" apart from "ran, found gaps".
try:
spec_tests = parse_spec(spec_path)
kotlin = parse_kotlin(kt_path)
report = build_report(spec_path, kt_path, spec_tests, kotlin)
except Exception as exc: # noqa: BLE001 β€” deliberate catch-all; this tool must not crash
fail("INTERNAL_ERROR", f"{type(exc).__name__}: {exc}", spec=spec_path, kotlin=kt_path)

print(json.dumps(report, indent=2))
cov = report["idCoverage"]
sys.exit(2 if (cov["missingInKotlin"] or cov["orphanInKotlin"]) else 0)


def build_report(spec_path, kt_path, spec_tests, kotlin):
spec_ids = [t["testId"] for t in spec_tests]
kt_ids = list(kotlin.keys())
spec_id_set, kt_id_set = set(spec_ids), set(kt_ids)
missing = [i for i in spec_ids if i not in kt_id_set]
orphan = [i for i in kt_ids if i not in spec_id_set]

per_test = []
for t in spec_tests:
kt = kotlin.get(t["testId"])
per_test.append({
"testId": t["testId"],
"title": t["title"],
# every code line of the spec test, grouped by section (setup / steps / assertions),
# each tagged assert | await | step β€” the full "did I translate every line?" ledger
"sections": t["sections"],
"specCodeLineTotal": t["codeLineTotal"],
# flat convenience views of the observable lines
"specAsserts": t["asserts"],
"specAwaits": t["awaits"],
"specAssertCount": len(t["asserts"]),
"specAwaitCount": len(t["awaits"]),
"kotlinPresent": kt is not None,
"kotlinAssertionCalls": kt["assertionCalls"] if kt else [],
"kotlinAssertionCount": kt["assertionCount"] if kt else 0,
# tripwire: fewer Kotlin assertions than spec ASSERTs => likely a dropped assertion
"assertionShortfall": (len(t["asserts"]) - kt["assertionCount"]) if kt else len(t["asserts"]),
})

report = {
"ok": True,
"spec": spec_path,
"kotlin": kt_path,
"idCoverage": {
"specCount": len(spec_ids),
"kotlinCount": len(kt_ids),
"missingInKotlin": missing,
"orphanInKotlin": orphan,
},
"perTest": per_test,
"summary": {
"specAssertTotal": sum(len(t["asserts"]) for t in spec_tests),
"specAwaitTotal": sum(len(t["awaits"]) for t in spec_tests),
"kotlinAssertionTotal": sum(k["assertionCount"] for k in kotlin.values()),
"testsWithShortfall": [p["testId"] for p in per_test if p["assertionShortfall"] > 0],
},
}
return report


if __name__ == "__main__":
main()
Loading
Loading