Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
166 changes: 166 additions & 0 deletions packages/scoring/src/difficulty.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
import { describe, expect, it } from "vitest";
import {
DEFAULT_DIFFICULTY_HEURISTIC,
difficultySignal,
episodeSuccessScore,
nextDifficulty,
updateSkillScore,
type EpisodeSignalInput,
} from "./difficulty.js";
import type { ConceptSkill } from "./policies/types.js";

// Episode fixture: a clean first-try pass solved in half the expected time.
// Any field supplied in `overrides` replaces the corresponding baseline value.
function ep(overrides: Partial<EpisodeSignalInput> = {}): EpisodeSignalInput {
  const cleanSolve: EpisodeSignalInput = {
    passed: true,
    reveal_clicked: false,
    hints_used: 0,
    submit_count: 1,
    time_to_solve_ms: 60_000,
    expected_time_ms: 120_000,
  };
  return { ...cleanSolve, ...overrides };
}

// Skill-record fixture: a mid-level "list-comp" record with five prior attempts.
// Any field supplied in `overrides` replaces the corresponding baseline value.
function skill(overrides: Partial<ConceptSkill> = {}): ConceptSkill {
  const baseline: ConceptSkill = {
    concept_id: "list-comp",
    skill: 0.5,
    confidence: 0.3,
    attempts: 5,
  };
  return { ...baseline, ...overrides };
}

// Signal-level checks: the clean-solve bonus and the saturation of each struggle axis.
describe("difficultySignal", () => {
  it("returns +correctness_bonus for a perfect under-time solve", () => {
    const underTimePass = ep({ time_to_solve_ms: 60_000 });
    expect(difficultySignal(underTimePass)).toBeCloseTo(
      DEFAULT_DIFFICULTY_HEURISTIC.correctness_bonus,
      6,
    );
  });

  it("clamps overtime contribution at the configured cap (no runaway negative signal)", () => {
    // Failed episode, so no correctness bonus; overtime saturates at 1 and contributes exactly
    // weight_overtime (-0.5). Hints and retries are both zero.
    const runawayOvertime = ep({ time_to_solve_ms: 1_000_000_000, passed: false });
    expect(difficultySignal(runawayOvertime)).toBe(-0.5);
  });

  it("clamps hints contribution at the configured cap", () => {
    // Hint usage saturates at 1 and contributes exactly weight_hint_usage (-0.3); the episode is
    // under time, first submission, and failed, so nothing else contributes.
    const hintFlood = ep({ hints_used: 999, time_to_solve_ms: 60_000, passed: false });
    expect(difficultySignal(hintFlood)).toBe(-0.3);
  });
});

// Tier-step checks. The signal values in the comments are worked out with the default config.
describe("nextDifficulty", () => {
  it("perfect solve: easy → medium (step up)", () => {
    // Signal equals correctness_bonus (0.3), which meets the default step_up_threshold (0.3).
    const fastCleanSolve = ep({ time_to_solve_ms: 30_000 });
    expect(nextDifficulty("easy", fastCleanSolve)).toBe("medium");
  });

  it("hint-heavy solve: same difficulty (no step in either direction)", () => {
    // 0.3 bonus - 0.3 * (2/3) hints - 0.2 * (1/4) retries = 0.05, between the two thresholds.
    const hintHeavy = ep({ hints_used: 2, submit_count: 2, time_to_solve_ms: 100_000 });
    expect(nextDifficulty("medium", hintHeavy)).toBe("medium");
  });

  it("repeated failures (heavy struggle, didn't pass): hard → medium (step down)", () => {
    const heavyStruggle = ep({
      passed: false,
      submit_count: 4,
      hints_used: 2,
      time_to_solve_ms: 200_000,
    });
    expect(nextDifficulty("hard", heavyStruggle)).toBe("medium");
  });

  it("massive overtime (~3× expected) on a passed solve still steps down", () => {
    // 0.3 bonus - 0.5 overtime (saturated) - 0.1 hints - 0.05 retries = -0.35.
    const slowPass = ep({ time_to_solve_ms: 360_000, hints_used: 1, submit_count: 2 });
    expect(nextDifficulty("medium", slowPass)).toBe("easy");
  });

  it("under-time clean solve at expert stays expert (cap at top of ladder)", () => {
    const fastCleanSolve = ep({ time_to_solve_ms: 30_000 });
    expect(nextDifficulty("expert", fastCleanSolve)).toBe("expert");
  });

  it("no-progress (failed, max hints, max retries, way overtime) at easy stays easy (cap at bottom)", () => {
    const noProgress = ep({
      passed: false,
      hints_used: 3,
      submit_count: 5,
      time_to_solve_ms: 240_000,
      reveal_clicked: true,
    });
    expect(nextDifficulty("easy", noProgress)).toBe("easy");
  });

  it("respects an operator-injected stricter step_up_threshold (no step up on a perfect solve)", () => {
    const stricterConfig = { ...DEFAULT_DIFFICULTY_HEURISTIC, step_up_threshold: 0.5 };
    const fastCleanSolve = ep({ time_to_solve_ms: 30_000 });
    expect(nextDifficulty("medium", fastCleanSolve, stricterConfig)).toBe("medium");
  });
});

// Per-episode success score: 0 for a failed or revealed episode, otherwise 1 minus penalties.
describe("episodeSuccessScore", () => {
  it("clean solve = 1", () => {
    const cleanSolve = ep();
    expect(episodeSuccessScore(cleanSolve)).toBe(1);
  });

  it("revealed solution = 0 even if 'passed' is true", () => {
    const revealed = ep({ reveal_clicked: true });
    expect(episodeSuccessScore(revealed)).toBe(0);
  });

  it("failed = 0", () => {
    const failed = ep({ passed: false });
    expect(episodeSuccessScore(failed)).toBe(0);
  });

  it("hints + retries shave the score down (still positive)", () => {
    // One hint costs 0.15 and one retry costs 0.10: 1 - 0.15 - 0.10 = 0.75.
    const assisted = ep({ hints_used: 1, submit_count: 2 });
    expect(episodeSuccessScore(assisted)).toBeCloseTo(0.75, 6);
  });

  it("excessive hints/retries floor at 0 (never negative)", () => {
    const swamped = ep({ hints_used: 99, submit_count: 99 });
    expect(episodeSuccessScore(swamped)).toBe(0);
  });
});

// Skill-record update: EWMA on skill, asymptotic growth on confidence, attempts incremented.
describe("updateSkillScore", () => {
  it("EWMA pulls skill toward 1 on a clean solve", () => {
    const before = skill({ skill: 0.5 });
    const after = updateSkillScore(before, ep());
    // alpha * score + (1 - alpha) * previous = 0.4 * 1 + 0.6 * 0.5 = 0.7
    expect(after.skill).toBeCloseTo(0.7, 6);
    expect(after.attempts).toBe(6);
  });

  it("EWMA pulls skill toward 0 on a failed solve", () => {
    const before = skill({ skill: 0.5 });
    const after = updateSkillScore(before, ep({ passed: false }));
    // alpha * score + (1 - alpha) * previous = 0.4 * 0 + 0.6 * 0.5 = 0.3
    expect(after.skill).toBeCloseTo(0.3, 6);
  });

  it("confidence grows asymptotically toward confidence_max", () => {
    let record = skill({ confidence: 0 });
    let remaining = 100;
    while (remaining > 0) {
      record = updateSkillScore(record, ep());
      remaining -= 1;
    }
    expect(record.confidence).toBeGreaterThan(0.94);
    expect(record.confidence).toBeLessThanOrEqual(0.95);
  });

  it("clamps skill into [0, 1] (defensive — formula already keeps it bounded)", () => {
    const { skill: updated } = updateSkillScore(skill({ skill: 1 }), ep());
    expect(updated).toBeLessThanOrEqual(1);
    expect(updated).toBeGreaterThanOrEqual(0);
  });

  it("preserves the concept_id of the previous record", () => {
    const before = skill({ concept_id: "dict-comp" });
    expect(updateSkillScore(before, ep()).concept_id).toBe("dict-comp");
  });
});
133 changes: 133 additions & 0 deletions packages/scoring/src/difficulty.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
import { z } from "zod";
import { type ConceptSkill, type DifficultyTier } from "./policies/types.js";

// Heuristic per-episode difficulty signal: a single number `s` in [-1, +1] derived from how the
// learner just struggled (or didn't). The policy layer (see policies/difficulty-policy.ts) keeps
// looking at multi-episode EWMAs to pick a tier from a problem catalog; these helpers are the
// finer-grained per-episode step decision that callers can use directly between catalog lookups.
//
// Why heuristic, not learned: explicitly v1 territory. A learned model only earns its keep once we
// have enough episodes to fit one. Heuristics are interpretable, debuggable, and good enough until
// proven otherwise. See STORY-018 for the rationale.

// The difficulty ladder, listed from "easy" to "expert". `stepTier` moves along it by array
// index and clamps at both ends, so the element order here defines what "harder" means.
export const TIER_ORDER: readonly DifficultyTier[] = ["easy", "medium", "hard", "expert"];

// Zod schema for every tunable coefficient of the heuristic. Each field declares a default, so
// parsing an empty object yields the stock configuration.
export const DifficultyHeuristicConfigSchema = z.object({
// Signal weights — overtime/hints/failures pull `s` negative (struggle); correctness pushes positive.
weight_overtime: z.number().default(-0.5),
weight_hint_usage: z.number().default(-0.3),
weight_failed_attempts: z.number().default(-0.2),
correctness_bonus: z.number().default(0.3),
// Step thresholds, both inclusive (this is how `nextDifficulty` compares them):
// `s >= step_up_threshold` → harder; `s <= step_down_threshold` → easier; otherwise same.
step_up_threshold: z.number().default(0.3),
step_down_threshold: z.number().default(-0.3),
// Normalization caps — values at or above these clamp to 1 (max struggle on that axis).
// NOTE(review): `.min(1)` admits an `overtime_cap_ratio` of exactly 1, for which
// `overtime_cap_ratio - 1` is 0; any consumer dividing by that span must guard against it.
overtime_cap_ratio: z.number().min(1).default(2.0),
hints_cap: z.number().int().min(1).default(3),
failed_attempts_cap: z.number().int().min(1).default(4),
// EWMA + Bayesian-flavored confidence growth for the per-concept skill score.
// `ewma_alpha` is the weight given to the newest episode; `confidence_growth` is the fraction of
// the remaining gap to `confidence_max` closed on each update.
ewma_alpha: z.number().min(0).max(1).default(0.4),
confidence_growth: z.number().min(0).max(1).default(0.1),
confidence_max: z.number().min(0).max(1).default(0.95),
// Amount subtracted from a passed episode's success score per hint used / per retry.
hint_skill_penalty: z.number().min(0).max(1).default(0.15),
fail_skill_penalty: z.number().min(0).max(1).default(0.1),
});
export type DifficultyHeuristicConfig = z.infer<typeof DifficultyHeuristicConfigSchema>;

// Stock configuration: the schema parsed from an empty object, i.e. every coefficient at its
// declared default. Used as the default `config` argument by the helpers in this module.
export const DEFAULT_DIFFICULTY_HEURISTIC: DifficultyHeuristicConfig =
DifficultyHeuristicConfigSchema.parse({});

// Outcome of a single episode, as consumed by the helpers in this module. The helpers accept the
// inferred type directly and do not call `.parse()` themselves, so these constraints only hold
// for inputs the caller has validated.
export const EpisodeSignalInputSchema = z.object({
passed: z.boolean(),
reveal_clicked: z.boolean(),
hints_used: z.number().int().min(0),
// At least one submission; `submit_count - 1` is what the helpers count as retries.
submit_count: z.number().int().min(1),
time_to_solve_ms: z.number().int().min(0),
// Strictly positive: `difficultySignal` divides `time_to_solve_ms` by this value.
expected_time_ms: z.number().int().positive(),
});
export type EpisodeSignalInput = z.infer<typeof EpisodeSignalInputSchema>;

// Returns the per-episode difficulty signal `s`: a weighted sum of three struggle axes, each
// normalized into [0, 1], plus a flat bonus for an unaided pass.
//   - overtime: 0 at or under `expected_time_ms`, rising linearly to 1 once the solve takes
//               `overtime_cap_ratio` times the expected time.
//   - hints:    `hints_used / hints_cap`, saturating at 1.
//   - retries:  `(submit_count - 1) / failed_attempts_cap`, saturating at 1.
//   - bonus:    `correctness_bonus`, granted only when the episode passed without a reveal.
// Because every axis is clamped, `s` is bounded by the configured weights; with the default
// config that range is [-1.0, +0.3]. Negative = the learner struggled (slow, lots of hints,
// retries); positive = breezed through.
export function difficultySignal(
  episode: EpisodeSignalInput,
  config: DifficultyHeuristicConfig = DEFAULT_DIFFICULTY_HEURISTIC,
): number {
  const overtimeRatio = episode.time_to_solve_ms / episode.expected_time_ms;
  // The config schema accepts `overtime_cap_ratio === 1`, which makes the normalization span zero.
  // Dividing by it would turn an exactly-on-time solve into 0/0 = NaN and poison the whole signal
  // (NaN passes straight through `clamp01`). With a zero span the linear ramp degenerates into a
  // step: any overtime at all counts as maximum struggle, none counts as zero.
  const overtimeSpan = config.overtime_cap_ratio - 1;
  const overtime =
    overtimeSpan > 0
      ? clamp01((overtimeRatio - 1) / overtimeSpan)
      : overtimeRatio > 1
        ? 1
        : 0;
  const hintUsage = clamp01(episode.hints_used / config.hints_cap);
  // Every submission after the first is treated as a failed attempt.
  const failedAttempts = clamp01((episode.submit_count - 1) / config.failed_attempts_cap);
  // A revealed solution earns no bonus even if the final submission passed.
  const correctness = episode.passed && !episode.reveal_clicked ? config.correctness_bonus : 0;
  return (
    config.weight_overtime * overtime +
    config.weight_hint_usage * hintUsage +
    config.weight_failed_attempts * failedAttempts +
    correctness
  );
}

// Picks the tier for the next problem from the current tier and the episode just completed.
// Both thresholds are inclusive: a signal at or above `step_up_threshold` moves one tier harder,
// a signal at or below `step_down_threshold` moves one tier easier, anything in between keeps
// the current tier. The ladder is capped at both ends, so "expert" stays "expert" and "easy"
// stays "easy". The step-up check is evaluated first.
export function nextDifficulty(
  current: DifficultyTier,
  episode: EpisodeSignalInput,
  config: DifficultyHeuristicConfig = DEFAULT_DIFFICULTY_HEURISTIC,
): DifficultyTier {
  const signal = difficultySignal(episode, config);
  const direction =
    signal >= config.step_up_threshold ? +1 : signal <= config.step_down_threshold ? -1 : 0;
  // No movement: hand back the caller's tier untouched rather than routing it through the ladder.
  return direction === 0 ? current : stepTier(current, direction);
}

// Scores one episode for the per-concept skill update.
// An episode that failed, or whose solution was revealed, scores 0. Otherwise the score starts
// at 1 and loses `hint_skill_penalty` per hint and `fail_skill_penalty` per retry (submissions
// beyond the first), floored at 0.
export function episodeSuccessScore(
  episode: EpisodeSignalInput,
  config: DifficultyHeuristicConfig = DEFAULT_DIFFICULTY_HEURISTIC,
): number {
  const solvedUnaided = episode.passed && !episode.reveal_clicked;
  if (!solvedUnaided) {
    return 0;
  }
  const retries = episode.submit_count - 1;
  const hintPenalty = episode.hints_used * config.hint_skill_penalty;
  const failPenalty = retries * config.fail_skill_penalty;
  const penalized = 1 - hintPenalty - failPenalty;
  return Math.max(0, penalized);
}

// Folds one episode into a per-concept skill record and returns a new record:
//   - skill:      EWMA of the previous skill and this episode's success score, weighted by
//                 `ewma_alpha`, then clamped to [0, 1].
//   - confidence: moved `confidence_growth` of the way from its previous value toward
//                 `confidence_max`, and never above `confidence_max`.
//   - attempts:   previous count plus one.
// `skill` and `confidence` are rounded to six decimal places; `concept_id` is carried over.
export function updateSkillScore(
  prev: ConceptSkill,
  episode: EpisodeSignalInput,
  config: DifficultyHeuristicConfig = DEFAULT_DIFFICULTY_HEURISTIC,
): ConceptSkill {
  const { ewma_alpha: alpha, confidence_growth: growth, confidence_max: ceiling } = config;
  const observed = episodeSuccessScore(episode, config);
  const blendedSkill = alpha * observed + (1 - alpha) * prev.skill;
  const grownConfidence = prev.confidence + growth * (ceiling - prev.confidence);
  const cappedConfidence = Math.min(ceiling, grownConfidence);
  return {
    concept_id: prev.concept_id,
    skill: round6(clamp01(blendedSkill)),
    confidence: round6(clamp01(cappedConfidence)),
    attempts: prev.attempts + 1,
  };
}

// Moves `delta` positions along TIER_ORDER from `tier`, pinning the result to the first or last
// entry when the move would run off either end of the ladder.
function stepTier(tier: DifficultyTier, delta: number): DifficultyTier {
  const lastIndex = TIER_ORDER.length - 1;
  const target = TIER_ORDER.indexOf(tier) + delta;
  const bounded = target < 0 ? 0 : target > lastIndex ? lastIndex : target;
  return TIER_ORDER[bounded] as DifficultyTier;
}

// Restricts `n` to the closed interval [0, 1]: values below 0 become 0, values above 1 become 1,
// and anything else (including NaN, which fails both comparisons) is returned unchanged.
function clamp01(n: number): number {
  return n < 0 ? 0 : n > 1 ? 1 : n;
}

// Rounds `n` to six decimal places by scaling up, rounding to the nearest integer, and scaling
// back down.
function round6(n: number): number {
  const scale = 1_000_000;
  const scaled = Math.round(n * scale);
  return scaled / scale;
}
12 changes: 12 additions & 0 deletions packages/scoring/src/index.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,15 @@
export const PACKAGE_NAME = "@learnpro/scoring";

export * from "./policies/index.js";
export {
DEFAULT_DIFFICULTY_HEURISTIC,
DifficultyHeuristicConfigSchema,
EpisodeSignalInputSchema,
TIER_ORDER,
difficultySignal,
episodeSuccessScore,
nextDifficulty,
updateSkillScore,
type DifficultyHeuristicConfig,
type EpisodeSignalInput,
} from "./difficulty.js";
6 changes: 3 additions & 3 deletions project/BOARD.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# LearnPro Board

> **Last updated:** 2026-04-26 (STORY-012 done — versioned `MODEL_PRICING` table + `costFor()`, per-user `DailyTokenBudget` with Opus → Sonnet → Haiku tier ladder, `BudgetGatedLLMProvider` decorator wrapping any `LLMProvider`, `LLMTelemetryEvent` extended with `cost_usd`/`pricing_version`/`session_id`/`tool_used`/`cached_tokens`. DB-backed sink + `agent_calls` migration split into [STORY-060](./stories/STORY-060-agent-calls-db-sink.md) so STORY-012 stays at S.)
> **Last updated:** 2026-04-26 (STORY-018 done — heuristic per-episode `difficultySignal()` / `nextDifficulty()` / `episodeSuccessScore()` / `updateSkillScore()` in `packages/scoring/src/difficulty.ts`, all coefficients tunable via Zod-schema'd config, 20 unit tests covering 6+ representative scenarios, EWMA + asymptotic confidence growth for per-concept skill score. Complement to the catalog-level multi-episode `EloEwmaPolicy` already in `policies/difficulty-policy.ts`.)
> **How to read this:** This is the live status of every Epic, Story, and Task in the project. Hand-maintained for now (a regenerator script lives in the v1 backlog). When you change an item's `status:` frontmatter, also update the row here in the same commit.

---
Expand Down Expand Up @@ -34,7 +34,6 @@ Path A locked 2026-04-25. EPIC-019 (foundation) must land first since every othe
| [STORY-015](stories/STORY-015-session-plan.md) | Session plan agent (3–5 micro-objectives per session) | EPIC-006 | mvp | P0 | M |
| [STORY-016](stories/STORY-016-seed-bank.md) | Curated seed problem bank (~30 Python + ~30 TS) with hidden tests | EPIC-007 | mvp | P0 | L |
| [STORY-017](stories/STORY-017-hint-ladder.md) | 3-rung hint ladder | EPIC-007 | mvp | P0 | S |
| [STORY-018](stories/STORY-018-heuristic-difficulty.md) | Heuristic difficulty tuner (time + hints + errors → next difficulty) | EPIC-007 | mvp | P0 | S |
| [STORY-019](stories/STORY-019-python-track.md) | Python fundamentals track | EPIC-009 | mvp | P0 | M |
| [STORY-020](stories/STORY-020-typescript-track.md) | TypeScript fundamentals track | EPIC-009 | mvp | P0 | M |
| [STORY-021](stories/STORY-021-onboarding-interview.md) | Career-aware onboarding interview (target role, time budget, level) | EPIC-010 | mvp | P0 | S |
Expand Down Expand Up @@ -90,10 +89,11 @@ These stories were filed during EPIC-017 Phase C from the expanded idea catalog

## Recently Done

STORY-012 (per-call LLM cost telemetry + per-user daily token budget) landed 2026-04-26 — versioned `MODEL_PRICING` + `costFor()` calculator, `DailyTokenBudget` with Opus → Sonnet → Haiku tier ladder + downgrade at 80%, `BudgetGatedLLMProvider` decorator. DB-backed sink + `agent_calls` migration split into [STORY-060](./stories/STORY-060-agent-calls-db-sink.md). STORY-006 (Monaco editor + Run button + result panel) landed 2026-04-26 — first user-facing feature in `apps/web`. STORY-008 (TypeScript sandbox runner via Piston) landed 2026-04-26. STORY-007 (Python sandbox runner via Piston) landed 2026-04-26 (PR #14) — first feature Story under EPIC-003. STORY-013 (learner profile schema) landed 2026-04-26 (PR #11) — first feature Story under EPIC-005. STORY-009 (LLM gateway) landed 2026-04-26 (PR #9) — first feature Story under EPIC-004. EPIC-019 (foundation) closed 2026-04-26 with STORY-052 (monorepo skeleton, PR #5) and STORY-057 (policy adapters, PR #7). GitHub repo + PR workflow landed 2026-04-25 (PR #1, STORY-058). EPIC-017 product grooming closed in full on 2026-04-25 (Phases A + B + C). EPIC-001 closed on 2026-04-25 (initial scaffolding commit `c1e17a1`). Phase A commit: `bbf7300`.
STORY-018 (heuristic difficulty tuner) landed 2026-04-26 — per-episode `difficultySignal` + `nextDifficulty` + `episodeSuccessScore` + Bayesian-flavored `updateSkillScore` in `packages/scoring/src/difficulty.ts`, all tunable via Zod-schema'd config, 20 unit tests covering perfect/hint-heavy/repeated-failure/overtime/under-time/no-progress + capped-at-extremes + operator-stricter-threshold scenarios. STORY-012 (per-call LLM cost telemetry + per-user daily token budget) landed 2026-04-26 — versioned `MODEL_PRICING` + `costFor()` calculator, `DailyTokenBudget` with Opus → Sonnet → Haiku tier ladder + downgrade at 80%, `BudgetGatedLLMProvider` decorator. DB-backed sink + `agent_calls` migration split into [STORY-060](./stories/STORY-060-agent-calls-db-sink.md). STORY-006 (Monaco editor + Run button + result panel) landed 2026-04-26 — first user-facing feature in `apps/web`. STORY-008 (TypeScript sandbox runner via Piston) landed 2026-04-26. STORY-007 (Python sandbox runner via Piston) landed 2026-04-26 (PR #14) — first feature Story under EPIC-003. STORY-013 (learner profile schema) landed 2026-04-26 (PR #11) — first feature Story under EPIC-005. STORY-009 (LLM gateway) landed 2026-04-26 (PR #9) — first feature Story under EPIC-004. EPIC-019 (foundation) closed 2026-04-26 with STORY-052 (monorepo skeleton, PR #5) and STORY-057 (policy adapters, PR #7). GitHub repo + PR workflow landed 2026-04-25 (PR #1, STORY-058). EPIC-017 product grooming closed in full on 2026-04-25 (Phases A + B + C). EPIC-001 closed on 2026-04-25 (initial scaffolding commit `c1e17a1`). Phase A commit: `bbf7300`.

| ID | Title | Done |
|----|-------|------|
| [STORY-018](stories/STORY-018-heuristic-difficulty.md) | Heuristic difficulty tuner (per-episode signal + next-difficulty step + EWMA skill score) | 2026-04-26 |
| [STORY-012](stories/STORY-012-cost-telemetry.md) | Per-call LLM cost & latency telemetry + per-user daily token budget (DB sink → STORY-060) | 2026-04-26 |
| [STORY-006](stories/STORY-006-monaco-editor.md) | Monaco editor + Run button + result panel (`/playground` → Next.js proxy → Fastify `/sandbox/run`) | 2026-04-26 |
| [STORY-008](stories/STORY-008-typescript-runner.md) | TypeScript sandbox runner via Piston (TS-specific unit/integration/API tests on top of STORY-007 infra) | 2026-04-26 |
Expand Down
Loading
Loading