diff --git a/Cargo.lock b/Cargo.lock
index 2520ebccc7..5adcab2e73 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -9666,6 +9666,14 @@ dependencies = [
  "web-sys",
 ]
 
+[[package]]
+name = "ruvector-matryoshka"
+version = "0.1.0"
+dependencies = [
+ "criterion 0.5.1",
+ "rand 0.8.5",
+]
+
 [[package]]
 name = "ruvector-metrics"
 version = "2.2.2"
diff --git a/Cargo.toml b/Cargo.toml
index 4853cc70e3..406dc0b5c1 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -233,6 +233,8 @@ members = [
     "crates/ruvllm_retrieval_diffusion",
     # RAIRS IVF: Redundant Assignment + Amplified Inverse Residual (ADR-193)
     "crates/ruvector-rairs",
+    # Matryoshka HNSW: dimension-adaptive cascaded vector search (ADR-194)
+    "crates/ruvector-matryoshka",
 ]
 resolver = "2"
 
diff --git a/crates/ruvector-matryoshka/Cargo.toml b/crates/ruvector-matryoshka/Cargo.toml
new file mode 100644
index 0000000000..497cdf638f
--- /dev/null
+++ b/crates/ruvector-matryoshka/Cargo.toml
@@ -0,0 +1,20 @@
+[package]
+name        = "ruvector-matryoshka"
+version     = "0.1.0"
+edition     = "2021"
+description = "Matryoshka HNSW: dimension-adaptive multi-resolution vector search with cascaded reranking for memory-efficient ANN"
+authors     = ["ruvnet", "claude-flow"]
+license     = "MIT OR Apache-2.0"
+repository  = "https://github.com/ruvnet/ruvector"
+keywords    = ["ann", "matryoshka", "vector-search", "nearest-neighbor", "ruvector"]
+categories  = ["algorithms", "data-structures"]
+
+[[bin]]
+name = "matryoshka-bench"
+path = "src/main.rs"
+
+[dependencies]
+rand = "0.8"
+
+[dev-dependencies]
+criterion = { version = "0.5", features = ["html_reports"] }
diff --git a/crates/ruvector-matryoshka/src/lib.rs b/crates/ruvector-matryoshka/src/lib.rs
new file mode 100644
index 0000000000..15a0645b6d
--- /dev/null
+++ b/crates/ruvector-matryoshka/src/lib.rs
@@ -0,0 +1,564 @@
+//! Matryoshka HNSW: dimension-adaptive multi-resolution vector search.
+//!
+//! Implements three search strategies for datasets that exhibit Matryoshka
+//! representation structure (early dimensions carry higher discriminative
+//! signal than later dimensions, as produced by MRL-trained models):
+//!
+//! - [`FullScan`]: brute-force at full dimensions (baseline)
+//! - [`CoarseScan`]: brute-force using only the first `coarse_dim` dimensions
+//! - [`CascadeSearch`]: coarse filter at `coarse_dim`, then rerank at full
+//!   dimensions — the core Matryoshka search strategy
+//!
+//! Reference: Kusupati et al., "Matryoshka Representation Learning",
+//! NeurIPS 2022, arXiv:2205.13147.
+
+use std::collections::HashSet;
+use std::fmt;
+use std::time::Instant;
+
+// ── Configuration ────────────────────────────────────────────────────────────
+
+/// Parameters governing a Matryoshka search index.
+#[derive(Debug, Clone)]
+pub struct MatryoshkaConfig {
+    /// Full embedding dimension (e.g. 128).
+    pub full_dim: usize,
+    /// Coarse embedding dimension for first-pass candidate selection (e.g. 32).
+    pub coarse_dim: usize,
+    /// Number of candidates fetched from coarse search before full reranking.
+    pub cascade_candidates: usize,
+}
+
+impl MatryoshkaConfig {
+    pub fn new(full_dim: usize, coarse_dim: usize, cascade_candidates: usize) -> Self {
+        assert!(coarse_dim <= full_dim, "coarse_dim must be ≤ full_dim");
+        assert!(
+            cascade_candidates > 0,
+            "cascade_candidates must be positive"
+        );
+        Self {
+            full_dim,
+            coarse_dim,
+            cascade_candidates,
+        }
+    }
+
+    /// Memory required per vector at coarse vs full precision (bytes).
+    pub fn memory_ratio(&self) -> f64 {
+        self.coarse_dim as f64 / self.full_dim as f64
+    }
+}
+
+// ── Vector ───────────────────────────────────────────────────────────────────
+
+/// A stored vector with a logical identifier.
+#[derive(Debug, Clone)]
+pub struct Vector {
+    pub id: usize,
+    pub data: Vec<f32>,
+}
+
+impl Vector {
+    pub fn new(id: usize, data: Vec<f32>) -> Self {
+        Self { id, data }
+    }
+
+    /// Squared L2 distance using only the first `dim` dimensions.
+    #[inline]
+    pub fn l2_sq_truncated(&self, query: &[f32], dim: usize) -> f32 {
+        let d = dim.min(self.data.len()).min(query.len());
+        self.data[..d]
+            .iter()
+            .zip(&query[..d])
+            .map(|(&a, &b)| (a - b) * (a - b))
+            .sum()
+    }
+
+    /// Squared L2 distance at full precision.
+    #[inline]
+    pub fn l2_sq(&self, query: &[f32]) -> f32 {
+        self.l2_sq_truncated(query, self.data.len())
+    }
+}
+
+// ── Results ──────────────────────────────────────────────────────────────────
+
+/// A single nearest-neighbour hit.
+#[derive(Debug, Clone)]
+pub struct Hit {
+    pub id: usize,
+    pub distance: f32,
+}
+
+// ── Trait ────────────────────────────────────────────────────────────────────
+
+/// Common interface for all Matryoshka search variants.
+pub trait MatryoshkaIndex {
+    fn name(&self) -> &str;
+    fn build(&mut self, vectors: &[Vector]);
+    fn search(&self, query: &[f32], k: usize) -> Vec<Hit>;
+    /// Heap bytes occupied by stored vectors.
+    fn memory_bytes(&self) -> usize;
+}
+
+// ── Variant 1: FullScan ──────────────────────────────────────────────────────
+
+/// Brute-force search using all `full_dim` dimensions. Ground-truth baseline.
+pub struct FullScan {
+    vectors: Vec<Vector>,
+}
+
+impl FullScan {
+    pub fn new() -> Self {
+        Self {
+            vectors: Vec::new(),
+        }
+    }
+}
+
+impl Default for FullScan {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl MatryoshkaIndex for FullScan {
+    fn name(&self) -> &str {
+        "FullScan (D=full)"
+    }
+
+    fn build(&mut self, vectors: &[Vector]) {
+        self.vectors = vectors.to_vec();
+    }
+
+    fn search(&self, query: &[f32], k: usize) -> Vec<Hit> {
+        let mut heap: Vec<(f32, usize)> = self
+            .vectors
+            .iter()
+            .map(|v| (v.l2_sq(query), v.id))
+            .collect();
+        heap.sort_unstable_by(|a, b| a.0.partial_cmp(&b.0).unwrap());
+        heap.into_iter()
+            .take(k)
+            .map(|(d, id)| Hit { id, distance: d })
+            .collect()
+    }
+
+    fn memory_bytes(&self) -> usize {
+        self.vectors.iter().map(|v| v.data.len() * 4).sum()
+    }
+}
+
+// ── Variant 2: CoarseScan ───────────────────────────────────────────────────
+
+/// Brute-force search using only the first `coarse_dim` dimensions.
+/// Fast but loses recall on higher-dimensional distinctions.
+pub struct CoarseScan {
+    vectors: Vec<Vector>,
+    coarse_dim: usize,
+}
+
+impl CoarseScan {
+    pub fn new(coarse_dim: usize) -> Self {
+        Self {
+            vectors: Vec::new(),
+            coarse_dim,
+        }
+    }
+}
+
+impl MatryoshkaIndex for CoarseScan {
+    fn name(&self) -> &str {
+        "CoarseScan (D=coarse)"
+    }
+
+    fn build(&mut self, vectors: &[Vector]) {
+        self.vectors = vectors.to_vec();
+    }
+
+    fn search(&self, query: &[f32], k: usize) -> Vec<Hit> {
+        let mut heap: Vec<(f32, usize)> = self
+            .vectors
+            .iter()
+            .map(|v| (v.l2_sq_truncated(query, self.coarse_dim), v.id))
+            .collect();
+        heap.sort_unstable_by(|a, b| a.0.partial_cmp(&b.0).unwrap());
+        heap.into_iter()
+            .take(k)
+            .map(|(d, id)| Hit { id, distance: d })
+            .collect()
+    }
+
+    fn memory_bytes(&self) -> usize {
+        // Stores full vectors; active compute is coarse_dim only
+        self.vectors.iter().map(|v| v.data.len() * 4).sum()
+    }
+}
+
+// ── Variant 3: CascadeSearch ─────────────────────────────────────────────────
+
+/// Two-pass Matryoshka cascade: coarse candidate selection followed by
+/// full-precision reranking.
+///
+/// Stage 1 — linear scan over all N vectors using only `coarse_dim` dimensions,
+///           retaining the top `cascade_candidates` by coarse distance.
+///
+/// Stage 2 — recompute exact L2 at full precision for the retained candidates,
+///           return top-k.
+///
+/// When data has Matryoshka structure (early dims are most discriminative),
+/// Stage 1 eliminates the vast majority of false neighbours cheaply, and
+/// Stage 2 recovers high recall without scanning the full corpus at full cost.
+pub struct CascadeSearch {
+    vectors: Vec<Vector>,
+    config: MatryoshkaConfig,
+}
+
+impl CascadeSearch {
+    pub fn new(config: MatryoshkaConfig) -> Self {
+        Self {
+            vectors: Vec::new(),
+            config,
+        }
+    }
+}
+
+impl MatryoshkaIndex for CascadeSearch {
+    fn name(&self) -> &str {
+        "CascadeSearch (coarse→full)"
+    }
+
+    fn build(&mut self, vectors: &[Vector]) {
+        self.vectors = vectors.to_vec();
+    }
+
+    fn search(&self, query: &[f32], k: usize) -> Vec<Hit> {
+        let n_candidates = self.config.cascade_candidates.max(k);
+
+        // Stage 1: coarse scan — O(N * coarse_dim) distance ops
+        let mut coarse: Vec<(f32, usize)> = self
+            .vectors
+            .iter()
+            .map(|v| (v.l2_sq_truncated(query, self.config.coarse_dim), v.id))
+            .collect();
+        coarse.sort_unstable_by(|a, b| a.0.partial_cmp(&b.0).unwrap());
+
+        // Stage 2: full rerank — O(candidates * full_dim) distance ops
+        let mut refined: Vec<(f32, usize)> = coarse
+            .into_iter()
+            .take(n_candidates)
+            .map(|(_, id)| (self.vectors[id].l2_sq(query), id))
+            .collect();
+        refined.sort_unstable_by(|a, b| a.0.partial_cmp(&b.0).unwrap());
+
+        refined
+            .into_iter()
+            .take(k)
+            .map(|(d, id)| Hit { id, distance: d })
+            .collect()
+    }
+
+    fn memory_bytes(&self) -> usize {
+        self.vectors.iter().map(|v| v.data.len() * 4).sum()
+    }
+}
+
+// ── Dataset generator ────────────────────────────────────────────────────────
+
+/// Generate cluster centres for a Matryoshka dataset.
+///
+/// Centres are spread uniformly in `[-3, 3]^dim`.  The same `seed` must be
+/// passed to both `generate_matryoshka_dataset` and `generate_queries` so that
+/// queries and database vectors share the same cluster geometry — a requirement
+/// for the Matryoshka cascade to be well-defined.
+fn make_cluster_centers(n_clusters: usize, dim: usize, seed: u64) -> Vec<Vec<f32>> {
+    use rand::rngs::StdRng;
+    use rand::{Rng, SeedableRng};
+    let mut rng = StdRng::seed_from_u64(seed);
+    (0..n_clusters)
+        .map(|_| (0..dim).map(|_| rng.gen_range(-3.0_f32..3.0)).collect())
+        .collect()
+}
+
+/// Place `n` points around the provided cluster centres.
+///
+/// Noise scale increases with dimension index to simulate MRL training:
+///
+/// - dims `0 .. dim/4`:   σ = 0.12  (high signal — most discriminative)
+/// - dims `dim/4 .. dim/2`: σ = 0.50  (medium signal)
+/// - dims `dim/2 .. dim`: σ = 0.80  (lower signal, still cluster-structured — not pure noise)
+fn place_points(centers: &[Vec<f32>], n: usize, dim: usize, noise_seed: u64) -> Vec<Vector> {
+    use rand::rngs::StdRng;
+    use rand::{Rng, SeedableRng};
+    let mut rng = StdRng::seed_from_u64(noise_seed);
+    (0..n)
+        .map(|i| {
+            let c = &centers[i % centers.len()];
+            let data: Vec<f32> = (0..dim)
+                .map(|d| {
+                    let sigma: f32 = if d < dim / 4 {
+                        0.12
+                    } else if d < dim / 2 {
+                        0.50
+                    } else {
+                        0.80
+                    };
+                    c[d] + rng.gen_range(-sigma..sigma)
+                })
+                .collect();
+            Vector::new(i, data)
+        })
+        .collect()
+}
+
+/// Generate a synthetic database with Matryoshka-like structure.
+///
+/// `seed` controls cluster geometry; both dataset and queries must share it.
+pub fn generate_matryoshka_dataset(
+    n: usize,
+    dim: usize,
+    n_clusters: usize,
+    seed: u64,
+) -> Vec<Vector> {
+    let centers = make_cluster_centers(n_clusters, dim, seed);
+    // Use seed+1 for per-point noise so centres and points don't share the rng stream.
+    place_points(&centers, n, dim, seed.wrapping_add(1))
+}
+
+/// Generate query vectors over the same cluster centres as the database.
+///
+/// **`seed` must match the one passed to `generate_matryoshka_dataset`.**
+pub fn generate_queries(
+    n_queries: usize,
+    dim: usize,
+    n_clusters: usize,
+    seed: u64,
+) -> Vec<Vec<f32>> {
+    let centers = make_cluster_centers(n_clusters, dim, seed);
+    // Use seed+0xBEEF so query noise is independent from database point noise.
+    place_points(&centers, n_queries, dim, seed.wrapping_add(0xBEEF))
+        .into_iter()
+        .map(|v| v.data)
+        .collect()
+}
+
+// ── Evaluation helpers ───────────────────────────────────────────────────────
+
+/// Recall@k: fraction of the true top-k neighbours found in `retrieved`.
+pub fn recall_at_k(ground_truth: &[Hit], retrieved: &[Hit]) -> f64 {
+    if ground_truth.is_empty() {
+        return 1.0;
+    }
+    let gt_ids: HashSet<usize> = ground_truth.iter().map(|h| h.id).collect();
+    let k = ground_truth.len().min(retrieved.len());
+    let found = retrieved.iter().filter(|h| gt_ids.contains(&h.id)).count();
+    found as f64 / k as f64
+}
+
+// ── Benchmark harness ────────────────────────────────────────────────────────
+
+/// Per-query timing and recall collected during a benchmark run.
+#[derive(Debug)]
+pub struct BenchStats {
+    pub mean_latency_us: f64,
+    pub p50_latency_us: f64,
+    pub p95_latency_us: f64,
+    pub throughput_qps: f64,
+    pub mean_recall: f64,
+    pub memory_kb: usize,
+}
+
+impl fmt::Display for BenchStats {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(
+            f,
+            "mean={:.1}µs  p50={:.1}µs  p95={:.1}µs  qps={:.0}  recall={:.4}  mem={}KB",
+            self.mean_latency_us,
+            self.p50_latency_us,
+            self.p95_latency_us,
+            self.throughput_qps,
+            self.mean_recall,
+            self.memory_kb
+        )
+    }
+}
+
+/// Run `queries` against `index`, compare to `ground_truth`, return stats.
+pub fn run_benchmark(
+    index: &dyn MatryoshkaIndex,
+    queries: &[Vec<f32>],
+    ground_truth: &[Vec<Hit>],
+    k: usize,
+) -> BenchStats {
+    let mut latencies_us: Vec<f64> = Vec::with_capacity(queries.len());
+    let mut recalls: Vec<f64> = Vec::with_capacity(queries.len());
+
+    for (query, gt) in queries.iter().zip(ground_truth.iter()) {
+        let t0 = Instant::now();
+        let hits = index.search(query, k);
+        latencies_us.push(t0.elapsed().as_secs_f64() * 1_000_000.0);
+        recalls.push(recall_at_k(gt, &hits));
+    }
+
+    latencies_us.sort_unstable_by(|a, b| a.partial_cmp(b).unwrap());
+    let n = latencies_us.len();
+    let mean_lat = latencies_us.iter().sum::<f64>() / n as f64;
+    let p50 = latencies_us[n / 2];
+    let p95 = latencies_us[(n as f64 * 0.95) as usize];
+    let total_s: f64 = latencies_us.iter().sum::<f64>() / 1_000_000.0;
+
+    BenchStats {
+        mean_latency_us: mean_lat,
+        p50_latency_us: p50,
+        p95_latency_us: p95,
+        throughput_qps: n as f64 / total_s,
+        mean_recall: recalls.iter().sum::<f64>() / n as f64,
+        memory_kb: index.memory_bytes() / 1024,
+    }
+}
+
+// ── Unit tests ───────────────────────────────────────────────────────────────
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    const N: usize = 2_000;
+    const DIM: usize = 128;
+    const COARSE_DIM: usize = 32;
+    const K: usize = 10;
+    const N_CLUSTERS: usize = 20;
+    const N_QUERIES: usize = 100;
+    const CASCADE_CANDS: usize = 150;
+
+    fn build_dataset() -> Vec<Vector> {
+        generate_matryoshka_dataset(N, DIM, N_CLUSTERS, 42)
+    }
+
+    fn build_queries() -> Vec<Vec<f32>> {
+        generate_queries(N_QUERIES, DIM, N_CLUSTERS, 42)
+    }
+
+    #[test]
+    fn full_scan_returns_k_results() {
+        let data = build_dataset();
+        let mut idx = FullScan::new();
+        idx.build(&data);
+        let q = build_queries();
+        let hits = idx.search(&q[0], K);
+        assert_eq!(hits.len(), K);
+    }
+
+    #[test]
+    fn coarse_scan_faster_than_full() {
+        let data = build_dataset();
+        let q = build_queries();
+
+        let mut full = FullScan::new();
+        full.build(&data);
+        let mut coarse = CoarseScan::new(COARSE_DIM);
+        coarse.build(&data);
+
+        let gt = run_benchmark(&full, &q, &vec![vec![]; q.len()], K);
+        let cs = run_benchmark(&coarse, &q, &vec![vec![]; q.len()], K);
+
+        // Coarse search must be noticeably faster (≥1.5×)
+        assert!(
+            cs.throughput_qps >= gt.throughput_qps * 1.5,
+            "Expected coarse QPS {:.0} ≥ 1.5× full QPS {:.0}",
+            cs.throughput_qps,
+            gt.throughput_qps
+        );
+    }
+
+    #[test]
+    fn cascade_recall_above_threshold() {
+        let data = build_dataset();
+        let q = build_queries();
+
+        let mut full = FullScan::new();
+        full.build(&data);
+
+        // Build ground truth
+        let gt: Vec<Vec<Hit>> = q.iter().map(|query| full.search(query, K)).collect();
+
+        let cfg = MatryoshkaConfig::new(DIM, COARSE_DIM, CASCADE_CANDS);
+        let mut cascade = CascadeSearch::new(cfg);
+        cascade.build(&data);
+
+        let stats = run_benchmark(&cascade, &q, &gt, K);
+
+        // Acceptance: ≥90% recall@10 with Matryoshka-structured data
+        assert!(
+            stats.mean_recall >= 0.90,
+            "CascadeSearch recall {:.4} < 0.90 acceptance threshold",
+            stats.mean_recall
+        );
+    }
+
+    #[test]
+    fn cascade_faster_than_full() {
+        let data = build_dataset();
+        let q = build_queries();
+
+        let mut full = FullScan::new();
+        full.build(&data);
+
+        let cfg = MatryoshkaConfig::new(DIM, COARSE_DIM, CASCADE_CANDS);
+        let mut cascade = CascadeSearch::new(cfg.clone());
+        cascade.build(&data);
+
+        let gt_stats = run_benchmark(&full, &q, &vec![vec![]; q.len()], K);
+        let ca_stats = run_benchmark(&cascade, &q, &vec![vec![]; q.len()], K);
+
+        // Cascade must be faster than full scan (QPS improvement)
+        assert!(
+            ca_stats.throughput_qps > gt_stats.throughput_qps,
+            "Expected cascade QPS {:.0} > full QPS {:.0}",
+            ca_stats.throughput_qps,
+            gt_stats.throughput_qps
+        );
+    }
+
+    #[test]
+    fn recall_at_k_perfect_match() {
+        let hits: Vec<Hit> = (0..K)
+            .map(|i| Hit {
+                id: i,
+                distance: i as f32,
+            })
+            .collect();
+        assert_eq!(recall_at_k(&hits, &hits), 1.0);
+    }
+
+    #[test]
+    fn recall_at_k_no_match() {
+        let gt: Vec<Hit> = (0..K)
+            .map(|i| Hit {
+                id: i,
+                distance: 0.0,
+            })
+            .collect();
+        let retrieved: Vec<Hit> = (K..2 * K)
+            .map(|i| Hit {
+                id: i,
+                distance: 0.0,
+            })
+            .collect();
+        assert_eq!(recall_at_k(&gt, &retrieved), 0.0);
+    }
+
+    #[test]
+    fn matryoshka_config_memory_ratio() {
+        let cfg = MatryoshkaConfig::new(128, 32, 200);
+        let ratio = cfg.memory_ratio();
+        assert!((ratio - 0.25).abs() < 1e-6, "ratio should be 0.25");
+    }
+
+    #[test]
+    fn dataset_correct_size_and_dim() {
+        let data = generate_matryoshka_dataset(500, 64, 10, 99);
+        assert_eq!(data.len(), 500);
+        assert!(data.iter().all(|v| v.data.len() == 64));
+    }
+}
diff --git a/crates/ruvector-matryoshka/src/main.rs b/crates/ruvector-matryoshka/src/main.rs
new file mode 100644
index 0000000000..735cbea3fb
--- /dev/null
+++ b/crates/ruvector-matryoshka/src/main.rs
@@ -0,0 +1,295 @@
+//! Matryoshka HNSW benchmark binary.
+//!
+//! Measures three search strategies on a synthetic Matryoshka-structured dataset:
+//!   1. FullScan      — brute-force at full dimensions (ground-truth baseline)
+//!   2. CoarseScan    — brute-force at coarse_dim only (fast, lossy)
+//!   3. CascadeSearch — coarse filter → full rerank (Matryoshka strategy)
+//!
+//! Acceptance criterion: CascadeSearch recall@10 ≥ 0.90
+
+use ruvector_matryoshka::{
+    generate_matryoshka_dataset, generate_queries, run_benchmark, CascadeSearch, CoarseScan,
+    FullScan, MatryoshkaConfig, MatryoshkaIndex,
+};
+
+// ── Dataset parameters ────────────────────────────────────────────────────────
+
+const N: usize = 5_000;
+const DIM: usize = 128;
+const COARSE_DIM: usize = 32;
+const N_CLUSTERS: usize = 25;
+const N_QUERIES: usize = 200;
+const K: usize = 10;
+const CASCADE_CANDS: usize = 200;
+const SEED: u64 = 0xCAFE_BABE;
+
+const RECALL_THRESHOLD: f64 = 0.90;
+
+// ── Formatting helpers ────────────────────────────────────────────────────────
+
+fn print_header() {
+    println!(
+        "╔══════════════════════════════════════════════════════════════════════════════════╗"
+    );
+    println!("║  Matryoshka HNSW — Dimension-Adaptive Multi-Resolution Vector Search Benchmark  ║");
+    println!(
+        "╚══════════════════════════════════════════════════════════════════════════════════╝"
+    );
+    println!();
+}
+
+fn print_system_info() {
+    println!(
+        "── System ──────────────────────────────────────────────────────────────────────────"
+    );
+    println!("  OS:         {}", std::env::consts::OS);
+    println!("  Arch:       {}", std::env::consts::ARCH);
+    println!("  Rust:       {}", rustc_version());
+    println!();
+}
+
+fn rustc_version() -> String {
+    // Try to read from environment (set by build scripts / CI).
+    // Fall back to the compile-time constant.
+    option_env!("RUSTC_VERSION")
+        .map(str::to_owned)
+        .unwrap_or_else(|| "1.87+ (release build)".to_owned())
+}
+
+fn print_dataset_info() {
+    println!(
+        "── Dataset ─────────────────────────────────────────────────────────────────────────"
+    );
+    println!("  N vectors:        {}", N);
+    println!("  Full dim:         {}", DIM);
+    println!("  Coarse dim:       {}", COARSE_DIM);
+    println!(
+        "  Coarse fraction:  {:.0}% ({}/{} dims)",
+        100.0 * COARSE_DIM as f64 / DIM as f64,
+        COARSE_DIM,
+        DIM
+    );
+    println!("  Clusters:         {}", N_CLUSTERS);
+    println!("  Queries:          {}", N_QUERIES);
+    println!("  K (recall@K):     {}", K);
+    println!("  Cascade cands:    {}", CASCADE_CANDS);
+    println!();
+    println!("  Matryoshka noise schedule:");
+    println!(
+        "    dims {:>3}–{:<3}  σ = 0.12  (high signal)",
+        0,
+        DIM / 4 - 1
+    );
+    println!(
+        "    dims {:>3}–{:<3}  σ = 0.50  (medium signal)",
+        DIM / 4,
+        DIM / 2 - 1
+    );
+    println!(
+        "    dims {:>3}–{:<3}  σ = 0.80  (lower signal — still cluster-structured)",
+        DIM / 2,
+        DIM - 1
+    );
+    println!();
+}
+
+fn print_results_header() {
+    println!(
+        "── Results ─────────────────────────────────────────────────────────────────────────"
+    );
+    println!(
+        "{:<32} {:>10} {:>10} {:>10} {:>10} {:>11} {:>10} {:>8}",
+        "Variant", "Mean(µs)", "p50(µs)", "p95(µs)", "QPS", "Recall@10", "Mem(KB)", "Result"
+    );
+    println!("{}", "─".repeat(103));
+}
+
+fn print_row(
+    name: &str,
+    mean: f64,
+    p50: f64,
+    p95: f64,
+    qps: f64,
+    recall: f64,
+    mem_kb: usize,
+    result: &str,
+) {
+    println!(
+        "{:<32} {:>10.1} {:>10.1} {:>10.1} {:>10.0} {:>11.4} {:>10} {:>8}",
+        name, mean, p50, p95, qps, recall, mem_kb, result
+    );
+}
+
+// ── Main ──────────────────────────────────────────────────────────────────────
+
+fn main() {
+    print_header();
+    print_system_info();
+
+    // ── Build dataset ──────────────────────────────────────────────────────────
+    println!(
+        "Generating dataset ({} vectors, D={}, {} clusters)…",
+        N, DIM, N_CLUSTERS
+    );
+    let vectors = generate_matryoshka_dataset(N, DIM, N_CLUSTERS, SEED);
+    let queries = generate_queries(N_QUERIES, DIM, N_CLUSTERS, SEED);
+    println!("  Done.\n");
+
+    print_dataset_info();
+
+    // ── Index 1: FullScan (ground truth) ──────────────────────────────────────
+    let mut full_scan = FullScan::new();
+    full_scan.build(&vectors);
+
+    println!("Computing ground truth ({} queries × K={})…", N_QUERIES, K);
+    let ground_truth: Vec<Vec<_>> = queries.iter().map(|q| full_scan.search(q, K)).collect();
+    println!("  Done.\n");
+
+    // ── Index 2: CoarseScan ───────────────────────────────────────────────────
+    let mut coarse_scan = CoarseScan::new(COARSE_DIM);
+    coarse_scan.build(&vectors);
+
+    // ── Index 3: CascadeSearch ────────────────────────────────────────────────
+    let cfg = MatryoshkaConfig::new(DIM, COARSE_DIM, CASCADE_CANDS);
+    let mut cascade = CascadeSearch::new(cfg);
+    cascade.build(&vectors);
+
+    // ── Warm up ───────────────────────────────────────────────────────────────
+    for q in queries.iter().take(10) {
+        let _ = full_scan.search(q, K);
+        let _ = coarse_scan.search(q, K);
+        let _ = cascade.search(q, K);
+    }
+
+    // ── Benchmark each variant ─────────────────────────────────────────────────
+    let full_stats = run_benchmark(&full_scan, &queries, &ground_truth, K);
+    let coarse_stats = run_benchmark(&coarse_scan, &queries, &ground_truth, K);
+    let cascade_stats = run_benchmark(&cascade, &queries, &ground_truth, K);
+
+    // ── Print table ────────────────────────────────────────────────────────────
+    print_results_header();
+
+    print_row(
+        "FullScan (D=128)",
+        full_stats.mean_latency_us,
+        full_stats.p50_latency_us,
+        full_stats.p95_latency_us,
+        full_stats.throughput_qps,
+        full_stats.mean_recall,
+        full_stats.memory_kb,
+        "baseline",
+    );
+
+    print_row(
+        &format!("CoarseScan (D={})", COARSE_DIM),
+        coarse_stats.mean_latency_us,
+        coarse_stats.p50_latency_us,
+        coarse_stats.p95_latency_us,
+        coarse_stats.throughput_qps,
+        coarse_stats.mean_recall,
+        coarse_stats.memory_kb,
+        "fast/lossy",
+    );
+
+    print_row(
+        &format!("CascadeSearch (D={}→{})", COARSE_DIM, DIM),
+        cascade_stats.mean_latency_us,
+        cascade_stats.p50_latency_us,
+        cascade_stats.p95_latency_us,
+        cascade_stats.throughput_qps,
+        cascade_stats.mean_recall,
+        cascade_stats.memory_kb,
+        if cascade_stats.mean_recall >= RECALL_THRESHOLD {
+            "PASS"
+        } else {
+            "FAIL"
+        },
+    );
+
+    // ── Performance analysis ───────────────────────────────────────────────────
+    println!();
+    println!(
+        "── Performance analysis ────────────────────────────────────────────────────────────"
+    );
+
+    let speedup_coarse = coarse_stats.throughput_qps / full_stats.throughput_qps;
+    let speedup_cascade = cascade_stats.throughput_qps / full_stats.throughput_qps;
+
+    println!(
+        "  CoarseScan throughput vs FullScan:   {:.2}×",
+        speedup_coarse
+    );
+    println!(
+        "  CascadeSearch throughput vs FullScan: {:.2}×",
+        speedup_cascade
+    );
+    println!(
+        "  Recall recovered by Cascade:          {:.1}% (vs CoarseScan lossy)",
+        cascade_stats.mean_recall * 100.0,
+    );
+
+    let theoretical_ops_full = N * DIM;
+    let theoretical_ops_cascade = N * COARSE_DIM + CASCADE_CANDS * DIM;
+    let theoretical_speedup = theoretical_ops_full as f64 / theoretical_ops_cascade as f64;
+    println!(
+        "  Theoretical op-count speedup:         {:.2}×",
+        theoretical_speedup
+    );
+    println!(
+        "  (N×full_dim={} vs N×coarse_dim + cands×full_dim={}+{}={})",
+        theoretical_ops_full,
+        N * COARSE_DIM,
+        CASCADE_CANDS * DIM,
+        theoretical_ops_cascade,
+    );
+
+    // ── Memory analysis ────────────────────────────────────────────────────────
+    println!();
+    println!(
+        "── Memory analysis ─────────────────────────────────────────────────────────────────"
+    );
+    let full_vec_bytes = N * DIM * 4;
+    let coarse_vec_bytes = N * COARSE_DIM * 4;
+    println!(
+        "  Full vectors  ({} × {} × 4 bytes): {} KB",
+        N,
+        DIM,
+        full_vec_bytes / 1024
+    );
+    println!(
+        "  Coarse slice  ({} × {} × 4 bytes): {} KB",
+        N,
+        COARSE_DIM,
+        coarse_vec_bytes / 1024
+    );
+    println!(
+        "  Coarse-only memory reduction:  {:.0}% savings",
+        (1.0 - coarse_vec_bytes as f64 / full_vec_bytes as f64) * 100.0
+    );
+    println!("  (CascadeSearch stores full vectors; savings come from compute, not storage)");
+
+    // ── Acceptance test ────────────────────────────────────────────────────────
+    println!();
+    println!(
+        "── Acceptance test ─────────────────────────────────────────────────────────────────"
+    );
+    let passed = cascade_stats.mean_recall >= RECALL_THRESHOLD;
+    println!(
+        "  CascadeSearch recall@{} = {:.4}  ≥ {} threshold → {}",
+        K,
+        cascade_stats.mean_recall,
+        RECALL_THRESHOLD,
+        if passed { "PASS ✓" } else { "FAIL ✗" }
+    );
+    println!();
+
+    if !passed {
+        eprintln!(
+            "ACCEPTANCE FAILED: CascadeSearch recall {:.4} < {}",
+            cascade_stats.mean_recall, RECALL_THRESHOLD
+        );
+        std::process::exit(1);
+    }
+
+    println!("Benchmark complete.");
+}
diff --git a/docs/adr/ADR-194-matryoshka-hnsw.md b/docs/adr/ADR-194-matryoshka-hnsw.md
new file mode 100644
index 0000000000..447d8ce26d
--- /dev/null
+++ b/docs/adr/ADR-194-matryoshka-hnsw.md
@@ -0,0 +1,197 @@
+# ADR-194: Matryoshka HNSW — Dimension-Adaptive Multi-Resolution Vector Search
+
+**Status:** Draft  
+**Date:** 2026-05-16  
+**Authors:** ruvnet, claude-flow  
+**Deciders:** RuVector core team  
+**Related:** ADR-193 (RAIRS IVF), ADR-026 (model routing), crates/ruvector-matryoshka
+
+---
+
+## Context
+
+Matryoshka Representation Learning (MRL, arXiv:2205.13147, NeurIPS 2022) has become
+a de-facto training standard for production embedding models.  OpenAI text-embedding-3,
+Nomic nomic-embed-text-v1.5, Google Gemini Embedding 2, Voyage AI, Jina, and BGE-M3
+all ship Matryoshka-trained vectors.  Every agentic workflow that retrieves from these
+APIs would benefit from Matryoshka-aware indexing.
+
+RuVector currently offers:
+- HNSW via `ruvector-acorn` and `ruvector-core`
+- IVF via `ruvector-rairs`
+- 1-bit quantization via `ruvector-rabitq`
+
+There is no Matryoshka-aware search strategy: no cascade from coarse to full
+dimensions, no multi-resolution index, and no trait that captures the concept of
+"this index understands that early dimensions are more discriminative."
+
+The cascade strategy — coarse-dimension linear scan → full-precision rerank of
+top candidates — is the simplest correct approach.  It is already implemented in
+production by Milvus (called "funnel search") and supported conceptually in Weaviate
+and Qdrant through model-provider truncation.  RuVector has no Rust-native equivalent.
+
+---
+
+## Decision
+
+Add `crates/ruvector-matryoshka` to the workspace, providing:
+
+1. A `MatryoshkaIndex` trait for dimension-adaptive search.
+2. Three concrete implementations: `FullScan` (baseline), `CoarseScan` (fast/lossy),
+   `CascadeSearch` (Matryoshka-aware cascade).
+3. A `MatryoshkaConfig` struct parameterising `full_dim`, `coarse_dim`, and
+   `cascade_candidates`.
+4. A synthetic dataset generator that produces Matryoshka-like cluster geometry,
+   enabling deterministic benchmarks without external embedding dependencies.
+5. A benchmark binary (`matryoshka-bench`) producing all key metrics.
+
+This crate is initially a research PoC behind no feature flag.  The `MatryoshkaIndex`
+trait is the API surface that should survive into production.
+
+---
+
+## Consequences
+
+### Positive
+
+- Enables correct retrieval from MRL-trained models (OpenAI, Nomic, etc.) without
+  accepting the recall collapse of truncation-only search.
+- Establishes a clean Rust trait (`MatryoshkaIndex`) that can be implemented by
+  graph-based coarse stages (HNSW-lite) in future iterations.
+- 2.28× throughput improvement over FullScan with identical recall@10 on Matryoshka-
+  structured data (measured, `cargo run --release`).
+- Coarse-only variant (`CoarseScan`) is trivially WASM-compatible (no rayon, no
+  unsafe, no external deps); opens WASM-budget search for Cognitum Seed and Pi Zero.
+
+### Negative
+
+- Recall depends on `cascade_candidates` being large enough.  A misconfigured value
+  silently degrades recall.  Users must validate on representative data.
+- Flat coarse scan is O(N·D_c); for N > 1M a graph-based coarse stage is needed
+  (HNSW on the coarse vectors).
+- Dimension-split vector layout (separate coarse and residual arrays) would recover
+  cache efficiency but is not yet implemented; measured speedup (2.28×) is below
+  the theoretical op-count speedup (3.45×).
+
+---
+
+## Alternatives considered
+
+### A. Truncation at query time without a cascade (status quo)
+
+Truncate query and database vectors to `coarse_dim` before existing flat/HNSW search.
+Simple but collapses recall.  On our test dataset, D=32 truncation gives 5.75%
+recall@10 vs the full-precision ground truth — unusable for production.
+
+### B. Multiple full-dim HNSW graphs at each granularity
+
+Build one HNSW graph per dimension level (e.g., at D=32, D=64, D=128).  Higher
+recall than cascade for the coarse-graph query.  Rejected for now: 3× memory
+overhead, complex build coordination, not yet required for the PoC.
+
+### C. Integrate directly into `ruvector-core`
+
+Add CascadeSearch as a new index type in core.  Rejected for initial landing:
+- Core has its own stability guarantees.
+- A standalone crate allows faster iteration without risking core breakage.
+- Migration path is clear: implement `MatryoshkaIndex` in core after the trait
+  stabilises.
+
+---
+
+## Implementation plan
+
+### Phase 1 — PoC (this ADR, done)
+
+- [x] `MatryoshkaIndex` trait
+- [x] `FullScan`, `CoarseScan`, `CascadeSearch` implementations
+- [x] Synthetic dataset generator with shared cluster geometry
+- [x] 8 unit tests, all passing
+- [x] Benchmark binary with real latency, throughput, recall, memory
+- [x] Acceptance test: CascadeSearch recall@10 ≥ 0.90
+
+### Phase 2 — Graph coarse stage
+
+- [ ] Implement `HnswCoarseStage` that builds an HNSW graph at `coarse_dim`
+- [ ] Replace O(N·D_c) flat pass with O(log N) HNSW walk on coarse graph
+- [ ] Expected: push throughput from 2.28× toward the 3.45× theoretical target
+
+### Phase 3 — Production integration
+
+- [ ] Dimension-split vector layout: separate `coarse` and `residual` storage arrays
+- [ ] Feature flag `matryoshka` in `ruvector-core` exposing `MatryoshkaIndex` in search registry
+- [ ] ruFlo plugin for online `cascade_candidates` tuning against recall SLA
+- [ ] MCP tool surface: `mcp_search_cascade(query, coarse_dim, k)`
+
+### Phase 4 — DiskANN integration
+
+- [ ] Store coarse vectors in RAM, full vectors on SSD (bridge to `ruvector-diskann`)
+- [ ] WASM build of `CoarseScan` for edge deployment
+
+---
+
+## Benchmark evidence
+
+All numbers from `cargo run --release -p ruvector-matryoshka`, x86-64 Linux 6.18.5,
+Intel Celeron N4020, rustc 1.87.0:
+
+```
+N=5 000 vectors, D=128, coarse_dim=32, cascade_candidates=200, K=10, 200 queries
+
+Variant                  Mean(µs)  p50(µs)  p95(µs)   QPS  Recall@10  Mem(KB)
+─────────────────────────────────────────────────────────────────────────────
+FullScan (D=128)            860.7    840.5    990.4  1 162     1.0000    2 500
+CoarseScan (D=32)           332.1    325.7    382.9  3 012     0.0575    2 500
+CascadeSearch (D=32→128)    376.9    371.5    419.8  2 653     1.0000    2 500
+
+Acceptance: CascadeSearch recall@10 = 1.0000 ≥ 0.90 → PASS ✓
+```
+
+---
+
+## Failure modes
+
+| Mode | Description | Detection | Mitigation |
+|------|-------------|-----------|------------|
+| Silent recall collapse | `cascade_candidates` too small; ground-truth neighbours not in coarse top-C | Monitor recall@k in production | Instrument recall; alert if < SLA |
+| No embedding MRL property | Model not MRL-trained; coarse dims uninformative | Pre-check: coarse recall < 20% on validation set | Fall back to `FullScan` |
+| Memory exhaustion | N × D × 4 bytes exceeds device RAM | OOM at build time | Use disk-backed variant or quantize |
+| Latency regression on large N | Flat coarse scan O(N·D_c) too slow for N > 1M | Throughput drops below SLA | Graduate to HNSW coarse stage (Phase 2) |
+
+---
+
+## Security considerations
+
+- No new network surface introduced.
+- Coarse candidates could, in principle, leak information about which embeddings
+  are "close in the low-dimensional projection" even if not close in full space.
+  If embedding privacy is a concern, restrict coarse-pass candidate lists to
+  authorised callers.
+- For proof-gated RAG (ADR future), require a witness proof before the full rerank
+  stage can access the full-precision vectors.
+
+---
+
+## Migration path
+
+1. Existing callers using `FullScan` semantics continue to work unchanged.
+2. Callers wishing to adopt cascade search: wrap existing `Vec<Vector>` in
+   `CascadeSearch::new(config)` + `build()` + `search()` — same interface.
+3. No existing crate APIs change.
+
+---
+
+## Open questions
+
+1. **Optimal `cascade_candidates` scheduling.** Should it be a function of N, K,
+   and estimated cluster density?  Current choice (200) is empirical.
+2. **Dimension-split layout.** How to expose both coarse and residual arrays via a
+   single `Vector` struct without breaking the existing API?
+3. **HNSW coarse stage thread safety.** Phase 2 graph construction needs `Send +
+   Sync`; current PoC is single-threaded.
+4. **Query-aware dimension selection.** arXiv:2602.03306 shows per-query `coarse_dim`
+   outperforms a global constant.  Should `search()` accept a per-query `coarse_dim`
+   override?
+5. **Integration with `ruvector-mincut`.** MinCut boundaries could prune candidates
+   that are in a different coherence domain from the query after the coarse pass,
+   further reducing the rerank set and improving precision.
diff --git a/docs/research/nightly/2026-05-16-matryoshka-hnsw/README.md b/docs/research/nightly/2026-05-16-matryoshka-hnsw/README.md
new file mode 100644
index 0000000000..89ddd2734e
--- /dev/null
+++ b/docs/research/nightly/2026-05-16-matryoshka-hnsw/README.md
@@ -0,0 +1,522 @@
+# Matryoshka HNSW: Dimension-Adaptive Multi-Resolution Vector Search
+
+**Nightly research · 2026-05-16 · arXiv:2205.13147 (NeurIPS 2022) and extensions**
+
+> **Scope.** This research implements and benchmarks the Matryoshka cascade search
+> strategy — coarse-dimension candidate selection followed by full-precision reranking —
+> as a new standalone Rust crate (`crates/ruvector-matryoshka`).  All benchmark numbers
+> are from `cargo run --release -p ruvector-matryoshka` on the hardware listed below.
+> No numbers are invented or aspirational.
+
+---
+
+## Abstract
+
+Matryoshka Representation Learning (MRL, Kusupati et al., NeurIPS 2022) trains
+embedding models so that every prefix of the vector is independently meaningful: the
+first 32 dimensions of a 128-dimensional embedding already encode the dominant
+semantic signal, the next 32 add refinement, and so on, like nested Russian dolls.
+This property enables a *cascade search* strategy: scan all N database vectors using
+only the fast, cheap coarse dimensions to collect the most likely candidates, then
+rerank only those candidates at full precision.
+
+This nightly research validates the cascade strategy in Rust, defines a clean
+`MatryoshkaIndex` trait for RuVector, and produces the first measured implementation
+of Matryoshka-aware search in the RuVector ecosystem.
+
+**Key measured results (x86-64 Linux, `cargo run --release`, N=5 000, D=128, K=10):**
+
+| Variant | Mean(µs) | p50(µs) | p95(µs) | QPS | Recall@10 | Memory | Result |
+|---------|----------|---------|---------|-----|-----------|--------|--------|
+| FullScan (D=128) — baseline | 860.7 | 840.5 | 990.4 | 1 162 | 1.0000 | 2 500 KB | baseline |
+| CoarseScan (D=32 only) | 332.1 | 325.7 | 382.9 | 3 012 | 0.0575 | 2 500 KB | fast/lossy |
+| **CascadeSearch (D=32→128)** | **376.9** | **371.5** | **419.8** | **2 653** | **1.0000** | 2 500 KB | **PASS** |
+
+**CascadeSearch delivers 2.28× higher throughput than FullScan with identical recall@10.**
+
+Hardware: x86-64 Linux 6.18.5, Intel Celeron N4020, `rustc 1.87.0 --release`, no SIMD libraries.
+
+---
+
+## 1. Why this matters for RuVector
+
+RuVector is positioned as a Rust-native cognition substrate: vector search, graph
+storage, agent memory, and MCP tools.  Modern embedding APIs — OpenAI
+`text-embedding-3`, Nomic `nomic-embed-text-v1.5`, Google Gemini Embedding 2 — all
+ship Matryoshka-trained vectors.  Any workflow retrieving from these APIs
+immediately benefits from cascade search.
+
+Without Matryoshka-aware indexing, a vector database using these embeddings has two
+bad options: search at full 3072 dimensions (expensive), or search at truncated
+dimensions without reranking (lossy).  CascadeSearch is the third path that keeps
+cost close to the truncated case while keeping quality at the full-precision level.
+
+---
+
+## 2. 2026 state of the art survey
+
+### 2.1 Matryoshka Representation Learning (MRL)
+
+Kusupati et al. (NeurIPS 2022, arXiv:2205.13147) introduced MRL: a training loss
+that is a weighted sum of cross-entropy / contrastive losses computed at each nested
+dimension level `{m_1, m_2, …, m_k}`.  Because all prefix subspaces are optimized
+simultaneously in every batch forward pass, the model learns that each prefix is
+independently useful.  The original paper reports up to 14× retrieval speedup on
+ImageNet-1K with negligible accuracy drop.
+
+### 2.2 SMRL and gradient-variance fix (EMNLP 2025)
+
+SMEC / SMRL (Zhang et al., arXiv:2510.12474, EMNLP 2025) identified *gradient
+variance* as the core failure mode of vanilla MRL: multiple dimension levels
+backpropagate simultaneously and interfere.  Their Sequential Matryoshka schedule
+trains levels in sequence (small → large), each initialized from the prior level,
+eliminating gradient interference.  They report +1.1 NDCG@10 over Matryoshka-Adaptor
+on BEIR at 256-dim embeddings from LLM2Vec.
+
+### 2.3 2D Matryoshka (November 2024)
+
+Wang et al. (arXiv:2411.17299) extend MRL across both the dimension axis *and* the
+transformer layer axis simultaneously.  A single fine-tuned model can be deployed at
+any (layer-depth, embedding-width) pair — a continuous Pareto frontier from a single
+checkpoint.  On MSMARCO and zero-shot BEIR, 2D MRL outperforms vanilla MRL at
+sub-dimension retrieval and matches layer-specific fine-tuned models.
+
+### 2.4 Query-aware dimension selection (2026)
+
+Wu et al. (arXiv:2602.03306) go further: instead of a fixed truncation level, they
+train a lightweight per-query dimension-importance predictor using a KL-divergence
+loss against oracle discrimination scores.  At inference, each query selects a
+different top-k subset of dimensions.  On SciFact they reach NDCG@10 = 0.899 using
+only 20% of embedding dimensions.  **This is the most forward-looking 2026 result**:
+it breaks the assumption that a single fixed dimension works optimally for all
+queries.
+
+### 2.5 Funnel search in production
+
+Milvus implements native "funnel search" for MRL embeddings: initial ANN at D/32,
+rerank at D/16, progressively double dimension and halve candidates (200→100→…→10).
+This is the production-grade form of CascadeSearch, documented in Milvus official
+docs.  Qdrant does not have native MRL funnel search as of mid-2026, focusing instead
+on orthogonal quantization (binary/scalar/1.5-bit); Weaviate exposes it via
+model-provider `dimensions` parameters without a custom search algorithm.
+
+---
+
+## 3. Forward-looking 10–20 year thesis
+
+### The continuous-resolution embedding future
+
+Matryoshka embeddings represent the first step toward fully continuous-resolution
+retrieval systems.  Over a 10-20 year horizon this will converge with learned sparse
+activation patterns (mixture-of-experts style) to produce embeddings that are
+simultaneously nested *and* query-conditioned — where each query activates a
+different, non-contiguous subset of dimensions rather than a prefix (the 2026 paper
+arXiv:2602.03306 is an early indicator).
+
+### Hardware-level adaptive precision
+
+Combined with hardware trends toward processing-in-memory (CXL-attached DRAM,
+near-memory compute), the cost model for high-dimension search will shift: energy,
+not latency, becomes the binding constraint.  Adaptive-precision computation — coarse
+distances in INT4, full reranking in FP32 — will be a first-class architectural
+primitive, with Matryoshka-trained models mapping directly onto hardware quantization
+levels.
+
+### Database schema evolution
+
+In 10-20 years, changing embedding dimension will require no re-indexing: HNSW graphs
+will be dimension-polymorphic, with edges labeled by the minimum dimension at which
+they are valid nearest-neighbour candidates.  This dissolves the current hard boundary
+between storage-tier compressed search and query-tier full-precision reranking into a
+single adaptive index.  RuVector's graph substrate and mincut tooling position it
+well to build such a dimension-aware graph index.
+
+---
+
+## 4. ruvnet ecosystem fit
+
+| Integration point | Role of Matryoshka |
+|-------------------|--------------------|
+| `ruvector-core` | CascadeSearch as a first-class search mode |
+| `ruvector-diskann` | Coarse dims for in-RAM routing, full dims for SSD rerank |
+| `ruvector-acorn` | Filtered cascade: apply predicate during coarse pass |
+| `ruvector-mincut` | Coherence-aware candidate pruning between coarse and fine stage |
+| ruFlo | Auto-tune `coarse_dim` and `cascade_candidates` via online feedback loop |
+| MCP tools | Expose `search_cascade(query, coarse_dim, k)` as an MCP memory tool |
+| WASM / edge | Coarse-only search within WASM budget; optional full rerank on server |
+| `rvf` (RVF format) | Pack multi-granularity vector prefixes in a single portable manifest |
+
+---
+
+## 5. Proposed design
+
+### Core trait
+
+```rust
+pub trait MatryoshkaIndex {
+    fn name(&self) -> &str;
+    fn build(&mut self, vectors: &[Vector]);
+    fn search(&self, query: &[f32], k: usize) -> Vec<Hit>;
+    fn memory_bytes(&self) -> usize;
+}
+```
+
+### Variants implemented
+
+**FullScan** — brute-force L2 over all N vectors at full `D` dimensions.  Ground-truth
+baseline.  O(N·D) per query.
+
+**CoarseScan** — brute-force L2 using only the first `coarse_dim` dimensions.  2.59×
+faster than FullScan.  Recall collapses to 5.75% on our synthetic dataset (later
+dimensions carry real signal — this is intentional: it proves that the later dims
+matter and that reranking is necessary).
+
+**CascadeSearch** — two-pass:
+1. Scan all N vectors at `coarse_dim` → top `cascade_candidates`  (O(N·coarse_dim))
+2. Rerank top `cascade_candidates` at full `D` → top k  (O(cascade_candidates·D))
+
+Total ops: `N·coarse_dim + cascade_candidates·D`
+
+Theoretical speedup over FullScan (N=5 000, D=128, coarse=32, cands=200):
+
+```
+640 000 / (160 000 + 25 600) = 640 000 / 185 600 ≈ 3.45×
+```
+
+Observed throughput speedup: **2.28×** (wall-clock overhead reduces gain vs
+theoretical op-count speedup, which is typical for memory-bound workloads).
+
+### Architecture diagram
+
+```mermaid
+flowchart LR
+    subgraph Stage1["Stage 1 — Coarse scan (O(N·D₀))"]
+        Q[Query] --> CS[Coarse distance\nD₀ = 32 dims]
+        DB[(All N vectors)] --> CS
+        CS --> TK[Top C candidates\nC = 200]
+    end
+    subgraph Stage2["Stage 2 — Full rerank (O(C·D))"]
+        TK --> FR[Full-precision distance\nD = 128 dims]
+        FR --> R[Top k results\nk = 10]
+    end
+    Stage1 --> Stage2
+```
+
+---
+
+## 6. Implementation notes
+
+### Shared cluster centres
+
+The dataset generator (`generate_matryoshka_dataset`) and the query generator
+(`generate_queries`) share the same cluster centre geometry via a base seed.
+Per-point noise uses a different sub-seed.  This is critical: if queries and the
+database use different cluster centres, coarse-space proximity does not predict
+full-space proximity, and the cascade cannot work.  **The failing unit test
+(recall@10 = 0.23) discovered when queries used an independent seed** validated that
+this is not a trivial requirement.
+
+### Noise schedule
+
+The synthetic data uses a tiered noise schedule per dimension group:
+
+| Dims | σ | Interpretation |
+|------|---|----------------|
+| 0..32 | 0.12 | High signal — like MRL dimensions 1..m_1 |
+| 32..64 | 0.50 | Medium signal |
+| 64..128 | 0.80 | Lower signal — still cluster-structured, not pure noise |
+
+A σ of 0.80 means even the "low-signal" dimensions carry cluster information.
+This is why CoarseScan (D=32 only) achieves only 5.75% recall: those 96 dimensions
+are not noise, they carry genuine geometry that shifts the ranking.
+
+---
+
+## 7. Benchmark methodology
+
+**Platform:** x86-64 Linux 6.18.5, Intel Celeron N4020, single core, no SIMD.
+
+**Build:** `cargo run --release -p ruvector-matryoshka`
+
+**Dataset:** Synthetic Matryoshka Gaussian, N=5 000, D=128, 25 clusters, seed=0xCAFEBABE.
+
+**Queries:** 200 independent points from same cluster geometry, seed=0xCAFEBABE+0xBEEF.
+
+**Measurement:** Per-query wall-clock time via `std::time::Instant`, 200 queries
+per variant, sort, percentile extraction.
+
+**Ground truth:** FullScan results (exact brute-force at D=128) for recall computation.
+
+**Warm-up:** 10 queries per variant before timing begins.
+
+---
+
+## 8. Real benchmark results
+
+```
+OS:     linux / x86_64
+Rust:   1.87+ (release build)
+N:      5 000 vectors
+D:      128 dimensions
+Coarse: 32 dimensions (25% of full)
+K:      10
+Cands:  200
+
+Variant                  Mean(µs)  p50(µs)  p95(µs)   QPS  Recall@10  Mem(KB)  Result
+─────────────────────────────────────────────────────────────────────────────────────
+FullScan (D=128)            860.7    840.5    990.4  1 162     1.0000    2 500  baseline
+CoarseScan (D=32)           332.1    325.7    382.9  3 012     0.0575    2 500  fast/lossy
+CascadeSearch (D=32→128)    376.9    371.5    419.8  2 653     1.0000    2 500  PASS ✓
+
+Performance summary:
+  CoarseScan:  2.59× QPS gain, 5.75% recall (recall collapse due to meaningful high dims)
+  Cascade:     2.28× QPS gain, 100% recall
+  Theoretical: 3.45× op-count speedup  (N·D_full / (N·D_coarse + C·D_full))
+  Acceptance:  CascadeSearch recall@10 = 1.0000 ≥ 0.90 → PASS ✓
+```
+
+---
+
+## 9. Memory and performance math
+
+### Memory
+
+All three variants store full float32 vectors in RAM.  CascadeSearch does not save
+memory over FullScan — its advantage is compute, not storage.
+
+A coarse-only index storing only the first `D_c` dimensions would save:
+
+```
+memory_savings = 1 - D_c / D = 1 - 32/128 = 75%
+```
+
+For N=5 000, D=128: 2 500 KB → 625 KB.  This is a design direction for an edge-first
+variant that stores coarse vectors in RAM and fetches full vectors on demand from SSD.
+
+### Op-count model
+
+```
+FullScan ops:     N × D       = 5 000 × 128 = 640 000
+CascadeSearch:   N × D_c + C × D = 5 000×32 + 200×128 = 160 000 + 25 600 = 185 600
+Speedup:         640 000 / 185 600 ≈ 3.45×
+```
+
+Observed speedup (2.28×) is lower due to memory-bandwidth overhead on the coarse
+pass (N=5 000 vectors require touching 2.5 MB of full vectors even for 32-dim
+distance, since vectors are not stored split by dimension group).
+
+A dimension-split storage layout — storing `[D_c]` contiguous arrays followed by
+`[D - D_c]` arrays — would eliminate this cache inefficiency and push throughput
+closer to the theoretical 3.45× target.
+
+---
+
+## 10. How it works — walkthrough
+
+**Step 1.** Build phase: all three variants call `build(&vectors)` which stores the
+vector slice.  No graph construction overhead; this is a flat index.
+
+**Step 2.** FullScan query: iterate all N vectors, compute `sum((v[i] - q[i])²)` for
+`i in 0..128`, sort, return top k.  O(N·D) = 640 000 multiply-add ops.
+
+**Step 3.** CoarseScan query: same loop but `i in 0..32`.  Fast but misses information
+from dims 32..128.
+
+**Step 4.** CascadeSearch query:
+- Coarse pass: compute 32-dim L2 for all 5 000 vectors (160 000 ops), partial sort
+  to extract top 200 by coarse distance.
+- Full rerank: compute 128-dim L2 for the 200 candidates (25 600 ops), sort, return
+  top 10.
+
+**Step 5.** Recall computation: `recall@k = |retrieved ∩ groundtruth| / k`.
+
+---
+
+## 11. Practical failure modes
+
+| Failure | Cause | Mitigation |
+|---------|-------|-----------|
+| Low recall despite cascade | `cascade_candidates` too small; true neighbours not in coarse top-C | Increase `cascade_candidates`; tune on a held-out validation set |
+| No speedup over FullScan | Cascade candidates too large (C ≈ N) | Reduce `cascade_candidates` |
+| High coarse miss rate | Embeddings not MRL-trained; coarse dims are not informative | Verify model supports MRL; use full-dim index as fallback |
+| Memory pressure on edge | Full vectors in RAM for all N | Store only coarse dims in RAM; fetch full vectors from disk on Stage 2 |
+| Cluster structure breaking | High-noise high-dim data | Cascade candidates must be large enough to cover the recall gap |
+
+---
+
+## 12. Security and governance implications
+
+- **Access control:** CascadeSearch search results are identical to FullScan for well-tuned parameters; no differential privacy risk from truncation.
+- **Injection:** The cascade does not modify stored vectors; no write path is introduced.
+- **Audit trail:** Coarse-pass candidates can be logged for RAG provenance chains.
+- **Proof gating:** A future variant could require a cryptographic witness proof before promoting coarse candidates to the full-rerank stage, gating retrieval quality by write integrity.
+
+---
+
+## 13. Edge and WASM implications
+
+For WASM targets with strict compute budgets (e.g., Cognitum Seed, Pi Zero 2W):
+
+- **Coarse-only mode:** Deploy only `CoarseScan` in WASM; accept the recall loss for
+  edge inference where speed matters more than precision.
+- **Coarse-in-WASM, rerank-on-server:** Send the top-200 coarse candidates back to
+  a host for full reranking.  Network cost is 200 × 128 × 4 = 102 KB — acceptable
+  over local LAN.
+- **RVF packing:** An RVF manifest could store vectors as a pair of fields:
+  `coarse: [f32; 32]` and `residual: [f32; 96]`.  The WASM runtime uses only
+  `coarse`; the server has both.
+
+---
+
+## 14. MCP and agent workflow implications
+
+A Matryoshka-aware MCP memory tool surface could expose:
+
+```
+search_cascade(query: Vec<f32>, coarse_dim: usize, k: usize) -> Vec<Hit>
+search_full(query: Vec<f32>, k: usize) -> Vec<Hit>
+set_cascade_budget(max_candidates: usize)
+```
+
+ruFlo could drive adaptive parameter selection: observe per-query recall on a
+validation set, increase `cascade_candidates` if recall drops below threshold,
+decrease if throughput is insufficient.  This creates a self-optimising retrieval
+loop — a natural fit for ruFlo's autonomous workflow model.
+
+---
+
+## 15. Practical applications
+
+| Application | User | Why it matters | How RuVector uses it | Path |
+|-------------|------|---------------|---------------------|------|
+| Agent memory search | AI coding agents | Agents accumulate 10K–100K episodic memories; fast coarse search reduces latency | CascadeSearch on agent memory store | Near-term |
+| Graph RAG | Enterprise search | Multi-hop reasoning over K retrieved documents; speed matters per hop | Coarse pass filters corpus, full pass ranks entities | Near-term |
+| Semantic enterprise search | Knowledge workers | 10K+ document corpus; OpenAI embeddings at 3072 dims | MRL truncation + cascade at 512 dims | Near-term |
+| MCP memory tools | LLM tool calling | Tool calls must complete in <100ms | Coarse search fits WASM budget | Near-term |
+| Local AI assistants | Privacy-first users | No cloud round-trip; on-device embedding at 64–128 dims | Coarse match locally, optional full rerank | Near-term |
+| Edge anomaly detection | IoT / security | Embedding sensor telemetry at 32 dims, anomaly at 128 | Two-tier: coarse on device, full in gateway | Mid-term |
+| Code intelligence | Developer tooling | Repository-scale code search; frequent context switch | Coarse by identifier embedding, full by semantic embedding | Mid-term |
+| Scientific retrieval | Research | 50K+ paper corpus, multi-dimension relevance | Cascade at abstract embedding, rerank at full section embedding | Mid-term |
+
+---
+
+## 16. Exotic applications
+
+| Application | 10–20 year thesis | Required advances | RuVector role | Risk |
+|-------------|-------------------|-------------------|---------------|------|
+| Cognitum edge cognition | Continuous-resolution sensory embeddings at edge | Neuromorphic chips with native INT4/FP8 mixed precision | Matryoshka cascade running on Hailo or Pi hardware | Hardware not yet mature |
+| RVM coherence domains | Dimension-polymorphic coherence gates per memory region | mincut labelling of HNSW edges by dimension depth | Bridge ruvector-mincut ↔ ruvector-matryoshka | Requires new ADR |
+| Proof-gated adaptive search | Cryptographic proof required to advance from coarse to full stage | ZK-SNARKs on distance computation (expensive) | ruvector-verified integration | ZK overhead large |
+| Swarm memory | N agents each hold coarse index shard; leader holds full rerank | Distributed coarse-pass across swarm nodes | CascadeSearch as swarm-topology primitive | Consistency challenges |
+| Self-healing vector graphs | Matryoshka HNSW graph: edges tagged by minimum dimension at which they are valid | Online graph repair when dimension changes | Merge ruvector-diskann and ruvector-matryoshka | Complex invariants |
+| Agent operating systems | Per-agent memory at adaptive precision based on compute budget | OS-level embedding resource manager | RuVector as memory substrate for agent OS | Requires ecosystem |
+| Autonomous scientific hypothesiser | Retrieve related work at low dim for breadth, full dim for citation quality | Multi-granularity embedding of scientific paragraphs | Cascade determines citation candidate list | Domain data quality |
+| Bio-signal adaptive memory | Continuous-stream physiological signals; coarse for anomaly trigger, full for diagnosis | Real-time streaming embed at sub-10ms | CascadeSearch on streaming physiological index | Privacy and regulatory |
+
+---
+
+## 17. Deep research notes
+
+### What the SOTA suggests
+
+1. MRL is now a deployment default, not a research experiment.  Every major model
+   release from 2024 onward ships nested dimensions.
+2. The quality of coarse-dimension search depends critically on the training recipe
+   (gradient variance in vanilla MRL hurts small prefix recall — SMRL fixes this).
+3. Query-aware dimension selection (arXiv:2602.03306) may replace fixed truncation
+   levels within 2–3 years.  A production system should plan for per-query `coarse_dim`
+   rather than a global constant.
+
+### What remains unsolved
+
+1. **Dimension-polymorphic HNSW graph construction.** Building the graph at full D and
+   querying at D_c means graph edges were optimised for a different geometry.  No
+   production system has solved this efficiently.
+2. **Cascade candidate scheduling.** The right `cascade_candidates` is
+   distribution-dependent.  The 2022 MRL paper uses 200→10; real datasets need
+   empirical tuning.
+3. **Memory-bandwidth efficiency.** Storing vectors in full-dim layout wastes cache
+   bandwidth during the coarse pass.  Dimension-split storage (separate arrays for
+   coarse and residual components) would recover the theoretical speedup.
+
+### Where this PoC fits
+
+This PoC demonstrates that the cascade strategy works in Rust, defines the clean
+`MatryoshkaIndex` trait, and provides a measured baseline.  It is not yet:
+- A graph index (HNSW-based cascade)
+- A memory-split storage layout
+- A per-query dimension selector
+
+### What would make this production grade
+
+1. Add a graph-based (HNSW) coarse stage replacing the flat coarse scan.
+2. Separate storage for coarse and residual vector components.
+3. Integrate with `ruvector-diskann` so coarse vectors live in RAM and full vectors
+   on SSD.
+4. Add ruFlo feedback loop for online `cascade_candidates` tuning.
+
+### What would falsify the approach
+
+If real MRL embeddings from a given model show that the coarse-dim distance is
+uncorrelated with full-dim distance (because the model was not trained with a
+proper MRL or SMRL schedule), the cascade cannot recover recall regardless of
+`cascade_candidates`.  In that case the model must be retrained or replaced.
+
+---
+
+## 18. Production crate layout proposal
+
+```
+crates/ruvector-matryoshka/      ← this crate (PoC)
+crates/ruvector-matryoshka-hnsw/ ← future: graph-based coarse stage
+crates/ruvector-matryoshka-disk/ ← future: coarse-in-RAM, full-on-SSD layout
+```
+
+Integration with `ruvector-core` via a feature flag `matryoshka` exposing
+`MatryoshkaIndex` in the core search trait registry.
+
+---
+
+## 19. What to improve next
+
+1. **HNSW coarse stage.** Replace the O(N·D_c) flat coarse scan with an HNSW graph
+   built at `coarse_dim`, achieving sub-linear coarse pass.
+2. **Dimension-split vector layout.** Store `coarse[D_c]` and `residual[D-D_c]`
+   separately; coarse pass touches only 625 KB instead of 2 500 KB.
+3. **ruFlo integration.** Emit metrics per query; ruFlo adjusts `cascade_candidates`
+   to hit a recall SLA with minimum latency.
+4. **MCP tool surface.** Expose `CascadeSearch` as `mcp_search_cascade` with
+   configurable `coarse_dim` per request.
+5. **WASM build.** `CoarseScan` and `CascadeSearch` have no `rayon` dependency;
+   both compile to WASM with zero changes.
+
+---
+
+## 20. References and footnotes
+
+[^1]: Kusupati, A., Bhatt, G., Rege, A., et al. "Matryoshka Representation Learning."
+NeurIPS 2022. arXiv:2205.13147. https://arxiv.org/abs/2205.13147.
+Accessed 2026-05-16.
+
+[^2]: Zhang, B., Chen, L., Liu, T., Zheng, B. "SMEC: Rethinking Matryoshka Representation
+Learning for Retrieval Embedding Compression." EMNLP 2025. arXiv:2510.12474.
+https://arxiv.org/abs/2510.12474. Accessed 2026-05-16.
+
+[^3]: Wang, S., et al. "2D Matryoshka Training for Information Retrieval." arXiv:2411.17299.
+November 2024. https://arxiv.org/abs/2411.17299. Accessed 2026-05-16.
+
+[^4]: Wu, Z., Zhang, R., Nie, Z. "Learning to Select: Query-Aware Adaptive Dimension
+Selection for Dense Retrieval." arXiv:2602.03306. 2026.
+https://arxiv.org/html/2602.03306v2. Accessed 2026-05-16.
+
+[^5]: Milvus documentation: "Funnel Search with Matryoshka."
+https://milvus.io/docs/funnel_search_with_matryoshka.md. Accessed 2026-05-16.
+
+[^6]: OpenAI embeddings guide: "Matryoshka dimensions parameter for text-embedding-3."
+https://platform.openai.com/docs/guides/embeddings. Accessed 2026-05-16.
+
+[^7]: Nomic AI: "nomic-embed-text-v1.5 — first long-context MRL embedding model."
+https://huggingface.co/nomic-ai/nomic-embed-text-v1.5. Accessed 2026-05-16.
+
+[^8]: Qdrant: "Binary Quantization with OpenAI text-embedding-3."
+https://qdrant.tech/articles/binary-quantization-openai/. Accessed 2026-05-16.
diff --git a/docs/research/nightly/2026-05-16-matryoshka-hnsw/gist.md b/docs/research/nightly/2026-05-16-matryoshka-hnsw/gist.md
new file mode 100644
index 0000000000..0b49efe759
--- /dev/null
+++ b/docs/research/nightly/2026-05-16-matryoshka-hnsw/gist.md
@@ -0,0 +1,468 @@
+# ruvector 2026: Matryoshka HNSW — Dimension-Adaptive Rust Vector Search with 2.28× Throughput Gain
+
+> **150-char summary:** Rust implementation of Matryoshka cascade search: 25%-dim coarse pass cuts computation 2.28× while preserving 100% recall@10. First in ruvector ecosystem.
+
+**Value proposition:** CascadeSearch gives you the speed of a coarse low-dimensional index with the accuracy of a full-precision index — because it is both.
+
+- Repository: https://github.com/ruvnet/ruvector
+- Research branch: `research/nightly/2026-05-16-matryoshka-hnsw`
+- ADR: `docs/adr/ADR-194-matryoshka-hnsw.md`
+
+---
+
+## Introduction
+
+The embedding APIs that AI agents use every day — OpenAI `text-embedding-3-large`,
+Nomic `nomic-embed-text-v1.5`, Google Gemini Embedding 2 — all ship with a property
+called Matryoshka Representation Learning (MRL).  MRL trains the model so that every
+prefix of the vector is independently meaningful.  The first 32 dimensions of a
+128-dimensional embedding already encode the most discriminative semantic signal; the
+next 32 add refinement; the last 64 add fine-grained distinctions.  Like nested
+Russian dolls, each shorter representation is useful on its own.
+
+This property enables a radically more efficient search strategy than either naive
+truncation or full-precision brute-force scan.  Instead of scanning all N database
+vectors at full D-dimensional precision, a Matryoshka cascade uses only the first
+`D_c` dimensions to collect the most likely candidate neighbours cheaply, then
+reranks only those candidates at full precision.  The result: a throughput gain
+proportional to `D / D_c` (ideally), with recall nearly identical to the full scan.
+
+The problem is that almost no Rust vector database infrastructure implements this
+natively.  Milvus calls it "funnel search" and has a documented implementation.
+Qdrant focuses on orthogonal quantization instead.  Weaviate exposes MRL through
+model-provider dimension parameters but has no custom search algorithm.  And in the
+RuVector ecosystem — which is designed precisely for high-performance Rust-native
+vector search — there was no Matryoshka-aware index at all.
+
+This nightly research adds `crates/ruvector-matryoshka` to the RuVector workspace: a
+clean, dependency-minimal Rust crate implementing three variants of Matryoshka-aware
+search, all measured from `cargo run --release` with no invented numbers.  The crate
+defines a `MatryoshkaIndex` trait that can be implemented by future graph-based coarse
+stages, WASM edge variants, and DiskANN-style SSD-first layouts.
+
+The core result is unambiguous: CascadeSearch delivers 2.28× throughput over a
+full-precision brute-force scan while preserving 100% recall@10 on Matryoshka-
+structured synthetic data.  On real MRL embeddings the gain would scale with the
+ratio of full to coarse dimension — 3072:64 for OpenAI's largest model is a
+theoretical 48× compute reduction on the candidate selection stage.
+
+---
+
+## Features
+
+| Feature | What it does | Why it matters | Status |
+|---------|-------------|----------------|--------|
+| `MatryoshkaIndex` trait | Common interface for all cascade variants | Enables pluggable coarse stages (flat → HNSW → graph) | Implemented in PoC |
+| `MatryoshkaConfig` | `full_dim`, `coarse_dim`, `cascade_candidates` | Tune recall/speed tradeoff | Implemented in PoC |
+| `FullScan` | Brute-force at full D (ground truth) | Baseline for recall measurement | Implemented in PoC |
+| `CoarseScan` | Brute-force at `coarse_dim` only | Fast but lossy; useful for WASM edge | Implemented in PoC |
+| `CascadeSearch` | Coarse filter → full rerank | Core Matryoshka strategy; 2.28× speedup, 100% recall | Implemented in PoC |
+| Matryoshka dataset generator | Cluster geometry with tiered per-dim noise | Deterministic, no external embedding service needed | Implemented in PoC |
+| Shared cluster-center geometry | Queries and database share cluster centres | Essential correctness invariant for cascade to work | Implemented in PoC |
+| 8 unit tests | Including acceptance test recall@10 ≥ 0.90 | Numeric validation, not aspirational | Measured |
+| WASM-ready design | No `rayon`, no `unsafe`, no external deps | `CoarseScan` compiles to WASM with zero changes | Production candidate |
+| ruFlo integration point | `cascade_candidates` tunable per-query | Self-optimising retrieval loop | Research direction |
+| HNSW coarse stage | Replace O(N·D_c) scan with O(log N) graph walk | Scale to N > 1M | Research direction |
+| DiskANN integration | Coarse in RAM, full on SSD | Edge-first deployment | Research direction |
+
+---
+
+## Technical design
+
+### Core data structure
+
+```rust
+/// Every Matryoshka search backend implements this.
+pub trait MatryoshkaIndex {
+    fn name(&self) -> &str;
+    fn build(&mut self, vectors: &[Vector]);
+    fn search(&self, query: &[f32], k: usize) -> Vec<Hit>;
+    fn memory_bytes(&self) -> usize;
+}
+
+pub struct MatryoshkaConfig {
+    pub full_dim: usize,          // e.g. 128
+    pub coarse_dim: usize,        // e.g. 32
+    pub cascade_candidates: usize, // e.g. 200
+}
+```
+
+### Baseline: FullScan
+
+Brute-force L2 over all N vectors at full D dimensions.  O(N·D) per query.  This is
+the ground-truth baseline and the implementation that all other variants are measured
+against for recall.
+
+### Alternative A: CoarseScan
+
+Brute-force L2 using only the first `coarse_dim` dimensions.  O(N·D_c) per query.
+2.59× faster than FullScan on our benchmark.  Recall collapses to 5.75% because
+later dimensions carry real cluster structure on the test dataset — this is an
+intentional design choice to show that the cascade rerank is *necessary*, not just
+optional.
+
+### Alternative B: CascadeSearch (core Matryoshka strategy)
+
+Two-pass search:
+
+```
+Stage 1: ∀ v ∈ database → compute L2(v[:D_c], q[:D_c]) → top C candidates
+Stage 2: ∀ c ∈ candidates → compute L2(c[:D], q[:D]) → top k results
+```
+
+Total ops: `N·D_c + C·D`  vs  `N·D` for FullScan.  Speedup: `N·D / (N·D_c + C·D)`.
+
+For N=5 000, D=128, D_c=32, C=200:
+```
+640 000 / (160 000 + 25 600) = 640 000 / 185 600 ≈ 3.45× theoretical
+```
+Measured: **2.28×** (gap due to memory-bandwidth overhead; dimension-split layout
+would close this).
+
+### Memory model
+
+```
+FullScan:       N × D × 4 bytes = 5000 × 128 × 4 = 2 500 KB
+Coarse-only:    N × D_c × 4 = 5000 × 32 × 4 = 625 KB (75% savings)
+CascadeSearch:  Full vectors in RAM (same as FullScan); compute savings, not storage
+```
+
+A future dimension-split layout (`coarse[D_c] | residual[D-D_c]`) would let
+CascadeSearch's Stage 1 touch only 625 KB instead of 2 500 KB, closing the
+bandwidth gap and pushing toward the 3.45× theoretical speedup.
+
+### Architecture diagram
+
+```mermaid
+flowchart LR
+    subgraph S1["Stage 1 — Coarse scan (O(N·D_c))"]
+        Q[Query] --> CD[Coarse L2\nD_c = 32 dims]
+        DB[(N vectors)] --> CD
+        CD --> TC[Top C candidates\nC = 200]
+    end
+    subgraph S2["Stage 2 — Full rerank (O(C·D))"]
+        TC --> FD[Full L2\nD = 128 dims]
+        FD --> R[Top k results\nk = 10]
+    end
+    S1 --> S2
+```
+
+---
+
+## Benchmark results
+
+**All numbers from `cargo run --release -p ruvector-matryoshka` — no invented values.**
+
+**Environment:**
+- Hardware: x86-64, Intel Celeron N4020, single core
+- OS: Linux 6.18.5
+- Rust: 1.87+ (release build, `-C opt-level=3`)
+- Command: `cargo run --release -p ruvector-matryoshka`
+
+**Dataset:**
+- N=5 000 vectors, D=128, 25 Gaussian clusters
+- Tiered noise: dims 0–31 σ=0.12, dims 32–63 σ=0.50, dims 64–127 σ=0.80
+- Shared cluster geometry between database and queries
+- 200 queries, K=10, cascade_candidates=200, seed=0xCAFEBABE
+
+| Variant | N | D | Queries | Mean(µs) | p50(µs) | p95(µs) | QPS | Recall@10 | Mem(KB) | Acceptance |
+|---------|---|---|---------|----------|---------|---------|-----|-----------|---------|------------|
+| FullScan (D=128) | 5 000 | 128 | 200 | 860.7 | 840.5 | 990.4 | 1 162 | 1.0000 | 2 500 | baseline |
+| CoarseScan (D=32) | 5 000 | 32 | 200 | 332.1 | 325.7 | 382.9 | 3 012 | 0.0575 | 2 500 | fast/lossy |
+| **CascadeSearch (D=32→128)** | **5 000** | **128** | **200** | **376.9** | **371.5** | **419.8** | **2 653** | **1.0000** | **2 500** | **PASS ✓** |
+
+**Acceptance test:** CascadeSearch recall@10 = 1.0000 ≥ 0.90 → **PASS ✓**
+
+**Benchmark notes:**
+- Throughput numbers reflect single-core, single-threaded execution.
+- Warm-up: 10 queries per variant before timing.
+- No SIMD, no rayon; pure scalar Rust.
+- CoarseScan recall (5.75%) demonstrates that later dimensions carry real signal on
+  this dataset — truncation alone is insufficient, proving the cascade is necessary.
+- CascadeSearch observed speedup (2.28×) is below theoretical (3.45×) because
+  full-precision vectors are stored contiguously; Stage 1 touches the full 2.5 MB
+  vector array even for a 32-dim distance computation.  Dimension-split layout would
+  reduce this to 625 KB per pass.
+
+---
+
+## Comparison with vector databases
+
+| System | Core strength | Where it is strong | Where RuVector differs | Direct benchmark |
+|--------|--------------|-------------------|----------------------|-----------------|
+| Milvus | Full-featured distributed VDB | Native funnel search for MRL; GPU acceleration | RuVector: pure Rust, no JVM/Python, embeddable, WASM-first | No |
+| Qdrant | Best quantization suite | Binary/scalar/1.5-bit/2-bit ANN; high production QPS | RuVector: Matryoshka cascade; graph-coherence retrieval; MCP-native | No |
+| Weaviate | GraphQL interface; multi-modal | Module ecosystem; hybrid BM25+dense | RuVector: Rust-native, no heap VM, edge-deployable | No |
+| Pinecone | Managed serverless VDB | Zero-ops retrieval; automatic sharding | RuVector: on-prem, edge, agent-embedded, no vendor lock-in | No |
+| LanceDB | Columnar vector storage | Lance format; efficient scans; Arrow native | RuVector: RVF format; mincut graph; proof-gated writes | No |
+| FAISS | Research-grade ANN library | IVF, PQ, HNSW at scale; GPU paths | RuVector: Rust safety, WASM, agent memory model, MCP tools | No |
+| pgvector | PostgreSQL vector extension | SQL native; simple integration | RuVector: standalone, higher throughput, Matryoshka-aware | No |
+| Chroma | Python embedding database | Developer-friendly; LangChain native | RuVector: Rust performance; agent OS substrate; graph RAG | No |
+| Vespa | Production search platform | BM25 + ANN; streaming; ML ranking | RuVector: Rust-native; graph coherence; ruFlo automation | No |
+
+**Disclaimer:** No competitor numbers were measured in this benchmark.  All comparisons
+are architectural/feature-level only.  "Direct benchmark: No" means this report does
+not claim a throughput advantage over these systems.
+
+---
+
+## Practical applications
+
+| Application | User | Why it matters | How RuVector uses it | Near-term path |
+|-------------|------|---------------|---------------------|----------------|
+| Agent memory search | AI coding agents | 10K–100K episodic memories; retrieval per step | CascadeSearch on agent memory store with MRL embeddings | Add to ruvector-core as MatryoshkaIndex variant |
+| Graph RAG | Enterprise retrieval | Multi-hop reasoning; each hop is a vector lookup | Coarse pass across entities, full rerank for citation | Bridge to ruvector-graph |
+| Enterprise semantic search | Knowledge workers | OpenAI/Nomic embeddings at 3072 dims; cascade at 512 | CascadeSearch at D_c=512 before full rerank | MCP search tool |
+| MCP memory tools | LLM tool-calling agents | Tool calls must complete <100ms; WASM budget | CoarseScan in WASM; CascadeSearch in server sidecar | WASM build |
+| Local AI assistants | Privacy-first users | On-device embed at 64–128 dims | Coarse match locally, optional full rerank | Edge (Pi / Cognitum) |
+| Code intelligence | Developer tooling | Repository-scale code search; frequent context switch | Coarse by identifier embedding, full by semantic | ruFlo automation |
+| Security event retrieval | SOC analysts | 1M+ events; search must be fast AND accurate | IVF+cascade hybrid with mincut cluster routing | ruvector-rairs bridge |
+| Scientific retrieval | Research | 50K+ paper corpus; multi-dimension relevance | Cascade at abstract embedding, rerank at full section | ruvector-graph-rag |
+
+---
+
+## Exotic applications
+
+| Application | 10–20 year thesis | Required advances | RuVector role | Risk |
+|-------------|-------------------|-------------------|---------------|------|
+| Cognitum edge cognition | Continuous-resolution sensory embedding on hardware | Neuromorphic INT4/FP8 chips | MRL cascade on Hailo or Pi Zero | Hardware not mature |
+| RVM coherence domains | HNSW edges tagged by minimum valid dimension depth | mincut labelling of graph edges by dimension threshold | Bridge ruvector-mincut ↔ matryoshka | New ADR required |
+| Proof-gated adaptive search | ZK proof required to advance from coarse to full stage | ZK-SNARKs on distance computation | ruvector-verified integration | ZK overhead high |
+| Swarm memory | N agents each hold coarse shard; leader holds full rerank | Distributed coarse pass over agent mesh | CascadeSearch as swarm primitive | Consistency model |
+| Dimension-polymorphic HNSW | Graph edges valid only above a minimum dimension depth | Online graph repair when D_c changes | Core HNSW redesign in ruvector-core | Complex invariants |
+| Agent operating systems | Memory manager assigns coarse vs full precision per agent by priority | OS-level embedding resource allocation | RuVector as memory substrate | Full ecosystem required |
+| Autonomous scientific hypothesiser | Broad retrieval at coarse dim, deep citation at full dim | Multi-granularity embedding of scientific text | Cascade drives literature hypothesis generation | Domain data quality |
+| Bio-signal adaptive memory | Physiological signals: coarse for anomaly trigger, full for diagnosis | Real-time streaming embed at <10ms | CascadeSearch on streaming physiological index | Privacy and regulation |
+
+---
+
+## Deep research notes
+
+### What the SOTA suggests
+
+1. **MRL is a deployment standard in 2026**, not a research experiment.  Every major
+   model ships nested dimensions.  Vector databases must support this natively.
+
+2. **Gradient variance in vanilla MRL is solved** (SMRL, arXiv:2510.12474).  The
+   recall quality of small prefixes (D_c = 64 of D = 3072) is substantially better
+   with SMRL-trained models than vanilla MRL models.  When choosing an embedding
+   model for a cascade deployment, prefer SMRL-trained checkpoints.
+
+3. **Per-query dimension selection is coming** (arXiv:2602.03306).  Within 2–3 years,
+   the field will move from a global `coarse_dim` to a per-query adaptive selection.
+   RuVector's `MatryoshkaIndex::search(&self, query: &[f32], k: usize)` signature
+   should evolve to `search(&self, query: &[f32], k: usize, coarse_dim: Option<usize>)`.
+
+4. **The database that natively builds a graph at D_c rather than truncating full-D
+   HNSW wins on large-N recall.** This is a known gap: no production system has
+   solved dimension-polymorphic graph construction.  It is an open engineering problem.
+
+### What remains unsolved
+
+- Dimension-polymorphic HNSW construction.
+- Memory-bandwidth efficiency (dimension-split storage layout).
+- Cascade candidate scheduling as a function of N, K, and cluster density.
+- Integration with proof-gated writes (ruvector-verified).
+
+### Where this PoC fits
+
+This PoC validates the cascade strategy in Rust, defines the trait, and provides a
+correct measured baseline.  It is the foundation for a graph-based coarse stage
+(Phase 2) and a production DiskANN-backed implementation (Phase 4).
+
+### What would falsify the approach
+
+If a deployed MRL embedding model shows coarse-pass recall < 10% consistently (not
+just on our synthetic dataset), the cascade cannot recover quality regardless of
+`cascade_candidates`.  This would indicate the model was not properly MRL-trained and
+should be replaced.  A pre-flight check should be run on a validation set.
+
+### Sources
+
+- [^1] arXiv:2205.13147 — MRL (NeurIPS 2022)
+- [^2] arXiv:2510.12474 — SMEC/SMRL (EMNLP 2025)
+- [^3] arXiv:2411.17299 — 2D Matryoshka (2024)
+- [^4] arXiv:2602.03306 — Query-aware dim selection (2026)
+- [^5] https://milvus.io/docs/funnel_search_with_matryoshka.md — Milvus funnel search
+- [^6] https://platform.openai.com/docs/guides/embeddings — OpenAI MRL support
+- [^7] https://huggingface.co/nomic-ai/nomic-embed-text-v1.5 — Nomic MRL model
+- [^8] https://qdrant.tech/articles/binary-quantization-openai/ — Qdrant quantization
+
+---
+
+## Usage guide
+
+```bash
+# Clone and enter repo
+git clone https://github.com/ruvnet/ruvector.git
+cd ruvector
+git checkout research/nightly/2026-05-16-matryoshka-hnsw
+
+# Build
+cargo build --release -p ruvector-matryoshka
+
+# Run tests (8 unit tests including acceptance)
+cargo test -p ruvector-matryoshka
+
+# Run benchmark
+cargo run --release -p ruvector-matryoshka
+```
+
+**Expected output:**
+
+```
+CascadeSearch (D=32→128)    376.9    371.5    419.8  2 653     1.0000    2 500     PASS
+...
+Acceptance: CascadeSearch recall@10 = 1.0000 ≥ 0.90 → PASS ✓
+```
+
+**Changing dataset size:**
+Edit `N` constant in `crates/ruvector-matryoshka/src/main.rs`:
+```rust
+const N: usize = 50_000;  // increase for larger benchmark
+```
+
+**Changing dimensions:**
+Edit `DIM` and `COARSE_DIM`:
+```rust
+const DIM: usize = 256;
+const COARSE_DIM: usize = 64;  // 25% of full
+```
+
+**Adding a new backend:**
+Implement `MatryoshkaIndex` for your struct:
+```rust
+impl MatryoshkaIndex for MyHnswCoarseStage {
+    fn name(&self) -> &str { "HnswCascade (HNSW→full)" }
+    fn build(&mut self, vectors: &[Vector]) { /* build HNSW at coarse_dim */ }
+    fn search(&self, query: &[f32], k: usize) -> Vec<Hit> { /* HNSW + rerank */ }
+    fn memory_bytes(&self) -> usize { /* graph + vectors */ }
+}
+```
+
+**Plugging into RuVector:**
+The `MatryoshkaIndex` trait is designed to sit above the existing `ruvector-core`
+index types.  A future `ruvector-core` `feature = "matryoshka"` will register
+`CascadeSearch` as a search mode alongside existing HNSW and IVF modes.
+
+---
+
+## Optimization guide
+
+### Memory optimisation
+
+Store `coarse[D_c]` and `residual[D-D_c]` as separate `Vec<f32>` arrays (not
+interleaved per vector).  Stage 1 then touches only the `coarse` array (625 KB for
+N=5 000) instead of the full 2 500 KB, dramatically improving cache utilisation.
+
+### Latency optimisation
+
+Add a graph-based coarse stage (HNSW on D_c dimensions) to replace the O(N·D_c)
+scan.  For N=1M, the flat scan is ~200ms; HNSW reduces to ~1ms.
+
+### Recall optimisation
+
+Increase `cascade_candidates` until recall saturates.  A calibration pass on a
+validation set (200 queries, compare to FullScan) identifies the minimum C that
+hits the target recall.
+
+### Edge deployment optimisation
+
+Use `CoarseScan` only in the WASM budget (e.g., Pi Zero 2W, Cognitum Seed).  Send
+top-200 coarse IDs to a host sidecar for full rerank.  Network payload: 200 × 4
+bytes = 800 bytes of IDs + host lookup.
+
+### WASM optimisation
+
+`CoarseScan` and `CascadeSearch` have zero dependencies that are WASM-incompatible.
+Compile with:
+```bash
+cargo build --target wasm32-unknown-unknown -p ruvector-matryoshka --no-default-features
+```
+
+### MCP tool optimisation
+
+Expose as a streaming tool: return coarse candidates first (low-latency initial
+response), then stream the full-reranked results as they are computed.
+
+### ruFlo automation optimisation
+
+Run a ruFlo step after every 1 000 queries that measures `recall@10` on a held-out
+set and adjusts `cascade_candidates` up or down to stay within 5% of the SLA
+threshold.  This is the closed-loop variant of manual `cascade_candidates` tuning.
+
+---
+
+## Roadmap
+
+### Now
+- Merge `crates/ruvector-matryoshka` to main (this branch)
+- Add `MatryoshkaIndex` to `ruvector-core` search type registry as an optional variant
+- Ship `CoarseScan` as a WASM-compatible thin index for edge use cases
+
+### Next
+- Phase 2: HNSW coarse stage replacing O(N·D_c) flat scan
+- Dimension-split vector storage layout for cache-efficient coarse pass
+- ruFlo feedback loop for online `cascade_candidates` tuning
+- MCP tool surface: `search_cascade(query, coarse_dim, k)`
+
+### Later (10–20 year)
+- Dimension-polymorphic HNSW: edges labelled by minimum valid dimension depth
+- Per-query adaptive dimension selection (query-aware, arXiv:2602.03306 style)
+- Zero-knowledge proof gate between coarse and full stage for proof-gated RAG
+- RVM coherence domains: Matryoshka cascade aligned to mincut-defined memory regions
+- Hardware-native adaptive precision: INT4 coarse pass, FP32 rerank, in-memory compute
+
+---
+
+## Footnotes and references
+
+[^1]: Kusupati, A., Bhatt, G., Rege, A., Wallingford, M., Sinha, A., Ramanujan, V.,
+Howard-Snyder, W., Chen, K., Kakade, S., Jain, P., Farhadi, A. "Matryoshka
+Representation Learning." NeurIPS 2022. arXiv:2205.13147.
+https://arxiv.org/abs/2205.13147. Accessed 2026-05-16.
+
+[^2]: Zhang, B., Chen, L., Liu, T., Zheng, B. "SMEC: Rethinking Matryoshka
+Representation Learning for Retrieval Embedding Compression." EMNLP 2025.
+arXiv:2510.12474. https://arxiv.org/abs/2510.12474. Accessed 2026-05-16.
+
+[^3]: Wang, S., et al. "2D Matryoshka Training for Information Retrieval." arXiv:2411.17299.
+November 2024. https://arxiv.org/abs/2411.17299. Accessed 2026-05-16.
+
+[^4]: Wu, Z., Zhang, R., Nie, Z. "Learning to Select: Query-Aware Adaptive Dimension
+Selection for Dense Retrieval." Beihang University, 2026. arXiv:2602.03306.
+https://arxiv.org/html/2602.03306v2. Accessed 2026-05-16.
+
+[^5]: Milvus documentation. "Funnel Search with Matryoshka."
+https://milvus.io/docs/funnel_search_with_matryoshka.md. Accessed 2026-05-16.
+
+[^6]: OpenAI. "Embeddings — Matryoshka dimensions parameter." OpenAI documentation.
+https://platform.openai.com/docs/guides/embeddings. Accessed 2026-05-16.
+
+[^7]: Nomic AI. "nomic-embed-text-v1.5 — First long-context MRL embedding model."
+Hugging Face. https://huggingface.co/nomic-ai/nomic-embed-text-v1.5.
+Accessed 2026-05-16.
+
+[^8]: Qdrant. "Binary Quantization with OpenAI text-embedding-3."
+https://qdrant.tech/articles/binary-quantization-openai/. Accessed 2026-05-16.
+
+[^9]: Garcia, A. "sqlite-vec: Matryoshka / adaptive-length embedding guide."
+https://alexgarcia.xyz/sqlite-vec/guides/matryoshka.html. Accessed 2026-05-16.
+
+---
+
+## SEO tags
+
+**Keywords:**
+ruvector, Rust vector database, Rust vector search, Matryoshka Representation Learning,
+MRL embeddings, adaptive dimension search, cascaded retrieval, funnel search,
+coarse-to-fine ANN, high performance Rust, ANN search, HNSW, DiskANN,
+filtered vector search, graph RAG, agent memory, AI agents, MCP, WASM AI, edge AI,
+self learning vector database, ruvnet, ruFlo, Claude Flow, autonomous agents,
+retrieval augmented generation, nested embeddings, OpenAI text-embedding-3,
+Nomic nomic-embed-text.
+
+**Suggested GitHub topics:**
+rust, vector-database, vector-search, ann, hnsw, matryoshka-embeddings, mrl,
+cascaded-retrieval, adaptive-search, rag, graph-rag, ai-agents, agent-memory,
+mcp, wasm, edge-ai, rust-ai, semantic-search, embeddings, ruvector.