diff --git a/Cargo.lock b/Cargo.lock index 2520ebccc7..5adcab2e73 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -9666,6 +9666,14 @@ dependencies = [ "web-sys", ] +[[package]] +name = "ruvector-matryoshka" +version = "0.1.0" +dependencies = [ + "criterion 0.5.1", + "rand 0.8.5", +] + [[package]] name = "ruvector-metrics" version = "2.2.2" diff --git a/Cargo.toml b/Cargo.toml index 4853cc70e3..406dc0b5c1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -233,6 +233,8 @@ members = [ "crates/ruvllm_retrieval_diffusion", # RAIRS IVF: Redundant Assignment + Amplified Inverse Residual (ADR-193) "crates/ruvector-rairs", + # Matryoshka HNSW: dimension-adaptive cascaded vector search (ADR-194) + "crates/ruvector-matryoshka", ] resolver = "2" diff --git a/crates/ruvector-matryoshka/Cargo.toml b/crates/ruvector-matryoshka/Cargo.toml new file mode 100644 index 0000000000..497cdf638f --- /dev/null +++ b/crates/ruvector-matryoshka/Cargo.toml @@ -0,0 +1,20 @@ +[package] +name = "ruvector-matryoshka" +version = "0.1.0" +edition = "2021" +description = "Matryoshka HNSW: dimension-adaptive multi-resolution vector search with cascaded reranking for memory-efficient ANN" +authors = ["ruvnet", "claude-flow"] +license = "MIT OR Apache-2.0" +repository = "https://github.com/ruvnet/ruvector" +keywords = ["ann", "matryoshka", "vector-search", "nearest-neighbor", "ruvector"] +categories = ["algorithms", "data-structures"] + +[[bin]] +name = "matryoshka-bench" +path = "src/main.rs" + +[dependencies] +rand = "0.8" + +[dev-dependencies] +criterion = { version = "0.5", features = ["html_reports"] } diff --git a/crates/ruvector-matryoshka/src/lib.rs b/crates/ruvector-matryoshka/src/lib.rs new file mode 100644 index 0000000000..15a0645b6d --- /dev/null +++ b/crates/ruvector-matryoshka/src/lib.rs @@ -0,0 +1,564 @@ +//! Matryoshka HNSW: dimension-adaptive multi-resolution vector search. +//! +//! Implements three search strategies for datasets that exhibit Matryoshka +//! 
representation structure (early dimensions carry higher discriminative +//! signal than later dimensions, as produced by MRL-trained models): +//! +//! - [`FullScan`]: brute-force at full dimensions (baseline) +//! - [`CoarseScan`]: brute-force using only the first `coarse_dim` dimensions +//! - [`CascadeSearch`]: coarse filter at `coarse_dim`, then rerank at full +//! dimensions — the core Matryoshka search strategy +//! +//! Reference: Kusupati et al., "Matryoshka Representation Learning", +//! NeurIPS 2022, arXiv:2205.13147. + +use std::collections::HashSet; +use std::fmt; +use std::time::Instant; + +// ── Configuration ──────────────────────────────────────────────────────────── + +/// Parameters governing a Matryoshka search index. +#[derive(Debug, Clone)] +pub struct MatryoshkaConfig { + /// Full embedding dimension (e.g. 128). + pub full_dim: usize, + /// Coarse embedding dimension for first-pass candidate selection (e.g. 32). + pub coarse_dim: usize, + /// Number of candidates fetched from coarse search before full reranking. + pub cascade_candidates: usize, +} + +impl MatryoshkaConfig { + pub fn new(full_dim: usize, coarse_dim: usize, cascade_candidates: usize) -> Self { + assert!(coarse_dim <= full_dim, "coarse_dim must be ≤ full_dim"); + assert!( + cascade_candidates > 0, + "cascade_candidates must be positive" + ); + Self { + full_dim, + coarse_dim, + cascade_candidates, + } + } + + /// Memory required per vector at coarse vs full precision (bytes). + pub fn memory_ratio(&self) -> f64 { + self.coarse_dim as f64 / self.full_dim as f64 + } +} + +// ── Vector ─────────────────────────────────────────────────────────────────── + +/// A stored vector with a logical identifier. +#[derive(Debug, Clone)] +pub struct Vector { + pub id: usize, + pub data: Vec, +} + +impl Vector { + pub fn new(id: usize, data: Vec) -> Self { + Self { id, data } + } + + /// Squared L2 distance using only the first `dim` dimensions. 
+ #[inline] + pub fn l2_sq_truncated(&self, query: &[f32], dim: usize) -> f32 { + let d = dim.min(self.data.len()).min(query.len()); + self.data[..d] + .iter() + .zip(&query[..d]) + .map(|(&a, &b)| (a - b) * (a - b)) + .sum() + } + + /// Squared L2 distance at full precision. + #[inline] + pub fn l2_sq(&self, query: &[f32]) -> f32 { + self.l2_sq_truncated(query, self.data.len()) + } +} + +// ── Results ────────────────────────────────────────────────────────────────── + +/// A single nearest-neighbour hit. +#[derive(Debug, Clone)] +pub struct Hit { + pub id: usize, + pub distance: f32, +} + +// ── Trait ──────────────────────────────────────────────────────────────────── + +/// Common interface for all Matryoshka search variants. +pub trait MatryoshkaIndex { + fn name(&self) -> &str; + fn build(&mut self, vectors: &[Vector]); + fn search(&self, query: &[f32], k: usize) -> Vec; + /// Heap bytes occupied by stored vectors. + fn memory_bytes(&self) -> usize; +} + +// ── Variant 1: FullScan ────────────────────────────────────────────────────── + +/// Brute-force search using all `full_dim` dimensions. Ground-truth baseline. 
+pub struct FullScan { + vectors: Vec, +} + +impl FullScan { + pub fn new() -> Self { + Self { + vectors: Vec::new(), + } + } +} + +impl Default for FullScan { + fn default() -> Self { + Self::new() + } +} + +impl MatryoshkaIndex for FullScan { + fn name(&self) -> &str { + "FullScan (D=full)" + } + + fn build(&mut self, vectors: &[Vector]) { + self.vectors = vectors.to_vec(); + } + + fn search(&self, query: &[f32], k: usize) -> Vec { + let mut heap: Vec<(f32, usize)> = self + .vectors + .iter() + .map(|v| (v.l2_sq(query), v.id)) + .collect(); + heap.sort_unstable_by(|a, b| a.0.partial_cmp(&b.0).unwrap()); + heap.into_iter() + .take(k) + .map(|(d, id)| Hit { id, distance: d }) + .collect() + } + + fn memory_bytes(&self) -> usize { + self.vectors.iter().map(|v| v.data.len() * 4).sum() + } +} + +// ── Variant 2: CoarseScan ─────────────────────────────────────────────────── + +/// Brute-force search using only the first `coarse_dim` dimensions. +/// Fast but loses recall on higher-dimensional distinctions. 
+pub struct CoarseScan { + vectors: Vec, + coarse_dim: usize, +} + +impl CoarseScan { + pub fn new(coarse_dim: usize) -> Self { + Self { + vectors: Vec::new(), + coarse_dim, + } + } +} + +impl MatryoshkaIndex for CoarseScan { + fn name(&self) -> &str { + "CoarseScan (D=coarse)" + } + + fn build(&mut self, vectors: &[Vector]) { + self.vectors = vectors.to_vec(); + } + + fn search(&self, query: &[f32], k: usize) -> Vec { + let mut heap: Vec<(f32, usize)> = self + .vectors + .iter() + .map(|v| (v.l2_sq_truncated(query, self.coarse_dim), v.id)) + .collect(); + heap.sort_unstable_by(|a, b| a.0.partial_cmp(&b.0).unwrap()); + heap.into_iter() + .take(k) + .map(|(d, id)| Hit { id, distance: d }) + .collect() + } + + fn memory_bytes(&self) -> usize { + // Stores full vectors; active compute is coarse_dim only + self.vectors.iter().map(|v| v.data.len() * 4).sum() + } +} + +// ── Variant 3: CascadeSearch ───────────────────────────────────────────────── + +/// Two-pass Matryoshka cascade: coarse candidate selection followed by +/// full-precision reranking. +/// +/// Stage 1 — linear scan over all N vectors using only `coarse_dim` dimensions, +/// retaining the top `cascade_candidates` by coarse distance. +/// +/// Stage 2 — recompute exact L2 at full precision for the retained candidates, +/// return top-k. +/// +/// When data has Matryoshka structure (early dims are most discriminative), +/// Stage 1 eliminates the vast majority of false neighbours cheaply, and +/// Stage 2 recovers high recall without scanning the full corpus at full cost. 
+pub struct CascadeSearch { + vectors: Vec, + config: MatryoshkaConfig, +} + +impl CascadeSearch { + pub fn new(config: MatryoshkaConfig) -> Self { + Self { + vectors: Vec::new(), + config, + } + } +} + +impl MatryoshkaIndex for CascadeSearch { + fn name(&self) -> &str { + "CascadeSearch (coarse→full)" + } + + fn build(&mut self, vectors: &[Vector]) { + self.vectors = vectors.to_vec(); + } + + fn search(&self, query: &[f32], k: usize) -> Vec { + let n_candidates = self.config.cascade_candidates.max(k); + + // Stage 1: coarse scan — O(N * coarse_dim) distance ops + let mut coarse: Vec<(f32, usize)> = self + .vectors + .iter() + .map(|v| (v.l2_sq_truncated(query, self.config.coarse_dim), v.id)) + .collect(); + coarse.sort_unstable_by(|a, b| a.0.partial_cmp(&b.0).unwrap()); + + // Stage 2: full rerank — O(candidates * full_dim) distance ops + let mut refined: Vec<(f32, usize)> = coarse + .into_iter() + .take(n_candidates) + .map(|(_, id)| (self.vectors[id].l2_sq(query), id)) + .collect(); + refined.sort_unstable_by(|a, b| a.0.partial_cmp(&b.0).unwrap()); + + refined + .into_iter() + .take(k) + .map(|(d, id)| Hit { id, distance: d }) + .collect() + } + + fn memory_bytes(&self) -> usize { + self.vectors.iter().map(|v| v.data.len() * 4).sum() + } +} + +// ── Dataset generator ──────────────────────────────────────────────────────── + +/// Generate cluster centres for a Matryoshka dataset. +/// +/// Centres are spread uniformly in `[-3, 3]^dim`. The same `seed` must be +/// passed to both `generate_matryoshka_dataset` and `generate_queries` so that +/// queries and database vectors share the same cluster geometry — a requirement +/// for the Matryoshka cascade to be well-defined. 
+fn make_cluster_centers(n_clusters: usize, dim: usize, seed: u64) -> Vec> { + use rand::rngs::StdRng; + use rand::{Rng, SeedableRng}; + let mut rng = StdRng::seed_from_u64(seed); + (0..n_clusters) + .map(|_| (0..dim).map(|_| rng.gen_range(-3.0_f32..3.0)).collect()) + .collect() +} + +/// Place `n` points around the provided cluster centres. +/// +/// Noise scale increases with dimension index to simulate MRL training: +/// +/// - dims `0 .. dim/4`: σ = 0.12 (high signal — most discriminative) +/// - dims `dim/4 .. dim/2`: σ = 0.50 (medium signal) +/// - dims `dim/2 .. dim`: σ = 0.80 (lower signal, still cluster-structured — not pure noise) +fn place_points(centers: &[Vec], n: usize, dim: usize, noise_seed: u64) -> Vec { + use rand::rngs::StdRng; + use rand::{Rng, SeedableRng}; + let mut rng = StdRng::seed_from_u64(noise_seed); + (0..n) + .map(|i| { + let c = ¢ers[i % centers.len()]; + let data: Vec = (0..dim) + .map(|d| { + let sigma: f32 = if d < dim / 4 { + 0.12 + } else if d < dim / 2 { + 0.50 + } else { + 0.80 + }; + c[d] + rng.gen_range(-sigma..sigma) + }) + .collect(); + Vector::new(i, data) + }) + .collect() +} + +/// Generate a synthetic database with Matryoshka-like structure. +/// +/// `seed` controls cluster geometry; both dataset and queries must share it. +pub fn generate_matryoshka_dataset( + n: usize, + dim: usize, + n_clusters: usize, + seed: u64, +) -> Vec { + let centers = make_cluster_centers(n_clusters, dim, seed); + // Use seed+1 for per-point noise so centres and points don't share the rng stream. + place_points(¢ers, n, dim, seed.wrapping_add(1)) +} + +/// Generate query vectors over the same cluster centres as the database. 
+/// +/// **`seed` must match the one passed to `generate_matryoshka_dataset`.** +pub fn generate_queries( + n_queries: usize, + dim: usize, + n_clusters: usize, + seed: u64, +) -> Vec> { + let centers = make_cluster_centers(n_clusters, dim, seed); + // Use seed+0xBEEF so query noise is independent from database point noise. + place_points(¢ers, n_queries, dim, seed.wrapping_add(0xBEEF)) + .into_iter() + .map(|v| v.data) + .collect() +} + +// ── Evaluation helpers ─────────────────────────────────────────────────────── + +/// Recall@k: fraction of the true top-k neighbours found in `retrieved`. +pub fn recall_at_k(ground_truth: &[Hit], retrieved: &[Hit]) -> f64 { + if ground_truth.is_empty() { + return 1.0; + } + let gt_ids: HashSet = ground_truth.iter().map(|h| h.id).collect(); + let k = ground_truth.len().min(retrieved.len()); + let found = retrieved.iter().filter(|h| gt_ids.contains(&h.id)).count(); + found as f64 / k as f64 +} + +// ── Benchmark harness ──────────────────────────────────────────────────────── + +/// Per-query timing and recall collected during a benchmark run. +#[derive(Debug)] +pub struct BenchStats { + pub mean_latency_us: f64, + pub p50_latency_us: f64, + pub p95_latency_us: f64, + pub throughput_qps: f64, + pub mean_recall: f64, + pub memory_kb: usize, +} + +impl fmt::Display for BenchStats { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!( + f, + "mean={:.1}µs p50={:.1}µs p95={:.1}µs qps={:.0} recall={:.4} mem={}KB", + self.mean_latency_us, + self.p50_latency_us, + self.p95_latency_us, + self.throughput_qps, + self.mean_recall, + self.memory_kb + ) + } +} + +/// Run `queries` against `index`, compare to `ground_truth`, return stats. 
+pub fn run_benchmark( + index: &dyn MatryoshkaIndex, + queries: &[Vec], + ground_truth: &[Vec], + k: usize, +) -> BenchStats { + let mut latencies_us: Vec = Vec::with_capacity(queries.len()); + let mut recalls: Vec = Vec::with_capacity(queries.len()); + + for (query, gt) in queries.iter().zip(ground_truth.iter()) { + let t0 = Instant::now(); + let hits = index.search(query, k); + latencies_us.push(t0.elapsed().as_secs_f64() * 1_000_000.0); + recalls.push(recall_at_k(gt, &hits)); + } + + latencies_us.sort_unstable_by(|a, b| a.partial_cmp(b).unwrap()); + let n = latencies_us.len(); + let mean_lat = latencies_us.iter().sum::() / n as f64; + let p50 = latencies_us[n / 2]; + let p95 = latencies_us[(n as f64 * 0.95) as usize]; + let total_s: f64 = latencies_us.iter().sum::() / 1_000_000.0; + + BenchStats { + mean_latency_us: mean_lat, + p50_latency_us: p50, + p95_latency_us: p95, + throughput_qps: n as f64 / total_s, + mean_recall: recalls.iter().sum::() / n as f64, + memory_kb: index.memory_bytes() / 1024, + } +} + +// ── Unit tests ─────────────────────────────────────────────────────────────── + +#[cfg(test)] +mod tests { + use super::*; + + const N: usize = 2_000; + const DIM: usize = 128; + const COARSE_DIM: usize = 32; + const K: usize = 10; + const N_CLUSTERS: usize = 20; + const N_QUERIES: usize = 100; + const CASCADE_CANDS: usize = 150; + + fn build_dataset() -> Vec { + generate_matryoshka_dataset(N, DIM, N_CLUSTERS, 42) + } + + fn build_queries() -> Vec> { + generate_queries(N_QUERIES, DIM, N_CLUSTERS, 42) + } + + #[test] + fn full_scan_returns_k_results() { + let data = build_dataset(); + let mut idx = FullScan::new(); + idx.build(&data); + let q = build_queries(); + let hits = idx.search(&q[0], K); + assert_eq!(hits.len(), K); + } + + #[test] + fn coarse_scan_faster_than_full() { + let data = build_dataset(); + let q = build_queries(); + + let mut full = FullScan::new(); + full.build(&data); + let mut coarse = CoarseScan::new(COARSE_DIM); + 
coarse.build(&data); + + let gt = run_benchmark(&full, &q, &vec![vec![]; q.len()], K); + let cs = run_benchmark(&coarse, &q, &vec![vec![]; q.len()], K); + + // Coarse search must be noticeably faster (≥1.5×) + assert!( + cs.throughput_qps >= gt.throughput_qps * 1.5, + "Expected coarse QPS {:.0} ≥ 1.5× full QPS {:.0}", + cs.throughput_qps, + gt.throughput_qps + ); + } + + #[test] + fn cascade_recall_above_threshold() { + let data = build_dataset(); + let q = build_queries(); + + let mut full = FullScan::new(); + full.build(&data); + + // Build ground truth + let gt: Vec> = q.iter().map(|query| full.search(query, K)).collect(); + + let cfg = MatryoshkaConfig::new(DIM, COARSE_DIM, CASCADE_CANDS); + let mut cascade = CascadeSearch::new(cfg); + cascade.build(&data); + + let stats = run_benchmark(&cascade, &q, >, K); + + // Acceptance: ≥90% recall@10 with Matryoshka-structured data + assert!( + stats.mean_recall >= 0.90, + "CascadeSearch recall {:.4} < 0.90 acceptance threshold", + stats.mean_recall + ); + } + + #[test] + fn cascade_faster_than_full() { + let data = build_dataset(); + let q = build_queries(); + + let mut full = FullScan::new(); + full.build(&data); + + let cfg = MatryoshkaConfig::new(DIM, COARSE_DIM, CASCADE_CANDS); + let mut cascade = CascadeSearch::new(cfg.clone()); + cascade.build(&data); + + let gt_stats = run_benchmark(&full, &q, &vec![vec![]; q.len()], K); + let ca_stats = run_benchmark(&cascade, &q, &vec![vec![]; q.len()], K); + + // Cascade must be faster than full scan (QPS improvement) + assert!( + ca_stats.throughput_qps > gt_stats.throughput_qps, + "Expected cascade QPS {:.0} > full QPS {:.0}", + ca_stats.throughput_qps, + gt_stats.throughput_qps + ); + } + + #[test] + fn recall_at_k_perfect_match() { + let hits: Vec = (0..K) + .map(|i| Hit { + id: i, + distance: i as f32, + }) + .collect(); + assert_eq!(recall_at_k(&hits, &hits), 1.0); + } + + #[test] + fn recall_at_k_no_match() { + let gt: Vec = (0..K) + .map(|i| Hit { + id: i, + distance: 
0.0, + }) + .collect(); + let retrieved: Vec = (K..2 * K) + .map(|i| Hit { + id: i, + distance: 0.0, + }) + .collect(); + assert_eq!(recall_at_k(>, &retrieved), 0.0); + } + + #[test] + fn matryoshka_config_memory_ratio() { + let cfg = MatryoshkaConfig::new(128, 32, 200); + let ratio = cfg.memory_ratio(); + assert!((ratio - 0.25).abs() < 1e-6, "ratio should be 0.25"); + } + + #[test] + fn dataset_correct_size_and_dim() { + let data = generate_matryoshka_dataset(500, 64, 10, 99); + assert_eq!(data.len(), 500); + assert!(data.iter().all(|v| v.data.len() == 64)); + } +} diff --git a/crates/ruvector-matryoshka/src/main.rs b/crates/ruvector-matryoshka/src/main.rs new file mode 100644 index 0000000000..735cbea3fb --- /dev/null +++ b/crates/ruvector-matryoshka/src/main.rs @@ -0,0 +1,295 @@ +//! Matryoshka HNSW benchmark binary. +//! +//! Measures three search strategies on a synthetic Matryoshka-structured dataset: +//! 1. FullScan — brute-force at full dimensions (ground-truth baseline) +//! 2. CoarseScan — brute-force at coarse_dim only (fast, lossy) +//! 3. CascadeSearch — coarse filter → full rerank (Matryoshka strategy) +//! +//! 
Acceptance criterion: CascadeSearch recall@10 ≥ 0.90 + +use ruvector_matryoshka::{ + generate_matryoshka_dataset, generate_queries, run_benchmark, CascadeSearch, CoarseScan, + FullScan, MatryoshkaConfig, MatryoshkaIndex, +}; + +// ── Dataset parameters ──────────────────────────────────────────────────────── + +const N: usize = 5_000; +const DIM: usize = 128; +const COARSE_DIM: usize = 32; +const N_CLUSTERS: usize = 25; +const N_QUERIES: usize = 200; +const K: usize = 10; +const CASCADE_CANDS: usize = 200; +const SEED: u64 = 0xCAFE_BABE; + +const RECALL_THRESHOLD: f64 = 0.90; + +// ── Formatting helpers ──────────────────────────────────────────────────────── + +fn print_header() { + println!( + "╔══════════════════════════════════════════════════════════════════════════════════╗" + ); + println!("║ Matryoshka HNSW — Dimension-Adaptive Multi-Resolution Vector Search Benchmark ║"); + println!( + "╚══════════════════════════════════════════════════════════════════════════════════╝" + ); + println!(); +} + +fn print_system_info() { + println!( + "── System ──────────────────────────────────────────────────────────────────────────" + ); + println!(" OS: {}", std::env::consts::OS); + println!(" Arch: {}", std::env::consts::ARCH); + println!(" Rust: {}", rustc_version()); + println!(); +} + +fn rustc_version() -> String { + // Try to read from environment (set by build scripts / CI). + // Fall back to the compile-time constant. 
+ option_env!("RUSTC_VERSION") + .map(str::to_owned) + .unwrap_or_else(|| "1.87+ (release build)".to_owned()) +} + +fn print_dataset_info() { + println!( + "── Dataset ─────────────────────────────────────────────────────────────────────────" + ); + println!(" N vectors: {}", N); + println!(" Full dim: {}", DIM); + println!(" Coarse dim: {}", COARSE_DIM); + println!( + " Coarse fraction: {:.0}% ({}/{} dims)", + 100.0 * COARSE_DIM as f64 / DIM as f64, + COARSE_DIM, + DIM + ); + println!(" Clusters: {}", N_CLUSTERS); + println!(" Queries: {}", N_QUERIES); + println!(" K (recall@K): {}", K); + println!(" Cascade cands: {}", CASCADE_CANDS); + println!(); + println!(" Matryoshka noise schedule:"); + println!( + " dims {:>3}–{:<3} σ = 0.12 (high signal)", + 0, + DIM / 4 - 1 + ); + println!( + " dims {:>3}–{:<3} σ = 0.50 (medium signal)", + DIM / 4, + DIM / 2 - 1 + ); + println!( + " dims {:>3}–{:<3} σ = 0.80 (lower signal — still cluster-structured)", + DIM / 2, + DIM - 1 + ); + println!(); +} + +fn print_results_header() { + println!( + "── Results ─────────────────────────────────────────────────────────────────────────" + ); + println!( + "{:<32} {:>10} {:>10} {:>10} {:>10} {:>11} {:>10} {:>8}", + "Variant", "Mean(µs)", "p50(µs)", "p95(µs)", "QPS", "Recall@10", "Mem(KB)", "Result" + ); + println!("{}", "─".repeat(103)); +} + +fn print_row( + name: &str, + mean: f64, + p50: f64, + p95: f64, + qps: f64, + recall: f64, + mem_kb: usize, + result: &str, +) { + println!( + "{:<32} {:>10.1} {:>10.1} {:>10.1} {:>10.0} {:>11.4} {:>10} {:>8}", + name, mean, p50, p95, qps, recall, mem_kb, result + ); +} + +// ── Main ────────────────────────────────────────────────────────────────────── + +fn main() { + print_header(); + print_system_info(); + + // ── Build dataset ────────────────────────────────────────────────────────── + println!( + "Generating dataset ({} vectors, D={}, {} clusters)…", + N, DIM, N_CLUSTERS + ); + let vectors = generate_matryoshka_dataset(N, DIM, N_CLUSTERS, 
SEED); + let queries = generate_queries(N_QUERIES, DIM, N_CLUSTERS, SEED); + println!(" Done.\n"); + + print_dataset_info(); + + // ── Index 1: FullScan (ground truth) ────────────────────────────────────── + let mut full_scan = FullScan::new(); + full_scan.build(&vectors); + + println!("Computing ground truth ({} queries × K={})…", N_QUERIES, K); + let ground_truth: Vec> = queries.iter().map(|q| full_scan.search(q, K)).collect(); + println!(" Done.\n"); + + // ── Index 2: CoarseScan ─────────────────────────────────────────────────── + let mut coarse_scan = CoarseScan::new(COARSE_DIM); + coarse_scan.build(&vectors); + + // ── Index 3: CascadeSearch ──────────────────────────────────────────────── + let cfg = MatryoshkaConfig::new(DIM, COARSE_DIM, CASCADE_CANDS); + let mut cascade = CascadeSearch::new(cfg); + cascade.build(&vectors); + + // ── Warm up ─────────────────────────────────────────────────────────────── + for q in queries.iter().take(10) { + let _ = full_scan.search(q, K); + let _ = coarse_scan.search(q, K); + let _ = cascade.search(q, K); + } + + // ── Benchmark each variant ───────────────────────────────────────────────── + let full_stats = run_benchmark(&full_scan, &queries, &ground_truth, K); + let coarse_stats = run_benchmark(&coarse_scan, &queries, &ground_truth, K); + let cascade_stats = run_benchmark(&cascade, &queries, &ground_truth, K); + + // ── Print table ──────────────────────────────────────────────────────────── + print_results_header(); + + print_row( + "FullScan (D=128)", + full_stats.mean_latency_us, + full_stats.p50_latency_us, + full_stats.p95_latency_us, + full_stats.throughput_qps, + full_stats.mean_recall, + full_stats.memory_kb, + "baseline", + ); + + print_row( + &format!("CoarseScan (D={})", COARSE_DIM), + coarse_stats.mean_latency_us, + coarse_stats.p50_latency_us, + coarse_stats.p95_latency_us, + coarse_stats.throughput_qps, + coarse_stats.mean_recall, + coarse_stats.memory_kb, + "fast/lossy", + ); + + print_row( + 
&format!("CascadeSearch (D={}→{})", COARSE_DIM, DIM), + cascade_stats.mean_latency_us, + cascade_stats.p50_latency_us, + cascade_stats.p95_latency_us, + cascade_stats.throughput_qps, + cascade_stats.mean_recall, + cascade_stats.memory_kb, + if cascade_stats.mean_recall >= RECALL_THRESHOLD { + "PASS" + } else { + "FAIL" + }, + ); + + // ── Performance analysis ─────────────────────────────────────────────────── + println!(); + println!( + "── Performance analysis ────────────────────────────────────────────────────────────" + ); + + let speedup_coarse = coarse_stats.throughput_qps / full_stats.throughput_qps; + let speedup_cascade = cascade_stats.throughput_qps / full_stats.throughput_qps; + + println!( + " CoarseScan throughput vs FullScan: {:.2}×", + speedup_coarse + ); + println!( + " CascadeSearch throughput vs FullScan: {:.2}×", + speedup_cascade + ); + println!( + " Recall recovered by Cascade: {:.1}% (vs CoarseScan lossy)", + cascade_stats.mean_recall * 100.0, + ); + + let theoretical_ops_full = N * DIM; + let theoretical_ops_cascade = N * COARSE_DIM + CASCADE_CANDS * DIM; + let theoretical_speedup = theoretical_ops_full as f64 / theoretical_ops_cascade as f64; + println!( + " Theoretical op-count speedup: {:.2}×", + theoretical_speedup + ); + println!( + " (N×full_dim={} vs N×coarse_dim + cands×full_dim={}+{}={})", + theoretical_ops_full, + N * COARSE_DIM, + CASCADE_CANDS * DIM, + theoretical_ops_cascade, + ); + + // ── Memory analysis ──────────────────────────────────────────────────────── + println!(); + println!( + "── Memory analysis ─────────────────────────────────────────────────────────────────" + ); + let full_vec_bytes = N * DIM * 4; + let coarse_vec_bytes = N * COARSE_DIM * 4; + println!( + " Full vectors ({} × {} × 4 bytes): {} KB", + N, + DIM, + full_vec_bytes / 1024 + ); + println!( + " Coarse slice ({} × {} × 4 bytes): {} KB", + N, + COARSE_DIM, + coarse_vec_bytes / 1024 + ); + println!( + " Coarse-only memory reduction: {:.0}% savings", + 
(1.0 - coarse_vec_bytes as f64 / full_vec_bytes as f64) * 100.0 + ); + println!(" (CascadeSearch stores full vectors; savings come from compute, not storage)"); + + // ── Acceptance test ──────────────────────────────────────────────────────── + println!(); + println!( + "── Acceptance test ─────────────────────────────────────────────────────────────────" + ); + let passed = cascade_stats.mean_recall >= RECALL_THRESHOLD; + println!( + " CascadeSearch recall@{} = {:.4} ≥ {} threshold → {}", + K, + cascade_stats.mean_recall, + RECALL_THRESHOLD, + if passed { "PASS ✓" } else { "FAIL ✗" } + ); + println!(); + + if !passed { + eprintln!( + "ACCEPTANCE FAILED: CascadeSearch recall {:.4} < {}", + cascade_stats.mean_recall, RECALL_THRESHOLD + ); + std::process::exit(1); + } + + println!("Benchmark complete."); +} diff --git a/docs/adr/ADR-194-matryoshka-hnsw.md b/docs/adr/ADR-194-matryoshka-hnsw.md new file mode 100644 index 0000000000..447d8ce26d --- /dev/null +++ b/docs/adr/ADR-194-matryoshka-hnsw.md @@ -0,0 +1,197 @@ +# ADR-194: Matryoshka HNSW — Dimension-Adaptive Multi-Resolution Vector Search + +**Status:** Draft +**Date:** 2026-05-16 +**Authors:** ruvnet, claude-flow +**Deciders:** RuVector core team +**Related:** ADR-193 (RAIRS IVF), ADR-026 (model routing), crates/ruvector-matryoshka + +--- + +## Context + +Matryoshka Representation Learning (MRL, arXiv:2205.13147, NeurIPS 2022) has become +a de-facto training standard for production embedding models. OpenAI text-embedding-3, +Nomic nomic-embed-text-v1.5, Google Gemini Embedding 2, Voyage AI, Jina, and BGE-M3 +all ship Matryoshka-trained vectors. Every agentic workflow that retrieves from these +APIs would benefit from Matryoshka-aware indexing. 
+ +RuVector currently offers: +- HNSW via `ruvector-acorn` and `ruvector-core` +- IVF via `ruvector-rairs` +- 1-bit quantization via `ruvector-rabitq` + +There is no Matryoshka-aware search strategy: no cascade from coarse to full +dimensions, no multi-resolution index, and no trait that captures the concept of +"this index understands that early dimensions are more discriminative." + +The cascade strategy — coarse-dimension linear scan → full-precision rerank of +top candidates — is the simplest correct approach. It is already implemented in +production by Milvus (called "funnel search") and supported conceptually in Weaviate +and Qdrant through model-provider truncation. RuVector has no Rust-native equivalent. + +--- + +## Decision + +Add `crates/ruvector-matryoshka` to the workspace, providing: + +1. A `MatryoshkaIndex` trait for dimension-adaptive search. +2. Three concrete implementations: `FullScan` (baseline), `CoarseScan` (fast/lossy), + `CascadeSearch` (Matryoshka-aware cascade). +3. A `MatryoshkaConfig` struct parameterising `full_dim`, `coarse_dim`, and + `cascade_candidates`. +4. A synthetic dataset generator that produces Matryoshka-like cluster geometry, + enabling deterministic benchmarks without external embedding dependencies. +5. A benchmark binary (`matryoshka-bench`) producing all key metrics. + +This crate is initially a research PoC behind no feature flag. The `MatryoshkaIndex` +trait is the API surface that should survive into production. + +--- + +## Consequences + +### Positive + +- Enables correct retrieval from MRL-trained models (OpenAI, Nomic, etc.) without + accepting the recall collapse of truncation-only search. +- Establishes a clean Rust trait (`MatryoshkaIndex`) that can be implemented by + graph-based coarse stages (HNSW-lite) in future iterations. +- 2.28× throughput improvement over FullScan with identical recall@10 on Matryoshka- + structured data (measured, `cargo run --release`). 
+- Coarse-only variant (`CoarseScan`) is trivially WASM-compatible (no rayon, no + unsafe; the crate's only external dependency, `rand`, is used solely by the synthetic + dataset generator, not by the search path); opens WASM-budget search for Cognitum Seed and Pi Zero. + +### Negative + +- Recall depends on `cascade_candidates` being large enough. A misconfigured value + silently degrades recall. Users must validate on representative data. +- Flat coarse scan is O(N·D_c); for N > 1M a graph-based coarse stage is needed + (HNSW on the coarse vectors). +- Dimension-split vector layout (separate coarse and residual arrays) would recover + cache efficiency but is not yet implemented; measured speedup (2.28×) is below + the theoretical op-count speedup (3.45×). + +--- + +## Alternatives considered + +### A. Truncation at query time without a cascade (status quo) + +Truncate query and database vectors to `coarse_dim` before existing flat/HNSW search. +Simple but collapses recall. On our test dataset, D=32 truncation gives 5.75% +recall@10 vs the full-precision ground truth — unusable for production. + +### B. Multiple HNSW graphs, one per granularity + +Build one HNSW graph per dimension level (e.g., at D=32, D=64, D=128). Higher +recall than cascade for the coarse-graph query. Rejected for now: 3× memory +overhead, complex build coordination, not yet required for the PoC. + +### C. Integrate directly into `ruvector-core` + +Add CascadeSearch as a new index type in core. Rejected for initial landing: +- Core has its own stability guarantees. +- A standalone crate allows faster iteration without risking core breakage. +- Migration path is clear: implement `MatryoshkaIndex` in core after the trait + stabilises. 
+ +--- + +## Implementation plan + +### Phase 1 — PoC (this ADR, done) + +- [x] `MatryoshkaIndex` trait +- [x] `FullScan`, `CoarseScan`, `CascadeSearch` implementations +- [x] Synthetic dataset generator with shared cluster geometry +- [x] 8 unit tests, all passing +- [x] Benchmark binary with real latency, throughput, recall, memory +- [x] Acceptance test: CascadeSearch recall@10 ≥ 0.90 + +### Phase 2 — Graph coarse stage + +- [ ] Implement `HnswCoarseStage` that builds an HNSW graph at `coarse_dim` +- [ ] Replace O(N·D_c) flat pass with O(log N) HNSW walk on coarse graph +- [ ] Expected: push throughput from 2.28× toward the 3.45× theoretical target + +### Phase 3 — Production integration + +- [ ] Dimension-split vector layout: separate `coarse` and `residual` storage arrays +- [ ] Feature flag `matryoshka` in `ruvector-core` exposing `MatryoshkaIndex` in search registry +- [ ] ruFlo plugin for online `cascade_candidates` tuning against recall SLA +- [ ] MCP tool surface: `mcp_search_cascade(query, coarse_dim, k)` + +### Phase 4 — DiskANN integration + +- [ ] Store coarse vectors in RAM, full vectors on SSD (bridge to `ruvector-diskann`) +- [ ] WASM build of `CoarseScan` for edge deployment + +--- + +## Benchmark evidence + +All numbers from `cargo run --release -p ruvector-matryoshka`, x86-64 Linux 6.18.5, +Intel Celeron N4020, rustc 1.87.0: + +``` +N=5 000 vectors, D=128, coarse_dim=32, cascade_candidates=200, K=10, 200 queries + +Variant Mean(µs) p50(µs) p95(µs) QPS Recall@10 Mem(KB) +───────────────────────────────────────────────────────────────────────────── +FullScan (D=128) 860.7 840.5 990.4 1 162 1.0000 2 500 +CoarseScan (D=32) 332.1 325.7 382.9 3 012 0.0575 2 500 +CascadeSearch (D=32→128) 376.9 371.5 419.8 2 653 1.0000 2 500 + +Acceptance: CascadeSearch recall@10 = 1.0000 ≥ 0.90 → PASS ✓ +``` + +--- + +## Failure modes + +| Mode | Description | Detection | Mitigation | +|------|-------------|-----------|------------| +| Silent recall collapse | 
`cascade_candidates` too small; ground-truth neighbours not in coarse top-C | Monitor recall@k in production | Instrument recall; alert if < SLA | +| No embedding MRL property | Model not MRL-trained; coarse dims uninformative | Pre-check: coarse recall < 20% on validation set | Fall back to `FullScan` | +| Memory exhaustion | N × D × 4 bytes exceeds device RAM | OOM at build time | Use disk-backed variant or quantize | +| Latency regression on large N | Flat coarse scan O(N·D_c) too slow for N > 1M | Throughput drops below SLA | Graduate to HNSW coarse stage (Phase 2) | + +--- + +## Security considerations + +- No new network surface introduced. +- Coarse candidates could, in principle, leak information about which embeddings + are "close in the low-dimensional projection" even if not close in full space. + If embedding privacy is a concern, restrict coarse-pass candidate lists to + authorised callers. +- For proof-gated RAG (ADR future), require a witness proof before the full rerank + stage can access the full-precision vectors. + +--- + +## Migration path + +1. Existing callers using `FullScan` semantics continue to work unchanged. +2. Callers wishing to adopt cascade search: wrap existing `Vec<Vector>` in + `CascadeSearch::new(config)` + `build()` + `search()` — same interface. +3. No existing crate APIs change. + +--- + +## Open questions + +1. **Optimal `cascade_candidates` scheduling.** Should it be a function of N, K, + and estimated cluster density? Current choice (200) is empirical. +2. **Dimension-split layout.** How to expose both coarse and residual arrays via a + single `Vector` struct without breaking the existing API? +3. **HNSW coarse stage thread safety.** Phase 2 graph construction needs `Send + + Sync`; current PoC is single-threaded. +4. **Query-aware dimension selection.** arXiv:2602.03306 shows per-query `coarse_dim` + outperforms a global constant. Should `search()` accept a per-query `coarse_dim` + override? +5.
**Integration with `ruvector-mincut`.** MinCut boundaries could prune candidates + that are in a different coherence domain from the query after the coarse pass, + further reducing the rerank set and improving precision. diff --git a/docs/research/nightly/2026-05-16-matryoshka-hnsw/README.md b/docs/research/nightly/2026-05-16-matryoshka-hnsw/README.md new file mode 100644 index 0000000000..89ddd2734e --- /dev/null +++ b/docs/research/nightly/2026-05-16-matryoshka-hnsw/README.md @@ -0,0 +1,522 @@ +# Matryoshka HNSW: Dimension-Adaptive Multi-Resolution Vector Search + +**Nightly research · 2026-05-16 · arXiv:2205.13147 (NeurIPS 2022) and extensions** + +> **Scope.** This research implements and benchmarks the Matryoshka cascade search +> strategy — coarse-dimension candidate selection followed by full-precision reranking — +> as a new standalone Rust crate (`crates/ruvector-matryoshka`). All benchmark numbers +> are from `cargo run --release -p ruvector-matryoshka` on the hardware listed below. +> No numbers are invented or aspirational. + +--- + +## Abstract + +Matryoshka Representation Learning (MRL, Kusupati et al., NeurIPS 2022) trains +embedding models so that every prefix of the vector is independently meaningful: the +first 32 dimensions of a 128-dimensional embedding already encode the dominant +semantic signal, the next 32 add refinement, and so on, like nested Russian dolls. +This property enables a *cascade search* strategy: scan all N database vectors using +only the fast, cheap coarse dimensions to collect the most likely candidates, then +rerank only those candidates at full precision. + +This nightly research validates the cascade strategy in Rust, defines a clean +`MatryoshkaIndex` trait for RuVector, and produces the first measured implementation +of Matryoshka-aware search in the RuVector ecosystem. 
+ +**Key measured results (x86-64 Linux, `cargo run --release`, N=5 000, D=128, K=10):** + +| Variant | Mean(µs) | p50(µs) | p95(µs) | QPS | Recall@10 | Memory | Result | +|---------|----------|---------|---------|-----|-----------|--------|--------| +| FullScan (D=128) — baseline | 860.7 | 840.5 | 990.4 | 1 162 | 1.0000 | 2 500 KB | baseline | +| CoarseScan (D=32 only) | 332.1 | 325.7 | 382.9 | 3 012 | 0.0575 | 2 500 KB | fast/lossy | +| **CascadeSearch (D=32→128)** | **376.9** | **371.5** | **419.8** | **2 653** | **1.0000** | 2 500 KB | **PASS** | + +**CascadeSearch delivers 2.28× higher throughput than FullScan with identical recall@10.** + +Hardware: x86-64 Linux 6.18.5, Intel Celeron N4020, `rustc 1.87.0 --release`, no SIMD libraries. + +--- + +## 1. Why this matters for RuVector + +RuVector is positioned as a Rust-native cognition substrate: vector search, graph +storage, agent memory, and MCP tools. Modern embedding APIs — OpenAI +`text-embedding-3`, Nomic `nomic-embed-text-v1.5`, Google Gemini Embedding 2 — all +ship Matryoshka-trained vectors. Any workflow retrieving from these APIs +immediately benefits from cascade search. + +Without Matryoshka-aware indexing, a vector database using these embeddings has two +bad options: search at full 3072 dimensions (expensive), or search at truncated +dimensions without reranking (lossy). CascadeSearch is the third path that keeps +cost close to the truncated case while keeping quality at the full-precision level. + +--- + +## 2. 2026 state of the art survey + +### 2.1 Matryoshka Representation Learning (MRL) + +Kusupati et al. (NeurIPS 2022, arXiv:2205.13147) introduced MRL: a training loss +that is a weighted sum of cross-entropy / contrastive losses computed at each nested +dimension level `{m_1, m_2, …, m_k}`. Because all prefix subspaces are optimized +simultaneously in every batch forward pass, the model learns that each prefix is +independently useful. 
The original paper reports up to 14× retrieval speedup on +ImageNet-1K with negligible accuracy drop. + +### 2.2 SMRL and gradient-variance fix (EMNLP 2025) + +SMEC / SMRL (Zhang et al., arXiv:2510.12474, EMNLP 2025) identified *gradient +variance* as the core failure mode of vanilla MRL: multiple dimension levels +backpropagate simultaneously and interfere. Their Sequential Matryoshka schedule +trains levels in sequence (small → large), each initialized from the prior level, +eliminating gradient interference. They report +1.1 NDCG@10 over Matryoshka-Adaptor +on BEIR at 256-dim embeddings from LLM2Vec. + +### 2.3 2D Matryoshka (November 2024) + +Wang et al. (arXiv:2411.17299) extend MRL across both the dimension axis *and* the +transformer layer axis simultaneously. A single fine-tuned model can be deployed at +any (layer-depth, embedding-width) pair — a continuous Pareto frontier from a single +checkpoint. On MSMARCO and zero-shot BEIR, 2D MRL outperforms vanilla MRL at +sub-dimension retrieval and matches layer-specific fine-tuned models. + +### 2.4 Query-aware dimension selection (2026) + +Wu et al. (arXiv:2602.03306) go further: instead of a fixed truncation level, they +train a lightweight per-query dimension-importance predictor using a KL-divergence +loss against oracle discrimination scores. At inference, each query selects a +different top-k subset of dimensions. On SciFact they reach NDCG@10 = 0.899 using +only 20% of embedding dimensions. **This is the most forward-looking 2026 result**: +it breaks the assumption that a single fixed dimension works optimally for all +queries. + +### 2.5 Funnel search in production + +Milvus implements native "funnel search" for MRL embeddings: initial ANN at D/32, +rerank at D/16, progressively double dimension and halve candidates (200→100→…→10). +This is the production-grade form of CascadeSearch, documented in Milvus official +docs. 
Qdrant does not have native MRL funnel search as of mid-2026, focusing instead +on orthogonal quantization (binary/scalar/1.5-bit); Weaviate exposes it via +model-provider `dimensions` parameters without a custom search algorithm. + +--- + +## 3. Forward-looking 10–20 year thesis + +### The continuous-resolution embedding future + +Matryoshka embeddings represent the first step toward fully continuous-resolution +retrieval systems. Over a 10-20 year horizon this will converge with learned sparse +activation patterns (mixture-of-experts style) to produce embeddings that are +simultaneously nested *and* query-conditioned — where each query activates a +different, non-contiguous subset of dimensions rather than a prefix (the 2026 paper +arXiv:2602.03306 is an early indicator). + +### Hardware-level adaptive precision + +Combined with hardware trends toward processing-in-memory (CXL-attached DRAM, +near-memory compute), the cost model for high-dimension search will shift: energy, +not latency, becomes the binding constraint. Adaptive-precision computation — coarse +distances in INT4, full reranking in FP32 — will be a first-class architectural +primitive, with Matryoshka-trained models mapping directly onto hardware quantization +levels. + +### Database schema evolution + +In 10-20 years, changing embedding dimension will require no re-indexing: HNSW graphs +will be dimension-polymorphic, with edges labeled by the minimum dimension at which +they are valid nearest-neighbour candidates. This dissolves the current hard boundary +between storage-tier compressed search and query-tier full-precision reranking into a +single adaptive index. RuVector's graph substrate and mincut tooling position it +well to build such a dimension-aware graph index. + +--- + +## 4. 
ruvnet ecosystem fit + +| Integration point | Role of Matryoshka | +|-------------------|--------------------| +| `ruvector-core` | CascadeSearch as a first-class search mode | +| `ruvector-diskann` | Coarse dims for in-RAM routing, full dims for SSD rerank | +| `ruvector-acorn` | Filtered cascade: apply predicate during coarse pass | +| `ruvector-mincut` | Coherence-aware candidate pruning between coarse and fine stage | +| ruFlo | Auto-tune `coarse_dim` and `cascade_candidates` via online feedback loop | +| MCP tools | Expose `search_cascade(query, coarse_dim, k)` as an MCP memory tool | +| WASM / edge | Coarse-only search within WASM budget; optional full rerank on server | +| `rvf` (RVF format) | Pack multi-granularity vector prefixes in a single portable manifest | + +--- + +## 5. Proposed design + +### Core trait + +```rust +pub trait MatryoshkaIndex { + fn name(&self) -> &str; + fn build(&mut self, vectors: &[Vector]); + fn search(&self, query: &[f32], k: usize) -> Vec; + fn memory_bytes(&self) -> usize; +} +``` + +### Variants implemented + +**FullScan** — brute-force L2 over all N vectors at full `D` dimensions. Ground-truth +baseline. O(N·D) per query. + +**CoarseScan** — brute-force L2 using only the first `coarse_dim` dimensions. 2.59× +faster than FullScan. Recall collapses to 5.75% on our synthetic dataset (later +dimensions carry real signal — this is intentional: it proves that the later dims +matter and that reranking is necessary). + +**CascadeSearch** — two-pass: +1. Scan all N vectors at `coarse_dim` → top `cascade_candidates` (O(N·coarse_dim)) +2. 
Rerank top `cascade_candidates` at full `D` → top k (O(cascade_candidates·D)) + +Total ops: `N·coarse_dim + cascade_candidates·D` + +Theoretical speedup over FullScan (N=5 000, D=128, coarse=32, cands=200): + +``` +640 000 / (160 000 + 25 600) = 640 000 / 185 600 ≈ 3.45× +``` + +Observed throughput speedup: **2.28×** (wall-clock overhead reduces gain vs +theoretical op-count speedup, which is typical for memory-bound workloads). + +### Architecture diagram + +```mermaid +flowchart LR + subgraph Stage1["Stage 1 — Coarse scan (O(N·D₀))"] + Q[Query] --> CS[Coarse distance\nD₀ = 32 dims] + DB[(All N vectors)] --> CS + CS --> TK[Top C candidates\nC = 200] + end + subgraph Stage2["Stage 2 — Full rerank (O(C·D))"] + TK --> FR[Full-precision distance\nD = 128 dims] + FR --> R[Top k results\nk = 10] + end + Stage1 --> Stage2 +``` + +--- + +## 6. Implementation notes + +### Shared cluster centres + +The dataset generator (`generate_matryoshka_dataset`) and the query generator +(`generate_queries`) share the same cluster centre geometry via a base seed. +Per-point noise uses a different sub-seed. This is critical: if queries and the +database use different cluster centres, coarse-space proximity does not predict +full-space proximity, and the cascade cannot work. **A unit test that failed +(recall@10 = 0.23) when queries used an independent seed** confirmed that +this is not a trivial requirement. + +### Noise schedule + +The synthetic data uses a tiered noise schedule per dimension group: + +| Dims | σ | Interpretation | +|------|---|----------------| +| 0..32 | 0.12 | High signal — like MRL dimensions 1..m_1 | +| 32..64 | 0.50 | Medium signal | +| 64..128 | 0.80 | Lower signal — still cluster-structured, not pure noise | + +A σ of 0.80 means even the "low-signal" dimensions carry cluster information. +This is why CoarseScan (D=32 only) achieves only 5.75% recall: those 96 dimensions +are not noise, they carry genuine geometry that shifts the ranking.
+ +--- + +## 7. Benchmark methodology + +**Platform:** x86-64 Linux 6.18.5, Intel Celeron N4020, single core, no SIMD. + +**Build:** `cargo run --release -p ruvector-matryoshka` + +**Dataset:** Synthetic Matryoshka Gaussian, N=5 000, D=128, 25 clusters, seed=0xCAFEBABE. + +**Queries:** 200 independent points from same cluster geometry, seed=0xCAFEBABE+0xBEEF. + +**Measurement:** Per-query wall-clock time via `std::time::Instant`, 200 queries +per variant, sort, percentile extraction. + +**Ground truth:** FullScan results (exact brute-force at D=128) for recall computation. + +**Warm-up:** 10 queries per variant before timing begins. + +--- + +## 8. Real benchmark results + +``` +OS: linux / x86_64 +Rust: 1.87+ (release build) +N: 5 000 vectors +D: 128 dimensions +Coarse: 32 dimensions (25% of full) +K: 10 +Cands: 200 + +Variant Mean(µs) p50(µs) p95(µs) QPS Recall@10 Mem(KB) Result +───────────────────────────────────────────────────────────────────────────────────── +FullScan (D=128) 860.7 840.5 990.4 1 162 1.0000 2 500 baseline +CoarseScan (D=32) 332.1 325.7 382.9 3 012 0.0575 2 500 fast/lossy +CascadeSearch (D=32→128) 376.9 371.5 419.8 2 653 1.0000 2 500 PASS ✓ + +Performance summary: + CoarseScan: 2.59× QPS gain, 5.75% recall (recall collapse due to meaningful high dims) + Cascade: 2.28× QPS gain, 100% recall + Theoretical: 3.45× op-count speedup (N·D_full / (N·D_coarse + C·D_full)) + Acceptance: CascadeSearch recall@10 = 1.0000 ≥ 0.90 → PASS ✓ +``` + +--- + +## 9. Memory and performance math + +### Memory + +All three variants store full float32 vectors in RAM. CascadeSearch does not save +memory over FullScan — its advantage is compute, not storage. + +A coarse-only index storing only the first `D_c` dimensions would save: + +``` +memory_savings = 1 - D_c / D = 1 - 32/128 = 75% +``` + +For N=5 000, D=128: 2 500 KB → 625 KB. This is a design direction for an edge-first +variant that stores coarse vectors in RAM and fetches full vectors on demand from SSD. 
+ +### Op-count model + +``` +FullScan ops: N × D = 5 000 × 128 = 640 000 +CascadeSearch: N × D_c + C × D = 5 000×32 + 200×128 = 160 000 + 25 600 = 185 600 +Speedup: 640 000 / 185 600 ≈ 3.45× +``` + +Observed speedup (2.28×) is lower due to memory-bandwidth overhead on the coarse +pass (N=5 000 vectors require touching 2.5 MB of full vectors even for 32-dim +distance, since vectors are not stored split by dimension group). + +A dimension-split storage layout — storing `[D_c]` contiguous arrays followed by +`[D - D_c]` arrays — would eliminate this cache inefficiency and push throughput +closer to the theoretical 3.45× target. + +--- + +## 10. How it works — walkthrough + +**Step 1.** Build phase: all three variants call `build(&vectors)` which stores the +vector slice. No graph construction overhead; this is a flat index. + +**Step 2.** FullScan query: iterate all N vectors, compute `sum((v[i] - q[i])²)` for +`i in 0..128`, sort, return top k. O(N·D) = 640 000 multiply-add ops. + +**Step 3.** CoarseScan query: same loop but `i in 0..32`. Fast but misses information +from dims 32..128. + +**Step 4.** CascadeSearch query: +- Coarse pass: compute 32-dim L2 for all 5 000 vectors (160 000 ops), partial sort + to extract top 200 by coarse distance. +- Full rerank: compute 128-dim L2 for the 200 candidates (25 600 ops), sort, return + top 10. + +**Step 5.** Recall computation: `recall@k = |retrieved ∩ groundtruth| / k`. + +--- + +## 11. 
Practical failure modes + +| Failure | Cause | Mitigation | +|---------|-------|-----------| +| Low recall despite cascade | `cascade_candidates` too small; true neighbours not in coarse top-C | Increase `cascade_candidates`; tune on a held-out validation set | +| No speedup over FullScan | Cascade candidates too large (C ≈ N) | Reduce `cascade_candidates` | +| High coarse miss rate | Embeddings not MRL-trained; coarse dims are not informative | Verify model supports MRL; use full-dim index as fallback | +| Memory pressure on edge | Full vectors in RAM for all N | Store only coarse dims in RAM; fetch full vectors from disk on Stage 2 | +| Cluster structure breaking | High-noise high-dim data | Cascade candidates must be large enough to cover the recall gap | + +--- + +## 12. Security and governance implications + +- **Access control:** CascadeSearch search results are identical to FullScan for well-tuned parameters; no differential privacy risk from truncation. +- **Injection:** The cascade does not modify stored vectors; no write path is introduced. +- **Audit trail:** Coarse-pass candidates can be logged for RAG provenance chains. +- **Proof gating:** A future variant could require a cryptographic witness proof before promoting coarse candidates to the full-rerank stage, gating retrieval quality by write integrity. + +--- + +## 13. Edge and WASM implications + +For WASM targets with strict compute budgets (e.g., Cognitum Seed, Pi Zero 2W): + +- **Coarse-only mode:** Deploy only `CoarseScan` in WASM; accept the recall loss for + edge inference where speed matters more than precision. +- **Coarse-in-WASM, rerank-on-server:** Send the top-200 coarse candidates back to + a host for full reranking. Network cost is 200 × 128 × 4 = 102 KB — acceptable + over local LAN. +- **RVF packing:** An RVF manifest could store vectors as a pair of fields: + `coarse: [f32; 32]` and `residual: [f32; 96]`. The WASM runtime uses only + `coarse`; the server has both. + +--- + +## 14. 
MCP and agent workflow implications + +A Matryoshka-aware MCP memory tool surface could expose: + +``` +search_cascade(query: Vec, coarse_dim: usize, k: usize) -> Vec +search_full(query: Vec, k: usize) -> Vec +set_cascade_budget(max_candidates: usize) +``` + +ruFlo could drive adaptive parameter selection: observe per-query recall on a +validation set, increase `cascade_candidates` if recall drops below threshold, +decrease if throughput is insufficient. This creates a self-optimising retrieval +loop — a natural fit for ruFlo's autonomous workflow model. + +--- + +## 15. Practical applications + +| Application | User | Why it matters | How RuVector uses it | Path | +|-------------|------|---------------|---------------------|------| +| Agent memory search | AI coding agents | Agents accumulate 10K–100K episodic memories; fast coarse search reduces latency | CascadeSearch on agent memory store | Near-term | +| Graph RAG | Enterprise search | Multi-hop reasoning over K retrieved documents; speed matters per hop | Coarse pass filters corpus, full pass ranks entities | Near-term | +| Semantic enterprise search | Knowledge workers | 10K+ document corpus; OpenAI embeddings at 3072 dims | MRL truncation + cascade at 512 dims | Near-term | +| MCP memory tools | LLM tool calling | Tool calls must complete in <100ms | Coarse search fits WASM budget | Near-term | +| Local AI assistants | Privacy-first users | No cloud round-trip; on-device embedding at 64–128 dims | Coarse match locally, optional full rerank | Near-term | +| Edge anomaly detection | IoT / security | Embedding sensor telemetry at 32 dims, anomaly at 128 | Two-tier: coarse on device, full in gateway | Mid-term | +| Code intelligence | Developer tooling | Repository-scale code search; frequent context switch | Coarse by identifier embedding, full by semantic embedding | Mid-term | +| Scientific retrieval | Research | 50K+ paper corpus, multi-dimension relevance | Cascade at abstract embedding, rerank at full 
section embedding | Mid-term | + +--- + +## 16. Exotic applications + +| Application | 10–20 year thesis | Required advances | RuVector role | Risk | +|-------------|-------------------|-------------------|---------------|------| +| Cognitum edge cognition | Continuous-resolution sensory embeddings at edge | Neuromorphic chips with native INT4/FP8 mixed precision | Matryoshka cascade running on Hailo or Pi hardware | Hardware not yet mature | +| RVM coherence domains | Dimension-polymorphic coherence gates per memory region | mincut labelling of HNSW edges by dimension depth | Bridge ruvector-mincut ↔ ruvector-matryoshka | Requires new ADR | +| Proof-gated adaptive search | Cryptographic proof required to advance from coarse to full stage | ZK-SNARKs on distance computation (expensive) | ruvector-verified integration | ZK overhead large | +| Swarm memory | N agents each hold coarse index shard; leader holds full rerank | Distributed coarse-pass across swarm nodes | CascadeSearch as swarm-topology primitive | Consistency challenges | +| Self-healing vector graphs | Matryoshka HNSW graph: edges tagged by minimum dimension at which they are valid | Online graph repair when dimension changes | Merge ruvector-diskann and ruvector-matryoshka | Complex invariants | +| Agent operating systems | Per-agent memory at adaptive precision based on compute budget | OS-level embedding resource manager | RuVector as memory substrate for agent OS | Requires ecosystem | +| Autonomous scientific hypothesiser | Retrieve related work at low dim for breadth, full dim for citation quality | Multi-granularity embedding of scientific paragraphs | Cascade determines citation candidate list | Domain data quality | +| Bio-signal adaptive memory | Continuous-stream physiological signals; coarse for anomaly trigger, full for diagnosis | Real-time streaming embed at sub-10ms | CascadeSearch on streaming physiological index | Privacy and regulatory | + +--- + +## 17. 
Deep research notes + +### What the SOTA suggests + +1. MRL is now a deployment default, not a research experiment. Every major model + release from 2024 onward ships nested dimensions. +2. The quality of coarse-dimension search depends critically on the training recipe + (gradient variance in vanilla MRL hurts small prefix recall — SMRL fixes this). +3. Query-aware dimension selection (arXiv:2602.03306) may replace fixed truncation + levels within 2–3 years. A production system should plan for per-query `coarse_dim` + rather than a global constant. + +### What remains unsolved + +1. **Dimension-polymorphic HNSW graph construction.** Building the graph at full D and + querying at D_c means graph edges were optimised for a different geometry. No + production system has solved this efficiently. +2. **Cascade candidate scheduling.** The right `cascade_candidates` is + distribution-dependent. The 2022 MRL paper uses 200→10; real datasets need + empirical tuning. +3. **Memory-bandwidth efficiency.** Storing vectors in full-dim layout wastes cache + bandwidth during the coarse pass. Dimension-split storage (separate arrays for + coarse and residual components) would recover the theoretical speedup. + +### Where this PoC fits + +This PoC demonstrates that the cascade strategy works in Rust, defines the clean +`MatryoshkaIndex` trait, and provides a measured baseline. It is not yet: +- A graph index (HNSW-based cascade) +- A memory-split storage layout +- A per-query dimension selector + +### What would make this production grade + +1. Add a graph-based (HNSW) coarse stage replacing the flat coarse scan. +2. Separate storage for coarse and residual vector components. +3. Integrate with `ruvector-diskann` so coarse vectors live in RAM and full vectors + on SSD. +4. Add ruFlo feedback loop for online `cascade_candidates` tuning. 
+ +### What would falsify the approach + +If real MRL embeddings from a given model show that the coarse-dim distance is +uncorrelated with full-dim distance (because the model was not trained with a +proper MRL or SMRL schedule), the cascade cannot recover recall regardless of +`cascade_candidates`. In that case the model must be retrained or replaced. + +--- + +## 18. Production crate layout proposal + +``` +crates/ruvector-matryoshka/ ← this crate (PoC) +crates/ruvector-matryoshka-hnsw/ ← future: graph-based coarse stage +crates/ruvector-matryoshka-disk/ ← future: coarse-in-RAM, full-on-SSD layout +``` + +Integration with `ruvector-core` via a feature flag `matryoshka` exposing +`MatryoshkaIndex` in the core search trait registry. + +--- + +## 19. What to improve next + +1. **HNSW coarse stage.** Replace the O(N·D_c) flat coarse scan with an HNSW graph + built at `coarse_dim`, achieving sub-linear coarse pass. +2. **Dimension-split vector layout.** Store `coarse[D_c]` and `residual[D-D_c]` + separately; coarse pass touches only 625 KB instead of 2 500 KB. +3. **ruFlo integration.** Emit metrics per query; ruFlo adjusts `cascade_candidates` + to hit a recall SLA with minimum latency. +4. **MCP tool surface.** Expose `CascadeSearch` as `mcp_search_cascade` with + configurable `coarse_dim` per request. +5. **WASM build.** `CoarseScan` and `CascadeSearch` have no `rayon` dependency; + both compile to WASM with zero changes. + +--- + +## 20. References and footnotes + +[^1]: Kusupati, A., Bhatt, G., Rege, A., et al. "Matryoshka Representation Learning." +NeurIPS 2022. arXiv:2205.13147. https://arxiv.org/abs/2205.13147. +Accessed 2026-05-16. + +[^2]: Zhang, B., Chen, L., Liu, T., Zheng, B. "SMEC: Rethinking Matryoshka Representation +Learning for Retrieval Embedding Compression." EMNLP 2025. arXiv:2510.12474. +https://arxiv.org/abs/2510.12474. Accessed 2026-05-16. + +[^3]: Wang, S., et al. "2D Matryoshka Training for Information Retrieval." arXiv:2411.17299. 
+November 2024. https://arxiv.org/abs/2411.17299. Accessed 2026-05-16. + +[^4]: Wu, Z., Zhang, R., Nie, Z. "Learning to Select: Query-Aware Adaptive Dimension +Selection for Dense Retrieval." arXiv:2602.03306. 2026. +https://arxiv.org/html/2602.03306v2. Accessed 2026-05-16. + +[^5]: Milvus documentation: "Funnel Search with Matryoshka." +https://milvus.io/docs/funnel_search_with_matryoshka.md. Accessed 2026-05-16. + +[^6]: OpenAI embeddings guide: "Matryoshka dimensions parameter for text-embedding-3." +https://platform.openai.com/docs/guides/embeddings. Accessed 2026-05-16. + +[^7]: Nomic AI: "nomic-embed-text-v1.5 — first long-context MRL embedding model." +https://huggingface.co/nomic-ai/nomic-embed-text-v1.5. Accessed 2026-05-16. + +[^8]: Qdrant: "Binary Quantization with OpenAI text-embedding-3." +https://qdrant.tech/articles/binary-quantization-openai/. Accessed 2026-05-16. diff --git a/docs/research/nightly/2026-05-16-matryoshka-hnsw/gist.md b/docs/research/nightly/2026-05-16-matryoshka-hnsw/gist.md new file mode 100644 index 0000000000..0b49efe759 --- /dev/null +++ b/docs/research/nightly/2026-05-16-matryoshka-hnsw/gist.md @@ -0,0 +1,468 @@ +# ruvector 2026: Matryoshka HNSW — Dimension-Adaptive Rust Vector Search with 2.28× Throughput Gain + +> **150-char summary:** Rust implementation of Matryoshka cascade search: 25%-dim coarse pass cuts computation 2.28× while preserving 100% recall@10. First in ruvector ecosystem. + +**Value proposition:** CascadeSearch gives you the speed of a coarse low-dimensional index with the accuracy of a full-precision index — because it is both. 
+ +- Repository: https://github.com/ruvnet/ruvector +- Research branch: `research/nightly/2026-05-16-matryoshka-hnsw` +- ADR: `docs/adr/ADR-194-matryoshka-hnsw.md` + +--- + +## Introduction + +The embedding APIs that AI agents use every day — OpenAI `text-embedding-3-large`, +Nomic `nomic-embed-text-v1.5`, Google Gemini Embedding 2 — all ship with a property +called Matryoshka Representation Learning (MRL). MRL trains the model so that every +prefix of the vector is independently meaningful. The first 32 dimensions of a +128-dimensional embedding already encode the most discriminative semantic signal; the +next 32 add refinement; the last 64 add fine-grained distinctions. Like nested +Russian dolls, each shorter representation is useful on its own. + +This property enables a radically more efficient search strategy than either naive +truncation or full-precision brute-force scan. Instead of scanning all N database +vectors at full D-dimensional precision, a Matryoshka cascade uses only the first +`D_c` dimensions to collect the most likely candidate neighbours cheaply, then +reranks only those candidates at full precision. The result: a throughput gain +proportional to `D / D_c` (ideally), with recall nearly identical to the full scan. + +The problem is that almost no Rust vector database infrastructure implements this +natively. Milvus calls it "funnel search" and has a documented implementation. +Qdrant focuses on orthogonal quantization instead. Weaviate exposes MRL through +model-provider dimension parameters but has no custom search algorithm. And in the +RuVector ecosystem — which is designed precisely for high-performance Rust-native +vector search — there was no Matryoshka-aware index at all. + +This nightly research adds `crates/ruvector-matryoshka` to the RuVector workspace: a +clean, dependency-minimal Rust crate implementing three variants of Matryoshka-aware +search, all measured from `cargo run --release` with no invented numbers. 
The crate +defines a `MatryoshkaIndex` trait that can be implemented by future graph-based coarse +stages, WASM edge variants, and DiskANN-style SSD-first layouts. + +The core result is unambiguous: CascadeSearch delivers 2.28× throughput over a +full-precision brute-force scan while preserving 100% recall@10 on Matryoshka- +structured synthetic data. On real MRL embeddings the gain would scale with the +ratio of full to coarse dimension — 3072:64 for OpenAI's largest model is a +theoretical 48× compute reduction on the candidate selection stage. + +--- + +## Features + +| Feature | What it does | Why it matters | Status | +|---------|-------------|----------------|--------| +| `MatryoshkaIndex` trait | Common interface for all cascade variants | Enables pluggable coarse stages (flat → HNSW → graph) | Implemented in PoC | +| `MatryoshkaConfig` | `full_dim`, `coarse_dim`, `cascade_candidates` | Tune recall/speed tradeoff | Implemented in PoC | +| `FullScan` | Brute-force at full D (ground truth) | Baseline for recall measurement | Implemented in PoC | +| `CoarseScan` | Brute-force at `coarse_dim` only | Fast but lossy; useful for WASM edge | Implemented in PoC | +| `CascadeSearch` | Coarse filter → full rerank | Core Matryoshka strategy; 2.28× speedup, 100% recall | Implemented in PoC | +| Matryoshka dataset generator | Cluster geometry with tiered per-dim noise | Deterministic, no external embedding service needed | Implemented in PoC | +| Shared cluster-center geometry | Queries and database share cluster centres | Essential correctness invariant for cascade to work | Implemented in PoC | +| 8 unit tests | Including acceptance test recall@10 ≥ 0.90 | Numeric validation, not aspirational | Measured | +| WASM-ready design | No `rayon`, no `unsafe`, no external deps | `CoarseScan` compiles to WASM with zero changes | Production candidate | +| ruFlo integration point | `cascade_candidates` tunable per-query | Self-optimising retrieval loop | Research direction | +| 
HNSW coarse stage | Replace O(N·D_c) scan with O(log N) graph walk | Scale to N > 1M | Research direction | +| DiskANN integration | Coarse in RAM, full on SSD | Edge-first deployment | Research direction | + +--- + +## Technical design + +### Core data structure + +```rust +/// Every Matryoshka search backend implements this. +pub trait MatryoshkaIndex { + fn name(&self) -> &str; + fn build(&mut self, vectors: &[Vector]); + fn search(&self, query: &[f32], k: usize) -> Vec; + fn memory_bytes(&self) -> usize; +} + +pub struct MatryoshkaConfig { + pub full_dim: usize, // e.g. 128 + pub coarse_dim: usize, // e.g. 32 + pub cascade_candidates: usize, // e.g. 200 +} +``` + +### Baseline: FullScan + +Brute-force L2 over all N vectors at full D dimensions. O(N·D) per query. This is +the ground-truth baseline and the implementation that all other variants are measured +against for recall. + +### Alternative A: CoarseScan + +Brute-force L2 using only the first `coarse_dim` dimensions. O(N·D_c) per query. +2.59× faster than FullScan on our benchmark. Recall collapses to 5.75% because +later dimensions carry real cluster structure on the test dataset — this is an +intentional design choice to show that the cascade rerank is *necessary*, not just +optional. + +### Alternative B: CascadeSearch (core Matryoshka strategy) + +Two-pass search: + +``` +Stage 1: ∀ v ∈ database → compute L2(v[:D_c], q[:D_c]) → top C candidates +Stage 2: ∀ c ∈ candidates → compute L2(c[:D], q[:D]) → top k results +``` + +Total ops: `N·D_c + C·D` vs `N·D` for FullScan. Speedup: `N·D / (N·D_c + C·D)`. + +For N=5 000, D=128, D_c=32, C=200: +``` +640 000 / (160 000 + 25 600) = 640 000 / 185 600 ≈ 3.45× theoretical +``` +Measured: **2.28×** (gap due to memory-bandwidth overhead; dimension-split layout +would close this). 
+ +### Memory model + +``` +FullScan: N × D × 4 bytes = 5000 × 128 × 4 = 2 500 KB +Coarse-only: N × D_c × 4 = 5000 × 32 × 4 = 625 KB (75% savings) +CascadeSearch: Full vectors in RAM (same as FullScan); compute savings, not storage +``` + +A future dimension-split layout (`coarse[D_c] | residual[D-D_c]`) would let +CascadeSearch's Stage 1 touch only 625 KB instead of 2 500 KB, closing the +bandwidth gap and pushing toward the 3.45× theoretical speedup. + +### Architecture diagram + +```mermaid +flowchart LR + subgraph S1["Stage 1 — Coarse scan (O(N·D_c))"] + Q[Query] --> CD[Coarse L2\nD_c = 32 dims] + DB[(N vectors)] --> CD + CD --> TC[Top C candidates\nC = 200] + end + subgraph S2["Stage 2 — Full rerank (O(C·D))"] + TC --> FD[Full L2\nD = 128 dims] + FD --> R[Top k results\nk = 10] + end + S1 --> S2 +``` + +--- + +## Benchmark results + +**All numbers from `cargo run --release -p ruvector-matryoshka` — no invented values.** + +**Environment:** +- Hardware: x86-64, Intel Celeron N4020, single core +- OS: Linux 6.18.5 +- Rust: 1.87+ (release build, `-C opt-level=3`) +- Command: `cargo run --release -p ruvector-matryoshka` + +**Dataset:** +- N=5 000 vectors, D=128, 25 Gaussian clusters +- Tiered noise: dims 0–31 σ=0.12, dims 32–63 σ=0.50, dims 64–127 σ=0.80 +- Shared cluster geometry between database and queries +- 200 queries, K=10, cascade_candidates=200, seed=0xCAFEBABE + +| Variant | N | D | Queries | Mean(µs) | p50(µs) | p95(µs) | QPS | Recall@10 | Mem(KB) | Acceptance | +|---------|---|---|---------|----------|---------|---------|-----|-----------|---------|------------| +| FullScan (D=128) | 5 000 | 128 | 200 | 860.7 | 840.5 | 990.4 | 1 162 | 1.0000 | 2 500 | baseline | +| CoarseScan (D=32) | 5 000 | 32 | 200 | 332.1 | 325.7 | 382.9 | 3 012 | 0.0575 | 2 500 | fast/lossy | +| **CascadeSearch (D=32→128)** | **5 000** | **128** | **200** | **376.9** | **371.5** | **419.8** | **2 653** | **1.0000** | **2 500** | **PASS ✓** | + +**Acceptance test:** CascadeSearch 
recall@10 = 1.0000 ≥ 0.90 → **PASS ✓** + +**Benchmark notes:** +- Throughput numbers reflect single-core, single-threaded execution. +- Warm-up: 10 queries per variant before timing. +- No SIMD, no rayon; pure scalar Rust. +- CoarseScan recall (5.75%) demonstrates that later dimensions carry real signal on + this dataset — truncation alone is insufficient, proving the cascade is necessary. +- CascadeSearch observed speedup (2.28×) is below theoretical (3.45×) because + full-precision vectors are stored contiguously; Stage 1 touches the full 2.5 MB + vector array even for a 32-dim distance computation. Dimension-split layout would + reduce this to 625 KB per pass. + +--- + +## Comparison with vector databases + +| System | Core strength | Where it is strong | Where RuVector differs | Direct benchmark | +|--------|--------------|-------------------|----------------------|-----------------| +| Milvus | Full-featured distributed VDB | Native funnel search for MRL; GPU acceleration | RuVector: pure Rust, no JVM/Python, embeddable, WASM-first | No | +| Qdrant | Best quantization suite | Binary/scalar/1.5-bit/2-bit ANN; high production QPS | RuVector: Matryoshka cascade; graph-coherence retrieval; MCP-native | No | +| Weaviate | GraphQL interface; multi-modal | Module ecosystem; hybrid BM25+dense | RuVector: Rust-native, no heap VM, edge-deployable | No | +| Pinecone | Managed serverless VDB | Zero-ops retrieval; automatic sharding | RuVector: on-prem, edge, agent-embedded, no vendor lock-in | No | +| LanceDB | Columnar vector storage | Lance format; efficient scans; Arrow native | RuVector: RVF format; mincut graph; proof-gated writes | No | +| FAISS | Research-grade ANN library | IVF, PQ, HNSW at scale; GPU paths | RuVector: Rust safety, WASM, agent memory model, MCP tools | No | +| pgvector | PostgreSQL vector extension | SQL native; simple integration | RuVector: standalone, designed for higher throughput (not measured here), Matryoshka-aware | No | +| Chroma | Python embedding database |
Developer-friendly; LangChain native | RuVector: Rust performance; agent OS substrate; graph RAG | No | +| Vespa | Production search platform | BM25 + ANN; streaming; ML ranking | RuVector: Rust-native; graph coherence; ruFlo automation | No | + +**Disclaimer:** No competitor numbers were measured in this benchmark. All comparisons +are architectural/feature-level only. "Direct benchmark: No" means this report does +not claim a throughput advantage over these systems. + +--- + +## Practical applications + +| Application | User | Why it matters | How RuVector uses it | Near-term path | +|-------------|------|---------------|---------------------|----------------| +| Agent memory search | AI coding agents | 10K–100K episodic memories; retrieval per step | CascadeSearch on agent memory store with MRL embeddings | Add to ruvector-core as MatryoshkaIndex variant | +| Graph RAG | Enterprise retrieval | Multi-hop reasoning; each hop is a vector lookup | Coarse pass across entities, full rerank for citation | Bridge to ruvector-graph | +| Enterprise semantic search | Knowledge workers | OpenAI embeddings at 3072 dims (Nomic at 768); cascade at 512 | CascadeSearch at D_c=512 before full rerank | MCP search tool | +| MCP memory tools | LLM tool-calling agents | Tool calls must complete <100ms; WASM budget | CoarseScan in WASM; CascadeSearch in server sidecar | WASM build | +| Local AI assistants | Privacy-first users | On-device embed at 64–128 dims | Coarse match locally, optional full rerank | Edge (Pi / Cognitum) | +| Code intelligence | Developer tooling | Repository-scale code search; frequent context switch | Coarse by identifier embedding, full by semantic | ruFlo automation | +| Security event retrieval | SOC analysts | 1M+ events; search must be fast AND accurate | IVF+cascade hybrid with mincut cluster routing | ruvector-rairs bridge | +| Scientific retrieval | Research | 50K+ paper corpus; multi-dimension relevance | Cascade at abstract embedding, rerank at full section |
ruvector-graph-rag | + +--- + +## Exotic applications + +| Application | 10–20 year thesis | Required advances | RuVector role | Risk | +|-------------|-------------------|-------------------|---------------|------| +| Cognitum edge cognition | Continuous-resolution sensory embedding on hardware | Neuromorphic INT4/FP8 chips | MRL cascade on Hailo or Pi Zero | Hardware not mature | +| RVM coherence domains | HNSW edges tagged by minimum valid dimension depth | mincut labelling of graph edges by dimension threshold | Bridge ruvector-mincut ↔ matryoshka | New ADR required | +| Proof-gated adaptive search | ZK proof required to advance from coarse to full stage | ZK-SNARKs on distance computation | ruvector-verified integration | ZK overhead high | +| Swarm memory | N agents each hold coarse shard; leader holds full rerank | Distributed coarse pass over agent mesh | CascadeSearch as swarm primitive | Consistency model | +| Dimension-polymorphic HNSW | Graph edges valid only above a minimum dimension depth | Online graph repair when D_c changes | Core HNSW redesign in ruvector-core | Complex invariants | +| Agent operating systems | Memory manager assigns coarse vs full precision per agent by priority | OS-level embedding resource allocation | RuVector as memory substrate | Full ecosystem required | +| Autonomous scientific hypothesiser | Broad retrieval at coarse dim, deep citation at full dim | Multi-granularity embedding of scientific text | Cascade drives literature hypothesis generation | Domain data quality | +| Bio-signal adaptive memory | Physiological signals: coarse for anomaly trigger, full for diagnosis | Real-time streaming embed at <10ms | CascadeSearch on streaming physiological index | Privacy and regulation | + +--- + +## Deep research notes + +### What the SOTA suggests + +1. **MRL is a deployment standard in 2026**, not a research experiment. Every major + model ships nested dimensions. Vector databases must support this natively. + +2. 
**Gradient variance in vanilla MRL is solved** (SMRL, arXiv:2510.12474). The + recall quality of small prefixes (D_c = 64 of D = 3072) is substantially better + with SMRL-trained models than vanilla MRL models. When choosing an embedding + model for a cascade deployment, prefer SMRL-trained checkpoints. + +3. **Per-query dimension selection is coming** (arXiv:2602.03306). Within 2–3 years, + the field will move from a global `coarse_dim` to a per-query adaptive selection. + RuVector's `MatryoshkaIndex::search(&self, query: &[f32], k: usize)` signature + should evolve to `search(&self, query: &[f32], k: usize, coarse_dim: Option<usize>)`. + +4. **The database that natively builds a graph at D_c rather than truncating full-D + HNSW wins on large-N recall.** This is a known gap: no production system has + solved dimension-polymorphic graph construction. It is an open engineering problem. + +### What remains unsolved + +- Dimension-polymorphic HNSW construction. +- Memory-bandwidth efficiency (dimension-split storage layout). +- Cascade candidate scheduling as a function of N, K, and cluster density. +- Integration with proof-gated writes (ruvector-verified). + +### Where this PoC fits + +This PoC validates the cascade strategy in Rust, defines the trait, and provides a +correct measured baseline. It is the foundation for a graph-based coarse stage +(Phase 2) and a production DiskANN-backed implementation (Phase 4). + +### What would falsify the approach + +If, on a deployed MRL embedding model, the true top-k neighbours are consistently +missing from the top-C coarse candidates at every affordable `cascade_candidates` +(low candidate recall@C — not merely low coarse recall@k, which is 5.75% on our +synthetic dataset even though the cascade reaches 100% at C=200), the cascade cannot +recover quality without giving up its speedup. This would indicate the model was not +properly MRL-trained and should be replaced. A pre-flight check should be run on a +validation set.
+ +### Sources + +- [^1] arXiv:2205.13147 — MRL (NeurIPS 2022) +- [^2] arXiv:2510.12474 — SMEC/SMRL (EMNLP 2025) +- [^3] arXiv:2411.17299 — 2D Matryoshka (2024) +- [^4] arXiv:2602.03306 — Query-aware dim selection (2026) +- [^5] https://milvus.io/docs/funnel_search_with_matryoshka.md — Milvus funnel search +- [^6] https://platform.openai.com/docs/guides/embeddings — OpenAI MRL support +- [^7] https://huggingface.co/nomic-ai/nomic-embed-text-v1.5 — Nomic MRL model +- [^8] https://qdrant.tech/articles/binary-quantization-openai/ — Qdrant quantization + +--- + +## Usage guide + +```bash +# Clone and enter repo +git clone https://github.com/ruvnet/ruvector.git +cd ruvector +git checkout research/nightly/2026-05-16-matryoshka-hnsw + +# Build +cargo build --release -p ruvector-matryoshka + +# Run tests (8 unit tests including acceptance) +cargo test -p ruvector-matryoshka + +# Run benchmark +cargo run --release -p ruvector-matryoshka +``` + +**Expected output:** + +``` +CascadeSearch (D=32→128) 376.9 371.5 419.8 2 653 1.0000 2 500 PASS +... +Acceptance: CascadeSearch recall@10 = 1.0000 ≥ 0.90 → PASS ✓ +``` + +**Changing dataset size:** +Edit `N` constant in `crates/ruvector-matryoshka/src/main.rs`: +```rust +const N: usize = 50_000; // increase for larger benchmark +``` + +**Changing dimensions:** +Edit `DIM` and `COARSE_DIM`: +```rust +const DIM: usize = 256; +const COARSE_DIM: usize = 64; // 25% of full +``` + +**Adding a new backend:** +Implement `MatryoshkaIndex` for your struct: +```rust +impl MatryoshkaIndex for MyHnswCoarseStage { + fn name(&self) -> &str { "HnswCascade (HNSW→full)" } + fn build(&mut self, vectors: &[Vector]) { /* build HNSW at coarse_dim */ } + fn search(&self, query: &[f32], k: usize) -> Vec { /* HNSW + rerank */ } + fn memory_bytes(&self) -> usize { /* graph + vectors */ } +} +``` + +**Plugging into RuVector:** +The `MatryoshkaIndex` trait is designed to sit above the existing `ruvector-core` +index types. 
A future `ruvector-core` `feature = "matryoshka"` will register +`CascadeSearch` as a search mode alongside existing HNSW and IVF modes. + +--- + +## Optimization guide + +### Memory optimisation + +Store `coarse[D_c]` and `residual[D-D_c]` as separate `Vec<f32>` arrays (not +interleaved per vector). Stage 1 then touches only the `coarse` array (625 KB for +N=5 000) instead of the full 2 500 KB, dramatically improving cache utilisation. + +### Latency optimisation + +Add a graph-based coarse stage (HNSW on D_c dimensions) to replace the O(N·D_c) +scan. For N=1M, the flat scan is ~200ms; HNSW reduces to ~1ms. + +### Recall optimisation + +Increase `cascade_candidates` until recall saturates. A calibration pass on a +validation set (200 queries, compare to FullScan) identifies the minimum C that +hits the target recall. + +### Edge deployment optimisation + +Use `CoarseScan` only in the WASM budget (e.g., Pi Zero 2W, Cognitum Seed). Send +top-200 coarse IDs to a host sidecar for full rerank. Network payload: 200 × 4 +bytes = 800 bytes of IDs + host lookup. + +### WASM optimisation + +`CoarseScan` and `CascadeSearch` are intended to have no WASM-incompatible +dependencies. The one caveat is `rand`, the crate's only runtime dependency: with +default features, `rand` 0.8 pulls in `getrandom`, which does not build for +`wasm32-unknown-unknown` unless its `js` feature is enabled (or `rand` is declared +with `default-features = false`). With that resolved, compile with: +```bash +cargo build --target wasm32-unknown-unknown -p ruvector-matryoshka --no-default-features +``` + +### MCP tool optimisation + +Expose as a streaming tool: return coarse candidates first (low-latency initial +response), then stream the full-reranked results as they are computed. + +### ruFlo automation optimisation + +Run a ruFlo step after every 1 000 queries that measures `recall@10` on a held-out +set and adjusts `cascade_candidates` up or down to stay within 5% of the SLA +threshold. This is the closed-loop variant of manual `cascade_candidates` tuning.
+ +--- + +## Roadmap + +### Now +- Merge `crates/ruvector-matryoshka` to main (this branch) +- Add `MatryoshkaIndex` to `ruvector-core` search type registry as an optional variant +- Ship `CoarseScan` as a WASM-compatible thin index for edge use cases + +### Next +- Phase 2: HNSW coarse stage replacing O(N·D_c) flat scan +- Dimension-split vector storage layout for cache-efficient coarse pass +- ruFlo feedback loop for online `cascade_candidates` tuning +- MCP tool surface: `search_cascade(query, coarse_dim, k)` + +### Later (10–20 year) +- Dimension-polymorphic HNSW: edges labelled by minimum valid dimension depth +- Per-query adaptive dimension selection (query-aware, arXiv:2602.03306 style) +- Zero-knowledge proof gate between coarse and full stage for proof-gated RAG +- RVM coherence domains: Matryoshka cascade aligned to mincut-defined memory regions +- Hardware-native adaptive precision: INT4 coarse pass, FP32 rerank, in-memory compute + +--- + +## Footnotes and references + +[^1]: Kusupati, A., Bhatt, G., Rege, A., Wallingford, M., Sinha, A., Ramanujan, V., +Howard-Snyder, W., Chen, K., Kakade, S., Jain, P., Farhadi, A. "Matryoshka +Representation Learning." NeurIPS 2022. arXiv:2205.13147. +https://arxiv.org/abs/2205.13147. Accessed 2026-05-16. + +[^2]: Zhang, B., Chen, L., Liu, T., Zheng, B. "SMEC: Rethinking Matryoshka +Representation Learning for Retrieval Embedding Compression." EMNLP 2025. +arXiv:2510.12474. https://arxiv.org/abs/2510.12474. Accessed 2026-05-16. + +[^3]: Wang, S., et al. "2D Matryoshka Training for Information Retrieval." arXiv:2411.17299. +November 2024. https://arxiv.org/abs/2411.17299. Accessed 2026-05-16. + +[^4]: Wu, Z., Zhang, R., Nie, Z. "Learning to Select: Query-Aware Adaptive Dimension +Selection for Dense Retrieval." Beihang University, 2026. arXiv:2602.03306. +https://arxiv.org/html/2602.03306v2. Accessed 2026-05-16. + +[^5]: Milvus documentation. "Funnel Search with Matryoshka." 
+https://milvus.io/docs/funnel_search_with_matryoshka.md. Accessed 2026-05-16. + +[^6]: OpenAI. "Embeddings — Matryoshka dimensions parameter." OpenAI documentation. +https://platform.openai.com/docs/guides/embeddings. Accessed 2026-05-16. + +[^7]: Nomic AI. "nomic-embed-text-v1.5 — First long-context MRL embedding model." +Hugging Face. https://huggingface.co/nomic-ai/nomic-embed-text-v1.5. +Accessed 2026-05-16. + +[^8]: Qdrant. "Binary Quantization with OpenAI text-embedding-3." +https://qdrant.tech/articles/binary-quantization-openai/. Accessed 2026-05-16. + +[^9]: Garcia, A. "sqlite-vec: Matryoshka / adaptive-length embedding guide." +https://alexgarcia.xyz/sqlite-vec/guides/matryoshka.html. Accessed 2026-05-16. + +--- + +## SEO tags + +**Keywords:** +ruvector, Rust vector database, Rust vector search, Matryoshka Representation Learning, +MRL embeddings, adaptive dimension search, cascaded retrieval, funnel search, +coarse-to-fine ANN, high performance Rust, ANN search, HNSW, DiskANN, +filtered vector search, graph RAG, agent memory, AI agents, MCP, WASM AI, edge AI, +self learning vector database, ruvnet, ruFlo, Claude Flow, autonomous agents, +retrieval augmented generation, nested embeddings, OpenAI text-embedding-3, +Nomic nomic-embed-text. + +**Suggested GitHub topics:** +rust, vector-database, vector-search, ann, hnsw, matryoshka-embeddings, mrl, +cascaded-retrieval, adaptive-search, rag, graph-rag, ai-agents, agent-memory, +mcp, wasm, edge-ai, rust-ai, semantic-search, embeddings, ruvector.