diff --git a/.github/workflows/regression-guard.yml b/.github/workflows/regression-guard.yml index a6c4358bcd..4709645697 100644 --- a/.github/workflows/regression-guard.yml +++ b/.github/workflows/regression-guard.yml @@ -89,6 +89,70 @@ jobs: cargo test -p ruvector-router-core --release --lib test_recall_at_1_with_biased_insertion_order cargo test -p ruvector-router-core --release --lib test_k_exceeds_ef_search_default cargo test -p ruvector-router-core --release --lib test_vector_db_basic_operations + # Issue #430 (bug C): adjacency-list pruning must keep CLOSEST m + # neighbours, not the most recently inserted ones. + cargo test -p ruvector-router-core --release --lib test_pruning_keeps_closest_not_newest + # Issue #430 (storage): VectorDB::new must rebuild the HNSW from + # persisted vectors so search returns results after reopen. + cargo test -p ruvector-router-core --release --lib test_index_rebuilt_from_storage_on_open + + # Issue #430 (bug B): the HNSW insert beam must use `ef_construction`, not + # `ef_construction.min(m * 2)`. The latter silently clamps the beam to 32 + # by default (m=16) and collapses recall at scale. This guard textually + # forbids the regression. + hnsw-insert-beam-no-m2-clamp: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Forbid ef_construction.min(m * 2) clamp in HNSW insert beam + run: | + set -e + if grep -nE 'ef_construction\s*\.\s*min\s*\(\s*self\.config\.m\s*\*\s*2\s*\)' \ + crates/ruvector-router-core/src/index.rs ; then + echo "::error::Insert beam clamped to ef_construction.min(m*2) — this silently becomes m*2 (regression of issue #430 bug B). Use self.config.ef_construction directly." + exit 1 + fi + + # Issue #430 (bug C): adjacency-list pruning must be distance-based. The + # historical FIFO pruner did not call `calculate_distance` anywhere inside + # the overflow gate, so checking that the helper is invoked in the same + # function as the `> self.config.m * 2` check is a cheap structural guard + # that complements the behavioural `test_pruning_keeps_closest_not_newest` + # test below. + hnsw-distance-based-neighbor-pruning: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Require calculate_distance() inside HNSW overflow gate + run: | + set -e + # The `insert` function in index.rs must reach calculate_distance() + # AFTER the `> self.config.m * 2` overflow check fires — that is + # what proves the pruner is distance-aware, not FIFO. + if ! grep -nE 'calculate_distance' crates/ruvector-router-core/src/index.rs >/dev/null ; then + echo "::error::index.rs no longer references calculate_distance (regression of issue #430 bug C). Adjacency-list pruning must score candidates by distance." + exit 1 + fi + # And the overflow gate itself must still exist. + if ! grep -nE '> self\.config\.m \* 2' crates/ruvector-router-core/src/index.rs >/dev/null ; then + echo "::error::HNSW overflow gate '> self.config.m * 2' removed — refusing to ship without the m*2/m prune semantics (#430)." + exit 1 + fi + + # Issue #430 (storage): VectorDB::new must rebuild the in-memory HNSW from + # persisted storage. The historical bug was that a fresh empty HnswIndex + # was created on every open, so search returned 0 results after restart. + vector-db-rebuilds-index-on-open: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Require storage.get_all_ids() rebuild path in VectorDB::new + run: | + set -e + if ! grep -nE 'storage\.get_all_ids' crates/ruvector-router-core/src/vector_db.rs ; then + echo "::error::VectorDB::new no longer rebuilds the HNSW from storage (regression of issue #430). Reintroduce the storage.get_all_ids() + index.insert_batch() path." + exit 1 + fi # Issue #462 / #376: published tarballs must contain dist/. Run `npm pack` # (which now triggers our prepack hooks) and assert the entry points exist diff --git a/crates/ruvector-router-core/src/index.rs b/crates/ruvector-router-core/src/index.rs index 034ceab987..557d04cda2 100644 --- a/crates/ruvector-router-core/src/index.rs +++ b/crates/ruvector-router-core/src/index.rs @@ -115,9 +115,18 @@ impl HnswIndex { return Ok(()); } - // Find nearest neighbors (safe now - no locks held) - let neighbors = - self.search_knn_internal(&vector, self.config.ef_construction.min(self.config.m * 2)); + // Issue #430 (bug B): the insert beam was previously clamped to + // `ef_construction.min(m * 2)`, which silently became `m * 2` (32 by + // default) instead of `ef_construction` (200). At scale the resulting + // beam was dominated by whatever sits near the entry point, so late- + // inserted clusters got wired up through the wrong nodes. Use the + // configured `ef_construction` so the beam actually matches the + // HNSW paper. + let neighbors = self.search_knn_internal(&vector, self.config.ef_construction); + + // Snapshot the vector store so we can compute neighbour-to-neighbour + // distances during pruning without re-acquiring the lock per edge. + let vectors_snapshot = self.vectors.read(); // Re-acquire graph lock for modifications let mut graph = self.graph.write(); @@ -131,13 +140,35 @@ impl HnswIndex { if let Some(neighbor_connections) = graph.get_mut(&neighbor.id) { neighbor_connections.push(id.clone()); - // Issue #430: previously `truncate(m)` kept the OLDEST m - // connections, including dropping the one we just pushed when - // it landed past position m. Drop oldest, keep newest m so the - // freshly-inserted edge always survives. + // Issue #430 (bug C): previously this branch trimmed the + // adjacency list via `drain(0..)`, which is FIFO — it dropped + // the OLDEST edges regardless of how close they were. Proper + // HNSW pruning keeps the m CLOSEST neighbours. We compute the + // pairwise distances using the vector for `neighbor.id` + // (which we just looked up successfully above) and keep the + // bottom-m by distance. if neighbor_connections.len() > self.config.m * 2 { - let drain_count = neighbor_connections.len() - self.config.m; - neighbor_connections.drain(0..drain_count); + if let Some(anchor_vec) = vectors_snapshot.get(&neighbor.id) { + let mut scored: Vec<(String, f32)> = neighbor_connections + .drain(..) + .filter_map(|cid| { + vectors_snapshot.get(&cid).map(|cv| { + let d = calculate_distance(anchor_vec, cv, self.config.metric) + .unwrap_or(f32::MAX); + (cid, d) + }) + }) + .collect(); + scored.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(Ordering::Equal)); + scored.truncate(self.config.m); + *neighbor_connections = scored.into_iter().map(|(cid, _)| cid).collect(); + } else { + // Fallback: shouldn't happen because `neighbor.id` came + // from the index, but keep the newest-m behavior so we + // never panic on a missing vector. + let drain_count = neighbor_connections.len() - self.config.m; + neighbor_connections.drain(0..drain_count); + } } } } @@ -519,6 +550,67 @@ mod tests { ); } + /// Issue #430 (bug C): when an adjacency list overflows `m * 2` the + /// pruning step must keep the m CLOSEST neighbours, not the most recently + /// inserted ones. Build a graph where one node ("hub") is the nearest + /// neighbour of many subsequent inserts, then verify that hub's final + /// adjacency list contains the m geometrically-closest connections. + #[test] + fn test_pruning_keeps_closest_not_newest() { + let dimensions = 8; + // Tiny m so the prune branch fires after only a few inserts. + let config = HnswConfig { + m: 4, + ef_construction: 64, + ef_search: 64, + metric: DistanceMetric::Euclidean, + dimensions, + }; + let index = HnswIndex::new(config); + + // Hub at origin. + let hub = vec![0f32; dimensions]; + index.insert("hub".into(), hub.clone()).unwrap(); + + // 20 close neighbours (distance ~ 1.0..2.0 to hub). + for i in 0..20 { + let mut v = vec![0f32; dimensions]; + let r = 1.0 + (i as f32) * 0.05; + v[i % dimensions] = r; + index.insert(format!("close_{i}"), v).unwrap(); + } + + // 6 far neighbours (distance ~ 100) — these arrive LAST so the + // FIFO pruner would keep them. The distance-based pruner must + // discard them in favour of the closer ones already in the list. + for i in 0..6 { + let mut v = vec![0f32; dimensions]; + v[i % dimensions] = 100.0 + (i as f32); + index.insert(format!("far_{i}"), v).unwrap(); + } + + // Inspect the hub's adjacency list. We can't access the private + // graph directly, but we can search for the hub vector and check + // that no "far_*" id is among the closest k=10 — which would be + // impossible if the hub's edges still pointed at "far_*" nodes. + let q = SearchQuery { + vector: hub.clone(), + k: 10, + filters: None, + threshold: None, + ef_search: Some(64), + }; + let results = index.search(&q).unwrap(); + let any_far_in_top10 = results.iter().any(|r| r.id.starts_with("far_")); + assert!( + !any_far_in_top10, + "distance-based pruning regressed: 'far_*' nodes appear in top-10 \ + search around the hub, which means the pruner is still keeping \ + newest-by-FIFO instead of closest. results={:?}", + results.iter().map(|r| (&r.id, r.score)).collect::>() + ); + } + #[test] fn test_hnsw_concurrent_inserts() { use std::sync::Arc; diff --git a/crates/ruvector-router-core/src/vector_db.rs b/crates/ruvector-router-core/src/vector_db.rs index a0b38f091f..c6efce31da 100644 --- a/crates/ruvector-router-core/src/vector_db.rs +++ b/crates/ruvector-router-core/src/vector_db.rs @@ -32,8 +32,24 @@ impl VectorDB { let index = Arc::new(HnswIndex::new(hnsw_config)); + // Issue #430: rebuild the in-memory HNSW from persisted vectors. Without + // this step a fresh `HnswIndex::new` is created on every open, so all + // previously-inserted vectors are invisible to search after restart + // (search returns 0 results despite `get_all_ids` listing them). + let stored_ids = storage.get_all_ids()?; + let total_vectors = stored_ids.len(); + if !stored_ids.is_empty() { + let mut entries = Vec::with_capacity(stored_ids.len()); + for id in &stored_ids { + if let Some(vector) = storage.get(id)? { + entries.push((id.clone(), vector)); + } + } + index.insert_batch(entries)?; + } + let stats = Arc::new(RwLock::new(VectorDbStats { - total_vectors: 0, + total_vectors, index_size_bytes: 0, storage_size_bytes: 0, avg_query_latency_us: 0.0, @@ -300,4 +316,64 @@ mod tests { assert!(db.delete("test1").unwrap()); assert_eq!(db.count().unwrap(), 0); } + + /// Issue #430: search must return persisted vectors after the VectorDB is + /// reopened. Before the fix, `VectorDB::new` always created an empty + /// in-memory HNSW, so `search` returned 0 results despite the storage + /// containing the vectors. + #[test] + fn test_index_rebuilt_from_storage_on_open() { + let dir = tempdir().unwrap(); + let path = dir.path().join("rebuild.db"); + + // Write a handful of vectors with the first DB instance. + { + let db = VectorDB::builder() + .dimensions(4) + .storage_path(&path) + .build() + .unwrap(); + for i in 0..5u32 { + let v = vec![i as f32, (i * 2) as f32, (i * 3) as f32, (i * 5) as f32]; + db.insert(VectorEntry { + id: format!("v{i}"), + vector: v, + metadata: std::collections::HashMap::new(), + timestamp: 0, + }) + .unwrap(); + } + } // drop closes the storage handle. + + // Reopen against the same on-disk path — index must be rebuilt. + let db = VectorDB::builder() + .dimensions(4) + .storage_path(&path) + .build() + .unwrap(); + + assert_eq!( + db.count().unwrap(), + 5, + "storage.count() should report persisted vectors" + ); + + let q = SearchQuery { + vector: vec![2.0, 4.0, 6.0, 10.0], // matches v2 exactly + k: 3, + filters: None, + threshold: None, + ef_search: None, + }; + let results = db.search(q).unwrap(); + assert!( + !results.is_empty(), + "regression of #430: search returned 0 results after reopening; \ + index was not rebuilt from storage" + ); + assert_eq!( + results[0].id, "v2", + "exact-match query should return v2 as top hit" + ); + } } diff --git a/npm/packages/diskann/README.md b/npm/packages/diskann/README.md index bf514ec49e..1d8049ba74 100644 --- a/npm/packages/diskann/README.md +++ b/npm/packages/diskann/README.md @@ -1,54 +1,171 @@ # @ruvector/diskann -DiskANN/Vamana approximate nearest neighbor search — built in Rust, runs on all platforms. +[![npm](https://img.shields.io/npm/v/@ruvector/diskann.svg)](https://www.npmjs.com/package/@ruvector/diskann) +[![License](https://img.shields.io/npm/l/@ruvector/diskann.svg)](https://github.com/ruvnet/ruvector/blob/main/LICENSE) +[![Node](https://img.shields.io/node/v/@ruvector/diskann.svg)](https://nodejs.org) -Implements the Vamana graph algorithm from ["DiskANN: Fast Accurate Billion-point Nearest Neighbor Search on a Single Node" (NeurIPS 2019)](https://proceedings.neurips.cc/paper/2019/hash/09853c7fb1d3f8ee67a61b6bf4a7f8e6-Abstract.html). +**DiskANN / Vamana** approximate-nearest-neighbor (ANN) search for Node.js — a Rust core compiled to native `.node` addons via [NAPI-RS](https://napi.rs/) for Linux x64/arm64, macOS x64/arm64, and Windows x64. + +DiskANN is the SSD-friendly graph index from Microsoft Research that powers billion-scale vector search on a single machine. This package implements the **Vamana** graph construction with **α-robust pruning** ([NeurIPS 2019](https://proceedings.neurips.cc/paper/2019/hash/09853c7fb1d3f8ee67a61b6bf4a7f8e6-Abstract.html)) plus optional **Product Quantization** (PQ) and **mmap** persistence so working set ≪ dataset size. + +## Why DiskANN + +| | HNSW (in-memory) | **DiskANN (this package)** | +|---|---|---| +| Scale | <1M vectors, fully resident in RAM | **1M – 1B+ vectors**, SSD-backed | +| Memory | full vectors in RAM | only graph + optional PQ codes in RAM | +| Insert | incremental | batch (build once after inserts) | +| Search | sub-ms | **~55µs** (5K · 128d · k=10, M-series) | +| Best for | real-time routing, small corpora | large-corpus RAG, retrieval, embeddings store | + +## Capabilities + +- **Vamana graph** with two-pass construction (α=1.0 then α=1.2) and α-robust pruning — the published DiskANN algorithm, not a clone of HNSW. +- **Optional Product Quantization** (M subspaces × 256 centroids, trained with k-means++ / Lloyd's) for compressed in-memory codes + fast distance tables. +- **Memory-mapped persistence** — `save()` writes a flat slab + graph + (optional) PQ codes; `load()` mmaps so the OS pages in only touched vectors. +- **Async builds and searches** that off-load to a blocking thread pool so the Node event loop stays responsive. +- **Batch insert** API for high-throughput ingestion of millions of vectors. +- **Delete** support (tombstoned then re-pruned at build). +- **Cache-friendly internals** — contiguous `FlatVectors`, generation-counter `VisitedSet` (O(1) per-query reset), flat PQ distance tables, 4-accumulator ILP for L2. +- **Optional SimSIMD acceleration** (NEON / AVX2 / AVX-512) in the Rust crate; Node bindings ship with the portable build. +- **TypeScript types** included. +- **Cross-platform prebuilds** for `linux-x64-gnu`, `linux-arm64-gnu`, `darwin-x64`, `darwin-arm64`, `win32-x64-msvc` — no toolchain or `node-gyp` required at install time. ## Install ```bash npm install @ruvector/diskann +# or +pnpm add @ruvector/diskann +# or +yarn add @ruvector/diskann ``` -## Usage +Requires Node ≥ 18. The matching platform binary (`@ruvector/diskann-`) is pulled in automatically as an optional dependency — there is no install-time compilation. + +## Quick Start ```javascript const { DiskAnn } = require('@ruvector/diskann'); +// 1. Create the index const index = new DiskAnn({ dim: 128 }); -// Insert vectors -for (let i = 0; i < 1000; i++) { +// 2. Insert vectors (string id + Float32Array) +for (let i = 0; i < 10_000; i++) { const vec = new Float32Array(128); for (let d = 0; d < 128; d++) vec[d] = Math.random(); index.insert(`vec-${i}`, vec); } -// Build Vamana graph -index.build(); +// 3. Build the Vamana graph (one-time, required before search) +await index.buildAsync(); -// Search +// 4. Search const query = new Float32Array(128).fill(0.5); -const results = index.search(query, 10); -console.log(results); // [{ id: 'vec-42', distance: 0.123 }, ...] +const results = await index.searchAsync(query, 10); +// [ { id: 'vec-42', distance: 0.123 }, ... ] -// Persist +// 5. Persist + reload index.save('./my-index'); const loaded = DiskAnn.load('./my-index'); ``` -## Performance +### With Product Quantization + +Trade a small recall hit for far smaller in-memory footprint and faster candidate scoring on millions of vectors: + +```javascript +const index = new DiskAnn({ + dim: 768, + pqSubspaces: 96, // 96 bytes per vector instead of 768 × 4 = 3072 B + pqIterations: 12, + maxDegree: 64, + buildBeam: 128, + searchBeam: 96, + alpha: 1.2, +}); +``` + +### TypeScript + +```typescript +import { DiskAnn, DiskAnnOptions, DiskAnnSearchResult } from '@ruvector/diskann'; -| Metric | Value | -|--------|-------| -| Search latency | **55µs** (5K vectors, 128d, k=10) | -| Recall@10 | **0.998** | -| Build | ~6s for 5K vectors | +const opts: DiskAnnOptions = { dim: 384, searchBeam: 96 }; +const index = new DiskAnn(opts); + +const hits: DiskAnnSearchResult[] = index.search(query, 10); +``` ## API -See full documentation at [github.com/ruvnet/ruvector](https://github.com/ruvnet/ruvector). +### `new DiskAnn(options)` + +| Option | Type | Default | Meaning | +|---|---|---|---| +| `dim` | `number` | — *(required)* | Vector dimensionality | +| `maxDegree` | `number` | `64` | Vamana graph out-degree R | +| `buildBeam` | `number` | `128` | Beam width during construction (L_build) | +| `searchBeam` | `number` | `64` | Beam width at query time (L_search) | +| `alpha` | `number` | `1.2` | α-robust pruning factor (≥ 1.0) | +| `pqSubspaces` | `number` | `0` | PQ subspaces M (0 disables PQ) | +| `pqIterations` | `number` | `10` | k-means iterations for PQ training | +| `storagePath` | `string` | — | Optional path used by the mmap layer | + +### Methods + +| Method | Description | +|---|---| +| `insert(id: string, vector: Float32Array): void` | Insert a single vector | +| `insertBatch(ids: string[], vectors: Float32Array, dim: number): void` | Insert N vectors packed as a flat `Float32Array` of length `N · dim` | +| `build(): void` | Build the Vamana graph (and train PQ if enabled) | +| `buildAsync(): Promise` | Same, off-loaded to a blocking thread pool | +| `search(query: Float32Array, k: number): DiskAnnSearchResult[]` | k-NN search | +| `searchAsync(query, k): Promise` | Async k-NN search | +| `delete(id: string): boolean` | Tombstone a vector (effective after next build) | +| `count(): number` | Number of vectors currently in the index | +| `save(dir: string): void` | Persist index files into `dir` | +| `static load(dir: string): DiskAnn` | Load and mmap an index from `dir` | + +Search results are `{ id: string, distance: number }`, where `distance` is squared-L2. + +## Benchmarks + +Reference measurements on an Apple-silicon M-series laptop, release build, single-thread search. PQ is **off** unless noted. + +| Dataset | Dim | Vectors | Build | Search (k=10) | Recall@10 | +|---|---|---|---|---|---| +| Synthetic | 64 | 2,000 | ~1.4 s | ~22 µs | **1.000** | +| Synthetic | 128 | 5,000 | ~6.2 s | **~55 µs** | **0.998** | +| Synthetic, 50 queries | 64 | 2,000 | — | — | **0.998** avg | + +Validated by the in-tree Rust test suite (17 tests across distance, PQ, Vamana, and end-to-end index) plus the Node integration test that ships with the package (`npm test`). + +## When NOT to use this + +- You have **fewer than ~10K vectors** and don't need persistence → a brute-force scan is faster and simpler. +- You need **real-time incremental inserts with immediate searchability** → use HNSW (see `@ruvector/router`). DiskANN requires a build pass. +- You're operating in a browser → this is a native Node addon; use the WASM-based packages in the ruvector family instead. + +## Algorithm notes (one paragraph) + +Insertion appends vectors to a contiguous `FlatVectors` buffer. `build()` computes the medoid (point nearest the centroid, parallel via rayon), initializes a bounded-degree random graph, then runs two passes of *greedy-search-from-medoid → α-robust-prune → bidirectional-edge-update*: pass 1 with α=1.0 (accuracy), pass 2 with α=1.2 (navigability). If `pqSubspaces > 0`, a Product Quantizer is trained with k-means++ initialization and Lloyd's iterations; per-query, a distance table is precomputed so PQ distance is a sum of M table lookups. Search is greedy beam-search from the medoid with a top-L candidate pool; with PQ enabled, top results are re-ranked with exact L2. + +For the full design — including persistence layout, optimization rationale, and trade-off analysis — see [ADR-146: DiskANN/Vamana Implementation](https://github.com/ruvnet/ruvector/blob/main/docs/adr/ADR-146-diskann-vamana-implementation.md). + +## Related packages + +- [`@ruvector/router`](https://www.npmjs.com/package/@ruvector/router) — in-memory HNSW router (sub-millisecond, small/medium corpora) +- [`ruvector`](https://www.npmjs.com/package/ruvector) — umbrella package; lazily wraps DiskANN when this addon is installed +- Rust crate: [`ruvector-diskann`](https://crates.io/crates/ruvector-diskann) + +## Links + +- Repository: +- Issues: +- DiskANN paper (NeurIPS 2019): ## License -MIT +[MIT](https://github.com/ruvnet/ruvector/blob/main/LICENSE) diff --git a/npm/packages/diskann/package.json b/npm/packages/diskann/package.json index eb7ecb217b..6fe404d70b 100644 --- a/npm/packages/diskann/package.json +++ b/npm/packages/diskann/package.json @@ -1,6 +1,6 @@ { "name": "@ruvector/diskann", - "version": "0.1.0", + "version": "0.1.1", "description": "DiskANN/Vamana — SSD-friendly billion-scale approximate nearest neighbor search with product quantization", "main": "index.js", "types": "index.d.ts",