Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 64 additions & 0 deletions .github/workflows/regression-guard.yml
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,70 @@ jobs:
cargo test -p ruvector-router-core --release --lib test_recall_at_1_with_biased_insertion_order
cargo test -p ruvector-router-core --release --lib test_k_exceeds_ef_search_default
cargo test -p ruvector-router-core --release --lib test_vector_db_basic_operations
# Issue #430 (bug C): adjacency-list pruning must keep CLOSEST m
# neighbours, not the most recently inserted ones.
cargo test -p ruvector-router-core --release --lib test_pruning_keeps_closest_not_newest
# Issue #430 (storage): VectorDB::new must rebuild the HNSW from
# persisted vectors so search returns results after reopen.
cargo test -p ruvector-router-core --release --lib test_index_rebuilt_from_storage_on_open

# Issue #430 (bug B): the HNSW insert beam must use `ef_construction`, not
# `ef_construction.min(m * 2)`. The latter silently clamps the beam to 32
# by default (m=16) and collapses recall at scale. This guard textually
# forbids the regression.
hnsw-insert-beam-no-m2-clamp:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Forbid ef_construction.min(m * 2) clamp in HNSW insert beam
run: |
set -e
if grep -nE 'ef_construction\s*\.\s*min\s*\(\s*self\.config\.m\s*\*\s*2\s*\)' \
crates/ruvector-router-core/src/index.rs ; then
echo "::error::Insert beam clamped to ef_construction.min(m*2) — this silently becomes m*2 (regression of issue #430 bug B). Use self.config.ef_construction directly."
exit 1
fi

# Issue #430 (bug C): adjacency-list pruning must be distance-based. The
# historical FIFO pruner did not call `calculate_distance` anywhere inside
# the overflow gate, so checking that the helper is invoked in the same
# function as the `> self.config.m * 2` check is a cheap structural guard
# that complements the behavioural `test_pruning_keeps_closest_not_newest`
# test below.
hnsw-distance-based-neighbor-pruning:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Require calculate_distance() inside HNSW overflow gate
run: |
set -e
# The `insert` function in index.rs must reach calculate_distance()
# AFTER the `> self.config.m * 2` overflow check fires — that is
# what proves the pruner is distance-aware, not FIFO.
if ! grep -nE 'calculate_distance' crates/ruvector-router-core/src/index.rs >/dev/null ; then
echo "::error::index.rs no longer references calculate_distance (regression of issue #430 bug C). Adjacency-list pruning must score candidates by distance."
exit 1
fi
# And the overflow gate itself must still exist.
if ! grep -nE '> self\.config\.m \* 2' crates/ruvector-router-core/src/index.rs >/dev/null ; then
echo "::error::HNSW overflow gate '> self.config.m * 2' removed — refusing to ship without the m*2/m prune semantics (#430)."
exit 1
fi

# Issue #430 (storage): VectorDB::new must rebuild the in-memory HNSW from
# persisted storage. The historical bug was that a fresh empty HnswIndex
# was created on every open, so search returned 0 results after restart.
vector-db-rebuilds-index-on-open:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Require storage.get_all_ids() rebuild path in VectorDB::new
run: |
set -e
if ! grep -nE 'storage\.get_all_ids' crates/ruvector-router-core/src/vector_db.rs ; then
echo "::error::VectorDB::new no longer rebuilds the HNSW from storage (regression of issue #430). Reintroduce the storage.get_all_ids() + index.insert_batch() path."
exit 1
fi

# Issue #462 / #376: published tarballs must contain dist/. Run `npm pack`
# (which now triggers our prepack hooks) and assert the entry points exist
Expand Down
110 changes: 101 additions & 9 deletions crates/ruvector-router-core/src/index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -115,9 +115,18 @@ impl HnswIndex {
return Ok(());
}

// Find nearest neighbors (safe now - no locks held)
let neighbors =
self.search_knn_internal(&vector, self.config.ef_construction.min(self.config.m * 2));
// Issue #430 (bug B): the insert beam was previously clamped to
// `ef_construction.min(m * 2)`, which silently became `m * 2` (32 by
// default) instead of `ef_construction` (200). At scale the resulting
// beam was dominated by whatever sits near the entry point, so late-
// inserted clusters got wired up through the wrong nodes. Use the
// configured `ef_construction` so the beam actually matches the
// HNSW paper.
let neighbors = self.search_knn_internal(&vector, self.config.ef_construction);

// Snapshot the vector store so we can compute neighbour-to-neighbour
// distances during pruning without re-acquiring the lock per edge.
let vectors_snapshot = self.vectors.read();

// Re-acquire graph lock for modifications
let mut graph = self.graph.write();
Expand All @@ -131,13 +140,35 @@ impl HnswIndex {
if let Some(neighbor_connections) = graph.get_mut(&neighbor.id) {
neighbor_connections.push(id.clone());

// Issue #430: previously `truncate(m)` kept the OLDEST m
// connections, including dropping the one we just pushed when
// it landed past position m. Drop oldest, keep newest m so the
// freshly-inserted edge always survives.
// Issue #430 (bug C): previously this branch trimmed the
// adjacency list via `drain(0..)`, which is FIFO — it dropped
// the OLDEST edges regardless of how close they were. Proper
// HNSW pruning keeps the m CLOSEST neighbours. We compute the
// pairwise distances using the vector for `neighbor.id`
// (which we just looked up successfully above) and keep the
// bottom-m by distance.
if neighbor_connections.len() > self.config.m * 2 {
let drain_count = neighbor_connections.len() - self.config.m;
neighbor_connections.drain(0..drain_count);
if let Some(anchor_vec) = vectors_snapshot.get(&neighbor.id) {
let mut scored: Vec<(String, f32)> = neighbor_connections
.drain(..)
.filter_map(|cid| {
vectors_snapshot.get(&cid).map(|cv| {
let d = calculate_distance(anchor_vec, cv, self.config.metric)
.unwrap_or(f32::MAX);
(cid, d)
})
})
.collect();
scored.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(Ordering::Equal));
scored.truncate(self.config.m);
*neighbor_connections = scored.into_iter().map(|(cid, _)| cid).collect();
} else {
// Fallback: shouldn't happen because `neighbor.id` came
// from the index, but keep the newest-m behavior so we
// never panic on a missing vector.
let drain_count = neighbor_connections.len() - self.config.m;
neighbor_connections.drain(0..drain_count);
}
}
}
}
Expand Down Expand Up @@ -519,6 +550,67 @@ mod tests {
);
}

/// Issue #430 (bug C): when an adjacency list overflows `m * 2` the
/// pruning step must keep the m CLOSEST neighbours, not the most recently
/// inserted ones. Build a graph where one node ("hub") is the nearest
/// neighbour of many subsequent inserts, then verify that hub's final
/// adjacency list contains the m geometrically-closest connections.
#[test]
fn test_pruning_keeps_closest_not_newest() {
let dimensions = 8;
// Tiny m so the prune branch fires after only a few inserts.
let config = HnswConfig {
m: 4,
ef_construction: 64,
ef_search: 64,
metric: DistanceMetric::Euclidean,
dimensions,
};
let index = HnswIndex::new(config);

// Hub at origin.
let hub = vec![0f32; dimensions];
index.insert("hub".into(), hub.clone()).unwrap();

// 20 close neighbours (distance ~ 1.0..2.0 to hub).
for i in 0..20 {
let mut v = vec![0f32; dimensions];
let r = 1.0 + (i as f32) * 0.05;
v[i % dimensions] = r;
index.insert(format!("close_{i}"), v).unwrap();
}

// 6 far neighbours (distance ~ 100) — these arrive LAST so the
// FIFO pruner would keep them. The distance-based pruner must
// discard them in favour of the closer ones already in the list.
for i in 0..6 {
let mut v = vec![0f32; dimensions];
v[i % dimensions] = 100.0 + (i as f32);
index.insert(format!("far_{i}"), v).unwrap();
}

// Inspect the hub's adjacency list. We can't access the private
// graph directly, but we can search for the hub vector and check
// that no "far_*" id is among the closest k=10 — which would be
// impossible if the hub's edges still pointed at "far_*" nodes.
let q = SearchQuery {
vector: hub.clone(),
k: 10,
filters: None,
threshold: None,
ef_search: Some(64),
};
let results = index.search(&q).unwrap();
let any_far_in_top10 = results.iter().any(|r| r.id.starts_with("far_"));
assert!(
!any_far_in_top10,
"distance-based pruning regressed: 'far_*' nodes appear in top-10 \
search around the hub, which means the pruner is still keeping \
newest-by-FIFO instead of closest. results={:?}",
results.iter().map(|r| (&r.id, r.score)).collect::<Vec<_>>()
);
}

#[test]
fn test_hnsw_concurrent_inserts() {
use std::sync::Arc;
Expand Down
78 changes: 77 additions & 1 deletion crates/ruvector-router-core/src/vector_db.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,24 @@ impl VectorDB {

let index = Arc::new(HnswIndex::new(hnsw_config));

// Issue #430: rebuild the in-memory HNSW from persisted vectors. Without
// this step a fresh `HnswIndex::new` is created on every open, so all
// previously-inserted vectors are invisible to search after restart
// (search returns 0 results despite `get_all_ids` listing them).
let stored_ids = storage.get_all_ids()?;
let total_vectors = stored_ids.len();
if !stored_ids.is_empty() {
let mut entries = Vec::with_capacity(stored_ids.len());
for id in &stored_ids {
if let Some(vector) = storage.get(id)? {
entries.push((id.clone(), vector));
}
}
index.insert_batch(entries)?;
}

let stats = Arc::new(RwLock::new(VectorDbStats {
total_vectors: 0,
total_vectors,
index_size_bytes: 0,
storage_size_bytes: 0,
avg_query_latency_us: 0.0,
Expand Down Expand Up @@ -300,4 +316,64 @@ mod tests {
assert!(db.delete("test1").unwrap());
assert_eq!(db.count().unwrap(), 0);
}

/// Issue #430: search must return persisted vectors after the VectorDB is
/// reopened. Before the fix, `VectorDB::new` always created an empty
/// in-memory HNSW, so `search` returned 0 results despite the storage
/// containing the vectors.
#[test]
fn test_index_rebuilt_from_storage_on_open() {
let dir = tempdir().unwrap();
let path = dir.path().join("rebuild.db");

// Write a handful of vectors with the first DB instance.
{
let db = VectorDB::builder()
.dimensions(4)
.storage_path(&path)
.build()
.unwrap();
for i in 0..5u32 {
let v = vec![i as f32, (i * 2) as f32, (i * 3) as f32, (i * 5) as f32];
db.insert(VectorEntry {
id: format!("v{i}"),
vector: v,
metadata: std::collections::HashMap::new(),
timestamp: 0,
})
.unwrap();
}
} // drop closes the storage handle.

// Reopen against the same on-disk path — index must be rebuilt.
let db = VectorDB::builder()
.dimensions(4)
.storage_path(&path)
.build()
.unwrap();

assert_eq!(
db.count().unwrap(),
5,
"storage.count() should report persisted vectors"
);

let q = SearchQuery {
vector: vec![2.0, 4.0, 6.0, 10.0], // matches v2 exactly
k: 3,
filters: None,
threshold: None,
ef_search: None,
};
let results = db.search(q).unwrap();
assert!(
!results.is_empty(),
"regression of #430: search returned 0 results after reopening; \
index was not rebuilt from storage"
);
assert_eq!(
results[0].id, "v2",
"exact-match query should return v2 as top hit"
);
}
}
Loading
Loading