Skip to content

Commit

Permalink
Merge pull request #98 from meilisearch/fix-the-append
Browse files Browse the repository at this point in the history
Reorganize the NodeId to make the append work again and add a test
  • Loading branch information
irevoire authored Oct 1, 2024
2 parents e6dd6df + 799b46b commit 24083df
Show file tree
Hide file tree
Showing 8 changed files with 402 additions and 354 deletions.
12 changes: 8 additions & 4 deletions examples/import-vectors.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
use std::fs;
use std::io::{BufRead, BufReader};
use std::path::PathBuf;
use std::time::Instant;
use std::time::{Duration, Instant};

use arroy::distances::DotProduct;
use arroy::{Database, Writer};
Expand All @@ -12,8 +12,8 @@ use heed::{EnvFlags, EnvOpenOptions};
use rand::rngs::StdRng;
use rand::SeedableRng;

/// 2 GiB
const DEFAULT_MAP_SIZE: usize = 1024 * 1024 * 1024 * 2;
/// 200 GiB
const DEFAULT_MAP_SIZE: usize = 1024 * 1024 * 1024 * 200;

#[derive(Parser)]
#[command(author, version, about, long_about = None)]
Expand Down Expand Up @@ -74,6 +74,7 @@ fn main() -> Result<(), heed::BoxedError> {
// === END vectors ===

let now = Instant::now();
let mut insertion_time = Duration::default();
let mut count = 0;
for line in reader.lines() {
let line = line?;
Expand All @@ -89,14 +90,17 @@ fn main() -> Result<(), heed::BoxedError> {
.map(|s| s.trim().parse::<f32>().unwrap())
.collect();

let now = Instant::now();
if no_append {
writer.add_item(&mut wtxn, id, &vector)?;
} else {
writer.append_item(&mut wtxn, id, &vector)?;
}
insertion_time += now.elapsed();
count += 1;
}
println!("Took {:.2?} to parse and insert into arroy", now.elapsed());
println!("Took {:.2?} to parse and insert into arroy", now.elapsed() - insertion_time);
println!("Took {insertion_time:.2?} to insert into arroy");
println!("There are {count} vectors");
println!();

Expand Down
12 changes: 6 additions & 6 deletions src/node_id.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,9 @@ use crate::ItemId;
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[repr(u8)]
pub enum NodeMode {
Item = 0,
Metadata = 0,
Tree = 1,
Metadata = 2,
Item = 2,
}

impl TryFrom<u8> for NodeMode {
Expand Down Expand Up @@ -107,11 +107,11 @@ mod test {
assert!(NodeId::tree(1) > NodeId::tree(0));
assert!(NodeId::tree(0) < NodeId::tree(1));

// tree > item whatever is the value
assert!(NodeId::tree(0) > NodeId::item(1));
// tree < item, whatever the value is
assert!(NodeId::tree(u32::MAX) < NodeId::item(0));

assert!(NodeId::metadata() == NodeId::metadata());
assert!(NodeId::metadata() > NodeId::tree(12));
assert!(NodeId::metadata() > NodeId::item(12));
assert!(NodeId::metadata() < NodeId::tree(u32::MAX));
assert!(NodeId::metadata() < NodeId::item(u32::MAX));
}
}
4 changes: 2 additions & 2 deletions src/tests/binary_quantized.rs
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,8 @@ fn write_and_retrieve_binary_quantized_vector() {
insta::assert_snapshot!(handle, @r###"
==================
Dumping index 0
Item 0: Leaf(Leaf { header: NodeHeaderBinaryQuantizedEuclidean { bias: 0.0 }, vector: [-1.0000, -1.0000, 1.0000, -1.0000, 1.0000, 1.0000, -1.0000, 1.0000, -1.0000, -1.0000, "other ..."] })
Tree 0: Descendants(Descendants { descendants: [0] })
Root: Metadata { dimensions: 16, items: RoaringBitmap<[0]>, roots: [0], distance: "binary quantized euclidean" }
Tree 0: Descendants(Descendants { descendants: [0] })
Item 0: Leaf(Leaf { header: NodeHeaderBinaryQuantizedEuclidean { bias: 0.0 }, vector: [-1.0000, -1.0000, 1.0000, -1.0000, 1.0000, 1.0000, -1.0000, 1.0000, -1.0000, -1.0000, "other ..."] })
"###);
}
10 changes: 5 additions & 5 deletions src/tests/reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -112,10 +112,10 @@ fn two_dimension_on_a_line() {
let reader = Reader::<Euclidean>::open(&rtxn, 0, handle.database).unwrap();

// if we can't look into enough nodes we find some random points
let ret = reader.nns(5).search_k(NonZeroUsize::new(1).unwrap()).by_item(&rtxn, 0).unwrap();
let ret = reader.nns(5).search_k(NonZeroUsize::new(1).unwrap()).by_item(&rtxn, 1).unwrap();
insta::assert_snapshot!(NnsRes(ret), @r###"
id(48): distance(48)
id(92): distance(92)
id(48): distance(47)
id(92): distance(91)
"###);

// if we can look into all the nodes there is no infinite loop and it works
Expand All @@ -131,11 +131,11 @@ fn two_dimension_on_a_line() {

let ret = reader.nns(5).by_item(&rtxn, 0).unwrap();
insta::assert_snapshot!(NnsRes(ret), @r###"
id(0): distance(0)
id(1): distance(1)
id(2): distance(2)
id(3): distance(3)
id(4): distance(4)
id(5): distance(5)
"###);
}

Expand All @@ -162,11 +162,11 @@ fn two_dimension_on_a_column() {
let ret = reader.nns(5).by_item(&rtxn, 0).unwrap();

insta::assert_snapshot!(NnsRes(ret), @r###"
id(0): distance(0)
id(1): distance(1)
id(2): distance(2)
id(3): distance(3)
id(4): distance(4)
id(5): distance(5)
"###);
}

Expand Down

Large diffs are not rendered by default.

Large diffs are not rendered by default.

317 changes: 180 additions & 137 deletions src/tests/writer.rs

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions src/writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -307,6 +307,7 @@ impl<D: Distance> Writer<D> {
.remap_data_type::<RoaringBitmapCodec>()
.get(wtxn, &Key::updated(self.index))?
.unwrap_or_default();
// We cannot append here because we may have removed an item with a larger id before
updated.insert(item);
self.database.remap_data_type::<RoaringBitmapCodec>().put(
wtxn,
Expand Down

0 comments on commit 24083df

Please sign in to comment.