Skip to content

Make splitnode only refers to tree nodes #127

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 11 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[package]
name = "arroy"
description = "Annoy-inspired Approximate Nearest Neighbors in Rust, based on LMDB and optimized for memory usage"
version = "0.6.1"
version = "0.7.0"
documentation = "https://docs.rs/arroy"
repository = "https://github.com/meilisearch/arroy"
keywords = ["ANN-search", "Graph-algorithms", "Vector-Search", "Store"]
Expand Down
9 changes: 8 additions & 1 deletion src/error.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use std::io;

use crate::{key::Key, node_id::NodeMode, ItemId};
use crate::{key::Key, node_id::NodeMode, version::Version, ItemId};

/// The different set of errors that arroy can encounter.
#[derive(Debug, thiserror::Error)]
Expand Down Expand Up @@ -72,6 +72,13 @@ pub enum Error {
/// The mode that couldn't be decoded.
mode: NodeMode,
},

/// Unknown version
#[error("Unknown version: v{version}")]
UnknownVersion {
/// The version that is unknown.
version: Version,
},
}

impl Error {
Expand Down
160 changes: 151 additions & 9 deletions src/node.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,14 @@ use std::fmt;
use std::mem::size_of;

use bytemuck::{bytes_of, cast_slice, pod_read_unaligned};
use byteorder::{ByteOrder, NativeEndian};
use byteorder::{BigEndian, ByteOrder, NativeEndian};
use heed::{BoxedError, BytesDecode, BytesEncode};
use roaring::RoaringBitmap;

use crate::distance::Distance;
use crate::node_id::NodeId;
use crate::unaligned_vector::UnalignedVector;
use crate::{ItemId, NodeId};
use crate::ItemId;

#[derive(Clone, Debug)]
pub enum Node<'a, D: Distance> {
Expand All @@ -18,6 +19,15 @@ pub enum Node<'a, D: Distance> {
SplitPlaneNormal(SplitPlaneNormal<'a, D>),
}

/// A node generic over the version of the database.
/// Should only be used while reading from the database.
#[derive(Clone, Debug)]
pub enum GenericReadNode<'a, D: Distance> {
Leaf(Leaf<'a, D>),
Descendants(Descendants<'a>),
SplitPlaneNormal(GenericReadSplitPlaneNormal<'a, D>),
}

const LEAF_TAG: u8 = 0;
const DESCENDANTS_TAG: u8 = 1;
const SPLIT_PLANE_NORMAL_TAG: u8 = 2;
Expand Down Expand Up @@ -113,8 +123,8 @@ impl fmt::Debug for ItemIds<'_> {
}

pub struct SplitPlaneNormal<'a, D: Distance> {
pub left: NodeId,
pub right: NodeId,
pub left: ItemId,
pub right: ItemId,
pub normal: Option<Cow<'a, UnalignedVector<D::VectorCodec>>>,
}

Expand All @@ -138,6 +148,35 @@ impl<D: Distance> Clone for SplitPlaneNormal<'_, D> {
}
}

pub struct GenericReadSplitPlaneNormal<'a, D: Distance> {
// Before version 0.7.0 the split plane normal was stored as a `NodeId` and could point directly to items.
pub left: NodeId,
pub right: NodeId,
// Before version 0.7.0 instead of storing `None` for a missing normal, we were
// storing a vector filled with zeros, that will be overwritten while creating this type.
pub normal: Option<Cow<'a, UnalignedVector<D::VectorCodec>>>,
}

impl<D: Distance> fmt::Debug for GenericReadSplitPlaneNormal<'_, D> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let name = format!("GenericReadSplitPlaneNormal<{}>", D::name());
let mut debug = f.debug_struct(&name);

debug.field("left", &self.left).field("right", &self.right);
match &self.normal {
Some(normal) => debug.field("normal", &normal),
None => debug.field("normal", &"none"),
};
debug.finish()
}
}

impl<D: Distance> Clone for GenericReadSplitPlaneNormal<'_, D> {
fn clone(&self) -> Self {
Self { left: self.left, right: self.right, normal: self.normal.clone() }
}
}

/// The codec used internally to encode and decode nodes.
pub struct NodeCodec<D>(D);

Expand All @@ -154,8 +193,8 @@ impl<'a, D: Distance> BytesEncode<'a> for NodeCodec<D> {
}
Node::SplitPlaneNormal(SplitPlaneNormal { normal, left, right }) => {
bytes.push(SPLIT_PLANE_NORMAL_TAG);
bytes.extend_from_slice(&left.to_bytes());
bytes.extend_from_slice(&right.to_bytes());
bytes.extend_from_slice(&left.to_be_bytes());
bytes.extend_from_slice(&right.to_be_bytes());
if let Some(normal) = normal {
bytes.extend_from_slice(normal.as_bytes());
}
Expand All @@ -182,8 +221,10 @@ impl<'a, D: Distance> BytesDecode<'a> for NodeCodec<D> {
Ok(Node::Leaf(Leaf { header, vector }))
}
[SPLIT_PLANE_NORMAL_TAG, bytes @ ..] => {
let (left, bytes) = NodeId::from_bytes(bytes);
let (right, bytes) = NodeId::from_bytes(bytes);
let left = BigEndian::read_u32(bytes);
let bytes = &bytes[std::mem::size_of_val(&left)..];
let right = BigEndian::read_u32(bytes);
let bytes = &bytes[std::mem::size_of_val(&right)..];
let normal = if bytes.is_empty() {
None
} else {
Expand All @@ -194,7 +235,108 @@ impl<'a, D: Distance> BytesDecode<'a> for NodeCodec<D> {
[DESCENDANTS_TAG, bytes @ ..] => Ok(Node::Descendants(Descendants {
descendants: Cow::Owned(RoaringBitmap::deserialize_from(bytes)?),
})),
unknown => panic!("What the fuck is an {unknown:?}"),
unknown => panic!(
"Did not recognize node tag type: {unknown:?} while decoding a node from v0.7.0"
),
}
}
}

/// The codec used internally during read operations to decode nodes to a common interface from the v0.4.0.
pub struct GenericReadNodeCodecFromV0_4_0<D>(D);

impl<'a, D: Distance> BytesDecode<'a> for GenericReadNodeCodecFromV0_4_0<D> {
type DItem = GenericReadNode<'a, D>;

fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
match bytes {
[LEAF_TAG, bytes @ ..] => {
let (header_bytes, remaining) = bytes.split_at(size_of::<D::Header>());
let header = pod_read_unaligned(header_bytes);
let vector = UnalignedVector::<D::VectorCodec>::from_bytes(remaining)?;

Ok(GenericReadNode::Leaf(Leaf { header, vector }))
}
[SPLIT_PLANE_NORMAL_TAG, bytes @ ..] => {
// From v0.4.0 to v0.5.0 included, the children were stored as `NodeId` and could point directly to items.
let (left, bytes) = NodeId::from_bytes(bytes);
let (right, bytes) = NodeId::from_bytes(bytes);
// And the normal could not be null, but it could be a vector filled with zeros.
let normal = UnalignedVector::<D::VectorCodec>::from_bytes(bytes)?;
let normal = if normal.is_zero() {
None
} else {
Some(normal)
};
Ok(GenericReadNode::SplitPlaneNormal(GenericReadSplitPlaneNormal { normal, left, right }))
}
[DESCENDANTS_TAG, bytes @ ..] => Ok(GenericReadNode::Descendants(Descendants {
descendants: Cow::Owned(RoaringBitmap::deserialize_from(bytes)?),
})),
unknown => panic!("Did not recognize node tag type: {unknown:?} while decoding a generic read node from v0.4.0"),
}
}
}

/// The codec used internally during read operations to decode nodes to a common interface from the v0.7.0.
pub struct GenericReadNodeCodecFromV0_7_0<D>(D);

impl<'a, D: Distance> BytesDecode<'a> for GenericReadNodeCodecFromV0_7_0<D> {
type DItem = GenericReadNode<'a, D>;

fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
NodeCodec::bytes_decode(bytes).map(|node| match node {
Node::SplitPlaneNormal(split_plane_normal) => {
GenericReadNode::SplitPlaneNormal(GenericReadSplitPlaneNormal {
// From v0.6.0 the split plane normal always points to a tree node.
left: NodeId::tree(split_plane_normal.left),
right: NodeId::tree(split_plane_normal.right),
normal: split_plane_normal.normal,
})
}
Node::Descendants(descendants) => GenericReadNode::Descendants(descendants),
Node::Leaf(leaf) => GenericReadNode::Leaf(leaf),
})
}
}

/// The codec used internally during read operations to decode nodes to a common interface from the v0.4.0.
pub struct WriteNodeCodecForV0_5_0<D>(D);

impl<'a, D: Distance> BytesEncode<'a> for WriteNodeCodecForV0_5_0<D> {
// Since the dimension of the vector has been lost while converting to a generic node, we need to get it back.
type EItem = (GenericReadNode<'a, D>, usize);

fn bytes_encode(item: &Self::EItem) -> Result<Cow<'a, [u8]>, BoxedError> {
// It's ok to clone and be slow because that only happens once when upgrading from v0.4.0 to v0.5.0.
match &item.0 {
// The leaf didn't change between v0.4.0 and today.
GenericReadNode::Leaf(leaf) => {
Ok(NodeCodec::bytes_encode(&Node::Leaf(leaf.clone()))?.into_owned().into())
}
// The descendants didn't change between v0.4.0 and today.
GenericReadNode::Descendants(descendants) => {
Ok(NodeCodec::bytes_encode(&Node::<D>::Descendants(descendants.clone()))?
.into_owned()
.into())
}
GenericReadNode::SplitPlaneNormal(GenericReadSplitPlaneNormal {
left,
right,
normal,
}) => {
// Original code at: https://github.com/meilisearch/arroy/blob/5b748bac2c69c65a97980901b02067a3a545e357/src/node.rs#L152-L157
let mut bytes = Vec::new();
bytes.push(SPLIT_PLANE_NORMAL_TAG);
bytes.extend_from_slice(&left.to_bytes());
bytes.extend_from_slice(&right.to_bytes());
match normal {
Some(normal) => bytes.extend_from_slice(normal.as_bytes()),
// If the normal is None, we need to write a vector filled with zeros.
None => bytes.extend_from_slice(&vec![0; item.1]),
}
Ok(Cow::Owned(bytes))
}
}
}
}
10 changes: 2 additions & 8 deletions src/parallel.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ use roaring::{RoaringBitmap, RoaringTreemap};
use crate::internals::{KeyCodec, Leaf, NodeCodec};
use crate::key::{Key, Prefix, PrefixCodec};
use crate::node::{Node, SplitPlaneNormal};
use crate::node_id::NodeMode;
use crate::{Database, Distance, Error, ItemId, Result};

/// A structure to store the tree nodes out of the heed database.
Expand Down Expand Up @@ -489,13 +488,8 @@ impl<'t, D: Distance> ImmutableTrees<'t, D> {
}
Node::SplitPlaneNormal(SplitPlaneNormal { left, right, normal: _ }) => {
trees.insert(current, (bytes.len(), bytes.as_ptr()));
// We must avoid the items and only push the tree nodes
if left.mode == NodeMode::Tree {
explore.push(left.item);
}
if right.mode == NodeMode::Tree {
explore.push(right.item);
}
explore.push(left);
explore.push(right);
}
}
}
Expand Down
Loading
Loading