Skip to content

Implement Hamming distance for binary strings #124

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 10 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions proptest-regressions/unaligned_vector/binary_test.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Seeds for failure cases proptest has generated in the past. It is
# automatically read and these particular cases re-run before any
# novel cases are generated.
#
# It is recommended to check this file in to source control so that
# everyone who runs the test benefits from these saved cases.
cc 4044d46b46fbadeb98b1c37a1b7ec57f00fe21008401479d30f2f39aa83e3195 # shrinks to original = [0.0]
164 changes: 164 additions & 0 deletions src/distance/hamming.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
use crate::distance::Distance;
use crate::internals::Side;
use crate::node::Leaf;
use crate::parallel::ImmutableSubsetLeafs;
use crate::unaligned_vector::{Binary, UnalignedVector};
use bytemuck::{Pod, Zeroable};
use rand::Rng;
use std::borrow::Cow;

/// The Hamming distance between two vectors is the number of positions at
/// which the corresponding symbols are different.
///
/// `d(u,v) = ||u ^ v||₁`
///
/// /!\ This distance function is binary: vectors lose their scalar precision
/// because every value is converted to `0` or `1` under the rule
/// `x > 0.0 => 1`, otherwise `0`.
///
/// The enum has no variants, so it can never be instantiated; it is only
/// usable as a type parameter.
#[derive(Debug, Clone)]
pub enum Hamming {}

/// The header of Hamming leaf nodes.
///
/// It declares no fields: `Hamming::new_header` builds it without reading
/// anything from the vector.
#[repr(C)]
#[derive(Pod, Zeroable, Debug, Clone, Copy)]
pub struct NodeHeaderHamming {}

impl Distance for Hamming {
    const DEFAULT_OVERSAMPLING: usize = 3;

    type Header = NodeHeaderHamming;
    type VectorCodec = Binary;

    /// Returns the name identifying this distance: `"hamming"`.
    fn name() -> &'static str {
        "hamming"
    }

    /// Builds the field-less leaf header; the vector is not inspected.
    fn new_header(_vector: &UnalignedVector<Self::VectorCodec>) -> Self::Header {
        NodeHeaderHamming {}
    }

    /// Counts the bits that differ between the byte representations of the
    /// two leaves' vectors.
    fn built_distance(p: &Leaf<Self>, q: &Leaf<Self>) -> f32 {
        let lhs = p.vector.as_bytes();
        let rhs = q.vector.as_bytes();
        hamming_bitwise_fast(lhs, rhs)
    }

    /// Returns `d` unchanged; the second argument is ignored.
    fn normalized_distance(d: f32, _: usize) -> f32 {
        d
    }

    /// Returns the number of set bits in the vector's bytes.
    fn norm_no_header(v: &UnalignedVector<Self::VectorCodec>) -> f32 {
        let set_bits: i32 = v.as_bytes().iter().map(|byte| byte.count_ones() as i32).sum();
        set_bits as f32
    }

    /// Nothing to initialise: the header carries no data.
    fn init(_node: &mut Leaf<Self>) {}

    /// Picks the one-hot vector used to split `children` in two.
    ///
    /// Unlike other distances, which build a separating hyperplane, this
    /// constructs an LSH by bit sampling and stores the sampled bit in a
    /// one-hot vector:
    /// https://en.wikipedia.org/wiki/Locality-sensitive_hashing#Bit_sampling_for_Hamming_distance
    ///
    /// A random coordinate is tried first; if it does not separate the
    /// sampled children, every coordinate is tried in order. When none
    /// qualifies, the last candidate that was built is returned anyway.
    ///
    /// # Errors
    ///
    /// Propagates any `heed` error raised while sampling `children`.
    ///
    /// # Panics
    ///
    /// Panics if sampling `children` yields `None`.
    fn create_split<'a, R: Rng>(
        children: &'a ImmutableSubsetLeafs<Self>,
        rng: &mut R,
    ) -> heed::Result<Cow<'a, UnalignedVector<Self::VectorCodec>>> {
        const ITERATION_STEPS: usize = 200;

        // A candidate is a usable split when, over `ITERATION_STEPS` randomly
        // drawn children, at least one has the candidate bit set and at least
        // one does not.
        let splits_children = |candidate: &UnalignedVector<Self::VectorCodec>,
                               rng: &mut R|
         -> heed::Result<bool> {
            let mut hits: usize = 0;
            for _ in 0..ITERATION_STEPS {
                let sample = children.choose(rng)?.unwrap().vector;
                if Self::margin_no_header(candidate, sample.as_ref()) > 0.0 {
                    hits += 1;
                }
            }
            Ok(0 < hits && hits < ITERATION_STEPS)
        };

        let dim = children.choose(rng)?.unwrap().vector.len();

        // Builds the vector whose only non-zero coordinate is `bit`.
        let one_hot = |bit: usize| {
            let mut coordinates = vec![0.0_f32; dim];
            coordinates[bit] = 1.0;
            UnalignedVector::<Self::VectorCodec>::from_vec(coordinates)
        };

        // First try a random index.
        let first_guess = rng.gen_range(0..dim);
        let mut normal = one_hot(first_guess);

        if !splits_children(&normal, rng)? {
            // Otherwise brute-force search for a splitting coordinate. If the
            // loop runs to completion, `normal` is left holding the last
            // coordinate, which serves as the fallback.
            for bit in 0..dim {
                normal = one_hot(bit);
                if splits_children(&normal, rng)? {
                    break;
                }
            }
        }

        Ok(Cow::Owned(normal.into_owned()))
    }

    /// Counts the bits set in both `p` and `q`.
    ///
    /// `p` is expected to be a mask with a single bit set, so the result
    /// tells whether `q` has that bit set.
    fn margin_no_header(
        p: &UnalignedVector<Self::VectorCodec>,
        q: &UnalignedVector<Self::VectorCodec>,
    ) -> f32 {
        p.as_bytes()
            .iter()
            .zip(q.as_bytes())
            .map(|(mask_byte, data_byte)| (mask_byte & data_byte).count_ones())
            .sum::<u32>() as f32
    }

    /// Lowers `distance` by `margin` on the left side and by `1.0 - margin`
    /// on the right side.
    fn pq_distance(distance: f32, margin: f32, side: Side) -> f32 {
        let reduction = match side {
            Side::Left => margin,
            Side::Right => 1.0 - margin,
        };
        distance - reduction
    }

    /// Sends `node` to the right when it shares a set bit with
    /// `normal_plane`, and to the left otherwise. The random generator is
    /// not used.
    fn side<R: Rng>(
        normal_plane: &UnalignedVector<Self::VectorCodec>,
        node: &Leaf<Self>,
        _rng: &mut R,
    ) -> Side {
        if Self::margin_no_header(&node.vector, normal_plane) > 0.0 {
            Side::Right
        } else {
            Side::Left
        }
    }
}

/// Returns the number of bits that differ between `u` and `v`, as an `f32`.
///
/// Based on https://github.com/emschwartz/hamming-bitwise-fast. The input is
/// processed in fixed-size words first and the leftover bytes afterwards; this
/// explicit structure lends itself to SIMD optimizations by the compiler, see
/// https://matklad.github.io/2023/04/09/can-you-trust-a-compiler-to-optimize-your-code.html
///
/// # Panics
///
/// Panics if `u` and `v` do not have the same length.
#[inline]
pub fn hamming_bitwise_fast(u: &[u8], v: &[u8]) -> f32 {
    assert_eq!(u.len(), v.len());

    type BitPackedWord = u64;
    const CHUNK_SIZE: usize = std::mem::size_of::<BitPackedWord>();

    let u_words = u.chunks_exact(CHUNK_SIZE);
    let v_words = v.chunks_exact(CHUNK_SIZE);

    // The remainders borrow from the input slices, not from the iterators, so
    // they can be taken before the iterators are consumed.
    let u_tail = u_words.remainder();
    let v_tail = v_words.remainder();

    let word_bits: u32 = u_words
        .zip(v_words)
        .map(|(lhs, rhs)| {
            let lhs = BitPackedWord::from_ne_bytes(lhs.try_into().unwrap());
            let rhs = BitPackedWord::from_ne_bytes(rhs.try_into().unwrap());
            (lhs ^ rhs).count_ones()
        })
        .sum();

    // Empty when the length is a multiple of the word size, contributing 0.
    let tail_bits: u32 = u_tail.iter().zip(v_tail).map(|(lhs, rhs)| (lhs ^ rhs).count_ones()).sum();

    (word_bits + tail_bits) as f32
}
2 changes: 2 additions & 0 deletions src/distance/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ use bytemuck::{Pod, Zeroable};
pub use cosine::{Cosine, NodeHeaderCosine};
pub use dot_product::{DotProduct, NodeHeaderDotProduct};
pub use euclidean::{Euclidean, NodeHeaderEuclidean};
pub use hamming::{Hamming, NodeHeaderHamming};
use heed::{RwPrefix, RwTxn};
pub use manhattan::{Manhattan, NodeHeaderManhattan};
use rand::Rng;
Expand All @@ -28,6 +29,7 @@ mod binary_quantized_manhattan;
mod cosine;
mod dot_product;
mod euclidean;
mod hamming;
mod manhattan;

fn new_leaf<D: Distance>(vec: Vec<f32>) -> Leaf<'static, D> {
Expand Down
4 changes: 2 additions & 2 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ pub mod internals {
pub use crate::distance::{
NodeHeaderBinaryQuantizedCosine, NodeHeaderBinaryQuantizedEuclidean,
NodeHeaderBinaryQuantizedManhattan, NodeHeaderCosine, NodeHeaderDotProduct,
NodeHeaderEuclidean, NodeHeaderManhattan,
NodeHeaderEuclidean, NodeHeaderHamming, NodeHeaderManhattan,
};
pub use crate::key::KeyCodec;
pub use crate::node::{Leaf, NodeCodec};
Expand Down Expand Up @@ -145,7 +145,7 @@ pub mod internals {
pub mod distances {
pub use crate::distance::{
BinaryQuantizedCosine, BinaryQuantizedEuclidean, BinaryQuantizedManhattan, Cosine,
DotProduct, Euclidean, Manhattan,
DotProduct, Euclidean, Hamming, Manhattan,
};
}

Expand Down
55 changes: 55 additions & 0 deletions src/tests/binary.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
use crate::{
distance::Hamming,
tests::{create_database, rng},
Writer,
};

// Stores a single 16-value item under the `Hamming` distance, reads it back,
// then builds one tree and snapshots the dump of the resulting index.
#[test]
fn write_and_retrieve_binary_vector() {
let handle = create_database::<Hamming>();
let mut wtxn = handle.env.write_txn().unwrap();
// NOTE(review): presumably index 0 with 16 dimensions; this matches
// "Dumping index 0" and "dimensions: 16" in the dump snapshot below.
let writer = Writer::new(handle.database, 0, 16);
// The input mixes negative, zero and positive values on purpose.
writer
.add_item(
&mut wtxn,
0,
&[
-2.0, -1.0, 0.0, -0.1, 2.0, 2.0, -12.4, 21.2, -2.0, -1.0, 0.0, 1.0, 2.0, 2.0,
-12.4, 21.2,
],
)
.unwrap();
// Per the snapshot, strictly positive inputs read back as 1.0 and every other
// value (negative or zero) reads back as 0.0.
let vec = writer.item_vector(&wtxn, 0).unwrap().unwrap();
insta::assert_debug_snapshot!(vec, @r###"
[
0.0,
0.0,
0.0,
0.0,
1.0,
1.0,
0.0,
1.0,
0.0,
0.0,
0.0,
1.0,
1.0,
1.0,
0.0,
1.0,
]
"###);

// Build a single tree and commit before dumping the database.
writer.builder(&mut rng()).n_trees(1).build(&mut wtxn).unwrap();
wtxn.commit().unwrap();

// The dump records the distance name "hamming" and the `NodeHeaderHamming`
// header on the stored leaf.
insta::assert_snapshot!(handle, @r###"
==================
Dumping index 0
Root: Metadata { dimensions: 16, items: RoaringBitmap<[0]>, roots: [0], distance: "hamming" }
Version: Version { major: 0, minor: 6, patch: 1 }
Tree 0: Descendants(Descendants { descendants: [0] })
Item 0: Leaf(Leaf { header: NodeHeaderHamming, vector: [0.0000, 0.0000, 0.0000, 0.0000, 1.0000, 1.0000, 0.0000, 1.0000, 0.0000, 0.0000, "other ..."] })
"###);
}
1 change: 1 addition & 0 deletions src/tests/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ use tempfile::TempDir;
use crate::version::VersionCodec;
use crate::{Database, Distance, MetadataCodec, NodeCodec, NodeMode, Reader};

mod binary;
mod binary_quantized;
mod reader;
mod writer;
Expand Down
Loading