Skip to content
This repository was archived by the owner on Apr 4, 2023. It is now read-only.

Commit b307c93

Browse files
committed
Store fuzzy/bucketed positions in word_position_docids database
Fixes (when merged into meilisearch): meilisearch/meilisearch#3222
1 parent a8defb5 commit b307c93

File tree

3 files changed

+87
-8
lines changed

3 files changed

+87
-8
lines changed

milli/src/lib.rs

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,27 @@ pub fn absolute_from_relative_position(field_id: FieldId, relative: RelativePosi
100100
(field_id as u32) << 16 | (relative as u32)
101101
}
102102

103+
/// Compute the "bucketed" absolute position from the field id and relative position in the field.
104+
///
105+
/// In a bucketed position, the accuracy of the relative position is reduced exponentially as it gets larger.
106+
pub fn bucketed_absolute_from_relative_position(
107+
field_id: FieldId,
108+
relative: RelativePosition,
109+
) -> Position {
110+
// The first few relative positions are kept intact.
111+
if relative < 16 {
112+
absolute_from_relative_position(field_id, relative)
113+
} else if relative < 24 {
114+
// Relative positions between 16 and 24 all become equal to 24
115+
absolute_from_relative_position(field_id, 24)
116+
} else {
117+
// Then, groups of positions that have the same base-2 logarithm are reduced to
118+
// the same relative position: the smallest power of 2 that is greater than them
119+
let relative = (relative as f64).log2().ceil().exp2() as u16;
120+
absolute_from_relative_position(field_id, relative)
121+
}
122+
}
123+
103124
/// Transform a raw obkv store into a JSON Object.
104125
pub fn obkv_to_json(
105126
displayed_fields: &[FieldId],
@@ -329,4 +350,51 @@ mod tests {
329350

330351
assert_eq!(&actual, expected);
331352
}
353+
354+
// Inline-snapshot test pinning the output of `bucketed_absolute_from_relative_position`
// for a range of relative positions and field ids.
#[test]
355+
fn bucketed_position() {
356+
// Field 0, relative positions below 16: returned unchanged.
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 0), @"0");
357+
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 1), @"1");
358+
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 2), @"2");
359+
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 15), @"15");
360+
// Relative positions 16 through 23 all collapse to 24.
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 16), @"24");
361+
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 19), @"24");
362+
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 20), @"24");
363+
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 21), @"24");
364+
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 22), @"24");
365+
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 23), @"24");
366+
// From 24 onwards, positions are rounded up to a power of two.
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 24), @"32");
367+
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 25), @"32");
368+
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 30), @"32");
369+
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 40), @"64");
370+
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 50), @"64");
371+
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 60), @"64");
372+
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 70), @"128");
373+
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 80), @"128");
374+
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 90), @"128");
375+
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 100), @"128");
376+
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 110), @"128");
377+
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 120), @"128");
378+
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 130), @"256");
379+
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 1000), @"1024");
380+
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 2000), @"2048");
381+
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 4000), @"4096");
382+
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 8000), @"8192");
383+
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 9000), @"16384");
384+
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 10_000), @"16384");
385+
// The largest relative positions are expected to yield 65535 (not 65536).
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 65_500), @"65535");
386+
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 65_535), @"65535");
387+
388+
// Field 1: each expected value is 65536 (1 << 16) plus the bucketed relative position,
// e.g. 20 -> 24 gives 65560 and 1000 -> 1024 gives 66560.
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(1, 0), @"65536");
389+
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(1, 1), @"65537");
390+
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(1, 20), @"65560");
391+
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(1, 1000), @"66560");
392+
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(1, 65_535), @"131071");
393+
394+
// Field 2: lowest and highest relative positions.
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(2, 0), @"131072");
395+
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(2, 65_535), @"196607");
396+
397+
// Field 65535: the last case expects 4294967295, i.e. u32::MAX.
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(65_535, 0), @"4294901760");
398+
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(65_535, 65_535), @"4294967295");
399+
}
332400
}

milli/src/search/criteria/exactness.rs

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,9 @@ use crate::search::criteria::{
1111
InitialCandidates,
1212
};
1313
use crate::search::query_tree::{Operation, PrimitiveQueryPart};
14-
use crate::{absolute_from_relative_position, FieldId, Result};
14+
use crate::{
15+
absolute_from_relative_position, bucketed_absolute_from_relative_position, FieldId, Result,
16+
};
1517

1618
pub struct Exactness<'t> {
1719
ctx: &'t dyn Context<'t>,
@@ -285,30 +287,34 @@ fn attribute_start_with_docids(
285287
) -> heed::Result<Vec<RoaringBitmap>> {
286288
let mut attribute_candidates_array = Vec::new();
287289
// start from attribute first position
288-
let mut pos = absolute_from_relative_position(attribute_id, 0);
290+
let mut relative_pos = 0;
289291
for part in query {
290292
use ExactQueryPart::*;
291293
match part {
292294
Synonyms(synonyms) => {
295+
let bucketed_position =
296+
bucketed_absolute_from_relative_position(attribute_id, relative_pos);
293297
let mut synonyms_candidates = RoaringBitmap::new();
294298
for word in synonyms {
295-
let wc = ctx.word_position_docids(word, pos)?;
299+
let wc = ctx.word_position_docids(word, bucketed_position)?;
296300
if let Some(word_candidates) = wc {
297301
synonyms_candidates |= word_candidates;
298302
}
299303
}
300304
attribute_candidates_array.push(synonyms_candidates);
301-
pos += 1;
305+
relative_pos += 1;
302306
}
303307
Phrase(phrase) => {
304308
for word in phrase {
309+
let bucketed_position =
310+
bucketed_absolute_from_relative_position(attribute_id, relative_pos);
305311
if let Some(word) = word {
306-
let wc = ctx.word_position_docids(word, pos)?;
312+
let wc = ctx.word_position_docids(word, bucketed_position)?;
307313
if let Some(word_candidates) = wc {
308314
attribute_candidates_array.push(word_candidates);
309315
}
310316
}
311-
pos += 1;
317+
relative_pos += 1;
312318
}
313319
}
314320
}

milli/src/update/index_documents/extract/extract_word_position_docids.rs

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,9 @@ use super::helpers::{
77
};
88
use crate::error::SerializationError;
99
use crate::index::db_name::DOCID_WORD_POSITIONS;
10-
use crate::{DocumentId, Result};
10+
use crate::{
11+
bucketed_absolute_from_relative_position, relative_from_absolute_position, DocumentId, Result,
12+
};
1113

1214
/// Extracts the word positions and the documents ids where this word appear.
1315
///
@@ -37,9 +39,12 @@ pub fn extract_word_position_docids<R: io::Read + io::Seek>(
3739
let document_id = DocumentId::from_be_bytes(document_id_bytes);
3840

3941
for position in read_u32_ne_bytes(value) {
42+
let (field_id, relative) = relative_from_absolute_position(position);
43+
let bucketed_position = bucketed_absolute_from_relative_position(field_id, relative);
44+
4045
key_buffer.clear();
4146
key_buffer.extend_from_slice(word_bytes);
42-
key_buffer.extend_from_slice(&position.to_be_bytes());
47+
key_buffer.extend_from_slice(&bucketed_position.to_be_bytes());
4348

4449
word_position_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?;
4550
}

0 commit comments

Comments
 (0)