Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
146 changes: 146 additions & 0 deletions lnx-tantivy/src/collectors/top_docs/distinct.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,16 @@
inner_collector: C,
}

impl<C> Distinct<C> {
/// Creates a new distinct collector which de-duplicates values by the provided fields.
pub fn for_fields(fields: Vec<Field>, inner: C) -> Self {

Check warning on line 25 in lnx-tantivy/src/collectors/top_docs/distinct.rs

View workflow job for this annotation

GitHub Actions / clippy

associated function `for_fields` is never used

warning: associated function `for_fields` is never used --> lnx-tantivy/src/collectors/top_docs/distinct.rs:25:12 | 23 | impl<C> Distinct<C> { | ------------------- associated function in this implementation 24 | /// Creates a new distinct collector which de-duplicates value by the provided fields. 25 | pub fn for_fields(fields: Vec<Field>, inner: C) -> Self { | ^^^^^^^^^^
Self {
select_fields: fields,
inner_collector: inner,
}
}
}

impl<C> Collector for Distinct<C>
where
C: Collector + CollectorExt,
Expand Down Expand Up @@ -181,3 +191,139 @@
}
}
}


#[cfg(test)]
mod tests {
    use tantivy::indexer::IndexWriterOptions;
    use tantivy::query::AllQuery;
    use tantivy::schema::{Schema, FAST, INDEXED, STORED, TEXT};
    use tantivy::{doc, Index, TantivyError};

    use super::*;
    use crate::collectors::top_docs::TopDocs;

    /// Builds an in-RAM index with three committed documents.
    ///
    /// Documents 1 and 2 carry the same `title`, `description` and `ip`
    /// values; document 3 has a different `title` and `description` and no
    /// `ip` value at all. `description` is the only field declared without
    /// the `FAST` flag.
    fn create_test_index() -> Index {
        let mut builder = Schema::builder();
        let id = builder.add_u64_field("id", INDEXED | STORED | FAST);
        let title = builder.add_text_field("title", TEXT | STORED | FAST);
        let description = builder.add_text_field("description", STORED);
        let ip = builder.add_ip_addr_field("ip", STORED | FAST);
        let index = Index::create_in_ram(builder.build());

        let writer_options = IndexWriterOptions::builder()
            .num_worker_threads(1)
            .num_merge_threads(0)
            .build();
        let mut index_writer = index.writer_with_options(writer_options).unwrap();

        // The first two documents differ only by their id.
        for doc_id in [1u64, 2u64] {
            index_writer
                .add_document(doc!(
                    id => doc_id,
                    title => "The old man and the sea",
                    description => "example text here today",
                    ip => Ipv6Addr::LOCALHOST,
                ))
                .unwrap();
        }
        index_writer
            .add_document(doc!(
                id => 3u64,
                title => "X men",
                description => "Something something rivals",
            ))
            .unwrap();

        index_writer.commit().unwrap();
        index
    }

    /// Two distinct titles exist across the three documents, so de-duplicating
    /// on `title` with a limit of 10 is expected to yield two results.
    #[test]
    fn test_distinct_without_offset() {
        let index = create_test_index();
        let index_reader = index.reader().unwrap();
        let title = index.schema().get_field("title").unwrap();

        let dedup_by_title = Distinct::for_fields(vec![title], TopDocs::with_limit(10));
        let hits = index_reader
            .searcher()
            .search(&AllQuery, &dedup_by_title)
            .expect("Complete search");
        assert_eq!(hits.len(), 2);
    }

    /// Every document has its own `id`, so de-duplicating on `id` is expected
    /// to keep all three.
    #[test]
    fn test_distinct_no_duplicates() {
        let index = create_test_index();
        let index_reader = index.reader().unwrap();
        let id = index.schema().get_field("id").unwrap();

        let dedup_by_id = Distinct::for_fields(vec![id], TopDocs::with_limit(10));
        let hits = index_reader
            .searcher()
            .search(&AllQuery, &dedup_by_id)
            .expect("Complete search");
        assert_eq!(hits.len(), 3);
    }

    /// A limit of zero on the wrapped collector is expected to produce an
    /// empty result rather than an error.
    #[test]
    fn test_distinct_zero_limit() {
        let index = create_test_index();
        let index_reader = index.reader().unwrap();
        let title = index.schema().get_field("title").unwrap();

        let dedup_by_title = Distinct::for_fields(vec![title], TopDocs::with_limit(0));
        let hits = index_reader
            .searcher()
            .search(&AllQuery, &dedup_by_title)
            .expect("Complete search");
        assert_eq!(hits.len(), 0);
    }

    /// With two distinct titles available, an offset of 1 is expected to
    /// leave a single result.
    #[test]
    fn test_distinct_with_offset() {
        let index = create_test_index();
        let index_reader = index.reader().unwrap();
        let title = index.schema().get_field("title").unwrap();

        let paged = TopDocs::with_limit(3).and_offset(1);
        let dedup_by_title = Distinct::for_fields(vec![title], paged);
        let hits = index_reader
            .searcher()
            .search(&AllQuery, &dedup_by_title)
            .expect("Complete search");
        assert_eq!(hits.len(), 1);
    }

    /// `description` is not a fast field; the search is expected to fail with
    /// `TantivyError::InvalidArgument`.
    #[test]
    fn test_distinct_non_fast_field() {
        let index = create_test_index();
        let index_reader = index.reader().unwrap();
        let description = index.schema().get_field("description").unwrap();

        let dedup_by_description =
            Distinct::for_fields(vec![description], TopDocs::with_limit(3));
        let outcome = index_reader
            .searcher()
            .search(&AllQuery, &dedup_by_description);
        let error = outcome.expect_err("Cannot search distinct on non-fast field");
        assert!(matches!(error, TantivyError::InvalidArgument(_)));
    }

    /// Two documents share the localhost address and one has no `ip` value;
    /// the test asserts two results (presumably the document without an `ip`
    /// forms its own group).
    #[test]
    fn test_distinct_ip_field() {
        let index = create_test_index();
        let index_reader = index.reader().unwrap();
        let ip = index.schema().get_field("ip").unwrap();

        let dedup_by_ip = Distinct::for_fields(vec![ip], TopDocs::with_limit(3));
        let hits = index_reader
            .searcher()
            .search(&AllQuery, &dedup_by_ip)
            .expect("Complete search");
        assert_eq!(hits.len(), 2);
    }
}
1 change: 0 additions & 1 deletion lnx-tantivy/src/collectors/top_docs/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@

impl TopDocs {
/// Creates a new [TopDocs] using the given limit.
pub fn with_limit(limit: usize) -> Self {

Check warning on line 24 in lnx-tantivy/src/collectors/top_docs/mod.rs

View workflow job for this annotation

GitHub Actions / clippy

associated items `with_limit` and `and_offset` are never used

warning: associated items `with_limit` and `and_offset` are never used --> lnx-tantivy/src/collectors/top_docs/mod.rs:24:12 | 22 | impl TopDocs { | ------------ associated items in this implementation 23 | /// Creates a new [TopDocs] using the given limit. 24 | pub fn with_limit(limit: usize) -> Self { | ^^^^^^^^^^ ... 33 | pub fn and_offset(mut self, offset: usize) -> Self { | ^^^^^^^^^^ | = note: `#[warn(dead_code)]` on by default
Self {
limit,
offset: 0,
Expand Down Expand Up @@ -196,7 +196,7 @@
}

impl<T: PartialOrd + Clone> TopSegmentCollector<T> {
fn new(segment_ord: SegmentOrdinal, limit: usize) -> TopSegmentCollector<T> {

Check warning on line 199 in lnx-tantivy/src/collectors/top_docs/mod.rs

View workflow job for this annotation

GitHub Actions / clippy

associated function `new` is never used

warning: associated function `new` is never used --> lnx-tantivy/src/collectors/top_docs/mod.rs:199:8 | 198 | impl<T: PartialOrd + Clone> TopSegmentCollector<T> { | -------------------------------------------------- associated function in this implementation 199 | fn new(segment_ord: SegmentOrdinal, limit: usize) -> TopSegmentCollector<T> { | ^^^ | = note: `#[warn(dead_code)]` on by default

Check warning on line 199 in lnx-tantivy/src/collectors/top_docs/mod.rs

View workflow job for this annotation

GitHub Actions / clippy

associated function `new` is never used

warning: associated function `new` is never used --> lnx-tantivy/src/collectors/top_docs/mod.rs:199:8 | 198 | impl<T: PartialOrd + Clone> TopSegmentCollector<T> { | -------------------------------------------------- associated function in this implementation 199 | fn new(segment_ord: SegmentOrdinal, limit: usize) -> TopSegmentCollector<T> { | ^^^
TopSegmentCollector {
topn_computer: TopNComputer::new(limit),
segment_ord,
Expand Down Expand Up @@ -448,7 +448,7 @@
}

let mut expected_docs_titles = Vec::new();
for (score, doc) in expected_docs {

Check warning on line 451 in lnx-tantivy/src/collectors/top_docs/mod.rs

View workflow job for this annotation

GitHub Actions / clippy

unused variable: `score`

warning: unused variable: `score` --> lnx-tantivy/src/collectors/top_docs/mod.rs:451:14 | 451 | for (score, doc) in expected_docs { | ^^^^^ help: if this is intentional, prefix it with an underscore: `_score` | = note: `#[warn(unused_variables)]` on by default
let doc = searcher.doc::<tantivy::TantivyDocument>(doc).unwrap();
let title = doc.get_first(title_field).unwrap();
expected_docs_titles.push(title.as_str().unwrap().to_string());
Expand All @@ -463,7 +463,6 @@
let reader = index.reader().unwrap();
let schema = index.schema();
let id_field = schema.get_field("id").unwrap();
let title_field = schema.get_field("title").unwrap();

let options = IndexWriterOptions::builder()
.num_worker_threads(1)
Expand Down
Loading