lnx-search · ChillFish8 · Jan 16, 2025 · Jan 16, 2025
diff --git a/lnx-tantivy/src/collectors/top_docs/distinct.rs b/lnx-tantivy/src/collectors/top_docs/distinct.rs
@@ -20,6 +20,16 @@
     inner_collector: C,
 }
 
+impl<C> Distinct<C> {
+    /// Wraps `inner` in a distinct collector that de-duplicates values by the
+    /// given `fields`.
+    pub fn for_fields(fields: Vec<Field>, inner: C) -> Self {
+        let select_fields = fields;
+        let inner_collector = inner;
+        Self { select_fields, inner_collector }
+    }
+}
+
 impl<C> Collector for Distinct<C>
 where
     C: Collector + CollectorExt,
@@ -181,3 +191,139 @@
         }
     }
 }
+
+
+#[cfg(test)]
+mod tests {
+    use tantivy::indexer::IndexWriterOptions;
+    use tantivy::query::AllQuery;
+    use tantivy::schema::{Schema, FAST, INDEXED, STORED, TEXT};
+    use tantivy::{doc, Index, TantivyError};
+
+    use super::*;
+    use crate::collectors::top_docs::TopDocs;
+
+    /// Builds an in-memory index holding three committed documents.
+    ///
+    /// Documents `1` and `2` carry identical `title`, `description` and `ip`
+    /// values, while document `3` has its own `title` and `description` and no
+    /// `ip` at all. Only `id`, `title` and `ip` are declared as fast fields.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the index writer cannot be created, a document cannot be
+    /// added, or the commit fails.
+    fn create_test_index() -> Index {
+        let mut builder = Schema::builder();
+        let id = builder.add_u64_field("id", INDEXED | STORED | FAST);
+        let title = builder.add_text_field("title", TEXT | STORED | FAST);
+        let description = builder.add_text_field("description", STORED);
+        let ip = builder.add_ip_addr_field("ip", STORED | FAST);
+        let index = Index::create_in_ram(builder.build());
+
+        // A single worker thread and no merge threads are requested for the writer.
+        let writer_options = IndexWriterOptions::builder()
+            .num_worker_threads(1)
+            .num_merge_threads(0)
+            .build();
+        let mut writer = index.writer_with_options(writer_options).unwrap();
+
+        let documents = [
+            doc!(
+                id => 1u64,
+                title => "The old man and the sea",
+                description => "example text here today",
+                ip => Ipv6Addr::LOCALHOST,
+            ),
+            doc!(
+                id => 2u64,
+                title => "The old man and the sea",
+                description => "example text here today",
+                ip => Ipv6Addr::LOCALHOST,
+            ),
+            doc!(
+                id => 3u64,
+                title => "X men",
+                description => "Something something rivals",
+            ),
+        ];
+        for document in documents {
+            writer.add_document(document).unwrap();
+        }
+        writer.commit().unwrap();
+
+        index
+    }
+
+    /// Runs [`AllQuery`] against a fresh test index through a [`Distinct`]
+    /// collector that wraps `inner` and selects the field called `field_name`.
+    ///
+    /// The outcome of the search is handed back untouched so each test can
+    /// assert on either the hits or the error.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `field_name` is missing from the test schema or the index
+    /// reader cannot be opened.
+    fn search_distinct<C>(
+        field_name: &str,
+        inner: C,
+    ) -> tantivy::Result<<Distinct<C> as Collector>::Fruit>
+    where
+        Distinct<C>: Collector,
+    {
+        let index = create_test_index();
+        let field = index.schema().get_field(field_name).unwrap();
+        let searcher = index.reader().unwrap().searcher();
+        let collector = Distinct::for_fields(vec![field], inner);
+        searcher.search(&AllQuery, &collector)
+    }
+
+    /// Documents `1` and `2` share a title, so de-duplicating on `title` is
+    /// expected to leave two hits.
+    #[test]
+    fn test_distinct_without_offset() {
+        let hits = search_distinct("title", TopDocs::with_limit(10));
+        assert_eq!(hits.expect("Complete search").len(), 2);
+    }
+
+    /// Every document has its own `id`, so all three are expected to be kept.
+    #[test]
+    fn test_distinct_no_duplicates() {
+        let hits = search_distinct("id", TopDocs::with_limit(10));
+        assert_eq!(hits.expect("Complete search").len(), 3);
+    }
+
+    /// A limit of zero is expected to produce no hits at all.
+    #[test]
+    fn test_distinct_zero_limit() {
+        let hits = search_distinct("title", TopDocs::with_limit(0));
+        assert_eq!(hits.expect("Complete search").len(), 0);
+    }
+
+    /// With two distinct titles in the index, an offset of one is expected to
+    /// leave a single hit.
+    #[test]
+    fn test_distinct_with_offset() {
+        let top_docs = TopDocs::with_limit(3).and_offset(1);
+        let hits = search_distinct("title", top_docs);
+        assert_eq!(hits.expect("Complete search").len(), 1);
+    }
+
+    /// `description` is only `STORED`, so the search is expected to fail with
+    /// [`TantivyError::InvalidArgument`].
+    #[test]
+    fn test_distinct_non_fast_field() {
+        let outcome = search_distinct("description", TopDocs::with_limit(3));
+        let error = outcome.expect_err("Cannot search distinct on non-fast field");
+        assert!(matches!(error, TantivyError::InvalidArgument(_)));
+    }
+
+    /// Documents `1` and `2` share an address and document `3` has none; two
+    /// hits are expected (presumably the missing value forms its own group).
+    #[test]
+    fn test_distinct_ip_field() {
+        let hits = search_distinct("ip", TopDocs::with_limit(3));
+        assert_eq!(hits.expect("Complete search").len(), 2);
+    }
+}
diff --git a/lnx-tantivy/src/collectors/top_docs/mod.rs b/lnx-tantivy/src/collectors/top_docs/mod.rs
@@ -21,7 +21,7 @@

 impl TopDocs {
    /// Creates a new [TopDocs] using the given limit.
    pub fn with_limit(limit: usize) -> Self {
        Self {
            limit,
            offset: 0,
@@ -196,7 +196,7 @@
 }

 impl<T: PartialOrd + Clone> TopSegmentCollector<T> {
    fn new(segment_ord: SegmentOrdinal, limit: usize) -> TopSegmentCollector<T> {
        TopSegmentCollector {
            topn_computer: TopNComputer::new(limit),
            segment_ord,
@@ -448,7 +448,7 @@
        }

        let mut expected_docs_titles = Vec::new();
        for (score, doc) in expected_docs {
            let doc = searcher.doc::<tantivy::TantivyDocument>(doc).unwrap();
            let title = doc.get_first(title_field).unwrap();
            expected_docs_titles.push(title.as_str().unwrap().to_string());
@@ -463,7 +463,6 @@
         let reader = index.reader().unwrap();
         let schema = index.schema();
         let id_field = schema.get_field("id").unwrap();
-        let title_field = schema.get_field("title").unwrap();
 
         let options = IndexWriterOptions::builder()
             .num_worker_threads(1)