Add sparse index

pierrecouch · ajaybhullar · commit 934bb10b11e3 · 2026-04-14T23:07:22.000Z
Change-Id: I667476574384f700cc497375ffb7a692e32df7ac Reviewed-on: https://review.couchbase.org/c/testrunner/+/243200 Tested-by: Ajay Bhullar <ajay.bhullar@couchbase.com> Reviewed-by: Ajay Bhullar <ajay.bhullar@couchbase.com>
diff --git a/conf/tuq/py-tuq-vector-sparse.conf b/conf/tuq/py-tuq-vector-sparse.conf
@@ -0,0 +1,11 @@
+# Run the vector search tests with sparse vectors and DOT distance,
+# with the following options:
+# - use_xattr=true, use_base64=true
+# - use_xattr=true, use_base64=false
+# - use_xattr=false, use_base64=true
+# - use_xattr=false, use_base64=false
+tuqquery.tuq_vectorsearch.VectorSearchTests:
+    test_knn_distances_sparse,vector_type=sparse,distance=DOT
+    test_knn_search,vector_type=sparse,distance=DOT
+    test_ann_search,vector_type=sparse,distance=DOT,nprobes=24,train=20000
+    test_ann_search,vector_type=sparse,distance=DOT,nprobes=24,use_bhive=True,train=20000
diff --git a/lib/vector/vector.py b/lib/vector/vector.py
@@ -376,19 +376,23 @@ def multi_upsert_document_into_cb(self, cb_coll, documents):
             print(e)
 
 class IndexVector(object):
-    def create_index(self, cluster, bucket='default', scope='_default', collection='_default', index_order='tail', vector_field='vec', is_xattr=False, is_base64=False, network_byte_order=False, dimension=128, train=10000, description='IVF,PQ32x8', similarity='L2_SQUARED', nprobes=3, use_bhive=False, custom_index_fields=None,custom_name=None,use_partition=False):
+    def create_index(self, cluster, bucket='default', scope='_default', collection='_default', index_order='tail', vector_field='vec', is_xattr=False, is_base64=False, network_byte_order=False, dimension=128, train=10000, description='IVF,PQ32x8', similarity='L2_SQUARED', nprobes=3, use_bhive=False, vector_type='dense', custom_index_fields=None,custom_name=None,use_partition=False):
         cb = cluster.bucket(bucket)
         cb_scope = cb.scope(scope)
         if is_xattr:
             vector_field = f"meta().xattrs.{vector_field}"
         if is_base64:
             vector_field = f"DECODE_VECTOR({vector_field}, {network_byte_order})"
         
-        vector_definition = {"dimension": dimension, "train_list": train, "description": description, "similarity": similarity, "scan_nprobes": nprobes}
+        if vector_type == 'dense':
+            vector_definition = {"dimension": dimension, "train_list": train, "description": description, "similarity": similarity, "scan_nprobes": nprobes}
+        elif vector_type == 'sparse':
+            vector_definition = {"similarity": "DOT", "train_list": train, "scan_nprobes": nprobes}
+
         index_queries = {
-            'tail': f'CREATE INDEX vector_index_{similarity} IF NOT EXISTS ON {collection}(size, brand, {vector_field} VECTOR) WITH {vector_definition}',
-            'mid': f'CREATE INDEX vector_index_{similarity} IF NOT EXISTS ON {collection}(size, {vector_field} VECTOR, brand) WITH {vector_definition}',
-            'lead': f'CREATE INDEX vector_index_{similarity} IF NOT EXISTS ON {collection}({vector_field} VECTOR, size, brand) WITH {vector_definition}',
+            'tail': f'CREATE INDEX vector_index_{similarity} IF NOT EXISTS ON {collection}(size, brand, {vector_field} {vector_type} VECTOR) WITH {vector_definition}',
+            'mid': f'CREATE INDEX vector_index_{similarity} IF NOT EXISTS ON {collection}(size, {vector_field} {vector_type} VECTOR, brand) WITH {vector_definition}',
+            'lead': f'CREATE INDEX vector_index_{similarity} IF NOT EXISTS ON {collection}({vector_field} {vector_type} VECTOR, size, brand) WITH {vector_definition}',
         }
         if custom_index_fields:
             if custom_name:
@@ -402,17 +406,17 @@ def create_index(self, cluster, bucket='default', scope='_default', collection='
                 if ",vec VECTOR" in custom_index_fields:
                     custom_index_fields = custom_index_fields.replace(",vec VECTOR", "")
                     if custom_name:
-                        index_query = f'CREATE VECTOR INDEX {custom_name} IF NOT EXISTS ON {collection}({vector_field} VECTOR) INCLUDE({custom_index_fields}) WITH {vector_definition}'
+                        index_query = f'CREATE VECTOR INDEX {custom_name} IF NOT EXISTS ON {collection}({vector_field} {vector_type} VECTOR) INCLUDE({custom_index_fields}) WITH {vector_definition}'
                     else:
-                        index_query = f'CREATE VECTOR INDEX vector_bhive_index_{similarity}_custom IF NOT EXISTS ON {collection}({vector_field} VECTOR) INCLUDE({custom_index_fields}) WITH {vector_definition}'
+                        index_query = f'CREATE VECTOR INDEX vector_bhive_index_{similarity}_custom IF NOT EXISTS ON {collection}({vector_field} {vector_type} VECTOR) INCLUDE({custom_index_fields}) WITH {vector_definition}'
                 elif "vec VECTOR" in custom_index_fields:
                     custom_index_fields = custom_index_fields.replace("vec VECTOR", "")
                     if custom_name:
-                        index_query = f'CREATE VECTOR INDEX {custom_name} IF NOT EXISTS ON {collection}({vector_field} VECTOR) WITH {vector_definition}'
+                        index_query = f'CREATE VECTOR INDEX {custom_name} IF NOT EXISTS ON {collection}({vector_field} {vector_type} VECTOR) WITH {vector_definition}'
                     else:
-                        index_query = f'CREATE VECTOR INDEX vector_bhive_index_{similarity}_custom IF NOT EXISTS ON {collection}({vector_field} VECTOR) WITH {vector_definition}'
+                        index_query = f'CREATE VECTOR INDEX vector_bhive_index_{similarity}_custom IF NOT EXISTS ON {collection}({vector_field} {vector_type} VECTOR) WITH {vector_definition}'
             else:
-                index_query = f'CREATE VECTOR INDEX vector_bhive_index_{similarity} IF NOT EXISTS ON {collection}({vector_field} VECTOR) INCLUDE(size, brand) WITH {vector_definition}'
+                index_query = f'CREATE VECTOR INDEX vector_bhive_index_{similarity} IF NOT EXISTS ON {collection}({vector_field} {vector_type} VECTOR) INCLUDE(size, brand) WITH {vector_definition}'
         if use_partition:
             index_query = index_query.split("WITH")[0] + f" PARTITION BY HASH(meta().id) WITH " + index_query.split("WITH")[1]
         print(index_query)
@@ -505,10 +509,10 @@ def vector_ann_query(self, vector_field='vec', collection='_default', search_fun
         ]
         if vector_type == 'sparse':
             query = (
-                f"SELECT id, SPARSE_ANN_DISTANCE({vector_field}, $qvec, {nprobes}) as distance "
+                f"SELECT id, SPARSE_VECTOR_DISTANCE({vector_field}, $qvec, {nprobes}) as distance "
                 f"FROM {collection} "
                 f"WHERE {size_predicate[random.randint(0,4)]} AND brand IN $brand "
-                f"ORDER BY SPARSE_ANN_DISTANCE({vector_field}, $qvec, {nprobes}) {direction} LIMIT {k}"
+                f"ORDER BY SPARSE_VECTOR_DISTANCE({vector_field}, $qvec, {nprobes}) {direction} LIMIT {k}"
             )
         else:
             query = (
diff --git a/pytests/tuqquery/tuq_vectorsearch.py b/pytests/tuqquery/tuq_vectorsearch.py
@@ -18,6 +18,7 @@ def setUp(self):
         self.bucket = "default"
         self.recall_knn = 100
         self.recall_ann = 40 # TBD
+        self.recall_sparse_ann = 40 # TBD
         self.accuracy_ann = 2 # TBD
         self.vector = self.input.param("vector", [1,2,3])
         self.use_xattr = self.input.param("use_xattr", False)
@@ -183,27 +184,38 @@ def test_ann_search(self):
         # we use existing SIFT ground truth for verification for L2/EUCLIDEAN
         try:
             self.log.info("Create Vector Index")
-            IndexVector().create_index(self.database, index_order=self.index_order, similarity=self.distance, nprobes=self.nprobes, is_xattr=self.use_xattr, is_base64=self.use_base64, network_byte_order=self.use_bigendian, description=self.description, dimension=self.dimension, train=self.train, use_bhive=self.use_bhive)
+            IndexVector().create_index(self.database, index_order=self.index_order, similarity=self.distance, nprobes=self.nprobes, is_xattr=self.use_xattr, is_base64=self.use_base64, network_byte_order=self.use_bigendian, description=self.description, dimension=self.dimension, train=self.train, use_bhive=self.use_bhive, vector_type=self.vector_type)
 
-            self.log.info("Verify Vector Index Metadata and Stats")
-            self.verify_vector_index_metadata_and_stats(expected_description=self.description,
-                                            expected_dimension=self.dimension,
-                                            expected_train_list=self.train,
-                                            expected_nprobes=self.nprobes,
-                                            expected_similarity=self.distance)
+            if self.vector_type == 'dense':
+                self.log.info("Verify Vector Index Metadata and Stats")
+                self.verify_vector_index_metadata_and_stats(expected_description=self.description,
+                                                expected_dimension=self.dimension,
+                                                expected_train_list=self.train,
+                                                expected_nprobes=self.nprobes,
+                                                expected_similarity=self.distance)
 
-            begin = random.randint(0, len(self.xq) - self.query_count)
+            begin = random.randint(0, self.xq.shape[0] - self.query_count)
             self.log.info(f"Running ANN query for range [{begin}:{begin+self.query_count}]")
-            distances, indices = QueryVector().search(self.database, self.xq[begin:begin+self.query_count], search_function=self.distance, type='ANN', is_xattr=self.use_xattr, is_base64=self.use_base64, is_bigendian=self.use_bigendian, nprobes=self.nprobes)
+            distances, indices = QueryVector().search(self.database, self.xq[begin:begin+self.query_count], search_function=self.distance, type='ANN', is_xattr=self.use_xattr, is_base64=self.use_base64, is_bigendian=self.use_bigendian, vector_type=self.vector_type, nprobes=self.nprobes)
             for i in range(self.query_count):
-                self.log.info(f"Check recall rate for query {begin+i} compare to SIFT ({self.distance})")
-                recall, accuracy = UtilVector().compare_result(self.gt[begin+i].tolist(), indices[i].tolist())
-                self.log.info(f'Recall rate: {round(recall, 2)}% with acccuracy: {round(accuracy,2)}%')
-                if recall < self.recall_ann:
-                    self.log.warn(f"Expected: {self.gt[begin+i].tolist()}")
-                    self.log.warn(f"Actual: {indices[i].tolist()}")
-                    self.log.warn(f"Distances: {distances[i].tolist()}")
-                    self.fail(f"Recall rate of {recall} is less than expected {self.recall_ann}")
+                if self.vector_type == 'dense':
+                    self.log.info(f"Check recall rate for query {begin+i} compare to SIFT ({self.distance})")
+                    recall, accuracy = UtilVector().compare_result(self.gt[begin+i].tolist(), indices[i].tolist())
+                    self.log.info(f'Recall rate: {round(recall, 2)}% with acccuracy: {round(accuracy,2)}%')
+                    if recall < self.recall_ann:
+                        self.log.warn(f"Expected: {self.gt[begin+i].tolist()}")
+                        self.log.warn(f"Actual: {indices[i].tolist()}")
+                        self.log.warn(f"Distances: {distances[i].tolist()}")
+                        self.fail(f"Recall rate of {recall} is less than expected {self.recall_ann}")
+                elif self.vector_type == 'sparse':
+                    self.log.info(f"Check recall rate for {self.vector_type} query {begin+i} compare to GT")
+                    recall, accuracy = UtilVector().compare_result(self.gt[0][begin+i].tolist(), indices[i][:10].tolist())
+                    self.log.info(f'Recall rate: {round(recall, 2)}% with acccuracy: {round(accuracy,2)}%')
+                    if recall < self.recall_sparse_ann:
+                        self.log.warn(f"Expected: {self.gt[0][begin+i].tolist()}")
+                        self.log.warn(f"Actual: {indices[i][:10].tolist()}")
+                        self.log.warn(f"Distances: {distances[i][:10].tolist()}")
+                        self.fail(f"Recall rate of {recall} is less than expected {self.recall_sparse_ann}")
         finally:
             IndexVector().drop_index(self.database, similarity=self.distance, use_bhive=self.use_bhive)