Skip to content

Commit 934bb10

Browse files
pierrecouch authored and ajaybhullar committed
Add sparse index
Change-Id: I667476574384f700cc497375ffb7a692e32df7ac
Reviewed-on: https://review.couchbase.org/c/testrunner/+/243200
Tested-by: Ajay Bhullar <ajay.bhullar@couchbase.com>
Reviewed-by: Ajay Bhullar <ajay.bhullar@couchbase.com>
1 parent 2d9fbc0 commit 934bb10

3 files changed

Lines changed: 56 additions & 29 deletions

File tree

conf/tuq/py-tuq-vector-sparse.conf

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# Run the vector search tests with sparse vectors and DOT distance.
2+
# with the following options:
3+
# - use_xattr=true, use_base64=true
4+
# - use_xattr=true, use_base64=false
5+
# - use_xattr=false, use_base64=true
6+
# - use_xattr=false, use_base64=false
7+
tuqquery.tuq_vectorsearch.VectorSearchTests:
8+
test_knn_distances_sparse,vector_type=sparse,distance=DOT
9+
test_knn_search,vector_type=sparse,distance=DOT
10+
test_ann_search,vector_type=sparse,distance=DOT,nprobes=24,train=20000
11+
test_ann_search,vector_type=sparse,distance=DOT,nprobes=24,use_bhive=True,train=20000

lib/vector/vector.py

Lines changed: 16 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -376,19 +376,23 @@ def multi_upsert_document_into_cb(self, cb_coll, documents):
376376
print(e)
377377

378378
class IndexVector(object):
379-
def create_index(self, cluster, bucket='default', scope='_default', collection='_default', index_order='tail', vector_field='vec', is_xattr=False, is_base64=False, network_byte_order=False, dimension=128, train=10000, description='IVF,PQ32x8', similarity='L2_SQUARED', nprobes=3, use_bhive=False, custom_index_fields=None,custom_name=None,use_partition=False):
379+
def create_index(self, cluster, bucket='default', scope='_default', collection='_default', index_order='tail', vector_field='vec', is_xattr=False, is_base64=False, network_byte_order=False, dimension=128, train=10000, description='IVF,PQ32x8', similarity='L2_SQUARED', nprobes=3, use_bhive=False, vector_type='dense', custom_index_fields=None,custom_name=None,use_partition=False):
380380
cb = cluster.bucket(bucket)
381381
cb_scope = cb.scope(scope)
382382
if is_xattr:
383383
vector_field = f"meta().xattrs.{vector_field}"
384384
if is_base64:
385385
vector_field = f"DECODE_VECTOR({vector_field}, {network_byte_order})"
386386

387-
vector_definition = {"dimension": dimension, "train_list": train, "description": description, "similarity": similarity, "scan_nprobes": nprobes}
387+
if vector_type == 'dense':
388+
vector_definition = {"dimension": dimension, "train_list": train, "description": description, "similarity": similarity, "scan_nprobes": nprobes}
389+
elif vector_type == 'sparse':
390+
vector_definition = {"similarity": "DOT", "train_list": train, "scan_nprobes": nprobes}
391+
388392
index_queries = {
389-
'tail': f'CREATE INDEX vector_index_{similarity} IF NOT EXISTS ON {collection}(size, brand, {vector_field} VECTOR) WITH {vector_definition}',
390-
'mid': f'CREATE INDEX vector_index_{similarity} IF NOT EXISTS ON {collection}(size, {vector_field} VECTOR, brand) WITH {vector_definition}',
391-
'lead': f'CREATE INDEX vector_index_{similarity} IF NOT EXISTS ON {collection}({vector_field} VECTOR, size, brand) WITH {vector_definition}',
393+
'tail': f'CREATE INDEX vector_index_{similarity} IF NOT EXISTS ON {collection}(size, brand, {vector_field} {vector_type} VECTOR) WITH {vector_definition}',
394+
'mid': f'CREATE INDEX vector_index_{similarity} IF NOT EXISTS ON {collection}(size, {vector_field} {vector_type} VECTOR, brand) WITH {vector_definition}',
395+
'lead': f'CREATE INDEX vector_index_{similarity} IF NOT EXISTS ON {collection}({vector_field} {vector_type} VECTOR, size, brand) WITH {vector_definition}',
392396
}
393397
if custom_index_fields:
394398
if custom_name:
@@ -402,17 +406,17 @@ def create_index(self, cluster, bucket='default', scope='_default', collection='
402406
if ",vec VECTOR" in custom_index_fields:
403407
custom_index_fields = custom_index_fields.replace(",vec VECTOR", "")
404408
if custom_name:
405-
index_query = f'CREATE VECTOR INDEX {custom_name} IF NOT EXISTS ON {collection}({vector_field} VECTOR) INCLUDE({custom_index_fields}) WITH {vector_definition}'
409+
index_query = f'CREATE VECTOR INDEX {custom_name} IF NOT EXISTS ON {collection}({vector_field} {vector_type} VECTOR) INCLUDE({custom_index_fields}) WITH {vector_definition}'
406410
else:
407-
index_query = f'CREATE VECTOR INDEX vector_bhive_index_{similarity}_custom IF NOT EXISTS ON {collection}({vector_field} VECTOR) INCLUDE({custom_index_fields}) WITH {vector_definition}'
411+
index_query = f'CREATE VECTOR INDEX vector_bhive_index_{similarity}_custom IF NOT EXISTS ON {collection}({vector_field} {vector_type} VECTOR) INCLUDE({custom_index_fields}) WITH {vector_definition}'
408412
elif "vec VECTOR" in custom_index_fields:
409413
custom_index_fields = custom_index_fields.replace("vec VECTOR", "")
410414
if custom_name:
411-
index_query = f'CREATE VECTOR INDEX {custom_name} IF NOT EXISTS ON {collection}({vector_field} VECTOR) WITH {vector_definition}'
415+
index_query = f'CREATE VECTOR INDEX {custom_name} IF NOT EXISTS ON {collection}({vector_field} {vector_type} VECTOR) WITH {vector_definition}'
412416
else:
413-
index_query = f'CREATE VECTOR INDEX vector_bhive_index_{similarity}_custom IF NOT EXISTS ON {collection}({vector_field} VECTOR) WITH {vector_definition}'
417+
index_query = f'CREATE VECTOR INDEX vector_bhive_index_{similarity}_custom IF NOT EXISTS ON {collection}({vector_field} {vector_type} VECTOR) WITH {vector_definition}'
414418
else:
415-
index_query = f'CREATE VECTOR INDEX vector_bhive_index_{similarity} IF NOT EXISTS ON {collection}({vector_field} VECTOR) INCLUDE(size, brand) WITH {vector_definition}'
419+
index_query = f'CREATE VECTOR INDEX vector_bhive_index_{similarity} IF NOT EXISTS ON {collection}({vector_field} {vector_type} VECTOR) INCLUDE(size, brand) WITH {vector_definition}'
416420
if use_partition:
417421
index_query = index_query.split("WITH")[0] + f" PARTITION BY HASH(meta().id) WITH " + index_query.split("WITH")[1]
418422
print(index_query)
@@ -505,10 +509,10 @@ def vector_ann_query(self, vector_field='vec', collection='_default', search_fun
505509
]
506510
if vector_type == 'sparse':
507511
query = (
508-
f"SELECT id, SPARSE_ANN_DISTANCE({vector_field}, $qvec, {nprobes}) as distance "
512+
f"SELECT id, SPARSE_VECTOR_DISTANCE({vector_field}, $qvec, {nprobes}) as distance "
509513
f"FROM {collection} "
510514
f"WHERE {size_predicate[random.randint(0,4)]} AND brand IN $brand "
511-
f"ORDER BY SPARSE_ANN_DISTANCE({vector_field}, $qvec, {nprobes}) {direction} LIMIT {k}"
515+
f"ORDER BY SPARSE_VECTOR_DISTANCE({vector_field}, $qvec, {nprobes}) {direction} LIMIT {k}"
512516
)
513517
else:
514518
query = (

pytests/tuqquery/tuq_vectorsearch.py

Lines changed: 29 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ def setUp(self):
1818
self.bucket = "default"
1919
self.recall_knn = 100
2020
self.recall_ann = 40 # TBD
21+
self.recall_sparse_ann = 40 # TBD
2122
self.accuracy_ann = 2 # TBD
2223
self.vector = self.input.param("vector", [1,2,3])
2324
self.use_xattr = self.input.param("use_xattr", False)
@@ -183,27 +184,38 @@ def test_ann_search(self):
183184
# we use existing SIFT ground truth for verification for L2/EUCLIDEAN
184185
try:
185186
self.log.info("Create Vector Index")
186-
IndexVector().create_index(self.database, index_order=self.index_order, similarity=self.distance, nprobes=self.nprobes, is_xattr=self.use_xattr, is_base64=self.use_base64, network_byte_order=self.use_bigendian, description=self.description, dimension=self.dimension, train=self.train, use_bhive=self.use_bhive)
187+
IndexVector().create_index(self.database, index_order=self.index_order, similarity=self.distance, nprobes=self.nprobes, is_xattr=self.use_xattr, is_base64=self.use_base64, network_byte_order=self.use_bigendian, description=self.description, dimension=self.dimension, train=self.train, use_bhive=self.use_bhive, vector_type=self.vector_type)
187188

188-
self.log.info("Verify Vector Index Metadata and Stats")
189-
self.verify_vector_index_metadata_and_stats(expected_description=self.description,
190-
expected_dimension=self.dimension,
191-
expected_train_list=self.train,
192-
expected_nprobes=self.nprobes,
193-
expected_similarity=self.distance)
189+
if self.vector_type == 'dense':
190+
self.log.info("Verify Vector Index Metadata and Stats")
191+
self.verify_vector_index_metadata_and_stats(expected_description=self.description,
192+
expected_dimension=self.dimension,
193+
expected_train_list=self.train,
194+
expected_nprobes=self.nprobes,
195+
expected_similarity=self.distance)
194196

195-
begin = random.randint(0, len(self.xq) - self.query_count)
197+
begin = random.randint(0, self.xq.shape[0] - self.query_count)
196198
self.log.info(f"Running ANN query for range [{begin}:{begin+self.query_count}]")
197-
distances, indices = QueryVector().search(self.database, self.xq[begin:begin+self.query_count], search_function=self.distance, type='ANN', is_xattr=self.use_xattr, is_base64=self.use_base64, is_bigendian=self.use_bigendian, nprobes=self.nprobes)
199+
distances, indices = QueryVector().search(self.database, self.xq[begin:begin+self.query_count], search_function=self.distance, type='ANN', is_xattr=self.use_xattr, is_base64=self.use_base64, is_bigendian=self.use_bigendian, vector_type=self.vector_type, nprobes=self.nprobes)
198200
for i in range(self.query_count):
199-
self.log.info(f"Check recall rate for query {begin+i} compare to SIFT ({self.distance})")
200-
recall, accuracy = UtilVector().compare_result(self.gt[begin+i].tolist(), indices[i].tolist())
201-
self.log.info(f'Recall rate: {round(recall, 2)}% with acccuracy: {round(accuracy,2)}%')
202-
if recall < self.recall_ann:
203-
self.log.warn(f"Expected: {self.gt[begin+i].tolist()}")
204-
self.log.warn(f"Actual: {indices[i].tolist()}")
205-
self.log.warn(f"Distances: {distances[i].tolist()}")
206-
self.fail(f"Recall rate of {recall} is less than expected {self.recall_ann}")
201+
if self.vector_type == 'dense':
202+
self.log.info(f"Check recall rate for query {begin+i} compare to SIFT ({self.distance})")
203+
recall, accuracy = UtilVector().compare_result(self.gt[begin+i].tolist(), indices[i].tolist())
204+
self.log.info(f'Recall rate: {round(recall, 2)}% with acccuracy: {round(accuracy,2)}%')
205+
if recall < self.recall_ann:
206+
self.log.warn(f"Expected: {self.gt[begin+i].tolist()}")
207+
self.log.warn(f"Actual: {indices[i].tolist()}")
208+
self.log.warn(f"Distances: {distances[i].tolist()}")
209+
self.fail(f"Recall rate of {recall} is less than expected {self.recall_ann}")
210+
elif self.vector_type == 'sparse':
211+
self.log.info(f"Check recall rate for {self.vector_type} query {begin+i} compare to GT")
212+
recall, accuracy = UtilVector().compare_result(self.gt[0][begin+i].tolist(), indices[i][:10].tolist())
213+
self.log.info(f'Recall rate: {round(recall, 2)}% with acccuracy: {round(accuracy,2)}%')
214+
if recall < self.recall_sparse_ann:
215+
self.log.warn(f"Expected: {self.gt[0][begin+i].tolist()}")
216+
self.log.warn(f"Actual: {indices[i][:10].tolist()}")
217+
self.log.warn(f"Distances: {distances[i][:10].tolist()}")
218+
self.fail(f"Recall rate of {recall} is less than expected {self.recall_sparse_ann}")
207219
finally:
208220
IndexVector().drop_index(self.database, similarity=self.distance, use_bhive=self.use_bhive)
209221

0 commit comments

Comments
 (0)