Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.cassandra.index.sai.cql;

import java.util.List;

import org.junit.Test;

import org.apache.cassandra.index.sai.disk.vector.JVectorVersionUtil;

import static org.junit.Assert.assertTrue;

public class VectorSiftSmallCompactionTest extends VectorSiftSmallTester
{
    /**
     * Checks ANN recall on the SIFT-small dataset across the three compaction code paths:
     * the initial multi-sstable state, the first compaction (CassandraOnHeapGraph path),
     * and subsequent compactions (CompactionGraph path).
     */
    @Test
    public void testCompaction() throws Throwable
    {
        var base = readFvecs(String.format("test/data/%s/%s_base.fvecs", DATASET, DATASET));
        var queries = readFvecs(String.format("test/data/%s/%s_query.fvecs", DATASET, DATASET));
        var truth = readIvecs(String.format("test/data/%s/%s_groundtruth.ivecs", DATASET, DATASET));

        // Create table and index
        createTable();
        createIndex();

        // Compaction is driven manually below; background compactions would interfere.
        disableCompaction();

        // Split the base vectors evenly into ten sstables so the index starts multi-segment.
        int segmentCount = 10;
        int perSegment = base.size() / segmentCount;
        assert base.size() % perSegment == 0; // simplifies split logic
        for (int seg = 0; seg < segmentCount; seg++)
        {
            int offset = seg * perSegment;
            insertVectors(base.subList(offset, offset + perSegment), offset);
            flush();
        }
        for (int topK : List.of(1, 100))
        {
            double recall = testRecall(topK, queries, truth);
            assertTrue("Pre-compaction recall is " + recall, recall > 0.975);
        }

        // When NVQ is enabled, we expect worse recall
        float postCompactionRecall = JVectorVersionUtil.ENABLE_NVQ ? 0.9499f : 0.975f;

        // First compaction takes the CassandraOnHeapGraph code path.
        compact();
        for (int topK : List.of(1, 100))
        {
            var recall = testRecall(topK, queries, truth);
            assertTrue("Post-compaction recall is " + recall, recall > postCompactionRecall);
        }

        // Compacting again takes the CompactionGraph code path.
        compact();
        for (int topK : List.of(1, 100))
        {
            var recall = testRecall(topK, queries, truth);
            assertTrue("Post-compaction recall is " + recall, recall > postCompactionRecall);
        }

        // Set force PQ training size to ensure we hit the refine code path and apply it to half the vectors.
        // TODO this test fails as of this commit due to recall issues. Will investigate further.
        // CompactionGraph.PQ_TRAINING_SIZE = baseVectors.size() / 2;

        // Compact once more to take the CompactionGraph code path that calls the refine logic.
        compact();
        for (int topK : List.of(1, 100))
        {
            var recall = testRecall(topK, queries, truth);
            // This assertion is disabled until the design bug discussed in
            // https://github.com/riptano/cndb/issues/16637 is addressed.
            // assertTrue("Post-compaction recall is " + recall, recall > postCompactionRecall);
        }
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.cassandra.index.sai.cql;

import org.junit.Test;

import org.apache.cassandra.index.sai.StorageAttachedIndex;
import org.apache.cassandra.index.sai.disk.v1.SegmentBuilder;
import org.apache.cassandra.index.sai.disk.v2.V2VectorIndexSearcher;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;

public class VectorSiftSmallMultiSegmentBuildTest extends VectorSiftSmallTester
{
    // exercise the path where we use the PQ from the first segment (constructed on-heap)
    // to construct the others off-heap
    @Test
    public void testMultiSegmentBuild() throws Throwable
    {
        var baseVectors = readFvecs(String.format("test/data/%s/%s_base.fvecs", DATASET, DATASET));
        var queryVectors = readFvecs(String.format("test/data/%s/%s_query.fvecs", DATASET, DATASET));
        var groundTruth = readIvecs(String.format("test/data/%s/%s_groundtruth.ivecs", DATASET, DATASET));

        // Create table without index
        createTable();

        // we're going to compact manually, so disable background compactions to avoid interference
        disableCompaction();

        insertVectors(baseVectors, 0);
        // single big sstable before creating index
        flush();
        compact();

        SegmentBuilder.updateLastValidSegmentRowId(2000); // 2000 rows per segment, enough for PQ to be created
        createIndex();

        // verify that we got the expected number of segments and that PQ is present in all of them
        var sim = getCurrentColumnFamilyStore().getIndexManager();
        var index = (StorageAttachedIndex) sim.listIndexes().iterator().next();
        var searchableIndex = index.getIndexContext().getView().getIndexes().iterator().next();
        var segments = searchableIndex.getSegments();
        assertEquals(5, segments.size());
        // BUG FIX: the loop previously checked segments.get(0) on every iteration, so only the
        // first segment's PQ was ever verified; check each segment, bounded by the actual size.
        for (int i = 0; i < segments.size(); i++)
            assertNotNull(((V2VectorIndexSearcher) segments.get(i).getIndexSearcher()).getPQ());

        var recall = testRecall(100, queryVectors, groundTruth);
        assertTrue("Post-compaction recall is " + recall, recall > 0.975);
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.cassandra.index.sai.cql;

import java.util.Arrays;

import org.junit.Test;

import org.apache.cassandra.cql3.UntypedResultSet;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;

public class VectorSiftSmallRerankKZeroOrderMatchesFullPrecisionSimilarityTest extends VectorSiftSmallTester
{
    /**
     * Runs ANN queries with rerank_k = 0 and verifies that the rows come back in
     * descending order of their full-precision similarity scores.
     *
     * Note: test only fails when scores are sent from replica to coordinator.
     */
    @Test
    public void testRerankKZeroOrderMatchesFullPrecisionSimilarity() throws Throwable
    {
        var base = readFvecs(String.format("test/data/%s/%s_base.fvecs", DATASET, DATASET));
        var queries = readFvecs(String.format("test/data/%s/%s_query.fvecs", DATASET, DATASET));

        // Create table and index
        createTable();
        createIndex();

        // Flush because in memory index uses FP vectors, therefore ignoring rerank_k = 0
        insertVectors(base, 0);
        flush();

        // A subset of query vectors keeps the runtime reasonable; a high limit raises the
        // probability of surfacing any incorrect ordering.
        int queryCount = 10;
        int limit = 100;

        for (int q = 0; q < queryCount; q++)
        {
            String vectorLiteral = Arrays.toString(queries.get(q));

            // Execute query with rerank_k = 0 and get the similarity scores computed by the coordinator
            String cql = String.format("SELECT pk, similarity_euclidean(val, %s) as similarity FROM %%s ORDER BY val ANN OF %s LIMIT %d WITH ann_options = {'rerank_k': 0}",
                                       vectorLiteral, vectorLiteral, limit);
            UntypedResultSet rows = execute(cql);

            // Euclidean similarity is 1.0 / (1.0 + distance²), so a higher score means more
            // similar; each row's score must not exceed the previous row's.
            assertEquals(limit, rows.size());
            float previous = Float.MAX_VALUE;
            for (UntypedResultSet.Row row : rows)
            {
                float current = row.getFloat("similarity");
                String message = String.format("Query %d: Similarity scores should be in descending order (higher score = more similar). Previous: %.10f, Current: %.10f",
                                               q, previous, current);
                assertTrue(message, current <= previous);
                previous = current;
            }
        }
    }
}
Loading
Loading