Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.cassandra.index.sai.cql;

import java.util.List;

import org.junit.Test;

import org.apache.cassandra.index.sai.disk.vector.JVectorVersionUtil;

import static org.junit.Assert.assertTrue;

public class VectorSiftSmallCompactionTest extends VectorSiftSmallTester
{
    /**
     * Checks ANN recall on the SIFT-small dataset across the three compaction code paths:
     * the initial multi-sstable state, the first compaction (CassandraOnHeapGraph path),
     * and subsequent compactions (CompactionGraph path).
     */
    @Test
    public void testCompaction() throws Throwable
    {
        var base = readFvecs(String.format("test/data/%s/%s_base.fvecs", DATASET, DATASET));
        var queries = readFvecs(String.format("test/data/%s/%s_query.fvecs", DATASET, DATASET));
        var truth = readIvecs(String.format("test/data/%s/%s_groundtruth.ivecs", DATASET, DATASET));

        // Create table and index
        createTable();
        createIndex();

        // Compaction is driven manually below; background compactions would interfere.
        disableCompaction();

        // Split the base vectors evenly into ten sstables so the index starts multi-segment.
        int segmentCount = 10;
        int perSegment = base.size() / segmentCount;
        assert base.size() % perSegment == 0; // simplifies split logic
        for (int seg = 0; seg < segmentCount; seg++)
        {
            int offset = seg * perSegment;
            insertVectors(base.subList(offset, offset + perSegment), offset);
            flush();
        }
        for (int topK : List.of(1, 100))
        {
            double recall = testRecall(topK, queries, truth);
            assertTrue("Pre-compaction recall is " + recall, recall > 0.975);
        }

        // When NVQ is enabled, we expect worse recall
        float postCompactionRecall = JVectorVersionUtil.ENABLE_NVQ ? 0.9499f : 0.975f;

        // First compaction takes the CassandraOnHeapGraph code path.
        compact();
        for (int topK : List.of(1, 100))
        {
            var recall = testRecall(topK, queries, truth);
            assertTrue("Post-compaction recall is " + recall, recall > postCompactionRecall);
        }

        // Compacting again takes the CompactionGraph code path.
        compact();
        for (int topK : List.of(1, 100))
        {
            var recall = testRecall(topK, queries, truth);
            assertTrue("Post-compaction recall is " + recall, recall > postCompactionRecall);
        }

        // Set force PQ training size to ensure we hit the refine code path and apply it to half the vectors.
        // TODO this test fails as of this commit due to recall issues. Will investigate further.
        // CompactionGraph.PQ_TRAINING_SIZE = baseVectors.size() / 2;

        // Compact once more to take the CompactionGraph code path that calls the refine logic.
        compact();
        for (int topK : List.of(1, 100))
        {
            var recall = testRecall(topK, queries, truth);
            // This assertion is disabled until the design bug discussed in
            // https://github.com/riptano/cndb/issues/16637 is addressed.
            // assertTrue("Post-compaction recall is " + recall, recall > postCompactionRecall);
        }
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.cassandra.index.sai.cql;

import org.junit.Test;

import org.apache.cassandra.index.sai.StorageAttachedIndex;
import org.apache.cassandra.index.sai.disk.v1.SegmentBuilder;
import org.apache.cassandra.index.sai.disk.v2.V2VectorIndexSearcher;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;

public class VectorSiftSmallMultiSegmentBuildTest extends VectorSiftSmallTester
{
    // exercise the path where we use the PQ from the first segment (constructed on-heap)
    // to construct the others off-heap
    @Test
    public void testMultiSegmentBuild() throws Throwable
    {
        var baseVectors = readFvecs(String.format("test/data/%s/%s_base.fvecs", DATASET, DATASET));
        var queryVectors = readFvecs(String.format("test/data/%s/%s_query.fvecs", DATASET, DATASET));
        var groundTruth = readIvecs(String.format("test/data/%s/%s_groundtruth.ivecs", DATASET, DATASET));

        // Create table without index
        createTable();

        // we're going to compact manually, so disable background compactions to avoid interference
        disableCompaction();

        insertVectors(baseVectors, 0);
        // single big sstable before creating index
        flush();
        compact();

        SegmentBuilder.updateLastValidSegmentRowId(2000); // 2000 rows per segment, enough for PQ to be created
        createIndex();

        // verify that we got the expected number of segments and that PQ is present in all of them
        var sim = getCurrentColumnFamilyStore().getIndexManager();
        var index = (StorageAttachedIndex) sim.listIndexes().iterator().next();
        var searchableIndex = index.getIndexContext().getView().getIndexes().iterator().next();
        var segments = searchableIndex.getSegments();
        assertEquals(5, segments.size());
        // BUG FIX: the loop previously checked segments.get(0) on every iteration, so only the
        // first segment's PQ was ever verified; check each segment, bounded by the actual size.
        for (int i = 0; i < segments.size(); i++)
            assertNotNull(((V2VectorIndexSearcher) segments.get(i).getIndexSearcher()).getPQ());

        var recall = testRecall(100, queryVectors, groundTruth);
        assertTrue("Post-compaction recall is " + recall, recall > 0.975);
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.cassandra.index.sai.cql;

import java.util.Arrays;

import org.junit.Test;

import org.apache.cassandra.cql3.UntypedResultSet;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;

public class VectorSiftSmallRerankKZeroOrderMatchesFullPrecisionSimilarityTest extends VectorSiftSmallTester
{
    /**
     * Runs ANN queries with rerank_k = 0 and verifies that the rows come back in
     * descending order of their full-precision similarity scores.
     *
     * Note: test only fails when scores are sent from replica to coordinator.
     */
    @Test
    public void testRerankKZeroOrderMatchesFullPrecisionSimilarity() throws Throwable
    {
        var base = readFvecs(String.format("test/data/%s/%s_base.fvecs", DATASET, DATASET));
        var queries = readFvecs(String.format("test/data/%s/%s_query.fvecs", DATASET, DATASET));

        // Create table and index
        createTable();
        createIndex();

        // Flush because in memory index uses FP vectors, therefore ignoring rerank_k = 0
        insertVectors(base, 0);
        flush();

        // A subset of query vectors keeps the runtime reasonable; a high limit raises the
        // probability of surfacing any incorrect ordering.
        int queryCount = 10;
        int limit = 100;

        for (int q = 0; q < queryCount; q++)
        {
            String vectorLiteral = Arrays.toString(queries.get(q));

            // Execute query with rerank_k = 0 and get the similarity scores computed by the coordinator
            String cql = String.format("SELECT pk, similarity_euclidean(val, %s) as similarity FROM %%s ORDER BY val ANN OF %s LIMIT %d WITH ann_options = {'rerank_k': 0}",
                                       vectorLiteral, vectorLiteral, limit);
            UntypedResultSet rows = execute(cql);

            // Euclidean similarity is 1.0 / (1.0 + distance²), so a higher score means more
            // similar; each row's score must not exceed the previous row's.
            assertEquals(limit, rows.size());
            float previous = Float.MAX_VALUE;
            for (UntypedResultSet.Row row : rows)
            {
                float current = row.getFloat("similarity");
                String message = String.format("Query %d: Similarity scores should be in descending order (higher score = more similar). Previous: %.10f, Current: %.10f",
                                               q, previous, current);
                assertTrue(message, current <= previous);
                previous = current;
            }
        }
    }
}
Loading
Loading