Skip to content

Commit 344c636

Browse files
authored
OAK-11743: Filters only work inside knn query, so filters should be moved to knn query also (#2313)
1 parent ee6c57e commit 344c636

File tree

2 files changed

+188
-4
lines changed

2 files changed

+188
-4
lines changed

oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -694,7 +694,13 @@ private ObjectBuilder<BoolQuery> inferenceConfigQuery(BoolQuery.Builder b, Strin
694694
knnQueryBuilder.field(InferenceConstants.VECTOR_SPACES + "." + inferenceModelConfigName + "." + InferenceConstants.VECTOR);
695695
knnQueryBuilder.numCandidates(inferenceModelConfig.getNumCandidates());
696696
knnQueryBuilder.queryVector(embeddings);
697-
697+
// filters in knn are only applicable if filters are defined in knn query itself.
698+
// the filters outside knn query are applicable as post filters which can lead to missing results.
699+
if (planResult.evaluateNonFullTextConstraints()) {
700+
for (Query constraint : nonFullTextConstraints(indexPlan, planResult)) {
701+
knnQueryBuilder.filter(constraint);
702+
}
703+
}
698704
KnnQuery knnQuery = knnQueryBuilder.build();
699705

700706
NestedQuery.Builder nestedQueryBuilder = new NestedQuery.Builder()

oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/inference/ElasticInferenceUsingConfigTest.java

Lines changed: 181 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,9 @@
1717
package org.apache.jackrabbit.oak.plugins.index.elastic.query.inference;
1818

1919
import ch.qos.logback.classic.Level;
20+
import co.elastic.clients.elasticsearch._types.ElasticsearchException;
21+
import co.elastic.clients.elasticsearch.core.SearchRequest;
22+
import co.elastic.clients.elasticsearch.core.SearchResponse;
2023
import co.elastic.clients.elasticsearch.indices.get_mapping.IndexMappingRecord;
2124
import co.elastic.clients.json.JsonData;
2225
import com.fasterxml.jackson.core.JsonProcessingException;
@@ -38,6 +41,7 @@
3841
import org.apache.jackrabbit.oak.commons.PathUtils;
3942
import org.apache.jackrabbit.oak.commons.junit.LogCustomizer;
4043
import org.apache.jackrabbit.oak.plugins.index.elastic.ElasticAbstractQueryTest;
44+
import org.apache.jackrabbit.oak.plugins.index.elastic.ElasticIndexDefinition;
4145
import org.apache.jackrabbit.oak.plugins.index.search.util.IndexDefinitionBuilder;
4246
import org.apache.jackrabbit.oak.spi.commit.CommitInfo;
4347
import org.apache.jackrabbit.oak.spi.commit.EmptyHook;
@@ -358,9 +362,20 @@ private void verifyMetricsLogsPresent(LogCustomizer logCustomizer) {
358362

359363
/**
360364
* Adds test content for the hybrid search test.
365+
* default path for node creation is under /content
361366
*/
362367
private void addTestContent() throws CommitFailedException {
363-
Tree content = root.getTree("/").addChild("content");
368+
addTestContent("/content");
369+
}
370+
371+
/**
372+
* Adds test content on a specified path.
373+
*/
374+
private void addTestContent(String path) throws CommitFailedException {
375+
Tree content = root.getTree("/");
376+
for (String element : PathUtils.elements(path)) {
377+
content = content.addChild(element);
378+
}
364379

365380
// Health content
366381
Tree health = content.addChild("health");
@@ -402,7 +417,8 @@ private void setupEmbeddingsForContent(Tree index, String inferenceModelConfigNa
402417
List<String> paths = executeQuery("select [jcr:path] from [nt:base] where ISDESCENDANTNODE('/content') and title is not null", SQL2);
403418

404419
for (String path : paths) {
405-
URL json = this.getClass().getResource("/inferenceUsingConfig" + path + ".json");
420+
String docName = PathUtils.getName(path);
421+
URL json = this.getClass().getResource("/inferenceUsingConfig/content/" + docName + ".json");
406422
if (json != null) {
407423
Map<String, Collection<Double>> map = MAPPER.readValue(json, Map.class);
408424
ObjectNode updateDoc = MAPPER.createObjectNode();
@@ -497,7 +513,7 @@ private void verifyQueryResults(Map<String, String> queryResults, String inferen
497513
private void verifyErrorHandling(String jcrIndexName, String inferenceConfigInQuery) {
498514
// Test server error handling
499515
String queryPath3 = "select [jcr:path] from [nt:base] where ISDESCENDANTNODE('/content') and contains(*, '"
500-
+ inferenceConfigInQuery + "machine learning')";
516+
+ inferenceConfigInQuery + "machine learning')";
501517
assertQuery(queryPath3, List.of("/content/ml", "/content/programming"));
502518

503519
// Test timeout handling
@@ -811,4 +827,166 @@ protected String getMetricName(String baseName) {
811827
return uniquePrefix + "_" + baseName;
812828
}
813829
}
830+
831+
/**
832+
* Count documents under a specific path
833+
*/
834+
private int countDocuments(Tree index, String path) {
835+
try {
836+
return (int) getIndexedPaths(index).stream()
837+
.filter(indexPath -> indexPath.startsWith(path))
838+
.count();
839+
} catch (IOException e) {
840+
LOG.error("Error counting documents", e);
841+
return 0;
842+
}
843+
}
844+
845+
/**
846+
* Get all paths indexed in the given index
847+
*/
848+
private List<String> getIndexedPaths(Tree index) throws IOException {
849+
ElasticIndexDefinition esIdxDef = getElasticIndexDefinition(index);
850+
851+
try {
852+
// Query to get all documents
853+
SearchRequest searchRequest = SearchRequest.of(r -> r
854+
.index(esIdxDef.getIndexAlias())
855+
.size(10000) // Set a high limit, adjust if needed
856+
.query(q -> q.matchAll(m -> m))
857+
);
858+
859+
SearchResponse<JsonData> response = esConnection.getClient().search(searchRequest, JsonData.class);
860+
861+
// Extract paths from all hits
862+
return response.hits().hits().stream()
863+
.map(hit -> hit.id())
864+
.collect(Collectors.toList());
865+
} catch (ElasticsearchException e) {
866+
throw new IOException("Error getting indexed paths", e);
867+
}
868+
}
869+
870+
/**
871+
* Tests KNN search functionality with a large number of documents.
872+
* This test verifies that vector search works correctly with filters when the
873+
* number of documents exceeds the default KNN result limit.
874+
*/
875+
@Test
876+
public void testHugeIngestionForKNNFilters() throws Exception {
877+
// Setup test parameters
878+
String jcrIndexName = UUID.randomUUID().toString();
879+
String inferenceServiceUrl = "http://localhost:" + wireMock.port() + "/v1/embeddings";
880+
String inferenceModelConfigName = "ada-test-model";
881+
String inferenceModelName = "text-embedding-ada-002";
882+
String inferenceConfigInQuery = "?{}?";
883+
884+
// Create inference configuration
885+
createInferenceConfig(jcrIndexName, true, defaultEnricherConfig, inferenceModelConfigName,
886+
inferenceModelName, inferenceServiceUrl, 0.8, 1L, true, true);
887+
setupEnricherStatus(defaultEnricherStatusMapping, defaultEnricherStatusData);
888+
889+
// Create index definition with searchable properties
890+
IndexDefinitionBuilder builder = createIndexDefinition("title", "description", "updatedBy");
891+
Tree index = setIndex(jcrIndexName, builder);
892+
root.commit();
893+
894+
// Setup mock inference service
895+
setupMockInferenceService(inferenceModelConfigName, jcrIndexName);
896+
897+
// Create and index test content
898+
LOG.info("Starting large-scale document ingestion test for KNN search");
899+
900+
// Add regular test documents to be searched against these will be used when searching with filters
901+
addTestContent("/content/filterPath");
902+
903+
// Remove the cars node as we'll be searching for content similar to "cars" query
904+
// but need to verify we get the next best result (ML content)
905+
root.getTree("/").getChild("content").getChild("filterPath").getChild("cars").remove();
906+
root.commit();
907+
908+
// Let the index catch up with initial content
909+
assertEventually(() -> assertEquals(7, countDocuments(index)));
910+
911+
// Enrich the initial test documents with embeddings
912+
setupEmbeddingsForContent(index, inferenceModelConfigName, inferenceModelName);
913+
914+
// Create a large number of documents with car-related content
915+
Tree hugeIngestion = root.getTree("/").addChild("content").addChild("hugeIngestion");
916+
root.commit();
917+
918+
// Default KNN result limit is 100, so we create more documents to test post-filtering
919+
int numberOfDocuments = 200;
920+
921+
// Load the embeddings from cars.json to reuse for all test documents
922+
URL jsonUrl = this.getClass().getResource("/inferenceUsingConfig/content/cars.json");
923+
ObjectMapper mapper = new JsonMapper();
924+
Map<String, Collection<Double>> embeddingsMap = mapper.readValue(jsonUrl, Map.class);
925+
List<Float> embeddings = embeddingsMap.get("embedding").stream()
926+
.map(d -> ((Double) d).floatValue())
927+
.collect(Collectors.toList());
928+
929+
// Create a large number of car-related documents
930+
LOG.info("Creating {} car-related documents", numberOfDocuments);
931+
for (int i = 0; i < numberOfDocuments; i++) {
932+
String nodeName = "cars_" + i;
933+
Tree document = hugeIngestion.addChild(nodeName);
934+
document.setProperty("title", "The Future of Electric Cars " + i);
935+
document.setProperty("description",
936+
"Electric vehicles are revolutionizing the automobile industry. Document " + i +
937+
" explores advancements in battery technology, charging infrastructure, and sustainability.");
938+
}
939+
root.commit();
940+
941+
// Wait for initial indexing to complete
942+
LOG.info("Waiting for initial indexing to complete");
943+
assertEventually(() -> {
944+
int docCount = countDocuments(index, "/content/hugeIngestion");
945+
LOG.info("Current document count: {}", docCount);
946+
assertTrue("Expected at least " + numberOfDocuments + " documents, found " + docCount,
947+
docCount >= numberOfDocuments);
948+
});
949+
950+
// Add vector embeddings to all documents
951+
LOG.info("Adding vector embeddings to all documents");
952+
for (int i = 0; i < numberOfDocuments; i++) {
953+
String path = "/content/hugeIngestion/cars_" + i;
954+
VectorDocument vectorDocument = new VectorDocument(
955+
UUID.randomUUID().toString(),
956+
embeddings,
957+
Map.of("updatedAt", Instant.now().toEpochMilli(), "model", inferenceModelName)
958+
);
959+
960+
ObjectNode updateDoc = mapper.createObjectNode();
961+
ObjectNode vectorSpacesNode = updateDoc.putObject(InferenceConstants.VECTOR_SPACES);
962+
ArrayNode inferenceModelConfigNode = vectorSpacesNode.putArray(inferenceModelConfigName);
963+
inferenceModelConfigNode.addPOJO(vectorDocument);
964+
965+
updateDocument(index, path, updateDoc);
966+
967+
// Log progress periodically
968+
if (i % 50 == 0) {
969+
LOG.info("Added embeddings to {} documents", i);
970+
}
971+
}
972+
973+
LOG.info("All documents have been ingested with embeddings");
974+
975+
// Test vector search query that should match car-related content
976+
// but only return results from the filterPath (not from hugeIngestion)
977+
String searchQuery = "technological advancements in electric vehicles";
978+
String queryPath = "select [jcr:path] from [nt:base] where ISDESCENDANTNODE('/content/filterPath') and contains(*, '"
979+
+ inferenceConfigInQuery + searchQuery + "')";
980+
981+
// Execute query and verify results
982+
LOG.info("Executing vector search query with path filter: {}", queryPath);
983+
assertEventually(() -> {
984+
List<String> results = executeQuery(queryPath, SQL2, true, true);
985+
986+
// Since we removed the cars node, we should still get ML content as the next best match
987+
assertFalse("Should have returned at least one result", results.isEmpty());
988+
LOG.info("Search returned {} results with machine learning content", results.size());
989+
assertEquals("/content/filterPath/ml", results.get(0));
990+
});
991+
}
814992
}

0 commit comments

Comments
 (0)