|
17 | 17 | package org.apache.jackrabbit.oak.plugins.index.elastic.query.inference; |
18 | 18 |
|
19 | 19 | import ch.qos.logback.classic.Level; |
| 20 | +import co.elastic.clients.elasticsearch._types.ElasticsearchException; |
| 21 | +import co.elastic.clients.elasticsearch.core.SearchRequest; |
| 22 | +import co.elastic.clients.elasticsearch.core.SearchResponse; |
20 | 23 | import co.elastic.clients.elasticsearch.indices.get_mapping.IndexMappingRecord; |
21 | 24 | import co.elastic.clients.json.JsonData; |
22 | 25 | import com.fasterxml.jackson.core.JsonProcessingException; |
|
38 | 41 | import org.apache.jackrabbit.oak.commons.PathUtils; |
39 | 42 | import org.apache.jackrabbit.oak.commons.junit.LogCustomizer; |
40 | 43 | import org.apache.jackrabbit.oak.plugins.index.elastic.ElasticAbstractQueryTest; |
| 44 | +import org.apache.jackrabbit.oak.plugins.index.elastic.ElasticIndexDefinition; |
41 | 45 | import org.apache.jackrabbit.oak.plugins.index.search.util.IndexDefinitionBuilder; |
42 | 46 | import org.apache.jackrabbit.oak.spi.commit.CommitInfo; |
43 | 47 | import org.apache.jackrabbit.oak.spi.commit.EmptyHook; |
@@ -358,9 +362,20 @@ private void verifyMetricsLogsPresent(LogCustomizer logCustomizer) { |
358 | 362 |
|
359 | 363 | /** |
360 | 364 | * Adds test content for the hybrid search test. |
| 365 | + * default path for node creation is under /content |
361 | 366 | */ |
362 | 367 | private void addTestContent() throws CommitFailedException { |
363 | | - Tree content = root.getTree("/").addChild("content"); |
| 368 | + addTestContent("/content"); |
| 369 | + } |
| 370 | + |
| 371 | + /** |
| 372 | + * Adds test content on a specified path. |
| 373 | + */ |
| 374 | + private void addTestContent(String path) throws CommitFailedException { |
| 375 | + Tree content = root.getTree("/"); |
| 376 | + for (String element : PathUtils.elements(path)) { |
| 377 | + content = content.addChild(element); |
| 378 | + } |
364 | 379 |
|
365 | 380 | // Health content |
366 | 381 | Tree health = content.addChild("health"); |
@@ -402,7 +417,8 @@ private void setupEmbeddingsForContent(Tree index, String inferenceModelConfigNa |
402 | 417 | List<String> paths = executeQuery("select [jcr:path] from [nt:base] where ISDESCENDANTNODE('/content') and title is not null", SQL2); |
403 | 418 |
|
404 | 419 | for (String path : paths) { |
405 | | - URL json = this.getClass().getResource("/inferenceUsingConfig" + path + ".json"); |
| 420 | + String docName = PathUtils.getName(path); |
| 421 | + URL json = this.getClass().getResource("/inferenceUsingConfig/content/" + docName + ".json"); |
406 | 422 | if (json != null) { |
407 | 423 | Map<String, Collection<Double>> map = MAPPER.readValue(json, Map.class); |
408 | 424 | ObjectNode updateDoc = MAPPER.createObjectNode(); |
@@ -497,7 +513,7 @@ private void verifyQueryResults(Map<String, String> queryResults, String inferen |
497 | 513 | private void verifyErrorHandling(String jcrIndexName, String inferenceConfigInQuery) { |
498 | 514 | // Test server error handling |
499 | 515 | String queryPath3 = "select [jcr:path] from [nt:base] where ISDESCENDANTNODE('/content') and contains(*, '" |
500 | | - + inferenceConfigInQuery + "machine learning')"; |
| 516 | + + inferenceConfigInQuery + "machine learning')"; |
501 | 517 | assertQuery(queryPath3, List.of("/content/ml", "/content/programming")); |
502 | 518 |
|
503 | 519 | // Test timeout handling |
@@ -811,4 +827,166 @@ protected String getMetricName(String baseName) { |
811 | 827 | return uniquePrefix + "_" + baseName; |
812 | 828 | } |
813 | 829 | } |
| 830 | + |
| 831 | + /** |
| 832 | + * Count documents under a specific path |
| 833 | + */ |
| 834 | + private int countDocuments(Tree index, String path) { |
| 835 | + try { |
| 836 | + return (int) getIndexedPaths(index).stream() |
| 837 | + .filter(indexPath -> indexPath.startsWith(path)) |
| 838 | + .count(); |
| 839 | + } catch (IOException e) { |
| 840 | + LOG.error("Error counting documents", e); |
| 841 | + return 0; |
| 842 | + } |
| 843 | + } |
| 844 | + |
| 845 | + /** |
| 846 | + * Get all paths indexed in the given index |
| 847 | + */ |
| 848 | + private List<String> getIndexedPaths(Tree index) throws IOException { |
| 849 | + ElasticIndexDefinition esIdxDef = getElasticIndexDefinition(index); |
| 850 | + |
| 851 | + try { |
| 852 | + // Query to get all documents |
| 853 | + SearchRequest searchRequest = SearchRequest.of(r -> r |
| 854 | + .index(esIdxDef.getIndexAlias()) |
| 855 | + .size(10000) // Set a high limit, adjust if needed |
| 856 | + .query(q -> q.matchAll(m -> m)) |
| 857 | + ); |
| 858 | + |
| 859 | + SearchResponse<JsonData> response = esConnection.getClient().search(searchRequest, JsonData.class); |
| 860 | + |
| 861 | + // Extract paths from all hits |
| 862 | + return response.hits().hits().stream() |
| 863 | + .map(hit -> hit.id()) |
| 864 | + .collect(Collectors.toList()); |
| 865 | + } catch (ElasticsearchException e) { |
| 866 | + throw new IOException("Error getting indexed paths", e); |
| 867 | + } |
| 868 | + } |
| 869 | + |
| 870 | + /** |
| 871 | + * Tests KNN search functionality with a large number of documents. |
| 872 | + * This test verifies that vector search works correctly with filters when the |
| 873 | + * number of documents exceeds the default KNN result limit. |
| 874 | + */ |
| 875 | + @Test |
| 876 | + public void testHugeIngestionForKNNFilters() throws Exception { |
| 877 | + // Setup test parameters |
| 878 | + String jcrIndexName = UUID.randomUUID().toString(); |
| 879 | + String inferenceServiceUrl = "http://localhost:" + wireMock.port() + "/v1/embeddings"; |
| 880 | + String inferenceModelConfigName = "ada-test-model"; |
| 881 | + String inferenceModelName = "text-embedding-ada-002"; |
| 882 | + String inferenceConfigInQuery = "?{}?"; |
| 883 | + |
| 884 | + // Create inference configuration |
| 885 | + createInferenceConfig(jcrIndexName, true, defaultEnricherConfig, inferenceModelConfigName, |
| 886 | + inferenceModelName, inferenceServiceUrl, 0.8, 1L, true, true); |
| 887 | + setupEnricherStatus(defaultEnricherStatusMapping, defaultEnricherStatusData); |
| 888 | + |
| 889 | + // Create index definition with searchable properties |
| 890 | + IndexDefinitionBuilder builder = createIndexDefinition("title", "description", "updatedBy"); |
| 891 | + Tree index = setIndex(jcrIndexName, builder); |
| 892 | + root.commit(); |
| 893 | + |
| 894 | + // Setup mock inference service |
| 895 | + setupMockInferenceService(inferenceModelConfigName, jcrIndexName); |
| 896 | + |
| 897 | + // Create and index test content |
| 898 | + LOG.info("Starting large-scale document ingestion test for KNN search"); |
| 899 | + |
| 900 | + // Add regular test documents to be searched against these will be used when searching with filters |
| 901 | + addTestContent("/content/filterPath"); |
| 902 | + |
| 903 | + // Remove the cars node as we'll be searching for content similar to "cars" query |
| 904 | + // but need to verify we get the next best result (ML content) |
| 905 | + root.getTree("/").getChild("content").getChild("filterPath").getChild("cars").remove(); |
| 906 | + root.commit(); |
| 907 | + |
| 908 | + // Let the index catch up with initial content |
| 909 | + assertEventually(() -> assertEquals(7, countDocuments(index))); |
| 910 | + |
| 911 | + // Enrich the initial test documents with embeddings |
| 912 | + setupEmbeddingsForContent(index, inferenceModelConfigName, inferenceModelName); |
| 913 | + |
| 914 | + // Create a large number of documents with car-related content |
| 915 | + Tree hugeIngestion = root.getTree("/").addChild("content").addChild("hugeIngestion"); |
| 916 | + root.commit(); |
| 917 | + |
| 918 | + // Default KNN result limit is 100, so we create more documents to test post-filtering |
| 919 | + int numberOfDocuments = 200; |
| 920 | + |
| 921 | + // Load the embeddings from cars.json to reuse for all test documents |
| 922 | + URL jsonUrl = this.getClass().getResource("/inferenceUsingConfig/content/cars.json"); |
| 923 | + ObjectMapper mapper = new JsonMapper(); |
| 924 | + Map<String, Collection<Double>> embeddingsMap = mapper.readValue(jsonUrl, Map.class); |
| 925 | + List<Float> embeddings = embeddingsMap.get("embedding").stream() |
| 926 | + .map(d -> ((Double) d).floatValue()) |
| 927 | + .collect(Collectors.toList()); |
| 928 | + |
| 929 | + // Create a large number of car-related documents |
| 930 | + LOG.info("Creating {} car-related documents", numberOfDocuments); |
| 931 | + for (int i = 0; i < numberOfDocuments; i++) { |
| 932 | + String nodeName = "cars_" + i; |
| 933 | + Tree document = hugeIngestion.addChild(nodeName); |
| 934 | + document.setProperty("title", "The Future of Electric Cars " + i); |
| 935 | + document.setProperty("description", |
| 936 | + "Electric vehicles are revolutionizing the automobile industry. Document " + i + |
| 937 | + " explores advancements in battery technology, charging infrastructure, and sustainability."); |
| 938 | + } |
| 939 | + root.commit(); |
| 940 | + |
| 941 | + // Wait for initial indexing to complete |
| 942 | + LOG.info("Waiting for initial indexing to complete"); |
| 943 | + assertEventually(() -> { |
| 944 | + int docCount = countDocuments(index, "/content/hugeIngestion"); |
| 945 | + LOG.info("Current document count: {}", docCount); |
| 946 | + assertTrue("Expected at least " + numberOfDocuments + " documents, found " + docCount, |
| 947 | + docCount >= numberOfDocuments); |
| 948 | + }); |
| 949 | + |
| 950 | + // Add vector embeddings to all documents |
| 951 | + LOG.info("Adding vector embeddings to all documents"); |
| 952 | + for (int i = 0; i < numberOfDocuments; i++) { |
| 953 | + String path = "/content/hugeIngestion/cars_" + i; |
| 954 | + VectorDocument vectorDocument = new VectorDocument( |
| 955 | + UUID.randomUUID().toString(), |
| 956 | + embeddings, |
| 957 | + Map.of("updatedAt", Instant.now().toEpochMilli(), "model", inferenceModelName) |
| 958 | + ); |
| 959 | + |
| 960 | + ObjectNode updateDoc = mapper.createObjectNode(); |
| 961 | + ObjectNode vectorSpacesNode = updateDoc.putObject(InferenceConstants.VECTOR_SPACES); |
| 962 | + ArrayNode inferenceModelConfigNode = vectorSpacesNode.putArray(inferenceModelConfigName); |
| 963 | + inferenceModelConfigNode.addPOJO(vectorDocument); |
| 964 | + |
| 965 | + updateDocument(index, path, updateDoc); |
| 966 | + |
| 967 | + // Log progress periodically |
| 968 | + if (i % 50 == 0) { |
| 969 | + LOG.info("Added embeddings to {} documents", i); |
| 970 | + } |
| 971 | + } |
| 972 | + |
| 973 | + LOG.info("All documents have been ingested with embeddings"); |
| 974 | + |
| 975 | + // Test vector search query that should match car-related content |
| 976 | + // but only return results from the filterPath (not from hugeIngestion) |
| 977 | + String searchQuery = "technological advancements in electric vehicles"; |
| 978 | + String queryPath = "select [jcr:path] from [nt:base] where ISDESCENDANTNODE('/content/filterPath') and contains(*, '" |
| 979 | + + inferenceConfigInQuery + searchQuery + "')"; |
| 980 | + |
| 981 | + // Execute query and verify results |
| 982 | + LOG.info("Executing vector search query with path filter: {}", queryPath); |
| 983 | + assertEventually(() -> { |
| 984 | + List<String> results = executeQuery(queryPath, SQL2, true, true); |
| 985 | + |
| 986 | + // Since we removed the cars node, we should still get ML content as the next best match |
| 987 | + assertFalse("Should have returned at least one result", results.isEmpty()); |
| 988 | + LOG.info("Search returned {} results with machine learning content", results.size()); |
| 989 | + assertEquals("/content/filterPath/ml", results.get(0)); |
| 990 | + }); |
| 991 | + } |
814 | 992 | } |
0 commit comments