Skip to content

Commit 53c1964

Browse files
OAK-11603: lucene 4.x fuzzy queries don't work in Elastic (#2180)
* OAK-11603: lucene 4.x fuzzy queries don't work in Elastic * OAK-11603: improve fuzzy conversion
1 parent 4c8b070 commit 53c1964

File tree

3 files changed

+139
-5
lines changed

3 files changed

+139
-5
lines changed

oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java

Lines changed: 78 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,8 @@
104104
import java.util.function.BiPredicate;
105105
import java.util.function.Consumer;
106106
import java.util.function.Function;
107+
import java.util.regex.Matcher;
108+
import java.util.regex.Pattern;
107109
import java.util.stream.Collectors;
108110
import java.util.stream.Stream;
109111
import java.util.stream.StreamSupport;
@@ -126,6 +128,11 @@ public class ElasticRequestHandler {
126128
private static final String HIGHLIGHT_PREFIX = "<strong>";
127129
private static final String HIGHLIGHT_SUFFIX = "</strong>";
128130

131+
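// Matches any term followed by a numeric fuzzy suffix: Lucene 4.x similarities (e.g., roam~0.8) as well as plain
// integers (e.g., roam~2). Integer edit distances must be filtered out separately (see ELASTIC_FUZZY_PATTERN)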
132+
private static final Pattern LUCENE_4_FUZZY_PATTERN = Pattern.compile("\\b(\\w+)~([0-9]*\\.?[0-9]+)\\b");
133+
// From Lucene 5 and above (used by elastic), the fuzzy query syntax has changed to use a single integer
134+
private static final Pattern ELASTIC_FUZZY_PATTERN = Pattern.compile("\\b(\\w+)~([0-2])\\b");
135+
129136
private final IndexPlan indexPlan;
130137
private final Filter filter;
131138
private final PlanResult planResult;
@@ -889,10 +896,10 @@ private static Query referenceConstraint(String uuid) {
889896
return Query.of(q -> q.multiMatch(m -> m.fields(uuid)));
890897
}
891898

892-
private static QueryStringQuery.Builder fullTextQuery(String text, String fieldName, PlanResult pr, boolean includeDynamicBoostedValues) {
899+
private QueryStringQuery.Builder fullTextQuery(String text, String fieldName, PlanResult pr, boolean includeDynamicBoostedValues) {
893900
LOG.debug("fullTextQuery for text: '{}', fieldName: '{}'", text, fieldName);
894901
QueryStringQuery.Builder qsqBuilder = new QueryStringQuery.Builder()
895-
.query(FulltextIndex.rewriteQueryText(text))
902+
.query(rewriteQueryText(text))
896903
.defaultOperator(Operator.And)
897904
.type(TextQueryType.CrossFields)
898905
.tieBreaker(0.5d);
@@ -908,6 +915,75 @@ private static QueryStringQuery.Builder fullTextQuery(String text, String fieldN
908915
return qsqBuilder.fields(fieldName);
909916
}
910917

918+
private String rewriteQueryText(String text) {
919+
String rewritten = FulltextIndex.rewriteQueryText(text);
920+
921+
// here we handle special cases where the syntax used in the lucene 4.x query parser is not supported by the current version
922+
rewritten = convertFuzzyQuery(rewritten);
923+
924+
return rewritten;
925+
}
926+
927+
/**
928+
* Converts Lucene fuzzy queries from the old syntax (float similarity) to the new syntax (edit distance).
929+
* <p>
930+
* In Lucene 4, fuzzy queries were specified using a floating-point similarity (e.g., "term~0.8"), where values
931+
* closer to 1 required a higher similarity match. In later Lucene versions, this was replaced with a discrete
932+
* edit distance (0, 1, or 2).
933+
* <p>
934+
* This method:
935+
* <ul>
936+
* <li>Detects and converts old fuzzy queries (e.g., "roam~0.7" → "roam~1").</li>
937+
* <li>Preserves new fuzzy queries (e.g., "test~2" remains unchanged).</li>
938+
* <li>Avoids modifying proximity queries (e.g., "\"quick fox\"~5" remains unchanged).</li>
939+
* </ul>
940+
*
941+
* @param text The input query string containing fuzzy or proximity queries.
942+
* @return A query string where old fuzzy syntax is converted to the new format.
943+
*/
944+
private String convertFuzzyQuery(String text) {
945+
if (!text.contains("~")) {
946+
return text;
947+
}
948+
Matcher lucene4FuzzyMatcher = LUCENE_4_FUZZY_PATTERN.matcher(text);
949+
950+
if (!lucene4FuzzyMatcher.find()) {
951+
// this can only happen if the pattern is not found, which means we are dealing with a tilde not related to a fuzzy query
952+
return text;
953+
}
954+
955+
StringBuilder result = new StringBuilder();
956+
do {
957+
String term = lucene4FuzzyMatcher.group(1);
958+
String fuzzyValue = lucene4FuzzyMatcher.group(2);
959+
960+
// Skip if it's already using the new syntax (integer 0-2)
961+
if (ELASTIC_FUZZY_PATTERN.matcher(term + "~" + fuzzyValue).matches()) {
962+
continue;
963+
}
964+
965+
// Convert floating-point similarity to integer edit distance
966+
int editDistance = 2; // Default to the most lenient setting
967+
try {
968+
float similarity = Float.parseFloat(fuzzyValue);
969+
if (similarity >= 0.8f) {
970+
editDistance = 0;
971+
} else if (similarity >= 0.5f) {
972+
editDistance = 1;
973+
}
974+
} catch (NumberFormatException e) {
975+
LOG.warn("Invalid fuzzy value: {} for query text {}, using default edit distance of 2", fuzzyValue, text);
976+
}
977+
978+
lucene4FuzzyMatcher.appendReplacement(result, term + "~" + editDistance);
979+
} while (lucene4FuzzyMatcher.find());
980+
981+
lucene4FuzzyMatcher.appendTail(result);
982+
String resultString = result.toString();
983+
LOG.info("Converted fuzzy query from '{}' to '{}'", text, resultString);
984+
return resultString;
985+
}
986+
911987
private Query createQuery(String propertyName, Filter.PropertyRestriction pr, PropertyDefinition defn) {
912988
final String field = elasticIndexDefinition.getElasticKeyword(propertyName);
913989

oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticFullTextIndexTest.java

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,15 @@
1717
package org.apache.jackrabbit.oak.plugins.index.elastic;
1818

1919
import org.apache.jackrabbit.oak.api.ContentRepository;
20+
import org.apache.jackrabbit.oak.api.Tree;
2021
import org.apache.jackrabbit.oak.plugins.index.FullTextIndexCommonTest;
2122
import org.junit.ClassRule;
23+
import org.junit.Test;
24+
25+
import java.util.List;
26+
27+
import static org.hamcrest.CoreMatchers.containsString;
28+
import static org.hamcrest.MatcherAssert.assertThat;
2229

2330
public class ElasticFullTextIndexTest extends FullTextIndexCommonTest {
2431

@@ -40,4 +47,27 @@ protected void createTestIndexNode() {
4047
setTraversalEnabled(false);
4148
}
4249

50+
@Test
51+
public void fullTextWithFuzzyEditDistance() throws Exception {
52+
Tree index = setup(builder -> builder.indexRule("nt:base").property("propa").analyzed(), idx -> {
53+
},
54+
"propa");
55+
56+
//add content
57+
Tree test = root.getTree("/").addChild("test");
58+
59+
test.addChild("a").setProperty("propa", "Hello World!");
60+
test.addChild("b").setProperty("propa", "Simple test");
61+
root.commit();
62+
63+
String misspelledWorld = "//*[jcr:contains(@propa, 'wordl~0.5')]";
64+
String mixedFuzzyFormats = "//*[jcr:contains(@propa, 'wordl~0.5 OR sample~1')]";
65+
66+
assertEventually(() -> {
67+
assertThat(explain(misspelledWorld, XPATH), containsString(indexOptions.getIndexType() + ":" + index.getName()));
68+
assertQuery(misspelledWorld, XPATH, List.of("/test/a"));
69+
assertQuery(mixedFuzzyFormats, XPATH, List.of("/test/a", "/test/b"));
70+
});
71+
}
72+
4373
}

oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/FullTextIndexCommonTest.java

Lines changed: 31 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,8 @@ public void fullTextWithInvalidSyntax() throws Exception {
7575
test.addChild("a").setProperty("propa", "Hello everyone. This is a fulltext test");
7676
root.commit();
7777

78-
// fuzziness support the following syntax: <term>~[edit_distance] (eg: hello~2). The query below is invalid
78+
// fuzziness supports the following syntax: <term>~[similarity value] in Lucene 4.x (eg: hello~0.8) or <term>~[edit_distance] in later versions (eg: hello~2). The query below is invalid
79+
// https://lucene.apache.org/core/2_9_4/queryparsersyntax.html#Fuzzy%20Searches
7980
// https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html#query-string-fuzziness
8081
String query = "//*[jcr:contains(@propa, 'hello e~one')]";
8182

@@ -85,6 +86,33 @@ public void fullTextWithInvalidSyntax() throws Exception {
8586
});
8687
}
8788

89+
@Test
90+
public void fullTextWithFuzziness() throws Exception {
91+
Tree index = setup(builder -> builder.indexRule("nt:base").property("propa").analyzed(), idx -> {
92+
},
93+
"propa");
94+
95+
//add content
96+
Tree test = root.getTree("/").addChild("test");
97+
98+
test.addChild("a").setProperty("propa", "Hello World!");
99+
test.addChild("b").setProperty("propa", "hello~folks!");
100+
test.addChild("c").setProperty("propa", "Hello everyone!");
101+
root.commit();
102+
103+
String misspelledWorld = "//*[jcr:contains(@propa, 'wordl~0.5')]";
104+
String multipleMisspelledWorlds = "//*[jcr:contains(@propa, 'wordl~0.5 OR everone~0.5')]";
105+
String withTilde = "//*[jcr:contains(@propa, 'hello\\~folks')]";
106+
107+
assertEventually(() -> {
108+
assertThat(explain(misspelledWorld, XPATH), containsString(indexOptions.getIndexType() + ":" + index.getName()));
109+
110+
assertQuery(misspelledWorld, XPATH, List.of("/test/a"));
111+
assertQuery(multipleMisspelledWorlds, XPATH, List.of("/test/a", "/test/c"));
112+
assertQuery(withTilde, XPATH, List.of("/test/b"));
113+
});
114+
}
115+
88116
@Test
89117
public void fullTextQueryRegExp() throws Exception {
90118
Tree index = setup(builder -> builder.indexRule("nt:base").property("propa").analyzed(), idx -> {
@@ -318,7 +346,7 @@ protected Tree setup(List<String> analyzedFields, Consumer<Tree> indexHook) thro
318346
);
319347
}
320348

321-
private Tree setup(Consumer<IndexDefinitionBuilder> builderHook, Consumer<Tree> indexHook, String... propNames) throws Exception {
349+
protected Tree setup(Consumer<IndexDefinitionBuilder> builderHook, Consumer<Tree> indexHook, String... propNames) throws Exception {
322350
IndexDefinitionBuilder builder = indexOptions.createIndex(
323351
indexOptions.createIndexDefinitionBuilder(), false, propNames);
324352
builder.noAsync();
@@ -332,7 +360,7 @@ private Tree setup(Consumer<IndexDefinitionBuilder> builderHook, Consumer<Tree>
332360
return index;
333361
}
334362

335-
private String explain(String query, String lang) {
363+
protected String explain(String query, String lang) {
336364
String explain = "explain " + query;
337365
return executeQuery(explain, lang).get(0);
338366
}

0 commit comments

Comments
 (0)