Skip to content

Commit d874361

Browse files
OAK-11603: improve fuzzy conversion
1 parent 949f55d commit d874361

File tree

3 files changed

+42
-24
lines changed

3 files changed

+42
-24
lines changed

oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java

Lines changed: 27 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -128,9 +128,10 @@ public class ElasticRequestHandler {
128128
private static final String HIGHLIGHT_PREFIX = "<strong>";
129129
private static final String HIGHLIGHT_SUFFIX = "</strong>";
130130

131-
// Match old-style fuzzy queries (e.g., roam~0.8), but not new ones (e.g., roam~2)
132-
private static final Pattern OLD_FUZZY_PATTERN = Pattern.compile("\\b(\\w+)~([0-9]*\\.?[0-9]+)\\b");
133-
private static final Pattern NEW_FUZZY_PATTERN = Pattern.compile("\\b(\\w+)~([0-2])\\b");
131+
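// Matches any term followed by '~' and a number: Lucene 4.x similarity values (e.g., roam~0.8) but also integer edit distances (e.g., roam~2); the latter are filtered out in convertFuzzyQuery via ELASTIC_FUZZY_PATTERN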
132+
private static final Pattern LUCENE_4_FUZZY_PATTERN = Pattern.compile("\\b(\\w+)~([0-9]*\\.?[0-9]+)\\b");
133+
// Since Lucene 5 (used by Elastic), the fuzzy query syntax takes a single integer edit distance between 0 and 2 (e.g., roam~2)
134+
private static final Pattern ELASTIC_FUZZY_PATTERN = Pattern.compile("\\b(\\w+)~([0-2])\\b");
134135

135136
private final IndexPlan indexPlan;
136137
private final Filter filter;
@@ -918,9 +919,7 @@ private String rewriteQueryText(String text) {
918919
String rewritten = FulltextIndex.rewriteQueryText(text);
919920

920921
// here we handle special cases where the syntax used in the lucene 4.x query parser is not supported by the current version
921-
if (rewritten.contains("~")) {
922-
rewritten = convertFuzzyQuery(rewritten);
923-
}
922+
rewritten = convertFuzzyQuery(rewritten);
924923

925924
return rewritten;
926925
}
@@ -943,15 +942,23 @@ private String rewriteQueryText(String text) {
943942
* @return A query string where old fuzzy syntax is converted to the new format.
944943
*/
945944
private String convertFuzzyQuery(String text) {
946-
Matcher oldMatcher = OLD_FUZZY_PATTERN.matcher(text);
947-
StringBuilder result = new StringBuilder();
945+
if (!text.contains("~")) {
946+
return text;
947+
}
948+
Matcher lucene4FuzzyMatcher = LUCENE_4_FUZZY_PATTERN.matcher(text);
949+
950+
if (!lucene4FuzzyMatcher.find()) {
951+
// No fuzzy term matched: the tilde is not part of a fuzzy query (e.g., it is escaped or not followed by a number), so the text is returned unchanged
952+
return text;
953+
}
948954

949-
while (oldMatcher.find()) {
950-
String term = oldMatcher.group(1);
951-
String fuzzyValue = oldMatcher.group(2);
955+
StringBuilder result = new StringBuilder();
956+
do {
957+
String term = lucene4FuzzyMatcher.group(1);
958+
String fuzzyValue = lucene4FuzzyMatcher.group(2);
952959

953960
// Skip if it's already using the new syntax (integer 0-2)
954-
if (NEW_FUZZY_PATTERN.matcher(term + "~" + fuzzyValue).matches()) {
961+
if (ELASTIC_FUZZY_PATTERN.matcher(term + "~" + fuzzyValue).matches()) {
955962
continue;
956963
}
957964

@@ -965,13 +972,16 @@ private String convertFuzzyQuery(String text) {
965972
editDistance = 1;
966973
}
967974
} catch (NumberFormatException e) {
968-
LOG.warn("Invalid fuzzy value: {}, using default edit distance of 2", fuzzyValue);
975+
LOG.warn("Invalid fuzzy value: {} for query text {}, using default edit distance of 2", fuzzyValue, text);
969976
}
970977

971-
oldMatcher.appendReplacement(result, term + "~" + editDistance);
972-
}
973-
oldMatcher.appendTail(result);
974-
return result.toString();
978+
lucene4FuzzyMatcher.appendReplacement(result, term + "~" + editDistance);
979+
} while (lucene4FuzzyMatcher.find());
980+
981+
lucene4FuzzyMatcher.appendTail(result);
982+
String resultString = result.toString();
983+
LOG.info("Converted fuzzy query from '{}' to '{}'", text, resultString);
984+
return resultString;
975985
}
976986

977987
private Query createQuery(String propertyName, Filter.PropertyRestriction pr, PropertyDefinition defn) {

oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticFullTextIndexTest.java

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -60,11 +60,13 @@ public void fullTextWithFuzzyEditDistance() throws Exception {
6060
test.addChild("b").setProperty("propa", "Simple test");
6161
root.commit();
6262

63-
String query = "//*[jcr:contains(@propa, 'wordl~1')]"; // misspelled world
63+
String misspelledWorld = "//*[jcr:contains(@propa, 'wordl~0.5')]";
64+
String mixedFuzzyFormats = "//*[jcr:contains(@propa, 'wordl~0.5 OR sample~1')]";
6465

6566
assertEventually(() -> {
66-
assertThat(explain(query, XPATH), containsString(indexOptions.getIndexType() + ":" + index.getName()));
67-
assertQuery(query, XPATH, List.of("/test/a"));
67+
assertThat(explain(misspelledWorld, XPATH), containsString(indexOptions.getIndexType() + ":" + index.getName()));
68+
assertQuery(misspelledWorld, XPATH, List.of("/test/a"));
69+
assertQuery(mixedFuzzyFormats, XPATH, List.of("/test/a", "/test/b"));
6870
});
6971
}
7072

oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/FullTextIndexCommonTest.java

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -96,14 +96,20 @@ public void fullTextWithFuzziness() throws Exception {
9696
Tree test = root.getTree("/").addChild("test");
9797

9898
test.addChild("a").setProperty("propa", "Hello World!");
99-
test.addChild("b").setProperty("propa", "Simple test");
99+
test.addChild("b").setProperty("propa", "hello~folks!");
100+
test.addChild("c").setProperty("propa", "Hello everyone!");
100101
root.commit();
101102

102-
String query = "//*[jcr:contains(@propa, 'wordl~0.5')]"; // misspelled world
103+
String misspelledWorld = "//*[jcr:contains(@propa, 'wordl~0.5')]";
104+
String multipleMisspelledWorlds = "//*[jcr:contains(@propa, 'wordl~0.5 OR everone~0.5')]";
105+
String withTilde = "//*[jcr:contains(@propa, 'hello\\~folks')]";
103106

104107
assertEventually(() -> {
105-
assertThat(explain(query, XPATH), containsString(indexOptions.getIndexType() + ":" + index.getName()));
106-
assertQuery(query, XPATH, List.of("/test/a"));
108+
assertThat(explain(misspelledWorld, XPATH), containsString(indexOptions.getIndexType() + ":" + index.getName()));
109+
110+
assertQuery(misspelledWorld, XPATH, List.of("/test/a"));
111+
assertQuery(multipleMisspelledWorlds, XPATH, List.of("/test/a", "/test/c"));
112+
assertQuery(withTilde, XPATH, List.of("/test/b"));
107113
});
108114
}
109115

0 commit comments

Comments
 (0)