Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,8 @@
import java.util.function.BiPredicate;
import java.util.function.Consumer;
import java.util.function.Function;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
Expand All @@ -126,6 +128,11 @@ public class ElasticRequestHandler {
private static final String HIGHLIGHT_PREFIX = "<strong>";
private static final String HIGHLIGHT_SUFFIX = "</strong>";

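// Matches <term>~<number> fuzzy suffixes: Lucene 4.x float similarities (e.g., roam~0.8) as well as plain integers (e.g., roam~2); values already in the new syntax are told apart with ELASTIC_FUZZY_PATTERN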
private static final Pattern LUCENE_4_FUZZY_PATTERN = Pattern.compile("\\b(\\w+)~([0-9]*\\.?[0-9]+)\\b");
// From Lucene 5 and above (used by elastic), the fuzzy query syntax has changed to use a single integer
private static final Pattern ELASTIC_FUZZY_PATTERN = Pattern.compile("\\b(\\w+)~([0-2])\\b");

private final IndexPlan indexPlan;
private final Filter filter;
private final PlanResult planResult;
Expand Down Expand Up @@ -889,10 +896,10 @@ private static Query referenceConstraint(String uuid) {
return Query.of(q -> q.multiMatch(m -> m.fields(uuid)));
}

private static QueryStringQuery.Builder fullTextQuery(String text, String fieldName, PlanResult pr, boolean includeDynamicBoostedValues) {
private QueryStringQuery.Builder fullTextQuery(String text, String fieldName, PlanResult pr, boolean includeDynamicBoostedValues) {
LOG.debug("fullTextQuery for text: '{}', fieldName: '{}'", text, fieldName);
QueryStringQuery.Builder qsqBuilder = new QueryStringQuery.Builder()
.query(FulltextIndex.rewriteQueryText(text))
.query(rewriteQueryText(text))
.defaultOperator(Operator.And)
.type(TextQueryType.CrossFields)
.tieBreaker(0.5d);
Expand All @@ -908,6 +915,75 @@ private static QueryStringQuery.Builder fullTextQuery(String text, String fieldN
return qsqBuilder.fields(fieldName);
}

/**
 * Rewrites the full-text query text before it is handed to the query string builder.
 * <p>
 * The text first goes through {@code FulltextIndex.rewriteQueryText}; the outcome is then passed to
 * {@code convertFuzzyQuery} to deal with Lucene 4.x query parser syntax that the current version no
 * longer supports.
 *
 * @param text the original full-text query text
 * @return the rewritten query text
 */
private String rewriteQueryText(String text) {
    return convertFuzzyQuery(FulltextIndex.rewriteQueryText(text));
}

/**
 * Converts Lucene fuzzy queries from the old syntax (float similarity) to the new syntax (edit distance).
 * <p>
 * In Lucene 4, fuzzy queries were specified using a floating-point similarity (e.g., "term~0.8"), where values
 * closer to 1 required a higher similarity match. In later Lucene versions, this was replaced with a discrete
 * edit distance (0, 1, or 2).
 * <p>
 * This method:
 * <ul>
 * <li>Detects and converts old fuzzy queries (e.g., "roam~0.7" → "roam~1").</li>
 * <li>Preserves new fuzzy queries (e.g., "test~2" remains unchanged).</li>
 * <li>Caps whole-number values above the supported maximum (e.g., "roam~5" → "roam~2") instead of
 * misreading them as a very high similarity.</li>
 * <li>Avoids modifying proximity queries (e.g., "\"quick fox\"~5" remains unchanged).</li>
 * </ul>
 * If nothing needs to be converted, the input is returned as is and nothing is logged.
 *
 * @param text The input query string containing fuzzy or proximity queries.
 * @return A query string where old fuzzy syntax is converted to the new format.
 */
private String convertFuzzyQuery(String text) {
    if (!text.contains("~")) {
        return text;
    }

    Matcher lucene4FuzzyMatcher = LUCENE_4_FUZZY_PATTERN.matcher(text);
    StringBuilder result = new StringBuilder();
    boolean converted = false;

    while (lucene4FuzzyMatcher.find()) {
        String term = lucene4FuzzyMatcher.group(1);
        String fuzzyValue = lucene4FuzzyMatcher.group(2);

        // Skip if it's already using the new syntax (integer 0-2). Not calling appendReplacement here is safe:
        // the skipped text is copied verbatim by the next appendReplacement or by appendTail.
        if (ELASTIC_FUZZY_PATTERN.matcher(term + "~" + fuzzyValue).matches()) {
            continue;
        }

        String replacement = term + "~" + lucene4FuzzyValueToEditDistance(fuzzyValue);
        // quoteReplacement keeps the replacement literal even if the term pattern is ever widened
        lucene4FuzzyMatcher.appendReplacement(result, Matcher.quoteReplacement(replacement));
        converted = true;
    }

    if (!converted) {
        // either the tilde is not related to a fuzzy query, or every fuzzy term already uses the new syntax
        return text;
    }

    lucene4FuzzyMatcher.appendTail(result);
    String resultString = result.toString();
    LOG.info("Converted fuzzy query from '{}' to '{}'", text, resultString);
    return resultString;
}

/**
 * Maps the numeric suffix of an old-style fuzzy term to an edit distance between 0 and 2.
 * <p>
 * Values written with a decimal point are treated as similarities: {@code >= 0.8} maps to 0,
 * {@code >= 0.5} maps to 1, anything lower maps to 2. Values written as whole numbers are treated as
 * edit distances and capped at 2.
 *
 * @param fuzzyValue the digits captured after the tilde; always parseable as a float because the
 *                   matching pattern only captures digits with an optional decimal point
 * @return the edit distance to use in the rewritten query
 */
private static int lucene4FuzzyValueToEditDistance(String fuzzyValue) {
    float value = Float.parseFloat(fuzzyValue);

    if (fuzzyValue.indexOf('.') < 0) {
        // the float-to-int cast saturates for very long digit sequences, so the cap still applies
        return Math.min((int) value, 2);
    }

    if (value >= 0.8f) {
        return 0;
    }
    if (value >= 0.5f) {
        return 1;
    }
    return 2; // the most lenient setting
}

private Query createQuery(String propertyName, Filter.PropertyRestriction pr, PropertyDefinition defn) {
final String field = elasticIndexDefinition.getElasticKeyword(propertyName);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,15 @@
package org.apache.jackrabbit.oak.plugins.index.elastic;

import org.apache.jackrabbit.oak.api.ContentRepository;
import org.apache.jackrabbit.oak.api.Tree;
import org.apache.jackrabbit.oak.plugins.index.FullTextIndexCommonTest;
import org.junit.ClassRule;
import org.junit.Test;

import java.util.List;

import static org.hamcrest.CoreMatchers.containsString;
import static org.hamcrest.MatcherAssert.assertThat;

public class ElasticFullTextIndexTest extends FullTextIndexCommonTest {

Expand All @@ -40,4 +47,27 @@ protected void createTestIndexNode() {
setTraversalEnabled(false);
}

/**
 * Checks that a fuzzy term written with a float similarity, alone or combined with a term that uses an
 * integer edit distance, is answered by the index under test and returns the expected paths.
 */
@Test
public void fullTextWithFuzzyEditDistance() throws Exception {
    Tree index = setup(
            builder -> builder.indexRule("nt:base").property("propa").analyzed(),
            idx -> {
            },
            "propa");

    // content: one node per phrase
    Tree testRoot = root.getTree("/").addChild("test");
    testRoot.addChild("a").setProperty("propa", "Hello World!");
    testRoot.addChild("b").setProperty("propa", "Simple test");
    root.commit();

    String floatSimilarityQuery = "//*[jcr:contains(@propa, 'wordl~0.5')]";
    String mixedSyntaxQuery = "//*[jcr:contains(@propa, 'wordl~0.5 OR sample~1')]";

    assertEventually(() -> {
        // the plan must mention "<index type>:<index name>"
        String expectedPlanFragment = indexOptions.getIndexType() + ":" + index.getName();
        assertThat(explain(floatSimilarityQuery, XPATH), containsString(expectedPlanFragment));

        assertQuery(floatSimilarityQuery, XPATH, List.of("/test/a"));
        assertQuery(mixedSyntaxQuery, XPATH, List.of("/test/a", "/test/b"));
    });
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,8 @@ public void fullTextWithInvalidSyntax() throws Exception {
test.addChild("a").setProperty("propa", "Hello everyone. This is a fulltext test");
root.commit();

// fuzziness support the following syntax: <term>~[edit_distance] (eg: hello~2). The query below is invalid
// fuzziness supports the following syntax: <term>~[edit_distance or similarity value] (eg: hello~2 or hello~0.8). The query below is invalid
// https://lucene.apache.org/core/2_9_4/queryparsersyntax.html#Fuzzy%20Searches
// https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html#query-string-fuzziness
String query = "//*[jcr:contains(@propa, 'hello e~one')]";

Expand All @@ -85,6 +86,33 @@ public void fullTextWithInvalidSyntax() throws Exception {
});
}

/**
 * Checks fuzzy full-text queries written with a float similarity (single and OR-combined terms) and a
 * query where the tilde is escaped so that it is part of the searched text.
 */
@Test
public void fullTextWithFuzziness() throws Exception {
    Tree index = setup(
            builder -> builder.indexRule("nt:base").property("propa").analyzed(),
            idx -> {
            },
            "propa");

    // content: node "b" holds a literal tilde
    Tree testRoot = root.getTree("/").addChild("test");
    testRoot.addChild("a").setProperty("propa", "Hello World!");
    testRoot.addChild("b").setProperty("propa", "hello~folks!");
    testRoot.addChild("c").setProperty("propa", "Hello everyone!");
    root.commit();

    String singleFuzzyTerm = "//*[jcr:contains(@propa, 'wordl~0.5')]";
    String twoFuzzyTerms = "//*[jcr:contains(@propa, 'wordl~0.5 OR everone~0.5')]";
    String escapedTilde = "//*[jcr:contains(@propa, 'hello\\~folks')]";

    assertEventually(() -> {
        // the plan must mention "<index type>:<index name>"
        String expectedPlanFragment = indexOptions.getIndexType() + ":" + index.getName();
        assertThat(explain(singleFuzzyTerm, XPATH), containsString(expectedPlanFragment));

        assertQuery(singleFuzzyTerm, XPATH, List.of("/test/a"));
        assertQuery(twoFuzzyTerms, XPATH, List.of("/test/a", "/test/c"));
        assertQuery(escapedTilde, XPATH, List.of("/test/b"));
    });
}

@Test
public void fullTextQueryRegExp() throws Exception {
Tree index = setup(builder -> builder.indexRule("nt:base").property("propa").analyzed(), idx -> {
Expand Down Expand Up @@ -318,7 +346,7 @@ protected Tree setup(List<String> analyzedFields, Consumer<Tree> indexHook) thro
);
}

private Tree setup(Consumer<IndexDefinitionBuilder> builderHook, Consumer<Tree> indexHook, String... propNames) throws Exception {
protected Tree setup(Consumer<IndexDefinitionBuilder> builderHook, Consumer<Tree> indexHook, String... propNames) throws Exception {
IndexDefinitionBuilder builder = indexOptions.createIndex(
indexOptions.createIndexDefinitionBuilder(), false, propNames);
builder.noAsync();
Expand All @@ -332,7 +360,7 @@ private Tree setup(Consumer<IndexDefinitionBuilder> builderHook, Consumer<Tree>
return index;
}

private String explain(String query, String lang) {
/**
 * Executes the given query prefixed with {@code "explain "} and returns the first entry of the result.
 *
 * @param query the query to explain
 * @param lang  the query language passed through to {@code executeQuery}
 * @return the first element returned by {@code executeQuery} for the explain statement
 */
protected String explain(String query, String lang) {
    return executeQuery("explain " + query, lang).get(0);
}
Expand Down
Loading