Skip to content

Add support for matched_fields with the unified highlighter #18166

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
- Improve sort-query performance by retaining the default `totalHitsThreshold` for approximated `match_all` queries ([#18189](https://github.com/opensearch-project/OpenSearch/pull/18189))
- Enable testing for ExtensiblePlugins using classpath plugins ([#16908](https://github.com/opensearch-project/OpenSearch/pull/16908))
- Introduce system generated ingest pipeline ([#17817](https://github.com/opensearch-project/OpenSearch/pull/17817))
- Add support for `matched_fields` with the unified highlighter ([#18164](https://github.com/opensearch-project/OpenSearch/issues/18164))

### Changed

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@
import org.apache.lucene.search.highlight.DefaultEncoder;
import org.apache.lucene.search.uhighlight.CustomSeparatorBreakIterator;
import org.apache.lucene.search.uhighlight.SplittingBreakIterator;
import org.apache.lucene.search.uhighlight.UnifiedHighlighter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.tests.index.RandomIndexWriter;
import org.opensearch.core.common.Strings;
Expand Down Expand Up @@ -123,9 +124,10 @@ private void assertHighlightOneDoc(
TopDocs topDocs = searcher.search(new MatchAllDocsQuery(), 1, Sort.INDEXORDER);
assertThat(topDocs.totalHits.value(), equalTo(1L));
String rawValue = Strings.collectionToDelimitedString(plainTextForHighlighter, String.valueOf(MULTIVAL_SEP_CHAR));
UnifiedHighlighter.Builder builder = UnifiedHighlighter.builder(searcher, hiliteAnalyzer);
builder.withFieldMatcher("text"::equals);
CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(
searcher,
hiliteAnalyzer,
builder,
null,
passageFormatter,
locale,
Expand All @@ -135,11 +137,9 @@ private void assertHighlightOneDoc(
query,
noMatchSize,
expectedPassages.length,
name -> "text".equals(name),
Integer.MAX_VALUE,
null
);
highlighter.setFieldMatcher((name) -> "text".equals(name));
final Snippet[] snippets = highlighter.highlightField(getOnlyLeafReader(reader), topDocs.scoreDocs[0].doc, () -> rawValue);
assertEquals(expectedPassages.length, snippets.length);
for (int i = 0; i < snippets.length; i++) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,25 @@ setup:
"postings":
"type": "text"
"index_options": "offsets"
"another_text":
"type": "text"
"analyzer": "stop"
"fields":
"plain":
"type": "text"
"analyzer": "standard"
- do:
index:
index: test
id: 1
body:
"text" : "The quick brown fox is brown."
- do:
index:
index: test
id: 2
body:
"another_text" : "What jumps over the lazy dog?"
- do:
indices.refresh: {}

Expand All @@ -33,3 +46,17 @@ setup:
- match: {hits.hits.0.highlight.text.0: "The <em>quick</em> <em>brown</em> <em>fox</em> is <em>brown</em>."}
- match: {hits.hits.0.highlight.text\.fvh.0: "The <em>quick</em> <em>brown</em> <em>fox</em> is <em>brown</em>."}
- match: {hits.hits.0.highlight.text\.postings.0: "The <em>quick</em> <em>brown</em> <em>fox</em> is <em>brown</em>."}
---
"With matched_fields":
- skip:
version: " - 3.0.99"
reason: "matched_fields support for unified added in OpenSearch 3.1.0"
- do:
search:
rest_total_hits_as_int: true
body:
query : {"multi_match" : { "query" : "the dog", "fields" : [ "another_text", "another_text.plain"] } }
highlight : { "type" : "unified", "fields" : { "another_text" : { "matched_fields": [ "another_text.plain" ] } } }

# Here we want "the" (stopword ignored on another_text) to be highlighted thanks to matched_fields
- match: {hits.hits.0.highlight.another_text.0: "What jumps over <em>the</em> lazy <em>dog</em>?"}
Original file line number Diff line number Diff line change
Expand Up @@ -1063,15 +1063,69 @@ public void testFVHManyMatches() throws Exception {
assertThat(defaultPhraseLimit.getTook().getMillis(), lessThan(largePhraseLimit.getTook().getMillis()));
}

public void testMatchedFieldsFvhRequireFieldMatch() throws Exception {
checkMatchedFieldsCase(true);
public void testMatchedFieldsWithUnified() throws Exception {
Settings.Builder settings = Settings.builder();
settings.put(indexSettings());
settings.put("index.analysis.analyzer.mock_english.tokenizer", "standard");
settings.put("index.analysis.analyzer.mock_english.filter", "mock_snowball");
assertAcked(
prepareCreate("test").setSettings(settings)
.setMapping(
XContentFactory.jsonBuilder()
.startObject()
.startObject("properties")
.startObject("foo")
.field("type", "text")
.field("store", true)
.field("analyzer", "mock_english")
.startObject("fields")
.startObject("plain")
.field("type", "text")
.field("analyzer", "standard")
.endObject()
.endObject()
.endObject()
.endObject()
.endObject()
)
Comment on lines +1073 to +1090
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Super minor suggestion, more for future reference and not something you need to change...Java now allows multiline strings so you can just write regular JSON to make this a bit more readable:

Suggested change
.setMapping(
XContentFactory.jsonBuilder()
.startObject()
.startObject("properties")
.startObject("foo")
.field("type", "text")
.field("store", true)
.field("analyzer", "mock_english")
.startObject("fields")
.startObject("plain")
.field("type", "text")
.field("analyzer", "standard")
.endObject()
.endObject()
.endObject()
.endObject()
.endObject()
)
.setMapping("""
{
"properties": {
"foo": {
"type": "text",
"store": true,
"analyzer": "mock_english",
"fields": {
"plain": {
"type": "text",
"analyzer": "standard"
}
}
}
}
}
""")

Copy link
Author

@nomoa nomoa May 9, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good call, thanks for the suggestion. If time permits, I might try to clean up this whole test class in a separate PR, if you find that useful.

);
ensureGreen();

index("test", "type1", "1", "foo", "running with scissors");
refresh();
Field fooField = new Field("foo").numOfFragments(1).order("score").fragmentSize(25).highlighterType("unified");
SearchRequestBuilder req = client().prepareSearch("test").highlighter(new HighlightBuilder().field(fooField));

// First check highlighting without any matched fields set
SearchResponse resp = req.setQuery(queryStringQuery("running scissors").field("foo")).get();
assertHighlight(resp, 0, "foo", 0, equalTo("<em>running</em> with <em>scissors</em>"));

// And that matching a subfield doesn't automatically highlight it
resp = req.setQuery(queryStringQuery("foo.plain:running scissors").field("foo")).get();
assertHighlight(resp, 0, "foo", 0, equalTo("running with <em>scissors</em>"));

// Add the subfield to the list of matched fields but don't match it. Everything should still work
// like before we added it.
fooField = new Field("foo").numOfFragments(1).order("score").fragmentSize(25).highlighterType("unified");
fooField.matchedFields("foo.plain");
req = client().prepareSearch("test").highlighter(new HighlightBuilder().field(fooField));
resp = req.setQuery(queryStringQuery("running scissors").field("foo")).get();
assertHighlight(resp, 0, "foo", 0, equalTo("<em>running</em> with <em>scissors</em>"));

// Now make half the matches come from the stored field and half from just a matched field.
resp = req.setQuery(queryStringQuery("foo.plain:running scissors").field("foo")).get();
assertHighlight(resp, 0, "foo", 0, equalTo("<em>running</em> with <em>scissors</em>"));
}

public void testFvhMatchedFieldsRequireFieldMatch() throws Exception {
checkFvhMatchedFieldsCase(true);
}

public void testMatchedFieldsFvhNoRequireFieldMatch() throws Exception {
checkMatchedFieldsCase(false);
public void testFvhMatchedFieldsFvhNoRequireFieldMatch() throws Exception {
checkFvhMatchedFieldsCase(false);
}

private void checkMatchedFieldsCase(boolean requireFieldMatch) throws Exception {
private void checkFvhMatchedFieldsCase(boolean requireFieldMatch) throws Exception {
Settings.Builder settings = Settings.builder();
settings.put(indexSettings());
settings.put("index.analysis.analyzer.mock_english.tokenizer", "standard");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,6 @@
class CustomFieldHighlighter extends FieldHighlighter {
private static final Passage[] EMPTY_PASSAGE = new Passage[0];

private static final Comparator<Passage> DEFAULT_PASSAGE_SORT_COMPARATOR = Comparator.comparingInt(Passage::getStartOffset);

private final Locale breakIteratorLocale;
private final int noMatchSize;
private String fieldValue;
Expand All @@ -72,7 +70,8 @@ class CustomFieldHighlighter extends FieldHighlighter {
int maxPassages,
int maxNoHighlightPassages,
PassageFormatter passageFormatter,
int noMatchSize
int noMatchSize,
Comparator<Passage> passageSortComparator
) {
super(
field,
Expand All @@ -82,7 +81,7 @@ class CustomFieldHighlighter extends FieldHighlighter {
maxPassages,
maxNoHighlightPassages,
passageFormatter,
DEFAULT_PASSAGE_SORT_COMPARATOR
passageSortComparator
);
this.breakIteratorLocale = breakIteratorLocale;
this.noMatchSize = noMatchSize;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,26 +32,22 @@

package org.opensearch.lucene.search.uhighlight;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.spans.SpanMultiTermQueryWrapper;
import org.apache.lucene.queries.spans.SpanNearQuery;
import org.apache.lucene.queries.spans.SpanOrQuery;
import org.apache.lucene.queries.spans.SpanQuery;
import org.apache.lucene.queries.spans.SpanTermQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.uhighlight.FieldHighlighter;
import org.apache.lucene.search.uhighlight.FieldOffsetStrategy;
import org.apache.lucene.search.uhighlight.LabelledCharArrayMatcher;
import org.apache.lucene.search.uhighlight.NoOpOffsetStrategy;
import org.apache.lucene.search.uhighlight.Passage;
import org.apache.lucene.search.uhighlight.PassageFormatter;
import org.apache.lucene.search.uhighlight.PhraseHelper;
import org.apache.lucene.search.uhighlight.SplittingBreakIterator;
import org.apache.lucene.search.uhighlight.UHComponents;
import org.apache.lucene.search.uhighlight.PassageScorer;
import org.apache.lucene.search.uhighlight.UnifiedHighlighter;
import org.apache.lucene.util.BytesRef;
import org.opensearch.common.CheckedSupplier;
import org.opensearch.common.Nullable;
import org.opensearch.common.lucene.search.MultiPhrasePrefixQuery;
Expand All @@ -61,9 +57,9 @@
import java.text.BreakIterator;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.Locale;
import java.util.Set;
import java.util.function.Predicate;

/**
* Subclass of the {@link UnifiedHighlighter} that works for a single field in a single document.
Expand Down Expand Up @@ -91,7 +87,7 @@ public class CustomUnifiedHighlighter extends UnifiedHighlighter {
/**
* Creates a new instance of {@link CustomUnifiedHighlighter}
*
* @param analyzer the analyzer used for the field at index time, used for multi term queries internally.
* @param builder the unified highlighter builder
* @param offsetSource the {@link OffsetSource} to use for offset retrieval.
* @param passageFormatter our own {@link CustomPassageFormatter}
* which generates snippets in forms of {@link Snippet} objects.
Expand All @@ -104,14 +100,12 @@ public class CustomUnifiedHighlighter extends UnifiedHighlighter {
* @param query the query we're highlighting
* @param noMatchSize The size of the text that should be returned when no highlighting can be performed.
* @param maxPassages the maximum number of passages to highlight
* @param fieldMatcher decides which terms should be highlighted
* @param maxAnalyzedOffset if the field is more than this long we'll refuse to use the ANALYZED
* offset source for it because it'd be super slow
* @param fieldMaxAnalyzedOffset this is used to limit the length of input that will be ANALYZED; this allows bigger fields to be partially highlighted
*/
public CustomUnifiedHighlighter(
IndexSearcher searcher,
Analyzer analyzer,
UnifiedHighlighter.Builder builder,
OffsetSource offsetSource,
PassageFormatter passageFormatter,
@Nullable Locale breakIteratorLocale,
Expand All @@ -121,21 +115,19 @@ public CustomUnifiedHighlighter(
Query query,
int noMatchSize,
int maxPassages,
Predicate<String> fieldMatcher,
int maxAnalyzedOffset,
Integer fieldMaxAnalyzedOffset
) throws IOException {
super(searcher, analyzer);
super(builder);
this.offsetSource = offsetSource;
this.breakIterator = breakIterator;
this.breakIteratorLocale = breakIteratorLocale == null ? Locale.ROOT : breakIteratorLocale;
this.passageFormatter = passageFormatter;
this.index = index;
this.field = field;
this.noMatchSize = noMatchSize;
this.setFieldMatcher(fieldMatcher);
this.maxAnalyzedOffset = maxAnalyzedOffset;
fieldHighlighter = getFieldHighlighter(field, query, extractTerms(query), maxPassages);
fieldHighlighter = (CustomFieldHighlighter) getFieldHighlighter(field, query, extractTerms(query), maxPassages);
this.fieldMaxAnalyzedOffset = fieldMaxAnalyzedOffset;
}

Expand Down Expand Up @@ -203,26 +195,27 @@ protected PassageFormatter getFormatter(String field) {
}

@Override
protected CustomFieldHighlighter getFieldHighlighter(String field, Query query, Set<Term> allTerms, int maxPassages) {
Predicate<String> fieldMatcher = getFieldMatcher(field);
BytesRef[] terms = filterExtractedTerms(fieldMatcher, allTerms);
Set<HighlightFlag> highlightFlags = getFlags(field);
PhraseHelper phraseHelper = getPhraseHelper(field, query, highlightFlags);
LabelledCharArrayMatcher[] automata = getAutomata(field, query, highlightFlags);
UHComponents components = new UHComponents(field, fieldMatcher, query, terms, phraseHelper, automata, false, highlightFlags);
OffsetSource offsetSource = getOptimizedOffsetSource(components);
BreakIterator breakIterator = new SplittingBreakIterator(getBreakIterator(field), UnifiedHighlighter.MULTIVAL_SEP_CHAR);
FieldOffsetStrategy strategy = getOffsetStrategy(offsetSource, components);
protected FieldHighlighter newFieldHighlighter(
String field,
FieldOffsetStrategy fieldOffsetStrategy,
BreakIterator breakIterator,
PassageScorer passageScorer,
int maxPassages,
int maxNoHighlightPassages,
PassageFormatter passageFormatter,
Comparator<Passage> passageSortComparator
) {
return new CustomFieldHighlighter(
field,
strategy,
fieldOffsetStrategy,
breakIteratorLocale,
breakIterator,
getScorer(field),
maxPassages,
(noMatchSize > 0 ? 1 : 0),
getFormatter(field),
noMatchSize
noMatchSize,
passageSortComparator
);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,10 +60,12 @@
import java.io.IOException;
import java.text.BreakIterator;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.function.Predicate;
import java.util.stream.Collectors;

Expand Down Expand Up @@ -189,8 +191,7 @@ CustomUnifiedHighlighter buildHighlighter(FieldHighlightContext fieldContext) th
higlighterNumberOfFragments = numberOfFragments;
}
return new CustomUnifiedHighlighter(
searcher,
analyzer,
newBuilder(searcher, analyzer, fieldContext),
offsetSource,
passageFormatter,
fieldContext.field.fieldOptions().boundaryScannerLocale(),
Expand All @@ -200,7 +201,6 @@ CustomUnifiedHighlighter buildHighlighter(FieldHighlightContext fieldContext) th
fieldContext.query,
fieldContext.field.fieldOptions().noMatchSize(),
higlighterNumberOfFragments,
fieldMatcher(fieldContext),
maxAnalyzedOffset,
fieldMaxAnalyzedOffset
);
Expand Down Expand Up @@ -273,10 +273,27 @@ protected OffsetSource getOffsetSource(MappedFieldType fieldType) {
return OffsetSource.ANALYSIS;
}

/**
 * Creates the Lucene {@code UnifiedHighlighter.Builder} used to construct the highlighter for one field.
 * <p>
 * When the field options declare a non-empty set of matched fields, a masked-fields function is
 * registered that returns those fields for {@code fieldContext.fieldName} and an empty set for any
 * other field name. The field matcher produced by {@code fieldMatcher(fieldContext)} is always applied.
 *
 * @param searcher     the index searcher handed to the builder
 * @param analyzer     the analyzer handed to the builder
 * @param fieldContext the highlight context supplying the field name and its field options
 * @return the configured builder
 */
private org.apache.lucene.search.uhighlight.UnifiedHighlighter.Builder newBuilder(
IndexSearcher searcher,
Analyzer analyzer,
FieldHighlightContext fieldContext
) {
org.apache.lucene.search.uhighlight.UnifiedHighlighter.Builder builder = org.apache.lucene.search.uhighlight.UnifiedHighlighter
.builder(searcher, analyzer);
Set<String> matchedFields = fieldContext.field.fieldOptions().matchedFields();
// Only install the masked-fields function when matched fields were actually requested.
if (matchedFields != null && !matchedFields.isEmpty()) {
// Single-entry lookup: highlighted field name -> its matched fields.
Map<String, Set<String>> maskedFields = Collections.singletonMap(fieldContext.fieldName, matchedFields);
// Any field other than the highlighted one resolves to an empty set.
builder.withMaskedFieldsFunc(f -> maskedFields.getOrDefault(f, Collections.emptySet()));
}
// The field matcher is set regardless of whether matched fields are configured.
builder.withFieldMatcher(fieldMatcher(fieldContext));
return builder;
}

private Predicate<String> fieldMatcher(FieldHighlightContext fieldContext) {
if (fieldContext.field.fieldOptions().requireFieldMatch()) {
String fieldName = fieldContext.fieldName;
return fieldName::equals;
Set<String> matchedFields = fieldContext.field.fieldOptions().matchedFields();
return f -> fieldName.equals(f) || (matchedFields != null && matchedFields.contains(f));
}
// ignore terms that targets the _id field since they use a different encoding
// that is not compatible with utf8
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -100,9 +100,10 @@ private void assertHighlightOneDoc(
TopDocs topDocs = searcher.search(new MatchAllDocsQuery(), 1, Sort.INDEXORDER);
assertThat(topDocs.totalHits.value(), equalTo(1L));
String rawValue = Strings.arrayToDelimitedString(inputs, String.valueOf(MULTIVAL_SEP_CHAR));
UnifiedHighlighter.Builder builder = UnifiedHighlighter.builder(searcher, analyzer);
builder.withFieldMatcher("text"::equals);
CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(
searcher,
analyzer,
builder,
UnifiedHighlighter.OffsetSource.ANALYSIS,
new CustomPassageFormatter("<b>", "</b>", new DefaultEncoder()),
locale,
Expand All @@ -112,7 +113,6 @@ private void assertHighlightOneDoc(
query,
noMatchSize,
expectedPassages.length,
name -> "text".equals(name),
Integer.MAX_VALUE,
null
);
Expand Down
Loading