Skip to content

Commit f7907c5

Browse files
committed
Add support for matched_fields with the unified highlighter
This option was silently ignored when running the unified highlighter. Since lucene 10 it is now possible to pass a list of additional fields to fetch matches from and blend into the snippets. Signed-off-by: David Causse <[email protected]>
1 parent 00093ea commit f7907c5

File tree

8 files changed

+139
-48
lines changed

8 files changed

+139
-48
lines changed

CHANGELOG.md

+1
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
1414
- Add pull-based ingestion error metrics and make internal queue size configurable ([#18088](https://github.com/opensearch-project/OpenSearch/pull/18088))
1515
- Enabled Async Shard Batch Fetch by default ([#18139](https://github.com/opensearch-project/OpenSearch/pull/18139))
1616
- Allow to get the search request from the QueryCoordinatorContext ([#17818](https://github.com/opensearch-project/OpenSearch/pull/17818))
17+
- Add support `matched_fields` with the unified highlighter ([#18164](https://github.com/opensearch-project/OpenSearch/issues/18164))
1718

1819
### Changed
1920

plugins/mapper-annotated-text/src/test/java/org/opensearch/search/fetch/subphase/highlight/AnnotatedTextHighlighterTests.java

+4-4
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@
5252
import org.apache.lucene.search.highlight.DefaultEncoder;
5353
import org.apache.lucene.search.uhighlight.CustomSeparatorBreakIterator;
5454
import org.apache.lucene.search.uhighlight.SplittingBreakIterator;
55+
import org.apache.lucene.search.uhighlight.UnifiedHighlighter;
5556
import org.apache.lucene.store.Directory;
5657
import org.apache.lucene.tests.index.RandomIndexWriter;
5758
import org.opensearch.core.common.Strings;
@@ -123,9 +124,10 @@ private void assertHighlightOneDoc(
123124
TopDocs topDocs = searcher.search(new MatchAllDocsQuery(), 1, Sort.INDEXORDER);
124125
assertThat(topDocs.totalHits.value(), equalTo(1L));
125126
String rawValue = Strings.collectionToDelimitedString(plainTextForHighlighter, String.valueOf(MULTIVAL_SEP_CHAR));
127+
UnifiedHighlighter.Builder builder = UnifiedHighlighter.builder(searcher, hiliteAnalyzer);
128+
builder.withFieldMatcher("text"::equals);
126129
CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(
127-
searcher,
128-
hiliteAnalyzer,
130+
builder,
129131
null,
130132
passageFormatter,
131133
locale,
@@ -135,11 +137,9 @@ private void assertHighlightOneDoc(
135137
query,
136138
noMatchSize,
137139
expectedPassages.length,
138-
name -> "text".equals(name),
139140
Integer.MAX_VALUE,
140141
null
141142
);
142-
highlighter.setFieldMatcher((name) -> "text".equals(name));
143143
final Snippet[] snippets = highlighter.highlightField(getOnlyLeafReader(reader), topDocs.scoreDocs[0].doc, () -> rawValue);
144144
assertEquals(expectedPassages.length, snippets.length);
145145
for (int i = 0; i < snippets.length; i++) {

rest-api-spec/src/main/resources/rest-api-spec/test/search.highlight/10_unified.yml

+27
Original file line numberDiff line numberDiff line change
@@ -14,12 +14,25 @@ setup:
1414
"postings":
1515
"type": "text"
1616
"index_options": "offsets"
17+
"another_text":
18+
"type": "text"
19+
"analyzer": "stop"
20+
"fields":
21+
"plain":
22+
"type": "text"
23+
"analyzer": "standard"
1724
- do:
1825
index:
1926
index: test
2027
id: 1
2128
body:
2229
"text" : "The quick brown fox is brown."
30+
- do:
31+
index:
32+
index: test
33+
id: 2
34+
body:
35+
"another_text" : "What jumps over the lazy dog?"
2336
- do:
2437
indices.refresh: {}
2538

@@ -33,3 +46,17 @@ setup:
3346
- match: {hits.hits.0.highlight.text.0: "The <em>quick</em> <em>brown</em> <em>fox</em> is <em>brown</em>."}
3447
- match: {hits.hits.0.highlight.text\.fvh.0: "The <em>quick</em> <em>brown</em> <em>fox</em> is <em>brown</em>."}
3548
- match: {hits.hits.0.highlight.text\.postings.0: "The <em>quick</em> <em>brown</em> <em>fox</em> is <em>brown</em>."}
49+
---
50+
"With matched_fields":
51+
- skip:
52+
version: " - 3.0.99"
53+
reason: "matched_fields support for unified added in OpenSearch 3.1.0"
54+
- do:
55+
search:
56+
rest_total_hits_as_int: true
57+
body:
58+
query : {"multi_match" : { "query" : "the dog", "fields" : [ "another_text", "another_text.plain"] } }
59+
highlight : { "type" : "unified", "fields" : { "another_text" : { "matched_fields": [ "another_text.plain" ] } } }
60+
61+
# Here we want "the" (stopword ignored on another_text) to be highlighted thanks to matched_fields
62+
- match: {hits.hits.0.highlight.another_text.0: "What jumps over <em>the</em> lazy <em>dog</em>?"}

server/src/internalClusterTest/java/org/opensearch/search/fetch/subphase/highlight/HighlighterSearchIT.java

+59-5
Original file line numberDiff line numberDiff line change
@@ -1063,15 +1063,69 @@ public void testFVHManyMatches() throws Exception {
10631063
assertThat(defaultPhraseLimit.getTook().getMillis(), lessThan(largePhraseLimit.getTook().getMillis()));
10641064
}
10651065

1066-
public void testMatchedFieldsFvhRequireFieldMatch() throws Exception {
1067-
checkMatchedFieldsCase(true);
1066+
public void testMatchedFieldsWithUnified() throws Exception {
1067+
Settings.Builder settings = Settings.builder();
1068+
settings.put(indexSettings());
1069+
settings.put("index.analysis.analyzer.mock_english.tokenizer", "standard");
1070+
settings.put("index.analysis.analyzer.mock_english.filter", "mock_snowball");
1071+
assertAcked(
1072+
prepareCreate("test").setSettings(settings)
1073+
.setMapping(
1074+
XContentFactory.jsonBuilder()
1075+
.startObject()
1076+
.startObject("properties")
1077+
.startObject("foo")
1078+
.field("type", "text")
1079+
.field("store", true)
1080+
.field("analyzer", "mock_english")
1081+
.startObject("fields")
1082+
.startObject("plain")
1083+
.field("type", "text")
1084+
.field("analyzer", "standard")
1085+
.endObject()
1086+
.endObject()
1087+
.endObject()
1088+
.endObject()
1089+
.endObject()
1090+
)
1091+
);
1092+
ensureGreen();
1093+
1094+
index("test", "type1", "1", "foo", "running with scissors");
1095+
refresh();
1096+
Field fooField = new Field("foo").numOfFragments(1).order("score").fragmentSize(25).highlighterType("unified");
1097+
SearchRequestBuilder req = client().prepareSearch("test").highlighter(new HighlightBuilder().field(fooField));
1098+
1099+
// First check highlighting without any matched fields set
1100+
SearchResponse resp = req.setQuery(queryStringQuery("running scissors").field("foo")).get();
1101+
assertHighlight(resp, 0, "foo", 0, equalTo("<em>running</em> with <em>scissors</em>"));
1102+
1103+
// And that matching a subfield doesn't automatically highlight it
1104+
resp = req.setQuery(queryStringQuery("foo.plain:running scissors").field("foo")).get();
1105+
assertHighlight(resp, 0, "foo", 0, equalTo("running with <em>scissors</em>"));
1106+
1107+
// Add the subfield to the list of matched fields but don't match it. Everything should still work
1108+
// like before we added it.
1109+
fooField = new Field("foo").numOfFragments(1).order("score").fragmentSize(25).highlighterType("unified");
1110+
fooField.matchedFields("foo.plain");
1111+
req = client().prepareSearch("test").highlighter(new HighlightBuilder().field(fooField));
1112+
resp = req.setQuery(queryStringQuery("running scissors").field("foo")).get();
1113+
assertHighlight(resp, 0, "foo", 0, equalTo("<em>running</em> with <em>scissors</em>"));
1114+
1115+
// Now make half the matches come from the stored field and half from just a matched field.
1116+
resp = req.setQuery(queryStringQuery("foo.plain:running scissors").field("foo")).get();
1117+
assertHighlight(resp, 0, "foo", 0, equalTo("<em>running</em> with <em>scissors</em>"));
1118+
}
1119+
1120+
public void testFvhMatchedFieldsRequireFieldMatch() throws Exception {
1121+
checkFvhMatchedFieldsCase(true);
10681122
}
10691123

1070-
public void testMatchedFieldsFvhNoRequireFieldMatch() throws Exception {
1071-
checkMatchedFieldsCase(false);
1124+
public void testFvhMatchedFieldsFvhNoRequireFieldMatch() throws Exception {
1125+
checkFvhMatchedFieldsCase(false);
10721126
}
10731127

1074-
private void checkMatchedFieldsCase(boolean requireFieldMatch) throws Exception {
1128+
private void checkFvhMatchedFieldsCase(boolean requireFieldMatch) throws Exception {
10751129
Settings.Builder settings = Settings.builder();
10761130
settings.put(indexSettings());
10771131
settings.put("index.analysis.analyzer.mock_english.tokenizer", "standard");

server/src/main/java/org/opensearch/lucene/search/uhighlight/CustomFieldHighlighter.java

+3-4
Original file line numberDiff line numberDiff line change
@@ -57,8 +57,6 @@
5757
class CustomFieldHighlighter extends FieldHighlighter {
5858
private static final Passage[] EMPTY_PASSAGE = new Passage[0];
5959

60-
private static final Comparator<Passage> DEFAULT_PASSAGE_SORT_COMPARATOR = Comparator.comparingInt(Passage::getStartOffset);
61-
6260
private final Locale breakIteratorLocale;
6361
private final int noMatchSize;
6462
private String fieldValue;
@@ -72,7 +70,8 @@ class CustomFieldHighlighter extends FieldHighlighter {
7270
int maxPassages,
7371
int maxNoHighlightPassages,
7472
PassageFormatter passageFormatter,
75-
int noMatchSize
73+
int noMatchSize,
74+
Comparator<Passage> passageSortComparator
7675
) {
7776
super(
7877
field,
@@ -82,7 +81,7 @@ class CustomFieldHighlighter extends FieldHighlighter {
8281
maxPassages,
8382
maxNoHighlightPassages,
8483
passageFormatter,
85-
DEFAULT_PASSAGE_SORT_COMPARATOR
84+
passageSortComparator
8685
);
8786
this.breakIteratorLocale = breakIteratorLocale;
8887
this.noMatchSize = noMatchSize;

server/src/main/java/org/opensearch/lucene/search/uhighlight/CustomUnifiedHighlighter.java

+21-28
Original file line numberDiff line numberDiff line change
@@ -32,26 +32,22 @@
3232

3333
package org.opensearch.lucene.search.uhighlight;
3434

35-
import org.apache.lucene.analysis.Analyzer;
3635
import org.apache.lucene.index.LeafReader;
3736
import org.apache.lucene.index.Term;
3837
import org.apache.lucene.queries.spans.SpanMultiTermQueryWrapper;
3938
import org.apache.lucene.queries.spans.SpanNearQuery;
4039
import org.apache.lucene.queries.spans.SpanOrQuery;
4140
import org.apache.lucene.queries.spans.SpanQuery;
4241
import org.apache.lucene.queries.spans.SpanTermQuery;
43-
import org.apache.lucene.search.IndexSearcher;
4442
import org.apache.lucene.search.PrefixQuery;
4543
import org.apache.lucene.search.Query;
44+
import org.apache.lucene.search.uhighlight.FieldHighlighter;
4645
import org.apache.lucene.search.uhighlight.FieldOffsetStrategy;
47-
import org.apache.lucene.search.uhighlight.LabelledCharArrayMatcher;
4846
import org.apache.lucene.search.uhighlight.NoOpOffsetStrategy;
47+
import org.apache.lucene.search.uhighlight.Passage;
4948
import org.apache.lucene.search.uhighlight.PassageFormatter;
50-
import org.apache.lucene.search.uhighlight.PhraseHelper;
51-
import org.apache.lucene.search.uhighlight.SplittingBreakIterator;
52-
import org.apache.lucene.search.uhighlight.UHComponents;
49+
import org.apache.lucene.search.uhighlight.PassageScorer;
5350
import org.apache.lucene.search.uhighlight.UnifiedHighlighter;
54-
import org.apache.lucene.util.BytesRef;
5551
import org.opensearch.common.CheckedSupplier;
5652
import org.opensearch.common.Nullable;
5753
import org.opensearch.common.lucene.search.MultiPhrasePrefixQuery;
@@ -61,9 +57,9 @@
6157
import java.text.BreakIterator;
6258
import java.util.Collection;
6359
import java.util.Collections;
60+
import java.util.Comparator;
6461
import java.util.Locale;
6562
import java.util.Set;
66-
import java.util.function.Predicate;
6763

6864
/**
6965
* Subclass of the {@link UnifiedHighlighter} that works for a single field in a single document.
@@ -91,7 +87,7 @@ public class CustomUnifiedHighlighter extends UnifiedHighlighter {
9187
/**
9288
* Creates a new instance of {@link CustomUnifiedHighlighter}
9389
*
94-
* @param analyzer the analyzer used for the field at index time, used for multi term queries internally.
90+
* @param builder the unified highlighter builder
9591
* @param offsetSource the {@link OffsetSource} to used for offsets retrieval.
9692
* @param passageFormatter our own {@link CustomPassageFormatter}
9793
* which generates snippets in forms of {@link Snippet} objects.
@@ -104,14 +100,12 @@ public class CustomUnifiedHighlighter extends UnifiedHighlighter {
104100
* @param query the query we're highlighting
105101
* @param noMatchSize The size of the text that should be returned when no highlighting can be performed.
106102
* @param maxPassages the maximum number of passes to highlight
107-
* @param fieldMatcher decides which terms should be highlighted
108103
* @param maxAnalyzedOffset if the field is more than this long we'll refuse to use the ANALYZED
109104
* offset source for it because it'd be super slow
110105
* @param fieldMaxAnalyzedOffset this is used to limit the length of input that will be ANALYZED, this allows bigger fields to be partially highligthed
111106
*/
112107
public CustomUnifiedHighlighter(
113-
IndexSearcher searcher,
114-
Analyzer analyzer,
108+
UnifiedHighlighter.Builder builder,
115109
OffsetSource offsetSource,
116110
PassageFormatter passageFormatter,
117111
@Nullable Locale breakIteratorLocale,
@@ -121,21 +115,19 @@ public CustomUnifiedHighlighter(
121115
Query query,
122116
int noMatchSize,
123117
int maxPassages,
124-
Predicate<String> fieldMatcher,
125118
int maxAnalyzedOffset,
126119
Integer fieldMaxAnalyzedOffset
127120
) throws IOException {
128-
super(searcher, analyzer);
121+
super(builder);
129122
this.offsetSource = offsetSource;
130123
this.breakIterator = breakIterator;
131124
this.breakIteratorLocale = breakIteratorLocale == null ? Locale.ROOT : breakIteratorLocale;
132125
this.passageFormatter = passageFormatter;
133126
this.index = index;
134127
this.field = field;
135128
this.noMatchSize = noMatchSize;
136-
this.setFieldMatcher(fieldMatcher);
137129
this.maxAnalyzedOffset = maxAnalyzedOffset;
138-
fieldHighlighter = getFieldHighlighter(field, query, extractTerms(query), maxPassages);
130+
fieldHighlighter = (CustomFieldHighlighter) getFieldHighlighter(field, query, extractTerms(query), maxPassages);
139131
this.fieldMaxAnalyzedOffset = fieldMaxAnalyzedOffset;
140132
}
141133

@@ -203,26 +195,27 @@ protected PassageFormatter getFormatter(String field) {
203195
}
204196

205197
@Override
206-
protected CustomFieldHighlighter getFieldHighlighter(String field, Query query, Set<Term> allTerms, int maxPassages) {
207-
Predicate<String> fieldMatcher = getFieldMatcher(field);
208-
BytesRef[] terms = filterExtractedTerms(fieldMatcher, allTerms);
209-
Set<HighlightFlag> highlightFlags = getFlags(field);
210-
PhraseHelper phraseHelper = getPhraseHelper(field, query, highlightFlags);
211-
LabelledCharArrayMatcher[] automata = getAutomata(field, query, highlightFlags);
212-
UHComponents components = new UHComponents(field, fieldMatcher, query, terms, phraseHelper, automata, false, highlightFlags);
213-
OffsetSource offsetSource = getOptimizedOffsetSource(components);
214-
BreakIterator breakIterator = new SplittingBreakIterator(getBreakIterator(field), UnifiedHighlighter.MULTIVAL_SEP_CHAR);
215-
FieldOffsetStrategy strategy = getOffsetStrategy(offsetSource, components);
198+
protected FieldHighlighter newFieldHighlighter(
199+
String field,
200+
FieldOffsetStrategy fieldOffsetStrategy,
201+
BreakIterator breakIterator,
202+
PassageScorer passageScorer,
203+
int maxPassages,
204+
int maxNoHighlightPassages,
205+
PassageFormatter passageFormatter,
206+
Comparator<Passage> passageSortComparator
207+
) {
216208
return new CustomFieldHighlighter(
217209
field,
218-
strategy,
210+
fieldOffsetStrategy,
219211
breakIteratorLocale,
220212
breakIterator,
221213
getScorer(field),
222214
maxPassages,
223215
(noMatchSize > 0 ? 1 : 0),
224216
getFormatter(field),
225-
noMatchSize
217+
noMatchSize,
218+
passageSortComparator
226219
);
227220
}
228221

server/src/main/java/org/opensearch/search/fetch/subphase/highlight/UnifiedHighlighter.java

+21-4
Original file line numberDiff line numberDiff line change
@@ -60,10 +60,12 @@
6060
import java.io.IOException;
6161
import java.text.BreakIterator;
6262
import java.util.ArrayList;
63+
import java.util.Collections;
6364
import java.util.HashMap;
6465
import java.util.List;
6566
import java.util.Locale;
6667
import java.util.Map;
68+
import java.util.Set;
6769
import java.util.function.Predicate;
6870
import java.util.stream.Collectors;
6971

@@ -189,8 +191,7 @@ CustomUnifiedHighlighter buildHighlighter(FieldHighlightContext fieldContext) th
189191
higlighterNumberOfFragments = numberOfFragments;
190192
}
191193
return new CustomUnifiedHighlighter(
192-
searcher,
193-
analyzer,
194+
newBuilder(searcher, analyzer, fieldContext),
194195
offsetSource,
195196
passageFormatter,
196197
fieldContext.field.fieldOptions().boundaryScannerLocale(),
@@ -200,7 +201,6 @@ CustomUnifiedHighlighter buildHighlighter(FieldHighlightContext fieldContext) th
200201
fieldContext.query,
201202
fieldContext.field.fieldOptions().noMatchSize(),
202203
higlighterNumberOfFragments,
203-
fieldMatcher(fieldContext),
204204
maxAnalyzedOffset,
205205
fieldMaxAnalyzedOffset
206206
);
@@ -273,10 +273,27 @@ protected OffsetSource getOffsetSource(MappedFieldType fieldType) {
273273
return OffsetSource.ANALYSIS;
274274
}
275275

276+
private org.apache.lucene.search.uhighlight.UnifiedHighlighter.Builder newBuilder(
277+
IndexSearcher searcher,
278+
Analyzer analyzer,
279+
FieldHighlightContext fieldContext
280+
) {
281+
org.apache.lucene.search.uhighlight.UnifiedHighlighter.Builder builder = org.apache.lucene.search.uhighlight.UnifiedHighlighter
282+
.builder(searcher, analyzer);
283+
Set<String> matchedFields = fieldContext.field.fieldOptions().matchedFields();
284+
if (matchedFields != null && !matchedFields.isEmpty()) {
285+
Map<String, Set<String>> maskedFields = Collections.singletonMap(fieldContext.fieldName, matchedFields);
286+
builder.withMaskedFieldsFunc(f -> maskedFields.getOrDefault(f, Collections.emptySet()));
287+
}
288+
builder.withFieldMatcher(fieldMatcher(fieldContext));
289+
return builder;
290+
}
291+
276292
private Predicate<String> fieldMatcher(FieldHighlightContext fieldContext) {
277293
if (fieldContext.field.fieldOptions().requireFieldMatch()) {
278294
String fieldName = fieldContext.fieldName;
279-
return fieldName::equals;
295+
Set<String> matchedFields = fieldContext.field.fieldOptions().matchedFields();
296+
return f -> fieldName.equals(f) || (matchedFields != null && matchedFields.contains(f));
280297
}
281298
// ignore terms that targets the _id field since they use a different encoding
282299
// that is not compatible with utf8

server/src/test/java/org/opensearch/lucene/search/uhighlight/CustomUnifiedHighlighterTests.java

+3-3
Original file line numberDiff line numberDiff line change
@@ -100,9 +100,10 @@ private void assertHighlightOneDoc(
100100
TopDocs topDocs = searcher.search(new MatchAllDocsQuery(), 1, Sort.INDEXORDER);
101101
assertThat(topDocs.totalHits.value(), equalTo(1L));
102102
String rawValue = Strings.arrayToDelimitedString(inputs, String.valueOf(MULTIVAL_SEP_CHAR));
103+
UnifiedHighlighter.Builder builder = UnifiedHighlighter.builder(searcher, analyzer);
104+
builder.withFieldMatcher("text"::equals);
103105
CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(
104-
searcher,
105-
analyzer,
106+
builder,
106107
UnifiedHighlighter.OffsetSource.ANALYSIS,
107108
new CustomPassageFormatter("<b>", "</b>", new DefaultEncoder()),
108109
locale,
@@ -112,7 +113,6 @@ private void assertHighlightOneDoc(
112113
query,
113114
noMatchSize,
114115
expectedPassages.length,
115-
name -> "text".equals(name),
116116
Integer.MAX_VALUE,
117117
null
118118
);

0 commit comments

Comments
 (0)