Skip to content

Commit 4308f4c

Browse files
authored
Disable scoring of Keyword Term search by default, allow user use old logic using useSimilarity parameter (#17889)
* Initial attempt to add constant scorer for term keyword search * will follow up with unit tests * making sure I'm on the right track Signed-off-by: Asim Mahmood <[email protected]> * Fix unit tests * will follow up with manually testing the new mapping parameter Signed-off-by: Asim Mahmood <[email protected]> * Add changelog entry Signed-off-by: Asim Mahmood <[email protected]> * Fix unit test Signed-off-by: Asim Mahmood <[email protected]> * Fix changelog entry Signed-off-by: Asim Mahmood <[email protected]> * Apply style check Signed-off-by: Asim Mahmood <[email protected]> * Fix unit tests * since term is a basic query type, it affects many that build on top * none of these should cause a regression, infact this change should fix any possible regression these query types, which may not be copatured in big5 datasets Signed-off-by: Asim Mahmood <[email protected]> * Fix more unit tests Signed-off-by: Asim Mahmood <[email protected]> * Fix flakly test * due to the random nature of org.opensearch.index.query.SimpleQueryStringBuilderTests.testToQuery, this can sometimes fail with `but: was <ConstantScore(mapped_string_2:za)>` Signed-off-by: Asim Mahmood <[email protected]> # Please enter the commit message for your changes. Lines starting * Fix style check Signed-off-by: Asim Mahmood <[email protected]> * Change parameter to use snake_case: use_similarity Signed-off-by: Asim Mahmood <[email protected]> * Fix integ tests * also check all places where we do `X instanceof BoostQuery` to also check for `ConstantScoreQuery` Signed-off-by: Asim Mahmood <[email protected]> --------- Signed-off-by: Asim Mahmood <[email protected]> Signed-off-by: Asim Mahmood <[email protected]> # Please enter the commit message for your changes. Lines starting Signed-off-by: Asim M <[email protected]>
1 parent 5bbb699 commit 4308f4c

File tree

13 files changed

+178
-65
lines changed

13 files changed

+178
-65
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
3939
- Add `ApproximateMatchAllQuery` that targets match_all queries and approximates sorts ([#17772](https://github.com/opensearch-project/OpenSearch/pull/17772))
4040
- Add TermsQuery support to Search GRPC endpoint ([#17888](https://github.com/opensearch-project/OpenSearch/pull/17888))
4141
- Support sub agg in filter rewrite optimization ([#17447](https://github.com/opensearch-project/OpenSearch/pull/17447)
42+
- Disable scoring of keyword term search by default, fallback logic with new use_similarity:true parameter ([#17889](https://github.com/opensearch-project/OpenSearch/pull/17889))
4243

4344
### Changed
4445
- Migrate BC libs to their FIPS counterparts ([#14912](https://github.com/opensearch-project/OpenSearch/pull/14912))

modules/percolator/src/test/java/org/opensearch/percolator/QueryBuilderStoreTests.java

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,8 @@
3838
import org.apache.lucene.index.IndexWriterConfig;
3939
import org.apache.lucene.index.LeafReaderContext;
4040
import org.apache.lucene.index.NoMergePolicy;
41+
import org.apache.lucene.index.Term;
42+
import org.apache.lucene.search.ConstantScoreQuery;
4143
import org.apache.lucene.search.Query;
4244
import org.apache.lucene.search.TermQuery;
4345
import org.apache.lucene.store.Directory;
@@ -121,9 +123,13 @@ public void testStoringQueryBuilders() throws IOException {
121123
CheckedFunction<Integer, Query, IOException> queries = queryStore.getQueries(leafContext);
122124
assertEquals(queryBuilders.length, leafContext.reader().numDocs());
123125
for (int i = 0; i < queryBuilders.length; i++) {
124-
TermQuery query = (TermQuery) queries.apply(i);
125-
assertEquals(queryBuilders[i].fieldName(), query.getTerm().field());
126-
assertEquals(queryBuilders[i].value(), query.getTerm().text());
126+
Query query = queries.apply(i);
127+
if (query instanceof ConstantScoreQuery constantScoreQuery) {
128+
query = constantScoreQuery.getQuery();
129+
}
130+
Term term = ((TermQuery) query).getTerm();
131+
assertEquals(queryBuilders[i].fieldName(), term.field());
132+
assertEquals(queryBuilders[i].value(), term.text());
127133
}
128134
}
129135
}

server/src/main/java/org/opensearch/index/mapper/KeywordFieldMapper.java

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
import org.apache.lucene.index.IndexOptions;
4141
import org.apache.lucene.index.Term;
4242
import org.apache.lucene.search.BoostQuery;
43+
import org.apache.lucene.search.ConstantScoreQuery;
4344
import org.apache.lucene.search.FuzzyQuery;
4445
import org.apache.lucene.search.IndexOrDocValuesQuery;
4546
import org.apache.lucene.search.MultiTermQuery;
@@ -158,6 +159,7 @@ public static class Builder extends ParametrizedFieldMapper.Builder {
158159
);
159160
private final Parameter<Boolean> hasNorms = TextParams.norms(false, m -> toType(m).fieldType.omitNorms() == false);
160161
private final Parameter<SimilarityProvider> similarity = TextParams.similarity(m -> toType(m).similarity);
162+
private final Parameter<Boolean> useSimilarity = Parameter.boolParam("use_similarity", true, m -> toType(m).useSimilarity, false);
161163

162164
private final Parameter<String> normalizer = Parameter.stringParam("normalizer", false, m -> toType(m).normalizerName, "default");
163165

@@ -214,6 +216,7 @@ protected List<Parameter<?>> getParameters() {
214216
indexOptions,
215217
hasNorms,
216218
similarity,
219+
useSimilarity,
217220
normalizer,
218221
splitQueriesOnWhitespace,
219222
boost,
@@ -275,6 +278,7 @@ public static class KeywordFieldType extends StringFieldType {
275278

276279
private final int ignoreAbove;
277280
private final String nullValue;
281+
private final boolean useSimilarity;
278282

279283
public KeywordFieldType(String name, FieldType fieldType, NamedAnalyzer normalizer, NamedAnalyzer searchAnalyzer, Builder builder) {
280284
super(
@@ -290,13 +294,19 @@ public KeywordFieldType(String name, FieldType fieldType, NamedAnalyzer normaliz
290294
setBoost(builder.boost.getValue());
291295
this.ignoreAbove = builder.ignoreAbove.getValue();
292296
this.nullValue = builder.nullValue.getValue();
297+
this.useSimilarity = builder.useSimilarity.getValue();
293298
}
294299

295300
public KeywordFieldType(String name, boolean isSearchable, boolean hasDocValues, Map<String, String> meta) {
301+
this(name, isSearchable, hasDocValues, false, meta);
302+
}
303+
304+
public KeywordFieldType(String name, boolean isSearchable, boolean hasDocValues, boolean useSimilarity, Map<String, String> meta) {
296305
super(name, isSearchable, false, hasDocValues, TextSearchInfo.SIMPLE_MATCH_ONLY, meta);
297306
setIndexAnalyzer(Lucene.KEYWORD_ANALYZER);
298307
this.ignoreAbove = Integer.MAX_VALUE;
299308
this.nullValue = null;
309+
this.useSimilarity = useSimilarity;
300310
}
301311

302312
public KeywordFieldType(String name) {
@@ -314,12 +324,14 @@ public KeywordFieldType(String name, FieldType fieldType) {
314324
);
315325
this.ignoreAbove = Integer.MAX_VALUE;
316326
this.nullValue = null;
327+
this.useSimilarity = false;
317328
}
318329

319330
public KeywordFieldType(String name, NamedAnalyzer analyzer) {
320331
super(name, true, false, true, new TextSearchInfo(Defaults.FIELD_TYPE, null, analyzer, analyzer), Collections.emptyMap());
321332
this.ignoreAbove = Integer.MAX_VALUE;
322333
this.nullValue = null;
334+
this.useSimilarity = false;
323335
}
324336

325337
@Override
@@ -423,7 +435,14 @@ public Query termQueryCaseInsensitive(Object value, QueryShardContext context) {
423435
public Query termQuery(Object value, QueryShardContext context) {
424436
failIfNotIndexedAndNoDocValues();
425437
if (isSearchable()) {
426-
return super.termQuery(value, context);
438+
Query query = super.termQuery(value, context);
439+
if (!this.useSimilarity) {
440+
query = new ConstantScoreQuery(super.termQuery(value, context));
441+
}
442+
if (boost() != 1f) {
443+
query = new BoostQuery(query, boost());
444+
}
445+
return query;
427446
} else {
428447
Query query = SortedSetDocValuesField.newSlowRangeQuery(
429448
name(),
@@ -703,6 +722,7 @@ public Query wildcardQuery(
703722
private final String indexOptions;
704723
private final FieldType fieldType;
705724
private final SimilarityProvider similarity;
725+
private final boolean useSimilarity;
706726
private final String normalizerName;
707727
private final boolean splitQueriesOnWhitespace;
708728

@@ -726,6 +746,7 @@ protected KeywordFieldMapper(
726746
this.indexOptions = builder.indexOptions.getValue();
727747
this.fieldType = fieldType;
728748
this.similarity = builder.similarity.getValue();
749+
this.useSimilarity = builder.useSimilarity.getValue();
729750
this.normalizerName = builder.normalizer.getValue();
730751
this.splitQueriesOnWhitespace = builder.splitQueriesOnWhitespace.getValue();
731752

@@ -740,6 +761,10 @@ public int ignoreAbove() {
740761
return ignoreAbove;
741762
}
742763

764+
boolean useSimilarity() {
765+
return useSimilarity;
766+
}
767+
743768
@Override
744769
protected KeywordFieldMapper clone() {
745770
return (KeywordFieldMapper) super.clone();

server/src/main/java/org/opensearch/index/mapper/MappedFieldType.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -497,6 +497,9 @@ public static Term extractTerm(Query termQuery) throws IOException {
497497
return new Term(tisQuery.getField(), term);
498498
}
499499
}
500+
if (termQuery instanceof ConstantScoreQuery) {
501+
termQuery = ((ConstantScoreQuery) termQuery).getQuery();
502+
}
500503
if (termQuery instanceof TermQuery == false) {
501504
throw new IllegalArgumentException("Cannot extract a term from a query of type " + termQuery.getClass() + ": " + termQuery);
502505
}

server/src/test/java/org/opensearch/index/mapper/FlatObjectFieldTypeTests.java

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
import org.apache.lucene.index.IndexOptions;
1414
import org.apache.lucene.index.Term;
1515
import org.apache.lucene.search.AutomatonQuery;
16+
import org.apache.lucene.search.ConstantScoreQuery;
1617
import org.apache.lucene.search.FieldExistsQuery;
1718
import org.apache.lucene.search.FuzzyQuery;
1819
import org.apache.lucene.search.IndexOrDocValuesQuery;
@@ -182,7 +183,9 @@ public void testTermQueryCaseInsensitive() {
182183
true,
183184
false
184185
);
185-
Query expected = new TermQuery(new Term("field" + VALUE_AND_PATH_SUFFIX, new BytesRef("field.field1=fOo")));
186+
Query expected = new ConstantScoreQuery(
187+
new TermQuery(new Term("field" + VALUE_AND_PATH_SUFFIX, new BytesRef("field.field1=fOo")))
188+
);
186189

187190
assertEquals(expected, ft.termQuery("fOo", MOCK_QSC_ENABLE_INDEX_DOC_VALUES));
188191
}
@@ -275,7 +278,10 @@ public void testTermQuery() {
275278
String searchFieldName = flatParentFieldType.getSearchField();
276279
String searchValues = flatParentFieldType.rewriteSearchValue("foo");
277280
assertEquals("foo", searchValues);
278-
assertEquals(new TermQuery(new Term(searchFieldName, searchValues)), flatParentFieldType.termQuery(searchValues, null));
281+
assertEquals(
282+
new ConstantScoreQuery(new TermQuery(new Term(searchFieldName, searchValues))),
283+
flatParentFieldType.termQuery(searchValues, null)
284+
);
279285

280286
FlatObjectFieldType dynamicMappedFieldType = new FlatObjectFieldMapper.FlatObjectFieldType(
281287
"field.bar",
@@ -289,7 +295,7 @@ public void testTermQuery() {
289295
String searchValuesDocPath = dynamicMappedFieldType.rewriteSearchValue("foo");
290296
assertEquals("field.bar=foo", searchValuesDocPath);
291297
assertEquals(
292-
new TermQuery(new Term(searchFieldNameDocPath, searchValuesDocPath)),
298+
new ConstantScoreQuery(new TermQuery(new Term(searchFieldNameDocPath, searchValuesDocPath))),
293299
dynamicMappedFieldType.termQuery("foo", null)
294300
);
295301
}
@@ -302,7 +308,7 @@ public void testTermQuery() {
302308
true,
303309
false
304310
);
305-
Query expected = new TermQuery(new Term("field" + VALUE_SUFFIX, new BytesRef("foo")));
311+
Query expected = new ConstantScoreQuery(new TermQuery(new Term("field" + VALUE_SUFFIX, new BytesRef("foo"))));
306312
assertEquals(expected, ft.termQuery("foo", MOCK_QSC_ENABLE_INDEX_DOC_VALUES));
307313
}
308314

@@ -314,7 +320,9 @@ public void testTermQuery() {
314320
true,
315321
false
316322
);
317-
Query expected = new TermQuery(new Term("field" + VALUE_AND_PATH_SUFFIX, new BytesRef("field.field1=foo")));
323+
Query expected = new ConstantScoreQuery(
324+
new TermQuery(new Term("field" + VALUE_AND_PATH_SUFFIX, new BytesRef("field.field1=foo")))
325+
);
318326

319327
assertEquals(expected, ft.termQuery("foo", MOCK_QSC_ENABLE_INDEX_DOC_VALUES));
320328
}

server/src/test/java/org/opensearch/index/mapper/KeywordFieldTypeTests.java

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@
4343
import org.apache.lucene.document.FieldType;
4444
import org.apache.lucene.document.SortedSetDocValuesField;
4545
import org.apache.lucene.index.Term;
46+
import org.apache.lucene.search.ConstantScoreQuery;
4647
import org.apache.lucene.search.FieldExistsQuery;
4748
import org.apache.lucene.search.FuzzyQuery;
4849
import org.apache.lucene.search.IndexOrDocValuesQuery;
@@ -129,10 +130,16 @@ public void testTermQueryCaseInsensitive() {
129130

130131
public void testTermQuery() {
131132
MappedFieldType ft = new KeywordFieldType("field");
132-
assertEquals(new TermQuery(new Term("field", "foo")), ft.termQuery("foo", MOCK_QSC_ENABLE_INDEX_DOC_VALUES));
133+
assertEquals(
134+
new ConstantScoreQuery(new TermQuery(new Term("field", "foo"))),
135+
ft.termQuery("foo", MOCK_QSC_ENABLE_INDEX_DOC_VALUES)
136+
);
133137

134138
ft = new KeywordFieldType("field", true, false, Collections.emptyMap());
135-
assertEquals(new TermQuery(new Term("field", "foo")), ft.termQuery("foo", MOCK_QSC_ENABLE_INDEX_DOC_VALUES));
139+
assertEquals(
140+
new ConstantScoreQuery(new TermQuery(new Term("field", "foo"))),
141+
ft.termQuery("foo", MOCK_QSC_ENABLE_INDEX_DOC_VALUES)
142+
);
136143

137144
ft = new KeywordFieldType("field", false, true, Collections.emptyMap());
138145
Query expected = SortedSetDocValuesField.newSlowRangeQuery("field", new BytesRef("foo"), new BytesRef("foo"), true, true);
@@ -147,6 +154,10 @@ public void testTermQuery() {
147154
"Cannot search on field [field] since it is both not indexed, and does not have doc_values " + "enabled.",
148155
e.getMessage()
149156
);
157+
// backwards compatible enaled with useSimilarity=true
158+
ft = new KeywordFieldType("field", true, false, true, Collections.emptyMap());
159+
assertEquals(new TermQuery(new Term("field", "foo")), ft.termQuery("foo", MOCK_QSC_ENABLE_INDEX_DOC_VALUES));
160+
150161
}
151162

152163
public void testTermQueryWithNormalizer() {
@@ -164,7 +175,7 @@ protected TokenStream normalize(String fieldName, TokenStream in) {
164175
}
165176
};
166177
MappedFieldType ft = new KeywordFieldType("field", new NamedAnalyzer("my_normalizer", AnalyzerScope.INDEX, normalizer));
167-
assertEquals(new TermQuery(new Term("field", "foo bar")), ft.termQuery("fOo BaR", null));
178+
assertEquals(new ConstantScoreQuery(new TermQuery(new Term("field", "foo bar"))), ft.termQuery("fOo BaR", null));
168179
}
169180

170181
public void testTermsQuery() {
@@ -413,9 +424,9 @@ public void testWildCardQuery() {
413424

414425
public void testNormalizeQueries() {
415426
MappedFieldType ft = new KeywordFieldType("field");
416-
assertEquals(new TermQuery(new Term("field", new BytesRef("FOO"))), ft.termQuery("FOO", null));
427+
assertEquals(new ConstantScoreQuery(new TermQuery(new Term("field", new BytesRef("FOO")))), ft.termQuery("FOO", null));
417428
ft = new KeywordFieldType("field", Lucene.STANDARD_ANALYZER);
418-
assertEquals(new TermQuery(new Term("field", new BytesRef("foo"))), ft.termQuery("FOO", null));
429+
assertEquals(new ConstantScoreQuery(new TermQuery(new Term("field", new BytesRef("foo")))), ft.termQuery("FOO", null));
419430
}
420431

421432
public void testFetchSourceValue() throws IOException {

server/src/test/java/org/opensearch/index/query/MultiMatchQueryBuilderTests.java

Lines changed: 23 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
import org.apache.lucene.search.BooleanClause;
3737
import org.apache.lucene.search.BooleanQuery;
3838
import org.apache.lucene.search.BoostQuery;
39+
import org.apache.lucene.search.ConstantScoreQuery;
3940
import org.apache.lucene.search.DisjunctionMaxQuery;
4041
import org.apache.lucene.search.FuzzyQuery;
4142
import org.apache.lucene.search.IndexOrDocValuesQuery;
@@ -242,7 +243,10 @@ public void testToQueryMultipleFieldsDisableDismax() throws Exception {
242243
.tieBreaker(1.0f)
243244
.toQuery(createShardContext());
244245
Query expected = new DisjunctionMaxQuery(
245-
List.of(new TermQuery(new Term(TEXT_FIELD_NAME, "test")), new TermQuery(new Term(KEYWORD_FIELD_NAME, "test"))),
246+
List.of(
247+
new TermQuery(new Term(TEXT_FIELD_NAME, "test")),
248+
new ConstantScoreQuery(new TermQuery(new Term(KEYWORD_FIELD_NAME, "test")))
249+
),
246250
1
247251
);
248252
assertEquals(expected, query);
@@ -251,7 +255,10 @@ public void testToQueryMultipleFieldsDisableDismax() throws Exception {
251255
public void testToQueryMultipleFieldsDisMaxQuery() throws Exception {
252256
Query query = multiMatchQuery("test").field(TEXT_FIELD_NAME).field(KEYWORD_FIELD_NAME).toQuery(createShardContext());
253257
Query expected = new DisjunctionMaxQuery(
254-
List.of(new TermQuery(new Term(TEXT_FIELD_NAME, "test")), new TermQuery(new Term(KEYWORD_FIELD_NAME, "test"))),
258+
List.of(
259+
new TermQuery(new Term(TEXT_FIELD_NAME, "test")),
260+
new ConstantScoreQuery(new TermQuery(new Term(KEYWORD_FIELD_NAME, "test")))
261+
),
255262
0
256263
);
257264
assertEquals(expected, query);
@@ -260,7 +267,10 @@ public void testToQueryMultipleFieldsDisMaxQuery() throws Exception {
260267
public void testToQueryFieldsWildcard() throws Exception {
261268
Query query = multiMatchQuery("test").field("mapped_str*").tieBreaker(1.0f).toQuery(createShardContext());
262269
Query expected = new DisjunctionMaxQuery(
263-
List.of(new TermQuery(new Term(TEXT_FIELD_NAME, "test")), new TermQuery(new Term(KEYWORD_FIELD_NAME, "test"))),
270+
List.of(
271+
new TermQuery(new Term(TEXT_FIELD_NAME, "test")),
272+
new ConstantScoreQuery(new TermQuery(new Term(KEYWORD_FIELD_NAME, "test")))
273+
),
264274
1
265275
);
266276
assertEquals(expected, query);
@@ -459,7 +469,7 @@ public void testDefaultField() throws Exception {
459469
DisjunctionMaxQuery expected = new DisjunctionMaxQuery(
460470
Arrays.asList(
461471
new TermQuery(new Term(TEXT_FIELD_NAME, "hello")),
462-
new BoostQuery(new TermQuery(new Term(KEYWORD_FIELD_NAME, "hello")), 5.0f)
472+
new BoostQuery(new ConstantScoreQuery(new TermQuery(new Term(KEYWORD_FIELD_NAME, "hello"))), 5.0f)
463473
),
464474
0.0f
465475
);
@@ -487,7 +497,7 @@ public void testDefaultField() throws Exception {
487497
Arrays.asList(
488498
new MatchNoDocsQuery("failed [mapped_int] query, caused by number_format_exception:[For input string: \"hello\"]"),
489499
new TermQuery(new Term(TEXT_FIELD_NAME, "hello")),
490-
new BoostQuery(new TermQuery(new Term(KEYWORD_FIELD_NAME, "hello")), 5.0f)
500+
new BoostQuery(new ConstantScoreQuery(new TermQuery(new Term(KEYWORD_FIELD_NAME, "hello"))), 5.0f)
491501
),
492502
0.0f
493503
);
@@ -531,9 +541,10 @@ public void testWithStopWords() throws Exception {
531541
new BooleanQuery.Builder().add(new TermQuery(new Term(TEXT_FIELD_NAME, "quick")), BooleanClause.Occur.SHOULD)
532542
.add(new TermQuery(new Term(TEXT_FIELD_NAME, "fox")), BooleanClause.Occur.SHOULD)
533543
.build(),
534-
new BooleanQuery.Builder().add(new TermQuery(new Term(KEYWORD_FIELD_NAME, "quick")), BooleanClause.Occur.SHOULD)
535-
.add(new TermQuery(new Term(KEYWORD_FIELD_NAME, "fox")), BooleanClause.Occur.SHOULD)
536-
.build()
544+
new BooleanQuery.Builder().add(
545+
new ConstantScoreQuery(new TermQuery(new Term(KEYWORD_FIELD_NAME, "quick"))),
546+
BooleanClause.Occur.SHOULD
547+
).add(new ConstantScoreQuery(new TermQuery(new Term(KEYWORD_FIELD_NAME, "fox"))), BooleanClause.Occur.SHOULD).build()
537548
),
538549
0f
539550
);
@@ -588,7 +599,10 @@ private void assertQueryWithAllFieldsWildcard(Query query) {
588599
assertEquals(9, noMatchNoDocsQueries);
589600
assertThat(
590601
disjunctionMaxQuery.getDisjuncts(),
591-
hasItems(new TermQuery(new Term(TEXT_FIELD_NAME, "hello")), new TermQuery(new Term(KEYWORD_FIELD_NAME, "hello")))
602+
hasItems(
603+
new TermQuery(new Term(TEXT_FIELD_NAME, "hello")),
604+
new ConstantScoreQuery(new TermQuery(new Term(KEYWORD_FIELD_NAME, "hello")))
605+
)
592606
);
593607
}
594608

0 commit comments

Comments
 (0)