Skip to content

Commit 39b4892

Browse files
authored
Add semantic field mapper. (#1225)
Signed-off-by: Bo Zhang <[email protected]>
1 parent b0b8b29 commit 39b4892

File tree

16 files changed

+810
-58
lines changed

16 files changed

+810
-58
lines changed

CHANGELOG.md

+1
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
66
## [Unreleased 3.x](https://github.com/opensearch-project/neural-search/compare/main...HEAD)
77

88
### Features
9+
- [Semantic Field] Add semantic field mapper. ([#1225](https://github.com/opensearch-project/neural-search/pull/1225)).
910

1011
### Enhancements
1112

build.gradle

+1
Original file line numberDiff line numberDiff line change
@@ -251,6 +251,7 @@ def knnJarDirectory = "$buildDir/dependencies/opensearch-knn"
251251

252252
dependencies {
253253
api "org.opensearch:opensearch:${opensearch_version}"
254+
implementation group: 'org.opensearch.plugin', name:'mapper-extras-client', version: "${opensearch_version}"
254255
zipArchive group: 'org.opensearch.plugin', name:'opensearch-job-scheduler', version: "${opensearch_build}"
255256
zipArchive group: 'org.opensearch.plugin', name:'opensearch-knn', version: "${opensearch_build}"
256257
zipArchive group: 'org.opensearch.plugin', name:'opensearch-ml-plugin', version: "${opensearch_build}"

qa/rolling-upgrade/src/test/java/org/opensearch/neuralsearch/bwc/rolling/HybridSearchWithRescoreIT.java

+7-1
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,13 @@ private void validateTestIndexOnUpgrade(
105105
) throws Exception {
106106
int docCount = getDocCount(getIndexNameForTest());
107107
assertEquals(numberOfDocs, docCount);
108-
loadModel(modelId);
108+
// In rolling upgrade tests we do not clean up the resources created in the old and mixed cluster phases,
109+
// so only deploy the model when it is not already deployed.
110+
if (!isModelAlreadyDeployed(modelId)) {
111+
loadModel(modelId);
112+
}
113+
// Try to ensure all nodes are green before we do the search.
114+
waitForClusterHealthGreen(NODES_BWC_CLUSTER);
109115
Map<String, Object> searchResponseAsMap = search(
110116
getIndexNameForTest(),
111117
hybridQueryBuilder,
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
/*
2+
* Copyright OpenSearch Contributors
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
package org.opensearch.neuralsearch.constants;
6+
7+
/**
 * Constants related to the index mapping.
 *
 * <p>This class only holds constants, so it is final and cannot be instantiated.
 */
public final class MappingConstants {
    /**
     * Name for the field type. In index mapping we use this key to define the field type.
     */
    public static final String TYPE = "type";

    private MappingConstants() {
        // Constants holder; no instances.
    }
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
/*
2+
* Copyright OpenSearch Contributors
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
package org.opensearch.neuralsearch.constants;
6+
7+
/**
 * Constants for the semantic field.
 *
 * <p>This class only holds constants, so it is final and cannot be instantiated.
 */
public final class SemanticFieldConstants {
    /**
     * Name of the model id parameter. We use this key to define the id of the ML model that we will use for the
     * semantic field.
     */
    public static final String MODEL_ID = "model_id";

    /**
     * Name of the search model id parameter. We use this key to define the id of the ML model that we will use to
     * run inference on the query text during the search. If this parameter is not defined we will use the model_id
     * instead.
     */
    public static final String SEARCH_MODEL_ID = "search_model_id";

    /**
     * Name of the raw field type parameter. We use this key to define the field type for the raw data. It will control
     * how to store and query the raw data.
     */
    public static final String RAW_FIELD_TYPE = "raw_field_type";

    /**
     * Name of the semantic info field name parameter. We use this key to define a custom field name for the
     * semantic info.
     */
    public static final String SEMANTIC_INFO_FIELD_NAME = "semantic_info_field_name";

    private SemanticFieldConstants() {
        // Constants holder; no instances.
    }
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,276 @@
1+
/*
2+
* Copyright OpenSearch Contributors
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
package org.opensearch.neuralsearch.mapper;
6+
7+
import lombok.Getter;
8+
import lombok.NonNull;
9+
import lombok.Setter;
10+
import org.opensearch.core.xcontent.XContentBuilder;
11+
import org.opensearch.index.mapper.BinaryFieldMapper;
12+
import org.opensearch.index.mapper.FilterFieldType;
13+
import org.opensearch.index.mapper.KeywordFieldMapper;
14+
import org.opensearch.index.mapper.MappedFieldType;
15+
import org.opensearch.index.mapper.Mapper;
16+
import org.opensearch.index.mapper.MapperParsingException;
17+
import org.opensearch.index.mapper.MatchOnlyTextFieldMapper;
18+
import org.opensearch.index.mapper.ParametrizedFieldMapper;
19+
import org.opensearch.index.mapper.ParseContext;
20+
import org.opensearch.index.mapper.TextFieldMapper;
21+
import org.opensearch.index.mapper.TokenCountFieldMapper;
22+
import org.opensearch.index.mapper.WildcardFieldMapper;
23+
import org.opensearch.neuralsearch.constants.MappingConstants;
24+
import org.opensearch.neuralsearch.mapper.dto.SemanticParameters;
25+
26+
import java.io.IOException;
27+
import java.util.HashMap;
28+
import java.util.List;
29+
import java.util.Locale;
30+
import java.util.Map;
31+
import java.util.Set;
32+
33+
import static org.opensearch.neuralsearch.constants.SemanticFieldConstants.MODEL_ID;
34+
import static org.opensearch.neuralsearch.constants.SemanticFieldConstants.RAW_FIELD_TYPE;
35+
import static org.opensearch.neuralsearch.constants.SemanticFieldConstants.SEARCH_MODEL_ID;
36+
import static org.opensearch.neuralsearch.constants.SemanticFieldConstants.SEMANTIC_INFO_FIELD_NAME;
37+
38+
/**
39+
* FieldMapper for the semantic field. It will hold a delegate field mapper to delegate the data parsing and query work
40+
* based on the raw_field_type.
41+
*/
42+
public class SemanticFieldMapper extends ParametrizedFieldMapper {
43+
public static final String CONTENT_TYPE = "semantic";
44+
private final SemanticParameters semanticParameters;
45+
46+
@Setter
47+
@Getter
48+
private ParametrizedFieldMapper delegateFieldMapper;
49+
50+
protected SemanticFieldMapper(
51+
String simpleName,
52+
MappedFieldType mappedFieldType,
53+
MultiFields multiFields,
54+
CopyTo copyTo,
55+
ParametrizedFieldMapper delegateFieldMapper,
56+
SemanticParameters semanticParameters
57+
) {
58+
super(simpleName, mappedFieldType, multiFields, copyTo);
59+
this.delegateFieldMapper = delegateFieldMapper;
60+
this.semanticParameters = semanticParameters;
61+
}
62+
63+
@Override
64+
public Builder getMergeBuilder() {
65+
Builder semanticFieldMapperBuilder = (Builder) new Builder(simpleName()).init(this);
66+
ParametrizedFieldMapper.Builder delegateBuilder = delegateFieldMapper.getMergeBuilder();
67+
semanticFieldMapperBuilder.setDelegateBuilder(delegateBuilder);
68+
return semanticFieldMapperBuilder;
69+
}
70+
71+
@Override
72+
public final ParametrizedFieldMapper merge(Mapper mergeWith) {
73+
if (mergeWith instanceof SemanticFieldMapper) {
74+
try {
75+
delegateFieldMapper = delegateFieldMapper.merge(((SemanticFieldMapper) mergeWith).delegateFieldMapper);
76+
} catch (IllegalArgumentException e) {
77+
final String err = String.format(
78+
Locale.ROOT,
79+
"Failed to update the mapper %s because failed to update the delegate mapper for the raw_field_type %s due to %s",
80+
this.name(),
81+
this.semanticParameters.getRawFieldType(),
82+
e.getMessage()
83+
);
84+
throw new IllegalArgumentException(err, e);
85+
}
86+
}
87+
return super.merge(mergeWith);
88+
}
89+
90+
@Override
91+
protected void parseCreateField(ParseContext context) throws IOException {
92+
delegateFieldMapper.parse(context);
93+
}
94+
95+
@Override
96+
protected String contentType() {
97+
return CONTENT_TYPE;
98+
}
99+
100+
public static class Builder extends ParametrizedFieldMapper.Builder {
101+
@Getter
102+
protected final Parameter<String> modelId = Parameter.stringParam(
103+
MODEL_ID,
104+
true,
105+
m -> ((SemanticFieldMapper) m).semanticParameters.getModelId(),
106+
null
107+
);
108+
@Getter
109+
protected final Parameter<String> searchModelId = Parameter.stringParam(
110+
SEARCH_MODEL_ID,
111+
true,
112+
m -> ((SemanticFieldMapper) m).semanticParameters.getSearchModelId(),
113+
null
114+
);
115+
@Getter
116+
protected final Parameter<String> rawFieldType = Parameter.stringParam(
117+
RAW_FIELD_TYPE,
118+
false,
119+
m -> ((SemanticFieldMapper) m).semanticParameters.getRawFieldType(),
120+
TextFieldMapper.CONTENT_TYPE
121+
);
122+
@Getter
123+
protected final Parameter<String> semanticInfoFieldName = Parameter.stringParam(
124+
SEMANTIC_INFO_FIELD_NAME,
125+
false,
126+
m -> ((SemanticFieldMapper) m).semanticParameters.getSemanticInfoFieldName(),
127+
null
128+
);
129+
130+
@Setter
131+
protected ParametrizedFieldMapper.Builder delegateBuilder;
132+
133+
protected Builder(String name) {
134+
super(name);
135+
}
136+
137+
@Override
138+
protected List<Parameter<?>> getParameters() {
139+
return List.of(modelId, searchModelId, rawFieldType, semanticInfoFieldName);
140+
}
141+
142+
@Override
143+
public SemanticFieldMapper build(BuilderContext context) {
144+
final ParametrizedFieldMapper delegateMapper = delegateBuilder.build(context);
145+
146+
final SemanticParameters semanticParameters = this.getSemanticParameters();
147+
final MappedFieldType semanticFieldType = new SemanticFieldType(delegateMapper.fieldType(), semanticParameters);
148+
149+
return new SemanticFieldMapper(
150+
name,
151+
semanticFieldType,
152+
multiFieldsBuilder.build(this, context),
153+
copyTo.build(),
154+
delegateMapper,
155+
semanticParameters
156+
);
157+
}
158+
159+
public SemanticParameters getSemanticParameters() {
160+
return new SemanticParameters(
161+
modelId.getValue(),
162+
searchModelId.getValue(),
163+
rawFieldType.getValue(),
164+
semanticInfoFieldName.getValue()
165+
);
166+
}
167+
}
168+
169+
public static class TypeParser implements Mapper.TypeParser {
170+
171+
private final static Set<String> SUPPORTED_RAW_FIELD_TYPE = Set.of(
172+
TextFieldMapper.CONTENT_TYPE,
173+
KeywordFieldMapper.CONTENT_TYPE,
174+
MatchOnlyTextFieldMapper.CONTENT_TYPE,
175+
WildcardFieldMapper.CONTENT_TYPE,
176+
TokenCountFieldMapper.CONTENT_TYPE,
177+
BinaryFieldMapper.CONTENT_TYPE
178+
);
179+
180+
@Override
181+
public Builder parse(String name, Map<String, Object> node, ParserContext parserContext) throws MapperParsingException {
182+
final String rawFieldType = (String) node.getOrDefault(RAW_FIELD_TYPE, TextFieldMapper.CONTENT_TYPE);
183+
184+
validateRawFieldType(rawFieldType);
185+
186+
final ParametrizedFieldMapper.TypeParser typeParser = (ParametrizedFieldMapper.TypeParser) parserContext.typeParser(
187+
rawFieldType
188+
);
189+
final Builder semanticFieldMapperBuilder = new Builder(name);
190+
191+
// semantic field mapper builder parse semantic fields
192+
Map<String, Object> semanticConfig = extractSemanticConfig(node, semanticFieldMapperBuilder.getParameters(), rawFieldType);
193+
semanticFieldMapperBuilder.parse(name, parserContext, semanticConfig);
194+
195+
// delegate field mapper builder parse remaining fields
196+
ParametrizedFieldMapper.Builder delegateBuilder = typeParser.parse(name, node, parserContext);
197+
semanticFieldMapperBuilder.setDelegateBuilder(delegateBuilder);
198+
199+
return semanticFieldMapperBuilder;
200+
}
201+
202+
private void validateRawFieldType(final String rawFieldType) {
203+
if (rawFieldType == null || !SUPPORTED_RAW_FIELD_TYPE.contains(rawFieldType)) {
204+
final String err = String.format(
205+
Locale.ROOT,
206+
"raw_field_type %s is not supported. It should be one of [%s]",
207+
rawFieldType,
208+
String.join(", ", SUPPORTED_RAW_FIELD_TYPE)
209+
);
210+
throw new IllegalArgumentException(err);
211+
}
212+
}
213+
214+
/**
215+
* In this function we will extract all the parameters defined in the semantic field mapper builder and parse it
216+
* later. The remaining parameters will be processed by the type parser of the raw field type. Here we cannot
217+
* pass the parameters defined by semantic field to the delegate type parser of the raw field type because it
218+
* cannot recognize them.
219+
* @param node field config
220+
* @param parameters parameters for semantic field
221+
* @param rawFieldType field type of the raw data
222+
* @return semantic field config
223+
*/
224+
private Map<String, Object> extractSemanticConfig(Map<String, Object> node, List<Parameter<?>> parameters, String rawFieldType) {
225+
final Map<String, Object> semanticConfig = new HashMap<>();
226+
for (Parameter<?> parameter : parameters) {
227+
Object config = node.get(parameter.name);
228+
if (config != null) {
229+
semanticConfig.put(parameter.name, config);
230+
node.remove(parameter.name);
231+
}
232+
}
233+
semanticConfig.put(MappingConstants.TYPE, SemanticFieldMapper.CONTENT_TYPE);
234+
node.put(MappingConstants.TYPE, rawFieldType);
235+
return semanticConfig;
236+
}
237+
}
238+
239+
public static class SemanticFieldType extends FilterFieldType {
240+
@Getter
241+
private SemanticParameters semanticParameters;
242+
243+
public SemanticFieldType(@NonNull final MappedFieldType delegate, @NonNull final SemanticParameters semanticParameters) {
244+
super(delegate);
245+
this.semanticParameters = semanticParameters;
246+
}
247+
248+
@Override
249+
public String typeName() {
250+
return SemanticFieldMapper.CONTENT_TYPE;
251+
}
252+
}
253+
254+
@Override
255+
protected void doXContentBody(XContentBuilder builder, boolean includeDefaults, Params params) throws IOException {
256+
builder.field(MappingConstants.TYPE, contentType());
257+
258+
// semantic parameters
259+
final List<Parameter<?>> parameters = getMergeBuilder().getParameters();
260+
for (Parameter<?> parameter : parameters) {
261+
// By default, we will not return the default value. But raw_field_type is useful info to let users know how
262+
// we will handle the raw data. So we explicitly return it even it is using the default value.
263+
if (RAW_FIELD_TYPE.equals(parameter.name)) {
264+
parameter.toXContent(builder, true);
265+
} else {
266+
parameter.toXContent(builder, includeDefaults);
267+
}
268+
}
269+
270+
// non-semantic parameters
271+
// semantic field mapper itself does not handle multi fields or copy to. The delegate field mapper will handle it.
272+
delegateFieldMapper.multiFields().toXContent(builder, params);
273+
delegateFieldMapper.copyTo().toXContent(builder, params);
274+
delegateFieldMapper.getMergeBuilder().toXContent(builder, includeDefaults);
275+
}
276+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
/*
2+
* Copyright OpenSearch Contributors
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
package org.opensearch.neuralsearch.mapper.dto;
6+
7+
import lombok.Getter;
8+
9+
/**
10+
* A DTO to hold all the semantic parameters.
11+
*/
12+
@Getter
13+
public class SemanticParameters {
14+
private final String modelId;
15+
private final String searchModelId;
16+
private final String rawFieldType;
17+
private final String semanticInfoFieldName;
18+
19+
public SemanticParameters(String modelId, String searchModelId, String rawFieldType, String semanticInfoFieldName) {
20+
this.modelId = modelId;
21+
this.searchModelId = searchModelId;
22+
this.semanticInfoFieldName = semanticInfoFieldName;
23+
this.rawFieldType = rawFieldType;
24+
}
25+
}

0 commit comments

Comments
 (0)