Skip to content

Commit 65d170e

Browse files
committed
Synthetic source doc values arrays encoding experiment 2
WIP
1 parent ca82e43 commit 65d170e

File tree

10 files changed

+269
-21
lines changed

10 files changed

+269
-21
lines changed

rest-api-spec/build.gradle

+3
Original file line numberDiff line numberDiff line change
@@ -98,4 +98,7 @@ tasks.named("yamlRestCompatTestTransform").configure ({ task ->
9898
task.skipTest("index/91_metrics_no_subobjects/Metrics object indexing with synthetic source", "_source.mode mapping attribute is no-op since 9.0.0")
9999
task.skipTest("index/91_metrics_no_subobjects/Root without subobjects with synthetic source", "_source.mode mapping attribute is no-op since 9.0.0")
100100
task.skipTest("logsdb/10_settings/routing path allowed in logs mode with routing on sort fields", "Unknown feature routing.logsb_route_on_sort_fields")
101+
task.skipTest("indices.create/21_synthetic_source_stored/index param - field ordering", "Synthetic source keep arrays now stores leaf arrays natively")
102+
task.skipTest("indices.create/21_synthetic_source_stored/field param - keep nested array", "Synthetic source keep arrays now stores leaf arrays natively")
103+
task.skipTest("indices.create/21_synthetic_source_stored/field param - keep root array", "Synthetic source keep arrays now stores leaf arrays natively")
101104
})

rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/indices.create/21_synthetic_source_stored.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1024,7 +1024,7 @@ index param - field ordering:
10241024
index: test
10251025

10261026
- length: { hits.hits.0._source: 4 }
1027-
- match: { hits.hits.0._source: { "a": "2", "b": [ { "bb": 100, "aa": 200 }, { "aa": 300, "bb": 400 } ], "c": [30, 20, 10], "d": [ { "bb": 10, "aa": 20 }, { "aa": 30, "bb": 40 } ] } }
1027+
- match: { hits.hits.0._source: { "a": "2", "b": [ { "bb": 100, "aa": 200 }, { "aa": 300, "bb": 400 } ], "c": ["30", "20", "10"], "d": [ { "bb": 10, "aa": 20 }, { "aa": 30, "bb": 40 } ] } }
10281028

10291029

10301030
---

server/src/main/java/org/elasticsearch/index/mapper/DocumentParser.java

+49-3
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,12 @@
99

1010
package org.elasticsearch.index.mapper;
1111

12+
import org.apache.lucene.document.BinaryDocValuesField;
1213
import org.apache.lucene.index.IndexableField;
1314
import org.apache.lucene.index.LeafReaderContext;
1415
import org.apache.lucene.search.Query;
1516
import org.elasticsearch.common.Explicit;
17+
import org.elasticsearch.common.io.stream.BytesStreamOutput;
1618
import org.elasticsearch.common.regex.Regex;
1719
import org.elasticsearch.common.xcontent.XContentHelper;
1820
import org.elasticsearch.core.Nullable;
@@ -24,6 +26,8 @@
2426
import org.elasticsearch.index.mapper.vectors.DenseVectorFieldMapper;
2527
import org.elasticsearch.index.query.SearchExecutionContext;
2628
import org.elasticsearch.indices.breaker.NoneCircuitBreakerService;
29+
import org.elasticsearch.logging.LogManager;
30+
import org.elasticsearch.logging.Logger;
2731
import org.elasticsearch.plugins.internal.XContentMeteringParserDecorator;
2832
import org.elasticsearch.search.lookup.SearchLookup;
2933
import org.elasticsearch.search.lookup.Source;
@@ -36,6 +40,7 @@
3640

3741
import java.io.IOException;
3842
import java.util.ArrayList;
43+
import java.util.Arrays;
3944
import java.util.Collections;
4045
import java.util.HashMap;
4146
import java.util.Iterator;
@@ -53,6 +58,8 @@
5358
*/
5459
public final class DocumentParser {
5560

61+
private static final Logger LOGGER = LogManager.getLogger(DocumentParser.class);
62+
5663
public static final IndexVersion DYNAMICALLY_MAP_DENSE_VECTORS_INDEX_VERSION = IndexVersions.FIRST_DETACHED_INDEX_VERSION;
5764
static final NodeFeature FIX_PARSING_SUBOBJECTS_FALSE_DYNAMIC_FALSE = new NodeFeature(
5865
"mapper.fix_parsing_subobjects_false_dynamic_false"
@@ -148,7 +155,7 @@ private void internalParseDocument(MetadataFieldMapper[] metadataFieldsMappers,
148155
}
149156

150157
executeIndexTimeScripts(context);
151-
158+
processArrayOffsets(context);
152159
for (MetadataFieldMapper metadataMapper : metadataFieldsMappers) {
153160
metadataMapper.postParse(context);
154161
}
@@ -157,6 +164,41 @@ private void internalParseDocument(MetadataFieldMapper[] metadataFieldsMappers,
157164
}
158165
}
159166

167+
private static void processArrayOffsets(DocumentParserContext context) throws IOException {
168+
var offsets = context.getOffSetsByField();
169+
for (var entry : offsets.entrySet()) {
170+
var fieldName = entry.getKey();
171+
var offset = entry.getValue();
172+
if (offset.valueToOffsets.isEmpty()) {
173+
continue;
174+
}
175+
176+
if (offset.currentOffset == 1 && offset.inArray == false) {
177+
continue;
178+
}
179+
180+
int ord = 0;
181+
int[] offsetToOrd = new int[offset.currentOffset];
182+
for (var offsetEntry : offset.valueToOffsets.entrySet()) {
183+
for (var offsetAndLevel : offsetEntry.getValue()) {
184+
offsetToOrd[offsetAndLevel] = ord;
185+
}
186+
ord++;
187+
}
188+
189+
// TODO: remove later
190+
LOGGER.info("values=" + offset.valueToOffsets);
191+
LOGGER.info("offsetToOrd=" + Arrays.toString(offsetToOrd));
192+
193+
try (var streamOutput = new BytesStreamOutput()) {
194+
// TODO: optimize
195+
// This array allows to retain the original ordering of the leaf array and duplicate values.
196+
streamOutput.writeVIntArray(offsetToOrd);
197+
context.doc().add(new BinaryDocValuesField(fieldName, streamOutput.bytes().toBytesRef()));
198+
}
199+
}
200+
}
201+
160202
private static void executeIndexTimeScripts(DocumentParserContext context) {
161203
List<FieldMapper> indexTimeScriptMappers = context.mappingLookup().indexTimeScriptMappers();
162204
if (indexTimeScriptMappers.isEmpty()) {
@@ -687,7 +729,7 @@ private static void parseNonDynamicArray(
687729

688730
// Check if we need to record the array source. This only applies to synthetic source.
689731
boolean canRemoveSingleLeafElement = false;
690-
if (context.canAddIgnoredField()) {
732+
if (context.canAddIgnoredField() && (mapper != null && mapper.supportsStoringArraysNatively() == false)) {
691733
Mapper.SourceKeepMode mode = Mapper.SourceKeepMode.NONE;
692734
boolean objectWithFallbackSyntheticSource = false;
693735
if (mapper instanceof ObjectMapper objectMapper) {
@@ -725,10 +767,13 @@ private static void parseNonDynamicArray(
725767
// In synthetic source, if any array element requires storing its source as-is, it takes precedence over
726768
// elements from regular source loading that are then skipped from the synthesized array source.
727769
// To prevent this, we track that parsing sub-context is within array scope.
728-
context = context.maybeCloneForArray(mapper);
770+
if (mapper != null && mapper.supportsStoringArraysNatively() == false) {
771+
context = context.maybeCloneForArray(mapper);
772+
}
729773

730774
XContentParser parser = context.parser();
731775
XContentParser.Token token;
776+
context.setInArray(true);
732777
while ((token = parser.nextToken()) != XContentParser.Token.END_ARRAY) {
733778
if (token == XContentParser.Token.START_OBJECT) {
734779
parseObject(context, lastFieldName);
@@ -743,6 +788,7 @@ private static void parseNonDynamicArray(
743788
parseValue(context, lastFieldName);
744789
}
745790
}
791+
context.setInArray(false);
746792
postProcessDynamicArrayMapping(context, lastFieldName);
747793
}
748794

server/src/main/java/org/elasticsearch/index/mapper/DocumentParserContext.java

+43
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
import java.util.List;
3434
import java.util.Map;
3535
import java.util.Set;
36+
import java.util.TreeMap;
3637

3738
/**
3839
* Context used when parsing incoming documents. Holds everything that is needed to parse a document as well as
@@ -84,6 +85,21 @@ public LuceneDocument doc() {
8485
protected void addDoc(LuceneDocument doc) {
8586
in.addDoc(doc);
8687
}
88+
89+
@Override
90+
public Map<String, Offsets> getOffSetsByField() {
91+
return in.getOffSetsByField();
92+
}
93+
94+
@Override
95+
void recordOffset(String field, String value) {
96+
in.recordOffset(field, value);
97+
}
98+
99+
@Override
100+
public void setInArray(boolean inArray) {
101+
in.setInArray(inArray);
102+
}
87103
}
88104

89105
/**
@@ -134,6 +150,9 @@ private enum Scope {
134150
private final SeqNoFieldMapper.SequenceIDFields seqID;
135151
private final Set<String> fieldsAppliedFromTemplates;
136152

153+
private final Map<String, Offsets> offsetsPerField = new HashMap<>();
154+
private boolean inArray;
155+
137156
/**
138157
* Fields that are copied from values of other fields via copy_to.
139158
* This per-document state is needed since it is possible
@@ -470,6 +489,30 @@ public Set<String> getCopyToFields() {
470489
return copyToFields;
471490
}
472491

492+
public static class Offsets {
493+
494+
public int currentOffset;
495+
public boolean inArray;
496+
public final Map<String, List<Integer>> valueToOffsets = new TreeMap<>();
497+
498+
}
499+
500+
public Map<String, Offsets> getOffSetsByField() {
501+
return offsetsPerField;
502+
}
503+
504+
void recordOffset(String field, String value) {
505+
Offsets arrayOffsets = offsetsPerField.computeIfAbsent(field, k -> new Offsets());
506+
int nextOffset = arrayOffsets.currentOffset++;
507+
var offsets = arrayOffsets.valueToOffsets.computeIfAbsent(value, s -> new ArrayList<>());
508+
offsets.add(nextOffset);
509+
arrayOffsets.inArray = inArray;
510+
}
511+
512+
public void setInArray(boolean inArray) {
513+
this.inArray = inArray;
514+
}
515+
473516
/**
474517
* Add a new mapper dynamically created while parsing.
475518
*

server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java

+52-9
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,7 @@ public final class KeywordFieldMapper extends FieldMapper {
8787
private static final Logger logger = LogManager.getLogger(KeywordFieldMapper.class);
8888

8989
public static final String CONTENT_TYPE = "keyword";
90+
public static final String OFFSETS_FIELD_NAME_SUFFIX = ".offsets";
9091

9192
public static class Defaults {
9293
public static final FieldType FIELD_TYPE;
@@ -182,14 +183,16 @@ public static final class Builder extends FieldMapper.DimensionBuilder {
182183
private final IndexAnalyzers indexAnalyzers;
183184
private final ScriptCompiler scriptCompiler;
184185
private final IndexVersion indexCreatedVersion;
186+
private final SourceKeepMode indexSourceKeepMode;
185187

186188
public Builder(final String name, final MappingParserContext mappingParserContext) {
187189
this(
188190
name,
189191
mappingParserContext.getIndexAnalyzers(),
190192
mappingParserContext.scriptCompiler(),
191193
IGNORE_ABOVE_SETTING.get(mappingParserContext.getSettings()),
192-
mappingParserContext.getIndexSettings().getIndexVersionCreated()
194+
mappingParserContext.getIndexSettings().getIndexVersionCreated(),
195+
mappingParserContext.getIndexSettings().sourceKeepMode()
193196
);
194197
}
195198

@@ -198,7 +201,8 @@ public Builder(final String name, final MappingParserContext mappingParserContex
198201
IndexAnalyzers indexAnalyzers,
199202
ScriptCompiler scriptCompiler,
200203
int ignoreAboveDefault,
201-
IndexVersion indexCreatedVersion
204+
IndexVersion indexCreatedVersion,
205+
SourceKeepMode indexSourceKeepMode
202206
) {
203207
super(name);
204208
this.indexAnalyzers = indexAnalyzers;
@@ -233,10 +237,11 @@ public Builder(final String name, final MappingParserContext mappingParserContex
233237
throw new IllegalArgumentException("[ignore_above] must be positive, got [" + v + "]");
234238
}
235239
});
240+
this.indexSourceKeepMode = indexSourceKeepMode;
236241
}
237242

238243
public Builder(String name, IndexVersion indexCreatedVersion) {
239-
this(name, null, ScriptCompiler.NONE, Integer.MAX_VALUE, indexCreatedVersion);
244+
this(name, null, ScriptCompiler.NONE, Integer.MAX_VALUE, indexCreatedVersion, SourceKeepMode.NONE);
240245
}
241246

242247
public Builder ignoreAbove(int ignoreAbove) {
@@ -370,13 +375,36 @@ public KeywordFieldMapper build(MapperBuilderContext context) {
370375
}
371376
super.hasScript = script.get() != null;
372377
super.onScriptError = onScriptError.getValue();
378+
379+
var sourceKeepMode = this.sourceKeepMode.orElse(indexSourceKeepMode);
380+
BinaryFieldMapper offsetsFieldMapper;
381+
if (context.isSourceSynthetic()
382+
&& sourceKeepMode == SourceKeepMode.ARRAYS
383+
&& fieldtype.stored() == false
384+
&& copyTo.copyToFields().isEmpty()
385+
&& multiFieldsBuilder.hasMultiFields() == false) {
386+
// Skip stored, we will be synthesizing from stored fields, no point to keep track of the offsets
387+
// Skip copy_to, supporting that requires more work. However, copy_to usage is rare in metrics and logging use cases
388+
389+
// keep track of value offsets so that we can reconstruct arrays from doc values in order as was specified during indexing
390+
// (if field is stored then there is no point of doing this)
391+
offsetsFieldMapper = new BinaryFieldMapper.Builder(
392+
context.buildFullName(leafName() + OFFSETS_FIELD_NAME_SUFFIX),
393+
context.isSourceSynthetic()
394+
).docValues(true).build(context);
395+
} else {
396+
offsetsFieldMapper = null;
397+
}
398+
373399
return new KeywordFieldMapper(
374400
leafName(),
375401
fieldtype,
376402
buildFieldType(context, fieldtype),
377403
builderParams(this, context),
378404
context.isSourceSynthetic(),
379-
this
405+
this,
406+
offsetsFieldMapper,
407+
indexSourceKeepMode
380408
);
381409
}
382410
}
@@ -867,14 +895,18 @@ public boolean hasNormalizer() {
867895
private final IndexAnalyzers indexAnalyzers;
868896
private final int ignoreAboveDefault;
869897
private final int ignoreAbove;
898+
private final BinaryFieldMapper offsetsFieldMapper;
899+
private final SourceKeepMode indexSourceKeepMode;
870900

871901
private KeywordFieldMapper(
872902
String simpleName,
873903
FieldType fieldType,
874904
KeywordFieldType mappedFieldType,
875905
BuilderParams builderParams,
876906
boolean isSyntheticSource,
877-
Builder builder
907+
Builder builder,
908+
BinaryFieldMapper offsetsFieldMapper,
909+
SourceKeepMode indexSourceKeepMode
878910
) {
879911
super(simpleName, mappedFieldType, builderParams);
880912
assert fieldType.indexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) <= 0;
@@ -891,6 +923,8 @@ private KeywordFieldMapper(
891923
this.isSyntheticSource = isSyntheticSource;
892924
this.ignoreAboveDefault = builder.ignoreAboveDefault;
893925
this.ignoreAbove = builder.ignoreAbove.getValue();
926+
this.offsetsFieldMapper = offsetsFieldMapper;
927+
this.indexSourceKeepMode = indexSourceKeepMode;
894928
}
895929

896930
@Override
@@ -967,6 +1001,9 @@ private void indexValue(DocumentParserContext context, String value) {
9671001
if (fieldType().hasDocValues() == false && fieldType.omitNorms()) {
9681002
context.addToFieldNames(fieldType().name());
9691003
}
1004+
if (offsetsFieldMapper != null) {
1005+
context.recordOffset(offsetsFieldMapper.fullPath(), value);
1006+
}
9701007
}
9711008

9721009
private static String normalizeValue(NamedAnalyzer normalizer, String field, String value) {
@@ -1008,9 +1045,9 @@ public Map<String, NamedAnalyzer> indexAnalyzers() {
10081045

10091046
@Override
10101047
public FieldMapper.Builder getMergeBuilder() {
1011-
return new Builder(leafName(), indexAnalyzers, scriptCompiler, ignoreAboveDefault, indexCreatedVersion).dimension(
1012-
fieldType().isDimension()
1013-
).init(this);
1048+
return new Builder(leafName(), indexAnalyzers, scriptCompiler, ignoreAboveDefault, indexCreatedVersion, indexSourceKeepMode)
1049+
.dimension(fieldType().isDimension())
1050+
.init(this);
10141051
}
10151052

10161053
@Override
@@ -1063,7 +1100,8 @@ protected void writeValue(Object value, XContentBuilder b) throws IOException {
10631100
}
10641101
});
10651102
} else if (hasDocValues) {
1066-
layers.add(new SortedSetDocValuesSyntheticFieldLoaderLayer(fullPath()) {
1103+
String offsetsFullPath = offsetsFieldMapper != null ? offsetsFieldMapper.fullPath() : null;
1104+
layers.add(new SortedSetDocValuesSyntheticFieldLoaderLayer(fullPath(), offsetsFullPath) {
10671105

10681106
@Override
10691107
protected BytesRef convert(BytesRef value) {
@@ -1090,4 +1128,9 @@ protected void writeValue(Object value, XContentBuilder b) throws IOException {
10901128

10911129
return new CompositeSyntheticFieldLoader(leafFieldName, fullFieldName, layers);
10921130
}
1131+
1132+
@Override
1133+
public boolean supportsStoringArraysNatively() {
1134+
return offsetsFieldMapper != null;
1135+
}
10931136
}

server/src/main/java/org/elasticsearch/index/mapper/Mapper.java

+4
Original file line numberDiff line numberDiff line change
@@ -212,4 +212,8 @@ public static FieldType freezeAndDeduplicateFieldType(FieldType fieldType) {
212212
* Defines how this mapper counts towards {@link MapperService#INDEX_MAPPING_TOTAL_FIELDS_LIMIT_SETTING}.
213213
*/
214214
public abstract int getTotalFieldsCount();
215+
216+
public boolean supportsStoringArraysNatively() {
217+
return false;
218+
}
215219
}

0 commit comments

Comments
 (0)