-
Notifications
You must be signed in to change notification settings - Fork 25.2k
lucene_snapshot: Update to new Lucene 10.3 postings format #128240
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: lucene_snapshot
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,131 @@ | ||
/* | ||
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one | ||
* or more contributor license agreements. Licensed under the "Elastic License | ||
* 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side | ||
* Public License v 1"; you may not use this file except in compliance with, at | ||
* your election, the "Elastic License 2.0", the "GNU Affero General Public | ||
* License v3.0 only", or the "Server Side Public License, v 1". | ||
*/ | ||
|
||
package org.elasticsearch.index.codec; | ||
|
||
import org.apache.lucene.codecs.DocValuesFormat; | ||
import org.apache.lucene.codecs.KnnVectorsFormat; | ||
import org.apache.lucene.codecs.PostingsFormat; | ||
import org.apache.lucene.codecs.StoredFieldsFormat; | ||
import org.apache.lucene.codecs.lucene103.Lucene103Codec; | ||
import org.apache.lucene.codecs.lucene103.Lucene103PostingsFormat; | ||
import org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat; | ||
import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat; | ||
import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat; | ||
import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat; | ||
import org.elasticsearch.index.codec.perfield.XPerFieldDocValuesFormat; | ||
import org.elasticsearch.index.codec.zstd.Zstd814StoredFieldsFormat; | ||
|
||
/** | ||
* Elasticsearch codec as of 9.2 relying on Lucene 10.3. This extends the Lucene 10.3 codec to compress | ||
* stored fields with ZSTD instead of LZ4/DEFLATE. See {@link Zstd814StoredFieldsFormat}. | ||
*/ | ||
public class Elasticsearch902Lucene103Codec extends CodecService.DeduplicateFieldInfosCodec { | ||
|
||
private final StoredFieldsFormat storedFieldsFormat; | ||
|
||
private final PostingsFormat defaultPostingsFormat; | ||
private final PostingsFormat postingsFormat = new PerFieldPostingsFormat() { | ||
@Override | ||
public PostingsFormat getPostingsFormatForField(String field) { | ||
return Elasticsearch902Lucene103Codec.this.getPostingsFormatForField(field); | ||
} | ||
}; | ||
|
||
private final DocValuesFormat defaultDVFormat; | ||
private final DocValuesFormat docValuesFormat = new XPerFieldDocValuesFormat() { | ||
@Override | ||
public DocValuesFormat getDocValuesFormatForField(String field) { | ||
return Elasticsearch902Lucene103Codec.this.getDocValuesFormatForField(field); | ||
} | ||
}; | ||
|
||
private final KnnVectorsFormat defaultKnnVectorsFormat; | ||
private final KnnVectorsFormat knnVectorsFormat = new PerFieldKnnVectorsFormat() { | ||
@Override | ||
public KnnVectorsFormat getKnnVectorsFormatForField(String field) { | ||
return Elasticsearch902Lucene103Codec.this.getKnnVectorsFormatForField(field); | ||
} | ||
}; | ||
|
||
/** Public no-arg constructor, needed for SPI loading at read-time. */ | ||
public Elasticsearch902Lucene103Codec() { | ||
this(Zstd814StoredFieldsFormat.Mode.BEST_SPEED); | ||
} | ||
|
||
/** | ||
* Constructor. Takes a {@link Zstd814StoredFieldsFormat.Mode} that describes whether to optimize for retrieval speed at the expense of | ||
* worse space-efficiency or vice-versa. | ||
*/ | ||
public Elasticsearch902Lucene103Codec(Zstd814StoredFieldsFormat.Mode mode) { | ||
super("Elasticsearch902Lucene103", new Lucene103Codec()); | ||
this.storedFieldsFormat = mode.getFormat(); | ||
this.defaultPostingsFormat = new Lucene103PostingsFormat(); | ||
this.defaultDVFormat = new Lucene90DocValuesFormat(); | ||
this.defaultKnnVectorsFormat = new Lucene99HnswVectorsFormat(); | ||
} | ||
|
||
@Override | ||
public StoredFieldsFormat storedFieldsFormat() { | ||
return storedFieldsFormat; | ||
} | ||
|
||
@Override | ||
public final PostingsFormat postingsFormat() { | ||
return postingsFormat; | ||
} | ||
|
||
@Override | ||
public final DocValuesFormat docValuesFormat() { | ||
return docValuesFormat; | ||
} | ||
|
||
@Override | ||
public final KnnVectorsFormat knnVectorsFormat() { | ||
return knnVectorsFormat; | ||
} | ||
|
||
/** | ||
* Returns the postings format that should be used for writing new segments of <code>field</code>. | ||
* | ||
* <p>The default implementation always returns a {@link Lucene103PostingsFormat}. | ||
* | ||
* <p><b>WARNING:</b> if you subclass, you are responsible for index backwards compatibility: | ||
* future versions of Lucene are only guaranteed to be able to read the default implementation. | ||
*/ | ||
public PostingsFormat getPostingsFormatForField(String field) { | ||
return defaultPostingsFormat; | ||
} | ||
|
||
/** | ||
* Returns the docvalues format that should be used for writing new segments of <code>field</code> | ||
* . | ||
* | ||
* <p>The default implementation always returns a {@link Lucene90DocValuesFormat}. | ||
* | ||
* <p><b>WARNING:</b> if you subclass, you are responsible for index backwards compatibility: | ||
* future versions of Lucene are only guaranteed to be able to read the default implementation. | ||
*/ | ||
public DocValuesFormat getDocValuesFormatForField(String field) { | ||
return defaultDVFormat; | ||
} | ||
|
||
/** | ||
* Returns the vectors format that should be used for writing new segments of <code>field</code> | ||
* | ||
* <p>The default implementation always returns a {@link Lucene99HnswVectorsFormat}. | ||
* | ||
* <p><b>WARNING:</b> if you subclass, you are responsible for index backwards compatibility: | ||
* future versions of Lucene are only guaranteed to be able to read the default implementation. | ||
*/ | ||
public KnnVectorsFormat getKnnVectorsFormatForField(String field) { | ||
return defaultKnnVectorsFormat; | ||
} | ||
|
||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,10 +9,11 @@ | |
|
||
package org.elasticsearch.index.codec; | ||
|
||
import org.apache.lucene.backward_codecs.lucene101.Lucene101PostingsFormat; | ||
import org.apache.lucene.codecs.DocValuesFormat; | ||
import org.apache.lucene.codecs.KnnVectorsFormat; | ||
import org.apache.lucene.codecs.PostingsFormat; | ||
import org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat; | ||
import org.apache.lucene.codecs.lucene103.Lucene103PostingsFormat; | ||
import org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat; | ||
import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat; | ||
import org.elasticsearch.common.util.BigArrays; | ||
|
@@ -35,12 +36,12 @@ | |
*/ | ||
public class PerFieldFormatSupplier { | ||
public static final FeatureFlag USE_LUCENE101_POSTINGS_FORMAT = new FeatureFlag("use_lucene101_postings_format"); | ||
public static final FeatureFlag USE_LUCENE103_POSTINGS_FORMAT = new FeatureFlag("use_lucene103_postings_format"); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This will be removed once #128236 is merged. |
||
|
||
private static final DocValuesFormat docValuesFormat = new Lucene90DocValuesFormat(); | ||
private static final KnnVectorsFormat knnVectorsFormat = new Lucene99HnswVectorsFormat(); | ||
private static final ES819TSDBDocValuesFormat tsdbDocValuesFormat = new ES819TSDBDocValuesFormat(); | ||
private static final ES812PostingsFormat es812PostingsFormat = new ES812PostingsFormat(); | ||
private static final Lucene101PostingsFormat lucene101PostingsFormat = new Lucene101PostingsFormat(); | ||
private static final PostingsFormat completionPostingsFormat = PostingsFormat.forName("Completion101"); | ||
|
||
private final ES87BloomFilterPostingsFormat bloomFilterPostingsFormat; | ||
|
@@ -53,14 +54,19 @@ public PerFieldFormatSupplier(MapperService mapperService, BigArrays bigArrays) | |
this.bloomFilterPostingsFormat = new ES87BloomFilterPostingsFormat(bigArrays, this::internalGetPostingsFormatForField); | ||
|
||
if (mapperService != null | ||
&& USE_LUCENE103_POSTINGS_FORMAT.isEnabled() | ||
&& mapperService.getIndexSettings().getIndexVersionCreated().onOrAfter(IndexVersions.UPGRADE_TO_LUCENE_10_3_0) | ||
&& mapperService.getIndexSettings().getMode() == IndexMode.STANDARD) { | ||
defaultPostingsFormat = new Lucene103PostingsFormat(); | ||
} else if (mapperService != null | ||
&& USE_LUCENE101_POSTINGS_FORMAT.isEnabled() | ||
&& mapperService.getIndexSettings().getIndexVersionCreated().onOrAfter(IndexVersions.USE_LUCENE101_POSTINGS_FORMAT) | ||
&& mapperService.getIndexSettings().getMode() == IndexMode.STANDARD) { | ||
defaultPostingsFormat = lucene101PostingsFormat; | ||
} else { | ||
// our own posting format using PFOR | ||
defaultPostingsFormat = es812PostingsFormat; | ||
} | ||
defaultPostingsFormat = new Lucene101PostingsFormat(); | ||
} else { | ||
// our own posting format using PFOR | ||
defaultPostingsFormat = es812PostingsFormat; | ||
} | ||
} | ||
|
||
public PostingsFormat getPostingsFormatForField(String field) { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
/* | ||
* @notice | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
* | ||
* Modifications copyright (C) 2025 Elasticsearch B.V. | ||
*/ | ||
package org.elasticsearch.index.codec.postings; | ||
|
||
import org.apache.lucene.store.DataInput; | ||
import org.apache.lucene.util.compress.LowercaseAsciiCompression; | ||
|
||
import java.io.IOException; | ||
|
||
/** Compression algorithm used for suffixes of a block of terms. */ | ||
public enum CompressionAlgorithm { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This class is somehow surprising; what makes it required to fork on our end? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Same as above. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Let's adjust the access modifier in Lucene to avoid this needless copying - apache/lucene#14695 |
||
NO_COMPRESSION(0x00) { | ||
|
||
@Override | ||
void read(DataInput in, byte[] out, int len) throws IOException { | ||
in.readBytes(out, 0, len); | ||
} | ||
}, | ||
|
||
LOWERCASE_ASCII(0x01) { | ||
|
||
@Override | ||
void read(DataInput in, byte[] out, int len) throws IOException { | ||
LowercaseAsciiCompression.decompress(in, out, len); | ||
} | ||
}, | ||
|
||
LZ4(0x02) { | ||
|
||
@Override | ||
void read(DataInput in, byte[] out, int len) throws IOException { | ||
org.apache.lucene.util.compress.LZ4.decompress(in, len, out, 0); | ||
} | ||
}; | ||
|
||
private static final CompressionAlgorithm[] BY_CODE = new CompressionAlgorithm[3]; | ||
|
||
static { | ||
for (CompressionAlgorithm alg : CompressionAlgorithm.values()) { | ||
BY_CODE[alg.code] = alg; | ||
} | ||
} | ||
|
||
/** Look up a {@link CompressionAlgorithm} by its {@link CompressionAlgorithm#code}. */ | ||
static CompressionAlgorithm byCode(int code) { | ||
if (code < 0 || code >= BY_CODE.length) { | ||
throw new IllegalArgumentException("Illegal code for a compression algorithm: " + code); | ||
} | ||
return BY_CODE[code]; | ||
} | ||
|
||
public final int code; | ||
|
||
CompressionAlgorithm(int code) { | ||
this.code = code; | ||
} | ||
|
||
abstract void read(DataInput in, byte[] out, int len) throws IOException; | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
that makes me wonder why I used 900 instead of 90 back when I added it in the first place. 9.2.0 could have been 92 as well and perhaps 902 makes it look like a bugfix release which it is not.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Oh good point. So this should be
Elasticsearch92Lucene103Codec
, then, right? I'll update it. There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think it would be better yea. pardon the inconsistency with 900 that we will have to live with.