Skip to content

[Optimization] Move bloom filter init logic outside of the FuzzyFilteredFieldsProducer constructor #17200

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
/**
* Encapsulates common behaviour implementation for a fuzzy set.
*/
public abstract class AbstractFuzzySet implements FuzzySet {
public abstract class AbstractFuzzySet<T extends FuzzySet.Meta> implements FuzzySet<T> {

/**
* Add an item to this fuzzy set.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
* Since the Lucene implementation is marked experimental,
* this aims to ensure we can provide a bwc implementation during upgrades.
*/
public class BloomFilter extends AbstractFuzzySet {
public class BloomFilter extends AbstractFuzzySet<BloomFilter.BloomMeta> {

private static final Logger logger = LogManager.getLogger(BloomFilter.class);

Expand Down Expand Up @@ -77,6 +77,13 @@ public class BloomFilter extends AbstractFuzzySet {
this.bitset = new LongArrayBackedBitSet(in);
}

BloomFilter(FuzzySet.Meta inMeta) throws IOException {
BloomMeta meta = (BloomMeta) inMeta;
hashCount = meta.hashCount;
setSize = meta.setSize;
bitset = new LongArrayBackedBitSet(meta.bitSetMeta);
}

@Override
public void writeTo(DataOutput out) throws IOException {
out.writeInt(hashCount);
Expand All @@ -101,7 +108,7 @@ public SetType setType() {
}

@Override
public Result containsHash(long hash) {
protected Result containsHash(long hash) {
int msb = (int) (hash >>> Integer.SIZE);
int lsb = (int) hash;
for (int i = 0; i < hashCount; i++) {
Expand Down Expand Up @@ -147,4 +154,16 @@ private boolean mayContainValue(int aHash) {
public void close() throws IOException {
IOUtils.close(bitset);
}

/**
 * Metadata of a serialized {@link BloomFilter}, captured from an {@link IndexInput}
 * so that the filter itself can be constructed later.
 */
static class BloomMeta implements FuzzySet.Meta {
    /** First int read from the input. */
    int hashCount;
    /** Second int read from the input. */
    int setSize;
    /** Metadata of the bit set that follows the two ints in the input. */
    LongArrayBackedBitSet.LongArrayBackedBitSetMeta bitSetMeta;

    /**
     * Reads the hash count, the set size and then the bit set metadata, in that order.
     *
     * @param in input positioned at the start of the serialized filter
     * @throws IOException if reading from {@code in} fails
     */
    BloomMeta(IndexInput in) throws IOException {
        hashCount = in.readInt();
        setSize = in.readInt();
        this.bitSetMeta = new LongArrayBackedBitSet.LongArrayBackedBitSetMeta(in);
    }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.opensearch.common.CheckedSupplier;
import org.opensearch.common.util.io.IOUtils;

import java.io.Closeable;
Expand Down Expand Up @@ -103,7 +104,7 @@ public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException

static class FuzzyFilteredFieldsProducer extends FieldsProducer {
private FieldsProducer delegateFieldsProducer;
HashMap<String, FuzzySet> fuzzySetsByFieldName = new HashMap<>();
HashMap<String, CheckedSupplier<FuzzySet, IOException>> fuzzySetsByFieldName = new HashMap<>();
private List<Closeable> closeables = new ArrayList<>();

public FuzzyFilteredFieldsProducer(SegmentReadState state) throws IOException {
Expand Down Expand Up @@ -132,10 +133,9 @@ public FuzzyFilteredFieldsProducer(SegmentReadState state) throws IOException {
int numFilters = filterIn.readInt();
for (int i = 0; i < numFilters; i++) {
int fieldNum = filterIn.readInt();
FuzzySet set = FuzzySetFactory.deserializeFuzzySet(filterIn);
closeables.add(set);
CheckedSupplier<FuzzySet, IOException> setBuilder = FuzzySetFactory.buildSetProvider(filterIn);
FieldInfo fieldInfo = state.fieldInfos.fieldInfo(fieldNum);
fuzzySetsByFieldName.put(fieldInfo.name, set);
fuzzySetsByFieldName.put(fieldInfo.name, setBuilder);
}
CodecUtil.retrieveChecksum(filterIn);

Expand Down Expand Up @@ -164,7 +164,10 @@ public void close() throws IOException {

@Override
public Terms terms(String field) throws IOException {
FuzzySet filter = fuzzySetsByFieldName.get(field);
FuzzySet filter = null;
if (fuzzySetsByFieldName.get(field) != null) {
filter = fuzzySetsByFieldName.get(field).get();
}
if (filter == null) {
return delegateFieldsProducer.terms(field);
} else {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,17 @@
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.BytesRef;
import org.opensearch.common.CheckedFunction;
import org.opensearch.common.CheckedSupplier;

import java.io.Closeable;
import java.io.IOException;
import java.util.List;
import java.util.function.Supplier;

/**
* Fuzzy Filter interface
*/
public interface FuzzySet extends Accountable, Closeable {
public interface FuzzySet<T extends FuzzySet.Meta> extends Accountable, Closeable {

/**
* Name used for a codec to be aware of what fuzzy set has been used.
Expand Down Expand Up @@ -58,32 +60,34 @@ enum Result {
* Enum to declare supported properties and mappings for a fuzzy set implementation.
*/
enum SetType {
BLOOM_FILTER_V1("bloom_filter_v1", BloomFilter::new, List.of("bloom_filter"));
BLOOM_FILTER_V1("bloom_filter_v1", List.of("bloom_filter"), (in) -> {
BloomFilter.BloomMeta meta = new BloomFilter.BloomMeta(in);
return () -> new BloomFilter(meta);
});

/**
* Name persisted in postings file. This will be used when reading to determine the bloom filter implementation.
*/
private final String setName;

/**
* Interface for reading the actual fuzzy set implementation into java object.
*/
private final CheckedFunction<IndexInput, ? extends FuzzySet, IOException> deserializer;
private final CheckedFunction<IndexInput, CheckedSupplier<? extends FuzzySet, IOException>, IOException> metaExtractor;

SetType(String setName, CheckedFunction<IndexInput, ? extends FuzzySet, IOException> deserializer, List<String> aliases) {
SetType(String setName,
List<String> aliases,
CheckedFunction<IndexInput, CheckedSupplier<? extends FuzzySet, IOException>, IOException> metaExtractor) {
if (aliases.size() < 1) {
throw new IllegalArgumentException("Alias list is empty. Could not create Set Type: " + setName);
}
this.setName = setName;
this.deserializer = deserializer;
this.metaExtractor = metaExtractor;
}

public String getSetName() {
return setName;
}

public CheckedFunction<IndexInput, ? extends FuzzySet, IOException> getDeserializer() {
return deserializer;
public CheckedSupplier<? extends FuzzySet, IOException> extractMetaAndGetSupplier(IndexInput in) throws IOException {
return metaExtractor.apply(in);
}

public static SetType from(String name) {
Expand All @@ -95,4 +99,8 @@ public static SetType from(String name) {
throw new IllegalArgumentException("There is no implementation for fuzzy set: " + name);
}
}

/**
 * Marker type used as the bound of the {@link FuzzySet} type parameter.
 * It declares no members of its own.
 */
interface Meta {

}
}
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,11 @@ public FuzzySet createFuzzySet(int maxDocs, String fieldName, CheckedSupplier<It
}
}

/**
 * Reads the fuzzy set type name from the input, lets that type extract its metadata,
 * and returns a supplier that builds the fuzzy set on demand.
 *
 * @param in input positioned at the serialized set type name
 * @return supplier that produces the {@link FuzzySet} when invoked
 * @throws IOException if reading the type name or the metadata fails
 */
public static CheckedSupplier<FuzzySet, IOException> buildSetProvider(IndexInput in) throws IOException {
    FuzzySet.SetType setType = FuzzySet.SetType.from(in.readString());
    // extractMetaAndGetSupplier is declared to return CheckedSupplier<? extends FuzzySet, IOException>.
    // Generic types are invariant, so that value cannot be returned directly as
    // CheckedSupplier<FuzzySet, IOException>; the method reference widens each produced value instead.
    CheckedSupplier<? extends FuzzySet, IOException> supplier = setType.extractMetaAndGetSupplier(in);
    return supplier::get;
}

public static FuzzySet deserializeFuzzySet(IndexInput in) throws IOException {
FuzzySet.SetType setType = FuzzySet.SetType.from(in.readString());
return setType.getDeserializer().apply(in);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ public long size() {
}

@Override
public synchronized long get(long index) {
public long get(long index) {
try {
// Multiplying by 8 since each long is 8 bytes, and we need to get the long value at (index * 8) in the
// RandomAccessInput being accessed.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,15 @@ class LongArrayBackedBitSet implements Accountable, Closeable {
in.skipBytes(streamLength);
}

/**
 * Creates a bit set backed by a random-access slice of the input recorded in the metadata.
 *
 * @param meta metadata holding the array length, the start offset and the input to slice
 * @throws IOException if the slice cannot be created
 */
LongArrayBackedBitSet(LongArrayBackedBitSetMeta meta) throws IOException {
    this.underlyingArrayLength = meta.underlyingArrayLength;
    // The length above counts longs; each long takes 8 bytes, so the slice spans 8 times as many bytes.
    final long byteLength = this.underlyingArrayLength << 3;
    this.longArray = new IndexInputImmutableLongArray(
        this.underlyingArrayLength,
        meta.dataIn.randomAccessSlice(meta.startPointer, byteLength)
    );
}

public void writeTo(DataOutput out) throws IOException {
out.writeLong(underlyingArrayLength);
for (int idx = 0; idx < underlyingArrayLength; idx++) {
Expand Down Expand Up @@ -102,4 +111,17 @@ public long ramBytesUsed() {
public void close() throws IOException {
IOUtils.close(longArray);
}

/**
 * Records where a serialized bit set lives inside an {@link IndexInput},
 * without reading the bit set payload itself.
 */
static class LongArrayBackedBitSetMeta {
    /** Number of longs in the serialized array, as read from the input. */
    long underlyingArrayLength;
    /** File pointer of the input immediately after the length was read. */
    long startPointer;
    /** The input the metadata was read from; kept for later slicing. */
    IndexInput dataIn;

    /**
     * Reads the array length, remembers the current file pointer, and skips past the payload.
     *
     * @param in input positioned at the serialized array length
     * @throws IOException if reading or skipping fails
     */
    public LongArrayBackedBitSetMeta(IndexInput in) throws IOException {
        this.dataIn = in;
        this.underlyingArrayLength = in.readLong();
        this.startPointer = in.getFilePointer();
        // Each long occupies 8 bytes, so skip length * 8 bytes to move past the payload.
        in.skipBytes(this.underlyingArrayLength << 3);
    }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,9 @@ public boolean isComplete() {

@Override
public void preParse(ParseContext context) throws IOException {
if (enabled != SourceFieldStatus.ENABLED) {
return;
}
BytesReference originalSource = context.sourceToParse().source();
MediaType contentType = context.sourceToParse().getMediaType();
final BytesReference adaptedSource = applyFilters(originalSource, contentType);
Expand Down
Loading