Skip to content

LUCENE-9476: Add getBulkPath API to DirectoryTaxonomyReader #179

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 14 commits into from
Sep 2, 2021
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,9 @@ Improvements
* LUCENE-10019: Align file starts in CFS files to have proper alignment (8 bytes)
(Uwe Schindler)

* LUCENE-9476: Add new getBulkPath API to DirectoryTaxonomyReader to more efficiently retrieve FacetLabels for multiple
facet ordinals at once (Gautam Worah, Mike McCandless). This API is 2-4% faster than iteratively calling getPath.

Bug fixes

* LUCENE-9686: Fix read past EOF handling in DirectIODirectory. (Zach Chen,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -146,10 +146,18 @@ public FacetResult getTopChildren(int topN, String dim, String... path) throws I
}

LabelAndValue[] labelValues = new LabelAndValue[q.size()];
int[] ordinals = new int[labelValues.length];
float[] values = new float[labelValues.length];

for (int i = labelValues.length - 1; i >= 0; i--) {
TopOrdAndFloatQueue.OrdAndValue ordAndValue = q.pop();
FacetLabel child = taxoReader.getPath(ordAndValue.ord);
labelValues[i] = new LabelAndValue(child.components[cp.length], ordAndValue.value);
ordinals[i] = ordAndValue.ord;
values[i] = ordAndValue.value;
}

FacetLabel[] bulkPath = taxoReader.getBulkPath(ordinals);
for (int i = 0; i < labelValues.length; i++) {
labelValues[i] = new LabelAndValue(bulkPath[i].components[cp.length], values[i]);
}

return new FacetResult(dim, path, sumValues, labelValues, childCount);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -237,10 +237,18 @@ public FacetResult getTopChildren(int topN, String dim, String... path) throws I
}

LabelAndValue[] labelValues = new LabelAndValue[q.size()];
int[] ordinals = new int[labelValues.length];
int[] values = new int[labelValues.length];

for (int i = labelValues.length - 1; i >= 0; i--) {
TopOrdAndIntQueue.OrdAndValue ordAndValue = q.pop();
FacetLabel child = taxoReader.getPath(ordAndValue.ord);
labelValues[i] = new LabelAndValue(child.components[cp.length], ordAndValue.value);
ordinals[i] = ordAndValue.ord;
values[i] = ordAndValue.value;
}

FacetLabel[] bulkPath = taxoReader.getBulkPath(ordinals);
for (int i = 0; i < labelValues.length; i++) {
labelValues[i] = new LabelAndValue(bulkPath[i].components[cp.length], values[i]);
}

return new FacetResult(dim, path, totValue, labelValues, childCount);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,21 @@ public int getOrdinal(String dim, String... path) throws IOException {
/** Returns the path name of the category with the given ordinal. */
public abstract FacetLabel getPath(int ordinal) throws IOException;

/**
 * Resolves the path name for each of the given category ordinals.
 *
 * <p>The implementation in {@link
 * org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader} is generally faster than
 * this default implementation, which simply invokes {@link #getPath(int)} once per ordinal.
 *
 * @param ordinals ordinals of the categories whose paths are requested
 * @return an array of {@link FacetLabel}s, parallel to {@code ordinals}
 * @throws IOException if reading the taxonomy index fails
 */
public FacetLabel[] getBulkPath(int... ordinals) throws IOException {
  final FacetLabel[] paths = new FacetLabel[ordinals.length];
  int slot = 0;
  for (int ordinal : ordinals) {
    paths[slot++] = getPath(ordinal);
  }
  return paths;
}

/** Returns the current refCount for this taxonomy reader. */
public final int getRefCount() {
return refCount.get();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,12 @@

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.function.IntUnaryOperator;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.lucene.document.Document;
Expand All @@ -35,6 +37,7 @@
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.MultiTerms;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.ReaderUtil;
Expand All @@ -44,6 +47,7 @@
import org.apache.lucene.util.Accountables;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.InPlaceMergeSorter;
import org.apache.lucene.util.RamUsageEstimator;

/**
Expand Down Expand Up @@ -318,23 +322,16 @@ public FacetLabel getPath(int ordinal) throws IOException {
// doOpenIfChanged, we need to ensure that the ordinal is one that this DTR
// instance recognizes. Therefore we do this check up front, before we hit
// the cache.
if (ordinal < 0 || ordinal >= indexReader.maxDoc()) {
return null;
}
checkOrdinalBounds(ordinal);

// TODO: can we use an int-based hash impl, such as IntToObjectMap,
// wrapped as LRU?
Integer catIDInteger = Integer.valueOf(ordinal);
synchronized (categoryCache) {
FacetLabel res = categoryCache.get(catIDInteger);
if (res != null) {
return res;
}
FacetLabel[] ordinalPath = getPathFromCache(ordinal);

if (ordinalPath[0] != null) {
return ordinalPath[0];
}

int readerIndex = ReaderUtil.subIndex(ordinal, indexReader.leaves());
LeafReader leafReader = indexReader.leaves().get(readerIndex).reader();
// TODO: Use LUCENE-9476 to get the bulk lookup API for extracting BinaryDocValues
BinaryDocValues values = leafReader.getBinaryDocValues(Consts.FULL);

FacetLabel ret;
Expand All @@ -351,12 +348,142 @@ public FacetLabel getPath(int ordinal) throws IOException {
}

synchronized (categoryCache) {
categoryCache.put(catIDInteger, ret);
categoryCache.put(ordinal, ret);
}

return ret;
}

/**
 * Looks up each ordinal in the shared category cache.
 *
 * @param ordinals ordinals to look up
 * @return an array parallel to {@code ordinals}; an entry is {@code null} on a cache miss
 */
private FacetLabel[] getPathFromCache(int... ordinals) {
  // TODO LUCENE-10068: can we use an int-based hash impl, such as IntToObjectMap,
  // wrapped as LRU?
  final FacetLabel[] cached = new FacetLabel[ordinals.length];
  synchronized (categoryCache) {
    int slot = 0;
    for (int ordinal : ordinals) {
      cached[slot++] = categoryCache.get(ordinal);
    }
  }
  return cached;
}

/**
 * Verifies that every supplied ordinal lies within {@code [0, indexReader.maxDoc())}.
 *
 * @param ordinals Integer array of ordinals
 * @throws IllegalArgumentException if one of the ordinals is out of bounds
 */
private void checkOrdinalBounds(int... ordinals) throws IllegalArgumentException {
  final int maxDoc = indexReader.maxDoc();
  for (int ordinal : ordinals) {
    if (ordinal >= 0 && ordinal < maxDoc) {
      continue;
    }
    throw new IllegalArgumentException(
        "ordinal "
            + ordinal
            + " is out of the range of the indexReader "
            + indexReader.toString()
            + ". The maximum possible ordinal number is "
            + (maxDoc - 1));
  }
}

/**
* Returns an array of FacetLabels for a given array of ordinals.
*
* <p>This API is generally faster than iteratively calling {@link #getPath(int)} over an array of
* ordinals. It uses the {@link #getPath(int)} method iteratively when it detects that the index
* was created using StoredFields (with no performance gains) and uses DocValues based iteration
* when the index is based on BinaryDocValues. Lucene switched to BinaryDocValues in version 9.0
*
* @param ordinals Array of ordinals that are assigned to categories inserted into the taxonomy
* index
*/
@Override
public FacetLabel[] getBulkPath(int... ordinals) throws IOException {
ensureOpen();
checkOrdinalBounds(ordinals);

int ordinalsLength = ordinals.length;
FacetLabel[] bulkPath = new FacetLabel[ordinalsLength];
// remember the original positions of ordinals before they are sorted
int[] originalPosition = new int[ordinalsLength];
Arrays.setAll(originalPosition, IntUnaryOperator.identity());

getPathFromCache(ordinals);

/* parallel sort the ordinals and originalPosition array based on the values in the ordinals array */
new InPlaceMergeSorter() {
@Override
protected void swap(int i, int j) {
int x = ordinals[i];
ordinals[i] = ordinals[j];
ordinals[j] = x;

x = originalPosition[i];
originalPosition[i] = originalPosition[j];
originalPosition[j] = x;
}

@Override
public int compare(int i, int j) {
return Integer.compare(ordinals[i], ordinals[j]);
}
}.sort(0, ordinalsLength);

int readerIndex;
int leafReaderMaxDoc = 0;
int leafReaderDocBase = 0;
LeafReader leafReader;
LeafReaderContext leafReaderContext;
BinaryDocValues values = null;
List<Integer> uncachedOrdinalPositions = new ArrayList<>();

for (int i = 0; i < ordinalsLength; i++) {
if (bulkPath[originalPosition[i]] == null) {
/*
If ordinals[i] >= leafReaderDocBase + leafReaderMaxDoc then we find the next leaf that contains our ordinal.
Remember: ordinals[i] operates in the global ordinal space and hence we add leafReaderDocBase to the leafReaderMaxDoc
(which is the maxDoc of the specific leaf)
*/
if (values == null || ordinals[i] >= leafReaderDocBase + leafReaderMaxDoc) {

readerIndex = ReaderUtil.subIndex(ordinals[i], indexReader.leaves());
leafReaderContext = indexReader.leaves().get(readerIndex);
leafReader = leafReaderContext.reader();
leafReaderMaxDoc = leafReader.maxDoc();
leafReaderDocBase = leafReaderContext.docBase;
values = leafReader.getBinaryDocValues(Consts.FULL);

/*
If the index is constructed with the older StoredFields it will not have any BinaryDocValues field and will return null
*/
if (values == null) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's sort of weird to do this check per-segment, since we can know up front, based on indexed created version, whether it's stored fields or doc values? I.e. this is not a segment by segment decision. But let's do it in follow-on issue. I think this one is ready!

Copy link
Contributor Author

@gautamworah96 gautamworah96 Aug 29, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure. I'll create it after this PR is merged. Otherwise, it just feels weird to create an issue for a fix in a piece of code that has not been merged :D

return super.getBulkPath(ordinals);
}
}
// values is leaf specific so you only advance till you reach the target within the leaf
boolean success = values.advanceExact(ordinals[i] - leafReaderDocBase);
assert success;
bulkPath[originalPosition[i]] =
new FacetLabel(FacetsConfig.stringToPath(values.binaryValue().utf8ToString()));

uncachedOrdinalPositions.add(i);
}
}

if (uncachedOrdinalPositions.isEmpty() == false) {
synchronized (categoryCache) {
for (int i : uncachedOrdinalPositions) {
// add the value to the categoryCache after computation
categoryCache.put(ordinals[i], bulkPath[originalPosition[i]]);
}
}
}

return bulkPath;
}

@Override
public int getSize() {
ensureOpen();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -366,9 +366,9 @@ public void testReaderBasic() throws Exception {
}
}
// (also test invalid ordinals:)
assertNull(tr.getPath(-1));
assertNull(tr.getPath(tr.getSize()));
assertNull(tr.getPath(TaxonomyReader.INVALID_ORDINAL));
expectThrows(IllegalArgumentException.class, () -> tr.getPath(-1));
expectThrows(IllegalArgumentException.class, () -> tr.getPath(tr.getSize()));
expectThrows(IllegalArgumentException.class, () -> tr.getPath(TaxonomyReader.INVALID_ORDINAL));

// test TaxonomyReader.getOrdinal():
for (int i = 1; i < expectedCategories.length; i++) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,28 @@ private void createNewTaxonomyIndex(String dirName) throws IOException {
dir.close();
}

// Opens up a pre-existing index and tries to run getBulkPath on it
public void testGetBulkPathOnOlderCodec() throws Exception {
  Path indexDir = createTempDir(oldTaxonomyIndexName);
  TestUtil.unzip(getDataInputStream(oldTaxonomyIndexName + ".zip"), indexDir);
  // try-with-resources closes reader, then writer, then dir (reverse declaration order) even
  // when an assertion below fails, instead of leaking them
  try (Directory dir = newFSDirectory(indexDir);
      DirectoryTaxonomyWriter writer = new DirectoryTaxonomyWriter(dir);
      DirectoryTaxonomyReader reader = new DirectoryTaxonomyReader(writer)) {

    FacetLabel[] facetLabels = {
      new FacetLabel("a"), new FacetLabel("b"),
    };
    int[] ordinals =
        new int[] {reader.getOrdinal(facetLabels[0]), reader.getOrdinal(facetLabels[1])};
    // The zip file already contains a category "a" and a category "b" stored with the older
    // StoredFields codec
    assertArrayEquals(facetLabels, reader.getBulkPath(ordinals));
  }
}

// Used to create a fresh taxonomy index with StoredFields
@Ignore
public void testCreateOldTaxonomy() throws IOException {
Expand Down
Loading