Skip to content

Commit

Permalink
V5 one-to-many fixes (CNDB-10737) (#1268)
Browse files Browse the repository at this point in the history
* assertion compares with getIdUpperBound instead of size; fixes CNDB-10710
* add test of one-to-many CompactionGraph
* re-serialize CompactionVectorPostings in postingsMap after adding a row to it
* OrdinalMapper needs to return OMITTED where holes are present
* parameterize VectorLocalTest to cover Versions CA, DC
* remove redundant (duplicated) empty table test
* easy test flushing empty index
* add testOneToManyCompactionTooManyHoles
* use correct value of max degree in FusedADC
* create a separate path for compaction that doesn't renumber vectors that have already been written to disk
* refactor RemappedPostings constructors into static factory methods that are clearer about memtable vs compaction usage
  • Loading branch information
jbellis authored Sep 11, 2024
1 parent 758b110 commit b1382ea
Show file tree
Hide file tree
Showing 11 changed files with 444 additions and 118 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Optional;
import java.util.concurrent.TimeUnit;
import java.util.function.BooleanSupplier;
import javax.annotation.concurrent.NotThreadSafe;
Expand All @@ -44,6 +43,7 @@
import org.apache.cassandra.index.sai.disk.format.IndexComponents;
import org.apache.cassandra.index.sai.disk.v2.V2VectorIndexSearcher;
import org.apache.cassandra.index.sai.disk.v3.V3OnDiskFormat;
import org.apache.cassandra.index.sai.disk.v5.V5VectorIndexSearcher;
import org.apache.cassandra.index.sai.disk.v5.V5VectorPostingsWriter;
import org.apache.cassandra.index.sai.disk.vector.CassandraDiskAnn;
import org.apache.cassandra.index.sai.disk.vector.CassandraOnHeapGraph;
Expand Down Expand Up @@ -381,13 +381,15 @@ private static boolean allRowsHaveVectorsInWrittenSegments(IndexContext indexCon
for (Segment segment : index.getSegments())
{
segmentsChecked++;
var searcher = (V2VectorIndexSearcher) segment.getIndexSearcher();
if (segment.getIndexSearcher() instanceof V2VectorIndexSearcher)
return true; // V2 doesn't know, so we err on the side of being optimistic. See comments in CompactionGraph
var searcher = (V5VectorIndexSearcher) segment.getIndexSearcher();
var structure = searcher.getPostingsStructure();
if (structure == V5VectorPostingsWriter.Structure.ZERO_OR_ONE_TO_MANY)
return false;
}
}
return segmentsChecked != 0;
return true;
}

private CassandraOnHeapGraph.PqInfo maybeReadPqFromLastSegment() throws IOException
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -203,16 +203,21 @@ private static <T> Pair<BiMap<Integer, Integer>, Integer> buildOrdinalMap(Map<Ve
return Pair.create(ordinalMap, maxRow);
}

public static <T> V5VectorPostingsWriter.RemappedPostings remapPostings(Map<VectorFloat<?>, ? extends VectorPostings<T>> postingsMap,
boolean containsDeletes)
public static <T> V5VectorPostingsWriter.RemappedPostings remapForMemtable(Map<VectorFloat<?>, ? extends VectorPostings<T>> postingsMap,
boolean containsDeletes)
{
var p = buildOrdinalMap(postingsMap);
int maxNewOrdinal = postingsMap.size() - 1; // no in-graph deletes in v2
if (p == null || containsDeletes)
return V5VectorPostingsWriter.createGenericV2Mapping(postingsMap);
return V5VectorPostingsWriter.createGenericIdentityMapping(postingsMap);

var ordinalMap = p.left;
var maxRow = p.right;
return new V5VectorPostingsWriter.RemappedPostings(V5VectorPostingsWriter.Structure.ONE_TO_ONE, maxNewOrdinal, maxRow, ordinalMap, new Int2IntHashMap(Integer.MIN_VALUE));
return new V5VectorPostingsWriter.RemappedPostings(V5VectorPostingsWriter.Structure.ONE_TO_ONE,
maxNewOrdinal,
maxRow,
ordinalMap,
new Int2IntHashMap(Integer.MIN_VALUE),
new V5VectorPostingsWriter.BiMapMapper(maxNewOrdinal, ordinalMap));
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,10 @@
import java.util.Arrays;
import java.util.Map;
import java.util.Set;
import java.util.function.IntPredicate;
import java.util.function.IntUnaryOperator;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import javax.annotation.Nullable;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.BiMap;
Expand All @@ -34,6 +35,7 @@

import io.github.jbellis.jvector.graph.RandomAccessVectorValues;
import io.github.jbellis.jvector.graph.disk.OrdinalMapper;
import io.github.jbellis.jvector.util.FixedBitSet;
import io.github.jbellis.jvector.vector.types.VectorFloat;
import org.agrona.collections.Int2IntHashMap;
import org.agrona.collections.Int2ObjectHashMap;
Expand Down Expand Up @@ -101,12 +103,60 @@ public V5VectorPostingsWriter(RemappedPostings remappedPostings)
this.remappedPostings = remappedPostings;
}

public V5VectorPostingsWriter(Structure structure, int graphSize, Map<VectorFloat<?>, VectorPostings.CompactionVectorPostings> postingsMap)
/**
* This method describes the mapping done during construction of the graph so that we can easily create
* an appropriate V5VectorPostingsWriter. No ordinal remapping is performed because (V5) compaction writes
* vectors to disk as they are added to the graph, so there is no opportunity to reorder the way there is
* in a Memtable index.
*/
public static RemappedPostings describeForCompaction(Structure structure, int graphSize, Map<VectorFloat<?>, VectorPostings.CompactionVectorPostings> postingsMap)
{
assert !postingsMap.isEmpty(); // flush+compact should skip writing an index component in this case

if (structure == Structure.ONE_TO_ONE)
remappedPostings = new RemappedPostings(Structure.ONE_TO_ONE, graphSize - 1, graphSize - 1, null, null);
else
remappedPostings = remapPostings(postingsMap);
{
return new RemappedPostings(Structure.ONE_TO_ONE,
graphSize - 1,
graphSize - 1,
null,
null,
new OrdinalMapper.IdentityMapper(graphSize - 1));
}

if (structure == Structure.ONE_TO_MANY)
{
// compute maxOldOrdinal, maxRow, and extraOrdinals from the postingsMap
int maxOldOrdinal = Integer.MIN_VALUE;
int maxRow = Integer.MIN_VALUE;
var extraOrdinals = new Int2IntHashMap(Integer.MIN_VALUE);
for (var entry : postingsMap.entrySet())
{
var postings = entry.getValue();
int ordinal = postings.getOrdinal();

maxOldOrdinal = Math.max(maxOldOrdinal, ordinal);
var rowIds = postings.getRowIds();
assert ordinal == rowIds.getInt(0); // synthetic ordinals not allowed in ONE_TO_MANY
for (int i = 0; i < rowIds.size(); i++)
{
int rowId = rowIds.getInt(i);
maxRow = Math.max(maxRow, rowId);
if (i > 0)
extraOrdinals.put(rowId, ordinal);
}
}

var skippedOrdinals = extraOrdinals.keySet();
return new RemappedPostings(Structure.ONE_TO_MANY,
maxOldOrdinal,
maxRow,
null,
extraOrdinals,
new OmissionAwareIdentityMapper(maxOldOrdinal, skippedOrdinals::contains));
}

assert structure == Structure.ZERO_OR_ONE_TO_MANY : structure;
return createGenericIdentityMapping(postingsMap);
}

public long writePostings(SequentialWriter writer,
Expand Down Expand Up @@ -167,7 +217,6 @@ private void writeOneToManyOrdinalMapping(SequentialWriter writer) throws IOExce
writer.writeInt(newOrdinal);
writer.writeInt(0);
entries++;
assert !ordinalToExtraRowIds.containsKey(oldOrdinal);
continue;
}

Expand Down Expand Up @@ -204,8 +253,8 @@ private void writeOneToManyRowIdMapping(SequentialWriter writer) throws IOExcept
writer.writeInt(rowId);
writer.writeInt(remappedPostings.ordinalMapper.oldToNew(originalOrdinal));
// validate that we do in fact have contiguous rowids in the non-extra mapping
for (int j = lastExtraRowId + 1; j < rowId; j++)
assert remappedPostings.ordinalMap.inverse().containsKey(j);
assert IntStream.range(lastExtraRowId + 1, rowId)
.allMatch(j -> remappedPostings.ordinalMapper.newToOld(j) != OrdinalMapper.OMITTED) : "Non-contiguous rowids found in non-extra mapping";
lastExtraRowId = rowId;
}

Expand Down Expand Up @@ -314,63 +363,32 @@ public static class RemappedPostings
public final int maxNewOrdinal;
/** the largest rowId in the postings (inclusive) */
public final int maxRowId;
/** map from original vector ordinal to rowId that will be its new, remapped ordinal */
private final BiMap<Integer, Integer> ordinalMap;
/** map from rowId to [original] vector ordinal */
@Nullable
private final Int2IntHashMap extraPostings;
/** public api */
public final OrdinalMapper ordinalMapper;

public RemappedPostings(Structure structure, int maxNewOrdinal, int maxRowId, BiMap<Integer, Integer> ordinalMap, Int2IntHashMap extraPostings)
/** visible for V2VectorPostingsWriter.remapPostings, everyone else should use factory methods */
public RemappedPostings(Structure structure, int maxNewOrdinal, int maxRowId, BiMap<Integer, Integer> ordinalMap, Int2IntHashMap extraPostings, OrdinalMapper ordinalMapper)
{
assert structure == Structure.ONE_TO_ONE || structure == Structure.ONE_TO_MANY;
this.structure = structure;
this.maxNewOrdinal = maxNewOrdinal;
this.maxRowId = maxRowId;
this.ordinalMap = ordinalMap;
this.extraPostings = extraPostings;
ordinalMapper = new OrdinalMapper()
{
@Override
public int maxOrdinal()
{
return maxNewOrdinal;
}

@Override
public int oldToNew(int i)
{
return ordinalMap.get(i);
}

@Override
public int newToOld(int i)
{
return ordinalMap.inverse().getOrDefault(i, OMITTED);
}
};
}

public RemappedPostings(int maxNewOrdinal, int maxRowId, Int2IntHashMap sequentialMap)
{
this.structure = Structure.ZERO_OR_ONE_TO_MANY;
this.maxNewOrdinal = maxNewOrdinal;
this.maxRowId = maxRowId;
this.ordinalMap = null;
this.extraPostings = null;
ordinalMapper = new OrdinalMapper.MapMapper(sequentialMap);
this.ordinalMapper = ordinalMapper;
}
}

/**
* @see RemappedPostings
*/
public static <T> RemappedPostings remapPostings(Map<VectorFloat<?>, ? extends VectorPostings<T>> postingsMap)
public static <T> RemappedPostings remapForMemtable(Map<VectorFloat<?>, ? extends VectorPostings<T>> postingsMap)
{
assert V5OnDiskFormat.writeV5VectorPostings();

BiMap<Integer, Integer> ordinalMap = HashBiMap.create();
Int2IntHashMap extraPostings = new Int2IntHashMap(-1);
Int2IntHashMap extraPostings = new Int2IntHashMap(Integer.MIN_VALUE);
int minRow = Integer.MAX_VALUE;
int maxRow = Integer.MIN_VALUE;
int maxNewOrdinal = Integer.MIN_VALUE;
Expand Down Expand Up @@ -398,12 +416,13 @@ public static <T> RemappedPostings remapPostings(Map<VectorFloat<?>, ? extends V
extraPostings.put(a[i], oldOrdinal);
}
}
assert totalRowsAssigned == 0 || totalRowsAssigned <= maxRow + 1: "rowids are not unique -- " + totalRowsAssigned + " >= " + maxRow;

// derive the correct structure
Structure structure;
if (totalRowsAssigned > 0 && (minRow != 0 || totalRowsAssigned != maxRow + 1))
if (totalRowsAssigned > 0 && (minRow != 0 || totalRowsAssigned < maxRow + 1))
{
logger.debug("Not all rows are assigned vectors, cannot remap");
logger.debug("Not all rows are assigned vectors, cannot remap one-to-many");
structure = Structure.ZERO_OR_ONE_TO_MANY;
}
else
Expand All @@ -419,32 +438,105 @@ public static <T> RemappedPostings remapPostings(Map<VectorFloat<?>, ? extends V

// create the mapping
if (structure == Structure.ZERO_OR_ONE_TO_MANY)
return createGenericMapping(ordinalMap.keySet(), maxOldOrdinal, maxRow);
return new RemappedPostings(structure, maxNewOrdinal, maxRow, ordinalMap, extraPostings);
return createGenericRenumberedMapping(ordinalMap.keySet(), maxOldOrdinal, maxRow);
var ordinalMapper = new BiMapMapper(maxNewOrdinal, ordinalMap);
return new RemappedPostings(structure, maxNewOrdinal, maxRow, ordinalMap, extraPostings, ordinalMapper);
}

/**
* return an exhaustive zero-to-many mapping with the live ordinals renumbered sequentially
*/
private static RemappedPostings createGenericMapping(Set<Integer> liveOrdinals, int maxOldOrdinal, int maxRow)
private static RemappedPostings createGenericRenumberedMapping(Set<Integer> liveOrdinals, int maxOldOrdinal, int maxRow)
{
var sequentialMap = new Int2IntHashMap(maxOldOrdinal, 0.65f, Integer.MIN_VALUE);
var oldToNew = new Int2IntHashMap(maxOldOrdinal, 0.65f, Integer.MIN_VALUE);
int nextOrdinal = 0;
for (int i = 0; i <= maxOldOrdinal; i++) {
if (liveOrdinals.contains(i))
sequentialMap.put(i, nextOrdinal++);
oldToNew.put(i, nextOrdinal++);
}
return new RemappedPostings(nextOrdinal - 1, maxRow, sequentialMap);
return new RemappedPostings(Structure.ZERO_OR_ONE_TO_MANY,
nextOrdinal - 1,
maxRow,
null,
null,
new OrdinalMapper.MapMapper(oldToNew));
}

/**
* return an exhaustive zero-to-many mapping for v2 postings, which never contain missing ordinals
* since deleted vectors are only removed from the index in its next compaction
* return an exhaustive zero-to-many mapping with no renumbering
*/
public static <T> RemappedPostings createGenericV2Mapping(Map<VectorFloat<?>, ? extends VectorPostings<T>> postingsMap)
public static <T> RemappedPostings createGenericIdentityMapping(Map<VectorFloat<?>, ? extends VectorPostings<T>> postingsMap)
{
int maxOldOrdinal = postingsMap.size() - 1;
var maxOldOrdinal = postingsMap.values().stream().mapToInt(VectorPostings::getOrdinal).max().orElseThrow();
int maxRow = postingsMap.values().stream().flatMap(p -> p.getRowIds().stream()).mapToInt(i -> i).max().orElseThrow();
return createGenericMapping(IntStream.range(0, postingsMap.size()).boxed().collect(Collectors.toSet()), maxOldOrdinal, maxRow);
var presentOrdinals = new FixedBitSet(maxOldOrdinal + 1);
for (var entry : postingsMap.entrySet())
presentOrdinals.set(entry.getValue().getOrdinal());
return new RemappedPostings(Structure.ZERO_OR_ONE_TO_MANY,
maxOldOrdinal,
maxRow,
null,
null,
new OmissionAwareIdentityMapper(maxOldOrdinal, i -> !presentOrdinals.get(i)));
}

/**
 * An {@link OrdinalMapper} backed by a {@link BiMap} whose keys are original ("old") ordinals
 * and whose values are the corresponding remapped ("new") ordinals.
 * <p>
 * The map is held by reference, not copied, so later changes to it are visible through this mapper.
 */
public static class BiMapMapper implements OrdinalMapper
{
    private final int maxOrdinal;
    private final BiMap<Integer, Integer> ordinalMap;

    /**
     * @param maxNewOrdinal the value reported by {@link #maxOrdinal()}
     * @param ordinalMap    map from old ordinal to new ordinal
     */
    public BiMapMapper(int maxNewOrdinal, BiMap<Integer, Integer> ordinalMap)
    {
        this.maxOrdinal = maxNewOrdinal;
        this.ordinalMap = ordinalMap;
    }

    /** @return the maximum new ordinal supplied at construction */
    @Override
    public int maxOrdinal()
    {
        return maxOrdinal;
    }

    /**
     * @param i an old ordinal that is present in the map
     * @return the new ordinal that {@code i} maps to
     * @throws IllegalArgumentException if {@code i} has no mapping
     */
    @Override
    public int oldToNew(int i)
    {
        // Look up the boxed value explicitly: auto-unboxing a missing entry would otherwise
        // fail with a bare NullPointerException that does not identify the offending ordinal.
        Integer newOrdinal = ordinalMap.get(i);
        if (newOrdinal == null)
            throw new IllegalArgumentException("No remapped ordinal for original ordinal " + i);
        return newOrdinal;
    }

    /**
     * @param i a new ordinal
     * @return the old ordinal mapped to {@code i}, or {@code OMITTED} if no old ordinal maps to it
     */
    @Override
    public int newToOld(int i)
    {
        return ordinalMap.inverse().getOrDefault(i, OMITTED);
    }
}

/**
 * An {@link OrdinalMapper} that leaves ordinals unchanged in both directions, except that
 * {@link #newToOld(int)} reports {@code OMITTED} for any ordinal the supplied predicate accepts.
 */
private static class OmissionAwareIdentityMapper implements OrdinalMapper
{
    /** value reported by {@link #maxOrdinal()} */
    private final int highestOrdinal;
    /** returns true for ordinals that {@link #newToOld(int)} must report as omitted */
    private final IntPredicate omitted;

    public OmissionAwareIdentityMapper(int maxVectorOrdinal, IntPredicate toSkip)
    {
        this.highestOrdinal = maxVectorOrdinal;
        this.omitted = toSkip;
    }

    @Override
    public int maxOrdinal()
    {
        return highestOrdinal;
    }

    /** Identity: the new ordinal is always the old ordinal. */
    @Override
    public int oldToNew(int i)
    {
        return i;
    }

    /** Identity, unless the predicate marks {@code i} as skipped. */
    @Override
    public int newToOld(int i)
    {
        if (omitted.test(i))
            return OrdinalMapper.OMITTED;
        return i;
    }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@
import java.util.Collection;
import java.util.Comparator;
import java.util.Map;
import java.util.Optional;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.ConcurrentSkipListMap;
import java.util.concurrent.atomic.AtomicInteger;
Expand Down Expand Up @@ -395,14 +394,14 @@ public SegmentMetadata.ComponentMetadataMap flush(IndexComponents.ForWrite perIn
deletedOrdinals.stream().parallel().forEach(builder::markNodeDeleted);
deletedOrdinals.clear();
builder.cleanup();
remappedPostings = V5VectorPostingsWriter.remapPostings(postingsMap);
remappedPostings = V5VectorPostingsWriter.remapForMemtable(postingsMap);
}
else
{
assert postingsMap.keySet().size() == vectorValues.size() : String.format("postings map entry count %d != vector count %d",
postingsMap.keySet().size(), vectorValues.size());
builder.cleanup();
remappedPostings = V2VectorPostingsWriter.remapPostings(postingsMap, !deletedOrdinals.isEmpty());
remappedPostings = V2VectorPostingsWriter.remapForMemtable(postingsMap, !deletedOrdinals.isEmpty());
}

OrdinalMapper ordinalMapper = remappedPostings.ordinalMapper;
Expand Down
Loading

0 comments on commit b1382ea

Please sign in to comment.