Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions src/java/org/apache/cassandra/db/filter/RowFilter.java
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,14 @@ public boolean hasANN()
return false;
}

/**
* Returns a copy of this {@link RowFilter} with ordering expressions removed
*/
public RowFilter withoutOrderingExpressions()
{
return restrict(e -> !e.isOrderingExpression());
}

/**
* @return the {@link ANNOptions} of the ANN expression in this filter, or {@link ANNOptions#NONE} if there is
* no ANN expression.
Expand Down
4 changes: 2 additions & 2 deletions src/java/org/apache/cassandra/index/sai/SSTableIndex.java
Original file line number Diff line number Diff line change
Expand Up @@ -154,9 +154,9 @@ public long getApproximateTermCount()
*
* @return an approximate number of the matching rows
*/
public long estimateMatchingRowsCount(Expression predicate, AbstractBounds<PartitionPosition> keyRange)
public long estimateMatchingRowsCount(Expression predicate)
{
return searchableIndex.estimateMatchingRowsCount(predicate, keyRange);
return searchableIndex.estimateMatchingRowsCount(predicate);
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ public void populateSystemView(SimpleDataSet dataSet, SSTableReader sstable)
}

@Override
public long estimateMatchingRowsCount(Expression predicate, AbstractBounds<PartitionPosition> keyRange)
public long estimateMatchingRowsCount(Expression predicate)
{
return 0;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -88,5 +88,5 @@ List<CloseableIterator<PrimaryKeyWithSortKey>> orderResultsBy(QueryContext conte

void populateSystemView(SimpleDataSet dataSet, SSTableReader sstable);

long estimateMatchingRowsCount(Expression predicate, AbstractBounds<PartitionPosition> keyRange);
long estimateMatchingRowsCount(Expression predicate);
}
Original file line number Diff line number Diff line change
Expand Up @@ -234,7 +234,7 @@ public int proportionalAnnLimit(int limit, long totalRows)
return proportionalLimit;
}

public long estimateMatchingRowsCount(Expression predicate, AbstractBounds<PartitionPosition> keyRange)
public long estimateMatchingRowsCount(Expression predicate)
{
return metadata.estimateNumRowsMatching(predicate);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -289,12 +289,12 @@ public void populateSystemView(SimpleDataSet dataset, SSTableReader sstable)
}

@Override
public long estimateMatchingRowsCount(Expression predicate, AbstractBounds<PartitionPosition> keyRange)
public long estimateMatchingRowsCount(Expression predicate)
{
long rowCount = 0;
for (Segment segment: segments)
{
long c = segment.estimateMatchingRowsCount(predicate, keyRange);
long c = segment.estimateMatchingRowsCount(predicate);
assert c >= 0 : "Estimated row count must not be negative: " + c + " (predicate: " + predicate + ')';
rowCount += c;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -217,18 +217,12 @@ public KeyRangeIterator search(QueryContext context, Expression expr, AbstractBo
}

@Override
public long estimateMatchingRowsCountUsingFirstShard(Expression expression, AbstractBounds<PartitionPosition> keyRange)
public long estimateMatchingRowsCount(Expression expression)
{
// For BOUNDED_ANN we use the old way of estimating cardinality - by running the search.
throw new UnsupportedOperationException("Cardinality estimation not supported by vector indexes");
}

@Override
public long estimateMatchingRowsCountUsingAllShards(Expression expression, AbstractBounds<PartitionPosition> keyRange)
{
throw new UnsupportedOperationException("Cardinality estimation not supported by vector indexes");
}

@Override
public List<CloseableIterator<PrimaryKeyWithSortKey>> orderBy(QueryContext context,
Orderer orderer,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ public abstract void update(DecoratedKey key,

public abstract KeyRangeIterator search(Expression expression, AbstractBounds<PartitionPosition> keyRange);

public abstract long estimateMatchingRowsCount(Expression expression, AbstractBounds<PartitionPosition> keyRange);
public abstract long estimateMatchingRowsCount(Expression expression);

public abstract ByteBuffer getMinTerm();

Expand Down
19 changes: 4 additions & 15 deletions src/java/org/apache/cassandra/index/sai/memory/MemtableIndex.java
Original file line number Diff line number Diff line change
Expand Up @@ -77,25 +77,14 @@ public interface MemtableIndex extends MemtableOrdering

/**
* Estimates the number of rows that would be returned by this index given the predicate.
* It is extrapolated from the first shard.
* Note that this is not a guarantee of the number of rows that will actually be returned.
* The estimate is intended for query plan optimization and is
* not guaranteed to be very accurate, but should be usually at the right
* level of magnitude. Being off by +/-50% is not a bug.
*
* @param expression predicate to match
* @param keyRange the key range to search within
* @return an approximate number of the matching rows
*/
long estimateMatchingRowsCountUsingFirstShard(Expression expression, AbstractBounds<PartitionPosition> keyRange);

/**
* Estimates the number of rows that would be returned by this index given the predicate.
* It estimates from all relevant shards individually.
* Note that this is not a guarantee of the number of rows that will actually be returned.
*
* @param expression predicate to match
* @param keyRange the key range to search within
* @return an estimated number of the matching rows
*/
long estimateMatchingRowsCountUsingAllShards(Expression expression, AbstractBounds<PartitionPosition> keyRange);
long estimateMatchingRowsCount(Expression expression);

Iterator<Pair<ByteComparable.Preencoded, List<MemoryIndex.PkWithFrequency>>> iterator(DecoratedKey min, DecoratedKey max);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -718,7 +718,7 @@ private KeyRangeIterator rangeMatch(Expression expression, AbstractBounds<Partit
}

@Override
public long estimateMatchingRowsCount(Expression expression, AbstractBounds<PartitionPosition> keyRange)
public long estimateMatchingRowsCount(Expression expression)
{
switch (expression.getOp())
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -365,57 +365,54 @@ public List<CloseableIterator<PrimaryKeyWithSortKey>> orderBy(QueryContext query
}

/**
* Estimates the number of matching rows by extrapolating from the first shard only.
* This method provides a fast approximation by calculating the matching row count for the first
* shard in the key range and then multiplying by the total number of shards involved.
*
* <p>This approach assumes that data is uniformly distributed across shards, which may not
* Estimates the number of rows matching the given query predicate.
* <p>
* This method provides a fast approximation by calculating the matching row count from a subset of shards.
* The number of shards taken for the computation is determined dynamically depending on the number of indexed
* and matching rows – the more rows found in the shard, the fewer shards are needed to get a reliable estimate.
* <p>
* This approach assumes that data is uniformly distributed across shards, which may not
* always be accurate but provides a quick estimate with minimal computational overhead.
* The estimate is particularly useful for query planning and optimization decisions where
* speed is more important than precision.</p>
*
* @param expression the search expression/predicate to match against indexed terms
* @param keyRange the partition key range to search within, used to determine which shards to consider
* @return an estimated number of matching rows extrapolated from the first shard;
* @see #estimateMatchingRowsCountUsingAllShards(Expression, AbstractBounds) for a more accurate but slower alternative
* @return an estimated number of matching rows extrapolated from the first few shards;
*/
@Override
public long estimateMatchingRowsCountUsingFirstShard(Expression expression, AbstractBounds<PartitionPosition> keyRange)
public long estimateMatchingRowsCount(Expression expression)
{
int startShard = boundaries.getShardForToken(keyRange.left.getToken());
int endShard = getEndShardForBounds(keyRange);
return rangeIndexes[startShard].estimateMatchingRowsCount(expression, keyRange) * (endShard - startShard + 1);
}

/**
* Estimates the number of matching rows by querying all relevant shards individually.
* This method provides a more accurate estimate compared to the first-shard extrapolation
* approach by actually calculating the matching row count for each shard that intersects
* with the given key range and summing the results.
*
* <p>This approach accounts for non-uniform data distribution across shards and provides
* a more precise estimate at the cost of increased computational overhead. Each shard's
* memory index is consulted to determine how many rows would match the given expression
* within the specified key range.</p>
*
* @param expression the search expression/predicate to match against indexed terms
* @param keyRange the partition key range to search within, used to determine which shards to query
* @return the sum of estimated matching rows from all relevant shards;
*
* @see #estimateMatchingRowsCountUsingFirstShard(Expression, AbstractBounds) for a faster but less accurate alternative
*/
@Override
public long estimateMatchingRowsCountUsingAllShards(Expression expression, AbstractBounds<PartitionPosition> keyRange)
{
int startShard = boundaries.getShardForToken(keyRange.left.getToken());
int endShard = getEndShardForBounds(keyRange);
long count = 0;
for (int shard = startShard; shard <= endShard; ++shard)
// Control how many shards are taken for estimating the number of keys matching the query expression.
// Shards are taken until we reach at least MIN_MATCHING_ROWS matching rows
// or the total number of indexed rows in all the shards we considered reaches MIN_INDEXED_ROWS.
// Those constants do not affect query correctness, and can be safely to set to any value.
// They navigate the tradeoff between the cardinality estimation speed and accuracy. The higher the values are,
// the more shards will be considered for the estimation, which will increase accuracy but also
// increase the time it takes to estimate.
// If set to MAX_VALUE, all shards will be considered.
// If set to 0, only the first shard will be considered.
final int MIN_MATCHING_ROWS = 100;
final int MIN_INDEXED_ROWS = 100000;

long matchingRows = 0;
long indexedRows = 0;
int processedShards = 0;

for (int shard = 0; shard < shardCount(); ++shard)
{
assert rangeIndexes[shard] != null;
count += rangeIndexes[shard].estimateMatchingRowsCount(expression, keyRange);
matchingRows += rangeIndexes[shard].estimateMatchingRowsCount(expression);
indexedRows += rangeIndexes[shard].indexedRows();
processedShards++;

if (matchingRows >= MIN_MATCHING_ROWS)
break;
if (indexedRows >= MIN_INDEXED_ROWS)
break;
}
return count;

assert processedShards >= 1 : "Must process at least one shard for estimating matching rows count";
return Math.round(matchingRows * (double) (shardCount()) / processedShards);
}

@Override
Expand Down
Loading
Loading