datastax · driftx · Apr 21, 2026 · Jan 7, 2026
diff --git a/src/java/org/apache/cassandra/db/filter/RowFilter.java b/src/java/org/apache/cassandra/db/filter/RowFilter.java
@@ -149,6 +149,14 @@ public boolean hasANN()
         return false;
     }
 
+    /**
+     * Returns a copy of this {@link RowFilter} with ordering expressions removed
+     */
+    public RowFilter withoutOrderingExpressions()
+    {
+        return restrict(e -> !e.isOrderingExpression());
+    }
+
     /**
      * @return the {@link ANNOptions} of the ANN expression in this filter, or {@link ANNOptions#NONE} if there is
      * no ANN expression.

diff --git a/src/java/org/apache/cassandra/index/sai/SSTableIndex.java b/src/java/org/apache/cassandra/index/sai/SSTableIndex.java
@@ -154,9 +154,9 @@ public long getApproximateTermCount()
      *
      * @return an approximate number of the matching rows
      */
-    public long estimateMatchingRowsCount(Expression predicate, AbstractBounds<PartitionPosition> keyRange)
+    public long estimateMatchingRowsCount(Expression predicate)
     {
-        return searchableIndex.estimateMatchingRowsCount(predicate, keyRange);
+        return searchableIndex.estimateMatchingRowsCount(predicate);
     }
 
     /**

diff --git a/src/java/org/apache/cassandra/index/sai/disk/EmptyIndex.java b/src/java/org/apache/cassandra/index/sai/disk/EmptyIndex.java
@@ -129,7 +129,7 @@ public void populateSystemView(SimpleDataSet dataSet, SSTableReader sstable)
     }
 
     @Override
-    public long estimateMatchingRowsCount(Expression predicate, AbstractBounds<PartitionPosition> keyRange)
+    public long estimateMatchingRowsCount(Expression predicate)
     {
         return 0;
     }

diff --git a/src/java/org/apache/cassandra/index/sai/disk/SearchableIndex.java b/src/java/org/apache/cassandra/index/sai/disk/SearchableIndex.java
@@ -88,5 +88,5 @@ List<CloseableIterator<PrimaryKeyWithSortKey>> orderResultsBy(QueryContext conte
 
     void populateSystemView(SimpleDataSet dataSet, SSTableReader sstable);
 
-    long estimateMatchingRowsCount(Expression predicate, AbstractBounds<PartitionPosition> keyRange);
+    long estimateMatchingRowsCount(Expression predicate);
 }
diff --git a/src/java/org/apache/cassandra/index/sai/disk/v1/Segment.java b/src/java/org/apache/cassandra/index/sai/disk/v1/Segment.java
@@ -234,7 +234,7 @@ public int proportionalAnnLimit(int limit, long totalRows)
         return proportionalLimit;
     }
 
-    public long estimateMatchingRowsCount(Expression predicate, AbstractBounds<PartitionPosition> keyRange)
+    public long estimateMatchingRowsCount(Expression predicate)
     {
         return metadata.estimateNumRowsMatching(predicate);
     }

diff --git a/src/java/org/apache/cassandra/index/sai/disk/v1/V1SearchableIndex.java b/src/java/org/apache/cassandra/index/sai/disk/v1/V1SearchableIndex.java
@@ -289,12 +289,12 @@ public void populateSystemView(SimpleDataSet dataset, SSTableReader sstable)
     }
 
     @Override
-    public long estimateMatchingRowsCount(Expression predicate, AbstractBounds<PartitionPosition> keyRange)
+    public long estimateMatchingRowsCount(Expression predicate)
     {
         long rowCount = 0;
         for (Segment segment: segments)
         {
-            long c = segment.estimateMatchingRowsCount(predicate, keyRange);
+            long c = segment.estimateMatchingRowsCount(predicate);
             assert c >= 0 : "Estimated row count must not be negative: " + c + " (predicate: " + predicate + ')';
             rowCount += c;
         }

diff --git a/src/java/org/apache/cassandra/index/sai/disk/vector/VectorMemtableIndex.java b/src/java/org/apache/cassandra/index/sai/disk/vector/VectorMemtableIndex.java
@@ -217,18 +217,12 @@ public KeyRangeIterator search(QueryContext context, Expression expr, AbstractBo
     }
 
     @Override
-    public long estimateMatchingRowsCountUsingFirstShard(Expression expression, AbstractBounds<PartitionPosition> keyRange)
+    public long estimateMatchingRowsCount(Expression expression)
     {
         // For BOUNDED_ANN we use the old way of estimating cardinality - by running the search.
         throw new UnsupportedOperationException("Cardinality estimation not supported by vector indexes");
     }
 
-    @Override
-    public long estimateMatchingRowsCountUsingAllShards(Expression expression, AbstractBounds<PartitionPosition> keyRange)
-    {
-        throw new UnsupportedOperationException("Cardinality estimation not supported by vector indexes");
-    }
-
     @Override
     public List<CloseableIterator<PrimaryKeyWithSortKey>> orderBy(QueryContext context,
                                                                   Orderer orderer,

diff --git a/src/java/org/apache/cassandra/index/sai/memory/MemoryIndex.java b/src/java/org/apache/cassandra/index/sai/memory/MemoryIndex.java
@@ -78,7 +78,7 @@ public abstract void update(DecoratedKey key,
 
     public abstract KeyRangeIterator search(Expression expression, AbstractBounds<PartitionPosition> keyRange);
 
-    public abstract long estimateMatchingRowsCount(Expression expression, AbstractBounds<PartitionPosition> keyRange);
+    public abstract long estimateMatchingRowsCount(Expression expression);
 
     public abstract ByteBuffer getMinTerm();
 

diff --git a/src/java/org/apache/cassandra/index/sai/memory/MemtableIndex.java b/src/java/org/apache/cassandra/index/sai/memory/MemtableIndex.java
@@ -77,25 +77,14 @@ public interface MemtableIndex extends MemtableOrdering
 
     /**
      * Estimates the number of rows that would be returned by this index given the predicate.
-     * It is extrapolated from the first shard.
-     * Note that this is not a guarantee of the number of rows that will actually be returned.
+     * The estimate is intended for query plan optimization and is
+     * not guaranteed to be very accurate, but should be usually at the right
+     * level of magnitude. Being off by +/-50% is not a bug.
      *
      * @param expression predicate to match
-     * @param keyRange   the key range to search within
      * @return an approximate number of the matching rows
      */
-    long estimateMatchingRowsCountUsingFirstShard(Expression expression, AbstractBounds<PartitionPosition> keyRange);
-
-    /**
-     * Estimates the number of rows that would be returned by this index given the predicate.
-     * It estimates from all relevant shards individually.
-     * Note that this is not a guarantee of the number of rows that will actually be returned.
-     *
-     * @param expression predicate to match
-     * @param keyRange   the key range to search within
-     * @return an estimated number of the matching rows
-     */
-    long estimateMatchingRowsCountUsingAllShards(Expression expression, AbstractBounds<PartitionPosition> keyRange);
+    long estimateMatchingRowsCount(Expression expression);
 
     Iterator<Pair<ByteComparable.Preencoded, List<MemoryIndex.PkWithFrequency>>> iterator(DecoratedKey min, DecoratedKey max);
 

diff --git a/src/java/org/apache/cassandra/index/sai/memory/TrieMemoryIndex.java b/src/java/org/apache/cassandra/index/sai/memory/TrieMemoryIndex.java
@@ -718,7 +718,7 @@ private KeyRangeIterator rangeMatch(Expression expression, AbstractBounds<Partit
     }
 
     @Override
-    public long estimateMatchingRowsCount(Expression expression, AbstractBounds<PartitionPosition> keyRange)
+    public long estimateMatchingRowsCount(Expression expression)
     {
         switch (expression.getOp())
         {

diff --git a/src/java/org/apache/cassandra/index/sai/memory/TrieMemtableIndex.java b/src/java/org/apache/cassandra/index/sai/memory/TrieMemtableIndex.java
@@ -365,57 +365,54 @@ public List<CloseableIterator<PrimaryKeyWithSortKey>> orderBy(QueryContext query
     }
 
     /**
-     * Estimates the number of matching rows by extrapolating from the first shard only.
-     * This method provides a fast approximation by calculating the matching row count for the first
-     * shard in the key range and then multiplying by the total number of shards involved.
-     *
-     * <p>This approach assumes that data is uniformly distributed across shards, which may not
+     * Estimates the number of rows matching the given query predicate.
+     * <p>
+     * This method provides a fast approximation by calculating the matching row count from a subset of shards.
+     * The number of shards taken for the computation is determined dynamically depending on the number of indexed
+     * and matching rows – the more rows found in the shard, the fewer shards are needed to get a reliable estimate.
+     * <p>
+     * This approach assumes that data is uniformly distributed across shards, which may not
      * always be accurate but provides a quick estimate with minimal computational overhead.
      * The estimate is particularly useful for query planning and optimization decisions where
      * speed is more important than precision.</p>
      *
      * @param expression the search expression/predicate to match against indexed terms
-     * @param keyRange   the partition key range to search within, used to determine which shards to consider
-     * @return an estimated number of matching rows extrapolated from the first shard;
-     * @see #estimateMatchingRowsCountUsingAllShards(Expression, AbstractBounds) for a more accurate but slower alternative
+     * @return an estimated number of matching rows extrapolated from the first few shards;
      */
     @Override
-    public long estimateMatchingRowsCountUsingFirstShard(Expression expression, AbstractBounds<PartitionPosition> keyRange)
+    public long estimateMatchingRowsCount(Expression expression)
     {
-        int startShard = boundaries.getShardForToken(keyRange.left.getToken());
-        int endShard = getEndShardForBounds(keyRange);
-        return rangeIndexes[startShard].estimateMatchingRowsCount(expression, keyRange) * (endShard - startShard + 1);
-    }
-
-    /**
-     * Estimates the number of matching rows by querying all relevant shards individually.
-     * This method provides a more accurate estimate compared to the first-shard extrapolation
-     * approach by actually calculating the matching row count for each shard that intersects
-     * with the given key range and summing the results.
-     *
-     * <p>This approach accounts for non-uniform data distribution across shards and provides
-     * a more precise estimate at the cost of increased computational overhead. Each shard's
-     * memory index is consulted to determine how many rows would match the given expression
-     * within the specified key range.</p>
-     *
-     * @param expression the search expression/predicate to match against indexed terms
-     * @param keyRange the partition key range to search within, used to determine which shards to query
-     * @return the sum of estimated matching rows from all relevant shards;
-     *
-     * @see #estimateMatchingRowsCountUsingFirstShard(Expression, AbstractBounds) for a faster but less accurate alternative
-     */
-    @Override
-    public long estimateMatchingRowsCountUsingAllShards(Expression expression, AbstractBounds<PartitionPosition> keyRange)
-    {
-        int startShard = boundaries.getShardForToken(keyRange.left.getToken());
-        int endShard = getEndShardForBounds(keyRange);
-        long count = 0;
-        for (int shard = startShard; shard <= endShard; ++shard)
+        // Control how many shards are taken for estimating the number of keys matching the query expression.
+        // Shards are taken until we reach at least MIN_MATCHING_ROWS matching rows
+        // or the total number of indexed rows in all the shards we considered reaches MIN_INDEXED_ROWS.
+        // Those constants do not affect query correctness, and can be safely to set to any value.
+        // They navigate the tradeoff between the cardinality estimation speed and accuracy. The higher the values are,
+        // the more shards will be considered for the estimation, which will increase accuracy but also
+        // increase the time it takes to estimate.
+        // If set to MAX_VALUE, all shards will be considered.
+        // If set to 0, only the first shard will be considered.
+        final int MIN_MATCHING_ROWS = 100;
+        final int MIN_INDEXED_ROWS = 100000;
+
+        long matchingRows = 0;
+        long indexedRows = 0;
+        int processedShards = 0;
+
+        for (int shard = 0; shard < shardCount(); ++shard)
         {
             assert rangeIndexes[shard] != null;
-            count += rangeIndexes[shard].estimateMatchingRowsCount(expression, keyRange);
+            matchingRows += rangeIndexes[shard].estimateMatchingRowsCount(expression);
+            indexedRows += rangeIndexes[shard].indexedRows();
+            processedShards++;
+
+            if (matchingRows >= MIN_MATCHING_ROWS)
+                break;
+            if (indexedRows >= MIN_INDEXED_ROWS)
+                break;
         }
-        return count;
+
+        assert processedShards >= 1 : "Must process at least one shard for estimating matching rows count";
+        return Math.round(matchingRows * (double) (shardCount()) / processedShards);
     }
 
     @Override