Merge pull request ClickHouse#97231 from ClickHouse/fix_collection_for_selective_filters_

nickitat · web-flow · commit 75c0826b5909 · 2026-02-23T21:03:16.000Z
Fix AutoPR statistics collection for highly selective prewhere
diff --git a/src/Processors/QueryPlan/Optimizations/RuntimeDataflowStatistics.cpp b/src/Processors/QueryPlan/Optimizations/RuntimeDataflowStatistics.cpp
@@ -219,7 +219,8 @@ void RuntimeDataflowStatisticsCacheUpdater::recordInputColumns(
     const ColumnsWithTypeAndName & input_columns,
     const NamesAndTypesList & part_columns,
     const ColumnSizeByName & column_sizes,
-    size_t read_bytes)
+    size_t read_bytes,
+    std::optional<bool> & should_continue_sampling)
 {
     Stopwatch watch;
 
@@ -252,8 +253,11 @@ void RuntimeDataflowStatisticsCacheUpdater::recordInputColumns(
         }
         else
         {
+            if (!should_continue_sampling.has_value())
+                should_continue_sampling = shouldSampleBlock(statistics, input_columns[0].column->size());
+
             // We don't have individual column size info, likely because it is a compact part. Let's try to estimate it.
-            if (shouldSampleBlock(statistics, input_columns[0].column->size()))
+            if (*should_continue_sampling)
             {
                 for (const auto & column : input_columns)
                 {
diff --git a/src/Processors/QueryPlan/Optimizations/RuntimeDataflowStatistics.h b/src/Processors/QueryPlan/Optimizations/RuntimeDataflowStatistics.h
@@ -12,6 +12,7 @@
 #include <cstddef>
 #include <memory>
 #include <mutex>
+#include <optional>
 
 namespace DB
 {
@@ -93,11 +94,16 @@ class RuntimeDataflowStatisticsCacheUpdater
 
     void recordAggregationKeySizes(const Aggregator & aggregator, const Block & block);
 
+    /// Updates should_continue_sampling to true if the current read block is chosen for sampling.
+    /// This is needed because, due to prewhere, each block is generally read in multiple steps.
+    /// If the first part of the block was chosen for sampling, we want to record statistics for the whole block in later steps,
+    /// so should_continue_sampling remains true for subsequent calls for the same logical block.
     void recordInputColumns(
         const ColumnsWithTypeAndName & input_columns,
         const NamesAndTypesList & part_columns,
         const ColumnSizeByName & column_sizes,
-        size_t read_bytes);
+        size_t read_bytes,
+        std::optional<bool> & should_continue_sampling);
 
     void markUnsupportedCase() { unsupported_case.store(true, std::memory_order_relaxed); }
 
diff --git a/src/Storages/MergeTree/IMergeTreeReader.h b/src/Storages/MergeTree/IMergeTreeReader.h
@@ -71,6 +71,11 @@ class IMergeTreeReader : private boost::noncopyable
     ALWAYS_INLINE const NamesAndTypesList & getColumns() const { return data_part_info_for_read->isWidePart() ? converted_requested_columns : original_requested_columns; }
     size_t numColumnsInResult() const { return getColumns().size(); }
 
+    /// Returns column names and types as they are stored on disk (may differ from requested types
+    /// when there are pending type-changing mutations). Used to build correct `ColumnsWithTypeAndName`
+    /// before `performRequiredConversions` is applied.
+    const NamesAndTypes & getColumnsToRead() const { return columns_to_read; }
+
     size_t getFirstMarkToRead() const { return all_mark_ranges.front().begin; }
 
     MergeTreeDataPartInfoForReaderPtr data_part_info_for_read;
diff --git a/src/Storages/MergeTree/MergeTreeReadTask.cpp b/src/Storages/MergeTree/MergeTreeReadTask.cpp
@@ -82,6 +82,17 @@ MergeTreeReadTask::MergeTreeReadTask(
     , size_predictor(std::move(size_predictor_))
     , updater(std::move(updater_))
 {
+    if (updater)
+    {
+        dataflow_cache_update_cb
+            = [&](const ColumnsWithTypeAndName & columns, size_t read_bytes, std::optional<bool> & should_continue_sampling) -> void
+        {
+            chassert(updater);
+            const auto & part_columns = info->data_part->getColumns();
+            const auto & column_sizes = info->data_part->getColumnSizes();
+            updater->recordInputColumns(columns, part_columns, column_sizes, read_bytes, should_continue_sampling);
+        };
+    }
 }
 
 /// Returns pointer to the index if all columns in the read step belongs to the read step for that index.
@@ -356,7 +367,7 @@ MergeTreeReadTask::BlockAndProgress MergeTreeReadTask::read()
     UInt64 recommended_rows = estimateNumRows();
     UInt64 rows_to_read = std::max(static_cast<UInt64>(1), std::min(block_size_params.max_block_size_rows, recommended_rows));
 
-    auto read_result = readers_chain.read(rows_to_read, mark_ranges, patches_mark_ranges);
+    auto read_result = readers_chain.read(rows_to_read, mark_ranges, patches_mark_ranges, dataflow_cache_update_cb);
 
     /// All rows were filtered. Repeat.
     if (read_result.num_rows == 0)
@@ -399,10 +410,6 @@ MergeTreeReadTask::BlockAndProgress MergeTreeReadTask::read()
         block = sample_block.cloneWithColumns(read_result.columns);
     }
 
-    if (updater)
-        updater->recordInputColumns(
-            block.getColumnsWithTypeAndName(), info->data_part->getColumns(), info->data_part->getColumnSizes(), num_read_bytes);
-
     BlockAndProgress res = {
         .block = std::move(block),
         .read_mark_ranges = read_result.read_mark_ranges,
diff --git a/src/Storages/MergeTree/MergeTreeReadTask.h b/src/Storages/MergeTree/MergeTreeReadTask.h
@@ -212,6 +212,9 @@ struct MergeTreeReadTask : private boost::noncopyable
         const ReadStepsPerformanceCounters & read_steps_performance_counters);
 
 private:
+    using DataflowCacheUpdateCallback
+        = std::function<void(const ColumnsWithTypeAndName & columns, size_t read_bytes, std::optional<bool> & should_continue_sampling)>;
+
     UInt64 estimateNumRows() const;
 
     /// Shared information required for reading.
@@ -239,6 +242,7 @@ struct MergeTreeReadTask : private boost::noncopyable
     MergeTreeBlockSizePredictorPtr size_predictor;
 
     RuntimeDataflowStatisticsCacheUpdaterPtr updater;
+    DataflowCacheUpdateCallback dataflow_cache_update_cb;
 };
 
 using MergeTreeReadTaskPtr = std::unique_ptr<MergeTreeReadTask>;
diff --git a/src/Storages/MergeTree/MergeTreeReadersChain.cpp b/src/Storages/MergeTree/MergeTreeReadersChain.cpp
@@ -1,7 +1,7 @@
-#include <Storages/MergeTree/MergeTreeReadersChain.h>
 #include <Storages/MergeTree/IMergeTreeReader.h>
-#include <Common/logger_useful.h>
+#include <Storages/MergeTree/MergeTreeReadersChain.h>
 #include <Storages/MergeTree/PatchParts/PatchPartsUtils.h>
+#include <Common/logger_useful.h>
 
 namespace DB
 {
@@ -56,7 +56,33 @@ static std::optional<UInt64> getMaxPatchVersionForStep(const MergeTreeRangeReade
     return prewhere_info ? prewhere_info->mutation_version : std::nullopt;
 }
 
-MergeTreeReadersChain::ReadResult MergeTreeReadersChain::read(size_t max_rows, MarkRanges & ranges, std::vector<MarkRanges> & patch_ranges)
+/// Builds `ColumnsWithTypeAndName` using the on-disk column descriptions (from `IMergeTreeReader::getColumnsToRead`).
+/// This is important when columns have not yet been converted, i.e. their types may differ from those contained in `getReadSampleBlock`.
+static ColumnsWithTypeAndName toColumnsWithTypeAndName(const Columns & columns, const NamesAndTypes & on_disk_columns)
+{
+    if (columns.size() != on_disk_columns.size())
+        throw Exception(
+            ErrorCodes::LOGICAL_ERROR,
+            "Number of columns doesn't match number of on-disk columns, columns size: {}, on_disk_columns size: {}",
+            columns.size(),
+            on_disk_columns.size());
+
+    ColumnsWithTypeAndName res;
+    res.reserve(columns.size());
+    for (size_t i = 0; i < columns.size(); ++i)
+    {
+        /// Columns might be null, e.g. not yet filled by `fillMissingColumns`
+        if (columns[i])
+            res.emplace_back(columns[i], on_disk_columns[i].type, on_disk_columns[i].name);
+    }
+    return res;
+}
+
+MergeTreeReadersChain::ReadResult MergeTreeReadersChain::read(
+    size_t max_rows,
+    MarkRanges & ranges,
+    std::vector<MarkRanges> & patch_ranges,
+    const DataflowCacheUpdateCallback & dataflow_cache_update_cb)
 {
     if (max_rows == 0)
         throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected at least 1 row to read, got 0.");
@@ -79,10 +105,18 @@ MergeTreeReadersChain::ReadResult MergeTreeReadersChain::read(size_t max_rows, M
         throw;
     }
 
+    std::optional<bool> should_continue_sampling;
     if (read_result.num_rows != 0)
     {
         first_reader.getReader()->fillVirtualColumns(read_result.columns, read_result.num_rows);
         readPatches(first_reader.getReadSampleBlock(), patch_ranges, read_result);
+
+        if (dataflow_cache_update_cb)
+            dataflow_cache_update_cb(
+                toColumnsWithTypeAndName(read_result.columns, first_reader.getReader()->getColumnsToRead()),
+                read_result.num_bytes_read,
+                should_continue_sampling);
+
         executeActionsBeforePrewhere(read_result, read_result.columns, first_reader, {}, read_result.num_rows);
 
         executePrewhereActions(first_reader, read_result, {}, range_readers.size() == 1);
@@ -91,6 +125,7 @@ MergeTreeReadersChain::ReadResult MergeTreeReadersChain::read(size_t max_rows, M
 
     for (size_t i = 1; i < range_readers.size(); ++i)
     {
+        const size_t num_bytes_read_so_far = read_result.num_bytes_read;
         size_t num_read_rows = 0;
         auto columns = range_readers[i].continueReadingChain(read_result, num_read_rows);
 
@@ -109,6 +144,17 @@ MergeTreeReadersChain::ReadResult MergeTreeReadersChain::read(size_t max_rows, M
             if (num_read_rows == 0)
                 num_read_rows = read_result.num_rows;
 
+            if (dataflow_cache_update_cb)
+            {
+                chassert(read_result.num_bytes_read >= num_bytes_read_so_far);
+                // It is important that we call `recordInputColumns` here even if `should_continue_sampling`
+                // is already set to false, because we still need to update the total bytes seen.
+                dataflow_cache_update_cb(
+                    toColumnsWithTypeAndName(columns, range_readers[i].getReader()->getColumnsToRead()),
+                    read_result.num_bytes_read - num_bytes_read_so_far,
+                    should_continue_sampling);
+            }
+
             executeActionsBeforePrewhere(read_result, columns, range_readers[i], previous_header, num_read_rows);
             read_result.columns.insert(read_result.columns.end(), columns.begin(), columns.end());
         }
diff --git a/src/Storages/MergeTree/MergeTreeReadersChain.h b/src/Storages/MergeTree/MergeTreeReadersChain.h
@@ -2,6 +2,8 @@
 #include <Storages/MergeTree/MergeTreeRangeReader.h>
 #include <Storages/MergeTree/PatchParts/MergeTreePatchReader.h>
 
+#include <functional>
+
 namespace DB
 {
 
@@ -30,13 +32,18 @@ using ColumnsForPatches = std::vector<ColumnsForPatch>;
 
 class MergeTreeReadersChain
 {
+    using DataflowCacheUpdateCallback
+        = std::function<void(const ColumnsWithTypeAndName & columns, size_t read_bytes, std::optional<bool> & should_continue_sampling)>;
+
 public:
     MergeTreeReadersChain() = default;
     MergeTreeReadersChain(RangeReaders range_readers_, MergeTreePatchReaders patch_readers_);
     bool isInitialized() const { return is_initialized; }
 
     using ReadResult = MergeTreeRangeReader::ReadResult;
-    ReadResult read(size_t max_rows, MarkRanges & ranges, std::vector<MarkRanges> & patch_ranges);
+
+    ReadResult
+    read(size_t max_rows, MarkRanges & ranges, std::vector<MarkRanges> & patch_ranges, const DataflowCacheUpdateCallback & update_cb = {});
 
     size_t numReadRowsInCurrentGranule() const;
     size_t numPendingRowsInCurrentGranule() const;
diff --git a/tests/queries/0_stateless/03801_autopr_input_bytes_estimation_query_with_subqueries.reference b/tests/queries/0_stateless/03801_autopr_input_bytes_estimation_query_with_subqueries.reference
diff --git a/tests/queries/0_stateless/03801_autopr_input_bytes_estimation_query_with_subqueries.sql b/tests/queries/0_stateless/03801_autopr_input_bytes_estimation_query_with_subqueries.sql
@@ -0,0 +1,49 @@
+-- Tags: stateful
+
+-- To avoid too slow test execution
+set remote_filesystem_read_method='threadpool', allow_prefetched_read_pool_for_remote_filesystem=1, filesystem_prefetch_step_marks=0, filesystem_prefetch_step_bytes='100Mi';
+
+SET enable_parallel_replicas=0, automatic_parallel_replicas_mode=2, parallel_replicas_local_plan=1, parallel_replicas_index_analysis_only_on_coordinator=1,
+    parallel_replicas_for_non_replicated_merge_tree=1, max_parallel_replicas=3, cluster_for_parallel_replicas='parallel_replicas';
+
+-- External aggregation is not supported as of now
+SET max_bytes_before_external_group_by=0, max_bytes_ratio_before_external_group_by=0;
+
+SET use_query_condition_cache=0;
+
+create table t(a UInt64) engine=MergeTree order by a;
+insert into t select number from numbers_mt(1e6);
+
+SELECT a % 10000 FROM t FORMAT Null SETTINGS log_comment='03801_autopr_input_bytes_estimation_query_with_subqueries_query_0';
+
+-- `CounterID` is part of the PK
+SELECT EventTime, CounterID, URL, Referer FROM test.hits WHERE CounterID IN (SELECT a % 10000 FROM t) FORMAT Null SETTINGS log_comment='03801_autopr_input_bytes_estimation_query_with_subqueries_query_1';
+-- `WatchID` is not
+SELECT EventTime, CounterID, URL, Referer FROM test.hits WHERE WatchID IN (SELECT a % 10000 FROM t) FORMAT Null SETTINGS log_comment='03801_autopr_input_bytes_estimation_query_with_subqueries_query_2';
+
+SET enable_parallel_replicas=0, automatic_parallel_replicas_mode=0;
+
+SYSTEM FLUSH LOGS query_log;
+
+-- Just checking that the estimation is not too far off
+--
+-- We subtract the compressed bytes of the subquery because it cannot be executed with parallel replicas in the current infrastructure,
+-- so the "parallelizable" part of the query is only the main query itself, thus AutoPR heuristic should use only its estimation.
+WITH (
+    SELECT
+        ProfileEvents['ReadCompressedBytes']
+    FROM system.query_log
+    WHERE (event_date >= yesterday()) AND (event_time >= NOW() - INTERVAL '15 MINUTES') AND (current_database = currentDatabase()) AND (log_comment = '03801_autopr_input_bytes_estimation_query_with_subqueries_query_0') AND (type = 'QueryFinish')
+    ORDER BY event_time_microseconds
+) AS compressed_bytes_subquery
+SELECT format('{} {} {}', log_comment, compressed_bytes, statistics_input_bytes)
+FROM (
+    SELECT
+        log_comment,
+        ProfileEvents['ReadCompressedBytes'] - compressed_bytes_subquery AS compressed_bytes,
+        ProfileEvents['RuntimeDataflowStatisticsInputBytes']::Int64 statistics_input_bytes
+    FROM system.query_log
+    WHERE (event_date >= yesterday()) AND (event_time >= NOW() - INTERVAL '15 MINUTES') AND (current_database = currentDatabase()) AND (match(log_comment, '03801_autopr_input_bytes_estimation_query_with_subqueries_query_[12]')) AND (type = 'QueryFinish')
+    ORDER BY event_time_microseconds
+)
+WHERE greatest(compressed_bytes, statistics_input_bytes) / least(compressed_bytes, statistics_input_bytes) > 2;
diff --git a/tests/queries/0_stateless/03927_autopr_input_bytes_estimation_prewhere_filter.reference b/tests/queries/0_stateless/03927_autopr_input_bytes_estimation_prewhere_filter.reference
diff --git a/tests/queries/0_stateless/03927_autopr_input_bytes_estimation_prewhere_filter.sql b/tests/queries/0_stateless/03927_autopr_input_bytes_estimation_prewhere_filter.sql
@@ -0,0 +1,44 @@
+-- Tags: stateful, long
+
+SET use_uncompressed_cache=0;
+
+SET enable_parallel_replicas=0, automatic_parallel_replicas_mode=2, parallel_replicas_local_plan=1, parallel_replicas_index_analysis_only_on_coordinator=1,
+    parallel_replicas_for_non_replicated_merge_tree=1, max_parallel_replicas=3, cluster_for_parallel_replicas='parallel_replicas';
+
+-- Reading of aggregation states from disk will affect `ReadCompressedBytes`
+SET max_bytes_before_external_group_by=0, max_bytes_ratio_before_external_group_by=0;
+
+-- To avoid too slow test execution
+set remote_filesystem_read_method='threadpool', allow_prefetched_read_pool_for_remote_filesystem=1, filesystem_prefetch_step_marks=0, filesystem_prefetch_step_bytes='100Mi';
+
+-- Override randomized max_threads to avoid timeout on slow builds (ASan)
+SET max_threads=0;
+
+SELECT URL FROM test.hits WHERE WatchID < 4611892230367380000 FORMAT Null SETTINGS log_comment='03927_autopr_input_bytes_estimation_prewhere_filter_0';
+SELECT URL FROM test.hits WHERE WatchID < 5550265976347679000 FORMAT Null SETTINGS log_comment='03927_autopr_input_bytes_estimation_prewhere_filter_1';
+SELECT URL FROM test.hits WHERE WatchID < 6509275139329711000 FORMAT Null SETTINGS log_comment='03927_autopr_input_bytes_estimation_prewhere_filter_2';
+
+SELECT CounterID, URL FROM test.hits WHERE WatchID < 4611892230367380000 FORMAT Null SETTINGS log_comment='03927_autopr_input_bytes_estimation_prewhere_filter_10';
+SELECT CounterID, URL FROM test.hits WHERE WatchID < 5550265976347679000 FORMAT Null SETTINGS log_comment='03927_autopr_input_bytes_estimation_prewhere_filter_11';
+SELECT CounterID, URL FROM test.hits WHERE WatchID < 6509275139329711000 FORMAT Null SETTINGS log_comment='03927_autopr_input_bytes_estimation_prewhere_filter_12';
+
+SELECT CounterID, URL, Referer FROM test.hits WHERE WatchID < 4611892230367380000 FORMAT Null SETTINGS log_comment='03927_autopr_input_bytes_estimation_prewhere_filter_20';
+SELECT CounterID, URL, Referer FROM test.hits WHERE WatchID < 5550265976347679000 FORMAT Null SETTINGS log_comment='03927_autopr_input_bytes_estimation_prewhere_filter_21';
+SELECT CounterID, URL, Referer FROM test.hits WHERE WatchID < 6509275139329711000 FORMAT Null SETTINGS log_comment='03927_autopr_input_bytes_estimation_prewhere_filter_22';
+
+SET enable_parallel_replicas=0, automatic_parallel_replicas_mode=0;
+
+SYSTEM FLUSH LOGS query_log;
+
+-- Just checking that the estimation is not too far off
+SELECT format('{} {} {}', log_comment, compressed_bytes, statistics_input_bytes)
+FROM (
+    SELECT
+        log_comment,
+        ProfileEvents['ReadCompressedBytes'] compressed_bytes,
+        ProfileEvents['RuntimeDataflowStatisticsInputBytes'] statistics_input_bytes
+    FROM system.query_log
+    WHERE (event_date >= yesterday()) AND (event_time >= NOW() - INTERVAL '15 MINUTES') AND (current_database = currentDatabase()) AND (log_comment LIKE '03927_autopr_input_bytes_estimation_prewhere_filter_%') AND (type = 'QueryFinish')
+    ORDER BY event_time_microseconds
+)
+WHERE greatest(compressed_bytes, statistics_input_bytes) / least(compressed_bytes, statistics_input_bytes) > 2;

Original file line number	Diff line number	Diff line change
`@@ -219,7 +219,8 @@ void RuntimeDataflowStatisticsCacheUpdater::recordInputColumns(`
`219`	`219`	`const ColumnsWithTypeAndName & input_columns,`
`220`	`220`	`const NamesAndTypesList & part_columns,`
`221`	`221`	`const ColumnSizeByName & column_sizes,`
`222`		`- size_t read_bytes)`
	`222`	`+ size_t read_bytes,`
	`223`	`+ std::optional<bool> & should_continue_sampling)`
`223`	`224`	`{`
`224`	`225`	`Stopwatch watch;`
`225`	`226`
`@@ -252,8 +253,11 @@ void RuntimeDataflowStatisticsCacheUpdater::recordInputColumns(`
`252`	`253`	`}`
`253`	`254`	`else`
`254`	`255`	`{`
	`256`	`+ if (!should_continue_sampling.has_value())`
	`257`	`+ should_continue_sampling = shouldSampleBlock(statistics, input_columns[0].column->size());`
	`258`	`+`
`255`	`259`	`// We don't have individual column size info, likely because it is a compact part. Let's try to estimate it.`
`256`		`- if (shouldSampleBlock(statistics, input_columns[0].column->size()))`
	`260`	`+ if (*should_continue_sampling)`
`257`	`261`	`{`
`258`	`262`	`for (const auto & column : input_columns)`
`259`	`263`	`{`