Skip to content

Commit 197c0dd

Browse files
authored
Merge pull request ClickHouse#87241 from CurtizJ/automatic-statistics
Allow to enable statistics on all columns
2 parents 58aa6db + c66230d commit 197c0dd

75 files changed

Lines changed: 1092 additions & 256 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

src/Core/SettingsChangesHistory.cpp

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -905,6 +905,7 @@ const VersionToSettingsChangesMap & getMergeTreeSettingsChangesHistory()
905905
{
906906
addSettingsChanges(merge_tree_settings_changes_history, "25.10",
907907
{
908+
{"auto_statistics_types", "", "", "New setting"},
908909
{"exclude_materialize_skip_indexes_on_merge", "", "", "New setting."},
909910
{"serialization_info_version", "default", "default", "New setting"},
910911
{"string_serialization_version", "default", "default", "New setting"},

src/Interpreters/InterpreterCreateQuery.cpp

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -488,7 +488,7 @@ ASTPtr InterpreterCreateQuery::formatColumns(const ColumnsDescription & columns)
488488
column_declaration->children.push_back(column_declaration->codec);
489489
}
490490

491-
if (!column.statistics.empty())
491+
if (column.statistics.hasExplicitStatistics())
492492
{
493493
column_declaration->statistics_desc = column.statistics.getAST();
494494
column_declaration->children.push_back(column_declaration->statistics_desc);
@@ -699,7 +699,8 @@ ColumnsDescription InterpreterCreateQuery::getColumnsDescription(
699699
if (!skip_checks && !context_->getSettingsRef()[Setting::allow_experimental_statistics])
700700
throw Exception(
701701
ErrorCodes::INCORRECT_QUERY, "Create table with statistics is now disabled. Turn on allow_experimental_statistics");
702-
column.statistics = ColumnStatisticsDescription::fromColumnDeclaration(col_decl, column.type);
702+
703+
column.statistics = ColumnStatisticsDescription::fromStatisticsDescriptionAST(col_decl.statistics_desc, column.name, column.type);
703704
}
704705

705706
if (col_decl.ttl)

src/Interpreters/MutationsInterpreter.cpp

Lines changed: 26 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -941,6 +941,7 @@ void MutationsInterpreter::prepare(bool dry_run)
941941
{
942942
if (!columns_desc.has(stat_column_name) || columns_desc.get(stat_column_name).statistics.empty())
943943
throw Exception(ErrorCodes::ILLEGAL_STATISTICS, "Unknown statistics column: {}", stat_column_name);
944+
944945
dependencies.emplace(stat_column_name, ColumnDependency::STATISTICS);
945946
materialized_statistics.emplace(stat_column_name);
946947
}
@@ -964,8 +965,20 @@ void MutationsInterpreter::prepare(bool dry_run)
964965
else if (command.type == MutationCommand::DROP_STATISTICS)
965966
{
966967
mutation_kind.set(MutationKind::MUTATE_INDEX_STATISTICS_PROJECTION);
967-
for (const auto & stat_column_name: command.statistics_columns)
968-
materialized_statistics.erase(stat_column_name);
968+
969+
if (command.clear && command.statistics_columns.empty())
970+
{
971+
for (const auto & column_desc : columns_desc)
972+
{
973+
if (!column_desc.statistics.empty())
974+
materialized_statistics.erase(column_desc.name);
975+
}
976+
}
977+
else
978+
{
979+
for (const auto & stat_column_name: command.statistics_columns)
980+
materialized_statistics.erase(stat_column_name);
981+
}
969982
}
970983
else if (command.type == MutationCommand::DROP_PROJECTION)
971984
{
@@ -1049,6 +1062,8 @@ void MutationsInterpreter::prepare(bool dry_run)
10491062
{
10501063
mutation_kind.set(MutationKind::MUTATE_OTHER);
10511064
read_columns.emplace_back(command.column_name);
1065+
materialized_statistics.insert(command.column_name);
1066+
10521067
/// Check if the type of this column is changed and there are projections that
10531068
/// have this column in the primary key. We should rebuild such projections.
10541069
if (const auto & merge_tree_data_part = source.getMergeTreeDataPart())
@@ -1190,6 +1205,15 @@ void MutationsInterpreter::prepare(bool dry_run)
11901205
materialized_projections.insert(projection.name);
11911206
}
11921207

1208+
for (const auto & column : metadata_snapshot->getColumns())
1209+
{
1210+
if (column.statistics.empty())
1211+
continue;
1212+
1213+
if (updated_columns.contains(column.name) || changed_columns.contains(column.name))
1214+
materialized_statistics.insert(column.name);
1215+
}
1216+
11931217
/// Stages might be empty when we materialize skip indices or projections which don't add any
11941218
/// column dependencies.
11951219
if (stages.empty())

src/Parsers/ASTAlterQuery.cpp

Lines changed: 6 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -228,7 +228,12 @@ void ASTAlterCommand::formatImpl(WriteBuffer & ostr, const FormatSettings & sett
228228
{
229229
ostr << (clear_statistics ? "CLEAR " : "DROP ") << "STATISTICS "
230230
<< (if_exists ? "IF EXISTS " : "");
231-
statistics_decl->format(ostr, settings, state, frame);
231+
232+
if (statistics_decl)
233+
statistics_decl->format(ostr, settings, state, frame);
234+
else
235+
ostr << " ALL";
236+
232237
if (partition)
233238
{
234239
ostr << " IN PARTITION ";

src/Parsers/ParserAlterQuery.cpp

Lines changed: 12 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -126,6 +126,7 @@ bool ParserAlterCommand::parseImpl(Pos & pos, ASTPtr & node, Expected & expected
126126
ParserKeyword s_remove_sample_by(Keyword::REMOVE_SAMPLE_BY);
127127
ParserKeyword s_apply_deleted_mask(Keyword::APPLY_DELETED_MASK);
128128
ParserKeyword s_apply_patches(Keyword::APPLY_PATCHES);
129+
ParserKeyword s_all(Keyword::ALL);
129130

130131
ParserToken parser_opening_round_bracket(TokenType::OpeningRoundBracket);
131132
ParserToken parser_closing_round_bracket(TokenType::ClosingRoundBracket);
@@ -413,20 +414,23 @@ bool ParserAlterCommand::parseImpl(Pos & pos, ASTPtr & node, Expected & expected
413414
}
414415
else if (s_clear_statistics.ignore(pos, expected))
415416
{
416-
if (s_if_exists.ignore(pos, expected))
417-
command->if_exists = true;
418-
419-
if (!parser_stat_decl_without_types.parse(pos, command_statistics_decl, expected))
420-
return false;
421-
422417
command->type = ASTAlterCommand::DROP_STATISTICS;
423418
command->clear_statistics = true;
424419
command->detach = false;
425420

426-
if (s_in_partition.ignore(pos, expected))
421+
if (!s_all.ignore(pos, expected))
427422
{
428-
if (!parser_partition.parse(pos, command_partition, expected))
423+
if (s_if_exists.ignore(pos, expected))
424+
command->if_exists = true;
425+
426+
if (!parser_stat_decl_without_types.parse(pos, command_statistics_decl, expected))
429427
return false;
428+
429+
if (s_in_partition.ignore(pos, expected))
430+
{
431+
if (!parser_partition.parse(pos, command_partition, expected))
432+
return false;
433+
}
430434
}
431435
}
432436
else if (s_materialize_statistics.ignore(pos, expected))

src/Storages/AlterCommands.cpp

Lines changed: 8 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -381,11 +381,16 @@ std::optional<AlterCommand> AlterCommand::parse(const ASTAlterCommand * command_
381381
{
382382
AlterCommand command;
383383
command.ast = command_ast->clone();
384-
command.statistics_decl = command_ast->statistics_decl->clone();
385384
command.type = AlterCommand::DROP_STATISTICS;
386-
const auto & ast_stat_decl = command_ast->statistics_decl->as<ASTStatisticsDeclaration &>();
387385

388-
command.statistics_columns = ast_stat_decl.getColumnNames();
386+
if (command_ast->statistics_decl)
387+
{
388+
command.statistics_decl = command_ast->statistics_decl->clone();
389+
390+
const auto & ast_stat_decl = command_ast->statistics_decl->as<ASTStatisticsDeclaration &>();
391+
command.statistics_columns = ast_stat_decl.getColumnNames();
392+
}
393+
389394
command.if_exists = command_ast->if_exists;
390395
command.clear = command_ast->clear_statistics;
391396

src/Storages/ColumnsDescription.cpp

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -156,7 +156,7 @@ void ColumnDescription::writeText(WriteBuffer & buf, IAST::FormatState & state,
156156
writeEscapedString(formatASTStateAware(*codec, state), buf);
157157
}
158158

159-
if (!statistics.empty())
159+
if (statistics.hasExplicitStatistics())
160160
{
161161
writeChar('\t', buf);
162162
writeEscapedString(formatASTStateAware(*statistics.getAST(), state), buf);
@@ -223,7 +223,7 @@ void ColumnDescription::readText(ReadBuffer & buf)
223223
settings = col_ast->settings->as<ASTSetQuery &>().changes;
224224

225225
if (col_ast->statistics_desc)
226-
statistics = ColumnStatisticsDescription::fromColumnDeclaration(*col_ast, type);
226+
statistics = ColumnStatisticsDescription::fromStatisticsDescriptionAST(col_ast->statistics_desc, name, type);
227227
}
228228
else
229229
throw Exception(ErrorCodes::CANNOT_PARSE_TEXT, "Cannot parse column description");

src/Storages/MergeTree/IMergeTreeDataPart.cpp

Lines changed: 37 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -882,8 +882,16 @@ ColumnsStatistics IMergeTreeDataPart::loadStatistics() const
882882
ColumnsStatistics result;
883883
for (auto & stat : total_statistics)
884884
{
885-
String file_name = stat->getFileName() + STATS_FILE_SUFFIX;
886-
String file_path = fs::path(getDataPartStorage().getRelativePath()) / file_name;
885+
String escaped_name = escapeForFileName(stat->getStatisticName());
886+
auto stream_name = getStreamNameOrHash(escaped_name, STATS_FILE_SUFFIX, checksums);
887+
888+
if (!stream_name.has_value())
889+
{
890+
LOG_INFO(storage.log, "File for statistics with name '{}' is not found", escaped_name);
891+
continue;
892+
}
893+
894+
String file_name = *stream_name + STATS_FILE_SUFFIX;
887895

888896
if (auto stat_file = readFileIfExists(file_name))
889897
{
@@ -892,11 +900,30 @@ ColumnsStatistics IMergeTreeDataPart::loadStatistics() const
892900
result.push_back(stat);
893901
}
894902
else
895-
LOG_INFO(storage.log, "Cannot find stats file {}", file_path);
903+
{
904+
String file_path = fs::path(getDataPartStorage().getRelativePath()) / file_name;
905+
LOG_INFO(storage.log, "Cannot read stats file {}", file_path);
906+
}
896907
}
897908
return result;
898909
}
899910

911+
/// Returns the per-column estimates built from this part's statistics.
///
/// The result is computed on the first call and cached in `estimates`; access to the cache
/// is serialized by `estimates_mutex`. The returned value is a copy of the cached map.
///
/// The cache is published only after every estimate has been collected. If `loadStatistics()`
/// or `getEstimate()` throws, `estimates` stays unset, so the next call retries the load
/// instead of returning an empty (never refilled) result.
Estimates IMergeTreeDataPart::getEstimates() const
{
    std::lock_guard lock(estimates_mutex);

    if (!estimates.has_value())
    {
        /// Build into a local first: assigning to the cache before loading would leave
        /// an engaged-but-empty cache behind if anything below throws.
        Estimates loaded_estimates;
        const auto statistics = loadStatistics();

        for (const auto & stat : statistics)
            loaded_estimates.emplace(stat->getColumnName(), stat->getEstimate());

        estimates = std::move(loaded_estimates);
    }

    return *estimates;
}
926+
900927
void IMergeTreeDataPart::loadColumnsChecksumsIndexes(bool require_columns_checksums, bool check_consistency, bool load_metadata_version)
901928
{
902929
/// Memory should not be limited during ATTACH TABLE query.
@@ -2566,13 +2593,14 @@ String IMergeTreeDataPart::getNewPartBlockID(std::string_view token) const
25662593

25672594
std::optional<String> IMergeTreeDataPart::getStreamNameOrHash(
25682595
const String & stream_name,
2596+
const String & extension,
25692597
const Checksums & checksums_)
25702598
{
2571-
if (checksums_.files.contains(stream_name + ".bin"))
2599+
if (checksums_.files.contains(stream_name + extension))
25722600
return stream_name;
25732601

25742602
auto hash = sipHash128String(stream_name);
2575-
if (checksums_.files.contains(hash + ".bin"))
2603+
if (checksums_.files.contains(hash + extension))
25762604
return hash;
25772605

25782606
return {};
@@ -2596,19 +2624,21 @@ std::optional<String> IMergeTreeDataPart::getStreamNameOrHash(
25962624
std::optional<String> IMergeTreeDataPart::getStreamNameForColumn(
25972625
const String & column_name,
25982626
const ISerialization::SubstreamPath & substream_path,
2627+
const String & extension,
25992628
const Checksums & checksums_)
26002629
{
26012630
auto stream_name = ISerialization::getFileNameForStream(column_name, substream_path);
2602-
return getStreamNameOrHash(stream_name, checksums_);
2631+
return getStreamNameOrHash(stream_name, extension, checksums_);
26032632
}
26042633

26052634
std::optional<String> IMergeTreeDataPart::getStreamNameForColumn(
26062635
const NameAndTypePair & column,
26072636
const ISerialization::SubstreamPath & substream_path,
2637+
const String & extension,
26082638
const Checksums & checksums_)
26092639
{
26102640
auto stream_name = ISerialization::getFileNameForStream(column, substream_path);
2611-
return getStreamNameOrHash(stream_name, checksums_);
2641+
return getStreamNameOrHash(stream_name, extension, checksums_);
26122642
}
26132643

26142644
std::optional<String> IMergeTreeDataPart::getStreamNameForColumn(

src/Storages/MergeTree/IMergeTreeDataPart.h

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -163,6 +163,7 @@ class IMergeTreeDataPart : public std::enable_shared_from_this<IMergeTreeDataPar
163163
void remove();
164164

165165
ColumnsStatistics loadStatistics() const;
166+
Estimates getEstimates() const;
166167

167168
/// Initialize columns (from columns.txt if exists, or create from column files if not).
168169
/// Load various metadata into memory: checksums from checksums.txt, index if required, etc.
@@ -594,6 +595,7 @@ class IMergeTreeDataPart : public std::enable_shared_from_this<IMergeTreeDataPar
594595

595596
static std::optional<String> getStreamNameOrHash(
596597
const String & name,
598+
const String & extension,
597599
const IMergeTreeDataPart::Checksums & checksums);
598600

599601
static std::optional<String> getStreamNameOrHash(
@@ -604,11 +606,13 @@ class IMergeTreeDataPart : public std::enable_shared_from_this<IMergeTreeDataPar
604606
static std::optional<String> getStreamNameForColumn(
605607
const String & column_name,
606608
const ISerialization::SubstreamPath & substream_path,
609+
const String & extension,
607610
const Checksums & checksums_);
608611

609612
static std::optional<String> getStreamNameForColumn(
610613
const NameAndTypePair & column,
611614
const ISerialization::SubstreamPath & substream_path,
615+
const String & extension,
612616
const Checksums & checksums_);
613617

614618
static std::optional<String> getStreamNameForColumn(
@@ -716,6 +720,11 @@ class IMergeTreeDataPart : public std::enable_shared_from_this<IMergeTreeDataPar
716720
/// It is used while reading from wide parts.
717721
ColumnsDescription columns_description_with_collected_nested;
718722

723+
/// Small state of finalized statistics for suitable statistics types.
724+
/// Lazily initialized on a first access.
725+
mutable std::mutex estimates_mutex;
726+
mutable std::optional<Estimates> estimates;
727+
719728
/// Reads part unique identifier (if exists) from uuid.txt
720729
void loadUUID();
721730

src/Storages/MergeTree/IMergedBlockOutputStream.cpp

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -60,7 +60,7 @@ NameSet IMergedBlockOutputStream::removeEmptyColumnsFromPart(
6060
data_part->getSerialization(column.name)->enumerateStreams(
6161
[&](const ISerialization::SubstreamPath & substream_path)
6262
{
63-
auto stream_name = IMergeTreeDataPart::getStreamNameForColumn(column, substream_path, checksums);
63+
auto stream_name = IMergeTreeDataPart::getStreamNameForColumn(column, substream_path, ".bin", checksums);
6464
if (stream_name)
6565
++stream_counts[*stream_name];
6666
});
@@ -76,7 +76,7 @@ NameSet IMergedBlockOutputStream::removeEmptyColumnsFromPart(
7676

7777
ISerialization::StreamCallback callback = [&](const ISerialization::SubstreamPath & substream_path)
7878
{
79-
auto stream_name = IMergeTreeDataPart::getStreamNameForColumn(column_name, substream_path, checksums);
79+
auto stream_name = IMergeTreeDataPart::getStreamNameForColumn(column_name, substream_path, ".bin", checksums);
8080

8181
/// Delete files if they are no longer shared with another column.
8282
if (stream_name && --stream_counts[*stream_name] == 0)

0 commit comments

Comments
 (0)