Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enhancement/8561507350/precompute output schema from processing #2233

Open
wants to merge 29 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
45d9a43
Implement modify_schema for passthrough, row range, and date range cl…
alexowens90 Mar 6, 2025
c4ff8e9
Implement and test FilterClause::modify_schema based only on input_co…
alexowens90 Mar 6, 2025
9772e0e
Implement and test PartitionClause::modify_schema
alexowens90 Mar 6, 2025
191a101
Implemented AggregationClause::modify_schema
alexowens90 Mar 6, 2025
912c059
Aggregation column names tested
alexowens90 Mar 6, 2025
4b1cf76
Implement aggregation output types properly, test for sum
alexowens90 Mar 7, 2025
5970065
Tests for min and max aggregators
alexowens90 Mar 7, 2025
738c9b3
Tests for mean aggregator
alexowens90 Mar 7, 2025
9938d77
Refactored tests
alexowens90 Mar 7, 2025
c68eda1
Implement modify_schema for ResampleClause
alexowens90 Mar 10, 2025
3923582
ResampleClause::modify_schema fully tested
alexowens90 Mar 10, 2025
0ff5086
Tests for RemoveColumnPartitioning, Split, Sort, and Merge clauses, a…
alexowens90 Mar 10, 2025
c103b92
Throw if modify_schema is called on ColumnStatsGenerationClause
alexowens90 Mar 10, 2025
a0b0266
Same level of checks for FilterClause added to ProjectionClause
alexowens90 Mar 10, 2025
c419245
Implemented type checking for ProjectClause (untested)
alexowens90 Mar 10, 2025
0d3f2d5
Implemented type checking for FilterClause (untested)
alexowens90 Mar 10, 2025
ac9e385
Fix filter and project tests
alexowens90 Mar 10, 2025
2073432
Clone StreamDescriptors
alexowens90 Mar 11, 2025
cb344ae
AST validity tests passing for projections
alexowens90 Mar 11, 2025
e401c17
AST validity tests passing for filters
alexowens90 Mar 12, 2025
fd5d85c
Remove unneeded computed_data
alexowens90 Mar 12, 2025
15ea08e
Keep cache of column types
alexowens90 Mar 12, 2025
1b63ab0
Make ExpressionContext const&
alexowens90 Mar 12, 2025
ed5e186
Implement PartitionClause::modify_schema in the same way as the others
alexowens90 Mar 12, 2025
9f0c941
Factor out input columns check to own method
alexowens90 Mar 12, 2025
eab0465
Factor out check that a stream descriptor represents a timeseries int…
alexowens90 Mar 12, 2025
1c7d237
Improve return type of ExpressionNode::compute
alexowens90 Mar 12, 2025
4029b74
Factor out child type calculation to own method
alexowens90 Mar 12, 2025
8ed9560
Uncomment test files
alexowens90 Mar 12, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cpp/arcticdb/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -939,6 +939,7 @@ if(${TEST})
processing/test/test_filter_and_project_sparse.cpp
processing/test/test_type_promotion.cpp
processing/test/test_operation_dispatch.cpp
processing/test/test_output_schema.cpp
processing/test/test_parallel_processing.cpp
processing/test/test_resample.cpp
processing/test/test_set_membership.cpp
Expand Down
29 changes: 29 additions & 0 deletions cpp/arcticdb/entity/stream_descriptor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
#include <arcticdb/util/variant.hpp>
#include <arcticdb/entity/types_proto.hpp>

#include <ankerl/unordered_dense.h>

namespace arcticdb::entity {

struct SegmentDescriptorImpl : public SegmentDescriptor {
Expand Down Expand Up @@ -271,6 +273,33 @@ struct StreamDescriptor {
}
};

// Describes the schema a processing-pipeline stage will produce — the stream
// descriptor (column names/types and index) plus the normalization metadata —
// computed ahead of time, without executing the stage.
struct OutputSchema {
    StreamDescriptor stream_descriptor_;
    arcticdb::proto::descriptors::NormalizationMetadata norm_metadata_;

    OutputSchema(StreamDescriptor stream_descriptor,
                 arcticdb::proto::descriptors::NormalizationMetadata norm_metadata) :
            stream_descriptor_(std::move(stream_descriptor)),
            norm_metadata_(std::move(norm_metadata)) {}  // fixed: dropped redundant ';' after body (-Wextra-semi)

    // Lazily builds and caches a column-name -> DataType map over the stream
    // descriptor's fields. Callers that replace or mutate stream_descriptor_
    // must call clear_column_types() to invalidate the cache.
    [[nodiscard]] ankerl::unordered_dense::map<std::string, DataType>& column_types() {
        if (!column_types_.has_value()) {
            column_types_ = ankerl::unordered_dense::map<std::string, DataType>();
            column_types_->reserve(stream_descriptor_.field_count());
            for (const auto& field : stream_descriptor_.fields()) {
                column_types_->emplace(field.name(), field.type().data_type());
            }
        }
        return *column_types_;
    }

    // Drops the cached map; the next column_types() call rebuilds it from
    // stream_descriptor_.
    void clear_column_types() {
        column_types_ = std::nullopt;
    }

private:
    // Cache for column_types(); nullopt means "not built yet / invalidated".
    std::optional<ankerl::unordered_dense::map<std::string, DataType>> column_types_;
};

template <class IndexType>
inline void set_index(StreamDescriptor &stream_desc) {
stream_desc.set_index_field_count(std::uint32_t(IndexType::field_count()));
Expand Down
8 changes: 5 additions & 3 deletions cpp/arcticdb/processing/aggregation_interface.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,16 +16,18 @@ struct IGroupingAggregatorData {
// folly::Poly interface for grouping aggregator implementations.
// IMPORTANT: the poly_call<N> index of each method must match its position in
// the Members list below: 0 = add_data_type, 1 = get_output_data_type,
// 2 = aggregate, 3 = finalize. Keep both in sync when adding methods.
struct Interface : Base {
// Records the data type of an input column so the aggregator can derive its
// output type.
void add_data_type(DataType data_type) { folly::poly_call<0>(*this, data_type); }

// Returns the output column type implied by the input types previously
// registered via add_data_type.
DataType get_output_data_type() { return folly::poly_call<1>(*this); };

// Folds one (possibly missing) input column into the running aggregation,
// using `groups` to map each row to one of `unique_values` buckets.
void aggregate(const std::optional<ColumnWithStrings>& input_column, const std::vector<size_t>& groups, size_t unique_values) {
folly::poly_call<2>(*this, input_column, groups, unique_values);
}
// Produces the aggregated output column as a segment; consumes internal state.
[[nodiscard]] SegmentInMemory finalize(const ColumnName& output_column_name, bool dynamic_schema, size_t unique_values) {
return folly::poly_call<3>(*this, output_column_name, dynamic_schema, unique_values);
}
};

// Order here defines the poly_call indices used in Interface above.
template<class T>
using Members = folly::PolyMembers<&T::add_data_type, &T::get_output_data_type, &T::aggregate, &T::finalize>;
};

using GroupingAggregatorData = folly::Poly<IGroupingAggregatorData>;
Expand Down
114 changes: 114 additions & 0 deletions cpp/arcticdb/processing/clause.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,28 @@ struct SegmentWrapper {
}
};

// Raises E_COLUMN_DOESNT_EXIST if any of `required_columns` is absent from the
// schema's column map; `clause_name` is interpolated into the error message.
void check_column_presence(OutputSchema& output_schema, const std::unordered_set<std::string>& required_columns, std::string_view clause_name) {
    auto& available_columns = output_schema.column_types();
    for (const auto& required : required_columns) {
        schema::check<ErrorCode::E_COLUMN_DOESNT_EXIST>(
                available_columns.contains(required),
                "{}Clause requires column '{}' to exist in input data",
                clause_name,
                required
        );
    }
}

// Raises E_UNSUPPORTED_INDEX_TYPE unless the descriptor has a timestamp index
// with at least one field whose first field is a UTC-nanosecond scalar.
void check_is_timeseries(const StreamDescriptor& stream_descriptor, std::string_view clause_name) {
    const auto& index = stream_descriptor.index();
    const bool is_timeseries =
            index.type() == IndexDescriptor::Type::TIMESTAMP &&
            index.field_count() >= 1 &&
            stream_descriptor.field(0).type() == make_scalar_type(DataType::NANOSECONDS_UTC64);
    schema::check<ErrorCode::E_UNSUPPORTED_INDEX_TYPE>(
            is_timeseries,
            "{}Clause can only be applied to timeseries",
            clause_name
    );
}

// No-op clause: forwards the input entities unchanged.
std::vector<EntityId> PassthroughClause::process(std::vector<EntityId>&& entity_ids) const {
return std::move(entity_ids);
}
Expand Down Expand Up @@ -133,6 +155,14 @@ std::vector<EntityId> FilterClause::process(std::vector<EntityId>&& entity_ids)
return output;
}

// Validates the filter against the incoming schema: all referenced columns
// must exist and the expression AST must evaluate to a bitset (a boolean row
// mask). Filtering removes rows only, so the schema passes through unchanged.
OutputSchema FilterClause::modify_schema(OutputSchema&& output_schema) const {
    check_column_presence(output_schema, *clause_info_.input_columns_, "Filter");
    const auto root_node = expression_context_->expression_nodes_.get_value(expression_context_->root_node_name_.value);
    const auto computed_type = root_node->compute(*expression_context_, output_schema.column_types());
    user_input::check<ErrorCode::E_INVALID_USER_ARGUMENT>(
            std::holds_alternative<BitSetTag>(computed_type),
            "FilterClause AST produces a column, not a bitset");
    return output_schema;
}

// Human-readable form of the filter, e.g. "WHERE <root node name>"; empty
// string when no expression context has been set.
std::string FilterClause::to_string() const {
    if (!expression_context_) {
        return "";
    }
    return fmt::format("WHERE {}", expression_context_->root_node_name_.value);
}
Expand Down Expand Up @@ -170,6 +200,16 @@ std::vector<EntityId> ProjectClause::process(std::vector<EntityId>&& entity_ids)
return output;
}

// Validates the projection against the incoming schema and appends the
// projected column to it: all referenced columns must exist, and the
// expression AST must evaluate to a column type (DataType), not a bitset.
OutputSchema ProjectClause::modify_schema(OutputSchema&& output_schema) const {
    check_column_presence(output_schema, *clause_info_.input_columns_, "Project");
    auto root_expr = expression_context_->expression_nodes_.get_value(expression_context_->root_node_name_.value);
    std::variant<BitSetTag, DataType> return_type = root_expr->compute(*expression_context_, output_schema.column_types());
    // Fixed error message: this check fires when the AST yields a bitset, the
    // previous text described the opposite condition (copied from FilterClause).
    user_input::check<ErrorCode::E_INVALID_USER_ARGUMENT>(
            std::holds_alternative<DataType>(return_type),
            "ProjectClause AST produces a bitset, not a column");
    const auto output_type = std::get<DataType>(return_type);
    // Keep the cached column-type map in sync with the descriptor mutation.
    output_schema.stream_descriptor_.add_scalar_field(output_type, output_column_);
    output_schema.column_types().emplace(output_column_, output_type);
    return output_schema;
}

// Human-readable form of the projection, e.g.
// PROJECT Column["out"] = <root node name>; empty when no expression context.
[[nodiscard]] std::string ProjectClause::to_string() const {
    if (!expression_context_) {
        return "";
    }
    return fmt::format("PROJECT Column[\"{}\"] = {}", output_column_, expression_context_->root_node_name_.value);
}
Expand Down Expand Up @@ -432,6 +472,31 @@ std::vector<EntityId> AggregationClause::process(std::vector<EntityId>&& entity_
return push_entities(*component_manager_, ProcessingUnit(std::move(seg)));
}

// Computes the post-groupby schema: the output keeps the grouping column
// (re-indexed as a row-count index) plus one column per aggregator, whose
// type is derived from the corresponding input column's type.
OutputSchema AggregationClause::modify_schema(OutputSchema&& output_schema) const {
check_column_presence(output_schema, *clause_info_.input_columns_, "Aggregation");
// Build a fresh descriptor: grouping column first, then aggregated columns.
// NOTE(review): find_field results are dereferenced unchecked — assumes
// grouping_column_ and all aggregator inputs are covered by input_columns_;
// confirm against how input_columns_ is populated.
StreamDescriptor stream_desc(output_schema.stream_descriptor_.id());
stream_desc.add_field(output_schema.stream_descriptor_.field(*output_schema.stream_descriptor_.find_field(grouping_column_)));
stream_desc.set_index({0, IndexDescriptorImpl::Type::ROWCOUNT});

for (const auto& agg: aggregators_){
const auto& input_column_name = agg.get_input_column_name().value;
const auto& output_column_name = agg.get_output_column_name().value;
const auto& input_column_type = output_schema.stream_descriptor_.field(*output_schema.stream_descriptor_.find_field(input_column_name)).type().data_type();
// Feed the input type into a throwaway aggregator-data instance to obtain
// the output type it would produce.
auto agg_data = agg.get_aggregator_data();
agg_data.add_data_type(input_column_type);
const auto& output_column_type = agg_data.get_output_data_type();
stream_desc.add_scalar_field(output_column_type, output_column_name);
}

output_schema.stream_descriptor_ = std::move(stream_desc);
// The descriptor was replaced, so the cached name->type map is stale.
output_schema.clear_column_types();
// The grouping column becomes the (physically stored, named) index in the
// normalization metadata.
auto mutable_index = output_schema.norm_metadata_.mutable_df()->mutable_common()->mutable_index();
mutable_index->set_name(grouping_column_);
mutable_index->clear_fake_name();
mutable_index->set_is_physically_stored(true);
return output_schema;
}

// Returns the precomputed string representation.
// NOTE(review): str_ appears to be built elsewhere (constructor not visible
// in this hunk) — confirm it is always initialized.
[[nodiscard]] std::string AggregationClause::to_string() const {
return str_;
}
Expand Down Expand Up @@ -463,6 +528,45 @@ void ResampleClause<closed_boundary>::set_component_manager(std::shared_ptr<Comp
component_manager_ = std::move(component_manager);
}

// Computes the post-resample schema: input must be a timeseries; the output
// keeps the original (first) index field and adds one column per aggregator,
// with the output type derived per-aggregator from the input column's type.
template<ResampleBoundary closed_boundary>
OutputSchema ResampleClause<closed_boundary>::modify_schema(OutputSchema&& output_schema) const {
check_is_timeseries(output_schema.stream_descriptor_, "Resample");
check_column_presence(output_schema, *clause_info_.input_columns_, "Resample");
// Fresh descriptor: timestamp index field first, then aggregated columns.
StreamDescriptor stream_desc(output_schema.stream_descriptor_.id());
stream_desc.add_field(output_schema.stream_descriptor_.field(0));
stream_desc.set_index(IndexDescriptorImpl(1, IndexDescriptor::Type::TIMESTAMP));

for (const auto& agg: aggregators_){
const auto& input_column_name = agg.get_input_column_name().value;
const auto& output_column_name = agg.get_output_column_name().value;
// NOTE(review): find_field dereferenced unchecked — assumes aggregator
// input columns are covered by input_columns_; confirm.
const auto& input_column_type = output_schema.stream_descriptor_.field(*output_schema.stream_descriptor_.find_field(input_column_name)).type().data_type();
agg.check_aggregator_supported_with_data_type(input_column_type);
auto output_column_type = agg.generate_output_data_type(input_column_type);
stream_desc.add_scalar_field(output_column_type, output_column_name);
}
output_schema.stream_descriptor_ = std::move(stream_desc);
// The descriptor was replaced, so the cached name->type map is stale.
output_schema.clear_column_types();

// A multi-indexed input collapses to a single timestamp index: carry over the
// outermost level's name and timezone, and keep its "fake name" flag if
// level 0 had a synthesized name.
if (output_schema.norm_metadata_.df().common().has_multi_index()) {
const auto& multi_index = output_schema.norm_metadata_.mutable_df()->mutable_common()->multi_index();
auto name = multi_index.name();
auto tz = multi_index.tz();
bool fake_name{false};
for (auto pos: multi_index.fake_field_pos()) {
if (pos == 0) {
fake_name = true;
break;
}
}
auto mutable_index = output_schema.norm_metadata_.mutable_df()->mutable_common()->mutable_index();
mutable_index->set_tz(tz);
mutable_index->set_is_physically_stored(true);
mutable_index->set_name(name);
mutable_index->set_fake_name(fake_name);
}
return output_schema;
}

template<ResampleBoundary closed_boundary>
std::string ResampleClause<closed_boundary>::rule() const {
return rule_;
Expand Down Expand Up @@ -903,6 +1007,11 @@ const ClauseInfo& MergeClause::clause_info() const {
return clause_info_;
}

// Merge leaves the schema unchanged; it only requires the input to be a
// timeseries.
OutputSchema MergeClause::modify_schema(OutputSchema&& output_schema) const {
check_is_timeseries(output_schema.stream_descriptor_, "Merge");
return output_schema;
}

std::vector<std::vector<EntityId>> MergeClause::structure_for_processing(std::vector<std::vector<EntityId>>&& entity_ids_vec) {

// TODO this is a hack because we don't currently have a way to
Expand Down Expand Up @@ -1213,6 +1322,11 @@ std::vector<EntityId> DateRangeClause::process(std::vector<EntityId> &&entity_id
return push_entities(*component_manager_, std::move(proc));
}

// Date-range restriction removes rows only, so the schema passes through
// unchanged; it is only valid on timeseries input.
OutputSchema DateRangeClause::modify_schema(OutputSchema&& output_schema) const {
check_is_timeseries(output_schema.stream_descriptor_, "DateRange");
return output_schema;
}

// Human-readable form, e.g. "DATE RANGE <start> - <end>".
std::string DateRangeClause::to_string() const {
return fmt::format("DATE RANGE {} - {}", start_, end_);
}
Expand Down
52 changes: 51 additions & 1 deletion cpp/arcticdb/processing/clause.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,10 @@ struct IClause {
void set_component_manager(std::shared_ptr<ComponentManager> component_manager) {
folly::poly_call<5>(*this, component_manager);
}

// Propagates the pipeline's output schema through this clause without
// executing it. poly_call index 6 must match modify_schema's position in the
// Members list of this interface.
OutputSchema modify_schema(OutputSchema&& output_schema) const {
return folly::poly_call<6>(*this, std::move(output_schema));
}
};

template<class T>
Expand All @@ -80,11 +84,16 @@ struct IClause {
&T::process,
&T::clause_info,
&T::set_processing_config,
&T::set_component_manager>;
&T::set_component_manager,
&T::modify_schema>;
};

using Clause = folly::Poly<IClause>;

void check_column_presence(OutputSchema& output_schema,
const std::unordered_set<std::string>& required_columns,
std::string_view clause_name);

struct PassthroughClause {
ClauseInfo clause_info_;

Expand All @@ -109,6 +118,10 @@ struct PassthroughClause {
void set_processing_config(ARCTICDB_UNUSED const ProcessingConfig&) {}

void set_component_manager(ARCTICDB_UNUSED std::shared_ptr<ComponentManager>) {}

// Passthrough performs no processing, so the schema is returned unchanged.
OutputSchema modify_schema(OutputSchema&& output_schema) const {
return output_schema;
}
};

struct FilterClause {
Expand Down Expand Up @@ -152,6 +165,8 @@ struct FilterClause {
component_manager_ = component_manager;
}

OutputSchema modify_schema(OutputSchema&& output_schema) const;

[[nodiscard]] std::string to_string() const;

void set_pipeline_optimisation(PipelineOptimisation pipeline_optimisation) {
Expand Down Expand Up @@ -201,6 +216,8 @@ struct ProjectClause {
component_manager_ = component_manager;
}

OutputSchema modify_schema(OutputSchema&& output_schema) const;

[[nodiscard]] std::string to_string() const;
};

Expand Down Expand Up @@ -259,6 +276,11 @@ struct PartitionClause {
component_manager_ = component_manager;
}

// Partitioning regroups rows without altering columns: just verify the
// required input columns exist, then return the schema unchanged.
OutputSchema modify_schema(OutputSchema&& output_schema) const {
check_column_presence(output_schema, *clause_info_.input_columns_, "GroupBy");
return output_schema;
}

[[nodiscard]] std::string to_string() const {
return fmt::format("GROUPBY Column[\"{}\"]", grouping_column_);
}
Expand Down Expand Up @@ -314,6 +336,8 @@ struct AggregationClause {
component_manager_ = component_manager;
}

OutputSchema modify_schema(OutputSchema&& output_schema) const;

[[nodiscard]] std::string to_string() const;
};

Expand Down Expand Up @@ -358,6 +382,8 @@ struct ResampleClause {

void set_component_manager(std::shared_ptr<ComponentManager> component_manager);

OutputSchema modify_schema(OutputSchema&& output_schema) const;

[[nodiscard]] std::string to_string() const;

[[nodiscard]] std::string rule() const;
Expand Down Expand Up @@ -414,6 +440,10 @@ struct RemoveColumnPartitioningClause {
void set_component_manager(std::shared_ptr<ComponentManager> component_manager) {
component_manager_ = component_manager;
}

// Rearranges data layout only; the logical schema is unchanged.
OutputSchema modify_schema(OutputSchema&& output_schema) const {
return output_schema;
}
};

struct SplitClause {
Expand Down Expand Up @@ -445,6 +475,10 @@ struct SplitClause {
void set_component_manager(std::shared_ptr<ComponentManager> component_manager) {
component_manager_ = component_manager;
}

// Splitting segments does not change columns; schema passes through unchanged.
OutputSchema modify_schema(OutputSchema&& output_schema) const {
return output_schema;
}
};

struct SortClause {
Expand Down Expand Up @@ -479,6 +513,10 @@ struct SortClause {
void set_component_manager(std::shared_ptr<ComponentManager> component_manager) {
component_manager_ = component_manager;
}

// Sorting reorders rows only; schema passes through unchanged.
OutputSchema modify_schema(OutputSchema&& output_schema) const {
return output_schema;
}
};

struct MergeClause {
Expand Down Expand Up @@ -513,6 +551,8 @@ struct MergeClause {
void set_processing_config(const ProcessingConfig& processing_config);

void set_component_manager(std::shared_ptr<ComponentManager> component_manager);

OutputSchema modify_schema(OutputSchema&& output_schema) const;
};

struct ColumnStatsGenerationClause {
Expand Down Expand Up @@ -555,6 +595,10 @@ struct ColumnStatsGenerationClause {
void set_component_manager(std::shared_ptr<ComponentManager> component_manager) {
component_manager_ = component_manager;
}

// Intentionally unsupported: fail fast if schema propagation ever reaches
// this clause.
OutputSchema modify_schema(ARCTICDB_UNUSED OutputSchema&& output_schema) const {
internal::raise<ErrorCode::E_ASSERTION_FAILURE>("ColumnStatsGenerationClause::modify_schema should never be called");
}
};

// Used by head and tail to discard rows not requested by the user
Expand Down Expand Up @@ -617,6 +661,10 @@ struct RowRangeClause {
component_manager_ = component_manager;
}

// Row-range restriction (head/tail) discards rows only; schema is unchanged.
OutputSchema modify_schema(OutputSchema&& output_schema) const {
return output_schema;
}

[[nodiscard]] std::string to_string() const;

void calculate_start_and_end(size_t total_rows);
Expand Down Expand Up @@ -658,6 +706,8 @@ struct DateRangeClause {
component_manager_ = component_manager;
}

OutputSchema modify_schema(OutputSchema&& output_schema) const;

[[nodiscard]] timestamp start() const {
return start_;
}
Expand Down
Loading
Loading