
Commit 15ea08e: Keep cache of column types
1 parent fd5d85c

4 files changed: +38 -29 lines

cpp/arcticdb/entity/stream_descriptor.hpp (+24)

@@ -14,6 +14,8 @@
 #include <arcticdb/util/variant.hpp>
 #include <arcticdb/entity/types_proto.hpp>
 
+#include <ankerl/unordered_dense.h>
+
 namespace arcticdb::entity {
 
 struct SegmentDescriptorImpl : public SegmentDescriptor {
@@ -274,6 +276,28 @@ struct StreamDescriptor {
 struct OutputSchema {
     StreamDescriptor stream_descriptor_;
     arcticdb::proto::descriptors::NormalizationMetadata norm_metadata_;
+
+    OutputSchema(StreamDescriptor stream_descriptor,
+                 arcticdb::proto::descriptors::NormalizationMetadata norm_metadata):
+            stream_descriptor_(std::move(stream_descriptor)),
+            norm_metadata_(std::move(norm_metadata)) {};
+
+    ankerl::unordered_dense::map<std::string, DataType>& column_types() {
+        if (!column_types_.has_value()) {
+            column_types_ = ankerl::unordered_dense::map<std::string, DataType>();
+            column_types_->reserve(stream_descriptor_.field_count());
+            for (const auto& field: stream_descriptor_.fields()) {
+                column_types_->emplace(field.name(), field.type().data_type());
+            }
+        }
+        return *column_types_;
+    }
+
+    void clear_column_types() {
+        column_types_ = std::nullopt;
+    }
+
+private:
+    std::optional<ankerl::unordered_dense::map<std::string, DataType>> column_types_;
 };
 
 template <class IndexType>
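
For illustration, here is a minimal standalone sketch of the caching pattern OutputSchema now uses: the name-to-type map is built lazily on first access, reused on later calls, and dropped with clear_column_types() whenever the descriptor is replaced. Schema, Field, and the string type names below are simplified stand-ins (std::unordered_map instead of ankerl::unordered_dense::map), not the ArcticDB types.

#include <iostream>
#include <optional>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

// Simplified stand-ins for StreamDescriptor fields; the real code maps field
// names to arcticdb DataType values.
using DataType = std::string;
using Field = std::pair<std::string, DataType>;

struct Schema {
    std::vector<Field> fields_;

    // Lazily build the name -> type map on first use and cache it.
    std::unordered_map<std::string, DataType>& column_types() {
        if (!column_types_.has_value()) {
            column_types_.emplace();
            column_types_->reserve(fields_.size());
            for (const auto& [name, type] : fields_)
                column_types_->emplace(name, type);
        }
        return *column_types_;
    }

    // Invalidate the cache whenever the set of fields is replaced wholesale.
    void clear_column_types() { column_types_ = std::nullopt; }

private:
    std::optional<std::unordered_map<std::string, DataType>> column_types_;
};

int main() {
    Schema schema;
    schema.fields_ = {{"ts", "NANOSECONDS_UTC64"}, {"price", "FLOAT64"}};
    std::cout << schema.column_types().count("price") << '\n';  // 1: map built on first call

    // A step that adds a column keeps the cache coherent by updating it in place...
    schema.fields_.push_back({"projected", "FLOAT64"});
    schema.column_types().emplace("projected", "FLOAT64");

    // ...while a step that rebuilds the whole descriptor drops the cache instead.
    schema.fields_ = {{"group", "INT64"}};
    schema.clear_column_types();
    std::cout << schema.column_types().count("price") << '\n';  // 0: rebuilt from the new fields
}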

cpp/arcticdb/processing/clause.cpp (+12 -27)

@@ -134,21 +134,14 @@ std::vector<EntityId> FilterClause::process(std::vector<EntityId>&& entity_ids)
 }
 
 OutputSchema FilterClause::modify_schema(OutputSchema&& output_schema) const {
-    // TODO: Factor out checking against clause_info_.input_columns_ into separate function and call from all clauses
-    // TODO: Consider adding (optional, lazily constructed?) unordered map from column names to data types in
-    //  output_schema to make this sort of operation more efficient
+    const auto& column_types = output_schema.column_types();
     for (const auto& input_column: *clause_info_.input_columns_) {
         schema::check<ErrorCode::E_COLUMN_DOESNT_EXIST>(
-                output_schema.stream_descriptor_.find_field(input_column).has_value(),
+                column_types.contains(input_column),
                 "FilterClause requires column '{}' to exist in input data",
                 input_column
         );
     }
-    // TODO: Factor this out with same code in ProjectClause
-    std::unordered_map<std::string, DataType> column_types;
-    for (const auto& field: output_schema.stream_descriptor_.fields()) {
-        column_types.emplace(field.name(), field.type().data_type());
-    }
     auto expr = expression_context_->expression_nodes_.get_value(expression_context_->root_node_name_.value);
     auto opt_datatype = expr->compute(*expression_context_, column_types);
     user_input::check<ErrorCode::E_INVALID_USER_ARGUMENT>(!opt_datatype.has_value(), "FilterClause AST produces a column, not a bitset");
@@ -193,24 +186,19 @@ std::vector<EntityId> ProjectClause::process(std::vector<EntityId>&& entity_ids)
 }
 
 OutputSchema ProjectClause::modify_schema(OutputSchema&& output_schema) const {
-    // TODO: Factor out checking against clause_info_.input_columns_ into separate function and call from all clauses
-    // TODO: Consider adding (optional, lazily constructed?) unordered map from column names to data types in
-    //  output_schema to make this sort of operation more efficient
+    auto& column_types = output_schema.column_types();
     for (const auto& input_column: *clause_info_.input_columns_) {
         schema::check<ErrorCode::E_COLUMN_DOESNT_EXIST>(
-                output_schema.stream_descriptor_.find_field(input_column).has_value(),
+                column_types.contains(input_column),
                 "ProjectClause requires column '{}' to exist in input data",
                 input_column
         );
     }
-    std::unordered_map<std::string, DataType> column_types;
-    for (const auto& field: output_schema.stream_descriptor_.fields()) {
-        column_types.emplace(field.name(), field.type().data_type());
-    }
     auto expr = expression_context_->expression_nodes_.get_value(expression_context_->root_node_name_.value);
     auto opt_datatype = expr->compute(*expression_context_, column_types);
     user_input::check<ErrorCode::E_INVALID_USER_ARGUMENT>(opt_datatype.has_value(), "ProjectClause AST produces a bitset, not a column");
     output_schema.stream_descriptor_.add_scalar_field(*opt_datatype, output_column_);
+    column_types.emplace(output_column_, *opt_datatype);
     return output_schema;
 }

@@ -477,12 +465,10 @@ std::vector<EntityId> AggregationClause::process(std::vector<EntityId>&& entity_ids)
 }
 
 OutputSchema AggregationClause::modify_schema(OutputSchema&& output_schema) const {
-    // TODO: Factor out checking against clause_info_.input_columns_ into separate function and call from all clauses
-    // TODO: Consider adding (optional, lazily constructed?) unordered map from column names to data types in
-    //  output_schema to make this sort of operation more efficient
+    const auto& column_types = output_schema.column_types();
     for (const auto& input_column: *clause_info_.input_columns_) {
         schema::check<ErrorCode::E_COLUMN_DOESNT_EXIST>(
-                output_schema.stream_descriptor_.find_field(input_column).has_value(),
+                column_types.contains(input_column),
                 "AggregationClause requires column '{}' to exist in input data",
                 input_column
         );
@@ -491,7 +477,6 @@ OutputSchema AggregationClause::modify_schema(OutputSchema&& output_schema) const
     stream_desc.add_field(output_schema.stream_descriptor_.field(*output_schema.stream_descriptor_.find_field(grouping_column_)));
     stream_desc.set_index({0, IndexDescriptorImpl::Type::ROWCOUNT});
 
-    // TODO: Similar to process method, consider refactoring
     for (const auto& agg: aggregators_){
         const auto& input_column_name = agg.get_input_column_name().value;
         const auto& output_column_name = agg.get_output_column_name().value;
@@ -503,6 +488,7 @@ OutputSchema AggregationClause::modify_schema(OutputSchema&& output_schema) const
     }
 
     output_schema.stream_descriptor_ = std::move(stream_desc);
+    output_schema.clear_column_types();
     auto mutable_index = output_schema.norm_metadata_.mutable_df()->mutable_common()->mutable_index();
     mutable_index->set_name(grouping_column_);
     mutable_index->clear_fake_name();
@@ -549,12 +535,10 @@ OutputSchema ResampleClause<closed_boundary>::modify_schema(OutputSchema&& output_schema) const
         output_schema.stream_descriptor_.field(0).type() == make_scalar_type(DataType::NANOSECONDS_UTC64),
         "ResampleClause can only be applied to timeseries"
     );
-    // TODO: Factor out checking against clause_info_.input_columns_ into separate function and call from all clauses
-    // TODO: Consider adding (optional, lazily constructed?) unordered map from column names to data types in
-    //  output_schema to make this sort of operation more efficient
+    const auto& column_types = output_schema.column_types();
     for (const auto& input_column: *clause_info_.input_columns_) {
         schema::check<ErrorCode::E_COLUMN_DOESNT_EXIST>(
-                output_schema.stream_descriptor_.find_field(input_column).has_value(),
+                column_types.contains(input_column),
                 "ResampleClause requires column '{}' to exist in input data",
                 input_column
         );
@@ -572,6 +556,7 @@ OutputSchema ResampleClause<closed_boundary>::modify_schema(OutputSchema&& output_schema) const
         stream_desc.add_scalar_field(output_column_type, output_column_name);
     }
     output_schema.stream_descriptor_ = std::move(stream_desc);
+    output_schema.clear_column_types();
 
     if (output_schema.norm_metadata_.df().common().has_multi_index()) {
        const auto& multi_index = output_schema.norm_metadata_.mutable_df()->mutable_common()->multi_index();
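
The removed TODO comments point out that the same input-column existence check is now repeated across FilterClause, ProjectClause, AggregationClause, and ResampleClause. The following is a hedged sketch of the kind of shared helper that TODO suggests; check_columns_exist, its signature, and the plain exception are illustrative assumptions, not part of this commit (the real code would go through schema::check<ErrorCode::E_COLUMN_DOESNT_EXIST> and ankerl::unordered_dense::map).

#include <iostream>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <vector>

// Hypothetical shared check: verify every required column is present in the
// cached name -> type map, raising a clause-specific message otherwise.
template <typename ColumnTypes>
void check_columns_exist(const ColumnTypes& column_types,
                         const std::vector<std::string>& required,
                         const std::string& clause_name) {
    for (const auto& column : required) {
        if (!column_types.contains(column))
            throw std::invalid_argument(
                clause_name + " requires column '" + column + "' to exist in input data");
    }
}

int main() {
    std::unordered_map<std::string, int> column_types{{"price", 0}, {"qty", 1}};
    check_columns_exist(column_types, {"price", "qty"}, "FilterClause");  // passes silently
    try {
        check_columns_exist(column_types, {"missing"}, "ProjectClause");
    } catch (const std::exception& e) {
        std::cout << e.what() << '\n';  // ProjectClause requires column 'missing' to exist in input data
    }
}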

cpp/arcticdb/processing/expression_node.cpp (+1 -1)

@@ -71,7 +71,7 @@ VariantData ExpressionNode::compute(ProcessingUnit& seg) const {
 }
 
 std::optional<DataType> ExpressionNode::compute(ExpressionContext& expression_context,
-                                                const std::unordered_map<std::string, DataType>& column_types) const {
+                                                const ankerl::unordered_dense::map<std::string, DataType>& column_types) const {
     std::optional<DataType> res;
     std::optional<DataType> left_type = util::variant_match(
         left_,

cpp/arcticdb/processing/expression_node.hpp (+1 -1)

@@ -92,7 +92,7 @@ struct ExpressionNode {
     // TODO: better return type
     // TODO: Make expression_context const
     std::optional<DataType> compute(ExpressionContext& expression_context,
-                                    const std::unordered_map<std::string, DataType>& column_types) const;
+                                    const ankerl::unordered_dense::map<std::string, DataType>& column_types) const;
 };
 
 } //namespace arcticdb
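
The only change to ExpressionNode::compute is the map type in its signature. For the operations these call sites use (reserve, emplace, contains, find), ankerl::unordered_dense::map behaves as a drop-in for std::unordered_map, so callers just pass the cached map through unchanged. A small standalone illustration (not ArcticDB code), assuming the unordered_dense header is on the include path:

#include <ankerl/unordered_dense.h>

#include <iostream>
#include <string>

int main() {
    // Same surface as std::unordered_map for what the clause code needs.
    ankerl::unordered_dense::map<std::string, int> column_types;
    column_types.reserve(2);
    column_types.emplace("price", 1);
    column_types.emplace("qty", 2);

    std::cout << std::boolalpha << column_types.contains("price") << '\n';  // true
    if (auto it = column_types.find("qty"); it != column_types.end())
        std::cout << it->second << '\n';  // 2
}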
