Skip to content

Commit d28480c

Browse files
committed
Add flag for controlling how strict the casting between int and float should be
Add option to have default values for aggregations. Make the default value type a variant.
1 parent 67d2bbe commit d28480c

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

53 files changed

+460
-294
lines changed

cpp/arcticdb/async/tasks.hpp

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,7 @@
1414
#include <arcticdb/storage/storage.hpp>
1515
#include <arcticdb/storage/key_segment_pair.hpp>
1616
#include <arcticdb/entity/types.hpp>
17-
#include <arcticdb/util/hash.hpp>
1817
#include <arcticdb/stream/stream_utils.hpp>
19-
#include <arcticdb/stream/protobuf_mappings.hpp>
2018
#include <arcticdb/stream/stream_source.hpp>
2119
#include <arcticdb/column_store/memory_segment.hpp>
2220
#include <arcticdb/entity/variant_key.hpp>
@@ -30,7 +28,6 @@
3028
#include <arcticdb/util/test/random_throw.hpp>
3129

3230
#include <type_traits>
33-
#include <ranges>
3431

3532
namespace arcticdb::async {
3633

cpp/arcticdb/column_store/column.cpp

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -371,9 +371,9 @@ void Column::mark_absent_rows(size_t num_rows) {
371371
}
372372
}
373373

374-
void Column::default_initialize_rows(size_t start_pos, size_t num_rows, bool ensure_alloc) {
374+
void Column::default_initialize_rows(size_t start_pos, size_t num_rows, bool ensure_alloc, const VariantRawValue default_value) {
375375
if (num_rows > 0) {
376-
type_.visit_tag([this, start_pos, num_rows, ensure_alloc](auto tag) {
376+
type_.visit_tag([&,this](auto tag) {
377377
using T = std::decay_t<decltype(tag)>;
378378
using RawType = typename T::DataTypeTag::raw_type;
379379
const auto bytes = (num_rows * sizeof(RawType));
@@ -382,7 +382,12 @@ void Column::default_initialize_rows(size_t start_pos, size_t num_rows, bool ens
382382
data_.ensure<uint8_t>(bytes);
383383

384384
auto type_ptr = data_.ptr_cast<RawType>(start_pos, bytes);
385-
util::default_initialize<T>(reinterpret_cast<uint8_t *>(type_ptr), bytes);
385+
if (std::holds_alternative<RawType>(default_value)) {
386+
const RawType raw_default = std::get<RawType>(default_value);
387+
std::fill_n(type_ptr, num_rows, raw_default);
388+
} else {
389+
util::default_initialize<T>(reinterpret_cast<uint8_t *>(type_ptr), bytes);
390+
}
386391

387392
if (ensure_alloc)
388393
data_.commit();

cpp/arcticdb/column_store/column.hpp

Lines changed: 6 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -20,12 +20,10 @@
2020
#include <arcticdb/util/preconditions.hpp>
2121
#include <arcticdb/util/sparse_utils.hpp>
2222

23-
#include <folly/container/Enumerate.h>
2423
// Compilation fails on Mac if cstdio is not included prior to folly/Function.h due to a missing definition of memalign in folly/Memory.h
2524
#ifdef __APPLE__
2625
#include <cstdio>
2726
#endif
28-
#include <folly/Function.h>
2927
#include <pybind11/pybind11.h>
3028
#include <pybind11/numpy.h>
3129

@@ -453,7 +451,7 @@ class Column {
453451

454452
void mark_absent_rows(size_t num_rows);
455453

456-
void default_initialize_rows(size_t start_pos, size_t num_rows, bool ensure_alloc);
454+
void default_initialize_rows(size_t start_pos, size_t num_rows, bool ensure_alloc, VariantRawValue default_value = {});
457455

458456
void set_row_data(size_t row_id);
459457

@@ -730,19 +728,15 @@ class Column {
730728
size_t end_row
731729
);
732730

733-
template <
734-
typename input_tdt,
735-
typename functor>
731+
template <typename input_tdt, typename functor>
736732
requires std::is_invocable_r_v<void, functor, typename input_tdt::DataTypeTag::raw_type>
737733
static void for_each(const Column& input_column, functor&& f) {
738734
auto input_data = input_column.data();
739735
std::for_each(input_data.cbegin<input_tdt>(), input_data.cend<input_tdt>(), std::forward<functor>(f));
740736
}
741737

742-
template <
743-
typename input_tdt,
744-
typename functor>
745-
requires std::is_invocable_r_v<void, functor, typename ColumnData::Enumeration<typename input_tdt::DataTypeTag::raw_type>>
738+
template <typename input_tdt, typename functor>
739+
requires std::is_invocable_r_v<void, functor, ColumnData::Enumeration<typename input_tdt::DataTypeTag::raw_type>>
746740
static void for_each_enumerated(const Column& input_column, functor&& f) {
747741
auto input_data = input_column.data();
748742
if (input_column.is_sparse()) {
@@ -754,10 +748,7 @@ class Column {
754748
}
755749
}
756750

757-
template <
758-
typename input_tdt,
759-
typename output_tdt,
760-
typename functor>
751+
template <typename input_tdt, typename output_tdt, typename functor>
761752
requires std::is_invocable_r_v<typename output_tdt::DataTypeTag::raw_type, functor, typename input_tdt::DataTypeTag::raw_type>
762753
static void transform(const Column& input_column, Column& output_column, functor&& f) {
763754
auto input_data = input_column.data();
@@ -839,9 +830,7 @@ class Column {
839830
}
840831
}
841832

842-
template <
843-
typename input_tdt,
844-
std::predicate<typename input_tdt::DataTypeTag::raw_type> functor>
833+
template <typename input_tdt, std::predicate<typename input_tdt::DataTypeTag::raw_type> functor>
845834
static void transform(const Column& input_column,
846835
util::BitSet& output_bitset,
847836
bool sparse_missing_value_output,

cpp/arcticdb/column_store/memory_segment_impl.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313

1414
#include <google/protobuf/any.pb.h>
1515

16+
#include <arcticdb/stream/index.hpp>
17+
1618
namespace arcticdb {
1719

1820
SegmentInMemoryImpl::SegmentInMemoryImpl() = default;
@@ -154,7 +156,7 @@ std::optional<std::size_t> SegmentInMemoryImpl::column_index(std::string_view na
154156
if (auto index = column_index(name); index)
155157
return index;
156158

157-
std::string multi_index_column_name = fmt::format("__idx__{}", name);
159+
const std::string multi_index_column_name = stream::mangled_name(name);
158160
if (auto multi_index = column_index(multi_index_column_name); multi_index)
159161
return multi_index;
160162

cpp/arcticdb/column_store/string_pool.hpp

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,6 @@
77

88
#pragma once
99

10-
#include <cstddef>
11-
#include <cstdint>
1210
#include <string_view>
1311
#include <unordered_set>
1412

cpp/arcticdb/entity/stream_descriptor.hpp

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
#include "arcticdb/storage/memory_layout.hpp"
1212

1313
#include <arcticdb/entity/field_collection_proto.hpp>
14-
#include <arcticdb/util/variant.hpp>
1514
#include <arcticdb/entity/types_proto.hpp>
1615

1716
#include <ankerl/unordered_dense.h>
@@ -308,13 +307,27 @@ struct OutputSchema {
308307
column_types().emplace(name, data_type);
309308
}
310309

310+
auto release() {
311+
column_types_.reset();
312+
return std::tuple{std::move(stream_descriptor_), std::move(norm_metadata_), std::move(default_values_)};
313+
}
314+
315+
void clear_default_values() {
316+
default_values_.clear();
317+
}
318+
319+
void set_default_value_for_column(const std::string_view name, const VariantRawValue& value) {
320+
default_values_.emplace(std::string(name), value);
321+
}
322+
311323
private:
312324
StreamDescriptor stream_descriptor_;
313325
std::optional<ankerl::unordered_dense::map<std::string, DataType>> column_types_;
326+
ankerl::unordered_dense::map<std::string, VariantRawValue> default_values_;
314327
};
315328

316329
template <class IndexType>
317-
inline void set_index(StreamDescriptor &stream_desc) {
330+
void set_index(StreamDescriptor &stream_desc) {
318331
stream_desc.set_index_field_count(std::uint32_t(IndexType::field_count()));
319332
stream_desc.set_index_type(IndexType::type());
320333
}

cpp/arcticdb/entity/type_utils.cpp

Lines changed: 31 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -67,9 +67,30 @@ namespace arcticdb {
6767
return std::make_optional<entity::TypeDescriptor>(combine_data_type(entity::ValueType::FLOAT, target_size), dimension);
6868
}
6969

70+
static bool is_valid_int_to_float_conversion(
71+
const entity::TypeDescriptor& source,
72+
const entity::TypeDescriptor& target,
73+
IntToFloatConversion int_to_to_float_conversion
74+
) {
75+
debug::check<ErrorCode::E_ASSERTION_FAILURE>(
76+
entity::is_integer_type(source.data_type()) && is_floating_point_type(target.data_type()),
77+
"Expected source to be int and target to be float got: {} {}", source, target
78+
);
79+
if (int_to_to_float_conversion == IntToFloatConversion::STRICT) {
80+
return target.get_size_bits() == entity::SizeBits::S64 || source.get_size_bits() < entity::SizeBits::S32;
81+
} else {
82+
debug::check<ErrorCode::E_ASSERTION_FAILURE>(
83+
int_to_to_float_conversion == IntToFloatConversion::PERMISSIVE,
84+
"Unknown int to float conversion option: {}", static_cast<int>(int_to_to_float_conversion)
85+
);
86+
return true;
87+
}
88+
}
89+
7090
bool is_valid_type_promotion_to_target(
7191
const entity::TypeDescriptor& source,
72-
const entity::TypeDescriptor& target
92+
const entity::TypeDescriptor& target,
93+
IntToFloatConversion int_to_to_float_conversion
7394
) {
7495
if (source.dimension() != target.dimension()) {
7596
// Empty of dimension 0 means lack of any given type and can be promoted to anything (even if the dimensions
@@ -108,7 +129,7 @@ namespace arcticdb {
108129
return target_size > source_size;
109130
} else if (is_floating_point_type(target_type)) {
110131
// UINT->FLOAT
111-
return target_size == entity::SizeBits::S64 || source_size < entity::SizeBits::S32;
132+
return is_valid_int_to_float_conversion(source, target, int_to_to_float_conversion);
112133
} else {
113134
// Non-numeric target type
114135
return false;
@@ -122,7 +143,7 @@ namespace arcticdb {
122143
return target_size >= source_size;
123144
} else if (is_floating_point_type(target_type)) {
124145
// INT->FLOAT
125-
return target_size == entity::SizeBits::S64 || source_size < entity::SizeBits::S32;
146+
return is_valid_int_to_float_conversion(source, target, int_to_to_float_conversion);
126147
} else {
127148
// Non-numeric target type
128149
return false;
@@ -175,10 +196,14 @@ namespace arcticdb {
175196
}
176197
}
177198

178-
std::optional<entity::TypeDescriptor> has_valid_common_type(const entity::TypeDescriptor& left, const entity::TypeDescriptor& right) {
179-
if (is_valid_type_promotion_to_target(left, right)) {
199+
std::optional<entity::TypeDescriptor> has_valid_common_type(
200+
const entity::TypeDescriptor& left,
201+
const entity::TypeDescriptor& right,
202+
IntToFloatConversion int_to_to_float_conversion
203+
) {
204+
if (is_valid_type_promotion_to_target(left, right, int_to_to_float_conversion)) {
180205
return right;
181-
} else if (is_valid_type_promotion_to_target(right, left)) {
206+
} else if (is_valid_type_promotion_to_target(right, left, int_to_to_float_conversion)) {
182207
return left;
183208
}
184209

cpp/arcticdb/entity/type_utils.hpp

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88
#pragma once
99
#include<optional>
1010
#include <fmt/format.h>
11-
#include<arcticdb/entity/descriptors.hpp>
1211
#include <arcticdb/entity/types.hpp>
1312

1413
namespace arcticdb {
@@ -17,18 +16,30 @@ namespace entity {
1716
struct TypeDescriptor;
1817
}
1918

19+
/// Defines which static casts from int to float are permitted in is_valid_type_promotion_to_target
20+
enum class IntToFloatConversion {
21+
/// Avoids lossy casting from int to float and tries to make sure that the integer can be represented exactly using
22+
/// the specified float: (u)int8 and (u)int16 can be represented exactly using float32, (u)int32 can be represented
23+
/// exactly via float64. Note this still allows casting (u)int64 to float64 even though it's a lossy cast.
24+
STRICT,
25+
/// Allow all type casts from int to float regardless of the byte size of the int and float type
26+
PERMISSIVE
27+
};
28+
2029
/// Two types are trivially compatible if their byte representation is exactly the same i.e. you can memcpy
2130
/// n elements of left type from one buffer to n elements of type right in another buffer and get the same result
2231
[[nodiscard]] bool trivially_compatible_types(const entity::TypeDescriptor& left, const entity::TypeDescriptor& right);
2332

2433
[[nodiscard]] bool is_valid_type_promotion_to_target(
2534
const entity::TypeDescriptor& source,
26-
const entity::TypeDescriptor& target
35+
const entity::TypeDescriptor& target,
36+
IntToFloatConversion int_to_to_float_conversion = IntToFloatConversion::STRICT
2737
);
2838

2939
[[nodiscard]] std::optional<entity::TypeDescriptor> has_valid_common_type(
3040
const entity::TypeDescriptor& left,
31-
const entity::TypeDescriptor& right
41+
const entity::TypeDescriptor& right,
42+
IntToFloatConversion int_to_to_float_conversion = IntToFloatConversion::STRICT
3243
);
3344

3445
inline std::string get_user_friendly_type_string(const entity::TypeDescriptor& type) {

cpp/arcticdb/entity/types.hpp

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -394,6 +394,20 @@ DATA_TYPE_TAG(EMPTYVAL, std::uint64_t)
394394
DATA_TYPE_TAG(BOOL_OBJECT8, uint8_t)
395395
#undef DATA_TYPE_TAG
396396

397+
using VariantRawValue = std::variant<
398+
std::monostate,
399+
int8_t,
400+
int16_t,
401+
int32_t,
402+
int64_t,
403+
uint8_t,
404+
uint16_t,
405+
uint32_t,
406+
uint64_t,
407+
bool,
408+
float,
409+
double>;
410+
397411
enum class Dimension : uint8_t {
398412
Dim0 = 0,
399413
Dim1 = 1,

cpp/arcticdb/pipeline/frame_slice_map.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
#pragma once
99

1010
#include <arcticdb/pipeline/pipeline_context.hpp>
11+
#include <arcticdb/pipeline/index_utils.hpp>
1112

1213
namespace arcticdb::pipelines {
1314

cpp/arcticdb/pipeline/frame_utils.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
#include <arcticdb/pipeline/frame_utils.hpp>
99
#include <arcticdb/stream/aggregator.hpp>
1010
#include <arcticdb/pipeline/frame_slice.hpp>
11+
#include <arcticdb/entity/protobuf_mappings.hpp>
1112

1213
namespace arcticdb {
1314

cpp/arcticdb/pipeline/frame_utils.hpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,11 @@
88
#pragma once
99

1010
#include <arcticdb/pipeline/pipeline_context.hpp>
11-
#include <arcticdb/column_store/string_pool.hpp>
1211
#include <arcticdb/column_store/chunked_buffer.hpp>
1312
#include <arcticdb/pipeline/frame_slice.hpp>
1413
#include <arcticdb/entity/atom_key.hpp>
1514
#include <arcticdb/pipeline/input_tensor_frame.hpp>
1615
#include <arcticdb/stream/protobuf_mappings.hpp>
17-
#include <arcticdb/entity/protobuf_mappings.hpp>
1816
#include <arcticdb/python/gil_lock.hpp>
1917
#include <arcticdb/python/python_types.hpp>
2018
#include <arcticdb/python/python_to_tensor_frame.hpp>
@@ -26,6 +24,10 @@
2624

2725
namespace arcticdb {
2826

27+
namespace pipelines::index {
28+
struct IndexSegmentReader;
29+
}
30+
2931
inline size_t get_first_string_size(const pipelines::PipelineContextRow& context_row, ChunkedBuffer &src, std::size_t first_row_in_frame) {
3032
auto offset = first_context_row(context_row.slice_and_key(), first_row_in_frame);
3133
auto num_rows = context_row.slice_and_key().slice_.row_range.diff();

cpp/arcticdb/pipeline/input_tensor_frame.hpp

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,18 +12,14 @@
1212
#include <arcticdb/entity/protobufs.hpp>
1313
#include <arcticdb/entity/index_range.hpp>
1414
#include <arcticdb/entity/types.hpp>
15-
#include <arcticdb/util/flatten_utils.hpp>
15+
#include <arcticdb/util/type_traits.hpp>
1616

1717
namespace arcticdb::pipelines {
1818

1919
using namespace arcticdb::entity;
2020

21-
/// @TODO Move to a separate "util" header
22-
template <typename T, typename... U>
23-
concept is_any_of = (std::same_as<T, U> || ...);
24-
2521
template <typename IndexT>
26-
concept ValidIndex = is_any_of<
22+
concept ValidIndex = util::any_of<
2723
std::remove_cvref_t<std::remove_pointer_t<std::decay_t<IndexT>>>,
2824
stream::TimeseriesIndex,
2925
stream::RowCountIndex,

0 commit comments

Comments
 (0)