Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
48 commits
Select commit Hold shift + click to select a range
167cb2f
Add basic structure for read_modify_write
vasil-pashov Oct 16, 2025
580cf14
Working version of read modify write
vasil-pashov Oct 21, 2025
abe52e3
Implement write clause
vasil-pashov Oct 21, 2025
59af346
Make the write clause store future components so that it does not dea…
vasil-pashov Oct 22, 2025
97549b6
Fix a bug with empty dataframes. Test all filtering
vasil-pashov Oct 23, 2025
dfda400
Fix col range for multiindex
vasil-pashov Oct 23, 2025
0473e65
wip
vasil-pashov Oct 24, 2025
3490e74
Compact rows function
vasil-pashov Oct 24, 2025
b613264
Add comments
vasil-pashov Oct 24, 2025
769e271
Fix compilation errors
vasil-pashov Oct 24, 2025
83a8065
Fix compilation errors
vasil-pashov Oct 24, 2025
c57fdc0
Fix compilation errors
vasil-pashov Oct 24, 2025
27cbe29
Apply comments
vasil-pashov Oct 30, 2025
5f94216
Add C++ stress test to check for deadlocks in write clause
vasil-pashov Oct 31, 2025
12e2c28
Add overload for async_write that performs the encoding in the curren…
vasil-pashov Nov 3, 2025
658e2dc
Structure write clause by row slice
vasil-pashov Nov 3, 2025
056b8d8
Fix resampling tests
vasil-pashov Nov 3, 2025
c28903b
Address review comments
vasil-pashov Nov 4, 2025
6c9438a
Extract the variant match in async write to a separate function
vasil-pashov Nov 5, 2025
809d961
Fix resampling tests
vasil-pashov Nov 3, 2025
6615c97
Prepare for merge update
vasil-pashov Nov 3, 2025
556fdce
Add merge clause skeleton
vasil-pashov Nov 5, 2025
7c861b2
Implement structure for processing
vasil-pashov Nov 6, 2025
fca60b9
WIP
vasil-pashov Nov 10, 2025
38ac5b5
Unify iteration over source columns
vasil-pashov Nov 10, 2025
ab71903
Add utility functions for generating columns, native tensors and segm…
vasil-pashov Nov 11, 2025
9b80b7a
Add one more utility function for generating a dense segment in memory
vasil-pashov Nov 11, 2025
0e1921c
Add more testing C++ utils
vasil-pashov Nov 13, 2025
8691d88
WIP on test
vasil-pashov Nov 13, 2025
3d02c50
Fixes to C++ tests
vasil-pashov Nov 14, 2025
40610a6
Passing unit test
vasil-pashov Nov 14, 2025
b1052de
Fix input_frame_from_tensors
vasil-pashov Nov 19, 2025
25eea67
Merge branch 'master' into vasil.pashov/merge-update-using-write-clause
vasil-pashov Nov 25, 2025
dfc8496
Make python API for merge propagate to C++
vasil-pashov Nov 25, 2025
437fee0
Merge branch 'vasil.pashov/feature/merge' into vasil.pashov/merge-upd…
vasil-pashov Nov 26, 2025
0687e03
vcpkg
vasil-pashov Nov 26, 2025
29a28e5
Fix custom formatters for gtest
vasil-pashov Nov 27, 2025
c88363d
Split utils from formatters
vasil-pashov Nov 27, 2025
967b70c
Fix compilation issues on the CI
vasil-pashov Nov 27, 2025
51f8854
Fix unreachable code
vasil-pashov Nov 28, 2025
07580e9
Fix windows build issues
Nov 28, 2025
546ccb7
Fix use after stack free
vasil-pashov Nov 29, 2025
336bae9
Fix out of bounds access
vasil-pashov Dec 1, 2025
c5f0516
Fix column selection when the source is SegmentInMemory
vasil-pashov Dec 1, 2025
ba437a8
Python tests for string column pass
vasil-pashov Dec 3, 2025
8431069
Refactor
vasil-pashov Dec 3, 2025
051af2c
Add comments
vasil-pashov Dec 3, 2025
686d8d7
Fix concept
vasil-pashov Dec 3, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions cpp/arcticdb/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,7 @@ set(arcticdb_srcs
column_store/memory_segment_impl.hpp
column_store/row_ref.hpp
column_store/string_pool.hpp
column_store/segment_utils.hpp
entity/atom_key.hpp
entity/frame_and_descriptor.hpp
entity/index_range.hpp
Expand Down Expand Up @@ -250,6 +251,7 @@ set(arcticdb_srcs
pipeline/index_utils.hpp
pipeline/index_writer.hpp
pipeline/input_frame.hpp
pipeline/input_frame_utils.hpp
pipeline/pandas_output_frame.hpp
pipeline/pipeline_common.hpp
pipeline/pipeline_utils.hpp
Expand Down Expand Up @@ -438,6 +440,7 @@ set(arcticdb_srcs
column_store/memory_segment.cpp
column_store/memory_segment_impl.cpp
column_store/memory_segment_impl.cpp
column_store/segment_utils.cpp
column_store/statistics.hpp
column_store/string_pool.cpp
entity/data_error.cpp
Expand Down Expand Up @@ -986,6 +989,7 @@ if(${TEST})
processing/test/test_signed_unsigned_comparison.cpp
processing/test/test_type_comparison.cpp
processing/test/test_unsorted_aggregation.cpp
processing/test/test_merge_update.cpp
storage/test/test_local_storages.cpp
storage/test/test_memory_storage.cpp
storage/test/test_s3_storage.cpp
Expand Down Expand Up @@ -1023,6 +1027,8 @@ if(${TEST})
util/test/test_string_pool.cpp
util/test/test_string_utils.cpp
util/test/test_tracing_allocator.cpp
util/test/gtest_custom_formatters.hpp
util/test/gtest_utils.hpp
version/test/test_append.cpp
version/test/test_key_block.cpp
version/test/test_sort_index.cpp
Expand Down Expand Up @@ -1106,6 +1112,12 @@ if(${TEST})
)
endif()

# In order for the custom formatting to work the ODR must be satisfied and every TU that uses gtest must include
# the gtest_custom_formatters.hpp https://github.com/google/googletest/issues/1149
target_compile_options(test_unit_arcticdb PRIVATE
$<$<CXX_COMPILER_ID:MSVC>:/FIutil/test/gtest_custom_formatters.hpp>
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-include util/test/gtest_custom_formatters.hpp>
)
gtest_discover_tests(test_unit_arcticdb PROPERTIES DISCOVERY_TIMEOUT 60)

set(benchmark_srcs
Expand Down
31 changes: 31 additions & 0 deletions cpp/arcticdb/column_store/memory_segment.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -299,3 +299,34 @@ class SegmentInMemory {
};

} // namespace arcticdb

namespace fmt {
template<>
struct formatter<arcticdb::SegmentInMemory> {
    template<typename ParseContext>
    constexpr auto parse(ParseContext& ctx) {
        return ctx.begin();
    }

    /// Formats a segment as a human-readable dump: a "Segment" header, then one
    /// block per column consisting of "Column[i]: <field descriptor>" followed
    /// by that column's values separated by spaces. Intended for test/debug
    /// output (see the gtest custom formatters), not for machine parsing.
    template<typename FormatContext>
    constexpr auto format(const arcticdb::SegmentInMemory& segment, FormatContext& ctx) const {
        // auto& avoids naming StreamDescriptor unqualified inside namespace fmt.
        const auto& desc = segment.descriptor();
        auto out = fmt::format_to(ctx.out(), "Segment\n");
        for (unsigned i = 0; i < desc.field_count(); ++i) {
            out = fmt::format_to(out, "\nColumn[{}]: {}\n", i, desc.field(i));
            visit_field(desc.field(i), [&](auto tdt) {
                using TDT = decltype(tdt);
                arcticdb::ColumnData cd = segment.column_data(i);
                for (auto it = cd.begin<TDT>(); it != cd.end<TDT>(); ++it) {
                    if constexpr (std::same_as<typename TDT::DataTypeTag::raw_type, int8_t>) {
                        // Widen int8_t to int so the value prints as a number
                        // rather than a (possibly unprintable) character.
                        // BUG FIX: the previous code passed the loop index `i`
                        // as the format argument; fmt ignored the extra value
                        // argument, so the column index was printed instead of
                        // the element.
                        out = fmt::format_to(out, "{} ", int(*it));
                    } else {
                        out = fmt::format_to(out, "{} ", *it);
                    }
                }
            });
        }
        return out;
    }
};
} // namespace fmt
61 changes: 61 additions & 0 deletions cpp/arcticdb/column_store/segment_utils.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
/* Copyright 2025 Man Group Operations Limited
*
* Use of this software is governed by the Business Source License 1.1 included in the file licenses/BSL.txt.
*
* As of the Change Date specified in that file, in accordance with the Business Source License, use of this software
* will be governed by the Apache License, version 2.0.
*/

#include <arcticdb/column_store/segment_utils.hpp>
#include <arcticdb/column_store/column.hpp>
#include <arcticdb/util/configs_map.hpp>
#include <arcticdb/column_store/column_algorithms.hpp>

namespace arcticdb {

/// Collects the set of distinct string-pool offsets stored in a string column.
/// @param column must hold a sequence (string) type; raises otherwise.
/// @return set of unique position_t values (offsets into the string pool).
ankerl::unordered_dense::set<entity::position_t> unique_values_for_string_column(const Column& column) {
    ankerl::unordered_dense::set<entity::position_t> output_set;
    // Guessing that unique values is a third of the column length
    // TODO would be useful to have actual unique count here from stats
    static auto map_reserve_ratio = ConfigsMap::instance()->get_int("UniqueColumns.AllocationRatio", 3);
    output_set.reserve(column.row_count() / map_reserve_ratio);

    details::visit_type(column.type().data_type(), [&](auto col_desc_tag) {
        using type_info = ScalarTypeInfo<decltype(col_desc_tag)>;
        if constexpr (is_sequence_type(type_info::data_type)) {
            arcticdb::for_each<typename type_info::TDT>(column, [&output_set](auto value) {
                output_set.emplace(value);
            });
        } else {
            // BUG FIX: the format string had a "{}" placeholder but no argument
            // was supplied; pass the offending type so the error is actionable.
            util::raise_rte("Column of type {} is not a string type column", column.type());
        }
    });
    return output_set;
}
Comment on lines +16 to +34
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was moved from cpp/arcticdb/column_store/segment_utils.hpp without any changes


std::vector<StreamDescriptor> split_descriptor(const StreamDescriptor& descriptor, const size_t cols_per_segment) {
if (descriptor.fields().size() <= cols_per_segment) {
return std::vector{descriptor};
}
const size_t num_segments = descriptor.fields().size() / cols_per_segment;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't we round up? E.g. with 3 fields and 2 cols per segment we'll end up dropping the last field?

std::vector<StreamDescriptor> res;
res.reserve(num_segments);

const unsigned field_count = descriptor.field_count();
for (size_t i = 0, source_field = descriptor.index().field_count(); i < num_segments; ++i) {
StreamDescriptor partial(descriptor.id());
if (descriptor.index().field_count() > 0) {
partial.set_index(descriptor.index());
for (unsigned index_field = 0; index_field < descriptor.index().field_count(); ++index_field) {
partial.add_field(descriptor.field(index_field));
}
}
for (size_t field = 0; field < cols_per_segment && source_field < field_count; ++field) {
partial.add_field(descriptor.field(source_field++));
}
res.push_back(std::move(partial));
}
return res;
}

} // namespace arcticdb
Loading
Loading