Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
48 commits
Select commit Hold shift + click to select a range
167cb2f
Add basic structure for read_modify_write
vasil-pashov Oct 16, 2025
580cf14
Working version of read modify write
vasil-pashov Oct 21, 2025
abe52e3
Implement write clause
vasil-pashov Oct 21, 2025
59af346
Make the write clause store future components so that it does not dea…
vasil-pashov Oct 22, 2025
97549b6
Fix a bug with empty dataframes. Test all filtering
vasil-pashov Oct 23, 2025
dfda400
Fix col range for multiindex
vasil-pashov Oct 23, 2025
0473e65
wip
vasil-pashov Oct 24, 2025
3490e74
Compact rows function
vasil-pashov Oct 24, 2025
b613264
Add comments
vasil-pashov Oct 24, 2025
769e271
Fix compilation errors
vasil-pashov Oct 24, 2025
83a8065
Fix compilation errors
vasil-pashov Oct 24, 2025
c57fdc0
Fix compilation errors
vasil-pashov Oct 24, 2025
27cbe29
Apply comments
vasil-pashov Oct 30, 2025
5f94216
Add C++ stress test to check for deadlocks in write clause
vasil-pashov Oct 31, 2025
12e2c28
Add overload for async_write that performs the encoding in the curren…
vasil-pashov Nov 3, 2025
658e2dc
Structure write clause by row slice
vasil-pashov Nov 3, 2025
056b8d8
Fix resampling tests
vasil-pashov Nov 3, 2025
c28903b
Address review comments
vasil-pashov Nov 4, 2025
6c9438a
Extract the variant match in async write to a separate function
vasil-pashov Nov 5, 2025
809d961
Fix resampling tests
vasil-pashov Nov 3, 2025
6615c97
Prepare for merge update
vasil-pashov Nov 3, 2025
556fdce
Add merge clause skeleton
vasil-pashov Nov 5, 2025
7c861b2
Implement structure for processing
vasil-pashov Nov 6, 2025
fca60b9
WIP
vasil-pashov Nov 10, 2025
38ac5b5
Unify iteration over source columns
vasil-pashov Nov 10, 2025
ab71903
Add utility functions for generating columns, native tensors and segm…
vasil-pashov Nov 11, 2025
9b80b7a
Add one more utility function for generating a dense segment in memory
vasil-pashov Nov 11, 2025
0e1921c
Add more testing C++ utils
vasil-pashov Nov 13, 2025
8691d88
WIP on test
vasil-pashov Nov 13, 2025
3d02c50
Fixes to C++ tests
vasil-pashov Nov 14, 2025
40610a6
Passing unit test
vasil-pashov Nov 14, 2025
b1052de
Fix input_frame_from_tensors
vasil-pashov Nov 19, 2025
25eea67
Merge branch 'master' into vasil.pashov/merge-update-using-write-clause
vasil-pashov Nov 25, 2025
dfc8496
Make python API for merge propagate to C++
vasil-pashov Nov 25, 2025
437fee0
Merge branch 'vasil.pashov/feature/merge' into vasil.pashov/merge-upd…
vasil-pashov Nov 26, 2025
0687e03
vcpkg
vasil-pashov Nov 26, 2025
29a28e5
Fix custom formatters for gtest
vasil-pashov Nov 27, 2025
c88363d
Split utils from formatters
vasil-pashov Nov 27, 2025
967b70c
Fix compilation issues on the CI
vasil-pashov Nov 27, 2025
51f8854
Fix unreachable code
vasil-pashov Nov 28, 2025
07580e9
Fix windows build issues
Nov 28, 2025
546ccb7
Fix use after stack free
vasil-pashov Nov 29, 2025
336bae9
Fix out of bounds access
vasil-pashov Dec 1, 2025
c5f0516
Fix column selection when the source is SegmentInMemory
vasil-pashov Dec 1, 2025
ba437a8
Python tests for string column pass
vasil-pashov Dec 3, 2025
8431069
Refactor
vasil-pashov Dec 3, 2025
051af2c
Add comments
vasil-pashov Dec 3, 2025
686d8d7
Fix concept
vasil-pashov Dec 3, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions cpp/arcticdb/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,7 @@ set(arcticdb_srcs
column_store/memory_segment_impl.hpp
column_store/row_ref.hpp
column_store/string_pool.hpp
column_store/segment_utils.hpp
entity/atom_key.hpp
entity/frame_and_descriptor.hpp
entity/index_range.hpp
Expand Down Expand Up @@ -250,6 +251,7 @@ set(arcticdb_srcs
pipeline/index_utils.hpp
pipeline/index_writer.hpp
pipeline/input_frame.hpp
pipeline/input_frame_utils.hpp
pipeline/pandas_output_frame.hpp
pipeline/pipeline_common.hpp
pipeline/pipeline_utils.hpp
Expand Down Expand Up @@ -438,6 +440,7 @@ set(arcticdb_srcs
column_store/memory_segment.cpp
column_store/memory_segment_impl.cpp
column_store/memory_segment_impl.cpp
column_store/segment_utils.cpp
column_store/statistics.hpp
column_store/string_pool.cpp
entity/data_error.cpp
Expand Down Expand Up @@ -986,6 +989,7 @@ if(${TEST})
processing/test/test_signed_unsigned_comparison.cpp
processing/test/test_type_comparison.cpp
processing/test/test_unsorted_aggregation.cpp
processing/test/test_merge_update.cpp
storage/test/test_local_storages.cpp
storage/test/test_memory_storage.cpp
storage/test/test_s3_storage.cpp
Expand Down Expand Up @@ -1023,6 +1027,8 @@ if(${TEST})
util/test/test_string_pool.cpp
util/test/test_string_utils.cpp
util/test/test_tracing_allocator.cpp
util/test/gtest_custom_formatters.hpp
util/test/gtest_utils.hpp
version/test/test_append.cpp
version/test/test_key_block.cpp
version/test/test_sort_index.cpp
Expand Down Expand Up @@ -1106,6 +1112,12 @@ if(${TEST})
)
endif()

# In order for the custom formatting to work the ODR must be satisfied and every TU that uses gtest must include
# the gtest_custom_formatters.hpp https://github.com/google/googletest/issues/1149
target_compile_options(test_unit_arcticdb PRIVATE
$<$<CXX_COMPILER_ID:MSVC>:/FIutil/test/gtest_custom_formatters.hpp>
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-include util/test/gtest_custom_formatters.hpp>
)
gtest_discover_tests(test_unit_arcticdb PROPERTIES DISCOVERY_TIMEOUT 60)

set(benchmark_srcs
Expand Down
31 changes: 31 additions & 0 deletions cpp/arcticdb/column_store/memory_segment.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -299,3 +299,34 @@ class SegmentInMemory {
};

} // namespace arcticdb

namespace fmt {
template<>
struct formatter<arcticdb::SegmentInMemory> {
    template<typename ParseContext>
    constexpr auto parse(ParseContext& ctx) {
        return ctx.begin();
    }

    /// Formats a segment as a human-readable dump: a "Segment" header, then one
    /// block per column consisting of "Column[i]: <field descriptor>" followed
    /// by that column's values separated by spaces. Intended for test/debug
    /// output (see the gtest custom formatters), not for machine parsing.
    template<typename FormatContext>
    constexpr auto format(const arcticdb::SegmentInMemory& segment, FormatContext& ctx) const {
        // auto& avoids naming StreamDescriptor unqualified inside namespace fmt.
        const auto& desc = segment.descriptor();
        auto out = fmt::format_to(ctx.out(), "Segment\n");
        for (unsigned i = 0; i < desc.field_count(); ++i) {
            out = fmt::format_to(out, "\nColumn[{}]: {}\n", i, desc.field(i));
            visit_field(desc.field(i), [&](auto tdt) {
                using TDT = decltype(tdt);
                arcticdb::ColumnData cd = segment.column_data(i);
                for (auto it = cd.begin<TDT>(); it != cd.end<TDT>(); ++it) {
                    if constexpr (std::same_as<typename TDT::DataTypeTag::raw_type, int8_t>) {
                        // Widen int8_t to int so the value prints as a number
                        // rather than a (possibly unprintable) character.
                        // BUG FIX: the previous code passed the loop index `i`
                        // as the format argument; fmt ignored the extra value
                        // argument, so the column index was printed instead of
                        // the element.
                        out = fmt::format_to(out, "{} ", int(*it));
                    } else {
                        out = fmt::format_to(out, "{} ", *it);
                    }
                }
            });
        }
        return out;
    }
};
} // namespace fmt
61 changes: 61 additions & 0 deletions cpp/arcticdb/column_store/segment_utils.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
/* Copyright 2025 Man Group Operations Limited
*
* Use of this software is governed by the Business Source License 1.1 included in the file licenses/BSL.txt.
*
* As of the Change Date specified in that file, in accordance with the Business Source License, use of this software
* will be governed by the Apache License, version 2.0.
*/

#include <arcticdb/column_store/segment_utils.hpp>
#include <arcticdb/column_store/column.hpp>
#include <arcticdb/util/configs_map.hpp>
#include <arcticdb/column_store/column_algorithms.hpp>

namespace arcticdb {

/// Collects the set of distinct string-pool offsets stored in a string column.
/// @param column must hold a sequence (string) type; raises otherwise.
/// @return set of unique position_t values (offsets into the string pool).
ankerl::unordered_dense::set<entity::position_t> unique_values_for_string_column(const Column& column) {
    ankerl::unordered_dense::set<entity::position_t> output_set;
    // Guessing that unique values is a third of the column length
    // TODO would be useful to have actual unique count here from stats
    static auto map_reserve_ratio = ConfigsMap::instance()->get_int("UniqueColumns.AllocationRatio", 3);
    output_set.reserve(column.row_count() / map_reserve_ratio);

    details::visit_type(column.type().data_type(), [&](auto col_desc_tag) {
        using type_info = ScalarTypeInfo<decltype(col_desc_tag)>;
        if constexpr (is_sequence_type(type_info::data_type)) {
            arcticdb::for_each<typename type_info::TDT>(column, [&output_set](auto value) {
                output_set.emplace(value);
            });
        } else {
            // BUG FIX: the format string had a "{}" placeholder but no argument
            // was supplied; pass the offending type so the error is actionable.
            util::raise_rte("Column of type {} is not a string type column", column.type());
        }
    });
    return output_set;
}
Comment on lines +16 to +34
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was moved from cpp/arcticdb/column_store/segment_utils.hpp without any changes


std::vector<StreamDescriptor> split_descriptor(const StreamDescriptor& descriptor, const size_t cols_per_segment) {
if (descriptor.fields().size() <= cols_per_segment) {
return std::vector{descriptor};
}
const size_t num_segments = descriptor.fields().size() / cols_per_segment;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't we round up? E.g. with 3 fields and 2 cols per segment we'll end up dropping the last field?

std::vector<StreamDescriptor> res;
res.reserve(num_segments);

const unsigned field_count = descriptor.field_count();
for (size_t i = 0, source_field = descriptor.index().field_count(); i < num_segments; ++i) {
StreamDescriptor partial(descriptor.id());
if (descriptor.index().field_count() > 0) {
partial.set_index(descriptor.index());
for (unsigned index_field = 0; index_field < descriptor.index().field_count(); ++index_field) {
partial.add_field(descriptor.field(index_field));
}
}
for (size_t field = 0; field < cols_per_segment && source_field < field_count; ++field) {
partial.add_field(descriptor.field(source_field++));
}
res.push_back(std::move(partial));
}
return res;
}

} // namespace arcticdb
Loading
Loading