Commit d28b7a5

Rework C++ truncation for arrow
Arrow requires truncation to happen in the C++ layer. The previous logic had several issues:

- Wrong truncated range calculation
- Wrong offsets used when the first row offset != 0
- Wrong row range calculation

This PR fixes these issues by rewriting much of the column truncation logic and adding more tests.
1 parent 259a5e7 commit d28b7a5
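The "wrong offsets when the first row offset != 0" bullet comes down to translating the global row filter into column-local row indices before any truncation happens. A minimal, hedged sketch of that arithmetic with invented values and a stand-in struct (not ArcticDB's real RowRange/frame API):

#include <cassert>
#include <cstdint>

// GlobalRowFilter is a toy stand-in; values are illustrative only.
struct GlobalRowFilter { int64_t first; int64_t second; };   // half-open, in global frame rows

int main() {
    const int64_t first_row_offset = 1000;                   // suppose the frame starts at global row 1000
    const GlobalRowFilter row_filter{1003, 1076};            // caller asks for global rows [1003, 1076)

    // Truncation operates on column-local rows, so both bounds are shifted by the offset.
    const int64_t local_start = row_filter.first - first_row_offset;   // 3
    const int64_t local_end = row_filter.second - first_row_offset;    // 76
    assert(local_start == 3 && local_end == 76);
    return 0;
}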

File tree: 7 files changed (+91, -55 lines)

cpp/arcticdb/column_store/chunked_buffer.hpp

Lines changed: 3 additions & 3 deletions
@@ -475,16 +475,16 @@ class ChunkedBufferImpl {
         util::check(block == *blocks_.begin(), "Truncate first block position {} not within initial block", bytes);
         util::check(bytes < block->bytes(), "Can't truncate {} bytes from a {} byte block", bytes, block->bytes());
         auto remaining_bytes = block->bytes() - bytes;
-        auto new_block = create_block(bytes, 0);
+        auto new_block = create_block(remaining_bytes, block->offset_);
         new_block->copy_from(block->data() + bytes, remaining_bytes, 0);
         blocks_[0] = new_block;
         block->abandon();
         delete block;
     }

     void truncate_last_block(size_t bytes) {
-        auto [block, offset, ts] = block_and_offset(bytes);
-        util::check(block == *blocks_.rbegin(), "Truncate first block position {} not within initial block", bytes);
+        auto [block, offset, ts] = block_and_offset(bytes_ - bytes);
+        util::check(block == *blocks_.rbegin(), "Truncate last block position {} not within last block", bytes);
         util::check(bytes < block->bytes(), "Can't truncate {} bytes from a {} byte block", bytes, block->bytes());
         auto remaining_bytes = block->bytes() - bytes;
         auto new_block = create_block(remaining_bytes, block->offset_);
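The key change above is sizing the replacement block by the bytes that remain and carrying over the original block's logical offset (the old code used the truncated byte count and offset 0). A hedged, standalone sketch of that idea with toy types, not ArcticDB's actual Block/ChunkedBuffer API:

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

// Toy stand-in for a buffer block: raw bytes plus the block's logical offset in the buffer.
struct ToyBlock {
    std::vector<uint8_t> data;
    size_t offset;
    size_t bytes() const { return data.size(); }
};

// Drop `bytes` from the front: the new block holds the *remaining* bytes and keeps the offset.
ToyBlock truncate_front(const ToyBlock& block, size_t bytes) {
    assert(bytes < block.bytes());
    const size_t remaining_bytes = block.bytes() - bytes;
    ToyBlock new_block{std::vector<uint8_t>(remaining_bytes), block.offset};
    std::memcpy(new_block.data.data(), block.data.data() + bytes, remaining_bytes);
    return new_block;
}

int main() {
    const ToyBlock block{{1, 2, 3, 4, 5, 6, 7, 8}, 64};
    const ToyBlock truncated = truncate_front(block, 3);
    assert(truncated.bytes() == 5 && truncated.offset == 64 && truncated.data.front() == 4);
    return 0;
}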

cpp/arcticdb/column_store/column.cpp

Lines changed: 4 additions & 7 deletions
@@ -590,20 +590,17 @@ std::vector<std::shared_ptr<Column>> Column::split(const std::shared_ptr<Column>
     return output;
 }

-void Column::truncate_first_block(size_t row) {
+void Column::truncate_first_block(size_t start_row) {
     if(!is_sparse()) {
-        auto bytes = data_type_size(type_, OutputFormat::NATIVE, DataTypeMode::INTERNAL) * row;
+        auto bytes = start_row * data_type_size(type_, OutputFormat::NATIVE, DataTypeMode::INTERNAL);
         data_.buffer().truncate_first_block(bytes);
     }
 }

-void Column::truncate_last_block(size_t row) {
+void Column::truncate_last_block(size_t end_row) {
     if(!is_sparse()) {
         const auto column_row_count = row_count();
-        if(row < static_cast<size_t>(column_row_count))
-            return;
-
-        auto bytes = data_type_size(type_, OutputFormat::NATIVE, DataTypeMode::INTERNAL) * (column_row_count - row);
+        auto bytes = (column_row_count - end_row) * data_type_size(type_, OutputFormat::NATIVE, DataTypeMode::INTERNAL);
         data_.buffer().truncate_last_block(bytes);
     }
 }
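The dense-column case above is pure arithmetic: rows cut at the front cost start_row items, rows cut at the back cost row_count - end_row items, with end_row exclusive. A small hedged sketch with a worked example; the helper names are illustrative, not ArcticDB functions:

#include <cassert>
#include <cstddef>
#include <cstdint>

// Bytes to drop from the front of a dense fixed-width column: rows [0, start_row) go.
size_t bytes_to_drop_front(size_t start_row, size_t item_size) {
    return start_row * item_size;
}

// Bytes to drop from the back: rows [end_row, row_count) go (end_row is exclusive).
size_t bytes_to_drop_back(size_t row_count, size_t end_row, size_t item_size) {
    return (row_count - end_row) * item_size;
}

int main() {
    // 100 int64 rows, keep rows [3, 76): 3 * 8 = 24 bytes off the front, 24 * 8 = 192 off the back.
    assert(bytes_to_drop_front(3, sizeof(int64_t)) == 24);
    assert(bytes_to_drop_back(100, 76, sizeof(int64_t)) == 192);
    return 0;
}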

cpp/arcticdb/column_store/column.hpp

Lines changed: 1 addition & 0 deletions
@@ -742,6 +742,7 @@ class Column {
         // At least one value in the column exactly matches the input val
         // Search to the right/left for the last/first such value
         if (from_right) {
+            // TODO: Super inefficient
             while (++mid <= high && val == accessor.at(mid)) {}
             res = mid;
         } else {
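The "Super inefficient" TODO refers to the linear walk used to find the first/last element equal to the matched value. For reference, a hedged sketch of the O(log n) alternative on a plain sorted vector using the standard library; it does not use ArcticDB's column accessor:

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

// First index whose value equals `val`, assuming `sorted` is ascending and contains `val`.
int64_t first_equal_index(const std::vector<int64_t>& sorted, int64_t val) {
    auto it = std::lower_bound(sorted.begin(), sorted.end(), val);   // first element >= val
    return static_cast<int64_t>(it - sorted.begin());
}

// Last index whose value equals `val`, under the same assumptions.
int64_t last_equal_index(const std::vector<int64_t>& sorted, int64_t val) {
    auto it = std::upper_bound(sorted.begin(), sorted.end(), val);   // first element > val
    return static_cast<int64_t>(it - sorted.begin()) - 1;
}

int main() {
    const std::vector<int64_t> index{10, 20, 20, 20, 30};
    assert(first_equal_index(index, 20) == 1);
    assert(last_equal_index(index, 20) == 3);
    return 0;
}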

cpp/arcticdb/pipeline/frame_slice.hpp

Lines changed: 1 addition & 1 deletion
@@ -60,7 +60,7 @@ struct RowRange : AxisRange {
 };

 inline bool contains(const RowRange& range, size_t row) {
-    return row >= range.first && row <= range.second;
+    return row >= range.first && row < range.second;
 }

 struct FrameSlice {
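A small hedged illustration of the semantics the fixed contains() now has: a RowRange behaves as a half-open interval [first, second), matching iloc-style row slicing. SimpleRange below is a stand-in, not the real RowRange:

#include <cassert>
#include <cstddef>

struct SimpleRange { size_t first; size_t second; };   // stand-in for a half-open row range

inline bool contains(const SimpleRange& range, size_t row) {
    return row >= range.first && row < range.second;   // end-exclusive; the old `<=` included the bound
}

int main() {
    const SimpleRange range{10, 20};
    assert(contains(range, 10));    // lower bound is included
    assert(contains(range, 19));
    assert(!contains(range, 20));   // upper bound is excluded
    return 0;
}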

cpp/arcticdb/pipeline/read_frame.cpp

Lines changed: 45 additions & 31 deletions
@@ -247,12 +247,15 @@ void decode_index_field(
 void handle_truncation(
         Column& dest_column,
         const ColumnTruncation& truncate) {
-    if(dest_column.num_blocks() == 1 && truncate.start_ && truncate.end_)
+    if(dest_column.num_blocks() == 1 && truncate.start_ && truncate.end_) {
         dest_column.truncate_single_block(*truncate.start_, *truncate.end_);
-    else if(truncate.start_)
-        dest_column.truncate_first_block(*truncate.start_);
-    else if(truncate.end_)
-        dest_column.truncate_last_block(*truncate.end_);
+    }
+    else {
+        if(truncate.start_)
+            dest_column.truncate_first_block(*truncate.start_);
+        if(truncate.end_)
+            dest_column.truncate_last_block(*truncate.end_);
+    }
 }

 void handle_truncation(
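The behavioural point of the change above: when a column spans several blocks, a filter can cut the first block and the last block at the same time, so the two truncations must not be mutually exclusive as they were in the old else-if chain. A hedged, standalone sketch of the dispatch with stand-in types, printing instead of truncating:

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <optional>

struct ToyTruncation {
    std::optional<int64_t> start_;   // first row to keep, if the front is cut
    std::optional<int64_t> end_;     // first row to drop, if the back is cut
};

void apply_truncation(size_t num_blocks, const ToyTruncation& truncate) {
    if(num_blocks == 1 && truncate.start_ && truncate.end_) {
        std::cout << "truncate_single_block(" << *truncate.start_ << ", " << *truncate.end_ << ")\n";
    } else {
        // Not mutually exclusive: a multi-block column may need both.
        if(truncate.start_)
            std::cout << "truncate_first_block(" << *truncate.start_ << ")\n";
        if(truncate.end_)
            std::cout << "truncate_last_block(" << *truncate.end_ << ")\n";
    }
}

int main() {
    apply_truncation(3, {2, 75});   // multi-block: front and back are both trimmed
    apply_truncation(1, {2, 75});   // single block: one combined call
    return 0;
}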
@@ -329,36 +332,40 @@ void decode_or_expand(
 template <typename IndexValueType>
 ColumnTruncation get_truncate_range_from_index(
         const Column& column,
-        const IndexValueType& start,
-        const IndexValueType& end,
-        std::optional<int64_t> start_offset = std::nullopt,
-        std::optional<int64_t> end_offset = std::nullopt) {
-    int64_t start_row = column.search_sorted<IndexValueType>(start, false, start_offset, end_offset);
-    int64_t end_row = column.search_sorted<IndexValueType>(end, true, start_offset, end_offset);
+        const IndexValueType& filter_start,
+        const IndexValueType& filter_end,
+        int64_t start_col_offset,
+        int64_t end_col_offset) {
+    // search_sorted expects inclusive end_col_offset
+    auto inclusive_end_col_offset = end_col_offset - 1;
+    int64_t start_row = column.search_sorted<IndexValueType>(filter_start, false, start_col_offset, inclusive_end_col_offset);
+    int64_t end_row = column.search_sorted<IndexValueType>(filter_end, true, start_col_offset, inclusive_end_col_offset);
+
     std::optional<int64_t> truncate_start;
     std::optional<int64_t> truncate_end;
-    if((start_offset && start_row != *start_offset) || (!start_offset && start_row > 0))
+    if(start_row != start_col_offset)
         truncate_start = start_row;

-    if((end_offset && end_row != *end_offset) || (!end_offset && end_row < column.row_count() - 1))
+    if(end_row != end_col_offset)
         truncate_end = end_row;

     return {truncate_start, truncate_end};
 }

-std::pair<std::optional<int64_t>, std::optional<int64_t>> get_truncate_range_from_rows(
-        const RowRange& row_range,
-        size_t start_offset,
-        size_t end_offset) {
+ColumnTruncation get_truncate_range_from_rows(
+        const RowRange& slice_range,
+        size_t row_filter_start,
+        size_t row_filter_end) {
     std::optional<int64_t> truncate_start;
     std::optional<int64_t> truncate_end;
-    if(contains(row_range, start_offset))
-        truncate_start = start_offset;
+    // TODO: Explain
+    if(contains(slice_range, row_filter_start) && row_filter_start != slice_range.start())
+        truncate_start = row_filter_start;

-    if(contains(row_range, end_offset))
-        truncate_end = end_offset;
+    if(contains(slice_range, row_filter_end) && row_filter_end != slice_range.start())
+        truncate_end = row_filter_end;

-    return std::make_pair(truncate_start, truncate_end);
+    return {truncate_start, truncate_end};
 }

 ColumnTruncation get_truncate_range(
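To make the row-based branch above concrete: for a slice covering column-local rows [first, second), a truncation bound is only recorded when the filter bound actually lands inside the slice and differs from the slice start. A hedged sketch with stand-in types (ToyRange is not the real RowRange):

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <optional>
#include <utility>

struct ToyRange { size_t first; size_t second; };   // half-open [first, second)

inline bool in_range(const ToyRange& range, size_t row) {
    return row >= range.first && row < range.second;
}

// Optional start/end truncation rows for one slice, given a column-local row filter.
std::pair<std::optional<int64_t>, std::optional<int64_t>>
truncate_bounds(const ToyRange& slice, size_t filter_start, size_t filter_end) {
    std::optional<int64_t> start, end;
    if(in_range(slice, filter_start) && filter_start != slice.first)
        start = static_cast<int64_t>(filter_start);   // rows before the filter start are cut
    if(in_range(slice, filter_end) && filter_end != slice.first)
        end = static_cast<int64_t>(filter_end);       // rows from the filter end onwards are cut
    return {start, end};
}

int main() {
    const ToyRange slice{100, 200};
    auto [s1, e1] = truncate_bounds(slice, 150, 300);   // filter starts inside: only the front is cut
    assert(s1 && *s1 == 150 && !e1);
    auto [s2, e2] = truncate_bounds(slice, 0, 170);     // filter ends inside: only the back is cut
    assert(!s2 && e2 && *e2 == 170);
    return 0;
}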
@@ -370,32 +377,39 @@ ColumnTruncation get_truncate_range(
         const EncodedFieldImpl& index_field,
         const uint8_t* index_field_offset) {
     ColumnTruncation truncate_rows;
+    const auto& slice_row_range = context.slice_and_key().slice().row_range;
+    const auto& first_row_offset = frame.offset();
+    auto column_slice_row_range = RowRange(slice_row_range.first - first_row_offset, slice_row_range.second - first_row_offset);
     if(read_options.output_format() == OutputFormat::ARROW) {
         util::variant_match(read_query.row_filter,
-            [&truncate_rows, &frame, &context, &index_field, index_field_offset, encoding_version] (const IndexRange& index_range) {
-                const auto& time_range = static_cast<const TimestampRange&>(index_range);
+            [&truncate_rows, &column_slice_row_range, &frame, &context, &index_field, index_field_offset, encoding_version] (const IndexRange& index_filter) {
+                const auto& time_filter = static_cast<const TimestampRange&>(index_filter);
                 const auto& slice_time_range = context.slice_and_key().key().time_range();
-                if(contains(slice_time_range, time_range.first) || contains(slice_time_range, time_range.second)) {
+                // The `get_truncate_range_from_index` is O(logn). The `contains` checks serves to avoid the expensive
+                // O(logn) check for blocks in the middle of the range
+                if(contains(slice_time_range, time_filter.first) || contains(slice_time_range, time_filter.second)) {
                     if(context.fetch_index()) {
                         const auto& index_column = frame.column(0);
-                        truncate_rows = get_truncate_range_from_index(index_column, time_range.first, time_range.second);
+                        truncate_rows = get_truncate_range_from_index(index_column, time_filter.first, time_filter.second, column_slice_row_range.first, column_slice_row_range.second);
                     } else {
                         const auto& frame_index_desc = frame.descriptor().fields(0UL);
                         Column sink{frame_index_desc.type(), encoding_sizes::field_uncompressed_size(index_field), AllocationType::PRESIZED, Sparsity::PERMITTED};
                         std::optional<util::BitMagic> bv;
                         (void)decode_field(frame_index_desc.type(), index_field, index_field_offset, sink, bv, encoding_version);
-                        truncate_rows = get_truncate_range_from_index(sink, time_range.first, time_range.second);
+                        truncate_rows = get_truncate_range_from_index(sink, time_filter.first, time_filter.second, column_slice_row_range.first, column_slice_row_range.second);
                     }
                 }
             },
-            [&context] (const RowRange& row_range) {
-                const auto& slice_row_range = context.slice_and_key().slice().row_range;
-                get_truncate_range_from_rows(row_range, slice_row_range.start(), slice_row_range.end());
+            [&truncate_rows, &column_slice_row_range, &first_row_offset] (const RowRange& row_filter) {
+                // The row_filter is with respect to global offset. Column truncation cares about column indices.
+                auto row_filter_start = row_filter.first - first_row_offset;
+                auto row_filter_end = row_filter.second - first_row_offset;
+                truncate_rows = get_truncate_range_from_rows(column_slice_row_range, row_filter_start, row_filter_end);
             },
             [] (const auto&) {
                 // Do nothing
             });
-        }
+    }
     return truncate_rows;
 };

python/arcticdb/version_store/_store.py

Lines changed: 7 additions & 6 deletions
@@ -824,7 +824,7 @@ def update(
             If a range is specified, it will clear/delete the data within the
             range and overwrite it with the data in `data`. This allows the user
             to update with data that might only be a subset of the
-            original data.
+            original data. Note date_range is end-inclusive.
         upsert: bool, default=False
             If True, will write the data even if the symbol does not exist.
         prune_previous_version
@@ -1932,18 +1932,19 @@ def read(
             `str` : snapshot name which contains the version
             `datetime.datetime` : the version of the data that existed as_of the requested point in time
         date_range: `Optional[DateRangeInput]`, default=None
-            DateRange to read data for. Applicable only for Pandas data with a DateTime index. Returns only the part
-            of the data that falls within the given range. The same effect can be achieved by using the date_range
-            clause of the QueryBuilder class, which will be slower, but return data with a smaller memory footprint.
-            See the QueryBuilder.date_range docstring for more details.
+            DateRange to read data for. Inclusive both for lower and upper bounds. Applicable only for dataframes with
+            a DateTime index. Returns only the part of the data that falls within the given range.
+            The same effect can be achieved by using the date_range clause of the QueryBuilder class, which will be
+            slower, but return data with a smaller memory footprint. See the QueryBuilder.date_range docstring for more
+            details.
             Only one of date_range or row_range can be provided.
         row_range: `Optional[Tuple[int, int]]`, default=None
             Row range to read data for. Inclusive of the lower bound, exclusive of the upper bound
             lib.read(symbol, row_range=(start, end)).data should behave the same as df.iloc[start:end], including in
             the handling of negative start/end values.
             Only one of date_range or row_range can be provided.
         columns: `Optional[List[str]]`, default=None
-            Applicable only for Pandas data. Determines which columns to return data for.
+            Applicable only for dataframes. Determines which columns to return data for.
         query_builder: 'Optional[QueryBuilder]', default=None
             A QueryBuilder object to apply to the dataframe before it is returned.
             For more information see the documentation for the QueryBuilder class.

python/tests/unit/arcticdb/version_store/test_arrow.py

Lines changed: 30 additions & 7 deletions
@@ -21,8 +21,10 @@ def test_basic(lmdb_version_store_v1):


 # TODO: Do this fix during normalization in frontend PR
-def fix_timeseries_index(df):
+def fix_timeseries_index(df, set_index=False):
     df["index"] = df["index"].apply(lambda x : pd.Timestamp(x))
+    if set_index:
+        df = df.set_index("index")
     return df


@@ -130,9 +132,10 @@ def test_all_types(lmdb_version_store_v1):
     assert_frame_equal(result, df)


+@pytest.mark.parametrize("segment_row_size", [1, 2, 10, 100])
 @pytest.mark.parametrize("start_offset,end_offset", [(2, 3), (3, 75), (4, 32), (0, 99), (7, 56)])
-def test_date_range(lmdb_version_store_v1, start_offset, end_offset):
-    lib = lmdb_version_store_v1
+def test_date_range(version_store_factory, segment_row_size, start_offset, end_offset):
+    lib = version_store_factory(segment_row_size=segment_row_size)
     initial_timestamp = pd.Timestamp("2019-01-01")
     df = pd.DataFrame(data=np.arange(100), index=pd.date_range(initial_timestamp, periods=100), columns=['x'])
     sym = "arrow_date_test"
@@ -143,14 +146,34 @@ def test_date_range(lmdb_version_store_v1, start_offset, end_offset):

     date_range = (query_start_ts, query_end_ts)
     data_closed_table = lib.read(sym, date_range=date_range, _output_format=OutputFormat.ARROW).data
-    df = data_closed_table.to_pandas()
-    df = df.set_index('index')
-    assert query_start_ts == pd.Timestamp(df.index[0])
-    assert query_end_ts == pd.Timestamp(df.index[-1])
+    df = fix_timeseries_index(data_closed_table.to_pandas(), set_index=True)
+    assert query_start_ts == df.index[0]
+    assert query_end_ts == df.index[-1]
     assert df['x'].iloc[0] == start_offset
     assert df['x'].iloc[-1] == end_offset


+@pytest.mark.parametrize("segment_row_size", [1, 2, 10, 100])
+@pytest.mark.parametrize("start_offset,end_offset", [(2, 4), (3, 76), (4, 33), (0, 100), (7, 57)])
+def test_row_range(version_store_factory, segment_row_size, start_offset, end_offset):
+    lib = version_store_factory(segment_row_size=segment_row_size)
+    initial_timestamp = pd.Timestamp("2019-01-01")
+    df = pd.DataFrame(data=np.arange(100), index=pd.date_range(initial_timestamp, periods=100), columns=['x'])
+    sym = "arrow_date_test"
+    lib.write(sym, df)
+
+    row_range = (start_offset, end_offset)
+    data_closed_table = lib.read(sym, row_range=row_range, _output_format=OutputFormat.ARROW).data
+    df = fix_timeseries_index(data_closed_table.to_pandas(), set_index=True)
+
+    start_ts = initial_timestamp + pd.DateOffset(start_offset)
+    end_ts = initial_timestamp + pd.DateOffset(end_offset-1)
+    assert start_ts == df.index[0]
+    assert end_ts == df.index[-1]
+    assert df['x'].iloc[0] == start_offset
+    assert df['x'].iloc[-1] == end_offset-1
+
+
 def test_with_querybuilder(lmdb_version_store_v1):
     lib = lmdb_version_store_v1
     df = pd.DataFrame({"x": np.arange(10), "y": np.arange(10.0, 20.0)})
