Commit e28c5f1
5.2.4: Fix off-by-one errors in staged segment validation (#2191) (#2192)

Our validation when finalizing is currently too strict. The index start and end on an APPEND_DATA key are [start_time, end_time + 1] for the data contained within its segment, but our validation logic does not always subtract one from the key's end time to recover the true date range of the segment. A similar change is needed when detecting duplicate staged segments: if two segments that both cover a single duplicated index value are staged, we should allow the write.
1 parent 370e96e commit e28c5f1
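To make the half-open key convention concrete, here is a minimal Python sketch of the corrected checks. It models each APPEND_DATA key as a (start_time, end_time) pair with end_time one past the last index value in the segment. The `can_finalize` helper is hypothetical, written for illustration only, and is not ArcticDB code:

def can_finalize(keys):
    """keys: (start_time, end_time) per staged segment, end_time exclusive."""
    unique_ranges = set()
    for start, end in keys:
        if (start, end) in unique_ranges and end - 1 != start:
            # An identical range is only acceptable when the segment spans a
            # single index value (end - 1 == start), i.e. a duplicated timestamp
            return False
        unique_ranges.add((start, end))
    ranges = sorted(unique_ranges)
    for (_, end1), (start2, _) in zip(ranges, ranges[1:]):
        # end1 - 1 is the true last index value of the earlier segment, so
        # segments may share a boundary timestamp without counting as an overlap
        if start2 < end1 - 1:
            return False
    return True

assert can_finalize([(0, 1001), (1000, 1001)])     # touch at 1000: allowed
assert can_finalize([(1000, 1001), (1000, 1001)])  # duplicated single value: allowed
assert not can_finalize([(0, 3), (1, 4)])          # genuine 1ns overlap: rejected

The two asserts that pass correspond to the key ranges exercised in test_chunks_overlap and test_chunks_the_same in the test diff below.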

File tree

2 files changed (+156, -3 lines)

cpp/arcticdb/version/version_core.cpp (+5, -2)

@@ -1180,7 +1180,9 @@ void check_incompletes_index_ranges_dont_overlap(const std::shared_ptr<PipelineC
         auto [_, inserted] = unique_timestamp_ranges.emplace(key.start_time(), key.end_time());
         // This is correct because incomplete segments aren't column sliced
         sorting::check<ErrorCode::E_UNSORTED_DATA>(
-                inserted,
+                // If the segment is entirely covering a single index value, then duplicates are fine
+                // -1 as end_time is stored as 1 greater than the last index value in the segment
+                inserted || key.end_time() - 1 == key.start_time(),
                 "Cannot finalize staged data as 2 or more incomplete segments cover identical index values (in UTC): ({}, {})",
                 date_and_time(key.start_time()), date_and_time(key.end_time()));
     }
@@ -1189,7 +1191,8 @@ void check_incompletes_index_ranges_dont_overlap(const std::shared_ptr<PipelineC
         auto next_it = std::next(it);
         if (next_it != unique_timestamp_ranges.end()) {
             sorting::check<ErrorCode::E_UNSORTED_DATA>(
-                    next_it->first >= it->second,
+                    // -1 as end_time is stored as 1 greater than the last index value in the segment
+                    next_it->first >= it->second - 1,
                 "Cannot finalize staged data as incomplete segment index values overlap one another (in UTC): ({}, {}) intersects ({}, {})",
                 date_and_time(it->first),
                 date_and_time(it->second - 1),
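Worked example (using the key ranges from the first test below): with staged keys covering index ranges [0,1001] and [1000,1001], the second segment's first index value is 1000 and the first segment's true last index value is 1001 - 1 = 1000, so `next_it->first >= it->second - 1` holds and finalization proceeds; the segments share the duplicated timestamp 1000 rather than genuinely overlapping.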

python/tests/unit/arcticdb/version_store/test_parallel.py (+151, -1)

@@ -27,9 +27,10 @@
 )
 from arcticdb.util._versions import IS_PANDAS_TWO
 from arcticdb.version_store.library import Library
+from arcticdb_ext.exceptions import UnsortedDataException
 from arcticdb_ext.storage import KeyType

-from arcticdb import util
+from arcticdb import util, LibraryOptions

 from arcticdb.util.test import config_context_multi

@@ -1437,3 +1438,152 @@ def test_writing_wide_segment_over_sliced_data(
     lib.compact_incomplete("sym", False, False)

     assert_frame_equal(lib.read("sym").data, df_1)
+
+
+def test_chunks_overlap(lmdb_storage, lib_name):
+    """Given - we stage chunks with indexes:
+
+    b:test:0:0xdfde242de44bdf38@1739968386409923711[0,1001]
+    b:test:0:0x95750a82cfa088df@1739968386410180283[1000,1001]
+
+    When - We finalize the staged segments
+
+    Then - We should succeed even though the segments seem to overlap by 1ns, because the end time in the key is 1
+    greater than the last index value in the segment
+    """
+    lib: Library = lmdb_storage.create_arctic().create_library(
+        lib_name,
+        library_options=LibraryOptions(rows_per_segment=2))
+
+    idx = [
+        pd.Timestamp(0),
+        pd.Timestamp(1000),
+        pd.Timestamp(1000),
+        pd.Timestamp(1000),
+    ]
+
+    data = pd.DataFrame({"a": len(idx)}, index=idx)
+    lib.write("test", data, staged=True)
+
+    lt = lib._nvs.library_tool()
+    append_keys = lt.find_keys_for_id(KeyType.APPEND_DATA, "test")
+    assert len(append_keys) == 2
+    assert sorted([key.start_index for key in append_keys]) == [0, 1000]
+    assert [key.end_index for key in append_keys] == [1001, 1001]
+
+    lib.finalize_staged_data("test")
+
+    df = lib.read("test").data
+    assert_frame_equal(df, data)
+
+
+def test_chunks_overlap_1ns(lmdb_storage, lib_name):
+    """Given - we stage chunks that overlap by 1ns
+
+    When - We finalize the staged segments
+
+    Then - We should raise a validation error
+    """
+    lib: Library = lmdb_storage.create_arctic().create_library(
+        lib_name,
+        library_options=LibraryOptions(rows_per_segment=2))
+
+    idx = [pd.Timestamp(0), pd.Timestamp(1), pd.Timestamp(2)]
+    first = pd.DataFrame({"a": len(idx)}, index=idx)
+    lib.write("test", first, staged=True)
+
+    idx = [pd.Timestamp(1), pd.Timestamp(3)]
+    second = pd.DataFrame({"a": len(idx)}, index=idx)
+    lib.write("test", second, staged=True)
+
+    with pytest.raises(UnsortedDataException):
+        lib.finalize_staged_data("test")
+
+
+def test_chunks_match_at_ends(lmdb_storage, lib_name):
+    """Given - we stage chunks that match at the ends
+
+    When - We finalize the staged segments
+
+    Then - Should be OK to finalize
+    """
+    lib: Library = lmdb_storage.create_arctic().create_library(
+        lib_name,
+        library_options=LibraryOptions(rows_per_segment=2))
+
+    first_idx = [pd.Timestamp(0), pd.Timestamp(1), pd.Timestamp(2)]
+    first = pd.DataFrame({"a": np.arange(3)}, index=first_idx)
+    lib.write("test", first, staged=True)
+
+    second_idx = [pd.Timestamp(2), pd.Timestamp(2), pd.Timestamp(2), pd.Timestamp(3)]
+    second = pd.DataFrame({"a": np.arange(3, 7)}, index=second_idx)
+    lib.write("test", second, staged=True)
+
+    lib.finalize_staged_data("test")
+
+    result = lib.read("test").data
+    index_result = result.index
+    assert index_result.equals(pd.Index(first_idx + second_idx))
+    assert result.index.is_monotonic_increasing
+    # There is some non-determinism about where the overlap will end up
+    assert set(result["a"].values) == set(range(7))
+    assert result["a"][0] == 0
+    assert result["a"][-1] == 6
+
+
+def test_chunks_the_same(lmdb_storage, lib_name):
+    """Given - we stage chunks with indexes:
+
+    b:test:0:0xc7ad4135da54cd6e@1739968588832977666[1000,2001]
+    b:test:0:0x68d8759aba38bcf0@1739968588832775570[1000,1001]
+    b:test:0:0x68d8759aba38bcf0@1739968588832621000[1000,1001]
+
+    When - We finalize the staged segments
+
+    Then - We should succeed even though the segments seem to be identical, since they are just covering a duplicated
+    index value
+    """
+    lib: Library = lmdb_storage.create_arctic().create_library(
+        lib_name,
+        library_options=LibraryOptions(rows_per_segment=2))
+
+    idx = [
+        pd.Timestamp(1000),
+        pd.Timestamp(1000),
+        pd.Timestamp(1000),
+        pd.Timestamp(1000),
+        pd.Timestamp(1000),
+        pd.Timestamp(2000),
+    ]
+
+    data = pd.DataFrame({"a": len(idx)}, index=idx)
+    lib.write("test", data, staged=True)
+
+    lt = lib._nvs.library_tool()
+    append_keys = lt.find_keys_for_id(KeyType.APPEND_DATA, "test")
+    assert len(append_keys) == 3
+    assert sorted([key.start_index for key in append_keys]) == [1000, 1000, 1000]
+    assert sorted([key.end_index for key in append_keys]) == [1001, 1001, 2001]
+
+    lib.finalize_staged_data("test")
+
+    df = lib.read("test").data
+    assert_frame_equal(df, data)
+    assert df.index.is_monotonic_increasing
+
+
+def test_staging_in_chunks_default_settings(lmdb_storage, lib_name):
+    lib: Library = lmdb_storage.create_arctic().create_library(lib_name)
+    idx = pd.date_range(pd.Timestamp(0), periods=int(31e5), freq="us")
+
+    data = pd.DataFrame({"a": len(idx)}, index=idx)
+    lib.write("test", data, staged=True)
+
+    lt = lib._nvs.library_tool()
+    append_keys = lt.find_keys_for_id(KeyType.APPEND_DATA, "test")
+    assert len(append_keys) == 31
+    lib.finalize_staged_data("test")
+
+    df = lib.read("test").data
+    assert_frame_equal(df, data)
+    assert df.index.is_monotonic_increasing
