Commit e28c5f1
5.2.4: Fix off-by-one errors in staged segment validation (#2191) (#2192)

Our validation when finalizing is currently too strict. The index start and end on an APPEND_DATA key are [start_time, end_time + 1] for the data contained within its segment, but our validation logic does not always subtract one from the key's end time to recover the true date range of the segment. A similar change is needed when detecting duplicate staged segments: if two segments that both cover a single duplicated index value are staged, we should allow the write.
1 parent 370e96e commit e28c5f1
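To make the half-open key convention concrete, here is a minimal Python sketch of the corrected checks. It models each APPEND_DATA key as a (start_time, end_time) pair with end_time one past the last index value in the segment. The `can_finalize` helper is hypothetical, written for illustration only, and is not ArcticDB code:

def can_finalize(keys):
    """keys: (start_time, end_time) per staged segment, end_time exclusive."""
    unique_ranges = set()
    for start, end in keys:
        if (start, end) in unique_ranges and end - 1 != start:
            # An identical range is only acceptable when the segment spans a
            # single index value (end - 1 == start), i.e. a duplicated timestamp
            return False
        unique_ranges.add((start, end))
    ranges = sorted(unique_ranges)
    for (_, end1), (start2, _) in zip(ranges, ranges[1:]):
        # end1 - 1 is the true last index value of the earlier segment, so
        # segments may share a boundary timestamp without counting as an overlap
        if start2 < end1 - 1:
            return False
    return True

assert can_finalize([(0, 1001), (1000, 1001)])     # touch at 1000: allowed
assert can_finalize([(1000, 1001), (1000, 1001)])  # duplicated single value: allowed
assert not can_finalize([(0, 3), (1, 4)])          # genuine 1ns overlap: rejected

The two asserts that pass correspond to the key ranges exercised in test_chunks_overlap and test_chunks_the_same in the test diff below.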

File tree

2 files changed (+156, -3 lines)

cpp/arcticdb/version/version_core.cpp (+5, -2)

@@ -1180,7 +1180,9 @@ void check_incompletes_index_ranges_dont_overlap(const std::shared_ptr<PipelineC
         auto [_, inserted] = unique_timestamp_ranges.emplace(key.start_time(), key.end_time());
         // This is correct because incomplete segments aren't column sliced
         sorting::check<ErrorCode::E_UNSORTED_DATA>(
-                inserted,
+                // If the segment is entirely covering a single index value, then duplicates are fine
+                // -1 as end_time is stored as 1 greater than the last index value in the segment
+                inserted || key.end_time() - 1 == key.start_time(),
                 "Cannot finalize staged data as 2 or more incomplete segments cover identical index values (in UTC): ({}, {})",
                 date_and_time(key.start_time()), date_and_time(key.end_time()));
     }
@@ -1189,7 +1191,8 @@ void check_incompletes_index_ranges_dont_overlap(const std::shared_ptr<PipelineC
         auto next_it = std::next(it);
         if (next_it != unique_timestamp_ranges.end()) {
             sorting::check<ErrorCode::E_UNSORTED_DATA>(
-                    next_it->first >= it->second,
+                    // -1 as end_time is stored as 1 greater than the last index value in the segment
+                    next_it->first >= it->second - 1,
                 "Cannot finalize staged data as incomplete segment index values overlap one another (in UTC): ({}, {}) intersects ({}, {})",
                 date_and_time(it->first),
                 date_and_time(it->second - 1),
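Worked example (using the key ranges from the first test below): with staged keys covering index ranges [0,1001] and [1000,1001], the second segment's first index value is 1000 and the first segment's true last index value is 1001 - 1 = 1000, so `next_it->first >= it->second - 1` holds and finalization proceeds; the segments share the duplicated timestamp 1000 rather than genuinely overlapping.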

python/tests/unit/arcticdb/version_store/test_parallel.py (+151, -1)

@@ -27,9 +27,10 @@
 )
 from arcticdb.util._versions import IS_PANDAS_TWO
 from arcticdb.version_store.library import Library
+from arcticdb_ext.exceptions import UnsortedDataException
 from arcticdb_ext.storage import KeyType

-from arcticdb import util
+from arcticdb import util, LibraryOptions

 from arcticdb.util.test import config_context_multi

@@ -1437,3 +1438,152 @@ def test_writing_wide_segment_over_sliced_data(
     lib.compact_incomplete("sym", False, False)

     assert_frame_equal(lib.read("sym").data, df_1)
+
+
+def test_chunks_overlap(lmdb_storage, lib_name):
+    """Given - we stage chunks with indexes:
+
+    b:test:0:0xdfde242de44bdf38@1739968386409923711[0,1001]
+    b:test:0:0x95750a82cfa088df@1739968386410180283[1000,1001]
+
+    When - We finalize the staged segments
+
+    Then - We should succeed even though the segments seem to overlap by 1ns, because the end time in the key is 1
+    greater than the last index value in the segment
+    """
+    lib: Library = lmdb_storage.create_arctic().create_library(
+        lib_name,
+        library_options=LibraryOptions(rows_per_segment=2))
+
+    idx = [
+        pd.Timestamp(0),
+        pd.Timestamp(1000),
+        pd.Timestamp(1000),
+        pd.Timestamp(1000),
+    ]
+
+    data = pd.DataFrame({"a": len(idx)}, index=idx)
+    lib.write("test", data, staged=True)
+
+    lt = lib._nvs.library_tool()
+    append_keys = lt.find_keys_for_id(KeyType.APPEND_DATA, "test")
+    assert len(append_keys) == 2
+    assert sorted([key.start_index for key in append_keys]) == [0, 1000]
+    assert [key.end_index for key in append_keys] == [1001, 1001]
+
+    lib.finalize_staged_data("test")
+
+    df = lib.read("test").data
+    assert_frame_equal(df, data)
+
+
+def test_chunks_overlap_1ns(lmdb_storage, lib_name):
+    """Given - we stage chunks that overlap by 1ns
+
+    When - We finalize the staged segments
+
+    Then - We should raise a validation error
+    """
+    lib: Library = lmdb_storage.create_arctic().create_library(
+        lib_name,
+        library_options=LibraryOptions(rows_per_segment=2))
+
+    idx = [pd.Timestamp(0), pd.Timestamp(1), pd.Timestamp(2)]
+    first = pd.DataFrame({"a": len(idx)}, index=idx)
+    lib.write("test", first, staged=True)
+
+    idx = [pd.Timestamp(1), pd.Timestamp(3)]
+    second = pd.DataFrame({"a": len(idx)}, index=idx)
+    lib.write("test", second, staged=True)
+
+    with pytest.raises(UnsortedDataException):
+        lib.finalize_staged_data("test")
+
+
+def test_chunks_match_at_ends(lmdb_storage, lib_name):
+    """Given - we stage chunks that match at the ends
+
+    When - We finalize the staged segments
+
+    Then - Should be OK to finalize
+    """
+    lib: Library = lmdb_storage.create_arctic().create_library(
+        lib_name,
+        library_options=LibraryOptions(rows_per_segment=2))
+
+    first_idx = [pd.Timestamp(0), pd.Timestamp(1), pd.Timestamp(2)]
+    first = pd.DataFrame({"a": np.arange(3)}, index=first_idx)
+    lib.write("test", first, staged=True)
+
+    second_idx = [pd.Timestamp(2), pd.Timestamp(2), pd.Timestamp(2), pd.Timestamp(3)]
+    second = pd.DataFrame({"a": np.arange(3, 7)}, index=second_idx)
+    lib.write("test", second, staged=True)
+
+    lib.finalize_staged_data("test")
+
+    result = lib.read("test").data
+    index_result = result.index
+    assert index_result.equals(pd.Index(first_idx + second_idx))
+    assert result.index.is_monotonic_increasing
+    # There is some non-determinism about where the overlap will end up
+    assert set(result["a"].values) == set(range(7))
+    assert result["a"][0] == 0
+    assert result["a"][-1] == 6
+
+
+def test_chunks_the_same(lmdb_storage, lib_name):
+    """Given - we stage chunks with indexes:
+
+    b:test:0:0xc7ad4135da54cd6e@1739968588832977666[1000,2001]
+    b:test:0:0x68d8759aba38bcf0@1739968588832775570[1000,1001]
+    b:test:0:0x68d8759aba38bcf0@1739968588832621000[1000,1001]
+
+    When - We finalize the staged segments
+
+    Then - We should succeed even though the segments seem to be identical, since they are just covering a duplicated
+    index value
+    """
+    lib: Library = lmdb_storage.create_arctic().create_library(
+        lib_name,
+        library_options=LibraryOptions(rows_per_segment=2))
+
+    idx = [
+        pd.Timestamp(1000),
+        pd.Timestamp(1000),
+        pd.Timestamp(1000),
+        pd.Timestamp(1000),
+        pd.Timestamp(1000),
+        pd.Timestamp(2000),
+    ]
+
+    data = pd.DataFrame({"a": len(idx)}, index=idx)
+    lib.write("test", data, staged=True)
+
+    lt = lib._nvs.library_tool()
+    append_keys = lt.find_keys_for_id(KeyType.APPEND_DATA, "test")
+    assert len(append_keys) == 3
+    assert sorted([key.start_index for key in append_keys]) == [1000, 1000, 1000]
+    assert sorted([key.end_index for key in append_keys]) == [1001, 1001, 2001]
+
+    lib.finalize_staged_data("test")
+
+    df = lib.read("test").data
+    assert_frame_equal(df, data)
+    assert df.index.is_monotonic_increasing
+
+
+def test_staging_in_chunks_default_settings(lmdb_storage, lib_name):
+    lib: Library = lmdb_storage.create_arctic().create_library(lib_name)
+    idx = pd.date_range(pd.Timestamp(0), periods=int(31e5), freq="us")
+
+    data = pd.DataFrame({"a": len(idx)}, index=idx)
+    lib.write("test", data, staged=True)
+
+    lt = lib._nvs.library_tool()
+    append_keys = lt.find_keys_for_id(KeyType.APPEND_DATA, "test")
+    assert len(append_keys) == 31
+    lib.finalize_staged_data("test")
+
+    df = lib.read("test").data
+    assert_frame_equal(df, data)
+    assert df.index.is_monotonic_increasing
