|
27 | 27 | )
|
28 | 28 | from arcticdb.util._versions import IS_PANDAS_TWO
|
29 | 29 | from arcticdb.version_store.library import Library
|
| 30 | +from arcticdb_ext.exceptions import UnsortedDataException |
30 | 31 | from arcticdb_ext.storage import KeyType
|
31 | 32 |
|
32 |
| -from arcticdb import util |
| 33 | +from arcticdb import util, LibraryOptions |
33 | 34 |
|
34 | 35 | from arcticdb.util.test import config_context_multi
|
35 | 36 |
|
@@ -1437,3 +1438,152 @@ def test_writing_wide_segment_over_sliced_data(
|
1437 | 1438 | lib.compact_incomplete("sym", False, False)
|
1438 | 1439 |
|
1439 | 1440 | assert_frame_equal(lib.read("sym").data, df_1)
|
| 1441 | + |
| 1442 | + |
def test_chunks_overlap(lmdb_storage, lib_name):
    """Given - we stage chunks with indexes:

    b:test:0:0xdfde242de44bdf38@1739968386409923711[0,1001]
    b:test:0:0x95750a82cfa088df@1739968386410180283[1000,1001]

    When - We finalize the staged segments

    Then - We should succeed even though the segments seem to overlap by 1ns because the end time in the key is 1
    greater than the last index value in the segment
    """
    arctic = lmdb_storage.create_arctic()
    lib: Library = arctic.create_library(lib_name, library_options=LibraryOptions(rows_per_segment=2))

    # One row at t=0 followed by three duplicates at t=1000ns.
    idx = [pd.Timestamp(0)] + [pd.Timestamp(1000)] * 3
    data = pd.DataFrame({"a": len(idx)}, index=idx)
    lib.write("test", data, staged=True)

    append_keys = lib._nvs.library_tool().find_keys_for_id(KeyType.APPEND_DATA, "test")
    assert len(append_keys) == 2
    assert sorted(key.start_index for key in append_keys) == [0, 1000]
    # Both keys end at 1001 (one past the last index value), so order is irrelevant here.
    assert [key.end_index for key in append_keys] == [1001, 1001]

    lib.finalize_staged_data("test")

    assert_frame_equal(lib.read("test").data, data)
| 1478 | + |
| 1479 | + |
def test_chunks_overlap_1ns(lmdb_storage, lib_name):
    """Given - we stage chunks that overlap by 1ns

    When - We finalize the staged segments

    Then - We should raise a validation error
    """
    lib: Library = lmdb_storage.create_arctic().create_library(
        lib_name, library_options=LibraryOptions(rows_per_segment=2)
    )

    # First chunk covers [0, 2]; second starts at 1, inside the first's range.
    for stamps in ([0, 1, 2], [1, 3]):
        idx = [pd.Timestamp(t) for t in stamps]
        chunk = pd.DataFrame({"a": len(idx)}, index=idx)
        lib.write("test", chunk, staged=True)

    with pytest.raises(UnsortedDataException):
        lib.finalize_staged_data("test")
| 1501 | + |
| 1502 | + |
def test_chunks_match_at_ends(lmdb_storage, lib_name):
    """Given - we stage chunks that match at the ends

    When - We finalize the staged segments

    Then - Should be OK to finalize
    """
    lib: Library = lmdb_storage.create_arctic().create_library(
        lib_name,
        library_options=LibraryOptions(rows_per_segment=2))

    first_idx = [pd.Timestamp(0), pd.Timestamp(1), pd.Timestamp(2)]
    first = pd.DataFrame({"a": np.arange(3)}, index=first_idx)
    lib.write("test", first, staged=True)

    # Second chunk begins exactly on the first chunk's last timestamp.
    second_idx = [pd.Timestamp(2), pd.Timestamp(2), pd.Timestamp(2), pd.Timestamp(3)]
    second = pd.DataFrame({"a": np.arange(3, 7)}, index=second_idx)
    lib.write("test", second, staged=True)

    lib.finalize_staged_data("test")

    result = lib.read("test").data
    index_result = result.index
    assert index_result.equals(pd.Index(first_idx + second_idx))
    assert result.index.is_monotonic_increasing
    # There is some non-determinism about where the overlap will end up
    assert set(result["a"].values) == set(range(7))
    # Use .iloc for positional access: integer [] lookup on a Series backed by a
    # DatetimeIndex is deprecated in pandas 2.x and removed in later versions, so
    # result["a"][0] / result["a"][-1] would warn or raise.
    assert result["a"].iloc[0] == 0
    assert result["a"].iloc[-1] == 6
| 1532 | + |
| 1533 | + |
def test_chunks_the_same(lmdb_storage, lib_name):
    """Given - we stage chunks with indexes:

    b:test:0:0xc7ad4135da54cd6e@1739968588832977666[1000,2001]
    b:test:0:0x68d8759aba38bcf0@1739968588832775570[1000,1001]
    b:test:0:0x68d8759aba38bcf0@1739968588832621000[1000,1001]

    When - We finalize the staged segments

    Then - We should succeed even though the segments seem to be identical, since they are just covering a duplicated
    index value
    """
    lib: Library = lmdb_storage.create_arctic().create_library(
        lib_name, library_options=LibraryOptions(rows_per_segment=2)
    )

    # Five duplicated rows at t=1000ns, then one at t=2000ns.
    idx = [pd.Timestamp(1000)] * 5 + [pd.Timestamp(2000)]
    data = pd.DataFrame({"a": len(idx)}, index=idx)
    lib.write("test", data, staged=True)

    append_keys = lib._nvs.library_tool().find_keys_for_id(KeyType.APPEND_DATA, "test")
    assert len(append_keys) == 3
    assert sorted(key.start_index for key in append_keys) == [1000] * 3
    assert sorted(key.end_index for key in append_keys) == [1001, 1001, 2001]

    lib.finalize_staged_data("test")

    df = lib.read("test").data
    assert_frame_equal(df, data)
    assert df.index.is_monotonic_increasing
| 1573 | + |
| 1574 | + |
def test_staging_in_chunks_default_settings(lmdb_storage, lib_name):
    """A large staged write with default library options should be sliced into
    31 append-data segments (3.1M rows; the count implies the default row slice
    size) and finalize back to the original, sorted frame."""
    lib: Library = lmdb_storage.create_arctic().create_library(lib_name)

    idx = pd.date_range(pd.Timestamp(0), periods=int(31e5), freq="us")
    data = pd.DataFrame({"a": len(idx)}, index=idx)
    lib.write("test", data, staged=True)

    staged_keys = lib._nvs.library_tool().find_keys_for_id(KeyType.APPEND_DATA, "test")
    assert len(staged_keys) == 31

    lib.finalize_staged_data("test")

    df = lib.read("test").data
    assert_frame_equal(df, data)
    assert df.index.is_monotonic_increasing
0 commit comments