
Commit 9322077

Get object sizes based on S3's ListObjects output - implement PR review comments
1 parent 7739abc commit 9322077

File tree

7 files changed: +128 -45 lines


cpp/arcticdb/storage/s3/detail-inl.hpp

+1 -1

@@ -525,7 +525,7 @@ ObjectSizes do_calculate_sizes_for_type_impl(
     do {
         auto list_objects_result = s3_client.list_objects(path_info.key_prefix_, bucket_name, continuation_token);
         if (list_objects_result.is_success()) {
-            auto& output = list_objects_result.get_output();
+            const auto& output = list_objects_result.get_output();

            ARCTICDB_RUNTIME_DEBUG(log::storage(), "Received object list");

cpp/arcticdb/storage/s3/s3_client_impl.cpp

+1 -2

@@ -305,8 +305,7 @@ S3Result<ListObjectsOutput> S3ClientImpl::list_objects(
             s3_object_sizes.emplace_back(s3_object.GetSize());
         }

-        ListObjectsOutput output = {s3_object_names, s3_object_sizes, next_continuation_token};
-        return {output};
+        return {ListObjectsOutput{std::move(s3_object_names), std::move(s3_object_sizes), next_continuation_token}};
    }

}
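The reworked return above builds the result in place and moves the name and size vectors into it instead of copying them through a named local. A small sketch of the same pattern; ListingOutput is an assumed stand-in rather than the real ListObjectsOutput:

#include <cstdint>
#include <string>
#include <utility>
#include <vector>

// Illustrative stand-in for a ListObjects-style result; not the real ListObjectsOutput.
struct ListingOutput {
    std::vector<std::string> names;
    std::vector<uint64_t> sizes;
    std::string continuation_token;
};

ListingOutput make_listing() {
    std::vector<std::string> names{"key/a", "key/b"};
    std::vector<uint64_t> sizes{128, 256};
    std::string token = "next";
    // Moving the vectors hands their buffers to the returned struct; copying them
    // into a named local first (the pre-review version) duplicates every element.
    return ListingOutput{std::move(names), std::move(sizes), token};
}

int main() {
    auto out = make_listing();
    return out.names.size() == out.sizes.size() ? 0 : 1;
}

For large listings this avoids one full copy of every object name and size per page.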

cpp/arcticdb/storage/storage.hpp

+3 -3

@@ -137,7 +137,7 @@ class Storage {
             visitor(std::move(k));
             return false; // keep applying the visitor no matter what
         };
-        do_iterate_type_until_match(key_type, predicate_visitor, prefix);
+        do_iterate_type_until_match(key_type, predicate_visitor, prefix);
    }

    [[nodiscard]] virtual bool supports_object_size_calculation() const {

@@ -259,9 +259,9 @@ template<> struct formatter<ObjectSizes> {
     constexpr auto parse(ParseContext &ctx) { return ctx.begin(); }

     template<typename FormatContext>
-    auto format(const ObjectSizes &srv, FormatContext &ctx) const {
+    auto format(const ObjectSizes &sizes, FormatContext &ctx) const {
         return fmt::format_to(ctx.out(), "ObjectSizes key_type[{}] count[{}] compressed_size_bytes[{}]",
-            srv.key_type_, srv.count_, srv.compressed_size_bytes_);
+            sizes.key_type_, sizes.count_, sizes.compressed_size_bytes_);
    }
};
}

cpp/arcticdb/storage/storages.hpp

+6 -1

@@ -188,7 +188,12 @@ class Storages {
         }
    }

-    ObjectSizes get_object_sizes(KeyType key_type, const std::string& prefix) {
+    ObjectSizes get_object_sizes(KeyType key_type, const std::string& prefix, bool primary_only = true) {
+        if (primary_only) {
+            auto storage_sizes = primary().get_object_sizes(key_type, prefix);
+            return {key_type, storage_sizes.count_, storage_sizes.compressed_size_bytes_};
+        }
+
         ObjectSizes res{key_type, 0, 0};
         for (const auto& storage : storages_) {
             auto storage_sizes = storage->get_object_sizes(key_type, prefix);
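The new primary_only flag (defaulting to true) short-circuits to the primary storage and only falls through to summing sizes across every backing storage when explicitly asked. A stripped-down sketch of that control flow; Sources, SizeSource and Sizes are hypothetical stand-ins for Storages, Storage and ObjectSizes:

#include <cstdint>
#include <memory>
#include <string>
#include <vector>

// Hypothetical stand-ins for ArcticDB's ObjectSizes and Storage.
struct Sizes { uint64_t count = 0; uint64_t bytes = 0; };

struct SizeSource {
    Sizes stored;
    Sizes get_object_sizes(const std::string& /*prefix*/) const { return stored; }
};

struct Sources {
    std::vector<std::shared_ptr<SizeSource>> sources;

    SizeSource& primary() { return *sources.front(); }

    Sizes get_object_sizes(const std::string& prefix, bool primary_only = true) {
        if (primary_only) {
            // Fast path: only the primary storage is consulted.
            return primary().get_object_sizes(prefix);
        }
        // Otherwise accumulate counts and byte totals across every storage.
        Sizes res;
        for (const auto& source : sources) {
            auto s = source->get_object_sizes(prefix);
            res.count += s.count;
            res.bytes += s.bytes;
        }
        return res;
    }
};

int main() {
    Sources all{{std::make_shared<SizeSource>(SizeSource{{2, 100}}),
                 std::make_shared<SizeSource>(SizeSource{{3, 900}})}};
    auto primary_only = all.get_object_sizes("prefix");       // sizes from the primary only
    auto everything = all.get_object_sizes("prefix", false);  // sizes summed across all sources
    return (primary_only.count == 2 && everything.bytes == 1000) ? 0 : 1;
}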

cpp/arcticdb/version/local_versioned_engine.cpp

+1 -2

@@ -1708,8 +1708,7 @@ std::vector<storage::ObjectSizes> LocalVersionedEngine::scan_object_sizes() {
         sizes.push_back(store->get_object_sizes(key_type, ""));
    });

-    folly::QueuedImmediateExecutor inline_executor;
-    return folly::collect(sizes_futs).via(&inline_executor).get();
+    return folly::collect(sizes_futs).via(&async::cpu_executor()).get();
}

std::unordered_map<StreamId, std::unordered_map<KeyType, KeySizesInfo>> LocalVersionedEngine::scan_object_sizes_by_stream() {
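Here the per-call folly::QueuedImmediateExecutor, which runs continuations inline on the calling thread, is replaced by the library's shared CPU executor. A rough folly sketch of collecting futures via a pool executor; folly::CPUThreadPoolExecutor stands in for the ArcticDB-internal async::cpu_executor():

#include <folly/executors/CPUThreadPoolExecutor.h>
#include <folly/futures/Future.h>

#include <utility>
#include <vector>

int main() {
    // Stand-in for ArcticDB's shared CPU pool (async::cpu_executor() in the real code).
    folly::CPUThreadPoolExecutor cpu_pool(4);

    // One future per unit of work, mirroring the per-key-type size futures.
    std::vector<folly::Future<int>> futs;
    for (int i = 0; i < 8; ++i) {
        futs.push_back(folly::via(&cpu_pool, [i] { return i * i; }));
    }

    // Collect on the pool instead of a QueuedImmediateExecutor so the combined
    // continuation does not run inline on the thread that calls get().
    auto results = folly::collect(std::move(futs)).via(&cpu_pool).get();
    return results.size() == 8 ? 0 : 1;
}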

python/tests/integration/arcticdb/version_store/test_symbol_sizes.py

+78 -2

@@ -1,8 +1,11 @@
 from multiprocessing import Queue, Process

 import pytest
-from arcticdb.util.test import sample_dataframe
+from arcticdb import LibraryOptions
+from arcticdb.encoding_version import EncodingVersion
+from arcticdb.util.test import sample_dataframe, config_context_multi
 from arcticdb_ext.storage import KeyType
+import arcticdb_ext.cpp_async as adb_async


 def test_symbol_sizes(basic_store):

@@ -105,6 +108,79 @@ def test_scan_object_sizes(arctic_client, lib_name):
     assert 500 < res[KeyType.VERSION_REF][1] < 1500


+@pytest.mark.parametrize("storage, encoding_version_, num_io_threads, num_cpu_threads", [
+    ("s3", EncodingVersion.V1, 1, 1),
+    ("s3", EncodingVersion.V1, 10, 1),
+    ("s3", EncodingVersion.V1, 1, 10),
+])
+def test_scan_object_sizes_threading(request, storage, encoding_version_, lib_name, num_io_threads, num_cpu_threads):
+    """Some stress testing for scan_object_sizes, particularly against deadlocks. Use a small segment size so that
+    there is some work to be done in parallel."""
+    storage_fixture = request.getfixturevalue(storage + "_storage")
+    arctic_client = storage_fixture.create_arctic(encoding_version=encoding_version_)
+    try:
+        with config_context_multi({"VersionStore.NumIOThreads": num_io_threads, "VersionStore.NumCPUThreads": num_cpu_threads}):
+            adb_async.reinit_task_scheduler()
+            if num_io_threads:
+                assert adb_async.io_thread_count() == num_io_threads
+            if num_cpu_threads:
+                assert adb_async.cpu_thread_count() == num_cpu_threads
+
+            lib = arctic_client.create_library(lib_name, library_options=LibraryOptions(rows_per_segment=5))
+            basic_store = lib._nvs
+
+            df = sample_dataframe(100)
+            basic_store.write("sym", df)
+            basic_store.write("sym", df)
+
+            sizes = basic_store.version_store.scan_object_sizes()
+
+            res = dict()
+            for s in sizes:
+                res[s.key_type] = (s.count, s.compressed_size_bytes)
+
+            assert KeyType.VERSION in res
+            assert KeyType.TABLE_INDEX in res
+            assert KeyType.TABLE_DATA in res
+            assert KeyType.VERSION_REF in res
+    finally:
+        adb_async.reinit_task_scheduler()
+
+
+@pytest.mark.parametrize("storage, encoding_version_, num_io_threads, num_cpu_threads", [
+    ("s3", EncodingVersion.V1, 1, 1),
+    ("s3", EncodingVersion.V1, 10, 1),
+    ("s3", EncodingVersion.V1, 1, 10),
+])
+def test_scan_object_sizes_by_stream_threading(request, storage, encoding_version_, lib_name, num_io_threads, num_cpu_threads):
+    """Some stress testing for scan_object_sizes, particularly against deadlocks. Use a small segment size so that
+    there is some work to be done in parallel."""
+    storage_fixture = request.getfixturevalue(storage + "_storage")
+    arctic_client = storage_fixture.create_arctic(encoding_version=encoding_version_)
+    try:
+        with config_context_multi({"VersionStore.NumIOThreads": num_io_threads, "VersionStore.NumCPUThreads": num_cpu_threads}):
+            adb_async.reinit_task_scheduler()
+            if num_io_threads:
+                assert adb_async.io_thread_count() == num_io_threads
+            if num_cpu_threads:
+                assert adb_async.cpu_thread_count() == num_cpu_threads
+
+            lib = arctic_client.create_library(lib_name, library_options=LibraryOptions(rows_per_segment=5))
+            basic_store = lib._nvs
+
+            df = sample_dataframe(100)
+            basic_store.write("sym", df)
+            basic_store.write("sym", df)
+
+            sizes = basic_store.version_store.scan_object_sizes_by_stream()
+
+            assert sizes["sym"][KeyType.VERSION].compressed_size < 2000
+            assert sizes["sym"][KeyType.TABLE_INDEX].compressed_size < 5000
+            assert sizes["sym"][KeyType.TABLE_DATA].compressed_size < 50_000
+    finally:
+        adb_async.reinit_task_scheduler()
+
+
 @pytest.fixture
 def reader_store(basic_store):
     return basic_store

@@ -141,7 +217,7 @@ def test_symbol_sizes_concurrent(reader_store, writer_store):
     try:
         reader.start()
         writer.start()
-        reader.join(1)
+        reader.join(2)
         writer.join(0.001)
     finally:
         writer.terminate()

python/tests/unit/arcticdb/version_store/test_parallel.py

+38 -34

@@ -113,44 +113,48 @@ def test_remove_incomplete(basic_store):
 @pytest.mark.parametrize("num_segments_live_during_compaction, num_io_threads, num_cpu_threads", [
     (1, 1, 1),
     (10, 1, 1),
+    (1, 10, 1),
     (None, None, None)
 ])
 def test_parallel_write(basic_store_tiny_segment, num_segments_live_during_compaction, num_io_threads, num_cpu_threads):
-    with config_context_multi({"VersionStore.NumSegmentsLiveDuringCompaction": num_segments_live_during_compaction,
-                               "VersionStore.NumIOThreads": num_io_threads,
-                               "VersionStore.NumCPUThreads": num_cpu_threads}):
+    try:
+        with config_context_multi({"VersionStore.NumSegmentsLiveDuringCompaction": num_segments_live_during_compaction,
+                                   "VersionStore.NumIOThreads": num_io_threads,
+                                   "VersionStore.NumCPUThreads": num_cpu_threads}):
+            adb_async.reinit_task_scheduler()
+            if num_io_threads:
+                assert adb_async.io_thread_count() == num_io_threads
+            if num_cpu_threads:
+                assert adb_async.cpu_thread_count() == num_cpu_threads
+
+            store = basic_store_tiny_segment
+            sym = "parallel"
+            store.remove_incomplete(sym)
+
+            num_rows = 1111
+            dtidx = pd.date_range("1970-01-01", periods=num_rows)
+            test = pd.DataFrame(
+                {
+                    "uint8": random_integers(num_rows, np.uint8),
+                    "uint32": random_integers(num_rows, np.uint32),
+                },
+                index=dtidx,
+            )
+            chunk_size = 100
+            list_df = [test[i : i + chunk_size] for i in range(0, test.shape[0], chunk_size)]
+            random.shuffle(list_df)
+
+            for df in list_df:
+                store.write(sym, df, parallel=True)
+
+            user_meta = {"thing": 7}
+            store.compact_incomplete(sym, False, False, metadata=user_meta)
+            vit = store.read(sym)
+            assert_frame_equal(test, vit.data)
+            assert vit.metadata["thing"] == 7
+            assert len(get_append_keys(store, sym)) == 0
+    finally:
         adb_async.reinit_task_scheduler()
-        if num_io_threads:
-            assert adb_async.io_thread_count() == num_io_threads
-        if num_cpu_threads:
-            assert adb_async.cpu_thread_count() == num_cpu_threads
-
-        store = basic_store_tiny_segment
-        sym = "parallel"
-        store.remove_incomplete(sym)
-
-        num_rows = 1111
-        dtidx = pd.date_range("1970-01-01", periods=num_rows)
-        test = pd.DataFrame(
-            {
-                "uint8": random_integers(num_rows, np.uint8),
-                "uint32": random_integers(num_rows, np.uint32),
-            },
-            index=dtidx,
-        )
-        chunk_size = 100
-        list_df = [test[i : i + chunk_size] for i in range(0, test.shape[0], chunk_size)]
-        random.shuffle(list_df)
-
-        for df in list_df:
-            store.write(sym, df, parallel=True)
-
-        user_meta = {"thing": 7}
-        store.compact_incomplete(sym, False, False, metadata=user_meta)
-        vit = store.read(sym)
-        assert_frame_equal(test, vit.data)
-        assert vit.metadata["thing"] == 7
-        assert len(get_append_keys(store, sym)) == 0


 @pytest.mark.parametrize("index, expect_ordered", [
