 logger = logging.getLogger("Memory_tests")

 ## IMPORTANT !!!
-##
+##
 ## All memory tests MUST be done with fixtures that return Library object
-## and not NativeVersionStore. Reason is that the last is thick wrapper which
-## is hiding some possible problems, therefore all tests have to be done with what
+## and not NativeVersionStore. Reason is that the last is thick wrapper which
+## is hiding some possible problems, therefore all tests have to be done with what
 ## customer works on

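The rule above is what the `lmdb_library` fixture further down in this diff implements. As a minimal standalone sketch of the same idea, returning the customer-facing `Library` rather than `NativeVersionStore` (the LMDB URI and library name here are invented for illustration and are not taken from this diff):

import pytest
from arcticdb import Arctic
from arcticdb.version_store.library import Library


@pytest.fixture
def memtest_library(tmp_path) -> Library:
    # Hand the test the customer-facing Library object, not NativeVersionStore.
    ac = Arctic(f"lmdb://{tmp_path}")
    lib = ac.create_library("memtest_lib")
    yield lib
    ac.delete_library("memtest_lib")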
@@ -362,7 +362,7 @@ def proc_to_examine():
     run the test from command line again to assure it runs ok before commit

     """
-    max_mem_bytes = 295_623_040
+    max_mem_bytes = 340_623_040

     check_process_memory_leaks(proc_to_examine, 20, max_mem_bytes, 80.0)

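`check_process_memory_leaks` is the test suite's own helper, and the exact meaning of its `20` and `80.0` arguments is not visible in this diff; the hunk above only raises the absolute allowance from 295,623,040 to 340,623,040 bytes. The general idea such a check implements (run a workload in a child process repeatedly and fail if resident memory exceeds a byte budget like `max_mem_bytes` or a share of system memory) can be sketched roughly as follows with `psutil`; this is an illustration, not the project's implementation:

import multiprocessing as mp

import psutil


def _run_and_report_rss(workload, result_queue):
    workload()
    # Resident set size of this child process after the workload finished.
    result_queue.put(psutil.Process().memory_info().rss)


def check_rss_budget(workload, repetitions, max_mem_bytes, max_percent_of_total):
    total = psutil.virtual_memory().total
    for _ in range(repetitions):
        queue = mp.Queue()
        child = mp.Process(target=_run_and_report_rss, args=(workload, queue))
        child.start()
        rss = queue.get()
        child.join()
        assert rss <= max_mem_bytes, f"child used {rss} bytes, budget is {max_mem_bytes}"
        assert 100.0 * rss / total <= max_percent_of_total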
@@ -609,7 +609,7 @@ def is_relevant(stack: Stack) -> bool:

         if "folly::CPUThreadPoolExecutor::CPUTask" in frame_info_str:
             logger.warning(f"Frame excluded : {frame_info_str}")
-            logger.warning(f''' Explanation : These are on purpose, and they come from the interaction of
+            logger.warning(f""" Explanation : These are on purpose, and they come from the interaction of
                 multi-threading and forking. When Python forks, the task-scheduler has a linked-list
                 of tasks to execute, but there is a global lock held that protects the thread-local state.
                 We can't free the list without accessing the global thread-local storage singleton,
@@ -626,9 +626,9 @@ def is_relevant(stack: Stack) -> bool:
                 find something better

                 Great that it is catching this, as it's the one case in the whole project where I know
-                for certain that it does leak memory (and only because there's no alternative''')
+                for certain that it does leak memory (and only because there's no alternative""")
             return False
-
+
             pass
     return True

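The warning above documents why frames containing `folly::CPUThreadPoolExecutor::CPUTask` are excluded by `is_relevant`. As a hedged sketch of how such a pytest-memray `filter_fn` is typically structured (the `Stack`/frame attribute names follow pytest-memray's documented types; the exclusion list is illustrative, not the project's full list):

from pytest_memray import Stack

# Allocation sites documented as acceptable leaks (see the explanation above).
KNOWN_BENIGN_MARKERS = ("folly::CPUThreadPoolExecutor::CPUTask",)


def is_relevant_sketch(stack: Stack) -> bool:
    for frame in stack.frames:
        frame_info_str = f"{frame.filename}:{frame.function}:[{frame.lineno}]"
        if any(marker in frame_info_str for marker in KNOWN_BENIGN_MARKERS):
            # Do not count this allocation towards the leak limit.
            return False
    return True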
@@ -776,7 +776,7 @@ def test_mem_leak_read_all_arctic_lib_memray(library_with_big_symbol_):
 def lmdb_library(lmdb_storage, lib_name, request) -> Generator[Library, None, None]:
     """
     Allows passing library creation parameters as parameters of the test or other fixture.
-    Example:
+    Example:


         @pytest.mark.parametrize("lmdb_library_any", [
@@ -785,12 +785,11 @@ def lmdb_library(lmdb_storage, lib_name, request) -> Generator[Library, None, No
         def test_my_test(lmdb_library_any):
             .....
     """
-    params = request.param if hasattr(request, 'param') else {}
+    params = request.param if hasattr(request, "param") else {}
     yield lmdb_storage.create_arctic().create_library(name=lib_name, **params)

-
 @pytest.fixture
-def prepare_head_tails_symbol(lmdb_library):
+def prepare_head_tails_symbol(lmdb_library):
     """
     This fixture is part of test `test_mem_leak_head_tail_memray`

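The `lmdb_library` docstring above relies on pytest's indirect parametrization: with `indirect=True`, each parametrize value is delivered to the fixture as `request.param`, which is why the fixture falls back to `{}` when no parametrization is given. A self-contained sketch of that mechanism (fixture and test names here are invented for illustration):

import pytest


@pytest.fixture
def configured_thing(request):
    # With indirect=True, the parametrize value below arrives here as request.param.
    options = request.param if hasattr(request, "param") else {}
    return {"defaults": True, **options}


@pytest.mark.parametrize("configured_thing", [{"rows_per_segment": 233}], indirect=True)
def test_uses_configured_thing(configured_thing):
    assert configured_thing["rows_per_segment"] == 233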
@@ -801,17 +800,18 @@ def prepare_head_tails_symbol(lmdb_library):
     Should not be reused
     """
     lib: Library = lmdb_library
-    opts = lib.options()
-
+    opts = lib.options()
+
     total_number_columns = 1002
     symbol = "asdf12345"
-    num_rows_list = [279,199,1, 350,999,0, 1001]
+    num_rows_list = [279, 199, 1, 350, 999, 0, 1001]
     snapshot_names = []
     for rows in num_rows_list:
         st = time.time()
-        df = DFGenerator.generate_wide_dataframe(num_rows=rows, num_cols=total_number_columns, num_string_cols=25,
-                                                 start_time=pd.Timestamp(0),seed=64578)
-        lib.write(symbol,df)
+        df = DFGenerator.generate_wide_dataframe(
+            num_rows=rows, num_cols=total_number_columns, num_string_cols=25, start_time=pd.Timestamp(0), seed=64578
+        )
+        lib.write(symbol, df)
         snap = f"{symbol}_{rows}"
         lib.snapshot(snap)
         snapshot_names.append(snap)
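`DFGenerator.generate_wide_dataframe` is the project's test-data helper; its real output is not shown in this diff. Purely to illustrate the shape of data the fixture writes (a wide frame with mostly numeric plus some string columns over a datetime index starting at `pd.Timestamp(0)`), an equivalent frame could be built with plain pandas/numpy along these lines; column names and dtypes are assumptions:

import numpy as np
import pandas as pd


def generate_wide_dataframe_sketch(num_rows, num_cols, num_string_cols, start_time, seed):
    rng = np.random.default_rng(seed)
    data = {}
    for i in range(num_cols - num_string_cols):
        data[f"float_{i}"] = rng.random(num_rows)  # numeric columns
    for i in range(num_string_cols):
        data[f"str_{i}"] = [f"val_{n}" for n in rng.integers(0, 100, num_rows)]  # string columns
    index = pd.date_range(start=start_time, periods=num_rows, freq="s")
    return pd.DataFrame(data, index=index)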
@@ -821,20 +821,37 @@ def prepare_head_tails_symbol(lmdb_library):
         # characteristic
         total_number_columns += 20
         logger.info(f"Total number of columns increased to {total_number_columns}")
-
+
     all_columns = df.columns.to_list()
     yield (lib, symbol, num_rows_list, snapshot_names, all_columns)
     lib.delete(symbol=symbol)

-
 @MEMRAY_TESTS_MARK
 @SLOW_TESTS_MARK
 ## Linux is having quite huge location there will be separate issue to investigate why
 @pytest.mark.limit_leaks(location_limit="1000 KB" if LINUX else "52 KB", filter_fn=is_relevant)
-@pytest.mark.parametrize("lmdb_library", [
-    {'library_options': LibraryOptions(rows_per_segment=233, columns_per_segment=197, dynamic_schema=True, encoding_version=EncodingVersion.V2)},
-    {'library_options': LibraryOptions(rows_per_segment=99, columns_per_segment=99, dynamic_schema=False, encoding_version=EncodingVersion.V1)}
-    ], indirect=True)
+@pytest.mark.parametrize(
+    "lmdb_library",
+    [
+        {
+            "library_options": LibraryOptions(
+                rows_per_segment=233,
+                columns_per_segment=197,
+                dynamic_schema=True,
+                encoding_version=EncodingVersion.V2,
+            )
+        },
+        {
+            "library_options": LibraryOptions(
+                rows_per_segment=99,
+                columns_per_segment=99,
+                dynamic_schema=False,
+                encoding_version=EncodingVersion.V1,
+            )
+        },
+    ],
+    indirect=True,
+)
 def test_mem_leak_head_tail_memray(prepare_head_tails_symbol):
     """
     This test aims to test `head` and `tail` functions if they do leak memory.
@@ -845,33 +862,35 @@ def test_mem_leak_head_tail_memray(prepare_head_tails_symbol):
     symbol: str
     num_rows_list: List[int]
     store: NativeVersionStore = None
-    snapshot_names: List[str]
+    snapshot_names: List[str]
     all_columns: List[str]
     (store, symbol, num_rows_list, snapshot_names, all_columns) = prepare_head_tails_symbol
-
+
     start_test: float = time.time()
-    max_rows:int = max(num_rows_list)
+    max_rows: int = max(num_rows_list)

     np.random.seed(959034)
     # constructing a list of head and tail rows to be selected
     num_rows_to_select = []
-    important_values = [0, 1, 0-1, 2, -2, max_rows, -max_rows] # some boundary cases
+    important_values = [0, 1, 0 - 1, 2, -2, max_rows, -max_rows]  # some boundary cases
     num_rows_to_select.extend(important_values)
-    num_rows_to_select.extend(np.random.randint(low=5, high=99, size=7)) # add 7 more random values
+    num_rows_to_select.extend(np.random.randint(low=5, high=99, size=7))  # add 7 more random values
     # number of iterations will be the list length/size
     iterations = len(num_rows_to_select)
     # constructing a random list of values for snapshot names for each iteration
-    snapshots_list: List[str] = np.random.choice(snapshot_names, iterations)
+    snapshots_list: List[str] = np.random.choice(snapshot_names, iterations)
     # constructing a random list of values for versions names for each iteration
-    versions_list: List[int] = np.random.randint(0, len(num_rows_list) - 1, iterations)
+    versions_list: List[int] = np.random.randint(0, len(num_rows_list) - 1, iterations)
     # constructing a random list of number of columns to be selected
-    number_columns_for_selection_list: List[int] = np.random.randint(0, len(all_columns)-1, iterations)
+    number_columns_for_selection_list: List[int] = np.random.randint(0, len(all_columns) - 1, iterations)

     count: int = 0
     # We will execute several time all head/tail operations with specific number of columns.
     # the number of columns consist of random columns and boundary cases see definition above
     for rows in num_rows_to_select:
-        selected_columns:List[str] = np.random.choice(all_columns, number_columns_for_selection_list[count], replace=False).tolist()
+        selected_columns: List[str] = np.random.choice(
+            all_columns, number_columns_for_selection_list[count], replace=False
+        ).tolist()
         snap: str = snapshots_list[count]
         ver: str = int(versions_list[count])
         logger.info(f"rows {rows} / snapshot {snap}")
@@ -887,11 +906,9 @@ def test_mem_leak_head_tail_memray(prepare_head_tails_symbol):
         logger.info(f"Iteration {count} / {iterations} completed")
         count += 1
         del selected_columns, df1, df2, df3, df4
-
+
     del store, symbol, num_rows_list, snapshot_names, all_columns
     del num_rows_to_select, important_values, snapshots_list, versions_list, number_columns_for_selection_list
     gc.collect()
-    time.sleep(10) # collection is not immediate
-    logger.info(f"Test completed in {time.time() - start_test}")
-
-
+    time.sleep(10)  # collection is not immediate
+    logger.info(f"Test completed in {time.time() - start_test}")
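The head/tail reads that produce `df1`–`df4` lie outside the hunks shown here. As a hedged sketch of what one loop iteration of such reads could look like against the public `Library` API (the exact calls and parameters in the real test may differ):

# Assumes lib, symbol, rows, snap, ver and selected_columns as prepared in the test above.
df1 = lib.head(symbol, n=rows, columns=selected_columns).data
df2 = lib.tail(symbol, n=rows, columns=selected_columns).data
df3 = lib.head(symbol, n=rows, as_of=snap, columns=selected_columns).data  # read as of a snapshot
df4 = lib.tail(symbol, n=rows, as_of=ver, columns=selected_columns).data   # read as of a version number
del df1, df2, df3, df4  # drop references so only genuine leaks count against the memray limit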