
Commit f03d198

Alex Seaton (poodlewars) authored and committed
Extended testing of how we handle unicode strings
Adjust the memory-leak thresholds in tests, as testing with unicode strings uses a bit more memory
1 parent b3b0273 commit f03d198

10 files changed: +380 -171 lines changed

python/arcticdb/util/test.py

+18 -2

@@ -244,8 +244,16 @@ def assert_frame_equal_rebuild_index_first(expected: pd.DataFrame, actual: pd.Da
     assert_frame_equal(left=expected, right=actual)
 
 
+unicode_symbol = "\u00A0"  # start of latin extensions
+unicode_symbols = "".join([chr(ord(unicode_symbol) + i) for i in range(100)])
+
+
 def random_string(length: int):
-    return "".join(random.choice(string.ascii_uppercase + string.digits) for _ in range(length))
+    if random.randint(0, 3) == 0:
+        # (probably) Give a unicode string one time in four, we have special handling in C++ for unicode
+        return "".join(random.choice(string.ascii_uppercase + unicode_symbols) for _ in range(length))
+    else:
+        return "".join(random.choice(string.ascii_uppercase + string.digits) for _ in range(length))
 
 
 def get_sample_dataframe(size=1000, seed=0, str_size=10):

@@ -433,7 +441,15 @@ def get_pickle():
     )[np.random.randint(0, 2)]
 
 
-def random_strings_of_length(num, length, unique):
+def random_ascii_strings(count, max_length):
+    result = []
+    for _ in range(count):
+        length = random.randrange(max_length + 1)
+        result.append("".join(random.choice(string.ascii_letters) for _ in range(length)))
+    return result
+
+
+def random_strings_of_length(num, length, unique=False):
     out = []
     for i in range(num):
         out.append(random_string(length))
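
As a sanity check (not part of the commit), a minimal standalone sketch of what the updated random_string produces: roughly one call in four draws from the 100-code-point pool starting at U+00A0, so a large sample should contain both ASCII-only and non-ASCII strings.

import random
import string

unicode_symbols = "".join(chr(0x00A0 + i) for i in range(100))

def random_string(length: int):
    # Mirrors the helper above: ~1 in 4 strings draws from the unicode pool.
    if random.randint(0, 3) == 0:
        return "".join(random.choice(string.ascii_uppercase + unicode_symbols) for _ in range(length))
    return "".join(random.choice(string.ascii_uppercase + string.digits) for _ in range(length))

samples = [random_string(10) for _ in range(1_000)]
assert any(not s.isascii() for s in samples)  # the unicode path was exercised
assert any(s.isascii() for s in samples)      # the plain ASCII path was exercised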

python/tests/integration/arcticdb/version_store/test_basic_version_store.py

-41

@@ -458,47 +458,6 @@ def test_prune_previous_versions_append_batch(basic_store):
     assert len(lib_tool.find_keys(KeyType.SYMBOL_LIST)) == 4
 
 
-def test_batch_append_unicode(basic_store):
-    symbol = "test_append_unicode"
-    uc = "\u0420\u043e\u0441\u0441\u0438\u044f"
-
-    df1 = pd.DataFrame(
-        index=[pd.Timestamp("2018-01-02"), pd.Timestamp("2018-01-03")],
-        data={"a": ["123", uc]},
-    )
-    basic_store.batch_write(symbols=[symbol], data_vector=[df1])
-    vit = basic_store.batch_read([symbol])[symbol]
-    assert_equal(vit.data, df1)
-
-    df2 = pd.DataFrame(
-        index=[pd.Timestamp("2018-01-04"), pd.Timestamp("2018-01-05")],
-        data={"a": ["123", uc]},
-    )
-    basic_store.batch_append(symbols=[symbol], data_vector=[df2])
-    vit = basic_store.batch_read([symbol])[symbol]
-    expected = pd.concat([df1, df2])
-    assert_equal(vit.data, expected)
-
-
-def test_batch_write_metadata_unicode(basic_store):
-    symbol = "test_append_unicode"
-    uc = "\u0420\u043e\u0441\u0441\u0438\u044f"
-    df1 = pd.DataFrame(
-        index=[pd.Timestamp("2018-01-02"), pd.Timestamp("2018-01-03")],
-        data={"a": ["123", uc]},
-    )
-
-    basic_store.batch_write(symbols=[symbol], data_vector=[df1])
-    vit = basic_store.batch_read([symbol])[symbol]
-    assert_equal(vit.data, df1)
-
-    meta = {"a": 1, "b": uc}
-    basic_store.batch_write_metadata(symbols=[symbol], metadata_vector=[meta])
-    vits = basic_store.batch_read_metadata([symbol])
-    metadata = vits[symbol].metadata
-    assert metadata == meta
-
-
 def test_deleting_unknown_symbol(basic_store, symbol):
     df = sample_dataframe()

python/tests/integration/arcticdb/version_store/test_symbol_list.py

+5 -16

@@ -10,7 +10,7 @@
 import pytest
 
 from arcticdb.config import Defaults
-from arcticdb.util.test import sample_dataframe
+from arcticdb.util.test import sample_dataframe, random_ascii_strings
 from arcticdb.version_store._store import NativeVersionStore
 from arcticdb.toolbox.library_tool import (
     VariantKey,

@@ -25,10 +25,9 @@
 
 from multiprocessing import Pool
 from arcticdb_ext import set_config_int
-import random
-import string
 from tests.util.mark import MACOS_CONDA_BUILD
 
+
 @pytest.fixture
 def small_max_delta():
     set_config_int("SymbolList.MaxDelta", 2)

@@ -278,16 +277,6 @@ def test_lock_contention(small_max_delta, basic_store, mode):
     assert lt.find_keys(KeyType.SYMBOL_LIST) != orig_sl
 
 
-def random_strings(count, max_length):
-    result = []
-    for _ in range(count):
-        length = random.randrange(max_length) + 2
-        result.append(
-            "".join(random.choice(string.ascii_letters) for _ in range(length))
-        )
-    return result
-
-
 def _tiny_df(idx):
     return pd.DataFrame(
         {"x": np.arange(idx % 10, idx % 10 + 10)},

@@ -346,16 +335,16 @@ def test_symbol_list_parallel_stress_with_delete(
     num_cycles = 1
     symbol_length = 6
 
-    pre_existing_symbols = random_strings(num_pre_existing_symbols, symbol_length)
+    pre_existing_symbols = random_ascii_strings(num_pre_existing_symbols, symbol_length)
     for idx, existing in enumerate(pre_existing_symbols):
         lib.write(existing, _tiny_df(idx))
 
     if same_symbols:
-        frozen_symbols = random_strings(num_symbols, symbol_length)
+        frozen_symbols = random_ascii_strings(num_symbols, symbol_length)
         symbols = [frozen_symbols for _ in range(num_workers)]
     else:
         symbols = [
-            random_strings(num_symbols, symbol_length) for _ in range(num_workers)
+            random_ascii_strings(num_symbols, symbol_length) for _ in range(num_workers)
        ]
 
     with Pool(num_workers) as p:
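
One behavioural detail worth noting: the removed local random_strings drew lengths from random.randrange(max_length) + 2 (2 through max_length + 1, never empty), while the shared random_ascii_strings uses random.randrange(max_length + 1) (0 through max_length, so empty strings are possible). A quick sketch of the two distributions:

import random

max_length = 6
old_lengths = {random.randrange(max_length) + 2 for _ in range(10_000)}  # removed helper
new_lengths = {random.randrange(max_length + 1) for _ in range(10_000)}  # random_ascii_strings
print(sorted(old_lengths))  # [2, 3, 4, 5, 6, 7]
print(sorted(new_lengths))  # [0, 1, 2, 3, 4, 5, 6]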

python/tests/stress/arcticdb/version_store/test_mem_leaks.py

+3 -3

@@ -361,7 +361,7 @@ def proc_to_examine():
     run the test from command line again to assure it runs ok before commit
 
     """
-    max_mem_bytes = 295_623_040
+    max_mem_bytes = 350_000_000
 
     check_process_memory_leaks(proc_to_examine, 20, max_mem_bytes, 80.0)
 

@@ -705,7 +705,7 @@ def test_mem_leak_querybuilder_read_batch_memray(library_with_symbol):
     mem_query(lib, df, read_batch=True)
 
 @MEMRAY_TESTS_MARK
-@pytest.mark.limit_memory("490 MB")
+@pytest.mark.limit_memory("600 MB")
 @pytest.mark.skipif(MACOS, reason="Mac OS mem usage is harder to predicts than WINDOWS")
 def test_mem_limit_querybuilder_read_memray(library_with_symbol):
     """

@@ -719,7 +719,7 @@ def test_mem_limit_querybuilder_read_batch_memray(library_with_symbol):
     mem_query(lib, df)
 
 @MEMRAY_TESTS_MARK
-@pytest.mark.limit_memory("490 MB")
+@pytest.mark.limit_memory("600 MB")
 @pytest.mark.skipif(MACOS, reason="Mac OS mem usage is harder to predicts than WINDOWS")
 def test_mem_limit_querybuilder_read_batch_memray(library_with_symbol):
     """

python/tests/unit/arcticdb/version_store/test_append.py

-22

@@ -32,28 +32,6 @@ def test_append_simple(lmdb_version_store):
     assert_frame_equal(vit.data, expected)
 
 
-def test_append_unicode(lmdb_version_store):
-    symbol = "test_append_unicode"
-    uc = "\u0420\u043e\u0441\u0441\u0438\u044f"
-
-    df1 = pd.DataFrame(
-        index=[pd.Timestamp("2018-01-02"), pd.Timestamp("2018-01-03")],
-        data={"a": ["123", uc]},
-    )
-    lmdb_version_store.write(symbol, df1)
-    vit = lmdb_version_store.read(symbol)
-    assert_frame_equal(vit.data, df1)
-
-    df2 = pd.DataFrame(
-        index=[pd.Timestamp("2018-01-04"), pd.Timestamp("2018-01-05")],
-        data={"a": ["123", uc]},
-    )
-    lmdb_version_store.append(symbol, df2)
-    vit = lmdb_version_store.read(symbol)
-    expected = pd.concat([df1, df2])
-    assert_frame_equal(vit.data, expected)
-
-
 @pytest.mark.parametrize("empty_types", (True, False))
 @pytest.mark.parametrize("dynamic_schema", (True, False))
 def test_append_range_index(version_store_factory, empty_types, dynamic_schema):

python/tests/unit/arcticdb/version_store/test_sort.py

+6 -17

@@ -2,12 +2,11 @@
 import numpy as np
 import arcticdb as adb
 from arcticdb.util.test import assert_frame_equal
-import random
-import string
-
 from arcticdb_ext.storage import KeyType
 from arcticdb_ext.version_store import SortedValue
 
+from arcticdb.util.test import random_strings_of_length
+
 
 def test_stage_finalize(arctic_library):
     symbol = "AAPL"

@@ -73,16 +72,6 @@ def test_stage_finalize_dynamic(arctic_client, lib_name):
     pd.testing.assert_frame_equal(result, expected)
 
 
-def random_strings(count, max_length):
-    result = []
-    for _ in range(count):
-        length = random.randrange(max_length) + 2
-        result.append(
-            "".join(random.choice(string.ascii_letters) for _ in range(length))
-        )
-    return result
-
-
 def test_stage_finalize_strings(arctic_library):
     symbol = "AAPL"
     sort_cols = ["timestamp", "col1"]

@@ -91,14 +80,14 @@ def test_stage_finalize_strings(arctic_library):
         "timestamp": pd.date_range("2023-01-01", periods=25, freq="H").repeat(2),
         "col1": np.arange(1, 51),
         "col2": [f"a{i:02d}" for i in range(1, 51)],
-        "col3": random_strings(50, 12)
+        "col3": random_strings_of_length(50, 12)
     }).set_index("timestamp")
 
     df2 = pd.DataFrame({
         "timestamp": pd.date_range("2023-01-04", periods=25, freq="H").repeat(2),
         "col1": np.arange(51, 101),
         "col2": [f"b{i:02d}" for i in range(1, 51)],
-        "col3": random_strings(50, 12)
+        "col3": random_strings_of_length(50, 12)
     }).set_index("timestamp")
 
     df1_shuffled = df1.sample(frac=1)

@@ -122,15 +111,15 @@ def test_stage_finalize_strings_dynamic(arctic_client, lib_name):
         "timestamp": pd.date_range("2023-01-01", periods=25, freq="H").repeat(2),
         "col1": np.arange(1, 51),
         "col2": [f"a{i:02d}" for i in range(1, 51)],
-        "col3": random_strings(50, 12)
+        "col3": random_strings_of_length(50, 12)
     }).set_index("timestamp")
 
     df2 = pd.DataFrame({
         "timestamp": pd.date_range("2023-01-04", periods=25, freq="H").repeat(2),
         "col1": np.arange(51, 101),
         "col2": [f"b{i:02d}" for i in range(1, 51)],
         "col4": [f"a{i:02d}" for i in range(101, 151)],
-        "col5": random_strings(50, 12)
+        "col5": random_strings_of_length(50, 12)
     }).set_index("timestamp")
 
     df1_shuffled = df1.sample(frac=1)
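
Note the swapped-in helper is not a drop-in equivalent: the deleted local random_strings produced variable-length ASCII-only strings (lengths 2 through max_length + 1), whereas random_strings_of_length(50, 12) produces 50 strings of exactly 12 characters, each generated by the shared random_string and therefore occasionally containing the non-ASCII code points introduced above. A quick check of the fixed-length property:

from arcticdb.util.test import random_strings_of_length

col3 = random_strings_of_length(50, 12)
assert all(len(s) == 12 for s in col3)  # fixed length, unlike the removed helper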

python/tests/unit/arcticdb/version_store/test_string_dedup.py

+5 -12

@@ -7,7 +7,6 @@
 """
 import gc
 import random
-import string
 import sys
 
 import numpy as np

@@ -16,13 +15,7 @@
 
 from datetime import datetime as dt
 
-
-def random_strings(count, max_length):
-    result = []
-    for _ in range(count):
-        length = random.randrange(max_length + 1)
-        result.append("".join(random.choice(string.ascii_letters) for _ in range(length)))
-    return result
+from arcticdb.util.test import random_ascii_strings
 
 
 def generate_dataframe(columns, number_of_rows, strings, index_start="2000-1-1"):

@@ -50,7 +43,7 @@ def getsize(df):
 def test_string_dedup_basic(lmdb_version_store_tiny_segment):
     lib = lmdb_version_store_tiny_segment
     symbol = "test_string_dedup_basic"
-    original_df = generate_dataframe(["col1", "col2", "col3", "col4"], 1000, random_strings(100, 10))
+    original_df = generate_dataframe(["col1", "col2", "col3", "col4"], 1000, random_ascii_strings(100, 10))
     lib.write(symbol, original_df, dynamic_strings=True)
     read_df_with_dedup = lib.read(symbol, optimise_string_memory=True).data
     read_df_without_dedup = lib.read(symbol, optimise_string_memory=False).data

@@ -63,7 +56,7 @@
 def test_string_dedup_dynamic_schema(lmdb_version_store_dynamic_schema):
     lib = lmdb_version_store_dynamic_schema
     symbol = "test_string_dedup_dynamic_schema"
-    unique_strings = random_strings(100, 10)
+    unique_strings = random_ascii_strings(100, 10)
     original_df = generate_dataframe(["col1"], 1000, unique_strings, "2000-1-1")
     # This will be different to original_df, as the value in each row is chosen at random from the unique string pool
     append_df = generate_dataframe(["col1"], 1000, unique_strings, "2010-1-1")

@@ -91,7 +84,7 @@ def test_string_dedup_nans(lmdb_version_store_tiny_segment):
     lib = lmdb_version_store_tiny_segment
     symbol = "test_string_dedup_nans"
     # Throw a nan into the unique string pool
-    unique_strings = random_strings(9, 10)
+    unique_strings = random_ascii_strings(9, 10)
     unique_strings.append(np.nan)
     columns = ["col1", "col2", "col3", "col4"]
     original_df = generate_dataframe(columns, 1000, unique_strings)

@@ -141,7 +134,7 @@ def test_string_dedup_performance(lmdb_version_store):
 
     for unique_string in unique_strings:
         for string_length in string_lengths:
-            string_pool = random_strings(unique_string, string_length)
+            string_pool = random_ascii_strings(unique_string, string_length)
             for rows in number_of_rows:
                 print("Unique strings: {}".format(unique_string))
                 print("String length: {}".format(string_length))
