
Commit f03d198

Alex Seaton (poodlewars) authored and committed
Extended testing of how we handle unicode strings
Adjust the memory-leak thresholds in tests, as testing with unicode strings uses a bit more memory
1 parent b3b0273 commit f03d198

10 files changed: +380 -171 lines changed

python/arcticdb/util/test.py

+18 -2

@@ -244,8 +244,16 @@ def assert_frame_equal_rebuild_index_first(expected: pd.DataFrame, actual: pd.Da
     assert_frame_equal(left=expected, right=actual)
 
 
+unicode_symbol = "\u00A0"  # start of latin extensions
+unicode_symbols = "".join([chr(ord(unicode_symbol) + i) for i in range(100)])
+
+
 def random_string(length: int):
-    return "".join(random.choice(string.ascii_uppercase + string.digits) for _ in range(length))
+    if random.randint(0, 3) == 0:
+        # (probably) Give a unicode string one time in four, we have special handling in C++ for unicode
+        return "".join(random.choice(string.ascii_uppercase + unicode_symbols) for _ in range(length))
+    else:
+        return "".join(random.choice(string.ascii_uppercase + string.digits) for _ in range(length))
 
 
 def get_sample_dataframe(size=1000, seed=0, str_size=10):

@@ -433,7 +441,15 @@ def get_pickle():
     )[np.random.randint(0, 2)]
 
 
-def random_strings_of_length(num, length, unique):
+def random_ascii_strings(count, max_length):
+    result = []
+    for _ in range(count):
+        length = random.randrange(max_length + 1)
+        result.append("".join(random.choice(string.ascii_letters) for _ in range(length)))
+    return result
+
+
+def random_strings_of_length(num, length, unique=False):
     out = []
     for i in range(num):
         out.append(random_string(length))
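
As a sanity check (not part of the commit), a minimal standalone sketch of what the updated random_string produces: roughly one call in four draws from the 100-code-point pool starting at U+00A0, so a large sample should contain both ASCII-only and non-ASCII strings.

import random
import string

unicode_symbols = "".join(chr(0x00A0 + i) for i in range(100))

def random_string(length: int):
    # Mirrors the helper above: ~1 in 4 strings draws from the unicode pool.
    if random.randint(0, 3) == 0:
        return "".join(random.choice(string.ascii_uppercase + unicode_symbols) for _ in range(length))
    return "".join(random.choice(string.ascii_uppercase + string.digits) for _ in range(length))

samples = [random_string(10) for _ in range(1_000)]
assert any(not s.isascii() for s in samples)  # the unicode path was exercised
assert any(s.isascii() for s in samples)      # the plain ASCII path was exercised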

python/tests/integration/arcticdb/version_store/test_basic_version_store.py

-41

@@ -458,47 +458,6 @@ def test_prune_previous_versions_append_batch(basic_store):
     assert len(lib_tool.find_keys(KeyType.SYMBOL_LIST)) == 4
 
 
-def test_batch_append_unicode(basic_store):
-    symbol = "test_append_unicode"
-    uc = "\u0420\u043e\u0441\u0441\u0438\u044f"
-
-    df1 = pd.DataFrame(
-        index=[pd.Timestamp("2018-01-02"), pd.Timestamp("2018-01-03")],
-        data={"a": ["123", uc]},
-    )
-    basic_store.batch_write(symbols=[symbol], data_vector=[df1])
-    vit = basic_store.batch_read([symbol])[symbol]
-    assert_equal(vit.data, df1)
-
-    df2 = pd.DataFrame(
-        index=[pd.Timestamp("2018-01-04"), pd.Timestamp("2018-01-05")],
-        data={"a": ["123", uc]},
-    )
-    basic_store.batch_append(symbols=[symbol], data_vector=[df2])
-    vit = basic_store.batch_read([symbol])[symbol]
-    expected = pd.concat([df1, df2])
-    assert_equal(vit.data, expected)
-
-
-def test_batch_write_metadata_unicode(basic_store):
-    symbol = "test_append_unicode"
-    uc = "\u0420\u043e\u0441\u0441\u0438\u044f"
-    df1 = pd.DataFrame(
-        index=[pd.Timestamp("2018-01-02"), pd.Timestamp("2018-01-03")],
-        data={"a": ["123", uc]},
-    )
-
-    basic_store.batch_write(symbols=[symbol], data_vector=[df1])
-    vit = basic_store.batch_read([symbol])[symbol]
-    assert_equal(vit.data, df1)
-
-    meta = {"a": 1, "b": uc}
-    basic_store.batch_write_metadata(symbols=[symbol], metadata_vector=[meta])
-    vits = basic_store.batch_read_metadata([symbol])
-    metadata = vits[symbol].metadata
-    assert metadata == meta
-
-
 def test_deleting_unknown_symbol(basic_store, symbol):
     df = sample_dataframe()

python/tests/integration/arcticdb/version_store/test_symbol_list.py

+5 -16

@@ -10,7 +10,7 @@
 import pytest
 
 from arcticdb.config import Defaults
-from arcticdb.util.test import sample_dataframe
+from arcticdb.util.test import sample_dataframe, random_ascii_strings
 from arcticdb.version_store._store import NativeVersionStore
 from arcticdb.toolbox.library_tool import (
     VariantKey,

@@ -25,10 +25,9 @@
 
 from multiprocessing import Pool
 from arcticdb_ext import set_config_int
-import random
-import string
 from tests.util.mark import MACOS_CONDA_BUILD
 
+
 @pytest.fixture
 def small_max_delta():
     set_config_int("SymbolList.MaxDelta", 2)

@@ -278,16 +277,6 @@ def test_lock_contention(small_max_delta, basic_store, mode):
     assert lt.find_keys(KeyType.SYMBOL_LIST) != orig_sl
 
 
-def random_strings(count, max_length):
-    result = []
-    for _ in range(count):
-        length = random.randrange(max_length) + 2
-        result.append(
-            "".join(random.choice(string.ascii_letters) for _ in range(length))
-        )
-    return result
-
-
 def _tiny_df(idx):
     return pd.DataFrame(
         {"x": np.arange(idx % 10, idx % 10 + 10)},

@@ -346,16 +335,16 @@ def test_symbol_list_parallel_stress_with_delete(
     num_cycles = 1
     symbol_length = 6
 
-    pre_existing_symbols = random_strings(num_pre_existing_symbols, symbol_length)
+    pre_existing_symbols = random_ascii_strings(num_pre_existing_symbols, symbol_length)
     for idx, existing in enumerate(pre_existing_symbols):
         lib.write(existing, _tiny_df(idx))
 
     if same_symbols:
-        frozen_symbols = random_strings(num_symbols, symbol_length)
+        frozen_symbols = random_ascii_strings(num_symbols, symbol_length)
         symbols = [frozen_symbols for _ in range(num_workers)]
     else:
         symbols = [
-            random_strings(num_symbols, symbol_length) for _ in range(num_workers)
+            random_ascii_strings(num_symbols, symbol_length) for _ in range(num_workers)
        ]
 
     with Pool(num_workers) as p:
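
One behavioural detail worth noting: the removed local random_strings drew lengths from random.randrange(max_length) + 2 (2 through max_length + 1, never empty), while the shared random_ascii_strings uses random.randrange(max_length + 1) (0 through max_length, so empty strings are possible). A quick sketch of the two distributions:

import random

max_length = 6
old_lengths = {random.randrange(max_length) + 2 for _ in range(10_000)}  # removed helper
new_lengths = {random.randrange(max_length + 1) for _ in range(10_000)}  # random_ascii_strings
print(sorted(old_lengths))  # [2, 3, 4, 5, 6, 7]
print(sorted(new_lengths))  # [0, 1, 2, 3, 4, 5, 6]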

python/tests/stress/arcticdb/version_store/test_mem_leaks.py

+3 -3

@@ -361,7 +361,7 @@ def proc_to_examine():
     run the test from command line again to assure it runs ok before commit
 
     """
-    max_mem_bytes = 295_623_040
+    max_mem_bytes = 350_000_000
 
     check_process_memory_leaks(proc_to_examine, 20, max_mem_bytes, 80.0)
 

@@ -705,7 +705,7 @@ def test_mem_leak_querybuilder_read_batch_memray(library_with_symbol):
     mem_query(lib, df, read_batch=True)
 
 @MEMRAY_TESTS_MARK
-@pytest.mark.limit_memory("490 MB")
+@pytest.mark.limit_memory("600 MB")
 @pytest.mark.skipif(MACOS, reason="Mac OS mem usage is harder to predicts than WINDOWS")
 def test_mem_limit_querybuilder_read_memray(library_with_symbol):
     """

@@ -719,7 +719,7 @@ def test_mem_limit_querybuilder_read_batch_memray(library_with_symbol):
     mem_query(lib, df)
 
 @MEMRAY_TESTS_MARK
-@pytest.mark.limit_memory("490 MB")
+@pytest.mark.limit_memory("600 MB")
 @pytest.mark.skipif(MACOS, reason="Mac OS mem usage is harder to predicts than WINDOWS")
 def test_mem_limit_querybuilder_read_batch_memray(library_with_symbol):
     """

python/tests/unit/arcticdb/version_store/test_append.py

-22

@@ -32,28 +32,6 @@ def test_append_simple(lmdb_version_store):
     assert_frame_equal(vit.data, expected)
 
 
-def test_append_unicode(lmdb_version_store):
-    symbol = "test_append_unicode"
-    uc = "\u0420\u043e\u0441\u0441\u0438\u044f"
-
-    df1 = pd.DataFrame(
-        index=[pd.Timestamp("2018-01-02"), pd.Timestamp("2018-01-03")],
-        data={"a": ["123", uc]},
-    )
-    lmdb_version_store.write(symbol, df1)
-    vit = lmdb_version_store.read(symbol)
-    assert_frame_equal(vit.data, df1)
-
-    df2 = pd.DataFrame(
-        index=[pd.Timestamp("2018-01-04"), pd.Timestamp("2018-01-05")],
-        data={"a": ["123", uc]},
-    )
-    lmdb_version_store.append(symbol, df2)
-    vit = lmdb_version_store.read(symbol)
-    expected = pd.concat([df1, df2])
-    assert_frame_equal(vit.data, expected)
-
-
 @pytest.mark.parametrize("empty_types", (True, False))
 @pytest.mark.parametrize("dynamic_schema", (True, False))
 def test_append_range_index(version_store_factory, empty_types, dynamic_schema):

python/tests/unit/arcticdb/version_store/test_sort.py

+6 -17

@@ -2,12 +2,11 @@
 import numpy as np
 import arcticdb as adb
 from arcticdb.util.test import assert_frame_equal
-import random
-import string
-
 from arcticdb_ext.storage import KeyType
 from arcticdb_ext.version_store import SortedValue
 
+from arcticdb.util.test import random_strings_of_length
+
 
 def test_stage_finalize(arctic_library):
     symbol = "AAPL"

@@ -73,16 +72,6 @@ def test_stage_finalize_dynamic(arctic_client, lib_name):
     pd.testing.assert_frame_equal(result, expected)
 
 
-def random_strings(count, max_length):
-    result = []
-    for _ in range(count):
-        length = random.randrange(max_length) + 2
-        result.append(
-            "".join(random.choice(string.ascii_letters) for _ in range(length))
-        )
-    return result
-
-
 def test_stage_finalize_strings(arctic_library):
     symbol = "AAPL"
     sort_cols = ["timestamp", "col1"]

@@ -91,14 +80,14 @@ def test_stage_finalize_strings(arctic_library):
         "timestamp": pd.date_range("2023-01-01", periods=25, freq="H").repeat(2),
         "col1": np.arange(1, 51),
         "col2": [f"a{i:02d}" for i in range(1, 51)],
-        "col3": random_strings(50, 12)
+        "col3": random_strings_of_length(50, 12)
     }).set_index("timestamp")
 
     df2 = pd.DataFrame({
         "timestamp": pd.date_range("2023-01-04", periods=25, freq="H").repeat(2),
         "col1": np.arange(51, 101),
         "col2": [f"b{i:02d}" for i in range(1, 51)],
-        "col3": random_strings(50, 12)
+        "col3": random_strings_of_length(50, 12)
     }).set_index("timestamp")
 
     df1_shuffled = df1.sample(frac=1)

@@ -122,15 +111,15 @@ def test_stage_finalize_strings_dynamic(arctic_client, lib_name):
         "timestamp": pd.date_range("2023-01-01", periods=25, freq="H").repeat(2),
         "col1": np.arange(1, 51),
         "col2": [f"a{i:02d}" for i in range(1, 51)],
-        "col3": random_strings(50, 12)
+        "col3": random_strings_of_length(50, 12)
     }).set_index("timestamp")
 
     df2 = pd.DataFrame({
         "timestamp": pd.date_range("2023-01-04", periods=25, freq="H").repeat(2),
         "col1": np.arange(51, 101),
         "col2": [f"b{i:02d}" for i in range(1, 51)],
         "col4": [f"a{i:02d}" for i in range(101, 151)],
-        "col5": random_strings(50, 12)
+        "col5": random_strings_of_length(50, 12)
     }).set_index("timestamp")
 
     df1_shuffled = df1.sample(frac=1)
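
Note the swapped-in helper is not a drop-in equivalent: the deleted local random_strings produced variable-length ASCII-only strings (lengths 2 through max_length + 1), whereas random_strings_of_length(50, 12) produces 50 strings of exactly 12 characters, each generated by the shared random_string and therefore occasionally containing the non-ASCII code points introduced above. A quick check of the fixed-length property:

from arcticdb.util.test import random_strings_of_length

col3 = random_strings_of_length(50, 12)
assert all(len(s) == 12 for s in col3)  # fixed length, unlike the removed helper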

python/tests/unit/arcticdb/version_store/test_string_dedup.py

+5 -12

@@ -7,7 +7,6 @@
 """
 import gc
 import random
-import string
 import sys
 
 import numpy as np

@@ -16,13 +15,7 @@
 
 from datetime import datetime as dt
 
-
-def random_strings(count, max_length):
-    result = []
-    for _ in range(count):
-        length = random.randrange(max_length + 1)
-        result.append("".join(random.choice(string.ascii_letters) for _ in range(length)))
-    return result
+from arcticdb.util.test import random_ascii_strings
 
 
 def generate_dataframe(columns, number_of_rows, strings, index_start="2000-1-1"):

@@ -50,7 +43,7 @@ def getsize(df):
 def test_string_dedup_basic(lmdb_version_store_tiny_segment):
     lib = lmdb_version_store_tiny_segment
     symbol = "test_string_dedup_basic"
-    original_df = generate_dataframe(["col1", "col2", "col3", "col4"], 1000, random_strings(100, 10))
+    original_df = generate_dataframe(["col1", "col2", "col3", "col4"], 1000, random_ascii_strings(100, 10))
     lib.write(symbol, original_df, dynamic_strings=True)
     read_df_with_dedup = lib.read(symbol, optimise_string_memory=True).data
     read_df_without_dedup = lib.read(symbol, optimise_string_memory=False).data

@@ -63,7 +56,7 @@
 def test_string_dedup_dynamic_schema(lmdb_version_store_dynamic_schema):
     lib = lmdb_version_store_dynamic_schema
     symbol = "test_string_dedup_dynamic_schema"
-    unique_strings = random_strings(100, 10)
+    unique_strings = random_ascii_strings(100, 10)
     original_df = generate_dataframe(["col1"], 1000, unique_strings, "2000-1-1")
     # This will be different to original_df, as the value in each row is chosen at random from the unique string pool
     append_df = generate_dataframe(["col1"], 1000, unique_strings, "2010-1-1")

@@ -91,7 +84,7 @@ def test_string_dedup_nans(lmdb_version_store_tiny_segment):
     lib = lmdb_version_store_tiny_segment
     symbol = "test_string_dedup_nans"
     # Throw a nan into the unique string pool
-    unique_strings = random_strings(9, 10)
+    unique_strings = random_ascii_strings(9, 10)
     unique_strings.append(np.nan)
     columns = ["col1", "col2", "col3", "col4"]
     original_df = generate_dataframe(columns, 1000, unique_strings)

@@ -141,7 +134,7 @@ def test_string_dedup_performance(lmdb_version_store):
 
     for unique_string in unique_strings:
         for string_length in string_lengths:
-            string_pool = random_strings(unique_string, string_length)
+            string_pool = random_ascii_strings(unique_string, string_length)
             for rows in number_of_rows:
                 print("Unique strings: {}".format(unique_string))
                 print("String length: {}".format(string_length))
