Skip to content

Commit 3f523c8

Browse files
committed
Add asv benchmarks for Azure Storage
Signed-off-by: Julien Jerphanion <[email protected]>
1 parent 4c596f2 commit 3f523c8

File tree

1 file changed

+182
-0
lines changed

1 file changed

+182
-0
lines changed
Lines changed: 182 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,182 @@
1+
"""
2+
Copyright 2024 Man Group Operations Limited
3+
4+
Use of this software is governed by the Business Source License 1.1 included in the file licenses/BSL.txt.
5+
6+
As of the Change Date specified in that file, in accordance with the Business Source License, use of this software will be governed by the Apache License, version 2.0.
7+
"""
8+
9+
import time
10+
import numpy as np
11+
import pandas as pd
12+
from arcticdb import Arctic
13+
from arcticdb.options import LibraryOptions
14+
from arcticdb.storage_fixtures.azure import AzuriteStorageFixtureFactory
15+
from arcticdb.util.utils import DFGenerator, ListGenerators, TimestampNumber
16+
from arcticdb.version_store.library import Library
17+
from .common import *
18+
19+
class AzureReadWrite(AsvBase):
    """
    General read/write benchmarks on Azure Blob Storage.

    Covers whole-symbol reads and writes, column-projected reads (single
    float column and a mixed-type selection), staged writes followed by
    compaction, and date-range reads over the trailing 20% of rows.
    """

    rounds = 1
    number = 3  # invokes 3 times the test runs between each setup-teardown
    repeat = 1  # defines the number of times the measurements will invoke setup-teardown
    min_run_count = 1
    warmup_time = 0

    timeout = 1200

    library_manager = TestLibraryManager(storage=Storage.AZURE, name_benchmark="READ_WRITE")
    library_type = LibraryType.PERSISTENT

    param_names = ["num_rows"]
    params = [10_000_000]  # 10M rows

    number_columns = 100  # 100 columns

    def get_library_manager(self) -> TestLibraryManager:
        return AzureReadWrite.library_manager

    def get_population_policy(self) -> LibraryPopulationPolicy:
        lpp = LibraryPopulationPolicy(self.get_logger())
        lpp.set_parameters(AzureReadWrite.params, [AzureReadWrite.number_columns])
        return lpp

    def setup_cache(self):
        # NOTE: asv executes setup_cache in a separate process and shares
        # only its *return value* with the benchmark processes — instance
        # attributes assigned here are NOT visible in setup()/time_*().
        # All per-run state is therefore initialized in setup() below.
        self.setup_library()

    def setup(self, num_rows):
        self.setup_library()
        self.lib = self.get_library_manager().get_library(AzureReadWrite.library_type, 1)
        self.symbol = "test_symbol"

        # Keep the population policy so get_last_x_percent_date_range can
        # read the generator's initial timestamp and frequency (the original
        # never assigned self.population_policy, causing an AttributeError).
        self.population_policy = self.get_population_policy()

        # Generate test data with mixed types including strings
        df_generator = DFGenerator(num_rows, [AzureReadWrite.number_columns])
        self.to_write_df = df_generator.generate_dataframe()

        # Add some string columns
        string_cols = [f"string_{i}" for i in range(10)]  # 10 string columns
        for col in string_cols:
            self.to_write_df[col] = [f"string_value_{i}" for i in range(num_rows)]

        # Write the data so the read benchmarks have a populated symbol
        self.lib.write(self.symbol, self.to_write_df)

        # Calculate date range for last 20% of rows
        self.last_20 = self.get_last_x_percent_date_range(num_rows, 20)

    def time_read(self, num_rows):
        self.lib.read(self.symbol)

    def peakmem_read(self, num_rows):
        self.lib.read(self.symbol)

    def time_write(self, num_rows):
        self.lib.write(self.symbol, self.to_write_df)

    def peakmem_write(self, num_rows):
        self.lib.write(self.symbol, self.to_write_df)

    def time_read_with_column_float(self, num_rows):
        COLS = ["float2"]
        self.lib.read(symbol=self.symbol, columns=COLS).data

    def peakmem_read_with_column_float(self, num_rows):
        COLS = ["float2"]
        self.lib.read(symbol=self.symbol, columns=COLS).data

    def time_read_with_columns_all_types(self, num_rows):
        COLS = ["float2", "string_0", "bool", "int64", "uint64"]
        self.lib.read(symbol=self.symbol, columns=COLS).data

    def peakmem_read_with_columns_all_types(self, num_rows):
        COLS = ["float2", "string_0", "bool", "int64", "uint64"]
        self.lib.read(symbol=self.symbol, columns=COLS).data

    def time_write_staged(self, num_rows):
        lib = self.lib
        lib.write(self.symbol, self.to_write_df, staged=True)
        # Staged data is only readable after compaction, so include it in the measurement
        lib._nvs.compact_incomplete(self.symbol, False, False)

    def peakmem_write_staged(self, num_rows):
        lib = self.lib
        lib.write(self.symbol, self.to_write_df, staged=True)
        lib._nvs.compact_incomplete(self.symbol, False, False)

    def time_read_with_date_ranges_last20_percent_rows(self, num_rows):
        self.lib.read(symbol=self.symbol, date_range=self.last_20).data

    def peakmem_read_with_date_ranges_last20_percent_rows(self, num_rows):
        self.lib.read(symbol=self.symbol, date_range=self.last_20).data

    def get_last_x_percent_date_range(self, num_rows, percents):
        """Return the date range covering the trailing `percents`% of `num_rows` rows."""
        df_generator = self.population_policy.df_generator
        freq = df_generator.freq
        return DataRangeUtils.get_last_x_percent_date_range(
            initial_timestamp=df_generator.initial_timestamp,
            freq=freq,
            num_rows=num_rows,
            percents=percents,
        )
125+
126+
127+
class AzureListVersions(AsvBase):
    """
    Benchmarks list_versions() performance on Azure Blob Storage over a
    library populated with many symbols (1000 rows x 10 columns each).
    """

    rounds = 1
    # Run the benchmarked function exactly once per setup-teardown cycle:
    # the benchmarks below assert a single invocation, so the original
    # number=3 would trip the assertion on the second invocation.
    number = 1
    repeat = 1
    min_run_count = 1
    warmup_time = 0

    timeout = 1200

    library_manager = TestLibraryManager(storage=Storage.AZURE, name_benchmark="LIST_VERSIONS")
    library_type = LibraryType.PERSISTENT

    param_names = ["num_symbols"]
    params = [10_000]  # 10k symbols

    def get_library_manager(self) -> TestLibraryManager:
        return AzureListVersions.library_manager

    def get_population_policy(self) -> LibraryPopulationPolicy:
        lpp = LibraryPopulationPolicy(self.get_logger())
        # 1000 rows per symbol, 10 columns
        lpp.set_parameters([1000] * AzureListVersions.params[0], [10])
        return lpp

    def setup_cache(self):
        # NOTE: asv executes setup_cache in a separate process and shares
        # only its *return value* — instance attributes assigned here are
        # NOT visible in setup()/benchmarks, so the per-cycle counter is
        # initialized in setup() instead.
        self.setup_library()

    def setup(self, num_symbols):
        self.setup_library()
        self.lib = self.get_library_manager().get_library(AzureListVersions.library_type, num_symbols)
        # Reset on every setup-teardown cycle so the single-invocation
        # assertions in the benchmarks hold.
        self.test_counter = 1

        # Generate and write test data (skipped when already populated)
        start = time.time()
        policy = self.get_population_policy()
        policy.set_parameters([1000] * num_symbols, [10])
        if not self.library_manager.has_library(AzureListVersions.library_type, num_symbols):
            populate_library(self.library_manager, policy, AzureListVersions.library_type, num_symbols)
            self.get_logger().info(f"Generated {num_symbols} symbols with 1000 rows each in {time.time() - start:.2f}s")
        else:
            self.get_logger().info("Library already exists, population skipped")

        # Clear cache to ensure we're testing actual storage performance
        self.lib._nvs.version_store._clear_symbol_list_keys()

    def time_list_versions(self, num_symbols):
        assert self.test_counter == 1, "Test executed only once in setup-teardown cycle"
        self.lib.list_versions()
        self.test_counter += 1

    def peakmem_list_versions(self, num_symbols):
        assert self.test_counter == 1, "Test executed only once in setup-teardown cycle"
        self.lib.list_versions()
        self.test_counter += 1

0 commit comments

Comments
 (0)