1
+ """
2
+ Copyright 2024 Man Group Operations Limited
3
+
4
+ Use of this software is governed by the Business Source License 1.1 included in the file licenses/BSL.txt.
5
+
6
+ As of the Change Date specified in that file, in accordance with the Business Source License, use of this software will be governed by the Apache License, version 2.0.
7
+ """
8
+
9
+ import time
10
+ import numpy as np
11
+ import pandas as pd
12
+ from arcticdb import Arctic
13
+ from arcticdb .options import LibraryOptions
14
+ from arcticdb .storage_fixtures .azure import AzuriteStorageFixtureFactory
15
+ from arcticdb .util .utils import DFGenerator , ListGenerators , TimestampNumber
16
+ from arcticdb .version_store .library import Library
17
+ from .common import *
18
+
19
class AzureReadWrite(AsvBase):
    """
    General read/write benchmarks against Azure Blob Storage.

    Every setup-teardown cycle writes one symbol consisting of a generated
    frame of ``num_rows`` rows plus ten string columns (``string_0`` ..
    ``string_9``); the ``time_*`` / ``peakmem_*`` methods then read or
    rewrite that symbol.
    """

    rounds = 1
    number = 3  # benchmark invocations between each setup-teardown pair
    repeat = 1  # number of setup-teardown cycles per measurement
    min_run_count = 1
    warmup_time = 0

    timeout = 1200

    library_manager = TestLibraryManager(storage=Storage.AZURE, name_benchmark="READ_WRITE")
    library_type = LibraryType.PERSISTENT

    param_names = ["num_rows"]
    params = [10_000_000]  # 10M rows

    number_columns = 100  # 100 columns

    # State shared by setup() and the benchmarks. These live at class level
    # because asv only forwards the *return value* of setup_cache(); attributes
    # assigned to ``self`` inside setup_cache() are not available on the
    # instance that later runs setup() and the benchmark methods.
    symbol = "test_symbol"
    to_write_df = None  # replaced with the generated DataFrame in setup()
    last_20 = None  # replaced with the last-20%-of-rows date range in setup()

    def get_library_manager(self) -> TestLibraryManager:
        """Return the library manager shared by all benchmarks of this class."""
        return AzureReadWrite.library_manager

    def get_population_policy(self) -> LibraryPopulationPolicy:
        """Build a population policy configured with this class' row counts and column count."""
        lpp = LibraryPopulationPolicy(self.get_logger())
        lpp.set_parameters(AzureReadWrite.params, [AzureReadWrite.number_columns])
        return lpp

    def setup_cache(self):
        """asv ``setup_cache`` hook: runs the shared library setup once for the class."""
        self.setup_library()

    def setup(self, num_rows):
        """
        Generate the benchmark frame and write it to the library.

        :param num_rows: number of rows of the generated frame (asv parameter)
        """
        self.setup_library()
        self.lib = self.get_library_manager().get_library(AzureReadWrite.library_type, 1)

        # Generate test data with mixed types including strings
        df_generator = DFGenerator(num_rows, [AzureReadWrite.number_columns])
        self.to_write_df = df_generator.generate_dataframe()

        # Add 10 string columns. The values are identical for every column, so
        # the (potentially very large) list is built once instead of once per
        # column. Column names carry no trailing whitespace so that they match
        # the names requested by the ``*_read_with_columns_all_types`` benchmarks.
        string_values = [f"string_value_{i} " for i in range(num_rows)]
        for column_index in range(10):
            self.to_write_df[f"string_{column_index}"] = string_values

        # Write the data
        self.lib.write(self.symbol, self.to_write_df)

        # Calculate date range for last 20% of rows
        self.last_20 = self.get_last_x_percent_date_range(num_rows, 20)

    def time_read(self, num_rows):
        """Time a read of the whole symbol."""
        self.lib.read(self.symbol)

    def peakmem_read(self, num_rows):
        """Peak memory of a read of the whole symbol."""
        self.lib.read(self.symbol)

    def time_write(self, num_rows):
        """Time a write of the full frame to the symbol."""
        self.lib.write(self.symbol, self.to_write_df)

    def peakmem_write(self, num_rows):
        """Peak memory of a write of the full frame to the symbol."""
        self.lib.write(self.symbol, self.to_write_df)

    def time_read_with_column_float(self, num_rows):
        """Time a read restricted to the single column ``float2``."""
        COLS = ["float2"]
        self.lib.read(symbol=self.symbol, columns=COLS).data

    def peakmem_read_with_column_float(self, num_rows):
        """Peak memory of a read restricted to the single column ``float2``."""
        COLS = ["float2"]
        self.lib.read(symbol=self.symbol, columns=COLS).data

    def time_read_with_columns_all_types(self, num_rows):
        """Time a read restricted to one column of each listed type."""
        COLS = ["float2", "string_0", "bool", "int64", "uint64"]
        self.lib.read(symbol=self.symbol, columns=COLS).data

    def peakmem_read_with_columns_all_types(self, num_rows):
        """Peak memory of a read restricted to one column of each listed type."""
        COLS = ["float2", "string_0", "bool", "int64", "uint64"]
        self.lib.read(symbol=self.symbol, columns=COLS).data

    def time_write_staged(self, num_rows):
        """Time a staged write followed by compaction of the incomplete segments."""
        lib = self.lib
        lib.write(self.symbol, self.to_write_df, staged=True)
        # NOTE(review): the meaning of the two positional ``False`` flags is not
        # visible here - confirm against compact_incomplete's signature.
        lib._nvs.compact_incomplete(self.symbol, False, False)

    def peakmem_write_staged(self, num_rows):
        """Peak memory of a staged write followed by compaction of the incomplete segments."""
        lib = self.lib
        lib.write(self.symbol, self.to_write_df, staged=True)
        lib._nvs.compact_incomplete(self.symbol, False, False)

    def time_read_with_date_ranges_last20_percent_rows(self, num_rows):
        """Time a read limited to the date range computed in ``setup`` (last 20% of rows)."""
        self.lib.read(symbol=self.symbol, date_range=self.last_20).data

    def peakmem_read_with_date_ranges_last20_percent_rows(self, num_rows):
        """Peak memory of a read limited to the date range computed in ``setup``."""
        self.lib.read(symbol=self.symbol, date_range=self.last_20).data

    def get_last_x_percent_date_range(self, num_rows, percents):
        """
        Return the date range covering the last ``percents`` percent of ``num_rows`` rows.

        The range is derived from the population policy's dataframe generator
        (its ``initial_timestamp`` and ``freq``).

        :param num_rows: total number of rows in the frame
        :param percents: size of the trailing window, in percent
        """
        # This class never assigns ``population_policy`` itself; use it when a
        # base class or caller has provided one, otherwise build a fresh policy.
        policy = getattr(self, "population_policy", None)
        if policy is None:
            policy = self.get_population_policy()
        df_generator = policy.df_generator
        return DataRangeUtils.get_last_x_percent_date_range(
            initial_timestamp=df_generator.initial_timestamp,
            freq=df_generator.freq,
            num_rows=num_rows,
            percents=percents,
        )
125
+
126
+
127
class AzureListVersions(AsvBase):
    """
    Benchmarks ``list_versions`` performance on Azure Blob Storage.

    The benchmarks assert that they run exactly once per setup-teardown
    cycle, so the class is configured with one invocation per cycle and
    several cycles instead of several invocations in one cycle.
    """

    rounds = 1
    # One invocation per setup-teardown cycle: the benchmarks assert
    # ``test_counter == 1`` and would fail on a second call within a cycle.
    number = 1
    repeat = 3  # three setup-teardown cycles, i.e. three samples
    min_run_count = 1
    warmup_time = 0

    timeout = 1200

    library_manager = TestLibraryManager(storage=Storage.AZURE, name_benchmark="LIST_VERSIONS")
    library_type = LibraryType.PERSISTENT

    param_names = ["num_symbols"]
    params = [10_000]  # 10k symbols

    def get_library_manager(self) -> TestLibraryManager:
        """Return the library manager shared by all benchmarks of this class."""
        return AzureListVersions.library_manager

    def get_population_policy(self) -> LibraryPopulationPolicy:
        """Build a population policy with one 1000-row, 10-column entry per symbol."""
        lpp = LibraryPopulationPolicy(self.get_logger())
        lpp.set_parameters([1000] * AzureListVersions.params[0], [10])  # 1000 rows per symbol, 10 columns
        return lpp

    def setup_cache(self):
        """asv ``setup_cache`` hook: runs the shared library setup once for the class."""
        self.setup_library()

    def setup(self, num_symbols):
        """
        Make sure the populated library exists, then prepare a single measurement.

        :param num_symbols: number of symbols the library should contain (asv parameter)
        """
        self.setup_library()
        manager = self.get_library_manager()

        # Generate and write test data
        start = time.time()
        policy = self.get_population_policy()
        policy.set_parameters([1000] * num_symbols, [10])
        # NOTE(review): existence is checked *before* requesting the library
        # handle, in case get_library() creates the library on first access
        # (which would make the population branch unreachable) - confirm
        # against TestLibraryManager.
        if manager.has_library(AzureListVersions.library_type, num_symbols):
            self.get_logger().info("Library already exists, population skipped")
        else:
            populate_library(manager, policy, AzureListVersions.library_type, num_symbols)
            self.get_logger().info(
                f"Generated {num_symbols} symbols with 1000 rows each in {time.time() - start:.2f} s"
            )

        self.lib = manager.get_library(AzureListVersions.library_type, num_symbols)

        # Clear cache to ensure we're testing actual storage performance
        self.lib._nvs.version_store._clear_symbol_list_keys()

        # Reset here rather than in setup_cache(): asv only forwards the return
        # value of setup_cache(), not attributes assigned to ``self`` inside it.
        self.test_counter = 1

    def time_list_versions(self, num_symbols):
        """Time a single ``list_versions`` call on the populated library."""
        assert self.test_counter == 1, "Test executed only once in setup-teardown cycle"
        self.lib.list_versions()
        self.test_counter += 1

    def peakmem_list_versions(self, num_symbols):
        """Peak memory of a single ``list_versions`` call on the populated library."""
        assert self.test_counter == 1, "Test executed only once in setup-teardown cycle"
        self.lib.list_versions()
        self.test_counter += 1
0 commit comments