Skip to content

Commit 7dbba01

Browse files
Georgi Rusev
Georgi Rusev
authored and
Georgi Rusev
committed
added support for unicode strings
1 parent 5d2fa1d commit 7dbba01

File tree

2 files changed

+37
-16
lines changed

2 files changed

+37
-16
lines changed

python/arcticdb/util/utils.py

+35-14
Original file line numberDiff line numberDiff line change
@@ -363,8 +363,10 @@ class RandomStringPool:
363363
with certain size of each string and limited number of strings in the pool
364364
"""
365365

366-
def __init__(self, str_length: int, pool_size: int):
367-
self.__pool = ListGenerators.generate_random_string_pool(str_length, pool_size)
366+
def __init__(self, str_length: int, pool_size: int, include_unicode: bool = False, seed = 3):
367+
self.__pool = ListGenerators.generate_random_string_pool(str_length=str_length,
368+
pool_size=pool_size, include_unicode=include_unicode,
369+
seed=seed)
368370

369371
def get_list(self, size: int) -> List[str]:
370372
return [random.choice(self.__pool) for _ in range(size)]
@@ -395,17 +397,23 @@ def generate_random_floats(cls, dtype: ArcticFloatType,
395397
return np.round(np.random.uniform(min_value, max_value, size), round_to).astype(dtype)
396398

397399
@classmethod
398-
def generate_random_string_pool(cls, str_length: int, pool_size: int, seed = 1) -> List[str]:
400+
def generate_random_string_pool(cls, str_length: int, pool_size: int,
401+
include_unicode: bool = False, seed = 1) -> List[str]:
399402
np.random.seed(seed)
403+
random.seed(seed)
400404
unique_values = set()
401405
while len(unique_values) < pool_size:
402-
unique_values.add(ListGenerators.random_string(str_length))
406+
unique_values.add(ListGenerators.random_string(length=str_length, include_unicode=include_unicode,
407+
seed=None))
403408
return list(unique_values)
404409

405410
@classmethod
406-
def generate_random_strings(cls, str_size: int, length: int, seed = 1) -> List[str]:
411+
def generate_random_strings(cls, str_size: int, length: int,
412+
include_unicode: bool = False, seed = 1) -> List[str]:
407413
np.random.seed(seed)
408-
return [ListGenerators.random_string(str_size) for _ in range(length)]
414+
random.seed(seed)
415+
return [ListGenerators.random_string(length=str_size,
416+
include_unicode=include_unicode, seed=None) for _ in range(length)]
409417

410418
@classmethod
411419
def generate_random_ints(cls, dtype: ArcticIntType,
@@ -420,10 +428,16 @@ def generate_random_bools(cls, size: int, seed = 1) -> List[bool]:
420428
return np.random.choice([True, False], size=size)
421429

422430
@classmethod
423-
def random_string(cls, length: int, seed = 1):
424-
np.random.seed(seed)
425-
return "".join(random.choice(string.ascii_uppercase
426-
+ string.digits + string.ascii_lowercase + ' ') for _ in range(length))
431+
def random_string(cls, length: int, include_unicode: bool = False, seed: int = 1):
432+
if seed:
433+
random.seed(seed)
434+
unicode_symbol = "\u00A0" # start of latin extensions
435+
unicode_symbols = "".join([chr(ord(unicode_symbol) + i) for i in range(100)])
436+
characters = string.ascii_letters + string.digits + string.punctuation + (" " * 5)
437+
if include_unicode:
438+
characters = characters + unicode_symbols
439+
result = ''.join(random.choice(characters) for _ in range(length))
440+
return result
427441

428442
@classmethod
429443
def generate_random_list_with_mean(cls, number_elements, specified_mean, value_range=(0, 100),
@@ -476,17 +490,24 @@ def add_float_col(self, name: str, dtype: ArcticFloatType = np.float64, min: flo
476490
self.__types[name] = dtype
477491
return self
478492

479-
def add_string_col(self, name: str, str_size: int, num_unique_values: int = None) -> 'DFGenerator':
493+
def add_string_col(self, name: str, str_size: int, include_unicode: bool = False,
494+
num_unique_values: int = None) -> 'DFGenerator':
480495
"""
481496
Generates a list of strings with length 'str_size', and if 'num_unique_values' values is None
482497
the list will be of unique values if 'num_unique_values' is a number then this will be the length
483498
of the string pool of values
484499
"""
485500
list = []
486501
if num_unique_values is None:
487-
list = ListGenerators.generate_random_strings(str_size, self.__size)
502+
list = ListGenerators.generate_random_strings(str_size=str_size,
503+
length=self.__size,
504+
include_unicode=include_unicode,seed=self.__seed)
488505
else:
489-
list = RandomStringPool(str_size, num_unique_values).get_list(self.__size)
506+
list = RandomStringPool(str_length=str_size,
507+
pool_size=num_unique_values,
508+
include_unicode=include_unicode,
509+
seed=self.__seed
510+
).get_list(self.__size)
490511
self.__data[name] = list
491512
self.__types[name] = str
492513
return self
@@ -550,7 +571,7 @@ def generate_random_dataframe(cls, rows: int, cols: int, indexed: bool = True, s
550571
elif 'float' in str(dtype):
551572
gen.add_float_col(f"col_{i}", dtype)
552573
elif 'str' in str(dtype):
553-
gen.add_string_col(f"col_{i}", 10)
574+
gen.add_string_col(name=f"col_{i}", str_size=10)
554575
else:
555576
return f"Unsupported type {dtype}"
556577
if indexed:

python/benchmarks/real_read_write.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -60,8 +60,8 @@ def generate_dataframe(self, row_num:int, col_num: int) -> pd.DataFrame:
6060
.add_int_col("uint64", np.uint64, min=100, max=199)
6161
.add_float_col("float16",np.float32)
6262
.add_float_col("float2",min=-100.0, max=200.0, round_at=4)
63-
.add_string_col("string10", str_size=10)
64-
.add_string_col("string20", str_size=20, num_unique_values=20000)
63+
.add_string_col(name="string10", str_size=10)
64+
.add_string_col(name="string20", str_size=20, num_unique_values=20000)
6565
.add_bool_col("bool")
6666
.add_timestamp_index("time", ReadWriteBenchmarkSettings.INDEX_FREQ, ReadWriteBenchmarkSettings.START_DATE_INDEX)
6767
).generate_dataframe()

0 commit comments

Comments
 (0)