@@ -363,8 +363,10 @@ class RandomStringPool:
363
363
with certain size of each string and limited number of strings in the pool
364
364
"""
365
365
366
- def __init__ (self , str_length : int , pool_size : int ):
367
- self .__pool = ListGenerators .generate_random_string_pool (str_length , pool_size )
366
def __init__(self, str_length: int, pool_size: int, include_unicode: bool = False, seed=3):
    """Create the fixed pool of random strings for this instance.

    Args:
        str_length: Length of every string in the pool.
        pool_size: Number of strings the pool holds.
        include_unicode: Forwarded to the pool generator; when true the
            generated strings may contain non-ASCII characters.
        seed: Seed forwarded to the pool generator.
    """
    pool_settings = dict(
        str_length=str_length,
        pool_size=pool_size,
        include_unicode=include_unicode,
        seed=seed,
    )
    self.__pool = ListGenerators.generate_random_string_pool(**pool_settings)
368
370
369
371
def get_list(self, size: int) -> List[str]:
    """Return ``size`` strings, each picked at random from the pool.

    Picks are independent, so the same pool entry may appear more than once.
    """
    picks: List[str] = []
    for _ in range(size):
        picks.append(random.choice(self.__pool))
    return picks
@@ -395,17 +397,23 @@ def generate_random_floats(cls, dtype: ArcticFloatType,
395
397
return np .round (np .random .uniform (min_value , max_value , size ), round_to ).astype (dtype )
396
398
397
399
@classmethod
398
- def generate_random_string_pool (cls , str_length : int , pool_size : int , seed = 1 ) -> List [str ]:
400
def generate_random_string_pool(cls, str_length: int, pool_size: int,
                                include_unicode: bool = False, seed=1) -> List[str]:
    """Generate ``pool_size`` distinct random strings of ``str_length`` characters.

    Both the NumPy and the stdlib random generators are re-seeded with
    ``seed`` first, so the same arguments give the same pool, in the same
    order, on every run.

    Args:
        str_length: Length of every generated string.
        pool_size: Number of distinct strings to return.
        include_unicode: Forwarded to ``random_string``.
        seed: Seed applied to ``np.random`` and ``random`` before generating.

    Returns:
        The distinct strings, in the order they were first generated.

    Raises:
        ValueError: If more than one distinct string is requested with a
            non-positive ``str_length`` (only the empty string exists).

    Note:
        The loop runs until enough distinct strings have been produced, so
        ``pool_size`` must not exceed the number of strings of that length
        the alphabet can form.
    """
    if str_length <= 0 and pool_size > 1:
        # Only "" can be produced; the collection loop would never terminate.
        raise ValueError(
            f"cannot build {pool_size} distinct strings of length {str_length}"
        )

    np.random.seed(seed)
    random.seed(seed)

    # A dict keeps first-insertion order. A set of str iterates in an order
    # that depends on per-process hash randomisation, which would make the
    # returned list differ between runs even with a fixed seed.
    pool = {}
    while len(pool) < pool_size:
        # seed=None: keep drawing from the stream seeded above rather than
        # re-seeding (and so repeating the same string) on every call.
        candidate = cls.random_string(length=str_length,
                                      include_unicode=include_unicode,
                                      seed=None)
        pool[candidate] = None
    return list(pool)
404
409
405
410
@classmethod
406
- def generate_random_strings (cls , str_size : int , length : int , seed = 1 ) -> List [str ]:
411
def generate_random_strings(cls, str_size: int, length: int,
                            include_unicode: bool = False, seed=1) -> List[str]:
    """Return ``length`` random strings, each ``str_size`` characters long.

    The NumPy and stdlib random generators are both re-seeded with ``seed``
    before any string is produced. Strings are generated independently, so
    duplicates are possible.
    """
    np.random.seed(seed)
    random.seed(seed)

    strings = []
    for _ in range(length):
        # seed=None so each string continues the stream seeded above.
        strings.append(
            ListGenerators.random_string(length=str_size,
                                         include_unicode=include_unicode,
                                         seed=None)
        )
    return strings
409
417
410
418
@classmethod
411
419
def generate_random_ints (cls , dtype : ArcticIntType ,
@@ -420,10 +428,16 @@ def generate_random_bools(cls, size: int, seed = 1) -> List[bool]:
420
428
return np .random .choice ([True , False ], size = size )
421
429
422
430
@classmethod
423
- def random_string (cls , length : int , seed = 1 ):
424
- np .random .seed (seed )
425
- return "" .join (random .choice (string .ascii_uppercase
426
- + string .digits + string .ascii_lowercase + ' ' ) for _ in range (length ))
431
def random_string(cls, length: int, include_unicode: bool = False, seed: int = 1):
    """Return a random string of ``length`` characters.

    The alphabet is the ASCII letters, digits and punctuation plus the space
    character, which is listed five times and is therefore drawn more often
    than any other single character. With ``include_unicode`` the 100 code
    points U+00A0..U+0103 are added to the alphabet.

    Args:
        length: Number of characters to generate; values <= 0 give ``""``.
        include_unicode: Also draw from the non-ASCII range described above.
        seed: Seed for the stdlib ``random`` generator, applied before
            drawing. Pass ``None`` to continue from the generator's current
            state. With the default, repeated calls return the same string.

    Returns:
        The generated string.
    """
    # Compare against None rather than testing truthiness: 0 is a valid seed
    # and must re-seed the generator like any other integer.
    if seed is not None:
        random.seed(seed)

    characters = string.ascii_letters + string.digits + string.punctuation + (" " * 5)
    if include_unicode:
        # 100 consecutive code points starting at U+00A0.
        characters += "".join(chr(0x00A0 + offset) for offset in range(100))

    # One random.choice call per character, so the draw sequence for a given
    # seed stays the same as before.
    return "".join(random.choice(characters) for _ in range(length))
427
441
428
442
@classmethod
429
443
def generate_random_list_with_mean (cls , number_elements , specified_mean , value_range = (0 , 100 ),
@@ -476,17 +490,24 @@ def add_float_col(self, name: str, dtype: ArcticFloatType = np.float64, min: flo
476
490
self .__types [name ] = dtype
477
491
return self
478
492
479
- def add_string_col (self , name : str , str_size : int , num_unique_values : int = None ) -> 'DFGenerator' :
493
def add_string_col(self, name: str, str_size: int, include_unicode: bool = False,
                   num_unique_values: int = None) -> 'DFGenerator':
    """Add a column of random strings, each ``str_size`` characters long.

    If ``num_unique_values`` is None, every value is generated independently.
    Otherwise the values are drawn at random from a pool holding
    ``num_unique_values`` distinct strings.

    Args:
        name: Column name; key under which the values and type are stored.
        str_size: Length of each generated string.
        include_unicode: Forwarded to the string generators.
        num_unique_values: Size of the string pool, or None for no pool.

    Returns:
        This generator, so calls can be chained.
    """
    # NOTE(review): `include_unicode` precedes `num_unique_values`, so a third
    # positional argument binds to `include_unicode`; pass both by keyword.
    if num_unique_values is None:
        values = ListGenerators.generate_random_strings(str_size=str_size,
                                                        length=self.__size,
                                                        include_unicode=include_unicode,
                                                        seed=self.__seed)
    else:
        values = RandomStringPool(str_length=str_size,
                                  pool_size=num_unique_values,
                                  include_unicode=include_unicode,
                                  seed=self.__seed
                                  ).get_list(self.__size)
    self.__data[name] = values
    self.__types[name] = str
    return self
@@ -550,7 +571,7 @@ def generate_random_dataframe(cls, rows: int, cols: int, indexed: bool = True, s
550
571
elif 'float' in str (dtype ):
551
572
gen .add_float_col (f"col_{ i } " , dtype )
552
573
elif 'str' in str (dtype ):
553
- gen .add_string_col (f"col_{ i } " , 10 )
574
+ gen .add_string_col (name = f"col_{ i } " , str_size = 10 )
554
575
else :
555
576
return f"Unsupported type { dtype } "
556
577
if indexed :
0 commit comments