-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgenerate_data.py
33 lines (22 loc) · 1.27 KB
/
generate_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
import numpy as np
from tqdm import tqdm
import math
# Alphabet used for the generated pseudo-words: the ten digits followed by
# the four arithmetic operators.
list_chars = list('0123456789+-/*')
# Sampling probabilities for the characters above: a uniform distribution,
# i.e. one equal weight per entry of list_chars.
probs = [1 / len(list_chars)] * len(list_chars)
# Sanity check (disabled): math.isclose(sum(probs), 1.0)
NUM_TOKENS = 884647  # Total number of tokens in Shakespeare's plays - Don't run it now - use the dataset instead.
# Word-length distribution: entry k is the probability of a word of length k + 1.
# Source : https://math.wvu.edu/~hdiamond/Math222F17/Sigurd_et_al-2004-Studia_Linguistica.pdf
# Built once at module level instead of on every call to spit_garbage().
_WORD_LEN_PROBS = [
    0.03160, 0.16975, 0.21192, 0.15678, 0.10852, 0.08524, 0.07724,
    0.05623, 0.04032, 0.02766, 0.01582, 0.00917, 0.00482, 0.00262,
    0.00099, 0.00050, 0.00027, 0.00022, 0.00011, 0.00006, 0.00005,
    0.00002, 0.00001, 0.00001, 0.00001, 0.00001, 0.00005,
]
# Candidate word lengths 1..len(_WORD_LEN_PROBS), aligned with the table above.
_WORD_LENS = list(range(1, len(_WORD_LEN_PROBS) + 1))


def spit_garbage(len_expr=10, fp=None):
    """Generate one random pseudo-word and write it followed by a space.

    The word length is drawn from ``_WORD_LEN_PROBS`` and every character
    is drawn independently from ``list_chars`` with weights ``probs``.

    Args:
        len_expr: Unused; kept only so existing callers keep working.
        fp: Optional writable text file object. When given, the word is
            written to it and the caller is responsible for closing it.
            When ``None`` (the default), the word is appended to
            ``math.txt``, which is opened and closed within this call.

    Returns:
        The generated word, without the trailing space.
    """
    wl = int(np.random.choice(_WORD_LENS, p=_WORD_LEN_PROBS))
    # Draw all characters of the word in a single call rather than one
    # np.random.choice call per character.
    result = "".join(np.random.choice(list_chars, size=wl, p=probs))
    if fp is None:
        with open('math.txt', 'a', encoding='utf-8') as out_file:
            out_file.write(result + ' ')
    else:
        fp.write(result + ' ')
    return result


# Append NUM_TOKENS space-separated pseudo-words to math.txt. The file is
# opened once for the whole run instead of once per generated word.
with open('math.txt', 'a', encoding='utf-8') as out_file:
    for _ in tqdm(range(NUM_TOKENS)):
        spit_garbage(fp=out_file)