-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_sim_py.py
175 lines (162 loc) · 5.87 KB
/
data_sim_py.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
import datetime
import uuid
import json
import pprint
import random
import os
def simulate_tweets(
    user_id,
    user_name,
    user_screen_name,
    num_batches,
    num_deleted_tweets,
    batch_dir,
    min_num_new_non_deleted_tweets_per_batch=1,
    max_num_new_non_deleted_tweets_per_batch=3,
):
    '''
    Creates a number of batches of JSON data in a specified directory.

    Each batch is written to <batch_dir>/batch<k>.json (k = 1..num_batches) as
    a JSON array of tweet objects. The format of each tweet is as follows:
    {
        'created_at': <created_at>,
        'id': <tweet id>,
        'text': <tweet text>,
        'user': {
            'id': <user id>,
            'name': <user name>,
            'screen_name': <user screen name>
        }
    }

    Every non-deleted tweet generated is guaranteed to appear in every batch
    after the first in which it is found (ordered by batch number).

    Every deleted tweet first appears in some batch i < num_batches, is present
    (with the same id and created_at) in every batch from i up to but not
    including some later batch j <= num_batches, and is missing from batch j
    and every batch after it.

    The text of each tweet indicates either
        'not deleted X / <# non deleted tweets>'
    OR
        'deleted Y / <# deleted tweets>'
    This makes it easy to see whether our solution catches all the deleted
    tweets and doesn't have false positives for not deleted tweets.

    See the bottom of this file for the call used to generate data. You can
    tweak it to your liking.

    Params
        user_id: backend id of twitter user
        user_name: human readable name of twitter user (with spaces)
        user_screen_name: @ handle e.g. BillMaher
        num_batches: how many batches to create. Must be at least 2.
        num_deleted_tweets: how many tweets to include in some batch, and then
            exclude in some later batch, and all following batches
        batch_dir: where to put the batch files. Created if it does not exist.
        min_num_new_non_deleted_tweets_per_batch: lower bound for how many new
            tweets that are not deleted are introduced in a batch. this is used
            in a random num generator per batch
        max_num_new_non_deleted_tweets_per_batch: upper bound, similar to above

    Returns
        None. The batches are only written to disk.
    '''
    # this must be true because with less than 2 batches, a tweet could not
    # have been deleted
    assert num_batches >= 2, 'num_batches must be at least 2'

    # 0-based index of the batch in which each deleted tweet first appears.
    # The last batch is excluded so there is always a later batch to delete from.
    deleted_tweet_batch_appeared_in = sorted(
        random.randint(0, num_batches - 2)
        for _ in range(num_deleted_tweets)
    )
    # 0-based index of the first batch each deleted tweet is missing from
    # (random.randint bounds are inclusive, so this is always a later batch)
    deleted_tweet_batch_deleted_from = [
        random.randint(appeared_in + 1, num_batches - 1)
        for appeared_in in deleted_tweet_batch_appeared_in
    ]

    # determine how many non-deleted tweets to add in batch i
    # (we add deleted tweets later)
    num_new_non_deleted_per_batch = [
        random.randint(
            min_num_new_non_deleted_tweets_per_batch,
            max_num_new_non_deleted_tweets_per_batch
        )
        for _ in range(num_batches)
    ]
    num_non_deleted_tweets = sum(num_new_non_deleted_per_batch)

    # create batches with non-deleted tweets; each batch starts as a copy of
    # the previous one so earlier tweets carry forward unchanged
    batches = []
    non_deleted_tweet_seq_num = 1
    for num_new in num_new_non_deleted_per_batch:
        batch = list(batches[-1]) if batches else []
        for seq in range(
            non_deleted_tweet_seq_num, non_deleted_tweet_seq_num + num_new
        ):
            batch.append(
                create_tweet_dict(
                    str(datetime.datetime.now()),
                    str(uuid.uuid4()),
                    user_id,
                    user_name,
                    user_screen_name,
                    False,
                    seq,
                    num_non_deleted_tweets
                )
            )
        non_deleted_tweet_seq_num += num_new
        batches.append(batch)

    # insert each deleted tweet into the batches [appeared_in, deleted_from)
    deletion_windows = zip(
        deleted_tweet_batch_appeared_in, deleted_tweet_batch_deleted_from
    )
    for deleted_tweet_seq_num, (appeared_in, deleted_from) in enumerate(
        deletion_windows, start=1
    ):
        # Build the tweet once so it keeps the same id and created_at in every
        # batch it appears in; regenerating it per batch would make it look
        # like a different tweet in each file.
        deleted_tweet = create_tweet_dict(
            str(datetime.datetime.now()),
            str(uuid.uuid4()),
            user_id,
            user_name,
            user_screen_name,
            True,
            deleted_tweet_seq_num,
            num_deleted_tweets
        )
        for batch in batches[appeared_in:deleted_from]:
            # random position so deleted tweets are mixed in with the others
            batch.insert(random.randint(0, len(batch)), deleted_tweet)

    # make sure the output dir exists; an empty batch_dir means the current
    # directory, which needs no creation
    if batch_dir:
        os.makedirs(batch_dir, exist_ok=True)

    # write to output dir, one file per batch (file names are 1-based)
    for batch_num, batch in enumerate(batches, start=1):
        batch_path = os.path.join(batch_dir, f'batch{batch_num}.json')
        with open(batch_path, 'w') as batch_f:
            # turn list of dicts into JSON array of objects
            batch_f.write(create_tweet_json(batch))
def create_tweet_dict(
    created_at,
    tweet_id,
    user_id,
    user_name,
    user_screen_name,
    is_deleted,
    seq_num,
    seq_len
):
    '''
    Builds the dict representation of a single simulated tweet.

    The tweet text is '<label> <seq_num> / <seq_len>', where <label> is
    'deleted' when is_deleted is truthy and 'not deleted' otherwise.

    Returns a dict with the keys 'created_at', 'id', 'text' and 'user'; 'user'
    is itself a dict with the keys 'id', 'name' and 'screen_name'.
    '''
    if is_deleted:
        label = 'deleted'
    else:
        label = 'not deleted'

    author = {
        'id': user_id,
        'name': user_name,
        'screen_name': user_screen_name
    }
    return {
        'created_at': created_at,
        'id': tweet_id,
        'text': '{} {} / {}'.format(label, seq_num, seq_len),
        'user': author
    }
def create_tweet_json(tweet_dict):
    '''
    Serializes tweet data to a JSON string indented by 4 spaces.

    Whatever is passed in is handed straight to json.dumps, so a list of
    tweet dicts comes out as a JSON array of objects.
    '''
    pretty_printed = json.dumps(tweet_dict, indent=4)
    return pretty_printed
# Generate the sample data set: 4 batches for one made-up user, with 3 tweets
# that get deleted along the way. This is a top-level statement, so it runs
# whenever the module is executed or imported.
simulate_tweets(
    str(uuid.uuid4()),  # user_id: arbitrary string from the system library
    'Jim Bob',          # user_name
    'JimBob',           # user_screen_name
    num_batches=4,
    num_deleted_tweets=3,
    batch_dir='./batches',
)