atlas_api.py
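"""Helpers for compiling user, review, game, and price data from the
Board Game Atlas API (https://api.boardgameatlas.com) into local JSON
files. The compile_* helpers write their results to disk rather than
returning data.
"""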
import glob
import json
import os
import string
import time
from datetime import datetime, timezone
from pathlib import Path

import dateutil.parser
import numpy as np
import pytz
import requests
from tqdm import tqdm

import utils

def query_atlas_api(url, params):
    """
    Queries the Board Game Atlas API and returns the parsed
    JSON response if available.

    Args:
        url (str): API url to query
        params (dict): dict of params corresponding
            to the api

    Returns:
        dict parsed from the JSON response (if no successful
        response is obtained, returns None)
    """
    back_off = [1, 2, 10, 50, 100, 200, 300, 500, 1000, 2000]
    for sleep_time in back_off:
        # sleep before every attempt to stay under the API rate limit
        print(f"sleeping: {sleep_time}")
        time.sleep(sleep_time)
        try:
            resp = requests.get(url, params=params)
            if resp.status_code == 200:
                return json.loads(resp.content)
        except requests.exceptions.ChunkedEncodingError as e:
            print(e)
            print(f"Invalid chunk encoding for {params}")
        except Exception as e:
            print(e)
            print(f"Unhandled error for {params}")
    print("200 code not achieved")
    return None
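
# Example usage (sketch): "YOUR_CLIENT_ID" and "SOME_GAME_ID" are placeholders.
# The endpoint and "ids" parameter mirror the search query used in
# compile_games() below.
# data = query_atlas_api(
#     "https://api.boardgameatlas.com/api/search",
#     {"ids": "SOME_GAME_ID", "client_id": "YOUR_CLIENT_ID"},
# )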

def compile_users(client_id, output_file):
    """
    Compiles user data queried from the Board Game Atlas
    API and saves all users to an output json file.

    Args:
        client_id (str): client id associated with the Board Game
            Atlas API
        output_file (str): output json filepath/name

    Returns:
        None. Saves data to a json file.
    """
    skip = 0
    url = "https://api.boardgameatlas.com/api/users"
    params = {"pretty": "true", "skip": str(skip), "client_id": client_id}
    data = []
    while True:
        temp_data = query_atlas_api(url, params)
        # stop if the query failed or no further users were returned
        if not temp_data or not temp_data["users"]:
            break
        data.extend(temp_data["users"])
        skip += 100
        params["skip"] = str(skip)
    # clean-up: drop records that lack a username
    new_data = [val for val in data if "username" in val]
    with open(output_file, "w") as f:
        json.dump(new_data, f)

def compile_reviews(client_id, user_file, output_dir):
    """
    Compiles review data queried from the Board Game Atlas
    API and saves reviews from each user in a json file with the
    json file name corresponding to the Board Game Atlas user id.

    Note: reviews are queried per rating value in 0.25 increments;
    if a user has more than 1,000 reviews at a single rating value,
    this query loop cannot retrieve the remainder.

    Args:
        client_id (str): client id associated with the Board Game
            Atlas API
        user_file (str): user json file populated from `compile_users()`
        output_dir (str): output directory

    Returns:
        None. Saves data to json files.
    """
    url = "https://api.boardgameatlas.com/api/reviews"
    params = {
        "limit": str(100),
        "pretty": "true",
        "client_id": client_id,
    }
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    # get list of users
    with open(user_file, "r") as f:
        user_data = json.load(f)
    # get reviews and save
    for user in tqdm(user_data):
        data = []
        user_name = user["username"]
        params["username"] = user_name
        for rating in np.arange(0, 5.25, 0.25):
            params["rating"] = str(rating)
            for num in range(0, 1000, 100):
                params["skip"] = str(num)
                temp_data = query_atlas_api(url, params)
                # stop paging if the query failed or returned no reviews
                if not temp_data or not temp_data["reviews"]:
                    break
                review_data = temp_data["reviews"]
                for review in review_data:
                    review["user"]["username"] = user_name
                data.extend(review_data)
                if num == 900:
                    print(
                        f"{user_name} may have additional reviews beyond allowable query limit"
                    )
        if data:
            user_id = data[0]["user"]["id"]
            with open(os.path.join(output_dir, f"{user_id}.json"), "w") as f:
                json.dump(data, f)

def clean_reviews(input_dir, reviews_dir, users_dir):
    """
    Cleans the review data extracted from `compile_reviews()` by
    removing unnecessary data. Also extracts additional user data.
    Review and user data are saved to the specified dirs using the
    same filenames.

    Args:
        input_dir (str): output dir from `compile_reviews()`
        reviews_dir (str): new dir to save cleaned reviews data to
        users_dir (str): new dir to save additional user data extracted
            from review data

    Returns:
        None. Saves data to json files.
    """
    Path(reviews_dir).mkdir(parents=True, exist_ok=True)
    Path(users_dir).mkdir(parents=True, exist_ok=True)
    reviews_data_list = glob.glob(os.path.join(input_dir, "*.json"))
    for reviews in reviews_data_list:
        with open(reviews, "r") as f:
            reviews_data = json.load(f)
        date_check = pytz.utc.localize(datetime(1900, 1, 1))
        user_data_save = None
        for review in reviews_data:
            # flatten the nested user and game records down to their ids
            user_data = review["user"]
            review["user_id"] = user_data["id"]
            del review["user"]
            review["game_id"] = review["game"]["id"]
            del review["game"]
            # keep user data from a review with a valid (post-1900) date
            if dateutil.parser.isoparse(review["date"]) > date_check:
                user_data_save = user_data
        new_reviews_path = os.path.join(reviews_dir, os.path.basename(reviews))
        with open(new_reviews_path, "w") as f:
            json.dump(reviews_data, f)
        # only write the user file if a valid user record was found
        if user_data_save:
            new_user_path = os.path.join(users_dir, os.path.basename(reviews))
            with open(new_user_path, "w") as f:
                json.dump(user_data_save, f)

def compile_games(client_id, reviews_dir, output_dir):
    """
    Compiles game data queried from the Board Game Atlas
    API and saves data in json files each containing 100 games.
    Requires the review data from the Board Game Atlas API to
    be compiled using `compile_reviews()` to be able to query the
    games.

    Args:
        client_id (str): client id associated with the Board Game
            Atlas API
        reviews_dir (str): output dir from `compile_reviews()` or
            `clean_reviews()`
        output_dir (str): output directory

    Returns:
        None. Saves data to json files.
    """
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    # get list of games
    reviews_data_list = glob.glob(os.path.join(reviews_dir, "*.json"))
    game_set = set()
    for reviews in reviews_data_list:
        with open(reviews, "r") as f:
            reviews_data = json.load(f)
        for review in reviews_data:
            # cleaned reviews store the flattened "game_id"; raw reviews nest it
            game_id = review.get("game_id", review.get("game", {}).get("id"))
            if game_id:
                game_set.add(game_id)
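    # utils.create_chunks is assumed to split the id list into sublists of 10,
    # keeping each request's "ids" parameter to 10 ids per query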
    chunked_game_ids = utils.create_chunks(list(game_set), 10)
    url = "https://api.boardgameatlas.com/api/search"
    params = {
        "pretty": "true",
        "client_id": client_id,
    }
    # extract game data and save
    game_counter = 0
    output_counter = 1
    data = []
    for chunk in tqdm(chunked_game_ids):
        params["ids"] = ",".join(str(x) for x in chunk)
        temp_data = query_atlas_api(url, params)
        if not temp_data or not temp_data["games"]:
            continue
        game_data = temp_data["games"]
        for game in game_data:
            # record when each game was extracted
            now = datetime.now(timezone.utc)
            game["datetime_extracted"] = now.strftime("%d/%m/%Y %H:%M:%S")
        data.extend(game_data)
        game_counter += 10
        if game_counter % 100 == 0:
            with open(
                os.path.join(output_dir, f"{str(output_counter).zfill(8)}.json"),
                "w",
            ) as f:
                json.dump(data, f)
            data = []
            output_counter += 1
    # save any remaining games from the final partial batch
    if data:
        with open(
            os.path.join(output_dir, f"{str(output_counter).zfill(8)}.json"),
            "w",
        ) as f:
            json.dump(data, f)

def compile_prices(client_id, reviews_dir, output_dir):
    """
    Compiles price data queried from the Board Game Atlas
    API and saves data in json files using the game id.
    Requires the review data from the Board Game Atlas API to
    be compiled using `compile_reviews()` to be able to query the
    games.

    Args:
        client_id (str): client id associated with the Board Game
            Atlas API
        reviews_dir (str): output dir from `compile_reviews()` or
            `clean_reviews()`
        output_dir (str): output directory

    Returns:
        None. Saves data to json files.
    """
    reviews_data_list = glob.glob(os.path.join(reviews_dir, "*.json"))
    game_set = set()
    # get list of games to check the price
    for reviews in reviews_data_list:
        with open(reviews, "r") as f:
            reviews_data = json.load(f)
        for review in reviews_data:
            # cleaned reviews store the flattened "game_id"; raw reviews nest it
            game_id = review.get("game_id", review.get("game", {}).get("id"))
            if game_id:
                game_set.add(str(game_id))
    url = "https://api.boardgameatlas.com/api/game/prices"
    params = {
        "pretty": "true",
        "client_id": client_id,
    }
    # check already parsed games
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    price_set = set()
    for item in Path(output_dir).iterdir():
        if item.is_dir():
            price_data_list = glob.glob(os.path.join(str(item), "*.json"))
            for price_file in price_data_list:
                price_set.add(os.path.basename(price_file).split(".")[0])
    # removes already parsed games from the set
    game_set = game_set.difference(price_set)
    # extract and save price data
    for game_id in tqdm(game_set):
        params["game_id"] = str(game_id)
        temp_data = query_atlas_api(url, params)
        if not temp_data or not temp_data["gameWithPrices"]:
            continue
        price_data = temp_data["gameWithPrices"]
        for key, val in price_data.items():
            key_dir = os.path.join(output_dir, key)
            os.makedirs(key_dir, exist_ok=True)
            if val:
                # tag each price record with its price category key; the file
                # is named after the game id so the resume check above works
                for price in val:
                    price["price_category"] = key
                with open(
                    os.path.join(key_dir, f"{game_id}.json"),
                    "w",
                ) as f:
                    json.dump(val, f)

def combine_files(input_dir, output_dir=None):
    """
    Combines json files in a directory using the first
    character of the file name, based on the set of ascii_letters
    and digits.

    Args:
        input_dir (str): directory with json files to be
            combined
        output_dir (str): output directory to save combined
            files, if None, saves the files to the input_dir
            (default=None)

    Returns:
        None. Saves data to json files.
    """
    ascii_list = list(string.ascii_letters + string.digits)
    if not output_dir:
        output_dir = input_dir
    for val in ascii_list:
        temp_files = glob.glob(os.path.join(input_dir, val + "*.json"))
        if not temp_files:
            continue
        data = []
        for file in temp_files:
            with open(file, "r") as f:
                temp_data = json.load(f)
            if not isinstance(temp_data, list):
                data.append(temp_data)
            else:
                data.extend(temp_data)
        outfile = os.path.join(
            output_dir, os.path.basename(input_dir) + "_" + val + ".json"
        )
        with open(outfile, "w") as f:
            json.dump(data, f)
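

# Minimal end-to-end sketch. Assumptions: "YOUR_CLIENT_ID" is a placeholder
# API key and every file/directory name below is illustrative, not part of
# the original module.
if __name__ == "__main__":
    client_id = "YOUR_CLIENT_ID"
    compile_users(client_id, "users.json")
    compile_reviews(client_id, "users.json", "raw_reviews")
    clean_reviews("raw_reviews", "reviews", "review_users")
    compile_games(client_id, "reviews", "games")
    compile_prices(client_id, "reviews", "prices")
    combine_files("games")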