Skip to content

Commit 624f32f

Browse files
committed
add getty-add
1 parent 2222043 commit 624f32f

File tree

2 files changed

+104
-0
lines changed

2 files changed

+104
-0
lines changed

library/__main__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
"tables_add": "Add table-like data to SQLite",
2020
"reddit_add": "Create a reddit database; Add subreddits",
2121
"hn_add": "Create / Update a Hacker News database",
22+
"getty_add": "Create / Update a Getty Museum database",
2223
"substack": "Backup substack articles",
2324
"tildes": "Backup tildes comments and topics",
2425
"nicotine_import": "Import paths from nicotine+",
@@ -167,6 +168,7 @@ def print_help(parser) -> None:
167168
"library.createdb.gallery_add.gallery_add": ["gdl-add", "ga"],
168169
"library.createdb.gallery_add.gallery_update": ["gdl-update", "gu"],
169170
"library.createdb.hn_add.hacker_news_add": ["hn-add"],
171+
"library.createdb.getty_add.getty_add": [],
170172
"library.createdb.links_add.links_add": ["links-db"],
171173
"library.createdb.links_add.links_update": [],
172174
"library.createdb.places_import.places_import": [],

library/createdb/getty_add.py

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
from library.utils import arggroups, argparse_utils, web
2+
from library.utils.log_utils import log
3+
4+
5+
def parse_args():
    """Parse the command line for ``getty-add`` and prepare the HTTP session.

    Registers the requests, debug and database argument groups (in that
    order), parses the arguments, runs ``arggroups.args_post`` with
    ``create_db=True`` and then initialises the shared requests session.

    Returns:
        The parsed argument namespace.
    """
    cli = argparse_utils.ArgumentParser()
    for register_group in (arggroups.requests, arggroups.debug, arggroups.database):
        register_group(cli)

    args = cli.parse_args()
    arggroups.args_post(args, cli, create_db=True)

    # prepare requests session
    web.requests_session(args)

    return args
17+
18+
19+
def activity_stream_extract(args, json_data):
    """Turn one activity-stream page into rows for the ``activity_stream`` table.

    Each entry of ``json_data['orderedItems']`` is handled by activity type:

    * ``Create`` -- the activity's ``object`` becomes a row; its ``id`` is
      stored under ``path`` and every other object key is copied as-is.
    * ``Update`` -- skipped for now.
    * ``Delete`` -- rows whose ``path`` equals the object's ``id`` are removed
      from the database (inside a transaction) and from the rows collected so
      far on this page; nothing is emitted for the activity itself.

    The input page is not modified.

    Args:
        args: Parsed arguments; only ``args.db`` is used, and only when a
            ``Delete`` activity is encountered.
        json_data: Decoded JSON of a single ``OrderedCollectionPage``.

    Returns:
        A list of dicts, one per ``Create`` activity that was not deleted
        later on the same page.

    Raises:
        ValueError: If the page is not an ``OrderedCollectionPage``, has no
            ``orderedItems`` key, or contains an unsupported activity type.
        KeyError: If an activity lacks one of the expected keys
            (``id``, ``created``, ``endTime``, ``object``, ``type``).
    """
    page_type = json_data.get('type')
    if page_type != 'OrderedCollectionPage':
        raise ValueError(f"Expected an OrderedCollectionPage but got {page_type!r}")
    if 'orderedItems' not in json_data:
        raise ValueError("Activity stream page has no 'orderedItems' key")

    data = []
    for activity in json_data['orderedItems']:
        # Work on a shallow copy so the caller's page data is left untouched.
        item = dict(activity)
        for k in ['id', 'created', 'endTime']:
            item.pop(k)
        obj = item.pop('object')

        type_ = item.pop('type')
        if type_ == 'Delete':
            deleted_path = obj.get('id')
            with args.db.conn:
                args.db["activity_stream"].delete_where("path = ?", [deleted_path])
            # Assumes activities are listed oldest first, so a Create queued
            # earlier on this page must not be re-inserted after its Delete.
            data = [row for row in data if row['path'] != deleted_path]
            continue
        if type_ == 'Update':
            continue  # TODO: implement in-band Update mechanism
        if type_ != 'Create':
            raise ValueError(f"Unsupported activity type: {type_!r}")

        data.append(
            {
                'path': obj.get('id'),
                'type': obj.get('type'),
                **{k: v for k, v in obj.items() if k not in ['id', 'type']},
            }
        )
        if item:
            # Anything left over is an activity field this extractor does not know about.
            log.warning("Unhandled activity fields: %s", item)

    return data
51+
52+
53+
def activity_stream_fetch(url):
    """Download one activity-stream page and return its decoded JSON.

    Args:
        url: Address of the page to request.

    Returns:
        The parsed JSON body, or ``None`` when the request itself failed or
        the server answered 404.

    Raises:
        Exception: Re-raised unchanged when the request error message contains
            "too many 429 error"; any other non-404 error status is raised by
            ``raise_for_status()``.
    """
    try:
        response = web.session.get(url, timeout=120)
    except Exception as exc:
        rate_limited = "too many 429 error" in str(exc)
        if rate_limited:
            raise
        log.exception("Could not get a valid response from the server")
        return None

    if response.status_code == 404:
        log.warning("404 Not Found Error: %s", url)
        return None

    response.raise_for_status()

    # time.sleep(random.uniform(0.05, 0.6))  # 300ms is politeness

    return response.json()
70+
71+
def update_activity_stream(args):
    """Fetch activity-stream pages and store their activities in the database.

    Starts one page after the highest ``page`` value already present in the
    ``activity_stream`` table (page 1 when the query yields nothing), then
    follows each page's ``next`` link until there is none or a fetch returns
    no data.

    Args:
        args: Parsed arguments; ``args.db`` is the database written to.
    """
    last_stored_page = args.db.pop('select max(page) from activity_stream')
    current_page = int(last_stored_page or 0) + 1

    next_page_url = f"https://data.getty.edu/museum/collection/activity-stream/page/{current_page}"
    while next_page_url:
        log.debug("Fetching %s...", next_page_url)

        page_data = activity_stream_fetch(next_page_url)
        if not page_data:
            break

        # The page number is the last path segment of the page's own id.
        current_page = int(page_data['id'].split('/')[-1])

        rows = [
            {"page": current_page, **activity}
            for activity in activity_stream_extract(args, page_data)
        ]
        args.db["activity_stream"].insert_all(rows, alter=True, replace=True)  # pk="id",

        next_link = page_data.get('next', {})
        next_page_url = next_link.get('id')
90+
91+
92+
def getty_add():
    """Entry point: parse the command line, then sync the activity stream."""
    update_activity_stream(parse_args())

    # Sample collection URLs, one per record kind (not used by the code yet):
    # https://data.getty.edu/museum/collection/group/ee294bfc-bbe5-42b4-95b2-04872b802bfe
    # https://data.getty.edu/museum/collection/object/08eaed9f-1354-4817-8aed-1db49e893a03
    # https://data.getty.edu/museum/collection/document/37194afd-905c-43df-9f28-baacdd91062a
    # https://data.getty.edu/museum/collection/person/f4806477-b058-4852-88ae-852a99465249
    # https://data.getty.edu/museum/collection/place/ed18d1db-1ed7-4d04-a46a-909c054dc762
    # https://data.getty.edu/museum/collection/exhibition/6bd62de5-391f-45a9-95f0-bc88d4bcc2a8

0 commit comments

Comments
 (0)