Skip to content

Commit 7e0afe0

Browse files
committed
getty-add: add objects
1 parent 6fa109e commit 7e0afe0

File tree

6 files changed

+201
-44
lines changed

6 files changed

+201
-44
lines changed

library/createdb/getty_add.py

Lines changed: 159 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
1-
from library.utils import arggroups, argparse_utils, web
1+
import random, sqlite3
2+
3+
from library.utils import arggroups, argparse_utils, iterables, web
24
from library.utils.log_utils import log
5+
from library.utils.objects import traverse_obj
36

47

58
def parse_args():
@@ -16,6 +19,27 @@ def parse_args():
1619
return args
1720

1821

22+
def getty_fetch(url):
    """GET *url* and return the parsed JSON body.

    Returns None when the request fails or the server answers 404; any other
    HTTP error status raises via raise_for_status(). An exception containing
    "too many 429 error" is re-raised so rate-limit exhaustion aborts the crawl.
    """
    log.debug("Fetching %s...", url)

    try:
        response = web.session.get(url, timeout=120)
    except Exception as e:
        # Rate-limit exhaustion must propagate; everything else is best-effort.
        if "too many 429 error" in str(e):
            raise
        log.exception("Could not get a valid response from the server")
        return None

    if response.status_code == 404:
        log.warning("404 Not Found Error: %s", url)
        return None

    response.raise_for_status()

    # time.sleep(random.uniform(0.05, 0.6))  # 300ms is politeness

    return response.json()
41+
42+
1943
def activity_stream_extract(args, json_data):
2044
assert json_data["type"] == "OrderedCollectionPage"
2145

@@ -50,55 +74,159 @@ def activity_stream_extract(args, json_data):
5074
return data
5175

5276

53-
def activity_stream_fetch(url):
54-
try:
55-
r = web.session.get(url, timeout=120)
56-
except Exception as e:
57-
if "too many 429 error" in str(e):
58-
raise
59-
log.exception("Could not get a valid response from the server")
60-
return None
61-
if r.status_code == 404:
62-
log.warning("404 Not Found Error: %s", url)
63-
return
64-
else:
65-
r.raise_for_status()
66-
67-
# time.sleep(random.uniform(0.05, 0.6)) # 300ms is politeness
68-
69-
return r.json()
70-
71-
7277
def update_activity_stream(args):
    """Resume crawling the Getty activity stream and upsert each page's activities."""
    # Resume one page past the highest page already stored (0 on first run).
    start_page = int(args.db.pop("select max(page) from activity_stream") or 0) + 1

    next_page_url = f"https://data.getty.edu/museum/collection/activity-stream/page/{start_page}"
    while next_page_url:
        page_data = getty_fetch(next_page_url)
        if not page_data:
            break

        # The page's own id ends with its page number, e.g. .../page/42
        page_number = int(page_data["id"].split("/")[-1])

        activities = activity_stream_extract(args, page_data)
        rows = [{"page": page_number, **activity} for activity in activities]
        args.db["activity_stream"].insert_all(rows, alter=True, replace=True, pk="id")

        next_page_url = page_data.get("next", {}).get("id")
9194

9295

96+
def objects_extract(args, j):
    """Flatten a Getty HumanMadeObject JSON-LD record into one media row.

    Returns a single-element list of dicts, suitable for passing straight to
    insert_all(). Raises AssertionError when *j* is not a HumanMadeObject, or
    when an object that has representations has other than exactly one IIIF
    image URL.
    """
    assert j["type"] == "HumanMadeObject"

    known_keys = {
        "@context",
        "id",
        "type",
        "_label",
        "classified_as",
        "identified_by",
        "referred_to_by",
        "dimension",
        "shows",
        "produced_by",
        "current_keeper",
        "current_location",
        "current_owner",
        "subject_of",
        "representation",
        "subject_to",
        "member_of",
        "part_of",
        "carries",
        "changed_ownership_through",
        "attributed_by",
        "made_of",
        "part",
        "number_of_parts",
    }
    unhandled_keys = set(j.keys()) - known_keys
    if unhandled_keys:
        # Surface new/unknown fields so the schema above can be kept current.
        log.warning("Unhandled keys %s", {k: v for k, v in j.items() if k in unhandled_keys})

    ignore_types = {"Object Record Structure: Whole"}

    description = None
    object_description = iterables.find_dict_value(
        j["referred_to_by"], _label="Object Description", format="text/markdown"
    )
    if object_description:
        description = object_description["content"]
        # find_dict_value only matched on _label/format, so the nested lists
        # may be absent; guard with .get to avoid a KeyError on sparse records.
        description += ";".join(
            subject["content"]
            for agreement in object_description.get("subject_to") or []
            for subject in agreement.get("subject_of") or []
        )

    author = None
    if j["produced_by"].get("referred_to_by"):
        author = iterables.find_dict_value(
            j["produced_by"]["referred_to_by"], _label="Artist/Maker (Producer) Description"
        ).get("content")

    # TODO: deprecated but I don't want to make another HTTP call... calling their bluff
    image_path = [
        d["id"]
        for d in (j.get("representation") or [])  # but some objects don't have images...
        if d["id"].startswith("https://media.getty.edu/iiif/image/")
    ]
    if j.get("representation"):
        assert len(image_path) == 1
        image_path = image_path[0]

    media_path = [d["id"] for d in (j.get("shows") or []) if d["id"].startswith("https://data.getty.edu/media/image/")]
    # assert len(media_path) == 1
    media_path = "|".join(media_path)

    timestamp_created = traverse_obj(j, ["produced_by", "timespan", "begin_of_the_begin"]) or traverse_obj(
        j, ["produced_by", "timespan", "end_of_the_end"]
    )

    d = {
        "path": image_path or None,
        "name": j["_label"],
        "types": "; ".join(set(d["_label"] for d in j["classified_as"]) - ignore_types),
        "description": description,
        "culture": iterables.find_dict_value(j["referred_to_by"], _label="Culture Statement").get("content"),
        "dimensions": iterables.find_dict_value(j["referred_to_by"], _label="Dimensions Statement").get("content"),
        "materials": iterables.find_dict_value(j["referred_to_by"], _label="Materials Description").get("content"),
        "author": author,
        "place_created": iterables.find_dict_value(j["referred_to_by"], _label="Place Created").get("content"),
        "object_path": j["id"],
        "media_path": media_path or None,
        "timestamp_created": timestamp_created,
        # NOTE(review): assumes the rights statement is always the LAST
        # referred_to_by entry — fragile; confirm against the Getty API.
        "license": j["referred_to_by"][-1]["id"],
    }

    return [d]
182+
183+
184+
def update_objects(args):
    """Fetch each HumanMadeObject not yet in the media table and store it.

    On the first run the media table does not exist yet, so the EXCEPT query
    raises OperationalError and every HumanMadeObject in the activity stream
    is fetched instead.
    """
    try:
        unknown_objects = [
            d["path"]
            for d in args.db.query(
                """
            SELECT path FROM activity_stream WHERE type = 'HumanMadeObject'
            EXCEPT
            SELECT object_path FROM media
            """
            )
        ]
    except sqlite3.OperationalError:  # media table missing on first run
        unknown_objects = [
            d["path"] for d in args.db.query("SELECT path FROM activity_stream WHERE type = 'HumanMadeObject'")
        ]

    print("Fetching", len(unknown_objects), "unknown objects")

    # Shuffle so repeated partial runs spread coverage across the collection.
    random.shuffle(unknown_objects)
    for unknown_object in unknown_objects:
        # getty_fetch already logs "Fetching %s..." at debug level, so the
        # previous duplicate log.debug here was removed.
        page_data = getty_fetch(unknown_object)
        if page_data:
            images = objects_extract(args, page_data)
            # NOTE(review): rows from objects_extract have no "id" key; confirm
            # pk="id" is the intended primary key for the media table.
            args.db["media"].insert_all(images, alter=True, replace=True, pk="id")
211+
212+
93213
def getty_add():
    """Entry point: sync the Getty activity stream, then fetch object details."""
    args = parse_args()

    update_activity_stream(args)

    # Distribution of activity-stream entity types, for reference:
    #   type                   count(*)   collection_type
    #   PropositionalObject       10480   exhibition
    #   Activity                  11376   activity
    #   Group                     13383   group
    #   Place                     24977   place
    #   Person                    41438   person
    #   LinguisticObject          73273   document
    #   HumanMadeObject          319018   object  # the one that is most interesting...
    update_objects(args)

library/playback/torrents_info.py

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
from library import usage
66
from library.mediafiles import torrents_start
7-
from library.utils import arggroups, argparse_utils, consts, iterables, printing, strings
7+
from library.utils import arggroups, argparse_utils, consts, iterables, printing, processes, strings
88
from library.utils.path_utils import domain_from_url
99

1010

@@ -33,16 +33,18 @@ def parse_args():
3333
)
3434

3535
parser.add_argument(
36-
"--file-counts", "--files", "--counts", action="store_true", help="Print file counts (a bit slow)"
36+
"--file-counts", "--files", "--counts", action="store_true", help="Include file counts column (a bit slow)"
3737
)
38+
parser.add_argument("--trackers", action="store_true", help="Include tracker column")
39+
3840
parser.add_argument("--priority", action="store_true", help="Sort by priority")
3941
parser.add_argument("--ratio", action="store_true", help="Sort by ratio")
4042
parser.add_argument("--size", action="store_true", help="Sort by data transferred")
4143
parser.add_argument("--remaining", action="store_true", help="Sort by remaining")
4244

4345
parser.add_argument("--all", action="store_true", help="Show active and inactive torrents")
44-
parser.add_argument("--active", action=argparse.BooleanOptionalAction, help="Show active torrents")
45-
parser.add_argument("--inactive", "--dead", action=argparse.BooleanOptionalAction, help="Show inactive torrents")
46+
parser.add_argument("--active", action="store_true", help="Show active torrents")
47+
parser.add_argument("--inactive", "--dead", action="store_true", help="Show inactive torrents")
4648

4749
parser.add_argument(
4850
"--force-start", "--start", action=argparse.BooleanOptionalAction, help="Force start matching torrents"
@@ -131,13 +133,13 @@ def shorten(s, width):
131133
printing.table(tbl)
132134
print()
133135

136+
torrents = filter_torrents_by_activity(args, torrents)
137+
134138
if args.torrent_search or args.file_search:
135139
torrents = [t for t in torrents if strings.glob_match(args.torrent_search, [t.name, t.save_path, t.hash])]
136140

137141
if args.file_search:
138142
torrents = [t for t in torrents if strings.glob_match(args.file_search, [f.name for f in t.files])]
139-
else:
140-
torrents = filter_torrents_by_activity(args, torrents)
141143

142144
if args.priority:
143145
torrents = sorted(torrents, key=lambda t: t.priority)
@@ -173,6 +175,8 @@ def shorten(s, width):
173175
),
174176
)
175177

178+
if not torrents:
179+
processes.no_media_found()
176180
if args.torrent_search or args.file_search:
177181
print(len(torrents), "matching torrents")
178182

@@ -215,9 +219,13 @@ def gen_row(t):
215219
elif args.file_counts:
216220
d |= {"files": len(t.files)}
217221

222+
if args.priority:
223+
d |= {"priority": str(t.priority) + (" [F]" if t.force_start else "")}
224+
if args.trackers:
225+
d |= {"tracker": qbt_get_tracker(qbt_client, t)}
226+
218227
if args.verbose >= consts.LOG_INFO:
219228
d |= {
220-
"tracker": qbt_get_tracker(qbt_client, t),
221229
"seen_complete": (strings.relative_datetime(t.seen_complete) if t.seen_complete > 0 else None),
222230
"added_on": strings.relative_datetime(t.added_on),
223231
"last_activity": strings.relative_datetime(t.last_activity),
@@ -273,10 +281,14 @@ def gen_row(t):
273281
elif args.file_counts:
274282
d |= {"files": len(t.files)}
275283

284+
if args.priority:
285+
d |= {"priority": str(t.priority) + (" [F]" if t.force_start else "")}
286+
if args.trackers:
287+
d |= {"tracker": qbt_get_tracker(qbt_client, t)}
288+
276289
if args.verbose >= consts.LOG_INFO:
277290
d |= {
278291
"state": t.state,
279-
"tracker": qbt_get_tracker(qbt_client, t),
280292
"added_on": strings.relative_datetime(t.added_on),
281293
"size": strings.file_size(t.total_size),
282294
"remaining": strings.file_size(t.amount_left),

library/playback/torrents_status.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -85,17 +85,16 @@ def shorten(s, width):
8585
printing.table(tbl)
8686
print()
8787

88+
torrents = filter_torrents_by_activity(args, torrents)
89+
8890
if args.torrent_search or args.file_search:
8991
torrents = [t for t in torrents if strings.glob_match(args.torrent_search, [t.name, t.save_path, t.hash])]
9092

9193
if args.file_search:
9294
torrents = [t for t in torrents if strings.glob_match(args.file_search, [f.name for f in t.files])]
93-
else:
94-
torrents = filter_torrents_by_activity(args, torrents)
9595

9696
if not torrents:
9797
processes.no_media_found()
98-
9998
print(len(torrents), "torrents:")
10099
print()
101100

library/tablefiles/markdown_tables.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
from library import usage
44
from library.playback import media_printer
5-
from library.utils import arggroups, argparse_utils, file_utils, web
5+
from library.utils import arggroups, argparse_utils, file_utils, printing, web
66
from library.utils.log_utils import check_stdio
77

88

@@ -58,7 +58,7 @@ def file_markdown(args, path):
5858
else:
5959
print(f"## {path}:{df_name}")
6060
print()
61-
print(df.to_markdown(tablefmt="github", index=False))
61+
printing.table(df.to_dict(orient="records"))
6262
print()
6363

6464

library/utils/iterables.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,13 @@ def concat(*args):
9595
return (part for part in args if part)
9696

9797

98+
def find_dict_value(input_list: list[dict], **kwargs) -> dict:
    """Return the first dict in *input_list* whose entries match every kwarg.

    Matching uses .get, so a missing key only matches a kwarg value of None.
    Returns an empty dict when nothing matches.
    """
    matches = (
        candidate
        for candidate in input_list
        if all(candidate.get(key) == expected for key, expected in kwargs.items())
    )
    return next(matches, {})
103+
104+
98105
def find_none_keys(list_of_dicts, keep_0=True):
99106
none_keys = []
100107

library/utils/objects.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -256,3 +256,14 @@ def dict_filter_similar_key(input_dict, input_string, threshold=0.7):
256256
filtered_dict = {key: value for key, value in input_dict.items() if similar_keys[key] >= threshold}
257257

258258
return filtered_dict
259+
260+
261+
def traverse_obj(obj, path):
    """Safely walk nested dicts/lists along *path*; return None on any miss.

    Each step must be a key present in the current dict, or an in-range int
    index into the current list. Negative list indices now follow normal
    Python semantics (the previous guard only allowed -1, rejecting valid
    indices like -2).
    """
    for key in path:
        if isinstance(obj, dict) and key in obj:
            obj = obj[key]
        elif isinstance(obj, list) and isinstance(key, int) and -len(obj) <= key < len(obj):
            obj = obj[key]
        else:
            return None
    return obj

0 commit comments

Comments
 (0)