-from library.utils import arggroups, argparse_utils, web
+import random, sqlite3
+
+from library.utils import arggroups, argparse_utils, iterables, web
 from library.utils.log_utils import log
+from library.utils.objects import traverse_obj


 def parse_args():
@@ -16,6 +19,27 @@ def parse_args():
     return args


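+# shared HTTP helper, used by both the activity-stream and object fetchers below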
+def getty_fetch(url):
+    log.debug("Fetching %s...", url)
+
+    try:
+        r = web.session.get(url, timeout=120)
+    except Exception as e:
+        # web.session gives up after repeated 429 responses; re-raise so the crawl stops
+        if "too many 429 error" in str(e):
+            raise
+        log.exception("Could not get a valid response from the server")
+        return None
+    if r.status_code == 404:
+        log.warning("404 Not Found Error: %s", url)
+        return None
+    else:
+        r.raise_for_status()
+
+    # time.sleep(random.uniform(0.05, 0.6))  # ~300ms on average, for politeness
+
+    return r.json()
+
+
 def activity_stream_extract(args, json_data):
     assert json_data["type"] == "OrderedCollectionPage"

@@ -50,55 +74,159 @@ def activity_stream_extract(args, json_data):
     return data


-def activity_stream_fetch(url):
-    try:
-        r = web.session.get(url, timeout=120)
-    except Exception as e:
-        if "too many 429 error" in str(e):
-            raise
-        log.exception("Could not get a valid response from the server")
-        return None
-    if r.status_code == 404:
-        log.warning("404 Not Found Error: %s", url)
-        return
-    else:
-        r.raise_for_status()
-
-    # time.sleep(random.uniform(0.05, 0.6))  # 300ms is politeness
-
-    return r.json()
-
-
 def update_activity_stream(args):
     current_page = int(args.db.pop("select max(page) from activity_stream") or 0) + 1

     next_page_url = f"https://data.getty.edu/museum/collection/activity-stream/page/{current_page}"
     while next_page_url:
-        log.debug("Fetching %s...", next_page_url)
-
-        page_data = activity_stream_fetch(next_page_url)
+        page_data = getty_fetch(next_page_url)
         if page_data:
             current_page = int(page_data["id"].split("/")[-1])

             activities = activity_stream_extract(args, page_data)
             args.db["activity_stream"].insert_all(
-                [{"page": current_page, **activity} for activity in activities], alter=True, replace=True  # pk="id",
+                [{"page": current_page, **activity} for activity in activities], alter=True, replace=True, pk="id"
             )

             next_page_url = page_data.get("next", {}).get("id")
         else:
             break


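+# flatten one Linked Art (JSON-LD) HumanMadeObject record into a single media row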
+def objects_extract(args, j):
+    assert j["type"] == "HumanMadeObject"
+
+    known_keys = {
+        "@context",
+        "id",
+        "type",
+        "_label",
+        "classified_as",
+        "identified_by",
+        "referred_to_by",
+        "dimension",
+        "shows",
+        "produced_by",
+        "current_keeper",
+        "current_location",
+        "current_owner",
+        "subject_of",
+        "representation",
+        "subject_to",
+        "member_of",
+        "part_of",
+        "carries",
+        "changed_ownership_through",
+        "attributed_by",
+        "made_of",
+        "part",
+        "number_of_parts",
+    }
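+    # the records aren't a fixed schema; log unmapped keys instead of silently dropping them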
+    unhandled_keys = set(j.keys()) - known_keys
+    if unhandled_keys:
+        log.warning("Unhandled keys %s", {k: v for k, v in j.items() if k in unhandled_keys})
+
+    ignore_types = {"Object Record Structure: Whole"}
+
+    description = None
+    object_description = iterables.find_dict_value(
+        j["referred_to_by"], _label="Object Description", format="text/markdown"
+    )
+    if object_description:
+        description = object_description["content"]
+        # append any notes nested under subject_to -> subject_of
+        description += ";".join(s["content"] for st in object_description["subject_to"] for s in st["subject_of"])
+
+    author = None
+    if j["produced_by"].get("referred_to_by"):
+        author = iterables.find_dict_value(
+            j["produced_by"]["referred_to_by"], _label="Artist/Maker (Producer) Description"
+        ).get("content")
+
+    # TODO: the IIIF representation is deprecated, but using it avoids another HTTP call... calling their bluff
+    image_path = [
+        d["id"]
+        for d in (j.get("representation") or [])  # some objects don't have images...
+        if d["id"].startswith("https://media.getty.edu/iiif/image/")
+    ]
+    if j.get("representation"):
+        assert len(image_path) == 1
+        image_path = image_path[0]
+
+    media_path = [d["id"] for d in (j.get("shows") or []) if d["id"].startswith("https://data.getty.edu/media/image/")]
+    # assert len(media_path) == 1  # not guaranteed; join multiples instead
+    media_path = "|".join(media_path)
+
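+    # Linked Art timespans bound the production date; prefer the earliest bound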
+    timestamp_created = traverse_obj(j, ["produced_by", "timespan", "begin_of_the_begin"]) or traverse_obj(
+        j, ["produced_by", "timespan", "end_of_the_end"]
+    )
+
+    d = {
+        "path": image_path or None,
+        "name": j["_label"],
+        "types": "; ".join({c["_label"] for c in j["classified_as"]} - ignore_types),
+        "description": description,
+        "culture": iterables.find_dict_value(j["referred_to_by"], _label="Culture Statement").get("content"),
+        "dimensions": iterables.find_dict_value(j["referred_to_by"], _label="Dimensions Statement").get("content"),
+        "materials": iterables.find_dict_value(j["referred_to_by"], _label="Materials Description").get("content"),
+        "author": author,
+        "place_created": iterables.find_dict_value(j["referred_to_by"], _label="Place Created").get("content"),
+        "object_path": j["id"],
+        "media_path": media_path or None,
+        "timestamp_created": timestamp_created,
+        "license": j["referred_to_by"][-1]["id"],  # the rights statement seems to always be last
+    }
+
+    return [d]
+
+
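+# fetch full records for HumanMadeObjects that aren't in the media table yet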
+def update_objects(args):
+    try:
+        unknown_objects = [
+            d["path"]
+            for d in args.db.query(
+                """
+                SELECT path FROM activity_stream WHERE type = 'HumanMadeObject'
+                EXCEPT
+                SELECT object_path FROM media
+                """
+            )
+        ]
+    except sqlite3.OperationalError:  # the media table doesn't exist until the first insert
+        unknown_objects = [
+            d["path"] for d in args.db.query("SELECT path FROM activity_stream WHERE type = 'HumanMadeObject'")
+        ]
+
+    print("Fetching", len(unknown_objects), "unknown objects")
+
+    random.shuffle(unknown_objects)
+    for unknown_object in unknown_objects:
+        page_data = getty_fetch(unknown_object)  # getty_fetch already logs each URL
+        if page_data:
+            images = objects_extract(args, page_data)
+            args.db["media"].insert_all(images, alter=True, replace=True, pk="object_path")  # rows have no "id" column
+
+
 def getty_add():
     args = parse_args()

     update_activity_stream(args)

-
-    # https://data.getty.edu/museum/collection/group/ee294bfc-bbe5-42b4-95b2-04872b802bfe
-    # https://data.getty.edu/museum/collection/object/08eaed9f-1354-4817-8aed-1db49e893a03
-    # https://data.getty.edu/museum/collection/document/37194afd-905c-43df-9f28-baacdd91062a
-    # https://data.getty.edu/museum/collection/person/f4806477-b058-4852-88ae-852a99465249
-    # https://data.getty.edu/museum/collection/place/ed18d1db-1ed7-4d04-a46a-909c054dc762
-    # https://data.getty.edu/museum/collection/exhibition/6bd62de5-391f-45a9-95f0-bc88d4bcc2a8
218+ """
219+ ┌─────────────────────┬──────────┐
220+ │ type │ count(*) │ collection_type
221+ ├─────────────────────┼──────────┤
222+ │ PropositionalObject │ 10480 │ exhibition
223+ │ Activity │ 11376 │ activity
224+ │ Group │ 13383 │ group
225+ │ Place │ 24977 │ place
226+ │ Person │ 41438 │ person
227+ │ LinguisticObject │ 73273 │ document
228+ │ HumanMadeObject │ 319018 │ object # the one that is most interesting...
229+ └─────────────────────┴──────────┘
230+ """
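+    # each activity-stream id resolves at https://data.getty.edu/museum/collection/<collection_type>/<uuid>,
+    # e.g. .../object/<uuid> for a HumanMadeObject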
+
+    update_objects(args)