-from library.utils import arggroups, argparse_utils, web
+import random, sqlite3
+
+from library.utils import arggroups, argparse_utils, iterables, web
 from library.utils.log_utils import log
+from library.utils.objects import traverse_obj


 def parse_args():
@@ -16,6 +19,27 @@ def parse_args():
     return args


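+# shared HTTP helper, used by both the activity-stream and object fetchers below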
+def getty_fetch(url):
+    log.debug("Fetching %s...", url)
+
+    try:
+        r = web.session.get(url, timeout=120)
+    except Exception as e:
+        # web.session gives up after repeated 429 responses; re-raise so the crawl stops
+        if "too many 429 error" in str(e):
+            raise
+        log.exception("Could not get a valid response from the server")
+        return None
+    if r.status_code == 404:
+        log.warning("404 Not Found Error: %s", url)
+        return None
+    else:
+        r.raise_for_status()
+
+    # time.sleep(random.uniform(0.05, 0.6))  # ~300ms on average, for politeness
+
+    return r.json()
+
+
 def activity_stream_extract(args, json_data):
     assert json_data["type"] == "OrderedCollectionPage"

@@ -50,55 +74,159 @@ def activity_stream_extract(args, json_data):
     return data


-def activity_stream_fetch(url):
-    try:
-        r = web.session.get(url, timeout=120)
-    except Exception as e:
-        if "too many 429 error" in str(e):
-            raise
-        log.exception("Could not get a valid response from the server")
-        return None
-    if r.status_code == 404:
-        log.warning("404 Not Found Error: %s", url)
-        return
-    else:
-        r.raise_for_status()
-
-    # time.sleep(random.uniform(0.05, 0.6))  # 300ms is politeness
-
-    return r.json()
-
-
 def update_activity_stream(args):
     current_page = int(args.db.pop("select max(page) from activity_stream") or 0) + 1

     next_page_url = f"https://data.getty.edu/museum/collection/activity-stream/page/{current_page}"
     while next_page_url:
-        log.debug("Fetching %s...", next_page_url)
-
-        page_data = activity_stream_fetch(next_page_url)
+        page_data = getty_fetch(next_page_url)
         if page_data:
             current_page = int(page_data["id"].split("/")[-1])

             activities = activity_stream_extract(args, page_data)
             args.db["activity_stream"].insert_all(
-                [{"page": current_page, **activity} for activity in activities], alter=True, replace=True  # pk="id",
+                [{"page": current_page, **activity} for activity in activities], alter=True, replace=True, pk="id"
             )

             next_page_url = page_data.get("next", {}).get("id")
         else:
             break


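+# flatten one Linked Art (JSON-LD) HumanMadeObject record into a single media row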
+def objects_extract(args, j):
+    assert j["type"] == "HumanMadeObject"
+
+    known_keys = {
+        "@context",
+        "id",
+        "type",
+        "_label",
+        "classified_as",
+        "identified_by",
+        "referred_to_by",
+        "dimension",
+        "shows",
+        "produced_by",
+        "current_keeper",
+        "current_location",
+        "current_owner",
+        "subject_of",
+        "representation",
+        "subject_to",
+        "member_of",
+        "part_of",
+        "carries",
+        "changed_ownership_through",
+        "attributed_by",
+        "made_of",
+        "part",
+        "number_of_parts",
+    }
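+    # the records aren't a fixed schema; log unmapped keys instead of silently dropping them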
+    unhandled_keys = set(j.keys()) - known_keys
+    if unhandled_keys:
+        log.warning("Unhandled keys %s", {k: v for k, v in j.items() if k in unhandled_keys})
+
+    ignore_types = {"Object Record Structure: Whole"}
+
+    description = None
+    object_description = iterables.find_dict_value(
+        j["referred_to_by"], _label="Object Description", format="text/markdown"
+    )
+    if object_description:
+        description = object_description["content"]
+        # append any notes nested under subject_to -> subject_of
+        description += ";".join(s["content"] for st in object_description["subject_to"] for s in st["subject_of"])
+
+    author = None
+    if j["produced_by"].get("referred_to_by"):
+        author = iterables.find_dict_value(
+            j["produced_by"]["referred_to_by"], _label="Artist/Maker (Producer) Description"
+        ).get("content")
+
+    # TODO: the IIIF representation is deprecated, but using it avoids another HTTP call... calling their bluff
+    image_path = [
+        d["id"]
+        for d in (j.get("representation") or [])  # some objects don't have images...
+        if d["id"].startswith("https://media.getty.edu/iiif/image/")
+    ]
+    if j.get("representation"):
+        assert len(image_path) == 1
+        image_path = image_path[0]
+
+    media_path = [d["id"] for d in (j.get("shows") or []) if d["id"].startswith("https://data.getty.edu/media/image/")]
+    # assert len(media_path) == 1  # not guaranteed; join multiples instead
+    media_path = "|".join(media_path)
+
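+    # Linked Art timespans bound the production date; prefer the earliest bound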
+    timestamp_created = traverse_obj(j, ["produced_by", "timespan", "begin_of_the_begin"]) or traverse_obj(
+        j, ["produced_by", "timespan", "end_of_the_end"]
+    )
+
+    d = {
+        "path": image_path or None,
+        "name": j["_label"],
+        "types": "; ".join({c["_label"] for c in j["classified_as"]} - ignore_types),
+        "description": description,
+        "culture": iterables.find_dict_value(j["referred_to_by"], _label="Culture Statement").get("content"),
+        "dimensions": iterables.find_dict_value(j["referred_to_by"], _label="Dimensions Statement").get("content"),
+        "materials": iterables.find_dict_value(j["referred_to_by"], _label="Materials Description").get("content"),
+        "author": author,
+        "place_created": iterables.find_dict_value(j["referred_to_by"], _label="Place Created").get("content"),
+        "object_path": j["id"],
+        "media_path": media_path or None,
+        "timestamp_created": timestamp_created,
+        "license": j["referred_to_by"][-1]["id"],  # the rights statement seems to always be last
+    }
+
+    return [d]
+
+
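+# fetch full records for HumanMadeObjects that aren't in the media table yet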
+def update_objects(args):
+    try:
+        unknown_objects = [
+            d["path"]
+            for d in args.db.query(
+                """
+                SELECT path FROM activity_stream WHERE type = 'HumanMadeObject'
+                EXCEPT
+                SELECT object_path FROM media
+                """
+            )
+        ]
+    except sqlite3.OperationalError:  # the media table doesn't exist until the first insert
+        unknown_objects = [
+            d["path"] for d in args.db.query("SELECT path FROM activity_stream WHERE type = 'HumanMadeObject'")
+        ]
+
+    print("Fetching", len(unknown_objects), "unknown objects")
+
+    random.shuffle(unknown_objects)
+    for unknown_object in unknown_objects:
+        page_data = getty_fetch(unknown_object)  # getty_fetch already logs each URL
+        if page_data:
+            images = objects_extract(args, page_data)
+            args.db["media"].insert_all(images, alter=True, replace=True, pk="object_path")  # rows have no "id" column
+
+
 def getty_add():
     args = parse_args()

     update_activity_stream(args)

-
-    # https://data.getty.edu/museum/collection/group/ee294bfc-bbe5-42b4-95b2-04872b802bfe
-    # https://data.getty.edu/museum/collection/object/08eaed9f-1354-4817-8aed-1db49e893a03
-    # https://data.getty.edu/museum/collection/document/37194afd-905c-43df-9f28-baacdd91062a
-    # https://data.getty.edu/museum/collection/person/f4806477-b058-4852-88ae-852a99465249
-    # https://data.getty.edu/museum/collection/place/ed18d1db-1ed7-4d04-a46a-909c054dc762
-    # https://data.getty.edu/museum/collection/exhibition/6bd62de5-391f-45a9-95f0-bc88d4bcc2a8
218+ """
219+ ┌─────────────────────┬──────────┐
220+ │ type │ count(*) │ collection_type
221+ ├─────────────────────┼──────────┤
222+ │ PropositionalObject │ 10480 │ exhibition
223+ │ Activity │ 11376 │ activity
224+ │ Group │ 13383 │ group
225+ │ Place │ 24977 │ place
226+ │ Person │ 41438 │ person
227+ │ LinguisticObject │ 73273 │ document
228+ │ HumanMadeObject │ 319018 │ object # the one that is most interesting...
229+ └─────────────────────┴──────────┘
230+ """
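+    # each activity-stream id resolves at https://data.getty.edu/museum/collection/<collection_type>/<uuid>,
+    # e.g. .../object/<uuid> for a HumanMadeObject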
+
+    update_objects(args)