@@ -4,8 +4,8 @@
 import re
 import time
 import zipfile
-from urllib.parse import unquote, urljoin, urlparse
 from typing import Optional
+from urllib.parse import unquote, urljoin, urlparse
 
 import requests
 from bs4 import BeautifulSoup
@@ -27,17 +27,17 @@
 SAVE_METADATA_PATH = "metadata.csv"
 CSV_HEADERS = [
     "filename_prefix",
-    "url", # the original repository page (item_url)
-    "ref_title", # metadata title
-    "authors", # semicolon-joined list of author names
-    "ref_year", # numeric year or empty string
-    "ref_source", # UA Library handle (or equivalent)
-    "isbn_doi", # DOI or first API link href
-    "license", # license type string
-    "series", # e.g. DGM-209
-    "keywords", # semicolon-joined keyword names
+    "url",  # the original repository page (item_url)
+    "ref_title",  # metadata title
+    "authors",  # semicolon-joined list of author names
+    "ref_year",  # numeric year or empty string
+    "ref_source",  # UA Library handle (or equivalent)
+    "isbn_doi",  # DOI or first API link href
+    "license",  # license type string
+    "series",  # e.g. DGM-209
+    "keywords",  # semicolon-joined keyword names
     "language",  # language
-    "description", # abstract
+    "description",  # abstract
 ]
 
 # inserts header row in csv
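
Note: the header-writing helper this comment refers to sits outside the hunks shown. A minimal sketch of what it presumably does (the function name and body are assumptions, not confirmed by this commit):

    import csv
    import os

    def ensure_csv_header(path: str = SAVE_METADATA_PATH) -> None:
        # Write CSV_HEADERS exactly once, when the metadata file does not exist yet.
        if not os.path.exists(path):
            with open(path, "w", newline="") as f:
                csv.writer(f).writerow(CSV_HEADERS)
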
@@ -236,9 +236,10 @@ def filename_to_title_param(filename: str) -> str:
     return title_param
 
 
-
 def get_collection_id(title_param: str, filename: str) -> Optional[str]:
-    results = requests.get(f'https://data.azgs.arizona.edu/api/v1/metadata?collection_group=%21ADMM&title={title_param}')
+    results = requests.get(
+        f"https://data.azgs.arizona.edu/api/v1/metadata?collection_group=%21ADMM&title={title_param}"
+    )
     results.raise_for_status()
     results = results.json()
     for collection in results.get("data", []):
@@ -252,7 +253,8 @@ def get_collection_id(title_param: str, filename: str) -> Optional[str]:
 
 def get_collection_metadata(collection_id: str) -> dict:
     results = requests.get(
-        f'https://data.azgs.arizona.edu/api/v1/metadata/{collection_id}')
+        f"https://data.azgs.arizona.edu/api/v1/metadata/{collection_id}"
+    )
     payload = results.json()
 
     coll = payload.get("data", {})
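
Note: get_collection_id calls raise_for_status() before parsing, while this function goes straight to results.json(). A small follow-up (a suggestion, not part of this commit) would mirror that guard:

    results = requests.get(
        f"https://data.azgs.arizona.edu/api/v1/metadata/{collection_id}"
    )
    results.raise_for_status()  # fail fast on HTTP errors before decoding JSON
    payload = results.json()
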
@@ -261,16 +263,8 @@ def get_collection_metadata(collection_id: str) -> dict:
     top_links = coll.get("links", []) or []
     identifiers = meta.get("identifiers", {}) or {}
     license_info = meta.get("license", {}) or {}
-    authors = [
-        a.get("person")
-        for a in meta.get("authors", [])
-        if a.get("person")
-    ]
-    keywords = [
-        k.get("name")
-        for k in meta.get("keywords", [])
-        if k.get("name")
-    ]
+    authors = [a.get("person") for a in meta.get("authors", []) if a.get("person")]
+    keywords = [k.get("name") for k in meta.get("keywords", []) if k.get("name")]
     ref_source = meta_links[0].get("url") if meta_links else None
 
     isbn_doi = identifiers.get("doi")
@@ -292,13 +286,13 @@ def get_collection_metadata(collection_id: str) -> dict:
     # Remove the entire boilerplate paragraph starting with "This geodatabase is part of..."
     # This pattern matches from "This geodatabase" through "U.S. Government."
     description = re.sub(
-        r'\s*This geodatabase is part of a digital republication.*?U\.S\. Government\.',
-        '',
+        r"\s*This geodatabase is part of a digital republication.*?U\.S\. Government\.",
+        "",
         description,
-        flags=re.DOTALL | re.IGNORECASE
+        flags=re.DOTALL | re.IGNORECASE,
     )
-    description = re.sub(r'\n+', ' ', description)
-    description = re.sub(r'\s+', ' ', description).strip()
+    description = re.sub(r"\n+", " ", description)
+    description = re.sub(r"\s+", " ", description).strip()
 
     required_fields = {
         "authors": authors,
@@ -315,7 +309,6 @@ def get_collection_metadata(collection_id: str) -> dict:
     return required_fields
 
 
-
 def download_gdb_zips(item_url: str):
     """
     Download any .gdb.zip files on the page and record metadata in processed_item_urls.csv.
@@ -331,9 +324,9 @@ def download_gdb_zips(item_url: str):
 
     for file_url in gdb_links:
         parsed = urlparse(file_url)
-        filename = unquote(os.path.basename(parsed.path)) # e.g. 'WildcatHill.gdb.zip'
-        title_param = filename_to_title_param(filename) # e.g. 'Wildcat+Hill'
-        filename_prefix = strip_gdb_zip_suffixes(filename) # e.g. 'WildcatHill'
+        filename = unquote(os.path.basename(parsed.path))  # e.g. 'WildcatHill.gdb.zip'
+        title_param = filename_to_title_param(filename)  # e.g. 'Wildcat+Hill'
+        filename_prefix = strip_gdb_zip_suffixes(filename)  # e.g. 'WildcatHill'
         download_ok = False
         if filename in downloaded_filenames:
             print(f"Already scraped this file... skipping: {filename}")
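
Note: plausible shapes for the two helpers used above (their bodies are outside this diff; behavior is inferred from the e.g. comments, so treat this as a sketch, not the actual implementation):

    import re

    def strip_gdb_zip_suffixes(filename: str) -> str:
        # 'WildcatHill.gdb.zip' -> 'WildcatHill'
        return re.sub(r"\.gdb\.zip$", "", filename, flags=re.IGNORECASE)

    def filename_to_title_param(filename: str) -> str:
        # 'WildcatHill.gdb.zip' -> 'Wildcat+Hill': split CamelCase, join with '+'
        stem = strip_gdb_zip_suffixes(filename)
        return "+".join(re.findall(r"[A-Z][a-z0-9]*|[a-z0-9]+", stem))
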
@@ -343,7 +336,7 @@ def download_gdb_zips(item_url: str):
             downloaded_filenames.add(filename)
             download_ok = True
         else:
-            #trying downloading the gdb
+            # trying downloading the gdb
             out_path = os.path.join(OUTPUT_DIR, filename)
             print(f"Downloading: {filename}")
             try:
@@ -363,7 +356,9 @@ def download_gdb_zips(item_url: str):
             download_ok = False
 
         if not download_ok:
-            print(f"Skipping metadata for {filename_prefix} because download failed and file is not present.")
+            print(
+                f"Skipping metadata for {filename_prefix} because download failed and file is not present."
+            )
             continue
         else:
             # Get metadata via API
@@ -372,8 +367,12 @@ def download_gdb_zips(item_url: str):
 
             # Map and write filename + metadata to CSV
             if metadata:
-                authors_str = "; ".join(metadata["authors"]) if metadata["authors"] else ""
-                keywords_str = "; ".join(metadata["keywords"]) if metadata["keywords"] else ""
+                authors_str = (
+                    "; ".join(metadata["authors"]) if metadata["authors"] else ""
+                )
+                keywords_str = (
+                    "; ".join(metadata["keywords"]) if metadata["keywords"] else ""
+                )
 
                 row = [
                     filename_prefix,  # filename_prefix
@@ -390,7 +389,20 @@ def download_gdb_zips(item_url: str):
                     metadata["description"] or "",  # description
                 ]
             else:
-                row = [filename_prefix, item_url, "", "", "", "", "", "", "", "", "", ""]
+                row = [
+                    filename_prefix,
+                    item_url,
+                    "",
+                    "",
+                    "",
+                    "",
+                    "",
+                    "",
+                    "",
+                    "",
+                    "",
+                    "",
+                ]
 
             with open(SAVE_METADATA_PATH, "a", newline="") as f:
                 writer = csv.writer(f)
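
Note: no call site appears in these hunks; a hypothetical driver (the item URL below is illustrative only, not taken from this commit) might look like:

    if __name__ == "__main__":
        item_urls = [
            "https://repository.arizona.edu/handle/10150/00000",  # hypothetical item page
        ]
        for url in item_urls:
            download_gdb_zips(url)
            time.sleep(1)  # stay polite to the repository between items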