-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathutils.py
112 lines (86 loc) · 3.7 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import json
import urllib.error
import urllib.request

import requests

from Preprint import Preprint
# Helper function to download a file from a URL
def download( url, localFile ):
    """Download ``url`` to the local path ``localFile``.

    Parameters:
        url: the URL to fetch.
        localFile: filesystem path the payload is written to.

    Returns:
        (error, message): ``error`` is True when the download failed and
        ``message`` is a short description of the failure ("" on success).
    """
    error = False
    message = ""
    try:
        urllib.request.urlretrieve(url, localFile)
    # NOTE: HTTPError and ContentTooShortError are both subclasses of
    # URLError, so they must be caught BEFORE the generic URLError
    # handler — in the original ordering these two branches were
    # unreachable and every failure reported "URL Error".
    except urllib.error.HTTPError:
        error = True
        message = "HTTP Error"
    except urllib.error.ContentTooShortError:
        error = True
        message = "Content Too Short Error"
    except urllib.error.URLError:
        error = True
        message = "URL Error"
    return error, message
# Generic function to send a query to the API
def queryAPI( url, headers, timeout=None ):
    """Send a GET request to ``url`` and return the raw response.

    Parameters:
        url: API endpoint to query.
        headers: dict of HTTP headers forwarded to ``requests.get``.
        timeout: optional seconds to wait before ``requests`` raises a
            ``requests.exceptions.Timeout``. The default ``None`` keeps the
            original wait-forever behaviour; callers SHOULD pass a value so
            a stalled server cannot hang the whole harvest.

    Returns:
        The ``requests.Response`` object (status not checked here).
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    return response
# Helper function to extract and decode JSON from API responses
def getJSON( response ):
    """Decode the body of ``response`` as UTF-8 text and parse it as JSON.

    Parameters:
        response: an object exposing a ``content`` bytes attribute
            (e.g. a ``requests.Response``).

    Returns:
        The deserialized JSON value (dict, list, etc.).
    """
    body = response.content.decode('utf-8')
    return json.loads(body)
# Helper function to parse JSON response and look for citation data
def parseCitation( jsonData, preprint ):
    """Extract title and author names from a citation JSON payload.

    Parameters:
        jsonData: decoded JSON with a ``data.attributes`` object holding
            ``title`` and an ``author`` list of ``{given, family}`` dicts.
        preprint: object whose ``title`` attribute is set and whose
            ``authors`` list is appended to (mutated in place).

    Returns:
        None — results are stored on ``preprint``.
    """
    data = jsonData['data']['attributes']
    preprint.title = data['title']
    # Iterate the author records directly instead of indexing by position.
    for author in data['author']:
        # Build "Given Family", trimming stray whitespace from each part.
        name = author['given'].strip() + " " + author['family'].strip()
        preprint.authors.append(name)
# Helper function to parse JSON response and look for identifier data
def parseIdentifier( jsonData, preprint ):
    """Feed every element of ``jsonData['data']`` to the preprint.

    Parameters:
        jsonData: decoded JSON whose ``data`` key is a list of identifier
            records.
        preprint: object exposing ``parseIdentifierData(record)``; called
            once per record, in order (mutates ``preprint`` in place).

    Returns:
        None — results are stored on ``preprint``.
    """
    # Iterate the records directly instead of indexing with range(len(...)).
    for data in jsonData['data']:
        preprint.parseIdentifierData( data )
# Function to loop over all preprints in the response and parse their data
def parsePreprints( preprints, json_object, headers, verbose=True ):
    """Parse every preprint entry in an API response into Preprint objects.

    Parameters:
        preprints: list the new ``Preprint`` objects are appended to
            (mutated in place).
        json_object: decoded API response whose ``data`` key is a list of
            preprint entries, each with ``relationships``, ``attributes``,
            ``links`` and ``id``.
        headers: HTTP headers forwarded to the follow-up API queries.
        verbose: when True, print a progress line per preprint.

    Returns:
        None — results accumulate in ``preprints``.
    """
    # Loop over the response preprints directly (no index bookkeeping).
    for entry in json_object['data']:
        # Each entry is comprised of Relationships, Links, and Attributes;
        # separate them out here.
        rel = entry['relationships']
        attr = entry['attributes']
        # Links contains references to a peer-reviewed paper (if available).
        links = entry['links']
        # Scalar id plus the derived landing-page and download URLs.
        # (renamed from `id` to avoid shadowing the builtin)
        preprint_id = entry['id']
        html_link = "https://eartharxiv.org/" + preprint_id
        download_link = "https://eartharxiv.org/" + preprint_id + "/download"
        if ( verbose ): print( "Working on preprint ", len(preprints)+1 ) # should be 1 when len=0, thus the +1
        # Create a Preprint object to store all the information on this preprint.
        preprint = Preprint()
        # Add the id and link values to our object.
        preprint.setScalars( preprint_id, html_link, download_link )
        # Parse the Attributes data for this preprint.
        preprint.parseAttrData( attr )
        # The Relationships data has links to more information;
        # extract those links into the preprint object.
        preprint.parseRelData( rel )
        # Query the identifiers link (found in the Relationships data) to
        # fetch the Identifier JSON and extract the actual DOI identifier.
        response2 = queryAPI( preprint.identifiersLink, headers )
        if response2.status_code == 200:
            json_object2 = getJSON( response2 )
            parseIdentifier( json_object2, preprint )
        else:
            print( "Error parsing Identifier Link, HTTP status code is: ", response2.status_code )
        # Do the same with the citation link to get all the authors.
        response2 = queryAPI( preprint.citationLink, headers )
        if response2.status_code == 200:
            json_object2 = getJSON( response2 )
            parseCitation( json_object2, preprint )
        else:
            print( "Error parsing Citation Link, HTTP status code is: ", response2.status_code )
        # Parse the Links to look for a peer-reviewed version of the paper.
        preprint.parseLinkData( links )
        # Add the current preprint to our list of preprints.
        preprints.append( preprint )