+import hashlib
 import json
 import os
 import shutil
-from urllib.parse import parse_qs, urlparse, urlunparse
+from typing import List, Tuple
+from urllib.parse import parse_qs, urlparse

-from ..utils import copytree, deep_get
+from ..utils import copytree, deep_get, is_doi
 from .doi import DoiProvider


@@ -23,10 +25,11 @@ def __init__(self):
             self.hosts = json.load(fp)["installations"]
         super().__init__()

-    def detect(self, doi, ref=None, extra_args=None):
-        """Trigger this provider for things that resolve to a Dataverse dataset.
+    def detect(self, spec, ref=None, extra_args=None):
+        """
+        Detect if the given spec is hosted on Dataverse

-        Handles:
+        The spec can be:
         - DOI pointing to {siteURL}/dataset.xhtml?persistentId={persistentId}
         - DOI pointing to {siteURL}/file.xhtml?persistentId={persistentId}&...
         - URL {siteURL}/api/access/datafile/{fileId}
@@ -35,9 +38,11 @@ def detect(self, doi, ref=None, extra_args=None):
         - https://dataverse.harvard.edu/api/access/datafile/3323458
         - doi:10.7910/DVN/6ZXAGT
         - doi:10.7910/DVN/6ZXAGT/3YRRYJ
-
         """
-        url = self.doi2url(doi)
+        if is_doi(spec):
+            url = self.doi2url(spec)
+        else:
+            url = spec
         # Parse the url, to get the base for later API calls
         parsed_url = urlparse(url)

@@ -53,57 +58,137 @@ def detect(self, doi, ref=None, extra_args=None):
         if host is None:
             return

-        query_args = parse_qs(parsed_url.query)
-        # Corner case handling
-        if parsed_url.path.startswith("/file.xhtml"):
-            # There's no way of getting file information using its persistentId, the only thing we can do is assume that the doi
-            # is structured as "doi:<dataset_doi>/<file_doi>" and try to handle the dataset that way.
-            new_doi = doi.rsplit("/", 1)[0]
-            if new_doi == doi:
-                # tough luck :( Avoid infinite recursion and exit.
-                return
-            return self.detect(new_doi)
-        elif parsed_url.path.startswith("/api/access/datafile"):
-            # Raw url pointing to a datafile is a typical output from an External Tool integration
-            entity_id = os.path.basename(parsed_url.path)
-            search_query = "q=entityId:" + entity_id + "&type=file"
-            # Knowing the file identifier, query the search API to get the parent dataset
-            search_url = urlunparse(
-                parsed_url._replace(path="/api/search", query=search_query)
+        # At this point, we *know* this is a dataverse URL, because:
+        # 1. The DOI resolved to a particular host (if using DOI)
+        # 2. The host is in the list of known dataverse installations
+        #
+        # We don't know exactly what kind of dataverse object this is, but
+        # that can be figured out during fetch as needed
+        return url
+
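`detect` is now a thin normalizer: resolve a DOI spec to a URL, then accept the URL only if its host is a known Dataverse installation. A minimal standalone sketch of that flow, with a rough stand-in for `is_doi` and a hard-coded host set where the real code loads `dataverse.json` (both are assumptions for illustration, not the actual helpers):

```python
import re
from urllib.parse import urlparse

import requests

# Rough stand-in for repo2docker's is_doi helper (assumed regex, not the real one)
DOI_RE = re.compile(r"^(doi:|https?://doi\.org/)?10\.\d+/", re.IGNORECASE)

# Hypothetical subset of the installations listed in dataverse.json
KNOWN_HOSTS = {"dataverse.harvard.edu", "demo.dataverse.org"}

def detect_sketch(spec):
    """Return the resolved URL if spec points at a known Dataverse host, else None."""
    if DOI_RE.match(spec):
        # Resolve the DOI by following redirects, which is what doi2url does in spirit
        doi = spec.removeprefix("doi:")
        url = requests.head(f"https://doi.org/{doi}", allow_redirects=True).url
    else:
        url = spec
    return url if urlparse(url).netloc in KNOWN_HOSTS else None

# detect_sketch("doi:10.7910/DVN/TJCLKP") would resolve to a
# dataverse.harvard.edu URL and return it unchanged.
```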
+    def get_dataset_id_from_file_id(self, base_url: str, file_id: str) -> str:
+        """
+        Return the persistent_id (DOI) of the dataset that a given file_id (int or DOI) belongs to
+        """
+        if file_id.isdigit():
+            # the file_id is an integer, rather than a persistent id (DOI)
+            api_url = f"{base_url}/api/files/{file_id}?returnDatasetVersion=true"
+        else:
+            # the file_id is a DOI itself
+            api_url = f"{base_url}/api/files/:persistentId?persistentId={file_id}&returnDatasetVersion=true"
+
+        resp = self._request(api_url)
+        if resp.status_code == 404:
+            raise ValueError(f"File with id {file_id} not found in {base_url}")
+
+        resp.raise_for_status()
+
+        data = resp.json()["data"]
+        return data["datasetVersion"]["datasetPersistentId"]
+
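The two endpoint shapes above are the Dataverse native API's file-lookup routes. A self-contained sketch of the same lookup using `requests` directly (host and ids are example values taken from the docstrings in this diff):

```python
import requests

def dataset_id_for_file(base_url: str, file_id: str) -> str:
    """Look up the persistent id of the dataset that contains file_id."""
    if file_id.isdigit():
        # Numeric database id, e.g. "3323458"
        api_url = f"{base_url}/api/files/{file_id}?returnDatasetVersion=true"
    else:
        # Persistent id (DOI), e.g. "doi:10.7910/DVN/6ZXAGT/3YRRYJ"
        api_url = (
            f"{base_url}/api/files/:persistentId"
            f"?persistentId={file_id}&returnDatasetVersion=true"
        )
    resp = requests.get(api_url)
    resp.raise_for_status()
    return resp.json()["data"]["datasetVersion"]["datasetPersistentId"]

# dataset_id_for_file("https://dataverse.harvard.edu", "3323458")
# would return the parent dataset's DOI.
```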
+    def parse_dataverse_url(self, url: str) -> Tuple[str, bool]:
+        """
+        Parse the persistent id out of a dataverse URL
+
+        The persistent_id can point to either a dataset or a file. The second
+        return value is False if we know which of the two it is, and True if
+        it is ambiguous.
+
+        Raises a ValueError if the URL can not be parsed
+        """
+        parsed_url = urlparse(url)
+        path = parsed_url.path
+        qs = parse_qs(parsed_url.query)
+        base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
+
+        is_ambiguous = False
+        # https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP
+        if path.startswith("/citation"):
+            is_ambiguous = True
+            persistent_id = qs["persistentId"][0]
+        # https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/TJCLKP
+        elif path.startswith("/dataset.xhtml"):
+            persistent_id = qs["persistentId"][0]
+        # https://dataverse.harvard.edu/api/access/datafile/3323458
+        elif path.startswith("/api/access/datafile"):
+            # What we have here is an entity id, which we can use to get a persistentId
+            file_id = os.path.basename(path)
+            persistent_id = self.get_dataset_id_from_file_id(base_url, file_id)
+        elif parsed_url.path.startswith("/file.xhtml"):
+            file_persistent_id = qs["persistentId"][0]
+            persistent_id = self.get_dataset_id_from_file_id(
+                base_url, file_persistent_id
+            )
+        else:
+            raise ValueError(
+                f"Could not determine persistent id for dataverse URL {url}"
             )
-            self.log.debug("Querying Dataverse: " + search_url)
-            data = self.urlopen(search_url).json()["data"]
-            if data["count_in_response"] != 1:
-                self.log.debug(
-                    f"Dataverse search query failed!\n - doi: {doi}\n - url: {url}\n - resp: {json.dump(data)}\n"
-                )
-                return
-
-            self.record_id = deep_get(data, "items.0.dataset_persistent_id")
-        elif (
-            parsed_url.path.startswith("/dataset.xhtml")
-            and "persistentId" in query_args
-        ):
-            self.record_id = deep_get(query_args, "persistentId.0")
-
-        if hasattr(self, "record_id"):
-            return {"record": self.record_id, "host": host}
+
+        return persistent_id, is_ambiguous
+
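The branch logic reduces to four recognized path styles plus a failure case. A pure-parsing sketch that classifies a URL without the API round-trips (unlike the real method, it does not resolve file ids to their parent dataset):

```python
from urllib.parse import parse_qs, urlparse

def classify_dataverse_path(url: str):
    """Return (kind, id) for a Dataverse URL; kind mirrors the branches above."""
    parsed = urlparse(url)
    path, qs = parsed.path, parse_qs(parsed.query)
    if path.startswith("/citation"):
        # Ambiguous: a citation URL can name either a dataset or a file
        return ("dataset-or-file", qs["persistentId"][0])
    if path.startswith("/dataset.xhtml"):
        return ("dataset", qs["persistentId"][0])
    if path.startswith("/api/access/datafile"):
        return ("file", path.rsplit("/", 1)[1])
    if path.startswith("/file.xhtml"):
        return ("file", qs["persistentId"][0])
    raise ValueError(f"Could not determine persistent id for dataverse URL {url}")

assert classify_dataverse_path(
    "https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/TJCLKP"
) == ("dataset", "doi:10.7910/DVN/TJCLKP")
```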
+    def get_datafiles(self, url: str) -> List[dict]:
+        """
+        Return a list of dataFiles for the given persistent_id
+
+        Supports the following *dataset* URL styles:
+        - /citation: https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP
+        - /dataset.xhtml: https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/TJCLKP
+
+        Supports the following *file* URL styles (the entire dataset the file belongs to will be fetched):
+        - /api/access/datafile: https://dataverse.harvard.edu/api/access/datafile/3323458
+        - /file.xhtml: https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ
+        - /citation: https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ
+
+        If a URL can not be parsed, a ValueError is raised
+        """
+
+        parsed_url = urlparse(url)
+        base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
+
+        persistent_id, is_ambiguous = self.parse_dataverse_url(url)
+
+        dataset_api_url = (
+            f"{base_url}/api/datasets/:persistentId?persistentId={persistent_id}"
+        )
+        resp = self._request(dataset_api_url, headers={"accept": "application/json"})
+        if resp.status_code == 404 and is_ambiguous:
+            # It's possible this is a *file* persistent_id, not a dataset one
+            persistent_id = self.get_dataset_id_from_file_id(base_url, persistent_id)
+            dataset_api_url = (
+                f"{base_url}/api/datasets/:persistentId?persistentId={persistent_id}"
+            )
+            resp = self._request(
+                dataset_api_url, headers={"accept": "application/json"}
+            )
+
+        if resp.status_code == 404:
+            # This persistent id is just not here
+            raise ValueError(f"{persistent_id} on {base_url} is not found")
+
+        # We already handled 404, raise an error for everything else
+        resp.raise_for_status()
+
+        # We now know the exact persistent_id of the dataset we fetched;
+        # save it for use as content_id
+        self.persistent_id = persistent_id
+
+        data = resp.json()["data"]
+
+        return data["latestVersion"]["files"]
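For a sense of what those `files` entries look like, here is a standalone sketch against the same `/api/datasets/:persistentId` endpoint, printing the two fields `fetch` relies on below (`label` and `dataFile.id`); host and DOI are example values:

```python
import requests

def list_datafiles(base_url: str, persistent_id: str) -> None:
    """Print label and file id for every file in a dataset's latest version."""
    api_url = f"{base_url}/api/datasets/:persistentId?persistentId={persistent_id}"
    resp = requests.get(api_url, headers={"accept": "application/json"})
    resp.raise_for_status()
    for fobj in resp.json()["data"]["latestVersion"]["files"]:
        # Each entry pairs a display label with a nested dataFile record
        print(fobj["label"], fobj["dataFile"]["id"])

# list_datafiles("https://dataverse.harvard.edu", "doi:10.7910/DVN/TJCLKP")
```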

     def fetch(self, spec, output_dir, yield_output=False):
         """Fetch and unpack a Dataverse dataset."""
-        record_id = spec["record"]
-        host = spec["host"]
-
-        yield f"Fetching Dataverse record {record_id}.\n"
-        url = f'{host["url"]}/api/datasets/:persistentId?persistentId={record_id}'
+        url = spec
+        parsed_url = urlparse(url)
+        # FIXME: Support determining API URL better
+        base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"

-        resp = self.urlopen(url, headers={"accept": "application/json"})
-        record = resp.json()["data"]
+        yield f"Fetching Dataverse record {url}.\n"

-        for fobj in deep_get(record, "latestVersion.files"):
+        for fobj in self.get_datafiles(url):
             file_url = (
                 # without format=original you get the preservation format (plain text, tab separated)
-                f'{host["url"]}/api/access/datafile/{deep_get(fobj, "dataFile.id")}?format=original'
+                f'{base_url}/api/access/datafile/{deep_get(fobj, "dataFile.id")}?format=original'
             )
             filename = fobj["label"]
             original_filename = fobj["dataFile"].get("originalFileName", None)
@@ -128,5 +213,9 @@ def fetch(self, spec, output_dir, yield_output=False):

     @property
     def content_id(self):
-        """The Dataverse persistent identifier."""
-        return self.record_id
+        """
+        The Dataverse persistent identifier.
+
+        Only valid if called after a successful fetch
+        """
+        return self.persistent_id
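Taken together, the provider's contract becomes URL-in, files-out. A sketch of how a caller might drive it end to end, assuming this module is repo2docker's `contentproviders.dataverse` (which the relative imports suggest):

```python
import tempfile

from repo2docker.contentproviders.dataverse import Dataverse

dv = Dataverse()
# detect returns the normalized URL for a Dataverse spec, or None otherwise
spec = dv.detect("doi:10.7910/DVN/TJCLKP")
if spec:
    with tempfile.TemporaryDirectory() as output_dir:
        # fetch is a generator that yields progress messages as it downloads
        for msg in dv.fetch(spec, output_dir):
            print(msg, end="")
        # Only valid after a successful fetch, per the docstring above
        print(dv.content_id)
```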