1
1
import json
2
2
import os
3
3
import shutil
4
+ import hashlib
4
5
from urllib .parse import parse_qs , urlparse , urlunparse
5
6
6
7
from ..utils import copytree , deep_get , is_doi
@@ -56,6 +57,9 @@ def detect(self, spec, ref=None, extra_args=None):
56
57
if host is None :
57
58
return
58
59
60
+ # Used only for content_id
61
+ self .url = url
62
+
59
63
# At this point, we *know* this is a dataverse URL, because:
60
64
# 1. The DOI resolved to a particular host (if using DOI)
61
65
# 2. The host is in the list of known dataverse installations
@@ -84,9 +88,9 @@ def get_dataset_id_from_file_id(self, host: str, file_id: str) -> str:
84
88
data = resp .json ()["data" ]
85
89
return data ["datasetVersion" ]["datasetPersistentId" ]
86
90
87
- def get_persistent_id_from_url (self , url : str ) -> str :
91
+ def get_datafiles (self , dataverse_host : str , url : str ) -> list [ dict ] :
88
92
"""
89
- Return the persistentId for given dataverse URL.
93
+ Return a list of dataFiles for the given Dataverse URL.
90
94
91
95
Supports the following *dataset* URL styles:
92
96
- /citation: https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP
@@ -101,11 +105,6 @@ def get_persistent_id_from_url(self, url: str) -> str:
101
105
If a URL cannot be parsed, raise an exception
102
106
"""
103
107
104
- def get_datafiles (self , dataverse_host : str , url : str ) -> list [dict ]:
105
- """
106
- Return a list of dataFiles for given persistent_id
107
- """
108
-
109
108
parsed_url = urlparse (url )
110
109
path = parsed_url .path
111
110
qs = parse_qs (parsed_url .query )
@@ -156,9 +155,7 @@ def fetch(self, spec, output_dir, yield_output=False):
156
155
url = spec ["url" ]
157
156
host = spec ["host" ]
158
157
159
- persistent_id = self .get_persistent_id_from_url (url )
160
-
161
- yield f"Fetching Dataverse record { persistent_id } .\n "
158
+ yield f"Fetching Dataverse record { url } .\n "
162
159
163
160
for fobj in self .get_datafiles (host ["url" ], url ):
164
161
file_url = (
@@ -186,10 +183,7 @@ def fetch(self, spec, output_dir, yield_output=False):
186
183
copytree (os .path .join (output_dir , d ), output_dir )
187
184
shutil .rmtree (os .path .join (output_dir , d ))
188
185
189
- # Save persistent id
190
- self .persitent_id = persistent_id
191
-
192
186
@property
193
187
def content_id (self ):
194
188
"""The Dataverse URL recorded by detect(), used as the content identifier."""
195
- return self .persistent_id
189
+ return self .url
0 commit comments