Skip to content

Commit fde74ef

Browse files
committed
Fix content_id for dataverse URLs
1 parent b7050ba commit fde74ef

File tree

1 file changed

+8
-14
lines changed

1 file changed

+8
-14
lines changed

repo2docker/contentproviders/dataverse.py

+8-14
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import json
22
import os
33
import shutil
4+
import hashlib
45
from urllib.parse import parse_qs, urlparse, urlunparse
56

67
from ..utils import copytree, deep_get, is_doi
@@ -56,6 +57,9 @@ def detect(self, spec, ref=None, extra_args=None):
5657
if host is None:
5758
return
5859

60+
# Used only for content_id
61+
self.url = url
62+
5963
# At this point, we *know* this is a dataverse URL, because:
6064
# 1. The DOI resolved to a particular host (if using DOI)
6165
# 2. The host is in the list of known dataverse installations
@@ -84,9 +88,9 @@ def get_dataset_id_from_file_id(self, host: str, file_id: str) -> str:
8488
data = resp.json()["data"]
8589
return data["datasetVersion"]["datasetPersistentId"]
8690

87-
def get_persistent_id_from_url(self, url: str) -> str:
91+
def get_datafiles(self, dataverse_host: str, url: str) -> list[dict]:
8892
"""
89-
Return the persistentId for given dataverse URL.
93+
Return a list of dataFiles for the given dataverse URL.
9094
9195
Supports the following *dataset* URL styles:
9296
- /citation: https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP
@@ -101,11 +105,6 @@ def get_persistent_id_from_url(self, url: str) -> str:
101105
If a URL cannot be parsed, raise an exception.
102106
"""
103107

104-
def get_datafiles(self, dataverse_host: str, url: str) -> list[dict]:
105-
"""
106-
Return a list of dataFiles for given persistent_id
107-
"""
108-
109108
parsed_url = urlparse(url)
110109
path = parsed_url.path
111110
qs = parse_qs(parsed_url.query)
@@ -156,9 +155,7 @@ def fetch(self, spec, output_dir, yield_output=False):
156155
url = spec["url"]
157156
host = spec["host"]
158157

159-
persistent_id = self.get_persistent_id_from_url(url)
160-
161-
yield f"Fetching Dataverse record {persistent_id}.\n"
158+
yield f"Fetching Dataverse record {url}.\n"
162159

163160
for fobj in self.get_datafiles(host["url"], url):
164161
file_url = (
@@ -186,10 +183,7 @@ def fetch(self, spec, output_dir, yield_output=False):
186183
copytree(os.path.join(output_dir, d), output_dir)
187184
shutil.rmtree(os.path.join(output_dir, d))
188185

189-
# Save persistent id
190-
self.persitent_id = persistent_id
191-
192186
@property
193187
def content_id(self):
194188
"""The Dataverse URL this provider was detected with, used as the content id."""
195-
return self.persistent_id
189+
return self.url

0 commit comments

Comments
 (0)