Skip to content

Commit 1260a5a

Browse files
committed
Support fetching single files in dataverse
1 parent 172f8b0 commit 1260a5a

File tree

2 files changed

+118
-37
lines changed

2 files changed

+118
-37
lines changed

repo2docker/contentproviders/dataverse.py

+34-8
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@ def get_persistent_id_from_url(self, url: str) -> str:
106106
)
107107
return data["items"][0]["dataset_persistent_id"]
108108
elif parsed_url.path.startswith("/file.xhtml"):
109-
file_persistent_id = qs['persistentId'][0]
109+
file_persistent_id = qs["persistentId"][0]
110110
dataset_persistent_id = file_persistent_id.rsplit("/", 1)[0]
111111
if file_persistent_id == dataset_persistent_id:
112112
# We can't figure this one out, throw an error
@@ -115,6 +115,38 @@ def get_persistent_id_from_url(self, url: str) -> str:
115115

116116
raise ValueError(f"Could not determine persistent id for dataverse URL {url}")
117117

118+
def get_datafiles(self, host: str, persistent_id: str) -> list[dict]:
119+
"""
120+
Return a list of dataFiles for given persistent_id
121+
"""
122+
dataset_url = f"{host}/api/datasets/:persistentId?persistentId={persistent_id}"
123+
124+
resp = self._request(dataset_url, headers={"accept": "application/json"})
125+
# Assume it's a dataset
126+
is_dataset = True
127+
if resp.status_code == 404:
128+
# It's possible this is a *file* persistent_id, not a dataset one
129+
file_url = f"{host}/api/files/:persistentId?persistentId={persistent_id}"
130+
resp = self._request(file_url, headers={"accept": "application/json"})
131+
132+
if resp.status_code == 404:
133+
# This persistent id is just not here
134+
raise ValueError(f"{persistent_id} on {host} is not found")
135+
136+
# It's not a dataset, it's a file!
137+
is_dataset = False
138+
139+
# We already handled 404, raise error for everything else
140+
resp.raise_for_status()
141+
142+
data = resp.json()["data"]
143+
144+
if is_dataset:
145+
return data["latestVersion"]["files"]
146+
else:
147+
# Only one file object
148+
return [data]
149+
118150
def fetch(self, spec, output_dir, yield_output=False):
119151
"""Fetch and unpack a Dataverse dataset."""
120152
url = spec["url"]
@@ -123,13 +155,8 @@ def fetch(self, spec, output_dir, yield_output=False):
123155
persistent_id = self.get_persistent_id_from_url(url)
124156

125157
yield f"Fetching Dataverse record {persistent_id}.\n"
126-
url = f'{host["url"]}/api/datasets/:persistentId?persistentId={persistent_id}'
127-
128-
resp = self.urlopen(url, headers={"accept": "application/json"})
129-
print(resp.json())
130-
record = resp.json()["data"]
131158

132-
for fobj in deep_get(record, "latestVersion.files"):
159+
for fobj in self.get_datafiles(host["url"], persistent_id):
133160
file_url = (
134161
# without format=original you get the preservation format (plain text, tab separated)
135162
f'{host["url"]}/api/access/datafile/{deep_get(fobj, "dataFile.id")}?format=original'
@@ -155,7 +182,6 @@ def fetch(self, spec, output_dir, yield_output=False):
155182
copytree(os.path.join(output_dir, d), output_dir)
156183
shutil.rmtree(os.path.join(output_dir, d))
157184

158-
159185
# Save persistent id
160186
self.persitent_id = persistent_id
161187

tests/contentproviders/test_dataverse.py

+84-29
Original file line numberDiff line numberDiff line change
@@ -10,19 +10,50 @@
1010
harvard_dv = next(_ for _ in test_dv.hosts if _["name"] == "Harvard Dataverse")
1111
cimmyt_dv = next(_ for _ in test_dv.hosts if _["name"] == "CIMMYT Research Data")
1212

13+
1314
@pytest.mark.parametrize(
1415
("doi", "resolved"),
1516
[
16-
("doi:10.7910/DVN/6ZXAGT/3YRRYJ", {"host": harvard_dv, "url": "https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ"}),
17-
("10.7910/DVN/6ZXAGT/3YRRYJ", {"host": harvard_dv, "url": "https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ"}),
18-
("10.7910/DVN/TJCLKP", {"host": harvard_dv, "url": "https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP"}),
19-
("https://dataverse.harvard.edu/api/access/datafile/3323458", {"host": harvard_dv, "url": "https://dataverse.harvard.edu/api/access/datafile/3323458"}),
20-
("https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016", {"host": cimmyt_dv, "url": "https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016"}),
17+
(
18+
"doi:10.7910/DVN/6ZXAGT/3YRRYJ",
19+
{
20+
"host": harvard_dv,
21+
"url": "https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ",
22+
},
23+
),
24+
(
25+
"10.7910/DVN/6ZXAGT/3YRRYJ",
26+
{
27+
"host": harvard_dv,
28+
"url": "https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ",
29+
},
30+
),
31+
(
32+
"10.7910/DVN/TJCLKP",
33+
{
34+
"host": harvard_dv,
35+
"url": "https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP",
36+
},
37+
),
38+
(
39+
"https://dataverse.harvard.edu/api/access/datafile/3323458",
40+
{
41+
"host": harvard_dv,
42+
"url": "https://dataverse.harvard.edu/api/access/datafile/3323458",
43+
},
44+
),
45+
(
46+
"https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016",
47+
{
48+
"host": cimmyt_dv,
49+
"url": "https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016",
50+
},
51+
),
2152
("/some/random/string", None),
2253
("https://example.com/path/here", None),
2354
# Non dataverse DOIs
24-
("https://doi.org/10.21105/joss.01277", None)
25-
]
55+
("https://doi.org/10.21105/joss.01277", None),
56+
],
2657
)
2758
def test_detect(doi, resolved):
2859
assert Dataverse().detect(doi) == resolved
@@ -31,37 +62,61 @@ def test_detect(doi, resolved):
3162
@pytest.mark.parametrize(
3263
("url", "persistent_id"),
3364
[
34-
("https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ", "doi:10.7910/DVN/6ZXAGT"),
35-
("https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP", "doi:10.7910/DVN/TJCLKP"),
36-
("https://dataverse.harvard.edu/api/access/datafile/3323458", "doi:10.7910/DVN/3MJ7IR"),
37-
("https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016", "hdl:11529/10016"),
38-
]
65+
(
66+
"https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ",
67+
"doi:10.7910/DVN/6ZXAGT",
68+
),
69+
(
70+
"https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP",
71+
"doi:10.7910/DVN/TJCLKP",
72+
),
73+
(
74+
"https://dataverse.harvard.edu/api/access/datafile/3323458",
75+
"doi:10.7910/DVN/3MJ7IR",
76+
),
77+
(
78+
"https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016",
79+
"hdl:11529/10016",
80+
),
81+
],
3982
)
4083
def test_get_persistent_id(url, persistent_id):
4184
assert Dataverse().get_persistent_id_from_url(url) == persistent_id
4285

43-
def test_dataverse_fetch():
4486

87+
@pytest.mark.parametrize(
88+
("spec", "md5tree"),
89+
[
90+
(
91+
"doi:10.7910/DVN/TJCLKP",
92+
{
93+
"data/primary/primary-data.zip": "a8f6fc3fc58f503cd48e23fa8b088694",
94+
"data/2023-01-03.tsv": "6fd497bf13dab9a06fe737ebc22f1917",
95+
"code/language.py": "9d61582bcf497c83bbd1ed0eed3c772e",
96+
},
97+
),
98+
(
99+
# A citation targeting a single file
100+
"https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ",
101+
{
102+
"ARCHAEOGLOBE_CONSENSUS_ASSESSMENT.tab": "17a91888ed8e91dfb63acbbab6127ac5"
103+
}
104+
)
105+
],
106+
)
107+
def test_fetch(spec, md5tree):
45108
dv = Dataverse()
46-
spec = dv.detect("doi:10.7910/DVN/TJCLKP")
47109

48110
with TemporaryDirectory() as d:
49111
output = []
50-
for l in dv.fetch(spec, d):
112+
for l in dv.fetch(dv.detect(spec), d):
51113
output.append(l)
52114

53-
# Verify two directories
54-
assert set(os.listdir(d)) == {"data", "code"}
55-
56-
# Verify sha256sum of three files
57-
expected_sha = {
58-
'data/primary/primary-data.zip': '880f99a1e1d54a2553be61301f92e06b29236785b8d4d1b7ad0b4595d9d7512b',
59-
'data/2023-01-03.tsv': 'cc9759e8e6bc076dd7c1a8eb53a7ea3d38e8697fa9f544d15768db308516cc5f',
60-
'code/language.py': '1ffb3b3cdc9de01279779f3fc88824672c8ec3ab1c41ecdd5c1b59a9b0202215'
61-
}
62-
63-
for subpath, expected_sha in expected_sha.items():
64-
with open(os.path.join(d, subpath), 'rb') as f:
65-
h = hashlib.sha256()
115+
# Verify md5 sum of the files we expect to find
116+
# We are using md5 instead of something more secure because that is what
117+
# dataverse itself uses
118+
for subpath, expected_sha in md5tree.items():
119+
with open(os.path.join(d, subpath), "rb") as f:
120+
h = hashlib.md5()
66121
h.update(f.read())
67-
assert h.hexdigest() == expected_sha
122+
assert h.hexdigest() == expected_sha

0 commit comments

Comments
 (0)