Commit b7c1515

Merge pull request #1390 from yuvipanda/use-api

Use REST APIs to resolve DOIs + cleanup dataverse provider

2 parents f0b1c0c + e48f5b7

File tree: 9 files changed, +315 −272 lines

.github/workflows/test.yml (+1)

@@ -60,6 +60,7 @@ jobs:
           - r
           - unit
           - venv
+          - contentproviders
         include:
           # The actions/setup-python action with Python version 3.6 isn't
           # possible to use with the ubuntu-22.04 runner, so we use ubuntu-20.04
repo2docker/contentproviders/dataverse.py (+141 −52)

@@ -1,9 +1,11 @@
+import hashlib
 import json
 import os
 import shutil
-from urllib.parse import parse_qs, urlparse, urlunparse
+from typing import List, Tuple
+from urllib.parse import parse_qs, urlparse

-from ..utils import copytree, deep_get
+from ..utils import copytree, deep_get, is_doi
 from .doi import DoiProvider

@@ -23,10 +25,11 @@ def __init__(self):
         self.hosts = json.load(fp)["installations"]
         super().__init__()

-    def detect(self, doi, ref=None, extra_args=None):
-        """Trigger this provider for things that resolve to a Dataverse dataset.
+    def detect(self, spec, ref=None, extra_args=None):
+        """
+        Detect if the given spec is hosted on Dataverse

-        Handles:
+        The spec can be:
         - DOI pointing to {siteURL}/dataset.xhtml?persistentId={persistentId}
         - DOI pointing to {siteURL}/file.xhtml?persistentId={persistentId}&...
         - URL {siteURL}/api/access/datafile/{fileId}
@@ -35,9 +38,11 @@ def detect(self, doi, ref=None, extra_args=None):
         - https://dataverse.harvard.edu/api/access/datafile/3323458
         - doi:10.7910/DVN/6ZXAGT
         - doi:10.7910/DVN/6ZXAGT/3YRRYJ
-
         """
-        url = self.doi2url(doi)
+        if is_doi(spec):
+            url = self.doi2url(spec)
+        else:
+            url = spec
         # Parse the url, to get the base for later API calls
         parsed_url = urlparse(url)

@@ -53,57 +58,137 @@ def detect(self, doi, ref=None, extra_args=None):
         if host is None:
             return

-        query_args = parse_qs(parsed_url.query)
-        # Corner case handling
-        if parsed_url.path.startswith("/file.xhtml"):
-            # There's no way of getting file information using its persistentId; the only thing we can do is assume
-            # the doi is structured as "doi:<dataset_doi>/<file_doi>" and try to handle the dataset that way.
-            new_doi = doi.rsplit("/", 1)[0]
-            if new_doi == doi:
-                # tough luck :( Avoid infinite recursion and exit.
-                return
-            return self.detect(new_doi)
-        elif parsed_url.path.startswith("/api/access/datafile"):
-            # A raw url pointing to a datafile is a typical output from an External Tool integration
-            entity_id = os.path.basename(parsed_url.path)
-            search_query = "q=entityId:" + entity_id + "&type=file"
-            # Knowing the file identifier, query the search API to get the parent dataset
-            search_url = urlunparse(
-                parsed_url._replace(path="/api/search", query=search_query)
-            )
-            self.log.debug("Querying Dataverse: " + search_url)
-            data = self.urlopen(search_url).json()["data"]
-            if data["count_in_response"] != 1:
-                self.log.debug(
-                    f"Dataverse search query failed!\n - doi: {doi}\n - url: {url}\n - resp: {json.dump(data)}\n"
-                )
-                return
-
-            self.record_id = deep_get(data, "items.0.dataset_persistent_id")
-        elif (
-            parsed_url.path.startswith("/dataset.xhtml")
-            and "persistentId" in query_args
-        ):
-            self.record_id = deep_get(query_args, "persistentId.0")
-
-        if hasattr(self, "record_id"):
-            return {"record": self.record_id, "host": host}
+        # At this point, we *know* this is a dataverse URL, because:
+        # 1. The DOI resolved to a particular host (if using DOI)
+        # 2. The host is in the list of known dataverse installations
+        #
+        # We don't know exactly what kind of dataverse object this is, but
+        # that can be figured out during fetch as needed
+        return url
+
+    def get_dataset_id_from_file_id(self, base_url: str, file_id: str) -> str:
+        """
+        Return the persistent_id (DOI) of the dataset that a given file_id (int or DOI) belongs to
+        """
+        if file_id.isdigit():
+            # the file_id is an integer, rather than a persistent id (DOI)
+            api_url = f"{base_url}/api/files/{file_id}?returnDatasetVersion=true"
+        else:
+            # the file_id is a DOI itself
+            api_url = f"{base_url}/api/files/:persistentId?persistentId={file_id}&returnDatasetVersion=true"
+
+        resp = self._request(api_url)
+        if resp.status_code == 404:
+            raise ValueError(f"File with id {file_id} not found in {base_url}")
+
+        resp.raise_for_status()
+
+        data = resp.json()["data"]
+        return data["datasetVersion"]["datasetPersistentId"]
+
+    def parse_dataverse_url(self, url: str) -> Tuple[str, bool]:
+        """
+        Parse the persistent id out of a dataverse URL
+
+        The persistent id can point to either a dataset or a file. The second
+        return value is False if we know which of the two it is, and True if
+        it is ambiguous.
+
+        Raises a ValueError if we can not parse the url
+        """
+        parsed_url = urlparse(url)
+        path = parsed_url.path
+        qs = parse_qs(parsed_url.query)
+        base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
+
+        is_ambiguous = False
+        # https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP
+        if path.startswith("/citation"):
+            is_ambiguous = True
+            persistent_id = qs["persistentId"][0]
+        # https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/TJCLKP
+        elif path.startswith("/dataset.xhtml"):
+            persistent_id = qs["persistentId"][0]
+        # https://dataverse.harvard.edu/api/access/datafile/3323458
+        elif path.startswith("/api/access/datafile"):
+            # What we have here is an entity id, which we can use to get a persistentId
+            file_id = os.path.basename(path)
+            persistent_id = self.get_dataset_id_from_file_id(base_url, file_id)
+        elif parsed_url.path.startswith("/file.xhtml"):
+            file_persistent_id = qs["persistentId"][0]
+            persistent_id = self.get_dataset_id_from_file_id(
+                base_url, file_persistent_id
+            )
+        else:
+            raise ValueError(
+                f"Could not determine persistent id for dataverse URL {url}"
+            )
+
+        return persistent_id, is_ambiguous
+
+    def get_datafiles(self, url: str) -> List[dict]:
+        """
+        Return a list of dataFiles for the given persistent_id
+
+        Supports the following *dataset* URL styles:
+        - /citation: https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP
+        - /dataset.xhtml: https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/TJCLKP
+
+        Supports the following *file* URL styles (the entire dataset the file belongs to will be fetched):
+        - /api/access/datafile: https://dataverse.harvard.edu/api/access/datafile/3323458
+        - /file.xhtml: https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ
+        - /citation: https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ
+
+        If a URL can not be parsed, throw an exception
+        """
+
+        parsed_url = urlparse(url)
+        base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
+
+        persistent_id, is_ambiguous = self.parse_dataverse_url(url)
+
+        dataset_api_url = (
+            f"{base_url}/api/datasets/:persistentId?persistentId={persistent_id}"
+        )
+        resp = self._request(dataset_api_url, headers={"accept": "application/json"})
+        if resp.status_code == 404 and is_ambiguous:
+            # It's possible this is a *file* persistent_id, not a dataset one
+            persistent_id = self.get_dataset_id_from_file_id(base_url, persistent_id)
+            dataset_api_url = (
+                f"{base_url}/api/datasets/:persistentId?persistentId={persistent_id}"
+            )
+            resp = self._request(
+                dataset_api_url, headers={"accept": "application/json"}
+            )
+
+        if resp.status_code == 404:
+            # This persistent id is just not here
+            raise ValueError(f"{persistent_id} on {base_url} is not found")
+
+        # We already handled 404, raise an error for everything else
+        resp.raise_for_status()
+
+        # We now know the exact persistent_id of the dataset we fetched
+        # Save it for use as content_id
+        self.persistent_id = persistent_id
+
+        data = resp.json()["data"]
+
+        return data["latestVersion"]["files"]

     def fetch(self, spec, output_dir, yield_output=False):
         """Fetch and unpack a Dataverse dataset."""
-        record_id = spec["record"]
-        host = spec["host"]
-
-        yield f"Fetching Dataverse record {record_id}.\n"
-        url = f'{host["url"]}/api/datasets/:persistentId?persistentId={record_id}'
+        url = spec
+        parsed_url = urlparse(url)
+        # FIXME: Support determining API URL better
+        base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"

-        resp = self.urlopen(url, headers={"accept": "application/json"})
-        record = resp.json()["data"]
+        yield f"Fetching Dataverse record {url}.\n"

-        for fobj in deep_get(record, "latestVersion.files"):
+        for fobj in self.get_datafiles(url):
             file_url = (
                 # without format=original you get the preservation format (plain text, tab separated)
-                f'{host["url"]}/api/access/datafile/{deep_get(fobj, "dataFile.id")}?format=original'
+                f'{base_url}/api/access/datafile/{deep_get(fobj, "dataFile.id")}?format=original'
             )
             filename = fobj["label"]
             original_filename = fobj["dataFile"].get("originalFileName", None)
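
Aside (not part of the diff): the file-to-dataset resolution that get_dataset_id_from_file_id performs can be sketched standalone. This is a minimal, hedged sketch using the documented Dataverse native API endpoints that appear in the diff; resolve_dataset_doi is a hypothetical helper name, and requests stands in for the provider's self._request.

import requests  # stand-in for the provider's self._request


def resolve_dataset_doi(base_url: str, file_id: str) -> str:
    """Return the persistent id (DOI) of the dataset that owns file_id."""
    if file_id.isdigit():
        # numeric database id, e.g. from /api/access/datafile/3323458
        api_url = f"{base_url}/api/files/{file_id}?returnDatasetVersion=true"
    else:
        # the file's own persistent id (DOI)
        api_url = (
            f"{base_url}/api/files/:persistentId"
            f"?persistentId={file_id}&returnDatasetVersion=true"
        )
    resp = requests.get(api_url)
    resp.raise_for_status()
    return resp.json()["data"]["datasetVersion"]["datasetPersistentId"]


# Usage (hypothetical):
# resolve_dataset_doi("https://dataverse.harvard.edu", "3323458")
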
@@ -128,5 +213,9 @@ def fetch(self, spec, output_dir, yield_output=False):

     @property
     def content_id(self):
-        """The Dataverse persistent identifier."""
-        return self.record_id
+        """
+        The Dataverse persistent identifier.
+
+        Only valid if called after a successful fetch
+        """
+        return self.persistent_id
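
To make the new fetch flow concrete, here is a hedged end-to-end sketch: given a dataset persistent id, list its files and build the same download URLs the provider fetches. The endpoint shapes come from the diff above; list_datafile_urls is a hypothetical name and error handling is elided.

from typing import List

import requests


def list_datafile_urls(base_url: str, persistent_id: str) -> List[str]:
    """List download URLs for every file in a Dataverse dataset."""
    api_url = f"{base_url}/api/datasets/:persistentId?persistentId={persistent_id}"
    resp = requests.get(api_url, headers={"accept": "application/json"})
    resp.raise_for_status()
    files = resp.json()["data"]["latestVersion"]["files"]
    # format=original asks for the originally uploaded file rather than
    # the preservation format (plain text, tab separated)
    return [
        f'{base_url}/api/access/datafile/{f["dataFile"]["id"]}?format=original'
        for f in files
    ]


# Usage (hypothetical):
# list_datafile_urls("https://dataverse.harvard.edu", "doi:10.7910/DVN/TJCLKP")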

repo2docker/contentproviders/doi.py (+21 −14)
Original file line numberDiff line numberDiff line change
@@ -46,21 +46,28 @@ def doi2url(self, doi):
4646
# Transform a DOI to a URL
4747
# If not a doi, assume we have a URL and return
4848
if is_doi(doi):
49-
doi = normalize_doi(doi)
50-
51-
try:
52-
resp = self._request(f"https://doi.org/{doi}")
53-
resp.raise_for_status()
54-
except HTTPError as e:
55-
# If the DOI doesn't exist, just return URL
56-
if e.response.status_code == 404:
57-
return doi
58-
# Reraise any other errors because if the DOI service is down (or
59-
# we hit a rate limit) we don't want to silently continue to the
60-
# default Git provider as this leads to a misleading error.
61-
self.log.error(f"DOI {doi} does not resolve: {e}")
49+
normalized_doi = normalize_doi(doi)
50+
51+
# Use the doi.org resolver API
52+
# documented at https://www.doi.org/the-identifier/resources/factsheets/doi-resolution-documentation#5-proxy-server-rest-api
53+
req_url = f"https://doi.org/api/handles/{normalized_doi}"
54+
resp = self._request(req_url)
55+
if resp.status_code == 404:
56+
# Not a doi, return what we were passed in
57+
return doi
58+
elif resp.status_code == 200:
59+
data = resp.json()
60+
# Pick the first URL we find from the doi response
61+
for v in data["values"]:
62+
if v["type"] == "URL":
63+
return v["data"]["value"]
64+
65+
# No URLs found for this doi, what do we do?
66+
self.log.error("DOI {normalized_doi} doesn't point to any URLs")
67+
return doi
68+
else:
69+
# If we get any other status codes, raise error
6270
raise
63-
return resp.url
6471
else:
6572
# Just return what is actulally just a URL
6673
return doi
