Use REST APIs to resolve DOIs + cleanup dataverse provider #1390

Merged
merged 20 commits on Dec 20, 2024

Changes from 19 commits
1 change: 1 addition & 0 deletions .github/workflows/test.yml
@@ -60,6 +60,7 @@ jobs:
- r
- unit
- venv
- contentproviders
include:
# The actions/setup-python action with Python version 3.6 isn't
# possible to use with the ubuntu-22.04 runner, so we use ubuntu-20.04
186 changes: 135 additions & 51 deletions repo2docker/contentproviders/dataverse.py
@@ -1,9 +1,11 @@
import hashlib
import json
import os
import shutil
from urllib.parse import parse_qs, urlparse, urlunparse
from typing import List, Tuple
from urllib.parse import parse_qs, urlparse

from ..utils import copytree, deep_get
from ..utils import copytree, deep_get, is_doi
from .doi import DoiProvider


@@ -23,10 +25,11 @@ def __init__(self):
self.hosts = json.load(fp)["installations"]
super().__init__()

def detect(self, doi, ref=None, extra_args=None):
"""Trigger this provider for things that resolve to a Dataverse dataset.
def detect(self, spec, ref=None, extra_args=None):
"""
Detect if given spec is hosted on dataverse

Handles:
The spec can be:
- DOI pointing to {siteURL}/dataset.xhtml?persistentId={persistentId}
- DOI pointing to {siteURL}/file.xhtml?persistentId={persistentId}&...
- URL {siteURL}/api/access/datafile/{fileId}
@@ -35,9 +38,11 @@ def detect(self, doi, ref=None, extra_args=None):
- https://dataverse.harvard.edu/api/access/datafile/3323458
- doi:10.7910/DVN/6ZXAGT
- doi:10.7910/DVN/6ZXAGT/3YRRYJ

"""
url = self.doi2url(doi)
if is_doi(spec):
url = self.doi2url(spec)
else:
url = spec
# Parse the url, to get the base for later API calls
parsed_url = urlparse(url)

@@ -53,57 +58,136 @@ def detect(self, doi, ref=None, extra_args=None):
if host is None:
return

query_args = parse_qs(parsed_url.query)
# Corner case handling
if parsed_url.path.startswith("/file.xhtml"):
# There's no way of getting file information using its persistentId, the only thing we can do is assume that doi
# is structured as "doi:<dataset_doi>/<file_doi>" and try to handle dataset that way.
new_doi = doi.rsplit("/", 1)[0]
if new_doi == doi:
# tough luck :( Avoid inifite recursion and exit.
return
return self.detect(new_doi)
elif parsed_url.path.startswith("/api/access/datafile"):
# Raw url pointing to a datafile is a typical output from an External Tool integration
entity_id = os.path.basename(parsed_url.path)
search_query = "q=entityId:" + entity_id + "&type=file"
# Knowing the file identifier query search api to get parent dataset
search_url = urlunparse(
parsed_url._replace(path="/api/search", query=search_query)
# Used only for content_id
self.url = url

# At this point, we *know* this is a dataverse URL, because:
# 1. The DOI resolved to a particular host (if using DOI)
# 2. The host is in the list of known dataverse installations
#
# We don't know exactly what kind of dataverse object this is, but
# that can be figured out during fetch as needed
return url

def get_dataset_id_from_file_id(self, base_url: str, file_id: str) -> str:
"""
Return the persistent_id (DOI) of a dataset that a given file_id (int or doi) belongs to
"""
if file_id.isdigit():
# the file_id is an integer, rather than a persistent id (DOI)
api_url = f"{base_url}/api/files/{file_id}?returnDatasetVersion=true"
else:
# the file_id is a doi itself
api_url = f"{base_url}/api/files/:persistentId?persistentId={file_id}&returnDatasetVersion=true"

resp = self._request(api_url)
if resp.status_code == 404:
raise ValueError(f"File with id {file_id} not found in {base_url}")

resp.raise_for_status()

data = resp.json()["data"]
return data["datasetVersion"]["datasetPersistentId"]

def parse_dataverse_url(self, url: str) -> Tuple[str, bool]:
"""
Parse the persistent id out of a dataverse URL

persistent_id can point to either a dataset or a file. The second return
value is False if we know whether the persistent_id refers to a dataset or
a file, and True if it is ambiguous.

Raises a ValueError if we can not parse the url
"""
parsed_url = urlparse(url)
path = parsed_url.path
qs = parse_qs(parsed_url.query)
base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"

is_ambiguous = False
# https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP
if path.startswith("/citation"):
is_ambiguous = True
persistent_id = qs["persistentId"][0]
# https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/TJCLKP
elif path.startswith("/dataset.xhtml"):
# https://dataverse.harvard.edu/api/access/datafile/3323458
persistent_id = qs["persistentId"][0]
elif path.startswith("/api/access/datafile"):
# What we have here is an entity id, which we can use to get a persistentId
file_id = os.path.basename(path)
persistent_id = self.get_dataset_id_from_file_id(base_url, file_id)
elif parsed_url.path.startswith("/file.xhtml"):
file_persistent_id = qs["persistentId"][0]
persistent_id = self.get_dataset_id_from_file_id(
base_url, file_persistent_id
)
else:
raise ValueError(
f"Could not determine persistent id for dataverse URL {url}"
)

return persistent_id, is_ambiguous

def get_datafiles(self, url: str) -> List[dict]:
"""
Return a list of dataFiles for given persistent_id

Supports the following *dataset* URL styles:
- /citation: https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP
- /dataset.xhtml: https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/TJCLKP

Supports the following *file* URL styles (the entire dataset the file belongs to will be fetched):
- /api/access/datafile: https://dataverse.harvard.edu/api/access/datafile/3323458
- /file.xhtml: https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ
- /citation: https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ

If a URL can not be parsed, throw an exception
"""

parsed_url = urlparse(url)
base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"

persistent_id, is_ambiguous = self.parse_dataverse_url(url)

dataset_api_url = (
f"{base_url}/api/datasets/:persistentId?persistentId={persistent_id}"
)
resp = self._request(dataset_api_url, headers={"accept": "application/json"})
if resp.status_code == 404 and is_ambiguous:
# It's possible this is a *file* persistent_id, not a dataset one
persistent_id = self.get_dataset_id_from_file_id(base_url, persistent_id)
dataset_api_url = (
f"{base_url}/api/datasets/:persistentId?persistentId={persistent_id}"
)
resp = self._request(
dataset_api_url, headers={"accept": "application/json"}
)
self.log.debug("Querying Dataverse: " + search_url)
data = self.urlopen(search_url).json()["data"]
if data["count_in_response"] != 1:
self.log.debug(
f"Dataverse search query failed!\n - doi: {doi}\n - url: {url}\n - resp: {json.dump(data)}\n"
)
return

self.record_id = deep_get(data, "items.0.dataset_persistent_id")
elif (
parsed_url.path.startswith("/dataset.xhtml")
and "persistentId" in query_args
):
self.record_id = deep_get(query_args, "persistentId.0")

if hasattr(self, "record_id"):
return {"record": self.record_id, "host": host}

if resp.status_code == 404:
# This persistent id is just not here
raise ValueError(f"{persistent_id} on {base_url} is not found")

# We already handled 404, raise error for everything else
resp.raise_for_status()

data = resp.json()["data"]

return data["latestVersion"]["files"]

def fetch(self, spec, output_dir, yield_output=False):
"""Fetch and unpack a Dataverse dataset."""
record_id = spec["record"]
host = spec["host"]

yield f"Fetching Dataverse record {record_id}.\n"
url = f'{host["url"]}/api/datasets/:persistentId?persistentId={record_id}'
url = spec
parsed_url = urlparse(url)
# FIXME: Support determining API URL better
base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"

resp = self.urlopen(url, headers={"accept": "application/json"})
record = resp.json()["data"]
yield f"Fetching Dataverse record {url}.\n"

for fobj in deep_get(record, "latestVersion.files"):
for fobj in self.get_datafiles(url):
file_url = (
# without format=original you get the preservation format (plain text, tab separated)
f'{host["url"]}/api/access/datafile/{deep_get(fobj, "dataFile.id")}?format=original'
f'{base_url}/api/access/datafile/{deep_get(fobj, "dataFile.id")}?format=original'
)
filename = fobj["label"]
original_filename = fobj["dataFile"].get("originalFileName", None)
@@ -129,4 +213,4 @@ def fetch(self, spec, output_dir, yield_output=False):
@property
def content_id(self):
"""The Dataverse persistent identifier."""
return self.record_id
return hashlib.sha256(self.url.encode()).hexdigest()
Member

Is there a risk that a 64 character hash might make image names too long?

Collaborator Author

https://github.com/opencontainers/distribution-spec/blob/main/spec.md#pulling-manifests says many implementations limit the hostname and name of the image in total to 256 chars. I think this means it may be good enough and not a problem?

Alternatively, I can go back to parsing persistent_id at detect time, instead of at fetch time, and set it that way. I think part of the confusion here is around detect semantics and when content_id is called. Ideally detect should be stateless and be simply used to, well, detect things! But we seem to treat it as also the thing that sets .content_id so it's a little bit of a mess. I'm happy to treat that as a different refactor though.

Choice to be made

  1. Leave this as is
  2. Set this to be persistent_id instead, and move persistent_id parsing back into detect

Happy to do either!

Member

But we seem to treat it as also the thing that sets .content_id so it's a little bit of a mess.

Yeah, that certainly doesn't sound right. It looks to me like we also only access content_id after calling fetch. Is it possible that the issue you are seeing is only in the tests, not how r2d actually behaves? What happens if you raise in content_id if fetch hasn't been called?

If it's just the tests and persistent_id is defined after fetch, then keeping persistent_id seems nice here, and maybe we can fix the tests to be more realistic. And make it explicit that content_id cannot be assumed to be available until fetch has been called?
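
For illustration, a minimal sketch of that guard — this assumes the provider keeps a persistent_id attribute that fetch() fills in; the attribute and class names here are assumptions for the sketch, not necessarily what the provider uses:

class DataverseGuardSketch:
    # Illustrative only: content_id refuses to answer before fetch() has run.
    persistent_id = None  # fetch() would set this to the dataset's persistent id (DOI)

    @property
    def content_id(self):
        if self.persistent_id is None:
            raise RuntimeError("content_id is only available after fetch() has been called")
        return self.persistent_id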


A tangent I went on about hash length, that I'm not sure is relevant anymore, but already wrote down. Feel free to ignore:

Initially, I thought the content id was the full thing, but of course it's the 'ref' that goes after the doi itself. Running a test gives this 106-character image name:

r2ddoi-3a10-2e7910-2fdvn-2f6zxagt-2f3yrryjec19b07b80bf8eeb95f669a51f64efb7f647f91cf1b1f6ccbef736396ba936ef

Since we're in the namespace of the doi, collision probability is super low. We truncate to the short hash in git. So maybe truncate this hash, or use a natively shorter hash function like:

hashlib.blake2s(self.url.encode(), digest_size=10).hexdigest()

(blake2 is in hashlib.algorithms_guaranteed since 3.6, I think)
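
To put concrete numbers on the length question (illustrative snippet; the URL is just the file example already used in this thread):

import hashlib

url = "https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ"
len(hashlib.sha256(url.encode()).hexdigest())                   # 64 hex characters
len(hashlib.blake2s(url.encode(), digest_size=10).hexdigest())  # 20 hex characters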

Collaborator Author

ooooh, fixing the tests seems the right thing to do! I'll take a peek.

Collaborator Author

hey, it looks like I already fixed the tests, so it's all fine now! Back to using persistent_id as the identifier, but this time it's 'proper' - so if we get a file persistent_id, we resolve it to the dataset persistent id, and use that. so if multiple different folks try to use different files from the same dataset, it will lead to cache reuse now!
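
Roughly what that means in practice — a hypothetical sketch with a placeholder host and placeholder file DOIs (so not runnable as-is), using the parse_dataverse_url method from the diff above:

from repo2docker.contentproviders.dataverse import Dataverse

provider = Dataverse()
# Two different files from the same (placeholder) dataset...
file_url_1 = "https://dataverse.example.org/file.xhtml?persistentId=doi:10.1234/EXAMPLE/FILE1"
file_url_2 = "https://dataverse.example.org/file.xhtml?persistentId=doi:10.1234/EXAMPLE/FILE2"
# ...both resolve to the parent dataset's persistent_id, so either spec
# builds (and later reuses) the same cached image.
dataset_id_1, _ = provider.parse_dataverse_url(file_url_1)
dataset_id_2, _ = provider.parse_dataverse_url(file_url_2)
assert dataset_id_1 == dataset_id_2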

Member

Great! All looks right to me, then.

35 changes: 21 additions & 14 deletions repo2docker/contentproviders/doi.py
@@ -46,21 +46,28 @@ def doi2url(self, doi):
# Transform a DOI to a URL
# If not a doi, assume we have a URL and return
if is_doi(doi):
doi = normalize_doi(doi)

try:
resp = self._request(f"https://doi.org/{doi}")
resp.raise_for_status()
except HTTPError as e:
# If the DOI doesn't exist, just return URL
if e.response.status_code == 404:
return doi
# Reraise any other errors because if the DOI service is down (or
# we hit a rate limit) we don't want to silently continue to the
# default Git provider as this leads to a misleading error.
self.log.error(f"DOI {doi} does not resolve: {e}")
normalized_doi = normalize_doi(doi)

# Use the doi.org resolver API
# documented at https://www.doi.org/the-identifier/resources/factsheets/doi-resolution-documentation#5-proxy-server-rest-api
req_url = f"https://doi.org/api/handles/{normalized_doi}"
resp = self._request(req_url)
if resp.status_code == 404:
# Not a doi, return what we were passed in
return doi
elif resp.status_code == 200:
data = resp.json()
# Pick the first URL we find from the doi response
for v in data["values"]:
if v["type"] == "URL":
return v["data"]["value"]

# No URLs found for this doi, what do we do?
self.log.error("DOI {normalized_doi} doesn't point to any URLs")
return doi
else:
# If we get any other status codes, raise error
raise
return resp.url
else:
# Just return what is actually just a URL
return doi
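
For reference, the resolution flow the reworked doi2url follows, as a standalone sketch — it uses requests directly rather than the provider's _request helper, and the values/URL response shape is the one the diff parses from the doi.org proxy REST API; the function name and example output are illustrative:

import requests

def resolve_doi(doi):
    # Ask the doi.org proxy REST API where this handle points.
    resp = requests.get(f"https://doi.org/api/handles/{doi}")
    if resp.status_code == 404:
        return doi  # not a registered DOI; hand back what we were given
    resp.raise_for_status()
    for value in resp.json()["values"]:
        if value["type"] == "URL":
            return value["data"]["value"]
    return doi  # registered, but no URL record attached

resolve_doi("10.7910/DVN/TJCLKP")
# e.g. "https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP"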