Add support for Yoda repositories #100

Open · wants to merge 2 commits into main
2 changes: 1 addition & 1 deletion README.md
@@ -10,7 +10,7 @@ Datahugger is a tool to download scientific datasets, software, and code from a

Datahugger offers support for more than [<!-- count -->377<!-- count --> generic and specific (scientific) repositories](https://j535d165.github.io/datahugger/repositories) (and more to come!).

[![Datahugger support Zenodo, Dataverse, DataOne, GitHub, FigShare, HuggingFace, Mendeley Data, Dryad, OSF, and many more](https://github.com/J535D165/datahugger/raw/main/docs/images/logos.png)](https://j535d165.github.io/datahugger/repositories)
[![Datahugger support Zenodo, Dataverse, DataOne, GitHub, FigShare, HuggingFace, Mendeley Data, Dryad, OSF, Yoda, and many more](https://github.com/J535D165/datahugger/raw/main/docs/images/logos.png)](https://j535d165.github.io/datahugger/repositories)

We are still expanding Datahugger with support for more repositories. You can
help by [requesting support for a repository](https://github.com/J535D165/datahugger/issues/new/choose) in the issue
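Not part of the diff, but for reviewers who want to try the new support: a minimal sketch using Datahugger's documented `datahugger.get(resource, output_folder)` entry point against the Yoda DOI used in the new test at the end of this PR.

```python
import datahugger

# Download the Utrecht University Yoda data package used in the new test
# (https://doi.org/10.24416/uu01-hesuhs) into a local "data" folder.
datahugger.get("https://doi.org/10.24416/uu01-hesuhs", "data")
```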
3 changes: 3 additions & 0 deletions datahugger/config.py
@@ -13,6 +13,7 @@
from datahugger.services import OSFDataset
from datahugger.services import PangaeaDataset
from datahugger.services import SeaNoeDataset
from datahugger.services import YodaDataset
from datahugger.services import ZenodoDataset

# fast lookup
@@ -102,6 +103,8 @@
"mxrdr.icm.edu.pl": DataverseDataset,
"osnadata.ub.uni-osnabrueck.de": DataverseDataset,
"planetary-data-portal.org": DataverseDataset,
"publication.yoda.vu.nl": YodaDataset,
"public.yoda.uu.nl": YodaDataset,
"qdr.syr.edu": DataverseDataset,
"rdm.aau.edu.et": DataverseDataset,
"rdr.kuleuven.be": DataverseDataset,
Expand Down
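For context: this table maps repository hostnames to downloader classes, so any URL on `public.yoda.uu.nl` or `publication.yoda.vu.nl` now dispatches to `YodaDataset`. A rough sketch of that kind of host-based lookup follows; the `resolve_downloader` helper and the string values are hypothetical and only illustrate the idea, not datahugger's actual resolver.

```python
from urllib.parse import urlparse

# Hypothetical excerpt of a hostname -> downloader table in the spirit of config.py.
SERVICES = {
    "publication.yoda.vu.nl": "YodaDataset",
    "public.yoda.uu.nl": "YodaDataset",
    "qdr.syr.edu": "DataverseDataset",
}


def resolve_downloader(url):
    """Return the downloader registered for the URL's hostname (hypothetical helper)."""
    host = urlparse(url).netloc
    if host not in SERVICES:
        raise ValueError(f"No downloader registered for host '{host}'")
    return SERVICES[host]


print(resolve_downloader("https://public.yoda.uu.nl/"))  # YodaDataset
```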
116 changes: 116 additions & 0 deletions datahugger/services.py
@@ -1,13 +1,19 @@
import io
import os
import re
import tempfile
import xml.etree.ElementTree as ET
import zipfile
from pathlib import Path
from typing import Union
from urllib.parse import quote
from urllib.parse import unquote
from urllib.parse import urljoin
from urllib.parse import urlparse

import requests
import requests_cache
from bs4 import BeautifulSoup
from jsonpath_ng.jsonpath import Fields
from jsonpath_ng.jsonpath import Slice

@@ -445,3 +451,113 @@ class B2shareDataset(DatasetDownloader):
    ATTR_SIZE_JSONPATH = "size"
    ATTR_HASH_JSONPATH = "checksum"
    ATTR_HASH_TYPE_VALUE = "md5"


class YodaDataset(DatasetDownloader):
    """Downloader for Yoda repositories."""

    def _get_contents_url(self):
        """Resolve the root folder of the contents of a Yoda data package."""
        url_to_use = (
            self.resource if isinstance(self.resource, str) else self.resource.resolve()
        )
        res = requests.get(url_to_use)
        res.raise_for_status()
        soup = BeautifulSoup(res.content, "html.parser")
        contents_link = soup.find("a", string=re.compile(r"View contents"))
        return contents_link.get("href") + "/original" if contents_link else None

    @property
    def files(self):
        if not hasattr(self, "_files"):
            # Install a requests-cache backed by a temporary file while the folder
            # tree is walked, and remove that file once the list has been built.
            self._requests_cache_file = tempfile.NamedTemporaryFile(delete=False)
            requests_cache.install_cache(self._requests_cache_file.name)
            self._files = self._harvest_files()
            self._cleanup_requests_cache()
        return self._files

    def _cleanup_requests_cache(self):
        if hasattr(self, "_requests_cache_file"):
            if os.path.isfile(self._requests_cache_file.name):
                self._requests_cache_file.close()
                os.unlink(self._requests_cache_file.name)

    def _get_collection_name_from_folder_url(self, base_url, folder_url):
        result = folder_url.replace(base_url, "", 1)
        return result[1:] if result.startswith("/") else result

    def _get_full_url(self, base_url, relative_collection, object_name):
        result = base_url
        if relative_collection != "":
            result = urljoin(
                result if result.endswith("/") else result + "/", relative_collection
            )
        result = urljoin(result if result.endswith("/") else result + "/", object_name)
        return result

    def _get_relative_path(self, collection_name, dataobject_name):
        separator = (
            "/" if collection_name != "" and not collection_name.endswith("/") else ""
        )
        return f"{collection_name}{separator}{dataobject_name}"

    def _harvest_files(self):
        contents_url = self._get_contents_url()
        if contents_url is None:
            raise ValueError(
                "Data package contents link not found. "
                + "This can happen if the Yoda data package is not open access.\n"
            )

        # Breadth-first traversal of the data package: every folder page is parsed
        # for data objects (files) and subcollections (folders still to visit).
        folders_to_process = [contents_url]
        files_to_download = []

        while folders_to_process:
            folder = folders_to_process.pop(0)

            res = requests.get(folder)
            res.raise_for_status()
            soup = BeautifulSoup(res.content, "html.parser")

            collection_name = self._get_collection_name_from_folder_url(
                contents_url, folder
            )

            # Files listed in this folder.
            data_object_parts = soup.find_all("tr", class_="data-object")
            data_objects = [
                a["href"]
                for data_object_part in data_object_parts
                for a in data_object_part.find_all("a")
            ]
            for data_object in data_objects:
                data_object_url = self._get_full_url(
                    contents_url, collection_name, data_object
                )
                data_object_relative_path = unquote(
                    self._get_relative_path(collection_name, data_object)
                )
                files_to_download.append(
                    {
                        "link": data_object_url,
                        "name": data_object_relative_path,
                        "size": None,
                        "hash": None,
                        "hash_type": None,
                    }
                )

            # Subfolders listed in this folder, queued for a later pass.
            collection_parts = soup.find_all("tr", class_="collection")
            collections = [
                a["href"]
                for collection_part in collection_parts
                for a in collection_part.find_all("a")
            ]
            for collection in collections:
                subcollection_url = self._get_full_url(
                    contents_url, collection_name, collection
                )
                folders_to_process.append(subcollection_url)

        return files_to_download
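The dictionaries returned by `_harvest_files` follow the same shape as the other downloaders in `services.py`: `link`, `name`, `size`, `hash`, and `hash_type`, with size and checksum left as `None` because the Yoda listing pages do not expose them. Purely as an illustration of how such a list can be consumed (this is not datahugger's own download path), a short sketch:

```python
from pathlib import Path

import requests


def download_harvested_files(entries, output_folder):
    """Sketch: download entries shaped like the dicts built by _harvest_files."""
    for entry in entries:
        target = Path(output_folder) / entry["name"]
        target.parent.mkdir(parents=True, exist_ok=True)
        with requests.get(entry["link"], stream=True) as res:
            res.raise_for_status()
            with open(target, "wb") as fh:
                for chunk in res.iter_content(chunk_size=1024 * 1024):
                    fh.write(chunk)
```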
Binary file modified docs/images/logos.png
7 changes: 7 additions & 0 deletions docs/repositories.md
@@ -49,6 +49,13 @@ DataOne software is supported by Datahugger.
- [Rolling Deck to Repository (R2R)](https://rvdata.us)
- [SEAD](https://sead-published.ncsa.illinois.edu)

### Yoda repositories

The following Yoda repositories are supported:

- [Utrecht University Yoda repository](https://public.yoda.uu.nl)
- [Vrije Universiteit Amsterdam Yoda repository](https://publication.yoda.vu.nl)

### DataVerse repositories

See [https://dataverse.org/institutions](https://dataverse.org/institutions) and [DataVerse on Re3data.org](https://www.re3data.org/search?query=&software%5B%5D=DataVerse) for an overview of DataVerse repositories.
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -15,7 +15,7 @@ classifiers = [
    "Programming Language :: Python :: 3.12"
]
license = {text = "MIT"}
dependencies = ["jsonpath_ng", "requests", "requests-cache", "scitree", "tqdm"]
dependencies = ["jsonpath_ng", "requests", "requests-cache", "scitree", "tqdm", "beautifulsoup4"]
dynamic = ["version"]
requires-python = ">=3.8"

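The only new dependency is `beautifulsoup4`: Yoda data packages are harvested by parsing HTML folder listings rather than a JSON API. A self-contained sketch of the kind of parsing involved; the HTML fragment is made up, but the `data-object` and `collection` row classes match the code in `services.py` above.

```python
from bs4 import BeautifulSoup

# Made-up fragment shaped like a Yoda folder listing page.
html = """
<table>
  <tr class="data-object"><td><a href="readme.txt">readme.txt</a></td></tr>
  <tr class="collection"><td><a href="raw">raw</a></td></tr>
</table>
"""

soup = BeautifulSoup(html, "html.parser")
files = [a["href"] for row in soup.find_all("tr", class_="data-object") for a in row.find_all("a")]
folders = [a["href"] for row in soup.find_all("tr", class_="collection") for a in row.find_all("a")]
print(files, folders)  # ['readme.txt'] ['raw']
```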
4 changes: 4 additions & 0 deletions tests/test_repositories.toml
@@ -125,3 +125,7 @@ files = "consolidation-wattzhub-schema-irve-statique-20240220-152202.csv"
[[seanoe]]
location = "https://doi.org/10.17882/101042"
files = "111609.xlsx"

[[yoda]]
location = "https://doi.org/10.24416/uu01-hesuhs"
files = "University policy framework for research data Utrecht University - January 2016.pdf"