Add support for Yoda repositories #100
base: main
@@ -1,13 +1,19 @@
import io
import os
import re
import tempfile
import xml.etree.ElementTree as ET
import zipfile
from pathlib import Path
from typing import Union
from urllib.parse import quote
from urllib.parse import unquote
from urllib.parse import urljoin
from urllib.parse import urlparse

import requests
import requests_cache
from bs4 import BeautifulSoup
from jsonpath_ng.jsonpath import Fields
from jsonpath_ng.jsonpath import Slice
@@ -445,3 +451,113 @@ class B2shareDataset(DatasetDownloader):
    ATTR_SIZE_JSONPATH = "size"
    ATTR_HASH_JSONPATH = "checksum"
    ATTR_HASH_TYPE_VALUE = "md5"


class YodaDataset(DatasetDownloader):
    """Downloader for Yoda repositories."""

    def _get_contents_url(self):
        """Resolve the root folder of the contents of a Yoda data package."""
        url_to_use = (
            self.resource if isinstance(self.resource, str) else self.resource.resolve()
        )
        res = requests.get(url_to_use)
        res.raise_for_status()
        soup = BeautifulSoup(res.content, "html.parser")
        contents_link = soup.find("a", string=re.compile(r"View contents"))
        return contents_link.get("href") + "/original" if contents_link else None

    @property
    def files(self):
        if not hasattr(self, "_files"):
            self._requests_cache_file = tempfile.NamedTemporaryFile(delete=False)
            requests_cache.install_cache(self._requests_cache_file.name)
            self._files = self._harvest_files()
            self._cleanup_requests_cache()
        return self._files
Comment on lines +472 to +477

I wonder why you cache this. It makes sense, but is there a reason to do this for Yoda specifically? Or should we implement this feature for all services in a generic way?
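A minimal sketch of what lifting the cache into the shared base class could look like; it assumes every downloader harvests its file list through a _harvest_files method (not confirmed here for the other services), and the try/finally is added so the temporary cache file is always removed:

import os
import tempfile

import requests_cache


class DatasetDownloader:
    @property
    def files(self):
        # Harvest once, behind a temporary requests cache, then memoize.
        if not hasattr(self, "_files"):
            cache_file = tempfile.NamedTemporaryFile(delete=False)
            requests_cache.install_cache(cache_file.name)
            try:
                self._files = self._harvest_files()
            finally:
                # Restore plain requests and remove the temporary cache file.
                requests_cache.uninstall_cache()
                cache_file.close()
                os.unlink(cache_file.name)
        return self._files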
    def _cleanup_requests_cache(self):
        if hasattr(self, "_requests_cache_file"):
            if os.path.isfile(self._requests_cache_file.name):
                self._requests_cache_file.close()
                os.unlink(self._requests_cache_file.name)

    def _get_collection_name_from_folder_url(self, base_url, folder_url):
        result = folder_url.replace(base_url, "", 1)
        return result[1:] if result.startswith("/") else result

    def _get_full_url(self, base_url, relative_collection, object_name):
        result = base_url
        if relative_collection != "":
            result = urljoin(
                result if result.endswith("/") else result + "/", relative_collection
            )
        result = urljoin(result if result.endswith("/") else result + "/", object_name)
        return result

    def _get_relative_path(self, collection_name, dataobject_name):
        separator = (
            "/" if collection_name != "" and not collection_name.endswith("/") else ""
        )
        return f"{collection_name}{separator}{dataobject_name}"

    def _harvest_files(self):
        contents_url = self._get_contents_url()
        if contents_url is None:
            raise ValueError(
                "Data package contents link not found. "
                + "This can happen if the Yoda data package is not open access.\n"
            )

        folders_to_process = [contents_url]
        files_to_download = []

        while True:
I wonder whether the while loop is needed here.
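A minimal sketch of the same breadth-first traversal with the loop condition doing the work instead of an explicit break; crawl_folder is a hypothetical stand-in for the per-folder scraping done in this method, and collections.deque is used only to make popping from the front cheap:

from collections import deque


def harvest(contents_url, crawl_folder):
    # crawl_folder(folder_url) is assumed to return (files, subfolder_urls).
    folders_to_process = deque([contents_url])
    files_to_download = []
    while folders_to_process:  # the queue emptying out ends the loop
        folder = folders_to_process.popleft()
        files, subfolders = crawl_folder(folder)
        files_to_download.extend(files)
        folders_to_process.extend(subfolders)
    return files_to_download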
            if len(folders_to_process) == 0:
                break
            folder = folders_to_process.pop(0)

            res = requests.get(folder)
            res.raise_for_status()
            soup = BeautifulSoup(res.content, "html.parser")

            collection_name = self._get_collection_name_from_folder_url(
                contents_url, folder
            )

            data_object_parts = soup.find_all("tr", class_="data-object")
            data_objects = [
                a["href"]
                for data_object_part in data_object_parts
                for a in data_object_part.find_all("a")
            ]
            for data_object in data_objects:
                data_object_url = self._get_full_url(
                    contents_url, collection_name, data_object
                )
                data_object_relative_path = unquote(
                    self._get_relative_path(collection_name, data_object)
                )
                files_to_download.append(
                    {
                        "link": data_object_url,
                        "name": data_object_relative_path,
                        "size": None,
                        "hash": None,
                        "hash_type": None,
                    }
                )

            collection_parts = soup.find_all("tr", class_="collection")
            collections = [
                a["href"]
                for collection_part in collection_parts
                for a in collection_part.find_all("a")
            ]
            for collection in collections:
                subcollection_url = self._get_full_url(
                    contents_url, collection_name, collection
                )
                folders_to_process.append(subcollection_url)

        return files_to_download
Great!
Wouldn't it be better to select based on a class? The current class is btn btn-primary access-btn view-contents, which appears to be a suitable and stable option for navigating to this URL (e.g. view-contents). We also wouldn't need a regexp then.
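A minimal sketch of what that class-based selection could look like, assuming the link markup quoted above; the standalone function name is hypothetical and mirrors _get_contents_url:

import requests
from bs4 import BeautifulSoup


def get_contents_url(package_url):
    # Fetch the Yoda data package landing page.
    res = requests.get(package_url)
    res.raise_for_status()
    soup = BeautifulSoup(res.content, "html.parser")
    # class_ matches any <a> carrying this class, even when the attribute holds
    # several classes such as "btn btn-primary access-btn view-contents",
    # so no regular expression on the link text is needed.
    contents_link = soup.find("a", class_="view-contents")
    return contents_link.get("href") + "/original" if contents_link else None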