Add support for Yoda repositories #100
base: main
@@ -1,13 +1,19 @@
import io
import os
import re
import tempfile
import xml.etree.ElementTree as ET
import zipfile
from pathlib import Path
from typing import Union
from urllib.parse import quote
from urllib.parse import unquote
from urllib.parse import urljoin
from urllib.parse import urlparse

import requests
import requests_cache
from bs4 import BeautifulSoup
from jsonpath_ng.jsonpath import Fields
from jsonpath_ng.jsonpath import Slice
@@ -445,3 +451,113 @@ class B2shareDataset(DatasetDownloader):
    ATTR_SIZE_JSONPATH = "size"
    ATTR_HASH_JSONPATH = "checksum"
    ATTR_HASH_TYPE_VALUE = "md5"


class YodaDataset(DatasetDownloader):
    """Downloader for Yoda repositories."""

    def _get_contents_url(self):
        """Resolve the root folder of the contents of a Yoda data package."""
        url_to_use = (
            self.resource if isinstance(self.resource, str) else self.resource.resolve()
        )
        res = requests.get(url_to_use)
        res.raise_for_status()
        soup = BeautifulSoup(res.content, "html.parser")
        contents_link = soup.find("a", string=re.compile(r"View contents"))
        return contents_link.get("href") + "/original" if contents_link else None

    @property
    def files(self):
        if not hasattr(self, "_files"):
            self._requests_cache_file = tempfile.NamedTemporaryFile(delete=False)
            requests_cache.install_cache(self._requests_cache_file.name)
            self._files = self._harvest_files()
            self._cleanup_requests_cache()
        return self._files
Comment on lines +472 to +477

I wonder why you cache this. It makes sense, but is there a reason to do this for Yoda specifically? Or should we implement this feature for all services in a generic way?
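A minimal sketch of what lifting the cache into the shared base class could look like; it assumes every downloader harvests its file list through a _harvest_files method (not confirmed here for the other services), and the try/finally is added so the temporary cache file is always removed:

import os
import tempfile

import requests_cache


class DatasetDownloader:
    @property
    def files(self):
        # Harvest once, behind a temporary requests cache, then memoize.
        if not hasattr(self, "_files"):
            cache_file = tempfile.NamedTemporaryFile(delete=False)
            requests_cache.install_cache(cache_file.name)
            try:
                self._files = self._harvest_files()
            finally:
                # Restore plain requests and remove the temporary cache file.
                requests_cache.uninstall_cache()
                cache_file.close()
                os.unlink(cache_file.name)
        return self._files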
    def _cleanup_requests_cache(self):
        if hasattr(self, "_requests_cache_file"):
            if os.path.isfile(self._requests_cache_file.name):
                self._requests_cache_file.close()
                os.unlink(self._requests_cache_file.name)

    def _get_collection_name_from_folder_url(self, base_url, folder_url):
        result = folder_url.replace(base_url, "", 1)
        return result[1:] if result.startswith("/") else result

    def _get_full_url(self, base_url, relative_collection, object_name):
        result = base_url
        if relative_collection != "":
            result = urljoin(
                result if result.endswith("/") else result + "/", relative_collection
            )
        result = urljoin(result if result.endswith("/") else result + "/", object_name)
        return result

    def _get_relative_path(self, collection_name, dataobject_name):
        separator = (
            "/" if collection_name != "" and not collection_name.endswith("/") else ""
        )
        return f"{collection_name}{separator}{dataobject_name}"

    def _harvest_files(self):
        contents_url = self._get_contents_url()
        if contents_url is None:
            raise ValueError(
                "Data package contents link not found. "
                + "This can happen if the Yoda data package is not open access.\n"
            )

        folders_to_process = [contents_url]
        files_to_download = []

        while True:
I wonder whether the while loop is needed here.
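A minimal sketch of the same breadth-first traversal with the loop condition doing the work instead of an explicit break; crawl_folder is a hypothetical stand-in for the per-folder scraping done in this method, and collections.deque is used only to make popping from the front cheap:

from collections import deque


def harvest(contents_url, crawl_folder):
    # crawl_folder(folder_url) is assumed to return (files, subfolder_urls).
    folders_to_process = deque([contents_url])
    files_to_download = []
    while folders_to_process:  # the queue emptying out ends the loop
        folder = folders_to_process.popleft()
        files, subfolders = crawl_folder(folder)
        files_to_download.extend(files)
        folders_to_process.extend(subfolders)
    return files_to_download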
            if len(folders_to_process) == 0:
                break
            folder = folders_to_process.pop(0)

            res = requests.get(folder)
            res.raise_for_status()
            soup = BeautifulSoup(res.content, "html.parser")

            collection_name = self._get_collection_name_from_folder_url(
                contents_url, folder
            )

            data_object_parts = soup.find_all("tr", class_="data-object")
            data_objects = [
                a["href"]
                for data_object_part in data_object_parts
                for a in data_object_part.find_all("a")
            ]
            for data_object in data_objects:
                data_object_url = self._get_full_url(
                    contents_url, collection_name, data_object
                )
                data_object_relative_path = unquote(
                    self._get_relative_path(collection_name, data_object)
                )
                files_to_download.append(
                    {
                        "link": data_object_url,
                        "name": data_object_relative_path,
                        "size": None,
                        "hash": None,
                        "hash_type": None,
                    }
                )

            collection_parts = soup.find_all("tr", class_="collection")
            collections = [
                a["href"]
                for collection_part in collection_parts
                for a in collection_part.find_all("a")
            ]
            for collection in collections:
                subcollection_url = self._get_full_url(
                    contents_url, collection_name, collection
                )
                folders_to_process.append(subcollection_url)

        return files_to_download
Great!
Wouldn't it be better to select based on a class? The current class is btn btn-primary access-btn view-contents, which appears to be a suitable and stable option for navigating to this URL (e.g. view-contents). We also wouldn't need a regexp then.
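A minimal sketch of what that class-based selection could look like, assuming the link markup quoted above; the standalone function name is hypothetical and mirrors _get_contents_url:

import requests
from bs4 import BeautifulSoup


def get_contents_url(package_url):
    # Fetch the Yoda data package landing page.
    res = requests.get(package_url)
    res.raise_for_status()
    soup = BeautifulSoup(res.content, "html.parser")
    # class_ matches any <a> carrying this class, even when the attribute holds
    # several classes such as "btn btn-primary access-btn view-contents",
    # so no regular expression on the link text is needed.
    contents_link = soup.find("a", class_="view-contents")
    return contents_link.get("href") + "/original" if contents_link else None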