Skip to content

Commit dae2727

Browse files
MarkusKonkWolfgang Maier
andauthored
add data source tool for NFDI4Earth's OneStop4All (#187)
* add data source tool for NFDI's OneStop4All * Update tools/nfdi4earth_os4a_importer/nfdi4earth_os4a_importer.xml Co-authored-by: Wolfgang Maier <maierw@posteo.de> * Update tools/nfdi4earth_os4a_importer/nfdi4earth_os4a_importer.xml Co-authored-by: Wolfgang Maier <maierw@posteo.de> --------- Co-authored-by: Wolfgang Maier <maierw@posteo.de>
1 parent b3d67b0 commit dae2727

3 files changed

Lines changed: 145 additions & 0 deletions

File tree

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
categories:
- Ecology
owner: ecology
remote_repository_url: https://git.rwth-aachen.de/nfdi4earth/onestop4all/onestop4all-implementation/-/tree/develop
homepage_url: https://onestop4all.nfdi4earth.de/
long_description: |
  A data source tool for downloading datasets via NFDI4Earth's OneStop4All search user interface.
type: unrestricted
auto_tool_repositories:
  name_template: "{{ tool_id }}"
  description_template: "Data source tool for content provided in NFDI4Earth's OneStop4All: {{ tool_name }}."
Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
#!/usr/bin/env python
2+
# Retrieves data from external data source applications and
3+
# stores in a dataset file.
4+
#
5+
# Data source application parameters are temporarily stored
6+
# in the dataset file.
7+
import json
8+
import os
9+
import sys
10+
from urllib.parse import urlencode, urlparse
11+
from urllib.request import urlopen
12+
13+
from galaxy.datatypes import sniff
14+
from galaxy.datatypes.registry import Registry
15+
from galaxy.util import (
16+
DEFAULT_SOCKET_TIMEOUT,
17+
get_charset_from_http_headers,
18+
stream_to_open_named_file,
19+
)
20+
21+
# Prefix Galaxy uses to namespace per-output request parameters,
# e.g. "GALAXY|<out_data_name>|URL".
GALAXY_PARAM_PREFIX = "GALAXY"

# Paths derived from this script's location on disk (two levels up is the
# assumed Galaxy root); the job-config values supplied in the request are
# what main() actually uses.
GALAXY_ROOT_DIR = os.path.realpath(os.path.join(os.path.dirname(__file__), os.pardir, os.pardir))
GALAXY_DATATYPES_CONF_FILE = os.path.join(GALAXY_ROOT_DIR, "datatypes_conf.xml")
28+
29+
30+
def main():
    """Fetch remote data for a Galaxy data-source job.

    Invoked as: ``data_source.py <json_params_file> [<max_file_size>]``

    Reads the JSON job parameters written by Galaxy, downloads the URL(s)
    the remote data-source application sent back, streams each response
    into the corresponding output dataset file, sniffs the datatype, and
    records the detected extension in the tool-provided metadata file.

    Exits via ``sys.exit(message)`` (non-zero status) on any fetch or
    sniffing error.
    """
    # Optional second CLI argument caps the download size; 0 disables the check.
    if len(sys.argv) >= 3:
        max_file_size = int(sys.argv[2])
    else:
        max_file_size = 0

    # Galaxy serializes the job's parameters into a JSON file (argv[1]).
    with open(sys.argv[1]) as fh:
        params = json.load(fh)

    out_data_name = params["output_data"][0]["out_data_name"]

    # Default URL / HTTP method sent by the remote application; per-output
    # overrides may appear under "GALAXY|<out_data_name>|URL" (see loop below).
    URL = params["param_dict"].get("URL", None)
    URL_method = params["param_dict"].get("URL_method", "get")

    datatypes_registry = Registry()
    datatypes_registry.load_datatypes(
        root_dir=params["job_config"]["GALAXY_ROOT_DIR"],
        config=params["job_config"]["GALAXY_DATATYPES_CONF_FILE"],
    )

    for data_dict in params["output_data"]:
        cur_filename = data_dict["file_name"]
        cur_URL = params["param_dict"].get(
            "%s|%s|URL" % (GALAXY_PARAM_PREFIX, data_dict["out_data_name"]), URL
        )
        if not cur_URL or urlparse(cur_URL).scheme not in ("http", "https",
                                                           "ftp"):
            # Create an empty dataset file so Galaxy has something to
            # finalize, then abort with an explanatory message.
            # FIX: context manager instead of a leaked file handle
            # (was: open(cur_filename, "w").write("")).
            with open(cur_filename, "w"):
                pass
            sys.exit(
                "The remote data source application has not sent "
                "back a URL parameter in the request."
            )

        try:
            if URL_method == "get":
                page = urlopen(cur_URL, timeout=DEFAULT_SOCKET_TIMEOUT)
            elif URL_method == "post":
                param_dict = params["param_dict"]
                page = urlopen(
                    cur_URL,
                    urlencode(param_dict["incoming_request_params"]).encode(
                        "utf-8"
                    ),
                    timeout=DEFAULT_SOCKET_TIMEOUT,
                )
            else:
                # FIX: an unrecognized method previously left `page` unbound
                # and crashed with NameError further down; fail explicitly.
                # (SystemExit is a BaseException, so the handler below does
                # not swallow it.)
                sys.exit("Unsupported URL method: %s" % URL_method)
        except Exception as e:
            sys.exit(
                "The remote data source application may "
                "be off line, please try again later. Error: %s"
                % str(e)
            )

        if max_file_size:
            # Best-effort size guard based on the advertised Content-Length
            # (0, i.e. unchecked, when the header is absent).
            file_size = int(page.info().get("Content-Length", 0))
            if file_size > max_file_size:
                # FIX: added the missing space between "maximum" and
                # "allowed" in the concatenated message.
                sys.exit(
                    "The requested data size (%d bytes) exceeds the maximum "
                    "allowed size (%d bytes) on this server."
                    % (file_size, max_file_size)
                )

        try:
            # Stream the response body into the dataset file, honoring any
            # charset advertised in the HTTP headers.
            cur_filename = stream_to_open_named_file(
                page,
                os.open(
                    cur_filename,
                    os.O_WRONLY | os.O_TRUNC | os.O_CREAT
                ),
                cur_filename,
                source_encoding=get_charset_from_http_headers(page.headers),
            )
        except Exception as e:
            sys.exit("Unable to fetch %s:\n%s" % (cur_URL, e))

        try:
            # Let Galaxy sniff/normalize the downloaded file and report its
            # datatype extension.
            ext = sniff.handle_uploaded_dataset_file(
                cur_filename, datatypes_registry, ext=data_dict["ext"]
            )
        except Exception as e:
            sys.exit(str(e))

    # NOTE(review): only the first output's name is recorded, with the ext
    # from the last loop iteration — fine for the single-output tool this
    # script ships with; confirm before reusing with multiple outputs.
    tool_provided_metadata = {out_data_name: {"ext": ext}}

    with open(
        params["job_config"]["TOOL_PROVIDED_JOB_METADATA_FILE"], "w"
    ) as json_file:
        json.dump(tool_provided_metadata, json_file)
116+
117+
118+
# Script entry point: run only when executed directly, not when imported.
if __name__ == "__main__":
    main()
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
<tool id="nfdi4earth_os4a" name="NFDI4Earth OneStop4All Importer" tool_type="data_source" version="1.0" profile="20.09">
    <description>downloads content via NFDI4Earth's OS4A search user interface</description>
    <command><![CDATA[
python '$__tool_directory__/data_source.py' '$output' $__app__.config.output_size_limit
    ]]></command>
    <inputs action="https://onestop4all.nfdi4earth.de/search" check_values="false" method="get">
        <param name="GALAXY_URL" type="baseurl" value="/tool_runner" />
        <param name="tool_id" type="hidden" value="nfdi4earth_os4a" />
        <param name="sendToGalaxy" type="hidden" value="1" />
    </inputs>
    <outputs>
        <data name="output" format="auto" label="OneStop4All Resource"/>
    </outputs>
    <options sanitize="False" refresh="True"/>
</tool>

0 commit comments

Comments
 (0)