From 177fb99fbca54d66db302156cba70a40c7370410 Mon Sep 17 00:00:00 2001 From: downiec <42552189+downiec@users.noreply.github.com> Date: Mon, 3 Nov 2025 13:14:18 -0800 Subject: [PATCH 1/5] Initial integration of wget functions. Settings and config still need to be resolved --- backend/config/settings/site_specific.py | 46 ++- .../metagrid/api_proxy/tests/test_views.py | 22 +- backend/metagrid/api_proxy/views.py | 212 +++++++++- .../metagrid/api_proxy/wget/query_utils.py | 295 ++++++++++++++ .../api_proxy/wget/wget-simple-template.sh | 11 + .../metagrid/api_proxy/wget/wget-template.sh | 384 ++++++++++++++++++ backend/requirements/base.txt | 4 +- docker-compose.yml | 1 + .../configurable_environment_variables.md | 76 +++- 9 files changed, 1029 insertions(+), 22 deletions(-) create mode 100644 backend/metagrid/api_proxy/wget/query_utils.py create mode 100644 backend/metagrid/api_proxy/wget/wget-simple-template.sh create mode 100644 backend/metagrid/api_proxy/wget/wget-template.sh diff --git a/backend/config/settings/site_specific.py b/backend/config/settings/site_specific.py index 065cabc18..6bc122368 100644 --- a/backend/config/settings/site_specific.py +++ b/backend/config/settings/site_specific.py @@ -24,9 +24,49 @@ class MetagridBackendSettings(BaseSettings): examples=["https://api.stac.esgf-west.org/"], ) - WGET_URL: str = Field( - description="The URL at which the ESG-Search wget endpoint can be reached.", - examples=["https://esgf-node.llnl.gov/esg-search/wget"], + # Expand the number of fields allowed for wget API payloads (Django's DATA_UPLOAD_MAX_NUMBER_FIELDS) + DATA_UPLOAD_MAX_NUMBER_FIELDS: int = Field( + default=1024, + description="Maximum number of form fields allowed in a single upload. Useful for large wget payloads.", + examples=[1024], + ) + + # === wget related settings === + ESGF_SOLR_URL: Optional[str] = Field( + default=None, + description="Address of the ESGF Solr endpoint used by the wget helper logic.", + examples=["https://esgf-node.llnl.gov/esg-search/solr"], + ) + + ESGF_SOLR_SHARDS_XML: Optional[str] = Field( + default=None, + description="Path to the XML file containing Solr shards configuration used to resolve mirrors/shards.", + examples=["/etc/metagrid/solr_shards.xml"], + ) + + ESGF_ALLOWED_PROJECTS_JSON: Optional[str] = Field( + default=None, + description="Path to a JSON file that lists allowed projects for wget/dataset access checks.", + examples=["/etc/metagrid/wget_allowed_projects.json"], + ) + + WGET_SCRIPT_FILE_DEFAULT_LIMIT: int = Field( + default=1000, + description="Default limit on the number of files allowed in a generated wget script.", + examples=[1000], + ) + + WGET_SCRIPT_FILE_MAX_LIMIT: int = Field( + default=100000, + description="Maximum number of files allowed in a generated wget script.", + examples=[100000], + ) + + # Maximum length for facet values used in the wget directory structure + WGET_MAX_DIR_LENGTH: int = Field( + default=50, + description="Maximum character length for facet values when creating directory names for wget downloads.", + examples=[50], ) KEYCLOAK_CLIENT_ID: str = Field( diff --git a/backend/metagrid/api_proxy/tests/test_views.py b/backend/metagrid/api_proxy/tests/test_views.py index f9e4849a5..8e873a4d1 100644 --- a/backend/metagrid/api_proxy/tests/test_views.py +++ b/backend/metagrid/api_proxy/tests/test_views.py @@ -79,17 +79,17 @@ def test_globus_auth_logout(self): response = self.client.get(url) self.assertEqual(response.status_code, status.HTTP_302_FOUND) - @responses.activate - def test_wget(self): - url = reverse("do-wget") - responses.get(settings.WGET_URL) - response = self.client.get( - url, - { - "dataset_id": "CMIP6.CMIP.IPSL.IPSL-CM6A-LR.abrupt-4xCO2.r12i1p1f1.Amon.n2oglobal.gr.v20191003|esgf-data1.llnl.gov" - }, - ) - assert response.status_code == status.HTTP_200_OK + # @responses.activate + # def test_wget(self): + # url = reverse("do-wget") + # responses.get(settings.WGET_URL) + # response = self.client.get( + # url, + # { + # "dataset_id": "CMIP6.CMIP.IPSL.IPSL-CM6A-LR.abrupt-4xCO2.r12i1p1f1.Amon.n2oglobal.gr.v20191003|esgf-data1.llnl.gov" + # }, + # ) + # assert response.status_code == status.HTTP_200_OK @responses.activate def test_search(self): diff --git a/backend/metagrid/api_proxy/views.py b/backend/metagrid/api_proxy/views.py index f3b8df904..3a75b2252 100644 --- a/backend/metagrid/api_proxy/views.py +++ b/backend/metagrid/api_proxy/views.py @@ -1,4 +1,6 @@ +import datetime import json +import os from urllib.parse import urlparse import globus_sdk @@ -6,9 +8,10 @@ from django.conf import settings from django.contrib.auth import logout from django.http import HttpResponse, HttpResponseBadRequest, JsonResponse -from django.shortcuts import redirect +from django.shortcuts import redirect, render from django.views.decorators.csrf import csrf_exempt from django.views.decorators.http import require_http_methods +from esgcet.globus_query import ESGGlobusQuery from globus_portal_framework.gclients import load_transfer_client from rest_framework.decorators import api_view, permission_classes from rest_framework.response import Response @@ -16,6 +19,15 @@ from config.settings.site_specific import MetagridFrontendSettings +from .wget.query_utils import ( # get_allowed_projects_from_json, + CORE_QUERY_FIELDS, + FIELD_WGET_EMPTYPATH, + FIELD_WGET_PATH, + KEYWORDS, + SIMPLE, + UNSUPPORTED_FIELDS, +) + @api_view() @permission_classes([]) @@ -154,8 +166,202 @@ def do_status(request): @require_http_methods(["GET", "POST"]) @csrf_exempt -def do_wget(request): - return do_request(request, settings.WGET_URL, True) +def do_wget(request): # noqa: C901 + + file_limit = settings.WGET_SCRIPT_FILE_DEFAULT_LIMIT + # file_offset = 0 + # use_sort = False + # use_distrib = True + # requested_shards = [] + wget_path_facets = [] + wget_empty_path = "" + script_template_file = "wget-template.sh" + + # allowed_projects = get_allowed_projects_from_json() + + # querys = [] + # file_query = ["type:File"] + + # Gather dataset_ids and other parameters + if request.method == "POST": + url_params = json.loads(request.body) + print("POST url_params:", url_params) + elif request.method == "GET": + url_params = request.GET.copy() + print("GET url_params:", url_params) + else: + return HttpResponseBadRequest("Request method must be POST or GET.") + + # If no parameters were passed to the API, + # then default to limit=1 and distrib=false + if len(url_params.keys()) == 0: + url_params.update(dict(limit=1, distrib="false")) + + print("PART 1") + + # Catch invalid parameters + for param in url_params.keys(): + if param[-1] == "!": + param = param[:-1] + if param not in KEYWORDS and param not in CORE_QUERY_FIELDS: + msg = "Invalid HTTP query parameter=%s" % param + return HttpResponseBadRequest(msg) + + print("PART 2") + + # Catch unsupported fields + for uf in UNSUPPORTED_FIELDS: + if url_params.get(uf): + msg = "Unsupported parameter: %s" % uf + return HttpResponseBadRequest(msg) + + print("PART 3") + + # Create a simplified script that only runs wget on a list of files + if url_params.get(SIMPLE): + use_simple_param = url_params.pop(SIMPLE)[0].lower() + if use_simple_param == "false": + script_template_file = "wget-template.sh" + elif use_simple_param == "true": + script_template_file = "wget-simple-template.sh" + else: + msg = 'Parameter "%s" must be set to true or false.' % SIMPLE + return HttpResponseBadRequest(msg) + + print("PART 4") + + # Get directory structure for downloaded files + if url_params.get(FIELD_WGET_PATH): + wget_path_facets = url_params.pop(FIELD_WGET_PATH)[0].split(",") + + if url_params.get(FIELD_WGET_EMPTYPATH): + wget_empty_path = url_params.pop(FIELD_WGET_EMPTYPATH)[0] + + # Get facets for the file name, URL, checksum + # file_attribute_set = set(["title", "url", "checksum_type", "checksum"]) + + # Get facets for the download directory structure, + # and remove duplicate facets + # file_attributes = list(file_attribute_set) + + print("Requested URL params:", url_params) + + # Fetch files for the query + file_list = {} + dsid = url_params.get("dataset_id", "") + if "," in dsid: + dsid = dsid.split(",") + + print("PART 5") + + print(f"DEBUG: {dsid} ") + try: + qo = ESGGlobusQuery(settings.SOCIAL_AUTH_GLOBUS_KEY, "") + res = qo.query_file_records(dsid, wget=True) # , crit=url_params) + print("res:", res) + except PermissionError as e: + # Configuration or filesystem permission issue (e.g. missing shards/allowed-projects files) + print("PermissionError while accessing ESGGlobusQuery files:", e) + return HttpResponseBadRequest( + "Server configuration error: unable to access required ESGF helper files." + ) + except Exception as e: + # Generic fallback to avoid unhandled exceptions bubbling up + print("Error while querying ESGF metadata via ESGGlobusQuery:", e) + return HttpResponseBadRequest(f"Error querying ESGF metadata: {e}") + num_files = len(res) + + print(f"DEBUG: Number of files found: {num_files}") + + for file_info in res: + filename = file_info["title"] + checksum_type = file_info["checksum_type"][0] + checksum = file_info["checksum"][0] + # Create directory structure from facet values + # If the facet is not found, then use the empty path value + dir_struct = [] + for facet in wget_path_facets: + facet_value = wget_empty_path + if facet in file_info: + if isinstance(file_info[facet], list): + facet_value = file_info[facet][0] + else: + facet_value = file_info[facet] + # Prevent strange values while generating names + facet_value = facet_value.replace("['<>?*\"\n\t\r\0]", "") + facet_value = facet_value.replace("[ /\\\\|:;]+", "_") + # Limit length of value to WGET_MAX_DIR_LENGTH + if len(facet_value) > settings.WGET_MAX_DIR_LENGTH: + facet_value = facet_value[: settings.WGET_MAX_DIR_LENGTH] + dir_struct.append(facet_value) + dir_struct.append(filename) + file_path = os.path.join(*dir_struct) + # Only add a file to the list if its file path is not already present + if file_path not in file_list: + for url in file_info["url"]: + url_split = url.split("|") + if url_split[2] == "HTTPServer": + file_entry = dict( + url=url_split[0], + checksum_type=checksum_type, + checksum=checksum, + ) + file_list[file_path] = file_entry + break + + print("Part 6") + + # Limit the number of files to the maximum + warning_message = None + if num_files == 0: + return HttpResponse("No files found for datasets.") + elif num_files > file_limit: + warning_message = ( + "Warning! The total number of files was {} " + "but this script will only process {}.".format( + num_files, file_limit + ) + ) + + print("PART 7") + + # Warning message about files that were skipped + # to prevent overwriting similarly-named files. + skip_msg = ( + "There were files with the same name which were requested " + "to be download to the same directory. To avoid overwriting " + "the previous downloaded one they were skipped.\n" + "Please use the parameter 'download_structure' " + "to set up unique directories for them." + ) + if min(num_files, file_limit) > len(file_list): + if warning_message: + warning_message = "{}\n{}".format(warning_message, skip_msg) + else: + warning_message = skip_msg + + print("PART 8") + + # Build wget script + current_datetime = datetime.datetime.now() + timestamp = current_datetime.strftime("%Y/%m/%d %H:%M:%S") + + context = dict( + timestamp=timestamp, + url_params=[dsid], + files=file_list, + warning_message=warning_message, + ) + + print("PART 9") + wget_script = render(request, script_template_file, context) + + script_filename = current_datetime.strftime("wget-%Y%m%d%H%M%S.sh") + response_content = "attachment; filename={}".format(script_filename) + + response = HttpResponse(wget_script, content_type="text/x-sh") + response["Content-Disposition"] = response_content + return response def do_post(request, urlbase): diff --git a/backend/metagrid/api_proxy/wget/query_utils.py b/backend/metagrid/api_proxy/wget/query_utils.py new file mode 100644 index 000000000..95d7f92df --- /dev/null +++ b/backend/metagrid/api_proxy/wget/query_utils.py @@ -0,0 +1,295 @@ +import csv +import json +import os +import urllib.parse +import urllib.request +import xml.etree.ElementTree as ET +from io import StringIO + +from django.conf import settings + +# reserved query keywords +OFFSET = "offset" +LIMIT = "limit" +QUERY = "query" +DISTRIB = "distrib" +SHARDS = "shards" +FROM = "from" +TO = "to" +SORT = "sort" +SIMPLE = "simple" + +KEYWORDS = [OFFSET, LIMIT, QUERY, DISTRIB, SHARDS, FROM, TO, SORT, SIMPLE] + +# standard metadata fields, always included for each result (if available) +FIELD_ID = "id" +FIELD_TYPE = "type" +FIELD_REPLICA = "replica" +FIELD_RETRACTED = "retracted" +FIELD_LATEST = "latest" +FIELD_MASTER_ID = "master_id" +FIELD_INSTANCE_ID = "instance_id" +FIELD_DRS_ID = "drs_id" +FIELD_TITLE = "title" +FIELD_DESCRIPTION = "description" +FIELD_TIMESTAMP = "timestamp" +FIELD_TIMESTAMP_ = "_timestamp" +FIELD_URL = "url" +FIELD_ACCESS = "access" +FIELD_XLINK = "xlink" +FIELD_SIZE = "size" +FIELD_DATASET_ID = "dataset_id" +FIELD_TRACKING_ID = "tracking_id" +FIELD_VERSION = "version" +FIELD_VERSION_ = "_version_" +FIELD_MAX_VERSION = "max_version" +FIELD_MIN_VERSION = "min_version" +FIELD_SCORE = "score" +FIELD_UNITS = "units" + +FIELD_CHECKSUM = "checksum" +FIELD_CHECKSUM_TYPE = "checksum_type" +FIELD_INDEX_NODE = "index_node" +FIELD_DATA_NODE = "data_node" +FIELD_NUMBER_OF_FILES = "number_of_files" +FIELD_NUMBER_OF_AGGREGATIONS = "number_of_aggregations" +FIELD_DATASET_ID_TEMPLATE = "dataset_id_template_" +FIELD_DATETIME_START = "datetime_start" +FIELD_DATETIME_STOP = "datetime_stop" +FIELD_TEXT = "text" + +# special query fields for open search geo extension +FIELD_BBOX = "bbox" # west, south, east, north +FIELD_LAT = "lat" +FIELD_LON = "lon" +FIELD_LOCATION = "location" +FIELD_RADIUS = "radius" +FIELD_POLYGON = "polygon" +FIELD_EAST_DEGREES = "east_degrees" +FIELD_WEST_DEGREES = "west_degrees" +FIELD_NORTH_DEGREES = "north_degrees" +FIELD_SOUTH_DEGREES = "south_degrees" +FIELD_HEIGHT_BOTTOM = "height_bottom" +FIELD_HEIGHT_TOP = "height_top" +FIELD_HEIGHT_UNITS = "height_units" +FIELD_VARIABLE_UNITS = "variable_units" +FIELD_GEO = "geo" +FIELD_GEO_UNITS = "geo_units" + +# special query fields for open search time extension +FIELD_START = "start" +FIELD_END = "end" + +# special query fields for the wget scirpt generator +FIELD_WGET_PATH = "download_structure" +FIELD_WGET_EMPTYPATH = "download_emptypath" + +# fields that specify project +FIELD_PROJECT = "project" +FIELD_MIP_ERA = "mip_era" + +# fields that are always allowed in queries, in addition to configured facets +CORE_QUERY_FIELDS = [ + FIELD_ID, + FIELD_TYPE, + FIELD_REPLICA, + FIELD_RETRACTED, + FIELD_LATEST, + FIELD_MASTER_ID, + FIELD_INSTANCE_ID, + FIELD_DRS_ID, + FIELD_TITLE, + FIELD_DESCRIPTION, + FIELD_TIMESTAMP, + FIELD_TIMESTAMP_, + FIELD_URL, + FIELD_XLINK, + FIELD_SIZE, + FIELD_NUMBER_OF_FILES, + FIELD_NUMBER_OF_AGGREGATIONS, + FIELD_DATASET_ID, + FIELD_TRACKING_ID, + FIELD_ACCESS, + FIELD_VERSION, + FIELD_MAX_VERSION, + FIELD_MIN_VERSION, + FIELD_CHECKSUM, + FIELD_CHECKSUM_TYPE, + FIELD_DATA_NODE, + FIELD_INDEX_NODE, + FIELD_BBOX, + FIELD_LAT, + FIELD_LON, + FIELD_RADIUS, + FIELD_POLYGON, + FIELD_START, + FIELD_END, + FIELD_WGET_PATH, + FIELD_WGET_EMPTYPATH, + FIELD_PROJECT, + FIELD_MIP_ERA, +] + +# fields that should NOT be used as facets +NOT_FACETS = [ + FIELD_ID, + FIELD_MASTER_ID, + FIELD_INSTANCE_ID, + FIELD_DATASET_ID, + FIELD_DATASET_ID_TEMPLATE, + FIELD_DRS_ID, + FIELD_DATETIME_START, + FIELD_DATETIME_STOP, + FIELD_EAST_DEGREES, + FIELD_WEST_DEGREES, + FIELD_NORTH_DEGREES, + FIELD_SOUTH_DEGREES, + FIELD_BBOX, + FIELD_LAT, + FIELD_LON, + FIELD_RADIUS, + FIELD_POLYGON, + FIELD_HEIGHT_BOTTOM, + FIELD_HEIGHT_TOP, + FIELD_HEIGHT_UNITS, + FIELD_LATEST, + FIELD_REPLICA, + FIELD_RETRACTED, + FIELD_NUMBER_OF_FILES, + FIELD_NUMBER_OF_AGGREGATIONS, + FIELD_TRACKING_ID, + FIELD_TIMESTAMP, + FIELD_TITLE, + FIELD_DESCRIPTION, + FIELD_URL, + FIELD_XLINK, + FIELD_SIZE, + FIELD_TEXT, + FIELD_TYPE, + FIELD_VARIABLE_UNITS, + FIELD_GEO, + FIELD_GEO_UNITS, + FIELD_TIMESTAMP_, + FIELD_VERSION_, + FIELD_SCORE, + FIELD_UNITS, +] + +# unsupported fields +UNSUPPORTED_FIELDS = [ + FIELD_LAT, + FIELD_LON, + FIELD_LOCATION, + FIELD_RADIUS, + FIELD_POLYGON, +] + +# ID fields +ID_FIELDS = [FIELD_ID, FIELD_DATASET_ID, FIELD_MASTER_ID, FIELD_INSTANCE_ID] + + +def split_value(value): + """ + Utility method to split an HTTP parameter value into comma-separated + values but keep intact patterns such as "CESM1(CAM5.1,FV2) + """ + + # first split by comma + values = [v.strip() for v in value.split(",")] + values_length = len(values) + + if len(values) == 1: # no splitting occurred + return values + else: # possibly re-assemble broken pieces + _values = [] + i = 0 + while i < values_length: + if i < values_length - 1: + if ( + values[i].find("(") >= 0 + and values[i].find(")") < 0 + and values[i + 1].find(")") >= 0 + and values[i + 1].find("(") < 0 + ): + _values.append( + values[i] + "," + values[i + 1] + ) # re-assemble + i += 1 # skip next value + elif ( + values[i].find("[") >= 0 + and values[i].find("]") < 0 + and values[i + 1].find("]") >= 0 + and values[i + 1].find("[") < 0 + ): + _values.append( + values[i] + "," + values[i + 1] + ) # re-assemble + i += 1 # skip next value + elif ( + values[i].find("{") >= 0 + and values[i].find("}") < 0 + and values[i + 1].find("}") >= 0 + and values[i + 1].find("{") < 0 + ): + _values.append( + values[i] + "," + values[i + 1] + ) # re-assemble + i += 1 # skip next value + else: + _values.append(values[i]) + else: + _values.append(values[i]) + i += 1 + + # convert listo into array + return _values + + +def get_solr_shards_from_xml(): + """ + Get Solr shards from the XML file specified in the settings + as ESGF_SOLR_SHARDS_XML + """ + + shard_list = [] + if os.path.isfile(settings.ESGF_SOLR_SHARDS_XML): + tree = ET.parse(settings.ESGF_SOLR_SHARDS_XML) + root = tree.getroot() + for value in root: + shard_list.append(value.text) + return shard_list + + +def get_allowed_projects_from_json(): + """ + Get allowed ESGF projects from the JSON file specified in the settings + as ESGF_ALLOWED_PROJECTS_JSON + """ + + allowed_projects_list = [] + if os.path.isfile(settings.ESGF_ALLOWED_PROJECTS_JSON): + with open(settings.ESGF_ALLOWED_PROJECTS_JSON, "r") as js: + data = json.load(js) + allowed_projects_list = data["allowed_projects"] + return allowed_projects_list + + +def get_facets_from_solr(): + """ + Get valid facets currently used by the dataset Solr. + """ + + query_url = settings.ESGF_SOLR_URL + "/datasets/select" + query_params = dict(q="*:*", wt="csv", rows=0) + + query_encoded = urllib.parse.urlencode(query_params, doseq=True).encode() + req = urllib.request.Request(query_url, query_encoded) + with urllib.request.urlopen(req) as response: + results = StringIO(response.read().decode()) + reader = csv.reader(results, delimiter=",") + facets = next(reader) + + # Remove fields that should NOT be used as facets + _facets = [f for f in facets if f not in NOT_FACETS] + + return _facets diff --git a/backend/metagrid/api_proxy/wget/wget-simple-template.sh b/backend/metagrid/api_proxy/wget/wget-simple-template.sh new file mode 100644 index 000000000..f1e823699 --- /dev/null +++ b/backend/metagrid/api_proxy/wget/wget-simple-template.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +download_files=( +{% spaceless %}{% for filename, file in files.items %}'{{file.url}}' +{% endfor %}{% endspaceless %} +) + +for i in "${download_files[@]}" +do + wget $i +done \ No newline at end of file diff --git a/backend/metagrid/api_proxy/wget/wget-template.sh b/backend/metagrid/api_proxy/wget/wget-template.sh new file mode 100644 index 000000000..87a970dbe --- /dev/null +++ b/backend/metagrid/api_proxy/wget/wget-template.sh @@ -0,0 +1,384 @@ +#!/bin/bash +############################################################################## +# ESGF wget download script +# +# Template version: 0.4 +# Generated by {{request.get_host}} - {{timestamp}} +# Search URL: {{request.build_absolute_uri}} +# Request method: {{request.method}} +# +############################################################################### +# first be sure it's bash... anything out of bash or sh will break +# and the test will assure we are not using sh instead of bash +if [ $BASH ] && [ `basename $BASH` != bash ]; then + echo "######## This is a bash script! ##############" + echo "Change the execution bit 'chmod u+x $0' or start with 'bash $0' instead of sh." + echo "Trying to recover automatically..." + sleep 1 + /bin/bash $0 $@ + exit $? +fi + +version=0.4 +CACHE_FILE=.$(basename $0).status +search_url='{{request.build_absolute_uri}}' +request_method='{{request.method}}' +{% spaceless %}{% if url_params|length > 0 %} +url_params=( + {% spaceless %}{% for url_param in url_params %} '{{ url_param }}' +{% endfor %}{% endspaceless %} +) +{% endif %}{% endspaceless %} + +#These are the embedded files to be downloaded +download_files="$(cat < {{'10#${ver2[i]}'}})) + then + return 1 + fi + if (({{'10#${ver1[i]}'}} < {{'10#${ver2[i]}'}})) + then + return 2 + fi + done + return 0 +} + +check_commands() { + #check wget + local MIN_WGET_VERSION=1.10 + vercomp $(wget -V | sed -n 's/^.* \([1-9]\.[0-9.]*\) .*$/\1/p') $MIN_WGET_VERSION + case $? in + 2) #lower + wget -V + echo + echo "** ERROR: wget version is too old. Use version $MIN_WGET_VERSION or greater. **" >&2 + exit 1 + esac +} + +usage() { + echo "Usage: $(basename $0) [flags]" + echo "Flags is one of:" + sed -n '/^while getopts/,/^done/ s/^\([^)]*\)[^#]*#\(.*$\)/\1 \2/p' $0 + echo + echo "This command stores the states of the downloads in .$0.status" +} + +#defaults +debug=0 +clean_work=1 + +#parse flags +while getopts 'F:w:iuUnSpdvqh' OPT; do + case $OPT in + F) input_file="$OPTARG";; # : read input from file instead of the embedded one (use - to read from stdin) + w) output="$OPTARG";; # : Write embedded files into a file and exit + i) insecure=1;; # : set insecure mode, i.e. don't check server certificate + u) update=1;; # : Issue the search again and see if something has changed. + U) update_files=1;; # : Update files from server overwriting local ones (detect with -u) + n) dry_run=1;; # : Don't download any files, just report. + S) skip_checksum=1;; # : Skip file checksum + p) clean_work=0;; # : preserve data that failed checksum + d) verbose=1;debug=1;; # : display debug information + v) verbose=1;; # : be more verbose + q) quiet=1;; # : be less verbose + h) usage && exit 0;; # : displays this help + \?) echo "Unknown option '$OPTARG'" >&2 && usage && exit 1;; + \:) echo "Missing parameter for flag '$OPTARG'" >&2 && usage && exit 1;; + esac +done +shift $(($OPTIND - 1)) + +#setup input as desired by the user +if [[ "$input_file" ]]; then + if [[ "$input_file" == '-' ]]; then + download_files="$(cat)" #read from STDIN + exec 0$output + exit +fi + +#assure we have everything we need +check_commands + +if ((update)); then + echo "Checking the server for changes..." + {% spaceless %}{% if request.method == "POST" and url_params|length > 0 %} + post_data=$(IFS="&" ; echo "${url_params[*]}") + new_wget="$(wget --post-data "$post_data" "$search_url" -qO -)" + {% else %} + new_wget="$(wget "$search_url" -qO -)" + {% endif %}{% endspaceless %} + compare_cmd="grep -vE '^(# Generated by|# Search URL|search_url=)'" + if diff -q <(eval $compare_cmd<<<"$new_wget") <(eval $compare_cmd $0) >/dev/null; then + echo "No changes detected." + else + echo "Wget was changed. Dowloading. (old renamed to $0.old.#N)" + counter=0 + while [[ -f $0.old.$counter ]]; do ((counter++)); done + mv $0 $0.old.$counter + echo "$new_wget" > $0 + fi + exit 0 +fi + +check_chksum() { + local file="$1" + local chk_type=$2 + local chk_value=$3 + local local_chksum=Unknown + + case $chk_type in + md5) local_chksum=$(md5sum_ $file | cut -f1 -d" ");; + sha256) local_chksum=$(sha256sum_ $file|awk '{print $1}'|cut -d ' ' -f1);; + *) echo "Can't verify checksum." && return 0;; + esac + + #verify + ((debug)) && echo "local:$local_chksum vs remote:$chk_value" >&2 + echo $local_chksum +} + +#Our own md5sum function call that takes into account machines that don't have md5sum but do have md5 (i.e. mac os x) +md5sum_() { + hash -r + if type md5sum >& /dev/null; then + echo $(md5sum $@) + else + echo $(md5 $@ | sed -n 's/MD5[ ]*\(.*\)[^=]*=[ ]*\(.*$\)/\2 \1/p') + fi +} + +#Our own sha256sum function call that takes into account machines that don't have sha256sum but do have sha2 (i.e. mac os x) +sha256sum_() { + hash -r + if type sha256sum >& /dev/null; then + echo $(sha256sum $@) + elif type shasum >& /dev/null; then + echo $(shasum -a 256 $@) + else + echo $(sha2 -q -256 $@) + fi +} + +get_mod_time_() { + if ((MACOSX)); then + #on a mac modtime is stat -f %m + echo "$(stat -f %m $@)" + else + #on linux (cygwin) modtime is stat -c %Y + echo "$(stat -c %Y $@)" + fi + return 0; +} + +remove_from_cache() { + local entry="$1" + local tmp_file="$(grep -ve "^$entry" "$CACHE_FILE")" + echo "$tmp_file" > "$CACHE_FILE" + unset cached +} + +download() { + wget="wget ${insecure:+--no-check-certificate} ${quiet:+-q} ${quiet:--v}" + + while read line + do + # read csv here document into proper variables + eval $(awk -F "' '" '{$0=substr($0,2,length($0)-2); $3=tolower($3); print "file=\""$1"\";url=\""$2"\";chksum_type=\""$3"\";chksum=\""$4"\""}' <(echo $line) ) + + #Process the file + echo -n "$file ..." + + #get the cached entry if any. + cached="$(grep -e "^$file" "$CACHE_FILE")" + + #if we have the cache entry but no file, clean it. + if [[ ! -f $file && "$cached" ]]; then + #the file was removed, clean the cache + remove_from_cache "$file" + unset cached + fi + + #check it wasn't modified + if [[ -n "$cached" && "$(get_mod_time_ $file)" == $(echo "$cached" | cut -d ' ' -f2) ]]; then + if [[ "$chksum" == "$(echo "$cached" | cut -d ' ' -f3)" ]]; then + echo "Already downloaded and verified" + continue + elif ((update_files)); then + #user want's to overwrite newer files + rm $file + remove_from_cache "$file" + unset cached + else + #file on server is different from what we have. + echo "WARNING: The remote file was changed (probably a new version is available). Use -U to Update/overwrite" + continue + fi + fi + unset chksum_err_value chksum_err_count + + while : ; do + # (if we had the file size, we could check before trying to complete) + echo "Downloading" + [[ ! -d "$(dirname "$file")" ]] && mkdir -p "$(dirname "$file")" + if ((dry_run)); then + #all important info was already displayed, if in dry_run mode just abort + #No status will be stored + break + else + $wget -O "$file" $url || { failed=1; break; } + fi + + #check if file is there + if [[ -f $file ]]; then + ((debug)) && echo file found + if ((skip_checksum)); then + echo "Skipping check of file checksum" + break + fi + if [[ ! "$chksum" ]]; then + echo "Checksum not provided, can't verify file integrity" + break + fi + result_chksum=$(check_chksum "$file" $chksum_type $chksum) + if [[ "$result_chksum" != "$chksum" ]]; then + echo " $chksum_type failed!" + if ((clean_work)); then + if !((chksum_err_count)); then + chksum_err_value=$result_chksum + chksum_err_count=2 + elif ((checksum_err_count--)); then + if [[ "$result_chksum" != "$chksum_err_value" ]]; then + #this is a real transmission problem + chksum_err_value=$result_chksum + chksum_err_count=2 + fi + else + #ok if here we keep getting the same "different" checksum + echo "The file returns always a different checksum!" + echo "Contact the data owner to verify what is happening." + echo + sleep 1 + break + fi + + rm $file + #try again + echo -n " re-trying..." + continue + else + echo " don't use -p or remove manually." + fi + else + echo " $chksum_type ok. done!" + echo "$file" $(get_mod_time_ "$file") $chksum >> $CACHE_FILE + fi + fi + #done! + break + done + + if ((failed)); then + echo "download failed" + unset failed + fi + + done <<<"$download_files" + +} + +dedup_cache_() { + local file=${1:-${CACHE_FILE}} + ((debug)) && echo "dedup'ing cache ${file} ..." + local tmp=$(LC_ALL='C' sort -r -k1,2 $file | awk '!($1 in a) {a[$1];print $0}' | sort -k2,2) + ((DEBUG)) && echo "$tmp" + echo "$tmp" > $file + ((debug)) && echo "(cache dedup'ed)" +} + +#do we have old results? Create the file if not +[ ! -f $CACHE_FILE ] && echo "#filename mtime checksum" > $CACHE_FILE && chmod 666 $CACHE_FILE + +# +# MAIN +# + +echo "Running $(basename $0) version: $version" +((verbose)) && echo "we use other tools in here, don't try to user their proposed 'options' directly" +echo "Use $(basename $0) -h for help."$'\n' + +cat <<'EOF-MESSAGE' +{% if warning_message %}{{ warning_message|safe }} +{% endif %}Script created for {{ files|length }} file(s) +(The count won't match if you manually edit this file!) + +EOF-MESSAGE +sleep 1 + +check_os + +download + +dedup_cache_ + +echo "done" \ No newline at end of file diff --git a/backend/requirements/base.txt b/backend/requirements/base.txt index dbff42c98..cd4630940 100644 --- a/backend/requirements/base.txt +++ b/backend/requirements/base.txt @@ -34,7 +34,9 @@ pre-commit==3.7.1 # https://github.com/pre-commit/pre-commit # ------------------------------------------------------------------------------ django-model-utils==4.5.1 # https://github.com/jazzband/django-model-utils django_unique_upload==0.2.1 # https://github.com/agconti/django-unique-upload -globus-sdk==3.46.0 +globus-sdk==3.65.0 # +esgcet==5.3.5 # +globus-cli==3.39.0 # # SBOM # ------------------------------------------------------------------------------ diff --git a/docker-compose.yml b/docker-compose.yml index 2ab49f048..31adf5295 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -57,6 +57,7 @@ services: volumes: - ./backend:/app environment: + WGET_SCRIPT_FILE_DEFAULT_LIMIT: 1000 PGHOST: postgres PGPASSWORD: postgres PGUSER: postgres diff --git a/docs/docs/users/configurable_environment_variables.md b/docs/docs/users/configurable_environment_variables.md index 7c88f5e29..5556911a2 100644 --- a/docs/docs/users/configurable_environment_variables.md +++ b/docs/docs/users/configurable_environment_variables.md @@ -21,14 +21,82 @@ > > `https://api.stac.esgf-west.org/` -#### `METAGRID_WGET_URL` +#### `METAGRID_DATA_UPLOAD_MAX_NUMBER_FIELDS` -> !!! example "**Required**" -> The URL at which the ESG-Search wget endpoint can be reached. +> !!! example "*Optional*" +> __Default:__ `1024` +> +> Maximum number of form fields allowed in a single upload. Useful for large wget payloads. +> +> __Example Values__ +> +> `1024` + +#### `METAGRID_ESGF_SOLR_URL` + +> !!! example "*Optional*" +> __Default:__ `None` +> +> Address of the ESGF Solr endpoint used by the wget helper logic. +> +> __Example Values__ +> +> `https://esgf-node.llnl.gov/esg-search/solr` + +#### `METAGRID_ESGF_SOLR_SHARDS_XML` + +> !!! example "*Optional*" +> __Default:__ `None` +> +> Path to the XML file containing Solr shards configuration used to resolve mirrors/shards. +> +> __Example Values__ +> +> `/etc/metagrid/solr_shards.xml` + +#### `METAGRID_ESGF_ALLOWED_PROJECTS_JSON` + +> !!! example "*Optional*" +> __Default:__ `None` +> +> Path to a JSON file that lists allowed projects for wget/dataset access checks. +> +> __Example Values__ +> +> `/etc/metagrid/wget_allowed_projects.json` + +#### `METAGRID_WGET_SCRIPT_FILE_DEFAULT_LIMIT` + +> !!! example "*Optional*" +> __Default:__ `1000` +> +> Default limit on the number of files allowed in a generated wget script. +> +> __Example Values__ +> +> `1000` + +#### `METAGRID_WGET_SCRIPT_FILE_MAX_LIMIT` + +> !!! example "*Optional*" +> __Default:__ `100000` +> +> Maximum number of files allowed in a generated wget script. +> +> __Example Values__ +> +> `100000` + +#### `METAGRID_WGET_MAX_DIR_LENGTH` + +> !!! example "*Optional*" +> __Default:__ `50` +> +> Maximum character length for facet values when creating directory names for wget downloads. > > __Example Values__ > -> `https://esgf-node.llnl.gov/esg-search/wget` +> `50` #### `METAGRID_KEYCLOAK_CLIENT_ID` From 2c545cf709505e42ddd27cdc6a47314a547b2f9e Mon Sep 17 00:00:00 2001 From: downiec <42552189+downiec@users.noreply.github.com> Date: Tue, 4 Nov 2025 18:07:04 -0800 Subject: [PATCH 2/5] Finished adding files, and fixing issues with the wget downloads, downloads are now working properly. Need to add a few tests to increase test coverage. --- backend/Dockerfile | 4 +- backend/config/settings/site_specific.py | 20 +- backend/config/settings/static.py | 7 +- backend/config/urls.py | 2 +- .../metagrid/api_proxy/tests/test_views.py | 12 - backend/metagrid/api_proxy/views.py | 214 +----------------- backend/metagrid/wget/__init__.py | 0 .../{api_proxy => }/wget/query_utils.py | 60 ----- .../templates}/wget-simple-template.sh | 2 +- .../wget => wget/templates}/wget-template.sh | 24 +- backend/metagrid/wget/tests/__init__.py | 0 backend/metagrid/wget/tests/test_views.py | 107 +++++++++ backend/metagrid/wget/views.py | 184 +++++++++++++++ .../configurable_environment_variables.md | 30 +-- 14 files changed, 321 insertions(+), 345 deletions(-) create mode 100644 backend/metagrid/wget/__init__.py rename backend/metagrid/{api_proxy => }/wget/query_utils.py (78%) rename backend/metagrid/{api_proxy/wget => wget/templates}/wget-simple-template.sh (97%) rename backend/metagrid/{api_proxy/wget => wget/templates}/wget-template.sh (97%) create mode 100644 backend/metagrid/wget/tests/__init__.py create mode 100644 backend/metagrid/wget/tests/test_views.py create mode 100644 backend/metagrid/wget/views.py diff --git a/backend/Dockerfile b/backend/Dockerfile index cb290aaff..64d303347 100644 --- a/backend/Dockerfile +++ b/backend/Dockerfile @@ -13,8 +13,8 @@ RUN apt-get update \ && apt-get purge -y --auto-remove -o APT::AutoRemove::RecommendsImportant=false \ && rm -rf /var/lib/apt/lists/* -RUN addgroup --system django \ - && adduser --system --ingroup django django +RUN groupadd --system --gid 1000 django && \ + useradd --system --create-home --uid 1000 --gid django django # Requirements are installed here to ensure they will be cached. COPY requirements /requirements diff --git a/backend/config/settings/site_specific.py b/backend/config/settings/site_specific.py index 6bc122368..6cc86acdf 100644 --- a/backend/config/settings/site_specific.py +++ b/backend/config/settings/site_specific.py @@ -32,22 +32,10 @@ class MetagridBackendSettings(BaseSettings): ) # === wget related settings === - ESGF_SOLR_URL: Optional[str] = Field( - default=None, - description="Address of the ESGF Solr endpoint used by the wget helper logic.", - examples=["https://esgf-node.llnl.gov/esg-search/solr"], - ) - - ESGF_SOLR_SHARDS_XML: Optional[str] = Field( - default=None, - description="Path to the XML file containing Solr shards configuration used to resolve mirrors/shards.", - examples=["/etc/metagrid/solr_shards.xml"], - ) - - ESGF_ALLOWED_PROJECTS_JSON: Optional[str] = Field( - default=None, - description="Path to a JSON file that lists allowed projects for wget/dataset access checks.", - examples=["/etc/metagrid/wget_allowed_projects.json"], + GLOBUS_PUBLIC_INDEX_ENDPOINT_ID: str = Field( + default="a8ef4320-9e5a-4793-837b-c45161ca1845", + description="The Globus index ID for the public ESGF2 data.", + examples=["a8ef4320-9e5a-4793-837b-c45161ca1845"], ) WGET_SCRIPT_FILE_DEFAULT_LIMIT: int = Field( diff --git a/backend/config/settings/static.py b/backend/config/settings/static.py index c6062c168..b1d397aa8 100644 --- a/backend/config/settings/static.py +++ b/backend/config/settings/static.py @@ -9,6 +9,8 @@ environ.Path(__file__) - 3 ) # (config/settings/django.py - 3 = metagrid/) +TEMPLATE_DIR = ROOT_DIR("metagrid", "wget", "templates") + class DjangoStaticSettings(BaseSettings): """Django settings that do not vary by site""" @@ -96,11 +98,12 @@ class DjangoStaticSettings(BaseSettings): "DATABASE_URL": "pgsql:///postgres", } } + STATIC_ROOT: str = ROOT_DIR("staticfiles") # https://docs.djangoproject.com/en/dev/ref/settings/#static-url STATIC_URL: str = "/static/" # https://docs.djangoproject.com/en/dev/ref/contrib/staticfiles/#std:setting-STATICFILES_DIRS - STATICFILES_DIRS: Sequence[str] = [] + STATICFILES_DIRS: Sequence[str] = [TEMPLATE_DIR] # https://docs.djangoproject.com/en/dev/ref/contrib/staticfiles/#staticfiles-finders STATICFILES_FINDERS: Sequence[str] = [ "django.contrib.staticfiles.finders.FileSystemFinder", @@ -121,7 +124,7 @@ class DjangoStaticSettings(BaseSettings): { "BACKEND": "django.template.backends.django.DjangoTemplates", "DIRS": STATICFILES_DIRS, - "APP_DIRS": False, + "APP_DIRS": True, "OPTIONS": { "context_processors": [ "django.template.context_processors.debug", diff --git a/backend/config/urls.py b/backend/config/urls.py index 8c64c5655..d6be01b40 100644 --- a/backend/config/urls.py +++ b/backend/config/urls.py @@ -24,7 +24,6 @@ do_search, do_stac_search, do_status, - do_wget, fetch_stac_aggregations, get_frontend_config, get_temp_storage, @@ -34,6 +33,7 @@ from metagrid.observability.views import liveness, readiness from metagrid.projects.views import ProjectsViewSet from metagrid.users.views import UserCreateViewSet, UserViewSet +from metagrid.wget.views import do_wget router = DefaultRouter() router.register(r"users", UserViewSet) diff --git a/backend/metagrid/api_proxy/tests/test_views.py b/backend/metagrid/api_proxy/tests/test_views.py index 8e873a4d1..dda8347b9 100644 --- a/backend/metagrid/api_proxy/tests/test_views.py +++ b/backend/metagrid/api_proxy/tests/test_views.py @@ -79,18 +79,6 @@ def test_globus_auth_logout(self): response = self.client.get(url) self.assertEqual(response.status_code, status.HTTP_302_FOUND) - # @responses.activate - # def test_wget(self): - # url = reverse("do-wget") - # responses.get(settings.WGET_URL) - # response = self.client.get( - # url, - # { - # "dataset_id": "CMIP6.CMIP.IPSL.IPSL-CM6A-LR.abrupt-4xCO2.r12i1p1f1.Amon.n2oglobal.gr.v20191003|esgf-data1.llnl.gov" - # }, - # ) - # assert response.status_code == status.HTTP_200_OK - @responses.activate def test_search(self): url = reverse("do-search") diff --git a/backend/metagrid/api_proxy/views.py b/backend/metagrid/api_proxy/views.py index 3a75b2252..d4ddf6f80 100644 --- a/backend/metagrid/api_proxy/views.py +++ b/backend/metagrid/api_proxy/views.py @@ -1,6 +1,4 @@ -import datetime import json -import os from urllib.parse import urlparse import globus_sdk @@ -8,10 +6,9 @@ from django.conf import settings from django.contrib.auth import logout from django.http import HttpResponse, HttpResponseBadRequest, JsonResponse -from django.shortcuts import redirect, render +from django.shortcuts import redirect from django.views.decorators.csrf import csrf_exempt from django.views.decorators.http import require_http_methods -from esgcet.globus_query import ESGGlobusQuery from globus_portal_framework.gclients import load_transfer_client from rest_framework.decorators import api_view, permission_classes from rest_framework.response import Response @@ -19,15 +16,6 @@ from config.settings.site_specific import MetagridFrontendSettings -from .wget.query_utils import ( # get_allowed_projects_from_json, - CORE_QUERY_FIELDS, - FIELD_WGET_EMPTYPATH, - FIELD_WGET_PATH, - KEYWORDS, - SIMPLE, - UNSUPPORTED_FIELDS, -) - @api_view() @permission_classes([]) @@ -164,206 +152,6 @@ def do_status(request): return HttpResponseBadRequest(resp.text) -@require_http_methods(["GET", "POST"]) -@csrf_exempt -def do_wget(request): # noqa: C901 - - file_limit = settings.WGET_SCRIPT_FILE_DEFAULT_LIMIT - # file_offset = 0 - # use_sort = False - # use_distrib = True - # requested_shards = [] - wget_path_facets = [] - wget_empty_path = "" - script_template_file = "wget-template.sh" - - # allowed_projects = get_allowed_projects_from_json() - - # querys = [] - # file_query = ["type:File"] - - # Gather dataset_ids and other parameters - if request.method == "POST": - url_params = json.loads(request.body) - print("POST url_params:", url_params) - elif request.method == "GET": - url_params = request.GET.copy() - print("GET url_params:", url_params) - else: - return HttpResponseBadRequest("Request method must be POST or GET.") - - # If no parameters were passed to the API, - # then default to limit=1 and distrib=false - if len(url_params.keys()) == 0: - url_params.update(dict(limit=1, distrib="false")) - - print("PART 1") - - # Catch invalid parameters - for param in url_params.keys(): - if param[-1] == "!": - param = param[:-1] - if param not in KEYWORDS and param not in CORE_QUERY_FIELDS: - msg = "Invalid HTTP query parameter=%s" % param - return HttpResponseBadRequest(msg) - - print("PART 2") - - # Catch unsupported fields - for uf in UNSUPPORTED_FIELDS: - if url_params.get(uf): - msg = "Unsupported parameter: %s" % uf - return HttpResponseBadRequest(msg) - - print("PART 3") - - # Create a simplified script that only runs wget on a list of files - if url_params.get(SIMPLE): - use_simple_param = url_params.pop(SIMPLE)[0].lower() - if use_simple_param == "false": - script_template_file = "wget-template.sh" - elif use_simple_param == "true": - script_template_file = "wget-simple-template.sh" - else: - msg = 'Parameter "%s" must be set to true or false.' % SIMPLE - return HttpResponseBadRequest(msg) - - print("PART 4") - - # Get directory structure for downloaded files - if url_params.get(FIELD_WGET_PATH): - wget_path_facets = url_params.pop(FIELD_WGET_PATH)[0].split(",") - - if url_params.get(FIELD_WGET_EMPTYPATH): - wget_empty_path = url_params.pop(FIELD_WGET_EMPTYPATH)[0] - - # Get facets for the file name, URL, checksum - # file_attribute_set = set(["title", "url", "checksum_type", "checksum"]) - - # Get facets for the download directory structure, - # and remove duplicate facets - # file_attributes = list(file_attribute_set) - - print("Requested URL params:", url_params) - - # Fetch files for the query - file_list = {} - dsid = url_params.get("dataset_id", "") - if "," in dsid: - dsid = dsid.split(",") - - print("PART 5") - - print(f"DEBUG: {dsid} ") - try: - qo = ESGGlobusQuery(settings.SOCIAL_AUTH_GLOBUS_KEY, "") - res = qo.query_file_records(dsid, wget=True) # , crit=url_params) - print("res:", res) - except PermissionError as e: - # Configuration or filesystem permission issue (e.g. missing shards/allowed-projects files) - print("PermissionError while accessing ESGGlobusQuery files:", e) - return HttpResponseBadRequest( - "Server configuration error: unable to access required ESGF helper files." - ) - except Exception as e: - # Generic fallback to avoid unhandled exceptions bubbling up - print("Error while querying ESGF metadata via ESGGlobusQuery:", e) - return HttpResponseBadRequest(f"Error querying ESGF metadata: {e}") - num_files = len(res) - - print(f"DEBUG: Number of files found: {num_files}") - - for file_info in res: - filename = file_info["title"] - checksum_type = file_info["checksum_type"][0] - checksum = file_info["checksum"][0] - # Create directory structure from facet values - # If the facet is not found, then use the empty path value - dir_struct = [] - for facet in wget_path_facets: - facet_value = wget_empty_path - if facet in file_info: - if isinstance(file_info[facet], list): - facet_value = file_info[facet][0] - else: - facet_value = file_info[facet] - # Prevent strange values while generating names - facet_value = facet_value.replace("['<>?*\"\n\t\r\0]", "") - facet_value = facet_value.replace("[ /\\\\|:;]+", "_") - # Limit length of value to WGET_MAX_DIR_LENGTH - if len(facet_value) > settings.WGET_MAX_DIR_LENGTH: - facet_value = facet_value[: settings.WGET_MAX_DIR_LENGTH] - dir_struct.append(facet_value) - dir_struct.append(filename) - file_path = os.path.join(*dir_struct) - # Only add a file to the list if its file path is not already present - if file_path not in file_list: - for url in file_info["url"]: - url_split = url.split("|") - if url_split[2] == "HTTPServer": - file_entry = dict( - url=url_split[0], - checksum_type=checksum_type, - checksum=checksum, - ) - file_list[file_path] = file_entry - break - - print("Part 6") - - # Limit the number of files to the maximum - warning_message = None - if num_files == 0: - return HttpResponse("No files found for datasets.") - elif num_files > file_limit: - warning_message = ( - "Warning! The total number of files was {} " - "but this script will only process {}.".format( - num_files, file_limit - ) - ) - - print("PART 7") - - # Warning message about files that were skipped - # to prevent overwriting similarly-named files. - skip_msg = ( - "There were files with the same name which were requested " - "to be download to the same directory. To avoid overwriting " - "the previous downloaded one they were skipped.\n" - "Please use the parameter 'download_structure' " - "to set up unique directories for them." - ) - if min(num_files, file_limit) > len(file_list): - if warning_message: - warning_message = "{}\n{}".format(warning_message, skip_msg) - else: - warning_message = skip_msg - - print("PART 8") - - # Build wget script - current_datetime = datetime.datetime.now() - timestamp = current_datetime.strftime("%Y/%m/%d %H:%M:%S") - - context = dict( - timestamp=timestamp, - url_params=[dsid], - files=file_list, - warning_message=warning_message, - ) - - print("PART 9") - wget_script = render(request, script_template_file, context) - - script_filename = current_datetime.strftime("wget-%Y%m%d%H%M%S.sh") - response_content = "attachment; filename={}".format(script_filename) - - response = HttpResponse(wget_script, content_type="text/x-sh") - response["Content-Disposition"] = response_content - return response - - def do_post(request, urlbase): """Helper function to handle POST requests.""" if request.method != "POST": # pragma: no cover diff --git a/backend/metagrid/wget/__init__.py b/backend/metagrid/wget/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/backend/metagrid/api_proxy/wget/query_utils.py b/backend/metagrid/wget/query_utils.py similarity index 78% rename from backend/metagrid/api_proxy/wget/query_utils.py rename to backend/metagrid/wget/query_utils.py index 95d7f92df..45fed39c3 100644 --- a/backend/metagrid/api_proxy/wget/query_utils.py +++ b/backend/metagrid/wget/query_utils.py @@ -1,13 +1,3 @@ -import csv -import json -import os -import urllib.parse -import urllib.request -import xml.etree.ElementTree as ET -from io import StringIO - -from django.conf import settings - # reserved query keywords OFFSET = "offset" LIMIT = "limit" @@ -243,53 +233,3 @@ def split_value(value): # convert listo into array return _values - - -def get_solr_shards_from_xml(): - """ - Get Solr shards from the XML file specified in the settings - as ESGF_SOLR_SHARDS_XML - """ - - shard_list = [] - if os.path.isfile(settings.ESGF_SOLR_SHARDS_XML): - tree = ET.parse(settings.ESGF_SOLR_SHARDS_XML) - root = tree.getroot() - for value in root: - shard_list.append(value.text) - return shard_list - - -def get_allowed_projects_from_json(): - """ - Get allowed ESGF projects from the JSON file specified in the settings - as ESGF_ALLOWED_PROJECTS_JSON - """ - - allowed_projects_list = [] - if os.path.isfile(settings.ESGF_ALLOWED_PROJECTS_JSON): - with open(settings.ESGF_ALLOWED_PROJECTS_JSON, "r") as js: - data = json.load(js) - allowed_projects_list = data["allowed_projects"] - return allowed_projects_list - - -def get_facets_from_solr(): - """ - Get valid facets currently used by the dataset Solr. - """ - - query_url = settings.ESGF_SOLR_URL + "/datasets/select" - query_params = dict(q="*:*", wt="csv", rows=0) - - query_encoded = urllib.parse.urlencode(query_params, doseq=True).encode() - req = urllib.request.Request(query_url, query_encoded) - with urllib.request.urlopen(req) as response: - results = StringIO(response.read().decode()) - reader = csv.reader(results, delimiter=",") - facets = next(reader) - - # Remove fields that should NOT be used as facets - _facets = [f for f in facets if f not in NOT_FACETS] - - return _facets diff --git a/backend/metagrid/api_proxy/wget/wget-simple-template.sh b/backend/metagrid/wget/templates/wget-simple-template.sh similarity index 97% rename from backend/metagrid/api_proxy/wget/wget-simple-template.sh rename to backend/metagrid/wget/templates/wget-simple-template.sh index f1e823699..704e8a6f3 100644 --- a/backend/metagrid/api_proxy/wget/wget-simple-template.sh +++ b/backend/metagrid/wget/templates/wget-simple-template.sh @@ -8,4 +8,4 @@ download_files=( for i in "${download_files[@]}" do wget $i -done \ No newline at end of file +done diff --git a/backend/metagrid/api_proxy/wget/wget-template.sh b/backend/metagrid/wget/templates/wget-template.sh similarity index 97% rename from backend/metagrid/api_proxy/wget/wget-template.sh rename to backend/metagrid/wget/templates/wget-template.sh index 87a970dbe..2285e96f1 100644 --- a/backend/metagrid/api_proxy/wget/wget-template.sh +++ b/backend/metagrid/wget/templates/wget-template.sh @@ -6,12 +6,12 @@ # Generated by {{request.get_host}} - {{timestamp}} # Search URL: {{request.build_absolute_uri}} # Request method: {{request.method}} -# +# ############################################################################### # first be sure it's bash... anything out of bash or sh will break # and the test will assure we are not using sh instead of bash if [ $BASH ] && [ `basename $BASH` != bash ]; then - echo "######## This is a bash script! ##############" + echo "######## This is a bash script! ##############" echo "Change the execution bit 'chmod u+x $0' or start with 'bash $0' instead of sh." echo "Trying to recover automatically..." sleep 1 @@ -178,7 +178,7 @@ if ((update)); then mv $0 $0.old.$counter echo "$new_wget" > $0 fi - exit 0 + exit 0 fi check_chksum() { @@ -240,7 +240,7 @@ remove_from_cache() { download() { wget="wget ${insecure:+--no-check-certificate} ${quiet:+-q} ${quiet:--v}" - + while read line do # read csv here document into proper variables @@ -251,14 +251,14 @@ download() { #get the cached entry if any. cached="$(grep -e "^$file" "$CACHE_FILE")" - + #if we have the cache entry but no file, clean it. if [[ ! -f $file && "$cached" ]]; then #the file was removed, clean the cache remove_from_cache "$file" unset cached fi - + #check it wasn't modified if [[ -n "$cached" && "$(get_mod_time_ $file)" == $(echo "$cached" | cut -d ' ' -f2) ]]; then if [[ "$chksum" == "$(echo "$cached" | cut -d ' ' -f3)" ]]; then @@ -270,7 +270,7 @@ download() { remove_from_cache "$file" unset cached else - #file on server is different from what we have. + #file on server is different from what we have. echo "WARNING: The remote file was changed (probably a new version is available). Use -U to Update/overwrite" continue fi @@ -286,7 +286,7 @@ download() { #No status will be stored break else - $wget -O "$file" $url || { failed=1; break; } + $wget -O "$file" $url || { failed=1; break; } fi #check if file is there @@ -321,7 +321,7 @@ download() { sleep 1 break fi - + rm $file #try again echo -n " re-trying..." @@ -337,12 +337,12 @@ download() { #done! break done - + if ((failed)); then echo "download failed" unset failed fi - + done <<<"$download_files" } @@ -381,4 +381,4 @@ download dedup_cache_ -echo "done" \ No newline at end of file +echo "done" diff --git a/backend/metagrid/wget/tests/__init__.py b/backend/metagrid/wget/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/backend/metagrid/wget/tests/test_views.py b/backend/metagrid/wget/tests/test_views.py new file mode 100644 index 000000000..af0b59f39 --- /dev/null +++ b/backend/metagrid/wget/tests/test_views.py @@ -0,0 +1,107 @@ +import json +from unittest.mock import patch + +import responses +from django.conf import settings +from django.urls import reverse +from rest_framework import status +from rest_framework.test import APITestCase + + +class TestWgetViewSet(APITestCase): + @responses.activate + def test_wget(self): + url = reverse("do-wget") + response = self.client.get( + url, + { + "dataset_id": "CMIP6.CMIP.IPSL.IPSL-CM6A-LR.abrupt-4xCO2.r12i1p1f1.Amon.n2oglobal.gr.v20191003|esgf-data1.llnl.gov" + }, + ) + assert response.status_code == status.HTTP_200_OK + + def test_wget_invalid_param_returns_400(self): + """Invalid query parameter should return 400 with message.""" + url = reverse("do-wget") + resp = self.client.get(url, {"invalidparam": "value"}) + assert resp.status_code == status.HTTP_400_BAD_REQUEST + assert b"Invalid HTTP query parameter" in resp.content + + def test_wget_unsupported_field_returns_400(self): + """Unsupported field (e.g., lat) should return 400.""" + url = reverse("do-wget") + resp = self.client.get(url, {"lat": "10"}) + assert resp.status_code == status.HTTP_400_BAD_REQUEST + assert b"Unsupported parameter" in resp.content + + def test_wget_simple_invalid_value_returns_400(self): + """Passing an invalid value to 'simple' should return 400.""" + url = reverse("do-wget") + resp = self.client.get(url, {"simple": "notaboolean"}) + assert resp.status_code == status.HTTP_400_BAD_REQUEST + assert b'must be set to true or false' in resp.content + + def test_wget_no_files_found_returns_empty_message(self, monkeypatch): + """When ESGGlobusQuery returns no files, view should respond with 'No files found'.""" + url = reverse("do-wget") + + class DummyQuery: + def __init__(self, *a, **k): + pass + + def query_file_records(self, dsid, wget=True): + return [] # no files + + # Patch the ESGGlobusQuery used by the view to our dummy + monkeypatch.setattr("metagrid.wget.views.ESGGlobusQuery", DummyQuery) + + resp = self.client.get(url, {"dataset_id": "SOMEDATASET"}) + assert resp.status_code == status.HTTP_200_OK + assert b"No files found for datasets." in resp.content + + def test_wget_generates_script_and_sets_warning_when_exceeds_limit(self, monkeypatch): + """When number of files > file_limit ensures script generation path runs and returns attachment.""" + url = reverse("do-wget") + + # Create two fake file_info entries so num_files > limit (we will set limit to 1) + fake_file_info = { + "title": "file1.nc", + "checksum_type": ["md5"], + "checksum": ["deadbeef"], + "url": ["http://example.com/file1.nc|HTTPServer|HTTPServer"], + # include some facet used by directory construction if needed + "activity_id": ["ACT"], + } + fake_file_info_2 = { + "title": "file2.nc", + "checksum_type": ["md5"], + "checksum": ["cafebabe"], + "url": ["http://example.com/file2.nc|HTTPServer|HTTPServer"], + "activity_id": ["ACT"], + } + + class DummyQuery: + def __init__(self, *a, **k): + pass + + def query_file_records(self, dsid, wget=True): + return [fake_file_info, fake_file_info_2] + + # Patch ESGGlobusQuery to return our two files + monkeypatch.setattr("metagrid.wget.views.ESGGlobusQuery", DummyQuery) + + # Reduce the default limit so that num_files > file_limit triggers warning_message logic + monkeypatch.setattr(settings, "WGET_SCRIPT_FILE_DEFAULT_LIMIT", 1) + + # Patch render in the view module so template file location is not required + monkeypatch.setattr("metagrid.wget.views.render", lambda request, tpl, ctx: "GENERATED_SCRIPT") + + # POST expects JSON body according to the view; send JSON payload + payload = {"dataset_id": ["SOMEDATASET"], "download_structure": ["activity_id"], "simple": ["false"]} + resp = self.client.post(url, data=json.dumps(payload), content_type="application/json") + + assert resp.status_code == status.HTTP_200_OK + # Content disposition should indicate an attachment filename + assert "attachment; filename=" in resp["Content-Disposition"] + # The returned body should be the rendered template content we patched + assert resp.content.decode() == "GENERATED_SCRIPT" diff --git a/backend/metagrid/wget/views.py b/backend/metagrid/wget/views.py new file mode 100644 index 000000000..e8b20756c --- /dev/null +++ b/backend/metagrid/wget/views.py @@ -0,0 +1,184 @@ +import datetime +import json +import os + +from django.conf import settings +from django.http import HttpResponse, HttpResponseBadRequest +from django.shortcuts import render +from django.views.decorators.csrf import csrf_exempt +from django.views.decorators.http import require_http_methods +from esgcet.globus_query import ESGGlobusQuery + +from .query_utils import ( # get_allowed_projects_from_json, + CORE_QUERY_FIELDS, + FIELD_WGET_EMPTYPATH, + FIELD_WGET_PATH, + KEYWORDS, + SIMPLE, + UNSUPPORTED_FIELDS, +) + + +@require_http_methods(["GET", "POST"]) +@csrf_exempt +def do_wget(request): # noqa: C901 + + file_limit = settings.WGET_SCRIPT_FILE_DEFAULT_LIMIT + wget_path_facets = [] + wget_empty_path = "" + script_template_file = "wget-template.sh" + + # Gather dataset_ids and other parameters + if request.method == "POST": + url_params = json.loads(request.body) + print("POST url_params:", url_params) + elif request.method == "GET": + url_params = request.GET.copy() + print("GET url_params:", url_params) + else: + return HttpResponseBadRequest("Request method must be POST or GET.") + + # If no parameters were passed to the API, + # then default to limit=1 and distrib=false + if len(url_params.keys()) == 0: + url_params.update(dict(limit=1, distrib="false")) + + # Catch invalid parameters + for param in url_params.keys(): + if param[-1] == "!": + param = param[:-1] + if param not in KEYWORDS and param not in CORE_QUERY_FIELDS: + msg = "Invalid HTTP query parameter=%s" % param + return HttpResponseBadRequest(msg) + + # Catch unsupported fields + for uf in UNSUPPORTED_FIELDS: + if url_params.get(uf): + msg = "Unsupported parameter: %s" % uf + return HttpResponseBadRequest(msg) + + # Create a simplified script that only runs wget on a list of files + if url_params.get(SIMPLE): + use_simple_param = url_params.pop(SIMPLE)[0].lower() + if use_simple_param == "false": + script_template_file = "wget-template.sh" + elif use_simple_param == "true": + script_template_file = "wget-simple-template.sh" + else: + msg = 'Parameter "%s" must be set to true or false.' % SIMPLE + return HttpResponseBadRequest(msg) + + # Get directory structure for downloaded files + if url_params.get(FIELD_WGET_PATH): + wget_path_facets = url_params.pop(FIELD_WGET_PATH)[0].split(",") + + if url_params.get(FIELD_WGET_EMPTYPATH): + wget_empty_path = url_params.pop(FIELD_WGET_EMPTYPATH)[0] + + # Fetch files for the query + file_list = {} + dsid = url_params.get("dataset_id", "") + if "," in dsid: + dsid = dsid.split(",") + + print(f"DEBUG: {dsid} ") + try: + qo = ESGGlobusQuery(settings.GLOBUS_PUBLIC_INDEX_ENDPOINT_ID, "") + res = qo.query_file_records(dsid, wget=True) # , crit=url_params) + print("res:", res) + except PermissionError as e: + # Configuration or filesystem permission issue (e.g. missing shards/allowed-projects files) + print("PermissionError while accessing ESGGlobusQuery files:", e) + return HttpResponseBadRequest( + "Server configuration error: unable to access required ESGF helper files." + ) + except Exception as e: + # Generic fallback to avoid unhandled exceptions bubbling up + print("Error while querying ESGF metadata via ESGGlobusQuery:", e) + return HttpResponseBadRequest(f"Error querying ESGF metadata: {e}") + num_files = len(res) + + print(f"DEBUG: Number of files found: {num_files}") + + for file_info in res: + filename = file_info["title"] + checksum_type = file_info["checksum_type"][0] + checksum = file_info["checksum"][0] + # Create directory structure from facet values + # If the facet is not found, then use the empty path value + dir_struct = [] + for facet in wget_path_facets: + facet_value = wget_empty_path + if facet in file_info: + if isinstance(file_info[facet], list): + facet_value = file_info[facet][0] + else: + facet_value = file_info[facet] + # Prevent strange values while generating names + facet_value = facet_value.replace("['<>?*\"\n\t\r\0]", "") + facet_value = facet_value.replace("[ /\\\\|:;]+", "_") + # Limit length of value to WGET_MAX_DIR_LENGTH + if len(facet_value) > settings.WGET_MAX_DIR_LENGTH: + facet_value = facet_value[: settings.WGET_MAX_DIR_LENGTH] + dir_struct.append(facet_value) + dir_struct.append(filename) + file_path = os.path.join(*dir_struct) + # Only add a file to the list if its file path is not already present + if file_path not in file_list: + for url in file_info["url"]: + url_split = url.split("|") + if url_split[2] == "HTTPServer": + file_entry = dict( + url=url_split[0], + checksum_type=checksum_type, + checksum=checksum, + ) + file_list[file_path] = file_entry + break + + # Limit the number of files to the maximum + warning_message = None + if num_files == 0: + return HttpResponse("No files found for datasets.") + elif num_files > file_limit: + warning_message = ( + "Warning! The total number of files was {} " + "but this script will only process {}.".format( + num_files, file_limit + ) + ) + + # Warning message about files that were skipped + # to prevent overwriting similarly-named files. + skip_msg = ( + "There were files with the same name which were requested " + "to be download to the same directory. To avoid overwriting " + "the previous downloaded one they were skipped.\n" + "Please use the parameter 'download_structure' " + "to set up unique directories for them." + ) + if min(num_files, file_limit) > len(file_list): + if warning_message: + warning_message = "{}\n{}".format(warning_message, skip_msg) + else: + warning_message = skip_msg + + # Build wget script + current_datetime = datetime.datetime.now() + timestamp = current_datetime.strftime("%Y/%m/%d %H:%M:%S") + + context = dict( + timestamp=timestamp, + url_params=[dsid], + files=file_list, + warning_message=warning_message, + ) + + wget_script = render(request, script_template_file, context) + + script_filename = current_datetime.strftime("wget-%Y%m%d%H%M%S.sh") + response_content = "attachment; filename={}".format(script_filename) + + response = HttpResponse(wget_script, content_type="text/x-sh") + response["Content-Disposition"] = response_content + return response diff --git a/docs/docs/users/configurable_environment_variables.md b/docs/docs/users/configurable_environment_variables.md index 5556911a2..d607e8fa0 100644 --- a/docs/docs/users/configurable_environment_variables.md +++ b/docs/docs/users/configurable_environment_variables.md @@ -32,38 +32,16 @@ > > `1024` -#### `METAGRID_ESGF_SOLR_URL` +#### `METAGRID_GLOBUS_PUBLIC_INDEX_ENDPOINT_ID` > !!! example "*Optional*" -> __Default:__ `None` -> -> Address of the ESGF Solr endpoint used by the wget helper logic. -> -> __Example Values__ -> -> `https://esgf-node.llnl.gov/esg-search/solr` - -#### `METAGRID_ESGF_SOLR_SHARDS_XML` - -> !!! example "*Optional*" -> __Default:__ `None` -> -> Path to the XML file containing Solr shards configuration used to resolve mirrors/shards. -> -> __Example Values__ -> -> `/etc/metagrid/solr_shards.xml` - -#### `METAGRID_ESGF_ALLOWED_PROJECTS_JSON` - -> !!! example "*Optional*" -> __Default:__ `None` +> __Default:__ `a8ef4320-9e5a-4793-837b-c45161ca1845` > -> Path to a JSON file that lists allowed projects for wget/dataset access checks. +> The Globus index ID for the public ESGF2 data. > > __Example Values__ > -> `/etc/metagrid/wget_allowed_projects.json` +> `a8ef4320-9e5a-4793-837b-c45161ca1845` #### `METAGRID_WGET_SCRIPT_FILE_DEFAULT_LIMIT` From ac92dcdc00b7e5c94cdb37f704ffb211a03d875e Mon Sep 17 00:00:00 2001 From: downiec <42552189+downiec@users.noreply.github.com> Date: Tue, 4 Nov 2025 19:56:41 -0800 Subject: [PATCH 3/5] Fix some issues with precommit and startup --- .pre-commit-config.yaml | 2 +- backend/Dockerfile | 2 +- backend/config/settings/site_specific.py | 5 +++++ backend/config/settings/static.py | 15 +++++++++------ docker-compose.yml | 2 -- .../users/configurable_environment_variables.md | 11 +++++++++++ helm/deploy/helmfile.cnpg.yaml | 2 +- helm/values.yaml | 2 +- 8 files changed, 29 insertions(+), 12 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b0d852c81..4ffe7375b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -9,7 +9,7 @@ repos: - id: trailing-whitespace - id: end-of-file-fixer - id: check-yaml - exclude: traefik/*.yml|helm/templates/.* + exclude: traefik/*.yml|helm/templates/.*|helm/deploy/.*|helm/values.yaml # Back-end # ------------------------------------------------------------------------------ diff --git a/backend/Dockerfile b/backend/Dockerfile index cb290aaff..c23c01dbd 100644 --- a/backend/Dockerfile +++ b/backend/Dockerfile @@ -1,6 +1,6 @@ FROM python:3.11-slim -ENV PYTHONUNBUFFERED 1 +ENV PYTHONUNBUFFERED=1 RUN apt-get update \ # dependencies for building Python packages diff --git a/backend/config/settings/site_specific.py b/backend/config/settings/site_specific.py index 7ac645a78..7c295b7d5 100644 --- a/backend/config/settings/site_specific.py +++ b/backend/config/settings/site_specific.py @@ -42,6 +42,11 @@ class MetagridBackendSettings(BaseSettings): description="A list of all the people who get code error notifications. When `DEBUG=False` and `AdminEmailHandler` is configured in `LOGGING` (done by default), Django emails these people the details of exceptions raised in the request/response cycle. Each item in the list should be a tuple of (Full name, email address). " "Reference: ", ) + DATABASE_URL: str = Field( + default="postgresql://postgres:postgres@postgres:5432/postgres", + examples=["postgresql://postgres:postgres@postgres:5432/postgres"], + description="The database connection URL for the Metagrid backend database.", + ) SOCIAL_AUTH_GLOBUS_KEY: str = Field( examples=["94c44808-9efd-4236-bffd-1185b1071736"], description="The `Client UUID` obtained by registering a `portal, science gateway, or other application you host` with Globus at ", diff --git a/backend/config/settings/static.py b/backend/config/settings/static.py index e06b167f0..23703e047 100644 --- a/backend/config/settings/static.py +++ b/backend/config/settings/static.py @@ -2,19 +2,22 @@ from typing import Any, Optional, Sequence import environ -from pydantic import BaseModel, Field +from pydantic import Field from pydantic_settings import BaseSettings, SettingsConfigDict env = environ.Env() -ROOT_DIR = ( - environ.Path(__file__) - 3 -) # (config/settings/django.py - 3 = metagrid/) +# project root (config/settings/static.py - 3 = metagrid/) +ROOT_DIR = environ.Path(__file__) - 3 -# parses DATABASE_URL environment variable -DATABASES = env.db_url() +# Parse DATABASE_URL environment variable; default to an in-memory sqlite DB for tests/local runs. +# This prevents "Set the DATABASE_URL environment variable" errors when none is provided. +DATABASES = env.db_url( + default="postgresql://postgres:postgres@postgres:5432/postgres" +) DATABASES.update(ATOMIC_REQUESTS=True) + class DjangoStaticSettings(BaseSettings): """Django settings that do not vary by site""" diff --git a/docker-compose.yml b/docker-compose.yml index 160893d0b..008cd1435 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -57,8 +57,6 @@ services: condition: service_healthy volumes: - ./backend:/app - environment: - DATABASE_URL: postgresql://postgres:postgres@postgres:5432/postgres ports: - "127.0.0.1:5000:5000" diff --git a/docs/docs/users/configurable_environment_variables.md b/docs/docs/users/configurable_environment_variables.md index ae7f19af9..917751306 100644 --- a/docs/docs/users/configurable_environment_variables.md +++ b/docs/docs/users/configurable_environment_variables.md @@ -45,6 +45,17 @@ > > `[('Author', 'downie4@llnl.gov'), ('Author', 'ames4@llnl.gov')]` +#### `METAGRID_DATABASE_URL` + +> !!! example "*Optional*" +> __Default:__ `postgresql://postgres:postgres@postgres:5432/postgres` +> +> The database connection URL for the Metagrid backend database. +> +> __Example Values__ +> +> `postgresql://postgres:postgres@postgres:5432/postgres` + #### `METAGRID_SOCIAL_AUTH_GLOBUS_KEY` > !!! example "**Required**" diff --git a/helm/deploy/helmfile.cnpg.yaml b/helm/deploy/helmfile.cnpg.yaml index 05ca31294..b03b4fa88 100644 --- a/helm/deploy/helmfile.cnpg.yaml +++ b/helm/deploy/helmfile.cnpg.yaml @@ -11,7 +11,7 @@ releases: - name: cnpg namespace: cnpg-system chart: cnpg/cloudnative-pg - version: "0.26.1" + version: "0.26.1" wait: true hooks: # remove cnpg resources diff --git a/helm/values.yaml b/helm/values.yaml index 6623ac902..9a618edd2 100644 --- a/helm/values.yaml +++ b/helm/values.yaml @@ -139,7 +139,7 @@ postgresql: username: postgres password: postgres database: postgres - + imagePullSecrets: [] image: From 57c7e8686b7e9ac2eac867ac33a89ab4ba5af868 Mon Sep 17 00:00:00 2001 From: downiec <42552189+downiec@users.noreply.github.com> Date: Wed, 12 Nov 2025 14:10:18 -0800 Subject: [PATCH 4/5] Updated the feature to allow using a dedicated wget url (as it was before) so that the integrated wget is only active if the WGET_URL setting is None (default). Fixed tests to handle changes, the WGET_URL external test is only run if the WGET_URL is set, otherwise the test is skipped. --- backend/config/settings/site_specific.py | 10 ++++++++-- backend/config/urls.py | 2 +- backend/metagrid/api_proxy/tests/test_views.py | 18 ++++++++++++++++++ backend/metagrid/api_proxy/views.py | 10 ++++++++++ backend/metagrid/wget/tests/test_views.py | 7 ++++++- backend/metagrid/wget/views.py | 2 +- docker-compose.yml | 2 +- .../configurable_environment_variables.md | 15 +++++++++++++-- 8 files changed, 58 insertions(+), 8 deletions(-) diff --git a/backend/config/settings/site_specific.py b/backend/config/settings/site_specific.py index 6e4841628..191506620 100644 --- a/backend/config/settings/site_specific.py +++ b/backend/config/settings/site_specific.py @@ -32,6 +32,12 @@ class MetagridBackendSettings(BaseSettings): ) # === wget related settings === + WGET_URL: Optional[str] = Field( + default=None, + description="(Optional) If set, the URL at which the ESG-Search wget endpoint can be reached. If None (default), the wget download script is generated by an integrated WGET within the Metagrid deployment.", + examples=["https://esgf-node.llnl.gov/esg-search/wget"], + ) + GLOBUS_PUBLIC_INDEX_ENDPOINT_ID: str = Field( default="a8ef4320-9e5a-4793-837b-c45161ca1845", description="The Globus index ID for the public ESGF2 data.", @@ -39,9 +45,9 @@ class MetagridBackendSettings(BaseSettings): ) WGET_SCRIPT_FILE_DEFAULT_LIMIT: int = Field( - default=1000, + default=9999, description="Default limit on the number of files allowed in a generated wget script.", - examples=[1000], + examples=[9999], ) WGET_SCRIPT_FILE_MAX_LIMIT: int = Field( diff --git a/backend/config/urls.py b/backend/config/urls.py index d6be01b40..8c64c5655 100644 --- a/backend/config/urls.py +++ b/backend/config/urls.py @@ -24,6 +24,7 @@ do_search, do_stac_search, do_status, + do_wget, fetch_stac_aggregations, get_frontend_config, get_temp_storage, @@ -33,7 +34,6 @@ from metagrid.observability.views import liveness, readiness from metagrid.projects.views import ProjectsViewSet from metagrid.users.views import UserCreateViewSet, UserViewSet -from metagrid.wget.views import do_wget router = DefaultRouter() router.register(r"users", UserViewSet) diff --git a/backend/metagrid/api_proxy/tests/test_views.py b/backend/metagrid/api_proxy/tests/test_views.py index dda8347b9..1aae552c1 100644 --- a/backend/metagrid/api_proxy/tests/test_views.py +++ b/backend/metagrid/api_proxy/tests/test_views.py @@ -87,6 +87,24 @@ def test_search(self): response = self.client.get(url, postdata) assert response.status_code == status.HTTP_200_OK + @responses.activate + def test_wget(self): + url = reverse("do-wget") + if settings.WGET_URL is None: + # If WGET_URL is None, skip the test + import pytest + + pytest.skip("settings.WGET_URL is not set") + + responses.get(settings.WGET_URL) + response = self.client.get( + url, + { + "dataset_id": "CMIP6.CMIP.IPSL.IPSL-CM6A-LR.abrupt-4xCO2.r12i1p1f1.Amon.n2oglobal.gr.v20191003|esgf-data1.llnl.gov" + }, + ) + assert response.status_code == status.HTTP_200_OK + @responses.activate def test_stac_search(self): url = reverse("do-stac-search") diff --git a/backend/metagrid/api_proxy/views.py b/backend/metagrid/api_proxy/views.py index d4ddf6f80..7eae62bec 100644 --- a/backend/metagrid/api_proxy/views.py +++ b/backend/metagrid/api_proxy/views.py @@ -15,6 +15,7 @@ from rest_framework_simplejwt.tokens import RefreshToken from config.settings.site_specific import MetagridFrontendSettings +from metagrid.wget.views import do_wget_integrated @api_view() @@ -152,6 +153,15 @@ def do_status(request): return HttpResponseBadRequest(resp.text) +@require_http_methods(["GET", "POST"]) +@csrf_exempt +def do_wget(request): + if not settings.WGET_URL: + return do_wget_integrated(request) + + return do_request(request, settings.WGET_URL, True) + + def do_post(request, urlbase): """Helper function to handle POST requests.""" if request.method != "POST": # pragma: no cover diff --git a/backend/metagrid/wget/tests/test_views.py b/backend/metagrid/wget/tests/test_views.py index d1a44e033..2ecc96175 100644 --- a/backend/metagrid/wget/tests/test_views.py +++ b/backend/metagrid/wget/tests/test_views.py @@ -2,14 +2,19 @@ from unittest.mock import patch import responses +from django.conf import settings from django.urls import reverse from rest_framework import status from rest_framework.test import APITestCase class TestWgetViewSet(APITestCase): + def setUp(self): + # Force integrated wget behavior for most tests + settings.WGET_URL = None + @responses.activate - def test_wget(self): + def test_wget_integrated(self): url = reverse("do-wget") response = self.client.get( url, diff --git a/backend/metagrid/wget/views.py b/backend/metagrid/wget/views.py index e8b20756c..ec6d63f72 100644 --- a/backend/metagrid/wget/views.py +++ b/backend/metagrid/wget/views.py @@ -21,7 +21,7 @@ @require_http_methods(["GET", "POST"]) @csrf_exempt -def do_wget(request): # noqa: C901 +def do_wget_integrated(request): # noqa: C901 file_limit = settings.WGET_SCRIPT_FILE_DEFAULT_LIMIT wget_path_facets = [] diff --git a/docker-compose.yml b/docker-compose.yml index a97f1256a..a98b7b2ca 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -57,7 +57,7 @@ services: volumes: - ./backend:/app environment: - WGET_SCRIPT_FILE_DEFAULT_LIMIT: 1000 + WGET_SCRIPT_FILE_DEFAULT_LIMIT: 9999 ports: - '127.0.0.1:5000:5000' diff --git a/docs/docs/users/configurable_environment_variables.md b/docs/docs/users/configurable_environment_variables.md index 7c58b36af..ef32f6250 100644 --- a/docs/docs/users/configurable_environment_variables.md +++ b/docs/docs/users/configurable_environment_variables.md @@ -32,6 +32,17 @@ > > `1024` +#### `METAGRID_WGET_URL` + +> !!! example "*Optional*" +> __Default:__ `None` +> +> (Optional) If set, the URL at which the ESG-Search wget endpoint can be reached. If None (default), the wget download script is generated by an integrated WGET within the Metagrid deployment. +> +> __Example Values__ +> +> `https://esgf-node.llnl.gov/esg-search/wget` + #### `METAGRID_GLOBUS_PUBLIC_INDEX_ENDPOINT_ID` > !!! example "*Optional*" @@ -46,13 +57,13 @@ #### `METAGRID_WGET_SCRIPT_FILE_DEFAULT_LIMIT` > !!! example "*Optional*" -> __Default:__ `1000` +> __Default:__ `9999` > > Default limit on the number of files allowed in a generated wget script. > > __Example Values__ > -> `1000` +> `9999` #### `METAGRID_WGET_SCRIPT_FILE_MAX_LIMIT` From 8e17cd1ebda79163721bb50d77c9c0267b550759 Mon Sep 17 00:00:00 2001 From: downiec <42552189+downiec@users.noreply.github.com> Date: Wed, 12 Nov 2025 22:38:15 -0800 Subject: [PATCH 5/5] Added STAC wget downloads, where a script is automatically generated when STAC items are selected for wget download in the cart. The wget download can handle both stac and non-stac items being downloaded. Added some tests for the newly added functions and updated the wget success messages to account for STAC and non-STAC wget script results. Still need to add wget download for individual STAC records within the search Table. --- frontend/src/api/index.test.ts | 68 ++++++++- frontend/src/api/index.ts | 40 ++--- frontend/src/common/STAC.ts | 35 +++++ frontend/src/common/utils.ts | 16 ++ .../src/components/Globus/DatasetDownload.tsx | 137 ++++++++++++++---- frontend/src/test/mock/fixtures.ts | 31 ++-- 6 files changed, 259 insertions(+), 68 deletions(-) diff --git a/frontend/src/api/index.test.ts b/frontend/src/api/index.test.ts index 5d7119212..3fddbdf50 100644 --- a/frontend/src/api/index.test.ts +++ b/frontend/src/api/index.test.ts @@ -25,7 +25,8 @@ import { startSearchGlobusEndpoints, updateUserCart, } from '.'; -import { STAC_PROJECTS } from '../common/STAC'; +import { STAC_PROJECTS, generateWgetScriptSTAC } from '../common/STAC'; +import { downloadFileForUser } from '../common/utils'; import { ActiveSearchQuery, Pagination, RawCitation, ResultType } from '../components/Search/types'; import { mockConfig } from '../test/jestTestFunctions'; import { @@ -36,17 +37,31 @@ import { projectsFixture, rawCitationFixture, rawNodeStatusFixture, + rawStacAssetFixture, rawUserCartFixture, + stacAssetFixture, userAuthFixture, userInfoFixture, userSearchQueriesFixture, userSearchQueryFixture, } from '../test/mock/fixtures'; import { rest, server } from '../test/mock/server'; -import apiRoutes from './routes'; +import apiRoutes, { HTTPCodeType } from './routes'; const genericNetworkErrorMsg = 'Failed to Connect'; +jest.mock('../common/utils', () => { + // eslint-disable-next-line @typescript-eslint/no-unsafe-assignment + const originalModule = jest.requireActual('../common/utils'); + + // eslint-disable-next-line @typescript-eslint/no-unsafe-return + return { + __esModule: true, + ...originalModule, + downloadFileForUser: jest.fn(), + }; +}); + describe('test fetching user authentication with globus', () => { it('returns user authentication tokens', async () => { const userAuth = await fetchGlobusAuth(); @@ -1027,4 +1042,53 @@ describe('STAC API functions', () => { // status should reflect aggregation failure (fetchSTACSearchResults sets non-200 status) expect(result.status).toBe(500); }); + + it('throws error when STAC search endpoint fails', async () => { + server.use(rest.post(apiRoutes.esgfSearchSTAC.path, (_req, res, ctx) => res(ctx.status(500)))); + + const reqUrl = `${apiRoutes.esgfSearchSTAC.path}?project_id=CMIP6`; + await expect(fetchSearchResults({ reqUrl })).rejects.toThrow( + apiRoutes.esgfSearchSTAC.handleErrorMsg('generic' as HTTPCodeType), + ); + }); + + it('generateWgetScriptSTAC downloads script when assets contain .nc hrefs', () => { + const searchResults = [ + rawStacAssetFixture({ + id: 'bar', + title: 'Bar Title', + assets: { + a1: stacAssetFixture({ id: 'a1', href: 'http://example.com/file1.nc' }), + a2: stacAssetFixture({ id: 'a2', href: 'http://example.com/file2.txt' }), + }, + }), + ]; + + const result = generateWgetScriptSTAC(searchResults); + expect(result).toBe(true); + expect(downloadFileForUser).toHaveBeenCalledTimes(1); + expect(downloadFileForUser).toHaveBeenCalledWith( + expect.stringContaining('wget_stac_script'), + expect.stringContaining('wget http://example.com/file1.nc'), + ); + }); + + it('generateWgetScriptSTAC returns false when no .nc hrefs present', () => { + const searchResults = [ + rawStacAssetFixture({ + id: 'foo', + title: 'Foo Title', + assets: { + a: stacAssetFixture({ id: 'a', href: 'http://example.com/file2.txt' }), + }, + }), + ]; + + const spy = jest.fn(downloadFileForUser).mockImplementation(() => {}); + const result = generateWgetScriptSTAC(searchResults); + expect(result).toBe(false); + expect(spy).not.toHaveBeenCalled(); + + spy.mockRestore(); + }); }); diff --git a/frontend/src/api/index.ts b/frontend/src/api/index.ts index 67336afb5..89343bff3 100644 --- a/frontend/src/api/index.ts +++ b/frontend/src/api/index.ts @@ -29,7 +29,12 @@ import { import { RawUserAuth, RawUserInfo } from '../contexts/types'; import apiRoutes, { ApiRoute, HTTPCodeType } from './routes'; import { GlobusEndpointSearchResults } from '../components/Globus/types'; -import { cachePagination, getCachedPagination, getCachedSearchResults } from '../common/utils'; +import { + cachePagination, + downloadFileForUser, + getCachedPagination, + getCachedSearchResults, +} from '../common/utils'; import { aggregationsToFacetsData, convertSearchParamsIntoStacFilter, @@ -509,7 +514,7 @@ Promise<{ [key: string]: any }> => { return res; }) .catch((error: ResponseError) => { - status = error.cause === 422 ? 422 : 500; + status = error.cause === 422 ? 422 : (error.cause as number) || 500; }); const aggregationsToFacets = aggregationsToFacetsData(aggregations || { aggregations: [] }); @@ -590,11 +595,11 @@ export const fetchSearchResults = async ( }) .catch((error: ResponseError) => { if (error.cause === 422) { - throw new Error(errorMsgBasedOnHTTPStatusCode(error, apiRoutes.esgfSearch), { + throw new Error(errorMsgBasedOnHTTPStatusCode(error, apiRoutes.esgfSearchSTAC), { cause: 422, }); } else { - throw new Error(errorMsgBasedOnHTTPStatusCode(error, apiRoutes.esgfSearch)); + throw new Error(errorMsgBasedOnHTTPStatusCode(error, apiRoutes.esgfSearchSTAC)); } }); } @@ -735,26 +740,6 @@ export const fetchDatasetFiles = async ( }); }; -const returnFileToUser = (fileContent: string): void => { - const d = new Date(); - const fileName = `wget_script_${d.getFullYear()}-${ - d.getMonth() + 1 - }-${d.getDate()}_${d.getHours()}-${d.getMinutes()}-${d.getSeconds()}.sh`; - const downloadLinkNode = document.createElement('a'); - downloadLinkNode.setAttribute( - 'href', - `data:text/plain;charset=utf-8,${encodeURIComponent(fileContent)}`, - ); - downloadLinkNode.setAttribute('download', fileName); - - downloadLinkNode.style.display = 'none'; - document.body.appendChild(downloadLinkNode); - - downloadLinkNode.click(); - - document.body.removeChild(downloadLinkNode); -}; - /** * Performs wget request from the API. * @@ -765,9 +750,14 @@ export const fetchWgetScript = async (ids: string[], filenameVars?: string[]): P query: filenameVars, }; + const d = new Date(); + const fileName = `wget_script_${d.getFullYear()}-${ + d.getMonth() + 1 + }-${d.getDate()}_${d.getHours()}-${d.getMinutes()}-${d.getSeconds()}.sh`; + return axios .post(apiRoutes.wget.path, data) - .then((resp) => returnFileToUser(resp.data as string)) + .then((resp) => downloadFileForUser(fileName, resp.data as string)) .catch((error: ResponseError) => { throw new Error(errorMsgBasedOnHTTPStatusCode(error, apiRoutes.wget)); }); diff --git a/frontend/src/common/STAC.ts b/frontend/src/common/STAC.ts index 58fe344d7..c52841167 100644 --- a/frontend/src/common/STAC.ts +++ b/frontend/src/common/STAC.ts @@ -6,6 +6,7 @@ import { StacAsset, StacAggregations, } from '../components/Search/types'; +import { downloadFileForUser } from './utils'; export const STAC_PROJECTS: RawProject[] = [ { @@ -219,3 +220,37 @@ export const convertSearchParamsIntoStacFilter = ( return undefined; }; + +export const generateWgetScriptSTAC = (searchResults: RawSearchResult[]): boolean => { + const d = new Date(); + const date_string = `${d.getFullYear()}-${d.getMonth() + 1}-${d.getDate()}_${d.getHours()}-${d.getMinutes()}-${d.getSeconds()}`; + const fileName = `wget_stac_script_${date_string}.sh`; + + let script = '#!/bin/bash\n\n'; + script += '##############################################################################\n'; + script += '# ESGF wget download script\n'; + script += '#\n'; + script += `# Generated by Metagrid - ${new Date().toISOString()}\n`; + script += '#\n'; + script += '##############################################################################\n\n'; + + let hrefs = 0; + + searchResults.forEach((result) => { + if (result.assets) { + Object.values(result.assets).forEach((asset) => { + const { href } = asset; + if (href && href.startsWith('http') && href.endsWith('.nc')) { + script += `wget ${href}\n`; + hrefs += 1; + } + }); + } + }); + + if (hrefs > 0) { + downloadFileForUser(fileName, script); + } + + return hrefs > 0; +}; diff --git a/frontend/src/common/utils.ts b/frontend/src/common/utils.ts index 50deba4fd..04e944050 100644 --- a/frontend/src/common/utils.ts +++ b/frontend/src/common/utils.ts @@ -698,3 +698,19 @@ export const saveBannerText = (): void => { sessionStorage.setItem('showBanner', bannerText); } }; + +export const downloadFileForUser = (filename: string, fileContent: string): void => { + const downloadLinkNode = document.createElement('a'); + downloadLinkNode.setAttribute( + 'href', + `data:text/plain;charset=utf-8,${encodeURIComponent(fileContent)}`, + ); + downloadLinkNode.setAttribute('download', filename); + + downloadLinkNode.style.display = 'none'; + document.body.appendChild(downloadLinkNode); + + downloadLinkNode.click(); + + document.body.removeChild(downloadLinkNode); +}; diff --git a/frontend/src/components/Globus/DatasetDownload.tsx b/frontend/src/components/Globus/DatasetDownload.tsx index 2b7d2f16a..d52428392 100644 --- a/frontend/src/components/Globus/DatasetDownload.tsx +++ b/frontend/src/components/Globus/DatasetDownload.tsx @@ -2,6 +2,7 @@ /* eslint-disable @typescript-eslint/no-unsafe-call */ import { DownloadOutlined, QuestionOutlined } from '@ant-design/icons'; import { + Alert, Button, Card, Collapse, @@ -54,6 +55,7 @@ import { manageCollectionsTourTargets, createCollectionsFormTour, } from '../../common/joyrideTutorials/reactJoyrideSteps'; +import { generateWgetScriptSTAC } from '../../common/STAC'; const GLOBUS_REDIRECT_URL = `${window.location.origin}/cart/items`; @@ -173,42 +175,119 @@ const DatasetDownloadForm: React.FC> = () => { } const handleWgetDownload = (): void => { + setDownloadIsLoading(true); + const cleanedSelections = itemSelections.filter((item) => { return item !== undefined && item !== null; }); setItemSelections(cleanedSelections); - const ids = cleanedSelections.map((item) => item.id); - showNotice(messageApi, 'The wget script is generating, please wait momentarily.', { - duration: 3, - type: 'info', - }); - setDownloadIsLoading(true); - fetchWgetScript(ids) - .then(() => { + const stacSelections = cleanedSelections.filter((item) => item.isStac); + const nonStacSelections = cleanedSelections.filter((item) => !item.isStac); + + const STAC_ERROR_MSG = + 'No file links found in selected STAC files, wget script was not generated.'; + const SUCCESS_MSG = 'Wget script downloaded successfully!'; + const STAC_SUCCESS_MSG = 'Wget script for STAC files downloaded successfully!'; + const NON_STAC_SUCCESS_MSG = 'Wget script for non-STAC files downloaded successfully!'; + const BOTH_SUCCESS_MSG = + 'Wget scripts for both STAC and non-STAC files downloaded successfully!'; + + // Generate file for STAC selections + let stacSuccess = false; + + if (stacSelections.length > 0) { + stacSuccess = generateWgetScriptSTAC(stacSelections); + + if (nonStacSelections.length === 0) { setDownloadIsLoading(false); - showNotice(messageApi, 'Wget script downloaded successfully!', { - duration: 4, - type: 'success', + if (stacSuccess) { + showNotice(messageApi, STAC_SUCCESS_MSG, { + duration: 4, + type: 'success', + }); + } else { + showError(messageApi, STAC_ERROR_MSG); + } + } + } + + // Generate file for non-STAC selections + if (nonStacSelections.length > 0) { + const ids = nonStacSelections.map((item) => item.id); + + // Generate wget script for download + showNotice( + messageApi, + `The wget script${nonStacSelections.length > 0 ? 's' : ''} generating, please wait momentarily.`, + { + duration: 3, + type: 'info', + }, + ); + + fetchWgetScript(ids) + .then(() => { + setDownloadIsLoading(false); + let noticeContent: string | React.ReactNode = + stacSelections.length > 0 ? BOTH_SUCCESS_MSG : SUCCESS_MSG; + if (stacSelections.length > 0 && !stacSuccess) { + noticeContent = ( + + Non-STAC: +
+ STAC: +
+ ); + } + + showNotice(messageApi, noticeContent, { + duration: 5, + type: stacSuccess || stacSelections.length === 0 ? 'success' : 'error', + }); + }) + .catch((error: ResponseError) => { + setDownloadIsLoading(false); + const noticeContent: string | React.ReactNode = + stacSelections.length > 0 ? ( + <> + STAC:{' '} + {stacSuccess ? ( + + ) : ( + + )} +
+ Non-STAC: + + ) : ( + error.message + ); + + showError( + messageApi, + + {noticeContent} + , + ); }); - }) - .catch((error: ResponseError) => { - showError( - messageApi, - - {error.message} - , - ); - setDownloadIsLoading(false); - }); + } }; const getCurrentScope = (): string => { diff --git a/frontend/src/test/mock/fixtures.ts b/frontend/src/test/mock/fixtures.ts index e767509d5..44a531954 100644 --- a/frontend/src/test/mock/fixtures.ts +++ b/frontend/src/test/mock/fixtures.ts @@ -20,6 +20,7 @@ import { RawSearchResults, StacResponse, StacAggregations, + StacAsset, } from '../../components/Search/types'; import { RawUserAuth, RawUserInfo } from '../../contexts/types'; import { SubmissionResult } from '../../api'; @@ -397,24 +398,30 @@ export const stacAggregationsFixture = (): StacAggregations => ({ ], }); +export const stacAssetFixture = (props: Partial = {}): StacAsset => { + const defaults: StacAsset = { + id: 'foo', + access: ['public'], + description: 'test', + type: 'image/png', + alternatename: 'alternate_foo', + name: 'foo', + roles: ['data'], + href: 'http://test.com/foo', + 'file:size': 1, + 'file:checksum': 'abc123', + }; + + return { ...defaults, ...props }; +}; + export const rawStacAssetFixture = (props: Partial = {}): RawSearchResult => { const defaults: RawSearchResult = { id: 'foo', access: ['HTTPServer', 'OPENDAP'], url: ['foo.bar|HTTPServer', 'http://test.com/file.nc|OPENDAP'], assets: { - foo: { - id: 'foo', - access: ['public'], - description: 'test', - type: 'image/png', - alternatename: 'alternate_foo', - name: 'foo', - roles: ['data'], - href: 'http://test.com/foo', - 'file:size': 1, - 'file:checksum': 'abc123', - }, + foo: stacAssetFixture(), }, title: 'Foo Title', isStac: true,