Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions backend/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@ RUN apt-get update \
&& apt-get purge -y --auto-remove -o APT::AutoRemove::RecommendsImportant=false \
&& rm -rf /var/lib/apt/lists/*

RUN addgroup --system django \
&& adduser --system --ingroup django django
RUN groupadd --system --gid 1000 django && \
useradd --system --create-home --uid 1000 --gid django django

# Requirements are installed here to ensure they will be cached.
COPY requirements /requirements
Expand Down
38 changes: 36 additions & 2 deletions backend/config/settings/site_specific.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,45 @@ class MetagridBackendSettings(BaseSettings):
examples=["https://api.stac.esgf-west.org/"],
)

WGET_URL: str = Field(
description="The URL at which the ESG-Search wget endpoint can be reached.",
# Expand the number of fields allowed for wget API payloads (Django's DATA_UPLOAD_MAX_NUMBER_FIELDS)
DATA_UPLOAD_MAX_NUMBER_FIELDS: int = Field(
default=1024,
description="Maximum number of form fields allowed in a single upload. Useful for large wget payloads.",
examples=[1024],
)

# === wget related settings ===
WGET_URL: Optional[str] = Field(
default=None,
description="(Optional) If set, the URL at which the ESG-Search wget endpoint can be reached. If None (default), the wget download script is generated by an integrated WGET within the Metagrid deployment.",
examples=["https://esgf-node.llnl.gov/esg-search/wget"],
)

GLOBUS_PUBLIC_INDEX_ENDPOINT_ID: str = Field(
default="a8ef4320-9e5a-4793-837b-c45161ca1845",
description="The Globus index ID for the public ESGF2 data.",
examples=["a8ef4320-9e5a-4793-837b-c45161ca1845"],
)

WGET_SCRIPT_FILE_DEFAULT_LIMIT: int = Field(
default=9999,
description="Default limit on the number of files allowed in a generated wget script.",
examples=[9999],
)

WGET_SCRIPT_FILE_MAX_LIMIT: int = Field(
default=100000,
description="Maximum number of files allowed in a generated wget script.",
examples=[100000],
)

# Maximum length for facet values used in the wget directory structure
WGET_MAX_DIR_LENGTH: int = Field(
default=50,
description="Maximum character length for facet values when creating directory names for wget downloads.",
examples=[50],
)

KEYCLOAK_CLIENT_ID: str = Field(
default="metagrid-localhost",
examples=["metagrid-localhost"],
Expand Down
6 changes: 4 additions & 2 deletions backend/config/settings/static.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
# project root (config/settings/static.py - 3 = metagrid/)
ROOT_DIR = environ.Path(__file__) - 3

TEMPLATE_DIR = ROOT_DIR("metagrid", "wget", "templates")

# Parse DATABASE_URL environment variable; default to an in-memory sqlite DB for tests/local runs.
# This prevents "Set the DATABASE_URL environment variable" errors when none is provided.
DATABASES = env.db_url(
Expand Down Expand Up @@ -104,7 +106,7 @@ class DjangoStaticSettings(BaseSettings):
# https://docs.djangoproject.com/en/dev/ref/settings/#static-url
STATIC_URL: str = "/static/"
# https://docs.djangoproject.com/en/dev/ref/contrib/staticfiles/#std:setting-STATICFILES_DIRS
STATICFILES_DIRS: Sequence[str] = []
STATICFILES_DIRS: Sequence[str] = [TEMPLATE_DIR]
# https://docs.djangoproject.com/en/dev/ref/contrib/staticfiles/#staticfiles-finders
STATICFILES_FINDERS: Sequence[str] = [
"django.contrib.staticfiles.finders.FileSystemFinder",
Expand All @@ -125,7 +127,7 @@ class DjangoStaticSettings(BaseSettings):
{
"BACKEND": "django.template.backends.django.DjangoTemplates",
"DIRS": STATICFILES_DIRS,
"APP_DIRS": False,
"APP_DIRS": True,
"OPTIONS": {
"context_processors": [
"django.template.context_processors.debug",
Expand Down
22 changes: 14 additions & 8 deletions backend/metagrid/api_proxy/tests/test_views.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,9 +79,23 @@ def test_globus_auth_logout(self):
response = self.client.get(url)
self.assertEqual(response.status_code, status.HTTP_302_FOUND)

@responses.activate
def test_search(self):
url = reverse("do-search")
postdata = {"project": "CMIP6", "limit": 0}
responses.get(settings.SEARCH_URL)
response = self.client.get(url, postdata)
assert response.status_code == status.HTTP_200_OK

@responses.activate
def test_wget(self):
url = reverse("do-wget")
if settings.WGET_URL is None:
# If WGET_URL is None, skip the test
import pytest

pytest.skip("settings.WGET_URL is not set")

responses.get(settings.WGET_URL)
response = self.client.get(
url,
Expand All @@ -91,14 +105,6 @@ def test_wget(self):
)
assert response.status_code == status.HTTP_200_OK

@responses.activate
def test_search(self):
url = reverse("do-search")
postdata = {"project": "CMIP6", "limit": 0}
responses.get(settings.SEARCH_URL)
response = self.client.get(url, postdata)
assert response.status_code == status.HTTP_200_OK

@responses.activate
def test_stac_search(self):
url = reverse("do-stac-search")
Expand Down
4 changes: 4 additions & 0 deletions backend/metagrid/api_proxy/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from rest_framework_simplejwt.tokens import RefreshToken

from config.settings.site_specific import MetagridFrontendSettings
from metagrid.wget.views import do_wget_integrated


@api_view()
Expand Down Expand Up @@ -155,6 +156,9 @@ def do_status(request):
@require_http_methods(["GET", "POST"])
@csrf_exempt
def do_wget(request):
    """Handle a wget-script request.

    When ``settings.WGET_URL`` is set (truthy) the request is forwarded to
    that URL through ``do_request``; otherwise the script is produced by the
    integrated generator, ``do_wget_integrated``.
    """
    external_wget_url = settings.WGET_URL
    if external_wget_url:
        # NOTE(review): the meaning of the third positional argument (True)
        # is defined by do_request, which is not visible here.
        return do_request(request, external_wget_url, True)

    return do_wget_integrated(request)


Expand Down
Empty file.
235 changes: 235 additions & 0 deletions backend/metagrid/wget/query_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,235 @@
# Reserved query keywords, exposed as named constants so that callers do not
# have to repeat the raw parameter strings.
OFFSET = "offset"
LIMIT = "limit"
QUERY = "query"
DISTRIB = "distrib"
SHARDS = "shards"
FROM = "from"
TO = "to"
SORT = "sort"
SIMPLE = "simple"

# All reserved keywords collected in one list, in the order defined above.
KEYWORDS = [OFFSET, LIMIT, QUERY, DISTRIB, SHARDS, FROM, TO, SORT, SIMPLE]

# Standard metadata fields, always included for each result (if available).
# Each constant holds the field name as a plain string.
FIELD_ID = "id"
FIELD_TYPE = "type"
FIELD_REPLICA = "replica"
FIELD_RETRACTED = "retracted"
FIELD_LATEST = "latest"
FIELD_MASTER_ID = "master_id"
FIELD_INSTANCE_ID = "instance_id"
FIELD_DRS_ID = "drs_id"
FIELD_TITLE = "title"
FIELD_DESCRIPTION = "description"
FIELD_TIMESTAMP = "timestamp"
# NOTE(review): the underscore-decorated names ("_timestamp" here, "_version_"
# below) are presumably index-internal fields -- confirm against the index schema.
FIELD_TIMESTAMP_ = "_timestamp"
FIELD_URL = "url"
FIELD_ACCESS = "access"
FIELD_XLINK = "xlink"
FIELD_SIZE = "size"
FIELD_DATASET_ID = "dataset_id"
FIELD_TRACKING_ID = "tracking_id"
FIELD_VERSION = "version"
FIELD_VERSION_ = "_version_"
FIELD_MAX_VERSION = "max_version"
FIELD_MIN_VERSION = "min_version"
FIELD_SCORE = "score"
FIELD_UNITS = "units"

# Additional per-record fields (checksums, hosting nodes, counts, time range).
FIELD_CHECKSUM = "checksum"
FIELD_CHECKSUM_TYPE = "checksum_type"
FIELD_INDEX_NODE = "index_node"
FIELD_DATA_NODE = "data_node"
FIELD_NUMBER_OF_FILES = "number_of_files"
FIELD_NUMBER_OF_AGGREGATIONS = "number_of_aggregations"
# The trailing underscore is part of the field name.
FIELD_DATASET_ID_TEMPLATE = "dataset_id_template_"
FIELD_DATETIME_START = "datetime_start"
FIELD_DATETIME_STOP = "datetime_stop"
FIELD_TEXT = "text"

# Special query fields for the OpenSearch geo extension.
FIELD_BBOX = "bbox" # west, south, east, north
FIELD_LAT = "lat"
FIELD_LON = "lon"
FIELD_LOCATION = "location"
FIELD_RADIUS = "radius"
FIELD_POLYGON = "polygon"
FIELD_EAST_DEGREES = "east_degrees"
FIELD_WEST_DEGREES = "west_degrees"
FIELD_NORTH_DEGREES = "north_degrees"
FIELD_SOUTH_DEGREES = "south_degrees"
FIELD_HEIGHT_BOTTOM = "height_bottom"
FIELD_HEIGHT_TOP = "height_top"
FIELD_HEIGHT_UNITS = "height_units"
FIELD_VARIABLE_UNITS = "variable_units"
FIELD_GEO = "geo"
FIELD_GEO_UNITS = "geo_units"

# Special query fields for the OpenSearch time extension.
FIELD_START = "start"
FIELD_END = "end"

# Special query fields for the wget script generator.
FIELD_WGET_PATH = "download_structure"
FIELD_WGET_EMPTYPATH = "download_emptypath"

# Fields that specify the project.
FIELD_PROJECT = "project"
FIELD_MIP_ERA = "mip_era"

# Fields that are always allowed in queries, in addition to configured facets.
# NOTE(review): FIELD_LAT, FIELD_LON, FIELD_RADIUS and FIELD_POLYGON are listed
# here and also in UNSUPPORTED_FIELDS -- confirm which list takes precedence
# where the two are consumed.
CORE_QUERY_FIELDS = [
FIELD_ID,
FIELD_TYPE,
FIELD_REPLICA,
FIELD_RETRACTED,
FIELD_LATEST,
FIELD_MASTER_ID,
FIELD_INSTANCE_ID,
FIELD_DRS_ID,
FIELD_TITLE,
FIELD_DESCRIPTION,
FIELD_TIMESTAMP,
FIELD_TIMESTAMP_,
FIELD_URL,
FIELD_XLINK,
FIELD_SIZE,
FIELD_NUMBER_OF_FILES,
FIELD_NUMBER_OF_AGGREGATIONS,
FIELD_DATASET_ID,
FIELD_TRACKING_ID,
FIELD_ACCESS,
FIELD_VERSION,
FIELD_MAX_VERSION,
FIELD_MIN_VERSION,
FIELD_CHECKSUM,
FIELD_CHECKSUM_TYPE,
FIELD_DATA_NODE,
FIELD_INDEX_NODE,
FIELD_BBOX,
FIELD_LAT,
FIELD_LON,
FIELD_RADIUS,
FIELD_POLYGON,
FIELD_START,
FIELD_END,
FIELD_WGET_PATH,
FIELD_WGET_EMPTYPATH,
FIELD_PROJECT,
FIELD_MIP_ERA,
]

# Fields that should NOT be used as facets.
# Several of these (e.g. FIELD_ID, FIELD_TYPE, FIELD_LATEST) are still accepted
# as query fields through CORE_QUERY_FIELDS; this list only concerns faceting.
NOT_FACETS = [
FIELD_ID,
FIELD_MASTER_ID,
FIELD_INSTANCE_ID,
FIELD_DATASET_ID,
FIELD_DATASET_ID_TEMPLATE,
FIELD_DRS_ID,
FIELD_DATETIME_START,
FIELD_DATETIME_STOP,
FIELD_EAST_DEGREES,
FIELD_WEST_DEGREES,
FIELD_NORTH_DEGREES,
FIELD_SOUTH_DEGREES,
FIELD_BBOX,
FIELD_LAT,
FIELD_LON,
FIELD_RADIUS,
FIELD_POLYGON,
FIELD_HEIGHT_BOTTOM,
FIELD_HEIGHT_TOP,
FIELD_HEIGHT_UNITS,
FIELD_LATEST,
FIELD_REPLICA,
FIELD_RETRACTED,
FIELD_NUMBER_OF_FILES,
FIELD_NUMBER_OF_AGGREGATIONS,
FIELD_TRACKING_ID,
FIELD_TIMESTAMP,
FIELD_TITLE,
FIELD_DESCRIPTION,
FIELD_URL,
FIELD_XLINK,
FIELD_SIZE,
FIELD_TEXT,
FIELD_TYPE,
FIELD_VARIABLE_UNITS,
FIELD_GEO,
FIELD_GEO_UNITS,
FIELD_TIMESTAMP_,
FIELD_VERSION_,
FIELD_SCORE,
FIELD_UNITS,
]

# Unsupported fields.
# NOTE(review): every entry except FIELD_LOCATION also appears in
# CORE_QUERY_FIELDS ("always allowed in queries") -- confirm that consumers
# check this list first, otherwise these fields are both allowed and unsupported.
UNSUPPORTED_FIELDS = [
FIELD_LAT,
FIELD_LON,
FIELD_LOCATION,
FIELD_RADIUS,
FIELD_POLYGON,
]

# Fields that hold identifiers.
ID_FIELDS = [FIELD_ID, FIELD_DATASET_ID, FIELD_MASTER_ID, FIELD_INSTANCE_ID]


# Bracket pairs whose enclosed commas belong to the value itself rather than
# separating list items. Checked in this order for every piece.
_BRACKET_PAIRS = (("(", ")"), ("[", "]"), ("{", "}"))


def _closing_piece_index(pieces, start):
    """Return the index of the piece that closes a bracket opened at ``start``.

    ``pieces[start]`` must contain an opening bracket without its closing
    counterpart. The following pieces are scanned for the first one that holds
    the closing bracket. If a piece containing another opening bracket of the
    same kind is met first, the grouping is ambiguous and that bracket kind is
    given up. Returns ``None`` when no bracket kind can be closed.
    """
    head = pieces[start]
    for opener, closer in _BRACKET_PAIRS:
        if opener not in head or closer in head:
            continue  # nothing left open for this bracket kind
        for index in range(start + 1, len(pieces)):
            piece = pieces[index]
            if opener in piece:
                break  # nested or new group: do not guess, try the next kind
            if closer in piece:
                return index
    return None


def split_value(value):
    """Split an HTTP parameter value into its comma-separated values.

    Commas enclosed in ``()``, ``[]`` or ``{}`` are treated as part of the
    value, so patterns such as ``"CESM1(CAM5.1,FV2)"`` are kept intact. This
    also holds when the brackets enclose more than one comma, e.g.
    ``"M(a,b,c)"``.

    Every piece is stripped of surrounding whitespace before re-assembly, so
    whitespace next to a comma inside brackets is dropped as well
    (``"A(b, c)"`` becomes ``"A(b,c)"``). Unbalanced or nested brackets are
    not re-assembled; their pieces are returned as separate values.

    :param value: the raw parameter value (a string).
    :return: list of values; always contains at least one element.
    """
    pieces = [piece.strip() for piece in value.split(",")]

    values = []
    position = 0
    while position < len(pieces):
        closing = _closing_piece_index(pieces, position)
        if closing is None:
            values.append(pieces[position])
            position += 1
        else:
            # Re-assemble everything from the opening to the closing piece.
            values.append(",".join(pieces[position:closing + 1]))
            position = closing + 1

    return values
11 changes: 11 additions & 0 deletions backend/metagrid/wget/templates/wget-simple-template.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#!/bin/bash
# Minimal download script, rendered from a Django template.
# One single-quoted URL is emitted per entry of the "files" mapping.

download_files=(
{% spaceless %}{% for filename, file in files.items %}'{{file.url}}'
{% endfor %}{% endspaceless %}
)

# Quote the expansion: an unquoted URL undergoes word splitting and filename
# expansion, so characters such as '?', '*' or '[' in a query string could
# change the argument that wget receives.
for i in "${download_files[@]}"
do
    wget "$i"
done
Loading
Loading