Skip to content

Commit dc8bb22

Browse files
author
Michael Buchar
committed
feat(rest): limit workspace file listing and add forced GC (#657)
Introduce WORKSPACE_DISPLAY_FILE_LIMIT env var to cap the number of files returned by workspace listing endpoints, raising BadRequest when the limit is exceeded. Add FORCE_GARBAGE_COLLECTION env var to allow forcing manual gc.collect() before selected workspace operations (ls, list, rm, delete). Closes #644
1 parent b6eafc5 commit dc8bb22

7 files changed

Lines changed: 400 additions & 9 deletions

File tree

docs/openapi.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1658,7 +1658,7 @@
16581658
"type": "integer"
16591659
},
16601660
{
1661-
"description": "Filter workflow workspace files.",
1661+
"description": "Filter workflow workspace files by file name, size, or modification date.",
16621662
"in": "query",
16631663
"name": "search",
16641664
"required": false,
@@ -1706,7 +1706,7 @@
17061706
}
17071707
},
17081708
"400": {
1709-
"description": "Request failed. The incoming data specification seems malformed."
1709+
"description": "Request failed. The request parameters are invalid or the filtered result set exceeds the configured display limit."
17101710
},
17111711
"404": {
17121712
"description": "Request failed. Workflow does not exist.",

reana_workflow_controller/config.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
)
1919
from reana_db.models import JobStatus, RunStatus
2020
from distutils.util import strtobool
21+
from typing import List
2122

2223
from reana_workflow_controller.version import __version__
2324

@@ -465,3 +466,39 @@ def _parse_interactive_sessions_environments(env_var):
465466

466467
MAX_WORKFLOW_SHARING_MESSAGE_LENGTH = 5000
467468
"""Maximum length of the user-provided message when sharing a workflow."""
469+
470+
471+
def _parse_comma_separated_list(value: str) -> List[str]:
472+
"""Parse comma-separated env var values into a list of strings."""
473+
if not value:
474+
return []
475+
return [x.strip() for x in value.split(",") if x.strip()]
476+
477+
478+
# Read once at import time; the default is 100000 when the variable is unset.
# NOTE(review): a non-integer value makes `int()` raise ValueError during
# import, and no lower bound is enforced here -- confirm whether zero or
# negative limits should be rejected.
WORKSPACE_DISPLAY_FILE_LIMIT = int(os.getenv("WORKSPACE_DISPLAY_FILE_LIMIT", "100000"))
"""Maximum number of file entries returned by workspace listing endpoints."""

# Closed set of command names accepted in FORCE_GARBAGE_COLLECTION.
_VALID_GC_COMMANDS = {"ls", "list", "rm", "delete"}
"""Valid FORCE_GARBAGE_COLLECTION command values."""

# An unset variable falls back to "", which is then handed to the parser.
_gc_env = os.getenv("FORCE_GARBAGE_COLLECTION", "")
FORCE_GARBAGE_COLLECTION = _parse_comma_separated_list(_gc_env)
"""Comma-separated list of commands that trigger a manual `gc.collect()` before operations.

Example:
    $ export FORCE_GARBAGE_COLLECTION=ls,list,rm,delete

Supported values:
- ls: trigger `gc.collect()` before listing workspace files
- list: trigger `gc.collect()` before listing all workflows
- rm: trigger `gc.collect()` before removing workspace files
- delete: trigger `gc.collect()` before deleting workflows
"""
# Validate at import time so a misspelled command raises immediately.
# Both value lists are sorted, which keeps the error message deterministic.
_invalid_gc = sorted(set(FORCE_GARBAGE_COLLECTION) - _VALID_GC_COMMANDS)
if _invalid_gc:
    valid_gc_values = ", ".join(sorted(_VALID_GC_COMMANDS))
    invalid_gc_values = ", ".join(_invalid_gc)
    raise ValueError(
        "Invalid FORCE_GARBAGE_COLLECTION values: "
        f"{invalid_gc_values}. Valid values: {valid_gc_values}"
    )

reana_workflow_controller/rest/utils.py

Lines changed: 32 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
"""REANA Workflow Controller workflows REST API."""
1010

1111
import difflib
12-
import fs
12+
import gc
1313
import json
1414
import logging
1515
import mimetypes
@@ -63,6 +63,8 @@
6363
PROGRESS_STATUSES,
6464
REANA_GITLAB_HOST,
6565
PREVIEWABLE_MIME_TYPE_PREFIXES,
66+
FORCE_GARBAGE_COLLECTION,
67+
WORKSPACE_DISPLAY_FILE_LIMIT,
6668
)
6769
from reana_workflow_controller.consumer import _update_workflow_status
6870
from reana_workflow_controller.errors import (
@@ -73,6 +75,12 @@
7375
)
7476
from reana_workflow_controller.workflow_run_manager import KubernetesWorkflowRunManager
7577

78+
WORKSPACE_FILE_LISTING_FILTER_HINT = (
79+
"Please use more specific filters to narrow the results. "
80+
"Available filters: file name, size, or last-modified."
81+
)
82+
"""Neutral filter hint shown when workspace listings exceed the display limit."""
83+
7684

7785
def start_workflow(workflow, parameters):
7886
"""Start a workflow."""
@@ -222,6 +230,9 @@ def remove_workflow_jobs_from_cache(workflow):
222230

223231
def delete_workflow(workflow, all_runs=False, workspace=False):
224232
"""Delete workflow."""
233+
if "delete" in FORCE_GARBAGE_COLLECTION:
234+
gc.collect()
235+
225236
if workflow.status in [
226237
RunStatus.created,
227238
RunStatus.finished,
@@ -434,8 +445,16 @@ def list_directory_files(
434445
workspace_path: str, search: Dict[str, List[str]] = None
435446
) -> List[dict]:
436447
"""Return a list of files inside a given workspace."""
448+
if "ls" in FORCE_GARBAGE_COLLECTION:
449+
gc.collect()
437450
file_list = []
438451
for file_name in workspace.walk(workspace_path, include_dirs=False):
452+
if len(file_list) >= WORKSPACE_DISPLAY_FILE_LIMIT:
453+
raise BadRequest(
454+
"Too many files to display "
455+
f"(limit={WORKSPACE_DISPLAY_FILE_LIMIT}). "
456+
f"{WORKSPACE_FILE_LISTING_FILTER_HINT}"
457+
)
439458
st = workspace.lstat(workspace_path, file_name)
440459
file_info = {
441460
"name": file_name,
@@ -465,10 +484,12 @@ def remove_files_recursive_wildcard(workspace_path, path_or_pattern):
465484
:param workspace_path: Directory to delete files from.
466485
:param path_or_pattern: Wildcard pattern to use for the removal.
467486
:return: Dictionary with the results:
468-
- dictionary with names of succesfully deleted files and their sizes
487+
- dictionary with names of successfully deleted files and their sizes
469488
- dictionary with names of failed deletions and corresponding
470489
error messages.
471490
"""
491+
if "rm" in FORCE_GARBAGE_COLLECTION:
492+
gc.collect()
472493
deleted = {"deleted": {}, "failed": {}}
473494
for file_name in workspace.glob_or_walk_directory(
474495
workspace_path, path_or_pattern, topdown=False
@@ -488,12 +509,20 @@ def list_files_recursive_wildcard(workspace_path, path_or_pattern, search=None):
488509
:param workspace_path: Directory to list files from.
489510
:param path_or_pattern: Wildcard pattern to use for the listing.
490511
:return: Dictionary with the results:
491-
- dictionary with names of succesfully listed files and their sizes
512+
- dictionary with names of successfully listed files and their sizes
492513
- dictionary with names of failed listing and corresponding
493514
error messages.
494515
"""
516+
if "ls" in FORCE_GARBAGE_COLLECTION:
517+
gc.collect()
495518
list_files_recursive = []
496519
for path in workspace.glob_or_walk_directory(workspace_path, path_or_pattern):
520+
if len(list_files_recursive) >= WORKSPACE_DISPLAY_FILE_LIMIT:
521+
raise BadRequest(
522+
"Too many files to display "
523+
f"(limit={WORKSPACE_DISPLAY_FILE_LIMIT}). "
524+
f"{WORKSPACE_FILE_LISTING_FILTER_HINT}"
525+
)
497526
st = workspace.lstat(workspace_path, path)
498527
raw_size = st.st_size
499528
mtime = st.st_mtime

reana_workflow_controller/rest/workflows.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
import datetime
1212
import json
1313
import logging
14-
import re
14+
import gc
1515
from typing import Optional
1616
from uuid import uuid4
1717

@@ -44,6 +44,7 @@
4444
REANA_URL,
4545
DEFAULT_NAME_FOR_WORKFLOWS,
4646
MAX_WORKFLOW_SHARING_MESSAGE_LENGTH,
47+
FORCE_GARBAGE_COLLECTION,
4748
)
4849
from reana_workflow_controller.errors import (
4950
REANAWorkflowControllerError,
@@ -308,6 +309,9 @@ def get_workflows(args, paginate=None): # noqa
308309
shared_by: Optional[str] = args.get("shared_by")
309310
shared_with: Optional[str] = args.get("shared_with")
310311

312+
if "list" in FORCE_GARBAGE_COLLECTION:
313+
gc.collect()
314+
311315
if shared_by and shared_with:
312316
message = "You cannot filter by shared_by and shared_with at the same time."
313317
return (jsonify({"message": message}), 400)

reana_workflow_controller/rest/workflows_workspace.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
)
2121
from fs.errors import CreateFailed
2222
from werkzeug.datastructures import FileStorage
23-
from werkzeug.exceptions import NotFound
23+
from werkzeug.exceptions import BadRequest, NotFound
2424

2525
from reana_commons import workspace
2626
from reana_commons.errors import REANAWorkspaceError
@@ -417,7 +417,7 @@ def get_files(workflow_id_or_name, paginate=None): # noqa
417417
type: integer
418418
- name: search
419419
in: query
420-
description: Filter workflow workspace files.
420+
description: Filter workflow workspace files by file name, size, or modification date.
421421
required: false
422422
type: string
423423
responses:
@@ -448,7 +448,8 @@ def get_files(workflow_id_or_name, paginate=None): # noqa
448448
type: string
449449
400:
450450
description: >-
451-
Request failed. The incoming data specification seems malformed.
451+
Request failed. The request parameters are invalid or the filtered
452+
result set exceeds the configured display limit.
452453
404:
453454
description: >-
454455
Request failed. Workflow does not exist.
@@ -487,6 +488,8 @@ def get_files(workflow_id_or_name, paginate=None): # noqa
487488
pagination_dict = paginate(file_list)
488489
return jsonify(pagination_dict), 200
489490

491+
except json.JSONDecodeError:
492+
return jsonify({"message": "Malformed request."}), 400
490493
except ValueError:
491494
return (
492495
jsonify(
@@ -501,6 +504,8 @@ def get_files(workflow_id_or_name, paginate=None): # noqa
501504
)
502505
except KeyError:
503506
return jsonify({"message": "Malformed request."}), 400
507+
except BadRequest as e:
508+
return jsonify({"message": e.description}), e.code
504509
except REANAWorkspaceError as e:
505510
return jsonify({"message": str(e)}), 400
506511
except FileNotFoundError:

tests/test_config.py

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
# -*- coding: utf-8 -*-
2+
#
3+
# This file is part of REANA.
4+
# Copyright (C) 2025, 2026 CERN.
5+
#
6+
# REANA is free software; you can redistribute it and/or modify it
7+
# under the terms of the MIT License; see LICENSE file for more details.
8+
9+
import importlib
10+
11+
import pytest
12+
13+
import reana_workflow_controller.config as config
14+
15+
16+
def _reload_config():
    """Re-import the configuration module so it re-reads the environment.

    :return: The reloaded configuration module.
    """
    reloaded_module = importlib.reload(config)
    return reloaded_module
19+
20+
21+
def test_parse_comma_separated_list():
    """Test splitting, trimming and whole-item membership of parsed lists."""
    parse = config._parse_comma_separated_list

    expected_by_input = {
        "": [],
        "ls": ["ls"],
        "ls,list": ["ls", "list"],
        " ls, list ,rm,, ": ["ls", "list", "rm"],
    }
    for raw_value, expected in expected_by_input.items():
        assert parse(raw_value) == expected

    commands = parse("ls,list")
    assert "ls" in commands
    assert "list" in commands
    # Membership is checked against whole items, so a bare prefix must not match.
    assert "l" not in commands
35+
36+
37+
def test_force_garbage_collection_rejects_invalid_values(monkeypatch):
    """Test FORCE_GARBAGE_COLLECTION rejects unsupported command values.

    The configuration module is reloaded with a patched environment and is
    expected to raise ``ValueError`` naming both the invalid and the valid
    values.
    """
    try:
        with monkeypatch.context() as m:
            m.setenv("FORCE_GARBAGE_COLLECTION", "lis,delet")
            with pytest.raises(ValueError) as exc_info:
                _reload_config()
            message = str(exc_info.value)
            assert "Invalid FORCE_GARBAGE_COLLECTION values:" in message
            assert "delet" in message
            assert "lis" in message
            assert "Valid values: delete, list, ls, rm" in message
    finally:
        # The context manager has already undone the environment change here.
        # Reload even when an assertion above fails, so later tests never see
        # a configuration module that was reloaded with the patched value.
        _reload_config()
50+
51+
52+
def test_force_garbage_collection_accepts_valid_values(monkeypatch):
    """Test FORCE_GARBAGE_COLLECTION accepts supported command values.

    The configuration module is reloaded with all four supported commands set
    and must expose them as a list in the order given.
    """
    try:
        with monkeypatch.context() as m:
            m.setenv("FORCE_GARBAGE_COLLECTION", "ls,list,rm,delete")
            reloaded_config = _reload_config()
            assert reloaded_config.FORCE_GARBAGE_COLLECTION == [
                "ls",
                "list",
                "rm",
                "delete",
            ]
    finally:
        # The context manager has already undone the environment change here.
        # Reload even when the assertion above fails, so the patched value
        # does not stay in the module for later tests.
        _reload_config()

0 commit comments

Comments
 (0)