From a72718d7003a85c4e2411e60b37e7f673a6ebe2a Mon Sep 17 00:00:00 2001 From: Hilary James Oliver Date: Wed, 6 Aug 2025 17:15:56 +1200 Subject: [PATCH 1/9] Replace some detect_old_contact_file calls with ping. Also: remove ping task-id: it was broken and not used. --- cylc/flow/network/client.py | 4 ++ cylc/flow/scheduler_cli.py | 34 +++++++++----- cylc/flow/scripts/clean.py | 2 +- cylc/flow/scripts/ping.py | 61 +++++-------------------- cylc/flow/scripts/reinstall.py | 5 +- cylc/flow/scripts/validate_reinstall.py | 52 ++++++++++----------- tests/integration/test_reinstall.py | 4 +- 7 files changed, 66 insertions(+), 96 deletions(-) diff --git a/cylc/flow/network/client.py b/cylc/flow/network/client.py index 175d635fa3b..883c930c52f 100644 --- a/cylc/flow/network/client.py +++ b/cylc/flow/network/client.py @@ -158,6 +158,10 @@ def timeout_handler(self) -> None: WorkflowStopped: if the workflow has already stopped. CyclError: if the workflow has moved to different host/port. """ + LOG.warning( + f"{self.workflow} {self.host}:{self.port}:" + f" Connection timed out ({self.timeout} ms)" + ) contact_host, contact_port, *_ = get_location(self.workflow) if ( contact_host != get_fqdn_by_host(self._orig_host) diff --git a/cylc/flow/scheduler_cli.py b/cylc/flow/scheduler_cli.py index 36607e1a20a..cfff96bd3b4 100644 --- a/cylc/flow/scheduler_cli.py +++ b/cylc/flow/scheduler_cli.py @@ -30,9 +30,12 @@ from cylc.flow import LOG, __version__ from cylc.flow.exceptions import ( - ContactFileExists, CylcError, ServiceFileError, + WorkflowStopped, +) +from cylc.flow.scripts.ping import ( + run as cylc_ping, ) import cylc.flow.flags from cylc.flow.id import upgrade_legacy_ids @@ -60,7 +63,6 @@ from cylc.flow.workflow_db_mgr import WorkflowDatabaseManager from cylc.flow.workflow_files import ( SUITERC_DEPR_MSG, - detect_old_contact_file, get_workflow_srv_dir, ) from cylc.flow.terminal import ( @@ -474,13 +476,28 @@ async def _scheduler_cli_3( async def _resume(workflow_id, options): """Resume the workflow if it is already running.""" try: - detect_old_contact_file(workflow_id) - except ContactFileExists as exc: - print(f"Resuming already-running workflow\n\n{exc}") pclient = WorkflowRuntimeClient( workflow_id, timeout=options.comms_timeout, ) + except WorkflowStopped: + # Not running - don't resume. + return + + # Is it running? If yes, send resume command. + try: + await cylc_ping(options, workflow_id, pclient) + except WorkflowStopped: + # Not running (orphaned contact file). + return + except CylcError as exc: + # PID check failed - abort. + LOG.error(exc) + LOG.critical('Cannot tell if the workflow is running') + sys.exit(1) + else: + # It's running: resume it and exit. + print("Resuming already-running workflow") mutation_kwargs = { 'request_string': RESUME_MUTATION, 'variables': { @@ -489,13 +506,6 @@ async def _resume(workflow_id, options): } await pclient.async_request('graphql', mutation_kwargs) sys.exit(0) - except CylcError as exc: - LOG.error(exc) - LOG.critical( - 'Cannot tell if the workflow is running' - '\nNote, Cylc 8 cannot restart Cylc 7 workflows.' - ) - sys.exit(1) def _version_check( diff --git a/cylc/flow/scripts/clean.py b/cylc/flow/scripts/clean.py index b3afe679429..95e241ed73a 100644 --- a/cylc/flow/scripts/clean.py +++ b/cylc/flow/scripts/clean.py @@ -220,7 +220,7 @@ async def run(*ids: str, opts: 'Values') -> None: workflows, multi_mode = await scan(workflows, multi_mode) if not workflows: - LOG.warning(f"No workflows matching {', '.join(ids)}") + LOG.warning(f"No stopped workflows matching {', '.join(ids)}") return workflows.sort() diff --git a/cylc/flow/scripts/ping.py b/cylc/flow/scripts/ping.py index 899151b1977..0be9fef98f6 100755 --- a/cylc/flow/scripts/ping.py +++ b/cylc/flow/scripts/ping.py @@ -18,24 +18,22 @@ """cylc ping [OPTIONS] ARGS -Test communication with a running workflow. +Test communication with running workflows. -If workflow WORKFLOW is running or TASK in WORKFLOW is currently running, -exit with success status, else exit with error status. +Print the HOST:PORT of running workflows. +If any are not running, exit with error status. """ from functools import partial import sys from typing import Any, Dict, TYPE_CHECKING -import cylc.flow.flags from cylc.flow.network.client_factory import get_client from cylc.flow.network.multi import call_multi from cylc.flow.option_parsers import ( - FULL_ID_MULTI_ARG_DOC, + ID_MULTI_ARG_DOC, CylcOptionParser as COP, ) -from cylc.flow.task_state import TASK_STATUS_RUNNING from cylc.flow.terminal import cli_function if TYPE_CHECKING: @@ -48,16 +46,6 @@ id name port - pubPort - } -} -''' - -TASK_QUERY = ''' -query ($tProxy: ID!) { - taskProxy (id: $tProxy) { - state - id } } ''' @@ -67,9 +55,8 @@ def get_option_parser() -> COP: parser = COP( __doc__, comms=True, - multitask=True, multiworkflow=True, - argdoc=[FULL_ID_MULTI_ARG_DOC], + argdoc=[ID_MULTI_ARG_DOC], ) return parser @@ -78,9 +65,10 @@ def get_option_parser() -> COP: async def run( options: 'Values', workflow_id: str, - *tokens_list, + client=None ) -> Dict: - pclient = get_client(workflow_id, timeout=options.comms_timeout) + + pclient = client or get_client(workflow_id, timeout=options.comms_timeout) ret: Dict[str, Any] = { 'stdout': [], @@ -91,38 +79,11 @@ async def run( 'request_string': FLOW_QUERY, 'variables': {'wFlows': [workflow_id]} } - task_kwargs: Dict[str, Any] = { - 'request_string': TASK_QUERY, - } - # ping called on the workflow result = await pclient.async_request('graphql', flow_kwargs) - msg = "" - for flow in result['workflows']: - w_name = flow['name'] - w_port = flow['port'] - w_pub_port = flow['pubPort'] - if cylc.flow.flags.verbosity > 0: - ret['stdout'].append( - f'{w_name} running on ' - f'{pclient.host}:{w_port} {w_pub_port}\n' - ) - - # ping called with task-like objects - for tokens in tokens_list: - task_kwargs['variables'] = { - 'tProxy': tokens.relative_id - } - task_result = await pclient.async_request('graphql', task_kwargs) - string_id = tokens.relative_id - if not task_result.get('taskProxy'): - msg = f"task not found: {string_id}" - elif task_result['taskProxy']['state'] != TASK_STATUS_RUNNING: - msg = f"task not {TASK_STATUS_RUNNING}: {string_id}" - if msg: - ret['stderr'].append(msg) - ret['exit'] = 1 + for flow in result['workflows']: + ret['stdout'].append(f"{pclient.host}:{flow['port']}") return ret @@ -144,6 +105,6 @@ def main( partial(run, options), *ids, report=report, - constraint='mixed', + constraint='workflows', ) sys.exit(all(rets.values()) is False) diff --git a/cylc/flow/scripts/reinstall.py b/cylc/flow/scripts/reinstall.py index 3e55e6edd7f..a58be83f6ec 100644 --- a/cylc/flow/scripts/reinstall.py +++ b/cylc/flow/scripts/reinstall.py @@ -217,7 +217,8 @@ async def reinstall_cli( # no rsync output == no changes => exit print(cparse( '' - f'{workflow_id} up to date with {source}' + 'No changes made:' + f' {workflow_id} is up to date with {source}' '' )) return False @@ -249,7 +250,7 @@ async def reinstall_cli( else: # no reinstall print( - cparse('Reinstall canceled, no changes made.') + cparse('No changes made: reinstall cancelled.') ) return False diff --git a/cylc/flow/scripts/validate_reinstall.py b/cylc/flow/scripts/validate_reinstall.py index afb36ddeab4..f521f33e04f 100644 --- a/cylc/flow/scripts/validate_reinstall.py +++ b/cylc/flow/scripts/validate_reinstall.py @@ -18,12 +18,14 @@ """cylc validate-reinstall [OPTIONS] ARGS -Validate, reinstall and apply changes to a workflow. +Validate, reinstall, and reload or restart a workflow. -Validate and reinstall a workflow then either: +If the source has changed and you respond Yes to the reinstall prompt: +* reinstall and reload the workflow, if it is running +* or reinstall and restart the workflow, if it is not running -* "Reload" the workflow (if it is running), -* or "play" it (if it is stopped). +If the source has not changed or you respond No to the reinstall prompt, the +workflow will not be reloaded or restarted. This command is equivalent to: $ cylc validate myworkflow --against-source @@ -39,6 +41,7 @@ in the installed workflow to ensure the change can be safely applied. """ +from ansimarkup import parse as cparse import asyncio from pathlib import Path import sys @@ -49,8 +52,7 @@ from cylc.flow import LOG from cylc.flow.exceptions import ( - ContactFileExists, - CylcError, + WorkflowStopped, ) from cylc.flow.id_cli import parse_id_async from cylc.flow.loggingutil import set_timestamps @@ -62,6 +64,7 @@ cleanup_sysargv ) from cylc.flow.scheduler_cli import PLAY_OPTIONS, cylc_play +from cylc.flow.scripts.ping import run as cylc_ping from cylc.flow.scripts.validate import ( VALIDATE_OPTIONS, VALIDATE_AGAINST_SOURCE_OPTION, @@ -78,10 +81,10 @@ ) from cylc.flow.terminal import cli_function from cylc.flow.workflow_files import ( - detect_old_contact_file, get_workflow_run_dir, ) + CYLC_ROSE_OPTIONS = COP.get_cylc_rose_options() VR_OPTIONS = combine_options( VALIDATE_OPTIONS, @@ -159,32 +162,25 @@ async def vr_cli( False: If this command should "exit 1". """ - # Attempt to work out whether the workflow is running. - # We are trying to avoid reinstalling then subsequently being - # unable to play or reload because we cannot identify workflow state. unparsed_wid = workflow_id workflow_id, *_ = await parse_id_async( workflow_id, constraint='workflows', ) - # Use this interface instead of scan, because it can have an ambiguous - # outcome which we want to capture before we install. + # First attempt to work out whether the workflow is running. + # We are trying to avoid reinstalling then subsequently being + # unable to play or reload because we cannot identify workflow state. + log_subcommand('ping', workflow_id) try: - detect_old_contact_file(workflow_id) - except ContactFileExists: - # Workflow is definitely running: - workflow_running = True - except CylcError as exc: - LOG.error(exc) - LOG.critical( - 'Cannot tell if the workflow is running' - '\nNote, Cylc 8 cannot restart Cylc 7 workflows.' - ) - raise - else: - # Workflow is definitely stopped: + result = await cylc_ping(options, workflow_id) + # (don't catch CylcError: unable to determine if running or not) + except WorkflowStopped: + print(cparse(f"{workflow_id} is not running")) workflow_running = False + else: + print(cparse(f"{workflow_id}: {result['stdout'][0]}")) + workflow_running = True # options.tvars and tvars_file are _only_ valid when playing a stopped # workflow: Fail if they are set and workflow running: @@ -211,11 +207,9 @@ async def vr_cli( [], print_reload_tip=False ) + if not reinstall_ok: - LOG.warning( - 'No changes to source: No reinstall or' - f' {"reload" if workflow_running else "play"} required.' - ) + # No changes, OR user said No to the reinstall prompt: abort. return False # Run reload if workflow is running or paused: diff --git a/tests/integration/test_reinstall.py b/tests/integration/test_reinstall.py index 00f2de7cb12..fcfd0214d99 100644 --- a/tests/integration/test_reinstall.py +++ b/tests/integration/test_reinstall.py @@ -178,7 +178,7 @@ async def test_interactive( # only one rsync call should have been made (i.e. the --dry-run) assert [call[1].get('dry_run') for call in reinstall_calls] == [True] - assert 'Reinstall canceled, no changes made.' in capsys.readouterr().out + assert 'reinstall cancelled' in capsys.readouterr().out reinstall_calls.clear() answer_prompt('y') @@ -287,7 +287,7 @@ def raise_keyboard_interrupt(): ) await reinstall_cli(opts=ReInstallOptions(), workflow_id=one_run.id) - assert 'Reinstall canceled, no changes made' in capsys.readouterr().out + assert 'reinstall cancelled' in capsys.readouterr().out async def test_rsync_fail(one_src, one_run, mock_glbl_cfg, non_interactive): From 624d7c5e39f4e78b38b4bd610e3cb6e7d0b26a9a Mon Sep 17 00:00:00 2001 From: Hilary James Oliver Date: Mon, 11 Aug 2025 14:20:52 +1200 Subject: [PATCH 2/9] Make `cylc psutil` timeout configurable. --- cylc/flow/cfgspec/globalcfg.py | 16 ++++++++++++++++ cylc/flow/scheduler_cli.py | 4 +--- cylc/flow/workflow_files.py | 11 +++++++++-- 3 files changed, 26 insertions(+), 5 deletions(-) diff --git a/cylc/flow/cfgspec/globalcfg.py b/cylc/flow/cfgspec/globalcfg.py index 176f24b0141..00af574120a 100644 --- a/cylc/flow/cfgspec/globalcfg.py +++ b/cylc/flow/cfgspec/globalcfg.py @@ -1044,7 +1044,23 @@ def default_for( {REPLACES}``[suite servers][run host select]rank``. ''') + Conf('process check timeout', VDR.V_INTERVAL, DurationFloat(10), + desc=''' + Maximum time for the `cylc play` and `cylc vr` commands to wait + for a remote process that checks if an unresponsive scheduler + is still alive (for workflows with existing contact files). + + .. note:: + This check involves running ``cylc psutil`` on the run host. + You may need to increase the timeout if shared filesystem + latency (for example) results in slow Python script startup. + Increasing the timeout unnecessarily, however, will just + cause these commands to hang for an unnecessarily long time + in this circumstance. + + .. versionadded:: 8.5.2 + ''') with Conf('host self-identification', desc=f''' How Cylc determines and shares the identity of the workflow host. diff --git a/cylc/flow/scheduler_cli.py b/cylc/flow/scheduler_cli.py index cfff96bd3b4..2efc50004ff 100644 --- a/cylc/flow/scheduler_cli.py +++ b/cylc/flow/scheduler_cli.py @@ -34,9 +34,7 @@ ServiceFileError, WorkflowStopped, ) -from cylc.flow.scripts.ping import ( - run as cylc_ping, -) +from cylc.flow.scripts.ping import run as cylc_ping import cylc.flow.flags from cylc.flow.id import upgrade_legacy_ids from cylc.flow.host_select import select_workflow_host diff --git a/cylc/flow/workflow_files.py b/cylc/flow/workflow_files.py index a2956ce591a..428e9f135dc 100644 --- a/cylc/flow/workflow_files.py +++ b/cylc/flow/workflow_files.py @@ -48,6 +48,7 @@ import cylc.flow.flags from cylc.flow import LOG from cylc.flow.async_util import make_async +from cylc.flow.cfgspec.glbl_cfg import glbl_cfg from cylc.flow.exceptions import ( ContactFileExists, CylcError, @@ -372,6 +373,7 @@ def _is_process_running( ) -> bool: """Check if a workflow process is still running. + Runs `cylc psutil` on the scheduler run host. * Returns True if the process is still running. * Returns False if it is not. * Raises CylcError if we cannot tell (e.g. due to network issues). @@ -428,8 +430,13 @@ def _is_process_running( text=True ) # * hardcoded command try: - # Terminate command after 10 seconds to prevent hanging, etc. - out, err = proc.communicate(timeout=10, input=metric) + out, err = proc.communicate( + timeout=( + glbl_cfg().get( + ['scheduler', 'run hosts', 'process check timeout']) + ), + input=metric + ) except TimeoutExpired: raise CylcError( f'Attempt to determine whether workflow is running on {host}' From 3ebc07ce9faf2ab5c6c13d698f27ccfea279c272 Mon Sep 17 00:00:00 2001 From: Hilary James Oliver Date: Mon, 11 Aug 2025 16:24:21 +1200 Subject: [PATCH 3/9] Fix exception name. --- cylc/flow/clean.py | 4 ++-- cylc/flow/exceptions.py | 4 ++-- cylc/flow/network/client.py | 4 ++-- cylc/flow/scheduler.py | 2 +- cylc/flow/workflow_files.py | 8 ++++---- tests/integration/test_workflow_files.py | 6 +++--- tests/unit/test_clean.py | 6 +++--- 7 files changed, 17 insertions(+), 17 deletions(-) diff --git a/cylc/flow/clean.py b/cylc/flow/clean.py index d4a443b4900..e3eacabe4d7 100644 --- a/cylc/flow/clean.py +++ b/cylc/flow/clean.py @@ -48,7 +48,7 @@ from cylc.flow import LOG from cylc.flow.cfgspec.glbl_cfg import glbl_cfg from cylc.flow.exceptions import ( - ContactFileExists, + SchedulerAlive, CylcError, InputError, PlatformError, @@ -123,7 +123,7 @@ def _clean_check(opts: 'Values', id_: str, run_dir: Path) -> None: return try: detect_old_contact_file(id_) - except ContactFileExists as exc: + except SchedulerAlive as exc: raise ServiceFileError( f"Cannot clean running workflow {id_}.\n\n{exc}" ) from None diff --git a/cylc/flow/exceptions.py b/cylc/flow/exceptions.py index 4deee62690f..ee215b47c9d 100644 --- a/cylc/flow/exceptions.py +++ b/cylc/flow/exceptions.py @@ -136,8 +136,8 @@ class WorkflowFilesError(CylcError): bullet = "\n -" -class ContactFileExists(CylcError): - """Workflow contact file exists.""" +class SchedulerAlive(CylcError): + """Workflow contact file exists and scheduler is alive.""" class FileRemovalError(CylcError): diff --git a/cylc/flow/network/client.py b/cylc/flow/network/client.py index 883c930c52f..e66b3b40b18 100644 --- a/cylc/flow/network/client.py +++ b/cylc/flow/network/client.py @@ -41,7 +41,7 @@ ) from cylc.flow.exceptions import ( ClientTimeout, - ContactFileExists, + SchedulerAlive, CylcError, RequestError, WorkflowStopped, @@ -181,7 +181,7 @@ def timeout_handler(self) -> None: # behind a contact file? try: detect_old_contact_file(self.workflow) - except ContactFileExists: + except SchedulerAlive: # old contact file exists and the workflow process still alive return else: diff --git a/cylc/flow/scheduler.py b/cylc/flow/scheduler.py index e110c3bd9f0..556a93b054f 100644 --- a/cylc/flow/scheduler.py +++ b/cylc/flow/scheduler.py @@ -1144,7 +1144,7 @@ def _configure_contact(self) -> None: """Create contact file.""" # Make sure another workflow of the same name hasn't started while this # one is starting - # NOTE: raises ContactFileExists if workflow is running + # NOTE: raises SchedulerAlive if workflow is running workflow_files.detect_old_contact_file(self.workflow) # Extract contact data. diff --git a/cylc/flow/workflow_files.py b/cylc/flow/workflow_files.py index 428e9f135dc..35e6393d4e7 100644 --- a/cylc/flow/workflow_files.py +++ b/cylc/flow/workflow_files.py @@ -50,7 +50,7 @@ from cylc.flow.async_util import make_async from cylc.flow.cfgspec.glbl_cfg import glbl_cfg from cylc.flow.exceptions import ( - ContactFileExists, + SchedulerAlive, CylcError, InputError, ServiceFileError, @@ -486,7 +486,7 @@ def detect_old_contact_file( * If one does exist but the workflow process is definitely not alive, remove it. * If one exists and the workflow process is still alive, raise - ContactFileExists. + SchedulerAlive. Args: id_: workflow ID @@ -496,7 +496,7 @@ def detect_old_contact_file( CylcError: If it is not possible to tell for sure if the workflow is running or not. - ContactFileExists: + SchedulerAlive: If old contact file exists and the workflow process still alive. ServiceFileError: For corrupt / incompatible contact files. @@ -529,7 +529,7 @@ def detect_old_contact_file( fname = get_contact_file_path(id_) if process_is_running: # ... the process is running, raise an exception - raise ContactFileExists( + raise SchedulerAlive( CONTACT_FILE_EXISTS_MSG % { "host": old_host, "port": old_port, diff --git a/tests/integration/test_workflow_files.py b/tests/integration/test_workflow_files.py index 77e5a84fd31..d594f24cf0f 100644 --- a/tests/integration/test_workflow_files.py +++ b/tests/integration/test_workflow_files.py @@ -25,7 +25,7 @@ from cylc.flow import CYLC_LOG from cylc.flow.exceptions import ( - ContactFileExists, + SchedulerAlive, CylcError, ) from cylc.flow.workflow_files import ( @@ -107,7 +107,7 @@ def dump_contact(**kwargs): def test_detect_old_contact_file_running(workflow): """It should raise an error if the workflow is running.""" # the workflow is running so we should get a ServiceFileError - with pytest.raises(ContactFileExists): + with pytest.raises(SchedulerAlive): detect_old_contact_file(workflow.id_) # the contact file is valid so should be left alone assert workflow.contact_file.exists() @@ -234,7 +234,7 @@ def _unlink(*args): # try to remove the contact file if process_running: # this should error if the process is running - with pytest.raises(ContactFileExists): + with pytest.raises(SchedulerAlive): detect_old_contact_file(workflow.id_) else: detect_old_contact_file(workflow.id_) diff --git a/tests/unit/test_clean.py b/tests/unit/test_clean.py index e2abbbdd0e2..bf2f959c5c7 100644 --- a/tests/unit/test_clean.py +++ b/tests/unit/test_clean.py @@ -44,7 +44,7 @@ init_clean, ) from cylc.flow.exceptions import ( - ContactFileExists, + SchedulerAlive, CylcError, InputError, PlatformError, @@ -119,7 +119,7 @@ def test_clean_check__fail( """ def mocked_detect_old_contact_file(*a, **k): if not stopped: - raise ContactFileExists('Mocked error') + raise SchedulerAlive('Mocked error') monkeypatch.setattr( 'cylc.flow.clean.detect_old_contact_file', @@ -236,7 +236,7 @@ def test_init_clean__running_workflow( ) -> None: """Test init_clean() fails when workflow is still running""" def mock_err(*args, **kwargs): - raise ContactFileExists("Mocked error") + raise SchedulerAlive("Mocked error") monkeypatch.setattr('cylc.flow.clean.detect_old_contact_file', mock_err) tmp_run_dir('yavin') From 464a0ac9d4fe41ebf4f10b600f3d3dfd39941813 Mon Sep 17 00:00:00 2001 From: Hilary James Oliver Date: Mon, 11 Aug 2025 14:17:21 +1200 Subject: [PATCH 4/9] Update change log. --- changes.d/6909.fix.md | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 changes.d/6909.fix.md diff --git a/changes.d/6909.fix.md b/changes.d/6909.fix.md new file mode 100644 index 00000000000..2e2761888a2 --- /dev/null +++ b/changes.d/6909.fix.md @@ -0,0 +1,3 @@ +Fix potential timeout of the play and vr commands for workflows with contact +files, due to an unnecessary remote process check - now only done if the +workflow fails to respond on the network. From 0dff5b074e5bace046a86a780f168bc55e81f556 Mon Sep 17 00:00:00 2001 From: Hilary James Oliver Date: Thu, 14 Aug 2025 05:35:30 +0000 Subject: [PATCH 5/9] Adapt a func test. --- .../functional/cylc-combination-scripts/05-vr-fail-is-running.t | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/functional/cylc-combination-scripts/05-vr-fail-is-running.t b/tests/functional/cylc-combination-scripts/05-vr-fail-is-running.t index 90275dae037..ff7826f5962 100644 --- a/tests/functional/cylc-combination-scripts/05-vr-fail-is-running.t +++ b/tests/functional/cylc-combination-scripts/05-vr-fail-is-running.t @@ -52,7 +52,7 @@ sed -i 's@CYLC_WORKFLOW_HOST=.*@CYLC_WORKFLOW_HOST=elephantshrew@' "${CONTACTFIL # Change source workflow and run vr: run_fail "${TEST_NAME_BASE}-runs" cylc vr "${WORKFLOW_NAME}" -grep_ok "on elephantshrew." "${TEST_NAME_BASE}-runs.stderr" +grep_ok "elephantshrew." "${TEST_NAME_BASE}-runs.stderr" # Clean Up: mv "${CONTACTFILE}.old" "$CONTACTFILE" From 9540baaa79b7b5e05568d9d2f9432bce6c09d3ff Mon Sep 17 00:00:00 2001 From: Hilary James Oliver Date: Mon, 18 Aug 2025 09:18:45 +1200 Subject: [PATCH 6/9] Expand a comment. [skip ci] --- cylc/flow/scheduler_cli.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cylc/flow/scheduler_cli.py b/cylc/flow/scheduler_cli.py index 2efc50004ff..af55f66d2ed 100644 --- a/cylc/flow/scheduler_cli.py +++ b/cylc/flow/scheduler_cli.py @@ -486,7 +486,8 @@ async def _resume(workflow_id, options): try: await cylc_ping(options, workflow_id, pclient) except WorkflowStopped: - # Not running (orphaned contact file). + # Not running, restart instead of resume. + # (Orphaned contact file will be removed by cylc_ping client logic). return except CylcError as exc: # PID check failed - abort. From 44f1c29f212c88260ad6ff2b04495fedcdbfb956 Mon Sep 17 00:00:00 2001 From: Hilary James Oliver Date: Tue, 19 Aug 2025 02:19:48 +0000 Subject: [PATCH 7/9] Tweak vr command help again. --- cylc/flow/scripts/validate_reinstall.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cylc/flow/scripts/validate_reinstall.py b/cylc/flow/scripts/validate_reinstall.py index f521f33e04f..8be1806385c 100644 --- a/cylc/flow/scripts/validate_reinstall.py +++ b/cylc/flow/scripts/validate_reinstall.py @@ -24,8 +24,8 @@ * reinstall and reload the workflow, if it is running * or reinstall and restart the workflow, if it is not running -If the source has not changed or you respond No to the reinstall prompt, the -workflow will not be reloaded or restarted. +If the source has not changed or you respond No to the reinstall prompt +the workflow will not be reinstalled, reloaded, or restarted. This command is equivalent to: $ cylc validate myworkflow --against-source From b5e5835b726d12df1664d96adc758b22f6c37e52 Mon Sep 17 00:00:00 2001 From: Hilary James Oliver Date: Mon, 25 Aug 2025 19:15:03 +1200 Subject: [PATCH 8/9] Revert CLI help change. --- cylc/flow/scripts/validate_reinstall.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/cylc/flow/scripts/validate_reinstall.py b/cylc/flow/scripts/validate_reinstall.py index 8be1806385c..05704eab7e9 100644 --- a/cylc/flow/scripts/validate_reinstall.py +++ b/cylc/flow/scripts/validate_reinstall.py @@ -18,14 +18,12 @@ """cylc validate-reinstall [OPTIONS] ARGS -Validate, reinstall, and reload or restart a workflow. +Validate, reinstall and apply changes to a workflow. -If the source has changed and you respond Yes to the reinstall prompt: -* reinstall and reload the workflow, if it is running -* or reinstall and restart the workflow, if it is not running +Validate and reinstall a workflow then either: -If the source has not changed or you respond No to the reinstall prompt -the workflow will not be reinstalled, reloaded, or restarted. +* "Reload" the workflow (if it is running), +* or "play" it (if it is stopped). This command is equivalent to: $ cylc validate myworkflow --against-source From 3a73be7928c730b3619205d86d19e06af4a69ce2 Mon Sep 17 00:00:00 2001 From: Hilary James Oliver Date: Tue, 26 Aug 2025 08:53:27 +1200 Subject: [PATCH 9/9] Add a functional test for 'cylc play' contact scenarios. --- tests/functional/cylc-clean/00-basic.t | 9 ++- tests/functional/cylc-play/00-contact.t | 84 +++++++++++++++++++++++++ 2 files changed, 92 insertions(+), 1 deletion(-) create mode 100644 tests/functional/cylc-play/00-contact.t diff --git a/tests/functional/cylc-clean/00-basic.t b/tests/functional/cylc-clean/00-basic.t index 72106b87198..93d15bff0bd 100644 --- a/tests/functional/cylc-clean/00-basic.t +++ b/tests/functional/cylc-clean/00-basic.t @@ -21,7 +21,7 @@ if ! command -v 'tree' >'/dev/null'; then skip_all '"tree" command not available' fi -set_test_number 10 +set_test_number 12 # Generate random name for symlink dirs to avoid any clashes with other tests SYM_NAME="$(mktemp -u)" @@ -124,4 +124,11 @@ cmp_ok "${TEST_NAME}.stdout" << __EOF__ INFO - No directory to clean at ${WORKFLOW_RUN_DIR} __EOF__ # ----------------------------------------------------------------------------- +TEST_NAME="clean-non-exist-pattern" +run_ok "$TEST_NAME" cylc clean "nope*" +dump_std "$TEST_NAME" +cmp_ok "${TEST_NAME}.stderr" << __EOF__ +WARNING - No stopped workflows matching nope* +__EOF__ +# ----------------------------------------------------------------------------- purge diff --git a/tests/functional/cylc-play/00-contact.t b/tests/functional/cylc-play/00-contact.t new file mode 100644 index 00000000000..db8897e2af9 --- /dev/null +++ b/tests/functional/cylc-play/00-contact.t @@ -0,0 +1,84 @@ +#!/usr/bin/env bash +# THIS FILE IS PART OF THE CYLC WORKFLOW ENGINE. +# Copyright (C) NIWA & British Crown (Met Office) & Contributors. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +# Test `cylc play` start up or resume under various contact scenarios: +# - running, normally +# - stopped, normally +# - stopped, but orphaned contact file +# - running, but can't be contacted + +. "$(dirname "$0")/test_header" + +set_test_number 10 + +init_workflow "${TEST_NAME_BASE}" <<'__FLOW_CONFIG__' +[scheduling] + [[graph]] + R1 = foo +[runtime] + [[foo]] + script = false +__FLOW_CONFIG__ + +run_ok "${TEST_NAME_BASE}-validate" \ + cylc validate "${WORKFLOW_NAME}" + +# Not running: play should start it up. +# (run like this "(cmd &) >/dev/null" to discard the process killed message) +(cylc play --pause --no-detach "${WORKFLOW_NAME}" \ + 1>"${TEST_NAME_BASE}.out" 2>&1 &) 2>/dev/null +poll_workflow_running +poll_grep_workflow_log "Pausing the workflow: Paused on start up" + +# Already running: play should resume. +TEST_NAME="${TEST_NAME_BASE}-resume" +run_ok "${TEST_NAME}" \ + cylc play "${WORKFLOW_NAME}" + +grep_ok "Resuming already-running workflow" "${TEST_NAME}.stdout" +poll_grep_workflow_log "RESUMING the workflow now" + +# Orphan the contact file +# Play should timeout, remove the contact file, and start up. +TEST_NAME="${TEST_NAME_BASE}-orphan" + +eval "$(cylc get-workflow-contact "${WORKFLOW_NAME}" | grep CYLC_WORKFLOW_PID)" +kill -9 "${CYLC_WORKFLOW_PID}" > /dev/null 2>&1 + +run_ok "${TEST_NAME}" \ + cylc play "${WORKFLOW_NAME}" --comms-timeout=1 --pause +grep_ok "Connection timed out (1000.0 ms)" "${TEST_NAME}.stderr" +grep_ok "Removed contact file" "${TEST_NAME}.stderr" +poll_grep_workflow_log "Pausing the workflow: Paused on start up" + +# Mess with the port: play aborts as can't tell if workflow is running or not. +# (The ping times out, then `cylc psutil` can't find the workflow). +eval "$(cylc get-workflow-contact "${WORKFLOW_NAME}" | grep CYLC_WORKFLOW_PORT)" +sed -i 's/CYLC_WORKFLOW_PORT=.*/CYLC_WORKFLOW_PORT=490001/' \ + "$WORKFLOW_RUN_DIR/.service/contact" +run_fail "${TEST_NAME}" \ + cylc play "${WORKFLOW_NAME}" --comms-timeout=1 +grep_ok "Connection timed out (1000.0 ms)" "${TEST_NAME}.stderr" +grep_ok "Cannot tell if the workflow is running" "${TEST_NAME}.stderr" + +# Restore contact file and shut down. +sed -i "s/CYLC_WORKFLOW_PORT=.*/CYLC_WORKFLOW_PORT=${CYLC_WORKFLOW_PORT}/" \ + "$WORKFLOW_RUN_DIR/.service/contact" +run_ok "${TEST_NAME}" \ + cylc stop --now "${WORKFLOW_NAME}" + +purge