Handle missing tests gracefully by adding MissingTestError to avoid backtrace (#640)

TaekyungHeo · web-flow · commit e981b2c4040a · 2025-08-11T11:50:04.000-04:00
diff --git a/src/cloudai/_core/exceptions.py b/src/cloudai/_core/exceptions.py
@@ -129,6 +129,21 @@ class SystemConfigParsingError(Exception):
     pass
 
 
+class MissingTestError(Exception):
+    """Exception raised when a test specified in a test scenario is not found in the test directory."""
+
+    def __init__(self, test_name: str):
+        self.test_name = test_name
+        self.message = (
+            f"Test '{test_name}' is not defined.\n"
+            "Please check:\n"
+            "1. The tests directory argument (--tests-dir) is set correctly\n"
+            "2. The test name in your test scenario matches the test name defined in the test file\n"
+            "3. The test file exists in your tests directory"
+        )
+        super().__init__(self.message)
+
+
 def format_validation_error(err: ErrorDetails) -> str:
     flattened_field = ".".join(str(v) for v in err["loc"])
 
diff --git a/src/cloudai/cli/handlers.py b/src/cloudai/cli/handlers.py
@@ -31,6 +31,7 @@
     BaseInstaller,
     CloudAIGymEnv,
     Installable,
+    MissingTestError,
     Parser,
     Registry,
     Runner,
@@ -195,34 +196,47 @@ def register_signal_handlers(signal_handler: Callable) -> None:
         signal.signal(sig, signal_handler)
 
 
-def handle_dry_run_and_run(args: argparse.Namespace) -> int:
+def _setup_system_and_scenario(
+    args: argparse.Namespace,
+) -> tuple[Optional[System], Optional[TestScenario], Optional[list[Test]]]:
     parser = Parser(args.system_config)
-    system, tests, test_scenario = parser.parse(args.tests_dir, args.test_scenario)
+    try:
+        system, tests, test_scenario = parser.parse(args.tests_dir, args.test_scenario)
+    except MissingTestError as e:
+        logging.error(e.message)
+        return None, None, None
 
     assert test_scenario is not None
 
     if args.output_dir:
         system.output_path = args.output_dir.absolute()
 
     if not prepare_output_dir(system.output_path):
-        return 1
+        return None, None, None
+
     if args.mode == "dry-run":
         system.monitor_interval = 1
     system.update()
 
-    if args.single_sbatch:
-        if not isinstance(system, SlurmSystem):
-            logging.error("Single sbatch is only supported for Slurm systems.")
-            return 1
+    return system, test_scenario, tests
 
-        Registry().update_runner("slurm", SingleSbatchRunner)
 
-    logging.info(f"System Name: {system.name}")
-    logging.info(f"Scheduler: {system.scheduler}")
-    logging.info(f"Test Scenario Name: {test_scenario.name}")
+def _handle_single_sbatch(args: argparse.Namespace, system: System) -> bool:
+    if not args.single_sbatch:
+        return True
 
-    logging.info("Checking if test templates are installed.")
+    if not isinstance(system, SlurmSystem):
+        logging.error("Single sbatch is only supported for Slurm systems.")
+        return False
 
+    Registry().update_runner("slurm", SingleSbatchRunner)
+    return True
+
+
+def _check_installation(
+    args: argparse.Namespace, system: System, tests: list[Test], test_scenario: TestScenario
+) -> bool:
+    logging.info("Checking if test templates are installed.")
     installables, installer = prepare_installation(system, tests, test_scenario)
 
     if args.enable_cache_without_check:
@@ -233,6 +247,29 @@ def handle_dry_run_and_run(args: argparse.Namespace) -> int:
     if args.mode == "run" and not result.success:
         logging.error("CloudAI has not been installed. Please run install mode first.")
         logging.error(result.message)
+        return False
+
+    return True
+
+
+def handle_dry_run_and_run(args: argparse.Namespace) -> int:
+    setup_result = _setup_system_and_scenario(args)
+    if setup_result == (None, None, None):
+        return 1
+
+    system, test_scenario, tests = setup_result
+    assert system is not None
+    assert test_scenario is not None
+    assert tests is not None
+
+    if not _handle_single_sbatch(args, system):
+        return 1
+
+    logging.info(f"System Name: {system.name}")
+    logging.info(f"Scheduler: {system.scheduler}")
+    logging.info(f"Test Scenario Name: {test_scenario.name}")
+
+    if not _check_installation(args, system, tests, test_scenario):
         return 1
 
     logging.info(test_scenario.pretty_print())
@@ -247,11 +284,10 @@ def handle_dry_run_and_run(args: argparse.Namespace) -> int:
 
     if all(tr.is_dse_job for tr in test_scenario.test_runs):
         handle_dse_job(runner, args)
-    else:
-        logging.error("Mixing DSE and non-DSE jobs is not allowed.")
-        return 1
+        return 0
 
-    return 0
+    logging.error("Mixing DSE and non-DSE jobs is not allowed.")
+    return 1
 
 
 def handle_generate_report(args: argparse.Namespace) -> int:
diff --git a/src/cloudai/core.py b/src/cloudai/core.py
@@ -24,6 +24,7 @@
 from ._core.command_gen_strategy import CommandGenStrategy
 from ._core.exceptions import (
     JobIdRetrievalError,
+    MissingTestError,
     SystemConfigParsingError,
     TestConfigParsingError,
     TestScenarioParsingError,
@@ -72,6 +73,7 @@
     "JobIdRetrievalError",
     "JobStatusResult",
     "JsonGenStrategy",
+    "MissingTestError",
     "NsysConfiguration",
     "Parser",
     "PerTestReporter",
diff --git a/src/cloudai/test_scenario_parser.py b/src/cloudai/test_scenario_parser.py
@@ -26,6 +26,7 @@
 from cloudai.util import format_time_limit, parse_time_limit
 
 from .core import (
+    MissingTestError,
     Registry,
     ReportGenerationStrategy,
     System,
@@ -231,7 +232,7 @@ def _prepare_tdef(self, test_info: TestRunModel) -> Tuple[Test, TestDefinition]:
 
         if test_info.test_name:
             if test_info.test_name not in self.test_mapping:
-                raise ValueError(f"Test '{test_info.test_name}' is not defined. Was tests directory correctly set?")
+                raise MissingTestError(test_info.test_name)
             test = self.test_mapping[test_info.test_name]
 
             test_defined = test.test_definition.model_dump(by_alias=True)
diff --git a/tests/test_test_scenario.py b/tests/test_test_scenario.py
@@ -22,6 +22,7 @@
 import pytest
 import toml
 
+from cloudai._core.exceptions import MissingTestError
 from cloudai.core import (
     CmdArgs,
     GitRepo,
@@ -301,9 +302,16 @@ def test_without_base(self, missing_arg: str):
         )
 
     def test_name_is_not_in_mapping(self, test_scenario_parser: TestScenarioParser):
-        with pytest.raises(ValueError) as exc_info:
+        with pytest.raises(MissingTestError) as exc_info:
             test_scenario_parser._prepare_tdef(TestRunModel(id="1", test_name="nccl"))
-        assert exc_info.match("Test 'nccl' is not defined. Was tests directory correctly set?")
+        expected_msg = (
+            "Test 'nccl' is not defined.\n"
+            "Please check:\n"
+            "1. The tests directory argument (--tests-dir) is set correctly\n"
+            "2. The test name in your test scenario matches the test name defined in the test file\n"
+            "3. The test file exists in your tests directory"
+        )
+        assert str(exc_info.value) == expected_msg
 
     @pytest.mark.parametrize("override_arg", ["name", "description"])
     def test_can_override_name_and_description(self, override_arg: str):