Merge pull request #438 from NVIDIA/am/multi-dse

srivatsankrishnan · web-flow · commit 3075facdfd83 · 2025-04-02T06:18:13.000-07:00
Allow multiple DSE cases in a scenario
diff --git a/conf/common/test_scenario/ucc_test.toml b/conf/common/test_scenario/ucc_test.toml
@@ -1,5 +1,5 @@
 # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
-# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -19,11 +19,13 @@ name = "ucc_test"
 [[Tests]]
 id = "Tests.1"
 test_name = "ucc_test_alltoall"
+time_limit = "00:20:00"
 num_nodes = "2"
 
 [[Tests]]
 id = "Tests.2"
 test_name = "ucc_test_alltoall"
+time_limit = "00:20:00"
 num_nodes = "2"
   [[Tests.dependencies]]
   type = "start_post_comp"
@@ -32,6 +34,7 @@ num_nodes = "2"
 [[Tests]]
 id = "Tests.3"
 test_name = "ucc_test_alltoall"
+time_limit = "00:20:00"
 num_nodes = "2"
   [[Tests.dependencies]]
   type = "start_post_comp"
@@ -40,6 +43,7 @@ num_nodes = "2"
 [[Tests]]
 id = "Tests.4"
 test_name = "ucc_test_alltoall"
+time_limit = "00:20:00"
 num_nodes = "2"
   [[Tests.dependencies]]
   type = "start_post_comp"
@@ -48,6 +52,7 @@ num_nodes = "2"
 [[Tests]]
 id = "Tests.5"
 test_name = "ucc_test_alltoall"
+time_limit = "00:20:00"
 num_nodes = "2"
   [[Tests.dependencies]]
   type = "start_post_comp"
diff --git a/src/cloudai/_core/base_runner.py b/src/cloudai/_core/base_runner.py
@@ -199,6 +199,7 @@ def get_job_output_path(self, tr: TestRun) -> Path:
             self.scenario_root.mkdir()
 
         job_output_path = self.scenario_root / tr.name / str(tr.current_iteration)
+        # DSE runs are detected via the step number here because the test_definition object is no longer a DSE object
         if tr.step > 0:
             job_output_path = job_output_path / str(tr.step)
 
@@ -272,14 +273,17 @@ async def handle_job_completion(self, completed_job: BaseJob):
         Args:
             completed_job (BaseJob): The job that has just been completed.
         """
-        logging.info(f"Job completed: {completed_job.test_run.name}")
+        logging.info(
+            f"Job completed: {completed_job.test_run.name} "
+            f"(iteration {completed_job.test_run.current_iteration+1} of {completed_job.test_run.iterations})"
+        )
 
         self.jobs.remove(completed_job)
         del self.testrun_to_job_map[completed_job.test_run]
 
         if completed_job.test_run.step <= 0:
-            completed_job.test_run.current_iteration += 1
             if not completed_job.terminated_by_dependency and completed_job.test_run.has_more_iterations():
+                completed_job.test_run.current_iteration += 1
                 msg = f"Re-running job for iteration {completed_job.test_run.current_iteration}"
                 logging.info(msg)
                 await self.submit_test(completed_job.test_run)
diff --git a/src/cloudai/_core/configurator/cloudai_gym.py b/src/cloudai/_core/configurator/cloudai_gym.py
@@ -15,6 +15,7 @@
 # limitations under the License.
 
 import asyncio
+import copy
 import csv
 import logging
 from typing import Any, Dict, Optional, Tuple
@@ -43,7 +44,6 @@ def __init__(self, test_run: TestRun, runner: Runner):
         """
         self.test_run = test_run
         self.runner = runner
-        self.test_scenario = runner.runner.test_scenario
         self.max_steps = test_run.test.test_definition.agent_steps
         super().__init__()
 
@@ -134,15 +134,17 @@ def step(self, action: Any) -> Tuple[list, float, bool, dict]:
         if not self.test_run.test.test_definition.constraint_check:
             logging.info("Constraint check failed. Skipping step.")
             return [-1.0], -1.0, True, {}
-        logging.info(f"Running step {self.test_run.current_iteration} with action {action}")
+
+        logging.info(f"Running step {self.test_run.step} with action {action}")
+        self.runner.runner.test_scenario.test_runs = [copy.deepcopy(self.test_run)]
         asyncio.run(self.runner.run())
 
         observation = self.get_observation(action)
         reward = self.compute_reward(observation)
         done = False
         info = {}
 
-        self.write_trajectory(self.test_run.current_iteration, action, reward, observation)
+        self.write_trajectory(self.test_run.step, action, reward, observation)
 
         return observation, reward, done, info
 
@@ -222,11 +224,15 @@ def write_trajectory(self, step: int, action: Any, reward: float, observation: l
             reward (float): The reward received for the action.
             observation (list): The observation after taking the action.
         """
-        output_path = self.runner.runner.scenario_root
-        subdir = next(output_path.iterdir())
-        trajectory_file_path = subdir / f"{self.test_run.current_iteration}" / "trajectory.csv"
+        trajectory_file_path = (
+            self.runner.runner.scenario_root
+            / self.test_run.name
+            / f"{self.test_run.current_iteration}"
+            / "trajectory.csv"
+        )
 
         file_exists = trajectory_file_path.exists()
+        logging.debug(f"Writing trajectory into {trajectory_file_path} (exists: {file_exists})")
 
         with open(trajectory_file_path, mode="a", newline="") as file:
             writer = csv.writer(file)
diff --git a/src/cloudai/_core/test_scenario.py b/src/cloudai/_core/test_scenario.py
@@ -79,7 +79,7 @@ def has_more_iterations(self) -> bool:
         Returns
             bool: True if more iterations are pending, False otherwise.
         """
-        return self.current_iteration < self.iterations
+        return self.current_iteration + 1 < self.iterations
 
     @property
     def metric_reporter(self) -> Optional[Type["ReportGenerationStrategy"]]:
diff --git a/src/cloudai/cli/handlers.py b/src/cloudai/cli/handlers.py
@@ -16,6 +16,7 @@
 
 import argparse
 import asyncio
+import copy
 import logging
 import signal
 from pathlib import Path
@@ -86,31 +87,32 @@ def handle_install_and_uninstall(args: argparse.Namespace) -> int:
 
 
 def handle_dse_job(runner: Runner, args: argparse.Namespace):
-    test_run = next(iter(runner.runner.test_scenario.test_runs))
-    env = CloudAIGymEnv(test_run=test_run, runner=runner)
     registry = Registry()
 
-    agent_type = test_run.test.test_definition.agent
-
-    agent_class = registry.agents_map.get(agent_type)
-    if agent_class is None:
-        logging.error(
-            f"No agent available for type: {agent_type}. Please make sure {agent_type} "
-            f"is a valid agent type. Available agents: {registry.agents_map.keys()}"
-        )
-        exit(1)
+    for tr in runner.runner.test_scenario.test_runs:
+        test_run = copy.deepcopy(tr)
+        env = CloudAIGymEnv(test_run=test_run, runner=runner)
+        agent_type = test_run.test.test_definition.agent
+
+        agent_class = registry.agents_map.get(agent_type)
+        if agent_class is None:
+            logging.error(
+                f"No agent available for type: {agent_type}. Please make sure {agent_type} "
+                f"is a valid agent type. Available agents: {registry.agents_map.keys()}"
+            )
+            continue
 
-    agent = agent_class(env)
-    for step in range(agent.max_steps):
-        result = agent.select_action()
-        if result is None:
-            break
-        step, action = result
-        test_run.step = step
-        observation, reward, done, info = env.step(action)
-        feedback = {"trial_index": step, "value": reward}
-        agent.update_policy(feedback)
-        logging.info(f"Step {step}: Observation: {observation}, Reward: {reward}")
+        agent = agent_class(env)
+        for step in range(agent.max_steps):
+            result = agent.select_action()
+            if result is None:
+                break
+            step, action = result
+            test_run.step = step
+            observation, reward, done, info = env.step(action)
+            feedback = {"trial_index": step, "value": reward}
+            agent.update_policy(feedback)
+            logging.info(f"Step {step}: Observation: {observation}, Reward: {reward}")
 
 
 def handle_non_dse_job(runner: Runner, args: argparse.Namespace) -> None:
@@ -187,8 +189,14 @@ def handle_dry_run_and_run(args: argparse.Namespace) -> int:
     runner = Runner(args.mode, system, test_scenario)
     register_signal_handlers(runner.cancel_on_signal)
 
+    all_dse = all(tr.test.test_definition.is_dse_job for tr in test_scenario.test_runs)
+
     if any(tr.test.test_definition.is_dse_job for tr in test_scenario.test_runs):
-        handle_dse_job(runner, args)
+        if all_dse:
+            handle_dse_job(runner, args)
+        else:
+            logging.error("Mixing DSE and non-DSE jobs is not allowed.")
+            return 1
     else:
         handle_non_dse_job(runner, args)