Merge pull request #400 from NVIDIA/am/scenario-report

srivatsankrishnan · web-flow · commit 552e35a5be3a · 2025-03-13T10:53:51.000-07:00
Generate scenario-level report
diff --git a/pyproject.toml b/pyproject.toml
@@ -24,6 +24,7 @@ dependencies = [
   "toml==0.10.2",
   "kubernetes==30.1.0",
   "pydantic==2.8.2",
+  "jinja2==3.1.5",
 ]
   [project.scripts]
   cloudai = "cloudai.__main__:main"
diff --git a/requirements.txt b/requirements.txt
@@ -4,3 +4,4 @@ tbparse==0.0.8
 toml==0.10.2
 kubernetes==30.1.0
 pydantic==2.8.2
+jinja2==3.1.5
diff --git a/src/cloudai/_core/base_runner.py b/src/cloudai/_core/base_runner.py
@@ -17,7 +17,6 @@
 import asyncio
 import datetime
 import logging
-import sys
 from abc import ABC, abstractmethod
 from asyncio import Task
 from pathlib import Path
@@ -94,14 +93,11 @@ async def shutdown(self):
             self.system.kill(job)
         logging.info("All jobs have been killed.")
 
-        sys.exit(0)
-
     async def run(self):
         """Asynchronously run the test scenario."""
         if self.shutting_down:
             return
 
-        logging.info("Starting test scenario execution.")
         total_tests = len(self.test_scenario.test_runs)
         completed_jobs_count = 0
 
diff --git a/src/cloudai/_core/reporter.py b/src/cloudai/_core/reporter.py
@@ -17,6 +17,8 @@
 import logging
 from pathlib import Path
 
+import jinja2
+
 from .system import System
 from .test_scenario import TestRun, TestScenario
 
@@ -44,15 +46,41 @@ def generate(self) -> None:
         Args:
             test_scenario (TestScenario): The scenario containing tests.
         """
+        self.generate_scenario_report()
+
         for tr in self.test_scenario.test_runs:
             test_output_dir = self.results_root / tr.name
             if not test_output_dir.exists() or not test_output_dir.is_dir():
                 logging.warning(f"Directory '{test_output_dir}' not found.")
                 continue
 
-            self._generate_test_report(test_output_dir, tr)
+            self.generate_per_case_reports(test_output_dir, tr)
 
-    def _generate_test_report(self, directory_path: Path, tr: TestRun) -> None:
+    def generate_scenario_report(self) -> None:
+        """Write `<results_root>/<scenario name>.html`, linking every iteration directory that exists."""
+        # Locate the template relative to this module so rendering works from any working directory.
+        templates_dir = Path(__file__).parent.parent / "util"
+        # autoescape keeps scenario/test names with HTML metacharacters from corrupting the page.
+        env = jinja2.Environment(loader=jinja2.FileSystemLoader(templates_dir), autoescape=True)
+        template = env.get_template("general-report.jinja2")
+
+        results = {}
+        for tr in self.test_scenario.test_runs:
+            for iteration in range(tr.iterations):
+                run_dir = self.results_root / tr.name / f"{iteration}"
+                if run_dir.exists():
+                    logs_path = f"./{run_dir.relative_to(self.results_root)}"
+                    # The key (name + iteration) must match the lookup done in the template.
+                    results.setdefault(tr.name + f"{iteration}", {"logs_path": logs_path})
+
+        report = template.render(test_scenario=self.test_scenario, tr_results=results)
+        report_path = self.results_root / f"{self.test_scenario.name}.html"
+        with report_path.open("w") as f:
+            f.write(report)
+
+        logging.info(f"Generated scenario report at {report_path}")
+
+    def generate_per_case_reports(self, directory_path: Path, tr: TestRun) -> None:
         """
         Generate reports for a test by iterating through subdirectories within the directory path.
 
@@ -71,7 +99,7 @@ def _generate_test_report(self, directory_path: Path, tr: TestRun) -> None:
                 tr.output_path = subdir
 
                 if not rgs.can_handle_directory():
-                    logging.warning(f"Skipping '{tr.output_path}', can't handle with " f"strategy={reporter}.")
+                    logging.warning(f"Skipping '{tr.output_path}', can't handle with " f"strategy={reporter.__name__}.")
                     continue
 
                 rgs.generate_report()
diff --git a/src/cloudai/_core/runner.py b/src/cloudai/_core/runner.py
@@ -20,6 +20,7 @@
 from typing import Optional
 
 from .base_runner import BaseRunner
+from .exceptions import JobFailureError
 from .registry import Registry
 from .system import System
 from .test_scenario import TestScenario
@@ -74,7 +75,11 @@ def create_runner(self, mode: str, system: System, test_scenario: TestScenario)
 
     async def run(self):
         """Run the test scenario using the instantiated runner."""
-        await self.runner.run()
+        try:
+            await self.runner.run()
+            logging.debug("All jobs finished successfully.")
+        except JobFailureError as exc:  # logged at debug level, not re-raised: run() returns normally
+            logging.debug(f"Runner failed JobFailure exception: {exc}", exc_info=True)
 
     def _cancel_all(self):
         # the below code might look excessive, this is to address https://docs.astral.sh/ruff/rules/asyncio-dangling-task/
diff --git a/src/cloudai/util/general-report.jinja2 b/src/cloudai/util/general-report.jinja2
@@ -0,0 +1,30 @@
+{#
+    Scenario-level report. Expects `test_scenario` (with `name` and `test_runs`, each having
+    `name` and `iterations`) and `tr_results`, a mapping keyed by test-run name + iteration
+    whose values carry a "logs_path" entry. -#}
+<html>
+    <head>
+        <title>{{ test_scenario.name|e }}</title>
+    </head>
+    <body>
+        <h1>{{ test_scenario.name|e }}</h1>
+        <table>
+            <tr>
+                <th>Test</th>
+                <th>Status</th>
+            </tr>
+            {% for tr in test_scenario.test_runs %}
+            {% for iter in range(tr.iterations) %}
+            <tr>
+                <td>{{ tr.name|e }}.{{ iter }}</td>
+                {% if tr.name + iter|string in tr_results %}
+                <td><a href="{{ tr_results[tr.name + iter|string]["logs_path"]|e }}">logs</a></td>
+                {% else %}
+                <td>no logs</td>
+                {% endif %}
+            </tr>
+            {% endfor %}
+            {% endfor %}
+        </table>
+    </body>
+</html>

Original file line number	Diff line number	Diff line change
`@@ -24,6 +24,7 @@ dependencies = [`
`24`	`24`	`"toml==0.10.2",`
`25`	`25`	`"kubernetes==30.1.0",`
`26`	`26`	`"pydantic==2.8.2",`
	`27`	`+ "jinja2==3.1.5",`
`27`	`28`	`]`
`28`	`29`	`[project.scripts]`
`29`	`30`	`cloudai = "cloudai.__main__:main"`