-
Notifications
You must be signed in to change notification settings - Fork 25
fix: handle startup errors more gracefully #207
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||
|---|---|---|---|---|---|---|---|---|
|
|
@@ -23,6 +23,7 @@ | |||||||
| ServiceRegistrationStatus, | ||||||||
| ServiceType, | ||||||||
| ) | ||||||||
| from aiperf.common.enums.system_enums import SystemState | ||||||||
| from aiperf.common.factories import ( | ||||||||
| AIPerfUIFactory, | ||||||||
| ServiceFactory, | ||||||||
|
|
@@ -51,6 +52,7 @@ | |||||||
| from aiperf.common.types import ServiceTypeT | ||||||||
| from aiperf.controller.proxy_manager import ProxyManager | ||||||||
| from aiperf.controller.system_mixins import SignalHandlerMixin | ||||||||
| from aiperf.controller.system_utils import display_command_errors, extract_errors | ||||||||
| from aiperf.exporters.exporter_manager import ExporterManager | ||||||||
|
|
||||||||
|
|
||||||||
|
|
@@ -75,6 +77,7 @@ def __init__( | |||||||
| ) | ||||||||
| self.debug("Creating System Controller") | ||||||||
| self._was_cancelled = False | ||||||||
| self._system_state = SystemState.CREATED | ||||||||
| # List of required service types, in no particular order | ||||||||
| # These are services that must be running before the system controller can start profiling | ||||||||
| self.required_services: dict[ServiceTypeT, int] = { | ||||||||
|
|
@@ -124,10 +127,24 @@ async def request_realtime_metrics(self) -> None: | |||||||
| ) | ||||||||
| ) | ||||||||
|
|
||||||||
| @property | ||||||||
| def system_state(self) -> SystemState: | ||||||||
| """Get the current state of the system.""" | ||||||||
| return self._system_state | ||||||||
|
|
||||||||
| @system_state.setter | ||||||||
| def system_state(self, state: SystemState) -> None: | ||||||||
| """Set the current state of the system.""" | ||||||||
| if state == self._system_state: | ||||||||
| return | ||||||||
| self.info(f"AIPerf System is {state.name}") | ||||||||
| self._system_state = state | ||||||||
|
|
||||||||
| async def initialize(self) -> None: | ||||||||
| """We need to override the initialize method to run the proxy manager before the base service initialize. | ||||||||
| This is because the proxies need to be running before we can subscribe to the message bus. | ||||||||
| """ | ||||||||
| self.system_state = SystemState.INITIALIZING | ||||||||
| self.debug("Running ZMQ Proxy Manager Before Initialize") | ||||||||
| await self.proxy_manager.initialize_and_start() | ||||||||
| # Once the proxies are running, call the original initialize method | ||||||||
|
|
@@ -151,27 +168,30 @@ async def _start_services(self) -> None: | |||||||
| - Start all required services | ||||||||
| """ | ||||||||
| self.debug("System Controller is bootstrapping services") | ||||||||
| self.system_state = SystemState.STARTING_SERVICES | ||||||||
| # Start all required services | ||||||||
| await self.service_manager.start() | ||||||||
| # Wait for all services to be registered | ||||||||
| await self.service_manager.wait_for_all_services_registration( | ||||||||
| stop_event=self._stop_requested_event, | ||||||||
| ) | ||||||||
|
|
||||||||
| self.info("AIPerf System is CONFIGURING") | ||||||||
| await self._profile_configure_all_services() | ||||||||
| self.info("AIPerf System is CONFIGURED") | ||||||||
| await self._start_profiling_all_services() | ||||||||
| self.info("AIPerf System is PROFILING") | ||||||||
| self.system_state = SystemState.CONFIGURING_SERVICES | ||||||||
| if not await self._profile_configure_all_services(): | ||||||||
| return | ||||||||
| self.system_state = SystemState.PROFILING | ||||||||
| if not await self._start_profiling_all_services(): | ||||||||
| return | ||||||||
|
|
||||||||
| async def _profile_configure_all_services(self) -> None: | ||||||||
| async def _profile_configure_all_services(self) -> bool: | ||||||||
| """Configure all services to start profiling. | ||||||||
|
|
||||||||
| This is a blocking call that will wait for all services to be configured before returning. This way | ||||||||
| we can ensure that all services are configured before we start profiling. | ||||||||
| """ | ||||||||
| self.info("Configuring all services to start profiling") | ||||||||
| begin = time.perf_counter() | ||||||||
| await self.send_command_and_wait_for_all_responses( | ||||||||
| responses = await self.send_command_and_wait_for_all_responses( | ||||||||
| ProfileConfigureCommand( | ||||||||
| service_id=self.service_id, | ||||||||
| config=self.user_config, | ||||||||
|
|
@@ -181,18 +201,30 @@ async def _profile_configure_all_services(self) -> None: | |||||||
| ) | ||||||||
| duration = time.perf_counter() - begin | ||||||||
| self.info(f"All services configured in {duration:.2f} seconds") | ||||||||
|
|
||||||||
| async def _start_profiling_all_services(self) -> None: | ||||||||
| errors = extract_errors(responses) | ||||||||
| if errors: | ||||||||
| display_command_errors("Failed to configure all services", errors) | ||||||||
|
||||||||
| display_command_errors("Failed to configure all services", errors) | |
| display_command_errors("Failed to configure all services", errors) | |
| # Shielding stop() to ensure cleanup always runs even if cancelled. |
the-david-oy marked this conversation as resolved.
Show resolved
Hide resolved
Copilot
AI
Aug 12, 2025
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
[nitpick] Duplicate use of asyncio.shield() for stop operations. Consider extracting this pattern into a helper method to reduce code duplication and ensure consistent error handling behavior across the codebase.
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| @@ -0,0 +1,61 @@ | ||||||
| # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. | ||||||
| # SPDX-License-Identifier: Apache-2.0 | ||||||
| from rich.console import Console | ||||||
| from rich.panel import Panel | ||||||
| from rich.text import Text | ||||||
|
|
||||||
| from aiperf.common.messages import CommandErrorResponse | ||||||
| from aiperf.common.messages.command_messages import CommandResponse | ||||||
| from aiperf.common.models.error_models import ErrorDetails | ||||||
|
|
||||||
|
|
||||||
| def extract_errors( | ||||||
| responses: list[CommandResponse | ErrorDetails], | ||||||
| ) -> list[CommandErrorResponse | ErrorDetails]: | ||||||
| """Extract errors from a list of command responses.""" | ||||||
| return [ | ||||||
| response | ||||||
| for response in responses | ||||||
| if isinstance(response, CommandErrorResponse | ErrorDetails) | ||||||
| ] | ||||||
|
|
||||||
|
|
||||||
| def display_command_errors( | ||||||
| title: str, errors: list[CommandErrorResponse | ErrorDetails] | ||||||
| ) -> None: | ||||||
| """Display command errors to the user.""" | ||||||
| if not errors: | ||||||
| return | ||||||
| summary = [] | ||||||
| for error in errors: | ||||||
| if isinstance(error, CommandErrorResponse): | ||||||
| summary.append( | ||||||
| Text.assemble( | ||||||
| Text("•", style="bold red"), | ||||||
| f" Service: {error.service_id}: Command: {error.command}\n", | ||||||
| ) | ||||||
| ) | ||||||
| summary.append( | ||||||
| Text.assemble( | ||||||
| Text(f"\t{error.error.type}:", style="bold red"), | ||||||
| f" {error.error.message}\n", | ||||||
| ) | ||||||
| ) | ||||||
| else: | ||||||
| summary.append( | ||||||
| Text.assemble( | ||||||
| Text(f"• {error.type}:", style="bold red"), f" {error.message}\n" | ||||||
| ) | ||||||
| ) | ||||||
| # Remove the trailing newline from the last summary item | ||||||
| summary[-1]._text[-1] = summary[-1]._text[-1].rstrip("\n") | ||||||
|
||||||
| summary[-1]._text[-1] = summary[-1]._text[-1].rstrip("\n") | |
| summary[-1] = Text(summary[-1].plain.rstrip("\n"), style=summary[-1].style) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you update the comments to make it clear what the return values signify?