diff --git a/src/operator/backend_test_runner/daemonset_manager.py b/src/operator/backend_test_runner/daemonset_manager.py
index 0a26874f7..6d0759d60 100644
--- a/src/operator/backend_test_runner/daemonset_manager.py
+++ b/src/operator/backend_test_runner/daemonset_manager.py
@@ -269,7 +269,7 @@ def delete_daemonset(self) -> None:
                     name=pod.metadata.name,
                     namespace=self.namespace,
                     body=kb_client.V1DeleteOptions(
-                        grace_period_seconds=0,
+                        grace_period_seconds=30,
                         propagation_policy='Background'
                     )
                 )
@@ -281,5 +281,29 @@ def delete_daemonset(self) -> None:
             logging.error('Error during cleanup: %s', e)
             raise
 
+    def _wait_for_daemonset_deletion(self, timeout: int = 120) -> None:
+        """Poll until the daemonset is fully removed from the API server.
+
+        After delete_daemonset() the object may still exist with a deletionTimestamp
+        while Kubernetes garbage-collects dependents. Attempting to create a new
+        daemonset with the same name before it is gone causes a 409 Conflict.
+
+        Args:
+            timeout: Maximum seconds to wait for the daemonset to disappear.
+        """
+        start = time.time()
+        while time.time() - start < timeout:
+            try:
+                self.apps_v1.read_namespaced_daemon_set(
+                    name=self.name, namespace=self.namespace)
+                logging.info('Waiting for daemonset %s to be fully deleted...', self.name)
+                time.sleep(2)
+            except kb_client.rest.ApiException as e:
+                if e.status == 404:
+                    logging.info('Daemonset %s fully deleted', self.name)
+                    return
+                raise
+        logging.warning('Timed out waiting for daemonset %s deletion after %ds', self.name, timeout)
+
     def deploy_and_wait(self) -> bool:
         """Deploy daemonset and wait for condition on all nodes.
@@ -291,6 +315,7 @@ def deploy_and_wait(self) -> bool:
         try:
             # Clean up any existing resources with this name if exists
             self.delete_daemonset()
+            self._wait_for_daemonset_deletion()
             # Create daemonset
             self.create_daemonset()
             logging.info('Waiting for daemonset and conditions at %s', time.time())
diff --git a/src/operator/utils/node_validation_test/test_base.py b/src/operator/utils/node_validation_test/test_base.py
index 6504d7212..16d559efc 100644
--- a/src/operator/utils/node_validation_test/test_base.py
+++ b/src/operator/utils/node_validation_test/test_base.py
@@ -19,5 +19,6 @@
 from datetime import datetime
 import logging
+import signal
 import time
 from typing import Any, Dict, List, Optional
 
@@ -28,5 +29,26 @@
 from src.lib.utils import logging as logging_utils, osmo_errors
 from src.utils import static_config
+
+
+def _sigterm_handler(signum: int, frame: Any) -> None:  # pylint: disable=unused-argument
+    """Convert SIGTERM into SystemExit so that finally blocks execute during pod termination."""
+    logging.info('Received SIGTERM (signal %d), raising SystemExit for graceful cleanup', signum)
+    raise SystemExit(128 + signum)
+
+
+def register_graceful_shutdown() -> None:
+    """Register a SIGTERM handler that triggers finally-block cleanup.
+
+    Kubernetes sends SIGTERM before SIGKILL during pod termination.
+    Python's default SIGTERM handler terminates without running finally blocks.
+    This converts SIGTERM into SystemExit, which does trigger finally blocks,
+    allowing validators to clean up resources (e.g. benchmark pods) on shutdown.
+
+    NOTE(review): signal.signal() raises ValueError if called off the main
+    thread; callers (e.g. validator __init__) must run on the main thread.
+    """
+    signal.signal(signal.SIGTERM, _sigterm_handler)
+
 
 DEFAULT_NODE_CONDITION_PREFIX = 'osmo.nvidia.com/'
 
@@ -138,6 +160,8 @@ def __init__(self, node_name: str,
         Args:
             node_name: Optional node name. If not provided, will be read from NODE_NAME env var.
         """
+        register_graceful_shutdown()
+
         # Load in-cluster config
         try:
             kb_config.load_incluster_config()