Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 26 additions & 1 deletion src/operator/backend_test_runner/daemonset_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -269,7 +269,7 @@ def delete_daemonset(self) -> None:
name=pod.metadata.name,
namespace=self.namespace,
body=kb_client.V1DeleteOptions(
grace_period_seconds=0,
grace_period_seconds=30,
propagation_policy='Background'
)
)
Expand All @@ -281,6 +281,30 @@ def delete_daemonset(self) -> None:
logging.error('Error during cleanup: %s', e)
raise

def _wait_for_daemonset_deletion(self, timeout: int = 120,
                                 poll_interval: float = 2.0) -> None:
    """Poll until the daemonset is fully removed from the API server.

    After delete_daemonset() the object may still exist with a deletionTimestamp
    while Kubernetes garbage-collects dependents. Attempting to create a new
    daemonset with the same name before it is gone causes a 409 Conflict.

    Args:
        timeout: Maximum seconds to wait for the daemonset to disappear.
        poll_interval: Seconds to sleep between successive existence checks.

    Raises:
        kb_client.rest.ApiException: On any API error other than 404.
    """
    # time.monotonic() is immune to wall-clock adjustments (NTP, manual
    # changes), so the deadline cannot fire early or be pushed out by a
    # clock jump the way a time.time() delta can.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            # 200 response means the object (possibly terminating) still exists.
            self.apps_v1.read_namespaced_daemon_set(
                name=self.name, namespace=self.namespace)
        except kb_client.rest.ApiException as e:
            if e.status == 404:
                logging.info('Daemonset %s fully deleted', self.name)
                return
            # Auth/server-side/transport errors are unexpected here; surface them.
            raise
        logging.info('Waiting for daemonset %s to be fully deleted...', self.name)
        time.sleep(poll_interval)
    # Best-effort: warn and proceed; a subsequent create will still surface a
    # 409 Conflict if the old object lingers.
    logging.warning('Timed out waiting for daemonset %s deletion after %ds', self.name, timeout)

def deploy_and_wait(self) -> bool:
"""Deploy daemonset and wait for condition on all nodes.

Expand All @@ -291,6 +315,7 @@ def deploy_and_wait(self) -> bool:
try:
# Clean up any existing resources with this name if exists
self.delete_daemonset()
self._wait_for_daemonset_deletion()
# Create daemonset
self.create_daemonset()
logging.info('Waiting for daemonset and conditions at %s', time.time())
Expand Down
20 changes: 20 additions & 0 deletions src/operator/utils/node_validation_test/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

from datetime import datetime
import logging
import signal
import time
from typing import Any, Dict, List, Optional

Expand All @@ -28,6 +29,23 @@
from src.lib.utils import logging as logging_utils, osmo_errors
from src.utils import static_config


def _sigterm_handler(signum: int, frame: Any) -> None: # pylint: disable=unused-argument
"""Convert SIGTERM into SystemExit so that finally blocks execute during pod termination."""
logging.info('Received SIGTERM (signal %d), raising SystemExit for graceful cleanup', signum)
raise SystemExit(128 + signum)


def register_graceful_shutdown() -> None:
    """Install a SIGTERM handler so terminating pods still run cleanup code.

    During pod termination Kubernetes first sends SIGTERM, then SIGKILL after
    the grace period. Python's default SIGTERM disposition kills the process
    without unwinding the stack, so finally blocks never execute. Installing
    _sigterm_handler converts the signal into a SystemExit, which propagates
    like a normal exception and lets validators release resources (e.g.
    benchmark pods) before the process exits.
    """
    signal.signal(signal.SIGTERM, _sigterm_handler)

# Default prefix for node condition type names set by this module's validators
# (presumably combined with a per-check suffix by callers — confirm at use sites).
DEFAULT_NODE_CONDITION_PREFIX = 'osmo.nvidia.com/'


Expand Down Expand Up @@ -138,6 +156,8 @@ def __init__(self, node_name: str,
Args:
node_name: Optional node name. If not provided, will be read from NODE_NAME env var.
"""
register_graceful_shutdown()

# Load in-cluster config
try:
kb_config.load_incluster_config()
Expand Down
Loading