Skip to content

Commit d4f6511

Browse files
committed
better visibility
Signed-off-by: Zoe Blevins <zblevins@nvidia.com>
1 parent be12efb commit d4f6511

5 files changed

Lines changed: 298 additions & 3 deletions

File tree

.github/workflows/kind-integration.yml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -112,8 +112,7 @@ jobs:
112112
make kind-up \
113113
KIND_CLUSTER_NAME="${KIND_CLUSTER_NAME}" \
114114
INSTALL_CONFIG=deploy/configs/ci-integration-test.yaml \
115-
HELM_TIMEOUT=20m \
116-
HELM_DEBUG=1
115+
HELM_TIMEOUT=20m
117116
kubectl get pods -n "${NV_CONFIG_MANAGER_NAMESPACE}" -o wide
118117
- name: Run integration tests
119118
timeout-minutes: 50

installer/src/nv_config_manager_installer/cli.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,11 @@ def generate_values(
181181
@click.option("--install-cnpg-operator", is_flag=True, help="Install CNPG operator.")
182182
@click.option("--helm-timeout", default="15m", help="Helm install/upgrade timeout.")
183183
@click.option("--helm-debug", is_flag=True, help="Enable Helm debug output during install/upgrade.")
184+
@click.option(
185+
"--watch-pods/--no-watch-pods",
186+
default=True,
187+
help="Stream pod readiness summaries while Helm waits.",
188+
)
184189
@click.option("--recreate-secrets", is_flag=True, help="Recreate existing K8s secrets.")
185190
@click.option("--dry-run", is_flag=True, help="Generate values only, skip helm install.")
186191
def deploy(
@@ -196,6 +201,7 @@ def deploy(
196201
install_cnpg_operator: bool,
197202
helm_timeout: str,
198203
helm_debug: bool,
204+
watch_pods: bool,
199205
recreate_secrets: bool,
200206
dry_run: bool,
201207
) -> None:
@@ -225,6 +231,7 @@ def deploy(
225231
install_cnpg_operator=install_cnpg_operator,
226232
helm_timeout=helm_timeout,
227233
helm_debug=helm_debug,
234+
watch_pods=watch_pods,
228235
recreate_secrets=recreate_secrets,
229236
dry_run=dry_run,
230237
)

installer/src/nv_config_manager_installer/deployer.py

Lines changed: 201 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
import sys
3434
import tarfile
3535
import tempfile
36+
import threading
3637
import time
3738
import traceback
3839
from dataclasses import dataclass, field
@@ -155,6 +156,7 @@ class DeployOptions:
155156
install_cnpg_operator: bool = False
156157
helm_timeout: str = "15m"
157158
helm_debug: bool = False
159+
watch_pods: bool = False
158160
recreate_secrets: bool = False
159161
run_tests: bool = False
160162
dry_run: bool = False
@@ -455,6 +457,194 @@ def _run_logged(
455457
return subprocess.CompletedProcess(cmd, proc.returncode, stdout_text, stderr_text)
456458

457459

460+
def _short_pod_text(value: str, *, limit: int = 140) -> str:
461+
"""Return a compact single-line pod status detail."""
462+
value = " ".join(value.split())
463+
if len(value) <= limit:
464+
return value
465+
return f"{value[: limit - 3]}..."
466+
467+
468+
def _pod_ready_counts(pod: Any) -> tuple[int, int]:
469+
statuses = (pod.status.container_statuses if pod.status else None) or []
470+
total = len(statuses)
471+
ready = sum(1 for status in statuses if status.ready)
472+
return ready, total
473+
474+
475+
def _pod_restart_count(pod: Any) -> int:
476+
statuses = []
477+
if pod.status:
478+
statuses.extend(pod.status.init_container_statuses or [])
479+
statuses.extend(pod.status.container_statuses or [])
480+
return sum(status.restart_count or 0 for status in statuses)
481+
482+
483+
def _pod_is_ready(pod: Any) -> bool:
484+
if pod.status and pod.status.phase == "Succeeded":
485+
return True
486+
conditions = (pod.status.conditions if pod.status else None) or []
487+
for condition in conditions:
488+
if condition.type == "Ready":
489+
return condition.status == "True"
490+
ready, total = _pod_ready_counts(pod)
491+
return total > 0 and ready == total
492+
493+
494+
def _container_waiting_details(statuses: list[Any] | None, *, prefix: str) -> list[str]:
495+
details: list[str] = []
496+
for status in statuses or []:
497+
if status.ready:
498+
continue
499+
name = status.name or "<unknown>"
500+
state = status.state
501+
if state and state.waiting:
502+
reason = state.waiting.reason or "Waiting"
503+
message = state.waiting.message or ""
504+
text = f"{prefix} {name}: {reason}"
505+
if message:
506+
text = f"{text} ({_short_pod_text(message)})"
507+
details.append(text)
508+
elif state and state.terminated:
509+
reason = state.terminated.reason or "Terminated"
510+
details.append(f"{prefix} {name}: {reason} exitCode={state.terminated.exit_code}")
511+
elif state and state.running:
512+
details.append(f"{prefix} {name}: Running but not ready")
513+
else:
514+
details.append(f"{prefix} {name}: status unknown")
515+
return details
516+
517+
518+
def _pod_waiting_details(pod: Any) -> list[str]:
519+
details: list[str] = []
520+
if pod.status:
521+
for condition in pod.status.conditions or []:
522+
if condition.type in {"Ready", "ContainersReady"} or condition.status == "True":
523+
continue
524+
text = f"condition {condition.type}={condition.status}"
525+
if condition.reason:
526+
text = f"{text} ({condition.reason})"
527+
if condition.message:
528+
text = f"{text}: {_short_pod_text(condition.message)}"
529+
details.append(text)
530+
531+
details.extend(
532+
_container_waiting_details(
533+
pod.status.init_container_statuses,
534+
prefix="init",
535+
)
536+
)
537+
details.extend(
538+
_container_waiting_details(
539+
pod.status.container_statuses,
540+
prefix="container",
541+
)
542+
)
543+
544+
if not details and (pod.status.reason or pod.status.message):
545+
text = pod.status.reason or "Waiting"
546+
if pod.status.message:
547+
text = f"{text}: {_short_pod_text(pod.status.message)}"
548+
details.append(text)
549+
return details or ["waiting for readiness"]
550+
551+
552+
def _unready_pod_summary_lines(k8s: K8sClient, namespace: str, *, limit: int = 8) -> list[str]:
553+
"""Return concise status lines for pods that are not ready."""
554+
try:
555+
pods = k8s.v1.list_namespaced_pod(namespace).items
556+
except Exception as exc:
557+
return [f"pod summary unavailable: {exc}"]
558+
559+
if not pods:
560+
return ["no pods found yet"]
561+
562+
unready = [pod for pod in pods if not _pod_is_ready(pod)]
563+
unready.sort(key=lambda pod: pod.metadata.name or "")
564+
if not unready:
565+
return ["all pods ready"]
566+
567+
visible = unready[:limit]
568+
lines = [
569+
f"{len(unready)} pod(s) not ready"
570+
+ (f"; showing first {limit}" if len(unready) > limit else "")
571+
]
572+
for pod in visible:
573+
name = pod.metadata.name or "<unknown>"
574+
ready, total = _pod_ready_counts(pod)
575+
phase = pod.status.phase if pod.status and pod.status.phase else "Unknown"
576+
restarts = _pod_restart_count(pod)
577+
details = "; ".join(_pod_waiting_details(pod)[:3])
578+
lines.append(f"{name} ready={ready}/{total} phase={phase} restarts={restarts}: {details}")
579+
return lines
580+
581+
582+
def _emit_pod_summary(lines: list[str], step: DeployStep, callback: DeployCallback) -> None:
583+
for line in lines:
584+
msg = f"[pods] {line}"
585+
step.output.append(msg)
586+
callback.on_log(msg)
587+
588+
589+
def _poll_pod_summary(
590+
k8s: K8sClient,
591+
namespace: str,
592+
step: DeployStep,
593+
callback: DeployCallback,
594+
stop_event: threading.Event,
595+
*,
596+
interval: float,
597+
heartbeat_interval: float,
598+
) -> None:
599+
"""Poll and emit changed pod readiness summaries until stopped."""
600+
last_signature = ""
601+
last_emit = 0.0
602+
while not stop_event.is_set():
603+
lines = _unready_pod_summary_lines(k8s, namespace)
604+
signature = "\n".join(lines)
605+
now = time.monotonic()
606+
has_blockers = lines != ["all pods ready"]
607+
if signature != last_signature or (has_blockers and now - last_emit >= heartbeat_interval):
608+
_emit_pod_summary(lines, step, callback)
609+
last_signature = signature
610+
last_emit = now
611+
if stop_event.wait(interval):
612+
break
613+
614+
615+
def _run_logged_with_pod_summary(
    cmd: list[str],
    k8s: K8sClient | None,
    namespace: str,
    step: DeployStep,
    callback: DeployCallback,
    *,
    check: bool = True,
    timeout: int | None = 600,
    poll_interval: float = 10.0,
    heartbeat_interval: float = 60.0,
) -> subprocess.CompletedProcess[str]:
    """Run a command while periodically logging unready pod summaries.

    When ``k8s`` is ``None`` the command is run through ``_run_logged``
    without any pod polling, after logging that summaries are unavailable.
    Otherwise a daemon thread runs ``_poll_pod_summary`` for the duration of
    the command and is signalled to stop once the command returns or raises.

    Args:
        cmd: Command and arguments, forwarded to ``_run_logged``.
        k8s: Client used to list pods, or ``None``.
        namespace: Namespace whose pods are summarized.
        step: Deploy step forwarded to ``_run_logged`` and the poller.
        callback: Receiver of log lines.
        check: Forwarded to ``_run_logged``.
        timeout: Forwarded to ``_run_logged``.
        poll_interval: Seconds between pod polls.
        heartbeat_interval: Seconds between repeats of an unchanged summary.

    Returns:
        The result of ``_run_logged``.
    """
    if k8s is None:
        callback.on_log("Pod readiness summaries unavailable: Kubernetes client is not initialized")
        return _run_logged(cmd, step, callback, check=check, timeout=timeout)

    callback.on_log("Streaming pod readiness summaries while Helm waits...")
    stop_event = threading.Event()
    poller_kwargs = {"interval": poll_interval, "heartbeat_interval": heartbeat_interval}
    watcher = threading.Thread(
        target=_poll_pod_summary,
        args=(k8s, namespace, step, callback, stop_event),
        kwargs=poller_kwargs,
        daemon=True,
    )
    watcher.start()
    try:
        result = _run_logged(cmd, step, callback, check=check, timeout=timeout)
    finally:
        # Always stop the poller, even when the command raised. The join is
        # bounded to two seconds, so a poll that is still in flight does not
        # hold up the caller any longer than that.
        stop_event.set()
        watcher.join(timeout=2)
    return result
646+
647+
458648
def _format_elapsed(seconds: float) -> str:
459649
"""Return a compact elapsed-time string for long-running command status."""
460650
seconds_int = max(0, int(seconds))
@@ -2077,7 +2267,17 @@ def _helm_install(self) -> None:
20772267

20782268
debug_suffix = " --debug" if "--debug" in helm_args else ""
20792269
self.callback.on_log(f"Running: helm upgrade --install {release}{debug_suffix} ...")
2080-
_run_logged(helm_args, step, self.callback, timeout=1200)
2270+
if self.options.watch_pods:
2271+
_run_logged_with_pod_summary(
2272+
helm_args,
2273+
self._k8s,
2274+
ns,
2275+
step,
2276+
self.callback,
2277+
timeout=1200,
2278+
)
2279+
else:
2280+
_run_logged(helm_args, step, self.callback, timeout=1200)
20812281
self._finish_step(step)
20822282

20832283
def _patch_gateway(self) -> None:

installer/tests/test_cli.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,3 +111,5 @@ def test_deploy_help(self):
111111
assert "--dry-run" in result.output
112112
assert "--helm-timeout" in result.output
113113
assert "--helm-debug" in result.output
114+
assert "--watch-pods" in result.output
115+
assert "--no-watch-pods" in result.output

0 commit comments

Comments
 (0)