Skip to content

Commit 6676a4a

Browse files
committed
feat(bundler): add pre-flight checks to deploy.sh and post-flight to undeploy.sh
deploy.sh now runs pre-flight checks before installing components:
- Detects terminating namespaces that overlap with bundle components
- Detects stale mutating/validating webhooks whose backing services no longer exist (blocks pod creation with fail-closed webhooks)
- Detects unavailable API services (blocks namespace deletion)
- Aborts with actionable fix instructions if issues are found

undeploy.sh now runs post-flight verification after cleanup:
- Warns about remaining terminating namespaces
- Warns about stale webhooks missed by the orphan cleanup
- Warns about unavailable API services
- Reports clean/dirty state for subsequent deploy.sh

These checks prevent silent deployment failures caused by stale cluster-scoped resources from a previous install/uninstall cycle.
1 parent 8312960 commit 6676a4a

File tree

2 files changed

+101
-0
lines changed

2 files changed

+101
-0
lines changed

pkg/bundler/deployer/helm/templates/deploy.sh.tmpl

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,63 @@ helm_failed() {
4747
# all pods start successfully. These components are installed without --wait.
4848
ASYNC_COMPONENTS="kai-scheduler"
4949

# ==============================================================================
# Pre-flight checks
# ==============================================================================
# Verify the cluster is clean before deploying. Stale webhooks, terminating
# namespaces, and orphaned API services from a previous install can block pod
# creation and namespace deletion, causing silent deployment failures.
#
# Every check only records a failure in `preflight_failed`; the script aborts
# once at the end so that all problems are reported in a single run.

preflight_failed=false

# Report webhook configurations of one kind whose backing service is missing.
# Globals:   preflight_failed (set to "true" when a stale webhook is found)
# Arguments: $1 - webhook kind prefix: "mutating" or "validating"
#            $2 - what a stale webhook of this kind blocks (used in the message)
# Outputs:   one ERROR line plus a fix hint per stale webhook/service pair
# Requires:  jq (callers must check for it first)
preflight_check_webhooks() {
  local kind=$1
  local impact=$2
  local wh_name svc_ns svc_name

  # `sort -u` collapses configurations that list several webhooks pointing at
  # the same service, so each pair is reported once.
  # kubectl/jq errors are deliberately swallowed: an unreadable webhook list
  # yields no rows and therefore no findings.
  while IFS=$'\t' read -r wh_name svc_ns svc_name; do
    # </dev/null keeps kubectl from consuming the rows the loop is reading.
    if ! kubectl get svc "${svc_name}" -n "${svc_ns}" </dev/null &>/dev/null; then
      echo "ERROR: ${kind} webhook '${wh_name}' references non-existent service ${svc_ns}/${svc_name}."
      echo " This will block ${impact}. Delete with: kubectl delete ${kind}webhookconfiguration ${wh_name}"
      preflight_failed=true
    fi
  done < <(kubectl get "${kind}webhookconfigurations" -o json 2>/dev/null \
    | jq -r '.items[] | .metadata.name as $wh | .webhooks[]? | select(.clientConfig.service != null) | [$wh, .clientConfig.service.namespace, .clientConfig.service.name] | @tsv' 2>/dev/null \
    | sort -u || true)
}

# Check for terminating namespaces that overlap with our components
checked_ns=" "
for ns in {{ range .Components }}{{ .Namespace }} {{ end }}; do
  # The template emits one entry per component, so a namespace used by more
  # than one component appears repeatedly; check each namespace only once.
  if [[ "${checked_ns}" == *" ${ns} "* ]]; then
    continue
  fi
  checked_ns+="${ns} "

  # A missing namespace makes kubectl fail; that is fine and leaves phase empty.
  phase=$(kubectl get ns "${ns}" -o jsonpath='{.status.phase}' 2>/dev/null || true)
  if [[ "${phase}" == "Terminating" ]]; then
    echo "ERROR: namespace '${ns}' is still terminating from a previous install."
    echo " Wait for it to finish, or force-finalize with:"
    echo " kubectl get ns ${ns} -o json | jq '.spec.finalizers=[]' | kubectl replace --raw /api/v1/namespaces/${ns}/finalize -f -"
    preflight_failed=true
  fi
done

if command -v jq &>/dev/null; then
  # Check for stale mutating webhooks whose backing services no longer exist
  preflight_check_webhooks mutating "pod creation"

  # Check for stale validating webhooks
  preflight_check_webhooks validating "resource creation"

  # Check for stale API services (e.g., custom.metrics.k8s.io from prometheus-adapter)
  while IFS= read -r api_svc; do
    if [[ -z "${api_svc}" ]]; then
      continue
    fi
    echo "ERROR: API service '${api_svc}' is unavailable. This can block namespace deletion."
    echo " Delete with: kubectl delete apiservice ${api_svc}"
    preflight_failed=true
  done < <(kubectl get apiservices -o json 2>/dev/null \
    | jq -r '.items[] | select(.status.conditions[]? | .type == "Available" and .status == "False") | .metadata.name' 2>/dev/null \
    | sort -u || true)
else
  # Both checks above parse kubectl JSON with jq. Say so explicitly instead of
  # silently reporting a pass for checks that never ran.
  echo "WARNING: jq not found; skipping stale webhook and API service checks."
fi

if [[ "${preflight_failed}" == "true" ]]; then
  echo ""
  echo "Pre-flight checks failed. Fix the issues above before deploying."
  echo "To clean up leftovers from a previous install, run ./undeploy.sh first, then retry."
  exit 1
fi

echo "Pre-flight checks passed."
50107
echo "Deploying Cloud Native Stack components..."
51108

52109
# Install components in order

pkg/bundler/deployer/helm/templates/undeploy.sh.tmpl

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -188,4 +188,48 @@ done
188188
{{ range .Namespaces -}}
189189
delete_orphaned_webhooks_for_ns "{{ . }}"
190190
{{ end }}
# ==============================================================================
# Post-flight verification
# ==============================================================================
# Verify the cluster is clean after undeployment. Warn about any stale
# resources that could block a subsequent deploy. Nothing in this section
# aborts the script; findings are reported as warnings only.

# Print the name of each mutating/validating webhook configuration that has a
# webhook whose backing service does not exist.
# Outputs:  unique configuration names, one per line (nothing if none found,
#           or if kubectl/jq produce no usable output)
# Requires: jq (callers must check for it first)
postflight_stale_webhooks() {
  local wh_name svc_ns svc_name

  kubectl get mutatingwebhookconfigurations,validatingwebhookconfigurations -o json 2>/dev/null \
    | jq -r '.items[] | .metadata.name as $wh | .webhooks[]? | select(.clientConfig.service != null) | [$wh, .clientConfig.service.namespace, .clientConfig.service.name] | @tsv' 2>/dev/null \
    | while IFS=$'\t' read -r wh_name svc_ns svc_name; do
        # </dev/null keeps kubectl from consuming the rows the loop is reading.
        kubectl get svc "${svc_name}" -n "${svc_ns}" </dev/null &>/dev/null || echo "${wh_name}"
      done \
    | sort -u
}

postflight_issues=false
postflight_skipped=false

# Check for remaining terminating namespaces (cluster-wide, not limited to the
# namespaces of this bundle)
TERMINATING=$(kubectl get namespaces -o jsonpath='{range .items[?(@.status.phase=="Terminating")]}{.metadata.name}{" "}{end}' 2>/dev/null || true)
if [[ -n "${TERMINATING}" ]]; then
  echo "WARNING: namespaces still terminating: ${TERMINATING}"
  echo " A subsequent deploy.sh may fail. Wait or force-finalize these namespaces."
  postflight_issues=true
fi

if command -v jq &>/dev/null; then
  # Check for stale webhooks; names are joined onto one line for the message
  stale_wh=$(postflight_stale_webhooks | paste -sd ' ' - || true)
  if [[ -n "${stale_wh}" ]]; then
    echo "WARNING: stale webhooks found (backing service missing): ${stale_wh}"
    postflight_issues=true
  fi

  # Check for stale API services
  stale_apis=$(kubectl get apiservices -o json 2>/dev/null \
    | jq -r '.items[] | select(.status.conditions[]? | .type == "Available" and .status == "False") | .metadata.name' 2>/dev/null \
    | sort -u | paste -sd ' ' - || true)
  if [[ -n "${stale_apis}" ]]; then
    echo "WARNING: unavailable API services found: ${stale_apis}"
    echo " These can block namespace deletion. Delete with: kubectl delete apiservice <name>"
    postflight_issues=true
  fi
else
  # Both checks above parse kubectl JSON with jq. Record that they did not run
  # so the summary below does not claim a clean cluster it never verified.
  echo "WARNING: jq not found; skipping stale webhook and API service checks."
  postflight_skipped=true
fi

if [[ "${postflight_issues}" == "true" ]]; then
  echo ""
  echo "Post-flight: some stale resources remain. Run deploy.sh pre-flight checks to verify before redeploying."
elif [[ "${postflight_skipped}" == "true" ]]; then
  echo "Post-flight: no issues found, but some checks were skipped; cluster state is not fully verified."
else
  echo "Post-flight: cluster is clean."
fi

191235
echo "Undeployment complete."

0 commit comments

Comments
 (0)