From 48f6d06effb4cd042fb3e39eb96902c3980dee36 Mon Sep 17 00:00:00 2001 From: "rw-codebundle-agent[bot]" Date: Thu, 11 Jun 2026 06:01:00 +0000 Subject: [PATCH] Add k8s-deployment-rollout-troubleshoot CodeBundle. Read-only diagnostics for stuck or failing Kubernetes deployment rollouts, covering status, ReplicaSets, pod failures, strategy, PDBs, events, and history. Co-authored-by: Cursor --- .../k8s-deployment-rollout-troubleshoot.yaml | 22 + ...s-deployment-rollout-troubleshoot-sli.yaml | 28 ++ ...s-deployment-rollout-troubleshoot-slx.yaml | 25 ++ ...ployment-rollout-troubleshoot-taskset.yaml | 45 ++ .../.test/README.md | 15 + .../.test/Taskfile.yaml | 178 ++++++++ .../.test/kubernetes/manifest.yaml | 171 ++++++++ .../README.md | 76 ++++ .../check-pdb-rollout-impact.sh | 87 ++++ .../check-rollout-status.sh | 102 +++++ .../check-rollout-strategy-config.sh | 90 ++++ .../check-stuck-terminating-pods.sh | 80 ++++ .../compare-replicasets-during-rollout.sh | 100 +++++ .../detect-rollout-blocking-events.sh | 85 ++++ .../fetch-rollout-history.sh | 102 +++++ .../inspect-new-replicaset-pod-failures.sh | 92 ++++ .../k8s-rollout-helpers.sh | 96 +++++ .../runbook.robot | 395 ++++++++++++++++++ 18 files changed, 1789 insertions(+) create mode 100644 codebundles/k8s-deployment-rollout-troubleshoot/.runwhen/generation-rules/k8s-deployment-rollout-troubleshoot.yaml create mode 100644 codebundles/k8s-deployment-rollout-troubleshoot/.runwhen/templates/k8s-deployment-rollout-troubleshoot-sli.yaml create mode 100644 codebundles/k8s-deployment-rollout-troubleshoot/.runwhen/templates/k8s-deployment-rollout-troubleshoot-slx.yaml create mode 100644 codebundles/k8s-deployment-rollout-troubleshoot/.runwhen/templates/k8s-deployment-rollout-troubleshoot-taskset.yaml create mode 100644 codebundles/k8s-deployment-rollout-troubleshoot/.test/README.md create mode 100644 codebundles/k8s-deployment-rollout-troubleshoot/.test/Taskfile.yaml create mode 100644 
codebundles/k8s-deployment-rollout-troubleshoot/.test/kubernetes/manifest.yaml create mode 100644 codebundles/k8s-deployment-rollout-troubleshoot/README.md create mode 100755 codebundles/k8s-deployment-rollout-troubleshoot/check-pdb-rollout-impact.sh create mode 100755 codebundles/k8s-deployment-rollout-troubleshoot/check-rollout-status.sh create mode 100755 codebundles/k8s-deployment-rollout-troubleshoot/check-rollout-strategy-config.sh create mode 100755 codebundles/k8s-deployment-rollout-troubleshoot/check-stuck-terminating-pods.sh create mode 100755 codebundles/k8s-deployment-rollout-troubleshoot/compare-replicasets-during-rollout.sh create mode 100755 codebundles/k8s-deployment-rollout-troubleshoot/detect-rollout-blocking-events.sh create mode 100755 codebundles/k8s-deployment-rollout-troubleshoot/fetch-rollout-history.sh create mode 100755 codebundles/k8s-deployment-rollout-troubleshoot/inspect-new-replicaset-pod-failures.sh create mode 100755 codebundles/k8s-deployment-rollout-troubleshoot/k8s-rollout-helpers.sh create mode 100644 codebundles/k8s-deployment-rollout-troubleshoot/runbook.robot diff --git a/codebundles/k8s-deployment-rollout-troubleshoot/.runwhen/generation-rules/k8s-deployment-rollout-troubleshoot.yaml b/codebundles/k8s-deployment-rollout-troubleshoot/.runwhen/generation-rules/k8s-deployment-rollout-troubleshoot.yaml new file mode 100644 index 00000000..a2983d9a --- /dev/null +++ b/codebundles/k8s-deployment-rollout-troubleshoot/.runwhen/generation-rules/k8s-deployment-rollout-troubleshoot.yaml @@ -0,0 +1,22 @@ +apiVersion: runwhen.com/v1 +kind: GenerationRules +spec: + generationRules: + - resourceTypes: + - deployment + matchRules: + - type: pattern + pattern: ".+" + properties: [name] + mode: substring + slxs: + - baseName: k8s-dep-rollout + shortenedBaseName: k8s-dep-rollout + qualifiers: ["resource", "namespace", "cluster"] + baseTemplateName: k8s-deployment-rollout-troubleshoot + levelOfDetail: basic + outputItems: + - type: slx + - 
type: sli + - type: runbook + templateName: k8s-deployment-rollout-troubleshoot-taskset.yaml diff --git a/codebundles/k8s-deployment-rollout-troubleshoot/.runwhen/templates/k8s-deployment-rollout-troubleshoot-sli.yaml b/codebundles/k8s-deployment-rollout-troubleshoot/.runwhen/templates/k8s-deployment-rollout-troubleshoot-sli.yaml new file mode 100644 index 00000000..dce590b3 --- /dev/null +++ b/codebundles/k8s-deployment-rollout-troubleshoot/.runwhen/templates/k8s-deployment-rollout-troubleshoot-sli.yaml @@ -0,0 +1,28 @@ +apiVersion: runwhen.com/v1 +kind: ServiceLevelIndicator +metadata: + name: {{slx_name}} + labels: + {% include "common-labels.yaml" %} + annotations: + {% include "common-annotations.yaml" %} +spec: + displayUnitsLong: Execution Status + displayUnitsShort: exe + locations: + - {{default_location}} + description: Periodically triggers k8s-deployment-rollout-troubleshoot runbook for deployment {{match_resource.resource.metadata.name}}. + codeBundle: + repoUrl: https://github.com/runwhen-contrib/rw-workspace-utils.git + ref: main + pathToRobot: codebundles/cron-scheduler-sli/sli.robot + intervalStrategy: intermezzo + intervalSeconds: 300 + configProvided: + - name: CRON_SCHEDULE + value: "0 */6 * * *" + - name: TARGET_SLX + value: "" + - name: DRY_RUN + value: "false" + secretsProvided: [] diff --git a/codebundles/k8s-deployment-rollout-troubleshoot/.runwhen/templates/k8s-deployment-rollout-troubleshoot-slx.yaml b/codebundles/k8s-deployment-rollout-troubleshoot/.runwhen/templates/k8s-deployment-rollout-troubleshoot-slx.yaml new file mode 100644 index 00000000..6d6e4e3a --- /dev/null +++ b/codebundles/k8s-deployment-rollout-troubleshoot/.runwhen/templates/k8s-deployment-rollout-troubleshoot-slx.yaml @@ -0,0 +1,25 @@ +apiVersion: runwhen.com/v1 +kind: ServiceLevelX +metadata: + name: {{slx_name}} + labels: + {% include "common-labels.yaml" %} + annotations: + {% include "common-annotations.yaml" %} +spec: + imageURL: 
https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/kubernetes/resources/labeled/deploy.svg + alias: {{match_resource.resource.metadata.name}} Rollout Troubleshoot + asMeasuredBy: Periodic execution of rollout troubleshoot tasks when rollout health degrades. + configProvided: + - name: DEPLOYMENT_NAME + value: {{match_resource.resource.metadata.name}} + owners: + - {{workspace.owner_email}} + statement: Deployment {{match_resource.resource.metadata.name}} rollouts should complete successfully without stalled or failed revisions. + additionalContext: + {% include "kubernetes-hierarchy.yaml" ignore missing %} + qualified_name: "{{ match_resource.qualified_name }}" + tags: + {% include "kubernetes-tags.yaml" ignore missing %} + - name: access + value: read-only diff --git a/codebundles/k8s-deployment-rollout-troubleshoot/.runwhen/templates/k8s-deployment-rollout-troubleshoot-taskset.yaml b/codebundles/k8s-deployment-rollout-troubleshoot/.runwhen/templates/k8s-deployment-rollout-troubleshoot-taskset.yaml new file mode 100644 index 00000000..11b4cf7e --- /dev/null +++ b/codebundles/k8s-deployment-rollout-troubleshoot/.runwhen/templates/k8s-deployment-rollout-troubleshoot-taskset.yaml @@ -0,0 +1,45 @@ +apiVersion: runwhen.com/v1 +kind: Runbook +metadata: + name: {{slx_name}} + labels: + {% include "common-labels.yaml" %} + annotations: + {% include "common-annotations.yaml" %} +spec: + location: {{default_location}} + description: Read-only diagnostics for Kubernetes deployment rollout lifecycle failures. 
+ codeBundle: + {% if repo_url %} + repoUrl: {{repo_url}} + {% else %} + repoUrl: https://github.com/runwhen-contrib/rw-cli-codecollection.git + {% endif %} + {% if ref %} + ref: {{ref}} + {% else %} + ref: main + {% endif %} + pathToRobot: codebundles/k8s-deployment-rollout-troubleshoot/runbook.robot + configProvided: + - name: NAMESPACE + value: "{{match_resource.resource.metadata.namespace}}" + - name: CONTEXT + value: "{{context}}" + - name: DEPLOYMENT_NAME + value: "{{match_resource.resource.metadata.name}}" + - name: KUBERNETES_DISTRIBUTION_BINARY + value: "{{custom.kubernetes_distribution_binary | default('kubectl')}}" + - name: EVENT_AGE + value: "30m" + - name: ROLLOUT_STATUS_TIMEOUT + value: "30" + - name: STUCK_TERMINATING_THRESHOLD + value: "5" + secretsProvided: + {% if wb_version %} + {% include "kubernetes-auth.yaml" ignore missing %} + {% else %} + - name: kubeconfig + workspaceKey: {{custom.kubeconfig_secret_name | default("kubeconfig")}} + {% endif %} diff --git a/codebundles/k8s-deployment-rollout-troubleshoot/.test/README.md b/codebundles/k8s-deployment-rollout-troubleshoot/.test/README.md new file mode 100644 index 00000000..2316b037 --- /dev/null +++ b/codebundles/k8s-deployment-rollout-troubleshoot/.test/README.md @@ -0,0 +1,15 @@ +# Test Infrastructure for k8s-deployment-rollout-troubleshoot + +Apply manifests with `task build-infra` to create test deployments covering rollout troubleshoot scenarios in the `test-rollout-troubleshoot` namespace. 
+ +## Scenarios + +| Deployment | Scenario | +|---|---| +| `healthy-rollout` | Complete healthy rollout | +| `progress-deadline-fail` | ProgressDeadlineExceeded via failing readiness probe | +| `pdb-blocked-rollout` | PDB minAvailable blocks eviction during rollout | +| `image-pull-fail` | ImagePullBackOff on bad image tag | +| `stuck-terminating-seed` | Long preStop hook for terminating pod testing | + +Run `task default` after committing and pushing changes to validate generation rules via RunWhen Local discovery. diff --git a/codebundles/k8s-deployment-rollout-troubleshoot/.test/Taskfile.yaml b/codebundles/k8s-deployment-rollout-troubleshoot/.test/Taskfile.yaml new file mode 100644 index 00000000..adc10a98 --- /dev/null +++ b/codebundles/k8s-deployment-rollout-troubleshoot/.test/Taskfile.yaml @@ -0,0 +1,178 @@ +version: "3" + +tasks: + default: + desc: "Run/refresh config" + cmds: + - task: check-unpushed-commits + - task: generate-rwl-config + - task: run-rwl-discovery + + clean: + desc: "Run cleanup tasks" + cmds: + - task: remove-kubernetes-objects + - task: delete-slxs + - task: clean-rwl-discovery + + build-infra: + desc: "Build test infrastructure" + cmds: + - task: create-kubernetes-objects + + create-kubernetes-objects: + desc: "Apply manifests from kubernetes directory using kubectl" + cmds: + - kubectl apply -f kubernetes/* + silent: true + + remove-kubernetes-objects: + desc: "Delete kubernetes objects" + cmds: + - kubectl delete -f kubernetes/* --ignore-not-found + silent: true + + check-unpushed-commits: + desc: Check if outstanding commits or file updates need to be pushed before testing. + vars: + BASE_DIR: "../" + cmds: + - | + echo "Checking for uncommitted changes in $BASE_DIR and $BASE_DIR.runwhen, excluding '.test'..." 
+ UNCOMMITTED_FILES=$(git diff --name-only HEAD | grep -E "^${BASE_DIR}(\.runwhen|[^/]+)" | grep -v "/\.test/" || true) + if [ -n "$UNCOMMITTED_FILES" ]; then + echo "✗" + echo "Uncommitted changes found:" + echo "$UNCOMMITTED_FILES" + echo "Remember to commit & push changes before executing the run-rwl-discovery task." + echo "------------" + exit 1 + else + echo "√" + echo "No uncommitted changes in specified directories." + echo "------------" + fi + - | + echo "Checking for unpushed commits in $BASE_DIR and $BASE_DIR.runwhen, excluding '.test'..." + git fetch origin + UNPUSHED_FILES=$(git diff --name-only origin/$(git rev-parse --abbrev-ref HEAD) HEAD | grep -E "^${BASE_DIR}(\.runwhen|[^/]+)" | grep -v "/\.test/" || true) + if [ -n "$UNPUSHED_FILES" ]; then + echo "✗" + echo "Unpushed commits found:" + echo "$UNPUSHED_FILES" + echo "Remember to push changes before executing the run-rwl-discovery task." + echo "------------" + exit 1 + else + echo "√" + echo "No unpushed commits in specified directories." 
+ echo "------------" + fi + silent: true + + generate-rwl-config: + desc: "Generate RunWhen Local configuration (workspaceInfo.yaml)" + env: + RW_WORKSPACE: '{{.RW_WORKSPACE | default "my-workspace"}}' + cmds: + - | + repo_url=$(git config --get remote.origin.url) + branch_name=$(git rev-parse --abbrev-ref HEAD) + codebundle=$(basename "$(dirname "$PWD")") + + namespace=$(yq e 'select(.kind == "Namespace") | .metadata.name' kubernetes/manifest.yaml -N) # namespace under test, read from the test manifest + cat <<EOF > workspaceInfo.yaml + workspaceName: "$RW_WORKSPACE" + workspaceOwnerEmail: authors@runwhen.com + defaultLocation: location-01 + defaultLOD: none + cloudConfig: + kubernetes: + kubeconfigFile: /shared/kubeconfig + namespaceLODs: + $namespace: detailed + namespaces: + - $namespace + codeCollections: + - repoURL: "$repo_url" + branch: "$branch_name" + codeBundles: ["$codebundle"] + custom: + kubeconfig_secret_name: "kubeconfig" + kubernetes_distribution_binary: kubectl + EOF + silent: true + + run-rwl-discovery: + desc: "Run RunWhen Local Discovery on test infrastructure" + cmds: + - | + CONTAINER_NAME="RunWhenLocal" + if docker ps -q --filter "name=$CONTAINER_NAME" | grep -q .; then + echo "Stopping and removing existing container $CONTAINER_NAME..." + docker stop $CONTAINER_NAME && docker rm $CONTAINER_NAME + elif docker ps -a -q --filter "name=$CONTAINER_NAME" | grep -q .; then + echo "Removing existing stopped container $CONTAINER_NAME..." + docker rm $CONTAINER_NAME + else + echo "No existing container named $CONTAINER_NAME found." + fi + + echo "Cleaning up output directory..." + sudo rm -rf output || { echo "Failed to remove output directory"; exit 1; } + mkdir output && chmod 777 output || { echo "Failed to set permissions"; exit 1; } + + echo "Starting new container $CONTAINER_NAME..."
+ + kubeconfig=$(echo $RW_FROM_FILE | jq -r .kubeconfig) + + docker run --name $CONTAINER_NAME -p 8081:8081 \ + -v "$(pwd)":/shared \ + -v $kubeconfig:/shared/kubeconfig \ + -d ghcr.io/runwhen-contrib/runwhen-local:latest || { + echo "Failed to start container"; exit 1; + } + + echo "Running workspace builder script in container..." + docker exec -w /workspace-builder $CONTAINER_NAME ./run.sh $1 --verbose || { + echo "Error executing script in container"; exit 1; + } + + echo "Review generated config files under output/workspaces/" + silent: true + + validate-generation-rules: + desc: "Validate YAML files in .runwhen/generation-rules" + cmds: + - | + for cmd in curl yq ajv; do + if ! command -v $cmd &> /dev/null; then + echo "Error: $cmd is required but not installed." + exit 1 + fi + done + + temp_dir=$(mktemp -d) + curl -s -o "$temp_dir/generation-rule-schema.json" \ + https://raw.githubusercontent.com/runwhen-contrib/runwhen-local/refs/heads/main/src/generation-rule-schema.json + + for yaml_file in ../.runwhen/generation-rules/*.yaml; do + echo "Validating $yaml_file" + json_file="$temp_dir/$(basename "${yaml_file%.*}.json")" + yq -o=json "$yaml_file" > "$json_file" + ajv validate -s "$temp_dir/generation-rule-schema.json" -d "$json_file" \ + --spec=draft2020 --strict=false \ + && echo "$yaml_file is valid." || echo "$yaml_file is invalid." 
+ done + + rm -rf "$temp_dir" + silent: true + + clean-rwl-discovery: + desc: "Check and clean up RunWhen Local discovery output" + cmds: + - | + sudo rm -rf output + rm -f workspaceInfo.yaml + rm -f kubeconfig + silent: true diff --git a/codebundles/k8s-deployment-rollout-troubleshoot/.test/kubernetes/manifest.yaml b/codebundles/k8s-deployment-rollout-troubleshoot/.test/kubernetes/manifest.yaml new file mode 100644 index 00000000..7f0cd527 --- /dev/null +++ b/codebundles/k8s-deployment-rollout-troubleshoot/.test/kubernetes/manifest.yaml @@ -0,0 +1,171 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: test-rollout-troubleshoot + +--- +# Healthy rollout - nginx stable deployment at desired replicas +apiVersion: apps/v1 +kind: Deployment +metadata: + name: healthy-rollout + namespace: test-rollout-troubleshoot + labels: + app: healthy-rollout + scenario: healthy_rollout_complete +spec: + replicas: 2 + selector: + matchLabels: + app: healthy-rollout + template: + metadata: + labels: + app: healthy-rollout + spec: + containers: + - name: nginx + image: nginx:1.25-alpine + ports: + - containerPort: 80 + readinessProbe: + httpGet: + path: / + port: 80 + initialDelaySeconds: 2 + periodSeconds: 5 + +--- +# Progress deadline exceeded - failing readiness with short deadline +apiVersion: apps/v1 +kind: Deployment +metadata: + name: progress-deadline-fail + namespace: test-rollout-troubleshoot + labels: + app: progress-deadline-fail + scenario: progress_deadline_exceeded +spec: + replicas: 2 + progressDeadlineSeconds: 60 + selector: + matchLabels: + app: progress-deadline-fail + template: + metadata: + labels: + app: progress-deadline-fail + spec: + containers: + - name: nginx + image: nginx:1.25-alpine + readinessProbe: + httpGet: + path: /nonexistent + port: 80 + initialDelaySeconds: 5 + periodSeconds: 5 + failureThreshold: 1 + +--- +# PDB blocks eviction - minAvailable equals replicas during rollout +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: 
pdb-blocks-rollout + namespace: test-rollout-troubleshoot +spec: + minAvailable: 2 + selector: + matchLabels: + app: pdb-blocked-rollout + +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: pdb-blocked-rollout + namespace: test-rollout-troubleshoot + labels: + app: pdb-blocked-rollout + scenario: pdb_blocks_eviction +spec: + replicas: 2 + strategy: + type: RollingUpdate + rollingUpdate: + maxUnavailable: 0 + maxSurge: 1 + selector: + matchLabels: + app: pdb-blocked-rollout + template: + metadata: + labels: + app: pdb-blocked-rollout + spec: + containers: + - name: nginx + image: nginx:1.25-alpine + readinessProbe: + httpGet: + path: / + port: 80 + initialDelaySeconds: 2 + periodSeconds: 5 + +--- +# Image pull failure on new revision - bad image tag triggers ImagePullBackOff +apiVersion: apps/v1 +kind: Deployment +metadata: + name: image-pull-fail + namespace: test-rollout-troubleshoot + labels: + app: image-pull-fail + scenario: image_pull_failure_on_new_rs +spec: + replicas: 1 + selector: + matchLabels: + app: image-pull-fail + template: + metadata: + labels: + app: image-pull-fail + spec: + containers: + - name: bad-image + image: nginx:does-not-exist-tag-12345 + ports: + - containerPort: 80 + +--- +# Stuck terminating simulation - 300s preStop sleep with a 600s terminationGracePeriodSeconds keeps deleted pods in Terminating +# Note: a pod only becomes stuck Terminating once it is deleted at runtime; this manifest just seeds the deployment for discovery +apiVersion: apps/v1 +kind: Deployment +metadata: + name: stuck-terminating-seed + namespace: test-rollout-troubleshoot + labels: + app: stuck-terminating-seed + scenario: stuck_terminating_pods +spec: + replicas: 1 + selector: + matchLabels: + app: stuck-terminating-seed + template: + metadata: + labels: + app: stuck-terminating-seed + spec: + terminationGracePeriodSeconds: 600 + containers: + - name: nginx + image: nginx:1.25-alpine + lifecycle: + preStop: + exec: + command: ["/bin/sh", "-c", "sleep 300"] diff --git a/codebundles/k8s-deployment-rollout-troubleshoot/README.md
b/codebundles/k8s-deployment-rollout-troubleshoot/README.md new file mode 100644 index 00000000..f900c680 --- /dev/null +++ b/codebundles/k8s-deployment-rollout-troubleshoot/README.md @@ -0,0 +1,76 @@ +# Kubernetes Deployment Rollout Troubleshoot + +Read-only diagnostics for Kubernetes Deployments whose rolling updates are stuck, slow, or failing. This CodeBundle focuses on rollout lifecycle signals—conditions, ReplicaSet state, blocking pods and events, rollout strategy, and PodDisruptionBudget constraints—so operators can quickly determine why a deployment will not reach a successful rollout. + +## Overview + +- **Rollout status**: Evaluates deployment conditions, replica counts, and `kubectl rollout status` sampling +- **ReplicaSet comparison**: Detects conflicting active ReplicaSets and outdated pods during rollout +- **New ReplicaSet pod failures**: Surfaces Pending, image pull, crash, and readiness failures on the latest revision +- **Rollout strategy**: Reviews RollingUpdate/Recreate settings, progress deadlines, and paused state +- **PDB impact**: Identifies PodDisruptionBudgets that block pod eviction during rollout +- **Blocking events**: Collects recent Warning/Error events on the deployment, ReplicaSets, and pods +- **Stuck terminating pods**: Finds pods stuck in Terminating that block old ReplicaSet scale-down +- **Rollout history**: Summarizes revision history and recent template changes (image, probes, resources) + +All tasks are read-only. Remediation belongs in `k8s-deployment-ops`. 
+ +## Configuration + +### Required Variables + +- `CONTEXT`: Kubernetes context to operate within +- `NAMESPACE`: Namespace containing the deployment +- `DEPLOYMENT_NAME`: Name of the deployment to troubleshoot + +### Optional Variables + +- `KUBERNETES_DISTRIBUTION_BINARY`: Kubernetes CLI binary (`kubectl` or `oc`) (default: `kubectl`) +- `EVENT_AGE`: Lookback window for rollout-related events (default: `30m`) +- `ROLLOUT_STATUS_TIMEOUT`: Seconds to wait when sampling rollout status (default: `30`) +- `STUCK_TERMINATING_THRESHOLD`: Minutes a pod may remain Terminating before raising an issue (default: `5`) + +### Secrets + +- `kubeconfig`: Standard kubeconfig YAML with RBAC read access to deployments, replicasets, pods, events, and poddisruptionbudgets in the target namespace + +## Tasks Overview + +### Check Deployment Rollout Status + +Evaluates rollout progress via deployment status fields and `kubectl rollout status`. Detects `ProgressDeadlineExceeded`, stalled progressing conditions, and mismatches between desired, updated, available, and ready replica counts. + +### Compare Deployment ReplicaSets During Rollout + +Compares the latest ReplicaSet against older ReplicaSets owned by the deployment. Flags conflicting active ReplicaSets, outdated pods not on the latest revision, and rollouts where the new ReplicaSet is not receiving traffic. + +### Inspect New ReplicaSet Pod Failures + +Focuses on pods owned by the latest ReplicaSet that block rollout completion: Pending, CrashLoopBackOff, ImagePullBackOff, ErrImagePull, CreateContainerConfigError, and containers failing readiness after start. + +### Check Rollout Strategy Configuration + +Reviews deployment update strategy (RollingUpdate vs Recreate), maxUnavailable, maxSurge, progressDeadlineSeconds, revisionHistoryLimit, and paused state. Identifies configurations that can stall or dangerously slow rollouts. 
+ +### Check PodDisruptionBudget Impact on Rollout + +Finds PDBs whose selectors match the deployment and evaluates whether minAvailable or maxUnavailable constraints prevent eviction of old pods or creation/scheduling of new pods during the rollout. + +### Detect Rollout Blocking Events + +Surfaces recent Warning/Error events on the deployment, its ReplicaSets, and rollout pods (FailedScheduling, FailedCreate, ReplicaFailure, ProgressDeadlineExceeded, FailedMount, quota/admission failures) within a configurable time window. + +### Check Stuck Terminating Pods Blocking Rollout + +Identifies deployment pods stuck in Terminating state that prevent old ReplicaSet scale-down and block rollout completion; includes finalizer and node attachment hints. + +### Fetch Rollout History + +Retrieves rollout revision history and summarizes recent template changes (image, env, probes, resources) to correlate failed rollouts with specific revisions. + +## Related CodeBundles + +- `k8s-deployment-healthcheck`: General deployment triage (replicas, probes, logs, restarts, HPA) +- `k8s-deployment-ops`: Remediation actions (rollout restart, rollback, scale stale ReplicaSets, force delete pods) +- `k8s-app-troubleshoot`: Deep application log and stacktrace analysis when new ReplicaSet pods fail readiness or crash +- `k8s-argocd-application-health`: GitOps sync/health issues when deployment is ArgoCD-managed diff --git a/codebundles/k8s-deployment-rollout-troubleshoot/check-pdb-rollout-impact.sh b/codebundles/k8s-deployment-rollout-troubleshoot/check-pdb-rollout-impact.sh new file mode 100755 index 00000000..fd816770 --- /dev/null +++ b/codebundles/k8s-deployment-rollout-troubleshoot/check-pdb-rollout-impact.sh @@ -0,0 +1,87 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x +# ----------------------------------------------------------------------------- +# REQUIRED ENV VARS: CONTEXT, NAMESPACE, DEPLOYMENT_NAME +# Outputs issues to check_pdb_rollout_impact.json +# 
----------------------------------------------------------------------------- + +: "${KUBERNETES_DISTRIBUTION_BINARY:=kubectl}" +: "${CONTEXT:?Must set CONTEXT}" +: "${NAMESPACE:?Must set NAMESPACE}" +: "${DEPLOYMENT_NAME:?Must set DEPLOYMENT_NAME}" + +OUTPUT_FILE="check_pdb_rollout_impact.json" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=k8s-rollout-helpers.sh +source "${SCRIPT_DIR}/k8s-rollout-helpers.sh" + +init_issues_json + +echo "Checking PDB impact on rollout for deployment ${DEPLOYMENT_NAME}" + +if ! fetch_deployment_json; then + write_issues "$OUTPUT_FILE" + exit 0 +fi + +match_labels=$(echo "$DEPLOYMENT_JSON" | jq -c '.spec.template.metadata.labels // .spec.selector.matchLabels // {}') # PDB selectors are evaluated against pod labels, so compare with the pod template labels +replicas=$(echo "$DEPLOYMENT_JSON" | jq '.spec.replicas // 0') +ready_replicas=$(echo "$DEPLOYMENT_JSON" | jq '.status.readyReplicas // 0') + +PDB_JSON=$("${K8S_CMD[@]}" get pdb -o json 2>/dev/null || echo '{"items":[]}') + +matching_pdbs=$(echo "$PDB_JSON" | jq --argjson labels "$match_labels" \ + '[.items[] | select(.spec.selector.matchLabels as $pdb_labels | + (($pdb_labels | type) == "object") and ($pdb_labels | to_entries | all(.key as $k | .value as $v | $labels[$k] == $v)))]') + +pdb_count=$(echo "$matching_pdbs" | jq 'length') +echo "Found ${pdb_count} matching PDB(s)" + +if [[ "$pdb_count" -eq 0 ]]; then + echo "No PodDisruptionBudgets match deployment selector."
+ write_issues "$OUTPUT_FILE" + exit 0 +fi + +while IFS= read -r pdb_name; do + [[ -z "$pdb_name" ]] && continue + pdb_detail=$(echo "$matching_pdbs" | jq --arg name "$pdb_name" '.[] | select(.metadata.name==$name)') + min_available=$(echo "$pdb_detail" | jq -r '.spec.minAvailable // empty') + max_unavailable=$(echo "$pdb_detail" | jq -r '.spec.maxUnavailable // empty') + disruptions_allowed=$(echo "$pdb_detail" | jq -r '.status.disruptionsAllowed // 0') + current_healthy=$(echo "$pdb_detail" | jq -r '.status.currentHealthy // 0') + desired_healthy=$(echo "$pdb_detail" | jq -r '.status.desiredHealthy // 0') + + echo "PDB ${pdb_name}: minAvailable=${min_available:-n/a}, maxUnavailable=${max_unavailable:-n/a}, disruptionsAllowed=${disruptions_allowed}" + + if [[ "$disruptions_allowed" == "0" && "$replicas" -gt 0 ]]; then + add_issue "2" \ + "PDB \`${pdb_name}\` Blocks Pod Eviction During Rollout for Deployment \`${DEPLOYMENT_NAME}\` in Namespace \`${NAMESPACE}\`" \ + "PDB ${pdb_name} allows 0 disruptions (currentHealthy=${current_healthy}, desiredHealthy=${desired_healthy}, minAvailable=${min_available:-unset}, maxUnavailable=${max_unavailable:-unset}). Ready replicas=${ready_replicas}, desired=${replicas}." \ + "Temporarily relax PDB minAvailable/maxUnavailable or scale deployment carefully. Scale Down Stale ReplicaSets after resolving constraints." + fi + + if [[ -n "$min_available" && "$min_available" =~ ^[0-9]+$ ]]; then + if [[ "$replicas" -le "$min_available" ]]; then + add_issue "3" \ + "PDB \`${pdb_name}\` minAvailable Prevents Rollout for Deployment \`${DEPLOYMENT_NAME}\` in Namespace \`${NAMESPACE}\`" \ + "minAvailable=${min_available} with deployment replicas=${replicas} leaves no room to terminate old pods during rolling update." \ + "Increase replica count or adjust PDB minAvailable to allow at least one pod disruption during rollout." 
+ fi + fi + + if [[ -n "$min_available" && "$min_available" =~ ^[0-9]+%$ ]]; then + pct=${min_available%\%} + required=$(( (replicas * pct + 99) / 100 )) + if [[ "$required" -ge "$replicas" && "$replicas" -gt 0 ]]; then + add_issue "3" \ + "PDB \`${pdb_name}\` Percentage minAvailable May Block Rollout for Deployment \`${DEPLOYMENT_NAME}\` in Namespace \`${NAMESPACE}\`" \ + "minAvailable=${min_available} requires ${required} of ${replicas} pods available, leaving no eviction budget." \ + "Review PDB percentage relative to replica count during rollouts." + fi + fi +done < <(echo "$matching_pdbs" | jq -r '.[].metadata.name') + +write_issues "$OUTPUT_FILE" +echo "Analysis completed. Results saved to ${OUTPUT_FILE}" diff --git a/codebundles/k8s-deployment-rollout-troubleshoot/check-rollout-status.sh b/codebundles/k8s-deployment-rollout-troubleshoot/check-rollout-status.sh new file mode 100755 index 00000000..32d5567f --- /dev/null +++ b/codebundles/k8s-deployment-rollout-troubleshoot/check-rollout-status.sh @@ -0,0 +1,102 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x +# ----------------------------------------------------------------------------- +# REQUIRED ENV VARS: CONTEXT, NAMESPACE, DEPLOYMENT_NAME +# OPTIONAL: KUBERNETES_DISTRIBUTION_BINARY, ROLLOUT_STATUS_TIMEOUT +# Outputs issues to check_rollout_status.json +# ----------------------------------------------------------------------------- + +: "${KUBERNETES_DISTRIBUTION_BINARY:=kubectl}" +: "${CONTEXT:?Must set CONTEXT}" +: "${NAMESPACE:?Must set NAMESPACE}" +: "${DEPLOYMENT_NAME:?Must set DEPLOYMENT_NAME}" +: "${ROLLOUT_STATUS_TIMEOUT:=30}" + +OUTPUT_FILE="check_rollout_status.json" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=k8s-rollout-helpers.sh +source "${SCRIPT_DIR}/k8s-rollout-helpers.sh" + +init_issues_json + +echo "Checking rollout status for deployment ${DEPLOYMENT_NAME} in namespace ${NAMESPACE}" + +if ! 
fetch_deployment_json; then + write_issues "$OUTPUT_FILE" + exit 0 +fi + +replicas=$(echo "$DEPLOYMENT_JSON" | jq '.status.replicas // 0') +updated_replicas=$(echo "$DEPLOYMENT_JSON" | jq '.status.updatedReplicas // 0') +available_replicas=$(echo "$DEPLOYMENT_JSON" | jq '.status.availableReplicas // 0') +ready_replicas=$(echo "$DEPLOYMENT_JSON" | jq '.status.readyReplicas // 0') +desired_replicas=$(echo "$DEPLOYMENT_JSON" | jq '.spec.replicas // 0') +paused=$(echo "$DEPLOYMENT_JSON" | jq -r '.spec.paused // false') + +progressing=$(echo "$DEPLOYMENT_JSON" | jq '.status.conditions[]? | select(.type=="Progressing")') +progressing_status=$(echo "$progressing" | jq -r '.status // "Unknown"') +progressing_reason=$(echo "$progressing" | jq -r '.reason // "Unknown"') +progressing_message=$(echo "$progressing" | jq -r '.message // ""') + +available_condition=$(echo "$DEPLOYMENT_JSON" | jq '.status.conditions[]? | select(.type=="Available")') +available_status=$(echo "$available_condition" | jq -r '.status // "Unknown"') + +echo "Deployment status: replicas=${replicas}, updated=${updated_replicas}, available=${available_replicas}, ready=${ready_replicas}, desired=${desired_replicas}" +echo "Progressing: status=${progressing_status}, reason=${progressing_reason}, message=${progressing_message}" + +if [[ "$paused" == "true" ]]; then + add_issue "3" \ + "Deployment \`${DEPLOYMENT_NAME}\` Rollout is Paused in Namespace \`${NAMESPACE}\`" \ + "spec.paused is true. No rollout progress will occur until the deployment is resumed." \ + "Resume the deployment rollout or investigate why it was paused. Run k8s-deployment-ops Rollback Deployment if a bad revision was paused mid-rollout." +fi + +if [[ "$progressing_reason" == "ProgressDeadlineExceeded" ]]; then + add_issue "2" \ + "Progress Deadline Exceeded for Deployment \`${DEPLOYMENT_NAME}\` in Namespace \`${NAMESPACE}\`" \ + "Progressing condition reason is ProgressDeadlineExceeded. Message: ${progressing_message}. 
Replica counts: updated=${updated_replicas}/${replicas}, available=${available_replicas}, ready=${ready_replicas}." \ + "Inspect New ReplicaSet Pod Failures for \`${DEPLOYMENT_NAME}\`. Check Rollout Strategy Configuration. Consider Rollback Deployment in k8s-deployment-ops." +fi + +if [[ "$progressing_status" == "False" && "$progressing_reason" != "NewReplicaSetAvailable" && "$progressing_reason" != "ProgressDeadlineExceeded" ]]; then # ProgressDeadlineExceeded already has its own issue; avoid reporting the same condition twice + add_issue "2" \ + "Deployment \`${DEPLOYMENT_NAME}\` Progressing Condition is False in Namespace \`${NAMESPACE}\`" \ + "Progressing condition status=False, reason=${progressing_reason}, message=${progressing_message}." \ + "Compare Deployment ReplicaSets During Rollout. Detect Rollout Blocking Events. Inspect New ReplicaSet Pod Failures." +fi + +if [[ "$updated_replicas" -lt "$replicas" ]]; then + add_issue "2" \ + "Deployment \`${DEPLOYMENT_NAME}\` Has Outdated ReplicaSet Pods During Rollout in Namespace \`${NAMESPACE}\`" \ + "updatedReplicas (${updated_replicas}) is less than total replicas (${replicas}). Rollout has not fully shifted pods to the new revision." \ + "Compare Deployment ReplicaSets During Rollout. Check PodDisruptionBudget Impact on Rollout. Check Stuck Terminating Pods Blocking Rollout." +fi + +if [[ "$available_replicas" -lt "$updated_replicas" || "$ready_replicas" -lt "$updated_replicas" ]]; then + add_issue "2" \ + "Deployment \`${DEPLOYMENT_NAME}\` New Revision Pods Not Ready in Namespace \`${NAMESPACE}\`" \ + "New revision pods are not fully available/ready: updated=${updated_replicas}, available=${available_replicas}, ready=${ready_replicas}." \ + "Inspect New ReplicaSet Pod Failures for \`${DEPLOYMENT_NAME}\`. Run k8s-app-troubleshoot if pods fail readiness after starting." +fi + +if [[ "$available_status" == "False" && "$desired_replicas" -gt 0 ]]; then + add_issue "3" \ + "Deployment \`${DEPLOYMENT_NAME}\` Not Available in Namespace \`${NAMESPACE}\`" \ + "Available condition is False while desired replicas=${desired_replicas}.
Rollout may be incomplete or failing." \
+        "Check Deployment Rollout Status and Inspect New ReplicaSet Pod Failures."
+fi
+
+rollout_sample=$("${K8S_CMD[@]}" rollout status "deployment/${DEPLOYMENT_NAME}" --timeout="${ROLLOUT_STATUS_TIMEOUT}s" 2>&1 || true)
+echo "Rollout status sample (timeout ${ROLLOUT_STATUS_TIMEOUT}s):"
+echo "$rollout_sample"
+
+if echo "$rollout_sample" | grep -qiE "progress deadline exceeded|timed out waiting"; then
+    add_issue "3" \
+        "Rollout Status Timeout for Deployment \`${DEPLOYMENT_NAME}\` in Namespace \`${NAMESPACE}\`" \
+        "kubectl rollout status did not complete within ${ROLLOUT_STATUS_TIMEOUT}s: ${rollout_sample}" \
+        "Increase ROLLOUT_STATUS_TIMEOUT for deeper sampling or run Compare Deployment ReplicaSets During Rollout."
+fi
+
+write_issues "$OUTPUT_FILE"
+echo "Analysis completed. Results saved to ${OUTPUT_FILE}"
diff --git a/codebundles/k8s-deployment-rollout-troubleshoot/check-rollout-strategy-config.sh b/codebundles/k8s-deployment-rollout-troubleshoot/check-rollout-strategy-config.sh
new file mode 100755
index 00000000..cd7034af
--- /dev/null
+++ b/codebundles/k8s-deployment-rollout-troubleshoot/check-rollout-strategy-config.sh
@@ -0,0 +1,82 @@
+#!/usr/bin/env bash
+set -euo pipefail
+set -x
+# -----------------------------------------------------------------------------
+# REQUIRED ENV VARS: CONTEXT, NAMESPACE, DEPLOYMENT_NAME
+# Outputs issues to check_rollout_strategy_config.json
+# -----------------------------------------------------------------------------
+
+: "${KUBERNETES_DISTRIBUTION_BINARY:=kubectl}"
+: "${CONTEXT:?Must set CONTEXT}"
+: "${NAMESPACE:?Must set NAMESPACE}"
+: "${DEPLOYMENT_NAME:?Must set DEPLOYMENT_NAME}"
+
+OUTPUT_FILE="check_rollout_strategy_config.json"
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck source=k8s-rollout-helpers.sh
+source "${SCRIPT_DIR}/k8s-rollout-helpers.sh"
+
+init_issues_json
+
+echo "Checking rollout strategy configuration for deployment ${DEPLOYMENT_NAME}"
+
+if ! fetch_deployment_json; then
+    write_issues "$OUTPUT_FILE"
+    exit 0
+fi
+
+strategy_type=$(echo "$DEPLOYMENT_JSON" | jq -r '.spec.strategy.type // "RollingUpdate"')
+max_unavailable=$(echo "$DEPLOYMENT_JSON" | jq -r '.spec.strategy.rollingUpdate.maxUnavailable // "25%"')
+max_surge=$(echo "$DEPLOYMENT_JSON" | jq -r '.spec.strategy.rollingUpdate.maxSurge // "25%"')
+progress_deadline=$(echo "$DEPLOYMENT_JSON" | jq -r '.spec.progressDeadlineSeconds // 600')
+revision_limit=$(echo "$DEPLOYMENT_JSON" | jq -r '.spec.revisionHistoryLimit // 10')
+paused=$(echo "$DEPLOYMENT_JSON" | jq -r '.spec.paused // false')
+replicas=$(echo "$DEPLOYMENT_JSON" | jq '.spec.replicas // 0')
+
+echo "Strategy: ${strategy_type}, maxUnavailable=${max_unavailable}, maxSurge=${max_surge}, progressDeadlineSeconds=${progress_deadline}, revisionHistoryLimit=${revision_limit}, paused=${paused}"
+
+if [[ "$paused" == "true" ]]; then
+    add_issue "3" \
+        "Deployment \`${DEPLOYMENT_NAME}\` Rollout is Paused in Namespace \`${NAMESPACE}\`" \
+        "spec.paused=true prevents rollout progress." \
+        "Resume rollout after verifying the desired template. Use k8s-deployment-ops Rollback Deployment if needed."
+fi
+
+if [[ "$strategy_type" == "Recreate" && "$replicas" -gt 1 ]]; then
+    add_issue "4" \
+        "Recreate Strategy May Cause Downtime for Deployment \`${DEPLOYMENT_NAME}\` in Namespace \`${NAMESPACE}\`" \
+        "Deployment uses Recreate strategy with ${replicas} replicas. All pods terminate before new ones start." \
+        "Expected for Recreate; confirm downtime window is acceptable. Consider RollingUpdate if zero-downtime is required."
+fi
+
+if [[ "$progress_deadline" -lt 120 && "$replicas" -gt 0 ]]; then
+    add_issue "3" \
+        "Short Progress Deadline for Deployment \`${DEPLOYMENT_NAME}\` in Namespace \`${NAMESPACE}\`" \
+        "progressDeadlineSeconds=${progress_deadline} may mark slow-starting pods as failed too quickly." \
+        "Increase progressDeadlineSeconds or optimize startup/readiness probes if rollouts fail prematurely."
+fi
+
+# Both-zero limits are reported once here; a separate "restrictive limits" check only ever re-reported this same case.
+if [[ "$max_unavailable" == "0" || "$max_unavailable" == "0%" ]]; then
+    if [[ "$max_surge" == "0" || "$max_surge" == "0%" ]]; then
+        add_issue "3" \
+            "Rollout Strategy Cannot Progress for Deployment \`${DEPLOYMENT_NAME}\` in Namespace \`${NAMESPACE}\`" \
+            "Both maxUnavailable and maxSurge are 0. Rolling updates cannot replace pods." \
+            "Adjust maxUnavailable or maxSurge to allow rollout progress."
+    elif [[ "$replicas" -eq 1 ]]; then
+        add_issue "4" \
+            "Single-Replica Deployment with maxUnavailable=0 for \`${DEPLOYMENT_NAME}\` in Namespace \`${NAMESPACE}\`" \
+            "With 1 replica and maxUnavailable=0, rollout depends entirely on maxSurge=${max_surge}." \
+            "Ensure maxSurge allows a temporary extra pod during rollout."
+    fi
+fi
+
+if [[ "$revision_limit" == "0" ]]; then
+    add_issue "4" \
+        "No Rollout History Retained for Deployment \`${DEPLOYMENT_NAME}\` in Namespace \`${NAMESPACE}\`" \
+        "revisionHistoryLimit=0 prevents kubectl rollout undo." \
+        "Set revisionHistoryLimit to at least 2 if rollback capability is needed."
+fi
+
+write_issues "$OUTPUT_FILE"
+echo "Analysis completed.
Results saved to ${OUTPUT_FILE}"
diff --git a/codebundles/k8s-deployment-rollout-troubleshoot/check-stuck-terminating-pods.sh b/codebundles/k8s-deployment-rollout-troubleshoot/check-stuck-terminating-pods.sh
new file mode 100755
index 00000000..626817ea
--- /dev/null
+++ b/codebundles/k8s-deployment-rollout-troubleshoot/check-stuck-terminating-pods.sh
@@ -0,0 +1,80 @@
+#!/usr/bin/env bash
+set -euo pipefail
+set -x
+# -----------------------------------------------------------------------------
+# REQUIRED ENV VARS: CONTEXT, NAMESPACE, DEPLOYMENT_NAME
+# OPTIONAL: STUCK_TERMINATING_THRESHOLD (minutes past the grace period, default 5)
+# Outputs issues to check_stuck_terminating_pods.json
+# -----------------------------------------------------------------------------
+
+: "${KUBERNETES_DISTRIBUTION_BINARY:=kubectl}"
+: "${CONTEXT:?Must set CONTEXT}"
+: "${NAMESPACE:?Must set NAMESPACE}"
+: "${DEPLOYMENT_NAME:?Must set DEPLOYMENT_NAME}"
+: "${STUCK_TERMINATING_THRESHOLD:=5}"
+
+OUTPUT_FILE="check_stuck_terminating_pods.json"
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck source=k8s-rollout-helpers.sh
+source "${SCRIPT_DIR}/k8s-rollout-helpers.sh"
+
+init_issues_json
+
+echo "Checking stuck terminating pods for deployment ${DEPLOYMENT_NAME} (threshold: ${STUCK_TERMINATING_THRESHOLD}m)"
+
+if ! fetch_deployment_json; then
+    write_issues "$OUTPUT_FILE"
+    exit 0
+fi
+
+get_deployment_pods_json
+threshold_seconds=$(( STUCK_TERMINATING_THRESHOLD * 60 ))
+now_epoch=$(date -u +%s)
+
+terminating_pods=$(echo "$PODS_JSON" | jq '[.items[] | select(.metadata.deletionTimestamp != null)]')
+terminating_count=$(echo "$terminating_pods" | jq 'length')
+
+echo "Found ${terminating_count} pod(s) in Terminating state"
+
+if [[ "$terminating_count" -eq 0 ]]; then
+    write_issues "$OUTPUT_FILE"
+    echo "No terminating pods found."
+    exit 0
+fi
+
+stuck_pods=()
+while IFS= read -r pod_line; do
+    [[ -z "$pod_line" ]] && continue
+    pod_name=$(echo "$pod_line" | jq -r '.metadata.name')
+    deletion_ts=$(echo "$pod_line" | jq -r '.metadata.deletionTimestamp')
+    finalizers=$(echo "$pod_line" | jq -r '.metadata.finalizers // [] | join(", ")')
+    node=$(echo "$pod_line" | jq -r '.spec.nodeName // "unscheduled"')
+    grace=$(echo "$pod_line" | jq -r '.metadata.deletionGracePeriodSeconds // .spec.terminationGracePeriodSeconds // 30')
+    # deletionTimestamp is the deadline (request time + grace period), so the age counts back from it by the grace period.
+    deletion_epoch=$(date -d "$deletion_ts" +%s 2>/dev/null || echo $(( now_epoch + grace )))
+    age_seconds=$(( now_epoch - deletion_epoch + grace ))
+
+    echo "Pod ${pod_name}: terminating for ${age_seconds}s, node=${node}, finalizers=[${finalizers}], grace=${grace}s"
+
+    if [[ "$age_seconds" -ge "$(( threshold_seconds + grace ))" ]]; then
+        stuck_pods+=("${pod_name} (age=${age_seconds}s, node=${node}, finalizers=[${finalizers}])")
+    fi
+done < <(echo "$terminating_pods" | jq -c '.[]')
+
+if [[ ${#stuck_pods[@]} -gt 0 ]]; then
+    stuck_list=$(printf '%s; ' "${stuck_pods[@]}")
+    add_issue "2" \
+        "Stuck Terminating Pods Block Rollout for Deployment \`${DEPLOYMENT_NAME}\` in Namespace \`${NAMESPACE}\`" \
+        "${#stuck_pods[@]} pod(s) exceeded ${STUCK_TERMINATING_THRESHOLD}m terminating threshold: ${stuck_list}" \
+        "Force Delete Pods in k8s-deployment-ops if safe. Check node connectivity and finalizers. Verify kubelet and CNI on affected nodes."
+fi
+
+if [[ "$terminating_count" -gt 0 && ${#stuck_pods[@]} -eq 0 ]]; then
+    add_issue "3" \
+        "Pods Terminating During Rollout for Deployment \`${DEPLOYMENT_NAME}\` in Namespace \`${NAMESPACE}\`" \
+        "${terminating_count} pod(s) are terminating but within ${STUCK_TERMINATING_THRESHOLD}m threshold. Monitor for completion." \
+        "Re-run this check if rollout remains stalled. Compare Deployment ReplicaSets During Rollout."
+fi
+
+write_issues "$OUTPUT_FILE"
+echo "Analysis completed.
Results saved to ${OUTPUT_FILE}"
diff --git a/codebundles/k8s-deployment-rollout-troubleshoot/compare-replicasets-during-rollout.sh b/codebundles/k8s-deployment-rollout-troubleshoot/compare-replicasets-during-rollout.sh
new file mode 100755
index 00000000..214ccbaf
--- /dev/null
+++ b/codebundles/k8s-deployment-rollout-troubleshoot/compare-replicasets-during-rollout.sh
@@ -0,0 +1,100 @@
+#!/usr/bin/env bash
+set -euo pipefail
+set -x
+# -----------------------------------------------------------------------------
+# REQUIRED ENV VARS: CONTEXT, NAMESPACE, DEPLOYMENT_NAME
+# Outputs issues to compare_replicasets_during_rollout.json
+# -----------------------------------------------------------------------------
+
+: "${KUBERNETES_DISTRIBUTION_BINARY:=kubectl}"
+: "${CONTEXT:?Must set CONTEXT}"
+: "${NAMESPACE:?Must set NAMESPACE}"
+: "${DEPLOYMENT_NAME:?Must set DEPLOYMENT_NAME}"
+
+OUTPUT_FILE="compare_replicasets_during_rollout.json"
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck source=k8s-rollout-helpers.sh
+source "${SCRIPT_DIR}/k8s-rollout-helpers.sh"
+
+init_issues_json
+
+echo "Comparing ReplicaSets for deployment ${DEPLOYMENT_NAME} in namespace ${NAMESPACE}"
+
+if ! fetch_deployment_json; then
+    write_issues "$OUTPUT_FILE"
+    exit 0
+fi
+
+fetch_deployment_replicasets_json
+LATEST_RS=$(get_latest_replicaset_name)
+rs_count=$(echo "$REPLICASETS_JSON" | jq 'length')
+
+echo "Found ${rs_count} ReplicaSet(s). Latest: ${LATEST_RS:-none}"
+
+if [[ -z "$LATEST_RS" ]]; then
+    add_issue "3" \
+        "No ReplicaSets Found for Deployment \`${DEPLOYMENT_NAME}\` in Namespace \`${NAMESPACE}\`" \
+        "No ReplicaSets are owned by this deployment." \
+        "Verify deployment exists and has created a ReplicaSet. Inspect Deployment Warning Events."
+    write_issues "$OUTPUT_FILE"
+    exit 0
+fi
+
+latest_rs_replicas=$(echo "$REPLICASETS_JSON" | jq --arg rs "$LATEST_RS" '.[] | select(.metadata.name==$rs) | .status.replicas // 0')
+latest_rs_ready=$(echo "$REPLICASETS_JSON" | jq --arg rs "$LATEST_RS" '.[] | select(.metadata.name==$rs) | .status.readyReplicas // 0')
+desired_replicas=$(echo "$DEPLOYMENT_JSON" | jq '.spec.replicas // 0')
+
+get_deployment_pods_json
+outdated_pods=$(echo "$PODS_JSON" | jq --arg rs "$LATEST_RS" \
+    '[.items[] | select(.metadata.ownerReferences[]? | select(.kind=="ReplicaSet" and .name != $rs)) | .metadata.name] | length')
+
+if [[ "$outdated_pods" -gt 0 ]]; then
+    # Join in jq: tr with a two-character replacement set only emitted "," and left a trailing separator.
+    outdated_names=$(echo "$PODS_JSON" | jq -r --arg rs "$LATEST_RS" \
+        '[.items[] | select(.metadata.ownerReferences[]? | select(.kind=="ReplicaSet" and .name != $rs)) | .metadata.name] | join(", ")')
+    add_issue "2" \
+        "Outdated Pods Not on Latest ReplicaSet for Deployment \`${DEPLOYMENT_NAME}\` in Namespace \`${NAMESPACE}\`" \
+        "${outdated_pods} pod(s) are owned by older ReplicaSets: ${outdated_names}. Latest RS: ${LATEST_RS}." \
+        "Scale Down Stale ReplicaSets in k8s-deployment-ops. Check Stuck Terminating Pods Blocking Rollout."
+fi
+active_old_rs=()
+while IFS= read -r rs_name; do
+    [[ -z "$rs_name" || "$rs_name" == "$LATEST_RS" ]] && continue
+    rs_replicas=$(echo "$REPLICASETS_JSON" | jq --arg rs "$rs_name" '.[] | select(.metadata.name==$rs) | .status.replicas // 0')
+    if [[ "$rs_replicas" -gt 0 ]]; then
+        active_old_rs+=("$rs_name (replicas=${rs_replicas})")
+    fi
+done < <(echo "$REPLICASETS_JSON" | jq -r '.[].metadata.name')
+
+if [[ ${#active_old_rs[@]} -gt 0 ]]; then
+    active_list=$(printf '%s; ' "${active_old_rs[@]}")
+    updated_replicas=$(echo "$DEPLOYMENT_JSON" | jq '.status.updatedReplicas // 0')
+    if [[ "$updated_replicas" -lt "$desired_replicas" ]]; then
+        add_issue "2" \
+            "Conflicting Active ReplicaSets During Rollout for Deployment \`${DEPLOYMENT_NAME}\` in Namespace \`${NAMESPACE}\`" \
+            "Older ReplicaSets still have active replicas during incomplete rollout: ${active_list}. Latest RS ${LATEST_RS} has ${latest_rs_replicas} replicas (${latest_rs_ready} ready)." \
+            "Compare rollout strategy maxUnavailable/maxSurge. Check PodDisruptionBudget Impact. Scale Down Stale ReplicaSets after confirming new pods are healthy."
+    else
+        add_issue "3" \
+            "Multiple Active ReplicaSets for Deployment \`${DEPLOYMENT_NAME}\` in Namespace \`${NAMESPACE}\`" \
+            "Rollout may still be in progress. Active older ReplicaSets: ${active_list}." \
+            "Wait for rollout to complete or inspect blocking events if stalled."
+    fi
+fi
+
+if [[ "$latest_rs_replicas" -eq 0 && "$desired_replicas" -gt 0 ]]; then
+    add_issue "2" \
+        "Latest ReplicaSet Has Zero Replicas for Deployment \`${DEPLOYMENT_NAME}\` in Namespace \`${NAMESPACE}\`" \
+        "Latest ReplicaSet ${LATEST_RS} has 0 replicas while deployment desired=${desired_replicas}. New revision is not receiving traffic." \
+        "Inspect New ReplicaSet Pod Failures. Check Rollout Strategy Configuration and PDB constraints."
+fi
+
+if [[ "$latest_rs_ready" -lt "$latest_rs_replicas" ]]; then
+    add_issue "2" \
+        "Latest ReplicaSet Pods Not Ready for Deployment \`${DEPLOYMENT_NAME}\` in Namespace \`${NAMESPACE}\`" \
+        "Latest RS ${LATEST_RS}: ready=${latest_rs_ready}, replicas=${latest_rs_replicas}." \
+        "Inspect New ReplicaSet Pod Failures for \`${DEPLOYMENT_NAME}\`. Run k8s-app-troubleshoot for application-level failures."
+fi
+
+write_issues "$OUTPUT_FILE"
+echo "Analysis completed. Results saved to ${OUTPUT_FILE}"
diff --git a/codebundles/k8s-deployment-rollout-troubleshoot/detect-rollout-blocking-events.sh b/codebundles/k8s-deployment-rollout-troubleshoot/detect-rollout-blocking-events.sh
new file mode 100755
index 00000000..f7e45e7c
--- /dev/null
+++ b/codebundles/k8s-deployment-rollout-troubleshoot/detect-rollout-blocking-events.sh
@@ -0,0 +1,85 @@
+#!/usr/bin/env bash
+set -euo pipefail
+set -x
+# -----------------------------------------------------------------------------
+# REQUIRED ENV VARS: CONTEXT, NAMESPACE, DEPLOYMENT_NAME
+# OPTIONAL: EVENT_AGE (default 30m)
+# Outputs issues to detect_rollout_blocking_events.json
+# -----------------------------------------------------------------------------
+
+: "${KUBERNETES_DISTRIBUTION_BINARY:=kubectl}"
+: "${CONTEXT:?Must set CONTEXT}"
+: "${NAMESPACE:?Must set NAMESPACE}"
+: "${DEPLOYMENT_NAME:?Must set DEPLOYMENT_NAME}"
+: "${EVENT_AGE:=30m}"
+
+OUTPUT_FILE="detect_rollout_blocking_events.json"
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck source=k8s-rollout-helpers.sh
+source "${SCRIPT_DIR}/k8s-rollout-helpers.sh"
+
+init_issues_json
+
+echo "Detecting rollout blocking events for deployment ${DEPLOYMENT_NAME} (window: ${EVENT_AGE})"
+
+if ! fetch_deployment_json; then
+    write_issues "$OUTPUT_FILE"
+    exit 0
+fi
+
+fetch_deployment_replicasets_json
+LATEST_RS=$(get_latest_replicaset_name)
+# Owner names as a JSON array: the old regex alternation matched substrings, and matched every event when no ReplicaSet existed.
+rs_names_json=$(echo "$REPLICASETS_JSON" | jq -c '[.[].metadata.name]')
+
+EVENTS_JSON=$("${K8S_CMD[@]}" get events -o json 2>/dev/null || echo '{"items":[]}')
+lookback_seconds=$(parse_duration_to_seconds "$EVENT_AGE")
+
+blocking_reasons="FailedScheduling|FailedCreate|ReplicaFailure|ProgressDeadlineExceeded|FailedMount|FailedAttachVolume|Failed|Error|BackOff|ExceededGracePeriod|EvictionThresholdMet|FailedKillPod|FailedPreStopHook|FailedPostStartHook|SandboxChanged|NetworkNotReady|InspectFailed"
+
+filtered_events=$(echo "$EVENTS_JSON" | jq \
+    --argjson rs_names "$rs_names_json" \
+    --arg dep "$DEPLOYMENT_NAME" \
+    --argjson lookback "$lookback_seconds" \
+    --arg reasons "$blocking_reasons" \
+    '[.items[] |
+        select(.type == "Warning" or .type == "Error") |
+        select((.involvedObject.name // "") as $n | $n == $dep or any($rs_names[]; . as $rs | $n == $rs or ($n | startswith($rs + "-")))) |
+        select((.lastTimestamp // .eventTime // .metadata.creationTimestamp // "") as $ts |
+            (try ($ts | sub("\\.[0-9]+Z$"; "Z") | fromdateiso8601) catch 0) >= (now - $lookback)) |
+        select((.reason // "") | test($reasons; "i"))
+    ]')
+
+event_count=$(echo "$filtered_events" | jq 'length')
+echo "Found ${event_count} blocking event(s) in the last ${EVENT_AGE}"
+
+if [[ "$event_count" -gt 0 ]]; then
+    summary=$(echo "$filtered_events" | jq -r '.[:20][] | "\(.lastTimestamp // .eventTime // .metadata.creationTimestamp) [\(.type)/\(.reason)] \(.involvedObject.kind)/\(.involvedObject.name): \(.message)"')
+    add_issue "2" \
+        "Rollout Blocking Events for Deployment \`${DEPLOYMENT_NAME}\` in Namespace \`${NAMESPACE}\`" \
+        "${event_count} Warning/Error event(s) in the last ${EVENT_AGE}:"$'\n'"${summary}" \
+        "Address root cause per event type. For pod failures run Inspect New ReplicaSet Pod Failures. For quota/admission issues check cluster limits."
+fi
+
+# Cluster-level quota/admission hints
+quota_events=$(echo "$EVENTS_JSON" | jq \
+    --arg dep "$DEPLOYMENT_NAME" \
+    --argjson lookback "$lookback_seconds" \
+    '[.items[] |
+        select((.message // "") | test("quota|admission|forbidden|exceeded"; "i")) |
+        select((.involvedObject.name // "") as $n | $n == $dep or ($n | startswith($dep + "-"))) |
+        select((.lastTimestamp // .eventTime // .metadata.creationTimestamp // "") as $ts |
+            (try ($ts | sub("\\.[0-9]+Z$"; "Z") | fromdateiso8601) catch 0) >= (now - $lookback))
+    ]')
+
+quota_count=$(echo "$quota_events" | jq 'length')
+if [[ "$quota_count" -gt 0 ]]; then
+    quota_summary=$(echo "$quota_events" | jq -r '.[:10][] | "\(.reason): \(.message)"')
+    add_issue "3" \
+        "Quota or Admission Failures Affecting Deployment \`${DEPLOYMENT_NAME}\` in Namespace \`${NAMESPACE}\`" \
+        "${quota_summary}" \
+        "Review namespace ResourceQuota and LimitRange. Reduce resource requests or increase quota."
+fi
+
+write_issues "$OUTPUT_FILE"
+echo "Analysis completed. Results saved to ${OUTPUT_FILE}"
diff --git a/codebundles/k8s-deployment-rollout-troubleshoot/fetch-rollout-history.sh b/codebundles/k8s-deployment-rollout-troubleshoot/fetch-rollout-history.sh
new file mode 100755
index 00000000..c35df305
--- /dev/null
+++ b/codebundles/k8s-deployment-rollout-troubleshoot/fetch-rollout-history.sh
@@ -0,0 +1,102 @@
+#!/usr/bin/env bash
+set -euo pipefail
+set -x
+# -----------------------------------------------------------------------------
+# REQUIRED ENV VARS: CONTEXT, NAMESPACE, DEPLOYMENT_NAME
+# Outputs issues to fetch_rollout_history.json
+# -----------------------------------------------------------------------------
+
+: "${KUBERNETES_DISTRIBUTION_BINARY:=kubectl}"
+: "${CONTEXT:?Must set CONTEXT}"
+: "${NAMESPACE:?Must set NAMESPACE}"
+: "${DEPLOYMENT_NAME:?Must set DEPLOYMENT_NAME}"
+
+OUTPUT_FILE="fetch_rollout_history.json"
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck source=k8s-rollout-helpers.sh
+source "${SCRIPT_DIR}/k8s-rollout-helpers.sh"
+
+init_issues_json
+
+echo "Fetching rollout history for deployment ${DEPLOYMENT_NAME}"
+
+if ! fetch_deployment_json; then
+    write_issues "$OUTPUT_FILE"
+    exit 0
+fi
+
+current_revision=$(echo "$DEPLOYMENT_JSON" | jq -r '.metadata.annotations["deployment.kubernetes.io/revision"] // "unknown"')
+echo "Current revision: ${current_revision}"
+
+history_output=$("${K8S_CMD[@]}" rollout history "deployment/${DEPLOYMENT_NAME}" 2>&1 || true)
+echo "Rollout history:"
+echo "$history_output"
+
+fetch_deployment_replicasets_json
+
+revision_summary=$(echo "$REPLICASETS_JSON" | jq -r \
+    '[sort_by((.metadata.annotations["deployment.kubernetes.io/revision"] // "0") | tonumber) | reverse | .[:5][] |
+    "RS \(.metadata.name) rev=\(.metadata.annotations["deployment.kubernetes.io/revision"] // "?") replicas=\(.status.replicas // 0) image=\(.spec.template.spec.containers[0].image // "unknown")"] | join("; ")')
+
+if [[ -z "$revision_summary" ]]; then
+    add_issue "4" \
+        "No Rollout Revision History for Deployment \`${DEPLOYMENT_NAME}\` in Namespace \`${NAMESPACE}\`" \
+        "No ReplicaSets found to summarize revision history." \
+        "Verify deployment has been updated at least once."
+    write_issues "$OUTPUT_FILE"
+    exit 0
+fi
+
+echo "Recent revisions: ${revision_summary}"
+
+# Compare the two highest revisions (by revision annotation, not creation time, so rollbacks are ordered correctly)
+sorted_rs=$(echo "$REPLICASETS_JSON" | jq 'sort_by((.metadata.annotations["deployment.kubernetes.io/revision"] // "0") | tonumber) | reverse')
+latest=$(echo "$sorted_rs" | jq '.[0] // empty')
+previous=$(echo "$sorted_rs" | jq '.[1] // empty')
+
+if [[ -n "$previous" && "$previous" != "null" ]]; then
+    latest_image=$(echo "$latest" | jq -r '.spec.template.spec.containers[0].image // ""')
+    prev_image=$(echo "$previous" | jq -r '.spec.template.spec.containers[0].image // ""')
+    latest_rev=$(echo "$latest" | jq -r '.metadata.annotations["deployment.kubernetes.io/revision"] // "?"')
+    prev_rev=$(echo "$previous" | jq -r '.metadata.annotations["deployment.kubernetes.io/revision"] // "?"')
+
+    changes=()
+    [[ "$latest_image" != "$prev_image" ]] && changes+=("image: ${prev_image} -> ${latest_image}")
+
+    latest_env_count=$(echo "$latest" | jq '.spec.template.spec.containers[0].env // [] | length')
+    prev_env_count=$(echo "$previous" | jq '.spec.template.spec.containers[0].env // [] | length')
+    [[ "$latest_env_count" != "$prev_env_count" ]] && changes+=("env var count: ${prev_env_count} -> ${latest_env_count}")
+
+    latest_probe=$(echo "$latest" | jq -c '.spec.template.spec.containers[0].readinessProbe // {}')
+    prev_probe=$(echo "$previous" | jq -c '.spec.template.spec.containers[0].readinessProbe // {}')
+    [[ "$latest_probe" != "$prev_probe" ]] && changes+=("readinessProbe changed")
+
+    latest_resources=$(echo "$latest" | jq -c '.spec.template.spec.containers[0].resources // {}')
+    prev_resources=$(echo "$previous" | jq -c '.spec.template.spec.containers[0].resources // {}')
+    [[ "$latest_resources" != "$prev_resources" ]] && changes+=("resources changed")
+
+    updated_replicas=$(echo "$DEPLOYMENT_JSON" | jq '.status.updatedReplicas // 0')
+    ready_replicas=$(echo "$DEPLOYMENT_JSON" | jq '.status.readyReplicas // 0')
+    rollout_failing=false
+    if [[ "$updated_replicas" -lt "$(echo "$DEPLOYMENT_JSON" | jq '.status.replicas // 0')" ]] || \
+        [[ "$ready_replicas" -lt "$updated_replicas" ]]; then
+        rollout_failing=true
+    fi
+
+    if [[ ${#changes[@]} -gt 0 && "$rollout_failing" == "true" ]]; then
+        change_list=$(printf '%s; ' "${changes[@]}")
+        add_issue "3" \
+            "Recent Revision Changes May Correlate with Failed Rollout for Deployment \`${DEPLOYMENT_NAME}\` in Namespace \`${NAMESPACE}\`" \
+            "Revision ${prev_rev} -> ${latest_rev} changes: ${change_list}. History:"$'\n'"${history_output}" \
+            "Rollback Deployment to revision ${prev_rev} in k8s-deployment-ops if the change caused the failure. Inspect New ReplicaSet Pod Failures."
+    elif [[ ${#changes[@]} -gt 0 ]]; then
+        change_list=$(printf '%s; ' "${changes[@]}")
+        add_issue "4" \
+            "Recent Template Changes for Deployment \`${DEPLOYMENT_NAME}\` in Namespace \`${NAMESPACE}\`" \
+            "Latest revision ${latest_rev} differs from ${prev_rev}: ${change_list}" \
+            "Use this context when correlating rollout issues with specific changes."
+    fi
+fi
+
+write_issues "$OUTPUT_FILE"
+echo "Analysis completed.
Results saved to ${OUTPUT_FILE}"
diff --git a/codebundles/k8s-deployment-rollout-troubleshoot/inspect-new-replicaset-pod-failures.sh b/codebundles/k8s-deployment-rollout-troubleshoot/inspect-new-replicaset-pod-failures.sh
new file mode 100755
index 00000000..128415dd
--- /dev/null
+++ b/codebundles/k8s-deployment-rollout-troubleshoot/inspect-new-replicaset-pod-failures.sh
@@ -0,0 +1,92 @@
+#!/usr/bin/env bash
+set -euo pipefail
+set -x
+# -----------------------------------------------------------------------------
+# REQUIRED ENV VARS: CONTEXT, NAMESPACE, DEPLOYMENT_NAME
+# Outputs issues to inspect_new_replicaset_pod_failures.json
+# -----------------------------------------------------------------------------
+
+: "${KUBERNETES_DISTRIBUTION_BINARY:=kubectl}"
+: "${CONTEXT:?Must set CONTEXT}"
+: "${NAMESPACE:?Must set NAMESPACE}"
+: "${DEPLOYMENT_NAME:?Must set DEPLOYMENT_NAME}"
+
+OUTPUT_FILE="inspect_new_replicaset_pod_failures.json"
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck source=k8s-rollout-helpers.sh
+source "${SCRIPT_DIR}/k8s-rollout-helpers.sh"
+
+init_issues_json
+
+echo "Inspecting new ReplicaSet pod failures for deployment ${DEPLOYMENT_NAME}"
+
+if ! fetch_deployment_json; then
+    write_issues "$OUTPUT_FILE"
+    exit 0
+fi
+
+fetch_deployment_replicasets_json
+LATEST_RS=$(get_latest_replicaset_name)
+
+if [[ -z "$LATEST_RS" ]]; then
+    add_issue "3" \
+        "No Latest ReplicaSet for Deployment \`${DEPLOYMENT_NAME}\` in Namespace \`${NAMESPACE}\`" \
+        "Cannot inspect new ReplicaSet pods without a ReplicaSet." \
+        "Verify deployment configuration and events."
+    write_issues "$OUTPUT_FILE"
+    exit 0
+fi
+
+get_latest_replicaset_pods_json "$LATEST_RS"
+pod_count=$(echo "$PODS_JSON" | jq '.items | length')
+echo "Latest ReplicaSet ${LATEST_RS} has ${pod_count} pod(s)"
+
+blocking_states=("Pending" "CrashLoopBackOff" "ImagePullBackOff" "ErrImagePull" "CreateContainerConfigError" "CreateContainerError" "RunContainerError" "InvalidImageName")
+
+for state in "${blocking_states[@]}"; do
+    # any(...) yields one boolean per pod, so a pod with several matching containers is listed and counted once.
+    matches=$(echo "$PODS_JSON" | jq --arg state "$state" \
+        '[.items[] | select(.status.phase==$state or any((.status.containerStatuses // [])[], (.status.initContainerStatuses // [])[]; .state.waiting.reason==$state)) | .metadata.name]')
+    count=$(echo "$matches" | jq 'length')
+    if [[ "$count" -gt 0 ]]; then
+        names=$(echo "$matches" | jq -r 'join(", ")')
+        details=""
+        while IFS= read -r pod_name; do
+            [[ -z "$pod_name" ]] && continue
+            pod_detail=$(echo "$PODS_JSON" | jq --arg pod "$pod_name" \
+                '.items[] | select(.metadata.name==$pod) | {phase: .status.phase, conditions: .status.conditions, containerStatuses: .status.containerStatuses}')
+            details="${details}Pod ${pod_name}: ${pod_detail}"$'\n'
+        done < <(echo "$matches" | jq -r '.[]')
+        severity="2"
+        next_steps="Detect Rollout Blocking Events for \`${DEPLOYMENT_NAME}\`."
+        if [[ "$state" == "ImagePullBackOff" || "$state" == "ErrImagePull" || "$state" == "InvalidImageName" ]]; then
+            next_steps="Verify container image name, tag, and registry credentials. Check Rollout History for recent image changes."
+        elif [[ "$state" == "CrashLoopBackOff" ]]; then
+            next_steps="Run k8s-app-troubleshoot for application logs. Consider Rollback Deployment in k8s-deployment-ops."
+        elif [[ "$state" == "Pending" ]]; then
+            next_steps="Detect Rollout Blocking Events for scheduling failures. Check cluster resource quotas and node capacity."
+        elif [[ "$state" == "CreateContainerConfigError" ]]; then
+            next_steps="Verify ConfigMaps, Secrets, and env references in the latest deployment revision."
+        fi
+
+        add_issue "$severity" \
+            "New ReplicaSet Pods in ${state} for Deployment \`${DEPLOYMENT_NAME}\` in Namespace \`${NAMESPACE}\`" \
+            "${count} pod(s) on latest RS ${LATEST_RS} in ${state}: ${names}. ${details}" \
+            "${next_steps}"
+    fi
+done
+
+# Readiness failures after start
+not_ready=$(echo "$PODS_JSON" | jq \
+    '[.items[] | select(.status.phase=="Running") | select(.status.conditions[]? | select(.type=="Ready" and .status=="False")) | .metadata.name]')
+not_ready_count=$(echo "$not_ready" | jq 'length')
+if [[ "$not_ready_count" -gt 0 ]]; then
+    names=$(echo "$not_ready" | jq -r 'join(", ")')
+    add_issue "2" \
+        "New ReplicaSet Pods Failing Readiness for Deployment \`${DEPLOYMENT_NAME}\` in Namespace \`${NAMESPACE}\`" \
+        "${not_ready_count} running pod(s) on latest RS ${LATEST_RS} are not Ready: ${names}." \
+        "Run k8s-deployment-healthcheck probe validation tasks. Run k8s-app-troubleshoot for deeper log analysis."
+fi
+
+write_issues "$OUTPUT_FILE"
+echo "Analysis completed. Results saved to ${OUTPUT_FILE}"
diff --git a/codebundles/k8s-deployment-rollout-troubleshoot/k8s-rollout-helpers.sh b/codebundles/k8s-deployment-rollout-troubleshoot/k8s-rollout-helpers.sh
new file mode 100755
index 00000000..d462c30a
--- /dev/null
+++ b/codebundles/k8s-deployment-rollout-troubleshoot/k8s-rollout-helpers.sh
@@ -0,0 +1,108 @@
+#!/usr/bin/env bash
+# Shared helpers for k8s-deployment-rollout-troubleshoot scripts.
+# Source this file; do not execute directly.
+
+: "${KUBERNETES_DISTRIBUTION_BINARY:=kubectl}"
+: "${CONTEXT:?Must set CONTEXT}"
+: "${NAMESPACE:?Must set NAMESPACE}"
+: "${DEPLOYMENT_NAME:?Must set DEPLOYMENT_NAME}"
+
+K8S_CMD=( "${KUBERNETES_DISTRIBUTION_BINARY}" --context "${CONTEXT}" -n "${NAMESPACE}" )
+
+# Reset the in-memory issue list (a JSON array held in $issues_json).
+init_issues_json() {
+    issues_json='[]'
+}
+
+# Append one issue to $issues_json. Args: severity (numeric string), title, details, next_steps.
+add_issue() {
+    local severity="$1"
+    local title="$2"
+    local details="$3"
+    local next_steps="$4"
+    issues_json=$(echo "$issues_json" | jq \
+        --arg title "$title" \
+        --arg details "$details" \
+        --arg severity "$severity" \
+        --arg next_steps "$next_steps" \
+        '. += [{
+            "title": $title,
+            "details": $details,
+            "severity": ($severity | tonumber),
+            "next_steps": $next_steps
+        }]')
+}
+
+# Write the accumulated issues array to the file named by $1.
+write_issues() {
+    local output_file="$1"
+    echo "$issues_json" > "$output_file"
+}
+
+# Load the deployment into $DEPLOYMENT_JSON; on failure record a severity-4 issue and return 1.
+fetch_deployment_json() {
+    if ! DEPLOYMENT_JSON=$("${K8S_CMD[@]}" get deployment "${DEPLOYMENT_NAME}" -o json 2>deployment_err.log); then
+        local err_msg
+        err_msg=$(cat deployment_err.log)
+        rm -f deployment_err.log
+        add_issue "4" \
+            "Cannot Access Deployment \`${DEPLOYMENT_NAME}\` in Namespace \`${NAMESPACE}\`" \
+            "Failed to fetch deployment: ${err_msg}" \
+            "Verify kubeconfig RBAC permissions and that deployment ${DEPLOYMENT_NAME} exists in namespace ${NAMESPACE}."
+        return 1
+    fi
+    rm -f deployment_err.log
+    return 0
+}
+
+# Load the ReplicaSets owned by the deployment into $REPLICASETS_JSON (a JSON array).
+fetch_deployment_replicasets_json() {
+    REPLICASETS_JSON=$("${K8S_CMD[@]}" get rs -o json | jq --arg DEPLOYMENT_NAME "$DEPLOYMENT_NAME" \
+        '[.items[] | select(.metadata.ownerReferences[]? | select(.kind == "Deployment" and .name == $DEPLOYMENT_NAME))]')
+}
+
+# Print the name of the ReplicaSet with the highest revision annotation (nothing if there are none).
+# A rollback re-uses an older ReplicaSet and bumps its revision, so creationTimestamp is not a reliable order.
+get_latest_replicaset_name() {
+    echo "$REPLICASETS_JSON" | jq -r 'sort_by((.metadata.annotations["deployment.kubernetes.io/revision"] // "0") | tonumber) | last(.[]?) | .metadata.name // empty'
+}
+
+# Print the deployment's matchLabels as a comma-separated selector; empty when matchLabels is absent.
+get_deployment_selector() {
+    echo "$DEPLOYMENT_JSON" | jq -r '(.spec.selector.matchLabels // {}) | to_entries | map("\(.key)=\(.value)") | join(",")'
+}
+
+# Load the pods matching the deployment selector into $PODS_JSON (empty list on error or empty selector).
+get_deployment_pods_json() {
+    local selector
+    selector=$(get_deployment_selector)
+    if [[ -z "$selector" ]]; then
+        PODS_JSON='{"items":[]}'
+        return
+    fi
+    PODS_JSON=$("${K8S_CMD[@]}" get pods -l "$selector" -o json 2>/dev/null || echo '{"items":[]}')
+}
+
+# Load the pods owned by the ReplicaSet named $1 into $PODS_JSON.
+get_latest_replicaset_pods_json() {
+    local latest_rs="$1"
+    PODS_JSON=$("${K8S_CMD[@]}" get pods -o json | jq --arg rs "$latest_rs" \
+        '[.items[] | select(.metadata.ownerReferences[]? | select(.kind == "ReplicaSet" and .name == $rs))] | {items: .}')
+}
+
+# Convert a duration (Nm, Nh, Ns, or bare seconds) to seconds; unrecognised input yields 1800.
+# 10# forces base 10 so values with leading zeros (e.g. 08m) are neither octal errors nor invalid JSON numbers.
+parse_duration_to_seconds() {
+    local value="$1"
+    if [[ "$value" =~ ^([0-9]+)m$ ]]; then
+        echo $(( 10#${BASH_REMATCH[1]} * 60 ))
+    elif [[ "$value" =~ ^([0-9]+)h$ ]]; then
+        echo $(( 10#${BASH_REMATCH[1]} * 3600 ))
+    elif [[ "$value" =~ ^([0-9]+)s$ ]]; then
+        echo $(( 10#${BASH_REMATCH[1]} ))
+    elif [[ "$value" =~ ^[0-9]+$ ]]; then
+        echo $(( 10#$value ))
+    else
+        echo "1800"
+    fi
+}
diff --git a/codebundles/k8s-deployment-rollout-troubleshoot/runbook.robot b/codebundles/k8s-deployment-rollout-troubleshoot/runbook.robot
new file mode 100644
index 00000000..cc161f53
--- /dev/null
+++ b/codebundles/k8s-deployment-rollout-troubleshoot/runbook.robot
@@ -0,0 +1,395 @@
+*** Settings ***
+Documentation    Read-only diagnostics for Kubernetes Deployments whose rolling updates are stuck, slow, or failing.
+Metadata    Author    rw-codebundle-agent
+Metadata    Display Name    Kubernetes Deployment Rollout Troubleshoot
+Metadata    Supports    Kubernetes Deployment Rollout Troubleshoot ReadOnly
+Force Tags    Kubernetes    Deployment    Rollout    Troubleshoot
+
+Library    String
+Library    BuiltIn
+Library    RW.Core
+Library    RW.CLI
+Library    RW.platform
+
+Suite Setup    Suite Initialization
+
+
+*** Tasks ***
+Check Deployment Rollout Status for `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}`
+    [Documentation]    Evaluates rollout progress via deployment status fields and kubectl rollout status, detecting ProgressDeadlineExceeded and replica count mismatches.
+    [Tags]    Kubernetes    Deployment    Rollout    Status    access:read-only    data:config
+
+    ${result}=    RW.CLI.Run Bash File
+    ...    bash_file=check-rollout-status.sh
+    ...    env=${env}
+    ...    secret_file__kubeconfig=${kubeconfig}
+    ...    timeout_seconds=180
+    ...    include_in_history=false
+    ...    show_in_rwl_cheatsheet=true
+    ...    cmd_override=./check-rollout-status.sh
+
+    ${issues}=    RW.CLI.Run Cli
+    ...    cmd=cat check_rollout_status.json
+    ...    env=${env}
+    # Parse the issue file by value; if it is not valid JSON, log a warning and continue with no issues.
+    TRY
+        ${issue_list}=    Evaluate    json.loads($issues.stdout)    json
+    EXCEPT
+        Log    Failed to parse JSON for rollout status task, defaulting to empty list.    WARN
+        ${issue_list}=    Create List
+    END
+
+    IF    len($issue_list) > 0
+        FOR    ${issue}    IN    @{issue_list}
+            RW.Core.Add Issue
+            ...    severity=${issue['severity']}
+            ...    expected=Deployment `${DEPLOYMENT_NAME}` rollout should complete with updated, available, and ready replicas aligned
+            ...    actual=${issue['title']}
+            ...    title=${issue['title']}
+            ...    reproduce_hint=${result.cmd}
+            ...    details=${issue['details']}
+            ...    next_steps=${issue['next_steps']}
+        END
+    END
+
+    RW.Core.Add Pre To Report    Rollout Status Analysis:
+    RW.Core.Add Pre To Report    ${result.stdout}
+
+Compare Deployment ReplicaSets During Rollout for `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}`
+    [Documentation]    Compares the latest ReplicaSet against older ReplicaSets and flags conflicting active ReplicaSets or outdated pods blocking rollout completion.
+    [Tags]    Kubernetes    Deployment    ReplicaSet    Rollout    access:read-only    data:config
+
+    ${result}=    RW.CLI.Run Bash File
+    ...    bash_file=compare-replicasets-during-rollout.sh
+    ...    env=${env}
+    ...    secret_file__kubeconfig=${kubeconfig}
+    ...    timeout_seconds=180
+    ...    include_in_history=false
+    ...    show_in_rwl_cheatsheet=true
+    ...    cmd_override=./compare-replicasets-during-rollout.sh
+
+    ${issues}=    RW.CLI.Run Cli
+    ...    cmd=cat compare_replicasets_during_rollout.json
+    ...    env=${env}
+    # Parse the issue file by value; if it is not valid JSON, log a warning and continue with no issues.
+    TRY
+        ${issue_list}=    Evaluate    json.loads($issues.stdout)    json
+    EXCEPT
+        Log    Failed to parse JSON for ReplicaSet comparison task, defaulting to empty list.    WARN
+        ${issue_list}=    Create List
+    END
+
+    IF    len($issue_list) > 0
+        FOR    ${issue}    IN    @{issue_list}
+            RW.Core.Add Issue
+            ...    severity=${issue['severity']}
+            ...    expected=Only the latest ReplicaSet should serve traffic after rollout completes
+            ...    actual=${issue['title']}
+            ...    title=${issue['title']}
+            ...    reproduce_hint=${result.cmd}
+            ...    details=${issue['details']}
+            ...    next_steps=${issue['next_steps']}
+        END
+    END
+
+    RW.Core.Add Pre To Report    ReplicaSet Comparison Analysis:
+    RW.Core.Add Pre To Report    ${result.stdout}
+
+Inspect New ReplicaSet Pod Failures for `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}`
+    [Documentation]    Focuses on pods owned by the latest ReplicaSet that block rollout completion due to scheduling, image pull, crash, or readiness failures.
+    [Tags]    Kubernetes    Deployment    Pods    Failures    access:read-only    data:logs-config
+
+    ${result}=    RW.CLI.Run Bash File
+    ...    bash_file=inspect-new-replicaset-pod-failures.sh
+    ...    env=${env}
+    ...    secret_file__kubeconfig=${kubeconfig}
+    ...    timeout_seconds=180
+    ...    include_in_history=false
+    ...    show_in_rwl_cheatsheet=true
+    ...    cmd_override=./inspect-new-replicaset-pod-failures.sh
+
+    ${issues}=    RW.CLI.Run Cli
+    ...    cmd=cat inspect_new_replicaset_pod_failures.json
+    ...    env=${env}
+    # Parse the issue file by value; if it is not valid JSON, log a warning and continue with no issues.
+    TRY
+        ${issue_list}=    Evaluate    json.loads($issues.stdout)    json
+    EXCEPT
+        Log    Failed to parse JSON for new ReplicaSet pod failures task, defaulting to empty list.    WARN
+        ${issue_list}=    Create List
+    END
+
+    IF    len($issue_list) > 0
+        FOR    ${issue}    IN    @{issue_list}
+            RW.Core.Add Issue
+            ...    severity=${issue['severity']}
+            ...    expected=New ReplicaSet pods should start and become Ready
+            ...    actual=${issue['title']}
+            ...    title=${issue['title']}
+            ...    reproduce_hint=${result.cmd}
+            ...    details=${issue['details']}
+            ...    next_steps=${issue['next_steps']}
+        END
+    END
+
+    RW.Core.Add Pre To Report    New ReplicaSet Pod Failure Analysis:
+    RW.Core.Add Pre To Report    ${result.stdout}
+
+Check Rollout Strategy Configuration for `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}`
+    [Documentation]    Reviews deployment update strategy, maxUnavailable, maxSurge, progressDeadlineSeconds, revisionHistoryLimit, and paused state for rollout-stalling configurations.
+    [Tags]    Kubernetes    Deployment    Strategy    Configuration    access:read-only    data:config
+
+    ${result}=    RW.CLI.Run Bash File
+    ...    bash_file=check-rollout-strategy-config.sh
+    ...    env=${env}
+    ...    secret_file__kubeconfig=${kubeconfig}
+    ...    timeout_seconds=180
+    ...    include_in_history=false
+    ...    show_in_rwl_cheatsheet=true
+    ...    cmd_override=./check-rollout-strategy-config.sh
+
+    ${issues}=    RW.CLI.Run Cli
+    ...    cmd=cat check_rollout_strategy_config.json
+    ...    env=${env}
+    # Parse the issue file by value; if it is not valid JSON, log a warning and continue with no issues.
+    TRY
+        ${issue_list}=    Evaluate    json.loads($issues.stdout)    json
+    EXCEPT
+        Log    Failed to parse JSON for rollout strategy config task, defaulting to empty list.    WARN
+        ${issue_list}=    Create List
+    END
+
+    IF    len($issue_list) > 0
+        FOR    ${issue}    IN    @{issue_list}
+            RW.Core.Add Issue
+            ...    severity=${issue['severity']}
+            ...    expected=Rollout strategy should allow safe, timely pod replacement
+            ...    actual=${issue['title']}
+            ...    title=${issue['title']}
+            ...    reproduce_hint=${result.cmd}
+            ...    details=${issue['details']}
+            ...    next_steps=${issue['next_steps']}
+        END
+    END
+
+    RW.Core.Add Pre To Report    Rollout Strategy Configuration Analysis:
+    RW.Core.Add Pre To Report    ${result.stdout}
+
+Check PodDisruptionBudget Impact on Rollout for `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}`
+    [Documentation]    Finds PDBs whose selectors match the deployment and evaluates whether minAvailable or maxUnavailable constraints block rollout eviction or scheduling.
+    [Tags]    Kubernetes    Deployment    PDB    Rollout    access:read-only    data:config
+
+    ${result}=    RW.CLI.Run Bash File
+    ...    bash_file=check-pdb-rollout-impact.sh
+    ...    env=${env}
+    ...    secret_file__kubeconfig=${kubeconfig}
+    ...    timeout_seconds=180
+    ...    include_in_history=false
+    ...    show_in_rwl_cheatsheet=true
+    ...    cmd_override=./check-pdb-rollout-impact.sh
+
+    ${issues}=    RW.CLI.Run Cli
+    ...    cmd=cat check_pdb_rollout_impact.json
+    ...    env=${env}
+    # Parse the issue file by value; if it is not valid JSON, log a warning and continue with no issues.
+    TRY
+        ${issue_list}=    Evaluate    json.loads($issues.stdout)    json
+    EXCEPT
+        Log    Failed to parse JSON for PDB rollout impact task, defaulting to empty list.    WARN
+        ${issue_list}=    Create List
+    END
+
+    IF    len($issue_list) > 0
+        FOR    ${issue}    IN    @{issue_list}
+            RW.Core.Add Issue
+            ...    severity=${issue['severity']}
+            ...    expected=PDB constraints should not prevent necessary pod disruption during rollout
+            ...    actual=${issue['title']}
+            ...    title=${issue['title']}
+            ...    reproduce_hint=${result.cmd}
+            ...    details=${issue['details']}
+            ...    next_steps=${issue['next_steps']}
+        END
+    END
+
+    RW.Core.Add Pre To Report    PDB Rollout Impact Analysis:
+    RW.Core.Add Pre To Report    ${result.stdout}
+
+Detect Rollout Blocking Events for `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}`
+    [Documentation]    Surfaces recent Warning and Error events on the deployment, its ReplicaSets, and rollout pods within the configured time window.
+    [Tags]    Kubernetes    Deployment    Events    Rollout    access:read-only    data:logs-config
+
+    ${result}=    RW.CLI.Run Bash File
+    ...    bash_file=detect-rollout-blocking-events.sh
+    ...    env=${env}
+    ...    secret_file__kubeconfig=${kubeconfig}
+    ...    timeout_seconds=180
+    ...    include_in_history=false
+    ...    show_in_rwl_cheatsheet=true
+    ...    cmd_override=./detect-rollout-blocking-events.sh
+
+    ${issues}=    RW.CLI.Run Cli
+    ...    cmd=cat detect_rollout_blocking_events.json
+    ...    env=${env}
+    # Parse the issue file by value; if it is not valid JSON, log a warning and continue with no issues.
+    TRY
+        ${issue_list}=    Evaluate    json.loads($issues.stdout)    json
+    EXCEPT
+        Log    Failed to parse JSON for rollout blocking events task, defaulting to empty list.    WARN
+        ${issue_list}=    Create List
+    END
+
+    IF    len($issue_list) > 0
+        FOR    ${issue}    IN    @{issue_list}
+            RW.Core.Add Issue
+            ...    severity=${issue['severity']}
+            ...    expected=No blocking Warning or Error events during rollout
+            ...    actual=${issue['title']}
+            ...    title=${issue['title']}
+            ...    reproduce_hint=${result.cmd}
+            ...    details=${issue['details']}
+            ...    next_steps=${issue['next_steps']}
+        END
+    END
+
+    RW.Core.Add Pre To Report    Rollout Blocking Events Analysis:
+    RW.Core.Add Pre To Report    ${result.stdout}
+
+Check Stuck Terminating Pods Blocking Rollout for `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}`
+    [Documentation]    Identifies deployment pods stuck in Terminating state that prevent old ReplicaSet scale-down and block rollout completion.
+    [Tags]    Kubernetes    Deployment    Pods    Terminating    access:read-only    data:config
+
+    ${result}=    RW.CLI.Run Bash File
+    ...    bash_file=check-stuck-terminating-pods.sh
+    ...    env=${env}
+    ...    secret_file__kubeconfig=${kubeconfig}
+    ...    timeout_seconds=180
+    ...    include_in_history=false
+    ...    show_in_rwl_cheatsheet=true
+    ...    cmd_override=./check-stuck-terminating-pods.sh
+
+    ${issues}=    RW.CLI.Run Cli
+    ...    cmd=cat check_stuck_terminating_pods.json
+    ...    env=${env}
+    # Parse the issue file by value; if it is not valid JSON, log a warning and continue with no issues.
+    TRY
+        ${issue_list}=    Evaluate    json.loads($issues.stdout)    json
+    EXCEPT
+        Log    Failed to parse JSON for stuck terminating pods task, defaulting to empty list.    WARN
+        ${issue_list}=    Create List
+    END
+
+    IF    len($issue_list) > 0
+        FOR    ${issue}    IN    @{issue_list}
+            RW.Core.Add Issue
+            ...    severity=${issue['severity']}
+            ...    expected=Old ReplicaSet pods should terminate promptly during rollout
+            ...    actual=${issue['title']}
+            ...    title=${issue['title']}
+            ...    reproduce_hint=${result.cmd}
+            ...    details=${issue['details']}
+            ...    next_steps=${issue['next_steps']}
+        END
+    END
+
+    RW.Core.Add Pre To Report    Stuck Terminating Pods Analysis:
+    RW.Core.Add Pre To Report    ${result.stdout}
+
+Fetch Rollout History for `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}`
+    [Documentation]    Retrieves rollout revision history and summarizes recent template changes to correlate failed rollouts with specific revisions.
+    [Tags]    Kubernetes    Deployment    History    Revision    access:read-only    data:config
+
+    ${result}=    RW.CLI.Run Bash File
+    ...    bash_file=fetch-rollout-history.sh
+    ...    env=${env}
+    ...    secret_file__kubeconfig=${kubeconfig}
+    ...    timeout_seconds=180
+    ...    include_in_history=false
+    ...    show_in_rwl_cheatsheet=true
+    ...    cmd_override=./fetch-rollout-history.sh
+
+    ${issues}=    RW.CLI.Run Cli
+    ...    cmd=cat fetch_rollout_history.json
+    ...    env=${env}
+    # Parse the issue file by value; if it is not valid JSON, log a warning and continue with no issues.
+    TRY
+        ${issue_list}=    Evaluate    json.loads($issues.stdout)    json
+    EXCEPT
+        Log    Failed to parse JSON for rollout history task, defaulting to empty list.    WARN
+        ${issue_list}=    Create List
+    END
+
+    IF    len($issue_list) > 0
+        FOR    ${issue}    IN    @{issue_list}
+            RW.Core.Add Issue
+            ...    severity=${issue['severity']}
+            ...    expected=Recent revisions should not introduce breaking template changes during rollout
+            ...    actual=${issue['title']}
+            ...    title=${issue['title']}
+            ...    reproduce_hint=${result.cmd}
+            ...    details=${issue['details']}
+            ...    next_steps=${issue['next_steps']}
+        END
+    END
+
+    RW.Core.Add Pre To Report    Rollout History Analysis:
+    RW.Core.Add Pre To Report    ${result.stdout}
+
+
+*** Keywords ***
+Suite Initialization
+    ${kubeconfig}=    RW.Core.Import Secret    kubeconfig
+    ...    type=string
+    ...    description=The kubernetes kubeconfig yaml containing connection configuration used to connect to cluster(s).
+    ...    pattern=\w*
+    ${CONTEXT}=    RW.Core.Import User Variable    CONTEXT
+    ...    type=string
+    ...    description=Kubernetes context to operate within.
+    ...    pattern=\w*
+    ${NAMESPACE}=    RW.Core.Import User Variable    NAMESPACE
+    ...    type=string
+    ...    description=Namespace containing the deployment.
+    ...    pattern=\w*
+    ${DEPLOYMENT_NAME}=    RW.Core.Import User Variable    DEPLOYMENT_NAME
+    ...    type=string
+    ...    description=Name of the deployment to troubleshoot.
+    ...    pattern=\w*
+    ${KUBERNETES_DISTRIBUTION_BINARY}=    RW.Core.Import User Variable    KUBERNETES_DISTRIBUTION_BINARY
+    ...    type=string
+    ...    description=Kubernetes CLI binary (kubectl or oc).
+    ...    enum=[kubectl,oc]
+    ...    default=kubectl
+    ${EVENT_AGE}=    RW.Core.Import User Variable    EVENT_AGE
+    ...    type=string
+    ...    description=Lookback window for rollout-related events (e.g. 30m, 1h).
+    ...    pattern=\w*
+    ...    default=30m
+    ${ROLLOUT_STATUS_TIMEOUT}=    RW.Core.Import User Variable    ROLLOUT_STATUS_TIMEOUT
+    ...    type=string
+    ...    description=Seconds to wait when sampling rollout status (non-blocking sample).
+    ...    pattern=^\d+$
+    ...    default=30
+    ${STUCK_TERMINATING_THRESHOLD}=    RW.Core.Import User Variable    STUCK_TERMINATING_THRESHOLD
+    ...    type=string
+    ...    description=Minutes a pod may remain Terminating before raising an issue.
+    ...    pattern=^\d+$
+    ...    default=5
+    # Publish the imported values as suite variables so the tasks above can reference them.
+    Set Suite Variable    ${kubeconfig}    ${kubeconfig}
+    Set Suite Variable    ${CONTEXT}    ${CONTEXT}
+    Set Suite Variable    ${NAMESPACE}    ${NAMESPACE}
+    Set Suite Variable    ${DEPLOYMENT_NAME}    ${DEPLOYMENT_NAME}
+    Set Suite Variable    ${KUBERNETES_DISTRIBUTION_BINARY}    ${KUBERNETES_DISTRIBUTION_BINARY}
+    Set Suite Variable    ${EVENT_AGE}    ${EVENT_AGE}
+    Set Suite Variable    ${ROLLOUT_STATUS_TIMEOUT}    ${ROLLOUT_STATUS_TIMEOUT}
+    Set Suite Variable    ${STUCK_TERMINATING_THRESHOLD}    ${STUCK_TERMINATING_THRESHOLD}
+    # Environment dictionary handed to every script and CLI call through the env argument.
+    ${env_dict}=    Create Dictionary
+    ...    CONTEXT=${CONTEXT}
+    ...    NAMESPACE=${NAMESPACE}
+    ...    DEPLOYMENT_NAME=${DEPLOYMENT_NAME}
+    ...    KUBERNETES_DISTRIBUTION_BINARY=${KUBERNETES_DISTRIBUTION_BINARY}
+    ...    EVENT_AGE=${EVENT_AGE}
+    ...    ROLLOUT_STATUS_TIMEOUT=${ROLLOUT_STATUS_TIMEOUT}
+    ...    STUCK_TERMINATING_THRESHOLD=${STUCK_TERMINATING_THRESHOLD}
+    Set Suite Variable    ${env}    ${env_dict}