# Source: GitHub Actions run page for
# "Merge pull request #28 from test-zeus-ai/helm-updates" (run #31).
#
# Workflow file for this run:

# Workflow header: triggers, manual-run inputs, token permissions and the
# environment variables shared by every job.
name: Helm - GKE workflow

on:
  # Automatic runs: pushes to main-prod that touch this workflow, the Helm
  # chart, or one of the listed package/build paths.
  push:
    branches:
      - main-prod
    paths:
      - .github/workflows/deploy.yaml
      - helm/**
      - package.json
      - package-lock.json
      - packages/injected/**
      - packages/playwright-core/**
      - packages/protocol/**
      - packages/trace/**
      - packages/trace-viewer/**
      - packages/web/**
      - utils/build/**

  # Manual runs: the operator picks the target environment and extras.
  workflow_dispatch:
    inputs:
      # Either this flag or target_env == main-prod selects production.
      deploy_prod:
        description: "Deploy to production"
        required: true
        type: boolean
        default: false
      target_env:
        description: "Deploy environment (dev/dev4/dev5/main-prod)"
        required: false
        default: "dev"
        type: choice
        options:
          - dev
          - dev4
          - dev5
          - main-prod
      # Gates the "Force ExternalSecret sync" step in the deploy jobs.
      refresh_secrets:
        description: "Force refresh ExternalSecrets before deploy, use only after secret changes or rotation"
        required: false
        default: false
        type: boolean
      # Git ref consumed by the checkout steps of manual runs.
      branch:
        description: "Branch to deploy (for dev/dev4/dev5)"
        required: false
        default: "main-prod"

# id-token: write is required by the Workload Identity Federation auth steps.
permissions:
  contents: read
  id-token: write

# Shared by all jobs; referenced both as ${{ env.X }} and as shell variables.
env:
  SERVICE_NAME: traceviewer
  APP_DIR: .
  DOCKERFILE: packages/trace-viewer/Dockerfile
  CHART_DIR: helm
  GAR_LOCATION: ${{ vars.GAR_LOCATION }}
jobs:
  # ---------------------------------------------------------------------------
  # build: builds the container image and pushes it to Artifact Registry.
  # Outputs image_repo / image_tag for the downstream Helm jobs.
  # ---------------------------------------------------------------------------
  build:
    runs-on: ubuntu-latest
    permissions:
      contents: read
      id-token: write
    outputs:
      image_tag: ${{ steps.meta.outputs.image_tag }}
      image_repo: ${{ steps.meta.outputs.image_repo }}
    steps:
      - uses: actions/checkout@v4
        with:
          # The `branch` input is documented as dev/dev4/dev5 only. Production
          # builds always use the triggering ref so the image matches the chart
          # that deploy-prod checks out.
          ref: ${{ github.event_name == 'workflow_dispatch' && !(github.event.inputs.deploy_prod == 'true' || github.event.inputs.target_env == 'main-prod') && inputs.branch || github.ref }}

      - name: Authenticate to Google Cloud
        id: auth
        uses: google-github-actions/auth@v2
        with:
          # Prod identity only for manual prod runs; everything else uses dev.
          workload_identity_provider: ${{ github.event_name == 'workflow_dispatch' && (github.event.inputs.deploy_prod == 'true' || github.event.inputs.target_env == 'main-prod') && secrets.GCP_WIF_PROVIDER_PROD || secrets.GCP_WIF_PROVIDER_DEV }}
          service_account: ${{ github.event_name == 'workflow_dispatch' && (github.event.inputs.deploy_prod == 'true' || github.event.inputs.target_env == 'main-prod') && secrets.GCP_SA_PROD || secrets.GCP_SA_DEV }}
          token_format: access_token

      - name: Setup gcloud
        uses: google-github-actions/setup-gcloud@v2

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
        with:
          driver: docker-container

      - name: Login to GAR
        uses: docker/login-action@v3
        with:
          registry: ${{ env.GAR_LOCATION }}-docker.pkg.dev
          username: oauth2accesstoken
          password: ${{ steps.auth.outputs.access_token }}

      - name: Build image metadata
        id: meta
        run: |
          # Tag = commit SHA + run attempt + UTC timestamp, so re-runs never
          # overwrite an earlier image.
          TS="$(date -u +%Y%m%d%H%M%S)"
          IMAGE_TAG="${GITHUB_SHA}-${GITHUB_RUN_ATTEMPT}-$TS"
          PROJECT_ID="${{ github.event_name == 'workflow_dispatch' && (github.event.inputs.deploy_prod == 'true' || github.event.inputs.target_env == 'main-prod') && vars.GCP_PROJECT_ID_PROD || vars.GCP_PROJECT_ID_DEV }}"
          GAR_REPO="${{ github.event_name == 'workflow_dispatch' && (github.event.inputs.deploy_prod == 'true' || github.event.inputs.target_env == 'main-prod') && vars.GAR_DOCKER_REPOSITORY_PROD || vars.GAR_DOCKER_REPOSITORY_DEV }}"
          IMAGE_REPO="${GAR_LOCATION}-docker.pkg.dev/${PROJECT_ID}/${GAR_REPO}/${SERVICE_NAME}"
          echo "image_tag=${IMAGE_TAG}" >>"${GITHUB_OUTPUT}"
          echo "image_repo=${IMAGE_REPO}" >>"${GITHUB_OUTPUT}"

      - name: Build and push image
        uses: docker/build-push-action@v6
        with:
          context: ${{ env.APP_DIR }}
          file: ${{ env.DOCKERFILE }}
          push: true
          tags: ${{ steps.meta.outputs.image_repo }}:${{ steps.meta.outputs.image_tag }}
          build-args: |
            GIT_SHA=${{ github.sha }}
            GIT_REF=${{ github.ref_name }}
            BUILD_TIME=${{ steps.meta.outputs.image_tag }}
          cache-from: type=gha
          cache-to: type=gha,mode=max

  # ---------------------------------------------------------------------------
  # helm4-compat: lints and renders the chart with Helm 4 before any deploy.
  # Both deploy jobs depend on it.
  # ---------------------------------------------------------------------------
  helm4-compat:
    if: github.ref == 'refs/heads/main-prod' && (github.event_name == 'push' || github.event_name == 'workflow_dispatch')
    needs: build
    runs-on: ubuntu-latest
    env:
      TARGET_ENV: ${{ github.event_name == 'workflow_dispatch' && (github.event.inputs.deploy_prod == 'true' || github.event.inputs.target_env == 'main-prod') && 'prod' || (github.event_name == 'workflow_dispatch' && github.event.inputs.target_env != '' && github.event.inputs.target_env) || 'dev' }}
      VALUES_FILE: ${{ github.event_name == 'workflow_dispatch' && (github.event.inputs.deploy_prod == 'true' || github.event.inputs.target_env == 'main-prod') && 'values-prod.yaml' || (github.event_name == 'workflow_dispatch' && github.event.inputs.target_env != '' && format('values-{0}.yaml', github.event.inputs.target_env)) || 'values-dev.yaml' }}
      DEPLOY_NAMESPACE: ${{ github.event_name == 'workflow_dispatch' && (github.event.inputs.deploy_prod == 'true' || github.event.inputs.target_env == 'main-prod') && 'testzeus-prod' || (github.event_name == 'workflow_dispatch' && github.event.inputs.target_env != '' && format('testzeus-{0}', github.event.inputs.target_env)) || 'testzeus-dev' }}
      GKE_CLUSTER: ${{ github.event_name == 'workflow_dispatch' && (github.event.inputs.deploy_prod == 'true' || github.event.inputs.target_env == 'main-prod') && vars.GKE_PROD_CLUSTER || vars.GKE_DEV_CLUSTER }}
      GKE_LOCATION: ${{ github.event_name == 'workflow_dispatch' && (github.event.inputs.deploy_prod == 'true' || github.event.inputs.target_env == 'main-prod') && vars.GKE_PROD_LOCATION || vars.GKE_DEV_LOCATION }}
      GCP_PROJECT_ID: ${{ github.event_name == 'workflow_dispatch' && (github.event.inputs.deploy_prod == 'true' || github.event.inputs.target_env == 'main-prod') && vars.GCP_PROJECT_ID_PROD || vars.GCP_PROJECT_ID_DEV }}
      GAR_HELM_REPOSITORY: ${{ github.event_name == 'workflow_dispatch' && (github.event.inputs.deploy_prod == 'true' || github.event.inputs.target_env == 'main-prod') && vars.GAR_HELM_REPOSITORY_PROD || vars.GAR_HELM_REPOSITORY_DEV }}
      GCP_SA: ${{ github.event_name == 'workflow_dispatch' && (github.event.inputs.deploy_prod == 'true' || github.event.inputs.target_env == 'main-prod') && secrets.GCP_SA_PROD || secrets.GCP_SA_DEV }}
      GCP_WIF_PROVIDER: ${{ github.event_name == 'workflow_dispatch' && (github.event.inputs.deploy_prod == 'true' || github.event.inputs.target_env == 'main-prod') && secrets.GCP_WIF_PROVIDER_PROD || secrets.GCP_WIF_PROVIDER_DEV }}
    steps:
      - uses: actions/checkout@v4
        with:
          # Validate the same ref that will be deployed: the `branch` input for
          # manual dev runs, the triggering ref otherwise.
          ref: ${{ github.event_name == 'workflow_dispatch' && !(github.event.inputs.deploy_prod == 'true' || github.event.inputs.target_env == 'main-prod') && inputs.branch || github.ref }}

      - name: Set up Helm 4 (compat check)
        uses: azure/setup-helm@v4
        with:
          version: v4.0.0

      - name: Authenticate to Google Cloud
        uses: google-github-actions/auth@v2
        with:
          workload_identity_provider: ${{ env.GCP_WIF_PROVIDER }}
          service_account: ${{ env.GCP_SA }}

      - name: Login Helm to GAR OCI registry
        run: |
          gcloud auth print-access-token | \
            helm registry login -u oauth2accesstoken --password-stdin \
            "${GAR_LOCATION}-docker.pkg.dev"

      - name: Set Helm dependency repository (target env)
        run: |
          # Point the service-template dependency at the target env's chart repo.
          CHART_REPO="oci://${GAR_LOCATION}-docker.pkg.dev/${GCP_PROJECT_ID}/${GAR_HELM_REPOSITORY}"
          sed -i "/- name: service-template/,/repository:/ s#repository: \".*\"#repository: \"${CHART_REPO}\"#" "${CHART_DIR}/Chart.yaml"

      - name: Helm dependency update (Helm 4)
        run: helm dependency update "${CHART_DIR}"

      - name: Helm lint (Helm 4)
        run: helm lint "${CHART_DIR}" -f "${CHART_DIR}/${VALUES_FILE}"

      - name: Helm template render (Helm 4)
        env:
          IMAGE_REPO: ${{ needs.build.outputs.image_repo }}
          IMAGE_TAG: ${{ needs.build.outputs.image_tag }}
        run: |
          # The --set-string arguments are quoted so "[0]" is never treated as
          # a shell glob and the values cannot be word-split.
          helm template "${SERVICE_NAME}-helm4-compat" "${CHART_DIR}" \
            -f "${CHART_DIR}/${VALUES_FILE}" \
            --set-string "service-template.containers[0].image.repository=${IMAGE_REPO}" \
            --set-string "service-template.containers[0].image.tag=${IMAGE_TAG}" \
            > rendered-helm4.yaml

      - name: Guard rendered manifest fields
        run: |
          # Fail if the rendered output contains server-populated fields.
          if grep -nE '^[[:space:]]*(managedFields:|resourceVersion:|uid:|creationTimestamp:)$' rendered-helm4.yaml; then
            echo "Forbidden metadata fields found in rendered output."
            exit 1
          fi
          if grep -nE '^[[:space:]]*status:[[:space:]]*$' rendered-helm4.yaml; then
            echo "Forbidden status field found in rendered output."
            exit 1
          fi

  # ---------------------------------------------------------------------------
  # deploy-dev: deploys to testzeus-dev on push to main-prod, or to the chosen
  # dev/dev4/dev5 namespace on a manual non-prod run.
  # ---------------------------------------------------------------------------
  deploy-dev:
    if: (github.event_name == 'push' && github.ref == 'refs/heads/main-prod') || (github.event_name == 'workflow_dispatch' && github.event.inputs.deploy_prod == 'false' && (github.event.inputs.target_env == 'dev' || github.event.inputs.target_env == 'dev4' || github.event.inputs.target_env == 'dev5'))
    needs: [build, helm4-compat]
    runs-on: ubuntu-latest
    environment: dev
    # One deploy per namespace at a time; queued runs wait instead of cancelling.
    concurrency:
      group: deploy-${{ github.event_name == 'workflow_dispatch' && (inputs.target_env == 'dev' && 'testzeus-dev' || format('testzeus-{0}', inputs.target_env)) || 'testzeus-dev' }}
      cancel-in-progress: false
    env:
      GKE_CLUSTER: ${{ vars.GKE_DEV_CLUSTER }}
      GKE_LOCATION: ${{ vars.GKE_DEV_LOCATION }}
      GCP_PROJECT_ID: ${{ vars.GCP_PROJECT_ID_DEV }}
      VALUES_FILE: ${{ github.event_name == 'workflow_dispatch' && format('values-{0}.yaml', github.event.inputs.target_env) || 'values-dev.yaml' }}
      DEPLOY_NAMESPACE: ${{ github.event_name == 'workflow_dispatch' && format('testzeus-{0}', github.event.inputs.target_env) || 'testzeus-dev' }}
    steps:
      - uses: actions/checkout@v4
        with:
          ref: ${{ github.event_name == 'workflow_dispatch' && inputs.branch || github.ref }}

      - name: Set up Helm
        uses: azure/setup-helm@v4
        with:
          version: v4.0.0

      - name: Authenticate to Google Cloud
        uses: google-github-actions/auth@v2
        with:
          workload_identity_provider: ${{ secrets.GCP_WIF_PROVIDER_DEV }}
          service_account: ${{ secrets.GCP_SA_DEV }}

      - name: Get GKE credentials
        uses: google-github-actions/get-gke-credentials@v2
        with:
          cluster_name: ${{ env.GKE_CLUSTER }}
          location: ${{ env.GKE_LOCATION }}
          project_id: ${{ env.GCP_PROJECT_ID }}

      - name: Login Helm to GAR OCI registry
        run: |
          gcloud auth print-access-token | \
            helm registry login -u oauth2accesstoken --password-stdin \
            "${GAR_LOCATION}-docker.pkg.dev"

      - name: Set Helm dependency repository (dev)
        run: |
          CHART_REPO="oci://${GAR_LOCATION}-docker.pkg.dev/${{ vars.GCP_PROJECT_ID_DEV }}/${{ vars.GAR_HELM_REPOSITORY_DEV }}"
          sed -i "/- name: service-template/,/repository:/ s#repository: \".*\"#repository: \"${CHART_REPO}\"#" "${CHART_DIR}/Chart.yaml"

      - name: Helm dependency update
        run: helm dependency update "${CHART_DIR}"

      - name: Helm pre-deploy state check
        run: |
          # Informational only: reports the release state and never fails the
          # job, so the reset step below can still attempt recovery.
          echo "Checking existing release state for ${SERVICE_NAME} in ${DEPLOY_NAMESPACE}"
          echo "::group::Helm status"
          HELM_STATUS_OUTPUT=$(helm status "${SERVICE_NAME}" --namespace "${DEPLOY_NAMESPACE}" 2>&1 || true)
          echo "${HELM_STATUS_OUTPUT}"
          echo "::endgroup::"
          RELEASE_STATUS=$(printf '%s\n' "${HELM_STATUS_OUTPUT}" | awk '/^STATUS:/ {print $2; exit}')
          if [ -n "${RELEASE_STATUS}" ]; then
            echo "::notice::Release status: ${RELEASE_STATUS}"
            case "${RELEASE_STATUS}" in
              pending-install|pending-upgrade|pending-rollback)
                echo "::error::Helm release is stuck in ${RELEASE_STATUS}. This usually means another install, upgrade, or rollback is still in progress."
                {
                  echo "### Deployment summary"
                  echo ""
                  echo "| Field | Value |"
                  echo "| --- | --- |"
                  echo "| Environment | dev |"
                  echo "| Release | ${SERVICE_NAME} |"
                  echo "| Namespace | ${DEPLOY_NAMESPACE} |"
                  echo "| Result | blocked before deploy |"
                  echo "| Likely cause | Helm release is stuck in ${RELEASE_STATUS}. Another install, upgrade, or rollback is still in progress. |"
                } >> "${GITHUB_STEP_SUMMARY}"
                echo "::warning::Continuing so the recovery step can attempt rollback."
                ;;
            esac
          fi
          echo "::group::Helm history"
          HELM_HISTORY_OUTPUT=$(helm history "${SERVICE_NAME}" --namespace "${DEPLOY_NAMESPACE}" 2>&1 || true)
          echo "${HELM_HISTORY_OUTPUT}"
          echo "::endgroup::"

      - name: Reset stuck Helm release (if pending)
        run: |
          STATUS=$(helm status "${SERVICE_NAME}" --namespace "${DEPLOY_NAMESPACE}" --output json 2>/dev/null | jq -r '.info.status' || echo "not-found")
          # Without pipefail jq exits 0 on empty input, so a missing release
          # yields an empty string; normalise it for the log line below.
          STATUS="${STATUS:-not-found}"
          echo "Current release status: ${STATUS}"
          if [[ "${STATUS}" == "pending-upgrade" || "${STATUS}" == "pending-install" || "${STATUS}" == "pending-rollback" ]]; then
            echo "Release is stuck in '${STATUS}' — attempting rollback to last good revision..."
            LAST_GOOD=$(helm history "${SERVICE_NAME}" --namespace "${DEPLOY_NAMESPACE}" --output json | jq '[.[] | select(.status == "deployed")] | last | .revision')
            if [[ -n "${LAST_GOOD}" && "${LAST_GOOD}" != "null" ]]; then
              # No --wait: avoids re-hitting the ScaledObject readiness timeout on the old chart.
              # The upgrade step immediately following will be the source of truth for success.
              helm rollback "${SERVICE_NAME}" "${LAST_GOOD}" --namespace "${DEPLOY_NAMESPACE}" --timeout 5m
              echo "Rolled back to revision ${LAST_GOOD}"
            else
              # No deployed revision exists (first-ever install got stuck).
              # Uninstalling automatically would delete all cluster resources and cause downtime.
              # Require manual intervention to avoid accidental teardown.
              echo "::error::Release '${SERVICE_NAME}' is stuck in '${STATUS}' with no clean revision to roll back to."
              echo "::error::Manual fix required: kubectl delete secret -n ${DEPLOY_NAMESPACE} $(kubectl get secrets -n ${DEPLOY_NAMESPACE} -l owner=helm,name=${SERVICE_NAME} -o name)"
              exit 1
            fi
          fi

      - name: Force ExternalSecret sync
        if: github.event_name == 'workflow_dispatch' && inputs.refresh_secrets
        run: |
          # A fresh annotation value on each ExternalSecret requests a re-sync.
          SYNC_TOKEN=$(date +%s)
          echo "Forcing ExternalSecret sync in ${DEPLOY_NAMESPACE} (token=${SYNC_TOKEN})"
          for EXT_SECRET in $(kubectl get externalsecret -n "${DEPLOY_NAMESPACE}" -o name 2>/dev/null || true); do
            kubectl annotate "${EXT_SECRET}" -n "${DEPLOY_NAMESPACE}" external-secrets.io/force-sync="${SYNC_TOKEN}" --overwrite
          done

      - name: Helm deploy to dev
        env:
          IMAGE_REPO: ${{ needs.build.outputs.image_repo }}
          IMAGE_TAG: ${{ needs.build.outputs.image_tag }}
        run: |
          helm upgrade "${SERVICE_NAME}" "${CHART_DIR}" \
            -f "${CHART_DIR}/${VALUES_FILE}" \
            --set-string "service-template.containers[0].image.repository=${IMAGE_REPO}" \
            --set-string "service-template.containers[0].image.tag=${IMAGE_TAG}" \
            --namespace "${DEPLOY_NAMESPACE}" \
            --create-namespace \
            --install \
            --wait \
            --rollback-on-failure \
            --server-side=false \
            --timeout 5m

      - name: Collect Helm diagnostics on failure
        if: failure()
        run: |
          echo "Collecting Helm diagnostics for ${SERVICE_NAME}"
          echo "::group::Helm status"
          HELM_STATUS_OUTPUT=$(helm status "${SERVICE_NAME}" --namespace "${DEPLOY_NAMESPACE}" 2>&1 || true)
          echo "${HELM_STATUS_OUTPUT}"
          echo "::endgroup::"
          echo "::group::Helm history"
          HELM_HISTORY_OUTPUT=$(helm history "${SERVICE_NAME}" --namespace "${DEPLOY_NAMESPACE}" 2>&1 || true)
          echo "${HELM_HISTORY_OUTPUT}"
          echo "::endgroup::"
          echo "::group::Helm release resources"
          HELM_RESOURCES_OUTPUT=$(helm get all "${SERVICE_NAME}" --namespace "${DEPLOY_NAMESPACE}" 2>&1 || true)
          echo "${HELM_RESOURCES_OUTPUT}"
          echo "::endgroup::"
          echo "::group::Deployment describe"
          kubectl describe deployment "${SERVICE_NAME}" -n "${DEPLOY_NAMESPACE}" || true
          echo "::endgroup::"
          echo "::group::Release events"
          EVENTS_OUTPUT=$(
            {
              kubectl get events -n "${DEPLOY_NAMESPACE}" --sort-by=.lastTimestamp --field-selector involvedObject.kind=Deployment,involvedObject.name="${SERVICE_NAME}" 2>&1 || true
              kubectl get events -n "${DEPLOY_NAMESPACE}" --sort-by=.lastTimestamp --field-selector involvedObject.kind=ReplicaSet 2>/dev/null | grep -F "${SERVICE_NAME}" || true
              kubectl get events -n "${DEPLOY_NAMESPACE}" --sort-by=.lastTimestamp --field-selector involvedObject.kind=Pod 2>/dev/null | grep -F "${SERVICE_NAME}" || true
            } || true
          )
          echo "${EVENTS_OUTPUT}"
          echo "::endgroup::"
          echo "::group::Release pods"
          PODS_OUTPUT=$(kubectl get pods -n "${DEPLOY_NAMESPACE}" -l "app.kubernetes.io/instance=${SERVICE_NAME}" -o wide 2>&1 || true)
          echo "${PODS_OUTPUT}"
          echo "::endgroup::"
          # First matching pattern wins; order runs from most to least specific.
          seen() { printf '%s\n%s\n' "${EVENTS_OUTPUT}" "${PODS_OUTPUT}" | grep -qiE "$1"; }
          if seen 'Evicted|Insufficient memory'; then
            echo "::notice::Likely cause: node memory pressure or pod eviction."
          elif seen 'FailedScheduling|Insufficient cpu|Too many pods'; then
            echo "::notice::Likely cause: insufficient cluster capacity for the rollout."
          elif seen 'ImagePullBackOff|ErrImagePull'; then
            echo "::notice::Likely cause: image pull or registry access failure."
          elif seen 'CreateContainerConfigError|Secret .* not found|configmap .* not found'; then
            echo "::notice::Likely cause: missing secret or config map."
          elif seen 'FailedMount|MountVolume|volume attach|AttachVolume'; then
            echo "::notice::Likely cause: storage or volume attach failure."
          elif seen 'Forbidden|RBAC|permission denied'; then
            echo "::notice::Likely cause: permissions or RBAC failure."
          elif seen 'AdmissionWebhook|denied by webhook|policy violation'; then
            echo "::notice::Likely cause: admission webhook or policy rejection."
          elif seen 'x509|TLS|certificate'; then
            echo "::notice::Likely cause: TLS or certificate validation failure."
          elif seen 'context deadline exceeded|DeadlineExceeded|timed out'; then
            echo "::notice::Likely cause: rollout or readiness timeout."
          elif seen 'Readiness probe failed|Liveness probe failed|Back-off restarting failed container|CrashLoopBackOff|Error'; then
            echo "::notice::Likely cause: probe failures, a crashing container, or an application startup error."
          else
            echo "::notice::Likely cause: review Helm status, pod events, and pod state above."
          fi

  # ---------------------------------------------------------------------------
  # deploy-prod: manual-only deploy to testzeus-prod, run from main-prod.
  # Temporarily adds the runner IP to the cluster's authorized networks and
  # removes it again in the final always() step.
  # ---------------------------------------------------------------------------
  deploy-prod:
    if: github.event_name == 'workflow_dispatch' && github.ref == 'refs/heads/main-prod' && (github.event.inputs.deploy_prod == 'true' || github.event.inputs.target_env == 'main-prod')
    needs: [build, helm4-compat]
    runs-on: ubuntu-latest
    environment: production
    # Serialise prod deploys (same scheme as deploy-dev): parallel runs would
    # overwrite each other's authorized-network entries and race on the release.
    concurrency:
      group: deploy-testzeus-prod
      cancel-in-progress: false
    env:
      GKE_CLUSTER: ${{ vars.GKE_PROD_CLUSTER }}
      GKE_LOCATION: ${{ vars.GKE_PROD_LOCATION }}
      GCP_PROJECT_ID: ${{ vars.GCP_PROJECT_ID_PROD }}
      # Required by the reset and ExternalSecret steps; when it was unset they
      # ran against an empty namespace instead of the prod one.
      DEPLOY_NAMESPACE: testzeus-prod
      # Keep this aligned with the prod NAT IP from Terraform if it changes.
      BASELINE_RANGES: "10.20.0.0/16,10.100.0.0/24,34.121.117.161/32"
    steps:
      - uses: actions/checkout@v4

      - name: Set up Helm
        uses: azure/setup-helm@v4
        with:
          version: v4.0.0

      - name: Authenticate to Google Cloud
        uses: google-github-actions/auth@v2
        with:
          workload_identity_provider: ${{ secrets.GCP_WIF_PROVIDER_PROD }}
          service_account: ${{ secrets.GCP_SA_PROD }}

      - name: Get GKE credentials
        uses: google-github-actions/get-gke-credentials@v2
        with:
          cluster_name: ${{ env.GKE_CLUSTER }}
          location: ${{ env.GKE_LOCATION }}
          project_id: ${{ env.GCP_PROJECT_ID }}

      - name: Whitelist Runner IP
        run: |
          RUNNER_IP=$(curl -sf https://api.ipify.org) || { echo "Failed to get runner IP"; exit 1; }
          # Exported so the cleanup step knows which entry to remove.
          echo "RUNNER_IP=${RUNNER_IP}" >> "${GITHUB_ENV}"
          echo "Whitelisting IP: ${RUNNER_IP}"
          EXISTING_RANGES=$(gcloud container clusters describe "${GKE_CLUSTER}" \
            --location "${GKE_LOCATION}" --project "${GCP_PROJECT_ID}" \
            --format="value(masterAuthorizedNetworksConfig.cidrBlocks[].cidrBlock)" \
            | tr '\n' ',' | tr ';' ',' | sed 's/,$//; s/,,*/,/g')
          if [[ ",${EXISTING_RANGES}," == *",${RUNNER_IP}/32,"* ]]; then
            echo "IP already whitelisted."
          else
            # The list is rebuilt from the baseline, not from EXISTING_RANGES.
            NEW_RANGES="${BASELINE_RANGES},${RUNNER_IP}/32"
            gcloud container clusters update "${GKE_CLUSTER}" \
              --location "${GKE_LOCATION}" --project "${GCP_PROJECT_ID}" \
              --enable-authorized-networks-on-private-endpoint \
              --enable-google-cloud-access \
              --enable-master-authorized-networks \
              --master-authorized-networks "$NEW_RANGES"
          fi

      - name: Login Helm to GAR OCI registry
        run: |
          gcloud auth print-access-token | \
            helm registry login -u oauth2accesstoken --password-stdin \
            "${GAR_LOCATION}-docker.pkg.dev"

      - name: Set Helm dependency repository (prod)
        run: |
          CHART_REPO="oci://${GAR_LOCATION}-docker.pkg.dev/${{ vars.GCP_PROJECT_ID_PROD }}/${{ vars.GAR_HELM_REPOSITORY_PROD }}"
          sed -i "/- name: service-template/,/repository:/ s#repository: \".*\"#repository: \"${CHART_REPO}\"#" "${CHART_DIR}/Chart.yaml"

      - name: Helm dependency update
        run: helm dependency update "${CHART_DIR}"

      - name: Reset stuck Helm release (if pending)
        run: |
          STATUS=$(helm status "${SERVICE_NAME}" --namespace "${DEPLOY_NAMESPACE}" --output json 2>/dev/null \
            | jq -r '.info.status' || echo "not-found")
          # Without pipefail jq exits 0 on empty input, so a missing release
          # yields an empty string; normalise it for the log line below.
          STATUS="${STATUS:-not-found}"
          echo "Current release status: ${STATUS}"
          if [[ "${STATUS}" == "pending-upgrade" || "${STATUS}" == "pending-install" || "${STATUS}" == "pending-rollback" ]]; then
            echo "Release is stuck in '${STATUS}' — attempting rollback to last good revision..."
            LAST_GOOD=$(helm history "${SERVICE_NAME}" --namespace "${DEPLOY_NAMESPACE}" --output json \
              | jq '[.[] | select(.status == "deployed")] | last | .revision')
            if [[ -n "${LAST_GOOD}" && "${LAST_GOOD}" != "null" ]]; then
              # No --wait: avoids re-hitting the ScaledObject readiness timeout on the old chart.
              # The upgrade step immediately following will be the source of truth for success.
              helm rollback "${SERVICE_NAME}" "${LAST_GOOD}" --namespace "${DEPLOY_NAMESPACE}" --timeout 5m
              echo "Rolled back to revision ${LAST_GOOD}"
            else
              # No deployed revision exists (first-ever install got stuck).
              # Uninstalling automatically would delete all cluster resources and cause downtime.
              # Require manual intervention to avoid accidental teardown.
              echo "::error::Release '${SERVICE_NAME}' is stuck in '${STATUS}' with no clean revision to roll back to."
              echo "::error::Manual fix required: kubectl delete secret -n ${DEPLOY_NAMESPACE} $(kubectl get secrets -n ${DEPLOY_NAMESPACE} -l owner=helm,name=${SERVICE_NAME} -o name)"
              exit 1
            fi
          fi

      - name: Force ExternalSecret sync
        if: github.event_name == 'workflow_dispatch' && inputs.refresh_secrets
        run: |
          # A fresh annotation value on each ExternalSecret requests a re-sync.
          SYNC_TOKEN=$(date +%s)
          echo "Forcing ExternalSecret sync in ${DEPLOY_NAMESPACE} (token=${SYNC_TOKEN})"
          for EXT_SECRET in $(kubectl get externalsecret -n "${DEPLOY_NAMESPACE}" -o name 2>/dev/null || true); do
            kubectl annotate "${EXT_SECRET}" -n "${DEPLOY_NAMESPACE}" external-secrets.io/force-sync="${SYNC_TOKEN}" --overwrite
          done

      - name: Helm deploy to prod
        env:
          IMAGE_REPO: ${{ needs.build.outputs.image_repo }}
          IMAGE_TAG: ${{ needs.build.outputs.image_tag }}
        run: |
          helm upgrade "${SERVICE_NAME}" "${CHART_DIR}" \
            -f "${CHART_DIR}/values-prod.yaml" \
            --set-string "service-template.containers[0].image.repository=${IMAGE_REPO}" \
            --set-string "service-template.containers[0].image.tag=${IMAGE_TAG}" \
            --namespace "${DEPLOY_NAMESPACE}" \
            --create-namespace \
            --install \
            --wait \
            --rollback-on-failure \
            --server-side=false \
            --timeout 5m

      - name: Collect Helm diagnostics on failure
        if: failure()
        run: |
          echo "Collecting Helm diagnostics for ${SERVICE_NAME}"
          echo "::group::Helm status"
          HELM_STATUS_OUTPUT=$(helm status "${SERVICE_NAME}" --namespace "${DEPLOY_NAMESPACE}" 2>&1 || true)
          echo "${HELM_STATUS_OUTPUT}"
          echo "::endgroup::"
          echo "::group::Helm history"
          HELM_HISTORY_OUTPUT=$(helm history "${SERVICE_NAME}" --namespace "${DEPLOY_NAMESPACE}" 2>&1 || true)
          echo "${HELM_HISTORY_OUTPUT}"
          echo "::endgroup::"
          echo "::group::Helm release resources"
          HELM_RESOURCES_OUTPUT=$(helm get all "${SERVICE_NAME}" --namespace "${DEPLOY_NAMESPACE}" 2>&1 || true)
          echo "${HELM_RESOURCES_OUTPUT}"
          echo "::endgroup::"
          echo "::group::Namespace events"
          EVENTS_OUTPUT=$(kubectl get events -n "${DEPLOY_NAMESPACE}" --sort-by=.lastTimestamp 2>&1 || true)
          echo "${EVENTS_OUTPUT}"
          echo "::endgroup::"
          echo "::group::Pods"
          PODS_OUTPUT=$(kubectl get pods -n "${DEPLOY_NAMESPACE}" -o wide 2>&1 || true)
          echo "${PODS_OUTPUT}"
          echo "::endgroup::"
          # First matching pattern wins; order runs from most to least specific.
          seen() { printf '%s\n%s\n' "${EVENTS_OUTPUT}" "${PODS_OUTPUT}" | grep -qiE "$1"; }
          if seen 'Evicted|Insufficient memory'; then
            echo "::notice::Likely cause: node memory pressure or pod eviction."
          elif seen 'FailedScheduling|Insufficient cpu|Too many pods'; then
            echo "::notice::Likely cause: insufficient cluster capacity for the rollout."
          elif seen 'ImagePullBackOff|ErrImagePull'; then
            echo "::notice::Likely cause: image pull or registry access failure."
          elif seen 'CreateContainerConfigError|Secret .* not found|configmap .* not found'; then
            echo "::notice::Likely cause: missing secret or config map."
          elif seen 'FailedMount|MountVolume|volume attach|AttachVolume'; then
            echo "::notice::Likely cause: storage or volume attach failure."
          elif seen 'Forbidden|RBAC|permission denied'; then
            echo "::notice::Likely cause: permissions or RBAC failure."
          elif seen 'AdmissionWebhook|denied by webhook|policy violation'; then
            echo "::notice::Likely cause: admission webhook or policy rejection."
          elif seen 'x509|TLS|certificate'; then
            echo "::notice::Likely cause: TLS or certificate validation failure."
          elif seen 'context deadline exceeded|DeadlineExceeded|timed out'; then
            echo "::notice::Likely cause: rollout or readiness timeout."
          elif seen 'Readiness probe failed|Liveness probe failed|Back-off restarting failed container|CrashLoopBackOff|Error'; then
            echo "::notice::Likely cause: probe failures, a crashing container, or an application startup error."
          else
            echo "::notice::Likely cause: review Helm status, pod events, and pod state above."
          fi

      - name: Cleanup Runner IP
        if: always()
        run: |
          # RUNNER_IP is empty when the whitelist step never ran or failed early.
          if [ -n "${RUNNER_IP}" ]; then
            echo "Removing Runner IP: ${RUNNER_IP}"
            EXISTING_RANGES=$(gcloud container clusters describe "${GKE_CLUSTER}" \
              --location "${GKE_LOCATION}" --project "${GCP_PROJECT_ID}" \
              --format="value(masterAuthorizedNetworksConfig.cidrBlocks[].cidrBlock)" \
              | tr '\n' ',' | tr ';' ',' | sed 's/,$//; s/,,*/,/g')
            # Drop only the entry that is exactly "<runner ip>/32". A substring
            # or regex delete would also corrupt entries such as 34.x.y.z/32
            # when the runner IP is 4.x.y.z.
            REMAINING_RANGES=$(
              printf '%s\n' "${EXISTING_RANGES}" \
                | tr ',' '\n' \
                | { grep -vxF "${RUNNER_IP}/32" || true; } \
                | sed '/^$/d' \
                | paste -sd, -
            )
            if [ -n "$REMAINING_RANGES" ]; then
              gcloud container clusters update "${GKE_CLUSTER}" \
                --location "${GKE_LOCATION}" --project "${GCP_PROJECT_ID}" \
                --enable-authorized-networks-on-private-endpoint \
                --enable-google-cloud-access \
                --enable-master-authorized-networks \
                --master-authorized-networks "$REMAINING_RANGES"
            else
              echo "::error::Refusing to disable master authorized networks during prod cleanup because no baseline authorized ranges remain."
              exit 1
            fi
          fi