Merge pull request #28 from test-zeus-ai/helm-updates #31
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Builds the trace-viewer image and deploys its Helm chart to GKE.
name: Helm - GKE workflow

on:
  # Automatic runs: pushes to main-prod that touch the workflow, the chart,
  # or any of the inputs of the image build.
  push:
    branches:
      - main-prod
    paths:
      - '.github/workflows/deploy.yaml'
      - 'helm/**'
      - 'package.json'
      - 'package-lock.json'
      - 'packages/injected/**'
      - 'packages/playwright-core/**'
      - 'packages/protocol/**'
      - 'packages/trace/**'
      - 'packages/trace-viewer/**'
      - 'packages/web/**'
      - 'utils/build/**'

  # Manual runs: the inputs below select the target environment and branch.
  workflow_dispatch:
    inputs:
      deploy_prod:
        type: boolean
        description: "Deploy to production"
        required: true
        default: false
      target_env:
        type: choice
        description: "Deploy environment (dev/dev4/dev5/main-prod)"
        required: false
        default: "dev"
        options:
          - dev
          - dev4
          - dev5
          - main-prod
      refresh_secrets:
        type: boolean
        description: "Force refresh ExternalSecrets before deploy, use only after secret changes or rotation"
        required: false
        default: false
      branch:
        description: "Branch to deploy (for dev/dev4/dev5)"
        required: false
        default: "main-prod"

# id-token: write lets the jobs request an OIDC token, which the
# workload-identity authentication steps in this workflow rely on.
permissions:
  contents: read
  id-token: write

# Values shared by every job.
env:
  SERVICE_NAME: traceviewer
  APP_DIR: "."
  DOCKERFILE: packages/trace-viewer/Dockerfile
  CHART_DIR: helm
  GAR_LOCATION: ${{ vars.GAR_LOCATION }}
| jobs: | |
# Builds the service image and pushes it to the registry.
build:
  runs-on: ubuntu-latest
  permissions:
    id-token: write
    contents: read
  # Image coordinates, read by the Helm jobs through needs.build.outputs.*
  outputs:
    image_repo: ${{ steps.meta.outputs.image_repo }}
    image_tag: ${{ steps.meta.outputs.image_tag }}
  steps:
# Manual runs check out the requested branch; other events use the triggering ref.
- uses: actions/checkout@v4
  with:
    ref: >-
      ${{
      github.event_name == 'workflow_dispatch'
      && inputs.branch
      || github.ref
      }}

# The prod identity is selected only for manual runs that ask for production
# (deploy_prod or target_env == main-prod); everything else uses the dev identity.
- name: Authenticate to Google Cloud
  id: auth
  uses: google-github-actions/auth@v2
  with:
    token_format: access_token
    workload_identity_provider: >-
      ${{
      github.event_name == 'workflow_dispatch'
      && (github.event.inputs.deploy_prod == 'true' || github.event.inputs.target_env == 'main-prod')
      && secrets.GCP_WIF_PROVIDER_PROD
      || secrets.GCP_WIF_PROVIDER_DEV
      }}
    service_account: >-
      ${{
      github.event_name == 'workflow_dispatch'
      && (github.event.inputs.deploy_prod == 'true' || github.event.inputs.target_env == 'main-prod')
      && secrets.GCP_SA_PROD
      || secrets.GCP_SA_DEV
      }}

- name: Setup gcloud
  uses: google-github-actions/setup-gcloud@v2

- name: Set up Docker Buildx
  uses: docker/setup-buildx-action@v3
  with:
    driver: docker-container

# Registry login with the access token minted by the auth step above.
- name: Login to GAR
  uses: docker/login-action@v3
  with:
    registry: ${{ format('{0}-docker.pkg.dev', env.GAR_LOCATION) }}
    username: oauth2accesstoken
    password: ${{ steps.auth.outputs.access_token }}
# Computes the image coordinates and publishes them as step outputs
# (image_tag, image_repo, git_sha).
- name: Build image metadata
  id: meta
  env:
    # Resolved here rather than spliced into the script text, so the values
    # reach bash as data.
    PROJECT_ID: >-
      ${{
      github.event_name == 'workflow_dispatch'
      && (github.event.inputs.deploy_prod == 'true' || github.event.inputs.target_env == 'main-prod')
      && vars.GCP_PROJECT_ID_PROD
      || vars.GCP_PROJECT_ID_DEV
      }}
    GAR_REPO: >-
      ${{
      github.event_name == 'workflow_dispatch'
      && (github.event.inputs.deploy_prod == 'true' || github.event.inputs.target_env == 'main-prod')
      && vars.GAR_DOCKER_REPOSITORY_PROD
      || vars.GAR_DOCKER_REPOSITORY_DEV
      }}
  run: |
    set -euo pipefail
    # Tag with the commit that is actually checked out. On manual runs the
    # checkout step may select inputs.branch, in which case GITHUB_SHA (the
    # ref the run was dispatched from) would label the image with the wrong
    # commit. On push events both values are the same.
    BUILD_SHA="$(git rev-parse HEAD)"
    TS="$(date -u +%Y%m%d%H%M%S)"
    IMAGE_TAG="${BUILD_SHA}-${GITHUB_RUN_ATTEMPT}-${TS}"
    IMAGE_REPO="${GAR_LOCATION}-docker.pkg.dev/${PROJECT_ID}/${GAR_REPO}/${SERVICE_NAME}"
    {
      echo "image_tag=${IMAGE_TAG}"
      echo "image_repo=${IMAGE_REPO}"
      echo "git_sha=${BUILD_SHA}"
    } >>"${GITHUB_OUTPUT}"

- name: Build and push image
  uses: docker/build-push-action@v6
  with:
    context: ${{ env.APP_DIR }}
    file: ${{ env.DOCKERFILE }}
    push: true
    tags: ${{ steps.meta.outputs.image_repo }}:${{ steps.meta.outputs.image_tag }}
    # GIT_SHA / GIT_REF describe the checked-out source (see the meta step).
    # BUILD_TIME carries the full image tag, which ends in the UTC timestamp.
    build-args: |
      GIT_SHA=${{ steps.meta.outputs.git_sha }}
      GIT_REF=${{ github.event_name == 'workflow_dispatch' && inputs.branch || github.ref_name }}
      BUILD_TIME=${{ steps.meta.outputs.image_tag }}
    cache-from: type=gha
    cache-to: type=gha,mode=max
# Renders and lints the chart with Helm 4 before any deploy job runs.
# Every env value follows the same rule: a manual run asking for production
# (deploy_prod or target_env == main-prod) gets the prod value, otherwise the
# dev value (or one derived from target_env where the expression says so).
helm4-compat:
  if: >-
    github.ref == 'refs/heads/main-prod'
    && (github.event_name == 'push' || github.event_name == 'workflow_dispatch')
  needs: build
  runs-on: ubuntu-latest
  env:
    TARGET_ENV: >-
      ${{
      github.event_name == 'workflow_dispatch'
      && (github.event.inputs.deploy_prod == 'true' || github.event.inputs.target_env == 'main-prod')
      && 'prod'
      || (github.event_name == 'workflow_dispatch' && github.event.inputs.target_env != '' && github.event.inputs.target_env)
      || 'dev'
      }}
    VALUES_FILE: >-
      ${{
      github.event_name == 'workflow_dispatch'
      && (github.event.inputs.deploy_prod == 'true' || github.event.inputs.target_env == 'main-prod')
      && 'values-prod.yaml'
      || (github.event_name == 'workflow_dispatch' && github.event.inputs.target_env != '' && format('values-{0}.yaml', github.event.inputs.target_env))
      || 'values-dev.yaml'
      }}
    DEPLOY_NAMESPACE: >-
      ${{
      github.event_name == 'workflow_dispatch'
      && (github.event.inputs.deploy_prod == 'true' || github.event.inputs.target_env == 'main-prod')
      && 'testzeus-prod'
      || (github.event_name == 'workflow_dispatch' && github.event.inputs.target_env != '' && format('testzeus-{0}', github.event.inputs.target_env))
      || 'testzeus-dev'
      }}
    GKE_CLUSTER: >-
      ${{
      github.event_name == 'workflow_dispatch'
      && (github.event.inputs.deploy_prod == 'true' || github.event.inputs.target_env == 'main-prod')
      && vars.GKE_PROD_CLUSTER
      || vars.GKE_DEV_CLUSTER
      }}
    GKE_LOCATION: >-
      ${{
      github.event_name == 'workflow_dispatch'
      && (github.event.inputs.deploy_prod == 'true' || github.event.inputs.target_env == 'main-prod')
      && vars.GKE_PROD_LOCATION
      || vars.GKE_DEV_LOCATION
      }}
    GCP_PROJECT_ID: >-
      ${{
      github.event_name == 'workflow_dispatch'
      && (github.event.inputs.deploy_prod == 'true' || github.event.inputs.target_env == 'main-prod')
      && vars.GCP_PROJECT_ID_PROD
      || vars.GCP_PROJECT_ID_DEV
      }}
    GAR_HELM_REPOSITORY: >-
      ${{
      github.event_name == 'workflow_dispatch'
      && (github.event.inputs.deploy_prod == 'true' || github.event.inputs.target_env == 'main-prod')
      && vars.GAR_HELM_REPOSITORY_PROD
      || vars.GAR_HELM_REPOSITORY_DEV
      }}
    GCP_SA: >-
      ${{
      github.event_name == 'workflow_dispatch'
      && (github.event.inputs.deploy_prod == 'true' || github.event.inputs.target_env == 'main-prod')
      && secrets.GCP_SA_PROD
      || secrets.GCP_SA_DEV
      }}
    GCP_WIF_PROVIDER: >-
      ${{
      github.event_name == 'workflow_dispatch'
      && (github.event.inputs.deploy_prod == 'true' || github.event.inputs.target_env == 'main-prod')
      && secrets.GCP_WIF_PROVIDER_PROD
      || secrets.GCP_WIF_PROVIDER_DEV
      }}
  steps:
- uses: actions/checkout@v4

- name: Set up Helm 4 (compat check)
  uses: azure/setup-helm@v4
  with:
    version: v4.0.0

# Identity comes from the job-level env, already resolved to prod or dev.
- name: Authenticate to Google Cloud
  uses: google-github-actions/auth@v2
  with:
    workload_identity_provider: ${{ env.GCP_WIF_PROVIDER }}
    service_account: ${{ env.GCP_SA }}

# Pipe a fresh access token into the Helm registry login.
- name: Login Helm to GAR OCI registry
  run: |
    gcloud auth print-access-token \
      | helm registry login --username oauth2accesstoken --password-stdin "${GAR_LOCATION}-docker.pkg.dev"

# Rewrite the repository of the service-template dependency in Chart.yaml so
# it points at the OCI repository of the target project.
- name: Set Helm dependency repository (target env)
  run: |
    oci_repo="oci://${GAR_LOCATION}-docker.pkg.dev/${GCP_PROJECT_ID}/${GAR_HELM_REPOSITORY}"
    sed -i "/- name: service-template/,/repository:/ s#repository: \".*\"#repository: \"${oci_repo}\"#" "${CHART_DIR}/Chart.yaml"

- name: Helm dependency update (Helm 4)
  run: |
    helm dependency update "${CHART_DIR}"

- name: Helm lint (Helm 4)
  run: |
    helm lint "${CHART_DIR}" -f "${CHART_DIR}/${VALUES_FILE}"
# Renders the chart with the freshly built image coordinates into
# rendered-helm4.yaml for the guard step that follows.
- name: Helm template render (Helm 4)
  env:
    # Passed through the environment so the values reach bash as data.
    IMAGE_REPO: ${{ needs.build.outputs.image_repo }}
    IMAGE_TAG: ${{ needs.build.outputs.image_tag }}
  run: |
    # The --set-string arguments are quoted: "[0]" is a pathname-expansion
    # pattern to bash, and quoting also keeps each key=value a single word.
    helm template "${SERVICE_NAME}-helm4-compat" "${CHART_DIR}" \
      -f "${CHART_DIR}/${VALUES_FILE}" \
      --set-string "service-template.containers[0].image.repository=${IMAGE_REPO}" \
      --set-string "service-template.containers[0].image.tag=${IMAGE_TAG}" \
      > rendered-helm4.yaml
# Fails the job when the rendered manifests contain the listed metadata keys
# or a bare status: block.
- name: Guard rendered manifest fields
  run: |
    # Match each key whether or not a value follows on the same line.
    # resourceVersion, uid and creationTimestamp are scalars, so an anchor
    # directly after the colon would only ever catch them when empty.
    # NOTE(review): this also rejects "creationTimestamp: null"; confirm the
    # chart dependencies do not render that.
    if grep -nE '^[[:space:]]*(managedFields|resourceVersion|uid|creationTimestamp):([[:space:]].*)?$' rendered-helm4.yaml; then
      echo "Forbidden metadata fields found in rendered output."
      exit 1
    fi
    # A bare "status:" line starts a nested status block.
    if grep -nE '^[[:space:]]*status:[[:space:]]*$' rendered-helm4.yaml; then
      echo "Forbidden status field found in rendered output."
      exit 1
    fi
# Deploys to a dev namespace: on every push to main-prod, or on manual runs
# that target dev/dev4/dev5 without deploy_prod.
deploy-dev:
  if: >-
    (github.event_name == 'push' && github.ref == 'refs/heads/main-prod')
    || (github.event_name == 'workflow_dispatch'
    && github.event.inputs.deploy_prod == 'false'
    && (github.event.inputs.target_env == 'dev' || github.event.inputs.target_env == 'dev4' || github.event.inputs.target_env == 'dev5'))
  needs:
    - build
    - helm4-compat
  runs-on: ubuntu-latest
  environment: dev
  # One deploy at a time per target namespace; queued runs wait, they are not cancelled.
  concurrency:
    group: >-
      deploy-${{
      github.event_name == 'workflow_dispatch'
      && (inputs.target_env == 'dev' && 'testzeus-dev' || format('testzeus-{0}', inputs.target_env))
      || 'testzeus-dev'
      }}
    cancel-in-progress: false
  env:
    GCP_PROJECT_ID: ${{ vars.GCP_PROJECT_ID_DEV }}
    GKE_CLUSTER: ${{ vars.GKE_DEV_CLUSTER }}
    GKE_LOCATION: ${{ vars.GKE_DEV_LOCATION }}
    # Manual runs derive the values file and namespace from target_env; pushes use dev.
    VALUES_FILE: >-
      ${{
      github.event_name == 'workflow_dispatch'
      && format('values-{0}.yaml', github.event.inputs.target_env)
      || 'values-dev.yaml'
      }}
    DEPLOY_NAMESPACE: >-
      ${{
      github.event_name == 'workflow_dispatch'
      && format('testzeus-{0}', github.event.inputs.target_env)
      || 'testzeus-dev'
      }}
  steps:
# Manual runs deploy the chart from the requested branch.
- uses: actions/checkout@v4
  with:
    ref: >-
      ${{
      github.event_name == 'workflow_dispatch'
      && inputs.branch
      || github.ref
      }}

- name: Set up Helm
  uses: azure/setup-helm@v4
  with:
    version: v4.0.0

- name: Authenticate to Google Cloud
  uses: google-github-actions/auth@v2
  with:
    service_account: ${{ secrets.GCP_SA_DEV }}
    workload_identity_provider: ${{ secrets.GCP_WIF_PROVIDER_DEV }}

- name: Get GKE credentials
  uses: google-github-actions/get-gke-credentials@v2
  with:
    project_id: ${{ env.GCP_PROJECT_ID }}
    location: ${{ env.GKE_LOCATION }}
    cluster_name: ${{ env.GKE_CLUSTER }}

# Pipe a fresh access token into the Helm registry login.
- name: Login Helm to GAR OCI registry
  run: |
    gcloud auth print-access-token \
      | helm registry login --username oauth2accesstoken --password-stdin "${GAR_LOCATION}-docker.pkg.dev"

# Point the service-template dependency in Chart.yaml at the dev OCI repository.
- name: Set Helm dependency repository (dev)
  run: |
    oci_repo="oci://${GAR_LOCATION}-docker.pkg.dev/${{ vars.GCP_PROJECT_ID_DEV }}/${{ vars.GAR_HELM_REPOSITORY_DEV }}"
    sed -i "/- name: service-template/,/repository:/ s#repository: \".*\"#repository: \"${oci_repo}\"#" "${CHART_DIR}/Chart.yaml"

- name: Helm dependency update
  run: |
    helm dependency update "${CHART_DIR}"
# Logs the current release status and history. A pending-* status is reported
# in the log and the step summary, but the step itself never fails.
- name: Helm pre-deploy state check
  run: |
    echo "Checking existing release state for ${SERVICE_NAME} in ${DEPLOY_NAMESPACE}"

    # A failing `helm status` must not stop the step; its message is shown instead.
    status_text=$(helm status "${SERVICE_NAME}" --namespace "${DEPLOY_NAMESPACE}" 2>&1 || true)
    echo "::group::Helm status"
    echo "${status_text}"
    echo "::endgroup::"

    # Second field of the first "STATUS:" line; empty when there is no such line.
    release_state=$(printf '%s\n' "${status_text}" | awk '/^STATUS:/ {print $2; exit}')

    if [ -n "${release_state}" ]; then
      echo "::notice::Release status: ${release_state}"
      if [[ "${release_state}" =~ ^pending-(install|upgrade|rollback)$ ]]; then
        echo "::error::Helm release is stuck in ${release_state}. This usually means another install, upgrade, or rollback is still in progress."
        printf '%s\n' \
          "### Deployment summary" \
          "" \
          "| Field | Value |" \
          "| --- | --- |" \
          "| Environment | dev |" \
          "| Release | ${SERVICE_NAME} |" \
          "| Namespace | ${DEPLOY_NAMESPACE} |" \
          "| Result | blocked before deploy |" \
          "| Likely cause | Helm release is stuck in ${release_state}. Another install, upgrade, or rollback is still in progress. |" \
          >> "${GITHUB_STEP_SUMMARY}"
        echo "::warning::Continuing so the recovery step can attempt rollback."
      fi
    fi

    history_text=$(helm history "${SERVICE_NAME}" --namespace "${DEPLOY_NAMESPACE}" 2>&1 || true)
    echo "::group::Helm history"
    echo "${history_text}"
    echo "::endgroup::"
# When the release is stuck in a pending-* state, roll it back to the most
# recent revision whose status is "deployed". Fails with manual instructions
# when no such revision exists.
- name: Reset stuck Helm release (if pending)
  run: |
    # When `helm status` fails (for example, no such release) jq receives no
    # input, prints nothing and still exits 0, so an `|| echo` fallback on
    # the pipeline never fires. Default the empty result explicitly instead.
    STATUS=$(helm status "${SERVICE_NAME}" --namespace "${DEPLOY_NAMESPACE}" --output json 2>/dev/null \
      | jq -r '.info.status // empty' 2>/dev/null || true)
    STATUS="${STATUS:-not-found}"
    echo "Current release status: ${STATUS}"

    if [[ "${STATUS}" == "pending-upgrade" || "${STATUS}" == "pending-install" || "${STATUS}" == "pending-rollback" ]]; then
      echo "Release is stuck in '${STATUS}' — attempting rollback to last good revision..."
      LAST_GOOD=$(helm history "${SERVICE_NAME}" --namespace "${DEPLOY_NAMESPACE}" --output json \
        | jq -r '[.[] | select(.status == "deployed")] | last | .revision // empty')
      if [[ -n "${LAST_GOOD}" ]]; then
        # No --wait: avoids re-hitting the ScaledObject readiness timeout on the old chart.
        # The upgrade step immediately following will be the source of truth for success.
        helm rollback "${SERVICE_NAME}" "${LAST_GOOD}" --namespace "${DEPLOY_NAMESPACE}" --timeout 5m
        echo "Rolled back to revision ${LAST_GOOD}"
      else
        # No deployed revision exists (first-ever install got stuck).
        # Uninstalling automatically would delete all cluster resources and cause downtime.
        # Require manual intervention to avoid accidental teardown.
        echo "::error::Release '${SERVICE_NAME}' is stuck in '${STATUS}' with no clean revision to roll back to."
        # Print a single-line command that selects the stuck release record by
        # label. `kubectl delete secret` cannot be combined with secret/NAME
        # arguments, and a multi-line list would break the annotation.
        echo "::error::Manual fix required: kubectl delete secret -n ${DEPLOY_NAMESPACE} -l owner=helm,name=${SERVICE_NAME},status=${STATUS}"
        exit 1
      fi
    fi
# Manual runs with refresh_secrets only: stamp every ExternalSecret in the
# namespace with a fresh force-sync annotation.
- name: Force ExternalSecret sync
  if: github.event_name == 'workflow_dispatch' && inputs.refresh_secrets
  run: |
    SYNC_TOKEN=$(date +%s)
    echo "Forcing ExternalSecret sync in ${DEPLOY_NAMESPACE} (token=${SYNC_TOKEN})"
    # A failed listing yields an empty list, so the loop simply does nothing.
    ext_secrets=$(kubectl get externalsecret -n "${DEPLOY_NAMESPACE}" -o name 2>/dev/null || true)
    for item in ${ext_secrets}; do
      kubectl annotate "${item}" -n "${DEPLOY_NAMESPACE}" external-secrets.io/force-sync="${SYNC_TOKEN}" --overwrite
    done
# Installs or upgrades the release with the image built by the build job,
# waits for readiness and rolls back automatically on failure.
- name: Helm deploy to dev
  env:
    # Passed through the environment so the values reach bash as data.
    IMAGE_REPO: ${{ needs.build.outputs.image_repo }}
    IMAGE_TAG: ${{ needs.build.outputs.image_tag }}
  run: |
    # The --set-string arguments are quoted: "[0]" is a pathname-expansion
    # pattern to bash, and quoting also keeps each key=value a single word.
    helm upgrade "${SERVICE_NAME}" "${CHART_DIR}" \
      -f "${CHART_DIR}/${VALUES_FILE}" \
      --set-string "service-template.containers[0].image.repository=${IMAGE_REPO}" \
      --set-string "service-template.containers[0].image.tag=${IMAGE_TAG}" \
      --namespace "${DEPLOY_NAMESPACE}" \
      --create-namespace \
      --install \
      --wait \
      --rollback-on-failure \
      --server-side=false \
      --timeout 5m
# Runs only after a failed step: dumps Helm and Kubernetes state, then prints
# one "Likely cause" notice chosen by the first keyword pattern that matches.
- name: Collect Helm diagnostics on failure
  if: failure()
  run: |
    # log_group <title> <text>: print text inside a collapsible log group.
    log_group() {
      echo "::group::$1"
      echo "$2"
      echo "::endgroup::"
    }
    # evidence_of <ERE>: true when the events or pod listing match, ignoring case.
    evidence_of() {
      printf '%s\n%s\n' "${EVENTS_OUTPUT}" "${PODS_OUTPUT}" | grep -qiE "$1"
    }
    # likely_cause <text>: emit the diagnosis as a workflow notice.
    likely_cause() {
      echo "::notice::Likely cause: $1"
    }

    echo "Collecting Helm diagnostics for ${SERVICE_NAME}"
    log_group "Helm status" "$(helm status "${SERVICE_NAME}" --namespace "${DEPLOY_NAMESPACE}" 2>&1 || true)"
    log_group "Helm history" "$(helm history "${SERVICE_NAME}" --namespace "${DEPLOY_NAMESPACE}" 2>&1 || true)"
    log_group "Helm release resources" "$(helm get all "${SERVICE_NAME}" --namespace "${DEPLOY_NAMESPACE}" 2>&1 || true)"

    echo "::group::Deployment describe"
    kubectl describe deployment "${SERVICE_NAME}" -n "${DEPLOY_NAMESPACE}" || true
    echo "::endgroup::"

    # Events of the Deployment itself, plus ReplicaSet and Pod events whose
    # lines mention the service name.
    EVENTS_OUTPUT=$(
      {
        kubectl get events -n "${DEPLOY_NAMESPACE}" --sort-by=.lastTimestamp --field-selector involvedObject.kind=Deployment,involvedObject.name="${SERVICE_NAME}" 2>&1 || true
        kubectl get events -n "${DEPLOY_NAMESPACE}" --sort-by=.lastTimestamp --field-selector involvedObject.kind=ReplicaSet 2>/dev/null | grep -F "${SERVICE_NAME}" || true
        kubectl get events -n "${DEPLOY_NAMESPACE}" --sort-by=.lastTimestamp --field-selector involvedObject.kind=Pod 2>/dev/null | grep -F "${SERVICE_NAME}" || true
      } || true
    )
    log_group "Release events" "${EVENTS_OUTPUT}"

    PODS_OUTPUT=$(kubectl get pods -n "${DEPLOY_NAMESPACE}" -l "app.kubernetes.io/instance=${SERVICE_NAME}" -o wide 2>&1 || true)
    log_group "Release pods" "${PODS_OUTPUT}"

    # First match wins, so the order below is significant.
    if evidence_of 'Evicted|Insufficient memory'; then
      likely_cause "node memory pressure or pod eviction."
    elif evidence_of 'FailedScheduling|Insufficient cpu|Too many pods'; then
      likely_cause "insufficient cluster capacity for the rollout."
    elif evidence_of 'ImagePullBackOff|ErrImagePull'; then
      likely_cause "image pull or registry access failure."
    elif evidence_of 'CreateContainerConfigError|Secret .* not found|configmap .* not found'; then
      likely_cause "missing secret or config map."
    elif evidence_of 'FailedMount|MountVolume|volume attach|AttachVolume'; then
      likely_cause "storage or volume attach failure."
    elif evidence_of 'Forbidden|RBAC|permission denied'; then
      likely_cause "permissions or RBAC failure."
    elif evidence_of 'AdmissionWebhook|denied by webhook|policy violation'; then
      likely_cause "admission webhook or policy rejection."
    elif evidence_of 'x509|TLS|certificate'; then
      likely_cause "TLS or certificate validation failure."
    elif evidence_of 'context deadline exceeded|DeadlineExceeded|timed out'; then
      likely_cause "rollout or readiness timeout."
    elif evidence_of 'Readiness probe failed|Liveness probe failed|Back-off restarting failed container|CrashLoopBackOff|Error'; then
      likely_cause "probe failures, a crashing container, or an application startup error."
    else
      likely_cause "review Helm status, pod events, and pod state above."
    fi
# Deploys to production. Runs only for manual dispatches from main-prod that
# ask for production (deploy_prod or target_env == main-prod).
deploy-prod:
  if: >-
    github.event_name == 'workflow_dispatch'
    && github.ref == 'refs/heads/main-prod'
    && (github.event.inputs.deploy_prod == 'true' || github.event.inputs.target_env == 'main-prod')
  needs: [build, helm4-compat]
  runs-on: ubuntu-latest
  environment: production
  # Serialize prod deploys: the job rewrites the cluster's authorized-network
  # list and the Helm release, so overlapping runs would undo each other.
  concurrency:
    group: deploy-testzeus-prod
    cancel-in-progress: false
  env:
    GKE_CLUSTER: ${{ vars.GKE_PROD_CLUSTER }}
    GKE_LOCATION: ${{ vars.GKE_PROD_LOCATION }}
    GCP_PROJECT_ID: ${{ vars.GCP_PROJECT_ID_PROD }}
    # The release-reset and ExternalSecret steps of this job read
    # ${DEPLOY_NAMESPACE}. Without this entry it expands to an empty string
    # and they act on the wrong namespace.
    DEPLOY_NAMESPACE: testzeus-prod
    # Keep this aligned with the prod NAT IP from Terraform if it changes.
    BASELINE_RANGES: "10.20.0.0/16,10.100.0.0/24,34.121.117.161/32"
  steps:
# Default checkout: the ref the run was dispatched from.
- uses: actions/checkout@v4

- name: Set up Helm
  uses: azure/setup-helm@v4
  with:
    version: v4.0.0

- name: Authenticate to Google Cloud
  uses: google-github-actions/auth@v2
  with:
    service_account: ${{ secrets.GCP_SA_PROD }}
    workload_identity_provider: ${{ secrets.GCP_WIF_PROVIDER_PROD }}

- name: Get GKE credentials
  uses: google-github-actions/get-gke-credentials@v2
  with:
    project_id: ${{ env.GCP_PROJECT_ID }}
    location: ${{ env.GKE_LOCATION }}
    cluster_name: ${{ env.GKE_CLUSTER }}
# Adds this runner's public IP to the cluster's master authorized networks
# and records it in RUNNER_IP for the cleanup step.
- name: Whitelist Runner IP
  run: |
    # Bounded lookup: retry transient failures, never hang the job.
    RUNNER_IP=$(curl -sf --retry 3 --max-time 10 https://api.ipify.org) || { echo "Failed to get runner IP"; exit 1; }
    # The response comes from an external service and is written to GITHUB_ENV
    # and used as a CIDR, so accept nothing but a dotted-quad IPv4 address.
    if [[ ! "${RUNNER_IP}" =~ ^([0-9]{1,3}\.){3}[0-9]{1,3}$ ]]; then
      echo "::error::IP lookup returned an unexpected value; refusing to authorize it."
      exit 1
    fi
    echo "RUNNER_IP=${RUNNER_IP}" >> "${GITHUB_ENV}"
    echo "Whitelisting IP: ${RUNNER_IP}"

    # Current CIDR blocks, normalised to one comma-separated line.
    EXISTING_RANGES=$(gcloud container clusters describe "${GKE_CLUSTER}" \
      --location "${GKE_LOCATION}" --project "${GCP_PROJECT_ID}" \
      --format="value(masterAuthorizedNetworksConfig.cidrBlocks[].cidrBlock)" \
      | tr '\n' ',' | tr ';' ',' | sed 's/,$//; s/,,*/,/g')

    if [[ ",${EXISTING_RANGES}," == *",${RUNNER_IP}/32,"* ]]; then
      echo "IP already whitelisted."
    else
      # The new list is the baseline plus this runner, not the existing list.
      NEW_RANGES="${BASELINE_RANGES},${RUNNER_IP}/32"
      gcloud container clusters update "${GKE_CLUSTER}" \
        --location "${GKE_LOCATION}" --project "${GCP_PROJECT_ID}" \
        --enable-authorized-networks-on-private-endpoint \
        --enable-google-cloud-access \
        --enable-master-authorized-networks \
        --master-authorized-networks "${NEW_RANGES}"
    fi
# Pipe a fresh access token into the Helm registry login.
- name: Login Helm to GAR OCI registry
  run: |
    gcloud auth print-access-token \
      | helm registry login --username oauth2accesstoken --password-stdin "${GAR_LOCATION}-docker.pkg.dev"

# Point the service-template dependency in Chart.yaml at the prod OCI repository.
- name: Set Helm dependency repository (prod)
  run: |
    oci_repo="oci://${GAR_LOCATION}-docker.pkg.dev/${{ vars.GCP_PROJECT_ID_PROD }}/${{ vars.GAR_HELM_REPOSITORY_PROD }}"
    sed -i "/- name: service-template/,/repository:/ s#repository: \".*\"#repository: \"${oci_repo}\"#" "${CHART_DIR}/Chart.yaml"

- name: Helm dependency update
  run: |
    helm dependency update "${CHART_DIR}"
# When the prod release is stuck in a pending-* state, roll it back to the
# most recent revision whose status is "deployed". Fails with manual
# instructions when no such revision exists.
- name: Reset stuck Helm release (if pending)
  run: |
    # Fall back to the namespace the deploy step targets when DEPLOY_NAMESPACE
    # is not set for this job. An empty --namespace would inspect the wrong one.
    NAMESPACE="${DEPLOY_NAMESPACE:-testzeus-prod}"

    # When `helm status` fails (for example, no such release) jq receives no
    # input, prints nothing and still exits 0, so an `|| echo` fallback on
    # the pipeline never fires. Default the empty result explicitly instead.
    STATUS=$(helm status "${SERVICE_NAME}" --namespace "${NAMESPACE}" --output json 2>/dev/null \
      | jq -r '.info.status // empty' 2>/dev/null || true)
    STATUS="${STATUS:-not-found}"
    echo "Current release status: ${STATUS}"

    if [[ "${STATUS}" == "pending-upgrade" || "${STATUS}" == "pending-install" || "${STATUS}" == "pending-rollback" ]]; then
      echo "Release is stuck in '${STATUS}' — attempting rollback to last good revision..."
      LAST_GOOD=$(helm history "${SERVICE_NAME}" --namespace "${NAMESPACE}" --output json \
        | jq -r '[.[] | select(.status == "deployed")] | last | .revision // empty')
      if [[ -n "${LAST_GOOD}" ]]; then
        # No --wait: avoids re-hitting the ScaledObject readiness timeout on the old chart.
        # The upgrade step immediately following will be the source of truth for success.
        helm rollback "${SERVICE_NAME}" "${LAST_GOOD}" --namespace "${NAMESPACE}" --timeout 5m
        echo "Rolled back to revision ${LAST_GOOD}"
      else
        # No deployed revision exists (first-ever install got stuck).
        # Uninstalling automatically would delete all cluster resources and cause downtime.
        # Require manual intervention to avoid accidental teardown.
        echo "::error::Release '${SERVICE_NAME}' is stuck in '${STATUS}' with no clean revision to roll back to."
        # Print a single-line command that selects the stuck release record by
        # label. `kubectl delete secret` cannot be combined with secret/NAME
        # arguments, and a multi-line list would break the annotation.
        echo "::error::Manual fix required: kubectl delete secret -n ${NAMESPACE} -l owner=helm,name=${SERVICE_NAME},status=${STATUS}"
        exit 1
      fi
    fi
# Runs only when refresh_secrets is set: stamps every ExternalSecret in the
# prod namespace with a fresh force-sync annotation.
- name: Force ExternalSecret sync
  if: github.event_name == 'workflow_dispatch' && inputs.refresh_secrets
  run: |
    # Fall back to the namespace the deploy step targets when DEPLOY_NAMESPACE
    # is not set for this job. With an empty -n value kubectl would act on
    # the context's default namespace instead.
    NAMESPACE="${DEPLOY_NAMESPACE:-testzeus-prod}"
    SYNC_TOKEN=$(date +%s)
    echo "Forcing ExternalSecret sync in ${NAMESPACE} (token=${SYNC_TOKEN})"
    # A failed listing yields an empty list, so the loop simply does nothing.
    for EXT_SECRET in $(kubectl get externalsecret -n "${NAMESPACE}" -o name 2>/dev/null || true); do
      kubectl annotate "${EXT_SECRET}" -n "${NAMESPACE}" external-secrets.io/force-sync="${SYNC_TOKEN}" --overwrite
    done
# Installs or upgrades the prod release with the image built by the build
# job, waits for readiness and rolls back automatically on failure.
- name: Helm deploy to prod
  env:
    # Passed through the environment so the values reach bash as data.
    IMAGE_REPO: ${{ needs.build.outputs.image_repo }}
    IMAGE_TAG: ${{ needs.build.outputs.image_tag }}
  run: |
    # The --set-string arguments are quoted: "[0]" is a pathname-expansion
    # pattern to bash, and quoting also keeps each key=value a single word.
    helm upgrade "${SERVICE_NAME}" "${CHART_DIR}" \
      -f "${CHART_DIR}/values-prod.yaml" \
      --set-string "service-template.containers[0].image.repository=${IMAGE_REPO}" \
      --set-string "service-template.containers[0].image.tag=${IMAGE_TAG}" \
      --namespace "testzeus-prod" \
      --create-namespace \
      --install \
      --wait \
      --rollback-on-failure \
      --server-side=false \
      --timeout 5m
# Runs only after a failed step: dumps Helm and namespace state, then prints
# one "Likely cause" notice chosen by the first keyword pattern that matches.
- name: Collect Helm diagnostics on failure
  if: failure()
  run: |
    # log_group <title> <text>: print text inside a collapsible log group.
    log_group() {
      echo "::group::$1"
      echo "$2"
      echo "::endgroup::"
    }
    # evidence_of <ERE>: true when the events or pod listing match, ignoring case.
    evidence_of() {
      printf '%s\n%s\n' "${EVENTS_OUTPUT}" "${PODS_OUTPUT}" | grep -qiE "$1"
    }
    # likely_cause <text>: emit the diagnosis as a workflow notice.
    likely_cause() {
      echo "::notice::Likely cause: $1"
    }

    echo "Collecting Helm diagnostics for ${SERVICE_NAME}"
    log_group "Helm status" "$(helm status "${SERVICE_NAME}" --namespace "testzeus-prod" 2>&1 || true)"
    log_group "Helm history" "$(helm history "${SERVICE_NAME}" --namespace "testzeus-prod" 2>&1 || true)"
    log_group "Helm release resources" "$(helm get all "${SERVICE_NAME}" --namespace "testzeus-prod" 2>&1 || true)"

    EVENTS_OUTPUT=$(kubectl get events -n "testzeus-prod" --sort-by=.lastTimestamp 2>&1 || true)
    log_group "Namespace events" "${EVENTS_OUTPUT}"

    PODS_OUTPUT=$(kubectl get pods -n "testzeus-prod" -o wide 2>&1 || true)
    log_group "Pods" "${PODS_OUTPUT}"

    # First match wins, so the order below is significant.
    if evidence_of 'Evicted|Insufficient memory'; then
      likely_cause "node memory pressure or pod eviction."
    elif evidence_of 'FailedScheduling|Insufficient cpu|Too many pods'; then
      likely_cause "insufficient cluster capacity for the rollout."
    elif evidence_of 'ImagePullBackOff|ErrImagePull'; then
      likely_cause "image pull or registry access failure."
    elif evidence_of 'CreateContainerConfigError|Secret .* not found|configmap .* not found'; then
      likely_cause "missing secret or config map."
    elif evidence_of 'FailedMount|MountVolume|volume attach|AttachVolume'; then
      likely_cause "storage or volume attach failure."
    elif evidence_of 'Forbidden|RBAC|permission denied'; then
      likely_cause "permissions or RBAC failure."
    elif evidence_of 'AdmissionWebhook|denied by webhook|policy violation'; then
      likely_cause "admission webhook or policy rejection."
    elif evidence_of 'x509|TLS|certificate'; then
      likely_cause "TLS or certificate validation failure."
    elif evidence_of 'context deadline exceeded|DeadlineExceeded|timed out'; then
      likely_cause "rollout or readiness timeout."
    elif evidence_of 'Readiness probe failed|Liveness probe failed|Back-off restarting failed container|CrashLoopBackOff|Error'; then
      likely_cause "probe failures, a crashing container, or an application startup error."
    else
      likely_cause "review Helm status, pod events, and pod state above."
    fi
# Always runs: removes this runner's /32 from the cluster's master authorized
# networks. Does nothing when RUNNER_IP was never recorded.
- name: Cleanup Runner IP
  if: always()
  run: |
    if [ -z "${RUNNER_IP:-}" ]; then
      exit 0
    fi
    echo "Removing Runner IP: ${RUNNER_IP}"

    EXISTING_RANGES=$(gcloud container clusters describe "${GKE_CLUSTER}" \
      --location "${GKE_LOCATION}" --project "${GCP_PROJECT_ID}" \
      --format="value(masterAuthorizedNetworksConfig.cidrBlocks[].cidrBlock)")

    # Drop only the entry that is exactly "<runner ip>/32". A regex
    # substitution on the joined list treats the dots as wildcards and is
    # unanchored, so runner IP 4.121.117.161 would also eat into a baseline
    # entry such as 34.121.117.161/32.
    REMAINING_RANGES=$(printf '%s\n' "${EXISTING_RANGES}" \
      | tr ';,' '\n\n' \
      | awk -v drop="${RUNNER_IP}/32" 'NF && $0 != drop' \
      | paste -sd, -)

    if [ -n "${REMAINING_RANGES}" ]; then
      gcloud container clusters update "${GKE_CLUSTER}" \
        --location "${GKE_LOCATION}" --project "${GCP_PROJECT_ID}" \
        --enable-authorized-networks-on-private-endpoint \
        --enable-google-cloud-access \
        --enable-master-authorized-networks \
        --master-authorized-networks "${REMAINING_RANGES}"
    else
      echo "::error::Refusing to disable master authorized networks during prod cleanup because no baseline authorized ranges remain."
      exit 1
    fi