Merge pull request #28 from test-zeus-ai/helm-updates #31

Workflow file for this run

.github/workflows/deploy.yaml at 6e03bc3

	name: Helm - GKE workflow

	# Triggers:
	#   * push to main-prod that touches the workflow, the chart, or any of the
	#     listed package/build paths
	#   * manual dispatch with the inputs below
	on:
	  push:
	    branches:
	      - main-prod
	    paths:
	      - .github/workflows/deploy.yaml
	      - helm/**
	      - package.json
	      - package-lock.json
	      - packages/injected/**
	      - packages/playwright-core/**
	      - packages/protocol/**
	      - packages/trace/**
	      - packages/trace-viewer/**
	      - packages/web/**
	      - utils/build/**
	  workflow_dispatch:
	    inputs:
	      # Either deploy_prod == true or target_env == main-prod selects the
	      # production credentials, project and deploy job.
	      deploy_prod:
	        description: "Deploy to production"
	        type: boolean
	        required: true
	        default: false
	      target_env:
	        description: "Deploy environment (dev/dev4/dev5/main-prod)"
	        type: choice
	        required: false
	        default: "dev"
	        options:
	          - dev
	          - dev4
	          - dev5
	          - main-prod
	      # Gates the "Force ExternalSecret sync" step in the deploy jobs.
	      refresh_secrets:
	        description: "Force refresh ExternalSecrets before deploy, use only after secret changes or rotation"
	        type: boolean
	        required: false
	        default: false
	      # Used as the checkout ref by the build and deploy-dev jobs on dispatch.
	      branch:
	        description: "Branch to deploy (for dev/dev4/dev5)"
	        required: false
	        default: "main-prod"

	# Workflow-wide token permissions; the build job repeats the same pair.
	# NOTE(review): id-token: write is presumably what lets the
	# google-github-actions/auth steps use workload identity federation - confirm.
	permissions:
	  contents: read
	  id-token: write

	# Shared by every job below.
	env:
	  # Helm release name and last path segment of the image repository.
	  SERVICE_NAME: traceviewer
	  # Docker build context.
	  APP_DIR: .
	  DOCKERFILE: packages/trace-viewer/Dockerfile
	  # Chart directory, relative to the repository root.
	  CHART_DIR: helm
	  # Prefix of the "<location>-docker.pkg.dev" registry host.
	  GAR_LOCATION: ${{ vars.GAR_LOCATION }}

	jobs:
	  # ---------------------------------------------------------------------------
	  # build: builds the trace-viewer image and pushes it to Artifact Registry.
	  # Prod credentials/project are used only for a manual dispatch that asks for
	  # production (deploy_prod == true or target_env == main-prod); everything
	  # else, including pushes, uses the dev ones. The image coordinates are
	  # exported for the downstream jobs.
	  # ---------------------------------------------------------------------------
	  build:
	    runs-on: ubuntu-latest
	    permissions:
	      contents: read
	      id-token: write
	    outputs:
	      image_tag: ${{ steps.meta.outputs.image_tag }}
	      image_repo: ${{ steps.meta.outputs.image_repo }}
	    steps:
	      # On dispatch the requested branch is built; on push, the pushed ref.
	      - uses: actions/checkout@v4
	        with:
	          ref: ${{ github.event_name == 'workflow_dispatch' && inputs.branch || github.ref }}

	      - name: Authenticate to Google Cloud
	        id: auth
	        uses: google-github-actions/auth@v2
	        with:
	          workload_identity_provider: ${{ github.event_name == 'workflow_dispatch' && (github.event.inputs.deploy_prod == 'true' || github.event.inputs.target_env == 'main-prod') && secrets.GCP_WIF_PROVIDER_PROD || secrets.GCP_WIF_PROVIDER_DEV }}
	          service_account: ${{ github.event_name == 'workflow_dispatch' && (github.event.inputs.deploy_prod == 'true' || github.event.inputs.target_env == 'main-prod') && secrets.GCP_SA_PROD || secrets.GCP_SA_DEV }}
	          # The access token is consumed by the "Login to GAR" step below.
	          token_format: access_token

	      - name: Setup gcloud
	        uses: google-github-actions/setup-gcloud@v2

	      - name: Set up Docker Buildx
	        uses: docker/setup-buildx-action@v3
	        with:
	          driver: docker-container

	      - name: Login to GAR
	        uses: docker/login-action@v3
	        with:
	          registry: ${{ env.GAR_LOCATION }}-docker.pkg.dev
	          username: oauth2accesstoken
	          password: ${{ steps.auth.outputs.access_token }}

	      # Tag = <sha>-<run attempt>-<UTC timestamp>, so re-runs never collide.
	      - name: Build image metadata
	        id: meta
	        run: |
	          TS="$(date -u +%Y%m%d%H%M%S)"
	          IMAGE_TAG="${GITHUB_SHA}-${GITHUB_RUN_ATTEMPT}-$TS"
	          PROJECT_ID="${{ github.event_name == 'workflow_dispatch' && (github.event.inputs.deploy_prod == 'true' || github.event.inputs.target_env == 'main-prod') && vars.GCP_PROJECT_ID_PROD || vars.GCP_PROJECT_ID_DEV }}"
	          GAR_REPO="${{ github.event_name == 'workflow_dispatch' && (github.event.inputs.deploy_prod == 'true' || github.event.inputs.target_env == 'main-prod') && vars.GAR_DOCKER_REPOSITORY_PROD || vars.GAR_DOCKER_REPOSITORY_DEV }}"
	          IMAGE_REPO="${GAR_LOCATION}-docker.pkg.dev/${PROJECT_ID}/${GAR_REPO}/${SERVICE_NAME}"
	          echo "image_tag=${IMAGE_TAG}" >>"${GITHUB_OUTPUT}"
	          echo "image_repo=${IMAGE_REPO}" >>"${GITHUB_OUTPUT}"

	      - name: Build and push image
	        uses: docker/build-push-action@v6
	        with:
	          context: ${{ env.APP_DIR }}
	          file: ${{ env.DOCKERFILE }}
	          push: true
	          tags: ${{ steps.meta.outputs.image_repo }}:${{ steps.meta.outputs.image_tag }}
	          build-args: |
	            GIT_SHA=${{ github.sha }}
	            GIT_REF=${{ github.ref_name }}
	            BUILD_TIME=${{ steps.meta.outputs.image_tag }}
	          cache-from: type=gha
	          cache-to: type=gha,mode=max

	  # ---------------------------------------------------------------------------
	  # helm4-compat: lints and renders the chart with Helm 4 against the values
	  # file of the target environment, then rejects rendered output that carries
	  # server-populated fields. Both deploy jobs depend on it.
	  # NOTE(review): this job checks out the triggering ref, while build and
	  # deploy-dev check out inputs.branch on dispatch - confirm that validating a
	  # different tree than the one deployed is acceptable.
	  # ---------------------------------------------------------------------------
	  helm4-compat:
	    if: github.ref == 'refs/heads/main-prod' && (github.event_name == 'push' || github.event_name == 'workflow_dispatch')
	    needs: build
	    runs-on: ubuntu-latest
	    env:
	      TARGET_ENV: ${{ github.event_name == 'workflow_dispatch' && (github.event.inputs.deploy_prod == 'true' || github.event.inputs.target_env == 'main-prod') && 'prod' || (github.event_name == 'workflow_dispatch' && github.event.inputs.target_env != '' && github.event.inputs.target_env) || 'dev' }}
	      VALUES_FILE: ${{ github.event_name == 'workflow_dispatch' && (github.event.inputs.deploy_prod == 'true' || github.event.inputs.target_env == 'main-prod') && 'values-prod.yaml' || (github.event_name == 'workflow_dispatch' && github.event.inputs.target_env != '' && format('values-{0}.yaml', github.event.inputs.target_env)) || 'values-dev.yaml' }}
	      DEPLOY_NAMESPACE: ${{ github.event_name == 'workflow_dispatch' && (github.event.inputs.deploy_prod == 'true' || github.event.inputs.target_env == 'main-prod') && 'testzeus-prod' || (github.event_name == 'workflow_dispatch' && github.event.inputs.target_env != '' && format('testzeus-{0}', github.event.inputs.target_env)) || 'testzeus-dev' }}
	      GKE_CLUSTER: ${{ github.event_name == 'workflow_dispatch' && (github.event.inputs.deploy_prod == 'true' || github.event.inputs.target_env == 'main-prod') && vars.GKE_PROD_CLUSTER || vars.GKE_DEV_CLUSTER }}
	      GKE_LOCATION: ${{ github.event_name == 'workflow_dispatch' && (github.event.inputs.deploy_prod == 'true' || github.event.inputs.target_env == 'main-prod') && vars.GKE_PROD_LOCATION || vars.GKE_DEV_LOCATION }}
	      GCP_PROJECT_ID: ${{ github.event_name == 'workflow_dispatch' && (github.event.inputs.deploy_prod == 'true' || github.event.inputs.target_env == 'main-prod') && vars.GCP_PROJECT_ID_PROD || vars.GCP_PROJECT_ID_DEV }}
	      GAR_HELM_REPOSITORY: ${{ github.event_name == 'workflow_dispatch' && (github.event.inputs.deploy_prod == 'true' || github.event.inputs.target_env == 'main-prod') && vars.GAR_HELM_REPOSITORY_PROD || vars.GAR_HELM_REPOSITORY_DEV }}
	      GCP_SA: ${{ github.event_name == 'workflow_dispatch' && (github.event.inputs.deploy_prod == 'true' || github.event.inputs.target_env == 'main-prod') && secrets.GCP_SA_PROD || secrets.GCP_SA_DEV }}
	      GCP_WIF_PROVIDER: ${{ github.event_name == 'workflow_dispatch' && (github.event.inputs.deploy_prod == 'true' || github.event.inputs.target_env == 'main-prod') && secrets.GCP_WIF_PROVIDER_PROD || secrets.GCP_WIF_PROVIDER_DEV }}
	    steps:
	      - uses: actions/checkout@v4

	      - name: Set up Helm 4 (compat check)
	        uses: azure/setup-helm@v4
	        with:
	          version: v4.0.0

	      - name: Authenticate to Google Cloud
	        uses: google-github-actions/auth@v2
	        with:
	          workload_identity_provider: ${{ env.GCP_WIF_PROVIDER }}
	          service_account: ${{ env.GCP_SA }}

	      - name: Login Helm to GAR OCI registry
	        run: |
	          gcloud auth print-access-token | \
	            helm registry login -u oauth2accesstoken --password-stdin \
	            "${GAR_LOCATION}-docker.pkg.dev"

	      # Points the service-template dependency in Chart.yaml at the OCI
	      # repository of the target project before resolving dependencies.
	      - name: Set Helm dependency repository (target env)
	        run: |
	          CHART_REPO="oci://${GAR_LOCATION}-docker.pkg.dev/${GCP_PROJECT_ID}/${GAR_HELM_REPOSITORY}"
	          sed -i "/- name: service-template/,/repository:/ s#repository: \".*\"#repository: \"${CHART_REPO}\"#" "${CHART_DIR}/Chart.yaml"

	      - name: Helm dependency update (Helm 4)
	        run: helm dependency update "${CHART_DIR}"

	      - name: Helm lint (Helm 4)
	        run: helm lint "${CHART_DIR}" -f "${CHART_DIR}/${VALUES_FILE}"

	      - name: Helm template render (Helm 4)
	        run: |
	          # --set-string arguments are quoted so "[0]" is never treated as a glob.
	          helm template "${SERVICE_NAME}-helm4-compat" "${CHART_DIR}" \
	            -f "${CHART_DIR}/${VALUES_FILE}" \
	            --set-string "service-template.containers[0].image.repository=${{ needs.build.outputs.image_repo }}" \
	            --set-string "service-template.containers[0].image.tag=${{ needs.build.outputs.image_tag }}" \
	            > rendered-helm4.yaml

	      - name: Guard rendered manifest fields
	        run: |
	          if grep -nE '^[[:space:]]*(managedFields:|resourceVersion:|uid:|creationTimestamp:)$' rendered-helm4.yaml; then
	            echo "Forbidden metadata fields found in rendered output."
	            exit 1
	          fi
	          # Any indentation and optional trailing whitespace: a bare "status:"
	          # key at any nesting depth must be caught.
	          if grep -nE '^[[:space:]]*status:[[:space:]]*$' rendered-helm4.yaml; then
	            echo "Forbidden status field found in rendered output."
	            exit 1
	          fi

	  # ---------------------------------------------------------------------------
	  # deploy-dev: deploys to a dev namespace (testzeus-dev on push, or
	  # testzeus-<target_env> on dispatch with deploy_prod == false).
	  # ---------------------------------------------------------------------------
	  deploy-dev:
	    if: (github.event_name == 'push' && github.ref == 'refs/heads/main-prod') || (github.event_name == 'workflow_dispatch' && github.event.inputs.deploy_prod == 'false' && (github.event.inputs.target_env == 'dev' || github.event.inputs.target_env == 'dev4' || github.event.inputs.target_env == 'dev5'))
	    needs: [build, helm4-compat]
	    runs-on: ubuntu-latest
	    environment: dev
	    # One deploy per namespace at a time; a running deploy is never cancelled.
	    concurrency:
	      group: deploy-${{ github.event_name == 'workflow_dispatch' && (inputs.target_env == 'dev' && 'testzeus-dev' || format('testzeus-{0}', inputs.target_env)) || 'testzeus-dev' }}
	      cancel-in-progress: false
	    env:
	      GKE_CLUSTER: ${{ vars.GKE_DEV_CLUSTER }}
	      GKE_LOCATION: ${{ vars.GKE_DEV_LOCATION }}
	      GCP_PROJECT_ID: ${{ vars.GCP_PROJECT_ID_DEV }}
	      VALUES_FILE: ${{ github.event_name == 'workflow_dispatch' && format('values-{0}.yaml', github.event.inputs.target_env) || 'values-dev.yaml' }}
	      DEPLOY_NAMESPACE: ${{ github.event_name == 'workflow_dispatch' && format('testzeus-{0}', github.event.inputs.target_env) || 'testzeus-dev' }}
	    steps:
	      - uses: actions/checkout@v4
	        with:
	          ref: ${{ github.event_name == 'workflow_dispatch' && inputs.branch || github.ref }}

	      - name: Set up Helm
	        uses: azure/setup-helm@v4
	        with:
	          version: v4.0.0

	      - name: Authenticate to Google Cloud
	        uses: google-github-actions/auth@v2
	        with:
	          workload_identity_provider: ${{ secrets.GCP_WIF_PROVIDER_DEV }}
	          service_account: ${{ secrets.GCP_SA_DEV }}

	      - name: Get GKE credentials
	        uses: google-github-actions/get-gke-credentials@v2
	        with:
	          cluster_name: ${{ env.GKE_CLUSTER }}
	          location: ${{ env.GKE_LOCATION }}
	          project_id: ${{ env.GCP_PROJECT_ID }}

	      - name: Login Helm to GAR OCI registry
	        run: |
	          gcloud auth print-access-token | \
	            helm registry login -u oauth2accesstoken --password-stdin \
	            "${GAR_LOCATION}-docker.pkg.dev"

	      - name: Set Helm dependency repository (dev)
	        run: |
	          CHART_REPO="oci://${GAR_LOCATION}-docker.pkg.dev/${{ vars.GCP_PROJECT_ID_DEV }}/${{ vars.GAR_HELM_REPOSITORY_DEV }}"
	          sed -i "/- name: service-template/,/repository:/ s#repository: \".*\"#repository: \"${CHART_REPO}\"#" "${CHART_DIR}/Chart.yaml"

	      - name: Helm dependency update
	        run: helm dependency update "${CHART_DIR}"

	      # Purely informational: logs status/history and, when the release is in a
	      # pending-* state, writes a summary. It never fails the job; recovery is
	      # left to the next step.
	      - name: Helm pre-deploy state check
	        run: |
	          echo "Checking existing release state for ${SERVICE_NAME} in ${DEPLOY_NAMESPACE}"
	          echo "::group::Helm status"
	          HELM_STATUS_OUTPUT=$(helm status "${SERVICE_NAME}" --namespace "${DEPLOY_NAMESPACE}" 2>&1 || true)
	          echo "${HELM_STATUS_OUTPUT}"
	          echo "::endgroup::"
	          RELEASE_STATUS=$(printf '%s\n' "${HELM_STATUS_OUTPUT}" | awk '/^STATUS:/ {print $2; exit}')
	          if [ -n "${RELEASE_STATUS}" ]; then
	            echo "::notice::Release status: ${RELEASE_STATUS}"
	            case "${RELEASE_STATUS}" in
	              pending-install|pending-upgrade|pending-rollback)
	                echo "::error::Helm release is stuck in ${RELEASE_STATUS}. This usually means another install, upgrade, or rollback is still in progress."
	                {
	                  echo "### Deployment summary"
	                  echo ""
	                  echo "| Field | Value |"
	                  echo "| --- | --- |"
	                  echo "| Environment | dev |"
	                  echo "| Release | ${SERVICE_NAME} |"
	                  echo "| Namespace | ${DEPLOY_NAMESPACE} |"
	                  echo "| Result | blocked before deploy |"
	                  echo "| Likely cause | Helm release is stuck in ${RELEASE_STATUS}. Another install, upgrade, or rollback is still in progress. |"
	                } >> "${GITHUB_STEP_SUMMARY}"
	                echo "::warning::Continuing so the recovery step can attempt rollback."
	                ;;
	            esac
	          fi
	          echo "::group::Helm history"
	          HELM_HISTORY_OUTPUT=$(helm history "${SERVICE_NAME}" --namespace "${DEPLOY_NAMESPACE}" 2>&1 || true)
	          echo "${HELM_HISTORY_OUTPUT}"
	          echo "::endgroup::"

	      # Rolls a pending-* release back to the newest revision whose status is
	      # "deployed"; fails the job when no such revision exists.
	      - name: Reset stuck Helm release (if pending)
	        run: |
	          STATUS=$(helm status "${SERVICE_NAME}" --namespace "${DEPLOY_NAMESPACE}" --output json 2>/dev/null | jq -r '.info.status' || echo "not-found")
	          echo "Current release status: ${STATUS}"
	          if [[ "${STATUS}" == "pending-upgrade" || "${STATUS}" == "pending-install" || "${STATUS}" == "pending-rollback" ]]; then
	            echo "Release is stuck in '${STATUS}' — attempting rollback to last good revision..."
	            LAST_GOOD=$(helm history "${SERVICE_NAME}" --namespace "${DEPLOY_NAMESPACE}" --output json | jq '[.[] | select(.status == "deployed")] | last | .revision')
	            if [[ -n "${LAST_GOOD}" && "${LAST_GOOD}" != "null" ]]; then
	              # No --wait: avoids re-hitting the ScaledObject readiness timeout on the old chart.
	              # The upgrade step immediately following will be the source of truth for success.
	              helm rollback "${SERVICE_NAME}" "${LAST_GOOD}" --namespace "${DEPLOY_NAMESPACE}" --timeout 5m
	              echo "Rolled back to revision ${LAST_GOOD}"
	            else
	              # No deployed revision exists (first-ever install got stuck).
	              # Uninstalling automatically would delete all cluster resources and cause downtime.
	              # Require manual intervention to avoid accidental teardown.
	              echo "::error::Release '${SERVICE_NAME}' is stuck in '${STATUS}' with no clean revision to roll back to."
	              echo "::error::Manual fix required: kubectl delete secret -n ${DEPLOY_NAMESPACE} $(kubectl get secrets -n ${DEPLOY_NAMESPACE} -l owner=helm,name=${SERVICE_NAME} -o name)"
	              exit 1
	            fi
	          fi

	      # Opt-in: stamps every ExternalSecret in the namespace with a fresh
	      # force-sync annotation value.
	      - name: Force ExternalSecret sync
	        if: github.event_name == 'workflow_dispatch' && inputs.refresh_secrets
	        run: |
	          SYNC_TOKEN=$(date +%s)
	          echo "Forcing ExternalSecret sync in ${DEPLOY_NAMESPACE} (token=${SYNC_TOKEN})"
	          for EXT_SECRET in $(kubectl get externalsecret -n "${DEPLOY_NAMESPACE}" -o name 2>/dev/null || true); do
	            kubectl annotate "${EXT_SECRET}" -n "${DEPLOY_NAMESPACE}" external-secrets.io/force-sync="${SYNC_TOKEN}" --overwrite
	          done

	      - name: Helm deploy to dev
	        run: |
	          helm upgrade "${SERVICE_NAME}" "${CHART_DIR}" \
	            -f "${CHART_DIR}/${VALUES_FILE}" \
	            --set-string "service-template.containers[0].image.repository=${{ needs.build.outputs.image_repo }}" \
	            --set-string "service-template.containers[0].image.tag=${{ needs.build.outputs.image_tag }}" \
	            --namespace "${DEPLOY_NAMESPACE}" \
	            --create-namespace \
	            --install \
	            --wait \
	            --rollback-on-failure \
	            --server-side=false \
	            --timeout 5m

	      # Runs only after a failed step. Every command is best-effort (|| true) so
	      # the diagnostics themselves cannot mask the original failure.
	      - name: Collect Helm diagnostics on failure
	        if: failure()
	        run: |
	          echo "Collecting Helm diagnostics for ${SERVICE_NAME}"
	          echo "::group::Helm status"
	          HELM_STATUS_OUTPUT=$(helm status "${SERVICE_NAME}" --namespace "${DEPLOY_NAMESPACE}" 2>&1 || true)
	          echo "${HELM_STATUS_OUTPUT}"
	          echo "::endgroup::"
	          echo "::group::Helm history"
	          HELM_HISTORY_OUTPUT=$(helm history "${SERVICE_NAME}" --namespace "${DEPLOY_NAMESPACE}" 2>&1 || true)
	          echo "${HELM_HISTORY_OUTPUT}"
	          echo "::endgroup::"
	          echo "::group::Helm release resources"
	          HELM_RESOURCES_OUTPUT=$(helm get all "${SERVICE_NAME}" --namespace "${DEPLOY_NAMESPACE}" 2>&1 || true)
	          echo "${HELM_RESOURCES_OUTPUT}"
	          echo "::endgroup::"
	          echo "::group::Deployment describe"
	          kubectl describe deployment "${SERVICE_NAME}" -n "${DEPLOY_NAMESPACE}" || true
	          echo "::endgroup::"
	          echo "::group::Release events"
	          EVENTS_OUTPUT=$(
	            {
	              kubectl get events -n "${DEPLOY_NAMESPACE}" --sort-by=.lastTimestamp --field-selector involvedObject.kind=Deployment,involvedObject.name="${SERVICE_NAME}" 2>&1 || true
	              kubectl get events -n "${DEPLOY_NAMESPACE}" --sort-by=.lastTimestamp --field-selector involvedObject.kind=ReplicaSet 2>/dev/null | grep -F "${SERVICE_NAME}" || true
	              kubectl get events -n "${DEPLOY_NAMESPACE}" --sort-by=.lastTimestamp --field-selector involvedObject.kind=Pod 2>/dev/null | grep -F "${SERVICE_NAME}" || true
	            } || true
	          )
	          echo "${EVENTS_OUTPUT}"
	          echo "::endgroup::"
	          echo "::group::Release pods"
	          PODS_OUTPUT=$(kubectl get pods -n "${DEPLOY_NAMESPACE}" -l "app.kubernetes.io/instance=${SERVICE_NAME}" -o wide 2>&1 || true)
	          echo "${PODS_OUTPUT}"
	          echo "::endgroup::"
	          # First matching pattern wins; the order goes from specific to generic.
	          if printf '%s\n%s\n' "${EVENTS_OUTPUT}" "${PODS_OUTPUT}" | grep -qiE 'Evicted|Insufficient memory'; then
	            echo "::notice::Likely cause: node memory pressure or pod eviction."
	          elif printf '%s\n%s\n' "${EVENTS_OUTPUT}" "${PODS_OUTPUT}" | grep -qiE 'FailedScheduling|Insufficient cpu|Too many pods'; then
	            echo "::notice::Likely cause: insufficient cluster capacity for the rollout."
	          elif printf '%s\n%s\n' "${EVENTS_OUTPUT}" "${PODS_OUTPUT}" | grep -qiE 'ImagePullBackOff|ErrImagePull'; then
	            echo "::notice::Likely cause: image pull or registry access failure."
	          elif printf '%s\n%s\n' "${EVENTS_OUTPUT}" "${PODS_OUTPUT}" | grep -qiE 'CreateContainerConfigError|Secret .* not found|configmap .* not found'; then
	            echo "::notice::Likely cause: missing secret or config map."
	          elif printf '%s\n%s\n' "${EVENTS_OUTPUT}" "${PODS_OUTPUT}" | grep -qiE 'FailedMount|MountVolume|volume attach|AttachVolume'; then
	            echo "::notice::Likely cause: storage or volume attach failure."
	          elif printf '%s\n%s\n' "${EVENTS_OUTPUT}" "${PODS_OUTPUT}" | grep -qiE 'Forbidden|RBAC|permission denied'; then
	            echo "::notice::Likely cause: permissions or RBAC failure."
	          elif printf '%s\n%s\n' "${EVENTS_OUTPUT}" "${PODS_OUTPUT}" | grep -qiE 'AdmissionWebhook|denied by webhook|policy violation'; then
	            echo "::notice::Likely cause: admission webhook or policy rejection."
	          elif printf '%s\n%s\n' "${EVENTS_OUTPUT}" "${PODS_OUTPUT}" | grep -qiE 'x509|TLS|certificate'; then
	            echo "::notice::Likely cause: TLS or certificate validation failure."
	          elif printf '%s\n%s\n' "${EVENTS_OUTPUT}" "${PODS_OUTPUT}" | grep -qiE 'context deadline exceeded|DeadlineExceeded|timed out'; then
	            echo "::notice::Likely cause: rollout or readiness timeout."
	          elif printf '%s\n%s\n' "${EVENTS_OUTPUT}" "${PODS_OUTPUT}" | grep -qiE 'Readiness probe failed|Liveness probe failed|Back-off restarting failed container|CrashLoopBackOff|Error'; then
	            echo "::notice::Likely cause: probe failures, a crashing container, or an application startup error."
	          else
	            echo "::notice::Likely cause: review Helm status, pod events, and pod state above."
	          fi

	  # ---------------------------------------------------------------------------
	  # deploy-prod: manual-only deploy to testzeus-prod from main-prod. It
	  # temporarily adds the runner's public IP to the cluster's master authorized
	  # networks and removes it again in an always() step.
	  # ---------------------------------------------------------------------------
	  deploy-prod:
	    if: github.event_name == 'workflow_dispatch' && github.ref == 'refs/heads/main-prod' && (github.event.inputs.deploy_prod == 'true' || github.event.inputs.target_env == 'main-prod')
	    needs: [build, helm4-compat]
	    runs-on: ubuntu-latest
	    environment: production
	    # Mirrors deploy-dev. Required here because the whitelist step rewrites the
	    # authorized-network list to BASELINE_RANGES + this runner's IP, so two
	    # overlapping prod runs would remove each other's access.
	    concurrency:
	      group: deploy-testzeus-prod
	      cancel-in-progress: false
	    env:
	      GKE_CLUSTER: ${{ vars.GKE_PROD_CLUSTER }}
	      GKE_LOCATION: ${{ vars.GKE_PROD_LOCATION }}
	      GCP_PROJECT_ID: ${{ vars.GCP_PROJECT_ID_PROD }}
	      # Read by the reset, secret-sync, deploy and diagnostics steps. Without
	      # it those steps would run against an empty namespace.
	      DEPLOY_NAMESPACE: testzeus-prod
	      # Keep this aligned with the prod NAT IP from Terraform if it changes.
	      BASELINE_RANGES: "10.20.0.0/16,10.100.0.0/24,34.121.117.161/32"
	    steps:
	      - uses: actions/checkout@v4

	      - name: Set up Helm
	        uses: azure/setup-helm@v4
	        with:
	          version: v4.0.0

	      - name: Authenticate to Google Cloud
	        uses: google-github-actions/auth@v2
	        with:
	          workload_identity_provider: ${{ secrets.GCP_WIF_PROVIDER_PROD }}
	          service_account: ${{ secrets.GCP_SA_PROD }}

	      - name: Get GKE credentials
	        uses: google-github-actions/get-gke-credentials@v2
	        with:
	          cluster_name: ${{ env.GKE_CLUSTER }}
	          location: ${{ env.GKE_LOCATION }}
	          project_id: ${{ env.GCP_PROJECT_ID }}

	      - name: Whitelist Runner IP
	        run: |
	          RUNNER_IP=$(curl -sf https://api.ipify.org) || { echo "Failed to get runner IP"; exit 1; }
	          # Exported first so the cleanup step runs even if the update below fails.
	          echo "RUNNER_IP=${RUNNER_IP}" >> "${GITHUB_ENV}"
	          echo "Whitelisting IP: ${RUNNER_IP}"

	          EXISTING_RANGES=$(gcloud container clusters describe ${{ env.GKE_CLUSTER }} \
	            --location ${{ env.GKE_LOCATION }} --project ${{ env.GCP_PROJECT_ID }} \
	            --format="value(masterAuthorizedNetworksConfig.cidrBlocks[].cidrBlock)" \
	            | tr '\n' ',' | tr ';' ',' | sed 's/,$//; s/,,*/,/g')

	          # Membership test: the comma padding makes the match entry-exact, and
	          # the globs allow other ranges on either side.
	          if [[ ",${EXISTING_RANGES}," == *",${RUNNER_IP}/32,"* ]]; then
	            echo "IP already whitelisted."
	          else
	            NEW_RANGES="${BASELINE_RANGES},${RUNNER_IP}/32"
	            gcloud container clusters update ${{ env.GKE_CLUSTER }} \
	              --location ${{ env.GKE_LOCATION }} --project ${{ env.GCP_PROJECT_ID }} \
	              --enable-authorized-networks-on-private-endpoint \
	              --enable-google-cloud-access \
	              --enable-master-authorized-networks \
	              --master-authorized-networks "$NEW_RANGES"
	          fi

	      - name: Login Helm to GAR OCI registry
	        run: |
	          gcloud auth print-access-token | \
	            helm registry login -u oauth2accesstoken --password-stdin \
	            "${GAR_LOCATION}-docker.pkg.dev"

	      - name: Set Helm dependency repository (prod)
	        run: |
	          CHART_REPO="oci://${GAR_LOCATION}-docker.pkg.dev/${{ vars.GCP_PROJECT_ID_PROD }}/${{ vars.GAR_HELM_REPOSITORY_PROD }}"
	          sed -i "/- name: service-template/,/repository:/ s#repository: \".*\"#repository: \"${CHART_REPO}\"#" "${CHART_DIR}/Chart.yaml"

	      - name: Helm dependency update
	        run: helm dependency update "${CHART_DIR}"

	      # Same recovery logic as deploy-dev, against DEPLOY_NAMESPACE (testzeus-prod).
	      - name: Reset stuck Helm release (if pending)
	        run: |
	          STATUS=$(helm status "${SERVICE_NAME}" --namespace "${DEPLOY_NAMESPACE}" --output json 2>/dev/null \
	            | jq -r '.info.status' || echo "not-found")
	          echo "Current release status: ${STATUS}"
	          if [[ "${STATUS}" == "pending-upgrade" || "${STATUS}" == "pending-install" || "${STATUS}" == "pending-rollback" ]]; then
	            echo "Release is stuck in '${STATUS}' — attempting rollback to last good revision..."
	            LAST_GOOD=$(helm history "${SERVICE_NAME}" --namespace "${DEPLOY_NAMESPACE}" --output json \
	              | jq '[.[] | select(.status == "deployed")] | last | .revision')
	            if [[ -n "${LAST_GOOD}" && "${LAST_GOOD}" != "null" ]]; then
	              # No --wait: avoids re-hitting the ScaledObject readiness timeout on the old chart.
	              # The upgrade step immediately following will be the source of truth for success.
	              helm rollback "${SERVICE_NAME}" "${LAST_GOOD}" --namespace "${DEPLOY_NAMESPACE}" --timeout 5m
	              echo "Rolled back to revision ${LAST_GOOD}"
	            else
	              # No deployed revision exists (first-ever install got stuck).
	              # Uninstalling automatically would delete all cluster resources and cause downtime.
	              # Require manual intervention to avoid accidental teardown.
	              echo "::error::Release '${SERVICE_NAME}' is stuck in '${STATUS}' with no clean revision to roll back to."
	              echo "::error::Manual fix required: kubectl delete secret -n ${DEPLOY_NAMESPACE} $(kubectl get secrets -n ${DEPLOY_NAMESPACE} -l owner=helm,name=${SERVICE_NAME} -o name)"
	              exit 1
	            fi
	          fi

	      - name: Force ExternalSecret sync
	        if: github.event_name == 'workflow_dispatch' && inputs.refresh_secrets
	        run: |
	          SYNC_TOKEN=$(date +%s)
	          echo "Forcing ExternalSecret sync in ${DEPLOY_NAMESPACE} (token=${SYNC_TOKEN})"
	          for EXT_SECRET in $(kubectl get externalsecret -n "${DEPLOY_NAMESPACE}" -o name 2>/dev/null || true); do
	            kubectl annotate "${EXT_SECRET}" -n "${DEPLOY_NAMESPACE}" external-secrets.io/force-sync="${SYNC_TOKEN}" --overwrite
	          done

	      - name: Helm deploy to prod
	        run: |
	          helm upgrade "${SERVICE_NAME}" "${CHART_DIR}" \
	            -f "${CHART_DIR}/values-prod.yaml" \
	            --set-string "service-template.containers[0].image.repository=${{ needs.build.outputs.image_repo }}" \
	            --set-string "service-template.containers[0].image.tag=${{ needs.build.outputs.image_tag }}" \
	            --namespace "${DEPLOY_NAMESPACE}" \
	            --create-namespace \
	            --install \
	            --wait \
	            --rollback-on-failure \
	            --server-side=false \
	            --timeout 5m

	      # Runs only after a failed step; all commands are best-effort. Unlike the
	      # dev job, events and pods are listed for the whole namespace.
	      - name: Collect Helm diagnostics on failure
	        if: failure()
	        run: |
	          echo "Collecting Helm diagnostics for ${SERVICE_NAME}"
	          echo "::group::Helm status"
	          HELM_STATUS_OUTPUT=$(helm status "${SERVICE_NAME}" --namespace "${DEPLOY_NAMESPACE}" 2>&1 || true)
	          echo "${HELM_STATUS_OUTPUT}"
	          echo "::endgroup::"
	          echo "::group::Helm history"
	          HELM_HISTORY_OUTPUT=$(helm history "${SERVICE_NAME}" --namespace "${DEPLOY_NAMESPACE}" 2>&1 || true)
	          echo "${HELM_HISTORY_OUTPUT}"
	          echo "::endgroup::"
	          echo "::group::Helm release resources"
	          HELM_RESOURCES_OUTPUT=$(helm get all "${SERVICE_NAME}" --namespace "${DEPLOY_NAMESPACE}" 2>&1 || true)
	          echo "${HELM_RESOURCES_OUTPUT}"
	          echo "::endgroup::"
	          echo "::group::Namespace events"
	          EVENTS_OUTPUT=$(kubectl get events -n "${DEPLOY_NAMESPACE}" --sort-by=.lastTimestamp 2>&1 || true)
	          echo "${EVENTS_OUTPUT}"
	          echo "::endgroup::"
	          echo "::group::Pods"
	          PODS_OUTPUT=$(kubectl get pods -n "${DEPLOY_NAMESPACE}" -o wide 2>&1 || true)
	          echo "${PODS_OUTPUT}"
	          echo "::endgroup::"
	          # First matching pattern wins; the order goes from specific to generic.
	          if printf '%s\n%s\n' "${EVENTS_OUTPUT}" "${PODS_OUTPUT}" | grep -qiE 'Evicted|Insufficient memory'; then
	            echo "::notice::Likely cause: node memory pressure or pod eviction."
	          elif printf '%s\n%s\n' "${EVENTS_OUTPUT}" "${PODS_OUTPUT}" | grep -qiE 'FailedScheduling|Insufficient cpu|Too many pods'; then
	            echo "::notice::Likely cause: insufficient cluster capacity for the rollout."
	          elif printf '%s\n%s\n' "${EVENTS_OUTPUT}" "${PODS_OUTPUT}" | grep -qiE 'ImagePullBackOff|ErrImagePull'; then
	            echo "::notice::Likely cause: image pull or registry access failure."
	          elif printf '%s\n%s\n' "${EVENTS_OUTPUT}" "${PODS_OUTPUT}" | grep -qiE 'CreateContainerConfigError|Secret .* not found|configmap .* not found'; then
	            echo "::notice::Likely cause: missing secret or config map."
	          elif printf '%s\n%s\n' "${EVENTS_OUTPUT}" "${PODS_OUTPUT}" | grep -qiE 'FailedMount|MountVolume|volume attach|AttachVolume'; then
	            echo "::notice::Likely cause: storage or volume attach failure."
	          elif printf '%s\n%s\n' "${EVENTS_OUTPUT}" "${PODS_OUTPUT}" | grep -qiE 'Forbidden|RBAC|permission denied'; then
	            echo "::notice::Likely cause: permissions or RBAC failure."
	          elif printf '%s\n%s\n' "${EVENTS_OUTPUT}" "${PODS_OUTPUT}" | grep -qiE 'AdmissionWebhook|denied by webhook|policy violation'; then
	            echo "::notice::Likely cause: admission webhook or policy rejection."
	          elif printf '%s\n%s\n' "${EVENTS_OUTPUT}" "${PODS_OUTPUT}" | grep -qiE 'x509|TLS|certificate'; then
	            echo "::notice::Likely cause: TLS or certificate validation failure."
	          elif printf '%s\n%s\n' "${EVENTS_OUTPUT}" "${PODS_OUTPUT}" | grep -qiE 'context deadline exceeded|DeadlineExceeded|timed out'; then
	            echo "::notice::Likely cause: rollout or readiness timeout."
	          elif printf '%s\n%s\n' "${EVENTS_OUTPUT}" "${PODS_OUTPUT}" | grep -qiE 'Readiness probe failed|Liveness probe failed|Back-off restarting failed container|CrashLoopBackOff|Error'; then
	            echo "::notice::Likely cause: probe failures, a crashing container, or an application startup error."
	          else
	            echo "::notice::Likely cause: review Helm status, pod events, and pod state above."
	          fi

	      # Always runs, so the temporary /32 is removed even after a failed deploy.
	      - name: Cleanup Runner IP
	        if: always()
	        run: |
	          if [ -n "${RUNNER_IP}" ]; then
	            echo "Removing Runner IP: ${RUNNER_IP}"
	            EXISTING_RANGES=$(gcloud container clusters describe ${{ env.GKE_CLUSTER }} \
	              --location ${{ env.GKE_LOCATION }} --project ${{ env.GCP_PROJECT_ID }} \
	              --format="value(masterAuthorizedNetworksConfig.cidrBlocks[].cidrBlock)" \
	              | tr '\n' ',' | tr ';' ',' | sed 's/,$//; s/,,*/,/g')

	            # Drop only the entry that is exactly "<runner ip>/32". A literal,
	            # whole-entry comparison avoids treating the dots as regex wildcards
	            # and avoids clipping another CIDR that merely contains the IP.
	            REMAINING_RANGES=$(printf '%s\n' "${EXISTING_RANGES}" \
	              | tr ',' '\n' \
	              | awk -v drop="${RUNNER_IP}/32" '$0 != "" && $0 != drop' \
	              | paste -sd, -)

	            if [ -n "$REMAINING_RANGES" ]; then
	              gcloud container clusters update ${{ env.GKE_CLUSTER }} \
	                --location ${{ env.GKE_LOCATION }} --project ${{ env.GCP_PROJECT_ID }} \
	                --enable-authorized-networks-on-private-endpoint \
	                --enable-google-cloud-access \
	                --enable-master-authorized-networks \
	                --master-authorized-networks "$REMAINING_RANGES"
	            else
	              echo "::error::Refusing to disable master authorized networks during prod cleanup because no baseline authorized ranges remain."
	              exit 1
	            fi
	          fi

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Merge pull request #28 from test-zeus-ai/helm-updates #31

Workflow file

Merge pull request #28 from test-zeus-ai/helm-updates #31

Uh oh!

Workflow file for this run