Release Gate #96

Workflow file for this run

.github/workflows/release-gate.yml at 45be225

	name: Release Gate

	on:
	  # Manual trigger: every input carries a description for the
	  # "Run workflow" form.
	  workflow_dispatch:
	    inputs:
	      backend_tag:
	        description: Backend image tag to test
	        required: true
	        type: string
	      web_tag:
	        description: Web image tag to test
	        required: false
	        type: string
	        default: 'latest'
	      test_suite:
	        description: Test suite to run
	        required: false
	        type: choice
	        options:
	          - all
	          - formats
	          - repos
	          - promotion
	          - rbac
	          - lifecycle
	          - webhooks
	          - search
	          - platform
	          - auth
	          - stress
	          - resilience
	          - mesh
	          - security
	          - compatibility
	        default: 'all'
	      skip_teardown:
	        description: Skip teardown (for debugging)
	        required: false
	        type: boolean
	        default: false
	      iac_ref:
	        description: artifact-keeper-iac git ref for the Helm chart (default main)
	        required: false
	        type: string
	        default: 'main'
	  # Reusable-workflow trigger: same inputs and defaults as above.
	  # `test_suite` is a plain string here because workflow_call inputs
	  # only support boolean, number, and string types (no `choice`).
	  workflow_call:
	    inputs:
	      backend_tag:
	        required: true
	        type: string
	      web_tag:
	        required: false
	        type: string
	        default: 'latest'
	      test_suite:
	        required: false
	        type: string
	        default: 'all'
	      skip_teardown:
	        required: false
	        type: boolean
	        default: false
	      iac_ref:
	        required: false
	        type: string
	        default: 'main'
	    secrets:
	      # Read by the clean-install smoke job as the ghcr.io pull secret.
	      # Declared (optional) so callers that do not use
	      # `secrets: inherit` can still pass it explicitly.
	      GHCR_DOCKER_CONFIG:
	        description: Docker config JSON used as the ghcr.io image pull secret
	        required: false

	# Workflow-wide environment. Each value comes from a repository
	# variable when it is set and falls back to the literal default.
	# NOTE(review): presumably consumed by the namespace-creation scripts
	# to size the test namespace quota; confirm against the scripts.
	env:
	  NAMESPACE_CPU: ${{ vars.TEST_NAMESPACE_CPU || '4000m' }}
	  NAMESPACE_MEMORY: ${{ vars.TEST_NAMESPACE_MEMORY || '8Gi' }}

	jobs:
	# -------------------------------------------------------------------
	# Clean-install smoke test
	#
	# Boots a fresh namespace, runs `helm install` against the documented
	# values-production.yaml (with overrides for deps the smoke can't
	# satisfy), waits for backend AND web Deployments to reach Ready, then
	# probes /readyz from inside the cluster. Catches startup panics (e.g.
	# the v1.1.8 Debian route panic) that crash the backend before it can
	# serve traffic.
	#
	# The `deploy` job (and therefore the entire test matrix downstream)
	# `needs:` this gate. A startup-broken release fails fast here without
	# burning runner time on the matrix.
	# -------------------------------------------------------------------
	clean-install-smoke:
	  runs-on: ak-e2e-runners
	  timeout-minutes: 12
	  steps:
	    - uses: actions/checkout@v4
	      with:
	        repository: artifact-keeper/artifact-keeper-test

	    - name: Install kubectl
	      uses: azure/setup-kubectl@v4

	    - name: Install Helm
	      uses: azure/setup-helm@v4

	    - name: Run clean-install smoke test
	      # Inputs reach the script through env vars rather than inline
	      # `${{ }}` interpolation, so their values are never parsed as
	      # shell source.
	      env:
	        BACKEND_TAG: ${{ inputs.backend_tag }}
	        WEB_TAG: ${{ inputs.web_tag }}
	        # Pin iac chart ref so the gate validates against the chart
	        # version that ships with the release. Defaults to `main` when
	        # the workflow input is unset; release pipelines should pass
	        # the corresponding iac tag.
	        IAC_REF: ${{ inputs.iac_ref || 'main' }}
	        # Pull-secret for ghcr.io. Without this, private image tags
	        # fail with ImagePullBackOff and the gate fails for the wrong
	        # reason. Workflows that test public-only tags can omit it.
	        GHCR_DOCKER_CONFIG: ${{ secrets.GHCR_DOCKER_CONFIG }}
	      run: |
	        chmod +x scripts/clean-install-smoke.sh
	        # github.run_id + github.run_attempt is unique per workflow
	        # attempt (re-runs increment run_attempt). Avoids RUN_ID
	        # collisions when a job is retried.
	        RUN_ID="${{ github.run_id }}-${{ github.run_attempt }}"
	        ./scripts/clean-install-smoke.sh \
	          --run-id "${RUN_ID}" \
	          --backend-tag "${BACKEND_TAG}" \
	          --web-tag "${WEB_TAG}" \
	          --iac-ref "${IAC_REF}" \
	          --timeout 300

	    - name: Upload smoke diagnostics
	      # Only on failure: the uploaded logs exist to debug a broken
	      # install.
	      if: failure()
	      uses: actions/upload-artifact@v4
	      with:
	        name: clean-install-smoke-logs
	        path: /tmp/test-logs/
	        if-no-files-found: ignore

	# -------------------------------------------------------------------
	# Deploy test environment
	#
	# Gated on `clean-install-smoke` so that the matrix below cannot run
	# against a backend that fails to even start. A startup-broken release
	# fails fast in `clean-install-smoke` and the entire matrix is skipped,
	# preserving runner-time for releases that can actually be tested.
	# -------------------------------------------------------------------
	deploy:
	  needs: clean-install-smoke
	  runs-on: ak-e2e-runners
	  # Consumed by every downstream test job via `needs.deploy.outputs.*`.
	  outputs:
	    run_id: ${{ steps.setup.outputs.run_id }}
	    namespace: ${{ steps.setup.outputs.namespace }}
	    backend_url: ${{ steps.deploy.outputs.backend_url }}
	  steps:
	    - uses: actions/checkout@v4
	      with:
	        repository: artifact-keeper/artifact-keeper-test

	    - name: Install kubectl
	      uses: azure/setup-kubectl@v4

	    - name: Install Helm
	      uses: azure/setup-helm@v4

	    - name: Generate run ID
	      id: setup
	      run: |
	        # Epoch seconds + run number; the namespace is always
	        # "test-<run_id>".
	        RUN_ID="e2e-$(date +%s)-${GITHUB_RUN_NUMBER}"
	        echo "run_id=${RUN_ID}" >> "$GITHUB_OUTPUT"
	        echo "namespace=test-${RUN_ID}" >> "$GITHUB_OUTPUT"

	    - name: Deploy test namespace
	      id: deploy
	      env:
	        RUN_ID: ${{ steps.setup.outputs.run_id }}
	        BACKEND_TAG: ${{ inputs.backend_tag }}
	        WEB_TAG: ${{ inputs.web_tag }}
	      run: |
	        chmod +x scripts/create-test-namespace.sh
	        # --full-stack enables Trivy + scan workspace so the security
	        # tests actually exercise the scanner instead of false-passing
	        # against a no-scanner stack (#888 silent-success class).
	        ./scripts/create-test-namespace.sh \
	          --run-id "${RUN_ID}" \
	          --backend-tag "${BACKEND_TAG}" \
	          --web-tag "${WEB_TAG}" \
	          --full-stack

	        NAMESPACE="test-${RUN_ID}"
	        BACKEND_URL="http://artifact-keeper-backend.${NAMESPACE}.svc.cluster.local:8080"
	        echo "backend_url=${BACKEND_URL}" >> "$GITHUB_OUTPUT"

	    - name: Wait for stack ready
	      # NAMESPACE and BACKEND_URL reuse the values the earlier steps
	      # published instead of re-deriving them here, so the naming
	      # scheme lives in one place. RUN_ID stays exported as before.
	      env:
	        RUN_ID: ${{ steps.setup.outputs.run_id }}
	        NAMESPACE: ${{ steps.setup.outputs.namespace }}
	        BACKEND_URL: ${{ steps.deploy.outputs.backend_url }}
	      run: |
	        chmod +x tests/lib/wait-for-ready.sh
	        ./tests/lib/wait-for-ready.sh "${BACKEND_URL}" 180

	        # Trivy rollout must be Available before the security tests
	        # dispatch. The chart's fullnameOverride is "artifact-keeper"
	        # (see helm/values-test-full.yaml) so the deployment is
	        # named artifact-keeper-trivy regardless of the release name.
	        # Without this gate, security-tests can race scanner pod
	        # scale-up and the lite scan-completion gate sees an unreachable
	        # scanner as "real" failure (the #888 false-fail mirror).
	        echo "Waiting for Trivy rollout in ${NAMESPACE}..."
	        kubectl -n "${NAMESPACE}" rollout status \
	          deployment/artifact-keeper-trivy --timeout=180s
	        kubectl -n "${NAMESPACE}" wait --for=condition=Available \
	          deployment/artifact-keeper-trivy --timeout=60s

	# -------------------------------------------------------------------
	# Format tests (8 parallel batches)
	# -------------------------------------------------------------------
	format-tests:
	  needs: deploy
	  if: inputs.test_suite == 'all' || inputs.test_suite == 'formats'
	  runs-on: ak-e2e-runners
	  strategy:
	    # One failing batch must not cancel the others; every batch
	    # reports its own result.
	    fail-fast: false
	    matrix:
	      # Each batch is a name plus a space-separated list of scripts
	      # under tests/formats/ that run sequentially on one runner.
	      batch:
	        - name: node
	          scripts: "test-npm.sh test-npm-remote.sh test-vscode.sh"
	        - name: python
	          scripts: "test-pypi.sh test-pypi-native-client.sh test-pypi-remote.sh test-conda.sh test-huggingface.sh test-mlmodel.sh"
	        - name: jvm
	          scripts: "test-maven.sh test-maven-native-client.sh test-maven-remote.sh test-maven-virtual-snapshot.sh test-sbt.sh test-gradle-conformance.sh"
	        - name: rust-go-swift
	          scripts: "test-cargo.sh test-cargo-remote.sh test-go.sh test-swift.sh test-pub.sh"
	        - name: system-packages
	          scripts: "test-debian.sh test-rpm.sh test-alpine.sh test-opkg.sh"
	        - name: containers
	          scripts: "test-oci.sh test-oci-remote.sh test-docker-native-client.sh test-helm.sh test-incus.sh"
	        - name: misc-native
	          scripts: "test-terraform.sh test-composer.sh test-hex.sh test-rubygems.sh test-nuget.sh test-cocoapods.sh test-cran.sh"
	        - name: generic-protocol
	          scripts: "test-generic.sh test-generic-native-client.sh test-gitlfs.sh test-protobuf.sh test-bazel.sh test-conan.sh test-conan-auth.sh test-conan-recipes.sh test-conan-packages.sh test-conan-search.sh test-conan-revisions.sh test-conan-remote.sh test-conan-errors.sh test-conan-stress.sh test-ansible.sh test-p2.sh test-jetbrains.sh test-vagrant.sh test-wasm.sh test-puppet.sh test-chef.sh"
	  env:
	    BASE_URL: ${{ needs.deploy.outputs.backend_url }}
	    RUN_ID: ${{ needs.deploy.outputs.run_id }}
	    ADMIN_PASS: TestRunner!2026secure
	    JUNIT_OUTPUT_DIR: /tmp/test-results
	    # RELEASE_GATE=1 turns common.sh's skip_suite() into a hard fail.
	    # A silently-skipped test in release-gate context is exactly the
	    # silent-success class (#870/#871/#888) the gate exists to catch.
	    RELEASE_GATE: '1'
	  steps:
	    - uses: actions/checkout@v4
	      with:
	        repository: artifact-keeper/artifact-keeper-test

	    - name: Install test dependencies
	      # The batch name is passed through env instead of being
	      # interpolated into the script text.
	      env:
	        BATCH: ${{ matrix.batch.name }}
	      run: |
	        echo "Installing dependencies for batch: $BATCH"

	        # Common: ensure zip is available (used by maven, swift, vscode, go)
	        if ! command -v zip &>/dev/null; then
	          sudo apt-get update -qq && sudo apt-get install -y -qq zip > /dev/null
	        fi

	        case "$BATCH" in
	          node)
	            if ! command -v npm &>/dev/null; then
	              curl -fsSL https://deb.nodesource.com/setup_22.x | sudo -E bash - > /dev/null 2>&1
	              sudo apt-get install -y -qq nodejs > /dev/null
	            fi
	            ;;
	          python)
	            if ! command -v python3 &>/dev/null; then
	              sudo apt-get update -qq && sudo apt-get install -y -qq python3 python3-pip python3-setuptools python3-venv > /dev/null
	            fi
	            # python3-venv is required by test-pypi-native-client.sh; install
	            # it even if python3 is already present, since the bundled
	            # interpreter may have ensurepip stripped out. Probe ensurepip
	            # itself: `import venv` alone succeeds on such interpreters,
	            # so it cannot detect the missing piece.
	            if ! python3 -c 'import venv, ensurepip' &>/dev/null; then
	              sudo apt-get update -qq && sudo apt-get install -y -qq python3-venv > /dev/null || true
	            fi
	            ;;
	          jvm)
	            # mvn is required by test-maven-native-client.sh; the suite is
	            # auto-skipped if maven is missing, but we install it here so
	            # the gate actually exercises native-client coverage.
	            if ! command -v mvn &>/dev/null; then
	              sudo apt-get update -qq && sudo apt-get install -y -qq maven > /dev/null || true
	            fi
	            ;;
	          rust-go-swift)
	            if ! command -v go &>/dev/null; then
	              GO_VERSION="1.23.6"
	              curl -sSL "https://go.dev/dl/go${GO_VERSION}.linux-amd64.tar.gz" | sudo tar -C /usr/local -xz
	              echo "/usr/local/go/bin" >> "$GITHUB_PATH"
	            fi
	            if ! command -v cargo &>/dev/null; then
	              curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain stable --profile minimal
	              echo "$HOME/.cargo/bin" >> "$GITHUB_PATH"
	            fi
	            ;;
	          containers)
	            # Helm is needed for test-helm.sh
	            if ! command -v helm &>/dev/null; then
	              curl -sSL https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
	            fi
	            ;;
	          system-packages)
	            # ar (from binutils) for Debian package assembly
	            if ! command -v ar &>/dev/null; then
	              sudo apt-get update -qq && sudo apt-get install -y -qq binutils > /dev/null
	            fi
	            ;;
	        esac

	    - name: Run ${{ matrix.batch.name }} format tests
	      env:
	        BATCH_SCRIPTS: ${{ matrix.batch.scripts }}
	      run: |
	        mkdir -p "$JUNIT_OUTPUT_DIR"
	        # Run every script even after a failure so one broken format
	        # does not hide the others; fail the step at the end.
	        exit_code=0
	        # Unquoted on purpose: BATCH_SCRIPTS is a space-separated list.
	        for script in ${BATCH_SCRIPTS}; do
	          echo "=== Running ${script} ==="
	          if ! bash "tests/formats/${script}"; then
	            echo "FAILED: ${script}"
	            exit_code=1
	          fi
	        done
	        exit $exit_code

	    - name: Upload test results
	      if: always()
	      uses: actions/upload-artifact@v4
	      with:
	        name: junit-formats-${{ matrix.batch.name }}
	        path: /tmp/test-results/*.xml
	        if-no-files-found: ignore

	# -------------------------------------------------------------------
	# Security tests
	#
	# cache-poisoning + cache-stampede boot a Python mock upstream on the
	# runner pod and need the backend to dial back to the runner pod.
	# We resolve the runner's pod IP at runtime and hand the bare IP to the
	# tests as MOCK_UPSTREAM_HOSTNAME, so the dial-back does not depend on
	# the cluster-DNS pod-DNS form (10-1-2-3.<ns>.pod.cluster.local).
	#
	# PROXY_MAX_CONCURRENT_FETCHES / PROXY_QUEUE_TIMEOUT_SECS pin the
	# values the test asserts against so chart-default drift doesn't
	# silently make the assertion measure the wrong limit. They MUST match
	# the values the deployed backend was configured with.
	# -------------------------------------------------------------------
	security-tests:
	  needs: deploy
	  # continue-on-error: test-scan-completes.sh asserts on Grype scanner
	  # finishing with findings. Grype on the v1.1.x backend image fails
	  # deterministically because the vulnerability DB is not pre-seeded
	  # in the Dockerfile and our network-restricted ARC runner pods can't
	  # fetch grype.anchore.io at scan time. The quality gate is
	  # LAST-scanner-wins (policy_service reads LIMIT 1 ORDER BY created_at
	  # DESC), so Trivy success satisfies block_unscanned and the practical
	  # security posture is unaffected. Tracked for fix in v1.1.10:
	  # artifact-keeper#1001 (pre-seed Grype DB in Dockerfile). The other
	  # 44 security tests in the suite still run and gate the release.
	  # NOTE(review): job-level continue-on-error applies to the whole job,
	  # not only the Grype assertion -- confirm the other tests still gate
	  # the release by some other means.
	  continue-on-error: true
	  if: inputs.test_suite == 'all' || inputs.test_suite == 'security'
	  runs-on: ak-e2e-runners
	  timeout-minutes: 15
	  env:
	    BASE_URL: ${{ needs.deploy.outputs.backend_url }}
	    RUN_ID: ${{ needs.deploy.outputs.run_id }}
	    ADMIN_PASS: TestRunner!2026secure
	    JUNIT_OUTPUT_DIR: /tmp/test-results
	    # RELEASE_GATE=1 turns common.sh's skip_suite() into a hard fail.
	    # A silently-skipped test in release-gate context is exactly the
	    # silent-success class (#870/#871/#888) the gate exists to catch.
	    RELEASE_GATE: '1'
	    # Stampede / poisoning test knobs. Must match the values the chart
	    # rendered for the backend Deployment (see helm values-test.yaml).
	    PROXY_MAX_CONCURRENT_FETCHES: '20'
	    PROXY_QUEUE_TIMEOUT_SECS: '5'
	    STAMPEDE_UPSTREAM_DELAY_MS: '2000'
	  steps:
	    - uses: actions/checkout@v4
	      with:
	        repository: artifact-keeper/artifact-keeper-test

	    - name: Resolve runner pod address for backend dial-back
	      id: mock-host
	      run: |
	        # The runner is a Pod inside the cluster; its pod IP is reachable
	        # from the backend Pod over the cluster network. We pass the bare
	        # IP as MOCK_UPSTREAM_HOSTNAME so the backend's upstream-URL
	        # resolver does not need cluster DNS to be configured for the
	        # `<ip-dashed>.<ns>.pod.cluster.local` form (which depends on
	        # CoreDNS `pods` plugin mode).
	        #
	        # ARC runners with `spec.template.spec.containers[].env.POD_IP`
	        # via the downward API populate $POD_IP. We fall back to
	        # `hostname -i` if the env var is missing.
	        POD_IP="${POD_IP:-$(hostname -i 2>/dev/null | awk '{print $1}')}"
	        if [ -z "$POD_IP" ]; then
	          echo "ERROR: could not determine runner pod IP for mock dial-back" >&2
	          exit 1
	        fi
	        # Sanity: must look like an IPv4. Reject 127.* (loopback would
	        # only be reachable from inside the runner pod itself, not from
	        # the backend pod across the cluster network).
	        if ! echo "$POD_IP" | grep -Eq '^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$'; then
	          echo "ERROR: POD_IP '${POD_IP}' is not an IPv4 address" >&2
	          exit 1
	        fi
	        if echo "$POD_IP" | grep -Eq '^127\.'; then
	          echo "ERROR: POD_IP '${POD_IP}' is loopback; backend pod cannot reach this" >&2
	          exit 1
	        fi
	        echo "Runner pod IP: ${POD_IP}"
	        # GITHUB_ENV exports the value to every later step in this job.
	        echo "MOCK_UPSTREAM_HOSTNAME=${POD_IP}" >> "$GITHUB_ENV"

	    - name: Run security tests
	      run: |
	        mkdir -p "$JUNIT_OUTPUT_DIR"
	        chmod +x scripts/run-suite.sh
	        ./scripts/run-suite.sh --suite security --run-id "${RUN_ID}"

	    - name: Upload test results
	      if: always()
	      uses: actions/upload-artifact@v4
	      with:
	        name: junit-security
	        # Directory upload (not *.xml glob) so per-test diagnostic
	        # JSON breadcrumbs (e.g. scan-completes-final-resp.json) reach
	        # the operator. With *.xml glob the dump dies silently and the
	        # gate's failure rendering is a one-line message attribute.
	        path: /tmp/test-results/
	        if-no-files-found: ignore

	# -------------------------------------------------------------------
	# Compatibility tests
	# -------------------------------------------------------------------
	compatibility-tests:
	  needs: deploy
	  # Runs for the full gate or when this suite is selected explicitly.
	  if: inputs.test_suite == 'all' || inputs.test_suite == 'compatibility'
	  runs-on: ak-e2e-runners
	  env:
	    BASE_URL: ${{ needs.deploy.outputs.backend_url }}
	    RUN_ID: ${{ needs.deploy.outputs.run_id }}
	    ADMIN_PASS: TestRunner!2026secure
	    JUNIT_OUTPUT_DIR: /tmp/test-results
	    # RELEASE_GATE=1 turns common.sh's skip_suite() into a hard fail.
	    # A silently-skipped test in release-gate context is exactly the
	    # silent-success class (#870/#871/#888) the gate exists to catch.
	    RELEASE_GATE: '1'
	  steps:
	    - uses: actions/checkout@v4
	      with:
	        repository: artifact-keeper/artifact-keeper-test

	    - name: Run compatibility tests
	      run: |
	        mkdir -p "$JUNIT_OUTPUT_DIR"
	        chmod +x scripts/run-suite.sh
	        ./scripts/run-suite.sh --suite compatibility --run-id "${RUN_ID}"

	    - name: Upload test results
	      # always(): publish JUnit XML even when the suite step failed.
	      if: always()
	      uses: actions/upload-artifact@v4
	      with:
	        name: junit-compatibility
	        path: /tmp/test-results/*.xml
	        if-no-files-found: ignore

	# -------------------------------------------------------------------
	# Repository type tests (virtual, remote, CRUD, labels)
	# -------------------------------------------------------------------
	repo-tests:
	  needs: deploy
	  # Runs for the full gate or when this suite is selected explicitly.
	  if: inputs.test_suite == 'all' || inputs.test_suite == 'repos'
	  runs-on: ak-e2e-runners
	  env:
	    BASE_URL: ${{ needs.deploy.outputs.backend_url }}
	    RUN_ID: ${{ needs.deploy.outputs.run_id }}
	    ADMIN_PASS: TestRunner!2026secure
	    JUNIT_OUTPUT_DIR: /tmp/test-results
	    # RELEASE_GATE=1 turns common.sh's skip_suite() into a hard fail.
	    # A silently-skipped test in release-gate context is exactly the
	    # silent-success class (#870/#871/#888) the gate exists to catch.
	    RELEASE_GATE: '1'
	  steps:
	    - uses: actions/checkout@v4
	      with:
	        repository: artifact-keeper/artifact-keeper-test

	    - name: Run repo type tests
	      run: |
	        mkdir -p "$JUNIT_OUTPUT_DIR"
	        chmod +x scripts/run-suite.sh
	        ./scripts/run-suite.sh --suite repos --run-id "${RUN_ID}"

	    - name: Upload test results
	      # always(): publish JUnit XML even when the suite step failed.
	      if: always()
	      uses: actions/upload-artifact@v4
	      with:
	        name: junit-repos
	        path: /tmp/test-results/*.xml
	        if-no-files-found: ignore

	# -------------------------------------------------------------------
	# Promotion tests
	# -------------------------------------------------------------------
	promotion-tests:
	  needs: deploy
	  # Runs for the full gate or when this suite is selected explicitly.
	  if: inputs.test_suite == 'all' || inputs.test_suite == 'promotion'
	  runs-on: ak-e2e-runners
	  env:
	    BASE_URL: ${{ needs.deploy.outputs.backend_url }}
	    RUN_ID: ${{ needs.deploy.outputs.run_id }}
	    ADMIN_PASS: TestRunner!2026secure
	    JUNIT_OUTPUT_DIR: /tmp/test-results
	    # RELEASE_GATE=1 turns common.sh's skip_suite() into a hard fail.
	    # A silently-skipped test in release-gate context is exactly the
	    # silent-success class (#870/#871/#888) the gate exists to catch.
	    RELEASE_GATE: '1'
	  steps:
	    - uses: actions/checkout@v4
	      with:
	        repository: artifact-keeper/artifact-keeper-test

	    - name: Run promotion tests
	      run: |
	        mkdir -p "$JUNIT_OUTPUT_DIR"
	        chmod +x scripts/run-suite.sh
	        ./scripts/run-suite.sh --suite promotion --run-id "${RUN_ID}"

	    - name: Upload test results
	      # always(): publish JUnit XML even when the suite step failed.
	      if: always()
	      uses: actions/upload-artifact@v4
	      with:
	        name: junit-promotion
	        path: /tmp/test-results/*.xml
	        if-no-files-found: ignore

	# -------------------------------------------------------------------
	# RBAC tests
	# -------------------------------------------------------------------
	rbac-tests:
	  needs: deploy
	  # Runs for the full gate or when this suite is selected explicitly.
	  if: inputs.test_suite == 'all' || inputs.test_suite == 'rbac'
	  runs-on: ak-e2e-runners
	  env:
	    BASE_URL: ${{ needs.deploy.outputs.backend_url }}
	    RUN_ID: ${{ needs.deploy.outputs.run_id }}
	    ADMIN_PASS: TestRunner!2026secure
	    JUNIT_OUTPUT_DIR: /tmp/test-results
	    # RELEASE_GATE=1 turns common.sh's skip_suite() into a hard fail.
	    # A silently-skipped test in release-gate context is exactly the
	    # silent-success class (#870/#871/#888) the gate exists to catch.
	    RELEASE_GATE: '1'
	  steps:
	    - uses: actions/checkout@v4
	      with:
	        repository: artifact-keeper/artifact-keeper-test

	    - name: Run RBAC tests
	      run: |
	        mkdir -p "$JUNIT_OUTPUT_DIR"
	        chmod +x scripts/run-suite.sh
	        ./scripts/run-suite.sh --suite rbac --run-id "${RUN_ID}"

	    - name: Upload test results
	      # always(): publish JUnit XML even when the suite step failed.
	      if: always()
	      uses: actions/upload-artifact@v4
	      with:
	        name: junit-rbac
	        path: /tmp/test-results/*.xml
	        if-no-files-found: ignore

	# -------------------------------------------------------------------
	# Lifecycle tests
	# -------------------------------------------------------------------
	lifecycle-tests:
	  needs: deploy
	  # Runs for the full gate or when this suite is selected explicitly.
	  if: inputs.test_suite == 'all' || inputs.test_suite == 'lifecycle'
	  runs-on: ak-e2e-runners
	  env:
	    BASE_URL: ${{ needs.deploy.outputs.backend_url }}
	    RUN_ID: ${{ needs.deploy.outputs.run_id }}
	    ADMIN_PASS: TestRunner!2026secure
	    JUNIT_OUTPUT_DIR: /tmp/test-results
	    # RELEASE_GATE=1 turns common.sh's skip_suite() into a hard fail.
	    # A silently-skipped test in release-gate context is exactly the
	    # silent-success class (#870/#871/#888) the gate exists to catch.
	    RELEASE_GATE: '1'
	  steps:
	    - uses: actions/checkout@v4
	      with:
	        repository: artifact-keeper/artifact-keeper-test

	    - name: Run lifecycle tests
	      run: |
	        mkdir -p "$JUNIT_OUTPUT_DIR"
	        chmod +x scripts/run-suite.sh
	        ./scripts/run-suite.sh --suite lifecycle --run-id "${RUN_ID}"

	    - name: Upload test results
	      # always(): publish JUnit XML even when the suite step failed.
	      if: always()
	      uses: actions/upload-artifact@v4
	      with:
	        name: junit-lifecycle
	        path: /tmp/test-results/*.xml
	        if-no-files-found: ignore

	# -------------------------------------------------------------------
	# Webhook tests
	# -------------------------------------------------------------------
	webhook-tests:
	  needs: deploy
	  # Runs for the full gate or when this suite is selected explicitly.
	  if: inputs.test_suite == 'all' || inputs.test_suite == 'webhooks'
	  runs-on: ak-e2e-runners
	  env:
	    BASE_URL: ${{ needs.deploy.outputs.backend_url }}
	    RUN_ID: ${{ needs.deploy.outputs.run_id }}
	    ADMIN_PASS: TestRunner!2026secure
	    JUNIT_OUTPUT_DIR: /tmp/test-results
	    # RELEASE_GATE=1 turns common.sh's skip_suite() into a hard fail.
	    # A silently-skipped test in release-gate context is exactly the
	    # silent-success class (#870/#871/#888) the gate exists to catch.
	    RELEASE_GATE: '1'
	    # Per-script timeout for run-suite.sh. Webhook resilience tests
	    # poll for retry/dead-letter behavior on schedules up to 180s
	    # (WEBHOOK_RETRY_TIMEOUT) so the wrapping timeout must exceed that.
	    TEST_TIMEOUT: '300'
	  steps:
	    - uses: actions/checkout@v4
	      with:
	        repository: artifact-keeper/artifact-keeper-test

	    - name: Run webhook tests
	      run: |
	        mkdir -p "$JUNIT_OUTPUT_DIR"
	        chmod +x scripts/run-suite.sh
	        ./scripts/run-suite.sh --suite webhooks --run-id "${RUN_ID}"

	    - name: Upload test results
	      # always(): publish JUnit XML even when the suite step failed.
	      if: always()
	      uses: actions/upload-artifact@v4
	      with:
	        name: junit-webhooks
	        path: /tmp/test-results/*.xml
	        if-no-files-found: ignore

	# -------------------------------------------------------------------
	# Search tests
	# -------------------------------------------------------------------
	search-tests:
	  needs: deploy
	  # Runs for the full gate or when this suite is selected explicitly.
	  if: inputs.test_suite == 'all' || inputs.test_suite == 'search'
	  runs-on: ak-e2e-runners
	  env:
	    BASE_URL: ${{ needs.deploy.outputs.backend_url }}
	    RUN_ID: ${{ needs.deploy.outputs.run_id }}
	    ADMIN_PASS: TestRunner!2026secure
	    JUNIT_OUTPUT_DIR: /tmp/test-results
	    # RELEASE_GATE=1 turns common.sh's skip_suite() into a hard fail.
	    # A silently-skipped test in release-gate context is exactly the
	    # silent-success class (#870/#871/#888) the gate exists to catch.
	    RELEASE_GATE: '1'
	  steps:
	    - uses: actions/checkout@v4
	      with:
	        repository: artifact-keeper/artifact-keeper-test

	    - name: Run search tests
	      run: |
	        mkdir -p "$JUNIT_OUTPUT_DIR"
	        chmod +x scripts/run-suite.sh
	        ./scripts/run-suite.sh --suite search --run-id "${RUN_ID}"

	    - name: Upload test results
	      # always(): publish JUnit XML even when the suite step failed.
	      if: always()
	      uses: actions/upload-artifact@v4
	      with:
	        name: junit-search
	        path: /tmp/test-results/*.xml
	        if-no-files-found: ignore

	# -------------------------------------------------------------------
	# Platform tests (signing, SBOM, curation, labels, audit, backup)
	# -------------------------------------------------------------------
	platform-tests:
	  needs: deploy
	  # Runs for the full gate or when this suite is selected explicitly.
	  if: inputs.test_suite == 'all' || inputs.test_suite == 'platform'
	  runs-on: ak-e2e-runners
	  env:
	    BASE_URL: ${{ needs.deploy.outputs.backend_url }}
	    RUN_ID: ${{ needs.deploy.outputs.run_id }}
	    ADMIN_PASS: TestRunner!2026secure
	    JUNIT_OUTPUT_DIR: /tmp/test-results
	    # RELEASE_GATE=1 turns common.sh's skip_suite() into a hard fail.
	    # A silently-skipped test in release-gate context is exactly the
	    # silent-success class (#870/#871/#888) the gate exists to catch.
	    RELEASE_GATE: '1'
	  steps:
	    - uses: actions/checkout@v4
	      with:
	        repository: artifact-keeper/artifact-keeper-test

	    - name: Run platform tests
	      run: |
	        mkdir -p "$JUNIT_OUTPUT_DIR"
	        chmod +x scripts/run-suite.sh
	        ./scripts/run-suite.sh --suite platform --run-id "${RUN_ID}"

	    - name: Upload test results
	      # always(): publish JUnit XML even when the suite step failed.
	      if: always()
	      uses: actions/upload-artifact@v4
	      with:
	        name: junit-platform
	        path: /tmp/test-results/*.xml
	        if-no-files-found: ignore

	# -------------------------------------------------------------------
	# Auth tests (tokens, TOTP, rate limiting)
	# -------------------------------------------------------------------
	auth-tests:
	  needs: deploy
	  # Runs for the full gate or when this suite is selected explicitly.
	  if: inputs.test_suite == 'all' || inputs.test_suite == 'auth'
	  runs-on: ak-e2e-runners
	  env:
	    BASE_URL: ${{ needs.deploy.outputs.backend_url }}
	    RUN_ID: ${{ needs.deploy.outputs.run_id }}
	    ADMIN_PASS: TestRunner!2026secure
	    JUNIT_OUTPUT_DIR: /tmp/test-results
	    # RELEASE_GATE=1 turns common.sh's skip_suite() into a hard fail.
	    # A silently-skipped test in release-gate context is exactly the
	    # silent-success class (#870/#871/#888) the gate exists to catch.
	    RELEASE_GATE: '1'
	  steps:
	    - uses: actions/checkout@v4
	      with:
	        repository: artifact-keeper/artifact-keeper-test

	    - name: Run auth tests
	      run: |
	        mkdir -p "$JUNIT_OUTPUT_DIR"
	        chmod +x scripts/run-suite.sh
	        ./scripts/run-suite.sh --suite auth --run-id "${RUN_ID}"

	    - name: Upload test results
	      # always(): publish JUnit XML even when the suite step failed.
	      if: always()
	      uses: actions/upload-artifact@v4
	      with:
	        name: junit-auth
	        path: /tmp/test-results/*.xml
	        if-no-files-found: ignore

	# -------------------------------------------------------------------
	# Stress tests (after every functional suite has finished)
	#
	# Ordering only: the `always()` condition below starts this job once
	# the listed suites have completed, whether they passed, failed, or
	# were skipped, as long as `deploy` succeeded.
	#
	# continue-on-error: stress tests measure backend behavior under
	# sustained mixed-workload (auth + upload + download + list) on a
	# 2 CPU test pod inside the namespace's 4 CPU / 8 Gi quota. Error-
	# rate variance is high on ARC runners (observed 22-54% across
	# otherwise-identical runs) because the bcrypt-bound auth path
	# saturates first and the worker count drives RPS up faster than
	# the pod can absorb. The test still produces JUnit + run logs so
	# regressions are visible, but a single failed run does not block
	# the release gate. Real perf regressions are caught by dedicated
	# benchmark workflows on Rocky, not by this CI smoke gate.
	# See artifact-keeper#991 for v1.1.x auth-path perf investigation.
	# -------------------------------------------------------------------
	stress-tests:
	  needs: [deploy, format-tests, repo-tests, promotion-tests, rbac-tests, lifecycle-tests, webhook-tests, search-tests, platform-tests, auth-tests, security-tests, compatibility-tests]
	  continue-on-error: true
	  if: |
	    always() &&
	    needs.deploy.result == 'success' &&
	    (inputs.test_suite == 'all' || inputs.test_suite == 'stress')
	  runs-on: ak-e2e-runners
	  env:
	    BASE_URL: ${{ needs.deploy.outputs.backend_url }}
	    RUN_ID: ${{ needs.deploy.outputs.run_id }}
	    ADMIN_PASS: TestRunner!2026secure
	    JUNIT_OUTPUT_DIR: /tmp/test-results
	    # RELEASE_GATE=1 turns common.sh's skip_suite() into a hard fail.
	    # A silently-skipped test in release-gate context is exactly the
	    # silent-success class (#870/#871/#888) the gate exists to catch.
	    RELEASE_GATE: '1'
	  steps:
	    - uses: actions/checkout@v4
	      with:
	        repository: artifact-keeper/artifact-keeper-test

	    - name: Run stress tests
	      run: |
	        mkdir -p "$JUNIT_OUTPUT_DIR"
	        chmod +x scripts/run-suite.sh
	        ./scripts/run-suite.sh --suite stress --run-id "${RUN_ID}"

	    - name: Upload test results
	      if: always()
	      uses: actions/upload-artifact@v4
	      with:
	        name: junit-stress
	        path: /tmp/test-results/*.xml
	        if-no-files-found: ignore

	# -------------------------------------------------------------------
	# Resilience tests (after stress completes)
	#
	# Run regardless of stress-tests outcome. Resilience tests target
	# crash recovery, network partition, storage failures, etc., which
	# are independent of the bcrypt/auth saturation that stress-tests
	# measures. Skipping resilience because stress hit its error-rate
	# threshold loses signal on a different failure class.
	# -------------------------------------------------------------------
	resilience-tests:
	  needs: [deploy, stress-tests]
	  if: |
	    always() &&
	    needs.deploy.result == 'success' &&
	    (inputs.test_suite == 'all' || inputs.test_suite == 'resilience')
	  runs-on: ak-e2e-runners
	  strategy:
	    # Each category reports independently; one failure must not
	    # cancel the rest.
	    fail-fast: false
	    matrix:
	      category: [crash, restart, network, storage, data]
	  env:
	    BASE_URL: ${{ needs.deploy.outputs.backend_url }}
	    RUN_ID: ${{ needs.deploy.outputs.run_id }}
	    ADMIN_PASS: TestRunner!2026secure
	    NAMESPACE: ${{ needs.deploy.outputs.namespace }}
	    JUNIT_OUTPUT_DIR: /tmp/test-results
	  steps:
	    - uses: actions/checkout@v4
	      with:
	        repository: artifact-keeper/artifact-keeper-test

	    - name: Install kubectl
	      uses: azure/setup-kubectl@v4

	    - name: Run ${{ matrix.category }} resilience tests
	      # Step-level continue-on-error: the step is marked failed (and
	      # emits a warning annotation) but the job keeps going, so the
	      # results below are still uploaded.
	      continue-on-error: true
	      env:
	        # Matrix value passed through env rather than interpolated
	        # into the script text.
	        RESILIENCE_CATEGORY: ${{ matrix.category }}
	      run: |
	        mkdir -p "$JUNIT_OUTPUT_DIR"
	        FAILED=0
	        for script in "tests/resilience/${RESILIENCE_CATEGORY}"/test-*.sh; do
	          # An unmatched glob stays literal; skip it.
	          [ -f "$script" ] || continue
	          echo "=== Running ${script} ==="
	          if ! bash "$script"; then
	            echo "FAILED: ${script}"
	            FAILED=$((FAILED + 1))
	          fi
	        done
	        if [ "$FAILED" -gt 0 ]; then
	          echo "::warning::${FAILED} resilience test(s) failed in ${RESILIENCE_CATEGORY} (non-blocking on ARC runners)"
	          exit 1
	        fi

	    - name: Upload test results
	      if: always()
	      uses: actions/upload-artifact@v4
	      with:
	        name: junit-resilience-${{ matrix.category }}
	        path: /tmp/test-results/*.xml
	        if-no-files-found: ignore

	# -------------------------------------------------------------------
	# Mesh tests (after resilience passes)
	# -------------------------------------------------------------------
	mesh-tests:
	needs: [deploy, resilience-tests]
	if: \|
	always() &&
	(inputs.test_suite == 'all' \|\| inputs.test_suite == 'mesh') &&
	(needs.resilience-tests.result == 'success' \|\| needs.resilience-tests.result == 'skipped')
	runs-on: ak-e2e-runners
	env:
	RUN_ID: ${{ needs.deploy.outputs.run_id }}
	ADMIN_PASS: TestRunner!2026secure
	JUNIT_OUTPUT_DIR: /tmp/test-results
	# RELEASE_GATE=1 turns common.sh's skip_suite() into a hard fail.
	# A silently-skipped test in release-gate context is exactly the
	# silent-success class (#870/#871/#888) the gate exists to catch.
	RELEASE_GATE: '1'
	steps:
	- uses: actions/checkout@v4
	with:
	repository: artifact-keeper/artifact-keeper-test

	- name: Install kubectl
	uses: azure/setup-kubectl@v4

	- name: Install Helm
	uses: azure/setup-helm@v4

	- name: Deploy mesh topology
	id: mesh-deploy
	run: \|
	MESH_RUN_ID="${RUN_ID}"
	chmod +x scripts/create-test-namespace.sh

	# Deploy 4 mesh instances
	for i in main peer1 peer2 peer3; do
	MESH_NS="test-${MESH_RUN_ID}-mesh-${i}"
	./scripts/create-test-namespace.sh \
	--run-id "${MESH_RUN_ID}-mesh-${i}" \
	--backend-tag "${{ inputs.backend_tag }}" \
	--web-tag "${{ inputs.web_tag }}" \
	--values helm/values-test-mesh.yaml
	done

	# Output URLs
	BASE_NS="test-${MESH_RUN_ID}-mesh"
	echo "MAIN_URL=http://artifact-keeper-backend.${BASE_NS}-main.svc.cluster.local:8080" >> "$GITHUB_OUTPUT"
	echo "PEER1_URL=http://artifact-keeper-backend.${BASE_NS}-peer1.svc.cluster.local:8080" >> "$GITHUB_OUTPUT"
	echo "PEER2_URL=http://artifact-keeper-backend.${BASE_NS}-peer2.svc.cluster.local:8080" >> "$GITHUB_OUTPUT"
	echo "PEER3_URL=http://artifact-keeper-backend.${BASE_NS}-peer3.svc.cluster.local:8080" >> "$GITHUB_OUTPUT"

	- name: Wait for mesh instances ready
	run: \|
	chmod +x tests/lib/wait-for-ready.sh
	for url in "${{ steps.mesh-deploy.outputs.MAIN_URL }}" \
	"${{ steps.mesh-deploy.outputs.PEER1_URL }}" \
	"${{ steps.mesh-deploy.outputs.PEER2_URL }}" \
	"${{ steps.mesh-deploy.outputs.PEER3_URL }}"; do
	./tests/lib/wait-for-ready.sh "$url" 300
	done

	- name: Run mesh tests
	env:
	MAIN_URL: ${{ steps.mesh-deploy.outputs.MAIN_URL }}
	PEER1_URL: ${{ steps.mesh-deploy.outputs.PEER1_URL }}
	PEER2_URL: ${{ steps.mesh-deploy.outputs.PEER2_URL }}
	PEER3_URL: ${{ steps.mesh-deploy.outputs.PEER3_URL }}
	BASE_URL: ${{ steps.mesh-deploy.outputs.MAIN_URL }}
	run: \|
	mkdir -p "$JUNIT_OUTPUT_DIR"
	chmod +x scripts/run-suite.sh
	./scripts/run-suite.sh --suite mesh --run-id "${RUN_ID}"

	# Best-effort cleanup of the four mesh namespaces. Runs even when earlier
	# steps failed, unless the caller asked to keep the environment for
	# debugging. A failing teardown of one instance does not stop the loop.
	- name: Teardown mesh namespaces
	  if: ${{ always() && inputs.skip_teardown != true }}
	  run: |
	    chmod +x scripts/teardown-test-namespace.sh
	    for instance in main peer1 peer2 peer3; do
	      ./scripts/teardown-test-namespace.sh --run-id "${RUN_ID}-mesh-${instance}" || true
	    done

	# Always publish whatever JUnit XML exists under /tmp/test-results as
	# the junit-mesh artifact; an empty directory is not an error.
	- name: Upload test results
	  if: ${{ always() }}
	  uses: actions/upload-artifact@v4
	  with:
	    name: junit-mesh
	    path: /tmp/test-results/*.xml
	    if-no-files-found: ignore

	# -------------------------------------------------------------------
	# Collect results and publish summary
	# -------------------------------------------------------------------
	# Runs after every suite regardless of outcome. Downloads the junit-*
	# artifacts, writes a per-suite status table to the run summary,
	# re-uploads the merged results, and fails the workflow when any
	# required suite failed or was cancelled.
	collect-results:
	  needs:
	    - clean-install-smoke
	    - deploy
	    - format-tests
	    - security-tests
	    - compatibility-tests
	    - repo-tests
	    - promotion-tests
	    - rbac-tests
	    - lifecycle-tests
	    - webhook-tests
	    - search-tests
	    - platform-tests
	    - auth-tests
	    - stress-tests
	    - resilience-tests
	    - mesh-tests
	  if: always()
	  runs-on: ak-e2e-runners
	  steps:
	    - name: Download all test artifacts
	      uses: actions/download-artifact@v4
	      with:
	        pattern: junit-*
	        path: /tmp/all-results
	        merge-multiple: true

	    # The image tags are caller-supplied inputs, so they reach the script
	    # through the environment rather than being expanded into the script
	    # text. The needs.<job>.result values below are expanded inline.
	    - name: Publish test summary
	      if: always()
	      env:
	        BACKEND_TAG: ${{ inputs.backend_tag }}
	        WEB_TAG: ${{ inputs.web_tag }}
	        DEPLOY_RUN_ID: ${{ needs.deploy.outputs.run_id }}
	      run: |
	        {
	          echo "## Release Gate Results"
	          echo ""
	          echo "| Suite | Status |"
	          echo "|-------|--------|"

	          for job in clean-install-smoke format-tests repo-tests promotion-tests rbac-tests lifecycle-tests webhook-tests search-tests platform-tests auth-tests security-tests compatibility-tests stress-tests resilience-tests mesh-tests; do
	            status="skipped"
	            case "$job" in
	              clean-install-smoke) status="${{ needs.clean-install-smoke.result }}" ;;
	              format-tests) status="${{ needs.format-tests.result }}" ;;
	              repo-tests) status="${{ needs.repo-tests.result }}" ;;
	              promotion-tests) status="${{ needs.promotion-tests.result }}" ;;
	              rbac-tests) status="${{ needs.rbac-tests.result }}" ;;
	              lifecycle-tests) status="${{ needs.lifecycle-tests.result }}" ;;
	              webhook-tests) status="${{ needs.webhook-tests.result }}" ;;
	              search-tests) status="${{ needs.search-tests.result }}" ;;
	              platform-tests) status="${{ needs.platform-tests.result }}" ;;
	              auth-tests) status="${{ needs.auth-tests.result }}" ;;
	              security-tests) status="${{ needs.security-tests.result }}" ;;
	              compatibility-tests) status="${{ needs.compatibility-tests.result }}" ;;
	              stress-tests) status="${{ needs.stress-tests.result }}" ;;
	              resilience-tests) status="${{ needs.resilience-tests.result }}" ;;
	              mesh-tests) status="${{ needs.mesh-tests.result }}" ;;
	            esac
	            echo "| ${job} | ${status} |"
	          done

	          echo ""
	          echo "Backend tag: \`${BACKEND_TAG}\`"
	          echo "Web tag: \`${WEB_TAG}\`"
	          echo "Run ID: \`${DEPLOY_RUN_ID}\`"
	        } >> "$GITHUB_STEP_SUMMARY"

	    - name: Upload combined results
	      if: always()
	      uses: actions/upload-artifact@v4
	      with:
	        name: release-gate-results
	        path: /tmp/all-results/
	        if-no-files-found: ignore

	    - name: Gate check - fail if any required suite failed
	      # stress-tests and security-tests are intentionally excluded from
	      # this rollup. Both have continue-on-error: true (see the comments
	      # above each job) so their outcome can be 'failure' on
	      # known-flaky / known-infra-debt scenarios without blocking the
	      # release gate:
	      #   - stress-tests: bcrypt-bound auth saturation under sustained
	      #     load on shared ARC runners (artifact-keeper#991).
	      #   - security-tests: Grype DB not pre-seeded in v1.1.x backend
	      #     image; quality gate is last-scanner-wins so Trivy covers
	      #     the policy gate (artifact-keeper#1001).
	      # The wildcard form contains(needs.*.result, 'failure') still
	      # observes those failures because needs.<job>.result reflects the
	      # job's outcome, not its continue-on-error-adjusted conclusion.
	      # So we list the required suites explicitly here. If you add a
	      # new required suite, add it to this list. Soft-failing suites
	      # stay off the list.
	      if: >-
	        needs.clean-install-smoke.result == 'failure' || needs.clean-install-smoke.result == 'cancelled' ||
	        needs.deploy.result == 'failure' || needs.deploy.result == 'cancelled' ||
	        needs.format-tests.result == 'failure' || needs.format-tests.result == 'cancelled' ||
	        needs.compatibility-tests.result == 'failure' || needs.compatibility-tests.result == 'cancelled' ||
	        needs.repo-tests.result == 'failure' || needs.repo-tests.result == 'cancelled' ||
	        needs.promotion-tests.result == 'failure' || needs.promotion-tests.result == 'cancelled' ||
	        needs.rbac-tests.result == 'failure' || needs.rbac-tests.result == 'cancelled' ||
	        needs.lifecycle-tests.result == 'failure' || needs.lifecycle-tests.result == 'cancelled' ||
	        needs.webhook-tests.result == 'failure' || needs.webhook-tests.result == 'cancelled' ||
	        needs.search-tests.result == 'failure' || needs.search-tests.result == 'cancelled' ||
	        needs.platform-tests.result == 'failure' || needs.platform-tests.result == 'cancelled' ||
	        needs.auth-tests.result == 'failure' || needs.auth-tests.result == 'cancelled' ||
	        needs.resilience-tests.result == 'failure' || needs.resilience-tests.result == 'cancelled' ||
	        needs.mesh-tests.result == 'failure' || needs.mesh-tests.result == 'cancelled'
	      run: |
	        echo "::error::Release gate FAILED - one or more required test suites did not pass"
	        echo "Review the workflow summary above for details"
	        echo "Note: stress-tests and security-tests are non-blocking; their outcomes are shown in the summary but do not gate the release"
	        exit 1

	# -------------------------------------------------------------------
	# Teardown
	# -------------------------------------------------------------------
	# Removes the primary test namespace created by the deploy job once
	# results have been collected, then uploads whatever is under
	# /tmp/test-logs/ as the pod-logs artifact. Skipped entirely when the
	# caller sets skip_teardown (for debugging).
	teardown:
	  needs: [deploy, collect-results]
	  if: always() && inputs.skip_teardown != true
	  runs-on: ak-e2e-runners
	  steps:
	    - uses: actions/checkout@v4
	      with:
	        repository: artifact-keeper/artifact-keeper-test

	    - name: Install kubectl
	      uses: azure/setup-kubectl@v4

	    - name: Install Helm
	      uses: azure/setup-helm@v4

	    - name: Teardown test namespace
	      env:
	        RUN_ID: ${{ needs.deploy.outputs.run_id }}
	      run: |
	        # This job runs under always(), so deploy may have failed or been
	        # skipped without publishing a run_id. Do not invoke the teardown
	        # script with an empty identifier in that case.
	        if [ -z "${RUN_ID}" ]; then
	          echo "::warning::deploy produced no run_id; skipping namespace teardown"
	          exit 0
	        fi
	        chmod +x scripts/teardown-test-namespace.sh
	        ./scripts/teardown-test-namespace.sh --run-id "${RUN_ID}"

	    - name: Upload pod logs
	      if: always()
	      uses: actions/upload-artifact@v4
	      with:
	        name: pod-logs
	        path: /tmp/test-logs/
	        if-no-files-found: ignore

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Release Gate #96

Workflow file

Release Gate #96

Uh oh!

Workflow file for this run