Release Gate #114

Workflow file for this run

.github/workflows/release-gate.yml at c0cfe5b

	name: Release Gate

	# Triggers.
	#
	# workflow_dispatch carries the full UI metadata (descriptions, the
	# `choice` list for test_suite). workflow_call re-declares the same
	# input names so every job can read `inputs.*` regardless of how the
	# run was started. Reusable-workflow inputs only support boolean,
	# number and string types, which is why test_suite is a plain string
	# on the workflow_call side.
	on:
	  workflow_dispatch:
	    inputs:
	      backend_tag:
	        description: Backend image tag to test
	        required: true
	        type: string
	      web_tag:
	        description: Web image tag to test
	        required: false
	        type: string
	        default: 'latest'
	      test_suite:
	        description: Test suite to run
	        required: false
	        type: choice
	        options:
	          - all
	          - formats
	          - repos
	          - promotion
	          - rbac
	          - lifecycle
	          - webhooks
	          - search
	          - platform
	          - auth
	          - stress
	          - resilience
	          - mesh
	          - security
	          - compatibility
	          - pullthrough
	        default: 'all'
	      skip_teardown:
	        description: Skip teardown (for debugging)
	        required: false
	        type: boolean
	        default: false
	      iac_ref:
	        description: artifact-keeper-iac git ref for the Helm chart (default main)
	        required: false
	        type: string
	        default: 'main'
	      run_smoke_with_deps:
	        # Literal block scalar: the description spans several lines.
	        description: |
	          Run the clean-install-smoke-with-deps variant (issue #53).
	          Disabled by default because enabling Trivy/DT/edge/openSCAP
	          can exceed the standard ARC runner namespace's 4 CPU / 8 Gi
	          quota. Set to true once a beefier runner pool is wired.
	        required: false
	        type: boolean
	        default: false
	  workflow_call:
	    inputs:
	      backend_tag:
	        required: true
	        type: string
	      web_tag:
	        required: false
	        type: string
	        default: 'latest'
	      test_suite:
	        required: false
	        type: string
	        default: 'all'
	      skip_teardown:
	        required: false
	        type: boolean
	        default: false
	      iac_ref:
	        required: false
	        type: string
	        default: 'main'
	      run_smoke_with_deps:
	        required: false
	        type: boolean
	        default: false

	# Workflow-level environment. Each value comes from a repository
	# variable when one is set and otherwise falls back to the literal
	# after `||`. NOTE(review): presumably consumed by the namespace
	# provisioning scripts; the consumer is not visible in this file.
	env:
	  NAMESPACE_CPU: ${{ vars.TEST_NAMESPACE_CPU || '4000m' }}
	  NAMESPACE_MEMORY: ${{ vars.TEST_NAMESPACE_MEMORY || '8Gi' }}

	jobs:
	# -------------------------------------------------------------------
	# Version-set integrity check (issue #63)
	#
	# Runs FIRST, before any namespace is provisioned. Verifies that every
	# container image referenced by the release set actually exists at the
	# tag the chart references. This catches the structural failure mode
	# behind artifact-keeper#872 (customer-flagged: "current main Helm
	# chart only works with main backend and frontend, not a tagged
	# release"), artifact-keeper#905 (versioned tags missing on ghcr.io),
	# and artifact-keeper-web#320 (v1.1.8 web image never published).
	#
	# A green release-gate today says "the test cluster works"; this job
	# turns that into "every image referenced by the release set actually
	# exists at that tag." Failing here is a release-blocker that should
	# NEVER be soft-failed: a missing tag is a publish-pipeline regression,
	# not a flake.
	# -------------------------------------------------------------------
	version-set-integrity:
	  runs-on: ak-e2e-runners
	  timeout-minutes: 5
	  steps:
	    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
	      with:
	        repository: artifact-keeper/artifact-keeper-test

	    - name: Install Helm (for chart-default verification)
	      uses: azure/setup-helm@dda3372f752e03dde6b3237bc9431cdc2f7a02a2 # v5.0.0

	    - name: Clone iac chart for default-tag verification
	      env:
	        # Falls back to `main` when the input is empty or unset.
	        IAC_REF: ${{ inputs.iac_ref || 'main' }}
	      run: |
	        # Shallow-clone the iac repo into ${RUNNER_TEMP}/iac so
	        # verify-image-set.sh can render the chart with no overrides
	        # and compare default image tags. This is the #872
	        # customer-pain shape: chart on a tag, images on a different
	        # tag, no --set bridging them.
	        #
	        # `--branch` accepts branch and tag names only, so IAC_REF
	        # cannot be a bare commit SHA here.
	        git clone --depth 1 --branch "${IAC_REF}" \
	          https://github.com/artifact-keeper/artifact-keeper-iac.git \
	          "${RUNNER_TEMP}/iac"

	    - name: Verify backend / web / openscap image tags exist on ghcr.io
	      env:
	        BACKEND_TAG: ${{ inputs.backend_tag }}
	        WEB_TAG: ${{ inputs.web_tag }}
	        # openscap is published in lockstep with backend on the same
	        # tag. When that lockstep breaks (#872 customer pain), we want
	        # to know BEFORE we try to deploy.
	        OPENSCAP_TAG: ${{ inputs.backend_tag }}
	      run: |
	        chmod +x tests/release-gate/verify-image-set.sh
	        ./tests/release-gate/verify-image-set.sh \
	          --backend-tag "${BACKEND_TAG}" \
	          --web-tag "${WEB_TAG}" \
	          --openscap-tag "${OPENSCAP_TAG}" \
	          --chart-dir "${RUNNER_TEMP}/iac/charts/artifact-keeper"

	    # Only on failure: ship whatever /tmp/version-set-*.log files
	    # exist. A missing log is not itself an error.
	    - name: Upload version-set diagnostics
	      if: failure()
	      uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
	      with:
	        name: version-set-integrity-logs
	        path: /tmp/version-set-*.log
	        if-no-files-found: ignore

	# -------------------------------------------------------------------
	# Clean-install smoke test
	#
	# Boots a fresh namespace, runs `helm install` against the documented
	# values-production.yaml (with overrides for deps the smoke can't
	# satisfy), waits for backend AND web Deployments to reach Ready, then
	# probes /readyz from inside the cluster. Catches startup panics (e.g.
	# the v1.1.8 Debian route panic) that crash the backend before it can
	# serve traffic.
	#
	# The `deploy` job (and therefore the entire test matrix downstream)
	# `needs:` this gate. A startup-broken release fails fast here without
	# burning runner time on the matrix.
	# -------------------------------------------------------------------
	clean-install-smoke:
	  # Runs only after every referenced image tag is known to exist.
	  needs: version-set-integrity
	  runs-on: ak-e2e-runners
	  timeout-minutes: 12
	  steps:
	    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
	      with:
	        repository: artifact-keeper/artifact-keeper-test

	    - name: Install kubectl
	      uses: azure/setup-kubectl@829323503d1be3d00ca8346e5391ca0b07a9ab0d # v5.1.0

	    - name: Install Helm
	      uses: azure/setup-helm@dda3372f752e03dde6b3237bc9431cdc2f7a02a2 # v5.0.0

	    - name: Run clean-install smoke test
	      env:
	        BACKEND_TAG: ${{ inputs.backend_tag }}
	        WEB_TAG: ${{ inputs.web_tag }}
	        # Pin iac chart ref so the gate validates against the chart
	        # version that ships with the release. Defaults to `main` when
	        # the workflow input is unset; release pipelines should pass
	        # the corresponding iac tag.
	        IAC_REF: ${{ inputs.iac_ref || 'main' }}
	        # Pull-secret for ghcr.io. Without this, private image tags
	        # fail with ImagePullBackOff and the gate fails for the wrong
	        # reason. Workflows that test public-only tags can omit it.
	        GHCR_DOCKER_CONFIG: ${{ secrets.GHCR_DOCKER_CONFIG }}
	      run: |
	        chmod +x scripts/clean-install-smoke.sh
	        # run_id + run_attempt is unique per workflow attempt (re-runs
	        # increment run_attempt), which avoids RUN_ID collisions when
	        # a job is retried. The runner's default GITHUB_RUN_ID /
	        # GITHUB_RUN_ATTEMPT variables carry the same values as the
	        # `github.*` context and keep template expansion out of the
	        # script body.
	        RUN_ID="${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}"
	        ./scripts/clean-install-smoke.sh \
	          --run-id "${RUN_ID}" \
	          --backend-tag "${BACKEND_TAG}" \
	          --web-tag "${WEB_TAG}" \
	          --iac-ref "${IAC_REF}" \
	          --timeout 300

	    - name: Upload smoke diagnostics
	      if: failure()
	      uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
	      with:
	        name: clean-install-smoke-logs
	        path: /tmp/test-logs/
	        if-no-files-found: ignore

	# -------------------------------------------------------------------
	# Clean-install smoke WITH dependencies (issue #53)
	#
	# The basic `clean-install-smoke` above disables Trivy, Dependency-
	# Track, edge replication, ingress, and openSCAP to keep the smoke
	# under the runner's memory budget. That leaves a real coverage gap:
	# chart wiring regressions in those subsystems pass the gate. A
	# v1.1.8-class regression that broke ONLY Trivy or Dependency-Track
	# wiring would NOT be caught by the basic smoke.
	#
	# This job runs the same smoke flow against a values overlay that
	# enables every optional subsystem and asserts each one reaches a
	# healthy state.
	#
	# DISABLED BY DEFAULT: the job only runs when the `run_smoke_with_deps`
	# input is true. Enabling all subsystems can exceed the ARC runner
	# namespace's 4 CPU / 8 Gi quota:
	# - Trivy: 1 CPU / 2 Gi limit
	# - DependencyTrack: 2 CPU / 4 Gi limit
	# - Edge: 500m / 512 Mi limit
	# - OpenSCAP: 500m / 1 Gi limit
	# - Backend: 2 CPU / 2 Gi limit
	# - Web/Postgres/OpenSearch: ~1 CPU / ~2 Gi combined
	# Total: roughly 7 CPU / 12 Gi limits, which can OOM the namespace.
	#
	# To enable: bump the namespace quota OR move this job to a beefier
	# runner pool (e.g. `ak-beefy-runners`), then start the run with
	# `run_smoke_with_deps: true`. Tracked under #53.
	# -------------------------------------------------------------------
	clean-install-smoke-with-deps:
	  needs: clean-install-smoke
	  # TODO(#53): enable by default once a runner with >= 8 CPU / 16 Gi
	  # is available in the ARC pool. The `run_smoke_with_deps` input
	  # (declared under both workflow_dispatch.inputs and
	  # workflow_call.inputs) defaults to false, so the job is wired and
	  # validated by actionlint but does not execute unless the caller
	  # explicitly opts in, e.g. when running against a beefier runner
	  # pool.
	  if: ${{ inputs.run_smoke_with_deps == true }}
	  runs-on: ak-e2e-runners
	  timeout-minutes: 20
	  steps:
	    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
	      with:
	        repository: artifact-keeper/artifact-keeper-test

	    - name: Install kubectl
	      uses: azure/setup-kubectl@829323503d1be3d00ca8346e5391ca0b07a9ab0d # v5.1.0

	    - name: Install Helm
	      uses: azure/setup-helm@dda3372f752e03dde6b3237bc9431cdc2f7a02a2 # v5.0.0

	    - name: Run clean-install-smoke with all deps enabled
	      env:
	        BACKEND_TAG: ${{ inputs.backend_tag }}
	        WEB_TAG: ${{ inputs.web_tag }}
	        IAC_REF: ${{ inputs.iac_ref || 'main' }}
	        GHCR_DOCKER_CONFIG: ${{ secrets.GHCR_DOCKER_CONFIG }}
	      run: |
	        chmod +x tests/release-gate/clean-install-smoke-with-deps.sh
	        # The "-deps" suffix keeps this run id distinct from the one
	        # the basic smoke builds out of the same run_id/run_attempt.
	        # The default runner variables are used rather than inline
	        # `github.*` expressions; the values are identical.
	        RUN_ID="${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}-deps"
	        ./tests/release-gate/clean-install-smoke-with-deps.sh \
	          --run-id "${RUN_ID}" \
	          --backend-tag "${BACKEND_TAG}" \
	          --web-tag "${WEB_TAG}" \
	          --iac-ref "${IAC_REF}" \
	          --timeout 600

	    - name: Upload smoke-with-deps diagnostics
	      if: failure()
	      uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
	      with:
	        name: clean-install-smoke-with-deps-logs
	        path: /tmp/test-logs/
	        if-no-files-found: ignore

	# -------------------------------------------------------------------
	# Chart upgrade smoke (issue #54)
	#
	# `clean-install-smoke` catches startup panics on a fresh install.
	# It does NOT catch:
	# - Migration-on-upgrade failures (schema change that fails to
	# apply when upgrading prev -> current)
	# - Chart-template breakage that only manifests on `helm upgrade`
	# (immutable field changes, StatefulSet rollout deadlocks)
	# - Resources that get re-created instead of preserved across
	# upgrades
	#
	# The script installs the previous stable release tag, pushes a
	# small artifact through the management API to establish state,
	# runs `helm upgrade` to the current backend image, then asserts
	# the artifact is still retrievable and `/readyz` returns 200.
	#
	# PREVIOUS_TAG: hardcoded today. Update on each release. Once
	# release tooling can introspect the previous tag automatically,
	# this can be derived from `gh release list` in a setup step.
	# -------------------------------------------------------------------
	chart-upgrade-smoke:
	  needs: clean-install-smoke
	  runs-on: ak-e2e-runners
	  timeout-minutes: 25
	  steps:
	    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
	      with:
	        repository: artifact-keeper/artifact-keeper-test

	    - name: Install kubectl
	      uses: azure/setup-kubectl@829323503d1be3d00ca8346e5391ca0b07a9ab0d # v5.1.0

	    - name: Install Helm
	      uses: azure/setup-helm@dda3372f752e03dde6b3237bc9431cdc2f7a02a2 # v5.0.0

	    - name: Run chart-upgrade-smoke
	      env:
	        BACKEND_TAG: ${{ inputs.backend_tag }}
	        WEB_TAG: ${{ inputs.web_tag }}
	        IAC_REF: ${{ inputs.iac_ref || 'main' }}
	        GHCR_DOCKER_CONFIG: ${{ secrets.GHCR_DOCKER_CONFIG }}
	        # PREVIOUS_TAG: the previously-released stable tag whose chart
	        # the upgrade originates from. UPDATE THIS ON EACH RELEASE.
	        # The script accepts the unprefixed semver form because docker
	        # tags drop the leading 'v' (see CLAUDE.md).
	        # NOTE(review): the earlier wording here said to bump this to
	        # "1.1.9" when v1.1.10 ships, but the value is already
	        # '1.1.9'. Confirm the intended bump rule and the next value.
	        PREVIOUS_TAG: '1.1.9'
	        # PREVIOUS_WEB_TAG: the web image tag for the previous-tag side
	        # of the upgrade. The web repo cuts its own release cadence and
	        # does NOT mirror backend version tags, so the backend's
	        # PREVIOUS_TAG (e.g. "1.1.9") is NOT a valid web image tag and
	        # causes ImagePullBackOff (closes artifact-keeper#1378). `main`
	        # is published on every push to artifact-keeper-web main and is
	        # always pullable. If a future release wants stricter pinning,
	        # bump this to a specific SHA tag (e.g. `sha-ea664a1`).
	        PREVIOUS_WEB_TAG: 'main'
	      run: |
	        chmod +x tests/release-gate/chart-upgrade-smoke.sh
	        # The "-upgrade" suffix keeps this run id distinct from the
	        # other smoke jobs in the same workflow attempt. The default
	        # runner variables carry the same values as `github.run_id` /
	        # `github.run_attempt`.
	        RUN_ID="${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}-upgrade"
	        # PREVIOUS_IAC_REF: the iac chart tag that shipped with the
	        # previous release. Derived from PREVIOUS_TAG and passed
	        # explicitly so the chart-template upgrade path exercises the
	        # actual prev->current chart diff (issue #54).
	        PREVIOUS_IAC_REF="artifact-keeper-${PREVIOUS_TAG}"
	        ./tests/release-gate/chart-upgrade-smoke.sh \
	          --run-id "${RUN_ID}" \
	          --previous-tag "${PREVIOUS_TAG}" \
	          --backend-tag "${BACKEND_TAG}" \
	          --web-tag "${WEB_TAG}" \
	          --previous-web-tag "${PREVIOUS_WEB_TAG}" \
	          --iac-ref "${IAC_REF}" \
	          --previous-iac-ref "${PREVIOUS_IAC_REF}" \
	          --timeout 600

	    - name: Upload chart-upgrade diagnostics
	      if: failure()
	      uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
	      with:
	        name: chart-upgrade-smoke-logs
	        path: /tmp/test-logs/
	        if-no-files-found: ignore

	# -------------------------------------------------------------------
	# Deploy test environment
	#
	# Gated on `clean-install-smoke` so that the matrix below cannot run
	# against a backend that fails to even start. A startup-broken release
	# fails fast in `clean-install-smoke` and the entire matrix is skipped,
	# preserving runner-time for releases that can actually be tested.
	# -------------------------------------------------------------------
	deploy:
	  needs: clean-install-smoke
	  runs-on: ak-e2e-runners
	  # Every other job in this workflow sets a timeout; without one a
	  # hung deploy would hold the runner for the 360-minute default.
	  # The explicit waits below add up to about 7 minutes (180s + 180s
	  # + 60s); the rest is headroom for namespace provisioning.
	  # NOTE(review): tune if create-test-namespace.sh legitimately
	  # needs longer.
	  timeout-minutes: 30
	  # Consumed by the downstream gate jobs through
	  # `needs.deploy.outputs.*`.
	  outputs:
	    run_id: ${{ steps.setup.outputs.run_id }}
	    namespace: ${{ steps.setup.outputs.namespace }}
	    backend_url: ${{ steps.deploy.outputs.backend_url }}
	  steps:
	    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
	      with:
	        repository: artifact-keeper/artifact-keeper-test

	    - name: Install kubectl
	      uses: azure/setup-kubectl@829323503d1be3d00ca8346e5391ca0b07a9ab0d # v5.1.0

	    - name: Install Helm
	      uses: azure/setup-helm@dda3372f752e03dde6b3237bc9431cdc2f7a02a2 # v5.0.0

	    - name: Generate run ID
	      id: setup
	      run: |
	        # Epoch seconds plus the workflow run number; the namespace
	        # is always "test-<run id>".
	        RUN_ID="e2e-$(date +%s)-${GITHUB_RUN_NUMBER}"
	        echo "run_id=${RUN_ID}" >> "$GITHUB_OUTPUT"
	        echo "namespace=test-${RUN_ID}" >> "$GITHUB_OUTPUT"

	    - name: Deploy test namespace
	      id: deploy
	      env:
	        RUN_ID: ${{ steps.setup.outputs.run_id }}
	        BACKEND_TAG: ${{ inputs.backend_tag }}
	        WEB_TAG: ${{ inputs.web_tag }}
	      run: |
	        chmod +x scripts/create-test-namespace.sh
	        # --full-stack enables Trivy + scan workspace so the security
	        # tests actually exercise the scanner instead of false-passing
	        # against a no-scanner stack (#888 silent-success class).
	        ./scripts/create-test-namespace.sh \
	          --run-id "${RUN_ID}" \
	          --backend-tag "${BACKEND_TAG}" \
	          --web-tag "${WEB_TAG}" \
	          --full-stack

	        # In-cluster service DNS name of the backend, published as
	        # the `backend_url` job output.
	        NAMESPACE="test-${RUN_ID}"
	        BACKEND_URL="http://artifact-keeper-backend.${NAMESPACE}.svc.cluster.local:8080"
	        echo "backend_url=${BACKEND_URL}" >> "$GITHUB_OUTPUT"

	    - name: Wait for stack ready
	      env:
	        RUN_ID: ${{ steps.setup.outputs.run_id }}
	      run: |
	        NAMESPACE="test-${RUN_ID}"
	        BACKEND_URL="http://artifact-keeper-backend.${NAMESPACE}.svc.cluster.local:8080"
	        chmod +x tests/lib/wait-for-ready.sh
	        ./tests/lib/wait-for-ready.sh "${BACKEND_URL}" 180

	        # Trivy rollout must be Available before the security tests
	        # dispatch. The chart's fullnameOverride is "artifact-keeper"
	        # (see helm/values-test-full.yaml) so the deployment is
	        # named artifact-keeper-trivy regardless of the release name.
	        # Without this gate, security-tests can race scanner pod
	        # scale-up and the lite scan-completion gate sees an unreachable
	        # scanner as "real" failure (the #888 false-fail mirror).
	        echo "Waiting for Trivy rollout in ${NAMESPACE}..."
	        kubectl -n "${NAMESPACE}" rollout status \
	          deployment/artifact-keeper-trivy --timeout=180s
	        kubectl -n "${NAMESPACE}" wait --for=condition=Available \
	          deployment/artifact-keeper-trivy --timeout=60s

	# -------------------------------------------------------------------
	# Real-flow smoke (issue #45)
	#
	# The user's actual flow as a single gate check: push an artifact
	# through a native client, pull it back, trigger a scan, poll until
	# completion. Regressions in any step (broken upload, broken
	# download, scan-stuck-queued mirror of #871) fail the gate before
	# the broader format/security/repo matrix runs, so the operator
	# sees the FIRST signal that pierced the smoke.
	#
	# Uses npm (already installed on the runner pod for the `node` batch
	# in format-tests) so the gate stays under the 5-minute target in
	# the acceptance criteria for #45.
	# -------------------------------------------------------------------
	real-flow-smoke:
	  needs: deploy
	  runs-on: ak-e2e-runners
	  timeout-minutes: 8
	  env:
	    BASE_URL: ${{ needs.deploy.outputs.backend_url }}
	    RUN_ID: ${{ needs.deploy.outputs.run_id }}
	    ADMIN_PASS: TestRunner!2026secure
	    JUNIT_OUTPUT_DIR: /tmp/test-results
	    # RELEASE_GATE=1 turns common.sh's skip_suite() into a hard fail.
	    # A silently-skipped real-flow smoke is exactly the silent-success
	    # class (#870/#871/#888) this gate was added to catch.
	    RELEASE_GATE: '1'
	  steps:
	    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
	      with:
	        repository: artifact-keeper/artifact-keeper-test

	    - name: Install npm (real-flow uses the npm native client)
	      run: |
	        # The default `run` shell is `bash -e` without pipefail, so a
	        # failed NodeSource download would pipe an empty script into
	        # bash, "succeed", and only surface later as a confusing npm
	        # or apt error. pipefail makes the curl failure stop the step.
	        set -o pipefail
	        if ! command -v npm >/dev/null 2>&1; then
	          curl -fsSL https://deb.nodesource.com/setup_22.x | sudo -E bash - > /dev/null 2>&1
	          sudo apt-get install -y -qq nodejs > /dev/null
	        fi
	        # Doubles as the check that npm is actually on PATH.
	        npm --version

	    - name: Run real-flow smoke
	      run: |
	        mkdir -p "$JUNIT_OUTPUT_DIR"
	        chmod +x tests/release-gate/test-real-flow-smoke.sh
	        ./tests/release-gate/test-real-flow-smoke.sh

	    # always(): the JUnit output is wanted for passing and failing
	    # runs alike.
	    - name: Upload real-flow smoke results
	      if: always()
	      uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
	      with:
	        name: junit-real-flow-smoke
	        path: /tmp/test-results/
	        if-no-files-found: ignore

	# -------------------------------------------------------------------
	# Scan-completion gate (matrix across representative formats).
	#
	# Closes the gap surfaced by artifact-keeper#888 ("works for npm,
	# silently fails for docker"). Each matrix entry runs the gate
	# primitive in its own job (parallel) so a regression in any one
	# format's scanner pipeline fails the release-gate loud.
	#
	# The wired formats (currently: npm) run the lite primitive at
	# tests/security/test-scan-completes.sh via the release-gate wrapper.
	#
	# The matrix is intentionally restricted to formats whose fixtures
	# exist. Scaffolded formats (oci, maven, pypi, cargo, helm) are NOT
	# in the matrix because a green checkmark on an `exit 0` scaffold
	# is the same silent-success class the gate exists to prevent. The
	# scan-completion-gate-scaffolds-pending job below surfaces the
	# deferred formats as ::warning:: annotations so the gap is visible
	# without painting the dashboard with fake passes. When a fixture-
	# builder for a deferred format lands (#62), add the format to this
	# matrix in the same PR.
	#
	# Why a matrix rather than a single sequential driver:
	# - Each format scan can take 30-60s; sequential 6-format runs
	# blow the 5-min release-gate budget.
	# - The workflow-level matrix surfaces per-format outcomes in the
	# GitHub Actions UI so an operator can see "oci failed, npm
	# passed" at a glance.
	# -------------------------------------------------------------------
	scan-completion-gate:
	  needs: deploy
	  runs-on: ak-e2e-runners
	  timeout-minutes: 8
	  strategy:
	    # One format failing must not cancel the others; each matrix
	    # entry reports its own outcome.
	    fail-fast: false
	    matrix:
	      format: [npm]
	  env:
	    BASE_URL: ${{ needs.deploy.outputs.backend_url }}
	    RUN_ID: ${{ needs.deploy.outputs.run_id }}
	    ADMIN_PASS: TestRunner!2026secure
	    JUNIT_OUTPUT_DIR: /tmp/test-results
	    # RELEASE_GATE=1 turns common.sh's skip_suite() into a hard fail.
	    # ALLOW_SCANNER_SKIP=0 is enforced inside the gate primitive --
	    # a scanner-pod-down skip in release-gate context is exactly the
	    # silent-success class this gate exists to catch.
	    RELEASE_GATE: '1'
	    ALLOW_SCANNER_SKIP: '0'
	    FIXTURE_FORMAT: ${{ matrix.format }}
	  steps:
	    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
	      with:
	        repository: artifact-keeper/artifact-keeper-test

	    - name: Run scan-completion gate for ${{ matrix.format }}
	      run: |
	        mkdir -p "$JUNIT_OUTPUT_DIR"
	        chmod +x tests/release-gate/scan-completion-gate.sh
	        # Per-format RUN_ID suffix so concurrent matrix jobs do not
	        # collide on the repo key (scan-complete-<RUN_ID>). The
	        # override is scoped to this one command; FIXTURE_FORMAT is
	        # the job-level copy of matrix.format.
	        RUN_ID="${RUN_ID}-${FIXTURE_FORMAT}" \
	          ./tests/release-gate/scan-completion-gate.sh

	    - name: Capture scanner pod logs on failure
	      if: failure()
	      run: |
	        # The deploy job exports the test namespace; resolve it via
	        # the standard naming pattern from the job-level RUN_ID (the
	        # unsuffixed deploy run id). kubectl is best-effort: every
	        # call ends in `|| true`, so a runner pod without kubeconfig
	        # does not add a second failure (the JUnit XML is still
	        # uploaded below).
	        NS="test-${RUN_ID}"
	        mkdir -p /tmp/test-logs
	        kubectl -n "$NS" logs -l app.kubernetes.io/component=scanner \
	          --tail=2000 > "/tmp/test-logs/scanner-${FIXTURE_FORMAT}.log" 2>&1 || true
	        # #1379: chart labels Trivy pods with app.kubernetes.io/component=trivy
	        # (app.kubernetes.io/name is the chart name "artifact-keeper", not the
	        # component). Use the component label so the log capture actually finds
	        # the pod when the gate is failing and the operator most needs the logs.
	        kubectl -n "$NS" logs -l app.kubernetes.io/component=trivy \
	          --tail=2000 > "/tmp/test-logs/trivy-${FIXTURE_FORMAT}.log" 2>&1 || true
	        kubectl -n "$NS" logs -l app=artifact-keeper-backend \
	          --tail=1000 > "/tmp/test-logs/backend-${FIXTURE_FORMAT}.log" 2>&1 || true

	    - name: Upload scan-completion gate results
	      if: always()
	      uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
	      with:
	        name: junit-scan-completion-${{ matrix.format }}
	        path: |
	          /tmp/test-results/
	          /tmp/test-logs/
	        if-no-files-found: ignore

	# -------------------------------------------------------------------
	# Scaffolds-pending sentinel (#62).
	#
	# This job exists ONLY to surface the deferred formats as a
	# ::warning:: annotation on every release-gate run so the gap is
	# visible. It does NOT run any test. It is NOT a required gate.
	# Deliberately a separate job (not a matrix step) so the GitHub
	# Actions UI shows ONE warning row rather than five green
	# checkmarks that imply broader format coverage than exists.
	#
	# When a fixture-builder for one of the listed formats lands,
	# remove the format from this list and add it to the matrix above.
	# -------------------------------------------------------------------
	scan-completion-gate-scaffolds-pending:
	  needs: deploy
	  runs-on: ak-e2e-runners
	  timeout-minutes: 2
	  steps:
	    - name: Emit scaffolds-pending warnings
	      run: |
	        # One ::warning:: workflow command per deferred format. Keep
	        # this list in sync with the scan-completion-gate matrix.
	        for fmt in oci maven pypi cargo helm; do
	          echo "::warning title=scan-completion gate: ${fmt} fixture missing::No scan-completion fixture exists for ${fmt}; the silent-success class (#888) is NOT covered for this format. Tracked under artifact-keeper-test#62."
	        done
	        # Echo to the runner log too so the operator sees this even
	        # without expanding the annotations pane.
	        echo ""
	        echo "Scan-completion format coverage:"
	        echo " wired: npm"
	        echo " deferred: oci, maven, pypi, cargo, helm (artifact-keeper-test#62)"

	# -------------------------------------------------------------------
	# SBOM correctness gate (scaffold).
	#
	# Pins the SBOM endpoint contract (POST /api/v1/sbom returns 200
	# with the documented SbomResponse shape) so an endpoint deletion
	# or 5xx regression fails the release loud. The component_count > 0
	# assertion is deferred to artifact-keeper#903 (--list-all-pkgs)
	# per the #57 epic.
	# -------------------------------------------------------------------
	sbom-correctness-gate:
	  needs: deploy
	  runs-on: ak-e2e-runners
	  timeout-minutes: 6
	  env:
	    BASE_URL: ${{ needs.deploy.outputs.backend_url }}
	    RUN_ID: ${{ needs.deploy.outputs.run_id }}
	    ADMIN_PASS: TestRunner!2026secure
	    JUNIT_OUTPUT_DIR: /tmp/test-results
	    RELEASE_GATE: '1'
	  steps:
	    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
	      with:
	        repository: artifact-keeper/artifact-keeper-test

	    - name: Run SBOM correctness gate
	      run: |
	        mkdir -p "$JUNIT_OUTPUT_DIR"
	        chmod +x tests/release-gate/sbom-correctness-gate.sh
	        ./tests/release-gate/sbom-correctness-gate.sh

	    - name: Capture scanner pod logs on failure
	      if: failure()
	      run: |
	        # Best-effort capture: every kubectl call ends in `|| true`
	        # so a missing pod or kubeconfig does not add a second
	        # failure on top of the gate's own.
	        NS="test-${RUN_ID}"
	        mkdir -p /tmp/test-logs
	        kubectl -n "$NS" logs -l app.kubernetes.io/component=scanner \
	          --tail=2000 > /tmp/test-logs/sbom-scanner.log 2>&1 || true
	        # Mirrors the scan-completion gate (#1379): the chart labels
	        # Trivy pods with app.kubernetes.io/component=trivy, so the
	        # "scanner" selector alone never collects Trivy's logs.
	        kubectl -n "$NS" logs -l app.kubernetes.io/component=trivy \
	          --tail=2000 > /tmp/test-logs/sbom-trivy.log 2>&1 || true
	        # NOTE(review): `app=artifact-keeper-backend` does not follow
	        # the app.kubernetes.io/* labelling described in #1379;
	        # confirm this selector actually matches the backend pods.
	        kubectl -n "$NS" logs -l app=artifact-keeper-backend \
	          --tail=1000 > /tmp/test-logs/sbom-backend.log 2>&1 || true

	    - name: Upload SBOM gate results
	      if: always()
	      uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
	      with:
	        name: junit-sbom-correctness
	        path: |
	          /tmp/test-results/
	          /tmp/test-logs/
	        if-no-files-found: ignore

	# -------------------------------------------------------------------
	# Pinned-CVE assertion (#64).
	#
	# Tightens the gate beyond "findings_count >= 1" to a specific CVE
	# id. Catches scanner-DB drift and parser-correctness regressions
	# that the findings-count check alone would let through.
	# -------------------------------------------------------------------
	pinned-cve-gate:
	  needs: deploy
	  runs-on: ak-e2e-runners
	  timeout-minutes: 8
	  env:
	    BASE_URL: ${{ needs.deploy.outputs.backend_url }}
	    RUN_ID: ${{ needs.deploy.outputs.run_id }}
	    ADMIN_PASS: TestRunner!2026secure
	    JUNIT_OUTPUT_DIR: /tmp/test-results
	    RELEASE_GATE: '1'
	    # CVE-2019-10744 (lodash 4.17.4 prototype pollution) is what the
	    # lite fixture pins. log4j 2.14.0 / CVE-2021-44228 will move
	    # under the oci matrix entry once the oci fixture-builder lands
	    # (#62 + #64 extension).
	    EXPECTED_VULN_CVE: CVE-2019-10744
	    # Trivy DB age threshold for the freshness pre-flight. The DB is
	    # rebuilt every 6 hours upstream; we accept anything <= 14 days
	    # so weekend gaps and slow mirror sync do not generate false
	    # failures, while still catching DBs old enough that
	    # CVE-2019-10744 (published 2019) could fall out of recent index
	    # shards. Override via repo variable TRIVY_DB_MAX_AGE_DAYS if
	    # needed.
	    TRIVY_DB_MAX_AGE_DAYS: ${{ vars.TRIVY_DB_MAX_AGE_DAYS || '14' }}
	  steps:
	    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
	      with:
	        repository: artifact-keeper/artifact-keeper-test

	    - name: Install kubectl
	      uses: azure/setup-kubectl@829323503d1be3d00ca8346e5391ca0b07a9ab0d # v5.1.0

	    - name: Trivy DB freshness pre-flight
	      # Distinguish "CVE database is stale" from "CVE-2019-10744 not
	      # surfaced". Without this step a stale Trivy DB makes the
	      # pinned-CVE assertion fail with a misleading "CVE not found"
	      # message that blames the gate, when the actual root cause is
	      # upstream DB staleness.
	      #
	      # Exit codes:
	      #   0  - DB reachable, age within TRIVY_DB_MAX_AGE_DAYS
	      #   42 - DB too old (distinct so the rollup operator can tell
	      #        scanner DB drift apart from a real CVE-detection bug)
	      #   43 - Trivy pod not found, `trivy --version` returned
	      #        nothing, or UpdatedAt could not be parsed
	      run: |
	        set -uo pipefail
	        # Job-level RUN_ID is the deploy job's run id; the namespace
	        # follows the standard "test-<run id>" pattern.
	        NS="test-${RUN_ID}"
	        MAX_AGE_DAYS="${TRIVY_DB_MAX_AGE_DAYS}"
	        MAX_AGE_SECONDS=$(( MAX_AGE_DAYS * 86400 ))

	        # Locate the Trivy pod. The Helm chart names the deployment
	        # artifact-keeper-trivy; we kubectl-exec into the first pod
	        # the selector returns (readiness is not checked here; the
	        # deploy job already waited for the rollout).
	        #
	        # Label selector note (#1379): the chart's _helpers.tpl labels
	        # every component with app.kubernetes.io/name=artifact-keeper
	        # (the chart name) and distinguishes components via
	        # app.kubernetes.io/component=<name>. Earlier revisions of this
	        # pre-flight queried `app.kubernetes.io/name=trivy`, which
	        # never matched, producing the misleading "Trivy pod not
	        # found" error even though the deploy job's
	        # `kubectl rollout status deployment/artifact-keeper-trivy`
	        # had already succeeded. The correct selector is
	        # `app.kubernetes.io/component=trivy`.
	        POD=$(kubectl -n "$NS" get pods \
	          -l app.kubernetes.io/component=trivy \
	          -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true)
	        if [ -z "$POD" ]; then
	          echo "::error::Trivy pod not found in namespace ${NS} (pre-flight cannot run)"
	          echo "::error::Checked selector: app.kubernetes.io/component=trivy"
	          echo "::error::Deploy job should have installed Trivy via values-test-full.yaml (trivy.enabled=true)."
	          kubectl -n "$NS" get pods --show-labels 2>&1 | head -40 || true
	          exit 43
	        fi

	        # Warm up the Trivy vulnerability DB before reading its
	        # UpdatedAt. Trivy 0.62 downloads the DB lazily on first scan,
	        # not at pod startup, so `trivy --version` immediately after
	        # rollout returns only `Version: <x>` with no `UpdatedAt:`
	        # line, which the parser below cannot handle (#197). Using
	        # `image --download-db-only` is idempotent: it no-ops if the
	        # DB is already present and within the upstream
	        # download-interval window.
	        #
	        # Each exec is tried with `-c trivy` first and then without a
	        # container name, in case the container is named differently.
	        echo "Warming up Trivy vulnerability DB..."
	        kubectl -n "$NS" exec "$POD" -c trivy -- \
	          trivy image --download-db-only --quiet >/dev/null 2>&1 \
	          || kubectl -n "$NS" exec "$POD" -- \
	            trivy image --download-db-only --quiet >/dev/null 2>&1 \
	          || echo "::warning::trivy image --download-db-only returned non-zero; will inspect --version anyway"

	        # trivy --version emits "Vulnerability DB: ... UpdatedAt: 2026-04-30 ..."
	        # in older versions and a YAML-ish block in newer versions.
	        # Parse both shapes.
	        VER_OUT=$(kubectl -n "$NS" exec "$POD" -c trivy -- trivy --version 2>/dev/null \
	          || kubectl -n "$NS" exec "$POD" -- trivy --version 2>/dev/null \
	          || true)
	        if [ -z "$VER_OUT" ]; then
	          echo "::error::trivy --version returned empty output from pod ${POD}"
	          exit 43
	        fi
	        echo "trivy --version output:"
	        echo "$VER_OUT"
	        echo ""

	        # Extract UpdatedAt timestamp. Format varies:
	        #   "  UpdatedAt: 2026-04-30 12:34:56.789 +0000 UTC"
	        #   "  UpdatedAt 2026-04-30 12:34:56.789 +0000 UTC"
	        #
	        # The sed pattern is anchored with `.*` on both sides so
	        # everything before "UpdatedAt" (e.g. a "Vulnerability DB: ..."
	        # prefix on the single-line shape, or no prefix at all) and
	        # everything after the timestamp is discarded. A single `.`
	        # on each side only handled lines with one-character context.
	        #
	        # Pipeline hardening (#197): grep's no-match exit code (1)
	        # used to propagate via `set -o pipefail` and the workflow
	        # shell's implicit `set -e`, killing the script before the
	        # `if [ -z "$DB_DATE" ]` guard below could surface a clean
	        # exit 43. The `|| true` here keeps `DB_DATE` empty on
	        # no-match so the guard does its job.
	        DB_DATE=$( { echo "$VER_OUT" | grep -iE 'UpdatedAt' || true; } | head -n1 \
	          | sed -E 's/.*UpdatedAt[: ]+([0-9-]+ [0-9:.]+).*/\1/' \
	          | awk '{print $1" "$2}')
	        if [ -z "$DB_DATE" ] || ! date -d "$DB_DATE" +%s >/dev/null 2>&1; then
	          echo "::error::Could not parse Trivy DB UpdatedAt from --version output."
	          echo "::error::Likely cause: Trivy DB download failed during warmup, or trivy --version output format changed."
	          echo "::error::Raw version output above for diagnosis."
	          exit 43
	        fi

	        # Both sides are evaluated in UTC (`-u`); DB_DATE itself
	        # carries no zone suffix after the extraction above.
	        DB_EPOCH=$(date -d "$DB_DATE" -u +%s)
	        NOW_EPOCH=$(date -u +%s)
	        AGE_SECONDS=$(( NOW_EPOCH - DB_EPOCH ))
	        AGE_DAYS=$(( AGE_SECONDS / 86400 ))

	        echo "Trivy DB UpdatedAt: ${DB_DATE} (age: ${AGE_DAYS} days, threshold: ${MAX_AGE_DAYS} days)"

	        if [ "$AGE_SECONDS" -gt "$MAX_AGE_SECONDS" ]; then
	          echo "::error::Trivy DB is ${AGE_DAYS} days old, exceeds threshold of ${MAX_AGE_DAYS} days."
	          echo "::error::The pinned-CVE assertion below would surface a misleading 'CVE-2019-10744 not found' failure;"
	          echo "::error::the actual root cause is upstream DB staleness. Refresh Trivy DB or bump the runner image."
	          exit 42
	        fi

	        echo "Trivy DB freshness OK."

	    - name: Run pinned-CVE gate
	      run: |
	        mkdir -p "$JUNIT_OUTPUT_DIR"
	        chmod +x tests/release-gate/test-pinned-cve.sh
	        ./tests/release-gate/test-pinned-cve.sh

	    - name: Upload pinned-CVE gate results
	      if: always()
	      uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
	      with:
	        name: junit-pinned-cve
	        path: /tmp/test-results/
	        if-no-files-found: ignore

	# -------------------------------------------------------------------
	# Format tests (8 parallel batches)
	# -------------------------------------------------------------------
	format-tests:
	needs: deploy
	if: inputs.test_suite == 'all' \|\| inputs.test_suite == 'formats'
	runs-on: ak-e2e-runners
	strategy:
	# One failing batch must not cancel the others; every batch reports.
	fail-fast: false
	matrix:
	# Each batch is one parallel job. `scripts` is a whitespace-separated
	# list of files under tests/formats/ that the batch runs in order.
	batch:
	- name: node
	scripts: "test-npm.sh test-npm-remote.sh test-vscode.sh"
	- name: python
	scripts: "test-pypi.sh test-pypi-native-client.sh test-pypi-remote.sh test-conda.sh test-huggingface.sh test-mlmodel.sh"
	- name: jvm
	scripts: "test-maven.sh test-maven-native-client.sh test-maven-remote.sh test-maven-virtual-snapshot.sh test-sbt.sh test-gradle-conformance.sh"
	- name: rust-go-swift
	scripts: "test-cargo.sh test-cargo-remote.sh test-go.sh test-swift.sh test-pub.sh"
	- name: system-packages
	scripts: "test-debian.sh test-rpm.sh test-alpine.sh test-opkg.sh"
	- name: containers
	scripts: "test-oci.sh test-oci-remote.sh test-docker-native-client.sh test-helm.sh test-incus.sh"
	- name: misc-native
	scripts: "test-terraform.sh test-composer.sh test-hex.sh test-rubygems.sh test-nuget.sh test-cocoapods.sh test-cran.sh"
	- name: generic-protocol
	scripts: "test-generic.sh test-generic-native-client.sh test-gitlfs.sh test-protobuf.sh test-bazel.sh test-conan.sh test-conan-auth.sh test-conan-recipes.sh test-conan-packages.sh test-conan-search.sh test-conan-revisions.sh test-conan-remote.sh test-conan-errors.sh test-conan-stress.sh test-ansible.sh test-p2.sh test-jetbrains.sh test-vagrant.sh test-wasm.sh test-puppet.sh test-chef.sh"
	env:
	BASE_URL: ${{ needs.deploy.outputs.backend_url }}
	RUN_ID: ${{ needs.deploy.outputs.run_id }}
	ADMIN_PASS: TestRunner!2026secure
	JUNIT_OUTPUT_DIR: /tmp/test-results
	# RELEASE_GATE=1 turns common.sh's skip_suite() into a hard fail.
	# A silently-skipped test in release-gate context is exactly the
	# silent-success class (#870/#871/#888) the gate exists to catch.
	RELEASE_GATE: '1'
	steps:
	- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
	with:
	repository: artifact-keeper/artifact-keeper-test

	- name: Install test dependencies
	env:
	# The batch name reaches the script through the environment rather
	# than being spliced into the shell source by expression
	# interpolation.
	BATCH: ${{ matrix.batch.name }}
	run: \|
	echo "Installing dependencies for batch: $BATCH"

	# Common: ensure zip is available (used by maven, swift, vscode, go)
	if ! command -v zip &>/dev/null; then
	sudo apt-get update -qq && sudo apt-get install -y -qq zip > /dev/null
	fi

	case "$BATCH" in
	node)
	if ! command -v npm &>/dev/null; then
	curl -fsSL https://deb.nodesource.com/setup_22.x \| sudo -E bash - > /dev/null 2>&1
	sudo apt-get install -y -qq nodejs > /dev/null
	fi
	;;
	python)
	if ! command -v python3 &>/dev/null; then
	sudo apt-get update -qq && sudo apt-get install -y -qq python3 python3-pip python3-setuptools python3-venv > /dev/null
	fi
	# python3-venv is required by test-pypi-native-client.sh; install
	# it even if python3 is already present, since the bundled
	# interpreter may have ensurepip stripped out.
	#
	# Probe ensurepip itself: `import venv` alone succeeds on
	# Debian/Ubuntu images that lack python3-venv, so a venv-only
	# check never triggers the install.
	if ! python3 -c 'import venv, ensurepip' &>/dev/null; then
	sudo apt-get update -qq && sudo apt-get install -y -qq python3-venv > /dev/null \|\| true
	fi
	;;
	jvm)
	# mvn is required by test-maven-native-client.sh; the suite is
	# auto-skipped if maven is missing, but we install it here so
	# the gate actually exercises native-client coverage.
	if ! command -v mvn &>/dev/null; then
	sudo apt-get update -qq && sudo apt-get install -y -qq maven > /dev/null \|\| true
	fi
	;;
	rust-go-swift)
	if ! command -v go &>/dev/null; then
	GO_VERSION="1.23.6"
	curl -sSL "https://go.dev/dl/go${GO_VERSION}.linux-amd64.tar.gz" \| sudo tar -C /usr/local -xz
	echo "/usr/local/go/bin" >> "$GITHUB_PATH"
	fi
	if ! command -v cargo &>/dev/null; then
	curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs \| sh -s -- -y --default-toolchain stable --profile minimal
	echo "$HOME/.cargo/bin" >> "$GITHUB_PATH"
	fi
	;;
	containers)
	# Helm is needed for test-helm.sh
	if ! command -v helm &>/dev/null; then
	curl -sSL https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 \| bash
	fi
	;;
	system-packages)
	# ar (from binutils) for Debian package assembly
	if ! command -v ar &>/dev/null; then
	sudo apt-get update -qq && sudo apt-get install -y -qq binutils > /dev/null
	fi
	;;
	esac

	- name: Run ${{ matrix.batch.name }} format tests
	env:
	# Whitespace-separated script list for this batch, passed through the
	# environment for the same reason as BATCH in the install step.
	BATCH_SCRIPTS: ${{ matrix.batch.scripts }}
	run: \|
	mkdir -p "$JUNIT_OUTPUT_DIR"
	exit_code=0
	# Unquoted on purpose: the list is split on whitespace. Every script
	# runs even after a failure; the step fails at the end if any did.
	for script in $BATCH_SCRIPTS; do
	echo "=== Running ${script} ==="
	if ! bash "tests/formats/${script}"; then
	echo "FAILED: ${script}"
	exit_code=1
	fi
	done
	exit $exit_code

	- name: Upload test results
	if: always()
	uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
	with:
	name: junit-formats-${{ matrix.batch.name }}
	path: /tmp/test-results/*.xml
	if-no-files-found: ignore

	# -------------------------------------------------------------------
	# Security tests
	#
	# cache-poisoning + cache-stampede boot a Python mock upstream on the
	# runner pod and need the backend to dial back to the runner pod.
	# We resolve the runner's pod IP at runtime and export the bare IP as
	# MOCK_UPSTREAM_HOSTNAME. The cluster-DNS pod-DNS form
	# (10-1-2-3.<ns>.pod.cluster.local) is deliberately NOT used, because
	# resolving it depends on the CoreDNS `pods` plugin mode.
	#
	# PROXY_MAX_CONCURRENT_FETCHES / PROXY_QUEUE_TIMEOUT_SECS pin the
	# values the test asserts against so chart-default drift doesn't
	# silently make the assertion measure the wrong limit. They MUST match
	# the values the deployed backend was configured with.
	# -------------------------------------------------------------------
	security-tests:
	needs: deploy
	# continue-on-error: test-scan-completes.sh asserts on Grype scanner
	# finishing with findings. Grype on the v1.1.x backend image fails
	# deterministically because the vulnerability DB is not pre-seeded
	# in the Dockerfile and our network-restricted ARC runner pods can't
	# fetch grype.anchore.io at scan time. The quality gate is
	# LAST-scanner-wins (policy_service reads LIMIT 1 ORDER BY created_at
	# DESC), so Trivy success satisfies block_unscanned and the practical
	# security posture is unaffected. Tracked for fix in v1.1.10:
	# artifact-keeper#1001 (pre-seed Grype DB in Dockerfile).
	#
	# NOTE(review): the flag below is job-wide, so while it is set a
	# failure in ANY script of this suite -- not only
	# test-scan-completes.sh -- leaves the workflow run green. The other
	# 44 security tests still run and emit JUnit, but they only gate the
	# release if something downstream inspects those results. Confirm,
	# and drop the flag once artifact-keeper#1001 lands.
	continue-on-error: true
	# Runs for the full gate or when only the security suite is requested.
	if: inputs.test_suite == 'all' \|\| inputs.test_suite == 'security'
	runs-on: ak-e2e-runners
	timeout-minutes: 20
	env:
	BASE_URL: ${{ needs.deploy.outputs.backend_url }}
	RUN_ID: ${{ needs.deploy.outputs.run_id }}
	ADMIN_PASS: TestRunner!2026secure
	JUNIT_OUTPUT_DIR: /tmp/test-results
	# RELEASE_GATE=1 turns common.sh's skip_suite() into a hard fail.
	# A silently-skipped test in release-gate context is exactly the
	# silent-success class (#870/#871/#888) the gate exists to catch.
	RELEASE_GATE: '1'
	# Per-script timeout for run-suite.sh. Several Epic 2 tests
	# (cve-history, license-policy, scan-policy, quality-gate-blocks-upload,
	# scan-dedup-checksum) poll for scan completion with default
	# SCAN_TIMEOUT=180. The default 120s wrapper would SIGKILL them before
	# they could write JUnit XML. 300s gives 120s headroom over SCAN_TIMEOUT
	# for fixture build, upload, and cleanup.
	TEST_TIMEOUT: '300'
	# Stampede / poisoning test knobs. Must match the values the chart
	# rendered for the backend Deployment (see helm values-test.yaml).
	PROXY_MAX_CONCURRENT_FETCHES: '20'
	PROXY_QUEUE_TIMEOUT_SECS: '5'
	STAMPEDE_UPSTREAM_DELAY_MS: '2000'
	# AK_BACKEND_BRANCH for the feature-flag layer (issue #65). The
	# security suite hosts test-feature-flag-drift.sh which is the
	# truth-side check that AK_FEATURES matches the deployed
	# backend's reported version. See pullthrough-tests env for the
	# rationale on how this maps from inputs.backend_tag.
	# Mapping: tags starting with `1.1.` -> release/1.1.x, tags starting
	# with `1.2.` -> release/1.2.x, anything else -> main.
	AK_BACKEND_BRANCH: ${{ startsWith(inputs.backend_tag, '1.1.') && 'release/1.1.x' \|\| (startsWith(inputs.backend_tag, '1.2.') && 'release/1.2.x' \|\| 'main') }}
	steps:
	- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
	with:
	repository: artifact-keeper/artifact-keeper-test

	- name: Resolve runner pod address for backend dial-back
	# NOTE(review): nothing visible in this job reads steps.mock-host
	# outputs; the address is handed to later steps through GITHUB_ENV.
	id: mock-host
	run: \|
	# The runner is a Pod inside the cluster; its pod IP is reachable
	# from the backend Pod over the cluster network. We pass the bare
	# IP as MOCK_UPSTREAM_HOSTNAME so the backend's upstream-URL
	# resolver does not need cluster DNS to be configured for the
	# `<ip-dashed>.<ns>.pod.cluster.local` form (which depends on
	# CoreDNS `pods` plugin mode).
	#
	# ARC runners with `spec.template.spec.containers[].env.POD_IP`
	# via the downward API populate $POD_IP. We fall back to
	# `hostname -i` if the env var is missing.
	POD_IP="${POD_IP:-$(hostname -i 2>/dev/null \| awk '{print $1}')}"
	if [ -z "$POD_IP" ]; then
	echo "ERROR: could not determine runner pod IP for mock dial-back" >&2
	exit 1
	fi
	# Sanity: must look like an IPv4. Reject 127.* (loopback would
	# only be reachable from inside the runner pod itself, not from
	# the backend pod across the cluster network).
	if ! echo "$POD_IP" \| grep -Eq '^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$'; then
	echo "ERROR: POD_IP '${POD_IP}' is not an IPv4 address" >&2
	exit 1
	fi
	if echo "$POD_IP" \| grep -Eq '^127\.'; then
	echo "ERROR: POD_IP '${POD_IP}' is loopback; backend pod cannot reach this" >&2
	exit 1
	fi
	echo "Runner pod IP: ${POD_IP}"
	echo "MOCK_UPSTREAM_HOSTNAME=${POD_IP}" >> "$GITHUB_ENV"

	- name: Run security tests
	run: \|
	mkdir -p "$JUNIT_OUTPUT_DIR"
	chmod +x scripts/run-suite.sh
	./scripts/run-suite.sh --suite security --run-id "${RUN_ID}"

	- name: Upload test results
	# always(): results are published even when the suite step failed.
	if: always()
	uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
	with:
	name: junit-security
	# Directory upload (not *.xml glob) so per-test diagnostic
	# JSON breadcrumbs (e.g. scan-completes-final-resp.json) reach
	# the operator. With *.xml glob the dump dies silently and the
	# gate's failure rendering is a one-line message attribute.
	path: /tmp/test-results/
	if-no-files-found: ignore

	# -------------------------------------------------------------------
	# Pull-through cache reliability tests (epic #69 cluster D, v1.1.9).
	#
	# Lives in its own job (not merged into repo-tests or security-tests)
	# for three reasons:
	#
	# 1. The cross-format shadowing-guard test takes ~60s on its own
	# because it cycles through six format handlers. Folding that
	# into repo-tests would dominate the suite's wall-clock.
	#
	# 2. The cache-ttl tests need stable TTL plumbing on the backend;
	# a regression in only this surface should fail this job
	# without blocking the other 20 jobs.
	#
	# 3. RELEASE_GATE=1 is set: silent skips here are the exact
	# silent-success class (#888) this cluster exists to catch.
	# -------------------------------------------------------------------
	pullthrough-tests:
	runs-on: ak-e2e-runners
	timeout-minutes: 15
	needs: deploy
	if: inputs.test_suite == 'pullthrough' \|\| inputs.test_suite == 'all'
	env:
	# Branch hint for the feature-flag layer in
	# tests/lib/feature-flags.sh (issue #65), derived from the backend
	# image tag: tags starting with `1.1.` (e.g. `1.1.9`, `1.1.10-rc.2`)
	# map to release/1.1.x, tags starting with `1.2.` map to
	# release/1.2.x, and everything else (`latest`, `main`, ...) maps
	# to `main`.
	# NOTE(review): earlier wording described the unknown-tag fallback as
	# "the most-restrictive 1.1.x flag set", but the expression falls
	# back to `main`; confirm which is intended (see feature_flags_init).
	# A wrong guess is expected to surface in test-feature-flag-drift.sh,
	# which compares against the backend's actual /health version.
	AK_BACKEND_BRANCH: ${{ startsWith(inputs.backend_tag, '1.1.') && 'release/1.1.x' \|\| (startsWith(inputs.backend_tag, '1.2.') && 'release/1.2.x' \|\| 'main') }}
	# run-suite.sh per-script budget. SCAN_TIMEOUT defaults to 180s in
	# test-stuck-scan-janitor.sh and the scan-depth scripts, and
	# virtual-shadowing-guard.sh walks 6 formats serially, so the
	# default 120s budget would terminate polls mid-flight on a slow
	# backend and bury real signal under timeout failures. 300s matches
	# the security-tests / webhook-tests jobs.
	TEST_TIMEOUT: "300"
	# Hard-fail mode: with RELEASE_GATE=1, common.sh's skip_suite() fails
	# instead of skipping, so a test cannot pass the gate by silently
	# not running (the #870/#871/#888 silent-success class).
	RELEASE_GATE: "1"
	JUNIT_OUTPUT_DIR: /tmp/test-results
	ADMIN_PASS: 'TestRunner!2026secure'
	RUN_ID: ${{ needs.deploy.outputs.run_id }}
	BASE_URL: ${{ needs.deploy.outputs.backend_url }}
	steps:
	# Explicit checkout of the test repository; the suite runner below is
	# invoked from this checkout.
	- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
	with:
	repository: artifact-keeper/artifact-keeper-test

	- name: Run pull-through cache tests
	run: \|
	mkdir -p "$JUNIT_OUTPUT_DIR"
	chmod +x scripts/run-suite.sh
	./scripts/run-suite.sh --suite pullthrough --run-id "$RUN_ID"

	- name: Upload test results
	uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
	# always(): publish the results directory even when the suite failed.
	if: always()
	with:
	path: /tmp/test-results/
	name: junit-pullthrough
	if-no-files-found: ignore

	# -------------------------------------------------------------------
	# Compatibility tests
	# -------------------------------------------------------------------
	compatibility-tests:
	runs-on: ak-e2e-runners
	needs: deploy
	if: inputs.test_suite == 'compatibility' \|\| inputs.test_suite == 'all'
	env:
	# Hard-fail mode: with RELEASE_GATE=1, common.sh's skip_suite() fails
	# instead of skipping, so a test cannot pass the gate by silently
	# not running (the #870/#871/#888 silent-success class).
	RELEASE_GATE: "1"
	JUNIT_OUTPUT_DIR: /tmp/test-results
	ADMIN_PASS: 'TestRunner!2026secure'
	RUN_ID: ${{ needs.deploy.outputs.run_id }}
	BASE_URL: ${{ needs.deploy.outputs.backend_url }}
	steps:
	# Explicit checkout of the test repository; the suite runner below is
	# invoked from this checkout.
	- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
	with:
	repository: artifact-keeper/artifact-keeper-test

	- name: Run compatibility tests
	run: \|
	mkdir -p "$JUNIT_OUTPUT_DIR"
	chmod +x scripts/run-suite.sh
	./scripts/run-suite.sh --suite compatibility --run-id "$RUN_ID"

	- name: Upload test results
	uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
	# always(): publish whatever JUnit XML exists even when the suite failed.
	if: always()
	with:
	path: /tmp/test-results/*.xml
	name: junit-compatibility
	if-no-files-found: ignore

	# -------------------------------------------------------------------
	# Repository type tests (virtual, remote, CRUD, labels)
	# -------------------------------------------------------------------
	repo-tests:
	runs-on: ak-e2e-runners
	needs: deploy
	if: inputs.test_suite == 'repos' \|\| inputs.test_suite == 'all'
	env:
	# Hard-fail mode: with RELEASE_GATE=1, common.sh's skip_suite() fails
	# instead of skipping, so a test cannot pass the gate by silently
	# not running (the #870/#871/#888 silent-success class).
	RELEASE_GATE: "1"
	JUNIT_OUTPUT_DIR: /tmp/test-results
	ADMIN_PASS: 'TestRunner!2026secure'
	RUN_ID: ${{ needs.deploy.outputs.run_id }}
	BASE_URL: ${{ needs.deploy.outputs.backend_url }}
	steps:
	# Explicit checkout of the test repository; the suite runner below is
	# invoked from this checkout.
	- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
	with:
	repository: artifact-keeper/artifact-keeper-test

	- name: Run repo type tests
	run: \|
	mkdir -p "$JUNIT_OUTPUT_DIR"
	chmod +x scripts/run-suite.sh
	./scripts/run-suite.sh --suite repos --run-id "$RUN_ID"

	- name: Upload test results
	uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
	# always(): publish whatever JUnit XML exists even when the suite failed.
	if: always()
	with:
	path: /tmp/test-results/*.xml
	name: junit-repos
	if-no-files-found: ignore

	# -------------------------------------------------------------------
	# Promotion tests
	# -------------------------------------------------------------------
	promotion-tests:
	runs-on: ak-e2e-runners
	needs: deploy
	if: inputs.test_suite == 'promotion' \|\| inputs.test_suite == 'all'
	env:
	# Hard-fail mode: with RELEASE_GATE=1, common.sh's skip_suite() fails
	# instead of skipping, so a test cannot pass the gate by silently
	# not running (the #870/#871/#888 silent-success class).
	RELEASE_GATE: "1"
	JUNIT_OUTPUT_DIR: /tmp/test-results
	ADMIN_PASS: 'TestRunner!2026secure'
	RUN_ID: ${{ needs.deploy.outputs.run_id }}
	BASE_URL: ${{ needs.deploy.outputs.backend_url }}
	steps:
	# Explicit checkout of the test repository; the suite runner below is
	# invoked from this checkout.
	- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
	with:
	repository: artifact-keeper/artifact-keeper-test

	- name: Run promotion tests
	run: \|
	mkdir -p "$JUNIT_OUTPUT_DIR"
	chmod +x scripts/run-suite.sh
	./scripts/run-suite.sh --suite promotion --run-id "$RUN_ID"

	- name: Upload test results
	uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
	# always(): publish whatever JUnit XML exists even when the suite failed.
	if: always()
	with:
	path: /tmp/test-results/*.xml
	name: junit-promotion
	if-no-files-found: ignore

	# -------------------------------------------------------------------
	# RBAC tests
	# -------------------------------------------------------------------
	rbac-tests:
	runs-on: ak-e2e-runners
	needs: deploy
	if: inputs.test_suite == 'rbac' \|\| inputs.test_suite == 'all'
	env:
	# Hard-fail mode: with RELEASE_GATE=1, common.sh's skip_suite() fails
	# instead of skipping, so a test cannot pass the gate by silently
	# not running (the #870/#871/#888 silent-success class).
	RELEASE_GATE: "1"
	JUNIT_OUTPUT_DIR: /tmp/test-results
	ADMIN_PASS: 'TestRunner!2026secure'
	RUN_ID: ${{ needs.deploy.outputs.run_id }}
	BASE_URL: ${{ needs.deploy.outputs.backend_url }}
	steps:
	# Explicit checkout of the test repository; the suite runner below is
	# invoked from this checkout.
	- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
	with:
	repository: artifact-keeper/artifact-keeper-test

	- name: Run RBAC tests
	run: \|
	mkdir -p "$JUNIT_OUTPUT_DIR"
	chmod +x scripts/run-suite.sh
	./scripts/run-suite.sh --suite rbac --run-id "$RUN_ID"

	- name: Upload test results
	uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
	# always(): publish whatever JUnit XML exists even when the suite failed.
	if: always()
	with:
	path: /tmp/test-results/*.xml
	name: junit-rbac
	if-no-files-found: ignore

	# -------------------------------------------------------------------
	# Lifecycle tests
	# -------------------------------------------------------------------
	lifecycle-tests:
	runs-on: ak-e2e-runners
	needs: deploy
	if: inputs.test_suite == 'lifecycle' \|\| inputs.test_suite == 'all'
	env:
	# Hard-fail mode: with RELEASE_GATE=1, common.sh's skip_suite() fails
	# instead of skipping, so a test cannot pass the gate by silently
	# not running (the #870/#871/#888 silent-success class).
	RELEASE_GATE: "1"
	JUNIT_OUTPUT_DIR: /tmp/test-results
	ADMIN_PASS: 'TestRunner!2026secure'
	RUN_ID: ${{ needs.deploy.outputs.run_id }}
	BASE_URL: ${{ needs.deploy.outputs.backend_url }}
	steps:
	# Explicit checkout of the test repository; the suite runner below is
	# invoked from this checkout.
	- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
	with:
	repository: artifact-keeper/artifact-keeper-test

	- name: Run lifecycle tests
	run: \|
	mkdir -p "$JUNIT_OUTPUT_DIR"
	chmod +x scripts/run-suite.sh
	./scripts/run-suite.sh --suite lifecycle --run-id "$RUN_ID"

	- name: Upload test results
	uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
	# always(): publish whatever JUnit XML exists even when the suite failed.
	if: always()
	with:
	path: /tmp/test-results/*.xml
	name: junit-lifecycle
	if-no-files-found: ignore

	# -------------------------------------------------------------------
	# Webhook tests
	# -------------------------------------------------------------------
	webhook-tests:
	runs-on: ak-e2e-runners
	needs: deploy
	if: inputs.test_suite == 'webhooks' \|\| inputs.test_suite == 'all'
	env:
	# run-suite.sh per-script budget. The webhook resilience tests poll
	# for retry / dead-letter behavior on schedules of up to 180s
	# (WEBHOOK_RETRY_TIMEOUT), so the wrapper timeout has to exceed that.
	TEST_TIMEOUT: "300"
	# Hard-fail mode: with RELEASE_GATE=1, common.sh's skip_suite() fails
	# instead of skipping, so a test cannot pass the gate by silently
	# not running (the #870/#871/#888 silent-success class).
	RELEASE_GATE: "1"
	JUNIT_OUTPUT_DIR: /tmp/test-results
	ADMIN_PASS: 'TestRunner!2026secure'
	RUN_ID: ${{ needs.deploy.outputs.run_id }}
	BASE_URL: ${{ needs.deploy.outputs.backend_url }}
	steps:
	# Explicit checkout of the test repository; the suite runner below is
	# invoked from this checkout.
	- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
	with:
	repository: artifact-keeper/artifact-keeper-test

	- name: Resolve runner pod address for backend dial-back
	id: receiver-host
	run: \|
	# The mock webhook receiver runs in this runner pod, while the
	# backend that calls it lives in a different pod. From the backend,
	# 127.0.0.1 is the backend pod itself, and loopback is additionally
	# a hard block in the backend's SSRF guard (#199, artifact-keeper
	# validation.rs:203-212). The receiver therefore has to be
	# advertised under the runner pod's RFC1918 IP.
	die() { echo "ERROR: $*" >&2; exit 1; }

	# Prefer an injected POD_IP; otherwise take the first `hostname -i` field.
	POD_IP="${POD_IP:-$(hostname -i 2>/dev/null \| awk '{print $1}')}"
	[ -n "$POD_IP" ] \|\| die "could not determine runner pod IP"
	echo "$POD_IP" \| grep -Eq '^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$' \|\| die "POD_IP '${POD_IP}' is not an IPv4 address"
	if echo "$POD_IP" \| grep -Eq '^127\.'; then
	die "POD_IP '${POD_IP}' is loopback; backend pod cannot reach this"
	fi

	echo "Runner pod IP: ${POD_IP}"
	# Exported for the following steps of this job.
	echo "WEBHOOK_RECEIVER_HOST=${POD_IP}" >> "$GITHUB_ENV"

	- name: Run webhook tests
	run: \|
	mkdir -p "$JUNIT_OUTPUT_DIR"
	chmod +x scripts/run-suite.sh
	./scripts/run-suite.sh --suite webhooks --run-id "$RUN_ID"

	- name: Upload test results
	uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
	# always(): publish whatever JUnit XML exists even when the suite failed.
	if: always()
	with:
	path: /tmp/test-results/*.xml
	name: junit-webhooks
	if-no-files-found: ignore

	# -------------------------------------------------------------------
	# Search tests
	# -------------------------------------------------------------------
	search-tests:
	runs-on: ak-e2e-runners
	needs: deploy
	if: inputs.test_suite == 'search' \|\| inputs.test_suite == 'all'
	env:
	# Hard-fail mode: with RELEASE_GATE=1, common.sh's skip_suite() fails
	# instead of skipping, so a test cannot pass the gate by silently
	# not running (the #870/#871/#888 silent-success class).
	RELEASE_GATE: "1"
	JUNIT_OUTPUT_DIR: /tmp/test-results
	ADMIN_PASS: 'TestRunner!2026secure'
	RUN_ID: ${{ needs.deploy.outputs.run_id }}
	BASE_URL: ${{ needs.deploy.outputs.backend_url }}
	steps:
	# Explicit checkout of the test repository; the suite runner below is
	# invoked from this checkout.
	- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
	with:
	repository: artifact-keeper/artifact-keeper-test

	- name: Run search tests
	run: \|
	mkdir -p "$JUNIT_OUTPUT_DIR"
	chmod +x scripts/run-suite.sh
	./scripts/run-suite.sh --suite search --run-id "$RUN_ID"

	- name: Upload test results
	uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
	# always(): publish whatever JUnit XML exists even when the suite failed.
	if: always()
	with:
	path: /tmp/test-results/*.xml
	name: junit-search
	if-no-files-found: ignore

	# -------------------------------------------------------------------
	# Platform tests (signing, SBOM, curation, labels, audit, backup)
	#
	# Serialized after auth-tests to avoid cross-suite admin-JWT
	# contamination. test-admin-password-recovery.sh changes the admin
	# password, which calls the backend's change_password handler
	# (backend/src/api/handlers/users.rs) and triggers
	# invalidate_user_tokens(admin_id). That writes the admin's UUID
	# into the global CREDENTIAL_INVALIDATIONS map in
	# backend/src/services/auth_service.rs, so any auth-tests step
	# running in parallel that uses the admin JWT starts getting 401s
	# mid-suite once its cached token validation flips to "rejected".
	# See #137. admin-tests is serialized for the same reason and on the
	# same job ordering (see the comment block above the admin-tests
	# job definition below).
	# -------------------------------------------------------------------
	platform-tests:
	# auth-tests is listed only to serialize this suite after it (#137);
	# deploy is the prerequisite that provides the outputs used below.
	needs: [deploy, auth-tests]
	# A job whose `needs` entry was skipped is itself skipped unless its
	# condition uses a status-check function. Requesting only the
	# `platform` suite skips auth-tests, so a plain suite comparison
	# here would leave this job skipped as well and the run would test
	# nothing. The explicit form keeps the semantics otherwise
	# unchanged: deploy must have succeeded, and auth-tests must have
	# either succeeded or been skipped.
	if: \|
	!cancelled() &&
	needs.deploy.result == 'success' &&
	(needs.auth-tests.result == 'success' \|\| needs.auth-tests.result == 'skipped') &&
	(inputs.test_suite == 'all' \|\| inputs.test_suite == 'platform')
	runs-on: ak-e2e-runners
	env:
	BASE_URL: ${{ needs.deploy.outputs.backend_url }}
	RUN_ID: ${{ needs.deploy.outputs.run_id }}
	ADMIN_PASS: TestRunner!2026secure
	JUNIT_OUTPUT_DIR: /tmp/test-results
	# RELEASE_GATE=1 turns common.sh's skip_suite() into a hard fail.
	# A silently-skipped test in release-gate context is exactly the
	# silent-success class (#870/#871/#888) the gate exists to catch.
	RELEASE_GATE: '1'
	steps:
	- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
	with:
	repository: artifact-keeper/artifact-keeper-test

	- name: Run platform tests
	run: \|
	mkdir -p "$JUNIT_OUTPUT_DIR"
	chmod +x scripts/run-suite.sh
	./scripts/run-suite.sh --suite platform --run-id "${RUN_ID}"

	- name: Upload test results
	# always(): publish whatever JUnit XML exists even when the suite failed.
	if: always()
	uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
	with:
	name: junit-platform
	path: /tmp/test-results/*.xml
	if-no-files-found: ignore

	# -------------------------------------------------------------------
	# Auth tests (tokens, TOTP, rate limiting)
	# -------------------------------------------------------------------
	auth-tests:
	runs-on: ak-e2e-runners
	needs: deploy
	if: inputs.test_suite == 'auth' \|\| inputs.test_suite == 'all'
	env:
	# Hard-fail mode: with RELEASE_GATE=1, common.sh's skip_suite() fails
	# instead of skipping, so a test cannot pass the gate by silently
	# not running (the #870/#871/#888 silent-success class).
	RELEASE_GATE: "1"
	JUNIT_OUTPUT_DIR: /tmp/test-results
	ADMIN_PASS: 'TestRunner!2026secure'
	RUN_ID: ${{ needs.deploy.outputs.run_id }}
	BASE_URL: ${{ needs.deploy.outputs.backend_url }}
	steps:
	# Explicit checkout of the test repository; the suite runner below is
	# invoked from this checkout.
	- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
	with:
	repository: artifact-keeper/artifact-keeper-test

	- name: Run auth tests
	run: \|
	mkdir -p "$JUNIT_OUTPUT_DIR"
	chmod +x scripts/run-suite.sh
	./scripts/run-suite.sh --suite auth --run-id "$RUN_ID"

	- name: Upload test results
	uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
	# always(): publish whatever JUnit XML exists even when the suite failed.
	if: always()
	with:
	path: /tmp/test-results/*.xml
	name: junit-auth
	if-no-files-found: ignore

	# -------------------------------------------------------------------
	# Admin tests (Epic 10, #77): operational admin endpoints
	# livez, backup execute/cancel/delete, monitoring alerts,
	# storage backends listing, reindex trigger.
	#
	# Serialized after auth-tests (same reason as platform-tests, see #137):
	# test-admin-password-recovery.sh in platform-tests changes the admin
	# password via change_password (backend/src/api/handlers/users.rs),
	# which writes the admin's UUID into the global CREDENTIAL_INVALIDATIONS
	# map (backend/src/services/auth_service.rs). Any suite using the admin
	# JWT that runs concurrently with auth-tests' password-change paths
	# starts getting 401s mid-suite once its cached token validation flips
	# to "rejected". admin-tests reuses ADMIN_PASS and exercises the admin
	# JWT on every endpoint it hits, so it has the same exposure as
	# platform-tests and gets the same serialization.
	# -------------------------------------------------------------------
	admin-tests:
	# auth-tests is listed only to serialize this suite after it (#137);
	# deploy is the prerequisite that provides the outputs used below.
	# NOTE(review): this job and platform-tests both wait on auth-tests
	# but are not ordered relative to each other. If the
	# password-changing script runs inside platform-tests, the same
	# admin-JWT invalidation exposure may exist between these two jobs;
	# confirm.
	needs: [deploy, auth-tests]
	# A job whose `needs` entry was skipped is itself skipped unless its
	# condition uses a status-check function. Requesting only the
	# `admin` suite skips auth-tests, so a plain suite comparison here
	# would leave this job skipped as well. The explicit form keeps the
	# semantics otherwise unchanged: deploy must have succeeded, and
	# auth-tests must have either succeeded or been skipped.
	# NOTE(review): `admin` is not among the workflow_dispatch
	# `test_suite` choices, so on its own this suite can only be
	# requested through workflow_call.
	if: \|
	!cancelled() &&
	needs.deploy.result == 'success' &&
	(needs.auth-tests.result == 'success' \|\| needs.auth-tests.result == 'skipped') &&
	(inputs.test_suite == 'all' \|\| inputs.test_suite == 'admin')
	runs-on: ak-e2e-runners
	env:
	BASE_URL: ${{ needs.deploy.outputs.backend_url }}
	RUN_ID: ${{ needs.deploy.outputs.run_id }}
	# Namespace of the deployment under test; used by the kubectl wait.
	NAMESPACE: ${{ needs.deploy.outputs.namespace }}
	ADMIN_PASS: TestRunner!2026secure
	JUNIT_OUTPUT_DIR: /tmp/test-results
	RELEASE_GATE: '1'
	steps:
	- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
	with:
	repository: artifact-keeper/artifact-keeper-test

	- name: Install kubectl
	uses: azure/setup-kubectl@829323503d1be3d00ca8346e5391ca0b07a9ab0d # v5.1.0

	- name: Wait for OpenSearch ready
	# test-reindex.sh hits POST /api/v1/admin/reindex which requires
	# OpenSearch to be ready. Without this wait the endpoint can 404
	# (route not yet mounted on the backend's OpenSearch-dependent
	# path) or 503 and the test was previously soft-skipping on 404,
	# passing vacuously under RELEASE_GATE=1. Wait up to 120s for the
	# OpenSearch pod to be Ready before running the suite.
	run: \|
	# Try the bitnami chart label first, then app=opensearch as a
	# fallback for charts that use the legacy label scheme. If the
	# selector matches no pods (e.g. opensearch.enabled=false in this
	# build) the wait is a no-op and the suite proceeds, which is
	# the right behavior for builds that don't ship OpenSearch.
	set -e
	if kubectl -n "$NAMESPACE" get pod -l app.kubernetes.io/name=opensearch -o name 2>/dev/null \| grep -q pod/; then
	kubectl -n "$NAMESPACE" wait --for=condition=Ready pod \
	-l app.kubernetes.io/name=opensearch --timeout=120s
	elif kubectl -n "$NAMESPACE" get pod -l app=opensearch -o name 2>/dev/null \| grep -q pod/; then
	kubectl -n "$NAMESPACE" wait --for=condition=Ready pod \
	-l app=opensearch --timeout=120s
	else
	echo "No OpenSearch pods found in ${NAMESPACE}; skipping wait."
	fi

	- name: Run admin tests
	run: \|
	mkdir -p "$JUNIT_OUTPUT_DIR"
	chmod +x scripts/run-suite.sh
	./scripts/run-suite.sh --suite admin --run-id "${RUN_ID}"

	- name: Upload test results
	# always(): publish whatever JUnit XML exists even when the suite failed.
	if: always()
	uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
	with:
	name: junit-admin
	path: /tmp/test-results/*.xml
	if-no-files-found: ignore

	# -------------------------------------------------------------------
	# Stress tests (start once the listed functional suites have finished,
	# whether or not they passed; only the deploy job must have succeeded)
	#
	# continue-on-error: stress tests measure backend behavior under
	# sustained mixed-workload (auth + upload + download + list) on a
	# 2 CPU test pod inside the namespace's 4 CPU / 8 Gi quota. Error-
	# rate variance is high on ARC runners (observed 22-54% across
	# otherwise-identical runs) because the bcrypt-bound auth path
	# saturates first and the worker count drives RPS up faster than
	# the pod can absorb. The test still produces JUnit + run logs so
	# regressions are visible, but a single failed run does not block
	# the release gate. Real perf regressions are caught by dedicated
	# benchmark workflows on Rocky, not by this CI smoke gate.
	# See artifact-keeper#991 for v1.1.x auth-path perf investigation.
	# -------------------------------------------------------------------
	stress-tests:
	  runs-on: ak-e2e-runners
	  # Scheduled behind deploy and the functional suites named below.
	  needs:
	    - deploy
	    - format-tests
	    - repo-tests
	    - promotion-tests
	    - rbac-tests
	    - lifecycle-tests
	    - webhook-tests
	    - search-tests
	    - platform-tests
	    - auth-tests
	    - security-tests
	    - compatibility-tests
	  # always() keeps this job eligible even when an upstream suite failed
	  # or was skipped; it still requires a successful deploy and a matching
	  # test_suite selection.
	  if: >-
	    always() &&
	    needs.deploy.result == 'success' &&
	    (inputs.test_suite == 'all' || inputs.test_suite == 'stress')
	  # A failing stress run must not fail the workflow run.
	  continue-on-error: true
	  env:
	    BASE_URL: ${{ needs.deploy.outputs.backend_url }}
	    RUN_ID: ${{ needs.deploy.outputs.run_id }}
	    NAMESPACE: ${{ needs.deploy.outputs.namespace }}
	    ADMIN_PASS: TestRunner!2026secure
	    JUNIT_OUTPUT_DIR: /tmp/test-results
	    # Destination for the per-request HTTP-code logs. The "Upload
	    # stress-test logs" step ships this directory as a workflow artifact,
	    # so a failed stress run can be debugged endpoint-by-endpoint rather
	    # than from aggregate error counts alone (artifact-keeper-test#138 /
	    # artifact-keeper#1088).
	    STRESS_LOG_DIR: /tmp/stress-logs
	    # Destination for the Postgres + pod-resource snapshot. The "Collect
	    # postgres stats" step runs tests/stress/collect-pg-stats.sh after the
	    # stress run, and the directory is uploaded as the stress-pg-stats
	    # artifact. This is the direct measurement requested by PR #148's
	    # Fresh-Eyes review (artifact-keeper-test#154): pg connection
	    # saturation + kubectl top, so the postgres-CPU narrative behind
	    # PR #140 is backed by evidence on every run.
	    PG_STATS_DIR: /tmp/pg-stats
	    # With RELEASE_GATE=1, common.sh's skip_suite() becomes a hard fail.
	    # A silently-skipped test in release-gate context is exactly the
	    # silent-success class (#870/#871/#888) the gate exists to catch.
	    RELEASE_GATE: '1'
	  steps:
	    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
	      with:
	        repository: artifact-keeper/artifact-keeper-test

	    - name: Run stress tests
	      run: |
	        mkdir -p "$JUNIT_OUTPUT_DIR"
	        mkdir -p "$STRESS_LOG_DIR"
	        chmod +x scripts/run-suite.sh
	        ./scripts/run-suite.sh --suite stress --run-id "${RUN_ID}"

	    - name: Summarize per-request status codes
	      if: always()
	      run: |
	        # Nothing to summarize when the log directory is missing or empty.
	        if [ ! -d "$STRESS_LOG_DIR" ] || [ -z "$(ls -A "$STRESS_LOG_DIR" 2>/dev/null)" ]; then
	          echo "No stress logs were emitted at ${STRESS_LOG_DIR}"
	          exit 0
	        fi

	        # Print how many rows of log file $2 have an http_code (column 5)
	        # matching the regex $1; prints 0 when none match.
	        tally() {
	          awk -v pat="$1" '$5 ~ pat {n++} END {print n+0}' "$2"
	        }

	        # Markdown table for the job summary: one row per *.log file.
	        {
	          echo "## Stress-test per-request status codes"
	          echo ""
	          echo "Per-request rows logged to artifact \`stress-request-logs\` at \`${STRESS_LOG_DIR}\`."
	          echo "Row format: \`<epoch_ms> <suite> <method> <endpoint> <http_code> <elapsed_ms>\`."
	          echo ""
	          echo "| Suite | Total | 2xx | 3xx | 4xx | 5xx | Timeouts (000) |"
	          echo "|-------|------:|----:|----:|----:|----:|---------------:|"
	          for logfile in "$STRESS_LOG_DIR"/*.log; do
	            [ -f "$logfile" ] || continue
	            name=$(basename "$logfile" .log)
	            rows=$(wc -l < "$logfile" | tr -d ' ')
	            ok=$(tally '^2[0-9][0-9]$' "$logfile")
	            redirects=$(tally '^3[0-9][0-9]$' "$logfile")
	            client_errors=$(tally '^4[0-9][0-9]$' "$logfile")
	            server_errors=$(tally '^5[0-9][0-9]$' "$logfile")
	            timeouts=$(tally '^000$' "$logfile")
	            echo "| ${name} | ${rows} | ${ok} | ${redirects} | ${client_errors} | ${server_errors} | ${timeouts} |"
	          done
	          echo ""
	        } >> "$GITHUB_STEP_SUMMARY"

	        # Mirror a short breakdown into the runner log so an operator can
	        # scan the job page without downloading the artifact first. Only
	        # the 10 most frequent non-2xx (code, method, endpoint) triples are
	        # shown per suite to keep the log readable.
	        echo "Top non-2xx endpoints per suite:"
	        for logfile in "$STRESS_LOG_DIR"/*.log; do
	          [ -f "$logfile" ] || continue
	          echo " $(basename "$logfile" .log):"
	          awk '$5 !~ /^2[0-9][0-9]$/ {print $5, $3, $4}' "$logfile" |
	            sort | uniq -c | sort -rn | head -n 10 |
	            sed 's/^/ /'
	        done

	    - name: Collect postgres stats
	      # Snapshots pg_stat_activity, pg_stat_statements, connection count
	      # vs max_connections, and kubectl top for the backend + postgres
	      # pods. Placed after the stress workload and before teardown so the
	      # capture reflects steady-state load. if: always() keeps the
	      # snapshot shipping on a failed run, which is exactly when the data
	      # is most useful (artifact-keeper-test#154; rationale in PR #148
	      # Fresh-Eyes review, Finding 2).
	      # NOTE(review): unlike resilience-tests and mesh-tests, this job has
	      # no "Install kubectl" step - confirm the runner image or the script
	      # provides kubectl.
	      if: always()
	      run: |
	        bash tests/stress/collect-pg-stats.sh

	    - name: Upload pg-stats snapshot
	      if: always()
	      uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
	      with:
	        name: stress-pg-stats-${{ github.run_attempt }}
	        path: /tmp/pg-stats/
	        if-no-files-found: ignore

	    - name: Upload test results
	      if: always()
	      uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
	      with:
	        name: junit-stress
	        path: /tmp/test-results/*.xml
	        if-no-files-found: ignore

	    - name: Upload stress-test logs
	      # Per-request HTTP-code rows from the stress-test workers, captured
	      # before namespace teardown so a 50%-error run can be debugged
	      # endpoint-by-endpoint (which path returned what code, how often)
	      # instead of from aggregate counters alone. Investigating
	      # artifact-keeper#1088 (POST /repositories DB contention under
	      # load) would have been faster with this artifact in hand.
	      if: always()
	      uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
	      with:
	        name: stress-request-logs
	        path: /tmp/stress-logs/
	        if-no-files-found: ignore

	# -------------------------------------------------------------------
	# Resilience tests (after stress completes)
	#
	# Run regardless of stress-tests outcome. Resilience tests target
	# crash recovery, network partition, storage failures, etc., which
	# are independent of the bcrypt/auth saturation that stress-tests
	# measures. Skipping resilience because stress hit its error-rate
	# threshold loses signal on a different failure class.
	# -------------------------------------------------------------------
	resilience-tests:
	  needs: [deploy, stress-tests]
	  # always() lets this job run whatever stress-tests concluded; it still
	  # requires a successful deploy and a matching test_suite selection.
	  if: |
	    always() &&
	    needs.deploy.result == 'success' &&
	    (inputs.test_suite == 'all' || inputs.test_suite == 'resilience')
	  runs-on: ak-e2e-runners
	  strategy:
	    # One leg per category; a failing leg does not cancel the others.
	    fail-fast: false
	    matrix:
	      category: [crash, restart, network, storage, data]
	  env:
	    BASE_URL: ${{ needs.deploy.outputs.backend_url }}
	    RUN_ID: ${{ needs.deploy.outputs.run_id }}
	    ADMIN_PASS: TestRunner!2026secure
	    NAMESPACE: ${{ needs.deploy.outputs.namespace }}
	    JUNIT_OUTPUT_DIR: /tmp/test-results
	  steps:
	    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
	      with:
	        repository: artifact-keeper/artifact-keeper-test

	    - name: Install kubectl
	      uses: azure/setup-kubectl@829323503d1be3d00ca8346e5391ca0b07a9ab0d # v5.1.0

	    - name: Run ${{ matrix.category }} resilience tests
	      # Step-level continue-on-error: a non-zero exit marks this step as
	      # failed in the UI but does not fail the job.
	      continue-on-error: true
	      env:
	        # Handed to the script as an environment variable rather than
	        # being expanded into the script text.
	        CATEGORY: ${{ matrix.category }}
	      run: |
	        mkdir -p "$JUNIT_OUTPUT_DIR"
	        RAN=0
	        FAILED=0
	        # Run every script of the category, counting failures instead of
	        # stopping at the first one.
	        for script in "tests/resilience/${CATEGORY}"/test-*.sh; do
	          # An unmatched glob stays literal; skip anything that is not a file.
	          [ -f "$script" ] || continue
	          RAN=$((RAN + 1))
	          echo "=== Running ${script} ==="
	          if ! bash "$script"; then
	            echo "FAILED: ${script}"
	            FAILED=$((FAILED + 1))
	          fi
	        done
	        # Zero executed scripts would otherwise look like a clean pass.
	        if [ "$RAN" -eq 0 ]; then
	          echo "::warning::No resilience test scripts matched tests/resilience/${CATEGORY}/test-*.sh"
	          exit 1
	        fi
	        if [ "$FAILED" -gt 0 ]; then
	          echo "::warning::${FAILED} resilience test(s) failed in ${CATEGORY} (non-blocking on ARC runners)"
	          exit 1
	        fi

	    - name: Upload test results
	      if: always()
	      uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
	      with:
	        name: junit-resilience-${{ matrix.category }}
	        path: /tmp/test-results/*.xml
	        if-no-files-found: ignore

	# -------------------------------------------------------------------
	# Mesh tests (run after resilience-tests succeeds or is skipped)
	# -------------------------------------------------------------------
	mesh-tests:
	  needs: [deploy, resilience-tests]
	  # Runs when the mesh suite is selected and resilience-tests either
	  # succeeded or was skipped. The deploy guard matches stress-tests and
	  # resilience-tests: RUN_ID below comes from the deploy job's outputs,
	  # so without a successful deploy the mesh namespaces would be created
	  # from an empty run id.
	  if: |
	    always() &&
	    needs.deploy.result == 'success' &&
	    (inputs.test_suite == 'all' || inputs.test_suite == 'mesh') &&
	    (needs.resilience-tests.result == 'success' || needs.resilience-tests.result == 'skipped')
	  runs-on: ak-e2e-runners
	  env:
	    RUN_ID: ${{ needs.deploy.outputs.run_id }}
	    ADMIN_PASS: TestRunner!2026secure
	    JUNIT_OUTPUT_DIR: /tmp/test-results
	    # RELEASE_GATE=1 turns common.sh's skip_suite() into a hard fail.
	    # A silently-skipped test in release-gate context is exactly the
	    # silent-success class (#870/#871/#888) the gate exists to catch.
	    RELEASE_GATE: '1'
	  steps:
	    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
	      with:
	        repository: artifact-keeper/artifact-keeper-test

	    - name: Install kubectl
	      uses: azure/setup-kubectl@829323503d1be3d00ca8346e5391ca0b07a9ab0d # v5.1.0

	    - name: Install Helm
	      uses: azure/setup-helm@dda3372f752e03dde6b3237bc9431cdc2f7a02a2 # v5.0.0

	    - name: Deploy mesh topology
	      id: mesh-deploy
	      env:
	        # Caller-supplied tags reach the script as environment variables
	        # instead of being expanded into the script text, so a crafted
	        # tag value cannot inject shell commands.
	        BACKEND_TAG: ${{ inputs.backend_tag }}
	        WEB_TAG: ${{ inputs.web_tag }}
	      run: |
	        MESH_RUN_ID="${RUN_ID}"
	        chmod +x scripts/create-test-namespace.sh

	        # Deploy 4 mesh instances, one namespace-creation call each.
	        for i in main peer1 peer2 peer3; do
	          ./scripts/create-test-namespace.sh \
	            --run-id "${MESH_RUN_ID}-mesh-${i}" \
	            --backend-tag "${BACKEND_TAG}" \
	            --web-tag "${WEB_TAG}" \
	            --values helm/values-test-mesh.yaml
	        done

	        # Publish the in-cluster backend URL of each instance as a step
	        # output for the following steps.
	        BASE_NS="test-${MESH_RUN_ID}-mesh"
	        echo "MAIN_URL=http://artifact-keeper-backend.${BASE_NS}-main.svc.cluster.local:8080" >> "$GITHUB_OUTPUT"
	        echo "PEER1_URL=http://artifact-keeper-backend.${BASE_NS}-peer1.svc.cluster.local:8080" >> "$GITHUB_OUTPUT"
	        echo "PEER2_URL=http://artifact-keeper-backend.${BASE_NS}-peer2.svc.cluster.local:8080" >> "$GITHUB_OUTPUT"
	        echo "PEER3_URL=http://artifact-keeper-backend.${BASE_NS}-peer3.svc.cluster.local:8080" >> "$GITHUB_OUTPUT"

	    - name: Wait for mesh instances ready
	      env:
	        MAIN_URL: ${{ steps.mesh-deploy.outputs.MAIN_URL }}
	        PEER1_URL: ${{ steps.mesh-deploy.outputs.PEER1_URL }}
	        PEER2_URL: ${{ steps.mesh-deploy.outputs.PEER2_URL }}
	        PEER3_URL: ${{ steps.mesh-deploy.outputs.PEER3_URL }}
	      run: |
	        chmod +x tests/lib/wait-for-ready.sh
	        # Instances are checked one after another; 300 is forwarded to
	        # wait-for-ready.sh as its second argument.
	        for url in "$MAIN_URL" "$PEER1_URL" "$PEER2_URL" "$PEER3_URL"; do
	          ./tests/lib/wait-for-ready.sh "$url" 300
	        done

	    - name: Run mesh tests
	      env:
	        MAIN_URL: ${{ steps.mesh-deploy.outputs.MAIN_URL }}
	        PEER1_URL: ${{ steps.mesh-deploy.outputs.PEER1_URL }}
	        PEER2_URL: ${{ steps.mesh-deploy.outputs.PEER2_URL }}
	        PEER3_URL: ${{ steps.mesh-deploy.outputs.PEER3_URL }}
	        # BASE_URL points at the main instance.
	        BASE_URL: ${{ steps.mesh-deploy.outputs.MAIN_URL }}
	      run: |
	        mkdir -p "$JUNIT_OUTPUT_DIR"
	        chmod +x scripts/run-suite.sh
	        ./scripts/run-suite.sh --suite mesh --run-id "${RUN_ID}"

	    - name: Teardown mesh namespaces
	      if: always() && inputs.skip_teardown != true
	      run: |
	        chmod +x scripts/teardown-test-namespace.sh
	        # Best effort: a failed teardown of one instance must not stop
	        # the remaining ones from being torn down.
	        for i in main peer1 peer2 peer3; do
	          ./scripts/teardown-test-namespace.sh --run-id "${RUN_ID}-mesh-${i}" || true
	        done

	    - name: Upload test results
	      if: always()
	      uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
	      with:
	        name: junit-mesh
	        path: /tmp/test-results/*.xml
	        if-no-files-found: ignore

	# -------------------------------------------------------------------
	# Collect results and publish summary
	# -------------------------------------------------------------------
	collect-results:
	  # Fan-in over every job of the gate. if: always() makes this job run
	  # whatever the upstream results are, so the summary and the gate check
	  # below are always evaluated.
	  needs:
	    - version-set-integrity
	    - clean-install-smoke
	    - clean-install-smoke-with-deps
	    - chart-upgrade-smoke
	    - deploy
	    - real-flow-smoke
	    - scan-completion-gate
	    - sbom-correctness-gate
	    - pinned-cve-gate
	    - format-tests
	    - security-tests
	    - compatibility-tests
	    - repo-tests
	    - promotion-tests
	    - rbac-tests
	    - lifecycle-tests
	    - webhook-tests
	    - search-tests
	    - platform-tests
	    - auth-tests
	    - admin-tests
	    - stress-tests
	    - resilience-tests
	    - mesh-tests
	  if: always()
	  runs-on: ak-e2e-runners
	  steps:
	    - name: Download all test artifacts
	      uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
	      with:
	        pattern: junit-*
	        path: /tmp/all-results
	        merge-multiple: true

	    - name: Publish test summary
	      if: always()
	      env:
	        # Caller-supplied values reach the script as environment
	        # variables instead of being expanded into the script text; the
	        # lines that print them wrap them in backticks, where an expanded
	        # value would otherwise be open to shell injection.
	        BACKEND_TAG: ${{ inputs.backend_tag }}
	        WEB_TAG: ${{ inputs.web_tag }}
	        DEPLOY_RUN_ID: ${{ needs.deploy.outputs.run_id }}
	      run: |
	        echo "## Release Gate Results" >> "$GITHUB_STEP_SUMMARY"
	        echo "" >> "$GITHUB_STEP_SUMMARY"
	        echo "| Suite | Status |" >> "$GITHUB_STEP_SUMMARY"
	        echo "|-------|--------|" >> "$GITHUB_STEP_SUMMARY"

	        # One table row per upstream job. deploy is listed as well: the
	        # gate check below blocks on it, so its result belongs in the
	        # table the failure message points at.
	        for job in version-set-integrity clean-install-smoke clean-install-smoke-with-deps chart-upgrade-smoke deploy real-flow-smoke scan-completion-gate sbom-correctness-gate pinned-cve-gate format-tests repo-tests promotion-tests rbac-tests lifecycle-tests webhook-tests search-tests platform-tests auth-tests admin-tests security-tests compatibility-tests stress-tests resilience-tests mesh-tests; do
	          # Fallback for a name that has no case arm below.
	          status="skipped"
	          case "$job" in
	            version-set-integrity) status="${{ needs.version-set-integrity.result }}" ;;
	            clean-install-smoke) status="${{ needs.clean-install-smoke.result }}" ;;
	            clean-install-smoke-with-deps) status="${{ needs.clean-install-smoke-with-deps.result }}" ;;
	            chart-upgrade-smoke) status="${{ needs.chart-upgrade-smoke.result }}" ;;
	            deploy) status="${{ needs.deploy.result }}" ;;
	            real-flow-smoke) status="${{ needs.real-flow-smoke.result }}" ;;
	            scan-completion-gate) status="${{ needs.scan-completion-gate.result }}" ;;
	            sbom-correctness-gate) status="${{ needs.sbom-correctness-gate.result }}" ;;
	            pinned-cve-gate) status="${{ needs.pinned-cve-gate.result }}" ;;
	            format-tests) status="${{ needs.format-tests.result }}" ;;
	            repo-tests) status="${{ needs.repo-tests.result }}" ;;
	            promotion-tests) status="${{ needs.promotion-tests.result }}" ;;
	            rbac-tests) status="${{ needs.rbac-tests.result }}" ;;
	            lifecycle-tests) status="${{ needs.lifecycle-tests.result }}" ;;
	            webhook-tests) status="${{ needs.webhook-tests.result }}" ;;
	            search-tests) status="${{ needs.search-tests.result }}" ;;
	            platform-tests) status="${{ needs.platform-tests.result }}" ;;
	            auth-tests) status="${{ needs.auth-tests.result }}" ;;
	            admin-tests) status="${{ needs.admin-tests.result }}" ;;
	            security-tests) status="${{ needs.security-tests.result }}" ;;
	            compatibility-tests) status="${{ needs.compatibility-tests.result }}" ;;
	            stress-tests) status="${{ needs.stress-tests.result }}" ;;
	            resilience-tests) status="${{ needs.resilience-tests.result }}" ;;
	            mesh-tests) status="${{ needs.mesh-tests.result }}" ;;
	          esac
	          echo "| ${job} | ${status} |" >> "$GITHUB_STEP_SUMMARY"
	        done

	        echo "" >> "$GITHUB_STEP_SUMMARY"
	        echo "Backend tag: \`${BACKEND_TAG}\`" >> "$GITHUB_STEP_SUMMARY"
	        echo "Web tag: \`${WEB_TAG}\`" >> "$GITHUB_STEP_SUMMARY"
	        echo "Run ID: \`${DEPLOY_RUN_ID}\`" >> "$GITHUB_STEP_SUMMARY"

	    - name: Upload combined results
	      if: always()
	      uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
	      with:
	        name: release-gate-results
	        path: /tmp/all-results/
	        if-no-files-found: ignore

	    - name: Gate check - fail if any required suite did not succeed
	      # stress-tests and security-tests are intentionally excluded from
	      # this rollup. Both have continue-on-error: true (see the comments
	      # above each job) so their outcome can be 'failure' on
	      # known-flaky / known-infra-debt scenarios without blocking the
	      # release gate:
	      #   - stress-tests: bcrypt-bound auth saturation under sustained
	      #     load on shared ARC runners (artifact-keeper#991).
	      #   - security-tests: Grype DB not pre-seeded in v1.1.x backend
	      #     image; quality gate is last-scanner-wins so Trivy covers
	      #     the policy gate (artifact-keeper#1001).
	      #
	      # The three new silent-success gates (scan-completion-gate,
	      # sbom-correctness-gate, pinned-cve-gate) use the STRICTER
	      # `result != 'success'` predicate. The looser
	      # `result == 'failure' || result == 'cancelled'` form lets a
	      # 'skipped' outcome (matrix-leg eval failure, future conditional
	      # `if:` gate, or a transitive `needs:` skip when `deploy` is
	      # skipped) through as green. For the silent-success gates that
	      # is precisely the regression class we are guarding against, so
	      # we close it explicitly.
	      #
	      # clean-install-smoke-with-deps is the one legitimate 'skipped'
	      # case: it is opt-in (gated on the `run_smoke_with_deps` workflow
	      # input, default false; see #53). When the input is false the
	      # job's result is 'skipped' by design. We continue to use the
	      # looser predicate for it so the default-off path stays green.
	      # When the input is true, a failure or cancellation DOES block
	      # the release.
	      #
	      # The wildcard form contains(needs.*.result, 'failure') still
	      # observes failures because needs.<job>.result reflects the
	      # job's outcome, not its continue-on-error-adjusted conclusion.
	      # So we list the required suites explicitly here. If you add a
	      # new required suite, add it to this list. Soft-failing suites
	      # stay off the list.
	      if: >-
	        needs.version-set-integrity.result == 'failure' || needs.version-set-integrity.result == 'cancelled' ||
	        needs.clean-install-smoke.result == 'failure' || needs.clean-install-smoke.result == 'cancelled' ||
	        needs.clean-install-smoke-with-deps.result == 'failure' || needs.clean-install-smoke-with-deps.result == 'cancelled' ||
	        needs.chart-upgrade-smoke.result == 'failure' || needs.chart-upgrade-smoke.result == 'cancelled' ||
	        needs.real-flow-smoke.result == 'failure' || needs.real-flow-smoke.result == 'cancelled' ||
	        needs.scan-completion-gate.result != 'success' ||
	        needs.sbom-correctness-gate.result != 'success' ||
	        needs.pinned-cve-gate.result != 'success' ||
	        needs.deploy.result == 'failure' || needs.deploy.result == 'cancelled' ||
	        needs.format-tests.result == 'failure' || needs.format-tests.result == 'cancelled' ||
	        needs.compatibility-tests.result == 'failure' || needs.compatibility-tests.result == 'cancelled' ||
	        needs.repo-tests.result == 'failure' || needs.repo-tests.result == 'cancelled' ||
	        needs.promotion-tests.result == 'failure' || needs.promotion-tests.result == 'cancelled' ||
	        needs.rbac-tests.result == 'failure' || needs.rbac-tests.result == 'cancelled' ||
	        needs.lifecycle-tests.result == 'failure' || needs.lifecycle-tests.result == 'cancelled' ||
	        needs.webhook-tests.result == 'failure' || needs.webhook-tests.result == 'cancelled' ||
	        needs.search-tests.result == 'failure' || needs.search-tests.result == 'cancelled' ||
	        needs.platform-tests.result == 'failure' || needs.platform-tests.result == 'cancelled' ||
	        needs.auth-tests.result == 'failure' || needs.auth-tests.result == 'cancelled' ||
	        needs.admin-tests.result == 'failure' || needs.admin-tests.result == 'cancelled' ||
	        needs.resilience-tests.result == 'failure' || needs.resilience-tests.result == 'cancelled' ||
	        needs.mesh-tests.result == 'failure' || needs.mesh-tests.result == 'cancelled'
	      run: |
	        echo "::error::Release gate FAILED - one or more required test suites did not pass"
	        echo "Review the workflow summary above for details"
	        echo "Note: stress-tests is non-blocking; its outcome is shown in the summary but does not gate the release"
	        exit 1

	# -------------------------------------------------------------------
	# Teardown
	# -------------------------------------------------------------------
	teardown:
	  runs-on: ak-e2e-runners
	  # Ordered after collect-results. always() makes the job run whatever
	  # the upstream results are, unless the caller set skip_teardown.
	  needs:
	    - deploy
	    - collect-results
	  if: always() && inputs.skip_teardown != true
	  steps:
	    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
	      with:
	        repository: artifact-keeper/artifact-keeper-test

	    - name: Install kubectl
	      uses: azure/setup-kubectl@829323503d1be3d00ca8346e5391ca0b07a9ab0d # v5.1.0

	    - name: Install Helm
	      uses: azure/setup-helm@dda3372f752e03dde6b3237bc9431cdc2f7a02a2 # v5.0.0

	    - name: Teardown test namespace
	      env:
	        # Run id published by the deploy job; passed to the teardown
	        # script as its --run-id argument.
	        RUN_ID: ${{ needs.deploy.outputs.run_id }}
	      run: |
	        chmod +x scripts/teardown-test-namespace.sh
	        ./scripts/teardown-test-namespace.sh --run-id "${RUN_ID}"

	    - name: Upload pod logs
	      # Uploaded even when the teardown step failed.
	      # NOTE(review): /tmp/test-logs/ is presumably written by the
	      # teardown script - confirm.
	      if: always()
	      uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
	      with:
	        name: pod-logs
	        path: /tmp/test-logs/
	        if-no-files-found: ignore

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Release Gate #114

Workflow file

Release Gate #114

Uh oh!

Workflow file for this run