# Skip to content
#
# Release Gate
#
# Release Gate #114
#
# Workflow file for this run

name: Release Gate

# Triggers. `workflow_dispatch` is the manual entry point and
# `workflow_call` exposes the gate as a reusable workflow. Both declare
# the same input names with the same defaults, so the jobs read
# `inputs.*` without caring which trigger fired.
on:
  workflow_dispatch:
    inputs:
      backend_tag:
        description: Backend image tag to test
        required: true
        type: string
      web_tag:
        description: Web image tag to test
        required: false
        type: string
        default: 'latest'
      test_suite:
        description: Test suite to run
        required: false
        type: choice
        options:
          - all
          - formats
          - repos
          - promotion
          - rbac
          - lifecycle
          - webhooks
          - search
          - platform
          - auth
          - stress
          - resilience
          - mesh
          - security
          - compatibility
          - pullthrough
        default: 'all'
      skip_teardown:
        description: Skip teardown (for debugging)
        required: false
        type: boolean
        default: false
      iac_ref:
        description: artifact-keeper-iac git ref for the Helm chart (default main)
        required: false
        type: string
        default: 'main'
      run_smoke_with_deps:
        description: |
          Run the clean-install-smoke-with-deps variant (issue #53).
          Disabled by default because enabling Trivy/DT/edge/openSCAP
          can exceed the standard ARC runner namespace's 4 CPU / 8 Gi
          quota. Set to true once a beefier runner pool is wired.
        required: false
        type: boolean
        default: false
  workflow_call:
    inputs:
      backend_tag:
        description: Backend image tag to test
        required: true
        type: string
      web_tag:
        description: Web image tag to test
        required: false
        type: string
        default: 'latest'
      # `choice` is not an input type for workflow_call, so the suite
      # name arrives as a free-form string here.
      test_suite:
        description: Test suite to run
        required: false
        type: string
        default: 'all'
      skip_teardown:
        description: Skip teardown (for debugging)
        required: false
        type: boolean
        default: false
      iac_ref:
        description: artifact-keeper-iac git ref for the Helm chart (default main)
        required: false
        type: string
        default: 'main'
      run_smoke_with_deps:
        description: Run the clean-install-smoke-with-deps variant (issue #53)
        required: false
        type: boolean
        default: false
    # The smoke jobs read `secrets.GHCR_DOCKER_CONFIG`. Declaring it lets
    # a caller pass it by name; callers using `secrets: inherit` are
    # unaffected. Optional, because the jobs treat it as optional.
    secrets:
      GHCR_DOCKER_CONFIG:
        description: Docker config JSON used as the ghcr.io pull secret
        required: false

# Workflow-wide defaults, overridable through repository variables.
# NOTE(review): presumably consumed by the namespace-provisioning
# scripts as the test-namespace quota -- confirm against the scripts.
env:
  NAMESPACE_CPU: ${{ vars.TEST_NAMESPACE_CPU || '4000m' }}
  NAMESPACE_MEMORY: ${{ vars.TEST_NAMESPACE_MEMORY || '8Gi' }}
jobs:
# -------------------------------------------------------------------
# Version-set integrity check (issue #63)
#
# Runs FIRST, before any namespace is provisioned. Verifies that every
# container image referenced by the release set actually exists at the
# tag the chart references. This catches the structural failure mode
# behind artifact-keeper#872 (customer-flagged: "current main Helm
# chart only works with main backend and frontend, not a tagged
# release"), artifact-keeper#905 (versioned tags missing on ghcr.io),
# and artifact-keeper-web#320 (v1.1.8 web image never published).
#
# A green release-gate today says "the test cluster works"; this job
# turns that into "every image referenced by the release set actually
# exists at that tag." Failing here is a release-blocker that should
# NEVER be soft-failed: a missing tag is a publish-pipeline regression,
# not a flake.
# -------------------------------------------------------------------
version-set-integrity:
  timeout-minutes: 5
  runs-on: ak-e2e-runners
  steps:
    # The test-harness repo supplies tests/release-gate/verify-image-set.sh.
    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      with:
        repository: artifact-keeper/artifact-keeper-test

    - name: Install Helm (for chart-default verification)
      uses: azure/setup-helm@dda3372f752e03dde6b3237bc9431cdc2f7a02a2 # v5.0.0

    - name: Clone iac chart for default-tag verification
      env:
        IAC_REF: ${{ inputs.iac_ref || 'main' }}
      run: |
        # Shallow-clone the chart repo at IAC_REF into RUNNER_TEMP so
        # verify-image-set.sh can render the chart with no overrides
        # and compare its default image tags. This targets the #872
        # shape: chart on one tag, images on another, and no --set
        # bridging the two.
        iac_checkout="${RUNNER_TEMP}/iac"
        git clone --depth 1 --branch "${IAC_REF}" \
          https://github.com/artifact-keeper/artifact-keeper-iac.git \
          "${iac_checkout}"

    - name: Verify backend / web / openscap image tags exist on ghcr.io
      env:
        BACKEND_TAG: ${{ inputs.backend_tag }}
        WEB_TAG: ${{ inputs.web_tag }}
        # openscap ships in lockstep with backend under the same tag,
        # so the backend tag doubles as the openscap tag. If that
        # lockstep breaks (#872), this step reports it before any
        # deploy is attempted.
        OPENSCAP_TAG: ${{ inputs.backend_tag }}
      run: |
        verifier=tests/release-gate/verify-image-set.sh
        chmod +x "${verifier}"
        "./${verifier}" \
          --backend-tag "${BACKEND_TAG}" \
          --web-tag "${WEB_TAG}" \
          --openscap-tag "${OPENSCAP_TAG}" \
          --chart-dir "${RUNNER_TEMP}/iac/charts/artifact-keeper"

    # Diagnostics are only collected when the job has failed.
    - name: Upload version-set diagnostics
      if: failure()
      uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
      with:
        name: version-set-integrity-logs
        path: /tmp/version-set-*.log
        if-no-files-found: ignore
# -------------------------------------------------------------------
# Clean-install smoke test
#
# Boots a fresh namespace, runs `helm install` against the documented
# values-production.yaml (with overrides for deps the smoke can't
# satisfy), waits for backend AND web Deployments to reach Ready, then
# probes /readyz from inside the cluster. Catches startup panics (e.g.
# the v1.1.8 Debian route panic) that crash the backend before it can
# serve traffic.
#
# The `deploy` job (and therefore the entire test matrix downstream)
# `needs:` this gate. A startup-broken release fails fast here without
# burning runner time on the matrix.
# -------------------------------------------------------------------
clean-install-smoke:
  needs: version-set-integrity
  timeout-minutes: 12
  runs-on: ak-e2e-runners
  steps:
    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      with:
        repository: artifact-keeper/artifact-keeper-test

    - name: Install kubectl
      uses: azure/setup-kubectl@829323503d1be3d00ca8346e5391ca0b07a9ab0d # v5.1.0

    - name: Install Helm
      uses: azure/setup-helm@dda3372f752e03dde6b3237bc9431cdc2f7a02a2 # v5.0.0

    - name: Run clean-install smoke test
      env:
        BACKEND_TAG: ${{ inputs.backend_tag }}
        WEB_TAG: ${{ inputs.web_tag }}
        # Chart ref the smoke installs from. Falls back to `main` when
        # the input is unset; release pipelines should pass the iac tag
        # that ships with the release.
        IAC_REF: ${{ inputs.iac_ref || 'main' }}
        # ghcr.io pull secret. Private image tags fail with
        # ImagePullBackOff without it, which would fail the gate for
        # the wrong reason. Public-only runs may leave it unset.
        GHCR_DOCKER_CONFIG: ${{ secrets.GHCR_DOCKER_CONFIG }}
      run: |
        smoke=scripts/clean-install-smoke.sh
        chmod +x "${smoke}"
        # run id + run attempt is unique per workflow attempt (re-runs
        # increment the attempt), so a retried job cannot collide with
        # its predecessor. Both values come from the runner's default
        # environment variables.
        RUN_ID="${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}"
        "./${smoke}" \
          --run-id "${RUN_ID}" \
          --backend-tag "${BACKEND_TAG}" \
          --web-tag "${WEB_TAG}" \
          --iac-ref "${IAC_REF}" \
          --timeout 300

    # Diagnostics are only collected when the job has failed.
    - name: Upload smoke diagnostics
      if: failure()
      uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
      with:
        name: clean-install-smoke-logs
        path: /tmp/test-logs/
        if-no-files-found: ignore
# -------------------------------------------------------------------
# Clean-install smoke WITH dependencies (issue #53)
#
# The basic `clean-install-smoke` above disables Trivy, Dependency-
# Track, edge replication, ingress, and openSCAP to keep the smoke
# under the runner's memory budget. That leaves a real coverage gap:
# chart wiring regressions in those subsystems pass the gate. A
# v1.1.8-class regression that broke ONLY Trivy or Dependency-Track
# wiring would NOT be caught by the basic smoke.
#
# This job runs the same smoke flow against a values overlay that
# enables every optional subsystem and asserts each one reaches a
# healthy state.
#
# CURRENTLY OPT-IN: the job's `if:` is keyed on the
# `run_smoke_with_deps` input, which defaults to false. Enabling all
# subsystems can exceed the ARC runner namespace's 4 CPU / 8 Gi quota:
# - Trivy: 1 CPU / 2 Gi limit
# - DependencyTrack: 2 CPU / 4 Gi limit
# - Edge: 500m / 512 Mi limit
# - OpenSCAP: 500m / 1 Gi limit
# - Backend: 2 CPU / 2 Gi limit
# - Web/Postgres/OpenSearch: ~1 CPU / ~2 Gi combined
# Total: roughly 7 CPU / 12 Gi limits, which can OOM the namespace.
#
# To enable: bump the namespace quota OR move this job to a beefier
# runner pool (e.g. `ak-beefy-runners`), then pass
# `run_smoke_with_deps: true` when dispatching or calling the workflow.
# Tracked under #53.
# -------------------------------------------------------------------
clean-install-smoke-with-deps:
  needs: clean-install-smoke
  # Opt-in job. `run_smoke_with_deps` is declared under both
  # workflow_dispatch.inputs and workflow_call.inputs and defaults to
  # false, so the job stays wired (and linted) without executing unless
  # the dispatcher or caller sets it to true.
  # TODO(#53): enable by default once a runner with >= 8 CPU / 16 Gi is
  # available in the ARC pool.
  if: ${{ inputs.run_smoke_with_deps == true }}
  timeout-minutes: 20
  runs-on: ak-e2e-runners
  steps:
    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      with:
        repository: artifact-keeper/artifact-keeper-test

    - name: Install kubectl
      uses: azure/setup-kubectl@829323503d1be3d00ca8346e5391ca0b07a9ab0d # v5.1.0

    - name: Install Helm
      uses: azure/setup-helm@dda3372f752e03dde6b3237bc9431cdc2f7a02a2 # v5.0.0

    - name: Run clean-install-smoke with all deps enabled
      env:
        BACKEND_TAG: ${{ inputs.backend_tag }}
        WEB_TAG: ${{ inputs.web_tag }}
        IAC_REF: ${{ inputs.iac_ref || 'main' }}
        GHCR_DOCKER_CONFIG: ${{ secrets.GHCR_DOCKER_CONFIG }}
      run: |
        smoke=tests/release-gate/clean-install-smoke-with-deps.sh
        chmod +x "${smoke}"
        # The `-deps` suffix keeps this run id distinct from the one
        # the basic smoke builds in the same workflow attempt.
        RUN_ID="${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}-deps"
        "./${smoke}" \
          --run-id "${RUN_ID}" \
          --backend-tag "${BACKEND_TAG}" \
          --web-tag "${WEB_TAG}" \
          --iac-ref "${IAC_REF}" \
          --timeout 600

    # Diagnostics are only collected when the job has failed.
    - name: Upload smoke-with-deps diagnostics
      if: failure()
      uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
      with:
        name: clean-install-smoke-with-deps-logs
        path: /tmp/test-logs/
        if-no-files-found: ignore
# -------------------------------------------------------------------
# Chart upgrade smoke (issue #54)
#
# `clean-install-smoke` catches startup panics on a fresh install.
# It does NOT catch:
# - Migration-on-upgrade failures (schema change that fails to
# apply when upgrading prev -> current)
# - Chart-template breakage that only manifests on `helm upgrade`
# (immutable field changes, StatefulSet rollout deadlocks)
# - Resources that get re-created instead of preserved across
# upgrades
#
# The script installs the previous stable release tag, pushes a
# small artifact through the management API to establish state,
# runs `helm upgrade` to the current backend image, then asserts
# the artifact is still retrievable and `/readyz` returns 200.
#
# PREVIOUS_TAG: hardcoded today. Update on each release. Once
# release tooling can introspect the previous tag automatically,
# this can be derived from `gh release list` in a setup step.
# -------------------------------------------------------------------
chart-upgrade-smoke:
  needs: clean-install-smoke
  runs-on: ak-e2e-runners
  timeout-minutes: 25
  steps:
    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      with:
        repository: artifact-keeper/artifact-keeper-test

    - name: Install kubectl
      uses: azure/setup-kubectl@829323503d1be3d00ca8346e5391ca0b07a9ab0d # v5.1.0

    - name: Install Helm
      uses: azure/setup-helm@dda3372f752e03dde6b3237bc9431cdc2f7a02a2 # v5.0.0

    - name: Run chart-upgrade-smoke
      env:
        BACKEND_TAG: ${{ inputs.backend_tag }}
        WEB_TAG: ${{ inputs.web_tag }}
        IAC_REF: ${{ inputs.iac_ref || 'main' }}
        GHCR_DOCKER_CONFIG: ${{ secrets.GHCR_DOCKER_CONFIG }}
        # PREVIOUS_TAG: the most recent stable release, i.e. the version
        # the upgrade starts from. It must advance after every release:
        # once a new version ships, that version becomes the new
        # "previous stable". Set the repo variable
        # RELEASE_GATE_PREVIOUS_TAG to advance it without editing this
        # file, or bump the fallback literal below. Use the unprefixed
        # semver form, because docker tags drop the leading 'v' (see
        # CLAUDE.md).
        PREVIOUS_TAG: ${{ vars.RELEASE_GATE_PREVIOUS_TAG || '1.1.9' }}
        # PREVIOUS_WEB_TAG: web image tag for the starting side of the
        # upgrade. The web repo has its own release cadence and does
        # NOT mirror backend version tags, so PREVIOUS_TAG is not a
        # valid web image tag and causes ImagePullBackOff (closes
        # artifact-keeper#1378). `main` is published on every push to
        # artifact-keeper-web main and is always pullable. For stricter
        # pinning, set RELEASE_GATE_PREVIOUS_WEB_TAG to a specific SHA
        # tag (e.g. `sha-ea664a1`).
        PREVIOUS_WEB_TAG: ${{ vars.RELEASE_GATE_PREVIOUS_WEB_TAG || 'main' }}
      run: |
        chmod +x tests/release-gate/chart-upgrade-smoke.sh
        # The `-upgrade` suffix keeps this run id distinct from the
        # other smoke jobs in the same workflow attempt.
        RUN_ID="${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}-upgrade"
        # PREVIOUS_IAC_REF: the iac chart tag that shipped with the
        # previous release. Derived from PREVIOUS_TAG so the two cannot
        # drift apart, and passed explicitly so the upgrade exercises
        # the actual prev->current chart diff (issue #54).
        PREVIOUS_IAC_REF="artifact-keeper-${PREVIOUS_TAG}"
        ./tests/release-gate/chart-upgrade-smoke.sh \
          --run-id "${RUN_ID}" \
          --previous-tag "${PREVIOUS_TAG}" \
          --backend-tag "${BACKEND_TAG}" \
          --web-tag "${WEB_TAG}" \
          --previous-web-tag "${PREVIOUS_WEB_TAG}" \
          --iac-ref "${IAC_REF}" \
          --previous-iac-ref "${PREVIOUS_IAC_REF}" \
          --timeout 600

    # Diagnostics are only collected when the job has failed.
    - name: Upload chart-upgrade diagnostics
      if: failure()
      uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
      with:
        name: chart-upgrade-smoke-logs
        path: /tmp/test-logs/
        if-no-files-found: ignore
# -------------------------------------------------------------------
# Deploy test environment
#
# Gated on `clean-install-smoke` so that the matrix below cannot run
# against a backend that fails to even start. A startup-broken release
# fails fast in `clean-install-smoke` and the entire matrix is skipped,
# preserving runner-time for releases that can actually be tested.
# -------------------------------------------------------------------
deploy:
  needs: clean-install-smoke
  runs-on: ak-e2e-runners
  # Bounded like every other gate job. Without this, a hung namespace
  # create or rollout would hold a runner for the 360-minute GitHub
  # default. The explicit waits below total at most ~7 minutes; the
  # rest is headroom for create-test-namespace.sh.
  timeout-minutes: 30
  # Downstream jobs read these via `needs.deploy.outputs.*`.
  outputs:
    run_id: ${{ steps.setup.outputs.run_id }}
    namespace: ${{ steps.setup.outputs.namespace }}
    backend_url: ${{ steps.deploy.outputs.backend_url }}
  steps:
    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      with:
        repository: artifact-keeper/artifact-keeper-test

    - name: Install kubectl
      uses: azure/setup-kubectl@829323503d1be3d00ca8346e5391ca0b07a9ab0d # v5.1.0

    - name: Install Helm
      uses: azure/setup-helm@dda3372f752e03dde6b3237bc9431cdc2f7a02a2 # v5.0.0

    - name: Generate run ID
      id: setup
      run: |
        # Epoch seconds + workflow run number; the namespace is always
        # `test-<run_id>`, which downstream jobs rely on.
        RUN_ID="e2e-$(date +%s)-${GITHUB_RUN_NUMBER}"
        echo "run_id=${RUN_ID}" >> "$GITHUB_OUTPUT"
        echo "namespace=test-${RUN_ID}" >> "$GITHUB_OUTPUT"

    - name: Deploy test namespace
      id: deploy
      env:
        RUN_ID: ${{ steps.setup.outputs.run_id }}
        BACKEND_TAG: ${{ inputs.backend_tag }}
        WEB_TAG: ${{ inputs.web_tag }}
      run: |
        chmod +x scripts/create-test-namespace.sh
        # --full-stack enables Trivy + scan workspace so the security
        # tests actually exercise the scanner instead of false-passing
        # against a no-scanner stack (#888 silent-success class).
        ./scripts/create-test-namespace.sh \
          --run-id "${RUN_ID}" \
          --backend-tag "${BACKEND_TAG}" \
          --web-tag "${WEB_TAG}" \
          --full-stack
        # In-cluster service URL, published as the job's backend_url
        # output.
        NAMESPACE="test-${RUN_ID}"
        BACKEND_URL="http://artifact-keeper-backend.${NAMESPACE}.svc.cluster.local:8080"
        echo "backend_url=${BACKEND_URL}" >> "$GITHUB_OUTPUT"

    - name: Wait for stack ready
      env:
        RUN_ID: ${{ steps.setup.outputs.run_id }}
      run: |
        NAMESPACE="test-${RUN_ID}"
        BACKEND_URL="http://artifact-keeper-backend.${NAMESPACE}.svc.cluster.local:8080"
        chmod +x tests/lib/wait-for-ready.sh
        ./tests/lib/wait-for-ready.sh "${BACKEND_URL}" 180
        # Trivy rollout must be Available before the security tests
        # dispatch. The chart's fullnameOverride is "artifact-keeper"
        # (see helm/values-test-full.yaml), so the deployment is named
        # artifact-keeper-trivy regardless of the release name.
        # Without this gate, security tests can race scanner pod
        # scale-up and the lite scan-completion gate treats an
        # unreachable scanner as a "real" failure (the #888 false-fail
        # mirror).
        echo "Waiting for Trivy rollout in ${NAMESPACE}..."
        kubectl -n "${NAMESPACE}" rollout status \
          deployment/artifact-keeper-trivy --timeout=180s
        kubectl -n "${NAMESPACE}" wait --for=condition=Available \
          deployment/artifact-keeper-trivy --timeout=60s
# -------------------------------------------------------------------
# Real-flow smoke (issue #45)
#
# The user's actual flow as a single gate check: push an artifact
# through a native client, pull it back, trigger a scan, poll until
# completion. Regressions in any step (broken upload, broken
# download, scan-stuck-queued mirror of #871) fail the gate before
# the broader format/security/repo matrix runs, so the operator
# sees the FIRST signal that pierced the smoke.
#
# Uses npm (already installed on the runner pod for the `node` batch
# in format-tests) so the gate stays under the 5-minute target in
# the acceptance criteria for #45.
# -------------------------------------------------------------------
real-flow-smoke:
  needs: deploy
  timeout-minutes: 8
  runs-on: ak-e2e-runners
  env:
    # Connection details for the stack the deploy job stood up.
    BASE_URL: ${{ needs.deploy.outputs.backend_url }}
    RUN_ID: ${{ needs.deploy.outputs.run_id }}
    ADMIN_PASS: 'TestRunner!2026secure'
    JUNIT_OUTPUT_DIR: /tmp/test-results
    # RELEASE_GATE=1 makes common.sh's skip_suite() a hard failure. A
    # silently-skipped real-flow smoke is the very silent-success class
    # (#870/#871/#888) this gate was added to catch.
    RELEASE_GATE: '1'
  steps:
    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      with:
        repository: artifact-keeper/artifact-keeper-test

    - name: Install npm (real-flow uses the npm native client)
      run: |
        # Install Node.js 22 from NodeSource only when npm is missing;
        # the final `npm --version` fails the step if npm is still
        # unavailable afterwards.
        if ! command -v npm >/dev/null 2>&1; then
          curl -fsSL https://deb.nodesource.com/setup_22.x | sudo -E bash - > /dev/null 2>&1
          sudo apt-get install -y -qq nodejs > /dev/null
        fi
        npm --version

    - name: Run real-flow smoke
      run: |
        smoke=tests/release-gate/test-real-flow-smoke.sh
        mkdir -p "$JUNIT_OUTPUT_DIR"
        chmod +x "${smoke}"
        "./${smoke}"

    # Results are uploaded on every outcome, pass or fail.
    - name: Upload real-flow smoke results
      if: always()
      uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
      with:
        name: junit-real-flow-smoke
        path: /tmp/test-results/
        if-no-files-found: ignore
# -------------------------------------------------------------------
# Scan-completion gate (matrix across representative formats).
#
# Closes the gap surfaced by artifact-keeper#888 ("works for npm,
# silently fails for docker"). Each matrix entry runs the gate
# primitive in its own job (parallel) so a regression in any one
# format's scanner pipeline fails the release-gate loud.
#
# The wired formats (currently: npm) run the lite primitive at
# tests/security/test-scan-completes.sh via the release-gate wrapper.
#
# The matrix is intentionally restricted to formats whose fixtures
# exist. Scaffolded formats (oci, maven, pypi, cargo, helm) are NOT
# in the matrix because a green checkmark on an `exit 0` scaffold
# is the same silent-success class the gate exists to prevent. The
# scan-completion-gate-scaffolds-pending job below surfaces the
# deferred formats as ::warning:: annotations so the gap is visible
# without painting the dashboard with fake passes. When a fixture-
# builder for a deferred format lands (#62), add the format to this
# matrix in the same PR.
#
# Why a matrix rather than a single sequential driver:
# - Each format scan can take 30-60s; sequential 6-format runs
# blow the 5-min release-gate budget.
# - The workflow-level matrix surfaces per-format outcomes in the
# GitHub Actions UI so an operator can see "oci failed, npm
# passed" at a glance.
# -------------------------------------------------------------------
scan-completion-gate:
  needs: deploy
  runs-on: ak-e2e-runners
  timeout-minutes: 8
  strategy:
    # One format's failure must not cancel the others; each outcome is
    # reported independently.
    fail-fast: false
    matrix:
      format: [npm]
  env:
    BASE_URL: ${{ needs.deploy.outputs.backend_url }}
    RUN_ID: ${{ needs.deploy.outputs.run_id }}
    ADMIN_PASS: TestRunner!2026secure
    JUNIT_OUTPUT_DIR: /tmp/test-results
    # RELEASE_GATE=1 turns common.sh's skip_suite() into a hard fail.
    # ALLOW_SCANNER_SKIP=0 is enforced inside the gate primitive --
    # a scanner-pod-down skip in release-gate context is exactly the
    # silent-success class this gate exists to catch.
    RELEASE_GATE: '1'
    ALLOW_SCANNER_SKIP: '0'
    FIXTURE_FORMAT: ${{ matrix.format }}
  steps:
    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      with:
        repository: artifact-keeper/artifact-keeper-test

    - name: Run scan-completion gate for ${{ matrix.format }}
      run: |
        mkdir -p "$JUNIT_OUTPUT_DIR"
        chmod +x tests/release-gate/scan-completion-gate.sh
        # Per-format RUN_ID suffix so concurrent matrix jobs do not
        # collide on the repo key (scan-complete-<RUN_ID>). The
        # override applies to this one command only; later steps still
        # see the job-level RUN_ID.
        RUN_ID="${RUN_ID}-${FIXTURE_FORMAT}" \
          ./tests/release-gate/scan-completion-gate.sh

    # The log capture below shells out to kubectl, which this job does
    # not otherwise install. Without it the "logs" would contain only
    # "kubectl: command not found", exactly when the operator needs
    # them. Installed on the failure path only, so green runs pay
    # nothing.
    - name: Install kubectl (failure diagnostics only)
      if: failure()
      uses: azure/setup-kubectl@829323503d1be3d00ca8346e5391ca0b07a9ab0d # v5.1.0

    - name: Capture scanner pod logs on failure
      if: failure()
      run: |
        # The namespace follows the deploy job's `test-<run_id>`
        # naming; RUN_ID here is the job-level value taken from
        # needs.deploy.outputs.run_id. Every capture is best-effort
        # (`|| true`): if the runner has no kubeconfig the capture
        # fails benignly and the JUnit XML is still uploaded.
        NS="test-${RUN_ID}"
        mkdir -p /tmp/test-logs
        kubectl -n "$NS" logs -l app.kubernetes.io/component=scanner \
          --tail=2000 > "/tmp/test-logs/scanner-${FIXTURE_FORMAT}.log" 2>&1 || true
        # #1379: chart labels Trivy pods with app.kubernetes.io/component=trivy
        # (app.kubernetes.io/name is the chart name "artifact-keeper", not the
        # component). Use the component label so the log capture actually finds
        # the pod when the gate is failing and the operator most needs the logs.
        kubectl -n "$NS" logs -l app.kubernetes.io/component=trivy \
          --tail=2000 > "/tmp/test-logs/trivy-${FIXTURE_FORMAT}.log" 2>&1 || true
        # NOTE(review): given the #1379 labelling scheme above, confirm
        # that `app=artifact-keeper-backend` still matches backend pods.
        kubectl -n "$NS" logs -l app=artifact-keeper-backend \
          --tail=1000 > "/tmp/test-logs/backend-${FIXTURE_FORMAT}.log" 2>&1 || true

    # Results and any captured logs are uploaded on every outcome.
    - name: Upload scan-completion gate results
      if: always()
      uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
      with:
        name: junit-scan-completion-${{ matrix.format }}
        path: |
          /tmp/test-results/
          /tmp/test-logs/
        if-no-files-found: ignore
# -------------------------------------------------------------------
# Scaffolds-pending sentinel (#62).
#
# This job exists ONLY to surface the deferred formats as a
# ::warning:: annotation on every release-gate run so the gap is
# visible. It does NOT run any test. It is NOT a required gate.
# Deliberately a separate job (not a matrix step) so the GitHub
# Actions UI shows ONE warning row rather than five green
# checkmarks that imply broader format coverage than exists.
#
# When a fixture-builder for one of the listed formats lands,
# remove the format from this list and add it to the matrix above.
# -------------------------------------------------------------------
scan-completion-gate-scaffolds-pending:
  needs: deploy
  timeout-minutes: 2
  runs-on: ak-e2e-runners
  steps:
    - name: Emit scaffolds-pending warnings
      run: |
        # One ::warning:: annotation per format that has no
        # scan-completion fixture yet.
        for fmt in oci maven pypi cargo helm; do
          printf '%s\n' "::warning title=scan-completion gate: ${fmt} fixture missing::No scan-completion fixture exists for ${fmt}; the silent-success class (#888) is NOT covered for this format. Tracked under artifact-keeper-test#62."
        done
        # Repeat the summary in the plain runner log so it is visible
        # without opening the annotations pane.
        printf '\n'
        printf '%s\n' "Scan-completion format coverage:"
        printf '%s\n' " wired: npm"
        printf '%s\n' " deferred: oci, maven, pypi, cargo, helm (artifact-keeper-test#62)"
# -------------------------------------------------------------------
# SBOM correctness gate (scaffold).
#
# Pins the SBOM endpoint contract (POST /api/v1/sbom returns 200
# with the documented SbomResponse shape) so an endpoint deletion
# or 5xx regression fails the release loud. The component_count > 0
# assertion is deferred to artifact-keeper#903 (--list-all-pkgs)
# per the #57 epic.
# -------------------------------------------------------------------
sbom-correctness-gate:
  needs: deploy
  runs-on: ak-e2e-runners
  timeout-minutes: 6
  env:
    BASE_URL: ${{ needs.deploy.outputs.backend_url }}
    RUN_ID: ${{ needs.deploy.outputs.run_id }}
    ADMIN_PASS: TestRunner!2026secure
    JUNIT_OUTPUT_DIR: /tmp/test-results
    # NOTE(review): sibling jobs in this workflow document RELEASE_GATE=1
    # as turning common.sh's skip_suite() into a hard fail; presumably
    # the same applies to this gate script -- confirm.
    RELEASE_GATE: '1'
  steps:
    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      with:
        repository: artifact-keeper/artifact-keeper-test

    - name: Run SBOM correctness gate
      run: |
        mkdir -p "$JUNIT_OUTPUT_DIR"
        chmod +x tests/release-gate/sbom-correctness-gate.sh
        ./tests/release-gate/sbom-correctness-gate.sh

    # The log capture below shells out to kubectl, which this job does
    # not otherwise install. Without it the "logs" would contain only
    # "kubectl: command not found". Installed on the failure path only.
    - name: Install kubectl (failure diagnostics only)
      if: failure()
      uses: azure/setup-kubectl@829323503d1be3d00ca8346e5391ca0b07a9ab0d # v5.1.0

    - name: Capture scanner pod logs on failure
      if: failure()
      run: |
        # The namespace follows the deploy job's `test-<run_id>`
        # naming; RUN_ID is the job-level value taken from
        # needs.deploy.outputs.run_id. Every capture is best-effort
        # (`|| true`), so a missing pod or kubeconfig never masks the
        # real failure.
        NS="test-${RUN_ID}"
        mkdir -p /tmp/test-logs
        kubectl -n "$NS" logs -l app.kubernetes.io/component=scanner \
          --tail=2000 > /tmp/test-logs/sbom-scanner.log 2>&1 || true
        # #1379: the chart labels Trivy pods with
        # app.kubernetes.io/component=trivy, so the `scanner` selector
        # alone can come back empty. Capture by the trivy component
        # label as well, matching the scan-completion gate.
        kubectl -n "$NS" logs -l app.kubernetes.io/component=trivy \
          --tail=2000 > /tmp/test-logs/sbom-trivy.log 2>&1 || true
        # NOTE(review): given the #1379 labelling scheme, confirm that
        # `app=artifact-keeper-backend` still matches backend pods.
        kubectl -n "$NS" logs -l app=artifact-keeper-backend \
          --tail=1000 > /tmp/test-logs/sbom-backend.log 2>&1 || true

    # Results and any captured logs are uploaded on every outcome.
    - name: Upload SBOM gate results
      if: always()
      uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
      with:
        name: junit-sbom-correctness
        path: |
          /tmp/test-results/
          /tmp/test-logs/
        if-no-files-found: ignore
# -------------------------------------------------------------------
# Pinned-CVE assertion (#64).
#
# Tightens the gate beyond "findings_count >= 1" to a specific CVE
# id. Catches scanner-DB drift and parser-correctness regressions
# that the findings-count check alone would let through.
# -------------------------------------------------------------------
pinned-cve-gate:
  needs: deploy
  runs-on: ak-e2e-runners
  timeout-minutes: 8
  env:
    BASE_URL: ${{ needs.deploy.outputs.backend_url }}
    RUN_ID: ${{ needs.deploy.outputs.run_id }}
    ADMIN_PASS: TestRunner!2026secure
    JUNIT_OUTPUT_DIR: /tmp/test-results
    RELEASE_GATE: '1'
    # CVE-2019-10744 (lodash 4.17.4 prototype pollution) is what the
    # lite fixture pins. log4j 2.14.0 / CVE-2021-44228 will move
    # under the oci matrix entry once the oci fixture-builder lands
    # (#62 + #64 extension).
    EXPECTED_VULN_CVE: CVE-2019-10744
    # Trivy DB age threshold for the freshness pre-flight. The DB is
    # rebuilt every 6 hours upstream; we accept anything <= 14 days
    # so weekend gaps and slow mirror sync do not generate false
    # failures, while still rejecting a DB stale enough to make the
    # pinned-CVE result untrustworthy. Override through the repo
    # variable TRIVY_DB_MAX_AGE_DAYS if needed.
    TRIVY_DB_MAX_AGE_DAYS: ${{ vars.TRIVY_DB_MAX_AGE_DAYS || '14' }}
  steps:
    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      with:
        repository: artifact-keeper/artifact-keeper-test

    - name: Install kubectl
      uses: azure/setup-kubectl@829323503d1be3d00ca8346e5391ca0b07a9ab0d # v5.1.0

    - name: Trivy DB freshness pre-flight
      # Distinguish "CVE database is stale" from "CVE-2019-10744 not
      # surfaced". Without this step, a stale Trivy DB makes the
      # pinned-CVE assertion fail with a misleading "CVE not found"
      # message that blames the gate, when the actual root cause is
      # upstream DB staleness.
      #
      # Exit codes:
      #   0 - DB reachable, age within TRIVY_DB_MAX_AGE_DAYS
      #  42 - DB too old (distinct so the rollup operator can tell
      #       scanner DB drift apart from a real CVE-detection bug)
      #  43 - no Running Trivy pod, `trivy --version` returned
      #       nothing, or UpdatedAt could not be parsed
      run: |
        set -uo pipefail
        # The namespace follows the deploy job's `test-<run_id>`
        # naming; RUN_ID is the job-level value taken from
        # needs.deploy.outputs.run_id.
        NS="test-${RUN_ID}"
        MAX_AGE_DAYS="${TRIVY_DB_MAX_AGE_DAYS}"
        MAX_AGE_SECONDS=$(( MAX_AGE_DAYS * 86400 ))
        # Locate a Trivy pod to kubectl-exec into. The Helm chart names
        # the deployment artifact-keeper-trivy.
        #
        # Label selector note (#1379): the chart's _helpers.tpl labels
        # every component with app.kubernetes.io/name=artifact-keeper
        # (the chart name) and distinguishes components via
        # app.kubernetes.io/component=<name>. Earlier revisions of this
        # pre-flight queried `app.kubernetes.io/name=trivy`, which
        # never matched, producing the misleading "Trivy pod not
        # found" error even though the deploy job's
        # `kubectl rollout status deployment/artifact-keeper-trivy`
        # had already succeeded. The correct selector is
        # `app.kubernetes.io/component=trivy`.
        #
        # The field selector restricts the match to pods in the Running
        # phase, so a lingering Failed/Evicted/Pending pod carrying the
        # same label can never be picked as items[0] and break the exec
        # calls below.
        POD=$(kubectl -n "$NS" get pods \
          -l app.kubernetes.io/component=trivy \
          --field-selector=status.phase=Running \
          -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true)
        if [ -z "$POD" ]; then
          echo "::error::Trivy pod not found in namespace ${NS} (pre-flight cannot run)"
          echo "::error::Checked selector: app.kubernetes.io/component=trivy (status.phase=Running)"
          echo "::error::Deploy job should have installed Trivy via values-test-full.yaml (trivy.enabled=true)."
          kubectl -n "$NS" get pods --show-labels 2>&1 | head -40 || true
          exit 43
        fi
        # Warm up the Trivy vulnerability DB before reading its
        # UpdatedAt. Trivy 0.62 downloads the DB lazily on first scan,
        # not at pod startup, so `trivy --version` immediately after
        # rollout returns only `Version: <x>` with no `UpdatedAt:`
        # line, which the parser below cannot handle (#197). Using
        # `image --download-db-only` is idempotent: it no-ops if the
        # DB is already present and within the upstream
        # download-interval window. The exec is tried with an explicit
        # `-c trivy` container first, then without it.
        echo "Warming up Trivy vulnerability DB..."
        kubectl -n "$NS" exec "$POD" -c trivy -- \
          trivy image --download-db-only --quiet >/dev/null 2>&1 \
          || kubectl -n "$NS" exec "$POD" -- \
          trivy image --download-db-only --quiet >/dev/null 2>&1 \
          || echo "::warning::trivy image --download-db-only returned non-zero; will inspect --version anyway"
        # trivy --version emits "Vulnerability DB: ... UpdatedAt: 2026-04-30 ..."
        # in older versions and a YAML-ish block in newer versions.
        # Parse both shapes.
        VER_OUT=$(kubectl -n "$NS" exec "$POD" -c trivy -- trivy --version 2>/dev/null \
          || kubectl -n "$NS" exec "$POD" -- trivy --version 2>/dev/null \
          || true)
        if [ -z "$VER_OUT" ]; then
          echo "::error::trivy --version returned empty output from pod ${POD}"
          exit 43
        fi
        echo "trivy --version output:"
        echo "$VER_OUT"
        echo ""
        # Extract UpdatedAt timestamp. Format varies:
        #   " UpdatedAt: 2026-04-30 12:34:56.789 +0000 UTC"
        #   " UpdatedAt 2026-04-30 12:34:56.789 +0000 UTC"
        #
        # Pipeline hardening (#197): grep's no-match exit code (1)
        # used to propagate via `set -o pipefail` and the workflow
        # shell's implicit `set -e`, killing the script before the
        # `if [ -z "$DB_DATE" ]` guard below could surface a clean
        # exit 43. The `|| true` here keeps `DB_DATE` empty on
        # no-match so the guard does its job.
        DB_DATE=$( { echo "$VER_OUT" | grep -iE 'UpdatedAt' || true; } | head -n1 \
          | sed -E 's/.*UpdatedAt[: ]+([0-9-]+ [0-9:.]+).*/\1/' \
          | awk '{print $1" "$2}')
        # Reject both an empty capture and one that `date` cannot parse.
        if [ -z "$DB_DATE" ] || ! date -d "$DB_DATE" +%s >/dev/null 2>&1; then
          echo "::error::Could not parse Trivy DB UpdatedAt from --version output."
          echo "::error::Likely cause: Trivy DB download failed during warmup, or trivy --version output format changed."
          echo "::error::Raw version output above for diagnosis."
          exit 43
        fi
        # Both timestamps are taken in UTC so the age does not depend
        # on the runner's local timezone.
        DB_EPOCH=$(date -d "$DB_DATE" -u +%s)
        NOW_EPOCH=$(date -u +%s)
        AGE_SECONDS=$(( NOW_EPOCH - DB_EPOCH ))
        AGE_DAYS=$(( AGE_SECONDS / 86400 ))
        echo "Trivy DB UpdatedAt: ${DB_DATE} (age: ${AGE_DAYS} days, threshold: ${MAX_AGE_DAYS} days)"
        if [ "$AGE_SECONDS" -gt "$MAX_AGE_SECONDS" ]; then
          echo "::error::Trivy DB is ${AGE_DAYS} days old, exceeds threshold of ${MAX_AGE_DAYS} days."
          echo "::error::The pinned-CVE assertion below would surface a misleading 'CVE-2019-10744 not found' failure;"
          echo "::error::the actual root cause is upstream DB staleness. Refresh Trivy DB or bump the runner image."
          exit 42
        fi
        echo "Trivy DB freshness OK."

    - name: Run pinned-CVE gate
      run: |
        mkdir -p "$JUNIT_OUTPUT_DIR"
        chmod +x tests/release-gate/test-pinned-cve.sh
        ./tests/release-gate/test-pinned-cve.sh

    # Results are uploaded on every outcome, pass or fail.
    - name: Upload pinned-CVE gate results
      if: always()
      uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
      with:
        name: junit-pinned-cve
        path: /tmp/test-results/
        if-no-files-found: ignore
# -------------------------------------------------------------------
# Format tests (8 parallel batches)
# -------------------------------------------------------------------
format-tests:
  # Package-format protocol tests, fanned out over eight matrix batches that
  # run in parallel. Each batch installs only the client tooling its own
  # scripts need, runs every script in its list (continuing past failures),
  # and uploads whatever JUnit XML was written to JUNIT_OUTPUT_DIR.
  needs: deploy
  if: inputs.test_suite == 'all' || inputs.test_suite == 'formats'
  runs-on: ak-e2e-runners
  strategy:
    # Let every batch finish; one broken format must not cancel the others.
    fail-fast: false
    matrix:
      batch:
        # `scripts` is a space-separated list of files under tests/formats/.
        - name: node
          scripts: "test-npm.sh test-npm-remote.sh test-vscode.sh"
        - name: python
          scripts: "test-pypi.sh test-pypi-native-client.sh test-pypi-remote.sh test-conda.sh test-huggingface.sh test-mlmodel.sh"
        - name: jvm
          scripts: "test-maven.sh test-maven-native-client.sh test-maven-remote.sh test-maven-virtual-snapshot.sh test-sbt.sh test-gradle-conformance.sh"
        - name: rust-go-swift
          scripts: "test-cargo.sh test-cargo-remote.sh test-go.sh test-swift.sh test-pub.sh"
        - name: system-packages
          scripts: "test-debian.sh test-rpm.sh test-alpine.sh test-opkg.sh"
        - name: containers
          scripts: "test-oci.sh test-oci-remote.sh test-docker-native-client.sh test-helm.sh test-incus.sh"
        - name: misc-native
          scripts: "test-terraform.sh test-composer.sh test-hex.sh test-rubygems.sh test-nuget.sh test-cocoapods.sh test-cran.sh"
        - name: generic-protocol
          scripts: "test-generic.sh test-generic-native-client.sh test-gitlfs.sh test-protobuf.sh test-bazel.sh test-conan.sh test-conan-auth.sh test-conan-recipes.sh test-conan-packages.sh test-conan-search.sh test-conan-revisions.sh test-conan-remote.sh test-conan-errors.sh test-conan-stress.sh test-ansible.sh test-p2.sh test-jetbrains.sh test-vagrant.sh test-wasm.sh test-puppet.sh test-chef.sh"
  env:
    BASE_URL: ${{ needs.deploy.outputs.backend_url }}
    RUN_ID: ${{ needs.deploy.outputs.run_id }}
    ADMIN_PASS: TestRunner!2026secure
    JUNIT_OUTPUT_DIR: /tmp/test-results
    # RELEASE_GATE=1 turns common.sh's skip_suite() into a hard fail.
    # A silently-skipped test in release-gate context is exactly the
    # silent-success class (#870/#871/#888) the gate exists to catch.
    RELEASE_GATE: '1'
  steps:
    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      with:
        repository: artifact-keeper/artifact-keeper-test
    - name: Install test dependencies
      env:
        # Hand the matrix value to the script through the environment rather
        # than splicing the expression into the shell source.
        BATCH: ${{ matrix.batch.name }}
      run: |
        echo "Installing dependencies for batch: $BATCH"
        # Common: ensure zip is available (used by maven, swift, vscode, go)
        if ! command -v zip &>/dev/null; then
          sudo apt-get update -qq && sudo apt-get install -y -qq zip > /dev/null
        fi
        case "$BATCH" in
          node)
            if ! command -v npm &>/dev/null; then
              curl -fsSL https://deb.nodesource.com/setup_22.x | sudo -E bash - > /dev/null 2>&1
              sudo apt-get install -y -qq nodejs > /dev/null
            fi
            ;;
          python)
            if ! command -v python3 &>/dev/null; then
              sudo apt-get update -qq && sudo apt-get install -y -qq python3 python3-pip python3-setuptools python3-venv > /dev/null
            fi
            # python3-venv is required by test-pypi-native-client.sh; install
            # it even if python3 is already present, since the bundled
            # interpreter may have ensurepip stripped out. Probe ensurepip
            # itself: on Debian/Ubuntu `import venv` succeeds without the
            # python3-venv package, so probing `venv` never triggers the
            # install.
            if ! python3 -c 'import ensurepip' &>/dev/null; then
              sudo apt-get update -qq && sudo apt-get install -y -qq python3-venv > /dev/null || true
            fi
            ;;
          jvm)
            # mvn is required by test-maven-native-client.sh; the suite is
            # auto-skipped if maven is missing, but we install it here so
            # the gate actually exercises native-client coverage.
            if ! command -v mvn &>/dev/null; then
              sudo apt-get update -qq && sudo apt-get install -y -qq maven > /dev/null || true
            fi
            ;;
          rust-go-swift)
            if ! command -v go &>/dev/null; then
              GO_VERSION="1.23.6"
              curl -sSL "https://go.dev/dl/go${GO_VERSION}.linux-amd64.tar.gz" | sudo tar -C /usr/local -xz
              # GITHUB_PATH takes effect for the following steps.
              echo "/usr/local/go/bin" >> "$GITHUB_PATH"
            fi
            if ! command -v cargo &>/dev/null; then
              curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain stable --profile minimal
              echo "$HOME/.cargo/bin" >> "$GITHUB_PATH"
            fi
            ;;
          containers)
            # Helm is needed for test-helm.sh
            if ! command -v helm &>/dev/null; then
              curl -sSL https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
            fi
            ;;
          system-packages)
            # ar (from binutils) for Debian package assembly
            if ! command -v ar &>/dev/null; then
              sudo apt-get update -qq && sudo apt-get install -y -qq binutils > /dev/null
            fi
            ;;
        esac
    - name: Run ${{ matrix.batch.name }} format tests
      env:
        BATCH_SCRIPTS: ${{ matrix.batch.scripts }}
      run: |
        mkdir -p "$JUNIT_OUTPUT_DIR"
        exit_code=0
        # Unquoted on purpose: BATCH_SCRIPTS is a space-separated list.
        # Every script runs; the step fails at the end if any of them failed.
        for script in $BATCH_SCRIPTS; do
          echo "=== Running ${script} ==="
          if ! bash "tests/formats/${script}"; then
            echo "FAILED: ${script}"
            exit_code=1
          fi
        done
        exit $exit_code
    - name: Upload test results
      # always(): publish results for failed batches too.
      if: always()
      uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
      with:
        name: junit-formats-${{ matrix.batch.name }}
        path: /tmp/test-results/*.xml
        if-no-files-found: ignore
# -------------------------------------------------------------------
# Security tests
#
# cache-poisoning + cache-stampede boot a Python mock upstream on the
# runner pod and need the backend to dial back to the runner pod.
# We resolve the runner's pod IP at runtime and export the bare IPv4
# address as MOCK_UPSTREAM_HOSTNAME, so the backend does not depend on
# cluster DNS serving the pod-DNS form (10-1-2-3.<ns>.pod.cluster.local).
#
# PROXY_MAX_CONCURRENT_FETCHES / PROXY_QUEUE_TIMEOUT_SECS pin the
# values the test asserts against so chart-default drift doesn't
# silently make the assertion measure the wrong limit. They MUST match
# the values the deployed backend was configured with.
# -------------------------------------------------------------------
security-tests:
  # Runs the `security` suite against the backend URL published by the deploy
  # job. Before the suite starts, the runner's own pod IP is exported so the
  # backend can dial back to mock servers started on the runner.
  runs-on: ak-e2e-runners
  timeout-minutes: 20
  needs: deploy
  if: >-
    inputs.test_suite == 'all' ||
    inputs.test_suite == 'security'
  # Job-level continue-on-error. Rationale carried over from the original
  # note: test-scan-completes.sh asserts that the Grype scanner finishes with
  # findings, and Grype on the v1.1.x backend image fails deterministically
  # because its vulnerability DB is not pre-seeded in the Dockerfile and the
  # network-restricted ARC runner pods cannot fetch grype.anchore.io at scan
  # time. The quality gate is LAST-scanner-wins (policy_service reads
  # LIMIT 1 ORDER BY created_at DESC), so a Trivy success satisfies
  # block_unscanned. Fix tracked for v1.1.10 in artifact-keeper#1001.
  # NOTE(review): the original note says the other 44 security tests still
  # gate the release, but this flag applies to the whole job -- confirm that
  # a downstream step evaluates the JUnit results.
  continue-on-error: true
  env:
    # Strict mode: with RELEASE_GATE=1, common.sh's skip_suite() fails hard,
    # so a silently skipped test (the #870/#871/#888 silent-success class)
    # cannot pass the gate.
    RELEASE_GATE: '1'
    JUNIT_OUTPUT_DIR: /tmp/test-results
    ADMIN_PASS: 'TestRunner!2026secure'
    BASE_URL: ${{ needs.deploy.outputs.backend_url }}
    RUN_ID: ${{ needs.deploy.outputs.run_id }}
    # Per-script budget for run-suite.sh. Several Epic 2 tests (cve-history,
    # license-policy, scan-policy, quality-gate-blocks-upload,
    # scan-dedup-checksum) poll for scan completion with the default
    # SCAN_TIMEOUT=180; the default 120s wrapper would SIGKILL them before
    # they write JUnit XML. 300s leaves 120s over SCAN_TIMEOUT for fixture
    # build, upload, and cleanup.
    TEST_TIMEOUT: '300'
    # Stampede / poisoning knobs. These pin the limits the tests assert
    # against and must equal what the chart rendered into the backend
    # Deployment (see helm values-test.yaml).
    PROXY_MAX_CONCURRENT_FETCHES: '20'
    PROXY_QUEUE_TIMEOUT_SECS: '5'
    STAMPEDE_UPSTREAM_DELAY_MS: '2000'
    # Branch hint for the feature-flag layer (issue #65): tags starting with
    # `1.1.` map to release/1.1.x, tags starting with `1.2.` map to
    # release/1.2.x, anything else maps to main. This suite hosts
    # test-feature-flag-drift.sh, the truth-side check that AK_FEATURES
    # matches the version the deployed backend reports.
    AK_BACKEND_BRANCH: ${{ startsWith(inputs.backend_tag, '1.1.') && 'release/1.1.x' || (startsWith(inputs.backend_tag, '1.2.') && 'release/1.2.x' || 'main') }}
  steps:
    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      with:
        repository: artifact-keeper/artifact-keeper-test
    - name: Resolve runner pod address for backend dial-back
      id: mock-host
      run: |
        # The runner is itself a Pod, so its pod IP is reachable from the
        # backend Pod over the cluster network. The bare IP is exported as
        # MOCK_UPSTREAM_HOSTNAME so the backend's upstream-URL resolver does
        # not rely on the `<ip-dashed>.<ns>.pod.cluster.local` DNS form,
        # which depends on the CoreDNS `pods` plugin mode.
        #
        # Prefer $POD_IP (populated when the ARC runner spec injects it via
        # the downward API); otherwise fall back to `hostname -i`.
        if [ -z "${POD_IP:-}" ]; then
          POD_IP="$(hostname -i 2>/dev/null | awk '{print $1}')"
        fi
        if [ -z "$POD_IP" ]; then
          echo "ERROR: could not determine runner pod IP for mock dial-back" >&2
          exit 1
        fi
        # Must look like a dotted-quad IPv4 address.
        if ! grep -Eq '^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$' <<<"$POD_IP"; then
          echo "ERROR: POD_IP '${POD_IP}' is not an IPv4 address" >&2
          exit 1
        fi
        # Loopback is only reachable from inside the runner pod itself, not
        # from the backend pod across the cluster network.
        if grep -Eq '^127\.' <<<"$POD_IP"; then
          echo "ERROR: POD_IP '${POD_IP}' is loopback; backend pod cannot reach this" >&2
          exit 1
        fi
        echo "Runner pod IP: ${POD_IP}"
        # GITHUB_ENV makes the value visible to the following steps.
        printf 'MOCK_UPSTREAM_HOSTNAME=%s\n' "$POD_IP" >> "$GITHUB_ENV"
    - name: Run security tests
      run: |
        mkdir -p "${JUNIT_OUTPUT_DIR}"
        chmod +x scripts/run-suite.sh
        ./scripts/run-suite.sh --suite security --run-id "$RUN_ID"
    - name: Upload test results
      if: ${{ always() }}
      uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
      with:
        name: junit-security
        # Whole directory rather than a *.xml glob, so per-test diagnostic
        # JSON breadcrumbs (e.g. scan-completes-final-resp.json) reach the
        # operator instead of being dropped.
        path: /tmp/test-results/
        if-no-files-found: ignore
# -------------------------------------------------------------------
# Pull-through cache reliability tests (epic #69 cluster D, v1.1.9).
#
# Lives in its own job (not merged into repo-tests or security-tests)
# for three reasons:
#
# 1. The cross-format shadowing-guard test takes ~60s on its own
# because it cycles through six format handlers. Folding that
# into repo-tests would dominate the suite's wall-clock.
#
# 2. The cache-ttl tests need stable TTL plumbing on the backend;
# a regression in only this surface should fail this job
# without blocking the other 20 jobs.
#
# 3. RELEASE_GATE=1 is set: silent skips here are the exact
# silent-success class (#888) this cluster exists to catch.
# -------------------------------------------------------------------
pullthrough-tests:
  # Runs the `pullthrough` suite against the backend URL published by the
  # deploy job and uploads the whole results directory.
  runs-on: ak-e2e-runners
  timeout-minutes: 15
  needs: deploy
  if: >-
    inputs.test_suite == 'all' ||
    inputs.test_suite == 'pullthrough'
  env:
    # Strict mode: with RELEASE_GATE=1, common.sh's skip_suite() fails hard,
    # so a silently skipped test (the #870/#871/#888 silent-success class)
    # cannot pass the gate.
    RELEASE_GATE: '1'
    JUNIT_OUTPUT_DIR: /tmp/test-results
    ADMIN_PASS: 'TestRunner!2026secure'
    BASE_URL: ${{ needs.deploy.outputs.backend_url }}
    RUN_ID: ${{ needs.deploy.outputs.run_id }}
    # Per-script budget for run-suite.sh. SCAN_TIMEOUT defaults to 180s in
    # test-stuck-scan-janitor.sh and the scan-depth scripts, and
    # virtual-shadowing-guard.sh walks 6 formats serially; the default 120s
    # budget would SIGTERM polls mid-flight on a slightly slow backend and
    # bury real signal under timeout failures. 300s is the same value the
    # security-tests and webhook-tests jobs use.
    TEST_TIMEOUT: '300'
    # Branch hint consumed by tests/lib/feature-flags.sh (issue #65),
    # derived from the backend image tag: a tag starting with `1.1.`
    # (e.g. `1.1.9`, `1.1.10-rc.2`) maps to release/1.1.x, a tag starting
    # with `1.2.` maps to release/1.2.x, and everything else (`latest`,
    # `main`, ...) maps to main. Per the original note,
    # test-feature-flag-drift.sh fails loudly with the actual /health
    # version when this guess disagrees with the deployment.
    # NOTE(review): the original note also mentions a conservative 1.1.x
    # fallback in feature_flags_init; this expression always yields one of
    # the three values above, so confirm whether that fallback is reachable
    # from this workflow.
    AK_BACKEND_BRANCH: ${{ startsWith(inputs.backend_tag, '1.1.') && 'release/1.1.x' || (startsWith(inputs.backend_tag, '1.2.') && 'release/1.2.x' || 'main') }}
  steps:
    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      with:
        repository: artifact-keeper/artifact-keeper-test
    - name: Run pull-through cache tests
      run: |
        mkdir -p "${JUNIT_OUTPUT_DIR}"
        chmod +x scripts/run-suite.sh
        ./scripts/run-suite.sh --suite pullthrough --run-id "$RUN_ID"
    - name: Upload test results
      # always(): publish results even when the suite step failed.
      if: ${{ always() }}
      uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
      with:
        name: junit-pullthrough
        path: /tmp/test-results/
        if-no-files-found: ignore
# -------------------------------------------------------------------
# Compatibility tests
# -------------------------------------------------------------------
compatibility-tests:
  # Runs the `compatibility` suite from artifact-keeper-test against the
  # backend URL published by the deploy job, then uploads its JUnit XML.
  runs-on: ak-e2e-runners
  needs: deploy
  if: >-
    inputs.test_suite == 'all' ||
    inputs.test_suite == 'compatibility'
  env:
    # Strict mode: with RELEASE_GATE=1, common.sh's skip_suite() fails hard,
    # so a silently skipped test (the #870/#871/#888 silent-success class)
    # cannot pass the gate.
    RELEASE_GATE: '1'
    JUNIT_OUTPUT_DIR: /tmp/test-results
    ADMIN_PASS: 'TestRunner!2026secure'
    BASE_URL: ${{ needs.deploy.outputs.backend_url }}
    RUN_ID: ${{ needs.deploy.outputs.run_id }}
  steps:
    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      with:
        repository: artifact-keeper/artifact-keeper-test
    - name: Run compatibility tests
      run: |
        mkdir -p "${JUNIT_OUTPUT_DIR}"
        chmod +x scripts/run-suite.sh
        ./scripts/run-suite.sh --suite compatibility --run-id "$RUN_ID"
    - name: Upload test results
      # always(): publish results even when the suite step failed.
      if: ${{ always() }}
      uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
      with:
        name: junit-compatibility
        path: '/tmp/test-results/*.xml'
        if-no-files-found: ignore
# -------------------------------------------------------------------
# Repository type tests (virtual, remote, CRUD, labels)
# -------------------------------------------------------------------
repo-tests:
  # Runs the `repos` suite from artifact-keeper-test against the backend URL
  # published by the deploy job, then uploads its JUnit XML.
  runs-on: ak-e2e-runners
  needs: deploy
  if: >-
    inputs.test_suite == 'all' ||
    inputs.test_suite == 'repos'
  env:
    # Strict mode: with RELEASE_GATE=1, common.sh's skip_suite() fails hard,
    # so a silently skipped test (the #870/#871/#888 silent-success class)
    # cannot pass the gate.
    RELEASE_GATE: '1'
    JUNIT_OUTPUT_DIR: /tmp/test-results
    ADMIN_PASS: 'TestRunner!2026secure'
    BASE_URL: ${{ needs.deploy.outputs.backend_url }}
    RUN_ID: ${{ needs.deploy.outputs.run_id }}
  steps:
    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      with:
        repository: artifact-keeper/artifact-keeper-test
    - name: Run repo type tests
      run: |
        mkdir -p "${JUNIT_OUTPUT_DIR}"
        chmod +x scripts/run-suite.sh
        ./scripts/run-suite.sh --suite repos --run-id "$RUN_ID"
    - name: Upload test results
      # always(): publish results even when the suite step failed.
      if: ${{ always() }}
      uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
      with:
        name: junit-repos
        path: '/tmp/test-results/*.xml'
        if-no-files-found: ignore
# -------------------------------------------------------------------
# Promotion tests
# -------------------------------------------------------------------
promotion-tests:
  # Runs the `promotion` suite from artifact-keeper-test against the backend
  # URL published by the deploy job, then uploads its JUnit XML.
  runs-on: ak-e2e-runners
  needs: deploy
  if: >-
    inputs.test_suite == 'all' ||
    inputs.test_suite == 'promotion'
  env:
    # Strict mode: with RELEASE_GATE=1, common.sh's skip_suite() fails hard,
    # so a silently skipped test (the #870/#871/#888 silent-success class)
    # cannot pass the gate.
    RELEASE_GATE: '1'
    JUNIT_OUTPUT_DIR: /tmp/test-results
    ADMIN_PASS: 'TestRunner!2026secure'
    BASE_URL: ${{ needs.deploy.outputs.backend_url }}
    RUN_ID: ${{ needs.deploy.outputs.run_id }}
  steps:
    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      with:
        repository: artifact-keeper/artifact-keeper-test
    - name: Run promotion tests
      run: |
        mkdir -p "${JUNIT_OUTPUT_DIR}"
        chmod +x scripts/run-suite.sh
        ./scripts/run-suite.sh --suite promotion --run-id "$RUN_ID"
    - name: Upload test results
      # always(): publish results even when the suite step failed.
      if: ${{ always() }}
      uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
      with:
        name: junit-promotion
        path: '/tmp/test-results/*.xml'
        if-no-files-found: ignore
# -------------------------------------------------------------------
# RBAC tests
# -------------------------------------------------------------------
rbac-tests:
  # Runs the `rbac` suite from artifact-keeper-test against the backend URL
  # published by the deploy job, then uploads its JUnit XML.
  runs-on: ak-e2e-runners
  needs: deploy
  if: >-
    inputs.test_suite == 'all' ||
    inputs.test_suite == 'rbac'
  env:
    # Strict mode: with RELEASE_GATE=1, common.sh's skip_suite() fails hard,
    # so a silently skipped test (the #870/#871/#888 silent-success class)
    # cannot pass the gate.
    RELEASE_GATE: '1'
    JUNIT_OUTPUT_DIR: /tmp/test-results
    ADMIN_PASS: 'TestRunner!2026secure'
    BASE_URL: ${{ needs.deploy.outputs.backend_url }}
    RUN_ID: ${{ needs.deploy.outputs.run_id }}
  steps:
    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      with:
        repository: artifact-keeper/artifact-keeper-test
    - name: Run RBAC tests
      run: |
        mkdir -p "${JUNIT_OUTPUT_DIR}"
        chmod +x scripts/run-suite.sh
        ./scripts/run-suite.sh --suite rbac --run-id "$RUN_ID"
    - name: Upload test results
      # always(): publish results even when the suite step failed.
      if: ${{ always() }}
      uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
      with:
        name: junit-rbac
        path: '/tmp/test-results/*.xml'
        if-no-files-found: ignore
# -------------------------------------------------------------------
# Lifecycle tests
# -------------------------------------------------------------------
lifecycle-tests:
  # Runs the `lifecycle` suite from artifact-keeper-test against the backend
  # URL published by the deploy job, then uploads its JUnit XML.
  runs-on: ak-e2e-runners
  needs: deploy
  if: >-
    inputs.test_suite == 'all' ||
    inputs.test_suite == 'lifecycle'
  env:
    # Strict mode: with RELEASE_GATE=1, common.sh's skip_suite() fails hard,
    # so a silently skipped test (the #870/#871/#888 silent-success class)
    # cannot pass the gate.
    RELEASE_GATE: '1'
    JUNIT_OUTPUT_DIR: /tmp/test-results
    ADMIN_PASS: 'TestRunner!2026secure'
    BASE_URL: ${{ needs.deploy.outputs.backend_url }}
    RUN_ID: ${{ needs.deploy.outputs.run_id }}
  steps:
    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      with:
        repository: artifact-keeper/artifact-keeper-test
    - name: Run lifecycle tests
      run: |
        mkdir -p "${JUNIT_OUTPUT_DIR}"
        chmod +x scripts/run-suite.sh
        ./scripts/run-suite.sh --suite lifecycle --run-id "$RUN_ID"
    - name: Upload test results
      # always(): publish results even when the suite step failed.
      if: ${{ always() }}
      uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
      with:
        name: junit-lifecycle
        path: '/tmp/test-results/*.xml'
        if-no-files-found: ignore
# -------------------------------------------------------------------
# Webhook tests
# -------------------------------------------------------------------
webhook-tests:
  # Runs the `webhooks` suite against the backend URL published by the deploy
  # job. Before the suite starts, the runner's pod IP is exported so the
  # backend can deliver webhooks to a receiver started on the runner.
  runs-on: ak-e2e-runners
  needs: deploy
  if: >-
    inputs.test_suite == 'all' ||
    inputs.test_suite == 'webhooks'
  env:
    # Strict mode: with RELEASE_GATE=1, common.sh's skip_suite() fails hard,
    # so a silently skipped test (the #870/#871/#888 silent-success class)
    # cannot pass the gate.
    RELEASE_GATE: '1'
    JUNIT_OUTPUT_DIR: /tmp/test-results
    ADMIN_PASS: 'TestRunner!2026secure'
    BASE_URL: ${{ needs.deploy.outputs.backend_url }}
    RUN_ID: ${{ needs.deploy.outputs.run_id }}
    # Per-script budget for run-suite.sh. The webhook resilience tests poll
    # retry / dead-letter behavior on schedules of up to 180s
    # (WEBHOOK_RETRY_TIMEOUT), so the wrapper must allow more than that.
    TEST_TIMEOUT: '300'
  steps:
    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      with:
        repository: artifact-keeper/artifact-keeper-test
    - name: Resolve runner pod address for backend dial-back
      id: receiver-host
      run: |
        # The mock webhook receiver runs in this runner pod while the backend
        # runs in a DIFFERENT pod, so 127.0.0.1 from the backend's side is
        # the backend itself and would never reach the receiver. Loopback is
        # also hard-blocked by the backend's SSRF guard (#199,
        # artifact-keeper validation.rs:203-212), so the runner pod's
        # RFC1918 IP MUST be used.
        if [ -z "${POD_IP:-}" ]; then
          POD_IP="$(hostname -i 2>/dev/null | awk '{print $1}')"
        fi
        if [ -z "$POD_IP" ]; then
          echo "ERROR: could not determine runner pod IP" >&2
          exit 1
        fi
        # Must look like a dotted-quad IPv4 address.
        if ! grep -Eq '^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$' <<<"$POD_IP"; then
          echo "ERROR: POD_IP '${POD_IP}' is not an IPv4 address" >&2
          exit 1
        fi
        if grep -Eq '^127\.' <<<"$POD_IP"; then
          echo "ERROR: POD_IP '${POD_IP}' is loopback; backend pod cannot reach this" >&2
          exit 1
        fi
        echo "Runner pod IP: ${POD_IP}"
        # GITHUB_ENV makes the value visible to the following steps.
        printf 'WEBHOOK_RECEIVER_HOST=%s\n' "$POD_IP" >> "$GITHUB_ENV"
    - name: Run webhook tests
      run: |
        mkdir -p "${JUNIT_OUTPUT_DIR}"
        chmod +x scripts/run-suite.sh
        ./scripts/run-suite.sh --suite webhooks --run-id "$RUN_ID"
    - name: Upload test results
      # always(): publish results even when the suite step failed.
      if: ${{ always() }}
      uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
      with:
        name: junit-webhooks
        path: '/tmp/test-results/*.xml'
        if-no-files-found: ignore
# -------------------------------------------------------------------
# Search tests
# -------------------------------------------------------------------
search-tests:
  # Runs the `search` suite from artifact-keeper-test against the backend URL
  # published by the deploy job, then uploads its JUnit XML.
  runs-on: ak-e2e-runners
  needs: deploy
  if: >-
    inputs.test_suite == 'all' ||
    inputs.test_suite == 'search'
  env:
    # Strict mode: with RELEASE_GATE=1, common.sh's skip_suite() fails hard,
    # so a silently skipped test (the #870/#871/#888 silent-success class)
    # cannot pass the gate.
    RELEASE_GATE: '1'
    JUNIT_OUTPUT_DIR: /tmp/test-results
    ADMIN_PASS: 'TestRunner!2026secure'
    BASE_URL: ${{ needs.deploy.outputs.backend_url }}
    RUN_ID: ${{ needs.deploy.outputs.run_id }}
  steps:
    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      with:
        repository: artifact-keeper/artifact-keeper-test
    - name: Run search tests
      run: |
        mkdir -p "${JUNIT_OUTPUT_DIR}"
        chmod +x scripts/run-suite.sh
        ./scripts/run-suite.sh --suite search --run-id "$RUN_ID"
    - name: Upload test results
      # always(): publish results even when the suite step failed.
      if: ${{ always() }}
      uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
      with:
        name: junit-search
        path: '/tmp/test-results/*.xml'
        if-no-files-found: ignore
# -------------------------------------------------------------------
# Platform tests (signing, SBOM, curation, labels, audit, backup)
#
# Serialized after auth-tests to avoid cross-suite admin-JWT
# contamination. test-admin-password-recovery.sh changes the admin
# password, which calls the backend's change_password handler
# (backend/src/api/handlers/users.rs) and triggers
# invalidate_user_tokens(admin_id). That writes the admin's UUID
# into the global CREDENTIAL_INVALIDATIONS map in
# backend/src/services/auth_service.rs, so any auth-tests step
# running in parallel that uses the admin JWT starts getting 401s
# mid-suite once its cached token validation flips to "rejected".
# See #137. admin-tests is serialized for the same reason and on the
# same job ordering (see the comment block above the admin-tests
# job definition below).
# -------------------------------------------------------------------
platform-tests:
  # Runs the `platform` suite against the backend URL published by the deploy
  # job, then uploads its JUnit XML.
  #
  # auth-tests is listed in `needs` purely for ordering (see #137): this job
  # must not overlap auth-tests, but it does not depend on auth-tests passing.
  needs: [deploy, auth-tests]
  # A job whose `if` has no status-check function is skipped as soon as any
  # needed job is skipped or fails. auth-tests is skipped whenever
  # test_suite is 'platform', so without `!cancelled()` selecting the
  # platform suite would run nothing, and an auth-tests failure in an 'all'
  # run would hide platform results. Deploy success is still required
  # because the suite needs the deployed backend.
  if: >-
    !cancelled() &&
    needs.deploy.result == 'success' &&
    (inputs.test_suite == 'all' || inputs.test_suite == 'platform')
  runs-on: ak-e2e-runners
  env:
    BASE_URL: ${{ needs.deploy.outputs.backend_url }}
    RUN_ID: ${{ needs.deploy.outputs.run_id }}
    ADMIN_PASS: TestRunner!2026secure
    JUNIT_OUTPUT_DIR: /tmp/test-results
    # RELEASE_GATE=1 turns common.sh's skip_suite() into a hard fail.
    # A silently-skipped test in release-gate context is exactly the
    # silent-success class (#870/#871/#888) the gate exists to catch.
    RELEASE_GATE: '1'
  steps:
    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      with:
        repository: artifact-keeper/artifact-keeper-test
    - name: Run platform tests
      run: |
        mkdir -p "$JUNIT_OUTPUT_DIR"
        chmod +x scripts/run-suite.sh
        ./scripts/run-suite.sh --suite platform --run-id "${RUN_ID}"
    - name: Upload test results
      # always(): publish results even when the suite step failed.
      if: always()
      uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
      with:
        name: junit-platform
        path: /tmp/test-results/*.xml
        if-no-files-found: ignore
# -------------------------------------------------------------------
# Auth tests (tokens, TOTP, rate limiting)
# -------------------------------------------------------------------
auth-tests:
  # Runs the `auth` suite from artifact-keeper-test against the backend URL
  # published by the deploy job, then uploads its JUnit XML. platform-tests
  # and admin-tests list this job in their `needs`, so they start only after
  # it has finished.
  runs-on: ak-e2e-runners
  needs: deploy
  if: >-
    inputs.test_suite == 'all' ||
    inputs.test_suite == 'auth'
  env:
    # Strict mode: with RELEASE_GATE=1, common.sh's skip_suite() fails hard,
    # so a silently skipped test (the #870/#871/#888 silent-success class)
    # cannot pass the gate.
    RELEASE_GATE: '1'
    JUNIT_OUTPUT_DIR: /tmp/test-results
    ADMIN_PASS: 'TestRunner!2026secure'
    BASE_URL: ${{ needs.deploy.outputs.backend_url }}
    RUN_ID: ${{ needs.deploy.outputs.run_id }}
  steps:
    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      with:
        repository: artifact-keeper/artifact-keeper-test
    - name: Run auth tests
      run: |
        mkdir -p "${JUNIT_OUTPUT_DIR}"
        chmod +x scripts/run-suite.sh
        ./scripts/run-suite.sh --suite auth --run-id "$RUN_ID"
    - name: Upload test results
      # always(): publish results even when the suite step failed.
      if: ${{ always() }}
      uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
      with:
        name: junit-auth
        path: '/tmp/test-results/*.xml'
        if-no-files-found: ignore
# -------------------------------------------------------------------
# Admin tests (Epic 10, #77): operational admin endpoints
# livez, backup execute/cancel/delete, monitoring alerts,
# storage backends listing, reindex trigger.
#
# Serialized after auth-tests (same reason as platform-tests, see #137):
# test-admin-password-recovery.sh in platform-tests changes the admin
# password via change_password (backend/src/api/handlers/users.rs),
# which writes the admin's UUID into the global CREDENTIAL_INVALIDATIONS
# map (backend/src/services/auth_service.rs). Any suite using the admin
# JWT that runs concurrently with auth-tests' password-change paths
# starts getting 401s mid-suite once its cached token validation flips
# to "rejected". admin-tests reuses ADMIN_PASS and exercises the admin
# JWT on every endpoint it hits, so it has the same exposure as
# platform-tests and gets the same serialization.
# -------------------------------------------------------------------
admin-tests:
  # Runs the `admin` suite against the backend URL published by the deploy
  # job, after waiting for any OpenSearch pod in the test namespace to become
  # Ready, then uploads its JUnit XML.
  #
  # auth-tests is listed in `needs` purely for ordering (see #137).
  # NOTE(review): platform-tests and this job both wait only on auth-tests,
  # so they can overlap each other. If test-admin-password-recovery.sh lives
  # in platform-tests, this job may also need ordering relative to
  # platform-tests -- confirm.
  # NOTE(review): 'admin' is accepted here but is not among the
  # workflow_dispatch `test_suite` choices; it is only selectable through
  # workflow_call -- confirm that is intended.
  needs: [deploy, auth-tests]
  # A job whose `if` has no status-check function is skipped as soon as any
  # needed job is skipped or fails. auth-tests is skipped whenever
  # test_suite is 'admin', so without `!cancelled()` selecting the admin
  # suite would run nothing, and an auth-tests failure in an 'all' run would
  # hide admin results. Deploy success is still required because the suite
  # needs the deployed backend and namespace.
  if: >-
    !cancelled() &&
    needs.deploy.result == 'success' &&
    (inputs.test_suite == 'all' || inputs.test_suite == 'admin')
  runs-on: ak-e2e-runners
  env:
    BASE_URL: ${{ needs.deploy.outputs.backend_url }}
    RUN_ID: ${{ needs.deploy.outputs.run_id }}
    NAMESPACE: ${{ needs.deploy.outputs.namespace }}
    ADMIN_PASS: TestRunner!2026secure
    JUNIT_OUTPUT_DIR: /tmp/test-results
    # RELEASE_GATE=1 turns common.sh's skip_suite() into a hard fail.
    RELEASE_GATE: '1'
  steps:
    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      with:
        repository: artifact-keeper/artifact-keeper-test
    - name: Install kubectl
      uses: azure/setup-kubectl@829323503d1be3d00ca8346e5391ca0b07a9ab0d # v5.1.0
    - name: Wait for OpenSearch ready
      # test-reindex.sh hits POST /api/v1/admin/reindex which requires
      # OpenSearch to be ready. Without this wait the endpoint can 404
      # (route not yet mounted on the backend's OpenSearch-dependent
      # path) or 503 and the test was previously soft-skipping on 404,
      # passing vacuously under RELEASE_GATE=1. Wait up to 120s for the
      # OpenSearch pod to be Ready before running the suite.
      run: |
        # Try the bitnami chart label first, then app=opensearch as a
        # fallback for charts that use the legacy label scheme. If the
        # selector matches no pods (e.g. opensearch.enabled=false in this
        # build) the wait is a no-op and the suite proceeds, which is
        # the right behavior for builds that don't ship OpenSearch.
        set -e
        if kubectl -n "$NAMESPACE" get pod -l app.kubernetes.io/name=opensearch -o name 2>/dev/null | grep -q pod/; then
          kubectl -n "$NAMESPACE" wait --for=condition=Ready pod \
            -l app.kubernetes.io/name=opensearch --timeout=120s
        elif kubectl -n "$NAMESPACE" get pod -l app=opensearch -o name 2>/dev/null | grep -q pod/; then
          kubectl -n "$NAMESPACE" wait --for=condition=Ready pod \
            -l app=opensearch --timeout=120s
        else
          echo "No OpenSearch pods found in ${NAMESPACE}; skipping wait."
        fi
    - name: Run admin tests
      run: |
        mkdir -p "$JUNIT_OUTPUT_DIR"
        chmod +x scripts/run-suite.sh
        ./scripts/run-suite.sh --suite admin --run-id "${RUN_ID}"
    - name: Upload test results
      # always(): publish results even when the suite step failed.
      if: always()
      uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
      with:
        name: junit-admin
        path: /tmp/test-results/*.xml
        if-no-files-found: ignore
# -------------------------------------------------------------------
# Stress tests (run after the functional test jobs finish, pass or fail)
#
# continue-on-error: stress tests measure backend behavior under
# sustained mixed-workload (auth + upload + download + list) on a
# 2 CPU test pod inside the namespace's 4 CPU / 8 Gi quota. Error-
# rate variance is high on ARC runners (observed 22-54% across
# otherwise-identical runs) because the bcrypt-bound auth path
# saturates first and the worker count drives RPS up faster than
# the pod can absorb. The test still produces JUnit + run logs so
# regressions are visible, but a single failed run does not block
# the release gate. Real perf regressions are caught by dedicated
# benchmark workflows on Rocky, not by this CI smoke gate.
# See artifact-keeper#991 for v1.1.x auth-path perf investigation.
# -------------------------------------------------------------------
stress-tests:
needs: [deploy, format-tests, repo-tests, promotion-tests, rbac-tests, lifecycle-tests, webhook-tests, search-tests, platform-tests, auth-tests, security-tests, compatibility-tests]
continue-on-error: true
if: |
always() &&
needs.deploy.result == 'success' &&
(inputs.test_suite == 'all' || inputs.test_suite == 'stress')
runs-on: ak-e2e-runners
env:
BASE_URL: ${{ needs.deploy.outputs.backend_url }}
RUN_ID: ${{ needs.deploy.outputs.run_id }}
ADMIN_PASS: TestRunner!2026secure
JUNIT_OUTPUT_DIR: /tmp/test-results
# Per-request HTTP-code logs end up here. The Upload stress-test logs
# step below ships this directory as a workflow artifact so a failed
# stress run can be debugged endpoint-by-endpoint instead of from
# aggregate error counts alone (artifact-keeper-test#138 /
# artifact-keeper#1088).
STRESS_LOG_DIR: /tmp/stress-logs
# Postgres + pod-resource snapshot directory. The Collect postgres
# stats step below runs tests/stress/collect-pg-stats.sh after the
# stress run and ships this directory as the stress-pg-stats
# artifact. Captures the direct measurement that PR #148's
# Fresh-Eyes review (artifact-keeper-test#154) asked for: pg
# connection saturation + kubectl top, so the postgres-CPU
# narrative behind PR #140 is backed by evidence on every run.
PG_STATS_DIR: /tmp/pg-stats
NAMESPACE: ${{ needs.deploy.outputs.namespace }}
# RELEASE_GATE=1 turns common.sh's skip_suite() into a hard fail.
# A silently-skipped test in release-gate context is exactly the
# silent-success class (#870/#871/#888) the gate exists to catch.
RELEASE_GATE: '1'
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
repository: artifact-keeper/artifact-keeper-test
- name: Run stress tests
run: |
mkdir -p "$JUNIT_OUTPUT_DIR"
mkdir -p "$STRESS_LOG_DIR"
chmod +x scripts/run-suite.sh
./scripts/run-suite.sh --suite stress --run-id "${RUN_ID}"
- name: Summarize per-request status codes
  if: always()
  run: |
    # Nothing to summarize when the log directory is missing or empty.
    if [ ! -d "$STRESS_LOG_DIR" ] || [ -z "$(ls -A "$STRESS_LOG_DIR" 2>/dev/null)" ]; then
      echo "No stress logs were emitted at ${STRESS_LOG_DIR}"
      exit 0
    fi

    # Job-summary table: one row per <suite>.log, bucketed on field 5 of
    # each request row (the <http_code> column of the documented format).
    {
      echo "## Stress-test per-request status codes"
      echo ""
      echo "Per-request rows logged to artifact \`stress-request-logs\` at \`${STRESS_LOG_DIR}\`."
      echo "Row format: \`<epoch_ms> <suite> <method> <endpoint> <http_code> <elapsed_ms>\`."
      echo ""
      echo "| Suite | Total | 2xx | 3xx | 4xx | 5xx | Timeouts (000) |"
      echo "|-------|------:|----:|----:|----:|----:|---------------:|"
      for request_log in "$STRESS_LOG_DIR"/*.log; do
        [ -f "$request_log" ] || continue
        name=$(basename "$request_log" .log)
        rows=$(wc -l < "$request_log" | tr -d ' ')
        # Single pass over the log; every counter defaults to 0 via +0.
        buckets=$(awk '
          $5 ~ /^2[0-9][0-9]$/ {n2++}
          $5 ~ /^3[0-9][0-9]$/ {n3++}
          $5 ~ /^4[0-9][0-9]$/ {n4++}
          $5 ~ /^5[0-9][0-9]$/ {n5++}
          $5 == "000" {n0++}
          END {print n2+0, n3+0, n4+0, n5+0, n0+0}
        ' "$request_log")
        read -r n2 n3 n4 n5 n0 <<< "$buckets"
        echo "| ${name} | ${rows} | ${n2} | ${n3} | ${n4} | ${n5} | ${n0} |"
      done
      echo ""
    } >> "$GITHUB_STEP_SUMMARY"

    # Mirror a compact breakdown into the runner log so the job page is
    # scannable without downloading the artifact: per suite, the ten
    # most frequent (code, method, endpoint) triples that were not 2xx.
    echo "Top non-2xx endpoints per suite:"
    for request_log in "$STRESS_LOG_DIR"/*.log; do
      [ -f "$request_log" ] || continue
      name=$(basename "$request_log" .log)
      echo " ${name}:"
      awk '$5 !~ /^2[0-9][0-9]$/ {print $5, $3, $4}' "$request_log" \
        | sort | uniq -c | sort -rn | head -n 10 \
        | sed 's/^/ /'
    done
- name: Collect postgres stats
  # Captures pg_stat_activity, pg_stat_statements, the connection count
  # against max_connections, and kubectl top for the backend + postgres
  # pods. Ordered after the stress workload and before teardown so the
  # capture reflects steady-state load. always() keeps the snapshot
  # shipping on failed runs as well, which is when the data is most
  # useful (artifact-keeper-test#154; rationale in PR #148 Fresh-Eyes
  # review, Finding 2).
  if: always()
  run: bash tests/stress/collect-pg-stats.sh
- name: Upload pg-stats snapshot
  # Publishes /tmp/pg-stats/ even when earlier steps failed; the artifact
  # name carries the run attempt number, and an empty directory is not
  # treated as an error.
  uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
  if: always()
  with:
    path: /tmp/pg-stats/
    name: stress-pg-stats-${{ github.run_attempt }}
    if-no-files-found: ignore
- name: Upload test results
  # Publishes any JUnit XML from /tmp/test-results as `junit-stress`,
  # regardless of how the earlier steps ended; no files is not an error.
  uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
  if: always()
  with:
    path: /tmp/test-results/*.xml
    name: junit-stress
    if-no-files-found: ignore
- name: Upload stress-test logs
  # Ships the per-request HTTP-code rows written by the stress-test
  # workers. Uploading them before namespace teardown lets a 50%-error
  # run be debugged endpoint by endpoint (which path returned which
  # code, and how often) rather than from aggregate counters alone.
  # The investigation of artifact-keeper#1088 (POST /repositories DB
  # contention under load) would have been faster with this artifact.
  uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
  if: always()
  with:
    path: /tmp/stress-logs/
    name: stress-request-logs
    if-no-files-found: ignore
# -------------------------------------------------------------------
# Resilience tests (after stress completes)
#
# Run regardless of stress-tests outcome. Resilience tests target
# crash recovery, network partition, storage failures, etc., which
# are independent of the bcrypt/auth saturation that stress-tests
# measures. Skipping resilience because stress hit its error-rate
# threshold loses signal on a different failure class.
# -------------------------------------------------------------------
resilience-tests:
  needs: [deploy, stress-tests]
  # always() lets this job run even when stress-tests did not succeed; it
  # still requires a successful deploy and a matching test_suite input.
  if: |
    always() &&
    needs.deploy.result == 'success' &&
    (inputs.test_suite == 'all' || inputs.test_suite == 'resilience')
  runs-on: ak-e2e-runners
  strategy:
    # One leg per resilience category; a failing leg does not cancel the
    # others.
    fail-fast: false
    matrix:
      category: [crash, restart, network, storage, data]
  env:
    BASE_URL: ${{ needs.deploy.outputs.backend_url }}
    RUN_ID: ${{ needs.deploy.outputs.run_id }}
    ADMIN_PASS: TestRunner!2026secure
    NAMESPACE: ${{ needs.deploy.outputs.namespace }}
    JUNIT_OUTPUT_DIR: /tmp/test-results
  steps:
    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      with:
        repository: artifact-keeper/artifact-keeper-test
    - name: Install kubectl
      uses: azure/setup-kubectl@829323503d1be3d00ca8346e5391ca0b07a9ab0d # v5.1.0
    - name: Run ${{ matrix.category }} resilience tests
      # Step-level continue-on-error: a non-zero exit is recorded on the
      # step but does not fail the job, so the exits below are advisory.
      continue-on-error: true
      env:
        # Passed through the environment instead of being spliced into
        # the script text.
        CATEGORY: ${{ matrix.category }}
      run: |
        mkdir -p "$JUNIT_OUTPUT_DIR"
        RAN=0
        FAILED=0
        for script in "tests/resilience/${CATEGORY}"/test-*.sh; do
          # An unmatched glob stays literal; skip anything that is not a file.
          [ -f "$script" ] || continue
          RAN=$((RAN + 1))
          echo "=== Running ${script} ==="
          if ! bash "$script"; then
            echo "FAILED: ${script}"
            FAILED=$((FAILED + 1))
          fi
        done
        # Zero matching scripts would otherwise leave the step green
        # without having exercised anything; surface it explicitly.
        if [ "$RAN" -eq 0 ]; then
          echo "::warning::No resilience test scripts found under tests/resilience/${CATEGORY}/ (nothing was run)"
          exit 1
        fi
        if [ "$FAILED" -gt 0 ]; then
          echo "::warning::${FAILED} resilience test(s) failed in ${CATEGORY} (non-blocking on ARC runners)"
          exit 1
        fi
    - name: Upload test results
      if: always()
      uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
      with:
        name: junit-resilience-${{ matrix.category }}
        path: /tmp/test-results/*.xml
        if-no-files-found: ignore
# -------------------------------------------------------------------
# Mesh tests (run after resilience-tests succeeds or is skipped)
# -------------------------------------------------------------------
mesh-tests:
  needs: [deploy, resilience-tests]
  # always() is needed so a skipped resilience-tests job does not skip
  # this one transitively. Because always() also bypasses the implicit
  # "all needs succeeded" check, the deploy result is tested explicitly:
  # RUN_ID below comes from deploy's outputs, and resilience-tests is
  # 'skipped' (which this condition accepts) whenever deploy fails.
  if: |
    always() &&
    needs.deploy.result == 'success' &&
    (inputs.test_suite == 'all' || inputs.test_suite == 'mesh') &&
    (needs.resilience-tests.result == 'success' || needs.resilience-tests.result == 'skipped')
  runs-on: ak-e2e-runners
  env:
    RUN_ID: ${{ needs.deploy.outputs.run_id }}
    ADMIN_PASS: TestRunner!2026secure
    JUNIT_OUTPUT_DIR: /tmp/test-results
    # RELEASE_GATE=1 turns common.sh's skip_suite() into a hard fail.
    # A silently-skipped test in release-gate context is exactly the
    # silent-success class (#870/#871/#888) the gate exists to catch.
    RELEASE_GATE: '1'
  steps:
    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      with:
        repository: artifact-keeper/artifact-keeper-test
    - name: Install kubectl
      uses: azure/setup-kubectl@829323503d1be3d00ca8346e5391ca0b07a9ab0d # v5.1.0
    - name: Install Helm
      uses: azure/setup-helm@dda3372f752e03dde6b3237bc9431cdc2f7a02a2 # v5.0.0
    - name: Deploy mesh topology
      id: mesh-deploy
      env:
        # Caller-supplied tags are handed to the shell as environment
        # variables rather than expanded into the script text, so their
        # content cannot be interpreted as shell syntax.
        INPUT_BACKEND_TAG: ${{ inputs.backend_tag }}
        INPUT_WEB_TAG: ${{ inputs.web_tag }}
      run: |
        chmod +x scripts/create-test-namespace.sh
        # Deploy 4 mesh instances, each under its own run id suffix.
        for i in main peer1 peer2 peer3; do
          ./scripts/create-test-namespace.sh \
            --run-id "${RUN_ID}-mesh-${i}" \
            --backend-tag "${INPUT_BACKEND_TAG}" \
            --web-tag "${INPUT_WEB_TAG}" \
            --values helm/values-test-mesh.yaml
        done
        # Publish the in-cluster backend URL of every instance as a step
        # output for the steps below.
        BASE_NS="test-${RUN_ID}-mesh"
        {
          echo "MAIN_URL=http://artifact-keeper-backend.${BASE_NS}-main.svc.cluster.local:8080"
          echo "PEER1_URL=http://artifact-keeper-backend.${BASE_NS}-peer1.svc.cluster.local:8080"
          echo "PEER2_URL=http://artifact-keeper-backend.${BASE_NS}-peer2.svc.cluster.local:8080"
          echo "PEER3_URL=http://artifact-keeper-backend.${BASE_NS}-peer3.svc.cluster.local:8080"
        } >> "$GITHUB_OUTPUT"
    - name: Wait for mesh instances ready
      run: |
        chmod +x tests/lib/wait-for-ready.sh
        for url in "${{ steps.mesh-deploy.outputs.MAIN_URL }}" \
          "${{ steps.mesh-deploy.outputs.PEER1_URL }}" \
          "${{ steps.mesh-deploy.outputs.PEER2_URL }}" \
          "${{ steps.mesh-deploy.outputs.PEER3_URL }}"; do
          ./tests/lib/wait-for-ready.sh "$url" 300
        done
    - name: Run mesh tests
      env:
        MAIN_URL: ${{ steps.mesh-deploy.outputs.MAIN_URL }}
        PEER1_URL: ${{ steps.mesh-deploy.outputs.PEER1_URL }}
        PEER2_URL: ${{ steps.mesh-deploy.outputs.PEER2_URL }}
        PEER3_URL: ${{ steps.mesh-deploy.outputs.PEER3_URL }}
        BASE_URL: ${{ steps.mesh-deploy.outputs.MAIN_URL }}
      run: |
        mkdir -p "$JUNIT_OUTPUT_DIR"
        chmod +x scripts/run-suite.sh
        ./scripts/run-suite.sh --suite mesh --run-id "${RUN_ID}"
    - name: Teardown mesh namespaces
      if: always() && inputs.skip_teardown != true
      run: |
        chmod +x scripts/teardown-test-namespace.sh
        # Best effort: one namespace failing to tear down must not stop
        # the remaining ones from being attempted.
        for i in main peer1 peer2 peer3; do
          ./scripts/teardown-test-namespace.sh --run-id "${RUN_ID}-mesh-${i}" || true
        done
    - name: Upload test results
      if: always()
      uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
      with:
        name: junit-mesh
        path: /tmp/test-results/*.xml
        if-no-files-found: ignore
# -------------------------------------------------------------------
# Collect results and publish summary
# -------------------------------------------------------------------
collect-results:
  # Fans in every gate and suite job so their results can be summarized
  # and rolled up into the single release-gate verdict below.
  needs:
    - version-set-integrity
    - clean-install-smoke
    - clean-install-smoke-with-deps
    - chart-upgrade-smoke
    - deploy
    - real-flow-smoke
    - scan-completion-gate
    - sbom-correctness-gate
    - pinned-cve-gate
    - format-tests
    - security-tests
    - compatibility-tests
    - repo-tests
    - promotion-tests
    - rbac-tests
    - lifecycle-tests
    - webhook-tests
    - search-tests
    - platform-tests
    - auth-tests
    - admin-tests
    - stress-tests
    - resilience-tests
    - mesh-tests
  if: always()
  runs-on: ak-e2e-runners
  steps:
    - name: Download all test artifacts
      uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
      with:
        pattern: junit-*
        path: /tmp/all-results
        merge-multiple: true
    - name: Publish test summary
      if: always()
      env:
        # Caller-supplied values reach the shell as environment variables
        # rather than being expanded into the script text.
        BACKEND_TAG: ${{ inputs.backend_tag }}
        WEB_TAG: ${{ inputs.web_tag }}
        DEPLOY_RUN_ID: ${{ needs.deploy.outputs.run_id }}
      run: |
        # Emit one markdown table row: <job name> <result>.
        row() { echo "| $1 | $2 |"; }
        {
          echo "## Release Gate Results"
          echo ""
          echo "| Suite | Status |"
          echo "|-------|--------|"
          row version-set-integrity "${{ needs.version-set-integrity.result }}"
          row clean-install-smoke "${{ needs.clean-install-smoke.result }}"
          row clean-install-smoke-with-deps "${{ needs.clean-install-smoke-with-deps.result }}"
          row chart-upgrade-smoke "${{ needs.chart-upgrade-smoke.result }}"
          row real-flow-smoke "${{ needs.real-flow-smoke.result }}"
          row scan-completion-gate "${{ needs.scan-completion-gate.result }}"
          row sbom-correctness-gate "${{ needs.sbom-correctness-gate.result }}"
          row pinned-cve-gate "${{ needs.pinned-cve-gate.result }}"
          row format-tests "${{ needs.format-tests.result }}"
          row repo-tests "${{ needs.repo-tests.result }}"
          row promotion-tests "${{ needs.promotion-tests.result }}"
          row rbac-tests "${{ needs.rbac-tests.result }}"
          row lifecycle-tests "${{ needs.lifecycle-tests.result }}"
          row webhook-tests "${{ needs.webhook-tests.result }}"
          row search-tests "${{ needs.search-tests.result }}"
          row platform-tests "${{ needs.platform-tests.result }}"
          row auth-tests "${{ needs.auth-tests.result }}"
          row admin-tests "${{ needs.admin-tests.result }}"
          row security-tests "${{ needs.security-tests.result }}"
          row compatibility-tests "${{ needs.compatibility-tests.result }}"
          row stress-tests "${{ needs.stress-tests.result }}"
          row resilience-tests "${{ needs.resilience-tests.result }}"
          row mesh-tests "${{ needs.mesh-tests.result }}"
          echo ""
          echo "**Backend tag:** \`${BACKEND_TAG}\`"
          echo "**Web tag:** \`${WEB_TAG}\`"
          echo "**Run ID:** \`${DEPLOY_RUN_ID}\`"
        } >> "$GITHUB_STEP_SUMMARY"
    - name: Upload combined results
      if: always()
      uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
      with:
        name: release-gate-results
        path: /tmp/all-results/
        if-no-files-found: ignore
    - name: Gate check - fail if any required suite did not succeed
      # stress-tests and security-tests are intentionally excluded from
      # this rollup. Both have continue-on-error: true (see the comments
      # above each job) so their outcome can be 'failure' on
      # known-flaky / known-infra-debt scenarios without blocking the
      # release gate:
      # - stress-tests: bcrypt-bound auth saturation under sustained
      #   load on shared ARC runners (artifact-keeper#991).
      # - security-tests: Grype DB not pre-seeded in v1.1.x backend
      #   image; quality gate is last-scanner-wins so Trivy covers
      #   the policy gate (artifact-keeper#1001).
      #
      # The three new silent-success gates (scan-completion-gate,
      # sbom-correctness-gate, pinned-cve-gate) use the STRICTER
      # `result != 'success'` predicate. The looser
      # `result == 'failure' || result == 'cancelled'` form lets a
      # 'skipped' outcome (matrix-leg eval failure, future conditional
      # `if:` gate, or a transitive `needs:` skip when `deploy` is
      # skipped) through as green. For the silent-success gates that
      # is precisely the regression class we are guarding against, so
      # we close it explicitly.
      #
      # clean-install-smoke-with-deps is the one legitimate 'skipped'
      # case: it is opt-in (gated on the `run_smoke_with_deps` workflow
      # input, default false; see #53). When the input is false the
      # job's result is 'skipped' by design. We continue to use the
      # looser predicate for it so the default-off path stays green.
      # When the input is true, a failure or cancellation DOES block
      # the release.
      #
      # The wildcard form contains(needs.*.result, 'failure') still
      # observes failures because needs.<job>.result reflects the
      # job's outcome, not its continue-on-error-adjusted conclusion.
      # So we list the required suites explicitly here. If you add a
      # new required suite, add it to this list. Soft-failing suites
      # stay off the list.
      if: >-
        needs.version-set-integrity.result == 'failure' || needs.version-set-integrity.result == 'cancelled' ||
        needs.clean-install-smoke.result == 'failure' || needs.clean-install-smoke.result == 'cancelled' ||
        needs.clean-install-smoke-with-deps.result == 'failure' || needs.clean-install-smoke-with-deps.result == 'cancelled' ||
        needs.chart-upgrade-smoke.result == 'failure' || needs.chart-upgrade-smoke.result == 'cancelled' ||
        needs.real-flow-smoke.result == 'failure' || needs.real-flow-smoke.result == 'cancelled' ||
        needs.scan-completion-gate.result != 'success' ||
        needs.sbom-correctness-gate.result != 'success' ||
        needs.pinned-cve-gate.result != 'success' ||
        needs.deploy.result == 'failure' || needs.deploy.result == 'cancelled' ||
        needs.format-tests.result == 'failure' || needs.format-tests.result == 'cancelled' ||
        needs.compatibility-tests.result == 'failure' || needs.compatibility-tests.result == 'cancelled' ||
        needs.repo-tests.result == 'failure' || needs.repo-tests.result == 'cancelled' ||
        needs.promotion-tests.result == 'failure' || needs.promotion-tests.result == 'cancelled' ||
        needs.rbac-tests.result == 'failure' || needs.rbac-tests.result == 'cancelled' ||
        needs.lifecycle-tests.result == 'failure' || needs.lifecycle-tests.result == 'cancelled' ||
        needs.webhook-tests.result == 'failure' || needs.webhook-tests.result == 'cancelled' ||
        needs.search-tests.result == 'failure' || needs.search-tests.result == 'cancelled' ||
        needs.platform-tests.result == 'failure' || needs.platform-tests.result == 'cancelled' ||
        needs.auth-tests.result == 'failure' || needs.auth-tests.result == 'cancelled' ||
        needs.admin-tests.result == 'failure' || needs.admin-tests.result == 'cancelled' ||
        needs.resilience-tests.result == 'failure' || needs.resilience-tests.result == 'cancelled' ||
        needs.mesh-tests.result == 'failure' || needs.mesh-tests.result == 'cancelled'
      run: |
        echo "::error::Release gate FAILED - one or more required test suites did not pass"
        echo "Review the workflow summary above for details"
        echo "Note: stress-tests and security-tests are non-blocking; their outcomes are shown in the summary but do not gate the release"
        exit 1
# -------------------------------------------------------------------
# Teardown
# -------------------------------------------------------------------
teardown:
  runs-on: ak-e2e-runners
  needs: [deploy, collect-results]
  # Runs after collect-results whatever the upstream outcome, unless the
  # skip_teardown input is true.
  if: always() && inputs.skip_teardown != true
  steps:
    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      with:
        repository: artifact-keeper/artifact-keeper-test
    - uses: azure/setup-kubectl@829323503d1be3d00ca8346e5391ca0b07a9ab0d # v5.1.0
      name: Install kubectl
    - uses: azure/setup-helm@dda3372f752e03dde6b3237bc9431cdc2f7a02a2 # v5.0.0
      name: Install Helm
    - name: Teardown test namespace
      # Calls the teardown script for the run id that the deploy job
      # published as an output.
      run: |
        chmod +x scripts/teardown-test-namespace.sh
        ./scripts/teardown-test-namespace.sh --run-id "${RUN_ID}"
      env:
        RUN_ID: ${{ needs.deploy.outputs.run_id }}
    - name: Upload pod logs
      # Publishes whatever is under /tmp/test-logs/ even if the teardown
      # step failed; an empty directory is not treated as an error.
      uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
      if: always()
      with:
        path: /tmp/test-logs/
        name: pod-logs
        if-no-files-found: ignore