Skip to content

AKS Test

AKS Test #629

Workflow file for this run

# Copyright 2025-2026 Defense Unicorns
# SPDX-License-Identifier: AGPL-3.0-or-later OR LicenseRef-Defense-Unicorns-Commercial

# Weekly (and on-demand, via workflow_call) AKS test run.
name: AKS Test

on:
  # Lets other workflows invoke this one; no inputs are declared.
  workflow_call: {}
  schedule:
    # Sundays at 00:00 UTC, which is Saturday 5pm MST / 6pm MDT.
    - cron: '0 0 * * 0'

# Workflow-level token permissions.
permissions:
  contents: read
  packages: read
  id-token: write
jobs:
  test-aks-install:
    name: test-aks
    runs-on: ubuntu-latest
    strategy:
      # Let every flavor run to completion even if another flavor fails.
      fail-fast: false
      matrix:
        flavor:
          - upstream
          - registry1
          - unicorn
    permissions:
      contents: read
      packages: read
      id-token: write
      # Allows writing to pull requests (needed for renovate-readiness)
      pull-requests: write
    # Job-wide environment shared by every step.
    env:
      SHA: ${{ github.sha }}
      UDS_REGION: centralus
      UDS_RESOURCE_GROUP_NAME: ${{ secrets.AZURE_RESOURCE_GROUP }}
      UDS_STORAGE_ACCOUNT_NAME: ${{ secrets.AZURE_STORAGE_ACCOUNT_NAME }}
      UDS_CONTAINER_NAME: ${{ secrets.AZURE_STORAGE_CONTAINER_NAME }}
    steps:
- name: Set ENV
  # Publishes cluster/state naming and the ARM_* provider settings to all
  # later steps by appending KEY=value lines to the GITHUB_ENV file.
  #
  # Expression values (matrix flavor, secrets) are passed in through `env`
  # instead of being pasted into the script text, so a value containing
  # quotes or other shell metacharacters cannot alter the script itself.
  env:
    FLAVOR: ${{ matrix.flavor }}
    AZURE_SUBSCRIPTION_ID: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
    AZURE_CLIENT_ID: ${{ secrets.AZURE_CLIENT_ID }}
    AZURE_TENANT_ID: ${{ secrets.AZURE_TENANT_ID }}
  run: |
    # SHA and UDS_REGION come from the job-level env.
    short_sha="${SHA:0:7}"
    # Computed once; used for both UDS_CLUSTER_NAME and TF_VAR_cluster_name.
    cluster_name="uds-ci-${FLAVOR}-${short_sha}"
    {
      echo "UDS_CLUSTER_NAME=${cluster_name}"
      echo "UDS_STATE_KEY=uds-core/${short_sha}-nightly-aks-core-${FLAVOR}.tfstate"
      echo "TF_VAR_location=${UDS_REGION}"
      echo "TF_VAR_cluster_name=${cluster_name}"
      echo "TF_VAR_resource_group_name=uds-ci-${FLAVOR}"
      echo "ARM_SUBSCRIPTION_ID=${AZURE_SUBSCRIPTION_ID}"
      echo "ARM_CLIENT_ID=${AZURE_CLIENT_ID}"
      echo "ARM_TENANT_ID=${AZURE_TENANT_ID}"
      echo "ARM_USE_OIDC=true"
      echo "ARM_STORAGE_USE_AZUREAD=true"
    } >> "$GITHUB_ENV"
- name: Checkout repository
  # Pinned to a full commit SHA; the trailing comment records the release tag.
  uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
- name: Check renovate readiness
  # Only call for Renovate PRs
  if: ${{ startsWith(github.event.pull_request.head.ref, 'renovate/') }}
  uses: ./.github/actions/renovate-readiness
  with:
    github_token: ${{ secrets.GITHUB_TOKEN }}
- name: Azure login
  # Signs in with the client, tenant and subscription IDs held in repository
  # secrets; no client secret is passed to the action.
  uses: azure/login@532459ea530d8321f2fb9bb10d1e0bcf23869a43 # v3
  with:
    subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
    tenant-id: ${{ secrets.AZURE_TENANT_ID }}
    client-id: ${{ secrets.AZURE_CLIENT_ID }}
- name: Environment setup
  uses: ./.github/actions/setup
  with:
    ghToken: ${{ secrets.GITHUB_TOKEN }}
    # Credentials are forwarded only for the flavor they belong to; every
    # other flavor receives an empty string.
    registry1Username: ${{ matrix.flavor == 'registry1' && secrets.IRON_BANK_ROBOT_USERNAME || '' }}
    registry1Password: ${{ matrix.flavor == 'registry1' && secrets.IRON_BANK_ROBOT_PASSWORD || '' }}
    rapidfortUsername: ${{ matrix.flavor == 'unicorn' && secrets.RAPIDFORT_USERNAME || '' }}
    rapidfortPassword: ${{ matrix.flavor == 'unicorn' && secrets.RAPIDFORT_PASSWORD || '' }}
- name: Setup Tofu
  # Installs the pinned OpenTofu CLI used by the later `tofu output` call.
  uses: opentofu/setup-opentofu@fc711fa910b93cba0f3fbecaafc9f42fd0c411cb # v2.0.0
  with:
    tofu_wrapper: false
    # The annotation below must stay directly above tofu_version.
    # renovate: datasource=github-tags depName=opentofu/opentofu extractVersion=^v?(?<version>.*)$
    tofu_version: 1.11.6
- name: Create UDS Core Package
  # Builds the standard package for the current matrix flavor (amd64, SBOM skipped).
  run: |
    ZARF_ARCHITECTURE=amd64 uds run -f tasks/create.yaml standard-package \
      --no-progress \
      --with create_options="--skip-sbom" \
      --set FLAVOR=${{ matrix.flavor }}
- name: Create Core Bundle
  # Bundles from the definition under .github/bundles/aks.
  run: |
    uds create .github/bundles/aks --confirm
- name: Create IAC
  # Runs the apply-tofu task for the AKS-on-Azure configuration.
  run: |
    uds run -f tasks/iac.yaml apply-tofu --no-progress \
      --set K8S_DISTRO=aks \
      --set CLOUD=azure
- name: Get Resource Group Name
  # Exports the `resource_group_name` tofu output as AKS_RESOURCE_GROUP for
  # later steps (log-ingestion wait, log collection).
  working-directory: .github/test-infra/azure/aks
  run: |
    # Capture the output in its own assignment. Inside `echo "...$(cmd)"` a
    # failing `tofu output` is masked by echo's zero exit status and an empty
    # value gets exported; as a standalone assignment the failure propagates
    # and (with the runner's default `bash -e`) fails this step visibly.
    resource_group="$(tofu output -raw resource_group_name)"
    echo "AKS_RESOURCE_GROUP=${resource_group}" >> "$GITHUB_ENV"
- name: Configure Cluster DNS
  # Runs the aks-coredns-setup task from tasks/utils.yaml.
  run: |
    uds run -f tasks/utils.yaml aks-coredns-setup --no-progress
- name: Wait for cluster ready
  # Runs the cluster-ready task from tasks/iac.yaml.
  run: |
    uds run -f tasks/iac.yaml cluster-ready --no-progress
- name: Deploy Core Bundle
  timeout-minutes: 40
  env:
    UDS_CONFIG: .github/bundles/aks/uds-config.yaml
  # One initial attempt plus up to two retries (30s apart) to ride out
  # transient AKS apiserver restart windows that cause webhook proxy errors.
  # The outcome is recorded in CORE_ATTEMPTS: the attempt number that
  # succeeded, or "fail" once all three attempts are exhausted.
  run: |
    attempt=1
    while [ "$attempt" -le 3 ]; do
      # Retries (never the first attempt) add --force-conflicts to work around
      # zarf issue 4771 (field-manager conflicts on partial re-apply).
      retry_flags=()
      if [ "$attempt" -gt 1 ]; then
        retry_flags=(--force-conflicts)
      fi

      if uds deploy .github/bundles/aks/uds-bundle-uds-core-aks-nightly-*.tar.zst --confirm "${retry_flags[@]}"; then
        echo "CORE_ATTEMPTS=$attempt" >> "$GITHUB_ENV"
        exit 0
      fi

      # No point sleeping after the final attempt.
      if [ "$attempt" -lt 3 ]; then
        echo "Deploy failed on attempt $attempt; sleeping 30s before retry..."
        sleep 30
      fi
      attempt=$((attempt + 1))
    done

    echo "CORE_ATTEMPTS=fail" >> "$GITHUB_ENV"
    exit 1
- name: Test UDS Core
  # One initial attempt plus up to two retries (30s apart) to ride out
  # transient AKS apiserver webhook disruptions during the test-app deploy,
  # playwright, or vitest phases. The outcome is recorded in TEST_ATTEMPTS:
  # the attempt number that passed, or "fail" after three failed attempts.
  run: |
    for ((attempt = 1; attempt <= 3; attempt++)); do
      if uds run -f tasks/test.yaml uds-core-non-k3d --set EXCLUDED_PACKAGES="metrics-server"; then
        echo "TEST_ATTEMPTS=$attempt" >> "$GITHUB_ENV"
        exit 0
      fi

      # The last attempt falls straight through to the failure path below.
      if [ "$attempt" -eq 3 ]; then
        break
      fi
      echo "Tests failed on attempt $attempt; sleeping 30s before retry..."
      sleep 30
    done

    echo "TEST_ATTEMPTS=fail" >> "$GITHUB_ENV"
    exit 1
- name: Retry summary
  # Always appends a small Markdown table to the step summary showing how
  # many attempts the deploy and test steps needed.
  if: ${{ always() }}
  run: |
    # Maps a *_ATTEMPTS value to a table cell:
    #   empty -> step never ran, 1 -> passed first time,
    #   2 or 3 -> passed after retrying, anything else -> all attempts failed.
    describe_attempts() {
      local attempts="$1"
      if [ -z "$attempts" ]; then
        echo "❔ not run"
      elif [ "$attempts" = "1" ]; then
        echo "✅ $attempts (first try)"
      elif [ "$attempts" = "2" ] || [ "$attempts" = "3" ]; then
        echo "⚠️ $attempts (retry needed)"
      else
        echo "❌ failed after 3 attempts"
      fi
    }

    core_cell=$(describe_attempts "${CORE_ATTEMPTS:-}")
    test_cell=$(describe_attempts "${TEST_ATTEMPTS:-}")

    printf '%s\n' \
      "## Core Retry Summary" \
      "" \
      "| Step | Attempts |" \
      "|------|----------|" \
      "| Core deploy | ${core_cell} |" \
      "| Test UDS Core | ${test_cell} |" >> "$GITHUB_STEP_SUMMARY"
- name: Debug Output
  # Runs regardless of whether earlier steps succeeded.
  if: ${{ always() }}
  uses: ./.github/actions/debug-output
- name: Azure login (refresh token for log collection)
  # Second sign-in with the same identifiers as the initial Azure login, so
  # the log-collection steps that follow start from a fresh session.
  if: ${{ always() }}
  uses: azure/login@532459ea530d8321f2fb9bb10d1e0bcf23869a43 # v3
  with:
    subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
    tenant-id: ${{ secrets.AZURE_TENANT_ID }}
    client-id: ${{ secrets.AZURE_CLIENT_ID }}
- name: Install log-analytics extension
  if: ${{ always() }}
  # Best effort: an install failure is downgraded to a warning annotation so
  # the remaining log-collection and cleanup steps still run normally.
  run: |
    az extension add --name log-analytics --allow-preview true --yes \
      || echo "::warning::Failed to install Azure CLI log-analytics extension; control plane log collection may be incomplete."
- name: Wait for Log Analytics ingestion
  if: ${{ always() }}
  # Polls the workspace every 30s (for at most 10 minutes) until the newest
  # AzureDiagnostics record is at least as recent as the moment this step
  # started. Never fails the job: every give-up path only emits a warning.
  run: |
    started_at=$(date -u +%s)
    give_up_at=$((started_at + 600)) # 10 min cap

    # First workspace in the resource group; lookup errors are tolerated and
    # simply leave the ID empty.
    workspace=$(az monitor log-analytics workspace list \
      --resource-group "$AKS_RESOURCE_GROUP" \
      --query "[0].customerId" -o tsv 2>/dev/null) || true
    if [ -z "$workspace" ]; then
      echo "::warning::No Log Analytics workspace found; skipping ingestion wait."
      exit 0
    fi

    until [ "$(date +%s)" -ge "$give_up_at" ]; do
      # awk keeps the last field of the first non-empty output line.
      newest=$(az monitor log-analytics query \
        --workspace "$workspace" \
        --analytics-query "AzureDiagnostics | summarize max(TimeGenerated)" \
        -o tsv 2>/dev/null | awk 'NF{print $NF; exit}') || true

      if [ -z "$newest" ]; then
        echo "No ingestion timestamp yet; waiting..."
      else
        # An unparseable timestamp counts as epoch 0, i.e. "not caught up".
        newest_epoch=$(date -u -d "$newest" +%s 2>/dev/null || echo 0)
        if [ "$newest_epoch" -ge "$started_at" ]; then
          echo "Ingestion caught up: latest=$newest"
          exit 0
        fi
        echo "Latest ingested: $newest (target: $(date -u -d "@$started_at" +%FT%TZ)); waiting..."
      fi

      sleep 30
    done

    echo "::warning::Log Analytics ingestion did not catch up within 10m; log collection may be incomplete."
- name: Save logs
  # Runs on every outcome; AKS_RESOURCE_GROUP is whatever the earlier
  # "Get Resource Group Name" step exported (empty if that step never ran).
  if: ${{ always() }}
  uses: ./.github/actions/save-logs
  with:
    distro: aks
    suffix: "-aks-${{ matrix.flavor }}"
    resource_group: ${{ env.AKS_RESOURCE_GROUP }}
- name: Remove UDS Core
  if: always()
  # Best-effort cleanup: tries the removal up to three times and never fails
  # the job (the script always exits 0 and continue-on-error covers a
  # timeout). When every attempt fails, a warning annotation is emitted so
  # the failed removal shows up in the run instead of passing silently.
  continue-on-error: true
  timeout-minutes: 10
  run: |
    removed=false
    for attempt in 1 2 3; do
      if uds remove .github/bundles/aks/uds-bundle-uds-core-aks-nightly-*.tar.zst --confirm; then
        removed=true
        break
      fi
      echo "Remove failed on attempt $attempt."
    done

    if [ "$removed" != "true" ]; then
      echo "::warning::UDS Core removal did not succeed after 3 attempts."
    fi
- name: Remove IAC
  # Always runs the destroy-iac task so the infrastructure is torn down even
  # after a failed run.
  if: ${{ always() }}
  run: |
    uds run -f tasks/iac.yaml destroy-iac --no-progress \
      --set K8S_DISTRO=aks \
      --set CLOUD=azure
- name: Send Slack notification
  # Alerts only when a scheduled run has failed; other triggers stay quiet.
  if: ${{ github.event_name == 'schedule' && failure() }}
  uses: ./.github/actions/slack-alert
  with:
    channel: ${{ vars.SLACK_ALERT_CHANNEL }}
    webhook-url: ${{ secrets.SLACK_WEBHOOK_URL }}