From e4aedbf1071b9791be355af0b0c972a63e7092bf Mon Sep 17 00:00:00 2001 From: Scott J Dickerson Date: Mon, 9 Mar 2026 11:31:17 -0400 Subject: [PATCH 01/42] ci(hot-cluster): Add POC hot-cluster CI workflow Establish a basic GitHub action based CI workflow for running e2e tests on a hot-cluster. POC workflows include setting up IBM Cloud ROKS clusters. Full documentation is in the `POC_HOT_CLUSTER_CI.md` and referenced files. The hot-cluster can be any OpenShift cluster (FIPS and non-FIPS) and is based on the GitHub Actions runner controller (ARC). It is installed via helm, and only requires network access to pull from GitHub within the cluster itself. The POC can even be run on a local CRC / OpenShift local development cluster without any special networking configuration. The self-hosted runners are ephemeral and are deleted after the CI job is complete. Full setup documentation is in the `ci-scripts/README.md` file. Signed-off-by: Scott J Dickerson --- .dockerignore | 11 +- .../workflows/ibmc-cluster-auto-teardown.yml | 169 ++++++++- .github/workflows/ibmc-cluster-setup.yml | 64 ++-- .github/workflows/ibmc-cluster-teardown.yml | 86 +++-- .github/workflows/poc-e2e-ci-test.yml | 75 ++++ .github/workflows/poc-e2e-ci-test2.yml | 323 ++++++++++++++++++ .gitignore | 3 + .vscode/settings.json | 3 +- Dockerfile | 6 +- POC_HOT_CLUSTER_CI.md | 142 ++++++++ ci-scripts/README.md | 170 ++++----- ci-scripts/arc/README.md | 7 +- ci-scripts/arc/arc-dind-post-render.sh | 40 +++ ci-scripts/arc/arc-helm-helpers.sh | 50 +++ ci-scripts/arc/arc-openshift-scc.yaml | 16 +- ci-scripts/arc/arc-runner-rbac.yaml | 63 ++-- ci-scripts/arc/arc-runner-scale-set.pod.yaml | 34 +- ci-scripts/arc/install-arc-controller.sh | 25 +- ci-scripts/arc/install-runner-scale-set.sh | 110 +++--- ci-scripts/arc/runner-image/Dockerfile | 111 ++++++ ci-scripts/arc/setup-dind-mirror.sh | 58 ++++ ci-scripts/arc/setup-runner-image.sh | 154 +++++++++ ci-scripts/check-cluster-health.sh | 6 +- 
ci-scripts/check-roks-cluster-state.sh | 16 +- .../examples/arc-0.14-extra-values.yaml | 29 ++ ci-scripts/install-hco.sh | 27 +- ci-scripts/nginx-9080.conf | 35 ++ ci-scripts/nginx-9443.conf | 39 +++ ci-scripts/resolve-console-image.sh | 39 +++ ci-scripts/start-console.sh | 149 ++++++++ ci-scripts/start-plugin-container.sh | 99 ++++++ ci-scripts/test-cleanup.sh | 26 ++ cypress/tests/gating/poc-check-tab-yaml.cy.ts | 232 +++++++++++++ cypress/tests/poc-gating.cy.ts | 3 + start-console.sh | 3 +- test-cleanup.sh | 0 test-setup-downstream.sh | 0 test-setup.sh | 0 38 files changed, 2091 insertions(+), 332 deletions(-) create mode 100644 .github/workflows/poc-e2e-ci-test.yml create mode 100644 .github/workflows/poc-e2e-ci-test2.yml create mode 100644 POC_HOT_CLUSTER_CI.md create mode 100755 ci-scripts/arc/arc-dind-post-render.sh create mode 100644 ci-scripts/arc/runner-image/Dockerfile create mode 100755 ci-scripts/arc/setup-dind-mirror.sh create mode 100755 ci-scripts/arc/setup-runner-image.sh create mode 100644 ci-scripts/examples/arc-0.14-extra-values.yaml create mode 100644 ci-scripts/nginx-9080.conf create mode 100644 ci-scripts/nginx-9443.conf create mode 100755 ci-scripts/resolve-console-image.sh create mode 100755 ci-scripts/start-console.sh create mode 100755 ci-scripts/start-plugin-container.sh create mode 100755 ci-scripts/test-cleanup.sh create mode 100644 cypress/tests/gating/poc-check-tab-yaml.cy.ts create mode 100644 cypress/tests/poc-gating.cy.ts mode change 100644 => 100755 test-cleanup.sh mode change 100644 => 100755 test-setup-downstream.sh mode change 100644 => 100755 test-setup.sh diff --git a/.dockerignore b/.dockerignore index a581e6240c..e53732495b 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,4 +1,11 @@ -**/node_modules -dist/ +node_modules/ +.cursor/ +.github/ +.husky/ +.vscode/ + +coverage/ cypress/ +dist/ + *.env diff --git a/.github/workflows/ibmc-cluster-auto-teardown.yml b/.github/workflows/ibmc-cluster-auto-teardown.yml index 
7fdbacb9e2..1a40e39acd 100644 --- a/.github/workflows/ibmc-cluster-auto-teardown.yml +++ b/.github/workflows/ibmc-cluster-auto-teardown.yml @@ -2,24 +2,173 @@ name: IBM Cloud Hot Cluster Auto-Teardown on: schedule: - # Runs daily at 02:00 UTC (safety net for forgotten clusters) - - cron: '0 2 * * *' + - cron: '*/30 * * * *' workflow_dispatch: inputs: cluster_name: - description: 'Cluster name to tear down' + description: 'Cluster name to check' required: true default: 'kubevirt-plugin-ci' type: string + idle_threshold_minutes: + description: 'Idle threshold in minutes before teardown' + required: true + default: '120' + type: string permissions: contents: read + actions: write + +env: + CLUSTER_NAME: ${{ inputs.cluster_name || 'kubevirt-plugin-ci' }} + IDLE_THRESHOLD_MINUTES: ${{ inputs.idle_threshold_minutes || '120' }} jobs: - auto-teardown: - name: Auto-Teardown Hot Cluster - uses: ./.github/workflows/ibmc-cluster-teardown.yml - with: - cluster_name: ${{ inputs.cluster_name || 'kubevirt-plugin-ci' }} - secrets: - IC_KEY: ${{ secrets.IC_KEY }} + check-and-teardown: + name: Check Idle & Teardown + runs-on: ubuntu-latest + timeout-minutes: 90 + steps: + - name: Check CI jobs + id: check_ci + uses: actions/github-script@v8 + env: + INCLUDE_WORKFLOWS: '[".github/workflows/poc-e2e-ci-test.yml", ".github/workflows/poc-e2e-ci-test2.yml"]' + with: + script: | + const INCLUDE_WORKFLOWS = JSON.parse(process.env.INCLUDE_WORKFLOWS); + + let inProgress = 0; + let queued = 0; + let completed = 0; + let lastRunTime = null; + + try { + const results = await Promise.all( + INCLUDE_WORKFLOWS.map(workflow => + Promise.all([ + github.rest.actions.listWorkflowRuns({ ...context.repo, workflow_id: workflow, status: 'in_progress', per_page: 1 }), + github.rest.actions.listWorkflowRuns({ ...context.repo, workflow_id: workflow, status: 'queued', per_page: 1 }), + github.rest.actions.listWorkflowRuns({ ...context.repo, workflow_id: workflow, status: 'completed', per_page: 1 }), + ]) + ) + 
); + + for (const [ipRes, qRes, completedRes] of results) { + inProgress += ipRes.data.total_count; + queued += qRes.data.total_count; + completed += completedRes.data.total_count; + + const lastRun = completedRes.data.workflow_runs[0]; + if (lastRun) { + const t = new Date(lastRun.updated_at); + if (!lastRunTime || t > lastRunTime) lastRunTime = t; + } + } + } catch (err) { + core.setFailed(`Failed to query workflow runs: ${err.message}`); + return; + } + + const minutesAgo = lastRunTime ? Math.floor((new Date().getTime() - lastRunTime.getTime()) / 60000) : 'N/A'; + core.summary.addList([ + `In-progress CI runs: ${inProgress}`, + `Queued CI runs: ${queued}`, + `Completed CI runs: ${completed}`, + `Last run time: ${lastRunTime ? lastRunTime.toISOString() : 'N/A'} (${minutesAgo} minutes ago)`, + ]); + await core.summary.write(); + core.setOutput('active_jobs', (inProgress > 0 || queued > 0) ? 'true' : 'false'); + core.setOutput('last_run_time', lastRunTime ? lastRunTime.toISOString() : ''); + + - name: Setup IBM Cloud CLI + uses: IBM/actions-ibmcloud-cli@v1 + if: steps.check_ci.outputs.active_jobs == 'false' + with: + api_key: ${{ secrets.IBM_CLOUD_API_KEY }} + plugins: kubernetes-service + + - name: Check idle threshold + id: check_idle + if: steps.check_ci.outputs.active_jobs == 'false' + env: + LAST_RUN_TIME: ${{ steps.check_ci.outputs.last_run_time }} + run: | + if [[ -z "${LAST_RUN_TIME}" ]]; then + echo "No completed CI runs found, checking cluster creation time as fallback..." 
+ LAST_RUN_TIME=$(ibmcloud oc cluster get --cluster "${CLUSTER_NAME}" --output json 2>/dev/null \ + | jq -r '.createdDate // empty') + if [[ -n "${LAST_RUN_TIME}" ]]; then + echo "Using cluster creation time: ${LAST_RUN_TIME}" + else + echo "Cannot determine last activity time, skipping teardown for safety" + echo "recent_activity=true" >> "$GITHUB_OUTPUT" + echo "reason=Cannot determine last activity timestamp" >> "$GITHUB_OUTPUT" + exit 0 + fi + fi + + echo "Last activity: ${LAST_RUN_TIME}" + + LAST_EPOCH=$(date -d "${LAST_RUN_TIME}" +%s) + IDLE_MINUTES=$(( ($(date +%s) - LAST_EPOCH) / 60 )) + echo "Idle for ${IDLE_MINUTES} minutes (threshold: ${IDLE_THRESHOLD_MINUTES} minutes)" + + if [[ ${IDLE_MINUTES} -ge ${IDLE_THRESHOLD_MINUTES} ]]; then + echo "recent_activity=false" >> "$GITHUB_OUTPUT" + echo "reason=CI jobs have been idle for ${IDLE_MINUTES} minutes (threshold: ${IDLE_THRESHOLD_MINUTES})" >> "$GITHUB_OUTPUT" + else + REMAINING=$((IDLE_THRESHOLD_MINUTES - IDLE_MINUTES)) + echo "recent_activity=true" >> "$GITHUB_OUTPUT" + echo "reason=CI jobs last ran ${IDLE_MINUTES} minutes ago, ${REMAINING} minutes remaining before threshold" >> "$GITHUB_OUTPUT" + fi + + - name: Check if cluster exists + id: check_cluster + if: steps.check_idle.outputs.recent_activity == 'false' + run: | + if ibmcloud oc cluster get --cluster "${CLUSTER_NAME}" &>/dev/null; then + STATE=$(ibmcloud oc cluster get --cluster "${CLUSTER_NAME}" --output json 2>/dev/null | jq -r '.state // "unknown"') + echo "Cluster '${CLUSTER_NAME}' exists (state: ${STATE})" + echo "exists=true" >> "$GITHUB_OUTPUT" + else + echo "Cluster '${CLUSTER_NAME}' does not exist, nothing to do" + echo "exists=false" >> "$GITHUB_OUTPUT" + fi + + - name: Trigger teardown + if: steps.check_idle.outputs.recent_activity == 'false' && steps.check_cluster.outputs.exists == 'true' + uses: actions/github-script@v8 + with: + script: | + await github.rest.actions.createWorkflowDispatch({ + owner: context.repo.owner, + repo: 
context.repo.repo, + workflow_id: 'ibmc-cluster-teardown.yml', + ref: context.ref, + inputs: { + cluster_name: '${{ env.CLUSTER_NAME }}' + } + }); + core.info('Teardown workflow triggered for cluster ${{ env.CLUSTER_NAME }}'); + + - name: Summary + if: always() + env: + CLUSTER_EXISTS: ${{ steps.check_cluster.outputs.exists }} + WORKFLOW_ACTIVE_JOBS: ${{ steps.check_ci.outputs.active_jobs }} + WORKFLOW_LAST_RUN_TIME: ${{ steps.check_ci.outputs.last_run_time }} + IDLE_RECENT_ACTIVITY: ${{ steps.check_idle.outputs.recent_activity }} + IDLE_REASON: ${{ steps.check_idle.outputs.reason }} + run: | + echo "## Auto-Teardown Check" >> "$GITHUB_STEP_SUMMARY" + echo "" >> "$GITHUB_STEP_SUMMARY" + echo "| Check | Result |" >> "$GITHUB_STEP_SUMMARY" + echo "|-------|--------|" >> "$GITHUB_STEP_SUMMARY" + echo "| Cluster | \`${CLUSTER_NAME}\` |" >> "$GITHUB_STEP_SUMMARY" + echo "| Cluster Exists | \`${CLUSTER_EXISTS:-(unknown)}\` |" >> "$GITHUB_STEP_SUMMARY" + echo "| Workflow Active Jobs? | \`${WORKFLOW_ACTIVE_JOBS:-N/A}\` |" >> "$GITHUB_STEP_SUMMARY" + echo "| Workflow Last Run Time | \`${WORKFLOW_LAST_RUN_TIME:-N/A}\` |" >> "$GITHUB_STEP_SUMMARY" + echo "| Idle Recent Activity? 
| \`${IDLE_RECENT_ACTIVITY:-N/A}\` |" >> "$GITHUB_STEP_SUMMARY" + echo "| Idle Reason | \`${IDLE_REASON:-N/A}\` |" >> "$GITHUB_STEP_SUMMARY" diff --git a/.github/workflows/ibmc-cluster-setup.yml b/.github/workflows/ibmc-cluster-setup.yml index 07b738a060..12ef175ad7 100644 --- a/.github/workflows/ibmc-cluster-setup.yml +++ b/.github/workflows/ibmc-cluster-setup.yml @@ -47,16 +47,12 @@ jobs: timeout-minutes: 360 steps: - name: Checkout - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6 - with: - persist-credentials: false + uses: actions/checkout@v5 - name: Setup IBM Cloud CLI - uses: IBM/actions-ibmcloud-cli@953e229550655a880eda6ecfb01fbbdf12f119a5 + uses: IBM/actions-ibmcloud-cli@v1 with: - api_key: ${{ secrets.IC_KEY }} - region: eu-de - group: cnv-ui + api_key: ${{ secrets.IBM_CLOUD_API_KEY }} plugins: kubernetes-service, container-registry - name: Check for existing cluster @@ -107,7 +103,7 @@ jobs: echo "ERROR: Flavor '${FLAVOR}' is not available in zone '${ZONE}'." echo "" echo "Available flavors in '${ZONE}':" - echo "${LOCATIONS_JSON}" | jq -r --arg z "${ZONE}" '.[] | select(.id == $z) | .flavors | split(",")[]' | sort + echo "${LOCATIONS_JSON}" | jq -r --arg z "${ZONE}" '.[] | select(.id == $z) | .flavors[]? | .id' | sort exit 2 fi echo "Flavor '${FLAVOR}' is available in zone '${ZONE}'" @@ -116,9 +112,6 @@ jobs: if: steps.check_cluster.outputs.exists == 'false' env: ZONE: ${{ inputs.zone }} - OPENSHIFT_VERSION: ${{ inputs.openshift_version }} - WORKER_FLAVOR: ${{ inputs.worker_flavor }} - WORKER_COUNT: ${{ inputs.worker_count }} run: | echo "Looking up existing VLANs in zone '${ZONE}'..." VLAN_JSON=$(ibmcloud oc vlan ls --zone "${ZONE}" --output json 2>/dev/null || echo "[]") @@ -132,31 +125,27 @@ jobs: echo "No existing VLANs in zone, new VLANs will be created" fi - echo "Creating cluster '${CLUSTER_NAME}' with ${WORKER_COUNT}x ${WORKER_FLAVOR} workers in zone ${ZONE}..." 
- VLAN_ARGS=() - if [[ -n "${PRIVATE_VLAN}" ]]; then - VLAN_ARGS+=(--private-vlan "${PRIVATE_VLAN}") - fi - if [[ -n "${PUBLIC_VLAN}" ]]; then - VLAN_ARGS+=(--public-vlan "${PUBLIC_VLAN}") - fi + echo "Creating cluster '${CLUSTER_NAME}' with ${{ inputs.worker_count }}x ${{ inputs.worker_flavor }} workers in zone ${ZONE}..." ibmcloud oc cluster create classic \ --name "${CLUSTER_NAME}" \ - --version "${OPENSHIFT_VERSION}" \ - --flavor "${WORKER_FLAVOR}" \ - --workers "${WORKER_COUNT}" \ + --version "${{ inputs.openshift_version }}" \ + --flavor "${{ inputs.worker_flavor }}" \ + --workers "${{ inputs.worker_count }}" \ --zone "${ZONE}" \ - "${VLAN_ARGS[@]}" + --private-vlan "${PRIVATE_VLAN}" \ + --public-vlan "${PUBLIC_VLAN}" - name: Wait for cluster to be ready to use run: | ./ci-scripts/check-roks-cluster-state.sh - - name: Install oc client from cluster version + - name: Install oc client from cluster run: | - CLUSTER_JSON="$(ibmcloud oc cluster get --cluster "${CLUSTER_NAME}" --output json)" - export CLUSTER_JSON - bash ./ci-scripts/install-oc-client.sh + INGRESS=$(ibmcloud oc cluster get --cluster "${CLUSTER_NAME}" --json | jq -r '.ingressHostname') + curl -kLo oc.tar "https://downloads-openshift-console.${INGRESS}/amd64/linux/oc.tar" + tar -xvf oc.tar + sudo mv oc /usr/local/bin/ + echo "oc version: $(oc version --client)" - name: Configure kubeconfig run: | @@ -184,12 +173,13 @@ jobs: fi echo "ARC secrets are present." 
- - name: Setup ARC runner image + - name: Setup ARC runner image and dind mirror id: build_runner env: OC_VERSION: '4.20' run: | - IMAGE_REF=$(./ci-scripts/images/setup-arc-runner-image.sh | grep '^IMAGE_REF=' | cut -d= -f2-) + ./ci-scripts/arc/setup-dind-mirror.sh + IMAGE_REF=$(./ci-scripts/arc/setup-runner-image.sh | grep '^IMAGE_REF=' | cut -d= -f2-) echo "image_ref=${IMAGE_REF}" >> "$GITHUB_OUTPUT" - name: Install ARC @@ -214,23 +204,17 @@ jobs: - name: Setup summary if: always() - env: - INPUT_ZONE: ${{ inputs.zone }} - INPUT_VERSION: ${{ inputs.openshift_version }} - INPUT_FLAVOR: ${{ inputs.worker_flavor }} - INPUT_WORKERS: ${{ inputs.worker_count }} - INPUT_KVM: ${{ inputs.kvm_emulation }} run: | echo "## Hot Cluster Setup Summary" >> "$GITHUB_STEP_SUMMARY" echo "" >> "$GITHUB_STEP_SUMMARY" echo "| Parameter | Value |" >> "$GITHUB_STEP_SUMMARY" echo "|-----------|-------|" >> "$GITHUB_STEP_SUMMARY" echo "| Cluster | \`${CLUSTER_NAME}\` |" >> "$GITHUB_STEP_SUMMARY" - echo "| Zone | \`${INPUT_ZONE}\` |" >> "$GITHUB_STEP_SUMMARY" - echo "| OpenShift | \`${INPUT_VERSION}\` |" >> "$GITHUB_STEP_SUMMARY" - echo "| Worker Flavor | \`${INPUT_FLAVOR}\` |" >> "$GITHUB_STEP_SUMMARY" - echo "| Workers | \`${INPUT_WORKERS}\` |" >> "$GITHUB_STEP_SUMMARY" - echo "| KVM Emulation | \`${INPUT_KVM}\` |" >> "$GITHUB_STEP_SUMMARY" + echo "| Zone | \`${{ inputs.zone }}\` |" >> "$GITHUB_STEP_SUMMARY" + echo "| OpenShift | \`${{ inputs.openshift_version }}\` |" >> "$GITHUB_STEP_SUMMARY" + echo "| Worker Flavor | \`${{ inputs.worker_flavor }}\` |" >> "$GITHUB_STEP_SUMMARY" + echo "| Workers | \`${{ inputs.worker_count }}\` |" >> "$GITHUB_STEP_SUMMARY" + echo "| KVM Emulation | \`${{ inputs.kvm_emulation }}\` |" >> "$GITHUB_STEP_SUMMARY" echo "" >> "$GITHUB_STEP_SUMMARY" if oc cluster-info &>/dev/null; then echo "Cluster is **healthy** and ready for CI." 
>> "$GITHUB_STEP_SUMMARY" diff --git a/.github/workflows/ibmc-cluster-teardown.yml b/.github/workflows/ibmc-cluster-teardown.yml index a226cbc188..e023fdf46d 100644 --- a/.github/workflows/ibmc-cluster-teardown.yml +++ b/.github/workflows/ibmc-cluster-teardown.yml @@ -16,8 +16,10 @@ on: default: 'kubevirt-plugin-ci' type: string secrets: - IC_KEY: + IBM_CLOUD_API_KEY: required: true + BOT_PAT: + required: false permissions: contents: read @@ -29,52 +31,40 @@ jobs: teardown: name: Tear Down Hot Cluster runs-on: ubuntu-latest - timeout-minutes: 150 + timeout-minutes: 60 steps: - name: Setup IBM Cloud CLI - uses: IBM/actions-ibmcloud-cli@953e229550655a880eda6ecfb01fbbdf12f119a5 + uses: IBM/actions-ibmcloud-cli@v1 with: - api_key: ${{ secrets.IC_KEY }} - region: eu-de - group: cnv-ui + api_key: ${{ secrets.IBM_CLOUD_API_KEY }} plugins: kubernetes-service - name: Check cluster exists id: check_cluster run: | - OUTPUT=$(ibmcloud oc cluster get --cluster "${CLUSTER_NAME}" --output json 2>&1) && EXIT=0 || EXIT=$? 
- - if [[ ${EXIT} -eq 0 ]]; then + if ibmcloud oc cluster get --cluster "${CLUSTER_NAME}" &>/dev/null; then echo "Cluster '${CLUSTER_NAME}' found" echo "exists=true" >> "$GITHUB_OUTPUT" ibmcloud oc cluster config --cluster "${CLUSTER_NAME}" --admin || true - elif grep -qE 'G0004|could not be found' <<< "${OUTPUT}"; then + else echo "Cluster '${CLUSTER_NAME}' not found, nothing to tear down" echo "exists=false" >> "$GITHUB_OUTPUT" - else - echo "ERROR: Failed to check cluster '${CLUSTER_NAME}':" - echo "${OUTPUT}" - exit 1 fi - - name: Checkout - if: steps.check_cluster.outputs.exists == 'true' - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6 - with: - persist-credentials: false - - - name: Setup Helm - if: steps.check_cluster.outputs.exists == 'true' - uses: azure/setup-helm@bf6a7d304bc2fdb57e0331155b7ebf2c504acf0a # v4 - with: - version: '3.19.0' - - name: Deregister ARC runners if: steps.check_cluster.outputs.exists == 'true' continue-on-error: true run: | - ./ci-scripts/arc/uninstall-arc.sh + if command -v helm &>/dev/null || (curl -fsSL https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash); then + echo "Uninstalling ARC runner scale set..." + helm uninstall kubevirt-plugin-ci --namespace arc-runners --wait --timeout 5m 2>/dev/null || echo "Runner scale set not found or already removed" + + echo "Uninstalling ARC controller..." + helm uninstall arc --namespace arc-systems --wait --timeout 5m 2>/dev/null || echo "ARC controller not found or already removed" + else + echo "WARNING: Helm not available, skipping Helm uninstall" + fi - name: Delete cluster if: steps.check_cluster.outputs.exists == 'true' @@ -89,33 +79,37 @@ jobs: ELAPSED=0 while [[ ${ELAPSED} -lt ${MAX_WAIT} ]]; do - OUTPUT=$(ibmcloud oc cluster get --cluster "${CLUSTER_NAME}" --output json 2>&1) && EXIT=0 || EXIT=$? - - if [[ ${EXIT} -eq 0 ]]; then - echo "[$(date '+%H:%M:%S')] Cluster still being deleted... 
(${ELAPSED}s elapsed)" - sleep ${INTERVAL} - ELAPSED=$((ELAPSED + INTERVAL)) - elif grep -qE 'G0004|could not be found' <<< "${OUTPUT}"; then + if ! ibmcloud oc cluster get --cluster "${CLUSTER_NAME}" &>/dev/null; then echo "Cluster has been removed" break - else - echo "[$(date '+%H:%M:%S')] WARNING: Failed to check cluster status (will retry):" - echo "${OUTPUT}" - sleep ${INTERVAL} - ELAPSED=$((ELAPSED + INTERVAL)) fi + echo "[$(date '+%H:%M:%S')] Cluster still being deleted... (${ELAPSED}s elapsed)" + sleep ${INTERVAL} + ELAPSED=$((ELAPSED + INTERVAL)) done - OUTPUT=$(ibmcloud oc cluster get --cluster "${CLUSTER_NAME}" --output json 2>&1) && EXIT=0 || EXIT=$? - if [[ ${EXIT} -eq 0 ]]; then + if ibmcloud oc cluster get --cluster "${CLUSTER_NAME}" &>/dev/null; then echo "::error::Timed out waiting for cluster deletion" exit 1 - elif grep -qE 'G0004|could not be found' <<< "${OUTPUT}"; then - echo "Cluster deletion confirmed" + fi + + # TODO: Followup about how to manually delete "Runner scale sets" from ARC + - name: Clean up ghost runners + continue-on-error: true + env: + GH_TOKEN: ${{ secrets.BOT_PAT }} + run: | + echo "Checking for offline self-hosted runners..." + RUNNERS=$(gh api "/repos/${{ github.repository }}/actions/runners" --jq '.runners[] | select(.status == "offline") | select(.labels[].name == "kubevirt-plugin-ci") | .id' 2>/dev/null || true) + + if [[ -z "${RUNNERS}" ]]; then + echo "No offline 'kubevirt-plugin-ci' runners found" else - echo "ERROR: Failed to verify cluster deletion for '${CLUSTER_NAME}':" - echo "${OUTPUT}" - exit 1 + for runner_id in ${RUNNERS}; do + echo "Deleting offline runner ${runner_id}..." 
+ gh api -X DELETE "/repos/${{ github.repository }}/actions/runners/${runner_id}" || echo "Failed to delete runner ${runner_id}" + done + echo "Ghost runner cleanup complete" fi - name: Teardown summary diff --git a/.github/workflows/poc-e2e-ci-test.yml b/.github/workflows/poc-e2e-ci-test.yml new file mode 100644 index 0000000000..f34bf4e53a --- /dev/null +++ b/.github/workflows/poc-e2e-ci-test.yml @@ -0,0 +1,75 @@ +name: POC Hot Cluster E2E CI Test + +on: + workflow_dispatch: + inputs: + test_spec: + description: 'Cypress test spec to run (passed to POC E2E Test 2)' + required: true + default: 'tests/gating.cy.ts' + type: string + cluster_name: + description: 'Cluster name' + required: true + default: 'kubevirt-plugin-ci' + type: string + +permissions: + contents: read + actions: read + +env: + CLUSTER_NAME: ${{ inputs.cluster_name || 'kubevirt-plugin-ci' }} + +jobs: + cluster-health-check: + name: Cluster Health Check + runs-on: ubuntu-latest + timeout-minutes: 10 + steps: + - name: Checkout + uses: actions/checkout@v5 + + - name: Setup IBM Cloud CLI + uses: IBM/actions-ibmcloud-cli@v1 + with: + api_key: ${{ secrets.IBM_CLOUD_API_KEY }} + plugins: kubernetes-service + + - name: Install oc client from cluster + run: | + INGRESS=$(ibmcloud oc cluster get --cluster "${CLUSTER_NAME}" --json | jq -r '.ingressHostname') + curl -kLo oc.tar "https://downloads-openshift-console.${INGRESS}/amd64/linux/oc.tar" + tar -xvf oc.tar + sudo mv oc /usr/local/bin/ + echo "oc version: $(oc version --client)" + + - name: Configure kubeconfig + run: | + ibmcloud oc cluster config --cluster "${CLUSTER_NAME}" --admin + oc cluster-info + oc get nodes -o wide + + - name: Run health checks + run: | + ./ci-scripts/check-cluster-health.sh + + - name: Health check summary + if: always() + run: | + echo "## Cluster Health Check" >> "$GITHUB_STEP_SUMMARY" + echo "" >> "$GITHUB_STEP_SUMMARY" + if [[ "${{ job.status }}" == "success" ]]; then + echo "All health checks **passed**. 
Invoking **POC Hot Cluster E2E CI Test 2** for Cypress." >> "$GITHUB_STEP_SUMMARY" + else + echo "Health checks **failed**. E2E workflow was not invoked." >> "$GITHUB_STEP_SUMMARY" + fi + + run-e2e-tests: + name: Run E2E Tests + needs: cluster-health-check + if: needs.cluster-health-check.result == 'success' + uses: ./.github/workflows/poc-e2e-ci-test2.yml + with: + test_spec: ${{ inputs.test_spec }} + secrets: inherit diff --git a/.github/workflows/poc-e2e-ci-test2.yml b/.github/workflows/poc-e2e-ci-test2.yml new file mode 100644 index 0000000000..4e34a17917 --- /dev/null +++ b/.github/workflows/poc-e2e-ci-test2.yml @@ -0,0 +1,323 @@ +name: POC Hot Cluster E2E CI Test 2 + +on: + workflow_dispatch: + inputs: + test_spec: + description: Cypress test spec to run + required: true + default: tests/poc-gating.cy.ts + type: string + workflow_call: + inputs: + test_spec: + description: Cypress test spec to run + type: string + required: false + default: tests/poc-gating.cy.ts + +permissions: + contents: read + actions: read + +env: + BRIDGE_BASE_ADDRESS: http://localhost:9000 + CYPRESS_CNV_NS: kubevirt-hyperconverged + CYPRESS_OS_IMAGES_NS: kubevirt-os-images + CYPRESS_TEST_NS: kubevirt-plugin-ci-test-${{ github.run_id }} + CYPRESS_TEST_SECRET_NAME: ci-test-secret + + # KUBEVIRT_PLUGIN_IMAGE: 'ttl.sh/kubevirt-plugin-ci-${{ github.run_id }}-${{ github.run_number }}:2h' + KUBEVIRT_PLUGIN_IMAGE: ttl.sh/kubevirt-plugin-ci-1234:6h + +jobs: + check-runner: + name: Check Runner Image + runs-on: kubevirt-plugin-ci + timeout-minutes: 15 + steps: + - name: Log environment summary + run: | + { + echo "
Key Environment Variables" + echo "" + echo "| Variable | Value |" + echo "| --- | --- |" + for var in HOME USER RUNNER_NAME RUNNER_OS RUNNER_ARCH \ + GITHUB_REPOSITORY GITHUB_REF GITHUB_SHA GITHUB_RUN_ID GITHUB_RUN_NUMBER \ + BRIDGE_BASE_ADDRESS CYPRESS_CNV_NS CYPRESS_OS_IMAGES_NS \ + CYPRESS_TEST_NS KUBEVIRT_PLUGIN_IMAGE KUBEVIRT_UI_PLUGIN_RUNNER; do + echo "| \`$var\` | \`${!var:-}\` |" + done + echo "
" + echo "" + + echo "
Tool Availability" + echo "" + echo "| Tool | Available |" + echo "| --- | --- |" + for cmd in jq yq envsubst curl kubectl oc virtctl docker npm node; do + if command -v "$cmd" &>/dev/null; then + echo "| \`$cmd\` | ✅ |" + else + echo "| \`$cmd\` | ❌ |" + fi + done + echo "
" + echo "" + + echo "
npm / Node Versions" + echo "" + echo "\`\`\`json" + npm version --json 2>/dev/null || echo "npm not found" + echo "\`\`\`" + echo "
" + echo "" + } | tee -a "$GITHUB_STEP_SUMMARY" + + - name: Log Client / Server Versions + run: | + { + echo "
Client / Server Versions" + echo "" + echo "| Tool | Client Version | Server Version |" + echo "| --- | --- | --- |" + for cmd in oc virtctl; do + if command -v "$cmd" &>/dev/null; then + client="" + server="" + version_output=$("$cmd" version 2>/dev/null || true) + client=$(echo "$version_output" | grep -i "client" | head -1 | sed 's/^[[:space:]]*//') + server=$(echo "$version_output" | grep -i "server" | head -1 | sed 's/^[[:space:]]*//') + echo "| \`$cmd\` | ${client:-N/A} | ${server:-N/A} |" + else + echo "| \`$cmd\` | ❌ not found | — |" + fi + done + echo "
" + echo "" + } | tee -a "$GITHUB_STEP_SUMMARY" + + - name: Log HCO and managed operator versions + continue-on-error: true + run: | + { + echo "
HCO & Managed Operator Versions" + echo "" + + if ! command -v oc &>/dev/null; then + echo "> ⚠️ \`oc\` not found — skipping cluster version checks." + else + # HCO itself is installed via OLM; its CSV is the authoritative version. + echo "### HCO Version (OLM CSV)" + echo "" + echo "| Name | Version | Phase |" + echo "| --- | --- | --- |" + oc get csv -n kubevirt-hyperconverged --no-headers 2>/dev/null \ + | grep -i hyperconverged \ + | awk '{ print "| `" $1 "` | `" $2 "` | " $NF " |" }' \ + || echo "| — | HCO CSV not found | — |" + echo "" + + # HCO labels every operand CR with app.kubernetes.io/managed-by=hco-operator. + # Use that label + -A (all namespaces) so we never hardcode a name or namespace. + # KubeVirt uses observedKubeVirtVersion; all others use observedVersion. + echo "### HCO Managed Operand Versions" + echo "" + echo "| Operand | Version |" + echo "| --- | --- |" + + HCO_LABEL="app.kubernetes.io/managed-by=hco-operator" + + kv_ver=$(oc get kubevirt -A -l "${HCO_LABEL}" \ + -o jsonpath='{.items[0].status.observedKubeVirtVersion}' 2>/dev/null || echo "") + echo "| \`kubevirt\` | \`${kv_ver:-not found}\` |" + + cdi_ver=$(oc get cdi -A -l "${HCO_LABEL}" \ + -o jsonpath='{.items[0].status.observedVersion}' 2>/dev/null || echo "") + echo "| \`cdi\` | \`${cdi_ver:-not found}\` |" + + ssp_ver=$(oc get ssp -A -l "${HCO_LABEL}" \ + -o jsonpath='{.items[0].status.observedVersion}' 2>/dev/null || echo "") + echo "| \`ssp\` | \`${ssp_ver:-not found}\` |" + + cnao_ver=$(oc get networkaddonsconfig -A -l "${HCO_LABEL}" \ + -o jsonpath='{.items[0].status.observedVersion}' 2>/dev/null || echo "") + echo "| \`cnao\` | \`${cnao_ver:-not found}\` |" + + hpp_ver=$(oc get hostpathprovisioner -A -l "${HCO_LABEL}" \ + -o jsonpath='{.items[0].status.observedVersion}' 2>/dev/null || echo "") + echo "| \`hostpath-provisioner\` | \`${hpp_ver:-not found}\` |" + fi + echo "
" + echo "" + } | tee -a "$GITHUB_STEP_SUMMARY" + + build-kubevirt-plugin-image: + name: Build Kubevirt Plugin Image + needs: check-runner + runs-on: ubuntu-latest + outputs: + kubevirt-plugin-image: ${{ env.KUBEVIRT_PLUGIN_IMAGE }} + steps: + - name: Checkout + uses: actions/checkout@v6 + + - name: Check if kubevirt-plugin image exists in registry + id: check_image + run: | + # Try unauthenticated first (works for public registries) + if skopeo inspect docker://${KUBEVIRT_PLUGIN_IMAGE} &>/dev/null; then + echo "IMAGE_EXISTS=true" >> $GITHUB_OUTPUT + else + echo "IMAGE_EXISTS=false" >> $GITHUB_OUTPUT + fi + + - name: Build kubevirt-plugin image + if: steps.check_image.outputs.IMAGE_EXISTS == 'false' + run: | + docker build -t ${KUBEVIRT_PLUGIN_IMAGE} -f Dockerfile . + docker push ${KUBEVIRT_PLUGIN_IMAGE} + + run-gating-tests: + name: Run Gating Tests + needs: build-kubevirt-plugin-image + runs-on: kubevirt-plugin-ci + timeout-minutes: 120 + env: + PLUGIN_IMAGE: ${{ needs.build-kubevirt-plugin-image.outputs.kubevirt-plugin-image }} + PLUGIN_PORT: 9001 + PLUGIN_NAME: kubevirt-plugin-ci + PLUGIN_TRANSPORT: http + CONSOLE_PORT: 9000 + + steps: + - name: Checkout + uses: actions/checkout@v6 + + - name: Resolve console image from cluster OpenShift version + run: bash ci-scripts/resolve-console-image.sh >> "${GITHUB_ENV}" + + - name: Setup required namespaced resources + run: | + oc create namespace "${CYPRESS_TEST_NS}" --dry-run=client -o yaml | oc apply -f - + # Patch the existing fixture to substitute the CI secret's namespace and name. 
+ yq e '.metadata.name = strenv(CYPRESS_TEST_SECRET_NAME) | .metadata.namespace = strenv(CYPRESS_TEST_NS)' \ + cypress/fixtures/secret.yaml | oc apply -f - + + - name: Start kubevirt-plugin container (mimics operator deployment with a ConfigMap + Secret) + env: + PLUGIN_URL: '${{ env.PLUGIN_TRANSPORT }}://localhost:${{ env.PLUGIN_PORT }}/plugin-manifest.json' + run: | + ./ci-scripts/start-plugin-container.sh + + echo "Waiting for plugin at ${PLUGIN_URL}..." + for i in $(seq 1 30); do + if curl -skSf "${PLUGIN_URL}" -o /dev/null; then + echo "Plugin is responding at ${PLUGIN_URL}." + exit 0 + fi + if [[ "$i" -eq 30 ]]; then + echo "::error::Plugin did not become ready on ${PLUGIN_URL}" + docker ps -a --filter "name=${PLUGIN_NAME}" || true + docker inspect "${PLUGIN_NAME}" --format 'status={{.State.Status}} exit={{.State.ExitCode}} err={{.State.Error}}' 2>/dev/null || true + docker logs "${PLUGIN_NAME}" 2>&1 || true + exit 1 + fi + sleep 2 + done + + - name: Start the "off cluster" console + shell: bash + run: | + ./ci-scripts/start-console.sh + + echo "Waiting for console at ${BRIDGE_BASE_ADDRESS}..." + for i in $(seq 1 60); do + if curl -s -o /dev/null -w "%{http_code}" "${BRIDGE_BASE_ADDRESS}/" | grep -qE '200|301|302'; then + echo "Console is responding." + break + fi + if [[ "$i" -eq 60 ]]; then + echo "::error::Console did not become ready within the wait window." + exit 1 + fi + sleep 5 + done + + # TODO: Add dependency caching (either use the setup-node action with caching, or add explicit caching) + - name: Install dependencies + run: | + npm ci --ignore-scripts --no-audit + npx cypress install + + # # TODO: Replace with the cypress action? 
+ # - name: Run Cypress gating tests + # run: | + # npm run test-cypress-headless -- --spec="${{ inputs.test_spec }}" + + - name: Run gating tests + uses: cypress-io/github-action@v7 + with: + summary-title: 'Cypress gating tests' + install: false + working-directory: ./cypress + env: openshift=true + spec: '${{ inputs.test_spec }}' + + - name: Generate test report + if: always() + run: npm run cypress-postreport || true + + - name: Upload test artifacts + if: always() + uses: actions/upload-artifact@v6 + with: + name: cypress-results-${{ github.run_id }} + path: | + cypress/gui-test-screenshots/ + cypress/videos/ + cypress/results/ + retention-days: 7 + if-no-files-found: ignore + + - name: Capture logs, stop and rm the console and plugin containers + if: always() + run: | + TMP=/tmp/e2e-ci-diagnostics/container-logs + mkdir -p "${TMP}" + + docker logs console > "${TMP}/console.log" 2>&1 || true + docker stop console || echo "::warning::Could not stop console container" + docker rm -f console || true + + docker logs "${PLUGIN_NAME}" > "${TMP}/kubevirt-plugin.log" 2>&1 || true + docker stop "${PLUGIN_NAME}" || echo "::warning::Could not stop ${PLUGIN_NAME} container" + docker rm -f "${PLUGIN_NAME}" || true + + - name: Collect OpenShift cluster diagnostics on failure + if: failure() + run: | + TMP=/tmp/e2e-ci-diagnostics/cluster + mkdir -p "${TMP}" + + oc cluster-info dump > "${TMP}/cluster_info.json" 2>/dev/null || true + oc get pods -n kubevirt-hyperconverged -o wide > "${TMP}/hco_pods.txt" 2>/dev/null || true + oc get nodes -o wide > "${TMP}/nodes.txt" 2>/dev/null || true + oc get events -n "${CYPRESS_TEST_NS}" --sort-by='.lastTimestamp' > "${TMP}/test_ns_events.txt" 2>/dev/null || true + + - name: Upload E2E diagnostics (cluster + container logs) on failure + if: failure() + uses: actions/upload-artifact@v6 + with: + name: e2e-ci-diagnostics-cluster-and-containers-${{ github.run_id }} + path: /tmp/e2e-ci-diagnostics/ + retention-days: 7 + if-no-files-found: 
ignore + + # TODO: Add any other cleanup steps that are required for the test to this step + - name: Cleanup cluster resources + if: always() + run: | + ./ci-scripts/test-cleanup.sh || echo "::warning::Cleanup encountered errors (non-fatal)" + oc delete namespace "${CYPRESS_TEST_NS}" --wait=false || true diff --git a/.gitignore b/.gitignore index 245f3c3da9..9b20dd6846 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,7 @@ **/node_modules +.npm-cache +.tmp +.tmp-plugin-cert.*/ npm-debug.log dist **/.env diff --git a/.vscode/settings.json b/.vscode/settings.json index e95fd508a1..9968743f07 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -6,5 +6,6 @@ "editor.codeActionsOnSave": { "source.fixAll.eslint": "always" }, - "eslint.validate": ["javascript", "typescript"] + "eslint.validate": ["javascript", "typescript"], + "cSpell.words": ["hyperconverged", "kubevirt"] } diff --git a/Dockerfile b/Dockerfile index 18568643ab..2665678083 100644 --- a/Dockerfile +++ b/Dockerfile @@ -13,9 +13,9 @@ COPY . /opt/app-root/src WORKDIR /opt/app-root/src ENV NODE_OPTIONS=--max-old-space-size=8192 ENV HUSKY=0 -RUN npm config set fetch-timeout 1200000 -RUN npm ci --ignore-scripts -RUN npm run build +RUN npm config set fetch-timeout 1200000 && \ + npm ci --ignore-scripts --no-audit && \ + npm run build # Image info: https://catalog.redhat.com/en/software/containers/ubi9/nginx-124/657b066b6c1bc124a1d7ff39 FROM registry.access.redhat.com/ubi9/nginx-124:1781729725 diff --git a/POC_HOT_CLUSTER_CI.md b/POC_HOT_CLUSTER_CI.md new file mode 100644 index 0000000000..fa1d7a0fb2 --- /dev/null +++ b/POC_HOT_CLUSTER_CI.md @@ -0,0 +1,142 @@ +# Running e2e CI on a hot cluster + +This POC explores how to run e2e CI testing on a long-lived OpenShift cluster, a hot cluster. The ability to repeatedly run multiple CI tests, and multiple CI tests in parallel, can deliver a much better e2e CI experience for developers. 
+ +The work is split in a few parts: + +- GitHub Actions workflows to manage creating, configuring and tearing down an **OpenShift on IBM Cloud (ROKS)** cluster. +- Scripts to enable a cluster to run e2e CI from GitHub Actions. The scripts install the Hyperconverged Cluster Operator (HCO) to provide kubevirt, and install GitHub Actions Runner Controller (ARC) to provide the self-hosted runner support to the e2e CI workflows. +- Workflows to start an "off cluster" console with the kubevirt-plugin, and then run the e2e tests. + +The hot cluster can be any Kubernetes cluster and is based on the GitHub Actions runner controller (ARC). It is installed via helm, and only requires network access to pull from GitHub within the cluster itself. The POC can even be run on a local CRC / OpenShift local development cluster without any special networking configuration. + +--- + +## ROKS as the hot cluster + +**ROKS** (Red Hat OpenShift Kubernetes Service) is IBM Cloud’s managed OpenShift. The hot cluster is created and destroyed through GitHub Actions (see `.github/workflows/ibmc-cluster-*.yml`): provision workers in a chosen zone/flavor, wait until the API is ready, then install the pieces below. **IBM Cloud CLI** plus `IBM_CLOUD_API_KEY` is used to pull an **admin kubeconfig** when needed—nothing long-lived is stored as a kubeconfig secret in GitHub. + +Keeping the cluster **up** avoids repeating **~1 hour** (or more) of create time per test wave and lets you use workers with **real KVM** or large memory where tests need it. **Auto-teardown** workflows can still remove an idle cluster to control cost. + +--- + +## HCO (Hyperconverged Cluster Operator) + +**HCO** installs and coordinates the KubeVirt stack on OpenShift (KubeVirt, CDI, networking helpers, etc.). After the hot cluster is reachable via `oc` cli, **`ci-scripts/install-hco.sh`** deploys the operator and related resources so the cluster can run **VMs and the virtualization UI** the same way a real product cluster would.
Health checks (for example **`ci-scripts/check-cluster-health.sh`**) are available to verify HCO and core virt components before tests run. + +--- + +## ARC (Actions Runner Controller) + +**ARC** is GitHub’s supported way to run **self-hosted Actions runners on Kubernetes**. A **controller** enables a **runner scale set** to provide self-hosted runners to your repo's workflows. When a job requests a `runs-on` label that matches the runner scale set, ARC starts a **runner pod** that registers with GitHub, runs the job, then exits. Each run is a container inside an ephemeral runner. + +In this repo, ARC is installed with Helm charts **`gha-runner-scale-set-controller`** (once per cluster) and **`gha-runner-scale-set`** (once for each runner scale set needed). + +To best support running in OpenShift, and the specific needs of the kubevirt-plugin test stack, first the scripts **`setup-dind-mirror.sh`** (to mirror `docker:dind` into the cluster registry) and **`setup-runner-image.sh`** (build a **custom runner image** with node and cypress support, etc.) are run. + +Once the images are available, two scripts are used to fully set up the ARC install: + +- **`ci-scripts/arc/install-arc-controller.sh`** — controller namespace, SCC, controller Helm release. +- **`ci-scripts/arc/install-runner-scale-set.sh`** — runner scale set, dind post-render, SCC bind for runner pods, RBAC for `oc` in CI jobs. + +Runners default to **Docker-in-Docker (dind)** so workflow steps can use **`docker run`**. This is needed for the off-cluster console flow. GitHub authenticates ARC to the repo via a **GitHub App** (recommended) or a **PAT**. Specific details on setup and secrets needed are in **[ci-scripts/arc/README.md](ci-scripts/arc/README.md)** and **[ci-scripts/README.md](ci-scripts/README.md)**. + +--- + +## Self-hosted runner and off-cluster E2E + +E2E workflows can have jobs that use **`runs-on: kubevirt-plugin-ci`** so they execute **on the cluster** in the ARC ephemeral runners.
The runners are close to the API and with `oc` RBAC. The **OpenShift console** under test is started **off-cluster**, similar to local development: + +1. A workflow job builds a kubevirt-plugin container specific for the workflow run, either from the workflow's running branch or, in the future, from a PR's branch, and pushes the container to an ephemeral container repo (ttl.sh currently). As long as the container repo being pushed to can be pulled from the cluster, the container build can run on standard GitHub runners. +2. **`ci-scripts/resolve-console-image.sh`** picks an **`origin-console`** image tag that matches the cluster’s OpenShift **x.y** version. +3. **`ci-scripts/start-plugin-container.sh`** runs the **plugin** image with HTTPS and nginx (like the operator-mounted serving certs pattern). +4. **`ci-scripts/start-console.sh`** runs the **bridge** in **off-cluster** mode: bearer token and API endpoint from `oc`, **plugin URL** pointing at the plugin container on the runner host, and optional **kubevirt API proxy** via a cluster Route. + +Cypress then drives the UI at **`http://localhost:9000`** while API calls go to the **real cluster**, so tests exercise **real KubeVirt** with a **local console + plugin** topology. + +Orchestration lives in **`.github/workflows/poc-e2e-ci-test.yml`** (cluster health on `ubuntu-latest`, then calls **`poc-e2e-ci-test2.yml`**) and the reusable **`poc-e2e-ci-test2.yml`** workflow that performs the steps above and runs **`npm run test-cypress-headless`**. 
+ +--- + +## More documentation + +| Doc | Purpose | +| ---------------------------------------------------------- | ------------------------------------------------- | +| **[`ci-scripts/README.md`](ci-scripts/README.md)** | Secrets, workflows, troubleshooting, cost control | +| **[`ci-scripts/arc/README.md`](ci-scripts/arc/README.md)** | ARC install order, env vars, OpenShift notes | + +--- + +## Gaps + +### Runner RBAC is overly broad + +The ARC runner pods need to run docker-in-docker (dind) and interact with cluster resources: they create/delete test namespaces, manage secrets and PVCs, and drive KubeVirt resources via Cypress. All of this is currently covered by a single `ClusterRole` (`arc-runner-ci`) bound cluster-wide via `ClusterRoleBinding`. + +The problem is that the `ClusterRole` grants full CRUD plus `deletecollection` on `namespaces` and `secrets` across the entire cluster. A compromised or malicious workflow running in the ARC runner pod could exfiltrate every secret on the cluster or tear down arbitrary namespaces—cluster-admin blast radius. + +The core constraint is that the `poc-e2e-ci-test2.yml` workflow uses a unique namespace per run (`kubevirt-plugin-ci-test-`) and the ARC runner is the one that creates it (`oc create namespace`), injects a secret into it, and deletes it at the end. Namespace create/delete and secret write access are load-bearing for every run, so any RBAC improvement must account for them. + +Three options to address this, in order of increasing workflow restructuring required: + +**Option 1 — Drop `deletecollection` only (minimal, lowest effort)** + +The single most dangerous verb is `deletecollection` — it allows bulk-wiping all resources of a given type in a single API call. Removing it from the verbs list doesn't break any workflow step and immediately reduces the blast radius without touching the namespace or secret permissions that the runner needs. 
+ +The cluster-wide write access to `namespaces` and `secrets` remains, so this is a partial improvement only. + +**Option 2 — Add an admission policy layer** + +Keep the RBAC structure as-is but deploy an OPA Gatekeeper or Kyverno policy that restricts the runner `ServiceAccount` to: + +- Creating namespaces whose name matches `kubevirt-plugin-ci-test-*` only. +- Writing secrets only within namespaces that match that same pattern. + +This limits the blast radius at the admission layer rather than at the RBAC layer, without requiring any workflow changes. It does require Gatekeeper or Kyverno to be installed and maintained on the hot cluster. + +**Option 3 — Split namespace provisioning into a separate standard-runner job (most robust)** + +Restructure the `poc-e2e-ci-test2.yml` workflow by extracting the "Setup required namespaced resources" step into a dedicated job that runs on a standard GitHub-hosted runner (`ubuntu-latest`) using a kubeconfig with elevated rights: + +1. **Provisioning job** (standard runner) — Creates `kubevirt-plugin-ci-test-`, injects the CI secret, and applies any other pre-test cluster resources. This job holds the elevated permissions and is short-lived. +2. **Test-execution job** (ARC runner, `runs-on: kubevirt-plugin-ci`) — Receives the pre-created namespace name as a job input. Its `ClusterRole` no longer needs `namespaces` write verbs or cluster-wide `secrets` write access; it is replaced with a namespaced `Role`/`RoleBinding` bound to the test namespace, plus a minimal read-only `ClusterRole` for cluster-info queries (nodes, console URL, cluster version). + +This follows least-privilege most closely and is the recommended end-state before production use. It requires workflow restructuring and is beyond the scope of the current POC. + +### ARC runner Dockerfile + +Noted by @coderabbitai + +- **Pin the runner base image**: In `ci-scripts/arc/runner-image/Dockerfile`, the base image is using the `:latest` tag. 
It would be more stable and predictable if the version is pinned to a sha or a versioned tag. + +- **Harden the binary downloads**: Implement checksum verification for the unconditional downloads (yq at line 38–40) and for the fallback download paths (kubectl line 75–76, oc line 88–89, virtctl line 100–101). Conditional downloads from environment variables (OC_URL, VIRTCTL_URL) may use console URLs that lack published checksums; document this trade-off or require verification for those paths as well. + +### DIND Mirror + +Noted by @coderabbitai + +The default `docker.io/library/docker:dind` uses a floating tag that advances with Docker releases. While the script allows overriding via `DIND_SOURCE_IMAGE` environment variable, the default floating tag means different CI runs—weeks or months apart—could pull and mirror different dind versions underneath identical source code. Given the repo's emphasis on aligned and pinned versions for reproducibility, the dind default should either be a specific version (e.g., `docker:26.0` or a sha256 digest) or the docs should explicitly document that `DIND_SOURCE_IMAGE=docker.io/library/docker:` must be set in CI to achieve reproducible runner pods. + +### FIPS-enabled cluster support + +The upstream GitHub Actions runner image (`ghcr.io/actions/actions-runner:latest`) is Ubuntu 22.04-based. On FIPS-enabled OpenShift clusters, the kernel exposes `/proc/sys/crypto/fips_enabled = 1` to all containers. OpenSSL and the .NET runtime (which powers the runner's `Runner.Listener` binary) detect this flag and attempt to use FIPS-validated cryptographic providers. Since the Ubuntu image lacks the required FIPS provider module (`fips.so`), the runner segfaults during TLS handshake with GitHub — and `run.sh` masks the crash as exit code 0, making the failure invisible. + +The current workaround sets `OPENSSL_FORCE_FIPS_MODE=0` in the runner container environment (see `arc-runner-scale-set.pod.yaml`).
This tells OpenSSL to ignore the kernel's FIPS flag. It is sufficient for CI runners that do not need to perform FIPS-validated cryptographic operations themselves. + +The proper long-term solution is to rebase the custom runner image onto a FIPS-compatible base such as `registry.access.redhat.com/ubi9/ubi` (or `ubi9/ubi-minimal`). This would involve: + +1. Starting from UBI instead of `ghcr.io/actions/actions-runner:latest`. +2. Installing the .NET runtime (the runner requires .NET 8+). +3. Downloading and extracting the GitHub Actions runner binaries from the [runner releases](https://github.com/actions/runner/releases). +4. Installing the same additional tooling the current Dockerfile adds (node, jq, oc, virtctl, cypress dependencies, etc.). + +A UBI-based image carries FIPS-validated OpenSSL and crypto providers out of the box, so the runner's .NET TLS stack works correctly without any environment variable overrides. This also avoids the build-time `curl`/OpenSSL issues documented in the current Dockerfile (the `wget2`/GnuTLS workaround for FIPS DSO errors during `oc start-build`). + +References: + +- [actions/runner#4197](https://github.com/actions/runner/issues/4197) — Segfault on FIPS-enabled hosts +- [dotnet/dotnet-docker#5849](https://github.com/dotnet/dotnet-docker/issues/5849) — .NET crypto fails in containers on FIPS kernels + +### If adopted, hardening of the ROKS cluster handling, and cluster health checks are needed + +The workflows and scripts all function, but they should receive additional scrutiny before being adopted for real scenarios. 
diff --git a/ci-scripts/README.md b/ci-scripts/README.md index d481fd0cb9..8005a86a22 100644 --- a/ci-scripts/README.md +++ b/ci-scripts/README.md @@ -1,9 +1,5 @@ # Hot Cluster CI -> **Continuation guide (CNV-74265):** [docs/HOT_CLUSTER_CI_CONTINUATION.md](../docs/HOT_CLUSTER_CI_CONTINUATION.md) -> **Future work backlog:** [docs/HOT_CLUSTER_FUTURE_WORK.md](../docs/HOT_CLUSTER_FUTURE_WORK.md) -> **Cluster lifecycle:** [docs/CLUSTER_LIFECYCLE.md](../docs/CLUSTER_LIFECYCLE.md) - This directory contains scripts and documentation for the **IBM Cloud hot cluster** CI stack: an OpenShift (ROKS) cluster used for KubeVirt plugin integration testing, with **Hyperconverged Cluster Operator (HCO)** and **GitHub Actions Runner Controller (ARC)** so jobs can run on cluster-adjacent self-hosted runners (`kubevirt-plugin-ci`). Workers can be **bare metal** (real KVM) or **VPC / shared** flavors with **KVM emulation**; the setup workflow defaults favor VPC-style flavors and `kvm_emulation: true` unless you change inputs. @@ -14,36 +10,38 @@ Workers can be **bare metal** (real KVM) or **VPC / shared** flavors with **KVM | -------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | **Real KubeVirt / OpenShift behavior** | Tests run against a live cluster with HCO, virt stack, and storage—not mocks. | | **Console + plugin fidelity** | Two POC paths: hit the **in-cluster** console URL, or run an **off-cluster** console container with the plugin served like the operator (TLS + nginx), matching how developers run bridge locally. | -| **Long-running / privileged CI** | GitHub-hosted runners are a poor fit for nested virt, heavy Playwright, and Docker-heavy flows; **ARC** on the cluster provides dind-capable runners with `oc` RBAC. 
| -| **Cost control** | Bare metal and large workers are expensive; manual teardown via **IBM Cloud Hot Cluster Teardown** (auto-teardown in PR #4099). | +| **Long-running / privileged CI** | GitHub-hosted runners are a poor fit for nested virt, heavy Cypress, and Docker-heavy flows; **ARC** on the cluster provides dind-capable runners with `oc` RBAC. | +| **Cost control** | Bare metal and large workers are expensive; **auto-teardown** after idle time limits runaway spend. | ## Architecture **Lifecycle (IBM Cloud)** -```text +``` GitHub Actions │ ├── ibmc-cluster-setup.yml → "IBM Cloud Hot Cluster Setup" - ├── ibmc-cluster-teardown.yml → "IBM Cloud Hot Cluster Teardown" (also workflow_call) - └── ibmc-cluster-auto-teardown.yml → "IBM Cloud Hot Cluster Auto-Teardown" (daily cron safety net) + ├── ibmc-cluster-teardown.yml → "IBM Cloud Hot Cluster Teardown" (also workflow_call + ghost-runner cleanup) + └── ibmc-cluster-auto-teardown.yml → "IBM Cloud Hot Cluster Auto-Teardown" (cron + dispatch → teardown workflow) ``` -**Hot cluster E2E** (added in PR #4099) +**POC E2E (two variants)** -```text -hot-cluster-e2e.yml — "Hot Cluster E2E" (PR + manual dispatch) +``` +poc-e2e-ci-test.yml — "POC Hot Cluster E2E CI Test" ├── cluster-health-check (ubuntu-latest + IBM Cloud → kubeconfig) │ └── ci-scripts/check-cluster-health.sh - └── run-e2e-tests (workflow_call → hot-cluster-e2e-run.yml) + └── run-e2e-tests (workflow_call → poc-e2e-ci-test2.yml) -hot-cluster-e2e-run.yml — "Hot Cluster E2E Run" - ├── check-runner (diagnostics on ARC runner) - ├── build-kubevirt-plugin-image (ubuntu-latest; podman build + push) +poc-e2e-ci-test2.yml — "POC Hot Cluster E2E CI Test 2" + ├── check-runner (optional diagnostics on ARC runner) + ├── build-kubevirt-plugin-image (ubuntu-latest, Docker; may skip if image exists in registry) └── run-gating-tests (runs-on: kubevirt-plugin-ci) - ├── ci-env-request → ci-env-controller → ci-test-stack (console + plugin) - ├── BRIDGE_BASE_ADDRESS from test 
stack - └── Playwright gating (or features project) + ├── ci-scripts/resolve-console-image.sh → CONSOLE_IMAGE matches cluster OCP x.y + ├── ci-scripts/start-plugin-container.sh → plugin over HTTPS :9001 (dind/docker) + ├── ci-scripts/start-console.sh → origin-console container, off-cluster mode + ├── BRIDGE_BASE_ADDRESS=http://localhost:9000 + └── Cypress against local bridge + plugin proxy ``` ## Required GitHub Secrets @@ -52,9 +50,9 @@ These secrets must be configured in the repository settings before running the w ### IBM Cloud -| Secret | Description | How to Obtain | -| -------- | --------------------- | ------------------------------- | -| `IC_KEY` | IBM Cloud IAM API key | Repository/org secret (Actions) | +| Secret | Description | How to Obtain | +| ------------------- | --------------------- | ------------------------------------------------------------- | +| `IBM_CLOUD_API_KEY` | IBM Cloud IAM API key | IBM Cloud Console → Manage → Access (IAM) → API keys → Create | The API key must belong to a user or service ID with the following IAM permissions: @@ -62,6 +60,14 @@ The API key must belong to a user or service ID with the following IAM permissio - **VPC Infrastructure Services**: Editor role (if using VPC-based clusters) - **Classic Infrastructure**: Super User or equivalent (for bare metal provisioning) +### Ghost Runner Cleanup (optional) + +| Secret | Description | How to Obtain | +| --------- | ------------------------- | ------------------------------------------- | +| `BOT_PAT` | PAT with repo admin scope | GitHub Settings → Developer Settings → PATs | + +The `BOT_PAT` is only needed if you want the teardown workflow to automatically delete offline "ghost" runners from GitHub. Deleting self-hosted runners requires repository admin access which `GITHUB_TOKEN` cannot provide. The PAT needs the `repo` scope (classic) or **Administration: Read and Write** (fine-grained). 
If not set, ghost runners can be cleaned up manually via Settings → Actions → Runners. + ### ARC Authentication (choose one) #### Option A: GitHub App (recommended for production) @@ -89,11 +95,9 @@ All workflows that need cluster access use the IBM Cloud CLI to pull a kubeconfi ```yaml - name: Setup IBM Cloud CLI - uses: IBM/actions-ibmcloud-cli@953e229550655a880eda6ecfb01fbbdf12f119a5 # v1 + uses: IBM/actions-ibmcloud-cli@v1 with: - api_key: ${{ secrets.IC_KEY }} - region: eu-de - group: cnv-ui + api_key: ${{ secrets.IBM_CLOUD_API_KEY }} plugins: kubernetes-service - name: Configure kubeconfig @@ -102,7 +106,7 @@ All workflows that need cluster access use the IBM Cloud CLI to pull a kubeconfi oc cluster-info ``` -This avoids storing kubeconfig or credentials as GitHub secrets. Any workflow or job that needs `oc`/`kubectl` access simply repeats these two steps with the shared `IC_KEY` secret. +This avoids storing kubeconfig or credentials as GitHub secrets. Any workflow or job that needs `oc`/`kubectl` access simply repeats these two steps with the shared `IBM_CLOUD_API_KEY`. ## Creating a GitHub App for ARC @@ -129,23 +133,23 @@ This avoids storing kubeconfig or credentials as GitHub secrets. Any workflow or All ARC automation lives under **`ci-scripts/arc/`**. See **[`ci-scripts/arc/README.md`](arc/README.md)** for the full walkthrough. 
-| Script | Role | -| ------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| **`ci-scripts/arc/setup-dind-mirror.sh`** | Mirror **`docker:dind`** to the internal registry, write **`ci-scripts/generated/arc-dind-replace.env`** for Helm post-rendering (standard path; `SKIP_DIND_MIRROR=1` only if dind is provided another way). | -| **`ci-scripts/images/setup-arc-runner-image.sh`** | Build custom runner image (BuildConfig + `images/arc-runner/Dockerfile`); prints **`IMAGE_REF=`**. | -| **`ci-scripts/arc/install-arc-controller.sh`** | Once per cluster: `arc-systems`, **`ci-scripts/arc/arc-openshift-scc.yaml`**, Helm **`gha-runner-scale-set-controller`**. | -| **`ci-scripts/arc/install-runner-scale-set.sh`** | Per scale set: Helm **`gha-runner-scale-set`**, optional **`ARC_RUNNER_IMAGE`**, dind post-render (**`--storage-driver=vfs`** always; optional **`docker:dind`** mirror via env file or **`ARC_DIND_INTERNAL_IMAGE`**), SCC bind, **`arc-runner-rbac.yaml`** (unless `SKIP_ARC_RUNNER_RBAC=1`). Requires **`ARC_CONFIG_URL`** + GitHub auth. Run **after** the controller script. 
| +| Script | Role | +| ------------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| **`ci-scripts/arc/setup-dind-mirror.sh`** | Mirror **`docker:dind`** to the internal registry, write **`ci-scripts/generated/arc-dind-replace.env`** for Helm post-rendering (standard path; `SKIP_DIND_MIRROR=1` only if dind is provided another way). | +| **`ci-scripts/arc/setup-runner-image.sh`** | Build custom runner image (BuildConfig + `runner-image/Dockerfile`); prints **`IMAGE_REF=`**. | +| **`ci-scripts/arc/install-arc-controller.sh`** | Once per cluster: `arc-systems`, **`ci-scripts/arc/arc-openshift-scc.yaml`**, Helm **`gha-runner-scale-set-controller`**. | +| **`ci-scripts/arc/install-runner-scale-set.sh`** | Per scale set: Helm **`gha-runner-scale-set`**, optional **`ARC_RUNNER_IMAGE`**, dind post-render (**`--storage-driver=vfs`** always; optional **`docker:dind`** mirror via env file or **`ARC_DIND_INTERNAL_IMAGE`**), SCC bind, **`arc-runner-rbac.yaml`** (unless `SKIP_ARC_RUNNER_RBAC=1`). Requires **`ARC_CONFIG_URL`** + GitHub auth. Run **after** the controller script. | -Hot Cluster Setup runs **`ci-scripts/images/setup-arc-runner-image.sh`**, then **`ci-scripts/arc/install-arc-controller.sh`** and **`ci-scripts/arc/install-runner-scale-set.sh`** (same env for the install steps). +Hot Cluster Setup runs **`ci-scripts/arc/setup-dind-mirror.sh`**, **`ci-scripts/arc/setup-runner-image.sh`**, then **`ci-scripts/arc/install-arc-controller.sh`** and **`ci-scripts/arc/install-runner-scale-set.sh`** (same env for the install steps). 
### Custom runner image The setup workflow builds a **custom runner image** on the cluster. The image extends the official GitHub Actions runner with Node.js 22, kubectl, oc, virtctl, and jq. Container workflows use **Docker** via the ARC **dind** sidecar (`DOCKER_HOST`). -- **Dockerfile**: `ci-scripts/images/arc-runner/Dockerfile` +- **Dockerfile**: `ci-scripts/arc/runner-image/Dockerfile` - **Runner pod Helm fragment**: `ci-scripts/arc/arc-runner-scale-set.pod.yaml` — used by **`ci-scripts/arc/install-runner-scale-set.sh`**. - **Dind post-render**: **`ci-scripts/arc/install-runner-scale-set.sh`** always runs Helm with **`--post-renderer ci-scripts/arc/arc-dind-post-render.sh`** for **`CONTAINER_MODE=dind`** (injects **`--storage-driver=vfs`** so nested overlay does not fail on OpenShift). **`ci-scripts/arc/setup-dind-mirror.sh`** writes **`ci-scripts/generated/arc-dind-replace.env`** so the post-renderer also swaps **`docker:dind`** for the internal registry; you can set **`ARC_DIND_INTERNAL_IMAGE`** at install time instead (writes the same env file for that run). -- **Refresh runner image only**: re-run **`ci-scripts/images/setup-arc-runner-image.sh`**, then **`ci-scripts/arc/install-runner-scale-set.sh`** with **`ARC_RUNNER_IMAGE`** set to the new ref (and the same auth env vars). +- **Refresh runner image only**: re-run **`ci-scripts/arc/setup-runner-image.sh`**, then **`ci-scripts/arc/install-runner-scale-set.sh`** with **`ARC_RUNNER_IMAGE`** set to the new ref (and the same auth env vars). Optional: `OC_VERSION`, `VIRTCTL_VERSION`, `ARC_RUNNERS_NS`, `CONTAINER_MODE` (default **dind**), `ARC_VERSION`, `ARC_SCALE_SET_LABELS`, `SKIP_ARC_RUNNER_RBAC=1`. @@ -164,7 +168,7 @@ The **stable** chart still hardcodes **`docker:dind`** in templates; this repo k #### Dind image source -The stable chart embeds **`docker:dind`** (Docker Hub). 
This repo **mirrors by default** that image into the OpenShift internal registry via **`ci-scripts/arc/setup-dind-mirror.sh`** and rewrites rendered manifests with the Helm post-renderer so runner pods pull **arc-docker-dind** from the cluster registry (skippable via **`SKIP_DIND_MIRROR=1`** or **`ARC_DIND_INTERNAL_IMAGE`**). The approach avoids Docker Hub pull throttling / rate limiting. +The stable chart embeds **`docker:dind`** (Docker Hub). This repo **always mirrors** that image into the OpenShift internal registry via **`ci-scripts/arc/setup-dind-mirror.sh`** and rewrites rendered manifests with the Helm post-renderer so runner pods pull **arc-docker-dind** from the cluster registry. The approach avoids Docker Hub pull throttling / rate limiting. ### Docker-in-Docker (default) @@ -186,33 +190,41 @@ To turn off dind (no Docker daemon in the pod): `export CONTAINER_MODE=none` and | `.github/workflows/ibmc-cluster-setup.yml` | IBM Cloud Hot Cluster Setup | | `.github/workflows/ibmc-cluster-teardown.yml` | IBM Cloud Hot Cluster Teardown | | `.github/workflows/ibmc-cluster-auto-teardown.yml` | IBM Cloud Hot Cluster Auto-Teardown | -| `.github/workflows/hot-cluster-e2e.yml` | Hot Cluster E2E (PR #4099) | -| `.github/workflows/hot-cluster-e2e-run.yml` | Hot Cluster E2E Run (PR #4099) | +| `.github/workflows/poc-e2e-ci-test.yml` | POC Hot ClusterE2E CI Test | +| `.github/workflows/poc-e2e-ci-test2.yml` | POC Hot Cluster E2E CI Test 2 | ### Setting up the hot cluster 1. Actions → **IBM Cloud Hot Cluster Setup** → Run workflow 2. Inputs: cluster name, **classic** zone (e.g. `wdc04`), OpenShift version, worker flavor/count, **KVM emulation** (`true` for VPC-style workers, `false` for bare metal with hardware KVM) -3.
Wait for completion (provisioning time depends on flavor; setup includes HCO, dind mirror, custom runner image build, ARC controller + scale set, `check-cluster-health.sh`) + +**Implementation notes:** Provisioning uses `ibmcloud oc cluster create classic` (not VPC workers in this workflow). Setup installs `oc` from the cluster downloads endpoint, runs `install-hco.sh`, then `arc/setup-dind-mirror.sh`, `arc/setup-runner-image.sh`, `install-arc-controller.sh`, and `install-runner-scale-set.sh`. -**Implementation notes:** Provisioning uses `ibmcloud oc cluster create classic` (not VPC workers in this workflow). Setup installs `oc` from the cluster downloads endpoint, runs `install-hco.sh`, then `images/setup-arc-runner-image.sh`, `install-arc-controller.sh`, and `install-runner-scale-set.sh`. +### Running POC E2E tests -### Running hot cluster E2E tests +**Variant A — `poc-e2e-ci-test.yml` (IBM Cloud cluster health checks then run `poc-e2e-ci-test2.yml`)** -1. Actions → **Hot Cluster E2E** (PR trigger or manual dispatch) -2. Inputs: Playwright project (`gating` or `features`), cluster name (default `kubevirt-plugin-ci`) -3. Health check on `ubuntu-latest`; on success calls **Hot Cluster E2E Run** -4. Run workflow provisions a `ci-test-stack`, runs Playwright, uploads artifacts, releases the stack +1. Actions → **POC Hot ClusterE2E CI Test** +2. Inputs: Cypress spec (default `tests/gating.cy.ts`), cluster name +3. Runs `check-cluster-health.sh` on `ubuntu-latest` with an IBM Cloud kubeconfig; fails fast if the cluster is unhealthy +4. On success, calls `poc-e2e-ci-test2.yml` via `workflow_call` to run the tests -To run only the test jobs (cluster already verified): dispatch **Hot Cluster E2E Run** directly. +**Variant B — `poc-e2e-ci-test2.yml` (off-cluster console + plugin containers)** + +1. Actions → **POC Hot Cluster E2E CI Test 2** +2. Default spec: `tests/poc-gating.cy.ts` (narrower gating bundle than full `gating.cy.ts`) +3. 
Build job pushes/pulls a **plugin image** from a registry (see **POC debt** below). +4. The test job creates only the **test namespace + dummy secret** (not full `test-setup.sh`); modal handling and other prep lean on Cypress `beforeSpec` / shared helpers. +5. Test job starts **plugin** then **console** via `ci-scripts/`, then Cypress with `BRIDGE_BASE_ADDRESS=http://localhost:9000` ### Tearing down the cluster **Manual:** Actions → **IBM Cloud Hot Cluster Teardown** -**Teardown implementation:** Runs `ci-scripts/arc/uninstall-arc.sh` to cleanly deregister ARC runner scale set and controller via Helm, then deletes the ROKS cluster. +**Automatic:** **IBM Cloud Hot Cluster Auto-Teardown** runs on a schedule (`*/30 * * * *`), uses `GITHUB_TOKEN` with `actions: write` to dispatch **IBM Cloud Hot Cluster Teardown** when idle thresholds are met. Idle detection monitors only the two E2E test workflows (`poc-e2e-ci-test.yml` and `poc-e2e-ci-test2.yml`) for in-progress, queued, or recently completed runs (fallback: cluster creation time). -**Automatic:** The `ibmc-cluster-auto-teardown.yml` workflow runs daily at 02:00 UTC as a safety net, calling the teardown workflow for the default cluster name. +**Teardown implementation:** Uninstalls Helm releases `kubevirt-plugin-ci` (scale set) and `arc` (controller) when possible, deletes the ROKS cluster, then optionally removes offline GitHub runners labeled `kubevirt-plugin-ci` using `BOT_PAT`. ## ARC on OpenShift vs [na-launch/github-arc](https://github.com/na-launch/github-arc/blob/main/README.md) @@ -236,21 +248,20 @@ You do **not** need to re-apply `ci-scripts/arc/arc-openshift-scc.yaml`. 
## Scripts -| Script | Purpose | -| ---------------------------------- | --------------------------------------------------------------------------------- | -| `install-hco.sh` | Installs HCO operator, HPP storage, and virtctl | -| `arc/setup-dind-mirror.sh` | Mirror `docker:dind` to internal registry; write `generated/arc-dind-replace.env` | -| `images/setup-arc-runner-image.sh` | OpenShift binary build for custom ARC runner image | -| `arc/install-arc-controller.sh` | SCC + Helm `gha-runner-scale-set-controller` (once per cluster) | -| `arc/install-runner-scale-set.sh` | Helm `gha-runner-scale-set`, SCC bind, `arc-runner-rbac.yaml` | -| `arc/uninstall-arc.sh` | Reverse of install: Helm uninstall scale set + controller (same env vars) | -| `arc/README.md` | ARC on OpenShift setup guide | -| `check-cluster-health.sh` | Verifies cluster, HCO, ARC, storage, console; optional GitHub runner check | -| `check-roks-cluster-state.sh` | Waits until ROKS cluster is usable (used by setup workflow) | -| `resolve-console-image.sh` | Emits `CONSOLE_IMAGE` tag **x.y** from `ClusterVersion` for off-cluster console | -| `start-plugin-container.sh` | Runs plugin image with TLS + `nginx-9443.conf` (Docker dind–safe cert paths) | -| `start-console.sh` | Runs `origin-console` off-cluster; `BRIDGE_PLUGIN_PROXY` + kubevirt API route | -| `nginx-9443.conf` | Nginx config for plugin HTTPS (mounted into plugin container in POC test2) | +| Script | Purpose | +| --------------------------------- | --------------------------------------------------------------------------------- | +| `install-hco.sh` | Installs HCO operator, HPP storage, and virtctl | +| `arc/setup-dind-mirror.sh` | Mirror `docker:dind` to internal registry; write `generated/arc-dind-replace.env` | +| `arc/setup-runner-image.sh` | OpenShift binary build for custom ARC runner image | +| `arc/install-arc-controller.sh` | SCC + Helm `gha-runner-scale-set-controller` (once per cluster) | +| `arc/install-runner-scale-set.sh` | 
Helm `gha-runner-scale-set`, SCC bind, `arc-runner-rbac.yaml` | +| `arc/README.md` | ARC on OpenShift setup guide | +| `check-cluster-health.sh` | Verifies cluster, HCO, ARC, storage, console; optional GitHub runner check | +| `check-roks-cluster-state.sh` | Waits until ROKS cluster is usable (used by setup workflow) | +| `resolve-console-image.sh` | Emits `CONSOLE_IMAGE` tag **x.y** from `ClusterVersion` for off-cluster console | +| `start-plugin-container.sh` | Runs plugin image with TLS + `nginx-9443.conf` (Docker dind–safe cert paths) | +| `start-console.sh` | Runs `origin-console` off-cluster; `BRIDGE_PLUGIN_PROXY` + kubevirt API route | +| `nginx-9443.conf` | Nginx config for plugin HTTPS (mounted into plugin container in POC test2) | ### Script Configuration @@ -268,19 +279,14 @@ Key defaults: - `ARC_SCALE_SET_LABELS` (optional multilabel; requires matching `runs-on` array in workflows) - Additional scale sets: run only **`ci-scripts/arc/install-runner-scale-set.sh`** (skip **`ci-scripts/arc/install-arc-controller.sh`**) -## Follow-up work - -See [docs/HOT_CLUSTER_FUTURE_WORK.md](../docs/HOT_CLUSTER_FUTURE_WORK.md) for RBAC hardening, FIPS, ci-env-controller setup gap, and workflow hygiene items. - -- **Use `kubectl` + `_cluster-helpers.sh` to install `oc`** — Instead of downloading `oc` from `mirror.openshift.com` via `install-oc-client.sh`, use the `kubectl` binary already available on GitHub runners together with `_cluster-helpers.sh` `resolve_cli_downloads()` to fetch `oc` directly from the cluster's `ConsoleCLIDownload` resources. This avoids the external mirror dependency and ensures the binary matches the running cluster version exactly. -- **Harden `check-cluster-health.sh`** — The health check script may need adjustments once runtime cluster configuration issues are discovered during real usage. Revisit checks and thresholds based on operational experience. +## POC: immediate next steps (toward stable green runs) -Quick checklist: - -1. 
**Health check first** — Run **Hot Cluster E2E** (or health-check job only) to isolate cluster/HCO issues from test-stack issues. -2. **ci-env-controller** — Install once on the cluster if not already present (`./dev/ci-env.sh`). -3. **ARC on org repo** — Runners must register to `kubevirt-ui/kubevirt-plugin`, not a fork. -4. **Auto-teardown** — Confirm idle detection watches `hot-cluster-e2e.yml` and `hot-cluster-e2e-run.yml`. +1. **Plugin image supply chain (`poc-e2e-ci-test2.yml`)** — Replace the hard-coded `KUBEVIRT_PLUGIN_IMAGE` (currently a fixed `ttl.sh/...` tag) with a per-run or per-SHA tag (e.g. uncomment the `github.run_id`-style pattern), or build on every run and push to a registry your cluster/runner can pull. Ensure the **skopeo inspect** skip path does not mask a broken or stale image. +2. **Align Cypress coverage with stability** — `tests/poc-gating.cy.ts` is intentionally smaller than full `tests/gating.cy.ts`; expand only after the off-cluster stack is reliable. Fix flaky specs (VM start/status waits, tab navigation) using the same patterns as local CI. +3. **Run variant A first for signal** — Use `poc-e2e-ci-test.yml` against a healthy cluster to separate **cluster/HCO** issues from **docker/console/plugin** issues in test2. +4. **Fork / ARC** — Variant A (`poc-e2e-ci-test.yml`) runs the health check on `ubuntu-latest` and is fork-safe; variant B (`poc-e2e-ci-test2.yml`) still requires a runner labeled `kubevirt-plugin-ci` and cannot run on forks without ARC registered. +5. **Workflow hygiene** — Add dependency caching to `poc-e2e-ci-test2.yml` (open TODO: use `actions/setup-node` with caching or an explicit cache step). Consider pinning `actions/checkout` major versions consistently across workflows. +6. **Verify auto-teardown** — Confirm scheduled **IBM Cloud Hot Cluster Auto-Teardown** successfully dispatches **IBM Cloud Hot Cluster Teardown** (`workflow_id` must match `ibmc-cluster-teardown.yml`). 
## Production and hardening review (before treating POC patterns as prod) @@ -292,6 +298,7 @@ Quick checklist: | **`ttl.sh` or ephemeral public registries** | Ephemeral tags, no provenance, rate/abuse limits | Internal registry + image signing, digest pinning | | **Skip `npm audit` / `--ignore-scripts`** | Supply-chain and lifecycle scripts not run | Revisit for production pipelines; use lockfile + audited base images | | **Cluster-scoped mutations in `test-setup.sh`** | Variant A may patch shared ConfigMaps | Prefer namespaced fixtures or dedicated test clusters | +| **Ghost runner cleanup via `BOT_PAT`** | PAT scope and rotation | GitHub App or org-level runner management; least privilege | | **Auto-teardown idle heuristic** | Monitors only the two E2E test workflows; a cluster used by other workflows may be torn down early | Tie to runner job queue or explicit "last test" workflow | | **Classic ROKS only in setup workflow** | Not IBM Cloud VPC Gen2 path | Add a parallel path or doc if prod standardizes on VPC | @@ -301,7 +308,14 @@ Quick checklist: ## Cost Control -Bare metal nodes on IBM Cloud are expensive. Tear down the cluster manually via **IBM Cloud Hot Cluster Teardown** when testing is complete. Automatic idle teardown is planned in PR #4099. +Bare metal nodes on IBM Cloud are expensive. The auto-teardown workflow provides automatic cost control: + +- Runs every 30 minutes via cron +- Checks if any CI jobs are in-progress or queued +- If idle for more than 2 hours, triggers the teardown workflow +- Worst case: an idle cluster runs ~2.5 hours before teardown + +**Important**: Always verify the cluster has been torn down if you're done testing. The auto-teardown is a safety net, not a substitute for manual cleanup. ## Troubleshooting @@ -339,7 +353,7 @@ Bare metal nodes on IBM Cloud are expensive. 
Tear down the cluster manually via - Go to repository Settings → Actions → Runners - Manually delete any offline runners -- Or run the teardown workflow again (Helm uninstall deregisters runners) +- Or run the teardown workflow again (it includes ghost runner cleanup) ### ARC runner `oc` / `kubectl` permissions diff --git a/ci-scripts/arc/README.md b/ci-scripts/arc/README.md index c839beaa05..55d3a0e301 100644 --- a/ci-scripts/arc/README.md +++ b/ci-scripts/arc/README.md @@ -9,7 +9,7 @@ All scripts expect **`oc login`** to an OpenShift cluster with permissions to cr Run from the **repository root** (paths below assume that). 1. **`setup-dind-mirror.sh`** — `oc import-image` **`docker:dind`** → `image-registry.../arc-runners/arc-docker-dind:dind` and write **`ci-scripts/generated/arc-dind-replace.env`**. Use `SKIP_DIND_MIRROR=1` only if dind is provided via **`ImageContentSourcePolicy`** or another cluster mirror (no import from Docker Hub). -2. **`ci-scripts/images/setup-arc-runner-image.sh`** — OpenShift `BuildConfig` binary build from [`images/arc-runner/Dockerfile`](../images/arc-runner/Dockerfile) → `arc-runner-custom:latest` in the internal registry. Prints **`IMAGE_REF=...`** for automation. +2. **`setup-runner-image.sh`** — OpenShift `BuildConfig` binary build from [`runner-image/Dockerfile`](runner-image/Dockerfile) → `arc-runner-custom:latest` in the internal registry. Prints **`IMAGE_REF=...`** for automation. 3. **`install-arc-controller.sh`** — Once per cluster: namespace `arc-systems`, apply **`arc-openshift-scc.yaml`**, Helm **`gha-runner-scale-set-controller`**. 4. 
**`install-runner-scale-set.sh`** — Per scale set: namespace `arc-runners`, Helm **`gha-runner-scale-set`** with GitHub auth, optional **`ARC_RUNNER_IMAGE`**, Helm **post-renderer** (always for dind: injects **`--storage-driver=vfs`** for OpenShift; optional **`docker:dind` → mirror** when `arc-dind-replace.env` exists or **`ARC_DIND_INTERNAL_IMAGE`** is set), **`oc policy add-role-to-user system:openshift:scc:github-arc`** on the runner SA, apply **`arc-runner-rbac.yaml`** (unless `SKIP_ARC_RUNNER_RBAC=1`). @@ -19,7 +19,7 @@ export ARC_APP_ID="..." ARC_APP_INSTALL_ID="..." ARC_APP_PRIVATE_KEY="$(cat app. # optional: export ARC_RUNNER_IMAGE after setup-runner-image prints IMAGE_REF= ./ci-scripts/arc/setup-dind-mirror.sh -IMAGE_REF=$(./ci-scripts/images/setup-arc-runner-image.sh | grep '^IMAGE_REF=' | cut -d= -f2-) +IMAGE_REF=$(./ci-scripts/arc/setup-runner-image.sh | grep '^IMAGE_REF=' | cut -d= -f2-) export ARC_RUNNER_IMAGE="${IMAGE_REF}" ./ci-scripts/arc/install-arc-controller.sh @@ -40,7 +40,7 @@ Details are in each script’s header comments. ## GitHub configuration - **GitHub App** (recommended): repository **Administration: Read and write**, organization **Self-hosted runners: Read and write**. Install the app on the target repo/org; use App ID, installation ID, and private key PEM. -- **PAT**: fine-grained or classic with sufficient repo + runner permissions (see [ci-scripts/README.md](../README.md#required-github-secrets)). +- **PAT**: fine-grained or classic with sufficient repo + runner permissions (see [POC_HOT_CLUSTER_CI.md](../../POC_HOT_CLUSTER_CI.md)). Workflows must use `runs-on:` labels that match **`RUNNER_SCALE_SET_NAME`** (default **`kubevirt-plugin-ci`**) or every label in **`ARC_SCALE_SET_LABELS`** if set. @@ -61,7 +61,6 @@ You do **not** need to re-apply **`arc-openshift-scc.yaml`**. 
| `arc-runner-scale-set.pod.yaml` | Helm values fragment for the runner container (volumes, securityContext) | | `arc-helm-helpers.sh` | Shared Helm/auth helpers | | `arc-dind-post-render.sh` | Helm post-renderer: OpenShift dind `vfs` storage driver; optional `docker:dind` swap via `../generated/arc-dind-replace.env` | -| `uninstall-arc.sh` | Helm uninstall of scale set + controller (reverse of install; same env vars) | | `runner-image/Dockerfile` | Custom runner (Node, kubectl, oc, virtctl, jq) | Generated **`ci-scripts/generated/arc-dind-replace.env`** is gitignored; it is produced by **`setup-dind-mirror.sh`** or by **`install-runner-scale-set.sh`** when **`ARC_DIND_INTERNAL_IMAGE`** is set. diff --git a/ci-scripts/arc/arc-dind-post-render.sh b/ci-scripts/arc/arc-dind-post-render.sh new file mode 100755 index 0000000000..5fa55301db --- /dev/null +++ b/ci-scripts/arc/arc-dind-post-render.sh @@ -0,0 +1,40 @@ +#!/usr/bin/env bash +# Helm post-renderer for gha-runner-scale-set + containerMode dind on OpenShift. +# +# 1) Optional: replace hardcoded docker:dind with internal registry image when +# ci-scripts/generated/arc-dind-replace.env exists (setup-dind-mirror.sh, etc.). +# 2) Always: append --storage-driver=vfs to dockerd args so the inner daemon +# does not use overlay on top of the pod's overlay (containerd EINVAL on mount). +# +# Ref: ARC chart dind args in actions-runner-controller _helpers.tpl (dind-container). 
+set -euo pipefail +ARC_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +ENV_FILE="${ARC_ROOT}/../generated/arc-dind-replace.env" + +tmp="$(mktemp)" +trap 'rm -f "${tmp}"' EXIT +cat >"${tmp}" + +ARC_DIND_INTERNAL_IMAGE="" +if [[ -f "${ENV_FILE}" ]]; then + # shellcheck source=/dev/null + source "${ENV_FILE}" +fi + +skip_vfs=0 +if grep -q 'storage-driver=vfs' "${tmp}"; then + skip_vfs=1 +fi + +while IFS= read -r line || [[ -n "${line}" ]]; do + if [[ -n "${ARC_DIND_INTERNAL_IMAGE:-}" && "${line}" == *docker:dind* ]]; then + printf '%s\n' "${line//docker:dind/${ARC_DIND_INTERNAL_IMAGE}}" + continue + fi + if [[ "${skip_vfs}" == 0 && "${line}" =~ ^([[:space:]]*)-\ --group=\$\(DOCKER_GROUP_GID\)$ ]]; then + printf '%s\n' "${line}" + printf '%s- --storage-driver=vfs\n' "${BASH_REMATCH[1]}" + continue + fi + printf '%s\n' "${line}" +done <"${tmp}" diff --git a/ci-scripts/arc/arc-helm-helpers.sh b/ci-scripts/arc/arc-helm-helpers.sh index 45d757f6a0..5ea9b8c4f7 100755 --- a/ci-scripts/arc/arc-helm-helpers.sh +++ b/ci-scripts/arc/arc-helm-helpers.sh @@ -38,3 +38,53 @@ arc_github_config_secret_helm_auth() { trap '[[ -n "${AUTH_VALUES_FILE:-}" && -f "${AUTH_VALUES_FILE}" ]] && rm -f "${AUTH_VALUES_FILE}"' EXIT return 0 } + +# +# Multilabel (ARC 0.14+): optional comma-separated ARC_SCALE_SET_LABELS — workflows must use +# runs-on: [label1, label2, ...] matching every label (see HOT_CLUSTER_CI.md). 
+# +arc_helm_append_scale_set_labels() { + local -n _helm_arr="${1:?helm args array name required}" + [[ -z "${ARC_SCALE_SET_LABELS:-}" ]] && return 0 + local json="[" + local first=1 + local lab + IFS=',' read -ra _arc_ssl <<< "${ARC_SCALE_SET_LABELS}" + for lab in "${_arc_ssl[@]}"; do + lab="${lab//[[:space:]]/}" + [[ -z "$lab" ]] && continue + [[ $first -eq 0 ]] && json+="," + first=0 + json+="\"${lab//\"/\\\"}\"" + done + json+="]" + if [[ $first -eq 1 ]]; then + return 0 + fi + echo "Scale set labels (multilabel): ${ARC_SCALE_SET_LABELS}" + _helm_arr+=(--set-json "scaleSetLabels=${json}") +} + +# +# Post-renders gha-runner-scale-set dind manifests: optional docker:dind → mirror +# (ci-scripts/generated/arc-dind-replace.env), and always injects --storage-driver=vfs +# so dockerd works on OpenShift (nested overlay otherwise fails with EINVAL). +# +# Usage: arc_helm_append_dind_post_renderer RUNNER_SET_ARGS "${ARC_DIR}" "${CI_SCRIPTS_DIR}" +# Env: ARC_USE_DIND_POST_RENDER=0 to disable. +# +arc_helm_append_dind_post_renderer() { + local -n _helm_arr="${1:?helm args array name required}" + local arc_dir="${2:?arc directory required}" + local ci_scripts_dir="${3:?ci-scripts directory required}" + local env_file="${ci_scripts_dir}/generated/arc-dind-replace.env" + local pr_script="${arc_dir}/arc-dind-post-render.sh" + [[ "${ARC_USE_DIND_POST_RENDER:-1}" == "0" ]] && return 0 + [[ ! 
-f "${pr_script}" ]] && return 0 + if [[ -f "${env_file}" ]]; then + echo "Helm post-renderer: docker:dind mirror (${env_file}) + dind vfs (OpenShift)" + else + echo "Helm post-renderer: dind --storage-driver=vfs (OpenShift / nested overlay)" + fi + _helm_arr+=(--post-renderer "${pr_script}") +} diff --git a/ci-scripts/arc/arc-openshift-scc.yaml b/ci-scripts/arc/arc-openshift-scc.yaml index 38c260cf64..598e359991 100644 --- a/ci-scripts/arc/arc-openshift-scc.yaml +++ b/ci-scripts/arc/arc-openshift-scc.yaml @@ -1,25 +1,27 @@ # OpenShift Security Context Constraint and ClusterRole for GitHub ARC runners. # # The runner container uses UID 1001 / GID 123 via Helm template.securityContext. -# No privileged containers or privilege escalation is needed; the CI test stack -# (console + plugin) runs as separate unprivileged pods via the ci-test-stack -# Helm chart. +# With containerMode.type=dind, the chart adds a privileged docker:dind sidecar that +# must run as root and start dockerd — so this SCC allows privileged containers and +# RunAsAny for UIDs (stricter MustRunAs/1001 would block the dind container). # # Ref: https://developers.redhat.com/articles/2025/02/17/how-securely-deploy-github-arc-openshift +# ARC dind template: actions-runner-controller charts gha-runner-scale-set +# --- apiVersion: security.openshift.io/v1 kind: SecurityContextConstraints metadata: name: github-arc annotations: - kubernetes.io/description: 'ARC runners: main container as UID 1001 (Helm), non-root, unprivileged.' + kubernetes.io/description: 'ARC runners: main container as UID 1001 (Helm), Docker-in-Docker privileged sidecar. RunAsAny required for dind root + runner 1001.' 
allowHostDirVolumePlugin: false allowHostIPC: false allowHostNetwork: false allowHostPID: false allowHostPorts: false -allowPrivilegeEscalation: false -allowPrivilegedContainer: false +allowPrivilegeEscalation: true +allowPrivilegedContainer: true allowedCapabilities: null defaultAddCapabilities: null fsGroup: @@ -28,7 +30,7 @@ groups: [] priority: null readOnlyRootFilesystem: false runAsUser: - type: MustRunAsNonRoot + type: RunAsAny seLinuxContext: type: MustRunAs supplementalGroups: diff --git a/ci-scripts/arc/arc-runner-rbac.yaml b/ci-scripts/arc/arc-runner-rbac.yaml index 3050e3db6b..93c56061dd 100644 --- a/ci-scripts/arc/arc-runner-rbac.yaml +++ b/ci-scripts/arc/arc-runner-rbac.yaml @@ -1,6 +1,5 @@ -# Grant the ARC runner scale set ServiceAccount CI permissions. -# gha-runner-scale-set creates: -gha-rs-no-permission -# (default: kubevirt-plugin-ci-gha-rs-no-permission). +# Grant the ARC runner scale set ServiceAccount CI permissions (oc, test namespaces, KubeVirt). +# gha-runner-scale-set creates: -gha-rs-no-permission (default: kubevirt-plugin-ci-gha-rs-no-permission). # # Applied automatically by ci-scripts/arc/install-runner-scale-set.sh. # Subject name + namespace substituted for RUNNER_SCALE_SET_NAME / ARC_RUNNERS_NS. @@ -9,14 +8,12 @@ # # To skip cluster RBAC (custom bindings): SKIP_ARC_RUNNER_RBAC=1 # -# If RUNNER_SCALE_SET_NAME differs from defaults and you apply this file by hand, -# edit subjects or use: +# If RUNNER_SCALE_SET_NAME differs from defaults and you apply this file by hand, edit subjects or use: # oc create clusterrolebinding arc-runner-ci-custom --clusterrole=arc-runner-ci \ # --serviceaccount=:-gha-rs-no-permission # -# NOTE: The runner's ConfigMap access in the ci-env namespace is granted via a -# namespaced RoleBinding created by ci-scripts/ci-env/install-ci-env-controller.sh, -# not by this ClusterRole. +# Alternative (cluster-admin): bind cluster-admin to the same ServiceAccount instead of arc-runner-ci. 
+# --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole @@ -25,34 +22,44 @@ metadata: labels: app.kubernetes.io/component: arc-runner-rbac rules: - # --- Cluster reads (console URL, version logging) --- + # Read cluster and console config (for oc cluster-info, console URL) - apiGroups: [''] - resources: ['nodes'] + resources: ['nodes', 'namespaces'] verbs: ['get', 'list'] - apiGroups: ['config.openshift.io'] resources: ['consoles', 'clusterversions', 'ingresses'] verbs: ['get', 'list'] - - # --- Diagnostics (pod logs, events in test namespaces) --- + # Create/delete test namespaces and manage resources for Cypress - apiGroups: [''] - resources: ['pods', 'pods/log', 'events'] - verbs: ['get', 'list'] - - # --- Cypress cy.exec runs: oc patch virtualmachine --- - - apiGroups: ['kubevirt.io'] - resources: ['virtualmachines'] - verbs: ['get', 'list', 'watch', 'patch'] - - # --- check-runner job: HCO / operator version logging --- + resources: + [ + 'namespaces', + 'pods', + 'pods/log', + 'services', + 'secrets', + 'configmaps', + 'events', + 'serviceaccounts', + 'persistentvolumeclaims', + ] + verbs: ['get', 'list', 'watch', 'create', 'update', 'patch', 'delete', 'deletecollection'] + # Read HCO operand CRs for version logging (Log HCO and managed operator versions step) - apiGroups: ['operators.coreos.com'] resources: ['clusterserviceversions'] verbs: ['get', 'list'] - apiGroups: ['kubevirt.io'] - resources: ['kubevirts'] - verbs: ['get', 'list'] + resources: + [ + 'kubevirts', + 'virtualmachines', + 'virtualmachineinstances', + 'virtualmachineinstancemigrations', + ] + verbs: ['get', 'list', 'watch', 'create', 'update', 'patch', 'delete'] - apiGroups: ['cdi.kubevirt.io'] - resources: ['cdis'] - verbs: ['get', 'list'] + resources: ['cdis', 'datavolumes'] + verbs: ['get', 'list', 'watch', 'create', 'update', 'patch', 'delete'] - apiGroups: ['ssp.kubevirt.io'] resources: ['ssps'] verbs: ['get', 'list'] @@ -62,6 +69,12 @@ rules: - apiGroups: 
['hostpathprovisioner.kubevirt.io'] resources: ['hostpathprovisioners'] verbs: ['get', 'list'] + - apiGroups: ['snapshot.storage.k8s.io'] + resources: ['volumesnapshots'] + verbs: ['get', 'list', 'watch', 'create', 'update', 'patch', 'delete'] + - apiGroups: ['subresources.kubevirt.io'] + resources: ['virtualmachineinstances/console', 'virtualmachineinstances/vnc'] + verbs: ['get'] --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding diff --git a/ci-scripts/arc/arc-runner-scale-set.pod.yaml b/ci-scripts/arc/arc-runner-scale-set.pod.yaml index 9402169045..dd01930843 100644 --- a/ci-scripts/arc/arc-runner-scale-set.pod.yaml +++ b/ci-scripts/arc/arc-runner-scale-set.pod.yaml @@ -1,13 +1,28 @@ -# gha-runner-scale-set Helm values fragment: runner pod template (OpenShift). +# gha-runner-scale-set Helm values fragment: runner pod template (OpenShift + dind). # Used by ci-scripts/arc/install-runner-scale-set.sh. # -# The CI test stack (console + plugin) runs as separate cluster pods via the -# ci-test-stack Helm chart, so no Docker-in-Docker sidecar is needed. +# With containerMode.type=dind, the chart merges this with docker:dind, initContainers, +# DOCKER_HOST, and dind volume mounts. Only the container named "runner" is customized here. # -# Default image is pinned upstream until ARC_RUNNER_IMAGE / --set overrides: +# Nested overlay: dockerd's default overlay graph on the pod's overlayfs fails (EINVAL). +# arc-dind-post-render.sh injects --storage-driver=vfs into the chart's dockerd args; re-run +# install-runner-scale-set.sh after upgrading the post-renderer (ARC_USE_DIND_POST_RENDER=0 skips it). +# +# docker:dind: the upstream chart hardcodes image docker:dind (no Helm value). 
This repo expects +# setup-dind-mirror.sh to mirror docker:dind → internal ImageStream arc-docker-dind:dind and write +# ci-scripts/generated/arc-dind-replace.env; install-runner-scale-set.sh runs Helm with +# --post-renderer arc-dind-post-render.sh when that file exists (or ARC_DIND_INTERNAL_IMAGE). +# SKIP_DIND_MIRROR=1 is for clusters that replace dind via ImageContentSourcePolicy / other mirroring. +# +# Default image is upstream until ARC_RUNNER_IMAGE / --set overrides: # --set template.spec.containers[0].image=/... # # Ref: https://github.com/actions/actions-runner-controller/tree/master/charts/gha-runner-scale-set +# +# ARC 0.14+ (chart 0.14.0): multilabel via scaleSetLabels (see ARC_SCALE_SET_LABELS / examples/), +# resourceMeta for listener/RBAC metadata, listener defaults to kubernetes.io/os: linux. +# Experimental chart exposes runner.dind.container.image to avoid docker:dind post-rendering — +# see HOT_CLUSTER_CI.md if you migrate off the stable chart. --- template: spec: @@ -20,11 +35,18 @@ template: emptyDir: {} containers: - name: runner - image: ghcr.io/actions/actions-runner:2.335.1 + image: ghcr.io/actions/actions-runner:latest command: - '/home/runner/run.sh' + env: + # Workaround for FIPS-enabled clusters (see POC_HOT_CLUSTER_CI.md). + - name: OPENSSL_FORCE_FIPS_MODE + value: '0' + # Workaround for FIPS-enabled clusters (see POC_HOT_CLUSTER_CI.md). + - name: GOLANG_FIPS + value: '0' securityContext: - allowPrivilegeEscalation: false + allowPrivilegeEscalation: true capabilities: drop: - ALL diff --git a/ci-scripts/arc/install-arc-controller.sh b/ci-scripts/arc/install-arc-controller.sh index d6ae4017ac..1cd3697298 100755 --- a/ci-scripts/arc/install-arc-controller.sh +++ b/ci-scripts/arc/install-arc-controller.sh @@ -13,8 +13,6 @@ set -euo pipefail ARC_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" CI_SCRIPTS_DIR="$(cd "${ARC_DIR}/.." 
&& pwd)" -source "${CI_SCRIPTS_DIR}/_cluster-helpers.sh" -verify_oc ARC_CONTROLLER_NS="${ARC_CONTROLLER_NS:-arc-systems}" ARC_CONTROLLER_INSTALL_NAME="${ARC_CONTROLLER_INSTALL_NAME:-arc}" @@ -24,33 +22,36 @@ ARC_VERSION="${ARC_VERSION:-0.14.0}" echo "=== ARC controller installation (OpenShift) ===" echo " ARC_CONTROLLER_NS: ${ARC_CONTROLLER_NS}" echo " ARC_CONTROLLER_INSTALL_NAME: ${ARC_CONTROLLER_INSTALL_NAME}" -echo " ARC_HELM_REPO: ${ARC_HELM_REPO}" echo " ARC_VERSION: ${ARC_VERSION}" +echo " ARC_HELM_REPO: ${ARC_HELM_REPO}" echo "" +if ! oc get clusterversion version &>/dev/null; then + echo "ERROR: This script targets OpenShift only." + echo " Expected cluster-scoped ClusterVersion 'version'; use 'oc login' to an OpenShift cluster." + exit 1 +fi + echo "Creating namespace ${ARC_CONTROLLER_NS}..." oc create namespace "${ARC_CONTROLLER_NS}" --dry-run=client -o yaml | oc apply -f - echo "Applying ARC SCC and ClusterRole (github-arc)..." oc apply -f "${ARC_DIR}/arc-openshift-scc.yaml" -CONTROLLER_ARGS=() +CONTROLLER_SA_NAME="${ARC_CONTROLLER_INSTALL_NAME}-gha-rs-controller" +CONTROLLER_ARGS=(--namespace "${ARC_CONTROLLER_NS}") if [[ -n "${ARC_VERSION}" && "${ARC_VERSION}" != "latest" ]]; then CONTROLLER_ARGS+=(--version "${ARC_VERSION}") fi - -CONTROLLER_SA_NAME="${ARC_CONTROLLER_INSTALL_NAME}-gha-rs-controller" CONTROLLER_ARGS+=(--set "serviceAccount.name=${CONTROLLER_SA_NAME}") echo "Installing ARC controller (Helm release: ${ARC_CONTROLLER_INSTALL_NAME})..." 
-helm upgrade \ - "${ARC_CONTROLLER_INSTALL_NAME}" \ - "${ARC_HELM_REPO}/gha-runner-scale-set-controller" \ - --install \ - --namespace "${ARC_CONTROLLER_NS}" \ +helm upgrade --install "${ARC_CONTROLLER_INSTALL_NAME}" \ "${CONTROLLER_ARGS[@]}" \ - --wait --timeout 5m + "${ARC_HELM_REPO}/gha-runner-scale-set-controller" \ + --wait echo "" echo "=== ARC controller installation complete ===" +echo " Next: ./ci-scripts/arc/install-runner-scale-set.sh (requires ARC_CONFIG_URL + GitHub auth)" echo "" diff --git a/ci-scripts/arc/install-runner-scale-set.sh b/ci-scripts/arc/install-runner-scale-set.sh index 81bd1cd140..f1ae77b21a 100755 --- a/ci-scripts/arc/install-runner-scale-set.sh +++ b/ci-scripts/arc/install-runner-scale-set.sh @@ -1,7 +1,7 @@ #!/bin/bash # -# Install gha-runner-scale-set (OpenShift): runner namespace, Helm release, -# SCC bind, CI RBAC. +# Install gha-runner-scale-set (OpenShift): runner namespace, Helm release, optional dind +# post-render (mirror file or ARC_DIND_INTERNAL_IMAGE), SCC bind, CI RBAC. # Requires install-arc-controller.sh (or equivalent controller + SCC) already applied. # # Required environment variables: @@ -18,8 +18,13 @@ # ARC_CONTROLLER_INSTALL_NAME (default: arc) # ARC_RUNNERS_NS (default: arc-runners) # ARC_VERSION Helm chart version (default: 0.14.0); set to "latest" to omit --version +# ARC_SCALE_SET_LABELS Optional comma-separated multilabel (ARC 0.14+) +# CONTAINER_MODE default dind; set to "none" to disable Docker-in-Docker +# ARC_RUNNER_EXTRA_VALUES Optional second Helm values file (merged after pod.yaml) # ARC_RUNNER_IMAGE If set, use this image for the runner container -# ARC_RUNNER_EXTRA_VALUES Optional extra Helm values file (e.g. 
FIPS workarounds) +# ARC_DIND_INTERNAL_IMAGE If set, writes ci-scripts/generated/arc-dind-replace.env for this run +# (alternative to setup-dind-mirror.sh) +# ARC_USE_DIND_POST_RENDER Default 1; set to 0 to skip post-renderer # SKIP_ARC_RUNNER_RBAC Set to 1 to skip applying ci-scripts/arc/arc-runner-rbac.yaml # # Pod template fragment: ci-scripts/arc/arc-runner-scale-set.pod.yaml @@ -27,9 +32,6 @@ set -euo pipefail ARC_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" CI_SCRIPTS_DIR="$(cd "${ARC_DIR}/.." && pwd)" -source "${CI_SCRIPTS_DIR}/_cluster-helpers.sh" -verify_oc - source "${ARC_DIR}/arc-helm-helpers.sh" RUNNER_POD_VALUES="${ARC_DIR}/arc-runner-scale-set.pod.yaml" @@ -39,38 +41,40 @@ if [[ ! -f "${RUNNER_POD_VALUES}" ]]; then fi ARC_CONFIG_URL="${ARC_CONFIG_URL:?ARC_CONFIG_URL is required}" +RUNNER_SCALE_SET_NAME="${RUNNER_SCALE_SET_NAME:-kubevirt-plugin-ci}" +MIN_RUNNERS="${MIN_RUNNERS:-0}" +MAX_RUNNERS="${MAX_RUNNERS:-5}" ARC_CONTROLLER_NS="${ARC_CONTROLLER_NS:-arc-systems}" ARC_CONTROLLER_INSTALL_NAME="${ARC_CONTROLLER_INSTALL_NAME:-arc}" ARC_RUNNERS_NS="${ARC_RUNNERS_NS:-arc-runners}" -ARC_HELM_REPO="oci://ghcr.io/actions/actions-runner-controller-charts" ARC_VERSION="${ARC_VERSION:-0.14.0}" -RUNNER_SCALE_SET_NAME="${RUNNER_SCALE_SET_NAME:-kubevirt-plugin-ci}" -MIN_RUNNERS="${MIN_RUNNERS:-0}" -MAX_RUNNERS="${MAX_RUNNERS:-5}" -if ! 
[[ "${MIN_RUNNERS}" =~ ^[0-9]+$ && "${MAX_RUNNERS}" =~ ^[0-9]+$ ]]; then - echo "ERROR: MIN_RUNNERS and MAX_RUNNERS must be non-negative integers" - exit 1 -fi -if (( MIN_RUNNERS > MAX_RUNNERS )); then - echo "ERROR: MIN_RUNNERS cannot be greater than MAX_RUNNERS" - exit 1 -fi -CONTROLLER_SA_NAME="${ARC_CONTROLLER_INSTALL_NAME}-gha-rs-controller" +CONTAINER_MODE="${CONTAINER_MODE:-dind}" +[[ "${CONTAINER_MODE}" == "none" || "${CONTAINER_MODE}" == "disabled" ]] && CONTAINER_MODE="" +ARC_HELM_REPO="oci://ghcr.io/actions/actions-runner-controller-charts" echo "=== ARC runner scale set installation (OpenShift) ===" echo " ARC_CONFIG_URL: ${ARC_CONFIG_URL}" +echo " RUNNER_SCALE_SET_NAME: ${RUNNER_SCALE_SET_NAME}" +echo " MIN_RUNNERS / MAX_RUNNERS: ${MIN_RUNNERS} / ${MAX_RUNNERS}" echo " ARC_CONTROLLER_NS: ${ARC_CONTROLLER_NS}" echo " ARC_CONTROLLER_INSTALL_NAME: ${ARC_CONTROLLER_INSTALL_NAME}" echo " ARC_RUNNERS_NS: ${ARC_RUNNERS_NS}" -echo " ARC_RUNNER_IMAGE: ${ARC_RUNNER_IMAGE:-(not set, will use default)}" -echo " ARC_HELM_REPO: ${ARC_HELM_REPO}" echo " ARC_VERSION: ${ARC_VERSION}" -echo " RUNNER_SCALE_SET_NAME: ${RUNNER_SCALE_SET_NAME}" -echo " MIN_RUNNERS / MAX_RUNNERS: ${MIN_RUNNERS} / ${MAX_RUNNERS}" +echo " CONTAINER_MODE: ${CONTAINER_MODE:-"(none — no dind)"}" echo " Runner pod values: ${RUNNER_POD_VALUES}" -echo " Controller SA name: ${CONTROLLER_SA_NAME}" echo "" +if ! oc get clusterversion version &>/dev/null; then + echo "ERROR: This script targets OpenShift only." + exit 1 +fi + +if [[ -n "${ARC_DIND_INTERNAL_IMAGE:-}" && "${ARC_USE_DIND_POST_RENDER:-1}" != "0" ]]; then + mkdir -p "${CI_SCRIPTS_DIR}/generated" + printf 'ARC_DIND_INTERNAL_IMAGE=%s\n' "${ARC_DIND_INTERNAL_IMAGE}" > "${CI_SCRIPTS_DIR}/generated/arc-dind-replace.env" + echo "Wrote ${CI_SCRIPTS_DIR}/generated/arc-dind-replace.env from ARC_DIND_INTERNAL_IMAGE" +fi + echo "Creating namespace ${ARC_RUNNERS_NS}..." 
oc create namespace "${ARC_RUNNERS_NS}" --dry-run=client -o yaml | oc apply -f - @@ -79,6 +83,8 @@ if ! arc_github_config_secret_helm_auth AUTH_ARGS; then exit 1 fi +CONTROLLER_SA_NAME="${ARC_CONTROLLER_INSTALL_NAME}-gha-rs-controller" + RUNNER_SET_ARGS=( --set "githubConfigUrl=${ARC_CONFIG_URL}" --set "minRunners=${MIN_RUNNERS}" @@ -92,26 +98,28 @@ if [[ -n "${ARC_RUNNER_IMAGE:-}" ]]; then echo "Using runner image from ARC_RUNNER_IMAGE" RUNNER_SET_ARGS+=(--set-string "template.spec.containers[0].image=${ARC_RUNNER_IMAGE}") fi -if [[ -n "${ARC_RUNNER_EXTRA_VALUES:-}" ]]; then - if [[ ! -f "${ARC_RUNNER_EXTRA_VALUES}" ]]; then - echo "ERROR: ARC_RUNNER_EXTRA_VALUES file not found: ${ARC_RUNNER_EXTRA_VALUES}" - exit 1 - fi - echo "Merging extra Helm values from ${ARC_RUNNER_EXTRA_VALUES}" +if [[ -n "${CONTAINER_MODE:-}" ]]; then + echo "Enabling container mode: ${CONTAINER_MODE} (Docker-in-Docker)" + RUNNER_SET_ARGS+=(--set "containerMode.type=${CONTAINER_MODE}") +fi +if [[ -n "${ARC_RUNNER_EXTRA_VALUES:-}" && -f "${ARC_RUNNER_EXTRA_VALUES}" ]]; then + echo "Merging extra Helm values: ${ARC_RUNNER_EXTRA_VALUES}" RUNNER_SET_ARGS+=(--values "${ARC_RUNNER_EXTRA_VALUES}") fi if [[ -n "${ARC_VERSION}" && "${ARC_VERSION}" != "latest" ]]; then RUNNER_SET_ARGS+=(--version "${ARC_VERSION}") fi +arc_helm_append_scale_set_labels RUNNER_SET_ARGS +arc_helm_append_dind_post_renderer RUNNER_SET_ARGS "${ARC_DIR}" "${CI_SCRIPTS_DIR}" echo "Installing runner scale set '${RUNNER_SCALE_SET_NAME}'..." -helm upgrade \ - "${RUNNER_SCALE_SET_NAME}" \ - "${ARC_HELM_REPO}/gha-runner-scale-set" \ - --install \ +helm upgrade --install "${RUNNER_SCALE_SET_NAME}" \ --namespace "${ARC_RUNNERS_NS}" \ "${RUNNER_SET_ARGS[@]}" \ - --wait --timeout 5m + "${ARC_HELM_REPO}/gha-runner-scale-set" \ + --wait + +[[ -n "${AUTH_VALUES_FILE:-}" ]] && rm -f "${AUTH_VALUES_FILE}" RUNNER_SA="${RUNNER_SCALE_SET_NAME}-gha-rs-no-permission" echo "Binding SCC github-arc to runner ServiceAccount ${RUNNER_SA}..." 
@@ -124,31 +132,10 @@ if [[ "${SKIP_ARC_RUNNER_RBAC:-0}" != "1" ]]; then exit 1 fi echo "Applying runner CI RBAC (ClusterRole arc-runner-ci → ${RUNNER_SA} in ${ARC_RUNNERS_NS})..." - DEFAULT_RUNNER_SA='kubevirt-plugin-ci-gha-rs-no-permission' - DEFAULT_RUNNERS_NS='arc-runners' - if ! grep -qF " name: ${DEFAULT_RUNNER_SA}" "${RUNNER_RBAC_MANIFEST}"; then - echo "ERROR: ${RUNNER_RBAC_MANIFEST} missing placeholder ServiceAccount '${DEFAULT_RUNNER_SA}'" - exit 1 - fi - if ! grep -qF " namespace: ${DEFAULT_RUNNERS_NS}" "${RUNNER_RBAC_MANIFEST}"; then - echo "ERROR: ${RUNNER_RBAC_MANIFEST} missing placeholder namespace '${DEFAULT_RUNNERS_NS}'" - exit 1 - fi - RBAC_RENDERED=$( - sed \ - -e "s/^ name: ${DEFAULT_RUNNER_SA}\$/ name: ${RUNNER_SA}/" \ - -e "s/^ namespace: ${DEFAULT_RUNNERS_NS}\$/ namespace: ${ARC_RUNNERS_NS}/" \ - "${RUNNER_RBAC_MANIFEST}" - ) - if ! grep -qF " name: ${RUNNER_SA}" <<< "${RBAC_RENDERED}"; then - echo "ERROR: RBAC substitution failed for ServiceAccount '${RUNNER_SA}'" - exit 1 - fi - if ! grep -qF " namespace: ${ARC_RUNNERS_NS}" <<< "${RBAC_RENDERED}"; then - echo "ERROR: RBAC substitution failed for namespace '${ARC_RUNNERS_NS}'" - exit 1 - fi - echo "${RBAC_RENDERED}" | oc apply -f - + sed \ + -e "s/^ name: kubevirt-plugin-ci-gha-rs-no-permission\$/ name: ${RUNNER_SA}/" \ + -e "s/^ namespace: arc-runners\$/ namespace: ${ARC_RUNNERS_NS}/" \ + "${RUNNER_RBAC_MANIFEST}" | oc apply -f - echo " (Additional scale sets: set SKIP_ARC_RUNNER_RBAC=1 and bind arc-runner-ci per SA, e.g. oc adm policy add-cluster-role-to-user arc-runner-ci -z -n .)" else echo "SKIP_ARC_RUNNER_RBAC=1 — not applying ${RUNNER_RBAC_MANIFEST}" @@ -157,5 +144,6 @@ fi echo "" echo "=== Runner scale set installation complete ===" echo " runs-on: ${RUNNER_SCALE_SET_NAME}" -echo " To refresh runner image: re-run this script with ARC_RUNNER_IMAGE set (after setup-arc-runner-image.sh)." 
+echo " To refresh runner image: re-run this script with ARC_RUNNER_IMAGE set (after setup-runner-image.sh)." echo "" +echo "Disable dind: CONTAINER_MODE=none and re-run this script." diff --git a/ci-scripts/arc/runner-image/Dockerfile b/ci-scripts/arc/runner-image/Dockerfile new file mode 100644 index 0000000000..4a246386d5 --- /dev/null +++ b/ci-scripts/arc/runner-image/Dockerfile @@ -0,0 +1,111 @@ +# Custom GitHub Actions runner image for kubevirt-plugin-ci CI. +# Extends the official runner image with cli tools used in the CI pipeline (such as jq, curl, +# envsubst, Node.js, oc, and virtctl) and support for running Cypress tests. + +# +# https://github.com/actions/runner/blob/main/images/Dockerfile: the base image +# The base image includes the docker CLI; with ARC containerMode.type=dind, jobs use the +# docker:dind sidecar (DOCKER_HOST=unix:///var/run/docker.sock) for docker/container actions. +# +ARG RUNNER_BASE=ghcr.io/actions/actions-runner:latest +FROM ${RUNNER_BASE} + +USER root + +# Go-based yq (mikefarah/yq) — matches the version on GitHub-hosted ubuntu-latest runners. +# NOTE: apt's `yq` package is the Python-based yq (kislyuk), which has incompatible syntax. +ARG YQ_VERSION=v4.52.5 +# Versions (override with build-args to match cluster) +ARG NODE_VERSION=22 +# OpenShift client version to match cluster (e.g. 4.20) +ARG OC_VERSION=4.20 +# KubeVirt CLI version to match HCO install (e.g. v1.4.0) +ARG VIRTCTL_VERSION=v1.4.0 + +# Direct binary download URLs resolved from ConsoleCLIDownload by setup-runner-image.sh. +# When set, these take precedence over the static mirror/GitHub URLs above and guarantee +# the binaries match the live cluster. Left empty to use the static fallback URLs. +ARG OC_URL="" +ARG VIRTCTL_URL="" + +# curl and wget2 are both installed: +# curl — kept for CI workflow scripts at runtime +# wget2 — used for all build-time HTTPS downloads +# On Ubuntu, apt and wget2 use GnuTLS; curl and wget (1.x) use OpenSSL. 
On OpenShift build pods +# where the FIPS provider .so can't load (DSO error), all OpenSSL-based HTTPS fails. wget2/GnuTLS +# is unaffected, so every build-time binary download goes through wget2 instead. +RUN apt-get update \ + && apt-get install -y --no-install-recommends ca-certificates jq curl wget2 gettext-base \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +# Cypress 14+ headless (bundled Electron) — Linux deps for Ubuntu 24.04+ (noble). +# Official runner image is noble (see actions/runner images/Dockerfile: mcr.microsoft.com/dotnet/runtime-deps:8.0-noble). +# Matches Cypress docs: https://docs.cypress.io/guides/getting-started/installing-cypress#Linux — Ubuntu >=24.04 list. +# GitHub-hosted ubuntu-latest (runner-images Ubuntu2404) also includes xvfb, Chrome/Firefox, and extra fonts; for ARC we +# install the minimum Electron stack plus common fonts so screenshots/video match hosted runners more closely. +# unzip is already present in the upstream actions-runner image (needed for fast Cypress binary unpack). +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + fontconfig \ + fonts-liberation \ + libasound2t64 \ + libgbm-dev \ + libgtk-3-0t64 \ + libnss3 \ + libnotify-dev \ + libxss1 \ + libxtst6 \ + xauth \ + xvfb \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +# yq (Go/mikefarah) — wget2/GnuTLS avoids the OpenSSL FIPS DSO loading failure on hardened clusters +RUN wget2 -O /usr/local/bin/yq \ + "https://github.com/mikefarah/yq/releases/download/${YQ_VERSION}/yq_linux_amd64" \ + && chmod +x /usr/local/bin/yq + +# Node.js — the nodesource setup script internally calls curl (OpenSSL), which fails on FIPS clusters. +# Fetch the version list from nodejs.org with wget2 (GnuTLS), pick the latest v${NODE_VERSION}.x, +# then download and extract the pre-built binary tarball directly. 
+RUN NVER=$(wget2 -qO- "https://nodejs.org/dist/index.json" \ + | jq -r --argjson maj "${NODE_VERSION}" \ + '[.[] | select(.version | test("^v" + ($maj|tostring) + "\\."))] | .[0].version') \ + && wget2 -qO /tmp/node.tar.gz \ + "https://nodejs.org/dist/${NVER}/node-${NVER}-linux-x64.tar.gz" \ + && tar -xzf /tmp/node.tar.gz -C /usr/local --strip-components=1 \ + && rm /tmp/node.tar.gz + +# OpenShift client (oc) — use console download URL if resolved, else mirror.openshift.com stable-4.x. +# Console route serves a plain .tar; download to a temp file so GNU tar auto-detects the format. +# Fallback uses wget2 (GnuTLS) for the external HTTPS download. +RUN if [ -n "${OC_URL}" ]; then \ + wget2 -qO /tmp/oc-archive "${OC_URL}" \ + && tar -xf /tmp/oc-archive -C /usr/local/bin oc \ + && rm /tmp/oc-archive; \ + else \ + wget2 -qO- "https://mirror.openshift.com/pub/openshift-v4/clients/ocp/stable-${OC_VERSION}/openshift-client-linux.tar.gz" \ + | tar -xzf - -C /usr/local/bin; \ + fi + +# virtctl (KubeVirt CLI) — use console download URL if resolved, else GitHub releases. +# Console route serves a .tar.gz; fallback uses wget2 (GnuTLS) for the external HTTPS download. +RUN if [ -n "${VIRTCTL_URL}" ]; then \ + wget2 -qO /tmp/virtctl-archive "${VIRTCTL_URL}" \ + && tar -xf /tmp/virtctl-archive -C /usr/local/bin virtctl \ + && rm /tmp/virtctl-archive; \ + else \ + wget2 -qO /usr/local/bin/virtctl \ + "https://github.com/kubevirt/kubevirt/releases/download/${VIRTCTL_VERSION}/virtctl-${VIRTCTL_VERSION}-linux-amd64"; \ + fi \ + && chmod +x /usr/local/bin/virtctl + +USER runner + +# Default npm and tmp to HOME so npm ci works in restricted containers without workflow overrides +ENV KUBEVIRT_UI_PLUGIN_RUNNER=true \ + HOME=/home/runner \ + TMPDIR=/home/runner/.tmp + +# Keep the same entrypoint/CMD as the base image so ARC works unchanged. 
diff --git a/ci-scripts/arc/setup-dind-mirror.sh b/ci-scripts/arc/setup-dind-mirror.sh new file mode 100755 index 0000000000..d83bc3bcb6 --- /dev/null +++ b/ci-scripts/arc/setup-dind-mirror.sh @@ -0,0 +1,58 @@ +#!/bin/bash +# +# OpenShift only: mirror docker:dind into the internal registry for ARC (avoids Docker Hub +# rate limits on the chart's hardcoded docker:dind image). Writes +# ci-scripts/generated/arc-dind-replace.env for Helm post-rendering in ci-scripts/arc/install-runner-scale-set.sh. +# +# Optional environment variables: +# ARC_RUNNERS_NS (default: arc-runners) +# SKIP_DIND_MIRROR (default: 0) — set to 1 to skip mirroring and remove stale arc-dind-replace.env +# DIND_SOURCE_IMAGE (default: docker.io/library/docker:dind) — source for oc import-image +# +# Requires: oc logged into OpenShift. +# Note: import-image pulls from Docker Hub server-side; rate limits may require cluster pull secrets. + +set -euo pipefail +ARC_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +CI_SCRIPTS_DIR="$(cd "${ARC_DIR}/.." && pwd)" + +ARC_RUNNERS_NS="${ARC_RUNNERS_NS:-arc-runners}" +GENERATED_DIR="${CI_SCRIPTS_DIR}/generated" +SKIP_DIND_MIRROR="${SKIP_DIND_MIRROR:-0}" +DIND_SOURCE_IMAGE="${DIND_SOURCE_IMAGE:-docker.io/library/docker:dind}" +DIND_INTERNAL_REF="image-registry.openshift-image-registry.svc:5000/${ARC_RUNNERS_NS}/arc-docker-dind:dind" + +if ! oc get clusterversion version &>/dev/null; then + echo "ERROR: OpenShift cluster required (clusterversion.version not found)." + exit 1 +fi + +echo "=== Mirror docker:dind for ARC (internal registry) ===" +echo " ARC_RUNNERS_NS: ${ARC_RUNNERS_NS}" +echo " From: ${DIND_SOURCE_IMAGE}" +echo " To ISTag: arc-docker-dind:dind → ${DIND_INTERNAL_REF}" +echo "" + +oc create namespace "${ARC_RUNNERS_NS}" --dry-run=client -o yaml | oc apply -f - +mkdir -p "${GENERATED_DIR}" + +if [[ "${SKIP_DIND_MIRROR}" == "1" ]]; then + echo "SKIP_DIND_MIRROR=1 — skipping docker:dind mirror (removing stale post-render config if any)." 
+ rm -f "${GENERATED_DIR}/arc-dind-replace.env" + echo "=== setup-dind-mirror complete (skipped) ===" + exit 0 +fi + +if oc import-image arc-docker-dind:dind --from="${DIND_SOURCE_IMAGE}" --confirm -n "${ARC_RUNNERS_NS}"; then + printf 'ARC_DIND_INTERNAL_IMAGE=%s\n' "${DIND_INTERNAL_REF}" > "${GENERATED_DIR}/arc-dind-replace.env" + echo "Wrote ${GENERATED_DIR}/arc-dind-replace.env" + echo " ci-scripts/arc/install-runner-scale-set.sh will use arc-dind-post-render.sh when this file exists." + echo "" + echo "DIND_IMAGE_REF=${DIND_INTERNAL_REF}" +else + echo "ERROR: oc import-image failed (Docker Hub rate limit or cluster cannot pull docker.io?)." + echo " Configure a cluster pull secret for docker.io, set DIND_SOURCE_IMAGE, or SKIP_DIND_MIRROR=1 if dind is mirrored elsewhere (e.g. ImageContentSourcePolicy)." + exit 1 +fi + +echo "=== setup-dind-mirror complete ===" diff --git a/ci-scripts/arc/setup-runner-image.sh b/ci-scripts/arc/setup-runner-image.sh new file mode 100755 index 0000000000..c2481c7261 --- /dev/null +++ b/ci-scripts/arc/setup-runner-image.sh @@ -0,0 +1,154 @@ +#!/bin/bash +# +# OpenShift only: create ImageStream + BuildConfig and run a binary Docker build for the +# custom ARC runner image (ci-scripts/arc/runner-image/Dockerfile). +# +# Output: prints IMAGE_REF= to stdout (and to ARC_RUNNER_IMAGE_FILE if set). +# Run setup-dind-mirror.sh first if you need an internal docker:dind mirror (optional). +# +# Optional environment variables: +# ARC_RUNNERS_NS (default: arc-runners) +# OC_VERSION OpenShift client version build-arg (default: detect or 4.20) +# VIRTCTL_VERSION (default: v1.4.0) +# +# Requires: oc logged into OpenShift; jq optional for version detection and URL resolution. +# +# Binary URL resolution: +# When jq is available, this script queries ConsoleCLIDownload resources to find the +# exact binary download URLs for oc and virtctl that match the live cluster.
+# These are passed to the Docker build as OC_URL and VIRTCTL_URL build-args. +# If resolution fails (CRD not found, jq absent, etc.), the Dockerfile falls back to +# mirror.openshift.com / GitHub releases using OC_VERSION / VIRTCTL_VERSION. + +set -euo pipefail +ARC_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +CI_SCRIPTS_DIR="$(cd "${ARC_DIR}/.." && pwd)" + +ARC_RUNNERS_NS="${ARC_RUNNERS_NS:-arc-runners}" +RUNNER_IMAGE_DIR="${ARC_DIR}/runner-image" + +if ! oc get clusterversion version &>/dev/null; then + echo "ERROR: OpenShift cluster required (clusterversion.version not found)." + exit 1 +fi + +if [[ -z "${OC_VERSION:-}" ]]; then + OC_VERSION=$(oc version --output json 2>/dev/null | jq -r '.openshiftVersion | split(".") | .[0:2] | join(".") // empty') || true + OC_VERSION="${OC_VERSION:-4.20}" +fi +VIRTCTL_VERSION="${VIRTCTL_VERSION:-v1.4.0}" + +# Resolve binary download URLs from ConsoleCLIDownload resources so the image binaries +# match the live cluster exactly. Requires jq; silently skipped if unavailable. +OC_URL="" +VIRTCTL_URL="" +if command -v jq &>/dev/null; then + CLI_DOWNLOAD_JSON=$(oc get consoleclidownload -o json 2>/dev/null || true) + if [[ -n "${CLI_DOWNLOAD_JSON}" ]]; then + OC_URL=$(echo "${CLI_DOWNLOAD_JSON}" \ + | jq -r '.items[].spec.links[] | select(.text | test("oc.*linux.*x86_64|oc.*linux.*amd64"; "i")) | .href' \ + | head -1) + VIRTCTL_URL=$(echo "${CLI_DOWNLOAD_JSON}" \ + | jq -r '.items[].spec.links[] | select(.text | test("virtctl.*linux.*amd64|virtctl.*linux.*x86_64"; "i")) | .href' \ + | head -1 || true) + + # Rewrite public console download route URLs to their backing internal HTTP services so + # that build pods don't need to trust the cluster's self-signed ingress CA. + # Each route's host maps to a service that listens on plain HTTP internally; TLS is only + # terminated at the ingress router. We resolve service+namespace from the route spec so + # this works for any URL regardless of hostname naming conventions. 
+ _ALL_ROUTES_JSON=$(oc get route --all-namespaces -o json 2>/dev/null || true) + _url_to_internal() { + local url="${1}" + local host path route_info ns svc svc_port + host=$(echo "${url}" | sed -E 's|https://([^/]+).*|\1|') + path=$(echo "${url}" | sed -E 's|https://[^/]+(/.*)?|\1|' || echo '/') + route_info=$(echo "${_ALL_ROUTES_JSON}" \ + | jq -r --arg h "${host}" \ + '.items[] | select(.spec.host == $h) | "\(.metadata.namespace) \(.spec.to.name)"' \ + | head -1) + if [[ -n "${route_info}" ]]; then + read -r ns svc <<< "${route_info}" + svc_port=$(oc get service "${svc}" -n "${ns}" \ + -o jsonpath='{.spec.ports[0].port}' 2>/dev/null || echo "8080") + echo "http://${svc}.${ns}.svc.cluster.local:${svc_port}${path}" + else + echo "${url}" + fi + } + [[ -n "${OC_URL}" ]] && OC_URL=$(_url_to_internal "${OC_URL}") + [[ -n "${VIRTCTL_URL}" ]] && VIRTCTL_URL=$(_url_to_internal "${VIRTCTL_URL}") + fi +fi + +echo "=== Build ARC runner image (in-cluster, OpenShift) ===" +echo " ARC_RUNNERS_NS: ${ARC_RUNNERS_NS}" +echo " OC_VERSION: ${OC_VERSION}" +echo " VIRTCTL_VERSION: ${VIRTCTL_VERSION}" +echo " RUNNER_IMAGE_DIR: ${RUNNER_IMAGE_DIR}" +echo " OC_URL: ${OC_URL:-(fallback to mirror.openshift.com)}" +echo " VIRTCTL_URL: ${VIRTCTL_URL:-(fallback to GitHub releases)}" +echo "" + +if [[ ! 
-f "${RUNNER_IMAGE_DIR}/Dockerfile" ]]; then + echo "ERROR: Dockerfile not found at ${RUNNER_IMAGE_DIR}/Dockerfile" + exit 1 +fi + +oc create namespace "${ARC_RUNNERS_NS}" --dry-run=client -o yaml | oc apply -f - + +oc apply -f - < "${ARC_RUNNER_IMAGE_FILE}" + echo "Wrote ${ARC_RUNNER_IMAGE_FILE}" +fi + +echo "IMAGE_REF=${IMAGE_REF}" diff --git a/ci-scripts/check-cluster-health.sh b/ci-scripts/check-cluster-health.sh index 7e83050a86..425713760a 100755 --- a/ci-scripts/check-cluster-health.sh +++ b/ci-scripts/check-cluster-health.sh @@ -86,12 +86,12 @@ check "ARC AutoscalingRunnerSet in ${ARC_RUNNERS_NS}" bash -c " # The listener pod stays Running even when the scale set is idle (no ephemeral runner pods). # A missing or non-Running listener means the scale set cannot pick up jobs. check "ARC listener pod in ${ARC_RUNNERS_NS}" bash -c " - running=\$(oc get pods -n '${ARC_RUNNERS_NS}' -l app.kubernetes.io/component=runner-scale-set-listener --no-headers 2>/dev/null | grep -c 'Running') + running=\$(oc get pods -n '${ARC_RUNNERS_NS}' --no-headers 2>/dev/null | grep -c 'Running') if [[ \"\${running}\" -ge 1 ]]; then - echo \" \${running} Running listener pod(s)\" + echo \" \${running} Running pod(s) (listener/controller)\" exit 0 else - echo ' No Running listener pod in ${ARC_RUNNERS_NS}' + echo ' No Running pods in ${ARC_RUNNERS_NS} — listener may be down' exit 1 fi " diff --git a/ci-scripts/check-roks-cluster-state.sh b/ci-scripts/check-roks-cluster-state.sh index 329b0cd585..46ae82eb91 100755 --- a/ci-scripts/check-roks-cluster-state.sh +++ b/ci-scripts/check-roks-cluster-state.sh @@ -14,14 +14,6 @@ set -euo pipefail CLUSTER_NAME="${CLUSTER_NAME:?CLUSTER_NAME must be set}" MAX_WAIT="${MAX_WAIT:-7200}" INTERVAL="${INTERVAL:-60}" -if ! 
[[ "${MAX_WAIT}" =~ ^[0-9]+$ && "${INTERVAL}" =~ ^[0-9]+$ ]]; then - echo "ERROR: MAX_WAIT and INTERVAL must be positive integers" - exit 1 -fi -if (( MAX_WAIT <= 0 || INTERVAL <= 0 )); then - echo "ERROR: MAX_WAIT and INTERVAL must be greater than zero" - exit 1 -fi echo "Waiting for cluster '${CLUSTER_NAME}' to be fully available..." echo " Ready when: state=normal, ingressStatus=healthy" @@ -31,11 +23,7 @@ echo "" ELAPSED=0 while [[ ${ELAPSED} -lt ${MAX_WAIT} ]]; do - if ! CLUSTER_JSON=$(ibmcloud oc cluster get --cluster "${CLUSTER_NAME}" --output json 2>&1); then - echo "ERROR: Failed to get cluster '${CLUSTER_NAME}':" - echo "${CLUSTER_JSON}" - exit 1 - fi + CLUSTER_JSON=$(ibmcloud oc cluster get --cluster "${CLUSTER_NAME}" --output json 2>/dev/null || echo "{}") STATE=$(echo "${CLUSTER_JSON}" | jq -r '.state // "unknown"') MASTER_STATE=$(echo "${CLUSTER_JSON}" | jq -r '.masterState // "unknown"') @@ -54,7 +42,7 @@ while [[ ${ELAPSED} -lt ${MAX_WAIT} ]]; do exit 0 fi - sleep "${INTERVAL}" + sleep ${INTERVAL} ELAPSED=$((ELAPSED + INTERVAL)) done diff --git a/ci-scripts/examples/arc-0.14-extra-values.yaml b/ci-scripts/examples/arc-0.14-extra-values.yaml new file mode 100644 index 0000000000..3e5ce77656 --- /dev/null +++ b/ci-scripts/examples/arc-0.14-extra-values.yaml @@ -0,0 +1,29 @@ +# Example fragment for ARC Helm ≥ 0.14 (stable gha-runner-scale-set chart). +# Merge via ARC_RUNNER_EXTRA_VALUES=/path/to/this/file.yaml (ci-scripts/arc/install-runner-scale-set.sh) +# +# Ref: https://github.blog/changelog/2026-03-19-actions-runner-controller-release-0-14-0/ +# +# --- Multilabel (optional) --- +# Prefer ARC_SCALE_SET_LABELS=kubevirt-plugin-ci,linux,x64 in the environment when using +# ci-scripts/arc/install-runner-scale-set.sh. 
If you use this file instead, uncomment: +# +# scaleSetLabels: +# - kubevirt-plugin-ci +# - linux +# +# Workflows must then use: +# runs-on: [kubevirt-plugin-ci, linux] +# +# --- Labels / annotations on ARC-managed resources (0.14+) --- +# Useful for OpenShift scheduling, cost tags, or policy selectors on listener pods / RBAC. +# +# resourceMeta: +# autoscalingListener: +# labels: +# kubernetes.io/os: linux +# annotations: {} +# listenerServiceAccount: +# labels: {} +# annotations: {} +# +# (See upstream chart values.yaml for the full resourceMeta schema.) diff --git a/ci-scripts/install-hco.sh b/ci-scripts/install-hco.sh index 8642690617..d20e122129 100755 --- a/ci-scripts/install-hco.sh +++ b/ci-scripts/install-hco.sh @@ -85,21 +85,8 @@ spec: value: "${KVM_EMULATION}" EOF -echo "Waiting for Subscription to have an InstallPlan..." -for i in $(seq 1 60); do - INSTALL_PLAN="$(oc get subscription hco-operatorhub -n kubevirt-hyperconverged \ - -o jsonpath='{.status.installPlanRef.name}' 2>/dev/null || true)" - if [[ -n "${INSTALL_PLAN}" ]]; then - echo "InstallPlan found: ${INSTALL_PLAN}" - break - fi - if [[ "${i}" -eq 60 ]]; then - echo "ERROR: Timed out waiting for HCO InstallPlan" - exit 1 - fi - echo "Waiting for InstallPlan... (${i}/60)" - sleep 5 -done +echo "Waiting 90s for install plan and pods to be created..." +sleep 90 # --- Wait for HCO deployments --- echo "Waiting for HCO deployments to become available..." @@ -138,11 +125,13 @@ oc wait -n kubevirt-hyperconverged hyperconverged kubevirt-hyperconverged \ if [[ "${SKIP_HPP}" != "true" ]]; then echo "Installing HostPath Provisioner..." - oc apply -f \ - "https://raw.githubusercontent.com/kubevirt/hostpath-provisioner-operator/${HPP_VERSION}/deploy/hostpathprovisioner_cr.yaml" + oc create -f \ + "https://raw.githubusercontent.com/kubevirt/hostpath-provisioner-operator/${HPP_VERSION}/deploy/hostpathprovisioner_cr.yaml" \ + || echo "HPP CR may already exist, continuing..." 
- oc apply -f \ - "https://raw.githubusercontent.com/kubevirt/hostpath-provisioner-operator/${HPP_VERSION}/deploy/storageclass-wffc-csi.yaml" + oc create -f \ + "https://raw.githubusercontent.com/kubevirt/hostpath-provisioner-operator/${HPP_VERSION}/deploy/storageclass-wffc-csi.yaml" \ + || echo "HPP StorageClass may already exist, continuing..." oc annotate storageclasses --all storageclass.kubernetes.io/is-default-class- || true oc annotate storageclass hostpath-csi storageclass.kubernetes.io/is-default-class='true' diff --git a/ci-scripts/nginx-9080.conf b/ci-scripts/nginx-9080.conf new file mode 100644 index 0000000000..7b83f18cb9 --- /dev/null +++ b/ci-scripts/nginx-9080.conf @@ -0,0 +1,35 @@ +# Writable pid file: image runs as UID 1001 (see Dockerfile USER). start-plugin-container.sh passes -g "daemon off;". +pid /tmp/nginx.pid; +error_log /dev/stdout info; + +events { + worker_connections 1024; +} + +http { + access_log /dev/stdout; + include /etc/nginx/mime.types; + default_type application/octet-stream; + keepalive_timeout 65; + + add_header X-Content-Type-Options nosniff; + + server { + # Docker often has no routable IPv6; binding [::]:9080 can prevent nginx from starting.
+ listen 9080 default_server; + root /usr/share/nginx/html; + + # Prevent caching for plugin-manifest.json and plugin-entry.js + # to avoid "Unexpected end of JSON input" error + location = /plugin-manifest.json { + add_header Cache-Control 'no-cache, no-store, must-revalidate, proxy-revalidate, max-age=0'; + add_header Pragma 'no-cache'; + add_header Expires '0'; + } + location = /plugin-entry.js { + add_header Cache-Control 'no-cache, no-store, must-revalidate, proxy-revalidate, max-age=0'; + add_header Pragma 'no-cache'; + add_header Expires '0'; + } + } +} diff --git a/ci-scripts/nginx-9443.conf b/ci-scripts/nginx-9443.conf new file mode 100644 index 0000000000..a1ff000866 --- /dev/null +++ b/ci-scripts/nginx-9443.conf @@ -0,0 +1,39 @@ +# Writable pid file: image runs as UID 1001 (see Dockerfile USER). start-plugin-container.sh passes -g "daemon off;". +pid /tmp/nginx.pid; +error_log /dev/stdout info; + +events { + worker_connections 1024; +} + +http { + access_log /dev/stdout; + include /etc/nginx/mime.types; + default_type application/octet-stream; + keepalive_timeout 65; + + add_header X-Content-Type-Options nosniff; + + server { + # Docker often has no routable IPv6; binding [::]:9443 can prevent nginx from starting. 
+ listen 9443 ssl; + ssl_certificate /var/serving-cert/tls.crt; + ssl_certificate_key /var/serving-cert/tls.key; + ssl_protocols TLSv1.2 TLSv1.3; + ssl_ciphers TLS_AES_128_GCM_SHA256:TLS_AES_256_GCM_SHA384:TLS_CHACHA20_POLY1305_SHA256:ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384; + root /usr/share/nginx/html; + + # Prevent caching for plugin-manifest.json and plugin-entry.js + # to avoid "Unexpected end of JSON input" error + location = /plugin-manifest.json { + add_header Cache-Control 'no-cache, no-store, must-revalidate, proxy-revalidate, max-age=0'; + add_header Pragma 'no-cache'; + add_header Expires '0'; + } + location = /plugin-entry.js { + add_header Cache-Control 'no-cache, no-store, must-revalidate, proxy-revalidate, max-age=0'; + add_header Pragma 'no-cache'; + add_header Expires '0'; + } + } +} diff --git a/ci-scripts/resolve-console-image.sh b/ci-scripts/resolve-console-image.sh new file mode 100755 index 0000000000..3d06ac2643 --- /dev/null +++ b/ci-scripts/resolve-console-image.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash +# Emit CONSOLE_IMAGE=... for GITHUB_ENV so off-cluster console matches the cluster's OpenShift x.y. +# Requires: oc logged in; OpenShift cluster with ClusterVersion. +set -euo pipefail + +CONSOLE_IMAGE_REGISTRY="${CONSOLE_IMAGE_REGISTRY:-quay.io/openshift/origin-console}" + +if [[ -n "${CONSOLE_IMAGE:-}" ]]; then + echo "CONSOLE_IMAGE already set (${CONSOLE_IMAGE}); skipping cluster version lookup." >&2 + echo "CONSOLE_IMAGE=${CONSOLE_IMAGE}" + exit 0 +fi + +VERSION="$(oc get clusterversion version -o jsonpath='{.status.desired.version}' 2>/dev/null || true)" +if [[ -z "${VERSION}" ]]; then + echo "::error::Could not read .status.desired.version from ClusterVersion 'version'. Is this an OpenShift cluster and is oc authenticated?" >&2 + exit 1 +fi + +IFS='.' 
read -r major minor _rest <<< "${VERSION}" +if [[ -z "${major:-}" || -z "${minor:-}" ]]; then + echo "::error::Could not parse OpenShift version '${VERSION}' as major.minor.*" >&2 + exit 1 +fi + +OCP_XY="${major}.${minor}" +CONSOLE_IMAGE="${CONSOLE_IMAGE_REGISTRY}:${OCP_XY}" + +echo "Cluster OpenShift version: ${VERSION} → console tag: ${OCP_XY}" >&2 +echo "CONSOLE_IMAGE=${CONSOLE_IMAGE}" + +if [[ -n "${GITHUB_STEP_SUMMARY:-}" ]]; then + { + echo "### Off-cluster console image" + echo "" + echo "Cluster version: **${VERSION}**" + echo "Console image: \`${CONSOLE_IMAGE}\`" + } >> "${GITHUB_STEP_SUMMARY}" +fi diff --git a/ci-scripts/start-console.sh b/ci-scripts/start-console.sh new file mode 100755 index 0000000000..9c796e57ef --- /dev/null +++ b/ci-scripts/start-console.sh @@ -0,0 +1,149 @@ +#! /bin/bash +# +# Start the "off cluster" console. Based on the `route-console.sh` and `start-console.sh` scripts. +# +set -euox pipefail +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)" + +# --------------------------------------------------------------------------- +# Route + BRIDGE_PLUGIN_PROXY (same behavior as route-console.sh) +# --------------------------------------------------------------------------- +ROUTE_NAME="kubevirt-apiserver-proxy" +ROUTE_NS="openshift-cnv" +APPS_DOMAIN="$(oc get ingress.config.openshift.io/cluster -o jsonpath='{.spec.domain}' 2>/dev/null || true)" +HOSTNAME="kubevirt-apiserver-proxy.${APPS_DOMAIN}" + +if [[ -z "${APPS_DOMAIN:-}" ]]; then + echo "::error::Could not read ingress.config.openshift.io/cluster domain." + exit 1 +fi + +if oc get route "$ROUTE_NAME" -n "$ROUTE_NS" &>/dev/null; then + echo "Route '${ROUTE_NAME}' already exists in namespace '${ROUTE_NS}'." +else + echo "Route '${ROUTE_NAME}' not found; creating..." 
+ if { + echo "apiVersion: route.openshift.io/v1" + echo "kind: Route" + echo "metadata:" + echo " name: ${ROUTE_NAME}" + echo " namespace: ${ROUTE_NS}" + echo " annotations:" + echo " haproxy.router.openshift.io/hsts_header: max-age=31536000;includeSubDomains;preload" + echo "spec:" + echo " host: ${HOSTNAME}" + echo " to:" + echo " kind: Service" + echo " name: ${ROUTE_NAME}-service" + echo " weight: 100" + echo " port:" + echo " targetPort: 8080" + echo " tls:" + echo " termination: reencrypt" + echo " wildcardPolicy: None" + } | oc create -f - + then + echo "Route '${ROUTE_NAME}' created (host: ${HOSTNAME})." + else + echo "Route create skipped or failed (may already exist)." + fi + oc get route "$ROUTE_NAME" -n "$ROUTE_NS" &>/dev/null || echo "::warning::Route '${ROUTE_NAME}' is not present; kubevirt API proxy may not work." +fi + +if [[ "${PROXY_ENV:-production}" == "local" ]]; then + echo "PROXY_ENV=local — kubevirt proxy via host.docker.internal (same as route-console.sh + docker)." + ENDPOINT="http://host.docker.internal:8080" +else + echo "Using cluster route for kubevirt-apiserver-proxy." 
+ ENDPOINT="https://${HOSTNAME}" +fi + +BRIDGE_PLUGIN_PROXY="$(jq -nc \ + --arg endpoint "$ENDPOINT" \ + '{"services":[{"consoleAPIPath":"/api/proxy/plugin/kubevirt-plugin/kubevirt-apiserver-proxy/","endpoint":$endpoint,"authorize":true}]}')" + +echo "BRIDGE_PLUGIN_PROXY (structure): $(echo "$BRIDGE_PLUGIN_PROXY" | jq .)" + +# --------------------------------------------------------------------------- +# BRIDGE_* — off-cluster console (kubevirt-plugin only; no extra dev plugins) +# --------------------------------------------------------------------------- +BRIDGE_USER_AUTH="disabled" +BRIDGE_K8S_MODE="off-cluster" +BRIDGE_K8S_AUTH="bearer-token" +BRIDGE_K8S_MODE_OFF_CLUSTER_SKIP_VERIFY_TLS="true" +BRIDGE_K8S_MODE_OFF_CLUSTER_ENDPOINT="$(oc whoami --show-server)" +BRIDGE_K8S_MODE_OFF_CLUSTER_THANOS="$(oc -n openshift-config-managed get configmap monitoring-shared-config -o jsonpath='{.data.thanosPublicURL}' 2>/dev/null || true)" +BRIDGE_K8S_MODE_OFF_CLUSTER_ALERTMANAGER="$(oc -n openshift-config-managed get configmap monitoring-shared-config -o jsonpath='{.data.alertmanagerPublicURL}' 2>/dev/null || true)" +BRIDGE_USER_SETTINGS_LOCATION="localstorage" +BRIDGE_I18N_NAMESPACES="plugin__kubevirt-plugin" + +BRIDGE_K8S_AUTH_BEARER_TOKEN="$(oc whoami --show-token 2>/dev/null || true)" +if [[ -z "${BRIDGE_K8S_AUTH_BEARER_TOKEN}" ]]; then + echo "::error::Could not read bearer token (oc whoami --show-token)." + exit 1 +fi +echo "::add-mask::${BRIDGE_K8S_AUTH_BEARER_TOKEN}" + +# The plugin is served over ${PLUGIN_TRANSPORT} and published on host port ${PLUGIN_PORT}. Console runs +# in Docker; reach the host via host.docker.internal (Linux: --add-host below). +# Off-cluster bridge uses InsecureSkipVerify for plugin proxy TLS when +# BRIDGE_K8S_MODE_OFF_CLUSTER_SKIP_VERIFY_TLS=true, so the CI self-signed cert is accepted.
+BRIDGE_PLUGINS="kubevirt-plugin=${PLUGIN_TRANSPORT}://host.docker.internal:${PLUGIN_PORT}" + +# Resolve CONSOLE_IMAGE from the cluster's OpenShift version when not already set. +# Falls back to :latest if resolution fails (e.g. non-OCP cluster, oc not logged in). +eval "$(bash "${SCRIPT_DIR}/resolve-console-image.sh")" || true +CONSOLE_IMAGE="${CONSOLE_IMAGE:-quay.io/openshift/origin-console:latest}" +CONSOLE_PORT=${CONSOLE_PORT:-9000} + +# --------------------------------------------------------------------------- +# Job summary (no secrets) +# --------------------------------------------------------------------------- +if [[ -n "${GITHUB_STEP_SUMMARY:-}" ]]; then + { + echo "
Off-cluster console (CI)" + echo "" + echo "| Item | Value |" + echo "|------|-------|" + echo "| Console image | \`${CONSOLE_IMAGE}\` |" + echo "| Console URL | \`http://localhost:${CONSOLE_PORT}\` |" + echo "| API server | \`${BRIDGE_K8S_MODE_OFF_CLUSTER_ENDPOINT}\` |" + echo "| Thanos URL | \`${BRIDGE_K8S_MODE_OFF_CLUSTER_THANOS:-(empty)}\` |" + echo "| Alertmanager URL | \`${BRIDGE_K8S_MODE_OFF_CLUSTER_ALERTMANAGER:-(empty)}\` |" + echo "| PROXY_ENV | \`${PROXY_ENV:-production}\` |" + echo "| Proxy endpoint (kubevirt-apiserver-proxy) | \`${ENDPOINT}\` |" + echo "| Route host | \`${HOSTNAME}\` |" + echo "| Container runtime | Docker |" + echo "| BRIDGE_PLUGINS | \`${BRIDGE_PLUGINS}\` |" + echo "| BRIDGE_PLUGIN_PROXY | \`$(echo "$BRIDGE_PLUGIN_PROXY" | jq -c .)\` |" + echo "" + echo "
" + } >> "${GITHUB_STEP_SUMMARY}" +fi + +echo "Starting console container..." +echo " API server: ${BRIDGE_K8S_MODE_OFF_CLUSTER_ENDPOINT}" +echo " Console image: ${CONSOLE_IMAGE}" + +DOCKER_RUN_EXTRA=() +if [[ "$(uname -s)" == "Linux" ]]; then + DOCKER_RUN_EXTRA+=(--add-host=host.docker.internal:host-gateway) +fi + +docker run -d --pull=always --rm "${DOCKER_RUN_EXTRA[@]}" \ + -p "${CONSOLE_PORT}:9000" \ + --name console \ + -e BRIDGE_USER_AUTH="${BRIDGE_USER_AUTH}" \ + -e BRIDGE_K8S_MODE="${BRIDGE_K8S_MODE}" \ + -e BRIDGE_K8S_AUTH="${BRIDGE_K8S_AUTH}" \ + -e BRIDGE_K8S_MODE_OFF_CLUSTER_SKIP_VERIFY_TLS="${BRIDGE_K8S_MODE_OFF_CLUSTER_SKIP_VERIFY_TLS}" \ + -e BRIDGE_K8S_MODE_OFF_CLUSTER_ENDPOINT="${BRIDGE_K8S_MODE_OFF_CLUSTER_ENDPOINT}" \ + -e BRIDGE_K8S_MODE_OFF_CLUSTER_THANOS="${BRIDGE_K8S_MODE_OFF_CLUSTER_THANOS}" \ + -e BRIDGE_K8S_MODE_OFF_CLUSTER_ALERTMANAGER="${BRIDGE_K8S_MODE_OFF_CLUSTER_ALERTMANAGER}" \ + -e BRIDGE_K8S_AUTH_BEARER_TOKEN="${BRIDGE_K8S_AUTH_BEARER_TOKEN}" \ + -e BRIDGE_USER_SETTINGS_LOCATION="${BRIDGE_USER_SETTINGS_LOCATION}" \ + -e BRIDGE_I18N_NAMESPACES="${BRIDGE_I18N_NAMESPACES}" \ + -e BRIDGE_PLUGIN_PROXY="${BRIDGE_PLUGIN_PROXY}" \ + -e BRIDGE_PLUGINS="${BRIDGE_PLUGINS}" \ + "${CONSOLE_IMAGE}" diff --git a/ci-scripts/start-plugin-container.sh b/ci-scripts/start-plugin-container.sh new file mode 100755 index 0000000000..aebf8c963e --- /dev/null +++ b/ci-scripts/start-plugin-container.sh @@ -0,0 +1,99 @@ +#! /bin/bash +set -euox pipefail +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)" + +# ARC jobs use Docker-in-Docker (DOCKER_HOST); prefer docker so ports publish on the dind host. +# If podman is also on PATH, `podman || docker` would bypass the daemon jobs use for `docker run`. 
+if [[ -n "${DOCKER_HOST:-}" ]] && command -v docker &>/dev/null; then + RUNTIME=$(command -v docker) +elif command -v podman &>/dev/null; then + RUNTIME=$(command -v podman) +else + RUNTIME=$(command -v docker) || { echo "ERROR: neither podman nor docker found on PATH" >&2; exit 1; } +fi + +# :z is for Podman SELinux; plain Docker (incl. dind) uses :ro only. +VOL_SUFFIX=":ro" +[[ "${RUNTIME}" == *podman ]] && VOL_SUFFIX=":ro,z" + +# +# If the PLUGIN_IMAGE is not set, build it locally. The Dockerfile path is anchored at REPO_ROOT so the build works from any cwd. +# +if [[ -z "${PLUGIN_IMAGE:-}" ]]; then + PLUGIN_IMAGE="localhost/kubevirt-plugin:local" + "${RUNTIME}" build -t "${PLUGIN_IMAGE}" -f "${REPO_ROOT}/Dockerfile" "${REPO_ROOT}" +fi + +PLUGIN_NAME=${PLUGIN_NAME:-kubevirt-plugin-ci} +PLUGIN_PORT=${PLUGIN_PORT:-9001} +PLUGIN_TRANSPORT=${PLUGIN_TRANSPORT:-http} + +echo "Using PLUGIN_IMAGE: ${PLUGIN_IMAGE}" +echo "Using PLUGIN_NAME: ${PLUGIN_NAME}" +echo "Using PLUGIN_PORT: ${PLUGIN_PORT}" +echo "Using PLUGIN_TRANSPORT: ${PLUGIN_TRANSPORT}" + +if [[ -n "${GITHUB_STEP_SUMMARY:-}" ]]; then + { + echo "
Kubevirt Plugin Container" + echo "" + echo "| Item | Value |" + echo "|------|-------|" + echo "| Plugin image | \`${PLUGIN_IMAGE}\` |" + echo "| Plugin port | \`${PLUGIN_PORT}\` |" + echo "| Plugin transport | \`${PLUGIN_TRANSPORT}\` |" + echo "| Container name | \`${PLUGIN_NAME}\` |" + echo "| Container runtime | \`${RUNTIME}\` |" + echo "" + echo "
" + } >> "${GITHUB_STEP_SUMMARY}" +fi + +CERT_CONFIG=() # extra "run" args; kept as an array so a cert path containing spaces is not word-split +if [[ "${PLUGIN_TRANSPORT}" == "https" ]]; then + # + # Create the self-signed certs + # + # With Docker-in-Docker (ARC), bind-mount sources must exist on the docker *daemon* host. Paths under + # $TMPDIR (e.g. /home/runner/.tmp) are often not shared with dind, so the mount appears empty in the + # container and nginx fails: cannot load certificate "/var/serving-cert/tls.crt". Use the workspace. + # + CERT_PARENT="${REPO_ROOT}/ci-scripts/generated" + mkdir -p "${CERT_PARENT}" || true + KUBEVIRT_PLUGIN_CERT_DIR=$(mktemp -d "${CERT_PARENT}/.tmp-plugin-cert.XXXXXX") + openssl req -x509 -nodes -days 1 -newkey rsa:2048 \ + -keyout "${KUBEVIRT_PLUGIN_CERT_DIR}/tls.key" \ + -out "${KUBEVIRT_PLUGIN_CERT_DIR}/tls.crt" \ + -subj "/CN=localhost" \ + -addext "subjectAltName=DNS:localhost,DNS:host.docker.internal" + chmod a+rx "${KUBEVIRT_PLUGIN_CERT_DIR}" + chmod a+r "${KUBEVIRT_PLUGIN_CERT_DIR}/tls.crt" "${KUBEVIRT_PLUGIN_CERT_DIR}/tls.key" + + CERT_CONFIG=(-v "${KUBEVIRT_PLUGIN_CERT_DIR}:/var/serving-cert${VOL_SUFFIX}") +fi + +CONTAINER_CONFIG=${SCRIPT_DIR}/nginx-9080.conf +CONTAINER_PORT=9080 +if [[ "${PLUGIN_TRANSPORT}" == "https" ]]; then + CONTAINER_CONFIG=${SCRIPT_DIR}/nginx-9443.conf + CONTAINER_PORT=9443 +fi + +# +# Start the plugin container with the selected nginx config (and, for https, the self-signed certs) +# mounted into the container. This emulates how the pod is deployed with the kubevirt operator +# using a ConfigMap and Secrets mounted into the container. +# +"${RUNTIME}" rm -f "${PLUGIN_NAME}" 2>/dev/null || true +"${RUNTIME}" run -d \ + --name "${PLUGIN_NAME}" \ + -p "${PLUGIN_PORT}:${CONTAINER_PORT}" \ + -v "${CONTAINER_CONFIG}:/etc/nginx/nginx.conf${VOL_SUFFIX}" \ + ${CERT_CONFIG[@]+"${CERT_CONFIG[@]}"} \ + "${PLUGIN_IMAGE}" + +# +# Note: If this is run with podman, the IPv6 port is captured but not mapped to nginx. So, +# using localhost:${PLUGIN_PORT} may not work. Use 127.0.0.1:${PLUGIN_PORT} instead.
+# diff --git a/ci-scripts/test-cleanup.sh b/ci-scripts/test-cleanup.sh new file mode 100755 index 0000000000..3863c0cdbb --- /dev/null +++ b/ci-scripts/test-cleanup.sh @@ -0,0 +1,26 @@ +#!/bin/bash +# +# A copy of `../test-cleanup.sh` to use the correct namespace from the e2e run, and only delete resources in that namespace. +# +export TEST_NS=${CYPRESS_TEST_NS:-'auto-test-ns'} + +oc delete vm --all -n ${TEST_NS} --wait=false +oc delete template --all -n ${TEST_NS} --wait=false +#oc delete template -l app.kubernetes.io/name=custom-templates -n openshift --wait=false +oc delete VirtualMachineSnapshot --all -n ${TEST_NS} --ignore-not-found +oc delete datavolume --all -n ${TEST_NS} --wait=false +oc delete datasource --all -n ${TEST_NS} --wait=false +#oc delete datasource -n openshift-virtualization-os-images -l app.kubernetes.io/part-of!=hyperconverged-cluster +#oc delete datavolume -n openshift-virtualization-os-images -l app.kubernetes.io/part-of!=hyperconverged-cluster +oc delete pvc --all -n ${TEST_NS} --wait=false +#oc delete pvc -n openshift-cnv -l k8s-app!=hostpath-provisioner --wait=false +#oc delete pvc --all -n openshift --wait=false +#oc delete pvc --all -n openshift-virtualization-os-images --wait=false +oc delete secret --all -n ${TEST_NS} --ignore-not-found --wait=false +oc delete net-attach-def --all -n ${TEST_NS} --ignore-not-found --wait=false +#oc delete VirtualMachineClusterInstancetype example --ignore-not-found --wait=false +oc delete VirtualMachineInstancetype example -n ${TEST_NS} --ignore-not-found --wait=false +#oc delete VirtualMachineClusterPreference example --ignore-not-found --wait=false +oc delete VirtualMachinePreference example -n ${TEST_NS} --ignore-not-found --wait=false +#oc delete MigrationPolicy example --ignore-not-found --wait=false +oc delete migplan --all -n ${TEST_NS} --ignore-not-found --wait=false diff --git a/cypress/tests/gating/poc-check-tab-yaml.cy.ts b/cypress/tests/gating/poc-check-tab-yaml.cy.ts new file mode 100644 
index 0000000000..dc94aa8b41 --- /dev/null +++ b/cypress/tests/gating/poc-check-tab-yaml.cy.ts @@ -0,0 +1,232 @@ +import { ALL_PROJ_NS, MINUTE, SECOND, TEST_NS, VM_STATUS } from '../../utils/const/index'; +import { Example, YAML } from '../../utils/const/string'; +import { TEMPLATE } from '../../utils/const/template'; +import * as sel from '../../views/selector'; +import { userButtonTxt } from '../../views/selector-instance'; +import { navigateToConfigurationSubTab, subTabName, tab } from '../../views/tab'; + +describe('Check all virtualization pages can be loaded', () => { + before(() => { + cy.beforeSpec(); + cy.visitVMsVirt(); + }); + + describe('Check VirtualMachines page', () => { + it('start example vm', () => { + cy.byLegacyTestID(Example).click(); + cy.get(sel.iconStartBtn, { timeout: MINUTE }).click(); + cy.wait(15 * SECOND); + }); + + it( + 'check the status of example vm', + { + retries: { + runMode: 8, + }, + }, + () => { + cy.contains(sel.vmStatusOnOverview, VM_STATUS.Running).should('be.visible'); + cy.wait(10 * SECOND); + }, + ); + + it('vm tabs are loaded', () => { + cy.contains('Hostname').should('be.visible'); + + tab.navigateToMetrics(); + cy.contains('Utilization').should('be.visible'); + + tab.navigateToYAML(); + cy.contains('Download').should('be.visible'); + + tab.navigateToEvents(); + cy.contains('event').should('be.visible'); + + tab.navigateToConsole(); + cy.contains('Guest login credentials').should('be.visible'); + + tab.navigateToSnapshots(); + cy.contains('No snapshots found').should('be.visible'); + + tab.navigateToDiagnostics(); + cy.contains('Status conditions').should('be.visible'); + + tab.navigateToDiagnosticsGuestSystemLog(); + cy.contains('Guest system log').should('be.visible'); + + tab.navigateToConfiguration(); + cy.contains('Headless mode').should('be.visible'); + + navigateToConfigurationSubTab(subTabName.Storage); + cy.contains('rootdisk').should('be.visible'); + + navigateToConfigurationSubTab(subTabName.Network); + 
cy.contains('Pod networking').should('be.visible'); + + navigateToConfigurationSubTab(subTabName.Scheduling); + cy.contains('Scheduling and resource requirements').should('be.visible'); + + navigateToConfigurationSubTab(subTabName.SSH); + cy.contains('SSH access').should('be.visible'); + + navigateToConfigurationSubTab(subTabName.InitialRun); + cy.contains('Cloud-init').should('be.visible'); + + navigateToConfigurationSubTab(subTabName.Metadata); + cy.contains('Annotations').should('be.visible'); + }); + + it('vmi tabs are loaded', () => { + tab.navigateToOverview(); + cy.contains('VirtualMachineInstance').should('be.visible'); + cy.byLegacyTestID(Example).click(); + + cy.contains('Annotations').should('be.visible'); + + tab.navigateToYAML(); + cy.contains('Download').should('be.visible'); + + tab.navigateToScheduling(); + cy.contains('Tolerations').should('be.visible'); + + tab.navigateToEvents(); + cy.contains('event').should('be.visible'); + + tab.navigateToConsole(); + cy.contains('Guest login credentials').should('be.visible'); + + tab.navigateToNetworks(); + cy.contains('Pod networking').should('be.visible'); + + tab.navigateToDisks(); + cy.contains('rootdisk').should('be.visible'); + }); + }); + + // describe('Check Templates page', () => { + // it('visit template page', () => { + // cy.visitTemplates(); + // cy.switchProject(ALL_PROJ_NS); + // }); + + // it('common template tabs are loaded', () => { + // cy.get(sel.nameFilter).type(TEMPLATE.RHEL9.metadataName); + // cy.byLegacyTestID(TEMPLATE.RHEL9.metadataName).click(); + + // cy.contains('Display name').should('be.visible'); + // cy.contains('not editable').should('be.visible'); + + // tab.navigateToYAML(); + // cy.contains('Download').should('be.visible'); + + // tab.navigateToScheduling(); + // cy.contains('Tolerations').should('be.visible'); + + // tab.navigateToNetworks(); + // cy.contains('Pod networking').should('be.visible'); + + // tab.navigateToDisks(); + // 
cy.contains('rootdisk').should('be.visible'); + + // tab.navigateToScripts(); + // cy.contains('Cloud-init').should('be.visible'); + + // tab.navigateToParameters(); + // cy.contains('DATA_SOURCE_NAME').should('be.visible'); + // }); + + // it('create example template', () => { + // cy.switchProject(TEST_NS); + // cy.get(sel.itemCreateBtn).click(); + // cy.get(sel.saveBtn).click(); + // }); + + // it('custom template tabs are loaded', () => { + // cy.contains('Display name').should('be.visible'); + + // tab.navigateToYAML(); + // cy.contains('Download').should('be.visible'); + + // tab.navigateToScheduling(); + // cy.contains('Tolerations').should('be.visible'); + + // tab.navigateToNetworks(); + // cy.contains('Pod networking').should('be.visible'); + + // tab.navigateToDisks(); + // cy.contains('rootdisk').should('be.visible'); + + // tab.navigateToScripts(); + // cy.contains('Cloud-init').should('be.visible'); + + // tab.navigateToParameters(); + // cy.contains('CLOUD_USER_PASSWORD').should('be.visible'); + // }); + // }); + + // describe('Check InstanceTypes tabs', () => { + // it('instanceTypes page is loaded', () => { + // cy.visitITs(); + // cy.contains('cx1.2xlarge').should('exist'); + // }); + + // it('create VirtualMachineClusterInstanceType from YAML', () => { + // cy.get('div.co-m-list').find(sel.itemCreateBtn).eq(0).click(); + // cy.get(sel.saveBtn).click(); + // cy.get(sel.breadcrumb).click(); + // cy.get(sel.nameFilter).first().type(Example); + // cy.byLegacyTestID(Example).should('exist'); + // cy.byLegacyTestID('cx1.2xlarge').should('not.exist'); + // }); + + // it('create VirtualMachineInstanceType from YAML', () => { + // cy.contains('span.pf-v6-c-tabs__item-text', userButtonTxt).click(); + // cy.switchProject(TEST_NS); + // cy.get(sel.itemCreateBtn).click(); + // cy.get(sel.saveBtn).click(); + // cy.get(sel.breadcrumb).click(); + // cy.byLegacyTestID(Example).should('exist'); + // }); + // }); + + // describe('Check Bootable volumes page', () => 
{ + // it('bootable volume page is loaded', () => { + // cy.visitVolumes(); + // cy.switchProject(ALL_PROJ_NS); + // cy.contains('fedora').should('exist'); + // }); + + // it('create bootable volume from YAML', () => { + // cy.switchProject(TEST_NS); + // cy.wait(3000); + // cy.get(sel.itemCreateBtn).click(); + // cy.byButtonText(YAML).click(); + // cy.get(sel.saveBtn).click(); + // cy.byLegacyTestID(Example).should('exist'); + // }); + // }); + + // describe('Check MigrationPolicies page', () => { + // it('migration policy page is loaded', () => { + // cy.visitMPs(); + // cy.contains('No MigrationPolicies found').should('exist'); + // }); + + // it('create migration policy from YAML', () => { + // cy.get(sel.itemCreateBtn).click(); + // cy.byButtonText(YAML).click(); + // cy.get(sel.saveBtn).click(); + // cy.get('.pf-v6-c-breadcrumb__item').eq(0).click(); + // cy.byLegacyTestID(Example).should('exist'); + // }); + // }); + + // describe('Check Checkups tabs', () => { + // it('storage checkup pages is loaded', () => { + // cy.visitCheckups(); + // cy.contains('.pf-v6-c-tabs__item-text', 'Storage').click(); + // cy.contains('No storage checkups found').should('exist'); + // }); + // }); +}); diff --git a/cypress/tests/poc-gating.cy.ts b/cypress/tests/poc-gating.cy.ts new file mode 100644 index 0000000000..c3ce787bf3 --- /dev/null +++ b/cypress/tests/poc-gating.cy.ts @@ -0,0 +1,3 @@ +import './setup/setup.cy.ts'; +import './setup/shared-vm.cy.ts'; +import './gating/check-tab-yaml.cy.ts'; diff --git a/start-console.sh b/start-console.sh index 22877bf51a..c7a02a34e4 100755 --- a/start-console.sh +++ b/start-console.sh @@ -96,7 +96,8 @@ for arg in "$@"; do cd "$BASE_DIR" done -CONSOLE_IMAGE=${CONSOLE_IMAGE:-"quay.io/openshift/origin-console:latest"} +eval "$(bash ./ci-scripts/resolve-console-image.sh)" || true +CONSOLE_IMAGE="${CONSOLE_IMAGE:-quay.io/openshift/origin-console:latest}" CONSOLE_PORT=${CONSOLE_PORT:-9000} echo "Starting local OpenShift console..." 
diff --git a/test-cleanup.sh b/test-cleanup.sh old mode 100644 new mode 100755 diff --git a/test-setup-downstream.sh b/test-setup-downstream.sh old mode 100644 new mode 100755 diff --git a/test-setup.sh b/test-setup.sh old mode 100644 new mode 100755 From 030ce03e8b2d8ecc46166a88297839d58645e92e Mon Sep 17 00:00:00 2001 From: Scott J Dickerson Date: Tue, 14 Apr 2026 00:14:20 -0400 Subject: [PATCH 02/42] Setup "ci-env" using helm "ci-test-stack" to manage CI test environments `ci-env` controller: - manage CI test environments using ConfigMaps - GitHub actions to request and release the CI test environment via ConfigMaps `ci-test-stack`: - helm chart to configure and deploy the console and plugin - console and plugin are deployed into the test environment namespace - access from the workflow is via the service endpoints - external route is available for debugging test2 workflow updates to use the GitHub actions Signed-off-by: Scott J Dickerson --- .github/actions/ci-env-release/action.yml | 58 +++ .github/actions/ci-env-request/action.yml | 124 +++++++ .github/workflows/poc-e2e-ci-test2.yml | 161 ++++----- .prettierignore | 1 - .prettierignorecode | 3 + .vscode/settings.json | 6 +- ci-scripts/_cluster-helpers.sh | 25 +- ci-scripts/arc/arc-dind-post-render.sh | 40 --- ci-scripts/arc/arc-helm-helpers.sh | 24 -- ci-scripts/arc/arc-openshift-scc.yaml | 16 +- ci-scripts/arc/arc-runner-rbac.yaml | 63 ++-- ci-scripts/arc/arc-runner-scale-set.pod.yaml | 24 +- ci-scripts/arc/ci-console-clusterrole.yaml | 116 ++++++ ci-scripts/arc/install-arc-controller.sh | 8 +- ci-scripts/arc/install-runner-scale-set.sh | 36 +- ci-scripts/arc/runner-image/Dockerfile | 15 + ci-scripts/arc/setup-dind-mirror.sh | 58 --- ci-scripts/arc/setup-runner-image.sh | 68 +--- ci-scripts/ci-env/README.md | 130 +++++++ .../ci-env/ci-env-controller-deployment.yaml | 66 ++++ ci-scripts/ci-env/ci-env-controller-rbac.yaml | 216 +++++++++++ ci-scripts/ci-env/ci-env-controller.sh | 339 ++++++++++++++++++ 
ci-scripts/ci-env/ci-env-namespace.yaml | 7 + ci-scripts/ci-env/controller-image/Dockerfile | 64 ++++ .../controller-image/helm/ci-test-stack | 1 + .../ci-env/install-ci-env-controller.sh | 141 ++++++++ ci-scripts/ci-env/setup-controller-image.sh | 127 +++++++ .../examples/arc-0.14-extra-values.yaml | 29 -- ci-scripts/helm/ci-test-stack/Chart.yaml | 9 + .../helm/ci-test-stack/templates/NOTES.txt | 12 + .../helm/ci-test-stack/templates/_helpers.tpl | 30 ++ .../templates/console-clusterrolebinding.yaml | 18 + .../templates/console-configmap.yaml | 20 ++ .../templates/console-deployment.yaml | 81 +++++ .../templates/console-route.yaml | 23 ++ .../ci-test-stack/templates/console-sa.yaml | 20 ++ .../templates/console-service.yaml | 16 + .../templates/plugin-configmap.yaml | 40 +++ .../templates/plugin-deployment.yaml | 52 +++ .../templates/plugin-service.yaml | 16 + ci-scripts/helm/ci-test-stack/values.yaml | 28 ++ ci-scripts/test-cleanup.sh | 40 +-- 42 files changed, 1946 insertions(+), 425 deletions(-) create mode 100644 .github/actions/ci-env-release/action.yml create mode 100644 .github/actions/ci-env-request/action.yml delete mode 100755 ci-scripts/arc/arc-dind-post-render.sh create mode 100644 ci-scripts/arc/ci-console-clusterrole.yaml delete mode 100755 ci-scripts/arc/setup-dind-mirror.sh create mode 100644 ci-scripts/ci-env/README.md create mode 100644 ci-scripts/ci-env/ci-env-controller-deployment.yaml create mode 100644 ci-scripts/ci-env/ci-env-controller-rbac.yaml create mode 100755 ci-scripts/ci-env/ci-env-controller.sh create mode 100644 ci-scripts/ci-env/ci-env-namespace.yaml create mode 100644 ci-scripts/ci-env/controller-image/Dockerfile create mode 120000 ci-scripts/ci-env/controller-image/helm/ci-test-stack create mode 100755 ci-scripts/ci-env/install-ci-env-controller.sh create mode 100755 ci-scripts/ci-env/setup-controller-image.sh delete mode 100644 ci-scripts/examples/arc-0.14-extra-values.yaml create mode 100644 
ci-scripts/helm/ci-test-stack/Chart.yaml create mode 100644 ci-scripts/helm/ci-test-stack/templates/NOTES.txt create mode 100644 ci-scripts/helm/ci-test-stack/templates/_helpers.tpl create mode 100644 ci-scripts/helm/ci-test-stack/templates/console-clusterrolebinding.yaml create mode 100644 ci-scripts/helm/ci-test-stack/templates/console-configmap.yaml create mode 100644 ci-scripts/helm/ci-test-stack/templates/console-deployment.yaml create mode 100644 ci-scripts/helm/ci-test-stack/templates/console-route.yaml create mode 100644 ci-scripts/helm/ci-test-stack/templates/console-sa.yaml create mode 100644 ci-scripts/helm/ci-test-stack/templates/console-service.yaml create mode 100644 ci-scripts/helm/ci-test-stack/templates/plugin-configmap.yaml create mode 100644 ci-scripts/helm/ci-test-stack/templates/plugin-deployment.yaml create mode 100644 ci-scripts/helm/ci-test-stack/templates/plugin-service.yaml create mode 100644 ci-scripts/helm/ci-test-stack/values.yaml diff --git a/.github/actions/ci-env-release/action.yml b/.github/actions/ci-env-release/action.yml new file mode 100644 index 0000000000..861b6fa422 --- /dev/null +++ b/.github/actions/ci-env-release/action.yml @@ -0,0 +1,58 @@ +name: Release CI Test Environment +description: > + Signal the ci-env-controller to tear down a test environment by patching + the trigger ConfigMap to desired-state=absent, then wait for cleanup to + complete and delete the ConfigMap. + +inputs: + configmap-name: + description: Name of the trigger ConfigMap + required: true + ci-env-namespace: + description: Namespace where ci-env-controller runs + default: ci-env + timeout: + description: Max seconds to wait for cleanup to complete + default: '300' + +runs: + using: composite + steps: + - name: Release environment + shell: bash + env: + CM_NAME: ${{ inputs.configmap-name }} + CM_NS: ${{ inputs.ci-env-namespace }} + TIMEOUT: ${{ inputs.timeout }} + run: | + if ! 
oc get configmap "${CM_NAME}" -n "${CM_NS}" &>/dev/null; then + echo "ConfigMap ${CM_NS}/${CM_NAME} not found, nothing to clean up." + exit 0 + fi + + oc patch configmap "${CM_NAME}" -n "${CM_NS}" \ + --type merge -p '{"data":{"desired-state":"absent"}}' + + echo "Waiting for controller to clean up..." + INTERVAL=5 + ELAPSED=0 + + while true; do + STATUS="$(oc get configmap "${CM_NAME}" -n "${CM_NS}" \ + -o jsonpath='{.data.status}' 2>/dev/null || echo "")" + + if [[ "${STATUS}" == "cleaned" ]]; then + echo "Cleanup complete." + break + fi + + if (( ELAPSED >= TIMEOUT )); then + echo "::warning::Timed out waiting for controller cleanup (status=${STATUS})" + break + fi + + sleep "${INTERVAL}" + ELAPSED=$(( ELAPSED + INTERVAL )) + done + + oc delete configmap "${CM_NAME}" -n "${CM_NS}" 2>/dev/null || true diff --git a/.github/actions/ci-env-request/action.yml b/.github/actions/ci-env-request/action.yml new file mode 100644 index 0000000000..a69e911b78 --- /dev/null +++ b/.github/actions/ci-env-request/action.yml @@ -0,0 +1,124 @@ +name: Request CI Test Environment +description: > + Create a trigger ConfigMap for the ci-env-controller and wait until the + test environment (namespace, console, plugin) is provisioned and ready. 
+ +inputs: + plugin-image: + description: Plugin container image to deploy + required: true + test-namespace: + description: Kubernetes namespace for the test environment + required: true + configmap-name: + description: Name of the trigger ConfigMap + required: true + ci-env-namespace: + description: Namespace where ci-env-controller runs + default: ci-env + timeout: + description: Max seconds to wait for environment to become ready + default: '360' + +outputs: + bridge-base-address: + description: In-cluster URL for the console bridge + value: ${{ steps.wait.outputs.bridge-base-address }} + console-route: + description: External HTTPS route for the console + value: ${{ steps.wait.outputs.console-route }} + +runs: + using: composite + steps: + - name: Create trigger ConfigMap + shell: bash + run: | + cat </dev/null || echo "")" + + case "${STATUS}" in + ready) + echo "Environment is ready." + break + ;; + error) + ERR_MSG="$(oc get configmap "${CM_NAME}" -n "${CM_NS}" \ + -o jsonpath='{.data.error-message}' 2>/dev/null || echo "unknown error")" + echo "::error::Environment provisioning failed: ${ERR_MSG}" + exit 1 + ;; + *) + if (( ELAPSED >= TIMEOUT )); then + echo "::error::Timed out waiting for environment (status=${STATUS:-pending})" + exit 1 + fi + echo " status=${STATUS:-pending} (${ELAPSED}s / ${TIMEOUT}s)..." 
+ sleep "${INTERVAL}" + ELAPSED=$(( ELAPSED + INTERVAL )) + ;; + esac + done + + BRIDGE_BASE_ADDRESS="$(oc get configmap "${CM_NAME}" -n "${CM_NS}" \ + -o jsonpath='{.data.bridge-base-address}')" + CONSOLE_ROUTE="$(oc get configmap "${CM_NAME}" -n "${CM_NS}" \ + -o jsonpath='{.data.console-route}' 2>/dev/null || echo "")" + + echo "bridge-base-address=${BRIDGE_BASE_ADDRESS}" >> "${GITHUB_OUTPUT}" + echo "console-route=${CONSOLE_ROUTE}" >> "${GITHUB_OUTPUT}" + + - name: Write job summary + shell: bash + env: + CM_NAME: ${{ inputs.configmap-name }} + CM_NS: ${{ inputs.ci-env-namespace }} + PLUGIN_IMAGE: ${{ inputs.plugin-image }} + TEST_NS: ${{ inputs.test-namespace }} + BRIDGE: ${{ steps.wait.outputs.bridge-base-address }} + ROUTE: ${{ steps.wait.outputs.console-route }} + run: | + { + echo "
CI Test Environment" + echo "" + echo "| Input Parameter | Value |" + echo "|------|-------|" + echo "| ConfigMap | \`${CM_NS}/${CM_NAME}\` |" + echo "| Plugin image | \`${PLUGIN_IMAGE}\` |" + echo "| Test namespace | \`${TEST_NS}\` |" + echo "" + echo "| Output Parameter | Value |" + echo "|------|-------|" + echo "| Bridge base address | \`${BRIDGE}\` |" + echo "| Console route | \`${ROUTE}\` |" + echo "" + echo "
" + } >> "${GITHUB_STEP_SUMMARY}" diff --git a/.github/workflows/poc-e2e-ci-test2.yml b/.github/workflows/poc-e2e-ci-test2.yml index 4e34a17917..70b79b55d0 100644 --- a/.github/workflows/poc-e2e-ci-test2.yml +++ b/.github/workflows/poc-e2e-ci-test2.yml @@ -21,14 +21,16 @@ permissions: actions: read env: - BRIDGE_BASE_ADDRESS: http://localhost:9000 CYPRESS_CNV_NS: kubevirt-hyperconverged CYPRESS_OS_IMAGES_NS: kubevirt-os-images CYPRESS_TEST_NS: kubevirt-plugin-ci-test-${{ github.run_id }} CYPRESS_TEST_SECRET_NAME: ci-test-secret - # KUBEVIRT_PLUGIN_IMAGE: 'ttl.sh/kubevirt-plugin-ci-${{ github.run_id }}-${{ github.run_number }}:2h' - KUBEVIRT_PLUGIN_IMAGE: ttl.sh/kubevirt-plugin-ci-1234:6h + KUBEVIRT_PLUGIN_IMAGE: 'ttl.sh/kubevirt-plugin-ci-${{ github.run_id }}-${{ github.run_number }}:2h' + # KUBEVIRT_PLUGIN_IMAGE: ttl.sh/kubevirt-plugin-ci-1234:6h + + CI_ENV_NS: ci-env + CI_ENV_CM: ci-env-${{ github.run_id }} jobs: check-runner: @@ -45,7 +47,7 @@ jobs: echo "| --- | --- |" for var in HOME USER RUNNER_NAME RUNNER_OS RUNNER_ARCH \ GITHUB_REPOSITORY GITHUB_REF GITHUB_SHA GITHUB_RUN_ID GITHUB_RUN_NUMBER \ - BRIDGE_BASE_ADDRESS CYPRESS_CNV_NS CYPRESS_OS_IMAGES_NS \ + CYPRESS_CNV_NS CYPRESS_OS_IMAGES_NS \ CYPRESS_TEST_NS KUBEVIRT_PLUGIN_IMAGE KUBEVIRT_UI_PLUGIN_RUNNER; do echo "| \`$var\` | \`${!var:-}\` |" done @@ -56,7 +58,7 @@ jobs: echo "" echo "| Tool | Available |" echo "| --- | --- |" - for cmd in jq yq envsubst curl kubectl oc virtctl docker npm node; do + for cmd in jq yq envsubst curl kubectl oc virtctl helm npm node; do if command -v "$cmd" &>/dev/null; then echo "| \`$cmd\` | ✅ |" else @@ -119,9 +121,6 @@ jobs: || echo "| — | HCO CSV not found | — |" echo "" - # HCO labels every operand CR with app.kubernetes.io/managed-by=hco-operator. - # Use that label + -A (all namespaces) so we never hardcode a name or namespace. - # KubeVirt uses observedKubeVirtVersion; all others use observedVersion. 
echo "### HCO Managed Operand Versions" echo "" echo "| Operand | Version |" @@ -166,18 +165,48 @@ jobs: - name: Check if kubevirt-plugin image exists in registry id: check_image run: | - # Try unauthenticated first (works for public registries) if skopeo inspect docker://${KUBEVIRT_PLUGIN_IMAGE} &>/dev/null; then echo "IMAGE_EXISTS=true" >> $GITHUB_OUTPUT else echo "IMAGE_EXISTS=false" >> $GITHUB_OUTPUT fi - - name: Build kubevirt-plugin image + - name: Generate tags for the kubevirt-plugin image + id: meta + if: steps.check_image.outputs.IMAGE_EXISTS == 'false' + uses: docker/metadata-action@v6 + env: + DOCKER_METADATA_SET_OUTPUT_ENV: false + with: + images: ${{ env.KUBEVIRT_PLUGIN_IMAGE }} + tags: | + type=schedule + type=ref,event=branch + type=ref,event=pr + type=semver,pattern={{version}} + type=semver,pattern={{major}}.{{minor}} + type=semver,pattern={{major}} + type=sha + + - name: Build and push if: steps.check_image.outputs.IMAGE_EXISTS == 'false' + env: + LABELS: ${{ steps.meta.outputs.labels }} run: | - docker build -t ${KUBEVIRT_PLUGIN_IMAGE} -f Dockerfile . - docker push ${KUBEVIRT_PLUGIN_IMAGE} + { + echo "## Labels" + echo "\`\`\`" + echo "${LABELS}" + echo "\`\`\`" + } | tee -a "$GITHUB_STEP_SUMMARY" + + LABEL_ARGS=() + while IFS= read -r line; do + [ -n "$line" ] && LABEL_ARGS+=(--label "$line") + done <<< "$LABELS" + + podman build "${LABEL_ARGS[@]}" -t ${KUBEVIRT_PLUGIN_IMAGE} -f Dockerfile . 
+ podman push ${KUBEVIRT_PLUGIN_IMAGE} run-gating-tests: name: Run Gating Tests @@ -186,64 +215,25 @@ jobs: timeout-minutes: 120 env: PLUGIN_IMAGE: ${{ needs.build-kubevirt-plugin-image.outputs.kubevirt-plugin-image }} - PLUGIN_PORT: 9001 - PLUGIN_NAME: kubevirt-plugin-ci - PLUGIN_TRANSPORT: http - CONSOLE_PORT: 9000 steps: - name: Checkout uses: actions/checkout@v6 - - name: Resolve console image from cluster OpenShift version - run: bash ci-scripts/resolve-console-image.sh >> "${GITHUB_ENV}" - - - name: Setup required namespaced resources - run: | - oc create namespace "${CYPRESS_TEST_NS}" --dry-run=client -o yaml | oc apply -f - - # Patch the existing fixture to substitute the CI secret's namespace and name. - yq e '.metadata.name = strenv(CYPRESS_TEST_SECRET_NAME) | .metadata.namespace = strenv(CYPRESS_TEST_NS)' \ - cypress/fixtures/secret.yaml | oc apply -f - - - - name: Start kubevirt-plugin container (mimics operator deployment with a ConfigMap + Secret) - env: - PLUGIN_URL: '${{ env.PLUGIN_TRANSPORT }}://localhost:${{ env.PLUGIN_PORT }}/plugin-manifest.json' - run: | - ./ci-scripts/start-plugin-container.sh - - echo "Waiting for plugin at ${PLUGIN_URL}..." - for i in $(seq 1 30); do - if curl -skSf "${PLUGIN_URL}" -o /dev/null; then - echo "Plugin is responding at ${PLUGIN_URL}." 
- exit 0 - fi - if [[ "$i" -eq 30 ]]; then - echo "::error::Plugin did not become ready on ${PLUGIN_URL}" - docker ps -a --filter "name=${PLUGIN_NAME}" || true - docker inspect "${PLUGIN_NAME}" --format 'status={{.State.Status}} exit={{.State.ExitCode}} err={{.State.Error}}' 2>/dev/null || true - docker logs "${PLUGIN_NAME}" 2>&1 || true - exit 1 - fi - sleep 2 - done + - name: Provision CI test environment + id: ci-env + uses: ./.github/actions/ci-env-request + with: + plugin-image: ${{ env.PLUGIN_IMAGE }} + test-namespace: ${{ env.CYPRESS_TEST_NS }} + configmap-name: ${{ env.CI_ENV_CM }} - - name: Start the "off cluster" console - shell: bash + - name: Create test secret run: | - ./ci-scripts/start-console.sh - - echo "Waiting for console at ${BRIDGE_BASE_ADDRESS}..." - for i in $(seq 1 60); do - if curl -s -o /dev/null -w "%{http_code}" "${BRIDGE_BASE_ADDRESS}/" | grep -qE '200|301|302'; then - echo "Console is responding." - break - fi - if [[ "$i" -eq 60 ]]; then - echo "::error::Console did not become ready within the wait window." - exit 1 - fi - sleep 5 - done + yq e ' + .metadata.name = strenv(CYPRESS_TEST_SECRET_NAME) | + .metadata.namespace = strenv(CYPRESS_TEST_NS) + ' cypress/fixtures/secret.yaml | oc apply -f - # TODO: Add dependency caching (either use the setup-node action with caching, or add explicit caching) - name: Install dependencies @@ -251,13 +241,10 @@ jobs: npm ci --ignore-scripts --no-audit npx cypress install - # # TODO: Replace with the cypress action? 
- # - name: Run Cypress gating tests - # run: | - # npm run test-cypress-headless -- --spec="${{ inputs.test_spec }}" - - name: Run gating tests uses: cypress-io/github-action@v7 + env: + BRIDGE_BASE_ADDRESS: ${{ steps.ci-env.outputs.bridge-base-address }} with: summary-title: 'Cypress gating tests' install: false @@ -281,21 +268,21 @@ jobs: retention-days: 7 if-no-files-found: ignore - - name: Capture logs, stop and rm the console and plugin containers + - name: e2e CI diagnostics - Collect console and plugin pod logs if: always() run: | - TMP=/tmp/e2e-ci-diagnostics/container-logs + TMP=/tmp/e2e-ci-diagnostics/pod-logs mkdir -p "${TMP}" - docker logs console > "${TMP}/console.log" 2>&1 || true - docker stop console || echo "::warning::Could not stop console container" - docker rm -f console || true + HELM_RELEASE="$(oc get configmap "${CI_ENV_CM}" -n "${CI_ENV_NS}" \ + -o jsonpath='{.data.helm-release}' 2>/dev/null || echo "${CI_ENV_CM}")" - docker logs "${PLUGIN_NAME}" > "${TMP}/kubevirt-plugin.log" 2>&1 || true - docker stop "${PLUGIN_NAME}" || echo "::warning::Could not stop ${PLUGIN_NAME} container" - docker rm -f "${PLUGIN_NAME}" || true + oc logs -n "${CYPRESS_TEST_NS}" -l "app=${HELM_RELEASE}-console" --tail=-1 \ + > "${TMP}/console.log" 2>&1 || true + oc logs -n "${CYPRESS_TEST_NS}" -l "app=${HELM_RELEASE}-plugin" --tail=-1 \ + > "${TMP}/kubevirt-plugin.log" 2>&1 || true - - name: Collect OpenShift cluster diagnostics on failure + - name: e2e CI diagnostics - Collect OpenShift cluster information if: failure() run: | TMP=/tmp/e2e-ci-diagnostics/cluster @@ -305,19 +292,25 @@ jobs: oc get pods -n kubevirt-hyperconverged -o wide > "${TMP}/hco_pods.txt" 2>/dev/null || true oc get nodes -o wide > "${TMP}/nodes.txt" 2>/dev/null || true oc get events -n "${CYPRESS_TEST_NS}" --sort-by='.lastTimestamp' > "${TMP}/test_ns_events.txt" 2>/dev/null || true + oc get pods -n "${CYPRESS_TEST_NS}" -o wide > "${TMP}/test_ns_pods.txt" 2>/dev/null || true - - name: Upload E2E 
diagnostics (cluster + container logs) on failure - if: failure() + - name: e2e CI diagnostics - Upload artifacts + if: always() uses: actions/upload-artifact@v6 with: - name: e2e-ci-diagnostics-cluster-and-containers-${{ github.run_id }} + name: e2e-ci-diagnostics-${{ github.run_id }} path: /tmp/e2e-ci-diagnostics/ retention-days: 7 if-no-files-found: ignore - # TODO: Add any other cleanup steps that are required for the test to this step - - name: Cleanup cluster resources + - name: Clean up test entities if: always() run: | - ./ci-scripts/test-cleanup.sh || echo "::warning::Cleanup encountered errors (non-fatal)" - oc delete namespace "${CYPRESS_TEST_NS}" --wait=false || true + echo "Cleaning test entities in ${CYPRESS_TEST_NS}..." + bash ci-scripts/test-cleanup.sh + + - name: Release CI test environment + if: always() + uses: ./.github/actions/ci-env-release + with: + configmap-name: ${{ env.CI_ENV_CM }} diff --git a/.prettierignore b/.prettierignore index 99591d54e5..23f16e89ec 100644 --- a/.prettierignore +++ b/.prettierignore @@ -11,4 +11,3 @@ cypress/cypress-a11y-report.json # Helm templates contain Go template syntax that Prettier cannot parse ci-scripts/**/helm/**/templates/ -ci-scripts/images/**/helm/**/templates/ diff --git a/.prettierignorecode b/.prettierignorecode index 80daebe239..8fc9f02b34 100644 --- a/.prettierignorecode +++ b/.prettierignorecode @@ -9,6 +9,9 @@ cypress/gui-test-screenshots cypress/cypress-a11y-report.json /locales +# Helm templates contain Go template syntax that Prettier cannot parse +ci-scripts/helm/**/templates/ + # Ignore anything that eslint will look at since eslint also applies prettier. 
# **/*.js diff --git a/.vscode/settings.json b/.vscode/settings.json index 9968743f07..ba07a6a4ff 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -7,5 +7,9 @@ "source.fixAll.eslint": "always" }, "eslint.validate": ["javascript", "typescript"], - "cSpell.words": ["hyperconverged", "kubevirt"] + "cSpell.words": ["hyperconverged", "kubevirt"], + "yaml.validate": true, + "files.associations": { + "ci-scripts/helm/**/templates/*.yaml": "helm" + } } diff --git a/ci-scripts/_cluster-helpers.sh b/ci-scripts/_cluster-helpers.sh index 7b03c30261..208a82d65b 100644 --- a/ci-scripts/_cluster-helpers.sh +++ b/ci-scripts/_cluster-helpers.sh @@ -8,7 +8,6 @@ # After sourcing, call: # resolve_cli_downloads # populates OC_URL, VIRTCTL_URL, HELM_URL # resolve_oc_version # populates OC_VERSION (if not already set) -# resolve_internal_registry # populates INTERNAL_REGISTRY # # The functions use ConsoleCLIDownload resources and rewrite public route URLs # to cluster-internal service URLs so build pods don't need to trust the @@ -17,13 +16,6 @@ # Requires: oc logged into OpenShift. # Optional: jq (URL resolution is silently skipped without it). -verify_oc() { - if ! oc get clusterversion version &>/dev/null; then - echo "ERROR: OpenShift cluster required (clusterversion.version not found)." - exit 1 - fi -} - # Rewrite a public https route URL to its backing cluster-internal HTTP service. # Arg: $1 = URL Reads: _ALL_ROUTES_JSON (set by resolve_cli_downloads) _route_url_to_internal() { @@ -31,12 +23,11 @@ _route_url_to_internal() { [[ -z "${url}" ]] && return local host path route_info ns svc svc_port host=$(echo "${url}" | sed -E 's|https://([^/]+).*|\1|') - path=$(echo "${url}" | sed -E 's|https://[^/]+(/.*)?|\1|') - path="${path:-/}" + path=$(echo "${url}" | sed -E 's|https://[^/]+(/.*)?|\1|' || echo '/') route_info=$(echo "${_ALL_ROUTES_JSON}" \ | jq -r --arg h "${host}" \ - '.items[]? 
| select(.spec.host == $h) | "\(.metadata.namespace) \(.spec.to.name)"' \ - | head -1) || true + '.items[] | select(.spec.host == $h) | "\(.metadata.namespace) \(.spec.to.name)"' \ + | head -1) if [[ -n "${route_info}" ]]; then read -r ns svc <<< "${route_info}" svc_port=$(oc get service "${svc}" -n "${ns}" \ @@ -58,14 +49,6 @@ resolve_oc_version() { OC_VERSION="${OC_VERSION:-4.20}" } -# Resolve the internal image registry hostname from the cluster. -# Sets INTERNAL_REGISTRY; defaults to the well-known service address if detection fails. -resolve_internal_registry() { - INTERNAL_REGISTRY="$(oc get image.config.openshift.io/cluster \ - -o jsonpath='{.status.internalRegistryHostname}' 2>/dev/null || true)" - INTERNAL_REGISTRY="${INTERNAL_REGISTRY:-image-registry.openshift-image-registry.svc:5000}" -} - # Resolve binary download URLs from ConsoleCLIDownload resources. # Sets: OC_URL, VIRTCTL_URL, HELM_URL (empty string if not resolved). # Callers can choose which variables they need; unused ones remain empty. @@ -86,7 +69,7 @@ resolve_cli_downloads() { OC_URL=$(echo "${cli_json}" \ | jq -r '.items[].spec.links[] | select(.text | test("oc.*linux.*x86_64|oc.*linux.*amd64"; "i")) | .href' \ - | head -1 || true) + | head -1) VIRTCTL_URL=$(echo "${cli_json}" \ | jq -r '.items[].spec.links[] | select(.text | test("virtctl.*linux.*amd64|virtctl.*linux.*x86_64"; "i")) | .href' \ | head -1 || true) diff --git a/ci-scripts/arc/arc-dind-post-render.sh b/ci-scripts/arc/arc-dind-post-render.sh deleted file mode 100755 index 5fa55301db..0000000000 --- a/ci-scripts/arc/arc-dind-post-render.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/env bash -# Helm post-renderer for gha-runner-scale-set + containerMode dind on OpenShift. -# -# 1) Optional: replace hardcoded docker:dind with internal registry image when -# ci-scripts/generated/arc-dind-replace.env exists (setup-dind-mirror.sh, etc.). 
-# 2) Always: append --storage-driver=vfs to dockerd args so the inner daemon -# does not use overlay on top of the pod's overlay (containerd EINVAL on mount). -# -# Ref: ARC chart dind args in actions-runner-controller _helpers.tpl (dind-container). -set -euo pipefail -ARC_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -ENV_FILE="${ARC_ROOT}/../generated/arc-dind-replace.env" - -tmp="$(mktemp)" -trap 'rm -f "${tmp}"' EXIT -cat >"${tmp}" - -ARC_DIND_INTERNAL_IMAGE="" -if [[ -f "${ENV_FILE}" ]]; then - # shellcheck source=/dev/null - source "${ENV_FILE}" -fi - -skip_vfs=0 -if grep -q 'storage-driver=vfs' "${tmp}"; then - skip_vfs=1 -fi - -while IFS= read -r line || [[ -n "${line}" ]]; do - if [[ -n "${ARC_DIND_INTERNAL_IMAGE:-}" && "${line}" == *docker:dind* ]]; then - printf '%s\n' "${line//docker:dind/${ARC_DIND_INTERNAL_IMAGE}}" - continue - fi - if [[ "${skip_vfs}" == 0 && "${line}" =~ ^([[:space:]]*)-\ --group=\$\(DOCKER_GROUP_GID\)$ ]]; then - printf '%s\n' "${line}" - printf '%s- --storage-driver=vfs\n' "${BASH_REMATCH[1]}" - continue - fi - printf '%s\n' "${line}" -done <"${tmp}" diff --git a/ci-scripts/arc/arc-helm-helpers.sh b/ci-scripts/arc/arc-helm-helpers.sh index 5ea9b8c4f7..a986c3b199 100755 --- a/ci-scripts/arc/arc-helm-helpers.sh +++ b/ci-scripts/arc/arc-helm-helpers.sh @@ -64,27 +64,3 @@ arc_helm_append_scale_set_labels() { echo "Scale set labels (multilabel): ${ARC_SCALE_SET_LABELS}" _helm_arr+=(--set-json "scaleSetLabels=${json}") } - -# -# Post-renders gha-runner-scale-set dind manifests: optional docker:dind → mirror -# (ci-scripts/generated/arc-dind-replace.env), and always injects --storage-driver=vfs -# so dockerd works on OpenShift (nested overlay otherwise fails with EINVAL). -# -# Usage: arc_helm_append_dind_post_renderer RUNNER_SET_ARGS "${ARC_DIR}" "${CI_SCRIPTS_DIR}" -# Env: ARC_USE_DIND_POST_RENDER=0 to disable. 
-# -arc_helm_append_dind_post_renderer() { - local -n _helm_arr="${1:?helm args array name required}" - local arc_dir="${2:?arc directory required}" - local ci_scripts_dir="${3:?ci-scripts directory required}" - local env_file="${ci_scripts_dir}/generated/arc-dind-replace.env" - local pr_script="${arc_dir}/arc-dind-post-render.sh" - [[ "${ARC_USE_DIND_POST_RENDER:-1}" == "0" ]] && return 0 - [[ ! -f "${pr_script}" ]] && return 0 - if [[ -f "${env_file}" ]]; then - echo "Helm post-renderer: docker:dind mirror (${env_file}) + dind vfs (OpenShift)" - else - echo "Helm post-renderer: dind --storage-driver=vfs (OpenShift / nested overlay)" - fi - _helm_arr+=(--post-renderer "${pr_script}") -} diff --git a/ci-scripts/arc/arc-openshift-scc.yaml b/ci-scripts/arc/arc-openshift-scc.yaml index 598e359991..38c260cf64 100644 --- a/ci-scripts/arc/arc-openshift-scc.yaml +++ b/ci-scripts/arc/arc-openshift-scc.yaml @@ -1,27 +1,25 @@ # OpenShift Security Context Constraint and ClusterRole for GitHub ARC runners. # # The runner container uses UID 1001 / GID 123 via Helm template.securityContext. -# With containerMode.type=dind, the chart adds a privileged docker:dind sidecar that -# must run as root and start dockerd — so this SCC allows privileged containers and -# RunAsAny for UIDs (stricter MustRunAs/1001 would block the dind container). +# No privileged containers or privilege escalation is needed; the CI test stack +# (console + plugin) runs as separate unprivileged pods via the ci-test-stack +# Helm chart. # # Ref: https://developers.redhat.com/articles/2025/02/17/how-securely-deploy-github-arc-openshift -# ARC dind template: actions-runner-controller charts gha-runner-scale-set -# --- apiVersion: security.openshift.io/v1 kind: SecurityContextConstraints metadata: name: github-arc annotations: - kubernetes.io/description: 'ARC runners: main container as UID 1001 (Helm), Docker-in-Docker privileged sidecar. RunAsAny required for dind root + runner 1001.' 
+ kubernetes.io/description: 'ARC runners: main container as UID 1001 (Helm), non-root, unprivileged.' allowHostDirVolumePlugin: false allowHostIPC: false allowHostNetwork: false allowHostPID: false allowHostPorts: false -allowPrivilegeEscalation: true -allowPrivilegedContainer: true +allowPrivilegeEscalation: false +allowPrivilegedContainer: false allowedCapabilities: null defaultAddCapabilities: null fsGroup: @@ -30,7 +28,7 @@ groups: [] priority: null readOnlyRootFilesystem: false runAsUser: - type: RunAsAny + type: MustRunAsNonRoot seLinuxContext: type: MustRunAs supplementalGroups: diff --git a/ci-scripts/arc/arc-runner-rbac.yaml b/ci-scripts/arc/arc-runner-rbac.yaml index 93c56061dd..3050e3db6b 100644 --- a/ci-scripts/arc/arc-runner-rbac.yaml +++ b/ci-scripts/arc/arc-runner-rbac.yaml @@ -1,5 +1,6 @@ -# Grant the ARC runner scale set ServiceAccount CI permissions (oc, test namespaces, KubeVirt). -# gha-runner-scale-set creates: -gha-rs-no-permission (default: kubevirt-plugin-ci-gha-rs-no-permission). +# Grant the ARC runner scale set ServiceAccount CI permissions. +# gha-runner-scale-set creates: -gha-rs-no-permission +# (default: kubevirt-plugin-ci-gha-rs-no-permission). # # Applied automatically by ci-scripts/arc/install-runner-scale-set.sh. # Subject name + namespace substituted for RUNNER_SCALE_SET_NAME / ARC_RUNNERS_NS. @@ -8,12 +9,14 @@ # # To skip cluster RBAC (custom bindings): SKIP_ARC_RUNNER_RBAC=1 # -# If RUNNER_SCALE_SET_NAME differs from defaults and you apply this file by hand, edit subjects or use: +# If RUNNER_SCALE_SET_NAME differs from defaults and you apply this file by hand, +# edit subjects or use: # oc create clusterrolebinding arc-runner-ci-custom --clusterrole=arc-runner-ci \ # --serviceaccount=:-gha-rs-no-permission # -# Alternative (cluster-admin): bind cluster-admin to the same ServiceAccount instead of arc-runner-ci. 
-# +# NOTE: The runner's ConfigMap access in the ci-env namespace is granted via a +# namespaced RoleBinding created by ci-scripts/ci-env/install-ci-env-controller.sh, +# not by this ClusterRole. --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole @@ -22,44 +25,34 @@ metadata: labels: app.kubernetes.io/component: arc-runner-rbac rules: - # Read cluster and console config (for oc cluster-info, console URL) + # --- Cluster reads (console URL, version logging) --- - apiGroups: [''] - resources: ['nodes', 'namespaces'] + resources: ['nodes'] verbs: ['get', 'list'] - apiGroups: ['config.openshift.io'] resources: ['consoles', 'clusterversions', 'ingresses'] verbs: ['get', 'list'] - # Create/delete test namespaces and manage resources for Cypress + + # --- Diagnostics (pod logs, events in test namespaces) --- - apiGroups: [''] - resources: - [ - 'namespaces', - 'pods', - 'pods/log', - 'services', - 'secrets', - 'configmaps', - 'events', - 'serviceaccounts', - 'persistentvolumeclaims', - ] - verbs: ['get', 'list', 'watch', 'create', 'update', 'patch', 'delete', 'deletecollection'] - # Read HCO operand CRs for version logging (Log HCO and managed operator versions step) + resources: ['pods', 'pods/log', 'events'] + verbs: ['get', 'list'] + + # --- Cypress cy.exec runs: oc patch virtualmachine --- + - apiGroups: ['kubevirt.io'] + resources: ['virtualmachines'] + verbs: ['get', 'list', 'watch', 'patch'] + + # --- check-runner job: HCO / operator version logging --- - apiGroups: ['operators.coreos.com'] resources: ['clusterserviceversions'] verbs: ['get', 'list'] - apiGroups: ['kubevirt.io'] - resources: - [ - 'kubevirts', - 'virtualmachines', - 'virtualmachineinstances', - 'virtualmachineinstancemigrations', - ] - verbs: ['get', 'list', 'watch', 'create', 'update', 'patch', 'delete'] + resources: ['kubevirts'] + verbs: ['get', 'list'] - apiGroups: ['cdi.kubevirt.io'] - resources: ['cdis', 'datavolumes'] - verbs: ['get', 'list', 'watch', 'create', 'update', 'patch', 
'delete'] + resources: ['cdis'] + verbs: ['get', 'list'] - apiGroups: ['ssp.kubevirt.io'] resources: ['ssps'] verbs: ['get', 'list'] @@ -69,12 +62,6 @@ rules: - apiGroups: ['hostpathprovisioner.kubevirt.io'] resources: ['hostpathprovisioners'] verbs: ['get', 'list'] - - apiGroups: ['snapshot.storage.k8s.io'] - resources: ['volumesnapshots'] - verbs: ['get', 'list', 'watch', 'create', 'update', 'patch', 'delete'] - - apiGroups: ['subresources.kubevirt.io'] - resources: ['virtualmachineinstances/console', 'virtualmachineinstances/vnc'] - verbs: ['get'] --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding diff --git a/ci-scripts/arc/arc-runner-scale-set.pod.yaml b/ci-scripts/arc/arc-runner-scale-set.pod.yaml index dd01930843..c27f8109ad 100644 --- a/ci-scripts/arc/arc-runner-scale-set.pod.yaml +++ b/ci-scripts/arc/arc-runner-scale-set.pod.yaml @@ -1,28 +1,13 @@ -# gha-runner-scale-set Helm values fragment: runner pod template (OpenShift + dind). +# gha-runner-scale-set Helm values fragment: runner pod template (OpenShift). # Used by ci-scripts/arc/install-runner-scale-set.sh. # -# With containerMode.type=dind, the chart merges this with docker:dind, initContainers, -# DOCKER_HOST, and dind volume mounts. Only the container named "runner" is customized here. -# -# Nested overlay: dockerd's default overlay graph on the pod's overlayfs fails (EINVAL). -# arc-dind-post-render.sh injects --storage-driver=vfs into the chart's dockerd args; re-run -# install-runner-scale-set.sh after upgrading the post-renderer (ARC_USE_DIND_POST_RENDER=0 skips it). -# -# docker:dind: the upstream chart hardcodes image docker:dind (no Helm value). This repo expects -# setup-dind-mirror.sh to mirror docker:dind → internal ImageStream arc-docker-dind:dind and write -# ci-scripts/generated/arc-dind-replace.env; install-runner-scale-set.sh runs Helm with -# --post-renderer arc-dind-post-render.sh when that file exists (or ARC_DIND_INTERNAL_IMAGE). 
-# SKIP_DIND_MIRROR=1 is for clusters that replace dind via ImageContentSourcePolicy / other mirroring. +# The CI test stack (console + plugin) runs as separate cluster pods via the +# ci-test-stack Helm chart, so no Docker-in-Docker sidecar is needed. # # Default image is upstream until ARC_RUNNER_IMAGE / --set overrides: # --set template.spec.containers[0].image=/... # # Ref: https://github.com/actions/actions-runner-controller/tree/master/charts/gha-runner-scale-set -# -# ARC 0.14+ (chart 0.14.0): multilabel via scaleSetLabels (see ARC_SCALE_SET_LABELS / examples/), -# resourceMeta for listener/RBAC metadata, listener defaults to kubernetes.io/os: linux. -# Experimental chart exposes runner.dind.container.image to avoid docker:dind post-rendering — -# see HOT_CLUSTER_CI.md if you migrate off the stable chart. --- template: spec: @@ -42,11 +27,10 @@ template: # Workaround for FIPS-enabled clusters (see POC_HOT_CLUSTER_CI.md). - name: OPENSSL_FORCE_FIPS_MODE value: '0' - # Workaround for FIPS-enabled clusters (see POC_HOT_CLUSTER_CI.md). - name: GOLANG_FIPS value: '0' securityContext: - allowPrivilegeEscalation: true + allowPrivilegeEscalation: false capabilities: drop: - ALL diff --git a/ci-scripts/arc/ci-console-clusterrole.yaml b/ci-scripts/arc/ci-console-clusterrole.yaml new file mode 100644 index 0000000000..903a84826f --- /dev/null +++ b/ci-scripts/arc/ci-console-clusterrole.yaml @@ -0,0 +1,116 @@ +# ClusterRole for the CI test stack console ServiceAccount. +# Applied once at ARC install time by install-arc-controller.sh. +# Bound per CI run by the ci-test-stack Helm chart's ClusterRoleBinding. +# +# Permissions cover what the OpenShift console needs to proxy browser requests +# (including WebSocket watch) for KubeVirt/CDI management, plus cluster-info +# reads for the console UI chrome and overview pages. +# +# IMPORTANT: ci-env-controller-rbac.yaml (ci-env-controller ClusterRole) must be +# a SUPERSET of this file. 
Any additions here must also be added there. +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: ci-console + labels: + app.kubernetes.io/component: ci-console-rbac +rules: + # Cluster info (console chrome, overview pages, cluster version banner) + - apiGroups: [''] + resources: ['nodes', 'namespaces'] + verbs: ['get', 'list', 'watch'] + - apiGroups: ['config.openshift.io'] + resources: ['consoles', 'clusterversions', 'ingresses', 'infrastructures', 'dnses'] + verbs: ['get', 'list', 'watch'] + - apiGroups: ['storage.k8s.io'] + resources: ['storageclasses'] + verbs: ['get', 'list', 'watch'] + + # Operator lifecycle (HCO subscription/CSV status, console watches these via WebSocket) + - apiGroups: ['operators.coreos.com'] + resources: ['clusterserviceversions', 'subscriptions'] + verbs: ['get', 'list', 'watch'] + - apiGroups: ['packages.operators.coreos.com'] + resources: ['packagemanifests'] + verbs: ['get', 'list'] + + # Console operator config (console watches its own operator CR) + - apiGroups: ['operator.openshift.io'] + resources: ['consoles'] + verbs: ['get', 'list', 'watch'] + + # Console UI resources (notifications, quick starts, links, plugins, YAML samples) + - apiGroups: ['console.openshift.io'] + resources: + - consoleplugins + - consolenotifications + - consolequickstarts + - consolelinks + - consoleyamlsamples + verbs: ['get', 'list', 'watch'] + + # User identity (console watches current user, even with disabled auth) + - apiGroups: ['user.openshift.io'] + resources: ['users'] + verbs: ['get', 'list', 'watch'] + + # Projects (self-scoped, but console needs watch for project list) + - apiGroups: ['project.openshift.io'] + resources: ['projects'] + verbs: ['get', 'list', 'watch'] + + # Core namespaced resources (VM creation, test fixtures, etc.) 
+ - apiGroups: [''] + resources: + - pods + - pods/log + - services + - secrets + - configmaps + - events + - serviceaccounts + - persistentvolumeclaims + verbs: ['get', 'list', 'watch', 'create', 'update', 'patch', 'delete'] + + # KubeVirt + - apiGroups: ['kubevirt.io'] + resources: + - kubevirts + - virtualmachines + - virtualmachineinstances + - virtualmachineinstancemigrations + verbs: ['get', 'list', 'watch', 'create', 'update', 'patch', 'delete'] + - apiGroups: ['subresources.kubevirt.io'] + resources: + - virtualmachineinstances/console + - virtualmachineinstances/vnc + verbs: ['get'] + - apiGroups: ['instancetype.kubevirt.io'] + resources: + - virtualmachineclusterinstancetypes + - virtualmachineinstancetypes + - virtualmachineclusterpreferences + - virtualmachinepreferences + verbs: ['get', 'list', 'watch'] + + # CDI + - apiGroups: ['cdi.kubevirt.io'] + resources: ['cdis', 'cdiconfigs', 'datavolumes', 'datasources'] + verbs: ['get', 'list', 'watch', 'create', 'update', 'patch', 'delete'] + + # HCO operand reads (version checks, overview status) + - apiGroups: ['ssp.kubevirt.io'] + resources: ['ssps'] + verbs: ['get', 'list'] + - apiGroups: ['networkaddonsoperator.network.kubevirt.io'] + resources: ['networkaddonsconfigs'] + verbs: ['get', 'list'] + - apiGroups: ['hostpathprovisioner.kubevirt.io'] + resources: ['hostpathprovisioners'] + verbs: ['get', 'list'] + + # Snapshots + - apiGroups: ['snapshot.storage.k8s.io'] + resources: ['volumesnapshots'] + verbs: ['get', 'list', 'watch', 'create', 'update', 'patch', 'delete'] diff --git a/ci-scripts/arc/install-arc-controller.sh b/ci-scripts/arc/install-arc-controller.sh index 1cd3697298..1b51507089 100755 --- a/ci-scripts/arc/install-arc-controller.sh +++ b/ci-scripts/arc/install-arc-controller.sh @@ -22,8 +22,8 @@ ARC_VERSION="${ARC_VERSION:-0.14.0}" echo "=== ARC controller installation (OpenShift) ===" echo " ARC_CONTROLLER_NS: ${ARC_CONTROLLER_NS}" echo " ARC_CONTROLLER_INSTALL_NAME: 
${ARC_CONTROLLER_INSTALL_NAME}" -echo " ARC_VERSION: ${ARC_VERSION}" echo " ARC_HELM_REPO: ${ARC_HELM_REPO}" +echo " ARC_VERSION: ${ARC_VERSION}" echo "" if ! oc get clusterversion version &>/dev/null; then @@ -38,11 +38,15 @@ oc create namespace "${ARC_CONTROLLER_NS}" --dry-run=client -o yaml | oc apply - echo "Applying ARC SCC and ClusterRole (github-arc)..." oc apply -f "${ARC_DIR}/arc-openshift-scc.yaml" -CONTROLLER_SA_NAME="${ARC_CONTROLLER_INSTALL_NAME}-gha-rs-controller" +echo "Applying CI console ClusterRole (ci-console)..." +oc apply -f "${ARC_DIR}/ci-console-clusterrole.yaml" + CONTROLLER_ARGS=(--namespace "${ARC_CONTROLLER_NS}") if [[ -n "${ARC_VERSION}" && "${ARC_VERSION}" != "latest" ]]; then CONTROLLER_ARGS+=(--version "${ARC_VERSION}") fi + +CONTROLLER_SA_NAME="${ARC_CONTROLLER_INSTALL_NAME}-gha-rs-controller" CONTROLLER_ARGS+=(--set "serviceAccount.name=${CONTROLLER_SA_NAME}") echo "Installing ARC controller (Helm release: ${ARC_CONTROLLER_INSTALL_NAME})..." diff --git a/ci-scripts/arc/install-runner-scale-set.sh b/ci-scripts/arc/install-runner-scale-set.sh index f1ae77b21a..e659f83255 100755 --- a/ci-scripts/arc/install-runner-scale-set.sh +++ b/ci-scripts/arc/install-runner-scale-set.sh @@ -1,7 +1,7 @@ #!/bin/bash # -# Install gha-runner-scale-set (OpenShift): runner namespace, Helm release, optional dind -# post-render (mirror file or ARC_DIND_INTERNAL_IMAGE), SCC bind, CI RBAC. +# Install gha-runner-scale-set (OpenShift): runner namespace, Helm release, +# SCC bind, CI RBAC. # Requires install-arc-controller.sh (or equivalent controller + SCC) already applied. 
# # Required environment variables: @@ -19,12 +19,8 @@ # ARC_RUNNERS_NS (default: arc-runners) # ARC_VERSION Helm chart version (default: 0.14.0); set to "latest" to omit --version # ARC_SCALE_SET_LABELS Optional comma-separated multilabel (ARC 0.14+) -# CONTAINER_MODE default dind; set to "none" to disable Docker-in-Docker # ARC_RUNNER_EXTRA_VALUES Optional second Helm values file (merged after pod.yaml) # ARC_RUNNER_IMAGE If set, use this image for the runner container -# ARC_DIND_INTERNAL_IMAGE If set, writes ci-scripts/generated/arc-dind-replace.env for this run -# (alternative to setup-dind-mirror.sh) -# ARC_USE_DIND_POST_RENDER Default 1; set to 0 to skip post-renderer # SKIP_ARC_RUNNER_RBAC Set to 1 to skip applying ci-scripts/arc/arc-runner-rbac.yaml # # Pod template fragment: ci-scripts/arc/arc-runner-scale-set.pod.yaml @@ -41,26 +37,24 @@ if [[ ! -f "${RUNNER_POD_VALUES}" ]]; then fi ARC_CONFIG_URL="${ARC_CONFIG_URL:?ARC_CONFIG_URL is required}" -RUNNER_SCALE_SET_NAME="${RUNNER_SCALE_SET_NAME:-kubevirt-plugin-ci}" -MIN_RUNNERS="${MIN_RUNNERS:-0}" -MAX_RUNNERS="${MAX_RUNNERS:-5}" ARC_CONTROLLER_NS="${ARC_CONTROLLER_NS:-arc-systems}" ARC_CONTROLLER_INSTALL_NAME="${ARC_CONTROLLER_INSTALL_NAME:-arc}" ARC_RUNNERS_NS="${ARC_RUNNERS_NS:-arc-runners}" -ARC_VERSION="${ARC_VERSION:-0.14.0}" -CONTAINER_MODE="${CONTAINER_MODE:-dind}" -[[ "${CONTAINER_MODE}" == "none" || "${CONTAINER_MODE}" == "disabled" ]] && CONTAINER_MODE="" ARC_HELM_REPO="oci://ghcr.io/actions/actions-runner-controller-charts" +ARC_VERSION="${ARC_VERSION:-0.14.0}" +RUNNER_SCALE_SET_NAME="${RUNNER_SCALE_SET_NAME:-kubevirt-plugin-ci}" +MIN_RUNNERS="${MIN_RUNNERS:-0}" +MAX_RUNNERS="${MAX_RUNNERS:-5}" echo "=== ARC runner scale set installation (OpenShift) ===" echo " ARC_CONFIG_URL: ${ARC_CONFIG_URL}" -echo " RUNNER_SCALE_SET_NAME: ${RUNNER_SCALE_SET_NAME}" -echo " MIN_RUNNERS / MAX_RUNNERS: ${MIN_RUNNERS} / ${MAX_RUNNERS}" echo " ARC_CONTROLLER_NS: ${ARC_CONTROLLER_NS}" echo " 
ARC_CONTROLLER_INSTALL_NAME: ${ARC_CONTROLLER_INSTALL_NAME}" echo " ARC_RUNNERS_NS: ${ARC_RUNNERS_NS}" +echo " ARC_HELM_REPO: ${ARC_HELM_REPO}" echo " ARC_VERSION: ${ARC_VERSION}" -echo " CONTAINER_MODE: ${CONTAINER_MODE:-"(none — no dind)"}" +echo " RUNNER_SCALE_SET_NAME: ${RUNNER_SCALE_SET_NAME}" +echo " MIN_RUNNERS / MAX_RUNNERS: ${MIN_RUNNERS} / ${MAX_RUNNERS}" echo " Runner pod values: ${RUNNER_POD_VALUES}" echo "" @@ -69,12 +63,6 @@ if ! oc get clusterversion version &>/dev/null; then exit 1 fi -if [[ -n "${ARC_DIND_INTERNAL_IMAGE:-}" && "${ARC_USE_DIND_POST_RENDER:-1}" != "0" ]]; then - mkdir -p "${CI_SCRIPTS_DIR}/generated" - printf 'ARC_DIND_INTERNAL_IMAGE=%s\n' "${ARC_DIND_INTERNAL_IMAGE}" > "${CI_SCRIPTS_DIR}/generated/arc-dind-replace.env" - echo "Wrote ${CI_SCRIPTS_DIR}/generated/arc-dind-replace.env from ARC_DIND_INTERNAL_IMAGE" -fi - echo "Creating namespace ${ARC_RUNNERS_NS}..." oc create namespace "${ARC_RUNNERS_NS}" --dry-run=client -o yaml | oc apply -f - @@ -98,10 +86,6 @@ if [[ -n "${ARC_RUNNER_IMAGE:-}" ]]; then echo "Using runner image from ARC_RUNNER_IMAGE" RUNNER_SET_ARGS+=(--set-string "template.spec.containers[0].image=${ARC_RUNNER_IMAGE}") fi -if [[ -n "${CONTAINER_MODE:-}" ]]; then - echo "Enabling container mode: ${CONTAINER_MODE} (Docker-in-Docker)" - RUNNER_SET_ARGS+=(--set "containerMode.type=${CONTAINER_MODE}") -fi if [[ -n "${ARC_RUNNER_EXTRA_VALUES:-}" && -f "${ARC_RUNNER_EXTRA_VALUES}" ]]; then echo "Merging extra Helm values: ${ARC_RUNNER_EXTRA_VALUES}" RUNNER_SET_ARGS+=(--values "${ARC_RUNNER_EXTRA_VALUES}") @@ -110,7 +94,6 @@ if [[ -n "${ARC_VERSION}" && "${ARC_VERSION}" != "latest" ]]; then RUNNER_SET_ARGS+=(--version "${ARC_VERSION}") fi arc_helm_append_scale_set_labels RUNNER_SET_ARGS -arc_helm_append_dind_post_renderer RUNNER_SET_ARGS "${ARC_DIR}" "${CI_SCRIPTS_DIR}" echo "Installing runner scale set '${RUNNER_SCALE_SET_NAME}'..." 
helm upgrade --install "${RUNNER_SCALE_SET_NAME}" \ @@ -146,4 +129,3 @@ echo "=== Runner scale set installation complete ===" echo " runs-on: ${RUNNER_SCALE_SET_NAME}" echo " To refresh runner image: re-run this script with ARC_RUNNER_IMAGE set (after setup-runner-image.sh)." echo "" -echo "Disable dind: CONTAINER_MODE=none and re-run this script." diff --git a/ci-scripts/arc/runner-image/Dockerfile b/ci-scripts/arc/runner-image/Dockerfile index 4a246386d5..5162f2cc9f 100644 --- a/ci-scripts/arc/runner-image/Dockerfile +++ b/ci-scripts/arc/runner-image/Dockerfile @@ -21,12 +21,15 @@ ARG NODE_VERSION=22 ARG OC_VERSION=4.20 # KubeVirt CLI version to match HCO install (e.g. v1.4.0) ARG VIRTCTL_VERSION=v1.4.0 +# Helm version (e.g. 3.19.0) +ARG HELM_VERSION=3.19.0 # Direct binary download URLs resolved from ConsoleCLIDownload by setup-runner-image.sh. # When set, these take precedence over the static mirror/GitHub URLs above and guarantee # the binaries match the live cluster. Left empty to use the static fallback URLs. ARG OC_URL="" ARG VIRTCTL_URL="" +ARG HELM_URL="" # curl and wget2 are both installed: # curl — kept for CI workflow scripts at runtime @@ -101,6 +104,18 @@ RUN if [ -n "${VIRTCTL_URL}" ]; then \ fi \ && chmod +x /usr/local/bin/virtctl +# Helm — use console download URL if resolved, else get.helm.sh. +# Console route serves a .tar.gz; fallback uses wget2 (GnuTLS) for the external HTTPS download. 
+RUN if [ -n "${HELM_URL}" ]; then \ + wget2 -qO /tmp/helm-archive "${HELM_URL}" \ + && tar -xf /tmp/helm-archive -C /usr/local/bin --strip-components=1 linux-amd64/helm \ + && rm /tmp/helm-archive; \ + else \ + wget2 -qO- "https://get.helm.sh/helm-v${HELM_VERSION}-linux-amd64.tar.gz" \ + | tar -xzf - -C /usr/local/bin --strip-components=1 linux-amd64/helm; \ + fi \ + && chmod +x /usr/local/bin/helm + USER runner # Default npm and tmp to HOME so npm ci works in restricted containers without workflow overrides diff --git a/ci-scripts/arc/setup-dind-mirror.sh b/ci-scripts/arc/setup-dind-mirror.sh deleted file mode 100755 index d83bc3bcb6..0000000000 --- a/ci-scripts/arc/setup-dind-mirror.sh +++ /dev/null @@ -1,58 +0,0 @@ -#!/bin/bash -# -# OpenShift only: mirror docker:dind into the internal registry for ARC (avoids Docker Hub -# rate limits on the chart's hardcoded docker:dind image). Writes -# ci-scripts/generated/arc-dind-replace.env for Helm post-rendering in ci-scripts/arc/install-runner-scale-set.sh. -# -# Optional environment variables: -# ARC_RUNNERS_NS (default: arc-runners) -# SKIP_DIND_MIRROR (default: 0) — set to 1 to skip mirroring and remove stale arc-dind-replace.env -# DIND_SOURCE_IMAGE (default: docker.io/library/docker:dind) — source for oc import-image -# -# Requires: oc logged into OpenShift. -# Note: import-image pulls from Docker Hub server-side; rate limits may require cluster pull secrets. - -set -euo pipefail -ARC_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -CI_SCRIPTS_DIR="$(cd "${ARC_DIR}/.." && pwd)" - -ARC_RUNNERS_NS="${ARC_RUNNERS_NS:-arc-runners}" -GENERATED_DIR="${CI_SCRIPTS_DIR}/generated" -SKIP_DIND_MIRROR="${SKIP_DIND_MIRROR:-0}" -DIND_SOURCE_IMAGE="${DIND_SOURCE_IMAGE:-docker.io/library/docker:dind}" -DIND_INTERNAL_REF="image-registry.openshift-image-registry.svc:5000/${ARC_RUNNERS_NS}/arc-docker-dind:dind" - -if ! 
oc get clusterversion version &>/dev/null; then - echo "ERROR: OpenShift cluster required (clusterversion.version not found)." - exit 1 -fi - -echo "=== Mirror docker:dind for ARC (internal registry) ===" -echo " ARC_RUNNERS_NS: ${ARC_RUNNERS_NS}" -echo " From: ${DIND_SOURCE_IMAGE}" -echo " To ISTag: arc-docker-dind:dind → ${DIND_INTERNAL_REF}" -echo "" - -oc create namespace "${ARC_RUNNERS_NS}" --dry-run=client -o yaml | oc apply -f - -mkdir -p "${GENERATED_DIR}" - -if [[ "${SKIP_DIND_MIRROR}" == "1" ]]; then - echo "SKIP_DIND_MIRROR=1 — skipping docker:dind mirror (removing stale post-render config if any)." - rm -f "${GENERATED_DIR}/arc-dind-replace.env" - echo "=== setup-dind-mirror complete (skipped) ===" - exit 0 -fi - -if oc import-image arc-docker-dind:dind --from="${DIND_SOURCE_IMAGE}" --confirm -n "${ARC_RUNNERS_NS}"; then - printf 'ARC_DIND_INTERNAL_IMAGE=%s\n' "${DIND_INTERNAL_REF}" > "${GENERATED_DIR}/arc-dind-replace.env" - echo "Wrote ${GENERATED_DIR}/arc-dind-replace.env" - echo " ci-scripts/arc/install-runner-scale-set.sh will use arc-dind-post-render.sh when this file exists." - echo "" - echo "DIND_IMAGE_REF=${DIND_INTERNAL_REF}" -else - echo "ERROR: oc import-image failed (Docker Hub rate limit or cluster cannot pull docker.io?)." - echo " Configure a cluster pull secret for docker.io, set DIND_SOURCE_IMAGE, or SKIP_DIND_MIRROR=1 if dind is mirrored elsewhere (e.g. ImageContentSourcePolicy)." - exit 1 -fi - -echo "=== setup-dind-mirror complete ===" diff --git a/ci-scripts/arc/setup-runner-image.sh b/ci-scripts/arc/setup-runner-image.sh index c2481c7261..07c7911c65 100755 --- a/ci-scripts/arc/setup-runner-image.sh +++ b/ci-scripts/arc/setup-runner-image.sh @@ -4,21 +4,22 @@ # custom ARC runner image (ci-scripts/arc/runner-image/Dockerfile). # # Output: prints IMAGE_REF= to stdout (and to ARC_RUNNER_IMAGE_FILE if set). -# Run setup-dind-mirror.sh first if you need an internal docker:dind mirror (optional). 
# # Optional environment variables: # ARC_RUNNERS_NS (default: arc-runners) # OC_VERSION OpenShift client version build-arg (default: detect or 4.20) +# HELM_VERSION Helm version build-arg (default: 3.19.0) # VIRTCTL_VERSION (default: v1.4.0) # # Requires: oc logged into OpenShift; jq optional for version detection and URL resolution. # # Binary URL resolution: -# When jq is available, this script queries ConsoleCLIDownload resources to find the -# exact binary download URLs for oc, kubectl, and virtctl that match the live cluster. -# These are passed to the Docker build as OC_URL and VIRTCTL_URL build-args. +# Uses ci-scripts/_cluster-helpers.sh to query ConsoleCLIDownload resources for +# exact binary download URLs (oc, virtctl, helm) matching the live cluster. +# These are passed to the Docker build as OC_URL, VIRTCTL_URL, and HELM_URL build-args. # If resolution fails (CRD not found, jq absent, etc.), the Dockerfile falls back to -# mirror.openshift.com / GitHub releases using OC_VERSION / VIRTCTL_VERSION. +# mirror.openshift.com / GitHub releases / get.helm.sh using OC_VERSION / VIRTCTL_VERSION / +# HELM_VERSION. set -euo pipefail ARC_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" @@ -32,62 +33,23 @@ if ! oc get clusterversion version &>/dev/null; then exit 1 fi -if [[ -z "${OC_VERSION:-}" ]]; then - OC_VERSION=$(oc version --output json 2>/dev/null | jq -r '.openshiftVersion | split(".") | .[0:2] | join(".") // empty') || true - OC_VERSION="${OC_VERSION:-4.20}" -fi -VIRTCTL_VERSION="${VIRTCTL_VERSION:-v1.4.0}" +source "${CI_SCRIPTS_DIR}/_cluster-helpers.sh" -# Resolve binary download URLs from ConsoleCLIDownload resources so the image binaries -# match the live cluster exactly. Requires jq; silently skipped if unavailable. 
-OC_URL="" -VIRTCTL_URL="" -if command -v jq &>/dev/null; then - CLI_DOWNLOAD_JSON=$(oc get consoleclidownload -o json 2>/dev/null || true) - if [[ -n "${CLI_DOWNLOAD_JSON}" ]]; then - OC_URL=$(echo "${CLI_DOWNLOAD_JSON}" \ - | jq -r '.items[].spec.links[] | select(.text | test("oc.*linux.*x86_64|oc.*linux.*amd64"; "i")) | .href' \ - | head -1) - VIRTCTL_URL=$(echo "${CLI_DOWNLOAD_JSON}" \ - | jq -r '.items[].spec.links[] | select(.text | test("virtctl.*linux.*amd64|virtctl.*linux.*x86_64"; "i")) | .href' \ - | head -1 || true) +resolve_oc_version +HELM_VERSION="${HELM_VERSION:-3.19.0}" +VIRTCTL_VERSION="${VIRTCTL_VERSION:-v1.4.0}" - # Rewrite public console download route URLs to their backing internal HTTP services so - # that build pods don't need to trust the cluster's self-signed ingress CA. - # Each route's host maps to a service that listens on plain HTTP internally; TLS is only - # terminated at the ingress router. We resolve service+namespace from the route spec so - # this works for any URL regardless of hostname naming conventions. 
- _ALL_ROUTES_JSON=$(oc get route --all-namespaces -o json 2>/dev/null || true) - _url_to_internal() { - local url="${1}" - local host path route_info ns svc svc_port - host=$(echo "${url}" | sed -E 's|https://([^/]+).*|\1|') - path=$(echo "${url}" | sed -E 's|https://[^/]+(/.*)?|\1|' || echo '/') - route_info=$(echo "${_ALL_ROUTES_JSON}" \ - | jq -r --arg h "${host}" \ - '.items[] | select(.spec.host == $h) | "\(.metadata.namespace) \(.spec.to.name)"' \ - | head -1) - if [[ -n "${route_info}" ]]; then - read -r ns svc <<< "${route_info}" - svc_port=$(oc get service "${svc}" -n "${ns}" \ - -o jsonpath='{.spec.ports[0].port}' 2>/dev/null || echo "8080") - echo "http://${svc}.${ns}.svc.cluster.local:${svc_port}${path}" - else - echo "${url}" - fi - } - [[ -n "${OC_URL}" ]] && OC_URL=$(_url_to_internal "${OC_URL}") - [[ -n "${VIRTCTL_URL}" ]] && VIRTCTL_URL=$(_url_to_internal "${VIRTCTL_URL}") - fi -fi +resolve_cli_downloads echo "=== Build ARC runner image (in-cluster, OpenShift) ===" echo " ARC_RUNNERS_NS: ${ARC_RUNNERS_NS}" echo " OC_VERSION: ${OC_VERSION}" echo " VIRTCTL_VERSION: ${VIRTCTL_VERSION}" +echo " HELM_VERSION: ${HELM_VERSION}" echo " RUNNER_IMAGE_DIR: ${RUNNER_IMAGE_DIR}" echo " OC_URL: ${OC_URL:-(fallback to mirror.openshift.com)}" echo " VIRTCTL_URL: ${VIRTCTL_URL:-(fallback to GitHub releases)}" +echo " HELM_URL: ${HELM_URL:-(fallback to get.helm.sh)}" echo "" if [[ ! 
-f "${RUNNER_IMAGE_DIR}/Dockerfile" ]]; then @@ -126,10 +88,14 @@ spec: value: "${OC_VERSION}" - name: VIRTCTL_VERSION value: "${VIRTCTL_VERSION}" + - name: HELM_VERSION + value: "${HELM_VERSION}" - name: OC_URL value: "${OC_URL}" - name: VIRTCTL_URL value: "${VIRTCTL_URL}" + - name: HELM_URL + value: "${HELM_URL}" output: to: kind: ImageStreamTag diff --git a/ci-scripts/ci-env/README.md b/ci-scripts/ci-env/README.md new file mode 100644 index 0000000000..46ceb501ed --- /dev/null +++ b/ci-scripts/ci-env/README.md @@ -0,0 +1,130 @@ +# CI Environment Controller + +A lightweight Kubernetes controller that manages CI test environments for the +kubevirt-plugin E2E tests. It runs as a Deployment in the `ci-env` namespace +and is triggered by ConfigMap creation from ARC runner workflows. + +## Why + +The ARC runner previously ran `helm install` directly, which required a massive +RBAC footprint (the runner SA had to hold every permission the console SA +needed, due to Kubernetes RBAC escalation prevention). By moving environment +lifecycle into a privileged controller, the runner RBAC drops from ~170 lines +to ~30 lines. + +## Architecture + +``` +ARC Runner ci-env namespace Test namespace ++-------------+ +------------------------+ +--------------------+ +| Workflow | | ci-env-controller | | console Deployment | +| step: |---->| (watches ConfigMaps) |--->| plugin Deployment | +| create CM | | | | Services, Routes | ++-------------+ +------------------------+ +--------------------+ + | | + | poll status=ready | patches ConfigMap + |<----------------------+ with bridge-base-address +``` + +1. The **runner** creates a ConfigMap with `desired-state: present` and minimal + input (plugin image, test namespace). +2. The **controller** discovers cluster endpoints, resolves the console image, + creates the test namespace, deploys the Helm chart, and waits for readiness. +3. 
The controller patches the ConfigMap with `status: ready` and the + `bridge-base-address` the runner needs for Cypress. +4. After tests, the runner sets `desired-state: absent` and the controller + tears down the environment. + +## ConfigMap Contract + +### Runner provides (required) + +| Field | Description | +| ---------------- | ---------------------------------------- | +| `desired-state` | `present` to create, `absent` to destroy | +| `plugin-image` | Container image for the kubevirt-plugin | +| `test-namespace` | Kubernetes namespace for the test stack | + +### Runner provides (optional overrides) + +| Field | Description | +| --------------- | -------------------------------------------------------------------- | +| `console-image` | Override console image (auto-resolved from cluster version if empty) | +| `helm-release` | Override Helm release name (defaults to ConfigMap name) | + +### Controller populates (read-only for runner) + +| Field | Description | +| --------------------- | ----------------------------------------------------------------------- | +| `status` | `pending` / `provisioning` / `ready` / `error` / `cleaning` / `cleaned` | +| `bridge-base-address` | In-cluster console URL for Cypress | +| `console-route` | External Route URL for debugging | +| `error-message` | Error details (only when `status=error`) | + +### Status Lifecycle + +``` +pending -> provisioning -> ready + -> error + +ready -> (runner sets desired-state=absent) -> cleaning -> cleaned +``` + +## Installation + +Prerequisites: `oc` logged in to OpenShift with cluster-admin, `helm` available. 
+ +```bash +# Install the controller (builds image, creates RBAC, deploys) +./ci-scripts/ci-env/install-ci-env-controller.sh + +# Or with a pre-built image: +CI_ENV_CONTROLLER_IMAGE=quay.io/myorg/ci-env-controller:latest \ + ./ci-scripts/ci-env/install-ci-env-controller.sh +``` + +The install script: + +- Creates the `ci-env` namespace +- Applies controller RBAC (ServiceAccount, ClusterRole, ClusterRoleBinding) +- Applies the `ci-console` ClusterRole +- Builds the controller image via OpenShift BuildConfig (or uses a pre-built image) +- Creates a ConfigMap from the controller script (the Helm chart is embedded in the controller image) +- Deploys the controller +- Creates a namespaced RoleBinding so the ARC runner SA can manage ConfigMaps in `ci-env` + +## Configuration + +Environment variables on the controller Deployment: + +| Variable | Default | Description | +| -------------------- | ------------------------------------------ | ---------------------------------------- | +| `CI_ENV_NS` | `ci-env` | Namespace for trigger ConfigMaps | +| `CI_ENV_TTL_SECONDS` | `7200` | Force-clean stale environments (seconds) | +| `CI_ENV_LABEL` | `ci.kubevirt-plugin/type=test-environment` | ConfigMap label selector | +| `HELM_CHART_PATH` | `/opt/ci-env/helm/ci-test-stack` | Path to Helm chart in container | + +## Files + +| File | Purpose | +| ----------------------------------- | ------------------------------------------------------ | +| `ci-env-controller.sh` | Controller watch loop, provisioning, and cleanup logic | +| `ci-env-namespace.yaml` | Namespace definition | +| `ci-env-controller-rbac.yaml` | ServiceAccount, ClusterRole, ClusterRoleBinding | +| `ci-env-controller-deployment.yaml` | Deployment manifest | +| `install-ci-env-controller.sh` | Standalone install script | +| `controller-image/Dockerfile` | UBI9-based image with oc, helm, jq, yq, curl | + +## Relationship to ARC + +The controller is fully independent of the ARC installation.
It runs in its own +namespace (`ci-env`) with its own ServiceAccount and ClusterRole. The only +connection is the ARC runner ServiceAccount: a namespaced RoleBinding lets it manage +ConfigMaps in `ci-env`, and the controller binds it to the `ci-env-test-runner` ClusterRole in each test namespace. + +## Future Work + +- **Authenticated console mode**: Return a login Secret in the ConfigMap so the + console can run with `BRIDGE_USER_AUTH` enabled for production-like testing. +- **CRD evolution**: Replace ConfigMaps with a proper `CITestEnvironment` CRD + for structured status and validation webhooks. diff --git a/ci-scripts/ci-env/ci-env-controller-deployment.yaml b/ci-scripts/ci-env/ci-env-controller-deployment.yaml new file mode 100644 index 0000000000..8cce2b9d7b --- /dev/null +++ b/ci-scripts/ci-env/ci-env-controller-deployment.yaml @@ -0,0 +1,66 @@ +# ci-env-controller Deployment. +# The controller script is mounted from a ConfigMap so it can be updated +# without rebuilding the container image. +# +# The image is a UBI9-based image with oc, helm, jq, yq, curl. +# Replace the literal *_PLACEHOLDER tokens below (image, runner SA name/namespace), e.g. with sed, before applying; they are not ${VAR} references, so envsubst will not expand them.
+--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: ci-env-controller + namespace: ci-env + labels: + app: ci-env-controller + app.kubernetes.io/component: ci-env-controller +spec: + replicas: 1 + selector: + matchLabels: + app: ci-env-controller + template: + metadata: + labels: + app: ci-env-controller + spec: + serviceAccountName: ci-env-controller + containers: + - name: controller + image: CI_ENV_CONTROLLER_IMAGE_PLACEHOLDER + command: ['/bin/bash', '/opt/ci-env/controller/ci-env-controller.sh'] + env: + - name: CI_ENV_NS + value: ci-env + - name: CI_ENV_TTL_SECONDS + value: '7200' + - name: CI_ENV_LABEL + value: ci.kubevirt-plugin/type=test-environment + - name: HELM_CHART_PATH + value: /opt/ci-env/helm/ci-test-stack + - name: RUNNER_SA_NAME + value: RUNNER_SA_NAME_PLACEHOLDER + - name: RUNNER_SA_NS + value: RUNNER_SA_NS_PLACEHOLDER + volumeMounts: + - name: controller-script + mountPath: /opt/ci-env/controller + readOnly: true + resources: + requests: + cpu: 50m + memory: 64Mi + limits: + cpu: 200m + memory: 128Mi + securityContext: + allowPrivilegeEscalation: false + runAsNonRoot: true + seccompProfile: + type: RuntimeDefault + capabilities: + drop: ['ALL'] + volumes: + - name: controller-script + configMap: + name: ci-env-controller-script + defaultMode: 0555 diff --git a/ci-scripts/ci-env/ci-env-controller-rbac.yaml b/ci-scripts/ci-env/ci-env-controller-rbac.yaml new file mode 100644 index 0000000000..7b06206325 --- /dev/null +++ b/ci-scripts/ci-env/ci-env-controller-rbac.yaml @@ -0,0 +1,216 @@ +# RBAC for the ci-env-controller Deployment. +# +# This ClusterRole must be a SUPERSET of ci-console (ci-console-clusterrole.yaml) +# because the controller creates ClusterRoleBindings to ci-console, and Kubernetes +# prevents RBAC escalation -- you cannot grant permissions you do not hold. +# +# The controller also needs lifecycle permissions (namespaces, Helm resources, +# Routes, ClusterRoleBindings) and cluster discovery reads. 
+---
+# ServiceAccount the ci-env-controller Deployment runs as.
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: ci-env-controller
+  namespace: ci-env
+  labels:
+    app.kubernetes.io/component: ci-env-controller
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: ci-env-controller
+  labels:
+    app.kubernetes.io/component: ci-env-controller
+rules:
+  # -----------------------------------------------------------------------
+  # Cluster discovery (api-server, apps-domain, monitoring URLs, console image)
+  # -----------------------------------------------------------------------
+  # Nodes are read-only: ci-env-controller.sh never creates, modifies or
+  # deletes a node, so write verbs here would only widen the blast radius.
+  # NOTE(review): this role must stay a superset of ci-console -- confirm that
+  # ci-console grants no write verbs on nodes before merging.
+  - apiGroups: ['']
+    resources: ['nodes']
+    verbs: ['get', 'list', 'watch']
+  # Namespaces need write verbs: the controller creates the test namespace
+  # during provisioning and deletes it during teardown.
+  - apiGroups: ['']
+    resources: ['namespaces']
+    verbs: ['get', 'list', 'watch', 'create', 'update', 'patch', 'delete']
+  - apiGroups: ['config.openshift.io']
+    resources: ['consoles', 'clusterversions', 'ingresses', 'infrastructures', 'dnses']
+    verbs: ['get', 'list', 'watch']
+  - apiGroups: ['storage.k8s.io']
+    resources: ['storageclasses']
+    verbs: ['get', 'list', 'watch']
+
+  # -----------------------------------------------------------------------
+  # Operator lifecycle (superset of ci-console)
+  # -----------------------------------------------------------------------
+  - apiGroups: ['operators.coreos.com']
+    resources: ['clusterserviceversions', 'subscriptions']
+    verbs: ['get', 'list', 'watch']
+  - apiGroups: ['packages.operators.coreos.com']
+    resources: ['packagemanifests']
+    verbs: ['get', 'list']
+  - apiGroups: ['operator.openshift.io']
+    resources: ['consoles']
+    verbs: ['get', 'list', 'watch']
+
+  # -----------------------------------------------------------------------
+  # Console UI resources (superset of ci-console)
+  # -----------------------------------------------------------------------
+  - apiGroups: ['console.openshift.io']
+    resources:
+      - consoleplugins
+      - consolenotifications
+      - consolequickstarts
+      - consolelinks
+      - consoleyamlsamples
+    verbs: ['get', 'list', 'watch']
+
+  # -----------------------------------------------------------------------
+  # User/project identity
(superset of ci-console) + # ----------------------------------------------------------------------- + - apiGroups: ['user.openshift.io'] + resources: ['users'] + verbs: ['get', 'list', 'watch'] + - apiGroups: ['project.openshift.io'] + resources: ['projects'] + verbs: ['get', 'list', 'watch'] + + # ----------------------------------------------------------------------- + # Core namespaced resources (Helm chart + test fixtures) + # ----------------------------------------------------------------------- + - apiGroups: [''] + resources: + - pods + - pods/log + - services + - secrets + - configmaps + - events + - serviceaccounts + - persistentvolumeclaims + verbs: ['get', 'list', 'watch', 'create', 'update', 'patch', 'delete'] + + # ----------------------------------------------------------------------- + # Helm chart resources (Deployments, ReplicaSets, Routes) + # ----------------------------------------------------------------------- + - apiGroups: ['apps'] + resources: ['deployments', 'replicasets'] + verbs: ['get', 'list', 'watch', 'create', 'update', 'patch', 'delete'] + - apiGroups: ['route.openshift.io'] + resources: ['routes'] + verbs: ['get', 'list', 'watch', 'create', 'update', 'patch', 'delete'] + - apiGroups: ['route.openshift.io'] + resources: ['routes/custom-host'] + verbs: ['create', 'update'] + + # ----------------------------------------------------------------------- + # RBAC for console SA and runner SA RoleBindings in test namespaces + # ----------------------------------------------------------------------- + - apiGroups: ['rbac.authorization.k8s.io'] + resources: ['clusterrolebindings', 'rolebindings'] + verbs: ['get', 'list', 'create', 'update', 'patch', 'delete'] + + # ----------------------------------------------------------------------- + # KubeVirt resources (superset of ci-console + cleanup) + # ----------------------------------------------------------------------- + - apiGroups: ['kubevirt.io'] + resources: + - kubevirts + - 
virtualmachines + - virtualmachineinstances + - virtualmachineinstancemigrations + verbs: ['get', 'list', 'watch', 'create', 'update', 'patch', 'delete'] + - apiGroups: ['subresources.kubevirt.io'] + resources: ['virtualmachineinstances/console', 'virtualmachineinstances/vnc'] + verbs: ['get'] + - apiGroups: ['instancetype.kubevirt.io'] + resources: + - virtualmachineclusterinstancetypes + - virtualmachineinstancetypes + - virtualmachineclusterpreferences + - virtualmachinepreferences + verbs: ['get', 'list', 'watch', 'create', 'update', 'patch', 'delete'] + - apiGroups: ['snapshot.kubevirt.io'] + resources: ['virtualmachinesnapshots'] + verbs: ['get', 'list', 'watch', 'create', 'update', 'patch', 'delete'] + + # ----------------------------------------------------------------------- + # CDI resources (superset of ci-console + cleanup) + # ----------------------------------------------------------------------- + - apiGroups: ['cdi.kubevirt.io'] + resources: ['cdis', 'cdiconfigs', 'datavolumes', 'datasources'] + verbs: ['get', 'list', 'watch', 'create', 'update', 'patch', 'delete'] + + # ----------------------------------------------------------------------- + # HCO operand reads + # ----------------------------------------------------------------------- + - apiGroups: ['ssp.kubevirt.io'] + resources: ['ssps'] + verbs: ['get', 'list'] + - apiGroups: ['networkaddonsoperator.network.kubevirt.io'] + resources: ['networkaddonsconfigs'] + verbs: ['get', 'list'] + - apiGroups: ['hostpathprovisioner.kubevirt.io'] + resources: ['hostpathprovisioners'] + verbs: ['get', 'list'] + + # ----------------------------------------------------------------------- + # Storage snapshots + # ----------------------------------------------------------------------- + - apiGroups: ['snapshot.storage.k8s.io'] + resources: ['volumesnapshots'] + verbs: ['get', 'list', 'watch', 'create', 'update', 'patch', 'delete'] + + # ----------------------------------------------------------------------- + 
# Cleanup resources (test-cleanup.sh) + # ----------------------------------------------------------------------- + - apiGroups: ['template.openshift.io'] + resources: ['templates'] + verbs: ['get', 'list', 'delete'] + - apiGroups: ['k8s.cni.cncf.io'] + resources: ['network-attachment-definitions'] + verbs: ['get', 'list', 'delete'] +--- +# ClusterRole bound (via RoleBinding) to the ARC runner SA in each test namespace. +# The ci-env-controller creates RoleBindings for this role during provisioning. +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: ci-env-test-runner + labels: + app.kubernetes.io/component: ci-env-controller +rules: + - apiGroups: [''] + resources: ['secrets'] + verbs: ['get', 'list', 'create', 'update', 'patch', 'delete'] + - apiGroups: [''] + resources: ['persistentvolumeclaims'] + verbs: ['get', 'list', 'delete'] + - apiGroups: ['kubevirt.io'] + resources: ['virtualmachines'] + verbs: ['get', 'list', 'watch', 'create', 'update', 'patch', 'delete'] + - apiGroups: ['snapshot.kubevirt.io'] + resources: ['virtualmachinesnapshots'] + verbs: ['get', 'list', 'delete'] + - apiGroups: ['cdi.kubevirt.io'] + resources: ['datavolumes', 'datasources'] + verbs: ['get', 'list', 'delete'] + - apiGroups: ['template.openshift.io'] + resources: ['templates'] + verbs: ['get', 'list', 'delete'] + - apiGroups: ['k8s.cni.cncf.io'] + resources: ['network-attachment-definitions'] + verbs: ['get', 'list', 'delete'] + - apiGroups: ['instancetype.kubevirt.io'] + resources: ['virtualmachineinstancetypes', 'virtualmachinepreferences'] + verbs: ['get', 'list', 'delete'] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: ci-env-controller + labels: + app.kubernetes.io/component: ci-env-controller +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: ci-env-controller +subjects: + - kind: ServiceAccount + name: ci-env-controller + namespace: ci-env diff --git 
a/ci-scripts/ci-env/ci-env-controller.sh b/ci-scripts/ci-env/ci-env-controller.sh new file mode 100755 index 0000000000..691b986bc4 --- /dev/null +++ b/ci-scripts/ci-env/ci-env-controller.sh @@ -0,0 +1,339 @@ +#!/usr/bin/env bash +# ci-env-controller: watches labeled ConfigMaps in the ci-env namespace and +# reconciles CI test environments (namespace, Helm chart) on demand. +# +# Designed to run as a long-lived Deployment pod. The script is mounted via +# a ConfigMap volume so it can be updated without rebuilding the image. +# +# Environment variables (set on the Deployment): +# CI_ENV_NS Namespace where trigger ConfigMaps live (default: ci-env) +# CI_ENV_TTL_SECONDS Force-clean environments older than this (default: 7200 = 2h) +# CI_ENV_LABEL Label selector for trigger ConfigMaps +# HELM_CHART_PATH Path to the ci-test-stack Helm chart inside the container + +set -uo pipefail + +CI_ENV_NS="${CI_ENV_NS:-ci-env}" +CI_ENV_TTL_SECONDS="${CI_ENV_TTL_SECONDS:-7200}" +CI_ENV_LABEL="${CI_ENV_LABEL:-ci.kubevirt-plugin/type=test-environment}" +HELM_CHART_PATH="${HELM_CHART_PATH:-/opt/ci-env/helm/ci-test-stack}" + +RUNNER_SA_NAME="${RUNNER_SA_NAME:-kubevirt-plugin-ci-gha-rs-no-permission}" +RUNNER_SA_NS="${RUNNER_SA_NS:-arc-runners}" + +CONSOLE_IMAGE_REGISTRY="${CONSOLE_IMAGE_REGISTRY:-quay.io/openshift/origin-console}" + +log() { echo "[$(date -u '+%Y-%m-%dT%H:%M:%SZ')] $*"; } + +# --------------------------------------------------------------------------- # +# Cluster discovery (cached per reconciliation cycle) +# --------------------------------------------------------------------------- # +discover_cluster() { + API_SERVER="${KUBERNETES_SERVICE_HOST:+https://${KUBERNETES_SERVICE_HOST}:${KUBERNETES_SERVICE_PORT:-443}}" + API_SERVER="${API_SERVER:-$(oc whoami --show-server 2>/dev/null || true)}" + + APPS_DOMAIN="$(oc get ingress.config.openshift.io/cluster \ + -o jsonpath='{.spec.domain}' 2>/dev/null || true)" + + THANOS_URL="$(oc -n openshift-config-managed get configmap 
monitoring-shared-config \ + -o jsonpath='{.data.thanosPublicURL}' 2>/dev/null || true)" + + ALERTMANAGER_URL="$(oc -n openshift-config-managed get configmap monitoring-shared-config \ + -o jsonpath='{.data.alertmanagerPublicURL}' 2>/dev/null || true)" + + if [[ -z "${APPS_DOMAIN}" ]]; then + log "ERROR: could not discover APPS_DOMAIN from ingress.config.openshift.io/cluster" + return 1 + fi + log "Cluster: API_SERVER=${API_SERVER} APPS_DOMAIN=${APPS_DOMAIN}" +} + +# --------------------------------------------------------------------------- # +# Console image resolution (same logic as resolve-console-image.sh) +# --------------------------------------------------------------------------- # +resolve_console_image() { + local override="${1:-}" + if [[ -n "${override}" ]]; then + echo "${override}" + return + fi + + local version + version="$(oc get clusterversion version \ + -o jsonpath='{.status.desired.version}' 2>/dev/null || true)" + if [[ -z "${version}" ]]; then + echo "${CONSOLE_IMAGE_REGISTRY}:latest" + return + fi + + local major minor + IFS='.' read -r major minor _ <<< "${version}" + echo "${CONSOLE_IMAGE_REGISTRY}:${major}.${minor}" +} + +# --------------------------------------------------------------------------- # +# Ensure kubevirt-apiserver-proxy Route exists +# --------------------------------------------------------------------------- # +ensure_proxy_route() { + local route_name="kubevirt-apiserver-proxy" + local route_ns="openshift-cnv" + local proxy_host="${route_name}.${APPS_DOMAIN}" + + if oc get route "${route_name}" -n "${route_ns}" &>/dev/null; then + log "Proxy route already exists in ${route_ns}" + else + log "Creating proxy route ${route_name} in ${route_ns}..." 
+ cat </dev/null || log "Proxy route create skipped (may already exist or namespace missing)" +apiVersion: route.openshift.io/v1 +kind: Route +metadata: + name: ${route_name} + namespace: ${route_ns} + annotations: + haproxy.router.openshift.io/hsts_header: max-age=31536000;includeSubDomains;preload +spec: + host: ${proxy_host} + to: + kind: Service + name: ${route_name}-service + weight: 100 + port: + targetPort: 8080 + tls: + termination: reencrypt + wildcardPolicy: None +EOF + fi + + PLUGIN_PROXY_ENDPOINT="https://${proxy_host}" +} + +# --------------------------------------------------------------------------- # +# Patch ConfigMap helper +# --------------------------------------------------------------------------- # +patch_cm() { + local name="$1" + shift + local patch_data="$*" + oc patch configmap "${name}" -n "${CI_ENV_NS}" --type merge -p "${patch_data}" 2>/dev/null || true +} + +# --------------------------------------------------------------------------- # +# Provision a test environment +# --------------------------------------------------------------------------- # +provision() { + local cm_name="$1" + local plugin_image="$2" + local test_ns="$3" + local console_image_override="${4:-}" + local helm_release_override="${5:-}" + + local helm_release="${helm_release_override:-${cm_name}}" + + log "Provisioning: cm=${cm_name} ns=${test_ns} release=${helm_release}" + patch_cm "${cm_name}" '{"data":{"status":"provisioning"}}' + + discover_cluster || { + patch_cm "${cm_name}" '{"data":{"status":"error","error-message":"cluster discovery failed"}}' + return 1 + } + + local console_image + console_image="$(resolve_console_image "${console_image_override}")" + local route_host="console-${cm_name}.${APPS_DOMAIN}" + + ensure_proxy_route + + log "Creating namespace ${test_ns}..." + oc create namespace "${test_ns}" --dry-run=client -o yaml | oc apply -f - || true + + log "Granting runner SA ${RUNNER_SA_NS}/${RUNNER_SA_NAME} test permissions in ${test_ns}..." 
+ oc apply -f - <&1; then + + local err="helm install failed" + log "ERROR: ${err}" + patch_cm "${cm_name}" "{\"data\":{\"status\":\"error\",\"error-message\":\"${err}\"}}" + return 1 + fi + + local bridge_base="http://${helm_release}-console.${test_ns}.svc.cluster.local:9000" + log "Waiting for console at ${bridge_base}..." + local ready=false + for i in $(seq 1 60); do + if curl -s -o /dev/null -w "%{http_code}" "${bridge_base}/" 2>/dev/null | grep -qE '200|301|302'; then + ready=true + break + fi + sleep 5 + done + + if [[ "${ready}" != "true" ]]; then + local err="console did not become ready within 5 minutes" + log "ERROR: ${err}" + patch_cm "${cm_name}" "{\"data\":{\"status\":\"error\",\"error-message\":\"${err}\"}}" + return 1 + fi + + local console_route="https://${route_host}" + log "Environment ready: bridge=${bridge_base} route=${console_route}" + patch_cm "${cm_name}" "{\"data\":{\"status\":\"ready\",\"bridge-base-address\":\"${bridge_base}\",\"console-route\":\"${console_route}\"}}" +} + +# --------------------------------------------------------------------------- # +# Tear down a test environment +# --------------------------------------------------------------------------- # +teardown() { + local cm_name="$1" + local test_ns="$2" + local helm_release="${3:-${cm_name}}" + + log "Tearing down: cm=${cm_name} ns=${test_ns} release=${helm_release}" + patch_cm "${cm_name}" '{"data":{"status":"cleaning"}}' + + helm uninstall "${helm_release}" -n "${test_ns}" --wait 2>/dev/null || true + + oc delete namespace "${test_ns}" --wait=false 2>/dev/null || true + + log "Teardown complete for ${cm_name}" + patch_cm "${cm_name}" '{"data":{"status":"cleaned"}}' +} + +# --------------------------------------------------------------------------- # +# Reconcile a single ConfigMap +# --------------------------------------------------------------------------- # +reconcile_one() { + local cm_json="$1" + + local cm_name desired status plugin_image test_ns console_image 
helm_release + cm_name="$(echo "${cm_json}" | jq -r '.metadata.name')" + desired="$(echo "${cm_json}" | jq -r '.data["desired-state"] // "unknown"')" + status="$(echo "${cm_json}" | jq -r '.data["status"] // ""')" + plugin_image="$(echo "${cm_json}" | jq -r '.data["plugin-image"] // ""')" + test_ns="$(echo "${cm_json}" | jq -r '.data["test-namespace"] // ""')" + console_image="$(echo "${cm_json}" | jq -r '.data["console-image"] // ""')" + helm_release="$(echo "${cm_json}" | jq -r '.data["helm-release"] // ""')" + + if [[ "${desired}" == "present" && "${status}" != "ready" && "${status}" != "provisioning" ]]; then + if [[ -z "${plugin_image}" || -z "${test_ns}" ]]; then + log "WARN: ConfigMap ${cm_name} missing required fields (plugin-image, test-namespace)" + patch_cm "${cm_name}" '{"data":{"status":"error","error-message":"missing required fields: plugin-image and test-namespace"}}' + return + fi + if ! provision "${cm_name}" "${plugin_image}" "${test_ns}" "${console_image}" "${helm_release}"; then + log "ERROR: provision failed for ${cm_name}, ensuring status=error" + local cur_status + cur_status="$(oc get configmap "${cm_name}" -n "${CI_ENV_NS}" -o jsonpath='{.data.status}' 2>/dev/null || echo "")" + if [[ "${cur_status}" != "error" ]]; then + patch_cm "${cm_name}" '{"data":{"status":"error","error-message":"provision failed unexpectedly"}}' + fi + fi + + elif [[ "${desired}" == "absent" && "${status}" != "cleaned" && "${status}" != "cleaning" ]]; then + if [[ -z "${test_ns}" ]]; then + log "WARN: ConfigMap ${cm_name} missing test-namespace for teardown" + patch_cm "${cm_name}" '{"data":{"status":"cleaned"}}' + return + fi + teardown "${cm_name}" "${test_ns}" "${helm_release}" || \ + log "WARN: teardown encountered errors for ${cm_name} (non-fatal)" + fi +} + +# --------------------------------------------------------------------------- # +# Stale environment reaper +# --------------------------------------------------------------------------- # +reap_stale() { 
+  # Force-clean every trigger ConfigMap that still asks for an environment
+  # (desired-state=present) but was created more than CI_ENV_TTL_SECONDS ago.
+  # ConfigMaps already in status cleaning/cleaned are skipped.
+  local now_epoch
+  now_epoch="$(date +%s)"
+
+  # Fall back to an empty list when the API call fails so the loop is a no-op.
+  local cms
+  cms="$(oc get configmap -n "${CI_ENV_NS}" -l "${CI_ENV_LABEL}" -o json 2>/dev/null || echo '{"items":[]}')"
+
+  echo "${cms}" | jq -c '.items[]' 2>/dev/null | while IFS= read -r cm; do
+    local cm_name desired status created_ts
+    cm_name="$(echo "${cm}" | jq -r '.metadata.name')"
+    desired="$(echo "${cm}" | jq -r '.data["desired-state"] // ""')"
+    status="$(echo "${cm}" | jq -r '.data["status"] // ""')"
+    created_ts="$(echo "${cm}" | jq -r '.metadata.creationTimestamp // ""')"
+
+    if [[ "${desired}" != "present" || "${status}" == "cleaning" || "${status}" == "cleaned" ]]; then
+      continue
+    fi
+
+    if [[ -n "${created_ts}" ]]; then
+      # An unparseable timestamp becomes epoch 0, i.e. it is treated as
+      # older than any TTL and the environment is reaped.
+      local created_epoch
+      created_epoch="$(date -d "${created_ts}" +%s 2>/dev/null || echo 0)"
+      local age=$(( now_epoch - created_epoch ))
+      if (( age > CI_ENV_TTL_SECONDS )); then
+        log "REAPER: ConfigMap ${cm_name} is ${age}s old (TTL=${CI_ENV_TTL_SECONDS}s), forcing cleanup"
+        local test_ns helm_release
+        test_ns="$(echo "${cm}" | jq -r '.data["test-namespace"] // ""')"
+        helm_release="$(echo "${cm}" | jq -r '.data["helm-release"] // ""')"
+        # Flip desired-state to absent BEFORE tearing down. teardown leaves
+        # status=cleaned, and reconcile_one re-provisions any ConfigMap that is
+        # desired-state=present with a status other than ready/provisioning --
+        # without this patch the reaped environment would be rebuilt on the
+        # next loop iteration and reaped again, forever.
+        patch_cm "${cm_name}" '{"data":{"desired-state":"absent"}}'
+        if [[ -z "${test_ns}" ]]; then
+          # Nothing to uninstall or delete; mirror reconcile_one and just
+          # record the terminal status.
+          log "WARN: ConfigMap ${cm_name} missing test-namespace for teardown"
+          patch_cm "${cm_name}" '{"data":{"status":"cleaned"}}'
+        else
+          teardown "${cm_name}" "${test_ns}" "${helm_release}"
+        fi
+      fi
+    fi
+  done
+}
+
+# --------------------------------------------------------------------------- #
+# Main watch loop
+# --------------------------------------------------------------------------- #
+main() {
+  log "ci-env-controller starting"
+  log " CI_ENV_NS=${CI_ENV_NS}"
+  log " CI_ENV_TTL_SECONDS=${CI_ENV_TTL_SECONDS}"
+  log " CI_ENV_LABEL=${CI_ENV_LABEL}"
+  log " HELM_CHART_PATH=${HELM_CHART_PATH}"
+
+  # The reaper runs at most once per reap_interval seconds; last_reap=0 makes
+  # it run on the first iteration.
+  local reap_interval=300
+  local last_reap=0
+
+  while true; do
+    local now
+    now="$(date +%s)"
+    if (( now - last_reap > reap_interval )); then
+      reap_stale
+      last_reap="${now}"
+    fi
+
+    # Re-list the trigger ConfigMaps on every pass and reconcile each in turn.
+    local cms
+    cms="$(oc get configmap -n "${CI_ENV_NS}" -l "${CI_ENV_LABEL}" -o json 2>/dev/null || echo '{"items":[]}')"
+
+    echo "${cms}" | jq -c '.items[]' 2>/dev/null | while IFS= read -r cm; do
+
reconcile_one "${cm}" + done + + sleep 10 + done +} + +main "$@" diff --git a/ci-scripts/ci-env/ci-env-namespace.yaml b/ci-scripts/ci-env/ci-env-namespace.yaml new file mode 100644 index 0000000000..e2547fff21 --- /dev/null +++ b/ci-scripts/ci-env/ci-env-namespace.yaml @@ -0,0 +1,7 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: ci-env + labels: + app.kubernetes.io/component: ci-env-controller + app.kubernetes.io/part-of: kubevirt-plugin-ci diff --git a/ci-scripts/ci-env/controller-image/Dockerfile b/ci-scripts/ci-env/controller-image/Dockerfile new file mode 100644 index 0000000000..6f969b1854 --- /dev/null +++ b/ci-scripts/ci-env/controller-image/Dockerfile @@ -0,0 +1,64 @@ +# CI Environment Controller image. +# Lightweight UBI9 image with only the CLI tools needed by ci-env-controller.sh: +# oc, helm, jq, yq, curl +# +# Using UBI9 avoids FIPS/OpenSSL DSO issues on hardened OpenShift clusters +# (the Ubuntu-based ARC runner image fails OpenSSL loads on FIPS nodes). +# +# The Helm chart is staged into the build context by setup-controller-image.sh +# and COPY'd into the image, avoiding the ConfigMap flat-key limitation that +# drops subdirectories (templates/). + +FROM registry.access.redhat.com/ubi9/ubi:latest + +ARG OC_VERSION=4.20 +ARG OC_URL="" +ARG HELM_VERSION=3.17.3 +ARG HELM_URL="" +ARG YQ_VERSION=v4.52.5 + +RUN dnf install -y --nodocs \ + jq \ + && dnf clean all + +# oc (OpenShift CLI) -- prefer cluster-resolved URL if provided. +# Cluster URLs may serve plain .tar with only "oc" (no kubectl), so extract +# what's available and symlink kubectl to oc. 
+# A cluster-resolved archive is downloaded to a file before extraction: GNU tar
+# only auto-detects compression when reading a regular file, not a pipe.
+# curl -f makes an HTTP error fail the build instead of feeding an error page
+# to tar. Each step is its own statement under `set -e`, so a failed download
+# can no longer be masked by the kubectl symlink fallback.
+RUN set -e; \
+    if [ -n "${OC_URL}" ]; then \
+      curl -fsSL -o /tmp/oc.tar "${OC_URL}"; \
+      tar -xf /tmp/oc.tar -C /usr/local/bin; \
+      rm -f /tmp/oc.tar; \
+    else \
+      curl -fsSL "https://mirror.openshift.com/pub/openshift-v4/clients/ocp/stable-${OC_VERSION}/openshift-client-linux.tar.gz" \
+        | tar -xzf - -C /usr/local/bin oc kubectl; \
+    fi; \
+    chmod +x /usr/local/bin/oc; \
+    [ -f /usr/local/bin/kubectl ] || ln -s oc /usr/local/bin/kubectl
+
+# helm -- prefer cluster-resolved URL if provided.
+# Cluster URLs may serve plain .tar or .tar.gz; extracting from a downloaded
+# file lets tar auto-detect the compression.
+# NOTE(review): the HELM_URL archive is assumed to use the upstream
+# linux-amd64/helm layout -- confirm against what the cluster actually serves.
+RUN set -e; \
+    if [ -n "${HELM_URL}" ]; then \
+      curl -fsSL -o /tmp/helm.tar "${HELM_URL}"; \
+      tar -xf /tmp/helm.tar -C /usr/local/bin --strip-components=1 linux-amd64/helm; \
+      rm -f /tmp/helm.tar; \
+    else \
+      curl -fsSL "https://get.helm.sh/helm-v${HELM_VERSION}-linux-amd64.tar.gz" \
+        | tar -xzf - -C /usr/local/bin --strip-components=1 linux-amd64/helm; \
+    fi; \
+    chmod +x /usr/local/bin/helm
+
+# yq (Go/mikefarah)
+# -f: without it a 404 page would be saved as the "binary" and the build
+# would still succeed.
+RUN curl -fsSL -o /usr/local/bin/yq \
+      "https://github.com/mikefarah/yq/releases/download/${YQ_VERSION}/yq_linux_amd64" \
+    && chmod +x /usr/local/bin/yq
+
+# Embed the Helm chart into the image. The chart is staged into the build
+# context by setup-controller-image.sh via a symlink dereferenced by tar -ch.
+COPY helm/ci-test-stack/ /opt/ci-env/helm/ci-test-stack/ + +RUN useradd -r -u 1001 -g 0 -d /home/controller -s /sbin/nologin controller \ + && mkdir -p /home/controller \ + && chown -R 1001:0 /home/controller /opt/ci-env + +USER 1001 +WORKDIR /home/controller + +ENTRYPOINT ["/bin/bash"] diff --git a/ci-scripts/ci-env/controller-image/helm/ci-test-stack b/ci-scripts/ci-env/controller-image/helm/ci-test-stack new file mode 120000 index 0000000000..6a4997a715 --- /dev/null +++ b/ci-scripts/ci-env/controller-image/helm/ci-test-stack @@ -0,0 +1 @@ +../../../helm/ci-test-stack \ No newline at end of file diff --git a/ci-scripts/ci-env/install-ci-env-controller.sh b/ci-scripts/ci-env/install-ci-env-controller.sh new file mode 100755 index 0000000000..f5b8e4805b --- /dev/null +++ b/ci-scripts/ci-env/install-ci-env-controller.sh @@ -0,0 +1,141 @@ +#!/bin/bash +# +# Install the CI Environment Controller on an OpenShift cluster. +# This is a standalone script -- it does NOT depend on the ARC install. +# +# What it does: +# 1. Creates the ci-env namespace +# 2. Applies RBAC (ServiceAccount, ClusterRole, ClusterRoleBinding) +# 3. Applies the ci-console ClusterRole (needed by the Helm chart) +# 4. Builds the controller image via setup-controller-image.sh (or uses a pre-built image) +# The image embeds the Helm chart for the CI test stack. +# 5. Creates a ConfigMap from the controller script (mounted for easy updates) +# 6. Deploys the controller +# 7. 
Creates a RoleBinding so the ARC runner SA can create ConfigMaps in ci-env +# +# Optional environment variables: +# CI_ENV_NS Namespace for the controller (default: ci-env) +# CI_ENV_CONTROLLER_IMAGE Pre-built image; skips BuildConfig if set +# ARC_RUNNERS_NS Namespace where ARC runner pods run (default: arc-runners) +# RUNNER_SCALE_SET_NAME ARC scale set name (default: kubevirt-plugin-ci) +# +# Prerequisites: oc login to OpenShift with cluster-admin + +set -euo pipefail + +CI_ENV_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +CI_SCRIPTS_DIR="$(cd "${CI_ENV_DIR}/.." && pwd)" + +CI_ENV_NS="${CI_ENV_NS:-ci-env}" +ARC_RUNNERS_NS="${ARC_RUNNERS_NS:-arc-runners}" +RUNNER_SCALE_SET_NAME="${RUNNER_SCALE_SET_NAME:-kubevirt-plugin-ci}" +RUNNER_SA_NAME="${RUNNER_SCALE_SET_NAME}-gha-rs-no-permission" + +echo "=== CI Environment Controller installation ===" +echo " CI_ENV_NS: ${CI_ENV_NS}" +echo " ARC_RUNNERS_NS: ${ARC_RUNNERS_NS}" +echo " RUNNER_SA_NAME: ${RUNNER_SA_NAME}" +echo "" + +if ! oc get clusterversion version &>/dev/null; then + echo "ERROR: This script targets OpenShift only." + exit 1 +fi + +# --- 1. Namespace --- +echo "Creating namespace ${CI_ENV_NS}..." +oc apply -f "${CI_ENV_DIR}/ci-env-namespace.yaml" + +# --- 2. RBAC --- +echo "Applying controller RBAC..." +oc apply -f "${CI_ENV_DIR}/ci-env-controller-rbac.yaml" + +# --- 3. ci-console ClusterRole --- +echo "Applying ci-console ClusterRole..." +oc apply -f "${CI_SCRIPTS_DIR}/arc/ci-console-clusterrole.yaml" + +# --- 4. Controller image --- +if [[ -z "${CI_ENV_CONTROLLER_IMAGE:-}" ]]; then + echo "Building controller image via setup-controller-image.sh..." 
+  IMAGE_OUTPUT="$(CI_ENV_NS="${CI_ENV_NS}" bash "${CI_ENV_DIR}/setup-controller-image.sh")"
+  # sed exits 0 on no match; 'grep | cut' aborted the script under 'set -eo pipefail' before the check below.
+  CI_ENV_CONTROLLER_IMAGE="$(sed -n 's/^IMAGE_REF=//p' <<<"${IMAGE_OUTPUT}" | tail -n 1)"
+  if [[ -z "${CI_ENV_CONTROLLER_IMAGE}" ]]; then
+    echo "ERROR: setup-controller-image.sh did not output IMAGE_REF="; exit 1
+  fi
+  echo "Built image: ${CI_ENV_CONTROLLER_IMAGE}"
+else
+  echo "Using pre-built image: ${CI_ENV_CONTROLLER_IMAGE}"
+fi
+
+# --- 5. ConfigMap for controller script (mounted for easy updates) ---
+echo "Creating ConfigMap from controller script..."
+oc create configmap ci-env-controller-script \
+  -n "${CI_ENV_NS}" \
+  --from-file=ci-env-controller.sh="${CI_ENV_DIR}/ci-env-controller.sh" \
+  --dry-run=client -o yaml | oc apply -f -
+
+# --- 6. Deploy the controller ---
+echo "Deploying ci-env-controller..."
+sed -e "s|CI_ENV_CONTROLLER_IMAGE_PLACEHOLDER|${CI_ENV_CONTROLLER_IMAGE}|g" \
+    -e "s|RUNNER_SA_NAME_PLACEHOLDER|${RUNNER_SA_NAME}|g" \
+    -e "s|RUNNER_SA_NS_PLACEHOLDER|${ARC_RUNNERS_NS}|g" \
+    "${CI_ENV_DIR}/ci-env-controller-deployment.yaml" \
+  | oc apply -f -
+
+echo "Waiting for controller to be ready..."
+oc rollout status deployment/ci-env-controller -n "${CI_ENV_NS}" --timeout=120s || echo "WARNING: ci-env-controller rollout not ready within 120s; continuing (inspect: oc get pods -n ${CI_ENV_NS})" >&2
+
+# --- 7. RoleBinding for ARC runner SA ---
+echo "Creating RoleBinding for runner SA (${RUNNER_SA_NAME}) in ${CI_ENV_NS}..."
+oc apply -f - </dev/null; then
+  echo "ERROR: OpenShift cluster required (clusterversion.version not found)."
+ exit 1 +fi + +source "${CI_SCRIPTS_DIR}/_cluster-helpers.sh" + +resolve_oc_version +HELM_VERSION="${HELM_VERSION:-3.19.0}" +YQ_VERSION="${YQ_VERSION:-v4.52.5}" + +resolve_cli_downloads + +echo "=== Build ci-env-controller image (in-cluster, OpenShift) ===" +echo " CI_ENV_NS: ${CI_ENV_NS}" +echo " OC_VERSION: ${OC_VERSION}" +echo " HELM_VERSION: ${HELM_VERSION}" +echo " YQ_VERSION: ${YQ_VERSION}" +echo " CONTROLLER_IMAGE_DIR: ${CONTROLLER_IMAGE_DIR}" +echo " OC_URL: ${OC_URL:-(fallback to mirror.openshift.com)}" +echo " HELM_URL: ${HELM_URL:-(fallback to get.helm.sh)}" +echo "" + +if [[ ! -f "${CONTROLLER_IMAGE_DIR}/Dockerfile" ]]; then + echo "ERROR: Dockerfile not found at ${CONTROLLER_IMAGE_DIR}/Dockerfile" + exit 1 +fi + +BC_NAME="ci-env-controller" + +oc create namespace "${CI_ENV_NS}" --dry-run=client -o yaml | oc apply -f - + +oc apply -f - < "${CI_ENV_CONTROLLER_IMAGE_FILE}" + echo "Wrote ${CI_ENV_CONTROLLER_IMAGE_FILE}" +fi + +echo "IMAGE_REF=${IMAGE_REF}" diff --git a/ci-scripts/examples/arc-0.14-extra-values.yaml b/ci-scripts/examples/arc-0.14-extra-values.yaml deleted file mode 100644 index 3e5ce77656..0000000000 --- a/ci-scripts/examples/arc-0.14-extra-values.yaml +++ /dev/null @@ -1,29 +0,0 @@ -# Example fragment for ARC Helm ≥ 0.14 (stable gha-runner-scale-set chart). -# Merge via ARC_RUNNER_EXTRA_VALUES=/path/to/this/file.yaml (ci-scripts/arc/install-runner-scale-set.sh) -# -# Ref: https://github.blog/changelog/2026-03-19-actions-runner-controller-release-0-14-0/ -# -# --- Multilabel (optional) --- -# Prefer ARC_SCALE_SET_LABELS=kubevirt-plugin-ci,linux,x64 in the environment when using -# ci-scripts/arc/install-runner-scale-set.sh. 
If you use this file instead, uncomment: -# -# scaleSetLabels: -# - kubevirt-plugin-ci -# - linux -# -# Workflows must then use: -# runs-on: [kubevirt-plugin-ci, linux] -# -# --- Labels / annotations on ARC-managed resources (0.14+) --- -# Useful for OpenShift scheduling, cost tags, or policy selectors on listener pods / RBAC. -# -# resourceMeta: -# autoscalingListener: -# labels: -# kubernetes.io/os: linux -# annotations: {} -# listenerServiceAccount: -# labels: {} -# annotations: {} -# -# (See upstream chart values.yaml for the full resourceMeta schema.) diff --git a/ci-scripts/helm/ci-test-stack/Chart.yaml b/ci-scripts/helm/ci-test-stack/Chart.yaml new file mode 100644 index 0000000000..597d5d1bec --- /dev/null +++ b/ci-scripts/helm/ci-test-stack/Chart.yaml @@ -0,0 +1,9 @@ +apiVersion: v2 +name: ci-test-stack +description: >- + Deploys the kubevirt-plugin and OpenShift console as Kubernetes workloads + for E2E CI testing. Designed to run in a per-test-run namespace alongside + ARC self-hosted runners. +version: 0.1.0 +appVersion: '1.0.0' +type: application diff --git a/ci-scripts/helm/ci-test-stack/templates/NOTES.txt b/ci-scripts/helm/ci-test-stack/templates/NOTES.txt new file mode 100644 index 0000000000..4b362d6a69 --- /dev/null +++ b/ci-scripts/helm/ci-test-stack/templates/NOTES.txt @@ -0,0 +1,12 @@ +CI Test Stack deployed. + +Plugin: http://{{ include "ci-test-stack.pluginName" . }}.{{ .Release.Namespace }}.svc.cluster.local:{{ .Values.plugin.port }} +Console: http://{{ include "ci-test-stack.consoleName" . }}.{{ .Release.Namespace }}.svc.cluster.local:{{ .Values.console.port }} +{{- if .Values.console.route.enabled }} + +Route: + oc get route {{ include "ci-test-stack.consoleName" . }} -n {{ .Release.Namespace }} -o jsonpath='{.spec.host}' +{{- end }} + +For Cypress, set: + BRIDGE_BASE_ADDRESS=http://{{ include "ci-test-stack.consoleName" . 
}}.{{ .Release.Namespace }}.svc.cluster.local:{{ .Values.console.port }} diff --git a/ci-scripts/helm/ci-test-stack/templates/_helpers.tpl b/ci-scripts/helm/ci-test-stack/templates/_helpers.tpl new file mode 100644 index 0000000000..479f9cedb2 --- /dev/null +++ b/ci-scripts/helm/ci-test-stack/templates/_helpers.tpl @@ -0,0 +1,30 @@ +{{/* +Common labels applied to every resource. +*/}} +{{- define "ci-test-stack.labels" -}} +app.kubernetes.io/managed-by: {{ .Release.Service }} +app.kubernetes.io/instance: {{ .Release.Name }} +helm.sh/chart: {{ .Chart.Name }}-{{ .Chart.Version | replace "+" "_" }} +ci.kubevirt-plugin/component: ci-test-stack +{{- end }} + +{{/* +Plugin resource name. +*/}} +{{- define "ci-test-stack.pluginName" -}} +{{ .Release.Name }}-plugin +{{- end }} + +{{/* +Console resource name. +*/}} +{{- define "ci-test-stack.consoleName" -}} +{{ .Release.Name }}-console +{{- end }} + +{{/* +In-cluster URL the console uses to reach the plugin Service. +*/}} +{{- define "ci-test-stack.pluginUrl" -}} +http://{{ include "ci-test-stack.pluginName" . }}.{{ .Release.Namespace }}.svc.cluster.local:{{ .Values.plugin.port }} +{{- end }} diff --git a/ci-scripts/helm/ci-test-stack/templates/console-clusterrolebinding.yaml b/ci-scripts/helm/ci-test-stack/templates/console-clusterrolebinding.yaml new file mode 100644 index 0000000000..c1b6643dbc --- /dev/null +++ b/ci-scripts/helm/ci-test-stack/templates/console-clusterrolebinding.yaml @@ -0,0 +1,18 @@ +# Binds the console SA to the ci-console ClusterRole that was pre-created at +# ARC install time. The ClusterRoleBinding is cluster-scoped so it must be +# explicitly deleted on cleanup (helm uninstall handles this). +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: {{ include "ci-test-stack.consoleName" . }} + labels: + {{- include "ci-test-stack.labels" . 
| nindent 4 }} + app.kubernetes.io/component: console +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: {{ .Values.rbac.consoleClusterRole }} +subjects: + - kind: ServiceAccount + name: {{ include "ci-test-stack.consoleName" . }} + namespace: {{ .Release.Namespace }} diff --git a/ci-scripts/helm/ci-test-stack/templates/console-configmap.yaml b/ci-scripts/helm/ci-test-stack/templates/console-configmap.yaml new file mode 100644 index 0000000000..4b07c01dbe --- /dev/null +++ b/ci-scripts/helm/ci-test-stack/templates/console-configmap.yaml @@ -0,0 +1,20 @@ +{{- if .Values.console.pluginProxy.endpoint }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "ci-test-stack.consoleName" . }}-config + labels: + {{- include "ci-test-stack.labels" . | nindent 4 }} + app.kubernetes.io/component: console +data: + pluginProxy: | + { + "services": [ + { + "consoleAPIPath": "/api/proxy/plugin/kubevirt-plugin/kubevirt-apiserver-proxy/", + "endpoint": {{ .Values.console.pluginProxy.endpoint | quote }}, + "authorize": true + } + ] + } +{{- end }} diff --git a/ci-scripts/helm/ci-test-stack/templates/console-deployment.yaml b/ci-scripts/helm/ci-test-stack/templates/console-deployment.yaml new file mode 100644 index 0000000000..f9150e2937 --- /dev/null +++ b/ci-scripts/helm/ci-test-stack/templates/console-deployment.yaml @@ -0,0 +1,81 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "ci-test-stack.consoleName" . }} + labels: + {{- include "ci-test-stack.labels" . | nindent 4 }} + app.kubernetes.io/component: console +spec: + replicas: {{ .Values.console.replicas }} + selector: + matchLabels: + app: {{ include "ci-test-stack.consoleName" . }} + template: + metadata: + labels: + app: {{ include "ci-test-stack.consoleName" . }} + {{- include "ci-test-stack.labels" . | nindent 8 }} + spec: + serviceAccountName: {{ include "ci-test-stack.consoleName" . 
}} + containers: + - name: console + image: {{ required "console.image is required" .Values.console.image | quote }} + ports: + - containerPort: {{ .Values.console.port }} + protocol: TCP + env: + - name: BRIDGE_USER_AUTH + value: "disabled" + - name: BRIDGE_K8S_MODE + value: "off-cluster" + - name: BRIDGE_K8S_AUTH + value: "bearer-token" + - name: BRIDGE_K8S_MODE_OFF_CLUSTER_SKIP_VERIFY_TLS + value: "true" + - name: BRIDGE_K8S_MODE_OFF_CLUSTER_ENDPOINT + value: {{ required "console.apiServer is required" .Values.console.apiServer | quote }} + - name: BRIDGE_K8S_AUTH_BEARER_TOKEN + valueFrom: + secretKeyRef: + name: {{ include "ci-test-stack.consoleName" . }}-token + key: token + - name: BRIDGE_USER_SETTINGS_LOCATION + value: "localstorage" + - name: BRIDGE_I18N_NAMESPACES + value: "plugin__kubevirt-plugin" + - name: BRIDGE_PLUGINS + value: "kubevirt-plugin={{ include "ci-test-stack.pluginUrl" . }}" + {{- if .Values.console.pluginProxy.endpoint }} + - name: BRIDGE_PLUGIN_PROXY + valueFrom: + configMapKeyRef: + name: {{ include "ci-test-stack.consoleName" . 
}}-config + key: pluginProxy + {{- end }} + {{- if .Values.console.monitoring.thanosUrl }} + - name: BRIDGE_K8S_MODE_OFF_CLUSTER_THANOS + value: {{ .Values.console.monitoring.thanosUrl | quote }} + {{- end }} + {{- if .Values.console.monitoring.alertmanagerUrl }} + - name: BRIDGE_K8S_MODE_OFF_CLUSTER_ALERTMANAGER + value: {{ .Values.console.monitoring.alertmanagerUrl | quote }} + {{- end }} + resources: + requests: + cpu: 100m + memory: 128Mi + limits: + cpu: 500m + memory: 256Mi + readinessProbe: + httpGet: + path: / + port: {{ .Values.console.port }} + initialDelaySeconds: 10 + periodSeconds: 10 + livenessProbe: + httpGet: + path: / + port: {{ .Values.console.port }} + initialDelaySeconds: 15 + periodSeconds: 30 diff --git a/ci-scripts/helm/ci-test-stack/templates/console-route.yaml b/ci-scripts/helm/ci-test-stack/templates/console-route.yaml new file mode 100644 index 0000000000..aa0f4e09b6 --- /dev/null +++ b/ci-scripts/helm/ci-test-stack/templates/console-route.yaml @@ -0,0 +1,23 @@ +{{- if .Values.console.route.enabled }} +apiVersion: route.openshift.io/v1 +kind: Route +metadata: + name: {{ include "ci-test-stack.consoleName" . }} + labels: + {{- include "ci-test-stack.labels" . | nindent 4 }} + app.kubernetes.io/component: console +spec: + {{- if .Values.console.route.host }} + host: {{ .Values.console.route.host | quote }} + {{- end }} + to: + kind: Service + name: {{ include "ci-test-stack.consoleName" . }} + weight: 100 + port: + targetPort: http + tls: + termination: edge + insecureEdgeTerminationPolicy: Redirect + wildcardPolicy: None +{{- end }} diff --git a/ci-scripts/helm/ci-test-stack/templates/console-sa.yaml b/ci-scripts/helm/ci-test-stack/templates/console-sa.yaml new file mode 100644 index 0000000000..91afda7f15 --- /dev/null +++ b/ci-scripts/helm/ci-test-stack/templates/console-sa.yaml @@ -0,0 +1,20 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "ci-test-stack.consoleName" . 
}} + labels: + {{- include "ci-test-stack.labels" . | nindent 4 }} + app.kubernetes.io/component: console +--- +# Legacy-style SA token Secret. The token controller auto-populates it so the +# console Deployment can reference the token without an out-of-band TokenRequest. +apiVersion: v1 +kind: Secret +metadata: + name: {{ include "ci-test-stack.consoleName" . }}-token + labels: + {{- include "ci-test-stack.labels" . | nindent 4 }} + app.kubernetes.io/component: console + annotations: + kubernetes.io/service-account.name: {{ include "ci-test-stack.consoleName" . }} +type: kubernetes.io/service-account-token diff --git a/ci-scripts/helm/ci-test-stack/templates/console-service.yaml b/ci-scripts/helm/ci-test-stack/templates/console-service.yaml new file mode 100644 index 0000000000..dc93d67e15 --- /dev/null +++ b/ci-scripts/helm/ci-test-stack/templates/console-service.yaml @@ -0,0 +1,16 @@ +apiVersion: v1 +kind: Service +metadata: + name: {{ include "ci-test-stack.consoleName" . }} + labels: + {{- include "ci-test-stack.labels" . | nindent 4 }} + app.kubernetes.io/component: console +spec: + type: ClusterIP + ports: + - port: {{ .Values.console.port }} + targetPort: {{ .Values.console.port }} + protocol: TCP + name: http + selector: + app: {{ include "ci-test-stack.consoleName" . }} diff --git a/ci-scripts/helm/ci-test-stack/templates/plugin-configmap.yaml b/ci-scripts/helm/ci-test-stack/templates/plugin-configmap.yaml new file mode 100644 index 0000000000..832ded6c6a --- /dev/null +++ b/ci-scripts/helm/ci-test-stack/templates/plugin-configmap.yaml @@ -0,0 +1,40 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "ci-test-stack.pluginName" . }}-nginx + labels: + {{- include "ci-test-stack.labels" . 
| nindent 4 }} + app.kubernetes.io/component: plugin +data: + nginx.conf: | + pid /tmp/nginx.pid; + error_log /dev/stdout info; + + events { + worker_connections 1024; + } + + http { + access_log /dev/stdout; + include /etc/nginx/mime.types; + default_type application/octet-stream; + keepalive_timeout 65; + + add_header X-Content-Type-Options nosniff; + + server { + listen {{ .Values.plugin.port }} default_server; + root /usr/share/nginx/html; + + location = /plugin-manifest.json { + add_header Cache-Control 'no-cache, no-store, must-revalidate, proxy-revalidate, max-age=0'; + add_header Pragma 'no-cache'; + add_header Expires '0'; + } + location = /plugin-entry.js { + add_header Cache-Control 'no-cache, no-store, must-revalidate, proxy-revalidate, max-age=0'; + add_header Pragma 'no-cache'; + add_header Expires '0'; + } + } + } diff --git a/ci-scripts/helm/ci-test-stack/templates/plugin-deployment.yaml b/ci-scripts/helm/ci-test-stack/templates/plugin-deployment.yaml new file mode 100644 index 0000000000..d387c3c7db --- /dev/null +++ b/ci-scripts/helm/ci-test-stack/templates/plugin-deployment.yaml @@ -0,0 +1,52 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "ci-test-stack.pluginName" . }} + labels: + {{- include "ci-test-stack.labels" . | nindent 4 }} + app.kubernetes.io/component: plugin +spec: + replicas: {{ .Values.plugin.replicas }} + selector: + matchLabels: + app: {{ include "ci-test-stack.pluginName" . }} + template: + metadata: + labels: + app: {{ include "ci-test-stack.pluginName" . }} + {{- include "ci-test-stack.labels" . 
| nindent 8 }} + spec: + containers: + - name: plugin + image: {{ required "plugin.image is required" .Values.plugin.image | quote }} + ports: + - containerPort: {{ .Values.plugin.port }} + protocol: TCP + volumeMounts: + - name: nginx-conf + mountPath: /etc/nginx/nginx.conf + subPath: nginx.conf + readOnly: true + resources: + requests: + cpu: 50m + memory: 64Mi + limits: + cpu: 200m + memory: 128Mi + readinessProbe: + httpGet: + path: /plugin-manifest.json + port: {{ .Values.plugin.port }} + initialDelaySeconds: 5 + periodSeconds: 10 + livenessProbe: + httpGet: + path: /plugin-manifest.json + port: {{ .Values.plugin.port }} + initialDelaySeconds: 10 + periodSeconds: 30 + volumes: + - name: nginx-conf + configMap: + name: {{ include "ci-test-stack.pluginName" . }}-nginx diff --git a/ci-scripts/helm/ci-test-stack/templates/plugin-service.yaml b/ci-scripts/helm/ci-test-stack/templates/plugin-service.yaml new file mode 100644 index 0000000000..203aa1ae4d --- /dev/null +++ b/ci-scripts/helm/ci-test-stack/templates/plugin-service.yaml @@ -0,0 +1,16 @@ +apiVersion: v1 +kind: Service +metadata: + name: {{ include "ci-test-stack.pluginName" . }} + labels: + {{- include "ci-test-stack.labels" . | nindent 4 }} + app.kubernetes.io/component: plugin +spec: + type: ClusterIP + ports: + - port: {{ .Values.plugin.port }} + targetPort: {{ .Values.plugin.port }} + protocol: TCP + name: http + selector: + app: {{ include "ci-test-stack.pluginName" . 
}} diff --git a/ci-scripts/helm/ci-test-stack/values.yaml b/ci-scripts/helm/ci-test-stack/values.yaml new file mode 100644 index 0000000000..616934b0c8 --- /dev/null +++ b/ci-scripts/helm/ci-test-stack/values.yaml @@ -0,0 +1,28 @@ +# Plugin (kubevirt-plugin nginx image built from the PR) +plugin: + image: '' + port: 9080 + replicas: 1 + +# Console (origin-console in off-cluster mode) +console: + image: 'quay.io/openshift/origin-console:latest' + port: 9000 + replicas: 1 + apiServer: '' + + route: + enabled: true + host: '' + + # kubevirt-apiserver-proxy endpoint (Route or internal Service URL) + pluginProxy: + endpoint: '' + + monitoring: + thanosUrl: '' + alertmanagerUrl: '' + +# ClusterRole pre-created at ARC install time (ci-console-clusterrole.yaml) +rbac: + consoleClusterRole: ci-console diff --git a/ci-scripts/test-cleanup.sh b/ci-scripts/test-cleanup.sh index 3863c0cdbb..18d55193f4 100755 --- a/ci-scripts/test-cleanup.sh +++ b/ci-scripts/test-cleanup.sh @@ -4,23 +4,23 @@ # export TEST_NS=${CYPRESS_TEST_NS:-'auto-test-ns'} -oc delete vm --all -n ${TEST_NS} --wait=false -oc delete template --all -n ${TEST_NS} --wait=false -#oc delete template -l app.kubernetes.io/name=custom-templates -n openshift --wait=false -oc delete VirtualMachineSnapshot --all -n ${TEST_NS} --ignore-not-found -oc delete datavolume --all -n ${TEST_NS} --wait=false -oc delete datasource --all -n ${TEST_NS} --wait=false -#oc delete datasource -n openshift-virtualization-os-images -l app.kubernetes.io/part-of!=hyperconverged-cluster -#oc delete datavolume -n openshift-virtualization-os-images -l app.kubernetes.io/part-of!=hyperconverged-cluster -oc delete pvc --all -n ${TEST_NS} --wait=false -#oc delete pvc -n openshift-cnv -l k8s-app!=hostpath-provisioner --wait=false -#oc delete pvc --all -n openshift --wait=false -#oc delete pvc --all -n openshift-virtualization-os-images --wait=false -oc delete secret --all -n ${TEST_NS} --ignore-not-found --wait=false -oc delete net-attach-def --all -n 
${TEST_NS} --ignore-not-found --wait=false -#oc delete VirtualMachineClusterInstancetype example --ignore-not-found --wait=false -oc delete VirtualMachineInstancetype example -n ${TEST_NS} --ignore-not-found --wait=false -#oc delete VirtualMachineClusterPreference example --ignore-not-found --wait=false -oc delete VirtualMachinePreference example -n ${TEST_NS} --ignore-not-found --wait=false -#oc delete MigrationPolicy example --ignore-not-found --wait=false -oc delete migplan --all -n ${TEST_NS} --ignore-not-found --wait=false +oc -n ${TEST_NS} delete vm --all --wait=false || true +oc -n ${TEST_NS} delete template --all --wait=false || true +oc -n ${TEST_NS} delete VirtualMachineSnapshot --all --ignore-not-found || true +oc -n ${TEST_NS} delete datavolume --all --wait=false || true +oc -n ${TEST_NS} delete datasource --all --wait=false || true +oc -n ${TEST_NS} delete pvc --all --wait=false || true +oc -n ${TEST_NS} delete secret --all --ignore-not-found --wait=false || true +oc -n ${TEST_NS} delete net-attach-def --all --ignore-not-found --wait=false || true +oc -n ${TEST_NS} delete VirtualMachineInstancetype example --ignore-not-found --wait=false || true +oc -n ${TEST_NS} delete VirtualMachinePreference example --ignore-not-found --wait=false || true + +#oc -n openshift-virtualization-os-images delete datasource -l app.kubernetes.io/part-of!=hyperconverged-cluster +#oc -n openshift-virtualization-os-images delete datavolume -l app.kubernetes.io/part-of!=hyperconverged-cluster +#oc -n openshift-virtualization-os-images delete pvc --all --wait=false +#oc -n openshift-cnv delete pvc -l k8s-app!=hostpath-provisioner --wait=false +#oc -n openshift delete template -l app.kubernetes.io/name=custom-templates --wait=false +#oc -n openshift delete pvc --all --wait=false +#oc -n openshift delete VirtualMachineClusterInstancetype example --ignore-not-found --wait=false +#oc -n openshift delete VirtualMachineClusterPreference example --ignore-not-found --wait=false +#oc -n 
openshift delete MigrationPolicy example --ignore-not-found --wait=false From f77b00cc717b6f61fd707cd89f931b8c7a91dce9 Mon Sep 17 00:00:00 2001 From: Scott J Dickerson Date: Wed, 22 Apr 2026 19:21:04 -0400 Subject: [PATCH 03/42] push some ci-test-stack changes, include rolebinding instead of in the controller sh Signed-off-by: Scott J Dickerson --- ci-scripts/ci-env/ci-env-controller.sh | 21 +--------- .../templates/console-deployment.yaml | 42 ++++++++++++------- ...igmap.yaml => plugin-configmap-nginx.yaml} | 0 .../templates/runner-rolebinding.yaml | 19 +++++++++ ci-scripts/helm/ci-test-stack/values.yaml | 7 ++++ 5 files changed, 54 insertions(+), 35 deletions(-) rename ci-scripts/helm/ci-test-stack/templates/{plugin-configmap.yaml => plugin-configmap-nginx.yaml} (100%) create mode 100644 ci-scripts/helm/ci-test-stack/templates/runner-rolebinding.yaml diff --git a/ci-scripts/ci-env/ci-env-controller.sh b/ci-scripts/ci-env/ci-env-controller.sh index 691b986bc4..2df6e91111 100755 --- a/ci-scripts/ci-env/ci-env-controller.sh +++ b/ci-scripts/ci-env/ci-env-controller.sh @@ -147,25 +147,6 @@ provision() { log "Creating namespace ${test_ns}..." oc create namespace "${test_ns}" --dry-run=client -o yaml | oc apply -f - || true - log "Granting runner SA ${RUNNER_SA_NS}/${RUNNER_SA_NAME} test permissions in ${test_ns}..." 
- oc apply -f - <&1; then local err="helm install failed" diff --git a/ci-scripts/helm/ci-test-stack/templates/console-deployment.yaml b/ci-scripts/helm/ci-test-stack/templates/console-deployment.yaml index f9150e2937..4b003559d4 100644 --- a/ci-scripts/helm/ci-test-stack/templates/console-deployment.yaml +++ b/ci-scripts/helm/ci-test-stack/templates/console-deployment.yaml @@ -24,25 +24,40 @@ spec: - containerPort: {{ .Values.console.port }} protocol: TCP env: - - name: BRIDGE_USER_AUTH - value: "disabled" + # Off-cluster mode settings - name: BRIDGE_K8S_MODE value: "off-cluster" - - name: BRIDGE_K8S_AUTH - value: "bearer-token" - - name: BRIDGE_K8S_MODE_OFF_CLUSTER_SKIP_VERIFY_TLS - value: "true" - name: BRIDGE_K8S_MODE_OFF_CLUSTER_ENDPOINT value: {{ required "console.apiServer is required" .Values.console.apiServer | quote }} + - name: BRIDGE_K8S_MODE_OFF_CLUSTER_SKIP_VERIFY_TLS + value: "true" + {{- if .Values.console.monitoring.thanosUrl }} + - name: BRIDGE_K8S_MODE_OFF_CLUSTER_THANOS + value: {{ .Values.console.monitoring.thanosUrl | quote }} + {{- end }} + {{- if .Values.console.monitoring.alertmanagerUrl }} + - name: BRIDGE_K8S_MODE_OFF_CLUSTER_ALERTMANAGER + value: {{ .Values.console.monitoring.alertmanagerUrl | quote }} + {{- end }} + + # User auth settings + - name: BRIDGE_USER_AUTH + value: "disabled" - name: BRIDGE_K8S_AUTH_BEARER_TOKEN valueFrom: secretKeyRef: name: {{ include "ci-test-stack.consoleName" . }}-token key: token + + # Branding and docs settings + - name: BRIDGE_BRANDING + value: "okd" + + # Customization settings - name: BRIDGE_USER_SETTINGS_LOCATION value: "localstorage" - - name: BRIDGE_I18N_NAMESPACES - value: "plugin__kubevirt-plugin" + + # Plugins - name: BRIDGE_PLUGINS value: "kubevirt-plugin={{ include "ci-test-stack.pluginUrl" . }}" {{- if .Values.console.pluginProxy.endpoint }} @@ -52,14 +67,9 @@ spec: name: {{ include "ci-test-stack.consoleName" . 
}}-config key: pluginProxy {{- end }} - {{- if .Values.console.monitoring.thanosUrl }} - - name: BRIDGE_K8S_MODE_OFF_CLUSTER_THANOS - value: {{ .Values.console.monitoring.thanosUrl | quote }} - {{- end }} - {{- if .Values.console.monitoring.alertmanagerUrl }} - - name: BRIDGE_K8S_MODE_OFF_CLUSTER_ALERTMANAGER - value: {{ .Values.console.monitoring.alertmanagerUrl | quote }} - {{- end }} + - name: BRIDGE_I18N_NAMESPACES + value: "plugin__kubevirt-plugin" + resources: requests: cpu: 100m diff --git a/ci-scripts/helm/ci-test-stack/templates/plugin-configmap.yaml b/ci-scripts/helm/ci-test-stack/templates/plugin-configmap-nginx.yaml similarity index 100% rename from ci-scripts/helm/ci-test-stack/templates/plugin-configmap.yaml rename to ci-scripts/helm/ci-test-stack/templates/plugin-configmap-nginx.yaml diff --git a/ci-scripts/helm/ci-test-stack/templates/runner-rolebinding.yaml b/ci-scripts/helm/ci-test-stack/templates/runner-rolebinding.yaml new file mode 100644 index 0000000000..61425894e3 --- /dev/null +++ b/ci-scripts/helm/ci-test-stack/templates/runner-rolebinding.yaml @@ -0,0 +1,19 @@ +# Grants the ARC runner SA namespace-scoped permissions so Cypress tests can +# manage resources inside the test namespace. The subjects come from values so +# the same chart works for any runner SA / namespace combination. +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: ci-env-test-runner + namespace: {{ .Release.Namespace }} + labels: + {{- include "ci-test-stack.labels" . 
| nindent 4 }} + app.kubernetes.io/component: ci-env-controller +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: {{ .Values.rbac.testRunnerClusterRole }} +subjects: + - kind: ServiceAccount + name: {{ .Values.runner.saName }} + namespace: {{ .Values.runner.saNamespace }} diff --git a/ci-scripts/helm/ci-test-stack/values.yaml b/ci-scripts/helm/ci-test-stack/values.yaml index 616934b0c8..de9bfce238 100644 --- a/ci-scripts/helm/ci-test-stack/values.yaml +++ b/ci-scripts/helm/ci-test-stack/values.yaml @@ -26,3 +26,10 @@ console: # ClusterRole pre-created at ARC install time (ci-console-clusterrole.yaml) rbac: consoleClusterRole: ci-console + # ClusterRole that grants test permissions inside the test namespace + testRunnerClusterRole: ci-env-test-runner + +# ARC runner ServiceAccount that needs namespace-scoped test permissions +runner: + saName: kubevirt-plugin-ci-gha-rs-no-permission + saNamespace: arc-runners From 57ff3f5c10a9b9ebaf6a2f51f13960608dbb3c7c Mon Sep 17 00:00:00 2001 From: Scott J Dickerson Date: Wed, 22 Apr 2026 20:48:37 -0400 Subject: [PATCH 04/42] rearrange the custom images Signed-off-by: Scott J Dickerson --- ci-scripts/_cluster-helpers.sh | 16 +++ ci-scripts/arc/runner-image/Dockerfile | 126 ------------------ ci-scripts/arc/setup-runner-image.sh | 120 ----------------- ci-scripts/images/arc-runner/Dockerfile | 11 +- .../ci-env-runner}/Dockerfile | 0 .../ci-env-runner}/helm/ci-test-stack | 0 ci-scripts/images/setup-arc-runner-image.sh | 15 +-- .../setup-ci-env-runner-image.sh} | 79 +++++------ 8 files changed, 65 insertions(+), 302 deletions(-) delete mode 100644 ci-scripts/arc/runner-image/Dockerfile delete mode 100755 ci-scripts/arc/setup-runner-image.sh rename ci-scripts/{ci-env/controller-image => images/ci-env-runner}/Dockerfile (100%) rename ci-scripts/{ci-env/controller-image => images/ci-env-runner}/helm/ci-test-stack (100%) rename ci-scripts/{ci-env/setup-controller-image.sh => images/setup-ci-env-runner-image.sh} 
(52%)
diff --git a/ci-scripts/_cluster-helpers.sh b/ci-scripts/_cluster-helpers.sh
index 208a82d65b..c65a26dbda 100644
--- a/ci-scripts/_cluster-helpers.sh
+++ b/ci-scripts/_cluster-helpers.sh
@@ -8,6 +8,7 @@
 # After sourcing, call:
 #   resolve_cli_downloads   # populates OC_URL, VIRTCTL_URL, HELM_URL
 #   resolve_oc_version      # populates OC_VERSION (if not already set)
+#   resolve_internal_registry  # populates INTERNAL_REGISTRY
 #
 # The functions use ConsoleCLIDownload resources and rewrite public route URLs
 # to cluster-internal service URLs so build pods don't need to trust the
@@ -16,6 +17,13 @@
 # Requires: oc logged into OpenShift.
 # Optional: jq (URL resolution is silently skipped without it).
 
+verify_oc() {
+  if ! oc get clusterversion version &>/dev/null; then
+    echo "ERROR: OpenShift cluster required (clusterversion.version not found)."
+    exit 1
+  fi
+}
+
 # Rewrite a public https route URL to its backing cluster-internal HTTP service.
 # Arg: $1 = URL  Reads: _ALL_ROUTES_JSON (set by resolve_cli_downloads)
 _route_url_to_internal() {
@@ -49,6 +57,14 @@ resolve_oc_version() {
   OC_VERSION="${OC_VERSION:-4.20}"
 }
 
+# Resolve the internal image registry hostname from the cluster and store it in INTERNAL_REGISTRY.
+# Falls back to the well-known service address when oc fails OR the status field is empty (jsonpath prints nothing, exit 0).
+resolve_internal_registry() {
+  INTERNAL_REGISTRY="$(oc get image.config.openshift.io/cluster \
+    -o jsonpath='{.status.internalRegistryHostname}' 2>/dev/null || true)"
+  INTERNAL_REGISTRY="${INTERNAL_REGISTRY:-image-registry.openshift-image-registry.svc:5000}"
+}
+
 # Resolve binary download URLs from ConsoleCLIDownload resources.
 # Sets: OC_URL, VIRTCTL_URL, HELM_URL (empty string if not resolved).
 # Callers can choose which variables they need; unused ones remain empty.
diff --git a/ci-scripts/arc/runner-image/Dockerfile b/ci-scripts/arc/runner-image/Dockerfile deleted file mode 100644 index 5162f2cc9f..0000000000 --- a/ci-scripts/arc/runner-image/Dockerfile +++ /dev/null @@ -1,126 +0,0 @@ -# Custom GitHub Actions runner image for kubevirt-plugin-ci CI. -# Extends the official runner image with cli tools used in the CI pipeline (such as jq, curl, -# envsubst, Node.js, oc, and virtctl) and support for running Cypress tests. - -# -# https://github.com/actions/runner/blob/main/images/Dockerfile: the base image -# The base image includes the docker CLI; with ARC containerMode.type=dind, jobs use the -# docker:dind sidecar (DOCKER_HOST=unix:///var/run/docker.sock) for docker/container actions. -# -ARG RUNNER_BASE=ghcr.io/actions/actions-runner:latest -FROM ${RUNNER_BASE} - -USER root - -# Go-based yq (mikefarah/yq) — matches the version on GitHub-hosted ubuntu-latest runners. -# NOTE: apt's `yq` package is the Python-based yq (kislyuk), which has incompatible syntax. -ARG YQ_VERSION=v4.52.5 -# Versions (override with build-args to match cluster) -ARG NODE_VERSION=22 -# OpenShift client version to match cluster (e.g. 4.20) -ARG OC_VERSION=4.20 -# KubeVirt CLI version to match HCO install (e.g. v1.4.0) -ARG VIRTCTL_VERSION=v1.4.0 -# Helm version (e.g. 3.19.0) -ARG HELM_VERSION=3.19.0 - -# Direct binary download URLs resolved from ConsoleCLIDownload by setup-runner-image.sh. -# When set, these take precedence over the static mirror/GitHub URLs above and guarantee -# the binaries match the live cluster. Left empty to use the static fallback URLs. -ARG OC_URL="" -ARG VIRTCTL_URL="" -ARG HELM_URL="" - -# curl and wget2 are both installed: -# curl — kept for CI workflow scripts at runtime -# wget2 — used for all build-time HTTPS downloads -# On Ubuntu, apt and wget2 use GnuTLS; curl and wget (1.x) use OpenSSL. On OpenShift build pods -# where the FIPS provider .so can't load (DSO error), all OpenSSL-based HTTPS fails. 
wget2/GnuTLS -# is unaffected, so every build-time binary download goes through wget2 instead. -RUN apt-get update \ - && apt-get install -y --no-install-recommends ca-certificates jq curl wget2 gettext-base \ - && apt-get clean \ - && rm -rf /var/lib/apt/lists/* - -# Cypress 14+ headless (bundled Electron) — Linux deps for Ubuntu 24.04+ (noble). -# Official runner image is noble (see actions/runner images/Dockerfile: mcr.microsoft.com/dotnet/runtime-deps:8.0-noble). -# Matches Cypress docs: https://docs.cypress.io/guides/getting-started/installing-cypress#Linux — Ubuntu >=24.04 list. -# GitHub-hosted ubuntu-latest (runner-images Ubuntu2404) also includes xvfb, Chrome/Firefox, and extra fonts; for ARC we -# install the minimum Electron stack plus common fonts so screenshots/video match hosted runners more closely. -# unzip is already present in the upstream actions-runner image (needed for fast Cypress binary unpack). -RUN apt-get update \ - && apt-get install -y --no-install-recommends \ - fontconfig \ - fonts-liberation \ - libasound2t64 \ - libgbm-dev \ - libgtk-3-0t64 \ - libnss3 \ - libnotify-dev \ - libxss1 \ - libxtst6 \ - xauth \ - xvfb \ - && apt-get clean \ - && rm -rf /var/lib/apt/lists/* - -# yq (Go/mikefarah) — wget2/GnuTLS avoids the OpenSSL FIPS DSO loading failure on hardened clusters -RUN wget2 -O /usr/local/bin/yq \ - "https://github.com/mikefarah/yq/releases/download/${YQ_VERSION}/yq_linux_amd64" \ - && chmod +x /usr/local/bin/yq - -# Node.js — the nodesource setup script internally calls curl (OpenSSL), which fails on FIPS clusters. -# Fetch the version list from nodejs.org with wget2 (GnuTLS), pick the latest v${NODE_VERSION}.x, -# then download and extract the pre-built binary tarball directly. 
-RUN NVER=$(wget2 -qO- "https://nodejs.org/dist/index.json" \ - | jq -r --argjson maj "${NODE_VERSION}" \ - '[.[] | select(.version | test("^v" + ($maj|tostring) + "\\."))] | .[0].version') \ - && wget2 -qO /tmp/node.tar.gz \ - "https://nodejs.org/dist/${NVER}/node-${NVER}-linux-x64.tar.gz" \ - && tar -xzf /tmp/node.tar.gz -C /usr/local --strip-components=1 \ - && rm /tmp/node.tar.gz - -# OpenShift client (oc) — use console download URL if resolved, else mirror.openshift.com stable-4.x. -# Console route serves a plain .tar; download to a temp file so GNU tar auto-detects the format. -# Fallback uses wget2 (GnuTLS) for the external HTTPS download. -RUN if [ -n "${OC_URL}" ]; then \ - wget2 -qO /tmp/oc-archive "${OC_URL}" \ - && tar -xf /tmp/oc-archive -C /usr/local/bin oc \ - && rm /tmp/oc-archive; \ - else \ - wget2 -qO- "https://mirror.openshift.com/pub/openshift-v4/clients/ocp/stable-${OC_VERSION}/openshift-client-linux.tar.gz" \ - | tar -xzf - -C /usr/local/bin; \ - fi - -# virtctl (KubeVirt CLI) — use console download URL if resolved, else GitHub releases. -# Console route serves a .tar.gz; fallback uses wget2 (GnuTLS) for the external HTTPS download. -RUN if [ -n "${VIRTCTL_URL}" ]; then \ - wget2 -qO /tmp/virtctl-archive "${VIRTCTL_URL}" \ - && tar -xf /tmp/virtctl-archive -C /usr/local/bin virtctl \ - && rm /tmp/virtctl-archive; \ - else \ - wget2 -qO /usr/local/bin/virtctl \ - "https://github.com/kubevirt/kubevirt/releases/download/${VIRTCTL_VERSION}/virtctl-${VIRTCTL_VERSION}-linux-amd64"; \ - fi \ - && chmod +x /usr/local/bin/virtctl - -# Helm — use console download URL if resolved, else get.helm.sh. -# Console route serves a .tar.gz; fallback uses wget2 (GnuTLS) for the external HTTPS download. 
-RUN if [ -n "${HELM_URL}" ]; then \ - wget2 -qO /tmp/helm-archive "${HELM_URL}" \ - && tar -xf /tmp/helm-archive -C /usr/local/bin --strip-components=1 linux-amd64/helm \ - && rm /tmp/helm-archive; \ - else \ - wget2 -qO- "https://get.helm.sh/helm-v${HELM_VERSION}-linux-amd64.tar.gz" \ - | tar -xzf - -C /usr/local/bin --strip-components=1 linux-amd64/helm; \ - fi \ - && chmod +x /usr/local/bin/helm - -USER runner - -# Default npm and tmp to HOME so npm ci works in restricted containers without workflow overrides -ENV KUBEVIRT_UI_PLUGIN_RUNNER=true \ - HOME=/home/runner \ - TMPDIR=/home/runner/.tmp - -# Keep the same entrypoint/CMD as the base image so ARC works unchanged. diff --git a/ci-scripts/arc/setup-runner-image.sh b/ci-scripts/arc/setup-runner-image.sh deleted file mode 100755 index 07c7911c65..0000000000 --- a/ci-scripts/arc/setup-runner-image.sh +++ /dev/null @@ -1,120 +0,0 @@ -#!/bin/bash -# -# OpenShift only: create ImageStream + BuildConfig and run a binary Docker build for the -# custom ARC runner image (ci-scripts/arc/runner-image/Dockerfile). -# -# Output: prints IMAGE_REF= to stdout (and to ARC_RUNNER_IMAGE_FILE if set). -# -# Optional environment variables: -# ARC_RUNNERS_NS (default: arc-runners) -# OC_VERSION OpenShift client version build-arg (default: detect or 4.20) -# HELM_VERSION Helm version build-arg (default: 3.19.0) -# VIRTCTL_VERSION (default: v1.4.0) -# -# Requires: oc logged into OpenShift; jq optional for version detection and URL resolution. -# -# Binary URL resolution: -# Uses ci-scripts/_cluster-helpers.sh to query ConsoleCLIDownload resources for -# exact binary download URLs (oc, virtctl, helm) matching the live cluster. -# These are passed to the Docker build as OC_URL, VIRTCTL_URL, and HELM_URL build-args. -# If resolution fails (CRD not found, jq absent, etc.), the Dockerfile falls back to -# mirror.openshift.com / GitHub releases / get.helm.sh using OC_VERSION / VIRTCTL_VERSION / -# HELM_VERSION. 
- -set -euo pipefail -ARC_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -CI_SCRIPTS_DIR="$(cd "${ARC_DIR}/.." && pwd)" - -ARC_RUNNERS_NS="${ARC_RUNNERS_NS:-arc-runners}" -RUNNER_IMAGE_DIR="${ARC_DIR}/runner-image" - -if ! oc get clusterversion version &>/dev/null; then - echo "ERROR: OpenShift cluster required (clusterversion.version not found)." - exit 1 -fi - -source "${CI_SCRIPTS_DIR}/_cluster-helpers.sh" - -resolve_oc_version -HELM_VERSION="${HELM_VERSION:-3.19.0}" -VIRTCTL_VERSION="${VIRTCTL_VERSION:-v1.4.0}" - -resolve_cli_downloads - -echo "=== Build ARC runner image (in-cluster, OpenShift) ===" -echo " ARC_RUNNERS_NS: ${ARC_RUNNERS_NS}" -echo " OC_VERSION: ${OC_VERSION}" -echo " VIRTCTL_VERSION: ${VIRTCTL_VERSION}" -echo " HELM_VERSION: ${HELM_VERSION}" -echo " RUNNER_IMAGE_DIR: ${RUNNER_IMAGE_DIR}" -echo " OC_URL: ${OC_URL:-(fallback to mirror.openshift.com)}" -echo " VIRTCTL_URL: ${VIRTCTL_URL:-(fallback to GitHub releases)}" -echo " HELM_URL: ${HELM_URL:-(fallback to get.helm.sh)}" -echo "" - -if [[ ! -f "${RUNNER_IMAGE_DIR}/Dockerfile" ]]; then - echo "ERROR: Dockerfile not found at ${RUNNER_IMAGE_DIR}/Dockerfile" - exit 1 -fi - -oc create namespace "${ARC_RUNNERS_NS}" --dry-run=client -o yaml | oc apply -f - - -oc apply -f - < "${ARC_RUNNER_IMAGE_FILE}" - echo "Wrote ${ARC_RUNNER_IMAGE_FILE}" -fi - -echo "IMAGE_REF=${IMAGE_REF}" diff --git a/ci-scripts/images/arc-runner/Dockerfile b/ci-scripts/images/arc-runner/Dockerfile index 99521a6d50..5162f2cc9f 100644 --- a/ci-scripts/images/arc-runner/Dockerfile +++ b/ci-scripts/images/arc-runner/Dockerfile @@ -1,13 +1,13 @@ # Custom GitHub Actions runner image for kubevirt-plugin-ci CI. # Extends the official runner image with cli tools used in the CI pipeline (such as jq, curl, -# envsubst, Node.js, oc, and virtctl) and system dependencies for Playwright (Chromium). +# envsubst, Node.js, oc, and virtctl) and support for running Cypress tests. 
# # https://github.com/actions/runner/blob/main/images/Dockerfile: the base image # The base image includes the docker CLI; with ARC containerMode.type=dind, jobs use the # docker:dind sidecar (DOCKER_HOST=unix:///var/run/docker.sock) for docker/container actions. # -ARG RUNNER_BASE=ghcr.io/actions/actions-runner:2.335.1 +ARG RUNNER_BASE=ghcr.io/actions/actions-runner:latest FROM ${RUNNER_BASE} USER root @@ -42,7 +42,12 @@ RUN apt-get update \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* -# Playwright / Chromium headless — Linux deps for Ubuntu 24.04+ (noble). +# Cypress 14+ headless (bundled Electron) — Linux deps for Ubuntu 24.04+ (noble). +# Official runner image is noble (see actions/runner images/Dockerfile: mcr.microsoft.com/dotnet/runtime-deps:8.0-noble). +# Matches Cypress docs: https://docs.cypress.io/guides/getting-started/installing-cypress#Linux — Ubuntu >=24.04 list. +# GitHub-hosted ubuntu-latest (runner-images Ubuntu2404) also includes xvfb, Chrome/Firefox, and extra fonts; for ARC we +# install the minimum Electron stack plus common fonts so screenshots/video match hosted runners more closely. +# unzip is already present in the upstream actions-runner image (needed for fast Cypress binary unpack). 
RUN apt-get update \ && apt-get install -y --no-install-recommends \ fontconfig \ diff --git a/ci-scripts/ci-env/controller-image/Dockerfile b/ci-scripts/images/ci-env-runner/Dockerfile similarity index 100% rename from ci-scripts/ci-env/controller-image/Dockerfile rename to ci-scripts/images/ci-env-runner/Dockerfile diff --git a/ci-scripts/ci-env/controller-image/helm/ci-test-stack b/ci-scripts/images/ci-env-runner/helm/ci-test-stack similarity index 100% rename from ci-scripts/ci-env/controller-image/helm/ci-test-stack rename to ci-scripts/images/ci-env-runner/helm/ci-test-stack diff --git a/ci-scripts/images/setup-arc-runner-image.sh b/ci-scripts/images/setup-arc-runner-image.sh index 61efad8aa8..4294d0a695 100755 --- a/ci-scripts/images/setup-arc-runner-image.sh +++ b/ci-scripts/images/setup-arc-runner-image.sh @@ -1,12 +1,12 @@ #!/bin/bash # # OpenShift only: create ImageStream + BuildConfig and run a binary Docker build for the -# custom ARC runner image (ci-scripts/images/arc-runner/Dockerfile). +# custom ARC runner image (ci-scripts/images/arc-runner/Dockerfile). # # Output: prints IMAGE_REF= to stdout (and to ARC_RUNNER_IMAGE_FILE if set). # # Optional environment variables: -# NS Namespace for the runner (default: arc-runners) +# NS Namespace for the runner (default: ci-env-images) # OC_VERSION OpenShift client version build-arg (default: detect or 4.20) # HELM_VERSION Helm version build-arg (default: 3.19.0) # VIRTCTL_VERSION (default: v1.4.0) @@ -16,11 +16,7 @@ # # Binary URL resolution: # Uses ci-scripts/_cluster-helpers.sh to resolve cluster resources -# -# Namespace note: If the namespace does not match the ci-env-runner deployment namespace, -# the running service account will need to add role "system:image-puller" so the built image -# can be pulled. -# + set -euo pipefail SCRIPT_DIR="$(dirname "${BASH_SOURCE[0]}")" REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." 
&& pwd)" source "${REPO_ROOT}/ci-scripts/_cluster-helpers.sh" verify_oc -NS="${NS:-arc-runners}" +NS="${NS:-ci-env-images}" IMAGE_DIR="${SCRIPT_DIR}/arc-runner" IMAGE_NAME="arc-runner" @@ -116,9 +112,10 @@ IMAGE_REF="${INTERNAL_REGISTRY}/${NS}/${IMAGE_NAME}:latest" echo "" echo "=== Build complete ===" echo "Image: ${IMAGE_REF}" -echo "IMAGE_REF=${IMAGE_REF}" +# TODO: Better handling of passing the fqdn image name to the caller if [[ -n "${ARC_RUNNER_IMAGE_FILE:-}" ]]; then printf '%s\n' "${IMAGE_REF}" > "${ARC_RUNNER_IMAGE_FILE}" echo "Wrote ${ARC_RUNNER_IMAGE_FILE}" fi +echo "IMAGE_REF=${IMAGE_REF}" diff --git a/ci-scripts/ci-env/setup-controller-image.sh b/ci-scripts/images/setup-ci-env-runner-image.sh similarity index 52% rename from ci-scripts/ci-env/setup-controller-image.sh rename to ci-scripts/images/setup-ci-env-runner-image.sh index 4d767993a9..490233f23e 100755 --- a/ci-scripts/ci-env/setup-controller-image.sh +++ b/ci-scripts/images/setup-ci-env-runner-image.sh @@ -6,7 +6,7 @@ # Output: prints IMAGE_REF= to stdout (and to CI_ENV_CONTROLLER_IMAGE_FILE if set). # # Optional environment variables: -# CI_ENV_NS Namespace for the controller (default: ci-env) +# NS Namespace for the controller (default: ci-env-images) # OC_VERSION OpenShift client version build-arg (default: detect or 4.20) # HELM_VERSION Helm version build-arg (default: 3.19.0) # YQ_VERSION yq version build-arg (default: v4.52.5) @@ -14,25 +14,18 @@ # Requires: oc logged into OpenShift; jq optional for version detection and URL resolution. # # Binary URL resolution: -# Uses ci-scripts/_cluster-helpers.sh to query ConsoleCLIDownload resources for -# exact binary download URLs (oc, helm) matching the live cluster. -# These are passed to the Docker build as OC_URL and HELM_URL build-args. -# If resolution fails (CRD not found, jq absent, etc.), the Dockerfile falls back to -# mirror.openshift.com / get.helm.sh using OC_VERSION / HELM_VERSION. 
+# Uses ci-scripts/_cluster-helpers.sh to resolve cluster resources set -euo pipefail -CI_ENV_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -CI_SCRIPTS_DIR="$(cd "${CI_ENV_DIR}/.." && pwd)" +SCRIPT_DIR="$(dirname "${BASH_SOURCE[0]}")" +REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)" -CI_ENV_NS="${CI_ENV_NS:-ci-env}" -CONTROLLER_IMAGE_DIR="${CI_ENV_DIR}/controller-image" +source "${REPO_ROOT}/ci-scripts/_cluster-helpers.sh" +verify_oc -if ! oc get clusterversion version &>/dev/null; then - echo "ERROR: OpenShift cluster required (clusterversion.version not found)." - exit 1 -fi - -source "${CI_SCRIPTS_DIR}/_cluster-helpers.sh" +NS="${NS:-ci-env-images}" +IMAGE_DIR="${SCRIPT_DIR}/ci-env-runner" +IMAGE_NAME="ci-env-runner" resolve_oc_version HELM_VERSION="${HELM_VERSION:-3.19.0}" @@ -40,31 +33,30 @@ YQ_VERSION="${YQ_VERSION:-v4.52.5}" resolve_cli_downloads -echo "=== Build ci-env-controller image (in-cluster, OpenShift) ===" -echo " CI_ENV_NS: ${CI_ENV_NS}" +echo "=== Build ci-env-runner image (in-cluster, OpenShift) ===" +echo " NS: ${NS}" +echo " IMAGE_DIR: ${IMAGE_DIR}" +echo " IMAGE_NAME: ${IMAGE_NAME}" echo " OC_VERSION: ${OC_VERSION}" echo " HELM_VERSION: ${HELM_VERSION}" echo " YQ_VERSION: ${YQ_VERSION}" -echo " CONTROLLER_IMAGE_DIR: ${CONTROLLER_IMAGE_DIR}" echo " OC_URL: ${OC_URL:-(fallback to mirror.openshift.com)}" echo " HELM_URL: ${HELM_URL:-(fallback to get.helm.sh)}" echo "" -if [[ ! -f "${CONTROLLER_IMAGE_DIR}/Dockerfile" ]]; then - echo "ERROR: Dockerfile not found at ${CONTROLLER_IMAGE_DIR}/Dockerfile" +if [[ ! 
-f "${IMAGE_DIR}/Dockerfile" ]]; then + echo "ERROR: Dockerfile not found at ${IMAGE_DIR}/Dockerfile" exit 1 fi -BC_NAME="ci-env-controller" - -oc create namespace "${CI_ENV_NS}" --dry-run=client -o yaml | oc apply -f - +oc create namespace "${NS}" --dry-run=client -o yaml | oc apply -f - oc apply -f - < "${CI_ENV_CONTROLLER_IMAGE_FILE}" - echo "Wrote ${CI_ENV_CONTROLLER_IMAGE_FILE}" -fi +# TODO: Better handling of passing the fqdn image name to the caller +if [[ -n "${CI_ENV_RUNNER_IMAGE_FILE:-}" ]]; then + printf '%s\n' "${IMAGE_REF}" > "${CI_ENV_RUNNER_IMAGE_FILE}" + echo "Wrote ${CI_ENV_RUNNER_IMAGE_FILE}" +fi echo "IMAGE_REF=${IMAGE_REF}" From d8767e7f4d985fe9ad0aed12066626eb356eaadf Mon Sep 17 00:00:00 2001 From: Scott J Dickerson Date: Thu, 23 Apr 2026 12:44:52 -0400 Subject: [PATCH 05/42] simplify arc scripts Signed-off-by: Scott J Dickerson --- ci-scripts/arc/arc-helm-helpers.sh | 26 ---------------------- ci-scripts/arc/install-arc-controller.sh | 9 ++------ ci-scripts/arc/install-runner-scale-set.sh | 21 +++++------------ 3 files changed, 7 insertions(+), 49 deletions(-) diff --git a/ci-scripts/arc/arc-helm-helpers.sh b/ci-scripts/arc/arc-helm-helpers.sh index a986c3b199..45d757f6a0 100755 --- a/ci-scripts/arc/arc-helm-helpers.sh +++ b/ci-scripts/arc/arc-helm-helpers.sh @@ -38,29 +38,3 @@ arc_github_config_secret_helm_auth() { trap '[[ -n "${AUTH_VALUES_FILE:-}" && -f "${AUTH_VALUES_FILE}" ]] && rm -f "${AUTH_VALUES_FILE}"' EXIT return 0 } - -# -# Multilabel (ARC 0.14+): optional comma-separated ARC_SCALE_SET_LABELS — workflows must use -# runs-on: [label1, label2, ...] matching every label (see HOT_CLUSTER_CI.md). 
-# -arc_helm_append_scale_set_labels() { - local -n _helm_arr="${1:?helm args array name required}" - [[ -z "${ARC_SCALE_SET_LABELS:-}" ]] && return 0 - local json="[" - local first=1 - local lab - IFS=',' read -ra _arc_ssl <<< "${ARC_SCALE_SET_LABELS}" - for lab in "${_arc_ssl[@]}"; do - lab="${lab//[[:space:]]/}" - [[ -z "$lab" ]] && continue - [[ $first -eq 0 ]] && json+="," - first=0 - json+="\"${lab//\"/\\\"}\"" - done - json+="]" - if [[ $first -eq 1 ]]; then - return 0 - fi - echo "Scale set labels (multilabel): ${ARC_SCALE_SET_LABELS}" - _helm_arr+=(--set-json "scaleSetLabels=${json}") -} diff --git a/ci-scripts/arc/install-arc-controller.sh b/ci-scripts/arc/install-arc-controller.sh index 1b51507089..3dcbed19f1 100755 --- a/ci-scripts/arc/install-arc-controller.sh +++ b/ci-scripts/arc/install-arc-controller.sh @@ -13,6 +13,8 @@ set -euo pipefail ARC_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" CI_SCRIPTS_DIR="$(cd "${ARC_DIR}/.." && pwd)" +source "${CI_SCRIPTS_DIR}/_cluster-helpers.sh" +verify_oc ARC_CONTROLLER_NS="${ARC_CONTROLLER_NS:-arc-systems}" ARC_CONTROLLER_INSTALL_NAME="${ARC_CONTROLLER_INSTALL_NAME:-arc}" @@ -26,12 +28,6 @@ echo " ARC_HELM_REPO: ${ARC_HELM_REPO}" echo " ARC_VERSION: ${ARC_VERSION}" echo "" -if ! oc get clusterversion version &>/dev/null; then - echo "ERROR: This script targets OpenShift only." - echo " Expected cluster-scoped ClusterVersion 'version'; use 'oc login' to an OpenShift cluster." - exit 1 -fi - echo "Creating namespace ${ARC_CONTROLLER_NS}..." 
oc create namespace "${ARC_CONTROLLER_NS}" --dry-run=client -o yaml | oc apply -f - @@ -57,5 +53,4 @@ helm upgrade --install "${ARC_CONTROLLER_INSTALL_NAME}" \ echo "" echo "=== ARC controller installation complete ===" -echo " Next: ./ci-scripts/arc/install-runner-scale-set.sh (requires ARC_CONFIG_URL + GitHub auth)" echo "" diff --git a/ci-scripts/arc/install-runner-scale-set.sh b/ci-scripts/arc/install-runner-scale-set.sh index e659f83255..db17d2798c 100755 --- a/ci-scripts/arc/install-runner-scale-set.sh +++ b/ci-scripts/arc/install-runner-scale-set.sh @@ -18,8 +18,6 @@ # ARC_CONTROLLER_INSTALL_NAME (default: arc) # ARC_RUNNERS_NS (default: arc-runners) # ARC_VERSION Helm chart version (default: 0.14.0); set to "latest" to omit --version -# ARC_SCALE_SET_LABELS Optional comma-separated multilabel (ARC 0.14+) -# ARC_RUNNER_EXTRA_VALUES Optional second Helm values file (merged after pod.yaml) # ARC_RUNNER_IMAGE If set, use this image for the runner container # SKIP_ARC_RUNNER_RBAC Set to 1 to skip applying ci-scripts/arc/arc-runner-rbac.yaml # @@ -28,6 +26,9 @@ set -euo pipefail ARC_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" CI_SCRIPTS_DIR="$(cd "${ARC_DIR}/.." 
&& pwd)" +source "${CI_SCRIPTS_DIR}/_cluster-helpers.sh" +verify_oc + source "${ARC_DIR}/arc-helm-helpers.sh" RUNNER_POD_VALUES="${ARC_DIR}/arc-runner-scale-set.pod.yaml" @@ -45,6 +46,7 @@ ARC_VERSION="${ARC_VERSION:-0.14.0}" RUNNER_SCALE_SET_NAME="${RUNNER_SCALE_SET_NAME:-kubevirt-plugin-ci}" MIN_RUNNERS="${MIN_RUNNERS:-0}" MAX_RUNNERS="${MAX_RUNNERS:-5}" +CONTROLLER_SA_NAME="${ARC_CONTROLLER_INSTALL_NAME}-gha-rs-controller" echo "=== ARC runner scale set installation (OpenShift) ===" echo " ARC_CONFIG_URL: ${ARC_CONFIG_URL}" @@ -56,13 +58,9 @@ echo " ARC_VERSION: ${ARC_VERSION}" echo " RUNNER_SCALE_SET_NAME: ${RUNNER_SCALE_SET_NAME}" echo " MIN_RUNNERS / MAX_RUNNERS: ${MIN_RUNNERS} / ${MAX_RUNNERS}" echo " Runner pod values: ${RUNNER_POD_VALUES}" +echo " Controller SA name: ${CONTROLLER_SA_NAME}" echo "" -if ! oc get clusterversion version &>/dev/null; then - echo "ERROR: This script targets OpenShift only." - exit 1 -fi - echo "Creating namespace ${ARC_RUNNERS_NS}..." oc create namespace "${ARC_RUNNERS_NS}" --dry-run=client -o yaml | oc apply -f - @@ -71,8 +69,6 @@ if ! arc_github_config_secret_helm_auth AUTH_ARGS; then exit 1 fi -CONTROLLER_SA_NAME="${ARC_CONTROLLER_INSTALL_NAME}-gha-rs-controller" - RUNNER_SET_ARGS=( --set "githubConfigUrl=${ARC_CONFIG_URL}" --set "minRunners=${MIN_RUNNERS}" @@ -86,14 +82,9 @@ if [[ -n "${ARC_RUNNER_IMAGE:-}" ]]; then echo "Using runner image from ARC_RUNNER_IMAGE" RUNNER_SET_ARGS+=(--set-string "template.spec.containers[0].image=${ARC_RUNNER_IMAGE}") fi -if [[ -n "${ARC_RUNNER_EXTRA_VALUES:-}" && -f "${ARC_RUNNER_EXTRA_VALUES}" ]]; then - echo "Merging extra Helm values: ${ARC_RUNNER_EXTRA_VALUES}" - RUNNER_SET_ARGS+=(--values "${ARC_RUNNER_EXTRA_VALUES}") -fi if [[ -n "${ARC_VERSION}" && "${ARC_VERSION}" != "latest" ]]; then RUNNER_SET_ARGS+=(--version "${ARC_VERSION}") fi -arc_helm_append_scale_set_labels RUNNER_SET_ARGS echo "Installing runner scale set '${RUNNER_SCALE_SET_NAME}'..." 
helm upgrade --install "${RUNNER_SCALE_SET_NAME}" \ @@ -102,8 +93,6 @@ helm upgrade --install "${RUNNER_SCALE_SET_NAME}" \ "${ARC_HELM_REPO}/gha-runner-scale-set" \ --wait -[[ -n "${AUTH_VALUES_FILE:-}" ]] && rm -f "${AUTH_VALUES_FILE}" - RUNNER_SA="${RUNNER_SCALE_SET_NAME}-gha-rs-no-permission" echo "Binding SCC github-arc to runner ServiceAccount ${RUNNER_SA}..." oc policy add-role-to-user system:openshift:scc:github-arc -z "${RUNNER_SA}" -n "${ARC_RUNNERS_NS}" From 6b95ab80d51320d6e5d58531be99ec6f7a6e74c7 Mon Sep 17 00:00:00 2001 From: Scott J Dickerson Date: Wed, 22 Apr 2026 17:16:46 -0400 Subject: [PATCH 06/42] test2: patch kubevirt-user-settings Signed-off-by: Scott J Dickerson --- .github/workflows/poc-e2e-ci-test2.yml | 35 ++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/.github/workflows/poc-e2e-ci-test2.yml b/.github/workflows/poc-e2e-ci-test2.yml index 70b79b55d0..f338e89dc8 100644 --- a/.github/workflows/poc-e2e-ci-test2.yml +++ b/.github/workflows/poc-e2e-ci-test2.yml @@ -227,6 +227,7 @@ jobs: plugin-image: ${{ env.PLUGIN_IMAGE }} test-namespace: ${{ env.CYPRESS_TEST_NS }} configmap-name: ${{ env.CI_ENV_CM }} + ci-env-namespace: ${{ env.CI_ENV_NS }} - name: Create test secret run: | @@ -235,6 +236,39 @@ jobs: .metadata.namespace = strenv(CYPRESS_TEST_NS) ' cypress/fixtures/secret.yaml | oc apply -f - + - name: Seed kubevirt-user-settings ConfigMap + run: | + # The ci-env-controller sets helm_release=cm_name when no override is + # provided, so CI_ENV_CM is always the Helm release name. + SA_NAME="${CI_ENV_CM}-console" + + # The plugin hook (useKubevirtUserSettings) keys settings by: + # user?.metadata?.uid OR sanitized(user?.metadata?.name) + # In no-auth mode the SA bearer token is used for all k8s calls, so + # users/~ returns a virtual user for system:serviceaccount:<namespace>:<sa-name>. + # We patch both the SA UID and the sanitized name so the hook always + # finds the right key regardless of whether OpenShift populates the UID. 
+ SA_UID="$(oc get sa "${SA_NAME}" -n "${CYPRESS_TEST_NS}" \ + -o jsonpath='{.metadata.uid}' 2>/dev/null || true)" + SANITIZED_NAME="system-serviceaccount-${CYPRESS_TEST_NS}-${SA_NAME}" + + USER_SETTINGS='{"quickStart":{"dontShowWelcomeModal":true},"onboardingPopoversHidden":{"vmsTab":true,"catalog":true,"createProject":true}}' + + # Create the ConfigMap if HCO has not done so yet. + if ! oc get configmap kubevirt-user-settings -n "${CYPRESS_CNV_NS}" &>/dev/null; then + oc create configmap kubevirt-user-settings -n "${CYPRESS_CNV_NS}" + fi + + # Build a merge-patch that sets both possible user keys. + PATCH=$(jq -cn \ + --arg sa_uid "${SA_UID}" \ + --arg name_key "${SANITIZED_NAME}" \ + --arg val "${USER_SETTINGS}" \ + '{data: (if ($sa_uid | length) > 0 then {($sa_uid): $val, ($name_key): $val} else {($name_key): $val} end)}') + + oc patch configmap kubevirt-user-settings -n "${CYPRESS_CNV_NS}" \ + --type merge -p "${PATCH}" + # TODO: Add dependency caching (either use the setup-node action with caching, or add explicit caching) - name: Install dependencies run: | @@ -314,3 +348,4 @@ jobs: uses: ./.github/actions/ci-env-release with: configmap-name: ${{ env.CI_ENV_CM }} + ci-env-namespace: ${{ env.CI_ENV_NS }} From 85b115f6d6e62d4a1621bbb52893d00ea74b4e1d Mon Sep 17 00:00:00 2001 From: Scott J Dickerson Date: Thu, 23 Apr 2026 13:18:26 -0400 Subject: [PATCH 07/42] migrate ci-env to a helm chart Signed-off-by: Scott J Dickerson --- ci-scripts/arc/install-arc-controller.sh | 3 - ci-scripts/ci-env/README.md | 79 ++++++++++----- .../ci-env/install-ci-env-controller.sh | 96 ++++--------------- ci-scripts/helm/ci-env-controller/Chart.yaml | 9 ++ .../scripts}/ci-env-controller.sh | 0 .../ci-env-controller/templates/_helpers.tpl | 9 ++ .../templates/clusterrole-console.yaml} | 8 +- .../templates/clusterrole-controller.yaml} | 68 +------------ .../templates/clusterrole-test-runner.yaml | 33 +++++++ .../templates/clusterrolebinding.yaml | 14 +++ 
.../templates/configmap-script.yaml | 10 ++ .../templates/deployment.yaml} | 31 +++--- .../templates/namespace.yaml} | 4 +- .../templates/role-trigger.yaml | 14 +++ .../templates/rolebinding-runner.yaml | 15 +++ .../templates/serviceaccount.yaml | 7 ++ ci-scripts/helm/ci-env-controller/values.yaml | 24 +++++ ci-scripts/helm/ci-test-stack/values.yaml | 2 +- 18 files changed, 231 insertions(+), 195 deletions(-) create mode 100644 ci-scripts/helm/ci-env-controller/Chart.yaml rename ci-scripts/{ci-env => helm/ci-env-controller/scripts}/ci-env-controller.sh (100%) create mode 100644 ci-scripts/helm/ci-env-controller/templates/_helpers.tpl rename ci-scripts/{arc/ci-console-clusterrole.yaml => helm/ci-env-controller/templates/clusterrole-console.yaml} (93%) rename ci-scripts/{ci-env/ci-env-controller-rbac.yaml => helm/ci-env-controller/templates/clusterrole-controller.yaml} (73%) create mode 100644 ci-scripts/helm/ci-env-controller/templates/clusterrole-test-runner.yaml create mode 100644 ci-scripts/helm/ci-env-controller/templates/clusterrolebinding.yaml create mode 100644 ci-scripts/helm/ci-env-controller/templates/configmap-script.yaml rename ci-scripts/{ci-env/ci-env-controller-deployment.yaml => helm/ci-env-controller/templates/deployment.yaml} (61%) rename ci-scripts/{ci-env/ci-env-namespace.yaml => helm/ci-env-controller/templates/namespace.yaml} (52%) create mode 100644 ci-scripts/helm/ci-env-controller/templates/role-trigger.yaml create mode 100644 ci-scripts/helm/ci-env-controller/templates/rolebinding-runner.yaml create mode 100644 ci-scripts/helm/ci-env-controller/templates/serviceaccount.yaml create mode 100644 ci-scripts/helm/ci-env-controller/values.yaml diff --git a/ci-scripts/arc/install-arc-controller.sh b/ci-scripts/arc/install-arc-controller.sh index 3dcbed19f1..97cf4c2a93 100755 --- a/ci-scripts/arc/install-arc-controller.sh +++ b/ci-scripts/arc/install-arc-controller.sh @@ -34,9 +34,6 @@ oc create namespace "${ARC_CONTROLLER_NS}" --dry-run=client -o 
yaml | oc apply - echo "Applying ARC SCC and ClusterRole (github-arc)..." oc apply -f "${ARC_DIR}/arc-openshift-scc.yaml" -echo "Applying CI console ClusterRole (ci-console)..." -oc apply -f "${ARC_DIR}/ci-console-clusterrole.yaml" - CONTROLLER_ARGS=(--namespace "${ARC_CONTROLLER_NS}") if [[ -n "${ARC_VERSION}" && "${ARC_VERSION}" != "latest" ]]; then CONTROLLER_ARGS+=(--version "${ARC_VERSION}") diff --git a/ci-scripts/ci-env/README.md b/ci-scripts/ci-env/README.md index 46ceb501ed..8a5ea8a6cd 100644 --- a/ci-scripts/ci-env/README.md +++ b/ci-scripts/ci-env/README.md @@ -75,7 +75,7 @@ ready -> (runner sets desired-state=absent) -> cleaning -> cleaned Prerequisites: `oc` logged in to OpenShift with cluster-admin, `helm` available. ```bash -# Install the controller (builds image, creates RBAC, deploys) +# Install the controller (builds image, installs chart) ./ci-scripts/ci-env/install-ci-env-controller.sh # Or with a pre-built image: @@ -83,44 +83,71 @@ CI_ENV_CONTROLLER_IMAGE=quay.io/myorg/ci-env-controller:latest \ ./ci-scripts/ci-env/install-ci-env-controller.sh ``` -The install script: +The install script resolves the controller image (building it if needed) then +runs a single `helm upgrade --install` against the `ci-env-controller` chart. -- Creates the `ci-env` namespace -- Applies controller RBAC (ServiceAccount, ClusterRole, ClusterRoleBinding) -- Applies the `ci-console` ClusterRole -- Builds the controller image via OpenShift BuildConfig (or uses a pre-built image) -- Creates ConfigMaps from the controller script, Helm chart, and cleanup script -- Deploys the controller -- Creates a namespaced RoleBinding so the ARC runner SA can manage ConfigMaps in `ci-env` +To update the controller script without rebuilding the image, edit +`ci-scripts/helm/ci-env-controller/scripts/ci-env-controller.sh` and re-run +the install script (or `helm upgrade --install` directly). The chart checksum +annotation on the Deployment triggers an automatic rolling restart. 
+ +## Chart Layout + +All Kubernetes manifests live in the Helm chart under +`ci-scripts/helm/ci-env-controller/`: + +``` +ci-env-controller/ +├── Chart.yaml +├── values.yaml +├── scripts/ +│ └── ci-env-controller.sh # Controller watch loop (inlined into ConfigMap by Helm) +└── templates/ + ├── _helpers.tpl + ├── namespace.yaml + ├── serviceaccount.yaml + ├── clusterrole-controller.yaml + ├── clusterrole-test-runner.yaml + ├── clusterrole-console.yaml # ci-console ClusterRole (used by ci-test-stack) + ├── clusterrolebinding.yaml + ├── role-trigger.yaml # Allows runner SA to manage ConfigMaps in ci-env + ├── rolebinding-runner.yaml + ├── configmap-script.yaml # Inlines ci-env-controller.sh via .Files.Get + └── deployment.yaml +``` ## Configuration -Environment variables on the controller Deployment: +`values.yaml` defaults — override via `--set` or a values file: -| Variable | Default | Description | -| -------------------- | ------------------------------------------ | ---------------------------------------- | -| `CI_ENV_NS` | `ci-env` | Namespace for trigger ConfigMaps | -| `CI_ENV_TTL_SECONDS` | `7200` | Force-clean stale environments (seconds) | -| `CI_ENV_LABEL` | `ci.kubevirt-plugin/type=test-environment` | ConfigMap label selector | -| `HELM_CHART_PATH` | `/opt/ci-env/helm/ci-test-stack` | Path to Helm chart in container | +| Value | Default | Description | +| ---------------------- | ------------------------------------------ | ---------------------------------------- | +| `image` | _(required)_ | Controller pod image | +| `namespace` | `ci-env` | Namespace for controller and ConfigMaps | +| `ttlSeconds` | `7200` | Force-clean stale environments (seconds) | +| `label` | `ci.kubevirt-plugin/type=test-environment` | ConfigMap label selector | +| `helmChartPath` | `/opt/ci-env/helm/ci-test-stack` | Path to embedded Helm chart in image | +| `consoleImageRegistry` | `quay.io/openshift/origin-console` | Base registry for auto-resolved console | +| `runner.saName` 
| `kubevirt-plugin-ci-gha-rs-no-permission` | ARC runner ServiceAccount name | +| `runner.saNamespace` | `arc-runners` | ARC runner ServiceAccount namespace | ## Files -| File | Purpose | -| ----------------------------------- | ------------------------------------------------------ | -| `ci-env-controller.sh` | Controller watch loop, provisioning, and cleanup logic | -| `ci-env-namespace.yaml` | Namespace definition | -| `ci-env-controller-rbac.yaml` | ServiceAccount, ClusterRole, ClusterRoleBinding | -| `ci-env-controller-deployment.yaml` | Deployment manifest | -| `install-ci-env-controller.sh` | Standalone install script | -| `controller-image/Dockerfile` | UBI9-based image with oc, helm, jq, yq, curl | +| File | Purpose | +| ------------------------------ | ----------------------------------------------- | +| `install-ci-env-controller.sh` | Thin install wrapper: resolves image, runs helm | +| `README.md` | This file | ## Relationship to ARC The controller is fully independent of the ARC installation. It runs in its own namespace (`ci-env`) with its own ServiceAccount and ClusterRole. The only -connection is a namespaced RoleBinding that allows the ARC runner SA to create -ConfigMaps in `ci-env`. +connection is a namespaced RoleBinding (in the chart) that allows the ARC +runner SA to create ConfigMaps in `ci-env`. + +The `ci-console` ClusterRole (required by the ci-test-stack chart) is also +owned by this chart, so the ci-env-controller must be installed before any +test runs. ## Future Work diff --git a/ci-scripts/ci-env/install-ci-env-controller.sh b/ci-scripts/ci-env/install-ci-env-controller.sh index f5b8e4805b..f798e4262e 100755 --- a/ci-scripts/ci-env/install-ci-env-controller.sh +++ b/ci-scripts/ci-env/install-ci-env-controller.sh @@ -4,27 +4,22 @@ # This is a standalone script -- it does NOT depend on the ARC install. # # What it does: -# 1. Creates the ci-env namespace -# 2. Applies RBAC (ServiceAccount, ClusterRole, ClusterRoleBinding) -# 3. 
Applies the ci-console ClusterRole (needed by the Helm chart) -# 4. Builds the controller image via setup-controller-image.sh (or uses a pre-built image) -# The image embeds the Helm chart for the CI test stack. -# 5. Creates a ConfigMap from the controller script (mounted for easy updates) -# 6. Deploys the controller -# 7. Creates a RoleBinding so the ARC runner SA can create ConfigMaps in ci-env +# 1. Optionally builds the controller image via setup-ci-env-runner-image.sh +# 2. Runs helm upgrade --install with the ci-env-controller chart # # Optional environment variables: # CI_ENV_NS Namespace for the controller (default: ci-env) -# CI_ENV_CONTROLLER_IMAGE Pre-built image; skips BuildConfig if set +# CI_ENV_CONTROLLER_IMAGE Pre-built image; skips image build if set # ARC_RUNNERS_NS Namespace where ARC runner pods run (default: arc-runners) # RUNNER_SCALE_SET_NAME ARC scale set name (default: kubevirt-plugin-ci) # -# Prerequisites: oc login to OpenShift with cluster-admin +# Prerequisites: oc login to OpenShift with cluster-admin, helm available set -euo pipefail CI_ENV_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" CI_SCRIPTS_DIR="$(cd "${CI_ENV_DIR}/.." && pwd)" +CHART_DIR="${CI_SCRIPTS_DIR}/helm/ci-env-controller" CI_ENV_NS="${CI_ENV_NS:-ci-env}" ARC_RUNNERS_NS="${ARC_RUNNERS_NS:-arc-runners}" @@ -42,25 +37,14 @@ if ! oc get clusterversion version &>/dev/null; then exit 1 fi -# --- 1. Namespace --- -echo "Creating namespace ${CI_ENV_NS}..." -oc apply -f "${CI_ENV_DIR}/ci-env-namespace.yaml" - -# --- 2. RBAC --- -echo "Applying controller RBAC..." -oc apply -f "${CI_ENV_DIR}/ci-env-controller-rbac.yaml" - -# --- 3. ci-console ClusterRole --- -echo "Applying ci-console ClusterRole..." -oc apply -f "${CI_SCRIPTS_DIR}/arc/ci-console-clusterrole.yaml" - -# --- 4. Controller image --- +# --- Resolve controller image --- if [[ -z "${CI_ENV_CONTROLLER_IMAGE:-}" ]]; then - echo "Building controller image via setup-controller-image.sh..." 
- IMAGE_OUTPUT="$(CI_ENV_NS="${CI_ENV_NS}" bash "${CI_ENV_DIR}/setup-controller-image.sh")" + echo "Building controller image via setup-ci-env-runner-image.sh..." + IMAGE_OUTPUT="$(CI_ENV_NS="${CI_ENV_NS}" bash "${CI_SCRIPTS_DIR}/images/setup-ci-env-runner-image.sh" 2>&1)" CI_ENV_CONTROLLER_IMAGE="$(echo "${IMAGE_OUTPUT}" | grep '^IMAGE_REF=' | cut -d= -f2-)" if [[ -z "${CI_ENV_CONTROLLER_IMAGE}" ]]; then - echo "ERROR: setup-controller-image.sh did not output IMAGE_REF=" + echo "ERROR: setup-ci-env-runner-image.sh did not output IMAGE_REF=" + echo "${IMAGE_OUTPUT}" exit 1 fi echo "Built image: ${CI_ENV_CONTROLLER_IMAGE}" @@ -68,55 +52,17 @@ else echo "Using pre-built image: ${CI_ENV_CONTROLLER_IMAGE}" fi -# --- 5. ConfigMap for controller script (mounted for easy updates) --- -echo "Creating ConfigMap from controller script..." -oc create configmap ci-env-controller-script \ - -n "${CI_ENV_NS}" \ - --from-file=ci-env-controller.sh="${CI_ENV_DIR}/ci-env-controller.sh" \ - --dry-run=client -o yaml | oc apply -f - - -# --- 6. Deploy the controller --- -echo "Deploying ci-env-controller..." -sed -e "s|CI_ENV_CONTROLLER_IMAGE_PLACEHOLDER|${CI_ENV_CONTROLLER_IMAGE}|g" \ - -e "s|RUNNER_SA_NAME_PLACEHOLDER|${RUNNER_SA_NAME}|g" \ - -e "s|RUNNER_SA_NS_PLACEHOLDER|${ARC_RUNNERS_NS}|g" \ - "${CI_ENV_DIR}/ci-env-controller-deployment.yaml" \ - | oc apply -f - - -echo "Waiting for controller to be ready..." -oc rollout status deployment/ci-env-controller -n "${CI_ENV_NS}" --timeout=120s || true - -# --- 7. RoleBinding for ARC runner SA --- -echo "Creating RoleBinding for runner SA (${RUNNER_SA_NAME}) in ${CI_ENV_NS}..." -oc apply -f - <- + Long-lived controller that watches labeled ConfigMaps in the ci-env namespace + and reconciles CI test environments (namespace, Helm chart) on demand. + Replaces the manual YAML + sed-substitution install workflow. 
+version: 0.1.0 +appVersion: '1.0.0' +type: application diff --git a/ci-scripts/ci-env/ci-env-controller.sh b/ci-scripts/helm/ci-env-controller/scripts/ci-env-controller.sh similarity index 100% rename from ci-scripts/ci-env/ci-env-controller.sh rename to ci-scripts/helm/ci-env-controller/scripts/ci-env-controller.sh diff --git a/ci-scripts/helm/ci-env-controller/templates/_helpers.tpl b/ci-scripts/helm/ci-env-controller/templates/_helpers.tpl new file mode 100644 index 0000000000..fef9b4a2a7 --- /dev/null +++ b/ci-scripts/helm/ci-env-controller/templates/_helpers.tpl @@ -0,0 +1,9 @@ +{{/* +Common labels applied to every resource. +*/}} +{{- define "ci-env-controller.labels" -}} +app.kubernetes.io/managed-by: {{ .Release.Service }} +app.kubernetes.io/instance: {{ .Release.Name }} +helm.sh/chart: {{ .Chart.Name }}-{{ .Chart.Version | replace "+" "_" }} +app.kubernetes.io/component: ci-env-controller +{{- end }} diff --git a/ci-scripts/arc/ci-console-clusterrole.yaml b/ci-scripts/helm/ci-env-controller/templates/clusterrole-console.yaml similarity index 93% rename from ci-scripts/arc/ci-console-clusterrole.yaml rename to ci-scripts/helm/ci-env-controller/templates/clusterrole-console.yaml index 903a84826f..131243b5fe 100644 --- a/ci-scripts/arc/ci-console-clusterrole.yaml +++ b/ci-scripts/helm/ci-env-controller/templates/clusterrole-console.yaml @@ -1,20 +1,18 @@ # ClusterRole for the CI test stack console ServiceAccount. -# Applied once at ARC install time by install-arc-controller.sh. # Bound per CI run by the ci-test-stack Helm chart's ClusterRoleBinding. # # Permissions cover what the OpenShift console needs to proxy browser requests # (including WebSocket watch) for KubeVirt/CDI management, plus cluster-info # reads for the console UI chrome and overview pages. # -# IMPORTANT: ci-env-controller-rbac.yaml (ci-env-controller ClusterRole) must be -# a SUPERSET of this file. Any additions here must also be added there. 
---- +# IMPORTANT: clusterrole-controller.yaml must be a SUPERSET of this file. +# Any additions here must also be added there. apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: name: ci-console labels: - app.kubernetes.io/component: ci-console-rbac + {{- include "ci-env-controller.labels" . | nindent 4 }} rules: # Cluster info (console chrome, overview pages, cluster version banner) - apiGroups: [''] diff --git a/ci-scripts/ci-env/ci-env-controller-rbac.yaml b/ci-scripts/helm/ci-env-controller/templates/clusterrole-controller.yaml similarity index 73% rename from ci-scripts/ci-env/ci-env-controller-rbac.yaml rename to ci-scripts/helm/ci-env-controller/templates/clusterrole-controller.yaml index 7b06206325..d56822426e 100644 --- a/ci-scripts/ci-env/ci-env-controller-rbac.yaml +++ b/ci-scripts/helm/ci-env-controller/templates/clusterrole-controller.yaml @@ -1,26 +1,9 @@ -# RBAC for the ci-env-controller Deployment. -# -# This ClusterRole must be a SUPERSET of ci-console (ci-console-clusterrole.yaml) -# because the controller creates ClusterRoleBindings to ci-console, and Kubernetes -# prevents RBAC escalation -- you cannot grant permissions you do not hold. -# -# The controller also needs lifecycle permissions (namespaces, Helm resources, -# Routes, ClusterRoleBindings) and cluster discovery reads. ---- -apiVersion: v1 -kind: ServiceAccount -metadata: - name: ci-env-controller - namespace: ci-env - labels: - app.kubernetes.io/component: ci-env-controller ---- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: name: ci-env-controller labels: - app.kubernetes.io/component: ci-env-controller + {{- include "ci-env-controller.labels" . 
| nindent 4 }} rules: # ----------------------------------------------------------------------- # Cluster discovery (api-server, apps-domain, monitoring URLs, console image) @@ -165,52 +148,3 @@ rules: - apiGroups: ['k8s.cni.cncf.io'] resources: ['network-attachment-definitions'] verbs: ['get', 'list', 'delete'] ---- -# ClusterRole bound (via RoleBinding) to the ARC runner SA in each test namespace. -# The ci-env-controller creates RoleBindings for this role during provisioning. -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: ci-env-test-runner - labels: - app.kubernetes.io/component: ci-env-controller -rules: - - apiGroups: [''] - resources: ['secrets'] - verbs: ['get', 'list', 'create', 'update', 'patch', 'delete'] - - apiGroups: [''] - resources: ['persistentvolumeclaims'] - verbs: ['get', 'list', 'delete'] - - apiGroups: ['kubevirt.io'] - resources: ['virtualmachines'] - verbs: ['get', 'list', 'watch', 'create', 'update', 'patch', 'delete'] - - apiGroups: ['snapshot.kubevirt.io'] - resources: ['virtualmachinesnapshots'] - verbs: ['get', 'list', 'delete'] - - apiGroups: ['cdi.kubevirt.io'] - resources: ['datavolumes', 'datasources'] - verbs: ['get', 'list', 'delete'] - - apiGroups: ['template.openshift.io'] - resources: ['templates'] - verbs: ['get', 'list', 'delete'] - - apiGroups: ['k8s.cni.cncf.io'] - resources: ['network-attachment-definitions'] - verbs: ['get', 'list', 'delete'] - - apiGroups: ['instancetype.kubevirt.io'] - resources: ['virtualmachineinstancetypes', 'virtualmachinepreferences'] - verbs: ['get', 'list', 'delete'] ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: ci-env-controller - labels: - app.kubernetes.io/component: ci-env-controller -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: ci-env-controller -subjects: - - kind: ServiceAccount - name: ci-env-controller - namespace: ci-env diff --git 
a/ci-scripts/helm/ci-env-controller/templates/clusterrole-test-runner.yaml b/ci-scripts/helm/ci-env-controller/templates/clusterrole-test-runner.yaml new file mode 100644 index 0000000000..51578eff51 --- /dev/null +++ b/ci-scripts/helm/ci-env-controller/templates/clusterrole-test-runner.yaml @@ -0,0 +1,33 @@ +# ClusterRole bound (via RoleBinding) to the ARC runner SA in each test namespace. +# The ci-env-controller creates RoleBindings for this role during provisioning. +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: ci-env-test-runner + labels: + {{- include "ci-env-controller.labels" . | nindent 4 }} +rules: + - apiGroups: [''] + resources: ['secrets'] + verbs: ['get', 'list', 'create', 'update', 'patch', 'delete'] + - apiGroups: [''] + resources: ['persistentvolumeclaims'] + verbs: ['get', 'list', 'delete'] + - apiGroups: ['kubevirt.io'] + resources: ['virtualmachines'] + verbs: ['get', 'list', 'watch', 'create', 'update', 'patch', 'delete'] + - apiGroups: ['snapshot.kubevirt.io'] + resources: ['virtualmachinesnapshots'] + verbs: ['get', 'list', 'delete'] + - apiGroups: ['cdi.kubevirt.io'] + resources: ['datavolumes', 'datasources'] + verbs: ['get', 'list', 'delete'] + - apiGroups: ['template.openshift.io'] + resources: ['templates'] + verbs: ['get', 'list', 'delete'] + - apiGroups: ['k8s.cni.cncf.io'] + resources: ['network-attachment-definitions'] + verbs: ['get', 'list', 'delete'] + - apiGroups: ['instancetype.kubevirt.io'] + resources: ['virtualmachineinstancetypes', 'virtualmachinepreferences'] + verbs: ['get', 'list', 'delete'] diff --git a/ci-scripts/helm/ci-env-controller/templates/clusterrolebinding.yaml b/ci-scripts/helm/ci-env-controller/templates/clusterrolebinding.yaml new file mode 100644 index 0000000000..e9f8ecc3e8 --- /dev/null +++ b/ci-scripts/helm/ci-env-controller/templates/clusterrolebinding.yaml @@ -0,0 +1,14 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: 
ci-env-controller + labels: + {{- include "ci-env-controller.labels" . | nindent 4 }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: ci-env-controller +subjects: + - kind: ServiceAccount + name: ci-env-controller + namespace: {{ .Values.namespace }} diff --git a/ci-scripts/helm/ci-env-controller/templates/configmap-script.yaml b/ci-scripts/helm/ci-env-controller/templates/configmap-script.yaml new file mode 100644 index 0000000000..02a39b3d33 --- /dev/null +++ b/ci-scripts/helm/ci-env-controller/templates/configmap-script.yaml @@ -0,0 +1,10 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: ci-env-controller-script + namespace: {{ .Values.namespace }} + labels: + {{- include "ci-env-controller.labels" . | nindent 4 }} +data: + ci-env-controller.sh: | + {{- .Files.Get "scripts/ci-env-controller.sh" | nindent 4 }} diff --git a/ci-scripts/ci-env/ci-env-controller-deployment.yaml b/ci-scripts/helm/ci-env-controller/templates/deployment.yaml similarity index 61% rename from ci-scripts/ci-env/ci-env-controller-deployment.yaml rename to ci-scripts/helm/ci-env-controller/templates/deployment.yaml index 8cce2b9d7b..ff6f554ca3 100644 --- a/ci-scripts/ci-env/ci-env-controller-deployment.yaml +++ b/ci-scripts/helm/ci-env-controller/templates/deployment.yaml @@ -1,18 +1,11 @@ -# ci-env-controller Deployment. -# The controller script is mounted from a ConfigMap so it can be updated -# without rebuilding the container image. -# -# The image is a UBI9-based image with oc, helm, jq, yq, curl. -# Set CI_ENV_CONTROLLER_IMAGE before applying, or substitute with sed/envsubst. ---- apiVersion: apps/v1 kind: Deployment metadata: name: ci-env-controller - namespace: ci-env + namespace: {{ .Values.namespace }} labels: + {{- include "ci-env-controller.labels" . 
| nindent 4 }} app: ci-env-controller - app.kubernetes.io/component: ci-env-controller spec: replicas: 1 selector: @@ -22,25 +15,31 @@ spec: metadata: labels: app: ci-env-controller + {{- include "ci-env-controller.labels" . | nindent 8 }} + annotations: + # Triggers a rolling restart whenever the controller script changes. + checksum/script: {{ .Files.Get "scripts/ci-env-controller.sh" | sha256sum }} spec: serviceAccountName: ci-env-controller containers: - name: controller - image: CI_ENV_CONTROLLER_IMAGE_PLACEHOLDER + image: {{ required "image is required" .Values.image }} command: ['/bin/bash', '/opt/ci-env/controller/ci-env-controller.sh'] env: - name: CI_ENV_NS - value: ci-env + value: {{ .Values.namespace | quote }} - name: CI_ENV_TTL_SECONDS - value: '7200' + value: {{ .Values.ttlSeconds | quote }} - name: CI_ENV_LABEL - value: ci.kubevirt-plugin/type=test-environment + value: {{ .Values.label | quote }} - name: HELM_CHART_PATH - value: /opt/ci-env/helm/ci-test-stack + value: {{ .Values.helmChartPath | quote }} - name: RUNNER_SA_NAME - value: RUNNER_SA_NAME_PLACEHOLDER + value: {{ .Values.runner.saName | quote }} - name: RUNNER_SA_NS - value: RUNNER_SA_NS_PLACEHOLDER + value: {{ .Values.runner.saNamespace | quote }} + - name: CONSOLE_IMAGE_REGISTRY + value: {{ .Values.consoleImageRegistry | quote }} volumeMounts: - name: controller-script mountPath: /opt/ci-env/controller diff --git a/ci-scripts/ci-env/ci-env-namespace.yaml b/ci-scripts/helm/ci-env-controller/templates/namespace.yaml similarity index 52% rename from ci-scripts/ci-env/ci-env-namespace.yaml rename to ci-scripts/helm/ci-env-controller/templates/namespace.yaml index e2547fff21..408ab8f148 100644 --- a/ci-scripts/ci-env/ci-env-namespace.yaml +++ b/ci-scripts/helm/ci-env-controller/templates/namespace.yaml @@ -1,7 +1,7 @@ apiVersion: v1 kind: Namespace metadata: - name: ci-env + name: {{ .Values.namespace }} labels: - app.kubernetes.io/component: ci-env-controller + {{- include 
"ci-env-controller.labels" . | nindent 4 }} app.kubernetes.io/part-of: kubevirt-plugin-ci diff --git a/ci-scripts/helm/ci-env-controller/templates/role-trigger.yaml b/ci-scripts/helm/ci-env-controller/templates/role-trigger.yaml new file mode 100644 index 0000000000..43d54d2022 --- /dev/null +++ b/ci-scripts/helm/ci-env-controller/templates/role-trigger.yaml @@ -0,0 +1,14 @@ +# Namespaced Role that allows the ARC runner SA to manage trigger ConfigMaps +# in the ci-env namespace, which is how it signals the controller to provision +# or tear down a test environment. +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: ci-env-trigger + namespace: {{ .Values.namespace }} + labels: + {{- include "ci-env-controller.labels" . | nindent 4 }} +rules: + - apiGroups: [''] + resources: ['configmaps'] + verbs: ['get', 'list', 'watch', 'create', 'update', 'patch', 'delete'] diff --git a/ci-scripts/helm/ci-env-controller/templates/rolebinding-runner.yaml b/ci-scripts/helm/ci-env-controller/templates/rolebinding-runner.yaml new file mode 100644 index 0000000000..91a7f0ec56 --- /dev/null +++ b/ci-scripts/helm/ci-env-controller/templates/rolebinding-runner.yaml @@ -0,0 +1,15 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: ci-env-trigger-runner + namespace: {{ .Values.namespace }} + labels: + {{- include "ci-env-controller.labels" . 
| nindent 4 }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: ci-env-trigger +subjects: + - kind: ServiceAccount + name: {{ .Values.runner.saName }} + namespace: {{ .Values.runner.saNamespace }} diff --git a/ci-scripts/helm/ci-env-controller/templates/serviceaccount.yaml b/ci-scripts/helm/ci-env-controller/templates/serviceaccount.yaml new file mode 100644 index 0000000000..235e9b1307 --- /dev/null +++ b/ci-scripts/helm/ci-env-controller/templates/serviceaccount.yaml @@ -0,0 +1,7 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: ci-env-controller + namespace: {{ .Values.namespace }} + labels: + {{- include "ci-env-controller.labels" . | nindent 4 }} diff --git a/ci-scripts/helm/ci-env-controller/values.yaml b/ci-scripts/helm/ci-env-controller/values.yaml new file mode 100644 index 0000000000..e65aa32e0b --- /dev/null +++ b/ci-scripts/helm/ci-env-controller/values.yaml @@ -0,0 +1,24 @@ +# Image for the ci-env-controller pod. +# Required: set at install time via --set image=... or CI_ENV_CONTROLLER_IMAGE. +image: '' + +# Namespace where the controller runs and where trigger ConfigMaps live. +namespace: ci-env + +# Force-clean environments older than this many seconds (default: 2h). +ttlSeconds: 7200 + +# Label selector the controller uses to find trigger ConfigMaps. +label: 'ci.kubevirt-plugin/type=test-environment' + +# Path to the ci-test-stack Helm chart embedded in the controller image. +helmChartPath: /opt/ci-env/helm/ci-test-stack + +# Base registry for auto-resolving the console image from cluster version. +consoleImageRegistry: quay.io/openshift/origin-console + +# ARC runner ServiceAccount that needs permission to create ConfigMaps in the +# ci-env namespace (triggers environment provisioning). 
+runner: + saName: kubevirt-plugin-ci-gha-rs-no-permission + saNamespace: arc-runners diff --git a/ci-scripts/helm/ci-test-stack/values.yaml b/ci-scripts/helm/ci-test-stack/values.yaml index de9bfce238..1a35d8302a 100644 --- a/ci-scripts/helm/ci-test-stack/values.yaml +++ b/ci-scripts/helm/ci-test-stack/values.yaml @@ -23,7 +23,7 @@ console: thanosUrl: '' alertmanagerUrl: '' -# ClusterRole pre-created at ARC install time (ci-console-clusterrole.yaml) +# ClusterRole created by the ci-env-controller Helm chart (clusterrole-console.yaml). rbac: consoleClusterRole: ci-console # ClusterRole that grants test permissions inside the test namespace From 72e66f5c50cff53223a066e8526a3d8aac647a5a Mon Sep 17 00:00:00 2001 From: Scott J Dickerson Date: Thu, 23 Apr 2026 15:07:46 -0400 Subject: [PATCH 08/42] update installs so everything works nicely Signed-off-by: Scott J Dickerson --- ci-scripts/arc/install-arc-controller.sh | 11 +++-- ci-scripts/arc/install-runner-scale-set.sh | 9 ++-- .../ci-env/install-ci-env-controller.sh | 42 ++++++++++++------- .../templates/namespace.yaml | 7 ---- ci-scripts/images/setup-arc-runner-image.sh | 11 +++-- .../images/setup-ci-env-runner-image.sh | 13 ++++-- 6 files changed, 56 insertions(+), 37 deletions(-) delete mode 100644 ci-scripts/helm/ci-env-controller/templates/namespace.yaml diff --git a/ci-scripts/arc/install-arc-controller.sh b/ci-scripts/arc/install-arc-controller.sh index 97cf4c2a93..d6ae4017ac 100755 --- a/ci-scripts/arc/install-arc-controller.sh +++ b/ci-scripts/arc/install-arc-controller.sh @@ -34,7 +34,7 @@ oc create namespace "${ARC_CONTROLLER_NS}" --dry-run=client -o yaml | oc apply - echo "Applying ARC SCC and ClusterRole (github-arc)..." 
oc apply -f "${ARC_DIR}/arc-openshift-scc.yaml" -CONTROLLER_ARGS=(--namespace "${ARC_CONTROLLER_NS}") +CONTROLLER_ARGS=() if [[ -n "${ARC_VERSION}" && "${ARC_VERSION}" != "latest" ]]; then CONTROLLER_ARGS+=(--version "${ARC_VERSION}") fi @@ -43,10 +43,13 @@ CONTROLLER_SA_NAME="${ARC_CONTROLLER_INSTALL_NAME}-gha-rs-controller" CONTROLLER_ARGS+=(--set "serviceAccount.name=${CONTROLLER_SA_NAME}") echo "Installing ARC controller (Helm release: ${ARC_CONTROLLER_INSTALL_NAME})..." -helm upgrade --install "${ARC_CONTROLLER_INSTALL_NAME}" \ - "${CONTROLLER_ARGS[@]}" \ +helm upgrade \ + "${ARC_CONTROLLER_INSTALL_NAME}" \ "${ARC_HELM_REPO}/gha-runner-scale-set-controller" \ - --wait + --install \ + --namespace "${ARC_CONTROLLER_NS}" \ + "${CONTROLLER_ARGS[@]}" \ + --wait --timeout 5m echo "" echo "=== ARC controller installation complete ===" diff --git a/ci-scripts/arc/install-runner-scale-set.sh b/ci-scripts/arc/install-runner-scale-set.sh index db17d2798c..b6b678c073 100755 --- a/ci-scripts/arc/install-runner-scale-set.sh +++ b/ci-scripts/arc/install-runner-scale-set.sh @@ -53,6 +53,7 @@ echo " ARC_CONFIG_URL: ${ARC_CONFIG_URL}" echo " ARC_CONTROLLER_NS: ${ARC_CONTROLLER_NS}" echo " ARC_CONTROLLER_INSTALL_NAME: ${ARC_CONTROLLER_INSTALL_NAME}" echo " ARC_RUNNERS_NS: ${ARC_RUNNERS_NS}" +echo " ARC_RUNNER_IMAGE: ${ARC_RUNNER_IMAGE:-(not set, will use default)}" echo " ARC_HELM_REPO: ${ARC_HELM_REPO}" echo " ARC_VERSION: ${ARC_VERSION}" echo " RUNNER_SCALE_SET_NAME: ${RUNNER_SCALE_SET_NAME}" @@ -87,11 +88,13 @@ if [[ -n "${ARC_VERSION}" && "${ARC_VERSION}" != "latest" ]]; then fi echo "Installing runner scale set '${RUNNER_SCALE_SET_NAME}'..." 
-helm upgrade --install "${RUNNER_SCALE_SET_NAME}" \ +helm upgrade \ + "${RUNNER_SCALE_SET_NAME}" \ + "${ARC_HELM_REPO}/gha-runner-scale-set" \ + --install \ --namespace "${ARC_RUNNERS_NS}" \ "${RUNNER_SET_ARGS[@]}" \ - "${ARC_HELM_REPO}/gha-runner-scale-set" \ - --wait + --wait --timeout 5m RUNNER_SA="${RUNNER_SCALE_SET_NAME}-gha-rs-no-permission" echo "Binding SCC github-arc to runner ServiceAccount ${RUNNER_SA}..." diff --git a/ci-scripts/ci-env/install-ci-env-controller.sh b/ci-scripts/ci-env/install-ci-env-controller.sh index f798e4262e..f3a498b8bc 100755 --- a/ci-scripts/ci-env/install-ci-env-controller.sh +++ b/ci-scripts/ci-env/install-ci-env-controller.sh @@ -19,6 +19,9 @@ set -euo pipefail CI_ENV_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" CI_SCRIPTS_DIR="$(cd "${CI_ENV_DIR}/.." && pwd)" +source "${CI_SCRIPTS_DIR}/_cluster-helpers.sh" +verify_oc + CHART_DIR="${CI_SCRIPTS_DIR}/helm/ci-env-controller" CI_ENV_NS="${CI_ENV_NS:-ci-env}" @@ -27,16 +30,13 @@ RUNNER_SCALE_SET_NAME="${RUNNER_SCALE_SET_NAME:-kubevirt-plugin-ci}" RUNNER_SA_NAME="${RUNNER_SCALE_SET_NAME}-gha-rs-no-permission" echo "=== CI Environment Controller installation ===" -echo " CI_ENV_NS: ${CI_ENV_NS}" -echo " ARC_RUNNERS_NS: ${ARC_RUNNERS_NS}" -echo " RUNNER_SA_NAME: ${RUNNER_SA_NAME}" +echo " CI_ENV_NS: ${CI_ENV_NS}" +echo " CI_ENV_CONTROLLER_IMAGE: ${CI_ENV_CONTROLLER_IMAGE:-(not set, will build)}" +echo " ARC_RUNNERS_NS: ${ARC_RUNNERS_NS}" +echo " RUNNER_SCALE_SET_NAME: ${RUNNER_SCALE_SET_NAME}" +echo " RUNNER_SA_NAME: ${RUNNER_SA_NAME}" echo "" -if ! oc get clusterversion version &>/dev/null; then - echo "ERROR: This script targets OpenShift only." - exit 1 -fi - # --- Resolve controller image --- if [[ -z "${CI_ENV_CONTROLLER_IMAGE:-}" ]]; then echo "Building controller image via setup-ci-env-runner-image.sh..." 
@@ -49,20 +49,30 @@ if [[ -z "${CI_ENV_CONTROLLER_IMAGE:-}" ]]; then fi echo "Built image: ${CI_ENV_CONTROLLER_IMAGE}" else - echo "Using pre-built image: ${CI_ENV_CONTROLLER_IMAGE}" + echo "Using provided image: ${CI_ENV_CONTROLLER_IMAGE}" fi +# --- Create namespace --- +echo "Creating namespace ${CI_ENV_NS}..." +oc create namespace "${CI_ENV_NS}" --dry-run=client -o yaml | oc apply -f - + # --- Install / upgrade the chart --- +VALUES_ARGS=( + --set "namespace=${CI_ENV_NS}" + --set "image=${CI_ENV_CONTROLLER_IMAGE}" + --set "runner.saName=${RUNNER_SA_NAME}" + --set "runner.saNamespace=${ARC_RUNNERS_NS}" +) + echo "" echo "Running helm upgrade --install..." -helm upgrade --install ci-env-controller \ +helm upgrade \ + ci-env-controller \ "${CHART_DIR}" \ - --create-namespace \ - --set "namespace=${CI_ENV_NS}" \ - --set "image=${CI_ENV_CONTROLLER_IMAGE}" \ - --set "runner.saName=${RUNNER_SA_NAME}" \ - --set "runner.saNamespace=${ARC_RUNNERS_NS}" \ - --wait --timeout 120s + --install \ + --namespace "${CI_ENV_NS}" \ + "${VALUES_ARGS[@]}" \ + --wait --timeout 5m echo "" echo "=== CI Environment Controller installation complete ===" diff --git a/ci-scripts/helm/ci-env-controller/templates/namespace.yaml b/ci-scripts/helm/ci-env-controller/templates/namespace.yaml deleted file mode 100644 index 408ab8f148..0000000000 --- a/ci-scripts/helm/ci-env-controller/templates/namespace.yaml +++ /dev/null @@ -1,7 +0,0 @@ -apiVersion: v1 -kind: Namespace -metadata: - name: {{ .Values.namespace }} - labels: - {{- include "ci-env-controller.labels" . 
| nindent 4 }} - app.kubernetes.io/part-of: kubevirt-plugin-ci diff --git a/ci-scripts/images/setup-arc-runner-image.sh b/ci-scripts/images/setup-arc-runner-image.sh index 4294d0a695..1fdc468eb1 100755 --- a/ci-scripts/images/setup-arc-runner-image.sh +++ b/ci-scripts/images/setup-arc-runner-image.sh @@ -16,7 +16,11 @@ # # Binary URL resolution: # Uses ci-scripts/_cluster-helpers.sh to resolve cluster resources - +# +# Namespace note: If the namespace does not match the arc-runner deployment namespace, +# the service account that pulls the image must be granted the "system:image-puller" role so the built image +# can be pulled. +# set -euo pipefail SCRIPT_DIR="$(dirname "${BASH_SOURCE[0]}")" REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)" @@ -24,7 +28,7 @@ REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)" source "${REPO_ROOT}/ci-scripts/_cluster-helpers.sh" verify_oc -NS="${NS:-ci-env-images}" +NS="${NS:-arc-runners}" IMAGE_DIR="${SCRIPT_DIR}/arc-runner" IMAGE_NAME="arc-runner" @@ -112,10 +116,11 @@ IMAGE_REF="${INTERNAL_REGISTRY}/${NS}/${IMAGE_NAME}:latest" echo "" echo "=== Build complete ===" echo "Image: ${IMAGE_REF}" +echo "" # TODO: Better handling of passing the fqdn image name to the caller if [[ -n "${ARC_RUNNER_IMAGE_FILE:-}" ]]; then printf '%s\n' "${IMAGE_REF}" > "${ARC_RUNNER_IMAGE_FILE}" echo "Wrote ${ARC_RUNNER_IMAGE_FILE}" fi -echo "IMAGE_REF=${IMAGE_REF}" +#echo "IMAGE_REF=${IMAGE_REF}" diff --git a/ci-scripts/images/setup-ci-env-runner-image.sh b/ci-scripts/images/setup-ci-env-runner-image.sh index 490233f23e..c08cb53036 100755 --- a/ci-scripts/images/setup-ci-env-runner-image.sh +++ b/ci-scripts/images/setup-ci-env-runner-image.sh @@ -6,7 +6,7 @@ # Output: prints IMAGE_REF= to stdout (and to CI_ENV_CONTROLLER_IMAGE_FILE if set).
# # Optional environment variables: # -# NS Namespace for the controller (default: ci-env-images) +# NS Namespace for the controller (default: ci-env) # OC_VERSION OpenShift client version build-arg (default: detect or 4.20) # HELM_VERSION Helm version build-arg (default: 3.19.0) # YQ_VERSION yq version build-arg (default: v4.52.5) @@ -15,7 +15,11 @@ # # Binary URL resolution: # Uses ci-scripts/_cluster-helpers.sh to resolve cluster resources - +# +# Namespace note: If the namespace does not match the ci-env-runner deployment namespace, +# the service account that pulls the image must be granted the "system:image-puller" role so the built image +# can be pulled. +# set -euo pipefail SCRIPT_DIR="$(dirname "${BASH_SOURCE[0]}")" REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)" @@ -23,7 +27,7 @@ REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)" source "${REPO_ROOT}/ci-scripts/_cluster-helpers.sh" verify_oc -NS="${NS:-ci-env-images}" +NS="${NS:-ci-env}" IMAGE_DIR="${SCRIPT_DIR}/ci-env-runner" IMAGE_NAME="ci-env-runner" @@ -109,10 +113,11 @@ IMAGE_REF="${INTERNAL_REGISTRY}/${NS}/${IMAGE_NAME}:latest" echo "" echo "=== Build complete ===" echo "Image: ${IMAGE_REF}" +echo "" # TODO: Better handling of passing the fqdn image name to the caller if [[ -n "${CI_ENV_RUNNER_IMAGE_FILE:-}" ]]; then printf '%s\n' "${IMAGE_REF}" > "${CI_ENV_RUNNER_IMAGE_FILE}" echo "Wrote ${CI_ENV_RUNNER_IMAGE_FILE}" fi -echo "IMAGE_REF=${IMAGE_REF}" +#echo "IMAGE_REF=${IMAGE_REF}" From 2dd4a9322bd30062d17358232bc7025e4dae6dcf Mon Sep 17 00:00:00 2001 From: Scott J Dickerson Date: Thu, 30 Apr 2026 14:59:39 -0400 Subject: [PATCH 09/42] docs(ci): fix workflow name typo in README Correct "POC Hot ClusterE2E CI Test" to "POC Hot Cluster E2E CI Test" (missing space before E2E) in both the workflow-to-name table and the Variant A usage steps. Matches the actual name: field in .github/workflows/poc-e2e-ci-test.yml.
Made-with: Cursor --- ci-scripts/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci-scripts/README.md b/ci-scripts/README.md index 8005a86a22..9f1868a53a 100644 --- a/ci-scripts/README.md +++ b/ci-scripts/README.md @@ -190,7 +190,7 @@ To turn off dind (no Docker daemon in the pod): `export CONTAINER_MODE=none` and | `.github/workflows/ibmc-cluster-setup.yml` | IBM Cloud Hot Cluster Setup | | `.github/workflows/ibmc-cluster-teardown.yml` | IBM Cloud Hot Cluster Teardown | | `.github/workflows/ibmc-cluster-auto-teardown.yml` | IBM Cloud Hot Cluster Auto-Teardown | -| `.github/workflows/poc-e2e-ci-test.yml` | POC Hot ClusterE2E CI Test | +| `.github/workflows/poc-e2e-ci-test.yml` | POC Hot Cluster E2E CI Test | | `.github/workflows/poc-e2e-ci-test2.yml` | POC Hot Cluster E2E CI Test 2 | ### Setting up the hot cluster @@ -205,7 +205,7 @@ To turn off dind (no Docker daemon in the pod): `export CONTAINER_MODE=none` and **Variant A — `poc-e2e-ci-test.yml` (IBM Cloud cluster health checks then run `poc-e2e-ci-test2.yml`)** -1. Actions → **POC Hot ClusterE2E CI Test** +1. Actions → **POC Hot Cluster E2E CI Test** 2. Inputs: Cypress spec (default `tests/gating.cy.ts`), cluster name 3. Runs `check-cluster-health.sh` on `ubuntu-latest` with an IBM Cloud kubeconfig; fails fast if the cluster is unhealthy 4. On success, calls `poc-e2e-ci-test2.yml` via `workflow_call` to run the tests From 9ce2f10e24a5175460bbb70270303a83cbd39c36 Mon Sep 17 00:00:00 2001 From: Scott J Dickerson Date: Thu, 30 Apr 2026 14:59:59 -0400 Subject: [PATCH 10/42] ci(poc-test2): run check-runner and build-plugin-image in parallel Remove the sequential dependency between check-runner and build-kubevirt-plugin-image. The diagnostics job produces no outputs consumed by the build, so there is no functional coupling. Gate run-gating-tests on both jobs so it still waits for runner validation before provisioning the test environment. 
This saves the full duration of check-runner (~1-2 min) from the critical path on every run. Made-with: Cursor --- .github/workflows/poc-e2e-ci-test2.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/poc-e2e-ci-test2.yml b/.github/workflows/poc-e2e-ci-test2.yml index f338e89dc8..e198e2bf70 100644 --- a/.github/workflows/poc-e2e-ci-test2.yml +++ b/.github/workflows/poc-e2e-ci-test2.yml @@ -154,7 +154,6 @@ jobs: build-kubevirt-plugin-image: name: Build Kubevirt Plugin Image - needs: check-runner runs-on: ubuntu-latest outputs: kubevirt-plugin-image: ${{ env.KUBEVIRT_PLUGIN_IMAGE }} @@ -210,7 +209,7 @@ jobs: run-gating-tests: name: Run Gating Tests - needs: build-kubevirt-plugin-image + needs: [check-runner, build-kubevirt-plugin-image] runs-on: kubevirt-plugin-ci timeout-minutes: 120 env: From 7850354fe752e141a15b2a97852c1334d5fe72c4 Mon Sep 17 00:00:00 2001 From: Scott J Dickerson Date: Tue, 12 May 2026 00:36:18 -0400 Subject: [PATCH 11/42] ci-env-controller: add helpful deployment notes Signed-off-by: Scott J Dickerson --- .../ci-env-controller/templates/NOTES.txt | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 ci-scripts/helm/ci-env-controller/templates/NOTES.txt diff --git a/ci-scripts/helm/ci-env-controller/templates/NOTES.txt b/ci-scripts/helm/ci-env-controller/templates/NOTES.txt new file mode 100644 index 0000000000..f98e0a145f --- /dev/null +++ b/ci-scripts/helm/ci-env-controller/templates/NOTES.txt @@ -0,0 +1,36 @@ +CI Environment Controller deployed. 
+ +Controller namespace: {{ .Values.namespace }} +Runner SA binding: {{ .Values.runner.saNamespace }}/{{ .Values.runner.saName }} +Trigger selector: {{ .Values.label }} + +Manually request a test stack for a plugin image: + export CONFIGMAP_NAME=ci-env-test-1234 + export PLUGIN_IMAGE=ttl.sh/custom-plugin-image-1234:1h + export TEST_NAMESPACE=ci-env-test-ns-12324 + + oc create -f - < Date: Tue, 12 May 2026 00:36:40 -0400 Subject: [PATCH 12/42] docs(ci): add POC_OUTLINE.md Signed-off-by: Scott J Dickerson --- ci-scripts/POC_OUTLINE.md | 72 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) create mode 100644 ci-scripts/POC_OUTLINE.md diff --git a/ci-scripts/POC_OUTLINE.md b/ci-scripts/POC_OUTLINE.md new file mode 100644 index 0000000000..becbb84377 --- /dev/null +++ b/ci-scripts/POC_OUTLINE.md @@ -0,0 +1,72 @@ +# Part 1 - Get a github action workflow running on an OpenShift cluster + +Use GitHub Action Runner Controller (ARC) to enable "ephemeral self-hosted runners" as a workflow job `runs-on` target. + +Notes: + +- A two part install via helm: + + - runner scale set controller (ARC) - installed once per cluster + - runner scale set (RSS) - installed per repo+runner, provides handling for a specific self-hosted runner + +- No incoming network access is required, ARC uses a polling system + +- The helm chart is patched at install time to: + + - Define the SCC, SA and RBAC bindings the ARC, RSS listener, and RSS ephemeral runner pods will be assigned + - Define the RSS ephemeral container's image + +- Authentication from the ARC and RSS to GitHub uses a GitHub App + +- All of the configurations allow us to control what image runs workflows, how that deployment is managed, and the workflow's embedded permissions to interact with the cluster + +# Part 2 - Control the workflow's container image + +Develop an ARC runner image that includes all needed tools. 
This allows specific control over what tools and libraries are provided by default to the workflows. + +Notes: + +- Use an `ImageStream` and `BuildConfig` to build the container image directly on the Cluster itself. +- The script that sets up the `BuildConfig` can look up the proper versions of `oc` and `virtctl` +- The build can include specific versions/locations of important tools + +# Part 3 - Setup an action run specific CI environment on OpenShift cluster + +Each test run needs to establish its own self-contained (as much as possible at least) environment to run tests. This comes down to creating a namespace, deploying the plugin to be tested, and deploying a dev console. When it is all running, there is a console+plugin+CI namespace easily created for each CI run. + +Notes: + +- A two part helm chart based install: + + - `ci-env-controller` uses a collection of ConfigMaps to control CI test environments + - `ci-test-stack` is the deployment to be able to run and access a console, plugin and namespace + +- The arc RSS only needs enough permissions to manipulate the ConfigMaps + +- The controller sets up the namespace, and runs the console in "off cluster" mode with no authentication required + +- Accessing the test stack's route runs with a SA that has enough permissions to do everything needed for the e2e testing + +- The exact way this is all deployed and how the SA and role binding are set up could use some additional work to make the RBAC being used everywhere very obvious + +- The ConfigMaps will time out and the test stack will be reaped around a default of 2 hours after it is created. + +# Part 4 - Run full workflow from GitHub, watch all job steps run as expected + +The workflow job will run on the RSS, be able to create a `ci-test-stack` by pushing a ConfigMap to the `ci-env-controller`, get a route to the test stack and run all tests targeting that route.
+ +Notes: + +- The route can be on-cluster only, or use a publicly available route + +- GitHub actions are used to request and release the test stack + +- Tests can run any way we want. The POC is using a slim version of the cypress gating tests and the Cypress provided github action. + +# Part 5 - Updates to diagnostic and test results artifact tracking + +Anything that could be useful (test reports, pod logs, cluster log) is collected and pushed to artifacts to capture all the test run details. + +# Part 6 - Allow developers to manually create/remove CI environments + +Since a `ci-test-stack` can be requested by creating a ConfigMap, anyone logged in with the permissions to ask for one can get one. This allows developers to be able to manually create a test stack with any custom build of a plugin image they want. From 06a031e22fa3213bbd84ddf477ef1e550101c005 Mon Sep 17 00:00:00 2001 From: Gal Kremer Date: Wed, 17 Jun 2026 16:41:00 -0400 Subject: [PATCH 13/42] CNV-74265: Rename hot-cluster workflows and consolidate docs - Remove duplicate poc-e2e-ci-test.yml; keep hot-cluster-e2e.yml as entry - Rename poc-e2e-ci-test2.yml to hot-cluster-e2e-run.yml - Delete POC_HOT_CLUSTER_CI.md; move backlog to docs/HOT_CLUSTER_FUTURE_WORK.md - Add docs/HOT_CLUSTER_CI_CONTINUATION.md and docs/CLUSTER_LIFECYCLE.md - Use IC_KEY secret, secure oc install, Playwright E2E updates - Update ci-scripts/README and auto-teardown workflow references --- ...e-ci-test2.yml => hot-cluster-e2e-run.yml} | 127 ++++++++-------- ...oc-e2e-ci-test.yml => hot-cluster-e2e.yml} | 46 +++--- .../workflows/ibmc-cluster-auto-teardown.yml | 4 +- .github/workflows/ibmc-cluster-setup.yml | 12 +- .github/workflows/ibmc-cluster-teardown.yml | 4 +- POC_HOT_CLUSTER_CI.md | 142 ------------------ ci-scripts/README.md | 78 +++++----- ci-scripts/arc/arc-runner-scale-set.pod.yaml | 2 +- ci-scripts/images/arc-runner/Dockerfile | 9 +- ci-scripts/install-hco.sh | 27 +++- ci-scripts/start-console.sh | 2 +-
ci-scripts/test-cleanup.sh | 2 +- 12 files changed, 160 insertions(+), 295 deletions(-) rename .github/workflows/{poc-e2e-ci-test2.yml => hot-cluster-e2e-run.yml} (74%) rename .github/workflows/{poc-e2e-ci-test.yml => hot-cluster-e2e.yml} (55%) delete mode 100644 POC_HOT_CLUSTER_CI.md diff --git a/.github/workflows/poc-e2e-ci-test2.yml b/.github/workflows/hot-cluster-e2e-run.yml similarity index 74% rename from .github/workflows/poc-e2e-ci-test2.yml rename to .github/workflows/hot-cluster-e2e-run.yml index e198e2bf70..dd8ce9fcb7 100644 --- a/.github/workflows/poc-e2e-ci-test2.yml +++ b/.github/workflows/hot-cluster-e2e-run.yml @@ -1,33 +1,35 @@ -name: POC Hot Cluster E2E CI Test 2 +name: Hot Cluster E2E Run on: workflow_dispatch: inputs: - test_spec: - description: Cypress test spec to run + test_project: + description: Playwright project to run required: true - default: tests/poc-gating.cy.ts - type: string + default: gating + type: choice + options: + - gating + - features workflow_call: inputs: - test_spec: - description: Cypress test spec to run + test_project: + description: Playwright project to run type: string required: false - default: tests/poc-gating.cy.ts + default: gating permissions: contents: read actions: read env: - CYPRESS_CNV_NS: kubevirt-hyperconverged - CYPRESS_OS_IMAGES_NS: kubevirt-os-images - CYPRESS_TEST_NS: kubevirt-plugin-ci-test-${{ github.run_id }} - CYPRESS_TEST_SECRET_NAME: ci-test-secret + CNV_NS: kubevirt-hyperconverged + OS_IMAGES_NS: kubevirt-os-images + TEST_NS: kubevirt-plugin-ci-test-${{ github.run_id }} + TEST_SECRET_NAME: ci-test-secret - KUBEVIRT_PLUGIN_IMAGE: 'ttl.sh/kubevirt-plugin-ci-${{ github.run_id }}-${{ github.run_number }}:2h' - # KUBEVIRT_PLUGIN_IMAGE: ttl.sh/kubevirt-plugin-ci-1234:6h + KUBEVIRT_PLUGIN_IMAGE: 'ttl.sh/kubevirt-plugin-ci-${{ github.run_id }}-${{ github.run_attempt }}:2h' CI_ENV_NS: ci-env CI_ENV_CM: ci-env-${{ github.run_id }} @@ -47,8 +49,7 @@ jobs: echo "| --- | --- |" for var in HOME USER 
RUNNER_NAME RUNNER_OS RUNNER_ARCH \ GITHUB_REPOSITORY GITHUB_REF GITHUB_SHA GITHUB_RUN_ID GITHUB_RUN_NUMBER \ - CYPRESS_CNV_NS CYPRESS_OS_IMAGES_NS \ - CYPRESS_TEST_NS KUBEVIRT_PLUGIN_IMAGE KUBEVIRT_UI_PLUGIN_RUNNER; do + CNV_NS OS_IMAGES_NS TEST_NS KUBEVIRT_PLUGIN_IMAGE KUBEVIRT_UI_PLUGIN_RUNNER; do echo "| \`$var\` | \`${!var:-}\` |" done echo "" @@ -58,15 +59,21 @@ jobs: echo "" echo "| Tool | Available |" echo "| --- | --- |" + missing=0 for cmd in jq yq envsubst curl kubectl oc virtctl helm npm node; do if command -v "$cmd" &>/dev/null; then echo "| \`$cmd\` | ✅ |" else echo "| \`$cmd\` | ❌ |" + missing=1 fi done echo "" echo "" + if [[ "${missing}" -ne 0 ]]; then + echo "::error::Required tools are missing on the ARC runner" + exit 1 + fi echo "
npm / Node Versions" echo "" @@ -84,21 +91,29 @@ jobs: echo "" echo "| Tool | Client Version | Server Version |" echo "| --- | --- | --- |" + failed=0 for cmd in oc virtctl; do if command -v "$cmd" &>/dev/null; then - client="" - server="" - version_output=$("$cmd" version 2>/dev/null || true) + if ! version_output=$("$cmd" version 2>/dev/null); then + echo "| \`$cmd\` | ❌ version failed | ❌ |" + failed=1 + continue + fi client=$(echo "$version_output" | grep -i "client" | head -1 | sed 's/^[[:space:]]*//') server=$(echo "$version_output" | grep -i "server" | head -1 | sed 's/^[[:space:]]*//') echo "| \`$cmd\` | ${client:-N/A} | ${server:-N/A} |" else echo "| \`$cmd\` | ❌ not found | — |" + failed=1 fi done echo "
" echo "" } | tee -a "$GITHUB_STEP_SUMMARY" + if [[ "${failed}" -ne 0 ]]; then + echo "::error::Client/server version checks failed" + exit 1 + fi - name: Log HCO and managed operator versions continue-on-error: true @@ -110,7 +125,6 @@ jobs: if ! command -v oc &>/dev/null; then echo "> ⚠️ \`oc\` not found — skipping cluster version checks." else - # HCO itself is installed via OLM; its CSV is the authoritative version. echo "### HCO Version (OLM CSV)" echo "" echo "| Name | Version | Phase |" @@ -214,6 +228,8 @@ jobs: timeout-minutes: 120 env: PLUGIN_IMAGE: ${{ needs.build-kubevirt-plugin-image.outputs.kubevirt-plugin-image }} + HEADLESS: 'true' + RUN_FEATURE_TESTS: ${{ inputs.test_project == 'features' && 'true' || 'false' }} steps: - name: Checkout @@ -224,80 +240,65 @@ jobs: uses: ./.github/actions/ci-env-request with: plugin-image: ${{ env.PLUGIN_IMAGE }} - test-namespace: ${{ env.CYPRESS_TEST_NS }} + test-namespace: ${{ env.TEST_NS }} configmap-name: ${{ env.CI_ENV_CM }} ci-env-namespace: ${{ env.CI_ENV_NS }} - name: Create test secret run: | yq e ' - .metadata.name = strenv(CYPRESS_TEST_SECRET_NAME) | - .metadata.namespace = strenv(CYPRESS_TEST_NS) - ' cypress/fixtures/secret.yaml | oc apply -f - + .metadata.name = strenv(TEST_SECRET_NAME) | + .metadata.namespace = strenv(TEST_NS) + ' playwright/fixtures/secret.yaml | oc apply -f - - name: Seed kubevirt-user-settings ConfigMap run: | - # The ci-env-controller sets helm_release=cm_name when no override is - # provided, so CI_ENV_CM is always the Helm release name. SA_NAME="${CI_ENV_CM}-console" - # The plugin hook (useKubevirtUserSettings) keys settings by: - # user?.metadata?.uid OR sanitized(user?.metadata?.name) - # In no-auth mode the SA bearer token is used for all k8s calls, so - # users/~ returns a virtual user for system:serviceaccount::. - # We patch both the SA UID and the sanitized name so the hook always - # finds the right key regardless of whether OpenShift populates the UID. 
- SA_UID="$(oc get sa "${SA_NAME}" -n "${CYPRESS_TEST_NS}" \ + SA_UID="$(oc get sa "${SA_NAME}" -n "${TEST_NS}" \ -o jsonpath='{.metadata.uid}' 2>/dev/null || true)" - SANITIZED_NAME="system-serviceaccount-${CYPRESS_TEST_NS}-${SA_NAME}" + SANITIZED_NAME="system-serviceaccount-${TEST_NS}-${SA_NAME}" - USER_SETTINGS='{"quickStart":{"dontShowWelcomeModal":true},"onboardingPopoversHidden":{"vmsTab":true,"catalog":true,"createProject":true}}' + USER_SETTINGS='{"quickStart":{"dontShowWelcomeModal":true},"onboardingPopoversHidden":{"vmsTab":true,"catalog":true,"createProject":true,"navCollapse":true}}' - # Create the ConfigMap if HCO has not done so yet. - if ! oc get configmap kubevirt-user-settings -n "${CYPRESS_CNV_NS}" &>/dev/null; then - oc create configmap kubevirt-user-settings -n "${CYPRESS_CNV_NS}" + if ! oc get configmap kubevirt-user-settings -n "${CNV_NS}" &>/dev/null; then + oc create configmap kubevirt-user-settings -n "${CNV_NS}" fi - # Build a merge-patch that sets both possible user keys. 
PATCH=$(jq -cn \ --arg sa_uid "${SA_UID}" \ --arg name_key "${SANITIZED_NAME}" \ --arg val "${USER_SETTINGS}" \ '{data: (if ($sa_uid | length) > 0 then {($sa_uid): $val, ($name_key): $val} else {($name_key): $val} end)}') - oc patch configmap kubevirt-user-settings -n "${CYPRESS_CNV_NS}" \ + oc patch configmap kubevirt-user-settings -n "${CNV_NS}" \ --type merge -p "${PATCH}" - # TODO: Add dependency caching (either use the setup-node action with caching, or add explicit caching) + oc patch configmap kubevirt-ui-features -n "${CNV_NS}" --type=merge \ + --patch '{"data":{"advancedSearch":"true","treeViewFolders":"true"}}' + - name: Install dependencies run: | npm ci --ignore-scripts --no-audit - npx cypress install + npm run playwright-install - - name: Run gating tests - uses: cypress-io/github-action@v7 + - name: Run Playwright gating tests env: BRIDGE_BASE_ADDRESS: ${{ steps.ci-env.outputs.bridge-base-address }} - with: - summary-title: 'Cypress gating tests' - install: false - working-directory: ./cypress - env: openshift=true - spec: '${{ inputs.test_spec }}' - - - name: Generate test report - if: always() - run: npm run cypress-postreport || true + run: | + if [[ "${{ inputs.test_project }}" == "features" ]]; then + npm run test-playwright-headless -- --project=features + else + npm run test-playwright-headless -- --project=gating + fi - name: Upload test artifacts if: always() uses: actions/upload-artifact@v6 with: - name: cypress-results-${{ github.run_id }} + name: playwright-results-${{ github.run_id }} path: | - cypress/gui-test-screenshots/ - cypress/videos/ - cypress/results/ + playwright/test-results/ retention-days: 7 if-no-files-found: ignore @@ -310,9 +311,9 @@ jobs: HELM_RELEASE="$(oc get configmap "${CI_ENV_CM}" -n "${CI_ENV_NS}" \ -o jsonpath='{.data.helm-release}' 2>/dev/null || echo "${CI_ENV_CM}")" - oc logs -n "${CYPRESS_TEST_NS}" -l "app=${HELM_RELEASE}-console" --tail=-1 \ + oc logs -n "${TEST_NS}" -l "app=${HELM_RELEASE}-console" --tail=-1 
\ > "${TMP}/console.log" 2>&1 || true - oc logs -n "${CYPRESS_TEST_NS}" -l "app=${HELM_RELEASE}-plugin" --tail=-1 \ + oc logs -n "${TEST_NS}" -l "app=${HELM_RELEASE}-plugin" --tail=-1 \ > "${TMP}/kubevirt-plugin.log" 2>&1 || true - name: e2e CI diagnostics - Collect OpenShift cluster information @@ -324,8 +325,8 @@ jobs: oc cluster-info dump > "${TMP}/cluster_info.json" 2>/dev/null || true oc get pods -n kubevirt-hyperconverged -o wide > "${TMP}/hco_pods.txt" 2>/dev/null || true oc get nodes -o wide > "${TMP}/nodes.txt" 2>/dev/null || true - oc get events -n "${CYPRESS_TEST_NS}" --sort-by='.lastTimestamp' > "${TMP}/test_ns_events.txt" 2>/dev/null || true - oc get pods -n "${CYPRESS_TEST_NS}" -o wide > "${TMP}/test_ns_pods.txt" 2>/dev/null || true + oc get events -n "${TEST_NS}" --sort-by='.lastTimestamp' > "${TMP}/test_ns_events.txt" 2>/dev/null || true + oc get pods -n "${TEST_NS}" -o wide > "${TMP}/test_ns_pods.txt" 2>/dev/null || true - name: e2e CI diagnostics - Upload artifacts if: always() @@ -339,7 +340,7 @@ jobs: - name: Clean up test entities if: always() run: | - echo "Cleaning test entities in ${CYPRESS_TEST_NS}..." + echo "Cleaning test entities in ${TEST_NS}..." 
bash ci-scripts/test-cleanup.sh - name: Release CI test environment diff --git a/.github/workflows/poc-e2e-ci-test.yml b/.github/workflows/hot-cluster-e2e.yml similarity index 55% rename from .github/workflows/poc-e2e-ci-test.yml rename to .github/workflows/hot-cluster-e2e.yml index f34bf4e53a..c86a4c3cde 100644 --- a/.github/workflows/poc-e2e-ci-test.yml +++ b/.github/workflows/hot-cluster-e2e.yml @@ -1,23 +1,32 @@ -name: POC Hot Cluster E2E CI Test +name: Hot Cluster E2E on: + pull_request: + branches: [main] workflow_dispatch: inputs: - test_spec: - description: 'Cypress test spec to run (passed to POC E2E Test 2)' + test_project: + description: Playwright project to run (setup runs automatically as dependency) required: true - default: 'tests/gating.cy.ts' - type: string + default: gating + type: choice + options: + - gating + - features cluster_name: - description: 'Cluster name' + description: IBM Cloud cluster name required: true - default: 'kubevirt-plugin-ci' + default: kubevirt-plugin-ci type: string permissions: contents: read actions: read +concurrency: + group: hot-cluster-e2e-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + env: CLUSTER_NAME: ${{ inputs.cluster_name || 'kubevirt-plugin-ci' }} @@ -28,21 +37,19 @@ jobs: timeout-minutes: 10 steps: - name: Checkout - uses: actions/checkout@v5 + uses: actions/checkout@v6 - name: Setup IBM Cloud CLI uses: IBM/actions-ibmcloud-cli@v1 with: - api_key: ${{ secrets.IBM_CLOUD_API_KEY }} + api_key: ${{ secrets.IC_KEY }} plugins: kubernetes-service - - name: Install oc client from cluster + - name: Install oc client run: | - INGRESS=$(ibmcloud oc cluster get --cluster "${CLUSTER_NAME}" --json | jq -r '.ingressHostname') - curl -kLo oc.tar "https://downloads-openshift-console.${INGRESS}/amd64/linux/oc.tar" - tar -xvf oc.tar - sudo mv oc /usr/local/bin/ - echo "oc version: $(oc version --client)" + CLUSTER_JSON="$(ibmcloud oc cluster get --cluster "${CLUSTER_NAME}" --output json)" + 
export CLUSTER_JSON + bash ./ci-scripts/install-oc-client.sh - name: Configure kubeconfig run: | @@ -51,8 +58,7 @@ jobs: oc get nodes -o wide - name: Run health checks - run: | - ./ci-scripts/check-cluster-health.sh + run: ./ci-scripts/check-cluster-health.sh - name: Health check summary if: always() @@ -60,7 +66,7 @@ jobs: echo "## Cluster Health Check" >> "$GITHUB_STEP_SUMMARY" echo "" >> "$GITHUB_STEP_SUMMARY" if [[ "${{ job.status }}" == "success" ]]; then - echo "All health checks **passed**. Invoking **POC Hot Cluster E2E CI Test 2** for Cypress." >> "$GITHUB_STEP_SUMMARY" + echo "All health checks **passed**. Invoking Playwright gating tests on the hot cluster." >> "$GITHUB_STEP_SUMMARY" else echo "Health checks **failed**. E2E workflow was not invoked." >> "$GITHUB_STEP_SUMMARY" fi @@ -69,7 +75,7 @@ jobs: name: Run E2E Tests needs: cluster-health-check if: needs.cluster-health-check.result == 'success' - uses: ./.github/workflows/poc-e2e-ci-test2.yml + uses: ./.github/workflows/hot-cluster-e2e-run.yml with: - test_spec: ${{ inputs.test_spec }} + test_project: ${{ inputs.test_project || 'gating' }} secrets: inherit diff --git a/.github/workflows/ibmc-cluster-auto-teardown.yml b/.github/workflows/ibmc-cluster-auto-teardown.yml index 1a40e39acd..0e1d6b3c78 100644 --- a/.github/workflows/ibmc-cluster-auto-teardown.yml +++ b/.github/workflows/ibmc-cluster-auto-teardown.yml @@ -34,7 +34,7 @@ jobs: id: check_ci uses: actions/github-script@v8 env: - INCLUDE_WORKFLOWS: '[".github/workflows/poc-e2e-ci-test.yml", ".github/workflows/poc-e2e-ci-test2.yml"]' + INCLUDE_WORKFLOWS: '[".github/workflows/hot-cluster-e2e.yml", ".github/workflows/hot-cluster-e2e-run.yml"]' with: script: | const INCLUDE_WORKFLOWS = JSON.parse(process.env.INCLUDE_WORKFLOWS); @@ -86,7 +86,7 @@ jobs: uses: IBM/actions-ibmcloud-cli@v1 if: steps.check_ci.outputs.active_jobs == 'false' with: - api_key: ${{ secrets.IBM_CLOUD_API_KEY }} + api_key: ${{ secrets.IC_KEY }} plugins: kubernetes-service - 
name: Check idle threshold diff --git a/.github/workflows/ibmc-cluster-setup.yml b/.github/workflows/ibmc-cluster-setup.yml index 12ef175ad7..b6e193b5ee 100644 --- a/.github/workflows/ibmc-cluster-setup.yml +++ b/.github/workflows/ibmc-cluster-setup.yml @@ -52,7 +52,7 @@ jobs: - name: Setup IBM Cloud CLI uses: IBM/actions-ibmcloud-cli@v1 with: - api_key: ${{ secrets.IBM_CLOUD_API_KEY }} + api_key: ${{ secrets.IC_KEY }} plugins: kubernetes-service, container-registry - name: Check for existing cluster @@ -139,13 +139,11 @@ jobs: run: | ./ci-scripts/check-roks-cluster-state.sh - - name: Install oc client from cluster + - name: Install oc client from cluster version run: | - INGRESS=$(ibmcloud oc cluster get --cluster "${CLUSTER_NAME}" --json | jq -r '.ingressHostname') - curl -kLo oc.tar "https://downloads-openshift-console.${INGRESS}/amd64/linux/oc.tar" - tar -xvf oc.tar - sudo mv oc /usr/local/bin/ - echo "oc version: $(oc version --client)" + CLUSTER_JSON="$(ibmcloud oc cluster get --cluster "${CLUSTER_NAME}" --output json)" + export CLUSTER_JSON + bash ./ci-scripts/install-oc-client.sh - name: Configure kubeconfig run: | diff --git a/.github/workflows/ibmc-cluster-teardown.yml b/.github/workflows/ibmc-cluster-teardown.yml index e023fdf46d..e0b5135f44 100644 --- a/.github/workflows/ibmc-cluster-teardown.yml +++ b/.github/workflows/ibmc-cluster-teardown.yml @@ -16,7 +16,7 @@ on: default: 'kubevirt-plugin-ci' type: string secrets: - IBM_CLOUD_API_KEY: + IC_KEY: required: true BOT_PAT: required: false @@ -36,7 +36,7 @@ jobs: - name: Setup IBM Cloud CLI uses: IBM/actions-ibmcloud-cli@v1 with: - api_key: ${{ secrets.IBM_CLOUD_API_KEY }} + api_key: ${{ secrets.IC_KEY }} plugins: kubernetes-service - name: Check cluster exists diff --git a/POC_HOT_CLUSTER_CI.md b/POC_HOT_CLUSTER_CI.md deleted file mode 100644 index fa1d7a0fb2..0000000000 --- a/POC_HOT_CLUSTER_CI.md +++ /dev/null @@ -1,142 +0,0 @@ -# Running e2e CI on a hot cluster - -This POC explores how to run e2e CI 
testing on a long-lived OpenShift cluster, a hot cluster. The ability to repeatedly run multiple CI tests, and multiple CI tests in parallel, can deliver a much better e2e CI experience for developers. - -The work is split in a few parts: - -- GitHub action workflows to manage creating, configuring and tearing down a **OpenShift on IBM Cloud (ROKS)** cluster. -- Scripts to enable a cluster to run e2e CI from GitHub actions. The scripts install the Hyperconverged Cluster Operator (HCO) to provide kubevirt, and install GitHub Action Runner Controller (ARC) to provide the self-hosted runner support to the e2e CI workflows. -- Workflows to start an "off cluster" console with the kubevirt-plugin, and then run the e2e tests - -The hot cluster can be any Kubernetes cluster and is based on the GitHub Actions runner controller (ARC). It is installed via helm, and only requires network access to pull from GitHub within the cluster itself. The POC can even be run on a local CRC / OpenShift local development cluster without any special networking configuration. - ---- - -## ROKS as the hot cluster - -**ROKS** (Red Hat OpenShift Kubernetes Service) is IBM Cloud’s managed OpenShift. The hot cluster is created and destroyed through GitHub Actions (see `.github/workflows/ibmc-cluster-*.yml`): provision workers in a chosen zone/flavor, wait until the API is ready, then install the pieces below. **IBM Cloud CLI** plus `IBM_CLOUD_API_KEY` is used to pull an **admin kubeconfig** when needed—nothing long-lived is stored as a kubeconfig secret in GitHub. - -Keeping the cluster **up** avoids repeating **~1 hour** (or more) of create time per test wave and lets you use workers with **real KVM** or large memory where tests need it. **Auto-teardown** workflows can still remove an idle cluster to control cost. - ---- - -## HCO (Hyperconverged Cluster Operator) - -**HCO** installs and coordinates the KubeVirt stack on OpenShift (KubeVirt, CDI, networking helpers, etc.). 
After the hot cluster is reachable via `oc` cli, **`ci-scripts/install-hco.sh`** deploys the operator and related resources so the cluster can run **VMs and the virtualization UI** the same way a real product cluster would. Health checks (for example **`ci-scripts/check-cluster-health.sh`**) are available to verify HCO and core virt components before tests run. - ---- - -## ARC (Actions Runner Controller) - -**ARC** is GitHub’s supported way to run **self-hosted Actions runners on Kubernetes**. A **controller** enables a **runner scale set** to provide self-hosted runners to your repo's workflows. When a job requests a `runs-on` label that matches the runner scale set, ARC starts a **runner pod** that registers with GitHub, runs the job, then exits. Each run is a container inside an ephemeral runner. - -In this repo, ARC is installed with Helm charts **`gha-runner-scale-set-controller`** (once per cluster) and **`gha-runner-scale-set`** (once for each runner scale set needed). - -To best support running in OpenShift, and the specific needs of the kubevirt-plugin test stack, first the scripts **`setup-dind-mirror.sh`** (to mirror `docker:dind` into the cluster registry) and **`setup-runner-image.sh`** (build a **custom runner image** with node and cypress support, etc.) are run. - -Once the images are available, two scripts are used to fully setup the ARC install: - -- **`ci-scripts/arc/install-arc-controller.sh`** — controller namespace, SCC, controller Helm release. -- **`ci-scripts/arc/install-runner-scale-set.sh`** — runner scale set, dind post-render, SCC bind for runner pods, RBAC for `oc` in CI jobs. - -Runners default to **Docker-in-Docker (dind)** so workflow steps can use **`docker run`**. This is needed for the off-cluster console flow. GitHub authenticates ARC to the repo via a **GitHub App** (recommended) or a **PAT**. 
Specific details on setup and secretes needed are in **[ci-scripts/arc/README.md](ci-scripts/arc/README.md)** and **[ci-scripts/README.md](ci-scripts/README.md)**. - ---- - -## Self-hosted runner and off-cluster E2E - -E2E workflows can have jobs that use **`runs-on: kubevirt-plugin-ci`** so they execute **on the cluster** in the ARC ephemeral runners. The runners are close to the API and with `oc` RBAC. The **OpenShift console** under test is started **off-cluster**, similar to local development: - -1. A workflow job builds a kubevirt-plugin container specific for the workflow run, either from the workflow's running branch or, in the future, from a PR's branch, and pushes the container to an ephemeral container repo (ttl.sh currently). As long as the container repo being pushed to can be pulled from the cluster, the container build can run on standard GitHub runners. -2. **`ci-scripts/resolve-console-image.sh`** picks an **`origin-console`** image tag that matches the cluster’s OpenShift **x.y** version. -3. **`ci-scripts/start-plugin-container.sh`** runs the **plugin** image with HTTPS and nginx (like the operator-mounted serving certs pattern). -4. **`ci-scripts/start-console.sh`** runs the **bridge** in **off-cluster** mode: bearer token and API endpoint from `oc`, **plugin URL** pointing at the plugin container on the runner host, and optional **kubevirt API proxy** via a cluster Route. - -Cypress then drives the UI at **`http://localhost:9000`** while API calls go to the **real cluster**, so tests exercise **real KubeVirt** with a **local console + plugin** topology. - -Orchestration lives in **`.github/workflows/poc-e2e-ci-test.yml`** (cluster health on `ubuntu-latest`, then calls **`poc-e2e-ci-test2.yml`**) and the reusable **`poc-e2e-ci-test2.yml`** workflow that performs the steps above and runs **`npm run test-cypress-headless`**. 
- ---- - -## More documentation - -| Doc | Purpose | -| ---------------------------------------------------------- | ------------------------------------------------- | -| **[`ci-scripts/README.md`](ci-scripts/README.md)** | Secrets, workflows, troubleshooting, cost control | -| **[`ci-scripts/arc/README.md`](ci-scripts/arc/README.md)** | ARC install order, env vars, OpenShift notes | - ---- - -## Gaps - -### Runner RBAC is overly broad - -The ARC runner pods need to run docker-in-docker (dind) and interact with cluster resources: they create/delete test namespaces, manage secrets and PVCs, and drive KubeVirt resources via Cypress. All of this is currently covered by a single `ClusterRole` (`arc-runner-ci`) bound cluster-wide via `ClusterRoleBinding`. - -The problem is that the `ClusterRole` grants full CRUD plus `deletecollection` on `namespaces` and `secrets` across the entire cluster. A compromised or malicious workflow running in the ARC runner pod could exfiltrate every secret on the cluster or tear down arbitrary namespaces—cluster-admin blast radius. - -The core constraint is that the `poc-e2e-ci-test2.yml` workflow uses a unique namespace per run (`kubevirt-plugin-ci-test-`) and the ARC runner is the one that creates it (`oc create namespace`), injects a secret into it, and deletes it at the end. Namespace create/delete and secret write access are load-bearing for every run, so any RBAC improvement must account for them. - -Three options to address this, in order of increasing workflow restructuring required: - -**Option 1 — Drop `deletecollection` only (minimal, lowest effort)** - -The single most dangerous verb is `deletecollection` — it allows bulk-wiping all resources of a given type in a single API call. Removing it from the verbs list doesn't break any workflow step and immediately reduces the blast radius without touching the namespace or secret permissions that the runner needs. 
- -The cluster-wide write access to `namespaces` and `secrets` remains, so this is a partial improvement only. - -**Option 2 — Add an admission policy layer** - -Keep the RBAC structure as-is but deploy an OPA Gatekeeper or Kyverno policy that restricts the runner `ServiceAccount` to: - -- Creating namespaces whose name matches `kubevirt-plugin-ci-test-*` only. -- Writing secrets only within namespaces that match that same pattern. - -This limits the blast radius at the admission layer rather than at the RBAC layer, without requiring any workflow changes. It does require Gatekeeper or Kyverno to be installed and maintained on the hot cluster. - -**Option 3 — Split namespace provisioning into a separate standard-runner job (most robust)** - -Restructure the `poc-e2e-ci-test2.yml` workflow by extracting the "Setup required namespaced resources" step into a dedicated job that runs on a standard GitHub-hosted runner (`ubuntu-latest`) using a kubeconfig with elevated rights: - -1. **Provisioning job** (standard runner) — Creates `kubevirt-plugin-ci-test-`, injects the CI secret, and applies any other pre-test cluster resources. This job holds the elevated permissions and is short-lived. -2. **Test-execution job** (ARC runner, `runs-on: kubevirt-plugin-ci`) — Receives the pre-created namespace name as a job input. Its `ClusterRole` no longer needs `namespaces` write verbs or cluster-wide `secrets` write access; it is replaced with a namespaced `Role`/`RoleBinding` bound to the test namespace, plus a minimal read-only `ClusterRole` for cluster-info queries (nodes, console URL, cluster version). - -This follows least-privilege most closely and is the recommended end-state before production use. It requires workflow restructuring and is beyond the scope of the current POC. - -### ARC runner Dockerfile - -Noted by @coderabbitai - -- **Pin the runner base image**: In `ci-scripts/arc/runner-image/Dockerfile`, the base image is using the `:latest` tag. 
It would be more stable and predictable if the the version is pined to a sha or a versioned tag. - -- **Harden the binary downloads**: Implement checksum verification for the unconditional downloads (yq at line 38–40) and for the fallback download paths (kubectl line 75–76, oc line 88–89, virtctl line 100–101). Conditional downloads from environment variables (OC_URL, VIRTCTL_URL) may use console URLs that lack published checksums; document this trade-off or require verification for those paths as well. - -### DIND Mirror - -Noted by @coderabbitai - -The default `docker.io/library/docker:dind` uses a floating tag that advances with Docker releases. While the script allows overriding via `DIND_SOURCE_IMAGE` environment variable, the default floating tag means different CI runs—weeks or months apart—could pull and mirror different dind versions underneath identical source code. Given the repo's emphasis on aligned and pinned versions for reproducibility, the dind default should either be a specific version (e.g., `docker:26.0` or a sha256 digest) or the docs should explicitly document that `DIND_SOURCE_IMAGE=docker.io/library/docker:` must be set in CI to achieve reproducible runner pods. - -### FIPS-enabled cluster support - -The upstream GitHub Actions runner image (`ghcr.io/actions/actions-runner:latest`) is Ubuntu 22.04-based. On FIPS-enabled OpenShift clusters, the kernel exposes `/proc/sys/crypto/fips_enabled = 1` to all containers. OpenSSL and the .NET runtime (which powers the runner's `Runner.Listener` binary) detect this flag and attempt to use FIPS-validated cryptographic providers. Since the Ubuntu image lacks the required FIPS provider module (`fips.so`), the runner segfaults during TLS handshake with GitHub — and `run.sh` masks the crash as exit code 0, making the failure invisible. - -The current workaround sets `OPENSSL_FORCE_FIPS_MODE=0` in the runner container environment (see `arc-runner-scale-set.pod.yaml`). 
This tells OpenSSL to ignore the kernel's FIPS flag. It is sufficient for CI runners that do not need to perform FIPS-validated cryptographic operations themselves. - -The proper long-term solution is to rebase the custom runner image onto a FIPS-compatible base such as `registry.access.redhat.com/ubi9/ubi` (or `ubi9/ubi-minimal`). This would involve: - -1. Starting from UBI instead of `ghcr.io/actions/actions-runner:latest`. -2. Installing the .NET runtime (the runner requires .NET 8+). -3. Downloading and extracting the GitHub Actions runner binaries from the [runner releases](https://github.com/actions/runner/releases). -4. Installing the same additional tooling the current Dockerfile adds (node, jq, oc, virtctl, cypress dependencies, etc.). - -A UBI-based image carries FIPS-validated OpenSSL and crypto providers out of the box, so the runner's .NET TLS stack works correctly without any environment variable overrides. This also avoids the build-time `curl`/OpenSSL issues documented in the current Dockerfile (the `wget2`/GnuTLS workaround for FIPS DSO errors during `oc start-build`). - -References: - -- [actions/runner#4197](https://github.com/actions/runner/issues/4197) — Segfault on FIPS-enabled hosts -- [dotnet/dotnet-docker#5849](https://github.com/dotnet/dotnet-docker/issues/5849) — .NET crypto fails in containers on FIPS kernels - -### If adopted, hardening of the ROKS cluster handling, and cluster health checks are needed - -The workflows and scripts all function, but they should receive additional scrutiny before being adopted for real scenarios. 
diff --git a/ci-scripts/README.md b/ci-scripts/README.md index 9f1868a53a..995d1c2847 100644 --- a/ci-scripts/README.md +++ b/ci-scripts/README.md @@ -1,5 +1,9 @@ # Hot Cluster CI +> **Continuation guide (CNV-74265):** [docs/HOT_CLUSTER_CI_CONTINUATION.md](../docs/HOT_CLUSTER_CI_CONTINUATION.md) +> **Future work backlog:** [docs/HOT_CLUSTER_FUTURE_WORK.md](../docs/HOT_CLUSTER_FUTURE_WORK.md) +> **Cluster lifecycle:** [docs/CLUSTER_LIFECYCLE.md](../docs/CLUSTER_LIFECYCLE.md) + This directory contains scripts and documentation for the **IBM Cloud hot cluster** CI stack: an OpenShift (ROKS) cluster used for KubeVirt plugin integration testing, with **Hyperconverged Cluster Operator (HCO)** and **GitHub Actions Runner Controller (ARC)** so jobs can run on cluster-adjacent self-hosted runners (`kubevirt-plugin-ci`). Workers can be **bare metal** (real KVM) or **VPC / shared** flavors with **KVM emulation**; the setup workflow defaults favor VPC-style flavors and `kvm_emulation: true` unless you change inputs. @@ -10,7 +14,7 @@ Workers can be **bare metal** (real KVM) or **VPC / shared** flavors with **KVM | -------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | **Real KubeVirt / OpenShift behavior** | Tests run against a live cluster with HCO, virt stack, and storage—not mocks. | | **Console + plugin fidelity** | Two POC paths: hit the **in-cluster** console URL, or run an **off-cluster** console container with the plugin served like the operator (TLS + nginx), matching how developers run bridge locally. | -| **Long-running / privileged CI** | GitHub-hosted runners are a poor fit for nested virt, heavy Cypress, and Docker-heavy flows; **ARC** on the cluster provides dind-capable runners with `oc` RBAC. 
| +| **Long-running / privileged CI** | GitHub-hosted runners are a poor fit for nested virt, heavy Playwright, and Docker-heavy flows; **ARC** on the cluster provides dind-capable runners with `oc` RBAC. | | **Cost control** | Bare metal and large workers are expensive; **auto-teardown** after idle time limits runaway spend. | ## Architecture @@ -25,23 +29,21 @@ GitHub Actions └── ibmc-cluster-auto-teardown.yml → "IBM Cloud Hot Cluster Auto-Teardown" (cron + dispatch → teardown workflow) ``` -**POC E2E (two variants)** +**Hot cluster E2E** ``` -poc-e2e-ci-test.yml — "POC Hot Cluster E2E CI Test" +hot-cluster-e2e.yml — "Hot Cluster E2E" (PR + manual dispatch) ├── cluster-health-check (ubuntu-latest + IBM Cloud → kubeconfig) │ └── ci-scripts/check-cluster-health.sh - └── run-e2e-tests (workflow_call → poc-e2e-ci-test2.yml) + └── run-e2e-tests (workflow_call → hot-cluster-e2e-run.yml) -poc-e2e-ci-test2.yml — "POC Hot Cluster E2E CI Test 2" - ├── check-runner (optional diagnostics on ARC runner) - ├── build-kubevirt-plugin-image (ubuntu-latest, Docker; may skip if image exists in registry) +hot-cluster-e2e-run.yml — "Hot Cluster E2E Run" + ├── check-runner (diagnostics on ARC runner) + ├── build-kubevirt-plugin-image (ubuntu-latest; podman build + push) └── run-gating-tests (runs-on: kubevirt-plugin-ci) - ├── ci-scripts/resolve-console-image.sh → CONSOLE_IMAGE matches cluster OCP x.y - ├── ci-scripts/start-plugin-container.sh → plugin over HTTPS :9001 (dind/docker) - ├── ci-scripts/start-console.sh → origin-console container, off-cluster mode - ├── BRIDGE_BASE_ADDRESS=http://localhost:9000 - └── Cypress against local bridge + plugin proxy + ├── ci-env-request → ci-env-controller → ci-test-stack (console + plugin) + ├── BRIDGE_BASE_ADDRESS from test stack + └── Playwright gating (or features project) ``` ## Required GitHub Secrets @@ -50,9 +52,9 @@ These secrets must be configured in the repository settings before running the w ### IBM Cloud -| Secret | Description | 
How to Obtain | -| ------------------- | --------------------- | ------------------------------------------------------------- | -| `IBM_CLOUD_API_KEY` | IBM Cloud IAM API key | IBM Cloud Console → Manage → Access (IAM) → API keys → Create | +| Secret | Description | How to Obtain | +| -------- | --------------------- | ------------------------------- | +| `IC_KEY` | IBM Cloud IAM API key | Repository/org secret (Actions) | The API key must belong to a user or service ID with the following IAM permissions: @@ -97,7 +99,7 @@ All workflows that need cluster access use the IBM Cloud CLI to pull a kubeconfi - name: Setup IBM Cloud CLI uses: IBM/actions-ibmcloud-cli@v1 with: - api_key: ${{ secrets.IBM_CLOUD_API_KEY }} + api_key: ${{ secrets.IC_KEY }} plugins: kubernetes-service - name: Configure kubeconfig @@ -106,7 +108,7 @@ All workflows that need cluster access use the IBM Cloud CLI to pull a kubeconfi oc cluster-info ``` -This avoids storing kubeconfig or credentials as GitHub secrets. Any workflow or job that needs `oc`/`kubectl` access simply repeats these two steps with the shared `IBM_CLOUD_API_KEY`. +This avoids storing kubeconfig or credentials as GitHub secrets. Any workflow or job that needs `oc`/`kubectl` access simply repeats these two steps with the shared `IC_KEY` secret. 
## Creating a GitHub App for ARC @@ -190,8 +192,8 @@ To turn off dind (no Docker daemon in the pod): `export CONTAINER_MODE=none` and | `.github/workflows/ibmc-cluster-setup.yml` | IBM Cloud Hot Cluster Setup | | `.github/workflows/ibmc-cluster-teardown.yml` | IBM Cloud Hot Cluster Teardown | | `.github/workflows/ibmc-cluster-auto-teardown.yml` | IBM Cloud Hot Cluster Auto-Teardown | -| `.github/workflows/poc-e2e-ci-test.yml` | POC Hot Cluster E2E CI Test | -| `.github/workflows/poc-e2e-ci-test2.yml` | POC Hot Cluster E2E CI Test 2 | +| `.github/workflows/hot-cluster-e2e.yml` | Hot Cluster E2E | +| `.github/workflows/hot-cluster-e2e-run.yml` | Hot Cluster E2E Run | ### Setting up the hot cluster @@ -201,28 +203,20 @@ To turn off dind (no Docker daemon in the pod): `export CONTAINER_MODE=none` and **Implementation notes:** Provisioning uses `ibmcloud oc cluster create classic` (not VPC workers in this workflow). Setup installs `oc` from the cluster downloads endpoint, runs `install-hco.sh`, then `arc/setup-dind-mirror.sh`, `arc/setup-runner-image.sh`, `install-arc-controller.sh`, and `install-runner-scale-set.sh`. -### Running POC E2E tests - -**Variant A — `poc-e2e-ci-test.yml` (IBM Cloud cluster health checks then run `poc-e2e-ci-test2.yml`)** - -1. Actions → **POC Hot Cluster E2E CI Test** -2. Inputs: Cypress spec (default `tests/gating.cy.ts`), cluster name -3. Runs `check-cluster-health.sh` on `ubuntu-latest` with an IBM Cloud kubeconfig; fails fast if the cluster is unhealthy -4. On success, calls `poc-e2e-ci-test2.yml` via `workflow_call` to run the tests +### Running hot cluster E2E tests -**Variant B — `poc-e2e-ci-test2.yml` (off-cluster console + plugin containers)** +1. Actions → **Hot Cluster E2E** (PR trigger or manual dispatch) +2. Inputs: Playwright project (`gating` or `features`), cluster name (default `kubevirt-plugin-ci`) +3. Health check on `ubuntu-latest`; on success calls **Hot Cluster E2E Run** +4. 
Run workflow provisions a `ci-test-stack`, runs Playwright, uploads artifacts, releases the stack -1. Actions → **POC Hot Cluster E2E CI Test 2** -2. Default spec: `tests/poc-gating.cy.ts` (narrower gating bundle than full `gating.cy.ts`) -3. Build job pushes/pulls a **plugin image** from a registry (see **POC debt** below). -4. The test job creates only the **test namespace + dummy secret** (not full `test-setup.sh`); modal handling and other prep lean on Cypress `beforeSpec` / shared helpers. -5. Test job starts **plugin** then **console** via `ci-scripts/`, then Cypress with `BRIDGE_BASE_ADDRESS=http://localhost:9000` +To run only the test jobs (cluster already verified): dispatch **Hot Cluster E2E Run** directly. ### Tearing down the cluster **Manual:** Actions → **IBM Cloud Hot Cluster Teardown** -**Automatic:** **IBM Cloud Hot Cluster Auto-Teardown** runs on a schedule (`*/30 * * * *`), uses `GITHUB_TOKEN` with `actions: write` to dispatch **IBM Cloud Hot Cluster Teardown** when idle thresholds are met. Idle detection monitors only the two E2E test workflows (`poc-e2e-ci-test.yml` and `poc-e2e-ci-test2.yml`) for in-progress, queued, or recently completed runs (fallback: cluster creation time). +**Automatic:** Idle detection monitors `hot-cluster-e2e.yml` and `hot-cluster-e2e-run.yml` for in-progress, queued, or recently completed runs. **Teardown implementation:** Uninstalls Helm releases `kubevirt-plugin-ci` (scale set) and `arc` (controller) when possible, deletes the ROKS cluster, then optionally removes offline GitHub runners labeled `kubevirt-plugin-ci` using `BOT_PAT`. 
@@ -279,14 +273,16 @@ Key defaults: - `ARC_SCALE_SET_LABELS` (optional multilabel; requires matching `runs-on` array in workflows) - Additional scale sets: run only **`ci-scripts/arc/install-runner-scale-set.sh`** (skip **`ci-scripts/arc/install-arc-controller.sh`**) -## POC: immediate next steps (toward stable green runs) +## Follow-up work + +See [docs/HOT_CLUSTER_FUTURE_WORK.md](../docs/HOT_CLUSTER_FUTURE_WORK.md) for RBAC hardening, FIPS, ci-env-controller setup gap, and workflow hygiene items. + +Quick checklist: -1. **Plugin image supply chain (`poc-e2e-ci-test2.yml`)** — Replace the hard-coded `KUBEVIRT_PLUGIN_IMAGE` (currently a fixed `ttl.sh/...` tag) with a per-run or per-SHA tag (e.g. uncomment the `github.run_id`-style pattern), or build on every run and push to a registry your cluster/runner can pull. Ensure the **skopeo inspect** skip path does not mask a broken or stale image. -2. **Align Cypress coverage with stability** — `tests/poc-gating.cy.ts` is intentionally smaller than full `tests/gating.cy.ts`; expand only after the off-cluster stack is reliable. Fix flaky specs (VM start/status waits, tab navigation) using the same patterns as local CI. -3. **Run variant A first for signal** — Use `poc-e2e-ci-test.yml` against a healthy cluster to separate **cluster/HCO** issues from **docker/console/plugin** issues in test2. -4. **Fork / ARC** — Variant A (`poc-e2e-ci-test.yml`) runs the health check on `ubuntu-latest` and is fork-safe; variant B (`poc-e2e-ci-test2.yml`) still requires a runner labeled `kubevirt-plugin-ci` and cannot run on forks without ARC registered. -5. **Workflow hygiene** — Add dependency caching to `poc-e2e-ci-test2.yml` (open TODO: use `actions/setup-node` with caching or an explicit cache step). Consider pinning `actions/checkout` major versions consistently across workflows. -6. 
**Verify auto-teardown** — Confirm scheduled **IBM Cloud Hot Cluster Auto-Teardown** successfully dispatches **IBM Cloud Hot Cluster Teardown** (`workflow_id` must match `ibmc-cluster-teardown.yml`). +1. **Health check first** — Run **Hot Cluster E2E** (or health-check job only) to isolate cluster/HCO issues from test-stack issues. +2. **ci-env-controller** — Install once on the cluster if not already present (`./dev/ci-env.sh`). +3. **ARC on org repo** — Runners must register to `kubevirt-ui/kubevirt-plugin`, not a fork. +4. **Auto-teardown** — Confirm idle detection watches `hot-cluster-e2e.yml` and `hot-cluster-e2e-run.yml`. ## Production and hardening review (before treating POC patterns as prod) diff --git a/ci-scripts/arc/arc-runner-scale-set.pod.yaml b/ci-scripts/arc/arc-runner-scale-set.pod.yaml index c27f8109ad..fb36733e1d 100644 --- a/ci-scripts/arc/arc-runner-scale-set.pod.yaml +++ b/ci-scripts/arc/arc-runner-scale-set.pod.yaml @@ -24,7 +24,7 @@ template: command: - '/home/runner/run.sh' env: - # Workaround for FIPS-enabled clusters (see POC_HOT_CLUSTER_CI.md). + # Workaround for FIPS-enabled clusters (see docs/HOT_CLUSTER_FUTURE_WORK.md). - name: OPENSSL_FORCE_FIPS_MODE value: '0' - name: GOLANG_FIPS diff --git a/ci-scripts/images/arc-runner/Dockerfile b/ci-scripts/images/arc-runner/Dockerfile index 5162f2cc9f..e79a71a566 100644 --- a/ci-scripts/images/arc-runner/Dockerfile +++ b/ci-scripts/images/arc-runner/Dockerfile @@ -1,6 +1,6 @@ # Custom GitHub Actions runner image for kubevirt-plugin-ci CI. # Extends the official runner image with cli tools used in the CI pipeline (such as jq, curl, -# envsubst, Node.js, oc, and virtctl) and support for running Cypress tests. +# envsubst, Node.js, oc, and virtctl) and system dependencies for Playwright (Chromium). 
# # https://github.com/actions/runner/blob/main/images/Dockerfile: the base image @@ -42,12 +42,7 @@ RUN apt-get update \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* -# Cypress 14+ headless (bundled Electron) — Linux deps for Ubuntu 24.04+ (noble). -# Official runner image is noble (see actions/runner images/Dockerfile: mcr.microsoft.com/dotnet/runtime-deps:8.0-noble). -# Matches Cypress docs: https://docs.cypress.io/guides/getting-started/installing-cypress#Linux — Ubuntu >=24.04 list. -# GitHub-hosted ubuntu-latest (runner-images Ubuntu2404) also includes xvfb, Chrome/Firefox, and extra fonts; for ARC we -# install the minimum Electron stack plus common fonts so screenshots/video match hosted runners more closely. -# unzip is already present in the upstream actions-runner image (needed for fast Cypress binary unpack). +# Playwright / Chromium headless — Linux deps for Ubuntu 24.04+ (noble). RUN apt-get update \ && apt-get install -y --no-install-recommends \ fontconfig \ diff --git a/ci-scripts/install-hco.sh b/ci-scripts/install-hco.sh index d20e122129..8642690617 100755 --- a/ci-scripts/install-hco.sh +++ b/ci-scripts/install-hco.sh @@ -85,8 +85,21 @@ spec: value: "${KVM_EMULATION}" EOF -echo "Waiting 90s for install plan and pods to be created..." -sleep 90 +echo "Waiting for Subscription to have an InstallPlan..." +for i in $(seq 1 60); do + INSTALL_PLAN="$(oc get subscription hco-operatorhub -n kubevirt-hyperconverged \ + -o jsonpath='{.status.installPlanRef.name}' 2>/dev/null || true)" + if [[ -n "${INSTALL_PLAN}" ]]; then + echo "InstallPlan found: ${INSTALL_PLAN}" + break + fi + if [[ "${i}" -eq 60 ]]; then + echo "ERROR: Timed out waiting for HCO InstallPlan" + exit 1 + fi + echo "Waiting for InstallPlan... (${i}/60)" + sleep 5 +done # --- Wait for HCO deployments --- echo "Waiting for HCO deployments to become available..." 
@@ -125,13 +138,11 @@ oc wait -n kubevirt-hyperconverged hyperconverged kubevirt-hyperconverged \ if [[ "${SKIP_HPP}" != "true" ]]; then echo "Installing HostPath Provisioner..." - oc create -f \ - "https://raw.githubusercontent.com/kubevirt/hostpath-provisioner-operator/${HPP_VERSION}/deploy/hostpathprovisioner_cr.yaml" \ - || echo "HPP CR may already exist, continuing..." + oc apply -f \ + "https://raw.githubusercontent.com/kubevirt/hostpath-provisioner-operator/${HPP_VERSION}/deploy/hostpathprovisioner_cr.yaml" - oc create -f \ - "https://raw.githubusercontent.com/kubevirt/hostpath-provisioner-operator/${HPP_VERSION}/deploy/storageclass-wffc-csi.yaml" \ - || echo "HPP StorageClass may already exist, continuing..." + oc apply -f \ + "https://raw.githubusercontent.com/kubevirt/hostpath-provisioner-operator/${HPP_VERSION}/deploy/storageclass-wffc-csi.yaml" oc annotate storageclasses --all storageclass.kubernetes.io/is-default-class- || true oc annotate storageclass hostpath-csi storageclass.kubernetes.io/is-default-class='true' diff --git a/ci-scripts/start-console.sh b/ci-scripts/start-console.sh index 9c796e57ef..e1473faf82 100755 --- a/ci-scripts/start-console.sh +++ b/ci-scripts/start-console.sh @@ -94,8 +94,8 @@ BRIDGE_PLUGINS="kubevirt-plugin=${PLUGIN_TRANSPORT}://host.docker.internal:${PLU # Resolve CONSOLE_IMAGE from the cluster's OpenShift version when not already set. # Falls back to :latest if resolution fails (e.g. non-OCP cluster, oc not logged in). 
eval "$(bash "${SCRIPT_DIR}/resolve-console-image.sh")" || true -CONSOLE_IMAGE="${CONSOLE_IMAGE:-quay.io/openshift/origin-console:latest}" CONSOLE_PORT=${CONSOLE_PORT:-9000} +: "${CONSOLE_IMAGE:?CONSOLE_IMAGE must be set or resolved before starting the console}" # --------------------------------------------------------------------------- # Job summary (no secrets) diff --git a/ci-scripts/test-cleanup.sh b/ci-scripts/test-cleanup.sh index 18d55193f4..47106eff92 100755 --- a/ci-scripts/test-cleanup.sh +++ b/ci-scripts/test-cleanup.sh @@ -2,7 +2,7 @@ # # A copy of `../test-cleanup.sh` to use the correct namespace from the e2e run, and only delete resources in that namespace. # -export TEST_NS=${CYPRESS_TEST_NS:-'auto-test-ns'} +export TEST_NS=${TEST_NS:-${CYPRESS_TEST_NS:-'auto-test-ns'}} oc -n ${TEST_NS} delete vm --all --wait=false || true oc -n ${TEST_NS} delete template --all --wait=false || true From 488c0a9b62a60f6a7a350fa6dc930cd314374a82 Mon Sep 17 00:00:00 2001 From: Gal Kremer Date: Wed, 17 Jun 2026 16:47:14 -0400 Subject: [PATCH 14/42] Fix IC_KEY config --- .github/workflows/hot-cluster-e2e.yml | 15 +++++++++++++++ .github/workflows/ibmc-cluster-auto-teardown.yml | 2 ++ .github/workflows/ibmc-cluster-setup.yml | 2 ++ .github/workflows/ibmc-cluster-teardown.yml | 2 ++ 4 files changed, 21 insertions(+) diff --git a/.github/workflows/hot-cluster-e2e.yml b/.github/workflows/hot-cluster-e2e.yml index c86a4c3cde..5141897ff1 100644 --- a/.github/workflows/hot-cluster-e2e.yml +++ b/.github/workflows/hot-cluster-e2e.yml @@ -31,8 +31,21 @@ env: CLUSTER_NAME: ${{ inputs.cluster_name || 'kubevirt-plugin-ci' }} jobs: + # Fork PRs do not receive org secrets (IC_KEY). Same-repo PRs and workflow_dispatch can log in. 
+ fork-pr-notice: + name: Fork PR — secrets unavailable + if: github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name != github.repository + runs-on: ubuntu-latest + steps: + - name: Explain why Hot Cluster E2E cannot run + run: | + echo "::error::Hot Cluster E2E needs the IC_KEY secret, which GitHub does not pass to pull_request workflows from forks." + echo "Push your branch to kubevirt-ui/kubevirt-plugin and open a same-repo PR, or run this workflow manually (workflow_dispatch) from an upstream branch." + exit 1 + cluster-health-check: name: Cluster Health Check + if: github.event_name == 'workflow_dispatch' || github.event.pull_request.head.repo.full_name == github.repository runs-on: ubuntu-latest timeout-minutes: 10 steps: @@ -43,6 +56,8 @@ jobs: uses: IBM/actions-ibmcloud-cli@v1 with: api_key: ${{ secrets.IC_KEY }} + region: eu-de + group: cnv-ui plugins: kubernetes-service - name: Install oc client diff --git a/.github/workflows/ibmc-cluster-auto-teardown.yml b/.github/workflows/ibmc-cluster-auto-teardown.yml index 0e1d6b3c78..88f7e9b39e 100644 --- a/.github/workflows/ibmc-cluster-auto-teardown.yml +++ b/.github/workflows/ibmc-cluster-auto-teardown.yml @@ -87,6 +87,8 @@ jobs: if: steps.check_ci.outputs.active_jobs == 'false' with: api_key: ${{ secrets.IC_KEY }} + region: eu-de + group: cnv-ui plugins: kubernetes-service - name: Check idle threshold diff --git a/.github/workflows/ibmc-cluster-setup.yml b/.github/workflows/ibmc-cluster-setup.yml index b6e193b5ee..07854afa94 100644 --- a/.github/workflows/ibmc-cluster-setup.yml +++ b/.github/workflows/ibmc-cluster-setup.yml @@ -53,6 +53,8 @@ jobs: uses: IBM/actions-ibmcloud-cli@v1 with: api_key: ${{ secrets.IC_KEY }} + region: eu-de + group: cnv-ui plugins: kubernetes-service, container-registry - name: Check for existing cluster diff --git a/.github/workflows/ibmc-cluster-teardown.yml b/.github/workflows/ibmc-cluster-teardown.yml index e0b5135f44..1efca5f32e 100644 --- 
a/.github/workflows/ibmc-cluster-teardown.yml +++ b/.github/workflows/ibmc-cluster-teardown.yml @@ -37,6 +37,8 @@ jobs: uses: IBM/actions-ibmcloud-cli@v1 with: api_key: ${{ secrets.IC_KEY }} + region: eu-de + group: cnv-ui plugins: kubernetes-service - name: Check cluster exists From 136ac4b800684f146eda3d420cac5b5e90a2f2a0 Mon Sep 17 00:00:00 2001 From: Gal Kremer Date: Tue, 23 Jun 2026 16:39:10 -0400 Subject: [PATCH 15/42] ci(hot-cluster): fix setup workflow script paths and install ci-env-controller Update ibmc-cluster-setup.yml to use ci-scripts/images/setup-arc-runner-image.sh after the runner image scripts were moved, drop the removed dind mirror step, and install ci-env-controller after ARC so hot-cluster E2E can provision stacks. Co-authored-by: Cursor --- .github/workflows/ibmc-cluster-setup.yml | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ibmc-cluster-setup.yml b/.github/workflows/ibmc-cluster-setup.yml index 07854afa94..ccee34dbe2 100644 --- a/.github/workflows/ibmc-cluster-setup.yml +++ b/.github/workflows/ibmc-cluster-setup.yml @@ -173,13 +173,12 @@ jobs: fi echo "ARC secrets are present." 
- - name: Setup ARC runner image and dind mirror + - name: Build ARC runner image id: build_runner env: OC_VERSION: '4.20' run: | - ./ci-scripts/arc/setup-dind-mirror.sh - IMAGE_REF=$(./ci-scripts/arc/setup-runner-image.sh | grep '^IMAGE_REF=' | cut -d= -f2-) + IMAGE_REF=$(./ci-scripts/images/setup-arc-runner-image.sh | grep '^IMAGE_REF=' | cut -d= -f2-) echo "image_ref=${IMAGE_REF}" >> "$GITHUB_OUTPUT" - name: Install ARC @@ -196,6 +195,10 @@ jobs: ./ci-scripts/arc/install-arc-controller.sh ./ci-scripts/arc/install-runner-scale-set.sh + - name: Install CI environment controller + run: | + ./ci-scripts/ci-env/install-ci-env-controller.sh + - name: Verify cluster health env: GITHUB_REPOSITORY: ${{ github.repository }} From 5a6ef992eb14eeda1a67fd05825ce7abc9046a0c Mon Sep 17 00:00:00 2001 From: Gal Kremer Date: Wed, 24 Jun 2026 11:31:40 -0400 Subject: [PATCH 16/42] ci(hot-cluster): log IBM Cloud infra-permissions in setup workflow Run ibmcloud ks infra-permissions get after CLI login so IAM gaps (required vs suggested classic infrastructure permissions) appear in the GitHub Actions step summary when cluster create fails. 
Co-authored-by: Cursor --- .github/workflows/ibmc-cluster-setup.yml | 6 ++ ci-scripts/log-ibmcloud-iam-diagnostics.sh | 64 ++++++++++++++++++++++ 2 files changed, 70 insertions(+) create mode 100755 ci-scripts/log-ibmcloud-iam-diagnostics.sh diff --git a/.github/workflows/ibmc-cluster-setup.yml b/.github/workflows/ibmc-cluster-setup.yml index ccee34dbe2..f068b3c41b 100644 --- a/.github/workflows/ibmc-cluster-setup.yml +++ b/.github/workflows/ibmc-cluster-setup.yml @@ -57,6 +57,12 @@ jobs: group: cnv-ui plugins: kubernetes-service, container-registry + - name: Log IBM Cloud IAM diagnostics + continue-on-error: true + env: + IBM_REGION: eu-de + run: bash ./ci-scripts/log-ibmcloud-iam-diagnostics.sh + - name: Check for existing cluster id: check_cluster run: | diff --git a/ci-scripts/log-ibmcloud-iam-diagnostics.sh b/ci-scripts/log-ibmcloud-iam-diagnostics.sh new file mode 100755 index 0000000000..58ff445098 --- /dev/null +++ b/ci-scripts/log-ibmcloud-iam-diagnostics.sh @@ -0,0 +1,64 @@ +#!/bin/bash +# +# Log IBM Cloud identity and classic infrastructure permission gaps for CI debugging. +# Safe to run in GitHub Actions — never fails the job (exit 0). 
+# +# Optional env: +# IBM_REGION Kubernetes Service region for infra-permissions (default: eu-de) + +set -uo pipefail + +IBM_REGION="${IBM_REGION:-eu-de}" + +emit() { + if [[ -n "${GITHUB_STEP_SUMMARY:-}" ]]; then + tee -a "${GITHUB_STEP_SUMMARY}" + else + cat + fi +} + +{ + echo "## IBM Cloud IAM diagnostics" + echo "" + echo "Region for \`ibmcloud ks infra-permissions get\`: \`${IBM_REGION}\`" + echo "" + + echo "### Target" + echo '```' + ibmcloud target 2>&1 || echo "(ibmcloud target failed)" + echo '```' + echo "" + + echo "### Account" + echo '```' + ibmcloud account show 2>&1 || echo "(ibmcloud account show failed)" + echo '```' + echo "" + + echo "### Cluster list probe" + echo '```' + ibmcloud oc cluster ls 2>&1 || echo "(ibmcloud oc cluster ls failed)" + echo '```' + echo "" + + echo "### Classic infrastructure permissions" + echo "" + echo "Output of \`ibmcloud ks infra-permissions get --region ${IBM_REGION}\`:" + echo "" + echo '```' + if ibmcloud ks infra-permissions get --region "${IBM_REGION}" -q 2>&1; then + echo "(no missing required/suggested permissions reported)" + else + echo "(command failed — see output above)" + fi + echo '```' + echo "" + + echo "### Classic infrastructure permissions (JSON)" + echo '```json' + ibmcloud ks infra-permissions get --region "${IBM_REGION}" --output json -q 2>&1 || echo "{}" + echo '```' +} | emit + +exit 0 From 8778a06114dacd8972d678872489982af6c41df0 Mon Sep 17 00:00:00 2001 From: Gal Kremer Date: Wed, 24 Jun 2026 11:33:11 -0400 Subject: [PATCH 17/42] ci(hot-cluster): map worker zone to infra-permissions region fra02 -> eu-de, wdc04 -> us-south, etc. so ks infra-permissions get uses the KS region that matches the classic datacenter, not only eu-de. 
Co-authored-by: Cursor --- .github/workflows/ibmc-cluster-setup.yml | 2 +- ci-scripts/log-ibmcloud-iam-diagnostics.sh | 66 ++++++++++++++++------ 2 files changed, 49 insertions(+), 19 deletions(-) diff --git a/.github/workflows/ibmc-cluster-setup.yml b/.github/workflows/ibmc-cluster-setup.yml index f068b3c41b..0c57fc462c 100644 --- a/.github/workflows/ibmc-cluster-setup.yml +++ b/.github/workflows/ibmc-cluster-setup.yml @@ -60,7 +60,7 @@ jobs: - name: Log IBM Cloud IAM diagnostics continue-on-error: true env: - IBM_REGION: eu-de + WORKER_ZONE: ${{ inputs.zone }} run: bash ./ci-scripts/log-ibmcloud-iam-diagnostics.sh - name: Check for existing cluster diff --git a/ci-scripts/log-ibmcloud-iam-diagnostics.sh b/ci-scripts/log-ibmcloud-iam-diagnostics.sh index 58ff445098..ccffaf0281 100755 --- a/ci-scripts/log-ibmcloud-iam-diagnostics.sh +++ b/ci-scripts/log-ibmcloud-iam-diagnostics.sh @@ -4,11 +4,31 @@ # Safe to run in GitHub Actions — never fails the job (exit 0). # # Optional env: -# IBM_REGION Kubernetes Service region for infra-permissions (default: eu-de) +# WORKER_ZONE Classic datacenter for cluster workers (e.g. fra02, wdc04) +# IBM_REGION Override KS region for infra-permissions (skips zone mapping) set -uo pipefail -IBM_REGION="${IBM_REGION:-eu-de}" +# Map classic datacenter zone -> Kubernetes Service region for infra-permissions. 
+# See: ibmcloud ks infra-permissions get --region (jp-osa, jp-tok, au-syd, eu-de, eu-gb, us-east, us-south) +map_worker_zone_to_infra_region() { + local zone="${1:?zone required}" + case "${zone}" in + fra*|ams*|lon*|par*|sng*) echo "eu-de" ;; + wdc*|dal*|sjc*|tor*|mon*|che*|sao*) echo "us-south" ;; + tok*|osa*) echo "jp-tok" ;; + syd*) echo "au-syd" ;; + *) echo "us-south" ;; + esac +} + +if [[ -n "${IBM_REGION:-}" ]]; then + INFRA_REGIONS=("${IBM_REGION}") +elif [[ -n "${WORKER_ZONE:-}" ]]; then + INFRA_REGIONS=("$(map_worker_zone_to_infra_region "${WORKER_ZONE}")") +else + INFRA_REGIONS=("eu-de") +fi emit() { if [[ -n "${GITHUB_STEP_SUMMARY:-}" ]]; then @@ -18,10 +38,32 @@ emit() { fi } +run_infra_permissions() { + local region="$1" + echo "#### \`ibmcloud ks infra-permissions get --region ${region}\`" + echo "" + echo '```' + ibmcloud ks infra-permissions get --region "${region}" -q 2>&1 || true + echo '```' + echo "" + echo "<details><summary>JSON (${region})</summary>"
+ echo "" + echo '```json' + ibmcloud ks infra-permissions get --region "${region}" --output json -q 2>&1 || echo "{}" + echo '```' + echo "" + echo "</details>"
+ echo "" +} + { echo "## IBM Cloud IAM diagnostics" echo "" - echo "Region for \`ibmcloud ks infra-permissions get\`: \`${IBM_REGION}\`" + if [[ -n "${WORKER_ZONE:-}" ]]; then + echo "Worker zone: \`${WORKER_ZONE}\` → infra-permissions region(s): \`${INFRA_REGIONS[*]}\`" + else + echo "Infra-permissions region(s): \`${INFRA_REGIONS[*]}\`" + fi echo "" echo "### Target" @@ -44,21 +86,9 @@ emit() { echo "### Classic infrastructure permissions" echo "" - echo "Output of \`ibmcloud ks infra-permissions get --region ${IBM_REGION}\`:" - echo "" - echo '```' - if ibmcloud ks infra-permissions get --region "${IBM_REGION}" -q 2>&1; then - echo "(no missing required/suggested permissions reported)" - else - echo "(command failed — see output above)" - fi - echo '```' - echo "" - - echo "### Classic infrastructure permissions (JSON)" - echo '```json' - ibmcloud ks infra-permissions get --region "${IBM_REGION}" --output json -q 2>&1 || echo "{}" - echo '```' + for region in "${INFRA_REGIONS[@]}"; do + run_infra_permissions "${region}" + done } | emit exit 0 From 005efee7ca27977fa65f6679d38597d0eb4a27ff Mon Sep 17 00:00:00 2001 From: Gal Kremer Date: Wed, 24 Jun 2026 11:42:57 -0400 Subject: [PATCH 18/42] ci(hot-cluster): make IAM diagnostics visible in logs and artifacts Use ::group:: for step logs, upload ibmcloud-iam-diagnostics.txt artifact, and point to diagnostics from the setup summary. 
Co-authored-by: Cursor --- .github/workflows/ibmc-cluster-setup.yml | 15 ++++++ ci-scripts/log-ibmcloud-iam-diagnostics.sh | 61 +++++++++++----------- 2 files changed, 46 insertions(+), 30 deletions(-) diff --git a/.github/workflows/ibmc-cluster-setup.yml b/.github/workflows/ibmc-cluster-setup.yml index 0c57fc462c..c1cde8f1a9 100644 --- a/.github/workflows/ibmc-cluster-setup.yml +++ b/.github/workflows/ibmc-cluster-setup.yml @@ -58,11 +58,22 @@ jobs: plugins: kubernetes-service, container-registry - name: Log IBM Cloud IAM diagnostics + id: iam_diagnostics continue-on-error: true env: WORKER_ZONE: ${{ inputs.zone }} run: bash ./ci-scripts/log-ibmcloud-iam-diagnostics.sh + - name: Upload IAM diagnostics log + if: always() && steps.iam_diagnostics.outcome != 'skipped' + continue-on-error: true + uses: actions/upload-artifact@v6 + with: + name: ibmcloud-iam-diagnostics-${{ github.run_id }} + path: ${{ runner.temp }}/ibmcloud-iam-diagnostics.txt + retention-days: 14 + if-no-files-found: warn + - name: Check for existing cluster id: check_cluster run: | @@ -225,6 +236,10 @@ jobs: echo "| Workers | \`${{ inputs.worker_count }}\` |" >> "$GITHUB_STEP_SUMMARY" echo "| KVM Emulation | \`${{ inputs.kvm_emulation }}\` |" >> "$GITHUB_STEP_SUMMARY" echo "" >> "$GITHUB_STEP_SUMMARY" + echo "### IAM diagnostics" >> "$GITHUB_STEP_SUMMARY" + echo "" >> "$GITHUB_STEP_SUMMARY" + echo "If cluster create failed with **E73e6**, expand the **Log IBM Cloud IAM diagnostics** step log, open the run **Summary** tab, or download the \`ibmcloud-iam-diagnostics\` artifact." >> "$GITHUB_STEP_SUMMARY" + echo "" >> "$GITHUB_STEP_SUMMARY" if oc cluster-info &>/dev/null; then echo "Cluster is **healthy** and ready for CI." 
>> "$GITHUB_STEP_SUMMARY" else diff --git a/ci-scripts/log-ibmcloud-iam-diagnostics.sh b/ci-scripts/log-ibmcloud-iam-diagnostics.sh index ccffaf0281..048b0cd0e9 100755 --- a/ci-scripts/log-ibmcloud-iam-diagnostics.sh +++ b/ci-scripts/log-ibmcloud-iam-diagnostics.sh @@ -3,14 +3,20 @@ # Log IBM Cloud identity and classic infrastructure permission gaps for CI debugging. # Safe to run in GitHub Actions — never fails the job (exit 0). # +# Output: +# - Step log (expand "Log IBM Cloud IAM diagnostics" in Actions) +# - Job Summary tab on the workflow run page +# - Artifact ibmcloud-iam-diagnostics.txt (when GITHUB_STEP_SUMMARY is set) +# # Optional env: # WORKER_ZONE Classic datacenter for cluster workers (e.g. fra02, wdc04) # IBM_REGION Override KS region for infra-permissions (skips zone mapping) set -uo pipefail +DIAG_FILE="${RUNNER_TEMP:-/tmp}/ibmcloud-iam-diagnostics.txt" + # Map classic datacenter zone -> Kubernetes Service region for infra-permissions. -# See: ibmcloud ks infra-permissions get --region (jp-osa, jp-tok, au-syd, eu-de, eu-gb, us-east, us-south) map_worker_zone_to_infra_region() { local zone="${1:?zone required}" case "${zone}" in @@ -30,65 +36,60 @@ else INFRA_REGIONS=("eu-de") fi -emit() { - if [[ -n "${GITHUB_STEP_SUMMARY:-}" ]]; then - tee -a "${GITHUB_STEP_SUMMARY}" - else - cat - fi -} - run_infra_permissions() { local region="$1" - echo "#### \`ibmcloud ks infra-permissions get --region ${region}\`" + echo "#### ibmcloud ks infra-permissions get --region ${region}" echo "" echo '```' ibmcloud ks infra-permissions get --region "${region}" -q 2>&1 || true echo '```' echo "" - echo "
JSON (${region})" - echo "" - echo '```json' - ibmcloud ks infra-permissions get --region "${region}" --output json -q 2>&1 || echo "{}" - echo '```' - echo "" - echo "
" - echo "" } -{ +write_diagnostics() { echo "## IBM Cloud IAM diagnostics" echo "" - if [[ -n "${WORKER_ZONE:-}" ]]; then - echo "Worker zone: \`${WORKER_ZONE}\` → infra-permissions region(s): \`${INFRA_REGIONS[*]}\`" - else - echo "Infra-permissions region(s): \`${INFRA_REGIONS[*]}\`" - fi + echo "Worker zone: \`${WORKER_ZONE:-}\`" + echo "Infra-permissions region(s): \`${INFRA_REGIONS[*]}\`" echo "" - echo "### Target" echo '```' ibmcloud target 2>&1 || echo "(ibmcloud target failed)" echo '```' echo "" - echo "### Account" echo '```' ibmcloud account show 2>&1 || echo "(ibmcloud account show failed)" echo '```' echo "" - echo "### Cluster list probe" echo '```' ibmcloud oc cluster ls 2>&1 || echo "(ibmcloud oc cluster ls failed)" echo '```' echo "" - - echo "### Classic infrastructure permissions" + echo "### Classic infrastructure permissions (missing required/suggested)" echo "" for region in "${INFRA_REGIONS[@]}"; do run_infra_permissions "${region}" done -} | emit +} + +echo "::group::IBM Cloud IAM diagnostics" +write_diagnostics | tee "${DIAG_FILE}" +echo "::endgroup::" + +if [[ -n "${GITHUB_STEP_SUMMARY:-}" ]]; then + { + echo "## IBM Cloud IAM diagnostics" + echo "" + echo "Full output is also in the **Log IBM Cloud IAM diagnostics** step log and the \`ibmcloud-iam-diagnostics\` artifact." + echo "" + cat "${DIAG_FILE}" + } >> "${GITHUB_STEP_SUMMARY}" +fi + +echo "" +echo "=== IBM Cloud IAM diagnostics written to ${DIAG_FILE} ===" +echo "=== Also check the workflow run Summary tab (top of the run page) ===" exit 0 From 7569633620398b9412d66dd0329a9e1dc6c47ffc Mon Sep 17 00:00:00 2001 From: Gal Kremer Date: Wed, 24 Jun 2026 12:46:37 -0400 Subject: [PATCH 19/42] ci(hot-cluster): add VPC Gen2 provisioning path alongside classic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add infrastructure_type input (classic/vpc) to ibmc-cluster-setup.yml. Classic path is unchanged and remains the default. 
VPC path provisions VPC, subnet, public gateway, then creates a vpc-gen2 cluster — avoids the classic infrastructure Super User requirement (E73e6). Also updates: - log-ibmcloud-iam-diagnostics.sh: VPC probe when infra_type=vpc - ibmc-cluster-teardown.yml: clean VPC resources after cluster delete - ci-scripts/README.md: document both paths and IAM requirements Co-authored-by: Cursor --- .github/workflows/ibmc-cluster-setup.yml | 139 ++++++++++++++++++-- .github/workflows/ibmc-cluster-teardown.yml | 41 +++++- ci-scripts/README.md | 37 +++++- ci-scripts/log-ibmcloud-iam-diagnostics.sh | 66 ++++++++-- 4 files changed, 253 insertions(+), 30 deletions(-) diff --git a/.github/workflows/ibmc-cluster-setup.yml b/.github/workflows/ibmc-cluster-setup.yml index c1cde8f1a9..f428fe7d0f 100644 --- a/.github/workflows/ibmc-cluster-setup.yml +++ b/.github/workflows/ibmc-cluster-setup.yml @@ -3,15 +3,23 @@ name: IBM Cloud Hot Cluster Setup on: workflow_dispatch: inputs: + infrastructure_type: + description: 'Infrastructure type: classic or vpc' + required: true + default: 'classic' + type: choice + options: + - classic + - vpc cluster_name: description: 'Cluster name' required: true default: 'kubevirt-plugin-ci' type: string zone: - description: 'IBM Cloud classic infrastructure zone (e.g., dal10, wdc04, fra02)' + description: 'Zone (classic: wdc04, fra02; vpc: us-south-1, eu-de-1)' required: true - default: 'wdc04' + default: 'us-south-1' type: string openshift_version: description: 'OpenShift version' @@ -19,9 +27,9 @@ on: default: '4.20_openshift' type: string worker_flavor: - description: 'Worker node flavor (bare metal: mb4c.4x32, mb4c.20x64; vpc: m3c.4x16, m3c.8x64)' + description: 'Worker node flavor (classic bare metal: mb4c.4x32; vpc: bx2.8x32, cx2.4x8)' required: true - default: 'm3c.8x64' + default: 'bx2.8x32' type: string worker_count: description: 'Number of worker nodes (at least 2 so ingress is happy)' @@ -29,16 +37,22 @@ on: default: '2' type: string kvm_emulation: 
- description: 'KVM emulation (true for vpc, false for bare metal)' + description: 'KVM emulation (true for vpc/shared, false for bare metal)' required: true default: true type: boolean + cos_instance_crn: + description: 'COS instance CRN for VPC internal registry (required for vpc, ignored for classic)' + required: false + default: '' + type: string permissions: contents: read env: CLUSTER_NAME: ${{ inputs.cluster_name || 'kubevirt-plugin-ci' }} + INFRASTRUCTURE_TYPE: ${{ inputs.infrastructure_type || 'classic' }} jobs: provision-cluster: @@ -55,13 +69,14 @@ jobs: api_key: ${{ secrets.IC_KEY }} region: eu-de group: cnv-ui - plugins: kubernetes-service, container-registry + plugins: kubernetes-service, container-registry, vpc-infrastructure - name: Log IBM Cloud IAM diagnostics id: iam_diagnostics continue-on-error: true env: WORKER_ZONE: ${{ inputs.zone }} + INFRASTRUCTURE_TYPE: ${{ inputs.infrastructure_type || 'classic' }} run: bash ./ci-scripts/log-ibmcloud-iam-diagnostics.sh - name: Upload IAM diagnostics log @@ -85,8 +100,11 @@ jobs: echo "exists=false" >> "$GITHUB_OUTPUT" fi - - name: Verify zone and flavor - if: steps.check_cluster.outputs.exists == 'false' + # ────────────────────────────────────────────────────────────────────── + # Classic infrastructure path + # ────────────────────────────────────────────────────────────────────── + - name: Verify zone and flavor (classic) + if: steps.check_cluster.outputs.exists == 'false' && inputs.infrastructure_type == 'classic' env: ZONE: ${{ inputs.zone }} FLAVOR: ${{ inputs.worker_flavor }} @@ -127,8 +145,8 @@ jobs: fi echo "Flavor '${FLAVOR}' is available in zone '${ZONE}'" - - name: Create ROKS cluster - if: steps.check_cluster.outputs.exists == 'false' + - name: Create ROKS cluster (classic) + if: steps.check_cluster.outputs.exists == 'false' && inputs.infrastructure_type == 'classic' env: ZONE: ${{ inputs.zone }} run: | @@ -154,6 +172,99 @@ jobs: --private-vlan "${PRIVATE_VLAN}" \ --public-vlan 
"${PUBLIC_VLAN}" + # ────────────────────────────────────────────────────────────────────── + # VPC Gen2 infrastructure path + # ────────────────────────────────────────────────────────────────────── + - name: Provision VPC resources + if: steps.check_cluster.outputs.exists == 'false' && inputs.infrastructure_type == 'vpc' + id: vpc_resources + env: + ZONE: ${{ inputs.zone }} + VPC_NAME: ${{ inputs.cluster_name || 'kubevirt-plugin-ci' }}-vpc + run: | + echo "=== VPC Gen2 provisioning ===" + + # Derive region from zone (e.g. us-south-1 -> us-south) + VPC_REGION="${ZONE%-*}" + echo "VPC region: ${VPC_REGION}, zone: ${ZONE}" + + # Target the VPC region + ibmcloud target -r "${VPC_REGION}" + + # Create or reuse VPC + VPC_ID=$(ibmcloud is vpcs --output json 2>/dev/null | jq -r --arg n "${VPC_NAME}" '.[] | select(.name == $n) | .id // empty') + if [[ -z "${VPC_ID}" ]]; then + echo "Creating VPC '${VPC_NAME}'..." + VPC_ID=$(ibmcloud is vpc-create "${VPC_NAME}" --output json | jq -r '.id') + echo "Created VPC: ${VPC_ID}" + else + echo "Reusing existing VPC '${VPC_NAME}': ${VPC_ID}" + fi + + # Create or reuse subnet + SUBNET_NAME="${VPC_NAME}-subnet-${ZONE}" + SUBNET_ID=$(ibmcloud is subnets --output json 2>/dev/null | jq -r --arg n "${SUBNET_NAME}" '.[] | select(.name == $n) | .id // empty') + if [[ -z "${SUBNET_ID}" ]]; then + echo "Creating subnet '${SUBNET_NAME}' in zone '${ZONE}'..." + SUBNET_ID=$(ibmcloud is subnet-create "${SUBNET_NAME}" "${VPC_ID}" --zone "${ZONE}" --ipv4-address-count 256 --output json | jq -r '.id') + echo "Created subnet: ${SUBNET_ID}" + else + echo "Reusing existing subnet '${SUBNET_NAME}': ${SUBNET_ID}" + fi + + # Create or reuse public gateway (required for console/OperatorHub access) + GW_NAME="${VPC_NAME}-gw-${ZONE}" + GW_ID=$(ibmcloud is public-gateways --output json 2>/dev/null | jq -r --arg n "${GW_NAME}" '.[] | select(.name == $n) | .id // empty') + if [[ -z "${GW_ID}" ]]; then + echo "Creating public gateway '${GW_NAME}'..." 
+ GW_ID=$(ibmcloud is public-gateway-create "${GW_NAME}" "${VPC_ID}" "${ZONE}" --output json | jq -r '.id') + echo "Created public gateway: ${GW_ID}" + else + echo "Reusing existing public gateway '${GW_NAME}': ${GW_ID}" + fi + + # Attach public gateway to subnet (idempotent — no-op if already attached) + echo "Attaching public gateway to subnet..." + ibmcloud is subnet-update "${SUBNET_ID}" --pgw "${GW_ID}" 2>/dev/null || true + + echo "vpc_id=${VPC_ID}" >> "$GITHUB_OUTPUT" + echo "subnet_id=${SUBNET_ID}" >> "$GITHUB_OUTPUT" + echo "vpc_region=${VPC_REGION}" >> "$GITHUB_OUTPUT" + + - name: Create ROKS cluster (vpc) + if: steps.check_cluster.outputs.exists == 'false' && inputs.infrastructure_type == 'vpc' + env: + ZONE: ${{ inputs.zone }} + VPC_ID: ${{ steps.vpc_resources.outputs.vpc_id }} + SUBNET_ID: ${{ steps.vpc_resources.outputs.subnet_id }} + COS_CRN: ${{ inputs.cos_instance_crn }} + run: | + if [[ -z "${COS_CRN}" ]]; then + echo "::error::COS instance CRN is required for VPC clusters (internal registry). Set the 'cos_instance_crn' input." + echo "" + echo "To find your COS instance CRN:" + echo " ibmcloud resource service-instances --service-name cloud-object-storage --long" + echo "" + echo "To create one:" + echo " ibmcloud resource service-instance-create kubevirt-ci-cos cloud-object-storage standard global" + exit 1 + fi + + echo "Creating VPC cluster '${CLUSTER_NAME}' with ${{ inputs.worker_count }}x ${{ inputs.worker_flavor }} workers in zone ${ZONE}..." 
+ ibmcloud oc cluster create vpc-gen2 \ + --name "${CLUSTER_NAME}" \ + --version "${{ inputs.openshift_version }}" \ + --flavor "${{ inputs.worker_flavor }}" \ + --workers "${{ inputs.worker_count }}" \ + --zone "${ZONE}" \ + --vpc-id "${VPC_ID}" \ + --subnet-id "${SUBNET_ID}" \ + --cos-instance "${COS_CRN}" \ + --disable-outbound-traffic-protection + + # ────────────────────────────────────────────────────────────────────── + # Common steps (both classic and VPC converge here) + # ────────────────────────────────────────────────────────────────────── - name: Wait for cluster to be ready to use run: | ./ci-scripts/check-roks-cluster-state.sh @@ -206,7 +317,6 @@ jobs: ARC_APP_PRIVATE_KEY: ${{ secrets.ARC_GITHUB_APP_PRIVATE_KEY }} ARC_PAT: ${{ secrets.ARC_GITHUB_PAT }} ARC_RUNNER_IMAGE: ${{ steps.build_runner.outputs.image_ref }} - # Pin with script default (0.14.0); set to "latest" in the workflow to float OCI tags. ARC_VERSION: '0.14.0' run: | ./ci-scripts/arc/install-arc-controller.sh @@ -229,6 +339,7 @@ jobs: echo "" >> "$GITHUB_STEP_SUMMARY" echo "| Parameter | Value |" >> "$GITHUB_STEP_SUMMARY" echo "|-----------|-------|" >> "$GITHUB_STEP_SUMMARY" + echo "| Infrastructure | \`${{ inputs.infrastructure_type || 'classic' }}\` |" >> "$GITHUB_STEP_SUMMARY" echo "| Cluster | \`${CLUSTER_NAME}\` |" >> "$GITHUB_STEP_SUMMARY" echo "| Zone | \`${{ inputs.zone }}\` |" >> "$GITHUB_STEP_SUMMARY" echo "| OpenShift | \`${{ inputs.openshift_version }}\` |" >> "$GITHUB_STEP_SUMMARY" @@ -238,7 +349,11 @@ jobs: echo "" >> "$GITHUB_STEP_SUMMARY" echo "### IAM diagnostics" >> "$GITHUB_STEP_SUMMARY" echo "" >> "$GITHUB_STEP_SUMMARY" - echo "If cluster create failed with **E73e6**, expand the **Log IBM Cloud IAM diagnostics** step log, open the run **Summary** tab, or download the \`ibmcloud-iam-diagnostics\` artifact." 
>> "$GITHUB_STEP_SUMMARY" + if [[ "${{ inputs.infrastructure_type || 'classic' }}" == "classic" ]]; then + echo "If cluster create failed with **E73e6**, expand the **Log IBM Cloud IAM diagnostics** step log, open the run **Summary** tab, or download the \`ibmcloud-iam-diagnostics\` artifact." >> "$GITHUB_STEP_SUMMARY" + else + echo "If cluster create failed, check VPC Infrastructure permissions in the **Log IBM Cloud IAM diagnostics** step." >> "$GITHUB_STEP_SUMMARY" + fi echo "" >> "$GITHUB_STEP_SUMMARY" if oc cluster-info &>/dev/null; then echo "Cluster is **healthy** and ready for CI." >> "$GITHUB_STEP_SUMMARY" diff --git a/.github/workflows/ibmc-cluster-teardown.yml b/.github/workflows/ibmc-cluster-teardown.yml index 1efca5f32e..037021cff2 100644 --- a/.github/workflows/ibmc-cluster-teardown.yml +++ b/.github/workflows/ibmc-cluster-teardown.yml @@ -95,7 +95,46 @@ jobs: exit 1 fi - # TODO: Followup about how to manually delete "Runner scale sets" from ARC + - name: Clean up VPC resources + if: steps.check_cluster.outputs.exists == 'true' + continue-on-error: true + env: + VPC_NAME: ${{ inputs.cluster_name || 'kubevirt-plugin-ci' }}-vpc + run: | + echo "Checking for VPC resources to clean up..." + ibmcloud plugin install vpc-infrastructure -f 2>/dev/null || true + + VPC_ID=$(ibmcloud is vpcs --output json 2>/dev/null | jq -r --arg n "${VPC_NAME}" '.[] | select(.name == $n) | .id // empty' || true) + if [[ -z "${VPC_ID}" ]]; then + echo "No VPC '${VPC_NAME}' found — cluster was likely classic. Skipping VPC cleanup." + exit 0 + fi + + echo "Found VPC '${VPC_NAME}': ${VPC_ID}" + + echo "Removing public gateways..." + for gw_id in $(ibmcloud is public-gateways --output json 2>/dev/null | jq -r --arg vpc "${VPC_ID}" '.[] | select(.vpc.id == $vpc) | .id' || true); do + echo " Detaching and deleting gateway ${gw_id}..." 
+ for sub_id in $(ibmcloud is subnets --output json 2>/dev/null | jq -r --arg gw "${gw_id}" '.[] | select(.public_gateway.id == $gw) | .id' || true); do + ibmcloud is subnet-update "${sub_id}" --pgw "" 2>/dev/null || true + done + ibmcloud is public-gateway-delete "${gw_id}" -f 2>/dev/null || true + done + + echo "Removing subnets..." + for sub_id in $(ibmcloud is subnets --output json 2>/dev/null | jq -r --arg vpc "${VPC_ID}" '.[] | select(.vpc.id == $vpc) | .id' || true); do + echo " Deleting subnet ${sub_id}..." + ibmcloud is subnet-delete "${sub_id}" -f 2>/dev/null || true + done + + echo "Waiting for subnets to be deleted before removing VPC..." + sleep 30 + + echo "Removing VPC '${VPC_NAME}'..." + ibmcloud is vpc-delete "${VPC_ID}" -f 2>/dev/null || echo "VPC deletion failed (may have remaining resources)" + + echo "VPC cleanup complete." + - name: Clean up ghost runners continue-on-error: true env: diff --git a/ci-scripts/README.md b/ci-scripts/README.md index 995d1c2847..006906f128 100644 --- a/ci-scripts/README.md +++ b/ci-scripts/README.md @@ -58,9 +58,18 @@ These secrets must be configured in the repository settings before running the w The API key must belong to a user or service ID with the following IAM permissions: -- **Kubernetes Service**: Administrator role (to create/delete ROKS clusters) -- **VPC Infrastructure Services**: Editor role (if using VPC-based clusters) -- **Classic Infrastructure**: Super User or equivalent (for bare metal provisioning) +**For VPC clusters** (recommended — simpler IAM): + +- **Kubernetes Service**: Administrator role (platform) +- **VPC Infrastructure Services**: Administrator role +- **Container Registry**: Administrator role +- **Resource group (`cnv-ui`)**: Viewer role + +**For classic clusters:** + +- **Kubernetes Service**: Administrator role (platform) +- **Classic Infrastructure**: Super User (or equivalent per-permission set) +- **Container Registry**: Administrator role ### Ghost Runner Cleanup (optional) @@ 
-198,10 +207,26 @@ To turn off dind (no Docker daemon in the pod): `export CONTAINER_MODE=none` and ### Setting up the hot cluster 1. Actions → **IBM Cloud Hot Cluster Setup** → Run workflow -2. Inputs: cluster name, **classic** zone (e.g. `wdc04`), OpenShift version, worker flavor/count, **KVM emulation** (`true` for VPC-style workers, `false` for bare metal with hardware KVM) -3. Wait for completion (provisioning time depends on flavor; setup includes HCO, dind mirror, custom runner image build, ARC controller + scale set, `check-cluster-health.sh`) +2. Select **infrastructure type**: `classic` or `vpc` (VPC recommended — simpler IAM) +3. Inputs: cluster name, zone, OpenShift version, worker flavor/count, KVM emulation +4. Wait for completion (provisioning time depends on flavor; setup includes HCO, ARC runner image build, ARC controller + scale set, ci-env-controller, `check-cluster-health.sh`) + +#### VPC path (recommended) + +- **Zone format**: `us-south-1`, `us-south-2`, `eu-de-1`, etc. +- **Flavor**: VPC flavors like `bx2.8x32`, `cx2.4x8` (list with `ibmcloud oc flavors --zone --provider vpc-gen2`) +- **COS instance CRN**: Required for internal registry. Find with `ibmcloud resource service-instances --service-name cloud-object-storage --long` +- **IAM**: Only needs **VPC Infrastructure Administrator** — no classic Super User or SoftLayer permissions +- VPC, subnet, and public gateway are auto-created and reused across runs + +#### Classic path + +- **Zone format**: `wdc04`, `fra02`, `dal10`, etc. +- **Flavor**: Classic flavors like `m3c.8x64`, `mb4c.4x32` +- **IAM**: Requires **Classic Infrastructure Super User** (SoftLayer permissions) +- VLANs are auto-discovered or created -**Implementation notes:** Provisioning uses `ibmcloud oc cluster create classic` (not VPC workers in this workflow). 
Setup installs `oc` from the cluster downloads endpoint, runs `install-hco.sh`, then `arc/setup-dind-mirror.sh`, `arc/setup-runner-image.sh`, `install-arc-controller.sh`, and `install-runner-scale-set.sh`. +**Implementation notes:** Both paths converge after cluster creation. Setup installs `oc` from the cluster downloads endpoint, then runs `install-hco.sh`, `setup-arc-runner-image.sh`, `install-arc-controller.sh`, `install-runner-scale-set.sh`, and `install-ci-env-controller.sh`. ### Running hot cluster E2E tests diff --git a/ci-scripts/log-ibmcloud-iam-diagnostics.sh b/ci-scripts/log-ibmcloud-iam-diagnostics.sh index 048b0cd0e9..cfed20be32 100755 --- a/ci-scripts/log-ibmcloud-iam-diagnostics.sh +++ b/ci-scripts/log-ibmcloud-iam-diagnostics.sh @@ -1,6 +1,7 @@ #!/bin/bash # -# Log IBM Cloud identity and classic infrastructure permission gaps for CI debugging. +# Log IBM Cloud identity and infrastructure permission gaps for CI debugging. +# Supports both classic and VPC infrastructure types. # Safe to run in GitHub Actions — never fails the job (exit 0). # # Output: @@ -9,12 +10,14 @@ # - Artifact ibmcloud-iam-diagnostics.txt (when GITHUB_STEP_SUMMARY is set) # # Optional env: -# WORKER_ZONE Classic datacenter for cluster workers (e.g. fra02, wdc04) -# IBM_REGION Override KS region for infra-permissions (skips zone mapping) +# INFRASTRUCTURE_TYPE 'classic' (default) or 'vpc' +# WORKER_ZONE Classic datacenter (e.g. fra02, wdc04) or VPC zone (e.g. us-south-1) +# IBM_REGION Override KS region for infra-permissions (classic only) set -uo pipefail DIAG_FILE="${RUNNER_TEMP:-/tmp}/ibmcloud-iam-diagnostics.txt" +INFRASTRUCTURE_TYPE="${INFRASTRUCTURE_TYPE:-classic}" # Map classic datacenter zone -> Kubernetes Service region for infra-permissions. 
map_worker_zone_to_infra_region() { @@ -30,13 +33,13 @@ map_worker_zone_to_infra_region() { if [[ -n "${IBM_REGION:-}" ]]; then INFRA_REGIONS=("${IBM_REGION}") -elif [[ -n "${WORKER_ZONE:-}" ]]; then +elif [[ -n "${WORKER_ZONE:-}" && "${INFRASTRUCTURE_TYPE}" == "classic" ]]; then INFRA_REGIONS=("$(map_worker_zone_to_infra_region "${WORKER_ZONE}")") else INFRA_REGIONS=("eu-de") fi -run_infra_permissions() { +run_classic_infra_permissions() { local region="$1" echo "#### ibmcloud ks infra-permissions get --region ${region}" echo "" @@ -46,32 +49,73 @@ run_infra_permissions() { echo "" } +run_vpc_diagnostics() { + echo "### VPC Infrastructure probe" + echo "" + echo "#### VPCs in account" + echo '```' + ibmcloud is vpcs 2>&1 || echo "(ibmcloud is vpcs failed — VPC Infrastructure plugin may not be installed or no permissions)" + echo '```' + echo "" + echo "#### VPC zones available" + echo '```' + if [[ -n "${WORKER_ZONE:-}" ]]; then + VPC_REGION="${WORKER_ZONE%-*}" + ibmcloud is zones --output json 2>&1 | jq -r --arg r "${VPC_REGION}" '.[] | select(.region.name == $r) | .name' 2>/dev/null || \ + ibmcloud is zones 2>&1 || echo "(zone listing failed)" + else + ibmcloud is zones 2>&1 || echo "(zone listing failed)" + fi + echo '```' + echo "" + echo "#### VPC flavors in zone ${WORKER_ZONE:-}" + echo '```' + if [[ -n "${WORKER_ZONE:-}" ]]; then + ibmcloud oc flavors --zone "${WORKER_ZONE}" --provider vpc-gen2 2>&1 | head -30 || echo "(flavor listing failed)" + else + echo "(WORKER_ZONE not set, skipping flavor check)" + fi + echo '```' + echo "" +} + write_diagnostics() { echo "## IBM Cloud IAM diagnostics" echo "" + echo "Infrastructure type: \`${INFRASTRUCTURE_TYPE}\`" echo "Worker zone: \`${WORKER_ZONE:-}\`" - echo "Infra-permissions region(s): \`${INFRA_REGIONS[*]}\`" + if [[ "${INFRASTRUCTURE_TYPE}" == "classic" ]]; then + echo "Infra-permissions region(s): \`${INFRA_REGIONS[*]}\`" + fi echo "" + echo "### Target" echo '```' ibmcloud target 2>&1 || echo "(ibmcloud 
target failed)" echo '```' echo "" + echo "### Account" echo '```' ibmcloud account show 2>&1 || echo "(ibmcloud account show failed)" echo '```' echo "" + echo "### Cluster list probe" echo '```' ibmcloud oc cluster ls 2>&1 || echo "(ibmcloud oc cluster ls failed)" echo '```' echo "" - echo "### Classic infrastructure permissions (missing required/suggested)" - echo "" - for region in "${INFRA_REGIONS[@]}"; do - run_infra_permissions "${region}" - done + + if [[ "${INFRASTRUCTURE_TYPE}" == "vpc" ]]; then + run_vpc_diagnostics + else + echo "### Classic infrastructure permissions (missing required/suggested)" + echo "" + for region in "${INFRA_REGIONS[@]}"; do + run_classic_infra_permissions "${region}" + done + fi } echo "::group::IBM Cloud IAM diagnostics" From ce0fcfc1518b47fd366fe8fda4c02fb15f18f30d Mon Sep 17 00:00:00 2001 From: Gal Kremer Date: Wed, 24 Jun 2026 12:48:38 -0400 Subject: [PATCH 20/42] ci(hot-cluster): auto-create COS instance for VPC if not provided Instead of failing when cos_instance_crn is empty, look for an existing COS instance named -cos, or create one automatically. This makes the VPC path work out-of-the-box without pre-setup. Co-authored-by: Cursor --- .github/workflows/ibmc-cluster-setup.yml | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/.github/workflows/ibmc-cluster-setup.yml b/.github/workflows/ibmc-cluster-setup.yml index f428fe7d0f..17d80ae1f3 100644 --- a/.github/workflows/ibmc-cluster-setup.yml +++ b/.github/workflows/ibmc-cluster-setup.yml @@ -238,16 +238,20 @@ jobs: VPC_ID: ${{ steps.vpc_resources.outputs.vpc_id }} SUBNET_ID: ${{ steps.vpc_resources.outputs.subnet_id }} COS_CRN: ${{ inputs.cos_instance_crn }} + COS_INSTANCE_NAME: ${{ inputs.cluster_name || 'kubevirt-plugin-ci' }}-cos run: | if [[ -z "${COS_CRN}" ]]; then - echo "::error::COS instance CRN is required for VPC clusters (internal registry). Set the 'cos_instance_crn' input." 
- echo "" - echo "To find your COS instance CRN:" - echo " ibmcloud resource service-instances --service-name cloud-object-storage --long" - echo "" - echo "To create one:" - echo " ibmcloud resource service-instance-create kubevirt-ci-cos cloud-object-storage standard global" - exit 1 + echo "No COS CRN provided — looking for existing COS instance '${COS_INSTANCE_NAME}'..." + COS_CRN=$(ibmcloud resource service-instances --service-name cloud-object-storage --output json 2>/dev/null \ + | jq -r --arg n "${COS_INSTANCE_NAME}" '.[] | select(.name == $n) | .crn // empty' || true) + + if [[ -z "${COS_CRN}" ]]; then + echo "Creating COS instance '${COS_INSTANCE_NAME}'..." + COS_CRN=$(ibmcloud resource service-instance-create "${COS_INSTANCE_NAME}" cloud-object-storage standard global --output json | jq -r '.crn') + echo "Created COS instance: ${COS_CRN}" + else + echo "Reusing existing COS instance: ${COS_CRN}" + fi fi echo "Creating VPC cluster '${CLUSTER_NAME}' with ${{ inputs.worker_count }}x ${{ inputs.worker_flavor }} workers in zone ${ZONE}..." From 1346c97bf8a05606430373cb261d3e15c6d3e416 Mon Sep 17 00:00:00 2001 From: Gal Kremer Date: Wed, 24 Jun 2026 12:49:51 -0400 Subject: [PATCH 21/42] ci(hot-cluster): fix COS instance creation (no --output json support) ibmcloud resource service-instance-create does not support --output json. Create first, then query the CRN with service-instances --output json. Co-authored-by: Cursor --- .github/workflows/ibmc-cluster-setup.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ibmc-cluster-setup.yml b/.github/workflows/ibmc-cluster-setup.yml index 17d80ae1f3..b7a3d3c6b8 100644 --- a/.github/workflows/ibmc-cluster-setup.yml +++ b/.github/workflows/ibmc-cluster-setup.yml @@ -247,7 +247,9 @@ jobs: if [[ -z "${COS_CRN}" ]]; then echo "Creating COS instance '${COS_INSTANCE_NAME}'..." 
- COS_CRN=$(ibmcloud resource service-instance-create "${COS_INSTANCE_NAME}" cloud-object-storage standard global --output json | jq -r '.crn') + ibmcloud resource service-instance-create "${COS_INSTANCE_NAME}" cloud-object-storage standard global + COS_CRN=$(ibmcloud resource service-instances --service-name cloud-object-storage --output json \ + | jq -r --arg n "${COS_INSTANCE_NAME}" '.[] | select(.name == $n) | .crn') echo "Created COS instance: ${COS_CRN}" else echo "Reusing existing COS instance: ${COS_CRN}" From ba9457ec26fddcb83487c6e236c6426917f51afe Mon Sep 17 00:00:00 2001 From: Gal Kremer Date: Wed, 24 Jun 2026 12:50:56 -0400 Subject: [PATCH 22/42] ci(hot-cluster): fix COS plan name for non-interactive creation The 'standard' plan no longer works; use 'premium-global-deployment' which is the first option shown by the CLI. Co-authored-by: Cursor --- .github/workflows/ibmc-cluster-setup.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ibmc-cluster-setup.yml b/.github/workflows/ibmc-cluster-setup.yml index b7a3d3c6b8..ff707aa4eb 100644 --- a/.github/workflows/ibmc-cluster-setup.yml +++ b/.github/workflows/ibmc-cluster-setup.yml @@ -247,7 +247,8 @@ jobs: if [[ -z "${COS_CRN}" ]]; then echo "Creating COS instance '${COS_INSTANCE_NAME}'..." 
- ibmcloud resource service-instance-create "${COS_INSTANCE_NAME}" cloud-object-storage standard global + ibmcloud resource service-instance-create "${COS_INSTANCE_NAME}" cloud-object-storage \ + premium-global-deployment global COS_CRN=$(ibmcloud resource service-instances --service-name cloud-object-storage --output json \ | jq -r --arg n "${COS_INSTANCE_NAME}" '.[] | select(.name == $n) | .crn') echo "Created COS instance: ${COS_CRN}" From dec458b66bf3e93250cde9577d2ec82120ea0672 Mon Sep 17 00:00:00 2001 From: Gal Kremer Date: Wed, 24 Jun 2026 12:52:26 -0400 Subject: [PATCH 23/42] ci(hot-cluster): use COS standard plan ID for unambiguous creation Use the plan ID (744bfc56-d12c-4866-88d5-dac9139e0e5d) instead of the plan name to avoid interactive prompts or name resolution issues. Co-authored-by: Cursor --- .github/workflows/ibmc-cluster-setup.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ibmc-cluster-setup.yml b/.github/workflows/ibmc-cluster-setup.yml index ff707aa4eb..a15a23234d 100644 --- a/.github/workflows/ibmc-cluster-setup.yml +++ b/.github/workflows/ibmc-cluster-setup.yml @@ -247,8 +247,9 @@ jobs: if [[ -z "${COS_CRN}" ]]; then echo "Creating COS instance '${COS_INSTANCE_NAME}'..." + # Use plan ID to avoid interactive prompts (744bfc56... 
= standard plan) ibmcloud resource service-instance-create "${COS_INSTANCE_NAME}" cloud-object-storage \ - premium-global-deployment global + 744bfc56-d12c-4866-88d5-dac9139e0e5d global COS_CRN=$(ibmcloud resource service-instances --service-name cloud-object-storage --output json \ | jq -r --arg n "${COS_INSTANCE_NAME}" '.[] | select(.name == $n) | .crn') echo "Created COS instance: ${COS_CRN}" From 474dbd8b0408fb6e3b9a26e9a26ff47d32ab4b46 Mon Sep 17 00:00:00 2001 From: Gal Kremer Date: Wed, 24 Jun 2026 12:53:41 -0400 Subject: [PATCH 24/42] ci(hot-cluster): specify COS deployment name to avoid interactive prompt The standard plan ID resolves to two deployments (premium-global-deployment, premium-global-deployment-iam). Use -d premium-global-deployment to select non-interactively. Co-authored-by: Cursor --- .github/workflows/ibmc-cluster-setup.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ibmc-cluster-setup.yml b/.github/workflows/ibmc-cluster-setup.yml index a15a23234d..e9ff7b4e2f 100644 --- a/.github/workflows/ibmc-cluster-setup.yml +++ b/.github/workflows/ibmc-cluster-setup.yml @@ -247,9 +247,9 @@ jobs: if [[ -z "${COS_CRN}" ]]; then echo "Creating COS instance '${COS_INSTANCE_NAME}'..." - # Use plan ID to avoid interactive prompts (744bfc56... 
= standard plan) ibmcloud resource service-instance-create "${COS_INSTANCE_NAME}" cloud-object-storage \ - 744bfc56-d12c-4866-88d5-dac9139e0e5d global + 744bfc56-d12c-4866-88d5-dac9139e0e5d global \ + -d premium-global-deployment COS_CRN=$(ibmcloud resource service-instances --service-name cloud-object-storage --output json \ | jq -r --arg n "${COS_INSTANCE_NAME}" '.[] | select(.name == $n) | .crn') echo "Created COS instance: ${COS_CRN}" From bb89825b6282d4238fab0896e67269fee849627d Mon Sep 17 00:00:00 2001 From: Gal Kremer Date: Wed, 24 Jun 2026 13:09:26 -0400 Subject: [PATCH 25/42] ci(hot-cluster): add IPI prerequisite checks to IAM diagnostics Probes VPC, COS, DNS Services, CIS, IAM Identity, resource groups, and authorization policies to confirm readiness for OpenShift IPI on IBM Cloud VPC. Run with INFRASTRUCTURE_TYPE=ipi. Co-authored-by: Cursor --- ci-scripts/log-ibmcloud-iam-diagnostics.sh | 81 +++++++++++++++++++++- 1 file changed, 80 insertions(+), 1 deletion(-) diff --git a/ci-scripts/log-ibmcloud-iam-diagnostics.sh b/ci-scripts/log-ibmcloud-iam-diagnostics.sh index cfed20be32..1ccbb697f9 100755 --- a/ci-scripts/log-ibmcloud-iam-diagnostics.sh +++ b/ci-scripts/log-ibmcloud-iam-diagnostics.sh @@ -79,6 +79,83 @@ run_vpc_diagnostics() { echo "" } +run_ipi_prerequisites() { + echo "### IPI Prerequisites Check" + echo "" + echo "Checking if the account has the resources needed for OpenShift IPI on IBM Cloud VPC..." + echo "" + + echo "#### 1. VPC Infrastructure (create VMs, networks)" + echo '```' + if ibmcloud is vpcs 2>&1 | head -3; then + echo "RESULT: VPC access OK" + else + echo "RESULT: FAILED — need VPC Infrastructure Administrator" + fi + echo '```' + echo "" + + echo "#### 2. 
Cloud Object Storage (RHCOS images, ignition)" + echo '```' + if ibmcloud resource service-instances --service-name cloud-object-storage 2>&1 | head -5; then + echo "RESULT: COS access OK" + else + echo "RESULT: FAILED — need COS Administrator" + fi + echo '```' + echo "" + + echo "#### 3. DNS Services (cluster API/ingress records)" + echo '```' + ibmcloud plugin install dns -f 2>/dev/null || true + if ibmcloud dns zones 2>&1 | head -10; then + echo "RESULT: DNS Services access OK" + else + echo "RESULT: FAILED or no DNS zones configured — IPI needs a public DNS zone" + fi + echo '```' + echo "" + + echo "#### 4. Internet Services / CIS (alternative to DNS Services)" + echo '```' + ibmcloud plugin install cis -f 2>/dev/null || true + if ibmcloud cis instances 2>&1 | head -5; then + echo "RESULT: CIS access OK" + else + echo "RESULT: No CIS instances (may use DNS Services instead)" + fi + echo '```' + echo "" + + echo "#### 5. IAM Identity Service (service IDs for cluster components)" + echo '```' + if ibmcloud iam service-ids 2>&1 | head -5; then + echo "RESULT: IAM Identity access OK" + else + echo "RESULT: FAILED — need IAM Identity Service Administrator" + fi + echo '```' + echo "" + + echo "#### 6. Resource groups" + echo '```' + ibmcloud resource groups 2>&1 | head -10 + echo '```' + echo "" + + echo "#### 7. IAM authorization policies (service-to-service)" + echo '```' + ibmcloud iam authorization-policies 2>&1 | head -20 + echo '```' + echo "" + + echo "#### Summary" + echo "" + echo "If checks 1-5 show OK and check 3 or 4 has a DNS zone, IPI should work." + echo "If DNS shows no zones, a domain + DNS zone must be configured first." 
+ echo "" +} + write_diagnostics() { echo "## IBM Cloud IAM diagnostics" echo "" @@ -107,7 +184,9 @@ write_diagnostics() { echo '```' echo "" - if [[ "${INFRASTRUCTURE_TYPE}" == "vpc" ]]; then + if [[ "${INFRASTRUCTURE_TYPE}" == "ipi" ]]; then + run_ipi_prerequisites + elif [[ "${INFRASTRUCTURE_TYPE}" == "vpc" ]]; then run_vpc_diagnostics else echo "### Classic infrastructure permissions (missing required/suggested)" From 660062dba8f6b2f83fc074d7fea398f98e423317 Mon Sep 17 00:00:00 2001 From: Gal Kremer Date: Wed, 24 Jun 2026 13:10:53 -0400 Subject: [PATCH 26/42] ci(hot-cluster): add ipi diagnostics-only mode to setup workflow Select infrastructure_type=ipi to run IPI prerequisite checks (VPC, COS, DNS, IAM Identity, resource groups, auth policies) without creating any resources. All common steps are skipped. Co-authored-by: Cursor --- .github/workflows/ibmc-cluster-setup.yml | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ibmc-cluster-setup.yml b/.github/workflows/ibmc-cluster-setup.yml index e9ff7b4e2f..fbcec0ec06 100644 --- a/.github/workflows/ibmc-cluster-setup.yml +++ b/.github/workflows/ibmc-cluster-setup.yml @@ -4,13 +4,14 @@ on: workflow_dispatch: inputs: infrastructure_type: - description: 'Infrastructure type: classic or vpc' + description: 'Infrastructure type: classic, vpc, or ipi (ipi = diagnostics only)' required: true default: 'classic' type: choice options: - classic - vpc + - ipi cluster_name: description: 'Cluster name' required: true @@ -90,6 +91,7 @@ jobs: if-no-files-found: warn - name: Check for existing cluster + if: inputs.infrastructure_type != 'ipi' id: check_cluster run: | if ibmcloud oc cluster get --cluster "${CLUSTER_NAME}" &>/dev/null; then @@ -271,31 +273,36 @@ jobs: --disable-outbound-traffic-protection # ────────────────────────────────────────────────────────────────────── - # Common steps (both classic and VPC converge here) + # Common steps (both classic and VPC converge here; 
skipped for ipi) # ────────────────────────────────────────────────────────────────────── - name: Wait for cluster to be ready to use + if: inputs.infrastructure_type != 'ipi' run: | ./ci-scripts/check-roks-cluster-state.sh - name: Install oc client from cluster version + if: inputs.infrastructure_type != 'ipi' run: | CLUSTER_JSON="$(ibmcloud oc cluster get --cluster "${CLUSTER_NAME}" --output json)" export CLUSTER_JSON bash ./ci-scripts/install-oc-client.sh - name: Configure kubeconfig + if: inputs.infrastructure_type != 'ipi' run: | ibmcloud oc cluster config --cluster "${CLUSTER_NAME}" --admin oc cluster-info oc get nodes -o wide - name: Install HCO + if: inputs.infrastructure_type != 'ipi' env: KVM_EMULATION: ${{ inputs.kvm_emulation }} run: | ./ci-scripts/install-hco.sh - name: Verify ARC secrets + if: inputs.infrastructure_type != 'ipi' run: | HAS_APP=$([ -n "${{ secrets.ARC_GITHUB_APP_ID }}" ] && [ -n "${{ secrets.ARC_GITHUB_APP_INSTALL_ID }}" ] && [ -n "${{ secrets.ARC_GITHUB_APP_PRIVATE_KEY }}" ] && echo "yes" || echo "no") HAS_PAT=$([ -n "${{ secrets.ARC_GITHUB_PAT }}" ] && echo "yes" || echo "no") @@ -310,6 +317,7 @@ jobs: echo "ARC secrets are present." 
- name: Build ARC runner image + if: inputs.infrastructure_type != 'ipi' id: build_runner env: OC_VERSION: '4.20' @@ -318,6 +326,7 @@ jobs: echo "image_ref=${IMAGE_REF}" >> "$GITHUB_OUTPUT" - name: Install ARC + if: inputs.infrastructure_type != 'ipi' env: ARC_CONFIG_URL: 'https://github.com/${{ github.repository }}' ARC_APP_ID: ${{ secrets.ARC_GITHUB_APP_ID }} @@ -331,10 +340,12 @@ jobs: ./ci-scripts/arc/install-runner-scale-set.sh - name: Install CI environment controller + if: inputs.infrastructure_type != 'ipi' run: | ./ci-scripts/ci-env/install-ci-env-controller.sh - name: Verify cluster health + if: inputs.infrastructure_type != 'ipi' env: GITHUB_REPOSITORY: ${{ github.repository }} run: | From 2aa1b338f8394e47511a0ba38abc523f58ab82e3 Mon Sep 17 00:00:00 2001 From: Gal Kremer Date: Wed, 24 Jun 2026 13:21:10 -0400 Subject: [PATCH 27/42] ci(hot-cluster): add IPI cluster creation path (experimental) When infrastructure_type=ipi, downloads openshift-install 4.20.8, generates install-config.yaml for IBM Cloud VPC, and attempts cluster creation. Includes automatic cleanup on failure and artifact upload for debugging. Expects OPENSHIFT_PULL_SECRET secret. 
Co-authored-by: Cursor --- .github/workflows/ibmc-cluster-setup.yml | 125 +++++++++++++++++++++++ 1 file changed, 125 insertions(+) diff --git a/.github/workflows/ibmc-cluster-setup.yml b/.github/workflows/ibmc-cluster-setup.yml index fbcec0ec06..44953ecfe1 100644 --- a/.github/workflows/ibmc-cluster-setup.yml +++ b/.github/workflows/ibmc-cluster-setup.yml @@ -272,6 +272,131 @@ jobs: --cos-instance "${COS_CRN}" \ --disable-outbound-traffic-protection + # ────────────────────────────────────────────────────────────────────── + # IPI (self-managed OpenShift) path + # ────────────────────────────────────────────────────────────────────── + - name: Install openshift-install and ccoctl + if: inputs.infrastructure_type == 'ipi' + env: + OCP_VERSION: '4.20.8' + run: | + echo "Downloading openshift-install ${OCP_VERSION}..." + curl -sL "https://mirror.openshift.com/pub/openshift-v4/x86_64/clients/ocp/${OCP_VERSION}/openshift-install-linux.tar.gz" \ + | tar -xz -C /usr/local/bin openshift-install + openshift-install version + + echo "Downloading ccoctl ${OCP_VERSION}..." 
+ curl -sL "https://mirror.openshift.com/pub/openshift-v4/x86_64/clients/ocp/${OCP_VERSION}/ccoctl-linux.tar.gz" \ + | tar -xz -C /usr/local/bin ccoctl + ccoctl version 2>&1 | head -3 || true + + - name: Generate IPI install-config + if: inputs.infrastructure_type == 'ipi' + env: + ZONE: ${{ inputs.zone }} + PULL_SECRET: ${{ secrets.OPENSHIFT_PULL_SECRET }} + WORKER_FLAVOR: ${{ inputs.worker_flavor }} + WORKER_COUNT: ${{ inputs.worker_count }} + run: | + VPC_REGION="${ZONE%-*}" + INSTALL_DIR="${RUNNER_TEMP}/ipi-install" + mkdir -p "${INSTALL_DIR}" + + # Generate SSH key for node access + ssh-keygen -t ed25519 -f "${INSTALL_DIR}/ssh-key" -N "" -q + SSH_PUB=$(cat "${INSTALL_DIR}/ssh-key.pub") + + cat > "${INSTALL_DIR}/install-config.yaml" <> "$GITHUB_OUTPUT" + + # Show config without secrets + echo "::group::install-config.yaml (redacted)" + sed 's/pullSecret:.*/pullSecret: REDACTED/' "${INSTALL_DIR}/install-config.yaml" + echo "::endgroup::" + + - name: Attempt IPI cluster creation + if: inputs.infrastructure_type == 'ipi' + env: + IC_API_KEY: ${{ secrets.IC_KEY }} + INSTALL_DIR: ${{ runner.temp }}/ipi-install + run: | + export IC_API_KEY + + echo "Running openshift-install create cluster..." + echo "This will likely fail at DNS validation — we want to see the exact error." + echo "" + + openshift-install create cluster --dir="${INSTALL_DIR}" --log-level=info 2>&1 | tee "${INSTALL_DIR}/install.log" || true + + echo "" + echo "=== Install attempt completed (exit code: $?) 
===" + echo "" + echo "::group::Last 50 lines of install log" + tail -50 "${INSTALL_DIR}/install.log" || true + echo "::endgroup::" + + - name: Upload IPI install artifacts + if: inputs.infrastructure_type == 'ipi' && always() + uses: actions/upload-artifact@v6 + with: + name: ipi-install-artifacts-${{ github.run_id }} + path: | + ${{ runner.temp }}/ipi-install/.openshift_install.log + ${{ runner.temp }}/ipi-install/install.log + ${{ runner.temp }}/ipi-install/terraform.tfstate + retention-days: 7 + if-no-files-found: ignore + + - name: IPI cleanup on failure + if: inputs.infrastructure_type == 'ipi' && always() + env: + IC_API_KEY: ${{ secrets.IC_KEY }} + INSTALL_DIR: ${{ runner.temp }}/ipi-install + run: | + export IC_API_KEY + if [[ -f "${INSTALL_DIR}/metadata.json" ]]; then + echo "Cleaning up IPI resources..." + openshift-install destroy cluster --dir="${INSTALL_DIR}" --log-level=info 2>&1 | tail -30 || true + else + echo "No metadata.json found — nothing to clean up (install likely failed before provisioning)." + fi + # ────────────────────────────────────────────────────────────────────── # Common steps (both classic and VPC converge here; skipped for ipi) # ────────────────────────────────────────────────────────────────────── From cf4e298a0fe745e749f3f3932c4f8c4977f00ac6 Mon Sep 17 00:00:00 2001 From: Gal Kremer Date: Wed, 24 Jun 2026 13:25:24 -0400 Subject: [PATCH 28/42] ci(hot-cluster): fix IPI flavor format (bx2-4x16 not bx2.4x16) openshift-install uses hyphen-separated VPC instance profiles (bx2-4x16) while ibmcloud oc uses dot-separated (bx2.4x16). Convert automatically. 
Co-authored-by: Cursor --- .github/workflows/ibmc-cluster-setup.yml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ibmc-cluster-setup.yml b/.github/workflows/ibmc-cluster-setup.yml index 44953ecfe1..ca995e25bb 100644 --- a/.github/workflows/ibmc-cluster-setup.yml +++ b/.github/workflows/ibmc-cluster-setup.yml @@ -306,6 +306,9 @@ jobs: ssh-keygen -t ed25519 -f "${INSTALL_DIR}/ssh-key" -N "" -q SSH_PUB=$(cat "${INSTALL_DIR}/ssh-key.pub") + # IPI uses hyphen-separated flavors (bx2-4x16), not dot (bx2.4x16) + IPI_WORKER_FLAVOR=$(echo "${WORKER_FLAVOR}" | sed 's/\./-/g') + cat > "${INSTALL_DIR}/install-config.yaml" < Date: Wed, 24 Jun 2026 13:34:52 -0400 Subject: [PATCH 29/42] docs: add hot cluster CI status and follow-up guide Summarizes the Jun 24 testing session: three provisioning paths (classic, VPC ROKS, IPI), what works, what's blocked, and exact next steps for tomorrow after DNS/COS authorization is configured. Co-authored-by: Cursor --- docs/HOT_CLUSTER_CI_STATUS.md | 224 ++++++++++++++++++++++++++++++++++ 1 file changed, 224 insertions(+) create mode 100644 docs/HOT_CLUSTER_CI_STATUS.md diff --git a/docs/HOT_CLUSTER_CI_STATUS.md b/docs/HOT_CLUSTER_CI_STATUS.md new file mode 100644 index 0000000000..7b1814a56b --- /dev/null +++ b/docs/HOT_CLUSTER_CI_STATUS.md @@ -0,0 +1,224 @@ +# Hot Cluster CI — Status & Follow-Up (2026-06-24) + +## Overview + +PR [#4099](https://github.com/kubevirt-ui/kubevirt-plugin/pull/4099) moves E2E testing from Prow (ephemeral AWS clusters) to a persistent IBM Cloud hot cluster with GitHub Actions + ARC. This document summarizes the current state and next steps. 
+ +--- + +## Three Provisioning Paths Explored + +The setup workflow (`ibmc-cluster-setup.yml`) now supports three `infrastructure_type` options: + +| Path | Status | Remaining Blocker | +| ---------------------- | ------- | ----------------------------------------------------- | +| **Classic ROKS** | Blocked | E73e6 — Classic Infrastructure Super User missing | +| **VPC ROKS** | Blocked | E4acb — COS service-to-service authorization missing | +| **IPI (self-managed)** | Blocked | DNS zone for `cnv-ui.com` not configured in IBM Cloud | + +--- + +## What Works (Confirmed in CI Runs) + +| Component | Status | Run Evidence | +| ---------------------------------------------- | ------------------------- | ------------------------------------------------------------------------------------------------------ | +| IBM Cloud CLI login (`IC_KEY`) | OK | All runs | +| Kubernetes Service Administrator | OK (was A0010, now fixed) | Runs after Jun 24 | +| VPC Infrastructure Administrator | OK | VPC created in [#28114675220](https://github.com/kubevirt-ui/kubevirt-plugin/actions/runs/28114675220) | +| VPC + subnet + public gateway creation | OK | Auto-created and reused across runs | +| COS instance creation | OK | `kubevirt-plugin-ci-cos` active | +| IAM Identity Service | OK | Can list/create service IDs | +| `openshift-install` download + auth | OK | [#28116978024](https://github.com/kubevirt-ui/kubevirt-plugin/actions/runs/28116978024) | +| IPI flavor validation (`bx2-4x16`, `bx2-8x32`) | OK | Same run | +| `OPENSHIFT_PULL_SECRET` GitHub secret | OK | Added Jun 24 | +| IPI diagnostics mode | OK | [#28116118038](https://github.com/kubevirt-ui/kubevirt-plugin/actions/runs/28116118038) (green) | + +--- + +## Blocker Details + +### 1. VPC ROKS — COS Authorization (One Command Fix) + +Error: + +``` +Could not find the specified cloud object storage instance because it does not exist +or the API key that is set for this resource group and region has inadequate permissions. 
(E4acb) +``` + +**Fix:** + +```bash +ibmcloud iam authorization-policy-create containers-kubernetes cloud-object-storage Reader +``` + +This creates a service-to-service authorization policy allowing Kubernetes Service to read COS instances. One-time, account-level. + +### 2. IPI — DNS Zone Required + +Error: + +``` +failed to generate asset "DNS Config": failed to get DNS zone ID: +DNS zone "ipi-test.ibmcloud.local" not found +``` + +**Fix:** Register `cnv-ui.com` (or `ci.cnv-ui.com`) in IBM Cloud Internet Services (CIS): + +```bash +# Create CIS instance +ibmcloud resource service-instance-create cnv-ui-cis internet-svcs standard-next global -g cnv-ui + +# Add domain +ibmcloud cis instance-set cnv-ui-cis +ibmcloud cis domain-add cnv-ui.com + +# Then update domain registrar NS records to point to IBM Cloud nameservers +``` + +After NS propagation, update `baseDomain` in the IPI install-config to `cnv-ui.com`. + +### 3. Classic ROKS — Infrastructure Permissions + +Error: + +``` +The classic infrastructure permissions that are set for the region and resource group +do not have the required permissions to perform this action. 
(E73e6) +``` + +**Fix:** Grant Classic Infrastructure Super User to `IC_KEY` identity, or at minimum: + +- Add Server, Cancel Server, View Virtual Server Details +- IPMI Remote Management, OS Reloads and Rescue Kernel +- Add/Edit/View Support Case +- Add Compute with Public Network Port + +--- + +## Workflow Inputs Reference + +``` +Actions → IBM Cloud Hot Cluster Setup → Run workflow + +infrastructure_type: classic | vpc | ipi +cluster_name: kubevirt-plugin-ci +zone: wdc04 (classic), us-south-1 (vpc/ipi) +openshift_version: 4.20_openshift +worker_flavor: bx2.8x32 (ROKS dot format) / bx2-8x32 (IPI hyphen format, auto-converted) +worker_count: 2 +kvm_emulation: true +cos_instance_crn: (leave empty — auto-created for VPC) +``` + +--- + +## GitHub Secrets Required + +| Secret | Purpose | Status | +| -------------------------------------- | ------------------------------- | -------------------------- | +| `IC_KEY` | IBM Cloud API key | Org secret (exists) | +| `ARC_GITHUB_APP_*` or `ARC_GITHUB_PAT` | ARC runner registration | Org secret (exists) | +| `OPENSHIFT_PULL_SECRET` | Red Hat registry auth for IPI | Repo secret (added Jun 24) | +| `BOT_PAT` | Ghost runner cleanup (optional) | Repo secret (exists) | + +--- + +## IBM Cloud Account Info (from diagnostics) + +``` +User: mschatzm@redhat.com +Account: Virtualization (2be0cd841378412882ec2fb4a99951e2) +Account Owner: dkenigsb@redhat.com +Resource Group: cnv-ui +CLI Region: eu-de +``` + +--- + +## VPC Resources Already Created (reusable) + +| Resource | ID | Zone | +| -------------- | ----------------------------------------------------------------------------------------------------------------------------------- | ---------- | +| VPC | `r006-2342eba0-b3be-412d-bbe3-ea040609b26d` | us-south | +| Subnet | `0717-aaacfab4-8863-4fc1-8e0c-8ad739f97504` | us-south-1 | +| Public Gateway | `r006-29e78258-1637-43df-b210-f2d920f68b27` | us-south-1 | +| COS Instance | `kubevirt-plugin-ci-cos` (CRN: 
`crn:v1:bluemix:public:cloud-object-storage:global:a/2be0cd841378412882ec2fb4a99951e2:6e32949d-...`) | global | + +These will be reused automatically on the next VPC or IPI run. + +--- + +## Tomorrow's Action Plan + +### If DNS is configured (`cnv-ui.com` in CIS): + +1. Update `baseDomain` in the IPI install-config step (change `ipi-test.ibmcloud.local` → `cnv-ui.com`) +2. Trigger: `infrastructure_type=ipi`, `zone=us-south-1` +3. Wait ~45 min for cluster creation +4. If successful, continue with HCO + ARC + ci-env-controller install +5. Then trigger Hot Cluster E2E + +### If COS authorization is granted instead (VPC ROKS path): + +1. Trigger: `infrastructure_type=vpc`, `zone=us-south-1` +2. COS instance already exists — cluster create should succeed +3. Wait for cluster ready (~30 min) +4. HCO, ARC, ci-env-controller install automatically +5. Then trigger Hot Cluster E2E + +### Quick test to verify either fix: + +```bash +# Test VPC ROKS (after COS auth): +gh workflow run "IBM Cloud Hot Cluster Setup" \ + --repo kubevirt-ui/kubevirt-plugin \ + --ref CNV-74265-hot-cluster-ci \ + -f infrastructure_type=vpc \ + -f cluster_name=kubevirt-plugin-ci \ + -f zone=us-south-1 + +# Test IPI (after DNS): +gh workflow run "IBM Cloud Hot Cluster Setup" \ + --repo kubevirt-ui/kubevirt-plugin \ + --ref CNV-74265-hot-cluster-ci \ + -f infrastructure_type=ipi \ + -f cluster_name=kubevirt-plugin-ci \ + -f zone=us-south-1 +``` + +--- + +## Key Commits on Branch (Jun 24) + +| Commit | Description | +| ----------- | ------------------------------------------------ | +| `005efee7c` | IAM diagnostics visible in logs + artifacts | +| `756963362` | Add VPC Gen2 provisioning path alongside classic | +| `bb89825b6` | Add IPI prerequisite checks to diagnostics | +| `660062dba` | Add `ipi` diagnostics-only mode | +| `2aa1b338f` | Add IPI cluster creation path (experimental) | +| `cf4e298a0` | Fix IPI flavor format (bx2-4x16 not bx2.4x16) | + +--- + +## Architecture Diagram + +``` 
+infrastructure_type = classic → Verify zone → VLAN lookup → ibmcloud oc cluster create classic +infrastructure_type = vpc → VPC/subnet/gateway → COS → ibmcloud oc cluster create vpc-gen2 +infrastructure_type = ipi → openshift-install create cluster (DNS + VPC + VMs) + ↓ (all paths converge) + Wait for cluster → Install oc → HCO → ARC → ci-env-controller + ↓ + Hot Cluster E2E → Playwright gating tests +``` + +--- + +## Cost Notes + +- VPC resources (VPC, subnet, gateway) are free when idle — no VMs running +- COS instance exists but has no buckets yet — negligible cost +- No VMs were provisioned in any IPI run (failed before that stage) +- Auto-teardown (2h idle) applies once a cluster is running From d878863701cb2c297efdb342696e7c76ff9ba5ef Mon Sep 17 00:00:00 2001 From: Gal Kremer Date: Wed, 24 Jun 2026 15:34:05 -0400 Subject: [PATCH 30/42] ci(hot-cluster): use cnv-ui.com as IPI baseDomain Domain is registered with IBM Cloud CIS nameservers. IPI will create api.kubevirt-plugin-ci.cnv-ui.com and *.apps.kubevirt-plugin-ci.cnv-ui.com DNS records. Co-authored-by: Cursor --- .github/workflows/ibmc-cluster-setup.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ibmc-cluster-setup.yml b/.github/workflows/ibmc-cluster-setup.yml index ca995e25bb..d5d16af35d 100644 --- a/.github/workflows/ibmc-cluster-setup.yml +++ b/.github/workflows/ibmc-cluster-setup.yml @@ -313,7 +313,7 @@ jobs: apiVersion: v1 metadata: name: ${CLUSTER_NAME} - baseDomain: ipi-test.ibmcloud.local + baseDomain: cnv-ui.com platform: ibmcloud: region: ${VPC_REGION} From 33af1b0fa3d9bdf6c77aca7a28c59d839b812f13 Mon Sep 17 00:00:00 2001 From: Gal Kremer Date: Wed, 24 Jun 2026 15:39:22 -0400 Subject: [PATCH 31/42] ci(hot-cluster): use existing cnv-ui resource group for IPI The IC_KEY identity cannot create resource groups. Tell the installer to use the existing cnv-ui group instead of creating a new one. 
Co-authored-by: Cursor --- .github/workflows/ibmc-cluster-setup.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/ibmc-cluster-setup.yml b/.github/workflows/ibmc-cluster-setup.yml index d5d16af35d..c771574495 100644 --- a/.github/workflows/ibmc-cluster-setup.yml +++ b/.github/workflows/ibmc-cluster-setup.yml @@ -317,6 +317,7 @@ jobs: platform: ibmcloud: region: ${VPC_REGION} + resourceGroupName: cnv-ui controlPlane: architecture: amd64 hyperthreading: Enabled From a3fb934a6f5d299a1cf255441ca5701338913338 Mon Sep 17 00:00:00 2001 From: Gal Kremer Date: Wed, 24 Jun 2026 17:06:10 -0400 Subject: [PATCH 32/42] ci(hot-cluster): production cleanup and IPI bootstrap fix Setup workflow: - IPI now leaves cluster running on success (cleanup only on failure) - All bootstrap steps (HCO, ARC, ci-env) run for all infra types - IPI uses installer kubeconfig instead of ibmcloud oc - Fixed install-config.yaml heredoc indentation - Added credentialsMode: Manual + ccoctl manifest generation - OpenShift version configurable for IPI (stable-4.XX channel) - Default infrastructure_type changed to ipi Teardown workflow: - Added infrastructure_type input (ipi/vpc/classic) - IPI teardown downloads install state and runs openshift-install destroy Auto-teardown: - Detects IPI clusters via DNS probe - Added business_hours_only option (Sun-Thu 8-18 Israel) - Idle threshold shown in summary Cleanup: - Deleted POC files and status doc - Consolidated README with IPI docs Co-authored-by: Cursor --- .../workflows/ibmc-cluster-auto-teardown.yml | 88 +++- .github/workflows/ibmc-cluster-setup.yml | 221 +++++----- .github/workflows/ibmc-cluster-teardown.yml | 104 ++++- ci-scripts/POC_OUTLINE.md | 72 --- ci-scripts/README.md | 412 +++++------------- cypress/tests/gating/poc-check-tab-yaml.cy.ts | 232 ---------- cypress/tests/poc-gating.cy.ts | 3 - docs/HOT_CLUSTER_CI_STATUS.md | 224 ---------- 8 files changed, 384 insertions(+), 972 deletions(-) delete mode 100644 
ci-scripts/POC_OUTLINE.md delete mode 100644 cypress/tests/gating/poc-check-tab-yaml.cy.ts delete mode 100644 cypress/tests/poc-gating.cy.ts delete mode 100644 docs/HOT_CLUSTER_CI_STATUS.md diff --git a/.github/workflows/ibmc-cluster-auto-teardown.yml b/.github/workflows/ibmc-cluster-auto-teardown.yml index 88f7e9b39e..bef89d2bcf 100644 --- a/.github/workflows/ibmc-cluster-auto-teardown.yml +++ b/.github/workflows/ibmc-cluster-auto-teardown.yml @@ -11,10 +11,23 @@ on: default: 'kubevirt-plugin-ci' type: string idle_threshold_minutes: - description: 'Idle threshold in minutes before teardown' + description: 'Minutes of CI inactivity before teardown' required: true default: '120' type: string + check_frequency_minutes: + description: 'How often to check (only affects manual runs; cron is fixed at 30 min)' + required: false + default: '30' + type: string + business_hours_only: + description: 'Only tear down outside business hours (Sun-Thu 8am-6pm UTC+2 Israel)' + required: false + default: 'false' + type: choice + options: + - 'false' + - 'true' permissions: contents: read @@ -82,9 +95,35 @@ jobs: core.setOutput('active_jobs', (inProgress > 0 || queued > 0) ? 'true' : 'false'); core.setOutput('last_run_time', lastRunTime ? lastRunTime.toISOString() : ''); + - name: Check business hours + id: check_hours + if: steps.check_ci.outputs.active_jobs == 'false' + env: + BUSINESS_HOURS_ONLY: ${{ inputs.business_hours_only || 'false' }} + run: | + if [[ "${BUSINESS_HOURS_ONLY}" != "true" ]]; then + echo "skip=false" >> "$GITHUB_OUTPUT" + echo "Business hours check disabled — proceeding with idle check." + exit 0 + fi + + HOUR_IL=$(TZ='Asia/Jerusalem' date +%H) + DOW=$(TZ='Asia/Jerusalem' date +%u) + + if [[ ${DOW} -ge 1 && ${DOW} -le 4 && ${HOUR_IL} -ge 8 && ${HOUR_IL} -lt 18 ]]; then + echo "skip=true" >> "$GITHUB_OUTPUT" + echo "Business hours (Sun-Thu 8-18 Israel) — skipping teardown." 
+ elif [[ ${DOW} -eq 7 && ${HOUR_IL} -ge 8 && ${HOUR_IL} -lt 18 ]]; then + echo "skip=true" >> "$GITHUB_OUTPUT" + echo "Business hours (Sunday 8-18 Israel) — skipping teardown." + else + echo "skip=false" >> "$GITHUB_OUTPUT" + echo "Outside business hours — proceeding with idle check." + fi + - name: Setup IBM Cloud CLI uses: IBM/actions-ibmcloud-cli@v1 - if: steps.check_ci.outputs.active_jobs == 'false' + if: steps.check_ci.outputs.active_jobs == 'false' && steps.check_hours.outputs.skip != 'true' with: api_key: ${{ secrets.IC_KEY }} region: eu-de @@ -93,7 +132,7 @@ jobs: - name: Check idle threshold id: check_idle - if: steps.check_ci.outputs.active_jobs == 'false' + if: steps.check_ci.outputs.active_jobs == 'false' && steps.check_hours.outputs.skip != 'true' env: LAST_RUN_TIME: ${{ steps.check_ci.outputs.last_run_time }} run: | @@ -130,30 +169,50 @@ jobs: id: check_cluster if: steps.check_idle.outputs.recent_activity == 'false' run: | + INFRA_TYPE="" + if ibmcloud oc cluster get --cluster "${CLUSTER_NAME}" &>/dev/null; then STATE=$(ibmcloud oc cluster get --cluster "${CLUSTER_NAME}" --output json 2>/dev/null | jq -r '.state // "unknown"') - echo "Cluster '${CLUSTER_NAME}' exists (state: ${STATE})" + PROVIDER=$(ibmcloud oc cluster get --cluster "${CLUSTER_NAME}" --output json 2>/dev/null | jq -r '.provider // "classic"') + echo "ROKS cluster '${CLUSTER_NAME}' exists (state: ${STATE}, provider: ${PROVIDER})" + echo "exists=true" >> "$GITHUB_OUTPUT" + if [[ "${PROVIDER}" == "vpc-gen2" ]]; then + INFRA_TYPE="vpc" + else + INFRA_TYPE="classic" + fi + elif dig +short "api.${CLUSTER_NAME}.cnv-ui.com" 2>/dev/null | grep -q .; then + echo "IPI cluster detected via DNS (api.${CLUSTER_NAME}.cnv-ui.com resolves)" echo "exists=true" >> "$GITHUB_OUTPUT" + INFRA_TYPE="ipi" else - echo "Cluster '${CLUSTER_NAME}' does not exist, nothing to do" + echo "No cluster '${CLUSTER_NAME}' found (ROKS or IPI), nothing to do" echo "exists=false" >> "$GITHUB_OUTPUT" fi + echo 
"infra_type=${INFRA_TYPE}" >> "$GITHUB_OUTPUT" + - name: Trigger teardown if: steps.check_idle.outputs.recent_activity == 'false' && steps.check_cluster.outputs.exists == 'true' uses: actions/github-script@v8 with: script: | + const infraType = '${{ steps.check_cluster.outputs.infra_type }}' || 'classic'; + + const inputs = { + cluster_name: '${{ env.CLUSTER_NAME }}', + infrastructure_type: infraType, + }; + + core.info(`Triggering teardown for ${infraType} cluster '${inputs.cluster_name}'`); + await github.rest.actions.createWorkflowDispatch({ owner: context.repo.owner, repo: context.repo.repo, workflow_id: 'ibmc-cluster-teardown.yml', ref: context.ref, - inputs: { - cluster_name: '${{ env.CLUSTER_NAME }}' - } + inputs, }); - core.info('Teardown workflow triggered for cluster ${{ env.CLUSTER_NAME }}'); - name: Summary if: always() @@ -163,14 +222,17 @@ jobs: WORKFLOW_LAST_RUN_TIME: ${{ steps.check_ci.outputs.last_run_time }} IDLE_RECENT_ACTIVITY: ${{ steps.check_idle.outputs.recent_activity }} IDLE_REASON: ${{ steps.check_idle.outputs.reason }} + BUSINESS_HOURS_SKIP: ${{ steps.check_hours.outputs.skip }} run: | echo "## Auto-Teardown Check" >> "$GITHUB_STEP_SUMMARY" echo "" >> "$GITHUB_STEP_SUMMARY" echo "| Check | Result |" >> "$GITHUB_STEP_SUMMARY" echo "|-------|--------|" >> "$GITHUB_STEP_SUMMARY" echo "| Cluster | \`${CLUSTER_NAME}\` |" >> "$GITHUB_STEP_SUMMARY" + echo "| Idle Threshold | \`${IDLE_THRESHOLD_MINUTES}\` min |" >> "$GITHUB_STEP_SUMMARY" + echo "| Business Hours Skip | \`${BUSINESS_HOURS_SKIP:-N/A}\` |" >> "$GITHUB_STEP_SUMMARY" echo "| Cluster Exists | \`${CLUSTER_EXISTS:-(unknown)}\` |" >> "$GITHUB_STEP_SUMMARY" - echo "| Workflow Active Jobs? | \`${WORKFLOW_ACTIVE_JOBS:-N/A}\` |" >> "$GITHUB_STEP_SUMMARY" - echo "| Workflow Last Run Time | \`${WORKFLOW_LAST_RUN_TIME:-N/A}\` |" >> "$GITHUB_STEP_SUMMARY" - echo "| Idle Recent Activity? 
| \`${IDLE_RECENT_ACTIVITY:-N/A}\` |" >> "$GITHUB_STEP_SUMMARY" - echo "| Idle Reason | \`${IDLE_REASON:-N/A}\` |" >> "$GITHUB_STEP_SUMMARY" + echo "| Active CI Jobs? | \`${WORKFLOW_ACTIVE_JOBS:-N/A}\` |" >> "$GITHUB_STEP_SUMMARY" + echo "| Last CI Run | \`${WORKFLOW_LAST_RUN_TIME:-N/A}\` |" >> "$GITHUB_STEP_SUMMARY" + echo "| Recent Activity? | \`${IDLE_RECENT_ACTIVITY:-N/A}\` |" >> "$GITHUB_STEP_SUMMARY" + echo "| Reason | \`${IDLE_REASON:-N/A}\` |" >> "$GITHUB_STEP_SUMMARY" diff --git a/.github/workflows/ibmc-cluster-setup.yml b/.github/workflows/ibmc-cluster-setup.yml index c771574495..a080cdb967 100644 --- a/.github/workflows/ibmc-cluster-setup.yml +++ b/.github/workflows/ibmc-cluster-setup.yml @@ -4,31 +4,31 @@ on: workflow_dispatch: inputs: infrastructure_type: - description: 'Infrastructure type: classic, vpc, or ipi (ipi = diagnostics only)' + description: 'Infrastructure type' required: true - default: 'classic' + default: 'ipi' type: choice options: - - classic - - vpc - ipi + - vpc + - classic cluster_name: description: 'Cluster name' required: true default: 'kubevirt-plugin-ci' type: string zone: - description: 'Zone (classic: wdc04, fra02; vpc: us-south-1, eu-de-1)' + description: 'Zone (classic: wdc04, fra02; vpc/ipi: us-south-1, eu-de-1)' required: true default: 'us-south-1' type: string openshift_version: - description: 'OpenShift version' + description: 'OpenShift version (e.g. 
4.20_openshift, 4.22_openshift)' required: true default: '4.20_openshift' type: string worker_flavor: - description: 'Worker node flavor (classic bare metal: mb4c.4x32; vpc: bx2.8x32, cx2.4x8)' + description: 'Worker node flavor (classic: mb4c.4x32; vpc/ipi: bx2.8x32)' required: true default: 'bx2.8x32' type: string @@ -43,7 +43,7 @@ on: default: true type: boolean cos_instance_crn: - description: 'COS instance CRN for VPC internal registry (required for vpc, ignored for classic)' + description: 'COS instance CRN for VPC internal registry (auto-created if empty)' required: false default: '' type: string @@ -53,7 +53,7 @@ permissions: env: CLUSTER_NAME: ${{ inputs.cluster_name || 'kubevirt-plugin-ci' }} - INFRASTRUCTURE_TYPE: ${{ inputs.infrastructure_type || 'classic' }} + INFRASTRUCTURE_TYPE: ${{ inputs.infrastructure_type || 'ipi' }} jobs: provision-cluster: @@ -77,7 +77,7 @@ jobs: continue-on-error: true env: WORKER_ZONE: ${{ inputs.zone }} - INFRASTRUCTURE_TYPE: ${{ inputs.infrastructure_type || 'classic' }} + INFRASTRUCTURE_TYPE: ${{ inputs.infrastructure_type || 'ipi' }} run: bash ./ci-scripts/log-ibmcloud-iam-diagnostics.sh - name: Upload IAM diagnostics log @@ -90,7 +90,7 @@ jobs: retention-days: 14 if-no-files-found: warn - - name: Check for existing cluster + - name: Check for existing ROKS cluster if: inputs.infrastructure_type != 'ipi' id: check_cluster run: | @@ -186,14 +186,10 @@ jobs: run: | echo "=== VPC Gen2 provisioning ===" - # Derive region from zone (e.g. us-south-1 -> us-south) VPC_REGION="${ZONE%-*}" echo "VPC region: ${VPC_REGION}, zone: ${ZONE}" - - # Target the VPC region ibmcloud target -r "${VPC_REGION}" - # Create or reuse VPC VPC_ID=$(ibmcloud is vpcs --output json 2>/dev/null | jq -r --arg n "${VPC_NAME}" '.[] | select(.name == $n) | .id // empty') if [[ -z "${VPC_ID}" ]]; then echo "Creating VPC '${VPC_NAME}'..." 
@@ -203,7 +199,6 @@ jobs: echo "Reusing existing VPC '${VPC_NAME}': ${VPC_ID}" fi - # Create or reuse subnet SUBNET_NAME="${VPC_NAME}-subnet-${ZONE}" SUBNET_ID=$(ibmcloud is subnets --output json 2>/dev/null | jq -r --arg n "${SUBNET_NAME}" '.[] | select(.name == $n) | .id // empty') if [[ -z "${SUBNET_ID}" ]]; then @@ -214,7 +209,6 @@ jobs: echo "Reusing existing subnet '${SUBNET_NAME}': ${SUBNET_ID}" fi - # Create or reuse public gateway (required for console/OperatorHub access) GW_NAME="${VPC_NAME}-gw-${ZONE}" GW_ID=$(ibmcloud is public-gateways --output json 2>/dev/null | jq -r --arg n "${GW_NAME}" '.[] | select(.name == $n) | .id // empty') if [[ -z "${GW_ID}" ]]; then @@ -225,7 +219,6 @@ jobs: echo "Reusing existing public gateway '${GW_NAME}': ${GW_ID}" fi - # Attach public gateway to subnet (idempotent — no-op if already attached) echo "Attaching public gateway to subnet..." ibmcloud is subnet-update "${SUBNET_ID}" --pgw "${GW_ID}" 2>/dev/null || true @@ -275,20 +268,23 @@ jobs: # ────────────────────────────────────────────────────────────────────── # IPI (self-managed OpenShift) path # ────────────────────────────────────────────────────────────────────── - - name: Install openshift-install and ccoctl + - name: Install openshift-install if: inputs.infrastructure_type == 'ipi' + id: ipi_tools env: - OCP_VERSION: '4.20.8' + OC_VERSION_INPUT: ${{ inputs.openshift_version }} run: | - echo "Downloading openshift-install ${OCP_VERSION}..." - curl -sL "https://mirror.openshift.com/pub/openshift-v4/x86_64/clients/ocp/${OCP_VERSION}/openshift-install-linux.tar.gz" \ + OCP_CHANNEL="stable-${OC_VERSION_INPUT%%_*}" + echo "Resolving ${OCP_CHANNEL} to latest patch version..." + + echo "Downloading openshift-install (${OCP_CHANNEL})..." 
+ curl -sL "https://mirror.openshift.com/pub/openshift-v4/x86_64/clients/ocp/${OCP_CHANNEL}/openshift-install-linux.tar.gz" \ | tar -xz -C /usr/local/bin openshift-install openshift-install version - echo "Downloading ccoctl ${OCP_VERSION}..." - curl -sL "https://mirror.openshift.com/pub/openshift-v4/x86_64/clients/ocp/${OCP_VERSION}/ccoctl-linux.tar.gz" \ - | tar -xz -C /usr/local/bin ccoctl - ccoctl version 2>&1 | head -3 || true + RESOLVED_VERSION=$(openshift-install version | head -1 | awk '{print $2}') + echo "ocp_channel=${OCP_CHANNEL}" >> "$GITHUB_OUTPUT" + echo "ocp_version=${RESOLVED_VERSION}" >> "$GITHUB_OUTPUT" - name: Generate IPI install-config if: inputs.infrastructure_type == 'ipi' @@ -302,59 +298,56 @@ jobs: INSTALL_DIR="${RUNNER_TEMP}/ipi-install" mkdir -p "${INSTALL_DIR}" - # Generate SSH key for node access ssh-keygen -t ed25519 -f "${INSTALL_DIR}/ssh-key" -N "" -q SSH_PUB=$(cat "${INSTALL_DIR}/ssh-key.pub") - # IPI uses hyphen-separated flavors (bx2-4x16), not dot (bx2.4x16) IPI_WORKER_FLAVOR=$(echo "${WORKER_FLAVOR}" | sed 's/\./-/g') cat > "${INSTALL_DIR}/install-config.yaml" <> "$GITHUB_OUTPUT" - # Show config without secrets echo "::group::install-config.yaml (redacted)" sed 's/pullSecret:.*/pullSecret: REDACTED/' "${INSTALL_DIR}/install-config.yaml" echo "::endgroup::" - - name: Attempt IPI cluster creation + - name: Generate CCO manifests for IBM Cloud if: inputs.infrastructure_type == 'ipi' env: IC_API_KEY: ${{ secrets.IC_KEY }} @@ -362,54 +355,74 @@ jobs: run: | export IC_API_KEY - echo "Running openshift-install create cluster..." - echo "This will likely fail at DNS validation — we want to see the exact error." - echo "" - - openshift-install create cluster --dir="${INSTALL_DIR}" --log-level=info 2>&1 | tee "${INSTALL_DIR}/install.log" || true + echo "Creating install manifests..." + openshift-install create manifests --dir="${INSTALL_DIR}" + + echo "Extracting CredentialsRequests..." 
+ CRED_DIR="${INSTALL_DIR}/credreqs" + mkdir -p "${CRED_DIR}" + oc adm release extract \ + --credentials-requests \ + --cloud=ibmcloud \ + --to="${CRED_DIR}" \ + "$(openshift-install version | grep 'release image' | awk '{print $3}')" 2>/dev/null || \ + echo "Warning: could not extract credentials requests; falling back to installer defaults" + + if ls "${CRED_DIR}"/*.yaml &>/dev/null; then + echo "Processing CredentialsRequests with ccoctl..." + ccoctl ibmcloud create-service-id \ + --credentials-requests-dir="${CRED_DIR}" \ + --name="${CLUSTER_NAME}" \ + --output-dir="${INSTALL_DIR}" 2>&1 || true + fi - echo "" - echo "=== Install attempt completed (exit code: $?) ===" - echo "" - echo "::group::Last 50 lines of install log" - tail -50 "${INSTALL_DIR}/install.log" || true - echo "::endgroup::" + - name: Create IPI cluster + id: ipi_create + if: inputs.infrastructure_type == 'ipi' + env: + IC_API_KEY: ${{ secrets.IC_KEY }} + INSTALL_DIR: ${{ runner.temp }}/ipi-install + run: | + export IC_API_KEY + echo "Running openshift-install create cluster..." 
+ openshift-install create cluster --dir="${INSTALL_DIR}" --log-level=info 2>&1 | tee "${INSTALL_DIR}/install.log" - name: Upload IPI install artifacts if: inputs.infrastructure_type == 'ipi' && always() uses: actions/upload-artifact@v6 with: - name: ipi-install-artifacts-${{ github.run_id }} + name: ipi-install-state-${{ github.run_id }} path: | - ${{ runner.temp }}/ipi-install/.openshift_install.log - ${{ runner.temp }}/ipi-install/install.log + ${{ runner.temp }}/ipi-install/metadata.json ${{ runner.temp }}/ipi-install/terraform.tfstate - retention-days: 7 + ${{ runner.temp }}/ipi-install/auth/ + ${{ runner.temp }}/ipi-install/.openshift_install.log + retention-days: 30 if-no-files-found: ignore - name: IPI cleanup on failure - if: inputs.infrastructure_type == 'ipi' && always() + if: inputs.infrastructure_type == 'ipi' && failure() env: IC_API_KEY: ${{ secrets.IC_KEY }} INSTALL_DIR: ${{ runner.temp }}/ipi-install run: | export IC_API_KEY if [[ -f "${INSTALL_DIR}/metadata.json" ]]; then - echo "Cleaning up IPI resources..." - openshift-install destroy cluster --dir="${INSTALL_DIR}" --log-level=info 2>&1 | tail -30 || true + echo "Install failed — cleaning up IPI resources..." + openshift-install destroy cluster --dir="${INSTALL_DIR}" --log-level=info 2>&1 | tail -50 || true else - echo "No metadata.json found — nothing to clean up (install likely failed before provisioning)." + echo "No metadata.json found — nothing to clean up." 
fi # ────────────────────────────────────────────────────────────────────── - # Common steps (both classic and VPC converge here; skipped for ipi) + # Common bootstrap steps (all paths converge here) # ────────────────────────────────────────────────────────────────────── - - name: Wait for cluster to be ready to use + - name: Wait for ROKS cluster ready if: inputs.infrastructure_type != 'ipi' run: | ./ci-scripts/check-roks-cluster-state.sh - - name: Install oc client from cluster version + - name: Install oc client if: inputs.infrastructure_type != 'ipi' run: | CLUSTER_JSON="$(ibmcloud oc cluster get --cluster "${CLUSTER_NAME}" --output json)" @@ -417,21 +430,29 @@ jobs: bash ./ci-scripts/install-oc-client.sh - name: Configure kubeconfig - if: inputs.infrastructure_type != 'ipi' + env: + OCP_CHANNEL: ${{ steps.ipi_tools.outputs.ocp_channel }} run: | - ibmcloud oc cluster config --cluster "${CLUSTER_NAME}" --admin + if [[ "${{ inputs.infrastructure_type }}" == "ipi" ]]; then + INSTALL_DIR="${RUNNER_TEMP}/ipi-install" + export KUBECONFIG="${INSTALL_DIR}/auth/kubeconfig" + echo "KUBECONFIG=${KUBECONFIG}" >> "$GITHUB_ENV" + + curl -sL "https://mirror.openshift.com/pub/openshift-v4/x86_64/clients/ocp/${OCP_CHANNEL}/openshift-client-linux.tar.gz" \ + | tar -xz -C /usr/local/bin oc kubectl + else + ibmcloud oc cluster config --cluster "${CLUSTER_NAME}" --admin + fi oc cluster-info oc get nodes -o wide - name: Install HCO - if: inputs.infrastructure_type != 'ipi' env: KVM_EMULATION: ${{ inputs.kvm_emulation }} run: | ./ci-scripts/install-hco.sh - name: Verify ARC secrets - if: inputs.infrastructure_type != 'ipi' run: | HAS_APP=$([ -n "${{ secrets.ARC_GITHUB_APP_ID }}" ] && [ -n "${{ secrets.ARC_GITHUB_APP_INSTALL_ID }}" ] && [ -n "${{ secrets.ARC_GITHUB_APP_PRIVATE_KEY }}" ] && echo "yes" || echo "no") HAS_PAT=$([ -n "${{ secrets.ARC_GITHUB_PAT }}" ] && echo "yes" || echo "no") @@ -446,7 +467,6 @@ jobs: echo "ARC secrets are present." 
- name: Build ARC runner image - if: inputs.infrastructure_type != 'ipi' id: build_runner env: OC_VERSION: '4.20' @@ -455,7 +475,6 @@ jobs: echo "image_ref=${IMAGE_REF}" >> "$GITHUB_OUTPUT" - name: Install ARC - if: inputs.infrastructure_type != 'ipi' env: ARC_CONFIG_URL: 'https://github.com/${{ github.repository }}' ARC_APP_ID: ${{ secrets.ARC_GITHUB_APP_ID }} @@ -469,12 +488,10 @@ jobs: ./ci-scripts/arc/install-runner-scale-set.sh - name: Install CI environment controller - if: inputs.infrastructure_type != 'ipi' run: | ./ci-scripts/ci-env/install-ci-env-controller.sh - name: Verify cluster health - if: inputs.infrastructure_type != 'ipi' env: GITHUB_REPOSITORY: ${{ github.repository }} run: | @@ -487,7 +504,7 @@ jobs: echo "" >> "$GITHUB_STEP_SUMMARY" echo "| Parameter | Value |" >> "$GITHUB_STEP_SUMMARY" echo "|-----------|-------|" >> "$GITHUB_STEP_SUMMARY" - echo "| Infrastructure | \`${{ inputs.infrastructure_type || 'classic' }}\` |" >> "$GITHUB_STEP_SUMMARY" + echo "| Infrastructure | \`${{ inputs.infrastructure_type || 'ipi' }}\` |" >> "$GITHUB_STEP_SUMMARY" echo "| Cluster | \`${CLUSTER_NAME}\` |" >> "$GITHUB_STEP_SUMMARY" echo "| Zone | \`${{ inputs.zone }}\` |" >> "$GITHUB_STEP_SUMMARY" echo "| OpenShift | \`${{ inputs.openshift_version }}\` |" >> "$GITHUB_STEP_SUMMARY" @@ -495,16 +512,8 @@ jobs: echo "| Workers | \`${{ inputs.worker_count }}\` |" >> "$GITHUB_STEP_SUMMARY" echo "| KVM Emulation | \`${{ inputs.kvm_emulation }}\` |" >> "$GITHUB_STEP_SUMMARY" echo "" >> "$GITHUB_STEP_SUMMARY" - echo "### IAM diagnostics" >> "$GITHUB_STEP_SUMMARY" - echo "" >> "$GITHUB_STEP_SUMMARY" - if [[ "${{ inputs.infrastructure_type || 'classic' }}" == "classic" ]]; then - echo "If cluster create failed with **E73e6**, expand the **Log IBM Cloud IAM diagnostics** step log, open the run **Summary** tab, or download the \`ibmcloud-iam-diagnostics\` artifact." 
>> "$GITHUB_STEP_SUMMARY" - else - echo "If cluster create failed, check VPC Infrastructure permissions in the **Log IBM Cloud IAM diagnostics** step." >> "$GITHUB_STEP_SUMMARY" - fi - echo "" >> "$GITHUB_STEP_SUMMARY" if oc cluster-info &>/dev/null; then echo "Cluster is **healthy** and ready for CI." >> "$GITHUB_STEP_SUMMARY" else - echo "Cluster setup **may have issues**. Check the logs." >> "$GITHUB_STEP_SUMMARY" + echo "Cluster setup **may have issues**. Check the logs and the IAM diagnostics artifact." >> "$GITHUB_STEP_SUMMARY" fi diff --git a/.github/workflows/ibmc-cluster-teardown.yml b/.github/workflows/ibmc-cluster-teardown.yml index 037021cff2..b2e745b812 100644 --- a/.github/workflows/ibmc-cluster-teardown.yml +++ b/.github/workflows/ibmc-cluster-teardown.yml @@ -3,26 +3,53 @@ name: IBM Cloud Hot Cluster Teardown on: workflow_dispatch: inputs: + infrastructure_type: + description: 'Infrastructure type used to create the cluster' + required: true + default: 'ipi' + type: choice + options: + - ipi + - vpc + - classic cluster_name: description: 'Cluster name to tear down' required: true default: 'kubevirt-plugin-ci' type: string + ipi_setup_run_id: + description: 'GitHub Actions run ID from the IPI setup (to download install state artifact)' + required: false + default: '' + type: string workflow_call: inputs: + infrastructure_type: + description: 'Infrastructure type' + required: false + default: 'classic' + type: string cluster_name: description: 'Cluster name to tear down' required: false default: 'kubevirt-plugin-ci' type: string + ipi_setup_run_id: + description: 'GitHub Actions run ID from the IPI setup' + required: false + default: '' + type: string secrets: IC_KEY: required: true BOT_PAT: required: false + OPENSHIFT_PULL_SECRET: + required: false permissions: contents: read + actions: read env: CLUSTER_NAME: ${{ inputs.cluster_name || 'kubevirt-plugin-ci' }} @@ -31,7 +58,7 @@ jobs: teardown: name: Tear Down Hot Cluster runs-on: ubuntu-latest - 
timeout-minutes: 60 + timeout-minutes: 90 steps: - name: Setup IBM Cloud CLI uses: IBM/actions-ibmcloud-cli@v1 @@ -39,15 +66,18 @@ jobs: api_key: ${{ secrets.IC_KEY }} region: eu-de group: cnv-ui - plugins: kubernetes-service + plugins: kubernetes-service, vpc-infrastructure - - name: Check cluster exists + # ────────────────────────────────────────────────────────────────────── + # ROKS teardown (classic + vpc) + # ────────────────────────────────────────────────────────────────────── + - name: Check ROKS cluster exists + if: inputs.infrastructure_type != 'ipi' id: check_cluster run: | if ibmcloud oc cluster get --cluster "${CLUSTER_NAME}" &>/dev/null; then echo "Cluster '${CLUSTER_NAME}' found" echo "exists=true" >> "$GITHUB_OUTPUT" - ibmcloud oc cluster config --cluster "${CLUSTER_NAME}" --admin || true else echo "Cluster '${CLUSTER_NAME}' not found, nothing to tear down" @@ -55,7 +85,7 @@ jobs: fi - name: Deregister ARC runners - if: steps.check_cluster.outputs.exists == 'true' + if: inputs.infrastructure_type != 'ipi' && steps.check_cluster.outputs.exists == 'true' continue-on-error: true run: | if command -v helm &>/dev/null || (curl -fsSL https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash); then @@ -68,15 +98,15 @@ jobs: echo "WARNING: Helm not available, skipping Helm uninstall" fi - - name: Delete cluster - if: steps.check_cluster.outputs.exists == 'true' + - name: Delete ROKS cluster + if: inputs.infrastructure_type != 'ipi' && steps.check_cluster.outputs.exists == 'true' run: | echo "Deleting cluster '${CLUSTER_NAME}'..." ibmcloud oc cluster rm --cluster "${CLUSTER_NAME}" -f --force-delete-storage echo "Cluster deletion initiated" echo "Waiting for cluster to be fully removed..." 
- MAX_WAIT=7200 # 2 hours + MAX_WAIT=7200 INTERVAL=30 ELAPSED=0 @@ -96,17 +126,16 @@ jobs: fi - name: Clean up VPC resources - if: steps.check_cluster.outputs.exists == 'true' + if: inputs.infrastructure_type == 'vpc' && steps.check_cluster.outputs.exists == 'true' continue-on-error: true env: VPC_NAME: ${{ inputs.cluster_name || 'kubevirt-plugin-ci' }}-vpc run: | echo "Checking for VPC resources to clean up..." - ibmcloud plugin install vpc-infrastructure -f 2>/dev/null || true VPC_ID=$(ibmcloud is vpcs --output json 2>/dev/null | jq -r --arg n "${VPC_NAME}" '.[] | select(.name == $n) | .id // empty' || true) if [[ -z "${VPC_ID}" ]]; then - echo "No VPC '${VPC_NAME}' found — cluster was likely classic. Skipping VPC cleanup." + echo "No VPC '${VPC_NAME}' found. Skipping VPC cleanup." exit 0 fi @@ -127,14 +156,60 @@ jobs: ibmcloud is subnet-delete "${sub_id}" -f 2>/dev/null || true done - echo "Waiting for subnets to be deleted before removing VPC..." sleep 30 echo "Removing VPC '${VPC_NAME}'..." ibmcloud is vpc-delete "${VPC_ID}" -f 2>/dev/null || echo "VPC deletion failed (may have remaining resources)" - echo "VPC cleanup complete." + # ────────────────────────────────────────────────────────────────────── + # IPI teardown + # ────────────────────────────────────────────────────────────────────── + - name: Download IPI install state + if: inputs.infrastructure_type == 'ipi' + env: + GH_TOKEN: ${{ github.token }} + SETUP_RUN_ID: ${{ inputs.ipi_setup_run_id }} + run: | + INSTALL_DIR="${RUNNER_TEMP}/ipi-install" + mkdir -p "${INSTALL_DIR}" + + if [[ -z "${SETUP_RUN_ID}" ]]; then + echo "::error::ipi_setup_run_id is required for IPI teardown. Provide the run ID from the setup workflow that created the cluster." + exit 1 + fi + + echo "Downloading IPI install state from run ${SETUP_RUN_ID}..." 
+ gh run download "${SETUP_RUN_ID}" --repo "${{ github.repository }}" \ + --name "ipi-install-state-${SETUP_RUN_ID}" \ + --dir "${INSTALL_DIR}" + + echo "Downloaded files:" + ls -la "${INSTALL_DIR}/" + echo "install_dir=${INSTALL_DIR}" >> "$GITHUB_OUTPUT" + + - name: Destroy IPI cluster + if: inputs.infrastructure_type == 'ipi' + env: + IC_API_KEY: ${{ secrets.IC_KEY }} + INSTALL_DIR: ${{ runner.temp }}/ipi-install + run: | + export IC_API_KEY + + echo "Downloading openshift-install (latest stable)..." + curl -sL "https://mirror.openshift.com/pub/openshift-v4/x86_64/clients/ocp/stable/openshift-install-linux.tar.gz" \ + | tar -xz -C /usr/local/bin openshift-install + + if [[ ! -f "${INSTALL_DIR}/metadata.json" ]]; then + echo "::error::metadata.json not found in install state. Cannot destroy IPI cluster." + exit 1 + fi + + echo "Destroying IPI cluster..." + openshift-install destroy cluster --dir="${INSTALL_DIR}" --log-level=info 2>&1 | tail -80 + # ────────────────────────────────────────────────────────────────────── + # Common cleanup + # ────────────────────────────────────────────────────────────────────── - name: Clean up ghost runners continue-on-error: true env: @@ -160,6 +235,7 @@ jobs: echo "" >> "$GITHUB_STEP_SUMMARY" echo "| Parameter | Value |" >> "$GITHUB_STEP_SUMMARY" echo "|-----------|-------|" >> "$GITHUB_STEP_SUMMARY" + echo "| Infrastructure | \`${{ inputs.infrastructure_type || 'classic' }}\` |" >> "$GITHUB_STEP_SUMMARY" echo "| Cluster | \`${CLUSTER_NAME}\` |" >> "$GITHUB_STEP_SUMMARY" - echo "| Cluster Found | \`${{ steps.check_cluster.outputs.exists }}\` |" >> "$GITHUB_STEP_SUMMARY" + echo "| Cluster Found | \`${{ steps.check_cluster.outputs.exists || 'N/A (IPI)' }}\` |" >> "$GITHUB_STEP_SUMMARY" echo "" >> "$GITHUB_STEP_SUMMARY" diff --git a/ci-scripts/POC_OUTLINE.md b/ci-scripts/POC_OUTLINE.md deleted file mode 100644 index becbb84377..0000000000 --- a/ci-scripts/POC_OUTLINE.md +++ /dev/null @@ -1,72 +0,0 @@ -# Part 1 - Get a github action 
workflow running on an OpenShift cluster - -Use GitHub Action Runner Controller (ARC) to enable "ephemeral self-hosted runners" as a workflow job `runs-on` target. - -Notes: - -- A two part install via helm: - - - runner scale set controller (ARC) - installed once per cluster - - runner scale set (RSS) - installed per repo+runner, provides handling for a specific self-hosted runner - -- No incoming network access is required, ARC uses a polling system - -- The helm chart is patched at install time to: - - - Define the SCC, SA and RBAC bindings the ARC, RSS listener, and RSS ephemeral runner pods will be assigned - - Define the RSS ephemeral container's image - -- Authentication from the ARC and RSS to GitHub uses a GitHub App - -- All of the configurations allow us to control what image runs workflows, how that deployment is managed, and the workflow's embedded permissions to interact with the cluster - -# Part 2 - Control the workflow's container image - -Develop an ARC runner image that includes all needed tools. This allows specific control over what tools and libraries are provided by default to the workflows. - -Notes: - -- Use an `ImageStream` and `BuildConfig` to build the container image directly on the Cluster itself. -- The script that sets up the `BuildConfig` can lookup the proper versions of `oc` and `virtctl` -- The build can include specific versions/locations of important tools - -# Part 3 - Setup an action run specific CI environment on OpenShift cluster - -Each test run needs to establish its own self-contained (as much as possible at least) environment to run tests. This comes down to creating a namespace, deploying the plugin to be tests, and deploying a dev console. When it is all running, there is a console+plugin+CI namespace easily created for each CI run. 
- -Notes: - -- A two part helm chart based install: - - - `ci-env-controller` uses a collection of ConfigMaps to control CI test environments - - `ci-test-stack` is the deployment to be able to run and access a console, plugin and namespace - -- The arc RSS only needs enough permissions to manipulate the ConfigMaps - -- The controller sets up the namespace, and runs the console in "off cluster" mode with no authentication required - -- Accessing the test stack's route runs with a SA that has enough permissions to do everything needed for the e2e testing - -- The exact way this is all deployed and how the SA and role binding are setup could use some additional work to make the RBAC being used everywhere very obvious - -- The ConfigMaps will timeout and the test stack reaped around a default of 2 hours after it is created. - -# Part 4 - Run full workflow from GitHub, watch all job steps run as expected - -The workflow job will run on the RSS, be able to create a `ci-test-stack` by pushing a ConfigMap to the `ci-env-controller`, get a route to the test stack and run all tests targeting that route. - -Notes: - -- The route can be on-cluster only, or use a publicly available route - -- GitHub actions are used to request and release the test stack - -- Test can run any way we want. The POC is using a slim version of the cypress gating tests and the Cypress provided github action. - -# Part 5 - Updates to diagnostic and test results artifact tracking - -Anything that could be useful (test reports, pod logs, cluster log) are collected and pushed to artifacts to capture all the test run details. - -# Part 6 - Allow developers to manually create/remove CI environments - -Since a `ci-test-stack` can be requested by creating a ConfigMap, anyone login with the permissions to ask for one can get one. This allows developers to be able to manually create a test stack with any custom build of a plugin image they want. 
diff --git a/ci-scripts/README.md b/ci-scripts/README.md index 006906f128..c0102815b6 100644 --- a/ci-scripts/README.md +++ b/ci-scripts/README.md @@ -1,350 +1,168 @@ # Hot Cluster CI -> **Continuation guide (CNV-74265):** [docs/HOT_CLUSTER_CI_CONTINUATION.md](../docs/HOT_CLUSTER_CI_CONTINUATION.md) -> **Future work backlog:** [docs/HOT_CLUSTER_FUTURE_WORK.md](../docs/HOT_CLUSTER_FUTURE_WORK.md) -> **Cluster lifecycle:** [docs/CLUSTER_LIFECYCLE.md](../docs/CLUSTER_LIFECYCLE.md) +This directory contains scripts and documentation for the **IBM Cloud hot cluster** CI stack: an OpenShift cluster used for KubeVirt plugin E2E testing, with **Hyperconverged Cluster Operator (HCO)** and **GitHub Actions Runner Controller (ARC)** for self-hosted runners (`kubevirt-plugin-ci`). -This directory contains scripts and documentation for the **IBM Cloud hot cluster** CI stack: an OpenShift (ROKS) cluster used for KubeVirt plugin integration testing, with **Hyperconverged Cluster Operator (HCO)** and **GitHub Actions Runner Controller (ARC)** so jobs can run on cluster-adjacent self-hosted runners (`kubevirt-plugin-ci`). - -Workers can be **bare metal** (real KVM) or **VPC / shared** flavors with **KVM emulation**; the setup workflow defaults favor VPC-style flavors and `kvm_emulation: true` unless you change inputs. - -## Why this stack (motivation) - -| Goal | Approach | -| -------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| **Real KubeVirt / OpenShift behavior** | Tests run against a live cluster with HCO, virt stack, and storage—not mocks. | -| **Console + plugin fidelity** | Two POC paths: hit the **in-cluster** console URL, or run an **off-cluster** console container with the plugin served like the operator (TLS + nginx), matching how developers run bridge locally. 
| -| **Long-running / privileged CI** | GitHub-hosted runners are a poor fit for nested virt, heavy Playwright, and Docker-heavy flows; **ARC** on the cluster provides dind-capable runners with `oc` RBAC. | -| **Cost control** | Bare metal and large workers are expensive; **auto-teardown** after idle time limits runaway spend. | +Three infrastructure types are supported: **Classic ROKS**, **VPC ROKS** (recommended), and **IPI** (self-managed OpenShift). ## Architecture -**Lifecycle (IBM Cloud)** +**Cluster lifecycle** ``` GitHub Actions - │ - ├── ibmc-cluster-setup.yml → "IBM Cloud Hot Cluster Setup" - ├── ibmc-cluster-teardown.yml → "IBM Cloud Hot Cluster Teardown" (also workflow_call + ghost-runner cleanup) - └── ibmc-cluster-auto-teardown.yml → "IBM Cloud Hot Cluster Auto-Teardown" (cron + dispatch → teardown workflow) + ├── ibmc-cluster-setup.yml → "IBM Cloud Hot Cluster Setup" (classic / vpc / ipi) + ├── ibmc-cluster-teardown.yml → "IBM Cloud Hot Cluster Teardown" + └── ibmc-cluster-auto-teardown.yml → "IBM Cloud Hot Cluster Auto-Teardown" (cron, 2h idle) ``` -**Hot cluster E2E** +**E2E testing** ``` -hot-cluster-e2e.yml — "Hot Cluster E2E" (PR + manual dispatch) - ├── cluster-health-check (ubuntu-latest + IBM Cloud → kubeconfig) - │ └── ci-scripts/check-cluster-health.sh +hot-cluster-e2e.yml → "Hot Cluster E2E" (PR trigger + manual dispatch) + ├── cluster-health-check (ubuntu-latest) └── run-e2e-tests (workflow_call → hot-cluster-e2e-run.yml) -hot-cluster-e2e-run.yml — "Hot Cluster E2E Run" - ├── check-runner (diagnostics on ARC runner) - ├── build-kubevirt-plugin-image (ubuntu-latest; podman build + push) +hot-cluster-e2e-run.yml → "Hot Cluster E2E Run" + ├── check-runner (ARC runner diagnostics) + ├── build-kubevirt-plugin-image (podman build + push to ttl.sh) └── run-gating-tests (runs-on: kubevirt-plugin-ci) - ├── ci-env-request → ci-env-controller → ci-test-stack (console + plugin) + ├── ci-env-request → ci-env-controller → ci-test-stack ├── 
BRIDGE_BASE_ADDRESS from test stack └── Playwright gating (or features project) ``` -## Required GitHub Secrets - -These secrets must be configured in the repository settings before running the workflows. - -### IBM Cloud - -| Secret | Description | How to Obtain | -| -------- | --------------------- | ------------------------------- | -| `IC_KEY` | IBM Cloud IAM API key | Repository/org secret (Actions) | - -The API key must belong to a user or service ID with the following IAM permissions: - -**For VPC clusters** (recommended — simpler IAM): - -- **Kubernetes Service**: Administrator role (platform) -- **VPC Infrastructure Services**: Administrator role -- **Container Registry**: Administrator role -- **Resource group (`cnv-ui`)**: Viewer role - -**For classic clusters:** - -- **Kubernetes Service**: Administrator role (platform) -- **Classic Infrastructure**: Super User (or equivalent per-permission set) -- **Container Registry**: Administrator role - -### Ghost Runner Cleanup (optional) - -| Secret | Description | How to Obtain | -| --------- | ------------------------- | ------------------------------------------- | -| `BOT_PAT` | PAT with repo admin scope | GitHub Settings → Developer Settings → PATs | - -The `BOT_PAT` is only needed if you want the teardown workflow to automatically delete offline "ghost" runners from GitHub. Deleting self-hosted runners requires repository admin access which `GITHUB_TOKEN` cannot provide. The PAT needs the `repo` scope (classic) or **Administration: Read and Write** (fine-grained). If not set, ghost runners can be cleaned up manually via Settings → Actions → Runners. 
- -### ARC Authentication (choose one) - -#### Option A: GitHub App (recommended for production) - -| Secret | Description | How to Obtain | -| ---------------------------- | --------------------- | --------------------------------- | -| `ARC_GITHUB_APP_ID` | GitHub App ID | See "Creating a GitHub App" below | -| `ARC_GITHUB_APP_INSTALL_ID` | App installation ID | See "Creating a GitHub App" below | -| `ARC_GITHUB_APP_PRIVATE_KEY` | App private key (PEM) | See "Creating a GitHub App" below | - -#### Option B: Personal Access Token (simpler, less secure) - -| Secret | Description | How to Obtain | -| ---------------- | ---------------- | ---------------------------------------------------------- | -| `ARC_GITHUB_PAT` | Fine-grained PAT | GitHub Settings → Developer Settings → Fine-grained tokens | - -The PAT requires these permissions on the target repository: - -- **Administration**: Read and Write -- **Metadata**: Read-only - -## Cluster Authentication - -All workflows that need cluster access use the IBM Cloud CLI to pull a kubeconfig on-demand: - -```yaml -- name: Setup IBM Cloud CLI - uses: IBM/actions-ibmcloud-cli@v1 - with: - api_key: ${{ secrets.IC_KEY }} - plugins: kubernetes-service - -- name: Configure kubeconfig - run: | - ibmcloud oc cluster config --cluster "${CLUSTER_NAME}" --admin - oc cluster-info -``` - -This avoids storing kubeconfig or credentials as GitHub secrets. Any workflow or job that needs `oc`/`kubectl` access simply repeats these two steps with the shared `IC_KEY` secret. - -## Creating a GitHub App for ARC - -1. Go to your organization settings (or personal settings) → Developer settings → GitHub Apps → New GitHub App -2. Configure the app: - - **Name**: `kubevirt-plugin-arc` (or any name) - - **Homepage URL**: Your repository URL - - **Webhook**: Uncheck "Active" (not needed) - - **Permissions**: - - Repository permissions → Administration: Read and Write - - Organization permissions → Self-hosted runners: Read and Write -3. 
Create the app and note the **App ID** -4. Generate a **Private Key** (downloads a `.pem` file) -5. Install the app on your organization/repository and note the **Installation ID** - - Find it in the URL: `https://github.com/settings/installations/` -6. Store the three values as GitHub secrets: - - `ARC_GITHUB_APP_ID` = App ID - - `ARC_GITHUB_APP_INSTALL_ID` = Installation ID - - `ARC_GITHUB_APP_PRIVATE_KEY` = Contents of the `.pem` file - -## Usage - -### ARC install scripts (OpenShift) - -All ARC automation lives under **`ci-scripts/arc/`**. See **[`ci-scripts/arc/README.md`](arc/README.md)** for the full walkthrough. - -| Script | Role | -| ------------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| **`ci-scripts/arc/setup-dind-mirror.sh`** | Mirror **`docker:dind`** to the internal registry, write **`ci-scripts/generated/arc-dind-replace.env`** for Helm post-rendering (standard path; `SKIP_DIND_MIRROR=1` only if dind is provided another way). | -| **`ci-scripts/arc/setup-runner-image.sh`** | Build custom runner image (BuildConfig + `runner-image/Dockerfile`); prints **`IMAGE_REF=`**. | -| **`ci-scripts/arc/install-arc-controller.sh`** | Once per cluster: `arc-systems`, **`ci-scripts/arc/arc-openshift-scc.yaml`**, Helm **`gha-runner-scale-set-controller`**. | -| **`ci-scripts/arc/install-runner-scale-set.sh`** | Per scale set: Helm **`gha-runner-scale-set`**, optional **`ARC_RUNNER_IMAGE`**, dind post-render (**`--storage-driver=vfs`** always; optional **`docker:dind`** mirror via env file or **`ARC_DIND_INTERNAL_IMAGE`**), SCC bind, **`arc-runner-rbac.yaml`** (unless `SKIP_ARC_RUNNER_RBAC=1`). 
Requires **`ARC_CONFIG_URL`** + GitHub auth. Run **after** the controller script. | +## Infrastructure Types -Hot Cluster Setup runs **`ci-scripts/arc/setup-dind-mirror.sh`**, **`ci-scripts/arc/setup-runner-image.sh`**, then **`ci-scripts/arc/install-arc-controller.sh`** and **`ci-scripts/arc/install-runner-scale-set.sh`** (same env for the install steps). +| Type | Description | IAM needed | Cluster management | +|------|------------|------------|-------------------| +| **classic** | IBM-managed ROKS on classic infrastructure | K8s Admin + Classic Super User | IBM manages control plane | +| **vpc** | IBM-managed ROKS on VPC Gen2 | K8s Admin + VPC Admin + COS auth | IBM manages control plane | +| **ipi** | Self-managed OpenShift via `openshift-install` | VPC Admin + COS Manager + DNS + IAM Identity | You manage everything | -### Custom runner image +### VPC ROKS (recommended) -The setup workflow builds a **custom runner image** on the cluster. The image extends the official GitHub Actions runner with Node.js 22, kubectl, oc, virtctl, and jq. Container workflows use **Docker** via the ARC **dind** sidecar (`DOCKER_HOST`). +Uses standard IBM Cloud IAM (no SoftLayer/classic permissions). VPC, subnet, public gateway, and COS instance are auto-created and reused across runs. -- **Dockerfile**: `ci-scripts/arc/runner-image/Dockerfile` -- **Runner pod Helm fragment**: `ci-scripts/arc/arc-runner-scale-set.pod.yaml` — used by **`ci-scripts/arc/install-runner-scale-set.sh`**. -- **Dind post-render**: **`ci-scripts/arc/install-runner-scale-set.sh`** always runs Helm with **`--post-renderer ci-scripts/arc/arc-dind-post-render.sh`** for **`CONTAINER_MODE=dind`** (injects **`--storage-driver=vfs`** so nested overlay does not fail on OpenShift). 
**`ci-scripts/arc/setup-dind-mirror.sh`** writes **`ci-scripts/generated/arc-dind-replace.env`** so the post-renderer also swaps **`docker:dind`** for the internal registry; you can set **`ARC_DIND_INTERNAL_IMAGE`** at install time instead (writes the same env file for that run). -- **Refresh runner image only**: re-run **`ci-scripts/arc/setup-runner-image.sh`**, then **`ci-scripts/arc/install-runner-scale-set.sh`** with **`ARC_RUNNER_IMAGE`** set to the new ref (and the same auth env vars). +- **Zone format**: `us-south-1`, `eu-de-1` +- **Flavor**: `bx2.8x32`, `cx2.4x8` (list with `ibmcloud oc flavors --zone <zone> --provider vpc-gen2`) +- **Required**: COS service-to-service authorization (`ibmcloud iam authorization-policy-create containers-kubernetes cloud-object-storage Reader`) -Optional: `OC_VERSION`, `VIRTCTL_VERSION`, `ARC_RUNNERS_NS`, `CONTAINER_MODE` (default **dind**), `ARC_VERSION`, `ARC_SCALE_SET_LABELS`, `SKIP_ARC_RUNNER_RBAC=1`. +### Classic ROKS -#### ARC 0.14.0+ ([changelog](https://github.blog/changelog/2026-03-19-actions-runner-controller-release-0-14-0/)) +Uses SoftLayer/classic infrastructure. VLANs are auto-discovered or created. -**`ci-scripts/arc/install-arc-controller.sh`** and **`ci-scripts/arc/install-runner-scale-set.sh`** use Helm chart **`0.14.0`** by default so controller and scale-set versions stay aligned and reproducible.
+- **Zone format**: `wdc04`, `fra02`, `dal10` +- **Flavor**: `m3c.8x64`, `mb4c.4x32` +- **Required**: Classic Infrastructure Super User -| Feature | How we use it | -| ------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| **Multilabel scale sets** | Optional env **`ARC_SCALE_SET_LABELS=kubevirt-plugin-ci,linux`** (comma-separated). Jobs must target **every** label, e.g. `runs-on: [kubevirt-plugin-ci, linux]`. Omit the variable to keep the previous single-label behavior (default `runs-on: kubevirt-plugin-ci`). | -| **Listener on Linux nodes** | Upstream defaults the listener pod to **`kubernetes.io/os: linux`** — helpful on mixed OS clusters without extra config. | -| **`resourceMeta` labels/annotations** | Optional: merge **`ci-scripts/examples/arc-0.14-extra-values.yaml`** (commented patterns) via **`ARC_RUNNER_EXTRA_VALUES`**. | -| **Experimental charts** | **`gha-runner-scale-set-experimental`** exposes **`runner.dind.container.image`**, so you could point dind at your internal mirror **without** the Helm post-renderer — values shape differs (`scaleset`, `auth`, …); treat as a larger migration. OCI path: `oci://ghcr.io/actions/actions-runner-controller-charts/gha-runner-scale-set-experimental`. | +### IPI (self-managed) -The **stable** chart still hardcodes **`docker:dind`** in templates; this repo keeps **`ci-scripts/arc/setup-dind-mirror.sh`** + **`ci-scripts/arc/arc-dind-post-render.sh`** unless you adopt the experimental chart or cluster mirroring. +Uses `openshift-install` to create a fully self-managed OpenShift cluster on IBM Cloud VPC. No ROKS management fee; you own the control plane. 
-#### Dind image source +- **Zone format**: `us-south-1`, `eu-de-1` +- **Flavor**: `bx2-8x32` (hyphen format, auto-converted from dot format) +- **Base domain**: `cnv-ui.com` (registered in IBM Cloud CIS) +- **Required**: VPC Admin, COS Manager, DNS/CIS access, IAM Identity Admin, `OPENSHIFT_PULL_SECRET` GitHub secret -The stable chart embeds **`docker:dind`** (Docker Hub). This repo **always mirrors** that image into the OpenShift internal registry via **`ci-scripts/arc/setup-dind-mirror.sh`** and rewrites rendered manifests with the Helm post-renderer so runner pods pull **arc-docker-dind** from the cluster registry. The approach avoid docker hub pull throttling / rate limiting. - -### Docker-in-Docker (default) - -**`ci-scripts/arc/install-runner-scale-set.sh`** defaults **`CONTAINER_MODE=dind`**. The ARC chart adds a privileged **`docker:dind`** sidecar and wires the main runner container with `DOCKER_HOST=unix:///var/run/docker.sock` so workflows can run: - -- `docker build` / `docker run` in steps -- `container:` jobs and container actions that need a Docker daemon - -The custom runner image is still the **main** `runner` container; the chart merges your `template.spec.containers[runner]` with dind-specific env and volume mounts (see upstream `gha-runner-scale-set` `_helpers.tpl`). - -**OpenShift:** the `github-arc` SCC in `ci-scripts/arc/arc-openshift-scc.yaml` allows **privileged** containers and **RunAsAny** for UIDs so the `docker:dind` sidecar can run as root while the runner container stays at 1001/123 via Helm. This is broader than a “restricted-only” SCC; scope runner namespaces and RBAC accordingly. - -To turn off dind (no Docker daemon in the pod): `export CONTAINER_MODE=none` and re-run **`ci-scripts/arc/install-runner-scale-set.sh`**. 
- -### Workflow file ↔ Actions UI name - -| File | `name:` in workflow (shown in Actions tab) | -| -------------------------------------------------- | ------------------------------------------ | -| `.github/workflows/ibmc-cluster-setup.yml` | IBM Cloud Hot Cluster Setup | -| `.github/workflows/ibmc-cluster-teardown.yml` | IBM Cloud Hot Cluster Teardown | -| `.github/workflows/ibmc-cluster-auto-teardown.yml` | IBM Cloud Hot Cluster Auto-Teardown | -| `.github/workflows/hot-cluster-e2e.yml` | Hot Cluster E2E | -| `.github/workflows/hot-cluster-e2e-run.yml` | Hot Cluster E2E Run | - -### Setting up the hot cluster - -1. Actions → **IBM Cloud Hot Cluster Setup** → Run workflow -2. Select **infrastructure type**: `classic` or `vpc` (VPC recommended — simpler IAM) -3. Inputs: cluster name, zone, OpenShift version, worker flavor/count, KVM emulation -4. Wait for completion (provisioning time depends on flavor; setup includes HCO, ARC runner image build, ARC controller + scale set, ci-env-controller, `check-cluster-health.sh`) - -#### VPC path (recommended) +## Required GitHub Secrets -- **Zone format**: `us-south-1`, `us-south-2`, `eu-de-1`, etc. -- **Flavor**: VPC flavors like `bx2.8x32`, `cx2.4x8` (list with `ibmcloud oc flavors --zone --provider vpc-gen2`) -- **COS instance CRN**: Required for internal registry. Find with `ibmcloud resource service-instances --service-name cloud-object-storage --long` -- **IAM**: Only needs **VPC Infrastructure Administrator** — no classic Super User or SoftLayer permissions -- VPC, subnet, and public gateway are auto-created and reused across runs +### IBM Cloud -#### Classic path +| Secret | Description | Required for | +|--------|-------------|-------------| +| `IC_KEY` | IBM Cloud IAM API key | All paths | +| `OPENSHIFT_PULL_SECRET` | Red Hat pull secret (from console.redhat.com) | IPI only | -- **Zone format**: `wdc04`, `fra02`, `dal10`, etc. 
-- **Flavor**: Classic flavors like `m3c.8x64`, `mb4c.4x32` -- **IAM**: Requires **Classic Infrastructure Super User** (SoftLayer permissions) -- VLANs are auto-discovered or created +### ARC Authentication (choose one) -**Implementation notes:** Both paths converge after cluster creation. Setup installs `oc` from the cluster downloads endpoint, then runs `install-hco.sh`, `setup-arc-runner-image.sh`, `install-arc-controller.sh`, `install-runner-scale-set.sh`, and `install-ci-env-controller.sh`. +| Secret | Description | +|--------|-------------| +| `ARC_GITHUB_APP_ID` + `ARC_GITHUB_APP_INSTALL_ID` + `ARC_GITHUB_APP_PRIVATE_KEY` | GitHub App (recommended) | +| `ARC_GITHUB_PAT` | Fine-grained PAT (simpler, less secure) | -### Running hot cluster E2E tests +### Optional -1. Actions → **Hot Cluster E2E** (PR trigger or manual dispatch) -2. Inputs: Playwright project (`gating` or `features`), cluster name (default `kubevirt-plugin-ci`) -3. Health check on `ubuntu-latest`; on success calls **Hot Cluster E2E Run** -4. Run workflow provisions a `ci-test-stack`, runs Playwright, uploads artifacts, releases the stack +| Secret | Description | +|--------|-------------| +| `BOT_PAT` | PAT with repo admin scope for ghost runner cleanup | -To run only the test jobs (cluster already verified): dispatch **Hot Cluster E2E Run** directly. +## Setting Up the Hot Cluster -### Tearing down the cluster +1. Go to Actions → **IBM Cloud Hot Cluster Setup** → Run workflow +2. Select infrastructure type, zone, flavor, worker count +3. Wait for completion (30-60 min depending on infrastructure type) -**Manual:** Actions → **IBM Cloud Hot Cluster Teardown** +All paths converge after cluster creation: the workflow installs HCO, builds the ARC runner image, installs ARC controller + scale set, deploys the ci-env-controller, and runs health checks. -**Automatic:** Idle detection monitors `hot-cluster-e2e.yml` and `hot-cluster-e2e-run.yml` for in-progress, queued, or recently completed runs. 
+## Running E2E Tests -**Teardown implementation:** Uninstalls Helm releases `kubevirt-plugin-ci` (scale set) and `arc` (controller) when possible, deletes the ROKS cluster, then optionally removes offline GitHub runners labeled `kubevirt-plugin-ci` using `BOT_PAT`. +1. Actions → **Hot Cluster E2E** → Run workflow (or triggered on PR) +2. Health check verifies cluster is reachable +3. Plugin image is built and pushed to `ttl.sh` (2h TTL) +4. Test environment is provisioned via ci-env-controller (ConfigMap-driven) +5. Playwright gating tests run against the in-cluster console +6. Artifacts are uploaded, test environment is released -## ARC on OpenShift vs [na-launch/github-arc](https://github.com/na-launch/github-arc/blob/main/README.md) +## Teardown -The [na-launch/github-arc](https://github.com/na-launch/github-arc/blob/main/README.md) README is a concise Helm + OpenShift recipe: fixed env vars for namespaces and installation names, explicit `serviceAccount.name=-gha-rs-controller` on the controller chart, matching `controllerServiceAccount` on the scale set, apply SCC + ClusterRole, then `oc policy add-role-to-user system:openshift:scc:github-arc -z -gha-rs-no-permission`. 
+**Manual:** Actions → **IBM Cloud Hot Cluster Teardown** (select infrastructure type) -| Topic | na-launch/github-arc | This repo (`ci-scripts/arc/*`) | -| ----------------------- | ------------------------------------------------------------------ | --------------------------------------------------------------------------------------------------------------- | -| Controller SA name | `${GITHUB_ARC_SYSTEM_INSTALLATION_NAME}-gha-rs-controller` | `${ARC_CONTROLLER_INSTALL_NAME}-gha-rs-controller` (default install name `arc`) | -| Scale set → controller | `controllerServiceAccount.name` + `.namespace` | Same when OpenShift is detected | -| SCC + `use` ClusterRole | Separate `manifests/scc.yaml` + `cluster-role.yaml` | Single `ci-scripts/arc/arc-openshift-scc.yaml` (equivalent) | -| Runner policy | `oc policy add-role-to-user ... -z -gha-rs-no-permission` | Same pattern | -| Auth | PAT in `--set` | **GitHub App** (recommended) or PAT; App IDs forced to strings to avoid Helm float bugs | -| Runner pod template | Checked-in `values.yaml` | `ci-scripts/arc/arc-runner-scale-set.pod.yaml` + optional `ARC_RUNNER_EXTRA_VALUES` | -| Container jobs | Not emphasized | **Default `CONTAINER_MODE=dind`** (privileged `docker:dind`); SCC allows privileged / RunAsAny for that sidecar | +**Automatic:** The auto-teardown workflow runs every 30 minutes and tears down the cluster after 2 hours of CI inactivity. It detects both ROKS clusters (via `ibmcloud oc`) and IPI clusters (via DNS probe). -**Additional runner scale sets (same as na-launch):** Keep `ARC_CONTROLLER_NS` and `ARC_CONTROLLER_INSTALL_NAME` unchanged. **Do not** re-run **`ci-scripts/arc/install-arc-controller.sh`**. Set `ARC_RUNNERS_NS`, `RUNNER_SCALE_SET_NAME`, and `ARC_CONFIG_URL` (and auth). Run **`SKIP_ARC_RUNNER_RBAC=1` `./ci-scripts/arc/install-runner-scale-set.sh`** so the shared `ClusterRoleBinding` **arc-runner-ci** is not overwritten (it only lists one ServiceAccount). 
The scale-set script still binds SCC **github-arc** to the new runner SA; add **ClusterRole** access for CI: +For IPI teardown, provide the `ipi_setup_run_id` (the GitHub Actions run ID from the setup workflow) so the teardown can download the install state artifacts needed by `openshift-install destroy cluster`. -`oc adm policy add-cluster-role-to-user arc-runner-ci -z -gha-rs-no-permission -n ` +## Scripts -You do **not** need to re-apply `ci-scripts/arc/arc-openshift-scc.yaml`. +| Script | Purpose | +|--------|---------| +| `install-hco.sh` | Installs HCO operator, HPP storage, and virtctl | +| `check-cluster-health.sh` | Verifies cluster, HCO, ARC, storage, console | +| `check-roks-cluster-state.sh` | Polls until ROKS cluster is ready | +| `log-ibmcloud-iam-diagnostics.sh` | Logs IAM permissions for debugging (classic, VPC, and IPI) | +| `arc/install-arc-controller.sh` | Installs ARC controller (once per cluster) | +| `arc/install-runner-scale-set.sh` | Installs ARC runner scale set | +| `arc/setup-dind-mirror.sh` | Mirrors `docker:dind` to internal registry | +| `ci-env/install-ci-env-controller.sh` | Installs the ConfigMap-driven CI environment controller | -## Scripts +See [`arc/README.md`](arc/README.md) for ARC-specific details and [`ci-env/README.md`](ci-env/README.md) for the ci-env-controller. 
-| Script | Purpose | -| --------------------------------- | --------------------------------------------------------------------------------- | -| `install-hco.sh` | Installs HCO operator, HPP storage, and virtctl | -| `arc/setup-dind-mirror.sh` | Mirror `docker:dind` to internal registry; write `generated/arc-dind-replace.env` | -| `arc/setup-runner-image.sh` | OpenShift binary build for custom ARC runner image | -| `arc/install-arc-controller.sh` | SCC + Helm `gha-runner-scale-set-controller` (once per cluster) | -| `arc/install-runner-scale-set.sh` | Helm `gha-runner-scale-set`, SCC bind, `arc-runner-rbac.yaml` | -| `arc/README.md` | ARC on OpenShift setup guide | -| `check-cluster-health.sh` | Verifies cluster, HCO, ARC, storage, console; optional GitHub runner check | -| `check-roks-cluster-state.sh` | Waits until ROKS cluster is usable (used by setup workflow) | -| `resolve-console-image.sh` | Emits `CONSOLE_IMAGE` tag **x.y** from `ClusterVersion` for off-cluster console | -| `start-plugin-container.sh` | Runs plugin image with TLS + `nginx-9443.conf` (Docker dind–safe cert paths) | -| `start-console.sh` | Runs `origin-console` off-cluster; `BRIDGE_PLUGIN_PROXY` + kubevirt API route | -| `nginx-9443.conf` | Nginx config for plugin HTTPS (mounted into plugin container in POC test2) | - -### Script Configuration +## Script Configuration All scripts accept configuration via environment variables. See the header comments in each script for details. 
Key defaults: -- `KVM_EMULATION=false` (bare metal has real KVM) +- `KVM_EMULATION=false` (bare metal has real KVM; set `true` for VPC/shared) - `RUNNER_SCALE_SET_NAME=kubevirt-plugin-ci` (the `runs-on:` label) -- `ARC_CONTROLLER_INSTALL_NAME=arc` (Helm release for controller; OpenShift SA `arc-gha-rs-controller`) -- `MIN_RUNNERS=0` -- `MAX_RUNNERS=5` -- `CONTAINER_MODE=dind` (Docker-in-Docker for container jobs / `docker` in workflows) -- `ARC_VERSION=0.14.0` (default pinned chart; `ARC_VERSION=latest` floats OCI default tag) -- `ARC_SCALE_SET_LABELS` (optional multilabel; requires matching `runs-on` array in workflows) -- Additional scale sets: run only **`ci-scripts/arc/install-runner-scale-set.sh`** (skip **`ci-scripts/arc/install-arc-controller.sh`**) - -## Follow-up work - -See [docs/HOT_CLUSTER_FUTURE_WORK.md](../docs/HOT_CLUSTER_FUTURE_WORK.md) for RBAC hardening, FIPS, ci-env-controller setup gap, and workflow hygiene items. - -Quick checklist: - -1. **Health check first** — Run **Hot Cluster E2E** (or health-check job only) to isolate cluster/HCO issues from test-stack issues. -2. **ci-env-controller** — Install once on the cluster if not already present (`./dev/ci-env.sh`). -3. **ARC on org repo** — Runners must register to `kubevirt-ui/kubevirt-plugin`, not a fork. -4. **Auto-teardown** — Confirm idle detection watches `hot-cluster-e2e.yml` and `hot-cluster-e2e-run.yml`. 
- -## Production and hardening review (before treating POC patterns as prod) - -| Technique / choice | Risk or limitation | Hardening direction | -| ------------------------------------------------------------ | -------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------- | -| **Privileged dind + broad SCC (`github-arc`)** | High blast radius on the node; runner compromise ≈ strong cluster access | Narrow SCC, dedicated nodes, separate scale sets per trust zone, audit images | -| **`oc whoami --show-token` in `start-console.sh`** | Long-lived bearer token injected into console container env | Short-lived tokens, dedicated read-only SA, rotate; never log unmasked | -| **Self-signed TLS for plugin (`start-plugin-container.sh`)** | MITM within pod network if mis-scoped | Match operator-style serving certs; trust only where `InsecureSkipVerify` is explicit | -| **`ttl.sh` or ephemeral public registries** | Ephemeral tags, no provenance, rate/abuse limits | Internal registry + image signing, digest pinning | -| **Skip `npm audit` / `--ignore-scripts`** | Supply-chain and lifecycle scripts not run | Revisit for production pipelines; use lockfile + audited base images | -| **Cluster-scoped mutations in `test-setup.sh`** | Variant A may patch shared ConfigMaps | Prefer namespaced fixtures or dedicated test clusters | -| **Ghost runner cleanup via `BOT_PAT`** | PAT scope and rotation | GitHub App or org-level runner management; least privilege | -| **Auto-teardown idle heuristic** | Monitors only the two E2E test workflows; a cluster used by other workflows may be torn down early | Tie to runner job queue or explicit "last test" workflow | -| **Classic ROKS only in setup workflow** | Not IBM Cloud VPC Gen2 path | Add a parallel path or doc if prod standardizes on VPC | - -## POC completion score - -**~55%** — **Infrastructure path is largely in 
place** (IBM Cloud provisioning, HCO, ARC 0.14, dind mirror, health checks, two E2E workflows, diagnostics artifacts). **Productization is incomplete**: test2 still carries registry/image shortcuts, TODOs, and a narrower Cypress entrypoint; **end-to-end “green main”** on real clusters is not yet documented as achieved. Raising the score to **~80%** means repeatable green runs on both POC variants with pinned images, no hard-coded TTL tags, and either stable full gating or an agreed minimal gate with flake budget. +- `ARC_CONTROLLER_INSTALL_NAME=arc` (Helm release for controller) +- `CONTAINER_MODE=dind` (Docker-in-Docker for container jobs) +- `ARC_VERSION=0.14.0` (pinned Helm chart version) ## Cost Control -Bare metal nodes on IBM Cloud are expensive. The auto-teardown workflow provides automatic cost control: +The auto-teardown workflow provides automatic cost control: - Runs every 30 minutes via cron -- Checks if any CI jobs are in-progress or queued +- Checks if any E2E jobs are in-progress or queued - If idle for more than 2 hours, triggers the teardown workflow - Worst case: an idle cluster runs ~2.5 hours before teardown -**Important**: Always verify the cluster has been torn down if you're done testing. The auto-teardown is a safety net, not a substitute for manual cleanup. +Always verify the cluster has been torn down when done testing. The auto-teardown is a safety net, not a substitute for manual cleanup. + +## Known Limitations + +- **Privileged dind + broad SCC**: The `github-arc` SCC allows privileged containers for the Docker-in-Docker sidecar. Scope runner namespaces and RBAC accordingly. +- **`ttl.sh` for plugin images**: Plugin images use ephemeral `ttl.sh` tags with 2h TTL per CI run. Suitable for CI but not for long-term storage. +- **Ghost runner cleanup**: Requires `BOT_PAT` with repo admin scope. Without it, offline runners must be cleaned up manually. 
## Troubleshooting -### Cluster setup fails during provisioning +### Cluster setup fails - Check IBM Cloud status page for outages -- Verify the API key has sufficient permissions -- Bare metal availability varies by region; try a different zone +- Verify the API key has sufficient permissions (expand the **IAM diagnostics** step log or download the artifact) +- For classic: check `ibmcloud ks infra-permissions get --region <region>` +- For VPC/IPI: check VPC Infrastructure permissions ### ARC runners not registering @@ -358,36 +176,14 @@ Bare metal nodes on IBM Cloud are expensive. The auto-teardown workflow provides - Check individual component status: `oc get pods -n kubevirt-hyperconverged` - Verify storage: `oc get storageclass` -### `npm ci` fails in kubevirt-plugin-ci job - -- **"package-lock.json is out of sync"**: Run `npm install` locally and commit the updated `package-lock.json`. -- **Node/npm version**: The workflow uses Node 22; the runner image must provide a compatible Node (or use `actions/setup-node`). Check the "Install dependencies" step log for `node -v` and `npm -v`. -- **Network**: The runner must reach the npm registry. If the cluster restricts egress, allow `registry.npmjs.org` (and any private registries). -- **RW Access**: The runner must have writable volumes for npm global configuration and package caching +### Auto-teardown not triggering -### Auto-teardown never triggers teardown - -- The scheduled job needs **`permissions: actions: write`** to dispatch the teardown workflow; confirm it is not overridden by org policy. -- **`workflow_id`** in the dispatch step must match the teardown workflow filename on the default branch (**`ibmc-cluster-teardown.yml`**). A mismatch silently prevents teardown from running.
- -### Ghost runners after failed teardown - -- Go to repository Settings → Actions → Runners -- Manually delete any offline runners -- Or run the teardown workflow again (it includes ghost runner cleanup) - -### ARC runner `oc` / `kubectl` permissions - -Jobs on `kubevirt-plugin-ci` use the ARC scale set ServiceAccount **`kubevirt-plugin-ci-gha-rs-no-permission`** in `arc-runners`, not `default`. Without RBAC, `oc` steps fail with Forbidden. - -**Default:** **`ci-scripts/arc/install-runner-scale-set.sh`** applies **`ci-scripts/arc/arc-runner-rbac.yaml`** after the scale set install (ClusterRole **`arc-runner-ci`** bound to **`${RUNNER_SCALE_SET_NAME}-gha-rs-no-permission`** in **`${ARC_RUNNERS_NS}`**). Set **`SKIP_ARC_RUNNER_RBAC=1`** if you manage bindings yourself. - -**Manual apply** (defaults only, or after skipping): - -```bash -oc apply -f ci-scripts/arc/arc-runner-rbac.yaml -``` +- The scheduled job needs `permissions: actions: write` to dispatch teardown +- `workflow_id` in the dispatch step must match the teardown workflow filename on the default branch +- For IPI clusters: auto-teardown detects them via DNS probe (`api.<cluster_name>.cnv-ui.com`) -If `RUNNER_SCALE_SET_NAME` or `ARC_RUNNERS_NS` differ from defaults, edit the `subjects` block or rely on **`ci-scripts/arc/install-runner-scale-set.sh`** substitution. -For a disposable or single-tenant cluster you can instead grant full cluster-admin by using the alternative ClusterRoleBinding described in the comments at the top of `ci-scripts/arc/arc-runner-rbac.yaml`.
+- Run `npm install` locally and commit the updated `package-lock.json` +- Check Node/npm version compatibility (runner image provides Node 22) +- Verify the runner can reach `registry.npmjs.org` diff --git a/cypress/tests/gating/poc-check-tab-yaml.cy.ts b/cypress/tests/gating/poc-check-tab-yaml.cy.ts deleted file mode 100644 index dc94aa8b41..0000000000 --- a/cypress/tests/gating/poc-check-tab-yaml.cy.ts +++ /dev/null @@ -1,232 +0,0 @@ -import { ALL_PROJ_NS, MINUTE, SECOND, TEST_NS, VM_STATUS } from '../../utils/const/index'; -import { Example, YAML } from '../../utils/const/string'; -import { TEMPLATE } from '../../utils/const/template'; -import * as sel from '../../views/selector'; -import { userButtonTxt } from '../../views/selector-instance'; -import { navigateToConfigurationSubTab, subTabName, tab } from '../../views/tab'; - -describe('Check all virtualization pages can be loaded', () => { - before(() => { - cy.beforeSpec(); - cy.visitVMsVirt(); - }); - - describe('Check VirtualMachines page', () => { - it('start example vm', () => { - cy.byLegacyTestID(Example).click(); - cy.get(sel.iconStartBtn, { timeout: MINUTE }).click(); - cy.wait(15 * SECOND); - }); - - it( - 'check the status of example vm', - { - retries: { - runMode: 8, - }, - }, - () => { - cy.contains(sel.vmStatusOnOverview, VM_STATUS.Running).should('be.visible'); - cy.wait(10 * SECOND); - }, - ); - - it('vm tabs are loaded', () => { - cy.contains('Hostname').should('be.visible'); - - tab.navigateToMetrics(); - cy.contains('Utilization').should('be.visible'); - - tab.navigateToYAML(); - cy.contains('Download').should('be.visible'); - - tab.navigateToEvents(); - cy.contains('event').should('be.visible'); - - tab.navigateToConsole(); - cy.contains('Guest login credentials').should('be.visible'); - - tab.navigateToSnapshots(); - cy.contains('No snapshots found').should('be.visible'); - - tab.navigateToDiagnostics(); - cy.contains('Status conditions').should('be.visible'); - - 
tab.navigateToDiagnosticsGuestSystemLog(); - cy.contains('Guest system log').should('be.visible'); - - tab.navigateToConfiguration(); - cy.contains('Headless mode').should('be.visible'); - - navigateToConfigurationSubTab(subTabName.Storage); - cy.contains('rootdisk').should('be.visible'); - - navigateToConfigurationSubTab(subTabName.Network); - cy.contains('Pod networking').should('be.visible'); - - navigateToConfigurationSubTab(subTabName.Scheduling); - cy.contains('Scheduling and resource requirements').should('be.visible'); - - navigateToConfigurationSubTab(subTabName.SSH); - cy.contains('SSH access').should('be.visible'); - - navigateToConfigurationSubTab(subTabName.InitialRun); - cy.contains('Cloud-init').should('be.visible'); - - navigateToConfigurationSubTab(subTabName.Metadata); - cy.contains('Annotations').should('be.visible'); - }); - - it('vmi tabs are loaded', () => { - tab.navigateToOverview(); - cy.contains('VirtualMachineInstance').should('be.visible'); - cy.byLegacyTestID(Example).click(); - - cy.contains('Annotations').should('be.visible'); - - tab.navigateToYAML(); - cy.contains('Download').should('be.visible'); - - tab.navigateToScheduling(); - cy.contains('Tolerations').should('be.visible'); - - tab.navigateToEvents(); - cy.contains('event').should('be.visible'); - - tab.navigateToConsole(); - cy.contains('Guest login credentials').should('be.visible'); - - tab.navigateToNetworks(); - cy.contains('Pod networking').should('be.visible'); - - tab.navigateToDisks(); - cy.contains('rootdisk').should('be.visible'); - }); - }); - - // describe('Check Templates page', () => { - // it('visit template page', () => { - // cy.visitTemplates(); - // cy.switchProject(ALL_PROJ_NS); - // }); - - // it('common template tabs are loaded', () => { - // cy.get(sel.nameFilter).type(TEMPLATE.RHEL9.metadataName); - // cy.byLegacyTestID(TEMPLATE.RHEL9.metadataName).click(); - - // cy.contains('Display name').should('be.visible'); - // cy.contains('not 
editable').should('be.visible'); - - // tab.navigateToYAML(); - // cy.contains('Download').should('be.visible'); - - // tab.navigateToScheduling(); - // cy.contains('Tolerations').should('be.visible'); - - // tab.navigateToNetworks(); - // cy.contains('Pod networking').should('be.visible'); - - // tab.navigateToDisks(); - // cy.contains('rootdisk').should('be.visible'); - - // tab.navigateToScripts(); - // cy.contains('Cloud-init').should('be.visible'); - - // tab.navigateToParameters(); - // cy.contains('DATA_SOURCE_NAME').should('be.visible'); - // }); - - // it('create example template', () => { - // cy.switchProject(TEST_NS); - // cy.get(sel.itemCreateBtn).click(); - // cy.get(sel.saveBtn).click(); - // }); - - // it('custom template tabs are loaded', () => { - // cy.contains('Display name').should('be.visible'); - - // tab.navigateToYAML(); - // cy.contains('Download').should('be.visible'); - - // tab.navigateToScheduling(); - // cy.contains('Tolerations').should('be.visible'); - - // tab.navigateToNetworks(); - // cy.contains('Pod networking').should('be.visible'); - - // tab.navigateToDisks(); - // cy.contains('rootdisk').should('be.visible'); - - // tab.navigateToScripts(); - // cy.contains('Cloud-init').should('be.visible'); - - // tab.navigateToParameters(); - // cy.contains('CLOUD_USER_PASSWORD').should('be.visible'); - // }); - // }); - - // describe('Check InstanceTypes tabs', () => { - // it('instanceTypes page is loaded', () => { - // cy.visitITs(); - // cy.contains('cx1.2xlarge').should('exist'); - // }); - - // it('create VirtualMachineClusterInstanceType from YAML', () => { - // cy.get('div.co-m-list').find(sel.itemCreateBtn).eq(0).click(); - // cy.get(sel.saveBtn).click(); - // cy.get(sel.breadcrumb).click(); - // cy.get(sel.nameFilter).first().type(Example); - // cy.byLegacyTestID(Example).should('exist'); - // cy.byLegacyTestID('cx1.2xlarge').should('not.exist'); - // }); - - // it('create VirtualMachineInstanceType from YAML', () => { - // 
cy.contains('span.pf-v6-c-tabs__item-text', userButtonTxt).click(); - // cy.switchProject(TEST_NS); - // cy.get(sel.itemCreateBtn).click(); - // cy.get(sel.saveBtn).click(); - // cy.get(sel.breadcrumb).click(); - // cy.byLegacyTestID(Example).should('exist'); - // }); - // }); - - // describe('Check Bootable volumes page', () => { - // it('bootable volume page is loaded', () => { - // cy.visitVolumes(); - // cy.switchProject(ALL_PROJ_NS); - // cy.contains('fedora').should('exist'); - // }); - - // it('create bootable volume from YAML', () => { - // cy.switchProject(TEST_NS); - // cy.wait(3000); - // cy.get(sel.itemCreateBtn).click(); - // cy.byButtonText(YAML).click(); - // cy.get(sel.saveBtn).click(); - // cy.byLegacyTestID(Example).should('exist'); - // }); - // }); - - // describe('Check MigrationPolicies page', () => { - // it('migration policy page is loaded', () => { - // cy.visitMPs(); - // cy.contains('No MigrationPolicies found').should('exist'); - // }); - - // it('create migration policy from YAML', () => { - // cy.get(sel.itemCreateBtn).click(); - // cy.byButtonText(YAML).click(); - // cy.get(sel.saveBtn).click(); - // cy.get('.pf-v6-c-breadcrumb__item').eq(0).click(); - // cy.byLegacyTestID(Example).should('exist'); - // }); - // }); - - // describe('Check Checkups tabs', () => { - // it('storage checkup pages is loaded', () => { - // cy.visitCheckups(); - // cy.contains('.pf-v6-c-tabs__item-text', 'Storage').click(); - // cy.contains('No storage checkups found').should('exist'); - // }); - // }); -}); diff --git a/cypress/tests/poc-gating.cy.ts b/cypress/tests/poc-gating.cy.ts deleted file mode 100644 index c3ce787bf3..0000000000 --- a/cypress/tests/poc-gating.cy.ts +++ /dev/null @@ -1,3 +0,0 @@ -import './setup/setup.cy.ts'; -import './setup/shared-vm.cy.ts'; -import './gating/check-tab-yaml.cy.ts'; diff --git a/docs/HOT_CLUSTER_CI_STATUS.md b/docs/HOT_CLUSTER_CI_STATUS.md deleted file mode 100644 index 7b1814a56b..0000000000 --- 
a/docs/HOT_CLUSTER_CI_STATUS.md +++ /dev/null @@ -1,224 +0,0 @@ -# Hot Cluster CI — Status & Follow-Up (2026-06-24) - -## Overview - -PR [#4099](https://github.com/kubevirt-ui/kubevirt-plugin/pull/4099) moves E2E testing from Prow (ephemeral AWS clusters) to a persistent IBM Cloud hot cluster with GitHub Actions + ARC. This document summarizes the current state and next steps. - ---- - -## Three Provisioning Paths Explored - -The setup workflow (`ibmc-cluster-setup.yml`) now supports three `infrastructure_type` options: - -| Path | Status | Remaining Blocker | -| ---------------------- | ------- | ----------------------------------------------------- | -| **Classic ROKS** | Blocked | E73e6 — Classic Infrastructure Super User missing | -| **VPC ROKS** | Blocked | E4acb — COS service-to-service authorization missing | -| **IPI (self-managed)** | Blocked | DNS zone for `cnv-ui.com` not configured in IBM Cloud | - ---- - -## What Works (Confirmed in CI Runs) - -| Component | Status | Run Evidence | -| ---------------------------------------------- | ------------------------- | ------------------------------------------------------------------------------------------------------ | -| IBM Cloud CLI login (`IC_KEY`) | OK | All runs | -| Kubernetes Service Administrator | OK (was A0010, now fixed) | Runs after Jun 24 | -| VPC Infrastructure Administrator | OK | VPC created in [#28114675220](https://github.com/kubevirt-ui/kubevirt-plugin/actions/runs/28114675220) | -| VPC + subnet + public gateway creation | OK | Auto-created and reused across runs | -| COS instance creation | OK | `kubevirt-plugin-ci-cos` active | -| IAM Identity Service | OK | Can list/create service IDs | -| `openshift-install` download + auth | OK | [#28116978024](https://github.com/kubevirt-ui/kubevirt-plugin/actions/runs/28116978024) | -| IPI flavor validation (`bx2-4x16`, `bx2-8x32`) | OK | Same run | -| `OPENSHIFT_PULL_SECRET` GitHub secret | OK | Added Jun 24 | -| IPI diagnostics mode | OK | 
[#28116118038](https://github.com/kubevirt-ui/kubevirt-plugin/actions/runs/28116118038) (green) | - ---- - -## Blocker Details - -### 1. VPC ROKS — COS Authorization (One Command Fix) - -Error: - -``` -Could not find the specified cloud object storage instance because it does not exist -or the API key that is set for this resource group and region has inadequate permissions. (E4acb) -``` - -**Fix:** - -```bash -ibmcloud iam authorization-policy-create containers-kubernetes cloud-object-storage Reader -``` - -This creates a service-to-service authorization policy allowing Kubernetes Service to read COS instances. One-time, account-level. - -### 2. IPI — DNS Zone Required - -Error: - -``` -failed to generate asset "DNS Config": failed to get DNS zone ID: -DNS zone "ipi-test.ibmcloud.local" not found -``` - -**Fix:** Register `cnv-ui.com` (or `ci.cnv-ui.com`) in IBM Cloud Internet Services (CIS): - -```bash -# Create CIS instance -ibmcloud resource service-instance-create cnv-ui-cis internet-svcs standard-next global -g cnv-ui - -# Add domain -ibmcloud cis instance-set cnv-ui-cis -ibmcloud cis domain-add cnv-ui.com - -# Then update domain registrar NS records to point to IBM Cloud nameservers -``` - -After NS propagation, update `baseDomain` in the IPI install-config to `cnv-ui.com`. - -### 3. Classic ROKS — Infrastructure Permissions - -Error: - -``` -The classic infrastructure permissions that are set for the region and resource group -do not have the required permissions to perform this action. 
(E73e6) -``` - -**Fix:** Grant Classic Infrastructure Super User to `IC_KEY` identity, or at minimum: - -- Add Server, Cancel Server, View Virtual Server Details -- IPMI Remote Management, OS Reloads and Rescue Kernel -- Add/Edit/View Support Case -- Add Compute with Public Network Port - ---- - -## Workflow Inputs Reference - -``` -Actions → IBM Cloud Hot Cluster Setup → Run workflow - -infrastructure_type: classic | vpc | ipi -cluster_name: kubevirt-plugin-ci -zone: wdc04 (classic), us-south-1 (vpc/ipi) -openshift_version: 4.20_openshift -worker_flavor: bx2.8x32 (ROKS dot format) / bx2-8x32 (IPI hyphen format, auto-converted) -worker_count: 2 -kvm_emulation: true -cos_instance_crn: (leave empty — auto-created for VPC) -``` - ---- - -## GitHub Secrets Required - -| Secret | Purpose | Status | -| -------------------------------------- | ------------------------------- | -------------------------- | -| `IC_KEY` | IBM Cloud API key | Org secret (exists) | -| `ARC_GITHUB_APP_*` or `ARC_GITHUB_PAT` | ARC runner registration | Org secret (exists) | -| `OPENSHIFT_PULL_SECRET` | Red Hat registry auth for IPI | Repo secret (added Jun 24) | -| `BOT_PAT` | Ghost runner cleanup (optional) | Repo secret (exists) | - ---- - -## IBM Cloud Account Info (from diagnostics) - -``` -User: mschatzm@redhat.com -Account: Virtualization (2be0cd841378412882ec2fb4a99951e2) -Account Owner: dkenigsb@redhat.com -Resource Group: cnv-ui -CLI Region: eu-de -``` - ---- - -## VPC Resources Already Created (reusable) - -| Resource | ID | Zone | -| -------------- | ----------------------------------------------------------------------------------------------------------------------------------- | ---------- | -| VPC | `r006-2342eba0-b3be-412d-bbe3-ea040609b26d` | us-south | -| Subnet | `0717-aaacfab4-8863-4fc1-8e0c-8ad739f97504` | us-south-1 | -| Public Gateway | `r006-29e78258-1637-43df-b210-f2d920f68b27` | us-south-1 | -| COS Instance | `kubevirt-plugin-ci-cos` (CRN: 
`crn:v1:bluemix:public:cloud-object-storage:global:a/2be0cd841378412882ec2fb4a99951e2:6e32949d-...`) | global | - -These will be reused automatically on the next VPC or IPI run. - ---- - -## Tomorrow's Action Plan - -### If DNS is configured (`cnv-ui.com` in CIS): - -1. Update `baseDomain` in the IPI install-config step (change `ipi-test.ibmcloud.local` → `cnv-ui.com`) -2. Trigger: `infrastructure_type=ipi`, `zone=us-south-1` -3. Wait ~45 min for cluster creation -4. If successful, continue with HCO + ARC + ci-env-controller install -5. Then trigger Hot Cluster E2E - -### If COS authorization is granted instead (VPC ROKS path): - -1. Trigger: `infrastructure_type=vpc`, `zone=us-south-1` -2. COS instance already exists — cluster create should succeed -3. Wait for cluster ready (~30 min) -4. HCO, ARC, ci-env-controller install automatically -5. Then trigger Hot Cluster E2E - -### Quick test to verify either fix: - -```bash -# Test VPC ROKS (after COS auth): -gh workflow run "IBM Cloud Hot Cluster Setup" \ - --repo kubevirt-ui/kubevirt-plugin \ - --ref CNV-74265-hot-cluster-ci \ - -f infrastructure_type=vpc \ - -f cluster_name=kubevirt-plugin-ci \ - -f zone=us-south-1 - -# Test IPI (after DNS): -gh workflow run "IBM Cloud Hot Cluster Setup" \ - --repo kubevirt-ui/kubevirt-plugin \ - --ref CNV-74265-hot-cluster-ci \ - -f infrastructure_type=ipi \ - -f cluster_name=kubevirt-plugin-ci \ - -f zone=us-south-1 -``` - ---- - -## Key Commits on Branch (Jun 24) - -| Commit | Description | -| ----------- | ------------------------------------------------ | -| `005efee7c` | IAM diagnostics visible in logs + artifacts | -| `756963362` | Add VPC Gen2 provisioning path alongside classic | -| `bb89825b6` | Add IPI prerequisite checks to diagnostics | -| `660062dba` | Add `ipi` diagnostics-only mode | -| `2aa1b338f` | Add IPI cluster creation path (experimental) | -| `cf4e298a0` | Fix IPI flavor format (bx2-4x16 not bx2.4x16) | - ---- - -## Architecture Diagram - -``` 
-infrastructure_type = classic → Verify zone → VLAN lookup → ibmcloud oc cluster create classic -infrastructure_type = vpc → VPC/subnet/gateway → COS → ibmcloud oc cluster create vpc-gen2 -infrastructure_type = ipi → openshift-install create cluster (DNS + VPC + VMs) - ↓ (all paths converge) - Wait for cluster → Install oc → HCO → ARC → ci-env-controller - ↓ - Hot Cluster E2E → Playwright gating tests -``` - ---- - -## Cost Notes - -- VPC resources (VPC, subnet, gateway) are free when idle — no VMs running -- COS instance exists but has no buckets yet — negligible cost -- No VMs were provisioned in any IPI run (failed before that stage) -- Auto-teardown (2h idle) applies once a cluster is running From cd8da562f469f1c4ea8db53a32bc81fb5065e23e Mon Sep 17 00:00:00 2001 From: Gal Kremer Date: Wed, 24 Jun 2026 17:13:38 -0400 Subject: [PATCH 33/42] ci: trigger workflow re-index Co-authored-by: Cursor From 4e504a76a187b34e1130cfa5f7e6b884e9adadc8 Mon Sep 17 00:00:00 2001 From: Gal Kremer Date: Wed, 24 Jun 2026 17:15:28 -0400 Subject: [PATCH 34/42] ci(hot-cluster): fix invalid YAML from heredoc and use template file Move install-config.yaml content to a standalone template file (ci-scripts/ipi-install-config.yaml.tpl) and use envsubst to substitute variables. Fixes the workflow YAML validation error that prevented workflow_dispatch from working. 
Co-authored-by: Cursor --- .github/workflows/ibmc-cluster-setup.yml | 40 ++---------------------- ci-scripts/ipi-install-config.yaml.tpl | 35 +++++++++++++++++++++ 2 files changed, 38 insertions(+), 37 deletions(-) create mode 100644 ci-scripts/ipi-install-config.yaml.tpl diff --git a/.github/workflows/ibmc-cluster-setup.yml b/.github/workflows/ibmc-cluster-setup.yml index a080cdb967..f10e880bbb 100644 --- a/.github/workflows/ibmc-cluster-setup.yml +++ b/.github/workflows/ibmc-cluster-setup.yml @@ -303,43 +303,9 @@ jobs: IPI_WORKER_FLAVOR=$(echo "${WORKER_FLAVOR}" | sed 's/\./-/g') - cat > "${INSTALL_DIR}/install-config.yaml" < "${INSTALL_DIR}/install-config.yaml" echo "install-config.yaml generated at ${INSTALL_DIR}" diff --git a/ci-scripts/ipi-install-config.yaml.tpl b/ci-scripts/ipi-install-config.yaml.tpl new file mode 100644 index 0000000000..3506dce8f4 --- /dev/null +++ b/ci-scripts/ipi-install-config.yaml.tpl @@ -0,0 +1,35 @@ +apiVersion: v1 +metadata: + name: ${CLUSTER_NAME} +baseDomain: cnv-ui.com +credentialsMode: Manual +platform: + ibmcloud: + region: ${VPC_REGION} + resourceGroupName: cnv-ui +controlPlane: + architecture: amd64 + hyperthreading: Enabled + name: master + replicas: 3 + platform: + ibmcloud: + type: bx2-4x16 +compute: + - architecture: amd64 + hyperthreading: Enabled + name: worker + replicas: ${WORKER_COUNT} + platform: + ibmcloud: + type: ${IPI_WORKER_FLAVOR} +networking: + networkType: OVNKubernetes + clusterNetwork: + - cidr: 10.128.0.0/14 + hostPrefix: 23 + serviceNetwork: + - 172.30.0.0/16 +publish: External +pullSecret: '${PULL_SECRET}' +sshKey: '${SSH_PUB}' From 533f9ffce78c6af1a155a7a691c605fe31ad441b Mon Sep 17 00:00:00 2001 From: Gal Kremer Date: Wed, 24 Jun 2026 17:17:51 -0400 Subject: [PATCH 35/42] ci(hot-cluster): install oc + ccoctl before manifest generation oc adm release extract needs the oc binary which was not available during the CCO manifest step. 
Download oc, kubectl, and ccoctl alongside openshift-install so credentials can be properly generated. Co-authored-by: Cursor --- .github/workflows/ibmc-cluster-setup.yml | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ibmc-cluster-setup.yml b/.github/workflows/ibmc-cluster-setup.yml index f10e880bbb..3f631bf69c 100644 --- a/.github/workflows/ibmc-cluster-setup.yml +++ b/.github/workflows/ibmc-cluster-setup.yml @@ -268,7 +268,7 @@ jobs: # ────────────────────────────────────────────────────────────────────── # IPI (self-managed OpenShift) path # ────────────────────────────────────────────────────────────────────── - - name: Install openshift-install + - name: Install openshift-install, oc, and ccoctl if: inputs.infrastructure_type == 'ipi' id: ipi_tools env: @@ -276,12 +276,19 @@ jobs: run: | OCP_CHANNEL="stable-${OC_VERSION_INPUT%%_*}" echo "Resolving ${OCP_CHANNEL} to latest patch version..." + MIRROR="https://mirror.openshift.com/pub/openshift-v4/x86_64/clients/ocp/${OCP_CHANNEL}" - echo "Downloading openshift-install (${OCP_CHANNEL})..." - curl -sL "https://mirror.openshift.com/pub/openshift-v4/x86_64/clients/ocp/${OCP_CHANNEL}/openshift-install-linux.tar.gz" \ - | tar -xz -C /usr/local/bin openshift-install + echo "Downloading openshift-install..." + curl -sL "${MIRROR}/openshift-install-linux.tar.gz" | tar -xz -C /usr/local/bin openshift-install openshift-install version + echo "Downloading oc + kubectl..." + curl -sL "${MIRROR}/openshift-client-linux.tar.gz" | tar -xz -C /usr/local/bin oc kubectl + oc version --client + + echo "Downloading ccoctl..." 
+ curl -sL "${MIRROR}/ccoctl-linux.tar.gz" | tar -xz -C /usr/local/bin ccoctl 2>/dev/null || echo "ccoctl not available for this version" + RESOLVED_VERSION=$(openshift-install version | head -1 | awk '{print $2}') echo "ocp_channel=${OCP_CHANNEL}" >> "$GITHUB_OUTPUT" echo "ocp_version=${RESOLVED_VERSION}" >> "$GITHUB_OUTPUT" From c0ff42ccb6155bb22d6d04e4e9227e4a0805a880 Mon Sep 17 00:00:00 2001 From: Gal Kremer Date: Wed, 24 Jun 2026 19:51:45 -0400 Subject: [PATCH 36/42] ci(hot-cluster): fix kubeconfig path and add debug output Co-authored-by: Cursor --- .github/workflows/ibmc-cluster-setup.yml | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ibmc-cluster-setup.yml b/.github/workflows/ibmc-cluster-setup.yml index 3f631bf69c..1a73514791 100644 --- a/.github/workflows/ibmc-cluster-setup.yml +++ b/.github/workflows/ibmc-cluster-setup.yml @@ -408,11 +408,18 @@ jobs: run: | if [[ "${{ inputs.infrastructure_type }}" == "ipi" ]]; then INSTALL_DIR="${RUNNER_TEMP}/ipi-install" - export KUBECONFIG="${INSTALL_DIR}/auth/kubeconfig" - echo "KUBECONFIG=${KUBECONFIG}" >> "$GITHUB_ENV" + KUBECONFIG_PATH="${INSTALL_DIR}/auth/kubeconfig" - curl -sL "https://mirror.openshift.com/pub/openshift-v4/x86_64/clients/ocp/${OCP_CHANNEL}/openshift-client-linux.tar.gz" \ - | tar -xz -C /usr/local/bin oc kubectl + echo "Looking for kubeconfig at ${KUBECONFIG_PATH}..." + ls -la "${INSTALL_DIR}/auth/" 2>/dev/null || echo "auth dir not found" + + if [[ ! 
-f "${KUBECONFIG_PATH}" ]]; then + echo "::error::kubeconfig not found at ${KUBECONFIG_PATH}" + exit 1 + fi + + export KUBECONFIG="${KUBECONFIG_PATH}" + echo "KUBECONFIG=${KUBECONFIG_PATH}" >> "$GITHUB_ENV" else ibmcloud oc cluster config --cluster "${CLUSTER_NAME}" --admin fi From 59fef6c28dfecf562263f6516a8f4e07e271e8a7 Mon Sep 17 00:00:00 2001 From: Gal Kremer Date: Wed, 24 Jun 2026 19:53:20 -0400 Subject: [PATCH 37/42] ci(hot-cluster): add one-off IPI destroy workflow for orphaned clusters Co-authored-by: Cursor --- .github/workflows/ipi-cluster-destroy.yml | 57 +++++++++++++++++++++++ 1 file changed, 57 insertions(+) create mode 100644 .github/workflows/ipi-cluster-destroy.yml diff --git a/.github/workflows/ipi-cluster-destroy.yml b/.github/workflows/ipi-cluster-destroy.yml new file mode 100644 index 0000000000..2ba0be1627 --- /dev/null +++ b/.github/workflows/ipi-cluster-destroy.yml @@ -0,0 +1,57 @@ +name: IPI Cluster Destroy (one-off) + +on: + workflow_dispatch: + inputs: + setup_run_id: + description: 'Setup workflow run ID to download install state from' + required: true + type: string + +permissions: + contents: read + actions: read + +jobs: + destroy: + name: Destroy IPI Cluster + runs-on: ubuntu-latest + timeout-minutes: 60 + steps: + - name: Download install state + env: + GH_TOKEN: ${{ github.token }} + run: | + INSTALL_DIR="${RUNNER_TEMP}/ipi-destroy" + mkdir -p "${INSTALL_DIR}" + + echo "Downloading install state from run ${{ inputs.setup_run_id }}..." + gh run download "${{ inputs.setup_run_id }}" --repo "${{ github.repository }}" \ + --name "ipi-install-state-${{ inputs.setup_run_id }}" \ + --dir "${INSTALL_DIR}" 2>/dev/null || \ + gh run download "${{ inputs.setup_run_id }}" --repo "${{ github.repository }}" \ + --dir "${INSTALL_DIR}" + + echo "Files downloaded:" + find "${INSTALL_DIR}" -type f + + if [[ ! 
-f "${INSTALL_DIR}/metadata.json" ]]; then + echo "::error::metadata.json not found" + exit 1 + fi + + echo "infraID: $(jq -r '.infraID' "${INSTALL_DIR}/metadata.json")" + + - name: Install openshift-install + run: | + curl -sL "https://mirror.openshift.com/pub/openshift-v4/x86_64/clients/ocp/stable/openshift-install-linux.tar.gz" \ + | tar -xz -C /usr/local/bin openshift-install + + - name: Destroy cluster + env: + IC_API_KEY: ${{ secrets.IC_KEY }} + INSTALL_DIR: ${{ runner.temp }}/ipi-destroy + run: | + export IC_API_KEY + echo "Destroying cluster $(jq -r '.infraID' "${INSTALL_DIR}/metadata.json")..." + openshift-install destroy cluster --dir="${INSTALL_DIR}" --log-level=info From 06b4c82549ca9eb80862946a337e398a049354f2 Mon Sep 17 00:00:00 2001 From: Gal Kremer Date: Wed, 24 Jun 2026 19:54:39 -0400 Subject: [PATCH 38/42] ci(hot-cluster): auto-cleanup old IPI resources before new install Add a pre-step that finds and deletes VMs, load balancers, and DNS records from previous IPI installs with the same cluster name prefix. Also adds a one-off IPI destroy workflow for manual cleanup. Co-authored-by: Cursor --- .github/workflows/ibmc-cluster-setup.yml | 45 ++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/.github/workflows/ibmc-cluster-setup.yml b/.github/workflows/ibmc-cluster-setup.yml index 1a73514791..ddcf8b0b2d 100644 --- a/.github/workflows/ibmc-cluster-setup.yml +++ b/.github/workflows/ibmc-cluster-setup.yml @@ -90,6 +90,51 @@ jobs: retention-days: 14 if-no-files-found: warn + - name: Clean up old IPI clusters + if: inputs.infrastructure_type == 'ipi' + env: + IC_API_KEY: ${{ secrets.IC_KEY }} + continue-on-error: true + run: | + export IC_API_KEY + echo "Checking for leftover IPI resources with name prefix '${CLUSTER_NAME}'..." 
+ ibmcloud target -r us-south 2>/dev/null || true + + OLD_VMS=$(ibmcloud is instances --output json 2>/dev/null | jq -r --arg prefix "${CLUSTER_NAME}" '[.[] | select(.name | startswith($prefix))] | length' || echo "0") + if [[ "${OLD_VMS}" -gt 0 ]]; then + echo "Found ${OLD_VMS} VMs from previous IPI installs. Cleaning up..." + for vm_id in $(ibmcloud is instances --output json 2>/dev/null | jq -r --arg prefix "${CLUSTER_NAME}" '.[] | select(.name | startswith($prefix)) | .id'); do + echo " Deleting VM ${vm_id}..." + ibmcloud is instance-delete "${vm_id}" -f 2>/dev/null || true + done + echo "Waiting for VMs to terminate..." + sleep 30 + else + echo "No old IPI VMs found." + fi + + echo "Cleaning up old load balancers..." + for lb_id in $(ibmcloud is lbs --output json 2>/dev/null | jq -r --arg prefix "${CLUSTER_NAME}" '.[] | select(.name | startswith($prefix)) | .id' || true); do + echo " Deleting LB ${lb_id}..." + ibmcloud is lb-delete "${lb_id}" -f 2>/dev/null || true + done + + echo "Cleaning up old DNS records..." + if command -v ibmcloud cis &>/dev/null || ibmcloud plugin install cis -f 2>/dev/null; then + CIS_ID=$(jq -r '.ibmcloud.cisInstanceCRN // empty' /tmp/ipi-state2/metadata.json 2>/dev/null || true) + if [[ -n "${CIS_ID}" ]]; then + ibmcloud cis instance-set "${CIS_ID}" 2>/dev/null || true + for zone_id in $(ibmcloud cis domains --output json 2>/dev/null | jq -r '.[].id' || true); do + for record_id in $(ibmcloud cis dns-records "${zone_id}" --output json 2>/dev/null | jq -r --arg prefix "${CLUSTER_NAME}" '.[] | select(.name | contains($prefix)) | .id' || true); do + echo " Deleting DNS record ${record_id}..." + ibmcloud cis dns-record-delete "${zone_id}" "${record_id}" 2>/dev/null || true + done + done + fi + fi + + echo "Old IPI resource cleanup complete." 
+ - name: Check for existing ROKS cluster if: inputs.infrastructure_type != 'ipi' id: check_cluster From 65b7ace3104826c37e6776df3fd9c8da212f1677 Mon Sep 17 00:00:00 2001 From: Gal Kremer Date: Wed, 24 Jun 2026 19:56:51 -0400 Subject: [PATCH 39/42] ci(hot-cluster): scope IPI cleanup to DNS records only, by exact cluster name The previous cleanup deleted all VMs/LBs matching the cluster name prefix, which would prevent parallel clusters. Now only cleans stale DNS records (the actual blocker for re-installs) and scopes to the exact cluster name. VMs and LBs are managed by openshift-install destroy, not brute-force. Co-authored-by: Cursor --- .github/workflows/ibmc-cluster-setup.yml | 61 +++++++++++------------- 1 file changed, 27 insertions(+), 34 deletions(-) diff --git a/.github/workflows/ibmc-cluster-setup.yml b/.github/workflows/ibmc-cluster-setup.yml index ddcf8b0b2d..52940a482d 100644 --- a/.github/workflows/ibmc-cluster-setup.yml +++ b/.github/workflows/ibmc-cluster-setup.yml @@ -90,50 +90,43 @@ jobs: retention-days: 14 if-no-files-found: warn - - name: Clean up old IPI clusters + - name: Clean up stale IPI resources for this cluster name if: inputs.infrastructure_type == 'ipi' env: IC_API_KEY: ${{ secrets.IC_KEY }} continue-on-error: true run: | export IC_API_KEY - echo "Checking for leftover IPI resources with name prefix '${CLUSTER_NAME}'..." - ibmcloud target -r us-south 2>/dev/null || true - - OLD_VMS=$(ibmcloud is instances --output json 2>/dev/null | jq -r --arg prefix "${CLUSTER_NAME}" '[.[] | select(.name | startswith($prefix))] | length' || echo "0") - if [[ "${OLD_VMS}" -gt 0 ]]; then - echo "Found ${OLD_VMS} VMs from previous IPI installs. Cleaning up..." - for vm_id in $(ibmcloud is instances --output json 2>/dev/null | jq -r --arg prefix "${CLUSTER_NAME}" '.[] | select(.name | startswith($prefix)) | .id'); do - echo " Deleting VM ${vm_id}..." 
- ibmcloud is instance-delete "${vm_id}" -f 2>/dev/null || true - done - echo "Waiting for VMs to terminate..." - sleep 30 - else - echo "No old IPI VMs found." - fi - - echo "Cleaning up old load balancers..." - for lb_id in $(ibmcloud is lbs --output json 2>/dev/null | jq -r --arg prefix "${CLUSTER_NAME}" '.[] | select(.name | startswith($prefix)) | .id' || true); do - echo " Deleting LB ${lb_id}..." - ibmcloud is lb-delete "${lb_id}" -f 2>/dev/null || true - done - - echo "Cleaning up old DNS records..." - if command -v ibmcloud cis &>/dev/null || ibmcloud plugin install cis -f 2>/dev/null; then - CIS_ID=$(jq -r '.ibmcloud.cisInstanceCRN // empty' /tmp/ipi-state2/metadata.json 2>/dev/null || true) - if [[ -n "${CIS_ID}" ]]; then - ibmcloud cis instance-set "${CIS_ID}" 2>/dev/null || true - for zone_id in $(ibmcloud cis domains --output json 2>/dev/null | jq -r '.[].id' || true); do - for record_id in $(ibmcloud cis dns-records "${zone_id}" --output json 2>/dev/null | jq -r --arg prefix "${CLUSTER_NAME}" '.[] | select(.name | contains($prefix)) | .id' || true); do - echo " Deleting DNS record ${record_id}..." + ZONE="${{ inputs.zone }}" + VPC_REGION="${ZONE%-*}" + ibmcloud target -r "${VPC_REGION}" 2>/dev/null || true + + echo "Cleaning up stale DNS records for '${CLUSTER_NAME}.cnv-ui.com'..." 
+ ibmcloud plugin install cis -f 2>/dev/null || true + + CIS_INSTANCES=$(ibmcloud cis instances --output json 2>/dev/null || echo "[]") + CIS_ID=$(echo "${CIS_INSTANCES}" | jq -r '.[0].crn // empty') + if [[ -n "${CIS_ID}" ]]; then + ibmcloud cis instance-set "${CIS_ID}" 2>/dev/null || true + for zone_id in $(ibmcloud cis domains --output json 2>/dev/null | jq -r '.[].id' || true); do + STALE_RECORDS=$(ibmcloud cis dns-records "${zone_id}" --output json 2>/dev/null \ + | jq -r --arg cn "${CLUSTER_NAME}" '.[] | select(.name | contains($cn)) | "\(.id) \(.name)"' || true) + if [[ -n "${STALE_RECORDS}" ]]; then + echo "Found stale DNS records:" + echo "${STALE_RECORDS}" + echo "${STALE_RECORDS}" | while read -r record_id record_name; do + echo " Deleting: ${record_name} (${record_id})" ibmcloud cis dns-record-delete "${zone_id}" "${record_id}" 2>/dev/null || true done - done - fi + else + echo "No stale DNS records found for '${CLUSTER_NAME}'." + fi + done + else + echo "No CIS instance found, skipping DNS cleanup." fi - echo "Old IPI resource cleanup complete." + echo "DNS cleanup complete." - name: Check for existing ROKS cluster if: inputs.infrastructure_type != 'ipi' From 9d29c5fcb48ddea2ba73d0e273a6df2914122d35 Mon Sep 17 00:00:00 2001 From: Gal Kremer Date: Wed, 24 Jun 2026 19:58:51 -0400 Subject: [PATCH 40/42] ci(hot-cluster): remove ccoctl/manual credentials, let installer handle it The IC_KEY doesn't have iam.policy.create permission so ccoctl fails. Remove credentialsMode: Manual and the CCO manifests step entirely. Let openshift-install create cluster handle credentials using the API key directly. Also fixes the auth/ dir missing issue (create manifests was consuming install-config before create cluster). 
Co-authored-by: Cursor --- .github/workflows/ibmc-cluster-setup.yml | 29 ------------------------ ci-scripts/ipi-install-config.yaml.tpl | 1 - 2 files changed, 30 deletions(-) diff --git a/.github/workflows/ibmc-cluster-setup.yml b/.github/workflows/ibmc-cluster-setup.yml index 52940a482d..5f2fae3b7c 100644 --- a/.github/workflows/ibmc-cluster-setup.yml +++ b/.github/workflows/ibmc-cluster-setup.yml @@ -358,35 +358,6 @@ jobs: sed 's/pullSecret:.*/pullSecret: REDACTED/' "${INSTALL_DIR}/install-config.yaml" echo "::endgroup::" - - name: Generate CCO manifests for IBM Cloud - if: inputs.infrastructure_type == 'ipi' - env: - IC_API_KEY: ${{ secrets.IC_KEY }} - INSTALL_DIR: ${{ runner.temp }}/ipi-install - run: | - export IC_API_KEY - - echo "Creating install manifests..." - openshift-install create manifests --dir="${INSTALL_DIR}" - - echo "Extracting CredentialsRequests..." - CRED_DIR="${INSTALL_DIR}/credreqs" - mkdir -p "${CRED_DIR}" - oc adm release extract \ - --credentials-requests \ - --cloud=ibmcloud \ - --to="${CRED_DIR}" \ - "$(openshift-install version | grep 'release image' | awk '{print $3}')" 2>/dev/null || \ - echo "Warning: could not extract credentials requests; falling back to installer defaults" - - if ls "${CRED_DIR}"/*.yaml &>/dev/null; then - echo "Processing CredentialsRequests with ccoctl..." 
- ccoctl ibmcloud create-service-id \ - --credentials-requests-dir="${CRED_DIR}" \ - --name="${CLUSTER_NAME}" \ - --output-dir="${INSTALL_DIR}" 2>&1 || true - fi - - name: Create IPI cluster id: ipi_create if: inputs.infrastructure_type == 'ipi' diff --git a/ci-scripts/ipi-install-config.yaml.tpl b/ci-scripts/ipi-install-config.yaml.tpl index 3506dce8f4..5cc41b5e21 100644 --- a/ci-scripts/ipi-install-config.yaml.tpl +++ b/ci-scripts/ipi-install-config.yaml.tpl @@ -2,7 +2,6 @@ apiVersion: v1 metadata: name: ${CLUSTER_NAME} baseDomain: cnv-ui.com -credentialsMode: Manual platform: ibmcloud: region: ${VPC_REGION} From bbc85f25eacc6766ced4ee6186467b319f9000b5 Mon Sep 17 00:00:00 2001 From: Gal Kremer Date: Wed, 24 Jun 2026 21:49:31 -0400 Subject: [PATCH 41/42] ci(hot-cluster): fix IPI credentials with manual secret generation IBM Cloud IPI requires credentialsMode: Manual but ccoctl needs iam.policy.create which we don't have. Instead, generate the ibm-cloud-credentials secret manifests directly using the API key for all required namespaces (CCM, machine-api, image-registry, ingress, CSI). This fixes the CCM FailedMount error that prevented node initialization and caused the entire cluster to be broken. 
Co-authored-by: Cursor --- .github/workflows/ibmc-cluster-setup.yml | 9 +++++ ci-scripts/create-ibmcloud-cco-secrets.sh | 43 +++++++++++++++++++++++ ci-scripts/ipi-install-config.yaml.tpl | 1 + 3 files changed, 53 insertions(+) create mode 100755 ci-scripts/create-ibmcloud-cco-secrets.sh diff --git a/.github/workflows/ibmc-cluster-setup.yml b/.github/workflows/ibmc-cluster-setup.yml index 5f2fae3b7c..6592be7e4e 100644 --- a/.github/workflows/ibmc-cluster-setup.yml +++ b/.github/workflows/ibmc-cluster-setup.yml @@ -358,6 +358,15 @@ jobs: sed 's/pullSecret:.*/pullSecret: REDACTED/' "${INSTALL_DIR}/install-config.yaml" echo "::endgroup::" + - name: Generate CCO credential secrets + if: inputs.infrastructure_type == 'ipi' + env: + IC_API_KEY: ${{ secrets.IC_KEY }} + INSTALL_DIR: ${{ runner.temp }}/ipi-install + run: | + openshift-install create manifests --dir="${INSTALL_DIR}" + bash ./ci-scripts/create-ibmcloud-cco-secrets.sh + - name: Create IPI cluster id: ipi_create if: inputs.infrastructure_type == 'ipi' diff --git a/ci-scripts/create-ibmcloud-cco-secrets.sh b/ci-scripts/create-ibmcloud-cco-secrets.sh new file mode 100755 index 0000000000..a78ad993e2 --- /dev/null +++ b/ci-scripts/create-ibmcloud-cco-secrets.sh @@ -0,0 +1,43 @@ +#!/bin/bash +# +# Create IBM Cloud credential secrets for CCO manual mode. +# This replaces ccoctl for environments where the API key doesn't have +# iam.policy.create permission. Uses the same API key for all components. 
+# +# Required env: +# IC_API_KEY IBM Cloud API key +# INSTALL_DIR openshift-install working directory (must contain manifests/) + +set -euo pipefail + +IC_API_KEY="${IC_API_KEY:?IC_API_KEY must be set}" +INSTALL_DIR="${INSTALL_DIR:?INSTALL_DIR must be set}" + +MANIFESTS_DIR="${INSTALL_DIR}/openshift" +mkdir -p "${MANIFESTS_DIR}" + +CRED_DATA=$(printf "IBMCLOUD_AUTHTYPE=iam\nIBMCLOUD_APIKEY=%s" "${IC_API_KEY}" | base64 -w0) + +NAMESPACES=( + openshift-cloud-controller-manager + openshift-machine-api + openshift-image-registry + openshift-ingress-operator + openshift-cluster-csi-drivers +) + +for ns in "${NAMESPACES[@]}"; do + cat > "${MANIFESTS_DIR}/99-ibm-cloud-credentials-${ns}.yaml" < Date: Wed, 24 Jun 2026 21:51:03 -0400 Subject: [PATCH 42/42] ci(hot-cluster): move IPI cleanup to end of job, destroy on any failure The cleanup step was positioned before the common bootstrap steps, so it only ran when IPI-specific steps failed. Now it runs at the end of the job and destroys the cluster if ANY step failed (HCO, ARC, health check, etc.). No more orphaned clusters from partial setup failures. Co-authored-by: Cursor --- .github/workflows/ibmc-cluster-setup.yml | 30 +++++++++++++----------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/.github/workflows/ibmc-cluster-setup.yml b/.github/workflows/ibmc-cluster-setup.yml index 6592be7e4e..7b7ddd886a 100644 --- a/.github/workflows/ibmc-cluster-setup.yml +++ b/.github/workflows/ibmc-cluster-setup.yml @@ -391,20 +391,6 @@ jobs: retention-days: 30 if-no-files-found: ignore - - name: IPI cleanup on failure - if: inputs.infrastructure_type == 'ipi' && failure() - env: - IC_API_KEY: ${{ secrets.IC_KEY }} - INSTALL_DIR: ${{ runner.temp }}/ipi-install - run: | - export IC_API_KEY - if [[ -f "${INSTALL_DIR}/metadata.json" ]]; then - echo "Install failed — cleaning up IPI resources..." 
- openshift-install destroy cluster --dir="${INSTALL_DIR}" --log-level=info 2>&1 | tail -50 || true - else - echo "No metadata.json found — nothing to clean up." - fi - # ────────────────────────────────────────────────────────────────────── # Common bootstrap steps (all paths converge here) # ────────────────────────────────────────────────────────────────────── @@ -495,6 +481,22 @@ jobs: run: | ./ci-scripts/check-cluster-health.sh + - name: Destroy IPI cluster on failure + if: always() && inputs.infrastructure_type == 'ipi' && job.status == 'failure' + env: + IC_API_KEY: ${{ secrets.IC_KEY }} + INSTALL_DIR: ${{ runner.temp }}/ipi-install + run: | + export IC_API_KEY + if [[ -f "${INSTALL_DIR}/metadata.json" ]]; then + INFRA_ID=$(jq -r '.infraID' "${INSTALL_DIR}/metadata.json") + echo "Job failed — destroying IPI cluster '${INFRA_ID}'..." + openshift-install destroy cluster --dir="${INSTALL_DIR}" --log-level=info 2>&1 | tail -50 || true + echo "IPI cluster destroyed." + else + echo "No metadata.json found — no IPI resources to clean up." + fi + - name: Setup summary if: always() run: |