Skip to content

IBM Cloud Hot Cluster Setup #16

IBM Cloud Hot Cluster Setup

IBM Cloud Hot Cluster Setup #16

name: IBM Cloud Hot Cluster Setup

# Manually triggered workflow: every parameter below is supplied (or
# defaulted) through the "Run workflow" form.
on:
  workflow_dispatch:
    inputs:
      infrastructure_type:
        description: 'Infrastructure type: classic, vpc, or ipi (ipi = diagnostics only)'
        required: true
        # Kept in sync with the defaults of zone, worker_flavor and
        # kvm_emulation below, which are all VPC values. With 'classic' here
        # an all-defaults run could never pass the classic zone check
        # ('us-south-1' is a VPC zone).
        default: 'vpc'
        type: choice
        options:
          - classic
          - vpc
          - ipi
      cluster_name:
        description: 'Cluster name'
        required: true
        default: 'kubevirt-plugin-ci'
        type: string
      zone:
        description: 'Zone (classic: wdc04, fra02; vpc: us-south-1, eu-de-1)'
        required: true
        default: 'us-south-1'
        type: string
      openshift_version:
        description: 'OpenShift version'
        required: true
        default: '4.20_openshift'
        type: string
      worker_flavor:
        description: 'Worker node flavor (classic bare metal: mb4c.4x32; vpc: bx2.8x32, cx2.4x8)'
        required: true
        default: 'bx2.8x32'
        type: string
      worker_count:
        description: 'Number of worker nodes (at least 2 so ingress is happy)'
        required: true
        # Quoted on purpose: the input type is string.
        default: '2'
        type: string
      kvm_emulation:
        description: 'KVM emulation (true for vpc/shared, false for bare metal)'
        required: true
        default: true
        type: boolean
      cos_instance_crn:
        description: 'COS instance CRN for VPC internal registry (required for vpc, ignored for classic)'
        required: false
        default: ''
        type: string

# The workflow only needs to read the repository contents.
permissions:
  contents: read

# Workflow-level environment shared by every step. The `||` fallbacks mirror
# the input defaults above.
env:
  CLUSTER_NAME: ${{ inputs.cluster_name || 'kubevirt-plugin-ci' }}
  INFRASTRUCTURE_TYPE: ${{ inputs.infrastructure_type || 'vpc' }}
jobs:
  # Single job: all steps run sequentially on one hosted runner.
  provision-cluster:
    name: Provision OpenShift Cluster
    runs-on: ubuntu-latest
    # 360 minutes = 6 hours.
    timeout-minutes: 360
    steps:
# Check out the repository; later steps run scripts from ./ci-scripts.
- name: Checkout
  uses: actions/checkout@v5
# Set up the ibmcloud CLI via the IBM-provided action. The API key comes from
# the IC_KEY secret; region, resource group and plugin list are fixed here.
- name: Setup IBM Cloud CLI
  uses: IBM/actions-ibmcloud-cli@v1
  with:
    api_key: ${{ secrets.IC_KEY }}
    region: eu-de
    group: cnv-ui
    # Plugins used by later steps (`ibmcloud oc ...`, `ibmcloud is ...`).
    plugins: kubernetes-service, container-registry, vpc-infrastructure
# Run the IAM diagnostics script. Best effort: continue-on-error keeps a
# failure here from failing the job.
- name: Log IBM Cloud IAM diagnostics
  id: iam_diagnostics
  continue-on-error: true
  env:
    WORKER_ZONE: ${{ inputs.zone }}
    # INFRASTRUCTURE_TYPE is inherited from the workflow-level `env` block, so
    # it is not redefined here (avoids two copies of the same expression that
    # could drift apart).
  run: bash ./ci-scripts/log-ibmcloud-iam-diagnostics.sh
# Publish the diagnostics text file as a run artifact. Runs even when earlier
# steps failed, but not when the diagnostics step itself was skipped.
- name: Upload IAM diagnostics log
  if: always() && steps.iam_diagnostics.outcome != 'skipped'
  continue-on-error: true
  uses: actions/upload-artifact@v6
  with:
    # The run id keeps artifact names unique per workflow run.
    name: ibmcloud-iam-diagnostics-${{ github.run_id }}
    path: ${{ runner.temp }}/ibmcloud-iam-diagnostics.txt
    if-no-files-found: warn
    retention-days: 14
# Probe for a cluster named ${CLUSTER_NAME} and expose the result as the step
# output `exists` ('true' / 'false'). The step is skipped for ipi, in which
# case `exists` stays unset and the create steps below do not run.
- name: Check for existing cluster
  id: check_cluster
  if: inputs.infrastructure_type != 'ipi'
  run: |
    # Any non-zero exit from `cluster get` is treated as "does not exist".
    exists=false
    if ibmcloud oc cluster get --cluster "${CLUSTER_NAME}" &>/dev/null; then
      exists=true
      echo "Cluster '${CLUSTER_NAME}' already exists"
    else
      echo "Cluster '${CLUSTER_NAME}' does not exist, will create"
    fi
    echo "exists=${exists}" >> "$GITHUB_OUTPUT"
# ──────────────────────────────────────────────────────────────────────
# Classic infrastructure path
# ──────────────────────────────────────────────────────────────────────
# Pre-flight check for the classic path: fail fast (exit 1 / exit 2) when the
# requested zone or worker flavor is not offered, before attempting to create
# the cluster.
- name: Verify zone and flavor (classic)
  if: steps.check_cluster.outputs.exists == 'false' && inputs.infrastructure_type == 'classic'
  env:
    ZONE: ${{ inputs.zone }}
    FLAVOR: ${{ inputs.worker_flavor }}
  run: |
    # pipefail: a failing `ibmcloud` call must abort the step instead of
    # being reported as "zone not found" on empty output.
    set -euo pipefail

    echo "Fetching classic infrastructure locations and flavors..."
    LOCATIONS_JSON=$(
      ibmcloud oc locations --provider classic --show-flavors --output json |
        jq '[.[] | select(.kind=="dc")]'
    )

    echo "Checking zone '${ZONE}' exists..."
    ZONE_EXISTS=$(
      echo "${LOCATIONS_JSON}" |
        jq -r --arg z "${ZONE}" '[.[] | select(.id == $z)] | length'
    )
    if [[ "${ZONE_EXISTS}" -ne 1 ]]; then
      echo "ERROR: Zone '${ZONE}' not found in classic infrastructure locations."
      echo ""
      echo "Available zones:"
      echo "${LOCATIONS_JSON}" | jq -r '.[].id' | sort
      exit 1
    fi
    echo "Zone '${ZONE}' exists"

    # Normalise the zone's flavors into a sorted, de-duplicated JSON array of
    # names. NOTE(review): the exact shape of `.flavors` in the CLI's JSON
    # output is not visible here, so both a comma-separated string and an
    # array (of names or of objects with `.id`) are accepted — confirm
    # against real CLI output.
    FLAVORS_JSON=$(
      echo "${LOCATIONS_JSON}" |
        jq -c --arg z "${ZONE}" '
          [ .[]
            | select(.id == $z)
            | .flavors
            | if type == "string" then split(",")[]
              elif type == "array" then (.[] | if type == "object" then .id else . end)
              else empty
              end
            | select(type == "string")
            | gsub("^\\s+|\\s+$"; "")
            | select(length > 0)
          ] | unique'
    )

    echo "Checking flavor '${FLAVOR}' is available in zone '${ZONE}'..."
    # Exact name comparison: a substring test would accept e.g. 'mb4c.4x32'
    # when only a longer flavor name containing it is offered.
    FLAVOR_EXISTS=$(
      echo "${FLAVORS_JSON}" |
        jq -r --arg f "${FLAVOR}" 'any(.[]; . == $f)'
    )
    # Anything other than an explicit "true" counts as "not available".
    if [[ "${FLAVOR_EXISTS}" != "true" ]]; then
      echo "ERROR: Flavor '${FLAVOR}' is not available in zone '${ZONE}'."
      echo ""
      echo "Available flavors in '${ZONE}':"
      echo "${FLAVORS_JSON}" | jq -r '.[]'
      exit 2
    fi
    echo "Flavor '${FLAVOR}' is available in zone '${ZONE}'"
# Create the classic-infrastructure cluster, reusing the zone's existing
# VLANs when there are any.
- name: Create ROKS cluster (classic)
  if: steps.check_cluster.outputs.exists == 'false' && inputs.infrastructure_type == 'classic'
  env:
    ZONE: ${{ inputs.zone }}
    # Inputs are passed through the environment rather than expanded into
    # the script text, so their content can never be parsed as shell code.
    OPENSHIFT_VERSION: ${{ inputs.openshift_version }}
    WORKER_FLAVOR: ${{ inputs.worker_flavor }}
    WORKER_COUNT: ${{ inputs.worker_count }}
  run: |
    echo "Looking up existing VLANs in zone '${ZONE}'..."
    # Best effort: a failing lookup is treated as "no VLANs".
    VLAN_JSON=$(ibmcloud oc vlan ls --zone "${ZONE}" --output json 2>/dev/null || echo "[]")
    PRIVATE_VLAN=$(echo "${VLAN_JSON}" | jq -r '[.[] | select(.type == "private")] | first | .id // empty')
    PUBLIC_VLAN=$(echo "${VLAN_JSON}" | jq -r '[.[] | select(.type == "public")] | first | .id // empty')

    CREATE_ARGS=(
      --name "${CLUSTER_NAME}"
      --version "${OPENSHIFT_VERSION}"
      --flavor "${WORKER_FLAVOR}"
      --workers "${WORKER_COUNT}"
      --zone "${ZONE}"
    )

    # Only pass the VLAN flags when a VLAN was actually found; passing them
    # with an empty value contradicts "new VLANs will be created".
    if [[ -n "${PRIVATE_VLAN}" ]]; then
      echo "Reusing existing private VLAN: ${PRIVATE_VLAN}"
      CREATE_ARGS+=(--private-vlan "${PRIVATE_VLAN}")
      if [[ -n "${PUBLIC_VLAN}" ]]; then
        echo "Reusing existing public VLAN: ${PUBLIC_VLAN}"
        CREATE_ARGS+=(--public-vlan "${PUBLIC_VLAN}")
      else
        # NOTE(review): the CLI may require --private-only when only a
        # private VLAN is given; not added automatically because it changes
        # the cluster's networking — confirm the desired behaviour.
        echo "::warning::No public VLAN found in zone '${ZONE}'; --public-vlan is omitted."
      fi
    else
      echo "No existing VLANs in zone, new VLANs will be created"
    fi

    echo "Creating cluster '${CLUSTER_NAME}' with ${WORKER_COUNT}x ${WORKER_FLAVOR} workers in zone ${ZONE}..."
    ibmcloud oc cluster create classic "${CREATE_ARGS[@]}"
# ──────────────────────────────────────────────────────────────────────
# VPC Gen2 infrastructure path
# ──────────────────────────────────────────────────────────────────────
# Create (or reuse, matched by name) the VPC, subnet and public gateway for
# the cluster, then publish their IDs as step outputs:
#   vpc_id, subnet_id, vpc_region
- name: Provision VPC resources
  if: steps.check_cluster.outputs.exists == 'false' && inputs.infrastructure_type == 'vpc'
  id: vpc_resources
  env:
    ZONE: ${{ inputs.zone }}
    VPC_NAME: ${{ inputs.cluster_name || 'kubevirt-plugin-ci' }}-vpc
  run: |
    # Abort with a clear error when a lookup/create produced no usable ID.
    # Without this, a failed `... | jq -r '.id'` pipeline leaves an empty (or
    # literal "null") value and the step carries on with it.
    #   $1 = resource label for the message, $2 = the ID to validate
    require_id() {
      if [[ -z "$2" || "$2" == "null" ]]; then
        echo "::error::Could not determine the ID of the $1."
        exit 1
      fi
    }

    echo "=== VPC Gen2 provisioning ==="
    # Derive region from zone (e.g. us-south-1 -> us-south)
    VPC_REGION="${ZONE%-*}"
    echo "VPC region: ${VPC_REGION}, zone: ${ZONE}"
    # Target the VPC region
    ibmcloud target -r "${VPC_REGION}"

    # Create or reuse VPC
    VPC_ID=$(ibmcloud is vpcs --output json 2>/dev/null | jq -r --arg n "${VPC_NAME}" '.[] | select(.name == $n) | .id // empty')
    if [[ -z "${VPC_ID}" ]]; then
      echo "Creating VPC '${VPC_NAME}'..."
      VPC_ID=$(ibmcloud is vpc-create "${VPC_NAME}" --output json | jq -r '.id')
      require_id "VPC '${VPC_NAME}'" "${VPC_ID}"
      echo "Created VPC: ${VPC_ID}"
    else
      echo "Reusing existing VPC '${VPC_NAME}': ${VPC_ID}"
    fi

    # Create or reuse subnet
    SUBNET_NAME="${VPC_NAME}-subnet-${ZONE}"
    SUBNET_ID=$(ibmcloud is subnets --output json 2>/dev/null | jq -r --arg n "${SUBNET_NAME}" '.[] | select(.name == $n) | .id // empty')
    if [[ -z "${SUBNET_ID}" ]]; then
      echo "Creating subnet '${SUBNET_NAME}' in zone '${ZONE}'..."
      SUBNET_ID=$(ibmcloud is subnet-create "${SUBNET_NAME}" "${VPC_ID}" --zone "${ZONE}" --ipv4-address-count 256 --output json | jq -r '.id')
      require_id "subnet '${SUBNET_NAME}'" "${SUBNET_ID}"
      echo "Created subnet: ${SUBNET_ID}"
    else
      echo "Reusing existing subnet '${SUBNET_NAME}': ${SUBNET_ID}"
    fi

    # Create or reuse public gateway (required for console/OperatorHub access)
    GW_NAME="${VPC_NAME}-gw-${ZONE}"
    GW_ID=$(ibmcloud is public-gateways --output json 2>/dev/null | jq -r --arg n "${GW_NAME}" '.[] | select(.name == $n) | .id // empty')
    if [[ -z "${GW_ID}" ]]; then
      echo "Creating public gateway '${GW_NAME}'..."
      GW_ID=$(ibmcloud is public-gateway-create "${GW_NAME}" "${VPC_ID}" "${ZONE}" --output json | jq -r '.id')
      require_id "public gateway '${GW_NAME}'" "${GW_ID}"
      echo "Created public gateway: ${GW_ID}"
    else
      echo "Reusing existing public gateway '${GW_NAME}': ${GW_ID}"
    fi

    # Attach public gateway to subnet. Still best effort (the step does not
    # fail), but a failure is now surfaced as a warning with the CLI's own
    # error output instead of being silently discarded.
    echo "Attaching public gateway to subnet..."
    if ! ibmcloud is subnet-update "${SUBNET_ID}" --pgw "${GW_ID}"; then
      echo "::warning::Could not attach public gateway '${GW_NAME}' to subnet '${SUBNET_NAME}'."
    fi

    echo "vpc_id=${VPC_ID}" >> "$GITHUB_OUTPUT"
    echo "subnet_id=${SUBNET_ID}" >> "$GITHUB_OUTPUT"
    echo "vpc_region=${VPC_REGION}" >> "$GITHUB_OUTPUT"
# Create the VPC Gen2 cluster on the VPC/subnet produced by the
# `vpc_resources` step. A COS instance is required; when no CRN is supplied
# one named "<cluster>-cos" is looked up or created.
- name: Create ROKS cluster (vpc)
  if: steps.check_cluster.outputs.exists == 'false' && inputs.infrastructure_type == 'vpc'
  env:
    ZONE: ${{ inputs.zone }}
    VPC_ID: ${{ steps.vpc_resources.outputs.vpc_id }}
    SUBNET_ID: ${{ steps.vpc_resources.outputs.subnet_id }}
    COS_CRN: ${{ inputs.cos_instance_crn }}
    COS_INSTANCE_NAME: ${{ inputs.cluster_name || 'kubevirt-plugin-ci' }}-cos
    # Inputs are passed through the environment rather than expanded into
    # the script text, so their content can never be parsed as shell code.
    OPENSHIFT_VERSION: ${{ inputs.openshift_version }}
    WORKER_FLAVOR: ${{ inputs.worker_flavor }}
    WORKER_COUNT: ${{ inputs.worker_count }}
  run: |
    # Print the CRN of the COS instance named ${COS_INSTANCE_NAME} (first
    # match only, so the result is always a single line), or nothing.
    lookup_cos_crn() {
      ibmcloud resource service-instances --service-name cloud-object-storage --output json 2>/dev/null \
        | jq -r --arg n "${COS_INSTANCE_NAME}" '[.[] | select(.name == $n) | .crn // empty] | first // empty'
    }

    if [[ -z "${COS_CRN}" ]]; then
      echo "No COS CRN provided — looking for existing COS instance '${COS_INSTANCE_NAME}'..."
      COS_CRN=$(lookup_cos_crn || true)
      if [[ -z "${COS_CRN}" ]]; then
        echo "Creating COS instance '${COS_INSTANCE_NAME}'..."
        # NOTE(review): the UUID is the service plan passed to
        # service-instance-create — presumably the COS "standard" plan;
        # confirm before changing.
        ibmcloud resource service-instance-create "${COS_INSTANCE_NAME}" cloud-object-storage \
          744bfc56-d12c-4866-88d5-dac9139e0e5d global \
          -d premium-global-deployment
        COS_CRN=$(lookup_cos_crn || true)
        echo "Created COS instance: ${COS_CRN}"
      else
        echo "Reusing existing COS instance: ${COS_CRN}"
      fi
    fi

    # Fail here with a clear message rather than letting cluster creation
    # fail on an empty --cos-instance value.
    if [[ -z "${COS_CRN}" ]]; then
      echo "::error::Could not resolve a COS instance CRN for '${COS_INSTANCE_NAME}'."
      exit 1
    fi

    echo "Creating VPC cluster '${CLUSTER_NAME}' with ${WORKER_COUNT}x ${WORKER_FLAVOR} workers in zone ${ZONE}..."
    ibmcloud oc cluster create vpc-gen2 \
      --name "${CLUSTER_NAME}" \
      --version "${OPENSHIFT_VERSION}" \
      --flavor "${WORKER_FLAVOR}" \
      --workers "${WORKER_COUNT}" \
      --zone "${ZONE}" \
      --vpc-id "${VPC_ID}" \
      --subnet-id "${SUBNET_ID}" \
      --cos-instance "${COS_CRN}" \
      --disable-outbound-traffic-protection
# ──────────────────────────────────────────────────────────────────────
# Common steps (both classic and VPC converge here; skipped for ipi)
# ──────────────────────────────────────────────────────────────────────
# Hand over to the repository's cluster-state script; not run for ipi.
- name: Wait for cluster to be ready to use
  if: inputs.infrastructure_type != 'ipi'
  run: ./ci-scripts/check-roks-cluster-state.sh
# Fetch the cluster description as JSON and pass it to the oc installer
# script through the exported CLUSTER_JSON variable.
- name: Install oc client from cluster version
  if: inputs.infrastructure_type != 'ipi'
  run: |
    # Export and assignment stay separate statements so that a failing
    # `cluster get` still aborts the step.
    export CLUSTER_JSON
    CLUSTER_JSON="$(ibmcloud oc cluster get --cluster "${CLUSTER_NAME}" --output json)"
    bash ./ci-scripts/install-oc-client.sh
# Fetch the admin kubeconfig for the cluster, then print cluster info and the
# node list as a connectivity check.
- name: Configure kubeconfig
  if: inputs.infrastructure_type != 'ipi'
  run: |
    ibmcloud oc cluster config --admin --cluster "${CLUSTER_NAME}"
    oc cluster-info
    oc get nodes -o wide
# Run the HCO install script; the kvm_emulation input is handed over as the
# KVM_EMULATION environment variable.
- name: Install HCO
  if: inputs.infrastructure_type != 'ipi'
  env:
    KVM_EMULATION: ${{ inputs.kvm_emulation }}
  run: ./ci-scripts/install-hco.sh
# Fail early when neither ARC authentication method is configured: either all
# three GitHub App secrets, or the PAT secret, must be non-empty.
- name: Verify ARC secrets
  if: inputs.infrastructure_type != 'ipi'
  env:
    # Secrets are read from the environment instead of being expanded into
    # the script text: a multi-line private key (or any value containing
    # quotes, `$` or backticks) must never become part of the shell source.
    ARC_APP_ID: ${{ secrets.ARC_GITHUB_APP_ID }}
    ARC_APP_INSTALL_ID: ${{ secrets.ARC_GITHUB_APP_INSTALL_ID }}
    ARC_APP_PRIVATE_KEY: ${{ secrets.ARC_GITHUB_APP_PRIVATE_KEY }}
    ARC_PAT: ${{ secrets.ARC_GITHUB_PAT }}
  run: |
    HAS_APP="no"
    if [[ -n "${ARC_APP_ID}" && -n "${ARC_APP_INSTALL_ID}" && -n "${ARC_APP_PRIVATE_KEY}" ]]; then
      HAS_APP="yes"
    fi

    HAS_PAT="no"
    if [[ -n "${ARC_PAT}" ]]; then
      HAS_PAT="yes"
    fi

    if [[ "$HAS_APP" != "yes" && "$HAS_PAT" != "yes" ]]; then
      echo "::error::ARC authentication secrets are missing or empty."
      echo "Configure either:"
      echo " - ARC_GITHUB_APP_ID, ARC_GITHUB_APP_INSTALL_ID, ARC_GITHUB_APP_PRIVATE_KEY (GitHub App), or"
      echo " - ARC_GITHUB_PAT (Personal Access Token)"
      echo "in Settings → Secrets and variables → Actions for this repository (or its organization)."
      exit 1
    fi
    echo "ARC secrets are present."
# Run the runner-image script and publish the image reference it prints
# (a stdout line of the form `IMAGE_REF=<ref>`) as the step output `image_ref`.
- name: Build ARC runner image
  if: inputs.infrastructure_type != 'ipi'
  id: build_runner
  env:
    # NOTE(review): hard-coded while the cluster version is an input
    # (openshift_version) — confirm they are meant to be independent.
    OC_VERSION: '4.20'
  run: |
    # pipefail: without it the pipeline's status is that of `cut`, so a
    # failing build script would go unnoticed.
    set -o pipefail
    if ! IMAGE_REF=$(./ci-scripts/images/setup-arc-runner-image.sh | grep '^IMAGE_REF=' | tail -n 1 | cut -d= -f2-); then
      echo "::error::setup-arc-runner-image.sh failed or did not print an IMAGE_REF= line."
      exit 1
    fi
    # An empty reference would otherwise be handed to the ARC install step.
    if [[ -z "${IMAGE_REF}" ]]; then
      echo "::error::setup-arc-runner-image.sh printed an empty IMAGE_REF."
      exit 1
    fi
    echo "image_ref=${IMAGE_REF}" >> "$GITHUB_OUTPUT"
# Install the ARC controller and then the runner scale set. Both scripts are
# configured through the environment variables below.
- name: Install ARC
  if: inputs.infrastructure_type != 'ipi'
  env:
    ARC_VERSION: '0.14.0'
    ARC_CONFIG_URL: 'https://github.com/${{ github.repository }}'
    # Image reference produced by the `build_runner` step.
    ARC_RUNNER_IMAGE: ${{ steps.build_runner.outputs.image_ref }}
    # Authentication material: GitHub App credentials and/or a PAT.
    ARC_APP_ID: ${{ secrets.ARC_GITHUB_APP_ID }}
    ARC_APP_INSTALL_ID: ${{ secrets.ARC_GITHUB_APP_INSTALL_ID }}
    ARC_APP_PRIVATE_KEY: ${{ secrets.ARC_GITHUB_APP_PRIVATE_KEY }}
    ARC_PAT: ${{ secrets.ARC_GITHUB_PAT }}
  run: |
    ./ci-scripts/arc/install-arc-controller.sh
    ./ci-scripts/arc/install-runner-scale-set.sh
# Run the repository's CI environment controller install script; not run for ipi.
- name: Install CI environment controller
  if: inputs.infrastructure_type != 'ipi'
  run: ./ci-scripts/ci-env/install-ci-env-controller.sh
# Final gate: run the repository's cluster health script; not run for ipi.
- name: Verify cluster health
  if: inputs.infrastructure_type != 'ipi'
  env:
    # Repository in owner/name form, taken from the github context.
    GITHUB_REPOSITORY: ${{ github.repository }}
  run: ./ci-scripts/check-cluster-health.sh
# Always write a Markdown summary of the run parameters and outcome to the
# job summary page, including after failures.
- name: Setup summary
  if: always()
  env:
    # Inputs are passed through the environment rather than expanded into
    # the script text. CLUSTER_NAME and INFRASTRUCTURE_TYPE come from the
    # workflow-level `env` block.
    ZONE: ${{ inputs.zone }}
    OPENSHIFT_VERSION: ${{ inputs.openshift_version }}
    WORKER_FLAVOR: ${{ inputs.worker_flavor }}
    WORKER_COUNT: ${{ inputs.worker_count }}
    KVM_EMULATION: ${{ inputs.kvm_emulation }}
  run: |
    # One redirect for the whole group instead of one per line.
    {
      echo "## Hot Cluster Setup Summary"
      echo ""
      echo "| Parameter | Value |"
      echo "|-----------|-------|"
      echo "| Infrastructure | \`${INFRASTRUCTURE_TYPE}\` |"
      echo "| Cluster | \`${CLUSTER_NAME}\` |"
      echo "| Zone | \`${ZONE}\` |"
      echo "| OpenShift | \`${OPENSHIFT_VERSION}\` |"
      echo "| Worker Flavor | \`${WORKER_FLAVOR}\` |"
      echo "| Workers | \`${WORKER_COUNT}\` |"
      echo "| KVM Emulation | \`${KVM_EMULATION}\` |"
      echo ""
      echo "### IAM diagnostics"
      echo ""
      if [[ "${INFRASTRUCTURE_TYPE}" == "classic" ]]; then
        echo "If cluster create failed with **E73e6**, expand the **Log IBM Cloud IAM diagnostics** step log, open the run **Summary** tab, or download the \`ibmcloud-iam-diagnostics\` artifact."
      elif [[ "${INFRASTRUCTURE_TYPE}" == "ipi" ]]; then
        # ipi never creates a cluster, so a "cluster create failed" hint
        # would be misleading.
        echo "See the **Log IBM Cloud IAM diagnostics** step log or the \`ibmcloud-iam-diagnostics\` artifact for the diagnostics output."
      else
        echo "If cluster create failed, check VPC Infrastructure permissions in the **Log IBM Cloud IAM diagnostics** step."
      fi
      echo ""
      if [[ "${INFRASTRUCTURE_TYPE}" == "ipi" ]]; then
        # All cluster steps are skipped for ipi; probing with `oc` would
        # always report a problem.
        echo "Diagnostics-only run (\`ipi\`): no cluster was provisioned."
      elif oc cluster-info &>/dev/null; then
        echo "Cluster is **healthy** and ready for CI."
      else
        echo "Cluster setup **may have issues**. Check the logs."
      fi
    } >> "$GITHUB_STEP_SUMMARY"