# IBM Cloud Hot Cluster Setup #25
# Manually dispatched workflow; all parameters below come from the dispatch form.
name: IBM Cloud Hot Cluster Setup
on:
  workflow_dispatch:
    inputs:
      # Selects which provisioning path the job follows.
      infrastructure_type:
        type: choice
        description: "Infrastructure type"
        required: true
        default: "ipi"
        options:
          - ipi
          - vpc
          - classic
      cluster_name:
        type: string
        description: "Cluster name"
        required: true
        default: "kubevirt-plugin-ci"
      zone:
        type: string
        description: "Zone (classic: wdc04, fra02; vpc/ipi: us-south-1, eu-de-1)"
        required: true
        default: "us-south-1"
      openshift_version:
        type: string
        description: "OpenShift version (e.g. 4.20_openshift, 4.22_openshift)"
        required: true
        default: "4.20_openshift"
      worker_flavor:
        type: string
        description: "Worker node flavor (classic: mb4c.4x32; vpc/ipi: bx2.8x32)"
        required: true
        default: "bx2.8x32"
      # Kept as a string input; it is passed verbatim to the CLI / install template.
      worker_count:
        type: string
        description: "Number of worker nodes (at least 2 so ingress is happy)"
        required: true
        default: "2"
      kvm_emulation:
        type: boolean
        description: "KVM emulation (true for vpc/shared, false for bare metal)"
        required: true
        default: true
      # Optional; the VPC cluster step looks up or creates an instance when empty.
      cos_instance_crn:
        type: string
        description: "COS instance CRN for VPC internal registry (auto-created if empty)"
        required: false
        default: ""
# The workflow token only needs to read repository contents.
permissions:
  contents: read
# Workflow-wide variables available to every step's shell; each falls back to
# the same default the dispatch form uses.
env:
  INFRASTRUCTURE_TYPE: ${{ inputs.infrastructure_type || 'ipi' }}
  CLUSTER_NAME: ${{ inputs.cluster_name || 'kubevirt-plugin-ci' }}
jobs:
  # Single job that provisions the cluster and then bootstraps the CI tooling on it.
  provision-cluster:
    name: Provision OpenShift Cluster
    timeout-minutes: 360
    runs-on: ubuntu-latest
    steps:
# Fetch the repository so the ./ci-scripts helpers invoked by later steps are present.
- uses: actions/checkout@v5
  name: Checkout
# Run the IBM Cloud CLI setup action with the IC_KEY secret as API key,
# region eu-de, group cnv-ui, and the three plugins the later steps call.
- name: Setup IBM Cloud CLI
  uses: IBM/actions-ibmcloud-cli@v1
  with:
    plugins: kubernetes-service, container-registry, vpc-infrastructure
    group: cnv-ui
    region: eu-de
    api_key: ${{ secrets.IC_KEY }}
# Best-effort diagnostics: a failure here must not stop provisioning
# (continue-on-error). The id is referenced by the artifact upload step.
- id: iam_diagnostics
  name: Log IBM Cloud IAM diagnostics
  continue-on-error: true
  env:
    INFRASTRUCTURE_TYPE: ${{ inputs.infrastructure_type || 'ipi' }}
    WORKER_ZONE: ${{ inputs.zone }}
  run: |
    bash ./ci-scripts/log-ibmcloud-iam-diagnostics.sh
# Publish the diagnostics text file as an artifact whenever the diagnostics
# step was not skipped, regardless of the outcome of earlier steps.
- name: Upload IAM diagnostics log
  uses: actions/upload-artifact@v6
  if: always() && steps.iam_diagnostics.outcome != 'skipped'
  continue-on-error: true
  with:
    path: ${{ runner.temp }}/ibmcloud-iam-diagnostics.txt
    name: ibmcloud-iam-diagnostics-${{ github.run_id }}
    if-no-files-found: warn
    retention-days: 14
# ROKS paths only (classic/vpc): record in the `exists` output whether a
# cluster with the requested name can already be fetched.
- name: Check for existing ROKS cluster
  id: check_cluster
  if: inputs.infrastructure_type != 'ipi'
  run: |
    # Any failure of `cluster get` is treated as "cluster not present".
    if ! ibmcloud oc cluster get --cluster "${CLUSTER_NAME}" &>/dev/null; then
      echo "Cluster '${CLUSTER_NAME}' does not exist, will create"
      echo "exists=false" >> "$GITHUB_OUTPUT"
    else
      echo "Cluster '${CLUSTER_NAME}' already exists"
      echo "exists=true" >> "$GITHUB_OUTPUT"
    fi
# ──────────────────────────────────────────────────────────────────────
# Classic infrastructure path
# ──────────────────────────────────────────────────────────────────────
# Classic path only: confirm the requested zone exists and offers the requested
# worker flavor before a cluster create is attempted.
# Exit codes: 1 = unknown zone, 2 = flavor not offered in that zone.
- name: Verify zone and flavor (classic)
  if: steps.check_cluster.outputs.exists == 'false' && inputs.infrastructure_type == 'classic'
  env:
    ZONE: ${{ inputs.zone }}
    FLAVOR: ${{ inputs.worker_flavor }}
  run: |
    # Without pipefail a failing CLI call would be masked by the jq/sort that
    # follows it in the pipeline.
    set -o pipefail
    echo "Fetching classic infrastructure locations and flavors..."
    LOCATIONS_JSON=$(
      ibmcloud oc locations --provider classic --show-flavors --output json |
        jq '[.[] | select(.kind=="dc")]'
    )
    echo "Checking zone '${ZONE}' exists..."
    ZONE_EXISTS=$(
      echo "${LOCATIONS_JSON}" |
        jq -r --arg z "${ZONE}" '[.[] | select(.id == $z)] | length'
    )
    if [[ "${ZONE_EXISTS}" -ne 1 ]]; then
      echo "ERROR: Zone '${ZONE}' not found in classic infrastructure locations."
      echo ""
      echo "Available zones:"
      echo "${LOCATIONS_JSON}" | jq -r '.[].id' | sort
      exit 1
    fi
    echo "Zone '${ZONE}' exists"
    echo "Checking flavor '${FLAVOR}' is available in zone '${ZONE}'..."
    # Normalise `.flavors` to one flavor ID per line. The previous check read it
    # as a comma-separated string while the error listing read it as an array of
    # objects; both shapes are accepted here so the check and the listing agree.
    # NOTE(review): confirm the actual shape returned by the CLI.
    ZONE_FLAVORS=$(
      echo "${LOCATIONS_JSON}" |
        jq -r --arg z "${ZONE}" '
          .[] | select(.id == $z) | (.flavors // [])
          | if type == "string" then split(",") | map(gsub("^\\s+|\\s+$"; ""))
            elif type == "array" then map(if type == "object" then .id else . end)
            else [] end
          | .[] | select(. != null)
        ' | sort
    )
    # Whole-line fixed-string match: a flavor that is merely a prefix/substring
    # of another flavor name must not be accepted.
    if ! grep -Fxq -- "${FLAVOR}" <<< "${ZONE_FLAVORS}"; then
      echo "ERROR: Flavor '${FLAVOR}' is not available in zone '${ZONE}'."
      echo ""
      echo "Available flavors in '${ZONE}':"
      echo "${ZONE_FLAVORS}"
      exit 2
    fi
    echo "Flavor '${FLAVOR}' is available in zone '${ZONE}'"
# Classic path only: create the ROKS cluster, reusing the first private/public
# VLAN found in the zone when one exists.
- name: Create ROKS cluster (classic)
  if: steps.check_cluster.outputs.exists == 'false' && inputs.infrastructure_type == 'classic'
  env:
    ZONE: ${{ inputs.zone }}
    # Free-form inputs are passed through the environment rather than being
    # expanded into the script text, so their content is never parsed as shell.
    OPENSHIFT_VERSION: ${{ inputs.openshift_version }}
    WORKER_FLAVOR: ${{ inputs.worker_flavor }}
    WORKER_COUNT: ${{ inputs.worker_count }}
  run: |
    echo "Looking up existing VLANs in zone '${ZONE}'..."
    # A failed listing is treated the same as "no VLANs".
    VLAN_JSON=$(ibmcloud oc vlan ls --zone "${ZONE}" --output json 2>/dev/null || echo "[]")
    PRIVATE_VLAN=$(echo "${VLAN_JSON}" | jq -r '[.[] | select(.type == "private")] | first | .id // empty')
    PUBLIC_VLAN=$(echo "${VLAN_JSON}" | jq -r '[.[] | select(.type == "public")] | first | .id // empty')
    # Only pass VLAN flags that have a value. Passing an empty --private-vlan /
    # --public-vlan contradicts the "new VLANs will be created" path below,
    # which requires the flags to be absent.
    VLAN_ARGS=()
    if [[ -n "${PRIVATE_VLAN}" ]]; then
      echo "Reusing existing private VLAN: ${PRIVATE_VLAN}"
      VLAN_ARGS+=(--private-vlan "${PRIVATE_VLAN}")
      if [[ -n "${PUBLIC_VLAN}" ]]; then
        echo "Reusing existing public VLAN: ${PUBLIC_VLAN}"
        VLAN_ARGS+=(--public-vlan "${PUBLIC_VLAN}")
      else
        echo "Reusing existing public VLAN: (none)"
      fi
    else
      echo "No existing VLANs in zone, new VLANs will be created"
    fi
    echo "Creating cluster '${CLUSTER_NAME}' with ${WORKER_COUNT}x ${WORKER_FLAVOR} workers in zone ${ZONE}..."
    ibmcloud oc cluster create classic \
      --name "${CLUSTER_NAME}" \
      --version "${OPENSHIFT_VERSION}" \
      --flavor "${WORKER_FLAVOR}" \
      --workers "${WORKER_COUNT}" \
      --zone "${ZONE}" \
      "${VLAN_ARGS[@]}"
# ──────────────────────────────────────────────────────────────────────
# VPC Gen2 infrastructure path
# ──────────────────────────────────────────────────────────────────────
# VPC path only: find or create the VPC, subnet and public gateway named after
# the cluster, attach the gateway to the subnet, and publish vpc_id, subnet_id
# and vpc_region as step outputs.
- name: Provision VPC resources
  id: vpc_resources
  if: steps.check_cluster.outputs.exists == 'false' && inputs.infrastructure_type == 'vpc'
  env:
    ZONE: ${{ inputs.zone }}
    VPC_NAME: ${{ inputs.cluster_name || 'kubevirt-plugin-ci' }}-vpc
  run: |
    # Look up a resource ID by exact name; prints nothing when there is no match
    # or the listing fails. $1 = `ibmcloud is` list sub-command, $2 = name.
    find_id() {
      ibmcloud is "$1" --output json 2>/dev/null |
        jq -r --arg n "$2" '.[] | select(.name == $n) | .id // empty'
    }
    # Stop with a clear error when a create call yielded no usable ID. jq prints
    # nothing (or "null") when the CLI call feeding it failed, and the step
    # would otherwise carry on with an empty ID. $1 = label, $2 = ID.
    require_id() {
      if [[ -z "$2" || "$2" == "null" ]]; then
        echo "::error::Could not determine the ID of $1."
        exit 1
      fi
    }

    echo "=== VPC Gen2 provisioning ==="
    # Region is the zone name minus its trailing "-<n>" suffix.
    VPC_REGION="${ZONE%-*}"
    echo "VPC region: ${VPC_REGION}, zone: ${ZONE}"
    ibmcloud target -r "${VPC_REGION}"

    VPC_ID=$(find_id vpcs "${VPC_NAME}")
    if [[ -z "${VPC_ID}" ]]; then
      echo "Creating VPC '${VPC_NAME}'..."
      VPC_ID=$(ibmcloud is vpc-create "${VPC_NAME}" --output json | jq -r '.id')
      require_id "VPC '${VPC_NAME}'" "${VPC_ID}"
      echo "Created VPC: ${VPC_ID}"
    else
      echo "Reusing existing VPC '${VPC_NAME}': ${VPC_ID}"
    fi

    SUBNET_NAME="${VPC_NAME}-subnet-${ZONE}"
    SUBNET_ID=$(find_id subnets "${SUBNET_NAME}")
    if [[ -z "${SUBNET_ID}" ]]; then
      echo "Creating subnet '${SUBNET_NAME}' in zone '${ZONE}'..."
      SUBNET_ID=$(ibmcloud is subnet-create "${SUBNET_NAME}" "${VPC_ID}" --zone "${ZONE}" --ipv4-address-count 256 --output json | jq -r '.id')
      require_id "subnet '${SUBNET_NAME}'" "${SUBNET_ID}"
      echo "Created subnet: ${SUBNET_ID}"
    else
      echo "Reusing existing subnet '${SUBNET_NAME}': ${SUBNET_ID}"
    fi

    GW_NAME="${VPC_NAME}-gw-${ZONE}"
    GW_ID=$(find_id public-gateways "${GW_NAME}")
    if [[ -z "${GW_ID}" ]]; then
      echo "Creating public gateway '${GW_NAME}'..."
      GW_ID=$(ibmcloud is public-gateway-create "${GW_NAME}" "${VPC_ID}" "${ZONE}" --output json | jq -r '.id')
      require_id "public gateway '${GW_NAME}'" "${GW_ID}"
      echo "Created public gateway: ${GW_ID}"
    else
      echo "Reusing existing public gateway '${GW_NAME}': ${GW_ID}"
    fi

    echo "Attaching public gateway to subnet..."
    # Still best-effort, but a failure is surfaced as a warning instead of
    # being discarded.
    if ! ibmcloud is subnet-update "${SUBNET_ID}" --pgw "${GW_ID}"; then
      echo "::warning::Could not attach public gateway ${GW_ID} to subnet ${SUBNET_ID}."
    fi

    {
      echo "vpc_id=${VPC_ID}"
      echo "subnet_id=${SUBNET_ID}"
      echo "vpc_region=${VPC_REGION}"
    } >> "$GITHUB_OUTPUT"
# VPC path only: resolve (or create) the COS instance backing the internal
# registry, then create the VPC Gen2 ROKS cluster in the provisioned VPC/subnet.
- name: Create ROKS cluster (vpc)
  if: steps.check_cluster.outputs.exists == 'false' && inputs.infrastructure_type == 'vpc'
  env:
    ZONE: ${{ inputs.zone }}
    VPC_ID: ${{ steps.vpc_resources.outputs.vpc_id }}
    SUBNET_ID: ${{ steps.vpc_resources.outputs.subnet_id }}
    COS_CRN: ${{ inputs.cos_instance_crn }}
    COS_INSTANCE_NAME: ${{ inputs.cluster_name || 'kubevirt-plugin-ci' }}-cos
    # Free-form inputs are passed through the environment rather than being
    # expanded into the script text, so their content is never parsed as shell.
    OPENSHIFT_VERSION: ${{ inputs.openshift_version }}
    WORKER_FLAVOR: ${{ inputs.worker_flavor }}
    WORKER_COUNT: ${{ inputs.worker_count }}
  run: |
    if [[ -z "${COS_CRN}" ]]; then
      echo "No COS CRN provided — looking for existing COS instance '${COS_INSTANCE_NAME}'..."
      # Lookup is best-effort: a failed listing is treated as "not found".
      COS_CRN=$(ibmcloud resource service-instances --service-name cloud-object-storage --output json 2>/dev/null \
        | jq -r --arg n "${COS_INSTANCE_NAME}" '.[] | select(.name == $n) | .crn // empty' || true)
      if [[ -z "${COS_CRN}" ]]; then
        echo "Creating COS instance '${COS_INSTANCE_NAME}'..."
        # The UUID is the service plan identifier passed to service-instance-create.
        ibmcloud resource service-instance-create "${COS_INSTANCE_NAME}" cloud-object-storage \
          744bfc56-d12c-4866-88d5-dac9139e0e5d global \
          -d premium-global-deployment
        COS_CRN=$(ibmcloud resource service-instances --service-name cloud-object-storage --output json \
          | jq -r --arg n "${COS_INSTANCE_NAME}" '.[] | select(.name == $n) | .crn // empty')
        # Do not start a cluster create with an empty --cos-instance value.
        if [[ -z "${COS_CRN}" ]]; then
          echo "::error::COS instance '${COS_INSTANCE_NAME}' was not found after creation."
          exit 1
        fi
        echo "Created COS instance: ${COS_CRN}"
      else
        echo "Reusing existing COS instance: ${COS_CRN}"
      fi
    fi
    echo "Creating VPC cluster '${CLUSTER_NAME}' with ${WORKER_COUNT}x ${WORKER_FLAVOR} workers in zone ${ZONE}..."
    ibmcloud oc cluster create vpc-gen2 \
      --name "${CLUSTER_NAME}" \
      --version "${OPENSHIFT_VERSION}" \
      --flavor "${WORKER_FLAVOR}" \
      --workers "${WORKER_COUNT}" \
      --zone "${ZONE}" \
      --vpc-id "${VPC_ID}" \
      --subnet-id "${SUBNET_ID}" \
      --cos-instance "${COS_CRN}" \
      --disable-outbound-traffic-protection
# ──────────────────────────────────────────────────────────────────────
# IPI (self-managed OpenShift) path
# ──────────────────────────────────────────────────────────────────────
# IPI path only: download the installer, oc/kubectl and (when published) ccoctl
# for the requested channel, then expose the channel and the installer's
# reported version as step outputs.
- name: Install openshift-install, oc, and ccoctl
  id: ipi_tools
  if: inputs.infrastructure_type == 'ipi'
  env:
    OC_VERSION_INPUT: ${{ inputs.openshift_version }}
  run: |
    # Fail the pipeline when curl fails, not only when tar does.
    set -o pipefail
    # "4.20_openshift" -> "stable-4.20": drop everything from the first "_".
    OCP_CHANNEL="stable-${OC_VERSION_INPUT%%_*}"
    echo "Resolving ${OCP_CHANNEL} to latest patch version..."
    MIRROR="https://mirror.openshift.com/pub/openshift-v4/x86_64/clients/ocp/${OCP_CHANNEL}"

    # -f turns an HTTP error into a curl failure instead of streaming the error
    # page into tar; -S keeps the error message visible despite -s.
    echo "Downloading openshift-install..."
    curl -fsSL "${MIRROR}/openshift-install-linux.tar.gz" | tar -xz -C /usr/local/bin openshift-install
    openshift-install version

    echo "Downloading oc + kubectl..."
    curl -fsSL "${MIRROR}/openshift-client-linux.tar.gz" | tar -xz -C /usr/local/bin oc kubectl
    oc version --client

    # ccoctl is optional: a failed download is reported and tolerated.
    echo "Downloading ccoctl..."
    curl -fsSL "${MIRROR}/ccoctl-linux.tar.gz" 2>/dev/null | tar -xz -C /usr/local/bin ccoctl 2>/dev/null || echo "ccoctl not available for this version"

    # Second field of the first line of `openshift-install version`.
    RESOLVED_VERSION=$(openshift-install version | awk 'NR == 1 {print $2}')
    echo "ocp_channel=${OCP_CHANNEL}" >> "$GITHUB_OUTPUT"
    echo "ocp_version=${RESOLVED_VERSION}" >> "$GITHUB_OUTPUT"
# IPI path only: render ci-scripts/ipi-install-config.yaml.tpl into
# install-config.yaml using the step environment plus a freshly generated SSH key.
- name: Generate IPI install-config
  if: inputs.infrastructure_type == 'ipi'
  env:
    PULL_SECRET: ${{ secrets.OPENSHIFT_PULL_SECRET }}
    ZONE: ${{ inputs.zone }}
    WORKER_COUNT: ${{ inputs.worker_count }}
    WORKER_FLAVOR: ${{ inputs.worker_flavor }}
  run: |
    INSTALL_DIR="${RUNNER_TEMP}/ipi-install"
    mkdir -p "${INSTALL_DIR}"

    # Throw-away key pair with an empty passphrase, created for this install.
    ssh-keygen -t ed25519 -f "${INSTALL_DIR}/ssh-key" -N "" -q

    # Values exported for envsubst: region is the zone minus its "-<n>" suffix,
    # and the flavor has every "." replaced with "-".
    export VPC_REGION="${ZONE%-*}"
    export IPI_WORKER_FLAVOR="${WORKER_FLAVOR//./-}"
    SSH_PUB=$(<"${INSTALL_DIR}/ssh-key.pub")
    export SSH_PUB

    envsubst \
      < "${GITHUB_WORKSPACE}/ci-scripts/ipi-install-config.yaml.tpl" \
      > "${INSTALL_DIR}/install-config.yaml"
    echo "install-config.yaml generated at ${INSTALL_DIR}"

    # Print the rendered file in a collapsed log group with the pull secret line masked.
    echo "::group::install-config.yaml (redacted)"
    sed 's/pullSecret:.*/pullSecret: REDACTED/' "${INSTALL_DIR}/install-config.yaml"
    echo "::endgroup::"
# IPI path only: create the install manifests, extract the release's
# CredentialsRequests and, when any were extracted, process them with ccoctl.
# Extraction and ccoctl remain best-effort, but failures now raise a warning
# annotation instead of being hidden.
- name: Generate CCO manifests for IBM Cloud
  if: inputs.infrastructure_type == 'ipi'
  env:
    IC_API_KEY: ${{ secrets.IC_KEY }}
    INSTALL_DIR: ${{ runner.temp }}/ipi-install
  run: |
    export IC_API_KEY
    echo "Creating install manifests..."
    openshift-install create manifests --dir="${INSTALL_DIR}"

    echo "Extracting CredentialsRequests..."
    CRED_DIR="${INSTALL_DIR}/credreqs"
    mkdir -p "${CRED_DIR}"
    # Third field of the "release image" line printed by the installer.
    RELEASE_IMAGE=$(openshift-install version | awk '/release image/ {print $3}')
    if ! oc adm release extract \
        --credentials-requests \
        --cloud=ibmcloud \
        --to="${CRED_DIR}" \
        "${RELEASE_IMAGE}"; then
      echo "::warning::could not extract credentials requests; falling back to installer defaults"
    fi

    if ls "${CRED_DIR}"/*.yaml &>/dev/null; then
      echo "Processing CredentialsRequests with ccoctl..."
      if ! command -v ccoctl >/dev/null; then
        echo "::warning::ccoctl is not installed; CredentialsRequests were not processed."
      elif ! ccoctl ibmcloud create-service-id \
          --credentials-requests-dir="${CRED_DIR}" \
          --name="${CLUSTER_NAME}" \
          --output-dir="${INSTALL_DIR}" 2>&1; then
        echo "::warning::ccoctl failed to process CredentialsRequests; the install may fail later."
      fi
    fi
# IPI path only: run the installer and keep a copy of its console output in
# install.log inside the install directory.
- name: Create IPI cluster
  id: ipi_create
  if: inputs.infrastructure_type == 'ipi'
  env:
    IC_API_KEY: ${{ secrets.IC_KEY }}
    INSTALL_DIR: ${{ runner.temp }}/ipi-install
  run: |
    # Without pipefail the pipeline's status is tee's, so a failed install
    # would pass this step and the cleanup-on-failure step would never fire.
    set -o pipefail
    export IC_API_KEY
    echo "Running openshift-install create cluster..."
    openshift-install create cluster --dir="${INSTALL_DIR}" --log-level=info 2>&1 | tee "${INSTALL_DIR}/install.log"
# IPI path only: always preserve the installer state so a failed or later
# teardown can be driven from the artifact.
# NOTE(review): auth/ holds the cluster kubeconfig (the "Configure kubeconfig"
# step reads auth/kubeconfig); confirm who can download this artifact.
- name: Upload IPI install artifacts
  if: inputs.infrastructure_type == 'ipi' && always()
  uses: actions/upload-artifact@v6
  with:
    name: ipi-install-state-${{ github.run_id }}
    path: |
      ${{ runner.temp }}/ipi-install/metadata.json
      ${{ runner.temp }}/ipi-install/terraform.tfstate
      ${{ runner.temp }}/ipi-install/auth/
      ${{ runner.temp }}/ipi-install/.openshift_install.log
    # .openshift_install.log is a dot-file; upload-artifact leaves hidden files
    # out unless this is enabled, so the log would otherwise be missing.
    include-hidden-files: true
    retention-days: 30
    if-no-files-found: ignore
# IPI path only, and only after an earlier step failed: destroy whatever the
# installer created, provided it got far enough to write metadata.json.
- name: IPI cleanup on failure
  if: inputs.infrastructure_type == 'ipi' && failure()
  env:
    INSTALL_DIR: ${{ runner.temp }}/ipi-install
    IC_API_KEY: ${{ secrets.IC_KEY }}
  run: |
    export IC_API_KEY
    if [[ ! -f "${INSTALL_DIR}/metadata.json" ]]; then
      echo "No metadata.json found — nothing to clean up."
    else
      echo "Install failed — cleaning up IPI resources..."
      # Only the last 50 lines are logged; a destroy failure does not fail this step.
      openshift-install destroy cluster --dir="${INSTALL_DIR}" --log-level=info 2>&1 | tail -50 || true
    fi
# ──────────────────────────────────────────────────────────────────────
# Common bootstrap steps (all paths converge here)
# ──────────────────────────────────────────────────────────────────────
# ROKS paths only (classic/vpc): delegate the readiness wait to the helper script.
- if: inputs.infrastructure_type != 'ipi'
  name: Wait for ROKS cluster ready
  run: ./ci-scripts/check-roks-cluster-state.sh
# ROKS paths only (classic/vpc): hand the cluster description to the helper
# script through the CLUSTER_JSON environment variable.
- if: inputs.infrastructure_type != 'ipi'
  name: Install oc client
  run: |
    # Assignment and export stay separate so a failing `cluster get` aborts the step.
    CLUSTER_JSON=$(ibmcloud oc cluster get --cluster "${CLUSTER_NAME}" --output json)
    export CLUSTER_JSON
    bash ./ci-scripts/install-oc-client.sh
# All paths: point oc at the new cluster and print basic cluster information.
# IPI uses the kubeconfig written by the installer (and persists KUBECONFIG for
# later steps); ROKS fetches admin credentials via the IBM Cloud CLI.
- name: Configure kubeconfig
  env:
    OCP_CHANNEL: ${{ steps.ipi_tools.outputs.ocp_channel }}
    # Read the infrastructure type from the environment, as the IAM diagnostics
    # step does, instead of expanding the expression into the script text.
    INFRASTRUCTURE_TYPE: ${{ inputs.infrastructure_type || 'ipi' }}
  run: |
    # Make a failed download fail the pipeline, not only a failed extraction.
    set -o pipefail
    if [[ "${INFRASTRUCTURE_TYPE}" == "ipi" ]]; then
      INSTALL_DIR="${RUNNER_TEMP}/ipi-install"
      export KUBECONFIG="${INSTALL_DIR}/auth/kubeconfig"
      echo "KUBECONFIG=${KUBECONFIG}" >> "$GITHUB_ENV"
      # -f turns an HTTP error into a curl failure instead of piping the error page into tar.
      curl -fsSL "https://mirror.openshift.com/pub/openshift-v4/x86_64/clients/ocp/${OCP_CHANNEL}/openshift-client-linux.tar.gz" \
        | tar -xz -C /usr/local/bin oc kubectl
    else
      ibmcloud oc cluster config --cluster "${CLUSTER_NAME}" --admin
    fi
    oc cluster-info
    oc get nodes -o wide
# Run the HCO install script; the kvm_emulation input reaches it as KVM_EMULATION.
- name: Install HCO
  run: ./ci-scripts/install-hco.sh
  env:
    KVM_EMULATION: ${{ inputs.kvm_emulation }}
# Fail early unless a complete set of ARC credentials is configured: either all
# three GitHub App secrets, or a personal access token.
- name: Verify ARC secrets
  env:
    # Secrets are passed through the environment. Expanding them into the script
    # text would let quotes, "$" or backticks in a secret value break or alter
    # the generated shell code.
    ARC_APP_ID: ${{ secrets.ARC_GITHUB_APP_ID }}
    ARC_APP_INSTALL_ID: ${{ secrets.ARC_GITHUB_APP_INSTALL_ID }}
    ARC_APP_PRIVATE_KEY: ${{ secrets.ARC_GITHUB_APP_PRIVATE_KEY }}
    ARC_PAT: ${{ secrets.ARC_GITHUB_PAT }}
  run: |
    HAS_APP="no"
    if [[ -n "${ARC_APP_ID}" && -n "${ARC_APP_INSTALL_ID}" && -n "${ARC_APP_PRIVATE_KEY}" ]]; then
      HAS_APP="yes"
    fi
    HAS_PAT="no"
    if [[ -n "${ARC_PAT}" ]]; then
      HAS_PAT="yes"
    fi
    if [[ "$HAS_APP" != "yes" && "$HAS_PAT" != "yes" ]]; then
      echo "::error::ARC authentication secrets are missing or empty."
      echo "Configure either:"
      echo " - ARC_GITHUB_APP_ID, ARC_GITHUB_APP_INSTALL_ID, ARC_GITHUB_APP_PRIVATE_KEY (GitHub App), or"
      echo " - ARC_GITHUB_PAT (Personal Access Token)"
      echo "in Settings → Secrets and variables → Actions for this repository (or its organization)."
      exit 1
    fi
    echo "ARC secrets are present."
# Build the ARC runner image and publish the reference the script reports on
# its `IMAGE_REF=` output line as the `image_ref` step output.
- name: Build ARC runner image
  id: build_runner
  env:
    OC_VERSION: '4.20'
  run: |
    # Capture first so a failing build script aborts the step; when piped
    # straight into grep/cut its exit status was discarded.
    BUILD_OUTPUT=$(./ci-scripts/images/setup-arc-runner-image.sh)
    IMAGE_REF=$(printf '%s\n' "${BUILD_OUTPUT}" | grep '^IMAGE_REF=' | cut -d= -f2- || true)
    # An empty reference would otherwise be handed to the ARC install step.
    if [[ -z "${IMAGE_REF}" ]]; then
      echo "::error::setup-arc-runner-image.sh did not print an IMAGE_REF= line."
      exit 1
    fi
    echo "image_ref=${IMAGE_REF}" >> "$GITHUB_OUTPUT"
# Install the ARC controller and then the runner scale set; both scripts get
# their configuration from the environment defined here.
- name: Install ARC
  env:
    ARC_VERSION: '0.14.0'
    ARC_CONFIG_URL: 'https://github.com/${{ github.repository }}'
    ARC_RUNNER_IMAGE: ${{ steps.build_runner.outputs.image_ref }}
    # Credentials: GitHub App triple and/or personal access token.
    ARC_APP_ID: ${{ secrets.ARC_GITHUB_APP_ID }}
    ARC_APP_INSTALL_ID: ${{ secrets.ARC_GITHUB_APP_INSTALL_ID }}
    ARC_APP_PRIVATE_KEY: ${{ secrets.ARC_GITHUB_APP_PRIVATE_KEY }}
    ARC_PAT: ${{ secrets.ARC_GITHUB_PAT }}
  run: |
    # Order matters: controller first, then the scale set.
    ./ci-scripts/arc/install-arc-controller.sh
    ./ci-scripts/arc/install-runner-scale-set.sh
# Delegate installation of the CI environment controller to its helper script.
- name: Install CI environment controller
  run: ./ci-scripts/ci-env/install-ci-env-controller.sh
# Run the cluster health check script with the repository name in its environment.
- name: Verify cluster health
  run: ./ci-scripts/check-cluster-health.sh
  env:
    GITHUB_REPOSITORY: ${{ github.repository }}
# Always write a Markdown summary of the requested parameters and a coarse
# health verdict (whether `oc cluster-info` succeeds) to the job summary.
- name: Setup summary
  if: always()
  env:
    # Inputs are passed through the environment. Expanded into the script they
    # sit inside a double-quoted shell string, where "$(...)" or backticks in a
    # free-form input would be executed.
    INFRASTRUCTURE_TYPE: ${{ inputs.infrastructure_type || 'ipi' }}
    ZONE: ${{ inputs.zone }}
    OPENSHIFT_VERSION: ${{ inputs.openshift_version }}
    WORKER_FLAVOR: ${{ inputs.worker_flavor }}
    WORKER_COUNT: ${{ inputs.worker_count }}
    KVM_EMULATION: ${{ inputs.kvm_emulation }}
  run: |
    {
      echo "## Hot Cluster Setup Summary"
      echo ""
      echo "| Parameter | Value |"
      echo "|-----------|-------|"
      echo "| Infrastructure | \`${INFRASTRUCTURE_TYPE}\` |"
      echo "| Cluster | \`${CLUSTER_NAME}\` |"
      echo "| Zone | \`${ZONE}\` |"
      echo "| OpenShift | \`${OPENSHIFT_VERSION}\` |"
      echo "| Worker Flavor | \`${WORKER_FLAVOR}\` |"
      echo "| Workers | \`${WORKER_COUNT}\` |"
      echo "| KVM Emulation | \`${KVM_EMULATION}\` |"
      echo ""
      if oc cluster-info &>/dev/null; then
        echo "Cluster is **healthy** and ready for CI."
      else
        echo "Cluster setup **may have issues**. Check the logs and the IAM diagnostics artifact."
      fi
    } >> "$GITHUB_STEP_SUMMARY"