# ci: run TPU workload smoke test #7

name: GKE Connectivity Smoke
# Triggers:
#   - push / pull_request: only when the workflow file at the path below changes
#     (presumably this file -- confirm the path matches its location).
#   - workflow_dispatch: manual runs; every input is optional or has a default.
on:
  push:
    paths:
      - ".github/workflows/gke-connectivity-smoke.yml"
  pull_request:
    paths:
      - ".github/workflows/gke-connectivity-smoke.yml"
  workflow_dispatch:
    inputs:
      project_id:
        description: "GCP project ID. Defaults to vars.GCP_PROJECT_ID."
        required: false
        type: string
      cluster_name:
        description: "GKE cluster name. Defaults to vars.GKE_CLUSTER_NAME."
        required: false
        type: string
      cluster_location:
        description: "GKE cluster zone or region. Defaults to vars.GKE_CLUSTER_LOCATION."
        required: false
        type: string
      workload_identity_provider:
        description: "Workload Identity Provider. Defaults to vars.GCP_WORKLOAD_IDENTITY_PROVIDER."
        required: false
        type: string
      node_pool:
        description: "Existing GKE node pool to verify."
        required: true
        default: "v6e-4x4-perf-accuracy-tests"
        type: string
      namespace:
        description: "Namespace for Kubernetes RBAC checks."
        required: true
        default: "default"
        type: string
      # Opt-in for manual runs: the TPU Job is only created when this is true.
      run_workload:
        description: "Create and run a short multi-host TPU Kubernetes Job."
        required: true
        default: false
        type: boolean
# One concurrency group per Git ref. An in-progress run is never cancelled by
# a newer run for the same ref.
concurrency:
  group: gke-connectivity-smoke-${{ github.ref }}
  cancel-in-progress: false
# Least-privilege token: read-only repository contents, plus id-token: write so
# the job can request the OIDC token used by the Workload Identity auth steps.
permissions:
  contents: read
  id-token: write
jobs:
  # Verifies that the runner can authenticate to GCP, see the target GKE node
  # pool, talk to the cluster API, and (optionally) run a short multi-host TPU Job.
  connectivity:
    runs-on: ubuntu-latest
    timeout-minutes: 60
    env:
      # Manual-dispatch inputs take precedence; repository variables are the fallback.
      PROJECT_ID: ${{ inputs.project_id || vars.GCP_PROJECT_ID }}
      CLUSTER_NAME: ${{ inputs.cluster_name || vars.GKE_CLUSTER_NAME }}
      CLUSTER_LOCATION: ${{ inputs.cluster_location || vars.GKE_CLUSTER_LOCATION }}
      WORKLOAD_IDENTITY_PROVIDER: ${{ inputs.workload_identity_provider || vars.GCP_WORKLOAD_IDENTITY_PROVIDER }}
      # Optional: when empty, the auth step that passes no service account is used.
      SERVICE_ACCOUNT: ${{ vars.GCP_SERVICE_ACCOUNT }}
      # Inputs are empty on push/pull_request, so repeat the dispatch defaults here.
      NODE_POOL: ${{ inputs.node_pool || 'v6e-4x4-perf-accuracy-tests' }}
      NAMESPACE: ${{ inputs.namespace || 'default' }}
      # 'true' on push events or when the run_workload input is true; 'false' otherwise.
      RUN_WORKLOAD: ${{ inputs.run_workload || github.event_name == 'push' }}
      # Unique per workflow run so Job and Service names do not collide across runs.
      WORKLOAD_NAME: gke-tpu-smoke-${{ github.run_id }}
    steps:
      # Fail fast, reporting every missing setting at once rather than one per run.
      - name: Validate smoke test configuration
        shell: bash
        run: |
          set -euo pipefail
          missing=0
          for name in PROJECT_ID CLUSTER_NAME CLUSTER_LOCATION WORKLOAD_IDENTITY_PROVIDER NODE_POOL NAMESPACE; do
            # ${!name} is bash indirect expansion: the value of the variable named by $name.
            if [[ -z "${!name}" ]]; then
              echo "::error::${name} is required. Set the workflow input or the matching repository variable."
              missing=1
            fi
          done
          if [[ "${missing}" -ne 0 ]]; then
            exit 1
          fi
          echo "project=${PROJECT_ID}"
          echo "cluster=${CLUSTER_NAME}"
          echo "location=${CLUSTER_LOCATION}"
          echo "node_pool=${NODE_POOL}"
          echo "namespace=${NAMESPACE}"

      # Exactly one of the two auth steps runs, depending on whether a service
      # account is configured.
      - name: Authenticate to Google Cloud
        if: env.SERVICE_ACCOUNT == ''
        uses: google-github-actions/auth@v3
        with:
          project_id: ${{ env.PROJECT_ID }}
          workload_identity_provider: ${{ env.WORKLOAD_IDENTITY_PROVIDER }}

      - name: Authenticate to Google Cloud with service account
        if: env.SERVICE_ACCOUNT != ''
        uses: google-github-actions/auth@v3
        with:
          project_id: ${{ env.PROJECT_ID }}
          workload_identity_provider: ${{ env.WORKLOAD_IDENTITY_PROVIDER }}
          service_account: ${{ env.SERVICE_ACCOUNT }}

      - name: Set up Cloud SDK
        uses: google-github-actions/setup-gcloud@v3
        with:
          version: ">= 490.0.0"
          project_id: ${{ env.PROJECT_ID }}

      # Fails (non-zero gcloud exit) if the node pool cannot be described.
      - name: Verify GKE node pool through GCP API
        shell: bash
        run: |
          set -euo pipefail
          gcloud container node-pools describe "${NODE_POOL}" \
            --project="${PROJECT_ID}" \
            --cluster="${CLUSTER_NAME}" \
            --location="${CLUSTER_LOCATION}" \
            --format="table(name,status,config.machineType,autoscaling.enabled,autoscaling.minNodeCount,autoscaling.maxNodeCount)"

      - name: Get GKE credentials
        uses: google-github-actions/get-gke-credentials@v3
        with:
          project_id: ${{ env.PROJECT_ID }}
          cluster_name: ${{ env.CLUSTER_NAME }}
          location: ${{ env.CLUSTER_LOCATION }}

      # Under `set -e`, any failing kubectl command here (including a denied
      # `auth can-i` check) fails the step.
      - name: Verify Kubernetes access
        shell: bash
        run: |
          set -euo pipefail
          kubectl cluster-info
          kubectl auth can-i get nodes
          kubectl auth can-i create jobs --namespace "${NAMESPACE}"
          kubectl auth can-i create services --namespace "${NAMESPACE}"
          kubectl get nodes -l "cloud.google.com/gke-nodepool=${NODE_POOL}" -o wide

      # Creates a headless Service plus an Indexed Job (one pod per TPU host),
      # waits for the pods to become Ready, prints their logs, then waits for
      # the Job to complete.
      - name: Run multi-host TPU workload smoke test
        if: env.RUN_WORKLOAD == 'true'
        shell: bash
        run: |
          set -euo pipefail
          # Number of Job pods; used for parallelism/completions and the pod poll below.
          expected_pods=4
          # The heredoc delimiter is unquoted so the runner expands WORKLOAD_NAME,
          # NAMESPACE, NODE_POOL and expected_pods into the manifest. Anything that
          # must be evaluated inside the container is escaped with a backslash.
          cat > /tmp/tpu-smoke.yaml <<EOF
          apiVersion: v1
          kind: Service
          metadata:
            name: ${WORKLOAD_NAME}-headless
            namespace: ${NAMESPACE}
          spec:
            clusterIP: None
            selector:
              job-name: ${WORKLOAD_NAME}
          ---
          apiVersion: batch/v1
          kind: Job
          metadata:
            name: ${WORKLOAD_NAME}
            namespace: ${NAMESPACE}
          spec:
            completionMode: Indexed
            parallelism: ${expected_pods}
            completions: ${expected_pods}
            backoffLimit: 0
            activeDeadlineSeconds: 2400
            ttlSecondsAfterFinished: 600
            template:
              spec:
                subdomain: ${WORKLOAD_NAME}-headless
                restartPolicy: Never
                nodeSelector:
                  cloud.google.com/gke-tpu-accelerator: tpu-v6e-slice
                  cloud.google.com/gke-tpu-topology: 4x4
                  cloud.google.com/gke-nodepool: ${NODE_POOL}
                tolerations:
                  - key: google.com/tpu
                    operator: Equal
                    value: present
                    effect: NoSchedule
                containers:
                  - name: tpu-smoke
                    image: us-docker.pkg.dev/cloud-tpu-images/jax-ai-image/tpu:jax0.8.1-rev1
                    command:
                      - /bin/bash
                      - -lc
                      - |
                        set -euxo pipefail
                        echo "pod=\$(hostname)"
                        echo "job_index=\${JOB_COMPLETION_INDEX:-unknown}"
                        ls -la /dev/vfio || true
                        sleep 120
                    resources:
                      requests:
                        google.com/tpu: "4"
                      limits:
                        google.com/tpu: "4"
          EOF
          kubectl apply -f /tmp/tpu-smoke.yaml
          kubectl get job "${WORKLOAD_NAME}" --namespace "${NAMESPACE}" -o wide
          # `kubectl wait` with a selector errors out immediately when nothing
          # matches yet, so poll (up to ~5 minutes) until the Job controller has
          # created all pods.
          pod_count=0
          for _ in $(seq 1 60); do
            pod_count="$(kubectl get pods \
              --selector="job-name=${WORKLOAD_NAME}" \
              --namespace "${NAMESPACE}" \
              --output=name | wc -l)"
            if [[ "${pod_count}" -ge "${expected_pods}" ]]; then
              break
            fi
            sleep 5
          done
          if [[ "${pod_count}" -lt "${expected_pods}" ]]; then
            echo "::error::Only ${pod_count}/${expected_pods} pods were created for job ${WORKLOAD_NAME}."
            exit 1
          fi
          # NOTE(review): a pod that finishes before its siblings become Ready will
          # never report Ready again; the 120s sleep in the container is the only
          # margin against that -- confirm it is sufficient for this node pool.
          kubectl wait --for=condition=Ready pod \
            --selector="job-name=${WORKLOAD_NAME}" \
            --namespace "${NAMESPACE}" \
            --timeout=30m
          # --tail=-1: with a selector, kubectl logs otherwise shows only the
          # last 10 lines per container.
          kubectl logs --selector="job-name=${WORKLOAD_NAME}" \
            --namespace "${NAMESPACE}" \
            --all-containers=true \
            --prefix=true \
            --tail=-1
          kubectl wait --for=condition=complete "job/${WORKLOAD_NAME}" \
            --namespace "${NAMESPACE}" \
            --timeout=5m

      # Best-effort diagnostics: every command tolerates failure so this step
      # never masks the original error.
      - name: Diagnose TPU workload failure
        if: failure() && env.RUN_WORKLOAD == 'true'
        shell: bash
        run: |
          set -euo pipefail
          kubectl get pods --namespace "${NAMESPACE}" --selector="job-name=${WORKLOAD_NAME}" -o wide || true
          kubectl describe pods --namespace "${NAMESPACE}" --selector="job-name=${WORKLOAD_NAME}" || true
          kubectl logs --namespace "${NAMESPACE}" --selector="job-name=${WORKLOAD_NAME}" --all-containers=true --prefix=true --tail=-1 || true

      # Runs even after failure or cancellation. Both deletes are always
      # attempted; the step fails afterwards if either one failed.
      - name: Cleanup TPU workload smoke test
        if: always() && env.RUN_WORKLOAD == 'true'
        shell: bash
        run: |
          set -euo pipefail
          status=0
          kubectl delete job "${WORKLOAD_NAME}" --namespace "${NAMESPACE}" --ignore-not-found=true || status=$?
          kubectl delete service "${WORKLOAD_NAME}-headless" --namespace "${NAMESPACE}" --ignore-not-found=true || status=$?
          exit "${status}"