Add GCE GPU integration workflow

kevmo314 · kevmo314 · commit 615e553d41d6 · 2026-06-04T02:14:39.000-04:00
diff --git a/.github/workflows/gpu-integration-gcloud.yml b/.github/workflows/gpu-integration-gcloud.yml
@@ -0,0 +1,313 @@
+name: GPU Integration on GCE
+
+# Expected repository variables:
+#   GCP_PROJECT_ID, GCP_WORKLOAD_IDENTITY_PROVIDER, GCP_SERVICE_ACCOUNT
+#
+# Keep GCP_SERVICE_ACCOUNT on a custom role rather than project Owner/Editor.
+# It needs only enough Compute permissions to create/delete this one VM shape,
+# create/delete the two temporary firewall rules (allow and deny), and read
+# instance/network metadata.
+
+# Triggers: every pull request, plus manual dispatch. The inputs declared
+# below (and their defaults) exist only for manual dispatch; GitHub leaves the
+# `inputs` context empty for pull_request events.
+on:
+  pull_request:
+  workflow_dispatch:
+    inputs:
+      # Where and on what machine shape the GPU VM is created.
+      zone:
+        description: GCE zone with NVIDIA T4 quota.
+        required: true
+        default: us-central1-a
+      machine_type:
+        description: GCE machine type for one NVIDIA T4.
+        required: true
+        default: n1-standard-4
+      network:
+        description: GCE VPC network name.
+        required: true
+        default: default
+      # Software versions used by the client test container (and, for
+      # ubuntu_version, the VM image family).
+      cuda_version:
+        description: CUDA version for the client test container.
+        required: true
+        default: 13.1.0
+      ubuntu_version:
+        description: Ubuntu version for the client test container and VM image.
+        required: true
+        default: "24.04"
+      pytorch_index_url:
+        description: PyTorch wheel index URL.
+        required: true
+        default: https://download.pytorch.org/whl/cu130
+      # When true the VM is created with the SPOT provisioning model.
+      spot:
+        description: Use a Spot VM.
+        required: true
+        type: boolean
+        default: true
+
+# Least-privilege token: read the repository, and allow the job to request an
+# OIDC token (used by the auth steps via the workload identity provider).
+permissions:
+  contents: read
+  id-token: write
+
+# All runs share one concurrency group, so they do not execute in parallel.
+# A run that is already in progress is never cancelled by a newer one.
+concurrency:
+  group: gpu-integration-gce
+  cancel-in-progress: false
+
+jobs:
+  gpu-integration:
+    name: CUDA samples and PyTorch on GCE T4
+    runs-on: ubuntu-latest
+    # Matches the 6h --max-run-duration given to the VM.
+    timeout-minutes: 360
+
+    # The `inputs` context is only populated for workflow_dispatch runs. On
+    # pull_request every `inputs.*` expression is empty, so each value falls
+    # back to the default declared under `on.workflow_dispatch.inputs`.
+    # Keep the two sets of defaults in sync.
+    env:
+      GCP_PROJECT_ID: ${{ vars.GCP_PROJECT_ID }}
+      GCP_ZONE: ${{ inputs.zone || 'us-central1-a' }}
+      GCP_NETWORK: ${{ inputs.network || 'default' }}
+      MACHINE_TYPE: ${{ inputs.machine_type || 'n1-standard-4' }}
+      CUDA_VERSION: ${{ inputs.cuda_version || '13.1.0' }}
+      UBUNTU_VERSION: ${{ inputs.ubuntu_version || '24.04' }}
+      PYTORCH_INDEX_URL: ${{ inputs.pytorch_index_url || 'https://download.pytorch.org/whl/cu130' }}
+      # `inputs.spot || true` would turn an explicit `false` into `true`, so
+      # the default is applied only when the run was not manually dispatched.
+      USE_SPOT: ${{ github.event_name != 'workflow_dispatch' || inputs.spot }}
+      # Per-run, per-attempt names so retries never collide with leftovers.
+      VM_NAME: lupine-gpu-ci-${{ github.run_id }}-${{ github.run_attempt }}
+      VM_TAG: lupine-gpu-ci-${{ github.run_id }}-${{ github.run_attempt }}
+      FIREWALL_ALLOW_RULE: lupine-gpu-ci-${{ github.run_id }}-${{ github.run_attempt }}-allow
+      FIREWALL_DENY_RULE: lupine-gpu-ci-${{ github.run_id }}-${{ github.run_attempt }}-deny
+
+    steps:
+      # The repository contents are later bind-mounted into the test container.
+      - name: Check out repository
+        uses: actions/checkout@v4
+
+      # Authenticates as GCP_SERVICE_ACCOUNT through the configured workload
+      # identity provider; all three values come from repository variables.
+      - name: Authenticate to Google Cloud
+        uses: google-github-actions/auth@v3
+        with:
+          project_id: ${{ vars.GCP_PROJECT_ID }}
+          workload_identity_provider: ${{ vars.GCP_WORKLOAD_IDENTITY_PROVIDER }}
+          service_account: ${{ vars.GCP_SERVICE_ACCOUNT }}
+
+      # Provides the `gcloud` CLI used by the provisioning and teardown steps.
+      - name: Set up gcloud
+        uses: google-github-actions/setup-gcloud@v3
+
+      # Checks that the required repository variables are set and maps the
+      # requested Ubuntu release to a GCE image family (exported to later
+      # steps as VM_IMAGE_FAMILY).
+      - name: Validate Google Cloud configuration
+        env:
+          # Passed through the environment instead of being expanded into the
+          # script text, so the values are never parsed as shell syntax.
+          GCP_WORKLOAD_IDENTITY_PROVIDER: ${{ vars.GCP_WORKLOAD_IDENTITY_PROVIDER }}
+          GCP_SERVICE_ACCOUNT: ${{ vars.GCP_SERVICE_ACCOUNT }}
+        run: |
+          set -euo pipefail
+          # `${VAR:?msg}` aborts with msg when VAR is unset or empty.
+          : "${GCP_PROJECT_ID:?repository variable GCP_PROJECT_ID is not set}"
+          : "${GCP_WORKLOAD_IDENTITY_PROVIDER:?repository variable GCP_WORKLOAD_IDENTITY_PROVIDER is not set}"
+          : "${GCP_SERVICE_ACCOUNT:?repository variable GCP_SERVICE_ACCOUNT is not set}"
+          case "$UBUNTU_VERSION" in
+            24.04)
+              echo "VM_IMAGE_FAMILY=ubuntu-2404-lts-amd64" >> "$GITHUB_ENV"
+              ;;
+            22.04)
+              echo "VM_IMAGE_FAMILY=ubuntu-2204-lts" >> "$GITHUB_ENV"
+              ;;
+            *)
+              echo "Unsupported Ubuntu version for the GCE VM image: $UBUNTU_VERSION" >&2
+              exit 1
+              ;;
+          esac
+
+      # Generates a throwaway ed25519 key pair for this run and looks up the
+      # runner's public IPv4 address. Exports SSH_DIR and RUNNER_IP to later
+      # steps; RUNNER_IP becomes the only source range of the allow rule.
+      - name: Prepare SSH key and runner allowlist
+        run: |
+          set -euo pipefail
+          ssh_dir="$RUNNER_TEMP/gce-ssh"
+          mkdir -p "$ssh_dir"
+          # Restrict the directory before the private key is written into it.
+          chmod 700 "$ssh_dir"
+          ssh-keygen -t ed25519 -N '' -C "gha-${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}" -f "$ssh_dir/id_ed25519"
+          chmod 600 "$ssh_dir/id_ed25519"
+
+          # Bounded, retried lookup so one slow response does not hang or
+          # fail the whole job.
+          runner_ip="$(curl -fsS --retry 3 --retry-delay 2 --max-time 15 https://api.ipify.org)"
+          # The response is appended to $GITHUB_ENV and used as a /32 CIDR, so
+          # accept nothing but a single dotted-quad IPv4 address. This also
+          # rejects multi-line bodies that could define extra variables.
+          if [[ ! "$runner_ip" =~ ^([0-9]{1,3}\.){3}[0-9]{1,3}$ ]]; then
+            echo "Could not determine a valid public IPv4 address for this runner" >&2
+            exit 1
+          fi
+
+          {
+            echo "SSH_DIR=$ssh_dir"
+            echo "RUNNER_IP=$runner_ip"
+          } >> "$GITHUB_ENV"
+
+      # Two ingress rules scoped to this run's VM tag: the runner's address is
+      # allowed on the SSH and service ports, everything else is denied. GCE
+      # evaluates lower priority numbers first, so the allow rule (800) takes
+      # precedence over the deny rule (900).
+      - name: Create locked-down firewall rules
+        run: |
+          set -euo pipefail
+          # Flags common to both rules.
+          shared_flags=(
+            "--project=$GCP_PROJECT_ID"
+            "--network=$GCP_NETWORK"
+            "--direction=INGRESS"
+            "--target-tags=$VM_TAG"
+          )
+
+          # Allow: only the runner's /32, only the ports the tests use.
+          gcloud compute firewall-rules create "$FIREWALL_ALLOW_RULE" \
+            "${shared_flags[@]}" \
+            --priority=800 \
+            --action=ALLOW \
+            --rules=tcp:22,tcp:14900-16999,tcp:20100-20299 \
+            --source-ranges="$RUNNER_IP/32"
+
+          # Deny: every other source and protocol listed here.
+          gcloud compute firewall-rules create "$FIREWALL_DENY_RULE" \
+            "${shared_flags[@]}" \
+            --priority=900 \
+            --action=DENY \
+            --rules=tcp:1-65535,udp:1-65535,icmp,esp,ah,sctp \
+            --source-ranges=0.0.0.0/0
+
+      # Writes a small startup script (installs ca-certificates and
+      # libnghttp2-14), creates one instance with a single T4 attached and
+      # tagged with $VM_TAG, then exports its external IP as VM_IP.
+      # Access is limited to the generated key for user `gha`: the instance
+      # has no service account, project-wide SSH keys are blocked and OS Login
+      # is disabled. The VM is set to delete itself after 6h as a backstop.
+      # NOTE(review): `install-nvidia-driver=True` is metadata documented for
+      # Deep Learning VM images; confirm that the stock ubuntu-os-cloud image
+      # used here actually installs a driver, since the startup script does not.
+      - name: Create T4 VM
+        run: |
+          set -euo pipefail
+          startup_script="$RUNNER_TEMP/gce-startup.sh"
+          cat > "$startup_script" <<'EOF'
+          #!/usr/bin/env bash
+          set -euxo pipefail
+          export DEBIAN_FRONTEND=noninteractive
+          apt-get update
+          apt-get install -y --no-install-recommends ca-certificates libnghttp2-14
+          rm -rf /var/lib/apt/lists/*
+          EOF
+
+          create_args=(
+            "$VM_NAME"
+            "--project=$GCP_PROJECT_ID"
+            "--zone=$GCP_ZONE"
+            "--machine-type=$MACHINE_TYPE"
+            "--network=$GCP_NETWORK"
+            "--tags=$VM_TAG"
+            "--image-family=$VM_IMAGE_FAMILY"
+            "--image-project=ubuntu-os-cloud"
+            "--boot-disk-size=100GB"
+            "--boot-disk-type=pd-balanced"
+            "--accelerator=type=nvidia-tesla-t4,count=1"
+            "--maintenance-policy=TERMINATE"
+            "--max-run-duration=6h"
+            "--instance-termination-action=DELETE"
+            "--no-service-account"
+            "--metadata=block-project-ssh-keys=TRUE,enable-oslogin=FALSE,install-nvidia-driver=True,ssh-keys=gha:$(cat "$SSH_DIR/id_ed25519.pub")"
+            "--metadata-from-file=startup-script=$startup_script"
+            "--shielded-vtpm"
+            "--shielded-integrity-monitoring"
+            "--no-shielded-secure-boot"
+          )
+
+          if [[ "$USE_SPOT" == "true" ]]; then
+            create_args+=("--provisioning-model=SPOT")
+          fi
+
+          gcloud compute instances create "${create_args[@]}"
+
+          vm_ip="$(gcloud compute instances describe "$VM_NAME" \
+            --project="$GCP_PROJECT_ID" \
+            --zone="$GCP_ZONE" \
+            --format='value(networkInterfaces[0].accessConfigs[0].natIP)')"
+          test -n "$vm_ip"
+          echo "VM_IP=$vm_ip" >> "$GITHUB_ENV"
+
+      # Blocks until the VM accepts SSH for user `gha`, then until `nvidia-smi`
+      # can list a GPU on it. The first successful connection records the host
+      # key in $SSH_DIR/known_hosts (accept-new).
+      - name: Wait for SSH and NVIDIA driver
+        run: |
+          set -euo pipefail
+          ssh_base=(
+            ssh
+            -i "$SSH_DIR/id_ed25519"
+            -o IdentitiesOnly=yes
+            -o StrictHostKeyChecking=accept-new
+            -o UserKnownHostsFile="$SSH_DIR/known_hosts"
+            -o ConnectTimeout=10
+            "gha@$VM_IP"
+          )
+
+          # Up to 120 attempts, 5 s apart (plus up to 10 s connect timeout each).
+          ssh_ready=false
+          for _ in $(seq 1 120); do
+            if "${ssh_base[@]}" 'echo ready' >/dev/null 2>&1; then
+              ssh_ready=true
+              break
+            fi
+            sleep 5
+          done
+
+          # Fail here with a clear message instead of falling through to the
+          # driver check and surfacing an opaque ssh connection error.
+          if [[ "$ssh_ready" != "true" ]]; then
+            echo "SSH on $VM_IP did not become reachable before the timeout" >&2
+            exit 1
+          fi
+
+          # Runs on the VM: poll every 15 s, for at most 20 minutes, until the
+          # driver is usable.
+          "${ssh_base[@]}" '
+            deadline=$((SECONDS + 1200))
+            until command -v nvidia-smi >/dev/null 2>&1 && nvidia-smi -L; do
+              if [ "$SECONDS" -ge "$deadline" ]; then
+                echo "NVIDIA driver did not become ready before the timeout" >&2
+                exit 1
+              fi
+              sleep 15
+            done
+          '
+
+      # Runs the client side of the tests on the runner, inside an
+      # nvidia/cuda devel container with host networking. The container
+      # installs the build toolchain, creates a venv with torch from
+      # PYTORCH_INDEX_URL, builds the lupine_driver, lupine_nvml and
+      # lupine_driver_server targets (linking against the CUDA stub
+      # libraries), and then runs the two test scripts against the VM.
+      # The SSH directory is mounted read-only at /root/.ssh, and SSH_OPTS
+      # uses StrictHostKeyChecking=yes, so the scripts depend on the
+      # known_hosts entry recorded by the earlier "Wait for SSH" step.
+      - name: Run CUDA samples and PyTorch compliance
+        run: |
+          set -euo pipefail
+          image="nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}"
+
+          docker pull "$image"
+          docker run --rm \
+            --network host \
+            -e SERVER_HOST="$VM_IP" \
+            -e SERVER_USER=gha \
+            -e SERVER_SSH_TARGET="gha@$VM_IP" \
+            -e PYTORCH_INDEX_URL="$PYTORCH_INDEX_URL" \
+            -e CUDA_HOME=/usr/local/cuda \
+            -v "$PWD:/workspace" \
+            -v "$SSH_DIR:/root/.ssh:ro" \
+            -w /workspace \
+            "$image" \
+            bash -lc '
+              set -euo pipefail
+              export DEBIAN_FRONTEND=noninteractive
+              apt-get update
+              apt-get install -y --no-install-recommends \
+                bash \
+                build-essential \
+                ca-certificates \
+                cmake \
+                git \
+                libnghttp2-dev \
+                ninja-build \
+                openssh-client \
+                python3 \
+                python3-pip \
+                python3-venv
+              rm -rf /var/lib/apt/lists/*
+
+              python3 -m venv /workspace/.venv-pytorch312
+              /workspace/.venv-pytorch312/bin/pip install --upgrade pip
+              /workspace/.venv-pytorch312/bin/pip install --index-url "$PYTORCH_INDEX_URL" torch
+
+              cmake -S /workspace -B /workspace/build \
+                -G Ninja \
+                -DCMAKE_BUILD_TYPE=Release \
+                -DCMAKE_LIBRARY_PATH=/usr/local/cuda/lib64/stubs
+              cmake --build /workspace/build --parallel \
+                --target lupine_driver lupine_nvml lupine_driver_server
+              ln -sf libcuda.so.1 /workspace/build/libcuda.so
+              ln -sf libnvidia-ml.so.1 /workspace/build/libnvidia-ml.so
+
+              export SSH_OPTS="-i /root/.ssh/id_ed25519 -o IdentitiesOnly=yes -o StrictHostKeyChecking=yes -o UserKnownHostsFile=/root/.ssh/known_hosts"
+              export SAMPLE_SUITE=compliance
+              export SAMPLE_TIMEOUT=20
+              export TEST_TIMEOUT=90
+
+              /workspace/test/run_cuda_samples.sh
+              /workspace/test/run_pytorch_lupine_tests.sh
+            '
+
+      # Runs even when earlier steps failed, so partial results are kept.
+      # A run that produced no result files is not treated as an error.
+      - name: Upload compliance results
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: gpu-integration-results-${{ github.run_id }}-${{ github.run_attempt }}
+          path: |
+            test/cuda-samples/results/
+            test/pytorch/results/
+          if-no-files-found: ignore
+
+      # Re-runs the same authentication as at the start of the job, regardless
+      # of earlier failures.
+      # NOTE(review): presumably needed because the initial credentials can
+      # expire during a long run - confirm.
+      - name: Refresh Google Cloud credentials for cleanup
+        if: always()
+        uses: google-github-actions/auth@v3
+        with:
+          project_id: ${{ vars.GCP_PROJECT_ID }}
+          workload_identity_provider: ${{ vars.GCP_WORKLOAD_IDENTITY_PROVIDER }}
+          service_account: ${{ vars.GCP_SERVICE_ACCOUNT }}
+
+      # Always attempts to delete the VM and both firewall rules. Every
+      # deletion is tried even if an earlier one fails; each failure is
+      # reported as a warning annotation and makes the step fail, so a leaked
+      # resource is never hidden by a later successful command.
+      - name: Tear down GCE resources
+        if: always()
+        run: |
+          # Keep going after a failed deletion; failures are tracked manually.
+          set +e
+          failed=0
+
+          gcloud compute instances delete "$VM_NAME" \
+            --project="$GCP_PROJECT_ID" \
+            --zone="$GCP_ZONE" \
+            --quiet
+          if [ "$?" -ne 0 ]; then
+            echo "::warning::Could not delete VM $VM_NAME (it may never have been created)"
+            failed=1
+          fi
+
+          gcloud compute firewall-rules delete "$FIREWALL_ALLOW_RULE" \
+            --project="$GCP_PROJECT_ID" \
+            --quiet
+          if [ "$?" -ne 0 ]; then
+            echo "::warning::Could not delete firewall rule $FIREWALL_ALLOW_RULE (it may never have been created)"
+            failed=1
+          fi
+
+          gcloud compute firewall-rules delete "$FIREWALL_DENY_RULE" \
+            --project="$GCP_PROJECT_ID" \
+            --quiet
+          if [ "$?" -ne 0 ]; then
+            echo "::warning::Could not delete firewall rule $FIREWALL_DENY_RULE (it may never have been created)"
+            failed=1
+          fi
+
+          exit "$failed"