name: GPU Integration on GCE

# Provisions a throwaway NVIDIA T4 VM on Google Compute Engine, locks it down
# to the runner's public IP, runs the CUDA sample and PyTorch compliance suites
# from a CUDA client container on the runner, then tears everything down.
#
# Expected repository variables:
#   GCP_PROJECT_ID, GCP_WORKLOAD_IDENTITY_PROVIDER, GCP_SERVICE_ACCOUNT
#
# Keep GCP_SERVICE_ACCOUNT on a custom role rather than project Owner/Editor.
# It needs only enough Compute permissions to create/delete this one VM shape,
# create/delete the temporary firewall rules, and read instance/network metadata.

on:
  pull_request:
  workflow_dispatch:
    inputs:
      zone:
        description: GCE zone with NVIDIA T4 quota.
        required: true
        default: us-central1-a
      machine_type:
        description: GCE machine type for one NVIDIA T4.
        required: true
        default: n1-standard-4
      network:
        description: GCE VPC network name.
        required: true
        default: default
      cuda_version:
        description: CUDA version for the client test container.
        required: true
        default: "13.1.0"
      ubuntu_version:
        description: Ubuntu version for the client test container and VM image.
        required: true
        default: "24.04"
      pytorch_index_url:
        description: PyTorch wheel index URL.
        required: true
        default: https://download.pytorch.org/whl/cu130
      spot:
        description: Use a Spot VM.
        required: true
        type: boolean
        default: true

permissions:
  contents: read
  # Required so the auth action can request an OIDC token for Workload
  # Identity Federation.
  id-token: write

# All runs share one group and are never cancelled mid-flight, so a run that
# has created cloud resources always reaches its teardown step.
concurrency:
  group: gpu-integration-gce
  cancel-in-progress: false

jobs:
  gpu-integration:
    name: CUDA samples and PyTorch on GCE T4
    runs-on: ubuntu-latest
    timeout-minutes: 360

    env:
      GCP_PROJECT_ID: ${{ vars.GCP_PROJECT_ID }}
      # The `inputs` context is only populated for workflow_dispatch runs. On
      # pull_request every input evaluates to an empty string, so each value
      # falls back to the default declared for the dispatch form above.
      GCP_ZONE: ${{ inputs.zone || 'us-central1-a' }}
      GCP_NETWORK: ${{ inputs.network || 'default' }}
      MACHINE_TYPE: ${{ inputs.machine_type || 'n1-standard-4' }}
      CUDA_VERSION: ${{ inputs.cuda_version || '13.1.0' }}
      UBUNTU_VERSION: ${{ inputs.ubuntu_version || '24.04' }}
      PYTORCH_INDEX_URL: ${{ inputs.pytorch_index_url || 'https://download.pytorch.org/whl/cu130' }}
      # `inputs.spot || true` would override an explicit `false`, so default to
      # Spot only when the run was not started through workflow_dispatch.
      USE_SPOT: ${{ github.event_name != 'workflow_dispatch' || inputs.spot }}
      # Resource names are unique per run and attempt so reruns never collide.
      VM_NAME: lupine-gpu-ci-${{ github.run_id }}-${{ github.run_attempt }}
      VM_TAG: lupine-gpu-ci-${{ github.run_id }}-${{ github.run_attempt }}
      FIREWALL_ALLOW_RULE: lupine-gpu-ci-${{ github.run_id }}-${{ github.run_attempt }}-allow
      FIREWALL_DENY_RULE: lupine-gpu-ci-${{ github.run_id }}-${{ github.run_attempt }}-deny

    steps:
      - name: Check out repository
        uses: actions/checkout@v4

      # Runs before authentication so a missing repository variable produces a
      # clear message instead of an opaque failure inside the auth action.
      # Also maps the requested Ubuntu version to a GCE image family and
      # exports it as VM_IMAGE_FAMILY for the VM creation step.
      - name: Validate Google Cloud configuration
        env:
          GCP_WORKLOAD_IDENTITY_PROVIDER: ${{ vars.GCP_WORKLOAD_IDENTITY_PROVIDER }}
          GCP_SERVICE_ACCOUNT: ${{ vars.GCP_SERVICE_ACCOUNT }}
        run: |
          set -euo pipefail
          : "${GCP_PROJECT_ID:?repository variable GCP_PROJECT_ID is not set}"
          : "${GCP_WORKLOAD_IDENTITY_PROVIDER:?repository variable GCP_WORKLOAD_IDENTITY_PROVIDER is not set}"
          : "${GCP_SERVICE_ACCOUNT:?repository variable GCP_SERVICE_ACCOUNT is not set}"
          case "$UBUNTU_VERSION" in
            24.04)
              echo "VM_IMAGE_FAMILY=ubuntu-2404-lts-amd64" >> "$GITHUB_ENV"
              ;;
            22.04)
              echo "VM_IMAGE_FAMILY=ubuntu-2204-lts" >> "$GITHUB_ENV"
              ;;
            *)
              echo "Unsupported Ubuntu version for the GCE VM image: $UBUNTU_VERSION" >&2
              exit 1
              ;;
          esac

      - name: Authenticate to Google Cloud
        uses: google-github-actions/auth@v3
        with:
          project_id: ${{ vars.GCP_PROJECT_ID }}
          workload_identity_provider: ${{ vars.GCP_WORKLOAD_IDENTITY_PROVIDER }}
          service_account: ${{ vars.GCP_SERVICE_ACCOUNT }}

      - name: Set up gcloud
        uses: google-github-actions/setup-gcloud@v3

      # Generates a one-off ed25519 key pair for this run and records the
      # runner's public IP. Both are exported (SSH_DIR, RUNNER_IP) for the
      # firewall, VM creation, and SSH steps below.
      - name: Prepare SSH key and runner allowlist
        run: |
          set -euo pipefail
          ssh_dir="$RUNNER_TEMP/gce-ssh"
          mkdir -p "$ssh_dir"
          ssh-keygen -t ed25519 -N '' -C "gha-${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}" -f "$ssh_dir/id_ed25519"
          chmod 700 "$ssh_dir"
          chmod 600 "$ssh_dir/id_ed25519"

          runner_ip="$(curl -fsS https://api.ipify.org)"
          test -n "$runner_ip"

          {
            echo "SSH_DIR=$ssh_dir"
            echo "RUNNER_IP=$runner_ip"
          } >> "$GITHUB_ENV"

      # Two rules scoped to this run's VM tag: an allow rule (priority 800) for
      # SSH and the test port ranges from the runner's IP only, and a
      # lower-precedence deny rule (priority 900) for all other ingress.
      - name: Create locked-down firewall rules
        run: |
          set -euo pipefail
          gcloud compute firewall-rules create "$FIREWALL_ALLOW_RULE" \
            --project="$GCP_PROJECT_ID" \
            --network="$GCP_NETWORK" \
            --direction=INGRESS \
            --priority=800 \
            --action=ALLOW \
            --rules=tcp:22,tcp:14900-16999,tcp:20100-20299 \
            --source-ranges="$RUNNER_IP/32" \
            --target-tags="$VM_TAG"

          gcloud compute firewall-rules create "$FIREWALL_DENY_RULE" \
            --project="$GCP_PROJECT_ID" \
            --network="$GCP_NETWORK" \
            --direction=INGRESS \
            --priority=900 \
            --action=DENY \
            --rules=tcp:1-65535,udp:1-65535,icmp,esp,ah,sctp \
            --source-ranges=0.0.0.0/0 \
            --target-tags="$VM_TAG"

      # Creates the VM with no attached service account, only this run's SSH
      # key, and a 6h self-delete limit, then exports its external address as
      # VM_IP.
      #
      # NOTE(review): `install-nvidia-driver=True` is, as far as I know, only
      # acted on by Deep Learning VM images. Confirm that the stock
      # ubuntu-os-cloud image really ends up with a driver, otherwise the
      # wait step below will run into its 20 minute timeout.
      - name: Create T4 VM
        run: |
          set -euo pipefail
          startup_script="$RUNNER_TEMP/gce-startup.sh"
          cat > "$startup_script" <<'EOF'
          #!/usr/bin/env bash
          set -euxo pipefail
          export DEBIAN_FRONTEND=noninteractive
          apt-get update
          apt-get install -y --no-install-recommends ca-certificates libnghttp2-14
          rm -rf /var/lib/apt/lists/*
          EOF

          create_args=(
            "$VM_NAME"
            "--project=$GCP_PROJECT_ID"
            "--zone=$GCP_ZONE"
            "--machine-type=$MACHINE_TYPE"
            "--network=$GCP_NETWORK"
            "--tags=$VM_TAG"
            "--image-family=$VM_IMAGE_FAMILY"
            "--image-project=ubuntu-os-cloud"
            "--boot-disk-size=100GB"
            "--boot-disk-type=pd-balanced"
            "--accelerator=type=nvidia-tesla-t4,count=1"
            "--maintenance-policy=TERMINATE"
            "--max-run-duration=6h"
            "--instance-termination-action=DELETE"
            "--no-service-account"
            "--metadata=block-project-ssh-keys=TRUE,enable-oslogin=FALSE,install-nvidia-driver=True,ssh-keys=gha:$(cat "$SSH_DIR/id_ed25519.pub")"
            "--metadata-from-file=startup-script=$startup_script"
            "--shielded-vtpm"
            "--shielded-integrity-monitoring"
            "--no-shielded-secure-boot"
          )

          if [[ "$USE_SPOT" == "true" ]]; then
            create_args+=("--provisioning-model=SPOT")
          fi

          gcloud compute instances create "${create_args[@]}"

          vm_ip="$(gcloud compute instances describe "$VM_NAME" \
            --project="$GCP_PROJECT_ID" \
            --zone="$GCP_ZONE" \
            --format='value(networkInterfaces[0].accessConfigs[0].natIP)')"
          test -n "$vm_ip"
          echo "VM_IP=$vm_ip" >> "$GITHUB_ENV"

      # Polls SSH (120 attempts, 5s apart, 10s connect timeout each), then
      # waits up to 20 minutes on the VM for `nvidia-smi -L` to succeed.
      # The first successful connection records the host key in
      # $SSH_DIR/known_hosts, which the test container later verifies strictly.
      - name: Wait for SSH and NVIDIA driver
        run: |
          set -euo pipefail
          ssh_base=(
            ssh
            -i "$SSH_DIR/id_ed25519"
            -o IdentitiesOnly=yes
            -o StrictHostKeyChecking=accept-new
            -o UserKnownHostsFile="$SSH_DIR/known_hosts"
            -o ConnectTimeout=10
            "gha@$VM_IP"
          )

          ssh_ready=false
          for _ in $(seq 1 120); do
            if "${ssh_base[@]}" 'echo ready' >/dev/null 2>&1; then
              ssh_ready=true
              break
            fi
            sleep 5
          done

          # Fail with an explicit message rather than falling through to the
          # driver check when the VM never accepted a connection.
          if [[ "$ssh_ready" != "true" ]]; then
            echo "SSH did not become reachable on $VM_IP before the timeout" >&2
            exit 1
          fi

          "${ssh_base[@]}" '
            deadline=$((SECONDS + 1200))
            until command -v nvidia-smi >/dev/null 2>&1 && nvidia-smi -L; do
              if [ "$SECONDS" -ge "$deadline" ]; then
                echo "NVIDIA driver did not become ready before the timeout" >&2
                exit 1
              fi
              sleep 15
            done
          '

      # Builds the project inside a CUDA devel container on the runner and runs
      # both test scripts against the VM. The run's SSH directory is mounted
      # read-only so the container reuses the key and the recorded host key.
      - name: Run CUDA samples and PyTorch compliance
        run: |
          set -euo pipefail
          image="nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}"

          docker pull "$image"
          docker run --rm \
            --network host \
            -e SERVER_HOST="$VM_IP" \
            -e SERVER_USER=gha \
            -e SERVER_SSH_TARGET="gha@$VM_IP" \
            -e PYTORCH_INDEX_URL="$PYTORCH_INDEX_URL" \
            -e CUDA_HOME=/usr/local/cuda \
            -v "$PWD:/workspace" \
            -v "$SSH_DIR:/root/.ssh:ro" \
            -w /workspace \
            "$image" \
            bash -lc '
              set -euo pipefail
              export DEBIAN_FRONTEND=noninteractive
              apt-get update
              apt-get install -y --no-install-recommends \
                bash \
                build-essential \
                ca-certificates \
                cmake \
                git \
                libnghttp2-dev \
                ninja-build \
                openssh-client \
                python3 \
                python3-pip \
                python3-venv
              rm -rf /var/lib/apt/lists/*

              python3 -m venv /workspace/.venv-pytorch312
              /workspace/.venv-pytorch312/bin/pip install --upgrade pip
              /workspace/.venv-pytorch312/bin/pip install --index-url "$PYTORCH_INDEX_URL" torch

              cmake -S /workspace -B /workspace/build \
                -G Ninja \
                -DCMAKE_BUILD_TYPE=Release \
                -DCMAKE_LIBRARY_PATH=/usr/local/cuda/lib64/stubs
              cmake --build /workspace/build --parallel \
                --target lupine_driver lupine_nvml lupine_driver_server
              ln -sf libcuda.so.1 /workspace/build/libcuda.so
              ln -sf libnvidia-ml.so.1 /workspace/build/libnvidia-ml.so

              export SSH_OPTS="-i /root/.ssh/id_ed25519 -o IdentitiesOnly=yes -o StrictHostKeyChecking=yes -o UserKnownHostsFile=/root/.ssh/known_hosts"
              export SAMPLE_SUITE=compliance
              export SAMPLE_TIMEOUT=20
              export TEST_TIMEOUT=90

              /workspace/test/run_cuda_samples.sh
              /workspace/test/run_pytorch_lupine_tests.sh
            '

      - name: Upload compliance results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: gpu-integration-results-${{ github.run_id }}-${{ github.run_attempt }}
          path: |
            test/cuda-samples/results/
            test/pytorch/results/
          if-no-files-found: ignore

      # Re-authenticates before cleanup; runs regardless of earlier failures.
      - name: Refresh Google Cloud credentials for cleanup
        if: always()
        uses: google-github-actions/auth@v3
        with:
          project_id: ${{ vars.GCP_PROJECT_ID }}
          workload_identity_provider: ${{ vars.GCP_WORKLOAD_IDENTITY_PROVIDER }}
          service_account: ${{ vars.GCP_SERVICE_ACCOUNT }}

      # Best-effort cleanup: `set +e` lets every delete run even when an
      # earlier one fails (for example because the resource was never created).
      - name: Tear down GCE resources
        if: always()
        run: |
          set +e
          gcloud compute instances delete "$VM_NAME" \
            --project="$GCP_PROJECT_ID" \
            --zone="$GCP_ZONE" \
            --quiet
          gcloud compute firewall-rules delete "$FIREWALL_ALLOW_RULE" \
            --project="$GCP_PROJECT_ID" \
            --quiet
          gcloud compute firewall-rules delete "$FIREWALL_DENY_RULE" \
            --project="$GCP_PROJECT_ID" \
            --quiet