@@ -16,16 +16,17 @@ permissions:
1616
1717jobs :
1818 gpu-integration :
19- name : CUDA samples and PyTorch on GCE T4
19+ name : CUDA samples and PyTorch on GCE L4
2020 if : github.event.pull_request.head.repo.full_name == github.repository
2121 runs-on : ubuntu-latest
2222 timeout-minutes : 300
2323
2424 env :
2525 GCP_PROJECT_ID : ${{ vars.GCP_PROJECT_ID }}
2626 GCP_ZONE : us-central1-a
27+ GCP_FALLBACK_ZONES : us-central1-c us-central1-b
2728 GCP_NETWORK : default
28- MACHINE_TYPE : n1-standard-4
29+ MACHINE_TYPE : g2-standard-4
2930 CUDA_VERSION : 12.9.1
3031 UBUNTU_VERSION : "24.04"
3132 PYTORCH_INDEX_URL : https://download.pytorch.org/whl/cu128
7677 test -n "$GCP_PROJECT_ID"
7778 test -n "${{ vars.GCP_WORKLOAD_IDENTITY_PROVIDER }}"
7879 test -n "${{ vars.GCP_SERVICE_ACCOUNT }}"
79- test "$MACHINE_TYPE" = n1-standard-4
80+ test "$GCP_FALLBACK_ZONES" = "us-central1-c us-central1-b"
81+ test "$MACHINE_TYPE" = g2-standard-4
8082 test "$VM_IMAGE_PROJECT" = ml-images
8183 test "$VM_IMAGE_FAMILY" = common-cu129-ubuntu-2404-nvidia-580
8284 test "$VM_MAX_RUN_DURATION" = 260m
@@ -171,9 +173,9 @@ jobs:
171173 ln -sf libcuda.so.1 "$PWD/build/libcuda.so"
172174 ln -sf libnvidia-ml.so.1 "$PWD/build/libnvidia-ml.so"
173175
174- # CUDA_SAMPLES_ARCH=75: only compile device code for the T4 (sm_75)
176+ # CUDA_SAMPLES_ARCH=89: only compile device code for the L4 (sm_89)
175177 # instead of the 15 architectures the samples build by default.
176- BUILD_ONLY=1 BUILD_SAMPLES=1 SAMPLE_SUITE=extended CUDA_SAMPLES_ARCH=75 \
178+ BUILD_ONLY=1 BUILD_SAMPLES=1 SAMPLE_SUITE=extended CUDA_SAMPLES_ARCH=89 \
177179 CUDA_HOME="$CUDA_HOME" CUDA_LIB_DIR="$CUDA_LIB_DIR" \
178180 "$PWD/test/run_cuda_samples.sh"
179181
@@ -213,7 +215,7 @@ jobs:
213215 tcp:1-65535,udp:1-65535,icmp,esp,ah,sctp \
214216 0.0.0.0/0
215217
216- - name : Create T4 server VM
218+ - name : Create L4 server VM
217219 run : |
218220 set -euo pipefail
219221 startup_script="$RUNNER_TEMP/gce-startup.sh"
@@ -226,38 +228,60 @@ jobs:
226228 rm -rf /var/lib/apt/lists/*
227229 EOF
228230
229- create_args=(
230- "$VM_NAME"
231- "--project=$GCP_PROJECT_ID"
232- "--zone=$GCP_ZONE"
233- "--machine-type=$MACHINE_TYPE"
234- "--network=$GCP_NETWORK"
235- "--tags=$VM_TAG"
236- "--image-family=$VM_IMAGE_FAMILY"
237- "--image-project=$VM_IMAGE_PROJECT"
238- "--boot-disk-size=100GB"
239- "--boot-disk-type=pd-balanced"
240- "--accelerator=type=nvidia-tesla-t4,count=1"
241- "--maintenance-policy=TERMINATE"
242- "--max-run-duration=$VM_MAX_RUN_DURATION"
243- "--instance-termination-action=DELETE"
244- "--no-service-account"
245- "--no-scopes"
246- "--metadata=block-project-ssh-keys=TRUE,enable-oslogin=FALSE,ssh-keys=gha:$(cat "$SSH_DIR/id_ed25519.pub")"
247- "--metadata-from-file=startup-script=$startup_script"
248- "--shielded-vtpm"
249- "--shielded-integrity-monitoring"
250- "--no-shielded-secure-boot"
251- )
231+ # shellcheck disable=SC2206
232+ fallback_zones=($GCP_FALLBACK_ZONES)
233+ candidate_zones=("$GCP_ZONE" "${fallback_zones[@]}")
234+
235+ vm_ip=""
236+ selected_zone=""
237+ for zone in "${candidate_zones[@]}"; do
238+ echo "Creating $MACHINE_TYPE in $zone"
239+ create_args=(
240+ "$VM_NAME"
241+ "--project=$GCP_PROJECT_ID"
242+ "--zone=$zone"
243+ "--machine-type=$MACHINE_TYPE"
244+ "--network=$GCP_NETWORK"
245+ "--tags=$VM_TAG"
246+ "--image-family=$VM_IMAGE_FAMILY"
247+ "--image-project=$VM_IMAGE_PROJECT"
248+ "--boot-disk-size=100GB"
249+ "--boot-disk-type=pd-balanced"
250+ "--maintenance-policy=TERMINATE"
251+ "--max-run-duration=$VM_MAX_RUN_DURATION"
252+ "--instance-termination-action=DELETE"
253+ "--no-service-account"
254+ "--no-scopes"
255+ "--metadata=block-project-ssh-keys=TRUE,enable-oslogin=FALSE,ssh-keys=gha:$(cat "$SSH_DIR/id_ed25519.pub")"
256+ "--metadata-from-file=startup-script=$startup_script"
257+ "--shielded-vtpm"
258+ "--shielded-integrity-monitoring"
259+ "--no-shielded-secure-boot"
260+ )
261+
262+ if [[ "$USE_SPOT" == "true" ]]; then
263+ create_args+=("--provisioning-model=SPOT")
264+ fi
252265
253- if [[ "$USE_SPOT" == "true" ]]; then
254- create_args+=("--provisioning-model=SPOT")
255- fi
266+ if vm_ip="$(gcloud compute instances create "${create_args[@]}" \
267+ --format='value(networkInterfaces[0].accessConfigs[0].natIP)')"; then
268+ selected_zone="$zone"
269+ break
270+ fi
271+
272+ echo "Could not create $MACHINE_TYPE in $zone; trying next candidate zone" >&2
273+ gcloud compute instances delete "$VM_NAME" \
274+ --project="$GCP_PROJECT_ID" \
275+ --zone="$zone" \
276+ --quiet >/dev/null 2>&1 || true
277+ done
256278
257- vm_ip="$(gcloud compute instances create "${create_args[@]}" \
258- --format='value(networkInterfaces[0].accessConfigs[0].natIP)')"
259279 test -n "$vm_ip"
260- echo "VM_IP=$vm_ip" >> "$GITHUB_ENV"
280+ test -n "$selected_zone"
281+ {
282+ echo "VM_IP=$vm_ip"
283+ echo "GCP_ZONE=$selected_zone"
284+ } >> "$GITHUB_ENV"
261285
262286 - name : Wait for SSH and NVIDIA driver
263287 run : |
@@ -290,7 +314,7 @@ jobs:
290314 done
291315 '
292316
293- - name : Run GitHub runner client against T4 server
317+ - name : Run GitHub runner client against L4 server
294318 run : |
295319 set -euo pipefail
296320
@@ -371,10 +395,16 @@ jobs:
371395 run : |
372396 set -euo pipefail
373397
374- gcloud compute instances delete "$VM_NAME" \
375- --project="$GCP_PROJECT_ID" \
376- --zone="$GCP_ZONE" \
377- --quiet || true
398+ # shellcheck disable=SC2206
399+ fallback_zones=($GCP_FALLBACK_ZONES)
400+ candidate_zones=("$GCP_ZONE" "${fallback_zones[@]}")
401+
402+ for zone in "${candidate_zones[@]}"; do
403+ gcloud compute instances delete "$VM_NAME" \
404+ --project="$GCP_PROJECT_ID" \
405+ --zone="$zone" \
406+ --quiet || true
407+ done
378408
379409 for rule in "$FIREWALL_ALLOW_RULE" "$FIREWALL_DENY_RULE"; do
380410 gcloud compute firewall-rules delete "$rule" \
@@ -383,12 +413,14 @@ jobs:
383413 done
384414
385415 leftovers=0
386- if gcloud compute instances describe "$VM_NAME" \
387- --project="$GCP_PROJECT_ID" \
388- --zone="$GCP_ZONE" >/dev/null 2>&1; then
389- echo "leftover VM still exists: $VM_NAME" >&2
390- leftovers=1
391- fi
416+ for zone in "${candidate_zones[@]}"; do
417+ if gcloud compute instances describe "$VM_NAME" \
418+ --project="$GCP_PROJECT_ID" \
419+ --zone="$zone" >/dev/null 2>&1; then
420+ echo "leftover VM still exists in $zone: $VM_NAME" >&2
421+ leftovers=1
422+ fi
423+ done
392424
393425 for rule in "$FIREWALL_ALLOW_RULE" "$FIREWALL_DENY_RULE"; do
394426 if gcloud compute firewall-rules describe "$rule" \
0 commit comments