Skip to content

Commit af718db

Browse files
authored
Merge branch 'main' into async-launch-pipelining
2 parents fceb04c + 454febf commit af718db

12 files changed

Lines changed: 299 additions & 220 deletions

.github/workflows/gpu-integration-gcloud.yml

Lines changed: 78 additions & 46 deletions
Original file line number | Diff line number | Diff line change
@@ -16,16 +16,17 @@ permissions:
1616

1717
jobs:
1818
gpu-integration:
19-
name: CUDA samples and PyTorch on GCE T4
19+
name: CUDA samples and PyTorch on GCE L4
2020
if: github.event.pull_request.head.repo.full_name == github.repository
2121
runs-on: ubuntu-latest
2222
timeout-minutes: 300
2323

2424
env:
2525
GCP_PROJECT_ID: ${{ vars.GCP_PROJECT_ID }}
2626
GCP_ZONE: us-central1-a
27+
GCP_FALLBACK_ZONES: us-central1-c us-central1-b
2728
GCP_NETWORK: default
28-
MACHINE_TYPE: n1-standard-4
29+
MACHINE_TYPE: g2-standard-4
2930
CUDA_VERSION: 12.9.1
3031
UBUNTU_VERSION: "24.04"
3132
PYTORCH_INDEX_URL: https://download.pytorch.org/whl/cu128
@@ -76,7 +77,8 @@ jobs:
7677
test -n "$GCP_PROJECT_ID"
7778
test -n "${{ vars.GCP_WORKLOAD_IDENTITY_PROVIDER }}"
7879
test -n "${{ vars.GCP_SERVICE_ACCOUNT }}"
79-
test "$MACHINE_TYPE" = n1-standard-4
80+
test "$GCP_FALLBACK_ZONES" = "us-central1-c us-central1-b"
81+
test "$MACHINE_TYPE" = g2-standard-4
8082
test "$VM_IMAGE_PROJECT" = ml-images
8183
test "$VM_IMAGE_FAMILY" = common-cu129-ubuntu-2404-nvidia-580
8284
test "$VM_MAX_RUN_DURATION" = 260m
@@ -171,9 +173,9 @@ jobs:
171173
ln -sf libcuda.so.1 "$PWD/build/libcuda.so"
172174
ln -sf libnvidia-ml.so.1 "$PWD/build/libnvidia-ml.so"
173175
174-
# CUDA_SAMPLES_ARCH=75: only compile device code for the T4 (sm_75)
176+
# CUDA_SAMPLES_ARCH=89: only compile device code for the L4 (sm_89)
175177
# instead of the 15 architectures the samples build by default.
176-
BUILD_ONLY=1 BUILD_SAMPLES=1 SAMPLE_SUITE=extended CUDA_SAMPLES_ARCH=75 \
178+
BUILD_ONLY=1 BUILD_SAMPLES=1 SAMPLE_SUITE=extended CUDA_SAMPLES_ARCH=89 \
177179
CUDA_HOME="$CUDA_HOME" CUDA_LIB_DIR="$CUDA_LIB_DIR" \
178180
"$PWD/test/run_cuda_samples.sh"
179181
@@ -213,7 +215,7 @@ jobs:
213215
tcp:1-65535,udp:1-65535,icmp,esp,ah,sctp \
214216
0.0.0.0/0
215217
216-
- name: Create T4 server VM
218+
- name: Create L4 server VM
217219
run: |
218220
set -euo pipefail
219221
startup_script="$RUNNER_TEMP/gce-startup.sh"
@@ -226,38 +228,60 @@ jobs:
226228
rm -rf /var/lib/apt/lists/*
227229
EOF
228230
229-
create_args=(
230-
"$VM_NAME"
231-
"--project=$GCP_PROJECT_ID"
232-
"--zone=$GCP_ZONE"
233-
"--machine-type=$MACHINE_TYPE"
234-
"--network=$GCP_NETWORK"
235-
"--tags=$VM_TAG"
236-
"--image-family=$VM_IMAGE_FAMILY"
237-
"--image-project=$VM_IMAGE_PROJECT"
238-
"--boot-disk-size=100GB"
239-
"--boot-disk-type=pd-balanced"
240-
"--accelerator=type=nvidia-tesla-t4,count=1"
241-
"--maintenance-policy=TERMINATE"
242-
"--max-run-duration=$VM_MAX_RUN_DURATION"
243-
"--instance-termination-action=DELETE"
244-
"--no-service-account"
245-
"--no-scopes"
246-
"--metadata=block-project-ssh-keys=TRUE,enable-oslogin=FALSE,ssh-keys=gha:$(cat "$SSH_DIR/id_ed25519.pub")"
247-
"--metadata-from-file=startup-script=$startup_script"
248-
"--shielded-vtpm"
249-
"--shielded-integrity-monitoring"
250-
"--no-shielded-secure-boot"
251-
)
231+
# shellcheck disable=SC2206
232+
fallback_zones=($GCP_FALLBACK_ZONES)
233+
candidate_zones=("$GCP_ZONE" "${fallback_zones[@]}")
234+
235+
vm_ip=""
236+
selected_zone=""
237+
for zone in "${candidate_zones[@]}"; do
238+
echo "Creating $MACHINE_TYPE in $zone"
239+
create_args=(
240+
"$VM_NAME"
241+
"--project=$GCP_PROJECT_ID"
242+
"--zone=$zone"
243+
"--machine-type=$MACHINE_TYPE"
244+
"--network=$GCP_NETWORK"
245+
"--tags=$VM_TAG"
246+
"--image-family=$VM_IMAGE_FAMILY"
247+
"--image-project=$VM_IMAGE_PROJECT"
248+
"--boot-disk-size=100GB"
249+
"--boot-disk-type=pd-balanced"
250+
"--maintenance-policy=TERMINATE"
251+
"--max-run-duration=$VM_MAX_RUN_DURATION"
252+
"--instance-termination-action=DELETE"
253+
"--no-service-account"
254+
"--no-scopes"
255+
"--metadata=block-project-ssh-keys=TRUE,enable-oslogin=FALSE,ssh-keys=gha:$(cat "$SSH_DIR/id_ed25519.pub")"
256+
"--metadata-from-file=startup-script=$startup_script"
257+
"--shielded-vtpm"
258+
"--shielded-integrity-monitoring"
259+
"--no-shielded-secure-boot"
260+
)
261+
262+
if [[ "$USE_SPOT" == "true" ]]; then
263+
create_args+=("--provisioning-model=SPOT")
264+
fi
252265
253-
if [[ "$USE_SPOT" == "true" ]]; then
254-
create_args+=("--provisioning-model=SPOT")
255-
fi
266+
if vm_ip="$(gcloud compute instances create "${create_args[@]}" \
267+
--format='value(networkInterfaces[0].accessConfigs[0].natIP)')"; then
268+
selected_zone="$zone"
269+
break
270+
fi
271+
272+
echo "Could not create $MACHINE_TYPE in $zone; trying next candidate zone" >&2
273+
gcloud compute instances delete "$VM_NAME" \
274+
--project="$GCP_PROJECT_ID" \
275+
--zone="$zone" \
276+
--quiet >/dev/null 2>&1 || true
277+
done
256278
257-
vm_ip="$(gcloud compute instances create "${create_args[@]}" \
258-
--format='value(networkInterfaces[0].accessConfigs[0].natIP)')"
259279
test -n "$vm_ip"
260-
echo "VM_IP=$vm_ip" >> "$GITHUB_ENV"
280+
test -n "$selected_zone"
281+
{
282+
echo "VM_IP=$vm_ip"
283+
echo "GCP_ZONE=$selected_zone"
284+
} >> "$GITHUB_ENV"
261285
262286
- name: Wait for SSH and NVIDIA driver
263287
run: |
@@ -290,7 +314,7 @@ jobs:
290314
done
291315
'
292316
293-
- name: Run GitHub runner client against T4 server
317+
- name: Run GitHub runner client against L4 server
294318
run: |
295319
set -euo pipefail
296320
@@ -371,10 +395,16 @@ jobs:
371395
run: |
372396
set -euo pipefail
373397
374-
gcloud compute instances delete "$VM_NAME" \
375-
--project="$GCP_PROJECT_ID" \
376-
--zone="$GCP_ZONE" \
377-
--quiet || true
398+
# shellcheck disable=SC2206
399+
fallback_zones=($GCP_FALLBACK_ZONES)
400+
candidate_zones=("$GCP_ZONE" "${fallback_zones[@]}")
401+
402+
for zone in "${candidate_zones[@]}"; do
403+
gcloud compute instances delete "$VM_NAME" \
404+
--project="$GCP_PROJECT_ID" \
405+
--zone="$zone" \
406+
--quiet || true
407+
done
378408
379409
for rule in "$FIREWALL_ALLOW_RULE" "$FIREWALL_DENY_RULE"; do
380410
gcloud compute firewall-rules delete "$rule" \
@@ -383,12 +413,14 @@ jobs:
383413
done
384414
385415
leftovers=0
386-
if gcloud compute instances describe "$VM_NAME" \
387-
--project="$GCP_PROJECT_ID" \
388-
--zone="$GCP_ZONE" >/dev/null 2>&1; then
389-
echo "leftover VM still exists: $VM_NAME" >&2
390-
leftovers=1
391-
fi
416+
for zone in "${candidate_zones[@]}"; do
417+
if gcloud compute instances describe "$VM_NAME" \
418+
--project="$GCP_PROJECT_ID" \
419+
--zone="$zone" >/dev/null 2>&1; then
420+
echo "leftover VM still exists in $zone: $VM_NAME" >&2
421+
leftovers=1
422+
fi
423+
done
392424
393425
for rule in "$FIREWALL_ALLOW_RULE" "$FIREWALL_DENY_RULE"; do
394426
if gcloud compute firewall-rules describe "$rule" \

0 commit comments

Comments
 (0)