Skip to content

Commit 615e553

Browse files
committed
Add GCE GPU integration workflow
1 parent 7a62e94 commit 615e553

1 file changed

Lines changed: 313 additions & 0 deletions

File tree

Lines changed: 313 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,313 @@
name: GPU Integration on GCE

# Repository variables this workflow reads:
#   GCP_PROJECT_ID, GCP_WORKLOAD_IDENTITY_PROVIDER, GCP_SERVICE_ACCOUNT
#
# Bind GCP_SERVICE_ACCOUNT to a custom role rather than project Owner/Editor.
# The role only needs enough Compute permissions to create and delete this one
# VM shape, create and delete the temporary firewall rules, and read
# instance/network metadata.

on:
  pull_request:
  # Manual runs may override any of the inputs below. Runs triggered by
  # pull_request receive no inputs: the `inputs` context is empty for that
  # event.
  workflow_dispatch:
    inputs:
      zone:
        description: GCE zone with NVIDIA T4 quota.
        type: string
        required: true
        default: us-central1-a
      machine_type:
        description: GCE machine type for one NVIDIA T4.
        type: string
        required: true
        default: n1-standard-4
      network:
        description: GCE VPC network name.
        type: string
        required: true
        default: default
      cuda_version:
        description: CUDA version for the client test container.
        type: string
        required: true
        default: "13.1.0"
      ubuntu_version:
        description: Ubuntu version for the client test container and VM image.
        type: string
        required: true
        default: "24.04"
      pytorch_index_url:
        description: PyTorch wheel index URL.
        type: string
        required: true
        default: https://download.pytorch.org/whl/cu130
      spot:
        description: Use a Spot VM.
        type: boolean
        required: true
        default: true

permissions:
  contents: read
  # Lets the job request the OIDC token that the Google Cloud auth step
  # exchanges through the configured workload identity provider.
  id-token: write

# Every run shares one concurrency group, and a newer run never cancels one
# that is already in progress.
concurrency:
  group: gpu-integration-gce
  cancel-in-progress: false

jobs:
  # Provisions a single-T4 GCE VM that only this runner can reach, runs the
  # CUDA sample and PyTorch suites against it from a CUDA client container on
  # the runner, uploads the results, and tears the cloud resources down again.
  gpu-integration:
    name: CUDA samples and PyTorch on GCE T4
    # Pull requests from forks are not granted `id-token: write`, so the
    # Workload Identity authentication below cannot succeed for them. Skip
    # those runs instead of reporting a guaranteed failure.
    if: >-
      github.event_name != 'pull_request' ||
      github.event.pull_request.head.repo.full_name == github.repository
    runs-on: ubuntu-latest
    timeout-minutes: 360

    env:
      GCP_PROJECT_ID: ${{ vars.GCP_PROJECT_ID }}
      # The `inputs` context is only populated for workflow_dispatch. The
      # fallbacks below mirror the workflow_dispatch input defaults so that
      # pull_request runs get the same configuration; keep the two in sync.
      GCP_ZONE: ${{ inputs.zone || 'us-central1-a' }}
      GCP_NETWORK: ${{ inputs.network || 'default' }}
      MACHINE_TYPE: ${{ inputs.machine_type || 'n1-standard-4' }}
      CUDA_VERSION: ${{ inputs.cuda_version || '13.1.0' }}
      UBUNTU_VERSION: ${{ inputs.ubuntu_version || '24.04' }}
      PYTORCH_INDEX_URL: >-
        ${{ inputs.pytorch_index_url || 'https://download.pytorch.org/whl/cu130' }}
      # `inputs.spot || true` would always be true, so branch on the event:
      # non-dispatch runs use Spot, dispatch runs honour the checkbox.
      USE_SPOT: ${{ github.event_name != 'workflow_dispatch' || inputs.spot }}
      # Resource names are unique per run attempt so reruns never collide.
      VM_NAME: lupine-gpu-ci-${{ github.run_id }}-${{ github.run_attempt }}
      VM_TAG: lupine-gpu-ci-${{ github.run_id }}-${{ github.run_attempt }}
      FIREWALL_ALLOW_RULE: lupine-gpu-ci-${{ github.run_id }}-${{ github.run_attempt }}-allow
      FIREWALL_DENY_RULE: lupine-gpu-ci-${{ github.run_id }}-${{ github.run_attempt }}-deny

    steps:
      - name: Check out repository
        uses: actions/checkout@v4

      # Runs before authentication so that a missing repository variable is
      # reported by name instead of surfacing as an opaque auth failure.
      - name: Validate Google Cloud configuration
        env:
          # Passed through the environment rather than interpolated into the
          # script text, so the values are never parsed as shell code.
          GCP_WORKLOAD_IDENTITY_PROVIDER: ${{ vars.GCP_WORKLOAD_IDENTITY_PROVIDER }}
          GCP_SERVICE_ACCOUNT: ${{ vars.GCP_SERVICE_ACCOUNT }}
        run: |
          set -euo pipefail
          for required in GCP_PROJECT_ID GCP_WORKLOAD_IDENTITY_PROVIDER GCP_SERVICE_ACCOUNT; do
            if [[ -z "${!required:-}" ]]; then
              echo "Repository variable $required is not set" >&2
              exit 1
            fi
          done

          # Map the requested Ubuntu release to the matching VM image family
          # and export it for the VM creation step.
          case "$UBUNTU_VERSION" in
            24.04)
              echo "VM_IMAGE_FAMILY=ubuntu-2404-lts-amd64" >> "$GITHUB_ENV"
              ;;
            22.04)
              echo "VM_IMAGE_FAMILY=ubuntu-2204-lts" >> "$GITHUB_ENV"
              ;;
            *)
              echo "Unsupported Ubuntu version for the GCE VM image: $UBUNTU_VERSION" >&2
              exit 1
              ;;
          esac

      - name: Authenticate to Google Cloud
        uses: google-github-actions/auth@v3
        with:
          project_id: ${{ vars.GCP_PROJECT_ID }}
          workload_identity_provider: ${{ vars.GCP_WORKLOAD_IDENTITY_PROVIDER }}
          service_account: ${{ vars.GCP_SERVICE_ACCOUNT }}

      - name: Set up gcloud
        uses: google-github-actions/setup-gcloud@v3

      # Generates a throwaway SSH key pair for this run and looks up the
      # runner's public IP, which becomes the only allowed firewall source.
      - name: Prepare SSH key and runner allowlist
        run: |
          set -euo pipefail
          ssh_dir="$RUNNER_TEMP/gce-ssh"
          mkdir -p "$ssh_dir"
          # Lock the directory down before the private key is written into it.
          chmod 700 "$ssh_dir"
          ssh-keygen -t ed25519 -N '' -C "gha-${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}" -f "$ssh_dir/id_ed25519"
          chmod 600 "$ssh_dir/id_ed25519"

          runner_ip="$(curl -fsS --retry 3 --max-time 10 https://api.ipify.org)"
          # The value is used verbatim as a firewall source range, so refuse
          # anything that is not shaped like an IPv4 address.
          ipv4_re='^[0-9]{1,3}(\.[0-9]{1,3}){3}$'
          if [[ ! "$runner_ip" =~ $ipv4_re ]]; then
            echo "Could not determine the runner's public IPv4 address (got: '$runner_ip')" >&2
            exit 1
          fi

          {
            echo "SSH_DIR=$ssh_dir"
            echo "RUNNER_IP=$runner_ip"
          } >> "$GITHUB_ENV"

      # Two rules scoped to this run's network tag: the allow rule (priority
      # 800) is evaluated before the deny rule (priority 900), so only the
      # runner's address reaches the listed ports and all other ingress to the
      # tagged VM is rejected.
      - name: Create locked-down firewall rules
        run: |
          set -euo pipefail
          gcloud compute firewall-rules create "$FIREWALL_ALLOW_RULE" \
            --project="$GCP_PROJECT_ID" \
            --network="$GCP_NETWORK" \
            --direction=INGRESS \
            --priority=800 \
            --action=ALLOW \
            --rules=tcp:22,tcp:14900-16999,tcp:20100-20299 \
            --source-ranges="$RUNNER_IP/32" \
            --target-tags="$VM_TAG"

          gcloud compute firewall-rules create "$FIREWALL_DENY_RULE" \
            --project="$GCP_PROJECT_ID" \
            --network="$GCP_NETWORK" \
            --direction=INGRESS \
            --priority=900 \
            --action=DENY \
            --rules=tcp:1-65535,udp:1-65535,icmp,esp,ah,sctp \
            --source-ranges=0.0.0.0/0 \
            --target-tags="$VM_TAG"

      - name: Create T4 VM
        run: |
          set -euo pipefail
          startup_script="$RUNNER_TEMP/gce-startup.sh"
          cat > "$startup_script" <<'EOF'
          #!/usr/bin/env bash
          set -euxo pipefail
          export DEBIAN_FRONTEND=noninteractive
          apt-get update
          apt-get install -y --no-install-recommends ca-certificates libnghttp2-14
          rm -rf /var/lib/apt/lists/*
          EOF

          # NOTE(review): `install-nvidia-driver=True` is a metadata key
          # documented for Google's Deep Learning VM images, and the startup
          # script above installs no driver. Confirm the stock ubuntu-os-cloud
          # image used here really ends up with nvidia-smi, otherwise the
          # driver wait in the next step can only time out.
          #
          # The VM gets no service account, ignores project-wide SSH keys and
          # OS Login, and accepts only this run's generated key for user `gha`.
          # --max-run-duration plus --instance-termination-action=DELETE make
          # GCE remove the VM after 6h even if the teardown step never runs.
          create_args=(
            "$VM_NAME"
            "--project=$GCP_PROJECT_ID"
            "--zone=$GCP_ZONE"
            "--machine-type=$MACHINE_TYPE"
            "--network=$GCP_NETWORK"
            "--tags=$VM_TAG"
            "--image-family=$VM_IMAGE_FAMILY"
            "--image-project=ubuntu-os-cloud"
            "--boot-disk-size=100GB"
            "--boot-disk-type=pd-balanced"
            "--accelerator=type=nvidia-tesla-t4,count=1"
            "--maintenance-policy=TERMINATE"
            "--max-run-duration=6h"
            "--instance-termination-action=DELETE"
            "--no-service-account"
            "--metadata=block-project-ssh-keys=TRUE,enable-oslogin=FALSE,install-nvidia-driver=True,ssh-keys=gha:$(cat "$SSH_DIR/id_ed25519.pub")"
            "--metadata-from-file=startup-script=$startup_script"
            "--shielded-vtpm"
            "--shielded-integrity-monitoring"
            "--no-shielded-secure-boot"
          )

          if [[ "$USE_SPOT" == "true" ]]; then
            create_args+=("--provisioning-model=SPOT")
          fi

          gcloud compute instances create "${create_args[@]}"

          # Export the VM's external address for the SSH and test steps.
          vm_ip="$(gcloud compute instances describe "$VM_NAME" \
            --project="$GCP_PROJECT_ID" \
            --zone="$GCP_ZONE" \
            --format='value(networkInterfaces[0].accessConfigs[0].natIP)')"
          test -n "$vm_ip"
          echo "VM_IP=$vm_ip" >> "$GITHUB_ENV"

      - name: Wait for SSH and NVIDIA driver
        run: |
          set -euo pipefail
          # accept-new records the VM's host key in $SSH_DIR/known_hosts on
          # the first successful connection; the test container later reuses
          # that file with strict host key checking.
          ssh_base=(
            ssh
            -i "$SSH_DIR/id_ed25519"
            -o IdentitiesOnly=yes
            -o StrictHostKeyChecking=accept-new
            -o UserKnownHostsFile="$SSH_DIR/known_hosts"
            -o ConnectTimeout=10
            "gha@$VM_IP"
          )

          # Poll for up to 120 attempts, 5 seconds apart.
          ssh_ready=false
          for _ in $(seq 1 120); do
            if "${ssh_base[@]}" 'echo ready' >/dev/null 2>&1; then
              ssh_ready=true
              break
            fi
            sleep 5
          done
          # Fail with a clear message instead of falling through to the
          # driver check when the VM never became reachable.
          if [[ "$ssh_ready" != "true" ]]; then
            echo "SSH on $VM_IP did not become reachable before the timeout" >&2
            exit 1
          fi

          # On the VM: wait up to 20 minutes for nvidia-smi to list a GPU.
          "${ssh_base[@]}" '
            deadline=$((SECONDS + 1200))
            until command -v nvidia-smi >/dev/null 2>&1 && nvidia-smi -L; do
              if [ "$SECONDS" -ge "$deadline" ]; then
                echo "NVIDIA driver did not become ready before the timeout" >&2
                exit 1
              fi
              sleep 15
            done
          '

      # Builds the lupine targets inside a CUDA devel container on the runner
      # and runs both test scripts with the VM's address and SSH settings
      # passed through the environment.
      - name: Run CUDA samples and PyTorch compliance
        run: |
          set -euo pipefail
          image="nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}"

          docker pull "$image"
          docker run --rm \
            --network host \
            -e SERVER_HOST="$VM_IP" \
            -e SERVER_USER=gha \
            -e SERVER_SSH_TARGET="gha@$VM_IP" \
            -e PYTORCH_INDEX_URL="$PYTORCH_INDEX_URL" \
            -e CUDA_HOME=/usr/local/cuda \
            -v "$PWD:/workspace" \
            -v "$SSH_DIR:/root/.ssh:ro" \
            -w /workspace \
            "$image" \
            bash -lc '
              set -euo pipefail
              export DEBIAN_FRONTEND=noninteractive
              apt-get update
              apt-get install -y --no-install-recommends \
                bash \
                build-essential \
                ca-certificates \
                cmake \
                git \
                libnghttp2-dev \
                ninja-build \
                openssh-client \
                python3 \
                python3-pip \
                python3-venv
              rm -rf /var/lib/apt/lists/*

              python3 -m venv /workspace/.venv-pytorch312
              /workspace/.venv-pytorch312/bin/pip install --upgrade pip
              /workspace/.venv-pytorch312/bin/pip install --index-url "$PYTORCH_INDEX_URL" torch

              cmake -S /workspace -B /workspace/build \
                -G Ninja \
                -DCMAKE_BUILD_TYPE=Release \
                -DCMAKE_LIBRARY_PATH=/usr/local/cuda/lib64/stubs
              cmake --build /workspace/build --parallel \
                --target lupine_driver lupine_nvml lupine_driver_server
              ln -sf libcuda.so.1 /workspace/build/libcuda.so
              ln -sf libnvidia-ml.so.1 /workspace/build/libnvidia-ml.so

              export SSH_OPTS="-i /root/.ssh/id_ed25519 -o IdentitiesOnly=yes -o StrictHostKeyChecking=yes -o UserKnownHostsFile=/root/.ssh/known_hosts"
              export SAMPLE_SUITE=compliance
              export SAMPLE_TIMEOUT=20
              export TEST_TIMEOUT=90

              /workspace/test/run_cuda_samples.sh
              /workspace/test/run_pytorch_lupine_tests.sh
            '

      - name: Upload compliance results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: gpu-integration-results-${{ github.run_id }}-${{ github.run_attempt }}
          path: |
            test/cuda-samples/results/
            test/pytorch/results/
          if-no-files-found: ignore

      # Authenticates again right before cleanup, also when earlier steps
      # failed, so the deletions below do not depend on the first login.
      - name: Refresh Google Cloud credentials for cleanup
        if: always()
        uses: google-github-actions/auth@v3
        with:
          project_id: ${{ vars.GCP_PROJECT_ID }}
          workload_identity_provider: ${{ vars.GCP_WORKLOAD_IDENTITY_PROVIDER }}
          service_account: ${{ vars.GCP_SERVICE_ACCOUNT }}

      - name: Tear down GCE resources
        if: always()
        run: |
          # No `set -e`: every deletion is attempted even if an earlier one
          # fails. Failures are tracked so that a leftover VM or firewall
          # rule is reported instead of being masked by the exit status of
          # whichever command happens to run last.
          set -uo pipefail
          cleanup_failed=0

          gcloud compute instances delete "$VM_NAME" \
            --project="$GCP_PROJECT_ID" \
            --zone="$GCP_ZONE" \
            --quiet || cleanup_failed=1
          gcloud compute firewall-rules delete "$FIREWALL_ALLOW_RULE" \
            --project="$GCP_PROJECT_ID" \
            --quiet || cleanup_failed=1
          gcloud compute firewall-rules delete "$FIREWALL_DENY_RULE" \
            --project="$GCP_PROJECT_ID" \
            --quiet || cleanup_failed=1

          if [[ "$cleanup_failed" -ne 0 ]]; then
            echo "::warning::At least one GCE resource could not be deleted; check the project for leftovers named $VM_NAME*"
          fi
          exit "$cleanup_failed"

0 commit comments

Comments
 (0)