Commit 9209454

Add ARC GPU runner test workflow

1 parent 16a70b4 commit 9209454

1 file changed: 195 additions & 0 deletions

@@ -0,0 +1,195 @@
# Test workflow for EKS Auto Mode GPU runners with ARC
# This workflow validates the GPU runner infrastructure

name: Test ARC GPU Runners

on:
  workflow_dispatch:
    inputs:
      test_light_runners:
        description: 'Test light-gpu-runners (1 GPU)'
        required: false
        default: true
        type: boolean
      test_heavy_runners:
        description: 'Test heavy-gpu-runners (2 GPUs)'
        required: false
        default: true
        type: boolean

jobs:
  test-light-gpu-runner:
    if: ${{ inputs.test_light_runners }}
    runs-on: light-gpu-runners
    timeout-minutes: 30
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: System Information
        run: |
          echo "=== System Information ==="
          echo "Hostname: $(hostname)"
          echo "Kernel: $(uname -r)"
          echo "CPU Info:"
          lscpu | grep -E "^(Architecture|CPU\(s\)|Model name)"
          echo ""
          echo "Memory Info:"
          free -h
          echo ""
          echo "Disk Info:"
          df -h /

      - name: Verify GPU Access
        run: |
          echo "=== GPU Verification ==="
          if command -v nvidia-smi &> /dev/null; then
            echo "nvidia-smi found, checking GPU..."
            nvidia-smi
            echo ""
            echo "GPU Count: $(nvidia-smi -L | wc -l)"
            nvidia-smi -L
          else
            echo "nvidia-smi not found in runner, checking via Docker..."
          fi

      - name: Test GPU via Docker
        run: |
          echo "=== Docker GPU Test ==="
          docker run --rm --gpus all nvidia/cuda:12.2.0-runtime-ubuntu22.04 nvidia-smi

      - name: Verify Single GPU Allocation
        run: |
          echo "=== Verifying 1 GPU Allocation ==="
          GPU_COUNT=$(docker run --rm --gpus all nvidia/cuda:12.2.0-runtime-ubuntu22.04 nvidia-smi -L | wc -l)
          echo "GPUs available: $GPU_COUNT"
          if [ "$GPU_COUNT" -ge 1 ]; then
            echo "✅ GPU allocation verified"
          else
            echo "❌ Expected at least 1 GPU, got $GPU_COUNT"
            exit 1
          fi

      - name: Test CUDA Compute
        run: |
          echo "=== CUDA Compute Test ==="
          docker run --rm --gpus all nvidia/cuda:12.2.0-devel-ubuntu22.04 bash -c '
          cat > /tmp/test.cu << EOF
          #include <stdio.h>
          __global__ void hello() {
            printf("Hello from GPU thread %d\\n", threadIdx.x);
          }
          int main() {
            hello<<<1, 5>>>();
            cudaDeviceSynchronize();
            return 0;
          }
          EOF
          nvcc /tmp/test.cu -o /tmp/test && /tmp/test
          '
  test-heavy-gpu-runner:
    if: ${{ inputs.test_heavy_runners }}
    runs-on: heavy-gpu-runners
    timeout-minutes: 30
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: System Information
        run: |
          echo "=== System Information ==="
          echo "Hostname: $(hostname)"
          echo "Kernel: $(uname -r)"
          echo "CPU Info:"
          lscpu | grep -E "^(Architecture|CPU\(s\)|Model name)"
          echo ""
          echo "Memory Info:"
          free -h
          echo ""
          echo "Disk Info:"
          df -h /

      - name: Verify GPU Access
        run: |
          echo "=== GPU Verification ==="
          if command -v nvidia-smi &> /dev/null; then
            echo "nvidia-smi found, checking GPU..."
            nvidia-smi
            echo ""
            echo "GPU Count: $(nvidia-smi -L | wc -l)"
            nvidia-smi -L
          else
            echo "nvidia-smi not found in runner, checking via Docker..."
          fi

      - name: Test GPU via Docker
        run: |
          echo "=== Docker GPU Test ==="
          docker run --rm --gpus all nvidia/cuda:12.2.0-runtime-ubuntu22.04 nvidia-smi

      - name: Verify Multi-GPU Allocation
        run: |
          echo "=== Verifying 2 GPU Allocation ==="
          GPU_COUNT=$(docker run --rm --gpus all nvidia/cuda:12.2.0-runtime-ubuntu22.04 nvidia-smi -L | wc -l)
          echo "GPUs available: $GPU_COUNT"
          if [ "$GPU_COUNT" -ge 2 ]; then
            echo "✅ Multi-GPU allocation verified"
          else
            echo "❌ Expected at least 2 GPUs, got $GPU_COUNT"
            exit 1
          fi

      - name: Test Multi-GPU NCCL Communication
        run: |
          echo "=== Multi-GPU NCCL Test ==="
          docker run --rm --gpus all \
            -e NCCL_DEBUG=INFO \
            nvidia/cuda:12.2.0-devel-ubuntu22.04 bash -c '
          # Install the NCCL libraries; the compiled check below uses the
          # CUDA runtime API to enumerate devices and probe P2P access.
          apt-get update && apt-get install -y libnccl2 libnccl-dev > /dev/null 2>&1
          cat > /tmp/nccl_test.cu << EOF
          #include <stdio.h>
          #include <cuda_runtime.h>
          int main() {
            int deviceCount;
            cudaGetDeviceCount(&deviceCount);
            printf("CUDA Device Count: %d\\n", deviceCount);
            for (int i = 0; i < deviceCount; i++) {
              cudaDeviceProp prop;
              cudaGetDeviceProperties(&prop, i);
              printf("Device %d: %s (Compute %d.%d)\\n", i, prop.name, prop.major, prop.minor);
            }
            // Test P2P access between the two GPUs in both directions
            if (deviceCount >= 2) {
              int canAccess;
              cudaDeviceCanAccessPeer(&canAccess, 0, 1);
              printf("P2P Access 0->1: %s\\n", canAccess ? "Yes" : "No");
              cudaDeviceCanAccessPeer(&canAccess, 1, 0);
              printf("P2P Access 1->0: %s\\n", canAccess ? "Yes" : "No");
            }
            return 0;
          }
          EOF
          nvcc /tmp/nccl_test.cu -o /tmp/nccl_test && /tmp/nccl_test
          '
  summary:
    needs: [test-light-gpu-runner, test-heavy-gpu-runner]
    if: always()
    runs-on: ubuntu-latest
    steps:
      - name: Test Summary
        run: |
          echo "=== ARC GPU Runner Test Summary ==="
          echo ""
          echo "Light GPU Runner (1 GPU): ${{ needs.test-light-gpu-runner.result }}"
          echo "Heavy GPU Runner (2 GPUs): ${{ needs.test-heavy-gpu-runner.result }}"
          echo ""
          if [ "${{ needs.test-light-gpu-runner.result }}" == "success" ] || [ "${{ needs.test-light-gpu-runner.result }}" == "skipped" ]; then
            if [ "${{ needs.test-heavy-gpu-runner.result }}" == "success" ] || [ "${{ needs.test-heavy-gpu-runner.result }}" == "skipped" ]; then
              echo "✅ All requested tests passed!"
              exit 0
            fi
          fi
          echo "❌ Some tests failed"
          exit 1
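
Note: since the workflow only has a workflow_dispatch trigger, it must be dispatched manually once merged. A minimal sketch using the GitHub CLI, assuming the file lands under .github/workflows/ on the default branch (the workflow name comes from the diff above; the input flags match the two booleans it declares):

# Run both runner tests (both inputs default to true)
gh workflow run "Test ARC GPU Runners"

# Exercise only the light (1 GPU) pool
gh workflow run "Test ARC GPU Runners" -f test_light_runners=true -f test_heavy_runners=false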

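Note: the runs-on labels light-gpu-runners and heavy-gpu-runners are not defined in this commit. With ARC's gha-runner-scale-set Helm chart, the Helm release name becomes the runs-on label, so values along the following lines could back the light pool. This is a hypothetical sketch; the repo URL, secret name, and runner counts are assumptions, not taken from this repo:

# Hypothetical: helm install light-gpu-runners \
#   oci://ghcr.io/actions/actions-runner-controller-charts/gha-runner-scale-set \
#   -f values.yaml
githubConfigUrl: https://github.com/<org>/<repo>  # assumption: repo-level registration
githubConfigSecret: arc-github-secret             # assumption: pre-created secret
maxRunners: 4
template:
  spec:
    containers:
      - name: runner
        image: ghcr.io/actions/actions-runner:latest
        command: ["/home/runner/run.sh"]
        resources:
          limits:
            nvidia.com/gpu: 1  # one GPU per runner pod; a heavy pool would request 2

On EKS Auto Mode, the nvidia.com/gpu request on the runner pod is what should trigger provisioning of a GPU node, which is the behavior the two allocation-check steps above validate.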