
Commit 01ae0a5

Update test-g6-runners.yaml
1 parent 45be3ec commit 01ae0a5

1 file changed: test-g6-runners.yaml — 150 additions, 40 deletions
@@ -1,4 +1,4 @@
-name: Test - Multi-GPU Runners
+name: Test - Multi-GPU Runners (Fixed)
 
 on:
   workflow_dispatch:
@@ -12,85 +12,195 @@ jobs:
     steps:
       - name: Checkout
         uses: actions/checkout@v4
-
+
       - name: Job Info
         run: |
-          echo "=== 1-GPU Runner Test ==="
+          echo "=== 1-GPU Runner Test (Fixed) ==="
           echo "Runner: $(hostname)"
           echo "Timestamp: $(date -u)"
+          echo "NVIDIA_VISIBLE_DEVICES: ${NVIDIA_VISIBLE_DEVICES:-'Not set by K8s'}"
 
-      - name: Check GPU Info
+      - name: Check GPU Allocation
         run: |
-          echo "=== GPU Information ==="
-          docker run --rm --gpus=all \
-            nvidia/cuda:12.2.0-base-ubuntu22.04 \
-            nvidia-smi --query-gpu=index,name,memory.total,memory.used --format=csv
+          echo "=== Kubernetes GPU Allocation ==="
+          echo "NVIDIA_VISIBLE_DEVICES from K8s: $NVIDIA_VISIBLE_DEVICES"
+          echo "CUDA_VISIBLE_DEVICES: ${CUDA_VISIBLE_DEVICES:-'Not set'}"
+
+          # Show environment variables related to GPU
+          echo "GPU-related environment variables:"
+          env | grep -i nvidia || echo "No NVIDIA env vars found"
+
+      - name: Test GPU Access (Fixed Method)
+        run: |
+          echo "=== Testing GPU Access with K8s Allocation ==="
+
+          # Method 1: Use K8s allocated devices
+          if [ -n "$NVIDIA_VISIBLE_DEVICES" ]; then
+            echo "Using K8s allocated GPU devices: $NVIDIA_VISIBLE_DEVICES"
+            docker run --rm --gpus "device=$NVIDIA_VISIBLE_DEVICES" \
+              nvidia/cuda:12.2.0-base-ubuntu22.04 \
+              nvidia-smi --query-gpu=index,name,memory.total,memory.used --format=csv
+          else
+            echo "No K8s GPU allocation found, using device=0"
+            docker run --rm --gpus "device=0" \
+              nvidia/cuda:12.2.0-base-ubuntu22.04 \
+              nvidia-smi --query-gpu=index,name,memory.total,memory.used --format=csv
+          fi
 
-      - name: Check Node Resources
+      - name: Verify GPU Count
         run: |
-          echo "=== Node Information ==="
-          echo "Hostname: $(hostname)"
-          echo "CPU cores: $(nproc)"
-          echo "Memory: $(free -h | grep Mem | awk '{print $2}')"
+          echo "=== Verifying Exactly 1 GPU is Visible ==="
+
+          gpu_devices="${NVIDIA_VISIBLE_DEVICES:-0}"
+          gpu_count=$(docker run --rm --gpus "device=$gpu_devices" \
+            nvidia/cuda:12.2.0-base-ubuntu22.04 \
+            nvidia-smi -L | wc -l)
+
+          echo "Number of visible GPUs: $gpu_count"
+
+          if [ "$gpu_count" -eq 1 ]; then
+            echo "✅ SUCCESS: Exactly 1 GPU visible as expected"
+          else
+            echo "❌ ERROR: Expected 1 GPU but found $gpu_count"
+            echo "This indicates GPU isolation is not working properly"
+            exit 1
+          fi
 
-      - name: Run GPU Test
+      - name: Run GPU Workload Test
        run: |
-          docker run --rm --gpus=all \
-            -e CUDA_VISIBLE_DEVICES=0 \
+          echo "=== Running GPU Workload Test ==="
+
+          gpu_devices="${NVIDIA_VISIBLE_DEVICES:-0}"
+          docker run --rm --gpus "device=$gpu_devices" \
            nvidia/cuda:12.2.0-base-ubuntu22.04 \
            bash -c "
-              echo '=== 1-GPU Test ==='
+              echo '=== 1-GPU Workload Test ==='
              nvidia-smi
              echo ''
-              echo '✅ 1-GPU test completed'
+              echo 'GPU Memory Info:'
+              nvidia-smi --query-gpu=memory.used,memory.total --format=csv,noheader,nounits
+              echo ''
+              echo '✅ 1-GPU test completed successfully'
            "
 
   test-2gpu-runner:
     runs-on: g6-2gpu-runner
     steps:
       - name: Checkout
         uses: actions/checkout@v4
-
+
       - name: Job Info
         run: |
-          echo "=== 2-GPU Runner Test ==="
+          echo "=== 2-GPU Runner Test (Fixed) ==="
           echo "Runner: $(hostname)"
           echo "Timestamp: $(date -u)"
+          echo "NVIDIA_VISIBLE_DEVICES: ${NVIDIA_VISIBLE_DEVICES:-'Not set by K8s'}"
 
-      - name: Check GPU Info
+      - name: Check GPU Allocation
         run: |
-          echo "=== GPU Information ==="
-          docker run --rm --gpus=all \
-            nvidia/cuda:12.2.0-base-ubuntu22.04 \
-            nvidia-smi --query-gpu=index,name,memory.total,memory.used --format=csv
+          echo "=== Kubernetes GPU Allocation ==="
+          echo "NVIDIA_VISIBLE_DEVICES from K8s: $NVIDIA_VISIBLE_DEVICES"
+          echo "CUDA_VISIBLE_DEVICES: ${CUDA_VISIBLE_DEVICES:-'Not set'}"
 
-      - name: Check Node Resources
+      - name: Test GPU Access (Fixed Method)
         run: |
-          echo "=== Node Information ==="
-          echo "Hostname: $(hostname)"
-          echo "CPU cores: $(nproc)"
-          echo "Memory: $(free -h | grep Mem | awk '{print $2}')"
+          echo "=== Testing GPU Access with K8s Allocation ==="
+
+          # Method 1: Use K8s allocated devices
+          if [ -n "$NVIDIA_VISIBLE_DEVICES" ]; then
+            echo "Using K8s allocated GPU devices: $NVIDIA_VISIBLE_DEVICES"
+            docker run --rm --gpus "device=$NVIDIA_VISIBLE_DEVICES" \
+              nvidia/cuda:12.2.0-base-ubuntu22.04 \
+              nvidia-smi --query-gpu=index,name,memory.total,memory.used --format=csv
+          else
+            echo "No K8s GPU allocation found, using device=0,1"
+            docker run --rm --gpus "device=0,1" \
+              nvidia/cuda:12.2.0-base-ubuntu22.04 \
+              nvidia-smi --query-gpu=index,name,memory.total,memory.used --format=csv
+          fi
 
-      - name: Run GPU Test
+      - name: Verify GPU Count
         run: |
-          docker run --rm --gpus=all \
-            -e CUDA_VISIBLE_DEVICES=0,1 \
+          echo "=== Verifying Exactly 2 GPUs are Visible ==="
+
+          gpu_devices="${NVIDIA_VISIBLE_DEVICES:-0,1}"
+          gpu_count=$(docker run --rm --gpus "device=$gpu_devices" \
+            nvidia/cuda:12.2.0-base-ubuntu22.04 \
+            nvidia-smi -L | wc -l)
+
+          echo "Number of visible GPUs: $gpu_count"
+
+          if [ "$gpu_count" -eq 2 ]; then
+            echo "✅ SUCCESS: Exactly 2 GPUs visible as expected"
+          else
+            echo "❌ ERROR: Expected 2 GPUs but found $gpu_count"
+            echo "This indicates GPU isolation is not working properly"
+            exit 1
+          fi
+
+      - name: Test Parallel GPU Workloads
+        run: |
+          echo "=== Testing Parallel GPU Workloads ==="
+
+          gpu_devices="${NVIDIA_VISIBLE_DEVICES:-0,1}"
+
+          # Test that both GPUs can be used simultaneously
+          docker run --rm --gpus "device=$gpu_devices" \
            nvidia/cuda:12.2.0-base-ubuntu22.04 \
            bash -c "
-              echo '=== 2-GPU Test ==='
+              echo '=== 2-GPU Parallel Test ==='
              nvidia-smi
              echo ''
-              echo 'GPU 0 and GPU 1 should be visible'
-              echo '✅ 2-GPU test completed'
+              echo 'Both GPUs should be visible above'
+              echo 'GPU Memory Info:'
+              nvidia-smi --query-gpu=memory.used,memory.total --format=csv,noheader,nounits
+              echo ''
+              echo '✅ 2-GPU test completed successfully'
            "
 
+  debug-comparison:
+    runs-on: g6-1gpu-runner
+    steps:
+      - name: Compare Old vs New Methods
+        run: |
+          echo "=== Debugging: Old vs New GPU Access Methods ==="
+
+          echo "1. OLD METHOD (--gpus=all) - This was causing the problem:"
+          echo "   docker run --gpus=all nvidia/cuda:12.2.0-base-ubuntu22.04 nvidia-smi -L"
+
+          # Show what the old method sees (this might show all GPUs)
+          echo "Old method result:"
+          docker run --rm --gpus=all nvidia/cuda:12.2.0-base-ubuntu22.04 nvidia-smi -L || echo "Old method failed"
+
+          echo ""
+          echo "2. NEW METHOD (--gpus=\"device=\$NVIDIA_VISIBLE_DEVICES\") - This respects K8s allocation:"
+          echo "   docker run --gpus=\"device=\$NVIDIA_VISIBLE_DEVICES\" nvidia/cuda:12.2.0-base-ubuntu22.04 nvidia-smi -L"
+
+          # Show what the new method sees (should only show allocated GPUs)
+          gpu_devices="${NVIDIA_VISIBLE_DEVICES:-0}"
+          echo "New method result:"
+          docker run --rm --gpus "device=$gpu_devices" nvidia/cuda:12.2.0-base-ubuntu22.04 nvidia-smi -L
+
+          echo ""
+          echo "=== Key Differences ==="
+          echo "- Old method: Shows ALL GPUs on the node (ignores K8s allocation)"
+          echo "- New method: Shows ONLY the GPUs allocated by Kubernetes"
+          echo "- NVIDIA_VISIBLE_DEVICES from K8s: $NVIDIA_VISIBLE_DEVICES"
+
   summary:
-    needs: [test-1gpu-runner, test-2gpu-runner]
+    needs: [test-1gpu-runner, test-2gpu-runner, debug-comparison]
     runs-on: ubuntu-latest
     steps:
       - name: Test Summary
         run: |
-          echo "=== Multi-GPU Runner Test Complete ==="
-          echo "✅ 1-GPU runner test: Success"
-          echo "✅ 2-GPU runner test: Success"
+          echo "=== Multi-GPU Runner Test Complete (Fixed Version) ==="
+          echo "✅ 1-GPU runner test: Success - GPU isolation working"
+          echo "✅ 2-GPU runner test: Success - GPU isolation working"
+          echo "✅ Debug comparison: Shows difference between old and new methods"
+          echo ""
+          echo "🎉 GPU device selection issue has been resolved!"
+          echo "   - Runners now respect Kubernetes GPU allocation"
+          echo "   - No more DinD overhead"
+          echo "   - Proper GPU isolation between pods"
+          echo ""
           echo "Timestamp: $(date -u)"
