       - name: Check GPU Info
         run: |
           echo "=== GPU Information ==="
-          nvidia-smi --query-gpu=index,name,memory.total,memory.used --format=csv
+          docker run --rm --gpus all nvidia/cuda:12.2.0-runtime-ubuntu22.04 \
+            nvidia-smi --query-gpu=index,name,memory.total,memory.used --format=csv
 
       - name: Check Node Resources
         run: |
@@ -33,20 +34,12 @@ jobs:
 
       - name: Run GPU Test
         run: |
-          echo "NVIDIA_VISIBLE_DEVICES from K8s: ${NVIDIA_VISIBLE_DEVICES:-not set}"
-          docker run --rm \
-            -e NVIDIA_VISIBLE_DEVICES="${NVIDIA_VISIBLE_DEVICES:-all}" \
-            nvidia/cuda:12.2.0-runtime-ubuntu22.04 \
+          docker run --rm --gpus all nvidia/cuda:12.2.0-runtime-ubuntu22.04 \
             bash -c "
               echo '=== 1-GPU Test ==='
               echo 'NVIDIA_VISIBLE_DEVICES: \$NVIDIA_VISIBLE_DEVICES'
               echo 'CUDA_VISIBLE_DEVICES: \$CUDA_VISIBLE_DEVICES'
-              if command -v nvidia-smi &> /dev/null; then
-                nvidia-smi
-              else
-                echo 'nvidia-smi not available, testing CUDA runtime...'
-                echo 'GPU access: OK'
-              fi
+              nvidia-smi
               echo ''
               echo '✅ 1-GPU test completed'
             "
@@ -66,7 +59,8 @@ jobs:
       - name: Check GPU Info
         run: |
           echo "=== GPU Information ==="
-          nvidia-smi --query-gpu=index,name,memory.total,memory.used --format=csv
+          docker run --rm --gpus all nvidia/cuda:12.2.0-runtime-ubuntu22.04 \
+            nvidia-smi --query-gpu=index,name,memory.total,memory.used --format=csv
 
       - name: Check Node Resources
         run: |
@@ -77,21 +71,12 @@ jobs:
 
       - name: Run GPU Test
         run: |
-          echo "NVIDIA_VISIBLE_DEVICES from K8s: ${NVIDIA_VISIBLE_DEVICES:-not set}"
-          # Use Kubernetes GPU allocation instead of --gpus=all
-          docker run --rm \
-            -e NVIDIA_VISIBLE_DEVICES="${NVIDIA_VISIBLE_DEVICES:-all}" \
-            nvidia/cuda:12.2.0-runtime-ubuntu22.04 \
+          docker run --rm --gpus all nvidia/cuda:12.2.0-runtime-ubuntu22.04 \
             bash -c "
               echo '=== 2-GPU Test ==='
               echo 'NVIDIA_VISIBLE_DEVICES: \$NVIDIA_VISIBLE_DEVICES'
               echo 'CUDA_VISIBLE_DEVICES: \$CUDA_VISIBLE_DEVICES'
-              if command -v nvidia-smi &> /dev/null; then
-                nvidia-smi
-              else
-                echo 'nvidia-smi not available, testing CUDA runtime...'
-                echo 'GPU access: OK'
-              fi
+              nvidia-smi
               echo ''
               echo '✅ 2-GPU test completed'
             "
@@ -117,29 +102,21 @@ jobs:
       - name: Check GPU Allocation
         run: |
           echo "=== GPU Allocation Check ==="
-          echo "NVIDIA_VISIBLE_DEVICES: ${NVIDIA_VISIBLE_DEVICES:-not set}"
-          nvidia-smi --query-gpu=index,name,memory.total,memory.used --format=csv
+          docker run --rm --gpus all nvidia/cuda:12.2.0-runtime-ubuntu22.04 \
+            nvidia-smi --query-gpu=index,name,memory.total,memory.used --format=csv
 
       - name: Run Parallel GPU Test
         run: |
-          echo "NVIDIA_VISIBLE_DEVICES from K8s: ${NVIDIA_VISIBLE_DEVICES:-not set}"
           echo "Testing ${{ matrix.runner }} with GPU isolation"
 
-          docker run --rm \
-            -e NVIDIA_VISIBLE_DEVICES="${NVIDIA_VISIBLE_DEVICES:-all}" \
-            nvidia/cuda:12.2.0-runtime-ubuntu22.04 \
+          docker run --rm --gpus all nvidia/cuda:12.2.0-runtime-ubuntu22.04 \
             bash -c "
               echo '=== Parallel Test Job ${{ matrix.job_id }} ==='
               echo 'Runner: ${{ matrix.runner }}'
               echo 'NVIDIA_VISIBLE_DEVICES: \$NVIDIA_VISIBLE_DEVICES'
-              if command -v nvidia-smi &> /dev/null; then
-                nvidia-smi -L
-                echo 'GPU Memory Info:'
-                nvidia-smi --query-gpu=memory.total,memory.used --format=csv,noheader,nounits
-              else
-                echo 'nvidia-smi not available, testing CUDA runtime...'
-                echo 'GPU access: OK'
-              fi
+              nvidia-smi -L
+              echo 'GPU Memory Info:'
+              nvidia-smi --query-gpu=memory.total,memory.used --format=csv,noheader,nounits
               echo ''
               echo '✅ Parallel test completed for job ${{ matrix.job_id }}'
             "