Update test-g6-runners.yaml #3

name: Test - Multi-GPU Runners (Fixed)

on:
  workflow_dispatch:
  push:
    paths:
      - ".github/workflows/test-g6-runners.yaml"
jobs:
  test-1gpu-runner:
    runs-on: g6-1gpu-runner
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Job Info
        run: |
          echo "=== 1-GPU Runner Test (Fixed) ==="
          echo "Runner: $(hostname)"
          echo "Timestamp: $(date -u)"
          echo "NVIDIA_VISIBLE_DEVICES: ${NVIDIA_VISIBLE_DEVICES:-'Not set by K8s'}"
      - name: Check GPU Allocation
        run: |
          echo "=== Kubernetes GPU Allocation ==="
          echo "NVIDIA_VISIBLE_DEVICES from K8s: $NVIDIA_VISIBLE_DEVICES"
          echo "CUDA_VISIBLE_DEVICES: ${CUDA_VISIBLE_DEVICES:-'Not set'}"
          # Show environment variables related to GPU
          echo "GPU-related environment variables:"
          env | grep -i nvidia || echo "No NVIDIA env vars found"
      - name: Test GPU Access (Fixed Method)
        run: |
          echo "=== Testing GPU Access with K8s Allocation ==="
          # Method 1: Use K8s allocated devices
          if [ -n "$NVIDIA_VISIBLE_DEVICES" ]; then
            echo "Using K8s allocated GPU devices: $NVIDIA_VISIBLE_DEVICES"
            docker run --rm --gpus "device=$NVIDIA_VISIBLE_DEVICES" \
              nvidia/cuda:12.2.0-base-ubuntu22.04 \
              nvidia-smi --query-gpu=index,name,memory.total,memory.used --format=csv
          else
            echo "No K8s GPU allocation found, using device=0"
            docker run --rm --gpus "device=0" \
              nvidia/cuda:12.2.0-base-ubuntu22.04 \
              nvidia-smi --query-gpu=index,name,memory.total,memory.used --format=csv
          fi
      - name: Verify GPU Count
        run: |
          echo "=== Verifying Exactly 1 GPU is Visible ==="
          gpu_devices="${NVIDIA_VISIBLE_DEVICES:-0}"
          gpu_count=$(docker run --rm --gpus "device=$gpu_devices" \
            nvidia/cuda:12.2.0-base-ubuntu22.04 \
            nvidia-smi -L | wc -l)
          echo "Number of visible GPUs: $gpu_count"
          if [ "$gpu_count" -eq 1 ]; then
            echo "✅ SUCCESS: Exactly 1 GPU visible as expected"
          else
            echo "❌ ERROR: Expected 1 GPU but found $gpu_count"
            echo "This indicates GPU isolation is not working properly"
            exit 1
          fi
      - name: Run GPU Workload Test
        run: |
          echo "=== Running GPU Workload Test ==="
          gpu_devices="${NVIDIA_VISIBLE_DEVICES:-0}"
          docker run --rm --gpus "device=$gpu_devices" \
            nvidia/cuda:12.2.0-base-ubuntu22.04 \
            bash -c "
              echo '=== 1-GPU Workload Test ==='
              nvidia-smi
              echo ''
              echo 'GPU Memory Info:'
              nvidia-smi --query-gpu=memory.used,memory.total --format=csv,noheader,nounits
              echo ''
              echo '✅ 1-GPU test completed successfully'
            "
  test-2gpu-runner:
    runs-on: g6-2gpu-runner
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Job Info
        run: |
          echo "=== 2-GPU Runner Test (Fixed) ==="
          echo "Runner: $(hostname)"
          echo "Timestamp: $(date -u)"
          echo "NVIDIA_VISIBLE_DEVICES: ${NVIDIA_VISIBLE_DEVICES:-'Not set by K8s'}"
      - name: Check GPU Allocation
        run: |
          echo "=== Kubernetes GPU Allocation ==="
          echo "NVIDIA_VISIBLE_DEVICES from K8s: $NVIDIA_VISIBLE_DEVICES"
          echo "CUDA_VISIBLE_DEVICES: ${CUDA_VISIBLE_DEVICES:-'Not set'}"
      - name: Test GPU Access (Fixed Method)
        run: |
          echo "=== Testing GPU Access with K8s Allocation ==="
          # Method 1: Use K8s allocated devices
          # The device list is comma-separated, and Docker parses the --gpus value
          # as CSV, so the list is wrapped in literal double quotes.
          if [ -n "$NVIDIA_VISIBLE_DEVICES" ]; then
            echo "Using K8s allocated GPU devices: $NVIDIA_VISIBLE_DEVICES"
            docker run --rm --gpus "\"device=$NVIDIA_VISIBLE_DEVICES\"" \
              nvidia/cuda:12.2.0-base-ubuntu22.04 \
              nvidia-smi --query-gpu=index,name,memory.total,memory.used --format=csv
          else
            echo "No K8s GPU allocation found, using device=0,1"
            docker run --rm --gpus '"device=0,1"' \
              nvidia/cuda:12.2.0-base-ubuntu22.04 \
              nvidia-smi --query-gpu=index,name,memory.total,memory.used --format=csv
          fi
      - name: Verify GPU Count
        run: |
          echo "=== Verifying Exactly 2 GPUs are Visible ==="
          gpu_devices="${NVIDIA_VISIBLE_DEVICES:-0,1}"
          gpu_count=$(docker run --rm --gpus "\"device=$gpu_devices\"" \
            nvidia/cuda:12.2.0-base-ubuntu22.04 \
            nvidia-smi -L | wc -l)
          echo "Number of visible GPUs: $gpu_count"
          if [ "$gpu_count" -eq 2 ]; then
            echo "✅ SUCCESS: Exactly 2 GPUs visible as expected"
          else
            echo "❌ ERROR: Expected 2 GPUs but found $gpu_count"
            echo "This indicates GPU isolation is not working properly"
            exit 1
          fi
      - name: Test Parallel GPU Workloads
        run: |
          echo "=== Testing Parallel GPU Workloads ==="
          gpu_devices="${NVIDIA_VISIBLE_DEVICES:-0,1}"
          # Test that both GPUs can be used simultaneously
          docker run --rm --gpus "\"device=$gpu_devices\"" \
            nvidia/cuda:12.2.0-base-ubuntu22.04 \
            bash -c "
              echo '=== 2-GPU Parallel Test ==='
              nvidia-smi
              echo ''
              echo 'Both GPUs should be visible above'
              echo 'GPU Memory Info:'
              nvidia-smi --query-gpu=memory.used,memory.total --format=csv,noheader,nounits
              echo ''
              echo '✅ 2-GPU test completed successfully'
            "
  debug-comparison:
    runs-on: g6-1gpu-runner
    steps:
      - name: Compare Old vs New Methods
        run: |
          echo "=== Debugging: Old vs New GPU Access Methods ==="
          echo "1. OLD METHOD (--gpus=all) - This was causing the problem:"
          echo "   docker run --gpus=all nvidia/cuda:12.2.0-base-ubuntu22.04 nvidia-smi -L"
          # Show what the old method sees (this might show all GPUs)
          echo "Old method result:"
          docker run --rm --gpus=all nvidia/cuda:12.2.0-base-ubuntu22.04 nvidia-smi -L || echo "Old method failed"
          echo ""
          echo "2. NEW METHOD (--gpus=\"device=\$NVIDIA_VISIBLE_DEVICES\") - This respects K8s allocation:"
          echo "   docker run --gpus=\"device=\$NVIDIA_VISIBLE_DEVICES\" nvidia/cuda:12.2.0-base-ubuntu22.04 nvidia-smi -L"
          # Show what the new method sees (should only show allocated GPUs)
          gpu_devices="${NVIDIA_VISIBLE_DEVICES:-0}"
          echo "New method result:"
          docker run --rm --gpus "device=$gpu_devices" nvidia/cuda:12.2.0-base-ubuntu22.04 nvidia-smi -L
          echo ""
          echo "=== Key Differences ==="
          echo "- Old method: Shows ALL GPUs on the node (ignores K8s allocation)"
          echo "- New method: Shows ONLY the GPUs allocated by Kubernetes"
          echo "- NVIDIA_VISIBLE_DEVICES from K8s: $NVIDIA_VISIBLE_DEVICES"
  summary:
    needs: [test-1gpu-runner, test-2gpu-runner, debug-comparison]
    runs-on: ubuntu-latest
    steps:
      - name: Test Summary
        run: |
          echo "=== Multi-GPU Runner Test Complete (Fixed Version) ==="
          echo "✅ 1-GPU runner test: Success - GPU isolation working"
          echo "✅ 2-GPU runner test: Success - GPU isolation working"
          echo "✅ Debug comparison: Shows difference between old and new methods"
          echo ""
          echo "🎉 GPU device selection issue has been resolved!"
          echo "   - Runners now respect Kubernetes GPU allocation"
          echo "   - No more DinD overhead"
          echo "   - Proper GPU isolation between pods"
          echo ""
          echo "Timestamp: $(date -u)"