Fix workflow to trigger on push to master/feature branch #1

Workflow file for this run

.github/workflows/test-arc-gpu-runners.yml at d1c89f6

	# Test workflow for EKS Auto Mode GPU runners with ARC
	# This workflow validates the GPU runner infrastructure

	name: Test ARC GPU Runners

	# Triggers:
	#   - push to master, main, or the feature/eks-arc-gpu-runners branch
	#   - manual dispatch, with one boolean input per runner class; both
	#     inputs default to true, so a plain manual run tests both classes
	on:
	  push:
	    branches:
	      - master
	      - main
	      - 'feature/eks-arc-gpu-runners'
	  workflow_dispatch:
	    inputs:
	      test_light_runners:
	        description: 'Test light-gpu-runners (1 GPU)'
	        required: false
	        default: true
	        type: boolean
	      test_heavy_runners:
	        description: 'Test heavy-gpu-runners (2 GPUs)'
	        required: false
	        default: true
	        type: boolean

	jobs:
	  # Validates a runner from the `light-gpu-runners` label: prints host
	  # details, checks GPU visibility (natively and through Docker), asserts
	  # that at least one GPU is allocated, and compiles/runs a small CUDA
	  # kernel. Runs on every push, or on manual dispatch when
	  # `test_light_runners` is true.
	  test-light-gpu-runner:
	    if: ${{ github.event_name == 'push' || inputs.test_light_runners }}
	    runs-on: light-gpu-runners
	    timeout-minutes: 30
	    steps:
	      - name: Checkout repository
	        uses: actions/checkout@v4

	      - name: System Information
	        # The parentheses in "CPU\(s\)" are escaped so that grep -E matches
	        # the literal "CPU(s)" label instead of opening a regex group.
	        run: |
	          echo "=== System Information ==="
	          echo "Hostname: $(hostname)"
	          echo "Kernel: $(uname -r)"
	          echo "CPU Info:"
	          lscpu | grep -E "^(Architecture|CPU\(s\)|Model name)"
	          echo ""
	          echo "Memory Info:"
	          free -h
	          echo ""
	          echo "Disk Info:"
	          df -h /

	      - name: Verify GPU Access
	        # Informational only: a runner without nvidia-smi on its PATH does
	        # not fail here; the Docker-based steps below do the real checks.
	        run: |
	          echo "=== GPU Verification ==="
	          if command -v nvidia-smi &> /dev/null; then
	            echo "nvidia-smi found, checking GPU..."
	            nvidia-smi
	            echo ""
	            echo "GPU Count: $(nvidia-smi -L | wc -l)"
	            nvidia-smi -L
	          else
	            echo "nvidia-smi not found in runner, checking via Docker..."
	          fi

	      - name: Test GPU via Docker
	        run: |
	          echo "=== Docker GPU Test ==="
	          docker run --rm --gpus all nvidia/cuda:12.2.0-runtime-ubuntu22.04 nvidia-smi

	      - name: Verify Single GPU Allocation
	        # Counts the lines printed by `nvidia-smi -L` inside the container
	        # and fails the step when fewer than one is reported.
	        run: |
	          echo "=== Verifying 1 GPU Allocation ==="
	          GPU_COUNT=$(docker run --rm --gpus all nvidia/cuda:12.2.0-runtime-ubuntu22.04 nvidia-smi -L | wc -l)
	          echo "GPUs available: $GPU_COUNT"
	          if [ "$GPU_COUNT" -ge 1 ]; then
	            echo "✅ GPU allocation verified"
	          else
	            echo "❌ Expected at least 1 GPU, got $GPU_COUNT"
	            exit 1
	          fi

	      - name: Test CUDA Compute
	        # Writes a tiny CUDA program through a heredoc inside the container,
	        # then compiles and runs it. The EOF terminator must stay at the
	        # block scalar's base indentation so the shell sees it at column 0.
	        run: |
	          echo "=== CUDA Compute Test ==="
	          docker run --rm --gpus all nvidia/cuda:12.2.0-devel-ubuntu22.04 bash -c '
	          cat > /tmp/test.cu << EOF
	          #include <stdio.h>
	          __global__ void hello() {
	            printf("Hello from GPU thread %d\\n", threadIdx.x);
	          }
	          int main() {
	            hello<<<1, 5>>>();
	            cudaDeviceSynchronize();
	            return 0;
	          }
	          EOF
	          nvcc /tmp/test.cu -o /tmp/test && /tmp/test
	          '

	  # Validates a runner from the `heavy-gpu-runners` label: same host and
	  # GPU visibility checks as the light job, but asserts at least two GPUs
	  # and queries peer-to-peer access between devices 0 and 1. Runs on every
	  # push, or on manual dispatch when `test_heavy_runners` is true.
	  test-heavy-gpu-runner:
	    if: ${{ github.event_name == 'push' || inputs.test_heavy_runners }}
	    runs-on: heavy-gpu-runners
	    timeout-minutes: 30
	    steps:
	      - name: Checkout repository
	        uses: actions/checkout@v4

	      - name: System Information
	        # The parentheses in "CPU\(s\)" are escaped so that grep -E matches
	        # the literal "CPU(s)" label instead of opening a regex group.
	        run: |
	          echo "=== System Information ==="
	          echo "Hostname: $(hostname)"
	          echo "Kernel: $(uname -r)"
	          echo "CPU Info:"
	          lscpu | grep -E "^(Architecture|CPU\(s\)|Model name)"
	          echo ""
	          echo "Memory Info:"
	          free -h
	          echo ""
	          echo "Disk Info:"
	          df -h /

	      - name: Verify GPU Access
	        # Informational only: a runner without nvidia-smi on its PATH does
	        # not fail here; the Docker-based steps below do the real checks.
	        run: |
	          echo "=== GPU Verification ==="
	          if command -v nvidia-smi &> /dev/null; then
	            echo "nvidia-smi found, checking GPU..."
	            nvidia-smi
	            echo ""
	            echo "GPU Count: $(nvidia-smi -L | wc -l)"
	            nvidia-smi -L
	          else
	            echo "nvidia-smi not found in runner, checking via Docker..."
	          fi

	      - name: Test GPU via Docker
	        run: |
	          echo "=== Docker GPU Test ==="
	          docker run --rm --gpus all nvidia/cuda:12.2.0-runtime-ubuntu22.04 nvidia-smi

	      - name: Verify Multi-GPU Allocation
	        # Counts the lines printed by `nvidia-smi -L` inside the container
	        # and fails the step when fewer than two are reported.
	        run: |
	          echo "=== Verifying 2 GPU Allocation ==="
	          GPU_COUNT=$(docker run --rm --gpus all nvidia/cuda:12.2.0-runtime-ubuntu22.04 nvidia-smi -L | wc -l)
	          echo "GPUs available: $GPU_COUNT"
	          if [ "$GPU_COUNT" -ge 2 ]; then
	            echo "✅ Multi-GPU allocation verified"
	          else
	            echo "❌ Expected at least 2 GPUs, got $GPU_COUNT"
	            exit 1
	          fi

	      - name: Test Multi-GPU NCCL Communication
	        # NOTE(review): despite the step name, the compiled program only
	        # enumerates CUDA devices and queries P2P access; it does not call
	        # NCCL. The apt-get install result is not checked, so a failed
	        # install does not stop the step. Confirm whether a real NCCL
	        # collective test is intended.
	        # The EOF terminator must stay at the block scalar's base
	        # indentation so the shell sees it at column 0.
	        run: |
	          echo "=== Multi-GPU NCCL Test ==="
	          docker run --rm --gpus all \
	            -e NCCL_DEBUG=INFO \
	            nvidia/cuda:12.2.0-devel-ubuntu22.04 bash -c '
	          apt-get update && apt-get install -y libnccl2 libnccl-dev > /dev/null 2>&1
	          cat > /tmp/nccl_test.cu << EOF
	          #include <stdio.h>
	          #include <cuda_runtime.h>
	          int main() {
	            int deviceCount;
	            cudaGetDeviceCount(&deviceCount);
	            printf("CUDA Device Count: %d\\n", deviceCount);
	            for (int i = 0; i < deviceCount; i++) {
	              cudaDeviceProp prop;
	              cudaGetDeviceProperties(&prop, i);
	              printf("Device %d: %s (Compute %d.%d)\\n", i, prop.name, prop.major, prop.minor);
	            }
	            // Test P2P access
	            if (deviceCount >= 2) {
	              int canAccess;
	              cudaDeviceCanAccessPeer(&canAccess, 0, 1);
	              printf("P2P Access 0->1: %s\\n", canAccess ? "Yes" : "No");
	              cudaDeviceCanAccessPeer(&canAccess, 1, 0);
	              printf("P2P Access 1->0: %s\\n", canAccess ? "Yes" : "No");
	            }
	            return 0;
	          }
	          EOF
	          nvcc /tmp/nccl_test.cu -o /tmp/nccl_test && /tmp/nccl_test
	          '

	  # Aggregates the two GPU jobs. `if: always()` makes this job run even
	  # when an upstream job failed or was skipped. A job counts as passing
	  # when its result is "success" or "skipped" (i.e. it was not requested).
	  summary:
	    needs: [test-light-gpu-runner, test-heavy-gpu-runner]
	    if: always()
	    runs-on: ubuntu-latest
	    steps:
	      - name: Test Summary
	        # Job results are passed through environment variables rather than
	        # being interpolated directly into the script text.
	        env:
	          LIGHT_RESULT: ${{ needs.test-light-gpu-runner.result }}
	          HEAVY_RESULT: ${{ needs.test-heavy-gpu-runner.result }}
	        run: |
	          echo "=== ARC GPU Runner Test Summary ==="
	          echo ""
	          echo "Light GPU Runner (1 GPU): $LIGHT_RESULT"
	          echo "Heavy GPU Runner (2 GPUs): $HEAVY_RESULT"
	          echo ""
	          for result in "$LIGHT_RESULT" "$HEAVY_RESULT"; do
	            case "$result" in
	              success|skipped)
	                ;;
	              *)
	                echo "❌ Some tests failed"
	                exit 1
	                ;;
	            esac
	          done
	          echo "✅ All requested tests passed!"
	          exit 0

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Fix workflow to trigger on push to master/feature branch #1

Workflow file

Fix workflow to trigger on push to master/feature branch #1

Uh oh!

Workflow file for this run