# Workflow file: .github/workflows/test-native-gpu-runner.yaml
# Captured from the GitHub Actions run "Update test-native-gpu-runner.yaml #2".


# Diagnostic workflow that exercises the native GPU runner.
name: Test - Native GPU Runner

# Triggers: manual dispatch, or any push that modifies this workflow file.
on:
  workflow_dispatch:
  push:
    paths:
      - ".github/workflows/test-native-gpu-runner.yaml"
jobs:
  # Single job that prints runner, GPU, CUDA and host resource information.
  test-2gpu-native:
    runs-on: g6-2gpu-native-runner
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      # Identify which runner picked up the job and when.
      - name: Job Info
        run: |
          echo "=== Native GPU Runner Test ==="
          echo "Runner: $(hostname)"
          echo "Timestamp: $(date -u)"

      # Print the GPU visibility variables, or "not set" when they are absent.
      - name: Check GPU Environment
        run: |
          echo "=== GPU Environment ==="
          echo "NVIDIA_VISIBLE_DEVICES: ${NVIDIA_VISIBLE_DEVICES:-not set}"
          echo "CUDA_VISIBLE_DEVICES: ${CUDA_VISIBLE_DEVICES:-not set}"

      # No fallback here: the step fails if nvidia-smi is missing or errors.
      - name: Check GPU Info
        run: |
          echo "=== GPU Information ==="
          nvidia-smi
          nvidia-smi -L
          nvidia-smi --query-gpu=index,name,uuid,memory.total,memory.used --format=csv

      # Best-effort: a missing nvcc or CUDA directory does not fail the step.
      - name: Check CUDA
        run: |
          echo "=== CUDA Info ==="
          nvcc --version || echo "nvcc not in PATH"
          ls -la /usr/local/cuda/bin/ || true

      # Report hostname, CPU core count and total memory of the host.
      - name: Check Node Resources
        run: |
          echo "=== Node Information ==="
          echo "Hostname: $(hostname)"
          echo "CPU cores: $(nproc)"
          echo "Memory: $(free -h | grep Mem | awk '{print $2}')"

  # Test parallel jobs to verify GPU isolation
  # Two matrix jobs target the same runner label; each prints the GPU UUIDs it
  # sees so the outputs of the two jobs can be compared.
  test-parallel-native:
    strategy:
      matrix:
        job_id: [1, 2]
    runs-on: g6-2gpu-native-runner
    steps:
      - name: Job Info
        run: |
          echo "=== Parallel Native GPU Test ==="
          echo "Job ID: ${{ matrix.job_id }}"
          echo "Hostname: $(hostname)"
          echo "Start Time: $(date -u +%Y-%m-%dT%H:%M:%S)"

      # List the GPUs visible to this matrix job (index, UUID, name).
      - name: Check GPU Allocation
        run: |
          echo "=== GPU Allocation Check for Job ${{ matrix.job_id }} ==="
          echo "NVIDIA_VISIBLE_DEVICES: ${NVIDIA_VISIBLE_DEVICES:-not set}"
          echo ""
          echo "GPU UUIDs assigned to this job:"
          nvidia-smi --query-gpu=index,uuid,name --format=csv
          echo ""
          echo "Full GPU Details:"
          nvidia-smi -L

      # Keep the job alive for 60 seconds; start and finish timestamps are
      # logged so the two matrix jobs' time windows can be compared.
      - name: Simulate Workload with GPU Lock
        run: |
          echo "=== Job ${{ matrix.job_id }} - Holding GPUs for 60 seconds ==="
          echo "GPUs held by this job:"
          nvidia-smi --query-gpu=uuid --format=csv,noheader
          echo ""
          echo "Starting workload at: $(date -u +%Y-%m-%dT%H:%M:%S)"
          sleep 60
          echo "Finished workload at: $(date -u +%Y-%m-%dT%H:%M:%S)"
          echo "✅ Job ${{ matrix.job_id }} completed"