Update test-g6-runners.yaml #7

name: Test - G6 Runner Autoscaling

on:
  workflow_dispatch:
  push:
    paths:
      - ".github/workflows/test-g6-runners.yaml"

jobs:
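  # All jobs assume the g6-1gpu-runner and g6-2gpu-runner labels map to
  # self-hosted runner scale sets exposing 1 and 2 NVIDIA GPUs respectively,
  # with Docker and the NVIDIA Container Toolkit available for --gpus=all.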
  test-1gpu-runner:
    runs-on: g6-1gpu-runner
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Job Info
        run: |
          echo "=== 1-GPU Runner Test ==="
          echo "Runner: $(hostname)"
          echo "Timestamp: $(date -u)"

      - name: Check GPU Info
        run: |
          echo "=== GPU Information ==="
          nvidia-smi --query-gpu=index,name,memory.total,memory.used --format=csv

      - name: Check Node Resources
        run: |
          echo "=== Node Information ==="
          echo "Hostname: $(hostname)"
          echo "CPU cores: $(nproc)"
          echo "Memory: $(free -h | grep Mem | awk '{print $2}')"

      - name: Run GPU Test
        run: |
          echo "NVIDIA_VISIBLE_DEVICES from K8s: ${NVIDIA_VISIBLE_DEVICES:-not set}"
          docker run --rm --gpus=all \
            nvidia/cuda:12.2.0-runtime-ubuntu22.04 \
            bash -c "
              echo '=== 1-GPU Test ==='
              echo 'NVIDIA_VISIBLE_DEVICES: \$NVIDIA_VISIBLE_DEVICES'
              echo 'CUDA_VISIBLE_DEVICES: \$CUDA_VISIBLE_DEVICES'
              if command -v nvidia-smi &> /dev/null; then
                nvidia-smi
              else
                echo 'nvidia-smi not available, testing CUDA runtime...'
                echo 'GPU access: OK'
              fi
              echo ''
              echo '✅ 1-GPU test completed'
            "
  test-2gpu-runner:
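    # Same checks as test-1gpu-runner, run on the 2-GPU runner label.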
    runs-on: g6-2gpu-runner
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Job Info
        run: |
          echo "=== 2-GPU Runner Test ==="
          echo "Runner: $(hostname)"
          echo "Timestamp: $(date -u)"

      - name: Check GPU Info
        run: |
          echo "=== GPU Information ==="
          nvidia-smi --query-gpu=index,name,memory.total,memory.used --format=csv

      - name: Check Node Resources
        run: |
          echo "=== Node Information ==="
          echo "Hostname: $(hostname)"
          echo "CPU cores: $(nproc)"
          echo "Memory: $(free -h | grep Mem | awk '{print $2}')"

      - name: Run GPU Test
        run: |
          echo "NVIDIA_VISIBLE_DEVICES from K8s: ${NVIDIA_VISIBLE_DEVICES:-not set}"
          docker run --rm --gpus=all \
            nvidia/cuda:12.2.0-runtime-ubuntu22.04 \
            bash -c "
              echo '=== 2-GPU Test ==='
              echo 'NVIDIA_VISIBLE_DEVICES: \$NVIDIA_VISIBLE_DEVICES'
              echo 'CUDA_VISIBLE_DEVICES: \$CUDA_VISIBLE_DEVICES'
              if command -v nvidia-smi &> /dev/null; then
                nvidia-smi
              else
                echo 'nvidia-smi not available, testing CUDA runtime...'
                echo 'GPU access: OK'
              fi
              echo ''
              echo '✅ 2-GPU test completed'
            "
  test-parallel-scaling:
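    # 2 runner labels x 3 job IDs = 6 concurrent jobs, intended to exercise
    # scale-up on both runner scale sets at the same time.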
    strategy:
      matrix:
        runner: [g6-1gpu-runner, g6-2gpu-runner]
        job_id: [1, 2, 3]
    runs-on: ${{ matrix.runner }}
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Job Info
        run: |
          echo "=== Parallel Scaling Test ==="
          echo "Runner: ${{ matrix.runner }}"
          echo "Job ID: ${{ matrix.job_id }}"
          echo "Hostname: $(hostname)"
          echo "Timestamp: $(date -u)"

      - name: Check GPU Allocation
        run: |
          echo "=== GPU Allocation Check ==="
          echo "NVIDIA_VISIBLE_DEVICES: ${NVIDIA_VISIBLE_DEVICES:-not set}"
          nvidia-smi --query-gpu=index,name,memory.total,memory.used --format=csv

      - name: Run Parallel GPU Test
        run: |
          echo "NVIDIA_VISIBLE_DEVICES from K8s: ${NVIDIA_VISIBLE_DEVICES:-not set}"
          echo "Testing ${{ matrix.runner }} with --gpus=all"
          docker run --rm --gpus=all \
            nvidia/cuda:12.2.0-runtime-ubuntu22.04 \
            bash -c "
              echo '=== Parallel Test Job ${{ matrix.job_id }} ==='
              echo 'Runner: ${{ matrix.runner }}'
              echo 'NVIDIA_VISIBLE_DEVICES: \$NVIDIA_VISIBLE_DEVICES'
              if command -v nvidia-smi &> /dev/null; then
                nvidia-smi -L
                echo 'GPU Memory Info:'
                nvidia-smi --query-gpu=memory.total,memory.used --format=csv,noheader,nounits
              else
                echo 'nvidia-smi not available, testing CUDA runtime...'
                echo 'GPU access: OK'
              fi
              echo ''
              echo '✅ Parallel test completed for job ${{ matrix.job_id }}'
            "
      - name: Simulate Workload
        run: |
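          # Holding each runner busy for ~30 s should make the remaining matrix
          # jobs queue, so scale-up can be observed (assumption: the scale sets
          # react to busy/queued runners).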
          echo "=== Simulating GPU Workload ==="
          sleep 30 # Simulate some work to test autoscaling behavior