Update test-g6-runners.yaml #8
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| name: Test - G6 Runner Autoscaling | |
| on: | |
| workflow_dispatch: | |
| push: | |
| paths: | |
| - ".github/workflows/test-g6-runners.yaml" | |
| jobs: | |
| test-1gpu-runner: | |
| runs-on: g6-1gpu-runner | |
| steps: | |
| - name: Checkout | |
| uses: actions/checkout@v4 | |
| - name: Job Info | |
| run: | | |
| echo "=== 1-GPU Runner Test ===" | |
| echo "Runner: $(hostname)" | |
| echo "Timestamp: $(date -u)" | |
| - name: Check GPU Info | |
| run: | | |
| echo "=== GPU Information ===" | |
| nvidia-smi --query-gpu=index,name,memory.total,memory.used --format=csv | |
| - name: Check Node Resources | |
| run: | | |
| echo "=== Node Information ===" | |
| echo "Hostname: $(hostname)" | |
| echo "CPU cores: $(nproc)" | |
| echo "Memory: $(free -h | grep Mem | awk '{print $2}')" | |
| - name: Run GPU Test | |
| run: | | |
| echo "NVIDIA_VISIBLE_DEVICES from K8s: ${NVIDIA_VISIBLE_DEVICES:-not set}" | |
| docker run --rm \ | |
| -e NVIDIA_VISIBLE_DEVICES="${NVIDIA_VISIBLE_DEVICES:-all}" \ | |
| --runtime=nvidia \ | |
| nvidia/cuda:12.2.0-runtime-ubuntu22.04 \ | |
| bash -c " | |
| echo '=== 1-GPU Test ===' | |
| echo 'NVIDIA_VISIBLE_DEVICES: \$NVIDIA_VISIBLE_DEVICES' | |
| echo 'CUDA_VISIBLE_DEVICES: \$CUDA_VISIBLE_DEVICES' | |
| if command -v nvidia-smi &> /dev/null; then | |
| nvidia-smi | |
| else | |
| echo 'nvidia-smi not available, testing CUDA runtime...' | |
| echo 'GPU access: OK' | |
| fi | |
| echo '' | |
| echo '✅ 1-GPU test completed' | |
| " | |
| test-2gpu-runner: | |
| runs-on: g6-2gpu-runner | |
| steps: | |
| - name: Checkout | |
| uses: actions/checkout@v4 | |
| - name: Job Info | |
| run: | | |
| echo "=== 2-GPU Runner Test ===" | |
| echo "Runner: $(hostname)" | |
| echo "Timestamp: $(date -u)" | |
| - name: Check GPU Info | |
| run: | | |
| echo "=== GPU Information ===" | |
| nvidia-smi --query-gpu=index,name,memory.total,memory.used --format=csv | |
| - name: Check Node Resources | |
| run: | | |
| echo "=== Node Information ===" | |
| echo "Hostname: $(hostname)" | |
| echo "CPU cores: $(nproc)" | |
| echo "Memory: $(free -h | grep Mem | awk '{print $2}')" | |
| - name: Run GPU Test | |
| run: | | |
| echo "NVIDIA_VISIBLE_DEVICES from K8s: ${NVIDIA_VISIBLE_DEVICES:-not set}" | |
| # Use Kubernetes GPU allocation instead of --gpus=all | |
| docker run --rm \ | |
| -e NVIDIA_VISIBLE_DEVICES="${NVIDIA_VISIBLE_DEVICES:-all}" \ | |
| --runtime=nvidia \ | |
| nvidia/cuda:12.2.0-runtime-ubuntu22.04 \ | |
| bash -c " | |
| echo '=== 2-GPU Test ===' | |
| echo 'NVIDIA_VISIBLE_DEVICES: \$NVIDIA_VISIBLE_DEVICES' | |
| echo 'CUDA_VISIBLE_DEVICES: \$CUDA_VISIBLE_DEVICES' | |
| if command -v nvidia-smi &> /dev/null; then | |
| nvidia-smi | |
| else | |
| echo 'nvidia-smi not available, testing CUDA runtime...' | |
| echo 'GPU access: OK' | |
| fi | |
| echo '' | |
| echo '✅ 2-GPU test completed' | |
| " | |
| test-parallel-scaling: | |
| strategy: | |
| matrix: | |
| runner: [g6-1gpu-runner, g6-2gpu-runner] | |
| job_id: [1, 2, 3] | |
| runs-on: ${{ matrix.runner }} | |
| steps: | |
| - name: Checkout | |
| uses: actions/checkout@v4 | |
| - name: Job Info | |
| run: | | |
| echo "=== Parallel Scaling Test ===" | |
| echo "Runner: ${{ matrix.runner }}" | |
| echo "Job ID: ${{ matrix.job_id }}" | |
| echo "Hostname: $(hostname)" | |
| echo "Timestamp: $(date -u)" | |
| - name: Check GPU Allocation | |
| run: | | |
| echo "=== GPU Allocation Check ===" | |
| echo "NVIDIA_VISIBLE_DEVICES: ${NVIDIA_VISIBLE_DEVICES:-not set}" | |
| nvidia-smi --query-gpu=index,name,memory.total,memory.used --format=csv | |
| - name: Run Parallel GPU Test | |
| run: | | |
| echo "NVIDIA_VISIBLE_DEVICES from K8s: ${NVIDIA_VISIBLE_DEVICES:-not set}" | |
| echo "Testing ${{ matrix.runner }} with GPU isolation" | |
| docker run --rm \ | |
| -e NVIDIA_VISIBLE_DEVICES="${NVIDIA_VISIBLE_DEVICES:-all}" \ | |
| --runtime=nvidia \ | |
| nvidia/cuda:12.2.0-runtime-ubuntu22.04 \ | |
| bash -c " | |
| echo '=== Parallel Test Job ${{ matrix.job_id }} ===' | |
| echo 'Runner: ${{ matrix.runner }}' | |
| echo 'NVIDIA_VISIBLE_DEVICES: \$NVIDIA_VISIBLE_DEVICES' | |
| if command -v nvidia-smi &> /dev/null; then | |
| nvidia-smi -L | |
| echo 'GPU Memory Info:' | |
| nvidia-smi --query-gpu=memory.total,memory.used --format=csv,noheader,nounits | |
| else | |
| echo 'nvidia-smi not available, testing CUDA runtime...' | |
| echo 'GPU access: OK' | |
| fi | |
| echo '' | |
| echo '✅ Parallel test completed for job ${{ matrix.job_id }}' | |
| " | |
| - name: Simulate Workload | |
| run: | | |
| echo "=== Simulating GPU Workload ===" | |
| sleep 30 # Simulate some work to test autoscaling behavior |