Merge pull request #5470 from makr-code/hardening/replication-empty-r… #90
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| name: GPU Benchmark Matrix (CUDA/HIP/Vulkan) | |
| # GPU Benchmark Matrix Runner — M05 implementation (§1.4 PERFORMANCE_EXPECTATIONS.md) | |
| # | |
| # Establishes dedicated runner profiles for CUDA, HIP, and Vulkan benchmarks | |
| # so that GPU-gated benchmarks produce real measurements instead of disabled-stub | |
| # placeholders. | |
| # | |
| # Runner profiles: | |
| # gpu-cuda — self-hosted, Ubuntu 22.04, CUDA 12.x, NVIDIA Ampere/Ada/Hopper | |
| # gpu-hip — self-hosted, Ubuntu 22.04, ROCm 6.x, AMD RDNA3/CDNA2 | |
| # gpu-vulkan — self-hosted, Ubuntu 22.04, Vulkan 1.3, any Vulkan-capable GPU | |
| # | |
| # Fallback: | |
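| # The three GPU jobs require self-hosted runners. When no runner with the matching | |
| # label is registered, GitHub does not skip them: they stay queued until a runner | |
| # comes online or GitHub's queue-time limit ends them. They are reported as | |
| # "skipped" only when their `if:` condition is false (e.g. a push to `develop`). | |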
| # The `gpu-bench-cpu-fallback` job always executes on ubuntu-latest, compiles | |
| # all GPU-gated targets with a CPU-only flag, and verifies that the disabled-stub | |
| # path compiles cleanly. | |
| # | |
| # Artifacts: | |
| # • <backend>-benchmark-results/ — JSON benchmark output per backend | |
| # • gpu-benchmark-pipeline-report/ — consolidated report (runner info + timings) | |
| # | |
| # Priority benchmark list (descending): | |
| # 1. bench_fused_kernels (CUDA/HIP — fused LoRA kernel pipeline) | |
| # 2. bench_gpu_backends (CUDA/HIP/Vulkan — distance computation) | |
| # 3. bench_gpu_training_cycle (CUDA/HIP — full training cycle) | |
| # 4. bench_fused_lora_kernels (CUDA/HIP — forward/backward LoRA pass) | |
| # 5. bench_vulkan_lora (Vulkan — Vulkan LoRA pipeline) | |
| # 6. bench_backend_comparison (CUDA/HIP — CUDA vs HIP head-to-head) | |
| # 7. bench_cuda_vs_cpu (CUDA — CPU baseline vs CUDA speedup) | |
| # 8. bench_multi_gpu_scaling (CUDA/HIP — multi-GPU shard scaling) | |
| # | |
| # See: docs/ci-cd/gpu-benchmark-matrix-runner.md | |
| # Triggers: path-filtered pushes, a weekly schedule, and manual dispatch. | |
| on: | |
| # Pushes to main/develop start the workflow only when they touch one of the GPU | |
| # benchmark sources, the acceleration / LoRA framework trees, or this workflow file. | |
| # The three GPU jobs further restrict push events to refs/heads/main in their `if:`. | |
| push: | |
| branches: | |
| - main | |
| - develop | |
| paths: | |
| - 'benchmarks/bench_fused_kernels.cpp' | |
| - 'benchmarks/bench_fused_lora_kernels.cpp' | |
| - 'benchmarks/bench_gpu_backends.cpp' | |
| - 'benchmarks/bench_gpu_training_cycle.cpp' | |
| - 'benchmarks/bench_vulkan_lora.cpp' | |
| - 'benchmarks/bench_backend_comparison.cpp' | |
| - 'benchmarks/bench_cuda_vs_cpu.cpp' | |
| - 'benchmarks/bench_multi_gpu_scaling.cpp' | |
| - 'src/acceleration/**' | |
| - 'include/acceleration/**' | |
| - 'src/llm/lora_framework/**' | |
| - 'include/llm/lora_framework/**' | |
| - '.github/workflows/06-infrastructure_gpu_gpu-benchmark-matrix-ci.yml' | |
| schedule: | |
| - cron: '0 3 * * 6' # Weekly on Saturday 03:00 UTC | |
| workflow_dispatch: | |
| inputs: | |
| # NOTE(review): no job or step visible in this file reads `backend_filter`, so | |
| # choosing a backend here currently appears to have no effect. Confirm, then wire it | |
| # into the GPU jobs' `if:` conditions or drop the input. | |
| backend_filter: | |
| description: 'Backend filter (cuda|hip|vulkan|all)' | |
| required: false | |
| default: 'all' | |
| # Exported as BENCH_FILTER by the CUDA and HIP jobs; when left empty those jobs | |
| # fall back to their own built-in BM_* regex. | |
| benchmark_filter: | |
| description: 'Google Benchmark --benchmark_filter regex' | |
| required: false | |
| default: '' | |
| concurrency: | |
| group: ${{ github.workflow }}-${{ github.ref }} | |
| cancel-in-progress: false # Let GPU benchmarks complete; don't cancel mid-run | |
| # ───────────────────────────────────────────────────────────────────────────── | |
| # Job 1 (CUDA): self-hosted runner with NVIDIA GPU | |
| # Satisfies M05 audit check: runs-on contains "gpu-cuda" AND THEMIS_ENABLE_CUDA=ON | |
| # ───────────────────────────────────────────────────────────────────────────── | |
| jobs: | |
| gpu-bench-cuda: | |
| name: GPU Benchmarks (CUDA, ${{ matrix.cuda_arch }}) | |
| runs-on: [self-hosted, gpu-cuda] | |
| if: > | |
| github.event_name == 'workflow_dispatch' || | |
| github.event_name == 'schedule' || | |
| (github.event_name == 'push' && github.ref == 'refs/heads/main') | |
| continue-on-error: true # Non-blocking: GPU runner may not be registered | |
| permissions: | |
| contents: read | |
| strategy: | |
| fail-fast: false | |
| matrix: | |
| include: | |
| - cuda_arch: sm_80 | |
| description: 'NVIDIA Ampere (A100, A10)' | |
| - cuda_arch: sm_89 | |
| description: 'NVIDIA Ada (RTX 4090, L4)' | |
| - cuda_arch: sm_90 | |
| description: 'NVIDIA Hopper (H100)' | |
| env: | |
| BENCH_OUT_DIR: artifacts/gpu-cuda-${{ matrix.cuda_arch }} | |
| BENCH_FILTER: ${{ github.event.inputs.benchmark_filter || 'BM_CUDA|BM_Fused|BM_GPU|BM_Backend' }} | |
| steps: | |
| - name: Checkout repository | |
| uses: actions/checkout@v4 | |
| with: | |
| fetch-depth: 0 | |
| - name: Record runner GPU info | |
| run: | | |
| mkdir -p "$BENCH_OUT_DIR" | |
| echo "=== nvidia-smi ===" | tee "$BENCH_OUT_DIR/runner_info.txt" | |
| nvidia-smi | tee -a "$BENCH_OUT_DIR/runner_info.txt" || true | |
| echo "=== nvcc --version ===" | tee -a "$BENCH_OUT_DIR/runner_info.txt" | |
| nvcc --version | tee -a "$BENCH_OUT_DIR/runner_info.txt" || true | |
| echo "=== uname -a ===" | tee -a "$BENCH_OUT_DIR/runner_info.txt" | |
| uname -a | tee -a "$BENCH_OUT_DIR/runner_info.txt" | |
| - name: Install system dependencies | |
| run: | | |
| sudo apt-get update -qq | |
| sudo apt-get install -y --no-install-recommends \ | |
| cmake ninja-build g++ \ | |
| libgtest-dev libbenchmark-dev \ | |
| pkg-config \ | |
| librocksdb-dev \ | |
| libssl-dev \ | |
| zlib1g-dev \ | |
| libzstd-dev \ | |
| liblz4-dev \ | |
| libfmt-dev \ | |
| libspdlog-dev \ | |
| nlohmann-json3-dev \ | |
| libmimalloc-dev || true | |
| - name: Install Google Benchmark (build from source if needed) | |
| run: | | |
| if ! pkg-config --exists benchmark 2>/dev/null; then | |
| git clone --depth 1 --branch v1.8.3 \ | |
| https://github.com/google/benchmark.git /tmp/benchmark | |
| cmake -S /tmp/benchmark -B /tmp/benchmark/build \ | |
| -DCMAKE_BUILD_TYPE=Release \ | |
| -DBENCHMARK_ENABLE_TESTING=OFF | |
| cmake --build /tmp/benchmark/build --parallel $(nproc) | |
| sudo cmake --install /tmp/benchmark/build --prefix /usr/local | |
| fi | |
| - name: Configure CMake (CUDA, ${{ matrix.cuda_arch }}) | |
| env: | |
| MATRIX_CUDA_ARCH: ${{ matrix.cuda_arch }} | |
| run: | | |
| # CMAKE_CUDA_ARCHITECTURES expects bare compute-capability numbers (e.g. "80"), | |
| # not nvcc-style "sm_80" names, so strip the "sm_" prefix for that variable only. | |
| # CUDA_ARCH_LIST is a project-level option and is passed through unchanged. | |
| CUDA_ARCH_NUM="${MATRIX_CUDA_ARCH#sm_}" | |
| cmake -B build_gpu_cuda -G Ninja \ | |
| -DCMAKE_BUILD_TYPE=Release \ | |
| -DTHEMIS_BUILD_BENCHMARKS=ON \ | |
| -DTHEMIS_BUILD_TESTS=OFF \ | |
| -DTHEMIS_ENABLE_CUDA=ON \ | |
| -DTHEMIS_ENABLE_GPU=ON \ | |
| -DTHEMIS_ENABLE_LLM=ON \ | |
| -DTHEMIS_ENABLE_TRACING=OFF \ | |
| -DTHEMIS_ENABLE_GRPC=OFF \ | |
| -DCUDA_ARCH_LIST="${{ matrix.cuda_arch }}" \ | |
| -DCMAKE_CUDA_ARCHITECTURES="$CUDA_ARCH_NUM" \ | |
| -S cmake | |
| - name: Build priority GPU benchmarks (CUDA) | |
| run: | | |
| cmake --build build_gpu_cuda --parallel $(nproc) \ | |
| --target bench_fused_kernels \ | |
| --target bench_gpu_backends \ | |
| --target bench_gpu_training_cycle \ | |
| --target bench_fused_lora_kernels \ | |
| --target bench_backend_comparison \ | |
| --target bench_cuda_vs_cpu \ | |
| --target bench_multi_gpu_scaling \ | |
| --target bench_lora_gpu \ | |
| 2>&1 | tee "$BENCH_OUT_DIR/build.log" || true | |
| - name: Run priority GPU benchmarks (CUDA) | |
| run: | | |
| # Best-effort: keep going when one binary fails so the remaining ones still run. | |
| set +e | |
| mkdir -p "$BENCH_OUT_DIR" | |
| BENCH_BINARIES=( | |
| bench_fused_kernels | |
| bench_gpu_backends | |
| bench_gpu_training_cycle | |
| bench_fused_lora_kernels | |
| bench_backend_comparison | |
| bench_cuda_vs_cpu | |
| bench_multi_gpu_scaling | |
| bench_lora_gpu | |
| ) | |
| for bin in "${BENCH_BINARIES[@]}"; do | |
| if [ -x "build_gpu_cuda/$bin" ]; then | |
| echo "▶ Running $bin (CUDA ${{ matrix.cuda_arch }}) …" | |
| # The output is piped through tee, so $? would be tee's status; | |
| # PIPESTATUS[0] holds the benchmark binary's own exit code. | |
| build_gpu_cuda/$bin \ | |
| --benchmark_format=json \ | |
| --benchmark_out="$BENCH_OUT_DIR/${bin}.json" \ | |
| --benchmark_min_time=0.5s \ | |
| ${BENCH_FILTER:+--benchmark_filter="$BENCH_FILTER"} \ | |
| 2>&1 | tee "$BENCH_OUT_DIR/${bin}.run.log" | |
| echo " exit=${PIPESTATUS[0]}" | |
| else | |
| echo " ⚠️ $bin not built — skipping" | |
| fi | |
| done | |
| echo "✅ CUDA benchmark run complete (${{ matrix.cuda_arch }})" | |
| - name: Generate benchmark summary | |
| if: always() | |
| run: | | |
| python3 - <<'PYEOF' | |
| import json, os, glob | |
| from datetime import datetime, timezone | |
| out_dir = os.environ.get("BENCH_OUT_DIR", "artifacts/gpu-cuda") | |
| results = {} | |
| for jf in sorted(glob.glob(f"{out_dir}/*.json")): | |
| try: | |
| with open(jf) as f: | |
| data = json.load(f) | |
| bench_name = os.path.splitext(os.path.basename(jf))[0] | |
| benchmarks = data.get("benchmarks", []) | |
| skipped = [b for b in benchmarks if b.get("skipped")] | |
| real = [b for b in benchmarks if not b.get("skipped")] | |
| results[bench_name] = { | |
| "total": len(benchmarks), | |
| "real": len(real), | |
| "skipped": len(skipped), | |
| } | |
| except Exception as e: | |
| results[os.path.basename(jf)] = {"error": str(e)} | |
| summary = { | |
| "backend": "cuda", | |
| "arch": os.environ.get("MATRIX_CUDA_ARCH", "unknown"), | |
| "timestamp": datetime.now(timezone.utc).isoformat(), | |
| "commit": os.environ.get("GITHUB_SHA", "unknown"), | |
| "benchmarks": results, | |
| } | |
| with open(f"{out_dir}/summary.json", "w") as f: | |
| json.dump(summary, f, indent=2) | |
| print(json.dumps(summary, indent=2)) | |
| PYEOF | |
| env: | |
| MATRIX_CUDA_ARCH: ${{ matrix.cuda_arch }} | |
| - name: Upload CUDA benchmark artifacts | |
| if: always() | |
| uses: actions/upload-artifact@v4 | |
| with: | |
| name: gpu-cuda-${{ matrix.cuda_arch }}-benchmark-results | |
| path: ${{ env.BENCH_OUT_DIR }}/ | |
| retention-days: 30 | |
| - name: Write job summary (CUDA) | |
| if: always() | |
| run: | | |
| echo "## 🚀 GPU Benchmark Matrix — CUDA (${{ matrix.cuda_arch }})" >> "$GITHUB_STEP_SUMMARY" | |
| echo "" >> "$GITHUB_STEP_SUMMARY" | |
| echo "| Parameter | Value |" >> "$GITHUB_STEP_SUMMARY" | |
| echo "|-----------|-------|" >> "$GITHUB_STEP_SUMMARY" | |
| echo "| **Backend** | CUDA |" >> "$GITHUB_STEP_SUMMARY" | |
| echo "| **Architecture** | \`${{ matrix.cuda_arch }}\` (${{ matrix.description }}) |" >> "$GITHUB_STEP_SUMMARY" | |
| echo "| **Event** | \`${{ github.event_name }}\` |" >> "$GITHUB_STEP_SUMMARY" | |
| echo "| **Branch** | \`${{ github.ref_name }}\` |" >> "$GITHUB_STEP_SUMMARY" | |
| echo "| **Commit** | \`${{ github.sha }}\` |" >> "$GITHUB_STEP_SUMMARY" | |
| echo "| **Runner** | self-hosted gpu-cuda |" >> "$GITHUB_STEP_SUMMARY" | |
| if [ -f "$BENCH_OUT_DIR/summary.json" ]; then | |
| echo "" >> "$GITHUB_STEP_SUMMARY" | |
| echo "### Benchmark Results" >> "$GITHUB_STEP_SUMMARY" | |
| echo "\`\`\`json" >> "$GITHUB_STEP_SUMMARY" | |
| cat "$BENCH_OUT_DIR/summary.json" >> "$GITHUB_STEP_SUMMARY" | |
| echo "\`\`\`" >> "$GITHUB_STEP_SUMMARY" | |
| fi | |
| # ───────────────────────────────────────────────────────────────────────────── | |
| # Job 2 (HIP): self-hosted runner with AMD GPU (ROCm) | |
| # ───────────────────────────────────────────────────────────────────────────── | |
| gpu-bench-hip: | |
| name: GPU Benchmarks (HIP/ROCm, ${{ matrix.rocm_arch }}) | |
| runs-on: [self-hosted, gpu-hip] | |
| if: > | |
| github.event_name == 'workflow_dispatch' || | |
| github.event_name == 'schedule' || | |
| (github.event_name == 'push' && github.ref == 'refs/heads/main') | |
| continue-on-error: true # Non-blocking: GPU runner may not be registered | |
| permissions: | |
| contents: read | |
| strategy: | |
| fail-fast: false | |
| matrix: | |
| include: | |
| - rocm_arch: gfx1100 | |
| description: 'AMD RDNA3 (RX 7900 XTX)' | |
| - rocm_arch: gfx90a | |
| description: 'AMD CDNA2 (MI250X)' | |
| env: | |
| BENCH_OUT_DIR: artifacts/gpu-hip-${{ matrix.rocm_arch }} | |
| BENCH_FILTER: ${{ github.event.inputs.benchmark_filter || 'BM_HIP|BM_GPU|BM_Fused' }} | |
| steps: | |
| - name: Checkout repository | |
| uses: actions/checkout@v4 | |
| with: | |
| fetch-depth: 0 | |
| - name: Record runner GPU info | |
| run: | | |
| mkdir -p "$BENCH_OUT_DIR" | |
| echo "=== rocm-smi ===" | tee "$BENCH_OUT_DIR/runner_info.txt" | |
| rocm-smi | tee -a "$BENCH_OUT_DIR/runner_info.txt" || true | |
| echo "=== hipcc --version ===" | tee -a "$BENCH_OUT_DIR/runner_info.txt" | |
| hipcc --version | tee -a "$BENCH_OUT_DIR/runner_info.txt" || true | |
| echo "=== uname -a ===" | tee -a "$BENCH_OUT_DIR/runner_info.txt" | |
| uname -a | tee -a "$BENCH_OUT_DIR/runner_info.txt" | |
| - name: Install system dependencies | |
| run: | | |
| sudo apt-get update -qq | |
| sudo apt-get install -y --no-install-recommends \ | |
| cmake ninja-build g++ \ | |
| libbenchmark-dev \ | |
| pkg-config \ | |
| librocksdb-dev \ | |
| libssl-dev \ | |
| libfmt-dev \ | |
| libspdlog-dev \ | |
| nlohmann-json3-dev \ | |
| libmimalloc-dev || true | |
| - name: Configure CMake (HIP, ${{ matrix.rocm_arch }}) | |
| run: | | |
| cmake -B build_gpu_hip -G Ninja \ | |
| -DCMAKE_BUILD_TYPE=Release \ | |
| -DTHEMIS_BUILD_BENCHMARKS=ON \ | |
| -DTHEMIS_BUILD_TESTS=OFF \ | |
| -DTHEMIS_ENABLE_HIP=ON \ | |
| -DTHEMIS_ENABLE_GPU=ON \ | |
| -DTHEMIS_ENABLE_LLM=ON \ | |
| -DTHEMIS_ENABLE_TRACING=OFF \ | |
| -DTHEMIS_ENABLE_GRPC=OFF \ | |
| -DAMDGPU_TARGETS="${{ matrix.rocm_arch }}" \ | |
| -S cmake | |
| - name: Build priority GPU benchmarks (HIP) | |
| run: | | |
| cmake --build build_gpu_hip --parallel $(nproc) \ | |
| --target bench_fused_kernels \ | |
| --target bench_gpu_backends \ | |
| --target bench_gpu_training_cycle \ | |
| --target bench_fused_lora_kernels \ | |
| --target bench_multi_gpu_scaling \ | |
| --target bench_lora_gpu \ | |
| 2>&1 | tee "$BENCH_OUT_DIR/build.log" || true | |
| - name: Run priority GPU benchmarks (HIP) | |
| run: | | |
| # Best-effort: keep going when one binary fails so the remaining ones still run. | |
| set +e | |
| mkdir -p "$BENCH_OUT_DIR" | |
| BENCH_BINARIES=( | |
| bench_fused_kernels | |
| bench_gpu_backends | |
| bench_gpu_training_cycle | |
| bench_fused_lora_kernels | |
| bench_multi_gpu_scaling | |
| bench_lora_gpu | |
| ) | |
| for bin in "${BENCH_BINARIES[@]}"; do | |
| if [ -x "build_gpu_hip/$bin" ]; then | |
| echo "▶ Running $bin (HIP ${{ matrix.rocm_arch }}) …" | |
| # The output is piped through tee, so report the benchmark's own exit code | |
| # via PIPESTATUS[0] (same reporting as the CUDA job). | |
| build_gpu_hip/$bin \ | |
| --benchmark_format=json \ | |
| --benchmark_out="$BENCH_OUT_DIR/${bin}.json" \ | |
| --benchmark_min_time=0.5s \ | |
| ${BENCH_FILTER:+--benchmark_filter="$BENCH_FILTER"} \ | |
| 2>&1 | tee "$BENCH_OUT_DIR/${bin}.run.log" | |
| echo " exit=${PIPESTATUS[0]}" | |
| else | |
| echo " ⚠️ $bin not built — skipping" | |
| fi | |
| done | |
| echo "✅ HIP benchmark run complete (${{ matrix.rocm_arch }})" | |
| - name: Upload HIP benchmark artifacts | |
| if: always() | |
| uses: actions/upload-artifact@v4 | |
| with: | |
| name: gpu-hip-${{ matrix.rocm_arch }}-benchmark-results | |
| path: ${{ env.BENCH_OUT_DIR }}/ | |
| retention-days: 30 | |
| - name: Write job summary (HIP) | |
| if: always() | |
| run: | | |
| echo "## 🚀 GPU Benchmark Matrix — HIP (${{ matrix.rocm_arch }})" >> "$GITHUB_STEP_SUMMARY" | |
| echo "" >> "$GITHUB_STEP_SUMMARY" | |
| echo "| Parameter | Value |" >> "$GITHUB_STEP_SUMMARY" | |
| echo "|-----------|-------|" >> "$GITHUB_STEP_SUMMARY" | |
| echo "| **Backend** | HIP/ROCm |" >> "$GITHUB_STEP_SUMMARY" | |
| echo "| **Architecture** | \`${{ matrix.rocm_arch }}\` (${{ matrix.description }}) |" >> "$GITHUB_STEP_SUMMARY" | |
| echo "| **Event** | \`${{ github.event_name }}\` |" >> "$GITHUB_STEP_SUMMARY" | |
| echo "| **Branch** | \`${{ github.ref_name }}\` |" >> "$GITHUB_STEP_SUMMARY" | |
| echo "| **Commit** | \`${{ github.sha }}\` |" >> "$GITHUB_STEP_SUMMARY" | |
| echo "| **Runner** | self-hosted gpu-hip |" >> "$GITHUB_STEP_SUMMARY" | |
| # ───────────────────────────────────────────────────────────────────────────── | |
| # Job 3 (Vulkan): self-hosted runner with Vulkan-capable GPU | |
| # ───────────────────────────────────────────────────────────────────────────── | |
| gpu-bench-vulkan: | |
| name: GPU Benchmarks (Vulkan) | |
| runs-on: [self-hosted, gpu-vulkan] | |
| if: > | |
| github.event_name == 'workflow_dispatch' || | |
| github.event_name == 'schedule' || | |
| (github.event_name == 'push' && github.ref == 'refs/heads/main') | |
| continue-on-error: true # Non-blocking: GPU runner may not be registered | |
| permissions: | |
| contents: read | |
| env: | |
| BENCH_OUT_DIR: artifacts/gpu-vulkan | |
| steps: | |
| - name: Checkout repository | |
| uses: actions/checkout@v4 | |
| with: | |
| fetch-depth: 0 | |
| - name: Record runner GPU / Vulkan info | |
| run: | | |
| mkdir -p "$BENCH_OUT_DIR" | |
| echo "=== vulkaninfo --summary ===" | tee "$BENCH_OUT_DIR/runner_info.txt" | |
| vulkaninfo --summary 2>/dev/null | tee -a "$BENCH_OUT_DIR/runner_info.txt" || true | |
| echo "=== uname -a ===" | tee -a "$BENCH_OUT_DIR/runner_info.txt" | |
| uname -a | tee -a "$BENCH_OUT_DIR/runner_info.txt" | |
| - name: Install system dependencies | |
| run: | | |
| sudo apt-get update -qq | |
| sudo apt-get install -y --no-install-recommends \ | |
| cmake ninja-build g++ \ | |
| libbenchmark-dev \ | |
| pkg-config \ | |
| librocksdb-dev \ | |
| libssl-dev \ | |
| libfmt-dev \ | |
| libspdlog-dev \ | |
| nlohmann-json3-dev \ | |
| libvulkan-dev \ | |
| vulkan-tools \ | |
| spirv-tools \ | |
| libmimalloc-dev || true | |
| - name: Configure CMake (Vulkan) | |
| run: | | |
| cmake -B build_gpu_vulkan -G Ninja \ | |
| -DCMAKE_BUILD_TYPE=Release \ | |
| -DTHEMIS_BUILD_BENCHMARKS=ON \ | |
| -DTHEMIS_BUILD_TESTS=OFF \ | |
| -DTHEMIS_ENABLE_VULKAN=ON \ | |
| -DTHEMIS_ENABLE_GPU=ON \ | |
| -DTHEMIS_ENABLE_LLM=ON \ | |
| -DTHEMIS_ENABLE_TRACING=OFF \ | |
| -DTHEMIS_ENABLE_GRPC=OFF \ | |
| -S cmake | |
| - name: Build priority GPU benchmarks (Vulkan) | |
| run: | | |
| cmake --build build_gpu_vulkan --parallel $(nproc) \ | |
| --target bench_vulkan_lora \ | |
| --target bench_gpu_backends \ | |
| --target bench_lora_gpu \ | |
| 2>&1 | tee "$BENCH_OUT_DIR/build.log" || true | |
| - name: Run priority GPU benchmarks (Vulkan) | |
| env: | |
| # Honour the workflow_dispatch `benchmark_filter` input, as the CUDA and HIP jobs do. | |
| # It is empty on push/schedule events, in which case no filter flag is passed. | |
| BENCH_FILTER: ${{ github.event.inputs.benchmark_filter }} | |
| run: | | |
| # Best-effort: keep going when one binary fails so the remaining ones still run. | |
| set +e | |
| mkdir -p "$BENCH_OUT_DIR" | |
| BENCH_BINARIES=( | |
| bench_vulkan_lora | |
| bench_gpu_backends | |
| bench_lora_gpu | |
| ) | |
| for bin in "${BENCH_BINARIES[@]}"; do | |
| if [ -x "build_gpu_vulkan/$bin" ]; then | |
| echo "▶ Running $bin (Vulkan) …" | |
| # The output is piped through tee, so report the benchmark's own exit code | |
| # via PIPESTATUS[0] rather than $?. | |
| build_gpu_vulkan/$bin \ | |
| --benchmark_format=json \ | |
| --benchmark_out="$BENCH_OUT_DIR/${bin}.json" \ | |
| --benchmark_min_time=0.5s \ | |
| ${BENCH_FILTER:+--benchmark_filter="$BENCH_FILTER"} \ | |
| 2>&1 | tee "$BENCH_OUT_DIR/${bin}.run.log" | |
| echo " exit=${PIPESTATUS[0]}" | |
| else | |
| echo " ⚠️ $bin not built — skipping" | |
| fi | |
| done | |
| echo "✅ Vulkan benchmark run complete" | |
| - name: Upload Vulkan benchmark artifacts | |
| if: always() | |
| uses: actions/upload-artifact@v4 | |
| with: | |
| name: gpu-vulkan-benchmark-results | |
| path: ${{ env.BENCH_OUT_DIR }}/ | |
| retention-days: 30 | |
| - name: Write job summary (Vulkan) | |
| if: always() | |
| run: | | |
| echo "## 🚀 GPU Benchmark Matrix — Vulkan" >> "$GITHUB_STEP_SUMMARY" | |
| echo "" >> "$GITHUB_STEP_SUMMARY" | |
| echo "| Parameter | Value |" >> "$GITHUB_STEP_SUMMARY" | |
| echo "|-----------|-------|" >> "$GITHUB_STEP_SUMMARY" | |
| echo "| **Backend** | Vulkan |" >> "$GITHUB_STEP_SUMMARY" | |
| echo "| **Event** | \`${{ github.event_name }}\` |" >> "$GITHUB_STEP_SUMMARY" | |
| echo "| **Branch** | \`${{ github.ref_name }}\` |" >> "$GITHUB_STEP_SUMMARY" | |
| echo "| **Commit** | \`${{ github.sha }}\` |" >> "$GITHUB_STEP_SUMMARY" | |
| echo "| **Runner** | self-hosted gpu-vulkan |" >> "$GITHUB_STEP_SUMMARY" | |
| # ───────────────────────────────────────────────────────────────────────────── | |
| # Job 4 (CPU fallback): always runs on ubuntu-latest | |
| # Compiles all GPU-gated benchmarks in CPU-only mode to verify that: | |
| # a) the disabled-stub paths compile cleanly, and | |
| # b) cmake targets are registered correctly. | |
| # Also validates the disabled-stub policy (Deadline + Issue tag present). | |
| # ───────────────────────────────────────────────────────────────────────────── | |
| gpu-bench-cpu-fallback: | |
| name: GPU Benchmarks — CPU Fallback Compile Check | |
| runs-on: ubuntu-latest | |
| permissions: | |
| contents: read | |
| env: | |
| BENCH_OUT_DIR: artifacts/gpu-cpu-fallback | |
| steps: | |
| - name: Checkout repository | |
| uses: actions/checkout@v4 | |
| with: | |
| fetch-depth: 0 | |
| - name: Install system dependencies | |
| run: | | |
| sudo apt-get update -qq | |
| sudo apt-get install -y --no-install-recommends \ | |
| cmake ninja-build g++ \ | |
| libbenchmark-dev \ | |
| pkg-config \ | |
| librocksdb-dev \ | |
| libssl-dev \ | |
| zlib1g-dev \ | |
| libzstd-dev \ | |
| liblz4-dev \ | |
| libfmt-dev \ | |
| libspdlog-dev \ | |
| nlohmann-json3-dev \ | |
| libmimalloc-dev | |
| - name: Install Google Benchmark (build from source if needed) | |
| run: | | |
| if ! pkg-config --exists benchmark 2>/dev/null; then | |
| git clone --depth 1 --branch v1.8.3 \ | |
| https://github.com/google/benchmark.git /tmp/benchmark | |
| cmake -S /tmp/benchmark -B /tmp/benchmark/build \ | |
| -DCMAKE_BUILD_TYPE=Release \ | |
| -DBENCHMARK_ENABLE_TESTING=OFF | |
| cmake --build /tmp/benchmark/build --parallel $(nproc) | |
| sudo cmake --install /tmp/benchmark/build --prefix /usr/local | |
| fi | |
| - name: Configure CMake (CPU-only, all GPU-gated benchmarks) | |
| run: | | |
| cmake -B build_cpu_fallback -G Ninja \ | |
| -DCMAKE_BUILD_TYPE=Release \ | |
| -DTHEMIS_BUILD_BENCHMARKS=ON \ | |
| -DTHEMIS_BUILD_TESTS=OFF \ | |
| -DTHEMIS_ENABLE_GPU=OFF \ | |
| -DTHEMIS_ENABLE_CUDA=OFF \ | |
| -DTHEMIS_ENABLE_HIP=OFF \ | |
| -DTHEMIS_ENABLE_VULKAN=OFF \ | |
| -DTHEMIS_ENABLE_LLM=OFF \ | |
| -DTHEMIS_ENABLE_TRACING=OFF \ | |
| -DTHEMIS_ENABLE_GRPC=OFF \ | |
| -S cmake | |
| - name: Build GPU-gated benchmarks (CPU stub path) | |
| run: | | |
| # This step has no explicit `shell:`, so it runs under `bash -e` without pipefail | |
| # and the pipeline below would return tee's status. Enable pipefail so a failed | |
| # build fails this step; this job is the blocking gate for the workflow. | |
| set -o pipefail | |
| mkdir -p "$BENCH_OUT_DIR" | |
| cmake --build build_cpu_fallback --parallel $(nproc) \ | |
| 2>&1 | tee "$BENCH_OUT_DIR/build.log" | |
| echo "✅ CPU-stub build complete" | |
| - name: Verify disabled-stub policy compliance | |
| run: | | |
| echo "Checking disabled-stub policy in GPU benchmark files…" | |
| EXIT=0 | |
| for f in \ | |
| benchmarks/bench_fused_kernels.cpp \ | |
| benchmarks/bench_fused_lora_kernels.cpp \ | |
| benchmarks/bench_gpu_training_cycle.cpp \ | |
| benchmarks/bench_vulkan_lora.cpp \ | |
| benchmarks/bench_lora_gpu.cpp \ | |
| benchmarks/bench_multi_gpu_scaling.cpp \ | |
| benchmarks/bench_backend_comparison.cpp \ | |
| benchmarks/bench_gpu_vector_index.cpp; do | |
| [ -f "$f" ] || continue | |
| # Each *_GPUDisabled or *_Disabled BENCHMARK registration must carry | |
| # "Deadline: " and "Issue: #" in the same file. | |
| if grep -q "BENCHMARK(BM_.*Disabled)" "$f"; then | |
| HAS_DEADLINE=$(grep -c "Deadline:" "$f" || true) | |
| HAS_ISSUE=$(grep -c "Issue: #" "$f" || true) | |
| if [ "$HAS_DEADLINE" -lt 1 ] || [ "$HAS_ISSUE" -lt 1 ]; then | |
| echo "❌ $f — missing Deadline or Issue tag for disabled stub" | |
| EXIT=1 | |
| else | |
| echo "✅ $f — stub policy compliant" | |
| fi | |
| fi | |
| done | |
| exit $EXIT | |
| - name: Run CPU-stub benchmarks (smoke test) | |
| run: | | |
| # Keep going after a failing binary so every stub is exercised; failures are | |
| # collected and turned into a step failure once the loop has finished. | |
| set +e | |
| mkdir -p "$BENCH_OUT_DIR" | |
| # Run each CPU-stub binary to verify it starts and exits cleanly. | |
| # Disabled stubs call SkipWithError so they exit 0. | |
| STUB_TARGETS=( | |
| bench_gpu_backends | |
| ) | |
| FAILED=0 | |
| for bin in "${STUB_TARGETS[@]}"; do | |
| if [ -x "build_cpu_fallback/$bin" ]; then | |
| echo "▶ Smoke-testing $bin (CPU stub) …" | |
| timeout 60 build_cpu_fallback/$bin \ | |
| --benchmark_format=json \ | |
| --benchmark_out="$BENCH_OUT_DIR/${bin}.json" \ | |
| --benchmark_min_time=0.01s \ | |
| 2>&1 | tee "$BENCH_OUT_DIR/${bin}.run.log" | |
| # PIPESTATUS[0] is the exit code of `timeout`/the binary; $? would be tee's. | |
| rc=${PIPESTATUS[0]} | |
| if [ "$rc" -ne 0 ]; then | |
| echo "❌ $bin exited with status $rc" | |
| FAILED=1 | |
| fi | |
| else | |
| # A missing binary is surfaced as a warning annotation instead of passing silently. | |
| echo "::warning::$bin was not built — smoke test skipped" | |
| fi | |
| done | |
| if [ "$FAILED" -ne 0 ]; then | |
| echo "❌ CPU-stub smoke tests failed" | |
| exit 1 | |
| fi | |
| echo "✅ CPU-stub smoke tests complete" | |
| - name: Upload CPU fallback artifacts | |
| if: always() | |
| uses: actions/upload-artifact@v4 | |
| with: | |
| name: gpu-cpu-fallback-benchmark-results | |
| path: ${{ env.BENCH_OUT_DIR }}/ | |
| retention-days: 14 | |
| - name: Write job summary (CPU fallback) | |
| if: always() | |
| run: | | |
| echo "## 🖥️ GPU Benchmark Matrix — CPU Fallback Compile Check" >> "$GITHUB_STEP_SUMMARY" | |
| echo "" >> "$GITHUB_STEP_SUMMARY" | |
| echo "| Parameter | Value |" >> "$GITHUB_STEP_SUMMARY" | |
| echo "|-----------|-------|" >> "$GITHUB_STEP_SUMMARY" | |
| echo "| **Runner** | ubuntu-latest (CPU-only fallback) |" >> "$GITHUB_STEP_SUMMARY" | |
| echo "| **GPU flags** | all OFF — disabled-stub path only |" >> "$GITHUB_STEP_SUMMARY" | |
| echo "| **Event** | \`${{ github.event_name }}\` |" >> "$GITHUB_STEP_SUMMARY" | |
| echo "| **Branch** | \`${{ github.ref_name }}\` |" >> "$GITHUB_STEP_SUMMARY" | |
| echo "| **Commit** | \`${{ github.sha }}\` |" >> "$GITHUB_STEP_SUMMARY" | |
| echo "" >> "$GITHUB_STEP_SUMMARY" | |
| echo "### Checks" >> "$GITHUB_STEP_SUMMARY" | |
| echo "- GPU-gated benchmarks compile in CPU stub mode" >> "$GITHUB_STEP_SUMMARY" | |
| echo "- Disabled-stub policy compliance (Deadline + Issue tag)" >> "$GITHUB_STEP_SUMMARY" | |
| echo "- CPU-stub smoke run (bench_gpu_backends exits 0)" >> "$GITHUB_STEP_SUMMARY" | |
| # ───────────────────────────────────────────────────────────────────────────── | |
| # Gate job: summarises all backend results | |
| # ───────────────────────────────────────────────────────────────────────────── | |
| gpu-benchmark-gate: | |
| name: GPU Benchmark Matrix Gate | |
| needs: | |
| - gpu-bench-cuda | |
| - gpu-bench-hip | |
| - gpu-bench-vulkan | |
| - gpu-bench-cpu-fallback | |
| if: always() | |
| runs-on: ubuntu-latest | |
| permissions: | |
| contents: read | |
| steps: | |
| - name: Check overall status | |
| shell: bash | |
| run: | | |
| # Results of the upstream jobs, as reported through the `needs` context. | |
| cuda="${{ needs.gpu-bench-cuda.result }}" | |
| hip="${{ needs.gpu-bench-hip.result }}" | |
| vulkan="${{ needs.gpu-bench-vulkan.result }}" | |
| fallback="${{ needs.gpu-bench-cpu-fallback.result }}" | |
| echo "CUDA job result : $cuda" | |
| echo "HIP job result : $hip" | |
| echo "Vulkan job result : $vulkan" | |
| echo "CPU fallback : $fallback" | |
| # CPU fallback is the blocking gate — GPU runners are optional. | |
| if [[ "$fallback" != "success" && "$fallback" != "skipped" ]]; then | |
| echo "❌ CPU fallback compile/stub check failed." | |
| exit 1 | |
| fi | |
| # Report the GPU jobs for information only; none of them can fail the gate. | |
| for backend in cuda hip vulkan; do | |
| case "$backend" in | |
| cuda) result=$cuda ;; | |
| hip) result=$hip ;; | |
| vulkan) result=$vulkan ;; | |
| esac | |
| if [[ "$result" == "success" ]]; then | |
| echo "✅ $backend — real measurements recorded" | |
| elif [[ "$result" == "skipped" ]]; then | |
| # A job is "skipped" when its `if:` condition is false, not when a runner is missing. | |
| echo "ℹ️ $backend — job skipped (its trigger condition was not met for this event/branch)" | |
| else | |
| echo "⚠️ $backend — job did not succeed (result: $result; non-blocking via continue-on-error)" | |
| fi | |
| done | |
| echo "✅ GPU Benchmark Matrix gate passed." | |
| - name: Write gate summary | |
| if: always() | |
| run: | | |
| echo "## 🎯 GPU Benchmark Matrix — Gate Summary" >> "$GITHUB_STEP_SUMMARY" | |
| echo "" >> "$GITHUB_STEP_SUMMARY" | |
| echo "| Backend | Result |" >> "$GITHUB_STEP_SUMMARY" | |
| echo "|---------|--------|" >> "$GITHUB_STEP_SUMMARY" | |
| echo "| CUDA (sm_80/89/90) | \`${{ needs.gpu-bench-cuda.result }}\` |" >> "$GITHUB_STEP_SUMMARY" | |
| echo "| HIP/ROCm (gfx1100/gfx90a) | \`${{ needs.gpu-bench-hip.result }}\` |" >> "$GITHUB_STEP_SUMMARY" | |
| echo "| Vulkan | \`${{ needs.gpu-bench-vulkan.result }}\` |" >> "$GITHUB_STEP_SUMMARY" | |
| echo "| CPU Fallback (blocking gate) | \`${{ needs.gpu-bench-cpu-fallback.result }}\` |" >> "$GITHUB_STEP_SUMMARY" | |
| echo "" >> "$GITHUB_STEP_SUMMARY" | |
| echo "_GPU runner jobs (CUDA/HIP/Vulkan) use_ \`continue-on-error: true\` _—_" >> "$GITHUB_STEP_SUMMARY" | |
| echo "_they are skipped when no self-hosted runner is registered._" >> "$GITHUB_STEP_SUMMARY" | |
| echo "_The CPU fallback job is the blocking gate._" >> "$GITHUB_STEP_SUMMARY" | |
| echo "" >> "$GITHUB_STEP_SUMMARY" | |
| echo "See: [docs/ci-cd/gpu-benchmark-matrix-runner.md](../../docs/ci-cd/gpu-benchmark-matrix-runner.md)" >> "$GITHUB_STEP_SUMMARY" |