Skip to content

Merge pull request #5470 from makr-code/hardening/replication-empty-r… #90

Merge pull request #5470 from makr-code/hardening/replication-empty-r…

Merge pull request #5470 from makr-code/hardening/replication-empty-r… #90

name: "GPU Benchmark Matrix (CUDA/HIP/Vulkan)"
# GPU Benchmark Matrix Runner — M05 implementation (§1.4 PERFORMANCE_EXPECTATIONS.md)
#
# Establishes dedicated runner profiles for CUDA, HIP, and Vulkan benchmarks
# so that GPU-gated benchmarks produce real measurements instead of disabled-stub
# placeholders.
#
# Runner profiles:
# gpu-cuda — self-hosted, Ubuntu 22.04, CUDA 12.x, NVIDIA Ampere/Ada/Hopper
# gpu-hip — self-hosted, Ubuntu 22.04, ROCm 6.x, AMD RDNA3/CDNA2
# gpu-vulkan — self-hosted, Ubuntu 22.04, Vulkan 1.3, any Vulkan-capable GPU
#
# Fallback:
# When no self-hosted GPU runner is registered the three GPU jobs cannot start:
# GitHub leaves them queued until its queue timeout expires. They are marked
# `continue-on-error: true`, so this does not fail the workflow.
# The `gpu-bench-cpu-fallback` job always executes on ubuntu-latest, compiles
# all GPU-gated targets with a CPU-only flag, and verifies that the disabled-stub
# path compiles cleanly.
#
# Artifacts:
# • <backend>-benchmark-results/ — JSON benchmark output per backend
# • gpu-benchmark-pipeline-report/ — consolidated report (runner info + timings)
#
# Priority benchmark list (descending):
# 1. bench_fused_kernels (CUDA/HIP — fused LoRA kernel pipeline)
# 2. bench_gpu_backends (CUDA/HIP/Vulkan — distance computation)
# 3. bench_gpu_training_cycle (CUDA/HIP — full training cycle)
# 4. bench_fused_lora_kernels (CUDA/HIP — forward/backward LoRA pass)
# 5. bench_vulkan_lora (Vulkan — Vulkan LoRA pipeline)
# 6. bench_backend_comparison (CUDA/HIP — CUDA vs HIP head-to-head)
# 7. bench_cuda_vs_cpu (CUDA — CPU baseline vs CUDA speedup)
# 8. bench_multi_gpu_scaling (CUDA/HIP — multi-GPU shard scaling)
#
# See: docs/ci-cd/gpu-benchmark-matrix-runner.md
on:
  # Pushes to main/develop that touch GPU benchmark sources, the acceleration
  # or LoRA framework code, or this workflow file itself.
  push:
    branches: [main, develop]
    paths:
      - "benchmarks/bench_fused_kernels.cpp"
      - "benchmarks/bench_fused_lora_kernels.cpp"
      - "benchmarks/bench_gpu_backends.cpp"
      - "benchmarks/bench_gpu_training_cycle.cpp"
      - "benchmarks/bench_vulkan_lora.cpp"
      - "benchmarks/bench_backend_comparison.cpp"
      - "benchmarks/bench_cuda_vs_cpu.cpp"
      - "benchmarks/bench_multi_gpu_scaling.cpp"
      - "src/acceleration/**"
      - "include/acceleration/**"
      - "src/llm/lora_framework/**"
      - "include/llm/lora_framework/**"
      - ".github/workflows/06-infrastructure_gpu_gpu-benchmark-matrix-ci.yml"
  # Weekly run: Saturday 03:00 UTC.
  schedule:
    - cron: "0 3 * * 6"
  # Manual runs with optional filters.
  workflow_dispatch:
    inputs:
      backend_filter:
        description: "Backend filter (cuda|hip|vulkan|all)"
        required: false
        default: "all"
      benchmark_filter:
        description: "Google Benchmark --benchmark_filter regex"
        required: false
        default: ""
concurrency:
  # One concurrency group per workflow + ref. An in-flight run is never
  # cancelled, so GPU benchmarks that have started are allowed to finish.
  cancel-in-progress: false
  group: "${{ github.workflow }}-${{ github.ref }}"
# ─────────────────────────────────────────────────────────────────────────────
# Job 1 (CUDA): self-hosted runner with NVIDIA GPU
# Satisfies M05 audit check: runs-on contains "gpu-cuda" AND THEMIS_ENABLE_CUDA=ON
# ─────────────────────────────────────────────────────────────────────────────
jobs:
gpu-bench-cuda:
  # Builds and runs the CUDA benchmark binaries on a self-hosted NVIDIA runner,
  # once per architecture in the matrix, and uploads JSON results + logs.
  name: GPU Benchmarks (CUDA, ${{ matrix.cuda_arch }})
  runs-on: [self-hosted, gpu-cuda]
  # Runs on the weekly schedule, on pushes to main, and on manual dispatch.
  # A manual dispatch honours the `backend_filter` input: the job runs only
  # when the filter is empty, 'all', or 'cuda'.
  if: >
    (github.event_name == 'workflow_dispatch' &&
    (github.event.inputs.backend_filter == '' ||
    github.event.inputs.backend_filter == 'all' ||
    github.event.inputs.backend_filter == 'cuda')) ||
    github.event_name == 'schedule' ||
    (github.event_name == 'push' && github.ref == 'refs/heads/main')
  continue-on-error: true  # Non-blocking: GPU runner may not be registered
  permissions:
    contents: read
  strategy:
    fail-fast: false
    matrix:
      include:
        - cuda_arch: sm_80
          description: 'NVIDIA Ampere (A100, A10)'
        - cuda_arch: sm_89
          description: 'NVIDIA Ada (RTX 4090, L4)'
        - cuda_arch: sm_90
          description: 'NVIDIA Hopper (H100)'
  env:
    BENCH_OUT_DIR: artifacts/gpu-cuda-${{ matrix.cuda_arch }}
    # Dispatch input wins; otherwise fall back to the CUDA-relevant benchmarks.
    BENCH_FILTER: ${{ github.event.inputs.benchmark_filter || 'BM_CUDA|BM_Fused|BM_GPU|BM_Backend' }}
  steps:
    - name: Checkout repository
      uses: actions/checkout@v4
      with:
        fetch-depth: 0
    - name: Record runner GPU info
      run: |
        # Best-effort: missing tools must not fail the job (hence `|| true`).
        mkdir -p "$BENCH_OUT_DIR"
        echo "=== nvidia-smi ===" | tee "$BENCH_OUT_DIR/runner_info.txt"
        nvidia-smi | tee -a "$BENCH_OUT_DIR/runner_info.txt" || true
        echo "=== nvcc --version ===" | tee -a "$BENCH_OUT_DIR/runner_info.txt"
        nvcc --version | tee -a "$BENCH_OUT_DIR/runner_info.txt" || true
        echo "=== uname -a ===" | tee -a "$BENCH_OUT_DIR/runner_info.txt"
        uname -a | tee -a "$BENCH_OUT_DIR/runner_info.txt"
    - name: Install system dependencies
      run: |
        sudo apt-get update -qq
        # `|| true`: a package missing from the runner's distro is tolerated.
        sudo apt-get install -y --no-install-recommends \
          cmake ninja-build g++ \
          libgtest-dev libbenchmark-dev \
          pkg-config \
          librocksdb-dev \
          libssl-dev \
          zlib1g-dev \
          libzstd-dev \
          liblz4-dev \
          libfmt-dev \
          libspdlog-dev \
          nlohmann-json3-dev \
          libmimalloc-dev || true
    - name: Install Google Benchmark (build from source if needed)
      run: |
        if ! pkg-config --exists benchmark 2>/dev/null; then
          # Self-hosted runners keep /tmp between runs; a leftover checkout
          # from an earlier attempt would make `git clone` fail.
          rm -rf /tmp/benchmark
          git clone --depth 1 --branch v1.8.3 \
            https://github.com/google/benchmark.git /tmp/benchmark
          cmake -S /tmp/benchmark -B /tmp/benchmark/build \
            -DCMAKE_BUILD_TYPE=Release \
            -DBENCHMARK_ENABLE_TESTING=OFF
          cmake --build /tmp/benchmark/build --parallel $(nproc)
          sudo cmake --install /tmp/benchmark/build --prefix /usr/local
        fi
    - name: Configure CMake (CUDA, ${{ matrix.cuda_arch }})
      env:
        CUDA_ARCH: ${{ matrix.cuda_arch }}
      run: |
        # CMAKE_CUDA_ARCHITECTURES takes bare compute-capability numbers
        # (e.g. 80), so the "sm_" prefix is stripped for that flag only.
        # CUDA_ARCH_LIST receives the matrix value unchanged.
        cmake -B build_gpu_cuda -G Ninja \
          -DCMAKE_BUILD_TYPE=Release \
          -DTHEMIS_BUILD_BENCHMARKS=ON \
          -DTHEMIS_BUILD_TESTS=OFF \
          -DTHEMIS_ENABLE_CUDA=ON \
          -DTHEMIS_ENABLE_GPU=ON \
          -DTHEMIS_ENABLE_LLM=ON \
          -DTHEMIS_ENABLE_TRACING=OFF \
          -DTHEMIS_ENABLE_GRPC=OFF \
          -DCUDA_ARCH_LIST="$CUDA_ARCH" \
          -DCMAKE_CUDA_ARCHITECTURES="${CUDA_ARCH#sm_}" \
          -S cmake
    - name: Build priority GPU benchmarks (CUDA)
      run: |
        # Best-effort build: targets that fail to build are skipped by the
        # run step below, so a build error does not fail this step.
        cmake --build build_gpu_cuda --parallel $(nproc) \
          --target bench_fused_kernels \
          --target bench_gpu_backends \
          --target bench_gpu_training_cycle \
          --target bench_fused_lora_kernels \
          --target bench_backend_comparison \
          --target bench_cuda_vs_cpu \
          --target bench_multi_gpu_scaling \
          --target bench_lora_gpu \
          2>&1 | tee "$BENCH_OUT_DIR/build.log" || true
    - name: Run priority GPU benchmarks (CUDA)
      run: |
        # Keep going when a single benchmark binary fails.
        set +e
        mkdir -p "$BENCH_OUT_DIR"
        BENCH_BINARIES=(
          bench_fused_kernels
          bench_gpu_backends
          bench_gpu_training_cycle
          bench_fused_lora_kernels
          bench_backend_comparison
          bench_cuda_vs_cpu
          bench_multi_gpu_scaling
          bench_lora_gpu
        )
        for bin in "${BENCH_BINARIES[@]}"; do
          if [ -x "build_gpu_cuda/$bin" ]; then
            echo "▶ Running $bin (CUDA ${{ matrix.cuda_arch }}) …"
            build_gpu_cuda/$bin \
              --benchmark_format=json \
              --benchmark_out="$BENCH_OUT_DIR/${bin}.json" \
              --benchmark_min_time=0.5s \
              ${BENCH_FILTER:+--benchmark_filter="$BENCH_FILTER"} \
              2>&1 | tee "$BENCH_OUT_DIR/${bin}.run.log"
            # PIPESTATUS[0] is the benchmark's status; `$?` would be tee's.
            echo " exit=${PIPESTATUS[0]}"
          else
            echo " ⚠️ $bin not built — skipping"
          fi
        done
        echo "✅ CUDA benchmark run complete (${{ matrix.cuda_arch }})"
    - name: Generate benchmark summary
      if: always()
      env:
        MATRIX_CUDA_ARCH: ${{ matrix.cuda_arch }}
      run: |
        # Counts total / real / skipped entries in every benchmark JSON file
        # and writes the result to $BENCH_OUT_DIR/summary.json.
        python3 - <<'PYEOF'
        import json, os, glob
        from datetime import datetime, timezone
        out_dir = os.environ.get("BENCH_OUT_DIR", "artifacts/gpu-cuda")
        results = {}
        for jf in sorted(glob.glob(f"{out_dir}/*.json")):
            try:
                with open(jf) as f:
                    data = json.load(f)
                bench_name = os.path.splitext(os.path.basename(jf))[0]
                benchmarks = data.get("benchmarks", [])
                skipped = [b for b in benchmarks if b.get("skipped")]
                real = [b for b in benchmarks if not b.get("skipped")]
                results[bench_name] = {
                    "total": len(benchmarks),
                    "real": len(real),
                    "skipped": len(skipped),
                }
            except Exception as e:
                # An unreadable or malformed file is reported, not fatal.
                results[os.path.basename(jf)] = {"error": str(e)}
        summary = {
            "backend": "cuda",
            "arch": os.environ.get("MATRIX_CUDA_ARCH", "unknown"),
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "commit": os.environ.get("GITHUB_SHA", "unknown"),
            "benchmarks": results,
        }
        with open(f"{out_dir}/summary.json", "w") as f:
            json.dump(summary, f, indent=2)
        print(json.dumps(summary, indent=2))
        PYEOF
    - name: Upload CUDA benchmark artifacts
      if: always()
      uses: actions/upload-artifact@v4
      with:
        name: gpu-cuda-${{ matrix.cuda_arch }}-benchmark-results
        path: ${{ env.BENCH_OUT_DIR }}/
        retention-days: 30
    - name: Write job summary (CUDA)
      if: always()
      run: |
        # Event, branch and commit come from the runner's default environment
        # variables instead of `${{ }}` interpolation, so a branch name can
        # never be evaluated as shell code.
        {
          echo "## 🚀 GPU Benchmark Matrix — CUDA (${{ matrix.cuda_arch }})"
          echo ""
          echo "| Parameter | Value |"
          echo "|-----------|-------|"
          echo "| **Backend** | CUDA |"
          echo "| **Architecture** | \`${{ matrix.cuda_arch }}\` (${{ matrix.description }}) |"
          echo "| **Event** | \`${GITHUB_EVENT_NAME}\` |"
          echo "| **Branch** | \`${GITHUB_REF_NAME}\` |"
          echo "| **Commit** | \`${GITHUB_SHA}\` |"
          echo "| **Runner** | self-hosted gpu-cuda |"
          if [ -f "$BENCH_OUT_DIR/summary.json" ]; then
            echo ""
            echo "### Benchmark Results"
            echo "\`\`\`json"
            cat "$BENCH_OUT_DIR/summary.json"
            echo "\`\`\`"
          fi
        } >> "$GITHUB_STEP_SUMMARY"
# ─────────────────────────────────────────────────────────────────────────────
# Job 2 (HIP): self-hosted runner with AMD GPU (ROCm)
# ─────────────────────────────────────────────────────────────────────────────
gpu-bench-hip:
  # Builds and runs the HIP/ROCm benchmark binaries on a self-hosted AMD
  # runner, once per architecture in the matrix, and uploads JSON results + logs.
  name: GPU Benchmarks (HIP/ROCm, ${{ matrix.rocm_arch }})
  runs-on: [self-hosted, gpu-hip]
  # Runs on the weekly schedule, on pushes to main, and on manual dispatch.
  # A manual dispatch honours the `backend_filter` input: the job runs only
  # when the filter is empty, 'all', or 'hip'.
  if: >
    (github.event_name == 'workflow_dispatch' &&
    (github.event.inputs.backend_filter == '' ||
    github.event.inputs.backend_filter == 'all' ||
    github.event.inputs.backend_filter == 'hip')) ||
    github.event_name == 'schedule' ||
    (github.event_name == 'push' && github.ref == 'refs/heads/main')
  continue-on-error: true  # Non-blocking: GPU runner may not be registered
  permissions:
    contents: read
  strategy:
    fail-fast: false
    matrix:
      include:
        - rocm_arch: gfx1100
          description: 'AMD RDNA3 (RX 7900 XTX)'
        - rocm_arch: gfx90a
          description: 'AMD CDNA2 (MI250X)'
  env:
    BENCH_OUT_DIR: artifacts/gpu-hip-${{ matrix.rocm_arch }}
    # Dispatch input wins; otherwise fall back to the HIP-relevant benchmarks.
    BENCH_FILTER: ${{ github.event.inputs.benchmark_filter || 'BM_HIP|BM_GPU|BM_Fused' }}
  steps:
    - name: Checkout repository
      uses: actions/checkout@v4
      with:
        fetch-depth: 0
    - name: Record runner GPU info
      run: |
        # Best-effort: missing tools must not fail the job (hence `|| true`).
        mkdir -p "$BENCH_OUT_DIR"
        echo "=== rocm-smi ===" | tee "$BENCH_OUT_DIR/runner_info.txt"
        rocm-smi | tee -a "$BENCH_OUT_DIR/runner_info.txt" || true
        echo "=== hipcc --version ===" | tee -a "$BENCH_OUT_DIR/runner_info.txt"
        hipcc --version | tee -a "$BENCH_OUT_DIR/runner_info.txt" || true
        echo "=== uname -a ===" | tee -a "$BENCH_OUT_DIR/runner_info.txt"
        uname -a | tee -a "$BENCH_OUT_DIR/runner_info.txt"
    - name: Install system dependencies
      run: |
        sudo apt-get update -qq
        # `|| true`: a package missing from the runner's distro is tolerated.
        sudo apt-get install -y --no-install-recommends \
          cmake ninja-build g++ \
          libbenchmark-dev \
          pkg-config \
          librocksdb-dev \
          libssl-dev \
          libfmt-dev \
          libspdlog-dev \
          nlohmann-json3-dev \
          libmimalloc-dev || true
    - name: Configure CMake (HIP, ${{ matrix.rocm_arch }})
      run: |
        cmake -B build_gpu_hip -G Ninja \
          -DCMAKE_BUILD_TYPE=Release \
          -DTHEMIS_BUILD_BENCHMARKS=ON \
          -DTHEMIS_BUILD_TESTS=OFF \
          -DTHEMIS_ENABLE_HIP=ON \
          -DTHEMIS_ENABLE_GPU=ON \
          -DTHEMIS_ENABLE_LLM=ON \
          -DTHEMIS_ENABLE_TRACING=OFF \
          -DTHEMIS_ENABLE_GRPC=OFF \
          -DAMDGPU_TARGETS="${{ matrix.rocm_arch }}" \
          -S cmake
    - name: Build priority GPU benchmarks (HIP)
      run: |
        # Best-effort build: targets that fail to build are skipped by the
        # run step below, so a build error does not fail this step.
        cmake --build build_gpu_hip --parallel $(nproc) \
          --target bench_fused_kernels \
          --target bench_gpu_backends \
          --target bench_gpu_training_cycle \
          --target bench_fused_lora_kernels \
          --target bench_multi_gpu_scaling \
          --target bench_lora_gpu \
          2>&1 | tee "$BENCH_OUT_DIR/build.log" || true
    - name: Run priority GPU benchmarks (HIP)
      run: |
        # Keep going when a single benchmark binary fails.
        set +e
        BENCH_BINARIES=(
          bench_fused_kernels
          bench_gpu_backends
          bench_gpu_training_cycle
          bench_fused_lora_kernels
          bench_multi_gpu_scaling
          bench_lora_gpu
        )
        for bin in "${BENCH_BINARIES[@]}"; do
          if [ -x "build_gpu_hip/$bin" ]; then
            echo "▶ Running $bin (HIP ${{ matrix.rocm_arch }}) …"
            build_gpu_hip/$bin \
              --benchmark_format=json \
              --benchmark_out="$BENCH_OUT_DIR/${bin}.json" \
              --benchmark_min_time=0.5s \
              ${BENCH_FILTER:+--benchmark_filter="$BENCH_FILTER"} \
              2>&1 | tee "$BENCH_OUT_DIR/${bin}.run.log"
          else
            echo " ⚠️ $bin not built — skipping"
          fi
        done
        echo "✅ HIP benchmark run complete (${{ matrix.rocm_arch }})"
    - name: Upload HIP benchmark artifacts
      if: always()
      uses: actions/upload-artifact@v4
      with:
        name: gpu-hip-${{ matrix.rocm_arch }}-benchmark-results
        path: ${{ env.BENCH_OUT_DIR }}/
        retention-days: 30
    - name: Write job summary (HIP)
      if: always()
      run: |
        # Event, branch and commit come from the runner's default environment
        # variables instead of `${{ }}` interpolation, so a branch name can
        # never be evaluated as shell code.
        {
          echo "## 🚀 GPU Benchmark Matrix — HIP (${{ matrix.rocm_arch }})"
          echo ""
          echo "| Parameter | Value |"
          echo "|-----------|-------|"
          echo "| **Backend** | HIP/ROCm |"
          echo "| **Architecture** | \`${{ matrix.rocm_arch }}\` (${{ matrix.description }}) |"
          echo "| **Event** | \`${GITHUB_EVENT_NAME}\` |"
          echo "| **Branch** | \`${GITHUB_REF_NAME}\` |"
          echo "| **Commit** | \`${GITHUB_SHA}\` |"
          echo "| **Runner** | self-hosted gpu-hip |"
        } >> "$GITHUB_STEP_SUMMARY"
# ─────────────────────────────────────────────────────────────────────────────
# Job 3 (Vulkan): self-hosted runner with Vulkan-capable GPU
# ─────────────────────────────────────────────────────────────────────────────
gpu-bench-vulkan:
  # Builds and runs the Vulkan benchmark binaries on a self-hosted runner
  # and uploads JSON results + logs.
  name: GPU Benchmarks (Vulkan)
  runs-on: [self-hosted, gpu-vulkan]
  # Runs on the weekly schedule, on pushes to main, and on manual dispatch.
  # A manual dispatch honours the `backend_filter` input: the job runs only
  # when the filter is empty, 'all', or 'vulkan'.
  if: >
    (github.event_name == 'workflow_dispatch' &&
    (github.event.inputs.backend_filter == '' ||
    github.event.inputs.backend_filter == 'all' ||
    github.event.inputs.backend_filter == 'vulkan')) ||
    github.event_name == 'schedule' ||
    (github.event_name == 'push' && github.ref == 'refs/heads/main')
  continue-on-error: true  # Non-blocking: GPU runner may not be registered
  permissions:
    contents: read
  env:
    BENCH_OUT_DIR: artifacts/gpu-vulkan
    # Optional regex from the dispatch input; empty means "run every benchmark".
    BENCH_FILTER: ${{ github.event.inputs.benchmark_filter }}
  steps:
    - name: Checkout repository
      uses: actions/checkout@v4
      with:
        fetch-depth: 0
    - name: Record runner GPU / Vulkan info
      run: |
        # Best-effort: a missing vulkaninfo must not fail the job.
        mkdir -p "$BENCH_OUT_DIR"
        echo "=== vulkaninfo --summary ===" | tee "$BENCH_OUT_DIR/runner_info.txt"
        vulkaninfo --summary 2>/dev/null | tee -a "$BENCH_OUT_DIR/runner_info.txt" || true
        echo "=== uname -a ===" | tee -a "$BENCH_OUT_DIR/runner_info.txt"
        uname -a | tee -a "$BENCH_OUT_DIR/runner_info.txt"
    - name: Install system dependencies
      run: |
        sudo apt-get update -qq
        # `|| true`: a package missing from the runner's distro is tolerated.
        sudo apt-get install -y --no-install-recommends \
          cmake ninja-build g++ \
          libbenchmark-dev \
          pkg-config \
          librocksdb-dev \
          libssl-dev \
          libfmt-dev \
          libspdlog-dev \
          nlohmann-json3-dev \
          libvulkan-dev \
          vulkan-tools \
          spirv-tools \
          libmimalloc-dev || true
    - name: Configure CMake (Vulkan)
      run: |
        cmake -B build_gpu_vulkan -G Ninja \
          -DCMAKE_BUILD_TYPE=Release \
          -DTHEMIS_BUILD_BENCHMARKS=ON \
          -DTHEMIS_BUILD_TESTS=OFF \
          -DTHEMIS_ENABLE_VULKAN=ON \
          -DTHEMIS_ENABLE_GPU=ON \
          -DTHEMIS_ENABLE_LLM=ON \
          -DTHEMIS_ENABLE_TRACING=OFF \
          -DTHEMIS_ENABLE_GRPC=OFF \
          -S cmake
    - name: Build priority GPU benchmarks (Vulkan)
      run: |
        # Best-effort build: targets that fail to build are skipped by the
        # run step below, so a build error does not fail this step.
        cmake --build build_gpu_vulkan --parallel $(nproc) \
          --target bench_vulkan_lora \
          --target bench_gpu_backends \
          --target bench_lora_gpu \
          2>&1 | tee "$BENCH_OUT_DIR/build.log" || true
    - name: Run priority GPU benchmarks (Vulkan)
      run: |
        # Keep going when a single benchmark binary fails.
        set +e
        BENCH_BINARIES=(
          bench_vulkan_lora
          bench_gpu_backends
          bench_lora_gpu
        )
        for bin in "${BENCH_BINARIES[@]}"; do
          if [ -x "build_gpu_vulkan/$bin" ]; then
            echo "▶ Running $bin (Vulkan) …"
            # The filter flag is passed only when BENCH_FILTER is non-empty.
            build_gpu_vulkan/$bin \
              --benchmark_format=json \
              --benchmark_out="$BENCH_OUT_DIR/${bin}.json" \
              --benchmark_min_time=0.5s \
              ${BENCH_FILTER:+--benchmark_filter="$BENCH_FILTER"} \
              2>&1 | tee "$BENCH_OUT_DIR/${bin}.run.log"
          else
            echo " ⚠️ $bin not built — skipping"
          fi
        done
        echo "✅ Vulkan benchmark run complete"
    - name: Upload Vulkan benchmark artifacts
      if: always()
      uses: actions/upload-artifact@v4
      with:
        name: gpu-vulkan-benchmark-results
        path: ${{ env.BENCH_OUT_DIR }}/
        retention-days: 30
    - name: Write job summary (Vulkan)
      if: always()
      run: |
        # Event, branch and commit come from the runner's default environment
        # variables instead of `${{ }}` interpolation, so a branch name can
        # never be evaluated as shell code.
        {
          echo "## 🚀 GPU Benchmark Matrix — Vulkan"
          echo ""
          echo "| Parameter | Value |"
          echo "|-----------|-------|"
          echo "| **Backend** | Vulkan |"
          echo "| **Event** | \`${GITHUB_EVENT_NAME}\` |"
          echo "| **Branch** | \`${GITHUB_REF_NAME}\` |"
          echo "| **Commit** | \`${GITHUB_SHA}\` |"
          echo "| **Runner** | self-hosted gpu-vulkan |"
        } >> "$GITHUB_STEP_SUMMARY"
# ─────────────────────────────────────────────────────────────────────────────
# Job 4 (CPU fallback): always runs on ubuntu-latest
# Compiles all GPU-gated benchmarks in CPU-only mode to verify that:
# a) the disabled-stub paths compile cleanly, and
# b) cmake targets are registered correctly.
# Also validates the disabled-stub policy (Deadline + Issue tag present).
# ─────────────────────────────────────────────────────────────────────────────
gpu-bench-cpu-fallback:
  # Hosted-runner job with every GPU flag OFF. It builds the project, checks
  # the disabled-stub policy in the GPU benchmark sources, and smoke-runs the
  # CPU-stub binary. This is the job the gate treats as blocking.
  name: GPU Benchmarks — CPU Fallback Compile Check
  runs-on: ubuntu-latest
  permissions:
    contents: read
  env:
    BENCH_OUT_DIR: artifacts/gpu-cpu-fallback
  steps:
    - name: Checkout repository
      uses: actions/checkout@v4
      with:
        fetch-depth: 0
    - name: Install system dependencies
      run: |
        sudo apt-get update -qq
        sudo apt-get install -y --no-install-recommends \
          cmake ninja-build g++ \
          libbenchmark-dev \
          pkg-config \
          librocksdb-dev \
          libssl-dev \
          zlib1g-dev \
          libzstd-dev \
          liblz4-dev \
          libfmt-dev \
          libspdlog-dev \
          nlohmann-json3-dev \
          libmimalloc-dev
    - name: Install Google Benchmark (build from source if needed)
      run: |
        if ! pkg-config --exists benchmark 2>/dev/null; then
          git clone --depth 1 --branch v1.8.3 \
            https://github.com/google/benchmark.git /tmp/benchmark
          cmake -S /tmp/benchmark -B /tmp/benchmark/build \
            -DCMAKE_BUILD_TYPE=Release \
            -DBENCHMARK_ENABLE_TESTING=OFF
          cmake --build /tmp/benchmark/build --parallel $(nproc)
          sudo cmake --install /tmp/benchmark/build --prefix /usr/local
        fi
    - name: Configure CMake (CPU-only, all GPU-gated benchmarks)
      run: |
        cmake -B build_cpu_fallback -G Ninja \
          -DCMAKE_BUILD_TYPE=Release \
          -DTHEMIS_BUILD_BENCHMARKS=ON \
          -DTHEMIS_BUILD_TESTS=OFF \
          -DTHEMIS_ENABLE_GPU=OFF \
          -DTHEMIS_ENABLE_CUDA=OFF \
          -DTHEMIS_ENABLE_HIP=OFF \
          -DTHEMIS_ENABLE_VULKAN=OFF \
          -DTHEMIS_ENABLE_LLM=OFF \
          -DTHEMIS_ENABLE_TRACING=OFF \
          -DTHEMIS_ENABLE_GRPC=OFF \
          -S cmake
    - name: Build GPU-gated benchmarks (CPU stub path)
      run: |
        # Without pipefail the pipeline's status is tee's (always 0), so a
        # failed build would still pass this step.
        set -o pipefail
        mkdir -p "$BENCH_OUT_DIR"
        cmake --build build_cpu_fallback --parallel $(nproc) \
          2>&1 | tee "$BENCH_OUT_DIR/build.log"
        echo "✅ CPU-stub build complete"
    - name: Verify disabled-stub policy compliance
      run: |
        echo "Checking disabled-stub policy in GPU benchmark files…"
        EXIT=0
        for f in \
          benchmarks/bench_fused_kernels.cpp \
          benchmarks/bench_fused_lora_kernels.cpp \
          benchmarks/bench_gpu_training_cycle.cpp \
          benchmarks/bench_vulkan_lora.cpp \
          benchmarks/bench_lora_gpu.cpp \
          benchmarks/bench_multi_gpu_scaling.cpp \
          benchmarks/bench_backend_comparison.cpp \
          benchmarks/bench_gpu_vector_index.cpp; do
          # Files that do not exist in this checkout are ignored.
          [ -f "$f" ] || continue
          # Each *_GPUDisabled or *_Disabled BENCHMARK registration must carry
          # "Deadline: " and "Issue: #" in the same file.
          if grep -q "BENCHMARK(BM_.*Disabled)" "$f"; then
            HAS_DEADLINE=$(grep -c "Deadline:" "$f" || true)
            HAS_ISSUE=$(grep -c "Issue: #" "$f" || true)
            if [ "$HAS_DEADLINE" -lt 1 ] || [ "$HAS_ISSUE" -lt 1 ]; then
              echo "❌ $f — missing Deadline or Issue tag for disabled stub"
              EXIT=1
            else
              echo "✅ $f — stub policy compliant"
            fi
          fi
        done
        exit $EXIT
    - name: Run CPU-stub benchmarks (smoke test)
      run: |
        # `set +e` lets every binary run; failures are collected in FAILED
        # and reported once at the end.
        set +e
        mkdir -p "$BENCH_OUT_DIR"
        # Run each CPU-stub binary to verify it starts and exits cleanly.
        # Disabled stubs call SkipWithError so they exit 0.
        STUB_TARGETS=(
          bench_gpu_backends
        )
        FAILED=0
        for bin in "${STUB_TARGETS[@]}"; do
          if [ -x "build_cpu_fallback/$bin" ]; then
            echo "▶ Smoke-testing $bin (CPU stub) …"
            timeout 60 build_cpu_fallback/$bin \
              --benchmark_format=json \
              --benchmark_out="$BENCH_OUT_DIR/${bin}.json" \
              --benchmark_min_time=0.01s \
              2>&1 | tee "$BENCH_OUT_DIR/${bin}.run.log"
            # PIPESTATUS[0] is the binary's (or timeout's) status, not tee's.
            status=${PIPESTATUS[0]}
            if [ "$status" -ne 0 ]; then
              echo "❌ $bin exited with status $status"
              FAILED=1
            fi
          else
            echo " ⚠️ $bin not built — skipping"
          fi
        done
        if [ "$FAILED" -ne 0 ]; then
          exit 1
        fi
        echo "✅ CPU-stub smoke tests complete"
    - name: Upload CPU fallback artifacts
      if: always()
      uses: actions/upload-artifact@v4
      with:
        name: gpu-cpu-fallback-benchmark-results
        path: ${{ env.BENCH_OUT_DIR }}/
        retention-days: 14
    - name: Write job summary (CPU fallback)
      if: always()
      run: |
        # Event, branch and commit come from the runner's default environment
        # variables instead of `${{ }}` interpolation, so a branch name can
        # never be evaluated as shell code.
        {
          echo "## 🖥️ GPU Benchmark Matrix — CPU Fallback Compile Check"
          echo ""
          echo "| Parameter | Value |"
          echo "|-----------|-------|"
          echo "| **Runner** | ubuntu-latest (CPU-only fallback) |"
          echo "| **GPU flags** | all OFF — disabled-stub path only |"
          echo "| **Event** | \`${GITHUB_EVENT_NAME}\` |"
          echo "| **Branch** | \`${GITHUB_REF_NAME}\` |"
          echo "| **Commit** | \`${GITHUB_SHA}\` |"
          echo ""
          echo "### Checks"
          echo "- GPU-gated benchmarks compile in CPU stub mode"
          echo "- Disabled-stub policy compliance (Deadline + Issue tag)"
          echo "- CPU-stub smoke run (bench_gpu_backends exits 0)"
        } >> "$GITHUB_STEP_SUMMARY"
# ─────────────────────────────────────────────────────────────────────────────
# Gate job: summarises all backend results
# ─────────────────────────────────────────────────────────────────────────────
gpu-benchmark-gate:
  # Final job: always runs, prints the result of every backend job, and fails
  # only when the CPU fallback job neither succeeded nor was skipped.
  name: GPU Benchmark Matrix Gate
  needs:
    - gpu-bench-cuda
    - gpu-bench-hip
    - gpu-bench-vulkan
    - gpu-bench-cpu-fallback
  if: always()
  runs-on: ubuntu-latest
  permissions:
    contents: read
  steps:
    - name: Check overall status
      shell: bash
      run: |
        cuda="${{ needs.gpu-bench-cuda.result }}"
        hip="${{ needs.gpu-bench-hip.result }}"
        vulkan="${{ needs.gpu-bench-vulkan.result }}"
        fallback="${{ needs.gpu-bench-cpu-fallback.result }}"
        echo "CUDA job result : $cuda"
        echo "HIP job result : $hip"
        echo "Vulkan job result : $vulkan"
        echo "CPU fallback : $fallback"
        # CPU fallback is the blocking gate — GPU runners are optional.
        if [[ "$fallback" != "success" && "$fallback" != "skipped" ]]; then
          echo "❌ CPU fallback compile/stub check failed."
          exit 1
        fi
        # Report GPU runner availability.
        for backend in cuda hip vulkan; do
          # Indirect expansion: read the variable named after the backend
          # ($cuda, $hip or $vulkan) set above.
          result="${!backend}"
          if [[ "$result" == "success" ]]; then
            echo "✅ $backend — real measurements recorded"
          elif [[ "$result" == "skipped" ]]; then
            echo "ℹ️ $backend — no self-hosted runner registered (expected in hosted CI)"
          else
            echo "⚠️ $backend — runner available but job encountered errors (continue-on-error)"
          fi
        done
        echo "✅ GPU Benchmark Matrix gate passed."
    - name: Write gate summary
      if: always()
      run: |
        # One grouped redirect appends the whole table to the step summary.
        {
          echo "## 🎯 GPU Benchmark Matrix — Gate Summary"
          echo ""
          echo "| Backend | Result |"
          echo "|---------|--------|"
          echo "| CUDA (sm_80/89/90) | \`${{ needs.gpu-bench-cuda.result }}\` |"
          echo "| HIP/ROCm (gfx1100/gfx90a) | \`${{ needs.gpu-bench-hip.result }}\` |"
          echo "| Vulkan | \`${{ needs.gpu-bench-vulkan.result }}\` |"
          echo "| CPU Fallback (blocking gate) | \`${{ needs.gpu-bench-cpu-fallback.result }}\` |"
          echo ""
          echo "_GPU runner jobs (CUDA/HIP/Vulkan) use_ \`continue-on-error: true\` _—_"
          echo "_they are skipped when no self-hosted runner is registered._"
          echo "_The CPU fallback job is the blocking gate._"
          echo ""
          echo "See: [docs/ci-cd/gpu-benchmark-matrix-runner.md](../../docs/ci-cd/gpu-benchmark-matrix-runner.md)"
        } >> "$GITHUB_STEP_SUMMARY"