diff --git a/.ci/benchmarks/baseline.json b/.ci/benchmarks/baseline.json new file mode 100644 index 0000000..e30304e --- /dev/null +++ b/.ci/benchmarks/baseline.json @@ -0,0 +1,3 @@ +{ + "benchmarks": [] +} diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml new file mode 100644 index 0000000..1e270da --- /dev/null +++ b/.github/workflows/benchmark.yml @@ -0,0 +1,39 @@ +name: Benchmark Regression + +on: + pull_request: + branches: [main, master] + schedule: + - cron: "30 2 * * *" + workflow_dispatch: + +jobs: + benchmark-cpu: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.11" + cache: "pip" + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -e . + pip install pytest pytest-benchmark + + - name: Run benchmark suite + run: pytest tests -m "benchmark" --benchmark-json benchmark.json -q + + - name: Enforce regression threshold + run: python scripts/ci/check_benchmark_regression.py benchmark.json 0.05 + + - name: Upload benchmark report + uses: actions/upload-artifact@v4 + with: + name: benchmark-report + path: benchmark.json diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml new file mode 100644 index 0000000..82da6b4 --- /dev/null +++ b/.github/workflows/docs.yml @@ -0,0 +1,34 @@ +name: Docs + +on: + pull_request: + paths: + - "docs/**" + - "README.md" + - ".github/workflows/docs.yml" + push: + branches: [main, master] + paths: + - "docs/**" + - "README.md" + workflow_dispatch: + +jobs: + docs-build: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.11" + + - name: Install docs dependencies + run: | + python -m pip install --upgrade pip + pip install mkdocs mkdocs-material + + - name: Build docs + run: mkdocs build --strict diff --git 
a/.github/workflows/gpu-hardware.yml b/.github/workflows/gpu-hardware.yml new file mode 100644 index 0000000..ff90223 --- /dev/null +++ b/.github/workflows/gpu-hardware.yml @@ -0,0 +1,66 @@ +name: GPU Hardware Validation + +on: + workflow_dispatch: + schedule: + - cron: "0 1 * * 0" + +jobs: + gpu-smoke: + name: GPU smoke (${{ matrix.runner_label }}) + runs-on: [self-hosted, linux, x64, gpu, "${{ matrix.runner_label }}"] + strategy: + fail-fast: false + matrix: + include: + - runner_label: a100 + cuda_version: "11.8" + gpu_arch: sm_80 + - runner_label: h100 + cuda_version: "12.1" + gpu_arch: sm_90 + - runner_label: rtx4090 + cuda_version: "12.4" + gpu_arch: sm_89 + + env: + EXPECTED_CUDA: ${{ matrix.cuda_version }} + EXPECTED_ARCH: ${{ matrix.gpu_arch }} + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.11" + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -e . 
+ pip install pytest + + - name: Verify hardware runtime + run: | + nvidia-smi + python - <<'PY' + import os + print(f"Expected CUDA: {os.environ['EXPECTED_CUDA']}") + print(f"Expected arch: {os.environ['EXPECTED_ARCH']}") + PY + + - name: GPU integration and numerical tests + run: | + pytest tests -m "gpu or numerical" -q + + - name: Performance regression check + run: | + pytest tests -m "benchmark" --benchmark-json benchmark-${{ matrix.runner_label }}.json -q + + - name: Upload benchmark artifact + uses: actions/upload-artifact@v4 + with: + name: benchmark-${{ matrix.runner_label }} + path: benchmark-${{ matrix.runner_label }}.json diff --git a/.github/workflows/hpc-matrix.yml b/.github/workflows/hpc-matrix.yml new file mode 100644 index 0000000..07f791b --- /dev/null +++ b/.github/workflows/hpc-matrix.yml @@ -0,0 +1,77 @@ +name: HPC Matrix + +on: + pull_request: + branches: [main, master] + push: + branches: [main, master] + schedule: + - cron: "0 3 * * *" + workflow_dispatch: + +jobs: + matrix-cpu: + name: CPU matrix (py${{ matrix.python-version }} / torch${{ matrix.torch-version }}) + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ["3.9", "3.10", "3.11", "3.12"] + torch-version: ["2.2.2", "2.3.1"] + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + cache: "pip" + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -e . + pip install "torch==${{ matrix.torch-version }}" --index-url https://download.pytorch.org/whl/cpu + pip install pytest pytest-cov mypy ruff + + - name: Lint and type checks + run: | + ruff check . + ruff format --check . 
+ mypy kernels implementations + + - name: Unit tests + run: pytest -m "not gpu" --cov=kernels --cov=implementations --cov-fail-under=85 + + - name: Integration tests + run: pytest tests -m "integration and not gpu" -q + + cuda-compat: + name: CUDA compatibility (containerized) + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + cuda-tag: ["11.8.0-runtime-ubuntu22.04", "12.1.1-runtime-ubuntu22.04"] + + container: + image: nvidia/cuda:${{ matrix.cuda-tag }} + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Install Python and tooling + run: | + apt-get update + apt-get install -y python3 python3-pip + python3 -m pip install --upgrade pip + pip install -e . + pip install pytest + + - name: Import and fallback smoke check + run: | + python3 -c "import kernels; print('kernels import ok')" + pytest tests -m "fallback" -q diff --git a/CHANGELOG.md b/CHANGELOG.md index 8433e3a..c40fd68 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,10 @@ The format is based on Keep a Changelog, and this project adheres to Semantic Ve ### Added +- Added a production-grade HPC CI/CD architecture reference document covering deterministic build strategy, GPU matrix validation, release gating, and scaling guidance. +- Added new GitHub Actions workflows for expanded HPC matrix validation, dedicated GPU hardware testing, benchmark regression gating, and docs build verification. +- Added a benchmark regression check utility script with baseline support to enforce latency performance thresholds in CI. +- Added MkDocs configuration to enable strict documentation build checks in CI. - Added a scheduled dependency health workflow that validates installation integrity, runs `pip check`, audits vulnerabilities with `pip-audit`, and verifies package imports across Python 3.9-3.12. - Added a weekly dependency canary workflow that upgrades core quality tooling to latest versions and runs lint, type checks, tests, and package build validation. 
- Expanded CI workflow to run a Python 3.9-3.12 matrix with linting, format checks, type checking, security scanning, coverage-enforced tests, smoke verification, and package build validation. @@ -27,6 +31,10 @@ The format is based on Keep a Changelog, and this project adheres to Semantic Ve ### Added +- Added a production-grade HPC CI/CD architecture reference document covering deterministic build strategy, GPU matrix validation, release gating, and scaling guidance. +- Added new GitHub Actions workflows for expanded HPC matrix validation, dedicated GPU hardware testing, benchmark regression gating, and docs build verification. +- Added a benchmark regression check utility script with baseline support to enforce latency performance thresholds in CI. +- Added MkDocs configuration to enable strict documentation build checks in CI. - Initial kernel implementation with deterministic state machine - Core types: KernelState, KernelRequest, KernelReceipt, Decision, ReceiptStatus - Append-only audit ledger with hash-chained entries diff --git a/README.md b/README.md index ef32f08..0ee3032 100644 --- a/README.md +++ b/README.md @@ -150,6 +150,7 @@ python -m kernels --help |----------|---------| | [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md) | Component boundaries and data flows | | [docs/THREAT_MODEL.md](docs/THREAT_MODEL.md) | Adversary model + mitigations | +| [docs/pipelines/HPC_CICD_ARCHITECTURE.md](docs/pipelines/HPC_CICD_ARCHITECTURE.md) | Production-grade CI/CD architecture for kernel projects | | [docs/FAQ.md](docs/FAQ.md) | Usage clarifications | --- diff --git a/docs/README.md b/docs/README.md index c79de06..7a8a4f1 100644 --- a/docs/README.md +++ b/docs/README.md @@ -8,6 +8,7 @@ Documentation for understanding and integrating KERNELS. 
|----------|---------| | [ARCHITECTURE.md](ARCHITECTURE.md) | Component boundaries and data flows | | [THREAT_MODEL.md](THREAT_MODEL.md) | Adversary model + mitigations | +| [pipelines/HPC_CICD_ARCHITECTURE.md](pipelines/HPC_CICD_ARCHITECTURE.md) | Advanced CI/CD design for deterministic kernel infrastructure | | [FAQ.md](FAQ.md) | Usage clarifications and non-goals | ## Getting Started diff --git a/docs/pipelines/HPC_CICD_ARCHITECTURE.md b/docs/pipelines/HPC_CICD_ARCHITECTURE.md new file mode 100644 index 0000000..6c66301 --- /dev/null +++ b/docs/pipelines/HPC_CICD_ARCHITECTURE.md @@ -0,0 +1,117 @@ +# HPC CI/CD Architecture for KERNELS + +This document defines a production-grade CI/CD design for compute-kernel style +repositories where correctness, determinism, and hardware compatibility are first- +class release gates. + +## Pipeline Layers + +```text +Commit + -> Static Analysis + -> Deterministic Build Matrix + -> CPU + GPU Test Matrix + -> Security / Supply Chain + -> Numerical Validation + -> Performance Regression Validation + -> Artifact Packaging + Attestation + -> Release + Registry Publish + -> Production Telemetry Hooks +``` + +## Workflow Set + +| Workflow | Trigger | Purpose | +|----------|---------|---------| +| `ci.yml` | PR + push | Fast quality checks, unit tests, package build | +| `hpc-matrix.yml` | PR + push + nightly | Expanded matrix with PyTorch/CUDA compatibility checks | +| `gpu-hardware.yml` | nightly + manual + release candidate | Hardware validation on self-hosted GPU runners | +| `security.yml` | PR + push + weekly | SAST, dependency audit, secret scan | +| `benchmark.yml` | PR + nightly | Performance baselines and regression thresholds | +| `release.yml` | tags | Build, verify, and publish release artifacts | +| `docs.yml` | docs changes + release | Build and publish documentation | + +## Deterministic Build Strategy + +Determinism is enforced by: + +1. Pinning lock files and toolchain versions. +2. 
Building wheels in isolated environments (`python -m build`). +3. Verifying metadata (`twine check`). +4. Capturing SBOM and provenance attestations. + +## Build Matrix Recommendation + +| Axis | Values | +|------|--------| +| Python | 3.9, 3.10, 3.11, 3.12 | +| PyTorch | Supported minor versions | +| CUDA | 11.8, 12.1, 12.4 | +| GPU arch | sm_80, sm_86, sm_89, sm_90 | +| OS | ubuntu-latest, self-hosted GPU Linux | + +## Validation Gates + +### 1. Static Quality Gate + +- `ruff check .` +- `ruff format --check .` +- `mypy kernels implementations` + +### 2. Correctness Gate + +- Unit tests with coverage threshold. +- Integration tests for runtime loading and fallback behavior. +- Numerical equivalence checks against reference implementations. + +### 3. Hardware Gate + +- Run kernel smoke tests on A100 / H100 / RTX-class runners. +- Enforce CPU fallback tests in every PR. + +### 4. Security Gate + +- CodeQL +- `pip-audit` +- `safety` +- `gitleaks` +- Optional Semgrep policy pack + +### 5. Performance Gate + +- Pytest benchmark suite with historical comparison. +- Fail CI if median latency regresses beyond threshold (default 5%). + +## Release Controls + +Release jobs should execute only after all mandatory checks pass: + +- Lint / type / tests +- Security scans +- Benchmark regression check +- Docs build + +On tag: + +1. Build source + wheel distributions. +2. Generate SBOM (`syft`) and vulnerability report (`grype` or `trivy`). +3. Publish to PyPI. +4. Publish container image. +5. Attach benchmark + security artifacts to GitHub release. + +## Rollback Policy + +If release validation fails for correctness or benchmark thresholds: + +- Mark release candidate as failed. +- Prevent publication jobs from running. +- Emit structured summary and incident artifact. + +## Scaling Guidance + +For large OSS adoption: + +- Split fast and slow workflows; protect PR latency. +- Use distributed/self-hosted GPU pools by architecture label. 
+- Cache Python deps, build layers, and benchmark baselines. +- Nightly deep validation for expensive fuzz + hardware tests. diff --git a/mkdocs.yml b/mkdocs.yml new file mode 100644 index 0000000..f8c29c7 --- /dev/null +++ b/mkdocs.yml @@ -0,0 +1,15 @@ +site_name: KERNELS +site_description: Deterministic control planes and CI/CD governance documentation. +repo_url: https://github.com/ayais12210-hub/kernels + +theme: + name: material + +nav: + - Home: README.md + - Architecture: ARCHITECTURE.md + - Threat Model: THREAT_MODEL.md + - Pipelines: + - Phases: pipelines/PHASES.md + - Gates: pipelines/GATES.md + - HPC CI/CD Architecture: pipelines/HPC_CICD_ARCHITECTURE.md diff --git a/scripts/ci/check_benchmark_regression.py b/scripts/ci/check_benchmark_regression.py new file mode 100644 index 0000000..b5f32fe --- /dev/null +++ b/scripts/ci/check_benchmark_regression.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python3 +"""Fail CI when benchmark median latency regresses above a threshold. + +Usage: + python scripts/ci/check_benchmark_regression.py CURRENT_JSON MAX_REGRESSION + +CURRENT_JSON is the pytest-benchmark JSON report of the current run and MAX_REGRESSION is the allowed fractional slowdown (for example 0.05 for 5%). + +The baseline file path is `.ci/benchmarks/baseline.json`. 
from __future__ import annotations

import json
import math
import sys
from pathlib import Path

# Location of the committed baseline report, resolved against the current
# working directory at the time the gate runs.
BASELINE_PATH = Path('.ci/benchmarks/baseline.json')

_USAGE = 'Usage: check_benchmark_regression.py CURRENT_JSON MAX_REGRESSION'


def _load(path: Path) -> dict:
    """Decode the UTF-8 JSON document stored at *path*.

    Raises:
        OSError: if the file cannot be opened or read.
        ValueError: if the content is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    with path.open('r', encoding='utf-8') as handle:
        return json.load(handle)


def _median_map(payload: dict) -> dict[str, float]:
    """Return ``{benchmark name: median}`` for a benchmark report.

    Reads the ``benchmarks`` list of *payload*; each entry contributes its
    ``name`` and ``stats.median``. Entries missing either value are skipped.
    A ``null`` ``benchmarks`` or ``stats`` value is treated as empty rather
    than raising.
    """
    medians: dict[str, float] = {}
    for benchmark in payload.get('benchmarks') or []:
        name = benchmark.get('name')
        median = (benchmark.get('stats') or {}).get('median')
        if name is None or median is None:
            continue
        medians[str(name)] = float(median)
    return medians


def _find_regressions(
    current: dict[str, float],
    baseline: dict[str, float],
    max_regression: float,
) -> list[str]:
    """Describe every baseline benchmark that is missing or too slow.

    A benchmark fails when it is absent from *current*, or when its relative
    slowdown ``(current - baseline) / baseline`` exceeds *max_regression*.
    Baseline medians that are zero or negative are skipped because a relative
    change cannot be computed from them.
    """
    failures: list[str] = []
    for name, baseline_median in baseline.items():
        current_median = current.get(name)
        if current_median is None:
            failures.append(f'{name}: missing from current benchmark run')
            continue

        if baseline_median <= 0:
            continue

        regression = (current_median - baseline_median) / baseline_median
        if regression > max_regression:
            failures.append(
                f'{name}: baseline={baseline_median:.6f}s current={current_median:.6f}s '
                f'regression={regression * 100:.2f}% > {max_regression * 100:.2f}%'
            )
    return failures


def main(argv: list[str] | None = None) -> int:
    """Run the benchmark regression gate and return a process exit code.

    Args:
        argv: Command-line arguments without the program name, in the order
            ``[current_report_path, max_regression]``. Defaults to
            ``sys.argv[1:]`` so existing callers of ``main()`` are unaffected.

    Returns:
        ``0`` when the gate passes or no baseline file exists, ``1`` when at
        least one benchmark regressed or is missing from the current run,
        and ``2`` for usage errors or unreadable input.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) != 2:
        print(_USAGE, file=sys.stderr)
        return 2

    current_path = Path(args[0])
    try:
        max_regression = float(args[1])
    except ValueError:
        print(
            f'Invalid MAX_REGRESSION {args[1]!r}: expected a number such as 0.05',
            file=sys.stderr,
        )
        return 2
    # A NaN threshold makes every "regression > threshold" comparison false,
    # which would let the gate pass silently; reject it up front together
    # with infinite and negative values.
    if not math.isfinite(max_regression) or max_regression < 0:
        print(
            f'Invalid MAX_REGRESSION {args[1]!r}: expected a finite, non-negative number',
            file=sys.stderr,
        )
        return 2

    if not current_path.exists():
        print(f'Current benchmark report not found: {current_path}', file=sys.stderr)
        return 2

    if not BASELINE_PATH.exists():
        print(f'Baseline benchmark file not found at {BASELINE_PATH}; skipping regression gate.')
        return 0

    try:
        current = _median_map(_load(current_path))
        baseline = _median_map(_load(BASELINE_PATH))
    except (OSError, ValueError, TypeError, AttributeError) as exc:
        # Unreadable files, invalid JSON, or a report whose structure is not
        # the expected mapping/list shape: report it as an input error
        # instead of crashing with a traceback.
        print(f'Unable to read benchmark data: {exc}', file=sys.stderr)
        return 2

    failures = _find_regressions(current, baseline, max_regression)
    if failures:
        print('Performance regression gate failed:')
        for failure in failures:
            print(f'- {failure}')
        return 1

    print('Benchmark regression gate passed.')
    return 0


if __name__ == '__main__':
    raise SystemExit(main())