Build llama-server #12
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Builds llama-server for every platform/accelerator listed in the build
# matrix and publishes the resulting archives as a GitHub release whose tag is
# the llama.cpp version that was built.
name: Build llama-server

on:
  # Manual trigger — enter a specific version
  workflow_dispatch:
    inputs:
      version:
        description: 'llama.cpp version tag (e.g. b8416). Leave empty to auto-detect latest.'
        required: false
        default: ''
        type: string
      cuda_architectures:
        description: 'CUDA SM targets (semicolon-separated)'
        required: false
        default: '75;80;86;89;90;100;120'
        type: string
  # Automatic trigger — check for new releases every 6 hours
  schedule:
    - cron: '0 */6 * * *'

# The release job creates a tag and a release, which requires write access to
# repository contents.
permissions:
  contents: write

# Run one instance of this workflow at a time. Without this, a scheduled run
# that starts while a build for the same tag is still in progress sees no
# release yet, starts a duplicate build, and both runs publish to the same tag.
# Queued runs wait instead of cancelling the build that is already running.
concurrency:
  group: ${{ github.workflow }}
  cancel-in-progress: false

jobs:
# ── Step 1: Detect the version to build ──────────────────────────────────────
  # Decides which llama.cpp tag to build and whether a build is needed at all.
  # Outputs:
  #   version            — tag to build (unset when should_build is false)
  #   should_build       — 'true' / 'false'; gates the build job
  #   cuda_architectures — dispatch input, or the default SM list when empty
  detect-version:
    runs-on: ubuntu-22.04
    outputs:
      version: ${{ steps.resolve.outputs.version }}
      should_build: ${{ steps.resolve.outputs.should_build }}
      cuda_architectures: ${{ steps.resolve.outputs.cuda_architectures }}
    steps:
      - name: Resolve version
        id: resolve
        # Inputs are passed through the environment rather than interpolated
        # into the script, so their contents are never parsed as shell code.
        env:
          INPUT_VERSION: ${{ inputs.version }}
          INPUT_CUDA_ARCHS: ${{ inputs.cuda_architectures }}
          GH_TOKEN: ${{ github.token }}
        run: |
          # Scheduled runs carry no inputs, so fall back to the default list.
          CUDA_ARCHS="${INPUT_CUDA_ARCHS:-75;80;86;89;90;100;120}"
          echo "cuda_architectures=${CUDA_ARCHS}" >> "$GITHUB_OUTPUT"

          # If version was provided manually, use it
          if [ -n "$INPUT_VERSION" ]; then
            echo "version=${INPUT_VERSION}" >> "$GITHUB_OUTPUT"
            echo "should_build=true" >> "$GITHUB_OUTPUT"
            echo "✅ Manual trigger: building ${INPUT_VERSION}"
            exit 0
          fi

          # Auto-detect latest upstream version.
          # The request is authenticated with the workflow token: anonymous
          # API calls from shared runner IPs are rate-limited far more
          # aggressively, which would surface here as a spurious fetch failure.
          LATEST=$(curl -sf https://api.github.com/repos/ggml-org/llama.cpp/releases/latest \
            -H "Accept: application/vnd.github.v3+json" \
            -H "Authorization: Bearer ${GH_TOKEN}" | jq -r '.tag_name')
          if [ -z "$LATEST" ] || [ "$LATEST" = "null" ]; then
            # Deliberately not a hard failure: skip this run and let the next
            # scheduled run try again.
            echo "❌ Failed to fetch latest llama.cpp version"
            echo "should_build=false" >> "$GITHUB_OUTPUT"
            exit 0
          fi
          echo "🔍 Latest upstream version: ${LATEST}"

          # Check if we already have a release for this version.
          # A missing release makes curl fail (-f) and print nothing, so jq
          # yields an empty string and the comparison below falls through to
          # the "new version" branch.
          EXISTING=$(curl -sf \
            -H "Accept: application/vnd.github.v3+json" \
            -H "Authorization: Bearer ${GH_TOKEN}" \
            "https://api.github.com/repos/${GITHUB_REPOSITORY}/releases/tags/${LATEST}" \
            | jq -r '.tag_name // empty')
          if [ "$EXISTING" = "$LATEST" ]; then
            echo "⏭️ Release ${LATEST} already exists — skipping"
            echo "should_build=false" >> "$GITHUB_OUTPUT"
          else
            echo "🆕 New version detected: ${LATEST} — triggering build"
            echo "version=${LATEST}" >> "$GITHUB_OUTPUT"
            echo "should_build=true" >> "$GITHUB_OUTPUT"
          fi
# ── Step 2: Build all variants ───────────────────────────────────────────────
  # Builds one archive per matrix entry and uploads it as a workflow artifact
  # named after the variant. Runs only when detect-version asked for a build.
  build:
    needs: detect-version
    if: needs.detect-version.outputs.should_build == 'true'
    timeout-minutes: 120
    strategy:
      # One failing variant must not cancel the others.
      fail-fast: false
      matrix:
        # Every CUDA variant pins the SM list its toolkit is built for. The
        # build steps use `matrix.cuda_architectures || <detect-version
        # output>`, so the workflow-level value is only reached by variants
        # whose matrix value is empty.
        include:
          # ── Linux x64 ──
          - name: linux-x64-cpu
            runner: ubuntu-22.04
            os: linux
            acceleration: cpu
            cuda_version: ''
            cuda_architectures: ''
          - name: linux-x64-cuda-12
            runner: ubuntu-22.04
            os: linux
            acceleration: cuda
            cuda_version: '12-8'
            cuda_pkg: 'cuda-toolkit-12-8'
            cuda_home: '/usr/local/cuda-12.8'
            cuda_architectures: '75;80;86;89;90'
          - name: linux-x64-cuda-13
            runner: ubuntu-22.04
            os: linux
            acceleration: cuda
            cuda_version: '13-1'
            cuda_pkg: 'cuda-toolkit-13-1'
            cuda_home: '/usr/local/cuda-13.1'
            cuda_architectures: '75;80;86;89;90;100;120'
          - name: linux-x64-vulkan
            runner: ubuntu-22.04
            os: linux
            acceleration: vulkan
            cuda_version: ''
            cuda_architectures: ''
          # ── Linux arm64 ──
          - name: linux-arm64-cpu
            runner: ubuntu-22.04-arm
            os: linux
            acceleration: cpu
            cuda_version: ''
            cuda_architectures: ''
          - name: linux-arm64-cuda-12
            runner: ubuntu-22.04-arm
            os: linux
            acceleration: cuda
            cuda_version: '12-8'
            cuda_pkg: 'cuda-toolkit-12-8'
            cuda_home: '/usr/local/cuda-12.8'
            cuda_architectures: '75;80;86;89;90'
          - name: linux-arm64-cuda-13
            runner: ubuntu-22.04-arm
            os: linux
            acceleration: cuda
            cuda_version: '13-1'
            cuda_pkg: 'cuda-toolkit-13-1'
            cuda_home: '/usr/local/cuda-13.1'
            cuda_architectures: '75;80;86;89;90;100;120'
          # ── Windows x64 ──
          - name: windows-x64-cpu
            runner: windows-2022
            os: windows
            acceleration: cpu
            cuda_version: ''
            cuda_architectures: ''
          - name: windows-x64-cuda-12
            runner: windows-2022
            os: windows
            acceleration: cuda
            cuda_version: '12.4'
            cuda_architectures: '75;80;86;89;90'
          - name: windows-x64-cuda-13
            runner: windows-2022
            os: windows
            acceleration: cuda
            cuda_version: '13.1'
            cuda_architectures: '75;80;86;89;90;100;120'
          - name: windows-x64-vulkan
            runner: windows-2022
            os: windows
            acceleration: vulkan
            cuda_version: ''
            cuda_architectures: ''
          # ── macOS ──
          - name: macos-arm64-metal
            runner: macos-14
            os: macos
            acceleration: metal
            cuda_version: ''
            cuda_architectures: ''
          # Built on the same macos-14 runner as the Metal variant; the target
          # architecture is handed to the build script through CROSS_ARCH.
          - name: macos-x64-cpu
            runner: macos-14
            os: macos
            acceleration: cpu
            cross_arch: x86_64
            cuda_version: ''
            cuda_architectures: ''
    name: ${{ matrix.name }}
    runs-on: ${{ matrix.runner }}
    steps:
      # Needed for scripts/build.sh and the local windows-setup-cuda action.
      - name: Checkout build repo
        uses: actions/checkout@v4

      # ── Linux dependencies ──
      - name: Install build dependencies (Linux)
        if: matrix.os == 'linux'
        run: |
          sudo apt-get update
          sudo apt-get install -y build-essential cmake git

      - name: Install Vulkan SDK (Linux)
        if: matrix.os == 'linux' && matrix.acceleration == 'vulkan'
        run: |
          wget -qO- https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo tee /etc/apt/trusted.gpg.d/lunarg.asc
          sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
          sudo apt-get update
          sudo apt-get install -y vulkan-sdk

      - name: Install CUDA toolkit (Linux)
        if: matrix.os == 'linux' && matrix.acceleration == 'cuda'
        run: |
          # NVIDIA publishes separate apt repositories per CPU architecture.
          ARCH=$(dpkg --print-architecture)
          if [ "$ARCH" = "amd64" ]; then
            CUDA_REPO="https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64"
          elif [ "$ARCH" = "arm64" ]; then
            CUDA_REPO="https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/sbsa"
          else
            echo "❌ Unsupported architecture: $ARCH"
            exit 1
          fi
          wget -qO- "${CUDA_REPO}/3bf863cc.pub" | sudo gpg --dearmor -o /usr/share/keyrings/cuda-archive-keyring.gpg
          echo "deb [signed-by=/usr/share/keyrings/cuda-archive-keyring.gpg] ${CUDA_REPO} /" | sudo tee /etc/apt/sources.list.d/cuda.list
          sudo apt-get update
          sudo apt-get install -y ${{ matrix.cuda_pkg }}
          # Expose the toolkit to the following steps.
          echo "CUDA_HOME=${{ matrix.cuda_home }}" >> "$GITHUB_ENV"
          echo "${{ matrix.cuda_home }}/bin" >> "$GITHUB_PATH"

      # ── Windows dependencies ──
      - name: Install Ninja (Windows)
        if: matrix.os == 'windows'
        run: choco install ninja -y

      - name: Install CUDA toolkit (Windows)
        if: matrix.os == 'windows' && matrix.acceleration == 'cuda'
        uses: ./.github/actions/windows-setup-cuda
        with:
          cuda_version: ${{ matrix.cuda_version }}

      - name: Install Vulkan SDK (Windows)
        if: matrix.os == 'windows' && matrix.acceleration == 'vulkan'
        env:
          VULKAN_VERSION: 1.4.313.0
        run: |
          curl -L -o vulkan-sdk.exe "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/vulkansdk-windows-X64-${env:VULKAN_VERSION}.exe"
          .\vulkan-sdk.exe --accept-licenses --default-answer --confirm-command install
          echo "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}" >> $env:GITHUB_ENV
          echo "C:\VulkanSDK\${env:VULKAN_VERSION}\bin" >> $env:GITHUB_PATH

      # ── Build (Linux & macOS) ──
      - name: Build llama-server (Unix)
        if: matrix.os != 'windows'
        # Values reach the script as environment variables instead of being
        # pasted into the script text, so an unusual tag name cannot be
        # interpreted as shell syntax.
        env:
          CROSS_ARCH: ${{ matrix.cross_arch }}
          BUILD_VERSION: ${{ needs.detect-version.outputs.version }}
          BUILD_ACCELERATION: ${{ matrix.acceleration }}
          BUILD_CUDA_ARCHS: ${{ matrix.cuda_architectures || needs.detect-version.outputs.cuda_architectures }}
        run: |
          chmod +x scripts/build.sh
          # Positional arguments: version tag, acceleration, CUDA SM list.
          # NOTE(review): the script presumably writes dist/*.tar.gz, which
          # the upload step below expects — confirm against scripts/build.sh.
          bash scripts/build.sh \
            "$BUILD_VERSION" \
            "$BUILD_ACCELERATION" \
            "$BUILD_CUDA_ARCHS"

      # ── Build (Windows) ──
      - name: Build llama-server (Windows)
        if: matrix.os == 'windows'
        shell: cmd
        env:
          VERSION: ${{ needs.detect-version.outputs.version }}
          ACCELERATION: ${{ matrix.acceleration }}
          CUDA_ARCHS: ${{ matrix.cuda_architectures || needs.detect-version.outputs.cuda_architectures }}
        run: |
          rem cmd does not stop on a failing command, so every stage that the
          rem next one depends on aborts the step explicitly.

          rem Clone source
          git clone --depth 1 --branch %VERSION% https://github.com/ggml-org/llama.cpp.git C:\llama-source || exit /b 1

          rem Setup MSVC environment
          call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64 || exit /b 1

          rem Configure cmake with Ninja
          set CMAKE_ARGS=-B C:\llama-build -S C:\llama-source -G "Ninja Multi-Config" -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_NATIVE=OFF
          if "%ACCELERATION%"=="cuda" set CMAKE_ARGS=%CMAKE_ARGS% -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=%CUDA_ARCHS%
          if "%ACCELERATION%"=="vulkan" set CMAKE_ARGS=%CMAKE_ARGS% -DGGML_VULKAN=ON
          cmake %CMAKE_ARGS% || exit /b 1

          rem Build, using one job fewer than the number of processors
          set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
          cmake --build C:\llama-build --config Release -j %NINJA_JOBS%

      - name: Package (Windows)
        if: matrix.os == 'windows'
        shell: pwsh
        env:
          BUILD_VERSION: ${{ needs.detect-version.outputs.version }}
          BUILD_VARIANT: ${{ matrix.name }}
        run: |
          $version = $env:BUILD_VERSION
          $artifactName = "llama-server-${version}-${env:BUILD_VARIANT}"
          $stagingDir = "C:\llama-staging\$artifactName"
          New-Item -ItemType Directory -Force -Path $stagingDir | Out-Null
          New-Item -ItemType Directory -Force -Path "dist" | Out-Null

          # Copy binary and DLLs. The executable is looked up in both
          # candidate output directories, so each copy may legitimately miss.
          Copy-Item "C:\llama-build\bin\Release\llama-server.exe" "$stagingDir\" -ErrorAction SilentlyContinue
          Copy-Item "C:\llama-build\bin\llama-server.exe" "$stagingDir\" -ErrorAction SilentlyContinue

          # Both copies missing means the build produced no server binary;
          # fail here instead of shipping an archive without it.
          if (-not (Test-Path "$stagingDir\llama-server.exe")) {
            throw "llama-server.exe not found in C:\llama-build\bin or C:\llama-build\bin\Release"
          }

          Get-ChildItem "C:\llama-build" -Recurse -Include "*.dll" | Copy-Item -Destination "$stagingDir\" -ErrorAction SilentlyContinue
          Compress-Archive -Path "$stagingDir" -DestinationPath "dist\${artifactName}.zip"
          Write-Host "✅ Built: dist\${artifactName}.zip"

      - name: Upload artifact
        uses: actions/upload-artifact@v4
        with:
          name: ${{ matrix.name }}
          path: |
            dist/*.tar.gz
            dist/*.zip
          # A variant that produced no archive is a failed build, not a warning.
          if-no-files-found: error
          retention-days: 7
# ── Step 3: Publish release ──────────────────────────────────────────────────
  # Collects every variant's artifact and publishes them as one GitHub release
  # tagged with the llama.cpp version that was built. Because it needs the
  # build job, it is skipped when the build was skipped or did not succeed.
  release:
    needs: [detect-version, build]
    runs-on: ubuntu-22.04
    steps:
      # Without a `name`, every artifact of the run is downloaded into its own
      # sub-directory of artifacts/.
      - name: Download all artifacts
        uses: actions/download-artifact@v4
        with:
          path: artifacts/

      - name: List artifacts
        run: find artifacts/ -type f \( -name '*.tar.gz' -o -name '*.zip' \) | sort

      - name: Create GitHub Release
        uses: softprops/action-gh-release@v2
        with:
          tag_name: ${{ needs.detect-version.outputs.version }}
          name: "llama-server ${{ needs.detect-version.outputs.version }}"
          # The SM target line at the end mirrors the per-toolkit lists pinned
          # in the build matrix (the dispatch input does not override them);
          # keep the two in sync.
          body: |
            Pre-built `llama-server` binaries from [llama.cpp ${{ needs.detect-version.outputs.version }}](https://github.com/ggml-org/llama.cpp/releases/tag/${{ needs.detect-version.outputs.version }}).
            ## Downloads — Linux
            | Variant | File |
            |---------|------|
            | x64 CPU | `llama-server-*-linux-x64-cpu.tar.gz` |
            | x64 CUDA 12.8 | `llama-server-*-linux-x64-cuda-12.tar.gz` |
            | x64 CUDA 13.1 | `llama-server-*-linux-x64-cuda-13.tar.gz` |
            | x64 Vulkan | `llama-server-*-linux-x64-vulkan.tar.gz` |
            | arm64 CPU | `llama-server-*-linux-arm64-cpu.tar.gz` |
            | arm64 CUDA 12.8 | `llama-server-*-linux-arm64-cuda-12.tar.gz` |
            | arm64 CUDA 13.1 | `llama-server-*-linux-arm64-cuda-13.tar.gz` |
            ## Downloads — Windows
            | Variant | File |
            |---------|------|
            | x64 CPU | `llama-server-*-windows-x64-cpu.zip` |
            | x64 CUDA 12.4 | `llama-server-*-windows-x64-cuda-12.zip` |
            | x64 CUDA 13.1 | `llama-server-*-windows-x64-cuda-13.zip` |
            | x64 Vulkan | `llama-server-*-windows-x64-vulkan.zip` |
            ## Downloads — macOS
            | Variant | File |
            |---------|------|
            | arm64 Metal | `llama-server-*-macos-arm64-metal.tar.gz` |
            | x64 CPU | `llama-server-*-macos-x64-cpu.tar.gz` |
            ### CUDA SM targets: CUDA 12 builds `75;80;86;89;90` · CUDA 13 builds `75;80;86;89;90;100;120`
          files: artifacts/**/*
          # Refuse to publish a release that would carry no binaries.
          fail_on_unmatched_files: true
          generate_release_notes: false