Build llama.cpp with CUDA #229

Workflow file for this run

.github/workflows/build-cuda.yml at 20acfc9

	# Builds CUDA-enabled llama.cpp binaries for amd64 and arm64 whenever upstream
	# publishes a release that has not been built yet, then publishes the tarballs
	# as a release of this repository under the same tag. The tag created by that
	# release is what the check-release job looks for on later runs.
	name: Build llama.cpp with CUDA

	on:
	  schedule:
	    # Run daily at 00:00 UTC
	    - cron: '0 0 * * *'
	  workflow_dispatch:
	    inputs:
	      force_build:
	        description: 'Force build even if no new release'
	        required: false
	        type: boolean
	        default: false

	# Least privilege by default; only the release job requests write access.
	permissions:
	  contents: read

	jobs:
	  # Decides whether a build is needed and exposes the upstream tag/commit.
	  check-release:
	    runs-on: ubuntu-latest
	    outputs:
	      should_build: ${{ steps.check.outputs.should_build }}
	      release_tag: ${{ steps.check.outputs.release_tag }}
	      release_hash: ${{ steps.check.outputs.release_hash }}
	    steps:
	      - name: Checkout
	        uses: actions/checkout@v4
	        with:
	          # Full fetch: the step below looks up tags of previously built releases.
	          fetch-depth: 0

	      - name: Check for new llama.cpp release
	        id: check
	        env:
	          # Authenticated API calls get a much higher rate limit than anonymous ones.
	          GH_TOKEN: ${{ github.token }}
	          # Passed through the environment instead of being spliced into the script.
	          FORCE_BUILD: ${{ github.event.inputs.force_build }}
	        run: |
	          set -euo pipefail

	          api_url='https://api.github.com/repos/ggml-org/llama.cpp/releases/latest'
	          upstream_repo='https://github.com/ggml-org/llama.cpp.git'

	          # Single request; --fail turns HTTP errors into a failed step instead of
	          # letting an error payload be parsed as if it were a release.
	          release_json=$(curl -fsSL \
	            -H 'Accept: application/vnd.github+json' \
	            -H "Authorization: Bearer ${GH_TOKEN}" \
	            "$api_url")

	          LATEST_RELEASE=$(jq -r '.tag_name // empty' <<<"$release_json")
	          if [ -z "$LATEST_RELEASE" ]; then
	            echo "::error::Could not determine the latest llama.cpp release tag"
	            exit 1
	          fi

	          # target_commitish may be a branch name rather than a commit, so resolve
	          # the tag itself. For an annotated tag the peeled "^{}" entry is listed
	          # last and carries the commit id; for a lightweight tag there is one line.
	          RELEASE_HASH=$(git ls-remote "$upstream_repo" \
	            "refs/tags/${LATEST_RELEASE}" "refs/tags/${LATEST_RELEASE}^{}" \
	            | tail -n 1 | cut -f1) || RELEASE_HASH=''
	          if [ -z "$RELEASE_HASH" ]; then
	            RELEASE_HASH=$(jq -r '.target_commitish // empty' <<<"$release_json")
	          fi

	          echo "Latest llama.cpp release: $LATEST_RELEASE"
	          echo "Release hash: $RELEASE_HASH"

	          # A tag with the upstream name in this repository means it was already built.
	          # Exact ref lookup: the tag name is never interpreted as a regex.
	          if git rev-parse -q --verify "refs/tags/${LATEST_RELEASE}" >/dev/null; then
	            echo "Release $LATEST_RELEASE already built"
	            if [ "${FORCE_BUILD:-false}" = "true" ]; then
	              echo "Force build enabled, building anyway"
	              echo "should_build=true" >> "$GITHUB_OUTPUT"
	            else
	              echo "should_build=false" >> "$GITHUB_OUTPUT"
	            fi
	          else
	            echo "New release detected: $LATEST_RELEASE"
	            echo "should_build=true" >> "$GITHUB_OUTPUT"
	          fi

	          echo "release_tag=$LATEST_RELEASE" >> "$GITHUB_OUTPUT"
	          echo "release_hash=$RELEASE_HASH" >> "$GITHUB_OUTPUT"

	  # Compiles llama.cpp inside the nvidia/cuda devel image, once per host arch.
	  build:
	    needs: check-release
	    if: needs.check-release.outputs.should_build == 'true'
	    runs-on: ${{ matrix.arch.runs_on }}
	    strategy:
	      fail-fast: false
	      matrix:
	        cuda_version: ['12.8.1']
	        arch:
	          # Native runners — no QEMU. nvidia/cuda:*-cudnn-devel-ubuntu22.04 is
	          # multi-arch on Docker Hub so the same build path runs on both.
	          - suffix: amd64
	            runs_on: ubuntu-latest
	          - suffix: arm64
	            runs_on: ubuntu-24.04-arm
	        include:
	          - cuda_version: '12.8.1'
	            cuda_version_short: '12.8'
	            cuda_tag: '12.8.1-cudnn-devel-ubuntu22.04'
	            # CUDA compute capabilities target the runtime GPU, not the host
	            # CPU arch, so the same list applies to both amd64 and arm64
	            # builds. Relevant aarch64 GPU contexts (Grace Hopper, Grace
	            # Blackwell, DGX Spark) are covered by sm_90 / sm_100 / sm_120.
	            architectures: '75-virtual;80-virtual;86-virtual;89-virtual;90-virtual;100-virtual;120-virtual'

	    steps:
	      - name: Checkout
	        uses: actions/checkout@v4

	      - name: Maximize build space
	        run: |
	          echo "=== Initial Disk Space ==="
	          df -h

	          # Every removal is best-effort: a package missing from the runner image
	          # must not fail the build.
	          echo "=== Removing unnecessary packages ==="
	          sudo apt-get remove -y '^aspnetcore-.*' || true
	          sudo apt-get remove -y '^dotnet-.*' --fix-missing || true
	          sudo apt-get remove -y '^llvm-.*' --fix-missing || true
	          sudo apt-get remove -y 'php.*' --fix-missing || true
	          sudo apt-get remove -y '^mongodb-.*' --fix-missing || true
	          sudo apt-get remove -y '^mysql-.*' --fix-missing || true
	          sudo apt-get remove -y azure-cli google-chrome-stable firefox powershell mono-devel libgl1-mesa-dri --fix-missing || true
	          sudo apt-get autoremove -y || true
	          sudo apt-get clean || true

	          echo "=== Removing large directories ==="
	          sudo rm -rf /usr/share/dotnet
	          sudo rm -rf /usr/local/lib/android
	          sudo rm -rf /opt/ghc
	          sudo rm -rf /opt/hostedtoolcache/CodeQL
	          sudo rm -rf /usr/local/share/boost
	          sudo rm -rf /usr/share/swift
	          sudo rm -rf /usr/local/.ghcup
	          sudo rm -rf "$AGENT_TOOLSDIRECTORY"

	          # Clean docker system
	          docker system prune -af || true

	          echo "=== After Cleanup ==="
	          df -h

	      - name: Build llama.cpp with CUDA
	        env:
	          # Workflow values reach the scripts as environment variables so they are
	          # never re-parsed as shell source. Names are prefixed to avoid clashing
	          # with variables the CUDA image may already define.
	          CUDA_IMAGE: nvidia/cuda:${{ matrix.cuda_tag }}
	          LLAMA_TAG: ${{ needs.check-release.outputs.release_tag }}
	          LLAMA_COMMIT: ${{ needs.check-release.outputs.release_hash }}
	          BUILD_CUDA_VERSION: ${{ matrix.cuda_version }}
	          BUILD_CUDA_ARCHS: ${{ matrix.architectures }}
	          OUT_DIR: binaries/cuda-${{ matrix.cuda_version_short }}
	        run: |
	          # Pull Docker image
	          docker pull "$CUDA_IMAGE"

	          # Run build in container. The script is single-quoted so nothing in it is
	          # expanded by the host shell; it must therefore not contain single quotes.
	          docker run --rm \
	            -v "$PWD":/workspace \
	            -e LLAMA_TAG -e LLAMA_COMMIT \
	            -e BUILD_CUDA_VERSION -e BUILD_CUDA_ARCHS -e OUT_DIR \
	            "$CUDA_IMAGE" \
	            bash -c '
	              set -e

	              echo "=== Installing minimal dependencies ==="
	              apt-get update -qq
	              apt-get install -y --no-install-recommends git cmake ninja-build build-essential libssl-dev ca-certificates
	              apt-get clean
	              rm -rf /var/lib/apt/lists/*

	              echo "=== Cloning llama.cpp ==="
	              cd /workspace
	              # Shallow clone of the tag first; fall back to a full clone and an
	              # explicit checkout of the release commit.
	              git clone --depth 1 --branch "$LLAMA_TAG" https://github.com/ggml-org/llama.cpp.git || \
	                (rm -rf llama.cpp && git clone https://github.com/ggml-org/llama.cpp.git && cd llama.cpp && git checkout "$LLAMA_COMMIT")
	              cd llama.cpp

	              echo "=== Configuring build with Ninja ==="
	              # Link against the CUDA driver stub from the toolkit directory.
	              export LIBRARY_PATH="/usr/local/cuda/lib64/stubs${LIBRARY_PATH:+:$LIBRARY_PATH}"

	              ln -sf /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1
	              cmake -B build -S . \
	                -G Ninja \
	                -DGGML_CUDA=ON \
	                -DCMAKE_CUDA_ARCHITECTURES="$BUILD_CUDA_ARCHS" \
	                -DCMAKE_BUILD_TYPE=Release \
	                -DBUILD_SHARED_LIBS=ON \
	                -DGGML_NATIVE=OFF \
	                -DLLAMA_BUILD_TESTS=OFF \
	                -DLLAMA_BUILD_EXAMPLES=OFF \
	                -DCMAKE_EXE_LINKER_FLAGS="-Wl,-rpath-link,/usr/local/cuda/lib64/stubs" \
	                -DCMAKE_SHARED_LINKER_FLAGS="-Wl,-rpath-link,/usr/local/cuda/lib64/stubs"

	              echo "=== Building with Ninja (parallel: all cores) ==="
	              cmake --build build --config Release -j"$(nproc)"

	              echo "=== Copying binaries ==="
	              cd /workspace
	              mkdir -p "$OUT_DIR"

	              # Copy everything from build/bin
	              cp -r llama.cpp/build/bin/* "$OUT_DIR"/

	              # Strip binaries to reduce size (executables only, not .so files).
	              # Best-effort: a file strip cannot handle must not fail the build.
	              find "$OUT_DIR"/ -type f -executable ! \( -name "*.so" -o -name "*.so.*" \) -exec strip {} \; 2>/dev/null || true

	              echo "=== Creating version info ==="
	              {
	                echo "llama.cpp version: $LLAMA_TAG"
	                echo "CUDA version: $BUILD_CUDA_VERSION"
	                echo "Architectures: $BUILD_CUDA_ARCHS"
	                echo "Build date: $(date -u +%Y-%m-%d)"
	                echo "Build hash: $LLAMA_COMMIT"
	              } > "$OUT_DIR"/VERSION.txt

	              echo "=== Build complete ==="
	              ls -lh "$OUT_DIR"/

	              echo "=== Cleaning up build directory ==="
	              rm -rf llama.cpp
	            '

	          # Remove Docker image to free space
	          docker rmi "$CUDA_IMAGE" || true

	          # Fix permissions for files created by root in container
	          sudo chown -R "$(id -u):$(id -g)" "$PWD"

	          echo "=== Final disk usage ==="
	          df -h

	      - name: Create tarball
	        env:
	          TARBALL: llama.cpp-${{ needs.check-release.outputs.release_tag }}-cuda-${{ matrix.cuda_version_short }}-${{ matrix.arch.suffix }}.tar.gz
	          PAYLOAD_DIR: cuda-${{ matrix.cuda_version_short }}
	        run: |
	          cd binaries
	          # The archive keeps the cuda-<version>/ directory as its top-level entry.
	          tar -czf "$TARBALL" "$PAYLOAD_DIR"
	          ls -lh -- *.tar.gz

	      - name: Upload artifact
	        uses: actions/upload-artifact@v4
	        with:
	          # Arch suffix in the artifact name so the matrix jobs do not collide
	          # in actions/download-artifact later.
	          name: llama.cpp-cuda-${{ matrix.cuda_version_short }}-${{ matrix.arch.suffix }}
	          path: binaries/*.tar.gz
	          # A missing tarball is a build failure, not a warning.
	          if-no-files-found: error
	          retention-days: 1

	  # Publishes the tarballs from every matrix job as one release of this repo.
	  release:
	    needs: [check-release, build]
	    runs-on: ubuntu-latest
	    permissions:
	      contents: write
	    steps:
	      - name: Checkout
	        uses: actions/checkout@v4

	      - name: Download all artifacts
	        uses: actions/download-artifact@v4
	        with:
	          path: artifacts

	      - name: Prepare release assets
	        run: |
	          mkdir -p release-assets
	          # Flatten whatever directory layout the download step produced.
	          find artifacts -name "*.tar.gz" -exec cp {} release-assets/ \;
	          ls -lh release-assets/

	      - name: Create Release
	        uses: softprops/action-gh-release@v2
	        with:
	          tag_name: ${{ needs.check-release.outputs.release_tag }}
	          name: llama.cpp ${{ needs.check-release.outputs.release_tag }} with CUDA
	          body: |
	            # llama.cpp ${{ needs.check-release.outputs.release_tag }} with CUDA Support

	            Pre-built binaries of llama.cpp with CUDA support for multiple CUDA versions.

	            Source: https://github.com/ggml-org/llama.cpp/releases/tag/${{ needs.check-release.outputs.release_tag }}
	            Commit: ${{ needs.check-release.outputs.release_hash }}

	            ## CUDA Versions
	            - CUDA 12.8 - GPU compute capabilities: 7.5, 8.0, 8.6, 8.9, 9.0, 10.0, 12.0

	            ## Host architectures
	            Tarballs are published per host CPU architecture (Linux):
	            - `-amd64.tar.gz` — x86_64 (most desktops, servers, cloud VMs)
	            - `-arm64.tar.gz` — aarch64 (Grace Hopper / Grace Blackwell / DGX Spark / Ampere Altra)

	            ## GPU compute capability reference
	            - 7.5: Tesla T4, RTX 20xx series, Quadro RTX
	            - 8.0: A100
	            - 8.6: RTX 3000 series
	            - 8.9: RTX 4000 series, L4, L40
	            - 9.0: H100, H200, GH200
	            - 10.0: B200, GB200
	            - 12.0: RTX Pro series, RTX 50xx

	            ## Usage
	            Download the tarball matching your host CPU arch and CUDA version, then extract:
	            ```bash
	            # amd64 host
	            tar -xzf llama.cpp-${{ needs.check-release.outputs.release_tag }}-cuda-12.8-amd64.tar.gz
	            # arm64 host (e.g. Grace Blackwell)
	            tar -xzf llama.cpp-${{ needs.check-release.outputs.release_tag }}-cuda-12.8-arm64.tar.gz
	            # Binaries are unpacked into the cuda-12.8/ directory
	            ./cuda-12.8/llama-cli --help
	            ```
	          files: release-assets/*
	          # Publishing a release without its tarballs should fail loudly.
	          fail_on_unmatched_files: true
	          draft: false
	          prerelease: false

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Build llama.cpp with CUDA #229

Workflow file

Build llama.cpp with CUDA #229

Uh oh!

Workflow file for this run