# chore: added CI model layer caching (#451)
name: CI/CD

on:
  push:
    branches: ["main"]
  pull_request:
    branches: ["**"]  # Adjust branches as needed
  release:
    types: [published]

permissions:
  id-token: write  # Required for OIDC
  contents: read  # Required for checkout
  packages: write  # Required for pushing cache layers to GHCR

jobs:
| test: | |
| runs-on: ubuntu-latest | |
| steps: | |
| - uses: actions/checkout@v4 | |
| - uses: astral-sh/setup-uv@v7 | |
| with: | |
| enable-cache: true | |
| cache-dependency-glob: "**/pyproject.toml" | |
| - name: Cache dependencies | |
| uses: actions/cache@v4 | |
| with: | |
| path: ${{ env.UV_CACHE_DIR }} | |
| key: ${{ runner.os }}-uv-${{ hashFiles('**/pyproject.toml') }} | |
| restore-keys: | | |
| ${{ runner.os }}-uv- | |
| - name: Install dependencies | |
| run: | | |
| export ACLOCAL=aclocal | |
| export AUTOMAKE=automake | |
| uv sync | |
| - name: Run Ruff format check | |
| run: uv run ruff format --check | |
| - name: Run Ruff linting | |
| run: uv run ruff check --exclude packages/verifier/ | |
| - name: Create .env for tests | |
| run: | | |
| cp .env.ci .env | |
| # Set dummy secrets for unit tests | |
| sed -i 's/HF_TOKEN=.*/HF_TOKEN=dummy_token/' .env | |
| sed -i 's/BRAVE_SEARCH_API=.*/BRAVE_SEARCH_API=dummy_api/' .env | |
| sed -i 's/E2B_API_KEY=.*/E2B_API_KEY=dummy_token/' .env | |
| - name: pyright | |
| run: uv run pyright | |
| - name: Run unit tests | |
| run: uv run pytest -v tests/unit | |
| - name: Run integration tests | |
| run: uv run pytest -v tests/integration | |
| start-runner: | |
| name: Start self-hosted EC2 runner | |
| runs-on: ubuntu-24.04 | |
| needs: test | |
| outputs: | |
| label: ${{ steps.start-ec2-runner.outputs.label }} | |
| ec2-instances-ids: ${{ steps.start-ec2-runner.outputs.ec2-instances-ids }} | |
| steps: | |
| - name: Configure AWS credentials | |
| uses: aws-actions/[email protected] | |
| with: | |
| aws-access-key-id: ${{ secrets.GH_AWS_ACCESS_KEY }} | |
| aws-secret-access-key: ${{ secrets.GH_AWS_SECRET_KEY }} | |
| aws-region: "us-east-1" | |
| - name: Start EC2 runner | |
| id: start-ec2-runner | |
| uses: NillionNetwork/[email protected] | |
| with: | |
| mode: start | |
| github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} | |
| runners-per-machine: 3 | |
| number-of-machines: 1 | |
| ec2-image-id: ami-0e70d84403fc045d7 | |
| ec2-instance-type: g6.xlarge | |
| subnet-id: subnet-0bb357f46d1bc355c | |
| security-group-id: sg-022a5cdcf57e9618b | |
| key-name: us-east-1-github-runner-key | |
| iam-role-name: github-runners-us-east-1-github-runner-ec2 | |
| aws-resource-tags: > | |
| [ | |
| {"Key": "Name", "Value": "github-runner-${{ github.run_id }}-${{ github.run_number }}"}, | |
| {"Key": "GitHubRepository", "Value": "${{ github.repository }}"}, | |
| {"Key": "KeyName", "Value": "github-runners-key"}, | |
| {"Key": "Deployment", "Value": "github-runners"}, | |
| {"Key": "Type", "Value": "GithubRunner"}, | |
| {"Key": "User", "Value": "ec2-user"}, | |
| {"Key": "Environment", "Value": "us-east-1"} | |
| ] | |
| build-images: | |
| name: Build ${{ matrix.component }} image | |
| needs: start-runner | |
| runs-on: ${{ needs.start-runner.outputs.label }} | |
| strategy: | |
| matrix: | |
| component: [vllm, attestation, api] | |
| include: | |
| - component: api | |
| build_args: "--target nilai --platform linux/amd64" | |
| - component: vllm | |
| model_to_cache: "openai/gpt-oss-20b" | |
| steps: | |
| - name: Checkout | |
| uses: actions/checkout@v4 | |
| - name: Disable unattended upgrades | |
| run: | | |
| echo "Disabling unattended upgrades to prevent interference with CI builds..." | |
| # Stop unattended-upgrades service | |
| sudo systemctl stop unattended-upgrades || true | |
| sudo systemctl disable unattended-upgrades || true | |
| # Kill any running unattended-upgrades processes | |
| sudo pkill -f unattended-upgrade || true | |
| # Remove or disable the unattended-upgrades configuration | |
| sudo systemctl mask unattended-upgrades || true | |
| # Wait for any ongoing package operations to complete | |
| while sudo fuser /var/lib/dpkg/lock-frontend >/dev/null 2>&1; do | |
| echo "Waiting for package manager lock to be released..." | |
| sleep 5 | |
| done | |
| # Disable automatic updates in APT configuration | |
| echo 'APT::Periodic::Update-Package-Lists "0";' | sudo tee /etc/apt/apt.conf.d/20auto-upgrades | |
| echo 'APT::Periodic::Unattended-Upgrade "0";' | sudo tee -a /etc/apt/apt.conf.d/20auto-upgrades | |
| echo "✅ Unattended upgrades disabled successfully" | |
| - name: Install Docker Buildx plugin | |
| run: | | |
| set -euo pipefail | |
| BUILDX_VERSION="v0.14.1" | |
| mkdir -p ~/.docker/cli-plugins | |
| curl -sSL "https://github.com/docker/buildx/releases/download/${BUILDX_VERSION}/buildx-${BUILDX_VERSION}.linux-amd64" \ | |
| -o ~/.docker/cli-plugins/docker-buildx | |
| chmod +x ~/.docker/cli-plugins/docker-buildx | |
| docker buildx version | |
| - name: Set up Docker Buildx | |
| uses: docker/setup-buildx-action@v3 | |
| with: | |
| driver-opts: image=moby/buildkit:latest | |
| buildkitd-flags: --allow-insecure-entitlement security.insecure --allow-insecure-entitlement network.host | |
| - name: Login to GitHub Container Registry | |
| uses: docker/login-action@v3 | |
| with: | |
| registry: ghcr.io | |
| username: ${{ github.actor }} | |
| password: ${{ secrets.GITHUB_TOKEN }} | |
| - name: Check system resources | |
| run: | | |
| echo "=== System Resources ===" | |
| df -h | |
| free -h | |
| echo "=== Docker Info ===" | |
| docker info | |
| echo "=== Docker System Usage ===" | |
| docker system df | |
| - name: Pre-pull Docker base image (for vllm) | |
| if: matrix.component == 'vllm' | |
| run: | | |
| echo "Pre-pulling vllm base image to avoid rate limiting during build..." | |
| docker pull vllm/vllm-openai:v0.10.1 | |
| - name: Setup HuggingFace cache directory | |
| if: matrix.component == 'vllm' && matrix.model_to_cache != '' | |
| run: | | |
| mkdir -p /home/ec2-user/.cache/huggingface | |
| echo "Cache directory created at /home/ec2-user/.cache/huggingface" | |
| - name: Restore model from GHCR | |
| if: matrix.component == 'vllm' && matrix.model_to_cache != '' | |
| id: restore-model | |
| run: | | |
| MODEL_CACHE_DIR="/home/ec2-user/.cache/huggingface" | |
| HF_DIR_NAME="models--$(echo ${{ matrix.model_to_cache }} | sed 's/\//--/g')" | |
| FULL_PATH="$MODEL_CACHE_DIR/$HF_DIR_NAME" | |
| if [ -d "$FULL_PATH" ]; then | |
| echo "Model found on host filesystem at $FULL_PATH" | |
| echo "Skipping GHCR pull to save I/O." | |
| echo "cache-hit=true" >> $GITHUB_OUTPUT | |
| exit 0 | |
| fi | |
| MODEL_IMAGE="ghcr.io/${{ github.repository_owner }}/nilai-model-cache:${{ matrix.model_to_cache }}-v1" | |
| MODEL_IMAGE=$(echo "$MODEL_IMAGE" | tr '[:upper:]' '[:lower:]') | |
| echo "Attempting to pull model cache image: $MODEL_IMAGE" | |
| if docker pull "$MODEL_IMAGE"; then | |
| echo "Image found. Copying model files to host..." | |
| mkdir -p "$MODEL_CACHE_DIR" | |
| CONTAINER_ID=$(docker create "$MODEL_IMAGE") | |
| docker cp "$CONTAINER_ID":/model/. "$MODEL_CACHE_DIR/" | |
| docker rm "$CONTAINER_ID" | |
| echo "Model restored from GHCR." | |
| echo "cache-hit=true" >> $GITHUB_OUTPUT | |
| else | |
| echo "Model cache not found in GHCR." | |
| echo "cache-hit=false" >> $GITHUB_OUTPUT | |
| fi | |
| - name: DEBUG - Verify Cache Structure | |
| if: matrix.component == 'vllm' | |
| run: | | |
| echo "Listing /home/ec2-user/.cache/huggingface contents:" | |
| ls -F /home/ec2-user/.cache/huggingface/ || echo "Directory not found" | |
| echo "Checking for specific model folder:" | |
| ls -F /home/ec2-user/.cache/huggingface/models--openai--gpt-oss-20b/ || echo "Model folder not found" | |
| echo "Checking snapshot content (first few files):" | |
| find /home/ec2-user/.cache/huggingface -maxdepth 4 | head -n 10 | |
| - name: Setup uv for model download | |
| if: matrix.component == 'vllm' && matrix.model_to_cache != '' && steps.restore-model.outputs.cache-hit != 'true' | |
| uses: astral-sh/setup-uv@v7 | |
| with: | |
| enable-cache: true | |
| cache-dependency-glob: "**/pyproject.toml" | |
| - name: Install dependencies for model download | |
| if: matrix.component == 'vllm' && matrix.model_to_cache != '' && steps.restore-model.outputs.cache-hit != 'true' | |
| run: | | |
| apt-get update && apt-get install curl git pkg-config automake file python3.12-dev -y | |
| export ACLOCAL=aclocal | |
| export AUTOMAKE=automake | |
| uv sync | |
| - name: Download HuggingFace model | |
| if: matrix.component == 'vllm' && matrix.model_to_cache != '' && steps.restore-model.outputs.cache-hit != 'true' | |
| env: | |
| HF_TOKEN: ${{ secrets.HF_TOKEN }} | |
| run: | | |
| echo "Downloading model ${{ matrix.model_to_cache }} to cache..." | |
| uv run python -c "from huggingface_hub import snapshot_download; import os; os.environ['HF_TOKEN'] = '${{ secrets.HF_TOKEN }}'; snapshot_download('${{ matrix.model_to_cache }}', cache_dir='/home/ec2-user/.cache/huggingface'); print('Model cached successfully')" \ | |
| || { echo "Failed to download model"; exit 1; } | |
| echo "Model download completed successfully" | |
| - name: Save model to GHCR | |
| if: matrix.component == 'vllm' && matrix.model_to_cache != '' && steps.restore-model.outputs.cache-hit != 'true' | |
| run: | | |
| echo "Saving model to GHCR..." | |
| MODEL_IMAGE="ghcr.io/${{ github.repository_owner }}/nilai-model-cache:${{ matrix.model_to_cache }}-v1" | |
| MODEL_IMAGE=$(echo "$MODEL_IMAGE" | tr '[:upper:]' '[:lower:]') | |
| echo "FROM scratch" > Dockerfile.model | |
| echo "COPY . /model" >> Dockerfile.model | |
| cd /home/ec2-user/.cache/huggingface | |
| echo "Building cache image..." | |
| docker build -t "$MODEL_IMAGE" -f $GITHUB_WORKSPACE/Dockerfile.model . | |
| echo "Pushing cache image to GHCR..." | |
| docker push "$MODEL_IMAGE" | |
| echo "Model cached to GHCR." | |
| - name: Build ${{ matrix.component }} image | |
| run: | | |
| echo "Building ${{ matrix.component }} image..." | |
| # Convert repository name to lowercase for Docker registry compatibility | |
| REPO_LOWER=$(echo "${{ github.repository }}" | tr '[:upper:]' '[:lower:]') | |
| # Set cache references | |
| CACHE_REF="ghcr.io/${REPO_LOWER}/nilai-${{ matrix.component }}:buildcache" | |
| # Check if cache exists and is accessible | |
| echo "Checking cache availability..." | |
| CACHE_ARGS="" | |
| if docker manifest inspect ${CACHE_REF} >/dev/null 2>&1; then | |
| echo "✅ Cache found, using registry cache" | |
| CACHE_ARGS="--cache-from=type=registry,ref=${CACHE_REF} --cache-to=type=registry,ref=${CACHE_REF},mode=max" | |
| else | |
| echo "⚠️ No cache found or cache inaccessible, building without import cache" | |
| CACHE_ARGS="--cache-to=type=registry,ref=${CACHE_REF},mode=max" | |
| fi | |
| # Function to build with retry logic | |
| build_with_retry() { | |
| local attempt=1 | |
| local max_attempts=3 | |
| while [ $attempt -le $max_attempts ]; do | |
| echo "🔄 Build attempt $attempt of $max_attempts..." | |
| if docker buildx build \ | |
| -t nillion/nilai-${{ matrix.component }}:latest \ | |
| -f docker/${{ matrix.component }}.Dockerfile \ | |
| ${CACHE_ARGS} \ | |
| --load \ | |
| ${{ matrix.build_args || '' }} \ | |
| .; then | |
| echo "✅ Build succeeded on attempt $attempt" | |
| return 0 | |
| else | |
| echo "❌ Build failed on attempt $attempt" | |
| if [ $attempt -lt $max_attempts ]; then | |
| echo "⏳ Waiting 30 seconds before retry..." | |
| sleep 30 | |
| # Clean up any partial builds | |
| echo "🧹 Cleaning up Docker system..." | |
| docker system prune -f || true | |
| # On retry, disable cache export to reduce complexity | |
| if [ $attempt -eq 2 ]; then | |
| echo "⚠️ Disabling cache export for retry..." | |
| CACHE_ARGS="--cache-from=type=registry,ref=${CACHE_REF}" | |
| fi | |
| # On final retry, disable all cache | |
| if [ $attempt -eq 3 ]; then | |
| echo "⚠️ Disabling all cache for final retry..." | |
| CACHE_ARGS="" | |
| fi | |
| fi | |
| attempt=$((attempt + 1)) | |
| fi | |
| done | |
| echo "💥 All build attempts failed" | |
| return 1 | |
| } | |
| # Execute build with retry logic | |
| build_with_retry | |
| echo "✅ ${{ matrix.component }} build completed successfully" | |
| e2e-tests: | |
| name: E2E Tests | |
| needs: [start-runner, build-images] | |
| runs-on: ${{ needs.start-runner.outputs.label }} | |
| steps: | |
| - name: Checkout | |
| uses: actions/checkout@v4 | |
| - name: Disable unattended upgrades | |
| run: | | |
| echo "Disabling unattended upgrades to prevent interference with CI builds..." | |
| # Stop unattended-upgrades service | |
| sudo systemctl stop unattended-upgrades || true | |
| sudo systemctl disable unattended-upgrades || true | |
| # Kill any running unattended-upgrades processes | |
| sudo pkill -f unattended-upgrade || true | |
| # Remove or disable the unattended-upgrades configuration | |
| sudo systemctl mask unattended-upgrades || true | |
| # Wait for any ongoing package operations to complete | |
| while sudo fuser /var/lib/dpkg/lock-frontend >/dev/null 2>&1; do | |
| echo "Waiting for package manager lock to be released..." | |
| sleep 5 | |
| done | |
| # Disable automatic updates in APT configuration | |
| echo 'APT::Periodic::Update-Package-Lists "0";' | sudo tee /etc/apt/apt.conf.d/20auto-upgrades | |
| echo 'APT::Periodic::Unattended-Upgrade "0";' | sudo tee -a /etc/apt/apt.conf.d/20auto-upgrades | |
| echo "✅ Unattended upgrades disabled successfully" | |
| - uses: astral-sh/setup-uv@v7 | |
| with: | |
| enable-cache: true | |
| cache-dependency-glob: "**/pyproject.toml" | |
| - name: Install dependencies | |
| run: | | |
| apt-get update && apt-get install curl git pkg-config automake file python3.12-dev -y | |
| export ACLOCAL=aclocal | |
| export AUTOMAKE=automake | |
| uv sync | |
| - name: Create .env | |
| run: | | |
| cp .env.ci .env | |
| # Copy secret into .env replacing the existing HF_TOKEN | |
| sed -i 's/HF_TOKEN=.*/HF_TOKEN=${{ secrets.HF_TOKEN }}/' .env | |
| sed -i 's/BRAVE_SEARCH_API=.*/BRAVE_SEARCH_API=${{ secrets.BRAVE_SEARCH_API }}/' .env | |
| sed -i 's/NILDB_BUILDER_PRIVATE_KEY=.*/NILDB_BUILDER_PRIVATE_KEY=${{ secrets.NILDB_BUILDER_PRIVATE_KEY }}/' .env | |
| sed -i 's/NILDB_COLLECTION=.*/NILDB_COLLECTION=${{ secrets.NILDB_COLLECTION }}/' .env | |
| - name: Compose docker-compose.yml | |
| run: python3 ./scripts/docker-composer.py --dev -f docker/compose/docker-compose.gpt-20b-gpu.ci.yml -o development-compose.yml | |
| - name: GPU stack versions (non-fatal) | |
| shell: bash | |
| run: | | |
| set +e # never fail this step | |
| echo "::group::Host & kernel" | |
| uname -a || true | |
| echo "Kernel: $(uname -r 2>/dev/null || echo unknown)" | |
| test -e /var/run/reboot-required && echo "Reboot flag: PRESENT" || echo "Reboot flag: none" | |
| echo "::endgroup::" | |
| echo "::group::NVIDIA driver" | |
| if command -v nvidia-smi >/dev/null 2>&1; then | |
| nvidia-smi || true | |
| echo "Driver version (nvidia-smi): $(nvidia-smi --query-gpu=driver_version --format=csv,noheader 2>/dev/null | head -n1 || echo unknown)" | |
| echo "GPU(s):"; nvidia-smi -L || true | |
| else | |
| echo "nvidia-smi: not found" | |
| fi | |
| if [ -r /proc/driver/nvidia/version ]; then | |
| echo "--- /proc/driver/nvidia/version ---" | |
| cat /proc/driver/nvidia/version || true | |
| else | |
| echo "/proc/driver/nvidia/version: not present" | |
| fi | |
| command -v modinfo >/dev/null 2>&1 && { echo "--- modinfo nvidia (head) ---"; modinfo nvidia 2>/dev/null | head -n 20 || true; } || true | |
| echo "::endgroup::" | |
| echo "::group::DKMS status" | |
| command -v dkms >/dev/null 2>&1 && dkms status | grep -i nvidia || echo "dkms or nvidia dkms info not present" | |
| echo "::endgroup::" | |
| echo "::group::CUDA toolkit/runtime" | |
| if command -v nvcc >/dev/null 2>&1; then | |
| nvcc --version || true | |
| else | |
| echo "nvcc: not found" | |
| fi | |
| echo "libcudart in ldconfig:" | |
| ldconfig -p 2>/dev/null | grep -i libcudart || echo "libcudart not found in ldconfig cache" | |
| echo "NCCL packages:" | |
| dpkg -l 2>/dev/null | grep -iE '^ii\s+libnccl' || echo "NCCL not installed (Debian/Ubuntu dpkg check)" | |
| echo "::endgroup::" | |
| echo "::group::Container stack" | |
| docker --version || echo "docker: not found" | |
| docker info 2>/dev/null | grep -iE 'Runtimes|nvidia' || echo "docker info: no nvidia runtime line found" | |
| containerd --version 2>/dev/null || echo "containerd: not found" | |
| runc --version 2>/dev/null || echo "runc: not found" | |
| echo "::endgroup::" | |
| echo "::group::NVIDIA container runtime/toolkit" | |
| # Legacy/runtime binaries | |
| if command -v nvidia-container-runtime >/dev/null 2>&1; then | |
| nvidia-container-runtime --version || nvidia-container-runtime -v || true | |
| else | |
| echo "nvidia-container-runtime: not found" | |
| fi | |
| # Toolkit binaries (newer distros) | |
| if command -v nvidia-ctk >/dev/null 2>&1; then | |
| nvidia-ctk --version || true | |
| nvidia-ctk runtime configure --help >/dev/null 2>&1 || true | |
| else | |
| echo "nvidia-ctk: not found" | |
| fi | |
| if command -v nvidia-container-toolkit >/dev/null 2>&1; then | |
| nvidia-container-toolkit --version || true | |
| else | |
| echo "nvidia-container-toolkit: not found" | |
| fi | |
| echo "libnvidia-container packages:" | |
| dpkg -l 2>/dev/null | grep -iE '^ii\s+(libnvidia-container1|libnvidia-container-tools)\s' || echo "libnvidia-container packages not found (dpkg)" | |
| # Show runtime config if present | |
| if [ -f /etc/nvidia-container-runtime/config.toml ]; then | |
| echo "--- /etc/nvidia-container-runtime/config.toml (head) ---" | |
| sed -n '1,120p' /etc/nvidia-container-runtime/config.toml || true | |
| else | |
| echo "/etc/nvidia-container-runtime/config.toml: not present" | |
| fi | |
| echo "::endgroup::" | |
| echo "::group::Apt logs (NVIDIA-related entries)" | |
| for f in /var/log/apt/history.log /var/log/apt/term.log /var/log/unattended-upgrades/unattended-upgrades.log; do | |
| if [[ -f "$f" ]]; then | |
| echo "--- scanning $f" | |
| grep -H -i -E 'nvidia|cuda|container-toolkit' "$f" || echo "no recent NVIDIA entries" | |
| else | |
| echo "missing: $f" | |
| fi | |
| done | |
| echo "::endgroup::" | |
| - name: Start Services | |
| run: | | |
| docker-compose -f development-compose.yml up -d | |
| docker ps -a | |
| - name: Wait for services to be healthy | |
| run: bash scripts/wait_for_ci_services.sh | |
| - name: Run E2E tests for NUC | |
| run: | | |
| set -e | |
| export ENVIRONMENT=ci | |
| export AUTH_STRATEGY=nuc | |
| uv run pytest -v tests/e2e | |
| - name: Run E2E tests for API Key | |
| run: | | |
| set -e | |
| # Create a user with a rate limit of 1000 requests per minute, hour, and day | |
| export AUTH_TOKEN=$(docker exec nilai-api uv run src/nilai_api/commands/add_user.py --name test1 --ratelimit-minute 1000 --ratelimit-hour 1000 --ratelimit-day 1000 | jq ".apikey" -r) | |
| export ENVIRONMENT=ci | |
| # Set the environment variable for the API key | |
| export AUTH_STRATEGY=api_key | |
| uv run pytest -v tests/e2e | |
| - name: Stop Services | |
| run: | | |
| docker-compose -f development-compose.yml down -v | |
| push-images: | |
| name: Push ${{ matrix.component }} to ECR | |
| needs: [start-runner, build-images, e2e-tests] | |
| runs-on: ${{ needs.start-runner.outputs.label }} | |
| if: (github.event_name == 'push' && github.ref == 'refs/heads/main') || github.event_name == 'release' | |
| strategy: | |
| matrix: | |
| component: [vllm, attestation, api] | |
| steps: | |
| - name: Configure AWS credentials for ECR | |
| uses: aws-actions/configure-aws-credentials@v4 | |
| with: | |
| role-to-assume: "arn:aws:iam::054037142884:role/nilAI-github" | |
| aws-region: "us-east-1" | |
| - name: Login to Amazon ECR | |
| id: login-ecr | |
| uses: aws-actions/amazon-ecr-login@v2 | |
| with: | |
| registry-type: public | |
| - name: Set image tags | |
| id: image-tags | |
| run: | | |
| IMAGE_TAG="${{ github.event_name == 'release' && github.ref_name || github.sha }}" | |
| echo "image_tag=${IMAGE_TAG}" >> $GITHUB_OUTPUT | |
| - name: Tag and push ${{ matrix.component }} to ECR | |
| env: | |
| ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }} | |
| ECR_REGISTRY_ALIAS: k5d9x2g2 | |
| IMAGE_TAG: ${{ steps.image-tags.outputs.image_tag }} | |
| run: | | |
| echo "Tagging and pushing ${{ matrix.component }} image to ECR..." | |
| # Tag for ECR | |
| docker tag nillion/nilai-${{ matrix.component }}:latest ${ECR_REGISTRY}/${ECR_REGISTRY_ALIAS}/nilai-${{ matrix.component }}:${IMAGE_TAG} | |
| # Push to ECR | |
| docker push ${ECR_REGISTRY}/${ECR_REGISTRY_ALIAS}/nilai-${{ matrix.component }}:${IMAGE_TAG} | |
| echo "## Pushed ${{ matrix.component }} Image" >> $GITHUB_STEP_SUMMARY | |
| echo "- ${{ matrix.component }}: \`${ECR_REGISTRY}/${ECR_REGISTRY_ALIAS}/nilai-${{ matrix.component }}:${IMAGE_TAG}\`" >> $GITHUB_STEP_SUMMARY | |
| stop-runner: | |
| name: Stop self-hosted EC2 runner | |
| needs: [start-runner, build-images, e2e-tests, push-images] | |
| runs-on: ubuntu-24.04 | |
| if: ${{ always() }} | |
| steps: | |
| - name: Configure AWS credentials | |
| uses: aws-actions/configure-aws-credentials@v1 | |
| with: | |
| aws-access-key-id: ${{ secrets.GH_AWS_ACCESS_KEY }} | |
| aws-secret-access-key: ${{ secrets.GH_AWS_SECRET_KEY }} | |
| aws-region: "us-east-1" | |
| - name: Stop EC2 runner | |
| uses: NillionNetwork/[email protected] | |
| with: | |
| mode: stop | |
| github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} | |
| label: ${{ needs.start-runner.outputs.label }} | |
| ec2-instances-ids: ${{ needs.start-runner.outputs.ec2-instances-ids }} |