Merge pull request #121 from stratika/cleanup/pdf-file #351

Workflow file for this run

.github/workflows/build-and-run.yml at 5294524

	# Display name of the workflow.
	name: GPULlama3 Build & Run

	# Triggers: pushes to main, and pull requests targeting main when they are
	# opened, synchronized or reopened.
	on:
	  push:
	    branches:
	      - main
	  pull_request:
	    branches:
	      - main
	    types:
	      - opened
	      - synchronize
	      - reopened

	# Workflow-level environment, visible to every job below.
	# NOTE(review): GRAAL_JARS and MODELS_DIR are not referenced directly in this
	# file; presumably the local composite actions read them - confirm.
	env:
	  JAVA_VERSION: 21.0.2-open
	  GRAAL_JARS: /opt/graalJars
	  MODELS_DIR: /opt/models
	  # Port the Quarkus integration job binds to and probes.
	  QUARKUS_PORT: 8081
	  # Performance history file that is committed back to the repo on push to main.
	  PERF_HISTORY_FILE: docs/perf-history.jsonl

	jobs:
	# Formatting gate: the build job (and therefore every later job) waits on it.
	code-quality:
	  # Run only in the upstream repository.
	  if: github.repository == 'beehive-lab/GPULlama3.java'
	  runs-on: self-hosted
	  timeout-minutes: 30

	  steps:
	    - name: Checkout GPULlama3
	      uses: actions/checkout@v4

	    - name: Check code formatting (Spotless)
	      # NOTE(review): the Spotless invocation below is commented out, so this
	      # step only changes directory and always succeeds. Re-enable the command
	      # to make this job an actual formatting check.
	      run: |
	        cd ${{ github.workspace }}
	        # ./mvnw -T12C -Pspotless spotless:check

	# Build: TornadoVM → GPULlama3 → Quarkus LangChain4j
	# max-parallel: 1 ensures the opencl and ptx variants run sequentially so
	# there are no workspace conflicts between matrix jobs.
	build:
	  if: github.repository == 'beehive-lab/GPULlama3.java'
	  runs-on: [self-hosted]
	  needs: code-quality
	  timeout-minutes: 30
	  strategy:
	    fail-fast: true
	    max-parallel: 1
	    matrix:
	      backend:
	        - name: opencl
	        - name: ptx

	  steps:
	    - name: Checkout GPULlama3
	      uses: actions/checkout@v4
	      with:
	        # Skip checkout's pre-fetch clean/reset so files already present in
	        # the self-hosted workspace are left in place.
	        clean: false

	    - name: Set up Java
	      uses: ./.github/actions/setup-java
	      with:
	        java_version: ${{ env.JAVA_VERSION }}

	    - name: Setup TornadoVM
	      uses: ./.github/actions/setup-tornadovm
	      env:
	        # One TornadoVM location per backend inside the runner tool cache.
	        TORNADO_ROOT: ${{ runner.tool_cache }}/tornadovm/tornadovm-${{ matrix.backend.name }}
	      with:
	        backend: ${{ matrix.backend.name }}

	    - name: Build GPULlama3.java
	      run: |
	        tornado --version
	        # Strip any pre-existing -SNAPSHOT suffix before appending, making this step idempotent
	        # across sequential matrix variants (ptx runs after opencl on the same workspace).
	        BASE_VERSION=$(./mvnw help:evaluate -Dexpression=project.version -q -DforceStdout | sed 's/-SNAPSHOT$//')
	        GPULLAMA3_VERSION="${BASE_VERSION}-SNAPSHOT"
	        echo "GPULlama3 version: $GPULLAMA3_VERSION"
	        ./mvnw versions:set -DnewVersion="$GPULLAMA3_VERSION"
	        ./mvnw clean install -DskipTests
	        # Export the version so the Quarkus build step below can pin it.
	        echo "GPULLAMA3_VERSION=$GPULLAMA3_VERSION" >> "$GITHUB_ENV"

	    - name: Clone Quarkus LangChain4j
	      run: |
	        # Always start from a fresh shallow clone; the workspace is not cleaned.
	        rm -rf quarkus-langchain4j
	        git clone --depth 1 https://github.com/quarkiverse/quarkus-langchain4j.git

	    - name: Build Quarkus LangChain4j
	      run: |
	        cd ${{ github.workspace }}/quarkus-langchain4j
	        # Point the <gpu-llama3.version> property at the version installed above.
	        sed -i 's/<gpu-llama3\.version>.*<\/gpu-llama3\.version>/<gpu-llama3.version>'"$GPULLAMA3_VERSION"'<\/gpu-llama3.version>/' pom.xml
	        # -Dtornado activates the TornadoVM profile; -am builds only the gpu-llama3 module + deps
	        mvn clean install -pl integration-tests/gpu-llama3 -am -DskipTests -Dtornado

	# Standalone inference: after the build, runs every model file listed below
	# through the local run-inference action once per backend, then uploads the
	# per-run metrics JSON files as an artifact.
	# NOTE(review): metrics_file is presumably the path run-inference writes its
	# measurements to - confirm against the action definition.
	standalone-inference:
	  if: github.repository == 'beehive-lab/GPULlama3.java'
	  runs-on:
	    - self-hosted
	  needs: build
	  timeout-minutes: 30
	  strategy:
	    fail-fast: true
	    matrix:
	      backend:
	        - name: opencl
	        - name: ptx

	  steps:
	    - name: Checkout GPULlama3
	      uses: actions/checkout@v4
	      with:
	        clean: false

	    - name: Set up Java
	      uses: ./.github/actions/setup-java
	      with:
	        java_version: ${{ env.JAVA_VERSION }}

	    - name: Setup TornadoVM
	      uses: ./.github/actions/setup-tornadovm
	      env:
	        TORNADO_ROOT: ${{ runner.tool_cache }}/tornadovm/tornadovm-${{ matrix.backend.name }}
	      with:
	        backend: ${{ matrix.backend.name }}

	    # ---- FP16 models --------------------------------------------------
	    # Standalone mode is exercised per model family and quantization.
	    # Note: these variants could be expressed as a matrix instead.
	    - name: FP16 - Run Llama-3.2-1B-Instruct-F16.gguf - Standard
	      uses: ./.github/actions/run-inference
	      with:
	        backend: ${{ matrix.backend.name }}
	        model_file: Llama-3.2-1B-Instruct-F16.gguf
	        model: Llama-3.2-1B-Instruct
	        quantization: F16
	        configuration: standard
	        metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-llama-1b-f16-standard.json

	    - name: FP16 - Run Llama-3.2-1B-Instruct-F16.gguf - Prefill-Decode
	      uses: ./.github/actions/run-inference
	      with:
	        backend: ${{ matrix.backend.name }}
	        model_file: Llama-3.2-1B-Instruct-F16.gguf
	        model: Llama-3.2-1B-Instruct
	        quantization: F16
	        configuration: prefill-decode
	        flags: --with-prefill-decode
	        metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-llama-1b-f16-prefill-decode.json

	    - name: FP16 - Run Llama-3.2-1B-Instruct-F16.gguf - Batch-Prefill-Decode
	      uses: ./.github/actions/run-inference
	      with:
	        backend: ${{ matrix.backend.name }}
	        model_file: Llama-3.2-1B-Instruct-F16.gguf
	        model: Llama-3.2-1B-Instruct
	        quantization: F16
	        configuration: batch-prefill-decode
	        flags: --with-prefill-decode --batch-prefill-size 32
	        metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-llama-1b-f16-batch-prefill-decode.json

	    # The two CUDA-graph variants only run for the ptx backend.
	    - name: PTX - FP16 - Run Llama-3.2-1B-Instruct-F16.gguf - Prefill-Decode-CUDA-Graphs
	      if: matrix.backend.name == 'ptx'
	      uses: ./.github/actions/run-inference
	      with:
	        backend: ${{ matrix.backend.name }}
	        model_file: Llama-3.2-1B-Instruct-F16.gguf
	        model: Llama-3.2-1B-Instruct
	        quantization: F16
	        configuration: prefill-decode-cuda-graphs
	        flags: --with-prefill-decode --cuda-graphs
	        metrics_file: ${{ runner.temp }}/metrics-ptx-llama-1b-f16-prefill-decode-cuda-graphs.json

	    - name: PTX - FP16 - Run Llama-3.2-1B-Instruct-F16.gguf - Batch-Prefill-Decode-CUDA-Graphs
	      if: matrix.backend.name == 'ptx'
	      uses: ./.github/actions/run-inference
	      with:
	        backend: ${{ matrix.backend.name }}
	        model_file: Llama-3.2-1B-Instruct-F16.gguf
	        model: Llama-3.2-1B-Instruct
	        quantization: F16
	        configuration: batch-prefill-decode-cuda-graphs
	        flags: --with-prefill-decode --batch-prefill-size 32 --cuda-graphs
	        metrics_file: ${{ runner.temp }}/metrics-ptx-llama-1b-f16-batch-prefill-decode-cuda-graphs.json

	    - name: FP16 - Run Qwen3-4B-f16.gguf
	      uses: ./.github/actions/run-inference
	      with:
	        backend: ${{ matrix.backend.name }}
	        model_file: Qwen3-4B-f16.gguf
	        model: Qwen3-4B
	        quantization: F16
	        configuration: standard
	        metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-qwen3-4b-f16-standard.json

	    - name: FP16 - Run Mistral-7B-Instruct-v0.3.fp16.gguf
	      uses: ./.github/actions/run-inference
	      with:
	        backend: ${{ matrix.backend.name }}
	        model_file: Mistral-7B-Instruct-v0.3.fp16.gguf
	        model: Mistral-7B-Instruct-v0.3
	        quantization: F16
	        configuration: standard
	        metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-mistral-7b-fp16-standard.json

	    - name: FP16 - Run Qwen2.5-1.5b-instruct-fp16.gguf
	      uses: ./.github/actions/run-inference
	      with:
	        backend: ${{ matrix.backend.name }}
	        model_file: qwen2.5-1.5b-instruct-fp16.gguf
	        model: Qwen2.5-1.5B-Instruct
	        quantization: F16
	        configuration: standard
	        metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-qwen2-5-1-5b-fp16-standard.json

	    - name: FP16 - Run Phi-3-mini-4k-instruct-fp16.gguf
	      uses: ./.github/actions/run-inference
	      with:
	        backend: ${{ matrix.backend.name }}
	        model_file: Phi-3-mini-4k-instruct-fp16.gguf
	        model: Phi-3-mini-4k-instruct
	        quantization: F16
	        configuration: standard
	        metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-phi3-mini-fp16-standard.json

	    - name: FP16 - Run Granite-3.2-2b-instruct-f16.gguf
	      uses: ./.github/actions/run-inference
	      with:
	        backend: ${{ matrix.backend.name }}
	        model_file: granite-3.2-2b-instruct-f16.gguf
	        model: Granite-3.2-2B-Instruct
	        quantization: F16
	        configuration: standard
	        metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-granite-3-2-2b-f16-standard.json

	    - name: FP16 - Run Granite-4.0-1b-F16.gguf
	      uses: ./.github/actions/run-inference
	      with:
	        backend: ${{ matrix.backend.name }}
	        model_file: granite-4.0-1b-F16.gguf
	        model: Granite-4.0-1B
	        quantization: F16
	        configuration: standard
	        metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-granite-4-0-1b-f16-standard.json

	    # ---- Q8_0 models --------------------------------------------------
	    - name: Q8 - Run Llama-3.2-1B-Instruct-Q8_0.gguf
	      uses: ./.github/actions/run-inference
	      with:
	        backend: ${{ matrix.backend.name }}
	        model_file: Llama-3.2-1B-Instruct-Q8_0.gguf
	        model: Llama-3.2-1B-Instruct
	        quantization: Q8_0
	        configuration: standard
	        metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-llama-1b-q8-standard.json

	    - name: Q8 - Run Qwen3-0.6B-Q8_0.gguf
	      uses: ./.github/actions/run-inference
	      with:
	        backend: ${{ matrix.backend.name }}
	        model_file: Qwen3-0.6B-Q8_0.gguf
	        model: Qwen3-0.6B
	        quantization: Q8_0
	        configuration: standard
	        metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-qwen3-0-6b-q8-standard.json

	    - name: Q8 - Run Phi-3-mini-4k-instruct-Q8_0.gguf
	      uses: ./.github/actions/run-inference
	      with:
	        backend: ${{ matrix.backend.name }}
	        model_file: Phi-3-mini-4k-instruct-Q8_0.gguf
	        model: Phi-3-mini-4k-instruct
	        quantization: Q8_0
	        configuration: standard
	        metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-phi3-mini-q8-standard.json

	    - name: Q8 - Run Qwen2.5-1.5b-instruct-q8_0.gguf
	      uses: ./.github/actions/run-inference
	      with:
	        backend: ${{ matrix.backend.name }}
	        model_file: qwen2.5-1.5b-instruct-q8_0.gguf
	        model: Qwen2.5-1.5B-Instruct
	        quantization: Q8_0
	        configuration: standard
	        metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-qwen2-5-1-5b-q8-standard.json

	    - name: Q8 - Run Mistral-7B-Instruct-v0.3.Q8_0.gguf
	      uses: ./.github/actions/run-inference
	      with:
	        backend: ${{ matrix.backend.name }}
	        model_file: Mistral-7B-Instruct-v0.3.Q8_0.gguf
	        model: Mistral-7B-Instruct-v0.3
	        quantization: Q8_0
	        configuration: standard
	        metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-mistral-7b-q8-standard.json

	    - name: Q8 - Run Granite-3.2-2b-instruct-Q8_0.gguf
	      uses: ./.github/actions/run-inference
	      with:
	        backend: ${{ matrix.backend.name }}
	        model_file: granite-3.2-2b-instruct-Q8_0.gguf
	        model: Granite-3.2-2B-Instruct
	        quantization: Q8_0
	        configuration: standard
	        metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-granite-3-2-2b-q8-standard.json

	    - name: Q8 - Run Granite-4.0-1b-Q8_0.gguf
	      uses: ./.github/actions/run-inference
	      with:
	        backend: ${{ matrix.backend.name }}
	        model_file: granite-4.0-1b-Q8_0.gguf
	        model: Granite-4.0-1B
	        quantization: Q8_0
	        configuration: standard
	        metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-granite-4-0-1b-q8-standard.json

	    # Hand the metrics files to the publish job. always() keeps the upload
	    # running even when an earlier inference step failed.
	    - name: Upload metrics artifacts
	      if: always()
	      uses: actions/upload-artifact@v4
	      with:
	        name: metrics-${{ matrix.backend.name }}-${{ github.run_id }}
	        path: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-*.json
	        if-no-files-found: warn

	# Test integration with Quarkus-langchain4j
	# Starts the Quarkus app built by the build job, waits for its health
	# endpoint, calls the blocking and streaming chat endpoints, then stops it.
	quarkus-langchain4j-integration:
	  if: github.repository == 'beehive-lab/GPULlama3.java'
	  runs-on: [self-hosted]
	  needs: build
	  timeout-minutes: 10
	  strategy:
	    fail-fast: true
	    matrix:
	      backend:
	        - name: opencl
	        - name: ptx

	  steps:
	    - name: Checkout GPULlama3
	      uses: actions/checkout@v4
	      with:
	        # Keep the quarkus-langchain4j checkout left in the workspace by the build job.
	        clean: false

	    - name: Set up Java
	      uses: ./.github/actions/setup-java
	      with:
	        java_version: ${{ env.JAVA_VERSION }}

	    - name: Setup TornadoVM
	      uses: ./.github/actions/setup-tornadovm
	      env:
	        TORNADO_ROOT: ${{ runner.tool_cache }}/tornadovm/tornadovm-${{ matrix.backend.name }}
	      with:
	        backend: ${{ matrix.backend.name }}

	    - name: Verify GPULlama3 Dependency
	      run: |
	        cd ${{ github.workspace }}/quarkus-langchain4j/integration-tests/gpu-llama3
	        # grep exits non-zero (failing the step) when the artifact is absent from the tree.
	        mvn dependency:tree | grep "io.github.beehive-lab:gpu-llama3"

	    - name: Start Quarkus Application
	      run: |
	        cd ${{ github.workspace }}/quarkus-langchain4j/integration-tests/gpu-llama3
	        java @"$TORNADOVM_HOME/tornado-argfile" \
	          -Dtornado.device.memory=8GB \
	          -Dquarkus.http.port=$QUARKUS_PORT \
	          -jar target/quarkus-app/quarkus-run.jar &
	        APP_PID=$!
	        # Export the PID so the cleanup step can stop the application.
	        echo "APP_PID=$APP_PID" >> "$GITHUB_ENV"
	        # Poll the health endpoint once per second, giving up on the 30th try.
	        for i in {1..30}; do
	          if curl -s http://localhost:$QUARKUS_PORT/q/health > /dev/null 2>&1; then
	            echo "Application ready after ${i} seconds"
	            break
	          elif [ $i -eq 30 ]; then
	            echo "::error::Application failed to start within 30 seconds"
	            kill $APP_PID || true
	            exit 1
	          else
	            [ $((i % 5)) -eq 0 ] && echo "Still waiting... (${i}s)"
	            sleep 1
	          fi
	        done

	    - name: Trigger Blocking Endpoint
	      run: |
	        for attempt in 1 2 3; do
	          echo "Attempt $attempt of 3 for blocking endpoint..."
	          # The step runs under `bash -e`: without `|| true` a transport-level
	          # curl failure would abort the step here, skipping the retries and
	          # the ::error:: annotation below.
	          HTTP_RESPONSE=$(curl -s -w "%{http_code}" http://localhost:$QUARKUS_PORT/chat/blocking) || true
	          # -w appends the status code, so the last three characters are the code.
	          HTTP_CODE="${HTTP_RESPONSE: -3}"
	          if [ "$HTTP_CODE" = "200" ]; then
	            echo "SUCCESS: HTTP $HTTP_CODE"
	            echo "Response body: ${HTTP_RESPONSE%???}"
	            break
	          fi
	          echo "Failed: HTTP $HTTP_CODE"
	          if [ "$attempt" -lt 3 ]; then
	            sleep 2
	          else
	            echo "::error::Blocking endpoint failed after 3 attempts"
	            exit 1
	          fi
	        done

	    - name: Trigger Streaming Endpoint
	      run: |
	        for attempt in 1 2 3; do
	          echo "Attempt $attempt of 3 for streaming endpoint..."
	          # `|| true` keeps `bash -e` from aborting when curl fails or when
	          # `timeout` kills it (exit 124); HTTP_CODE is then empty or 000 and
	          # the attempt is retried.
	          HTTP_CODE=$(timeout 10s curl -s -o /dev/null -w "%{http_code}" http://localhost:$QUARKUS_PORT/chat/streaming) || true
	          if [ "$HTTP_CODE" = "200" ]; then
	            echo "SUCCESS: HTTP $HTTP_CODE"
	            break
	          fi
	          echo "Failed: HTTP $HTTP_CODE"
	          if [ "$attempt" -lt 3 ]; then
	            sleep 2
	          else
	            echo "::error::Streaming endpoint failed after 3 attempts"
	            exit 1
	          fi
	        done

	    - name: Cleanup & Shutdown
	      # Always attempt to stop the application, even if an earlier step failed.
	      if: always()
	      run: |
	        kill $APP_PID || true
	        wait $APP_PID 2>/dev/null || true

	# Collect all matrix metrics and update history
	publish-performance-history:
	  # Guard: only commit history on real pushes to main, not on PRs or forks.
	  # Prevents duplicate entries from PR runs and avoids push-permission errors on forks.
	  if: >-
	    github.repository == 'beehive-lab/GPULlama3.java' &&
	    github.event_name == 'push' &&
	    github.ref == 'refs/heads/main'

	  runs-on: [self-hosted]
	  needs: standalone-inference
	  timeout-minutes: 15

	  steps:
	    - name: Checkout GPULlama3
	      uses: actions/checkout@v4

	    - name: Download metrics artifacts
	      uses: actions/download-artifact@v4
	      with:
	        # Matches the per-backend artifacts uploaded by standalone-inference for this run.
	        pattern: metrics-*-${{ github.run_id }}
	        path: ${{ runner.temp }}/metrics-artifacts
	        # Place the files from all matching artifacts directly in the one directory.
	        merge-multiple: true

	    - name: Append to performance history
	      run: |
	        python3 scripts/process_metrics.py \
	          --metrics-dir "${{ runner.temp }}/metrics-artifacts" \
	          --commit "${{ github.sha }}" \
	          --branch "${{ github.ref_name }}" \
	          --run-id "${{ github.run_id }}" \
	          --run-number "${{ github.run_number }}" \
	          --run-attempt "${{ github.run_attempt }}" \
	          --workflow "${{ github.workflow }}" \
	          --history "$PERF_HISTORY_FILE"

	    - name: Commit performance history
	      run: |
	        SHORT_SHA=$(echo "${GITHUB_SHA}" | cut -c1-8)

	        git config user.name "github-actions[bot]"
	        git config user.email "github-actions[bot]@users.noreply.github.com"

	        git add "$PERF_HISTORY_FILE"

	        # Nothing staged means the history file did not change: finish successfully.
	        if git diff --cached --quiet; then
	          echo "No history changes to commit"
	          exit 0
	        fi

	        git commit -m "perf: record run #${{ github.run_number }} @ ${SHORT_SHA}"

	        # Rebase onto the latest main and push; retry with a growing delay in
	        # case main moved between the pull and the push.
	        for attempt in 1 2 3; do
	          if git pull --rebase origin main && git push; then
	            break
	          fi
	          if [ "$attempt" -lt 3 ]; then
	            echo "Attempt $attempt failed, retrying in $((attempt * 5))s..."
	            sleep $((attempt * 5))
	          else
	            echo "::error::Failed to push after 3 attempts"
	            exit 1
	          fi
	        done

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Merge pull request #121 from stratika/cleanup/pdf-file #351

Workflow file

Merge pull request #121 from stratika/cleanup/pdf-file #351

Uh oh!

Workflow file for this run