# Merge pull request #121 from stratika/cleanup/pdf-file #351
# This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Workflow identity, triggers, and the environment shared by all jobs.
name: GPULlama3 Build & Run

# Triggered by pushes to main and by pull requests that target main.
on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main
    types:
      - opened
      - synchronize
      - reopened

# Workflow-level environment variables, visible to every job and step.
env:
  JAVA_VERSION: '21.0.2-open'
  GRAAL_JARS: /opt/graalJars
  MODELS_DIR: /opt/models
  QUARKUS_PORT: '8081'
  # History file committed back to the repo on push to main
  PERF_HISTORY_FILE: docs/perf-history.jsonl

jobs:
# ---- job: code-quality ----
  # Formatting gate; restricted to the upstream repository so forks skip it.
  code-quality:
    if: github.repository == 'beehive-lab/GPULlama3.java'
    runs-on: self-hosted
    timeout-minutes: 30
    steps:
      - name: Checkout GPULlama3
        uses: actions/checkout@v4

      - name: Check code formatting (Spotless)
        # NOTE(review): the Spotless invocation below is commented out, so this
        # step currently only changes directory and cannot fail on formatting.
        run: |
          cd ${{ github.workspace }}
          # ./mvnw -T12C -Pspotless spotless:check
# ---- job: build ----
  # Build: TornadoVM → GPULlama3 → Quarkus LangChain4j
  # max-parallel: 1 ensures the opencl and ptx variants run sequentially so
  # there are no workspace conflicts between matrix jobs.
  build:
    if: github.repository == 'beehive-lab/GPULlama3.java'
    runs-on: [self-hosted]
    needs: code-quality
    timeout-minutes: 30
    strategy:
      fail-fast: true
      max-parallel: 1
      matrix:
        backend:
          - name: opencl
          - name: ptx
    steps:
      - name: Checkout GPULlama3
        uses: actions/checkout@v4
        with:
          # Do not wipe the workspace before fetching.
          clean: false

      - name: Set up Java
        uses: ./.github/actions/setup-java
        with:
          java_version: ${{ env.JAVA_VERSION }}

      - name: Setup TornadoVM
        uses: ./.github/actions/setup-tornadovm
        env:
          # One TornadoVM install location per backend.
          TORNADO_ROOT: ${{ runner.tool_cache }}/tornadovm/tornadovm-${{ matrix.backend.name }}
        with:
          backend: ${{ matrix.backend.name }}

      - name: Build GPULlama3.java
        run: |
          # The default step shell is `bash -e` without pipefail, so a failing
          # mvnw inside the pipeline below would be hidden behind sed's exit status.
          set -o pipefail
          tornado --version
          # Strip any pre-existing -SNAPSHOT suffix before appending, making this step idempotent
          # across sequential matrix variants (ptx runs after opencl on the same workspace).
          BASE_VERSION=$(./mvnw help:evaluate -Dexpression=project.version -q -DforceStdout | sed 's/-SNAPSHOT$//')
          # Refuse to continue with a bare "-SNAPSHOT" version.
          if [ -z "$BASE_VERSION" ]; then
            echo "::error::Could not resolve project.version from the POM"
            exit 1
          fi
          GPULLAMA3_VERSION="${BASE_VERSION}-SNAPSHOT"
          echo "GPULlama3 version: $GPULLAMA3_VERSION"
          ./mvnw versions:set -DnewVersion="$GPULLAMA3_VERSION"
          ./mvnw clean install -DskipTests
          # Export the version for the later steps of this job.
          echo "GPULLAMA3_VERSION=$GPULLAMA3_VERSION" >> "$GITHUB_ENV"

      - name: Clone Quarkus LangChain4j
        run: |
          rm -rf quarkus-langchain4j
          git clone --depth 1 https://github.com/quarkiverse/quarkus-langchain4j.git

      - name: Build Quarkus LangChain4j
        run: |
          cd ${{ github.workspace }}/quarkus-langchain4j
          # Point the gpu-llama3.version property at the snapshot built above.
          sed -i 's/<gpu-llama3\.version>.*<\/gpu-llama3\.version>/<gpu-llama3.version>'"$GPULLAMA3_VERSION"'<\/gpu-llama3.version>/' pom.xml
          # -Dtornado activates the TornadoVM profile; -am builds only the gpu-llama3 module + deps
          mvn clean install -pl integration-tests/gpu-llama3 -am -DskipTests -Dtornado
# ---- job: standalone-inference ----
  # Runs each model/quantization once per backend and uploads the metrics files.
  standalone-inference:
    if: github.repository == 'beehive-lab/GPULlama3.java'
    runs-on: [self-hosted]
    needs: build
    timeout-minutes: 30
    strategy:
      fail-fast: true
      matrix:
        backend:
          - name: opencl
          - name: ptx
    steps:
      - name: Checkout GPULlama3
        uses: actions/checkout@v4
        with:
          clean: false

      - name: Set up Java
        uses: ./.github/actions/setup-java
        with:
          java_version: ${{ env.JAVA_VERSION }}

      - name: Setup TornadoVM
        uses: ./.github/actions/setup-tornadovm
        env:
          TORNADO_ROOT: ${{ runner.tool_cache }}/tornadovm/tornadovm-${{ matrix.backend.name }}
        with:
          backend: ${{ matrix.backend.name }}

      # Test standalone mode per model family and quantization
      # Note: variants can be represented with matrices

      # ----- FP16 models -----
      - name: FP16 - Run Llama-3.2-1B-Instruct-F16.gguf - Standard
        uses: ./.github/actions/run-inference
        with:
          backend: ${{ matrix.backend.name }}
          model_file: Llama-3.2-1B-Instruct-F16.gguf
          model: Llama-3.2-1B-Instruct
          quantization: F16
          configuration: standard
          metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-llama-1b-f16-standard.json

      - name: FP16 - Run Llama-3.2-1B-Instruct-F16.gguf - Prefill-Decode
        uses: ./.github/actions/run-inference
        with:
          backend: ${{ matrix.backend.name }}
          model_file: Llama-3.2-1B-Instruct-F16.gguf
          model: Llama-3.2-1B-Instruct
          quantization: F16
          configuration: prefill-decode
          flags: --with-prefill-decode
          metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-llama-1b-f16-prefill-decode.json

      - name: FP16 - Run Llama-3.2-1B-Instruct-F16.gguf - Batch-Prefill-Decode
        uses: ./.github/actions/run-inference
        with:
          backend: ${{ matrix.backend.name }}
          model_file: Llama-3.2-1B-Instruct-F16.gguf
          model: Llama-3.2-1B-Instruct
          quantization: F16
          configuration: batch-prefill-decode
          flags: --with-prefill-decode --batch-prefill-size 32
          metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-llama-1b-f16-batch-prefill-decode.json

      # PTX-only: CUDA-graph variants
      - name: PTX - FP16 - Run Llama-3.2-1B-Instruct-F16.gguf - Prefill-Decode-CUDA-Graphs
        if: matrix.backend.name == 'ptx'
        uses: ./.github/actions/run-inference
        with:
          backend: ${{ matrix.backend.name }}
          model_file: Llama-3.2-1B-Instruct-F16.gguf
          model: Llama-3.2-1B-Instruct
          quantization: F16
          configuration: prefill-decode-cuda-graphs
          flags: --with-prefill-decode --cuda-graphs
          metrics_file: ${{ runner.temp }}/metrics-ptx-llama-1b-f16-prefill-decode-cuda-graphs.json

      - name: PTX - FP16 - Run Llama-3.2-1B-Instruct-F16.gguf - Batch-Prefill-Decode-CUDA-Graphs
        if: matrix.backend.name == 'ptx'
        uses: ./.github/actions/run-inference
        with:
          backend: ${{ matrix.backend.name }}
          model_file: Llama-3.2-1B-Instruct-F16.gguf
          model: Llama-3.2-1B-Instruct
          quantization: F16
          configuration: batch-prefill-decode-cuda-graphs
          flags: --with-prefill-decode --batch-prefill-size 32 --cuda-graphs
          metrics_file: ${{ runner.temp }}/metrics-ptx-llama-1b-f16-batch-prefill-decode-cuda-graphs.json

      - name: FP16 - Run Qwen3-4B-f16.gguf
        uses: ./.github/actions/run-inference
        with:
          backend: ${{ matrix.backend.name }}
          model_file: Qwen3-4B-f16.gguf
          model: Qwen3-4B
          quantization: F16
          configuration: standard
          metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-qwen3-4b-f16-standard.json

      - name: FP16 - Run Mistral-7B-Instruct-v0.3.fp16.gguf
        uses: ./.github/actions/run-inference
        with:
          backend: ${{ matrix.backend.name }}
          model_file: Mistral-7B-Instruct-v0.3.fp16.gguf
          model: Mistral-7B-Instruct-v0.3
          quantization: F16
          configuration: standard
          metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-mistral-7b-fp16-standard.json

      - name: FP16 - Run Qwen2.5-1.5b-instruct-fp16.gguf
        uses: ./.github/actions/run-inference
        with:
          backend: ${{ matrix.backend.name }}
          model_file: qwen2.5-1.5b-instruct-fp16.gguf
          model: Qwen2.5-1.5B-Instruct
          quantization: F16
          configuration: standard
          metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-qwen2-5-1-5b-fp16-standard.json

      - name: FP16 - Run Phi-3-mini-4k-instruct-fp16.gguf
        uses: ./.github/actions/run-inference
        with:
          backend: ${{ matrix.backend.name }}
          model_file: Phi-3-mini-4k-instruct-fp16.gguf
          model: Phi-3-mini-4k-instruct
          quantization: F16
          configuration: standard
          metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-phi3-mini-fp16-standard.json

      - name: FP16 - Run Granite-3.2-2b-instruct-f16.gguf
        uses: ./.github/actions/run-inference
        with:
          backend: ${{ matrix.backend.name }}
          model_file: granite-3.2-2b-instruct-f16.gguf
          model: Granite-3.2-2B-Instruct
          quantization: F16
          configuration: standard
          metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-granite-3-2-2b-f16-standard.json

      - name: FP16 - Run Granite-4.0-1b-F16.gguf
        uses: ./.github/actions/run-inference
        with:
          backend: ${{ matrix.backend.name }}
          model_file: granite-4.0-1b-F16.gguf
          model: Granite-4.0-1B
          quantization: F16
          configuration: standard
          metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-granite-4-0-1b-f16-standard.json

      # ----- Q8_0 models -----
      - name: Q8 - Run Llama-3.2-1B-Instruct-Q8_0.gguf
        uses: ./.github/actions/run-inference
        with:
          backend: ${{ matrix.backend.name }}
          model_file: Llama-3.2-1B-Instruct-Q8_0.gguf
          model: Llama-3.2-1B-Instruct
          quantization: Q8_0
          configuration: standard
          metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-llama-1b-q8-standard.json

      - name: Q8 - Run Qwen3-0.6B-Q8_0.gguf
        uses: ./.github/actions/run-inference
        with:
          backend: ${{ matrix.backend.name }}
          model_file: Qwen3-0.6B-Q8_0.gguf
          model: Qwen3-0.6B
          quantization: Q8_0
          configuration: standard
          metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-qwen3-0-6b-q8-standard.json

      - name: Q8 - Run Phi-3-mini-4k-instruct-Q8_0.gguf
        uses: ./.github/actions/run-inference
        with:
          backend: ${{ matrix.backend.name }}
          model_file: Phi-3-mini-4k-instruct-Q8_0.gguf
          model: Phi-3-mini-4k-instruct
          quantization: Q8_0
          configuration: standard
          metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-phi3-mini-q8-standard.json

      - name: Q8 - Run Qwen2.5-1.5b-instruct-q8_0.gguf
        uses: ./.github/actions/run-inference
        with:
          backend: ${{ matrix.backend.name }}
          model_file: qwen2.5-1.5b-instruct-q8_0.gguf
          model: Qwen2.5-1.5B-Instruct
          quantization: Q8_0
          configuration: standard
          metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-qwen2-5-1-5b-q8-standard.json

      - name: Q8 - Run Mistral-7B-Instruct-v0.3.Q8_0.gguf
        uses: ./.github/actions/run-inference
        with:
          backend: ${{ matrix.backend.name }}
          model_file: Mistral-7B-Instruct-v0.3.Q8_0.gguf
          model: Mistral-7B-Instruct-v0.3
          quantization: Q8_0
          configuration: standard
          metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-mistral-7b-q8-standard.json

      - name: Q8 - Run Granite-3.2-2b-instruct-Q8_0.gguf
        uses: ./.github/actions/run-inference
        with:
          backend: ${{ matrix.backend.name }}
          model_file: granite-3.2-2b-instruct-Q8_0.gguf
          model: Granite-3.2-2B-Instruct
          quantization: Q8_0
          configuration: standard
          metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-granite-3-2-2b-q8-standard.json

      - name: Q8 - Run Granite-4.0-1b-Q8_0.gguf
        uses: ./.github/actions/run-inference
        with:
          backend: ${{ matrix.backend.name }}
          model_file: granite-4.0-1b-Q8_0.gguf
          model: Granite-4.0-1B
          quantization: Q8_0
          configuration: standard
          metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-granite-4-0-1b-q8-standard.json

      # Upload metrics for the publish job
      - name: Upload metrics artifacts
        # always(): upload whatever metrics exist even if an earlier step failed.
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: metrics-${{ matrix.backend.name }}-${{ github.run_id }}
          path: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-*.json
          if-no-files-found: warn
# ---- job: quarkus-langchain4j-integration ----
  # Test integration with Quarkus-langchain4j
  # Starts the Quarkus app found under quarkus-langchain4j/ in the workspace,
  # probes the blocking and streaming chat endpoints, then shuts the app down.
  quarkus-langchain4j-integration:
    if: github.repository == 'beehive-lab/GPULlama3.java'
    runs-on: [self-hosted]
    needs: build
    timeout-minutes: 10
    strategy:
      fail-fast: true
      matrix:
        backend:
          - name: opencl
          - name: ptx
    steps:
      - name: Checkout GPULlama3
        uses: actions/checkout@v4
        with:
          clean: false

      - name: Set up Java
        uses: ./.github/actions/setup-java
        with:
          java_version: ${{ env.JAVA_VERSION }}

      - name: Setup TornadoVM
        uses: ./.github/actions/setup-tornadovm
        env:
          TORNADO_ROOT: ${{ runner.tool_cache }}/tornadovm/tornadovm-${{ matrix.backend.name }}
        with:
          backend: ${{ matrix.backend.name }}

      - name: Verify GPULlama3 Dependency
        run: |
          cd ${{ github.workspace }}/quarkus-langchain4j/integration-tests/gpu-llama3
          # grep exits non-zero (failing the step) when the artifact is not in the tree.
          mvn dependency:tree | grep "io.github.beehive-lab:gpu-llama3"

      - name: Start Quarkus Application
        run: |
          cd ${{ github.workspace }}/quarkus-langchain4j/integration-tests/gpu-llama3
          java @"$TORNADOVM_HOME/tornado-argfile" \
            -Dtornado.device.memory=8GB \
            -Dquarkus.http.port=$QUARKUS_PORT \
            -jar target/quarkus-app/quarkus-run.jar &
          APP_PID=$!
          # Export the PID so the cleanup step can stop the application.
          echo "APP_PID=$APP_PID" >> $GITHUB_ENV
          # Poll the health endpoint once per second, for up to 30 attempts.
          for i in {1..30}; do
            if curl -s http://localhost:$QUARKUS_PORT/q/health > /dev/null 2>&1; then
              echo "Application ready after ${i} seconds"
              break
            elif [ $i -eq 30 ]; then
              echo "::error::Application failed to start within 30 seconds"
              kill $APP_PID || true
              exit 1
            else
              [ $((i % 5)) -eq 0 ] && echo "Still waiting... (${i}s)"
              sleep 1
            fi
          done

      - name: Trigger Blocking Endpoint
        run: |
          for attempt in 1 2 3; do
            echo "Attempt $attempt of 3 for blocking endpoint..."
            # `|| true`: the default `bash -e` step shell would otherwise abort on the
            # first curl failure (e.g. connection refused) and never reach attempts 2-3.
            HTTP_RESPONSE=$(curl -s -w "%{http_code}" http://localhost:$QUARKUS_PORT/chat/blocking) || true
            # The status code is the last three characters appended by -w.
            HTTP_CODE="${HTTP_RESPONSE: -3}"
            if [ "$HTTP_CODE" = "200" ]; then
              echo "SUCCESS: HTTP $HTTP_CODE"
              echo "Response body: ${HTTP_RESPONSE%???}"
              break
            fi
            echo "Failed: HTTP $HTTP_CODE"
            if [ "$attempt" -lt 3 ]; then
              sleep 2
            else
              echo "::error::Blocking endpoint failed after 3 attempts"
              exit 1
            fi
          done

      - name: Trigger Streaming Endpoint
        run: |
          for attempt in 1 2 3; do
            echo "Attempt $attempt of 3 for streaming endpoint..."
            # `|| true`: a timeout (exit 124) or curl error must not abort the step
            # under `bash -e` before the remaining attempts have run.
            HTTP_CODE=$(timeout 10s curl -s -o /dev/null -w "%{http_code}" http://localhost:$QUARKUS_PORT/chat/streaming) || true
            # curl prints nothing when it is killed by timeout; report that as 000.
            HTTP_CODE="${HTTP_CODE:-000}"
            if [ "$HTTP_CODE" = "200" ]; then
              echo "SUCCESS: HTTP $HTTP_CODE"
              break
            fi
            echo "Failed: HTTP $HTTP_CODE"
            if [ "$attempt" -lt 3 ]; then
              sleep 2
            else
              echo "::error::Streaming endpoint failed after 3 attempts"
              exit 1
            fi
          done

      - name: Cleanup & Shutdown
        # always(): stop the background application even when an earlier step failed.
        if: always()
        run: |
          kill $APP_PID || true
          wait $APP_PID 2>/dev/null || true
# ---- job: publish-performance-history ----
  # Collect all matrix metrics and update history
  publish-performance-history:
    # Guard: only commit history on real pushes to main, not on PRs or forks.
    # Prevents duplicate entries from PR runs and avoids push-permission errors on forks.
    if: >-
      github.repository == 'beehive-lab/GPULlama3.java' &&
      github.event_name == 'push' &&
      github.ref == 'refs/heads/main'
    runs-on: [self-hosted]
    needs: standalone-inference
    timeout-minutes: 15
    # This job pushes a commit, so request write access to repository contents
    # explicitly instead of depending on the repository's default token scope.
    permissions:
      contents: write
    steps:
      - name: Checkout GPULlama3
        uses: actions/checkout@v4

      - name: Download metrics artifacts
        uses: actions/download-artifact@v4
        with:
          # Matches the per-backend artifacts uploaded by this run.
          pattern: metrics-*-${{ github.run_id }}
          path: ${{ runner.temp }}/metrics-artifacts
          merge-multiple: true

      - name: Append to performance history
        run: |
          python3 scripts/process_metrics.py \
            --metrics-dir "${{ runner.temp }}/metrics-artifacts" \
            --commit "${{ github.sha }}" \
            --branch "${{ github.ref_name }}" \
            --run-id "${{ github.run_id }}" \
            --run-number "${{ github.run_number }}" \
            --run-attempt "${{ github.run_attempt }}" \
            --workflow "${{ github.workflow }}" \
            --history "$PERF_HISTORY_FILE"

      - name: Commit performance history
        run: |
          SHORT_SHA=$(echo "${GITHUB_SHA}" | cut -c1-8)
          git config user.name "github-actions[bot]"
          git config user.email "github-actions[bot]@users.noreply.github.com"
          git add "$PERF_HISTORY_FILE"
          # Nothing staged means the history file is unchanged; finish successfully.
          if git diff --cached --quiet; then
            echo "No history changes to commit"
            exit 0
          fi
          git commit -m "perf: record run #${{ github.run_number }} @ ${SHORT_SHA}"
          # Rebase onto the latest main and push, retrying with a growing delay.
          for attempt in 1 2 3; do
            if git pull --rebase origin main && git push; then
              break
            fi
            # A conflicted rebase leaves the clone mid-rebase, which makes every later
            # `git pull --rebase` fail; clear that state before retrying or giving up.
            git rebase --abort 2>/dev/null || true
            if [ "$attempt" -lt 3 ]; then
              echo "Attempt $attempt failed, retrying in $((attempt * 5))s..."
              sleep $((attempt * 5))
            else
              echo "::error::Failed to push after 3 attempts"
              exit 1
            fi
          done