99
1010env :
1111 JAVA_VERSION : 21.0.2-open
12- TORNADO_BASE_ROOT : ${{ github.workspace }}/GPULlama3.java/external
13- LLAMA_ROOT : ${{ github.workspace }}
1412 GRAAL_JARS : /opt/graalJars
1513 MODELS_DIR : /opt/models
14+ QUARKUS_PORT : 8081
1615 # History file committed back to the repo on push to main
1716 PERF_HISTORY_FILE : docs/perf-history.jsonl
1817
@@ -31,14 +30,17 @@ jobs:
3130 cd ${{ github.workspace }}
3231 # ./mvnw -T12C -Pspotless spotless:check
3332
34- build-and-run :
33+ # Build: TornadoVM → GPULlama3 → Quarkus LangChain4j
34+ # max-parallel: 1 ensures the opencl and ptx variants run sequentially so
35+ # there are no workspace conflicts between matrix jobs.
36+ build :
3537 if : github.repository == 'beehive-lab/GPULlama3.java'
3638 runs-on : [self-hosted]
3739 needs : code-quality
3840 timeout-minutes : 30
39-
4041 strategy :
4142 fail-fast : true
43+ max-parallel : 1
4244 matrix :
4345 backend :
4446 - name : opencl
4749 steps :
4850 - name : Checkout GPULlama3
4951 uses : actions/checkout@v4
52+ with :
53+ clean : false
5054
5155 - name : Set up Java
5256 uses : ./.github/actions/setup-java
@@ -56,18 +60,66 @@ jobs:
5660 - name : Setup TornadoVM
5761 uses : ./.github/actions/setup-tornadovm
5862 env :
59- TORNADO_ROOT : ${{ env.TORNADO_BASE_ROOT }}/tornadovm-${{ matrix.backend.name }}
63+ TORNADO_ROOT : ${{ runner.tool_cache }}/tornadovm/tornadovm-${{ matrix.backend.name }}
6064 with :
6165 backend : ${{ matrix.backend.name }}
6266
6367 - name : Build GPULlama3.java
6468 run : |
65- cd ${{ github.workspace }}
66- echo "Using TORNADOVM_HOME=$TORNADOVM_HOME"
6769 tornado --version
68- ./mvnw clean package -DskipTests
70+ # Strip any pre-existing -SNAPSHOT suffix before appending, making this step idempotent
71+ # across sequential matrix variants (ptx runs after opencl on the same workspace).
72+ BASE_VERSION=$(./mvnw help:evaluate -Dexpression=project.version -q -DforceStdout | sed 's/-SNAPSHOT$//')
73+ GPULLAMA3_VERSION="${BASE_VERSION}-SNAPSHOT"
74+ echo "GPULlama3 version: $GPULLAMA3_VERSION"
75+ ./mvnw versions:set -DnewVersion=$GPULLAMA3_VERSION
76+ ./mvnw clean install -DskipTests
77+ echo "GPULLAMA3_VERSION=$GPULLAMA3_VERSION" >> $GITHUB_ENV
78+
79+ - name : Clone Quarkus LangChain4j
80+ run : |
81+ rm -rf quarkus-langchain4j
82+ git clone --depth 1 https://github.com/quarkiverse/quarkus-langchain4j.git
83+
84+ - name : Build Quarkus LangChain4j
85+ run : |
86+ cd ${{ github.workspace }}/quarkus-langchain4j
87+ sed -i 's/<gpu-llama3\.version>.*<\/gpu-llama3\.version>/<gpu-llama3.version>'$GPULLAMA3_VERSION'<\/gpu-llama3.version>/' pom.xml
88+ # -Dtornado activates the TornadoVM profile; -am builds only the gpu-llama3 module + deps
89+ mvn clean install -pl integration-tests/gpu-llama3 -am -DskipTests -Dtornado
90+
91+ standalone-inference :
92+ if : github.repository == 'beehive-lab/GPULlama3.java'
93+ runs-on : [self-hosted]
94+ needs : build
95+ timeout-minutes : 30
96+ strategy :
97+ fail-fast : true
98+ matrix :
99+ backend :
100+ - name : opencl
101+ - name : ptx
69102
70- # ── Llama-3.2-1B: standard + prefill-decode variants, all backends ──────────
103+ steps :
104+ - name : Checkout GPULlama3
105+ uses : actions/checkout@v4
106+ with :
107+ clean : false
108+
109+ - name : Set up Java
110+ uses : ./.github/actions/setup-java
111+ with :
112+ java_version : ${{ env.JAVA_VERSION }}
113+
114+ - name : Setup TornadoVM
115+ uses : ./.github/actions/setup-tornadovm
116+ env :
117+ TORNADO_ROOT : ${{ runner.tool_cache }}/tornadovm/tornadovm-${{ matrix.backend.name }}
118+ with :
119+ backend : ${{ matrix.backend.name }}
120+
121+ # Test standalone mode per model family and quantization
122+ # Note: variants can be represented with matrices
71123 - name : FP16 - Run Llama-3.2-1B-Instruct-F16.gguf - Standard
72124 uses : ./.github/actions/run-inference
73125 with :
@@ -100,12 +152,12 @@ jobs:
100152 flags : --with-prefill-decode --batch-prefill-size 32
101153 metrics_file : ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-llama-1b-f16-batch-prefill-decode.json
102154
103- # ── PTX-only: CUDA-graph variants ────────────────────────────────────────
155+ # PTX-only: CUDA-graph variants
104156 - name : PTX - FP16 - Run Llama-3.2-1B-Instruct-F16.gguf - Prefill-Decode-CUDA-Graphs
105157 if : matrix.backend.name == 'ptx'
106158 uses : ./.github/actions/run-inference
107159 with :
108- backend : ptx
160+ backend : ${{ matrix.backend.name }}
109161 model_file : Llama-3.2-1B-Instruct-F16.gguf
110162 model : Llama-3.2-1B-Instruct
111163 quantization : F16
@@ -117,15 +169,14 @@ jobs:
117169 if : matrix.backend.name == 'ptx'
118170 uses : ./.github/actions/run-inference
119171 with :
120- backend : ptx
172+ backend : ${{ matrix.backend.name }}
121173 model_file : Llama-3.2-1B-Instruct-F16.gguf
122174 model : Llama-3.2-1B-Instruct
123175 quantization : F16
124176 configuration : batch-prefill-decode-cuda-graphs
125177 flags : --with-prefill-decode --batch-prefill-size 32 --cuda-graphs
126178 metrics_file : ${{ runner.temp }}/metrics-ptx-llama-1b-f16-batch-prefill-decode-cuda-graphs.json
127179
128- # ── Additional models — standard inference, all backends ─────────────────
129180 - name : FP16 - Run Qwen3-4B-f16.gguf
130181 uses : ./.github/actions/run-inference
131182 with :
@@ -256,7 +307,7 @@ jobs:
256307 configuration : standard
257308 metrics_file : ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-granite-4-0-1b-q8-standard.json
258309
259- # ── Upload metrics for the publish job ────────────────────────────────────
310+ # Upload metrics for the publish job
260311 - name : Upload metrics artifacts
261312 if : always()
262313 uses : actions/upload-artifact@v4
@@ -265,7 +316,102 @@ jobs:
265316 path : ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-*.json
266317 if-no-files-found : warn
267318
268- # ── Separate job: collect all matrix metrics and update history ───────────────
319+ # Test integration with Quarkus-langchain4j
320+ quarkus-langchain4j-integration :
321+ if : github.repository == 'beehive-lab/GPULlama3.java'
322+ runs-on : [self-hosted]
323+ needs : build
324+ timeout-minutes : 10
325+ strategy :
326+ fail-fast : true
327+ matrix :
328+ backend :
329+ - name : opencl
330+ - name : ptx
331+
332+ steps :
333+ - name : Checkout GPULlama3
334+ uses : actions/checkout@v4
335+ with :
336+ clean : false
337+
338+ - name : Set up Java
339+ uses : ./.github/actions/setup-java
340+ with :
341+ java_version : ${{ env.JAVA_VERSION }}
342+
343+ - name : Setup TornadoVM
344+ uses : ./.github/actions/setup-tornadovm
345+ env :
346+ TORNADO_ROOT : ${{ runner.tool_cache }}/tornadovm/tornadovm-${{ matrix.backend.name }}
347+ with :
348+ backend : ${{ matrix.backend.name }}
349+
350+ - name : Verify GPULlama3 Dependency
351+ run : |
352+ cd ${{ github.workspace }}/quarkus-langchain4j/integration-tests/gpu-llama3
353+ mvn dependency:tree | grep "io.github.beehive-lab:gpu-llama3"
354+
355+ - name : Start Quarkus Application
356+ run : |
357+ cd ${{ github.workspace }}/quarkus-langchain4j/integration-tests/gpu-llama3
358+ java @"$TORNADOVM_HOME/tornado-argfile" \
359+ -Dtornado.device.memory=8GB \
360+ -Dquarkus.http.port=$QUARKUS_PORT \
361+ -jar target/quarkus-app/quarkus-run.jar &
362+ APP_PID=$!
363+ echo "APP_PID=$APP_PID" >> $GITHUB_ENV
364+ for i in {1..30}; do
365+ if curl -s http://localhost:$QUARKUS_PORT/q/health > /dev/null 2>&1; then
366+ echo "Application ready after ${i} seconds"
367+ break
368+ elif [ $i -eq 30 ]; then
369+ echo "::error::Application failed to start within 30 seconds"
370+ kill $APP_PID || true
371+ exit 1
372+ else
373+ [ $((i % 5)) -eq 0 ] && echo "Still waiting... (${i}s)"
374+ sleep 1
375+ fi
376+ done
377+
378+ - name : Trigger Blocking Endpoint
379+ run : |
380+ for attempt in 1 2 3; do
381+ echo "Attempt $attempt of 3 for blocking endpoint..."
382+ HTTP_RESPONSE=$(curl -s -w "%{http_code}" http://localhost:$QUARKUS_PORT/chat/blocking || true)  # '|| true': the step shell runs with -e; a curl failure must not abort before the retry
383+ HTTP_CODE="${HTTP_RESPONSE: -3}"
384+ if [ "$HTTP_CODE" = "200" ]; then
385+ echo "SUCCESS: HTTP $HTTP_CODE"
386+ echo "Response body: ${HTTP_RESPONSE%???}"
387+ break
388+ fi
389+ echo "Failed: HTTP $HTTP_CODE"
390+ [ $attempt -lt 3 ] && sleep 2
391+ [ $attempt -eq 3 ] && { echo "::error::Blocking endpoint failed after 3 attempts"; exit 1; }
392+ done
393+
394+ - name : Trigger Streaming Endpoint
395+ run : |
396+ for attempt in 1 2 3; do
397+ echo "Attempt $attempt of 3 for streaming endpoint..."
398+ HTTP_CODE=$(timeout 10s curl -s -o /dev/null -w "%{http_code}" http://localhost:$QUARKUS_PORT/chat/streaming || true)  # '|| true': a timeout (exit 124) or curl error must reach the retry logic instead of tripping bash -e
399+ if [ "$HTTP_CODE" = "200" ]; then
400+ echo "SUCCESS: HTTP $HTTP_CODE"
401+ break
402+ fi
403+ echo "Failed: HTTP $HTTP_CODE"
404+ [ $attempt -lt 3 ] && sleep 2
405+ [ $attempt -eq 3 ] && { echo "::error::Streaming endpoint failed after 3 attempts"; exit 1; }
406+ done
407+
408+ - name : Cleanup & Shutdown
409+ if : always()
410+ run : |
411+ kill $APP_PID || true
412+ wait $APP_PID 2>/dev/null || true
413+
414+ # Collect all matrix metrics and update history
269415 publish-performance-history :
270416 # Guard: only commit history on real pushes to main, not on PRs or forks.
271417 # Prevents duplicate entries from PR runs and avoids push-permission errors on forks.
@@ -275,7 +421,7 @@ jobs:
275421 github.ref == 'refs/heads/main'
276422
277423 runs-on : [self-hosted]
278- needs : build-and-run
424+ needs : standalone-inference
279425 timeout-minutes : 15
280426
281427 steps :
@@ -325,4 +471,4 @@ jobs:
325471 exit 1
326472 fi
327473 }
328- done
474+ done
0 commit comments