Skip to content

Commit 7d13c88

Browse files
[ci] Merge integration tests into main ci workflow
1 parent b986b1c commit 7d13c88

2 files changed

Lines changed: 163 additions & 244 deletions

File tree

.github/workflows/build-and-run.yml

Lines changed: 163 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -9,10 +9,9 @@ on:
99

1010
env:
1111
JAVA_VERSION: 21.0.2-open
12-
TORNADO_BASE_ROOT: ${{ github.workspace }}/GPULlama3.java/external
13-
LLAMA_ROOT: ${{ github.workspace }}
1412
GRAAL_JARS: /opt/graalJars
1513
MODELS_DIR: /opt/models
14+
QUARKUS_PORT: 8081
1615
# History file committed back to the repo on push to main
1716
PERF_HISTORY_FILE: docs/perf-history.jsonl
1817

@@ -31,14 +30,17 @@ jobs:
3130
cd ${{ github.workspace }}
3231
# ./mvnw -T12C -Pspotless spotless:check
3332
34-
build-and-run:
33+
# Build: TornadoVM → GPULlama3 → Quarkus LangChain4j
34+
# max-parallel: 1 ensures the opencl and ptx variants run sequentially so
35+
# there are no workspace conflicts between matrix jobs.
36+
build:
3537
if: github.repository == 'beehive-lab/GPULlama3.java'
3638
runs-on: [self-hosted]
3739
needs: code-quality
3840
timeout-minutes: 30
39-
4041
strategy:
4142
fail-fast: true
43+
max-parallel: 1
4244
matrix:
4345
backend:
4446
- name: opencl
@@ -47,6 +49,8 @@ jobs:
4749
steps:
4850
- name: Checkout GPULlama3
4951
uses: actions/checkout@v4
52+
with:
53+
clean: false
5054

5155
- name: Set up Java
5256
uses: ./.github/actions/setup-java
@@ -56,18 +60,66 @@ jobs:
5660
- name: Setup TornadoVM
5761
uses: ./.github/actions/setup-tornadovm
5862
env:
59-
TORNADO_ROOT: ${{ env.TORNADO_BASE_ROOT }}/tornadovm-${{ matrix.backend.name }}
63+
TORNADO_ROOT: ${{ runner.tool_cache }}/tornadovm/tornadovm-${{ matrix.backend.name }}
6064
with:
6165
backend: ${{ matrix.backend.name }}
6266

6367
- name: Build GPULlama3.java
6468
run: |
65-
cd ${{ github.workspace }}
66-
echo "Using TORNADOVM_HOME=$TORNADOVM_HOME"
6769
tornado --version
68-
./mvnw clean package -DskipTests
70+
# Strip any pre-existing -SNAPSHOT suffix before appending, making this step idempotent
71+
# across sequential matrix variants (ptx runs after opencl on the same workspace).
72+
BASE_VERSION=$(./mvnw help:evaluate -Dexpression=project.version -q -DforceStdout | sed 's/-SNAPSHOT$//')
73+
GPULLAMA3_VERSION="${BASE_VERSION}-SNAPSHOT"
74+
echo "GPULlama3 version: $GPULLAMA3_VERSION"
75+
./mvnw versions:set -DnewVersion=$GPULLAMA3_VERSION
76+
./mvnw clean install -DskipTests
77+
echo "GPULLAMA3_VERSION=$GPULLAMA3_VERSION" >> $GITHUB_ENV
78+
79+
- name: Clone Quarkus LangChain4j
  run: |
    # Drop any checkout left in the workspace (presumably from an earlier run,
    # since the repo checkout uses clean: false), then take a fresh shallow
    # clone of the upstream default branch into the same directory.
    rm -rf quarkus-langchain4j
    git clone --depth=1 \
      https://github.com/quarkiverse/quarkus-langchain4j.git \
      quarkus-langchain4j
83+
84+
- name: Build Quarkus LangChain4j
  run: |
    cd "${{ github.workspace }}/quarkus-langchain4j"

    # GPULLAMA3_VERSION is written to $GITHUB_ENV by the "Build GPULlama3.java"
    # step; refuse to continue with an empty value, which would blank the
    # property below.
    if [ -z "${GPULLAMA3_VERSION:-}" ]; then
      echo "::error::GPULLAMA3_VERSION is not set"
      exit 1
    fi

    # Rewrite the <gpu-llama3.version> property so the Quarkus build resolves
    # the GPULlama3 artifact that was just installed by `mvnw clean install`.
    # The whole sed program is double-quoted so the version is never subject
    # to word splitting or globbing.
    sed -i "s|<gpu-llama3\.version>.*</gpu-llama3\.version>|<gpu-llama3.version>${GPULLAMA3_VERSION}</gpu-llama3.version>|" pom.xml

    # sed exits 0 even when nothing matched (e.g. the property was renamed
    # upstream); fail loudly instead of building against an unintended version.
    if ! grep -qF "<gpu-llama3.version>${GPULLAMA3_VERSION}</gpu-llama3.version>" pom.xml; then
      echo "::error::Could not set <gpu-llama3.version> to ${GPULLAMA3_VERSION} in pom.xml"
      exit 1
    fi

    # -Dtornado activates the TornadoVM profile; -am builds only the gpu-llama3 module + deps
    mvn clean install -pl integration-tests/gpu-llama3 -am -DskipTests -Dtornado
90+
91+
standalone-inference:
92+
if: github.repository == 'beehive-lab/GPULlama3.java'
93+
runs-on: [self-hosted]
94+
needs: build
95+
timeout-minutes: 30
96+
strategy:
97+
fail-fast: true
98+
matrix:
99+
backend:
100+
- name: opencl
101+
- name: ptx
69102

70-
# ── Llama-3.2-1B: standard + prefill-decode variants, all backends ──────────
103+
steps:
104+
- name: Checkout GPULlama3
105+
uses: actions/checkout@v4
106+
with:
107+
clean: false
108+
109+
- name: Set up Java
110+
uses: ./.github/actions/setup-java
111+
with:
112+
java_version: ${{ env.JAVA_VERSION }}
113+
114+
- name: Setup TornadoVM
115+
uses: ./.github/actions/setup-tornadovm
116+
env:
117+
TORNADO_ROOT: ${{ runner.tool_cache }}/tornadovm/tornadovm-${{ matrix.backend.name }}
118+
with:
119+
backend: ${{ matrix.backend.name }}
120+
121+
# Test standalone mode per model family and quantization
122+
# Note: variants can be represented with matrices
71123
- name: FP16 - Run Llama-3.2-1B-Instruct-F16.gguf - Standard
72124
uses: ./.github/actions/run-inference
73125
with:
@@ -100,12 +152,12 @@ jobs:
100152
flags: --with-prefill-decode --batch-prefill-size 32
101153
metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-llama-1b-f16-batch-prefill-decode.json
102154

103-
# ── PTX-only: CUDA-graph variants ────────────────────────────────────────
155+
# PTX-only: CUDA-graph variants
104156
- name: PTX - FP16 - Run Llama-3.2-1B-Instruct-F16.gguf - Prefill-Decode-CUDA-Graphs
105157
if: matrix.backend.name == 'ptx'
106158
uses: ./.github/actions/run-inference
107159
with:
108-
backend: ptx
160+
backend: ${{ matrix.backend.name }}
109161
model_file: Llama-3.2-1B-Instruct-F16.gguf
110162
model: Llama-3.2-1B-Instruct
111163
quantization: F16
@@ -117,15 +169,14 @@ jobs:
117169
if: matrix.backend.name == 'ptx'
118170
uses: ./.github/actions/run-inference
119171
with:
120-
backend: ptx
172+
backend: ${{ matrix.backend.name }}
121173
model_file: Llama-3.2-1B-Instruct-F16.gguf
122174
model: Llama-3.2-1B-Instruct
123175
quantization: F16
124176
configuration: batch-prefill-decode-cuda-graphs
125177
flags: --with-prefill-decode --batch-prefill-size 32 --cuda-graphs
126178
metrics_file: ${{ runner.temp }}/metrics-ptx-llama-1b-f16-batch-prefill-decode-cuda-graphs.json
127179

128-
# ── Additional models — standard inference, all backends ─────────────────
129180
- name: FP16 - Run Qwen3-4B-f16.gguf
130181
uses: ./.github/actions/run-inference
131182
with:
@@ -256,7 +307,7 @@ jobs:
256307
configuration: standard
257308
metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-granite-4-0-1b-q8-standard.json
258309

259-
# ── Upload metrics for the publish job ────────────────────────────────────
310+
# Upload metrics for the publish job
260311
- name: Upload metrics artifacts
261312
if: always()
262313
uses: actions/upload-artifact@v4
@@ -265,7 +316,102 @@ jobs:
265316
path: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-*.json
266317
if-no-files-found: warn
267318

268-
# ── Separate job: collect all matrix metrics and update history ───────────────
319+
# Test integration with Quarkus-langchain4j
320+
quarkus-langchain4j-integration:
  # Smoke test: boots the Quarkus LangChain4j gpu-llama3 integration app on
  # QUARKUS_PORT and probes its blocking and streaming chat endpoints, once per
  # TornadoVM backend.
  if: github.repository == 'beehive-lab/GPULlama3.java'
  runs-on: [self-hosted]
  needs: build
  timeout-minutes: 10
  strategy:
    fail-fast: true
    # Every variant binds the same QUARKUS_PORT and works inside the
    # quarkus-langchain4j directory of the workspace, so run the variants
    # sequentially (same setting as the build job).
    max-parallel: 1
    matrix:
      backend:
        - name: opencl
        - name: ptx

  steps:
    - name: Checkout GPULlama3
      uses: actions/checkout@v4
      with:
        # NOTE(review): presumably keeps the untracked quarkus-langchain4j
        # build output produced by the `build` job - confirm the runner setup.
        clean: false

    - name: Set up Java
      uses: ./.github/actions/setup-java
      with:
        java_version: ${{ env.JAVA_VERSION }}

    - name: Setup TornadoVM
      uses: ./.github/actions/setup-tornadovm
      env:
        TORNADO_ROOT: ${{ runner.tool_cache }}/tornadovm/tornadovm-${{ matrix.backend.name }}
      with:
        backend: ${{ matrix.backend.name }}

    - name: Verify GPULlama3 Dependency
      run: |
        cd "${{ github.workspace }}/quarkus-langchain4j/integration-tests/gpu-llama3"
        # The step's result is grep's exit status: it fails when the
        # dependency tree does not mention the gpu-llama3 artifact.
        mvn dependency:tree | grep "io.github.beehive-lab:gpu-llama3"

    - name: Start Quarkus Application
      run: |
        cd "${{ github.workspace }}/quarkus-langchain4j/integration-tests/gpu-llama3"
        java @"$TORNADOVM_HOME/tornado-argfile" \
          -Dtornado.device.memory=8GB \
          -Dquarkus.http.port="$QUARKUS_PORT" \
          -jar target/quarkus-app/quarkus-run.jar &
        APP_PID=$!
        # Persist the PID so the cleanup step can stop the application.
        echo "APP_PID=$APP_PID" >> "$GITHUB_ENV"

        # Poll once per second, for at most 30 attempts. Any HTTP response
        # counts as ready (curl runs without -f), so this only proves that
        # the HTTP server is accepting requests.
        for i in {1..30}; do
          if curl -s "http://localhost:$QUARKUS_PORT/q/health" > /dev/null 2>&1; then
            echo "Application ready after ${i} seconds"
            break
          fi
          # Fail fast when the JVM has already exited instead of polling a
          # dead process for the remaining attempts.
          if ! kill -0 "$APP_PID" 2>/dev/null; then
            echo "::error::Application exited before becoming ready"
            exit 1
          fi
          if [ "$i" -eq 30 ]; then
            echo "::error::Application failed to start within 30 seconds"
            kill "$APP_PID" || true
            exit 1
          fi
          if [ $((i % 5)) -eq 0 ]; then
            echo "Still waiting... (${i}s)"
          fi
          sleep 1
        done

    - name: Trigger Blocking Endpoint
      run: |
        for attempt in 1 2 3; do
          echo "Attempt $attempt of 3 for blocking endpoint..."
          # `|| true`: run steps execute under `bash -e` by default, so a curl
          # failure (e.g. connection refused) inside the assignment would
          # abort the step here and skip the remaining attempts.
          HTTP_RESPONSE=$(curl -s -w "%{http_code}" "http://localhost:$QUARKUS_PORT/chat/blocking") || true
          # -w prints the status code after the body, so the last three
          # characters of the captured output are the HTTP code.
          HTTP_CODE="${HTTP_RESPONSE: -3}"
          if [ "$HTTP_CODE" = "200" ]; then
            echo "SUCCESS: HTTP $HTTP_CODE"
            echo "Response body: ${HTTP_RESPONSE%???}"
            break
          fi
          echo "Failed: HTTP ${HTTP_CODE:-000}"
          if [ "$attempt" -lt 3 ]; then
            sleep 2
          else
            echo "::error::Blocking endpoint failed after 3 attempts"
            exit 1
          fi
        done

    - name: Trigger Streaming Endpoint
      run: |
        for attempt in 1 2 3; do
          echo "Attempt $attempt of 3 for streaming endpoint..."
          # `|| true`: without it, a curl error or the 10s timeout (exit 124)
          # would terminate the step under `bash -e` before any retry. An
          # attempt cut short by the timeout leaves HTTP_CODE empty and is
          # treated as a failed attempt.
          HTTP_CODE=$(timeout 10s curl -s -o /dev/null -w "%{http_code}" "http://localhost:$QUARKUS_PORT/chat/streaming") || true
          if [ "$HTTP_CODE" = "200" ]; then
            echo "SUCCESS: HTTP $HTTP_CODE"
            break
          fi
          echo "Failed: HTTP ${HTTP_CODE:-000}"
          if [ "$attempt" -lt 3 ]; then
            sleep 2
          else
            echo "::error::Streaming endpoint failed after 3 attempts"
            exit 1
          fi
        done

    - name: Cleanup & Shutdown
      if: always()
      run: |
        # APP_PID is empty when the start step failed before recording it.
        if [ -n "${APP_PID:-}" ]; then
          kill "$APP_PID" 2>/dev/null || true
          # The application was launched by an earlier step's shell, so it is
          # not a child of this shell and `wait` cannot block on it. Poll for
          # up to 10 seconds until the process is gone so the port is free
          # for the next matrix variant.
          for _ in {1..10}; do
            kill -0 "$APP_PID" 2>/dev/null || break
            sleep 1
          done
        fi
413+
414+
# Collect all matrix metrics and update history
269415
publish-performance-history:
270416
# Guard: only commit history on real pushes to main, not on PRs or forks.
271417
# Prevents duplicate entries from PR runs and avoids push-permission errors on forks.
@@ -275,7 +421,7 @@ jobs:
275421
github.ref == 'refs/heads/main'
276422
277423
runs-on: [self-hosted]
278-
needs: build-and-run
424+
needs: standalone-inference
279425
timeout-minutes: 15
280426

281427
steps:
@@ -325,4 +471,4 @@ jobs:
325471
exit 1
326472
fi
327473
}
328-
done
474+
done

0 commit comments

Comments
 (0)