Skip to content

Commit 264c69b

Browse files
authored
#21997: [skip ci] Create initial BH QB pipeline (#23373)
### Ticket
#21997

### Problem description
We need to run tests on the BH P150x4 QuietBox (QB).

### What's changed
Update the Blackhole post-commit workflow with an initial set of tests for QB:
- BH single-card tests
- Llama DP=4 demo
- [Fabric unit tests](#23295)

Add to the nightly BH suite:
- Llama DP=4 demo
- Fabric unit tests

### Checklist
- [ ] BH post commit targeting QB: https://github.com/tenstorrent/tt-metal/actions/runs/15639167108
- [x] BH post commit: https://github.com/tenstorrent/tt-metal/actions/runs/15640932046
- [x] BH nightly (llmbox tests only): https://github.com/tenstorrent/tt-metal/actions/runs/15638560284
1 parent 6efb31d commit 264c69b

File tree

4 files changed

+258
-26
lines changed

4 files changed

+258
-26
lines changed
Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
# Reusable workflow (workflow_call only): runs the Blackhole LLMBox demo tests
# inside the project's docker-run action, using a previously built build
# artifact and Python wheel.
name: "[internal] Blackhole LLMBox Demo tests impl"

on:
  workflow_call:
    inputs:
      # Artifact downloaded into the workspace; the job untars ttm_any.tar from it.
      build-artifact-name:
        required: true
        type: string
      # Wheel artifact downloaded next to the build; installed by docker-run
      # (install_wheel: true).
      wheel-artifact-name:
        required: true
        type: string
      # Image the tests run in.
      docker-image:
        required: true
        type: string
      # Self-hosted runner label; combined with "in-service" and "pipeline-perf".
      runner-label:
        required: false
        type: string
        default: "BH-LLMBox"

jobs:
  single-card-demo-tests:
    strategy:
      # Let every test group run to completion even if another one fails.
      fail-fast: false
      matrix:
        test-group:
          - name: "llama3-8b quietbox data-parallel=4 performance"
            arch: blackhole
            cmd: 'LLAMA_DIR=/localdev/blackhole_demos/huggingface_data/meta-llama/Llama-3.1-8B-Instruct pytest models/tt_transformers/demo/simple_text_demo.py -k "performance and ci-32" --data_parallel 4'
            owner_id: U05RWH3QUPM  # Salar Hosseini
    name: ${{ matrix.test-group.name }}
    runs-on: ["in-service", "${{ inputs.runner-label }}", "pipeline-perf"]
    steps:
      - name: ⬇️ Checkout
        uses: actions/checkout@v4
        with:
          submodules: recursive
      - name: ⬇️ Download Build
        uses: actions/download-artifact@v4
        timeout-minutes: 10
        with:
          name: ${{ inputs.build-artifact-name }}
      - name: Extract files
        run: tar -xvf ttm_any.tar
      - name: ⬇️ Download Wheel
        uses: actions/download-artifact@v4
        timeout-minutes: 10
        with:
          name: ${{ inputs.wheel-artifact-name }}
      # Only groups whose name contains "performance" switch the CPU governor.
      - name: Enable Performance mode
        if: ${{ contains(matrix.test-group.name, 'performance') }}
        run: |
          sudo cpupower frequency-set -g performance
      - name: Run demo regression tests
        uses: ./.github/actions/docker-run
        timeout-minutes: 70
        env:
          LOGURU_LEVEL: INFO
        with:
          docker_image: ${{ inputs.docker-image }}
          docker_password: ${{ secrets.GITHUB_TOKEN }}
          # The demo data directory is mounted read-only into the container.
          docker_opts: |
            -e TT_METAL_HOME=${{ github.workspace }}
            -e ARCH_NAME=${{ matrix.test-group.arch }}
            -e LD_LIBRARY_PATH=${{ github.workspace }}/build/lib
            -e HF_TOKEN=${{ secrets.HUGGINGFACE_TOKEN }}
            -v /localdev/blackhole_demos:/localdev/blackhole_demos:ro
          install_wheel: true
          # Llama groups need the tt_transformers requirements before the test command.
          run_args: |
            if [[ "${{ matrix.test-group.name }}" == *"llama"* ]]; then
              pip install -r ${{ github.workspace }}/models/tt_transformers/requirements.txt
            fi
            ${{ matrix.test-group.cmd }}
      # Upload reports on success and failure, but not when the run was cancelled.
      - uses: tenstorrent/tt-metal/.github/actions/upload-artifact-with-job-uuid@main
        timeout-minutes: 10
        if: ${{ !cancelled() }}
        with:
          path: generated/test_reports/
          prefix: "test_reports_"
      # always() is required: without a status-check function the condition is
      # implicitly ANDed with success(), so a failed or cancelled test step
      # would leave the runner stuck on the "performance" governor.
      - name: Disable Performance mode
        if: ${{ always() && contains(matrix.test-group.name, 'performance') }}
        run: |
          sudo cpupower frequency-set -g ondemand
      # Notify the test-group owner on Slack when any earlier step failed.
      - uses: tenstorrent/tt-metal/.github/actions/slack-report@main
        if: ${{ failure() }}
        with:
          slack_webhook_url: ${{ secrets.SLACK_WEBHOOK_URL }}
          owner: ${{ matrix.test-group.owner_id }}
Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
# Reusable workflow (workflow_call only): runs the fabric gtest binaries in a
# container on the requested runner, one matrix entry per test group.
name: "[internal] Blackhole LLMBox Fabric unit tests impl"

on:
  workflow_call:
    inputs:
      # Exported to the container as ARCH_NAME and shown in the job name.
      arch:
        required: true
        type: string
      # Runner label; also selects the runs-on form (see the expression below).
      runner-label:
        required: true
        type: string
      # Per-test-step timeout, in minutes.
      timeout:
        required: false
        type: number
        default: 10
      build-artifact-name:
        required: true
        type: string
      docker-image:
        required: true
        type: string
      wheel-artifact-name:
        required: true
        type: string

jobs:
  fabric-tests:
    strategy:
      # Do not fail-fast because we need to ensure all tests go to completion
      # so we try not to get hanging machines
      fail-fast: false
      matrix:
        test-group:
          - name: fabric 1D unit tests
            cmd: './build/test/tt_metal/tt_fabric/fabric_unit_tests --gtest_filter="Fabric1D.*"'
          - name: fabric 2D fixture unit tests
            cmd: './build/test/tt_metal/tt_fabric/fabric_unit_tests --gtest_filter="Fabric2D*Fixture.*"'
          - name: fabric system health tests
            cmd: ./build/test/tt_metal/tt_fabric/test_system_health
          # Disabled entry, kept for reference:
          # - {name: t3000 fast fabric tests, cmd: "source tests/scripts/t3000/run_t3000_unit_tests.sh && run_t3000_ttfabric_tests"}
    name: ${{ inputs.arch }} ${{ inputs.runner-label }} ${{ matrix.test-group.name }}
    # N150/N300 labels and fork pull requests resolve to the single
    # "tt-beta-ubuntu-2204-<label>-large-stable" label; everything else
    # resolves to the list [<label>, in-service, cloud-virtual-machine].
    runs-on: >-
      ${{
        ((inputs.runner-label == 'N150' || inputs.runner-label == 'N300') && format('tt-beta-ubuntu-2204-{0}-large-stable', inputs.runner-label))
        || github.event.pull_request.head.repo.fork == true && format('tt-beta-ubuntu-2204-{0}-large-stable', inputs.runner-label)
        || fromJSON(format('["{0}", "in-service", "cloud-virtual-machine"]', inputs.runner-label))
      }}
    container:
      image: ${{ inputs.docker-image || 'docker-image-unresolved' }}
      env:
        ARCH_NAME: ${{ inputs.arch }}
        LOGURU_LEVEL: INFO
        LD_LIBRARY_PATH: /work/build/lib
        PYTHONPATH: /work
        TT_METAL_HOME: /work
        # gtest writes its XML reports under the mounted work directory.
        GTEST_OUTPUT: xml:/work/generated/test_reports/
      volumes:
        - ${{ github.workspace }}/docker-job:/work # Subdir to workaround https://github.com/actions/runner/issues/691
        - /dev/hugepages-1G:/dev/hugepages-1G
      options: '--device /dev/tenstorrent'
    defaults:
      run:
        shell: bash
        working-directory: /work # https://github.com/actions/runner/issues/878
    steps:
      - name: ⬇️ Setup Job
        uses: tenstorrent/tt-metal/.github/actions/setup-job@main
        timeout-minutes: 10
        with:
          build-artifact-name: ${{ inputs.build-artifact-name }}
          wheel-artifact-name: ${{ inputs.wheel-artifact-name }}
      # Runs the matrix entry's command from /work (the default working directory).
      - name: ${{ matrix.test-group.name }} tests
        timeout-minutes: ${{ inputs.timeout }}
        run: |
          ${{ matrix.test-group.cmd }}
      - uses: tenstorrent/tt-metal/.github/actions/slack-report@main
        if: ${{ failure() }}
        with:
          slack_webhook_url: ${{ secrets.SLACK_WEBHOOK_URL }}
          owner: U06CXU895AP # Michael Chiou
      # No explicit path is passed here; presumably the action has a default
      # report location - NOTE(review): confirm against the action definition.
      - uses: tenstorrent/tt-metal/.github/actions/upload-artifact-with-job-uuid@main
        timeout-minutes: 10
        if: ${{ !cancelled() }}
        with:
          prefix: "test_reports_"
      - name: Generate gtest annotations on failure
        uses: tenstorrent/tt-metal/.github/actions/generate-gtest-failure-message@main
        if: ${{ failure() }}

.github/workflows/blackhole-nightly-tests.yaml

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,3 +82,22 @@ jobs:
8282
docker-image: ${{ needs.build-artifact.outputs.dev-docker-image }}
8383
build-artifact-name: ${{ needs.build-artifact.outputs.build-artifact-name }}
8484
wheel-artifact-name: ${{ needs.build-artifact.outputs.wheel-artifact-name }}
# Nightly: call the reusable LLMBox demo-tests workflow on the BH-LLMBox
# runner, reusing the artifacts and image produced by build-artifact.
blackhole-llmbox-demo-tests:
  needs: build-artifact
  uses: ./.github/workflows/blackhole-llmbox-demo-tests-impl.yaml
  secrets: inherit
  with:
    build-artifact-name: ${{ needs.build-artifact.outputs.build-artifact-name }}
    wheel-artifact-name: ${{ needs.build-artifact.outputs.wheel-artifact-name }}
    docker-image: ${{ needs.build-artifact.outputs.dev-docker-image }}
    runner-label: "BH-LLMBox"
# Nightly: call the reusable LLMBox fabric unit-tests workflow for the
# blackhole arch on the BH-LLMBox runner, reusing build-artifact's outputs.
blackhole-llmbox-fabric-unit-tests:
  needs: build-artifact
  uses: ./.github/workflows/blackhole-llmbox-fabric-build-and-unit-tests.yaml
  secrets: inherit
  with:
    arch: "blackhole"
    runner-label: "BH-LLMBox"
    build-artifact-name: ${{ needs.build-artifact.outputs.build-artifact-name }}
    wheel-artifact-name: ${{ needs.build-artifact.outputs.wheel-artifact-name }}
    docker-image: ${{ needs.build-artifact.outputs.dev-docker-image }}

.github/workflows/blackhole-post-commit.yaml

Lines changed: 63 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,10 @@ on:
1616
description: 'Enable watcher in BH Post commit'
1717
default: false
1818
type: boolean
# Opt-in switch for the LLMBox test path (off by default); per its
# description, runner-label is expected to be BH-LLMBox when this is set.
enable-llmbox-tests:
  type: boolean
  default: false
  description: "Run tests on LLMBox instead of single card (must set runner-label to BH-LLMBox)"
1923
workflow_dispatch:
2024
inputs:
2125
runner-label:
@@ -42,13 +46,17 @@ on:
4246
description: 'Enable watcher in BH Post commit'
4347
default: false
4448
type: boolean
# Opt-in switch for the LLMBox test path (off by default); per its
# description, runner-label is expected to be BH-LLMBox when this is set.
enable-llmbox-tests:
  type: boolean
  default: false
  description: "Run tests on LLMBox instead of single card (must set runner-label to BH-LLMBox)"
4553
schedule:
4654
- cron: "0 */4 * * *"
4755
# Pause this since not enough runners to support every commit to main
4856
# push:
4957
# branches: ["main"]
5058

51-
run-name: ${{ inputs.enable-watcher == true && 'Blackhole post-commit tests (watcher enabled) ' || 'Blackhole post-commit tests' }}
59+
# Run title precedence: LLMBox run first, then watcher-enabled run, otherwise
# the plain post-commit title.
run-name: ${{ inputs.enable-llmbox-tests == true && 'Blackhole LLMBox tests' || (inputs.enable-watcher == true && 'Blackhole post-commit tests (watcher enabled) ' || 'Blackhole post-commit tests') }}
5260

5361
permissions:
5462
actions: read
@@ -60,6 +68,23 @@ permissions:
6068
checks: write
6169

6270
jobs:
# Produces the JSON list of runner labels consumed by the matrix jobs in this
# workflow via needs.generate-matrix.outputs.matrix:
#   - LLMBox tests enabled -> ["BH-LLMBox"]
#   - otherwise            -> ["P100", "P150"]
generate-matrix:
  runs-on: ubuntu-latest
  outputs:
    matrix: ${{ steps.set-matrix.outputs.matrix }}
  steps:
    - id: set-matrix
      # Inputs reach the script through environment variables, not inline
      # ${{ }} interpolation, so a crafted runner-label cannot inject shell
      # commands into the step.
      env:
        ENABLE_LLMBOX_TESTS: ${{ inputs.enable-llmbox-tests }}
        RUNNER_LABEL: ${{ inputs.runner-label }}
      run: |
        if [ "$ENABLE_LLMBOX_TESTS" = "true" ]; then
          # Only a warning: the matrix is forced to BH-LLMBox regardless.
          if [ "$RUNNER_LABEL" != "BH-LLMBox" ]; then
            echo "::warning::LLMBox tests are enabled but runner-label is not set to BH-LLMBox. Current value: $RUNNER_LABEL"
          fi
          matrix='["BH-LLMBox"]'
        else
          matrix='["P100", "P150"]'
        fi
        echo "matrix=$matrix" >> "$GITHUB_OUTPUT"
87+
6388
build-artifact:
6489
uses: ./.github/workflows/build-artifact.yaml
6590
permissions:
@@ -86,7 +111,6 @@ jobs:
86111
secrets: inherit
87112
with:
88113
arch: "blackhole"
89-
timeout: 20
90114
runner-label: ${{ inputs.runner-label || 'BH' }}
91115
docker-image: ${{ needs.build-artifact-profiler.outputs.dev-docker-image }}
92116
build-artifact-name: ${{ needs.build-artifact-profiler.outputs.build-artifact-name }}
@@ -108,7 +132,6 @@ jobs:
108132
with:
109133
arch: blackhole
110134
runner-label: ${{ inputs.runner-label || 'BH' }}
111-
timeout: 15
112135
docker-image: ${{ needs.build-artifact.outputs.dev-docker-image }}
113136
build-artifact-name: ${{ needs.build-artifact.outputs.build-artifact-name }}
114137
wheel-artifact-name: ${{ needs.build-artifact.outputs.wheel-artifact-name }}
@@ -138,19 +161,16 @@ jobs:
138161
wheel-artifact-name: ${{ needs.build-artifact.outputs.wheel-artifact-name }}
139162
enable-watcher: ${{ inputs.enable-watcher || false }}
140163
models-unit-tests:
141-
needs: build-artifact
164+
needs: [build-artifact, generate-matrix]
142165
secrets: inherit
143166
uses: ./.github/workflows/models-post-commit.yaml
144167
strategy:
145168
fail-fast: false
146169
matrix:
147-
test-group: [
148-
{ runner-label: P100 },
149-
{ runner-label: P150 },
150-
]
170+
test-group: ${{ fromJson(needs.generate-matrix.outputs.matrix) }}
151171
with:
152172
arch: blackhole
153-
runner-label: ${{ matrix.test-group.runner-label }}
173+
runner-label: ${{ matrix.test-group }}
154174
timeout: 20
155175
docker-image: ${{ needs.build-artifact.outputs.dev-docker-image }}
156176
wheel-artifact-name: ${{ needs.build-artifact.outputs.wheel-artifact-name }}
@@ -160,7 +180,7 @@ jobs:
160180
secrets: inherit
161181
uses: ./.github/workflows/blackhole-demo-tests-impl.yaml
162182
with:
163-
runner-label: BH
183+
runner-label: ${{ inputs.runner-label || 'BH' }}
164184
docker-image: ${{ needs.build-artifact.outputs.dev-docker-image }}
165185
build-artifact-name: ${{ needs.build-artifact.outputs.build-artifact-name }}
166186
wheel-artifact-name: ${{ needs.build-artifact.outputs.wheel-artifact-name }}
@@ -178,53 +198,70 @@ jobs:
178198
wheel-artifact-name: ${{ needs.build-artifact.outputs.wheel-artifact-name }}
179199

180200
ttnn-stress-tests:
181-
needs: build-artifact
201+
needs: [build-artifact, generate-matrix]
182202
secrets: inherit
183203
uses: ./.github/workflows/ttnn-stress-tests-impl.yaml
184204
strategy:
185205
fail-fast: false
186206
matrix:
187-
test-group: [
188-
{ runner-label: P100 },
189-
{ runner-label: P150 },
190-
]
207+
test-group: ${{ fromJson(needs.generate-matrix.outputs.matrix) }}
191208
with:
192209
arch: blackhole
193-
runner-label: ${{ matrix.test-group.runner-label }}
210+
runner-label: ${{ matrix.test-group }}
194211
timeout: 45
195212
docker-image: ${{ needs.build-artifact.outputs.dev-docker-image }}
196213
build-artifact-name: ${{ needs.build-artifact.outputs.build-artifact-name }}
197214
wheel-artifact-name: ${{ needs.build-artifact.outputs.wheel-artifact-name }}
198215
metalium-smoke-tests:
199-
needs: build-artifact
216+
needs: [build-artifact, generate-matrix]
200217
strategy:
201218
fail-fast: false
202219
matrix:
203-
platform: [
204-
"P100",
205-
"P150",
206-
]
220+
platform: ${{ fromJson(needs.generate-matrix.outputs.matrix) }}
207221
uses: ./.github/workflows/smoke.yaml
208222
with:
209223
docker-image: ${{ needs.build-artifact.outputs.dev-docker-image }}
210224
package-artifact-name: ${{ needs.build-artifact.outputs.packages-artifact-name }}
211225
runner: ${{ matrix.platform }}
212226
product: tt-metalium
213227
ttnn-smoke-tests:
214-
needs: build-artifact
228+
needs: [build-artifact, generate-matrix]
215229
strategy:
216230
fail-fast: false
217231
matrix:
218-
platform: [
219-
"P100",
220-
"P150",
221-
]
232+
platform: ${{ fromJson(needs.generate-matrix.outputs.matrix) }}
222233
uses: ./.github/workflows/smoke.yaml
223234
with:
224235
docker-image: ${{ needs.build-artifact.outputs.dev-docker-image }}
225236
package-artifact-name: ${{ needs.build-artifact.outputs.packages-artifact-name }}
226237
runner: ${{ matrix.platform }}
227238
product: tt-nn
239+
# LLMBox-only demo tests
# Runs only when enable-llmbox-tests is set. The fallback runner label is
# BH-LLMBox (not the single-card 'BH') so that an unset runner-label still
# lands on a multi-card machine, matching the matrix that generate-matrix
# forces in LLMBox mode.
blackhole-llmbox-demo-tests:
  needs: build-artifact
  if: ${{ inputs.enable-llmbox-tests }}
  secrets: inherit
  uses: ./.github/workflows/blackhole-llmbox-demo-tests-impl.yaml
  with:
    runner-label: ${{ inputs.runner-label || 'BH-LLMBox' }}
    docker-image: ${{ needs.build-artifact.outputs.dev-docker-image }}
    build-artifact-name: ${{ needs.build-artifact.outputs.build-artifact-name }}
    wheel-artifact-name: ${{ needs.build-artifact.outputs.wheel-artifact-name }}
251+
# LLMBox-only fabric tests
# Runs only when enable-llmbox-tests is set. The fallback runner label is
# BH-LLMBox (not the single-card 'BH') so that an unset runner-label still
# lands on a multi-card machine, matching the matrix that generate-matrix
# forces in LLMBox mode.
blackhole-llmbox-fabric-unit-tests:
  needs: build-artifact
  if: ${{ inputs.enable-llmbox-tests }}
  secrets: inherit
  uses: ./.github/workflows/blackhole-llmbox-fabric-build-and-unit-tests.yaml
  with:
    arch: blackhole
    runner-label: ${{ inputs.runner-label || 'BH-LLMBox' }}
    docker-image: ${{ needs.build-artifact.outputs.dev-docker-image }}
    build-artifact-name: ${{ needs.build-artifact.outputs.build-artifact-name }}
    wheel-artifact-name: ${{ needs.build-artifact.outputs.wheel-artifact-name }}
264+
228265
# build-and-test-wheels:
229266
# uses: Check all-post-commit yaml for directions
230267
# secrets: inherit

0 commit comments

Comments
 (0)