-
Notifications
You must be signed in to change notification settings - Fork 239
151 lines (138 loc) · 5.89 KB
/
test_component.yml
File metadata and controls
151 lines (138 loc) · 5.89 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
# Reusable workflow (invoked via `workflow_call`) that runs the tests of a
# single component, split into shards, on the runner chosen by the caller.
name: Test component

on:
  workflow_call:
    inputs:
      # Run ID whose artifacts are fetched. When left empty, the job falls
      # back to `github.run_id`.
      artifact_run_id:
        type: string
        default: ""
      # Exported to the job as the ARTIFACT_GROUP environment variable.
      artifact_group:
        type: string
      # Shown in the job name, exported as AMDGPU_FAMILIES, and currently also
      # passed to the setup action as its ARTIFACT_GROUP (see TODO(#3381)).
      amdgpu_families:
        type: string
      # Exported as AMDGPU_TARGETS and forwarded to the setup action.
      amdgpu_targets:
        type: string
        default: ""
      # Value used verbatim for the job's `runs-on`.
      test_runs_on:
        type: string
      # A container image is only selected when this equals 'linux'.
      platform:
        type: string
      # JSON-encoded object describing the component under test. The job reads
      # these fields via `fromJSON`: job_name, total_shards, shard_arr,
      # expect_failure, container_image, container_options,
      # fetch_artifact_args, requirements_files, timeout_minutes, test_type,
      # test_script.
      component:
        type: string
      # Image used on Linux when the component does not set `container_image`.
      # Pinned by digest.
      default_container_image:
        type: string
        default: "ghcr.io/rocm/no_rocm_image_ubuntu24_04@sha256:4150afe4759d14822f0e3f8930e1124f26e11f68b5c7b91ec9a02b20b1ebbb98"

permissions:
  contents: read
jobs:
  # Runs one component's test script once per shard. Every
  # `fromJSON(inputs.component)` lookup below reads a field of the JSON object
  # passed in through the `component` input.
  test_component:
    name: >-
      Test ${{ fromJSON(inputs.component).job_name }}
      (shard ${{ matrix.shard }}/${{ fromJSON(inputs.component).total_shards }})
      (${{ inputs.amdgpu_families }})
      ${{ fromJSON(inputs.component).expect_failure == true && '(xfail)' || '' }}
    runs-on: ${{ inputs.test_runs_on }}
    # Components flagged `expect_failure` are allowed to fail without failing
    # the workflow run; they are also labelled "(xfail)" in the job name.
    continue-on-error: ${{ fromJSON(inputs.component).expect_failure == true }}
    timeout-minutes: 210
    container:
      # If the component has specified an alternate image, honor that option.
      # Otherwise use the default image.
      # For any platform other than 'linux' the expression evaluates to null.
      # NOTE(review): presumably that makes the job run directly on the runner
      # without a container - confirm.
      image: >-
        ${{ inputs.platform == 'linux' &&
        (fromJSON(inputs.component).container_image || inputs.default_container_image)
        || null
        }}
      # --ulimit memlock=-1:-1 - Prevents memory allocation issues with ROCm inside container
      # --security-opt seccomp=unconfined - enables memory mapping, and is recommended for containers running in HPC environments
      # --env-file /etc/podinfo/gha-gpu-isolation-settings - Required for GPU isolation on OSSCI MIXXX runners
      # --user 0:0 - Running as root, by recommendation of GitHub: https://docs.github.com/en/actions/reference/workflows-and-actions/dockerfile-support#user
      # Folded scalar: the lines below are joined with single spaces into one
      # option string; component-specific options are appended last.
      options: >-
        --ipc host
        --group-add video
        --device /dev/kfd
        --device /dev/dri
        --group-add 992
        --group-add 110
        --ulimit memlock=-1:-1
        --security-opt seccomp=unconfined
        --env-file /etc/podinfo/gha-gpu-isolation-settings
        --user 0:0
        ${{ fromJSON(inputs.component).container_options }}
    strategy:
      # Let the remaining shards finish even if one of them fails.
      fail-fast: false
      matrix:
        # The shard array is based on "total_shards" from "fetch_test_configurations.py"
        # The test executable will shard based on the array. (ex: [1, 2, 3, 4] = four test shards)
        shard: ${{ fromJSON(inputs.component).shard_arr }}
    defaults:
      run:
        shell: bash
    env:
      VENV_DIR: ${{ github.workspace }}/.venv
      # Use the caller-provided run ID when given, otherwise the current run.
      ARTIFACT_RUN_ID: "${{ inputs.artifact_run_id != '' && inputs.artifact_run_id || github.run_id }}"
      OUTPUT_ARTIFACTS_DIR: "./build"
      THEROCK_BIN_DIR: "./build/bin"
      AMDGPU_FAMILIES: ${{ inputs.amdgpu_families }}
      AMDGPU_TARGETS: ${{ inputs.amdgpu_targets }}
      ARTIFACT_GROUP: ${{ inputs.artifact_group }}
      # Benchmark results database API endpoints for performance metrics collection
      # NOTE: These secrets are only required for benchmark results submission in nightly CI runs.
      # For PR/push workflows, secret retrieval will fail gracefully and benchmarks will skip API submission.
      BENCHMARK_DB_URL: ${{ secrets.BENCHMARK_DB_URL }}
      BENCHMARK_DB_FALLBACK_URL: ${{ secrets.BENCHMARK_DB_FALLBACK_URL }}
    steps:
      # Sparse checkout of only `build_tools` into ./prejob, so the Windows
      # cleanup script is available before the main checkout happens.
      - name: "Fetch 'build_tools' from repository"
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          sparse-checkout: build_tools
          path: "prejob"

      - name: Pre-job cleanup processes on Windows
        if: ${{ runner.os == 'Windows' }}
        timeout-minutes: 5
        shell: powershell
        run: . '${{ github.workspace }}\prejob\build_tools\github_actions\cleanup_processes.ps1'

      # No `ref` is given, so the action's default ref selection applies.
      # NOTE(review): the repository is named explicitly, presumably so the
      # workflow also works when called from another repository - confirm.
      - name: Checkout Repository
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          repository: "ROCm/TheRock"

      - name: Run setup test environment workflow
        timeout-minutes: 15
        uses: './.github/actions/setup_test_environment'
        with:
          ARTIFACT_RUN_ID: ${{ env.ARTIFACT_RUN_ID }}
          # TODO(#3381): revert back to `inputs.artifact_group` once issue is resolved
          ARTIFACT_GROUP: ${{ inputs.amdgpu_families }}
          AMDGPU_TARGETS: ${{ inputs.amdgpu_targets }}
          OUTPUT_ARTIFACTS_DIR: ${{ env.OUTPUT_ARTIFACTS_DIR }}
          VENV_DIR: ${{ env.VENV_DIR }}
          FETCH_ARTIFACT_ARGS: ${{ fromJSON(inputs.component).fetch_artifact_args }}
          IS_PR_FROM_FORK: ${{ github.event.pull_request.head.repo.fork }}

      # safe.directory must be set before Runner Health Status
      - name: Adjust git config
        # $PWD is quoted so a workspace path containing whitespace stays a
        # single argument. `fetch.parallel` is set without --global, so it is
        # written to the checked-out repository's own config.
        run: |
          git config --global --add safe.directory "$PWD"
          git config fetch.parallel 10

      - name: Runner health status
        run: |
          python ./build_tools/health_status.py

      - name: Driver / GPU sanity check
        timeout-minutes: 3
        run: |
          python ./build_tools/print_driver_gpu_info.py

      - name: Setup Requirements
        run: |
          python ./build_tools/install_requirements.py \
            --requirements-files=${{ fromJSON(inputs.component).requirements_files }}

      # The component's `test_script` is expanded into the script text before
      # bash runs it; shard information is handed over via the step's env.
      # NOTE(review): this is only safe while `component` comes from trusted
      # callers, since its content is executed verbatim.
      - name: Test
        timeout-minutes: ${{ fromJSON(inputs.component).timeout_minutes }}
        env:
          SHARD_INDEX: ${{ matrix.shard }}
          TOTAL_SHARDS: ${{ fromJSON(inputs.component).total_shards }}
          TEST_TYPE: ${{ fromJSON(inputs.component).test_type }}
        run: |
          ${{ fromJSON(inputs.component).test_script }}

      # GitHub's 'Complete job' step is unaware of launched executables
      # and will fail to clean up orphan processes.
      # `always()` makes this run even when an earlier step failed.
      - name: Post-job cleanup processes on Windows
        if: ${{ always() && runner.os == 'Windows' }}
        timeout-minutes: 5
        shell: powershell
        run: . '${{ github.workspace }}\build_tools\github_actions\cleanup_processes.ps1'