Skip to content

Commit 39fbfbd

Browse files
authored
ci: harden workflows and reduce duplication (#188)
1 parent 361120e commit 39fbfbd

21 files changed

+254
-102
lines changed
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
name: GPU Snapshot and Validate
description: Run aicr snapshot and validate GPU detection

# Inputs are always delivered to the action as strings.
inputs:
  gpu_model:
    description: 'Expected GPU model substring (e.g., T4, H100)'
    required: true
  min_gpu_count:
    description: 'Minimum expected GPU count'
    # Optional: the default below applies whenever the caller omits it, so
    # marking it required would contradict the default.
    required: false
    default: '1'
  cluster_name:
    # NOTE(review): no step in this action reads inputs.cluster_name. It is
    # kept (and still required) so existing callers that pass it stay valid;
    # either wire it into the snapshot step or drop it from callers and here.
    description: 'Kind cluster name (for kubectl context)'
    required: true
29+
30+
runs:
  using: composite
  steps:
    # Writes the snapshot to snapshot.yaml in the working directory and prints
    # it to the job log so a failed validation can be diagnosed from the log.
    - name: Run aicr snapshot
      shell: bash
      run: |
        ./aicr snapshot --deploy-agent \
          --kubeconfig="${HOME}/.kube/config" \
          --namespace=default \
          --image=ko.local:smoke-test \
          --require-gpu \
          --output=snapshot.yaml
        echo "--- Snapshot output ---"
        cat snapshot.yaml

    # Fails the job unless snapshot.yaml reports a GPU whose model contains
    # inputs.gpu_model and whose gpu-count is at least inputs.min_gpu_count.
    - name: Validate snapshot detected GPU
      shell: bash
      # Inputs reach the script as environment variables rather than being
      # expanded into the script text, so their content can never be parsed
      # as shell syntax.
      env:
        EXPECTED_GPU_MODEL: ${{ inputs.gpu_model }}
        MIN_GPU_COUNT: ${{ inputs.min_gpu_count }}
      run: |
        # Reject a malformed threshold up front; a non-numeric value would make
        # the comparison below error out instead of failing the check.
        if ! [[ "${MIN_GPU_COUNT}" =~ ^[0-9]+$ ]]; then
          echo "::error::min_gpu_count must be a non-negative integer, got: ${MIN_GPU_COUNT}"
          exit 1
        fi

        GPU_MODEL=$(yq eval '.measurements[] | select(.type == "GPU") | .subtypes[0].data["gpu.model"]' snapshot.yaml)
        GPU_COUNT=$(yq eval '.measurements[] | select(.type == "GPU") | .subtypes[0].data["gpu-count"]' snapshot.yaml)
        echo "GPU model: ${GPU_MODEL}"
        echo "GPU count: ${GPU_COUNT}"

        # Substring match: the reported model only has to contain the expected name.
        if [[ "${GPU_MODEL}" != *"${EXPECTED_GPU_MODEL}"* ]]; then
          echo "::error::Expected ${EXPECTED_GPU_MODEL} GPU in snapshot, got: ${GPU_MODEL}"
          exit 1
        fi

        # A missing, "null" or multi-line count must fail explicitly rather than
        # slip through a broken arithmetic comparison.
        if ! [[ "${GPU_COUNT}" =~ ^[0-9]+$ ]]; then
          echo "::error::Expected a numeric gpu-count in snapshot, got: ${GPU_COUNT}"
          exit 1
        fi

        # 10# forces base 10 so a value with a leading zero is not read as octal.
        if (( 10#${GPU_COUNT} < 10#${MIN_GPU_COUNT} )); then
          echo "::error::Expected gpu-count >= ${MIN_GPU_COUNT}, got: ${GPU_COUNT}"
          exit 1
        fi
        echo "Snapshot correctly detected ${GPU_COUNT}x ${GPU_MODEL}"

.github/dependabot.yml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,12 @@ updates:
3232
exclude-patterns:
3333
# klog has independent versioning
3434
- "k8s.io/klog/v2"
35+
golang-x:
36+
patterns:
37+
- "golang.org/x/*"
38+
opencontainers:
39+
patterns:
40+
- "github.com/opencontainers/*"
3541

3642
- package-ecosystem: "github-actions"
3743
directories:

.github/workflows/actionlint.yaml

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
name: Lint GitHub Actions

# Runs for pushes to main and pull requests targeting main, limited to changes
# under .github/workflows or .github/actions; it can also be started manually.
on:
  workflow_dispatch: {}
  push:
    branches: [main]
    paths:
      - '.github/workflows/**'
      - '.github/actions/**'
  pull_request:
    branches: [main]
    paths:
      - '.github/workflows/**'
      - '.github/actions/**'
31+
32+
# Workflow-level token scope: read-only access to repository contents.
permissions:
  contents: read

# Runs are grouped per workflow and pull request number, falling back to the
# ref when the event carries no pull request; a newer run in the same group
# cancels the one still in progress.
concurrency:
  cancel-in-progress: true
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
38+
39+
jobs:

  # Single job: check out the repository, then lint it with actionlint.
  actionlint:
    name: Lint Workflows
    timeout-minutes: 5
    runs-on: ubuntu-latest
    steps:
      # Both actions are pinned to a full commit SHA; the trailing comment
      # records the release that SHA corresponds to.
      - name: Checkout Code
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          persist-credentials: false

      # NOTE(review): fail_on_error presumably makes lint findings fail the
      # job; confirm against the pinned release's documented inputs.
      - name: Run actionlint
        uses: reviewdog/action-actionlint@a5524e1de9f1c345efc2e244a8b348e17f1b8e58 # v1.65.0
        with:
          fail_on_error: true

.github/workflows/codeql.yaml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@ on:
1818
push:
1919
branches:
2020
- main
21-
- "pull-request/*"
2221
paths-ignore:
2322
- '**.md'
2423
- 'docs/**'
@@ -37,7 +36,7 @@ permissions:
3736
contents: read
3837

3938
concurrency:
40-
group: ${{ github.workflow }}-${{ github.ref }}
39+
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
4140
cancel-in-progress: true
4241

4342
jobs:

.github/workflows/conflict-check.yaml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@ on:
2222

2323
permissions:
2424
contents: read
25-
pull-requests: write
2625

2726
concurrency:
2827
group: ${{ github.workflow }}
@@ -32,6 +31,9 @@ jobs:
3231
conflicts:
3332
name: Check Open PRs for Conflicts
3433
runs-on: ubuntu-latest
34+
permissions:
35+
contents: read
36+
pull-requests: write
3537
timeout-minutes: 10
3638
steps:
3739
- name: Check Mergeable State

.github/workflows/dependabot-auto-merge.yaml

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,17 +17,18 @@ name: Dependabot Auto-Merge
1717
on:
1818
pull_request_target:
1919
types: [opened, synchronize]
20-
workflow_dispatch: {}
2120

2221
permissions:
23-
contents: write
24-
pull-requests: write
22+
contents: read
2523

2624
jobs:
2725

2826
auto-merge:
2927
name: Auto-Merge Patch Updates
3028
runs-on: ubuntu-latest
29+
permissions:
30+
contents: write
31+
pull-requests: write
3132
timeout-minutes: 5
3233
if: github.actor == 'dependabot[bot]'
3334
steps:

.github/workflows/gpu-h100-inference-test.yaml

Lines changed: 7 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -80,32 +80,12 @@ jobs:
8080

8181
# --- Snapshot and validation ---
8282

83-
- name: Run aicr snapshot
84-
run: |
85-
./aicr snapshot --deploy-agent \
86-
--kubeconfig="${HOME}/.kube/config" \
87-
--namespace=default \
88-
--image=ko.local:smoke-test \
89-
--require-gpu \
90-
--output=snapshot.yaml
91-
echo "--- Snapshot output ---"
92-
cat snapshot.yaml
93-
94-
- name: Validate snapshot detected GPU
95-
run: |
96-
GPU_MODEL=$(yq eval '.measurements[] | select(.type == "GPU") | .subtypes[0].data["gpu.model"]' snapshot.yaml)
97-
GPU_COUNT=$(yq eval '.measurements[] | select(.type == "GPU") | .subtypes[0].data["gpu-count"]' snapshot.yaml)
98-
echo "GPU model: ${GPU_MODEL}"
99-
echo "GPU count: ${GPU_COUNT}"
100-
if [[ "${GPU_MODEL}" != *"H100"* ]]; then
101-
echo "::error::Expected H100 GPU in snapshot, got: ${GPU_MODEL}"
102-
exit 1
103-
fi
104-
if [[ "${GPU_COUNT}" -lt 1 ]]; then
105-
echo "::error::Expected gpu-count >= 1, got: ${GPU_COUNT}"
106-
exit 1
107-
fi
108-
echo "Snapshot correctly detected ${GPU_COUNT}x ${GPU_MODEL}"
83+
- name: Snapshot and validate GPU
84+
uses: ./.github/actions/gpu-snapshot-validate
85+
with:
86+
gpu_model: H100
87+
min_gpu_count: '1'
88+
cluster_name: ${{ env.KIND_CLUSTER_NAME }}
10989

11090
# --- Install Karpenter before validation so cluster-autoscaling check passes ---
11191

@@ -252,7 +232,7 @@ jobs:
252232
253233
- name: Upload conformance evidence
254234
if: always()
255-
uses: actions/upload-artifact@v4
235+
uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0
256236
with:
257237
name: conformance-evidence
258238
path: |

.github/workflows/gpu-h100-training-test.yaml

Lines changed: 7 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -77,32 +77,12 @@ jobs:
7777

7878
# --- Snapshot and validation ---
7979

80-
- name: Run aicr snapshot
81-
run: |
82-
./aicr snapshot --deploy-agent \
83-
--kubeconfig="${HOME}/.kube/config" \
84-
--namespace=default \
85-
--image=ko.local:smoke-test \
86-
--require-gpu \
87-
--output=snapshot.yaml
88-
echo "--- Snapshot output ---"
89-
cat snapshot.yaml
90-
91-
- name: Validate snapshot detected GPU
92-
run: |
93-
GPU_MODEL=$(yq eval '.measurements[] | select(.type == "GPU") | .subtypes[0].data["gpu.model"]' snapshot.yaml)
94-
GPU_COUNT=$(yq eval '.measurements[] | select(.type == "GPU") | .subtypes[0].data["gpu-count"]' snapshot.yaml)
95-
echo "GPU model: ${GPU_MODEL}"
96-
echo "GPU count: ${GPU_COUNT}"
97-
if [[ "${GPU_MODEL}" != *"H100"* ]]; then
98-
echo "::error::Expected H100 GPU in snapshot, got: ${GPU_MODEL}"
99-
exit 1
100-
fi
101-
if [[ "${GPU_COUNT}" -lt 2 ]]; then
102-
echo "::error::Expected gpu-count >= 2 for training, got: ${GPU_COUNT}"
103-
exit 1
104-
fi
105-
echo "Snapshot correctly detected ${GPU_COUNT}x ${GPU_MODEL}"
80+
- name: Snapshot and validate GPU
81+
uses: ./.github/actions/gpu-snapshot-validate
82+
with:
83+
gpu_model: H100
84+
min_gpu_count: '2'
85+
cluster_name: ${{ env.KIND_CLUSTER_NAME }}
10686

10787
# --- Install Karpenter before validation so cluster-autoscaling check passes ---
10888

@@ -164,7 +144,7 @@ jobs:
164144
165145
- name: Upload conformance evidence
166146
if: always()
167-
uses: actions/upload-artifact@v4
147+
uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0
168148
with:
169149
name: conformance-evidence
170150
path: |

.github/workflows/gpu-smoke-test.yaml

Lines changed: 6 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -93,32 +93,12 @@ jobs:
9393

9494
# --- Snapshot and validation ---
9595

96-
- name: Run aicr snapshot
97-
run: |
98-
./aicr snapshot --deploy-agent \
99-
--kubeconfig="${HOME}/.kube/config" \
100-
--namespace=default \
101-
--image=ko.local:smoke-test \
102-
--require-gpu \
103-
--output=snapshot.yaml
104-
echo "--- Snapshot output ---"
105-
cat snapshot.yaml
106-
107-
- name: Validate snapshot detected GPU
108-
run: |
109-
GPU_MODEL=$(yq eval '.measurements[] | select(.type == "GPU") | .subtypes[0].data["gpu.model"]' snapshot.yaml)
110-
GPU_COUNT=$(yq eval '.measurements[] | select(.type == "GPU") | .subtypes[0].data["gpu-count"]' snapshot.yaml)
111-
echo "GPU model: ${GPU_MODEL}"
112-
echo "GPU count: ${GPU_COUNT}"
113-
if [[ "${GPU_MODEL}" != *"T4"* ]]; then
114-
echo "::error::Expected T4 GPU in snapshot, got: ${GPU_MODEL}"
115-
exit 1
116-
fi
117-
if [[ "${GPU_COUNT}" -lt 1 ]]; then
118-
echo "::error::Expected gpu-count >= 1, got: ${GPU_COUNT}"
119-
exit 1
120-
fi
121-
echo "Snapshot correctly detected ${GPU_COUNT}x ${GPU_MODEL}"
96+
- name: Snapshot and validate GPU
97+
uses: ./.github/actions/gpu-snapshot-validate
98+
with:
99+
gpu_model: T4
100+
min_gpu_count: '1'
101+
cluster_name: ${{ env.KIND_CLUSTER_NAME }}
122102

123103
- name: Debug diagnostics
124104
if: failure()

.github/workflows/inactive-pr-reminder.yaml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ on:
2020
workflow_dispatch: {}
2121

2222
permissions:
23-
pull-requests: write
23+
contents: read
2424

2525
concurrency:
2626
group: ${{ github.workflow }}
@@ -30,6 +30,8 @@ jobs:
3030
remind:
3131
name: Nudge Inactive PRs
3232
runs-on: ubuntu-latest
33+
permissions:
34+
pull-requests: write
3335
timeout-minutes: 10
3436
steps:
3537
- name: Check for Inactive PRs

0 commit comments

Comments
 (0)