-
Notifications
You must be signed in to change notification settings - Fork 215
377 lines (336 loc) · 14.2 KB
/
Copy pathon-pr.yaml
File metadata and controls
377 lines (336 loc) · 14.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
# Copyright 2025 NVIDIA CORPORATION
# SPDX-License-Identifier: Apache-2.0
# Pull-request and merge-queue pipeline. Runs for opened/reopened/updated pull
# requests and for merge-queue check requests.
name: KAI Scheduler - Pull Request

on:
  pull_request:
    types: [opened, reopened, synchronize]
  merge_group:
    types: [checks_requested]

# Keep a single active run per pull request (or per merge-queue ref) and cancel
# superseded ones. The group is keyed on the PR number rather than the head
# branch name so that PRs from different forks that happen to use the same
# branch name do not cancel each other, and it is prefixed with the workflow
# name so it cannot collide with groups defined by other workflows.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

jobs:
check-build-and-test-required:
  # Gate job: exposes a `code` output that the other jobs compare against
  # 'true' to decide whether they run.
  name: Check if build and test are required
  runs-on: ubuntu-latest
  outputs:
    code: ${{ steps.filter.outputs.code }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v6

    - name: Check changed files
      id: filter
      uses: dorny/paths-filter@v3
      with:
        # "every": a changed file has to satisfy all patterns of a filter,
        # which is what lets the negated patterns under `code` exclude
        # markdown and docs/ files.
        predicate-quantifier: 'every'
        filters: |
          docs:
            - '**/*.md'
            - 'docs/**'
          code:
            - '**'
            - '!**/*.md'
            - '!docs/**'
validate-and-test:
  # Runs `make validate` and `make test`, then publishes the raw coverage
  # profile and (when available) the total coverage figure as artifacts.
  name: Validate & Test
  runs-on: ubuntu-latest
  needs:
    - check-build-and-test-required
  if: needs.check-build-and-test-required.outputs.code == 'true'
  steps:
    - name: Checkout code
      uses: actions/checkout@v6

    - name: Cache Go build cache (Docker-mounted)
      uses: actions/cache@v5
      with:
        key: go-docker-${{ runner.os }}-${{ hashFiles('go.sum') }}
        restore-keys: |
          go-docker-${{ runner.os }}-
        path: |
          ~/.cache/go-build-docker-gocache
          ~/.cache/go-build-docker-gopath

    - name: Set up Go
      uses: actions/setup-go@v6
      with:
        go-version: '1.26.3'
        cache: true

    - name: Run validation
      run: make validate

    - name: Run tests
      run: make test

    - name: Archive code coverage results
      uses: actions/upload-artifact@v7
      with:
        name: code-coverage
        path: coverage/coverage.out

    # Writes coverage/total-coverage.txt only when the profile is non-empty;
    # an empty or missing profile ends the step successfully without output.
    - name: Calculate total coverage
      run: |
        if [ ! -s coverage/coverage.out ]; then
          exit 0
        fi
        COVERAGE=$(go tool cover -func=coverage/coverage.out | grep total | grep -Eo '[0-9]+\.[0-9]+')
        echo "$COVERAGE" > coverage/total-coverage.txt

    # hashFiles() returns an empty string when the file does not exist, so
    # this upload is skipped if the previous step produced nothing.
    - name: Archive total coverage
      if: hashFiles('coverage/total-coverage.txt') != ''
      uses: actions/upload-artifact@v7
      with:
        name: total-coverage
        path: coverage/total-coverage.txt
code-coverage-report:
  # Builds a coverage summary for pull requests that target `main` and uploads
  # it, together with the PR number, as artifacts (named "...-for-comment").
  # Skipped for merge-queue runs and for documentation-only changes.
  name: Code Coverage Report
  runs-on: ubuntu-latest
  needs: [validate-and-test, check-build-and-test-required]
  if: >-
    github.event_name != 'merge_group' &&
    needs.check-build-and-test-required.outputs.code == 'true' &&
    github.base_ref == 'main'
  steps:
    - uses: fgrosse/go-coverage-report@cbeb2ab2e32591d690337146ba02a911cc566f3f
      id: coverage_reporter
      with:
        coverage-artifact-name: "code-coverage"
        coverage-file-name: "coverage.out"
        root-package: "github.com/kai-scheduler/KAI-scheduler"
        github-baseline-workflow-ref: update-coverage-badge.yaml
        skip-comment: true

    - name: Download total coverage artifact
      uses: actions/download-artifact@v8
      with:
        name: total-coverage
        path: coverage-summary

    - name: Download coverage badge branch
      uses: actions/checkout@v6
      with:
        ref: coverage-badge
        path: coverage-badge

    # Outputs: pr, baseline, delta. `baseline` and `delta` are left empty when
    # no "<digits>.<digits>%" figure can be read from the badge, instead of
    # computing a delta against an implicit 0.
    - name: Calculate coverage totals
      id: coverage_totals
      run: |
        PR_COVERAGE=$(cat coverage-summary/total-coverage.txt)
        echo "pr=$PR_COVERAGE" >> "$GITHUB_OUTPUT"
        # The pattern deliberately requires a decimal point so that other
        # percentages inside the SVG markup are not mistaken for the coverage.
        BASELINE_COVERAGE=$(grep -oE '[0-9]+\.[0-9]+%' coverage-badge/badges/coverage.svg | head -1 | tr -d '%' || true)
        echo "baseline=$BASELINE_COVERAGE" >> "$GITHUB_OUTPUT"
        if [ -n "$BASELINE_COVERAGE" ]; then
          DELTA=$(awk -v pr="$PR_COVERAGE" -v baseline="$BASELINE_COVERAGE" 'BEGIN { printf "%.2f", pr - baseline }')
        else
          echo "::warning::Could not read baseline coverage from coverage-badge/badges/coverage.svg"
          DELTA=""
        fi
        echo "delta=$DELTA" >> "$GITHUB_OUTPUT"

    - name: Save coverage report to file
      env:
        REPORT_BODY: ${{ steps.coverage_reporter.outputs.coverage_report }}
        BASELINE_COVERAGE: ${{ steps.coverage_totals.outputs.baseline }}
        PR_COVERAGE: ${{ steps.coverage_totals.outputs.pr }}
        COVERAGE_DELTA: ${{ steps.coverage_totals.outputs.delta }}
      run: |
        {
          if [ -n "$BASELINE_COVERAGE" ]; then
            echo "**Total coverage:** ${BASELINE_COVERAGE}% -> ${PR_COVERAGE}% (delta ${COVERAGE_DELTA}%)"
          else
            echo "**Total coverage:** ${PR_COVERAGE}% (baseline unavailable)"
          fi
          if [ -n "$REPORT_BODY" ]; then
            echo ""
            echo "$REPORT_BODY"
          fi
        } > coverage-report.txt

    - name: Upload coverage report
      uses: actions/upload-artifact@v7
      with:
        name: coverage-report-for-comment
        path: coverage-report.txt

    # The event value is handed over through the environment rather than being
    # expanded into the script text.
    - name: Save PR number
      env:
        PR_NUMBER: ${{ github.event.number }}
      run: echo "$PR_NUMBER" > pr_number.txt

    - name: Upload PR number
      uses: actions/upload-artifact@v7
      with:
        name: pr-number-for-comment
        path: pr_number.txt
build:
  # Builds the container images and the Helm chart for this commit and stores
  # both under /mnt/images, which is cached under the key images-<sha>.
  name: Build
  runs-on: ubuntu-latest
  needs:
    - check-build-and-test-required
  if: needs.check-build-and-test-required.outputs.code == 'true'
  outputs:
    package_version: ${{ steps.package_version.outputs.PACKAGE_VERSION }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v6

    # Version is 0.0.0-<short sha>, with leading zeros removed from the sha.
    # It is exported both to later steps (GITHUB_ENV) and as a job output.
    - name: Extract package version
      id: package_version
      run: |
        GIT_REV=$(git rev-parse --short HEAD | sed 's/^0*//')
        PACKAGE_VERSION=0.0.0-$GIT_REV
        echo "PACKAGE_VERSION=$PACKAGE_VERSION" >> $GITHUB_ENV
        echo "PACKAGE_VERSION=$PACKAGE_VERSION" >> $GITHUB_OUTPUT
        echo $PACKAGE_VERSION

    - name: Cache Go build cache (Docker-mounted)
      uses: actions/cache@v5
      with:
        key: go-docker-${{ runner.os }}-${{ hashFiles('go.sum') }}
        restore-keys: |
          go-docker-${{ runner.os }}-
        path: |
          ~/.cache/go-build-docker-gocache
          ~/.cache/go-build-docker-gopath

    - name: Set up Go
      uses: actions/setup-go@v6
      with:
        go-version: '1.26.3'
        cache: true

    # Restarts the Docker daemon with its data root pointed at /mnt.
    - name: Move Docker Data to /mnt
      run: |
        sudo systemctl stop docker
        sudo mkdir -p /mnt/docker-data
        echo '{"data-root": "/mnt/docker-data"}' | sudo tee /etc/docker/daemon.json
        sudo systemctl start docker
        docker info | grep "Docker Root Dir"

    - name: Set up Docker Buildx
      uses: docker/setup-buildx-action@v4

    - name: Create image cache directory
      run: |
        sudo mkdir -p /mnt/images
        sudo chown -R $USER:$USER /mnt/images

    - name: Cache for docker images and helm chart
      uses: actions/cache@v5
      with:
        key: images-${{ github.sha }}
        path: /mnt/images

    # Every local image whose repository:tag line contains the package
    # version is saved into one gzip-compressed archive.
    - name: Build docker images
      run: |
        make build DOCKER_BUILDX_ADDITIONAL_ARGS="--load --cache-from type=gha --cache-to type=gha,mode=max" VERSION=$PACKAGE_VERSION
        docker save $(docker images --format '{{.Repository}}:{{.Tag}}' | grep $PACKAGE_VERSION) | gzip > /mnt/images/docker_images.tgz

    - name: Build helm chart
      run: |
        helm package ./deployments/kai-scheduler -d ./charts --app-version $PACKAGE_VERSION --version $PACKAGE_VERSION
        cp charts/kai-scheduler-$PACKAGE_VERSION.tgz /mnt/images/
skip-build-and-test-message:
  # Counterpart of the gated jobs: runs only when the `code` output of the
  # gate job is not 'true' and just prints why nothing else ran.
  name: Skip Build and Test Message
  runs-on: ubuntu-latest
  needs:
    - check-build-and-test-required
  if: needs.check-build-and-test-required.outputs.code != 'true'
  steps:
    - name: Skip message
      run: |
        echo "Skipping build and test since only documentation files (.md or docs/) were changed."
e2e-tests:
  # Installs the freshly built chart into an e2e cluster, runs every e2e suite
  # except those labelled autoscale/scale/upgrade, then verifies that the
  # release uninstalls cleanly.
  name: Run E2E Tests
  runs-on: ubuntu-latest
  needs:
    - build
    - check-build-and-test-required
  if: needs.check-build-and-test-required.outputs.code == 'true'
  permissions:
    actions: write
  steps:
    - name: Checkout code
      uses: actions/checkout@v6

    - name: Setup e2e cluster
      uses: ./.github/actions/setup-e2e-cluster
      with:
        package_version: ${{ needs.build.outputs.package_version }}

    # Best effort (continue-on-error): removes the images-<sha> cache entry
    # written by the build job.
    # NOTE(review): e2e-upgrade-tests runs in parallel and presumably relies
    # on the same cache entry through setup-e2e-cluster - confirm that
    # deleting it here cannot race with that job or break job re-runs.
    - name: Delete restored cache
      continue-on-error: true
      env:
        GH_TOKEN: ${{ github.token }}
      run: |
        gh cache delete "images-${{ github.sha }}" --repo ${{ github.repository }}

    - name: Install KAI-scheduler
      env:
        PACKAGE_VERSION: ${{ needs.build.outputs.package_version }}
      run: |
        helm upgrade -i kai-scheduler /mnt/images/kai-scheduler-$PACKAGE_VERSION.tgz -n kai-scheduler --create-namespace \
          --set "global.gpuSharing=true" --set "global.registry=localhost:30100" --set "prometheus.enabled=true" --debug --wait
        kubectl create clusterrole pods-patcher --verb=patch --resource=pods
        kubectl create rolebinding fake-status-updater --clusterrole=pods-patcher --serviceaccount=gpu-operator:status-updater -n kai-resource-reservation

    # After the run, the JSON report is scanned for specs in state "skipped"
    # and their full names are printed.
    - name: Run e2e tests
      run: |
        ginkgo -r --keep-going --randomize-all --randomize-suites --trace -vv --label-filter '!autoscale && !scale && !upgrade' --output-dir=. --json-report=e2e-report.json ./test/e2e/suites
        echo ""
        echo "=== Skipped Tests ==="
        jq -r '.[].SpecReports[] | select(.State == "skipped") | ([.ContainerHierarchyTexts[], .LeafNodeText] | join(" > "))' e2e-report.json 2>/dev/null || echo "No skipped tests found"

    # Polls 12 times at 5 second intervals. Pods matching EXCLUDED_PODS and
    # pods already in Terminating state are ignored; anything else still
    # present after the last attempt fails the step.
    - name: Uninstall KAI-scheduler
      run: |
        helm uninstall kai-scheduler -n kai-scheduler
        echo "Waiting up to 60 seconds for pods to terminate..."
        EXCLUDED_PODS="prometheus"
        for i in {1..12}; do
          EXCLUDE_PATTERN=$(echo "$EXCLUDED_PODS" | tr ' ' '|')
          NON_TERM=$(kubectl get pods -n kai-scheduler --no-headers 2>/dev/null | grep -v Terminating | grep -vE "$EXCLUDE_PATTERN" | wc -l)
          if [ "$NON_TERM" -eq 0 ]; then
            echo "Only Terminating pods remain or no pods left (excluding: $EXCLUDED_PODS). Safe to proceed."
            exit 0
          fi
          echo "Found $NON_TERM non-terminating pods (excluding: $EXCLUDED_PODS)... waiting (attempt $i/12)"
          sleep 5
        done
        echo "Pods did not terminate within 60 seconds. Uninstall incomplete."
        exit 1
e2e-upgrade-tests:
  # Installs a previously released chart from the OCI registry and then runs
  # the upgrade suite, which is pointed at the chart built for this commit via
  # UPGRADE_CHART_PATH. All cluster steps are skipped when no suitable previous
  # release can be resolved.
  name: Run E2E Upgrade Tests
  needs: [build, check-build-and-test-required]
  if: needs.check-build-and-test-required.outputs.code == 'true'
  runs-on: ubuntu-latest
  permissions:
    actions: write
  steps:
    - name: Checkout code
      uses: actions/checkout@v6

    # Outputs: skip ('true'/'false') and upgrade_from (release tag).
    - name: Resolve upgrade-from version
      id: resolve_version
      env:
        GH_TOKEN: ${{ github.token }}
        # The branch name reaches the script through the environment instead
        # of being expanded into the script text, so its content can never be
        # interpreted as shell. github.base_ref is only populated for
        # pull_request events; merge-queue runs take the target branch from
        # the merge_group payload so that release branches are still
        # recognised there.
        TARGET_BRANCH: ${{ github.base_ref || github.event.merge_group.base_ref }}
      run: |
        if [[ "$TARGET_BRANCH" =~ v([0-9]+)\.([0-9]+) ]]; then
          # Version branch: upgrade from the latest release of the previous minor
          MAJOR="${BASH_REMATCH[1]}"
          MINOR="${BASH_REMATCH[2]}"
          if [ "$MINOR" -eq 0 ]; then
            echo "No previous minor version exists. Skipping upgrade tests."
            echo "skip=true" >> "$GITHUB_OUTPUT"
            exit 0
          fi
          PREV_MINOR="${MAJOR}.$((MINOR - 1))"
          UPGRADE_FROM=$(gh release list --limit 200 --json tagName -q '.[].tagName' | grep -E "^v${PREV_MINOR}\.[0-9]+$" | sort -V | tail -1)
          if [ -z "$UPGRADE_FROM" ]; then
            echo "No release found for v${PREV_MINOR}.x. Skipping upgrade tests."
            echo "skip=true" >> "$GITHUB_OUTPUT"
            exit 0
          fi
        else
          # Main branch: upgrade from the latest release
          UPGRADE_FROM=$(gh release list --limit 200 --json tagName -q '.[].tagName' | grep -E '^v[0-9]+\.[0-9]+\.[0-9]+$' | sort -V | tail -1)
          if [ -z "$UPGRADE_FROM" ]; then
            echo "No releases found. Skipping upgrade tests."
            echo "skip=true" >> "$GITHUB_OUTPUT"
            exit 0
          fi
        fi
        echo "Upgrading from $UPGRADE_FROM"
        echo "upgrade_from=$UPGRADE_FROM" >> "$GITHUB_OUTPUT"
        echo "skip=false" >> "$GITHUB_OUTPUT"

    - name: Setup e2e cluster
      if: steps.resolve_version.outputs.skip != 'true'
      uses: ./.github/actions/setup-e2e-cluster
      with:
        package_version: ${{ needs.build.outputs.package_version }}

    - name: Install previous version of KAI-scheduler
      if: steps.resolve_version.outputs.skip != 'true'
      env:
        UPGRADE_FROM_VERSION: ${{ steps.resolve_version.outputs.upgrade_from }}
      run: |
        echo "Installing kai-scheduler $UPGRADE_FROM_VERSION from OCI registry..."
        helm upgrade -i kai-scheduler oci://ghcr.io/kai-scheduler/kai-scheduler/kai-scheduler -n kai-scheduler --create-namespace \
          --set "global.gpuSharing=true" --wait --version "$UPGRADE_FROM_VERSION"
        kubectl create clusterrole pods-patcher --verb=patch --resource=pods
        kubectl create rolebinding fake-status-updater --clusterrole=pods-patcher --serviceaccount=gpu-operator:status-updater -n kai-resource-reservation

    - name: Run upgrade e2e tests
      if: steps.resolve_version.outputs.skip != 'true'
      env:
        UPGRADE_CHART_PATH: /mnt/images/kai-scheduler-${{ needs.build.outputs.package_version }}.tgz
      run: |
        ginkgo -r --keep-going --trace -vv --label-filter 'upgrade' --output-dir=. --json-report=e2e-upgrade-report.json ./test/e2e/suites/upgrade
fossa-validate:
  # License scan. Marked continue-on-error, so a failing scan does not fail
  # the workflow run.
  name: FOSSA license check
  runs-on: ubuntu-latest
  needs:
    - build
    - check-build-and-test-required
  if: needs.check-build-and-test-required.outputs.code == 'true'
  continue-on-error: true
  env:
    # Deliberately committed: this is a push-only token that is safe to be
    # exposed; see
    # https://github.com/fossa-contrib/fossa-action?tab=readme-ov-file#push-only-api-token
    FOSSA_API_KEY: '577e3d21c48454822ae8ea496209a505'
  steps:
    - name: Checkout code
      uses: actions/checkout@v6

    - name: Run FOSSA analysis and validate status
      uses: fossa-contrib/fossa-action@v3.0.1
      with:
        fossa-api-key: ${{ env.FOSSA_API_KEY }}
        skip-test: false