KAI-Scheduler/.github/workflows/on-pr.yaml at 7d3c6e97116e62f2d8a7df604d839c704a232087 · kai-scheduler/KAI-Scheduler · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
# Copyright 2025 NVIDIA CORPORATION
# SPDX-License-Identifier: Apache-2.0

# Pull-request gate: runs on PR activity and on merge-queue check requests.
name: KAI Scheduler - Pull Request
on:
  # A pull request was opened, reopened, or received new commits.
  pull_request:
    types:
      - opened
      - reopened
      - synchronize
  # The merge queue asked for checks on a merge group.
  merge_group:
    types:
      - checks_requested

# Keep a single active run per pull request (or per merge-queue ref); a newer
# run cancels the one in progress.
# The group is keyed on the PR number instead of the head branch name because
# branch names are not unique across forks (two fork PRs that both come from
# "main" would otherwise cancel each other). It is prefixed with the workflow
# name so that no other workflow can end up in the same group.
# For merge_group events there is no pull_request payload, so the expression
# falls back to github.ref (the merge-queue branch ref).
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

jobs:
  # Gate job: reports whether the change set touches anything other than
  # documentation, so the heavier jobs can be skipped for docs-only changes.
  check-build-and-test-required:
    name: Check if build and test are required
    runs-on: ubuntu-latest
    outputs:
      # Result of the "code" filter below, as a string.
      code: ${{ steps.filter.outputs.code }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v6

      - name: Check changed files
        id: filter
        uses: dorny/paths-filter@v3
        with:
          # With "every", a file counts for a filter only when it satisfies all
          # of that filter's patterns; this is what lets the negated patterns
          # under "code" behave as exclusions.
          # NOTE(review): only the "code" result is exported by this job; the
          # "docs" filter is not consumed anywhere in this workflow.
          predicate-quantifier: every
          filters: |
            docs:
              - '**/*.md'
              - 'docs/**'
            code:
              - '**'
              - '!**/*.md'
              - '!docs/**'

  # Runs `make validate` and `make test`, then publishes the coverage profile
  # and the total coverage percentage as artifacts ("code-coverage" and
  # "total-coverage") for the code-coverage-report job.
  validate-and-test:
    needs: [ check-build-and-test-required ]
    if: needs.check-build-and-test-required.outputs.code == 'true'
    name: Validate & Test
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v6

      # Keyed on go.sum, with a prefix fallback so a stale cache can still be
      # restored after dependency changes.
      # NOTE(review): these directories are presumably mounted into the
      # Docker-based make targets — confirm against the Makefile.
      - name: Cache Go build cache (Docker-mounted)
        uses: actions/cache@v5
        with:
          path: |
            ~/.cache/go-build-docker-gocache
            ~/.cache/go-build-docker-gopath
          key: go-docker-${{ runner.os }}-${{ hashFiles('go.sum') }}
          restore-keys: |
            go-docker-${{ runner.os }}-

      - name: Set up Go
        uses: actions/setup-go@v6
        with:
          go-version: '1.26.3'
          cache: true

      - name: Run validation
        run: make validate

      - name: Run tests
        run: make test

      - name: Archive code coverage results
        uses: actions/upload-artifact@v7
        with:
          name: code-coverage
          path: coverage/coverage.out

      - name: Calculate total coverage
        run: |
          # Nothing to summarize when the profile is missing or empty; the
          # "Archive total coverage" step is then skipped as well.
          if [ ! -s coverage/coverage.out ]; then
            exit 0
          fi

          # Read only the summary row ("total: (statements) NN.N%"). An
          # unanchored `grep total` would also match every file or function
          # whose name contains "total" and produce several numbers.
          COVERAGE=$(go tool cover -func=coverage/coverage.out | grep '^total:' | grep -Eo '[0-9]+\.[0-9]+')
          echo "$COVERAGE" > coverage/total-coverage.txt

      # hashFiles() returns an empty string when the file does not exist, so
      # this step only runs if the previous step produced the summary file.
      - name: Archive total coverage
        if: hashFiles('coverage/total-coverage.txt') != ''
        uses: actions/upload-artifact@v7
        with:
          name: total-coverage
          path: coverage/total-coverage.txt

  # Builds the coverage comment for pull requests that target main: compares
  # the PR's total coverage with the value in the coverage badge, and uploads
  # the report text and the PR number as artifacts
  # ("coverage-report-for-comment", "pr-number-for-comment").
  # NOTE(review): the artifacts are presumably consumed by a separate workflow
  # that posts the comment — that workflow is not part of this file.
  code-coverage-report:
    name: Code Coverage Report
    runs-on: ubuntu-latest
    needs: [ validate-and-test, check-build-and-test-required ]
    if: github.event_name != 'merge_group' && needs.check-build-and-test-required.outputs.code == 'true' && github.base_ref == 'main'
    steps:
      # skip-comment: the report is only exposed as a step output here.
      - uses: fgrosse/go-coverage-report@cbeb2ab2e32591d690337146ba02a911cc566f3f
        id: coverage_reporter
        with:
          coverage-artifact-name: "code-coverage"
          coverage-file-name: "coverage.out"
          root-package: "github.com/kai-scheduler/KAI-scheduler"
          github-baseline-workflow-ref: update-coverage-badge.yaml
          skip-comment: true
      - name: Download total coverage artifact
        uses: actions/download-artifact@v8
        with:
          name: total-coverage
          path: coverage-summary
      - name: Download coverage badge branch
        uses: actions/checkout@v6
        with:
          ref: coverage-badge
          path: coverage-badge
      - name: Calculate coverage totals
        id: coverage_totals
        run: |
          # Total coverage of this PR, as written by validate-and-test.
          PR_COVERAGE=$(cat coverage-summary/total-coverage.txt)
          echo "pr=$PR_COVERAGE" >> "$GITHUB_OUTPUT"

          # Baseline: first "NN.N%" occurrence in the badge SVG.
          BASELINE_COVERAGE=$(grep -oE '[0-9]+\.[0-9]+%' coverage-badge/badges/coverage.svg | head -1 | tr -d '%')
          echo "baseline=$BASELINE_COVERAGE" >> "$GITHUB_OUTPUT"

          # awk is used for the subtraction because the values are decimals.
          DELTA=$(awk -v pr="$PR_COVERAGE" -v baseline="$BASELINE_COVERAGE" 'BEGIN { printf "%.2f", pr - baseline }')
          echo "delta=$DELTA" >> "$GITHUB_OUTPUT"
      - name: Save coverage report to file
        env:
          REPORT_BODY: ${{ steps.coverage_reporter.outputs.coverage_report }}
          BASELINE_COVERAGE: ${{ steps.coverage_totals.outputs.baseline }}
          PR_COVERAGE: ${{ steps.coverage_totals.outputs.pr }}
          COVERAGE_DELTA: ${{ steps.coverage_totals.outputs.delta }}
        run: |
          {
            echo "**Total coverage:** ${BASELINE_COVERAGE}% -> ${PR_COVERAGE}% (delta ${COVERAGE_DELTA}%)"
            if [ -n "$REPORT_BODY" ]; then
              echo ""
              echo "$REPORT_BODY"
            fi
          } > coverage-report.txt
      - name: Upload coverage report
        uses: actions/upload-artifact@v7
        with:
          name: coverage-report-for-comment
          path: coverage-report.txt
      # The event value is handed to the script through the environment rather
      # than being expanded into the script text, matching the other steps of
      # this job and GitHub's hardening guidance for `run:` scripts.
      - name: Save PR number
        env:
          PR_NUMBER: ${{ github.event.number }}
        run: echo "$PR_NUMBER" > pr_number.txt
      - name: Upload PR number
        uses: actions/upload-artifact@v7
        with:
          name: pr-number-for-comment
          path: pr_number.txt

  # Builds the docker images and the helm chart for version
  # 0.0.0-<short sha> and stores both under /mnt/images, which is saved to the
  # actions cache under the key images-<github.sha>.
  # Output: package_version — the version string used for images and chart.
  build:
    needs: [ check-build-and-test-required ]
    if: needs.check-build-and-test-required.outputs.code == 'true'
    name: Build
    runs-on: ubuntu-latest
    outputs:
      package_version: ${{ steps.package_version.outputs.PACKAGE_VERSION }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v6

      - name: Extract package version
        id: package_version
        run: |
          # Leading zeros are stripped from the short SHA.
          # NOTE(review): presumably so that an all-digit SHA remains a valid
          # SemVer pre-release identifier — confirm.
          GIT_REV=$(git rev-parse --short HEAD | sed 's/^0*//')
          PACKAGE_VERSION=0.0.0-$GIT_REV
          # Exported both to later steps of this job (env) and to dependent
          # jobs (step output).
          echo "PACKAGE_VERSION=$PACKAGE_VERSION" >> "$GITHUB_ENV"
          echo "PACKAGE_VERSION=$PACKAGE_VERSION" >> "$GITHUB_OUTPUT"
          echo "$PACKAGE_VERSION"

      - name: Cache Go build cache (Docker-mounted)
        uses: actions/cache@v5
        with:
          path: |
            ~/.cache/go-build-docker-gocache
            ~/.cache/go-build-docker-gopath
          key: go-docker-${{ runner.os }}-${{ hashFiles('go.sum') }}
          restore-keys: |
            go-docker-${{ runner.os }}-

      - name: Set up Go
        uses: actions/setup-go@v6
        with:
          go-version: '1.26.3'
          cache: true

      # Restart the Docker daemon with its data root on /mnt.
      # NOTE(review): presumably for the extra disk space on /mnt — confirm.
      - name: Move Docker Data to /mnt
        run: |
          sudo systemctl stop docker
          sudo mkdir -p /mnt/docker-data
          echo '{"data-root": "/mnt/docker-data"}' | sudo tee /etc/docker/daemon.json
          sudo systemctl start docker
          docker info | grep "Docker Root Dir"

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v4

      - name: Create image cache directory
        run: |
          sudo mkdir -p /mnt/images
          sudo chown -R "$USER:$USER" /mnt/images

      - name: Cache for docker images and helm chart
        uses: actions/cache@v5
        with:
          path: /mnt/images
          key: images-${{ github.sha }}

      - name: Build docker images
        run: |
          make build DOCKER_BUILDX_ADDITIONAL_ARGS="--load --cache-from type=gha --cache-to type=gha,mode=max" VERSION="$PACKAGE_VERSION"
          # Select the freshly built images by tag. -F treats the version as a
          # literal string (its dots would otherwise be regex wildcards).
          # The command substitution is left unquoted on purpose: each image
          # reference must become a separate argument to `docker save`.
          docker save $(docker images --format '{{.Repository}}:{{.Tag}}' | grep -F -- "$PACKAGE_VERSION") | gzip > /mnt/images/docker_images.tgz

      - name: Build helm chart
        run: |
          helm package ./deployments/kai-scheduler -d ./charts --app-version "$PACKAGE_VERSION" --version "$PACKAGE_VERSION"
          cp "charts/kai-scheduler-$PACKAGE_VERSION.tgz" /mnt/images/

  # Counterpart of the gated jobs: runs only when the "code" filter did not
  # match, and just logs why build and test were skipped.
  skip-build-and-test-message:
    name: Skip Build and Test Message
    runs-on: ubuntu-latest
    needs:
      - check-build-and-test-required
    if: needs.check-build-and-test-required.outputs.code != 'true'
    steps:
      - name: Skip message
        run: echo "Skipping build and test since only documentation files (.md or docs/) were changed."

  # End-to-end suite: installs the chart produced by the build job into a test
  # cluster, runs every e2e spec except those labelled autoscale, scale or
  # upgrade, and finally verifies that the release uninstalls cleanly.
  e2e-tests:
    name: Run E2E Tests
    needs: [ build, check-build-and-test-required ]
    if: needs.check-build-and-test-required.outputs.code == 'true'
    runs-on: ubuntu-latest
    # NOTE(review): actions: write is presumably required by the
    # `gh cache delete` step below — confirm.
    permissions:
      actions: write
    steps:
      - name: Checkout code
        uses: actions/checkout@v6

      # Local composite action (defined outside this file).
      # NOTE(review): presumably creates the cluster and restores /mnt/images
      # from the images-<sha> cache written by the build job — confirm.
      - name: Setup e2e cluster
        uses: ./.github/actions/setup-e2e-cluster
        with:
          package_version: ${{ needs.build.outputs.package_version }}

      # Removes the cache entry keyed images-<github.sha>. Best effort: a
      # failure here does not fail the job (continue-on-error).
      # NOTE(review): e2e-upgrade-tests also depends on build and reads a
      # chart from /mnt/images; if it restores the same cache entry, deleting
      # it here could race with that job — verify.
      - name: Delete restored cache
        continue-on-error: true
        env:
          GH_TOKEN: ${{ github.token }}
        run: |
          gh cache delete "images-${{ github.sha }}" --repo ${{ github.repository }}

      # Installs the chart built for this commit, pulling images from
      # localhost:30100, then grants the gpu-operator:status-updater service
      # account permission to patch pods in kai-resource-reservation.
      - name: Install KAI-scheduler
        env:
          PACKAGE_VERSION: ${{ needs.build.outputs.package_version }}
        run: |
          helm upgrade -i kai-scheduler /mnt/images/kai-scheduler-$PACKAGE_VERSION.tgz -n kai-scheduler --create-namespace \
            --set "global.gpuSharing=true" --set "global.registry=localhost:30100" --set "prometheus.enabled=true" --debug --wait
          kubectl create clusterrole pods-patcher --verb=patch --resource=pods
          kubectl create rolebinding fake-status-updater --clusterrole=pods-patcher --serviceaccount=gpu-operator:status-updater -n kai-resource-reservation

      # Runs the suites in randomized order and writes e2e-report.json; the
      # skipped specs are then listed from that report with jq.
      - name: Run e2e tests
        run: |
          ginkgo -r --keep-going --randomize-all --randomize-suites --trace -vv --label-filter '!autoscale && !scale && !upgrade' --output-dir=. --json-report=e2e-report.json ./test/e2e/suites
          echo ""
          echo "=== Skipped Tests ==="
          jq -r '.[].SpecReports[] | select(.State == "skipped") | ([.ContainerHierarchyTexts[], .LeafNodeText] | join(" > "))' e2e-report.json 2>/dev/null || echo "No skipped tests found"

      # Uninstalls the release, then polls the namespace up to 12 times, 5
      # seconds apart. Pods whose name matches EXCLUDED_PODS and pods already
      # in Terminating state are ignored; the step fails if any other pod is
      # still present after the last attempt.
      - name: Uninstall KAI-scheduler
        run: |
          helm uninstall kai-scheduler -n kai-scheduler
          echo "Waiting up to 60 seconds for pods to terminate..."
          EXCLUDED_PODS="prometheus"
          for i in {1..12}; do
            EXCLUDE_PATTERN=$(echo "$EXCLUDED_PODS" | tr ' ' '|')
            NON_TERM=$(kubectl get pods -n kai-scheduler --no-headers 2>/dev/null | grep -v Terminating | grep -vE "$EXCLUDE_PATTERN" | wc -l)
            if [ "$NON_TERM" -eq 0 ]; then
              echo "Only Terminating pods remain or no pods left (excluding: $EXCLUDED_PODS). Safe to proceed."
              exit 0
            fi
            echo "Found $NON_TERM non-terminating pods (excluding: $EXCLUDED_PODS)... waiting (attempt $i/12)"
            sleep 5
          done
          echo "Pods did not terminate within 60 seconds. Uninstall incomplete."
          exit 1

  # Upgrade suite: installs a previously released chart from the OCI registry,
  # then runs the specs labelled "upgrade" with UPGRADE_CHART_PATH pointing at
  # the chart built for this commit. All steps after version resolution are
  # skipped when no suitable release exists.
  e2e-upgrade-tests:
    name: Run E2E Upgrade Tests
    needs: [ build, check-build-and-test-required ]
    if: needs.check-build-and-test-required.outputs.code == 'true'
    runs-on: ubuntu-latest
    permissions:
      actions: write
    steps:
      - name: Checkout code
        uses: actions/checkout@v6

      # Step outputs:
      #   skip         - 'true' when there is nothing to upgrade from
      #   upgrade_from - release tag to install first (only when skip=false)
      - name: Resolve upgrade-from version
        id: resolve_version
        env:
          GH_TOKEN: ${{ github.token }}
          # github.base_ref is only populated for pull_request events. In the
          # merge queue the target branch comes from the merge_group payload
          # (as a full ref); without this fallback every merge-queue run would
          # be treated as targeting main. The value is passed through env
          # instead of being expanded into the script text.
          TARGET_BRANCH: ${{ github.base_ref || github.event.merge_group.base_ref }}
        run: |
          # Normalize "refs/heads/<branch>" (merge queue) to "<branch>".
          TARGET_BRANCH="${TARGET_BRANCH#refs/heads/}"
          if [[ "$TARGET_BRANCH" =~ v([0-9]+)\.([0-9]+) ]]; then
            # Version branch: upgrade from the latest release of the previous minor
            MAJOR="${BASH_REMATCH[1]}"
            MINOR="${BASH_REMATCH[2]}"
            if [ "$MINOR" -eq 0 ]; then
              echo "No previous minor version exists. Skipping upgrade tests."
              echo "skip=true" >> "$GITHUB_OUTPUT"
              exit 0
            fi
            PREV_MINOR="${MAJOR}.$((MINOR - 1))"
            # sort -V orders the tags as versions, so tail -1 is the newest patch.
            UPGRADE_FROM=$(gh release list --limit 200 --json tagName -q '.[].tagName' | grep -E "^v${PREV_MINOR}\.[0-9]+$" | sort -V | tail -1)
            if [ -z "$UPGRADE_FROM" ]; then
              echo "No release found for v${PREV_MINOR}.x. Skipping upgrade tests."
              echo "skip=true" >> "$GITHUB_OUTPUT"
              exit 0
            fi
          else
            # Main branch: upgrade from the latest release
            UPGRADE_FROM=$(gh release list --limit 200 --json tagName -q '.[].tagName' | grep -E '^v[0-9]+\.[0-9]+\.[0-9]+$' | sort -V | tail -1)
            if [ -z "$UPGRADE_FROM" ]; then
              echo "No releases found. Skipping upgrade tests."
              echo "skip=true" >> "$GITHUB_OUTPUT"
              exit 0
            fi
          fi

          echo "Upgrading from $UPGRADE_FROM"
          echo "upgrade_from=$UPGRADE_FROM" >> "$GITHUB_OUTPUT"
          echo "skip=false" >> "$GITHUB_OUTPUT"

      # Local composite action (defined outside this file).
      - name: Setup e2e cluster
        if: steps.resolve_version.outputs.skip != 'true'
        uses: ./.github/actions/setup-e2e-cluster
        with:
          package_version: ${{ needs.build.outputs.package_version }}

      # Installs the resolved release from the OCI registry, then grants the
      # gpu-operator:status-updater service account permission to patch pods
      # in kai-resource-reservation.
      - name: Install previous version of KAI-scheduler
        if: steps.resolve_version.outputs.skip != 'true'
        env:
          UPGRADE_FROM_VERSION: ${{ steps.resolve_version.outputs.upgrade_from }}
        run: |
          echo "Installing kai-scheduler $UPGRADE_FROM_VERSION from OCI registry..."
          helm upgrade -i kai-scheduler oci://ghcr.io/kai-scheduler/kai-scheduler/kai-scheduler -n kai-scheduler --create-namespace \
            --set "global.gpuSharing=true" --wait --version "$UPGRADE_FROM_VERSION"
          kubectl create clusterrole pods-patcher --verb=patch --resource=pods
          kubectl create rolebinding fake-status-updater --clusterrole=pods-patcher --serviceaccount=gpu-operator:status-updater -n kai-resource-reservation

      # NOTE(review): UPGRADE_CHART_PATH is presumably read by the upgrade
      # suite to locate the chart to upgrade to — confirm in test/e2e.
      - name: Run upgrade e2e tests
        if: steps.resolve_version.outputs.skip != 'true'
        env:
          UPGRADE_CHART_PATH: /mnt/images/kai-scheduler-${{ needs.build.outputs.package_version }}.tgz
        run: |
          ginkgo -r --keep-going --trace -vv --label-filter 'upgrade' --output-dir=. --json-report=e2e-upgrade-report.json ./test/e2e/suites/upgrade

  # License compliance check via FOSSA. The job is advisory: with
  # continue-on-error a failure here does not fail the workflow run.
  fossa-validate:
    name: FOSSA license check
    needs:
      - build
      - check-build-and-test-required
    if: needs.check-build-and-test-required.outputs.code == 'true'
    runs-on: ubuntu-latest
    continue-on-error: true
    env:
      # Committed in clear text on purpose: this is a push-only token, see
      # https://github.com/fossa-contrib/fossa-action?tab=readme-ov-file#push-only-api-token
      FOSSA_API_KEY: 577e3d21c48454822ae8ea496209a505 # push-only token, safe to expose
    steps:
      - name: Checkout code
        uses: actions/checkout@v6

      - name: Run FOSSA analysis and validate status
        uses: fossa-contrib/fossa-action@v3.0.1
        with:
          skip-test: false
          fossa-api-key: ${{ env.FOSSA_API_KEY }}