global-capacity-orchestrator-on-aws/.github/workflows/unit-tests.yml at 86400c63d54377fc12a9a16ad80e33afa8dac74c · awslabs/global-capacity-orchestrator-on-aws · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
# =============================================================================
# unit-tests.yml — Unit tests & fast smoke tests
# =============================================================================
#
# Maps to the "Unit Tests" row of the README badge table.
#
# Triggers: push: main, pull_request, workflow_dispatch.
#
# Jobs (alphabetical by display name):
#   - unit:bats:shell              — BATS shell tests
#   - unit:cdk:config-matrix       — tests/test_cdk_synthesis_matrix.py (in-process, serial)
#   - unit:cdk:nag-compliance      — cdk-nag compliance across full config matrix
#   - unit:cdk:synth               — cdk synth of the default config
#   - unit:cli:smoke               — gco --help and all subcommand help pages
#   - unit:fresh-install           — pip install from scratch, verify imports
#   - unit:lockfile:freshness      — requirements-lock.txt matches pyproject.toml
#   - unit:pages:deploy            — publish htmlcov/ (coverage report + badge) to Pages
#   - unit:pytest:core             — pytest (coverage target 90%) + upload Pages artifact
#   - unit:workload:imports        — K8s service import + circular-import detection
#
# =============================================================================

# Workflow display name.
name: Unit Tests

# Triggers: pushes to main, every pull request, and manual dispatch.
on:
  push:
    branches:
      - main
  pull_request:
  workflow_dispatch:

# One active run per workflow + ref; a newer run cancels the one in flight.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

# Workflow-wide token scope: read-only repository contents. A job that
# declares its own `permissions:` block replaces this set for that job.
permissions:
  contents: read

jobs:
  # --------------------------------------------------------------------------
  # Path filter used by the lockfile / fresh-install jobs below. Both of those
  # only exercise pyproject.toml + requirements-lock.txt (plus the workflow
  # file itself), so running them on every docs- or code-only PR is waste —
  # especially unit:fresh-install, which intentionally disables the pip cache.
  #
  # On push to main and workflow_dispatch the filter is skipped and the
  # dependent jobs always run: we want at least one guaranteed green run per
  # merge. On PRs, dorny/paths-filter queries the GitHub API (no checkout
  # needed) for the changed files and gates the jobs via their `if:`.
  # --------------------------------------------------------------------------
  changes:
    name: "changes"
    runs-on: ubuntu-latest
    timeout-minutes: 2
    # Only pull requests are filtered; on other events this job is skipped.
    if: github.event_name == 'pull_request'
    # dorny/paths-filter lists the PR's changed files through the GitHub
    # REST API, and its README documents `pull-requests: read` as a
    # required permission for that mode. The workflow-level block only
    # grants `contents: read`, so grant it explicitly here. A job-level
    # `permissions:` block replaces the workflow-level set, which is fine
    # because this job never checks out the repo.
    permissions:
      pull-requests: read
    outputs:
      # 'true' when any path listed under the `deps` filter changed.
      deps: ${{ steps.filter.outputs.deps }}
    steps:
      - uses: dorny/paths-filter@v4
        id: filter
        with:
          filters: |
            deps:
              - 'pyproject.toml'
              - 'requirements-lock.txt'
              - '.github/workflows/unit-tests.yml'

  unit-pytest-core:
    name: "unit:pytest:core"
    runs-on: ubuntu-latest
    timeout-minutes: 20
    # Several tests import Lambda handler modules that construct boto3
    # clients at module scope (lambda/api-gateway-proxy/proxy_utils.py,
    # lambda/cross-region-aggregator/handler.py). boto3 requires a region
    # even for imports, so set a default for the whole job. Matches the
    # AWS_DEFAULT_REGION=us-east-1 env from the retired GitLab pipeline.
    env:
      AWS_DEFAULT_REGION: us-east-1
      AWS_REGION: us-east-1
    steps:
      - uses: actions/checkout@v6
      - uses: actions/setup-node@v6
        with:
          node-version: "24"
      - uses: actions/setup-python@v6
        with:
          python-version: "3.14"
          cache: "pip"
          cache-dependency-path: "requirements-lock.txt"
      - name: Install dependencies
        run: |
          pip install -e ".[dev,mcp]"
      - uses: ./.github/actions/build-lambda-package
      - name: Run pytest with coverage
        # -n auto distributes tests across all available CPU cores via
        # pytest-xdist. Every test is xdist-safe because
        # tests/conftest.py::_neutralize_lambda_build patches
        # StackManager._ensure_lambda_build and _rebuild_lambda_packages
        # so tests can't rebuild the real lambda/kubectl-applier-simple-build
        # tree mid-run — that rebuild is what CDK's Code.from_asset() races
        # against when two workers synthesize stacks concurrently. The
        # session-wide patch guards on `project_root` so the handful of
        # tests that legitimately exercise these methods against a
        # `tmp_path` keep working.
        #
        # --dist=load (xdist's default) round-robins individual test items
        # across workers. --maxfail=1 matches the previous -x "stop at
        # first failure" behavior — `-x` itself isn't compatible with xdist.
        run: |
          pytest tests/ -v \
            --ignore=tests/test_integration.py \
            --ignore=tests/test_nag_compliance.py \
            --ignore=tests/test_cdk_synthesis_matrix.py \
            --cov=gco --cov=cli --cov=mcp \
            --cov-report=xml --cov-report=html --cov-report=json \
            --cov-report=term-missing \
            --cov-fail-under=90 \
            --junitxml=report.xml \
            -n auto --maxfail=1
      - name: Upload coverage artifacts
        # `!cancelled()` keeps the upload on both success and failure, so
        # a red run still publishes its reports. Unlike `always()`, it
        # skips the upload when the run was cancelled — for example when
        # superseded through the workflow's `cancel-in-progress`
        # concurrency group.
        if: ${{ !cancelled() }}
        uses: actions/upload-artifact@v7
        with:
          name: pytest-coverage
          path: |
            htmlcov/
            coverage.xml
            coverage.json
            report.xml
          retention-days: 7
      - name: Generate shields.io badge endpoint JSON
        if: github.event_name == 'push' && github.ref == 'refs/heads/main'
        run: |
          set -euo pipefail
          # coverage.json is written by coverage.py. Pull totals.percent_covered,
          # round to one decimal, and emit the shields.io endpoint badge
          # schema so the README badge can link to a static JSON file on
          # GitHub Pages without calling a third-party service.
          #
          # Color threshold matches the --cov-fail-under value (90%) from
          # the pytest step above: anything below that would have already
          # failed CI, so the badge only has two meaningful states.
          #
          # The JSON write is done entirely in Python (heredoc + json.dump)
          # rather than piped back through shell — earlier attempts to
          # interpolate a Python f-string result into shell printf ran into
          # PEP 498's ban on backslash escapes inside f-string `{ }`
          # expressions, producing a `SyntaxError: unexpected character
          # after line continuation character` at CI time.
          python3 <<'PY'
          import json
          with open("coverage.json") as f:
              pct = json.load(f)["totals"]["percent_covered"]
          color = "brightgreen" if pct >= 90 else "red"
          badge = {
              "schemaVersion": 1,
              "label": "coverage",
              "message": f"{pct:.1f}%",
              "color": color,
          }
          with open("htmlcov/coverage-badge.json", "w") as f:
              json.dump(badge, f)
          print(f"Coverage: {pct:.1f}% ({color})")
          PY
      - name: Upload coverage report as Pages artifact
        if: github.event_name == 'push' && github.ref == 'refs/heads/main'
        uses: actions/upload-pages-artifact@v5
        with:
          # htmlcov/ now contains index.html (coverage.py's report root)
          # plus coverage-badge.json. GitHub Pages serves this at
          # https://awslabs.github.io/<repo>/ once the deploy job below
          # finalizes it. The README badge and report link both point
          # under that base URL.
          path: htmlcov

  unit-pages-deploy:
    name: "unit:pages:deploy"
    # Deploys the htmlcov Pages artifact uploaded by unit-pytest-core to
    # GitHub Pages. Runs only on push to main — pull requests and manual
    # dispatch skip this job. NOTE(review): the condition checks only the
    # event name and ref, so a push to `main` in a fork with Actions
    # enabled would also satisfy it — confirm whether that is intended.
    # Uses the official actions/deploy-pages flow rather than pushing to
    # a gh-pages branch, so the Pages site is managed entirely through
    # GitHub's built-in deployment system.
    if: github.event_name == 'push' && github.ref == 'refs/heads/main'
    needs: unit-pytest-core
    runs-on: ubuntu-latest
    timeout-minutes: 5
    # `pages: write` lets actions/deploy-pages finalize the deployment.
    # `id-token: write` gives the action an OIDC token to authenticate
    # with the Pages service — this is the documented requirement, not
    # optional. Because this job declares its own `permissions:` block,
    # every scope not listed here (including `contents`) is set to `none`
    # rather than inheriting the workflow-level `contents: read`. That is
    # sufficient since this job doesn't check out the repo.
    permissions:
      pages: write
      id-token: write
    # The environment URL is taken from the deploy step's `page_url`
    # output.
    environment:
      name: github-pages
      url: ${{ steps.deployment.outputs.page_url }}
    steps:
      - name: Deploy to GitHub Pages
        id: deployment
        uses: actions/deploy-pages@v5

  unit-bats-shell:
    # Runs the BATS shell-test suite under tests/BATS/ using the
    # apt-packaged bats.
    name: "unit:bats:shell"
    timeout-minutes: 15
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v6
      - name: Install bats + deps
        # Refresh the package index before installing.
        run: |
          sudo apt-get update
          sudo apt-get install -y \
            bats jq python3 python3-yaml bash
      - name: Run BATS suite
        run: |
          bats tests/BATS/

  unit-cli-smoke:
    # Smoke test: install the package without extras, confirm the CLI
    # classes import, then render --help for the root command and each
    # listed subcommand.
    name: "unit:cli:smoke"
    timeout-minutes: 10
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v6
      - uses: actions/setup-python@v6
        with:
          cache: "pip"
          cache-dependency-path: "requirements-lock.txt"
          python-version: "3.14"
      - name: Install CLI
        run: pip install -e .
      - name: Exercise CLI surface
        # The step shell exits on the first failing command, so a broken
        # help page for any subcommand fails the step.
        run: |
          python -c "from cli import GCOConfig, JobManager, CapacityChecker; print('CLI imports OK')"
          gco --help
          for group in jobs capacity stacks costs dag inference; do
            gco "$group" --help
          done

  unit-cdk-synth:
    # Synthesizes the app with the CDK CLI and publishes cdk.out/ as a
    # build artifact for inspection.
    name: "unit:cdk:synth"
    runs-on: ubuntu-latest
    timeout-minutes: 15
    steps:
      - uses: actions/checkout@v6
      - uses: actions/setup-node@v6
        with:
          node-version: "24"
      - uses: actions/setup-python@v6
        with:
          python-version: "3.14"
          cache: "pip"
          cache-dependency-path: "requirements-lock.txt"
      - name: Install CDK + dependencies
        run: |
          pip install -e ".[cdk]"
          npm install -g aws-cdk
      - uses: ./.github/actions/build-lambda-package
      - name: cdk synth
        run: cdk synth --quiet
      - name: Upload cdk.out
        # `!cancelled()` still uploads after a failed synth, for
        # debugging. Unlike `always()`, it skips the upload when the run
        # was cancelled, e.g. superseded through the workflow's
        # `cancel-in-progress` concurrency group.
        if: ${{ !cancelled() }}
        uses: actions/upload-artifact@v7
        with:
          name: cdk-synth-output
          path: cdk.out/
          retention-days: 7

  unit-cdk-config-matrix:
    name: "unit:cdk:config-matrix"
    runs-on: ubuntu-latest
    timeout-minutes: 20
    # Placeholder credentials and a fixed region for the in-process synth.
    # NOTE(review): presumably these only need to be present so SDK/CDK
    # lookups don't fail on missing configuration — confirm.
    env:
      AWS_ACCESS_KEY_ID: "fake"
      AWS_SECRET_ACCESS_KEY: "fake"
      AWS_DEFAULT_REGION: "us-east-1"
      CDK_DEFAULT_REGION: "us-east-1"
    steps:
      - uses: actions/checkout@v6
      - uses: actions/setup-node@v6
        with:
          node-version: "24"
      - uses: actions/setup-python@v6
        with:
          python-version: "3.14"
          cache: "pip"
          cache-dependency-path: "requirements-lock.txt"
      - name: Install CDK + test dependencies
        run: pip install -e ".[cdk,test]"
      - uses: ./.github/actions/build-lambda-package
      - name: Run config matrix
        # tests/test_cdk_synthesis_matrix.py parametrizes across every
        # entry in tests/_cdk_config_matrix.CONFIGS and runs ``app.synth()``
        # in-process per config — no ``cdk synth`` subprocess, no cdk.json
        # mutation. The CDK CLI is no longer required, so the
        # ``npm install -g aws-cdk`` step was dropped alongside the
        # subprocess loop. The setup-node step above is still present.
        # NOTE(review): presumably the Python CDK bindings still need a
        # Node.js runtime — confirm before removing it.
        #
        # Runs serially (no ``-n auto``), unlike unit:pytest:core, which
        # runs under xdist. Every parametrized case here stages the same
        # ``lambda/kubectl-applier-simple-build`` tree into
        # ``cdk.out/asset.<hash>/`` and CDK's in-process asset-staging
        # cache races under xdist. The bulk of CDK-config parallelism
        # lives in the sibling unit:cdk:nag-compliance job, which fans
        # out one GitHub runner per config.
        run: pytest tests/test_cdk_synthesis_matrix.py -v

  unit-cdk-nag-compliance-matrix:
    name: "unit:cdk:nag-compliance:list"
    # Publishes the names in ``tests/_cdk_config_matrix.NAG_CONFIGS`` as a
    # JSON array so the fan-out job below can build its matrix from it.
    # Sourcing the list from Python instead of hard-coding names in YAML
    # keeps the workflow in lockstep with the test module: a new entry
    # there shows up in the matrix on the next run.
    timeout-minutes: 2
    runs-on: ubuntu-latest
    outputs:
      configs: ${{ steps.list.outputs.configs }}
    steps:
      - uses: actions/checkout@v6
      - id: list
        # The quoted heredoc keeps the shell from expanding anything
        # inside the Python source. The single `configs=<json>` line it
        # prints is appended to the step-output file.
        run: |
          python3 <<'PY' >> "$GITHUB_OUTPUT"
          import json
          import sys

          sys.path.insert(0, ".")
          from tests._cdk_config_matrix import NAG_CONFIGS

          print(f"configs={json.dumps([name for name, _ in NAG_CONFIGS])}")
          PY

  unit-cdk-nag-compliance:
    name: "unit:cdk:nag-compliance (${{ matrix.config }})"
    # Fan out across ``NAG_CONFIGS`` so each configuration synthesizes
    # and runs cdk-nag in its own runner — the per-config work (build
    # global + API gateway + regional(s) + monitoring + optionally
    # analytics, then walk five rule packs across the resource tree)
    # is heavy enough that GitHub's free 4-vCPU arm64 runner beats
    # cramming every config onto a 2-vCPU amd64 runner under xdist.
    #
    # Each matrix cell invokes pytest with the exact parametrized test
    # node ID (``...::test_no_unsuppressed_findings[<config_name>]``) so
    # it runs exactly the case matching that config. This is the same
    # hard gate as before against shipping an IAM-wildcard or similar
    # cdk-nag error to a user's ``cdk deploy``: if every matrix cell is
    # green, every configuration the user can pick from cdk.json has been
    # validated against AwsSolutions, HIPAA Security, NIST 800-53 R5,
    # PCI DSS 3.2.1, and Serverless rule packs in the same process that
    # ``app.py`` uses for real deploys.
    needs: unit-cdk-nag-compliance-matrix
    runs-on: ubuntu-24.04-arm
    timeout-minutes: 20
    strategy:
      # Let every config report its own result instead of cancelling the
      # remaining cells on the first failure.
      fail-fast: false
      matrix:
        config: ${{ fromJson(needs.unit-cdk-nag-compliance-matrix.outputs.configs) }}
    env:
      AWS_DEFAULT_REGION: us-east-1
      AWS_REGION: us-east-1
    steps:
      - uses: actions/checkout@v6
      - uses: actions/setup-node@v6
        with:
          node-version: "24"
      - uses: actions/setup-python@v6
        with:
          python-version: "3.14"
          cache: "pip"
          cache-dependency-path: "requirements-lock.txt"
      - name: Install dependencies
        run: pip install -e ".[dev,mcp]"
      - uses: ./.github/actions/build-lambda-package
      - name: Run cdk-nag compliance for ${{ matrix.config }}
        # Select the exact parametrized test case by test ID. The
        # parametrize call in tests/test_nag_compliance.py uses
        # ``ids=[c[0] for c in CONFIGS]``, so the config name *is* the
        # test ID — ``::`` rather than ``-k`` avoids substring
        # collisions (e.g. ``analytics-enabled`` matching both
        # ``analytics-enabled`` and ``analytics-enabled-hyperpod-canvas``).
        #
        # The config name reaches the script through an environment
        # variable rather than a ``${{ }}`` expression inside ``run:``.
        # Expressions are substituted into the script text before the
        # shell parses it, so a name containing shell metacharacters
        # would be executed; an env var is always treated as data.
        env:
          NAG_CONFIG: ${{ matrix.config }}
        run: |
          pytest -v \
            "tests/test_nag_compliance.py::TestCdkNagCompliance::test_no_unsuppressed_findings[${NAG_CONFIG}]" \
            --junitxml="report-nag-compliance-${NAG_CONFIG}.xml"
      - name: Upload nag compliance report
        # `!cancelled()` uploads the report on success and failure.
        # Unlike `always()`, it skips the upload when the run was
        # cancelled, e.g. superseded through the workflow's
        # `cancel-in-progress` concurrency group.
        if: ${{ !cancelled() }}
        uses: actions/upload-artifact@v7
        with:
          name: nag-compliance-report-${{ matrix.config }}
          path: report-nag-compliance-${{ matrix.config }}.xml
          retention-days: 7

  unit-lockfile-freshness:
    name: "unit:lockfile:freshness"
    runs-on: ubuntu-latest
    timeout-minutes: 10
    # Skip on PRs that don't touch pyproject.toml or the lockfile. The
    # `changes` job only runs on pull_request; on push/dispatch it is
    # skipped, which would normally skip this job too. The status-check
    # function in the `if:` overrides that implicit skip, and the
    # event-name test short-circuits to true so the job always runs.
    #
    # `!cancelled()` is used instead of `always()`: both override the
    # skipped-dependency behavior, but `always()` also keeps the job
    # running when the run itself is cancelled. That would defeat the
    # workflow's `cancel-in-progress` concurrency group for superseded
    # runs.
    needs: changes
    if: |
      !cancelled() && (
        github.event_name != 'pull_request' ||
        needs.changes.outputs.deps == 'true'
      )
    steps:
      - uses: actions/checkout@v6
      - uses: actions/setup-python@v6
        with:
          python-version: "3.14"
          cache: "pip"
          cache-dependency-path: "requirements-lock.txt"
      - name: Install pip-tools
        run: |
          pip install pip-tools
      - name: Verify lockfile is up to date
        # Pre-seed the candidate lockfile with the committed one so pip-compile
        # treats the existing pins as the current resolution state. Without
        # this, an unrelated transitive release between a local pip-compile
        # and the CI run would falsely flag drift.
        #
        # ``--all-extras`` pins every optional-dependency group (dev, test,
        # cdk, mcp, typecheck, lint, security, inference-monitor). Without
        # it, only the runtime ``dependencies`` list gets resolved and CI's
        # own toolchain (pytest, pytest-xdist, moto, ruff, mypy, aws-cdk-lib,
        # etc.) installs against live PyPI on every run. That means a
        # transitive bump to pluggy or coverage between PR runs could
        # silently change test behavior. Locking everything keeps every job
        # reproducible against a known pinned tree.
        #
        # The comparison strips comment lines and the local
        # ``gco-cli @ file`` entry from both files before diffing.
        env:
          # pip-compile invokes pip, which can hit interactive keyring prompts
          # on CI ("Unhandled exception: EOF when reading a line"). Disabling
          # the keyring backend lets pip resolve without touching it.
          PYTHON_KEYRING_BACKEND: keyring.backends.null.Keyring
        run: |
          set -euo pipefail
          cp requirements-lock.txt /tmp/requirements-lock-fresh.txt
          pip-compile --all-extras --no-emit-index-url --strip-extras -o /tmp/requirements-lock-fresh.txt pyproject.toml -q
          grep -vE '^\s*#' requirements-lock.txt | grep -v '^gco-cli @ file' > /tmp/lock-no-comments.txt
          grep -vE '^\s*#' /tmp/requirements-lock-fresh.txt | grep -v '^gco-cli @ file' > /tmp/fresh-no-comments.txt
          if ! diff /tmp/lock-no-comments.txt /tmp/fresh-no-comments.txt; then
            echo ""
            echo "ERROR: requirements-lock.txt is stale."
            echo "Regenerate using the Docker workflow documented in CONTRIBUTING.md"
            echo "(Dependency Management → Regenerating the Lockfile)."
            echo "Do not run pip-compile directly on the host — the result depends"
            echo "on your OS and will diverge from the Linux-targeted lockfile CI expects."
            echo "Then commit the updated lockfile."
            exit 1
          fi
          echo "Lockfile is up to date"

  unit-fresh-install:
    name: "unit:fresh-install"
    runs-on: ubuntu-latest
    timeout-minutes: 10
    # Same path-filter gate as unit:lockfile:freshness — the fresh install
    # verifies a cold `pip install -e ".[cdk]"` succeeds and imports the
    # expected modules. Only dependency/packaging changes can break it,
    # and it's the most expensive small job in the workflow (no pip cache
    # by design). Always runs on push to main / workflow_dispatch.
    #
    # `!cancelled()` is used instead of `always()`: both let the job run
    # when `changes` was skipped (push / dispatch), but `always()` also
    # keeps the job running when the run itself is cancelled. That would
    # defeat the workflow's `cancel-in-progress` concurrency group for
    # superseded runs.
    needs: changes
    if: |
      !cancelled() && (
        github.event_name != 'pull_request' ||
        needs.changes.outputs.deps == 'true'
      )
    steps:
      - uses: actions/checkout@v6
      - uses: actions/setup-node@v6
        with:
          node-version: "24"
      - uses: actions/setup-python@v6
        with:
          python-version: "3.14"
          # Intentionally no pip cache — this job verifies a cold install.
      - name: Fresh install
        run: |
          pip install -e ".[cdk]"
      - name: Verify imports
        # Each import group runs in its own interpreter so the failing
        # group is obvious from the log.
        run: |
          python3 -c "import aws_cdk; import cdk_nag; import aws_cdk.aws_eks_v2; print('All CDK imports OK')"
          python3 -c "import click; import boto3; import requests; import yaml; print('All runtime imports OK')"
          python3 -c "from cli.main import cli; print('CLI entry point OK')"
          python3 -c "from gco.stacks.regional_stack import GCORegionalStack; print('Regional stack import OK')"

  unit-workload-imports:
    # Import checks for the service modules under gco.services: install
    # the package (core, then the inference-monitor extra) and import
    # each service entry point in a separate interpreter process.
    name: "unit:workload:imports"
    timeout-minutes: 10
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v6
      - uses: actions/setup-python@v6
        with:
          cache: "pip"
          cache-dependency-path: "requirements-lock.txt"
          python-version: "3.14"
      - name: Install core + inference-monitor extras
        run: |
          pip install -e .
          pip install -e ".[inference-monitor]"
      - name: Import health-monitor + manifest-processor
        run: |
          python -c "from gco.services.health_api import app; print('health-monitor OK')"
          python -c "from gco.services.manifest_api import app; print('manifest-processor OK')"
      - name: Import inference-monitor
        run: |
          python -c "from gco.services.inference_monitor import InferenceMonitor; print('inference-monitor OK')"
      - name: Detect circular imports via importlib
        # Loads each service module through importlib.import_module, then
        # imports every api_routes submodule by name. The stated intent is
        # that import_module runs all module-level code, including
        # deferred router imports, so cycles a plain `import` might hide
        # surface here. NOTE(review): confirm what this adds beyond the
        # import steps above.
        run: |
          python -c "import importlib; importlib.import_module('gco.services.manifest_api'); print('manifest-api full import OK')"
          python -c "import importlib; importlib.import_module('gco.services.health_api'); print('health-api full import OK')"
          python -c "import importlib; importlib.import_module('gco.services.inference_monitor'); print('inference-monitor full import OK')"
          python -c "from gco.services.api_routes import jobs, manifests, queue, templates, webhooks; print('all api_routes OK')"