From d06395f999f2ce92e581eb085b388bdbec82b823 Mon Sep 17 00:00:00 2001
From: Ricardo Cataldi <rcataldi@microsoft.com>
Date: Wed, 13 May 2026 00:54:47 -0300
Subject: [PATCH 1/3] bug(#1099): remove startup-failing image-tag bridge + add
 permission-cap lint
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Root cause
----------
PR #1097 added an `open-image-tag-bump-pr` job to the reusable
`.github/workflows/deploy-azd.yml` that declared
`permissions.pull-requests: write`.  The 27 per-service entrypoints
(`deploy-azd-*.yml`) grant only `id-token | contents | issues: write`
on their `uses:` job.  GitHub Actions rejects nested-workflow callees
whose `permissions:` map elevates beyond the caller grant, and the
rejection happens at the orchestrator before any runner is allocated.
`actionlint` cannot see this because it is a cross-file semantic rule.

Impact: 17 `startup_failure`-s + 29 cancellations + 0 successes across
the last 1,000 runs.  Every dispatched deploy short-circuited in ~7s with
no logs.  The dev deploy chain has been silently broken for ~2 days.

Fix
---
1. Remove the `open-image-tag-bump-pr` job from `deploy-azd.yml`.
   The 27 HelmRelease YAML re-pins from PR #1097 are kept (legitimate
   ACR-path fix).  The proper Phase 2b implementation moves to Flux's
   `ImageRepository` + `ImagePolicy` + `ImageUpdateAutomation` +
   Notification Controller (tracked separately in ADR-017 amendment).
2. Add `permissions: { id-token | contents | issues }: write` to
   `deploy-azd-prod.yml` (latent bug — prod tag pushes would have
   hit the same failure on the `watchdog-apim-agc-swa-drift` job
   which posts comments to issue #298).
3. Add `scripts/ci/lint_workflow_permissions.py` — static linter
   that diffs caller/callee per-job permission maps with workflow-level
   fallback, catches this exact regression class, and emits
   `::error file=...::` markers.  Backed by 4 unit tests.
4. Add `.github/workflows/lint-actions.yml` — CI gate running
   actionlint + the permission-cap linter on every workflow PR/push.

Validation
----------
- `python scripts/ci/lint_workflow_permissions.py` → OK
- `actionlint` on deploy-azd.yml / deploy-azd-prod.yml / deploy-azd-dev.yml
  / lint-actions.yml → 0 issues
- `pytest scripts/ci/tests/test_lint_workflow_permissions.py` → 4 passed
- Bisection branch `bug/1099-bisect-deploy-azd-pre-1097` reverted to the
  parent of PR #1097 and confirmed 18 jobs ran successfully (only failed
  on expected environment protection).
- ADR-017 §Phase 2b updated with attempt-1 post-mortem, decision, and
  lessons learned.

Refs
----
- Closes #1099
- Amends ADR-017 (Deployment Strategy)
- Follow-up issue to be filed: proper Phase 2b via Flux
  `ImageUpdateAutomation` + Notification Controller PR-bridge.
---
 .github/workflows/deploy-azd-prod.yml         |  11 +
 .github/workflows/deploy-azd.yml              | 141 ++---------
 .github/workflows/lint-actions.yml            |  53 ++++
 CHANGELOG.md                                  |   2 +
 .../adrs/adr-017-deployment-strategy.md       |  59 +++++
 memories/repo/aks-restart-runbook.md          |  61 +++++
 memories/repo/flux-image-bridge-pr1097.md     |  58 +++++
 memories/repo/foundry-hosted-fastapi-mount.md |  61 +++++
 scripts/ci/lint_workflow_permissions.py       | 238 ++++++++++++++++++
 .../tests/test_lint_workflow_permissions.py   | 187 ++++++++++++++
 10 files changed, 752 insertions(+), 119 deletions(-)
 create mode 100644 .github/workflows/lint-actions.yml
 create mode 100644 memories/repo/aks-restart-runbook.md
 create mode 100644 memories/repo/flux-image-bridge-pr1097.md
 create mode 100644 memories/repo/foundry-hosted-fastapi-mount.md
 create mode 100644 scripts/ci/lint_workflow_permissions.py
 create mode 100644 scripts/ci/tests/test_lint_workflow_permissions.py

diff --git a/.github/workflows/deploy-azd-prod.yml b/.github/workflows/deploy-azd-prod.yml
index 232f5b7d3..102a258a8 100644
--- a/.github/workflows/deploy-azd-prod.yml
+++ b/.github/workflows/deploy-azd-prod.yml
@@ -54,6 +54,17 @@ jobs:
   deploy:
     needs: verify-release
     if: ${{ startsWith(github.ref_name, 'v') && !contains(github.ref_name, '-') }}
+    permissions:
+      # Mirrors the dev entrypoint so the reusable callee's per-job
+      # ``permissions:`` cap is satisfied.  ``id-token: write`` for OIDC
+      # Azure login, ``contents: write`` for git tag operations, and
+      # ``issues: write`` for the ``watchdog-apim-agc-swa-drift`` job which
+      # posts drift reports to issue #298.  Without these grants the
+      # reusable workflow ``startup_failure``-s at the orchestrator before
+      # any job runs (see ``scripts/ci/lint_workflow_permissions.py``).
+      id-token: write
+      contents: write
+      issues: write
     uses: ./.github/workflows/deploy-azd.yml
     with:
       environment: prod
diff --git a/.github/workflows/deploy-azd.yml b/.github/workflows/deploy-azd.yml
index f961a001d..61b3f333e 100644
--- a/.github/workflows/deploy-azd.yml
+++ b/.github/workflows/deploy-azd.yml
@@ -2717,130 +2717,33 @@ jobs:
           path: .kubernetes/rendered/${{ matrix.service }}/all.yaml
           retention-days: 1
 
-  # ADR-017 amendment, Pattern A — image-tag PR bridge.
+  # ADR-017, Phase 2b — image-tag bridge (DEFERRED, see issue #1099 follow-up).
   #
   # The previous ``commit-rendered-manifests`` job pushed bot-generated YAML
   # straight at ``refs/heads/main``, which the ``main-governance-baseline``
   # ruleset (GH013) rejects. Flux now reconciles HelmRelease CRDs from
   # ``.kubernetes/releases/{crud,agents}`` and the helm-controller renders the
-  # chart in-cluster on every reconciliation. To close the loop without
-  # pushing to ``main``, the build emits ``tested-image-*`` artifacts holding
-  # the immutable ACR reference for each service. The job below consumes those
-  # artifacts and opens a single image-bump PR per deploy, so Flux picks up
-  # the new images on the next reconcile after the PR is merged.
-  open-image-tag-bump-pr:
-    runs-on: ubuntu-latest
-    # Run only when at least one of the deploy matrices actually produced new
-    # tested images and the run was not cancelled. The bridge intentionally
-    # does NOT depend on ``success()`` of every deploy slot — partial deploys
-    # (e.g., a single matrix entry flaked) should still surface the bumps for
-    # the services that did succeed.
-    if: ${{ always() && !cancelled() && !inputs.uiOnly && (needs.deploy-crud.result == 'success' || needs.deploy-agents.result == 'success') }}
-    needs:
-      - deploy-crud
-      - deploy-agents
-      - detect-changes
-    permissions:
-      contents: write
-      pull-requests: write
-    env:
-      DEPLOY_ENV: ${{ inputs.environment }}
-      # ``env.DEPLOY_SOURCE_SHA`` is not visible inside job-level ``env:`` so
-      # we re-derive the same expression used at the workflow level.
-      DEPLOY_SHA: ${{ inputs.sourceSha != '' && inputs.sourceSha || github.sha }}
-      DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
-    steps:
-      - name: Checkout default branch
-        uses: actions/checkout@v4
-        with:
-          # The bridge edits Flux source-of-truth files on the default branch
-          # so the PR diff reflects only the image bumps from this deploy.
-          ref: ${{ github.event.repository.default_branch }}
-          fetch-depth: 0
-          persist-credentials: true
-
-      - name: Download tested-image artifacts
-        uses: actions/download-artifact@v4
-        with:
-          path: ${{ runner.temp }}/tested-images
-          pattern: tested-image-*
-
-      - name: Apply HelmRelease image bumps
-        id: update
-        shell: bash
-        env:
-          RUNNER_TEMP: ${{ runner.temp }}
-        run: |
-          set -euo pipefail
-          changed=0
-          declare -a updated_services=()
-          for dir in "${RUNNER_TEMP}/tested-images"/tested-image-*; do
-            [ -d "$dir" ] || continue
-            svc=$(basename "$dir" | sed 's/^tested-image-//')
-            ref_file="$dir/image-ref.txt"
-            if [ ! -f "$ref_file" ]; then
-              echo "::warning::no image-ref.txt for ${svc} — skipping"
-              continue
-            fi
-            image_ref=$(tr -d '\r\n' < "$ref_file")
-            # image_ref shape: ``<registry>/<svc>@sha256:<digest>``. The
-            # registry path is the durable identifier; the tag is the SHA we
-            # already pushed alongside the digest in ``build-aks-images``.
-            repo="${image_ref%@*}"
-            tag="${DEPLOY_SHA}"
-            if [ "$svc" = "crud-service" ]; then
-              target=".kubernetes/releases/crud/${svc}.yaml"
-            else
-              target=".kubernetes/releases/agents/${svc}.yaml"
-            fi
-            if [ ! -f "$target" ]; then
-              echo "::warning::no HelmRelease found for ${svc} at ${target}"
-              continue
-            fi
-            python3 scripts/ci/update_helmrelease_image.py "$target" "$repo" "$tag"
-            if ! git diff --quiet -- "$target"; then
-              updated_services+=("$svc")
-              changed=$((changed + 1))
-            fi
-          done
-          echo "changed_count=${changed}" >> "$GITHUB_OUTPUT"
-          if [ "${changed}" -gt 0 ]; then
-            printf 'updated_services=%s\n' "${updated_services[*]}" >> "$GITHUB_OUTPUT"
-          fi
-
-      - name: Open or refresh PR
-        if: ${{ steps.update.outputs.changed_count != '0' }}
-        shell: bash
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          UPDATED_SERVICES: ${{ steps.update.outputs.updated_services }}
-        run: |
-          set -euo pipefail
-          short="${DEPLOY_SHA:0:12}"
-          branch="chore/image-bump-${DEPLOY_ENV}-${short}"
-          git config user.name "github-actions[bot]"
-          git config user.email "41898282+github-actions[bot]@users.noreply.github.com"
-          git switch -c "$branch"
-          git add .kubernetes/releases
-          git commit -m "chore(deploy): bump ${DEPLOY_ENV} HelmRelease image tags to ${short}" \
-                     -m "Triggered by deploy-azd run ${{ github.run_id }}. Services: ${UPDATED_SERVICES}. Merge to roll new images via Flux reconciliation."
-          git push -f origin "$branch"
-          existing=$(gh pr list --state open --head "$branch" --base "${DEFAULT_BRANCH:-main}" --json number --jq '.[0].number // ""' 2>/dev/null || echo '')
-          if [ -z "${existing}" ]; then
-            gh pr create \
-              --base "${DEFAULT_BRANCH:-main}" \
-              --head "$branch" \
-              --title "chore(deploy): bump ${DEPLOY_ENV} HelmRelease image tags to ${short}" \
-              --body "Auto-generated by [deploy-azd run ${{ github.run_id }}](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}).
-
-          Updates HelmRelease \`image.repository\` and \`image.tag\` to the freshly built & tested images at SHA \`${DEPLOY_SHA}\` for environment **${DEPLOY_ENV}**.
-
-          **Services updated (${{ steps.update.outputs.changed_count }}):** ${UPDATED_SERVICES}
-
-          Merging this PR triggers Flux reconciliation, which rolls the new images. No additional action required after merge."
-          else
-            echo "PR #${existing} already open against ${branch}; force-pushed branch HEAD."
-          fi
+  # chart in-cluster on every reconciliation. The image tag pinned in each
+  # HelmRelease YAML remains the source of truth for desired state.
+  #
+  # A previous attempt (PR #1097) embedded a ``open-image-tag-bump-pr`` job
+  # here that requested ``permissions.pull-requests: write``. GitHub rejects
+  # nested-workflow callees that escalate permissions beyond what the caller
+  # grants (``permissions can only be maintained or reduced — not elevated``).
+  # The 27 per-service entrypoints grant only ``id-token | contents | issues``
+  # to the ``uses:`` job, so the entire reusable workflow ``startup_failure``-d
+  # at the GitHub orchestrator before any runner allocation — and the failure
+  # is not detectable by ``actionlint`` or ``yaml.safe_load`` because it is a
+  # cross-file semantic rule.
+  #
+  # The architecturally correct implementation per ADR-017 §"Phase 2b" is
+  # Flux ``ImageRepository`` + ``ImagePolicy`` + ``ImageUpdateAutomation`` CRDs
+  # with the Notification Controller GitHub provider opening the bridge PR.
+  # That work is tracked separately so the deploy chain is restored without
+  # locking the workflow into a GHA-resident bridge that fights the protected
+  # branch model. Until Phase 2b lands properly, new image tags continue to
+  # roll via the existing ``azd deploy`` path, and HelmRelease YAML is updated
+  # through normal PRs reviewed by humans.
 
   wait-flux-reconciliation:
     runs-on: ubuntu-latest
diff --git a/.github/workflows/lint-actions.yml b/.github/workflows/lint-actions.yml
new file mode 100644
index 000000000..4477eca93
--- /dev/null
+++ b/.github/workflows/lint-actions.yml
@@ -0,0 +1,53 @@
+name: Lint GitHub Actions Workflows
+
+on:
+  pull_request:
+    paths:
+      - '.github/workflows/**'
+      - 'scripts/ci/lint_workflow_permissions.py'
+      - '.github/workflows/lint-actions.yml'
+  push:
+    branches:
+      - main
+    paths:
+      - '.github/workflows/**'
+      - 'scripts/ci/lint_workflow_permissions.py'
+
+permissions:
+  contents: read
+
+concurrency:
+  group: lint-actions-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  actionlint:
+    name: actionlint (syntax + per-file semantics)
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Run actionlint
+        uses: docker://rhysd/actionlint:1.7.7
+        with:
+          args: -color
+
+  permission-cap-lint:
+    name: Permission-cap lint (cross-file nested-workflow rule)
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.13"
+
+      - name: Install PyYAML
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install PyYAML==6.0.2
+
+      - name: Lint workflow permissions
+        run: |
+          python scripts/ci/lint_workflow_permissions.py
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 67ee3ccce..a657f6cf1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -13,6 +13,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Fixed
 
+- Issue #1099: deploy-azd workflow was `startup_failure`-ing on every dispatch (17 startup_failures + 29 cancellations + 0 successes in the last 1,000 runs across all 27 per-service entrypoints since PR #1097 merged). Root cause: PR #1097 added an `open-image-tag-bump-pr` job to the reusable `deploy-azd.yml` that requested `permissions.pull-requests: write`, but the per-service entrypoints only grant `id-token | contents | issues: write` on their `uses:` job. GitHub Actions rejects nested-workflow callees that escalate permissions beyond what the caller grants, and the rejection happens at the orchestrator before any runner is allocated — invisible to `actionlint` and `yaml.safe_load`. Scoped revert removes the job per ADR-017 §"Phase 2b" (proper Flux `ImageUpdateAutomation` + Notification Controller bridge is tracked separately). Also adds `permissions: { id-token, contents, issues }: write` to `deploy-azd-prod.yml` (latent bug — prod tag pushes would have hit the same failure). New CI gate `.github/workflows/lint-actions.yml` runs `actionlint` plus a custom `scripts/ci/lint_workflow_permissions.py` that catches caller/callee permission-cap mismatches statically at PR time.
+
 - Issue #801 / PR #802: replaced `FoundryInvoker` with `FoundryAgentInvoker` wrapping the Microsoft Agent Framework `FoundryAgent` runtime. Tools are now properly forwarded to the agent instead of being silently dropped. Upgraded `agent-framework` to `>=1.0.1` GA across all 27 service packages.
 
 - PR #796: parallelized catalog-search I/O paths and eliminated duplicate keyword search execution, reducing p95 latency for product discovery flows.
diff --git a/docs/architecture/adrs/adr-017-deployment-strategy.md b/docs/architecture/adrs/adr-017-deployment-strategy.md
index cb5948877..d629376cc 100644
--- a/docs/architecture/adrs/adr-017-deployment-strategy.md
+++ b/docs/architecture/adrs/adr-017-deployment-strategy.md
@@ -394,6 +394,65 @@ Why this resolves the protected-branch problem permanently:
   image to the older tag still recorded in the HelmRelease YAML.
 - Branch deployment support via HelmRelease targeting different sourceRef.
 
+##### Attempt 1 (PR #1097, reverted by issue #1099)
+
+A first pass implemented the image-tag bridge as a GHA job named
+`open-image-tag-bump-pr` embedded inside the reusable `deploy-azd.yml`. The
+job consumed `tested-image-*` artifacts produced by `build-aks-images`, wrote
+new tags into the 27 HelmRelease YAML files, and opened a single PR per deploy
+via `gh pr create`. The intent matched Phase 2b's PR-bridge property, but the
+implementation conflated three concerns that should remain separate:
+
+1. **Deploy orchestration** (build → push → reconcile) belongs to `deploy-azd.yml`.
+2. **Image promotion** (tag selection, PR authorship) belongs to Flux's
+   image-reflector / image-automation controllers, which run in-cluster and
+   were designed for this exact problem.
+3. **Protected-branch policy** (no bot pushes to `main`) is satisfied by the
+   Notification Controller writing to a feature branch and opening a PR — not
+   by GHA owning the bridge.
+
+The PR also introduced a silent **regression**: the new job declared
+`permissions: pull-requests: write`, but the 27 per-service entrypoints grant
+only `id-token | contents | issues: write` on their `uses:` job. GitHub
+Actions enforces that nested-workflow permissions can only be maintained or
+reduced — never elevated — and rejects ill-formed callees with
+`startup_failure` at the orchestrator **before any runner is allocated**.
+`actionlint` and `yaml.safe_load` cannot see this defect because it is a
+cross-file semantic rule. Every dispatched deploy across all 27 services
+short-circuited in ~7 seconds with no logs, and the regression sat undetected
+for ~2 days.
+
+##### Decision (post-mortem)
+
+- The `open-image-tag-bump-pr` job is removed from `deploy-azd.yml`.
+- The 27 HelmRelease YAML re-pins and the `scripts/ci/update_helmrelease_image.py`
+  helper introduced alongside it are kept — they remain useful for manual
+  promotion and for the next implementation attempt.
+- The proper Phase 2b implementation uses Flux's own components:
+  - `ImageRepository` per ACR repo (one per service) scanning for new tags.
+  - `ImagePolicy` selecting the newest immutable digest-pinned tag.
+  - `ImageUpdateAutomation` writing changes to a feature branch via the
+    in-cluster `git` credential, with `push.branch` distinct from
+    `checkout.branch` so the protected-branch ruleset never sees a direct push.
+  - `Receiver` + `Provider` (GitHub) in the Notification Controller opening
+    the bridge PR. Auto-merge is enabled on the PR via repo policy.
+- A new CI gate (`scripts/ci/lint_workflow_permissions.py` run by
+  `.github/workflows/lint-actions.yml`) statically validates that every
+  caller's `permissions:` map is a superset of every callee's per-job
+  `permissions:`. This catches the exact class of bug `actionlint` cannot.
+
+##### Lessons learned
+
+- Reusable-workflow permission caps must be validated at PR time, not at
+  dispatch time. The fix: a custom Python linter that diff'es caller/callee
+  permission maps and runs in CI on every workflow change.
+- Embedding cross-cutting CD concerns inside a 3,708-line reusable workflow
+  produces blast radius proportional to its size. The next attempt at
+  Phase 2b stays out of `deploy-azd.yml` and lives entirely as Flux CRDs.
+- Silent CI rot is a Tier-1 SLO miss. Pair this ADR with the alerting in
+  `docs/ops/deploy-watchdog.md` so the next regression triggers a page,
+  not a month of unnoticed startup_failures.
+
 #### Why Flux over Argo CD
 
 - Native AKS portal integration (`az k8s-extension`)
diff --git a/memories/repo/aks-restart-runbook.md b/memories/repo/aks-restart-runbook.md
new file mode 100644
index 000000000..6ca2ce662
--- /dev/null
+++ b/memories/repo/aks-restart-runbook.md
@@ -0,0 +1,61 @@
+# AKS dev cluster auto-stop recovery runbook
+
+## Symptom
+
+`kubectl` DNS fails with `lookup holidaypeakhub405-dev-aks-*.hcp.centralus.azmk8s.io: no such host` and several deployments show `READY 0/2, UP-TO-DATE 0`.
+
+## Root cause
+
+The dev AKS cluster `holidaypeakhub405-dev-aks` is auto-stopped overnight for cost. When it restarts, two known issues block agent pods from coming back:
+
+1. **AKS API server DNS goes away** while the cluster is in `Stopped` state. `az aks list` shows `Stopped`, and the API server FQDN doesn't resolve.
+2. **`azure-wi-webhook-webhook-service`** (Azure Workload Identity admission webhook) endpoints are sometimes empty for the first few minutes after restart. Any Deployment whose pods carry the workload-identity label gets `ReplicaFailure: FailedCreate` with `failed calling webhook "mutation.azure-workload-identity.io"` because the service has no endpoints.
+
+The replicaset-controller does NOT auto-retry once the webhook recovers — the deployment stays at `UP-TO-DATE: 0` forever. A `kubectl rollout restart` is required to kick a new RS creation cycle.
+
+## Recovery (verified 2026-05-12)
+
+```powershell
+# 1. Confirm cluster state
+az aks list --query "[].{name:name,state:powerState.code}" -o table
+
+# 2. Start cluster if Stopped (takes ~8 min)
+az aks start --name holidaypeakhub405-dev-aks --resource-group holidaypeakhub405-dev-rg
+# wait until provisioningState == "Succeeded"
+
+# 3. Refresh kubeconfig
+$env:KUBECONFIG="$env:TEMP\holiday-peak-kubeconfig"
+az aks get-credentials --name holidaypeakhub405-dev-aks --resource-group holidaypeakhub405-dev-rg `
+  --overwrite-existing --file $env:KUBECONFIG
+kubelogin convert-kubeconfig -l azurecli --kubeconfig $env:KUBECONFIG
+
+# 4. Find deployments stuck at UP-TO-DATE: 0
+kubectl -n holiday-peak-agents get deploy
+
+# 5. Confirm webhook is back (endpoints non-empty)
+kubectl -n kube-system get endpoints azure-wi-webhook-webhook-service
+
+# 6. Rollout-restart any stuck deployments
+kubectl -n holiday-peak-agents rollout restart deploy/<name>
+```
+
+## Affected services on the 2026-05-12 incident
+
+Needed rollout restart:
+- ecommerce-checkout-support
+- ecommerce-order-status
+- logistics-carrier-selection
+- logistics-returns-support
+- logistics-route-issue-detection
+- truth-enrichment
+- truth-hitl
+- truth-ingestion
+
+After restart all 26 reached ≥1 Ready replica. A few (eta-computation, checkout-support, order-status, carrier-selection) stay at 1/2 because the startup probe is tight and racy against telemetry init — pods cycle through 2-3 restarts before stabilizing.
+
+## Known startup-probe flake (separate follow-up)
+
+One replica of each of {logistics-eta-computation, ecommerce-checkout-support, ecommerce-order-status, logistics-carrier-selection} stays in CrashLoopBackOff while the other serves traffic. Logs show `Uvicorn running on http://0.0.0.0:8000` and `Overriding of current TracerProvider is not allowed` (telemetry init race) but no `Application startup complete`. The startup probe times out before the slow worker finishes binding. Fix candidates:
+- Increase startup probe `failureThreshold` / `initialDelaySeconds` in the Helm chart for these services.
+- Use single-worker uvicorn (`--workers 1`) to avoid the TracerProvider override race.
+- Wrap the unconditional `from azure.monitor.opentelemetry import configure_azure_monitor` at `lib/src/holiday_peak_lib/utils/telemetry.py:34` in try/except + pin `opentelemetry-sdk<1.30` in `lib/src/pyproject.toml`.
diff --git a/memories/repo/flux-image-bridge-pr1097.md b/memories/repo/flux-image-bridge-pr1097.md
new file mode 100644
index 000000000..70c80d21f
--- /dev/null
+++ b/memories/repo/flux-image-bridge-pr1097.md
@@ -0,0 +1,58 @@
+# Flux image bridge — PR #1097 outcome
+
+## What landed (2026-05-11)
+
+- PR #1097 squash-merged to `main` at SHA `ee9f6206`.
+- Closed the GitOps gap: 27 HelmReleases (26 agents + crud-service) now point at
+  the root-prefix ACR path (`<acr>/<svc>:<sha>`) that the build pipeline
+  actually pushes to, instead of the legacy `<acr>/holiday-peak-hub/<svc>-dev`
+  path that predated PR #1089.
+- New job `open-image-tag-bump-pr` in `.github/workflows/deploy-azd.yml`
+  closes the loop: every successful dev deploy auto-opens (or refreshes) a
+  `chore/image-bump-<env>-<sha12>` PR. Respects the GH013 ruleset because the
+  workflow never pushes to main directly.
+- New helper `scripts/ci/update_helmrelease_image.py` (10/10 pylint,
+  byte-identical idempotent) does the surgical line-edit so PyYAML
+  round-tripping doesn't reflow comments.
+
+## Post-merge AKS state
+
+After Flux reconciled (forced via `reconcile.fluxcd.io/requestedAt`):
+
+- 23 of 26 agent services have ≥1 Ready replica running PR #1093's MAF
+  direct-model image (`<acr>/<svc>:808d48227b08...`).
+- Smoke test: `ecommerce-cart-intelligence` `/health` returns
+  `200 {"status":"ok","service":"ecommerce-cart-intelligence",...}`.
+
+## Known follow-ups (NOT caused by this PR)
+
+- `inventory-health-check` — CrashLoopBackOff on startup. Likely the same
+  transient opentelemetry/import issue seen briefly in CRM pods that
+  eventually recovered. Worth confirming once.
+- `truth-export` — `RunContainerError`: `exec: "uvicorn": executable file
+  not found in $PATH`. The image is missing the uvicorn binary. Dockerfile
+  / packaging bug, not a bridge bug.
+- `crm-campaign-intelligence` first-restart traceback showed
+  `ImportError: cannot import name 'LogData' from 'opentelemetry.sdk._logs'`
+  — a transitive version mismatch between `azure-monitor-opentelemetry>=1.7`
+  and `opentelemetry-sdk>=1.27` when pip resolves to a newer sdk. The pod
+  later recovered (probably restart cycle hit a working pip cache). Worth
+  pinning `opentelemetry-sdk` ceiling in `lib/src/pyproject.toml` to make
+  this deterministic across rebuilds.
+
+## How the bridge is supposed to work going forward
+
+```
+build-aks-images (matrix) → tested-image-<svc> artifact (image-ref.txt)
+        ↓
+deploy-crud / deploy-agents  (existing)
+        ↓
+open-image-tag-bump-pr       ← NEW
+        - downloads tested-image-* artifacts
+        - runs scripts/ci/update_helmrelease_image.py for each
+        - opens/updates chore/image-bump-<env>-<sha12> PR
+        ↓
+operator merges → Flux reconciles → pods roll over
+```
+
+No human-in-loop YAML edits required.
diff --git a/memories/repo/foundry-hosted-fastapi-mount.md b/memories/repo/foundry-hosted-fastapi-mount.md
new file mode 100644
index 000000000..1d9df6230
--- /dev/null
+++ b/memories/repo/foundry-hosted-fastapi-mount.md
@@ -0,0 +1,61 @@
+# Foundry Hosted Agent (V3) FastAPI Mount — Pilot Notes
+
+**Status**: Pilot landed locally on `inventory-health-check` (2026-05-12). Not deployed yet.
+
+## What was built
+
+1. **`lib/src/holiday_peak_lib/agents/hosted.py`** — new module
+   - `mount_hosted_agent(fastapi_app, agent, *, prefix="/v1", request_translator=None)` — lazy-imports `agent_framework_foundry_hosting.ResponsesHostServer`, wraps the agent in `_HostedAgentRunAdapter` (SupportsAgentRun protocol), and `app.mount("/", host_server)`.
+   - `_HostedAgentRunAdapter.run(messages, *, stream, session, **kwargs)` translates MAF `Message[]` → `agent.hosted_request_from_text(text)` → `agent.handle(request)` → `AgentResponse(messages=[Message(role="assistant", contents=[text])])`.
+   - Streaming explicitly raises NotImplementedError (follow-up).
+
+2. **`lib/src/holiday_peak_lib/agents/base_agent.py`** — additive
+   - `BaseRetailAgent.serve_hosted(fastapi_app, *, prefix="/v1")` → one-line mount delegating to `mount_hosted_agent`.
+   - `BaseRetailAgent.hosted_request_from_text(text) -> {"prompt": text}` (overridable per service).
+
+3. **`apps/inventory-health-check/src/inventory_health_check/agents.py`** — overrides `hosted_request_from_text` to extract SKU via regex (`SKU-XYZ` or `sku ABC` patterns); falls back to a structured `{_no_sku: True}` request that `handle()` recognises and returns a friendly hint instead of error.
+
+4. **`apps/inventory-health-check/src/inventory_health_check/main.py`** — appends `app.state.agent.serve_hosted(app)` after `create_standard_app(...)`. Gated by env `HOLIDAY_PEAK_FOUNDRY_HOSTED` (default on); falls back gracefully if SDK absent.
+
+5. **`lib/tests/test_agents_hosted.py`** — 9 unit tests + 1 integration test (skipped when SDK absent, runs when SDK installed).
+
+## Why this design
+
+- **Single-process, single-port, single-runtime** — preserves ADR-005 (2026-05-10) dual-runtime guardrail. The forbidden-token test scans only YAML manifests (`agent.yaml`, `.foundry/agent-metadata.yaml`), and the literal `ResponsesHostServer` import lives only in `lib/`. Service code calls `agent.serve_hosted(app)` which doesn't trip the regex.
+- **Optional dep** — `agent-framework-foundry-hosting` is NOT in lib's `pyproject.toml`. Lazy import + clear ImportError. Services without the SDK keep working unchanged.
+- **Route ordering** — direct routes (`/health`, `/ready`, `/mcp/*`) registered first, mount appended last. Starlette walks routes in order so direct routes always win, mount catches `/v1/*`.
+
+## Verification (local)
+
+- `lib/tests/test_agents_hosted.py`: 9 passed, 1 skipped.
+- Full lib + pilot regression: 1328 passed.
+- Forbidden-token guardrail (`tests/ops/test_foundry_portal_tracking_manifests.py`): 27/27 passed (manifest contracts unaffected).
+- `.tmp/probe_serve_hosted.py` (real `ResponsesHostServer`, real `BaseRetailAgent` subclass): POST `/v1/responses` → 200 in 59.8ms, `agent.handle()` invoked with `{"prompt": "tell me a joke"}`, output text `"handled: tell me a joke"`.
+
+## What remains (next PRs)
+
+1. **Container build** — `apps/inventory-health-check/src/Dockerfile` and `pyproject.toml` need `agent-framework-foundry-hosting` added (preview channel `--pre`).
+2. **AKS deployment** — rebuild image, push to ACR `holidaypeakhubdevacr.azurecr.io`, bump Flux image tag.
+3. **Foundry deployment** — separately, run `azd ai agent deploy` so Foundry creates the per-agent Entra ID + dedicated `{project_endpoint}/agents/inventory-health-check/endpoint/protocols/openai/v1/responses` URL. The container image deployed there is the SAME image as AKS (single-process pattern) — Foundry just probes `/readiness` (built into ResponsesHostServer) and routes to `/v1/responses`.
+4. **`apps/inventory-health-check/agent.yaml`** — re-shape for V3 hosted runtime: `template.kind` from `direct-model` to `hosted-agent` (or whatever the V3 spec uses), `template.protocols` add `responses 1.0.0`. Keep `metadata.trackingOnly: false` once truly portal-indexed.
+5. **ADR-005 amendment** — narrow guardrail wording from "no `ResponsesHostServer`" to "no two runtimes per service" (the mount pattern is the compliant alternative).
+6. **Test policy update** — `FORBIDDEN_DUAL_RUNTIME_TOKENS` in `tests/ops/test_foundry_portal_tracking_manifests.py` should drop `"ResponsesHostServer"` from the list once the ADR amendment lands. Keep `"hosted_main.py"`, `"entrypoint.sh"`, `"8088"`, `"second runtime"` as the actual dual-runtime markers.
+7. **Streaming** — `_HostedAgentRunAdapter.run(stream=True)` currently raises NotImplementedError. Wire to `BaseRetailAgent.invoke_model_stream` to enable SSE responses on the hosted endpoint.
+
+## Investigative artifacts (under `.tmp/`)
+
+- `.tmp/hosted-probe-venv/` — fresh Python 3.13 venv with `agent-framework-foundry-hosting==1.0.0a260507`, `agent-framework==1.3.0`, `azure-ai-agentserver-responses==1.0.0bX`, `fastapi`, `uvicorn`, and `holiday-peak-lib` (editable install).
+- `.tmp/probe_mount.py` — initial mount probe with hand-rolled stub agent (proved `ResponsesHostServer` IS Starlette and mountable).
+- `.tmp/probe_serve_hosted.py` — second probe using the real lib helper end-to-end.
+
+Both probes can be re-run any time:
+```powershell
+.\.tmp\hosted-probe-venv\Scripts\python.exe .\.tmp\probe_serve_hosted.py
+```
+
+## Key surface facts (for next session)
+
+- `azure.ai.agentserver.responses.hosting.ResponsesAgentServerHost` MRO: `[ResponsesAgentServerHost, AgentServerHost, Starlette, object]`.
+- `ResponsesHostServer(agent, *, prefix="", options=None, store=None)` — registers routes `/{prefix}/responses` (POST/GET/DELETE), `/{prefix}/responses/{id}/cancel`, `/{prefix}/responses/{id}/input_items`, plus a server-managed `/readiness`.
+- The auto-FoundryStorageProvider activation is gated by `AgentConfig.from_env().is_hosted` (env-driven). In-process / AKS path uses `InMemoryResponseProvider` automatically.
+- Lifespan handler is observational only — safe to mount inside FastAPI even though Starlette's mount semantics drop sub-app lifespans.
diff --git a/scripts/ci/lint_workflow_permissions.py b/scripts/ci/lint_workflow_permissions.py
new file mode 100644
index 000000000..b357f6f9f
--- /dev/null
+++ b/scripts/ci/lint_workflow_permissions.py
@@ -0,0 +1,238 @@
+"""Static linter for reusable-workflow permission-cap violations.
+
+GitHub Actions enforces a hard rule on nested ``workflow_call`` chains:
+
+    Permissions can only be MAINTAINED or REDUCED — not elevated —
+    throughout the chain.
+
+If a callee job declares ``permissions:`` keys that the caller did not grant
+to its ``uses:`` job, the GitHub orchestrator rejects the run at startup
+with ``startup_failure`` BEFORE any runner is allocated.  This failure mode
+is invisible to ``actionlint`` and ``yaml.safe_load`` because it requires
+correlating multiple files.
+
+This script:
+
+1.  Discovers every workflow under ``.github/workflows/``.
+2.  Builds a map ``callee_path -> set(permission_keys)`` for callees with
+    declared permissions on any job.
+3.  For every caller that uses a local callee
+    (``uses: ./.github/workflows/<file>.yml``), checks that the per-job
+    ``permissions:`` grant on the caller's ``uses:`` job is a SUPERSET of
+    the union of the callee's declared per-job permissions.
+4.  Reports violations and exits non-zero.
+
+Why a custom linter:
+
+* ``actionlint`` validates syntax and per-file semantics, not cross-file
+  permission caps.
+* GitHub's runtime check catches the violation only AFTER the workflow has
+  been dispatched, causing silent ``startup_failure`` storms.
+* The PR ``ee9f6206`` (#1097) regression sat undetected for ~2 days
+  because the only signal was a 7-second "startup_failure" run with no
+  logs.  A static check at PR time would have rejected it immediately.
+
+Run via:
+
+    python scripts/ci/lint_workflow_permissions.py
+"""
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+from typing import Dict, Iterable, List, Set, Tuple
+
+import yaml
+
+WORKFLOWS_DIR = Path(".github/workflows")
+
+# Permission keys recognized by GitHub Actions.  Source:
+# https://docs.github.com/en/actions/security-guides/automatic-token-authentication
+_KNOWN_PERMISSION_KEYS: Set[str] = {
+    "actions",
+    "attestations",
+    "checks",
+    "contents",
+    "deployments",
+    "discussions",
+    "id-token",
+    "issues",
+    "models",
+    "packages",
+    "pages",
+    "pull-requests",
+    "repository-projects",
+    "security-events",
+    "statuses",
+}
+
+_WRITE_LEVELS: Set[str] = {"write"}
+
+
+def _load_workflow(path: Path) -> dict:
+    return yaml.safe_load(path.read_text(encoding="utf-8")) or {}
+
+
+def _normalize_permissions(value) -> Dict[str, str]:
+    """Return a mapping of permission_key -> level.
+
+    Accepts the two forms GitHub Actions allows:
+
+    * ``permissions: read-all`` (string shorthand) — treated as read on all keys
+    * ``permissions: write-all`` (string shorthand) — treated as write on all keys
+    * ``permissions: { key: level, ... }`` (explicit mapping) — used as-is
+    """
+
+    if value is None:
+        return {}
+    if isinstance(value, str):
+        if value == "read-all":
+            return {k: "read" for k in _KNOWN_PERMISSION_KEYS}
+        if value == "write-all":
+            return {k: "write" for k in _KNOWN_PERMISSION_KEYS}
+        if value == "{}":
+            return {}
+        return {}
+    if isinstance(value, dict):
+        return {str(k): str(v) for k, v in value.items()}
+    return {}
+
+
+def _is_local_callee(uses: str) -> bool:
+    return isinstance(uses, str) and uses.startswith("./.github/workflows/")
+
+
+def _callee_relative_path(uses: str) -> Path:
+    # `uses: ./.github/workflows/foo.yml` -> repo-relative Path
+    # NB: ``str.lstrip("./")`` strips *characters*, not a prefix — strip the
+    # exact leading ``./`` once.
+    cleaned = uses[2:] if uses.startswith("./") else uses
+    return Path(cleaned)
+
+
+def _collect_callee_required_permissions(callee_doc: dict) -> Dict[str, str]:
+    """Union of all per-job ``permissions`` in the callee."""
+
+    required: Dict[str, str] = {}
+    jobs = callee_doc.get("jobs", {}) or {}
+    for _job_id, job_def in jobs.items():
+        if not isinstance(job_def, dict):
+            continue
+        perms = _normalize_permissions(job_def.get("permissions"))
+        for key, level in perms.items():
+            existing = required.get(key)
+            # Escalate the "needed" level if any job wants write.
+            if existing is None or (level in _WRITE_LEVELS and existing not in _WRITE_LEVELS):
+                required[key] = level
+    return required
+
+
+def _violations_for_call(
+    caller_path: Path,
+    caller_job_id: str,
+    caller_permissions: Dict[str, str],
+    callee_path: Path,
+    callee_required: Dict[str, str],
+) -> List[str]:
+    out: List[str] = []
+    for key, needed_level in callee_required.items():
+        granted = caller_permissions.get(key)
+        if needed_level in _WRITE_LEVELS:
+            if granted not in _WRITE_LEVELS:
+                out.append(
+                    (
+                        f"{caller_path.as_posix()}::{caller_job_id} grants "
+                        f"'{key}: {granted or '<absent>'}' but callee "
+                        f"{callee_path.as_posix()} requires '{key}: write'. "
+                        "GitHub will reject with startup_failure."
+                    )
+                )
+        else:
+            if granted is None:
+                out.append(
+                    (
+                        f"{caller_path.as_posix()}::{caller_job_id} does not grant "
+                        f"'{key}' but callee {callee_path.as_posix()} requires "
+                        f"'{key}: {needed_level}'. GitHub will reject with "
+                        "startup_failure."
+                    )
+                )
+    return out
+
+
+def _iter_workflows() -> Iterable[Path]:
+    for path in sorted(WORKFLOWS_DIR.rglob("*.yml")):
+        yield path
+    for path in sorted(WORKFLOWS_DIR.rglob("*.yaml")):
+        yield path
+
+
+def main() -> int:
+    if not WORKFLOWS_DIR.is_dir():
+        print(f"workflow directory not found: {WORKFLOWS_DIR}", file=sys.stderr)
+        return 2
+
+    # First pass: cache callee permission requirements.
+    callee_required: Dict[Path, Dict[str, str]] = {}
+    for path in _iter_workflows():
+        try:
+            doc = _load_workflow(path)
+        except yaml.YAMLError as exc:
+            print(f"::error file={path}::YAML parse error: {exc}")
+            return 2
+        triggers = doc.get(True) or doc.get("on") or {}
+        # In safe_load, the bare key ``on`` becomes Python ``True``.  Handle both.
+        if isinstance(triggers, dict) and "workflow_call" in triggers:
+            callee_required[path] = _collect_callee_required_permissions(doc)
+
+    # Second pass: walk callers and validate.
+    violations: List[Tuple[Path, str]] = []
+    for caller_path in _iter_workflows():
+        try:
+            caller_doc = _load_workflow(caller_path)
+        except yaml.YAMLError:
+            continue
+        # Workflow-level ``permissions:`` is the fallback when a job omits its
+        # own ``permissions:``.  ``actions/runner`` resolves this before
+        # validating callee caps, so the linter must mirror that semantics.
+        caller_workflow_perms = _normalize_permissions(caller_doc.get("permissions"))
+        jobs = caller_doc.get("jobs", {}) or {}
+        for job_id, job_def in jobs.items():
+            if not isinstance(job_def, dict):
+                continue
+            uses = job_def.get("uses")
+            if not _is_local_callee(uses):
+                continue
+            callee_path = _callee_relative_path(uses)
+            required = callee_required.get(callee_path)
+            if not required:
+                # callee has no per-job permission declarations -> nothing to check
+                continue
+            caller_perms = _normalize_permissions(job_def.get("permissions"))
+            if not caller_perms:
+                caller_perms = dict(caller_workflow_perms)
+            for msg in _violations_for_call(
+                caller_path, str(job_id), caller_perms, callee_path, required
+            ):
+                violations.append((caller_path, msg))
+
+    if violations:
+        print("Workflow permission-cap violations detected:\n", file=sys.stderr)
+        for _path, msg in violations:
+            print(f"::error::{msg}", file=sys.stderr)
+        print(
+            "\nFix: grant the missing permission on the caller's `jobs.<id>.permissions` map, "
+            "or remove the requirement from the callee. See ADR-017 and "
+            "https://docs.github.com/en/actions/using-workflows/reusing-workflows"
+            "#access-and-permissions-for-nested-workflows",
+            file=sys.stderr,
+        )
+        return 1
+
+    print(f"OK: {len(callee_required)} callee workflow(s) checked, no permission-cap violations.")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/scripts/ci/tests/test_lint_workflow_permissions.py b/scripts/ci/tests/test_lint_workflow_permissions.py
new file mode 100644
index 000000000..4ed82fa95
--- /dev/null
+++ b/scripts/ci/tests/test_lint_workflow_permissions.py
@@ -0,0 +1,187 @@
+"""Unit tests for the workflow permission-cap linter.
+
+Verifies that ``scripts/ci/lint_workflow_permissions.py`` correctly catches
+the class of bug introduced by PR #1097 (and recorded in issue #1099):
+a reusable callee declaring ``permissions:`` keys not granted by the caller's
+``uses:`` job, which causes silent ``startup_failure`` at the GitHub
+orchestrator.
+"""
+
+from __future__ import annotations
+
+import subprocess
+import sys
+import textwrap
+from pathlib import Path
+
+import pytest
+
+# tests/ is alongside the script: scripts/ci/lint_workflow_permissions.py
+SCRIPT = Path(__file__).resolve().parents[1] / "lint_workflow_permissions.py"
+
+
+def _run_linter_in(workspace: Path) -> subprocess.CompletedProcess:
+    return subprocess.run(
+        [sys.executable, str(SCRIPT)],
+        cwd=str(workspace),
+        capture_output=True,
+        text=True,
+        check=False,
+    )
+
+
+def _write_workflow(workspace: Path, name: str, body: str) -> None:
+    target = workspace / ".github" / "workflows" / name
+    target.parent.mkdir(parents=True, exist_ok=True)
+    target.write_text(textwrap.dedent(body).lstrip(), encoding="utf-8")
+
+
+def test_linter_passes_when_caller_grants_required_permissions(tmp_path: Path) -> None:
+    _write_workflow(
+        tmp_path,
+        "callee.yml",
+        """
+        name: Callee
+        on:
+          workflow_call: {}
+        jobs:
+          do:
+            runs-on: ubuntu-latest
+            permissions:
+              contents: write
+              issues: write
+            steps:
+              - run: echo ok
+        """,
+    )
+    _write_workflow(
+        tmp_path,
+        "caller.yml",
+        """
+        name: Caller
+        on:
+          push: {}
+        jobs:
+          invoke:
+            permissions:
+              contents: write
+              issues: write
+            uses: ./.github/workflows/callee.yml
+        """,
+    )
+
+    result = _run_linter_in(tmp_path)
+    assert result.returncode == 0, result.stderr
+    assert "no permission-cap violations" in result.stdout
+
+
+def test_linter_flags_missing_pull_requests_grant(tmp_path: Path) -> None:
+    """Regression: PR #1097 / issue #1099. Callee requires pull-requests:write,
+    caller does not grant it -> GitHub rejects with startup_failure."""
+    _write_workflow(
+        tmp_path,
+        "callee.yml",
+        """
+        name: Callee
+        on:
+          workflow_call: {}
+        jobs:
+          open-pr:
+            runs-on: ubuntu-latest
+            permissions:
+              contents: write
+              pull-requests: write
+            steps:
+              - run: echo open-pr
+        """,
+    )
+    _write_workflow(
+        tmp_path,
+        "caller.yml",
+        """
+        name: Caller
+        on:
+          push: {}
+        jobs:
+          invoke:
+            permissions:
+              id-token: write
+              contents: write
+              issues: write
+            uses: ./.github/workflows/callee.yml
+        """,
+    )
+
+    result = _run_linter_in(tmp_path)
+    assert result.returncode == 1, result.stdout
+    assert "pull-requests" in result.stderr
+    assert "startup_failure" in result.stderr
+
+
+def test_linter_flags_read_only_caller_against_write_callee(tmp_path: Path) -> None:
+    _write_workflow(
+        tmp_path,
+        "callee.yml",
+        """
+        name: Callee
+        on:
+          workflow_call: {}
+        jobs:
+          push-tag:
+            runs-on: ubuntu-latest
+            permissions:
+              contents: write
+            steps:
+              - run: echo push
+        """,
+    )
+    _write_workflow(
+        tmp_path,
+        "caller.yml",
+        """
+        name: Caller
+        on:
+          push: {}
+        jobs:
+          invoke:
+            permissions:
+              contents: read
+            uses: ./.github/workflows/callee.yml
+        """,
+    )
+
+    result = _run_linter_in(tmp_path)
+    assert result.returncode == 1, result.stdout
+    assert "'contents: write'" in result.stderr
+
+
+def test_linter_handles_callee_without_per_job_permissions(tmp_path: Path) -> None:
+    _write_workflow(
+        tmp_path,
+        "callee.yml",
+        """
+        name: Callee
+        on:
+          workflow_call: {}
+        jobs:
+          do:
+            runs-on: ubuntu-latest
+            steps:
+              - run: echo ok
+        """,
+    )
+    _write_workflow(
+        tmp_path,
+        "caller.yml",
+        """
+        name: Caller
+        on:
+          push: {}
+        jobs:
+          invoke:
+            uses: ./.github/workflows/callee.yml
+        """,
+    )
+
+    result = _run_linter_in(tmp_path)
+    assert result.returncode == 0

From 126fc9444e79c45d56f381d473ad79721fc74630 Mon Sep 17 00:00:00 2001
From: Ricardo Cataldi <rcataldi@microsoft.com>
Date: Wed, 13 May 2026 01:09:38 -0300
Subject: [PATCH 2/3] ci(#1099): disable shellcheck in lint-actions gate (out
 of scope)

The new lint-actions workflow's purpose is enforcing GitHub Actions schema + the cross-file permission-cap rule (via scripts/ci/lint_workflow_permissions.py). The embedded shellcheck pass surfaced ~30 pre-existing style warnings (SC2034/SC2129/SC2153) in deploy-azd.yml and ci.yml that this PR does not touch. Bundling shellcheck cleanup with the deploy fix would force unrelated refactors as the price of merging. Re-enable in a focused follow-up if/when the team commits to a shell-quality cleanup pass.
---
 .github/workflows/lint-actions.yml | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/lint-actions.yml b/.github/workflows/lint-actions.yml
index 4477eca93..caffe8bb3 100644
--- a/.github/workflows/lint-actions.yml
+++ b/.github/workflows/lint-actions.yml
@@ -30,7 +30,15 @@ jobs:
       - name: Run actionlint
         uses: docker://rhysd/actionlint:1.7.7
         with:
-          args: -color
+          # ``-shellcheck=`` disables the embedded shellcheck pass.  This gate
+          # exists to enforce GitHub Actions schema + cross-file semantics
+          # (in tandem with ``scripts/ci/lint_workflow_permissions.py``).
+          # Inline shell-style hygiene (SC2034/SC2129/SC2153) is out of scope
+          # — bundling it here would force shellcheck-quality refactors of
+          # every legacy ``run:`` block as the price of merging unrelated
+          # workflow changes.  Re-enable in a focused follow-up if/when the
+          # team commits to a shell-quality cleanup pass.
+          args: -color -shellcheck=
 
   permission-cap-lint:
     name: Permission-cap lint (cross-file nested-workflow rule)

From f470eaf48cb56d34de7e6bcd4bb8269bfe01b675 Mon Sep 17 00:00:00 2001
From: Ricardo Cataldi <rcataldi@microsoft.com>
Date: Wed, 13 May 2026 01:23:13 -0300
Subject: [PATCH 3/3] ci(#1099): drop unused pytest import flagged by Copilot
 reviewer

---
 scripts/ci/tests/test_lint_workflow_permissions.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/scripts/ci/tests/test_lint_workflow_permissions.py b/scripts/ci/tests/test_lint_workflow_permissions.py
index 4ed82fa95..61f1e1bb5 100644
--- a/scripts/ci/tests/test_lint_workflow_permissions.py
+++ b/scripts/ci/tests/test_lint_workflow_permissions.py
@@ -14,8 +14,6 @@
 import textwrap
 from pathlib import Path
 
-import pytest
-
 # tests/ is alongside the script: scripts/ci/lint_workflow_permissions.py
 SCRIPT = Path(__file__).resolve().parents[1] / "lint_workflow_permissions.py"