Azure-Samples · Cataldir · May 13, 2026 · May 13, 2026 · May 13, 2026 · May 13, 2026
diff --git a/.github/workflows/deploy-azd-prod.yml b/.github/workflows/deploy-azd-prod.yml
@@ -54,6 +54,17 @@ jobs:
   deploy:
     needs: verify-release
     if: ${{ startsWith(github.ref_name, 'v') && !contains(github.ref_name, '-') }}
+    permissions:
+      # Mirrors the dev entrypoint so the reusable callee's per-job
+      # ``permissions:`` cap is satisfied.  ``id-token: write`` for OIDC
+      # Azure login, ``contents: write`` for git tag operations, and
+      # ``issues: write`` for the ``watchdog-apim-agc-swa-drift`` job which
+      # posts drift reports to issue #298.  Without these grants the
+      # reusable workflow ``startup_failure``-s at the orchestrator before
+      # any job runs (see ``scripts/ci/lint_workflow_permissions.py``).
+      id-token: write
+      contents: write
+      issues: write
     uses: ./.github/workflows/deploy-azd.yml
     with:
       environment: prod

diff --git a/.github/workflows/deploy-azd.yml b/.github/workflows/deploy-azd.yml
@@ -2717,130 +2717,33 @@ jobs:
           path: .kubernetes/rendered/${{ matrix.service }}/all.yaml
           retention-days: 1
 
-  # ADR-017 amendment, Pattern A — image-tag PR bridge.
+  # ADR-017, Phase 2b — image-tag bridge (DEFERRED, see issue #1099 follow-up).
   #
   # The previous ``commit-rendered-manifests`` job pushed bot-generated YAML
   # straight at ``refs/heads/main``, which the ``main-governance-baseline``
   # ruleset (GH013) rejects. Flux now reconciles HelmRelease CRDs from
   # ``.kubernetes/releases/{crud,agents}`` and the helm-controller renders the
-  # chart in-cluster on every reconciliation. To close the loop without
-  # pushing to ``main``, the build emits ``tested-image-*`` artifacts holding
-  # the immutable ACR reference for each service. The job below consumes those
-  # artifacts and opens a single image-bump PR per deploy, so Flux picks up
-  # the new images on the next reconcile after the PR is merged.
-  open-image-tag-bump-pr:
-    runs-on: ubuntu-latest
-    # Run only when at least one of the deploy matrices actually produced new
-    # tested images and the run was not cancelled. The bridge intentionally
-    # does NOT depend on ``success()`` of every deploy slot — partial deploys
-    # (e.g., a single matrix entry flaked) should still surface the bumps for
-    # the services that did succeed.
-    if: ${{ always() && !cancelled() && !inputs.uiOnly && (needs.deploy-crud.result == 'success' || needs.deploy-agents.result == 'success') }}
-    needs:
-      - deploy-crud
-      - deploy-agents
-      - detect-changes
-    permissions:
-      contents: write
-      pull-requests: write
-    env:
-      DEPLOY_ENV: ${{ inputs.environment }}
-      # ``env.DEPLOY_SOURCE_SHA`` is not visible inside job-level ``env:`` so
-      # we re-derive the same expression used at the workflow level.
-      DEPLOY_SHA: ${{ inputs.sourceSha != '' && inputs.sourceSha || github.sha }}
-      DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
-    steps:
-      - name: Checkout default branch
-        uses: actions/checkout@v4
-        with:
-          # The bridge edits Flux source-of-truth files on the default branch
-          # so the PR diff reflects only the image bumps from this deploy.
-          ref: ${{ github.event.repository.default_branch }}
-          fetch-depth: 0
-          persist-credentials: true
-
-      - name: Download tested-image artifacts
-        uses: actions/download-artifact@v4
-        with:
-          path: ${{ runner.temp }}/tested-images
-          pattern: tested-image-*
-
-      - name: Apply HelmRelease image bumps
-        id: update
-        shell: bash
-        env:
-          RUNNER_TEMP: ${{ runner.temp }}
-        run: |
-          set -euo pipefail
-          changed=0
-          declare -a updated_services=()
-          for dir in "${RUNNER_TEMP}/tested-images"/tested-image-*; do
-            [ -d "$dir" ] || continue
-            svc=$(basename "$dir" | sed 's/^tested-image-//')
-            ref_file="$dir/image-ref.txt"
-            if [ ! -f "$ref_file" ]; then
-              echo "::warning::no image-ref.txt for ${svc} — skipping"
-              continue
-            fi
-            image_ref=$(tr -d '\r\n' < "$ref_file")
-            # image_ref shape: ``<registry>/<svc>@sha256:<digest>``. The
-            # registry path is the durable identifier; the tag is the SHA we
-            # already pushed alongside the digest in ``build-aks-images``.
-            repo="${image_ref%@*}"
-            tag="${DEPLOY_SHA}"
-            if [ "$svc" = "crud-service" ]; then
-              target=".kubernetes/releases/crud/${svc}.yaml"
-            else
-              target=".kubernetes/releases/agents/${svc}.yaml"
-            fi
-            if [ ! -f "$target" ]; then
-              echo "::warning::no HelmRelease found for ${svc} at ${target}"
-              continue
-            fi
-            python3 scripts/ci/update_helmrelease_image.py "$target" "$repo" "$tag"
-            if ! git diff --quiet -- "$target"; then
-              updated_services+=("$svc")
-              changed=$((changed + 1))
-            fi
-          done
-          echo "changed_count=${changed}" >> "$GITHUB_OUTPUT"
-          if [ "${changed}" -gt 0 ]; then
-            printf 'updated_services=%s\n' "${updated_services[*]}" >> "$GITHUB_OUTPUT"
-          fi
-
-      - name: Open or refresh PR
-        if: ${{ steps.update.outputs.changed_count != '0' }}
-        shell: bash
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          UPDATED_SERVICES: ${{ steps.update.outputs.updated_services }}
-        run: |
-          set -euo pipefail
-          short="${DEPLOY_SHA:0:12}"
-          branch="chore/image-bump-${DEPLOY_ENV}-${short}"
-          git config user.name "github-actions[bot]"
-          git config user.email "41898282+github-actions[bot]@users.noreply.github.com"
-          git switch -c "$branch"
-          git add .kubernetes/releases
-          git commit -m "chore(deploy): bump ${DEPLOY_ENV} HelmRelease image tags to ${short}" \
-                     -m "Triggered by deploy-azd run ${{ github.run_id }}. Services: ${UPDATED_SERVICES}. Merge to roll new images via Flux reconciliation."
-          git push -f origin "$branch"
-          existing=$(gh pr list --state open --head "$branch" --base "${DEFAULT_BRANCH:-main}" --json number --jq '.[0].number // ""' 2>/dev/null || echo '')
-          if [ -z "${existing}" ]; then
-            gh pr create \
-              --base "${DEFAULT_BRANCH:-main}" \
-              --head "$branch" \
-              --title "chore(deploy): bump ${DEPLOY_ENV} HelmRelease image tags to ${short}" \
-              --body "Auto-generated by [deploy-azd run ${{ github.run_id }}](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}).
-
-          Updates HelmRelease \`image.repository\` and \`image.tag\` to the freshly built & tested images at SHA \`${DEPLOY_SHA}\` for environment **${DEPLOY_ENV}**.
-
-          **Services updated (${{ steps.update.outputs.changed_count }}):** ${UPDATED_SERVICES}
-
-          Merging this PR triggers Flux reconciliation, which rolls the new images. No additional action required after merge."
-          else
-            echo "PR #${existing} already open against ${branch}; force-pushed branch HEAD."
-          fi
+  # chart in-cluster on every reconciliation. The image tag pinned in each
+  # HelmRelease YAML remains the source of truth for desired state.
+  #
+  # A previous attempt (PR #1097) embedded an ``open-image-tag-bump-pr`` job
+  # here that requested ``permissions.pull-requests: write``. GitHub rejects
+  # nested-workflow callees that escalate permissions beyond what the caller
+  # grants (``permissions can only be maintained or reduced — not elevated``).
+  # The 27 per-service entrypoints grant only ``id-token | contents | issues``
+  # to the ``uses:`` job, so the entire reusable workflow ``startup_failure``-d
+  # at the GitHub orchestrator before any runner allocation — and the failure
+  # is not detectable by ``actionlint`` or ``yaml.safe_load`` because it is a
+  # cross-file semantic rule.
+  #
+  # The architecturally correct implementation per ADR-017 §"Phase 2b" is
+  # Flux ``ImageRepository`` + ``ImagePolicy`` + ``ImageUpdateAutomation`` CRDs
+  # with the Notification Controller GitHub provider opening the bridge PR.
+  # That work is tracked separately so the deploy chain is restored without
+  # locking the workflow into a GHA-resident bridge that fights the protected
+  # branch model. Until Phase 2b lands properly, new image tags continue to
+  # roll via the existing ``azd deploy`` path, and HelmRelease YAML is updated
+  # through normal PRs reviewed by humans.
 
   wait-flux-reconciliation:
     runs-on: ubuntu-latest

diff --git a/.github/workflows/lint-actions.yml b/.github/workflows/lint-actions.yml
@@ -0,0 +1,53 @@
+name: Lint GitHub Actions Workflows
+
+on:  # runs on PRs and on pushes to main that touch workflows or the linter
+  pull_request:
+    paths:
+      - '.github/workflows/**'
+      - 'scripts/ci/lint_workflow_permissions.py'
+      - '.github/workflows/lint-actions.yml'  # also matched by the glob above
+  push:
+    branches:
+      - main
+    paths:
+      - '.github/workflows/**'
+      - 'scripts/ci/lint_workflow_permissions.py'
+
+permissions:
+  contents: read  # read-only token; both jobs only check out and lint
+
+concurrency:
+  group: lint-actions-${{ github.ref }}  # one concurrency group per ref
+  cancel-in-progress: true  # a newer run on the same ref cancels the older
+
+jobs:
+  actionlint:  # per-file checks via the actionlint container
+    name: actionlint (syntax + per-file semantics)
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Run actionlint
+        uses: docker://rhysd/actionlint:1.7.7  # image tag pinned to 1.7.7
+        with:
+          args: -color  # actionlint flag: always emit colorized output
+
+  permission-cap-lint:  # runs scripts/ci/lint_workflow_permissions.py
+    name: Permission-cap lint (cross-file nested-workflow rule)
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.13"  # quoted so YAML keeps it a string
+
+      - name: Install PyYAML  # version pinned in the pip command below
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install PyYAML==6.0.2
+
+      - name: Lint workflow permissions
+        run: |
+          python scripts/ci/lint_workflow_permissions.py
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -13,6 +13,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Fixed
 
+- Issue #1099: deploy-azd workflow was `startup_failure`-ing on every dispatch (17 startup_failures + 29 cancellations + 0 successes in the last 1,000 runs across all 27 per-service entrypoints since PR #1097 merged). Root cause: PR #1097 added an `open-image-tag-bump-pr` job to the reusable `deploy-azd.yml` that requested `permissions.pull-requests: write`, but the per-service entrypoints only grant `id-token | contents | issues: write` on their `uses:` job. GitHub Actions rejects nested-workflow callees that escalate permissions beyond what the caller grants, and the rejection happens at the orchestrator before any runner is allocated — invisible to `actionlint` and `yaml.safe_load`. Scoped revert removes the job per ADR-017 §"Phase 2b" (proper Flux `ImageUpdateAutomation` + Notification Controller bridge is tracked separately). Also adds an explicit `permissions:` block (`id-token: write`, `contents: write`, `issues: write`) to the `deploy` job in `deploy-azd-prod.yml` (latent bug — prod tag pushes would have hit the same failure). New CI gate `.github/workflows/lint-actions.yml` runs `actionlint` plus a custom `scripts/ci/lint_workflow_permissions.py` that catches caller/callee permission-cap mismatches statically at PR time.
+
 - Issue #801 / PR #802: replaced `FoundryInvoker` with `FoundryAgentInvoker` wrapping the Microsoft Agent Framework `FoundryAgent` runtime. Tools are now properly forwarded to the agent instead of being silently dropped. Upgraded `agent-framework` to `>=1.0.1` GA across all 27 service packages.
 
 - PR #796: parallelized catalog-search I/O paths and eliminated duplicate keyword search execution, reducing p95 latency for product discovery flows.

diff --git a/docs/architecture/adrs/adr-017-deployment-strategy.md b/docs/architecture/adrs/adr-017-deployment-strategy.md
@@ -394,6 +394,65 @@ Why this resolves the protected-branch problem permanently:
   image to the older tag still recorded in the HelmRelease YAML.
 - Branch deployment support via HelmRelease targeting different sourceRef.
 
+##### Attempt 1 (PR #1097, reverted by issue #1099)
+
+A first pass implemented the image-tag bridge as a GHA job named
+`open-image-tag-bump-pr` embedded inside the reusable `deploy-azd.yml`. The
+job consumed `tested-image-*` artifacts produced by `build-aks-images`, wrote
+new tags into the 27 HelmRelease YAML files, and opened a single PR per deploy
+via `gh pr create`. The intent matched Phase 2b's PR-bridge property, but the
+implementation conflated three concerns that should remain separate:
+
+1. **Deploy orchestration** (build → push → reconcile) belongs to `deploy-azd.yml`.
+2. **Image promotion** (tag selection, PR authorship) belongs to Flux's
+   image-reflector / image-automation controllers, which run in-cluster and
+   were designed for this exact problem.
+3. **Protected-branch policy** (no bot pushes to `main`) is satisfied by the
+   Notification Controller writing to a feature branch and opening a PR — not
+   by GHA owning the bridge.
+
+The PR also introduced a silent **regression**: the new job declared
+`permissions: pull-requests: write`, but the 27 per-service entrypoints grant
+only `id-token | contents | issues: write` on their `uses:` job. GitHub
+Actions enforces that nested-workflow permissions can only be maintained or
+reduced — never elevated — and rejects ill-formed callees with
+`startup_failure` at the orchestrator **before any runner is allocated**.
+`actionlint` and `yaml.safe_load` cannot see this defect because it is a
+cross-file semantic rule. Every dispatched deploy across all 27 services
+short-circuited in ~7 seconds with no logs, and the regression sat undetected
+for ~2 days.
+
+##### Decision (post-mortem)
+
+- The `open-image-tag-bump-pr` job is removed from `deploy-azd.yml`.
+- The 27 HelmRelease YAML re-pins and the `scripts/ci/update_helmrelease_image.py`
+  helper introduced alongside it are kept — they remain useful for manual
+  promotion and for the next implementation attempt.
+- The proper Phase 2b implementation uses Flux's own components:
+  - `ImageRepository` per ACR repo (one per service) scanning for new tags.
+  - `ImagePolicy` selecting the newest immutable digest-pinned tag.
+  - `ImageUpdateAutomation` writing changes to a feature branch via the
+    in-cluster `git` credential, with `push.branch` distinct from
+    `checkout.branch` so the protected-branch ruleset never sees a direct push.
+  - `Receiver` + `Provider` (GitHub) in the Notification Controller opening
+    the bridge PR. Auto-merge is enabled on the PR via repo policy.
+- A new CI gate (`scripts/ci/lint_workflow_permissions.py` run by
+  `.github/workflows/lint-actions.yml`) statically validates that every
+  caller's `permissions:` map is a superset of every callee's per-job
+  `permissions:`. This catches the exact class of bug `actionlint` cannot.
+
+##### Lessons learned
+
+- Reusable-workflow permission caps must be validated at PR time, not at
+  dispatch time. The fix: a custom Python linter that diffs caller/callee
+  permission maps and runs in CI on every workflow change.
+- Embedding cross-cutting CD concerns inside a 3,708-line reusable workflow
+  produces blast radius proportional to its size. The next attempt at
+  Phase 2b stays out of `deploy-azd.yml` and lives entirely as Flux CRDs.
+- Silent CI rot is a Tier-1 SLO miss. Pair this ADR with the alerting in
+  `docs/ops/deploy-watchdog.md` so the next regression triggers a page,
+  not days of unnoticed `startup_failure`s.
+
 #### Why Flux over Argo CD
 
 - Native AKS portal integration (`az k8s-extension`)

diff --git a/memories/repo/aks-restart-runbook.md b/memories/repo/aks-restart-runbook.md
@@ -0,0 +1,61 @@
+# AKS dev cluster auto-stop recovery runbook
+
+## Symptom
+
+`kubectl` DNS fails with `lookup holidaypeakhub405-dev-aks-*.hcp.centralus.azmk8s.io: no such host` and several deployments show `READY 0/2, UP-TO-DATE 0`.
+
+## Root cause
+
+The dev AKS cluster `holidaypeakhub405-dev-aks` is auto-stopped overnight for cost. When it restarts, two known issues block agent pods from coming back:
+
+1. **AKS API server DNS goes away** while the cluster is in `Stopped` state. `az aks list` shows `Stopped`, and the API server FQDN doesn't resolve.
+2. **`azure-wi-webhook-webhook-service`** (Azure Workload Identity admission webhook) endpoints are sometimes empty for the first few minutes after restart. Any Deployment whose pods carry the workload-identity label gets `ReplicaFailure: FailedCreate` with `failed calling webhook "mutation.azure-workload-identity.io"` because the service has no endpoints.
+
+The replicaset-controller does NOT auto-retry once the webhook recovers — the deployment stays at `UP-TO-DATE: 0` forever. A `kubectl rollout restart` is required to kick a new RS creation cycle.
+
+## Recovery (verified 2026-05-12)
+
+```powershell
+# 1. Confirm cluster state
+az aks list --query "[].{name:name,state:powerState.code}" -o table
+
+# 2. Start cluster if Stopped (takes ~8 min)
+az aks start --name holidaypeakhub405-dev-aks --resource-group holidaypeakhub405-dev-rg
+# wait until provisioningState == "Succeeded"
+
+# 3. Refresh kubeconfig
+$env:KUBECONFIG="$env:TEMP\holiday-peak-kubeconfig"
+az aks get-credentials --name holidaypeakhub405-dev-aks --resource-group holidaypeakhub405-dev-rg `
+  --overwrite-existing --file $env:KUBECONFIG
+kubelogin convert-kubeconfig -l azurecli --kubeconfig $env:KUBECONFIG
+
+# 4. Find deployments stuck at UP-TO-DATE: 0
+kubectl -n holiday-peak-agents get deploy
+
+# 5. Confirm webhook is back (endpoints non-empty)
+kubectl -n kube-system get endpoints azure-wi-webhook-webhook-service
+
+# 6. Rollout-restart any stuck deployments
+kubectl -n holiday-peak-agents rollout restart deploy/<name>
+```
+
+## Affected services on the 2026-05-12 incident
+
+Needed rollout restart:
+- ecommerce-checkout-support
+- ecommerce-order-status
+- logistics-carrier-selection
+- logistics-returns-support
+- logistics-route-issue-detection
+- truth-enrichment
+- truth-hitl
+- truth-ingestion
+
+After restart all 26 reached ≥1 Ready replica. A few (eta-computation, checkout-support, order-status, carrier-selection) stay at 1/2 because the startup probe is tight and racy against telemetry init — pods cycle through 2-3 restarts before stabilizing.
+
+## Known startup-probe flake (separate follow-up)
+
+One replica of each of {logistics-eta-computation, ecommerce-checkout-support, ecommerce-order-status, logistics-carrier-selection} stays in CrashLoopBackOff while the other serves traffic. Logs show `Uvicorn running on http://0.0.0.0:8000` and `Overriding of current TracerProvider is not allowed` (telemetry init race) but no `Application startup complete`. The startup probe times out before the slow worker finishes binding. Fix candidates:
+- Increase startup probe `failureThreshold` / `initialDelaySeconds` in the Helm chart for these services.
+- Use single-worker uvicorn (`--workers 1`) to avoid the TracerProvider override race.
+- Wrap the unconditional `from azure.monitor.opentelemetry import configure_azure_monitor` at `lib/src/holiday_peak_lib/utils/telemetry.py:34` in try/except + pin `opentelemetry-sdk<1.30` in `lib/src/pyproject.toml`.