[Cloud Services] Rename credentials_workload_identity_provider to credentials_audience (elastic#265059)

Omolola-Akinleye · claude · elasticmachine · web-flow · commit ccdbd579eaf5 · 2026-04-23T18:15:51.000-07:00
## Summary

This PR bundles two Fleet fixes affecting the OTel permission verifier
flow:

### 1. Fix verifier agent policies leaking on agentless deploy failure

Two bugs caused verifier policies to accumulate indefinitely whenever
the agentless API returned an error (e.g. the 429 "agentless
provisioning limit" response seen in prod logs):

- **Missing inline rollback** — `createVerifierPolicy`
(`agent_policy.ts:2762`) persisted the agent-policy SO via
`this.create(...)` and then called `deployPolicy(..., {
throwOnAgentlessError: true })` without a try/catch. When deploy threw,
the SO was left orphaned with `is_verifier: true`. The method now mirrors
the pattern in `agent_policy_create.ts:285-299`: it wraps the deploy in a
try/catch that calls `deleteVerifierPolicy` before re-throwing.
- **Space-blind cleanup query** — `verify_permissions_task.ts` Phase 1
cleanup (line 244) and Phase 2 gate-check (line 130) called
`agentPolicyService.list` with no `spaceId`. Combined with the
`getInternalUserSOClientWithoutSpaceExtension` SO client, the `find`
resolved to the default namespace only, so orphans created in
non-default spaces were invisible — producing the `Found 0 verifier
policies for cleanup check` log even when orphans existed. Now passes
`spaceId: '*'` on both calls.

### 2. Rename `credentials_workload_identity_provider` →
`credentials_audience`

Renames the GCP cloud connector verifier variable set in
`buildVerifierCredentialVars` (`agent_policy.ts:2809`). The value is
sourced from `gcpVars.audience` (an OIDC audience claim), so the new
name describes what the value actually is rather than the GCP-specific
workload identity provider abstraction.

## Test plan

- [ ] CI runs the new `verify_permissions_task.test.ts` case (`should
query verifier policies across all spaces during cleanup and gate
check`) asserting both `list` sites pass `spaceId: '*'`.
- [ ] CI runs the new `agent_policy.test.ts` case (`should roll back the
verifier policy and re-throw when deployPolicy fails`) asserting
`deleteVerifierPolicy` is invoked on deploy failure.
- [ ] CI runs the existing GCP credential test (`should include GCP
credential vars for gcp provider`), updated to assert
`credentials_audience`.
- [ ] Confirm the downstream GCP cloud connector integration package
consumes `credentials_audience` (coordinate with integrations).

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
Co-authored-by: Elastic Machine <elasticmachine@users.noreply.github.com>
diff --git a/x-pack/platform/plugins/shared/fleet/server/services/agent_policy.test.ts b/x-pack/platform/plugins/shared/fleet/server/services/agent_policy.test.ts
@@ -3052,11 +3052,11 @@ describe('Agent policy', () => {
         type: 'text',
         value: 'sa@project.iam.gserviceaccount.com',
       });
-      const expectedWifValue =
+      const expectedAudienceValue =
         '//iam.googleapis.com/projects/123/locations/global/workloadIdentityPools/pool/providers/prov';
-      expect(vars.credentials_workload_identity_provider).toEqual({
+      expect(vars.credentials_audience).toEqual({
         type: 'text',
-        value: expectedWifValue,
+        value: expectedAudienceValue,
       });
       expect(vars.credentials_role_arn).toBeUndefined();
     });
@@ -3122,6 +3122,26 @@ describe('Agent policy', () => {
       expect(deploySpy).not.toHaveBeenCalled();
     });
 
+    it('should roll back the verifier policy and re-throw when deployPolicy fails', async () => {
+      jest
+        .spyOn(agentPolicyService, 'deployPolicy')
+        .mockRejectedValueOnce(new Error('agentless provisioning limit'));
+      const deleteSpy = jest
+        .spyOn(agentPolicyService, 'deleteVerifierPolicy')
+        .mockResolvedValue(undefined);
+
+      await expect(
+        agentPolicyService.createVerifierPolicy(
+          soClient,
+          esClient,
+          baseConnector as any,
+          baseVerificationInfo
+        )
+      ).rejects.toThrow('agentless provisioning limit');
+
+      expect(deleteSpy).toHaveBeenCalledWith(soClient, esClient, 'mocked');
+    });
+
     it('should propagate secret_references from created package policy', async () => {
       mockedPackagePolicyService.create.mockResolvedValueOnce({
         id: 'pp-id',
diff --git a/x-pack/platform/plugins/shared/fleet/server/services/agent_policy.ts b/x-pack/platform/plugins/shared/fleet/server/services/agent_policy.ts
@@ -2761,9 +2761,17 @@ class AgentPolicyService {
 
     logger.info(`${VERIFY_PERMISSIONS_TASK} Deploying verifier policy ${agentPolicy.id}`);
 
-    await this.deployPolicy(soClient, agentPolicy.id, undefined, {
-      throwOnAgentlessError: true,
-    });
+    try {
+      await this.deployPolicy(soClient, agentPolicy.id, undefined, {
+        throwOnAgentlessError: true,
+      });
+    } catch (err) {
+      logger.error(
+        `${VERIFY_PERMISSIONS_TASK} Failed to deploy verifier policy ${agentPolicy.id}, rolling back: ${err}`
+      );
+      await this.deleteVerifierPolicy(soClient, esClient, agentPolicy.id);
+      throw err;
+    }
 
     return { policyId: agentPolicy.id };
   }
@@ -2806,7 +2814,7 @@ function buildVerifierCredentialVars(
   } else if (provider === 'gcp') {
     const gcpVars = connectorVars as GcpCloudConnectorVars;
     vars.credentials_service_account_email = gcpVars.service_account;
-    vars.credentials_workload_identity_provider = gcpVars.audience;
+    vars.credentials_audience = gcpVars.audience;
   }
 
   return vars;
diff --git a/x-pack/platform/plugins/shared/fleet/server/tasks/agentless/verify_permissions_task.test.ts b/x-pack/platform/plugins/shared/fleet/server/tasks/agentless/verify_permissions_task.test.ts
@@ -497,6 +497,29 @@ describe('verify_permissions_task', () => {
       );
     });
 
+    it('should query verifier policies across all spaces during cleanup and gate check', async () => {
+      mockedAgentPolicyService.list
+        .mockResolvedValueOnce({ items: [] } as any)
+        .mockResolvedValueOnce({ items: [] } as any);
+
+      mockSoClient.find.mockResolvedValue({ saved_objects: [] });
+
+      await taskRunner.run();
+
+      expect(mockedAgentPolicyService.list).toHaveBeenCalledWith(
+        mockSoClient,
+        expect.objectContaining({
+          kuery: expect.stringContaining('is_verifier: true'),
+          spaceId: '*',
+        })
+      );
+      // Both the cleanup (Phase 1) and the gate check (Phase 2) must fan out across spaces,
+      // otherwise verifier policies in non-default spaces are invisible and leak forever.
+      expect(mockedAgentPolicyService.list).toHaveBeenCalledTimes(2);
+      expect(mockedAgentPolicyService.list.mock.calls[0][1]).toMatchObject({ spaceId: '*' });
+      expect(mockedAgentPolicyService.list.mock.calls[1][1]).toMatchObject({ spaceId: '*' });
+    });
+
     it('should not cleanup verifier policies within TTL', async () => {
       const twoMinutesAgo = minutesAgo(2);
 
@@ -550,7 +573,7 @@ describe('verify_permissions_task', () => {
       );
     });
 
-    it('should only verify one connector per task run (serial execution gate)', async () => {
+    it('should verify only one connector per task run (one verifier deploy at a time)', async () => {
       mockedAgentPolicyService.list
         .mockResolvedValueOnce({ items: [] } as any)
         .mockResolvedValueOnce({ items: [] } as any);
@@ -583,6 +606,126 @@ describe('verify_permissions_task', () => {
       );
     });
 
+    it('should request a follow-up run (runAt ~TTL+buffer) when more eligible connectors remain', async () => {
+      mockedAgentPolicyService.list
+        .mockResolvedValueOnce({ items: [] } as any)
+        .mockResolvedValueOnce({ items: [] } as any);
+
+      mockedAgentPolicyService.createVerifierPolicy.mockResolvedValueOnce({
+        policyId: 'verifier-policy-1',
+      });
+
+      mockSoClient.find
+        .mockResolvedValueOnce({
+          saved_objects: [
+            makePackagePolicySO('pp-1', 'conn-1', 'cloudtrail'),
+            makePackagePolicySO('pp-2', 'conn-2', 'guardduty'),
+          ],
+        })
+        .mockResolvedValueOnce({
+          saved_objects: [makeConnectorSO('conn-1'), makeConnectorSO('conn-2')],
+        });
+
+      mockSoClient.update.mockResolvedValue({});
+
+      const before = Date.now();
+      const result = (await taskRunner.run()) as { runAt: Date } | undefined;
+      const after = Date.now();
+
+      expect(result).toBeDefined();
+      expect(result!.runAt).toBeInstanceOf(Date);
+      // Expected runAt is (now + TTL_MS + buffer). With TTL_MS = 5 min and buffer = 30 s
+      // the bound is [before + 5:30, after + 5:30].
+      const TTL_MS = 5 * 60 * 1000;
+      const BUFFER_MS = 30 * 1000;
+      expect(result!.runAt.getTime()).toBeGreaterThanOrEqual(before + TTL_MS + BUFFER_MS - 100);
+      expect(result!.runAt.getTime()).toBeLessThanOrEqual(after + TTL_MS + BUFFER_MS + 100);
+    });
+
+    it('should NOT request a follow-up run when only one eligible connector existed', async () => {
+      mockedAgentPolicyService.list
+        .mockResolvedValueOnce({ items: [] } as any)
+        .mockResolvedValueOnce({ items: [] } as any);
+
+      mockedAgentPolicyService.createVerifierPolicy.mockResolvedValueOnce({
+        policyId: 'verifier-policy-1',
+      });
+
+      mockSoClient.find
+        .mockResolvedValueOnce({
+          saved_objects: [makePackagePolicySO('pp-1', 'conn-1', 'cloudtrail')],
+        })
+        .mockResolvedValueOnce({
+          saved_objects: [makeConnectorSO('conn-1')],
+        });
+
+      mockSoClient.update.mockResolvedValue({});
+
+      const result = await taskRunner.run();
+
+      // Without a return value the task falls back to its 12 h cron.
+      expect(result).toBeUndefined();
+    });
+
+    it('should request a follow-up run when the gate blocks because a verifier is still in flight', async () => {
+      const twoMinutesAgo = minutesAgo(2);
+
+      mockedAgentPolicyService.list
+        .mockResolvedValueOnce({ items: [] } as any)
+        .mockResolvedValueOnce({
+          items: [
+            {
+              id: 'in-flight-verifier',
+              created_at: twoMinutesAgo,
+              updated_at: twoMinutesAgo,
+            },
+          ],
+        } as any);
+
+      const result = (await taskRunner.run()) as { runAt: Date } | undefined;
+
+      expect(result).toBeDefined();
+      expect(result!.runAt).toBeInstanceOf(Date);
+      // Gate-blocked runs also reschedule so we can drain the queue after the
+      // active verifier's TTL elapses (otherwise we'd wait the full 12 h cron).
+      expect(result!.runAt.getTime()).toBeGreaterThan(Date.now());
+      expect(mockedAgentPolicyService.createVerifierPolicy).not.toHaveBeenCalled();
+    });
+
+    it('should NOT request a follow-up run when the feature flag is off', async () => {
+      jest.spyOn(appContextService, 'getExperimentalFeatures').mockReturnValue({
+        enableOTelVerifier: false,
+      } as any);
+
+      const result = await taskRunner.run();
+      expect(result).toBeUndefined();
+    });
+
+    it('should NOT request a follow-up run when an earlier verification fails with no other eligibles', async () => {
+      mockedAgentPolicyService.list
+        .mockResolvedValueOnce({ items: [] } as any)
+        .mockResolvedValueOnce({ items: [] } as any);
+
+      mockedAgentPolicyService.createVerifierPolicy.mockRejectedValueOnce(
+        new Error('agentless provisioning limit')
+      );
+
+      mockSoClient.find
+        .mockResolvedValueOnce({
+          saved_objects: [makePackagePolicySO('pp-1', 'conn-1', 'cloudtrail')],
+        })
+        .mockResolvedValueOnce({
+          saved_objects: [makeConnectorSO('conn-1')],
+        });
+
+      mockSoClient.update.mockResolvedValue({});
+
+      const result = await taskRunner.run();
+
+      // Only one connector, and it failed — no more work this cycle.
+      expect(result).toBeUndefined();
+    });
+
     it('should skip all verifications when a non-expired verifier deployment is in flight', async () => {
       const twoMinutesAgo = minutesAgo(2);
 
diff --git a/x-pack/platform/plugins/shared/fleet/server/tasks/agentless/verify_permissions_task.ts b/x-pack/platform/plugins/shared/fleet/server/tasks/agentless/verify_permissions_task.ts