Skip to content

Commit ccdbd57

Browse files
Omolola-Akinleyeclaudeelasticmachine
authored
[Cloud Services] Rename credentials_workload_identity_provider to credentials_audience (elastic#265059)
## Summary This PR bundles two Fleet fixes affecting the OTel permission verifier flow: ### 1. Fix verifier agent policies leaking on agentless deploy failure Two bugs caused verifier policies to accumulate indefinitely whenever the agentless API returned an error (e.g. the 429 "agentless provisioning limit" response seen in prod logs): - **Missing inline rollback** — `createVerifierPolicy` (`agent_policy.ts:2762`) persisted the agent-policy SO via `this.create(...)` and then called `deployPolicy(..., { throwOnAgentlessError: true })` without a try/catch. When deploy threw, the SO was left orphaned with `is_verifier: true`. Now mirrors the pattern in `agent_policy_create.ts:285-299`: wraps the deploy in a try/catch that calls `deleteVerifierPolicy` before re-throwing. - **Space-blind cleanup query** — `verify_permissions_task.ts` Phase 1 cleanup (line 244) and Phase 2 gate-check (line 130) called `agentPolicyService.list` with no `spaceId`. Combined with the `getInternalUserSOClientWithoutSpaceExtension` SO client, the `find` resolved to the default namespace only, so orphans created in non-default spaces were invisible — producing the `Found 0 verifier policies for cleanup check` log even when orphans existed. Now passes `spaceId: '*'` on both calls. ### 2. Rename `credentials_workload_identity_provider` → `credentials_audience` Renames the GCP cloud connector verifier variable in `buildVerifierCredentialVars` (`agent_policy.ts:2809`). The value is sourced from `gcpVars.audience` (an OIDC audience claim), so the new name reflects its source semantic rather than the GCP-specific workload identity provider abstraction. ## Test plan - [ ] CI runs the new `verify_permissions_task.test.ts` case (`should query verifier policies across all spaces during cleanup and gate check`) asserting both `list` sites pass `spaceId: '*'`. - [ ] CI runs the new `agent_policy.test.ts` case (`should roll back the verifier policy and re-throw when deployPolicy fails`) asserting `deleteVerifierPolicy` is invoked on deploy failure. - [ ] CI runs the existing renamed GCP credential test (`should include GCP credential vars for gcp provider`) with `credentials_audience`. - [ ] Confirm the downstream GCP cloud connector integration package consumes `credentials_audience` (coordinate with integrations). --------- Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com> Co-authored-by: Elastic Machine <elasticmachine@users.noreply.github.com>
1 parent 475a14b commit ccdbd57

4 files changed

Lines changed: 241 additions & 22 deletions

File tree

x-pack/platform/plugins/shared/fleet/server/services/agent_policy.test.ts

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3052,11 +3052,11 @@ describe('Agent policy', () => {
30523052
type: 'text',
30533053
value: 'sa@project.iam.gserviceaccount.com',
30543054
});
3055-
const expectedWifValue =
3055+
const expectedAudienceValue =
30563056
'//iam.googleapis.com/projects/123/locations/global/workloadIdentityPools/pool/providers/prov';
3057-
expect(vars.credentials_workload_identity_provider).toEqual({
3057+
expect(vars.credentials_audience).toEqual({
30583058
type: 'text',
3059-
value: expectedWifValue,
3059+
value: expectedAudienceValue,
30603060
});
30613061
expect(vars.credentials_role_arn).toBeUndefined();
30623062
});
@@ -3122,6 +3122,26 @@ describe('Agent policy', () => {
31223122
expect(deploySpy).not.toHaveBeenCalled();
31233123
});
31243124

3125+
it('should roll back the verifier policy and re-throw when deployPolicy fails', async () => {
3126+
jest
3127+
.spyOn(agentPolicyService, 'deployPolicy')
3128+
.mockRejectedValueOnce(new Error('agentless provisioning limit'));
3129+
const deleteSpy = jest
3130+
.spyOn(agentPolicyService, 'deleteVerifierPolicy')
3131+
.mockResolvedValue(undefined);
3132+
3133+
await expect(
3134+
agentPolicyService.createVerifierPolicy(
3135+
soClient,
3136+
esClient,
3137+
baseConnector as any,
3138+
baseVerificationInfo
3139+
)
3140+
).rejects.toThrow('agentless provisioning limit');
3141+
3142+
expect(deleteSpy).toHaveBeenCalledWith(soClient, esClient, 'mocked');
3143+
});
3144+
31253145
it('should propagate secret_references from created package policy', async () => {
31263146
mockedPackagePolicyService.create.mockResolvedValueOnce({
31273147
id: 'pp-id',

x-pack/platform/plugins/shared/fleet/server/services/agent_policy.ts

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2761,9 +2761,17 @@ class AgentPolicyService {
27612761

27622762
logger.info(`${VERIFY_PERMISSIONS_TASK} Deploying verifier policy ${agentPolicy.id}`);
27632763

2764-
await this.deployPolicy(soClient, agentPolicy.id, undefined, {
2765-
throwOnAgentlessError: true,
2766-
});
2764+
try {
2765+
await this.deployPolicy(soClient, agentPolicy.id, undefined, {
2766+
throwOnAgentlessError: true,
2767+
});
2768+
} catch (err) {
2769+
logger.error(
2770+
`${VERIFY_PERMISSIONS_TASK} Failed to deploy verifier policy ${agentPolicy.id}, rolling back: ${err}`
2771+
);
2772+
await this.deleteVerifierPolicy(soClient, esClient, agentPolicy.id);
2773+
throw err;
2774+
}
27672775

27682776
return { policyId: agentPolicy.id };
27692777
}
@@ -2806,7 +2814,7 @@ function buildVerifierCredentialVars(
28062814
} else if (provider === 'gcp') {
28072815
const gcpVars = connectorVars as GcpCloudConnectorVars;
28082816
vars.credentials_service_account_email = gcpVars.service_account;
2809-
vars.credentials_workload_identity_provider = gcpVars.audience;
2817+
vars.credentials_audience = gcpVars.audience;
28102818
}
28112819

28122820
return vars;

x-pack/platform/plugins/shared/fleet/server/tasks/agentless/verify_permissions_task.test.ts

Lines changed: 144 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -497,6 +497,29 @@ describe('verify_permissions_task', () => {
497497
);
498498
});
499499

500+
it('should query verifier policies across all spaces during cleanup and gate check', async () => {
501+
mockedAgentPolicyService.list
502+
.mockResolvedValueOnce({ items: [] } as any)
503+
.mockResolvedValueOnce({ items: [] } as any);
504+
505+
mockSoClient.find.mockResolvedValue({ saved_objects: [] });
506+
507+
await taskRunner.run();
508+
509+
expect(mockedAgentPolicyService.list).toHaveBeenCalledWith(
510+
mockSoClient,
511+
expect.objectContaining({
512+
kuery: expect.stringContaining('is_verifier: true'),
513+
spaceId: '*',
514+
})
515+
);
516+
// Both the cleanup (Phase 1) and the gate check (Phase 2) must fan out across spaces,
517+
// otherwise verifier policies in non-default spaces are invisible and leak forever.
518+
expect(mockedAgentPolicyService.list).toHaveBeenCalledTimes(2);
519+
expect(mockedAgentPolicyService.list.mock.calls[0][1]).toMatchObject({ spaceId: '*' });
520+
expect(mockedAgentPolicyService.list.mock.calls[1][1]).toMatchObject({ spaceId: '*' });
521+
});
522+
500523
it('should not cleanup verifier policies within TTL', async () => {
501524
const twoMinutesAgo = minutesAgo(2);
502525

@@ -550,7 +573,7 @@ describe('verify_permissions_task', () => {
550573
);
551574
});
552575

553-
it('should only verify one connector per task run (serial execution gate)', async () => {
576+
it('should verify only one connector per task run (one verifier deploy at a time)', async () => {
554577
mockedAgentPolicyService.list
555578
.mockResolvedValueOnce({ items: [] } as any)
556579
.mockResolvedValueOnce({ items: [] } as any);
@@ -583,6 +606,126 @@ describe('verify_permissions_task', () => {
583606
);
584607
});
585608

609+
it('should request a follow-up run (runAt ~TTL+buffer) when more eligible connectors remain', async () => {
610+
mockedAgentPolicyService.list
611+
.mockResolvedValueOnce({ items: [] } as any)
612+
.mockResolvedValueOnce({ items: [] } as any);
613+
614+
mockedAgentPolicyService.createVerifierPolicy.mockResolvedValueOnce({
615+
policyId: 'verifier-policy-1',
616+
});
617+
618+
mockSoClient.find
619+
.mockResolvedValueOnce({
620+
saved_objects: [
621+
makePackagePolicySO('pp-1', 'conn-1', 'cloudtrail'),
622+
makePackagePolicySO('pp-2', 'conn-2', 'guardduty'),
623+
],
624+
})
625+
.mockResolvedValueOnce({
626+
saved_objects: [makeConnectorSO('conn-1'), makeConnectorSO('conn-2')],
627+
});
628+
629+
mockSoClient.update.mockResolvedValue({});
630+
631+
const before = Date.now();
632+
const result = (await taskRunner.run()) as { runAt: Date } | undefined;
633+
const after = Date.now();
634+
635+
expect(result).toBeDefined();
636+
expect(result!.runAt).toBeInstanceOf(Date);
637+
// Expected runAt is (now + TTL_MS + buffer). With TTL_MS = 5 min and buffer = 30 s
638+
// the bound is [before + 5:30, after + 5:30].
639+
const TTL_MS = 5 * 60 * 1000;
640+
const BUFFER_MS = 30 * 1000;
641+
expect(result!.runAt.getTime()).toBeGreaterThanOrEqual(before + TTL_MS + BUFFER_MS - 100);
642+
expect(result!.runAt.getTime()).toBeLessThanOrEqual(after + TTL_MS + BUFFER_MS + 100);
643+
});
644+
645+
it('should NOT request a follow-up run when only one eligible connector existed', async () => {
646+
mockedAgentPolicyService.list
647+
.mockResolvedValueOnce({ items: [] } as any)
648+
.mockResolvedValueOnce({ items: [] } as any);
649+
650+
mockedAgentPolicyService.createVerifierPolicy.mockResolvedValueOnce({
651+
policyId: 'verifier-policy-1',
652+
});
653+
654+
mockSoClient.find
655+
.mockResolvedValueOnce({
656+
saved_objects: [makePackagePolicySO('pp-1', 'conn-1', 'cloudtrail')],
657+
})
658+
.mockResolvedValueOnce({
659+
saved_objects: [makeConnectorSO('conn-1')],
660+
});
661+
662+
mockSoClient.update.mockResolvedValue({});
663+
664+
const result = await taskRunner.run();
665+
666+
// Without a return value the task falls back to its 12 h cron.
667+
expect(result).toBeUndefined();
668+
});
669+
670+
it('should request a follow-up run when the gate blocks because a verifier is still in flight', async () => {
671+
const twoMinutesAgo = minutesAgo(2);
672+
673+
mockedAgentPolicyService.list
674+
.mockResolvedValueOnce({ items: [] } as any)
675+
.mockResolvedValueOnce({
676+
items: [
677+
{
678+
id: 'in-flight-verifier',
679+
created_at: twoMinutesAgo,
680+
updated_at: twoMinutesAgo,
681+
},
682+
],
683+
} as any);
684+
685+
const result = (await taskRunner.run()) as { runAt: Date } | undefined;
686+
687+
expect(result).toBeDefined();
688+
expect(result!.runAt).toBeInstanceOf(Date);
689+
// Gate-blocked runs also reschedule so we can drain the queue after the
690+
// active verifier's TTL elapses (otherwise we'd wait the full 12 h cron).
691+
expect(result!.runAt.getTime()).toBeGreaterThan(Date.now());
692+
expect(mockedAgentPolicyService.createVerifierPolicy).not.toHaveBeenCalled();
693+
});
694+
695+
it('should NOT request a follow-up run when the feature flag is off', async () => {
696+
jest.spyOn(appContextService, 'getExperimentalFeatures').mockReturnValue({
697+
enableOTelVerifier: false,
698+
} as any);
699+
700+
const result = await taskRunner.run();
701+
expect(result).toBeUndefined();
702+
});
703+
704+
it('should NOT request a follow-up run when an earlier verification fails with no other eligibles', async () => {
705+
mockedAgentPolicyService.list
706+
.mockResolvedValueOnce({ items: [] } as any)
707+
.mockResolvedValueOnce({ items: [] } as any);
708+
709+
mockedAgentPolicyService.createVerifierPolicy.mockRejectedValueOnce(
710+
new Error('agentless provisioning limit')
711+
);
712+
713+
mockSoClient.find
714+
.mockResolvedValueOnce({
715+
saved_objects: [makePackagePolicySO('pp-1', 'conn-1', 'cloudtrail')],
716+
})
717+
.mockResolvedValueOnce({
718+
saved_objects: [makeConnectorSO('conn-1')],
719+
});
720+
721+
mockSoClient.update.mockResolvedValue({});
722+
723+
const result = await taskRunner.run();
724+
725+
// Only one connector, and it failed — no more work this cycle.
726+
expect(result).toBeUndefined();
727+
});
728+
586729
it('should skip all verifications when a non-expired verifier deployment is in flight', async () => {
587730
const twoMinutesAgo = minutesAgo(2);
588731

0 commit comments

Comments
 (0)