Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 93 additions & 0 deletions dist/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -263,11 +263,104 @@ async function updateEcsService(ecs, clusterName, service, taskDefArn, waitForSe
services: [service],
cluster: clusterName
});

await verifyServiceDeployment(ecs, clusterName, service, taskDefArn);
} else {
core.debug('Not waiting for the service to become stable');
}
}

/**
 * Confirms that, once the stability waiter has returned, the service is really
 * running the task definition that was just deployed.
 *
 * The extra DescribeServices call is needed because ECS can become stable
 * again by rolling back to the previous deployment when circuit breaker
 * rollback is enabled; "stable" alone does not mean the new revision won.
 *
 * @param {object} ecs - ECS client exposing an async `describeServices` method.
 * @param {string} clusterName - Value sent as `cluster` to DescribeServices.
 * @param {string} serviceName - Service to describe.
 * @param {string} expectedTaskDefArn - Task definition ARN the PRIMARY
 *   deployment is required to use.
 * @returns {Promise<void>} Resolves when the deployment has been verified.
 * @throws {Error} If DescribeServices reports a failure or returns no service,
 *   if any deployment on the expected task definition is FAILED, if there is no
 *   PRIMARY deployment, if PRIMARY uses a different task definition, or if
 *   PRIMARY reports a rolloutState other than COMPLETED.
 */
async function verifyServiceDeployment(ecs, clusterName, serviceName, expectedTaskDefArn) {
  core.debug(
    `Verifying that service '${serviceName}' stabilized on expected task definition '${expectedTaskDefArn}'`
  );

  const describeResponse = await ecs.describeServices({
    cluster: clusterName,
    services: [serviceName]
  });

  // Surface any ECS-level lookup failures explicitly.
  const failures = describeResponse.failures || [];
  if (failures.length > 0) {
    const failure = failures[0];
    throw new Error(
      `Failed to describe service '${serviceName}': ${failure.reason || 'unknown error'}`
    );
  }

  // We expect exactly one service back because we queried by name.
  const service = describeResponse.services && describeResponse.services[0];
  if (!service) {
    throw new Error(`Service '${serviceName}' was not returned by DescribeServices`);
  }

  const deployments = service.deployments || [];

  // More than one deployment can reference the same task definition (for
  // example when the same revision is redeployed), so look at every match
  // instead of only the first one when searching for an explicit failure.
  const failedDeployment = deployments.find(
    deployment =>
      deployment.taskDefinition === expectedTaskDefArn &&
      deployment.rolloutState === 'FAILED'
  );
  if (failedDeployment) {
    const reason = failedDeployment.rolloutStateReason
      ? ` Reason: ${failedDeployment.rolloutStateReason}`
      : '';
    throw new Error(
      `ECS deployment failed for task definition '${expectedTaskDefArn}'.${reason}`
    );
  }

  // PRIMARY should always exist for a healthy service state.
  const primaryDeployment = deployments.find(
    deployment => deployment.status === 'PRIMARY'
  );
  if (!primaryDeployment) {
    throw new Error(`No PRIMARY deployment found for service '${serviceName}'`);
  }

  // Key rollback check: a "stable" service whose PRIMARY deployment points at
  // another task definition means ECS did not promote the one we deployed.
  if (primaryDeployment.taskDefinition !== expectedTaskDefArn) {
    throw new Error(
      `ECS deployment did not complete on the expected task definition. ` +
      `Expected PRIMARY task definition '${expectedTaskDefArn}', but found ` +
      `'${primaryDeployment.taskDefinition}'. This usually means ECS rolled back ` +
      `after the new deployment failed.`
    );
  }

  // PRIMARY is now known to be on the expected task definition. When ECS
  // reports a rolloutState for it, require that it fully completed. Checking
  // PRIMARY (not just the first deployment with a matching ARN) keeps an
  // older deployment on the same revision from masking the real state.
  if (primaryDeployment.rolloutState && primaryDeployment.rolloutState !== 'COMPLETED') {
    throw new Error(
      `ECS deployment for task definition '${expectedTaskDefArn}' did not reach ` +
      `COMPLETED. Current rolloutState: '${primaryDeployment.rolloutState}'.`
    );
  }

  core.info(
    `Deployment verified: service '${serviceName}' is PRIMARY on expected task definition.`
  );
}

// Find value in a CodeDeploy AppSpec file with a case-insensitive key
function findAppSpecValue(obj, keyName) {
return obj[findAppSpecKey(obj, keyName)];
Expand Down
93 changes: 93 additions & 0 deletions index.js
Original file line number Diff line number Diff line change
Expand Up @@ -257,11 +257,104 @@ async function updateEcsService(ecs, clusterName, service, taskDefArn, waitForSe
services: [service],
cluster: clusterName
});

await verifyServiceDeployment(ecs, clusterName, service, taskDefArn);
} else {
core.debug('Not waiting for the service to become stable');
}
}

/**
 * Checks, via one DescribeServices call, that the service's PRIMARY deployment
 * uses the task definition that was just deployed.
 *
 * Logs through `core.debug` on entry and `core.info` on success.
 *
 * NOTE(review): the review thread embedded in this block points out that
 * matching on task definition ARN does not hold for deployments that keep the
 * same task definition (e.g. force-new-deployment); comparing deployment IDs
 * was proposed instead.
 *
 * @param {object} ecs - Client whose `describeServices` method is awaited.
 * @param {string} clusterName - Sent as `cluster` in the request.
 * @param {string} serviceName - Sent as the single entry of `services`.
 * @param {string} expectedTaskDefArn - Task definition ARN expected on PRIMARY.
 * @returns {Promise<void>}
 * @throws {Error} When the response lists failures, contains no service, the
 *   matching deployment is FAILED, no PRIMARY deployment exists, PRIMARY uses
 *   a different task definition, or the matching deployment has a rolloutState
 *   other than COMPLETED.
 */
async function verifyServiceDeployment(ecs, clusterName, serviceName, expectedTaskDefArn) {
core.debug(
`Verifying that service '${serviceName}' stabilized on expected task definition '${expectedTaskDefArn}'`
);

// Describe the service after the waiter reports "stable".
// This extra check is necessary because ECS can become stable again
// by rolling back to the previous deployment if circuit breaker
// rollback is enabled.
const describeResponse = await ecs.describeServices({
cluster: clusterName,
services: [serviceName]
});

// Surface any ECS-level lookup failures explicitly.
const failures = describeResponse.failures || [];
if (failures.length > 0) {
const failure = failures[0];
throw new Error(
`Failed to describe service '${serviceName}': ${failure.reason || 'unknown error'}`
);
}

// We expect exactly one service back because we queried by name.
const service = describeResponse.services && describeResponse.services[0];
if (!service) {
throw new Error(`Service '${serviceName}' was not returned by DescribeServices`);
}

const deployments = service.deployments || [];

// Find the deployment created from the task definition revision
// we just deployed.
const expectedDeployment = deployments.find(
deployment => deployment.taskDefinition === expectedTaskDefArn
);

// Find the deployment ECS considers PRIMARY after stabilization.
// This is the deployment currently serving traffic / considered active.
const primaryDeployment = deployments.find(
deployment => deployment.status === 'PRIMARY'
);

// If ECS explicitly marks the expected deployment as FAILED,
// fail immediately and include the AWS reason when available.
if (expectedDeployment && expectedDeployment.rolloutState === 'FAILED') {
const reason = expectedDeployment.rolloutStateReason
? ` Reason: ${expectedDeployment.rolloutStateReason}`
: '';
throw new Error(
`ECS deployment failed for task definition '${expectedTaskDefArn}'.${reason}`
);
}

// PRIMARY should always exist for a healthy service state.
if (!primaryDeployment) {
throw new Error(`No PRIMARY deployment found for service '${serviceName}'`);
}

// This is the key rollback check:
// even if the service is "stable", ECS may have rolled back to the
// previous task definition. In that case, the PRIMARY deployment
// will not match the task definition we expected to promote.
if (primaryDeployment.taskDefinition !== expectedTaskDefArn) {
Copy link
Copy Markdown
Contributor

@s3cube s3cube Apr 21, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's a few parameters on the action for which this approach doesn't work. These are parameters called in the UpdateService call that trigger a deployment such as service-managed-ebs-volume or force-new-deployment

Basically, the task-definition would not change for these across deployments

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Makes sense! I will change the design. I think the safest way to check is to use the deployment ID.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, there might be a cleaner way, let me confirm

throw new Error(
`ECS deployment did not complete on the expected task definition. ` +
`Expected PRIMARY task definition '${expectedTaskDefArn}', but found ` +
`'${primaryDeployment.taskDefinition}'. This usually means ECS rolled back ` +
`after the new deployment failed.`
);
}

// When rolloutState is available, require the expected deployment
// to have fully completed, not merely exist.
// This is an additional safeguard on top of the PRIMARY check.
if (
expectedDeployment &&
expectedDeployment.rolloutState &&
expectedDeployment.rolloutState !== 'COMPLETED'
) {
throw new Error(
`ECS deployment for task definition '${expectedTaskDefArn}' did not reach ` +
`COMPLETED. Current rolloutState: '${expectedDeployment.rolloutState}'.`
);
}

core.info(
`Deployment verified: service '${serviceName}' is PRIMARY on expected task definition.`
);
}

// Find value in a CodeDeploy AppSpec file with a case-insensitive key
function findAppSpecValue(obj, keyName) {
return obj[findAppSpecKey(obj, keyName)];
Expand Down
157 changes: 150 additions & 7 deletions index.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -93,13 +93,21 @@ describe('Deploy to ECS', () => {

mockEcsUpdateService.mockImplementation(() => Promise.resolve({}));

mockEcsDescribeServices.mockImplementation(
() => Promise.resolve({
failures: [],
services: [{
status: 'ACTIVE'
}]
})
mockEcsDescribeServices.mockImplementation(() =>
Promise.resolve({
failures: [],
services: [{
status: 'ACTIVE',
deploymentController: { type: 'ECS' },
deployments: [
{
status: 'PRIMARY',
taskDefinition: 'task:def:arn',
rolloutState: 'COMPLETED'
}
]
}]
})
);

mockCodeDeployCreateDeployment.mockImplementation(
Expand Down Expand Up @@ -1117,6 +1125,141 @@ describe('Deploy to ECS', () => {
expect(core.setOutput).toHaveBeenNthCalledWith(1, 'task-definition-arn', 'task:def:arn');
});

test('verifies expected task definition becomes PRIMARY after service stability wait', async () => {
  // Inputs are handed out one per getInput call, in this order.
  const getInputStub = jest.fn();
  getInputStub
    .mockReturnValueOnce('task-definition.json') // task-definition
    .mockReturnValueOnce('service-456') // service
    .mockReturnValueOnce('cluster-789') // cluster
    .mockReturnValueOnce('3') // max-retries
    .mockReturnValueOnce('true'); // wait-for-service-stability
  core.getInput = getInputStub;

  // The first DescribeServices call gets a service without deployments;
  // every later call gets the service with a completed PRIMARY deployment.
  const firstResponse = () => ({
    failures: [],
    services: [{ status: 'ACTIVE', deploymentController: { type: 'ECS' } }]
  });
  const laterResponse = () => ({
    failures: [],
    services: [{
      status: 'ACTIVE',
      deployments: [
        {
          status: 'PRIMARY',
          taskDefinition: 'task:def:arn',
          rolloutState: 'COMPLETED'
        }
      ]
    }]
  });

  let callsSoFar = 0;
  mockEcsDescribeServices.mockImplementation(() => {
    callsSoFar += 1;
    return Promise.resolve(callsSoFar === 1 ? firstResponse() : laterResponse());
  });

  await run();

  expect(waitUntilServicesStable).toHaveBeenCalledTimes(1);
  expect(core.setFailed).toHaveBeenCalledTimes(0);
});

test('fails when expected deployment rolloutState is FAILED', async () => {
  // Inputs are handed out one per getInput call, in this order.
  const getInputStub = jest.fn();
  getInputStub
    .mockReturnValueOnce('task-definition.json')
    .mockReturnValueOnce('service-456')
    .mockReturnValueOnce('cluster-789')
    .mockReturnValueOnce('3')
    .mockReturnValueOnce('true');
  core.getInput = getInputStub;

  // The first DescribeServices call gets a service without deployments;
  // later calls show the new revision FAILED next to an older PRIMARY one.
  const firstResponse = () => ({
    failures: [],
    services: [{ status: 'ACTIVE', deploymentController: { type: 'ECS' } }]
  });
  const laterResponse = () => ({
    failures: [],
    services: [{
      status: 'ACTIVE',
      deployments: [
        {
          status: 'ACTIVE',
          taskDefinition: 'task:def:arn',
          rolloutState: 'FAILED',
          rolloutStateReason: 'ECS deployment circuit breaker: task failed health checks'
        },
        {
          status: 'PRIMARY',
          taskDefinition: 'task:def:old',
          rolloutState: 'COMPLETED'
        }
      ]
    }]
  });

  let callsSoFar = 0;
  mockEcsDescribeServices.mockImplementation(() => {
    callsSoFar += 1;
    return Promise.resolve(callsSoFar === 1 ? firstResponse() : laterResponse());
  });

  await run();

  expect(waitUntilServicesStable).toHaveBeenCalledTimes(1);
  expect(core.setFailed).toHaveBeenCalledTimes(1);

  const reportedMessage = core.setFailed.mock.calls[0][0];
  expect(reportedMessage).toContain("ECS deployment failed for task definition 'task:def:arn'");
});

test('fails when service stabilizes on a different PRIMARY task definition after rollback', async () => {
  // Inputs are handed out one per getInput call, in this order.
  const getInputStub = jest.fn();
  getInputStub
    .mockReturnValueOnce('task-definition.json')
    .mockReturnValueOnce('service-456')
    .mockReturnValueOnce('cluster-789')
    .mockReturnValueOnce('3')
    .mockReturnValueOnce('true');
  core.getInput = getInputStub;

  // The first DescribeServices call gets a service without deployments;
  // later calls show PRIMARY on the old revision and the new one only ACTIVE.
  const firstResponse = () => ({
    failures: [],
    services: [{ status: 'ACTIVE', deploymentController: { type: 'ECS' } }]
  });
  const laterResponse = () => ({
    failures: [],
    services: [{
      status: 'ACTIVE',
      deployments: [
        {
          status: 'PRIMARY',
          taskDefinition: 'task:def:old',
          rolloutState: 'COMPLETED'
        },
        {
          status: 'ACTIVE',
          taskDefinition: 'task:def:arn'
        }
      ]
    }]
  });

  let callsSoFar = 0;
  mockEcsDescribeServices.mockImplementation(() => {
    callsSoFar += 1;
    return Promise.resolve(callsSoFar === 1 ? firstResponse() : laterResponse());
  });

  await run();

  expect(waitUntilServicesStable).toHaveBeenCalledTimes(1);
  expect(core.setFailed).toHaveBeenCalledTimes(1);

  const reportedMessage = core.setFailed.mock.calls[0][0];
  expect(reportedMessage).toContain('did not complete on the expected task definition');
});

test('waits for the service to be stable', async () => {
core.getInput = jest
.fn()
Expand Down
Loading