Skip to content

Commit e748ef5

Browse files
authored
Merge pull request #218 from co-cddo/fix/planx-hasura-circuit-breaker
planx: harden Hasura boot against slow Aurora cold starts
2 parents d79ac74 + 7d19c85 commit e748ef5

2 files changed

Lines changed: 26 additions & 10 deletions

File tree

cloudformation/scenarios/planx/cdk/lib/constructs/compute.ts

Lines changed: 9 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -171,9 +171,16 @@ export class ComputeConstruct extends Construct {
171171
vpcSubnets: { subnetType: ec2.SubnetType.PUBLIC },
172172
assignPublicIp: true,
173173
enableExecuteCommand: true,
174-
circuitBreaker: { rollback: true },
174+
// Don't roll the entire stack back when Hasura's first boot is slow.
175+
// Aurora cold starts can be 5-10 minutes on a fresh sandbox; with
176+
// rollback enabled, a couple of restart attempts trip the circuit
177+
// breaker and the StackSet operation FAILS with no useful per-task
178+
// reason (we lose the CloudWatch logs along with the rolled-back
179+
// stack). Letting ECS keep retrying surfaces the actual task error
180+
// and gives the entrypoint hard-wait loop time to win.
181+
circuitBreaker: { enable: true, rollback: false },
175182
serviceName: 'NdxPlanx-Hasura',
176-
healthCheckGracePeriod: cdk.Duration.minutes(15),
183+
healthCheckGracePeriod: cdk.Duration.minutes(30),
177184
});
178185

179186
// =========================================================================

cloudformation/scenarios/planx/docker/hasura/entrypoint-wrapper.sh

Lines changed: 17 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -15,28 +15,37 @@ DB_NAME=$(echo "$DB_URL" | sed -n 's|.*/\([^?]*\).*|\1|p')
1515
echo "DB Host: $DB_HOST"
1616
echo "DB Name: $DB_NAME"
1717

18-
# 1. Wait for DNS resolution (Aurora DNS can take a minute to propagate)
18+
# 1. Wait for DNS resolution (Aurora DNS can take a minute to propagate).
19+
# Hard-fail if it never resolves: continuing past this point with a missing
20+
# host means Hasura starts up, fails its DB connection, exits, ECS restarts,
21+
# and the deployment circuit breaker eventually trips and rolls back the
22+
# whole stack with no useful error. Better to have the container exit fast
23+
# and ECS restart it (re-resolving DNS each time) than to bleed boot time.
1924
echo "[1/3] Waiting for DNS resolution of $DB_HOST..."
20-
for i in $(seq 1 60); do
25+
for i in $(seq 1 120); do
2126
if nslookup "$DB_HOST" > /dev/null 2>&1; then
2227
echo "DNS resolved."
2328
break
2429
fi
25-
if [ $i -eq 60 ]; then
26-
echo "WARNING: DNS resolution timed out after 5 minutes. Continuing anyway..."
30+
if [ $i -eq 120 ]; then
31+
echo "ERROR: DNS resolution failed after 10 minutes; exiting so ECS can retry."
32+
exit 1
2733
fi
2834
sleep 5
2935
done
3036

31-
# 2. Wait for PostgreSQL to accept connections
37+
# 2. Wait for PostgreSQL to accept connections. Same hard-fail rationale as
38+
# above — proceeding when PG is unreachable just guarantees a Hasura crash
39+
# loop and a circuit-breaker rollback.
3240
echo "[2/3] Waiting for PostgreSQL to be ready..."
33-
for i in $(seq 1 60); do
41+
for i in $(seq 1 120); do
3442
if PGPASSWORD="$DB_PASSWORD" pg_isready -h "$DB_HOST" -p "$DB_PORT" -U "$DB_USER" > /dev/null 2>&1; then
3543
echo "PostgreSQL ready."
3644
break
3745
fi
38-
if [ $i -eq 60 ]; then
39-
echo "WARNING: PostgreSQL not ready after 5 minutes. Continuing anyway..."
46+
if [ $i -eq 120 ]; then
47+
echo "ERROR: PostgreSQL not ready after 10 minutes; exiting so ECS can retry."
48+
exit 1
4049
fi
4150
sleep 5
4251
done

0 commit comments

Comments
 (0)