Skip to content

Commit 98cd41e

Browse files
authored
Merge pull request #394 from co-cddo/chore/ci-lease-lambda-timeout
fix(ci-lease): cover all sandbox lifecycle OUs + retry safety
2 parents 9f1a03a + ae766b8 commit 98cd41e

2 files changed

Lines changed: 71 additions & 9 deletions

File tree

cloudformation/isb-hub-orgmgmt/ci-deploy-role-stackset/template.yaml

Lines changed: 39 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -32,10 +32,39 @@ Parameters:
3232
Default: 'isb-hub-github-actions-ci-lease'
3333
Description: Name of the hub-side OIDC role permitted to assume the CIDeployRole.
3434

35-
SandboxOuId:
35+
# SERVICE_MANAGED StackSets with AutoDeployment don't recurse into
36+
# descendant OUs — they only deploy to accounts DIRECTLY in the targeted
37+
# OUs. So we enumerate each ISB lifecycle OU explicitly. Accounts move
38+
# between these during a lease cycle (Available -> Active -> CleanUp
39+
# -> Available); covering them all ensures the role stays present
40+
# throughout the lease.
41+
EntryOuId:
3642
Type: String
37-
Default: 'ou-2laj-4dyae1oa'
38-
Description: Parent OU containing all ISB pool sub-OUs (Available, Active, Entry, ...). The StackSet auto-deploys to every account under this OU.
43+
Default: 'ou-2laj-2by9v0sr'
44+
AllowedPattern: 'ou-[a-z0-9]{4,}-[a-z0-9]{8,}'
45+
AvailableOuId:
46+
Type: String
47+
Default: 'ou-2laj-oihxgbtr'
48+
AllowedPattern: 'ou-[a-z0-9]{4,}-[a-z0-9]{8,}'
49+
ActiveOuId:
50+
Type: String
51+
Default: 'ou-2laj-sre4rnjs'
52+
AllowedPattern: 'ou-[a-z0-9]{4,}-[a-z0-9]{8,}'
53+
FrozenOuId:
54+
Type: String
55+
Default: 'ou-2laj-jpffue7g'
56+
AllowedPattern: 'ou-[a-z0-9]{4,}-[a-z0-9]{8,}'
57+
CleanUpOuId:
58+
Type: String
59+
Default: 'ou-2laj-x3o8lbk8'
60+
AllowedPattern: 'ou-[a-z0-9]{4,}-[a-z0-9]{8,}'
61+
QuarantineOuId:
62+
Type: String
63+
Default: 'ou-2laj-mmagoake'
64+
AllowedPattern: 'ou-[a-z0-9]{4,}-[a-z0-9]{8,}'
65+
ExitOuId:
66+
Type: String
67+
Default: 'ou-2laj-s1t02mrz'
3968
AllowedPattern: 'ou-[a-z0-9]{4,}-[a-z0-9]{8,}'
4069

4170
Namespace:
@@ -69,7 +98,13 @@ Resources:
6998
StackInstancesGroup:
7099
- DeploymentTargets:
71100
OrganizationalUnitIds:
72-
- !Ref SandboxOuId
101+
- !Ref EntryOuId
102+
- !Ref AvailableOuId
103+
- !Ref ActiveOuId
104+
- !Ref FrozenOuId
105+
- !Ref CleanUpOuId
106+
- !Ref QuarantineOuId
107+
- !Ref ExitOuId
73108
Regions:
74109
# IAM is global; one region is enough for StackSet bookkeeping.
75110
- us-west-2

cloudformation/isb-hub/lambda/lease-proxy/index.py

Lines changed: 32 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -88,8 +88,25 @@ def signed_admin_token(email: str) -> str:
8888
return sign_jwt(payload, fetch_jwt_secret())
8989

9090

91-
def make_isb_api_request(method, path, token, body=None, query_params=None):
92-
"""HTTP request to the ISB API with 4-retry exponential backoff."""
91+
def make_isb_api_request(method, path, token, body=None, query_params=None, idempotent=None):
92+
"""HTTP request to the ISB API.
93+
94+
Retries (4× exponential backoff) only on requests we KNOW are safe to
95+
repeat. GET is always idempotent. POST is only safe if the path is
96+
explicitly an action-style endpoint where double-firing is harmless
97+
(e.g. /leases/{id}/terminate). POST /leases (lease creation) is NOT
98+
idempotent — a retry on a 5xx after the server actually created the
99+
record would create a duplicate lease, which (a) eats into the
100+
per-user quota and (b) leaks a real pool account. Default for POST is
101+
fail-fast on 5xx; callers opt in via idempotent=True if appropriate.
102+
103+
Network-level failures (ConnectionResetError / socket.timeout /
104+
URLError) are retried regardless of method — they happen BEFORE the
105+
server saw the request, so re-firing is safe.
106+
"""
107+
if idempotent is None:
108+
idempotent = method.upper() in ("GET", "HEAD", "DELETE", "PUT")
109+
93110
url = f"{ISB_API_BASE_URL.rstrip('/')}/{path.lstrip('/')}"
94111
if query_params:
95112
qs = "&".join(
@@ -117,7 +134,7 @@ def make_isb_api_request(method, path, token, body=None, query_params=None):
117134
with urllib.request.urlopen(req, timeout=30) as response:
118135
return response.status, json.loads(response.read().decode())
119136
except urllib.error.HTTPError as e:
120-
if e.code in (500, 502, 503, 504) and attempt < 3:
137+
if idempotent and e.code in (500, 502, 503, 504) and attempt < 3:
121138
last_transient = e
122139
time.sleep(2 ** attempt)
123140
continue
@@ -172,7 +189,13 @@ def op_acquire(event):
172189
template = _resolve_lease_template(token, template_name)
173190

174191
create_body = {"leaseTemplateUuid": template["uuid"], "userEmail": user_email}
175-
status, response = make_isb_api_request("POST", "/leases", token, body=create_body)
192+
# idempotent=False is the default for POST, set explicitly here as a
193+
# reminder: lease creation is NOT idempotent. A retry on a 5xx after
194+
# the server actually committed the record would create a duplicate
195+
# lease (eats into quota + leaks a pool account).
196+
status, response = make_isb_api_request(
197+
"POST", "/leases", token, body=create_body, idempotent=False
198+
)
176199
if status != 201:
177200
return {
178201
"ok": False,
@@ -251,7 +274,11 @@ def op_release(event):
251274
user_email = event["user_email"]
252275
token = signed_admin_token(user_email)
253276
encoded_id = urllib.parse.quote(lease_id, safe="+=")
254-
status, body = make_isb_api_request("POST", f"/leases/{encoded_id}/terminate", token)
277+
# Terminate is effectively idempotent — repeated calls on an
278+
# already-terminated lease return 404/409, which we treat as success.
279+
status, body = make_isb_api_request(
280+
"POST", f"/leases/{encoded_id}/terminate", token, idempotent=True
281+
)
255282
if status == 200:
256283
return {"ok": True, "data": {"terminated": True}}
257284
if status in (404, 409):

0 commit comments

Comments
 (0)