Skip to content

Commit b6daa13

Browse files
committed
bench: fix SSM executionTimeout — set to 4hr, increase poll loop to 4hr
1 parent 2f403a8 commit b6daa13

1 file changed

Lines changed: 12 additions & 5 deletions

File tree

scripts/run_bench.sh

Lines changed: 12 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -171,25 +171,32 @@ script = (
171171
+ bench_cmds
172172
)
173173
encoded = base64.b64encode(script.encode()).decode()
174-
print(json.dumps({"commands": [f"echo '{encoded}' | base64 -d | bash"]}))
174+
# executionTimeout overrides the AWS-RunShellScript default of 3600s.
175+
# --timeout-seconds on send-command controls delivery only; this param
176+
# controls how long the script is allowed to run on the instance.
177+
print(json.dumps({
178+
"commands": [f"echo '{encoded}' | base64 -d | bash"],
179+
"executionTimeout": ["14400"],
180+
}))
175181
PYEOF
176182

177183
CMD_ID=$(aws ssm send-command \
178184
--instance-ids "$INSTANCE_ID" \
179185
--document-name "AWS-RunShellScript" \
180186
--comment "trnblas bench @ $SHA shapes=${SHAPES_ARG}" \
181187
--parameters "file://$PARAMS_FILE" \
182-
--timeout-seconds 7200 \
188+
--timeout-seconds 14400 \
183189
--region "$REGION" \
184190
--output text --query 'Command.CommandId')
185191
rm -f "$PARAMS_FILE"
186192

187193
echo "Command ID: $CMD_ID"
188-
echo "Waiting for bench to complete (cold compile for large shape: 30-90 min)..."
194+
echo "Waiting for bench to complete (cold compile for large shape: 2-4 hr)..."
189195

190-
# Poll every 30s, up to 120 min.
196+
# Poll every 30s, up to 4 hr (480 polls).
197+
# Large-shape cold compile observed to take >2 hr (>120 NEFF compilations).
191198
STATUS=InProgress
192-
for _ in $(seq 1 240); do
199+
for _ in $(seq 1 480); do
193200
STATUS=$(aws ssm get-command-invocation \
194201
--command-id "$CMD_ID" \
195202
--instance-id "$INSTANCE_ID" \

0 commit comments

Comments
 (0)