Skip to content

Commit 78de1c9

Browse files
authored
Merge pull request #119 from awslabs/request_timeouts
fix: Refactor AWS timeouts and capacity-aware status handling
2 parents 18cb672 + 188a0b8 commit 78de1c9

File tree

17 files changed

+571
-127
lines changed

17 files changed

+571
-127
lines changed

config/README.md

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,15 +14,17 @@ This section configures AWS SDK behavior:
1414
"aws": {
1515
"region": "us-east-1",
1616
"profile": "default",
17-
"max_retries": 3,
18-
"timeout": 30
17+
"aws_max_retries": 3,
18+
"aws_connect_timeout": 10,
19+
"aws_read_timeout": 30
1920
}
2021
```
2122

2223
- `region`: AWS region to use for API calls
2324
- `profile`: AWS profile to use for credentials
24-
- `max_retries`: Number of retries for AWS API calls
25-
- `timeout`: Timeout in seconds for AWS API calls
25+
- `aws_max_retries`: Number of retries for AWS API calls
26+
- `aws_connect_timeout`: Connection timeout in seconds for AWS API calls
27+
- `aws_read_timeout`: Timeout in seconds for AWS API calls
2628

2729
### Logging Configuration (`logging`)
2830

dev-tools/admin/README.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# Admin Tools
2+
3+
Utilities intended for administrative maintenance tasks.
4+
5+
- `dev-tools/admin/terminate_req_instances.py`: terminates EC2 instances whose `Name` tag starts with a prefix (default `req-`) in a chosen region, with optional dry-run.
Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
#!/usr/bin/env python3
2+
3+
import argparse
4+
import sys
5+
from typing import Iterable, List
6+
7+
import boto3
8+
from botocore.exceptions import ClientError
9+
10+
ACTIVE_STATES = [
11+
"pending",
12+
"running",
13+
"stopping",
14+
"stopped",
15+
"shutting-down",
16+
]
17+
18+
19+
def _chunked(items: List[str], size: int) -> Iterable[List[str]]:
20+
for start in range(0, len(items), size):
21+
yield items[start : start + size]
22+
23+
24+
def _collect_instance_ids(ec2_client, name_prefix: str) -> List[str]:
25+
paginator = ec2_client.get_paginator("describe_instances")
26+
filters = [
27+
{"Name": "tag:Name", "Values": [f"{name_prefix}*"]},
28+
{"Name": "instance-state-name", "Values": ACTIVE_STATES},
29+
]
30+
31+
instance_ids: List[str] = []
32+
for page in paginator.paginate(Filters=filters):
33+
for reservation in page.get("Reservations", []):
34+
for instance in reservation.get("Instances", []):
35+
instance_id = instance.get("InstanceId")
36+
if instance_id:
37+
instance_ids.append(instance_id)
38+
39+
return instance_ids
40+
41+
42+
def _terminate_instances(ec2_client, instance_ids: List[str], dry_run: bool) -> None:
43+
if not instance_ids:
44+
print("No instances found to terminate.")
45+
return
46+
47+
print(f"Found {len(instance_ids)} instance(s) to terminate.")
48+
for chunk in _chunked(instance_ids, 1000):
49+
try:
50+
ec2_client.terminate_instances(InstanceIds=chunk, DryRun=dry_run)
51+
if dry_run:
52+
print(f"Dry run succeeded for {len(chunk)} instance(s): {chunk}")
53+
else:
54+
print(f"Termination started for {len(chunk)} instance(s): {chunk}")
55+
except ClientError as exc:
56+
if dry_run and exc.response.get("Error", {}).get("Code") == "DryRunOperation":
57+
print(f"Dry run succeeded for {len(chunk)} instance(s): {chunk}")
58+
continue
59+
raise
60+
61+
62+
def main() -> int:
63+
parser = argparse.ArgumentParser(
64+
description="Terminate EC2 instances in a region whose Name tag starts with a prefix."
65+
)
66+
parser.add_argument("--region", default="eu-west-1", help="AWS region (default: eu-west-1)")
67+
parser.add_argument(
68+
"--prefix",
69+
default="req-",
70+
help='Name tag prefix to match (default: "req-")',
71+
)
72+
parser.add_argument(
73+
"--dry-run",
74+
action="store_true",
75+
help="Validate permissions without terminating instances.",
76+
)
77+
args = parser.parse_args()
78+
79+
session = boto3.session.Session()
80+
ec2_client = session.client("ec2", region_name=args.region)
81+
82+
try:
83+
instance_ids = _collect_instance_ids(ec2_client, args.prefix)
84+
_terminate_instances(ec2_client, instance_ids, args.dry_run)
85+
except ClientError as exc:
86+
print(f"AWS error: {exc}", file=sys.stderr)
87+
return 1
88+
89+
return 0
90+
91+
92+
if __name__ == "__main__":
93+
raise SystemExit(main())

src/api/documentation/openapi_config.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -141,9 +141,9 @@ def _get_api_description() -> str:
141141
def _get_security_scheme_name(strategy: str) -> str:
142142
"""Get security scheme name for the given auth strategy."""
143143
scheme_mapping = {
144-
"bearer_token": "BearerAuth",
145-
"iam": "AWSAuth",
146-
"cognito": "CognitoAuth",
144+
"bearer_token": "BearerAuth", # nosec B105
145+
"iam": "AWSAuth", # nosec B105
146+
"cognito": "CognitoAuth", # nosec B105
147147
}
148148
return scheme_mapping.get(strategy)
149149

src/application/queries/handlers.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -931,6 +931,31 @@ def _determine_request_status_from_machines(
931931
request_type = request.request_type
932932
provider_metadata = provider_metadata or {}
933933

934+
# If provisioning already recorded a failure, keep the request terminal.
935+
metadata = getattr(request, "metadata", {}) or {}
936+
error_details = getattr(request, "error_details", {}) or {}
937+
938+
provisioning_error_type = metadata.get("error_type")
939+
provisioning_error_message = metadata.get("error_message")
940+
fleet_errors = metadata.get("fleet_errors")
941+
ec2_fleet_errors = (error_details.get("ec2_fleet") or {}).get("errors")
942+
943+
if provisioning_error_type or fleet_errors or ec2_fleet_errors:
944+
status_message = provisioning_error_message or "Provisioning failed"
945+
if not status_message.lower().startswith("provisioning failed"):
946+
status_message = f"Provisioning failed: {status_message}"
947+
if current_status != RequestStatus.FAILED:
948+
return (RequestStatus.FAILED.value, status_message)
949+
return (None, None)
950+
951+
if current_status in [
952+
RequestStatus.COMPLETED,
953+
RequestStatus.FAILED,
954+
RequestStatus.CANCELLED,
955+
RequestStatus.TIMEOUT,
956+
]:
957+
return (None, None)
958+
934959
# Count machines by status from provider data (most up-to-date)
935960
provider_machine_count = len(machine_objects_from_provider)
936961
database_machine_count = len(machine_objects_from_database)

src/config/README.md

Lines changed: 18 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -53,8 +53,9 @@ Comprehensive validation system for all configuration sections.
5353
"aws": {
5454
"region": "us-east-1",
5555
"profile": "default",
56-
"max_retries": 3,
57-
"timeout": 30,
56+
"aws_max_retries": 3,
57+
"aws_connect_timeout": 10,
58+
"aws_read_timeout": 30,
5859
"access_key_id": null,
5960
"secret_access_key": null,
6061
"session_token": null
@@ -65,8 +66,9 @@ Comprehensive validation system for all configuration sections.
6566
**Fields:**
6667
- `region`: Cloud region for API calls
6768
- `profile`: Credential profile to use
68-
- `max_retries`: Number of API call retries
69-
- `timeout`: API call timeout in seconds
69+
- `aws_max_retries`: Number of API call retries
70+
- `aws_connect_timeout`: Connection timeout in seconds
71+
- `aws_read_timeout`: API call timeout in seconds
7072
- `access_key_id`: Direct access key (optional)
7173
- `secret_access_key`: Direct secret key (optional)
7274
- `session_token`: Session token for temporary credentials (optional)
@@ -236,8 +238,9 @@ log_level = config.get_logging_config().level
236238
# Configuration is type-safe with dataclasses
237239
aws_config = config.get_aws_config()
238240
assert isinstance(aws_config.region, str)
239-
assert isinstance(aws_config.max_retries, int)
240-
assert isinstance(aws_config.timeout, int)
241+
assert isinstance(aws_config.aws_max_retries, int)
242+
assert isinstance(aws_config.aws_connect_timeout, int)
243+
assert isinstance(aws_config.aws_read_timeout, int)
241244

242245
# Optional fields are properly typed
243246
if aws_config.access_key_id is not None:
@@ -323,11 +326,15 @@ def validate_aws_config(aws_config: AWSConfig) -> None:
323326
raise ValueError("AWS region is required")
324327

325328
# Retry validation
326-
if aws_config.max_retries < 0:
329+
if aws_config.aws_max_retries < 0:
327330
raise ValueError("Max retries cannot be negative")
328331

332+
# Connection timeout validation
333+
if aws_config.aws_connect_timeout <= 0:
334+
raise ValueError("Connect timeout must be positive")
335+
329336
# Timeout validation
330-
if aws_config.timeout <= 0:
337+
if aws_config.aws_read_timeout <= 0:
331338
raise ValueError("Timeout must be positive")
332339

333340
# Credential validation
@@ -392,8 +399,9 @@ def validate_repository_config(repo_config: RepositoryConfig) -> None:
392399
{
393400
"aws": {
394401
"region": "us-east-1",
395-
"max_retries": 5,
396-
"timeout": 60
402+
"aws_max_retries": 5,
403+
"aws_connect_timeout": 10,
404+
"aws_read_timeout": 60
397405
},
398406
"logging": {
399407
"level": "INFO",

src/config/loader.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@ class ConfigurationLoader:
6767
"AWS_KEY_FILE": ("aws", "key_file"),
6868
"AWS_PROXY_HOST": ("aws", "proxy_host"),
6969
"AWS_PROXY_PORT": ("aws", "proxy_port"),
70+
"AWS_CONNECT_TIMEOUT": ("aws", "aws_connect_timeout"),
7071
"AWS_CONNECTION_TIMEOUT_MS": ("aws", "connection_timeout_ms"),
7172
"AWS_REQUEST_RETRY_ATTEMPTS": ("aws", "request_retry_attempts"),
7273
"AWS_INSTANCE_PENDING_TIMEOUT_SEC": ("aws", "instance_pending_timeout_sec"),

src/config/validators/config_validator.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -71,14 +71,17 @@ def _validate_business_rules(self, config: AppConfig, result: ValidationResult)
7171
aws_config = provider.config
7272

7373
# Validate AWS-specific business rules
74-
if hasattr(aws_config, "max_retries") and aws_config.max_retries > 10:
74+
if hasattr(aws_config, "aws_max_retries") and aws_config.aws_max_retries > 10:
7575
result.add_warning(
76-
f"AWS provider '{provider.name}' max_retries is very high, consider reducing for better performance"
76+
f"AWS provider '{provider.name}' aws_max_retries is very high, consider reducing for better performance"
7777
)
7878

79-
if hasattr(aws_config, "timeout") and aws_config.timeout > 300:
79+
if (
80+
hasattr(aws_config, "aws_read_timeout")
81+
and aws_config.aws_read_timeout > 300
82+
):
8083
result.add_warning(
81-
f"AWS provider '{provider.name}' timeout is very high, consider reducing to avoid long waits"
84+
f"AWS provider '{provider.name}' aws_read_timeout is very high, consider reducing to avoid long waits"
8285
)
8386

8487
# Validate template configuration

src/infrastructure/auth/strategy/no_auth_strategy.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ async def validate_token(self, token: str) -> AuthResult:
5757
user_id="anonymous",
5858
user_roles=["anonymous"],
5959
permissions=["*"],
60-
metadata={"strategy": "no_auth", "token_validation": "skipped"},
60+
metadata={"strategy": "no_auth", "token_validation": "skipped"}, # nosec B105
6161
)
6262

6363
async def refresh_token(self, refresh_token: str) -> AuthResult:
@@ -73,7 +73,7 @@ async def refresh_token(self, refresh_token: str) -> AuthResult:
7373
return AuthResult(
7474
status=AuthStatus.SUCCESS,
7575
user_id="anonymous",
76-
metadata={"strategy": "no_auth", "token_refresh": "not_applicable"},
76+
metadata={"strategy": "no_auth", "token_refresh": "not_applicable"}, # nosec B105
7777
)
7878

7979
async def revoke_token(self, token: str) -> bool:

src/interface/mcp_command_handlers.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -323,7 +323,7 @@ def _format_validation_table(result: dict[str, Any]) -> dict[str, Any]:
323323
rows = []
324324

325325
for check in result["checks"]:
326-
status_symbol = {"PASS": "PASS", "WARNING": "WARN", "FAIL": "FAIL"}.get(
326+
status_symbol = {"PASS": "PASS", "WARNING": "WARN", "FAIL": "FAIL"}.get( # nosec B105
327327
check["status"], "UNKNOWN"
328328
)
329329

0 commit comments

Comments
 (0)