{
  "incident_id": "INC-2026-0402",
  "title": "Kubernetes node pool scaling failure causing pod scheduling backlog",
  "severity": "SEV1",
  "timeframe": {
    "start": "2026-02-17T09:00:00Z",
    "end": null
  },
  "alerts": [
    {
      "name": "AKS-NodePool-ScaleFailure",
      "description": "Node pool 'workload-pool' failed to scale from 5 to 12 nodes. Azure VMSS error: InsufficientCapacity.",
      "timestamp": "2026-02-17T09:02:00Z"
    },
    {
      "name": "HPA-MaxReplicas-Reached",
      "description": "Horizontal Pod Autoscaler for order-processor at max replicas (20), cannot scale further.",
      "timestamp": "2026-02-17T09:03:00Z"
    },
    {
      "name": "PodScheduling-Backlog-Critical",
      "description": "47 pods in Pending state for >5 minutes due to insufficient CPU/memory.",
      "timestamp": "2026-02-17T09:05:00Z"
    },
    {
      "name": "API-Latency-Degraded",
      "description": "P95 latency for /api/orders exceeded 8s (threshold: 2s).",
      "timestamp": "2026-02-17T09:08:00Z"
    }
  ],
  "logs": [
    {
      "source": "cluster-autoscaler",
      "lines": [
        "2026-02-17T09:01:00Z INFO Scale-up triggered: need 7 more nodes for pending pods",
        "2026-02-17T09:01:30Z WARN VMSS scale-up request sent for workload-pool",
        "2026-02-17T09:02:00Z ERROR VMSS scale-up FAILED: OverconstrainedAllocationRequest – not enough capacity for Standard_D8s_v3 in westus2",
        "2026-02-17T09:02:01Z WARN Retrying with backoff (attempt 2/5)",
        "2026-02-17T09:04:00Z ERROR All 5 retry attempts exhausted for Standard_D8s_v3",
        "2026-02-17T09:04:01Z INFO Attempting alternate VM size Standard_D8s_v5...",
        "2026-02-17T09:04:30Z ERROR Alternate size also failed: InsufficientCapacity"
      ]
    },
    {
      "source": "kube-scheduler",
      "lines": [
        "2026-02-17T09:03:00Z WARN 0/5 nodes available: 5 Insufficient cpu, 3 Insufficient memory",
        "2026-02-17T09:05:00Z WARN 47 pods unschedulable – waiting for node scale-up",
        "2026-02-17T09:08:00Z ERROR Preemption attempted but no lower-priority pods found"
      ]
    },
    {
      "source": "order-processor",
      "lines": [
        "2026-02-17T09:06:00Z WARN Request queue depth: 3,421 (normal: ~200)",
        "2026-02-17T09:07:00Z ERROR Processing timeout for batch ORD-BATCH-20260217-AM",
        "2026-02-17T09:08:30Z WARN Dropping oldest requests from queue (TTL exceeded)"
      ]
    }
  ],
  "metrics": [
    {
      "name": "aks_node_count",
      "window": "10m",
      "values_summary": "Stuck at 5 nodes since 09:00Z despite scale target of 12"
    },
    {
      "name": "pending_pods_count",
      "window": "5m",
      "values_summary": "Grew from 0 to 47 between 09:02Z and 09:05Z"
    },
    {
      "name": "order_processing_queue_depth",
      "window": "5m",
      "values_summary": "Normal ~200, peaked at 3,421 at 09:06Z"
    },
    {
      "name": "api_orders_p95_latency_ms",
      "window": "5m",
      "values_summary": "Baseline 400ms, degraded to 8,200ms at 09:08Z"
    }
  ],
  "runbook_excerpt": "Step 1: Check cluster-autoscaler logs for scale failure reason. Step 2: If InsufficientCapacity, try alternate VM SKU or alternate AZ. Step 3: If urgent, manually scale another node pool. Step 4: Check if PodDisruptionBudgets are blocking preemption. Step 5: If queue backlog > 1000, enable overflow to secondary cluster.",
  "constraints": {
    "max_time_minutes": 30,
    "environment": "production",
    "region": "westus2"
  }
}
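
The runbook_excerpt above compresses five triage steps into one string; the sketch below shows how the first four might map onto concrete az/kubectl calls. It is a minimal, hypothetical Python driver kept outside the scenario JSON (which cannot carry comments or code): the resource group, cluster name, overflow pool name and SKU, and the alternate "systempool" are all assumptions, since none of those values appear in the incident data.

# Hypothetical triage driver for runbook Steps 1-4 above.
# RESOURCE_GROUP, CLUSTER, pool names, and the alternate SKU are invented
# placeholders -- the scenario data does not name them.
import subprocess

RESOURCE_GROUP = "rg-orders-prod"   # assumption: not given in the incident data
CLUSTER = "aks-orders-westus2"      # assumption: not given in the incident data

def run(cmd: list[str]) -> str:
    """Run a CLI command, raise on failure, return stdout."""
    return subprocess.run(cmd, check=True, capture_output=True, text=True).stdout

# Step 1: read the cluster-autoscaler status (AKS surfaces it as a
# configmap in kube-system) to confirm the scale-failure reason.
print(run(["kubectl", "-n", "kube-system", "get", "configmap",
           "cluster-autoscaler-status", "-o", "yaml"]))

# Step 2: on InsufficientCapacity, add a temporary pool on an alternate
# SKU family in the same region (SKU choice here is illustrative only).
run(["az", "aks", "nodepool", "add",
     "--resource-group", RESOURCE_GROUP, "--cluster-name", CLUSTER,
     "--name", "overflowpool", "--node-count", "7",
     "--node-vm-size", "Standard_D4as_v5"])

# Step 3 (urgent alternative to Step 2): manually scale an existing
# alternate pool instead of waiting for a new pool to provision.
run(["az", "aks", "nodepool", "scale",
     "--resource-group", RESOURCE_GROUP, "--cluster-name", CLUSTER,
     "--name", "systempool", "--node-count", "8"])

# Step 4: list PodDisruptionBudgets that may be blocking preemption,
# then re-check the pending-pod backlog.
print(run(["kubectl", "get", "pdb", "--all-namespaces"]))
print(run(["kubectl", "get", "pods", "--all-namespaces",
           "--field-selector=status.phase=Pending"]))

Step 5 (overflow to a secondary cluster) is deliberately omitted: the scenario gives no detail about that cluster or its traffic-routing mechanism, so any sketch of it would be pure invention.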