forked from Tracer-Cloud/opensre
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathasset.yml
More file actions
164 lines (150 loc) · 6.06 KB
/
Copy pathasset.yml
File metadata and controls
164 lines (150 loc) · 6.06 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
# Asset-model header.
# schema_version is kept quoted so it loads as the string "1.0", not the float 1.0.
schema_version: '1.0'
asset_type: "rds_postgres"
display_name: "Amazon RDS PostgreSQL"
# Failure modes this asset model can exhibit.
# Each name below is also used as a key under topology_constraints and
# failure_mode_signatures in this file.
failure_modes:
  # Replica falls behind primary due to high write throughput.
  - replication_lag
  # max_connections ceiling reached; new connections are rejected.
  - connection_exhaustion
  # FreeStorageSpace → 0; writes begin to fail.
  - storage_full
  # Sustained CPU > 90%, driven by bad queries or batch jobs.
  - cpu_saturation
  # Multi-AZ automatic failover or manual promotion.
  - failover
# Telemetry sources available on AWS for this asset type.
# Three sources are declared, each naming the AWS API action(s) that serve it.
telemetry:
  # CloudWatch metrics published under the AWS/RDS namespace.
  timeseries:
    aws_api: "cloudwatch:GetMetricData"
    namespace: AWS/RDS
    key_metrics:
      - CPUUtilization
      - DatabaseConnections
      - FreeStorageSpace
      - FreeableMemory
      - ReadIOPS
      - WriteIOPS
      - ReadLatency
      - WriteLatency
      - ReplicaLag
      - TransactionLogsGeneration
      - NetworkTransmitThroughput

  # RDS control-plane events, filtered by category.
  events:
    aws_api: "rds:DescribeEvents"
    event_categories:
      - availability
      - backup
      - configuration change
      - creation
      - deletion
      - failover
      - failure
      - maintenance
      - notification
      - read replica
      - recovery
      - restoration
      - security

  # Performance Insights DB-load breakdowns.
  traces:
    # Two API actions in a single comma-separated string (not a list);
    # presumably split by the consumer — confirm before changing the type.
    aws_api: "pi:GetResourceMetrics,pi:DescribeDimensionKeys"
    display_name: Performance Insights
    dimensions:
      - db.sql         # top SQL statements by DB load
      - db.wait_event  # wait event breakdown
      - db.user        # top users by DB load
      - db.host        # top client hosts by DB load
# Instance class connection limits, keyed by DB instance class.
# max_connections = LEAST(DBInstanceClassMemory/9531392, 5000)
# The 5000 cap is why every class from db.r6g.2xlarge upward shares one value.
# NOTE(review): the figures match the formula applied to nominal instance
# memory (e.g. 1 GiB / 9531392 ≈ 112); the memory actually available to the
# engine may be lower — verify against live parameter-group values.
instance_connection_limits:
  db.t3.micro: 112
  db.t3.small: 225
  db.t3.medium: 450
  db.t3.large: 901
  db.r6g.large: 1802
  db.r6g.xlarge: 3604
  db.r6g.2xlarge: 5000
  db.r6g.4xlarge: 5000
  db.r6g.8xlarge: 5000
# Storage types and their throughput/IOPS characteristics.
# Each storage type is its own mapping; the key sets differ per type
# (gp2 has a per-GB baseline plus a burst figure, gp3 a flat baseline,
# io1 only maxima).
# NOTE(review): published limits differ between plain EBS volumes and RDS
# storage — verify these figures against current RDS documentation.
storage_types:
  gp2:
    baseline_iops_per_gb: 3
    burst_iops: 3000
    max_iops: 16000
    max_throughput_mb_per_s: 250

  gp3:
    baseline_iops: 3000
    max_iops: 64000
    max_throughput_mb_per_s: 1000
    iops_configurable: true

  io1:
    max_iops: 256000
    max_throughput_mb_per_s: 4000
    iops_configurable: true
# Which topology features must be present for a failure mode to be possible.
# One entry per name in failure_modes. `requires` is either a topology
# feature (read_replicas, multi_az) or `always`, which presumably means the
# mode has no topology prerequisite — confirm against the consumer.
topology_constraints:
  replication_lag:
    requires: read_replicas
  failover:
    requires: multi_az
  storage_full:
    requires: always
  connection_exhaustion:
    requires: always
  cpu_saturation:
    requires: always
# ---------------------------------------------------------------------------
# Scenario curriculum
# ---------------------------------------------------------------------------
# 4-level difficulty progression replacing the binary efficiency/reasoning split.
# Level is per-scenario; a failure in level 2 points to a different weakness than
# a failure in level 4 (noise-immunity vs causal reasoning).
#
# Keys are plain integers (1-4), not strings; keep them unquoted so lookups by
# integer level keep working.
difficulty_levels:
  1: "Single dominant signal — all evidence consistent, root cause identifiable in one step"
  2: "One confounder present — second evidence source needed to rule it out"
  3: "Absent or indirect evidence — key metric missing, agent must infer from what remains"
  4: "Compositional fault — two failure modes active and causally linked, agent must explain both"
# Known limitations of the scenario set: a list of single-key mappings,
# each mapping a gap name to a prose description.
# The description is a folded block scalar (`>`): its lines must be indented
# deeper than the key, and line breaks are folded into spaces when loaded.
known_gaps:
  - temporal_ordering: >
      All scenarios deliver evidence as a static snapshot. Production delivers evidence
      incrementally (alert fires → query metrics → query events → …). Testing temporal
      ordering requires architectural changes to the fixture backend and is out of scope.
# ---------------------------------------------------------------------------
# Failure-mode signatures
# ---------------------------------------------------------------------------
# Describes what each failure mode looks like in telemetry. Used as reference for
# fixture authoring and for evaluating whether the agent consulted the right evidence.
#
# MECE note: uniqueness is on (primary_signal × rate × corroborating_presence ×
# event_presence), not on primary_signal alone. 003 and 008 both map to storage_full
# but have distinct fingerprints: 003 has FreeStorageSpace present and trending to 0
# with elevated WriteIOPS; 008 has FreeStorageSpace absent from the fixture entirely.
#
# One entry per name in failure_modes. Signal names are CloudWatch metric names
# from telemetry.timeseries.key_metrics.
failure_mode_signatures:
  replication_lag:
    primary_signals: [ReplicaLag, TransactionLogsGeneration]
    corroborating_signals: [WriteIOPS]
    typical_confounders: [CPUUtilization]
    required_evidence: [aws_cloudwatch_metrics, aws_performance_insights]
    mece_basis: "primary_signal × rate × corroborating_presence"

  connection_exhaustion:
    primary_signals: [DatabaseConnections]
    corroborating_signals: [CPUUtilization]
    typical_confounders: [CPUUtilization]
    required_evidence: [aws_cloudwatch_metrics, aws_performance_insights]
    mece_basis: "primary_signal × saturation_level × wait_event_type"

  # NOTE(review): this entry carries mece_note but no mece_basis key, unlike
  # the other four modes — confirm consumers tolerate the missing key.
  storage_full:
    primary_signals: [FreeStorageSpace]
    corroborating_signals: [WriteIOPS, WriteLatency]
    typical_confounders: [WriteIOPS]     # low IOPS does NOT rule out storage_full
    required_evidence: [aws_rds_events]  # events alone can confirm when metric is absent
    mece_note: >
      003 and 008 share this failure mode. Distinct fingerprint: 003 has metric
      present+fast-declining with elevated WriteIOPS; 008 has FreeStorageSpace absent
      from the fixture entirely — agent must infer from events + PI write latency.

  cpu_saturation:
    primary_signals: [CPUUtilization]
    corroborating_signals: [ReadIOPS]
    typical_confounders: [DatabaseConnections]
    required_evidence: [aws_cloudwatch_metrics, aws_performance_insights]
    mece_basis: "primary_signal × query_fingerprint_in_PI"

  failover:
    primary_signals: []  # control-plane event IS the decisive signal
    corroborating_signals: [DatabaseConnections]
    typical_confounders: [CPUUtilization, ReplicaLag]
    required_evidence: [aws_rds_events]
    mece_basis: "event_sequence: Multi-AZ failover initiated → in progress → completed"