Skip to content

Commit 5f3869d

Browse files
committed
feat: add configuration for Konnectivity agent replica alert in GKE
1 parent 5d08a9e commit 5f3869d

File tree

8 files changed

+126
-23
lines changed

8 files changed

+126
-23
lines changed

CHANGELOG.md

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,16 @@ to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
88

99
## [Unreleased]
1010

11+
### Changed
12+
13+
- Add Konnectivity agent replica alert with a PromQL-based condition that counts pods via `kubernetes_io:container_uptime`.
14+
- Standardize alert filter/query style for consistency across configuration.
15+
1116
## [0.10.0] - 2026-01-05
1217

1318
[Compare with previous version](https://github.com/sparkfabrik/terraform-google-services-monitoring/compare/0.9.0...0.10.0)
1419

15-
### Changed
20+
### Added
1621

1722
- Add `no agent available` to Kyverno log alert filter to capture control plane-to-node connectivity failures via Konnectivity (upstream Kubernetes); commonly seen on GKE (especially private nodes), but not GKE-specific.
1823

cloud_sql.tf

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,11 @@ resource "google_monitoring_alert_policy" "cloud_sql_cpu_utilization" {
6969

7070
conditions {
7171
condition_threshold {
72-
filter = "resource.type = \"cloudsql_database\" AND resource.labels.database_id = \"${local.cloud_sql_project}:${each.value.instance}\" AND metric.type = \"cloudsql.googleapis.com/database/cpu/utilization\""
72+
filter = <<-EOT
73+
resource.type="cloudsql_database"
74+
AND resource.labels.database_id="${local.cloud_sql_project}:${each.value.instance}"
75+
AND metric.type="cloudsql.googleapis.com/database/cpu/utilization"
76+
EOT
7377
comparison = "COMPARISON_GT"
7478
threshold_value = each.value.threshold
7579
duration = each.value.duration
@@ -101,7 +105,11 @@ resource "google_monitoring_alert_policy" "cloud_sql_memory_utilization" {
101105
conditions {
102106
display_name = "${local.cloud_sql_project} ${each.value.instance} - Memory utilization ${each.value.severity} ${each.value.threshold * 100}%"
103107
condition_threshold {
104-
filter = "resource.type = \"cloudsql_database\" AND resource.labels.database_id = \"${local.cloud_sql_project}:${each.value.instance}\" AND metric.type = \"cloudsql.googleapis.com/database/memory/utilization\""
108+
filter = <<-EOT
109+
resource.type="cloudsql_database"
110+
AND resource.labels.database_id="${local.cloud_sql_project}:${each.value.instance}"
111+
AND metric.type="cloudsql.googleapis.com/database/memory/utilization"
112+
EOT
105113
duration = each.value.duration
106114
comparison = "COMPARISON_GT"
107115
threshold_value = each.value.threshold
@@ -133,7 +141,11 @@ resource "google_monitoring_alert_policy" "cloud_sql_disk_utilization" {
133141
conditions {
134142
display_name = "${local.cloud_sql_project} ${each.value.instance} - Disk utilization ${each.value.severity} ${each.value.threshold * 100}%"
135143
condition_threshold {
136-
filter = "resource.type = \"cloudsql_database\" AND resource.labels.database_id = \"${local.cloud_sql_project}:${each.value.instance}\" AND metric.type = \"cloudsql.googleapis.com/database/disk/utilization\""
144+
filter = <<-EOT
145+
resource.type="cloudsql_database"
146+
AND resource.labels.database_id="${local.cloud_sql_project}:${each.value.instance}"
147+
AND metric.type="cloudsql.googleapis.com/database/disk/utilization"
148+
EOT
137149
duration = each.value.duration
138150
comparison = "COMPARISON_GT"
139151
threshold_value = each.value.threshold

examples/main.tf

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ locals {
4242
}
4343

4444
module "example" {
45-
source = "github.com/sparkfabrik/terraform-google-services-monitoring?ref=0.9.0"
45+
source = "github.com/sparkfabrik/terraform-google-services-monitoring?ref=0.9.0"
4646

4747
notification_channels = var.notification_channels
4848
project_id = var.project_id
@@ -55,22 +55,22 @@ module "example" {
5555
filter_extra = "-textPayload:\"stale GroupVersion discovery: metrics.k8s.io/v1beta1\""
5656
}
5757
cert_manager = {
58-
cluster_name = "test-cluster"
59-
namespace = "cert-manager"
58+
cluster_name = "test-cluster"
59+
namespace = "cert-manager"
6060
}
6161

6262
typesense = {
6363
cluster_name = "test-cluster"
6464
apps = {
6565
"typesense-app" = {
6666
uptime_check = {
67-
host = "typesense.example.com"
67+
host = "typesense.example.com"
6868
}
6969
container_check = {
7070
enabled = true
7171
namespace = "typesense"
7272
pod_restart = {
73-
threshold = 1
73+
threshold = 1
7474
}
7575
}
7676
}
@@ -82,13 +82,13 @@ module "example" {
8282
apps = {
8383
"litellm-app" = {
8484
uptime_check = {
85-
host = "litellm.example.com"
85+
host = "litellm.example.com"
8686
}
8787
container_check = {
8888
namespace = "litellm"
8989
pod_restart = {
90-
threshold = 2
91-
duration = 300
90+
threshold = 2
91+
duration = 300
9292
notification_prompts = ["CLOSED"]
9393
}
9494
}

konnectivity_agent.tf

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
# Derived values shared by the Konnectivity agent alert policy.
locals {
  # Project that hosts the alert policy and is matched in the query: the
  # module-wide project unless the konnectivity_agent object sets its own.
  konnectivity_agent_project = var.konnectivity_agent.project_id == null ? var.project_id : var.konnectivity_agent.project_id

  # Channels notified by the alert: none when notifications are disabled,
  # otherwise the alert-specific list, falling back to the module-wide list
  # when the alert-specific list is empty.
  konnectivity_agent_notification_channels = (
    !var.konnectivity_agent.notification_enabled
    ? []
    : (
      length(var.konnectivity_agent.notification_channels) == 0
      ? var.notification_channels
      : var.konnectivity_agent.notification_channels
    )
  )
}
18+
19+
# Critical alert that fires when the configured Konnectivity agent deployment
# has no pod reporting the `kubernetes_io:container_uptime` metric.
resource "google_monitoring_alert_policy" "konnectivity_agent_replicas" {
  # Create the policy only when the alert is enabled and a target cluster is
  # known; a null cluster name cannot be interpolated into the name or query.
  count = var.konnectivity_agent.enabled && var.konnectivity_agent.cluster_name != null ? 1 : 0

  project      = local.konnectivity_agent_project
  display_name = "CRITICAL: Konnectivity agent pod count == 0 (cluster=${var.konnectivity_agent.cluster_name}, namespace=${var.konnectivity_agent.namespace}, deployment=${var.konnectivity_agent.deployment_name})"
  combiner     = "OR"
  # Always true for an existing instance, because `count` already requires it.
  enabled = var.konnectivity_agent.enabled
  user_labels = {
    severity = "critical"
  }

  conditions {
    display_name = "Konnectivity agent pod count == 0"

    condition_prometheus_query_language {
      # Count distinct pods (one series per pod_name) of the target
      # controller. `or on() vector(0)` supplies a 0 sample when no series
      # match at all, so the `== 0` comparison still yields a result.
      query = <<-PROMQL
        (
          count(
            max by (pod_name) (
              kubernetes_io:container_uptime{
                monitored_resource="k8s_container",
                project_id="${local.konnectivity_agent_project}",
                cluster_name="${var.konnectivity_agent.cluster_name}",
                namespace_name="${var.konnectivity_agent.namespace}",
                metadata_system_top_level_controller_name="${var.konnectivity_agent.deployment_name}"
              }
            )
          )
          or on() vector(0)
        ) == 0
      PROMQL

      # The query must stay true for this long before the alert fires.
      duration = "${var.konnectivity_agent.duration_seconds}s"
    }
  }

  documentation {
    # Describe what the query actually measures, using the configured target
    # instead of a hard-coded namespace.
    content   = "CRITICAL: No Konnectivity agent pods are reporting uptime for deployment `${var.konnectivity_agent.deployment_name}` in namespace `${var.konnectivity_agent.namespace}` of cluster `${var.konnectivity_agent.cluster_name}`. Investigate immediately."
    mime_type = "text/markdown"
  }

  notification_channels = local.konnectivity_agent_notification_channels

  alert_strategy {
    auto_close           = "${var.konnectivity_agent.auto_close_seconds}s"
    notification_prompts = var.konnectivity_agent.notification_prompts
  }
}

lite_llm.tf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ resource "google_monitoring_alert_policy" "litellm_pod_restart" {
7474
notification_channels = local.litellm_notification_channels
7575

7676
alert_strategy {
77-
auto_close = "${each.value.pod_restart.auto_close_seconds}s"
77+
auto_close = "${each.value.pod_restart.auto_close_seconds}s"
7878
notification_prompts = each.value.pod_restart.notification_prompts
7979
}
8080
}

ssl_alert.tf

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,10 @@ resource "google_monitoring_alert_policy" "ssl_expiring_days" {
1111
combiner = "OR"
1212
conditions {
1313
condition_threshold {
14-
filter = "metric.type=\"monitoring.googleapis.com/uptime_check/time_until_ssl_cert_expires\" AND resource.type=\"uptime_url\""
14+
filter = <<-EOT
15+
metric.type="monitoring.googleapis.com/uptime_check/time_until_ssl_cert_expires"
16+
AND resource.type="uptime_url"
17+
EOT
1518
comparison = "COMPARISON_LT"
1619
threshold_value = each.value
1720
duration = "600s"

typesense.tf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ resource "google_monitoring_alert_policy" "typesense_pod_restart" {
7474
notification_channels = local.typesense_notification_channels
7575

7676
alert_strategy {
77-
auto_close = "${each.value.pod_restart.auto_close_seconds}s"
77+
auto_close = "${each.value.pod_restart.auto_close_seconds}s"
7878
notification_prompts = each.value.pod_restart.notification_prompts
7979
}
8080
}

variables.tf

Lines changed: 25 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,23 @@ variable "cert_manager" {
9999
})
100100
}
101101

102+
# Settings for the Konnectivity agent replica alert.
# `cluster_name` is optional so that the empty-object default is a valid value
# of this type (a required attribute would make `default = {}` an invalid
# default). Set `cluster_name` whenever the alert is enabled.
variable "konnectivity_agent" {
  description = "Configuration for Konnectivity agent deployment replica alert in GKE. Triggers when there are no available replicas."
  default     = {}
  type = object({
    enabled               = optional(bool, true)
    cluster_name          = optional(string, null)
    project_id            = optional(string, null)
    namespace             = optional(string, "kube-system")
    deployment_name       = optional(string, "konnectivity-agent")
    duration_seconds      = optional(number, 60)
    auto_close_seconds    = optional(number, 3600)
    notification_enabled  = optional(bool, true)
    notification_channels = optional(list(string), [])
    notification_prompts  = optional(list(string), null)
  })
}
118+
102119
variable "typesense" {
103120
description = "Configuration for Typesense monitoring alerts. Supports uptime checks for HTTP endpoints and container-level alerts (pod restarts) in GKE. Each app is identified by its name (map key)."
104121
default = {}
@@ -120,10 +137,10 @@ variable "typesense" {
120137
enabled = optional(bool, true)
121138
namespace = string
122139
pod_restart = optional(object({
123-
threshold = optional(number, 0)
124-
alignment_period = optional(number, 60)
125-
duration = optional(number, 180)
126-
auto_close_seconds = optional(number, 3600)
140+
threshold = optional(number, 0)
141+
alignment_period = optional(number, 60)
142+
duration = optional(number, 180)
143+
auto_close_seconds = optional(number, 3600)
127144
notification_prompts = optional(list(string), null)
128145
}), {})
129146
}), null)
@@ -171,10 +188,10 @@ variable "litellm" {
171188
enabled = optional(bool, true)
172189
namespace = string
173190
pod_restart = optional(object({
174-
threshold = optional(number, 0)
175-
alignment_period = optional(number, 60)
176-
duration = optional(number, 180)
177-
auto_close_seconds = optional(number, 3600)
191+
threshold = optional(number, 0)
192+
alignment_period = optional(number, 60)
193+
duration = optional(number, 180)
194+
auto_close_seconds = optional(number, 3600)
178195
notification_prompts = optional(list(string), null)
179196
}), {})
180197
}), null)

0 commit comments

Comments
 (0)