From d749c60f2a9677efc387d93dedd871cf72b0a35c Mon Sep 17 00:00:00 2001 From: Filippo Date: Wed, 4 Feb 2026 13:37:00 +0100 Subject: [PATCH 1/5] feat(kyverno): enhance error pattern handling with inclusion/exclusion options --- README.md | 4 +-- examples/main.tf | 13 ++++++-- kyverno.tf | 86 ++++++++++++++++++++++++++++++------------------ variables.tf | 20 +++++++++-- 4 files changed, 83 insertions(+), 40 deletions(-) diff --git a/README.md b/README.md index 2f092e2..48e5ea8 100644 --- a/README.md +++ b/README.md @@ -40,7 +40,7 @@ Supported services: | Name | Version | |------|---------| -| [google](#provider\_google) | >= 5.10 | +| [google](#provider\_google) | 7.15.0 | ## Requirements @@ -56,7 +56,7 @@ Supported services: | [cert\_manager](#input\_cert\_manager) | Configuration for cert-manager missing issuer log alert. Allows customization of project, cluster, namespace, notification channels, alert documentation, enablement, extra filters, auto-close timing, and notification rate limiting. |
object({
enabled = optional(bool, true)
cluster_name = optional(string, null)
project_id = optional(string, null)
namespace = optional(string, "cert-manager")
notification_enabled = optional(bool, true)
notification_channels = optional(list(string), [])
logmatch_notification_rate_limit = optional(string, "300s")
alert_documentation = optional(string, null)
auto_close_seconds = optional(number, 3600)
filter_extra = optional(string, "")
})
| `{}` | no | | [cloud\_sql](#input\_cloud\_sql) | Configuration for Cloud SQL monitoring alerts. Supports customization of project, auto-close timing, notification channels, and per-instance alert thresholds for CPU, memory, and disk utilization. |
object({
enabled = optional(bool, true)
project_id = optional(string, null)
auto_close = optional(string, "86400s") # default 24h
notification_enabled = optional(bool, true)
notification_channels = optional(list(string), [])
instances = optional(map(object({
cpu_utilization = optional(list(object({
severity = optional(string, "WARNING"),
threshold = optional(number, 0.90)
alignment_period = optional(string, "120s")
duration = optional(string, "300s")
})), [
{
threshold = 0.85,
duration = "1200s",
},
{
severity = "CRITICAL",
threshold = 1,
duration = "300s",
alignment_period = "60s",
}
])
memory_utilization = optional(list(object({
severity = optional(string, "WARNING"),
threshold = optional(number, 0.90)
alignment_period = optional(string, "300s")
duration = optional(string, "300s")
})), [
{
severity = "WARNING",
},
{
severity = "CRITICAL",
threshold = 0.95,
}
])
disk_utilization = optional(list(object({
severity = optional(string, "WARNING"),
threshold = optional(number, 0.85)
alignment_period = optional(string, "300s")
duration = optional(string, "600s")
})), [
{
severity = "WARNING",
},
{
severity = "CRITICAL",
threshold = 0.95,
}
])
})), {})
})
| `{}` | no | | [konnectivity\_agent](#input\_konnectivity\_agent) | Configuration for Konnectivity agent deployment replica alert in GKE. Triggers when there are no available replicas. |
object({
enabled = optional(bool, true)
cluster_name = optional(string, null)
project_id = optional(string, null)
namespace = optional(string, "kube-system")
deployment_name = optional(string, "konnectivity-agent")
duration_seconds = optional(number, 60)
auto_close_seconds = optional(number, 3600)
notification_enabled = optional(bool, true)
notification_channels = optional(list(string), [])
notification_prompts = optional(list(string), null)
})
| `{}` | no | -| [kyverno](#input\_kyverno) | Configuration for Kyverno monitoring alerts. Allows customization of cluster name, project, notification channels, alert documentation, metric thresholds, auto-close timing, enablement, extra filters, and namespace. |
object({
enabled = optional(bool, true)
cluster_name = optional(string, null)
project_id = optional(string, null)
notification_enabled = optional(bool, true)
notification_channels = optional(list(string), [])
# Rate limit for notifications, e.g. "300s" for 5 minutes, used only for log match alerts
logmatch_notification_rate_limit = optional(string, "300s")
alert_documentation = optional(string, null)
auto_close_seconds = optional(number, 3600)
filter_extra = optional(string, "")
namespace = optional(string, "kyverno")
})
| `{}` | no | +| [kyverno](#input\_kyverno) | Configuration for Kyverno monitoring alerts. Allows customization of cluster name, project, notification channels, alert documentation, metric thresholds, auto-close timing, enablement, error pattern inclusions/exclusions, and namespace. |
object({
enabled = optional(bool, true)
cluster_name = optional(string, null)
project_id = optional(string, null)
notification_enabled = optional(bool, true)
notification_channels = optional(list(string), [])
# Rate limit for notifications, e.g. "300s" for 5 minutes, used only for log match alerts
logmatch_notification_rate_limit = optional(string, "300s")
alert_documentation = optional(string, null)
auto_close_seconds = optional(number, 3600)
namespace = optional(string, "kyverno")
# List of error patterns to exclude from the default set.
# Default patterns available for exclusion:
# "internal error", "failed calling webhook", "timeout", "client-side throttling",
# "failed to run warmup", "schema not found", "failed to list resources",
# "failed to watch resource", "context deadline exceeded", "is forbidden",
# "cannot list resource", "cannot watch resource", "RBAC.*denied",
# "failed to start watcher", "leader election lost", "unable to update .*WebhookConfiguration",
# "failed to sync", "dropping request", "failed to load certificate",
# "failed to update lock", "the object has been modified", "no matches for kind",
# "the server could not find the requested resource", "Too Many Requests", "x509",
# "is invalid:", "connection refused", "no agent available", "fatal error", "panic"
error_patterns_exclude = optional(list(string), [])
# List of additional error patterns to include (added to default set)
# e.g. ["my custom error", "another pattern"]
error_patterns_include = optional(list(string), [])
})
| `{}` | no | | [litellm](#input\_litellm) | Configuration for LiteLLM monitoring alerts. Supports uptime checks for HTTP endpoints and container-level alerts (pod restarts) in GKE. Each app is identified by its name (map key). |
object({
enabled = optional(bool, false)
project_id = optional(string, null)
notification_enabled = optional(bool, true)
notification_channels = optional(list(string), [])
cluster_name = optional(string, null)

apps = optional(map(object({
uptime_check = optional(object({
enabled = optional(bool, true)
host = string
path = optional(string, "/health/readiness")
}), null)

container_check = optional(object({
enabled = optional(bool, true)
namespace = string
pod_restart = optional(object({
threshold = optional(number, 0)
alignment_period = optional(number, 60)
duration = optional(number, 180)
auto_close_seconds = optional(number, 3600)
notification_prompts = optional(list(string), null)
}), {})
}), null)
})), {})
})
| `{}` | no | | [notification\_channels](#input\_notification\_channels) | List of notification channel IDs to notify when an alert is triggered | `list(string)` | `[]` | no | | [project\_id](#input\_project\_id) | The Google Cloud project ID where logging exclusions will be created | `string` | n/a | yes | diff --git a/examples/main.tf b/examples/main.tf index 8f31b2d..49c8391 100644 --- a/examples/main.tf +++ b/examples/main.tf @@ -50,9 +50,16 @@ module "example" { kyverno = { cluster_name = "test-cluster" notification_channels = [] - # Optional filter for log entries, exclude known non-actionable messages - # e.g., "-textPayload:\"stale GroupVersion discovery: metrics.k8s.io/v1beta1\"" - filter_extra = "-textPayload:\"stale GroupVersion discovery: metrics.k8s.io/v1beta1\"" + # Exclude specific error patterns from the default set + error_patterns_exclude = [ + "failed to start watcher", + "failed to list resources", + ] + # Add custom error patterns to the default set + # error_patterns_include = [ + # "my custom error", + # "another pattern to match", + # ] } cert_manager = { cluster_name = "test-cluster" diff --git a/kyverno.tf b/kyverno.tf index 132f365..9cbc304 100644 --- a/kyverno.tf +++ b/kyverno.tf @@ -5,7 +5,58 @@ locals { kyverno_cluster_name = var.kyverno.cluster_name != null ? trimspace(var.kyverno.cluster_name) : "" - kyverno_log_filter = local.kyverno_cluster_name != "" ? 
(<<-EOT + # Default error patterns for Kyverno log matching + kyverno_default_error_patterns = [ + "internal error", + "failed calling webhook", + "timeout", + "client-side throttling", + "failed to run warmup", + "schema not found", + "failed to list resources", + "failed to watch resource", + "context deadline exceeded", + "is forbidden", + "cannot list resource", + "cannot watch resource", + "RBAC.*denied", + "failed to start watcher", + "leader election lost", + "unable to update .*WebhookConfiguration", + "failed to sync", + "dropping request", + "failed to load certificate", + "failed to update lock", + "the object has been modified", + "no matches for kind", + "the server could not find the requested resource", + "Too Many Requests", + "x509", + "is invalid:", + "connection refused", + "no agent available", + "fatal error", + "panic", + ] + + # Combine default patterns with included patterns, then filter out excluded ones + kyverno_all_error_patterns = concat( + local.kyverno_default_error_patterns, + var.kyverno.error_patterns_include + ) + + kyverno_active_error_patterns = [ + for pattern in local.kyverno_all_error_patterns : + pattern if !contains(var.kyverno.error_patterns_exclude, pattern) + ] + + # Build the error patterns filter string + kyverno_error_patterns_filter = length(local.kyverno_active_error_patterns) > 0 ? join("\n OR ", [ + for pattern in local.kyverno_active_error_patterns : + "jsonPayload.error=~\"(?i)${pattern}\"" + ]) : "" + + kyverno_log_filter = local.kyverno_cluster_name != "" && length(local.kyverno_active_error_patterns) > 0 ? 
(<<-EOT resource.type="k8s_container" AND resource.labels.project_id="${local.kyverno_project_id}" AND resource.labels.cluster_name="${local.kyverno_cluster_name}" @@ -15,38 +66,8 @@ locals { OR resource.labels.pod_name=~"kyverno-(admission|background|cleanup|reports)-controller-.*" ) AND ( - jsonPayload.error=~"(?i)internal error" - OR jsonPayload.error=~"(?i)failed calling webhook" - OR jsonPayload.error=~"(?i)timeout" - OR jsonPayload.error=~"(?i)client-side throttling" - OR jsonPayload.error=~"(?i)failed to run warmup" - OR jsonPayload.error=~"(?i)schema not found" - OR jsonPayload.error=~"(?i)failed to list resources" - OR jsonPayload.error=~"(?i)failed to watch resource" - OR jsonPayload.error=~"(?i)context deadline exceeded" - OR jsonPayload.error=~"(?i)is forbidden" - OR jsonPayload.error=~"(?i)cannot list resource" - OR jsonPayload.error=~"(?i)cannot watch resource" - OR jsonPayload.error=~"(?i)RBAC.*denied" - OR jsonPayload.error=~"(?i)failed to start watcher" - OR jsonPayload.error=~"(?i)leader election lost" - OR jsonPayload.error=~"(?i)unable to update .*WebhookConfiguration" - OR jsonPayload.error=~"(?i)failed to sync" - OR jsonPayload.error=~"(?i)dropping request" - OR jsonPayload.error=~"(?i)failed to load certificate" - OR jsonPayload.error=~"(?i)failed to update lock" - OR jsonPayload.error=~"(?i)the object has been modified" - OR jsonPayload.error=~"(?i)no matches for kind" - OR jsonPayload.error=~"(?i)the server could not find the requested resource" - OR jsonPayload.error=~"(?i)Too Many Requests" - OR jsonPayload.error=~"(?i)x509" - OR jsonPayload.error=~"(?i)is invalid:" - OR jsonPayload.error=~"(?i)connection refused" - OR jsonPayload.error=~"(?i)no agent available" - OR jsonPayload.error=~"(?i)fatal error" - OR jsonPayload.error=~"(?i)panic" + ${local.kyverno_error_patterns_filter} ) - ${trimspace(var.kyverno.filter_extra)} EOT ) : "" } @@ -55,6 +76,7 @@ resource "google_monitoring_alert_policy" "kyverno_logmatch_alert" { count = ( 
var.kyverno.enabled && local.kyverno_cluster_name != "" + && length(local.kyverno_active_error_patterns) > 0 ) ? 1 : 0 display_name = "Kyverno controllers ERROR logs (namespace=${var.kyverno.namespace})" diff --git a/variables.tf b/variables.tf index dcde9b9..4b5a5bf 100644 --- a/variables.tf +++ b/variables.tf @@ -69,7 +69,7 @@ variable "cloud_sql" { } variable "kyverno" { - description = "Configuration for Kyverno monitoring alerts. Allows customization of cluster name, project, notification channels, alert documentation, metric thresholds, auto-close timing, enablement, extra filters, and namespace." + description = "Configuration for Kyverno monitoring alerts. Allows customization of cluster name, project, notification channels, alert documentation, metric thresholds, auto-close timing, enablement, error pattern inclusions/exclusions, and namespace." default = {} type = object({ enabled = optional(bool, true) @@ -81,8 +81,22 @@ variable "kyverno" { logmatch_notification_rate_limit = optional(string, "300s") alert_documentation = optional(string, null) auto_close_seconds = optional(number, 3600) - filter_extra = optional(string, "") - namespace = optional(string, "kyverno") + namespace = optional(string, "kyverno") + # List of error patterns to exclude from the default set. 
+ # Default patterns available for exclusion: + # "internal error", "failed calling webhook", "timeout", "client-side throttling", + # "failed to run warmup", "schema not found", "failed to list resources", + # "failed to watch resource", "context deadline exceeded", "is forbidden", + # "cannot list resource", "cannot watch resource", "RBAC.*denied", + # "failed to start watcher", "leader election lost", "unable to update .*WebhookConfiguration", + # "failed to sync", "dropping request", "failed to load certificate", + # "failed to update lock", "the object has been modified", "no matches for kind", + # "the server could not find the requested resource", "Too Many Requests", "x509", + # "is invalid:", "connection refused", "no agent available", "fatal error", "panic" + error_patterns_exclude = optional(list(string), []) + # List of additional error patterns to include (added to default set) + # e.g. ["my custom error", "another pattern"] + error_patterns_include = optional(list(string), []) }) validation { From 71ea1531b94c8ebe488047cd2b88d6fcfa457010 Mon Sep 17 00:00:00 2001 From: Filippo Date: Wed, 4 Feb 2026 13:47:24 +0100 Subject: [PATCH 2/5] feat(kyverno): enhance error pattern handling with inclusion/exclusion options --- CHANGELOG.md | 6 ++++++ README.md | 2 +- variables.tf | 38 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 45 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index df8e671..3c9340b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,12 @@ to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). - Adjust Kyverno log filter to reduce false positives from normal transient errors such as `i/o timeout` and `failed to acquire lease`, including removal of the explicit `failed to acquire lease` condition. - Rename error pattern `list resources failed` to `failed to list resources` for consistency with other error patterns. 
+### Added + +- Add `error_patterns_exclude` to Kyverno configuration to allow excluding specific error patterns from the default set. +- Add `error_patterns_include` to Kyverno configuration to allow adding custom error patterns to the default set. +- Add validation for `error_patterns_exclude` to ensure only valid default patterns can be excluded. + ## [0.12.0] - 2026-01-28 [Compare with previous version](https://github.com/sparkfabrik/terraform-google-services-monitoring/compare/0.11.0...0.12.0) diff --git a/README.md b/README.md index 48e5ea8..bd6233c 100644 --- a/README.md +++ b/README.md @@ -40,7 +40,7 @@ Supported services: | Name | Version | |------|---------| -| [google](#provider\_google) | 7.15.0 | +| [google](#provider\_google) | >= 5.10 | ## Requirements diff --git a/variables.tf b/variables.tf index 4b5a5bf..abea591 100644 --- a/variables.tf +++ b/variables.tf @@ -106,6 +106,44 @@ variable "kyverno" { ) error_message = "When 'enabled' is true, 'cluster_name' must be provided and cannot be empty." } + + validation { + condition = alltrue([ + for pattern in var.kyverno.error_patterns_exclude : contains([ + "internal error", + "failed calling webhook", + "timeout", + "client-side throttling", + "failed to run warmup", + "schema not found", + "failed to list resources", + "failed to watch resource", + "context deadline exceeded", + "is forbidden", + "cannot list resource", + "cannot watch resource", + "RBAC.*denied", + "failed to start watcher", + "leader election lost", + "unable to update .*WebhookConfiguration", + "failed to sync", + "dropping request", + "failed to load certificate", + "failed to update lock", + "the object has been modified", + "no matches for kind", + "the server could not find the requested resource", + "Too Many Requests", + "x509", + "is invalid:", + "connection refused", + "no agent available", + "fatal error", + "panic", + ], pattern) + ]) + error_message = "error_patterns_exclude contains invalid pattern(s). 
Only default patterns can be excluded. Check the variable description for the list of valid patterns." + } } variable "cert_manager" { From 53850b9024f07d5ee2c40649bf1bb3b45011ec15 Mon Sep 17 00:00:00 2001 From: Filippo Date: Wed, 4 Feb 2026 14:00:44 +0100 Subject: [PATCH 3/5] feat(kyverno): enhance error pattern handling with inclusion/exclusion options --- CHANGELOG.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3c9340b..ea2a4fd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,6 +23,13 @@ to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). - Add `error_patterns_include` to Kyverno configuration to allow adding custom error patterns to the default set. - Add validation for `error_patterns_exclude` to ensure only valid default patterns can be excluded. +### Breaking change + +- The `filter_extra` variable has been removed and replaced with `error_patterns_include` and `error_patterns_exclude`. To migrate: + - If you were using `filter_extra` to add custom error patterns, use `error_patterns_include` instead. + - If you need to exclude specific default error patterns, use `error_patterns_exclude`. + - See [examples/main.tf](examples/main.tf) for usage examples. 
+ ## [0.12.0] - 2026-01-28 [Compare with previous version](https://github.com/sparkfabrik/terraform-google-services-monitoring/compare/0.11.0...0.12.0) From 393d173b8a70bcb464ea763d0f9cd6712de8117e Mon Sep 17 00:00:00 2001 From: Filippo Date: Wed, 4 Feb 2026 14:17:01 +0100 Subject: [PATCH 4/5] feat(kyverno): enhance error pattern handling with inclusion/exclusion options --- CHANGELOG.md | 3 +- README.md | 2 +- cert_manager.tf | 2 +- examples/main.tf | 10 ++++--- kyverno.tf | 4 +-- modules/http_monitoring/main.tf | 6 ++-- variables.tf | 50 ++++++++++++++++++++++++++++++--- 7 files changed, 61 insertions(+), 16 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ea2a4fd..6c98e72 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,8 +26,9 @@ to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ### Breaking change - The `filter_extra` variable has been removed and replaced with `error_patterns_include` and `error_patterns_exclude`. To migrate: - - If you were using `filter_extra` to add custom error patterns, use `error_patterns_include` instead. + - If you were using `filter_extra` to add custom error patterns for `jsonPayload.error` matching, use `error_patterns_include` instead. - If you need to exclude specific default error patterns, use `error_patterns_exclude`. + - **Note:** The new options only support error pattern matching against `jsonPayload.error`. If you were using `filter_extra` for arbitrary log filter conditions (e.g., negative filters like `-textPayload:"..."`), this functionality is no longer available. - See [examples/main.tf](examples/main.tf) for usage examples. ## [0.12.0] - 2026-01-28 diff --git a/README.md b/README.md index bd6233c..2fc9de3 100644 --- a/README.md +++ b/README.md @@ -56,7 +56,7 @@ Supported services: | [cert\_manager](#input\_cert\_manager) | Configuration for cert-manager missing issuer log alert. 
Allows customization of project, cluster, namespace, notification channels, alert documentation, enablement, extra filters, auto-close timing, and notification rate limiting. |
object({
enabled = optional(bool, true)
cluster_name = optional(string, null)
project_id = optional(string, null)
namespace = optional(string, "cert-manager")
notification_enabled = optional(bool, true)
notification_channels = optional(list(string), [])
logmatch_notification_rate_limit = optional(string, "300s")
alert_documentation = optional(string, null)
auto_close_seconds = optional(number, 3600)
filter_extra = optional(string, "")
})
| `{}` | no | | [cloud\_sql](#input\_cloud\_sql) | Configuration for Cloud SQL monitoring alerts. Supports customization of project, auto-close timing, notification channels, and per-instance alert thresholds for CPU, memory, and disk utilization. |
object({
enabled = optional(bool, true)
project_id = optional(string, null)
auto_close = optional(string, "86400s") # default 24h
notification_enabled = optional(bool, true)
notification_channels = optional(list(string), [])
instances = optional(map(object({
cpu_utilization = optional(list(object({
severity = optional(string, "WARNING"),
threshold = optional(number, 0.90)
alignment_period = optional(string, "120s")
duration = optional(string, "300s")
})), [
{
threshold = 0.85,
duration = "1200s",
},
{
severity = "CRITICAL",
threshold = 1,
duration = "300s",
alignment_period = "60s",
}
])
memory_utilization = optional(list(object({
severity = optional(string, "WARNING"),
threshold = optional(number, 0.90)
alignment_period = optional(string, "300s")
duration = optional(string, "300s")
})), [
{
severity = "WARNING",
},
{
severity = "CRITICAL",
threshold = 0.95,
}
])
disk_utilization = optional(list(object({
severity = optional(string, "WARNING"),
threshold = optional(number, 0.85)
alignment_period = optional(string, "300s")
duration = optional(string, "600s")
})), [
{
severity = "WARNING",
},
{
severity = "CRITICAL",
threshold = 0.95,
}
])
})), {})
})
| `{}` | no | | [konnectivity\_agent](#input\_konnectivity\_agent) | Configuration for Konnectivity agent deployment replica alert in GKE. Triggers when there are no available replicas. |
object({
enabled = optional(bool, true)
cluster_name = optional(string, null)
project_id = optional(string, null)
namespace = optional(string, "kube-system")
deployment_name = optional(string, "konnectivity-agent")
duration_seconds = optional(number, 60)
auto_close_seconds = optional(number, 3600)
notification_enabled = optional(bool, true)
notification_channels = optional(list(string), [])
notification_prompts = optional(list(string), null)
})
| `{}` | no | -| [kyverno](#input\_kyverno) | Configuration for Kyverno monitoring alerts. Allows customization of cluster name, project, notification channels, alert documentation, metric thresholds, auto-close timing, enablement, error pattern inclusions/exclusions, and namespace. |
object({
enabled = optional(bool, true)
cluster_name = optional(string, null)
project_id = optional(string, null)
notification_enabled = optional(bool, true)
notification_channels = optional(list(string), [])
# Rate limit for notifications, e.g. "300s" for 5 minutes, used only for log match alerts
logmatch_notification_rate_limit = optional(string, "300s")
alert_documentation = optional(string, null)
auto_close_seconds = optional(number, 3600)
namespace = optional(string, "kyverno")
# List of error patterns to exclude from the default set.
# Default patterns available for exclusion:
# "internal error", "failed calling webhook", "timeout", "client-side throttling",
# "failed to run warmup", "schema not found", "failed to list resources",
# "failed to watch resource", "context deadline exceeded", "is forbidden",
# "cannot list resource", "cannot watch resource", "RBAC.*denied",
# "failed to start watcher", "leader election lost", "unable to update .*WebhookConfiguration",
# "failed to sync", "dropping request", "failed to load certificate",
# "failed to update lock", "the object has been modified", "no matches for kind",
# "the server could not find the requested resource", "Too Many Requests", "x509",
# "is invalid:", "connection refused", "no agent available", "fatal error", "panic"
error_patterns_exclude = optional(list(string), [])
# List of additional error patterns to include (added to default set)
# e.g. ["my custom error", "another pattern"]
error_patterns_include = optional(list(string), [])
})
| `{}` | no | +| [kyverno](#input\_kyverno) | Configuration for Kyverno monitoring alerts. Allows customization of cluster name, project, notification channels, alert documentation, metric thresholds, auto-close timing, enablement, error pattern inclusions/exclusions for jsonPayload.error matching, and namespace. |
object({
enabled = optional(bool, true)
cluster_name = optional(string, null)
project_id = optional(string, null)
notification_enabled = optional(bool, true)
notification_channels = optional(list(string), [])
# Rate limit for notifications, e.g. "300s" for 5 minutes, used only for log match alerts
logmatch_notification_rate_limit = optional(string, "300s")
alert_documentation = optional(string, null)
auto_close_seconds = optional(number, 3600)
namespace = optional(string, "kyverno")
# List of error patterns to exclude from the default set.
# Default patterns available for exclusion:
# "internal error", "failed calling webhook", "timeout", "client-side throttling",
# "failed to run warmup", "schema not found", "failed to list resources",
# "failed to watch resource", "context deadline exceeded", "is forbidden",
# "cannot list resource", "cannot watch resource", "RBAC.*denied",
# "failed to start watcher", "leader election lost", "unable to update .*WebhookConfiguration",
# "failed to sync", "dropping request", "failed to load certificate",
# "failed to update lock", "the object has been modified", "no matches for kind",
# "the server could not find the requested resource", "Too Many Requests", "x509",
# "is invalid:", "connection refused", "no agent available", "fatal error", "panic"
error_patterns_exclude = optional(list(string), [])
# List of additional regex error patterns to include (added to default set)
# e.g. ["my custom.*error", "failed to connect.*database"]
error_patterns_include = optional(list(string), [])
})
| `{}` | no | | [litellm](#input\_litellm) | Configuration for LiteLLM monitoring alerts. Supports uptime checks for HTTP endpoints and container-level alerts (pod restarts) in GKE. Each app is identified by its name (map key). |
object({
enabled = optional(bool, false)
project_id = optional(string, null)
notification_enabled = optional(bool, true)
notification_channels = optional(list(string), [])
cluster_name = optional(string, null)

apps = optional(map(object({
uptime_check = optional(object({
enabled = optional(bool, true)
host = string
path = optional(string, "/health/readiness")
}), null)

container_check = optional(object({
enabled = optional(bool, true)
namespace = string
pod_restart = optional(object({
threshold = optional(number, 0)
alignment_period = optional(number, 60)
duration = optional(number, 180)
auto_close_seconds = optional(number, 3600)
notification_prompts = optional(list(string), null)
}), {})
}), null)
})), {})
})
| `{}` | no | | [notification\_channels](#input\_notification\_channels) | List of notification channel IDs to notify when an alert is triggered | `list(string)` | `[]` | no | | [project\_id](#input\_project\_id) | The Google Cloud project ID where logging exclusions will be created | `string` | n/a | yes | diff --git a/cert_manager.tf b/cert_manager.tf index 7cb7fb7..bc2877d 100644 --- a/cert_manager.tf +++ b/cert_manager.tf @@ -8,7 +8,7 @@ locals { EOT ) cert_manager_notification_channels = var.cert_manager.notification_enabled ? (length(var.cert_manager.notification_channels) > 0 ? var.cert_manager.notification_channels : var.notification_channels) : [] - cert_manager_cluster_name = var.cert_manager.cluster_name != null ? trimspace(var.cert_manager.cluster_name) : "" + cert_manager_cluster_name = var.cert_manager.cluster_name != null ? trimspace(var.cert_manager.cluster_name) : "" cert_manager_log_filter = local.cert_manager_cluster_name != "" ? (<<-EOT ( diff --git a/examples/main.tf b/examples/main.tf index 49c8391..3fa851f 100644 --- a/examples/main.tf +++ b/examples/main.tf @@ -50,15 +50,17 @@ module "example" { kyverno = { cluster_name = "test-cluster" notification_channels = [] - # Exclude specific error patterns from the default set + # Exclude specific error patterns from the default set (only affects jsonPayload.error matching) error_patterns_exclude = [ "failed to start watcher", "failed to list resources", ] - # Add custom error patterns to the default set + # Add custom regex error patterns to the default set (matched against jsonPayload.error) + # Note: These options only support error pattern matching. Arbitrary log filter conditions + # (e.g., negative filters like -textPayload:"...") are not supported. 
# error_patterns_include = [ - # "my custom error", - # "another pattern to match", + # "my custom.*error", + # "failed to connect.*database", # ] } cert_manager = { diff --git a/kyverno.tf b/kyverno.tf index 9cbc304..ba37b1d 100644 --- a/kyverno.tf +++ b/kyverno.tf @@ -40,10 +40,10 @@ locals { ] # Combine default patterns with included patterns, then filter out excluded ones - kyverno_all_error_patterns = concat( + kyverno_all_error_patterns = distinct(concat( local.kyverno_default_error_patterns, var.kyverno.error_patterns_include - ) + )) kyverno_active_error_patterns = [ for pattern in local.kyverno_all_error_patterns : diff --git a/modules/http_monitoring/main.tf b/modules/http_monitoring/main.tf index f3b9eee..6500b29 100644 --- a/modules/http_monitoring/main.tf +++ b/modules/http_monitoring/main.tf @@ -1,7 +1,7 @@ locals { - suffix = var.uptime_monitoring_path != "/" ? var.uptime_monitoring_path : "" - uptime_monitoring_display_name = var.uptime_monitoring_display_name != "" ? "${var.uptime_monitoring_display_name} - ${var.uptime_monitoring_host}${local.suffix}" : "${var.uptime_monitoring_host}${local.suffix}" - alert_display_name = var.alert_display_name != "" ? var.alert_display_name : "Failure of uptime check for: ${local.uptime_monitoring_display_name}" + suffix = var.uptime_monitoring_path != "/" ? var.uptime_monitoring_path : "" + uptime_monitoring_display_name = var.uptime_monitoring_display_name != "" ? "${var.uptime_monitoring_display_name} - ${var.uptime_monitoring_host}${local.suffix}" : "${var.uptime_monitoring_host}${local.suffix}" + alert_display_name = var.alert_display_name != "" ? 
var.alert_display_name : "Failure of uptime check for: ${local.uptime_monitoring_display_name}" } resource "google_monitoring_uptime_check_config" "https_uptime" { diff --git a/variables.tf b/variables.tf index abea591..3f9982c 100644 --- a/variables.tf +++ b/variables.tf @@ -69,7 +69,7 @@ variable "cloud_sql" { } variable "kyverno" { - description = "Configuration for Kyverno monitoring alerts. Allows customization of cluster name, project, notification channels, alert documentation, metric thresholds, auto-close timing, enablement, error pattern inclusions/exclusions, and namespace." + description = "Configuration for Kyverno monitoring alerts. Allows customization of cluster name, project, notification channels, alert documentation, metric thresholds, auto-close timing, enablement, error pattern inclusions/exclusions for jsonPayload.error matching, and namespace." default = {} type = object({ enabled = optional(bool, true) @@ -81,7 +81,7 @@ variable "kyverno" { logmatch_notification_rate_limit = optional(string, "300s") alert_documentation = optional(string, null) auto_close_seconds = optional(number, 3600) - namespace = optional(string, "kyverno") + namespace = optional(string, "kyverno") # List of error patterns to exclude from the default set. # Default patterns available for exclusion: # "internal error", "failed calling webhook", "timeout", "client-side throttling", @@ -94,8 +94,8 @@ variable "kyverno" { # "the server could not find the requested resource", "Too Many Requests", "x509", # "is invalid:", "connection refused", "no agent available", "fatal error", "panic" error_patterns_exclude = optional(list(string), []) - # List of additional error patterns to include (added to default set) - # e.g. ["my custom error", "another pattern"] + # List of additional regex error patterns to include (added to default set) + # e.g. 
["my custom.*error", "failed to connect.*database"] error_patterns_include = optional(list(string), []) }) @@ -144,6 +144,48 @@ variable "kyverno" { ]) error_message = "error_patterns_exclude contains invalid pattern(s). Only default patterns can be excluded. Check the variable description for the list of valid patterns." } + + validation { + condition = ( + !var.kyverno.enabled || + length(setsubtract( + toset(concat([ + "internal error", + "failed calling webhook", + "timeout", + "client-side throttling", + "failed to run warmup", + "schema not found", + "failed to list resources", + "failed to watch resource", + "context deadline exceeded", + "is forbidden", + "cannot list resource", + "cannot watch resource", + "RBAC.*denied", + "failed to start watcher", + "leader election lost", + "unable to update .*WebhookConfiguration", + "failed to sync", + "dropping request", + "failed to load certificate", + "failed to update lock", + "the object has been modified", + "no matches for kind", + "the server could not find the requested resource", + "Too Many Requests", + "x509", + "is invalid:", + "connection refused", + "no agent available", + "fatal error", + "panic", + ], var.kyverno.error_patterns_include)), + toset(var.kyverno.error_patterns_exclude) + )) > 0 + ) + error_message = "The combination of error_patterns_exclude and error_patterns_include results in no active error patterns. At least one pattern must remain active, otherwise the alert will not be created." + } } variable "cert_manager" { From 4df43aec092d69f888aefe0ec16ec50ca9cb7e64 Mon Sep 17 00:00:00 2001 From: Filippo Date: Wed, 4 Feb 2026 14:34:28 +0100 Subject: [PATCH 5/5] docs(changelog): clarify note on new error pattern matching options --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6c98e72..2abf53f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -28,7 +28,7 @@ to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
- The `filter_extra` variable has been removed and replaced with `error_patterns_include` and `error_patterns_exclude`. To migrate: - If you were using `filter_extra` to add custom error patterns for `jsonPayload.error` matching, use `error_patterns_include` instead. - If you need to exclude specific default error patterns, use `error_patterns_exclude`. - - **Note:** The new options only support error pattern matching against `jsonPayload.error`. If you were using `filter_extra` for arbitrary log filter conditions (e.g., negative filters like `-textPayload:"..."`), this functionality is no longer available. + - **Note:** The new options only match regex patterns against `jsonPayload.error`. Arbitrary log filter conditions that `filter_extra` allowed (e.g., negative filters like `-textPayload:"..."`) are no longer supported. - See [examples/main.tf](examples/main.tf) for usage examples. ## [0.12.0] - 2026-01-28