diff --git a/CHANGELOG.md b/CHANGELOG.md index 2d1da43..19f98b9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,13 @@ to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] +## [0.3.0] - 2025-10-07 + +[Compare with previous version](https://github.com/sparkfabrik/terraform-google-services-monitoring/compare/0.2.0...0.3.0) + +### Changed + +- Add kyverno alert log. - Update module documentation. ## [0.2.0] - 2024-10-17 diff --git a/Makefile b/Makefile index 526c105..f66fe83 100644 --- a/Makefile +++ b/Makefile @@ -1,13 +1,15 @@ +TERRAFORM_DOCS_VERSION ?= 0.20.0 +TERRAFORM_TF_LINT_VERSION ?= 0.59.1 +TERRAFORM_TF_SEC_VERSION ?= 1.28.14 .PHONY: lint tfscan generate-docs lint: - docker run --rm -v $${PWD}:/data -t ghcr.io/terraform-linters/tflint --var-file=/data/examples/test.tfvars - + docker run --rm -v $${PWD}:/data -t ghcr.io/terraform-linters/tflint:v$(TERRAFORM_TF_LINT_VERSION) --var-file=/data/examples/test.tfvars tfsec: - docker run --rm -it -v "$$(pwd):/src" aquasec/tfsec /src --tfvars-file=/src/examples/test.tfvars + docker run --rm -it -v "$$(pwd):/src" aquasec/tfsec:v$(TERRAFORM_TF_SEC_VERSION) /src --tfvars-file=/src/examples/test.tfvars generate-docs: lint docker run --rm -u $$(id -u) \ --volume "$(PWD):/terraform-docs" \ -w /terraform-docs \ - quay.io/terraform-docs/terraform-docs:0.16.0 markdown table --config .terraform-docs.yml --output-file README.md --output-mode inject . + quay.io/terraform-docs/terraform-docs:$(TERRAFORM_DOCS_VERSION) markdown table --config .terraform-docs.yml --output-file README.md --output-mode inject . 
diff --git a/README.md b/README.md index 2601d1d..18b825f 100644 --- a/README.md +++ b/README.md @@ -5,10 +5,16 @@ This module creates a set of monitoring alerts for Google Cloud Platform service Supported services: - Cloud SQL + - CPU usage - Storage usage - Memory usage +- Kyverno + + - Error logs for admission-controller, background-controller, cleanup-controller, reports-controller + - Metric threshold (optional) + ## Providers @@ -27,10 +33,10 @@ Supported services: | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| -| [auto\_close](#input\_auto\_close) | n/a | `string` | `"86400s"` | no | -| [cloud\_sql](#input\_cloud\_sql) | n/a |
object({
project = optional(string, null)
auto_close = optional(string, null)
notification_channels = optional(list(string), [])
instances = optional(map(object({
cpu_utilization = optional(list(object({
severity = optional(string, "WARNING"),
threshold = optional(number, 0.90)
alignment_period = optional(string, "120s")
duration = optional(string, "300s")
})), [
{
threshold = 0.85,
duration = "1200s",
},
{
severity = "CRITICAL",
threshold = 1,
duration = "300s",
alignment_period = "60s",
}
])
memory_utilization = optional(list(object({
severity = optional(string, "WARNING"),
threshold = optional(number, 0.90)
alignment_period = optional(string, "300s")
duration = optional(string, "300s")
})), [
{
severity = "WARNING",
},
{
severity = "CRITICAL",
threshold = 0.95,
}
])
disk_utilization = optional(list(object({
severity = optional(string, "WARNING"),
threshold = optional(number, 0.85)
alignment_period = optional(string, "300s")
duration = optional(string, "600s")
})), [
{
severity = "WARNING",
},
{
severity = "CRITICAL",
threshold = 0.95,
}
])
})), {})
}) | n/a | yes |
-| [notification\_channels](#input\_notification\_channels) | n/a | `list(string)` | `[]` | no |
-| [project](#input\_project) | n/a | `string` | `null` | no |
+| [cloud\_sql](#input\_cloud\_sql) | Configuration for Cloud SQL monitoring alerts. Supports customization of project, auto-close timing, notification channels, and per-instance alert thresholds for CPU, memory, and disk utilization. | object({
project_id = optional(string, null)
auto_close = optional(string, "86400s") # default 24h
notification_enabled = optional(bool, true)
notification_channels = optional(list(string), [])
instances = optional(map(object({
cpu_utilization = optional(list(object({
severity = optional(string, "WARNING"),
threshold = optional(number, 0.90)
alignment_period = optional(string, "120s")
duration = optional(string, "300s")
})), [
{
threshold = 0.85,
duration = "1200s",
},
{
severity = "CRITICAL",
threshold = 1,
duration = "300s",
alignment_period = "60s",
}
])
memory_utilization = optional(list(object({
severity = optional(string, "WARNING"),
threshold = optional(number, 0.90)
alignment_period = optional(string, "300s")
duration = optional(string, "300s")
})), [
{
severity = "WARNING",
},
{
severity = "CRITICAL",
threshold = 0.95,
}
])
disk_utilization = optional(list(object({
severity = optional(string, "WARNING"),
threshold = optional(number, 0.85)
alignment_period = optional(string, "300s")
duration = optional(string, "600s")
})), [
{
severity = "WARNING",
},
{
severity = "CRITICAL",
threshold = 0.95,
}
])
})), {})
}) | n/a | yes |
+| [kyverno](#input\_kyverno) | Configuration for Kyverno monitoring alerts. Allows customization of cluster name, project, notification channels, alert documentation, log-match notification rate limit, auto-close timing, enablement, extra filters, and namespace. | object({
enabled = optional(bool, true)
cluster_name = string
project_id = optional(string, null)
notification_enabled = optional(bool, true)
notification_channels = optional(list(string), [])
# Rate limit for notifications, e.g. "300s" for 5 minutes, used only for log match alerts
logmatch_notification_rate_limit = optional(string, "300s")
alert_documentation = optional(string, null)
auto_close_seconds = optional(number, 3600)
filter_extra = optional(string, "")
namespace = optional(string, "kyverno")
}) | n/a | yes |
+| [notification\_channels](#input\_notification\_channels) | List of notification channel IDs to notify when an alert is triggered | `list(string)` | `[]` | no |
+| [project\_id](#input\_project\_id) | The default Google Cloud project ID for the monitoring alerts, used when a service block does not set its own project\_id | `string` | n/a | yes |
## Outputs
@@ -47,10 +53,10 @@ Supported services:
| [google_monitoring_alert_policy.cloud_sql_cpu_utilization](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/monitoring_alert_policy) | resource |
| [google_monitoring_alert_policy.cloud_sql_disk_utilization](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/monitoring_alert_policy) | resource |
| [google_monitoring_alert_policy.cloud_sql_memory_utilization](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/monitoring_alert_policy) | resource |
+| [google_monitoring_alert_policy.kyverno_logmatch_alert](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/monitoring_alert_policy) | resource |
## Modules
No modules.
-
diff --git a/cloud-sql.tf b/cloud-sql.tf
index b1568d6..0b92afb 100644
--- a/cloud-sql.tf
+++ b/cloud-sql.tf
@@ -3,13 +3,10 @@
# ----------------------
locals {
# Use the cloud_sql project if specified, otherwise use the project.
- cloud_sql_project = var.cloud_sql.project != null ? var.cloud_sql.project : var.project
+ cloud_sql_project = var.cloud_sql.project_id != null ? var.cloud_sql.project_id : var.project_id
# Use the cloud_sql notification channels for if not specified in the configuration.
- cloud_sql_notification_channels = length(var.cloud_sql.notification_channels) > 0 ? var.cloud_sql.notification_channels : var.notification_channels
-
- # Use the cloud_sql auto_close if specified, otherwise use the auto_close.
- cloud_sql_auto_close = var.cloud_sql.auto_close != null ? var.cloud_sql.auto_close : var.auto_close
+ cloud_sql_notification_channels = var.cloud_sql.notification_enabled ? (length(var.cloud_sql.notification_channels) > 0 ? var.cloud_sql.notification_channels : var.notification_channels) : []
cloud_sql_cpu_utilization = {
for item in flatten(
@@ -22,7 +19,7 @@ locals {
},
cpu_utilization
)
- ]
+ ]
]
) : "${item.instance}--${item.severity}--${item.threshold}" => item
}
@@ -38,10 +35,10 @@ locals {
},
memory_utilization
)
- ]
+ ]
]
) : "${item.instance}--${item.severity}--${item.threshold}" => item
- }
+ }
cloud_sql_disk_utilization = {
for item in flatten(
@@ -54,10 +51,10 @@ locals {
},
disk_utilization
)
- ]
+ ]
]
) : "${item.instance}--${item.severity}--${item.threshold}" => item
- }
+ }
}
# ----------------------
@@ -67,7 +64,7 @@ resource "google_monitoring_alert_policy" "cloud_sql_cpu_utilization" {
for_each = local.cloud_sql_cpu_utilization
display_name = "${local.cloud_sql_project} ${each.value.instance} - CPU utilization ${each.value.severity} ${each.value.threshold * 100}%"
- combiner = "OR"
+ combiner = "OR"
severity = each.value.severity
conditions {
@@ -87,7 +84,7 @@ resource "google_monitoring_alert_policy" "cloud_sql_cpu_utilization" {
display_name = "${local.cloud_sql_project} ${each.value.instance} - CPU utilization ${each.value.severity} ${each.value.threshold * 100}%"
}
alert_strategy {
- auto_close = local.cloud_sql_auto_close
+ auto_close = var.cloud_sql.auto_close
}
notification_channels = local.cloud_sql_notification_channels
}
@@ -117,7 +114,7 @@ resource "google_monitoring_alert_policy" "cloud_sql_memory_utilization" {
}
alert_strategy {
- auto_close = local.cloud_sql_auto_close
+ auto_close = var.cloud_sql.auto_close
}
notification_channels = local.cloud_sql_notification_channels
@@ -149,7 +146,7 @@ resource "google_monitoring_alert_policy" "cloud_sql_disk_utilization" {
}
alert_strategy {
- auto_close = local.cloud_sql_auto_close
+ auto_close = var.cloud_sql.auto_close
}
notification_channels = local.cloud_sql_notification_channels
}
diff --git a/examples/main.tf b/examples/main.tf
index c2ceda7..b91edef 100644
--- a/examples/main.tf
+++ b/examples/main.tf
@@ -4,12 +4,12 @@
locals {
# Enable all Cdoud SQL monitorings on selected instances, eg.
- cloud_sql = {
- instances = {
- (google_sql_database_instance.master.name) = {}
+ cloud_sql = {
+ instances = {
+ (google_sql_database_instance.master.name) = {}
(google_sql_database_instance.stage.name) = {}
- }
- }
+ }
+ }
# Use custom Cloud SQL cpu monitoring on google_sql_database_instance.master.name
# Use all default Cloud SQL monitoring on google_sql_database_instance.stage.name
@@ -35,7 +35,7 @@ locals {
# cloud_sql = {
# instances = {
# (google_sql_database_instance.master.stage) = { cpu_utilization = [] }
- # (google_sql_database_instance.master.prod) = {}
+ # (google_sql_database_instance.master.prod) = {}
# }
# }
@@ -46,6 +46,16 @@ module "example" {
version = ">= 0.1.0"
notification_channels = var.notification_channels
- project = var.project
- cloud_sql = local.cloud_sql
+ project_id = var.project_id
+ cloud_sql = local.cloud_sql
+  kyverno = {
+    cluster_name                     = "test-cluster"
+    enabled                          = true
+    logmatch_notification_rate_limit = "300s" # at most one notification every 5 minutes
+    auto_close_seconds               = 3600   # passed to the alert strategy as "3600s"
+    notification_channels            = []     # empty list falls back to the module-level notification_channels
+    # Optional filter for log entries, exclude known non-actionable messages
+    # e.g., "-textPayload:\"stale GroupVersion discovery: metrics.k8s.io/v1beta1\""
+    filter_extra = "-textPayload:\"stale GroupVersion discovery: metrics.k8s.io/v1beta1\""
+  }
}
diff --git a/examples/test.tfvars b/examples/test.tfvars
index cd2a684..db7e868 100644
--- a/examples/test.tfvars
+++ b/examples/test.tfvars
@@ -1,5 +1,4 @@
-project = "Simple project"
-
+project_id = "simple-project"
notification_channels = [
"cloud_support_email",
"slack-channel"
diff --git a/examples/variables.tf b/examples/variables.tf
index aad9089..2a09651 100644
--- a/examples/variables.tf
+++ b/examples/variables.tf
@@ -1,10 +1,28 @@
-variable "project" {
- type = string
- default = ""
+variable "project_id" {
+  description = "The Google Cloud project ID passed to the module as the default project for the monitoring alerts"
+ type = string
}
variable "notification_channels" {
- type = list(string)
- default = []
+ description = "List of notification channel IDs to notify when an alert is triggered"
+ type = list(string)
+ default = []
+}
+
+variable "kyverno" {
+  description = "Configuration for Kyverno monitoring alerts. Allows customization of cluster name, project, notification channels, alert documentation, log-match notification rate limit, auto-close timing, enablement, extra filters, and namespace."
+  type = object({
+    enabled               = optional(bool, true)
+    cluster_name          = string
+    project_id            = optional(string, null)
+    notification_enabled  = optional(bool, true)
+    notification_channels = optional(list(string), [])
+    # Rate limit for notifications, e.g. "300s" for 5 minutes, used only for log match alerts
+    logmatch_notification_rate_limit = optional(string, "300s")
+    alert_documentation              = optional(string, null)
+    auto_close_seconds               = optional(number, 3600)
+    filter_extra                     = optional(string, "")
+    namespace                        = optional(string, "kyverno")
+  })
 }
diff --git a/kyverno_log_alert.tf b/kyverno_log_alert.tf
new file mode 100644
index 0000000..048a1b4
--- /dev/null
+++ b/kyverno_log_alert.tf
@@ -0,0 +1,50 @@
+locals {
+  # Kyverno-specific settings fall back to the module-level project and notification channels.
+  kyverno_project_id            = var.kyverno.project_id != null ? var.kyverno.project_id : var.project_id
+  kyverno_alert_documentation   = var.kyverno.alert_documentation != null ? var.kyverno.alert_documentation : "Kyverno controllers produced ERROR logs in namespace ${var.kyverno.namespace}."
+  kyverno_notification_channels = var.kyverno.notification_enabled ? (length(var.kyverno.notification_channels) > 0 ? var.kyverno.notification_channels : var.notification_channels) : []
+
+  kyverno_log_filter = <<-EOT
+    resource.type="k8s_container"
+    resource.labels.project_id="${local.kyverno_project_id}"
+    resource.labels.cluster_name="${var.kyverno.cluster_name}"
+    resource.labels.namespace_name="${var.kyverno.namespace}"
+    severity>=ERROR
+    (
+      labels."k8s-pod/app_kubernetes_io/component"=~"(admission-controller|background-controller|cleanup-controller|reports-controller)"
+      OR resource.labels.pod_name=~"kyverno-(admission|background|cleanup|reports)-controller-.*"
+    )
+    ${trimspace(var.kyverno.filter_extra)}
+  EOT
+}
+
+resource "google_monitoring_alert_policy" "kyverno_logmatch_alert" {
+  count = (var.kyverno.enabled && trimspace(var.kyverno.cluster_name) != "") ? 1 : 0
+
+  # Create the policy in the same project the log filter targets, not the provider default.
+  project      = local.kyverno_project_id
+  display_name = "Kyverno controllers ERROR logs (namespace=${var.kyverno.namespace})"
+  combiner     = "OR"
+  enabled      = var.kyverno.enabled # always true here: count is 0 when the alert is disabled
+
+  conditions {
+    display_name = "Kyverno ERROR in logs"
+    condition_matched_log {
+      filter = local.kyverno_log_filter
+    }
+  }
+
+  documentation {
+    content   = local.kyverno_alert_documentation
+    mime_type = "text/markdown"
+  }
+
+  notification_channels = local.kyverno_notification_channels
+
+  alert_strategy {
+    auto_close = "${var.kyverno.auto_close_seconds}s"
+    notification_rate_limit {
+      period = var.kyverno.logmatch_notification_rate_limit
+    }
+  }
+}
diff --git a/main.tf b/main.tf
index e69de29..8b13789 100644
--- a/main.tf
+++ b/main.tf
@@ -0,0 +1 @@
+
diff --git a/variables.tf b/variables.tf
index 98d4da0..14a0392 100644
--- a/variables.tf
+++ b/variables.tf
@@ -1,22 +1,20 @@
-variable "project" {
- type = string
- default = null
+variable "project_id" {
+  description = "The default Google Cloud project ID for the monitoring alerts, used when a service block does not set its own project_id"
+ type = string
}
variable "notification_channels" {
- type = list(string)
- default = []
-}
-
-variable "auto_close" {
- type = string
- default = "86400s" # 24h
+ description = "List of notification channel IDs to notify when an alert is triggered"
+ type = list(string)
+ default = []
}
variable "cloud_sql" {
+ description = "Configuration for Cloud SQL monitoring alerts. Supports customization of project, auto-close timing, notification channels, and per-instance alert thresholds for CPU, memory, and disk utilization."
type = object({
- project = optional(string, null)
- auto_close = optional(string, null)
+ project_id = optional(string, null)
+ auto_close = optional(string, "86400s") # default 24h
+ notification_enabled = optional(bool, true)
notification_channels = optional(list(string), [])
instances = optional(map(object({
cpu_utilization = optional(list(object({
@@ -30,9 +28,9 @@ variable "cloud_sql" {
duration = "1200s",
},
{
- severity = "CRITICAL",
- threshold = 1,
- duration = "300s",
+ severity = "CRITICAL",
+ threshold = 1,
+ duration = "300s",
alignment_period = "60s",
}
])
@@ -43,7 +41,7 @@ variable "cloud_sql" {
duration = optional(string, "300s")
})), [
{
- severity = "WARNING",
+ severity = "WARNING",
},
{
severity = "CRITICAL",
@@ -57,13 +55,30 @@ variable "cloud_sql" {
duration = optional(string, "600s")
})), [
{
- severity = "WARNING",
+ severity = "WARNING",
},
{
severity = "CRITICAL",
- threshold = 0.95,
+ threshold = 0.95,
}
])
})), {})
})
}
+
+variable "kyverno" {
+  description = "Configuration for Kyverno monitoring alerts. Allows customization of cluster name, project, notification channels, alert documentation, log-match notification rate limit, auto-close timing, enablement, extra filters, and namespace."
+  type = object({
+    enabled               = optional(bool, true)
+    cluster_name          = string
+    project_id            = optional(string, null)
+    notification_enabled  = optional(bool, true)
+    notification_channels = optional(list(string), [])
+    # Rate limit for notifications, e.g. "300s" for 5 minutes, used only for log match alerts
+    logmatch_notification_rate_limit = optional(string, "300s")
+    alert_documentation              = optional(string, null)
+    auto_close_seconds               = optional(number, 3600)
+    filter_extra                     = optional(string, "")
+    namespace                        = optional(string, "kyverno")
+  })
+}