diff --git a/CHANGELOG.md b/CHANGELOG.md index 2d1da43..19f98b9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,14 @@ to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] +## [0.3.0] - 2025-10-07 + +[Compare with previous version](https://github.com/sparkfabrik/terraform-google-services-monitoring/compare/0.2.0...0.3.0) + +### Changed + +- Add Kyverno controllers error-log alert policy, configured through the new required `kyverno` input. +- **Breaking:** rename `project` to `project_id` (now required) and `cloud_sql.project` to `cloud_sql.project_id`; remove the top-level `auto_close` input in favour of `cloud_sql.auto_close`. - Update module documentation. ## [0.2.0] - 2024-10-17 diff --git a/Makefile b/Makefile index 526c105..f66fe83 100644 --- a/Makefile +++ b/Makefile @@ -1,13 +1,15 @@ +TERRAFORM_DOCS_VERSION ?= 0.20.0 +TERRAFORM_TF_LINT_VERSION ?= 0.59.1 +TERRAFORM_TF_SEC_VERSION ?= 1.28.14 -.PHONY: lint tfscan generate-docs +.PHONY: lint tfsec generate-docs lint: - docker run --rm -v $${PWD}:/data -t ghcr.io/terraform-linters/tflint --var-file=/data/examples/test.tfvars - + docker run --rm -v $${PWD}:/data -t ghcr.io/terraform-linters/tflint:v$(TERRAFORM_TF_LINT_VERSION) --var-file=/data/examples/test.tfvars tfsec: - docker run --rm -it -v "$$(pwd):/src" aquasec/tfsec /src --tfvars-file=/src/examples/test.tfvars + docker run --rm -it -v "$$(pwd):/src" aquasec/tfsec:v$(TERRAFORM_TF_SEC_VERSION) /src --tfvars-file=/src/examples/test.tfvars generate-docs: lint docker run --rm -u $$(id -u) \ --volume "$(PWD):/terraform-docs" \ -w /terraform-docs \ - quay.io/terraform-docs/terraform-docs:0.16.0 markdown table --config .terraform-docs.yml --output-file README.md --output-mode inject . + quay.io/terraform-docs/terraform-docs:$(TERRAFORM_DOCS_VERSION) markdown table --config .terraform-docs.yml --output-file README.md --output-mode inject . 
diff --git a/README.md b/README.md index 2601d1d..18b825f 100644 --- a/README.md +++ b/README.md @@ -5,10 +5,16 @@ This module creates a set of monitoring alerts for Google Cloud Platform service Supported services: - Cloud SQL + - CPU usage - Storage usage - Memory usage +- Kyverno + + - Error logs for admission-controller, background-controller, cleanup-controller, reports-controller + - Metric threshold (optional) + ## Providers @@ -27,10 +33,10 @@ Supported services: | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| -| [auto\_close](#input\_auto\_close) | n/a | `string` | `"86400s"` | no | -| [cloud\_sql](#input\_cloud\_sql) | n/a |
object({
project = optional(string, null)
auto_close = optional(string, null)
notification_channels = optional(list(string), [])
instances = optional(map(object({
cpu_utilization = optional(list(object({
severity = optional(string, "WARNING"),
threshold = optional(number, 0.90)
alignment_period = optional(string, "120s")
duration = optional(string, "300s")
})), [
{
threshold = 0.85,
duration = "1200s",
},
{
severity = "CRITICAL",
threshold = 1,
duration = "300s",
alignment_period = "60s",
}
])
memory_utilization = optional(list(object({
severity = optional(string, "WARNING"),
threshold = optional(number, 0.90)
alignment_period = optional(string, "300s")
duration = optional(string, "300s")
})), [
{
severity = "WARNING",
},
{
severity = "CRITICAL",
threshold = 0.95,
}
])
disk_utilization = optional(list(object({
severity = optional(string, "WARNING"),
threshold = optional(number, 0.85)
alignment_period = optional(string, "300s")
duration = optional(string, "600s")
})), [
{
severity = "WARNING",
},
{
severity = "CRITICAL",
threshold = 0.95,
}
])
})), {})
})
| n/a | yes | -| [notification\_channels](#input\_notification\_channels) | n/a | `list(string)` | `[]` | no | -| [project](#input\_project) | n/a | `string` | `null` | no | +| [cloud\_sql](#input\_cloud\_sql) | Configuration for Cloud SQL monitoring alerts. Supports customization of project, auto-close timing, notification channels, and per-instance alert thresholds for CPU, memory, and disk utilization. |
object({
project_id = optional(string, null)
auto_close = optional(string, "86400s") # default 24h
notification_enabled = optional(bool, true)
notification_channels = optional(list(string), [])
instances = optional(map(object({
cpu_utilization = optional(list(object({
severity = optional(string, "WARNING"),
threshold = optional(number, 0.90)
alignment_period = optional(string, "120s")
duration = optional(string, "300s")
})), [
{
threshold = 0.85,
duration = "1200s",
},
{
severity = "CRITICAL",
threshold = 1,
duration = "300s",
alignment_period = "60s",
}
])
memory_utilization = optional(list(object({
severity = optional(string, "WARNING"),
threshold = optional(number, 0.90)
alignment_period = optional(string, "300s")
duration = optional(string, "300s")
})), [
{
severity = "WARNING",
},
{
severity = "CRITICAL",
threshold = 0.95,
}
])
disk_utilization = optional(list(object({
severity = optional(string, "WARNING"),
threshold = optional(number, 0.85)
alignment_period = optional(string, "300s")
duration = optional(string, "600s")
})), [
{
severity = "WARNING",
},
{
severity = "CRITICAL",
threshold = 0.95,
}
])
})), {})
})
| n/a | yes | +| [kyverno](#input\_kyverno) | Configuration for Kyverno monitoring alerts. Allows customization of cluster name, project, notification channels, log-match notification rate limit, alert documentation, auto-close timing, enablement, extra filters, and namespace. |
object({
enabled = optional(bool, true)
cluster_name = string
project_id = optional(string, null)
notification_enabled = optional(bool, true)
notification_channels = optional(list(string), [])
# Rate limit for notifications, e.g. "300s" for 5 minutes, used only for log match alerts
logmatch_notification_rate_limit = optional(string, "300s")
alert_documentation = optional(string, null)
auto_close_seconds = optional(number, 3600)
filter_extra = optional(string, "")
namespace = optional(string, "kyverno")
})
| n/a | yes | +| [notification\_channels](#input\_notification\_channels) | List of notification channel IDs to notify when an alert is triggered | `list(string)` | `[]` | no | +| [project\_id](#input\_project\_id) | The default Google Cloud project ID of the monitored resources. Can be overridden per service with `cloud_sql.project_id` or `kyverno.project_id`. | `string` | n/a | yes | ## Outputs @@ -47,10 +53,10 @@ Supported services: | [google_monitoring_alert_policy.cloud_sql_cpu_utilization](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/monitoring_alert_policy) | resource | | [google_monitoring_alert_policy.cloud_sql_disk_utilization](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/monitoring_alert_policy) | resource | | [google_monitoring_alert_policy.cloud_sql_memory_utilization](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/monitoring_alert_policy) | resource | +| [google_monitoring_alert_policy.kyverno_logmatch_alert](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/monitoring_alert_policy) | resource | ## Modules No modules. - diff --git a/cloud-sql.tf b/cloud-sql.tf index b1568d6..0b92afb 100644 --- a/cloud-sql.tf +++ b/cloud-sql.tf @@ -3,13 +3,10 @@ # ---------------------- locals { # Use the cloud_sql project if specified, otherwise use the project. - cloud_sql_project = var.cloud_sql.project != null ? var.cloud_sql.project : var.project + cloud_sql_project = var.cloud_sql.project_id != null ? var.cloud_sql.project_id : var.project_id # Use the cloud_sql notification channels for if not specified in the configuration. - cloud_sql_notification_channels = length(var.cloud_sql.notification_channels) > 0 ? var.cloud_sql.notification_channels : var.notification_channels - - # Use the cloud_sql auto_close if specified, otherwise use the auto_close. - cloud_sql_auto_close = var.cloud_sql.auto_close != null ? 
var.cloud_sql.auto_close : var.auto_close + cloud_sql_notification_channels = var.cloud_sql.notification_enabled ? (length(var.cloud_sql.notification_channels) > 0 ? var.cloud_sql.notification_channels : var.notification_channels) : [] cloud_sql_cpu_utilization = { for item in flatten( @@ -22,7 +19,7 @@ locals { }, cpu_utilization ) - ] + ] ] ) : "${item.instance}--${item.severity}--${item.threshold}" => item } @@ -38,10 +35,10 @@ locals { }, memory_utilization ) - ] + ] ] ) : "${item.instance}--${item.severity}--${item.threshold}" => item - } + } cloud_sql_disk_utilization = { for item in flatten( @@ -54,10 +51,10 @@ locals { }, disk_utilization ) - ] + ] ] ) : "${item.instance}--${item.severity}--${item.threshold}" => item - } + } } # ---------------------- @@ -67,7 +64,7 @@ resource "google_monitoring_alert_policy" "cloud_sql_cpu_utilization" { for_each = local.cloud_sql_cpu_utilization display_name = "${local.cloud_sql_project} ${each.value.instance} - CPU utilization ${each.value.severity} ${each.value.threshold * 100}%" - combiner = "OR" + combiner = "OR" severity = each.value.severity conditions { @@ -87,7 +84,7 @@ resource "google_monitoring_alert_policy" "cloud_sql_cpu_utilization" { display_name = "${local.cloud_sql_project} ${each.value.instance} - CPU utilization ${each.value.severity} ${each.value.threshold * 100}%" } alert_strategy { - auto_close = local.cloud_sql_auto_close + auto_close = var.cloud_sql.auto_close } notification_channels = local.cloud_sql_notification_channels } @@ -117,7 +114,7 @@ resource "google_monitoring_alert_policy" "cloud_sql_memory_utilization" { } alert_strategy { - auto_close = local.cloud_sql_auto_close + auto_close = var.cloud_sql.auto_close } notification_channels = local.cloud_sql_notification_channels @@ -149,7 +146,7 @@ resource "google_monitoring_alert_policy" "cloud_sql_disk_utilization" { } alert_strategy { - auto_close = local.cloud_sql_auto_close + auto_close = var.cloud_sql.auto_close } notification_channels = 
local.cloud_sql_notification_channels } diff --git a/examples/main.tf b/examples/main.tf index c2ceda7..b91edef 100644 --- a/examples/main.tf +++ b/examples/main.tf @@ -4,12 +4,12 @@ locals { - # Enable all Cdoud SQL monitorings on selected instances, eg. + # Enable all Cloud SQL monitoring on selected instances, e.g. - cloud_sql = { - instances = { - (google_sql_database_instance.master.name) = {} + cloud_sql = { + instances = { + (google_sql_database_instance.master.name) = {} (google_sql_database_instance.stage.name) = {} - } - } + } + } # Use custom Cloud SQL cpu monitoring on google_sql_database_instance.master.name # Use all default Cloud SQL monitoring on google_sql_database_instance.stage.name @@ -35,7 +35,7 @@ locals { # cloud_sql = { # instances = { # (google_sql_database_instance.master.stage) = { cpu_utilization = [] } - # (google_sql_database_instance.master.prod) = {} + # (google_sql_database_instance.master.prod) = {} # } # } @@ -46,6 +46,14 @@ module "example" { version = ">= 0.1.0" notification_channels = var.notification_channels - project = var.project - cloud_sql = local.cloud_sql + project_id = var.project_id + cloud_sql = local.cloud_sql + kyverno = { + cluster_name = "test-cluster" + enabled = true + notification_channels = [] + # Optional filter for log entries, exclude known non-actionable messages + # e.g., "-textPayload:\"stale GroupVersion discovery: metrics.k8s.io/v1beta1\"" + filter_extra = "-textPayload:\"stale GroupVersion discovery: metrics.k8s.io/v1beta1\"" + } } diff --git a/examples/test.tfvars b/examples/test.tfvars index cd2a684..db7e868 100644 --- a/examples/test.tfvars +++ b/examples/test.tfvars @@ -1,5 +1,4 @@ -project = "Simple project" - +project_id = "simple-project" notification_channels = [ "cloud_support_email", "slack-channel" diff --git a/examples/variables.tf b/examples/variables.tf index aad9089..2a09651 100644 --- a/examples/variables.tf +++ b/examples/variables.tf @@ -1,10 +1,28 @@ -variable "project" { - type = string 
- default = "" +variable "project_id" { + description = "The default Google Cloud project ID of the monitored resources. Can be overridden per service with `cloud_sql.project_id` or `kyverno.project_id`." + type = string } variable "notification_channels" { - type = list(string) - default = [] + description = "List of notification channel IDs to notify when an alert is triggered" + type = list(string) + default = [] +} + +variable "kyverno" { + description = "Configuration for Kyverno monitoring alerts. Allows customization of cluster name, project, notification channels, log-match notification rate limit, alert documentation, auto-close timing, enablement, extra filters, and namespace." + type = object({ + enabled = optional(bool, true) + project_id = optional(string, null) + cluster_name = string + namespace = optional(string, "kyverno") + notification_enabled = optional(bool, true) + notification_channels = optional(list(string), []) + alert_documentation = optional(string, null) + # Rate limit for notifications, e.g. "300s" for 5 minutes, used only for log match alerts + logmatch_notification_rate_limit = optional(string, "300s") + auto_close_seconds = optional(number, 3600) + filter_extra = optional(string, "") + }) } diff --git a/kyverno_log_alert.tf b/kyverno_log_alert.tf new file mode 100644 index 0000000..048a1b4 --- /dev/null +++ b/kyverno_log_alert.tf @@ -0,0 +1,50 @@ +locals { + kyverno_project_id = var.kyverno.project_id != null ? var.kyverno.project_id : var.project_id + alert_documentation = var.kyverno.alert_documentation != null ? var.kyverno.alert_documentation : "Kyverno controllers produced ERROR logs in namespace ${var.kyverno.namespace}." + kyverno_notification_channels = var.kyverno.notification_enabled ? (length(var.kyverno.notification_channels) > 0 ? 
var.kyverno.notification_channels : var.notification_channels) : [] + + kyverno_log_filter = <<-EOT + resource.type="k8s_container" + resource.labels.project_id="${local.kyverno_project_id}" + resource.labels.cluster_name="${var.kyverno.cluster_name}" + resource.labels.namespace_name="${var.kyverno.namespace}" + severity>=ERROR + ( + labels."k8s-pod/app_kubernetes_io/component"=~"(admission-controller|background-controller|cleanup-controller|reports-controller)" + OR resource.labels.pod_name=~"kyverno-(admission|background|cleanup|reports)-controller-.*" + ) + ${trimspace(var.kyverno.filter_extra)} + EOT +} + +resource "google_monitoring_alert_policy" "kyverno_logmatch_alert" { + count = ( + var.kyverno.enabled + && trimspace(var.kyverno.cluster_name) != "" + ) ? 1 : 0 + + display_name = "Kyverno controllers ERROR logs (namespace=${var.kyverno.namespace})" + combiner = "OR" + enabled = var.kyverno.enabled + + conditions { + display_name = "Kyverno ERROR in logs" + condition_matched_log { + filter = local.kyverno_log_filter + } + } + + documentation { + content = local.alert_documentation + mime_type = "text/markdown" + } + + notification_channels = local.kyverno_notification_channels + + alert_strategy { + auto_close = "${var.kyverno.auto_close_seconds}s" + notification_rate_limit { + period = var.kyverno.logmatch_notification_rate_limit + } + } +} diff --git a/main.tf b/main.tf index e69de29..8b13789 100644 --- a/main.tf +++ b/main.tf @@ -0,0 +1 @@ + diff --git a/variables.tf b/variables.tf index 98d4da0..14a0392 100644 --- a/variables.tf +++ b/variables.tf @@ -1,22 +1,20 @@ -variable "project" { - type = string - default = null +variable "project_id" { + description = "The default Google Cloud project ID of the monitored resources. Can be overridden per service with `cloud_sql.project_id` or `kyverno.project_id`." + type = string } variable "notification_channels" { - type = list(string) - default = [] -} - -variable "auto_close" { - type = string - default = "86400s" # 24h + description = "List of notification channel IDs to notify when 
an alert is triggered" + type = list(string) + default = [] } variable "cloud_sql" { + description = "Configuration for Cloud SQL monitoring alerts. Supports customization of project, auto-close timing, notification channels, and per-instance alert thresholds for CPU, memory, and disk utilization." type = object({ - project = optional(string, null) - auto_close = optional(string, null) + project_id = optional(string, null) + auto_close = optional(string, "86400s") # default 24h + notification_enabled = optional(bool, true) notification_channels = optional(list(string), []) instances = optional(map(object({ cpu_utilization = optional(list(object({ @@ -30,9 +28,9 @@ variable "cloud_sql" { duration = "1200s", }, { - severity = "CRITICAL", - threshold = 1, - duration = "300s", + severity = "CRITICAL", + threshold = 1, + duration = "300s", alignment_period = "60s", } ]) @@ -43,7 +41,7 @@ variable "cloud_sql" { duration = optional(string, "300s") })), [ { - severity = "WARNING", + severity = "WARNING", }, { severity = "CRITICAL", @@ -57,13 +55,30 @@ variable "cloud_sql" { duration = optional(string, "600s") })), [ { - severity = "WARNING", + severity = "WARNING", }, { severity = "CRITICAL", - threshold = 0.95, + threshold = 0.95, } ]) })), {}) }) } + +variable "kyverno" { + description = "Configuration for Kyverno monitoring alerts. Allows customization of cluster name, project, notification channels, log-match notification rate limit, alert documentation, auto-close timing, enablement, extra filters, and namespace." + type = object({ + enabled = optional(bool, true) + cluster_name = string + project_id = optional(string, null) + notification_enabled = optional(bool, true) + notification_channels = optional(list(string), []) + # Rate limit for notifications, e.g. 
"300s" for 5 minutes, used only for log match alerts + logmatch_notification_rate_limit = optional(string, "300s") + alert_documentation = optional(string, null) + auto_close_seconds = optional(number, 3600) + filter_extra = optional(string, "") + namespace = optional(string, "kyverno") + }) +}