Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,13 @@ to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]

## [0.3.0] - 2025-10-07

[Compare with previous version](https://github.com/sparkfabrik/terraform-google-services-monitoring/compare/0.2.0...0.3.0)

### Changed

- Add kyverno alert log.
- Update module documentation.

## [0.2.0] - 2024-10-17
Expand Down
10 changes: 6 additions & 4 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
TERRAFORM_DOCS_VERSION ?= 0.20.0
TERRAFORM_TF_LINT_VERSION ?= 0.59.1
TERRAFORM_TF_SEC_VERSION ?= 1.28.14
.PHONY: lint tfscan generate-docs

lint:
docker run --rm -v $${PWD}:/data -t ghcr.io/terraform-linters/tflint --var-file=/data/examples/test.tfvars

docker run --rm -v $${PWD}:/data -t ghcr.io/terraform-linters/tflint:v$(TERRAFORM_TF_LINT_VERSION) --var-file=/data/examples/test.tfvars
tfsec:
docker run --rm -it -v "$$(pwd):/src" aquasec/tfsec /src --tfvars-file=/src/examples/test.tfvars
docker run --rm -it -v "$$(pwd):/src" aquasec/tfsec:v$(TERRAFORM_TF_SEC_VERSION) /src --tfvars-file=/src/examples/test.tfvars

generate-docs: lint
docker run --rm -u $$(id -u) \
--volume "$(PWD):/terraform-docs" \
-w /terraform-docs \
quay.io/terraform-docs/terraform-docs:0.16.0 markdown table --config .terraform-docs.yml --output-file README.md --output-mode inject .
quay.io/terraform-docs/terraform-docs:$(TERRAFORM_DOCS_VERSION) markdown table --config .terraform-docs.yml --output-file README.md --output-mode inject .
16 changes: 11 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,16 @@ This module creates a set of monitoring alerts for Google Cloud Platform service
Supported services:

- Cloud SQL

- CPU usage
- Storage usage
- Memory usage

- Kyverno

- Error logs for admission-controller, background-controller, cleanup-controller, reports-controller
- Metric threshold (optional)

<!-- BEGIN_TF_DOCS -->
## Providers

Expand All @@ -27,10 +33,10 @@ Supported services:

| Name | Description | Type | Default | Required |
|------|-------------|------|---------|:--------:|
| <a name="input_auto_close"></a> [auto\_close](#input\_auto\_close) | n/a | `string` | `"86400s"` | no |
| <a name="input_cloud_sql"></a> [cloud\_sql](#input\_cloud\_sql) | n/a | <pre>object({<br> project = optional(string, null)<br> auto_close = optional(string, null)<br> notification_channels = optional(list(string), [])<br> instances = optional(map(object({<br> cpu_utilization = optional(list(object({<br> severity = optional(string, "WARNING"),<br> threshold = optional(number, 0.90)<br> alignment_period = optional(string, "120s")<br> duration = optional(string, "300s")<br> })), [<br> {<br> threshold = 0.85,<br> duration = "1200s",<br> },<br> {<br> severity = "CRITICAL",<br> threshold = 1,<br> duration = "300s",<br> alignment_period = "60s",<br> }<br> ])<br> memory_utilization = optional(list(object({<br> severity = optional(string, "WARNING"),<br> threshold = optional(number, 0.90)<br> alignment_period = optional(string, "300s")<br> duration = optional(string, "300s")<br> })), [<br> {<br> severity = "WARNING",<br> },<br> {<br> severity = "CRITICAL",<br> threshold = 0.95,<br> }<br> ])<br> disk_utilization = optional(list(object({<br> severity = optional(string, "WARNING"),<br> threshold = optional(number, 0.85)<br> alignment_period = optional(string, "300s")<br> duration = optional(string, "600s")<br> })), [<br> {<br> severity = "WARNING",<br> },<br> {<br> severity = "CRITICAL",<br> threshold = 0.95, <br> }<br> ])<br> })), {})<br> })</pre> | n/a | yes |
| <a name="input_notification_channels"></a> [notification\_channels](#input\_notification\_channels) | n/a | `list(string)` | `[]` | no |
| <a name="input_project"></a> [project](#input\_project) | n/a | `string` | `null` | no |
| <a name="input_cloud_sql"></a> [cloud\_sql](#input\_cloud\_sql) | Configuration for Cloud SQL monitoring alerts. Supports customization of project, auto-close timing, notification channels, and per-instance alert thresholds for CPU, memory, and disk utilization. | <pre>object({<br/> project_id = optional(string, null)<br/> auto_close = optional(string, "86400s") # default 24h<br/> notification_enabled = optional(bool, true)<br/> notification_channels = optional(list(string), [])<br/> instances = optional(map(object({<br/> cpu_utilization = optional(list(object({<br/> severity = optional(string, "WARNING"),<br/> threshold = optional(number, 0.90)<br/> alignment_period = optional(string, "120s")<br/> duration = optional(string, "300s")<br/> })), [<br/> {<br/> threshold = 0.85,<br/> duration = "1200s",<br/> },<br/> {<br/> severity = "CRITICAL",<br/> threshold = 1,<br/> duration = "300s",<br/> alignment_period = "60s",<br/> }<br/> ])<br/> memory_utilization = optional(list(object({<br/> severity = optional(string, "WARNING"),<br/> threshold = optional(number, 0.90)<br/> alignment_period = optional(string, "300s")<br/> duration = optional(string, "300s")<br/> })), [<br/> {<br/> severity = "WARNING",<br/> },<br/> {<br/> severity = "CRITICAL",<br/> threshold = 0.95,<br/> }<br/> ])<br/> disk_utilization = optional(list(object({<br/> severity = optional(string, "WARNING"),<br/> threshold = optional(number, 0.85)<br/> alignment_period = optional(string, "300s")<br/> duration = optional(string, "600s")<br/> })), [<br/> {<br/> severity = "WARNING",<br/> },<br/> {<br/> severity = "CRITICAL",<br/> threshold = 0.95,<br/> }<br/> ])<br/> })), {})<br/> })</pre> | n/a | yes |
| <a name="input_kyverno"></a> [kyverno](#input\_kyverno) | Configuration for Kyverno monitoring alerts. Allows customization of cluster name, project, notification channels, alert documentation, metric thresholds, auto-close timing, enablement, extra filters, and namespace. | <pre>object({<br/> enabled = optional(bool, true)<br/> cluster_name = string<br/> project_id = optional(string, null)<br/> notification_enabled = optional(bool, true)<br/> notification_channels = optional(list(string), [])<br/> # Rate limit for notifications, e.g. "300s" for 5 minutes, used only for log match alerts<br/> logmatch_notification_rate_limit = optional(string, "300s")<br/> alert_documentation = optional(string, null)<br/> auto_close_seconds = optional(number, 3600)<br/> filter_extra = optional(string, "")<br/> namespace = optional(string, "kyverno")<br/> })</pre> | n/a | yes |
| <a name="input_notification_channels"></a> [notification\_channels](#input\_notification\_channels) | List of notification channel IDs to notify when an alert is triggered | `list(string)` | `[]` | no |
| <a name="input_project_id"></a> [project\_id](#input\_project\_id) | The Google Cloud project ID where logging exclusions will be created | `string` | n/a | yes |

## Outputs

Expand All @@ -47,10 +53,10 @@ Supported services:
| [google_monitoring_alert_policy.cloud_sql_cpu_utilization](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/monitoring_alert_policy) | resource |
| [google_monitoring_alert_policy.cloud_sql_disk_utilization](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/monitoring_alert_policy) | resource |
| [google_monitoring_alert_policy.cloud_sql_memory_utilization](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/monitoring_alert_policy) | resource |
| [google_monitoring_alert_policy.kyverno_logmatch_alert](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/monitoring_alert_policy) | resource |

## Modules

No modules.


<!-- END_TF_DOCS -->
25 changes: 11 additions & 14 deletions cloud-sql.tf
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,10 @@
# ----------------------
locals {
# Use the cloud_sql project if specified, otherwise use the project.
cloud_sql_project = var.cloud_sql.project != null ? var.cloud_sql.project : var.project
cloud_sql_project = var.cloud_sql.project_id != null ? var.cloud_sql.project_id : var.project_id

# Use the cloud_sql notification channels if specified, otherwise fall back to the module-level notification channels.
cloud_sql_notification_channels = length(var.cloud_sql.notification_channels) > 0 ? var.cloud_sql.notification_channels : var.notification_channels

# Use the cloud_sql auto_close if specified, otherwise use the auto_close.
cloud_sql_auto_close = var.cloud_sql.auto_close != null ? var.cloud_sql.auto_close : var.auto_close
cloud_sql_notification_channels = var.cloud_sql.notification_enabled ? (length(var.cloud_sql.notification_channels) > 0 ? var.cloud_sql.notification_channels : var.notification_channels) : []

cloud_sql_cpu_utilization = {
for item in flatten(
Expand All @@ -22,7 +19,7 @@ locals {
},
cpu_utilization
)
]
]
]
) : "${item.instance}--${item.severity}--${item.threshold}" => item
}
Expand All @@ -38,10 +35,10 @@ locals {
},
memory_utilization
)
]
]
]
) : "${item.instance}--${item.severity}--${item.threshold}" => item
}
}

cloud_sql_disk_utilization = {
for item in flatten(
Expand All @@ -54,10 +51,10 @@ locals {
},
disk_utilization
)
]
]
]
) : "${item.instance}--${item.severity}--${item.threshold}" => item
}
}
}

# ----------------------
Expand All @@ -67,7 +64,7 @@ resource "google_monitoring_alert_policy" "cloud_sql_cpu_utilization" {
for_each = local.cloud_sql_cpu_utilization

display_name = "${local.cloud_sql_project} ${each.value.instance} - CPU utilization ${each.value.severity} ${each.value.threshold * 100}%"
combiner = "OR"
combiner = "OR"
severity = each.value.severity

conditions {
Expand All @@ -87,7 +84,7 @@ resource "google_monitoring_alert_policy" "cloud_sql_cpu_utilization" {
display_name = "${local.cloud_sql_project} ${each.value.instance} - CPU utilization ${each.value.severity} ${each.value.threshold * 100}%"
}
alert_strategy {
auto_close = local.cloud_sql_auto_close
auto_close = var.cloud_sql.auto_close
}
notification_channels = local.cloud_sql_notification_channels
}
Expand Down Expand Up @@ -117,7 +114,7 @@ resource "google_monitoring_alert_policy" "cloud_sql_memory_utilization" {
}

alert_strategy {
auto_close = local.cloud_sql_auto_close
auto_close = var.cloud_sql.auto_close
}

notification_channels = local.cloud_sql_notification_channels
Expand Down Expand Up @@ -149,7 +146,7 @@ resource "google_monitoring_alert_policy" "cloud_sql_disk_utilization" {
}

alert_strategy {
auto_close = local.cloud_sql_auto_close
auto_close = var.cloud_sql.auto_close
}
notification_channels = local.cloud_sql_notification_channels
}
26 changes: 18 additions & 8 deletions examples/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,12 @@

locals {
# Enable all Cloud SQL monitoring on selected instances, e.g.
cloud_sql = {
instances = {
(google_sql_database_instance.master.name) = {}
cloud_sql = {
instances = {
(google_sql_database_instance.master.name) = {}
(google_sql_database_instance.stage.name) = {}
}
}
}
}

# Use custom Cloud SQL cpu monitoring on google_sql_database_instance.master.name
# Use all default Cloud SQL monitoring on google_sql_database_instance.stage.name
Expand All @@ -35,7 +35,7 @@ locals {
# cloud_sql = {
# instances = {
# (google_sql_database_instance.master.stage) = { cpu_utilization = [] }
# (google_sql_database_instance.master.prod) = {}
# (google_sql_database_instance.master.prod) = {}
# }
# }

Expand All @@ -46,6 +46,16 @@ module "example" {
version = ">= 0.1.0"

notification_channels = var.notification_channels
project = var.project
cloud_sql = local.cloud_sql
project_id = var.project_id
cloud_sql = local.cloud_sql
kyverno = {
cluster_name = "test-cluster"
enabled = true
use_metric_threshold = true
metric_threshold_count = 5
notification_channels = []
# Optional filter for log entries, exclude known non-actionable messages
# e.g., "-textPayload:\"stale GroupVersion discovery: metrics.k8s.io/v1beta1\""
filter_extra = "-textPayload:\"stale GroupVersion discovery: metrics.k8s.io/v1beta1\""
}
}
3 changes: 1 addition & 2 deletions examples/test.tfvars
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
project = "Simple project"

project_id = "simple-project"
notification_channels = [
"cloud_support_email",
"slack-channel"
Expand Down
28 changes: 23 additions & 5 deletions examples/variables.tf
Original file line number Diff line number Diff line change
@@ -1,10 +1,28 @@

variable "project" {
type = string
default = ""
variable "project_id" {
description = "The Google Cloud project ID where logging exclusions will be created"
type = string
}

variable "notification_channels" {
type = list(string)
default = []
description = "List of notification channel IDs to notify when an alert is triggered"
type = list(string)
default = []
}

# Example-level input describing the Kyverno alert configuration.
# cluster_name is the only attribute without a default; every other attribute is optional.
variable "kyverno" {
description = "Configuration for Kyverno monitoring alerts. Allows customization of cluster name, project, notification channels, alert documentation, metric thresholds, auto-close timing, enablement, extra filters, and namespace."
type = object({
enabled = optional(bool, true)
project_id = optional(string, null)
cluster_name = string
namespace = optional(string, "kyverno")
notification_enabled = optional(bool, true)
notification_channels = optional(list(string), [])
alert_documentation = optional(string, null)
metric_threshold_count = optional(number, 2)
metric_lookback_minutes = optional(number, 1)
auto_close_seconds = optional(number, 3600)
filter_extra = optional(string, "")
})
}
50 changes: 50 additions & 0 deletions kyverno_log_alert.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# Derived values used by the Kyverno log-match alert policy.
locals {
# Project to filter on: the Kyverno-specific project_id when set, otherwise the module-level project_id.
kyverno_project_id = var.kyverno.project_id != null ? var.kyverno.project_id : var.project_id
# Alert documentation body: the caller-supplied text, or a default sentence naming the namespace.
alert_documentation = var.kyverno.alert_documentation != null ? var.kyverno.alert_documentation : "Kyverno controllers produced ERROR logs in namespace ${var.kyverno.namespace}."
# Channels to notify: empty when notification_enabled is false; otherwise the Kyverno-specific
# list when it is non-empty, else the module-level notification_channels.
kyverno_notification_channels = var.kyverno.notification_enabled ? (length(var.kyverno.notification_channels) > 0 ? var.kyverno.notification_channels : var.notification_channels) : []

# Log filter: entries of severity ERROR or higher from k8s_container resources in the
# configured project, cluster and namespace, matched either by the component label or by
# pod name. The whitespace-trimmed filter_extra is appended as the final line.
kyverno_log_filter = <<-EOT
resource.type="k8s_container"
resource.labels.project_id="${local.kyverno_project_id}"
resource.labels.cluster_name="${var.kyverno.cluster_name}"
resource.labels.namespace_name="${var.kyverno.namespace}"
severity>=ERROR
(
labels."k8s-pod/app_kubernetes_io/component"=~"(admission-controller|background-controller|cleanup-controller|reports-controller)"
OR resource.labels.pod_name=~"kyverno-(admission|background|cleanup|reports)-controller-.*"
)
${trimspace(var.kyverno.filter_extra)}
EOT
}

# Log-match alert policy that fires on entries matching local.kyverno_log_filter.
resource "google_monitoring_alert_policy" "kyverno_logmatch_alert" {
# Create the policy only when Kyverno alerting is enabled and cluster_name is not blank.
count = (
var.kyverno.enabled
&& trimspace(var.kyverno.cluster_name) != ""
) ? 1 : 0

display_name = "Kyverno controllers ERROR logs (namespace=${var.kyverno.namespace})"
combiner = "OR"
# NOTE(review): count is already 0 when var.kyverno.enabled is false, so this is true
# whenever the resource exists.
enabled = var.kyverno.enabled

conditions {
display_name = "Kyverno ERROR in logs"
condition_matched_log {
filter = local.kyverno_log_filter
}
}

documentation {
content = local.alert_documentation
mime_type = "text/markdown"
}

notification_channels = local.kyverno_notification_channels

alert_strategy {
# auto_close_seconds is declared as a number; the "s" suffix is appended here to form the duration string.
auto_close = "${var.kyverno.auto_close_seconds}s"
notification_rate_limit {
period = var.kyverno.logmatch_notification_rate_limit
}
}
}
1 change: 1 addition & 0 deletions main.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@

51 changes: 33 additions & 18 deletions variables.tf
Original file line number Diff line number Diff line change
@@ -1,22 +1,20 @@
variable "project" {
type = string
default = null
variable "project_id" {
description = "The Google Cloud project ID where logging exclusions will be created"
type = string
}

variable "notification_channels" {
type = list(string)
default = []
}

variable "auto_close" {
type = string
default = "86400s" # 24h
description = "List of notification channel IDs to notify when an alert is triggered"
type = list(string)
default = []
}

variable "cloud_sql" {
description = "Configuration for Cloud SQL monitoring alerts. Supports customization of project, auto-close timing, notification channels, and per-instance alert thresholds for CPU, memory, and disk utilization."
type = object({
project = optional(string, null)
auto_close = optional(string, null)
project_id = optional(string, null)
auto_close = optional(string, "86400s") # default 24h
notification_enabled = optional(bool, true)
notification_channels = optional(list(string), [])
instances = optional(map(object({
cpu_utilization = optional(list(object({
Expand All @@ -30,9 +28,9 @@ variable "cloud_sql" {
duration = "1200s",
},
{
severity = "CRITICAL",
threshold = 1,
duration = "300s",
severity = "CRITICAL",
threshold = 1,
duration = "300s",
alignment_period = "60s",
}
])
Expand All @@ -43,7 +41,7 @@ variable "cloud_sql" {
duration = optional(string, "300s")
})), [
{
severity = "WARNING",
severity = "WARNING",
},
{
severity = "CRITICAL",
Expand All @@ -57,13 +55,30 @@ variable "cloud_sql" {
duration = optional(string, "600s")
})), [
{
severity = "WARNING",
severity = "WARNING",
},
{
severity = "CRITICAL",
threshold = 0.95,
threshold = 0.95,
}
])
})), {})
})
}

# Module input configuring the Kyverno alert.
# cluster_name is the only attribute without a default; every other attribute is optional.
# NOTE(review): the description mentions "metric thresholds", but this object type declares
# no threshold attributes — confirm whether the description or the type should change.
variable "kyverno" {
description = "Configuration for Kyverno monitoring alerts. Allows customization of cluster name, project, notification channels, alert documentation, metric thresholds, auto-close timing, enablement, extra filters, and namespace."
type = object({
enabled = optional(bool, true)
cluster_name = string
project_id = optional(string, null)
notification_enabled = optional(bool, true)
notification_channels = optional(list(string), [])
# Rate limit for notifications, e.g. "300s" for 5 minutes, used only for log match alerts
logmatch_notification_rate_limit = optional(string, "300s")
alert_documentation = optional(string, null)
auto_close_seconds = optional(number, 3600)
filter_extra = optional(string, "")
namespace = optional(string, "kyverno")
})
}