From 9034719997b9c77c7cbf4ac2220327db191921a4 Mon Sep 17 00:00:00 2001 From: FabrizioCafolla Date: Tue, 27 Jan 2026 16:48:39 +0100 Subject: [PATCH 1/5] refs platform/board#4082: remove module dependencies --- cert_manager.tf | 5 +++-- kyverno.tf | 5 +++-- variables.tf | 15 ++++++++++----- 3 files changed, 16 insertions(+), 9 deletions(-) diff --git a/cert_manager.tf b/cert_manager.tf index a27df28..58f53f0 100644 --- a/cert_manager.tf +++ b/cert_manager.tf @@ -9,7 +9,7 @@ locals { ) cert_manager_notification_channels = var.cert_manager.notification_enabled ? (length(var.cert_manager.notification_channels) > 0 ? var.cert_manager.notification_channels : var.notification_channels) : [] - cert_manager_log_filter = <<-EOT + cert_manager_log_filter = var.cert_manager.cluster_name != null ? (<<-EOT ( ( resource.type="k8s_container" @@ -34,12 +34,13 @@ locals { ) ${trimspace(var.cert_manager.filter_extra)} EOT + ) : "" } resource "google_monitoring_alert_policy" "cert_manager_logmatch_alert" { count = ( var.cert_manager.enabled - && trimspace(var.cert_manager.cluster_name) != "" + && try(var.cert_manager.cluster_name, "") != "" && var.cert_manager.cluster_name != null ) ? 1 : 0 diff --git a/kyverno.tf b/kyverno.tf index 462950d..50774ba 100644 --- a/kyverno.tf +++ b/kyverno.tf @@ -3,7 +3,7 @@ locals { alert_documentation = var.kyverno.alert_documentation != null ? var.kyverno.alert_documentation : "Kyverno controllers produced ERROR logs in namespace ${var.kyverno.namespace}." kyverno_notification_channels = var.kyverno.notification_enabled ? (length(var.kyverno.notification_channels) > 0 ? var.kyverno.notification_channels : var.notification_channels) : [] - kyverno_log_filter = <<-EOT + kyverno_log_filter = var.kyverno.cluster_name != null ? (<<-EOT resource.type="k8s_container" AND resource.labels.project_id="${local.kyverno_project_id}" AND resource.labels.cluster_name="${var.kyverno.cluster_name}" @@ -48,12 +48,13 @@ locals { ) ${trimspace(var.kyverno.filter_extra)} EOT + ) : "" } resource "google_monitoring_alert_policy" "kyverno_logmatch_alert" { count = ( var.kyverno.enabled - && trimspace(var.kyverno.cluster_name) != "" + && try(var.kyverno.cluster_name, "") != "" && var.kyverno.cluster_name != null ) ? 1 : 0 diff --git a/variables.tf b/variables.tf index d5ca40a..be04281 100644 --- a/variables.tf +++ b/variables.tf @@ -11,7 +11,9 @@ variable "notification_channels" { variable "cloud_sql" { description = "Configuration for Cloud SQL monitoring alerts. Supports customization of project, auto-close timing, notification channels, and per-instance alert thresholds for CPU, memory, and disk utilization." + default = {} type = object({ + enabled = optional(bool, true) project_id = optional(string, null) auto_close = optional(string, "86400s") # default 24h notification_enabled = optional(bool, true) @@ -68,9 +70,10 @@ variable "cloud_sql" { variable "kyverno" { description = "Configuration for Kyverno monitoring alerts. Allows customization of cluster name, project, notification channels, alert documentation, metric thresholds, auto-close timing, enablement, extra filters, and namespace." + default = {} type = object({ - enabled = optional(bool, true) - cluster_name = string + enabled = optional(bool, false) + cluster_name = optional(string, null) project_id = optional(string, null) notification_enabled = optional(bool, true) notification_channels = optional(list(string), []) @@ -85,9 +88,10 @@ variable "kyverno" { variable "cert_manager" { description = "Configuration for cert-manager missing issuer log alert. Allows customization of project, cluster, namespace, notification channels, alert documentation, enablement, extra filters, auto-close timing, and notification rate limiting." + default = {} type = object({ - enabled = optional(bool, true) - cluster_name = string + enabled = optional(bool, false) + cluster_name = optional(string, null) project_id = optional(string, null) namespace = optional(string, "cert-manager") notification_enabled = optional(bool, true) @@ -101,8 +105,9 @@ variable "cert_manager" { variable "konnectivity_agent" { description = "Configuration for Konnectivity agent deployment replica alert in GKE. Triggers when there are no available replicas." + default = {} type = object({ - enabled = optional(bool, true) + enabled = optional(bool, false) cluster_name = optional(string, null) project_id = optional(string, null) namespace = optional(string, "kube-system") From a1e18e3c1654d8721c63ee19463e866f456bc3cb Mon Sep 17 00:00:00 2001 From: FabrizioCafolla Date: Tue, 27 Jan 2026 17:01:43 +0100 Subject: [PATCH 2/5] feat: remove dependecies --- lite_llm.tf | 4 +- modules/http_monitoring/main.tf | 86 ++++++++++++++++++++++++++++ modules/http_monitoring/variables.tf | 86 ++++++++++++++++++++++++++++ typesense.tf | 4 +- 4 files changed, 176 insertions(+), 4 deletions(-) create mode 100644 modules/http_monitoring/main.tf create mode 100644 modules/http_monitoring/variables.tf diff --git a/lite_llm.tf b/lite_llm.tf index 37638e9..731460f 100644 --- a/lite_llm.tf +++ b/lite_llm.tf @@ -19,8 +19,8 @@ locals { module "litellm_uptime_checks" { for_each = local.litellm_uptime_checks - source = "github.com/sparkfabrik/terraform-sparkfabrik-gcp-http-monitoring?ref=1.0.0" - gcp_project = local.litellm_project + source = "./modules/http_monitoring" + gcp_project_id = local.litellm_project uptime_monitoring_host = each.value.host uptime_monitoring_path = each.value.path alert_notification_channels = local.litellm_notification_channels diff --git a/modules/http_monitoring/main.tf b/modules/http_monitoring/main.tf new file mode 100644 index 0000000..f3b9eee --- /dev/null +++ b/modules/http_monitoring/main.tf @@ -0,0 +1,86 @@ +locals { + suffix = var.uptime_monitoring_path != "/" ? var.uptime_monitoring_path : "" + uptime_monitoring_display_name = var.uptime_monitoring_display_name != "" ? "${var.uptime_monitoring_display_name} - ${var.uptime_monitoring_host}${local.suffix}" : "${var.uptime_monitoring_host}${local.suffix}" + alert_display_name = var.alert_display_name != "" ? var.alert_display_name : "Failure of uptime check for: ${local.uptime_monitoring_display_name}" +} + +resource "google_monitoring_uptime_check_config" "https_uptime" { + display_name = local.uptime_monitoring_display_name + timeout = var.uptime_check_timeout + period = var.uptime_check_period + selected_regions = var.uptime_check_regions + + http_check { + path = var.uptime_monitoring_path + port = "443" + use_ssl = true + validate_ssl = true + headers = var.uptime_monitoring_headers + + dynamic "accepted_response_status_codes" { + for_each = var.accepted_response_status_values + + content { + status_value = accepted_response_status_codes.value + } + } + + dynamic "accepted_response_status_codes" { + for_each = var.accepted_response_status_classes + + content { + status_class = accepted_response_status_codes.value + } + } + } + + monitored_resource { + type = "uptime_url" + labels = { + project_id = var.gcp_project_id + host = var.uptime_monitoring_host + } + } + + project = var.gcp_project_id + + lifecycle { + create_before_destroy = true + } +} + +# ------------- +# Alerts policy +# ------------- +resource "google_monitoring_alert_policy" "failure_alert" { + display_name = local.alert_display_name + combiner = "OR" + + conditions { + condition_threshold { + filter = "metric.type=\"monitoring.googleapis.com/uptime_check/check_passed\" AND metric.label.check_id=\"${google_monitoring_uptime_check_config.https_uptime.uptime_check_id}\" AND resource.type=\"uptime_url\"" + comparison = "COMPARISON_LT" + threshold_value = var.alert_threshold_value + duration = var.alert_threshold_duration + trigger { + count = 1 + } + aggregations { + alignment_period = "1200s" + per_series_aligner = "ALIGN_NEXT_OLDER" + cross_series_reducer = "REDUCE_COUNT_TRUE" + group_by_fields = [] + } + } + display_name = local.alert_display_name + } + + user_labels = var.uptime_alert_user_labels + + notification_channels = var.alert_notification_channels + project = var.gcp_project_id + + depends_on = [ + google_monitoring_uptime_check_config.https_uptime + ] +} diff --git a/modules/http_monitoring/variables.tf b/modules/http_monitoring/variables.tf new file mode 100644 index 0000000..80ccc02 --- /dev/null +++ b/modules/http_monitoring/variables.tf @@ -0,0 +1,86 @@ +variable "gcp_project_id" { + type = string + description = "The Google Cloud project ID." +} + +variable "uptime_monitoring_display_name" { + type = string + description = "A human-friendly name for the uptime check configuration. Used for monitoring display_name." + default = "" +} + +variable "uptime_monitoring_path" { + type = string + description = "The path to the page to run the check against." + default = "/" +} + +variable "uptime_check_period" { + type = string + description = "How often, in seconds, the uptime check is performed. Currently, the only supported values are 60s (1 minute), 300s (5 minutes), 600s (10 minutes), and 900s (15 minutes). Defaults to 300s." + default = "60s" +} + +variable "uptime_check_timeout" { + type = string + description = "The maximum amount of time to wait for the request to complete (must be between 1 and 60 seconds)." + default = "10s" +} + +variable "uptime_monitoring_host" { + type = string + description = "A hostname to monitor (without protocol, example: 'www.my-site.com')." +} + +variable "uptime_check_regions" { + type = list(string) + description = "The list of regions from which the check will be run. Some regions contain one location, and others contain more than one. If this field is specified, enough regions to include a minimum of 3 locations must be provided, or an error message is returned. Not specifying this field will result in uptime checks running from all regions." + default = ["USA_VIRGINIA", "EUROPE", "ASIA_PACIFIC"] +} + +variable "uptime_alert_user_labels" { + type = map(string) + description = "This field is intended to be used for labelling the SSL alerts. Labels and values can contain only lowercase letters, numerals, underscores, and dashes. Keys must begin with a letter." + default = {} +} + +variable "uptime_monitoring_headers" { + type = map(string) + description = "A set of key/value header pairs to send in the HTTP request to the URL." + default = {} +} + +variable "alert_threshold_duration" { + type = string + description = "The amount of time that a time series must violate the threshold to be considered failing. Currently, only values that are a multiple of a minute--e.g., 0, 60, 120, or 300 seconds--are supported." + default = "60s" +} + +variable "alert_threshold_value" { + type = number + description = "A value against which to compare the time series." + default = 1 +} + +variable "alert_notification_channels" { + type = list(string) + description = "Identifies the notification channels to which notifications should be sent when incidents are opened or closed. The syntax of the entries in this field is projects/[PROJECT_ID]/notificationChannels/[CHANNEL_ID]" +} + +variable "alert_display_name" { + type = string + description = "A human-friendly name for the alert policy. Used for monitoring display_name." + default = "" +} + +variable "accepted_response_status_values" { + description = "Check will only pass if the HTTP response status code is in this set of status values (combined with the set of status classes)." + type = set(number) + default = [] +} + +variable "accepted_response_status_classes" { + description = "Check will only pass if the HTTP response status code is in this set of status classes (combined with the set of status values). Possible values: STATUS_CLASS_1XX, STATUS_CLASS_2XX, STATUS_CLASS_3XX, STATUS_CLASS_4XX, STATUS_CLASS_5XX, STATUS_CLASS_ANY" + type = set(string) + default = [] +} diff --git a/typesense.tf b/typesense.tf index 286db50..4b28657 100644 --- a/typesense.tf +++ b/typesense.tf @@ -19,8 +19,8 @@ locals { module "typesense_uptime_checks" { for_each = local.typesense_uptime_checks - source = "github.com/sparkfabrik/terraform-sparkfabrik-gcp-http-monitoring?ref=1.0.0" - gcp_project = local.typesense_project + source = "./modules/http_monitoring" + gcp_project_id = local.typesense_project uptime_monitoring_host = each.value.host uptime_monitoring_path = each.value.path alert_notification_channels = local.typesense_notification_channels From f9f50f0630c87a96f761565e5b074f29dbc4e1b4 Mon Sep 17 00:00:00 2001 From: FabrizioCafolla Date: Tue, 27 Jan 2026 17:12:14 +0100 Subject: [PATCH 3/5] update --- CHANGELOG.md | 8 +++++ README.md | 14 ++++----- modules/http_monitoring/README.md | 49 +++++++++++++++++++++++++++++ modules/http_monitoring/versions.tf | 10 ++++++ 4 files changed, 74 insertions(+), 7 deletions(-) create mode 100644 modules/http_monitoring/README.md create mode 100644 modules/http_monitoring/versions.tf diff --git a/CHANGELOG.md b/CHANGELOG.md index 9f2ca2f..d2aa9f3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,14 @@ to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] +## [0.12.0] - 2026-01-28 + +[Compare with previous version](https://github.com/sparkfabrik/terraform-google-services-monitoring/compare/0.11.0...0.12.0) + +### Changed + +- refs platform/board#4071: remove dependecies from [`terraform-sparkfabrik-gcp-http-monitoring`](https://github.com/sparkfabrik/terraform-sparkfabrik-gcp-http-monitoring) terraform module. **⚠️ WARN** Disabled monitoring alerts by default for `kyverno`, `cert-manager`, and `konnectivity_agent`, from now on, you must add the explicit value `enabled = true` to activate these alerts. + ## [0.11.0] - 2026-01-14 [Compare with previous version](https://github.com/sparkfabrik/terraform-google-services-monitoring/compare/0.10.0...0.11.0) diff --git a/README.md b/README.md index a5e35f3..40b3271 100644 --- a/README.md +++ b/README.md @@ -40,7 +40,7 @@ Supported services: | Name | Version | |------|---------| -| [google](#provider\_google) | 7.15.0 | +| [google](#provider\_google) | >= 5.10 | ## Requirements @@ -53,10 +53,10 @@ Supported services: | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| -| [cert\_manager](#input\_cert\_manager) | Configuration for cert-manager missing issuer log alert. Allows customization of project, cluster, namespace, notification channels, alert documentation, enablement, extra filters, auto-close timing, and notification rate limiting. |
object({
enabled = optional(bool, true)
cluster_name = string
project_id = optional(string, null)
namespace = optional(string, "cert-manager")
notification_enabled = optional(bool, true)
notification_channels = optional(list(string), [])
logmatch_notification_rate_limit = optional(string, "300s")
alert_documentation = optional(string, null)
auto_close_seconds = optional(number, 3600)
filter_extra = optional(string, "")
})
| n/a | yes | -| [cloud\_sql](#input\_cloud\_sql) | Configuration for Cloud SQL monitoring alerts. Supports customization of project, auto-close timing, notification channels, and per-instance alert thresholds for CPU, memory, and disk utilization. |
object({
project_id = optional(string, null)
auto_close = optional(string, "86400s") # default 24h
notification_enabled = optional(bool, true)
notification_channels = optional(list(string), [])
instances = optional(map(object({
cpu_utilization = optional(list(object({
severity = optional(string, "WARNING"),
threshold = optional(number, 0.90)
alignment_period = optional(string, "120s")
duration = optional(string, "300s")
})), [
{
threshold = 0.85,
duration = "1200s",
},
{
severity = "CRITICAL",
threshold = 1,
duration = "300s",
alignment_period = "60s",
}
])
memory_utilization = optional(list(object({
severity = optional(string, "WARNING"),
threshold = optional(number, 0.90)
alignment_period = optional(string, "300s")
duration = optional(string, "300s")
})), [
{
severity = "WARNING",
},
{
severity = "CRITICAL",
threshold = 0.95,
}
])
disk_utilization = optional(list(object({
severity = optional(string, "WARNING"),
threshold = optional(number, 0.85)
alignment_period = optional(string, "300s")
duration = optional(string, "600s")
})), [
{
severity = "WARNING",
},
{
severity = "CRITICAL",
threshold = 0.95,
}
])
})), {})
})
| n/a | yes | -| [konnectivity\_agent](#input\_konnectivity\_agent) | Configuration for Konnectivity agent deployment replica alert in GKE. Triggers when there are no available replicas. |
object({
enabled = optional(bool, true)
cluster_name = optional(string, null)
project_id = optional(string, null)
namespace = optional(string, "kube-system")
deployment_name = optional(string, "konnectivity-agent")
duration_seconds = optional(number, 60)
auto_close_seconds = optional(number, 3600)
notification_enabled = optional(bool, true)
notification_channels = optional(list(string), [])
notification_prompts = optional(list(string), null)
})
| n/a | yes | -| [kyverno](#input\_kyverno) | Configuration for Kyverno monitoring alerts. Allows customization of cluster name, project, notification channels, alert documentation, metric thresholds, auto-close timing, enablement, extra filters, and namespace. |
object({
enabled = optional(bool, true)
cluster_name = string
project_id = optional(string, null)
notification_enabled = optional(bool, true)
notification_channels = optional(list(string), [])
# Rate limit for notifications, e.g. "300s" for 5 minutes, used only for log match alerts
logmatch_notification_rate_limit = optional(string, "300s")
alert_documentation = optional(string, null)
auto_close_seconds = optional(number, 3600)
filter_extra = optional(string, "")
namespace = optional(string, "kyverno")
})
| n/a | yes | +| [cert\_manager](#input\_cert\_manager) | Configuration for cert-manager missing issuer log alert. Allows customization of project, cluster, namespace, notification channels, alert documentation, enablement, extra filters, auto-close timing, and notification rate limiting. |
object({
enabled = optional(bool, false)
cluster_name = optional(string, null)
project_id = optional(string, null)
namespace = optional(string, "cert-manager")
notification_enabled = optional(bool, true)
notification_channels = optional(list(string), [])
logmatch_notification_rate_limit = optional(string, "300s")
alert_documentation = optional(string, null)
auto_close_seconds = optional(number, 3600)
filter_extra = optional(string, "")
})
| `{}` | no | +| [cloud\_sql](#input\_cloud\_sql) | Configuration for Cloud SQL monitoring alerts. Supports customization of project, auto-close timing, notification channels, and per-instance alert thresholds for CPU, memory, and disk utilization. |
object({
enabled = optional(bool, true)
project_id = optional(string, null)
auto_close = optional(string, "86400s") # default 24h
notification_enabled = optional(bool, true)
notification_channels = optional(list(string), [])
instances = optional(map(object({
cpu_utilization = optional(list(object({
severity = optional(string, "WARNING"),
threshold = optional(number, 0.90)
alignment_period = optional(string, "120s")
duration = optional(string, "300s")
})), [
{
threshold = 0.85,
duration = "1200s",
},
{
severity = "CRITICAL",
threshold = 1,
duration = "300s",
alignment_period = "60s",
}
])
memory_utilization = optional(list(object({
severity = optional(string, "WARNING"),
threshold = optional(number, 0.90)
alignment_period = optional(string, "300s")
duration = optional(string, "300s")
})), [
{
severity = "WARNING",
},
{
severity = "CRITICAL",
threshold = 0.95,
}
])
disk_utilization = optional(list(object({
severity = optional(string, "WARNING"),
threshold = optional(number, 0.85)
alignment_period = optional(string, "300s")
duration = optional(string, "600s")
})), [
{
severity = "WARNING",
},
{
severity = "CRITICAL",
threshold = 0.95,
}
])
})), {})
})
| `{}` | no | +| [konnectivity\_agent](#input\_konnectivity\_agent) | Configuration for Konnectivity agent deployment replica alert in GKE. Triggers when there are no available replicas. |
object({
enabled = optional(bool, false)
cluster_name = optional(string, null)
project_id = optional(string, null)
namespace = optional(string, "kube-system")
deployment_name = optional(string, "konnectivity-agent")
duration_seconds = optional(number, 60)
auto_close_seconds = optional(number, 3600)
notification_enabled = optional(bool, true)
notification_channels = optional(list(string), [])
notification_prompts = optional(list(string), null)
})
| `{}` | no | +| [kyverno](#input\_kyverno) | Configuration for Kyverno monitoring alerts. Allows customization of cluster name, project, notification channels, alert documentation, metric thresholds, auto-close timing, enablement, extra filters, and namespace. |
object({
enabled = optional(bool, false)
cluster_name = optional(string, null)
project_id = optional(string, null)
notification_enabled = optional(bool, true)
notification_channels = optional(list(string), [])
# Rate limit for notifications, e.g. "300s" for 5 minutes, used only for log match alerts
logmatch_notification_rate_limit = optional(string, "300s")
alert_documentation = optional(string, null)
auto_close_seconds = optional(number, 3600)
filter_extra = optional(string, "")
namespace = optional(string, "kyverno")
})
| `{}` | no | | [litellm](#input\_litellm) | Configuration for LiteLLM monitoring alerts. Supports uptime checks for HTTP endpoints and container-level alerts (pod restarts) in GKE. Each app is identified by its name (map key). |
object({
enabled = optional(bool, false)
project_id = optional(string, null)
notification_enabled = optional(bool, true)
notification_channels = optional(list(string), [])
cluster_name = optional(string, null)

apps = optional(map(object({
uptime_check = optional(object({
enabled = optional(bool, true)
host = string
path = optional(string, "/health/readiness")
}), null)

container_check = optional(object({
enabled = optional(bool, true)
namespace = string
pod_restart = optional(object({
threshold = optional(number, 0)
alignment_period = optional(number, 60)
duration = optional(number, 180)
auto_close_seconds = optional(number, 3600)
notification_prompts = optional(list(string), null)
}), {})
}), null)
})), {})
})
| `{}` | no | | [notification\_channels](#input\_notification\_channels) | List of notification channel IDs to notify when an alert is triggered | `list(string)` | `[]` | no | | [project\_id](#input\_project\_id) | The Google Cloud project ID where logging exclusions will be created | `string` | n/a | yes | @@ -90,7 +90,7 @@ Supported services: | Name | Source | Version | |------|--------|---------| -| [litellm\_uptime\_checks](#module\_litellm\_uptime\_checks) | github.com/sparkfabrik/terraform-sparkfabrik-gcp-http-monitoring | 1.0.0 | -| [typesense\_uptime\_checks](#module\_typesense\_uptime\_checks) | github.com/sparkfabrik/terraform-sparkfabrik-gcp-http-monitoring | 1.0.0 | +| [litellm\_uptime\_checks](#module\_litellm\_uptime\_checks) | ./modules/http_monitoring | n/a | +| [typesense\_uptime\_checks](#module\_typesense\_uptime\_checks) | ./modules/http_monitoring | n/a | diff --git a/modules/http_monitoring/README.md b/modules/http_monitoring/README.md new file mode 100644 index 0000000..c9282d4 --- /dev/null +++ b/modules/http_monitoring/README.md @@ -0,0 +1,49 @@ +# HTTP Monitoring Module + +Terraform module for creating HTTPS uptime checks and alert policies in Google Cloud Monitoring. + +## Features + +- HTTPS uptime check with SSL validation +- Configurable check intervals and timeouts +- Multi-region monitoring (default: USA, Europe, Asia Pacific) +- Automatic alert policy on check failure +- Custom response status code validation + +## Usage + +```hcl +module "http_monitoring" { + source = "./modules/http_monitoring" + + gcp_project_id = "my-project" + uptime_monitoring_host = "www.example.com" + uptime_monitoring_path = "/health" + alert_notification_channels = ["projects/my-project/notificationChannels/123456"] +} +``` + +## Inputs + +| Name | Description | Type | Default | Required | +| ---------------------------------- | ----------------------------------------------------- | -------------- | -------------------------------------------- | :------: | +| `gcp_project_id` | Google Cloud project ID | `string` | - | yes | +| `uptime_monitoring_host` | Hostname to monitor (without protocol) | `string` | - | yes | +| `alert_notification_channels` | Notification channel IDs for alerts | `list(string)` | - | yes | +| `uptime_monitoring_path` | Path to check | `string` | `"/"` | no | +| `uptime_monitoring_display_name` | Display name for the uptime check | `string` | `""` | no | +| `uptime_check_period` | Check interval (60s, 300s, 600s, 900s) | `string` | `"60s"` | no | +| `uptime_check_timeout` | Request timeout (1-60 seconds) | `string` | `"10s"` | no | +| `uptime_check_regions` | Regions to run checks from | `list(string)` | `["USA_VIRGINIA", "EUROPE", "ASIA_PACIFIC"]` | no | +| `uptime_monitoring_headers` | HTTP headers to send | `map(string)` | `{}` | no | +| `uptime_alert_user_labels` | Labels for the alert policy | `map(string)` | `{}` | no | +| `alert_threshold_duration` | Duration before triggering alert | `string` | `"60s"` | no | +| `alert_threshold_value` | Threshold for alert trigger | `number` | `1` | no | +| `alert_display_name` | Display name for the alert | `string` | `""` | no | +| `accepted_response_status_values` | Accepted HTTP status codes | `set(number)` | `[]` | no | +| `accepted_response_status_classes` | Accepted HTTP status classes (e.g., STATUS_CLASS_2XX) | `set(string)` | `[]` | no | + +## Resources Created + +- `google_monitoring_uptime_check_config.https_uptime` - HTTPS uptime check +- `google_monitoring_alert_policy.failure_alert` - Alert policy for check failures diff --git a/modules/http_monitoring/versions.tf b/modules/http_monitoring/versions.tf new file mode 100644 index 0000000..41b6e61 --- /dev/null +++ b/modules/http_monitoring/versions.tf @@ -0,0 +1,10 @@ +terraform { + required_version = ">= 1.5" + + required_providers { + google = { + source = "hashicorp/google" + version = ">= 5.10" + } + } +} From cdc0bdb94458bc4a5c3ca1c79535285d2dc3273c Mon Sep 17 00:00:00 2001 From: FabrizioCafolla Date: Tue, 27 Jan 2026 17:20:39 +0100 Subject: [PATCH 4/5] update --- CHANGELOG.md | 2 +- README.md | 6 +++--- cert_manager.tf | 8 ++++---- kyverno.tf | 9 +++++---- lite_llm.tf | 2 +- modules/http_monitoring/variables.tf | 2 +- typesense.tf | 2 +- variables.tf | 30 +++++++++++++++++++++++++--- 8 files changed, 43 insertions(+), 18 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d2aa9f3..845b9c1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,7 +14,7 @@ to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ### Changed -- refs platform/board#4071: remove dependecies from [`terraform-sparkfabrik-gcp-http-monitoring`](https://github.com/sparkfabrik/terraform-sparkfabrik-gcp-http-monitoring) terraform module. **⚠️ WARN** Disabled monitoring alerts by default for `kyverno`, `cert-manager`, and `konnectivity_agent`, from now on, you must add the explicit value `enabled = true` to activate these alerts. +- refs platform/board#4071: remove dependecies from [`terraform-sparkfabrik-gcp-http-monitoring`](https://github.com/sparkfabrik/terraform-sparkfabrik-gcp-http-monitoring) terraform module. ## [0.11.0] - 2026-01-14 diff --git a/README.md b/README.md index 40b3271..2f092e2 100644 --- a/README.md +++ b/README.md @@ -53,10 +53,10 @@ Supported services: | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| -| [cert\_manager](#input\_cert\_manager) | Configuration for cert-manager missing issuer log alert. Allows customization of project, cluster, namespace, notification channels, alert documentation, enablement, extra filters, auto-close timing, and notification rate limiting. |
object({
enabled = optional(bool, false)
cluster_name = optional(string, null)
project_id = optional(string, null)
namespace = optional(string, "cert-manager")
notification_enabled = optional(bool, true)
notification_channels = optional(list(string), [])
logmatch_notification_rate_limit = optional(string, "300s")
alert_documentation = optional(string, null)
auto_close_seconds = optional(number, 3600)
filter_extra = optional(string, "")
})
| `{}` | no | +| [cert\_manager](#input\_cert\_manager) | Configuration for cert-manager missing issuer log alert. Allows customization of project, cluster, namespace, notification channels, alert documentation, enablement, extra filters, auto-close timing, and notification rate limiting. |
object({
enabled = optional(bool, true)
cluster_name = optional(string, null)
project_id = optional(string, null)
namespace = optional(string, "cert-manager")
notification_enabled = optional(bool, true)
notification_channels = optional(list(string), [])
logmatch_notification_rate_limit = optional(string, "300s")
alert_documentation = optional(string, null)
auto_close_seconds = optional(number, 3600)
filter_extra = optional(string, "")
})
| `{}` | no | | [cloud\_sql](#input\_cloud\_sql) | Configuration for Cloud SQL monitoring alerts. Supports customization of project, auto-close timing, notification channels, and per-instance alert thresholds for CPU, memory, and disk utilization. |
object({
enabled = optional(bool, true)
project_id = optional(string, null)
auto_close = optional(string, "86400s") # default 24h
notification_enabled = optional(bool, true)
notification_channels = optional(list(string), [])
instances = optional(map(object({
cpu_utilization = optional(list(object({
severity = optional(string, "WARNING"),
threshold = optional(number, 0.90)
alignment_period = optional(string, "120s")
duration = optional(string, "300s")
})), [
{
threshold = 0.85,
duration = "1200s",
},
{
severity = "CRITICAL",
threshold = 1,
duration = "300s",
alignment_period = "60s",
}
])
memory_utilization = optional(list(object({
severity = optional(string, "WARNING"),
threshold = optional(number, 0.90)
alignment_period = optional(string, "300s")
duration = optional(string, "300s")
})), [
{
severity = "WARNING",
},
{
severity = "CRITICAL",
threshold = 0.95,
}
])
disk_utilization = optional(list(object({
severity = optional(string, "WARNING"),
threshold = optional(number, 0.85)
alignment_period = optional(string, "300s")
duration = optional(string, "600s")
})), [
{
severity = "WARNING",
},
{
severity = "CRITICAL",
threshold = 0.95,
}
])
})), {})
})
| `{}` | no | -| [konnectivity\_agent](#input\_konnectivity\_agent) | Configuration for Konnectivity agent deployment replica alert in GKE. Triggers when there are no available replicas. |
object({
enabled = optional(bool, false)
cluster_name = optional(string, null)
project_id = optional(string, null)
namespace = optional(string, "kube-system")
deployment_name = optional(string, "konnectivity-agent")
duration_seconds = optional(number, 60)
auto_close_seconds = optional(number, 3600)
notification_enabled = optional(bool, true)
notification_channels = optional(list(string), [])
notification_prompts = optional(list(string), null)
})
| `{}` | no | -| [kyverno](#input\_kyverno) | Configuration for Kyverno monitoring alerts. Allows customization of cluster name, project, notification channels, alert documentation, metric thresholds, auto-close timing, enablement, extra filters, and namespace. |
object({
enabled = optional(bool, false)
cluster_name = optional(string, null)
project_id = optional(string, null)
notification_enabled = optional(bool, true)
notification_channels = optional(list(string), [])
# Rate limit for notifications, e.g. "300s" for 5 minutes, used only for log match alerts
logmatch_notification_rate_limit = optional(string, "300s")
alert_documentation = optional(string, null)
auto_close_seconds = optional(number, 3600)
filter_extra = optional(string, "")
namespace = optional(string, "kyverno")
})
| `{}` | no | +| [konnectivity\_agent](#input\_konnectivity\_agent) | Configuration for Konnectivity agent deployment replica alert in GKE. Triggers when there are no available replicas. |
object({
enabled = optional(bool, true)
cluster_name = optional(string, null)
project_id = optional(string, null)
namespace = optional(string, "kube-system")
deployment_name = optional(string, "konnectivity-agent")
duration_seconds = optional(number, 60)
auto_close_seconds = optional(number, 3600)
notification_enabled = optional(bool, true)
notification_channels = optional(list(string), [])
notification_prompts = optional(list(string), null)
})
| `{}` | no | +| [kyverno](#input\_kyverno) | Configuration for Kyverno monitoring alerts. Allows customization of cluster name, project, notification channels, alert documentation, metric thresholds, auto-close timing, enablement, extra filters, and namespace. |
object({
enabled = optional(bool, true)
cluster_name = optional(string, null)
project_id = optional(string, null)
notification_enabled = optional(bool, true)
notification_channels = optional(list(string), [])
# Rate limit for notifications, e.g. "300s" for 5 minutes, used only for log match alerts
logmatch_notification_rate_limit = optional(string, "300s")
alert_documentation = optional(string, null)
auto_close_seconds = optional(number, 3600)
filter_extra = optional(string, "")
namespace = optional(string, "kyverno")
})
| `{}` | no | | [litellm](#input\_litellm) | Configuration for LiteLLM monitoring alerts. Supports uptime checks for HTTP endpoints and container-level alerts (pod restarts) in GKE. Each app is identified by its name (map key). |
object({
enabled = optional(bool, false)
project_id = optional(string, null)
notification_enabled = optional(bool, true)
notification_channels = optional(list(string), [])
cluster_name = optional(string, null)

apps = optional(map(object({
uptime_check = optional(object({
enabled = optional(bool, true)
host = string
path = optional(string, "/health/readiness")
}), null)

container_check = optional(object({
enabled = optional(bool, true)
namespace = string
pod_restart = optional(object({
threshold = optional(number, 0)
alignment_period = optional(number, 60)
duration = optional(number, 180)
auto_close_seconds = optional(number, 3600)
notification_prompts = optional(list(string), null)
}), {})
}), null)
})), {})
})
| `{}` | no | | [notification\_channels](#input\_notification\_channels) | List of notification channel IDs to notify when an alert is triggered | `list(string)` | `[]` | no | | [project\_id](#input\_project\_id) | The Google Cloud project ID where logging exclusions will be created | `string` | n/a | yes | diff --git a/cert_manager.tf b/cert_manager.tf index 58f53f0..3a62827 100644 --- a/cert_manager.tf +++ b/cert_manager.tf @@ -8,13 +8,14 @@ locals { EOT ) cert_manager_notification_channels = var.cert_manager.notification_enabled ? (length(var.cert_manager.notification_channels) > 0 ? var.cert_manager.notification_channels : var.notification_channels) : [] + cert_manager_cluster_name = var.cert_manager.cluster_name != null ? trimspace(var.cert_manager.cluster_name) : "" - cert_manager_log_filter = var.cert_manager.cluster_name != null ? (<<-EOT + cert_manager_log_filter = local.cert_manager_cluster_name != "" ? (<<-EOT ( ( resource.type="k8s_container" AND resource.labels.project_id="${local.cert_manager_project_id}" - AND resource.labels.cluster_name="${var.cert_manager.cluster_name}" + AND resource.labels.cluster_name="${local.cert_manager_cluster_name}" AND resource.labels.namespace_name="${var.cert_manager.namespace}" ) OR ( @@ -40,8 +41,7 @@ locals { resource "google_monitoring_alert_policy" "cert_manager_logmatch_alert" { count = ( var.cert_manager.enabled - && try(var.cert_manager.cluster_name, "") != "" - && var.cert_manager.cluster_name != null + && local.cert_manager_cluster_name != "" ) ? 1 : 0 display_name = "cert-manager missing Issuer/ClusterIssuer (cluster=${var.cert_manager.cluster_name}, namespace=${var.cert_manager.namespace})" diff --git a/kyverno.tf b/kyverno.tf index 50774ba..1814dcf 100644 --- a/kyverno.tf +++ b/kyverno.tf @@ -3,10 +3,12 @@ locals { alert_documentation = var.kyverno.alert_documentation != null ? var.kyverno.alert_documentation : "Kyverno controllers produced ERROR logs in namespace ${var.kyverno.namespace}." kyverno_notification_channels = var.kyverno.notification_enabled ? (length(var.kyverno.notification_channels) > 0 ? var.kyverno.notification_channels : var.notification_channels) : [] - kyverno_log_filter = var.kyverno.cluster_name != null ? (<<-EOT + kyverno_cluster_name = var.kyverno.cluster_name != null ? trimspace(var.kyverno.cluster_name) : "" + + kyverno_log_filter = local.kyverno_cluster_name != "" ? (<<-EOT resource.type="k8s_container" AND resource.labels.project_id="${local.kyverno_project_id}" - AND resource.labels.cluster_name="${var.kyverno.cluster_name}" + AND resource.labels.cluster_name="${local.kyverno_cluster_name}" AND resource.labels.namespace_name="${var.kyverno.namespace}" AND ( labels."k8s-pod/app_kubernetes_io/component"=~"(admission-controller|background-controller|cleanup-controller|reports-controller)" @@ -54,8 +56,7 @@ locals { resource "google_monitoring_alert_policy" "kyverno_logmatch_alert" { count = ( var.kyverno.enabled - && try(var.kyverno.cluster_name, "") != "" - && var.kyverno.cluster_name != null + && local.kyverno_cluster_name != "" ) ? 1 : 0 display_name = "Kyverno controllers ERROR logs (namespace=${var.kyverno.namespace})" diff --git a/lite_llm.tf b/lite_llm.tf index 731460f..ef06a76 100644 --- a/lite_llm.tf +++ b/lite_llm.tf @@ -20,7 +20,7 @@ module "litellm_uptime_checks" { for_each = local.litellm_uptime_checks source = "./modules/http_monitoring" - gcp_project_id = local.litellm_project + gcp_project_id = local.litellm_project uptime_monitoring_host = each.value.host uptime_monitoring_path = each.value.path alert_notification_channels = local.litellm_notification_channels diff --git a/modules/http_monitoring/variables.tf b/modules/http_monitoring/variables.tf index 80ccc02..a8a908f 100644 --- a/modules/http_monitoring/variables.tf +++ b/modules/http_monitoring/variables.tf @@ -17,7 +17,7 @@ variable "uptime_monitoring_path" { variable "uptime_check_period" { type = string - description = "How often, in seconds, the uptime check is performed. Currently, the only supported values are 60s (1 minute), 300s (5 minutes), 600s (10 minutes), and 900s (15 minutes). Defaults to 300s." + description = "How often, in seconds, the uptime check is performed. Currently, the only supported values are 60s (1 minute), 300s (5 minutes), 600s (10 minutes), and 900s (15 minutes)" default = "60s" } diff --git a/typesense.tf b/typesense.tf index 4b28657..ac07146 100644 --- a/typesense.tf +++ b/typesense.tf @@ -20,7 +20,7 @@ module "typesense_uptime_checks" { for_each = local.typesense_uptime_checks source = "./modules/http_monitoring" - gcp_project_id = local.typesense_project + gcp_project_id = local.typesense_project uptime_monitoring_host = each.value.host uptime_monitoring_path = each.value.path alert_notification_channels = local.typesense_notification_channels diff --git a/variables.tf b/variables.tf index be04281..dcde9b9 100644 --- a/variables.tf +++ b/variables.tf @@ -72,7 +72,7 @@ variable "kyverno" { description = "Configuration for Kyverno monitoring alerts. Allows customization of cluster name, project, notification channels, alert documentation, metric thresholds, auto-close timing, enablement, extra filters, and namespace." default = {} type = object({ - enabled = optional(bool, false) + enabled = optional(bool, true) cluster_name = optional(string, null) project_id = optional(string, null) notification_enabled = optional(bool, true) @@ -84,13 +84,21 @@ variable "kyverno" { filter_extra = optional(string, "") namespace = optional(string, "kyverno") }) + + validation { + condition = ( + !var.kyverno.enabled || + (var.kyverno.cluster_name != null && var.kyverno.cluster_name != "") + ) + error_message = "When 'enabled' is true, 'cluster_name' must be provided and cannot be empty." + } } variable "cert_manager" { description = "Configuration for cert-manager missing issuer log alert. Allows customization of project, cluster, namespace, notification channels, alert documentation, enablement, extra filters, auto-close timing, and notification rate limiting." default = {} type = object({ - enabled = optional(bool, false) + enabled = optional(bool, true) cluster_name = optional(string, null) project_id = optional(string, null) namespace = optional(string, "cert-manager") @@ -101,13 +109,21 @@ variable "cert_manager" { auto_close_seconds = optional(number, 3600) filter_extra = optional(string, "") }) + + validation { + condition = ( + !var.cert_manager.enabled || + (var.cert_manager.cluster_name != null && var.cert_manager.cluster_name != "") + ) + error_message = "When 'enabled' is true, 'cluster_name' must be provided and cannot be empty." + } } variable "konnectivity_agent" { description = "Configuration for Konnectivity agent deployment replica alert in GKE. Triggers when there are no available replicas." default = {} type = object({ - enabled = optional(bool, false) + enabled = optional(bool, true) cluster_name = optional(string, null) project_id = optional(string, null) namespace = optional(string, "kube-system") @@ -118,6 +134,14 @@ variable "konnectivity_agent" { notification_channels = optional(list(string), []) notification_prompts = optional(list(string), null) }) + + validation { + condition = ( + !var.konnectivity_agent.enabled || + (var.konnectivity_agent.cluster_name != null && var.konnectivity_agent.cluster_name != "") + ) + error_message = "When 'enabled' is true, 'cluster_name' must be provided and cannot be empty." + } } variable "typesense" { From 416bc8bde3cc4356400235f4b162f085d358ac8b Mon Sep 17 00:00:00 2001 From: FabrizioCafolla Date: Wed, 28 Jan 2026 10:09:53 +0100 Subject: [PATCH 5/5] update --- CHANGELOG.md | 2 +- cert_manager.tf | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 845b9c1..0012553 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,7 +14,7 @@ to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ### Changed -- refs platform/board#4071: remove dependecies from [`terraform-sparkfabrik-gcp-http-monitoring`](https://github.com/sparkfabrik/terraform-sparkfabrik-gcp-http-monitoring) terraform module. +- refs platform/board#4071: remove dependencies from [`terraform-sparkfabrik-gcp-http-monitoring`](https://github.com/sparkfabrik/terraform-sparkfabrik-gcp-http-monitoring) terraform module. ## [0.11.0] - 2026-01-14 diff --git a/cert_manager.tf b/cert_manager.tf index 3a62827..7cb7fb7 100644 --- a/cert_manager.tf +++ b/cert_manager.tf @@ -21,7 +21,7 @@ locals { OR ( log_id("events") AND resource.labels.project_id="${local.cert_manager_project_id}" - AND resource.labels.cluster_name="${var.cert_manager.cluster_name}" + AND resource.labels.cluster_name="${local.cert_manager_cluster_name}" AND ( jsonPayload.involvedObject.namespace="${var.cert_manager.namespace}" OR jsonPayload.metadata.namespace="${var.cert_manager.namespace}" @@ -44,7 +44,7 @@ resource "google_monitoring_alert_policy" "cert_manager_logmatch_alert" { && local.cert_manager_cluster_name != "" ) ? 1 : 0 - display_name = "cert-manager missing Issuer/ClusterIssuer (cluster=${var.cert_manager.cluster_name}, namespace=${var.cert_manager.namespace})" + display_name = "cert-manager missing Issuer/ClusterIssuer (cluster=${local.cert_manager_cluster_name}, namespace=${var.cert_manager.namespace})" combiner = "OR" enabled = var.cert_manager.enabled