From c7872e14ef684589cc1ef479b43aa1fabd4055cb Mon Sep 17 00:00:00 2001 From: FabrizioCafolla Date: Mon, 15 Dec 2025 11:30:25 +0100 Subject: [PATCH 1/5] feat: update monitoring parameters for LiteLLM and Typesense, add notification prompts --- CHANGELOG.md | 12 ++++++++++++ README.md | 4 ++-- lite_llm.tf | 1 + typesense.tf | 1 + variables.tf | 11 +++++------ 5 files changed, 21 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3344f6c..210936d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,18 @@ to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] +## [0.9.0] - 2025-12-15 + +[Compare with previous version](https://github.com/sparkfabrik/terraform-google-services-monitoring/compare/0.8.0...0.9.0) + +### Added + +- Add `notification_prompts` param for LiteLLM and Typesense + +### Changed + +- Modify the default value of the pod restart alerts `duration` + ## [0.8.0] - 2025-12-12 [Compare with previous version](https://github.com/sparkfabrik/terraform-google-services-monitoring/compare/0.7.0...0.8.0) diff --git a/README.md b/README.md index 67634f2..dafede7 100644 --- a/README.md +++ b/README.md @@ -38,11 +38,11 @@ Supported services: | [cert\_manager](#input\_cert\_manager) | Configuration for cert-manager missing issuer log alert. Allows customization of project, cluster, namespace, notification channels, alert documentation, enablement, extra filters, auto-close timing, and notification rate limiting. |
object({
enabled = optional(bool, true)
cluster_name = string
project_id = optional(string, null)
namespace = optional(string, "cert-manager")
notification_enabled = optional(bool, true)
notification_channels = optional(list(string), [])
logmatch_notification_rate_limit = optional(string, "300s")
alert_documentation = optional(string, null)
auto_close_seconds = optional(number, 3600)
filter_extra = optional(string, "")
})
| n/a | yes | | [cloud\_sql](#input\_cloud\_sql) | Configuration for Cloud SQL monitoring alerts. Supports customization of project, auto-close timing, notification channels, and per-instance alert thresholds for CPU, memory, and disk utilization. |
object({
project_id = optional(string, null)
auto_close = optional(string, "86400s") # default 24h
notification_enabled = optional(bool, true)
notification_channels = optional(list(string), [])
instances = optional(map(object({
cpu_utilization = optional(list(object({
severity = optional(string, "WARNING"),
threshold = optional(number, 0.90)
alignment_period = optional(string, "120s")
duration = optional(string, "300s")
})), [
{
threshold = 0.85,
duration = "1200s",
},
{
severity = "CRITICAL",
threshold = 1,
duration = "300s",
alignment_period = "60s",
}
])
memory_utilization = optional(list(object({
severity = optional(string, "WARNING"),
threshold = optional(number, 0.90)
alignment_period = optional(string, "300s")
duration = optional(string, "300s")
})), [
{
severity = "WARNING",
},
{
severity = "CRITICAL",
threshold = 0.95,
}
])
disk_utilization = optional(list(object({
severity = optional(string, "WARNING"),
threshold = optional(number, 0.85)
alignment_period = optional(string, "300s")
duration = optional(string, "600s")
})), [
{
severity = "WARNING",
},
{
severity = "CRITICAL",
threshold = 0.95,
}
])
})), {})
})
| n/a | yes | | [kyverno](#input\_kyverno) | Configuration for Kyverno monitoring alerts. Allows customization of cluster name, project, notification channels, alert documentation, metric thresholds, auto-close timing, enablement, extra filters, and namespace. |
object({
enabled = optional(bool, true)
cluster_name = string
project_id = optional(string, null)
notification_enabled = optional(bool, true)
notification_channels = optional(list(string), [])
# Rate limit for notifications, e.g. "300s" for 5 minutes, used only for log match alerts
logmatch_notification_rate_limit = optional(string, "300s")
alert_documentation = optional(string, null)
auto_close_seconds = optional(number, 3600)
filter_extra = optional(string, "")
namespace = optional(string, "kyverno")
})
| n/a | yes | -| [litellm](#input\_litellm) | Configuration for LiteLLM monitoring alerts. Supports uptime checks for HTTP endpoints and container-level alerts (pod restarts) in GKE. Each app is identified by its name (map key). |
object({
enabled = optional(bool, false)
project_id = optional(string, null)
notification_enabled = optional(bool, true)
notification_channels = optional(list(string), [])
cluster_name = optional(string, null)

apps = optional(map(object({
uptime_check = optional(object({
enabled = optional(bool, true)
host = string
path = optional(string, "/health/readiness")
}), null)

container_check = optional(object({
enabled = optional(bool, true)
namespace = string
pod_restart = optional(object({
threshold = optional(number, 0)
alignment_period = optional(number, 60)
duration = optional(number, 0)
auto_close_seconds = optional(number, 3600)
}), {})
}), null)
})), {})
})
| `{}` | no | +| [litellm](#input\_litellm) | Configuration for LiteLLM monitoring alerts. Supports uptime checks for HTTP endpoints and container-level alerts (pod restarts) in GKE. Each app is identified by its name (map key). |
object({
enabled = optional(bool, false)
project_id = optional(string, null)
notification_enabled = optional(bool, true)
notification_channels = optional(list(string), [])
cluster_name = optional(string, null)

apps = optional(map(object({
uptime_check = optional(object({
enabled = optional(bool, true)
host = string
path = optional(string, "/health/readiness")
}), null)

container_check = optional(object({
enabled = optional(bool, true)
namespace = string
pod_restart = optional(object({
threshold = optional(number, 0)
alignment_period = optional(number, 60)
duration = optional(number, 120)
auto_close_seconds = optional(number, 3600)
notification_prompts = optional(list(string), ["OPENED", "CLOSED"])
}), {})
}), null)
})), {})
})
| `{}` | no | | [notification\_channels](#input\_notification\_channels) | List of notification channel IDs to notify when an alert is triggered | `list(string)` | `[]` | no | | [project\_id](#input\_project\_id) | The Google Cloud project ID where logging exclusions will be created | `string` | n/a | yes | | [ssl\_alert](#input\_ssl\_alert) | Configuration for SSL certificate expiration alerts. Allows customization of project, notification channels, alert thresholds, and user labels. |
object({
enabled = optional(bool, false)
project_id = optional(string, null)
notification_enabled = optional(bool, true)
notification_channels = optional(list(string), [])
threshold_days = optional(list(number), [15, 7])
user_labels = optional(map(string), {})
})
| `{}` | no | -| [typesense](#input\_typesense) | Configuration for Typesense monitoring alerts. Supports uptime checks for HTTP endpoints and container-level alerts (pod restarts) in GKE. Each app is identified by its name (map key). |
object({
enabled = optional(bool, false)
project_id = optional(string, null)
notification_enabled = optional(bool, true)
notification_channels = optional(list(string), [])
cluster_name = optional(string, null) # GKE cluster name for container checks

# Apps configuration - map keyed by app_name
apps = optional(map(object({
# Uptime check configuration (optional)
uptime_check = optional(object({
enabled = optional(bool, true)
host = string
path = optional(string, "/readyz")
}), null)

# Container check configuration for GKE (optional)
container_check = optional(object({
enabled = optional(bool, true)
namespace = string
pod_restart = optional(object({
threshold = optional(number, 0)
alignment_period = optional(number, 60)
duration = optional(number, 0)
auto_close_seconds = optional(number, 3600)
}), {})
}), null)
})), {})
})
| `{}` | no | +| [typesense](#input\_typesense) | Configuration for Typesense monitoring alerts. Supports uptime checks for HTTP endpoints and container-level alerts (pod restarts) in GKE. Each app is identified by its name (map key). |
object({
enabled = optional(bool, false)
project_id = optional(string, null)
notification_enabled = optional(bool, true)
notification_channels = optional(list(string), [])
cluster_name = optional(string, null)

apps = optional(map(object({
uptime_check = optional(object({
enabled = optional(bool, true)
host = string
path = optional(string, "/readyz")
}), null)

container_check = optional(object({
enabled = optional(bool, true)
namespace = string
pod_restart = optional(object({
threshold = optional(number, 0)
duration = optional(number, 120)
auto_close_seconds = optional(number, 3600)
notification_prompts = optional(list(string), ["OPENED", "CLOSED"])
}), {})
}), null)
})), {})
})
| `{}` | no | ## Outputs diff --git a/lite_llm.tf b/lite_llm.tf index a6f38cf..a7ca9a6 100644 --- a/lite_llm.tf +++ b/lite_llm.tf @@ -75,5 +75,6 @@ resource "google_monitoring_alert_policy" "litellm_pod_restart" { alert_strategy { auto_close = "${each.value.pod_restart.auto_close_seconds}s" + notification_prompts = each.value.pod_restart.notification_prompts } } diff --git a/typesense.tf b/typesense.tf index 07fb529..4e69f40 100644 --- a/typesense.tf +++ b/typesense.tf @@ -75,5 +75,6 @@ resource "google_monitoring_alert_policy" "typesense_pod_restart" { alert_strategy { auto_close = "${each.value.pod_restart.auto_close_seconds}s" + notification_prompts = each.value.pod_restart.notification_prompts } } diff --git a/variables.tf b/variables.tf index 54d748a..453794c 100644 --- a/variables.tf +++ b/variables.tf @@ -107,26 +107,24 @@ variable "typesense" { project_id = optional(string, null) notification_enabled = optional(bool, true) notification_channels = optional(list(string), []) - cluster_name = optional(string, null) # GKE cluster name for container checks + cluster_name = optional(string, null) - # Apps configuration - map keyed by app_name apps = optional(map(object({ - # Uptime check configuration (optional) uptime_check = optional(object({ enabled = optional(bool, true) host = string path = optional(string, "/readyz") }), null) - # Container check configuration for GKE (optional) container_check = optional(object({ enabled = optional(bool, true) namespace = string pod_restart = optional(object({ threshold = optional(number, 0) alignment_period = optional(number, 60) - duration = optional(number, 0) + duration = optional(number, 120) auto_close_seconds = optional(number, 3600) + notification_prompts = optional(list(string), ["OPENED", "CLOSED"]) }), {}) }), null) })), {}) @@ -175,8 +173,9 @@ variable "litellm" { pod_restart = optional(object({ threshold = optional(number, 0) alignment_period = optional(number, 60) - duration = optional(number, 0) + duration = 
optional(number, 120) auto_close_seconds = optional(number, 3600) + notification_prompts = optional(list(string), ["OPENED", "CLOSED"]) }), {}) }), null) })), {}) From 28ad4ac993260af92244462c2b7fbd59217a1cbf Mon Sep 17 00:00:00 2001 From: FabrizioCafolla Date: Mon, 15 Dec 2025 11:53:48 +0100 Subject: [PATCH 2/5] update --- variables.tf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/variables.tf b/variables.tf index 453794c..733863e 100644 --- a/variables.tf +++ b/variables.tf @@ -122,7 +122,7 @@ variable "typesense" { pod_restart = optional(object({ threshold = optional(number, 0) alignment_period = optional(number, 60) - duration = optional(number, 120) + duration = optional(number, 180) auto_close_seconds = optional(number, 3600) notification_prompts = optional(list(string), ["OPENED", "CLOSED"]) }), {}) @@ -173,7 +173,7 @@ variable "litellm" { pod_restart = optional(object({ threshold = optional(number, 0) alignment_period = optional(number, 60) - duration = optional(number, 120) + duration = optional(number, 180) auto_close_seconds = optional(number, 3600) notification_prompts = optional(list(string), ["OPENED", "CLOSED"]) }), {}) From b054aaf2734a93c73f27d26ad0ec1da58cb877c4 Mon Sep 17 00:00:00 2001 From: FabrizioCafolla Date: Mon, 15 Dec 2025 11:54:23 +0100 Subject: [PATCH 3/5] update --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index dafede7..6793540 100644 --- a/README.md +++ b/README.md @@ -38,11 +38,11 @@ Supported services: | [cert\_manager](#input\_cert\_manager) | Configuration for cert-manager missing issuer log alert. Allows customization of project, cluster, namespace, notification channels, alert documentation, enablement, extra filters, auto-close timing, and notification rate limiting. |
object({
enabled = optional(bool, true)
cluster_name = string
project_id = optional(string, null)
namespace = optional(string, "cert-manager")
notification_enabled = optional(bool, true)
notification_channels = optional(list(string), [])
logmatch_notification_rate_limit = optional(string, "300s")
alert_documentation = optional(string, null)
auto_close_seconds = optional(number, 3600)
filter_extra = optional(string, "")
})
| n/a | yes | | [cloud\_sql](#input\_cloud\_sql) | Configuration for Cloud SQL monitoring alerts. Supports customization of project, auto-close timing, notification channels, and per-instance alert thresholds for CPU, memory, and disk utilization. |
object({
project_id = optional(string, null)
auto_close = optional(string, "86400s") # default 24h
notification_enabled = optional(bool, true)
notification_channels = optional(list(string), [])
instances = optional(map(object({
cpu_utilization = optional(list(object({
severity = optional(string, "WARNING"),
threshold = optional(number, 0.90)
alignment_period = optional(string, "120s")
duration = optional(string, "300s")
})), [
{
threshold = 0.85,
duration = "1200s",
},
{
severity = "CRITICAL",
threshold = 1,
duration = "300s",
alignment_period = "60s",
}
])
memory_utilization = optional(list(object({
severity = optional(string, "WARNING"),
threshold = optional(number, 0.90)
alignment_period = optional(string, "300s")
duration = optional(string, "300s")
})), [
{
severity = "WARNING",
},
{
severity = "CRITICAL",
threshold = 0.95,
}
])
disk_utilization = optional(list(object({
severity = optional(string, "WARNING"),
threshold = optional(number, 0.85)
alignment_period = optional(string, "300s")
duration = optional(string, "600s")
})), [
{
severity = "WARNING",
},
{
severity = "CRITICAL",
threshold = 0.95,
}
])
})), {})
})
| n/a | yes | | [kyverno](#input\_kyverno) | Configuration for Kyverno monitoring alerts. Allows customization of cluster name, project, notification channels, alert documentation, metric thresholds, auto-close timing, enablement, extra filters, and namespace. |
object({
enabled = optional(bool, true)
cluster_name = string
project_id = optional(string, null)
notification_enabled = optional(bool, true)
notification_channels = optional(list(string), [])
# Rate limit for notifications, e.g. "300s" for 5 minutes, used only for log match alerts
logmatch_notification_rate_limit = optional(string, "300s")
alert_documentation = optional(string, null)
auto_close_seconds = optional(number, 3600)
filter_extra = optional(string, "")
namespace = optional(string, "kyverno")
})
| n/a | yes | -| [litellm](#input\_litellm) | Configuration for LiteLLM monitoring alerts. Supports uptime checks for HTTP endpoints and container-level alerts (pod restarts) in GKE. Each app is identified by its name (map key). |
object({
enabled = optional(bool, false)
project_id = optional(string, null)
notification_enabled = optional(bool, true)
notification_channels = optional(list(string), [])
cluster_name = optional(string, null)

apps = optional(map(object({
uptime_check = optional(object({
enabled = optional(bool, true)
host = string
path = optional(string, "/health/readiness")
}), null)

container_check = optional(object({
enabled = optional(bool, true)
namespace = string
pod_restart = optional(object({
threshold = optional(number, 0)
alignment_period = optional(number, 60)
duration = optional(number, 120)
auto_close_seconds = optional(number, 3600)
notification_prompts = optional(list(string), ["OPENED", "CLOSED"])
}), {})
}), null)
})), {})
})
| `{}` | no | +| [litellm](#input\_litellm) | Configuration for LiteLLM monitoring alerts. Supports uptime checks for HTTP endpoints and container-level alerts (pod restarts) in GKE. Each app is identified by its name (map key). |
object({
enabled = optional(bool, false)
project_id = optional(string, null)
notification_enabled = optional(bool, true)
notification_channels = optional(list(string), [])
cluster_name = optional(string, null)

apps = optional(map(object({
uptime_check = optional(object({
enabled = optional(bool, true)
host = string
path = optional(string, "/health/readiness")
}), null)

container_check = optional(object({
enabled = optional(bool, true)
namespace = string
pod_restart = optional(object({
threshold = optional(number, 0)
alignment_period = optional(number, 60)
duration = optional(number, 180)
auto_close_seconds = optional(number, 3600)
notification_prompts = optional(list(string), null)
}), {})
}), null)
})), {})
})
| `{}` | no | | [notification\_channels](#input\_notification\_channels) | List of notification channel IDs to notify when an alert is triggered | `list(string)` | `[]` | no | | [project\_id](#input\_project\_id) | The Google Cloud project ID where logging exclusions will be created | `string` | n/a | yes | | [ssl\_alert](#input\_ssl\_alert) | Configuration for SSL certificate expiration alerts. Allows customization of project, notification channels, alert thresholds, and user labels. |
object({
enabled = optional(bool, false)
project_id = optional(string, null)
notification_enabled = optional(bool, true)
notification_channels = optional(list(string), [])
threshold_days = optional(list(number), [15, 7])
user_labels = optional(map(string), {})
})
| `{}` | no | -| [typesense](#input\_typesense) | Configuration for Typesense monitoring alerts. Supports uptime checks for HTTP endpoints and container-level alerts (pod restarts) in GKE. Each app is identified by its name (map key). |
object({
enabled = optional(bool, false)
project_id = optional(string, null)
notification_enabled = optional(bool, true)
notification_channels = optional(list(string), [])
cluster_name = optional(string, null)

apps = optional(map(object({
uptime_check = optional(object({
enabled = optional(bool, true)
host = string
path = optional(string, "/readyz")
}), null)

container_check = optional(object({
enabled = optional(bool, true)
namespace = string
pod_restart = optional(object({
threshold = optional(number, 0)
duration = optional(number, 120)
auto_close_seconds = optional(number, 3600)
notification_prompts = optional(list(string), ["OPENED", "CLOSED"])
}), {})
}), null)
})), {})
})
| `{}` | no | +| [typesense](#input\_typesense) | Configuration for Typesense monitoring alerts. Supports uptime checks for HTTP endpoints and container-level alerts (pod restarts) in GKE. Each app is identified by its name (map key). |
object({
enabled = optional(bool, false)
project_id = optional(string, null)
notification_enabled = optional(bool, true)
notification_channels = optional(list(string), [])
cluster_name = optional(string, null)

apps = optional(map(object({
uptime_check = optional(object({
enabled = optional(bool, true)
host = string
path = optional(string, "/readyz")
}), null)

container_check = optional(object({
enabled = optional(bool, true)
namespace = string
pod_restart = optional(object({
threshold = optional(number, 0)
alignment_period = optional(number, 60)
duration = optional(number, 180)
auto_close_seconds = optional(number, 3600)
notification_prompts = optional(list(string), null)
}), {})
}), null)
})), {})
})
| `{}` | no | ## Outputs From aebe9808390983245df6db89d7720e1abb126b3f Mon Sep 17 00:00:00 2001 From: FabrizioCafolla Date: Mon, 15 Dec 2025 14:49:30 +0100 Subject: [PATCH 4/5] update --- variables.tf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/variables.tf b/variables.tf index 733863e..8596ea9 100644 --- a/variables.tf +++ b/variables.tf @@ -124,7 +124,7 @@ variable "typesense" { alignment_period = optional(number, 60) duration = optional(number, 180) auto_close_seconds = optional(number, 3600) - notification_prompts = optional(list(string), ["OPENED", "CLOSED"]) + notification_prompts = optional(list(string), null) }), {}) }), null) })), {}) @@ -175,7 +175,7 @@ variable "litellm" { alignment_period = optional(number, 60) duration = optional(number, 180) auto_close_seconds = optional(number, 3600) - notification_prompts = optional(list(string), ["OPENED", "CLOSED"]) + notification_prompts = optional(list(string), null) }), {}) }), null) })), {}) From 07553adbe025aa308855a60ec692e67d29bc054e Mon Sep 17 00:00:00 2001 From: FabrizioCafolla Date: Mon, 15 Dec 2025 17:00:02 +0100 Subject: [PATCH 5/5] update --- examples/main.tf | 43 ++++++++++++++++++++++++++++++++++++++----- 1 file changed, 38 insertions(+), 5 deletions(-) diff --git a/examples/main.tf b/examples/main.tf index 92ffa3b..5ab7c75 100644 --- a/examples/main.tf +++ b/examples/main.tf @@ -42,15 +42,13 @@ locals { } module "example" { - source = "github.com/sparkfabrik/terraform-google-services-monitoring" - version = ">= 0.1.0" + source = "github.com/sparkfabrik/terraform-google-services-monitoring?ref=0.9.0" notification_channels = var.notification_channels project_id = var.project_id cloud_sql = local.cloud_sql kyverno = { cluster_name = "test-cluster" - enabled = true notification_channels = [] # Optional filter for log entries, exclude known non-actionable messages # e.g., "-textPayload:\"stale GroupVersion discovery: metrics.k8s.io/v1beta1\"" @@ -59,7 +57,42 @@ module "example" 
{ cert_manager = { cluster_name = "test-cluster" namespace = "cert-manager" - enabled = true - notification_channels = [] + } + + typesense = { + cluster_name = "test-cluster" + apps = { + "typesense-app" = { + uptime_check = { + host = "typesense.example.com" + } + container_check = { + enabled = true + namespace = "typesense" + pod_restart = { + threshold = 1 + } + } + } + } + } + + litellm = { + cluster_name = "test-cluster" + apps = { + "litellm-app" = { + uptime_check = { + host = "litellm.example.com" + } + container_check = { + namespace = "litellm" + pod_restart = { + threshold = 2 + duration = 300 + notification_prompts = ["CLOSED"] + } + } + } + } } }