diff --git a/CHANGELOG.md b/CHANGELOG.md index 3344f6c..210936d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,18 @@ to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] +## [0.9.0] - 2025-12-15 + +[Compare with previous version](https://github.com/sparkfabrik/terraform-google-services-monitoring/compare/0.8.0...0.9.0) + +### Added + +- Add `notification_prompts` param for LiteLLM and Typesense + +### Changed + +- Modify the default values of the pod restart alerts `duration` and `alignment_period` + ## [0.8.0] - 2025-12-12 [Compare with previous version](https://github.com/sparkfabrik/terraform-google-services-monitoring/compare/0.7.0...0.8.0) diff --git a/README.md b/README.md index 67634f2..6793540 100644 --- a/README.md +++ b/README.md @@ -38,11 +38,11 @@ Supported services: | [cert\_manager](#input\_cert\_manager) | Configuration for cert-manager missing issuer log alert. Allows customization of project, cluster, namespace, notification channels, alert documentation, enablement, extra filters, auto-close timing, and notification rate limiting. |
object({
enabled = optional(bool, true)
cluster_name = string
project_id = optional(string, null)
namespace = optional(string, "cert-manager")
notification_enabled = optional(bool, true)
notification_channels = optional(list(string), [])
logmatch_notification_rate_limit = optional(string, "300s")
alert_documentation = optional(string, null)
auto_close_seconds = optional(number, 3600)
filter_extra = optional(string, "")
}) | n/a | yes |
| [cloud\_sql](#input\_cloud\_sql) | Configuration for Cloud SQL monitoring alerts. Supports customization of project, auto-close timing, notification channels, and per-instance alert thresholds for CPU, memory, and disk utilization. | object({
project_id = optional(string, null)
auto_close = optional(string, "86400s") # default 24h
notification_enabled = optional(bool, true)
notification_channels = optional(list(string), [])
instances = optional(map(object({
cpu_utilization = optional(list(object({
severity = optional(string, "WARNING"),
threshold = optional(number, 0.90)
alignment_period = optional(string, "120s")
duration = optional(string, "300s")
})), [
{
threshold = 0.85,
duration = "1200s",
},
{
severity = "CRITICAL",
threshold = 1,
duration = "300s",
alignment_period = "60s",
}
])
memory_utilization = optional(list(object({
severity = optional(string, "WARNING"),
threshold = optional(number, 0.90)
alignment_period = optional(string, "300s")
duration = optional(string, "300s")
})), [
{
severity = "WARNING",
},
{
severity = "CRITICAL",
threshold = 0.95,
}
])
disk_utilization = optional(list(object({
severity = optional(string, "WARNING"),
threshold = optional(number, 0.85)
alignment_period = optional(string, "300s")
duration = optional(string, "600s")
})), [
{
severity = "WARNING",
},
{
severity = "CRITICAL",
threshold = 0.95,
}
])
})), {})
}) | n/a | yes |
| [kyverno](#input\_kyverno) | Configuration for Kyverno monitoring alerts. Allows customization of cluster name, project, notification channels, alert documentation, metric thresholds, auto-close timing, enablement, extra filters, and namespace. | object({
enabled = optional(bool, true)
cluster_name = string
project_id = optional(string, null)
notification_enabled = optional(bool, true)
notification_channels = optional(list(string), [])
# Rate limit for notifications, e.g. "300s" for 5 minutes, used only for log match alerts
logmatch_notification_rate_limit = optional(string, "300s")
alert_documentation = optional(string, null)
auto_close_seconds = optional(number, 3600)
filter_extra = optional(string, "")
namespace = optional(string, "kyverno")
}) | n/a | yes |
-| [litellm](#input\_litellm) | Configuration for LiteLLM monitoring alerts. Supports uptime checks for HTTP endpoints and container-level alerts (pod restarts) in GKE. Each app is identified by its name (map key). | object({
enabled = optional(bool, false)
project_id = optional(string, null)
notification_enabled = optional(bool, true)
notification_channels = optional(list(string), [])
cluster_name = optional(string, null)
apps = optional(map(object({
uptime_check = optional(object({
enabled = optional(bool, true)
host = string
path = optional(string, "/health/readiness")
}), null)
container_check = optional(object({
enabled = optional(bool, true)
namespace = string
pod_restart = optional(object({
threshold = optional(number, 0)
alignment_period = optional(number, 60)
duration = optional(number, 0)
auto_close_seconds = optional(number, 3600)
}), {})
}), null)
})), {})
}) | `{}` | no |
+| [litellm](#input\_litellm) | Configuration for LiteLLM monitoring alerts. Supports uptime checks for HTTP endpoints and container-level alerts (pod restarts) in GKE. Each app is identified by its name (map key). | object({
enabled = optional(bool, false)
project_id = optional(string, null)
notification_enabled = optional(bool, true)
notification_channels = optional(list(string), [])
cluster_name = optional(string, null)
apps = optional(map(object({
uptime_check = optional(object({
enabled = optional(bool, true)
host = string
path = optional(string, "/health/readiness")
}), null)
container_check = optional(object({
enabled = optional(bool, true)
namespace = string
pod_restart = optional(object({
threshold = optional(number, 0)
alignment_period = optional(number, 60)
duration = optional(number, 180)
auto_close_seconds = optional(number, 3600)
notification_prompts = optional(list(string), null)
}), {})
}), null)
})), {})
}) | `{}` | no |
| [notification\_channels](#input\_notification\_channels) | List of notification channel IDs to notify when an alert is triggered | `list(string)` | `[]` | no |
| [project\_id](#input\_project\_id) | The Google Cloud project ID where logging exclusions will be created | `string` | n/a | yes |
| [ssl\_alert](#input\_ssl\_alert) | Configuration for SSL certificate expiration alerts. Allows customization of project, notification channels, alert thresholds, and user labels. | object({
enabled = optional(bool, false)
project_id = optional(string, null)
notification_enabled = optional(bool, true)
notification_channels = optional(list(string), [])
threshold_days = optional(list(number), [15, 7])
user_labels = optional(map(string), {})
}) | `{}` | no |
-| [typesense](#input\_typesense) | Configuration for Typesense monitoring alerts. Supports uptime checks for HTTP endpoints and container-level alerts (pod restarts) in GKE. Each app is identified by its name (map key). | object({
enabled = optional(bool, false)
project_id = optional(string, null)
notification_enabled = optional(bool, true)
notification_channels = optional(list(string), [])
cluster_name = optional(string, null) # GKE cluster name for container checks
# Apps configuration - map keyed by app_name
apps = optional(map(object({
# Uptime check configuration (optional)
uptime_check = optional(object({
enabled = optional(bool, true)
host = string
path = optional(string, "/readyz")
}), null)
# Container check configuration for GKE (optional)
container_check = optional(object({
enabled = optional(bool, true)
namespace = string
pod_restart = optional(object({
threshold = optional(number, 0)
alignment_period = optional(number, 60)
duration = optional(number, 0)
auto_close_seconds = optional(number, 3600)
}), {})
}), null)
})), {})
}) | `{}` | no |
+| [typesense](#input\_typesense) | Configuration for Typesense monitoring alerts. Supports uptime checks for HTTP endpoints and container-level alerts (pod restarts) in GKE. Each app is identified by its name (map key). | object({
enabled = optional(bool, false)
project_id = optional(string, null)
notification_enabled = optional(bool, true)
notification_channels = optional(list(string), [])
cluster_name = optional(string, null)
apps = optional(map(object({
uptime_check = optional(object({
enabled = optional(bool, true)
host = string
path = optional(string, "/readyz")
}), null)
container_check = optional(object({
enabled = optional(bool, true)
namespace = string
pod_restart = optional(object({
threshold = optional(number, 0)
alignment_period = optional(number, 60)
duration = optional(number, 180)
auto_close_seconds = optional(number, 3600)
notification_prompts = optional(list(string), null)
}), {})
}), null)
})), {})
}) | `{}` | no |
## Outputs
diff --git a/examples/main.tf b/examples/main.tf
index 92ffa3b..5ab7c75 100644
--- a/examples/main.tf
+++ b/examples/main.tf
@@ -42,15 +42,13 @@ locals {
}
module "example" {
- source = "github.com/sparkfabrik/terraform-google-services-monitoring"
- version = ">= 0.1.0"
+ source = "github.com/sparkfabrik/terraform-google-services-monitoring?ref=0.9.0"
notification_channels = var.notification_channels
project_id = var.project_id
cloud_sql = local.cloud_sql
kyverno = {
cluster_name = "test-cluster"
- enabled = true
notification_channels = []
# Optional filter for log entries, exclude known non-actionable messages
# e.g., "-textPayload:\"stale GroupVersion discovery: metrics.k8s.io/v1beta1\""
@@ -59,7 +57,44 @@ module "example" {
cert_manager = {
cluster_name = "test-cluster"
namespace = "cert-manager"
- enabled = true
- notification_channels = []
+ }
+
+ typesense = {
+ enabled = true # opt-in: typesense.enabled defaults to false
+ cluster_name = "test-cluster"
+ apps = {
+ "typesense-app" = {
+ uptime_check = {
+ host = "typesense.example.com"
+ }
+ container_check = {
+ enabled = true
+ namespace = "typesense"
+ pod_restart = {
+ threshold = 1
+ }
+ }
+ }
+ }
+ }
+
+ litellm = {
+ enabled = true # opt-in: litellm.enabled defaults to false
+ cluster_name = "test-cluster"
+ apps = {
+ "litellm-app" = {
+ uptime_check = {
+ host = "litellm.example.com"
+ }
+ container_check = {
+ namespace = "litellm"
+ pod_restart = {
+ threshold = 2
+ duration = 300
+ notification_prompts = ["CLOSED"]
+ }
+ }
+ }
+ }
}
}
diff --git a/lite_llm.tf b/lite_llm.tf
index a6f38cf..a7ca9a6 100644
--- a/lite_llm.tf
+++ b/lite_llm.tf
@@ -75,5 +75,6 @@ resource "google_monitoring_alert_policy" "litellm_pod_restart" {
alert_strategy {
auto_close = "${each.value.pod_restart.auto_close_seconds}s"
+ notification_prompts = each.value.pod_restart.notification_prompts
}
}
diff --git a/typesense.tf b/typesense.tf
index 07fb529..4e69f40 100644
--- a/typesense.tf
+++ b/typesense.tf
@@ -75,5 +75,6 @@ resource "google_monitoring_alert_policy" "typesense_pod_restart" {
alert_strategy {
auto_close = "${each.value.pod_restart.auto_close_seconds}s"
+ notification_prompts = each.value.pod_restart.notification_prompts
}
}
diff --git a/variables.tf b/variables.tf
index 54d748a..8596ea9 100644
--- a/variables.tf
+++ b/variables.tf
@@ -107,26 +107,24 @@ variable "typesense" {
project_id = optional(string, null)
notification_enabled = optional(bool, true)
notification_channels = optional(list(string), [])
- cluster_name = optional(string, null) # GKE cluster name for container checks
+ cluster_name = optional(string, null)
- # Apps configuration - map keyed by app_name
apps = optional(map(object({
- # Uptime check configuration (optional)
uptime_check = optional(object({
enabled = optional(bool, true)
host = string
path = optional(string, "/readyz")
}), null)
- # Container check configuration for GKE (optional)
container_check = optional(object({
enabled = optional(bool, true)
namespace = string
pod_restart = optional(object({
threshold = optional(number, 0)
alignment_period = optional(number, 60)
- duration = optional(number, 0)
+ duration = optional(number, 180)
auto_close_seconds = optional(number, 3600)
+ notification_prompts = optional(list(string), null)
}), {})
}), null)
})), {})
@@ -175,8 +173,9 @@ variable "litellm" {
pod_restart = optional(object({
threshold = optional(number, 0)
alignment_period = optional(number, 60)
- duration = optional(number, 0)
+ duration = optional(number, 180)
auto_close_seconds = optional(number, 3600)
+ notification_prompts = optional(list(string), null)
}), {})
}), null)
})), {})