Skip to content

Commit ac6d35e

Browse files
committed
HYPERFLEET-856 - feat: add deletion observability metrics and alerts
Add Prometheus metrics to track the lifecycle of resource deletion: - pending_deletion_total counter: counts resources entering the Pending Deletion state - pending_deletion_duration_seconds histogram: measures the time from soft-delete to hard-delete - pending_deletion_stuck gauge (collector): reports resources stuck beyond the configured threshold. Includes PrometheusRule alerts (warning at 1h, critical at 2.5h total), partial indexes on deleted_time for efficient collector queries, and integration tests for the full metrics pipeline.
1 parent da061c6 commit ac6d35e

15 files changed

Lines changed: 741 additions & 5 deletions

File tree

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
{{- if .Values.prometheusRule.enabled }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ include "hyperfleet-api.fullname" . }}
  namespace: {{ .Values.prometheusRule.namespace | default .Release.Namespace }}
  labels:
    {{- include "hyperfleet-api.labels" . | nindent 4 }}
    {{- with .Values.prometheusRule.labels }}
    {{- toYaml . | nindent 4 }}
    {{- end }}
spec:
  groups:
  - name: hyperfleet-api-deletion
    rules:
    # Warning fires after the stuck threshold plus the alert "for" delay
    # (defaults: 30m threshold + 30m delay = 1h total).
    - alert: HyperFleetResourceDeletionStuckWarning
      expr: max by (namespace, resource_type)(hyperfleet_api_resource_pending_deletion_stuck) > 0
      for: {{ .Values.prometheusRule.rules.deletionStuck.for | default "30m" }}
      labels:
        severity: warning
      annotations:
        summary: "HyperFleet resources stuck in Pending Deletion state"
        description: >-
          {{ "{{ $value }}" }} {{ "{{ $labels.resource_type }}" }} resource(s) have been in
          Pending Deletion state for more than {{ .Values.config.metrics.deletion_stuck_threshold | default "30m" }}
          (stuck threshold) + {{ .Values.prometheusRule.rules.deletionStuck.for | default "30m" }} (alert delay).
        runbook_url: {{ .Values.prometheusRule.rules.deletionStuck.runbookUrl | default "" | quote }}
    # Critical uses the same expression but a longer hold time
    # (defaults: 30m threshold + 2h delay = 2.5h total).
    - alert: HyperFleetResourceDeletionStuckCritical
      expr: max by (namespace, resource_type)(hyperfleet_api_resource_pending_deletion_stuck) > 0
      for: {{ .Values.prometheusRule.rules.deletionTimeout.for | default "2h" }}
      labels:
        severity: critical
      annotations:
        summary: "HyperFleet resources timed out in Pending Deletion state"
        description: >-
          {{ "{{ $value }}" }} {{ "{{ $labels.resource_type }}" }} resource(s) have been in
          Pending Deletion state for more than {{ .Values.config.metrics.deletion_stuck_threshold | default "30m" }}
          (stuck threshold) + {{ .Values.prometheusRule.rules.deletionTimeout.for | default "2h" }} (alert delay). Immediate investigation required.
        runbook_url: {{ .Values.prometheusRule.rules.deletionTimeout.runbookUrl | default "" | quote }}
{{- end }}

charts/values.yaml

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,7 @@ config:
126126
enabled: false
127127

128128
label_metrics_inclusion_duration: 168h
129+
deletion_stuck_threshold: 30m
129130

130131
# Health check configuration
131132
health:
@@ -243,6 +244,19 @@ database:
243244
size: 1Gi
244245
storageClass: ""
245246

247+
# PrometheusRule for alerting
248+
prometheusRule:
249+
enabled: false
250+
labels: {}
251+
namespace: ""
252+
rules:
253+
deletionStuck:
254+
for: "30m"
255+
runbookUrl: ""
256+
deletionTimeout:
257+
for: "2h"
258+
runbookUrl: ""
259+
246260
# ServiceMonitor for Prometheus Operator
247261
serviceMonitor:
248262
enabled: false

cmd/hyperfleet-api/servecmd/cmd.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ import (
1818
"github.com/openshift-hyperfleet/hyperfleet-api/pkg/db/db_session"
1919
"github.com/openshift-hyperfleet/hyperfleet-api/pkg/health"
2020
"github.com/openshift-hyperfleet/hyperfleet-api/pkg/logger"
21+
"github.com/openshift-hyperfleet/hyperfleet-api/pkg/metrics"
2122
"github.com/openshift-hyperfleet/hyperfleet-api/pkg/telemetry"
2223
)
2324

@@ -129,6 +130,15 @@ func runServe(cmd *cobra.Command, args []string) {
129130
"masking_enabled", environments.Environment().Config.Logging.Masking.Enabled,
130131
).Info("Logger initialized")
131132

133+
if sf := environments.Environment().Database.SessionFactory; sf != nil {
134+
if err := metrics.RegisterCollector(
135+
sf.DirectDB(),
136+
environments.Environment().Config.Metrics.DeletionStuckThreshold,
137+
); err != nil {
138+
logger.WithError(ctx, err).Error("Failed to register pending deletion collector")
139+
}
140+
}
141+
132142
apiServer := server.NewAPIServer(tracingEnabled)
133143
go apiServer.Start()
134144

cmd/hyperfleet-api/server/metrics_middleware.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ import (
6262

6363
"github.com/openshift-hyperfleet/hyperfleet-api/pkg/api"
6464
"github.com/openshift-hyperfleet/hyperfleet-api/pkg/db/db_metrics"
65+
"github.com/openshift-hyperfleet/hyperfleet-api/pkg/metrics"
6566
)
6667

6768
// MetricsMiddleware creates a new handler that collects metrics for the requests processed by the
@@ -112,6 +113,7 @@ func ResetMetricCollectors() {
112113
requestCountMetric.Reset()
113114
requestDurationMetric.Reset()
114115
db_metrics.ResetMetrics()
116+
metrics.ResetMetrics()
115117
buildInfoMetric.Reset()
116118
buildInfoMetric.With(prometheus.Labels{
117119
metricsComponentLabel: metricsComponentValue,

docs/metrics.md

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,69 @@ hyperfleet_api_request_duration_seconds_sum{component="api",version="abc123",cod
9999
hyperfleet_api_request_duration_seconds_count{component="api",version="abc123",code="200",method="GET",path="/api/hyperfleet/v1/clusters"} 1523
100100
```
101101

102+
### Deletion Observability Metrics
103+
104+
These metrics track resources in the Pending Deletion state (`deleted_time` set, pending hard-delete by adapters).
105+
106+
#### `hyperfleet_api_resource_pending_deletion_total`
107+
108+
**Type:** Counter
109+
110+
**Description:** Total number of resources that entered the Pending Deletion state (`deleted_time` set).
111+
112+
**Labels:**
113+
114+
| Label | Description | Example Values |
115+
|-------|-------------|----------------|
116+
| `resource_type` | Type of resource | `cluster`, `nodepool` |
117+
| `component` | Component name | `api` |
118+
| `version` | Application version | `abc123` |
119+
120+
**Example output:**
121+
122+
```text
123+
hyperfleet_api_resource_pending_deletion_total{component="api",resource_type="cluster",version="abc123"} 42
124+
hyperfleet_api_resource_pending_deletion_total{component="api",resource_type="nodepool",version="abc123"} 156
125+
```
126+
127+
#### `hyperfleet_api_resource_pending_deletion_duration_seconds`
128+
129+
**Type:** Histogram
130+
131+
**Description:** Duration from pending deletion (`deleted_time` set) to hard-delete completion in seconds. Observed when a resource is hard-deleted after all adapters report `Finalized=True`.
132+
133+
**Labels:** Same as `hyperfleet_api_resource_pending_deletion_total`
134+
135+
**Buckets:** `1s`, `5s`, `10s`, `30s`, `60s`, `120s`, `300s`, `600s`, `1800s`, `3600s`
136+
137+
**Note:** This metric is populated when the hard-delete flow is active. See the [hard-delete design](https://github.com/openshift-hyperfleet/architecture/blob/main/hyperfleet/components/api-service/hard-delete-design.md) for details.
138+
139+
#### `hyperfleet_api_resource_pending_deletion_stuck`
140+
141+
**Type:** Gauge (Collector)
142+
143+
**Description:** Number of resources in Pending Deletion state beyond the stuck threshold (default 30 minutes). This gauge is computed on each Prometheus scrape by querying the database for resources with `deleted_time` set before the threshold.
144+
145+
**Labels:** Same as `hyperfleet_api_resource_pending_deletion_total`
146+
147+
**Configuration:** The stuck threshold is configurable via `--metrics-deletion-stuck-threshold` (default `30m`).
148+
149+
**Example output:**
150+
151+
```text
152+
hyperfleet_api_resource_pending_deletion_stuck{component="api",resource_type="cluster",version="abc123"} 2
153+
hyperfleet_api_resource_pending_deletion_stuck{component="api",resource_type="nodepool",version="abc123"} 0
154+
```
155+
156+
### Deletion Alerts
157+
158+
Two alerts are available via the PrometheusRule (requires `prometheusRule.enabled=true` in Helm values):
159+
160+
| Alert | Severity | Condition | Description |
161+
|-------|----------|-----------|-------------|
162+
| `HyperFleetResourceDeletionStuckWarning` | Warning | `resource_pending_deletion_stuck > 0` for 30m | Resources stuck in Pending Deletion beyond 1 hour |
163+
| `HyperFleetResourceDeletionStuckCritical` | Critical | `resource_pending_deletion_stuck > 0` for 2h | Resources stuck in Pending Deletion beyond 2.5 hours |
164+
102165
## Go Runtime Metrics
103166

104167
The following metrics are automatically exposed by the Prometheus Go client library.
@@ -255,6 +318,26 @@ rate(process_cpu_seconds_total[5m])
255318
process_open_fds / process_max_fds * 100
256319
```
257320

321+
### Deletion Observability
322+
323+
```promql
324+
# Resources entering Pending Deletion state per minute
325+
sum by (resource_type) (rate(hyperfleet_api_resource_pending_deletion_total[5m])) * 60
326+
327+
# Resources currently stuck in Pending Deletion state
328+
hyperfleet_api_resource_pending_deletion_stuck
329+
330+
# Stuck resources by type
331+
sum by (resource_type) (hyperfleet_api_resource_pending_deletion_stuck)
332+
333+
# Average pending deletion duration (once hard-delete is active)
334+
sum by (resource_type) (rate(hyperfleet_api_resource_pending_deletion_duration_seconds_sum[5m])) /
335+
sum by (resource_type) (rate(hyperfleet_api_resource_pending_deletion_duration_seconds_count[5m]))
336+
337+
# P99 pending deletion duration
338+
histogram_quantile(0.99, sum by (le, resource_type) (rate(hyperfleet_api_resource_pending_deletion_duration_seconds_bucket[5m])))
339+
```
340+
258341
### Common Investigation Queries
259342

260343
```promql

pkg/config/flags.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,8 @@ func AddMetricsFlags(cmd *cobra.Command) {
7979
cmd.Flags().String("metrics-tls-key-file", defaults.TLS.KeyFile, "Path to TLS key file for metrics")
8080
cmd.Flags().Duration("metrics-label-metrics-inclusion-duration", defaults.LabelMetricsInclusionDuration,
8181
"Duration for cluster telemetry label inclusion")
82+
cmd.Flags().Duration("metrics-deletion-stuck-threshold", defaults.DeletionStuckThreshold,
83+
"Duration after which a pending deletion resource is considered stuck")
8284
}
8385

8486
// AddHealthFlags adds health check configuration flags following standard naming

pkg/config/loader.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -185,6 +185,9 @@ func (l *ConfigLoader) validateConfig(config *ApplicationConfig) error {
185185
if valErr := config.Metrics.TLS.Validate(); valErr != nil {
186186
return fmt.Errorf("metrics TLS validation failed: %w", valErr)
187187
}
188+
if valErr := config.Metrics.Validate(); valErr != nil {
189+
return fmt.Errorf("metrics config validation failed: %w", valErr)
190+
}
188191
return nil
189192
}
190193

@@ -345,6 +348,7 @@ func (l *ConfigLoader) bindAllEnvVars() {
345348
l.bindEnv("metrics.port")
346349
l.bindEnv("metrics.tls.enabled")
347350
l.bindEnv("metrics.label_metrics_inclusion_duration")
351+
l.bindEnv("metrics.deletion_stuck_threshold")
348352

349353
// Health config
350354
l.bindEnv("health.host")
@@ -411,6 +415,8 @@ func (l *ConfigLoader) bindFlags(cmd *cobra.Command) {
411415
l.bindPFlag("metrics.tls.key_file", cmd.Flags().Lookup("metrics-tls-key-file"))
412416
l.bindPFlag("metrics.label_metrics_inclusion_duration",
413417
cmd.Flags().Lookup("metrics-label-metrics-inclusion-duration"))
418+
l.bindPFlag("metrics.deletion_stuck_threshold",
419+
cmd.Flags().Lookup("metrics-deletion-stuck-threshold"))
414420

415421
// Health flags: --health-* -> health.*
416422
l.bindPFlag("health.host", cmd.Flags().Lookup("health-host"))

pkg/config/metrics.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package config
22

33
import (
4+
"fmt"
45
"net"
56
"strconv"
67
"time"
@@ -13,6 +14,7 @@ type MetricsConfig struct {
1314
TLS TLSConfig `mapstructure:"tls" json:"tls" validate:"required"`
1415
Port int `mapstructure:"port" json:"port" validate:"required,min=1,max=65535"`
1516
LabelMetricsInclusionDuration time.Duration `mapstructure:"label_metrics_inclusion_duration" json:"label_metrics_inclusion_duration" validate:"required"` //nolint:lll
17+
DeletionStuckThreshold time.Duration `mapstructure:"deletion_stuck_threshold" json:"deletion_stuck_threshold" validate:"required"` //nolint:lll
1618
}
1719

1820
// NewMetricsConfig returns default MetricsConfig values
@@ -25,9 +27,18 @@ func NewMetricsConfig() *MetricsConfig {
2527
Enabled: false,
2628
},
2729
LabelMetricsInclusionDuration: 168 * time.Hour, // 7 days
30+
DeletionStuckThreshold: 30 * time.Minute,
2831
}
2932
}
3033

34+
// Validate validates MetricsConfig fields that struct tags cannot enforce
35+
func (m *MetricsConfig) Validate() error {
36+
if m.DeletionStuckThreshold <= 0 {
37+
return fmt.Errorf("DeletionStuckThreshold must be positive, got %v", m.DeletionStuckThreshold)
38+
}
39+
return nil
40+
}
41+
3142
// ============================================================
3243
// Convenience Accessor Methods
3344
// ============================================================
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
package migrations
2+
3+
import (
4+
"gorm.io/gorm"
5+
6+
"github.com/go-gormigrate/gormigrate/v2"
7+
)
8+
9+
func addDeletedTimeIndexes() *gormigrate.Migration {
10+
return &gormigrate.Migration{
11+
ID: "202604290001",
12+
Migrate: func(tx *gorm.DB) error {
13+
// Partial indexes for metrics collector queries:
14+
// SELECT COUNT(*) FROM clusters WHERE deleted_time IS NOT NULL AND deleted_time < $1
15+
// SELECT COUNT(*) FROM node_pools WHERE deleted_time IS NOT NULL AND deleted_time < $1
16+
if err := tx.Exec("CREATE INDEX IF NOT EXISTS idx_clusters_deleted_time ON clusters(deleted_time) WHERE deleted_time IS NOT NULL;").Error; err != nil { //nolint:lll
17+
return err
18+
}
19+
if err := tx.Exec("CREATE INDEX IF NOT EXISTS idx_node_pools_deleted_time ON node_pools(deleted_time) WHERE deleted_time IS NOT NULL;").Error; err != nil { //nolint:lll
20+
return err
21+
}
22+
return nil
23+
},
24+
}
25+
}

pkg/db/migrations/migration_structs.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ var MigrationList = []*gormigrate.Migration{
3535
addSoftDeleteSchema(),
3636
addNodePoolOwnerDeletedIndex(),
3737
addReconciledIndex(),
38+
addDeletedTimeIndexes(),
3839
}
3940

4041
// Model represents the base model struct. All entities will have this struct embedded.

0 commit comments

Comments
 (0)