Skip to content

Commit ac6d35e

Browse files
committed
HYPERFLEET-856 - feat: add deletion observability metrics and alerts
Add Prometheus metrics to track the lifecycle of resource deletion: - pending_deletion_total counter: counts resources entering the Pending Deletion state - pending_deletion_duration_seconds histogram: measures the time from soft-delete to hard-delete - pending_deletion_stuck gauge (collector): reports resources stuck beyond the configured threshold. Includes PrometheusRule alerts (warning at 1h, critical at 2.5h total), partial indexes on deleted_time for efficient collector queries, and integration tests for the full metrics pipeline.
1 parent da061c6 commit ac6d35e

15 files changed

Lines changed: 741 additions & 5 deletions

File tree

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
{{- if .Values.prometheusRule.enabled }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ include "hyperfleet-api.fullname" . }}
  namespace: {{ .Values.prometheusRule.namespace | default .Release.Namespace }}
  labels:
    {{- include "hyperfleet-api.labels" . | nindent 4 }}
    {{- with .Values.prometheusRule.labels }}
    {{- toYaml . | nindent 4 }}
    {{- end }}
spec:
  groups:
  - name: hyperfleet-api-deletion
    rules:
    # Warning fires after the stuck threshold plus the alert "for" delay
    # (defaults: 30m threshold + 30m delay = 1h total).
    - alert: HyperFleetResourceDeletionStuckWarning
      expr: max by (namespace, resource_type)(hyperfleet_api_resource_pending_deletion_stuck) > 0
      for: {{ .Values.prometheusRule.rules.deletionStuck.for | default "30m" }}
      labels:
        severity: warning
      annotations:
        summary: "HyperFleet resources stuck in Pending Deletion state"
        description: >-
          {{ "{{ $value }}" }} {{ "{{ $labels.resource_type }}" }} resource(s) have been in
          Pending Deletion state for more than {{ .Values.config.metrics.deletion_stuck_threshold | default "30m" }}
          (stuck threshold) + {{ .Values.prometheusRule.rules.deletionStuck.for | default "30m" }} (alert delay).
        runbook_url: {{ .Values.prometheusRule.rules.deletionStuck.runbookUrl | default "" | quote }}
    # Critical uses the same expression but a longer hold time
    # (defaults: 30m threshold + 2h delay = 2.5h total).
    - alert: HyperFleetResourceDeletionStuckCritical
      expr: max by (namespace, resource_type)(hyperfleet_api_resource_pending_deletion_stuck) > 0
      for: {{ .Values.prometheusRule.rules.deletionTimeout.for | default "2h" }}
      labels:
        severity: critical
      annotations:
        summary: "HyperFleet resources timed out in Pending Deletion state"
        description: >-
          {{ "{{ $value }}" }} {{ "{{ $labels.resource_type }}" }} resource(s) have been in
          Pending Deletion state for more than {{ .Values.config.metrics.deletion_stuck_threshold | default "30m" }}
          (stuck threshold) + {{ .Values.prometheusRule.rules.deletionTimeout.for | default "2h" }} (alert delay). Immediate investigation required.
        runbook_url: {{ .Values.prometheusRule.rules.deletionTimeout.runbookUrl | default "" | quote }}
{{- end }}

charts/values.yaml

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,7 @@ config:
126126
enabled: false
127127

128128
label_metrics_inclusion_duration: 168h
129+
deletion_stuck_threshold: 30m
129130

130131
# Health check configuration
131132
health:
@@ -243,6 +244,19 @@ database:
243244
size: 1Gi
244245
storageClass: ""
245246

247+
# PrometheusRule for alerting
248+
prometheusRule:
249+
enabled: false
250+
labels: {}
251+
namespace: ""
252+
rules:
253+
deletionStuck:
254+
for: "30m"
255+
runbookUrl: ""
256+
deletionTimeout:
257+
for: "2h"
258+
runbookUrl: ""
259+
246260
# ServiceMonitor for Prometheus Operator
247261
serviceMonitor:
248262
enabled: false

cmd/hyperfleet-api/servecmd/cmd.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ import (
1818
"github.com/openshift-hyperfleet/hyperfleet-api/pkg/db/db_session"
1919
"github.com/openshift-hyperfleet/hyperfleet-api/pkg/health"
2020
"github.com/openshift-hyperfleet/hyperfleet-api/pkg/logger"
21+
"github.com/openshift-hyperfleet/hyperfleet-api/pkg/metrics"
2122
"github.com/openshift-hyperfleet/hyperfleet-api/pkg/telemetry"
2223
)
2324

@@ -129,6 +130,15 @@ func runServe(cmd *cobra.Command, args []string) {
129130
"masking_enabled", environments.Environment().Config.Logging.Masking.Enabled,
130131
).Info("Logger initialized")
131132

133+
if sf := environments.Environment().Database.SessionFactory; sf != nil {
134+
if err := metrics.RegisterCollector(
135+
sf.DirectDB(),
136+
environments.Environment().Config.Metrics.DeletionStuckThreshold,
137+
); err != nil {
138+
logger.WithError(ctx, err).Error("Failed to register pending deletion collector")
139+
}
140+
}
141+
132142
apiServer := server.NewAPIServer(tracingEnabled)
133143
go apiServer.Start()
134144

cmd/hyperfleet-api/server/metrics_middleware.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ import (
6262

6363
"github.com/openshift-hyperfleet/hyperfleet-api/pkg/api"
6464
"github.com/openshift-hyperfleet/hyperfleet-api/pkg/db/db_metrics"
65+
"github.com/openshift-hyperfleet/hyperfleet-api/pkg/metrics"
6566
)
6667

6768
// MetricsMiddleware creates a new handler that collects metrics for the requests processed by the
@@ -112,6 +113,7 @@ func ResetMetricCollectors() {
112113
requestCountMetric.Reset()
113114
requestDurationMetric.Reset()
114115
db_metrics.ResetMetrics()
116+
metrics.ResetMetrics()
115117
buildInfoMetric.Reset()
116118
buildInfoMetric.With(prometheus.Labels{
117119
metricsComponentLabel: metricsComponentValue,

docs/metrics.md

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,69 @@ hyperfleet_api_request_duration_seconds_sum{component="api",version="abc123",cod
9999
hyperfleet_api_request_duration_seconds_count{component="api",version="abc123",code="200",method="GET",path="/api/hyperfleet/v1/clusters"} 1523
100100
```
101101

102+
### Deletion Observability Metrics
103+
104+
These metrics track resources in the Pending Deletion state (`deleted_time` set, pending hard-delete by adapters).
105+
106+
#### `hyperfleet_api_resource_pending_deletion_total`
107+
108+
**Type:** Counter
109+
110+
**Description:** Total number of resources that entered the Pending Deletion state (`deleted_time` set).
111+
112+
**Labels:**
113+
114+
| Label | Description | Example Values |
115+
|-------|-------------|----------------|
116+
| `resource_type` | Type of resource | `cluster`, `nodepool` |
117+
| `component` | Component name | `api` |
118+
| `version` | Application version | `abc123` |
119+
120+
**Example output:**
121+
122+
```text
123+
hyperfleet_api_resource_pending_deletion_total{component="api",resource_type="cluster",version="abc123"} 42
124+
hyperfleet_api_resource_pending_deletion_total{component="api",resource_type="nodepool",version="abc123"} 156
125+
```
126+
127+
#### `hyperfleet_api_resource_pending_deletion_duration_seconds`
128+
129+
**Type:** Histogram
130+
131+
**Description:** Duration from pending deletion (`deleted_time` set) to hard-delete completion in seconds. Observed when a resource is hard-deleted after all adapters report `Finalized=True`.
132+
133+
**Labels:** Same as `hyperfleet_api_resource_pending_deletion_total`
134+
135+
**Buckets:** `1s`, `5s`, `10s`, `30s`, `60s`, `120s`, `300s`, `600s`, `1800s`, `3600s`
136+
137+
**Note:** This metric is populated when the hard-delete flow is active. See the [hard-delete design](https://github.com/openshift-hyperfleet/architecture/blob/main/hyperfleet/components/api-service/hard-delete-design.md) for details.
138+
139+
#### `hyperfleet_api_resource_pending_deletion_stuck`
140+
141+
**Type:** Gauge (Collector)
142+
143+
**Description:** Number of resources in Pending Deletion state beyond the stuck threshold (default 30 minutes). This gauge is computed on each Prometheus scrape by querying the database for resources with `deleted_time` set before the threshold.
144+
145+
**Labels:** Same as `hyperfleet_api_resource_pending_deletion_total`
146+
147+
**Configuration:** The stuck threshold is configurable via `--metrics-deletion-stuck-threshold` (default `30m`).
148+
149+
**Example output:**
150+
151+
```text
152+
hyperfleet_api_resource_pending_deletion_stuck{component="api",resource_type="cluster",version="abc123"} 2
153+
hyperfleet_api_resource_pending_deletion_stuck{component="api",resource_type="nodepool",version="abc123"} 0
154+
```
155+
156+
### Deletion Alerts
157+
158+
Two alerts are available via the PrometheusRule (requires `prometheusRule.enabled=true` in Helm values):
159+
160+
| Alert | Severity | Condition | Description |
161+
|-------|----------|-----------|-------------|
162+
| `HyperFleetResourceDeletionStuckWarning` | Warning | `resource_pending_deletion_stuck > 0` for 30m | Resources stuck in Pending Deletion beyond 1 hour |
163+
| `HyperFleetResourceDeletionStuckCritical` | Critical | `resource_pending_deletion_stuck > 0` for 2h | Resources stuck in Pending Deletion beyond 2.5 hours |
164+
102165
## Go Runtime Metrics
103166

104167
The following metrics are automatically exposed by the Prometheus Go client library.
@@ -255,6 +318,26 @@ rate(process_cpu_seconds_total[5m])
255318
process_open_fds / process_max_fds * 100
256319
```
257320

321+
### Deletion Observability
322+
323+
```promql
324+
# Resources entering Pending Deletion state per minute
325+
sum by (resource_type) (rate(hyperfleet_api_resource_pending_deletion_total[5m])) * 60
326+
327+
# Resources currently stuck in Pending Deletion state
328+
hyperfleet_api_resource_pending_deletion_stuck
329+
330+
# Stuck resources by type
331+
sum by (resource_type) (hyperfleet_api_resource_pending_deletion_stuck)
332+
333+
# Average pending deletion duration (once hard-delete is active)
334+
sum by (resource_type) (rate(hyperfleet_api_resource_pending_deletion_duration_seconds_sum[5m])) /
335+
sum by (resource_type) (rate(hyperfleet_api_resource_pending_deletion_duration_seconds_count[5m]))
336+
337+
# P99 pending deletion duration
338+
histogram_quantile(0.99, sum by (le, resource_type) (rate(hyperfleet_api_resource_pending_deletion_duration_seconds_bucket[5m])))
339+
```
340+
258341
### Common Investigation Queries
259342

260343
```promql

pkg/config/flags.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,8 @@ func AddMetricsFlags(cmd *cobra.Command) {
7979
cmd.Flags().String("metrics-tls-key-file", defaults.TLS.KeyFile, "Path to TLS key file for metrics")
8080
cmd.Flags().Duration("metrics-label-metrics-inclusion-duration", defaults.LabelMetricsInclusionDuration,
8181
"Duration for cluster telemetry label inclusion")
82+
cmd.Flags().Duration("metrics-deletion-stuck-threshold", defaults.DeletionStuckThreshold,
83+
"Duration after which a pending deletion resource is considered stuck")
8284
}
8385

8486
// AddHealthFlags adds health check configuration flags following standard naming

pkg/config/loader.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -185,6 +185,9 @@ func (l *ConfigLoader) validateConfig(config *ApplicationConfig) error {
185185
if valErr := config.Metrics.TLS.Validate(); valErr != nil {
186186
return fmt.Errorf("metrics TLS validation failed: %w", valErr)
187187
}
188+
if valErr := config.Metrics.Validate(); valErr != nil {
189+
return fmt.Errorf("metrics config validation failed: %w", valErr)
190+
}
188191
return nil
189192
}
190193

@@ -345,6 +348,7 @@ func (l *ConfigLoader) bindAllEnvVars() {
345348
l.bindEnv("metrics.port")
346349
l.bindEnv("metrics.tls.enabled")
347350
l.bindEnv("metrics.label_metrics_inclusion_duration")
351+
l.bindEnv("metrics.deletion_stuck_threshold")
348352

349353
// Health config
350354
l.bindEnv("health.host")
@@ -411,6 +415,8 @@ func (l *ConfigLoader) bindFlags(cmd *cobra.Command) {
411415
l.bindPFlag("metrics.tls.key_file", cmd.Flags().Lookup("metrics-tls-key-file"))
412416
l.bindPFlag("metrics.label_metrics_inclusion_duration",
413417
cmd.Flags().Lookup("metrics-label-metrics-inclusion-duration"))
418+
l.bindPFlag("metrics.deletion_stuck_threshold",
419+
cmd.Flags().Lookup("metrics-deletion-stuck-threshold"))
414420

415421
// Health flags: --health-* -> health.*
416422
l.bindPFlag("health.host", cmd.Flags().Lookup("health-host"))

pkg/config/metrics.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package config
22

33
import (
4+
"fmt"
45
"net"
56
"strconv"
67
"time"
@@ -13,6 +14,7 @@ type MetricsConfig struct {
1314
TLS TLSConfig `mapstructure:"tls" json:"tls" validate:"required"`
1415
Port int `mapstructure:"port" json:"port" validate:"required,min=1,max=65535"`
1516
LabelMetricsInclusionDuration time.Duration `mapstructure:"label_metrics_inclusion_duration" json:"label_metrics_inclusion_duration" validate:"required"` //nolint:lll
17+
DeletionStuckThreshold time.Duration `mapstructure:"deletion_stuck_threshold" json:"deletion_stuck_threshold" validate:"required"` //nolint:lll
1618
}
1719

1820
// NewMetricsConfig returns default MetricsConfig values
@@ -25,9 +27,18 @@ func NewMetricsConfig() *MetricsConfig {
2527
Enabled: false,
2628
},
2729
LabelMetricsInclusionDuration: 168 * time.Hour, // 7 days
30+
DeletionStuckThreshold: 30 * time.Minute,
2831
}
2932
}
3033

34+
// Validate validates MetricsConfig fields that struct tags cannot enforce
35+
func (m *MetricsConfig) Validate() error {
36+
if m.DeletionStuckThreshold <= 0 {
37+
return fmt.Errorf("DeletionStuckThreshold must be positive, got %v", m.DeletionStuckThreshold)
38+
}
39+
return nil
40+
}
41+
3142
// ============================================================
3243
// Convenience Accessor Methods
3344
// ============================================================
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
package migrations
2+
3+
import (
4+
"gorm.io/gorm"
5+
6+
"github.com/go-gormigrate/gormigrate/v2"
7+
)
8+
9+
func addDeletedTimeIndexes() *gormigrate.Migration {
10+
return &gormigrate.Migration{
11+
ID: "202604290001",
12+
Migrate: func(tx *gorm.DB) error {
13+
// Partial indexes for metrics collector queries:
14+
// SELECT COUNT(*) FROM clusters WHERE deleted_time IS NOT NULL AND deleted_time < $1
15+
// SELECT COUNT(*) FROM node_pools WHERE deleted_time IS NOT NULL AND deleted_time < $1
16+
if err := tx.Exec("CREATE INDEX IF NOT EXISTS idx_clusters_deleted_time ON clusters(deleted_time) WHERE deleted_time IS NOT NULL;").Error; err != nil { //nolint:lll
17+
return err
18+
}
19+
if err := tx.Exec("CREATE INDEX IF NOT EXISTS idx_node_pools_deleted_time ON node_pools(deleted_time) WHERE deleted_time IS NOT NULL;").Error; err != nil { //nolint:lll
20+
return err
21+
}
22+
return nil
23+
},
24+
}
25+
}

pkg/db/migrations/migration_structs.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ var MigrationList = []*gormigrate.Migration{
3535
addSoftDeleteSchema(),
3636
addNodePoolOwnerDeletedIndex(),
3737
addReconciledIndex(),
38+
addDeletedTimeIndexes(),
3839
}
3940

4041
// Model represents the base model struct. All entities will have this struct embedded.

0 commit comments

Comments
 (0)