Commit 6487c64

HYPERFLEET-856 - fix: address PR review findings

- Aggregate alert exprs with max by(resource_type) to deduplicate across pods
- Template alert description durations from values instead of hard-coding
- Add context timeout (5s) to TerminatingCollector.Collect DB queries
- Record nodepool termination metrics on cluster cascade delete
- Validate DeletionStuckThreshold > 0 in config loader

1 parent 6019453 · commit 6487c64
5 files changed: 30 additions & 6 deletions

charts/templates/prometheusrule.yaml

Lines changed: 4 additions & 4 deletions
```diff
@@ -14,25 +14,25 @@ spec:
     - name: hyperfleet-api-deletion
       rules:
         - alert: HyperFleetResourceDeletionStuck
-          expr: hyperfleet_api_resource_terminating_stuck > 0
+          expr: max by (resource_type)(hyperfleet_api_resource_terminating_stuck) > 0
           for: {{ .Values.prometheusRule.rules.deletionStuck.for | default "5m" }}
           labels:
             severity: warning
           annotations:
             summary: "HyperFleet resources stuck in Terminating state"
             description: >-
               {{ "{{ $value }}" }} {{ "{{ $labels.resource_type }}" }} resource(s) have been in
-              Terminating state for more than 30 minutes.
+              Terminating state for more than {{ .Values.prometheusRule.rules.deletionStuck.for | default "5m" }}.
             runbook_url: {{ .Values.prometheusRule.rules.deletionStuck.runbookUrl | default "" | quote }}
         - alert: HyperFleetResourceDeletionTimeout
-          expr: hyperfleet_api_resource_terminating_stuck > 0
+          expr: max by (resource_type)(hyperfleet_api_resource_terminating_stuck) > 0
           for: {{ .Values.prometheusRule.rules.deletionTimeout.for | default "30m" }}
           labels:
             severity: critical
           annotations:
             summary: "HyperFleet resources timed out in Terminating state"
             description: >-
               {{ "{{ $value }}" }} {{ "{{ $labels.resource_type }}" }} resource(s) have been in
-              Terminating state for more than 1 hour. Immediate investigation required.
+              Terminating state for more than {{ .Values.prometheusRule.rules.deletionTimeout.for | default "30m" }}. Immediate investigation required.
             runbook_url: {{ .Values.prometheusRule.rules.deletionTimeout.runbookUrl | default "" | quote }}
 {{- end }}
```
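
The aggregation matters because every API replica runs its own `TerminatingCollector` against the same database, so each pod exports an identical count distinguished only by its pod/instance labels, and the alert would fire once per replica. `max by (resource_type)` drops those labels and collapses the duplicates to one series per resource type. A sketch with made-up sample values:

```promql
# Before: each pod reports the same DB-derived count as its own series,
# so "> 0" matches (and alerts) once per pod. Values are illustrative.
#   hyperfleet_api_resource_terminating_stuck{pod="api-0", resource_type="cluster"}  2
#   hyperfleet_api_resource_terminating_stuck{pod="api-1", resource_type="cluster"}  2

# After: aggregation keeps only resource_type, deduplicating across pods.
max by (resource_type)(hyperfleet_api_resource_terminating_stuck) > 0
#   {resource_type="cluster"}  2
```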

pkg/config/loader.go

Lines changed: 3 additions & 0 deletions
```diff
@@ -185,6 +185,9 @@ func (l *ConfigLoader) validateConfig(config *ApplicationConfig) error {
 	if valErr := config.Metrics.TLS.Validate(); valErr != nil {
 		return fmt.Errorf("metrics TLS validation failed: %w", valErr)
 	}
+	if valErr := config.Metrics.Validate(); valErr != nil {
+		return fmt.Errorf("metrics config validation failed: %w", valErr)
+	}
 	return nil
 }
 
```

pkg/config/metrics.go

Lines changed: 9 additions & 0 deletions
```diff
@@ -1,6 +1,7 @@
 package config
 
 import (
+	"fmt"
 	"net"
 	"strconv"
 	"time"
@@ -30,6 +31,14 @@ func NewMetricsConfig() *MetricsConfig {
 	}
 }
 
+// Validate validates MetricsConfig fields that struct tags cannot enforce
+func (m *MetricsConfig) Validate() error {
+	if m.DeletionStuckThreshold <= 0 {
+		return fmt.Errorf("DeletionStuckThreshold must be positive, got %v", m.DeletionStuckThreshold)
+	}
+	return nil
+}
+
 // ============================================================
 // Convenience Accessor Methods
 // ============================================================
```
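```

Since `Validate` only guards one field, a table-driven test is cheap insurance. A minimal sketch, assuming `DeletionStuckThreshold` is an exported `time.Duration` field on `MetricsConfig` (the test name and cases are ours, not from this commit):

```go
package config

import (
	"testing"
	"time"
)

// Zero and negative thresholds must be rejected; positive ones accepted.
func TestMetricsConfigValidate(t *testing.T) {
	cases := []struct {
		name      string
		threshold time.Duration
		wantErr   bool
	}{
		{"zero is rejected", 0, true},
		{"negative is rejected", -time.Minute, true},
		{"positive is accepted", 30 * time.Minute, false},
	}
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			m := &MetricsConfig{DeletionStuckThreshold: tc.threshold}
			if err := m.Validate(); (err != nil) != tc.wantErr {
				t.Fatalf("Validate() error = %v, wantErr %v", err, tc.wantErr)
			}
		})
	}
}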

pkg/metrics/deletion.go

Lines changed: 9 additions & 1 deletion
```diff
@@ -17,6 +17,7 @@ limitations under the License.
 package metrics
 
 import (
+	"context"
 	"database/sql"
 	"log/slog"
 	"sync"
@@ -98,16 +99,20 @@ func ResetMetrics() {
 // TerminatingCollector implements prometheus.Collector to report the number of
 // resources stuck in Terminating state beyond a configurable threshold.
 // It queries the database on each Prometheus scrape.
+const defaultQueryTimeout = 5 * time.Second
+
 type TerminatingCollector struct {
 	stuckDesc      *prometheus.Desc
 	db             *sql.DB
 	stuckThreshold time.Duration
+	queryTimeout   time.Duration
 }
 
 func NewTerminatingCollector(db *sql.DB, stuckThreshold time.Duration) *TerminatingCollector {
 	return &TerminatingCollector{
 		db:             db,
 		stuckThreshold: stuckThreshold,
+		queryTimeout:   defaultQueryTimeout,
 		stuckDesc: prometheus.NewDesc(
 			metricsSubsystem+"_resource_terminating_stuck",
 			"Number of resources in Terminating state beyond the stuck threshold.",
@@ -136,11 +141,14 @@ func (c *TerminatingCollector) Collect(ch chan<- prometheus.Metric) {
 		return
 	}
 
+	ctx, cancel := context.WithTimeout(context.Background(), c.queryTimeout)
+	defer cancel()
+
 	threshold := time.Now().UTC().Add(-c.stuckThreshold)
 
 	for _, q := range stuckQueries {
 		var count int64
-		row := c.db.QueryRow(q.query, threshold) //nolint:gosec // table names are compile-time constants
+		row := c.db.QueryRowContext(ctx, q.query, threshold) //nolint:gosec // table names are compile-time constants
 		if err := row.Scan(&count); err != nil {
 			slog.Error("Failed to query terminating resources",
 				"resource_type", q.resourceType,
```

pkg/services/cluster.go

Lines changed: 5 additions & 1 deletion
```diff
@@ -140,11 +140,15 @@ func (s *sqlClusterService) SoftDelete(ctx context.Context, id string) (*api.Clu
 		return nil, svcErr
 	}
 
-	// Update status for all cascade-deleted nodepools so their Ready condition reflects the generation bump.
 	nodePools, err := s.nodePoolDao.FindSoftDeletedByOwner(ctx, id)
 	if err != nil {
 		return nil, errors.GeneralError("Failed to fetch cascade-deleted nodepools: %s", err)
 	}
+
+	for range nodePools {
+		metrics.RecordTerminating("nodepool")
+	}
+
 	if svcErr := batchUpdateNodePoolStatusesFromAdapters(
 		ctx,
 		nodePools,
```
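
`RecordTerminating` itself is not part of this diff; the point of the loop is that nodepools deleted via a cluster cascade previously bypassed the termination metric that a direct nodepool delete records. For orientation, a plausible shape for the helper, assuming it increments a counter vec labeled by resource type (the actual definition in pkg/metrics may differ, and the metric name here is assumed):

```go
package metrics

import "github.com/prometheus/client_golang/prometheus"

// Hypothetical sketch only — the real RecordTerminating is defined
// elsewhere in pkg/metrics. Assumed: a counter of resources entering
// the Terminating state, labeled by resource type.
var terminatingTotal = prometheus.NewCounterVec(
	prometheus.CounterOpts{
		Name: "hyperfleet_api_resource_terminating_total", // assumed name
		Help: "Resources that entered the Terminating state.",
	},
	[]string{"resource_type"},
)

func init() {
	prometheus.MustRegister(terminatingTotal)
}

func RecordTerminating(resourceType string) {
	terminatingTotal.WithLabelValues(resourceType).Inc()
}
```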
