Skip to content

Commit f59fa11

Browse files
authored
fix(operator): reconcile on Prometheus and ServiceMonitor changes (#1539)
Signed-off-by: gshaibi <gshaibi@nvidia.com>
1 parent 5f88833 commit f59fa11

2 files changed

Lines changed: 46 additions & 15 deletions

File tree

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
3838
- Fixed `skipTopOwnerGrouper` not propagating per-type defaults (priority class and preemptibility) for skipped owners (e.g. `DynamoGraphDeployment`), causing PodGroup spec to retain stale values after defaults ConfigMap updates.
3939
- Fixed binder DRA detection on clusters where the upstream `DynamicResourceAllocation` feature gate does not reflect server-side DRA availability. The binder now probes the API server during init (matching the scheduler) so the DRA plugin is gated on the same authoritative decision. [#1481](https://github.com/kai-scheduler/KAI-Scheduler/issues/1481)
4040
- Suppressed noisy `Reconciler error` logs and `PodGrouperWarning` events on transient PodGroup update conflicts. The podgrouper now treats `IsConflict` errors as expected and silently requeues the reconcile instead of surfacing the apiserver's "object has been modified" message.
41+
- Fixed kai-operator not reconciling on Prometheus and ServiceMonitor changes. The Config controller now watches owned `Prometheus` and `ServiceMonitor` resources, so deletions and drift trigger reconciliation. CRD presence is checked at startup against the API server (the scheme-only check used previously could not detect missing CRDs), and the watch is registered only when the CRDs are installed. [#877](https://github.com/kai-scheduler/KAI-Scheduler/issues/877)
4142
- Added `before-hook-creation` to the `crd-upgrader` Helm hook delete policy so failed hook Jobs no longer block subsequent `helm upgrade --install` retries. Aligns with the policy already used by the chart's other hook resources. [#1404](https://github.com/kai-scheduler/KAI-Scheduler/issues/1404)
4243

4344
## [v0.14.0] - 2026-03-30

pkg/operator/operands/known_types/prometheus.go

Lines changed: 45 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -26,46 +26,65 @@ func prometheusIndexer(object client.Object) []string {
2626
}
2727

2828
func registerPrometheus() {
29-
// Only register Prometheus collectable if CRD is available
30-
// We'll check this at runtime during manager initialization
29+
var prometheusCRDAvailable bool
3130
collectable := &Collectable{
3231
Collect: getCurrentPrometheusState,
3332
InitWithManager: func(ctx context.Context, mgr manager.Manager) error {
34-
// Try to register the indexer, but don't fail if the CRD is not available
3533
log.FromContext(ctx).Info("Attempting to register Prometheus resource management")
36-
err := mgr.GetFieldIndexer().IndexField(ctx, &monitoringv1.Prometheus{}, CollectableOwnerKey, prometheusIndexer)
34+
available, err := crdAvailable(ctx, mgr, "prometheus")
3735
if err != nil {
38-
log.FromContext(ctx).Info("Prometheus CRD not available, skipping field indexer registration", "error", err)
39-
return nil // Don't fail the test if CRD is not available
36+
log.FromContext(ctx).Info("Failed to check Prometheus CRD availability, skipping registration", "error", err)
37+
return nil
4038
}
39+
if !available {
40+
log.FromContext(ctx).Info("Prometheus CRD not available, skipping registration")
41+
return nil
42+
}
43+
if err := mgr.GetFieldIndexer().IndexField(ctx, &monitoringv1.Prometheus{}, CollectableOwnerKey, prometheusIndexer); err != nil {
44+
return err
45+
}
46+
prometheusCRDAvailable = true
4147
log.FromContext(ctx).Info("Successfully registered Prometheus resource management")
4248
return nil
4349
},
44-
InitWithBuilder: func(builder *builder.Builder) *builder.Builder {
45-
return builder
50+
InitWithBuilder: func(b *builder.Builder) *builder.Builder {
51+
if !prometheusCRDAvailable {
52+
return b
53+
}
54+
return b.Owns(&monitoringv1.Prometheus{})
4655
},
4756
InitWithFakeClientBuilder: func(fakeClientBuilder *fake.ClientBuilder) {
4857
fakeClientBuilder.WithIndex(&monitoringv1.Prometheus{}, CollectableOwnerKey, prometheusIndexer)
4958
},
5059
}
5160
SetupKAIConfigOwned(collectable)
5261

53-
// Register ServiceMonitor collectable if CRD is available
62+
var serviceMonitorCRDAvailable bool
5463
serviceMonitorCollectable := &Collectable{
5564
Collect: getCurrentServiceMonitorState,
5665
InitWithManager: func(ctx context.Context, mgr manager.Manager) error {
57-
// Try to register the indexer, but don't fail if the CRD is not available
5866
log.FromContext(ctx).Info("Attempting to register ServiceMonitor resource management")
59-
err := mgr.GetFieldIndexer().IndexField(ctx, &monitoringv1.ServiceMonitor{}, CollectableOwnerKey, serviceMonitorIndexer)
67+
available, err := crdAvailable(ctx, mgr, "serviceMonitor")
6068
if err != nil {
61-
log.FromContext(ctx).Info("ServiceMonitor CRD not available, skipping field indexer registration", "error", err)
62-
return nil // Don't fail the test if CRD is not available
69+
log.FromContext(ctx).Info("Failed to check ServiceMonitor CRD availability, skipping registration", "error", err)
70+
return nil
71+
}
72+
if !available {
73+
log.FromContext(ctx).Info("ServiceMonitor CRD not available, skipping registration")
74+
return nil
75+
}
76+
if err := mgr.GetFieldIndexer().IndexField(ctx, &monitoringv1.ServiceMonitor{}, CollectableOwnerKey, serviceMonitorIndexer); err != nil {
77+
return err
6378
}
79+
serviceMonitorCRDAvailable = true
6480
log.FromContext(ctx).Info("Successfully registered ServiceMonitor resource management")
6581
return nil
6682
},
67-
InitWithBuilder: func(builder *builder.Builder) *builder.Builder {
68-
return builder
83+
InitWithBuilder: func(b *builder.Builder) *builder.Builder {
84+
if !serviceMonitorCRDAvailable {
85+
return b
86+
}
87+
return b.Owns(&monitoringv1.ServiceMonitor{})
6988
},
7089
InitWithFakeClientBuilder: func(fakeClientBuilder *fake.ClientBuilder) {
7190
fakeClientBuilder.WithIndex(&monitoringv1.ServiceMonitor{}, CollectableOwnerKey, serviceMonitorIndexer)
@@ -74,6 +93,17 @@ func registerPrometheus() {
7493
SetupKAIConfigOwned(serviceMonitorCollectable)
7594
}
7695

96+
// crdAvailable performs a live API-server check for the given Prometheus-family CRD.
97+
// The manager scheme registers monitoringv1 unconditionally, so a scheme-only check
98+
// would falsely report availability on clusters without prometheus-operator installed.
//
// target names the CRD to probe; the callers in registerPrometheus pass "prometheus"
// and "serviceMonitor". The result is whatever common.CheckPrometheusCRDsAvailable
// reports for that target. If the temporary client cannot be constructed, the
// function returns (false, err) without contacting the API server.
99+
func crdAvailable(ctx context.Context, mgr manager.Manager, target string) (bool, error) {
100+
// Build a standalone client from the manager's REST config and scheme rather than
// reusing a client owned by the manager.
// NOTE(review): presumably this is because the manager's own client is not yet
// usable when InitWithManager runs during startup — confirm.
tempClient, err := client.New(mgr.GetConfig(), client.Options{Scheme: mgr.GetScheme()})
101+
if err != nil {
102+
return false, err
103+
}
104+
return common.CheckPrometheusCRDsAvailable(ctx, tempClient, target)
105+
}
106+
77107
func getCurrentPrometheusState(ctx context.Context, runtimeClient client.Client, reconciler client.Object) (map[string]client.Object, error) {
78108
result := map[string]client.Object{}
79109

0 commit comments

Comments
 (0)