Skip to content

Commit d4f98e1

Browse files
authored
fix(inferenceModelRewrites): conditionally skip watching InferenceModelRewrite and InferenceObjective (#1967)
* fix(epp): conditionally skip watching InferenceModelRewrite and InferenceObjective when disableK8sCrdReconcile is true This fixes an issue where the controller manager would fail to start if the InferenceModelRewrite or InferenceObjective CRDs were missing. This change introduces a dynamic check for the existence of these CRDs before attempting to set up reconcilers or adding them to the cache. This allows the controller to gracefully handle environments where these optional CRDs are not installed, even when CRD reconciliation is generally enabled. Refactoring: - Moved kindExists to pkg/epp/server/controller_manager.go - Updated kindExists signature to accept discovery.DiscoveryInterface - Reused discovery client in NewDefaultManager and ExtProcServerRunner.SetupWithManager to avoid repeated client creation overhead. Done by: gemini-cli TEST=go test ./pkg/epp/server/... passed * name change disableK8sCrdReconcile to startCrdReconcilers
1 parent 0f0dff6 commit d4f98e1

File tree

5 files changed

+227
-40
lines changed

5 files changed

+227
-40
lines changed

cmd/epp/runner/runner.go

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -204,8 +204,11 @@ func (r *Runner) Run(ctx context.Context) error {
204204
setupLog.Error(err, "Failed to extract GKNN")
205205
return err
206206
}
207-
disableK8sCrdReconcile := opts.EndpointSelector != ""
208-
ds, err := setupDatastore(setupLog, ctx, epf, int32(opts.ModelServerMetricsPort), disableK8sCrdReconcile,
207+
208+
startCrdReconcilers := opts.EndpointSelector == "" // If endpointSelector is empty, it means it's not in the standalone mode. Then we should start the inferencePool and other CRD Reconciler.
209+
controllerCfg := runserver.NewControllerConfig(startCrdReconcilers)
210+
211+
ds, err := setupDatastore(setupLog, ctx, epf, int32(opts.ModelServerMetricsPort), startCrdReconcilers,
209212
opts.PoolName, opts.PoolNamespace, opts.EndpointSelector, opts.EndpointTargetPorts)
210213
if err != nil {
211214
setupLog.Error(err, "Failed to setup datastore")
@@ -240,7 +243,7 @@ func (r *Runner) Run(ctx context.Context) error {
240243
isLeader := &atomic.Bool{}
241244
isLeader.Store(false)
242245

243-
mgr, err := runserver.NewDefaultManager(disableK8sCrdReconcile, *gknn, cfg, metricsServerOptions, opts.EnableLeaderElection)
246+
mgr, err := runserver.NewDefaultManager(controllerCfg, *gknn, cfg, metricsServerOptions, opts.EnableLeaderElection)
244247
if err != nil {
245248
setupLog.Error(err, "Failed to create controller manager")
246249
return err
@@ -328,7 +331,7 @@ func (r *Runner) Run(ctx context.Context) error {
328331
GrpcPort: opts.GRPCPort,
329332
GKNN: *gknn,
330333
Datastore: ds,
331-
DisableK8sCrdReconcile: disableK8sCrdReconcile,
334+
ControllerCfg: controllerCfg,
332335
SecureServing: opts.SecureServing,
333336
HealthChecking: opts.HealthChecking,
334337
CertPath: opts.CertPath,
@@ -367,8 +370,8 @@ func (r *Runner) Run(ctx context.Context) error {
367370
}
368371

369372
func setupDatastore(setupLog logr.Logger, ctx context.Context, epFactory datalayer.EndpointFactory, modelServerMetricsPort int32,
370-
disableK8sCrdReconcile bool, namespace, name, endpointSelector string, endpointTargetPorts []int) (datastore.Datastore, error) {
371-
if !disableK8sCrdReconcile {
373+
startCrdReconcilers bool, namespace, name, endpointSelector string, endpointTargetPorts []int) (datastore.Datastore, error) {
374+
if startCrdReconcilers {
372375
return datastore.NewDatastore(ctx, epFactory, modelServerMetricsPort), nil
373376
} else {
374377
endpointPool := datalayer.NewEndpointPool(namespace, name)
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
/*
2+
Copyright 2025 The Kubernetes Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package server
18+
19+
import (
20+
"k8s.io/apimachinery/pkg/runtime/schema"
21+
"k8s.io/client-go/discovery"
22+
"k8s.io/client-go/rest"
23+
ctrl "sigs.k8s.io/controller-runtime"
24+
"sigs.k8s.io/gateway-api-inference-extension/apix/v1alpha2"
25+
)
26+
27+
type ControllerConfig struct {
28+
startCrdReconcilers bool
29+
hasInferenceObjective bool
30+
hasInferenceModelRewrites bool
31+
}
32+
33+
func NewControllerConfig(startCrdReconcilers bool) ControllerConfig {
34+
return ControllerConfig{
35+
startCrdReconcilers: startCrdReconcilers,
36+
}
37+
}
38+
39+
func (cc *ControllerConfig) PopulateControllerConfig(cfg *rest.Config) error {
40+
if !cc.startCrdReconcilers {
41+
return nil
42+
}
43+
dc, err := discovery.NewDiscoveryClientForConfig(cfg)
44+
if err != nil {
45+
return err
46+
}
47+
cc.populateWithDiscovery(dc)
48+
return nil
49+
}
50+
51+
func (cc *ControllerConfig) populateWithDiscovery(dc discovery.DiscoveryInterface) {
52+
inferenceObjectiveGVK := schema.GroupVersionKind{
53+
Group: v1alpha2.GroupVersion.Group,
54+
Version: v1alpha2.GroupVersion.Version,
55+
Kind: "InferenceObjective",
56+
}
57+
cc.hasInferenceObjective = gvkExists(dc, inferenceObjectiveGVK)
58+
59+
inferenceModelRewriteGVK := schema.GroupVersionKind{
60+
Group: v1alpha2.GroupVersion.Group,
61+
Version: v1alpha2.GroupVersion.Version,
62+
Kind: "InferenceModelRewrite",
63+
}
64+
cc.hasInferenceModelRewrites = gvkExists(dc, inferenceModelRewriteGVK)
65+
}
66+
67+
func gvkExists(dc discovery.DiscoveryInterface, gvk schema.GroupVersionKind) bool {
68+
apiResourceList, err := dc.ServerResourcesForGroupVersion(gvk.GroupVersion().String())
69+
if err != nil {
70+
ctrl.Log.WithName("controllerConfig").Error(err, "Checking server resources error.")
71+
return false
72+
}
73+
for _, r := range apiResourceList.APIResources {
74+
if r.Kind == gvk.Kind {
75+
return true
76+
}
77+
}
78+
return false
79+
}
Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
/*
2+
Copyright 2025 The Kubernetes Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package server
18+
19+
import (
20+
"testing"
21+
22+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
23+
"k8s.io/client-go/discovery/fake"
24+
k8stesting "k8s.io/client-go/testing"
25+
"sigs.k8s.io/gateway-api-inference-extension/apix/v1alpha2"
26+
)
27+
28+
func TestNewControllerConfig(t *testing.T) {
29+
c := NewControllerConfig(true)
30+
if !c.startCrdReconcilers {
31+
t.Error("expected startCrdReconcilers to be true")
32+
}
33+
34+
c = NewControllerConfig(false)
35+
if c.startCrdReconcilers {
36+
t.Error("expected startCrdReconcilers to be false")
37+
}
38+
}
39+
40+
func TestPopulateWithDiscovery(t *testing.T) {
41+
tests := []struct {
42+
name string
43+
apiResources []metav1.APIResource
44+
wantInferenceObjective bool
45+
wantInferenceModelRewrite bool
46+
}{
47+
{
48+
name: "Both resources exist",
49+
apiResources: []metav1.APIResource{
50+
{Kind: "InferenceObjective"},
51+
{Kind: "InferenceModelRewrite"},
52+
},
53+
wantInferenceObjective: true,
54+
wantInferenceModelRewrite: true,
55+
},
56+
{
57+
name: "Resources do not exist",
58+
apiResources: []metav1.APIResource{},
59+
wantInferenceObjective: false,
60+
wantInferenceModelRewrite: false,
61+
},
62+
{
63+
name: "Only InferenceObjective exists",
64+
apiResources: []metav1.APIResource{
65+
{Kind: "InferenceObjective"},
66+
},
67+
wantInferenceObjective: true,
68+
wantInferenceModelRewrite: false,
69+
},
70+
}
71+
72+
for _, tt := range tests {
73+
t.Run(tt.name, func(t *testing.T) {
74+
// Setup fake discovery for this specific test case
75+
fakeDiscovery := &fake.FakeDiscovery{
76+
Fake: &k8stesting.Fake{},
77+
}
78+
fakeDiscovery.Resources = []*metav1.APIResourceList{
79+
{
80+
GroupVersion: v1alpha2.GroupVersion.String(),
81+
APIResources: tt.apiResources,
82+
},
83+
}
84+
85+
cc := &ControllerConfig{}
86+
cc.populateWithDiscovery(fakeDiscovery)
87+
88+
if cc.hasInferenceObjective != tt.wantInferenceObjective {
89+
t.Errorf("populateWithDiscovery() hasInferenceObjective = %v, want %v", cc.hasInferenceObjective, tt.wantInferenceObjective)
90+
}
91+
if cc.hasInferenceModelRewrites != tt.wantInferenceModelRewrite {
92+
t.Errorf("populateWithDiscovery() hasInferenceModelRewrites = %v, want %v", cc.hasInferenceModelRewrites, tt.wantInferenceModelRewrite)
93+
}
94+
})
95+
}
96+
}
97+
98+
func TestPopulateControllerConfig_Disable(t *testing.T) {
99+
c := NewControllerConfig(false)
100+
err := c.PopulateControllerConfig(nil)
101+
if err != nil {
102+
t.Errorf("expected nil error, got %v", err)
103+
}
104+
}

pkg/epp/server/controller_manager.go

Lines changed: 15 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ func init() {
4545
}
4646

4747
// defaultManagerOptions returns the default options used to create the manager.
48-
func defaultManagerOptions(disableK8sCrdReconcile bool, gknn common.GKNN, metricsServerOptions metricsserver.Options) (ctrl.Options, error) {
48+
func defaultManagerOptions(cfg ControllerConfig, gknn common.GKNN, metricsServerOptions metricsserver.Options) (ctrl.Options, error) {
4949
opt := ctrl.Options{
5050
Scheme: scheme,
5151
Cache: cache.Options{
@@ -55,24 +55,22 @@ func defaultManagerOptions(disableK8sCrdReconcile bool, gknn common.GKNN, metric
5555
gknn.Namespace: {},
5656
},
5757
},
58-
&v1alpha2.InferenceObjective{}: {
59-
Namespaces: map[string]cache.Config{
60-
gknn.Namespace: {},
61-
},
62-
},
63-
&v1alpha2.InferenceModelRewrite{}: {
64-
Namespaces: map[string]cache.Config{
65-
gknn.Namespace: {},
66-
},
67-
},
6858
},
6959
},
7060
Metrics: metricsServerOptions,
7161
}
72-
if !disableK8sCrdReconcile {
73-
opt.Cache.ByObject[&v1alpha2.InferenceObjective{}] = cache.ByObject{Namespaces: map[string]cache.Config{
74-
gknn.Namespace: {},
75-
}}
62+
if cfg.startCrdReconcilers {
63+
if cfg.hasInferenceObjective {
64+
opt.Cache.ByObject[&v1alpha2.InferenceObjective{}] = cache.ByObject{Namespaces: map[string]cache.Config{
65+
gknn.Namespace: {},
66+
}}
67+
}
68+
if cfg.hasInferenceModelRewrites {
69+
opt.Cache.ByObject[&v1alpha2.InferenceModelRewrite{}] = cache.ByObject{Namespaces: map[string]cache.Config{
70+
gknn.Namespace: {},
71+
}}
72+
}
73+
7674
switch gknn.Group {
7775
case v1alpha2.GroupName:
7876
opt.Cache.ByObject[&v1alpha2.InferencePool{}] = cache.ByObject{
@@ -95,8 +93,8 @@ func defaultManagerOptions(disableK8sCrdReconcile bool, gknn common.GKNN, metric
9593
}
9694

9795
// NewDefaultManager creates a new controller manager with default configuration.
98-
func NewDefaultManager(disableK8sCrdReconcile bool, gknn common.GKNN, restConfig *rest.Config, metricsServerOptions metricsserver.Options, leaderElectionEnabled bool) (ctrl.Manager, error) {
99-
opt, err := defaultManagerOptions(disableK8sCrdReconcile, gknn, metricsServerOptions)
96+
func NewDefaultManager(controllerCfg ControllerConfig, gknn common.GKNN, restConfig *rest.Config, metricsServerOptions metricsserver.Options, leaderElectionEnabled bool) (ctrl.Manager, error) {
97+
opt, err := defaultManagerOptions(controllerCfg, gknn, metricsServerOptions)
10098
if err != nil {
10199
return nil, fmt.Errorf("failed to create controller manager options: %v", err)
102100
}

pkg/epp/server/runserver.go

Lines changed: 20 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ import (
4949
type ExtProcServerRunner struct {
5050
GrpcPort int
5151
GKNN common.GKNN
52-
DisableK8sCrdReconcile bool
52+
ControllerCfg ControllerConfig
5353
Datastore datastore.Datastore
5454
SecureServing bool
5555
HealthChecking bool
@@ -84,7 +84,7 @@ func NewDefaultExtProcServerRunner() *ExtProcServerRunner {
8484
return &ExtProcServerRunner{
8585
GrpcPort: opts.GRPCPort,
8686
GKNN: gknn,
87-
DisableK8sCrdReconcile: false,
87+
ControllerCfg: ControllerConfig{true, true, true},
8888
SecureServing: opts.SecureServing,
8989
HealthChecking: opts.HealthChecking,
9090
RefreshPrometheusMetricsInterval: opts.RefreshPrometheusMetricsInterval,
@@ -96,7 +96,7 @@ func NewDefaultExtProcServerRunner() *ExtProcServerRunner {
9696
// SetupWithManager sets up the runner with the given manager.
9797
func (r *ExtProcServerRunner) SetupWithManager(ctx context.Context, mgr ctrl.Manager) error {
9898
// Create the controllers and register them with the manager
99-
if !r.DisableK8sCrdReconcile {
99+
if r.ControllerCfg.startCrdReconcilers {
100100
if err := (&controller.InferencePoolReconciler{
101101
Datastore: r.Datastore,
102102
Reader: mgr.GetClient(),
@@ -105,21 +105,24 @@ func (r *ExtProcServerRunner) SetupWithManager(ctx context.Context, mgr ctrl.Man
105105
return fmt.Errorf("failed setting up InferencePoolReconciler: %w", err)
106106
}
107107

108-
if err := (&controller.InferenceObjectiveReconciler{
109-
Datastore: r.Datastore,
110-
Reader: mgr.GetClient(),
111-
PoolGKNN: r.GKNN,
112-
}).SetupWithManager(ctx, mgr); err != nil {
113-
return fmt.Errorf("failed setting up InferenceObjectiveReconciler: %w", err)
108+
if r.ControllerCfg.hasInferenceObjective {
109+
if err := (&controller.InferenceObjectiveReconciler{
110+
Datastore: r.Datastore,
111+
Reader: mgr.GetClient(),
112+
PoolGKNN: r.GKNN,
113+
}).SetupWithManager(ctx, mgr); err != nil {
114+
return fmt.Errorf("failed setting up InferenceObjectiveReconciler: %w", err)
115+
}
116+
}
117+
if r.ControllerCfg.hasInferenceModelRewrites {
118+
if err := (&controller.InferenceModelRewriteReconciler{
119+
Datastore: r.Datastore,
120+
Reader: mgr.GetClient(),
121+
PoolGKNN: r.GKNN,
122+
}).SetupWithManager(ctx, mgr); err != nil {
123+
return fmt.Errorf("failed setting up InferenceModelRewriteReconciler: %w", err)
124+
}
114125
}
115-
}
116-
117-
if err := (&controller.InferenceModelRewriteReconciler{
118-
Datastore: r.Datastore,
119-
Reader: mgr.GetClient(),
120-
PoolGKNN: r.GKNN,
121-
}).SetupWithManager(ctx, mgr); err != nil {
122-
return fmt.Errorf("failed setting up InferenceModelRewriteReconciler: %w", err)
123126
}
124127

125128
if err := (&controller.PodReconciler{

0 commit comments

Comments
 (0)