Skip to content

Commit 2736a06

Browse files
CARRY: Add watcher to controller and webhook
1 parent fdafd7a commit 2736a06

File tree

4 files changed

+139
-20
lines changed

4 files changed

+139
-20
lines changed

config/components/rbac/role.yaml

+8
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,14 @@ rules:
9292
- list
9393
- update
9494
- watch
95+
- apiGroups:
96+
- apiextensions.k8s.io
97+
resources:
98+
- customresourcedefinitions
99+
verbs:
100+
- get
101+
- list
102+
- watch
95103
- apiGroups:
96104
- autoscaling.x-k8s.io
97105
resources:

go.mod

+2-2
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,9 @@ require (
1515
github.com/prometheus/client_model v0.5.0
1616
github.com/ray-project/kuberay/ray-operator v1.1.0-alpha.0
1717
go.uber.org/zap v1.26.0
18+
golang.org/x/exp v0.0.0-20230905200255-921286631fa9
1819
k8s.io/api v0.29.2
20+
k8s.io/apiextensions-apiserver v0.29.0
1921
k8s.io/apimachinery v0.29.2
2022
k8s.io/apiserver v0.29.1
2123
k8s.io/autoscaler/cluster-autoscaler/apis v0.0.0-20240325113845-0130d33747bb
@@ -100,7 +102,6 @@ require (
100102
go.uber.org/atomic v1.11.0 // indirect
101103
go.uber.org/multierr v1.11.0 // indirect
102104
golang.org/x/crypto v0.18.0 // indirect
103-
golang.org/x/exp v0.0.0-20230905200255-921286631fa9 // indirect
104105
golang.org/x/mod v0.14.0 // indirect
105106
golang.org/x/net v0.20.0 // indirect
106107
golang.org/x/oauth2 v0.12.0 // indirect
@@ -121,7 +122,6 @@ require (
121122
gopkg.in/natefinch/lumberjack.v2 v2.2.1 // indirect
122123
gopkg.in/yaml.v2 v2.4.0 // indirect
123124
gopkg.in/yaml.v3 v3.0.1 // indirect
124-
k8s.io/apiextensions-apiserver v0.29.0 // indirect
125125
k8s.io/gengo v0.0.0-20230829151522-9cce18d56c01 // indirect
126126
k8s.io/kms v0.29.1 // indirect
127127
sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.28.0 // indirect

pkg/controller/jobframework/setup.go

+127-18
Original file line numberDiff line numberDiff line change
@@ -22,15 +22,32 @@ import (
2222
"fmt"
2323
"os"
2424

25+
apierrors "k8s.io/apimachinery/pkg/api/errors"
26+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
27+
"k8s.io/apimachinery/pkg/runtime"
28+
"k8s.io/apimachinery/pkg/runtime/schema"
29+
"k8s.io/client-go/tools/cache"
30+
2531
"github.com/go-logr/logr"
32+
"golang.org/x/exp/slices"
2633
"k8s.io/apimachinery/pkg/api/meta"
2734
ctrl "sigs.k8s.io/controller-runtime"
2835
"sigs.k8s.io/controller-runtime/pkg/client"
2936
"sigs.k8s.io/controller-runtime/pkg/client/apiutil"
3037

38+
apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
39+
apiextensionsclientset "k8s.io/apiextensions-apiserver/pkg/client/clientset/clientset"
40+
"k8s.io/apimachinery/pkg/watch"
41+
retrywatch "k8s.io/client-go/tools/watch"
42+
3143
"sigs.k8s.io/kueue/pkg/controller/jobs/noop"
3244
)
3345

46+
// Names of CustomResourceDefinitions that are matched against CRD object
// names when checking whether an API is installed.
const (
47+
// pytorchjobAPI is presumably the CRD name of the Kubeflow PyTorchJob API;
// NOTE(review): it is not referenced in the visible part of this file — confirm it is needed.
pytorchjobAPI = "pytorchjobs.kubeflow.org"
48+
// rayclusterAPI is the CRD name passed to isAPIAvailable and waitForAPI by SetupControllers.
rayclusterAPI = "rayclusters.ray.io"
49+
)
50+
3451
var (
3552
errFailedMappingResource = errors.New("restMapper failed mapping resource")
3653
)
@@ -48,7 +65,6 @@ func SetupControllers(mgr ctrl.Manager, log logr.Logger, opts ...Option) error {
4865
options := ProcessOptions(opts...)
4966

5067
return ForEachIntegration(func(name string, cb IntegrationCallbacks) error {
51-
logger := log.WithValues("jobFrameworkName", name)
5268
fwkNamePrefix := fmt.Sprintf("jobFrameworkName %q", name)
5369

5470
if options.EnabledFrameworks.Has(name) {
@@ -62,24 +78,14 @@ func SetupControllers(mgr ctrl.Manager, log logr.Logger, opts ...Option) error {
6278
if err != nil {
6379
return fmt.Errorf("%s: %w: %w", fwkNamePrefix, errFailedMappingResource, err)
6480
}
65-
if _, err = mgr.GetRESTMapper().RESTMapping(gvk.GroupKind(), gvk.Version); err != nil {
66-
if !meta.IsNoMatchError(err) {
67-
return fmt.Errorf("%s: %w", fwkNamePrefix, err)
68-
}
69-
logger.Info("No matching API in the server for job framework, skipped setup of controller and webhook")
81+
if !isAPIAvailable(context.TODO(), mgr, rayclusterAPI) {
82+
log.Info("API not available, waiting for it to become available... - Skipping setup of controller and webhook")
83+
waitForAPI(context.TODO(), log, mgr, rayclusterAPI, func() {
84+
setupComponents(mgr, log, gvk, fwkNamePrefix, cb, opts...)
85+
})
7086
} else {
71-
if err = cb.NewReconciler(
72-
mgr.GetClient(),
73-
mgr.GetEventRecorderFor(fmt.Sprintf("%s-%s-controller", name, options.ManagerName)),
74-
opts...,
75-
).SetupWithManager(mgr); err != nil {
76-
return fmt.Errorf("%s: %w", fwkNamePrefix, err)
77-
}
78-
if err = cb.SetupWebhook(mgr, opts...); err != nil {
79-
return fmt.Errorf("%s: unable to create webhook: %w", fwkNamePrefix, err)
80-
}
81-
logger.Info("Set up controller and webhook for job framework")
82-
return nil
87+
log.Info("API is available, setting up components...")
88+
setupComponents(mgr, log, gvk, fwkNamePrefix, cb, opts...)
8389
}
8490
}
8591
if err := noop.SetupWebhook(mgr, cb.JobType); err != nil {
@@ -89,6 +95,39 @@ func SetupControllers(mgr ctrl.Manager, log logr.Logger, opts ...Option) error {
8995
})
9096
}
9197

98+
func setupComponents(mgr ctrl.Manager, log logr.Logger, gvk schema.GroupVersionKind, fwkNamePrefix string, cb IntegrationCallbacks, opts ...Option) {
99+
// Attempt to get the REST mapping for the GVK
100+
if _, err := mgr.GetRESTMapper().RESTMapping(gvk.GroupKind(), gvk.Version); err != nil {
101+
if !meta.IsNoMatchError(err) {
102+
log.Error(err, fmt.Sprintf("%s: unable to get REST mapping", fwkNamePrefix))
103+
return
104+
}
105+
log.Info("No matching API in the server for job framework, skipped setup of controller and webhook")
106+
} else {
107+
if err := setupControllerAndWebhook(mgr, gvk, fwkNamePrefix, cb, opts...); err != nil {
108+
log.Error(err, "Failed to set up controller and webhook")
109+
} else {
110+
log.Info("Set up controller and webhook for job framework")
111+
}
112+
}
113+
}
114+
115+
func setupControllerAndWebhook(mgr ctrl.Manager, gvk schema.GroupVersionKind, fwkNamePrefix string, cb IntegrationCallbacks, opts ...Option) error {
116+
if err := cb.NewReconciler(
117+
mgr.GetClient(),
118+
mgr.GetEventRecorderFor(fmt.Sprintf("%s-%s-controller", gvk.Kind, "managerName")), // Ensure managerName is defined or fetched
119+
opts...,
120+
).SetupWithManager(mgr); err != nil {
121+
return fmt.Errorf("%s: %w", fwkNamePrefix, err)
122+
}
123+
124+
if err := cb.SetupWebhook(mgr, opts...); err != nil {
125+
return fmt.Errorf("%s: unable to create webhook: %w", fwkNamePrefix, err)
126+
}
127+
128+
return nil
129+
}
130+
92131
// SetupIndexes setups the indexers for integrations.
93132
// When the platform developers implement a separate kueue-manager to manage the in-house custom jobs,
94133
// they can easily setup indexers for the in-house custom jobs.
@@ -105,3 +144,73 @@ func SetupIndexes(ctx context.Context, indexer client.FieldIndexer, opts ...Opti
105144
return nil
106145
})
107146
}
147+
148+
func isAPIAvailable(ctx context.Context, mgr ctrl.Manager, apiName string) bool {
149+
crdClient, err := apiextensionsclientset.NewForConfig(mgr.GetConfig())
150+
exitOnError(err, "unable to create CRD client")
151+
152+
crdList, err := crdClient.ApiextensionsV1().CustomResourceDefinitions().List(ctx, metav1.ListOptions{})
153+
exitOnError(err, "unable to list CRDs")
154+
155+
return slices.ContainsFunc(crdList.Items, func(crd apiextensionsv1.CustomResourceDefinition) bool {
156+
return crd.Name == apiName
157+
})
158+
}
159+
160+
func waitForAPI(ctx context.Context, log logr.Logger, mgr ctrl.Manager, apiName string, action func()) {
161+
crdClient, err := apiextensionsclientset.NewForConfig(mgr.GetConfig())
162+
exitOnError(err, "unable to create CRD client")
163+
164+
crdList, err := crdClient.ApiextensionsV1().CustomResourceDefinitions().List(ctx, metav1.ListOptions{})
165+
exitOnError(err, "unable to list CRDs")
166+
167+
// If API is already available, just invoke action
168+
if slices.ContainsFunc(crdList.Items, func(crd apiextensionsv1.CustomResourceDefinition) bool {
169+
return crd.Name == apiName
170+
}) {
171+
action()
172+
return
173+
}
174+
175+
// Wait for the API to become available then invoke action
176+
log.Info(fmt.Sprintf("API %v not available, setting up retry watcher", apiName))
177+
retryWatcher, err := retrywatch.NewRetryWatcher(crdList.ResourceVersion, &cache.ListWatch{
178+
ListFunc: func(options metav1.ListOptions) (runtime.Object, error) {
179+
return crdClient.ApiextensionsV1().CustomResourceDefinitions().List(ctx, metav1.ListOptions{})
180+
},
181+
WatchFunc: func(options metav1.ListOptions) (watch.Interface, error) {
182+
return crdClient.ApiextensionsV1().CustomResourceDefinitions().Watch(ctx, metav1.ListOptions{})
183+
},
184+
})
185+
exitOnError(err, "unable to create retry watcher")
186+
187+
defer retryWatcher.Stop()
188+
for {
189+
select {
190+
case <-ctx.Done():
191+
return
192+
case event := <-retryWatcher.ResultChan():
193+
switch event.Type {
194+
case watch.Error:
195+
exitOnError(apierrors.FromObject(event.Object), fmt.Sprintf("error watching for API %v", apiName))
196+
197+
case watch.Added, watch.Modified:
198+
if crd := event.Object.(*apiextensionsv1.CustomResourceDefinition); crd.Name == apiName &&
199+
slices.ContainsFunc(crd.Status.Conditions, func(condition apiextensionsv1.CustomResourceDefinitionCondition) bool {
200+
return condition.Type == apiextensionsv1.Established && condition.Status == apiextensionsv1.ConditionTrue
201+
}) {
202+
log.Info(fmt.Sprintf("API %v installed, invoking deferred action", apiName))
203+
action()
204+
return
205+
}
206+
}
207+
}
208+
}
209+
}
210+
211+
// exitOnError terminates the process with exit status 1 when err is non-nil,
// after writing "<msg>: <err>" to standard error. It does nothing when err is
// nil.
func exitOnError(err error, msg string) {
	if err == nil {
		return
	}
	// Report the failure before exiting; fmt.Sprint only builds a string and
	// would leave the exit unexplained.
	fmt.Fprintf(os.Stderr, "%s: %v\n", msg, err)
	os.Exit(1)
}

pkg/controller/jobs/raycluster/raycluster_controller.go

+2
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,8 @@ func init() {
5959
// +kubebuilder:rbac:groups=kueue.x-k8s.io,resources=resourceflavors,verbs=get;list;watch
6060
// +kubebuilder:rbac:groups=kueue.x-k8s.io,resources=workloadpriorityclasses,verbs=get;list;watch
6161
// +kubebuilder:rbac:groups=ray.io,resources=rayclusters/finalizers,verbs=get;update
62+
// +kubebuilder:rbac:groups="apiextensions.k8s.io",resources=customresourcedefinitions,verbs=get;list;watch
63+
6264

6365
var NewReconciler = jobframework.NewGenericReconcilerFactory(func() jobframework.GenericJob { return &RayCluster{} })
6466

0 commit comments

Comments
 (0)