@@ -22,19 +22,38 @@ import (
22
22
"fmt"
23
23
"os"
24
24
25
+ apierrors "k8s.io/apimachinery/pkg/api/errors"
26
+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
27
+ "k8s.io/apimachinery/pkg/runtime"
28
+ "k8s.io/apimachinery/pkg/runtime/schema"
29
+ "k8s.io/client-go/tools/cache"
30
+
25
31
"github.com/go-logr/logr"
32
+ "golang.org/x/exp/slices"
26
33
"k8s.io/apimachinery/pkg/api/meta"
27
34
ctrl "sigs.k8s.io/controller-runtime"
28
35
"sigs.k8s.io/controller-runtime/pkg/client"
29
36
"sigs.k8s.io/controller-runtime/pkg/client/apiutil"
30
37
38
+ apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
39
+ apiextensionsclientset "k8s.io/apiextensions-apiserver/pkg/client/clientset/clientset"
40
+ "k8s.io/apimachinery/pkg/watch"
41
+ retrywatch "k8s.io/client-go/tools/watch"
42
+
31
43
"sigs.k8s.io/kueue/pkg/controller/jobs/noop"
32
44
)
33
45
46
// pytorchjobAPI is presumably the CustomResourceDefinition name of the
// Kubeflow PyTorchJob API — NOTE(review): confirm where it is consumed.
const pytorchjobAPI = "pytorchjobs.kubeflow.org"

// rayclusterAPI is the CustomResourceDefinition name that is looked up to
// decide whether the RayCluster API is installed in the cluster.
const rayclusterAPI = "rayclusters.ray.io"
50
+
34
51
// errFailedMappingResource is the sentinel error wrapped into the error
// returned when the REST mapper fails to map a resource.
var errFailedMappingResource = errors.New("restMapper failed mapping resource")
37
54
55
+ // +kubebuilder:rbac:groups="apiextensions.k8s.io",resources=customresourcedefinitions,verbs=get;list;watch
56
+
38
57
// SetupControllers sets up all controllers and webhooks for integrations.
39
58
// When the platform developers implement a separate kueue-manager to manage the in-house custom jobs,
40
59
// they can easily set up controllers and webhooks for the in-house custom jobs.
@@ -62,24 +81,14 @@ func SetupControllers(mgr ctrl.Manager, log logr.Logger, opts ...Option) error {
62
81
if err != nil {
63
82
return fmt .Errorf ("%s: %w: %w" , fwkNamePrefix , errFailedMappingResource , err )
64
83
}
65
- if _ , err = mgr . GetRESTMapper (). RESTMapping ( gvk . GroupKind ( ), gvk . Version ); err != nil {
66
- if ! meta . IsNoMatchError ( err ) {
67
- return fmt . Errorf ( "%s: %w" , fwkNamePrefix , err )
68
- }
69
- logger . Info ( "No matching API in the server for job framework, skipped setup of controller and webhook" )
84
+ if ! isAPIAvailable ( context . TODO ( ), mgr , rayclusterAPI ) {
85
+ logger . Info ( "API not available, waiting for it to become available... - Skipping setup of controller and webhook" )
86
+ waitForAPI ( context . TODO (), logger , mgr , rayclusterAPI , func () {
87
+ setupComponents ( mgr , logger , gvk , fwkNamePrefix , cb , opts ... )
88
+ } )
70
89
} else {
71
- if err = cb .NewReconciler (
72
- mgr .GetClient (),
73
- mgr .GetEventRecorderFor (fmt .Sprintf ("%s-%s-controller" , name , options .ManagerName )),
74
- opts ... ,
75
- ).SetupWithManager (mgr ); err != nil {
76
- return fmt .Errorf ("%s: %w" , fwkNamePrefix , err )
77
- }
78
- if err = cb .SetupWebhook (mgr , opts ... ); err != nil {
79
- return fmt .Errorf ("%s: unable to create webhook: %w" , fwkNamePrefix , err )
80
- }
81
- logger .Info ("Set up controller and webhook for job framework" )
82
- return nil
90
+ logger .Info ("API is available, setting up components..." )
91
+ setupComponents (mgr , logger , gvk , fwkNamePrefix , cb , opts ... )
83
92
}
84
93
}
85
94
if err := noop .SetupWebhook (mgr , cb .JobType ); err != nil {
@@ -89,6 +98,39 @@ func SetupControllers(mgr ctrl.Manager, log logr.Logger, opts ...Option) error {
89
98
})
90
99
}
91
100
101
+ func setupComponents (mgr ctrl.Manager , log logr.Logger , gvk schema.GroupVersionKind , fwkNamePrefix string , cb IntegrationCallbacks , opts ... Option ) {
102
+ // Attempt to get the REST mapping for the GVK
103
+ if _ , err := mgr .GetRESTMapper ().RESTMapping (gvk .GroupKind (), gvk .Version ); err != nil {
104
+ if ! meta .IsNoMatchError (err ) {
105
+ log .Error (err , fmt .Sprintf ("%s: unable to get REST mapping" , fwkNamePrefix ))
106
+ return
107
+ }
108
+ log .Info ("No matching API in the server for job framework, skipped setup of controller and webhook" )
109
+ } else {
110
+ if err := setupControllerAndWebhook (mgr , gvk , fwkNamePrefix , cb , opts ... ); err != nil {
111
+ log .Error (err , "Failed to set up controller and webhook" )
112
+ } else {
113
+ log .Info ("Set up controller and webhook for job framework" )
114
+ }
115
+ }
116
+ }
117
+
118
+ func setupControllerAndWebhook (mgr ctrl.Manager , gvk schema.GroupVersionKind , fwkNamePrefix string , cb IntegrationCallbacks , opts ... Option ) error {
119
+ if err := cb .NewReconciler (
120
+ mgr .GetClient (),
121
+ mgr .GetEventRecorderFor (fmt .Sprintf ("%s-%s-controller" , gvk .Kind , "managerName" )), // Ensure managerName is defined or fetched
122
+ opts ... ,
123
+ ).SetupWithManager (mgr ); err != nil {
124
+ return fmt .Errorf ("%s: %w" , fwkNamePrefix , err )
125
+ }
126
+
127
+ if err := cb .SetupWebhook (mgr , opts ... ); err != nil {
128
+ return fmt .Errorf ("%s: unable to create webhook: %w" , fwkNamePrefix , err )
129
+ }
130
+
131
+ return nil
132
+ }
133
+
92
134
// SetupIndexes sets up the indexers for integrations.
93
135
// When the platform developers implement a separate kueue-manager to manage the in-house custom jobs,
94
136
// they can easily set up indexers for the in-house custom jobs.
@@ -105,3 +147,73 @@ func SetupIndexes(ctx context.Context, indexer client.FieldIndexer, opts ...Opti
105
147
return nil
106
148
})
107
149
}
150
+
151
+ func isAPIAvailable (ctx context.Context , mgr ctrl.Manager , apiName string ) bool {
152
+ crdClient , err := apiextensionsclientset .NewForConfig (mgr .GetConfig ())
153
+ exitOnError (err , "unable to create CRD client" )
154
+
155
+ crdList , err := crdClient .ApiextensionsV1 ().CustomResourceDefinitions ().List (ctx , metav1.ListOptions {})
156
+ exitOnError (err , "unable to list CRDs" )
157
+
158
+ return slices .ContainsFunc (crdList .Items , func (crd apiextensionsv1.CustomResourceDefinition ) bool {
159
+ return crd .Name == apiName
160
+ })
161
+ }
162
+
163
+ func waitForAPI (ctx context.Context , log logr.Logger , mgr ctrl.Manager , apiName string , action func ()) {
164
+ crdClient , err := apiextensionsclientset .NewForConfig (mgr .GetConfig ())
165
+ exitOnError (err , "unable to create CRD client" )
166
+
167
+ crdList , err := crdClient .ApiextensionsV1 ().CustomResourceDefinitions ().List (ctx , metav1.ListOptions {})
168
+ exitOnError (err , "unable to list CRDs" )
169
+
170
+ // If API is already available, just invoke action
171
+ if slices .ContainsFunc (crdList .Items , func (crd apiextensionsv1.CustomResourceDefinition ) bool {
172
+ return crd .Name == apiName
173
+ }) {
174
+ action ()
175
+ return
176
+ }
177
+
178
+ // Wait for the API to become available then invoke action
179
+ log .Info (fmt .Sprintf ("API %v not available, setting up retry watcher" , apiName ))
180
+ retryWatcher , err := retrywatch .NewRetryWatcher (crdList .ResourceVersion , & cache.ListWatch {
181
+ ListFunc : func (options metav1.ListOptions ) (runtime.Object , error ) {
182
+ return crdClient .ApiextensionsV1 ().CustomResourceDefinitions ().List (ctx , metav1.ListOptions {})
183
+ },
184
+ WatchFunc : func (options metav1.ListOptions ) (watch.Interface , error ) {
185
+ return crdClient .ApiextensionsV1 ().CustomResourceDefinitions ().Watch (ctx , metav1.ListOptions {})
186
+ },
187
+ })
188
+ exitOnError (err , "unable to create retry watcher" )
189
+
190
+ defer retryWatcher .Stop ()
191
+ for {
192
+ select {
193
+ case <- ctx .Done ():
194
+ return
195
+ case event := <- retryWatcher .ResultChan ():
196
+ switch event .Type {
197
+ case watch .Error :
198
+ exitOnError (apierrors .FromObject (event .Object ), fmt .Sprintf ("error watching for API %v" , apiName ))
199
+
200
+ case watch .Added , watch .Modified :
201
+ if crd := event .Object .(* apiextensionsv1.CustomResourceDefinition ); crd .Name == apiName &&
202
+ slices .ContainsFunc (crd .Status .Conditions , func (condition apiextensionsv1.CustomResourceDefinitionCondition ) bool {
203
+ return condition .Type == apiextensionsv1 .Established && condition .Status == apiextensionsv1 .ConditionTrue
204
+ }) {
205
+ log .Info (fmt .Sprintf ("API %v installed, invoking deferred action" , apiName ))
206
+ action ()
207
+ return
208
+ }
209
+ }
210
+ }
211
+ }
212
+ }
213
+
214
// exitOnError terminates the process with exit status 1 when err is non-nil,
// after writing msg and err to standard error. It does nothing when err is nil.
func exitOnError(err error, msg string) {
	if err == nil {
		return
	}
	// The failure has to be written out: formatting it with fmt.Sprint and
	// dropping the result makes the process exit without any diagnostics.
	fmt.Fprintf(os.Stderr, "%s: %v\n", msg, err)
	os.Exit(1)
}
0 commit comments