@@ -22,15 +22,32 @@ import (
22
22
"fmt"
23
23
"os"
24
24
25
+ apierrors "k8s.io/apimachinery/pkg/api/errors"
26
+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
27
+ "k8s.io/apimachinery/pkg/runtime"
28
+ "k8s.io/apimachinery/pkg/runtime/schema"
29
+ "k8s.io/client-go/tools/cache"
30
+
25
31
"github.com/go-logr/logr"
32
+ "golang.org/x/exp/slices"
26
33
"k8s.io/apimachinery/pkg/api/meta"
27
34
ctrl "sigs.k8s.io/controller-runtime"
28
35
"sigs.k8s.io/controller-runtime/pkg/client"
29
36
"sigs.k8s.io/controller-runtime/pkg/client/apiutil"
30
37
38
+ apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
39
+ apiextensionsclientset "k8s.io/apiextensions-apiserver/pkg/client/clientset/clientset"
40
+ "k8s.io/apimachinery/pkg/watch"
41
+ retrywatch "k8s.io/client-go/tools/watch"
42
+
31
43
"sigs.k8s.io/kueue/pkg/controller/jobs/noop"
32
44
)
33
45
46
// Names of the CustomResourceDefinitions backing optional job framework
// integrations. These values are compared against CRD object names.
const (
	// pytorchjobAPI is the CRD name for Kubeflow PyTorchJobs.
	// NOTE(review): no use of this constant is visible in this section — confirm it is referenced elsewhere.
	pytorchjobAPI = "pytorchjobs.kubeflow.org"
	// rayclusterAPI is the CRD name for RayClusters.
	rayclusterAPI = "rayclusters.ray.io"
)
50
+
34
51
// errFailedMappingResource is the sentinel error wrapped into the error
// returned when the REST mapper cannot map a job framework's resource.
var errFailedMappingResource = errors.New("restMapper failed mapping resource")
@@ -48,7 +65,6 @@ func SetupControllers(mgr ctrl.Manager, log logr.Logger, opts ...Option) error {
48
65
options := ProcessOptions (opts ... )
49
66
50
67
return ForEachIntegration (func (name string , cb IntegrationCallbacks ) error {
51
- logger := log .WithValues ("jobFrameworkName" , name )
52
68
fwkNamePrefix := fmt .Sprintf ("jobFrameworkName %q" , name )
53
69
54
70
if options .EnabledFrameworks .Has (name ) {
@@ -62,24 +78,14 @@ func SetupControllers(mgr ctrl.Manager, log logr.Logger, opts ...Option) error {
62
78
if err != nil {
63
79
return fmt .Errorf ("%s: %w: %w" , fwkNamePrefix , errFailedMappingResource , err )
64
80
}
65
- if _ , err = mgr . GetRESTMapper (). RESTMapping ( gvk . GroupKind ( ), gvk . Version ); err != nil {
66
- if ! meta . IsNoMatchError ( err ) {
67
- return fmt . Errorf ( "%s: %w" , fwkNamePrefix , err )
68
- }
69
- logger . Info ( "No matching API in the server for job framework, skipped setup of controller and webhook" )
81
+ if ! isAPIAvailable ( context . TODO ( ), mgr , rayclusterAPI ) {
82
+ log . Info ( "API not available, waiting for it to become available... - Skipping setup of controller and webhook" )
83
+ waitForAPI ( context . TODO (), log , mgr , rayclusterAPI , func () {
84
+ setupComponents ( mgr , log , gvk , fwkNamePrefix , cb , opts ... )
85
+ } )
70
86
} else {
71
- if err = cb .NewReconciler (
72
- mgr .GetClient (),
73
- mgr .GetEventRecorderFor (fmt .Sprintf ("%s-%s-controller" , name , options .ManagerName )),
74
- opts ... ,
75
- ).SetupWithManager (mgr ); err != nil {
76
- return fmt .Errorf ("%s: %w" , fwkNamePrefix , err )
77
- }
78
- if err = cb .SetupWebhook (mgr , opts ... ); err != nil {
79
- return fmt .Errorf ("%s: unable to create webhook: %w" , fwkNamePrefix , err )
80
- }
81
- logger .Info ("Set up controller and webhook for job framework" )
82
- return nil
87
+ log .Info ("API is available, setting up components..." )
88
+ setupComponents (mgr , log , gvk , fwkNamePrefix , cb , opts ... )
83
89
}
84
90
}
85
91
if err := noop .SetupWebhook (mgr , cb .JobType ); err != nil {
@@ -89,6 +95,39 @@ func SetupControllers(mgr ctrl.Manager, log logr.Logger, opts ...Option) error {
89
95
})
90
96
}
91
97
98
+ func setupComponents (mgr ctrl.Manager , log logr.Logger , gvk schema.GroupVersionKind , fwkNamePrefix string , cb IntegrationCallbacks , opts ... Option ) {
99
+ // Attempt to get the REST mapping for the GVK
100
+ if _ , err := mgr .GetRESTMapper ().RESTMapping (gvk .GroupKind (), gvk .Version ); err != nil {
101
+ if ! meta .IsNoMatchError (err ) {
102
+ log .Error (err , fmt .Sprintf ("%s: unable to get REST mapping" , fwkNamePrefix ))
103
+ return
104
+ }
105
+ log .Info ("No matching API in the server for job framework, skipped setup of controller and webhook" )
106
+ } else {
107
+ if err := setupControllerAndWebhook (mgr , gvk , fwkNamePrefix , cb , opts ... ); err != nil {
108
+ log .Error (err , "Failed to set up controller and webhook" )
109
+ } else {
110
+ log .Info ("Set up controller and webhook for job framework" )
111
+ }
112
+ }
113
+ }
114
+
115
+ func setupControllerAndWebhook (mgr ctrl.Manager , gvk schema.GroupVersionKind , fwkNamePrefix string , cb IntegrationCallbacks , opts ... Option ) error {
116
+ if err := cb .NewReconciler (
117
+ mgr .GetClient (),
118
+ mgr .GetEventRecorderFor (fmt .Sprintf ("%s-%s-controller" , gvk .Kind , "managerName" )), // Ensure managerName is defined or fetched
119
+ opts ... ,
120
+ ).SetupWithManager (mgr ); err != nil {
121
+ return fmt .Errorf ("%s: %w" , fwkNamePrefix , err )
122
+ }
123
+
124
+ if err := cb .SetupWebhook (mgr , opts ... ); err != nil {
125
+ return fmt .Errorf ("%s: unable to create webhook: %w" , fwkNamePrefix , err )
126
+ }
127
+
128
+ return nil
129
+ }
130
+
92
131
// SetupIndexes setups the indexers for integrations.
93
132
// When the platform developers implement a separate kueue-manager to manage the in-house custom jobs,
94
133
// they can easily setup indexers for the in-house custom jobs.
@@ -105,3 +144,73 @@ func SetupIndexes(ctx context.Context, indexer client.FieldIndexer, opts ...Opti
105
144
return nil
106
145
})
107
146
}
147
+
148
+ func isAPIAvailable (ctx context.Context , mgr ctrl.Manager , apiName string ) bool {
149
+ crdClient , err := apiextensionsclientset .NewForConfig (mgr .GetConfig ())
150
+ exitOnError (err , "unable to create CRD client" )
151
+
152
+ crdList , err := crdClient .ApiextensionsV1 ().CustomResourceDefinitions ().List (ctx , metav1.ListOptions {})
153
+ exitOnError (err , "unable to list CRDs" )
154
+
155
+ return slices .ContainsFunc (crdList .Items , func (crd apiextensionsv1.CustomResourceDefinition ) bool {
156
+ return crd .Name == apiName
157
+ })
158
+ }
159
+
160
+ func waitForAPI (ctx context.Context , log logr.Logger , mgr ctrl.Manager , apiName string , action func ()) {
161
+ crdClient , err := apiextensionsclientset .NewForConfig (mgr .GetConfig ())
162
+ exitOnError (err , "unable to create CRD client" )
163
+
164
+ crdList , err := crdClient .ApiextensionsV1 ().CustomResourceDefinitions ().List (ctx , metav1.ListOptions {})
165
+ exitOnError (err , "unable to list CRDs" )
166
+
167
+ // If API is already available, just invoke action
168
+ if slices .ContainsFunc (crdList .Items , func (crd apiextensionsv1.CustomResourceDefinition ) bool {
169
+ return crd .Name == apiName
170
+ }) {
171
+ action ()
172
+ return
173
+ }
174
+
175
+ // Wait for the API to become available then invoke action
176
+ log .Info (fmt .Sprintf ("API %v not available, setting up retry watcher" , apiName ))
177
+ retryWatcher , err := retrywatch .NewRetryWatcher (crdList .ResourceVersion , & cache.ListWatch {
178
+ ListFunc : func (options metav1.ListOptions ) (runtime.Object , error ) {
179
+ return crdClient .ApiextensionsV1 ().CustomResourceDefinitions ().List (ctx , metav1.ListOptions {})
180
+ },
181
+ WatchFunc : func (options metav1.ListOptions ) (watch.Interface , error ) {
182
+ return crdClient .ApiextensionsV1 ().CustomResourceDefinitions ().Watch (ctx , metav1.ListOptions {})
183
+ },
184
+ })
185
+ exitOnError (err , "unable to create retry watcher" )
186
+
187
+ defer retryWatcher .Stop ()
188
+ for {
189
+ select {
190
+ case <- ctx .Done ():
191
+ return
192
+ case event := <- retryWatcher .ResultChan ():
193
+ switch event .Type {
194
+ case watch .Error :
195
+ exitOnError (apierrors .FromObject (event .Object ), fmt .Sprintf ("error watching for API %v" , apiName ))
196
+
197
+ case watch .Added , watch .Modified :
198
+ if crd := event .Object .(* apiextensionsv1.CustomResourceDefinition ); crd .Name == apiName &&
199
+ slices .ContainsFunc (crd .Status .Conditions , func (condition apiextensionsv1.CustomResourceDefinitionCondition ) bool {
200
+ return condition .Type == apiextensionsv1 .Established && condition .Status == apiextensionsv1 .ConditionTrue
201
+ }) {
202
+ log .Info (fmt .Sprintf ("API %v installed, invoking deferred action" , apiName ))
203
+ action ()
204
+ return
205
+ }
206
+ }
207
+ }
208
+ }
209
+ }
210
+
211
// exitOnError terminates the process with exit status 1 when err is non-nil,
// after writing "<msg>: <err>" to standard error. A nil err is a no-op.
func exitOnError(err error, msg string) {
	if err == nil {
		return
	}
	// Write the diagnostic before exiting; merely formatting a string would
	// leave the operator with a silent exit and no indication of the cause.
	fmt.Fprintf(os.Stderr, "%s: %v\n", msg, err)
	os.Exit(1)
}
0 commit comments