@@ -18,6 +18,7 @@ package main
1818
1919import (
2020 "flag"
21+ "fmt"
2122 "os"
2223
2324 coordinationv1 "k8s.io/api/coordination/v1"
@@ -26,7 +27,9 @@ import (
2627 "k8s.io/apimachinery/pkg/runtime"
2728 "k8s.io/apimachinery/pkg/runtime/schema"
2829 utilruntime "k8s.io/apimachinery/pkg/util/runtime"
30+ "k8s.io/client-go/discovery"
2931 clientgoscheme "k8s.io/client-go/kubernetes/scheme"
32+ "k8s.io/client-go/rest"
3033 ctrl "sigs.k8s.io/controller-runtime"
3134 "sigs.k8s.io/controller-runtime/pkg/healthz"
3235 "sigs.k8s.io/controller-runtime/pkg/log/zap"
@@ -82,7 +85,22 @@ func main() {
8285
8386 ctrl .SetLogger (zap .New (zap .UseFlagOptions (& opts )))
8487
85- mgr , err := ctrl .NewManager (ctrl .GetConfigOrDie (), ctrl.Options {
88+ restCfg := ctrl .GetConfigOrDie ()
89+
90+ // Fail fast if required CRDs are not yet installed in the cluster. The
91+ // gpu-node-mocker controllers watch karpenter.sh/v1 NodeClaim objects;
92+ // without the CRD, controller-runtime's informers loop on "no kind is
93+ // registered" errors instead of failing. Exiting with a non-zero status
94+ // here lets the Deployment's restart policy back off and retry until
95+ // the KAITO operator (which ships the karpenter CRDs) finishes
96+ // installing them. This unblocks parallel install ordering at the
97+ // shell level (no need to gate on KAITO CRDs before deploying us).
98+ if err := checkRequiredCRDs (restCfg ); err != nil {
99+ setupLog .Error (err , "required CRDs are not ready; exiting so the pod is restarted" )
100+ os .Exit (1 )
101+ }
102+
103+ mgr , err := ctrl .NewManager (restCfg , ctrl.Options {
86104 Scheme : scheme ,
87105 Metrics : metricsserver.Options {BindAddress : metricsAddr },
88106 HealthProbeBindAddress : probeAddr ,
@@ -120,3 +138,44 @@ func main() {
120138 os .Exit (1 )
121139 }
122140}
141+
142+ // checkRequiredCRDs verifies that every API resource the gpu-node-mocker
143+ // controllers depend on is already registered with the API server. The
144+ // check is done via discovery so it does not require the CRD types to be
145+ // served — only that the apiserver advertises the resource. A single
146+ // missing resource returns an error; the caller is expected to exit so
147+ // the kubelet restarts the pod (the simplest "wait for CRDs" strategy).
148+ func checkRequiredCRDs (cfg * rest.Config ) error {
149+ dc , err := discovery .NewDiscoveryClientForConfig (cfg )
150+ if err != nil {
151+ return err
152+ }
153+
154+ required := []struct {
155+ groupVersion string
156+ resource string
157+ }{
158+ // Karpenter NodeClaim CRD is installed by the KAITO workspace
159+ // operator's chart; the NodeClaimReconciler watches it.
160+ {groupVersion : "karpenter.sh/v1" , resource : "nodeclaims" },
161+ }
162+
163+ for _ , r := range required {
164+ list , err := dc .ServerResourcesForGroupVersion (r .groupVersion )
165+ if err != nil {
166+ return fmt .Errorf ("discovering resources for %s: %w" , r .groupVersion , err )
167+ }
168+ found := false
169+ for _ , api := range list .APIResources {
170+ if api .Name == r .resource {
171+ found = true
172+ break
173+ }
174+ }
175+ if ! found {
176+ return fmt .Errorf ("required resource %s.%s is not yet registered with the apiserver" , r .resource , r .groupVersion )
177+ }
178+ setupLog .Info ("required CRD is ready" , "groupVersion" , r .groupVersion , "resource" , r .resource )
179+ }
180+ return nil
181+ }
0 commit comments