@@ -25,6 +25,7 @@ import (
2525 "syscall"
2626 "time"
2727
28+ "golang.org/x/sync/errgroup"
2829 batchv1 "k8s.io/api/batch/v1"
2930 corev1 "k8s.io/api/core/v1"
3031 utilruntime "k8s.io/apimachinery/pkg/util/runtime"
@@ -105,12 +106,63 @@ func run() error {
105106func setupCtrlRuntimeManagement (ctx context.Context ) error {
106107 slog .Info ("Running in controller runtime managed mode" )
107108
109+ mgr , err := createManager ()
110+ if err != nil {
111+ return err
112+ }
113+
114+ params := initializer.InitializationParams {
115+ TomlConfigPath : tomlConfigPath ,
116+ DryRun : dryRun ,
117+ EnableLogCollector : enableLogCollector ,
118+ Config : mgr .GetConfig (),
119+ }
120+
121+ g , gCtx := errgroup .WithContext (ctx )
122+
123+ // Start the manager first so health/metrics endpoints are live immediately.
124+ // This prevents Kubernetes liveness probes from killing the pod while MongoDB
125+ // initialization (which may be slow due to stale resume tokens or connectivity
126+ // issues) is still in progress.
127+ g .Go (func () error {
128+ slog .Info ("Starting controller runtime controller" )
129+
130+ if err := mgr .Start (gCtx ); err != nil {
131+ slog .Error ("Problem running manager" , "error" , err )
132+ return err
133+ }
134+
135+ return nil
136+ })
137+
138+ // Initialize datastore and reconciler concurrently — the manager is already
139+ // serving health probes, so the pod won't be killed during this phase.
140+ // cleanupReconciler is set once the reconciler is created so cleanup can run
141+ // after g.Wait() (i.e., after the manager has fully drained).
142+ var cleanupReconciler func ()
143+
144+ g .Go (func () error {
145+ cleanup , initErr := initializeAndWatch (gCtx , params , mgr )
146+ cleanupReconciler = cleanup
147+
148+ return initErr
149+ })
150+
151+ err = g .Wait ()
152+
153+ if cleanupReconciler != nil {
154+ cleanupReconciler ()
155+ }
156+
157+ return err
158+ }
159+
160+ func createManager () (ctrl.Manager , error ) {
108161 cfg := ctrl .GetConfigOrDie ()
109162 cfg .Wrap (func (rt http.RoundTripper ) http.RoundTripper {
110163 return auditlogger .NewAuditingRoundTripper (rt )
111164 })
112165
113- //TODO: setup informers for node and job
114166 mgr , err := ctrl .NewManager (cfg , ctrl.Options {
115167 Scheme : scheme ,
116168 Metrics : metricsserver.Options {
@@ -126,52 +178,64 @@ func setupCtrlRuntimeManagement(ctx context.Context) error {
126178 })
127179 if err != nil {
128180 slog .Error ("Unable to start manager" , "error" , err )
129- return err
181+ return nil , err
130182 }
131183
132184 if err = mgr .AddHealthzCheck ("healthz" , healthz .Ping ); err != nil {
133185 slog .Error ("Unable to set up health check" , "error" , err )
134- return err
186+ return nil , err
135187 }
136188
137189 if err := mgr .AddReadyzCheck ("readyz" , healthz .Ping ); err != nil {
138190 slog .Error ("Unable to set up ready check" , "error" , err )
139- return err
191+ return nil , err
140192 }
141193
142- params := initializer.InitializationParams {
143- TomlConfigPath : tomlConfigPath ,
144- DryRun : dryRun ,
145- EnableLogCollector : enableLogCollector ,
146- Config : mgr .GetConfig (),
147- }
194+ return mgr , nil
195+ }
196+
// reconcilerCloseTimeout bounds how long the reconciler's CloseAll call may
// take during shutdown cleanup.
const reconcilerCloseTimeout = 30 * time.Second
148198
199+ // initializeAndWatch performs MongoDB initialization, registers the reconciler, and
200+ // blocks until shutdown or unexpected stream death. It returns a cleanup function that
201+ // the caller must invoke after the manager has fully stopped (after g.Wait) so that
202+ // datastore resources are not torn down under in-flight reconciles.
203+ func initializeAndWatch (
204+ ctx context.Context , params initializer.InitializationParams , mgr ctrl.Manager ,
205+ ) (cleanup func (), err error ) {
149206 components , err := initializer .InitializeAll (ctx , params , mgr .GetClient ())
150207 if err != nil {
151- return fmt .Errorf ("initialization failed: %w" , err )
208+ return nil , fmt .Errorf ("initialization failed: %w" , err )
152209 }
153210
154211 reconciler := components .FaultRemediationReconciler
155212
156- defer func () {
157- if err := reconciler .CloseAll (ctx ); err != nil {
158- slog .Error ("failed to close datastore components" , "error" , err )
213+ cleanup = func () {
214+ closeCtx , cancel := context .WithTimeout (context .Background (), reconcilerCloseTimeout )
215+ defer cancel ()
216+
217+ if closeErr := reconciler .CloseAll (closeCtx ); closeErr != nil {
218+ slog .Error ("failed to close datastore components" , "error" , closeErr )
159219 }
160- }()
220+ }
161221
162- err = components . FaultRemediationReconciler .SetupWithManager (ctx , mgr )
163- if err != nil {
164- return fmt .Errorf ("SetupWithManager failed: %w" , err )
222+ watcherDone , setupErr := reconciler .SetupWithManager (ctx , mgr )
223+ if setupErr != nil {
224+ return cleanup , fmt .Errorf ("SetupWithManager failed: %w" , setupErr )
165225 }
166226
167- slog .Info ("Starting controller runtime controller " )
227+ slog .Info ("Initialization completed, reconciler registered with manager " )
168228
169- if err = mgr .Start (ctx ); err != nil {
170- slog .Error ("Problem running manager" , "error" , err )
171- return err
172- }
229+ select {
230+ case <- ctx .Done ():
231+ return cleanup , nil
232+ case <- watcherDone :
233+ if ctx .Err () == nil {
234+ return cleanup , fmt .Errorf ("change stream watcher terminated unexpectedly, event processing has stopped" )
235+ }
173236
174- return nil
237+ return cleanup , nil
238+ }
175239}
176240
177241func parseFlags () {
0 commit comments