@@ -16,26 +16,22 @@ package main
1616
1717import (
1818 "context"
19- "errors"
2019 "flag"
2120 "fmt"
2221 "log/slog"
22+ "net/http"
2323 "os"
2424 "os/signal"
25- "strconv"
26- "strings"
2725 "syscall"
2826 "time"
2927
30- "golang.org/x/sync/errgroup"
3128 "k8s.io/apimachinery/pkg/runtime"
3229 ctrl "sigs.k8s.io/controller-runtime"
3330 "sigs.k8s.io/controller-runtime/pkg/healthz"
3431 metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server"
3532
3633 "github.com/nvidia/nvsentinel/commons/pkg/auditlogger"
3734 "github.com/nvidia/nvsentinel/commons/pkg/logger"
38- "github.com/nvidia/nvsentinel/commons/pkg/server"
3935 "github.com/nvidia/nvsentinel/fault-remediation/pkg/initializer"
4036)
4137
4945
5046 // These variables are populated by parsing flags
5147 enableLeaderElection bool
52- enableControllerRuntime bool
5348 leaderElectionLeaseDuration time.Duration
5449 leaderElectionRenewDeadline time.Duration
5550 leaderElectionRetryPeriod time.Duration
@@ -88,96 +83,33 @@ func main() {
8883func run () error {
8984 parseFlags ()
9085
91- if ! enableControllerRuntime && enableLeaderElection {
92- return errors .New ("leader-election requires controller-runtime" )
93- }
94-
9586 ctx , stop := signal .NotifyContext (context .Background (), syscall .SIGINT , syscall .SIGTERM )
9687 defer stop ()
9788
9889 params := initializer.InitializationParams {
99- KubeconfigPath : kubeconfigPath ,
10090 TomlConfigPath : tomlConfigPath ,
10191 DryRun : dryRun ,
10292 EnableLogCollector : enableLogCollector ,
10393 }
10494
105- components , err := initializer . InitializeAll (ctx , params )
95+ err := setupCtrlRuntimeManagement (ctx , params )
10696 if err != nil {
107- return fmt .Errorf ("initialization failed: %w" , err )
108- }
109-
110- reconciler := components .FaultRemediationReconciler
111-
112- defer func () {
113- if err := reconciler .CloseAll (ctx ); err != nil {
114- slog .Error ("failed to close datastore components" , "error" , err )
115- }
116- }()
117-
118- if enableControllerRuntime {
119- err = setupCtrlRuntimeManagement (ctx , components )
120- if err != nil {
121- return err
122- }
123- } else {
124- err = setupNonCtrlRuntimeManaged (ctx , components )
125- if err != nil {
126- return err
127- }
97+ return err
12898 }
12999
130100 return nil
131101}
132102
133- func setupNonCtrlRuntimeManaged (ctx context.Context , components * initializer.Components ) error {
134- slog .Info ("Running without controller runtime management" )
135-
136- metricsAddr = strings .TrimPrefix (metricsAddr , ":" )
137-
138- portInt , err := strconv .Atoi (metricsAddr )
139- if err != nil {
140- return fmt .Errorf ("invalid metrics port: %w" , err )
141- }
142-
143- srv := server .NewServer (
144- server .WithPort (portInt ),
145- server .WithPrometheusMetricsCtrlRuntime (),
146- server .WithSimpleHealth (),
147- )
148-
149- g , gCtx := errgroup .WithContext (ctx )
150-
151- g .Go (func () error {
152- slog .Info ("Starting metrics server" , "port" , portInt )
153-
154- if err := srv .Serve (gCtx ); err != nil {
155- slog .Error ("Metrics server failed - continuing without metrics" , "error" , err )
156- }
157-
158- return nil
159- })
160-
161- g .Go (func () error {
162- components .FaultRemediationReconciler .StartWatcherStream (gCtx )
163-
164- slog .Info ("Listening for events on the channel..." )
165-
166- for event := range components .FaultRemediationReconciler .Watcher .Events () {
167- slog .Info ("Event received" , "event" , event )
168- _ , _ = components .FaultRemediationReconciler .Reconcile (gCtx , & event )
169- }
103+ func setupCtrlRuntimeManagement (ctx context.Context , params initializer.InitializationParams ) error {
104+ slog .Info ("Running in controller runtime managed mode" )
170105
171- return nil
106+ cfg := ctrl .GetConfigOrDie ()
107+ cfg .Wrap (func (rt http.RoundTripper ) http.RoundTripper {
108+ return auditlogger .NewAuditingRoundTripper (rt )
172109 })
173110
174- return g .Wait ()
175- }
176-
177- func setupCtrlRuntimeManagement (ctx context.Context , components * initializer.Components ) error {
178- slog .Info ("Running in controller runtime managed mode" )
179-
180- mgr , err := ctrl .NewManager (ctrl .GetConfigOrDie (), ctrl.Options {
111+ //TODO: setup informers for node and job
112+ mgr , err := ctrl .NewManager (cfg , ctrl.Options {
181113 Scheme : scheme ,
182114 Metrics : metricsserver.Options {
183115 BindAddress : metricsAddr ,
@@ -205,6 +137,19 @@ func setupCtrlRuntimeManagement(ctx context.Context, components *initializer.Com
205137 return err
206138 }
207139
140+ components , err := initializer .InitializeAll (ctx , params , mgr .GetClient ())
141+ if err != nil {
142+ return fmt .Errorf ("initialization failed: %w" , err )
143+ }
144+
145+ reconciler := components .FaultRemediationReconciler
146+
147+ defer func () {
148+ if err := reconciler .CloseAll (ctx ); err != nil {
149+ slog .Error ("failed to close datastore components" , "error" , err )
150+ }
151+ }()
152+
208153 err = components .FaultRemediationReconciler .SetupWithManager (ctx , mgr )
209154 if err != nil {
210155 return fmt .Errorf ("SetupWithManager failed: %w" , err )
@@ -235,20 +180,13 @@ func parseFlags() {
235180 " (otherwise metrics and health are on same port)." ,
236181 )
237182
238- flag .StringVar (& kubeconfigPath , "kubeconfig-path " , "" , "path to kubeconfig file" )
183+ flag .StringVar (& kubeconfigPath , "kubeconfig" , "" , "path to kubeconfig file" )
239184
240185 flag .StringVar (& tomlConfigPath , "config-path" , "/etc/config/config.toml" ,
241186 "path where the fault remediation config file is present" )
242187
243188 flag .BoolVar (& dryRun , "dry-run" , false , "flag to run fault remediation module in dry-run mode." )
244189
245- flag .BoolVar (
246- & enableControllerRuntime ,
247- "controller-runtime" ,
248- false ,
249- "Enable controller runtime management of the reconciler." ,
250- )
251-
252190 flag .BoolVar (
253191 & enableLeaderElection ,
254192 "leader-elect" ,
0 commit comments