@@ -16,26 +16,22 @@ package main
1616
1717import (
1818 "context"
19- "errors"
2019 "flag"
2120 "fmt"
2221 "log/slog"
22+ "net/http"
2323 "os"
2424 "os/signal"
25- "strconv"
26- "strings"
2725 "syscall"
2826 "time"
2927
30- "golang.org/x/sync/errgroup"
3128 "k8s.io/apimachinery/pkg/runtime"
3229 ctrl "sigs.k8s.io/controller-runtime"
3330 "sigs.k8s.io/controller-runtime/pkg/healthz"
3431 metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server"
3532
3633 "github.com/nvidia/nvsentinel/commons/pkg/auditlogger"
3734 "github.com/nvidia/nvsentinel/commons/pkg/logger"
38- "github.com/nvidia/nvsentinel/commons/pkg/server"
3935 "github.com/nvidia/nvsentinel/fault-remediation/pkg/initializer"
4036)
4137
4945
5046 // These variables are populated by parsing flags
5147 enableLeaderElection bool
52- enableControllerRuntime bool
5348 leaderElectionLeaseDuration time.Duration
5449 leaderElectionRenewDeadline time.Duration
5550 leaderElectionRetryPeriod time.Duration
@@ -88,96 +83,27 @@ func main() {
8883func run () error {
8984 parseFlags ()
9085
91- if ! enableControllerRuntime && enableLeaderElection {
92- return errors .New ("leader-election requires controller-runtime" )
93- }
94-
9586 ctx , stop := signal .NotifyContext (context .Background (), syscall .SIGINT , syscall .SIGTERM )
9687 defer stop ()
9788
98- params := initializer.InitializationParams {
99- KubeconfigPath : kubeconfigPath ,
100- TomlConfigPath : tomlConfigPath ,
101- DryRun : dryRun ,
102- EnableLogCollector : enableLogCollector ,
103- }
104-
105- components , err := initializer .InitializeAll (ctx , params )
89+ err := setupCtrlRuntimeManagement (ctx )
10690 if err != nil {
107- return fmt .Errorf ("initialization failed: %w" , err )
108- }
109-
110- reconciler := components .FaultRemediationReconciler
111-
112- defer func () {
113- if err := reconciler .CloseAll (ctx ); err != nil {
114- slog .Error ("failed to close datastore components" , "error" , err )
115- }
116- }()
117-
118- if enableControllerRuntime {
119- err = setupCtrlRuntimeManagement (ctx , components )
120- if err != nil {
121- return err
122- }
123- } else {
124- err = setupNonCtrlRuntimeManaged (ctx , components )
125- if err != nil {
126- return err
127- }
91+ return err
12892 }
12993
13094 return nil
13195}
13296
133- func setupNonCtrlRuntimeManaged (ctx context.Context , components * initializer.Components ) error {
134- slog .Info ("Running without controller runtime management" )
135-
136- metricsAddr = strings .TrimPrefix (metricsAddr , ":" )
137-
138- portInt , err := strconv .Atoi (metricsAddr )
139- if err != nil {
140- return fmt .Errorf ("invalid metrics port: %w" , err )
141- }
142-
143- srv := server .NewServer (
144- server .WithPort (portInt ),
145- server .WithPrometheusMetricsCtrlRuntime (),
146- server .WithSimpleHealth (),
147- )
148-
149- g , gCtx := errgroup .WithContext (ctx )
150-
151- g .Go (func () error {
152- slog .Info ("Starting metrics server" , "port" , portInt )
153-
154- if err := srv .Serve (gCtx ); err != nil {
155- slog .Error ("Metrics server failed - continuing without metrics" , "error" , err )
156- }
157-
158- return nil
159- })
160-
161- g .Go (func () error {
162- components .FaultRemediationReconciler .StartWatcherStream (gCtx )
163-
164- slog .Info ("Listening for events on the channel..." )
165-
166- for event := range components .FaultRemediationReconciler .Watcher .Events () {
167- slog .Info ("Event received" , "event" , event )
168- _ , _ = components .FaultRemediationReconciler .Reconcile (gCtx , & event )
169- }
97+ func setupCtrlRuntimeManagement (ctx context.Context ) error {
98+ slog .Info ("Running in controller runtime managed mode" )
17099
171- return nil
100+ cfg := ctrl .GetConfigOrDie ()
101+ cfg .Wrap (func (rt http.RoundTripper ) http.RoundTripper {
102+ return auditlogger .NewAuditingRoundTripper (rt )
172103 })
173104
174- return g .Wait ()
175- }
176-
177- func setupCtrlRuntimeManagement (ctx context.Context , components * initializer.Components ) error {
178- slog .Info ("Running in controller runtime managed mode" )
179-
180- mgr , err := ctrl .NewManager (ctrl .GetConfigOrDie (), ctrl.Options {
105+ //TODO: setup informers for node and job
106+ mgr , err := ctrl .NewManager (cfg , ctrl.Options {
181107 Scheme : scheme ,
182108 Metrics : metricsserver.Options {
183109 BindAddress : metricsAddr ,
@@ -205,6 +131,26 @@ func setupCtrlRuntimeManagement(ctx context.Context, components *initializer.Com
205131 return err
206132 }
207133
134+ params := initializer.InitializationParams {
135+ TomlConfigPath : tomlConfigPath ,
136+ DryRun : dryRun ,
137+ EnableLogCollector : enableLogCollector ,
138+ Config : mgr .GetConfig (),
139+ }
140+
141+ components , err := initializer .InitializeAll (ctx , params , mgr .GetClient ())
142+ if err != nil {
143+ return fmt .Errorf ("initialization failed: %w" , err )
144+ }
145+
146+ reconciler := components .FaultRemediationReconciler
147+
148+ defer func () {
149+ if err := reconciler .CloseAll (ctx ); err != nil {
150+ slog .Error ("failed to close datastore components" , "error" , err )
151+ }
152+ }()
153+
208154 err = components .FaultRemediationReconciler .SetupWithManager (ctx , mgr )
209155 if err != nil {
210156 return fmt .Errorf ("SetupWithManager failed: %w" , err )
@@ -235,20 +181,13 @@ func parseFlags() {
235181 " (otherwise metrics and health are on same port)." ,
236182 )
237183
238- flag .StringVar (& kubeconfigPath , "kubeconfig-path " , "" , "path to kubeconfig file" )
184+ flag .StringVar (& kubeconfigPath , "kubeconfig" , "" , "path to kubeconfig file" )
239185
240186 flag .StringVar (& tomlConfigPath , "config-path" , "/etc/config/config.toml" ,
241187 "path where the fault remediation config file is present" )
242188
243189 flag .BoolVar (& dryRun , "dry-run" , false , "flag to run fault remediation module in dry-run mode." )
244190
245- flag .BoolVar (
246- & enableControllerRuntime ,
247- "controller-runtime" ,
248- false ,
249- "Enable controller runtime management of the reconciler." ,
250- )
251-
252191 flag .BoolVar (
253192 & enableLeaderElection ,
254193 "leader-elect" ,
0 commit comments