Skip to content

Commit 48914e5

Browse files
author
Igor Velichkovich
committed
fix(remediation): use ctrl-runtime business logic
Signed-off-by: Igor Velichkovich <[email protected]>
1 parent 47f275b commit 48914e5

30 files changed

+2054
-2228
lines changed

.gitignore

Lines changed: 1 addition & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -140,30 +140,7 @@ dist/
140140
### JetBrains IDEs (GoLand, PyCharm, IntelliJ) ###
141141
### JetBrains IDEs (GoLand, PyCharm, IntelliJ) ###
142142
# User-specific stuff
143-
.idea/**/workspace.xml
144-
.idea/**/tasks.xml
145-
.idea/**/usage.statistics.xml
146-
.idea/**/dictionaries
147-
.idea/**/shelf
148-
149-
# AWS User-specific
150-
.idea/**/aws.xml
151-
152-
# Generated files
153-
.idea/**/contentModel.xml
154-
155-
# Sensitive or high-churn files
156-
.idea/**/dataSources/
157-
.idea/**/dataSources.ids
158-
.idea/**/dataSources.local.xml
159-
.idea/**/sqlDataSources.xml
160-
.idea/**/dynamic.xml
161-
.idea/**/uiDesigner.xml
162-
.idea/**/dbnavigator.xml
163-
164-
# Gradle
165-
.idea/**/gradle.xml
166-
.idea/**/libraries
143+
.idea/
167144

168145
# CMake
169146
cmake-build-*/

commons/go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,4 +81,4 @@ require (
8181
replace (
8282
github.com/nvidia/nvsentinel/data-models => ../data-models
8383
github.com/nvidia/nvsentinel/store-client => ../store-client
84-
)
84+
)

commons/go.sum

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -207,4 +207,4 @@ sigs.k8s.io/randfill v1.0.0/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxO
207207
sigs.k8s.io/structured-merge-diff/v6 v6.3.1 h1:JrhdFMqOd/+3ByqlP2I45kTOZmTRLBUm5pvRjeheg7E=
208208
sigs.k8s.io/structured-merge-diff/v6 v6.3.1/go.mod h1:M3W8sfWvn2HhQDIbGWj3S099YozAsymCo/wrT5ohRUE=
209209
sigs.k8s.io/yaml v1.6.0 h1:G8fkbMSAFqgEFgh4b1wmtzDnioxFCUgTZhlbj5P9QYs=
210-
sigs.k8s.io/yaml v1.6.0/go.mod h1:796bPqUfzR/0jLAl6XjHl3Ck7MiyVv8dbTdyT3/pMf4=
210+
sigs.k8s.io/yaml v1.6.0/go.mod h1:796bPqUfzR/0jLAl6XjHl3Ck7MiyVv8dbTdyT3/pMf4=

distros/kubernetes/nvsentinel/charts/fault-remediation/templates/deployment.yaml

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -87,29 +87,24 @@ spec:
8787
args:
8888
- "--dry-run={{ ((.Values.global).dryRun) | default false }}"
8989
- "--enable-log-collector={{ .Values.logCollector.enabled }}"
90-
{{- if (.Values.global).ctrlRuntimeEnabled }}
91-
- "--controller-runtime=true"
9290
- "--leader-elect=true"
93-
{{- end }}
9491
ports:
9592
- name: metrics
9693
containerPort: {{ ((.Values.global).metricsPort) | default 2112 }}
97-
{{- if (.Values.global).ctrlRuntimeEnabled }}
9894
- name: health
9995
containerPort: {{ ((.Values.global).healthPort) | default 9440 }}
100-
{{- end }}
10196
livenessProbe:
10297
httpGet:
10398
path: /healthz
104-
port: {{ ternary "health" "metrics" (default false (.Values.global).ctrlRuntimeEnabled) }}
99+
port: health
105100
initialDelaySeconds: 15
106101
periodSeconds: 20
107102
timeoutSeconds: 5
108103
failureThreshold: 3
109104
readinessProbe:
110105
httpGet:
111-
path: {{ ternary "/readyz" "/healthz" (default false (.Values.global).ctrlRuntimeEnabled) }}
112-
port: {{ ternary "health" "metrics" (default false (.Values.global).ctrlRuntimeEnabled) }}
106+
path: "/readyz"
107+
port: "health"
113108
initialDelaySeconds: 5
114109
periodSeconds: 10
115110
timeoutSeconds: 3

distros/kubernetes/nvsentinel/values-full.yaml

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -56,10 +56,6 @@ global:
5656
# Prometheus should scrape these endpoints to monitor NVSentinel
5757
healthPort: 9440
5858

59-
# Whether or not to run applicable controllers using ctrl-runtime
60-
# this is still experimental
61-
ctrlRuntimeEnabled: false
62-
6359
# Dry-run mode - when enabled, all actions are logged but not executed
6460
# Useful for:
6561
# - Testing configuration changes safely

distros/kubernetes/nvsentinel/values-tilt-mongodb.yaml

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,6 @@ global:
2424
# Keep legacy resource naming for Tilt compatibility
2525
useLegacyResourceNames: true
2626

27-
# Use ctrl runtime in mongo and leave postgres with legacy so both get test coverage
28-
ctrlRuntimeEnabled: true
29-
3027
# Enable certificate rotation for testing hot-reload functionality
3128
certificateRotationEnabled: true
3229

distros/kubernetes/nvsentinel/values-tilt.yaml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,8 +30,6 @@ global:
3030
maxAgeDays: 7
3131
compress: true
3232

33-
ctrlRuntimeEnabled: true
34-
3533
nodeSelector: {}
3634

3735
tolerations:

distros/kubernetes/nvsentinel/values.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@
1414

1515
global:
1616
dryRun: false
17-
ctrlRuntimeEnabled: false
1817
image:
1918
tag: "main"
2019
initContainerImage:

fault-remediation/main.go

Lines changed: 31 additions & 92 deletions
Original file line numberDiff line numberDiff line change
@@ -16,26 +16,22 @@ package main
1616

1717
import (
1818
"context"
19-
"errors"
2019
"flag"
2120
"fmt"
2221
"log/slog"
22+
"net/http"
2323
"os"
2424
"os/signal"
25-
"strconv"
26-
"strings"
2725
"syscall"
2826
"time"
2927

30-
"golang.org/x/sync/errgroup"
3128
"k8s.io/apimachinery/pkg/runtime"
3229
ctrl "sigs.k8s.io/controller-runtime"
3330
"sigs.k8s.io/controller-runtime/pkg/healthz"
3431
metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server"
3532

3633
"github.com/nvidia/nvsentinel/commons/pkg/auditlogger"
3734
"github.com/nvidia/nvsentinel/commons/pkg/logger"
38-
"github.com/nvidia/nvsentinel/commons/pkg/server"
3935
"github.com/nvidia/nvsentinel/fault-remediation/pkg/initializer"
4036
)
4137

@@ -49,7 +45,6 @@ var (
4945

5046
// These variables are populated by parsing flags
5147
enableLeaderElection bool
52-
enableControllerRuntime bool
5348
leaderElectionLeaseDuration time.Duration
5449
leaderElectionRenewDeadline time.Duration
5550
leaderElectionRetryPeriod time.Duration
@@ -88,96 +83,27 @@ func main() {
8883
func run() error {
8984
parseFlags()
9085

91-
if !enableControllerRuntime && enableLeaderElection {
92-
return errors.New("leader-election requires controller-runtime")
93-
}
94-
9586
ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
9687
defer stop()
9788

98-
params := initializer.InitializationParams{
99-
KubeconfigPath: kubeconfigPath,
100-
TomlConfigPath: tomlConfigPath,
101-
DryRun: dryRun,
102-
EnableLogCollector: enableLogCollector,
103-
}
104-
105-
components, err := initializer.InitializeAll(ctx, params)
89+
err := setupCtrlRuntimeManagement(ctx)
10690
if err != nil {
107-
return fmt.Errorf("initialization failed: %w", err)
108-
}
109-
110-
reconciler := components.FaultRemediationReconciler
111-
112-
defer func() {
113-
if err := reconciler.CloseAll(ctx); err != nil {
114-
slog.Error("failed to close datastore components", "error", err)
115-
}
116-
}()
117-
118-
if enableControllerRuntime {
119-
err = setupCtrlRuntimeManagement(ctx, components)
120-
if err != nil {
121-
return err
122-
}
123-
} else {
124-
err = setupNonCtrlRuntimeManaged(ctx, components)
125-
if err != nil {
126-
return err
127-
}
91+
return err
12892
}
12993

13094
return nil
13195
}
13296

133-
func setupNonCtrlRuntimeManaged(ctx context.Context, components *initializer.Components) error {
134-
slog.Info("Running without controller runtime management")
135-
136-
metricsAddr = strings.TrimPrefix(metricsAddr, ":")
137-
138-
portInt, err := strconv.Atoi(metricsAddr)
139-
if err != nil {
140-
return fmt.Errorf("invalid metrics port: %w", err)
141-
}
142-
143-
srv := server.NewServer(
144-
server.WithPort(portInt),
145-
server.WithPrometheusMetricsCtrlRuntime(),
146-
server.WithSimpleHealth(),
147-
)
148-
149-
g, gCtx := errgroup.WithContext(ctx)
150-
151-
g.Go(func() error {
152-
slog.Info("Starting metrics server", "port", portInt)
153-
154-
if err := srv.Serve(gCtx); err != nil {
155-
slog.Error("Metrics server failed - continuing without metrics", "error", err)
156-
}
157-
158-
return nil
159-
})
160-
161-
g.Go(func() error {
162-
components.FaultRemediationReconciler.StartWatcherStream(gCtx)
163-
164-
slog.Info("Listening for events on the channel...")
165-
166-
for event := range components.FaultRemediationReconciler.Watcher.Events() {
167-
slog.Info("Event received", "event", event)
168-
_, _ = components.FaultRemediationReconciler.Reconcile(gCtx, &event)
169-
}
97+
func setupCtrlRuntimeManagement(ctx context.Context) error {
98+
slog.Info("Running in controller runtime managed mode")
17099

171-
return nil
100+
cfg := ctrl.GetConfigOrDie()
101+
cfg.Wrap(func(rt http.RoundTripper) http.RoundTripper {
102+
return auditlogger.NewAuditingRoundTripper(rt)
172103
})
173104

174-
return g.Wait()
175-
}
176-
177-
func setupCtrlRuntimeManagement(ctx context.Context, components *initializer.Components) error {
178-
slog.Info("Running in controller runtime managed mode")
179-
180-
mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{
105+
// TODO: set up informers for node and job
106+
mgr, err := ctrl.NewManager(cfg, ctrl.Options{
181107
Scheme: scheme,
182108
Metrics: metricsserver.Options{
183109
BindAddress: metricsAddr,
@@ -205,6 +131,26 @@ func setupCtrlRuntimeManagement(ctx context.Context, components *initializer.Com
205131
return err
206132
}
207133

134+
params := initializer.InitializationParams{
135+
TomlConfigPath: tomlConfigPath,
136+
DryRun: dryRun,
137+
EnableLogCollector: enableLogCollector,
138+
Config: mgr.GetConfig(),
139+
}
140+
141+
components, err := initializer.InitializeAll(ctx, params, mgr.GetClient())
142+
if err != nil {
143+
return fmt.Errorf("initialization failed: %w", err)
144+
}
145+
146+
reconciler := components.FaultRemediationReconciler
147+
148+
defer func() {
149+
if err := reconciler.CloseAll(ctx); err != nil {
150+
slog.Error("failed to close datastore components", "error", err)
151+
}
152+
}()
153+
208154
err = components.FaultRemediationReconciler.SetupWithManager(ctx, mgr)
209155
if err != nil {
210156
return fmt.Errorf("SetupWithManager failed: %w", err)
@@ -235,20 +181,13 @@ func parseFlags() {
235181
" (otherwise metrics and health are on same port).",
236182
)
237183

238-
flag.StringVar(&kubeconfigPath, "kubeconfig-path", "", "path to kubeconfig file")
184+
flag.StringVar(&kubeconfigPath, "kubeconfig", "", "path to kubeconfig file")
239185

240186
flag.StringVar(&tomlConfigPath, "config-path", "/etc/config/config.toml",
241187
"path where the fault remediation config file is present")
242188

243189
flag.BoolVar(&dryRun, "dry-run", false, "flag to run fault remediation module in dry-run mode.")
244190

245-
flag.BoolVar(
246-
&enableControllerRuntime,
247-
"controller-runtime",
248-
false,
249-
"Enable controller runtime management of the reconciler.",
250-
)
251-
252191
flag.BoolVar(
253192
&enableLeaderElection,
254193
"leader-elect",

0 commit comments

Comments
 (0)