Skip to content

Commit c657396

Browse files
authored
Merge pull request #851 from herb-duan/feat/controller-leader-election
feat(controller): Add leader election for high availability
2 parents aded8ca + 3ec74e0 commit c657396

File tree

14 files changed

+1599
-4
lines changed

14 files changed

+1599
-4
lines changed

cmd/compute-domain-controller/main.go

Lines changed: 119 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,10 +27,14 @@ import (
2727
"path"
2828
"syscall"
2929

30+
"github.com/google/uuid"
3031
"github.com/prometheus/client_golang/prometheus"
3132
"github.com/prometheus/client_golang/prometheus/promhttp"
3233
"github.com/urfave/cli/v2"
3334

35+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
36+
"k8s.io/client-go/tools/leaderelection"
37+
"k8s.io/client-go/tools/leaderelection/resourcelock"
3438
"k8s.io/component-base/logs"
3539
"k8s.io/component-base/metrics/legacyregistry"
3640
"k8s.io/klog/v2"
@@ -56,7 +60,8 @@ const (
5660
)
5761

5862
type Flags struct {
59-
kubeClientConfig pkgflags.KubeClientConfig
63+
kubeClientConfig pkgflags.KubeClientConfig
64+
leaderElectionConfig pkgflags.LeaderElectionConfig
6065

6166
podName string
6267
namespace string
@@ -157,6 +162,7 @@ func newApp() *cli.App {
157162
},
158163
}
159164

165+
cliFlags = append(cliFlags, flags.leaderElectionConfig.Flags()...)
160166
cliFlags = append(cliFlags, flags.kubeClientConfig.Flags()...)
161167
cliFlags = append(cliFlags, featureGateConfig.Flags()...)
162168
cliFlags = append(cliFlags, loggingConfig.Flags()...)
@@ -217,12 +223,19 @@ func newApp() *cli.App {
217223
controller := NewController(config)
218224
ctx, cancel := context.WithCancel(c.Context)
219225
go func() {
220-
errChan <- controller.Run(ctx)
226+
// Fallback to standalone mode if leader election is disabled
227+
if !config.flags.leaderElectionConfig.Enabled {
228+
klog.Info("Leader election disabled, starting controller directly")
229+
errChan <- controller.Run(ctx)
230+
return
231+
}
232+
errChan <- runWithLeaderElection(ctx, config, controller)
221233
}()
222234

223235
for {
224236
select {
225-
case <-sigs:
237+
case sig := <-sigs:
238+
klog.InfoS("Received signal, shutting down", "signal", sig.String())
226239
cancel()
227240
case err := <-errChan:
228241
cancel()
@@ -253,6 +266,109 @@ func newApp() *cli.App {
253266
return app
254267
}
255268

269+
func runWithLeaderElection(ctx context.Context, config *Config, controller *Controller) error {
270+
klog.Info("Leader election enabled")
271+
// Unique identity: PodName + UUID to prevent conflicts on restarts
272+
id := uuid.New().String()
273+
lockID := fmt.Sprintf("%s-%s", config.flags.podName, id)
274+
klog.InfoS("Leader election candidate registered", "lockID", lockID,
275+
"leaseName", config.flags.leaderElectionConfig.LeaseLockName,
276+
"leaseNamespace", config.flags.leaderElectionConfig.LeaseLockNamespace)
277+
278+
// electorCtx controls the lifecycle of the leader election loop
279+
electorCtx, cancelElector := context.WithCancel(ctx)
280+
// Standard defer to ensure resources are cleaned up on function exit
281+
defer cancelElector()
282+
283+
lock := &resourcelock.LeaseLock{
284+
LeaseMeta: metav1.ObjectMeta{
285+
Name: config.flags.leaderElectionConfig.LeaseLockName,
286+
Namespace: config.flags.leaderElectionConfig.LeaseLockNamespace,
287+
},
288+
Client: config.clientsets.Core.CoordinationV1(),
289+
LockConfig: resourcelock.ResourceLockConfig{
290+
Identity: lockID,
291+
},
292+
}
293+
294+
controllerErrCh := make(chan error, 1)
295+
callbacks := leaderelection.LeaderCallbacks{
296+
OnStartedLeading: func(leaderCtx context.Context) {
297+
klog.InfoS("Became leader, starting controller", "lockID", lockID)
298+
299+
// ARCHITECTURE NOTE:
300+
// We use cancelElector() to ensure that if the controller logic exits
301+
// (either gracefully or with an error), the entire leader election loop
302+
// terminates. This triggers ReleaseOnCancel, clearing the lease holder
303+
// identity and allowing standby replicas to take over immediately.
304+
//
305+
// By returning from run() after elector.Run() finishes, we rely on
306+
// Kubernetes to restart the Pod, ensuring a clean in-memory state
307+
// for the next leadership term.
308+
defer cancelElector()
309+
310+
// NOTE: Use leaderCtx provided by the callback.
311+
// It is automatically cancelled if leadership is lost.
312+
if err := controller.Run(leaderCtx); err != nil {
313+
select {
314+
case controllerErrCh <- err:
315+
default:
316+
}
317+
klog.ErrorS(err, "Controller exited with error", "lockID", lockID)
318+
} else {
319+
klog.InfoS("Controller exited gracefully", "lockID", lockID)
320+
}
321+
},
322+
OnStoppedLeading: func() {
323+
// ARCHITECTURE NOTE:
324+
// We only log here. The actual shutdown of the controller is handled by the
325+
// cancellation of the leaderCtx passed to OnStartedLeading.
326+
// When leadership is lost, the library cancels that context, triggering
327+
// the controller's graceful shutdown logic.
328+
klog.Warningf("Stopped leading, lockID: %s", lockID)
329+
},
330+
OnNewLeader: func(identity string) {
331+
// OnNewLeader is called when a new leader is observed.
332+
// We ignore the case where the "new" leader is ourselves to avoid
333+
// redundant logs during initial election or re-election.
334+
if identity == lockID {
335+
klog.V(6).InfoS("OnNewLeader callback: observed leader is still ourselves", "lockID", lockID)
336+
return
337+
}
338+
klog.InfoS("New leader elected", "leader", identity, "currentCandidate", lockID)
339+
},
340+
}
341+
342+
elector, err := leaderelection.NewLeaderElector(leaderelection.LeaderElectionConfig{
343+
Lock: lock,
344+
LeaseDuration: config.flags.leaderElectionConfig.LeaseDuration,
345+
RenewDeadline: config.flags.leaderElectionConfig.RenewDeadline,
346+
RetryPeriod: config.flags.leaderElectionConfig.RetryPeriod,
347+
Name: config.flags.leaderElectionConfig.LeaseLockName,
348+
Callbacks: callbacks,
349+
ReleaseOnCancel: true, // Steps down immediately by clearing the Lease holder
350+
})
351+
if err != nil {
352+
return fmt.Errorf("failed to create leader elector: %w", err)
353+
}
354+
355+
// Block until electorCtx is cancelled or leadership is lost
356+
klog.InfoS("Starting leader election loop", "lockID", lockID)
357+
elector.Run(electorCtx)
358+
359+
// If exiting due to a controller failure, propagate the error to main
360+
select {
361+
case err := <-controllerErrCh:
362+
if err != nil {
363+
klog.ErrorS(err, "Process exiting due to controller failure")
364+
return fmt.Errorf("controller execution failed: %w", err)
365+
}
366+
default:
367+
}
368+
klog.InfoS("Leader election loop ended gracefully", "lockID", lockID)
369+
return nil
370+
}
371+
256372
func SetupHTTPEndpoint(config *Config) error {
257373
if config.flags.metricsPath != "" {
258374
// To collect metrics data from the metric handler itself, we

deployments/helm/nvidia-dra-driver-gpu/templates/controller.yaml

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ metadata:
2222
labels:
2323
{{- include "nvidia-dra-driver-gpu.labels" . | nindent 4 }}
2424
spec:
25-
replicas: 1
25+
replicas: {{ .Values.controller.replicas }}
2626
selector:
2727
matchLabels:
2828
{{- include "nvidia-dra-driver-gpu.selectorLabels" (dict "context" . "componentName" "controller") | nindent 6 }}
@@ -87,6 +87,18 @@ spec:
8787
{{- with .Values.controller.containers.computeDomain.env }}
8888
{{- toYaml . | nindent 8 }}
8989
{{- end }}
90+
- name: LEADER_ELECTION_ENABLED
91+
value: "{{ .Values.controller.leaderElection.enabled }}"
92+
- name: LEADER_ELECTION_LEASE_LOCK_NAME
93+
value: "{{ include "nvidia-dra-driver-gpu.name" . }}-controller"
94+
- name: LEADER_ELECTION_LEASE_LOCK_NAMESPACE
95+
value: "{{ include "nvidia-dra-driver-gpu.namespace" . }}"
96+
- name: LEADER_ELECTION_LEASE_DURATION
97+
value: "{{ .Values.controller.leaderElection.leaseDuration }}"
98+
- name: LEADER_ELECTION_RENEW_DEADLINE
99+
value: "{{ .Values.controller.leaderElection.renewDeadline }}"
100+
- name: LEADER_ELECTION_RETRY_PERIOD
101+
value: "{{ .Values.controller.leaderElection.retryPeriod }}"
90102
{{- with .Values.controller.nodeSelector }}
91103
nodeSelector:
92104
{{- toYaml . | nindent 8 }}

deployments/helm/nvidia-dra-driver-gpu/templates/rbac-controller.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,9 @@ rules:
1414
- apiGroups: ["resource.k8s.io"]
1515
resources: ["resourceclaimtemplates"]
1616
verbs: ["get", "list", "watch", "create", "update", "delete"]
17+
- apiGroups: ["coordination.k8s.io"]
18+
resources: ["leases"]
19+
verbs: ["get", "create", "update"]
1720
- apiGroups: [""]
1821
resources: ["nodes"]
1922
verbs: ["get", "list", "watch", "update"]

deployments/helm/nvidia-dra-driver-gpu/values.yaml

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -185,6 +185,12 @@ webhook:
185185
caBundle: ""
186186

187187
controller:
188+
replicas: 1
189+
leaderElection:
190+
enabled: false
191+
leaseDuration: "15s"
192+
renewDeadline: "10s"
193+
retryPeriod: "2s"
188194
priorityClassName: "system-node-critical"
189195
podAnnotations: {}
190196
podSecurityContext: {}
@@ -208,6 +214,14 @@ controller:
208214
- matchExpressions:
209215
- key: "node-role.kubernetes.io/control-plane"
210216
operator: "Exists"
217+
podAntiAffinity:
218+
preferredDuringSchedulingIgnoredDuringExecution:
219+
- weight: 100
220+
podAffinityTerm:
221+
labelSelector:
222+
matchLabels:
223+
nvidia-dra-driver-gpu-component: controller
224+
topologyKey: kubernetes.io/hostname
211225
# Network policy settings
212226
networkPolicy:
213227
# If the network policy is enabled or not

pkg/flags/leaderelection.go

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
/*
2+
* Copyright 2025 NVIDIA CORPORATION.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package flags
18+
19+
import (
20+
"time"
21+
22+
"github.com/urfave/cli/v2"
23+
)
24+
25+
// LeaderElectionConfig holds the command-line-configurable settings for
// client-go leader election. Populate it by registering the flags returned
// by Flags() with the application's CLI.
type LeaderElectionConfig struct {
	// Enabled turns leader election on; when false the component runs standalone.
	Enabled bool
	// LeaseLockName is the name of the coordination.k8s.io Lease used as the lock.
	LeaseLockName string
	// LeaseLockNamespace is the namespace the Lease object lives in.
	LeaseLockNamespace string
	// LeaseDuration is how long non-leader candidates wait before force-acquiring leadership.
	LeaseDuration time.Duration
	// RenewDeadline is how long the acting leader retries refreshing leadership before giving up.
	RenewDeadline time.Duration
	// RetryPeriod is the wait between individual leader election actions.
	RetryPeriod time.Duration
}
33+
34+
func (l *LeaderElectionConfig) Flags() []cli.Flag {
35+
return []cli.Flag{
36+
&cli.BoolFlag{
37+
Category: "Leader election:",
38+
Name: "leader-election-enabled",
39+
Usage: "Start a leader election client and gain leadership before executing the main loop. Enable this when running replicated components for high availability.",
40+
Value: false,
41+
Destination: &l.Enabled,
42+
EnvVars: []string{"LEADER_ELECTION_ENABLED"},
43+
},
44+
&cli.StringFlag{
45+
Category: "Leader election:",
46+
Name: "leader-election-lease-lock-namespace",
47+
Usage: "The lease lock resource namespace.",
48+
Value: "default",
49+
Destination: &l.LeaseLockNamespace,
50+
EnvVars: []string{"LEADER_ELECTION_LEASE_LOCK_NAMESPACE"},
51+
},
52+
&cli.StringFlag{
53+
Category: "Leader election:",
54+
Name: "leader-election-lease-lock-name",
55+
Usage: "The lease lock resource name.",
56+
Value: "nvidia-compute-domain-controller",
57+
Destination: &l.LeaseLockName,
58+
EnvVars: []string{"LEADER_ELECTION_LEASE_LOCK_NAME"},
59+
},
60+
&cli.DurationFlag{
61+
Category: "Leader election:",
62+
Name: "leader-election-lease-duration",
63+
Usage: "The duration that non-leader candidates will wait to force acquire leadership. This is measured against time of last observed ack.",
64+
Value: 15 * time.Second,
65+
Destination: &l.LeaseDuration,
66+
EnvVars: []string{"LEADER_ELECTION_LEASE_DURATION"},
67+
},
68+
&cli.DurationFlag{
69+
Category: "Leader election:",
70+
Name: "leader-election-renew-deadline",
71+
Usage: "The duration that the acting controlplane will retry refreshing leadership before giving up.",
72+
Value: 10 * time.Second,
73+
Destination: &l.RenewDeadline,
74+
EnvVars: []string{"LEADER_ELECTION_RENEW_DEADLINE"},
75+
},
76+
&cli.DurationFlag{
77+
Category: "Leader election:",
78+
Name: "leader-election-retry-period",
79+
Usage: "The duration the LeaderElector clients should wait between tries of actions.",
80+
Value: 2 * time.Second,
81+
Destination: &l.RetryPeriod,
82+
EnvVars: []string{"LEADER_ELECTION_RETRY_PERIOD"},
83+
},
84+
}
85+
}

vendor/k8s.io/client-go/tools/leaderelection/OWNERS

Lines changed: 13 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)