forked from kubernetes-sigs/dra-driver-nvidia-gpu
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcontroller.go
More file actions
114 lines (87 loc) · 3.74 KB
/
controller.go
File metadata and controls
114 lines (87 loc) · 3.74 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
/*
Copyright The Kubernetes Authors
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
https://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package main implements a Kubernetes Dynamic Resource Allocation (DRA) driver controller.
package main
import (
"context"
"fmt"
"k8s.io/klog/v2"
"sigs.k8s.io/nvidia-dra-driver-gpu/pkg/flags"
"sigs.k8s.io/nvidia-dra-driver-gpu/pkg/workqueue"
)
// ManagerConfig bundles the settings that every manager receives: the driver's
// identity, the Kubernetes clients, the shared work queue, and the options for
// the diagnostics HTTP server.
type ManagerConfig struct {
	// driverName uniquely identifies this DRA driver.
	driverName string

	// driverNamespace is the Kubernetes namespace the driver operates in.
	driverNamespace string

	// imageName is the full image name used when rendering templates.
	imageName string

	// maxNodesPerIMEXDomain caps how many nodes may be allocated to a single
	// IMEX domain.
	maxNodesPerIMEXDomain int

	// clientsets gives access to the various Kubernetes API client interfaces.
	clientsets flags.ClientSets

	// workQueue handles the asynchronous processing of tasks.
	workQueue *workqueue.WorkQueue

	// additionalNamespaces lists further namespaces in which the driver can
	// manage resources.
	additionalNamespaces []string

	// logVerbosityCDDaemon sets the log verbosity of dynamically launched
	// ComputeDomain daemons.
	logVerbosityCDDaemon int

	// httpEndpoint is the TCP network address on which the diagnostics HTTP
	// server (including pprof and metrics) listens.
	httpEndpoint string

	// metricsPath is the HTTP path under which Prometheus metrics are served.
	metricsPath string
}
// Controller is the entry point of the DRA driver controller. It carries the
// configuration that Run uses to set up and drive the driver's components.
type Controller struct {
	// config is the configuration supplied to NewController.
	config *Config
}
// NewController returns a Controller that holds the given configuration.
// The config pointer is stored as-is; it is not copied or validated here.
func NewController(config *Config) *Controller {
	ctrl := new(Controller)
	ctrl.config = config
	return ctrl
}
// Run wires up and drives the controller's components. It creates a work queue
// with the default controller rate limiter, assembles a ManagerConfig from the
// controller's config and its flags, logs that config, and starts the
// ComputeDomain manager. It then calls the work queue's Run with ctx and, once
// that call returns, stops the ComputeDomain manager.
//
// An error from starting or stopping the ComputeDomain manager is returned
// wrapped; otherwise Run returns nil.
func (c *Controller) Run(ctx context.Context) error {
	queue := workqueue.New(workqueue.DefaultControllerRateLimiter())

	mgrCfg := &ManagerConfig{
		driverName:            c.config.driverName,
		driverNamespace:       c.config.flags.namespace,
		imageName:             c.config.flags.imageName,
		maxNodesPerIMEXDomain: c.config.flags.maxNodesPerIMEXDomain,
		clientsets:            c.config.clientsets,
		workQueue:             queue,
		additionalNamespaces:  c.config.flags.additionalNamespaces.Value(),
		logVerbosityCDDaemon:  c.config.flags.logVerbosityCDDaemon,
		httpEndpoint:          c.config.flags.httpEndpoint,
		metricsPath:           c.config.flags.metricsPath,
	}

	// TODO: log full, nested cliFlags structure.
	klog.Infof("controller manager config: %+v", mgrCfg)

	manager := NewComputeDomainManager(mgrCfg)

	err := manager.Start(ctx)
	if err != nil {
		return fmt.Errorf("error starting ComputeDomain manager: %w", err)
	}

	// NOTE(review): presumably this blocks until ctx is cancelled, which is
	// what makes the Stop call below the shutdown path — confirm against the
	// workqueue package.
	queue.Run(ctx)

	err = manager.Stop()
	if err != nil {
		return fmt.Errorf("error stopping ComputeDomain manager: %w", err)
	}

	return nil
}