Skip to content

Commit 664a264

Browse files
authored
Merge pull request #115 from rbtr/feat/bbolt-persistence
feat: bbolt persistent pod device configs
2 parents df8cf66 + cf2b12e commit 664a264

16 files changed

Lines changed: 931 additions & 46 deletions

README.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,15 @@ helm upgrade --install dranet ./deployments/helm/dranet -n kube-system
103103

104104
For available configuration options, see the [chart README](deployments/helm/dranet/README.md).
105105

106+
DRANET persists prepared pod device configuration in a local bbolt database so
107+
NRI hooks can continue initialization after a driver restart. The default DB
108+
path is `/var/run/dranet/dranet.db`, so deployment manifests must mount
109+
`/var/run/dranet` as writable storage (hostPath with `DirectoryOrCreate` in the
110+
provided manifests).
111+
112+
You can override the DB path with `--db-path`; set it to an empty
113+
string to disable persistence and use in-memory state.
114+
106115
### How to Use It
107116

108117
Once DRANET is running, you can inspect the network interfaces and their

cmd/dranet/app.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import (
2323
"net/http"
2424
"os"
2525
"os/signal"
26+
"path/filepath"
2627
"reflect"
2728
"runtime/debug"
2829
"sync/atomic"
@@ -54,6 +55,7 @@ var (
5455
kubeconfig string
5556
bindAddress string
5657
celExpression string
58+
dbPath string
5759
minPollInterval time.Duration
5860
maxPollInterval time.Duration
5961
pollBurst int
@@ -68,6 +70,7 @@ func init() {
6870
flag.StringVar(&bindAddress, "bind-address", ":9177", "The IP address and port for the metrics and healthz server to serve on")
6971
flag.StringVar(&hostnameOverride, "hostname-override", "", "If non-empty, will be used as the name of the Node that kube-network-policies is running on. If unset, the node name is assumed to be the same as the node's hostname.")
7072
flag.StringVar(&celExpression, "filter", `!("dra.net/type" in attributes) || attributes["dra.net/type"].StringValue != "veth"`, "CEL expression to filter network interface attributes (v1.DeviceAttribute).")
73+
flag.StringVar(&dbPath, "db-path", filepath.Join("/var/run/dranet", "dranet.db"), "Path to the persistent bbolt database file. Set to an empty string to disable persistence and use in-memory state.")
7174
flag.DurationVar(&minPollInterval, "inventory-min-poll-interval", 2*time.Second, "The minimum interval between two consecutive polls of the inventory.")
7275
flag.DurationVar(&maxPollInterval, "inventory-max-poll-interval", 1*time.Minute, "The maximum interval between two consecutive polls of the inventory.")
7376
flag.IntVar(&pollBurst, "inventory-poll-burst", 5, "The number of polls that can be run in a burst.")
@@ -143,6 +146,11 @@ func main() {
143146
signal.Notify(signalCh, syscall.SIGINT, syscall.SIGTERM)
144147

145148
opts := []driver.Option{}
149+
150+
if dbPath != "" {
151+
opts = append(opts, driver.WithDBPath(dbPath))
152+
}
153+
146154
if celExpression != "" {
147155
env, err := cel.NewEnv(
148156
ext.NativeTypes(

examples/dranetctl-install.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,8 @@ spec:
144144
- name: netns
145145
mountPath: /var/run/netns
146146
mountPropagation: HostToContainer
147+
- name: dranet-run
148+
mountPath: /var/run/dranet
147149
volumes:
148150
- name: device-plugin
149151
hostPath:
@@ -157,6 +159,10 @@ spec:
157159
- name: netns
158160
hostPath:
159161
path: /var/run/netns
162+
- name: dranet-run
163+
hostPath:
164+
path: /var/run/dranet
165+
type: DirectoryOrCreate
160166
- name: etc
161167
hostPath:
162168
path: /etc

go.mod

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ require (
2222
github.com/spf13/pflag v1.0.10
2323
github.com/vishvananda/netlink v1.3.1
2424
github.com/vishvananda/netns v0.0.5
25+
go.etcd.io/bbolt v1.4.3
2526
golang.org/x/sys v0.42.0
2627
golang.org/x/time v0.15.0
2728
google.golang.org/api v0.273.1

go.sum

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,8 @@ github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM=
186186
github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg=
187187
github.com/yusufpapurcu/wmi v1.2.4 h1:zFUKzehAFReQwLys1b/iSMl+JQGSCSjtVqQn9bBrPo0=
188188
github.com/yusufpapurcu/wmi v1.2.4/go.mod h1:SBZ9tNy3G9/m5Oi98Zks0QjeHVDvuK0qfxQmPyzfmi0=
189+
go.etcd.io/bbolt v1.4.3 h1:dEadXpI6G79deX5prL3QRNP6JB8UxVkqo4UPnHaNXJo=
190+
go.etcd.io/bbolt v1.4.3/go.mod h1:tKQlpPaYCVFctUIgFKFnAlvbmB3tpy1vkTnDWohtc0E=
189191
go.etcd.io/etcd/client/pkg/v3 v3.6.5 h1:Duz9fAzIZFhYWgRjp/FgNq2gO1jId9Yae/rLn3RrBP8=
190192
go.etcd.io/etcd/client/pkg/v3 v3.6.5/go.mod h1:8Wx3eGRPiy0qOFMZT/hfvdos+DjEaPxdIDiCDUv/FQk=
191193
go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ64=

install.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,8 @@ spec:
139139
- name: bpf-programs
140140
mountPath: /sys/fs/bpf
141141
mountPropagation: HostToContainer
142+
- name: dranet-run
143+
mountPath: /var/run/dranet
142144
volumes:
143145
- name: device-plugin
144146
hostPath:
@@ -158,4 +160,8 @@ spec:
158160
- name: bpf-programs
159161
hostPath:
160162
path: /sys/fs/bpf
163+
- name: dranet-run
164+
hostPath:
165+
path: /var/run/dranet
166+
type: DirectoryOrCreate
161167
---

pkg/driver/dra_hooks.go

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,8 @@ func (np *NetworkDriver) PublishResources(ctx context.Context) {
6666

6767
resources := resourceslice.DriverResources{
6868
Pools: map[string]resourceslice.Pool{
69-
np.nodeName: {Slices: []resourceslice.Slice{{Devices: devices}}}},
69+
np.nodeName: {Slices: []resourceslice.Slice{{Devices: devices}}},
70+
},
7071
}
7172
err := np.draPlugin.PublishResources(ctx, resources)
7273
if err != nil {
@@ -247,7 +248,9 @@ func (np *NetworkDriver) prepareResourceClaim(ctx context.Context, claim *resour
247248
}
248249
deviceCfg.RDMADevice = buildRDMAConfig(rdmaDevName, charDevices)
249250
for _, uid := range podUIDs {
250-
np.podConfigStore.SetDeviceConfig(uid, result.Device, deviceCfg)
251+
if err := np.podConfigStore.SetDeviceConfig(uid, result.Device, deviceCfg); err != nil {
252+
errorList = append(errorList, fmt.Errorf("failed to persist device config for pod %s device %s: %v", uid, result.Device, err))
253+
}
251254
}
252255
klog.V(4).Infof("IB-only claim resources for pods %v : %#v", podUIDs, deviceCfg)
253256
continue
@@ -395,7 +398,9 @@ func (np *NetworkDriver) prepareResourceClaim(ctx context.Context, claim *resour
395398
// TODO: support for multiple pods sharing the same device
396399
// we'll create the subinterface here
397400
for _, uid := range podUIDs {
398-
np.podConfigStore.SetDeviceConfig(uid, result.Device, deviceCfg)
401+
if err := np.podConfigStore.SetDeviceConfig(uid, result.Device, deviceCfg); err != nil {
402+
errorList = append(errorList, fmt.Errorf("failed to persist device config for pod %s device %s: %v", uid, result.Device, err))
403+
}
399404
}
400405
klog.V(4).Infof("Claim Resources for pods %v : %#v", podUIDs, deviceCfg)
401406
}

pkg/driver/dra_hooks_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,7 @@ func TestUnprepareResourceClaimsMetrics(t *testing.T) {
193193
draPluginRequestsLatencySeconds.Reset()
194194

195195
np := &NetworkDriver{
196-
podConfigStore: NewPodConfigStore(),
196+
podConfigStore: mustNewPodConfigStore(),
197197
}
198198
claimName := types.NamespacedName{Name: "test-claim", Namespace: "test-ns"}
199199
np.podConfigStore.SetDeviceConfig("pod-uid-1", "device-a", DeviceConfig{Claim: claimName})

pkg/driver/driver.go

Lines changed: 34 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,8 +41,7 @@ import (
4141
)
4242

4343
const (
44-
kubeletPluginRegistryPath = "/var/lib/kubelet/plugins_registry"
45-
kubeletPluginPath = "/var/lib/kubelet/plugins"
44+
kubeletPluginPath = "/var/lib/kubelet/plugins"
4645
)
4746

4847
const (
@@ -84,6 +83,14 @@ func WithInventory(db inventoryDB) Option {
8483
}
8584
}
8685

86+
// WithDBPath sets the path for the persistent pod config database.
87+
// If not set, an in-memory store is used.
88+
func WithDBPath(path string) Option {
89+
return func(o *NetworkDriver) {
90+
o.dbPath = path
91+
}
92+
}
93+
8794
type NetworkDriver struct {
8895
driverName string
8996
nodeName string
@@ -98,6 +105,7 @@ type NetworkDriver struct {
98105
// Cache the rdma shared mode state
99106
rdmaSharedMode bool
100107
podConfigStore *PodConfigStore
108+
dbPath string // path for persistent bbolt database; empty means in-memory
101109

102110
clock clock.WithTicker // Injectable clock for testing
103111
}
@@ -120,14 +128,31 @@ func Start(ctx context.Context, driverName string, kubeClient kubernetes.Interfa
120128
nodeName: nodeName,
121129
kubeClient: kubeClient,
122130
rdmaSharedMode: rdmaNetnsMode == apis.RdmaNetnsModeShared,
123-
podConfigStore: NewPodConfigStore(),
124131
clock: clock.RealClock{},
125132
}
126133

127134
for _, o := range opts {
128135
o(plugin)
129136
}
130137

138+
// Initialize the pod config store with optional bbolt checkpoint backend.
139+
var checkpointer Checkpointer
140+
if plugin.dbPath != "" {
141+
var err error
142+
checkpointer, err = newBoltCheckpointer(plugin.dbPath)
143+
if err != nil {
144+
return nil, fmt.Errorf("failed to open pod config database at %s: %v", plugin.dbPath, err)
145+
}
146+
}
147+
store, err := NewPodConfigStore(checkpointer)
148+
if err != nil {
149+
if checkpointer != nil {
150+
checkpointer.Close()
151+
}
152+
return nil, fmt.Errorf("failed to initialize pod config store: %v", err)
153+
}
154+
plugin.podConfigStore = store
155+
131156
driverPluginPath := filepath.Join(kubeletPluginPath, driverName)
132157
err = os.MkdirAll(driverPluginPath, 0750)
133158
if err != nil {
@@ -312,5 +337,11 @@ func (np *NetworkDriver) Stop(ctxCancel context.CancelFunc) {
312337
ctxCancel()
313338

314339
np.nriPlugin.Stop()
340+
341+
// Close the pod config store.
342+
if err := np.podConfigStore.Close(); err != nil {
343+
klog.Errorf("Failed to close pod config database: %v", err)
344+
}
345+
315346
klog.Info("Driver stopped.")
316347
}

pkg/driver/driver_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ func TestStop(t *testing.T) {
107107
np := &NetworkDriver{
108108
draPlugin: fakeDra,
109109
nriPlugin: fakeNri,
110-
podConfigStore: NewPodConfigStore(),
110+
podConfigStore: mustNewPodConfigStore(),
111111
clock: fakeClock,
112112
}
113113

0 commit comments

Comments
 (0)