Skip to content

Commit 061ca24

Browse files
committed
Consolidate pod-level state into podConfigStore
1 parent 46fd516 commit 061ca24

8 files changed

Lines changed: 175 additions & 71 deletions

File tree

pkg/driver/driver.go

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -64,9 +64,6 @@ type inventoryDB interface {
6464
IsIBOnlyDevice(deviceName string) bool
6565
GetRDMADeviceName(deviceName string) (string, error)
6666
GetDeviceConfig(deviceName string) (*apis.NetworkConfig, bool)
67-
AddPodNetNs(podKey string, netNs string)
68-
RemovePodNetNs(podKey string)
69-
GetPodNetNs(podKey string) (netNs string)
7067
RequestRescan()
7168
}
7269

pkg/driver/driver_test.go

Lines changed: 0 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -47,14 +47,12 @@ func (m *fakePluginHelper) RegistrationStatus() *registerapi.RegistrationStatus
4747
// fakeInventoryDB is a fake implementation of the inventoryDB interface for testing.
4848
type fakeInventoryDB struct {
4949
resources chan []resourcev1.Device
50-
podNetNs map[string]string
5150
rescanCalls atomic.Int32
5251
}
5352

5453
func newFakeInventoryDB() *fakeInventoryDB {
5554
return &fakeInventoryDB{
5655
resources: make(chan []resourcev1.Device, 1),
57-
podNetNs: make(map[string]string),
5856
}
5957
}
6058

@@ -70,18 +68,6 @@ func (m *fakeInventoryDB) IsIBOnlyDevice(_ string) bool { return false }
7068

7169
func (m *fakeInventoryDB) GetRDMADeviceName(_ string) (string, error) { return "", nil }
7270

73-
func (m *fakeInventoryDB) AddPodNetNs(podKey string, netNs string) {
74-
m.podNetNs[podKey] = netNs
75-
}
76-
77-
func (m *fakeInventoryDB) RemovePodNetNs(podKey string) {
78-
delete(m.podNetNs, podKey)
79-
}
80-
81-
func (m *fakeInventoryDB) GetPodNetNs(podKey string) string {
82-
return m.podNetNs[podKey]
83-
}
84-
8571
func (m *fakeInventoryDB) GetDeviceConfig(deviceName string) (*apis.NetworkConfig, bool) {
8672
return nil, false
8773
}

pkg/driver/nri_hooks.go

Lines changed: 25 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -42,24 +42,24 @@ func (np *NetworkDriver) Synchronize(_ context.Context, pods []*api.PodSandbox,
4242
klog.Infof("Synchronized state with the runtime (%d pods, %d containers)...",
4343
len(pods), len(containers))
4444

45-
livePods := set.New[types.UID]()
45+
// livePodNetNs map tracks live pods by UID and their network namespace paths.
46+
livePodNetNs := make(map[types.UID]string)
4647
for _, pod := range pods {
4748
klog.Infof("Synchronize Pod %s/%s UID %s", pod.Namespace, pod.Name, pod.Uid)
4849
klog.V(2).Infof("pod %s/%s: namespace=%s ips=%v", pod.GetNamespace(), pod.GetName(), getNetworkNamespace(pod), pod.GetIps())
49-
livePods.Insert(types.UID(pod.Uid))
50-
// get the pod network namespace
51-
ns := getNetworkNamespace(pod)
52-
// host network pods are skipped
53-
if ns != "" {
54-
// store the Pod metadata in the db
55-
np.netdb.AddPodNetNs(podKey(pod), ns)
56-
}
50+
livePodNetNs[types.UID(pod.Uid)] = getNetworkNamespace(pod)
5751
}
5852

59-
// Prune persisted configs for pods that no longer exist in the runtime.
60-
// This handles the case where pods were deleted while the driver was down.
53+
// Process stored pods: update NetNS for live pods, and prune configurations
54+
// for pods that no longer exist in the runtime.
6155
for _, storedUID := range np.podConfigStore.ListPods() {
62-
if !livePods.Has(storedUID) {
56+
ns, isLive := livePodNetNs[storedUID]
57+
58+
if isLive {
59+
if ns != "" {
60+
np.podConfigStore.SetPodNetNs(storedUID, ns)
61+
}
62+
} else {
6363
klog.Infof("Synchronize: pruning stale config for pod %s", storedUID)
6464
np.podConfigStore.DeletePod(storedUID)
6565
}
@@ -152,8 +152,8 @@ func (np *NetworkDriver) runPodSandbox(_ context.Context, pod *api.PodSandbox, p
152152
if ns == "" {
153153
return fmt.Errorf("RunPodSandbox pod %s/%s using host network can not claim host devices", pod.Namespace, pod.Name)
154154
}
155-
// store the Pod metadata in the db
156-
np.netdb.AddPodNetNs(podKey(pod), ns)
155+
// store the Pod network namespace in the pod config store
156+
np.podConfigStore.SetPodNetNs(types.UID(pod.GetUid()), ns)
157157

158158
// Track all the status updates needed for the resource claims of the pod.
159159
statusUpdates := map[types.NamespacedName]*resourceapply.ResourceClaimStatusApplyConfiguration{}
@@ -363,18 +363,23 @@ func (np *NetworkDriver) StopPodSandbox(ctx context.Context, pod *api.PodSandbox
363363
}
364364

365365
func (np *NetworkDriver) stopPodSandbox(_ context.Context, pod *api.PodSandbox, podConfig PodConfig) error {
366-
defer func() {
367-
np.netdb.RemovePodNetNs(podKey(pod))
368-
}()
369366
// get the pod network namespace
370367
ns := getNetworkNamespace(pod)
371368
if ns == "" {
372369
// some versions of containerd do not send the network namespace information on this hook, so
373370
// we work around it using the local copy we have in the pod config store to associate interfaces with Pods via
374371
// the network namespace id.
375-
ns = np.netdb.GetPodNetNs(podKey(pod))
376-
if ns == "" {
377-
klog.Infof("StopPodSandbox pod %s/%s using host network ... skipping", pod.Namespace, pod.Name)
372+
storedNs, ok := np.podConfigStore.GetPodNetNs(types.UID(pod.GetUid()))
373+
if ok {
374+
if storedNs != "" {
375+
ns = storedNs
376+
} else {
377+
// Pod is in the store but no NetNS was recorded for it (e.g. host network pod); nothing to detach
378+
klog.Infof("StopPodSandbox pod %s/%s using host network ... skipping", pod.Namespace, pod.Name)
379+
return nil
380+
}
381+
} else {
382+
klog.Warningf("StopPodSandbox: pod %s/%s (UID %s) not found in podConfigStore when fetching fallback NetNS", pod.Namespace, pod.Name, pod.Uid)
378383
return nil
379384
}
380385
}
@@ -452,7 +457,6 @@ func (np *NetworkDriver) RemovePodSandbox(ctx context.Context, pod *api.PodSandb
452457
}
453458

454459
func (np *NetworkDriver) removePodSandbox(_ context.Context, pod *api.PodSandbox) error {
455-
np.netdb.RemovePodNetNs(podKey(pod))
456460
return nil
457461
}
458462

pkg/driver/nri_hooks_test.go

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -225,6 +225,69 @@ func TestSynchronizePrunesStaleConfigs(t *testing.T) {
225225
}
226226
}
227227

228+
func TestSynchronizeStoresNetNSOnlyForConfiguredPods(t *testing.T) {
229+
store := mustNewPodConfigStore()
230+
231+
// Pod 1: Has device config (configured)
232+
store.SetDeviceConfig("configured-pod", "eth0", DeviceConfig{}) //nolint:errcheck
233+
234+
// Pod 2: Does not have device config (unconfigured)
235+
236+
np := &NetworkDriver{
237+
podConfigStore: store,
238+
netdb: inventory.New(),
239+
}
240+
241+
pods := []*api.PodSandbox{
242+
{
243+
Uid: "configured-pod",
244+
Name: "configured",
245+
Namespace: "default",
246+
Linux: &api.LinuxPodSandbox{
247+
Namespaces: []*api.LinuxNamespace{
248+
{Type: "network", Path: "/var/run/netns/configured"},
249+
},
250+
},
251+
},
252+
{
253+
Uid: "unconfigured-pod",
254+
Name: "unconfigured",
255+
Namespace: "default",
256+
Linux: &api.LinuxPodSandbox{
257+
Namespaces: []*api.LinuxNamespace{
258+
{Type: "network", Path: "/var/run/netns/unconfigured"},
259+
},
260+
},
261+
},
262+
}
263+
264+
_, err := np.Synchronize(context.Background(), pods, nil)
265+
if err != nil {
266+
t.Fatalf("Synchronize() error: %v", err)
267+
}
268+
269+
// Case 1: Configured pod should have its NetNS stored
270+
netns, found := store.GetPodNetNs("configured-pod")
271+
if !found {
272+
t.Error("configured-pod should have its NetNS stored")
273+
}
274+
if netns != "/var/run/netns/configured" {
275+
t.Errorf("expected NetNS /var/run/netns/configured, got %q", netns)
276+
}
277+
278+
// Case 2: Unconfigured pod should NOT have its NetNS stored
279+
_, found = store.GetPodNetNs("unconfigured-pod")
280+
if found {
281+
t.Error("unconfigured-pod should NOT have its NetNS stored")
282+
}
283+
284+
// Also verify it didn't create a skeleton config for unconfigured-pod
285+
_, found = store.GetPodConfig("unconfigured-pod")
286+
if found {
287+
t.Error("unconfigured-pod should NOT have any PodConfig in the store")
288+
}
289+
}
290+
228291
func TestCreateContainerMetrics(t *testing.T) {
229292
testCases := []struct {
230293
name string

pkg/driver/pod_device_config.go

Lines changed: 36 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,10 @@ type PodConfig struct {
3535
// LastNRIActivity timestamp is updated whenever an NRI hook processes
3636
// a container for this Pod. Used to track pod initialization progress.
3737
LastNRIActivity time.Time
38+
39+
// NetNS is the path to the Pod's network namespace as observed by the
40+
// container runtime.
41+
NetNS string `json:"netns,omitempty"`
3842
}
3943

4044
// DeviceConfig holds the set of configurations to be applied for a single
@@ -87,9 +91,11 @@ type LinuxDevice struct {
8791
// daemon restarts. Following the kubelet DRA checkpoint pattern
8892
// (pkg/kubelet/cm/dra/state), the in-memory PodConfigStore is the source
8993
// of truth and the Checkpointer is a write-through backend.
94+
// Note: Pod-level metadata like NetNS is not persisted; it is rebuilt on restart
95+
// via Synchronize(), which receives the live pods from the container runtime.
9096
type Checkpointer interface {
91-
// GetOrCreate returns all persisted pod device configs, or an empty map
92-
// if the checkpoint does not yet exist. Used at startup to restore state.
97+
// GetOrCreate returns all persisted device configs. Used at startup to
98+
// restore state.
9399
GetOrCreate() (map[types.UID]map[string]DeviceConfig, error)
94100
// Store persists the device config for a single pod/device pair.
95101
Store(podUID types.UID, deviceName string, config DeviceConfig) error
@@ -110,7 +116,9 @@ type PodConfigStore struct {
110116
}
111117

112118
// NewPodConfigStore creates a new PodConfigStore. If a Checkpointer is
113-
// provided, existing state is loaded from the checkpoint into memory.
119+
// provided, existing device configs are loaded from the checkpoint into memory.
120+
// Pod-level state is not persisted; NetNS is rebuilt through Synchronize() on
121+
// driver startup, while LastNRIActivity resets to its zero value.
114122
func NewPodConfigStore(checkpointer Checkpointer) (*PodConfigStore, error) {
115123
s := &PodConfigStore{
116124
configs: make(map[types.UID]PodConfig),
@@ -253,9 +261,34 @@ func (s *PodConfigStore) GetPodConfig(podUID types.UID) (PodConfig, bool) {
253261
return PodConfig{
254262
DeviceConfigs: configsCopy,
255263
LastNRIActivity: podConfig.LastNRIActivity,
264+
NetNS: podConfig.NetNS,
256265
}, true
257266
}
258267

268+
// SetPodNetNs stores the Pod's network namespace path in the pod-level config.
269+
// This is in-memory only; pod NetNS is rebuilt from the container runtime on
270+
// driver restart via Synchronize().
271+
func (s *PodConfigStore) SetPodNetNs(podUID types.UID, netns string) {
272+
s.mu.Lock()
273+
defer s.mu.Unlock()
274+
275+
podCfg, ok := s.configs[podUID]
276+
if !ok {
277+
klog.Warningf("SetPodNetNs: pod UID %s not found in store; skipping NetNS update", podUID)
278+
return
279+
}
280+
podCfg.NetNS = netns
281+
s.configs[podUID] = podCfg
282+
}
283+
284+
// GetPodNetNs returns the stored network namespace for the given pod UID.
285+
func (s *PodConfigStore) GetPodNetNs(podUID types.UID) (string, bool) {
286+
s.mu.RLock()
287+
defer s.mu.RUnlock()
288+
podCfg, ok := s.configs[podUID]
289+
return podCfg.NetNS, ok
290+
}
291+
259292
// DeleteClaim removes all configurations associated with a given claim and
260293
// returns the list of Pod UIDs that were associated with it.
261294
// Like DeletePod, checkpoint failures do not prevent in-memory cleanup.

pkg/driver/pod_device_config_bolt_test.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -176,6 +176,7 @@ func TestPodConfigStore_Persistence(t *testing.T) {
176176
t.Fatalf("NewPodConfigStore() error: %v", err)
177177
}
178178
store1.SetDeviceConfig("pod-1", "eth0", config)
179+
store1.SetPodNetNs("pod-1", "/var/run/netns/test-ns")
179180
store1.Close()
180181

181182
// Reopen and verify data was restored from checkpoint.
@@ -204,6 +205,11 @@ func TestPodConfigStore_Persistence(t *testing.T) {
204205
if len(podConfig.DeviceConfigs) != 1 {
205206
t.Errorf("Expected 1 device config after reopen, got %d", len(podConfig.DeviceConfigs))
206207
}
208+
209+
// Verify NetNS is NOT restored (in-memory only)
210+
if podConfig.NetNS != "" {
211+
t.Errorf("Expected NetNS to be empty after reopen, but got %s", podConfig.NetNS)
212+
}
207213
}
208214

209215
// TestPodConfigStore_DeletePodCheckpoints verifies that DeletePod and
@@ -385,3 +391,4 @@ func TestPodConfigStore_NoCheckpointer(t *testing.T) {
385391
t.Errorf("Close() error: %v", err)
386392
}
387393
}
394+

pkg/driver/pod_device_config_test.go

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,49 @@ func TestPodConfigStore_SetAndGet(t *testing.T) {
107107
}
108108
}
109109

110+
// TestPodConfigStore_NetNs verifies that a NetNS path can be stored and retrieved correctly in memory.
111+
func TestPodConfigStore_NetNs(t *testing.T) {
112+
store := mustNewPodConfigStore()
113+
podUID := types.UID("test-pod-uid-1")
114+
netns := "/var/run/netns/test-ns"
115+
116+
// Test Get on non-existent item
117+
_, found := store.GetPodNetNs(podUID)
118+
if found {
119+
t.Errorf("GetPodNetNs() found a netns before SetPodNetNs(), expected not found")
120+
}
121+
122+
// Add a dummy device config so the pod exists in the store
123+
store.SetDeviceConfig(podUID, "dummy-device", DeviceConfig{})
124+
125+
store.SetPodNetNs(podUID, netns)
126+
127+
retrievedNetNs, found := store.GetPodNetNs(podUID)
128+
if !found {
129+
t.Fatalf("GetPodNetNs() did not find netns after SetPodNetNs(), expected found")
130+
}
131+
if retrievedNetNs != netns {
132+
t.Errorf("GetPodNetNs() retrieved %s, want %s", retrievedNetNs, netns)
133+
}
134+
135+
// Test Get with different podUID
136+
_, found = store.GetPodNetNs(types.UID("other-pod-uid"))
137+
if found {
138+
t.Errorf("GetPodNetNs() found netns for wrong podUID, expected not found")
139+
}
140+
141+
// Test overwriting
142+
newNetNs := "/var/run/netns/new-ns"
143+
store.SetPodNetNs(podUID, newNetNs)
144+
retrievedNetNs, found = store.GetPodNetNs(podUID)
145+
if !found {
146+
t.Fatalf("GetPodNetNs() did not find netns after overwrite, expected found")
147+
}
148+
if retrievedNetNs != newNetNs {
149+
t.Errorf("GetPodNetNs() retrieved %s after overwrite, want %s", retrievedNetNs, newNetNs)
150+
}
151+
}
152+
110153
func TestPodConfigStore_DeletePod(t *testing.T) {
111154
store := mustNewPodConfigStore()
112155
podUID1 := types.UID("test-pod-uid-1")

0 commit comments

Comments
 (0)