Commit f5f470b

config-daemon: Restart all instances of device-plugin
When the operator changes the device-plugin Spec (e.g. `.Spec.NodeSelector`), there may briefly be two device-plugin pods for a given node: one terminating and one initializing. If the config-daemon executes `restartDevicePluginPod()` at that moment, it may kill the terminating pod while the initializing one keeps running with the old device-plugin configuration. This can leave one or more resources unadvertised until a manual device-plugin restart occurs.

Make the config-daemon restart all the device-plugin instances it finds for its own node.

Signed-off-by: Andrea Panattoni <[email protected]>
1 parent 31175eb commit f5f470b
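
To make the fix concrete before diving into the diff, here is a minimal, compilable Go sketch of the pattern the commit adopts: list every device-plugin pod scheduled on the node and delete each one, rather than deleting only the first list entry. The helper name restartAllDevicePluginPods, the client wiring, and the hard-coded selectors are illustrative assumptions; the actual implementation is the restartDevicePluginPod() method in pkg/daemon/daemon.go below.

package sketch

import (
	"context"
	"time"

	"k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/client-go/kubernetes"
)

// restartAllDevicePluginPods (hypothetical helper) deletes every
// device-plugin pod scheduled on nodeName and waits for each one to
// disappear, so that a pod still initializing with a stale configuration
// cannot survive the restart.
func restartAllDevicePluginPods(ctx context.Context, c kubernetes.Interface, namespace, nodeName string) error {
	pods, err := c.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{
		LabelSelector: "app=sriov-device-plugin",
		FieldSelector: "spec.nodeName=" + nodeName,
	})
	if err != nil {
		return err
	}

	for _, pod := range pods.Items {
		// Delete the pod; NotFound means it already exited on its own.
		if err := c.CoreV1().Pods(namespace).Delete(ctx, pod.Name, metav1.DeleteOptions{}); err != nil && !errors.IsNotFound(err) {
			return err
		}

		// Poll until the API server no longer returns the pod.
		podName := pod.Name
		if err := wait.PollImmediateUntil(3*time.Second, func() (bool, error) {
			_, err := c.CoreV1().Pods(namespace).Get(ctx, podName, metav1.GetOptions{})
			return errors.IsNotFound(err), nil
		}, ctx.Done()); err != nil {
			return err
		}
	}
	return nil
}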

File tree

2 files changed: +73 additions, -40 deletions


pkg/daemon/daemon.go

Lines changed: 27 additions & 26 deletions
@@ -120,6 +120,7 @@ func New(
 		eventRecorder:   er,
 		featureGate:     featureGates,
 		disabledPlugins: disabledPlugins,
+		mu:              &sync.Mutex{},
 	}
 }

@@ -159,7 +160,6 @@ func (dn *Daemon) Run(stopCh <-chan struct{}, exitCh <-chan error) error {

 	var timeout int64 = 5
 	var metadataKey = "metadata.name"
-	dn.mu = &sync.Mutex{}
 	informerFactory := sninformer.NewFilteredSharedInformerFactory(dn.sriovClient,
 		time.Second*15,
 		vars.Namespace,

@@ -683,7 +683,6 @@ func (dn *Daemon) restartDevicePluginPod() error {
 	defer dn.mu.Unlock()
 	log.Log.V(2).Info("restartDevicePluginPod(): try to restart device plugin pod")

-	var podToDelete string
 	pods, err := dn.kubeClient.CoreV1().Pods(vars.Namespace).List(context.Background(), metav1.ListOptions{
 		LabelSelector: "app=sriov-device-plugin",
 		FieldSelector: "spec.nodeName=" + vars.NodeName,

@@ -702,35 +701,37 @@ func (dn *Daemon) restartDevicePluginPod() error {
 		log.Log.Info("restartDevicePluginPod(): device plugin pod exited")
 		return nil
 	}
-	podToDelete = pods.Items[0].Name

-	log.Log.V(2).Info("restartDevicePluginPod(): Found device plugin pod, deleting it", "pod-name", podToDelete)
-	err = dn.kubeClient.CoreV1().Pods(vars.Namespace).Delete(context.Background(), podToDelete, metav1.DeleteOptions{})
-	if errors.IsNotFound(err) {
-		log.Log.Info("restartDevicePluginPod(): pod to delete not found")
-		return nil
-	}
-	if err != nil {
-		log.Log.Error(err, "restartDevicePluginPod(): Failed to delete device plugin pod, retrying")
-		return err
-	}
-
-	if err := wait.PollImmediateUntil(3*time.Second, func() (bool, error) {
-		_, err := dn.kubeClient.CoreV1().Pods(vars.Namespace).Get(context.Background(), podToDelete, metav1.GetOptions{})
+	for _, pod := range pods.Items {
+		podToDelete := pod.Name
+		log.Log.V(2).Info("restartDevicePluginPod(): Found device plugin pod, deleting it", "pod-name", podToDelete)
+		err = dn.kubeClient.CoreV1().Pods(vars.Namespace).Delete(context.Background(), podToDelete, metav1.DeleteOptions{})
 		if errors.IsNotFound(err) {
-			log.Log.Info("restartDevicePluginPod(): device plugin pod exited")
-			return true, nil
+			log.Log.Info("restartDevicePluginPod(): pod to delete not found")
+			return nil
 		}
-
 		if err != nil {
-			log.Log.Error(err, "restartDevicePluginPod(): Failed to check for device plugin exit, retrying")
-		} else {
-			log.Log.Info("restartDevicePluginPod(): waiting for device plugin pod to exit", "pod-name", podToDelete)
+			log.Log.Error(err, "restartDevicePluginPod(): Failed to delete device plugin pod, retrying")
+			return err
+		}
+
+		if err := wait.PollImmediateUntil(3*time.Second, func() (bool, error) {
+			_, err := dn.kubeClient.CoreV1().Pods(vars.Namespace).Get(context.Background(), podToDelete, metav1.GetOptions{})
+			if errors.IsNotFound(err) {
+				log.Log.Info("restartDevicePluginPod(): device plugin pod exited")
+				return true, nil
+			}
+
+			if err != nil {
+				log.Log.Error(err, "restartDevicePluginPod(): Failed to check for device plugin exit, retrying")
+			} else {
+				log.Log.Info("restartDevicePluginPod(): waiting for device plugin pod to exit", "pod-name", podToDelete)
+			}
+			return false, nil
+		}, dn.stopCh); err != nil {
+			log.Log.Error(err, "restartDevicePluginPod(): failed to wait for checking pod deletion")
+			return err
 		}
-		return false, nil
-	}, dn.stopCh); err != nil {
-		log.Log.Error(err, "restartDevicePluginPod(): failed to wait for checking pod deletion")
-		return err
 	}

 	return nil
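
A side note on the first two hunks: moving the mutex initialization from Run() into New() guarantees dn.mu is non-nil from the moment the daemon is constructed, so any caller that takes the lock before Run() starts cannot panic. A trimmed-down sketch of the idiom (type and field names simplified from the real Daemon struct):

package sketch

import "sync"

// Daemon is a stand-in for the real daemon type.
type Daemon struct {
	mu *sync.Mutex
}

// New initializes the mutex together with the rest of the struct, so no
// code path can observe a nil mu between construction and Run().
func New() *Daemon {
	return &Daemon{
		mu: &sync.Mutex{},
	}
}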

pkg/daemon/daemon_test.go

Lines changed: 46 additions & 14 deletions
@@ -32,6 +32,19 @@ import (
 	"github.com/k8snetworkplumbingwg/sriov-network-operator/test/util/fakefilesystem"
 )

+var SriovDevicePluginPod corev1.Pod = corev1.Pod{
+	ObjectMeta: metav1.ObjectMeta{
+		Name:      "sriov-device-plugin-xxxx",
+		Namespace: vars.Namespace,
+		Labels: map[string]string{
+			"app": "sriov-device-plugin",
+		},
+	},
+	Spec: corev1.PodSpec{
+		NodeName: "test-node",
+	},
+}
+
 func TestConfigDaemon(t *testing.T) {
 	RegisterFailHandler(Fail)
 	RunSpecs(t, "Config Daemon Suite")

@@ -107,19 +120,6 @@ var _ = Describe("Config Daemon", func() {
 			},
 		}

-		SriovDevicePluginPod := corev1.Pod{
-			ObjectMeta: metav1.ObjectMeta{
-				Name:      "sriov-device-plugin-xxxx",
-				Namespace: vars.Namespace,
-				Labels: map[string]string{
-					"app": "sriov-device-plugin",
-				},
-			},
-			Spec: corev1.PodSpec{
-				NodeName: "test-node",
-			},
-		}
-
 		err = sriovnetworkv1.AddToScheme(scheme.Scheme)
 		Expect(err).ToNot(HaveOccurred())
 		kClient := kclient.NewClientBuilder().WithScheme(scheme.Scheme).WithRuntimeObjects(&corev1.Node{

@@ -130,7 +130,7 @@ var _ = Describe("Config Daemon", func() {
 			Namespace: vars.Namespace,
 		}}).Build()

-		kubeClient := fakek8s.NewSimpleClientset(&FakeSupportedNicIDs, &SriovDevicePluginPod)
+		kubeClient := fakek8s.NewSimpleClientset(&FakeSupportedNicIDs)
 		snclient := snclientset.NewSimpleClientset()
 		err = sriovnetworkv1.InitNicIDMapFromConfigMap(kubeClient, vars.Namespace)
 		Expect(err).ToNot(HaveOccurred())

@@ -175,6 +175,10 @@ var _ = Describe("Config Daemon", func() {
 			err := sut.Run(stopCh, exitCh)
 			Expect(err).ToNot(HaveOccurred())
 		}()
+
+		_, err = sut.kubeClient.CoreV1().Pods(vars.Namespace).Create(context.Background(), &SriovDevicePluginPod, metav1.CreateOptions{})
+		Expect(err).ToNot(HaveOccurred())
+
 	})

 	AfterEach(func() {

@@ -286,6 +290,34 @@ var _ = Describe("Config Daemon", func() {

 			Expect(sut.desiredNodeState.GetGeneration()).To(BeNumerically("==", 777))
 		})
+
+		It("restart all the sriov-device-plugin pods present on the node", func() {
+			otherPod1 := SriovDevicePluginPod.DeepCopy()
+			otherPod1.Name = "sriov-device-plugin-xxxa"
+			_, err := sut.kubeClient.CoreV1().Pods(vars.Namespace).Create(context.Background(), otherPod1, metav1.CreateOptions{})
+			Expect(err).ToNot(HaveOccurred())
+
+			otherPod2 := SriovDevicePluginPod.DeepCopy()
+			otherPod2.Name = "sriov-device-plugin-xxxz"
+			_, err = sut.kubeClient.CoreV1().Pods(vars.Namespace).Create(context.Background(), otherPod2, metav1.CreateOptions{})
+			Expect(err).ToNot(HaveOccurred())
+
+			err = sut.restartDevicePluginPod()
+			Expect(err).ToNot(HaveOccurred())
+
+			Eventually(func() (int, error) {
+				podList, err := sut.kubeClient.CoreV1().Pods(vars.Namespace).List(context.Background(), metav1.ListOptions{
+					LabelSelector: "app=sriov-device-plugin",
+					FieldSelector: "spec.nodeName=test-node",
+				})
+
+				if err != nil {
+					return 0, err
+				}
+
+				return len(podList.Items), nil
+			}, "1s").Should(BeZero())
+		})
 	})
 })
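
Assuming the repository's standard module layout, the updated suite (including the new restart case) can be exercised locally with the usual Go tooling:

go test ./pkg/daemon/ -run TestConfigDaemon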
