Skip to content

Commit f286a04

Browse files
committed
config-daemon: Restart all instances of device-plugin
When the operator changes the device-plugin Spec (e.g. .Spec.NodeSelector), it may happen that there are two device plugin pods for a given node, one that is terminating, the other that is initializing. If the config-daemon executes `restartDevicePluginPod()` at the same time, it may kill the terminating pod, while the initializing one will run with the old dp configuration. This may cause one or more resources to not being advertised, until a manual device plugin restart occurs. Make the config-daemon restart all the device-plugin instances it founds for its own node. Signed-off-by: Andrea Panattoni <[email protected]>
1 parent 92cf81c commit f286a04

File tree

2 files changed

+74
-40
lines changed

2 files changed

+74
-40
lines changed

pkg/daemon/daemon.go

Lines changed: 27 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,7 @@ func New(
120120
eventRecorder: er,
121121
featureGate: featureGates,
122122
disabledPlugins: disabledPlugins,
123+
mu: &sync.Mutex{},
123124
}
124125
}
125126

@@ -159,7 +160,6 @@ func (dn *Daemon) Run(stopCh <-chan struct{}, exitCh <-chan error) error {
159160

160161
var timeout int64 = 5
161162
var metadataKey = "metadata.name"
162-
dn.mu = &sync.Mutex{}
163163
informerFactory := sninformer.NewFilteredSharedInformerFactory(dn.sriovClient,
164164
time.Second*15,
165165
vars.Namespace,
@@ -683,7 +683,6 @@ func (dn *Daemon) restartDevicePluginPod() error {
683683
defer dn.mu.Unlock()
684684
log.Log.V(2).Info("restartDevicePluginPod(): try to restart device plugin pod")
685685

686-
var podToDelete string
687686
pods, err := dn.kubeClient.CoreV1().Pods(vars.Namespace).List(context.Background(), metav1.ListOptions{
688687
LabelSelector: "app=sriov-device-plugin",
689688
FieldSelector: "spec.nodeName=" + vars.NodeName,
@@ -702,35 +701,37 @@ func (dn *Daemon) restartDevicePluginPod() error {
702701
log.Log.Info("restartDevicePluginPod(): device plugin pod exited")
703702
return nil
704703
}
705-
podToDelete = pods.Items[0].Name
706704

707-
log.Log.V(2).Info("restartDevicePluginPod(): Found device plugin pod, deleting it", "pod-name", podToDelete)
708-
err = dn.kubeClient.CoreV1().Pods(vars.Namespace).Delete(context.Background(), podToDelete, metav1.DeleteOptions{})
709-
if errors.IsNotFound(err) {
710-
log.Log.Info("restartDevicePluginPod(): pod to delete not found")
711-
return nil
712-
}
713-
if err != nil {
714-
log.Log.Error(err, "restartDevicePluginPod(): Failed to delete device plugin pod, retrying")
715-
return err
716-
}
717-
718-
if err := wait.PollImmediateUntil(3*time.Second, func() (bool, error) {
719-
_, err := dn.kubeClient.CoreV1().Pods(vars.Namespace).Get(context.Background(), podToDelete, metav1.GetOptions{})
705+
for _, pod := range pods.Items {
706+
podToDelete := pod.Name
707+
log.Log.V(2).Info("restartDevicePluginPod(): Found device plugin pod, deleting it", "pod-name", podToDelete)
708+
err = dn.kubeClient.CoreV1().Pods(vars.Namespace).Delete(context.Background(), podToDelete, metav1.DeleteOptions{})
720709
if errors.IsNotFound(err) {
721-
log.Log.Info("restartDevicePluginPod(): device plugin pod exited")
722-
return true, nil
710+
log.Log.Info("restartDevicePluginPod(): pod to delete not found")
711+
continue
723712
}
724-
725713
if err != nil {
726-
log.Log.Error(err, "restartDevicePluginPod(): Failed to check for device plugin exit, retrying")
727-
} else {
728-
log.Log.Info("restartDevicePluginPod(): waiting for device plugin pod to exit", "pod-name", podToDelete)
714+
log.Log.Error(err, "restartDevicePluginPod(): Failed to delete device plugin pod, retrying")
715+
return err
716+
}
717+
718+
if err := wait.PollImmediateUntil(3*time.Second, func() (bool, error) {
719+
_, err := dn.kubeClient.CoreV1().Pods(vars.Namespace).Get(context.Background(), podToDelete, metav1.GetOptions{})
720+
if errors.IsNotFound(err) {
721+
log.Log.Info("restartDevicePluginPod(): device plugin pod exited")
722+
return true, nil
723+
}
724+
725+
if err != nil {
726+
log.Log.Error(err, "restartDevicePluginPod(): Failed to check for device plugin exit, retrying")
727+
} else {
728+
log.Log.Info("restartDevicePluginPod(): waiting for device plugin pod to exit", "pod-name", podToDelete)
729+
}
730+
return false, nil
731+
}, dn.stopCh); err != nil {
732+
log.Log.Error(err, "restartDevicePluginPod(): failed to wait for checking pod deletion")
733+
return err
729734
}
730-
return false, nil
731-
}, dn.stopCh); err != nil {
732-
log.Log.Error(err, "restartDevicePluginPod(): failed to wait for checking pod deletion")
733-
return err
734735
}
735736

736737
return nil

pkg/daemon/daemon_test.go

Lines changed: 47 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@ import (
3232
"github.com/k8snetworkplumbingwg/sriov-network-operator/test/util/fakefilesystem"
3333
)
3434

35+
var SriovDevicePluginPod corev1.Pod
36+
3537
func TestConfigDaemon(t *testing.T) {
3638
RegisterFailHandler(Fail)
3739
RunSpecs(t, "Config Daemon Suite")
@@ -107,19 +109,6 @@ var _ = Describe("Config Daemon", func() {
107109
},
108110
}
109111

110-
SriovDevicePluginPod := corev1.Pod{
111-
ObjectMeta: metav1.ObjectMeta{
112-
Name: "sriov-device-plugin-xxxx",
113-
Namespace: vars.Namespace,
114-
Labels: map[string]string{
115-
"app": "sriov-device-plugin",
116-
},
117-
},
118-
Spec: corev1.PodSpec{
119-
NodeName: "test-node",
120-
},
121-
}
122-
123112
err = sriovnetworkv1.AddToScheme(scheme.Scheme)
124113
Expect(err).ToNot(HaveOccurred())
125114
kClient := kclient.NewClientBuilder().WithScheme(scheme.Scheme).WithRuntimeObjects(&corev1.Node{
@@ -130,7 +119,7 @@ var _ = Describe("Config Daemon", func() {
130119
Namespace: vars.Namespace,
131120
}}).Build()
132121

133-
kubeClient := fakek8s.NewSimpleClientset(&FakeSupportedNicIDs, &SriovDevicePluginPod)
122+
kubeClient := fakek8s.NewSimpleClientset(&FakeSupportedNicIDs)
134123
snclient := snclientset.NewSimpleClientset()
135124
err = sriovnetworkv1.InitNicIDMapFromConfigMap(kubeClient, vars.Namespace)
136125
Expect(err).ToNot(HaveOccurred())
@@ -175,6 +164,22 @@ var _ = Describe("Config Daemon", func() {
175164
err := sut.Run(stopCh, exitCh)
176165
Expect(err).ToNot(HaveOccurred())
177166
}()
167+
168+
SriovDevicePluginPod = corev1.Pod{
169+
ObjectMeta: metav1.ObjectMeta{
170+
Name: "sriov-device-plugin-xxxx",
171+
Namespace: vars.Namespace,
172+
Labels: map[string]string{
173+
"app": "sriov-device-plugin",
174+
},
175+
},
176+
Spec: corev1.PodSpec{
177+
NodeName: "test-node",
178+
},
179+
}
180+
_, err = sut.kubeClient.CoreV1().Pods(vars.Namespace).Create(context.Background(), &SriovDevicePluginPod, metav1.CreateOptions{})
181+
Expect(err).ToNot(HaveOccurred())
182+
178183
})
179184

180185
AfterEach(func() {
@@ -286,6 +291,34 @@ var _ = Describe("Config Daemon", func() {
286291

287292
Expect(sut.desiredNodeState.GetGeneration()).To(BeNumerically("==", 777))
288293
})
294+
295+
It("restart all the sriov-device-plugin pods present on the node", func() {
296+
otherPod1 := SriovDevicePluginPod.DeepCopy()
297+
otherPod1.Name = "sriov-device-plugin-xxxa"
298+
_, err := sut.kubeClient.CoreV1().Pods(vars.Namespace).Create(context.Background(), otherPod1, metav1.CreateOptions{})
299+
Expect(err).ToNot(HaveOccurred())
300+
301+
otherPod2 := SriovDevicePluginPod.DeepCopy()
302+
otherPod2.Name = "sriov-device-plugin-xxxz"
303+
_, err = sut.kubeClient.CoreV1().Pods(vars.Namespace).Create(context.Background(), otherPod2, metav1.CreateOptions{})
304+
Expect(err).ToNot(HaveOccurred())
305+
306+
err = sut.restartDevicePluginPod()
307+
Expect(err).ToNot(HaveOccurred())
308+
309+
Eventually(func() (int, error) {
310+
podList, err := sut.kubeClient.CoreV1().Pods(vars.Namespace).List(context.Background(), metav1.ListOptions{
311+
LabelSelector: "app=sriov-device-plugin",
312+
FieldSelector: "spec.nodeName=test-node",
313+
})
314+
315+
if err != nil {
316+
return 0, err
317+
}
318+
319+
return len(podList.Items), nil
320+
}, "1s").Should(BeZero())
321+
})
289322
})
290323
})
291324

0 commit comments

Comments
 (0)