Skip to content

Commit a85ab70

Browse files
authored
Merge pull request #783 from zeeke/us/multiple-device-plugins
config-daemon: Restart all instances of device-plugin
2 parents 92cf81c + f286a04 commit a85ab70

File tree

2 files changed

+74
-40
lines changed

2 files changed

+74
-40
lines changed

pkg/daemon/daemon.go

Lines changed: 27 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,7 @@ func New(
120120
eventRecorder: er,
121121
featureGate: featureGates,
122122
disabledPlugins: disabledPlugins,
123+
mu: &sync.Mutex{},
123124
}
124125
}
125126

@@ -159,7 +160,6 @@ func (dn *Daemon) Run(stopCh <-chan struct{}, exitCh <-chan error) error {
159160

160161
var timeout int64 = 5
161162
var metadataKey = "metadata.name"
162-
dn.mu = &sync.Mutex{}
163163
informerFactory := sninformer.NewFilteredSharedInformerFactory(dn.sriovClient,
164164
time.Second*15,
165165
vars.Namespace,
@@ -683,7 +683,6 @@ func (dn *Daemon) restartDevicePluginPod() error {
683683
defer dn.mu.Unlock()
684684
log.Log.V(2).Info("restartDevicePluginPod(): try to restart device plugin pod")
685685

686-
var podToDelete string
687686
pods, err := dn.kubeClient.CoreV1().Pods(vars.Namespace).List(context.Background(), metav1.ListOptions{
688687
LabelSelector: "app=sriov-device-plugin",
689688
FieldSelector: "spec.nodeName=" + vars.NodeName,
@@ -702,35 +701,37 @@ func (dn *Daemon) restartDevicePluginPod() error {
702701
log.Log.Info("restartDevicePluginPod(): device plugin pod exited")
703702
return nil
704703
}
705-
podToDelete = pods.Items[0].Name
706704

707-
log.Log.V(2).Info("restartDevicePluginPod(): Found device plugin pod, deleting it", "pod-name", podToDelete)
708-
err = dn.kubeClient.CoreV1().Pods(vars.Namespace).Delete(context.Background(), podToDelete, metav1.DeleteOptions{})
709-
if errors.IsNotFound(err) {
710-
log.Log.Info("restartDevicePluginPod(): pod to delete not found")
711-
return nil
712-
}
713-
if err != nil {
714-
log.Log.Error(err, "restartDevicePluginPod(): Failed to delete device plugin pod, retrying")
715-
return err
716-
}
717-
718-
if err := wait.PollImmediateUntil(3*time.Second, func() (bool, error) {
719-
_, err := dn.kubeClient.CoreV1().Pods(vars.Namespace).Get(context.Background(), podToDelete, metav1.GetOptions{})
705+
for _, pod := range pods.Items {
706+
podToDelete := pod.Name
707+
log.Log.V(2).Info("restartDevicePluginPod(): Found device plugin pod, deleting it", "pod-name", podToDelete)
708+
err = dn.kubeClient.CoreV1().Pods(vars.Namespace).Delete(context.Background(), podToDelete, metav1.DeleteOptions{})
720709
if errors.IsNotFound(err) {
721-
log.Log.Info("restartDevicePluginPod(): device plugin pod exited")
722-
return true, nil
710+
log.Log.Info("restartDevicePluginPod(): pod to delete not found")
711+
continue
723712
}
724-
725713
if err != nil {
726-
log.Log.Error(err, "restartDevicePluginPod(): Failed to check for device plugin exit, retrying")
727-
} else {
728-
log.Log.Info("restartDevicePluginPod(): waiting for device plugin pod to exit", "pod-name", podToDelete)
714+
log.Log.Error(err, "restartDevicePluginPod(): Failed to delete device plugin pod, retrying")
715+
return err
716+
}
717+
718+
if err := wait.PollImmediateUntil(3*time.Second, func() (bool, error) {
719+
_, err := dn.kubeClient.CoreV1().Pods(vars.Namespace).Get(context.Background(), podToDelete, metav1.GetOptions{})
720+
if errors.IsNotFound(err) {
721+
log.Log.Info("restartDevicePluginPod(): device plugin pod exited")
722+
return true, nil
723+
}
724+
725+
if err != nil {
726+
log.Log.Error(err, "restartDevicePluginPod(): Failed to check for device plugin exit, retrying")
727+
} else {
728+
log.Log.Info("restartDevicePluginPod(): waiting for device plugin pod to exit", "pod-name", podToDelete)
729+
}
730+
return false, nil
731+
}, dn.stopCh); err != nil {
732+
log.Log.Error(err, "restartDevicePluginPod(): failed to wait for checking pod deletion")
733+
return err
729734
}
730-
return false, nil
731-
}, dn.stopCh); err != nil {
732-
log.Log.Error(err, "restartDevicePluginPod(): failed to wait for checking pod deletion")
733-
return err
734735
}
735736

736737
return nil

pkg/daemon/daemon_test.go

Lines changed: 47 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@ import (
3232
"github.com/k8snetworkplumbingwg/sriov-network-operator/test/util/fakefilesystem"
3333
)
3434

35+
var SriovDevicePluginPod corev1.Pod
36+
3537
func TestConfigDaemon(t *testing.T) {
3638
RegisterFailHandler(Fail)
3739
RunSpecs(t, "Config Daemon Suite")
@@ -107,19 +109,6 @@ var _ = Describe("Config Daemon", func() {
107109
},
108110
}
109111

110-
SriovDevicePluginPod := corev1.Pod{
111-
ObjectMeta: metav1.ObjectMeta{
112-
Name: "sriov-device-plugin-xxxx",
113-
Namespace: vars.Namespace,
114-
Labels: map[string]string{
115-
"app": "sriov-device-plugin",
116-
},
117-
},
118-
Spec: corev1.PodSpec{
119-
NodeName: "test-node",
120-
},
121-
}
122-
123112
err = sriovnetworkv1.AddToScheme(scheme.Scheme)
124113
Expect(err).ToNot(HaveOccurred())
125114
kClient := kclient.NewClientBuilder().WithScheme(scheme.Scheme).WithRuntimeObjects(&corev1.Node{
@@ -130,7 +119,7 @@ var _ = Describe("Config Daemon", func() {
130119
Namespace: vars.Namespace,
131120
}}).Build()
132121

133-
kubeClient := fakek8s.NewSimpleClientset(&FakeSupportedNicIDs, &SriovDevicePluginPod)
122+
kubeClient := fakek8s.NewSimpleClientset(&FakeSupportedNicIDs)
134123
snclient := snclientset.NewSimpleClientset()
135124
err = sriovnetworkv1.InitNicIDMapFromConfigMap(kubeClient, vars.Namespace)
136125
Expect(err).ToNot(HaveOccurred())
@@ -175,6 +164,22 @@ var _ = Describe("Config Daemon", func() {
175164
err := sut.Run(stopCh, exitCh)
176165
Expect(err).ToNot(HaveOccurred())
177166
}()
167+
168+
SriovDevicePluginPod = corev1.Pod{
169+
ObjectMeta: metav1.ObjectMeta{
170+
Name: "sriov-device-plugin-xxxx",
171+
Namespace: vars.Namespace,
172+
Labels: map[string]string{
173+
"app": "sriov-device-plugin",
174+
},
175+
},
176+
Spec: corev1.PodSpec{
177+
NodeName: "test-node",
178+
},
179+
}
180+
_, err = sut.kubeClient.CoreV1().Pods(vars.Namespace).Create(context.Background(), &SriovDevicePluginPod, metav1.CreateOptions{})
181+
Expect(err).ToNot(HaveOccurred())
182+
178183
})
179184

180185
AfterEach(func() {
@@ -286,6 +291,34 @@ var _ = Describe("Config Daemon", func() {
286291

287292
Expect(sut.desiredNodeState.GetGeneration()).To(BeNumerically("==", 777))
288293
})
294+
295+
It("restart all the sriov-device-plugin pods present on the node", func() {
296+
otherPod1 := SriovDevicePluginPod.DeepCopy()
297+
otherPod1.Name = "sriov-device-plugin-xxxa"
298+
_, err := sut.kubeClient.CoreV1().Pods(vars.Namespace).Create(context.Background(), otherPod1, metav1.CreateOptions{})
299+
Expect(err).ToNot(HaveOccurred())
300+
301+
otherPod2 := SriovDevicePluginPod.DeepCopy()
302+
otherPod2.Name = "sriov-device-plugin-xxxz"
303+
_, err = sut.kubeClient.CoreV1().Pods(vars.Namespace).Create(context.Background(), otherPod2, metav1.CreateOptions{})
304+
Expect(err).ToNot(HaveOccurred())
305+
306+
err = sut.restartDevicePluginPod()
307+
Expect(err).ToNot(HaveOccurred())
308+
309+
Eventually(func() (int, error) {
310+
podList, err := sut.kubeClient.CoreV1().Pods(vars.Namespace).List(context.Background(), metav1.ListOptions{
311+
LabelSelector: "app=sriov-device-plugin",
312+
FieldSelector: "spec.nodeName=test-node",
313+
})
314+
315+
if err != nil {
316+
return 0, err
317+
}
318+
319+
return len(podList.Items), nil
320+
}, "1s").Should(BeZero())
321+
})
289322
})
290323
})
291324

0 commit comments

Comments
 (0)