Skip to content

Commit 3d20dff

Browse files
committed
redesign device plugin
Always deploy the SR-IOV network device plugin and use a node label to enable or disable it on the nodes.

Signed-off-by: Sebastian Sch <[email protected]>
1 parent 823b4d4 commit 3d20dff

File tree

14 files changed

+340
-122
lines changed

14 files changed

+340
-122
lines changed

bindata/manifests/plugins/sriov-device-plugin.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ spec:
2727
hostNetwork: true
2828
nodeSelector:
2929
{{- range $key, $value := .NodeSelectorField }}
30-
{{ $key }}: {{ $value }}
30+
{{ $key }}: "{{ $value }}"
3131
{{- end }}
3232
tolerations:
3333
- operator: Exists

controllers/drain_controller_helper.go

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -109,17 +109,6 @@ func (dr *DrainReconcile) handleNodeDrainOrReboot(ctx context.Context,
109109
return reconcile.Result{RequeueAfter: 5 * time.Second}, nil
110110
}
111111

112-
reqLogger.Info("remove Device plugin from node")
113-
err = utils.LabelNode(ctx, node.Name, constants.SriovDevicePluginLabel, constants.SriovDevicePluginLabelDisabled, dr.Client)
114-
if err != nil {
115-
reqLogger.Error(err, "failed to label node for device plugin label",
116-
"labelKey",
117-
constants.SriovDevicePluginLabel,
118-
"labelValue",
119-
constants.SriovDevicePluginLabelDisabled)
120-
return reconcile.Result{}, err
121-
}
122-
123112
// if we manage to drain we label the node state with drain completed and finish
124113
err = utils.AnnotateObject(ctx, nodeNetworkState, constants.NodeStateDrainAnnotationCurrent, constants.DrainComplete, dr.Client)
125114
if err != nil {

controllers/helper.go

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -152,11 +152,17 @@ func GetDefaultNodeSelector() map[string]string {
152152

153153
// GetDefaultNodeSelectorForDevicePlugin return a nodeSelector with worker linux os
154154
// and the enabled sriov device plugin
155-
func GetDefaultNodeSelectorForDevicePlugin() map[string]string {
156-
return map[string]string{
157-
"kubernetes.io/os": "linux",
158-
constants.SriovDevicePluginLabel: constants.SriovDevicePluginLabelEnabled,
155+
func GetNodeSelectorForDevicePlugin(dc *sriovnetworkv1.SriovOperatorConfig) map[string]string {
156+
if len(dc.Spec.ConfigDaemonNodeSelector) == 0 {
157+
return map[string]string{
158+
"kubernetes.io/os": "linux",
159+
constants.SriovDevicePluginLabel: constants.SriovDevicePluginLabelEnabled,
160+
}
159161
}
162+
163+
tmp := dc.Spec.DeepCopy()
164+
tmp.ConfigDaemonNodeSelector[constants.SriovDevicePluginLabel] = constants.SriovDevicePluginLabelEnabled
165+
return tmp.ConfigDaemonNodeSelector
160166
}
161167

162168
func syncPluginDaemonObjs(ctx context.Context,
@@ -173,7 +179,7 @@ func syncPluginDaemonObjs(ctx context.Context,
173179
data.Data["ReleaseVersion"] = os.Getenv("RELEASEVERSION")
174180
data.Data["ResourcePrefix"] = vars.ResourcePrefix
175181
data.Data["ImagePullSecrets"] = GetImagePullSecrets()
176-
data.Data["NodeSelectorField"] = GetDefaultNodeSelectorForDevicePlugin()
182+
data.Data["NodeSelectorField"] = GetNodeSelectorForDevicePlugin(dc)
177183
data.Data["UseCDI"] = dc.Spec.UseCDI
178184
objs, err := renderDsForCR(constants.PluginPath, &data)
179185
if err != nil {

controllers/sriovnetworknodepolicy_controller.go

Lines changed: 41 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ import (
4646
sriovnetworkv1 "github.com/k8snetworkplumbingwg/sriov-network-operator/api/v1"
4747
constants "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/consts"
4848
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/featuregate"
49+
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/utils"
4950
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/vars"
5051
)
5152

@@ -133,10 +134,6 @@ func (r *SriovNetworkNodePolicyReconciler) Reconcile(ctx context.Context, req ct
133134
if err = r.syncDevicePluginConfigMap(ctx, defaultOpConf, policyList, nodeList); err != nil {
134135
return reconcile.Result{}, err
135136
}
136-
// Render and sync Daemon objects
137-
if err = syncPluginDaemonObjs(ctx, r.Client, r.Scheme, defaultOpConf, policyList); err != nil {
138-
return reconcile.Result{}, err
139-
}
140137

141138
// All was successful. Request that this be re-triggered after ResyncPeriod,
142139
// so we can reconcile state again.
@@ -182,6 +179,12 @@ func (r *SriovNetworkNodePolicyReconciler) SetupWithManager(mgr ctrl.Manager) er
182179
Info("Enqueuing sync for create event", "resource", e.Object.GetName())
183180
qHandler(q)
184181
},
182+
UpdateFunc: func(ctx context.Context, e event.UpdateEvent, q workqueue.RateLimitingInterface) {
183+
reflect.DeepEqual(e.ObjectOld.GetLabels(), e.ObjectNew.GetLabels())
184+
log.Log.WithName("SriovNetworkNodePolicy").
185+
Info("Enqueuing sync for create event", "resource", e.ObjectNew.GetName())
186+
qHandler(q)
187+
},
185188
DeleteFunc: func(ctx context.Context, e event.DeleteEvent, q workqueue.RateLimitingInterface) {
186189
log.Log.WithName("SriovNetworkNodePolicy").
187190
Info("Enqueuing sync for delete event", "resource", e.Object.GetName())
@@ -219,6 +222,30 @@ func (r *SriovNetworkNodePolicyReconciler) syncDevicePluginConfigMap(ctx context
219222
return err
220223
}
221224
configData[node.Name] = string(config)
225+
226+
if data.ResourceList == nil || len(data.ResourceList) == 0 {
227+
// if we don't have policies we should add the disabled label for the device plugin
228+
err = utils.LabelNode(ctx, node.Name, constants.SriovDevicePluginLabel, constants.SriovDevicePluginLabelDisabled, r.Client)
229+
if err != nil {
230+
logger.Error(err, "failed to label node for device plugin label",
231+
"labelKey",
232+
constants.SriovDevicePluginLabel,
233+
"labelValue",
234+
constants.SriovDevicePluginLabelDisabled)
235+
return err
236+
}
237+
} else {
238+
// if we have policies we should add the enabled label for the device plugin
239+
err = utils.LabelNode(ctx, node.Name, constants.SriovDevicePluginLabel, constants.SriovDevicePluginLabelEnabled, r.Client)
240+
if err != nil {
241+
logger.Error(err, "failed to label node for device plugin label",
242+
"labelKey",
243+
constants.SriovDevicePluginLabel,
244+
"labelValue",
245+
constants.SriovDevicePluginLabelEnabled)
246+
return err
247+
}
248+
}
222249
}
223250

224251
cm := &corev1.ConfigMap{
@@ -296,8 +323,15 @@ func (r *SriovNetworkNodePolicyReconciler) syncAllSriovNetworkNodeStates(ctx con
296323
}
297324
}
298325
if !found {
326+
// remove device plugin labels
327+
logger.Info("removing device plugin label from node as SriovNetworkNodeState doesn't exist", "nodeStateName", ns.Name)
328+
err = utils.RemoveLabelFromNode(ctx, ns.Name, constants.SriovDevicePluginLabel, r.Client)
329+
if err != nil {
330+
logger.Error(err, "Fail to remove device plugin label from node", "node", ns.Name)
331+
return err
332+
}
299333
logger.Info("Deleting SriovNetworkNodeState as node with that name doesn't exist", "nodeStateName", ns.Name)
300-
err := r.Delete(ctx, &ns, &client.DeleteOptions{})
334+
err = r.Delete(ctx, &ns, &client.DeleteOptions{})
301335
if err != nil {
302336
logger.Error(err, "Fail to Delete", "SriovNetworkNodeState CR:", ns.GetName())
303337
return err
@@ -415,13 +449,13 @@ func (r *SriovNetworkNodePolicyReconciler) renderDevicePluginConfigData(ctx cont
415449
found, i := resourceNameInList(p.Spec.ResourceName, &rcl)
416450

417451
if found {
418-
err := updateDevicePluginResource(ctx, &rcl.ResourceList[i], &p, nodeState)
452+
err := updateDevicePluginResource(&rcl.ResourceList[i], &p, nodeState)
419453
if err != nil {
420454
return rcl, err
421455
}
422456
logger.V(1).Info("Update resource", "Resource", rcl.ResourceList[i])
423457
} else {
424-
rc, err := createDevicePluginResource(ctx, &p, nodeState)
458+
rc, err := createDevicePluginResource(&p, nodeState)
425459
if err != nil {
426460
return rcl, err
427461
}
@@ -442,7 +476,6 @@ func resourceNameInList(name string, rcl *dptypes.ResourceConfList) (bool, int)
442476
}
443477

444478
func createDevicePluginResource(
445-
ctx context.Context,
446479
p *sriovnetworkv1.SriovNetworkNodePolicy,
447480
nodeState *sriovnetworkv1.SriovNetworkNodeState) (*dptypes.ResourceConfig, error) {
448481
netDeviceSelectors := dptypes.NetDeviceSelectors{}
@@ -516,7 +549,6 @@ func createDevicePluginResource(
516549
}
517550

518551
func updateDevicePluginResource(
519-
ctx context.Context,
520552
rc *dptypes.ResourceConfig,
521553
p *sriovnetworkv1.SriovNetworkNodePolicy,
522554
nodeState *sriovnetworkv1.SriovNetworkNodeState) error {

controllers/sriovnetworknodepolicy_controller_test.go

Lines changed: 136 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,20 @@ package controllers
33
import (
44
"context"
55
"encoding/json"
6+
"sync"
67
"testing"
8+
"time"
9+
10+
. "github.com/onsi/ginkgo/v2"
11+
. "github.com/onsi/gomega"
712

813
"github.com/google/go-cmp/cmp"
914
corev1 "k8s.io/api/core/v1"
15+
"k8s.io/apimachinery/pkg/api/errors"
1016
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1117
"k8s.io/apimachinery/pkg/runtime"
1218
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
13-
19+
k8sclient "sigs.k8s.io/controller-runtime/pkg/client"
1420
"sigs.k8s.io/controller-runtime/pkg/client/fake"
1521

1622
dptypes "github.com/k8snetworkplumbingwg/sriov-network-device-plugin/pkg/types"
@@ -126,3 +132,132 @@ func TestRenderDevicePluginConfigData(t *testing.T) {
126132
})
127133
}
128134
}
135+
136+
var _ = Describe("SriovnetworkNodePolicy controller", Ordered, func() {
137+
var cancel context.CancelFunc
138+
var ctx context.Context
139+
140+
BeforeAll(func() {
141+
By("Create SriovOperatorConfig controller k8s objs")
142+
config := makeDefaultSriovOpConfig()
143+
Expect(k8sClient.Create(context.Background(), config)).Should(Succeed())
144+
DeferCleanup(func() {
145+
err := k8sClient.Delete(context.Background(), config)
146+
Expect(err).ToNot(HaveOccurred())
147+
})
148+
149+
// setup controller manager
150+
By("Setup controller manager")
151+
k8sManager, err := setupK8sManagerForTest()
152+
Expect(err).ToNot(HaveOccurred())
153+
154+
err = (&SriovNetworkNodePolicyReconciler{
155+
Client: k8sManager.GetClient(),
156+
Scheme: k8sManager.GetScheme(),
157+
FeatureGate: featuregate.New(),
158+
}).SetupWithManager(k8sManager)
159+
Expect(err).ToNot(HaveOccurred())
160+
161+
ctx, cancel = context.WithCancel(context.Background())
162+
163+
wg := sync.WaitGroup{}
164+
wg.Add(1)
165+
go func() {
166+
defer wg.Done()
167+
defer GinkgoRecover()
168+
By("Start controller manager")
169+
err := k8sManager.Start(ctx)
170+
Expect(err).ToNot(HaveOccurred())
171+
}()
172+
173+
DeferCleanup(func() {
174+
By("Shut down manager")
175+
cancel()
176+
wg.Wait()
177+
})
178+
})
179+
AfterEach(func() {
180+
err := k8sClient.DeleteAllOf(context.Background(), &corev1.Node{})
181+
Expect(err).ToNot(HaveOccurred())
182+
183+
err = k8sClient.DeleteAllOf(context.Background(), &sriovnetworkv1.SriovNetworkNodePolicy{}, k8sclient.InNamespace(vars.Namespace))
184+
Expect(err).ToNot(HaveOccurred())
185+
186+
err = k8sClient.DeleteAllOf(context.Background(), &sriovnetworkv1.SriovNetworkNodeState{}, k8sclient.InNamespace(vars.Namespace))
187+
Expect(err).ToNot(HaveOccurred())
188+
})
189+
Context("device plugin labels", func() {
190+
It("Should add the right labels to the nodes", func() {
191+
node := &corev1.Node{ObjectMeta: metav1.ObjectMeta{
192+
Name: "node0",
193+
Labels: map[string]string{"kubernetes.io/os": "linux",
194+
"node-role.kubernetes.io/worker": ""},
195+
}}
196+
Expect(k8sClient.Create(ctx, node)).To(Succeed())
197+
198+
nodeState := &sriovnetworkv1.SriovNetworkNodeState{}
199+
Eventually(func(g Gomega) {
200+
err := k8sClient.Get(context.TODO(), k8sclient.ObjectKey{Name: "node0", Namespace: testNamespace}, nodeState)
201+
g.Expect(err).ToNot(HaveOccurred())
202+
}, time.Minute, time.Second).Should(Succeed())
203+
204+
Eventually(func(g Gomega) {
205+
err := k8sClient.Get(context.Background(), k8sclient.ObjectKey{Name: node.Name}, node)
206+
g.Expect(err).ToNot(HaveOccurred())
207+
value, exist := node.Labels[consts.SriovDevicePluginLabel]
208+
g.Expect(exist).To(BeTrue())
209+
g.Expect(value).To(Equal(consts.SriovDevicePluginLabelDisabled))
210+
}, time.Minute, time.Second).Should(Succeed())
211+
212+
nodeState.Status.Interfaces = sriovnetworkv1.InterfaceExts{
213+
sriovnetworkv1.InterfaceExt{
214+
Vendor: "8086",
215+
Driver: "i40e",
216+
Mtu: 1500,
217+
Name: "ens803f0",
218+
PciAddress: "0000:86:00.0",
219+
NumVfs: 0,
220+
TotalVfs: 64,
221+
},
222+
}
223+
err := k8sClient.Status().Update(context.Background(), nodeState)
224+
Expect(err).ToNot(HaveOccurred())
225+
226+
somePolicy := &sriovnetworkv1.SriovNetworkNodePolicy{}
227+
somePolicy.SetNamespace(testNamespace)
228+
somePolicy.SetName("some-policy")
229+
somePolicy.Spec = sriovnetworkv1.SriovNetworkNodePolicySpec{
230+
NumVfs: 5,
231+
NodeSelector: map[string]string{"node-role.kubernetes.io/worker": ""},
232+
NicSelector: sriovnetworkv1.SriovNetworkNicSelector{Vendor: "8086"},
233+
Priority: 20,
234+
}
235+
Expect(k8sClient.Create(context.Background(), somePolicy)).ToNot(HaveOccurred())
236+
237+
Eventually(func(g Gomega) {
238+
err := k8sClient.Get(context.Background(), k8sclient.ObjectKey{Name: node.Name}, node)
239+
g.Expect(err).ToNot(HaveOccurred())
240+
value, exist := node.Labels[consts.SriovDevicePluginLabel]
241+
g.Expect(exist).To(BeTrue())
242+
g.Expect(value).To(Equal(consts.SriovDevicePluginLabelEnabled))
243+
}, time.Minute, time.Second).Should(Succeed())
244+
245+
delete(node.Labels, "node-role.kubernetes.io/worker")
246+
err = k8sClient.Update(context.Background(), node)
247+
Expect(err).ToNot(HaveOccurred())
248+
249+
Eventually(func(g Gomega) {
250+
err := k8sClient.Get(context.Background(), k8sclient.ObjectKey{Name: node.Name}, node)
251+
g.Expect(err).ToNot(HaveOccurred())
252+
_, exist := node.Labels[consts.SriovDevicePluginLabel]
253+
g.Expect(exist).To(BeFalse())
254+
}, time.Minute, time.Second).Should(Succeed())
255+
256+
Eventually(func(g Gomega) {
257+
err := k8sClient.Get(context.Background(), k8sclient.ObjectKey{Name: node.Name, Namespace: testNamespace}, nodeState)
258+
Expect(err).To(HaveOccurred())
259+
Expect(errors.IsNotFound(err)).To(BeTrue())
260+
}, time.Minute, time.Second).Should(Succeed())
261+
})
262+
})
263+
})

controllers/sriovoperatorconfig_controller.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -44,12 +44,12 @@ import (
4444
machinev1 "github.com/openshift/machine-config-operator/pkg/apis/machineconfiguration.openshift.io/v1"
4545

4646
sriovnetworkv1 "github.com/k8snetworkplumbingwg/sriov-network-operator/api/v1"
47-
apply "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/apply"
48-
consts "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/consts"
47+
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/apply"
48+
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/consts"
4949
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/featuregate"
5050
snolog "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/log"
5151
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/platforms"
52-
render "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/render"
52+
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/render"
5353
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/vars"
5454
)
5555

@@ -140,7 +140,7 @@ func (r *SriovOperatorConfigReconciler) Reconcile(ctx context.Context, req ctrl.
140140
return reconcile.Result{}, err
141141
}
142142

143-
if err = syncPluginDaemonObjs(ctx, r.Client, r.Scheme, defaultConfig, policyList); err != nil {
143+
if err = syncPluginDaemonObjs(ctx, r.Client, r.Scheme, defaultConfig); err != nil {
144144
return reconcile.Result{}, err
145145
}
146146

0 commit comments

Comments
 (0)