forked from k8snetworkplumbingwg/sriov-network-operator
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmellanox_plugin.go
More file actions
264 lines (228 loc) · 8.81 KB
/
mellanox_plugin.go
File metadata and controls
264 lines (228 loc) · 8.81 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
package mellanox
import (
"fmt"
"sigs.k8s.io/controller-runtime/pkg/log"
sriovnetworkv1 "github.com/k8snetworkplumbingwg/sriov-network-operator/api/v1"
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/consts"
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/helper"
plugin "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/plugins"
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/vars"
mlx "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/vendors/mellanox"
)
// PluginName is the name assigned to plugins created by NewMellanoxPlugin.
var PluginName = "mellanox"
// MellanoxPlugin is the plugin type that NewMellanoxPlugin returns as a plugin.VendorPlugin.
type MellanoxPlugin struct {
// PluginName is the name reported by Name.
PluginName string
// SpecVersion is the spec version reported by Spec.
SpecVersion string
// helpers provides the host operations the plugin calls
// (lockdown check, firmware data access, firmware config/reset, PF status loading).
helpers helper.HostHelpersInterface
}
// Package-level state shared between OnNodeStateChange, which rebuilds it on
// every call, and Apply, which consumes it.
var (
	// pciAddressesToReset lists the PCI addresses handed to MlxResetFW by Apply.
	pciAddressesToReset []string
	// attributesToChange maps a PCI address to the firmware attributes handed
	// to MlxConfigFW by Apply.
	attributesToChange map[string]mlx.MlxNic
	// mellanoxNicsStatus maps a PCI address prefix to the status of each
	// interface under that prefix, keyed by full PCI address.
	mellanoxNicsStatus map[string]map[string]sriovnetworkv1.InterfaceExt
	// mellanoxNicsSpec maps a PCI address to the requested interface spec.
	mellanoxNicsSpec map[string]sriovnetworkv1.Interface
)
// NewMellanoxPlugin builds a MellanoxPlugin around the given host helpers and
// resets the package-level mellanoxNicsStatus map to an empty map.
// The returned error is always nil.
func NewMellanoxPlugin(helpers helper.HostHelpersInterface) (plugin.VendorPlugin, error) {
	mellanoxNicsStatus = make(map[string]map[string]sriovnetworkv1.InterfaceExt)

	mlxPlugin := &MellanoxPlugin{
		PluginName:  PluginName,
		SpecVersion: "1.0",
		helpers:     helpers,
	}
	return mlxPlugin, nil
}
// Name reports the plugin name stored in the PluginName field.
func (p *MellanoxPlugin) Name() string {
	return p.PluginName
}
// Spec reports the spec version stored in the SpecVersion field.
func (p *MellanoxPlugin) Spec() string {
	return p.SpecVersion
}
// OnNodeStateChange is invoked when the SriovNetworkNodeState CR is created or
// updated. It rebuilds the package-level plugin state (pciAddressesToReset,
// attributesToChange, mellanoxNicsStatus, mellanoxNicsSpec) from the given node
// state and reports whether the node needs to be drained and/or rebooted.
//
// needReboot is true when at least one Mellanox NIC present in the spec needs a
// reboot for its TotalVfs, SR-IOV enablement or link type change; needDrain
// always equals needReboot.
//
// An error is returned when:
//   - the kernel is in lockdown mode and the spec contains Mellanox interfaces,
//   - firmware data or the stored PF status cannot be loaded,
//   - the link type handling fails,
//   - an externally managed interface would need a TotalVfs or link type change.
//
// On error both booleans are false.
func (p *MellanoxPlugin) OnNodeStateChange(new *sriovnetworkv1.SriovNetworkNodeState) (needDrain bool, needReboot bool, err error) {
	log.Log.Info("mellanox plugin OnNodeStateChange()")

	// Start from a clean slate on every call: the results are kept in
	// package-level variables that Apply consumes later.
	needDrain = false
	needReboot = false
	err = nil
	pciAddressesToReset = []string{}
	attributesToChange = map[string]mlx.MlxNic{}
	mellanoxNicsStatus = map[string]map[string]sriovnetworkv1.InterfaceExt{}
	mellanoxNicsSpec = map[string]sriovnetworkv1.Interface{}
	processedNics := map[string]bool{}

	// fill mellanoxNicsStatus, grouping the ports of a NIC under their shared PCI prefix
	for _, iface := range new.Status.Interfaces {
		if iface.Vendor != mlx.MellanoxVendorID {
			continue
		}

		pciPrefix := mlx.GetPciAddressPrefix(iface.PciAddress)
		if ifaces, ok := mellanoxNicsStatus[pciPrefix]; ok {
			ifaces[iface.PciAddress] = iface
		} else {
			mellanoxNicsStatus[pciPrefix] = map[string]sriovnetworkv1.InterfaceExt{iface.PciAddress: iface}
		}
	}

	// Add only mellanox cards that required changes in the map, to help track dual port NICs
	for _, iface := range new.Spec.Interfaces {
		pciPrefix := mlx.GetPciAddressPrefix(iface.PciAddress)
		if _, ok := mellanoxNicsStatus[pciPrefix]; !ok {
			continue
		}
		mellanoxNicsSpec[iface.PciAddress] = iface
	}

	if p.helpers.IsKernelLockdownMode() {
		if len(mellanoxNicsSpec) > 0 {
			log.Log.Info("Lockdown mode detected, failing on interface update for mellanox devices")
			return false, false, fmt.Errorf("mellanox device detected when in lockdown mode")
		}
		log.Log.Info("Lockdown mode detected, skpping mellanox nic processing")
		return false, false, nil
	}

	for _, ifaceSpec := range mellanoxNicsSpec {
		pciPrefix := mlx.GetPciAddressPrefix(ifaceSpec.PciAddress)
		// skip processed nics, help not running the same logic 2 times for dual port NICs
		if _, ok := processedNics[pciPrefix]; ok {
			continue
		}
		processedNics[pciPrefix] = true

		fwCurrent, fwNext, err := p.helpers.GetMlxNicFwData(ifaceSpec.PciAddress)
		if err != nil {
			return false, false, err
		}

		isDualPort := mlx.IsDualPort(ifaceSpec.PciAddress, mellanoxNicsStatus)
		// Attributes to change
		attrs := &mlx.MlxNic{TotalVfs: -1}

		totalVfs, totalVfsNeedReboot, totalVfsChangeWithoutReboot := mlx.HandleTotalVfs(fwCurrent, fwNext, attrs, ifaceSpec, isDualPort, mellanoxNicsSpec)
		sriovEnNeedReboot, sriovEnChangeWithoutReboot := mlx.HandleEnableSriov(totalVfs, fwCurrent, fwNext, attrs)
		changeWithoutReboot := totalVfsChangeWithoutReboot || sriovEnChangeWithoutReboot

		needLinkChange, err := mlx.HandleLinkType(pciPrefix, fwCurrent, attrs, mellanoxNicsSpec, mellanoxNicsStatus)
		if err != nil {
			return false, false, err
		}

		// Reboot requirement of THIS NIC only. It is kept separate from the
		// function result so that a later NIC that needs no reboot cannot
		// clear the requirement raised by an earlier one (map iteration order
		// is unspecified, so the "last" NIC is effectively random).
		nicNeedReboot := totalVfsNeedReboot || sriovEnNeedReboot || needLinkChange

		// no FW changes allowed when NIC is externally managed
		if ifaceSpec.ExternallyManaged {
			if totalVfsNeedReboot || totalVfsChangeWithoutReboot {
				return false, false, fmt.Errorf(
					"interface %s required a change in the TotalVfs but the policy is externally managed failing: firmware TotalVf %d requested TotalVf %d",
					ifaceSpec.PciAddress, fwCurrent.TotalVfs, totalVfs)
			}
			if needLinkChange {
				return false, false, fmt.Errorf("change required for link type but the policy is externally managed, failing")
			}
		}

		if nicNeedReboot || changeWithoutReboot {
			attributesToChange[ifaceSpec.PciAddress] = *attrs
		}

		if nicNeedReboot {
			pciAddressesToReset = append(pciAddressesToReset, ifaceSpec.PciAddress)
			needReboot = true
		}
	}

	// Set total VFs to 0 for mellanox interfaces with no spec
	for pciPrefix, portsMap := range mellanoxNicsStatus {
		if _, ok := processedNics[pciPrefix]; ok {
			continue
		}

		// Add the nic to processed Nics to not repeat the process for dual nic ports
		processedNics[pciPrefix] = true
		// The firmware of the NIC is addressed through the "<prefix>0" PCI address.
		pciAddress := pciPrefix + "0"

		// Skip devices not configured by the operator
		isConfigured, err := p.nicConfiguredByOperator(portsMap)
		if err != nil {
			return false, false, err
		}
		if !isConfigured {
			log.Log.V(2).Info("None of the ports are configured by the operator skipping firmware reset",
				"portMap", portsMap)
			continue
		}

		// Skip externally managed NICs
		hasExternally, err := p.nicHasExternallyManagedPFs(portsMap)
		if err != nil {
			return false, false, err
		}
		if hasExternally {
			log.Log.V(2).Info("One of the ports is configured as externally managed skipping firmware reset",
				"portMap", portsMap)
			continue
		}

		// Skip unsupported devices
		if id := sriovnetworkv1.GetVfDeviceID(portsMap[pciAddress].DeviceID); id == "" {
			continue
		}

		_, fwNext, err := p.helpers.GetMlxNicFwData(pciAddress)
		if err != nil {
			return false, false, err
		}

		if fwNext.TotalVfs > 0 || fwNext.EnableSriov {
			attributesToChange[pciAddress] = mlx.MlxNic{TotalVfs: 0}
			log.Log.V(2).Info("Changing TotalVfs to 0, doesn't require rebooting", "fwNext.totalVfs", fwNext.TotalVfs)
		}
	}

	// A reboot always implies a drain.
	if needReboot {
		needDrain = true
	}
	log.Log.V(2).Info("mellanox plugin", "need-drain", needDrain, "need-reboot", needReboot)
	return needDrain, needReboot, nil
}
// CheckStatusChanges is intended to verify whether the SriovNetworkNodeState CR
// status presents changes on configured VFs. It is not implemented yet and
// always returns (false, nil).
// TODO: implement - https://github.com/k8snetworkplumbingwg/sriov-network-operator/issues/631
func (p *MellanoxPlugin) CheckStatusChanges(_ *sriovnetworkv1.SriovNetworkNodeState) (bool, error) {
	return false, nil
}
// Apply passes the package-level attributesToChange map to MlxConfigFW and,
// when vars.MlxPluginFwReset is set, passes pciAddressesToReset to MlxResetFW.
// In kernel lockdown mode nothing is done and nil is returned.
func (p *MellanoxPlugin) Apply() error {
	if p.helpers.IsKernelLockdownMode() {
		log.Log.Info("mellanox plugin Apply() - skipping due to lockdown mode")
		return nil
	}

	log.Log.Info("mellanox plugin Apply()")
	err := p.helpers.MlxConfigFW(attributesToChange)
	if err != nil {
		return err
	}

	// The firmware reset is optional and only runs when explicitly enabled.
	if !vars.MlxPluginFwReset {
		return nil
	}
	return p.helpers.MlxResetFW(pciAddressesToReset)
}
// nicHasExternallyManagedPFs reports whether at least one port (interface) of
// the NIC has a stored PF status, as loaded through LoadPfsStatus, that is
// marked as externally managed. Ports without a stored status are ignored.
// A failure to load a PF status is logged and returned.
func (p *MellanoxPlugin) nicHasExternallyManagedPFs(nicPortsMap map[string]sriovnetworkv1.InterfaceExt) (bool, error) {
	for _, port := range nicPortsMap {
		status, found, err := p.helpers.LoadPfsStatus(port.PciAddress)
		switch {
		case err != nil:
			// nolint:goconst
			log.Log.Error(err, "failed to load PF status from disk. "+
				"This should not happen, to overcome config daemon stuck, "+
				"please remove the PCI file on the host under the operator configuration path",
				"path", consts.PfAppliedConfig, "pciAddress", port.PciAddress)
			return false, err
		case found && status.ExternallyManaged:
			log.Log.V(2).Info("PF is extenally managed, skip FW TotalVfs reset")
			return true, nil
		}
	}

	return false, nil
}
// nicConfiguredByOperator reports whether at least one port (interface) of the
// NIC has a stored PF status, as loaded through LoadPfsStatus.
// A failure to load a PF status is logged and returned.
func (p *MellanoxPlugin) nicConfiguredByOperator(nicPortsMap map[string]sriovnetworkv1.InterfaceExt) (bool, error) {
	for _, port := range nicPortsMap {
		_, found, err := p.helpers.LoadPfsStatus(port.PciAddress)
		if err != nil {
			// nolint:goconst
			log.Log.Error(err, "failed to load PF status from disk. "+
				"This should not happen, to overcome config daemon stuck, "+
				"please remove the PCI file on the host under the operator configuration path",
				"path", consts.PfAppliedConfig, "pciAddress", port.PciAddress)
			return false, err
		}
		if !found {
			continue
		}

		log.Log.V(2).Info("PF configured by the operator", "interface", port)
		return true, nil
	}

	return false, nil
}