Skip to content

Commit 4ba69ac

Browse files
committed
Implement Probe() function for netResourcePool
The implementation checks: 1. Whether the physical function (PF) of the SR-IOV devices is carrier down. If so, the devices should be marked unhealthy. Normally, SR-IOV would still function when the PF is carrier down. But in the case of DPUs/IPUs/SmartNICs with an embedded CPU, the PF being carrier down **can** signal that the embedded CPU is in reset or shut down. 2. Whether any of the devices are gone. This could be due to someone changing the number of virtual functions. Or, in the case of DPUs/IPUs/SmartNICs with an embedded CPU, the driver may have needed to reset, which causes the virtual functions to be removed. All devices that are gone should be marked unhealthy. Normally this won't be the case, since the SR-IOV Network Operator manages the SR-IOV devices. However, DPUs/IPUs/SmartNICs with an embedded CPU would be externally managed by a separate operator. Both checks can be switched on and off using checkHealthOnPf and checkHealthOnDeviceExist within the resource config. Signed-off-by: William Zhao <[email protected]>
1 parent bec0f56 commit 4ba69ac

File tree

5 files changed

+192
-7
lines changed

5 files changed

+192
-7
lines changed

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -286,6 +286,8 @@ This plugin creates device plugin endpoints based on the configurations given in
286286
| "resourcePrefix" | N | Endpoint resource prefix name override. Should not contain special characters | string Default : "intel.com" | "yourcompany.com" |
287287
| "deviceType" | N | Device Type for a resource pool. | string value of supported types. Default: "netDevice" | Currently supported values: "accelerator", "netDevice", "auxNetDevice" |
288288
| "excludeTopology" | N | Exclude advertising of device's NUMA topology | bool Default: "false" | "excludeTopology": true |
289+
| "checkHealthOnPf" | N | Check the health of a net device by inspecting the link state of the PF | bool Default: "false" | "checkHealthOnPf": true |
290+
| "checkHealthOnDeviceExist" | N | Check the health of a net device by periodically checking if the PCI device exists in sysfs | bool Default: "false" | "checkHealthOnDeviceExist": true |
289291
| "selectors" | N | Either a single device selector map or a list of maps. The list syntax is preferred. The "deviceType" value determines the device selector options. | json list of objects or json object. Default: null | Example: "selectors": [{"vendors": ["8086"],"devices": ["154c"]}] |
290292
| "additionalInfo" | N | A map of map to add additional information to the pod via environment variables to devices | json object as string Default: null | Example: "additionalInfo": {"*": {"token": "3e49019f-412f-4f02-824e-4cd195944205"}} |
291293

pkg/netdevice/netResourcePool.go

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ import (
3030
// netResourcePool is the resource pool used for network devices. It embeds
// the generic ResourcePoolImpl and keeps the extra state that the net-device
// specific methods need.
type netResourcePool struct {
3131
*resources.ResourcePoolImpl
3232
nadutils types.NadUtils // NadUtils implementation handed to NewNetResourcePool
33+
config *types.ResourceConfig // resource config handed to NewNetResourcePool; Probe reads CheckHealthOnPf and CheckHealthOnDeviceExist from it
3334
}
3435

3536
var _ types.ResourcePool = &netResourcePool{}
@@ -41,6 +42,7 @@ func NewNetResourcePool(nadutils types.NadUtils, rc *types.ResourceConfig,
4142
return &netResourcePool{
4243
ResourcePoolImpl: rp,
4344
nadutils: nadutils,
45+
config: rc,
4446
}
4547
}
4648

@@ -66,6 +68,82 @@ func (rp *netResourcePool) GetDeviceSpecs(deviceIDs []string) []*pluginapi.Devic
6668
return devSpecs
6769
}
6870

71+
func (rp *netResourcePool) Probe() bool {
72+
// 1. If the physical function PF of the SR-IOV devices is carrier down. This should be marked unhealthy. Normally, SR-IOV
73+
// would still function when the PF is carrier down. But in the case of DPUs/IPUs/SmartNics with an embedded CPU,
74+
// the PF being down **can** signal that the embedded CPU is in reset or shutdown with carrier down.
75+
// 2. If any of the devices are gone. This could be due to someone changing the number of virtual functions.
76+
// Or in the case of DPUs/IPUs/SmartNics with an embedded CPU, the driver needed to reset. This will cause the
77+
// virtual functions to be removed. All devices that are gone should be marked unhealthy. Normally this won't be the case
78+
// since the SR-IOV Network Operator will be managing the SR-IOV devices. However for DPUs/IPUs/SmartNics with an embedded CPU,
79+
// would be externally managed with a seperate operator.
80+
changes := false
81+
cachedPfLinkStatus := make(map[string]bool)
82+
for id, device := range rp.GetDevicePool() {
83+
netDev, ok := device.(types.PciNetDevice)
84+
if !ok {
85+
// Skip devices that are not PCI net devices
86+
continue
87+
}
88+
currentHealth := device.GetHealth()
89+
pfName := netDev.GetPfNetName()
90+
91+
var pfIsUp bool = true
92+
var err error
93+
pfIsUpLog := ""
94+
if rp.config.CheckHealthOnPf {
95+
if cachedStatus, exists := cachedPfLinkStatus[pfName]; exists {
96+
pfIsUp = cachedStatus
97+
} else {
98+
pfIsUp, err = netDev.IsPfLinkUp()
99+
if err != nil {
100+
// If we can't check the link status, assume it's up. It could be that the PF was moved to a different netns.
101+
// We want a conservative approach, as we don't want to mark the device as unhealthy if we are unsure.
102+
pfIsUp = true
103+
}
104+
cachedPfLinkStatus[pfName] = pfIsUp
105+
}
106+
pfIsUpLog = fmt.Sprintf("PF %s", pfName)
107+
if pfIsUp {
108+
pfIsUpLog = fmt.Sprintf("%s is UP, ", pfIsUpLog)
109+
} else {
110+
pfIsUpLog = fmt.Sprintf("%s is DOWN, ", pfIsUpLog)
111+
}
112+
}
113+
114+
var deviceExists bool = true
115+
deviceExistsLog := ""
116+
if rp.config.CheckHealthOnDeviceExist {
117+
deviceExists = netDev.DeviceExists()
118+
deviceExistsLog = fmt.Sprintf("Device %s", netDev.GetPciAddr())
119+
if deviceExists {
120+
deviceExistsLog = fmt.Sprintf("%s is existing, ", deviceExistsLog)
121+
} else {
122+
deviceExistsLog = fmt.Sprintf("%s is missing, ", deviceExistsLog)
123+
}
124+
}
125+
126+
if pfIsUp && deviceExists && !currentHealth {
127+
glog.Infof("%s%sdevice was unhealthy, marking device %s as healthy", pfIsUpLog, deviceExistsLog, id)
128+
device.SetHealth(true)
129+
changes = true
130+
} else if !pfIsUp && deviceExists && currentHealth {
131+
glog.Infof("%s%sdevice was healthy, marking device %s as unhealthy", pfIsUpLog, deviceExistsLog, id)
132+
device.SetHealth(false)
133+
changes = true
134+
} else if pfIsUp && !deviceExists && currentHealth {
135+
glog.Infof("%s%sdevice was healthy, marking device %s as unhealthy", pfIsUpLog, deviceExistsLog, id)
136+
device.SetHealth(false)
137+
changes = true
138+
} else if !pfIsUp && !deviceExists && currentHealth {
139+
glog.Infof("%s%sdevice was healthy, marking device %s as unhealthy", pfIsUpLog, deviceExistsLog, id)
140+
device.SetHealth(false)
141+
changes = true
142+
}
143+
}
144+
return changes
145+
}
146+
69147
// StoreDeviceInfoFile stores the Device Info files according to the
70148
// k8snetworkplumbingwg/device-info-spec
71149
// for the requested deviceIDs

pkg/netdevice/netResourcePool_test.go

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -237,4 +237,107 @@ var _ = Describe("NetResourcePool", func() {
237237
})
238238
})
239239
})
240+
Describe("Health Checking on Net Devices", func() {
241+
Context("PF link status and device existance for VFs with Same PF", func() {
242+
rf := factory.NewResourceFactory("fake", "fake", true, false)
243+
nadutils := rf.GetNadUtils()
244+
rc := &types.ResourceConfig{
245+
ResourceName: "fake",
246+
ResourcePrefix: "fake",
247+
CheckHealthOnPf: true,
248+
CheckHealthOnDeviceExist: true,
249+
SelectorObjs: []interface{}{&types.NetDeviceSelectors{}},
250+
}
251+
252+
var fake1health bool = true
253+
var fake2health bool = true
254+
255+
fake1 := &mocks.PciNetDevice{}
256+
fake1.On("GetPfNetName").Return("fakepf1")
257+
fake1.On("GetPciAddr").Return("0000:01:00.1")
258+
fake1.On("SetHealth", Anything).Run(func(args Arguments) {
259+
fake1health = args.Bool(0)
260+
}).Return()
261+
262+
fake2 := &mocks.PciNetDevice{}
263+
fake2.On("GetPfNetName").Return("fakepf1")
264+
fake2.On("GetPciAddr").Return("0000:01:00.2")
265+
fake2.On("SetHealth", Anything).Run(func(args Arguments) {
266+
fake2health = args.Bool(0)
267+
}).Return()
268+
269+
pcis := map[string]types.HostDevice{"fake1": fake1, "fake2": fake2}
270+
271+
rp := netdevice.NewNetResourcePool(nadutils, rc, pcis)
272+
273+
SetCurrentHealth := func(health bool) {
274+
fake1health = health
275+
fake2health = health
276+
fake1.On("GetHealth").Unset()
277+
fake2.On("GetHealth").Unset()
278+
fake1.On("GetHealth").Return(health).Once()
279+
fake2.On("GetHealth").Return(health).Once()
280+
}
281+
282+
SetCurrentLinkState := func(up bool) {
283+
fake1.On("IsPfLinkUp").Unset()
284+
fake2.On("IsPfLinkUp").Unset()
285+
fake1.On("IsPfLinkUp").Return(up, nil).Once()
286+
fake2.On("IsPfLinkUp").Return(up, nil).Once()
287+
}
288+
289+
SetCurrentDeviceExistance := func(exist bool) {
290+
fake1.On("DeviceExists").Unset()
291+
fake2.On("DeviceExists").Unset()
292+
fake1.On("DeviceExists").Return(exist, nil).Once()
293+
fake2.On("DeviceExists").Return(exist, nil).Once()
294+
}
295+
296+
RunTest := func(health, link, exist bool) {
297+
SetCurrentHealth(health)
298+
SetCurrentLinkState(link)
299+
SetCurrentDeviceExistance(exist)
300+
change := rp.Probe()
301+
if health && link && exist {
302+
Expect(change).To(BeFalse())
303+
} else if !health && !(link && exist) {
304+
Expect(change).To(BeFalse())
305+
} else {
306+
Expect(change).To(BeTrue())
307+
}
308+
if link && exist {
309+
Expect(fake1health).To(BeTrue())
310+
Expect(fake2health).To(BeTrue())
311+
} else {
312+
Expect(fake1health).To(BeFalse())
313+
Expect(fake2health).To(BeFalse())
314+
}
315+
}
316+
317+
It("Currently Device Healthy, PF Link Up, Device exists", func() {
318+
RunTest(true, true, true)
319+
})
320+
It("Currently Device Healthy, PF Link Up, Device missing", func() {
321+
RunTest(true, true, false)
322+
})
323+
It("Currently Device Healthy, PF Link Down, Device exists", func() {
324+
RunTest(true, false, true)
325+
})
326+
It("Currently Device Healthy, PF Link Down, Device missing", func() {
327+
RunTest(true, false, false)
328+
})
329+
It("Currently Device Unhealthy, PF Link Up, Device exists", func() {
330+
RunTest(false, true, true)
331+
})
332+
It("Currently Device Unhealthy, PF Link Up, Device missing", func() {
333+
RunTest(false, true, false)
334+
})
335+
It("Currently Device Unhealthy, PF Link Down, Device exists", func() {
336+
RunTest(false, false, true)
337+
})
338+
It("Currently Device Unhealthy, PF Link Down, Device missing", func() {
339+
RunTest(false, false, false)
340+
})
341+
})
342+
})
240343
})

pkg/resources/server.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ func NewResourceServer(prefix, suffix string, pluginWatch, useCdi bool, rp types
7373
termSignal: make(chan bool, 1),
7474
updateSignal: make(chan bool),
7575
stopWatcher: make(chan bool),
76-
checkIntervals: 20, // updates every 20 seconds
76+
checkIntervals: 5, // updates every 5 seconds
7777
cdi: cdiPkg.New(),
7878
}
7979
}

pkg/types/types.go

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -97,12 +97,14 @@ type ResourceConfig struct {
9797
// optional resource prefix that will overwrite global prefix specified in cli params
9898
ResourcePrefix string `json:"resourcePrefix,omitempty"`
9999
//nolint:lll
100-
ResourceName string `json:"resourceName"` // the resource name will be added with resource prefix in K8s api
101-
DeviceType DeviceType `json:"deviceType,omitempty"`
102-
ExcludeTopology bool `json:"excludeTopology,omitempty"`
103-
Selectors *json.RawMessage `json:"selectors,omitempty"`
104-
AdditionalInfo map[string]AdditionalInfo `json:"additionalInfo,omitempty"`
105-
SelectorObjs []interface{}
100+
ResourceName string `json:"resourceName"` // the resource name will be added with resource prefix in K8s api
101+
DeviceType DeviceType `json:"deviceType,omitempty"`
102+
ExcludeTopology bool `json:"excludeTopology,omitempty"`
103+
CheckHealthOnPf bool `json:"checkHealthOnPf,omitempty"`
104+
CheckHealthOnDeviceExist bool `json:"checkHealthOnDeviceExist,omitempty"`
105+
Selectors *json.RawMessage `json:"selectors,omitempty"`
106+
AdditionalInfo map[string]AdditionalInfo `json:"additionalInfo,omitempty"`
107+
SelectorObjs []interface{}
106108
}
107109

108110
// DeviceSelectors contains common device selectors fields

0 commit comments

Comments
 (0)