-
Notifications
You must be signed in to change notification settings - Fork 197
Add health check for Net Devices #640
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
3aa9914
d56d95b
3f71202
875ca5d
55719bb
ebbed81
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -30,6 +30,7 @@ import ( | |
| type netResourcePool struct { | ||
| *resources.ResourcePoolImpl | ||
| nadutils types.NadUtils | ||
| config *types.ResourceConfig | ||
| } | ||
|
|
||
| var _ types.ResourcePool = &netResourcePool{} | ||
|
|
@@ -41,6 +42,7 @@ func NewNetResourcePool(nadutils types.NadUtils, rc *types.ResourceConfig, | |
| return &netResourcePool{ | ||
| ResourcePoolImpl: rp, | ||
| nadutils: nadutils, | ||
| config: rc, | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -66,6 +68,66 @@ func (rp *netResourcePool) GetDeviceSpecs(deviceIDs []string) []*pluginapi.Devic | |
| return devSpecs | ||
| } | ||
|
|
||
| func (rp *netResourcePool) Probe() bool { | ||
| // 1. If the physical function PF of the SR-IOV devices is carrier down. This should be marked unhealthy. Normally, SR-IOV | ||
| // would still function when the PF is carrier down. But in the case of DPUs/IPUs/SmartNics with an embedded CPU, | ||
| // the PF being down **can** signal that the embedded CPU is in reset or shutdown with carrier down. | ||
| // 2. If any of the devices are gone. This could be due to someone changing the number of virtual functions. | ||
| // Or in the case of DPUs/IPUs/SmartNics with an embedded CPU, the driver needed to reset. This will cause the | ||
| // virtual functions to be removed. All devices that are gone should be marked unhealthy. Normally this won't be the case | ||
| // since the SR-IOV Network Operator will be managing the SR-IOV devices. However for DPUs/IPUs/SmartNics with an embedded CPU, | ||
| // would be externally managed with a separate operator. | ||
| changes := false | ||
| cachedPfLinkStatus := make(map[string]bool) | ||
| for id, device := range rp.GetDevicePool() { | ||
| netDev, ok := device.(types.PciNetDevice) | ||
| if !ok { | ||
| // Skip devices that are not PCI net devices | ||
| continue | ||
| } | ||
| currentHealth := device.GetHealth() | ||
| pfName := netDev.GetPfNetName() | ||
|
|
||
| pfIsUp := true | ||
| var err error | ||
| pfIsUpLog := "" | ||
| if rp.config.CheckHealthOnPf { | ||
| if cachedStatus, exists := cachedPfLinkStatus[pfName]; exists { | ||
| pfIsUp = cachedStatus | ||
| } else { | ||
| pfIsUp, err = netDev.IsPfLinkUp() | ||
| if err != nil { | ||
| // If we can't check the link status, assume it's up. It could be that the PF was moved to a different netns. | ||
| // We want a conservative approach, as we don't want to mark the device as unhealthy if we are unsure. | ||
| pfIsUp = true | ||
| } | ||
| cachedPfLinkStatus[pfName] = pfIsUp | ||
| } | ||
| pfIsUpLog = fmt.Sprintf("PF %s is %s,", pfName, map[bool]string{true: "Up", false: "Down"}[pfIsUp]) | ||
| } | ||
|
|
||
| deviceExists := true | ||
| deviceExistsLog := "" | ||
| if rp.config.CheckHealthOnDeviceExist { | ||
| deviceExists = netDev.DeviceExists() | ||
| deviceExistsLog = fmt.Sprintf("Device %s is %s,", netDev.GetPciAddr(), | ||
| map[bool]string{true: "existing", false: "missing"}[deviceExists]) | ||
| } | ||
|
|
||
| if pfIsUp && deviceExists && !currentHealth { | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. do you think we can make this long if else statement simpler? took me some time to get it
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please take a look now at the reduce if statement. Let me know if it is better. |
||
| glog.Infof("%s %s device was unhealthy, marking device %s as healthy", pfIsUpLog, deviceExistsLog, id) | ||
| device.SetHealth(true) | ||
| changes = true | ||
| } else if (!pfIsUp || !deviceExists) && currentHealth { | ||
| // If either the PF is down or the device is missing: | ||
| glog.Infof("%s %s device was healthy, marking device %s as unhealthy", pfIsUpLog, deviceExistsLog, id) | ||
| device.SetHealth(false) | ||
| changes = true | ||
| } | ||
| } | ||
| return changes | ||
| } | ||
|
|
||
| // StoreDeviceInfoFile stores the Device Info files according to the | ||
| // k8snetworkplumbingwg/device-info-spec | ||
| // for the requested deviceIDs | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -73,7 +73,7 @@ func NewResourceServer(prefix, suffix string, pluginWatch, useCdi bool, rp types | |
| termSignal: make(chan bool, 1), | ||
| updateSignal: make(chan bool), | ||
| stopWatcher: make(chan bool), | ||
| checkIntervals: 20, // updates every 20 seconds | ||
| checkIntervals: 5, // updates every 5 seconds | ||
|
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Light be too short for Telco environments. |
||
| cdi: cdiPkg.New(), | ||
| } | ||
| } | ||
|
|
||
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
should this be a list ? that way we dont overcumber the API
e.g
healthChecks: ["DeviceNetworkLink", "DeviceExists"]or similarThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I want to add also something like this
Also today we have no implementation of probe it was running a sleep loop doing nothing.
let's change that to only start the probe go routine if there is something in the
healthCheckslist