Skip to content

Commit fc0ef70

Browse files
committed
Fix the issue where scheduling can still occur on the node when the device-plugin crashes.
Signed-off-by: chaunceyjiang <[email protected]>
1 parent b398634 commit fc0ef70

File tree

11 files changed

+83
-81
lines changed

11 files changed

+83
-81
lines changed

pkg/device/ascend/device.go

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -177,8 +177,8 @@ func (dev *AscendDevices) CheckUUID(annos map[string]string, d util.DeviceUsage)
177177
return true
178178
}
179179

180-
func (dev *AscendDevices) CheckHealth(devType string, n *corev1.Node) (bool, bool) {
181-
return true, true
180+
func (dev *AscendDevices) CheckHealth(devType string, n *corev1.Node) bool {
181+
return true
182182
}
183183

184184
func trimMemory(i int64) int64 {

pkg/device/cambricon/device.go

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -167,8 +167,8 @@ func (dev *CambriconDevices) NodeCleanUp(nn string) error {
167167
return nil
168168
}
169169

170-
func (dev *CambriconDevices) CheckHealth(devType string, n *corev1.Node) (bool, bool) {
171-
return true, true
170+
func (dev *CambriconDevices) CheckHealth(devType string, n *corev1.Node) bool {
171+
return true
172172
}
173173

174174
func (dev *CambriconDevices) GetNodeDevices(n corev1.Node) ([]*api.DeviceInfo, error) {

pkg/device/devices.go

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -39,7 +39,7 @@ import (
3939

4040
type Devices interface {
4141
MutateAdmission(ctr *corev1.Container) (bool, error)
42-
CheckHealth(devType string, n *corev1.Node) (bool, bool)
42+
CheckHealth(devType string, n *corev1.Node) bool
4343
NodeCleanUp(nn string) error
4444
GetNodeDevices(n corev1.Node) ([]*api.DeviceInfo, error)
4545
CheckType(annos map[string]string, d util.DeviceUsage, n util.ContainerDeviceRequest) (bool, bool, bool)

pkg/device/hygon/device.go

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -131,7 +131,7 @@ func (dev *DCUDevices) NodeCleanUp(nn string) error {
131131
return util.MarkAnnotationsToDelete(HygonDCUDevice, nn)
132132
}
133133

134-
func (dev *DCUDevices) CheckHealth(devType string, n *corev1.Node) (bool, bool) {
134+
func (dev *DCUDevices) CheckHealth(devType string, n *corev1.Node) bool {
135135
return util.CheckHealth(devType, n)
136136
}
137137

pkg/device/iluvatar/device.go

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -158,8 +158,8 @@ func (dev *IluvatarDevices) CheckUUID(annos map[string]string, d util.DeviceUsag
158158
return true
159159
}
160160

161-
func (dev *IluvatarDevices) CheckHealth(devType string, n *corev1.Node) (bool, bool) {
162-
return true, true
161+
func (dev *IluvatarDevices) CheckHealth(devType string, n *corev1.Node) bool {
162+
return true
163163
}
164164

165165
func (dev *IluvatarDevices) GenerateResourceRequests(ctr *corev1.Container) util.ContainerDeviceRequest {

pkg/device/nvidia/device.go

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -81,7 +81,7 @@ func (dev *NvidiaGPUDevices) NodeCleanUp(nn string) error {
8181
return util.MarkAnnotationsToDelete(NvidiaGPUDevice, nn)
8282
}
8383

84-
func (dev *NvidiaGPUDevices) CheckHealth(devType string, n *corev1.Node) (bool, bool) {
84+
func (dev *NvidiaGPUDevices) CheckHealth(devType string, n *corev1.Node) bool {
8585
return util.CheckHealth(devType, n)
8686
}
8787

pkg/scheduler/nodes.go

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -57,7 +57,7 @@ func (m *nodeManager) addNode(nodeID string, nodeInfo *util.NodeInfo) {
5757
}
5858
}
5959

60-
func (m *nodeManager) rmNodeDevice(nodeID string, nodeInfo *util.NodeInfo) {
60+
func (m *nodeManager) rmNodeDevice(nodeID string, nodeInfo *util.NodeInfo, deviceVendor string) {
6161
m.mutex.Lock()
6262
defer m.mutex.Unlock()
6363
_, ok := m.nodes[nodeID]
@@ -69,6 +69,9 @@ func (m *nodeManager) rmNodeDevice(nodeID string, nodeInfo *util.NodeInfo) {
6969
klog.V(5).Infoln("before rm:", m.nodes[nodeID].Devices, "needs remove", nodeInfo.Devices)
7070
tmp := make([]util.DeviceInfo, 0, len(m.nodes[nodeID].Devices)-len(nodeInfo.Devices))
7171
for _, val := range m.nodes[nodeID].Devices {
72+
if strings.Compare(val.DeviceVendor, deviceVendor) != 0 {
73+
continue
74+
}
7275
found := false
7376
for _, rmval := range nodeInfo.Devices {
7477
if strings.Compare(val.ID, rmval.ID) == 0 {

pkg/scheduler/scheduler.go

Lines changed: 18 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -150,7 +150,6 @@ func (s *Scheduler) Stop() {
150150

151151
func (s *Scheduler) RegisterFromNodeAnnotations() {
152152
klog.V(5).Infoln("Scheduler into RegisterFromNodeAnnotations")
153-
nodeInfoCopy := make(map[string]*util.NodeInfo)
154153
ticker := time.NewTicker(time.Second * 15)
155154
for {
156155
select {
@@ -168,26 +167,20 @@ func (s *Scheduler) RegisterFromNodeAnnotations() {
168167
for _, val := range nodes {
169168
nodeNames = append(nodeNames, val.Name)
170169
for devhandsk, devInstance := range device.GetDevices() {
171-
health, needUpdate := devInstance.CheckHealth(devhandsk, val)
170+
health := devInstance.CheckHealth(devhandsk, val)
172171
if !health {
173-
_, ok := s.nodes[val.Name]
172+
info, ok := s.nodes[val.Name]
174173
if ok {
175-
_, ok = nodeInfoCopy[devhandsk]
176-
if ok && nodeInfoCopy[devhandsk] != nil {
177-
s.rmNodeDevice(val.Name, nodeInfoCopy[devhandsk])
178-
klog.Infof("node %v device %s:%v leave, %v remaining devices:%v", val.Name, devhandsk, nodeInfoCopy[devhandsk], err, s.nodes[val.Name].Devices)
179-
180-
err := devInstance.NodeCleanUp(val.Name)
181-
if err != nil {
182-
klog.ErrorS(err, "markAnnotationsToDeleteFailed")
183-
}
184-
continue
174+
err := devInstance.NodeCleanUp(val.Name)
175+
if err != nil {
176+
klog.ErrorS(err, "markAnnotationsToDeleteFailed")
185177
}
178+
s.rmNodeDevice(val.Name, info, devhandsk)
179+
klog.Infof("node %v device %s:%v leave, %v remaining devices:%v", val.Name, devhandsk, info, err, s.nodes[val.Name].Devices)
186180
}
187-
}
188-
if !needUpdate {
189181
continue
190182
}
183+
191184
_, ok := util.HandshakeAnnos[devhandsk]
192185
if ok {
193186
tmppat := make(map[string]string)
@@ -223,21 +216,21 @@ func (s *Scheduler) RegisterFromNodeAnnotations() {
223216
}
224217
if !found {
225218
nodeInfo.Devices = append(nodeInfo.Devices, util.DeviceInfo{
226-
ID: deviceinfo.Id,
227-
Index: uint(deviceinfo.Index),
228-
Count: deviceinfo.Count,
229-
Devmem: deviceinfo.Devmem,
230-
Devcore: deviceinfo.Devcore,
231-
Type: deviceinfo.Type,
232-
Numa: deviceinfo.Numa,
233-
Health: deviceinfo.Health,
219+
ID: deviceinfo.Id,
220+
Index: uint(deviceinfo.Index),
221+
Count: deviceinfo.Count,
222+
Devmem: deviceinfo.Devmem,
223+
Devcore: deviceinfo.Devcore,
224+
Type: deviceinfo.Type,
225+
Numa: deviceinfo.Numa,
226+
Health: deviceinfo.Health,
227+
DeviceVendor: devhandsk,
234228
})
235229
}
236230
}
237231
s.addNode(val.Name, nodeInfo)
238-
nodeInfoCopy[devhandsk] = nodeInfo
239232
if s.nodes[val.Name] != nil && len(nodeInfo.Devices) > 0 {
240-
klog.Infof("node %v device %s come node info=%v total=%v", val.Name, devhandsk, nodeInfoCopy[devhandsk], s.nodes[val.Name].Devices)
233+
klog.Infof("node %v device %s come node info=%v total=%v", val.Name, devhandsk, nodeInfo, s.nodes[val.Name].Devices)
241234
}
242235
}
243236
}

pkg/scheduler/scheduler_test.go

Lines changed: 37 additions & 33 deletions
Original file line number | Diff line number | Diff line change
@@ -221,7 +221,7 @@ func Test_Filter(t *testing.T) {
221221
nodes, _ := s.ListNodes()
222222
for index := range nodes {
223223
node := nodes[index]
224-
s.rmNodeDevice(node.ID, node)
224+
s.rmNodeDevice(node.ID, node, nvidia.NvidiaGPUDevice)
225225
}
226226
pods, _ := s.ListPods()
227227
for index := range pods {
@@ -232,49 +232,53 @@ func Test_Filter(t *testing.T) {
232232
ID: "node1",
233233
Devices: []util.DeviceInfo{
234234
{
235-
ID: "device1",
236-
Index: 0,
237-
Count: 10,
238-
Devmem: 8000,
239-
Devcore: 100,
240-
Numa: 0,
241-
Type: nvidia.NvidiaGPUDevice,
242-
Health: true,
235+
ID: "device1",
236+
Index: 0,
237+
Count: 10,
238+
Devmem: 8000,
239+
Devcore: 100,
240+
Numa: 0,
241+
Type: nvidia.NvidiaGPUDevice,
242+
Health: true,
243+
DeviceVendor: nvidia.NvidiaGPUDevice,
243244
},
244245
{
245-
ID: "device2",
246-
Index: 1,
247-
Count: 10,
248-
Devmem: 8000,
249-
Devcore: 100,
250-
Numa: 0,
251-
Type: nvidia.NvidiaGPUDevice,
252-
Health: true,
246+
ID: "device2",
247+
Index: 1,
248+
Count: 10,
249+
Devmem: 8000,
250+
Devcore: 100,
251+
Numa: 0,
252+
Type: nvidia.NvidiaGPUDevice,
253+
Health: true,
254+
DeviceVendor: nvidia.NvidiaGPUDevice,
253255
},
254256
},
255257
})
256258
s.addNode("node2", &util.NodeInfo{
257259
ID: "node2",
258260
Devices: []util.DeviceInfo{
259261
{
260-
ID: "device3",
261-
Index: 0,
262-
Count: 10,
263-
Devmem: 8000,
264-
Devcore: 100,
265-
Numa: 0,
266-
Type: nvidia.NvidiaGPUDevice,
267-
Health: true,
262+
ID: "device3",
263+
Index: 0,
264+
Count: 10,
265+
Devmem: 8000,
266+
Devcore: 100,
267+
Numa: 0,
268+
Type: nvidia.NvidiaGPUDevice,
269+
Health: true,
270+
DeviceVendor: nvidia.NvidiaGPUDevice,
268271
},
269272
{
270-
ID: "device4",
271-
Index: 1,
272-
Count: 10,
273-
Devmem: 8000,
274-
Devcore: 100,
275-
Numa: 0,
276-
Type: nvidia.NvidiaGPUDevice,
277-
Health: true,
273+
ID: "device4",
274+
Index: 1,
275+
Count: 10,
276+
Devmem: 8000,
277+
Devcore: 100,
278+
Numa: 0,
279+
Type: nvidia.NvidiaGPUDevice,
280+
Health: true,
281+
DeviceVendor: nvidia.NvidiaGPUDevice,
278282
},
279283
},
280284
})

pkg/util/types.go

Lines changed: 9 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -124,14 +124,15 @@ type DeviceUsage struct {
124124
}
125125

126126
type DeviceInfo struct {
127-
ID string
128-
Index uint
129-
Count int32
130-
Devmem int32
131-
Devcore int32
132-
Type string
133-
Numa int
134-
Health bool
127+
ID string
128+
Index uint
129+
Count int32
130+
Devmem int32
131+
Devcore int32
132+
Type string
133+
Numa int
134+
Health bool
135+
DeviceVendor string
135136
}
136137

137138
type NodeInfo struct {

0 commit comments

Comments
 (0)