Skip to content

Commit 542bf47

Browse files
BRWindfarer
authored and committed
fix: svi mode HandleByIndex
1 parent 3bb50dc commit 542bf47

File tree

9 files changed

+105
-26
lines changed

9 files changed

+105
-26
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
k8s-device*
2+
*.swp
3+
device-plugin.tar

Makefile

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
REPO ?= ghcr.io/BirenTechnology
22
PROJECT ?= k8s-device-plugin
3-
BUILD_ENV?=GOPROXY=direct
3+
BUILD_ENV?=
44
tag=$(shell git describe --abbrev=0 --tags)
55
VERSION=$(shell git describe --tags --always)
66

@@ -16,6 +16,7 @@ push:
1616

1717
build:
1818
${BUILD_ENV} GOOS=linux GOARCH=amd64 CGO_ENABLED=1 go build -ldflags="-X 'main.Version=$(VERSION)' -X 'main.Time=$(shell LC_TIME=en_US.UTF-8 date)' -X 'main.Commit=$(shell git rev-parse --short HEAD)'" -o k8s-device-plugin cmd/manager.go
19+
${BUILD_ENV} GOOS=linux GOARCH=amd64 CGO_ENABLED=1 go build -ldflags="-X 'main.Version=$(VERSION)' -X 'main.Time=$(shell LC_TIME=en_US.UTF-8 date)' -X 'main.Commit=$(shell git rev-parse --short HEAD)'" -o k8s-device-topo debug/topo/topo.go
1920

2021
build-arm:
2122
${BUILD_ENV} GOOS=linux GOARCH=arm64 CGO_ENABLED=1 go build -ldflags="-X 'main.Version=$(VERSION)' -X 'main.Time=$(shell LC_TIME=en_US.UTF-8 date)' -X 'main.Commit=$(shell git rev-parse --short HEAD)'" -o k8s-device-plugin cmd/manager.go

debug/topo/topo.go

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
package main
2+
3+
import (
4+
"fmt"
5+
"os"
6+
"os/exec"
7+
"strings"
8+
9+
"github.com/BirenTechnology/go-brml/brml"
10+
"github.com/BirenTechnology/k8s-device-plugin/pkg/brgpu"
11+
log "github.com/sirupsen/logrus"
12+
)
13+
14+
func main() {
15+
err := brml.Init()
16+
if err != nil {
17+
log.Errorf("brml init failed %v", err)
18+
}
19+
defer brml.Shutdown()
20+
21+
cardsFiles, err := os.ReadDir("/dev/biren")
22+
if err != nil {
23+
log.Errorf("read dir /dev/biren failed %v", err)
24+
panic(err)
25+
}
26+
cards := []string{}
27+
for _, f := range cardsFiles {
28+
if strings.Contains(f.Name(), "card_") {
29+
cards = append(cards, f.Name())
30+
}
31+
}
32+
devices, err := brgpu.DeviceDiscover()
33+
if err != nil {
34+
log.Errorf("discover devices failed %v", err)
35+
panic(err)
36+
}
37+
log.Info("discover devices:")
38+
for _, d := range devices {
39+
fmt.Println(d.PhysicalNum, d.Instances)
40+
}
41+
42+
_, err = brgpu.Device2Graph(cards)
43+
if err != nil {
44+
log.Errorf("device %v to graph failed %v", cards, err)
45+
panic(err)
46+
}
47+
48+
log.Info("/dev/biren/card_x -> gpu hw:")
49+
for _, c := range cards {
50+
gpu_id, err := os.ReadFile(fmt.Sprintf("/sys/class/biren/%s/device/physical_id", c))
51+
if err != nil {
52+
log.Errorf("read sys/class/biren/%s/device/physical_id failed %v", c, err)
53+
} else {
54+
fmt.Printf("%s -> %v", c, string(gpu_id))
55+
}
56+
}
57+
58+
log.Info("brsmi gpu list:")
59+
cmd := exec.Command("brsmi", "gpu", "list")
60+
cmd.Stdout = os.Stdout
61+
if err := cmd.Run(); err != nil {
62+
log.Errorf("exec `brsmi gpu list` failed %v", err)
63+
}
64+
65+
log.Info("brsmi gpu topo:")
66+
cmd = exec.Command("brsmi", "topo", "-m")
67+
cmd.Stdout = os.Stdout
68+
if err := cmd.Run(); err != nil {
69+
log.Errorf("exec `brsmi gpu list` failed %v", err)
70+
}
71+
}

pkg/brgpu/allocator.go

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212
// See the License for the specific language governing permissions and
1313
// limitations under the License.
14-
//
1514
package brgpu
1615

1716
import (

pkg/brgpu/cdi.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ func cdiSPec(runtime ContainerRuntime) ([]*cdi.Spec, error) {
4949
}
5050

5151
func runcCDI() ([]*cdi.Spec, error) {
52-
info, err := deviceDiscover()
52+
info, err := DeviceDiscover()
5353
if err != nil {
5454
log.Errorf("deviceDiscover error: %v", err)
5555
return nil, err
@@ -182,7 +182,7 @@ func generateConfigCdiFile(runtime ContainerRuntime) error {
182182
}
183183
exists, err := PathExists(cdiConfigPath)
184184
if err != nil {
185-
log.Error(err)
185+
log.Errorf("cdiconfig path %v", err)
186186
return err
187187
}
188188
// 如果文件存在并且不需要覆盖写 直接返回

pkg/brgpu/kata.go

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,16 @@
11
// Copyright 2024 Shanghai Biren Technology Co., Ltd.
2-
//
2+
//
33
// Licensed under the Apache License, Version 2.0 (the "License");
44
// you may not use this file except in compliance with the License.
55
// You may obtain a copy of the License at
6-
//
7-
// http://www.apache.org/licenses/LICENSE-2.0
8-
//
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
99
// Unless required by applicable law or agreed to in writing, software
1010
// distributed under the License is distributed on an "AS IS" BASIS,
1111
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212
// See the License for the specific language governing permissions and
1313
// limitations under the License.
14-
//
1514
package brgpu
1615

1716
import (
@@ -99,7 +98,7 @@ func (p PFDeviceInfoList) getResourceByCardId(cardId string) string {
9998
func (bgm *brGPUManager) kataManager() {
10099
info, err := vfDeviceDiscover()
101100
if err != nil {
102-
log.Error(err)
101+
log.Errorf("kata device discover failed %v", err)
103102
bgm.Stop <- true
104103
}
105104
l := Lister{
@@ -115,7 +114,7 @@ func (bgm *brGPUManager) kataManager() {
115114

116115
err = bgm.generateCdiConfigFile(RuntimeKata)
117116
if err != nil {
118-
log.Error(err)
117+
log.Errorf("kata generate cdi config failed %v", err)
119118
bgm.Stop <- true
120119
}
121120

pkg/brgpu/manager.go

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,16 @@
11
// Copyright 2024 Shanghai Biren Technology Co., Ltd.
2-
//
2+
//
33
// Licensed under the Apache License, Version 2.0 (the "License");
44
// you may not use this file except in compliance with the License.
55
// You may obtain a copy of the License at
6-
//
7-
// http://www.apache.org/licenses/LICENSE-2.0
8-
//
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
99
// Unless required by applicable law or agreed to in writing, software
1010
// distributed under the License is distributed on an "AS IS" BASIS,
1111
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212
// See the License for the specific language governing permissions and
1313
// limitations under the License.
14-
//
1514
package brgpu
1615

1716
import (

pkg/brgpu/plugin.go

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212
// See the License for the specific language governing permissions and
1313
// limitations under the License.
14-
//
1514
package brgpu
1615

1716
import (
@@ -192,7 +191,7 @@ func (p *Plugin) ListAndWatch(e *pluginapi.Empty, s pluginapi.DevicePlugin_ListA
192191
}
193192
tg, err := Device2Graph(devIDs)
194193
if err != nil {
195-
log.Errorf("Generate gpu topo error,%v", err)
194+
log.Errorf("Generate gpu %v topo error %v", devIDs, err)
196195
}
197196
p.TopoGraph = tg
198197
}

pkg/brgpu/runc.go

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212
// See the License for the specific language governing permissions and
1313
// limitations under the License.
14-
//
1514
package brgpu
1615

1716
import (
@@ -89,14 +88,14 @@ func (d DevicesInfoList) getResourceByCardId(cardId string) string {
8988
func (bgm *brGPUManager) runcManager(pulse int, mountAllDev bool, mountDriDevice bool) {
9089
err := brml.Init()
9190
if err != nil {
92-
log.Error(err)
91+
log.Errorf("brml init failed %v", err)
9392
bgm.Stop <- true
9493
}
9594
defer brml.Shutdown()
9695

97-
info, err := deviceDiscover()
96+
info, err := DeviceDiscover()
9897
if err != nil {
99-
log.Error(err)
98+
log.Errorf("runc device discover failed: %v", err)
10099
bgm.Stop <- true
101100
}
102101
l := Lister{
@@ -133,33 +132,37 @@ func (bgm *brGPUManager) runcManager(pulse int, mountAllDev bool, mountDriDevice
133132

134133
err = bgm.generateCdiConfigFile(RuntimeRunc)
135134
if err != nil {
136-
log.Error(err)
135+
log.Errorf("runc generate cdi config failed %v", err)
137136
bgm.Stop <- true
138137
}
139138

140139
manager.Run()
141140
}
142141

143-
func deviceDiscover() (DevicesInfoList, error) {
142+
func DeviceDiscover() (DevicesInfoList, error) {
144143
dis := DevicesInfoList{}
145144
physicalNum, err := brml.DeviceCount()
146145
if err != nil {
146+
log.Errorf("brml device count err: %v", err)
147147
return nil, err
148148
}
149149

150150
for i := 0; i < physicalNum; i++ {
151-
device, err := brml.HandleByNodeID(i)
151+
log.Infof("discovering device node id %v/%v", i, physicalNum)
152+
device, err := brml.HandleByIndex(i)
152153
if err != nil {
154+
log.Errorf("brml HandleByIndex %v err: %v", i, err)
153155
return nil, err
154156
}
155157
sviCount, err := brml.GetSviMode(device)
156158
if err != nil {
159+
log.Errorf("brml GetSviMode %v err: %v", device, err)
157160
return nil, err
158161
}
159162

160163
phyUUID, err := brml.DeviceUUID(device)
161-
162164
if err != nil {
165+
log.Errorf("brml DeviceUUID %v err: %v", device, err)
163166
return nil, err
164167
}
165168

@@ -169,11 +172,13 @@ func deviceDiscover() (DevicesInfoList, error) {
169172
case 0, 1:
170173
memInfo, err := brml.MemoryInfo(device)
171174
if err != nil {
175+
log.Errorf("brml MemoryInfo %v err: %v", device, err)
172176
return nil, err
173177
}
174178

175179
id, err := brml.GetGPUNodeIds(device)
176180
if err != nil {
181+
log.Errorf("brml GetGPUNodeIds %v err: %v", device, err)
177182
return nil, err
178183
}
179184

@@ -196,16 +201,19 @@ func deviceDiscover() (DevicesInfoList, error) {
196201
for j := 0; j < sviCount; j++ {
197202
ins, err := brml.GetGPUInstanceByID(device, uint32(j))
198203
if err != nil {
204+
log.Errorf("brml GetGPUInstanceByID %v/%v err: %v", device, j, err)
199205
return nil, err
200206
}
201207

202208
mem, err := brml.MemoryInfo(ins)
203209
if err != nil {
210+
log.Errorf("brml MemoryInfo %v err: %v", ins, err)
204211
return nil, err
205212
}
206213

207214
id, err := brml.GetGPUNodeIds(ins)
208215
if err != nil {
216+
log.Errorf("brml GetGPUNodeIds %v err: %v", ins, err)
209217
return nil, err
210218
}
211219

0 commit comments

Comments
 (0)