1111// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212// See the License for the specific language governing permissions and
1313// limitations under the License.
14- //
1514package brgpu
1615
1716import (
@@ -89,14 +88,14 @@ func (d DevicesInfoList) getResourceByCardId(cardId string) string {
8988func (bgm * brGPUManager ) runcManager (pulse int , mountAllDev bool , mountDriDevice bool ) {
9089 err := brml .Init ()
9190 if err != nil {
92- log .Error ( err )
91+ log .Errorf ( "brml init failed %v" , err )
9392 bgm .Stop <- true
9493 }
9594 defer brml .Shutdown ()
9695
97- info , err := deviceDiscover ()
96+ info , err := DeviceDiscover ()
9897 if err != nil {
99- log .Error ( err )
98+ log .Errorf ( "runc device discover failed: %v" , err )
10099 bgm .Stop <- true
101100 }
102101 l := Lister {
@@ -133,33 +132,37 @@ func (bgm *brGPUManager) runcManager(pulse int, mountAllDev bool, mountDriDevice
133132
134133 err = bgm .generateCdiConfigFile (RuntimeRunc )
135134 if err != nil {
136- log .Error ( err )
135+ log .Errorf ( "runc generate cdi config failed %v" , err )
137136 bgm .Stop <- true
138137 }
139138
140139 manager .Run ()
141140}
142141
143- func deviceDiscover () (DevicesInfoList , error ) {
142+ func DeviceDiscover () (DevicesInfoList , error ) {
144143 dis := DevicesInfoList {}
145144 physicalNum , err := brml .DeviceCount ()
146145 if err != nil {
146+ log .Errorf ("brml device count err: %v" , err )
147147 return nil , err
148148 }
149149
150150 for i := 0 ; i < physicalNum ; i ++ {
151- device , err := brml .HandleByNodeID (i )
151+ log .Infof ("discovering device node id %v/%v" , i , physicalNum )
152+ device , err := brml .HandleByIndex (i )
152153 if err != nil {
154+ log .Errorf ("brml HandleByIndex %v err: %v" , i , err )
153155 return nil , err
154156 }
155157 sviCount , err := brml .GetSviMode (device )
156158 if err != nil {
159+ log .Errorf ("brml GetSviMode %v err: %v" , device , err )
157160 return nil , err
158161 }
159162
160163 phyUUID , err := brml .DeviceUUID (device )
161-
162164 if err != nil {
165+ log .Errorf ("brml DeviceUUID %v err: %v" , device , err )
163166 return nil , err
164167 }
165168
@@ -169,11 +172,13 @@ func deviceDiscover() (DevicesInfoList, error) {
169172 case 0 , 1 :
170173 memInfo , err := brml .MemoryInfo (device )
171174 if err != nil {
175+ log .Errorf ("brml MemoryInfo %v err: %v" , device , err )
172176 return nil , err
173177 }
174178
175179 id , err := brml .GetGPUNodeIds (device )
176180 if err != nil {
181+ log .Errorf ("brml GetGPUNodeIds %v err: %v" , device , err )
177182 return nil , err
178183 }
179184
@@ -196,16 +201,19 @@ func deviceDiscover() (DevicesInfoList, error) {
196201 for j := 0 ; j < sviCount ; j ++ {
197202 ins , err := brml .GetGPUInstanceByID (device , uint32 (j ))
198203 if err != nil {
204+ log .Errorf ("brml GetGPUInstanceByID %v/%v err: %v" , device , j , err )
199205 return nil , err
200206 }
201207
202208 mem , err := brml .MemoryInfo (ins )
203209 if err != nil {
210+ log .Errorf ("brml MemoryInfo %v err: %v" , ins , err )
204211 return nil , err
205212 }
206213
207214 id , err := brml .GetGPUNodeIds (ins )
208215 if err != nil {
216+ log .Errorf ("brml GetGPUNodeIds %v err: %v" , ins , err )
209217 return nil , err
210218 }
211219
0 commit comments