Skip to content

Commit 2779ae4

Browse files
authored
feat(nvidia): simulate "get remapped rows" (optional) (#524)
> {"level":"info","ts":"2025-03-13T06:37:35Z","caller":"nvml/nvml.go:39","msg":"injecting remapped rows pending","corrRows":0,"uncRows":10,"isPending":true,"failureOccurred":false} > ... > {"level":"warn","ts":"2025-03-13T06:38:09Z","caller":"remapped-rows/component_output.go:141","msg":"suggested actions","suggestedActions":"REBOOT_SYSTEM"} Signed-off-by: Gyuho Lee <[email protected]>
1 parent 4bcd9f5 commit 2779ae4

File tree

11 files changed

+460
-256
lines changed

11 files changed

+460
-256
lines changed

cmd/gpud/command/accelerator.go

+3-7
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,15 @@
11
package command
22

33
import (
4-
"context"
54
"fmt"
6-
"time"
7-
8-
"github.com/leptonai/gpud/pkg/accelerator"
95

106
"github.com/urfave/cli"
7+
8+
"github.com/leptonai/gpud/pkg/accelerator"
119
)
1210

1311
func cmdAccelerator(cliContext *cli.Context) error {
14-
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
15-
defer cancel()
16-
acceleratorType, productName, err := accelerator.DetectTypeAndProductName(ctx)
12+
acceleratorType, productName, err := accelerator.DetectTypeAndProductName()
1713
if err != nil {
1814
return err
1915
}

cmd/gpud/command/join.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ func cmdJoin(cliContext *cli.Context) (retErr error) {
5454
return fmt.Errorf("error parsing cpu: %w", err)
5555
}
5656

57-
_, productName, err := accelerator.DetectTypeAndProductName(rootCtx)
57+
_, productName, err := accelerator.DetectTypeAndProductName()
5858
if err != nil {
5959
return err
6060
}

pkg/accelerator/accelerator.go

+3-5
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,8 @@
11
package accelerator
22

33
import (
4-
"context"
5-
64
"github.com/leptonai/gpud/pkg/file"
7-
nvidia_query "github.com/leptonai/gpud/pkg/nvidia-query"
5+
"github.com/leptonai/gpud/pkg/nvidia-query/nvml"
86
)
97

108
type Type string
@@ -15,9 +13,9 @@ const (
1513
)
1614

1715
// DetectTypeAndProductName returns the GPU type (e.g., "NVIDIA") and product name (e.g., "A100").
18-
func DetectTypeAndProductName(ctx context.Context) (Type, string, error) {
16+
func DetectTypeAndProductName() (Type, string, error) {
1917
if _, err := file.LocateExecutable("nvidia-smi"); err == nil {
20-
productName, err := nvidia_query.LoadGPUDeviceName(ctx)
18+
productName, err := nvml.LoadGPUDeviceName()
2119
if err != nil {
2220
return TypeNVIDIA, "unknown", err
2321
}

pkg/nvidia-query/detect.go

+2-41
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,9 @@ import (
55
"fmt"
66
"strings"
77

8-
"github.com/NVIDIA/go-nvlib/pkg/nvlib/device"
9-
"github.com/NVIDIA/go-nvml/pkg/nvml"
10-
118
"github.com/leptonai/gpud/pkg/file"
129
"github.com/leptonai/gpud/pkg/log"
13-
nvmlquery "github.com/leptonai/gpud/pkg/nvidia-query/nvml"
10+
"github.com/leptonai/gpud/pkg/nvidia-query/nvml"
1411
"github.com/leptonai/gpud/pkg/process"
1512
)
1613

@@ -29,7 +26,7 @@ func GPUsInstalled(ctx context.Context) (bool, error) {
2926

3027
// now that we have the NVIDIA PCI devices,
3128
// call NVML C-based API for NVML API
32-
gpuDeviceName, err := LoadGPUDeviceName(ctx)
29+
gpuDeviceName, err := nvml.LoadGPUDeviceName()
3330
if err != nil {
3431
if IsErrDeviceHandleUnknownError(err) {
3532
log.Logger.Warnw("nvidia device handler failed for unknown error -- likely GPU has fallen off the bus or other Xid error", "error", err)
@@ -42,42 +39,6 @@ func GPUsInstalled(ctx context.Context) (bool, error) {
4239
return true, nil
4340
}
4441

45-
// Loads the product name of the NVIDIA GPU device.
46-
func LoadGPUDeviceName(ctx context.Context) (string, error) {
47-
nvmlLib := nvmlquery.NewNVML()
48-
if ret := nvmlLib.Init(); ret != nvml.SUCCESS {
49-
return "", fmt.Errorf("failed to initialize NVML: %v", nvml.ErrorString(ret))
50-
}
51-
52-
deviceLib := device.New(nvmlLib)
53-
54-
// do not check nvml lib if it is mocked
55-
infoLib := nvmlquery.NewNVInfo(nvmlLib, deviceLib)
56-
nvmlExists, nvmlExistsMsg := infoLib.HasNvml()
57-
if !nvmlExists {
58-
return "", fmt.Errorf("NVML not found: %s", nvmlExistsMsg)
59-
}
60-
61-
// "NVIDIA Xid 79: GPU has fallen off the bus" may fail this syscall with:
62-
// "error getting device handle for index '6': Unknown Error"
63-
devices, err := deviceLib.GetDevices()
64-
if err != nil {
65-
return "", err
66-
}
67-
68-
for _, d := range devices {
69-
name, ret := d.GetName()
70-
if ret != nvml.SUCCESS {
71-
return "", fmt.Errorf("failed to get device name: %v", nvml.ErrorString(ret))
72-
}
73-
if name != "" {
74-
return name, nil
75-
}
76-
}
77-
78-
return "", nil
79-
}
80-
8142
// Lists all PCI devices that are compatible with NVIDIA.
8243
func ListNVIDIAPCIs(ctx context.Context) ([]string, error) {
8344
lspciPath, err := file.LocateExecutable("lspci")

pkg/nvidia-query/nvml/clock_events.go

+2-4
Original file line numberDiff line numberDiff line change
@@ -22,16 +22,14 @@ import (
2222
// ref. undefined symbol: nvmlDeviceGetCurrentClocksEventReasons for older nvidia drivers
2323
func ClockEventsSupported() (bool, error) {
2424
nvmlLib := NewNVML()
25-
if ret := nvmlLib.Init(); ret != nvml.SUCCESS {
25+
if ret := nvmlLib.NVML().Init(); ret != nvml.SUCCESS {
2626
return false, fmt.Errorf("failed to initialize NVML: %v", nvml.ErrorString(ret))
2727
}
2828
log.Logger.Debugw("successfully initialized NVML")
2929

30-
deviceLib := device.New(nvmlLib)
31-
3230
// "NVIDIA Xid 79: GPU has fallen off the bus" may fail this syscall with:
3331
// "error getting device handle for index '6': Unknown Error"
34-
devices, err := deviceLib.GetDevices()
32+
devices, err := nvmlLib.Device().GetDevices()
3533
if err != nil {
3634
return false, err
3735
}

pkg/nvidia-query/nvml/gpm.go

+2-4
Original file line numberDiff line numberDiff line change
@@ -19,16 +19,14 @@ import (
1919
// Returns false if any device does not support GPM.
2020
func GPMSupported() (bool, error) {
2121
nvmlLib := NewNVML()
22-
if ret := nvmlLib.Init(); ret != nvml.SUCCESS {
22+
if ret := nvmlLib.NVML().Init(); ret != nvml.SUCCESS {
2323
return false, fmt.Errorf("failed to initialize NVML: %v", nvml.ErrorString(ret))
2424
}
2525
log.Logger.Debugw("successfully initialized NVML")
2626

27-
deviceLib := device.New(nvmlLib)
28-
2927
// "NVIDIA Xid 79: GPU has fallen off the bus" may fail this syscall with:
3028
// "error getting device handle for index '6': Unknown Error"
31-
devices, err := deviceLib.GetDevices()
29+
devices, err := nvmlLib.Device().GetDevices()
3230
if err != nil {
3331
return false, err
3432
}

pkg/nvidia-query/nvml/lib/lib.go

+130
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
// Package lib implements the NVIDIA Management Library (NVML) interface.
2+
// See https://docs.nvidia.com/deploy/nvml-api/nvml-api-reference.html#nvml-api-reference for more details.
3+
package lib
4+
5+
import (
6+
"github.com/NVIDIA/go-nvlib/pkg/nvlib/device"
7+
nvinfo "github.com/NVIDIA/go-nvlib/pkg/nvlib/info"
8+
"github.com/NVIDIA/go-nvml/pkg/nvml"
9+
)
10+
11+
type Library interface {
12+
NVML() nvml.Interface
13+
Device() device.Interface
14+
Info() nvinfo.Interface
15+
Shutdown() nvml.Return
16+
}
17+
18+
var _ Library = &nvmlInterface{}
19+
var _ nvml.Interface = &nvmlInterface{}
20+
21+
type nvmlInterface struct {
22+
nvml.Interface
23+
24+
dev *devInterface
25+
info nvinfo.Interface
26+
27+
initReturn *nvml.Return
28+
propertyExtractor nvinfo.PropertyExtractor
29+
}
30+
31+
func (n *nvmlInterface) NVML() nvml.Interface {
32+
return n
33+
}
34+
35+
func (n *nvmlInterface) Device() device.Interface {
36+
return n.dev
37+
}
38+
39+
func (n *nvmlInterface) Info() nvinfo.Interface {
40+
return n.info
41+
}
42+
43+
func (n *nvmlInterface) Shutdown() nvml.Return {
44+
return n.Interface.Shutdown()
45+
}
46+
47+
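// New creates a new Library that wraps the NVML library (defaulting to nvml.New() when none is given via options) together with its device and info libraries, applying any overrides specified in opts. It always returns a non-nil Library.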
48+
func New(opts ...OpOption) Library {
49+
options := &Op{}
50+
options.applyOpts(opts)
51+
52+
nvInterface := &nvmlInterface{
53+
Interface: options.nvmlLib,
54+
55+
initReturn: options.initReturn,
56+
propertyExtractor: options.propertyExtractor,
57+
}
58+
59+
devLib := device.New(nvInterface.Interface)
60+
nvInterface.dev = &devInterface{
61+
Interface: devLib,
62+
devices: options.devicesToReturn,
63+
getRemappedRows: options.devGetRemappedRows,
64+
}
65+
66+
infoOpts := []nvinfo.Option{
67+
nvinfo.WithNvmlLib(nvInterface),
68+
nvinfo.WithDeviceLib(nvInterface.dev),
69+
}
70+
if nvInterface.propertyExtractor != nil {
71+
infoOpts = append(infoOpts, nvinfo.WithPropertyExtractor(nvInterface.propertyExtractor))
72+
}
73+
nvInterface.info = nvinfo.New(infoOpts...)
74+
75+
return nvInterface
76+
}
77+
78+
func (n *nvmlInterface) Init() nvml.Return {
79+
if n.initReturn != nil {
80+
return *n.initReturn
81+
}
82+
return n.Interface.Init()
83+
}
84+
85+
var _ device.Interface = &devInterface{}
86+
87+
type devInterface struct {
88+
device.Interface
89+
devices []device.Device
90+
getRemappedRows func() (int, int, bool, bool, nvml.Return)
91+
}
92+
93+
func (d *devInterface) GetDevices() ([]device.Device, error) {
94+
devs := d.devices
95+
96+
var err error
97+
if len(devs) == 0 {
98+
devs, err = d.Interface.GetDevices()
99+
}
100+
101+
if err != nil {
102+
return nil, err
103+
}
104+
105+
updated := make([]device.Device, len(devs))
106+
for i, dev := range devs {
107+
updated[i] = &devDevInterface{
108+
Device: dev,
109+
getRemappedRows: d.getRemappedRows,
110+
}
111+
}
112+
113+
return updated, nil
114+
}
115+
116+
var _ device.Device = &devDevInterface{}
117+
118+
type devDevInterface struct {
119+
device.Device
120+
getRemappedRows func() (int, int, bool, bool, nvml.Return)
121+
}
122+
123+
func (d *devDevInterface) GetRemappedRows() (int, int, bool, bool, nvml.Return) {
124+
// no injected remapped rows
125+
// thus just passthrough to call the underlying device.Device.GetRemappedRows()
126+
if d.getRemappedRows == nil {
127+
return d.Device.GetRemappedRows()
128+
}
129+
return d.getRemappedRows()
130+
}

pkg/nvidia-query/nvml/lib/lib_test.go

+15
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
package lib
2+
3+
import (
4+
"testing"
5+
6+
"github.com/NVIDIA/go-nvml/pkg/nvml"
7+
"github.com/stretchr/testify/assert"
8+
)
9+
10+
func TestLibrary(t *testing.T) {
11+
nv := New(
12+
WithInitReturn(nvml.SUCCESS),
13+
)
14+
assert.Equal(t, nv.NVML().Init(), nvml.SUCCESS)
15+
}

pkg/nvidia-query/nvml/lib/options.go

+67
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
package lib
2+
3+
import (
4+
"github.com/NVIDIA/go-nvlib/pkg/nvlib/device"
5+
nvinfo "github.com/NVIDIA/go-nvlib/pkg/nvlib/info"
6+
"github.com/NVIDIA/go-nvml/pkg/nvml"
7+
)
8+
9+
type Op struct {
10+
nvmlLib nvml.Interface
11+
12+
initReturn *nvml.Return
13+
propertyExtractor nvinfo.PropertyExtractor
14+
devicesToReturn []device.Device
15+
16+
// ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g055e7c34f7f15b6ae9aac1dabd60870d
17+
devGetRemappedRows func() (corrRows int, uncRows int, isPending bool, failureOccurred bool, ret nvml.Return)
18+
}
19+
20+
type OpOption func(*Op)
21+
22+
func (op *Op) applyOpts(opts []OpOption) {
23+
for _, opt := range opts {
24+
opt(op)
25+
}
26+
if op.nvmlLib == nil {
27+
op.nvmlLib = nvml.New()
28+
}
29+
}
30+
31+
// Specifies the NVML library instance.
32+
// Otherwise, defaults to the NVML library instance returned by nvml.New().
33+
func WithNVML(nvmlLib nvml.Interface) OpOption {
34+
return func(op *Op) {
35+
op.nvmlLib = nvmlLib
36+
}
37+
}
38+
39+
// Specifies the return value of the NVML library's Init() function.
40+
// Otherwise, defaults to the return value of the NVML library's Init() function.
41+
func WithInitReturn(initReturn nvml.Return) OpOption {
42+
return func(op *Op) {
43+
op.initReturn = &initReturn
44+
}
45+
}
46+
47+
// Specifies the property extractor for the NVML library.
48+
func WithPropertyExtractor(propertyExtractor nvinfo.PropertyExtractor) OpOption {
49+
return func(op *Op) {
50+
op.propertyExtractor = propertyExtractor
51+
}
52+
}
53+
54+
func WithDevice(dev device.Device) OpOption {
55+
return func(op *Op) {
56+
op.devicesToReturn = append(op.devicesToReturn, dev)
57+
}
58+
}
59+
60+
// Specifies the function to get the remapped rows of the device.
61+
// Otherwise, defaults to calling the underlying device's GetRemappedRows().
62+
// ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g055e7c34f7f15b6ae9aac1dabd60870d
63+
func WithDeviceGetRemappedRows(f func() (corrRows int, uncRows int, isPending bool, failureOccurred bool, ret nvml.Return)) OpOption {
64+
return func(op *Op) {
65+
op.devGetRemappedRows = f
66+
}
67+
}

0 commit comments

Comments
 (0)