Skip to content

Commit e84ae4e

Browse files
gyuho and eahydra authored
fix(nvml): fix hw slowdown events db writes, support simulate hw slowdown flag (#526)
```go const ( EnvMockAllSuccess = "GPUD_NVML_MOCK_ALL_SUCCESS" EnvInjectRemapedRowsPending = "GPUD_NVML_INJECT_REMAPPED_ROWS_PENDING" EnvInjectClockEventsHwSlowdown = "GPUD_NVML_INJECT_CLOCK_EVENTS_HW_SLOWDOWN" ) ``` Tested ``` "component": "accelerator-nvidia-hw-slowdown", "startTime": "2024-11-15T04:48:56Z", "endTime": "2025-03-13T11:34:36.809052648Z", "events": [ { "time": "2025-03-13T11:34:00Z", "name": "hw_slowdown", "type": "Warning", "message": "GPU-49004c5e-a258-143f-1c5c-319f2db1a1f1: HW Power Brake Slowdown (reducing the core clocks by a factor of 2 or more) is engaged (External Power Brake Assertion being triggered) ('HW Power Brake Slowdown' in nvidia-smi --query) (nvml), GPU-49004c5e-a258-143f-1c5c-319f2db1a1f1: HW Slowdown is engaged due to high temperature, power brake assertion, or high power draw ('HW Slowdown: Active' in nvidia-smi --query) (nvml), GPU-49004c5e-a258-143f-1c5c-319f2db1a1f1: HW Thermal Slowdown (reducing the core clocks by a factor of 2 or more) is engaged (temperature being too high) ('HW Thermal Slowdown' in nvidia-smi --query) (nvml)", "extra_info": { "data_source": "nvml", "gpu_uuid": "GPU-49004c5e-a258-143f-1c5c-319f2db1a1f1" } }, { "time": "2025-03-13T11:33:00Z", "name": "hw_slowdown", "type": "Warning", "message": "GPU-49004c5e-a258-143f-1c5c-319f2db1a1f1: HW Power Brake Slowdown (reducing the core clocks by a factor of 2 or more) is engaged (External Power Brake Assertion being triggered) ('HW Power Brake Slowdown' in nvidia-smi --query) (nvml), GPU-49004c5e-a258-143f-1c5c-319f2db1a1f1: HW Slowdown is engaged due to high temperature, power brake assertion, or high power draw ('HW Slowdown: Active' in nvidia-smi --query) (nvml), GPU-49004c5e-a258-143f-1c5c-319f2db1a1f1: HW Thermal Slowdown (reducing the core clocks by a factor of 2 or more) is engaged (temperature being too high) ('HW Thermal Slowdown' in nvidia-smi --query) (nvml)", "extra_info": { "data_source": "nvml", "gpu_uuid": "GPU-49004c5e-a258-143f-1c5c-319f2db1a1f1" 
} } ] }, ``` --------- Signed-off-by: Gyuho Lee <[email protected]> Co-authored-by: Joseph <[email protected]>
1 parent 2779ae4 commit e84ae4e

14 files changed

+436
-78
lines changed

e2e/e2e_test.go

+3-3
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,8 @@ import (
2222
v1 "github.com/leptonai/gpud/api/v1"
2323
client_v1 "github.com/leptonai/gpud/client/v1"
2424
mocklspci "github.com/leptonai/gpud/e2e/mock/lspci"
25-
mocknvml "github.com/leptonai/gpud/e2e/mock/nvml"
2625
"github.com/leptonai/gpud/pkg/errdefs"
26+
nvml_lib "github.com/leptonai/gpud/pkg/nvidia-query/nvml/lib"
2727
"github.com/leptonai/gpud/pkg/server"
2828
)
2929

@@ -53,8 +53,8 @@ var _ = Describe("[GPUD E2E]", Ordered, func() {
5353
gCtx, gCancel := context.WithTimeout(context.Background(), time.Minute*8)
5454

5555
BeforeAll(func() {
56-
err = os.Setenv(mocknvml.EnvNVMLMock, "true")
57-
Expect(err).NotTo(HaveOccurred(), "failed to set GPUD_MOCK_NVML")
56+
err = os.Setenv(nvml_lib.EnvMockAllSuccess, "true")
57+
Expect(err).NotTo(HaveOccurred(), "failed to set "+nvml_lib.EnvMockAllSuccess)
5858

5959
By("mock lspci")
6060
err = mocklspci.Mock(mocklspci.NormalOutput)

e2e/mock/nvml/nvml_mock.go

-11
This file was deleted.

pkg/eventstore/database.go

+1
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,7 @@ func (t *table) Insert(ctx context.Context, ev components.Event) error {
161161
return insertEvent(ctx, t.dbRW, t.table, ev)
162162
}
163163

164+
// Find returns nil if the event is not found.
164165
func (t *table) Find(ctx context.Context, ev components.Event) (*components.Event, error) {
165166
return findEvent(ctx, t.dbRO, t.table, ev)
166167
}

pkg/eventstore/types.go

+1
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ type Store interface {
1818
type Bucket interface {
1919
Name() string
2020
Insert(ctx context.Context, ev components.Event) error
21+
// Find returns nil if the event is not found.
2122
Find(ctx context.Context, ev components.Event) (*components.Event, error)
2223
// Get queries the event in the descending order of timestamp (latest event first).
2324
Get(ctx context.Context, since time.Time) ([]components.Event, error)

pkg/nvidia-query/nvml/clock_events.go

+2-1
Original file line numberDiff line numberDiff line change
@@ -15,13 +15,14 @@ import (
1515
"github.com/leptonai/gpud/components"
1616
"github.com/leptonai/gpud/pkg/common"
1717
"github.com/leptonai/gpud/pkg/log"
18+
nvml_lib "github.com/leptonai/gpud/pkg/nvidia-query/nvml/lib"
1819
)
1920

2021
// Returns true if clock events is supported by all devices.
2122
// Returns false if any device does not support clock events.
2223
// ref. undefined symbol: nvmlDeviceGetCurrentClocksEventReasons for older nvidia drivers
2324
func ClockEventsSupported() (bool, error) {
24-
nvmlLib := NewNVML()
25+
nvmlLib := nvml_lib.NewDefault()
2526
if ret := nvmlLib.NVML().Init(); ret != nvml.SUCCESS {
2627
return false, fmt.Errorf("failed to initialize NVML: %v", nvml.ErrorString(ret))
2728
}

pkg/nvidia-query/nvml/clock_events_test.go

+17-19
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,9 @@ import (
1414
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1515

1616
"github.com/leptonai/gpud/components"
17-
mocknvml "github.com/leptonai/gpud/e2e/mock/nvml"
1817
"github.com/leptonai/gpud/pkg/common"
18+
nvml_lib "github.com/leptonai/gpud/pkg/nvidia-query/nvml/lib"
19+
nvml_lib_mock "github.com/leptonai/gpud/pkg/nvidia-query/nvml/lib/mock"
1920
"github.com/leptonai/gpud/pkg/nvidia-query/nvml/testutil"
2021
)
2122

@@ -392,17 +393,16 @@ func TestClockEventsSupported(t *testing.T) {
392393
},
393394
}
394395

395-
// Set up mock NVML environment
396-
err := os.Setenv(mocknvml.EnvNVMLMock, "true")
396+
err := os.Setenv(nvml_lib.EnvMockAllSuccess, "true")
397397
if err != nil {
398398
t.Fatalf("failed to set mock NVML environment: %v", err)
399399
}
400-
defer os.Unsetenv(mocknvml.EnvNVMLMock)
400+
defer os.Unsetenv(nvml_lib.EnvMockAllSuccess)
401401

402402
// Replace the mock instance
403-
originalMockInstance := mocknvml.MockInstance
404-
mocknvml.MockInstance = mockNVML
405-
defer func() { mocknvml.MockInstance = originalMockInstance }()
403+
originalMockInstance := nvml_lib_mock.AllSuccessInterface
404+
nvml_lib_mock.AllSuccessInterface = mockNVML
405+
defer func() { nvml_lib_mock.AllSuccessInterface = originalMockInstance }()
406406

407407
result, err := ClockEventsSupported()
408408
if tt.expectError {
@@ -663,12 +663,11 @@ func TestClockEventsWithNilPointer(t *testing.T) {
663663
func TestClockEventsSupportedWithMockedNVML(t *testing.T) {
664664
// Test with initialization failure
665665
t.Run("nvml initialization failure", func(t *testing.T) {
666-
// Set up mock NVML environment
667-
err := os.Setenv(mocknvml.EnvNVMLMock, "true")
666+
err := os.Setenv(nvml_lib.EnvMockAllSuccess, "true")
668667
if err != nil {
669668
t.Fatalf("failed to set mock NVML environment: %v", err)
670669
}
671-
defer os.Unsetenv(mocknvml.EnvNVMLMock)
670+
defer os.Unsetenv(nvml_lib.EnvMockAllSuccess)
672671

673672
// Mock NVML with init failure
674673
mockNVML := &mock.Interface{
@@ -678,9 +677,9 @@ func TestClockEventsSupportedWithMockedNVML(t *testing.T) {
678677
}
679678

680679
// Replace the mock instance
681-
originalMockInstance := mocknvml.MockInstance
682-
mocknvml.MockInstance = mockNVML
683-
defer func() { mocknvml.MockInstance = originalMockInstance }()
680+
originalMockInstance := nvml_lib_mock.AllSuccessInterface
681+
nvml_lib_mock.AllSuccessInterface = mockNVML
682+
defer func() { nvml_lib_mock.AllSuccessInterface = originalMockInstance }()
684683

685684
// Call function
686685
result, err := ClockEventsSupported()
@@ -691,12 +690,11 @@ func TestClockEventsSupportedWithMockedNVML(t *testing.T) {
691690

692691
// Test with device initialization but GetDevices failure
693692
t.Run("device get failure", func(t *testing.T) {
694-
// Set up mock NVML environment
695-
err := os.Setenv(mocknvml.EnvNVMLMock, "true")
693+
err := os.Setenv(nvml_lib.EnvMockAllSuccess, "true")
696694
if err != nil {
697695
t.Fatalf("failed to set mock NVML environment: %v", err)
698696
}
699-
defer os.Unsetenv(mocknvml.EnvNVMLMock)
697+
defer os.Unsetenv(nvml_lib.EnvMockAllSuccess)
700698

701699
// Mock NVML with device get failure
702700
mockNVML := &mock.Interface{
@@ -709,9 +707,9 @@ func TestClockEventsSupportedWithMockedNVML(t *testing.T) {
709707
}
710708

711709
// Replace the mock instance
712-
originalMockInstance := mocknvml.MockInstance
713-
mocknvml.MockInstance = mockNVML
714-
defer func() { mocknvml.MockInstance = originalMockInstance }()
710+
originalMockInstance := nvml_lib_mock.AllSuccessInterface
711+
nvml_lib_mock.AllSuccessInterface = mockNVML
712+
defer func() { nvml_lib_mock.AllSuccessInterface = originalMockInstance }()
715713

716714
// Call function
717715
result, err := ClockEventsSupported()

pkg/nvidia-query/nvml/gpm.go

+2-1
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import (
88

99
"github.com/leptonai/gpud/pkg/log"
1010
metrics_gpm "github.com/leptonai/gpud/pkg/nvidia-query/metrics/gpm"
11+
nvml_lib "github.com/leptonai/gpud/pkg/nvidia-query/nvml/lib"
1112

1213
"github.com/NVIDIA/go-nvlib/pkg/nvlib/device"
1314
"github.com/NVIDIA/go-nvml/pkg/nvml"
@@ -18,7 +19,7 @@ import (
1819
// Returns true if GPM is supported by all devices.
1920
// Returns false if any device does not support GPM.
2021
func GPMSupported() (bool, error) {
21-
nvmlLib := NewNVML()
22+
nvmlLib := nvml_lib.NewDefault()
2223
if ret := nvmlLib.NVML().Init(); ret != nvml.SUCCESS {
2324
return false, fmt.Errorf("failed to initialize NVML: %v", nvml.ErrorString(ret))
2425
}

pkg/nvidia-query/nvml/lib/default.go

+58
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
package lib
2+
3+
import (
4+
"os"
5+
6+
"github.com/NVIDIA/go-nvml/pkg/nvml"
7+
nvml_lib_mock "github.com/leptonai/gpud/pkg/nvidia-query/nvml/lib/mock"
8+
9+
"github.com/leptonai/gpud/pkg/log"
10+
)
11+
12+
// Environment variables read by NewDefault. Each one takes effect only when
// its value is exactly "true".
const (
	// EnvMockAllSuccess makes NewDefault use the all-success mock NVML
	// interface and the mock property extractor instead of the real library.
	EnvMockAllSuccess = "GPUD_NVML_MOCK_ALL_SUCCESS"

	// EnvInjectRemapedRowsPending makes NewDefault inject a remapped-rows
	// result with a pending remapping for all devices.
	// NOTE: the identifier spells "Remaped" while the value spells "REMAPPED";
	// the identifier is exported, so it is left as is for compatibility.
	EnvInjectRemapedRowsPending = "GPUD_NVML_INJECT_REMAPPED_ROWS_PENDING"

	// EnvInjectClockEventsHwSlowdown makes NewDefault inject
	// clockEventsToInjectHwSlowdown as the current clocks event reasons for
	// all devices.
	EnvInjectClockEventsHwSlowdown = "GPUD_NVML_INJECT_CLOCK_EVENTS_HW_SLOWDOWN"
)

// Clocks event reason bits.
// 0x0000000000000000 is none
// ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlClocksEventReasons.html
const (
	reasonHWSlowdown           uint64 = 0x0000000000000008
	reasonSwThermalSlowdown    uint64 = 0x0000000000000020
	reasonHWSlowdownThermal    uint64 = 0x0000000000000040
	reasonHWSlowdownPowerBrake uint64 = 0x0000000000000080
)

// clockEventsToInjectHwSlowdown is the reasons bitmask returned by the
// injected GetCurrentClocksEventReasons when EnvInjectClockEventsHwSlowdown
// is enabled. It is a constant so that it cannot be reassigned at runtime.
const clockEventsToInjectHwSlowdown = reasonHWSlowdown | reasonSwThermalSlowdown | reasonHWSlowdownThermal | reasonHWSlowdownPowerBrake
28+
29+
func NewDefault() Library {
30+
opts := []OpOption{}
31+
32+
if os.Getenv(EnvMockAllSuccess) == "true" {
33+
opts = append(opts,
34+
WithNVML(nvml_lib_mock.AllSuccessInterface),
35+
WithPropertyExtractor(nvml_lib_mock.HasNvmlPropertyExtractor),
36+
)
37+
}
38+
39+
if os.Getenv(EnvInjectRemapedRowsPending) == "true" {
40+
opts = append(opts,
41+
WithDeviceGetRemappedRowsForAllDevs(func() (corrRows int, uncRows int, isPending bool, failureOccurred bool, ret nvml.Return) {
42+
log.Logger.Infow("injecting remapped rows pending", "corrRows", 0, "uncRows", 10, "isPending", true, "failureOccurred", false)
43+
return 0, 10, true, false, nvml.SUCCESS
44+
}),
45+
)
46+
}
47+
48+
if os.Getenv(EnvInjectClockEventsHwSlowdown) == "true" {
49+
opts = append(opts,
50+
WithDeviceGetCurrentClocksEventReasonsForAllDevs(func() (uint64, nvml.Return) {
51+
log.Logger.Infow("injecting clock events hw slowdown", "reasons", clockEventsToInjectHwSlowdown)
52+
return clockEventsToInjectHwSlowdown, nvml.SUCCESS
53+
}),
54+
)
55+
}
56+
57+
return New(opts...)
58+
}
+99
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
package lib
2+
3+
import (
4+
"os"
5+
"testing"
6+
7+
"github.com/NVIDIA/go-nvml/pkg/nvml"
8+
"github.com/stretchr/testify/assert"
9+
"github.com/stretchr/testify/require"
10+
)
11+
12+
// TestNewDefaultNoEnvVars tests the NewDefault function when no environment variables are set
13+
func TestNewDefaultNoEnvVars(t *testing.T) {
14+
// Make sure environment variables are not set
15+
os.Unsetenv(EnvMockAllSuccess)
16+
os.Unsetenv(EnvInjectRemapedRowsPending)
17+
os.Unsetenv(EnvInjectClockEventsHwSlowdown)
18+
19+
// Create a new library instance
20+
lib := NewDefault()
21+
22+
// Verify the library instance is created with default options
23+
assert.NotNil(t, lib)
24+
assert.NotNil(t, lib.NVML())
25+
assert.NotNil(t, lib.Device())
26+
}
27+
28+
// TestNewDefaultMockAllSuccess tests the NewDefault function when EnvMockAllSuccess is set
29+
func TestNewDefaultMockAllSuccess(t *testing.T) {
30+
// Clean up environment variables first
31+
cleanupEnvVars()
32+
defer cleanupEnvVars()
33+
34+
// Set the environment variable
35+
os.Setenv(EnvMockAllSuccess, "true")
36+
37+
// Create a new library instance
38+
lib := NewDefault()
39+
40+
// Verify the library instance is created with mock interface
41+
assert.NotNil(t, lib)
42+
43+
// Test that NVML functions succeed
44+
ret := lib.NVML().Init()
45+
assert.Equal(t, nvml.SUCCESS, ret)
46+
47+
// Test that device functions are available and succeed
48+
devices, err := lib.Device().GetDevices()
49+
assert.NoError(t, err)
50+
assert.NotEmpty(t, devices)
51+
}
52+
53+
// TestNewDefaultMultipleEnvVars tests the NewDefault function when multiple environment variables are set
54+
func TestNewDefaultMultipleEnvVars(t *testing.T) {
55+
// Clean up environment variables first
56+
cleanupEnvVars()
57+
defer cleanupEnvVars()
58+
59+
// Set multiple environment variables
60+
os.Setenv(EnvMockAllSuccess, "true")
61+
os.Setenv(EnvInjectRemapedRowsPending, "true")
62+
os.Setenv(EnvInjectClockEventsHwSlowdown, "true")
63+
64+
// Create a new library instance
65+
lib := NewDefault()
66+
67+
// Verify the library instance is created correctly
68+
assert.NotNil(t, lib)
69+
70+
// Test that NVML functions succeed
71+
ret := lib.NVML().Init()
72+
assert.Equal(t, nvml.SUCCESS, ret)
73+
74+
// Get devices to test modified functions
75+
devices, err := lib.Device().GetDevices()
76+
require.NoError(t, err)
77+
require.NotEmpty(t, devices)
78+
79+
// Test the injected function to get remapped rows
80+
corrRows, uncRows, isPending, failureOccurred, retRemapped := devices[0].GetRemappedRows()
81+
assert.Equal(t, 0, corrRows)
82+
assert.Equal(t, 10, uncRows)
83+
assert.True(t, isPending)
84+
assert.False(t, failureOccurred)
85+
assert.Equal(t, nvml.SUCCESS, retRemapped)
86+
87+
// Test the injected function to get clock events
88+
reasons, retClock := devices[0].GetCurrentClocksEventReasons()
89+
expectedReasons := reasonHWSlowdown | reasonSwThermalSlowdown | reasonHWSlowdownThermal | reasonHWSlowdownPowerBrake
90+
assert.Equal(t, expectedReasons, reasons)
91+
assert.Equal(t, nvml.SUCCESS, retClock)
92+
}
93+
94+
// Utility function to clean up environment variables
95+
func cleanupEnvVars() {
96+
os.Unsetenv(EnvMockAllSuccess)
97+
os.Unsetenv(EnvInjectRemapedRowsPending)
98+
os.Unsetenv(EnvInjectClockEventsHwSlowdown)
99+
}

pkg/nvidia-query/nvml/lib/lib.go

+23-10
Original file line numberDiff line numberDiff line change
@@ -58,9 +58,10 @@ func New(opts ...OpOption) Library {
5858

5959
devLib := device.New(nvInterface.Interface)
6060
nvInterface.dev = &devInterface{
61-
Interface: devLib,
62-
devices: options.devicesToReturn,
63-
getRemappedRows: options.devGetRemappedRows,
61+
Interface: devLib,
62+
devices: options.devicesToReturn,
63+
getRemappedRowsForAllDevs: options.devGetRemappedRowsForAllDevs,
64+
getCurrentClocksEventReasonsForAllDevs: options.devGetCurrentClocksEventReasonsForAllDevs,
6465
}
6566

6667
infoOpts := []nvinfo.Option{
@@ -86,8 +87,9 @@ var _ device.Interface = &devInterface{}
8687

8788
type devInterface struct {
8889
device.Interface
89-
devices []device.Device
90-
getRemappedRows func() (int, int, bool, bool, nvml.Return)
90+
devices []device.Device
91+
getRemappedRowsForAllDevs func() (int, int, bool, bool, nvml.Return)
92+
getCurrentClocksEventReasonsForAllDevs func() (uint64, nvml.Return)
9193
}
9294

9395
func (d *devInterface) GetDevices() ([]device.Device, error) {
@@ -105,8 +107,9 @@ func (d *devInterface) GetDevices() ([]device.Device, error) {
105107
updated := make([]device.Device, len(devs))
106108
for i, dev := range devs {
107109
updated[i] = &devDevInterface{
108-
Device: dev,
109-
getRemappedRows: d.getRemappedRows,
110+
Device: dev,
111+
getRemappedRowsForAllDevs: d.getRemappedRowsForAllDevs,
112+
getCurrentClocksEventReasonsForAllDevs: d.getCurrentClocksEventReasonsForAllDevs,
110113
}
111114
}
112115

@@ -117,14 +120,24 @@ var _ device.Device = &devDevInterface{}
117120

118121
type devDevInterface struct {
119122
device.Device
120-
getRemappedRows func() (int, int, bool, bool, nvml.Return)
123+
getRemappedRowsForAllDevs func() (int, int, bool, bool, nvml.Return)
124+
getCurrentClocksEventReasonsForAllDevs func() (uint64, nvml.Return)
121125
}
122126

123127
func (d *devDevInterface) GetRemappedRows() (int, int, bool, bool, nvml.Return) {
124128
// no injected remapped rows
125129
// thus just passthrough to call the underlying device.Device.GetRemappedRows()
126-
if d.getRemappedRows == nil {
130+
if d.getRemappedRowsForAllDevs == nil {
127131
return d.Device.GetRemappedRows()
128132
}
129-
return d.getRemappedRows()
133+
return d.getRemappedRowsForAllDevs()
134+
}
135+
136+
func (d *devDevInterface) GetCurrentClocksEventReasons() (uint64, nvml.Return) {
137+
// no injected current clocks event reasons
138+
// thus just passthrough to call the underlying device.Device.GetCurrentClocksEventReasons()
139+
if d.getCurrentClocksEventReasonsForAllDevs == nil {
140+
return d.Device.GetCurrentClocksEventReasons()
141+
}
142+
return d.getCurrentClocksEventReasonsForAllDevs()
130143
}

0 commit comments

Comments
 (0)