Skip to content

Commit c3b083d

Browse files
authored
fix: restore gpu info gpus OTLP attribute (#201) (#202)
Signed-off-by: Jingxiang Zhang <jingzhang@nvidia.com>
1 parent b5bebe6 commit c3b083d

7 files changed

Lines changed: 177 additions & 12 deletions

File tree

internal/exporter/collector/collector.go

Lines changed: 69 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -28,12 +28,15 @@ import (
2828
"github.com/NVIDIA/fleet-intelligence-sdk/pkg/eventstore"
2929
"github.com/NVIDIA/fleet-intelligence-sdk/pkg/log"
3030
pkgmetrics "github.com/NVIDIA/fleet-intelligence-sdk/pkg/metrics"
31+
nvidianvml "github.com/NVIDIA/fleet-intelligence-sdk/pkg/nvidia-query/nvml"
3132
"github.com/google/uuid"
3233

3334
"github.com/NVIDIA/fleet-intelligence-agent/internal/config"
3435
"github.com/NVIDIA/fleet-intelligence-agent/internal/machineinfo"
3536
)
3637

38+
const initialMachineInfoWait = 5 * time.Second
39+
3740
// GenerateCollectionID generates a unique identifier for a data collection cycle
3841
func GenerateCollectionID() string {
3942
bytes := make([]byte, 16)
@@ -65,12 +68,27 @@ type Collector interface {
6568

6669
// collector implements the Collector interface
6770
type collector struct {
68-
config *config.HealthExporterConfig
69-
metricsStore pkgmetrics.Store
70-
eventStore eventstore.Store
71-
componentsRegistry components.Registry
72-
machineID string // Agent's stable identity from server initialization
73-
dcgmGPUIndexes map[string]string // UUID → DCGM device ID override for GPU indices
71+
config *config.HealthExporterConfig
72+
metricsStore pkgmetrics.Store
73+
eventStore eventstore.Store
74+
componentsRegistry components.Registry
75+
machineID string // Agent's stable identity from server initialization
76+
dcgmGPUIndexes map[string]string // UUID → DCGM device ID override for GPU indices
77+
machineInfoProvider machineInfoProvider
78+
}
79+
80+
type collectorOptions struct {
81+
nvmlInstance nvidianvml.Instance
82+
}
83+
84+
// Option configures optional collector dependencies.
85+
type Option func(*collectorOptions)
86+
87+
// WithNVMLInstance enables cached machine-info collection for health exports.
88+
func WithNVMLInstance(nvmlInstance nvidianvml.Instance) Option {
89+
return func(o *collectorOptions) {
90+
o.nvmlInstance = nvmlInstance
91+
}
7492
}
7593

7694
// New creates a new health data collector
@@ -81,14 +99,31 @@ func New(
8199
componentsRegistry components.Registry,
82100
machineID string,
83101
dcgmGPUIndexes map[string]string,
102+
opts ...Option,
84103
) Collector {
104+
var collectorOpts collectorOptions
105+
for _, opt := range opts {
106+
opt(&collectorOpts)
107+
}
108+
109+
var provider machineInfoProvider
110+
if cfg != nil && cfg.IncludeMachineInfo && collectorOpts.nvmlInstance != nil {
111+
var machineInfoOpts []machineinfo.MachineInfoOption
112+
if len(dcgmGPUIndexes) > 0 {
113+
machineInfoOpts = append(machineInfoOpts, machineinfo.WithDCGMGPUIndexes(dcgmGPUIndexes))
114+
}
115+
provider = newCachedMachineInfoProvider(collectorOpts.nvmlInstance, 0, machineInfoOpts...)
116+
provider.RefreshAsync(context.Background())
117+
}
118+
85119
return &collector{
86-
config: cfg,
87-
metricsStore: metricsStore,
88-
eventStore: eventStore,
89-
componentsRegistry: componentsRegistry,
90-
machineID: machineID,
91-
dcgmGPUIndexes: dcgmGPUIndexes,
120+
config: cfg,
121+
metricsStore: metricsStore,
122+
eventStore: eventStore,
123+
componentsRegistry: componentsRegistry,
124+
machineID: machineID,
125+
dcgmGPUIndexes: dcgmGPUIndexes,
126+
machineInfoProvider: provider,
92127
}
93128
}
94129

@@ -110,6 +145,11 @@ func (c *collector) Collect(ctx context.Context) (*HealthData, error) {
110145
GPUUUIDToIndex: cloneStringMap(c.dcgmGPUIndexes),
111146
}
112147

148+
// Collect machine info if enabled. The converter only exports selected fields.
149+
if c.config.IncludeMachineInfo {
150+
c.collectMachineInfo(ctx, data)
151+
}
152+
113153
// Collect metrics if enabled
114154
if c.config.IncludeMetrics {
115155
if err := c.collectMetrics(ctx, data); err != nil {
@@ -134,6 +174,23 @@ func (c *collector) Collect(ctx context.Context) (*HealthData, error) {
134174
return data, nil
135175
}
136176

177+
// collectMachineInfo reads cached machine info and triggers a best-effort refresh.
178+
func (c *collector) collectMachineInfo(ctx context.Context, data *HealthData) {
179+
if c.machineInfoProvider == nil {
180+
return
181+
}
182+
183+
if _, ok := c.machineInfoProvider.Get(); !ok {
184+
c.machineInfoProvider.WaitForInitialRefresh(ctx, initialMachineInfoWait)
185+
}
186+
187+
if machineInfo, ok := c.machineInfoProvider.Get(); ok {
188+
data.MachineInfo = machineInfo
189+
}
190+
191+
c.machineInfoProvider.RefreshAsync(ctx)
192+
}
193+
137194
// collectMetrics collects metrics data from the metrics store
138195
func (c *collector) collectMetrics(ctx context.Context, data *HealthData) error {
139196
if c.metricsStore == nil {

internal/exporter/collector/collector_test.go

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,28 @@ func TestCollector_CollectMachineInfo_NoNVML(t *testing.T) {
134134
assert.Nil(t, data.MachineInfo, "MachineInfo should be nil without NVML")
135135
}
136136

137+
func TestCollector_CollectMachineInfo_UsesCachedProvider(t *testing.T) {
138+
info := &machineinfo.MachineInfo{
139+
GPUInfo: &apiv1.MachineGPUInfo{
140+
GPUs: []apiv1.MachineGPUInstance{{UUID: "GPU-123", GPUIndex: "0"}},
141+
},
142+
}
143+
provider := &fakeMachineInfoProvider{
144+
info: info,
145+
ok: true,
146+
}
147+
data := &HealthData{}
148+
c := &collector{
149+
machineInfoProvider: provider,
150+
}
151+
152+
c.collectMachineInfo(context.Background(), data)
153+
154+
assert.Same(t, info, data.MachineInfo)
155+
assert.True(t, provider.refreshed)
156+
assert.False(t, provider.waited)
157+
}
158+
137159
func TestCachedMachineInfoProvider_DeduplicatesConcurrentRefresh(t *testing.T) {
138160
originalGetMachineInfo := getMachineInfo
139161
defer func() {
@@ -603,6 +625,26 @@ func (m *mockEventStore) Bucket(name string, opts ...eventstore.OpOption) (event
603625
return nil, nil
604626
}
605627

628+
type fakeMachineInfoProvider struct {
629+
info *machineinfo.MachineInfo
630+
ok bool
631+
refreshed bool
632+
waited bool
633+
}
634+
635+
func (f *fakeMachineInfoProvider) Get() (*machineinfo.MachineInfo, bool) {
636+
return f.info, f.ok
637+
}
638+
639+
func (f *fakeMachineInfoProvider) RefreshAsync(parent context.Context) {
640+
f.refreshed = true
641+
}
642+
643+
func (f *fakeMachineInfoProvider) WaitForInitialRefresh(ctx context.Context, maxWait time.Duration) bool {
644+
f.waited = true
645+
return f.ok
646+
}
647+
606648
type mockComponent struct {
607649
name string
608650
events []apiv1.Event

internal/exporter/converter/otlp.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,17 @@ func (c *otlpConverter) createOTLPResource(data *collector.HealthData) *resource
127127
})
128128
}
129129

130+
if data.MachineInfo != nil && data.MachineInfo.GPUInfo != nil && len(data.MachineInfo.GPUInfo.GPUs) > 0 {
131+
if gpus, err := json.Marshal(data.MachineInfo.GPUInfo.GPUs); err == nil {
132+
attributes = append(attributes, &commonv1.KeyValue{
133+
Key: "gpuInfo.gpus",
134+
Value: &commonv1.AnyValue{
135+
Value: &commonv1.AnyValue_StringValue{StringValue: string(gpus)},
136+
},
137+
})
138+
}
139+
}
140+
130141
return &resourcev1.Resource{
131142
Attributes: attributes,
132143
}

internal/exporter/converter/otlp_test.go

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -773,6 +773,49 @@ func TestOTLPConverter_ResourceAttributes(t *testing.T) {
773773
assert.Equal(t, "test-machine-123", attrMap["machine.id"])
774774
}
775775

776+
func TestOTLPConverter_ResourceAttributes_IncludesOnlyGPUInfoGPUs(t *testing.T) {
777+
data := &collector.HealthData{
778+
Timestamp: time.Now(),
779+
MachineID: "test-machine-123",
780+
MachineInfo: &machineinfo.MachineInfo{
781+
GPUInfo: &apiv1.MachineGPUInfo{
782+
Product: "NVIDIA-H100",
783+
Manufacturer: "NVIDIA",
784+
Architecture: "hopper",
785+
Memory: "85899345920",
786+
GPUs: []apiv1.MachineGPUInstance{
787+
{
788+
UUID: "GPU-123",
789+
GPUIndex: "0",
790+
BusID: "0000:01:00.0",
791+
SN: "serial-123",
792+
MinorID: "0",
793+
BoardID: 7,
794+
VBIOSVersion: "96.00.68.00.01",
795+
ChassisSN: "chassis-123",
796+
},
797+
},
798+
},
799+
},
800+
}
801+
802+
converter := NewOTLPConverter()
803+
otlpData := converter.Convert(data)
804+
805+
attrs := otlpData.Metrics.ResourceMetrics[0].Resource.Attributes
806+
gpus := findAttribute(t, attrs, "gpuInfo.gpus").GetStringValue()
807+
assert.JSONEq(t, `[{"uuid":"GPU-123","gpuIndex":"0","busID":"0000:01:00.0","sn":"serial-123","minorID":"0","boardID":7,"vbiosVersion":"96.00.68.00.01","chassisSN":"chassis-123"}]`, gpus)
808+
809+
for _, attr := range attrs {
810+
assert.NotContains(t, []string{
811+
"gpuInfo.product",
812+
"gpuInfo.manufacturer",
813+
"gpuInfo.architecture",
814+
"gpuInfo.memory",
815+
}, attr.Key)
816+
}
817+
}
818+
776819
func TestOTLPConverter_Interface(t *testing.T) {
777820
// Verify otlpConverter implements OTLPConverter interface
778821
var _ OTLPConverter = (*otlpConverter)(nil)

internal/exporter/exporter.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@ func New(ctx context.Context, opts ...ExporterOption) (Exporter, error) {
8282
options.componentsRegistry,
8383
options.machineID,
8484
options.dcgmGPUIndexes,
85+
collector.WithNVMLInstance(options.nvmlInstance),
8586
)
8687

8788
otlpConverter := converter.NewOTLPConverter()

internal/exporter/options.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import (
2424
"github.com/NVIDIA/fleet-intelligence-sdk/components"
2525
"github.com/NVIDIA/fleet-intelligence-sdk/pkg/eventstore"
2626
pkgmetrics "github.com/NVIDIA/fleet-intelligence-sdk/pkg/metrics"
27+
nvidianvml "github.com/NVIDIA/fleet-intelligence-sdk/pkg/nvidia-query/nvml"
2728

2829
"github.com/NVIDIA/fleet-intelligence-agent/internal/config"
2930
)
@@ -40,6 +41,7 @@ type exporterOptions struct {
4041
metricsStore pkgmetrics.Store
4142
eventStore eventstore.Store
4243
componentsRegistry components.Registry
44+
nvmlInstance nvidianvml.Instance
4345
httpClient *http.Client
4446
timeout time.Duration
4547
dbRW *sql.DB // Read-write database connection
@@ -84,6 +86,14 @@ func WithComponentsRegistry(registry components.Registry) ExporterOption {
8486
}
8587
}
8688

89+
// WithNVMLInstance sets the NVML instance used for cached machine-info collection.
90+
func WithNVMLInstance(instance nvidianvml.Instance) ExporterOption {
91+
return func(c *exporterOptions) error {
92+
c.nvmlInstance = instance
93+
return nil
94+
}
95+
}
96+
8797
// WithHTTPClient sets a custom HTTP client
8898
func WithHTTPClient(client *http.Client) ExporterOption {
8999
return func(c *exporterOptions) error {

internal/server/server.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -395,6 +395,7 @@ func New(ctx context.Context, auditLogger log.AuditLogger, config *config.Config
395395
exporter.WithMetricsStore(metricsSQLiteStore),
396396
exporter.WithEventStore(eventStore),
397397
exporter.WithComponentsRegistry(s.componentsRegistry),
398+
exporter.WithNVMLInstance(nvmlInstance),
398399
exporter.WithDatabaseConnections(dbRW, dbRO),
399400
exporter.WithMachineID(machineID),
400401
exporter.WithDCGMGPUIndexes(dcgmGPUIndexes),

0 commit comments

Comments
 (0)