@@ -28,12 +28,15 @@ import (
2828 "github.com/NVIDIA/fleet-intelligence-sdk/pkg/eventstore"
2929 "github.com/NVIDIA/fleet-intelligence-sdk/pkg/log"
3030 pkgmetrics "github.com/NVIDIA/fleet-intelligence-sdk/pkg/metrics"
31+ nvidianvml "github.com/NVIDIA/fleet-intelligence-sdk/pkg/nvidia-query/nvml"
3132 "github.com/google/uuid"
3233
3334 "github.com/NVIDIA/fleet-intelligence-agent/internal/config"
3435 "github.com/NVIDIA/fleet-intelligence-agent/internal/machineinfo"
3536)
3637
// initialMachineInfoWait is the duration handed to the machine-info provider's
// WaitForInitialRefresh when a collection finds the machine-info cache empty.
const initialMachineInfoWait time.Duration = 5 * time.Second
39+
3740// GenerateCollectionID generates a unique identifier for a data collection cycle
3841func GenerateCollectionID () string {
3942 bytes := make ([]byte , 16 )
@@ -65,12 +68,27 @@ type Collector interface {
6568
6669// collector implements the Collector interface
6770type collector struct {
68- config * config.HealthExporterConfig
69- metricsStore pkgmetrics.Store
70- eventStore eventstore.Store
71- componentsRegistry components.Registry
72- machineID string // Agent's stable identity from server initialization
73- dcgmGPUIndexes map [string ]string // UUID → DCGM device ID override for GPU indices
71+ config * config.HealthExporterConfig
72+ metricsStore pkgmetrics.Store
73+ eventStore eventstore.Store
74+ componentsRegistry components.Registry
75+ machineID string // Agent's stable identity from server initialization
76+ dcgmGPUIndexes map [string ]string // UUID → DCGM device ID override for GPU indices
77+ machineInfoProvider machineInfoProvider
78+ }
79+
80+ type collectorOptions struct {
81+ nvmlInstance nvidianvml.Instance
82+ }
83+
84+ // Option configures optional collector dependencies.
85+ type Option func (* collectorOptions )
86+
87+ // WithNVMLInstance enables cached machine-info collection for health exports.
88+ func WithNVMLInstance (nvmlInstance nvidianvml.Instance ) Option {
89+ return func (o * collectorOptions ) {
90+ o .nvmlInstance = nvmlInstance
91+ }
7492}
7593
7694// New creates a new health data collector
@@ -81,14 +99,31 @@ func New(
8199 componentsRegistry components.Registry ,
82100 machineID string ,
83101 dcgmGPUIndexes map [string ]string ,
102+ opts ... Option ,
84103) Collector {
104+ var collectorOpts collectorOptions
105+ for _ , opt := range opts {
106+ opt (& collectorOpts )
107+ }
108+
109+ var provider machineInfoProvider
110+ if cfg != nil && cfg .IncludeMachineInfo && collectorOpts .nvmlInstance != nil {
111+ var machineInfoOpts []machineinfo.MachineInfoOption
112+ if len (dcgmGPUIndexes ) > 0 {
113+ machineInfoOpts = append (machineInfoOpts , machineinfo .WithDCGMGPUIndexes (dcgmGPUIndexes ))
114+ }
115+ provider = newCachedMachineInfoProvider (collectorOpts .nvmlInstance , 0 , machineInfoOpts ... )
116+ provider .RefreshAsync (context .Background ())
117+ }
118+
85119 return & collector {
86- config : cfg ,
87- metricsStore : metricsStore ,
88- eventStore : eventStore ,
89- componentsRegistry : componentsRegistry ,
90- machineID : machineID ,
91- dcgmGPUIndexes : dcgmGPUIndexes ,
120+ config : cfg ,
121+ metricsStore : metricsStore ,
122+ eventStore : eventStore ,
123+ componentsRegistry : componentsRegistry ,
124+ machineID : machineID ,
125+ dcgmGPUIndexes : dcgmGPUIndexes ,
126+ machineInfoProvider : provider ,
92127 }
93128}
94129
@@ -110,6 +145,11 @@ func (c *collector) Collect(ctx context.Context) (*HealthData, error) {
110145 GPUUUIDToIndex : cloneStringMap (c .dcgmGPUIndexes ),
111146 }
112147
148+ // Collect machine info if enabled. The converter only exports selected fields.
149+ if c .config .IncludeMachineInfo {
150+ c .collectMachineInfo (ctx , data )
151+ }
152+
113153 // Collect metrics if enabled
114154 if c .config .IncludeMetrics {
115155 if err := c .collectMetrics (ctx , data ); err != nil {
@@ -134,6 +174,23 @@ func (c *collector) Collect(ctx context.Context) (*HealthData, error) {
134174 return data , nil
135175}
136176
177+ // collectMachineInfo reads cached machine info and triggers a best-effort refresh.
178+ func (c * collector ) collectMachineInfo (ctx context.Context , data * HealthData ) {
179+ if c .machineInfoProvider == nil {
180+ return
181+ }
182+
183+ if _ , ok := c .machineInfoProvider .Get (); ! ok {
184+ c .machineInfoProvider .WaitForInitialRefresh (ctx , initialMachineInfoWait )
185+ }
186+
187+ if machineInfo , ok := c .machineInfoProvider .Get (); ok {
188+ data .MachineInfo = machineInfo
189+ }
190+
191+ c .machineInfoProvider .RefreshAsync (ctx )
192+ }
193+
137194// collectMetrics collects metrics data from the metrics store
138195func (c * collector ) collectMetrics (ctx context.Context , data * HealthData ) error {
139196 if c .metricsStore == nil {
0 commit comments