7
7
"fmt"
8
8
"log/slog"
9
9
"sort"
10
+ "time"
10
11
11
12
"github.com/NVIDIA/go-nvml/pkg/nvml"
12
13
"go.opentelemetry.io/collector/pdata/pcommon"
@@ -19,6 +20,9 @@ const (
19
20
metricNameGPUUtilizationMemoryPercent = "gpu_utilization_memory_percent"
20
21
metricNameGPUUtilizationPercent = "gpu_utilization_percent"
21
22
metricNameGPUPowerWatt = "gpu_power_watt"
23
+ metricNameGPUPCIeThroughputReceive = "gpu_pcie_throughput_receive_bytes"
24
+ metricNameGPUPCIeThroughputTransmit = "gpu_pcie_throughput_transmit_bytes"
25
+ metricNameGPUPCIeThroughputCount = "gpu_pcie_throughput_count"
22
26
)
23
27
24
28
type perDeviceState struct {
@@ -85,6 +89,11 @@ func (p *NvidiaProducer) Produce(ms pmetric.MetricSlice) error {
85
89
slog .Error ("Failed to get GPU memory utilization for device" , "uuid" , uuid , "index" , i , "error" , err )
86
90
continue
87
91
}
92
+
93
+ err = p .produceThroughput (pds , uuid , i , ms )
94
+ if err != nil {
95
+ slog .Error ("Failed to get GPU PICe throughput for device" , "uuid" , uuid , "index" , i , "error" , err )
96
+ }
88
97
}
89
98
90
99
return nil
@@ -209,6 +218,44 @@ func (p *NvidiaProducer) producePowerConsumption(pds perDeviceState, uuid string
209
218
return nil
210
219
}
211
220
221
+ var pcieCounters = []nvml.PcieUtilCounter {
222
+ nvml .PCIE_UTIL_TX_BYTES ,
223
+ nvml .PCIE_UTIL_RX_BYTES ,
224
+ nvml .PCIE_UTIL_COUNT ,
225
+ }
226
+
227
+ func (p * NvidiaProducer ) produceThroughput (pds perDeviceState , uuid string , index int , ms pmetric.MetricSlice ) error {
228
+ for _ , counter := range pcieCounters {
229
+ ts := time .Now ()
230
+
231
+ tp , ret := pds .d .GetPcieThroughput (counter )
232
+ if ! errors .Is (ret , nvml .SUCCESS ) {
233
+ return fmt .Errorf ("failed to get PCIe throughput for %d %d: %s" , index , counter , nvml .ErrorString (ret ))
234
+ }
235
+
236
+ var metricName string
237
+ switch counter {
238
+ case nvml .PCIE_UTIL_TX_BYTES :
239
+ metricName = metricNameGPUPCIeThroughputTransmit
240
+ case nvml .PCIE_UTIL_RX_BYTES :
241
+ metricName = metricNameGPUPCIeThroughputReceive
242
+ case nvml .PCIE_UTIL_COUNT :
243
+ metricName = metricNameGPUPCIeThroughputCount
244
+ }
245
+
246
+ m := ms .AppendEmpty ()
247
+ m .SetName (metricName )
248
+ g := m .SetEmptyGauge ()
249
+ dp := g .DataPoints ().AppendEmpty ()
250
+ dp .Attributes ().PutStr (attributeUUID , uuid )
251
+ dp .Attributes ().PutInt (attributeIndex , int64 (index ))
252
+ dp .SetTimestamp (pcommon .Timestamp (ts .UnixNano ()))
253
+ dp .SetIntValue (int64 (tp * 1000 )) // KB/s to bytes/s
254
+ }
255
+
256
+ return nil
257
+ }
258
+
212
259
func valueGetter (sampleType nvml.ValueType ) (func ([8 ]byte ) any , error ) {
213
260
switch sampleType {
214
261
case nvml .VALUE_TYPE_DOUBLE :
0 commit comments