
Commit 8139413

Add support for GPU PCIe throughput metrics
This commit introduces GPU PCIe throughput metrics: transmit bytes, receive bytes, and the PCIe utilization count. The throughput data is retrieved via NVML and exposed as telemetry metrics.
1 parent e8d881f commit 8139413
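
For reference, the go-nvml call this commit relies on, Device.GetPcieThroughput, takes one of the nvml.PcieUtilCounter values (PCIE_UTIL_TX_BYTES, PCIE_UTIL_RX_BYTES, PCIE_UTIL_COUNT) and returns the sampled throughput in KB/s. Below is a minimal standalone sketch of that query path, separate from the producer code in the diff; device index 0, the fatal error handling, and the printed output are placeholders for illustration only:

package main

import (
	"fmt"
	"log"

	"github.com/NVIDIA/go-nvml/pkg/nvml"
)

func main() {
	// Initialize NVML before any device queries.
	if ret := nvml.Init(); ret != nvml.SUCCESS {
		log.Fatalf("failed to initialize NVML: %s", nvml.ErrorString(ret))
	}
	defer nvml.Shutdown()

	// Device index 0 is a placeholder; the producer iterates all devices.
	dev, ret := nvml.DeviceGetHandleByIndex(0)
	if ret != nvml.SUCCESS {
		log.Fatalf("failed to get device handle: %s", nvml.ErrorString(ret))
	}

	// GetPcieThroughput reports the requested counter in KB/s.
	tx, ret := dev.GetPcieThroughput(nvml.PCIE_UTIL_TX_BYTES)
	if ret != nvml.SUCCESS {
		log.Fatalf("failed to read PCIe TX throughput: %s", nvml.ErrorString(ret))
	}
	fmt.Printf("PCIe TX: %d KB/s (%d bytes/s)\n", tx, uint64(tx)*1000)
}
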

File tree

1 file changed: +47 -0 lines changed

nvidia.go (+47 lines)
@@ -7,6 +7,7 @@ import (
 	"fmt"
 	"log/slog"
 	"sort"
+	"time"
 
 	"github.com/NVIDIA/go-nvml/pkg/nvml"
 	"go.opentelemetry.io/collector/pdata/pcommon"
@@ -19,6 +20,9 @@ const (
 	metricNameGPUUtilizationMemoryPercent = "gpu_utilization_memory_percent"
 	metricNameGPUUtilizationPercent = "gpu_utilization_percent"
 	metricNameGPUPowerWatt = "gpu_power_watt"
+	metricNameGPUPCIeThroughputReceive = "gpu_pcie_throughput_receive_bytes"
+	metricNameGPUPCIeThroughputTransmit = "gpu_pcie_throughput_transmit_bytes"
+	metricNameGPUPCIeThroughputCount = "gpu_pcie_throughput_count"
 )
 
 type perDeviceState struct {
@@ -85,6 +89,11 @@ func (p *NvidiaProducer) Produce(ms pmetric.MetricSlice) error {
 			slog.Error("Failed to get GPU memory utilization for device", "uuid", uuid, "index", i, "error", err)
 			continue
 		}
+
+		err = p.produceThroughput(pds, uuid, i, ms)
+		if err != nil {
+			slog.Error("Failed to get GPU PCIe throughput for device", "uuid", uuid, "index", i, "error", err)
+		}
 	}
 
 	return nil
@@ -209,6 +218,44 @@ func (p *NvidiaProducer) producePowerConsumption(pds perDeviceState, uuid string
 	return nil
 }
 
+var pcieCounters = []nvml.PcieUtilCounter{
+	nvml.PCIE_UTIL_TX_BYTES,
+	nvml.PCIE_UTIL_RX_BYTES,
+	nvml.PCIE_UTIL_COUNT,
+}
+
+func (p *NvidiaProducer) produceThroughput(pds perDeviceState, uuid string, index int, ms pmetric.MetricSlice) error {
+	for _, counter := range pcieCounters {
+		ts := time.Now()
+
+		tp, ret := pds.d.GetPcieThroughput(counter)
+		if !errors.Is(ret, nvml.SUCCESS) {
+			return fmt.Errorf("failed to get PCIe throughput for %d %d: %s", index, counter, nvml.ErrorString(ret))
+		}
+
+		var metricName string
+		switch counter {
+		case nvml.PCIE_UTIL_TX_BYTES:
+			metricName = metricNameGPUPCIeThroughputTransmit
+		case nvml.PCIE_UTIL_RX_BYTES:
+			metricName = metricNameGPUPCIeThroughputReceive
+		case nvml.PCIE_UTIL_COUNT:
+			metricName = metricNameGPUPCIeThroughputCount
+		}
+
+		m := ms.AppendEmpty()
+		m.SetName(metricName)
+		g := m.SetEmptyGauge()
+		dp := g.DataPoints().AppendEmpty()
+		dp.Attributes().PutStr(attributeUUID, uuid)
+		dp.Attributes().PutInt(attributeIndex, int64(index))
+		dp.SetTimestamp(pcommon.Timestamp(ts.UnixNano()))
+		dp.SetIntValue(int64(tp * 1000)) // KB/s to bytes/s
+	}
+
+	return nil
+}
+
 func valueGetter(sampleType nvml.ValueType) (func([8]byte) any, error) {
 	switch sampleType {
 	case nvml.VALUE_TYPE_DOUBLE:
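
For completeness, here is a rough sketch of how the resulting gauge data points can be built and read back through the collector's pdata API, for example in a unit test. The metric name matches the constant added above; "gpu_uuid" and "gpu_index" are placeholder attribute keys standing in for the repo's attributeUUID and attributeIndex constants, whose string values are not shown in this diff:

package main

import (
	"fmt"
	"time"

	"go.opentelemetry.io/collector/pdata/pcommon"
	"go.opentelemetry.io/collector/pdata/pmetric"
)

func main() {
	ms := pmetric.NewMetricSlice()

	// Emit one gauge data point the same way produceThroughput does.
	m := ms.AppendEmpty()
	m.SetName("gpu_pcie_throughput_transmit_bytes")
	dp := m.SetEmptyGauge().DataPoints().AppendEmpty()
	dp.Attributes().PutStr("gpu_uuid", "GPU-placeholder")
	dp.Attributes().PutInt("gpu_index", 0)
	dp.SetTimestamp(pcommon.NewTimestampFromTime(time.Now()))
	dp.SetIntValue(1234 * 1000) // 1234 KB/s expressed as bytes/s

	// Read the slice back, e.g. inside a test assertion.
	got := ms.At(0)
	fmt.Println(got.Name(), got.Gauge().DataPoints().At(0).IntValue())
}
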
