
Commit 8139413

Add support for GPU PCIe throughput metrics
This commit introduces GPU PCIe throughput metrics: transmit bytes, receive bytes, and the PCIe utilization count. The throughput data is retrieved via NVML and exposed as telemetry metrics.
1 parent e8d881f commit 8139413
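
For reference, the go-nvml call this commit relies on, Device.GetPcieThroughput, takes one of the nvml.PcieUtilCounter values (PCIE_UTIL_TX_BYTES, PCIE_UTIL_RX_BYTES, PCIE_UTIL_COUNT) and returns the sampled throughput in KB/s. Below is a minimal standalone sketch of that query path, separate from the producer code in the diff; device index 0, the fatal error handling, and the printed output are placeholders for illustration only:

package main

import (
	"fmt"
	"log"

	"github.com/NVIDIA/go-nvml/pkg/nvml"
)

func main() {
	// Initialize NVML before any device queries.
	if ret := nvml.Init(); ret != nvml.SUCCESS {
		log.Fatalf("failed to initialize NVML: %s", nvml.ErrorString(ret))
	}
	defer nvml.Shutdown()

	// Device index 0 is a placeholder; the producer iterates all devices.
	dev, ret := nvml.DeviceGetHandleByIndex(0)
	if ret != nvml.SUCCESS {
		log.Fatalf("failed to get device handle: %s", nvml.ErrorString(ret))
	}

	// GetPcieThroughput reports the requested counter in KB/s.
	tx, ret := dev.GetPcieThroughput(nvml.PCIE_UTIL_TX_BYTES)
	if ret != nvml.SUCCESS {
		log.Fatalf("failed to read PCIe TX throughput: %s", nvml.ErrorString(ret))
	}
	fmt.Printf("PCIe TX: %d KB/s (%d bytes/s)\n", tx, uint64(tx)*1000)
}
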

File tree

1 file changed: +47 -0 lines changed

nvidia.go (+47 lines)
@@ -7,6 +7,7 @@ import (
 	"fmt"
 	"log/slog"
 	"sort"
+	"time"
 
 	"github.com/NVIDIA/go-nvml/pkg/nvml"
 	"go.opentelemetry.io/collector/pdata/pcommon"
@@ -19,6 +20,9 @@ const (
 	metricNameGPUUtilizationMemoryPercent = "gpu_utilization_memory_percent"
 	metricNameGPUUtilizationPercent = "gpu_utilization_percent"
 	metricNameGPUPowerWatt = "gpu_power_watt"
+	metricNameGPUPCIeThroughputReceive = "gpu_pcie_throughput_receive_bytes"
+	metricNameGPUPCIeThroughputTransmit = "gpu_pcie_throughput_transmit_bytes"
+	metricNameGPUPCIeThroughputCount = "gpu_pcie_throughput_count"
 )
 
 type perDeviceState struct {
@@ -85,6 +89,11 @@ func (p *NvidiaProducer) Produce(ms pmetric.MetricSlice) error {
 			slog.Error("Failed to get GPU memory utilization for device", "uuid", uuid, "index", i, "error", err)
 			continue
 		}
+
+		err = p.produceThroughput(pds, uuid, i, ms)
+		if err != nil {
+			slog.Error("Failed to get GPU PCIe throughput for device", "uuid", uuid, "index", i, "error", err)
+		}
 	}
 
 	return nil
@@ -209,6 +218,44 @@ func (p *NvidiaProducer) producePowerConsumption(pds perDeviceState, uuid string
 	return nil
 }
 
+var pcieCounters = []nvml.PcieUtilCounter{
+	nvml.PCIE_UTIL_TX_BYTES,
+	nvml.PCIE_UTIL_RX_BYTES,
+	nvml.PCIE_UTIL_COUNT,
+}
+
+func (p *NvidiaProducer) produceThroughput(pds perDeviceState, uuid string, index int, ms pmetric.MetricSlice) error {
+	for _, counter := range pcieCounters {
+		ts := time.Now()
+
+		tp, ret := pds.d.GetPcieThroughput(counter)
+		if !errors.Is(ret, nvml.SUCCESS) {
+			return fmt.Errorf("failed to get PCIe throughput for %d %d: %s", index, counter, nvml.ErrorString(ret))
+		}
+
+		var metricName string
+		switch counter {
+		case nvml.PCIE_UTIL_TX_BYTES:
+			metricName = metricNameGPUPCIeThroughputTransmit
+		case nvml.PCIE_UTIL_RX_BYTES:
+			metricName = metricNameGPUPCIeThroughputReceive
+		case nvml.PCIE_UTIL_COUNT:
+			metricName = metricNameGPUPCIeThroughputCount
+		}
+
+		m := ms.AppendEmpty()
+		m.SetName(metricName)
+		g := m.SetEmptyGauge()
+		dp := g.DataPoints().AppendEmpty()
+		dp.Attributes().PutStr(attributeUUID, uuid)
+		dp.Attributes().PutInt(attributeIndex, int64(index))
+		dp.SetTimestamp(pcommon.Timestamp(ts.UnixNano()))
+		dp.SetIntValue(int64(tp * 1000)) // KB/s to bytes/s
+	}
+
+	return nil
+}
+
 func valueGetter(sampleType nvml.ValueType) (func([8]byte) any, error) {
 	switch sampleType {
 	case nvml.VALUE_TYPE_DOUBLE:
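
For completeness, here is a rough sketch of how the resulting gauge data points can be built and read back through the collector's pdata API, for example in a unit test. The metric name matches the constant added above; "gpu_uuid" and "gpu_index" are placeholder attribute keys standing in for the repo's attributeUUID and attributeIndex constants, whose string values are not shown in this diff:

package main

import (
	"fmt"
	"time"

	"go.opentelemetry.io/collector/pdata/pcommon"
	"go.opentelemetry.io/collector/pdata/pmetric"
)

func main() {
	ms := pmetric.NewMetricSlice()

	// Emit one gauge data point the same way produceThroughput does.
	m := ms.AppendEmpty()
	m.SetName("gpu_pcie_throughput_transmit_bytes")
	dp := m.SetEmptyGauge().DataPoints().AppendEmpty()
	dp.Attributes().PutStr("gpu_uuid", "GPU-placeholder")
	dp.Attributes().PutInt("gpu_index", 0)
	dp.SetTimestamp(pcommon.NewTimestampFromTime(time.Now()))
	dp.SetIntValue(1234 * 1000) // 1234 KB/s expressed as bytes/s

	// Read the slice back, e.g. inside a test assertion.
	got := ms.At(0)
	fmt.Println(got.Name(), got.Gauge().DataPoints().At(0).IntValue())
}
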
