Skip to content

Commit be16571

Browse files
committed
Improve pynvml collector; add GPM for SM utilization
1 parent 6e61701 commit be16571

File tree

6 files changed

+604
-221
lines changed

6 files changed

+604
-221
lines changed

src/aiperf/common/models/telemetry_models.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,9 @@ class TelemetryMetrics(AIPerfBaseModel):
2828
default=None, description="Cumulative energy consumption in MJ"
2929
)
3030
gpu_utilization: float | None = Field(
31-
default=None, description="GPU utilization percentage (0-100)"
31+
default=None,
32+
description="GPU utilization percentage (0-100). "
33+
"Percent of time over the past sample period during which one or more kernels was executing on the GPU.",
3234
)
3335
gpu_memory_used: float | None = Field(
3436
default=None, description="GPU memory used in GB"
@@ -37,7 +39,9 @@ class TelemetryMetrics(AIPerfBaseModel):
3739
default=None, description="GPU temperature in °C"
3840
)
3941
mem_utilization: float | None = Field(
40-
default=None, description="Memory bandwidth utilization percentage (0-100)"
42+
default=None,
43+
description="Memory bandwidth utilization percentage (0-100). "
44+
"Percent of time over the past sample period during which global (device) memory was being read or written.",
4145
)
4246
sm_utilization: float | None = Field(
4347
default=None,

src/aiperf/gpu_telemetry/__init__.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,8 @@
1717
GPUTelemetryAccumulator,
1818
)
1919
from aiperf.gpu_telemetry.constants import (
20-
DCGM_SCALING_FACTORS,
2120
DCGM_TO_FIELD_MAPPING,
2221
GPU_TELEMETRY_METRICS_CONFIG,
23-
PYNVML_SCALING_FACTORS,
2422
PYNVML_SOURCE_IDENTIFIER,
2523
get_gpu_telemetry_metrics_config,
2624
)
@@ -48,7 +46,6 @@
4846

4947
__all__ = [
5048
"DCGMTelemetryCollector",
51-
"DCGM_SCALING_FACTORS",
5249
"DCGM_TO_FIELD_MAPPING",
5350
"GPUTelemetryAccumulator",
5451
"GPUTelemetryCollectorFactory",
@@ -57,7 +54,6 @@
5754
"GPUTelemetryManager",
5855
"GPU_TELEMETRY_METRICS_CONFIG",
5956
"MetricsConfigLoader",
60-
"PYNVML_SCALING_FACTORS",
6157
"PYNVML_SOURCE_IDENTIFIER",
6258
"PyNVMLTelemetryCollector",
6359
"TErrorCallback",

src/aiperf/gpu_telemetry/constants.py

Lines changed: 0 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -16,22 +16,6 @@
1616
# Source identifier for pynvml collector (used in TelemetryRecord.dcgm_url field)
1717
PYNVML_SOURCE_IDENTIFIER = "pynvml://localhost"
1818

19-
# DCGM unit conversion scaling factors
20-
DCGM_SCALING_FACTORS = {
21-
"energy_consumption": 1e-9, # mJ to MJ
22-
"gpu_memory_used": 1.048576 * 1e-3, # MiB to GB
23-
"sm_utilization": 100, # ratio (0-1) to percentage (0-100)
24-
"power_violation": 1e-3, # ns to µs
25-
}
26-
27-
# PyNVML unit conversion scaling factors
28-
# NVML returns values in different units than what TelemetryMetrics expects
29-
PYNVML_SCALING_FACTORS = {
30-
"gpu_power_usage": 1e-3, # mW -> W
31-
"energy_consumption": 1e-9, # mJ -> MJ
32-
"gpu_memory_used": 1e-9, # bytes -> GB
33-
}
34-
3519
# DCGM field mapping to telemetry record fields
3620
DCGM_TO_FIELD_MAPPING = {
3721
"DCGM_FI_DEV_POWER_USAGE": "gpu_power_usage",

src/aiperf/gpu_telemetry/dcgm_collector.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,17 +14,22 @@
1414
TRecordCallback,
1515
)
1616
from aiperf.common.models import GpuMetadata, TelemetryMetrics, TelemetryRecord
17-
from aiperf.gpu_telemetry.constants import (
18-
DCGM_SCALING_FACTORS,
19-
DCGM_TO_FIELD_MAPPING,
20-
)
17+
from aiperf.gpu_telemetry.constants import DCGM_TO_FIELD_MAPPING
2118
from aiperf.gpu_telemetry.factories import (
2219
GPUTelemetryCollectorFactory,
2320
GPUTelemetryCollectorProtocol,
2421
)
2522

2623
__all__ = ["DCGMTelemetryCollector"]
2724

25+
# Unit conversion scaling factors for DCGM metrics (DCGM reports some values in different units than TelemetryMetrics expects)
26+
SCALING_FACTORS = {
27+
"energy_consumption": 1e-9, # mJ -> MJ
28+
"gpu_memory_used": 1.048576e-3, # MiB -> GB
29+
"sm_utilization": 100, # ratio (0-1) -> percentage (0-100)
30+
"power_violation": 1e-3, # ns -> µs
31+
}
32+
2833

2934
@implements_protocol(GPUTelemetryCollectorProtocol)
3035
@GPUTelemetryCollectorFactory.register(GPUTelemetryCollectorType.DCGM)
@@ -65,7 +70,7 @@ def __init__(
6570
error_callback: TErrorCallback | None = None,
6671
collector_id: str = "telemetry_collector",
6772
) -> None:
68-
self._scaling_factors = DCGM_SCALING_FACTORS
73+
self._scaling_factors = SCALING_FACTORS
6974
super().__init__(
7075
endpoint_url=dcgm_url,
7176
collection_interval=collection_interval,

0 commit comments

Comments
 (0)