|
1 | 1 | from collections.abc import Iterable |
| 2 | +from typing import Final |
2 | 3 |
|
3 | 4 | from ai.backend.common.clients.prometheus.metric_types import KernelLiveStatBatchResult |
4 | 5 | from ai.backend.common.clients.prometheus.types import MetricValue as PrometheusMetricValue |
5 | 6 | from ai.backend.common.clients.prometheus.types import ValueType |
6 | | -from ai.backend.common.metrics.types import ( |
7 | | - DIFF_STAT_METRICS, |
8 | | - RATE_STAT_METRICS, |
9 | | - UTILIZATION_METRIC_INTERVAL, |
10 | | - MetricValue, |
11 | | - make_default_metric_value, |
12 | | - resolve_unit_hint, |
13 | | -) |
14 | | -from ai.backend.common.types import KernelId |
| 7 | +from ai.backend.common.metrics.types import UTILIZATION_METRIC_INTERVAL |
| 8 | +from ai.backend.common.types import KernelId, MetricValue |
| 9 | + |
# Classification of metric names, consulted only while adapting Prometheus
# samples back into the legacy live_stat dict that Graphene/WebUI still expects.
_RATE_STAT_METRICS: Final[frozenset[str]] = frozenset(("net_rx", "net_tx"))
_DIFF_STAT_METRICS: Final[frozenset[str]] = frozenset(("cpu_util",))

# Unit hint the agent emits for each metric (source of truth:
# src/ai/backend/agent/docker/intrinsic.py).  Names missing from this table
# are resolved by `_resolve_unit_hint` through naming conventions.
_METRIC_UNIT_HINTS: Final[dict[str, str]] = dict(
    (
        ("cpu_used", "msec"),
        ("cpu_util", "percent"),
        ("mem", "bytes"),
        ("net_rx", "bps"),
        ("net_tx", "bps"),
        ("io_read", "bytes"),
        ("io_write", "bytes"),
        ("io_scratch_size", "bytes"),
    )
)
| 27 | + |
| 28 | + |
def _make_default_metric_value(unit_hint: str) -> MetricValue:
    """Build a zero-filled metric entry labelled with *unit_hint*.

    Every value field and every ``stats.*`` aggregate starts as the string
    ``"0"``; ``stats.version`` starts as ``None``.  Callers overwrite the
    fields for which they actually have samples.

    Args:
        unit_hint: Unit label stored verbatim under the ``"unit_hint"`` key.

    Returns:
        A fresh ``MetricValue`` holding the default fields.
    """
    zero = "0"
    defaults = MetricValue({
        "current": zero,
        "capacity": zero,
        "pct": zero,
        "unit_hint": unit_hint,
        "stats.min": zero,
        "stats.max": zero,
        "stats.sum": zero,
        "stats.avg": zero,
        "stats.diff": zero,
        "stats.rate": zero,
        "stats.version": None,
    })
    return defaults
| 43 | + |
| 44 | + |
def _resolve_unit_hint(metric_name: str) -> str:
    """Return the unit hint to report for *metric_name*.

    Resolution order:

    1. An explicit entry in ``_METRIC_UNIT_HINTS``.
    2. Naming conventions: ``*_util`` -> ``"percent"``; ``mem`` / ``*_mem``
       and ``io_*`` -> ``"bytes"``; ``net_*`` -> ``"bps"``.
    3. The metric name itself, when nothing above matches.

    Args:
        metric_name: Name of the metric being converted.

    Returns:
        The resolved unit hint string.
    """
    registered_hint = _METRIC_UNIT_HINTS.get(metric_name)
    if registered_hint is not None:
        return registered_hint

    # The suffix check for "_util" must run before the prefix checks so that
    # e.g. an "io_*_util" name still resolves to "percent".
    if metric_name.endswith("_util"):
        return "percent"

    is_memory_metric = metric_name == "mem" or metric_name.endswith("_mem")
    if is_memory_metric or metric_name.startswith("io_"):
        return "bytes"

    if metric_name.startswith("net_"):
        return "bps"

    # No table entry and no convention matched: fall back to the name itself.
    return metric_name
15 | 57 |
|
16 | 58 |
|
17 | 59 | class LegacyLiveStatConverter: |
@@ -53,17 +95,17 @@ def _convert_one_kernel(cls, values: Iterable[PrometheusMetricValue]) -> dict[st |
53 | 95 | def _convert_metric_samples( |
54 | 96 | metric_name: str, samples: list[PrometheusMetricValue] |
55 | 97 | ) -> MetricValue: |
56 | | - # `resolve_unit_hint` falls back to naming conventions and finally |
| 98 | + # `_resolve_unit_hint` falls back to naming conventions and finally |
57 | 99 | # the metric_name itself for unregistered plugin metrics. |
58 | | - unit_hint = resolve_unit_hint(metric_name) |
59 | | - out = make_default_metric_value(unit_hint=unit_hint) |
| 100 | + unit_hint = _resolve_unit_hint(metric_name) |
| 101 | + out = _make_default_metric_value(unit_hint=unit_hint) |
60 | 102 |
|
61 | 103 | currents = [s.value for s in samples if s.value_type is ValueType.CURRENT] |
62 | 104 | capacities = [s.value for s in samples if s.value_type is ValueType.CAPACITY] |
63 | 105 | pcts = [s.value for s in samples if s.value_type is ValueType.PCT] |
64 | 106 |
|
65 | | - is_rate_metric = metric_name in RATE_STAT_METRICS |
66 | | - is_diff_metric = metric_name in DIFF_STAT_METRICS |
| 107 | + is_rate_metric = metric_name in _RATE_STAT_METRICS |
| 108 | + is_diff_metric = metric_name in _DIFF_STAT_METRICS |
67 | 109 |
|
68 | 110 | if currents: |
69 | 111 | # RATE/DIFF: prefer the rate/diff query result over the raw gauge, |
@@ -96,7 +138,10 @@ def _convert_metric_samples( |
96 | 138 | else: |
97 | 139 | try: |
98 | 140 | current_value = float(out["current"]) |
99 | | - capacity_value = float(out["capacity"]) |
| 141 | + capacity = out["capacity"] |
| 142 | + if capacity is None: |
| 143 | + return out |
| 144 | + capacity_value = float(capacity) |
100 | 145 | if capacity_value > 0: |
101 | 146 | out["pct"] = f"{current_value / capacity_value * 100:.2f}" |
102 | 147 | except ValueError: |
|
0 commit comments