Skip to content

Commit 5c54cab

Browse files
committed
fix: keep legacy metric conversion local
1 parent 1a2dbeb commit 5c54cab

7 files changed

Lines changed: 98 additions & 120 deletions

File tree

src/ai/backend/agent/stats.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -26,17 +26,15 @@
2626
from ai.backend.common import msgpack
2727
from ai.backend.common.identity import is_containerized
2828
from ai.backend.common.metrics.metric import StageObserver
29-
from ai.backend.common.metrics.types import (
30-
UTILIZATION_METRIC_INTERVAL,
31-
MetricValue,
32-
MovingStatValue,
33-
)
29+
from ai.backend.common.metrics.types import UTILIZATION_METRIC_INTERVAL
3430
from ai.backend.common.types import (
3531
PID,
3632
ContainerId,
3733
DeviceId,
3834
KernelId,
3935
MetricKey,
36+
MetricValue,
37+
MovingStatValue,
4038
SessionId,
4139
SlotName,
4240
)

src/ai/backend/appproxy/worker/types.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,9 +42,10 @@
4242
SafeGauge,
4343
SafeHistogram,
4444
)
45-
from ai.backend.common.metrics.types import MetricValue, MovingStatValue
4645
from ai.backend.common.types import (
4746
MetricKey,
47+
MetricValue,
48+
MovingStatValue,
4849
RuntimeVariant,
4950
)
5051

src/ai/backend/client/output/formatters.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99

1010
import humanize
1111

12-
from ai.backend.common.metrics.types import MetricValue
12+
from ai.backend.common.types import MetricValue
1313

1414
from .types import AbstractOutputFormatter, FieldSpec
1515

Lines changed: 1 addition & 97 deletions
Original file line numberDiff line numberDiff line change
@@ -1,107 +1,11 @@
1-
from typing import Final, TypedDict
1+
from typing import Final
22

33
UNDEFINED: Final[str] = "undefined"
44

5-
6-
class MovingStatValue(TypedDict):
7-
min: str
8-
max: str
9-
sum: str
10-
avg: str
11-
diff: str
12-
rate: str
13-
version: int | None # for legacy client compatibility
14-
15-
16-
MetricValue = TypedDict(
17-
"MetricValue",
18-
{
19-
"current": str,
20-
"capacity": str,
21-
"pct": str,
22-
"unit_hint": str,
23-
"stats.min": str,
24-
"stats.max": str,
25-
"stats.sum": str,
26-
"stats.avg": str,
27-
"stats.diff": str,
28-
"stats.rate": str,
29-
"stats.version": int | None,
30-
},
31-
)
32-
33-
34-
def make_default_metric_value(unit_hint: str) -> MetricValue:
    """Build a `MetricValue` filled with neutral defaults.

    Every numeric string field starts at `"0"`. That includes `capacity`,
    which matches the legacy Valkey shape where each metric carried a string
    capacity. `stats.version` starts as `None`.

    Args:
        unit_hint: Unit label stored verbatim in the `unit_hint` field.

    Returns:
        A freshly created `MetricValue` dict.
    """
    zero = "0"
    defaults: MetricValue = {
        "current": zero,
        "capacity": zero,
        "pct": zero,
        "unit_hint": unit_hint,
        "stats.min": zero,
        "stats.max": zero,
        "stats.sum": zero,
        "stats.avg": zero,
        "stats.diff": zero,
        "stats.rate": zero,
        "stats.version": None,
    }
    return defaults
54-
55-
565
UTILIZATION_METRIC_INTERVAL: Final[float] = 5.0
576
UTILIZATION_METRIC_DETENTION: Final[float] = 600.0 # 10 minutes
587

598
CONTAINER_UTILIZATION_METRIC_NAME: Final[str] = "backendai_container_utilization"
609
CONTAINER_UTILIZATION_METRIC_LABEL_NAME: Final[str] = "container_metric_name"
6110
DEVICE_UTILIZATION_METRIC_LABEL_NAME: Final[str] = "device_metric_name"
6211
PROCESS_UTILIZATION_METRIC_LABEL_NAME: Final[str] = "process_metric_name"
63-
64-
# Classification of metric names for the legacy live_stat dict converter.
# It mirrors the semantics that Worker's MovingStatistics produced while
# kernel stats were stored in Valkey:
#   - RATE_STAT_METRICS: stats.rate is meaningful (rate of change per second).
#   - DIFF_STAT_METRICS: stats.diff is meaningful (delta over the last window).
RATE_STAT_METRICS: Final[frozenset[str]] = frozenset(("net_rx", "net_tx"))
DIFF_STAT_METRICS: Final[frozenset[str]] = frozenset(("cpu_util",))
71-
72-
# Per-metric unit hint emitted by the agent
# (source of truth: src/ai/backend/agent/docker/intrinsic.py).
METRIC_UNIT_HINTS: Final[dict[str, str]] = {
    "cpu_used": "msec",
    "cpu_util": "percent",
    "mem": "bytes",
    "net_rx": "bps",
    "net_tx": "bps",
    "io_read": "bytes",
    "io_write": "bytes",
    "io_scratch_size": "bytes",
}


def resolve_unit_hint(metric_name: str) -> str:
    """Return the unit_hint for a Backend.AI container metric name.

    Prometheus does not carry the agent-side `unit_hint` in its samples, so
    the manager has to recover it from the metric name alone. Lookup order:

    1. Explicit registration in :data:`METRIC_UNIT_HINTS` (highest priority).
    2. Naming-convention fallback for plugin metrics that follow Backend.AI
       conventions (e.g., `cuda_util`, `gpu_mem`, `tpu_util`).
    3. The metric name itself as a last resort, which preserves the sample
       data and surfaces the missing registration to the WebUI.

    Args:
        metric_name: Metric name as it appears in the Prometheus sample.

    Returns:
        The resolved unit hint, or `metric_name` unchanged when no rule matches.
    """
    # One dictionary lookup instead of a membership test followed by indexing.
    registered = METRIC_UNIT_HINTS.get(metric_name)
    if registered is not None:
        return registered
    # `_util` is checked first so that names like `io_util` resolve to percent.
    if metric_name.endswith("_util"):
        return "percent"
    # The explicit "mem" comparison is a safety net in case the name is ever
    # dropped from the table; today the table lookup above already covers it.
    if (
        metric_name == "mem"
        or metric_name.endswith("_mem")
        or metric_name.startswith("io_")
    ):
        return "bytes"
    if metric_name.startswith("net_"):
        return "bps"
    return metric_name

src/ai/backend/common/types.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,7 @@
110110
"KernelEnqueueingConfig",
111111
"KernelId",
112112
"MetricKey",
113+
"MetricValue",
113114
"ModelServiceProfile",
114115
"ModelServiceStatus",
115116
"MountExpression",
@@ -118,6 +119,7 @@
118119
"MountPermissionLiteral",
119120
"MountPoint",
120121
"MountTypes",
122+
"MovingStatValue",
121123
"PreemptionMode",
122124
"PreemptionOrder",
123125
"PromMetric",
@@ -567,6 +569,34 @@ class AbuseReport(TypedDict):
567569
abuse_report: str | None
568570

569571

572+
class MovingStatValue(TypedDict):
573+
min: str
574+
max: str
575+
sum: str
576+
avg: str
577+
diff: str
578+
rate: str
579+
version: int | None # for legacy client compatibility
580+
581+
582+
MetricValue = TypedDict(
583+
"MetricValue",
584+
{
585+
"current": str,
586+
"capacity": str | None,
587+
"pct": str,
588+
"unit_hint": str,
589+
"stats.min": str,
590+
"stats.max": str,
591+
"stats.sum": str,
592+
"stats.avg": str,
593+
"stats.diff": str,
594+
"stats.rate": str,
595+
"stats.version": int | None,
596+
},
597+
)
598+
599+
570600
class IntrinsicSlotNames(enum.Enum):
571601
CPU = SlotName("cpu")
572602
MEMORY = SlotName("mem")

src/ai/backend/manager/api/gql_legacy/stat_converter.py

Lines changed: 60 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,59 @@
11
from collections.abc import Iterable
2+
from typing import Final
23

34
from ai.backend.common.clients.prometheus.metric_types import KernelLiveStatBatchResult
45
from ai.backend.common.clients.prometheus.types import MetricValue as PrometheusMetricValue
56
from ai.backend.common.clients.prometheus.types import ValueType
6-
from ai.backend.common.metrics.types import (
7-
DIFF_STAT_METRICS,
8-
RATE_STAT_METRICS,
9-
UTILIZATION_METRIC_INTERVAL,
10-
MetricValue,
11-
make_default_metric_value,
12-
resolve_unit_hint,
13-
)
14-
from ai.backend.common.types import KernelId
7+
from ai.backend.common.metrics.types import UTILIZATION_METRIC_INTERVAL
8+
from ai.backend.common.types import KernelId, MetricValue
9+
10+
# Metric-name classification, needed only while Prometheus samples are adapted
# back into the legacy live_stat dict that Graphene/WebUI still expects.
_RATE_STAT_METRICS: Final[frozenset[str]] = frozenset(("net_rx", "net_tx"))
_DIFF_STAT_METRICS: Final[frozenset[str]] = frozenset(("cpu_util",))

# Unit hint per metric as emitted by the agent
# (source of truth: src/ai/backend/agent/docker/intrinsic.py).
_METRIC_UNIT_HINTS: Final[dict[str, str]] = dict(
    cpu_used="msec",
    cpu_util="percent",
    mem="bytes",
    net_rx="bps",
    net_tx="bps",
    io_read="bytes",
    io_write="bytes",
    io_scratch_size="bytes",
)
27+
28+
29+
def _make_default_metric_value(unit_hint: str) -> MetricValue:
    """Create a `MetricValue` in which every numeric string field is `"0"`.

    `capacity` also starts as the string `"0"`, and `stats.version` starts as
    `None`. `unit_hint` is stored exactly as given.

    Args:
        unit_hint: Unit label to place in the `unit_hint` field.

    Returns:
        A new `MetricValue` dict holding the defaults.
    """
    zeroed = "0"
    return MetricValue(
        {
            "current": zeroed,
            "capacity": zeroed,
            "pct": zeroed,
            "unit_hint": unit_hint,
            "stats.min": zeroed,
            "stats.max": zeroed,
            "stats.sum": zeroed,
            "stats.avg": zeroed,
            "stats.diff": zeroed,
            "stats.rate": zeroed,
            "stats.version": None,
        }
    )
43+
44+
45+
def _resolve_unit_hint(metric_name: str) -> str:
    """Return the unit hint for a container metric name.

    Resolution order:

    1. An explicit entry in `_METRIC_UNIT_HINTS`.
    2. Naming conventions: a `_util` suffix maps to `"percent"`, `mem`, a
       `_mem` suffix or an `io_` prefix maps to `"bytes"`, and a `net_`
       prefix maps to `"bps"`.
    3. The metric name itself, for unregistered plugin metrics.

    Args:
        metric_name: Metric name to resolve.

    Returns:
        The resolved unit hint, or `metric_name` unchanged when no rule matches.
    """
    # One dictionary lookup instead of a membership test followed by indexing.
    registered = _METRIC_UNIT_HINTS.get(metric_name)
    if registered is not None:
        return registered
    # `_util` is checked first so that names like `io_util` resolve to percent.
    if metric_name.endswith("_util"):
        return "percent"
    # The explicit "mem" comparison is a safety net in case the name is ever
    # dropped from the table; today the table lookup above already covers it.
    if (
        metric_name == "mem"
        or metric_name.endswith("_mem")
        or metric_name.startswith("io_")
    ):
        return "bytes"
    if metric_name.startswith("net_"):
        return "bps"
    return metric_name
1557

1658

1759
class LegacyLiveStatConverter:
@@ -53,17 +95,17 @@ def _convert_one_kernel(cls, values: Iterable[PrometheusMetricValue]) -> dict[st
5395
def _convert_metric_samples(
5496
metric_name: str, samples: list[PrometheusMetricValue]
5597
) -> MetricValue:
56-
# `resolve_unit_hint` falls back to naming conventions and finally
98+
# `_resolve_unit_hint` falls back to naming conventions and finally
5799
# the metric_name itself for unregistered plugin metrics.
58-
unit_hint = resolve_unit_hint(metric_name)
59-
out = make_default_metric_value(unit_hint=unit_hint)
100+
unit_hint = _resolve_unit_hint(metric_name)
101+
out = _make_default_metric_value(unit_hint=unit_hint)
60102

61103
currents = [s.value for s in samples if s.value_type is ValueType.CURRENT]
62104
capacities = [s.value for s in samples if s.value_type is ValueType.CAPACITY]
63105
pcts = [s.value for s in samples if s.value_type is ValueType.PCT]
64106

65-
is_rate_metric = metric_name in RATE_STAT_METRICS
66-
is_diff_metric = metric_name in DIFF_STAT_METRICS
107+
is_rate_metric = metric_name in _RATE_STAT_METRICS
108+
is_diff_metric = metric_name in _DIFF_STAT_METRICS
67109

68110
if currents:
69111
# RATE/DIFF: prefer the rate/diff query result over the raw gauge,
@@ -96,7 +138,10 @@ def _convert_metric_samples(
96138
else:
97139
try:
98140
current_value = float(out["current"])
99-
capacity_value = float(out["capacity"])
141+
capacity = out["capacity"]
142+
if capacity is None:
143+
return out
144+
capacity_value = float(capacity)
100145
if capacity_value > 0:
101146
out["pct"] = f"{current_value / capacity_value * 100:.2f}"
102147
except ValueError:

tests/unit/manager/api/gql_legacy/test_stat_converter.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,8 @@
66

77
from ai.backend.common.clients.prometheus.metric_types import KernelLiveStatBatchResult
88
from ai.backend.common.clients.prometheus.types import MetricValue, ValueType
9-
from ai.backend.common.metrics.types import MetricValue as LegacyMetricValue
109
from ai.backend.common.types import KernelId
10+
from ai.backend.common.types import MetricValue as LegacyMetricValue
1111
from ai.backend.manager.api.gql_legacy.stat_converter import LegacyLiveStatConverter
1212

1313

0 commit comments

Comments
 (0)