fix: keep legacy metric conversion local

seedspirit · seedspirit · commit 5c54cab7659f · 2026-05-12T13:59:20.000+09:00
diff --git a/src/ai/backend/agent/stats.py b/src/ai/backend/agent/stats.py
@@ -26,17 +26,15 @@
 from ai.backend.common import msgpack
 from ai.backend.common.identity import is_containerized
 from ai.backend.common.metrics.metric import StageObserver
-from ai.backend.common.metrics.types import (
-    UTILIZATION_METRIC_INTERVAL,
-    MetricValue,
-    MovingStatValue,
-)
+from ai.backend.common.metrics.types import UTILIZATION_METRIC_INTERVAL
 from ai.backend.common.types import (
     PID,
     ContainerId,
     DeviceId,
     KernelId,
     MetricKey,
+    MetricValue,
+    MovingStatValue,
     SessionId,
     SlotName,
 )
diff --git a/src/ai/backend/appproxy/worker/types.py b/src/ai/backend/appproxy/worker/types.py
@@ -42,9 +42,10 @@
     SafeGauge,
     SafeHistogram,
 )
-from ai.backend.common.metrics.types import MetricValue, MovingStatValue
 from ai.backend.common.types import (
     MetricKey,
+    MetricValue,
+    MovingStatValue,
     RuntimeVariant,
 )
 
diff --git a/src/ai/backend/client/output/formatters.py b/src/ai/backend/client/output/formatters.py
@@ -9,7 +9,7 @@
 
 import humanize
 
-from ai.backend.common.metrics.types import MetricValue
+from ai.backend.common.types import MetricValue
 
 from .types import AbstractOutputFormatter, FieldSpec
 
diff --git a/src/ai/backend/common/metrics/types.py b/src/ai/backend/common/metrics/types.py
@@ -1,107 +1,11 @@
-from typing import Final, TypedDict
+from typing import Final
 
 UNDEFINED: Final[str] = "undefined"
 
-
-class MovingStatValue(TypedDict):
-    min: str
-    max: str
-    sum: str
-    avg: str
-    diff: str
-    rate: str
-    version: int | None  # for legacy client compatibility
-
-
-MetricValue = TypedDict(
-    "MetricValue",
-    {
-        "current": str,
-        "capacity": str,
-        "pct": str,
-        "unit_hint": str,
-        "stats.min": str,
-        "stats.max": str,
-        "stats.sum": str,
-        "stats.avg": str,
-        "stats.diff": str,
-        "stats.rate": str,
-        "stats.version": int | None,
-    },
-)
-
-
-def make_default_metric_value(unit_hint: str) -> MetricValue:
-    """Return a `MetricValue` populated with neutral defaults.
-
-    All numeric string fields are `"0"` (including `capacity`, matching the
-    legacy Valkey shape where every metric carried a string capacity).
-    `unit_hint` is supplied by the caller.
-    """
-    return MetricValue({
-        "current": "0",
-        "capacity": "0",
-        "pct": "0",
-        "unit_hint": unit_hint,
-        "stats.min": "0",
-        "stats.max": "0",
-        "stats.sum": "0",
-        "stats.avg": "0",
-        "stats.diff": "0",
-        "stats.rate": "0",
-        "stats.version": None,
-    })
-
-
 UTILIZATION_METRIC_INTERVAL: Final[float] = 5.0
 UTILIZATION_METRIC_DETENTION: Final[float] = 600.0  # 10 minutes
 
 CONTAINER_UTILIZATION_METRIC_NAME: Final[str] = "backendai_container_utilization"
 CONTAINER_UTILIZATION_METRIC_LABEL_NAME: Final[str] = "container_metric_name"
 DEVICE_UTILIZATION_METRIC_LABEL_NAME: Final[str] = "device_metric_name"
 PROCESS_UTILIZATION_METRIC_LABEL_NAME: Final[str] = "process_metric_name"
-
-# Metric-name classification used by the legacy live_stat dict converter.
-# These mirror the semantics that Worker's MovingStatistics produced when
-# kernel stats were stored in Valkey:
-#   - RATE_STAT_METRICS: stats.rate is meaningful (rate of change per second).
-#   - DIFF_STAT_METRICS: stats.diff is meaningful (delta over the last window).
-RATE_STAT_METRICS: Final[frozenset[str]] = frozenset({"net_rx", "net_tx"})
-DIFF_STAT_METRICS: Final[frozenset[str]] = frozenset({"cpu_util"})
-
-# Per-metric unit hint emitted by the agent (source of truth: src/ai/backend/agent/docker/intrinsic.py).
-METRIC_UNIT_HINTS: Final[dict[str, str]] = {
-    "cpu_used": "msec",
-    "cpu_util": "percent",
-    "mem": "bytes",
-    "net_rx": "bps",
-    "net_tx": "bps",
-    "io_read": "bytes",
-    "io_write": "bytes",
-    "io_scratch_size": "bytes",
-}
-
-
-def resolve_unit_hint(metric_name: str) -> str:
-    """Return the unit_hint for a Backend.AI container metric name.
-
-    Prometheus does not carry the agent-side `unit_hint` in its samples, so the
-    manager has to recover it from the metric name alone. Lookup order:
-
-      1. Explicit registration in :data:`METRIC_UNIT_HINTS` (highest priority).
-      2. Naming-convention fallback for plugin metrics that follow Backend.AI
-         conventions (e.g., `cuda_util`, `gpu_mem`, `tpu_util`).
-      3. The metric_name itself as a last resort — preserves the sample data
-         and surfaces the missing registration to the WebUI via the response.
-    """
-    if metric_name in METRIC_UNIT_HINTS:
-        return METRIC_UNIT_HINTS[metric_name]
-    if metric_name.endswith("_util"):
-        return "percent"
-    if metric_name == "mem" or metric_name.endswith("_mem"):
-        return "bytes"
-    if metric_name.startswith("io_"):
-        return "bytes"
-    if metric_name.startswith("net_"):
-        return "bps"
-    return metric_name
diff --git a/src/ai/backend/common/types.py b/src/ai/backend/common/types.py
@@ -110,6 +110,7 @@
     "KernelEnqueueingConfig",
     "KernelId",
     "MetricKey",
+    "MetricValue",
     "ModelServiceProfile",
     "ModelServiceStatus",
     "MountExpression",
@@ -118,6 +119,7 @@
     "MountPermissionLiteral",
     "MountPoint",
     "MountTypes",
+    "MovingStatValue",
     "PreemptionMode",
     "PreemptionOrder",
     "PromMetric",
@@ -567,6 +569,34 @@ class AbuseReport(TypedDict):
     abuse_report: str | None
 
 
+class MovingStatValue(TypedDict):
+    """Statistics record with min/max/sum/avg/diff/rate, each stored as a string.
+
+    ``version`` is an optional integer kept for legacy client compatibility.
+    """
+    min: str
+    max: str
+    sum: str
+    avg: str
+    diff: str
+    rate: str
+    version: int | None  # for legacy client compatibility
+
+
+# Flat metric record. The functional TypedDict form is required here because
+# the "stats.*" keys contain dots and so cannot be written as class attributes.
+# NOTE: `capacity` may be None; check for None before converting it to a number.
+MetricValue = TypedDict(
+    "MetricValue",
+    {
+        "current": str,
+        "capacity": str | None,
+        "pct": str,
+        "unit_hint": str,
+        "stats.min": str,
+        "stats.max": str,
+        "stats.sum": str,
+        "stats.avg": str,
+        "stats.diff": str,
+        "stats.rate": str,
+        "stats.version": int | None,
+    },
+)
+
+
 class IntrinsicSlotNames(enum.Enum):
     CPU = SlotName("cpu")
     MEMORY = SlotName("mem")
diff --git a/src/ai/backend/manager/api/gql_legacy/stat_converter.py b/src/ai/backend/manager/api/gql_legacy/stat_converter.py
@@ -1,17 +1,59 @@
 from collections.abc import Iterable
+from typing import Final
 
 from ai.backend.common.clients.prometheus.metric_types import KernelLiveStatBatchResult
 from ai.backend.common.clients.prometheus.types import MetricValue as PrometheusMetricValue
 from ai.backend.common.clients.prometheus.types import ValueType
-from ai.backend.common.metrics.types import (
-    DIFF_STAT_METRICS,
-    RATE_STAT_METRICS,
-    UTILIZATION_METRIC_INTERVAL,
-    MetricValue,
-    make_default_metric_value,
-    resolve_unit_hint,
-)
-from ai.backend.common.types import KernelId
+from ai.backend.common.metrics.types import UTILIZATION_METRIC_INTERVAL
+from ai.backend.common.types import KernelId, MetricValue
+
+# Metric-name classification used only while adapting Prometheus samples back
+# into the legacy live_stat dict that Graphene/WebUI still expects.
+#   - _RATE_STAT_METRICS: stats.rate is meaningful (rate of change per second).
+#   - _DIFF_STAT_METRICS: stats.diff is meaningful (delta over the last window).
+_RATE_STAT_METRICS: Final[frozenset[str]] = frozenset({"net_rx", "net_tx"})
+_DIFF_STAT_METRICS: Final[frozenset[str]] = frozenset({"cpu_util"})
+
+# Per-metric unit hint emitted by the agent (source of truth:
+# src/ai/backend/agent/docker/intrinsic.py).
+# Metric names missing from this table are resolved by the naming-convention
+# fallbacks in `_resolve_unit_hint`.
+_METRIC_UNIT_HINTS: Final[dict[str, str]] = {
+    "cpu_used": "msec",
+    "cpu_util": "percent",
+    "mem": "bytes",
+    "net_rx": "bps",
+    "net_tx": "bps",
+    "io_read": "bytes",
+    "io_write": "bytes",
+    "io_scratch_size": "bytes",
+}
+
+
+def _make_default_metric_value(unit_hint: str) -> MetricValue:
+    """Return a `MetricValue` populated with neutral defaults.
+
+    Every numeric string field is `"0"`, `stats.version` is `None`, and
+    `unit_hint` is taken from the caller. Note that `capacity` defaults to
+    `"0"` here, not `None`, even though the `MetricValue` type allows `None`.
+    """
+    return MetricValue({
+        "current": "0",
+        "capacity": "0",
+        "pct": "0",
+        "unit_hint": unit_hint,
+        "stats.min": "0",
+        "stats.max": "0",
+        "stats.sum": "0",
+        "stats.avg": "0",
+        "stats.diff": "0",
+        "stats.rate": "0",
+        "stats.version": None,
+    })
+
+
+def _resolve_unit_hint(metric_name: str) -> str:
+    """Return the unit_hint for a container metric name.
+
+    Lookup order:
+
+      1. Explicit registration in `_METRIC_UNIT_HINTS` (highest priority).
+      2. Naming-convention fallback: a `_util` suffix maps to "percent",
+         `mem` or a `_mem` suffix to "bytes", an `io_` prefix to "bytes",
+         and a `net_` prefix to "bps".
+      3. The metric_name itself, when nothing above matches.
+    """
+    if metric_name in _METRIC_UNIT_HINTS:
+        return _METRIC_UNIT_HINTS[metric_name]
+    if metric_name.endswith("_util"):
+        return "percent"
+    if metric_name == "mem" or metric_name.endswith("_mem"):
+        return "bytes"
+    if metric_name.startswith("io_"):
+        return "bytes"
+    if metric_name.startswith("net_"):
+        return "bps"
+    return metric_name
 
 
 class LegacyLiveStatConverter:
@@ -53,17 +95,17 @@ def _convert_one_kernel(cls, values: Iterable[PrometheusMetricValue]) -> dict[st
     def _convert_metric_samples(
         metric_name: str, samples: list[PrometheusMetricValue]
     ) -> MetricValue:
-        # `resolve_unit_hint` falls back to naming conventions and finally
+        # `_resolve_unit_hint` falls back to naming conventions and finally
         # the metric_name itself for unregistered plugin metrics.
-        unit_hint = resolve_unit_hint(metric_name)
-        out = make_default_metric_value(unit_hint=unit_hint)
+        unit_hint = _resolve_unit_hint(metric_name)
+        out = _make_default_metric_value(unit_hint=unit_hint)
 
         currents = [s.value for s in samples if s.value_type is ValueType.CURRENT]
         capacities = [s.value for s in samples if s.value_type is ValueType.CAPACITY]
         pcts = [s.value for s in samples if s.value_type is ValueType.PCT]
 
-        is_rate_metric = metric_name in RATE_STAT_METRICS
-        is_diff_metric = metric_name in DIFF_STAT_METRICS
+        is_rate_metric = metric_name in _RATE_STAT_METRICS
+        is_diff_metric = metric_name in _DIFF_STAT_METRICS
 
         if currents:
             # RATE/DIFF: prefer the rate/diff query result over the raw gauge,
@@ -96,7 +138,10 @@ def _convert_metric_samples(
         else:
             try:
                 current_value = float(out["current"])
-                capacity_value = float(out["capacity"])
+                capacity = out["capacity"]
+                if capacity is None:
+                    return out
+                capacity_value = float(capacity)
                 if capacity_value > 0:
                     out["pct"] = f"{current_value / capacity_value * 100:.2f}"
             except ValueError:
diff --git a/tests/unit/manager/api/gql_legacy/test_stat_converter.py b/tests/unit/manager/api/gql_legacy/test_stat_converter.py
@@ -6,8 +6,8 @@
 
 from ai.backend.common.clients.prometheus.metric_types import KernelLiveStatBatchResult
 from ai.backend.common.clients.prometheus.types import MetricValue, ValueType
-from ai.backend.common.metrics.types import MetricValue as LegacyMetricValue
 from ai.backend.common.types import KernelId
+from ai.backend.common.types import MetricValue as LegacyMetricValue
 from ai.backend.manager.api.gql_legacy.stat_converter import LegacyLiveStatConverter
 
 

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -42,9 +42,10 @@` |
| `42` | `42` | `SafeGauge,` |
| `43` | `43` | `SafeHistogram,` |
| `44` | `44` | `)` |
| `45` |  | `-from ai.backend.common.metrics.types import MetricValue, MovingStatValue` |
| `46` | `45` | `from ai.backend.common.types import (` |
| `47` | `46` | `MetricKey,` |
|  | `47` | `+ MetricValue,` |
|  | `48` | `+ MovingStatValue,` |
| `48` | `49` | `RuntimeVariant,` |
| `49` | `50` | `)` |
| `50` | `51` |  |