[iris] k8s: bulk pod-metrics query to cut controller load

rjpower · rjpower · commit 81ed3a5c48ef · 2026-06-23T03:18:48.000Z
The k8s ResourceCollector polled one pod's metrics at a time: every tick
it fanned out top_pod(pod_name) -> get_namespaced_custom_object across a
32-thread pool, so an N-pod cluster issued N metrics-API requests per
tick. At ~1000 pods this is the dominant controller load.

Replace top_pod with a bulk top_pods(): a single
list_namespaced_custom_object on metrics.k8s.io (PodMetricsList), scoped
to the managed-pod label selector, returning pod_name -> usage for the
whole namespace in one request. The collector now does one API call per
tick regardless of pod count, looks up each tracked pod in the result,
and writes the rows in a single batched Table.write. The per-pod thread
pool is gone.

Also align the resource poll cadence with the metrics-server scrape
resolution: the collector ran every 5s, but metrics-server only refreshes
samples ~every 15s, so faster polling re-read identical values. Add a
resource_poll_interval (default 15s) separate from log_poll_interval.

Net effect at 1000 pods: from 1000 requests / 5s to 1 request / 15s.
diff --git a/lib/iris/src/iris/cluster/backends/k8s/fake.py b/lib/iris/src/iris/cluster/backends/k8s/fake.py
@@ -275,12 +275,14 @@ def __init__(
         )
         self._resources: dict[tuple[str, str], dict] = {}  # (kind, name) -> manifest
         self._injected_failures: dict[str, Exception] = {}
+        self._persistent_failures: dict[str, Exception] = {}
         self._logs: dict[str, str] = {}  # pod_name -> log text
         self._events: list[dict] = []
         self._exec_responses: dict[str, list[ExecResult]] = {}
         self._file_contents: dict[tuple[str, str], bytes] = {}  # (pod_name, path) -> data
         self._rm_files_calls: list[tuple[str, list[str]]] = []
         self._top_pod_overrides: dict[str, PodResourceUsage | None] = {}
+        self.top_pods_call_count = 0
         self._log_watermarks: dict[str, int] = {}  # pod_name -> bytes consumed
 
         # Pods living outside the service's own namespace, keyed by
@@ -472,12 +474,21 @@ def close(self) -> None:
 
     # -- Failure injection --
 
-    def inject_failure(self, operation: str, error: Exception) -> None:
-        """Inject a one-shot failure for the next call to *operation*."""
-        self._injected_failures[operation] = error
+    def inject_failure(self, operation: str, error: Exception, *, persistent: bool = False) -> None:
+        """Inject a failure for *operation*.
+
+        One-shot by default (consumed by the next call); ``persistent=True``
+        raises on every call until cleared — needed for operations a background
+        loop retries on its own cadence.
+        """
+        if persistent:
+            self._persistent_failures[operation] = error
+        else:
+            self._injected_failures[operation] = error
 
     def clear_failure(self, operation: str) -> None:
         self._injected_failures.pop(operation, None)
+        self._persistent_failures.pop(operation, None)
 
     # -- Node pool management --
 
@@ -610,7 +621,7 @@ def set_file_content(self, pod_name: str, path: str, data: bytes) -> None:
         self._file_contents[(pod_name, path)] = data
 
     def set_top_pod(self, pod_name: str, result: PodResourceUsage | None) -> None:
-        """Configure a specific top_pod result for a pod."""
+        """Configure a pod's reported resource usage (None = metrics absent)."""
         self._top_pod_overrides[pod_name] = result
 
     def seed_resource(self, resource: K8sResource, name: str, manifest: dict) -> None:
@@ -628,6 +639,8 @@ def seed_namespaced_pod(self, namespace: str, name: str, manifest: dict) -> None
     # -- Protocol methods --
 
     def _check_failure(self, operation: str) -> None:
+        if err := self._persistent_failures.get(operation):
+            raise err
         if err := self._injected_failures.pop(operation, None):
             raise err
 
@@ -852,13 +865,27 @@ def get_events(self, field_selector: str | None = None) -> list[dict]:
                 results.append(event)
         return results
 
-    def top_pod(self, pod_name: str) -> PodResourceUsage | None:
-        self._check_failure("top_pod")
-        if pod_name in self._top_pod_overrides:
-            return self._top_pod_overrides[pod_name]
-        if any(name == pod_name for (_, name) in self._resources):
-            return PodResourceUsage(cpu_millicores=100, memory_bytes=256 * 1024 * 1024)
-        return None
+    def top_pods(self, *, labels: dict[str, str] | None = None) -> dict[str, PodResourceUsage]:
+        self._check_failure("top_pods")
+        self.top_pods_call_count += 1
+        plural = K8sResource.PODS.plural
+        usage: dict[str, PodResourceUsage] = {}
+        for (stored_plural, name), manifest in self._resources.items():
+            if stored_plural != plural:
+                continue
+            if labels:
+                res_labels = manifest.get("metadata", {}).get("labels", {})
+                if not all(res_labels.get(k) == v for k, v in labels.items()):
+                    continue
+            usage[name] = PodResourceUsage(cpu_millicores=100, memory_bytes=256 * 1024 * 1024)
+        # Per-pod overrides win regardless of the label scope; a None override
+        # means "metrics absent" and drops the pod from the result.
+        for name, override in self._top_pod_overrides.items():
+            if override is None:
+                usage.pop(name, None)
+            else:
+                usage[name] = override
+        return usage
 
     def read_file(
         self,
diff --git a/lib/iris/src/iris/cluster/backends/k8s/service.py b/lib/iris/src/iris/cluster/backends/k8s/service.py
@@ -134,7 +134,7 @@ def get_events(
         field_selector: str | None = None,
     ) -> list[dict]: ...
 
-    def top_pod(self, pod_name: str) -> PodResourceUsage | None: ...
+    def top_pods(self, *, labels: dict[str, str] | None = None) -> dict[str, PodResourceUsage]: ...
 
     def read_file(
         self,
@@ -684,39 +684,53 @@ def rm_files(self, pod_name: str, paths: list[str], *, container: str | None = N
         """Remove files inside a Pod container. Ignores missing files."""
         self.exec(pod_name, ["rm", "-f", *paths], container=container, timeout=10)
 
-    # -- top_pod -------------------------------------------------------------
+    # -- top_pods ------------------------------------------------------------
 
-    def top_pod(self, pod_name: str) -> PodResourceUsage | None:
-        """Get CPU/memory usage for a pod via metrics.k8s.io API."""
-        logger.info("k8s: top_pod %s", pod_name)
-        with slow_log(logger, f"top_pod {pod_name}", threshold_ms=_SLOW_THRESHOLD_MS):
+    def top_pods(self, *, labels: dict[str, str] | None = None) -> dict[str, PodResourceUsage]:
+        """Bulk pod CPU/memory usage via a single metrics.k8s.io list call.
+
+        Lists ``PodMetrics`` for the namespace (optionally scoped by ``labels``)
+        in one request and returns a ``pod_name -> PodResourceUsage`` map. One
+        request covers every pod, so resource collection over N pods costs a
+        single API round-trip instead of N per-pod GETs.
+
+        A 404 means the metrics API is unavailable (metrics-server absent);
+        returns an empty map rather than raising so collection degrades quietly.
+        """
+        logger.info("k8s: top_pods labels=%s", labels)
+        kwargs = self._request_timeout_kwargs()
+        if labels:
+            kwargs["label_selector"] = _label_selector(labels)
+        with slow_log(logger, "top_pods", threshold_ms=_SLOW_THRESHOLD_MS):
             try:
-                result = self._custom.get_namespaced_custom_object(
+                result = self._custom.list_namespaced_custom_object(
                     group="metrics.k8s.io",
                     version="v1beta1",
                     namespace=self.namespace,
                     plural="pods",
-                    name=pod_name,
-                    **self._request_timeout_kwargs(),
+                    **kwargs,
                 )
             except ApiException as e:
                 if e.status == 404:
-                    return None
+                    return {}
                 raise
 
-        containers = result.get("containers", [])
-        if not containers:
-            return None
-
-        total_cpu = 0
-        total_mem = 0
-        for c in containers:
-            usage = c.get("usage", {})
-            if "cpu" in usage:
-                total_cpu += parse_k8s_cpu(usage["cpu"])
-            if "memory" in usage:
-                total_mem += parse_k8s_quantity(usage["memory"])
-        return PodResourceUsage(cpu_millicores=total_cpu, memory_bytes=total_mem)
+        usage_by_pod: dict[str, PodResourceUsage] = {}
+        for item in result.get("items", []):
+            name = item.get("metadata", {}).get("name", "")
+            containers = item.get("containers", [])
+            if not name or not containers:
+                continue
+            total_cpu = 0
+            total_mem = 0
+            for c in containers:
+                usage = c.get("usage", {})
+                if "cpu" in usage:
+                    total_cpu += parse_k8s_cpu(usage["cpu"])
+                if "memory" in usage:
+                    total_mem += parse_k8s_quantity(usage["memory"])
+            usage_by_pod[name] = PodResourceUsage(cpu_millicores=total_cpu, memory_bytes=total_mem)
+        return usage_by_pod
 
     # -- port_forward (subprocess-based) -------------------------------------
 
diff --git a/lib/iris/src/iris/cluster/backends/k8s/tasks.py b/lib/iris/src/iris/cluster/backends/k8s/tasks.py
@@ -10,7 +10,6 @@
 from __future__ import annotations
 
 import base64
-import concurrent.futures
 import hashlib
 import json
 import logging
@@ -69,7 +68,7 @@
     wrap_with_kill_watchdog,
 )
 from iris.cluster.types import JobName, TaskAttempt, WorkerId, get_gpu_count
-from iris.cluster.worker.stats import build_task_stat
+from iris.cluster.worker.stats import IrisTaskStat, build_task_stat
 from iris.rpc import controller_pb2, job_pb2, worker_pb2
 from iris.time_proto import timestamp_to_proto
 
@@ -1138,22 +1137,35 @@ class ResourceCollector:
     """Background resource usage collector that writes to ``iris.task`` stats.
 
     Same set_pods() pattern as LogCollector: the sync loop declares the
-    authoritative set of running pods once per cycle. Each tick, the collector
-    fans out to ``kubectl top`` per pod and appends one ``IrisTaskStat`` row
-    per successful read to the supplied stats Table — the same table the
-    worker daemon writes to on the GCE/TPU path, so the dashboard's
-    ``iris.task`` queries cover both runtimes uniformly.
+    authoritative set of running pods once per cycle. Each tick the collector
+    issues a single bulk metrics list (``kubectl top`` equivalent) scoped to the
+    managed-pod labels, then appends one ``IrisTaskStat`` row per tracked pod
+    that has a sample — to the same table the worker daemon writes to on the
+    GCE/TPU path, so the dashboard's ``iris.task`` queries cover both runtimes
+    uniformly. One API round-trip covers every pod, so cost is independent of
+    pod count and no per-pod thread fan-out is needed.
+
+    ``poll_interval`` defaults to the metrics-server scrape resolution (15s);
+    polling faster only re-reads the same sample.
     """
 
-    def __init__(self, kubectl: K8sService, task_stats_table: Table, *, concurrency: int = 8):
+    def __init__(
+        self,
+        kubectl: K8sService,
+        task_stats_table: Table,
+        *,
+        labels: dict[str, str] | None = None,
+        poll_interval: float = 15.0,
+    ):
         self._kubectl = kubectl
         self._table = task_stats_table
+        self._labels = labels
+        self._poll_interval = poll_interval
         # (task_id_wire, attempt_id) -> pod_name. Tuple keys carry the
         # identity needed to build IrisTaskStat without parsing strings.
         self._pods: dict[tuple[str, int], str] = {}
         self._lock = threading.Lock()
         self._stop = threading.Event()
-        self._executor = ThreadPoolExecutor(max_workers=concurrency, thread_name_prefix="resource-collect")
         self._thread = threading.Thread(target=self._run, daemon=True, name="resource-collector")
         self._thread.start()
 
@@ -1164,47 +1176,47 @@ def set_pods(self, pods: dict[tuple[str, int], str]) -> None:
 
     def _run(self) -> None:
         while not self._stop.is_set():
-            with self._lock:
-                snapshot = list(self._pods.items())
-            if snapshot:
-                futures = [self._executor.submit(self._fetch_one, key, pod_name) for key, pod_name in snapshot]
-                for f in concurrent.futures.as_completed(futures):
-                    try:
-                        f.result()
-                    except Exception:
-                        pass
-            self._stop.wait(timeout=5.0)
-
-    def _fetch_one(self, key: tuple[str, int], pod_name: str) -> None:
+            self._collect_once()
+            self._stop.wait(timeout=self._poll_interval)
+
+    def _collect_once(self) -> None:
+        with self._lock:
+            snapshot = list(self._pods.items())
+        if not snapshot:
+            return
         try:
-            top = self._kubectl.top_pod(pod_name)
+            usage_by_pod = self._kubectl.top_pods(labels=self._labels)
         except Exception as e:
-            logger.debug("ResourceCollector: top_pod raised for pod %s: %s", pod_name, e)
-            return
-        if top is None:
+            logger.debug("ResourceCollector: top_pods raised: %s", e)
             return
 
-        task_id_wire, attempt_id = key
-        usage = job_pb2.ResourceUsage(
-            cpu_millicores=top.cpu_millicores,
-            memory_mb=top.memory_bytes // (1024 * 1024),
-        )
-        stat = build_task_stat(
-            task_id=task_id_wire,
-            attempt_id=attempt_id,
-            # Pod name is the per-attempt platform identity on k8s, mirroring
-            # worker_id on the GCE/TPU path.
-            worker_id=pod_name,
-            usage=usage,
-        )
+        stats: list[IrisTaskStat] = []
+        for (task_id_wire, attempt_id), pod_name in snapshot:
+            top = usage_by_pod.get(pod_name)
+            if top is None:
+                continue
+            stats.append(
+                build_task_stat(
+                    task_id=task_id_wire,
+                    attempt_id=attempt_id,
+                    # Pod name is the per-attempt platform identity on k8s,
+                    # mirroring worker_id on the GCE/TPU path.
+                    worker_id=pod_name,
+                    usage=job_pb2.ResourceUsage(
+                        cpu_millicores=top.cpu_millicores,
+                        memory_mb=top.memory_bytes // (1024 * 1024),
+                    ),
+                )
+            )
+        if not stats:
+            return
         try:
-            self._table.write([stat])
+            self._table.write(stats)
         except Exception:
             logger.debug("ResourceCollector: write to iris.task failed", exc_info=True)
 
     def close(self) -> None:
         self._stop.set()
-        self._executor.shutdown(wait=False)
         self._thread.join(timeout=5)
 
 
@@ -1310,8 +1322,14 @@ class K8sTaskProvider:
     # Pre-resolved iris.profile Table handle injected by the controller
     # alongside task_stats_table. None in test mode.
     profile_table: Table | None = None
+    # Log fetch fan-out: logs have no bulk API, so each pod is streamed on its
+    # own worker thread.
     poll_concurrency: int = 32
     log_poll_interval: float = 15.0
+    # Resource-usage poll cadence. Defaults to the metrics-server scrape
+    # resolution (15s) — sampling faster only re-reads the same value. One bulk
+    # metrics list per tick covers every managed pod (see ResourceCollector).
+    resource_poll_interval: float = 15.0
     # Cluster-wide kubectl scans (pod list, stray-pod GC, pod poll, node refresh)
     # are coarse-grained: the controller ticks reconcile at poll_interval (1s),
     # but these LISTs run at most once per cluster_scan_interval to bound kubectl
@@ -1335,7 +1353,10 @@ def _ensure_resource_collector(self) -> ResourceCollector | None:
             return None
         if self._resource_collector is None:
             self._resource_collector = ResourceCollector(
-                self.kubectl, self.task_stats_table, concurrency=self.poll_concurrency
+                self.kubectl,
+                self.task_stats_table,
+                labels=_MANAGED_POD_LABELS,
+                poll_interval=self.resource_poll_interval,
             )
         return self._resource_collector
 
diff --git a/lib/iris/tests/cluster/backends/k8s/conftest.py b/lib/iris/tests/cluster/backends/k8s/conftest.py
@@ -54,6 +54,7 @@ def provider(k8s, log_client, task_stats_table):
         log_client=log_client,
         task_stats_table=task_stats_table,
         log_poll_interval=1.0,
+        resource_poll_interval=0.5,
         cluster_scan_interval=0.0,
     )
     yield p
diff --git a/lib/iris/tests/cluster/backends/k8s/test_provider.py b/lib/iris/tests/cluster/backends/k8s/test_provider.py

@@ -54,6 +54,7 @@ def provider(k8s, log_client, task_stats_table):
         log_client=log_client,
         task_stats_table=task_stats_table,
         log_poll_interval=1.0,
+        resource_poll_interval=0.5,
         cluster_scan_interval=0.0,
     )
     yield p