[iris] k8s: address lint review on resource collector

rjpower · rjpower · commit ffa2acb84d6d · 2026-06-23T03:26:12.000Z
Drive resource-collection tests synchronously via a single _collect_once()
pass instead of sleeping on the background poll thread; raise the test
collector's poll interval so the thread never races the assertions. Move
the fake's persistent-failure injection out of inject_failure()'s
``persistent`` boolean flag into a dedicated inject_persistent_failure()
method, and trim implementation narration from the top_pods /
ResourceCollector docstrings.
diff --git a/lib/iris/src/iris/cluster/backends/k8s/fake.py b/lib/iris/src/iris/cluster/backends/k8s/fake.py
@@ -474,17 +474,17 @@ def close(self) -> None:
 
     # -- Failure injection --
 
-    def inject_failure(self, operation: str, error: Exception, *, persistent: bool = False) -> None:
-        """Inject a failure for *operation*.
+    def inject_failure(self, operation: str, error: Exception) -> None:
+        """Inject a one-shot failure consumed by the next call to *operation*."""
+        self._injected_failures[operation] = error
 
-        One-shot by default (consumed by the next call); ``persistent=True``
-        raises on every call until cleared — needed for operations a background
-        loop retries on its own cadence.
+    def inject_persistent_failure(self, operation: str, error: Exception) -> None:
+        """Fail every call to *operation* until cleared.
+
+        Needed for operations a background loop retries on its own cadence,
+        where a one-shot failure would be consumed by the first poll.
         """
-        if persistent:
-            self._persistent_failures[operation] = error
-        else:
-            self._injected_failures[operation] = error
+        self._persistent_failures[operation] = error
 
     def clear_failure(self, operation: str) -> None:
         self._injected_failures.pop(operation, None)
diff --git a/lib/iris/src/iris/cluster/backends/k8s/service.py b/lib/iris/src/iris/cluster/backends/k8s/service.py
@@ -687,15 +687,12 @@ def rm_files(self, pod_name: str, paths: list[str], *, container: str | None = N
     # -- top_pods ------------------------------------------------------------
 
     def top_pods(self, *, labels: dict[str, str] | None = None) -> dict[str, PodResourceUsage]:
-        """Bulk pod CPU/memory usage via a single metrics.k8s.io list call.
+        """Return CPU/memory usage for every pod, keyed by pod name.
 
         Lists ``PodMetrics`` for the namespace (optionally scoped by ``labels``)
-        in one request and returns a ``pod_name -> PodResourceUsage`` map. One
-        request covers every pod, so resource collection over N pods costs a
-        single API round-trip instead of N per-pod GETs.
-
-        A 404 means the metrics API is unavailable (metrics-server absent);
-        returns an empty map rather than raising so collection degrades quietly.
+        via the metrics.k8s.io list endpoint. A 404 means the metrics API is
+        unavailable (metrics-server absent); returns an empty map rather than
+        raising so callers degrade quietly.
         """
         logger.info("k8s: top_pods labels=%s", labels)
         kwargs = self._request_timeout_kwargs()
diff --git a/lib/iris/src/iris/cluster/backends/k8s/tasks.py b/lib/iris/src/iris/cluster/backends/k8s/tasks.py
@@ -1134,16 +1134,14 @@ def close(self) -> None:
 
 
 class ResourceCollector:
-    """Background resource usage collector that writes to ``iris.task`` stats.
-
-    Same set_pods() pattern as LogCollector: the sync loop declares the
-    authoritative set of running pods once per cycle. Each tick the collector
-    issues a single bulk metrics list (``kubectl top`` equivalent) scoped to the
-    managed-pod labels, then appends one ``IrisTaskStat`` row per tracked pod
-    that has a sample — to the same table the worker daemon writes to on the
-    GCE/TPU path, so the dashboard's ``iris.task`` queries cover both runtimes
-    uniformly. One API round-trip covers every pod, so cost is independent of
-    pod count and no per-pod thread fan-out is needed.
+    """Background thread that samples running pods' CPU/memory usage.
+
+    The reconcile loop declares the authoritative set of running pods via
+    ``set_pods()`` once per cycle. Each ``poll_interval`` the collector samples
+    those pods via one bulk metrics query and appends an ``IrisTaskStat`` row
+    per pod to the ``iris.task`` table — the same table the worker daemon writes
+    to on the GCE/TPU path, so the dashboard's ``iris.task`` queries cover both
+    runtimes uniformly.
 
     ``poll_interval`` defaults to the metrics-server scrape resolution (15s);
     polling faster only re-reads the same sample.
diff --git a/lib/iris/tests/cluster/backends/k8s/conftest.py b/lib/iris/tests/cluster/backends/k8s/conftest.py
@@ -54,7 +54,9 @@ def provider(k8s, log_client, task_stats_table):
         log_client=log_client,
         task_stats_table=task_stats_table,
         log_poll_interval=1.0,
-        resource_poll_interval=0.5,
+        # Long enough that the background collector never fires mid-test; resource
+        # tests drive a collection pass synchronously instead of racing the thread.
+        resource_poll_interval=3600.0,
         cluster_scan_interval=0.0,
     )
     yield p
diff --git a/lib/iris/tests/cluster/backends/k8s/test_provider.py b/lib/iris/tests/cluster/backends/k8s/test_provider.py
@@ -532,8 +532,19 @@ def test_sync_survives_node_list_failure(provider, k8s):
 # ---------------------------------------------------------------------------
 
 
+def _collect_resources_once(provider) -> None:
+    """Drive one synchronous resource-collection pass.
+
+    reconcile() registers the running-pod set with the background collector;
+    this runs a single collection against that set without waiting on (or
+    racing) the collector's poll thread.
+    """
+    assert provider._resource_collector is not None, "reconcile should have started the collector"
+    provider._resource_collector._collect_once()
+
+
 def test_resource_stats_from_kubectl_top(provider, k8s, task_stats_table):
-    """Running pods emit IrisTaskStat rows via the background ResourceCollector."""
+    """Running pods emit IrisTaskStat rows via the ResourceCollector."""
 
     task_id = JobName.from_wire("/job/0")
     attempt_id = 0
@@ -543,12 +554,9 @@ def test_resource_stats_from_kubectl_top(provider, k8s, task_stats_table):
     populate_pod(k8s, pod_name, "Running")
     k8s.set_top_pod(pod_name, PodResourceUsage(cpu_millicores=500, memory_bytes=1024 * 1024 * 1024))
 
-    batch = make_batch(running_tasks=[entry])
-    # First sync registers the pod with the ResourceCollector.
-    provider.reconcile(batch)
-    # Wait for background collector to fetch and write.
-    time.sleep(2)
-    # No more sync needed — the row has already been written to the table.
+    # reconcile registers the pod; then collect once.
+    provider.reconcile(make_batch(running_tasks=[entry]))
+    _collect_resources_once(provider)
 
     rows = [row for batch_rows in task_stats_table.writes for row in batch_rows]
     assert rows, "ResourceCollector did not write any IrisTaskStat rows"
@@ -562,7 +570,7 @@ def test_resource_stats_from_kubectl_top(provider, k8s, task_stats_table):
 
 
 def test_resource_stats_skipped_when_metrics_unavailable(provider, k8s, task_stats_table):
-    """No IrisTaskStat row is written when kubectl top returns None."""
+    """No IrisTaskStat row is written when a pod has no metrics sample."""
     task_id = JobName.from_wire("/job/0")
     attempt_id = 0
     pod_name = _pod_name(task_id, attempt_id)
@@ -571,28 +579,24 @@ def test_resource_stats_skipped_when_metrics_unavailable(provider, k8s, task_sta
     populate_pod(k8s, pod_name, "Running")
     k8s.set_top_pod(pod_name, None)
 
-    batch = make_batch(running_tasks=[entry])
-    provider.reconcile(batch)
-    time.sleep(2)
+    provider.reconcile(make_batch(running_tasks=[entry]))
+    _collect_resources_once(provider)
 
     assert task_stats_table.writes == []
 
 
 def test_resource_stats_skipped_when_top_pods_raises(provider, k8s, task_stats_table):
-    """No IrisTaskStat row is written when the bulk metrics query raises."""
+    """A raising bulk metrics query is swallowed; no IrisTaskStat row is written."""
     task_id = JobName.from_wire("/job/0")
     attempt_id = 0
     pod_name = _pod_name(task_id, attempt_id)
     entry = RunningTaskEntry(task_id=task_id, attempt_id=attempt_id)
 
     populate_pod(k8s, pod_name, "Running")
-    # Persistent: the background collector retries on its own cadence, so a
-    # one-shot failure would be consumed and later polls would succeed.
-    k8s.inject_failure("top_pods", RuntimeError("metrics-server unavailable"), persistent=True)
+    k8s.inject_persistent_failure("top_pods", RuntimeError("metrics-server unavailable"))
 
-    batch = make_batch(running_tasks=[entry])
-    provider.reconcile(batch)
-    time.sleep(2)
+    provider.reconcile(make_batch(running_tasks=[entry]))
+    _collect_resources_once(provider)
 
     assert task_stats_table.writes == []
 
@@ -606,9 +610,8 @@ def test_resource_stats_skipped_for_non_running_pods(provider, k8s, task_stats_t
 
     populate_pod(k8s, pod_name, "Succeeded")
 
-    batch = make_batch(running_tasks=[entry])
-    provider.reconcile(batch)
-    time.sleep(2)
+    provider.reconcile(make_batch(running_tasks=[entry]))
+    _collect_resources_once(provider)
 
     assert task_stats_table.writes == []