Skip to content

Commit ab489b1

Browse files
Optimize task generation
1 parent 8a21016 commit ab489b1

File tree

2 files changed: +90 −8 lines changed

pyagentspec/src/pyagentspec/evaluation/_computers/_async_callables_computers.py

Lines changed: 50 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,8 @@ async def register(self, key: K, value: V) -> None:
4848
class _AsyncCallablesComputer(Generic[T]):
4949
"""Evaluate a set of async callables across every sample in a dataset."""
5050

51+
_QUEUE_BUFFER_FACTOR = 3
52+
5153
def __init__(
5254
self,
5355
dataset: Dataset,
@@ -57,6 +59,7 @@ def __init__(
5759
"""Configure the computer with the dataset, callables, and concurrency cap."""
5860
self.dataset = dataset
5961
self.callables = callables
62+
self.max_concurrency = max_concurrency
6063
if max_concurrency == -1:
6164
self.semaphore = None
6265
else:
@@ -80,16 +83,55 @@ async def _queue(self, sample_id: Any, callable_id: str) -> None:
8083

8184
async def run(self) -> Dict[Tuple[Any, str], T]:
    """Kick off all pending computations and return the populated registry.

    Returns:
        The registry store mapping ``(sample_id, callable_name)`` keys to
        each computation's result.
    """
    metrics_names = list(self.callables.keys())
    if not metrics_names:
        # Nothing to compute: avoid iterating the dataset at all.
        return {}

    # For "unlimited" concurrency we still spawn one task per work item since callers
    # explicitly opted out of concurrency caps. The producer/worker pattern below
    # is primarily meant to prevent memory blow-ups when a bounded concurrency limit is used.
    if self.semaphore is None:
        # Materialise identifiers up-front to avoid holding async generators open
        # while scheduling the computation fan-out.
        sample_ids = [sample_id async for sample_id in self.dataset.ids()]
        async with anyio.create_task_group() as tg:
            for sample_id in sample_ids:
                for metric_name in metrics_names:
                    tg.start_soon(self._queue, sample_id, metric_name)
        return self._registry.store

    # Avoid spawning one task per (sample, metric) pair: for large datasets
    # that can create millions of tasks and consume large amounts of memory.
    #
    # Instead, use a producer/worker pattern:
    # - one producer enumerates dataset sample ids and enqueues work items
    # - N workers consume items from the queue and run computations
    num_workers = max(1, self.max_concurrency)
    queue_max_size = max(1, num_workers * self._QUEUE_BUFFER_FACTOR)
    work_queue: anyio.abc.ObjectSendStream[Tuple[Any, str]]
    receive_stream: anyio.abc.ObjectReceiveStream[Tuple[Any, str]]
    work_queue, receive_stream = anyio.create_memory_object_stream(queue_max_size)

    async def producer() -> None:
        # Closing the send side (on normal exit or error) signals
        # ``EndOfStream`` to the workers once buffered items are drained.
        async with work_queue:
            async for sample_id in self.dataset.ids():
                for metric_name in metrics_names:
                    await work_queue.send((sample_id, metric_name))

    async def worker(worker_id: int) -> None:
        del worker_id  # only used to distinguish spawned tasks
        while True:
            try:
                sample_id, metric_name = await receive_stream.receive()
            except anyio.EndOfStream:
                # Producer finished and the queue is fully drained.
                return
            await self._queue(sample_id, metric_name)

    # Fix: close the receive side once all workers are done so the stream's
    # resources are released deterministically instead of being leaked until
    # garbage collection.
    async with receive_stream:
        async with anyio.create_task_group() as tg:
            tg.start_soon(producer)
            for i in range(num_workers):
                tg.start_soon(worker, i)

    return self._registry.store
94136

95137

pyagentspec/tests/evaluation/_computers/test_evaluator_concurrency.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,46 @@ async def test_unlimited_concurrency() -> None:
121121
assert num_runnings_sequence[-i - 1] == i
122122

123123

124+
@pytest.mark.anyio
async def test_run_does_not_spawn_one_task_per_item() -> None:
    """
    Ensure ``_AsyncCallablesComputer.run`` does not create O(N) tasks.

    This is a regression test for memory blow-ups when datasets are large.
    With a bounded ``max_concurrency`` the run loop must spawn exactly one
    producer task plus ``max_concurrency`` worker tasks, regardless of the
    dataset size.
    """

    class CountingTaskGroup:
        """Minimal stand-in for an anyio task group that only counts spawns."""

        def __init__(self, max_allowed: int) -> None:
            self.max_allowed = max_allowed
            self.started = 0

        async def __aenter__(self) -> "CountingTaskGroup":
            return self

        async def __aexit__(self, exc_type, exc, tb) -> None:
            return None

        def start_soon(self, func, *args) -> None:
            self.started += 1
            # Fail fast if run() ever falls back to one-task-per-item.
            assert self.started <= self.max_allowed

    max_concurrency = 10
    expected_tasks = 1 + max_concurrency  # one producer + N workers

    dataset = Dataset.from_dict([{"dummy_arg": i} for i in range(10000)])
    callables = {"dummy_callable": (lambda **kwargs: asyncio.sleep(0))}
    computer = _AsyncCallablesComputer(
        dataset=dataset,
        callables=callables,
        max_concurrency=max_concurrency,
    )

    import anyio  # imported here to keep the patch localized to this test

    created_groups: list = []

    def _counting_task_group() -> "CountingTaskGroup":
        group = CountingTaskGroup(max_allowed=expected_tasks)
        created_groups.append(group)
        return group

    original = anyio.create_task_group
    try:
        anyio.create_task_group = _counting_task_group
        await computer.run()
    finally:
        anyio.create_task_group = original

    # Fix: the original test passed vacuously when zero tasks were spawned.
    # Assert the exact expected fan-out so the regression guard has teeth.
    assert sum(group.started for group in created_groups) == expected_tasks
162+
163+
124164
@pytest.mark.anyio
125165
@pytest.mark.parametrize("max_concurrency", [5, 10, 20])
126166
async def test_firsts_begin_together(max_concurrency: int) -> None:

Comments (0)