feat(eval): add tqdm progress bar (pbar) for eval progress tracking

Lawhy · Lawhy · commit 639d55a28dc2 · 2026-02-05T20:37:35.000-08:00
diff --git a/pyproject.toml b/pyproject.toml
@@ -25,6 +25,7 @@ dependencies = [
     "strands-agents-tools",
     "math-verify>=0.8.0",
     "click>=8.0.0",
+    "tqdm>=4.0.0",
 ]
 
 [project.optional-dependencies]
diff --git a/src/strands_env/eval/evaluator.py b/src/strands_env/eval/evaluator.py
@@ -25,6 +25,8 @@
 from pathlib import Path
 
 from pydantic import BaseModel
+from tqdm import tqdm
+from tqdm.contrib.logging import logging_redirect_tqdm
 
 from strands_env.core import Action, Environment, StepResult
 
@@ -159,25 +161,23 @@ async def run(self, actions: Iterable[Action]) -> dict[str, list[EvalSample]]:
 
         semaphore = asyncio.Semaphore(self.max_concurrency)
         save_counter = 0
-        completed = 0
         total = len(to_process)
 
-        async def process(prompt_id: str, sample_id: str, action: Action) -> None:
-            nonlocal save_counter, completed
+        async def process(prompt_id: str, sample_id: str, action: Action, pbar: tqdm) -> None:
+            nonlocal save_counter
             async with semaphore:
                 sample = await self.evaluate_sample(action)
                 self.results[prompt_id].append(sample)
                 self.completed_ids.add(sample_id)
-                completed += 1
+                pbar.update(1)
                 save_counter += 1
                 if save_counter >= self.save_interval:
                     self.save_results()
-                    logger.info(f"Progress: {completed}/{total}")
                     save_counter = 0
 
-        await asyncio.gather(*[process(pid, sid, a) for pid, sid, a in to_process])
-
-        logger.info(f"Completed: {completed}/{total}")
+        with logging_redirect_tqdm():
+            with tqdm(total=total, desc=f"Evaluating {self.benchmark_name}", unit="sample", dynamic_ncols=True) as pbar:
+                await asyncio.gather(*[process(pid, sid, a, pbar) for pid, sid, a in to_process])
         self.save_results()
         return dict(self.results)
 

Original file line number	Diff line number	Diff line change
`@@ -25,6 +25,7 @@ dependencies = [`
`25`	`25`	`"strands-agents-tools",`
`26`	`26`	`"math-verify>=0.8.0",`
`27`	`27`	`"click>=8.0.0",`
	`28`	`+ "tqdm>=4.0.0",`
`28`	`29`	`]`
`29`	`30`
`30`	`31`	`[project.optional-dependencies]`