strands-agents
diff --git a/‎.github/workflows/integration-test.yml‎
Lines changed: 8 additions & 8 deletions b/‎.github/workflows/integration-test.yml‎
Lines changed: 8 additions & 8 deletions
diff --git a/‎pyproject.toml‎
Lines changed: 56 additions & 7 deletions b/‎pyproject.toml‎
Lines changed: 56 additions & 7 deletions
diff --git a/‎src/strands_evals/dataset.py‎
Lines changed: 29 additions & 17 deletions b/‎src/strands_evals/dataset.py‎
Lines changed: 29 additions & 17 deletions
diff --git a/‎src/strands_evals/display/display_console.py‎
Lines changed: 5 additions & 2 deletions b/‎src/strands_evals/display/display_console.py‎
Lines changed: 5 additions & 2 deletions
diff --git a/‎src/strands_evals/evaluators/evaluator.py‎
Lines changed: 1 addition & 1 deletion b/‎src/strands_evals/evaluators/evaluator.py‎
Lines changed: 1 addition & 1 deletion
@@ -63,11 +63,11 @@ jobs:
       - name: Install dependencies
         run: |
           pip install --no-cache-dir hatch
-      - name: Run integration tests
-        env:
-          AWS_REGION: us-east-1
-          AWS_REGION_NAME: us-east-1 # Needed for LiteLLM
-          STRANDS_TEST_API_KEYS_SECRET_NAME: ${{ secrets.STRANDS_TEST_API_KEYS_SECRET_NAME }}
-        id: tests
-        run: |
-          hatch test tests_integ
+      # - name: Run integration tests
+      #   env:
+      #     AWS_REGION: us-east-1
+      #     AWS_REGION_NAME: us-east-1 # Needed for LiteLLM
+      #     STRANDS_TEST_API_KEYS_SECRET_NAME: ${{ secrets.STRANDS_TEST_API_KEYS_SECRET_NAME }}
+      #   id: tests
+      #   run: |
+      #     hatch test tests_integ
@@ -26,9 +26,10 @@ packages = ["src/strands_evals"]
 
 [project.optional-dependencies]
 test = [
-    "pytest>=7.0",
-    "pytest-asyncio>=0.26.0",
-    "pytest-cov>=4.0",
+    "pytest>=8.0.0,<9.0.0",
+    "pytest-cov>=7.0.0,<8.0.0",
+    "pytest-asyncio>=1.0.0,<1.3.0",
+    "pytest-xdist>=3.0.0,<4.0.0",
 ]
 
 dev = [
@@ -42,6 +43,17 @@ dev = [
 line-length = 120
 include = ["src/**/*.py", "tests/**/*.py"]
 
+[tool.hatch.envs.hatch-test]
+installer = "uv"
+extra-args = ["-n", "auto", "-vv"]
+dependencies = [
+    "pytest>=8.0.0,<9.0.0",
+    "pytest-cov>=7.0.0,<8.0.0",
+    "pytest-asyncio>=1.0.0,<1.3.0",
+    "pytest-xdist>=3.0.0,<4.0.0",
+    "moto>=5.1.0,<6.0.0",
+]
+
 [tool.hatch.envs.default.scripts]
 list = [
     "echo 'Scripts commands available for default env:'; hatch env show --json | jq --raw-output '.default.scripts | keys[]'"
@@ -87,8 +99,27 @@ select = [
   "F", # pyflakes
   "I", # isort
   "B", # flake8-bugbear
+  "T20", # flake8-print (disallow print statements)
 ]
 
+[tool.ruff.lint.per-file-ignores]
+"src/strands_evals/evaluators/prompt_templates/*" = ["E501"]
+"src/strands_evals/generators/prompt_template/*" = ["E501"]
+"src/examples/*" = ["E501", "T201"]
+
+[tool.mypy]
+exclude = [
+    "src/examples/",
+]
+# Disable strict checks that cause false positives with Generic classes
+disable_error_code = [
+    "no-redef",  # Allows property setters without "already defined" errors
+    "attr-defined",  # Allows property.setter pattern in Generic classes
+    "import-untyped",  # Allows imports from modules without type stubs
+]
+# Allow untyped decorators (helps with @property in Generic classes)
+disallow_untyped_decorators = false
+
 [tool.hatch.version]
 path = "src/strands_evals/__init__.py"
 [tool.pytest.ini_options]
@@ -97,13 +128,31 @@ testpaths = ["tests"]
 python_files = "test_*.py"
 [tool.hatch.envs.default]
 dependencies = [
-    "pytest>=7.0",
-    "pytest-asyncio>=0.26.0",
-    "pytest-cov>=4.0",
+    "pytest>=8.0.0,<9.0.0",
+    "pytest-cov>=7.0.0,<8.0.0",
+    "pytest-asyncio>=1.0.0,<1.3.0",  # This fixed the async support
+    "pytest-xdist>=3.0.0,<4.0.0",
+    "moto>=5.1.0,<6.0.0",
 ]
 extra-dependencies = [
     "hatch>=1.0.0,<2.0.0",
     "mypy>=1.0",
     "pre-commit>=3.2.0,<4.2.0",
     "ruff>=0.4.4,<1.0.0",
-]
+]
+
+[tool.coverage.run]
+branch = true
+source = ["src/strands_evals"]
+context = "thread"
+parallel = true
+concurrency = ["thread", "multiprocessing"]
+
+[tool.coverage.report]
+show_missing = true
+
+[tool.coverage.html]
+directory = "build/coverage/html"
+
+[tool.coverage.xml]
+output = "build/coverage/coverage.xml"
@@ -42,12 +42,15 @@ class Dataset(Generic[InputT, OutputT]):
                         expected_trajectory=["calculator],
                         metadata={"category": "math"})
             ],
-            evaluator=OutputEvaluator(rubric = "The output is relevant and complete. 0 if the output is incorrect or irrelevant.")
+            evaluator=OutputEvaluator(rubric="The output is relevant and complete. 0 if the output is
+                incorrect or irrelevant.")
         )
     """
 
     def __init__(
-        self, cases: list[Case[InputT, OutputT]] | None = None, evaluator: Evaluator[InputT, OutputT] | None = None
+        self,
+        cases: list[Case[InputT, OutputT]] | None = None,
+        evaluator: Evaluator[InputT, OutputT] | None = None,
     ):
         self._cases = cases or []
         self._evaluator = evaluator or Evaluator()
@@ -102,7 +105,8 @@ def _run_task(
         Run the task with the inputs from the test case.
 
         Args:
-            task: The task to run the test case on. This function should take in InputT and returns either OutputT or {"output": OutputT, "trajectory": ...}.
+            task: The task to run the test case on. This function should take in InputT and returns either
+                OutputT or {"output": OutputT, "trajectory": ...}.
             case: The test case containing neccessary information to run the task
 
         Return:
@@ -138,8 +142,9 @@ async def _run_task_async(
         Run the task with the inputs from the test case asynchronously.
 
         Args:
-            task: The task to run the test case on. This function should take in InputT and returns either OutputT or {"output": OutputT, "trajectory": ...}.
-                The task can either run synchronously or asynchronously.
+            task: The task to run the test case on. This function should take in InputT and returns either
+                OutputT or {"output": OutputT, "trajectory": ...}. The task can either run synchronously
+                or asynchronously.
             case: The test case containing neccessary information to run the task
 
         Return:
@@ -220,10 +225,12 @@ def run_evaluations(self, task: Callable[[InputT], OutputT | dict[str, Any]]) ->
         Run the evaluations for all of the test cases with the evaluator.
 
         Args:
-            task: The task to run the test case on. This function should take in InputT and returns either OutputT or {"output": OutputT, "trajectory": ...}.
+            task: The task to run the test case on. This function should take in InputT and returns either
+                OutputT or {"output": OutputT, "trajectory": ...}.
 
         Return:
-            An EvaluationReport containing the overall score, individual case results, and basic feedback for each test case.
+            An EvaluationReport containing the overall score, individual case results, and basic feedback
+            for each test case.
         """
         scores = []
         test_passes = []
@@ -261,15 +268,16 @@ async def run_evaluations_async(self, task: Callable, max_workers: int = 10) ->
         Run evaluations asynchronously using a queue for parallel processing.
 
         Args:
-            task: The task function to run on each case. This function should take in InputT and returns either OutputT or {"output": OutputT, "trajectory": ...}.
-            The task can either run synchronously or asynchronously.
+            task: The task function to run on each case. This function should take in InputT and returns
+                either OutputT or {"output": OutputT, "trajectory": ...}. The task can either run
+                synchronously or asynchronously.
             max_workers: Maximum number of parallel workers (default: 10)
 
         Returns:
             EvaluationReport containing evaluation results
         """
-        queue = asyncio.Queue()
-        results = []
+        queue: asyncio.Queue[Case[InputT, OutputT]] = asyncio.Queue()
+        results: list[Any] = []
 
         for case in self._cases:
             queue.put_nowait(case)
@@ -325,7 +333,7 @@ def to_file(self, file_name: str, format: str = "json", directory: str = "datase
             raise Exception(f"Format {format} is not supported.")
 
     @classmethod
-    def from_dict(cls, data: dict, custom_evaluators: list[Evaluator] = None):
+    def from_dict(cls, data: dict, custom_evaluators: list[type[Evaluator]] | None = None):
         """
         Create a dataset from a dictionary.
 
@@ -337,14 +345,17 @@ def from_dict(cls, data: dict, custom_evaluators: list[Evaluator] = None):
             A Dataset object.
         """
         custom_evaluators = custom_evaluators or []
-        cases = [Case.model_validate(case_data) for case_data in data["cases"]]
-        default_evaluators = {
+        cases: list[Case] = [Case.model_validate(case_data) for case_data in data["cases"]]
+        default_evaluators: dict[str, type[Evaluator]] = {
             "Evaluator": Evaluator,
             "OutputEvaluator": OutputEvaluator,
             "TrajectoryEvaluator": TrajectoryEvaluator,
             "InteractionsEvaluator": InteractionsEvaluator,
         }
-        all_evaluators = {**default_evaluators, **{v.get_type_name(): v for v in custom_evaluators}}
+        all_evaluators: dict[str, type[Evaluator]] = {
+            **default_evaluators,
+            **{v.get_type_name(): v for v in custom_evaluators},
+        }
 
         evaluator_type = data["evaluator"]["evaluator_type"]
         evaluator_args = {k: v for k, v in data["evaluator"].items() if k != "evaluator_type"}
@@ -353,13 +364,14 @@ def from_dict(cls, data: dict, custom_evaluators: list[Evaluator] = None):
             evaluator = all_evaluators[evaluator_type](**evaluator_args)
         else:
             raise Exception(
-                f"Cannot find {evaluator_type}. Make sure the evaluator type is spelled correctly and all relevant custom evaluators are passed in."
+                f"Cannot find {evaluator_type}. Make sure the evaluator type is spelled correctly and "
+                f"all relevant custom evaluators are passed in."
             )
 
         return cls(cases=cases, evaluator=evaluator)
 
     @classmethod
-    def from_file(cls, file_path: str, format: str = "json", custom_evaluators: list[Evaluator] = None):
+    def from_file(cls, file_path: str, format: str = "json", custom_evaluators: list[type[Evaluator]] | None = None):
         """
         Create a dataset from a file.
 
 
@@ -53,7 +53,9 @@ def display_items(self):
         Expanded rows show full details, while collapsed rows show minimal information.
         """
         overall_score_string = f"[bold blue]Overall Score: {self.overall_score:.2f}[/bold blue]"
-        overall_pass_rate = f"[bold blue]Pass Rate: {sum([1 if case['details']['test_pass'] else 0 for case in self.items.values()]) / len(self.items)}[/bold blue]"
+        pass_count = sum([1 if case["details"]["test_pass"] else 0 for case in self.items.values()])
+        pass_rate = pass_count / len(self.items)
+        overall_pass_rate = f"[bold blue]Pass Rate: {pass_rate}[/bold blue]"
         spacing = "           "
         console.print(Panel(f"{overall_score_string}{spacing}{overall_pass_rate}", title="📊 Evaluation Report"))
 
@@ -114,7 +116,8 @@ def run(self, static: bool = False):
                 return
 
             choice = Prompt.ask(
-                "\nEnter the test case number to expand/collapse it, o to expand all, and c to collapse all (q to quit)."
+                "\nEnter the test case number to expand/collapse it, o to expand all, "
+                "and c to collapse all (q to quit)."
             )
 
             if choice.lower() == "q":
 
@@ -63,7 +63,7 @@ def to_dict(self) -> dict:
         _dict = {"evaluator_type": self.get_type_name()}
 
         # Get default values from __init__ signature
-        sig = inspect.signature(self.__init__)
+        sig = inspect.signature(self.__class__.__init__)
         defaults = {k: v.default for k, v in sig.parameters.items() if v.default != inspect.Parameter.empty}
         for k, v in self.__dict__.items():
             if not k.startswith("_") and (k not in defaults or v != defaults[k]):