Skip to content

Commit b3e50a0

Browse files
authored
feat: add streaming callback (#120)
- fix issue with usage tracking
- fix timeout on tool calls
- remove logs from benchmark runs
1 parent 0b6b535 commit b3e50a0

8 files changed

Lines changed: 341 additions & 42 deletions

File tree

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ benchmark = [
3333
agents = [
3434
"haystack-ai",
3535
"mcp-haystack",
36-
"anthropic-haystack",
36+
"anthropic-haystack>=2.7.0",
3737
"langfuse-haystack"
3838
]
3939

src/deepset_mcp/agents/generalist/generalist_agent.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,8 @@ def get_agent(benchmark_config: BenchmarkConfig) -> Agent:
1818
"DEEPSET_WORKSPACE": benchmark_config.deepset_workspace,
1919
"DEEPSET_API_KEY": benchmark_config.deepset_api_key,
2020
},
21-
)
21+
),
22+
invocation_timeout=300.0,
2223
)
2324
prompt = (Path(__file__).parent / "system_prompt.md").read_text()
2425
generator = AnthropicChatGenerator(

src/deepset_mcp/api/pipeline/resource.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -223,8 +223,6 @@ async def get_logs(
223223
params=params,
224224
)
225225

226-
logger.warning(resp.json)
227-
228226
raise_for_status(resp)
229227

230228
if resp.json is not None:

src/deepset_mcp/benchmark/runner/agent_benchmark_runner.py

Lines changed: 9 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
import asyncio
22
import json
33
import logging
4-
import sys
54
from collections.abc import Callable
65
from datetime import datetime
76
from pathlib import Path
@@ -18,6 +17,7 @@
1817
load_test_case_by_name,
1918
)
2019
from deepset_mcp.benchmark.runner.models import AgentConfig, TestCaseConfig
20+
from deepset_mcp.benchmark.runner.streaming import StreamingCallbackManager
2121
from deepset_mcp.benchmark.runner.teardown_actions import teardown_test_case_async
2222
from deepset_mcp.benchmark.runner.tracing import enable_tracing
2323

@@ -31,7 +31,7 @@ def __init__(
3131
self,
3232
agent_config: AgentConfig,
3333
benchmark_config: BenchmarkConfig,
34-
streaming: bool = False,
34+
streaming: bool = True,
3535
):
3636
"""
3737
Initialize the benchmark runner.
@@ -66,7 +66,6 @@ def __init__(
6666
f"{self.agent_config.display_name}-{self.commit_hash}_{self.run_timestamp.strftime('%Y%m%d_%H%M%S')}"
6767
)
6868

69-
# TODO: streaming is WIP; wait until https://github.com/deepset-ai/haystack-core-integrations/issues/1947 is fixed
7069
def _create_streaming_callback(self, test_case_name: str) -> Callable[[StreamingChunk], Any]:
7170
"""
7271
Create a streaming callback function for a specific test case.
@@ -77,33 +76,10 @@ def _create_streaming_callback(self, test_case_name: str) -> Callable[[Streaming
7776
Returns:
7877
Callback function for streaming
7978
"""
79+
callback = StreamingCallbackManager()
8080

81-
async def streaming_callback(chunk: StreamingChunk) -> None:
82-
"""Handle streaming chunks from the agent."""
83-
if hasattr(chunk, "content") and chunk.content:
84-
# meta content_block type=tool_use
85-
# meta type (content_block_start)
86-
# meta delta type=input_json_delta
87-
# meta delta message_delta
88-
# meta delta stop_reason=tool_use
89-
# Print with test case context, using a subtle prefix
90-
content = chunk.content
91-
# Handle newlines by adding the prefix to each new line
92-
lines = content.split("\n")
93-
for i, line in enumerate(lines):
94-
if i == 0:
95-
print(f"{line}", end="")
96-
elif line.strip(): # Only print non-empty lines with prefix
97-
print(f"\n[{test_case_name}] {line}", end="")
98-
else:
99-
print() # Just print the newline for empty lines
100-
101-
# If the content ends with a newline, print it
102-
if content.endswith("\n"):
103-
print()
104-
105-
# Ensure output is flushed immediately
106-
sys.stdout.flush()
81+
async def streaming_callback(chunk: StreamingChunk) -> Any:
82+
return await callback(chunk)
10783

10884
return streaming_callback
10985

@@ -477,8 +453,10 @@ def _extract_assistant_message_stats(messages: list[ChatMessage]) -> dict[str, s
477453
meta = message.meta
478454
if "usage" in meta:
479455
usage = meta["usage"]
480-
total_prompt_tokens += usage.get("prompt_tokens", 0)
481-
total_completion_tokens += usage.get("completion_tokens", 0)
456+
prompt_tokens = usage.get("prompt_tokens")
457+
total_prompt_tokens += prompt_tokens if prompt_tokens is not None else 0
458+
completion_tokens = usage.get("completion_tokens")
459+
total_completion_tokens += completion_tokens if completion_tokens is not None else 0
482460

483461
# Extract model (should be consistent across messages)
484462
if "model" in meta and model is None:

src/deepset_mcp/benchmark/runner/cli_agent.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ def run_agent_single(
5353

5454
try:
5555
results, _ = run_agent_benchmark(
56-
agent_config=agent_cfg, test_case_name=test_case, benchmark_config=benchmark_cfg, streaming=False
56+
agent_config=agent_cfg, test_case_name=test_case, benchmark_config=benchmark_cfg, streaming=True
5757
)
5858

5959
result = results[0]
@@ -117,7 +117,7 @@ def run_agent_all(
117117
test_case_name=None, # Run all
118118
benchmark_config=benchmark_cfg,
119119
concurrency=concurrency,
120-
streaming=False,
120+
streaming=True,
121121
)
122122

123123
# Display summary statistics

0 commit comments

Comments (0)