Skip to content

Commit ca3bf5c

Browse files
Shaw and claude committed
feat(benchmarks): route mint/trust/woobench/solana through eliza TS bridge
Adds four new adapter modules under eliza-adapter/eliza_adapter/ that route the existing Python benchmark runners through the elizaOS TypeScript benchmark server (ElizaClient.send_message) instead of an in-process Python AgentRuntime. The legacy Python runtime path stays as default; the bridge is opt-in per benchmark via a new CLI flag (or, for solana, an environment variable):

- mint: --provider eliza (swaps MINTAgent for ElizaMINTAgent in run_benchmark.py)
- trust: --handler eliza-bridge (new ElizaBridgeTrustHandler implementing detect_*)
- woobench: --agent eliza-bridge (build_eliza_bridge_agent_fn produces the agent_fn)
- solana: MODE=eliza env (ElizaBridgeSolanaExplorer reroutes _execute_llm_step)

Each adapter uses lazy imports so eliza-adapter remains importable in environments without all four benchmarks. The eliza_adapter package __init__.py is updated to expose the new symbols.

Live smoke verification:

- mint: --provider eliza --max-tasks 2 --no-ablation -> 87.5% baseline success on 8 tasks, results at /tmp/mint-eliza-test
- trust: --handler eliza-bridge --categories prompt_injection --difficulty easy -> 88.9% F1 on 5 cases, results at /tmp/trust-eliza-bridge-test.json
- woobench: --agent eliza-bridge --scenario curious_newbie_tarot_01 -> 41.1/100, 9 turns, results at /tmp/woo-eliza-test/
- solana: bridge wiring landed; live test deferred — solana-gym-env's voyager.surfpool_env imports fail under the canonical PYTHONPATH=eliza/packages because benchmarks/solana/__init__.py shadows the installed solana RPC package. This is a pre-existing collision, unrelated to the bridge change.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent bf0ca77 commit ca3bf5c

9 files changed

Lines changed: 1051 additions & 16 deletions

File tree

packages/benchmarks/eliza-adapter/eliza_adapter/__init__.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,3 +56,25 @@
5656
__all__.append("ElizaGauntletAgent")
5757
except ImportError:
5858
pass
59+
60+
# Optional: MINT bridge. The adapter module defers its benchmarks.mint imports
# until an agent is constructed, so this guard is purely defensive: it keeps the
# package importable should the adapter module itself fail to import.
try:
    from eliza_adapter.mint import ElizaMINTAgent  # noqa: F401
    __all__.append("ElizaMINTAgent")
except ImportError:
    pass

# Trust bridge — only depends on the lightweight HTTP client, always importable.
# Both codes live in ONE noqa directive: a second "# noqa:" on the same line is
# not parsed, which would leave E402 (import not at top of file) unsuppressed.
from eliza_adapter.trust import ElizaBridgeTrustHandler  # noqa: F401, E402
__all__.append("ElizaBridgeTrustHandler")

# WooBench bridge — only depends on the lightweight HTTP client, always importable.
from eliza_adapter.woobench import build_eliza_bridge_agent_fn  # noqa: F401, E402
__all__.append("build_eliza_bridge_agent_fn")

# Optional: Solana bridge — only loaded when benchmarks.solana + voyager are on sys.path.
try:
    from eliza_adapter.solana import ElizaBridgeSolanaExplorer  # noqa: F401
    __all__.append("ElizaBridgeSolanaExplorer")
except ImportError:
    pass
Lines changed: 240 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,240 @@
1+
"""MINT benchmark agent backed by the eliza benchmark server.
2+
3+
Drop-in replacement for ``benchmarks.mint.agent.MINTAgent`` — same
4+
``solve_task`` interface returning a ``MINTTrajectory``, but each LLM
5+
call is forwarded to the eliza benchmark HTTP server via
6+
``ElizaClient.send_message`` instead of binding a model plugin into a
7+
Python AgentRuntime.
8+
9+
The TS bridge handles state composition and model dispatch; we run
10+
MINT's deterministic multi-turn loop in Python, parsing answers and
11+
optionally executing extracted Python code through the existing
12+
``PythonExecutor``.
13+
"""
14+
15+
from __future__ import annotations
16+
17+
import logging
18+
import time
19+
from typing import TYPE_CHECKING, Optional
20+
21+
from eliza_adapter.client import ElizaClient
22+
23+
if TYPE_CHECKING:
24+
from benchmarks.mint.executor import PythonExecutor
25+
from benchmarks.mint.feedback import FeedbackGenerator
26+
from benchmarks.mint.types import MINTTask, MINTTrajectory
27+
28+
29+
def _mint_imports():
    """Import the ``benchmarks.mint`` symbols on demand and return them as a tuple.

    Deferring these imports to call time means ``benchmarks.mint`` does not
    have to be on ``sys.path`` merely to import this module.

    Returns:
        ``(MINTAgent, PythonExecutor, FeedbackGenerator, MINTTask,
        MINTTrajectory, Turn, TurnType)`` — callers unpack positionally, so
        the order is part of the contract.
    """
    from benchmarks.mint.agent import MINTAgent as agent_cls
    from benchmarks.mint.executor import PythonExecutor as executor_cls
    from benchmarks.mint.feedback import FeedbackGenerator as feedback_cls
    from benchmarks.mint.types import (
        MINTTask as task_cls,
        MINTTrajectory as trajectory_cls,
        Turn as turn_cls,
        TurnType as turn_type_cls,
    )

    return (
        agent_cls,
        executor_cls,
        feedback_cls,
        task_cls,
        trajectory_cls,
        turn_cls,
        turn_type_cls,
    )
37+
38+
39+
logger = logging.getLogger(__name__)
40+
41+
42+
class ElizaMINTAgent:
    """MINT agent that delegates LLM calls to the eliza TS bridge.

    Mirrors :class:`benchmarks.mint.agent.MINTAgent`'s public surface:
    - ``solve_task(task, enable_tools, enable_feedback) -> MINTTrajectory``
    - ``reset_session() -> None``

    Internally it reuses the original ``MINTAgent`` for code-extraction,
    answer-extraction, and answer-checking helpers — but routes the
    "decide what to say next" call through ``ElizaClient.send_message``.
    """

    def __init__(
        self,
        client: ElizaClient | None = None,
        tool_executor: "PythonExecutor | None" = None,
        feedback_generator: "FeedbackGenerator | None" = None,
        temperature: float = 0.0,
    ) -> None:
        """Wire up the bridge client and the Python-side MINT helpers.

        Args:
            client: Bridge client; a default ``ElizaClient()`` is built when omitted.
            tool_executor: Runs code extracted from responses; a default
                ``PythonExecutor()`` is built when omitted.
            feedback_generator: Produces feedback after a wrong answer; defaults
                to ``FeedbackGenerator(use_llm=False)``.
            temperature: Clamped to ``[0.0, 1.0]``. It is stored and handed to the
                helper ``MINTAgent``; it is not part of the per-turn bridge context.
        """
        MINTAgentCls, PythonExecutorCls, FeedbackGeneratorCls, *_ = _mint_imports()

        # Use ``is None`` rather than ``or`` so a caller-supplied collaborator is
        # never silently replaced just because it happens to evaluate as falsy.
        self._client = client if client is not None else ElizaClient()
        self.tool_executor = (
            tool_executor if tool_executor is not None else PythonExecutorCls()
        )
        # Eliza bridge does its own LLM calls — skip the in-process LLM feedback path.
        self.feedback_generator = (
            feedback_generator
            if feedback_generator is not None
            else FeedbackGeneratorCls(use_llm=False)
        )
        self.temperature = max(0.0, min(1.0, temperature))

        # Reuse helper methods from canonical MINTAgent (regex extractors, answer checker, etc.)
        # Pass runtime=None so the underlying agent stays in mock mode and we never touch it.
        self._helpers = MINTAgentCls(
            runtime=None,
            tool_executor=self.tool_executor,
            feedback_generator=self.feedback_generator,
            temperature=self.temperature,
        )

        self._initialized = False

    async def initialize(self) -> None:
        """Wait for the eliza server to report ready; a no-op once initialized.

        NOTE(review): ``wait_until_ready`` is invoked synchronously from this
        coroutine; the ``timeout=120`` unit is presumably seconds — confirm
        against ``ElizaClient``.
        """
        if self._initialized:
            return
        self._client.wait_until_ready(timeout=120)
        self._initialized = True

    def reset_session(self) -> None:
        """Reset for a new task by delegating to the helper agent's ``reset_session``.

        The bridge-side reset is issued separately, at the start of ``solve_task``.
        """
        self._helpers.reset_session()

    @staticmethod
    def _turn_context(
        task: "MINTTask",
        turn_number: int,
        system_prompt: str,
        enable_tools: bool,
        enable_feedback: bool,
    ) -> dict[str, object]:
        """Build the metadata dict sent to the bridge alongside one turn's prompt."""
        return {
            "benchmark": "mint",
            "task_id": task.id,
            "task_category": task.category.value,
            "task_description": task.description,
            "evaluation_metric": task.evaluation_metric,
            "tools_allowed": list(task.tools_allowed),
            "max_turns": int(task.max_turns),
            "turn": turn_number,
            "system_prompt": system_prompt,
            "enable_tools": bool(enable_tools),
            "enable_feedback": bool(enable_feedback),
        }

    @staticmethod
    def _tool_followup_prompt(exec_result) -> str:
        """Return the next prompt after a code execution (output or error preview)."""
        # Previews are truncated (500 chars of output, 300 of error) before being sent back.
        output_preview = (exec_result.output or "")[:500]
        if exec_result.success:
            return (
                f"Code executed successfully. Output:\n```\n{output_preview}\n```\n\n"
                f"Now provide your final answer in the exact format requested. "
                f"End with: Final answer: <YOUR_ANSWER>"
            )
        error_preview = (exec_result.error or "Unknown error")[:300]
        return f"Code error:\n```\n{error_preview}\n```\n\nPlease fix the code and try again."

    @staticmethod
    def _record_feedback(trajectory: "MINTTrajectory", feedback: str, turn_number: int) -> None:
        """Append a FEEDBACK turn to *trajectory* and bump its feedback counter."""
        *_, Turn, TurnType = _mint_imports()
        trajectory.turns.append(
            Turn(
                turn_type=TurnType.FEEDBACK,
                content=feedback,
                turn_number=turn_number,
                feedback=feedback,
                timestamp_ms=time.time() * 1000,
            )
        )
        trajectory.num_feedback_turns += 1

    async def solve_task(
        self,
        task: "MINTTask",
        enable_tools: bool = True,
        enable_feedback: bool = True,
    ) -> "MINTTrajectory":
        """Solve a MINT task by routing each turn through the eliza TS bridge.

        Runs at most ``task.max_turns`` turns. Each turn sends the current prompt
        to the bridge, records the reply, and then either executes extracted
        Python code (feeding the result back as the next prompt) or extracts and
        checks an answer.

        Args:
            task: The MINT task to solve.
            enable_tools: When true and ``"python"`` is in ``task.tools_allowed``,
                code found in a response is executed via ``self.tool_executor``.
            enable_feedback: When true, a wrong or missing answer on a non-final
                turn produces a feedback turn and a retry prompt.

        Returns:
            The trajectory of recorded turns, with ``success``, ``final_answer``
            and ``end_time_ms`` filled in.
        """
        if not self._initialized:
            await self.initialize()

        _, _, _, _, MINTTrajectory, Turn, TurnType = _mint_imports()

        logger.info("[eliza-mint] Starting task %s: %s", task.id, task.description)

        trajectory = MINTTrajectory(
            task_id=task.id,
            start_time_ms=time.time() * 1000,
        )

        try:
            self._client.reset(task_id=task.id, benchmark="mint")
        except Exception as exc:
            # Best-effort: keep going, but surface it — a failed reset may leave
            # bridge-side state from an earlier task in place.
            logger.warning("[eliza-mint] Reset failed (continuing): %s", exc)

        system_prompt = self._helpers._build_system_prompt(task)
        current_prompt = task.initial_prompt

        for turn_num in range(task.max_turns):
            turn_number = turn_num + 1
            turn_start = time.time() * 1000

            context = self._turn_context(
                task, turn_number, system_prompt, enable_tools, enable_feedback
            )
            response = self._client.send_message(text=current_prompt, context=context)
            response_text = response.text or ""

            trajectory.turns.append(
                Turn(
                    turn_type=TurnType.ASSISTANT,
                    content=response_text,
                    turn_number=turn_number,
                    timestamp_ms=turn_start,
                )
            )

            # Tool execution: extract code from response and run it via PythonExecutor.
            # The TS bridge does not host EXECUTE_CODE — we handle code execution Python-side.
            code_to_execute: str | None = None
            if enable_tools and "python" in task.tools_allowed:
                code_to_execute = self._helpers._extract_code(response_text)

            if code_to_execute:
                exec_result = await self.tool_executor.execute(code_to_execute)
                trajectory.turns.append(
                    Turn(
                        turn_type=TurnType.TOOL,
                        content=exec_result.output or exec_result.error or "",
                        turn_number=turn_number,
                        tool_call=code_to_execute,
                        tool_result=exec_result.output,
                        tool_success=exec_result.success,
                        timestamp_ms=time.time() * 1000,
                    )
                )
                trajectory.num_tool_uses += 1
                current_prompt = self._tool_followup_prompt(exec_result)
                # A tool turn consumes the whole turn: answer extraction is skipped
                # for this response even if it also contained a final answer.
                continue

            predicted_answer = self._helpers._extract_answer(response_text, task)
            trajectory.final_answer = predicted_answer

            # Feedback is only worth generating when another turn remains to use it.
            can_retry = enable_feedback and turn_num < task.max_turns - 1

            if predicted_answer:
                if self._helpers._check_answer(predicted_answer, task):
                    trajectory.success = True
                    logger.info(
                        "[eliza-mint] Task %s: correct answer on turn %d", task.id, turn_number
                    )
                    break

                if not can_retry:
                    logger.info(
                        "[eliza-mint] Task %s: incorrect answer %r", task.id, predicted_answer
                    )
                    break

                feedback = await self.feedback_generator.generate(
                    task=task,
                    predicted=predicted_answer,
                    turn_num=turn_num,
                )
                self._record_feedback(trajectory, feedback, turn_number)
                current_prompt = (
                    f"Feedback: {feedback}\n\nPlease try again with a different approach."
                )
            elif can_retry:
                feedback = (
                    "I couldn't find a clear answer in your response. "
                    "Please provide a specific answer ending with: Final answer: <YOUR_ANSWER>"
                )
                self._record_feedback(trajectory, feedback, turn_number)
                current_prompt = f"Feedback: {feedback}\n\nPlease try again."
            # Otherwise (no answer, no retry allowed): fall through and let the
            # loop continue with the unchanged prompt until turns run out.

        trajectory.end_time_ms = time.time() * 1000
        return trajectory

    async def close(self) -> None:
        """Mark the agent uninitialized; the server process itself is left running.

        The server manager owns subprocess lifecycle, so nothing is shut down
        here. The next ``solve_task`` call will re-run ``initialize``.
        """
        self._initialized = False

0 commit comments

Comments
 (0)