@@ -32,6 +32,7 @@ strands-env eval --evaluator <evaluator_file> --env <hook_file> [options]
 - `--base-url` - SGLang server URL (default: `http://localhost:30000`)
 - `--model-id` - Model ID (auto-detected for SGLang, required for Bedrock)
 - `--tokenizer-path` - Tokenizer path (defaults to model_id)
+- `--tool-parser` - Tool parser name (e.g., `hermes`, `qwen_xml`) or path to hook file
 - `--region` - AWS region for Bedrock
 - `--profile-name` - AWS profile name for Bedrock
 - `--role-arn` - AWS role ARN to assume for Bedrock
@@ -57,18 +58,29 @@ strands-env eval --evaluator <evaluator_file> --env <hook_file> [options]
 ### Examples
 
 ```bash
-# Using registered benchmark
-strands-env eval aime-2024 --env examples/envs/calculator_env.py --backend sglang
+# Using registered benchmark with code sandbox env
+strands-env eval aime-2024 \
+  --env examples/eval/aime_code/code_sandbox_env.py \
+  --base-url http://localhost:30000
 
-# Using custom evaluator hook (see examples/evaluators/)
-strands-env eval --evaluator examples/evaluators/simple_math_evaluator.py \
-  --env examples/envs/calculator_env.py --backend sglang
+# Using custom evaluator hook (custom benchmark)
+strands-env eval \
+  --evaluator examples/eval/simple_math/simple_math_evaluator.py \
+  --env examples/eval/simple_math/calculator_env.py \
+  --base-url http://localhost:30000
 
 # Pass@8 evaluation with high concurrency
-strands-env eval aime-2024 --env examples/envs/calculator_env.py \
-  --backend sglang \
+strands-env eval aime-2024 \
+  --env examples/eval/simple_math/calculator_env.py \
+  --base-url http://localhost:30000 \
   --n-samples-per-prompt 8 \
   --max-concurrency 30
+
+# With custom tool parser
+strands-env eval aime-2024 \
+  --env examples/eval/simple_math/calculator_env.py \
+  --base-url http://localhost:30000 \
+  --tool-parser qwen_xml
 ```
 
 ## Hook Files
@@ -104,11 +116,11 @@ def create_env_factory(model_factory: ModelFactory, env_config: EnvConfig):
 ### Example: Calculator Environment
 
 ```python
-# examples/envs/calculator_env.py
+# examples/eval/simple_math/calculator_env.py
 from strands_env.cli.config import EnvConfig
 from strands_env.core.models import ModelFactory
-from strands_env.environments.calculator import CalculatorEnv
-from strands_env.rewards.math_reward import MathRewardFunction
+from strands_env.environments import CalculatorEnv
+from strands_env.rewards import MathRewardFunction
 
 def create_env_factory(model_factory: ModelFactory, env_config: EnvConfig):
     reward_fn = MathRewardFunction()
@@ -127,11 +139,11 @@ def create_env_factory(model_factory: ModelFactory, env_config: EnvConfig):
 ### Example: Code Sandbox Environment
 
 ```python
-# examples/envs/code_sandbox_env.py
+# examples/eval/aime_code/code_sandbox_env.py
 from strands_env.cli.config import EnvConfig
 from strands_env.core.models import ModelFactory
-from strands_env.environments.code_sandbox import CodeMode, CodeSandboxEnv
-from strands_env.rewards.math_reward import MathRewardFunction
+from strands_env.environments import CodeMode, CodeSandboxEnv
+from strands_env.rewards import MathRewardFunction
 
 def create_env_factory(model_factory: ModelFactory, env_config: EnvConfig):
     reward_fn = MathRewardFunction()
@@ -181,7 +193,7 @@ EvaluatorClass = MyEvaluator
 
 Then run:
 ```bash
-strands-env eval --evaluator my_evaluator.py --env my_env.py --backend sglang
+strands-env eval --evaluator my_evaluator.py --env my_env.py --base-url http://localhost:30000
 ```
 
 ### Registered Evaluator
@@ -254,6 +266,41 @@ class MyEvaluator(Evaluator):
         return {"my_metric": compute_something(results)}
 ```
 
+## Tool Parser Hook
+
+For models that use non-standard tool calling formats, you can specify a custom tool parser via `--tool-parser`. This accepts either:
+
+1. A predefined parser name from `strands-sglang` (e.g., `hermes`, `qwen_xml`)
+2. A path to a Python hook file
+
+### Hook File Format
+
+The hook file must export either `tool_parser` (instance) or `ToolParserClass` (subclass):
+
+```python
+# my_tool_parser.py
+from strands_sglang.tool_parsers import ToolParser, ToolParseResult
+
+class MyToolParser(ToolParser):
+    def parse(self, text: str) -> list[ToolParseResult]:
+        # Custom parsing logic
+        ...
+
+# Export as instance
+tool_parser = MyToolParser()
+
+# OR export as class (will be instantiated)
+ToolParserClass = MyToolParser
+```
+
+Then use:
+```bash
+strands-env eval aime-2024 \
+  --env my_env.py \
+  --base-url http://localhost:30000 \
+  --tool-parser my_tool_parser.py
+```
+
 ## Output Files
 
 Evaluation results are saved to the output directory: