diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 99d3280..bb3cd02 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -81,8 +81,8 @@ jobs: EXTRA_ENV="" ;; 5) - # HVAC supply sim smoke tests + hvac_validation + bar_building - FILES="tests/test_hvac_supply_sim.py tests/test_hvac_validation.py tests/test_bar_building.py" + # HVAC supply sim smoke tests + hvac_validation + bar_building + concurrent regression + FILES="tests/test_hvac_supply_sim.py tests/test_hvac_validation.py tests/test_bar_building.py tests/test_concurrent_tools.py" EXTRA_ENV="" ;; esac diff --git a/CLAUDE.md b/CLAUDE.md index 29c3eef..6d01533 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -1,5 +1,5 @@ # CLAUDE.md — Instructions for Claude Code - +always be brutally honest ## Project: openstudio-mcp MCP server giving AI agents full control of building energy modeling — create buildings, author measures, configure HVAC, run EnergyPlus sims, extract @@ -73,7 +73,7 @@ docker run --rm \ - Targeted: `LLM_TESTS_ENABLED=1 pytest tests/llm/test_06_progressive.py -k "thermostat_L1" -v` - Full suite only for final validation - Markers: `-m smoke` (12), `-m generic` (10), `-m progressive` (102) -- Benchmark results go in `docs/llm-test-benchmark.md` +- Benchmark results go in `docs/testing/llm-test-benchmark.md` ### Local Development - Lint: `ruff check mcp_server/` diff --git a/README.md b/README.md index 7b43fcf..05e406c 100644 --- a/README.md +++ b/README.md @@ -498,7 +498,7 @@ The component properties tools can query and modify these 15 HVAC component type ## Testing -For the full testing guide — framework details, annotated examples, CI shards, and how to write new tests — see **[`docs/testing.md`](docs/testing.md)**. +For the full testing guide — framework details, annotated examples, CI shards, and how to write new tests — see **[`docs/testing/`](docs/testing/README.md)** (or [`docs/testing/testing.md`](docs/testing/testing.md) for the contributor guide). 
### Quick start diff --git a/docker/Dockerfile b/docker/Dockerfile index 102317b..1a4ae9d 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -58,6 +58,7 @@ COPY .github /repo/.github ENV OSMCP_RUN_ROOT=/runs ENV OSMCP_MAX_CONCURRENCY=1 +ENV OSMCP_CODE_MODE=0 ENV PYTHONUNBUFFERED=1 ENV OPENSTUDIO_MCP_MODE=dev diff --git a/docs/development-process-findings.md b/docs/development-process-findings.md deleted file mode 100644 index f201221..0000000 --- a/docs/development-process-findings.md +++ /dev/null @@ -1,271 +0,0 @@ -# Development Process Findings: MCP Tool Discovery at Scale - -**Project:** openstudio-mcp — MCP server for building energy modeling (OpenStudio SDK) -**Period:** Feb 18 – Mar 20, 2026 (31 days) -**Tool count:** 62 → 142 tools across 22 skills - -## Timeline of Key Decisions - -| Date | Commit | Decision | Rationale | Outcome | -|------|--------|----------|-----------|---------| -| Feb 18 | `5ef23ad` | Initial commit | — | 62 tools | -| Mar 2 | `f59f354` | Input hardening, HVAC auto-wiring | Security + usability | +4 tools (126) | -| **Mar 4** | **`a78d308`** | **Compress all tool descriptions ~30%** | Reduce context consumption (tool schemas ~100K chars) | Descriptions stripped of field lists, examples, educational text | -| Mar 4 | `884d371` | Release v0.4.0 | — | 127 tools | -| Mar 6 | `8b253fc` | Server instructions: NEVER/ALWAYS guardrails | Agent bypassing MCP tools for scripts | 6-domain anti-bypass rules | -| Mar 6 | `e9ad087` | First LLM agent test suite | Need automated verification of tool selection | 50 tests, 44% pass rate | -| Mar 7-8 | `40c8534` | LLM test improvements | System prompt + description fixes | 44% → 91% pass rate | -| Mar 10-12 | `65bee92` | Generic object access tools | Reduce tool count via universal tools | +3 generic tools (list_model_objects, get_object_fields, set_object_property) | -| **Mar 12** | **`cbfba81`** | **Remove 6 redundant typed list tools** | Generic tools replace them | 142 → 136 tools | -| Mar 12 
| `feab46e` | Expand LLM tests to 159 | Progressive L1/L2/L3 framework | 96.2% pass rate | -| Mar 13 | `7e79c7c` | Measure authoring guardrails | Agent writing raw measure.rb files | Quote escaping, syntax validation | -| Mar 16 | — | Debug session: WSHP measure authoring failure | Agent hallucinated API methods, ignored MCP tools | Triggered tool routing plan | -| **Mar 19** | **`39d7608`** | **Add tags to all 141 tools, build recommend_tools** | RAG-MCP paper: 13.6% accuracy at 100+ tools | Tags inert (not in MCP spec), recommend_tools works | -| Mar 19 | — | Discover ToolSearch exists in Claude Code | Testing ENABLE_TOOL_SEARCH | Already enabled since Jan 14 | -| **Mar 19** | **`c09d6ee`** | **Enrich search_api + search_wiring_patterns descriptions** | ToolSearch matches on keywords in descriptions | Both tools go from invisible → 1st result | -| Mar 20 | `cdf4243` | Full regression: 164/171 (95.9%) | Verify no regressions from all changes | All failures known flaky | -| Mar 20 | — | Research: tags do nothing, descriptions are everything | Tags not in MCP spec, never sent to clients | Plan pivot: enrich descriptions, not consolidate | - -## Lesson 1: Description Compression Was Counterproductive - -**What we did (Mar 4):** Compressed all 127 tool descriptions by ~30%. -Stripped field lists, examples, return value descriptions, educational text. - -**Why:** Tool schemas consumed ~100K chars (~25K tokens). Believed this -was causing tool selection degradation. - -**What we didn't know:** Claude Code's ToolSearch had been shipping since -**Jan 14, 2026** (v2.1.7) — 7 weeks before our compression. ToolSearch -auto-defers MCP tools when schemas exceed 10% of context, presenting only -tool names + descriptions for keyword matching. The full schemas are loaded -on-demand only when a tool is selected. - -**The irony:** By compressing descriptions, we reduced the very keywords -ToolSearch uses to match tools. 
We optimized for a problem (context size) -that ToolSearch had already solved, while creating a new problem (discovery). - -**Evidence:** -- `search_api` with short description: invisible to ToolSearch with any query -- `search_api` with enriched description (use cases, examples, keywords): - found 1st for "search_api", "SDK methods", "verify method exists" -- Same tool, same functionality — only the description changed - -**Quantified impact:** -- Pre-compression: ~100K chars tool descriptions -- Post-compression: ~60K chars (40% reduction) -- With ToolSearch: context impact is ~500 chars (just the search tool) + - loaded-on-demand schemas. The 40% reduction saved nothing. - -## Lesson 2: Tags Are Inert Metadata - -**What we did (Mar 19):** Added `tags={"core"}`, `tags={"hvac"}`, etc. to -all 141 tools. Built `recommend_tools` meta-tool for keyword routing. - -**What we discovered:** -- `tags` is a FastMCP server-side feature, NOT part of the MCP wire protocol -- Tags are never sent from server to client in `tools/list` responses -- No client (Claude Desktop, Claude Code, Cursor, Windsurf, Gemini CLI) - reads or acts on tags -- ToolSearch does not use tags in its matching algorithm -- The only use is server-side `mcp.disable(tags=...)` / `mcp.enable()` - which requires `tools/list_changed` notification support — not available - in Claude Desktop or Claude Code - -**What actually works:** Tool names and descriptions. ToolSearch matches -against these. Rich descriptions with domain keywords are the mechanism. - -**Tags are kept** for future-proofing — the MCP spec or clients may add -tag support. But today they provide zero discovery benefit. - -## Lesson 3: Typed Tools Are More Discoverable Than Generic Tools - -**What we did (Mar 12):** Built generic tools (`list_model_objects`, -`get_object_fields`, `set_object_property`) and removed 6 typed list tools -that were redundant (Phase C). - -**What we learned:** The generic tools are powerful but less discoverable. 
-An energy modeler searching for "list spaces" will find `list_spaces` via -ToolSearch but may not find `list_model_objects("Space")` because the -generic tool's description doesn't mention specific type names. - -**Evidence from LLM tests:** -- `list_spaces_L1` (typed): PASS — LLM finds it with vague prompt -- `list_dynamic_type_L1` (generic): FAIL — LLM uses sizing tools instead - of `list_model_objects` when prompt says "What sizing parameters?" - -**Implication:** Don't consolidate typed tools further. The remaining typed -tools serve as discoverable entry points for common operations. The generic -tools serve as fallbacks for uncommon types. - -## Lesson 4: ToolSearch Indexes at Docker Build Time - -**What we discovered (Mar 19):** New tools added via volume-mounted code -(not baked into the Docker image) were invisible to ToolSearch. After -`docker build`, the same tools became discoverable. - -**Root cause:** ToolSearch indexes tool schemas when the MCP server first -connects. Tools registered at Python import time (from installed package -in Docker image) are indexed. Tools registered from volume-mounted code -are also registered at runtime but ToolSearch's index may cache from the -image's installed package. - -**Practical impact:** After adding any new MCP tool, Docker image MUST be -rebuilt. CI does this automatically. Local development requires manual -`docker build`. - -## Lesson 5: Server Instructions Are the Biggest Lever - -**What we did (Mar 6):** Added server instructions with NEVER/ALWAYS rules -for 6 domains (measures, results, visualization, models, weather, HVAC). - -**Impact:** LLM test pass rate jumped from 44% → 83% in one run. -Description improvements and tool-level fixes added another ~8% (to 91%). 
- -**Evidence:** -| Run | Date | Tests | Pass Rate | Key Change | -|-----|------|-------|-----------|------------| -| 1 | Mar 5 | 50 | 44% | Baseline (no system prompt) | -| 2 | Mar 6 | 90 | 83% | + server instructions | -| 3 | Mar 7 | 90 | 91% | + description fixes | -| 5 | Mar 10 | 107 | 96% | + generic access tests | -| 7 | Mar 12 | 159 | 97.5% | Test consolidation | -| 10 | Mar 19 | 172 | 96.5% | + tool routing (no regression) | -| 11 | Mar 20 | 171 | 95.9% | + ToolSearch + wiring recipes | - -The 44% → 83% jump from server instructions alone dwarfs all subsequent -improvements combined. Server-level guidance is more impactful than -tool-level description optimization. - -## Lesson 6: Progressive Prompt Testing Reveals Structural Limits - -**What we built (Mar 12):** Progressive test framework — each tool tested -at L1 (vague), L2 (moderate), L3 (explicit) prompt specificity. - -**Key finding:** L3 is 100% across all 42 cases. L1 failures are structural -— the prompt is genuinely too vague to determine the right tool. These are -not fixable by tool count reduction, description enrichment, or any -server-side change. - -**Examples of structural L1 failures:** -- "What sizing parameters?" → uses `get_sizing_zone_properties` (explicit) - instead of `list_model_objects` (generic). Reasonable behavior. -- "What loads?" → uses `get_space_details` instead of `get_load_details`. - The prompt doesn't specify what kind of loads. -- "Change thermostat settings" → multiple valid tools. LLM picks one. - -**Implication:** ~90% L1 pass rate is likely the ceiling for 142 tools -with current MCP architecture. The remaining 10% are ambiguous prompts -where multiple tools are reasonable choices. 
- -## Lesson 7: Cross-Client Compatibility Is the Real Constraint - -**Discovery:** -| Client | Tool Limit | Discovery Mechanism | -|--------|-----------|-------------------| -| Claude Code | Unlimited (ToolSearch) | Auto-defer at 10% context | -| Claude Desktop | Unlimited | None (all tools in context) | -| Cursor | 40 hard cap | None | -| Windsurf | 100 | Per-tool toggle | -| OpenAI | 128 (recommends ~10) | defer_loading | -| Gemini CLI | 100 soft / 512 API | includeTools/excludeTools | - -Our 142 tools work on Claude Code (ToolSearch) and Claude Desktop (brute -force). They're blocked on Cursor and marginal on Windsurf/Gemini. - -**No cross-client standard exists.** Each client implements discovery -differently or not at all. The only universal approach is reducing tool -count or splitting into multiple servers. - -## Key Metrics - -### Tool Schema Size Over Time -| Date | Tools | Schema Chars | Est. Tokens | -|------|-------|-------------|-------------| -| Feb 18 | 62 | ~30K | ~7.5K | -| Mar 2 | 126 | ~100K | ~25K | -| Mar 4 (pre-compress) | 127 | ~100K | ~25K | -| Mar 4 (post-compress) | 127 | ~60K | ~15K | -| Mar 12 | 136 | ~55K | ~14K | -| Mar 19 | 142 | ~61K | ~15K | - -### LLM Test Pass Rate Over Time -| Run | Date | Tests | Pass Rate | Primary Change | -|-----|------|-------|-----------|---------------| -| 1 | Mar 5 | 50 | 44.0% | Baseline | -| 2 | Mar 6 | 90 | 83.3% | Server instructions | -| 3 | Mar 7 | 90 | 91.1% | Description fixes | -| 4 | Mar 7 | 90 | 93.3% | Stability run | -| 5 | Mar 10 | 107 | 96.3% | Generic access tests | -| 6 | Mar 11 | 159 | 96.2% | Progressive expansion | -| 7 | Mar 12 | 159 | 97.5% | Test consolidation | -| 8 | Mar 13 | 25 | 92.0% | Measure authoring (separate) | -| 9a | Mar 19 | 9 | 100% | Tool routing baseline | -| 9b | Mar 19 | 9 | 100% | Post-docstring hardening | -| 10 | Mar 19 | 172 | 96.5% | Full regression (tool routing) | -| 11 | Mar 20 | 171 | 95.9% | Full suite with ToolSearch | - -### ToolSearch Discovery Rate 
-| Condition | Discoverable | Not Found | -|-----------|-------------|-----------| -| Short descriptions (pre-enrichment) | ~110/142 | ~32/142 | -| search_api (before enrichment) | 0 queries matched | All queries missed | -| search_api (after enrichment) | "search_api" → 1st, "SDK methods" → 1st | — | -| After Docker rebuild | All 142 tools indexed | 0 missing | - -## Research Citations - -See [research-tool-discovery-at-scale.md](research-tool-discovery-at-scale.md) -for comprehensive industry survey (13 papers, 30+ projects, empirical benchmarks). - -### Tool Overload -- RAG-MCP (arxiv:2505.03275): 100+ tools → 13.6% accuracy, semantic - retrieval → 43%. Sweet spot ≤30 tools (>90%). -- VS Code Copilot: embedding routing, 40→13 core tools, 94.5% coverage. - https://github.blog/ai-and-ml/github-copilot/how-were-making-github-copilot-smarter-with-fewer-tools/ -- MCP context overload analysis: - https://eclipsesource.com/blogs/2026/01/22/mcp-context-overload/ - -### Anthropic Tool Search -- Advanced Tool Use blog (Nov 24, 2025): - https://www.anthropic.com/engineering/advanced-tool-use -- Tool Search API docs: - https://platform.claude.com/docs/en/agents-and-tools/tool-use/tool-search-tool -- Claude Code ToolSearch: shipped v2.1.7 (Jan 14, 2026), auto at 10% context -- ENABLE_TOOL_SEARCH env var: auto (default), true, false, auto:N% - -### MCP Spec & Tags -- MCP Tool schema: name, description, inputSchema, annotations. No tags field. 
-- FastMCP tags: server-side only, enable/disable mechanism -- tools/list_changed: NOT supported by Claude Desktop or Claude Code - https://github.com/apify/mcp-client-capabilities - -### Client Limits -- Cursor 40-tool cap: - https://forum.cursor.com/t/request-increase-mcp-tools-limit/108637 -- Windsurf 100-tool limit: - https://docs.windsurf.com/windsurf/cascade/mcp -- OpenAI 128 limit + defer_loading: - https://developers.openai.com/api/docs/guides/tools-tool-search -- Gemini CLI 100/512: - https://github.com/google-gemini/gemini-cli/issues/21823 - -### Proxy/Router Patterns -- Portkey mcp-tool-filter (embedding proxy): - https://github.com/Portkey-AI/mcp-tool-filter -- openclaw-mcp-router: LanceDB embeddings + mcp_search/mcp_call gateway -- Redis solving MCP tool overload: - https://redis.io/blog/from-reasoning-to-retrieval-solving-the-mcp-tool-overload-problem/ - -## PR History (Supporting Data) - -| PR | Date | Title | Tools Before → After | -|----|------|-------|---------------------| -| #2 | Feb 19 | SWIG memory leak fix | 62 | -| #5 | Feb 22 | Claude Code skills | 62 → 64 | -| #8 | Mar 3 | Input hardening + HVAC auto-wiring | 64 → 126 | -| #18 | Mar 4 | Context reduction (description compression) | 126 → 127 | -| #33 | Mar 12 | Generic access + Phase C tool removal | 127 → 136 | -| #36 | Mar 13 | Measure authoring + cooled beam | 136 → 139 | -| #37 | Mar 14 | Test consolidation | 139 | -| #38 | Mar 16 | Merge develop | 139 | -| (optimize, not yet merged) | Mar 19-20 | Tool routing + wiring recipes | 139 → 142 | diff --git a/docs/knowledge/architecture-and-testing-patterns.md b/docs/knowledge/architecture-and-testing-patterns.md new file mode 100644 index 0000000..15c9144 --- /dev/null +++ b/docs/knowledge/architecture-and-testing-patterns.md @@ -0,0 +1,233 @@ +# Architecture & Testing Patterns for AI-Driven BEM + +Research consolidation: GPD orchestrator analysis, BEM-AI multi-agent paper, MCP ecosystem testing survey. 
Compiled for openstudio-mcp project planning. + +--- + +## 1. Multi-Agent Architectures + +### GPD (Get Physics Done) + +Open-source AI copilot for physics research from Physical Superintelligence PBC (Apache 2.0, v1.1.0). **Not an MCP server** -- it is an MCP client/consumer and prompt-orchestration framework that installs into Claude Code, Gemini CLI, Codex, and OpenCode. + +**Core pattern:** 61 commands drive the host LLM through structured research workflows via slash commands. No simulation engine -- relies on the LLM's inherent physics knowledge, carefully guided. + +**6 knowledge injection mechanisms:** + +| Mechanism | How it works | +|---|---| +| Convention locking | `/gpd:new-project` pins notation, assumptions, sign conventions to `.gpd/PROJECT.md` | +| Structured research memory | `.gpd/` directory: PROJECT.md, STATE.md (<150 lines), ROADMAP.md, observability logs, traces | +| Physics verification stages | 7 dedicated commands: dimensional analysis, limiting cases, convergence, experiment comparison, regression check | +| Specialist agent roles | 3 model tiers (opus/sonnet/haiku) x 5 research profiles (deep-theory, numerical, exploratory, review, paper-writing) | +| Deterministic validators | CLI validators for plan contracts, verification alignment, paper quality, reproducibility -- code-based, not LLM | +| Wave-based execution | Project -> Milestone -> Phase -> Plan -> Task; plans grouped into dependency waves for parallel execution | + +**Key architectural insight:** Don't trust the LLM to validate its own work -- use deterministic code where possible. + +### BEM-AI (PNNL) + +Xu et al., *Energy & Buildings* 2025. Multi-agent orchestrator using A2A protocol. Repo: `pnnl/BEM-AI` (renamed `automa-ai` v0.5.2 on PyPI). + +**Core pattern:** Planner (70B) decomposes task -> specialized agents (4B each) execute with 1-2 tools -> orchestrator assembles results via blackboard. 
+ +**7 agents:** + +| Agent | Model | Role | +|---|---|---| +| Planner | llama3.3:70b | Decompose query into task list | +| Generator | qwen3:4b | Load template model by type/standard/CZ | +| Envelope | qwen3:4b | Modify WWR and insulation | +| Lighting | qwen3:4b | Adjust LPD, daylighting sensors | +| Simulation | qwen3:4b | Run annual simulation | +| Output | qwen3:4b | Retrieve EUI from results | +| Orchestrator | llama3.3:70b | Manage workflow graph, generate summary | + +Agent cards stored as JSON (A2A AgentCard schema), embedded in ChromaDB for semantic search discovery. + +**Small-model optimization techniques:** +1. Decision trees in prompts instead of reasoning +2. Forced chain-of-thought scaffolding (numbered steps) +3. One agent = one tool (reliable selection even at 4B) +4. Strict JSON output format with artifact markers +5. History amnesia ("Do NOT check history") -- state goes to blackboard +6. `<think>` tag stripping (reasoning unreliable, final answer usually correct) +7. Semi-automated tuning: run -> analyze logs -> categorize error -> fix context -> rerun -> if fails at 70B, give up + +**Result:** ~15K total tokens for full WWR comparison workflow. A single Claude call with 142 tools burns ~60K+ on tool descriptions alone. + +**Blackboard pattern:** Shared key-value store replacing conversation context for cross-agent coordination. Agent A writes `original_model_path`, Agent C reads it directly without passing through intermediate agents. Production version (`automa_ai/blackboard/`) has optimistic concurrency, schema validation, revision tracking, audit trail, S3/DynamoDB backends. + +**Tool coverage:** 6 tools (4 OpenStudio + 2 model management). Medium office only. Envelope + lighting only. Zero HVAC. 
+ +### Three-Way Comparison + +| Dimension | GPD | BEM-AI | openstudio-mcp | +|---|---|---|---| +| **Architecture** | Prompt orchestrator / MCP client | Multi-agent orchestrator (A2A) | MCP tool server (JSON-RPC stdio) | +| **What it wraps** | LLM's inherent physics knowledge | OpenStudio (6 tools) | OpenStudio + EnergyPlus (142 tools) | +| **MCP role** | Configures/consumes MCP servers | Consumes via LangChain adapter | IS the MCP server | +| **LLMs** | Frontier (tiered opus/sonnet/haiku) | Small local (4B-70B) | Frontier (Claude Sonnet/Opus) | +| **Agent count** | 1 LLM + specialist profiles | 7 specialized agents | 1 agent, all tools | +| **Memory** | `.gpd/` directory, STATE.md | Blackboard (shared KV store) | Agent's context window + skills | +| **Tool discovery** | Slash commands (fixed set) | RAG over agent cards (ChromaDB) | All 142 tools visible to client | +| **Verification** | 7 physics checks + deterministic validators | 10/10 reliability at temp=0 | `run_qaqc_checks` + 9-category ASHRAE | +| **HVAC coverage** | N/A (physics, not BEM) | None | All 10 ASHRAE + DOAS/VRF/radiant | +| **Building types** | N/A | Medium office only | 17 DOE prototypes | +| **Tests** | Not disclosed | 3 scenarios x 10 repeats | 625 integration + ~200 LLM + ~100 unit | +| **Dependencies** | Python venv, runtime configs | LangChain + LangGraph + ChromaDB + A2A + ADK + LiteLLM + Streamlit | Pure MCP, openstudio SDK | +| **License** | Apache 2.0 | Apache 2.0 | Custom | + +**Fundamental relationship:** Complementary, not competing. GPD orchestrates reasoning; BEM-AI orchestrates agents; openstudio-mcp provides the tool layer. BEM-AI could use openstudio-mcp as its MCP server and get 142 tools instead of 6. + +--- + +## 2. 
Testing Practices Across MCP Ecosystem + +### 8-Server Comparison + +| Repo | Stars | Unit | Integration (MCP protocol) | E2E (real backend) | LLM-in-Loop | Tool Chaining | Schema Snapshots | CI | +|---|---|---|---|---|---|---|---|---| +| modelcontextprotocol/servers | 81.6K | Yes | No | No | No | No | No | Yes | +| microsoft/playwright-mcp | 29.3K | No | Yes (stdio) | Yes (real browser) | No | Yes | No | Yes (3 OS) | +| github/github-mcp-server | 28.1K | Yes | No | Yes (real GitHub API) | No | Yes | Yes (toolsnaps) | Yes (3 OS) | +| supabase-community/supabase-mcp | 2.5K | Yes | Yes (StreamTransport) | Yes (PGlite + Anthropic API) | Yes (Claude) | Yes | No | Yes | +| upstash/context7 | 49.9K | Yes | No | No | No | No | No | Yes | +| executeautomation/mcp-playwright | 5.3K | Yes | No | No | No | No | No | Yes | +| stripe/agent-toolkit | 1.4K | No | No | No | Yes (multi-model) | Yes | No | N/A | +| **openstudio-mcp** | -- | Yes | Yes (stdio, Docker) | Yes (OpenStudio SDK) | Yes (Claude CLI) | Yes | No | Yes (5 shards) | + +### Key Findings + +**The testing gap:** Most MCP servers (even 50K+ stars) have only unit tests with mocked backends. Official SDK guidance covers protocol conformance but not behavioral correctness. 
+ +**Notable patterns from the ecosystem:** +- **Playwright MCP** -- best integration testing: real `Client` over `StdioClientTransport`, real browser +- **GitHub MCP** -- novel **toolsnaps**: tool JSON schemas serialized to `.snap` files, CI fails on schema drift +- **Supabase MCP** -- most sophisticated before openstudio-mcp: LLM-in-the-loop E2E, LLM-as-judge assertions, prompt injection tests +- **Stripe** -- evaluation framework (not test suite): benchmark scenarios with multi-model comparison + +### Three Testing Tiers + +| Tier | What it validates | Docker | LLM | +|---|---|---|---| +| **Deterministic** (unit) | Skill registration, path safety, tool metadata, wiring recipes | No | No | +| **Protocol** (integration) | Full MCP JSON-RPC, real SDK, tool dispatch, stdout suppression | Yes | No | +| **Behavioral** (LLM agent) | Tool selection accuracy, workflow completion, guardrail compliance | Yes (server) | Yes | + +### Gaps in Official Guidance + +| Aspect | Support Level | +|---|---| +| In-memory unit testing | Strong (both SDKs) | +| Protocol conformance | Moderate (conformance package) | +| Integration with real backends | Weak (no patterns) | +| LLM behavioral testing | None | +| Tool description quality validation | None | +| Multi-tool workflow testing | None | + +### Complexity Scaling (Academic) + +TaskBench (NeurIPS 2024): single-tool accuracy 96% drops to 25% at 8 tools. openstudio-mcp operates at 142 tools -- far beyond any benchmark scale -- making its ~96% pass rate a significant data point. + +Temperature matters: BFCL shows 0.0 vs 0.7 can swing accuracy ~10%. Benchmarks disagree with each other (BFCL vs NFCL rankings don't correlate). + +### openstudio-mcp Novel Contributions + +| Contribution | What it is | +|---|---| +| Progressive prompt specificity (L1/L2/L3) | 43 cases x 3 levels. L1 vague, L2 moderate, L3 explicit. 
Pass-rate gradient diagnoses discovery vs execution failures | +| Eval.md-driven test generation | Skill authors write eval tables co-located with implementation. 32 cases auto-generated from 8 skill eval.md files | +| Guardrail regression tests | Verify LLM uses MCP tools instead of writing raw IDF/Python/Bash | +| Full workflow E2E | 31 multi-tool workflows, 10+ tool chains (load -> weather -> HVAC -> simulate -> extract -> compare) | +| Measure quality assertions | Authored measures checked for typed args, defaults, descriptions, valid run_body | +| Custom retry with budget caps | LLM tests retry up to 2x, stable/flaky auto-classification, 180 invocation max | +| CI sharding | 5 parallel Docker shards (~200s each), image built once | + +### Quantitative Comparison + +| Metric | Official Servers | Playwright MCP | GitHub MCP | Supabase MCP | **openstudio-mcp** | +|---|---|---|---|---|---| +| Tools tested | ~20 | ~30 | ~50 | ~30 | **142** | +| Integration tests (MCP protocol) | No | Yes | No | Yes | **Yes (625)** | +| LLM behavioral tests | No | No | No | Yes (~10) | **Yes (~200)** | +| Progressive difficulty | No | No | No | No | **Yes (3 levels)** | +| Multi-tool workflows | No | 2-step | 5-step | 2-step | **10+ step** | +| Guardrail tests | No | No | No | Yes (injection) | **Yes (bypass)** | + +### Emerging Best Practices + +- **In-memory transport** for fast unit tests (SDK pattern) +- **Schema snapshot testing** for API contract stability (GitHub MCP) +- **LLM-as-judge** for fuzzy output assertions (Supabase) +- **Progressive prompt specificity** for discovery vs execution diagnosis (openstudio-mcp) +- **Outcome-based grading** over path-based (Anthropic guidance) +- **Deterministic validation alongside LLM execution** (GPD pattern) + +--- + +## 3. 
Lessons for openstudio-mcp + +### Adopt + +| Pattern | Source | Implementation path | +|---|---|---| +| **Convention/assumption locking** | GPD | `project_init` tool writes `.bem/PROJECT.md` with climate zone, code vintage, baseline system, units, targets. Subsequent tools check it. Existing `ashrae-baseline-guide` skill becomes structural, not advisory | +| **Deterministic precondition checking** | GPD | `validate_workflow` tool checks model loaded, weather attached, design days exist, all zones have HVAC, constructions assigned -- before simulation | +| **Schema snapshot testing** | GitHub MCP | Serialize tool JSON schemas to `.snap` files, CI fails on drift. Catches accidental tool signature changes | +| **Daylighting sensor tool** | BEM-AI | Only real tool gap they exposed | + +### Adopt When Needed + +| Pattern | Source | Trigger | +|---|---|---| +| **Blackboard pattern** | BEM-AI | If/when we go multi-agent or remote multi-user. In single-agent arch, Claude's context IS the blackboard | +| **Project-level state persistence** | GPD | Multi-session workflows where user returns asking "what was baseline EUI?". `.bem/` directory with STATE.md, VARIANTS.md, DECISIONS.md | +| **Wave-based execution** | GPD | Multi-variant BEM workflows. Requires runtime support (subagents) more than MCP changes | +| **Agent card + semantic search** | BEM-AI | Useful for tool routing optimization -- their ChromaDB approach parallels our dynamic tool filtering | + +### Validates Our Approach + +| What we do | Validation | +|---|---| +| 142 MCP tools with real simulation | BEM-AI validates MCP-based BEM automation approach. They invested in architecture with 6 tools; we invested in tool depth | +| Three-tier test pyramid | Survey shows no other MCP server does all three tiers. Most have unit-only | +| Progressive L1/L2/L3 testing | No other project tests tool discoverability systematically. 
Academic benchmarks stop at 8 tools | +| ~96% pass rate at 142 tools | TaskBench shows 25% at 8 tools. Our scale is unprecedented in published results | +| Outcome-based grading in LLM tests | Aligns with Anthropic's "grade outcomes, not paths" guidance | +| Docker-based CI with sharding | More rigorous than any surveyed MCP server | + +### Watch + +| Risk | Source | Why it matters | +|---|---|---| +| Token cost at 142 tools | BEM-AI | Their 15K tokens vs our ~60K+ on tool descriptions alone. Dynamic tool filtering (our tool-routing optimization) is the answer for single-agent arch | +| Small-model support | BEM-AI | Two paths: (a) micro-agent decomposition (1-2 tools/agent), (b) dynamic tool filtering. We're pursuing (b) | +| Benchmark disagreement | Academic | BFCL vs NFCL rankings don't correlate. Need multiple evals, not single benchmark | +| Temperature sensitivity | BFCL | 0.0 vs 0.7 swings accuracy ~10%. Our LLM tests should pin temperature | + +--- + +## 4. Sources + +### Repos +- [GPD](https://github.com/psi-oss/get-physics-done) (v1.1.0) | [PSI blog post](https://theinnermostloop.substack.com/p/the-first-open-source-agentic-ai) +- [BEM-AI / automa-ai](https://github.com/pnnl/BEM-AI) | Xu et al., *Energy & Buildings* 2025 +- [modelcontextprotocol/servers](https://github.com/modelcontextprotocol/servers) (81.6K stars) +- [microsoft/playwright-mcp](https://github.com/microsoft/playwright-mcp) (29.3K stars) +- [github/github-mcp-server](https://github.com/github/github-mcp-server) (28.1K stars) +- [supabase-community/supabase-mcp](https://github.com/supabase-community/supabase-mcp) (2.5K stars) +- [stripe/agent-toolkit](https://github.com/stripe/agent-toolkit) (1.4K stars) + +### Industry Guidance +- Anthropic, "Demystifying Evals for AI Agents" +- AWS, "Evaluating AI Agents: Real-World Lessons" +- Lowin, "Stop Vibe-Testing Your MCP Server" +- merge.dev, "How to test MCP servers effectively" + +### Academic +- BFCL (Berkeley) -- ICML 2025 +- TaskBench 
(Microsoft) -- NeurIPS 2024 +- StableToolBench -- ACL 2024 +- AgentBench (Tsinghua) -- ICLR 2024 +- Mohammadi et al., Agent Eval Survey -- KDD 2025 diff --git a/docs/knowledge/codemode-benchmark-2026-04-05.md b/docs/knowledge/codemode-benchmark-2026-04-05.md new file mode 100644 index 0000000..e1d415b --- /dev/null +++ b/docs/knowledge/codemode-benchmark-2026-04-05.md @@ -0,0 +1,144 @@ +# CodeMode Benchmark: 2026-04-05 + +FastMCP 3.2.0 CodeMode transform tested against openstudio-mcp's 142-tool server via Claude Code (Sonnet). Result: **massive regression across every metric**. Feature kept as opt-in toggle (`OSMCP_CODE_MODE=1`) but NOT recommended for Claude Code clients. + +## TL;DR + +CodeMode reduced pass rate from **95.3% to 24.0%** (71pp drop). Doubled output tokens, tripled ToolSearch calls, 143% longer runtime. Conclusion: Claude Code's built-in ToolSearch already solves the tool discovery problem — adding CodeMode creates a conflicting second discovery layer that degrades performance on every dimension. 
+ +## Setup + +- **FastMCP:** 3.2.0 (upgraded from 3.0.2) +- **Tools:** 142 (no changes) +- **Model:** Claude Sonnet via Claude Code CLI +- **Test suite:** `tests/llm/test_06_progressive.py` (129 tests, 43 cases × L1/L2/L3) +- **Retries:** 0 (first-attempt signal) +- **Toggle:** `OSMCP_CODE_MODE=1` via env var, activates `mcp.add_transform(CodeMode())` after `register_all_skills()` +- **Test harness:** `runner.py` parses `call_tool("name", ...)` patterns from CodeMode execute blocks to preserve existing assertions + +## Results + +| Metric | CodeMode OFF | CodeMode ON | Delta | +|--------|-------------|-------------|-------| +| Pass rate | 123/129 (95.3%) | 31/129 (**24.0%**) | **-71.3pp** | +| L1 (vague) | 40/43 (93.0%) | 8/43 (18.6%) | -74.4pp | +| L2 (moderate) | 42/43 (97.7%) | 12/43 (27.9%) | -69.8pp | +| L3 (explicit) | 41/43 (95.3%) | 11/43 (25.6%) | -69.8pp | +| Input tokens | 1,260 | 1,646 | +30.6% | +| Output tokens | 127,859 | **300,118** | **+134.7%** | +| Cache tokens | 12.3M | 20.3M | +65.5% | +| Duration | 69 min | **168 min** | **+143%** | +| Cost (notional) | $9.29 | $22.35 | +140% | +| ToolSearch avg/test | 1.6 | **5.8** | +263% | +| code_executions | 0 | 2.0/test | — | + +Raw data: +- `docs/sweeps/codemode-off-2026-04-05/benchmark.json` +- `docs/sweeps/codemode-on-2026-04-05/benchmark.json` + +## Failure Mode Analysis (CodeMode ON) + +| Mode | Count | Description | +|------|-------|-------------| +| wrong_tool | 67 | LLM wrote Python code calling wrong tool name or with wrong args | +| timeout | 30 | Exceeded 120s wall clock — CodeMode sandbox + meta-tool chain is slower | +| no_mcp_tool | 1 | LLM didn't call any MCP tool (gave up) | +| **Total failed** | **98** | | + +L1/L2/L3 all regressed similarly (-70pp to -74pp) — CodeMode doesn't discriminate between vague and explicit prompts. The failure is structural, not a matter of prompt sensitivity. + +## Root Causes + +### 1. 
Double discovery layer +Claude Code already implements deferred tool loading via its built-in ToolSearch when tool definitions exceed 10K tokens. Our 142 tools hit this threshold and get auto-deferred. Adding CodeMode on top creates a second discovery layer: + +1. Claude Code calls ToolSearch to find relevant domain tools +2. Can't find them (CodeMode hid them behind 3 meta-tools) +3. Falls back to the CodeMode meta-tools (search, get_schema, execute) +4. Writes Python code to call the tools +5. Makes errors the LLM wouldn't make calling tools directly + +Evidence: ToolSearch calls went UP from 1.6 to 5.8/test. They should have gone to zero if CodeMode had cleanly replaced discovery. + +### 2. Sonnet struggles with 142-tool sandbox catalog +The FastMCP author explicitly warned: "Sonnet 4.6 class model was able to use code mode with a complex server, but Haiku 4.5 class model made a few errors." With 142 tools, even Sonnet makes frequent errors writing the `call_tool()` invocations correctly. + +Community examples where CodeMode worked (Amazon Ads MCP, 98% reduction) had a few dozen tools, not 142. The complexity scales poorly. + +### 3. Code generation adds tokens, not removes them +The promise: CodeMode reduces tokens by not shipping tool definitions. +The reality: The LLM writes Python orchestration code (`result = call_tool("create_baseline_osm", name="test"); print(result)`) that costs more tokens to generate than a direct tool call JSON. + +Output tokens more than doubled (128K → 300K). Total token cost increased despite input tokens staying similar. + +### 4. Meta-tool overhead +Each CodeMode workflow requires at minimum 3 meta-tool calls: search → get_schema → execute. Direct tool use is 1 call. Even when CodeMode succeeds, it takes 3x the turns for the same operation. + +## Why CodeMode's Promise Doesn't Apply to Us + +CodeMode is designed for API clients that ship all 142 tool definitions upfront (57K tokens of waste). 
Its value proposition: + +> "Entire tool catalog loads into context upfront, every tool call is a round-trip burning tokens on intermediate results." + +**We don't have this problem.** Claude Code already: +- Defers tool definitions at the 10K token threshold +- Only loads 3-5 relevant tools per turn via ToolSearch +- Keeps intermediate results out of context where possible + +Our 1,260 uncached input tokens across the entire 129-test run (about 10 per test, near-zero thanks to prompt caching) show that the token waste CodeMode targets does not exist in our setup. Adding CodeMode can only add overhead. + +## Recommendation + +**Do not use CodeMode with Claude Code clients.** + +### For Claude Code users +- Keep `OSMCP_CODE_MODE=0` (default) +- Claude Code's ToolSearch is already solving the discovery problem +- 95.3% pass rate at 1-2 ToolSearch calls per test is near-optimal + +### For API users (hypothetical future use case) +CodeMode might still help if we expose openstudio-mcp to API clients that do NOT have deferred loading (raw Anthropic API clients, non-Claude models via OpenAI API, etc.). In that case: +- Set `OSMCP_CODE_MODE=1` at deployment +- Expect some accuracy cost in exchange for token savings +- Test thoroughly — our 24% result suggests even then it may not be worth it + +### Toggle preservation +The toggle stays in place: +- `pyproject.toml`: `fastmcp>=3.1.0,<4.0` +- `mcp_server/config.py`: `ENABLE_CODE_MODE` env var +- `mcp_server/server.py`: conditional `mcp.add_transform(CodeMode())` +- `docker/Dockerfile`: `ENV OSMCP_CODE_MODE=0` +- `tests/llm/runner.py`: `LLM_TESTS_CODE_MODE` env var + `code_mode_tool_calls` parser +- `tests/llm/conftest.py`: benchmark tracks CodeMode active state + +Future experiments (new FastMCP versions, different sandbox providers, configuration tweaks) can toggle it on without code changes. + +## Open Questions for Future Testing + +If revisiting CodeMode: + +1. Does it work better with **fewer tools**? 
Test with a subset (e.g., 20 core tools) to see if the 142-tool scale is the problem. +2. Does **configuring fewer discovery stages** help? CodeMode supports collapsing the 3-stage flow to 2-stage. Worth trying. +3. Does **Opus** do better than Sonnet? Haiku was warned against by the FastMCP author; Opus was not tested. +4. Does **disabling Claude Code ToolSearch** (if possible) eliminate the double-discovery conflict? +5. Does **a custom search function** (embeddings instead of BM25) improve tool matching accuracy? +6. Does **CodeMode + `allowed_callers` PTC** work together in API mode, bypassing the Claude Code layer entirely? + +## Related Research + +- `docs/knowledge/fastmcp-code-mode-and-advanced-tool-use.md` — FastMCP 3.1/3.2 features, Anthropic advanced tool use +- `docs/knowledge/tool-discovery-and-llm-testing.md` — timeline of tool count growth, prior benchmark results +- `docs/knowledge/reddit-mcp-discovery-thread.md` — community approaches to tool discovery at scale + +## Files Modified for This Experiment + +The toggle code remains in place. No reversion needed. + +| File | Purpose | +|------|---------| +| `pyproject.toml` | Pin `fastmcp>=3.1.0,<4.0` | +| `mcp_server/config.py` | `ENABLE_CODE_MODE` env var | +| `mcp_server/server.py` | Conditional `mcp.add_transform(CodeMode())` | +| `docker/Dockerfile` | `ENV OSMCP_CODE_MODE=0` default | +| `tests/llm/runner.py` | Pass env to Docker, parse `call_tool(...)` from execute code | +| `tests/llm/conftest.py` | Track code_mode_active/code_executions in benchmark | diff --git a/docs/knowledge/fastmcp-code-mode-and-advanced-tool-use.md b/docs/knowledge/fastmcp-code-mode-and-advanced-tool-use.md new file mode 100644 index 0000000..14b815e --- /dev/null +++ b/docs/knowledge/fastmcp-code-mode-and-advanced-tool-use.md @@ -0,0 +1,166 @@ +# FastMCP Code Mode & Anthropic Advanced Tool Use + +Research compiled 2026-04-05. 
Covers FastMCP 3.1/3.2 releases, Anthropic's Advanced Tool Use blog, Code Execution with MCP blog, and community discussion. + +--- + +## FastMCP 3.1 "Code to Joy" (2026-03-03) + +### Code Mode (Experimental) + +`CodeMode` transform replaces the full tool catalog with 3 meta-tools: **search** (BM25), **get_schemas**, **execute** (sandboxed Python). LLM discovers tools on-demand, writes Python chaining `call_tool()`, intermediate results never touch context. + +```python +from fastmcp import FastMCP +from fastmcp.experimental.transforms.code_mode import CodeMode +mcp = FastMCP("Server", transforms=[CodeMode()]) +``` + +- Existing tools unchanged -- CodeMode wraps them +- 3-stage default (search -> schemas -> execute), configurable to 2-stage or no-discovery +- Sandbox: Monty (Pydantic project), resource limits on time/memory/recursion +- No special client support needed -- meta-tools look like normal MCP tools +- Model requirement: Sonnet 4.6 works well, Haiku 4.5 makes errors + +### Other 3.1 Features +- `SearchTools` transform available standalone (BM25 search without execution) +- `MultiAuth` for composing token verification sources +- Lazy-loaded heavy imports (faster startup) +- `search_result_serializer` hook for customizing search output + +## FastMCP 3.2 "Show Don't Tool" (2026-03-30) + +### FastMCPApp (Interactive UIs) +- `@app.ui()` renders charts/dashboards/forms inside conversations via Prefab (Python DSL -> React) +- Separates LLM-facing tools from backend tools +- Built-in providers: FileUpload, Approval, Choice, FormInput, GenerativeUI +- Dev server: `fastmcp dev apps` for browser preview + +### Security Hardening +- SSRF/path traversal fixes, JWT algorithm restrictions, OAuth per-tool auth, CSRF protection +- `readOnlyHint=True` on ResourcesAsTools generated tools + +### Notable for Us +- Fix: stale catalog in CodeMode execute +- `readOnlyHint=True` pattern — we should adopt for our read-only tools +- MCP conformance tests added to CI + +--- + +## 
Anthropic Advanced Tool Use (API Features, Beta) + +Three new API-level features (beta header: `advanced-tool-use-2025-11-20`): + +### 1. Tool Search Tool +- `defer_loading: true` per tool — excluded from initial context, discovered via search +- Built-in regex + BM25 search, or custom embeddings +- Per-MCP-server config with per-tool overrides +- Doesn't break prompt caching +- **85% token reduction** (77K -> 8.7K for 50+ tools) +- Accuracy: Opus 4 49%->74%, Opus 4.5 79.5%->88.1% +- Threshold: use when >10 tools or >10K tokens in definitions + +### 2. Programmatic Tool Calling (PTC) +- Claude writes Python orchestration; intermediate tool results stay in sandbox +- `allowed_callers: ["code_execution_20250825"]` opts tools in +- Only final `stdout` enters context +- **37% token reduction** on complex tasks +- Best for: large datasets needing aggregates, 3+ dependent tool calls, parallel operations +- `caller` field in tool requests identifies PTC calls vs direct + +### 3. Tool Use Examples +- `input_examples` array in tool definitions +- **72%->90% accuracy** on complex parameter handling +- Shows format conventions, optional parameter correlations, nested structure patterns +- Best for: complex schemas, many optional params, domain-specific conventions + +### Best Practices from Anthropic +- Layer features: context bloat -> Tool Search; large intermediate results -> PTC; parameter errors -> Examples +- Keep 3-5 most-used tools always loaded, defer rest +- Document return formats clearly for PTC (Claude writes parsing code) +- Realistic example data (not "string" or "value") + +--- + +## Anthropic Code Execution with MCP (Nov 2025) + +Earlier blog establishing the code-as-API pattern: +- Tools as filesystem: `./servers/google-drive/getDocument.ts` — agent browses filesystem to discover +- **98.7% token reduction** (150K -> 2K) +- Progressive disclosure: `search_tools` with detail level parameter (name-only, name+description, full schema) +- Context-efficient 
results: filter/aggregate in code before returning to model +- Privacy-preserving: intermediate data never enters model context +- State persistence: agents save code as reusable skills (`SKILL.md` pattern = our skills system) + +--- + +## Community Token Economics (Reddit r/mcp) + +| Setup | Before Code Mode | After Code Mode | Reduction | +|-------|-----------------|-----------------|-----------| +| Amazon Ads MCP (top 5 tools) | 34K tokens upfront | ~600 tokens/workflow | 98.2% | +| Generic 50K setup (u/No_More_Fail) | 50K tokens | 2-3K tokens | 95% | +| 5-server setup (Anthropic) | 55K tokens | 8.7K tokens | 85% | +| Cloudflare (1000 endpoints) | ~1M tokens | ~1K tokens | 99.9% | +| openstudio-mcp (142 tools) | ~57K tokens | ~600-3K est. | ~95% est. | + +Key community insights: +- Code mode reduces "half-plans" where model commits to wrong tool too early +- Multi-server: compose servers in FastMCP, then wrap outer with CodeMode +- Legacy backends: use API gateway (Kong, Tyk) to flatten surface before MCP +- Client-side code mode requested but not yet available + +--- + +## Impact on openstudio-mcp + +### Current State +- FastMCP 3.0.2 installed (`fastmcp>=0.4.0` in pyproject.toml) +- 142 tools, ~57K tokens of definitions +- Claude Code ToolSearch already defers our tools (>10K threshold) +- Skills system = hand-crafted progressive disclosure + +### Upgrade Path: FastMCP 3.1+ Code Mode + +**What it gives us:** +- One-line addition: `transforms=[CodeMode()]` wraps all 142 tools +- 3 meta-tools replace 142 tool definitions in context (~95% token reduction) +- Sandboxed execution: agent writes Python to chain our tools, intermediate results (timeseries data, zone lists, component properties) stay out of context +- No tool code changes needed + +**Concerns:** +- Experimental status +- Haiku-class models struggle with it (we sometimes target haiku) +- Sandbox security for code execution on MCP server side +- Our tools already work well with ToolSearch — incremental 
benefit unclear +- Breaking change in 3.2: app tool calls route via `___`-prefixed names + +### API-Level Features (for API users, not Claude Code) + +| Feature | Effort | Impact | Notes | +|---------|--------|--------|-------| +| `input_examples` on complex tools | Low | High | Add to ~15 tools with complex params | +| `defer_loading` per-tool config | None (client-side) | High | API users can defer our 142 tools | +| PTC `allowed_callers` | Low | High | Mark read-only data tools as PTC-compatible | +| Description quality for search | Already done | Maintained | Our descriptions are keyword-rich | + +### Recommended Actions + +1. **Now:** Add `input_examples` to top 15 complex tools (works with current FastMCP) +2. **Soon:** Upgrade to FastMCP 3.1+, test CodeMode with our integration tests +3. **Soon:** Mark data-heavy read tools as PTC `allowed_callers` compatible +4. **Watch:** FastMCP 3.2 Apps — potential for simulation result visualization +5. **Watch:** Client-side code mode — would help Claude Desktop users with our server + +--- + +## Sources + +- [Anthropic: Advanced Tool Use](https://www.anthropic.com/engineering/advanced-tool-use) +- [Anthropic: Code Execution with MCP](https://www.anthropic.com/engineering/code-execution-with-mcp) +- [FastMCP 3.1.0 Release](https://github.com/PrefectHQ/fastmcp/releases/tag/v3.1.0) +- [FastMCP 3.2.0 Release](https://github.com/PrefectHQ/fastmcp/releases/tag/v3.2.0) +- [Reddit: Stop Calling Tools, Start Writing Code Mode](https://www.reddit.com/r/mcp/comments/1rkx4pa/) +- [FastMCP Code Mode Blog](https://www.jlowin.dev/blog/fastmcp-3-1-code-mode) +- [FastMCP Code Mode Docs](https://gofastmcp.com/servers/transforms/code-mode) +- [Cloudflare Code Mode Blog](https://blog.cloudflare.com/code-mode/) diff --git a/docs/geometry-workflows-research.md b/docs/knowledge/geometry-workflows-research.md similarity index 100% rename from docs/geometry-workflows-research.md rename to docs/knowledge/geometry-workflows-research.md diff 
--git a/docs/knowledge/mcp-best-practices-gap-analysis.md b/docs/knowledge/mcp-best-practices-gap-analysis.md new file mode 100644 index 0000000..386f240 --- /dev/null +++ b/docs/knowledge/mcp-best-practices-gap-analysis.md @@ -0,0 +1,495 @@ +# MCP Best Practices: Research & Gap Analysis + +*March 2026 — based on MCP spec 2025-11-25, industry survey, codebase audit* + +--- + +## Executive Summary + +openstudio-mcp is the largest simulation-engine MCP server in production (142 tools, 26 skills). It leads peers in testing rigor (480+ integration tests, LLM agent tests, 5-shard CI) and HVAC mutation depth. Key gaps: no tool annotations, no async tasks for simulation, no structured output, and all 142 tool schemas ship to every client on connect (~60K tokens). The highest-value changes are tool annotations (low effort, immediate UX gains) and progressive tool discovery (high effort, 90%+ token reduction). + +--- + +## 1. Comparable MCP Servers + +### Building Energy Modeling + +| Project | Tools | Transport | State | Testing | MCP Features | +|---------|-------|-----------|-------|---------|-------------| +| **openstudio-mcp** | 142 | stdio | global in-memory singleton | 480+ integration, LLM agent, 5-shard CI | tools, 6 prompts, 4 resources | +| **EnergyPlus-MCP** (LBNL) | 35 | stdio | file-based (IDF path) | MCP Inspector only | tools only | +| **BEM-AI** (PNNL) | ~6 per server | SSE (A2A) | shared blackboard | TBD | A2A + MCP hybrid | + +**Key takeaway**: We have 4x the tools of EnergyPlus-MCP, the only HVAC mutation tools in the BEM space, and dramatically better test coverage. BEM-AI wraps us via A2A — validates our tool API surface. EnergyPlus-MCP is stateless (file-based), which scales horizontally more easily. 
+ +### Engineering / CAD / Scientific Computing + +| Project | Tools | Notable Pattern | +|---------|-------|-----------------| +| **STK-MCP** (Ansys) | 3 tools + 5 resources | Uses MCP Resources for query state; HTTP transport | +| **Fusion 360 MCP** | 3 tools, 3 resources, 2 prompts | Only project using all 3 MCP primitives | +| **MATLAB MCP** (MathWorks) | 5 | Official vendor server; Go implementation; lazy MATLAB init | +| **Jupyter MCP** (Datalayer) | 20+ | Streamable HTTP + stdio; multi-notebook sessions | +| **Revit MCP** | 24 | WebSocket bridge to desktop app; most mature BIM MCP | +| **Blender MCP** | ~10 | TCP socket bridge to Blender addon | +| **OpenFOAM MCP** | 12 | Socratic questioning; user expertise tracking | +| **FEA-MCP** | 10 | Unified API across ETABS + LUSAS backends | +| **mcp.science** | 12 servers | Federated: many small single-purpose servers | + +**Key takeaway**: Almost no peer uses MCP resources, prompts, or sampling. STK-MCP and Fusion 360 are exceptions. Most have no formal test suites. We're ahead on feature breadth but behind on MCP spec feature adoption. + +--- + +## 2. Best Practices Inventory + +### 2.1 Tool Annotations + +**Best practice**: Every tool should declare `readOnlyHint`, `destructiveHint`, `idempotentHint`, `openWorldHint`. Clients use these for auto-approval (skip confirmation for read-only tools from trusted servers), confirmation dialogs (destructive), and safe retries (idempotent). + +**Spec reference**: Tool annotations added 2025-03-26; blog post 2026-03-16. + +**Our status**: **NOT IMPLEMENTED.** Zero annotations on 142 tools. All tools default to `destructiveHint=true, readOnlyHint=false` — meaning clients like Claude Desktop prompt for confirmation on every call, even `list_thermal_zones`. + +**Impact**: High — immediate UX improvement in Claude Desktop, VS Code, and any annotation-aware client. Users currently click "allow" for every read-only query. 
+ +**Classification of our 142 tools**: +- ~70 read-only (`list_*`, `get_*`, `extract_*`, `query_*`, `search_*`, `inspect_*`, `compare_*`, `read_file`) — should be `readOnlyHint=true` +- ~50 mutating (`create_*`, `add_*`, `set_*`, `apply_*`, `replace_*`, `assign_*`, `enable_*`, `adjust_*`, `shift_*`, `match_*`) — `destructiveHint=false` (reversible) +- ~10 destructive (`delete_object`, `remove_*`, `clean_unused_objects`, `cancel_run`) — `destructiveHint=true` +- ~12 idempotent (`set_*`, `change_building_location`, `set_simulation_control`) — `idempotentHint=true` +- All 142 — `openWorldHint=false` (local-only, no external network calls) + +### 2.2 Progressive Tool Discovery + +**Best practice**: At 100+ tools, don't ship all schemas to the client. Use meta-tools for discovery: +- `list_tools(prefix?)` — browse tool categories +- `describe_tools(names)` — lazy-load schemas +- `execute_tool(name, args)` — call by name + +Benchmarked at 90-96% token reduction (Speakeasy, 400 tools). Constant initial tokens (~2,500) regardless of toolset size. + +**Our status**: **PARTIALLY IMPLEMENTED.** We have `recommend_tools` (keyword routing) and `list_skills`/`get_skill` (workflow guidance). But all 142 tool schemas still ship on `tools/list` — the token cost is paid upfront regardless. + +True progressive discovery requires the tools NOT be registered with FastMCP at init, and instead routed through a meta-tool dispatcher. This is a fundamental architecture change. + +**Alternatives**: +- Anthropic's "code-as-API" pattern: expose tool definitions as files the agent reads on demand (98.7% reduction reported) +- MCP spec proposal for hierarchical `tools/categories` + `tools/discover` + `tools/load` + `tools/unload` (discussion phase, not in spec yet) +- Semantic search via embeddings over tool descriptions + +**Impact**: Very high for token cost. At ~450 tokens/tool, 142 tools = ~64K tokens of schema per session. Progressive discovery would reduce to ~3K initial + ~2K per task. 
+ +### 2.3 Tool Annotations: Tags & Grouping + +**Best practice**: Use `tags` on tools for client-side filtering and organization. Group tools by domain. + +**Our status**: **IMPLEMENTED.** All 142 tools have tags: `core`, `geometry`, `hvac`, `loads`, `measures`, `simulation`, `results`, `envelope`, `meta`. Our `recommend_tools` router uses these groups. + +### 2.4 Error Handling + +**Best practice (3-tier model)**: +1. Transport errors — connection failures (client infra handles) +2. Protocol errors — JSON-RPC codes -32700 to -32802 (SDK handles) +3. Application errors — `isError: true` in tool result (LLM reasons about) + +Tool error messages should be: +- Written for LLMs, not developers +- Include actionable guidance ("Call load_osm_model first") +- Include retry guidance where applicable +- Sanitize internals (no stack traces, no secrets) + +**Our status**: **MOSTLY GOOD.** `{"ok": False, "error": "..."}` pattern is clean. Errors are sanitized (no stack traces to client). Many errors include actionable guidance ("No model loaded. Call load_osm_model first."). No retry guidance. + +**Gaps**: +- Errors don't use MCP's `isError` flag on the tool result content — they return `{"ok": false}` as regular content. This means the LLM must parse JSON to detect failure, rather than the protocol signaling it. +- No suggested-next-action field for recovery guidance + +### 2.5 MCP Resources + +**Best practice**: Use resources for read-only context the LLM should have automatically, without requiring a tool call. Resources are application-controlled (host decides which to include), unlike tools (model-controlled). + +Use cases: +- Current model state summary (auto-attached to context) +- Standards reference data (ASHRAE tables) +- Simulation results summary (auto-updated via subscriptions) + +**Our status**: **PARTIALLY IMPLEMENTED.** 4 static resources (ASHRAE baselines, modern HVAC, common materials, tool catalog). 
No dynamic resources, no subscriptions, no resource templates. + +**Gaps**: +- No dynamic resource for loaded model state — every session starts blind and must call `get_model_summary` +- No simulation results resource — results require explicit `extract_*` tool calls +- No resource subscriptions — client can't know when model changes + +### 2.6 MCP Prompts + +**Best practice**: Prompts are user-controlled workflow templates. They appear as slash commands in VS Code. Should return structured `PromptMessage` arrays with roles, not flat strings. + +**Our status**: **PARTIALLY IMPLEMENTED.** 6 prompts exist (baseline comparison, envelope retrofit, etc.). All return plain text strings, not structured `PromptMessage` arrays. + +**Gap**: Prompts could embed resources (e.g., results deep dive could embed `openstudio://run/{id}/summary`) and use multi-turn message structures. + +### 2.7 Async Tasks (Long-Running Operations) + +**Best practice**: Operations >5s should use MCP Tasks (experimental in 2025-11-25 spec). Client gets immediate task ID, polls via `tasks/get`, retrieves results when done. Eliminates custom polling patterns. + +**Our status**: **NOT IMPLEMENTED.** `run_simulation` returns a `run_id` and the LLM polls `get_run_status` every 1-2 minutes. This is a custom polling pattern that MCP Tasks would replace at the protocol level. + +**Impact**: Medium-high. EnergyPlus sims take 30-120s. MCP Tasks would: +- Eliminate the instructions telling LLMs to poll every 1-2 minutes +- Let the client show native progress UI +- Allow the agent to do other work while sim runs + +**Caveat**: Tasks are experimental in the spec. Client support (Claude Desktop, Claude Code) may be limited. + +### 2.8 Progress Reporting + +**Best practice**: Attach `progressToken` to long requests. Server sends `notifications/progress` with `{progress, total, message}`. + +**Our status**: **NOT IMPLEMENTED.** No progress notifications. Sim progress visible only via polling `get_run_status`. 
+ +### 2.9 Structured Output (outputSchema) + +**Best practice**: Tools declare `outputSchema` (JSON Schema) and return `structuredContent` alongside text `content`. Enables client-side validation and typed parsing. + +FastMCP auto-generates schemas from Pydantic models or typed dicts. + +**Our status**: **NOT IMPLEMENTED.** All tools return `{"ok": True, ...}` as text content. No `outputSchema`, no `structuredContent`. We have a `tool_responses.schema.json` but it's only used in unit tests, not declared to clients. + +**Impact**: Medium. Would let future clients validate responses and build typed integrations. Low urgency since our JSON response pattern is well-established. + +### 2.10 Transport + +**Best practice**: stdio for local/single-client. Streamable HTTP for remote/multi-user. SSE is deprecated. + +**Our status**: **CORRECT for current use case.** stdio only. For the planned remote multi-user deployment, Streamable HTTP would be needed. + +### 2.11 Security + +**Best practice**: Path traversal prevention, input validation, no eval/exec, no secrets in errors. For remote: OAuth 2.1, per-tool scopes, TLS. + +**Our status**: **GOOD for local deployment.** +- Allowlist-based path validation (`is_path_allowed`) +- No `eval()`, `exec()`, or `getattr()` dispatch +- No secrets in error messages +- `parse_str_list()` handles JSON-string array inputs safely + +**Gap**: No OAuth, no per-tool scopes — not needed for stdio but will be for remote. + +### 2.12 Testing + +**Best practice (3-tier)**: +1. Unit — tool logic, input validation (pytest, mock dependencies) +2. Integration — full protocol flow with real server (Docker/Testcontainers) +3. LLM/Agent — tool selection and multi-step workflows + +FastMCP in-memory testing (no subprocess overhead) is the emerging best practice for unit tests. 
+ +**Our status**: **INDUSTRY-LEADING.** +- 480+ integration tests in Docker with real OpenStudio SDK +- LLM agent tests (~160 tests) with Claude evaluating tool selection +- 5-shard CI pipeline balanced at ~200s each +- Strict test quality rules (regression/validates comments, exact values, no mocks in integration) +- `unwrap()` helper, `create_and_load()` fixtures, `poll_until_done()` + +**Minor gap**: Not using FastMCP in-memory client for unit tests (would be faster than subprocess). + +### 2.13 Observability / Logging + +**Best practice**: MCP servers should emit structured logs via `notifications/message`. Levels: debug through emergency. OpenTelemetry semantic conventions for tracing. + +**Our status**: **MINIMAL.** Python `logging` only in skill auto-discovery. No per-tool logging, no MCP log notifications, no structured logging, no tracing. + +**Impact**: Low for current single-user Docker deployment. Would matter for remote/multi-user debugging. + +### 2.14 Server Instructions + +**Best practice**: Server provides `instructions` field at init to guide LLM behavior. Should be concise, focused on what the LLM must know to use tools correctly. + +**Our status**: **GOOD.** 42-line instructions embedded in `server.py`. Covers "use tools, don't write code" directive, tool-specific guidance, polling instructions. Well-targeted. + +### 2.15 Pagination + +**Best practice**: Server-side pagination with metadata (total count, truncation flag). + +**Our status**: **GOOD.** `list_paginated()` with `max_results`, `total_available`, `truncated` flags. LLM-friendly. + +### 2.16 Capability Negotiation + +**Best practice**: Declare capabilities explicitly. Only use features both sides support. + +**Our status**: **AUTOMATIC.** FastMCP handles capability declaration based on registered tools/prompts/resources. + +### 2.17 Cancellation + +**Best practice**: Wire protocol-level `notifications/cancelled` to actual cancellation of long operations. 
+ +**Our status**: **CUSTOM IMPLEMENTATION.** `cancel_run` tool exists but isn't wired to MCP protocol-level cancellation. Functional but non-standard. + +--- + +## 3. Gap Analysis Summary + +### What We Do Well (keep doing) + +| Area | Status | Notes | +|------|--------|-------| +| Tool organization (skills) | Strong | 26 skills, clean tools/operations separation | +| Error handling pattern | Strong | `{"ok": bool}` is clean, sanitized, often actionable | +| Path traversal security | Strong | Allowlist-based, no eval/exec | +| Integration testing | Industry-leading | 480+ tests, 5-shard CI, real SDK | +| LLM agent testing | Unique | Only BEM MCP with LLM evaluation tests | +| Pagination | Good | Server-side with metadata | +| Server instructions | Good | 42-line focused guidance | +| Input validation | Good | `parse_str_list()`, Choice arg validation | +| Skill discovery | Good | `list_skills`/`get_skill` for workflows | +| Stdout suppression | Clever | Solves real SWIG/JSON-RPC corruption bug | + +### What Needs Work + +| Area | Gap | Effort | Impact | +|------|-----|--------|--------| +| Tool annotations | Zero annotations on 142 tools | **Low** | **High** — immediate UX in Claude Desktop/VS Code | +| Token cost | All 142 schemas ship on connect (~64K tokens) | **High** | **Very High** — 90%+ reduction possible | +| MCP Tasks | Custom sim polling vs protocol-level tasks | **Medium** | **High** — native async, client progress UI | +| Dynamic resources | No model-state or results resources | **Medium** | **Medium** — auto-context for LLM | +| Structured output | No outputSchema on any tool | **Medium** | **Medium** — typed responses for clients | +| MCP logging | No protocol-level log notifications | **Low** | **Low** (until remote) | +| `isError` flag | Errors returned as regular content | **Low** | **Low-Medium** — protocol-correct error signaling | +| Progress reporting | No progress notifications for sims | **Medium** | **Medium** — replaces polling | +| 
Prompt structure | Flat strings, not PromptMessage arrays | **Low** | **Low** | + +--- + +## 4. Recommended Changes (Plan Only) + +### Phase 1: Quick Wins (1-2 days) + +#### 1a. Tool Annotations +Add `readOnlyHint`, `destructiveHint`, `idempotentHint`, `openWorldHint` to all 142 tools. + +**Approach**: Create a classification map in a central module. Apply via a helper or directly in each `@mcp.tool()` call. FastMCP supports the `annotations=ToolAnnotations(...)` parameter. + +```python +from mcp.types import ToolAnnotations + +# Read-only tools +@mcp.tool(name="list_thermal_zones", tags={"geometry"}, + annotations=ToolAnnotations( + readOnlyHint=True, + destructiveHint=False, + openWorldHint=False, + )) +``` + +**Classification pass needed**: +- Audit all 142 tools +- Assign each to: read-only / mutating / destructive / idempotent +- Set `openWorldHint=False` on all (we never make network calls) + +**Test**: Unit test asserting every registered tool has annotations. + +#### 1b. `isError` Flag on Error Responses +When a tool returns `{"ok": False}`, set `isError=True` on the MCP tool result content. This is a middleware-level change — inspect the JSON response and set the flag. + +**Approach**: Extend `_StdoutSuppressionMiddleware` (or add a second middleware) to parse the tool result, check for `"ok": false`, and set `isError=True`. + +#### 1c. Error Recovery Guidance +Add `"suggestion"` field to error responses for common failures: +- No model loaded → `"suggestion": "Call load_osm_model or create_new_building first"` +- Object not found → `"suggestion": "Call list_model_objects to see available objects"` +- Path not allowed → `"suggestion": "Files must be under /runs or /inputs"` + +### Phase 2: Spec Feature Adoption (3-5 days) + +#### 2a. 
Dynamic Resources for Model State +Add resources that reflect current loaded model: + +- `openstudio://model/summary` — building info, zone count, loop count (auto-updates on model change) +- `openstudio://model/zones` — thermal zone list +- `openstudio://run/{run_id}/results` — simulation results summary + +Implement resource subscriptions so clients get `notifications/resources/updated` on model save, measure apply, simulation complete. + +**Approach**: model_manager emits events; resource handlers listen and notify. + +#### 2b. MCP Protocol Logging +Emit structured log notifications for key events: +- Model load/save +- Simulation start/complete/error +- Measure application +- Error conditions + +**Approach**: Add `ctx.log(level, message)` calls in operations. FastMCP propagates as `notifications/message`. + +#### 2c. Progress Notifications for Simulation +During `run_simulation`, parse EnergyPlus stdout for stage indicators (warmup, sizing, annual simulation months) and emit `notifications/progress`. + +**Approach**: Simulation runner already reads subprocess output. Add progress token tracking and emit notifications at stage boundaries. + +### Phase 3: Async Tasks for Simulation (5-7 days) + +#### 3a. MCP Tasks for `run_simulation` +Replace custom `run_simulation` → `get_run_status` polling with protocol-level Tasks: +- `run_simulation` returns `CreateTaskResult` with task ID immediately +- Client polls via `tasks/get` or receives push notifications +- `tasks/result` returns final results when sim completes + +**Prerequisites**: Verify FastMCP Tasks support (experimental). May need SDK upgrade or custom implementation. + +**Impact**: Eliminates the "poll every 1-2 minutes" instruction from server.py. Client shows native progress UI. + +#### 3b. Wire Protocol Cancellation +Connect `notifications/cancelled` for `run_simulation` tasks to the existing `cancel_run` subprocess kill logic. + +### Phase 4: Token Optimization (7-14 days) + +#### 4a. 
Progressive Tool Discovery +Replace static 142-tool registration with dynamic discovery: + +**Option A — Meta-tool dispatcher** (most impactful, highest effort): +- Register only 3 tools: `list_available_tools(category?)`, `get_tool_schema(name)`, `call_tool(name, args)` +- Tools loaded lazily on `get_tool_schema` +- ~95% token reduction +- Requires reworking how FastMCP registers tools + +**Option B — Lazy schema loading** (moderate impact, medium effort): +- Register all tools but with minimal descriptions +- Full schema/description loaded on demand via `describe_tool(name)` +- ~60% token reduction +- Easier to implement within FastMCP + +**Option C — Client-side filtering** (lowest effort): +- Ship all schemas but use tool annotations + tags to let smart clients filter +- No token reduction but better organization +- Depends on client support + +**Recommendation**: Start with Option C (annotations, already in Phase 1). Plan Option A for when the MCP spec finalizes hierarchical tool management (expected 2026). + +#### 4b. Structured Output (outputSchema) +Add `outputSchema` to high-frequency tools: `extract_summary_metrics`, `list_thermal_zones`, `get_model_summary`, `get_building_info`, `list_air_loops`, `list_plant_loops`. + +**Approach**: Define Pydantic response models. FastMCP auto-generates schemas. Return `structuredContent` alongside text `content` for backward compatibility. + +### Phase 5: Remote / Multi-User (future) + +#### 5a. Streamable HTTP Transport +Add Streamable HTTP alongside stdio. FastMCP claims support. Needed for: +- Multi-user access +- Web client integration +- Cloud deployment + +#### 5b. Session Isolation +Replace global `model_manager` singleton with per-session state. Each connected client gets its own model instance. + +**Approach**: Session-keyed dict of model states. FastMCP provides session context. + +#### 5c. OAuth 2.1 Authentication +Per-tool scopes. 
Read-only scope for `list_*`/`get_*`, write scope for mutations, admin scope for destructive ops. + +--- + +## 5. Priority Matrix + +| Change | Effort | Impact | Dependencies | Phase | +|--------|--------|--------|-------------|-------| +| Tool annotations (142 tools) | Low (1 day) | High | None | 1 | +| `isError` flag middleware | Low (2 hrs) | Medium | None | 1 | +| Error recovery suggestions | Low (4 hrs) | Medium | None | 1 | +| Dynamic model resource | Medium (2 days) | Medium | None | 2 | +| MCP protocol logging | Low (1 day) | Low | None | 2 | +| Sim progress notifications | Medium (2 days) | Medium | None | 2 | +| MCP Tasks for simulation | Medium (5 days) | High | FastMCP Tasks support | 3 | +| Protocol-level cancellation | Low (4 hrs) | Low | Phase 3a | 3 | +| Progressive tool discovery | High (10 days) | Very High | Spec finalization | 4 | +| Structured output schemas | Medium (3 days) | Medium | None | 4 | +| Streamable HTTP transport | Medium (3 days) | High (for remote) | None | 5 | +| Session isolation | High (7 days) | High (for remote) | Phase 5a | 5 | +| OAuth 2.1 | High (5 days) | High (for remote) | Phase 5a | 5 | + +--- + +## 6. Lessons From Peers + +### EnergyPlus-MCP (LBNL) +- Stateless file-based design (IDF path per call) vs our stateful in-memory model +- Pro: scales horizontally, survives restarts. 
Con: slower (disk I/O per call), no in-memory object graph +- Published in SoftwareX journal — our approach is more powerful but less documented academically + +### BEM-AI (PNNL) +- Multi-agent A2A architecture wrapping MCP servers (including openstudio-mcp) +- Uses small language models (Qwen3:4B) with context engineering +- Blackboard pattern for shared state across agents +- Validates that our tool API surface works as a composable building block + +### Fusion 360 MCP +- Only project using all 3 MCP primitives (tools + resources + prompts) +- Tiny tool count (3) but demonstrates resources for exposing design state + +### STK-MCP (Ansys) +- 3 tools + 5 resources — resources carry the query workload +- Resources for object listing, health, access analysis — what we do with tools + +### mcp.science (Path Integral Institute) +- Federated approach: 12 small single-purpose servers +- Opposite of our monolith. Simpler per-server, harder to orchestrate. +- MCP Gateway pattern would unify multiple servers behind one endpoint + +### OpenFOAM MCP +- User expertise tracking ("context engineering system") +- Adjusts explanation depth based on detected user knowledge +- Interesting for our LLM-facing tool descriptions + +--- + +## 7. Industry Trends (2026) + +1. **Tool annotations becoming standard** — clients auto-approve read-only, prompt for destructive +2. **Progressive discovery for large toolsets** — token cost is the bottleneck, not tool count +3. **Tasks primitive maturing** — async is the future for simulation/build/deploy workflows +4. **Streamable HTTP replacing stdio** for production — stateless horizontal scaling +5. **MCP Gateway pattern emerging** — aggregate multiple servers behind single endpoint +6. **A2A + MCP layering** — MCP for tools, A2A for agent-to-agent coordination +7. **Spec governance moving to Linux Foundation AAIF** — enterprise features coming (audit, SSO) +8. 
**97M monthly SDK downloads** — MCP is the de facto standard for AI-tool integration + +--- + +## 8. Unresolved Questions + +- FastMCP `annotations=ToolAnnotations(...)` support — which version added it? Need `fastmcp>=?` +- MCP Tasks: FastMCP support status? Experimental spec feature, SDK coverage unclear +- Claude Desktop / Claude Code: which annotations actually change UX behavior today? +- Progress notification rendering: does Claude Desktop show progress bars? +- Streamable HTTP in FastMCP: production-ready or experimental? +- `outputSchema` / `structuredContent`: any client actually validates/uses these today? +- Progressive discovery: does FastMCP support dynamic tool registration/unregistration? +- `isError` flag: can FastMCP middleware set this, or does it require patching the SDK? +- How does BEM-AI's A2A wrapper invoke our tools — direct stdio or via MCP client SDK? + +--- + +## Sources + +### Official MCP +- [MCP Spec 2025-11-25](https://modelcontextprotocol.io/specification/2025-11-25) +- [2026 MCP Roadmap](https://blog.modelcontextprotocol.io/posts/2026-mcp-roadmap/) +- [Tool Annotations Blog](https://blog.modelcontextprotocol.io/posts/2026-03-16-tool-annotations/) +- [MCP Security Best Practices](https://modelcontextprotocol.io/specification/draft/basic/security_best_practices) +- [MCP Transports](https://modelcontextprotocol.io/specification/2025-03-26/basic/transports) + +### Industry Research +- [Speakeasy: 100x Token Reduction with Dynamic Toolsets](https://www.speakeasy.com/blog/100x-token-reduction-dynamic-toolsets) +- [Progressive Tool Discovery Pattern](https://agentic-patterns.com/patterns/progressive-tool-discovery/) +- [Anthropic: Code Execution with MCP](https://www.anthropic.com/engineering/code-execution-with-mcp) +- [Stop Vibe-Testing Your MCP Servers (FastMCP creator)](https://www.jlowin.dev/blog/stop-vibe-testing-mcp-servers) +- [CoSAI: Practical Guide to MCP 
Security](https://www.coalitionforsecureai.org/securing-the-ai-agent-revolution-a-practical-guide-to-mcp-security/) + +### Peer Projects +- [EnergyPlus-MCP (LBNL)](https://github.com/LBNL-ETA/EnergyPlus-MCP) — [Paper](https://www.sciencedirect.com/science/article/pii/S2352711025003334) +- [BEM-AI (PNNL)](https://github.com/pnnl/BEM-AI) — [Paper](https://www.sciencedirect.com/science/article/abs/pii/S0378778825314422) +- [STK-MCP (Ansys)](https://github.com/alti3/stk-mcp) +- [Fusion 360 MCP](https://github.com/Joe-Spencer/fusion-mcp-server) +- [MATLAB MCP Core Server](https://github.com/matlab/matlab-mcp-core-server) +- [Jupyter MCP Server](https://github.com/datalayer/jupyter-mcp-server) +- [mcp.science](https://github.com/pathintegral-institute/mcp.science) +- [MCP Hierarchical Tool Management Discussion](https://github.com/orgs/modelcontextprotocol/discussions/532) diff --git a/docs/knowledge/reddit-mcp-discovery-thread.md b/docs/knowledge/reddit-mcp-discovery-thread.md new file mode 100644 index 0000000..c82d3d5 --- /dev/null +++ b/docs/knowledge/reddit-mcp-discovery-thread.md @@ -0,0 +1,188 @@ +# Research: MCP Tool Discovery at Scale + +Source thread: https://www.reddit.com/r/mcp/comments/1r0egn7/how_do_you_handle_discovery_when_you_have_dozens/ +Fetched: 2026-04-05 | Score: 8 (91% upvote) | 24 comments | r/mcp (103K subscribers) + +--- + +## Original Post (u/Sea-Perception1619) + +> As MCP adoption grows, I keep running into the same question: how does a client find the right server when there are many of them? +> +> Right now it seems like most setups hardcode server connections in the client config. That works with 3-5 servers but what happens when you have 30? Or when servers are maintained by different teams? Or when you want an agent to dynamically discover which MCP server has the tool it needs? +> +> How are you all handling this? Is anyone building a discovery layer on top of MCP, or is the expectation that clients just know their servers upfront? 
+ +--- + +## All Comments (verbatim, organized by thread) + +### 1. u/owlpellet (score: 2) +> ["Tool Search Tool"](https://www.anthropic.com/engineering/advanced-tool-use) pattern, or [dynamic tool discovery](https://spring.io/blog/2025/12/11/spring-ai-tool-search-tools-tzolov), reduces token bloat and improves outcomes by using user-scenario clues to choose which tools to expose to an LLM. + +### 2. u/ParamedicAble225 (score: 3) +> The same way you handle one mcp server that has 100s of tools: MODES! And depending on the mode, the AI system instructions, available tools, and goals change. Then have an orchestrator LLM that commands all of the MODED AI's around and uses them as needed. Modularity. + +### 3. u/Loose_Rip359 (score: 3) +> Claude Code handles this with a deferred tool pattern -- tools aren't loaded into context until the agent runs a semantic search against a tool registry. Keeps token usage low and avoids overwhelming the model with 100+ tool definitions upfront. Works well in practice once you have good tool descriptions. The key insight is treating discovery as a tool itself. + +### 4. u/Raplaplaf (score: 1) -- Registry + Trust Layer + +> The issue is real, I started working on a registry after asking myself the same question and did some research beforehand: +> - registry.modelcontextprotocol.io -- pretty raw (no KYC, no quality assessment, no privacy/security management) +> - Kong MCP Registry -- very enterprise oriented and proprietary +> - Google Cloud API Registry -- well, it's Google +> +> What I found missing across all of them is a trust layer -- not just "which servers exist" but "which ones can I actually trust with my data and which one is the best choice (quality and token wise) for a given task (or subtask)." So I've been combining the registry work with a data handling spec (ADHP) that lets servers declare their privacy practices. 
+> +> - registry: https://github.com/StevenJohnson998/agent-registry +> - adhp: https://github.com/StevenJohnson998/agent-data-handling-policy + +**Reply chain:** + +- **u/Sea-Perception1619 (OP):** Trust gap is the core issue. Static registries solve "what exists" but not "what should I trust" or "what's best for this specific task." Asks: once trust requirements pass, how route to the *best* server dynamically based on performance, load, and capability match? + +- **u/Raplaplaf:** Long-term vision is dedicated agents that learn to direct swarms of LLM/Agents, using all those bricks autonomously to achieve the best result for minimal cost within acceptable security/privacy. + +- **u/Sea-Perception1619 (OP):** Claims to be building exactly that -- routing protocol with independent scoring functions at each node, adaptive parallel search when confidence is low. Working in simulation at 500 nodes, 97% discovery availability, sub-200ms latency. Says ADHP could be the policy filter layer, manifest schema the capability description format. + +- **u/Raplaplaf:** "Let's make sci-fi a reality. :)" + +### 5. u/GentoroAI (score: 1) -- Gateway Pattern + +> Hardcoding breaks fast. The pattern I'm seeing is a registry/gateway: clients connect to one MCP endpoint, and the gateway owns the server list, auth, health checks, versioning, and a searchable tool catalog. If you want dynamic discovery, do it there (semantic routing over tool metadata), not in every client. +> +> OneMCP: https://github.com/Gentoro-OneMCP/onemcp + +**Reply chain:** + +- **u/Sea-Perception1619 (OP):** Gateway works when one team owns the stack. What about cross-org? Company A's procurement agent discovers Company B's invoicing agent, neither wants to register in the other's gateway. Who runs the shared gateway? + +- **u/owlpellet:** "I believe Agent2Agent is intended to address the public listing case." 
+ +- **u/GentoroAI:** Proposes federation -- each company runs its own gateway/registry, publishes signed "service descriptors" into a neutral directory (DNS-style). Discovery via directory, traffic/auth stays end-to-end (mTLS/OIDC, partner-scoped creds, allowlisted egress). + +### 6. u/BC_MARO (score: 1) -- 20+ Server Operator + +> Running 20+ MCP servers right now and the config management alone is painful. What worked for me was grouping servers by domain (data, code, infra) and having a thin proxy that exposes a unified tool list. The proxy handles health checks and failover so the client just sees one endpoint. +> +> The registry problem is real though. Right now there's no standard way for a client to ask "who can do X?" at runtime. Closest thing I've seen is tool-level semantic search over descriptions, but that falls apart when servers have overlapping capabilities. + +**Reply chain:** + +- **u/Sea-Perception1619 (OP):** Overlapping capabilities is the interesting problem. Semantic search gives ranked list, but when 3 servers score similarly, how do you pick? Describes routing approach: independent scoring functions evaluate candidates on axes (past success rate, load, novelty, reliability). When they agree -> top pick. When they disagree -> parallel-query multiple candidates, let results compete. Disagreement = signal for more exploration. + +- **u/BC_MARO:** Currently first-healthy + manual pinning. Likes disagreement-as-signal. Asks: how to measure "quality" automatically? Structured outputs are straightforward (schema validation), but freeform is fuzzy. + +- **u/Sea-Perception1619 (OP):** Quality measurement approach: let the *caller* decide. After discovery+invocation, caller reports success/failure. Over time that feedback shifts routing. Not evaluating output quality directly -- tracking *outcome quality* from caller perspective. For freeform, caller-reported outcomes "get you surprisingly far if you have enough query volume." 
Building an SDK around this pattern. + +- **u/BC_MARO:** "Yeah I'd be down to try it. The caller-reported feedback loop is practical since you skip the LLM-as-judge overhead entirely." + +### 7. u/beycom99 (score: 1) -- OneTool + +> Give OneTool a try. It is my solution to this problem. +> - https://onetool.beycom.online/ +> - https://onetool.beycom.online/about/about-onetool/ + +### 8. u/xrxie (score: 1) -- ToolIQ Gateway + +> The MCP gateway we use has a clever tool discovery service. We can still connect to individual MCP servers, but have option of configuring agents to point to a single MCP server that sits in front of a group of MCP servers with tools for searching, describing, and executing the tools. This alone trims down the context window considerably. Combined with custom MD files context can be even sharper. +> +> https://barndoor.ai/introducing-tooliq-mcp-tool-optimization/ + +### 9. u/dinkinflika0 (score: 1) -- Bifrost Gateway + +> We solve this in Bifrost -- gateway acts as discovery layer. Connect all MCP servers once, clients talk to gateway. It routes tool calls to the right server automatically. Also lets you filter which tools are available per agent using virtual keys. +> +> Docs: https://getmax.im/bifrostdocs + +### 10. u/makinggrace (score: 1) -- Pragmatic Multi-Layer Approach + +> Don't duplicate coverage of capabilities. Prune so you have the best tool for a specific task. +> +> Right now using a single gateway (fastmcp) and the profiles feature released in the 3.0 beta per client but I may try to change that up to per agent type. +> +> Usually I build MCP usage into skills and call the skill. This works the best for coding. +> +> More generally agents get list_tools to choose from the most commonly used tools in the client's profile. It also returns something like "use more_tools for more tools." (This prompt was hell to get right and I still am annoyed that I can't make it work in one call.) +> +> more_tools calls the toolmaster. 
That's literally a llm call to google genai who matches the request to a markdown file of every other mcp I have available with keywords and use cases. (Having a frontier model write this and not me made it work flawlessly.) +> +> In my own clients that hot swaps MCPs, the toolmaster also enables and disables MCP availability when it recommends a tool. Failure to do that in any commercial client thus far sadly. +> +> Tl;dr consider using a tiny llm call to manage the mcps that are infrequently used. + +--- + +## Approaches/Solutions Summary + +| Approach | Who | How it works | +|----------|-----|-------------| +| **Deferred/Tool Search** | Claude Code, Anthropic | Tools not loaded until agent semantic-searches a registry. 85% context reduction. | +| **Modes + Orchestrator** | u/ParamedicAble225 | Define modes with different tool subsets; orchestrator LLM selects mode per task. | +| **Gateway/Proxy** | u/GentoroAI (OneMCP), u/dinkinflika0 (Bifrost), u/xrxie (ToolIQ), u/BC_MARO | Single endpoint fronts all servers; gateway owns routing, health, auth, catalog. | +| **Registry + Trust Layer** | u/Raplaplaf | Registry with ADHP (Agent Data Handling Policy) for servers to declare privacy practices. | +| **Federation** | u/GentoroAI | Cross-org: each company runs own gateway, publishes signed service descriptors to neutral DNS-style directory. | +| **Two-tier discovery** | u/makinggrace | Common tools in initial list_tools; "more_tools" triggers LLM call to match request against full catalog markdown. Hot-swaps MCP availability. | +| **Capability routing + feedback** | u/Sea-Perception1619 (OP) | Independent scoring functions evaluate candidates; disagreement triggers parallel query; caller-reported outcomes improve routing over time. | +| **Semantic vector retrieval** | arxiv:2603.20313 | Dense embeddings index tools; retrieve top 3-5 per query. 99.6% token reduction, 97.1% hit@3, sub-100ms. 
| +| **Prune + deduplicate** | u/makinggrace | Don't duplicate capabilities across servers. Best tool for each task, period. | + +--- + +## Tools, Libraries, and Projects Mentioned + +| Name | URL | Description | +|------|-----|-------------| +| **Anthropic Tool Search** | https://www.anthropic.com/engineering/advanced-tool-use | Deferred tool loading + semantic search in Claude Code | +| **Spring AI Tool Search** | https://spring.io/blog/2025/12/11/spring-ai-tool-search-tools-tzolov | Dynamic tool discovery for Spring AI | +| **Agent Registry** | https://github.com/StevenJohnson998/agent-registry | MCP server registry with trust layer | +| **ADHP** | https://github.com/StevenJohnson998/agent-data-handling-policy | Agent Data Handling Policy spec | +| **OneMCP** | https://github.com/Gentoro-OneMCP/onemcp | Single runtime boundary + dynamic tool selection | +| **OneTool** | https://onetool.beycom.online/ | Tool aggregation/discovery solution | +| **ToolIQ (Barndoor)** | https://barndoor.ai/introducing-tooliq-mcp-tool-optimization/ | MCP gateway with tool discovery service | +| **Bifrost** | https://getmax.im/bifrostdocs | MCP gateway with virtual key filtering per agent | +| **FastMCP** | (profiles feature in 3.0 beta) | Gateway with per-client profiles | +| **Agent2Agent** | (Google, mentioned by u/owlpellet) | Cross-org agent discovery protocol | +| **MCP Hierarchical Mgmt** | https://github.com/orgs/modelcontextprotocol/discussions/532 | Proposal: categories, lazy loading, dynamic registration | +| **Semantic Tool Discovery** | https://arxiv.org/abs/2603.20313 | Academic paper: vector-based MCP tool selection | +| **RAG-MCP** | https://writer.com/engineering/rag-mcp/ | Writer.com: semantic retrieval for tool selection | +| **MCPX (Lunar)** | https://www.lunar.dev/post/why-dynamic-tool-discovery-solves-the-context-management-problem | Tool Groups + policy gating + auto-refresh | +| **Cloudflare Code Mode** | (mentioned in agentpmt.com) | Compresses 2500+ endpoints 
into 2 tools (~1K tokens) | +| **ToolHive MCP Optimizer** | (Stacklok, mentioned in agentpmt.com) | Dynamic toolset optimization | +| **Speakeasy** | (mentioned in agentpmt.com) | Up to 160x token reduction, 100% success 40-400 tools | + +--- + +## Key Numbers from Broader Research + +| Metric | Value | Source | +|--------|-------|--------| +| Token cost per tool definition | ~400-500 tokens | MCP Discussion #532 | +| 50 tools upfront context cost | ~20-25K tokens | MCP Discussion #532 | +| 5-server setup (GitHub+Slack+Sentry+Grafana+Splunk) | ~55K tokens | agentpmt.com | +| GitHub MCP server alone | ~46K tokens (91 tools) | atcyrus.com | +| Tool Search context reduction | 85% (77K -> 8.7K) | Anthropic | +| Tool Search accuracy improvement | Opus 4: 49%->74%, Opus 4.5: 79.5%->88.1% | Anthropic | +| Semantic vector retrieval hit rate | 97.1% at K=3, 0.91 MRR | arxiv:2603.20313 | +| Semantic vector token reduction | 99.6% | arxiv:2603.20313 | +| Selection accuracy degradation threshold | >30-50 tools visible | Multiple sources | +| Auto-activation threshold (Claude Code) | >10K tokens in tool descriptions | Anthropic | +| Cloudflare compression | 2500+ endpoints -> 2 tools (~1K tokens) | agentpmt.com | +| Speakeasy reduction | up to 160x | agentpmt.com | + +--- + +## Relevance to openstudio-mcp (142 tools) + +Our server has 142 tools -- well past the 30-50 tool accuracy degradation threshold. At ~400 tokens/tool, that is ~57K tokens of tool definitions. Key takeaways: + +1. **Claude Code's deferred loading already helps us** -- our tools are auto-deferred when >10K token threshold is hit. The question is whether our tool *descriptions* are good enough for semantic search to find the right tool. + +2. **Two-tier discovery (u/makinggrace) maps to our skills system** -- `list_skills()` and `get_skill()` are the "common tools" tier; the full 142 tools are the "more_tools" tier. + +3. 
**Pruning overlapping capabilities matters** -- we should audit for tools that overlap (e.g., `set_weather_file` vs `change_building_location`) and either consolidate or make descriptions disambiguate clearly. + +4. **Modes/profiles could help** -- grouping tools by workflow phase (geometry, HVAC, simulation, results) so the agent context only loads the relevant subset. + +5. **Tool naming is critical for search** -- names like `github_create_issue` beat `create`. Our `_tool` suffix convention + MCP-visible names should be keyword-rich and searchable. diff --git a/docs/knowledge/research-aps-agent-paper.md b/docs/knowledge/research-aps-agent-paper.md new file mode 100644 index 0000000..2d924f6 --- /dev/null +++ b/docs/knowledge/research-aps-agent-paper.md @@ -0,0 +1,89 @@ +# APS-Agent Paper Analysis + +**Paper:** "LLM Agent for User-Friendly Chemical Process Simulations" (Liang, Groll, Sin — DTU, arxiv 2601.11650v2, Feb 2026) + +**Repo:** https://github.com/gsi-lab/APS-Agent (MIT, compiled .pyd core — not readable source) + +## What It Is + +MCP server wrapping AVEVA Process Simulation (APS) — chemical process simulator. Claude Desktop as client. **15 tools** for flowsheet analysis, synthesis, optimization via natural language. FastMCP, supports stdio/SSE/streamable HTTP. 
+ +## Toolset (15 tools) + +| Tool | Purpose | +|------|---------| +| aps_connect | Connect to APS | +| sim_open/create/save | Session management | +| sim_status | Convergence/specification check | +| models_list | All models on flowsheet | +| connectors_list | All connections | +| model_all_vars | All variables for a model (thousands) | +| model_all_params | All parameters for a model | +| var_get/set_multiple | Batch variable read/write | +| param_set_multiple | Batch parameter write | +| model_add | Add equipment to flowsheet | +| models_connect | Wire two model ports | +| fluid_create | Create fluid with components + thermo | +| fluid_to_source | Assign fluid to source model | + +All return `success: bool` + structured context — same pattern as our `ok: True/False`. + +## Key Findings + +### Case Study 1: Analysis (read existing flowsheet) +- Agent extracts data from thousands of variables, interprets thermo relationships, presents clearly +- Minor errors: oversimplification of complex interactions, calculation mistakes +- 6 tool calls, single interaction round + +### Case Study 2: Synthesis (build flowsheet from scratch) +- **Step-by-step dialogue**: reliable but requires domain expertise to prompt correctly +- **Single prompt**: 23 tool calls, 3 rounds. Less consistent — tried to set 4 nonexistent variables, redundant queries, premature parameter adjustments +- Step-by-step better for education; single-prompt better for experienced users doing rapid prototyping + +### Future Architecture (Fig. 4) +Multi-agent + RAG: +- Orchestrator agent dispatches to specialized sub-agents (synthesis, analysis, optimization) +- RAG knowledge base grounds agent in simulator-specific knowledge +- Dynamic context filtering to reduce information overload + +## Why They Propose RAG + +**Not about context window limits** — they never mention token counts. The problem is: + +1. **Information overload** — `model_all_vars` returns thousands of variables per model. 
Complex flowsheets overwhelm the agent's ability to pick what matters +2. **Domain knowledge gaps** — LLM hallucinates variable names, tries to set nonexistent params, doesn't know APS-specific operational modes +3. **Variable selection errors** — agent doesn't know which variables are settable vs computed, leading to failed tool calls + +RAG would inject: valid variable paths, parameter constraints, best practices, operational mode knowledge. + +## Comparison to openstudio-mcp + +| Aspect | APS-Agent | openstudio-mcp | +|--------|-----------|----------------| +| Tools | 15 | 142 | +| Tool granularity | Coarse (dump all vars) | Fine (targeted getters) | +| Response pattern | `success: bool` | `ok: bool` | +| Context management | None (future: RAG) | Skills, ToolSearch, targeted tools | +| Testing | 2 qualitative case studies | 167 automated LLM tests (95.8%) | +| Multi-agent | Proposed future | Not yet | +| Transport | stdio/SSE/streamable HTTP | stdio | +| LLM | Claude Sonnet 4 | Claude Sonnet (configurable) | + +## Lessons for Us + +### Already ahead on +- **Tool discovery**: our ToolSearch + skills = their proposed "dynamic context filtering" + RAG +- **Targeted tool design**: `inspect_component` > `model_all_vars` dump. We avoid their information overload problem by design +- **Testing rigor**: 167 automated tests with failure mode analysis vs 2 qualitative case studies +- **Error handling**: our tools validate inputs, return structured errors. Their agent tries nonexistent variables + +### Worth adopting +- **Multi-agent for scale**: as we add tools, orchestrator + specialized sub-agents could replace ToolSearch. Their Fig. 
4 architecture aligns with our remote MCP plan +- **Streamable HTTP transport**: they already support it, we have it planned +- **Batch operations**: their `var_get/set_multiple` pattern — we could add bulk property get/set for efficiency (fewer round-trips) + +### Validates our approach +- Step-by-step > single-prompt for complex tasks — matches our skills system encoding expert workflows +- Expert oversight still essential — supports our guardrails work +- `success/ok` + structured errors is the right response pattern +- Deterministic simulator as verification layer — EnergyPlus serves same role for us diff --git a/docs/knowledge/tool-discovery-and-llm-testing.md b/docs/knowledge/tool-discovery-and-llm-testing.md new file mode 100644 index 0000000..5105b66 --- /dev/null +++ b/docs/knowledge/tool-discovery-and-llm-testing.md @@ -0,0 +1,320 @@ +# Tool Discovery and LLM Testing at Scale + +## Overview + +This document consolidates research and findings on scaling MCP tool discovery for openstudio-mcp (142 tools, 22 skills). It covers the project timeline from 62 to 142 tools, an industry survey of 7 approaches to large tool sets, our hands-on ToolSearch implementation, a three-model benchmark (Sonnet/Haiku/Opus, 230 tests, zero retries), and distilled lessons. Primary conclusion: dynamic tool discovery via ToolSearch is sufficient at 142 tools; sub-agent routing is not justified. 
+ +## Timeline + +### Tool Count and Pass Rate Evolution + +| Date | Event | Tools | LLM Pass Rate | Key Change | +|------|-------|-------|---------------|------------| +| Feb 18 | Initial commit | 62 | -- | -- | +| Mar 2 | Input hardening + HVAC auto-wiring | 126 | -- | +64 tools | +| Mar 4 | Description compression (~30%) | 127 | -- | 100K -> 60K chars schema | +| Mar 5 | First LLM test suite | 127 | 44% (50 tests) | Baseline, no system prompt | +| Mar 6 | Server instructions (NEVER/ALWAYS) | 127 | 83% (90 tests) | +39pp from instructions alone | +| Mar 7 | Description fixes | 127 | 91% (90 tests) | +8pp | +| Mar 10 | Generic access tools | 130 | 96% (107 tests) | Phase C | +| Mar 12 | Remove 6 redundant typed list tools | 136 | 97.5% (159 tests) | Progressive L1/L2/L3 framework | +| Mar 19 | Tags + recommend_tools + ToolSearch | 142 | 96.5% (172 tests) | No regression from routing work | +| Mar 20 | Full regression with ToolSearch | 142 | 95.9% (171 tests) | Final pre-benchmark run | +| Mar 28 | Three-model sweep (0 retries) | 142 | 94.4% Sonnet / 88.9% Haiku / 94.4% Opus | 180 non-skipped tests | + +### Schema Size Over Time + +| Date | Tools | Schema Chars | Est. Tokens | +|------|-------|-------------|-------------| +| Feb 18 | 62 | ~30K | ~7.5K | +| Mar 2 | 126 | ~100K | ~25K | +| Mar 4 (post-compress) | 127 | ~60K | ~15K | +| Mar 19 | 142 | ~61K | ~15K | + +## Industry Patterns + +Ranked by evidence strength. Core finding: don't collapse N tools into 1 meta-tool -- LLMs are just as bad at selecting parameter values as they are at selecting tools. Every winning approach keeps tools distinct but **filters to 5-15 per turn**. 
+ +### Accuracy vs Tool Count (Empirical) + +| Tools Presented | Accuracy | Source | +|----------------|----------|--------| +| 5-7 | ~92% | Jenova.ai | +| 10-15 | sweet spot | Multiple | +| 30+ w/retrieval | >90% | RAG-MCP | +| 51 | 2-26% (flat) | Allen Chan / IBM | +| 100+ | 13.6% (flat) | RAG-MCP | +| 100+ w/semantic retrieval | 43% | RAG-MCP | +| 2,792 w/hybrid search | 94% | Stacklok ToolHive | +| 10K w/Anthropic Tool Search | 74-88% | Anthropic internal | + +### 1. Deferred Loading + Search (Production-Proven) + +Mark tools `defer_loading: true`. LLM sees only a search tool + pinned essentials. Full schemas load on-demand. + +| Implementation | Mechanism | Results | +|---|---|---| +| Anthropic Tool Search | BM25/regex on name+description | Opus 4: 49%->74%, 85% token reduction, 10K tool cap | +| OpenAI defer_loading | Same pattern, gpt-5.4+ | Recommends <20 tools/turn | +| Claude Code ToolSearch | Auto at 10% context threshold | 3-5 tools returned per query | +| Stacklok ToolHive | Hybrid semantic+BM25 | 94% on 2,792 tools (vs BM25-only: 34%) | + +### 2. Description Enrichment (Highest ROI, Lowest Risk) + +Descriptions are the **only** field ToolSearch/clients match against. Tags are inert (FastMCP server-side only, never sent on wire). Best practices: write descriptions like onboarding a new team member; include domain keywords matching how users describe tasks; namespace tool names by service/resource. Note: 97.1% of MCP tool descriptions have at least one "smell" (arxiv:2602.14878). Augmenting descriptions: +5.85pp success but +67% execution steps. + +### 3. Server Split (Universal Cross-Client Fix) + +Every client with hard caps forces this. 
+ +| Client | Limit | Discovery | +|--------|-------|-----------| +| Claude Code | Unlimited (ToolSearch) | Auto-defer at 10% context | +| Claude Desktop | ~100 | None (all in context) | +| Cursor | 40 hard cap | None | +| Windsurf | 100 | Per-tool toggle | +| OpenAI | 128 (recommends ~10) | defer_loading | +| Gemini CLI | 100 soft / 512 API | includeTools/excludeTools | +| TRAE | 40 | None | +| GitHub Copilot | 128 | None | + +GitHub MCP Server approach: starts with 4 core tools, user enables toolsets via `--dynamic-toolsets`. Cut 23K tokens (50%). + +### 4. Embedding-Based Retrieval (Best for 300+ Tools) + +Key insight (Red Hat Tool2Vec): embed **example queries per tool**, not descriptions. Query embeddings discriminate better. Implementations: LangGraph BigTool, tool-gating-mcp (MiniLM-L6-v2), RAG-MCP (Qwen LLM retriever), Portkey mcp-tool-filter, openclaw-mcp-router (LanceDB). + +### 5. Hierarchical Selection (~10% Gain) + +Pick category first, then tool. ToolTree (ICLR 2026): MCTS + bidirectional pruning, ~10% over SOTA. ToolLLM/DFSDT: 16,464 APIs / 49 domains. MCP-Zero: agent-pull model, 98% token reduction, 3K tools / 308 servers. + +### 6. Code Execution Pattern (Nuclear Option) + +Agent writes code against tools-as-API. Cloudflare Code Mode: 2,500 endpoints -> 2 tools, 99.9% token reduction. Anthropic programmatic tool calling: 150K->2K tokens. High implementation cost (needs sandbox). + +### 7. Meta-Tool / Composite Tools (Modest Gains) + +AWO meta-tools: 5-12% fewer LLM calls, +4.2pp success. Works for fixed workflows only. Does NOT solve general tool discovery. Our own evidence: `list_spaces` (typed) passes L1; `list_model_objects("Space")` (generic) fails. Typed > generic. + +### MCP Spec Status + +Tools are a flat list: `name`, `title`, `description`, `inputSchema`, `outputSchema`, `annotations`. No categories, tags, filtering, or namespaces. 
Key proposals: SEP-1300 groups+tags (rejected), #1978 Lazy Hydration (`tools/list?minimal=true`), SEP-1576 JSON `$ref` (~24% token reduction). `notifications/tools/list_changed` is in spec but NOT supported by Claude Desktop or Claude Code. + +## Our Implementation + +### What We Built + +1. **Tags on all 142 tools** -- `tags={"core"}`, `tags={"hvac"}`, etc. via FastMCP +2. **`recommend_tools` meta-tool** -- keyword routing to 9 groups +3. **Enriched descriptions** for `search_api` and `search_wiring_patterns` +4. **Docstring hardening** for bypass-prone tools + +### Tags Are Inert + +Tags are a FastMCP server-side feature, NOT part of the MCP wire protocol. Never sent in `tools/list` responses. No client reads or acts on them. ToolSearch does not use them. Only use: server-side `mcp.disable(tags=...)` / `mcp.enable()` -- which requires `tools/list_changed` support (unavailable in Claude Desktop/Code). Tags kept for future-proofing only. + +### ToolSearch Root Cause: Docker Build-Time Indexing + +New tools added via volume-mounted code were invisible to ToolSearch. Root cause: ToolSearch indexes tool schemas when the MCP server first connects from the installed package in the Docker image. Volume-mounted code registers tools at runtime but the index is stale. + +**Before Docker rebuild:** + +| ToolSearch Query | Found? | What it found instead | +|-----------------|--------|----------------------| +| "search_api" | NO | "No matching deferred tools found" | +| "SDK classes methods" | NO | LSP, create_measure, get_object_fields | +| "HVAC wiring recipe" | NO | list_zone_hvac_equipment, get_zone_hvac_details | + +**After Docker rebuild + enriched descriptions:** + +| Query | Found? 
| Position | +|-------|--------|----------| +| "search_api" | YES | 1st | +| "SDK methods" | YES | 1st | +| "wiring patterns" | YES | 1st | +| "four pipe beam wiring" | YES | 1st | +| "recommend tools" | YES | 1st | + +**Rule: Always rebuild Docker image after adding new MCP tools.** CI does this automatically. + +### Description Compression Was Counterproductive + +Mar 4: compressed all 127 tool descriptions ~30% (100K -> 60K chars) to reduce context. But Claude Code ToolSearch had shipped Jan 14, 2026 (7 weeks earlier), auto-deferring tools when schemas exceed 10% of context. ToolSearch matches on keywords in descriptions. By compressing, we removed keywords ToolSearch uses to match -- optimized for a problem already solved while creating a new one. + +## Model Comparison + +### Test Structure + +| Tier | Tests | What It Measures | +|------|-------|-----------------| +| setup | 6 | Baseline model creation, simulation setup | +| tier1 | 4 | Single tool selection | +| tier2 | 37 | Multi-step workflows (2-28 tool chains) | +| tier3 | 26 | Natural language eval prompts | +| tier4 | 3 | Guardrails (must use MCP, not scripts) | +| progressive | 104 | L1 vague / L2 moderate / L3 explicit (35 cases x 3 levels) | + +Progressive levels: L1 = "Add HVAC to the building" (vague). L2 = "Add a VAV reheat system to all thermal zones" (moderate). L3 = "Add System 7 VAV reheat using add_baseline_system" (explicit tool name). 
+ +### Overall Results (Zero Retries) + +| Metric | Sonnet | Haiku | Opus | +|--------|--------|-------|------| +| Total pass rate | 170/180 (94.4%) | 160/180 (88.9%) | 170/180 (94.4%) | +| Progressive pass rate | 103/104 (99.0%) | 97/104 (93.3%) | 104/104 (100%) | +| L1 pass rate (vague) | 34/35 (97%) | 32/35 (91%) | 35/35 (100%) | +| L2 pass rate (moderate) | 35/35 (100%) | 34/35 (97%) | 35/35 (100%) | +| L3 pass rate (explicit) | 34/34 (100%) | 31/34 (91%) | 34/34 (100%) | +| Total runtime | 2h38m | 1h20m | 3h05m | +| Avg turns/test | 6.8 | 7.4 | 7.0 | +| Avg ToolSearch calls/test | 1.9 | 0.0 | 2.0 | +| Timeouts | 1 | 0 | 2 | +| Cost (notional) | $18.96 | $11.21 | $32.23 | + +### Per-Tier Breakdown + +| Tier | Sonnet | Haiku | Opus | +|------|--------|-------|------| +| setup | 6/6 (100%) | 6/6 (100%) | 6/6 (100%) | +| tier1 | 4/4 (100%) | 4/4 (100%) | 4/4 (100%) | +| tier2 | 33/37 (89.2%) | 31/37 (83.8%) | 34/37 (91.9%) | +| tier3 | 21/26 (80.8%) | 19/26 (73.1%) | 19/26 (73.1%) | +| tier4 | 3/3 (100%) | 3/3 (100%) | 3/3 (100%) | +| progressive | 103/104 (99.0%) | 97/104 (93.3%) | 104/104 (100%) | + +Tier 3 weakest across all models (73-81%) -- complex eval/workflow tests with natural domain language. Shared failures suggest test expectations or tool descriptions need refinement, not a model gap. + +### Progressive L1/L2/L3 Detail (Failures Only) + +| Case | Son L1 | Son L2 | Son L3 | Hai L1 | Hai L2 | Hai L3 | Opus | +|------|--------|--------|--------|--------|--------|--------|------| +| create_building | P | P | P | P | **F** | P | all P | +| create_loads | P | P | P | P | P | **F** | all P | +| hvac_sizing | P | P | P | **F** | P | P | all P | +| import_floorplan | P | P | P | **F** | P | **F** | all P | +| replace_windows | P | P | P | P | P | **F** | all P | +| thermal_zones | **F** | P | P | **F** | P | P | all P | + +Opus: 100% across all 35 cases at all levels. 
Haiku L3 failures (import_floorplan, replace_windows, create_loads) are reasoning failures -- even when the prompt names the exact tool, Haiku cannot execute the task correctly.
+ +**Corrected pass rates** (fixing 3 structural test issues): + +| Model | Current | Adjusted | +|-------|---------|----------| +| Sonnet | 94.4% | 97.2% | +| Haiku | 88.9% | 91.1% | +| Opus | 94.4% | 98.3% | + +### Architecture Decision: Dynamic Discovery vs Sub-Agent Routing + +| Signal | Dynamic OK | Need Sub-Agents | Sonnet | Haiku | Opus | Verdict | +|--------|-----------|-----------------|--------|-------|------|---------| +| L1 pass rate | > 85% | < 70% | 97% | 91% | 100% | OK | +| L2 pass rate | > 90% | < 75% | 100% | 97% | 100% | OK | +| Avg ToolSearch calls | <= 2 | > 4 | 1.9 | 0.0 | 2.0 | OK | +| wrong_tool rate | < 10% | > 25% | 5.0% | 8.9% | 4.4% | OK | + +**Every signal falls in "Dynamic Discovery OK" range.** Sub-agent routing not justified. + +### Comparison with BEM-AI (PNNL) + +| Dimension | BEM-AI | openstudio-mcp | +|-----------|--------|----------------| +| Architecture | Multi-agent (planner + specialists) | Single agent, dynamic discovery | +| Tools | 6 | 142 | +| Models | 4B-70B local | Claude sonnet/haiku/opus (cloud) | +| Reliability | 10/10 at temp=0 | 94-100% first-attempt, 0 retries | +| Test scope | 3 scenarios (envelope only) | 180 tests across all BEM domains | + +BEM-AI's multi-agent approach targets small local models that struggle with large tool surfaces. With Claude-class models, dynamic discovery handles 142 tools without routing overhead. + +## Lessons and Recommendations + +### Findings (Deduplicated) + +1. **Server instructions are the biggest lever.** NEVER/ALWAYS guardrails for 6 domains gave +39pp (44% -> 83%) in one change. All subsequent description/tool changes combined added ~13pp. + +2. **Description compression was counterproductive.** ToolSearch (shipped Jan 14, 2026) already solved context size. Compressing descriptions removed the keywords ToolSearch needs for matching. Rich descriptions with domain keywords are the mechanism. + +3. 
**Tags are inert metadata.** Not in MCP wire protocol, never sent to clients, not used by ToolSearch. Only useful for server-side enable/disable (which requires `tools/list_changed` -- unsupported by Claude Desktop/Code). + +4. **Typed tools > generic tools for discovery.** `list_spaces` passes L1; `list_model_objects("Space")` fails. Don't consolidate typed tools further -- they serve as discoverable entry points. Generic tools are fallbacks for uncommon types. + +5. **ToolSearch indexes at Docker build time.** Volume-mounted code is invisible until `docker build`. CI handles this automatically. Local dev requires manual rebuild after adding tools. + +6. **~90% L1 is the ceiling for 142 tools.** Remaining failures are genuinely ambiguous prompts where multiple tools are reasonable. Not fixable by description enrichment or tool count reduction. + +7. **ToolSearch overhead is minimal.** 1.9-2.0 avg calls for Sonnet/Opus. Well under the "need sub-agents" threshold of >4. + +8. **Haiku's failures are reasoning, not discovery.** Zero ToolSearch calls + L3 failures (explicit tool name in prompt) confirm the bottleneck is model capability, not tool surface. + +9. **No cross-client discovery standard exists.** 142 tools works on Claude Code (ToolSearch) and Claude Desktop (brute force). Blocked on Cursor (40 cap), marginal on Windsurf/Gemini. Server split is the only universal fix. + +10. **Don't collapse tools into meta-tools.** Shifts "which tool?" to "which parameter?" -- LLMs are equally bad at both when option count is high. Every winning approach filters tools per turn, not reduces catalog. 
+ +### Action Items + +| Priority | Action | Status | +|----------|--------|--------| +| Done | Description enrichment for bypass-prone tools | Shipped Mar 19 | +| Done | Docker rebuild after new tools | CI handles; documented | +| Do | Fix 3 structural test issues (qaqc, troubleshoot, energy-report) | Lifts all models to 97-98% | +| Do | Stronger Haiku system prompt ("always use MCP tools") | Addresses 4 no_mcp_tool failures | +| Do if needed | Profile-based server split for Cursor/Windsurf/OpenAI | Only for cross-client support | +| Watch | MCP Lazy Hydration (#1978), MCP-Zero pull model, `tools/list_changed` | Spec evolution | +| Don't | Sub-agent routing | All signals in "dynamic discovery OK" range | +| Don't | Further tool consolidation | Typed > generic, proven by L1 tests | + +## Citations + +### Academic +- RAG-MCP: arxiv:2505.03275 -- semantic retrieval for MCP tools +- MCP-Zero: arxiv:2506.01056 -- agent-pull model, hierarchical routing +- MCP Tool Descriptions Are Smelly: arxiv:2602.14878 -- 97.1% smell rate +- ToolTree: arxiv:2603.12740 (ICLR 2026) -- MCTS hierarchical planning +- AWO Meta-Tools: arxiv:2601.22037 -- composite tool bundling + +### Industry +- Anthropic Advanced Tool Use: anthropic.com/engineering/advanced-tool-use +- Anthropic Tool Search docs: platform.claude.com/docs/en/agents-and-tools/tool-use/tool-search-tool +- GitHub Copilot fewer tools: github.blog/ai-and-ml/github-copilot/how-were-making-github-copilot-smarter-with-fewer-tools/ +- Stacklok vs Tool Search: stacklok.com/blog/stackloks-mcp-optimizer-vs-anthropics-tool-search-tool +- Red Hat Tool2Vec: next.redhat.com/2025/12/05/a-practical-approach-to-smart-tool-retrieval +- Allen Chan tool count: achan2013.medium.com/how-many-tools-functions-can-an-ai-agent-has + +### MCP Spec +- MCP Tools spec: modelcontextprotocol.io/specification/2025-06-18/server/tools +- SEP-1300 groups+tags (rejected): github.com/modelcontextprotocol/modelcontextprotocol/issues/1300 +- #1978 Lazy Hydration: 
github.com/modelcontextprotocol/modelcontextprotocol/issues/1978 +- Client capabilities: github.com/apify/mcp-client-capabilities + +### Raw Data +- Sonnet sweep: `docs/sweeps/sonnet-2026-03-28/` +- Haiku sweep: `docs/sweeps/haiku-2026-03-28/` +- Opus sweep: `docs/sweeps/opus-2026-03-28/` diff --git a/docs/sweeps/codemode-off-2026-04-05/benchmark.json b/docs/sweeps/codemode-off-2026-04-05/benchmark.json new file mode 100644 index 0000000..eeb3773 --- /dev/null +++ b/docs/sweeps/codemode-off-2026-04-05/benchmark.json @@ -0,0 +1,4152 @@ +{ + "timestamp": "2026-04-05T18:11:01+00:00", + "model": "sonnet", + "retries": 0, + "code_mode": false, + "code_mode_tests": 0, + "total_tests": 129, + "passed": 123, + "failed": 6, + "pass_rate": 95.3, + "total_duration_s": 4140.4, + "total_input_tokens": 1260, + "total_output_tokens": 127859, + "total_cache_read_tokens": 12330023, + "total_cost_usd": 9.2912, + "tiers": { + "progressive": { + "total": 129, + "passed": 123, + "duration_s": 4140.4, + "pass_rate": 95.3 + } + }, + "tests": [ + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[import_floorplan_L1]", + "passed": true, + "duration_s": 84.5, + "tier": "progressive", + "attempt": 1, + "num_turns": 10, + "cost_usd": 0.22197315, + "duration_ms": 82046, + "input_tokens": 20, + "output_tokens": 3572, + "cache_read_tokens": 200173, + "tool_calls": [ + "list_skills", + "get_skill", + "list_files", + "create_example_osm", + "import_floorspacejs" + ], + "num_tool_calls": 5, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__list_skills", + "ToolSearch", + "mcp__openstudio__get_skill", + "ToolSearch", + "mcp__openstudio__list_files", + "ToolSearch", + "mcp__openstudio__create_example_osm", + "mcp__openstudio__import_floorspacejs" + ], + "toolsearch_count": 4, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[import_floorplan_L2]", + "passed": true, + 
"duration_s": 120.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 0, + "cost_usd": 0.0, + "duration_ms": 0, + "input_tokens": 0, + "output_tokens": 0, + "cache_read_tokens": 0, + "tool_calls": [ + "import_floorspacejs", + "list_files", + "list_files" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__import_floorspacejs", + "ToolSearch", + "mcp__openstudio__list_files", + "mcp__openstudio__list_files" + ], + "toolsearch_count": 2, + "is_timeout": true, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[import_floorplan_L3]", + "passed": true, + "duration_s": 65.7, + "tier": "progressive", + "attempt": 1, + "num_turns": 7, + "cost_usd": 0.0988626, + "duration_ms": 63429, + "input_tokens": 13, + "output_tokens": 904, + "cache_read_tokens": 125812, + "tool_calls": [ + "import_floorspacejs", + "list_files", + "list_files", + "import_floorspacejs" + ], + "num_tool_calls": 4, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__import_floorspacejs", + "ToolSearch", + "mcp__openstudio__list_files", + "mcp__openstudio__list_files", + "mcp__openstudio__import_floorspacejs" + ], + "toolsearch_count": 2, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[add_hvac_L1]", + "passed": true, + "duration_s": 49.0, + "tier": "progressive", + "attempt": 1, + "num_turns": 13, + "cost_usd": 0.12445140000000002, + "duration_ms": 46861, + "input_tokens": 21, + "output_tokens": 1798, + "cache_read_tokens": 214728, + "tool_calls": [ + "load_osm_model", + "list_skills", + "get_building_info", + "list_thermal_zones", + "add_baseline_system", + "save_osm_model" + ], + "num_tool_calls": 6, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_skills", + "Skill", + "mcp__openstudio__get_building_info", + "ToolSearch", + 
"mcp__openstudio__list_thermal_zones", + "ToolSearch", + "mcp__openstudio__add_baseline_system", + "ToolSearch", + "mcp__openstudio__save_osm_model" + ], + "toolsearch_count": 4, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[add_hvac_L2]", + "passed": true, + "duration_s": 16.8, + "tier": "progressive", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.054671700000000004, + "duration_ms": 14639, + "input_tokens": 9, + "output_tokens": 753, + "cache_read_tokens": 96624, + "tool_calls": [ + "load_osm_model", + "list_thermal_zones", + "add_baseline_system" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_thermal_zones", + "mcp__openstudio__add_baseline_system" + ], + "toolsearch_count": 1, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[add_hvac_L3]", + "passed": true, + "duration_s": 22.0, + "tier": "progressive", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.051965700000000004, + "duration_ms": 19927, + "input_tokens": 9, + "output_tokens": 772, + "cache_read_tokens": 97504, + "tool_calls": [ + "load_osm_model", + "list_thermal_zones", + "add_baseline_system" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_thermal_zones", + "mcp__openstudio__add_baseline_system" + ], + "toolsearch_count": 1, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[view_model_L1]", + "passed": true, + "duration_s": 20.4, + "tier": "progressive", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.05025105, + "duration_ms": 18270, + "input_tokens": 12, + "output_tokens": 617, + "cache_read_tokens": 114946, + "tool_calls": [ + 
"load_osm_model", + "view_model", + "copy_file" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__view_model", + "ToolSearch", + "mcp__openstudio__copy_file" + ], + "toolsearch_count": 2, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[view_model_L2]", + "passed": true, + "duration_s": 20.5, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.06826035, + "duration_ms": 18312, + "input_tokens": 8, + "output_tokens": 493, + "cache_read_tokens": 65842, + "tool_calls": [ + "load_osm_model", + "view_model" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__view_model" + ], + "toolsearch_count": 1, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[view_model_L3]", + "passed": true, + "duration_s": 26.0, + "tier": "progressive", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.0850857, + "duration_ms": 23884, + "input_tokens": 12, + "output_tokens": 637, + "cache_read_tokens": 105024, + "tool_calls": [ + "load_osm_model", + "view_model", + "copy_file" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__view_model", + "ToolSearch", + "mcp__openstudio__copy_file" + ], + "toolsearch_count": 2, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[set_weather_L1]", + "passed": true, + "duration_s": 34.8, + "tier": "progressive", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.0851376, + "duration_ms": 32574, + "input_tokens": 9, + "output_tokens": 1202, + "cache_read_tokens": 103027, + "tool_calls": [ + "load_osm_model", + "list_weather_files", + 
"change_building_location" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_weather_files", + "mcp__openstudio__change_building_location" + ], + "toolsearch_count": 1, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[set_weather_L2]", + "passed": true, + "duration_s": 45.6, + "tier": "progressive", + "attempt": 1, + "num_turns": 7, + "cost_usd": 0.14180099999999998, + "duration_ms": 43471, + "input_tokens": 13, + "output_tokens": 1643, + "cache_read_tokens": 135290, + "tool_calls": [ + "load_osm_model", + "change_building_location", + "list_weather_files", + "change_building_location" + ], + "num_tool_calls": 4, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__change_building_location", + "ToolSearch", + "mcp__openstudio__list_weather_files", + "mcp__openstudio__change_building_location" + ], + "toolsearch_count": 2, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[set_weather_L3]", + "passed": true, + "duration_s": 45.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.13331264999999998, + "duration_ms": 42993, + "input_tokens": 12, + "output_tokens": 1644, + "cache_read_tokens": 105768, + "tool_calls": [ + "load_osm_model", + "change_building_location", + "list_weather_files" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__change_building_location", + "ToolSearch", + "mcp__openstudio__list_weather_files" + ], + "toolsearch_count": 2, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[run_qaqc_L1]", + "passed": true, + "duration_s": 15.8, + "tier": 
"progressive", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.07714005, + "duration_ms": 13661, + "input_tokens": 11, + "output_tokens": 545, + "cache_read_tokens": 85936, + "tool_calls": [ + "load_osm_model", + "validate_model" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__validate_model" + ], + "toolsearch_count": 2, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[run_qaqc_L2]", + "passed": true, + "duration_s": 32.6, + "tier": "progressive", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.08301434999999999, + "duration_ms": 30538, + "input_tokens": 11, + "output_tokens": 901, + "cache_read_tokens": 86767, + "tool_calls": [ + "load_osm_model", + "validate_model" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__validate_model" + ], + "toolsearch_count": 2, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[run_qaqc_L3]", + "passed": true, + "duration_s": 18.8, + "tier": "progressive", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.11932545, + "duration_ms": 16663, + "input_tokens": 11, + "output_tokens": 954, + "cache_read_tokens": 77429, + "tool_calls": [ + "load_osm_model", + "inspect_osm_summary", + "validate_model" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__inspect_osm_summary", + "ToolSearch", + "mcp__openstudio__validate_model" + ], + "toolsearch_count": 2, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[create_building_L1]", + "passed": true, + "duration_s": 118.8, + "tier": "progressive", + 
"attempt": 1, + "num_turns": 14, + "cost_usd": 0.31488134999999995, + "duration_ms": 116562, + "input_tokens": 23, + "output_tokens": 4467, + "cache_read_tokens": 346312, + "tool_calls": [ + "list_skills", + "get_skill", + "list_weather_files", + "create_new_building", + "change_building_location", + "create_typical_building", + "save_osm_model", + "get_model_summary", + "save_osm_model" + ], + "num_tool_calls": 9, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__list_skills", + "mcp__openstudio__get_skill", + "mcp__openstudio__list_weather_files", + "ToolSearch", + "mcp__openstudio__create_new_building", + "ToolSearch", + "mcp__openstudio__change_building_location", + "mcp__openstudio__create_typical_building", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__get_model_summary", + "ToolSearch", + "mcp__openstudio__save_osm_model" + ], + "toolsearch_count": 4, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[create_building_L2]", + "passed": true, + "duration_s": 120.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 0, + "cost_usd": 0.0, + "duration_ms": 0, + "input_tokens": 0, + "output_tokens": 0, + "cache_read_tokens": 0, + "tool_calls": [ + "create_new_building", + "create_new_building", + "list_weather_files", + "change_building_location", + "change_building_location", + "create_typical_building" + ], + "num_tool_calls": 6, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__create_new_building", + "mcp__openstudio__create_new_building", + "ToolSearch", + "mcp__openstudio__list_weather_files", + "mcp__openstudio__change_building_location", + "mcp__openstudio__change_building_location", + "mcp__openstudio__create_typical_building", + "Read", + "Grep", + "Read", + "Bash" + ], + "toolsearch_count": 2, + "is_timeout": true, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": 
"tests/llm/test_06_progressive.py::test_progressive[create_building_L3]", + "passed": true, + "duration_s": 18.4, + "tier": "progressive", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.06862485, + "duration_ms": 16199, + "input_tokens": 7, + "output_tokens": 455, + "cache_read_tokens": 46967, + "tool_calls": [ + "create_bar_building" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__create_bar_building" + ], + "toolsearch_count": 1, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[add_pv_L1]", + "passed": true, + "duration_s": 28.2, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.03491055, + "duration_ms": 26074, + "input_tokens": 8, + "output_tokens": 484, + "cache_read_tokens": 75901, + "tool_calls": [ + "load_osm_model", + "add_rooftop_pv" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__add_rooftop_pv" + ], + "toolsearch_count": 1, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[add_pv_L2]", + "passed": true, + "duration_s": 20.8, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.035721300000000004, + "duration_ms": 18492, + "input_tokens": 8, + "output_tokens": 530, + "cache_read_tokens": 75966, + "tool_calls": [ + "load_osm_model", + "add_rooftop_pv" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__add_rooftop_pv" + ], + "toolsearch_count": 1, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[add_pv_L3]", + "passed": true, + "duration_s": 16.8, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 
0.034326449999999994, + "duration_ms": 14640, + "input_tokens": 8, + "output_tokens": 455, + "cache_read_tokens": 75979, + "tool_calls": [ + "load_osm_model", + "add_rooftop_pv" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__add_rooftop_pv" + ], + "toolsearch_count": 1, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[thermostat_L1]", + "passed": true, + "duration_s": 17.8, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.03568395, + "duration_ms": 15731, + "input_tokens": 8, + "output_tokens": 433, + "cache_read_tokens": 75354, + "tool_calls": [ + "load_osm_model", + "adjust_thermostat_setpoints" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__adjust_thermostat_setpoints" + ], + "toolsearch_count": 1, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[thermostat_L2]", + "passed": true, + "duration_s": 17.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.03527759999999999, + "duration_ms": 14978, + "input_tokens": 8, + "output_tokens": 415, + "cache_read_tokens": 75362, + "tool_calls": [ + "load_osm_model", + "adjust_thermostat_setpoints" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__adjust_thermostat_setpoints" + ], + "toolsearch_count": 1, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[thermostat_L3]", + "passed": true, + "duration_s": 18.4, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.0336747, + "duration_ms": 16294, + "input_tokens": 8, + "output_tokens": 444, 
+ "cache_read_tokens": 75994, + "tool_calls": [ + "load_osm_model", + "adjust_thermostat_setpoints" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__adjust_thermostat_setpoints" + ], + "toolsearch_count": 1, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[list_spaces_L1]", + "passed": true, + "duration_s": 13.7, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.0367509, + "duration_ms": 11502, + "input_tokens": 8, + "output_tokens": 470, + "cache_read_tokens": 75898, + "tool_calls": [ + "load_osm_model", + "list_spaces" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_spaces" + ], + "toolsearch_count": 1, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[list_spaces_L2]", + "passed": true, + "duration_s": 14.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.0437529, + "duration_ms": 11840, + "input_tokens": 8, + "output_tokens": 757, + "cache_read_tokens": 75238, + "tool_calls": [ + "load_osm_model", + "list_spaces" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_spaces" + ], + "toolsearch_count": 1, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[list_spaces_L3]", + "passed": true, + "duration_s": 15.5, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.07465635, + "duration_ms": 13512, + "input_tokens": 8, + "output_tokens": 702, + "cache_read_tokens": 65962, + "tool_calls": [ + "load_osm_model", + "list_spaces" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + 
"ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_spaces" + ], + "toolsearch_count": 1, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[schedules_L1]", + "passed": true, + "duration_s": 24.4, + "tier": "progressive", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.0850971, + "duration_ms": 22286, + "input_tokens": 9, + "output_tokens": 892, + "cache_read_tokens": 86942, + "tool_calls": [ + "load_osm_model", + "list_model_objects", + "list_model_objects" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__list_model_objects" + ], + "toolsearch_count": 1, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[schedules_L2]", + "passed": true, + "duration_s": 25.3, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.0733446, + "duration_ms": 21193, + "input_tokens": 8, + "output_tokens": 649, + "cache_read_tokens": 66452, + "tool_calls": [ + "load_osm_model", + "list_model_objects" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_model_objects" + ], + "toolsearch_count": 1, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[schedules_L3]", + "passed": true, + "duration_s": 16.4, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.07633140000000001, + "duration_ms": 14321, + "input_tokens": 8, + "output_tokens": 652, + "cache_read_tokens": 65658, + "tool_calls": [ + "load_osm_model", + "list_model_objects" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + 
"mcp__openstudio__list_model_objects" + ], + "toolsearch_count": 1, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[inspect_component_L1]", + "passed": true, + "duration_s": 23.2, + "tier": "progressive", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.07858230000000001, + "duration_ms": 20983, + "input_tokens": 9, + "output_tokens": 554, + "cache_read_tokens": 86526, + "tool_calls": [ + "load_osm_model", + "list_model_objects", + "get_component_properties" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__get_component_properties" + ], + "toolsearch_count": 1, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[inspect_component_L2]", + "passed": true, + "duration_s": 14.3, + "tier": "progressive", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.07951965, + "duration_ms": 12135, + "input_tokens": 9, + "output_tokens": 621, + "cache_read_tokens": 86588, + "tool_calls": [ + "load_osm_model", + "list_model_objects", + "get_component_properties" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__get_component_properties" + ], + "toolsearch_count": 1, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[inspect_component_L3]", + "passed": true, + "duration_s": 29.4, + "tier": "progressive", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.09338685, + "duration_ms": 27273, + "input_tokens": 12, + "output_tokens": 859, + "cache_read_tokens": 106582, + "tool_calls": [ + "load_osm_model", + "list_model_objects", + "get_object_fields" + ], + 
"num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__get_object_fields" + ], + "toolsearch_count": 2, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[modify_component_L1]", + "passed": true, + "duration_s": 29.9, + "tier": "progressive", + "attempt": 1, + "num_turns": 8, + "cost_usd": 0.10993634999999999, + "duration_ms": 27917, + "input_tokens": 14, + "output_tokens": 1025, + "cache_read_tokens": 149177, + "tool_calls": [ + "load_osm_model", + "list_model_objects", + "get_component_properties", + "set_component_properties", + "save_osm_model" + ], + "num_tool_calls": 5, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__get_component_properties", + "mcp__openstudio__set_component_properties", + "ToolSearch", + "mcp__openstudio__save_osm_model" + ], + "toolsearch_count": 2, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[modify_component_L2]", + "passed": true, + "duration_s": 16.0, + "tier": "progressive", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.0778884, + "duration_ms": 13678, + "input_tokens": 9, + "output_tokens": 557, + "cache_read_tokens": 86913, + "tool_calls": [ + "load_osm_model", + "list_model_objects", + "set_component_properties" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__set_component_properties" + ], + "toolsearch_count": 1, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[modify_component_L3]", + "passed": true, + 
"duration_s": 19.9, + "tier": "progressive", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.0849597, + "duration_ms": 17807, + "input_tokens": 9, + "output_tokens": 615, + "cache_read_tokens": 86309, + "tool_calls": [ + "load_osm_model", + "list_model_objects", + "set_object_property" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__set_object_property" + ], + "toolsearch_count": 1, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[list_dynamic_type_L1]", + "passed": true, + "duration_s": 36.0, + "tier": "progressive", + "attempt": 1, + "num_turns": 18, + "cost_usd": 0.1738947, + "duration_ms": 33920, + "input_tokens": 12, + "output_tokens": 2133, + "cache_read_tokens": 95204, + "tool_calls": [ + "load_osm_model", + "get_simulation_control", + "list_air_loops", + "list_thermal_zones", + "get_sizing_system_properties", + "get_sizing_zone_properties", + "get_sizing_zone_properties", + "get_sizing_zone_properties", + "get_sizing_zone_properties", + "get_sizing_zone_properties", + "get_sizing_zone_properties", + "get_sizing_zone_properties", + "get_sizing_zone_properties", + "get_sizing_zone_properties", + "get_sizing_zone_properties" + ], + "num_tool_calls": 15, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__get_simulation_control", + "mcp__openstudio__list_air_loops", + "mcp__openstudio__list_thermal_zones", + "mcp__openstudio__get_sizing_system_properties", + "mcp__openstudio__get_sizing_zone_properties", + "mcp__openstudio__get_sizing_zone_properties", + "mcp__openstudio__get_sizing_zone_properties", + "mcp__openstudio__get_sizing_zone_properties", + "mcp__openstudio__get_sizing_zone_properties", + "mcp__openstudio__get_sizing_zone_properties", + 
"mcp__openstudio__get_sizing_zone_properties", + "mcp__openstudio__get_sizing_zone_properties", + "mcp__openstudio__get_sizing_zone_properties", + "mcp__openstudio__get_sizing_zone_properties" + ], + "toolsearch_count": 2, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[list_dynamic_type_L2]", + "passed": true, + "duration_s": 15.5, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.07031369999999999, + "duration_ms": 13460, + "input_tokens": 8, + "output_tokens": 517, + "cache_read_tokens": 66249, + "tool_calls": [ + "load_osm_model", + "list_model_objects" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_model_objects" + ], + "toolsearch_count": 1, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[list_dynamic_type_L3]", + "passed": true, + "duration_s": 13.2, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.07054455, + "duration_ms": 10876, + "input_tokens": 8, + "output_tokens": 529, + "cache_read_tokens": 66281, + "tool_calls": [ + "load_osm_model", + "list_model_objects" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_model_objects" + ], + "toolsearch_count": 1, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[floor_area_L1]", + "passed": true, + "duration_s": 21.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.06966629999999999, + "duration_ms": 18972, + "input_tokens": 8, + "output_tokens": 497, + "cache_read_tokens": 65516, + "tool_calls": [ + "load_osm_model", + "get_building_info" + ], + "num_tool_calls": 2, + 
"all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__get_building_info" + ], + "toolsearch_count": 1, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[floor_area_L2]", + "passed": true, + "duration_s": 17.2, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.06776775, + "duration_ms": 15081, + "input_tokens": 8, + "output_tokens": 369, + "cache_read_tokens": 65525, + "tool_calls": [ + "load_osm_model", + "get_building_info" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__get_building_info" + ], + "toolsearch_count": 1, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[floor_area_L3]", + "passed": true, + "duration_s": 20.0, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.06811755, + "duration_ms": 17847, + "input_tokens": 8, + "output_tokens": 436, + "cache_read_tokens": 65816, + "tool_calls": [ + "load_osm_model", + "get_building_info" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__get_building_info" + ], + "toolsearch_count": 1, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[materials_L1]", + "passed": true, + "duration_s": 18.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.07556489999999999, + "duration_ms": 15894, + "input_tokens": 8, + "output_tokens": 704, + "cache_read_tokens": 65728, + "tool_calls": [ + "load_osm_model", + "list_materials" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_materials" + ], + 
"toolsearch_count": 1, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[materials_L2]", + "passed": true, + "duration_s": 17.6, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.08083649999999999, + "duration_ms": 15411, + "input_tokens": 8, + "output_tokens": 968, + "cache_read_tokens": 65350, + "tool_calls": [ + "load_osm_model", + "list_materials" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_materials" + ], + "toolsearch_count": 1, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[materials_L3]", + "passed": true, + "duration_s": 22.4, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.07863794999999998, + "duration_ms": 20240, + "input_tokens": 8, + "output_tokens": 906, + "cache_read_tokens": 65734, + "tool_calls": [ + "load_osm_model", + "list_materials" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_materials" + ], + "toolsearch_count": 1, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[thermal_zones_L1]", + "passed": false, + "duration_s": 16.8, + "tier": "progressive", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.0584067, + "duration_ms": 14515, + "input_tokens": 7, + "output_tokens": 275, + "cache_read_tokens": 46544, + "tool_calls": [ + "load_osm_model" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model" + ], + "toolsearch_count": 1, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0, + "failure_mode": "wrong_tool" + }, + { + "test_id": 
"tests/llm/test_06_progressive.py::test_progressive[thermal_zones_L2]", + "passed": true, + "duration_s": 15.5, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.07339155, + "duration_ms": 13392, + "input_tokens": 8, + "output_tokens": 702, + "cache_read_tokens": 66046, + "tool_calls": [ + "load_osm_model", + "list_thermal_zones" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_thermal_zones" + ], + "toolsearch_count": 1, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[thermal_zones_L3]", + "passed": true, + "duration_s": 20.2, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.07192844999999999, + "duration_ms": 18019, + "input_tokens": 8, + "output_tokens": 605, + "cache_read_tokens": 66044, + "tool_calls": [ + "load_osm_model", + "list_thermal_zones" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_thermal_zones" + ], + "toolsearch_count": 1, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[subsurfaces_L1]", + "passed": true, + "duration_s": 16.5, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.03329505, + "duration_ms": 14351, + "input_tokens": 8, + "output_tokens": 393, + "cache_read_tokens": 75916, + "tool_calls": [ + "load_osm_model", + "list_subsurfaces" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_subsurfaces" + ], + "toolsearch_count": 1, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[subsurfaces_L2]", + "passed": true, + "duration_s": 12.5, + 
"tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.035889449999999996, + "duration_ms": 10339, + "input_tokens": 8, + "output_tokens": 439, + "cache_read_tokens": 75489, + "tool_calls": [ + "load_osm_model", + "list_subsurfaces" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_subsurfaces" + ], + "toolsearch_count": 1, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[subsurfaces_L3]", + "passed": true, + "duration_s": 10.0, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.10168245, + "duration_ms": 7839, + "input_tokens": 8, + "output_tokens": 418, + "cache_read_tokens": 56299, + "tool_calls": [ + "load_osm_model", + "list_subsurfaces" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_subsurfaces" + ], + "toolsearch_count": 1, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[surface_details_L1]", + "passed": true, + "duration_s": 23.3, + "tier": "progressive", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.09108195, + "duration_ms": 21233, + "input_tokens": 9, + "output_tokens": 923, + "cache_read_tokens": 87679, + "tool_calls": [ + "load_osm_model", + "list_surfaces", + "get_surface_details", + "get_surface_details" + ], + "num_tool_calls": 4, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_surfaces", + "mcp__openstudio__get_surface_details", + "mcp__openstudio__get_surface_details" + ], + "toolsearch_count": 1, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[surface_details_L2]", + "passed": true, + "duration_s": 
23.5, + "tier": "progressive", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.08284215, + "duration_ms": 21379, + "input_tokens": 9, + "output_tokens": 756, + "cache_read_tokens": 86288, + "tool_calls": [ + "load_osm_model", + "list_surfaces", + "get_surface_details" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_surfaces", + "mcp__openstudio__get_surface_details" + ], + "toolsearch_count": 1, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[surface_details_L3]", + "passed": true, + "duration_s": 27.6, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.1265055, + "duration_ms": 25324, + "input_tokens": 8, + "output_tokens": 1526, + "cache_read_tokens": 66305, + "tool_calls": [ + "load_osm_model", + "list_surfaces" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_surfaces" + ], + "toolsearch_count": 1, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[run_simulation_L1]", + "passed": true, + "duration_s": 189.9, + "tier": "progressive", + "attempt": 1, + "num_turns": 19, + "cost_usd": 0.25704314999999994, + "duration_ms": 187567, + "input_tokens": 29, + "output_tokens": 3842, + "cache_read_tokens": 389308, + "tool_calls": [ + "load_osm_model", + "run_simulation", + "get_run_status", + "extract_simulation_errors", + "get_weather_info", + "list_air_loops", + "delete_object", + "save_osm_model", + "run_simulation", + "get_run_status", + "extract_summary_metrics", + "extract_end_use_breakdown" + ], + "num_tool_calls": 12, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__run_simulation", + "ToolSearch", + "mcp__openstudio__get_run_status", + 
"ToolSearch", + "mcp__openstudio__extract_simulation_errors", + "mcp__openstudio__get_weather_info", + "ToolSearch", + "mcp__openstudio__list_air_loops", + "ToolSearch", + "mcp__openstudio__delete_object", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "Bash", + "mcp__openstudio__get_run_status", + "mcp__openstudio__extract_summary_metrics", + "mcp__openstudio__extract_end_use_breakdown" + ], + "toolsearch_count": 5, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[run_simulation_L2]", + "passed": true, + "duration_s": 27.3, + "tier": "progressive", + "attempt": 1, + "num_turns": 7, + "cost_usd": 0.0971193, + "duration_ms": 25021, + "input_tokens": 13, + "output_tokens": 903, + "cache_read_tokens": 126001, + "tool_calls": [ + "load_osm_model", + "run_simulation", + "get_run_status", + "get_run_status" + ], + "num_tool_calls": 4, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__run_simulation", + "ToolSearch", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status" + ], + "toolsearch_count": 2, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[run_simulation_L3]", + "passed": true, + "duration_s": 117.9, + "tier": "progressive", + "attempt": 1, + "num_turns": 8, + "cost_usd": 0.10547805, + "duration_ms": 115653, + "input_tokens": 14, + "output_tokens": 960, + "cache_read_tokens": 146391, + "tool_calls": [ + "load_osm_model", + "run_simulation", + "get_run_status", + "get_run_status" + ], + "num_tool_calls": 4, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__run_simulation", + "ToolSearch", + "mcp__openstudio__get_run_status", + "Bash", + "mcp__openstudio__get_run_status" + ], + "toolsearch_count": 2, + "is_timeout": false, + 
"code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[get_eui_L1]", + "passed": true, + "duration_s": 23.3, + "tier": "progressive", + "attempt": 1, + "num_turns": 7, + "cost_usd": 0.0842838, + "duration_ms": 21064, + "input_tokens": 11, + "output_tokens": 807, + "cache_read_tokens": 86261, + "tool_calls": [ + "extract_summary_metrics", + "extract_end_use_breakdown", + "get_run_status", + "extract_simulation_errors" + ], + "num_tool_calls": 4, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__extract_summary_metrics", + "mcp__openstudio__extract_end_use_breakdown", + "ToolSearch", + "mcp__openstudio__get_run_status", + "mcp__openstudio__extract_simulation_errors" + ], + "toolsearch_count": 2, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[get_eui_L2]", + "passed": true, + "duration_s": 23.7, + "tier": "progressive", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.08185529999999999, + "duration_ms": 21482, + "input_tokens": 11, + "output_tokens": 672, + "cache_read_tokens": 84991, + "tool_calls": [ + "extract_summary_metrics", + "get_run_status", + "extract_simulation_errors" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__extract_summary_metrics", + "ToolSearch", + "mcp__openstudio__get_run_status", + "mcp__openstudio__extract_simulation_errors" + ], + "toolsearch_count": 2, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[get_eui_L3]", + "passed": true, + "duration_s": 11.8, + "tier": "progressive", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.061070400000000004, + "duration_ms": 9672, + "input_tokens": 7, + "output_tokens": 482, + "cache_read_tokens": 46323, + "tool_calls": [ + "extract_summary_metrics" + ], + "num_tool_calls": 1, + 
"all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__extract_summary_metrics" + ], + "toolsearch_count": 1, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[end_use_breakdown_L1]", + "passed": true, + "duration_s": 32.7, + "tier": "progressive", + "attempt": 1, + "num_turns": 8, + "cost_usd": 0.10692075000000001, + "duration_ms": 30508, + "input_tokens": 15, + "output_tokens": 1062, + "cache_read_tokens": 128015, + "tool_calls": [ + "extract_end_use_breakdown", + "get_run_artifacts", + "extract_summary_metrics", + "extract_simulation_errors" + ], + "num_tool_calls": 4, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__extract_end_use_breakdown", + "ToolSearch", + "mcp__openstudio__get_run_artifacts", + "mcp__openstudio__extract_summary_metrics", + "ToolSearch", + "mcp__openstudio__extract_simulation_errors" + ], + "toolsearch_count": 3, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[end_use_breakdown_L2]", + "passed": true, + "duration_s": 22.2, + "tier": "progressive", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.08552055, + "duration_ms": 20045, + "input_tokens": 11, + "output_tokens": 839, + "cache_read_tokens": 85021, + "tool_calls": [ + "extract_end_use_breakdown", + "get_run_status", + "get_run_artifacts" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__extract_end_use_breakdown", + "ToolSearch", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_artifacts" + ], + "toolsearch_count": 2, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[end_use_breakdown_L3]", + "passed": true, + "duration_s": 18.7, + "tier": "progressive", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.0584994, + "duration_ms": 
16485, + "input_tokens": 7, + "output_tokens": 370, + "cache_read_tokens": 46253, + "tool_calls": [ + "extract_end_use_breakdown" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__extract_end_use_breakdown" + ], + "toolsearch_count": 1, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[hvac_sizing_L1]", + "passed": true, + "duration_s": 31.5, + "tier": "progressive", + "attempt": 1, + "num_turns": 7, + "cost_usd": 0.12789465, + "duration_ms": 29276, + "input_tokens": 11, + "output_tokens": 1191, + "cache_read_tokens": 74793, + "tool_calls": [ + "extract_hvac_sizing", + "extract_component_sizing", + "get_run_artifacts", + "extract_simulation_errors" + ], + "num_tool_calls": 4, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__extract_hvac_sizing", + "ToolSearch", + "mcp__openstudio__extract_component_sizing", + "mcp__openstudio__get_run_artifacts", + "mcp__openstudio__extract_simulation_errors" + ], + "toolsearch_count": 2, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[hvac_sizing_L2]", + "passed": true, + "duration_s": 18.7, + "tier": "progressive", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.0604245, + "duration_ms": 16365, + "input_tokens": 7, + "output_tokens": 440, + "cache_read_tokens": 45945, + "tool_calls": [ + "extract_hvac_sizing" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__extract_hvac_sizing" + ], + "toolsearch_count": 1, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[hvac_sizing_L3]", + "passed": true, + "duration_s": 12.2, + "tier": "progressive", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.0578652, + "duration_ms": 10011, + "input_tokens": 7, + 
"output_tokens": 340, + "cache_read_tokens": 46214, + "tool_calls": [ + "extract_hvac_sizing" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__extract_hvac_sizing" + ], + "toolsearch_count": 1, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[set_wwr_L1]", + "passed": true, + "duration_s": 40.7, + "tier": "progressive", + "attempt": 1, + "num_turns": 13, + "cost_usd": 0.11092604999999998, + "duration_ms": 38448, + "input_tokens": 12, + "output_tokens": 1527, + "cache_read_tokens": 105771, + "tool_calls": [ + "load_osm_model", + "list_surfaces", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio" + ], + "num_tool_calls": 10, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__list_surfaces", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio" + ], + "toolsearch_count": 2, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[set_wwr_L2]", + "passed": true, + "duration_s": 32.6, + "tier": "progressive", + "attempt": 1, + "num_turns": 14, + "cost_usd": 0.0905109, + "duration_ms": 30348, + "input_tokens": 11, + "output_tokens": 1563, + "cache_read_tokens": 142343, + "tool_calls": [ + "load_osm_model", + "list_surfaces", + "list_surfaces", + "set_window_to_wall_ratio", + 
"set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "save_osm_model" + ], + "num_tool_calls": 12, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_surfaces", + "mcp__openstudio__list_surfaces", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__save_osm_model" + ], + "toolsearch_count": 1, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[set_wwr_L3]", + "passed": true, + "duration_s": 31.9, + "tier": "progressive", + "attempt": 1, + "num_turns": 13, + "cost_usd": 0.07500825, + "duration_ms": 29774, + "input_tokens": 12, + "output_tokens": 1514, + "cache_read_tokens": 116395, + "tool_calls": [ + "load_osm_model", + "list_surfaces", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio" + ], + "num_tool_calls": 10, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__list_surfaces", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + 
"mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio" + ], + "toolsearch_count": 2, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[replace_windows_L1]", + "passed": true, + "duration_s": 120.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 0, + "cost_usd": 0.0, + "duration_ms": 0, + "input_tokens": 0, + "output_tokens": 0, + "cache_read_tokens": 0, + "tool_calls": [ + "load_osm_model", + "list_model_objects", + "list_model_objects", + "list_model_objects", + "list_model_objects", + "get_construction_details", + "get_construction_details", + "list_common_measures", + "list_measure_arguments", + "list_files" + ], + "num_tool_calls": 10, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__list_model_objects", + "ToolSearch", + "mcp__openstudio__get_construction_details", + "mcp__openstudio__get_construction_details", + "mcp__openstudio__list_common_measures", + "mcp__openstudio__list_measure_arguments", + "ToolSearch", + "mcp__openstudio__list_files" + ], + "toolsearch_count": 4, + "is_timeout": true, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[replace_windows_L2]", + "passed": true, + "duration_s": 120.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 0, + "cost_usd": 0.0, + "duration_ms": 0, + "input_tokens": 0, + "output_tokens": 0, + "cache_read_tokens": 0, + "tool_calls": [ + "load_osm_model", + "list_model_objects", + "list_subsurfaces", + "get_construction_details", + "get_component_properties", + "list_materials", + "list_materials", + "list_common_measures", + "list_measure_arguments", + "replace_window_constructions", + 
"get_construction_details", + "get_object_fields", + "get_object_fields", + "get_object_fields", + "get_object_fields", + "list_materials", + "get_object_fields", + "get_object_fields" + ], + "num_tool_calls": 18, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__list_subsurfaces", + "mcp__openstudio__get_construction_details", + "mcp__openstudio__get_component_properties", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__list_materials", + "mcp__openstudio__list_materials", + "mcp__openstudio__list_common_measures", + "mcp__openstudio__list_measure_arguments", + "ToolSearch", + "mcp__openstudio__replace_window_constructions", + "mcp__openstudio__get_construction_details", + "ToolSearch", + "mcp__openstudio__get_object_fields", + "mcp__openstudio__get_object_fields", + "mcp__openstudio__get_object_fields", + "mcp__openstudio__get_object_fields", + "mcp__openstudio__list_materials", + "mcp__openstudio__get_object_fields", + "mcp__openstudio__get_object_fields" + ], + "toolsearch_count": 6, + "is_timeout": true, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[replace_windows_L3]", + "passed": true, + "duration_s": 29.8, + "tier": "progressive", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.07137840000000001, + "duration_ms": 27655, + "input_tokens": 12, + "output_tokens": 1428, + "cache_read_tokens": 116358, + "tool_calls": [ + "load_osm_model", + "list_model_objects", + "replace_window_constructions" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__replace_window_constructions" + ], + "toolsearch_count": 2, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": 
"tests/llm/test_06_progressive.py::test_progressive[construction_details_L1]", + "passed": true, + "duration_s": 18.8, + "tier": "progressive", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.048168, + "duration_ms": 16598, + "input_tokens": 9, + "output_tokens": 706, + "cache_read_tokens": 95970, + "tool_calls": [ + "load_osm_model", + "list_surfaces", + "get_construction_details" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_surfaces", + "mcp__openstudio__get_construction_details" + ], + "toolsearch_count": 1, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[construction_details_L2]", + "passed": true, + "duration_s": 18.0, + "tier": "progressive", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.04690275, + "duration_ms": 15822, + "input_tokens": 9, + "output_tokens": 752, + "cache_read_tokens": 96665, + "tool_calls": [ + "load_osm_model", + "list_model_objects", + "get_construction_details" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__get_construction_details" + ], + "toolsearch_count": 1, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[construction_details_L3]", + "passed": true, + "duration_s": 31.3, + "tier": "progressive", + "attempt": 1, + "num_turns": 18, + "cost_usd": 0.12877365, + "duration_ms": 29196, + "input_tokens": 12, + "output_tokens": 2035, + "cache_read_tokens": 104438, + "tool_calls": [ + "load_osm_model", + "list_model_objects", + "get_construction_details", + "get_construction_details", + "get_construction_details", + "get_construction_details", + "get_construction_details", + "get_construction_details", + "get_construction_details", + 
"get_construction_details", + "get_construction_details", + "get_construction_details", + "get_construction_details", + "get_construction_details", + "get_construction_details" + ], + "num_tool_calls": 15, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__get_construction_details", + "mcp__openstudio__get_construction_details", + "mcp__openstudio__get_construction_details", + "mcp__openstudio__get_construction_details", + "mcp__openstudio__get_construction_details", + "mcp__openstudio__get_construction_details", + "mcp__openstudio__get_construction_details", + "mcp__openstudio__get_construction_details", + "mcp__openstudio__get_construction_details", + "mcp__openstudio__get_construction_details", + "mcp__openstudio__get_construction_details", + "mcp__openstudio__get_construction_details", + "mcp__openstudio__get_construction_details" + ], + "toolsearch_count": 2, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[check_loads_L1]", + "passed": true, + "duration_s": 18.7, + "tier": "progressive", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.0440358, + "duration_ms": 16510, + "input_tokens": 9, + "output_tokens": 578, + "cache_read_tokens": 96121, + "tool_calls": [ + "load_osm_model", + "list_spaces", + "get_space_details" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_spaces", + "mcp__openstudio__get_space_details" + ], + "toolsearch_count": 1, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[check_loads_L2]", + "passed": true, + "duration_s": 30.4, + "tier": "progressive", + "attempt": 1, + "num_turns": 11, + "cost_usd": 0.09209205000000001, + "duration_ms": 28145, + "input_tokens": 17, 
+ "output_tokens": 1346, + "cache_read_tokens": 179566, + "tool_calls": [ + "load_osm_model", + "list_spaces", + "get_space_details", + "list_model_objects", + "list_model_objects", + "get_load_details", + "get_load_details" + ], + "num_tool_calls": 7, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_spaces", + "ToolSearch", + "mcp__openstudio__get_space_details", + "ToolSearch", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__get_load_details", + "mcp__openstudio__get_load_details" + ], + "toolsearch_count": 3, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[check_loads_L3]", + "passed": true, + "duration_s": 33.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 12, + "cost_usd": 0.07638675, + "duration_ms": 30967, + "input_tokens": 12, + "output_tokens": 1730, + "cache_read_tokens": 117590, + "tool_calls": [ + "load_osm_model", + "list_model_objects", + "list_model_objects", + "list_model_objects", + "get_load_details", + "get_load_details", + "get_load_details", + "get_load_details", + "get_load_details" + ], + "num_tool_calls": 9, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__get_load_details", + "mcp__openstudio__get_load_details", + "mcp__openstudio__get_load_details", + "mcp__openstudio__get_load_details", + "mcp__openstudio__get_load_details" + ], + "toolsearch_count": 2, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[create_loads_L1]", + "passed": true, + "duration_s": 120.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 0, + "cost_usd": 0.0, + 
"duration_ms": 0, + "input_tokens": 0, + "output_tokens": 0, + "cache_read_tokens": 0, + "tool_calls": [ + "load_osm_model", + "list_spaces", + "get_model_summary", + "get_space_type_details", + "get_space_details", + "get_load_details", + "get_load_details", + "create_people_definition", + "create_people_definition", + "create_people_definition", + "create_people_definition", + "create_people_definition", + "create_people_definition", + "create_people_definition", + "create_people_definition", + "create_people_definition", + "create_people_definition", + "create_lights_definition", + "create_lights_definition", + "create_lights_definition", + "create_lights_definition", + "create_lights_definition", + "create_lights_definition", + "create_lights_definition", + "create_lights_definition", + "create_lights_definition" + ], + "num_tool_calls": 26, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_spaces", + "mcp__openstudio__get_model_summary", + "ToolSearch", + "mcp__openstudio__get_space_type_details", + "mcp__openstudio__get_space_details", + "ToolSearch", + "mcp__openstudio__get_load_details", + "mcp__openstudio__get_load_details", + "ToolSearch", + "mcp__openstudio__create_people_definition", + "mcp__openstudio__create_people_definition", + "mcp__openstudio__create_people_definition", + "mcp__openstudio__create_people_definition", + "mcp__openstudio__create_people_definition", + "mcp__openstudio__create_people_definition", + "mcp__openstudio__create_people_definition", + "mcp__openstudio__create_people_definition", + "mcp__openstudio__create_people_definition", + "mcp__openstudio__create_people_definition", + "mcp__openstudio__create_lights_definition", + "mcp__openstudio__create_lights_definition", + "mcp__openstudio__create_lights_definition", + "mcp__openstudio__create_lights_definition", + "mcp__openstudio__create_lights_definition", + "mcp__openstudio__create_lights_definition", + 
"mcp__openstudio__create_lights_definition", + "mcp__openstudio__create_lights_definition", + "mcp__openstudio__create_lights_definition" + ], + "toolsearch_count": 4, + "is_timeout": true, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[create_loads_L2]", + "passed": true, + "duration_s": 46.2, + "tier": "progressive", + "attempt": 1, + "num_turns": 24, + "cost_usd": 0.15104669999999998, + "duration_ms": 44183, + "input_tokens": 9, + "output_tokens": 3431, + "cache_read_tokens": 85749, + "tool_calls": [ + "load_osm_model", + "list_spaces", + "create_people_definition", + "create_people_definition", + "create_people_definition", + "create_people_definition", + "create_people_definition", + "create_people_definition", + "create_people_definition", + "create_people_definition", + "create_people_definition", + "create_people_definition", + "create_lights_definition", + "create_lights_definition", + "create_lights_definition", + "create_lights_definition", + "create_lights_definition", + "create_lights_definition", + "create_lights_definition", + "create_lights_definition", + "create_lights_definition", + "create_lights_definition" + ], + "num_tool_calls": 22, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_spaces", + "mcp__openstudio__create_people_definition", + "mcp__openstudio__create_people_definition", + "mcp__openstudio__create_people_definition", + "mcp__openstudio__create_people_definition", + "mcp__openstudio__create_people_definition", + "mcp__openstudio__create_people_definition", + "mcp__openstudio__create_people_definition", + "mcp__openstudio__create_people_definition", + "mcp__openstudio__create_people_definition", + "mcp__openstudio__create_people_definition", + "mcp__openstudio__create_lights_definition", + "mcp__openstudio__create_lights_definition", + "mcp__openstudio__create_lights_definition", + 
"mcp__openstudio__create_lights_definition", + "mcp__openstudio__create_lights_definition", + "mcp__openstudio__create_lights_definition", + "mcp__openstudio__create_lights_definition", + "mcp__openstudio__create_lights_definition", + "mcp__openstudio__create_lights_definition", + "mcp__openstudio__create_lights_definition" + ], + "toolsearch_count": 1, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[create_loads_L3]", + "passed": true, + "duration_s": 26.6, + "tier": "progressive", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.0610665, + "duration_ms": 24596, + "input_tokens": 12, + "output_tokens": 1047, + "cache_read_tokens": 116860, + "tool_calls": [ + "load_osm_model", + "list_spaces", + "create_people_definition" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__list_spaces", + "mcp__openstudio__create_people_definition" + ], + "toolsearch_count": 2, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[create_plant_loop_L1]", + "passed": true, + "duration_s": 15.0, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.0349464, + "duration_ms": 12948, + "input_tokens": 8, + "output_tokens": 498, + "cache_read_tokens": 76333, + "tool_calls": [ + "load_osm_model", + "create_plant_loop" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__create_plant_loop" + ], + "toolsearch_count": 1, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[create_plant_loop_L2]", + "passed": true, + "duration_s": 13.2, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.03561855, + 
"duration_ms": 11028, + "input_tokens": 8, + "output_tokens": 529, + "cache_read_tokens": 76386, + "tool_calls": [ + "load_osm_model", + "create_plant_loop" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__create_plant_loop" + ], + "toolsearch_count": 1, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[create_plant_loop_L3]", + "passed": true, + "duration_s": 19.9, + "tier": "progressive", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.0423108, + "duration_ms": 17848, + "input_tokens": 9, + "output_tokens": 665, + "cache_read_tokens": 96746, + "tool_calls": [ + "load_osm_model", + "create_plant_loop", + "create_plant_loop" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__create_plant_loop", + "mcp__openstudio__create_plant_loop" + ], + "toolsearch_count": 1, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[schedule_details_L1]", + "passed": true, + "duration_s": 120.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 0, + "cost_usd": 0.0, + "duration_ms": 0, + "input_tokens": 0, + "output_tokens": 0, + "cache_read_tokens": 0, + "tool_calls": [ + "load_osm_model", + "list_air_loops", + "get_air_loop_details", + "get_object_fields", + "get_component_properties", + "list_model_objects", + "get_schedule_details", + "get_schedule_details", + "get_schedule_details", + "list_model_objects", + "list_model_objects", + "list_model_objects", + "get_object_fields", + "get_object_fields", + "get_object_fields", + "get_thermal_zone_details", + "inspect_osm_summary", + "get_thermal_zone_details", + "inspect_osm_summary", + "read_file", + "read_file" + ], + "num_tool_calls": 21, + "all_tool_calls": [ + "ToolSearch", + 
"mcp__openstudio__load_osm_model", + "mcp__openstudio__list_air_loops", + "mcp__openstudio__get_air_loop_details", + "mcp__openstudio__get_object_fields", + "mcp__openstudio__get_component_properties", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__get_schedule_details", + "mcp__openstudio__get_schedule_details", + "mcp__openstudio__get_schedule_details", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__get_object_fields", + "mcp__openstudio__get_object_fields", + "mcp__openstudio__get_object_fields", + "mcp__openstudio__get_thermal_zone_details", + "mcp__openstudio__inspect_osm_summary", + "mcp__openstudio__get_thermal_zone_details", + "mcp__openstudio__inspect_osm_summary", + "mcp__openstudio__read_file", + "ToolSearch", + "mcp__openstudio__read_file", + "Grep", + "Read", + "Bash" + ], + "toolsearch_count": 2, + "is_timeout": true, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[schedule_details_L2]", + "passed": true, + "duration_s": 60.0, + "tier": "progressive", + "attempt": 1, + "num_turns": 10, + "cost_usd": 0.1104975, + "duration_ms": 57926, + "input_tokens": 12, + "output_tokens": 2762, + "cache_read_tokens": 158180, + "tool_calls": [ + "load_osm_model", + "list_model_objects", + "list_model_objects", + "list_model_objects", + "get_schedule_details", + "list_model_objects", + "get_schedule_details", + "get_object_fields" + ], + "num_tool_calls": 8, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__get_schedule_details", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__get_schedule_details", + "mcp__openstudio__get_object_fields" + ], + "toolsearch_count": 1, + "is_timeout": false, + 
"code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[schedule_details_L3]", + "passed": true, + "duration_s": 28.6, + "tier": "progressive", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.06146775, + "duration_ms": 26626, + "input_tokens": 12, + "output_tokens": 1021, + "cache_read_tokens": 116060, + "tool_calls": [ + "load_osm_model", + "list_model_objects", + "get_schedule_details" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__get_schedule_details" + ], + "toolsearch_count": 2, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[space_type_info_L1]", + "passed": true, + "duration_s": 25.0, + "tier": "progressive", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.05722694999999999, + "duration_ms": 22911, + "input_tokens": 9, + "output_tokens": 784, + "cache_read_tokens": 98729, + "tool_calls": [ + "load_osm_model", + "get_model_summary", + "list_spaces", + "get_space_type_details" + ], + "num_tool_calls": 4, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__get_model_summary", + "mcp__openstudio__list_spaces", + "mcp__openstudio__get_space_type_details" + ], + "toolsearch_count": 1, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[space_type_info_L2]", + "passed": true, + "duration_s": 34.3, + "tier": "progressive", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.05860485, + "duration_ms": 32176, + "input_tokens": 12, + "output_tokens": 953, + "cache_read_tokens": 115342, + "tool_calls": [ + "load_osm_model", + "list_model_objects", + "get_space_type_details" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + 
"ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__get_space_type_details" + ], + "toolsearch_count": 2, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[space_type_info_L3]", + "passed": true, + "duration_s": 19.3, + "tier": "progressive", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.057093899999999996, + "duration_ms": 17148, + "input_tokens": 12, + "output_tokens": 911, + "cache_read_tokens": 115818, + "tool_calls": [ + "load_osm_model", + "list_model_objects", + "get_space_type_details" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__get_space_type_details" + ], + "toolsearch_count": 2, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[set_run_period_L1]", + "passed": true, + "duration_s": 11.4, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.0339255, + "duration_ms": 9364, + "input_tokens": 8, + "output_tokens": 478, + "cache_read_tokens": 76030, + "tool_calls": [ + "load_osm_model", + "set_run_period" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__set_run_period" + ], + "toolsearch_count": 1, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[set_run_period_L2]", + "passed": true, + "duration_s": 19.4, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.03627435, + "duration_ms": 17212, + "input_tokens": 8, + "output_tokens": 478, + "cache_read_tokens": 75422, + "tool_calls": [ + "load_osm_model", + "set_run_period" + ], + 
"num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__set_run_period" + ], + "toolsearch_count": 1, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[set_run_period_L3]", + "passed": true, + "duration_s": 12.4, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.0342159, + "duration_ms": 10304, + "input_tokens": 8, + "output_tokens": 453, + "cache_read_tokens": 75848, + "tool_calls": [ + "load_osm_model", + "set_run_period" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__set_run_period" + ], + "toolsearch_count": 1, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[ideal_air_L1]", + "passed": true, + "duration_s": 23.5, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.03730575, + "duration_ms": 21446, + "input_tokens": 8, + "output_tokens": 757, + "cache_read_tokens": 75685, + "tool_calls": [ + "load_osm_model", + "enable_ideal_air_loads" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__enable_ideal_air_loads" + ], + "toolsearch_count": 1, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[ideal_air_L2]", + "passed": true, + "duration_s": 36.4, + "tier": "progressive", + "attempt": 1, + "num_turns": 8, + "cost_usd": 0.0929835, + "duration_ms": 34253, + "input_tokens": 16, + "output_tokens": 1558, + "cache_read_tokens": 157935, + "tool_calls": [ + "load_osm_model", + "enable_ideal_air_loads", + "list_thermal_zones", + "list_zone_hvac_equipment" + ], + "num_tool_calls": 4, + "all_tool_calls": [ + "ToolSearch", + 
"mcp__openstudio__load_osm_model", + "mcp__openstudio__enable_ideal_air_loads", + "ToolSearch", + "mcp__openstudio__list_thermal_zones", + "ToolSearch", + "mcp__openstudio__list_zone_hvac_equipment" + ], + "toolsearch_count": 3, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[ideal_air_L3]", + "passed": true, + "duration_s": 22.0, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.0374487, + "duration_ms": 19791, + "input_tokens": 8, + "output_tokens": 768, + "cache_read_tokens": 75699, + "tool_calls": [ + "load_osm_model", + "enable_ideal_air_loads" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__enable_ideal_air_loads" + ], + "toolsearch_count": 1, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[save_model_L1]", + "passed": true, + "duration_s": 15.5, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.0307266, + "duration_ms": 13024, + "input_tokens": 8, + "output_tokens": 325, + "cache_read_tokens": 75742, + "tool_calls": [ + "load_osm_model", + "save_osm_model" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__save_osm_model" + ], + "toolsearch_count": 1, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[save_model_L2]", + "passed": true, + "duration_s": 12.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.03489525, + "duration_ms": 9939, + "input_tokens": 8, + "output_tokens": 444, + "cache_read_tokens": 75325, + "tool_calls": [ + "load_osm_model", + "save_osm_model" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + 
"mcp__openstudio__load_osm_model", + "mcp__openstudio__save_osm_model" + ], + "toolsearch_count": 1, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[save_model_L3]", + "passed": true, + "duration_s": 15.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.0319071, + "duration_ms": 12947, + "input_tokens": 8, + "output_tokens": 394, + "cache_read_tokens": 75827, + "tool_calls": [ + "load_osm_model", + "save_osm_model" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__save_osm_model" + ], + "toolsearch_count": 1, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[add_ev_L1]", + "passed": true, + "duration_s": 20.7, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.03967065, + "duration_ms": 18571, + "input_tokens": 8, + "output_tokens": 569, + "cache_read_tokens": 76118, + "tool_calls": [ + "load_osm_model", + "add_ev_load" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__add_ev_load" + ], + "toolsearch_count": 1, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[add_ev_L2]", + "passed": true, + "duration_s": 27.2, + "tier": "progressive", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.05687684999999999, + "duration_ms": 25105, + "input_tokens": 9, + "output_tokens": 959, + "cache_read_tokens": 97437, + "tool_calls": [ + "load_osm_model", + "list_spaces", + "add_ev_load" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_spaces", + "mcp__openstudio__add_ev_load" + ], + "toolsearch_count": 1, + 
"is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[add_ev_L3]", + "passed": true, + "duration_s": 18.6, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.0416688, + "duration_ms": 16536, + "input_tokens": 8, + "output_tokens": 550, + "cache_read_tokens": 75416, + "tool_calls": [ + "load_osm_model", + "add_ev_load" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__add_ev_load" + ], + "toolsearch_count": 1, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[list_measures_L1]", + "passed": true, + "duration_s": 15.0, + "tier": "progressive", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.02796315, + "duration_ms": 12876, + "input_tokens": 7, + "output_tokens": 538, + "cache_read_tokens": 56353, + "tool_calls": [ + "list_custom_measures" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__list_custom_measures" + ], + "toolsearch_count": 1, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[list_measures_L2]", + "passed": true, + "duration_s": 13.6, + "tier": "progressive", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.030804749999999995, + "duration_ms": 11519, + "input_tokens": 7, + "output_tokens": 597, + "cache_read_tokens": 55800, + "tool_calls": [ + "list_custom_measures" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__list_custom_measures" + ], + "toolsearch_count": 1, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[list_measures_L3]", + "passed": true, + "duration_s": 15.1, + "tier": "progressive", + 
"attempt": 1, + "num_turns": 3, + "cost_usd": 0.027631049999999997, + "duration_ms": 12961, + "input_tokens": 7, + "output_tokens": 462, + "cache_read_tokens": 56046, + "tool_calls": [ + "list_custom_measures" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__list_custom_measures" + ], + "toolsearch_count": 1, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[create_measure_L1]", + "passed": true, + "duration_s": 16.6, + "tier": "progressive", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.0321219, + "duration_ms": 14528, + "input_tokens": 7, + "output_tokens": 619, + "cache_read_tokens": 56903, + "tool_calls": [ + "create_measure" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__create_measure" + ], + "toolsearch_count": 1, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[create_measure_L2]", + "passed": true, + "duration_s": 10.9, + "tier": "progressive", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.0283563, + "duration_ms": 8653, + "input_tokens": 7, + "output_tokens": 439, + "cache_read_tokens": 56801, + "tool_calls": [ + "create_measure" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__create_measure" + ], + "toolsearch_count": 1, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[create_measure_L3]", + "passed": true, + "duration_s": 16.4, + "tier": "progressive", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.0315003, + "duration_ms": 14364, + "input_tokens": 7, + "output_tokens": 610, + "cache_read_tokens": 56831, + "tool_calls": [ + "create_measure" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + 
"mcp__openstudio__create_measure" + ], + "toolsearch_count": 1, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[test_measure_L1]", + "passed": false, + "duration_s": 14.3, + "tier": "progressive", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.030544949999999998, + "duration_ms": 12206, + "input_tokens": 7, + "output_tokens": 516, + "cache_read_tokens": 56559, + "tool_calls": [ + "list_custom_measures" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__list_custom_measures" + ], + "toolsearch_count": 1, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[test_measure_L2]", + "passed": true, + "duration_s": 17.0, + "tier": "progressive", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.06732285, + "duration_ms": 14973, + "input_tokens": 11, + "output_tokens": 888, + "cache_read_tokens": 96287, + "tool_calls": [ + "test_measure", + "list_files" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__test_measure", + "ToolSearch", + "mcp__openstudio__list_files" + ], + "toolsearch_count": 2, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[test_measure_L3]", + "passed": true, + "duration_s": 14.4, + "tier": "progressive", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.02504955, + "duration_ms": 12302, + "input_tokens": 7, + "output_tokens": 347, + "cache_read_tokens": 56466, + "tool_calls": [ + "test_measure" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__test_measure" + ], + "toolsearch_count": 1, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": 
"tests/llm/test_06_progressive.py::test_progressive[apply_existing_measure_L1]", + "passed": true, + "duration_s": 30.6, + "tier": "progressive", + "attempt": 1, + "num_turns": 9, + "cost_usd": 0.07728990000000001, + "duration_ms": 28601, + "input_tokens": 14, + "output_tokens": 1175, + "cache_read_tokens": 158968, + "tool_calls": [ + "load_osm_model", + "list_measure_arguments", + "apply_measure" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "Bash", + "Glob", + "Glob", + "ToolSearch", + "mcp__openstudio__list_measure_arguments", + "mcp__openstudio__apply_measure" + ], + "toolsearch_count": 2, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[apply_existing_measure_L2]", + "passed": true, + "duration_s": 20.8, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.03519075, + "duration_ms": 18720, + "input_tokens": 8, + "output_tokens": 456, + "cache_read_tokens": 75360, + "tool_calls": [ + "load_osm_model", + "apply_measure" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__apply_measure" + ], + "toolsearch_count": 1, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[apply_existing_measure_L3]", + "passed": true, + "duration_s": 31.8, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.0337947, + "duration_ms": 29555, + "input_tokens": 8, + "output_tokens": 487, + "cache_read_tokens": 75994, + "tool_calls": [ + "load_osm_model", + "apply_measure" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__apply_measure" + ], + "toolsearch_count": 1, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + 
"test_id": "tests/llm/test_06_progressive.py::test_progressive[replace_terminals_cooled_beam_L1]", + "passed": true, + "duration_s": 25.3, + "tier": "progressive", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.0597153, + "duration_ms": 23262, + "input_tokens": 12, + "output_tokens": 855, + "cache_read_tokens": 115081, + "tool_calls": [ + "load_osm_model", + "list_air_loops", + "replace_air_terminals" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__list_air_loops", + "mcp__openstudio__replace_air_terminals" + ], + "toolsearch_count": 2, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[replace_terminals_cooled_beam_L2]", + "passed": true, + "duration_s": 24.2, + "tier": "progressive", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.05464125, + "duration_ms": 21793, + "input_tokens": 12, + "output_tokens": 812, + "cache_read_tokens": 116305, + "tool_calls": [ + "load_osm_model", + "list_air_loops", + "replace_air_terminals" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__list_air_loops", + "mcp__openstudio__replace_air_terminals" + ], + "toolsearch_count": 2, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[replace_terminals_cooled_beam_L3]", + "passed": true, + "duration_s": 18.2, + "tier": "progressive", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.054180450000000005, + "duration_ms": 16068, + "input_tokens": 12, + "output_tokens": 722, + "cache_read_tokens": 116019, + "tool_calls": [ + "load_osm_model", + "list_air_loops", + "replace_air_terminals" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + 
"mcp__openstudio__list_air_loops", + "mcp__openstudio__replace_air_terminals" + ], + "toolsearch_count": 2, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[replace_terminals_four_pipe_beam_L1]", + "passed": true, + "duration_s": 31.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 7, + "cost_usd": 0.0725466, + "duration_ms": 29040, + "input_tokens": 13, + "output_tokens": 1301, + "cache_read_tokens": 138667, + "tool_calls": [ + "load_osm_model", + "list_air_loops", + "replace_air_terminals", + "save_osm_model" + ], + "num_tool_calls": 4, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_air_loops", + "mcp__openstudio__replace_air_terminals", + "ToolSearch", + "mcp__openstudio__save_osm_model" + ], + "toolsearch_count": 2, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[replace_terminals_four_pipe_beam_L2]", + "passed": true, + "duration_s": 24.0, + "tier": "progressive", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.0477069, + "duration_ms": 22043, + "input_tokens": 9, + "output_tokens": 754, + "cache_read_tokens": 95933, + "tool_calls": [ + "load_osm_model", + "list_air_loops", + "replace_air_terminals" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_air_loops", + "mcp__openstudio__replace_air_terminals" + ], + "toolsearch_count": 1, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[replace_terminals_four_pipe_beam_L3]", + "passed": true, + "duration_s": 24.2, + "tier": "progressive", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.057055049999999996, + "duration_ms": 22130, + "input_tokens": 12, + "output_tokens": 898, + 
"cache_read_tokens": 116101, + "tool_calls": [ + "load_osm_model", + "list_air_loops", + "replace_air_terminals" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__list_air_loops", + "mcp__openstudio__replace_air_terminals" + ], + "toolsearch_count": 2, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[measure_replace_terminals_L1]", + "passed": true, + "duration_s": 120.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 0, + "cost_usd": 0.0, + "duration_ms": 0, + "input_tokens": 0, + "output_tokens": 0, + "cache_read_tokens": 0, + "tool_calls": [ + "load_osm_model", + "list_skills", + "get_skill", + "list_air_loops", + "search_wiring_patterns", + "search_api", + "list_plant_loops", + "create_measure" + ], + "num_tool_calls": 8, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_skills", + "ToolSearch", + "mcp__openstudio__get_skill", + "mcp__openstudio__list_air_loops", + "mcp__openstudio__search_wiring_patterns", + "mcp__openstudio__search_api", + "ToolSearch", + "mcp__openstudio__list_plant_loops", + "mcp__openstudio__create_measure" + ], + "toolsearch_count": 3, + "is_timeout": true, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[measure_replace_terminals_L2]", + "passed": true, + "duration_s": 78.4, + "tier": "progressive", + "attempt": 1, + "num_turns": 11, + "cost_usd": 0.17329184999999997, + "duration_ms": 76248, + "input_tokens": 16, + "output_tokens": 4669, + "cache_read_tokens": 185367, + "tool_calls": [ + "load_osm_model", + "search_wiring_patterns", + "search_api", + "list_air_loops", + "list_plant_loops", + "create_measure", + "test_measure" + ], + "num_tool_calls": 7, + "all_tool_calls": [ + "ToolSearch", + 
"mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__search_wiring_patterns", + "mcp__openstudio__search_api", + "mcp__openstudio__list_air_loops", + "mcp__openstudio__list_plant_loops", + "mcp__openstudio__create_measure", + "ToolSearch", + "mcp__openstudio__test_measure" + ], + "toolsearch_count": 3, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[measure_replace_terminals_L3]", + "passed": true, + "duration_s": 31.9, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.0612585, + "duration_ms": 29697, + "input_tokens": 7, + "output_tokens": 2145, + "cache_read_tokens": 57325, + "tool_calls": [ + "load_osm_model", + "create_measure" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__create_measure" + ], + "toolsearch_count": 1, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[zone_equipment_priority_L1]", + "passed": true, + "duration_s": 51.3, + "tier": "progressive", + "attempt": 1, + "num_turns": 9, + "cost_usd": 0.1088622, + "duration_ms": 49247, + "input_tokens": 15, + "output_tokens": 2471, + "cache_read_tokens": 183374, + "tool_calls": [ + "load_osm_model", + "list_thermal_zones", + "add_zone_equipment", + "list_zone_hvac_equipment", + "set_zone_equipment_priority", + "set_zone_equipment_priority" + ], + "num_tool_calls": 6, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_thermal_zones", + "mcp__openstudio__add_zone_equipment", + "ToolSearch", + "mcp__openstudio__list_zone_hvac_equipment", + "mcp__openstudio__set_zone_equipment_priority", + "mcp__openstudio__set_zone_equipment_priority" + ], + "toolsearch_count": 2, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { 
+ "test_id": "tests/llm/test_06_progressive.py::test_progressive[zone_equipment_priority_L2]", + "passed": true, + "duration_s": 66.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 11, + "cost_usd": 0.13511025, + "duration_ms": 64028, + "input_tokens": 16, + "output_tokens": 3022, + "cache_read_tokens": 210620, + "tool_calls": [ + "load_osm_model", + "list_thermal_zones", + "add_zone_equipment", + "list_zone_hvac_equipment", + "get_thermal_zone_details", + "get_zone_hvac_details", + "get_air_loop_details", + "set_zone_equipment_priority" + ], + "num_tool_calls": 8, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_thermal_zones", + "mcp__openstudio__add_zone_equipment", + "ToolSearch", + "mcp__openstudio__list_zone_hvac_equipment", + "mcp__openstudio__get_thermal_zone_details", + "mcp__openstudio__get_zone_hvac_details", + "mcp__openstudio__get_air_loop_details", + "mcp__openstudio__set_zone_equipment_priority" + ], + "toolsearch_count": 2, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[zone_equipment_priority_L3]", + "passed": false, + "duration_s": 20.6, + "tier": "progressive", + "attempt": 1, + "num_turns": 11, + "cost_usd": 0.13511025, + "duration_ms": 64028, + "input_tokens": 16, + "output_tokens": 3022, + "cache_read_tokens": 210620, + "tool_calls": [ + "load_osm_model", + "list_thermal_zones", + "add_zone_equipment", + "list_zone_hvac_equipment", + "get_thermal_zone_details", + "get_zone_hvac_details", + "get_air_loop_details", + "set_zone_equipment_priority" + ], + "num_tool_calls": 8, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_thermal_zones", + "mcp__openstudio__add_zone_equipment", + "ToolSearch", + "mcp__openstudio__list_zone_hvac_equipment", + "mcp__openstudio__get_thermal_zone_details", + "mcp__openstudio__get_zone_hvac_details", + 
"mcp__openstudio__get_air_loop_details", + "mcp__openstudio__set_zone_equipment_priority" + ], + "toolsearch_count": 2, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[edit_measure_L1]", + "passed": false, + "duration_s": 2.5, + "tier": "progressive", + "attempt": 1, + "num_turns": 11, + "cost_usd": 0.13511025, + "duration_ms": 64028, + "input_tokens": 16, + "output_tokens": 3022, + "cache_read_tokens": 210620, + "tool_calls": [ + "load_osm_model", + "list_thermal_zones", + "add_zone_equipment", + "list_zone_hvac_equipment", + "get_thermal_zone_details", + "get_zone_hvac_details", + "get_air_loop_details", + "set_zone_equipment_priority" + ], + "num_tool_calls": 8, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_thermal_zones", + "mcp__openstudio__add_zone_equipment", + "ToolSearch", + "mcp__openstudio__list_zone_hvac_equipment", + "mcp__openstudio__get_thermal_zone_details", + "mcp__openstudio__get_zone_hvac_details", + "mcp__openstudio__get_air_loop_details", + "mcp__openstudio__set_zone_equipment_priority" + ], + "toolsearch_count": 2, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[edit_measure_L2]", + "passed": false, + "duration_s": 2.5, + "tier": "progressive", + "attempt": 1, + "num_turns": 11, + "cost_usd": 0.13511025, + "duration_ms": 64028, + "input_tokens": 16, + "output_tokens": 3022, + "cache_read_tokens": 210620, + "tool_calls": [ + "load_osm_model", + "list_thermal_zones", + "add_zone_equipment", + "list_zone_hvac_equipment", + "get_thermal_zone_details", + "get_zone_hvac_details", + "get_air_loop_details", + "set_zone_equipment_priority" + ], + "num_tool_calls": 8, + "all_tool_calls": [ + "ToolSearch", + 
"mcp__openstudio__load_osm_model", + "mcp__openstudio__list_thermal_zones", + "mcp__openstudio__add_zone_equipment", + "ToolSearch", + "mcp__openstudio__list_zone_hvac_equipment", + "mcp__openstudio__get_thermal_zone_details", + "mcp__openstudio__get_zone_hvac_details", + "mcp__openstudio__get_air_loop_details", + "mcp__openstudio__set_zone_equipment_priority" + ], + "toolsearch_count": 2, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[edit_measure_L3]", + "passed": false, + "duration_s": 2.3, + "tier": "progressive", + "attempt": 1, + "num_turns": 11, + "cost_usd": 0.13511025, + "duration_ms": 64028, + "input_tokens": 16, + "output_tokens": 3022, + "cache_read_tokens": 210620, + "tool_calls": [ + "load_osm_model", + "list_thermal_zones", + "add_zone_equipment", + "list_zone_hvac_equipment", + "get_thermal_zone_details", + "get_zone_hvac_details", + "get_air_loop_details", + "set_zone_equipment_priority" + ], + "num_tool_calls": 8, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_thermal_zones", + "mcp__openstudio__add_zone_equipment", + "ToolSearch", + "mcp__openstudio__list_zone_hvac_equipment", + "mcp__openstudio__get_thermal_zone_details", + "mcp__openstudio__get_zone_hvac_details", + "mcp__openstudio__get_air_loop_details", + "mcp__openstudio__set_zone_equipment_priority" + ], + "toolsearch_count": 2, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0, + "failure_mode": "wrong_tool" + } + ] +} \ No newline at end of file diff --git a/docs/sweeps/codemode-off-2026-04-05/benchmark.md b/docs/sweeps/codemode-off-2026-04-05/benchmark.md new file mode 100644 index 0000000..7fb6e08 --- /dev/null +++ b/docs/sweeps/codemode-off-2026-04-05/benchmark.md @@ -0,0 +1,223 @@ +# LLM Benchmark Report + +**Date:** 2026-04-05T18:11:01+00:00 +**Model:** sonnet | **Retries:** 
0 | **CodeMode:** OFF +**Result:** 123/129 passed (95.3%) in 4140s +**Tokens:** 1.3k in + 127.9k out + 12.3M cache | **Cost:** $9.2912 (notional API pricing) + +## Summary by Tier + +| Tier | Passed | Rate | Time | Avg | +|--------|---------|--------|--------|--------| +| progressive | 123/129 | 95.3% | 4140s | 32s | + +## Detailed Results + +### progressive + +| Test | Result | Time | Turns | Tools | In Tok | Out Tok | Cache | Cost | Att | +|-------------------------------------|--------|------|-------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------|---------|--------|---------|-----| +| import_floorplan_L1 | PASS | 84s | 10 | list_skills, get_skill, list_files, create_example_osm, import_floorspacejs | 20 | 3.6k | 200.2k | $0.2220 | 1 | +| import_floorplan_L2 | PASS | 120s | 0 | import_floorspacejs, list_files, list_files | 0 | 0 | 0 | $0.0000 | 1 | +| import_floorplan_L3 | PASS | 66s | 7 | import_floorspacejs, list_files, list_files, import_floorspacejs | 13 | 904 | 125.8k | $0.0989 | 1 | +| add_hvac_L1 | PASS | 49s | 13 | load_osm_model, list_skills, get_building_info, list_thermal_zones, add_baseline_system, save_osm_model | 21 | 1.8k | 214.7k | $0.1245 | 1 | +| add_hvac_L2 | PASS | 17s | 5 | load_osm_model, list_thermal_zones, add_baseline_system | 9 | 753 | 96.6k | $0.0547 | 1 | +| add_hvac_L3 | PASS | 22s | 5 | load_osm_model, list_thermal_zones, add_baseline_system | 9 | 772 | 97.5k | 
$0.0520 | 1 | +| view_model_L1 | PASS | 20s | 6 | load_osm_model, view_model, copy_file | 12 | 617 | 114.9k | $0.0503 | 1 | +| view_model_L2 | PASS | 20s | 4 | load_osm_model, view_model | 8 | 493 | 65.8k | $0.0683 | 1 | +| view_model_L3 | PASS | 26s | 6 | load_osm_model, view_model, copy_file | 12 | 637 | 105.0k | $0.0851 | 1 | +| set_weather_L1 | PASS | 35s | 5 | load_osm_model, list_weather_files, change_building_location | 9 | 1.2k | 103.0k | $0.0851 | 1 | +| set_weather_L2 | PASS | 46s | 7 | load_osm_model, change_building_location, list_weather_files, change_building_location | 13 | 1.6k | 135.3k | $0.1418 | 1 | +| set_weather_L3 | PASS | 45s | 6 | load_osm_model, change_building_location, list_weather_files | 12 | 1.6k | 105.8k | $0.1333 | 1 | +| run_qaqc_L1 | PASS | 16s | 5 | load_osm_model, validate_model | 11 | 545 | 85.9k | $0.0771 | 1 | +| run_qaqc_L2 | PASS | 33s | 5 | load_osm_model, validate_model | 11 | 901 | 86.8k | $0.0830 | 1 | +| run_qaqc_L3 | PASS | 19s | 6 | load_osm_model, inspect_osm_summary, validate_model | 11 | 954 | 77.4k | $0.1193 | 1 | +| create_building_L1 | PASS | 119s | 14 | list_skills, get_skill, list_weather_files, create_new_building, change_building_location, create_typical_building, save_osm_model, get_model_summary, save_osm_model | 23 | 4.5k | 346.3k | $0.3149 | 1 | +| create_building_L2 | PASS | 120s | 0 | create_new_building, create_new_building, list_weather_files, change_building_location, change_building_location, create_typical_building | 0 | 0 | 0 | $0.0000 | 1 | +| create_building_L3 | PASS | 18s | 3 | create_bar_building | 7 | 455 | 47.0k | $0.0686 | 1 | +| add_pv_L1 | PASS | 28s | 4 | load_osm_model, add_rooftop_pv | 8 | 484 | 75.9k | $0.0349 | 1 | +| add_pv_L2 | PASS | 21s | 4 | load_osm_model, add_rooftop_pv | 8 | 530 | 76.0k | $0.0357 | 1 | +| add_pv_L3 | PASS | 17s | 4 | load_osm_model, add_rooftop_pv | 8 | 455 | 76.0k | $0.0343 | 1 | +| thermostat_L1 | PASS | 18s | 4 | load_osm_model, 
adjust_thermostat_setpoints | 8 | 433 | 75.4k | $0.0357 | 1 | +| thermostat_L2 | PASS | 17s | 4 | load_osm_model, adjust_thermostat_setpoints | 8 | 415 | 75.4k | $0.0353 | 1 | +| thermostat_L3 | PASS | 18s | 4 | load_osm_model, adjust_thermostat_setpoints | 8 | 444 | 76.0k | $0.0337 | 1 | +| list_spaces_L1 | PASS | 14s | 4 | load_osm_model, list_spaces | 8 | 470 | 75.9k | $0.0368 | 1 | +| list_spaces_L2 | PASS | 14s | 4 | load_osm_model, list_spaces | 8 | 757 | 75.2k | $0.0438 | 1 | +| list_spaces_L3 | PASS | 16s | 4 | load_osm_model, list_spaces | 8 | 702 | 66.0k | $0.0747 | 1 | +| schedules_L1 | PASS | 24s | 5 | load_osm_model, list_model_objects, list_model_objects | 9 | 892 | 86.9k | $0.0851 | 1 | +| schedules_L2 | PASS | 25s | 4 | load_osm_model, list_model_objects | 8 | 649 | 66.5k | $0.0733 | 1 | +| schedules_L3 | PASS | 16s | 4 | load_osm_model, list_model_objects | 8 | 652 | 65.7k | $0.0763 | 1 | +| inspect_component_L1 | PASS | 23s | 5 | load_osm_model, list_model_objects, get_component_properties | 9 | 554 | 86.5k | $0.0786 | 1 | +| inspect_component_L2 | PASS | 14s | 5 | load_osm_model, list_model_objects, get_component_properties | 9 | 621 | 86.6k | $0.0795 | 1 | +| inspect_component_L3 | PASS | 29s | 6 | load_osm_model, list_model_objects, get_object_fields | 12 | 859 | 106.6k | $0.0934 | 1 | +| modify_component_L1 | PASS | 30s | 8 | load_osm_model, list_model_objects, get_component_properties, set_component_properties, save_osm_model | 14 | 1.0k | 149.2k | $0.1099 | 1 | +| modify_component_L2 | PASS | 16s | 5 | load_osm_model, list_model_objects, set_component_properties | 9 | 557 | 86.9k | $0.0779 | 1 | +| modify_component_L3 | PASS | 20s | 5 | load_osm_model, list_model_objects, set_object_property | 9 | 615 | 86.3k | $0.0850 | 1 | +| list_dynamic_type_L1 | PASS | 36s | 18 | load_osm_model, get_simulation_control, list_air_loops, list_thermal_zones, get_sizing_system_properties, get_sizing_zone_properties, get_sizing_zone_properties, 
get_sizing_zone_properties, get_sizing_zone_properties, get_sizing_zone_properties, get_sizing_zone_properties, get_sizing_zone_properties, get_sizing_zone_properties, get_sizing_zone_properties, get_sizing_zone_properties | 12 | 2.1k | 95.2k | $0.1739 | 1 | +| list_dynamic_type_L2 | PASS | 16s | 4 | load_osm_model, list_model_objects | 8 | 517 | 66.2k | $0.0703 | 1 | +| list_dynamic_type_L3 | PASS | 13s | 4 | load_osm_model, list_model_objects | 8 | 529 | 66.3k | $0.0705 | 1 | +| floor_area_L1 | PASS | 21s | 4 | load_osm_model, get_building_info | 8 | 497 | 65.5k | $0.0697 | 1 | +| floor_area_L2 | PASS | 17s | 4 | load_osm_model, get_building_info | 8 | 369 | 65.5k | $0.0678 | 1 | +| floor_area_L3 | PASS | 20s | 4 | load_osm_model, get_building_info | 8 | 436 | 65.8k | $0.0681 | 1 | +| materials_L1 | PASS | 18s | 4 | load_osm_model, list_materials | 8 | 704 | 65.7k | $0.0756 | 1 | +| materials_L2 | PASS | 18s | 4 | load_osm_model, list_materials | 8 | 968 | 65.3k | $0.0808 | 1 | +| materials_L3 | PASS | 22s | 4 | load_osm_model, list_materials | 8 | 906 | 65.7k | $0.0786 | 1 | +| thermal_zones_L1 | FAIL | 17s | 3 | load_osm_model | 7 | 275 | 46.5k | $0.0584 | 1 | +| thermal_zones_L2 | PASS | 16s | 4 | load_osm_model, list_thermal_zones | 8 | 702 | 66.0k | $0.0734 | 1 | +| thermal_zones_L3 | PASS | 20s | 4 | load_osm_model, list_thermal_zones | 8 | 605 | 66.0k | $0.0719 | 1 | +| subsurfaces_L1 | PASS | 16s | 4 | load_osm_model, list_subsurfaces | 8 | 393 | 75.9k | $0.0333 | 1 | +| subsurfaces_L2 | PASS | 12s | 4 | load_osm_model, list_subsurfaces | 8 | 439 | 75.5k | $0.0359 | 1 | +| subsurfaces_L3 | PASS | 10s | 4 | load_osm_model, list_subsurfaces | 8 | 418 | 56.3k | $0.1017 | 1 | +| surface_details_L1 | PASS | 23s | 6 | load_osm_model, list_surfaces, get_surface_details, get_surface_details | 9 | 923 | 87.7k | $0.0911 | 1 | +| surface_details_L2 | PASS | 24s | 5 | load_osm_model, list_surfaces, get_surface_details | 9 | 756 | 86.3k | $0.0828 | 1 | +| 
surface_details_L3 | PASS | 28s | 4 | load_osm_model, list_surfaces | 8 | 1.5k | 66.3k | $0.1265 | 1 | +| run_simulation_L1 | PASS | 190s | 19 | load_osm_model, run_simulation, get_run_status, extract_simulation_errors, get_weather_info, list_air_loops, delete_object, save_osm_model, run_simulation, get_run_status, extract_summary_metrics, extract_end_use_breakdown | 29 | 3.8k | 389.3k | $0.2570 | 1 | +| run_simulation_L2 | PASS | 27s | 7 | load_osm_model, run_simulation, get_run_status, get_run_status | 13 | 903 | 126.0k | $0.0971 | 1 | +| run_simulation_L3 | PASS | 118s | 8 | load_osm_model, run_simulation, get_run_status, get_run_status | 14 | 960 | 146.4k | $0.1055 | 1 | +| get_eui_L1 | PASS | 23s | 7 | extract_summary_metrics, extract_end_use_breakdown, get_run_status, extract_simulation_errors | 11 | 807 | 86.3k | $0.0843 | 1 | +| get_eui_L2 | PASS | 24s | 6 | extract_summary_metrics, get_run_status, extract_simulation_errors | 11 | 672 | 85.0k | $0.0819 | 1 | +| get_eui_L3 | PASS | 12s | 3 | extract_summary_metrics | 7 | 482 | 46.3k | $0.0611 | 1 | +| end_use_breakdown_L1 | PASS | 33s | 8 | extract_end_use_breakdown, get_run_artifacts, extract_summary_metrics, extract_simulation_errors | 15 | 1.1k | 128.0k | $0.1069 | 1 | +| end_use_breakdown_L2 | PASS | 22s | 6 | extract_end_use_breakdown, get_run_status, get_run_artifacts | 11 | 839 | 85.0k | $0.0855 | 1 | +| end_use_breakdown_L3 | PASS | 19s | 3 | extract_end_use_breakdown | 7 | 370 | 46.3k | $0.0585 | 1 | +| hvac_sizing_L1 | PASS | 32s | 7 | extract_hvac_sizing, extract_component_sizing, get_run_artifacts, extract_simulation_errors | 11 | 1.2k | 74.8k | $0.1279 | 1 | +| hvac_sizing_L2 | PASS | 19s | 3 | extract_hvac_sizing | 7 | 440 | 45.9k | $0.0604 | 1 | +| hvac_sizing_L3 | PASS | 12s | 3 | extract_hvac_sizing | 7 | 340 | 46.2k | $0.0579 | 1 | +| set_wwr_L1 | PASS | 41s | 13 | load_osm_model, list_surfaces, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, 
set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio | 12 | 1.5k | 105.8k | $0.1109 | 1 | +| set_wwr_L2 | PASS | 33s | 14 | load_osm_model, list_surfaces, list_surfaces, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, save_osm_model | 11 | 1.6k | 142.3k | $0.0905 | 1 | +| set_wwr_L3 | PASS | 32s | 13 | load_osm_model, list_surfaces, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio | 12 | 1.5k | 116.4k | $0.0750 | 1 | +| replace_windows_L1 | PASS | 120s | 0 | load_osm_model, list_model_objects, list_model_objects, list_model_objects, list_model_objects, get_construction_details, get_construction_details, list_common_measures, list_measure_arguments, list_files | 0 | 0 | 0 | $0.0000 | 1 | +| replace_windows_L2 | PASS | 120s | 0 | load_osm_model, list_model_objects, list_subsurfaces, get_construction_details, get_component_properties, list_materials, list_materials, list_common_measures, list_measure_arguments, replace_window_constructions, get_construction_details, get_object_fields, get_object_fields, get_object_fields, get_object_fields, list_materials, get_object_fields, get_object_fields | 0 | 0 | 0 | $0.0000 | 1 | +| replace_windows_L3 | PASS | 30s | 6 | load_osm_model, list_model_objects, replace_window_constructions | 12 | 1.4k | 116.4k | $0.0714 | 1 | +| construction_details_L1 | PASS | 19s | 5 | load_osm_model, list_surfaces, get_construction_details | 9 | 706 | 96.0k | $0.0482 | 1 | +| construction_details_L2 | PASS | 18s | 5 | load_osm_model, list_model_objects, get_construction_details | 9 | 752 | 96.7k | $0.0469 | 1 | +| construction_details_L3 | PASS | 31s | 18 | 
load_osm_model, list_model_objects, get_construction_details, get_construction_details, get_construction_details, get_construction_details, get_construction_details, get_construction_details, get_construction_details, get_construction_details, get_construction_details, get_construction_details, get_construction_details, get_construction_details, get_construction_details | 12 | 2.0k | 104.4k | $0.1288 | 1 | +| check_loads_L1 | PASS | 19s | 5 | load_osm_model, list_spaces, get_space_details | 9 | 578 | 96.1k | $0.0440 | 1 | +| check_loads_L2 | PASS | 30s | 11 | load_osm_model, list_spaces, get_space_details, list_model_objects, list_model_objects, get_load_details, get_load_details | 17 | 1.3k | 179.6k | $0.0921 | 1 | +| check_loads_L3 | PASS | 33s | 12 | load_osm_model, list_model_objects, list_model_objects, list_model_objects, get_load_details, get_load_details, get_load_details, get_load_details, get_load_details | 12 | 1.7k | 117.6k | $0.0764 | 1 | +| create_loads_L1 | PASS | 120s | 0 | load_osm_model, list_spaces, get_model_summary, get_space_type_details, get_space_details, get_load_details, get_load_details, create_people_definition, create_people_definition, create_people_definition, create_people_definition, create_people_definition, create_people_definition, create_people_definition, create_people_definition, create_people_definition, create_people_definition, create_lights_definition, create_lights_definition, create_lights_definition, create_lights_definition, create_lights_definition, create_lights_definition, create_lights_definition, create_lights_definition, create_lights_definition | 0 | 0 | 0 | $0.0000 | 1 | +| create_loads_L2 | PASS | 46s | 24 | load_osm_model, list_spaces, create_people_definition, create_people_definition, create_people_definition, create_people_definition, create_people_definition, create_people_definition, create_people_definition, create_people_definition, create_people_definition, create_people_definition, 
create_lights_definition, create_lights_definition, create_lights_definition, create_lights_definition, create_lights_definition, create_lights_definition, create_lights_definition, create_lights_definition, create_lights_definition, create_lights_definition | 9 | 3.4k | 85.7k | $0.1510 | 1 | +| create_loads_L3 | PASS | 27s | 6 | load_osm_model, list_spaces, create_people_definition | 12 | 1.0k | 116.9k | $0.0611 | 1 | +| create_plant_loop_L1 | PASS | 15s | 4 | load_osm_model, create_plant_loop | 8 | 498 | 76.3k | $0.0349 | 1 | +| create_plant_loop_L2 | PASS | 13s | 4 | load_osm_model, create_plant_loop | 8 | 529 | 76.4k | $0.0356 | 1 | +| create_plant_loop_L3 | PASS | 20s | 5 | load_osm_model, create_plant_loop, create_plant_loop | 9 | 665 | 96.7k | $0.0423 | 1 | +| schedule_details_L1 | PASS | 120s | 0 | load_osm_model, list_air_loops, get_air_loop_details, get_object_fields, get_component_properties, list_model_objects, get_schedule_details, get_schedule_details, get_schedule_details, list_model_objects, list_model_objects, list_model_objects, get_object_fields, get_object_fields, get_object_fields, get_thermal_zone_details, inspect_osm_summary, get_thermal_zone_details, inspect_osm_summary, read_file, read_file | 0 | 0 | 0 | $0.0000 | 1 | +| schedule_details_L2 | PASS | 60s | 10 | load_osm_model, list_model_objects, list_model_objects, list_model_objects, get_schedule_details, list_model_objects, get_schedule_details, get_object_fields | 12 | 2.8k | 158.2k | $0.1105 | 1 | +| schedule_details_L3 | PASS | 29s | 6 | load_osm_model, list_model_objects, get_schedule_details | 12 | 1.0k | 116.1k | $0.0615 | 1 | +| space_type_info_L1 | PASS | 25s | 6 | load_osm_model, get_model_summary, list_spaces, get_space_type_details | 9 | 784 | 98.7k | $0.0572 | 1 | +| space_type_info_L2 | PASS | 34s | 6 | load_osm_model, list_model_objects, get_space_type_details | 12 | 953 | 115.3k | $0.0586 | 1 | +| space_type_info_L3 | PASS | 19s | 6 | load_osm_model, list_model_objects, 
get_space_type_details | 12 | 911 | 115.8k | $0.0571 | 1 | +| set_run_period_L1 | PASS | 11s | 4 | load_osm_model, set_run_period | 8 | 478 | 76.0k | $0.0339 | 1 | +| set_run_period_L2 | PASS | 19s | 4 | load_osm_model, set_run_period | 8 | 478 | 75.4k | $0.0363 | 1 | +| set_run_period_L3 | PASS | 12s | 4 | load_osm_model, set_run_period | 8 | 453 | 75.8k | $0.0342 | 1 | +| ideal_air_L1 | PASS | 24s | 4 | load_osm_model, enable_ideal_air_loads | 8 | 757 | 75.7k | $0.0373 | 1 | +| ideal_air_L2 | PASS | 36s | 8 | load_osm_model, enable_ideal_air_loads, list_thermal_zones, list_zone_hvac_equipment | 16 | 1.6k | 157.9k | $0.0930 | 1 | +| ideal_air_L3 | PASS | 22s | 4 | load_osm_model, enable_ideal_air_loads | 8 | 768 | 75.7k | $0.0374 | 1 | +| save_model_L1 | PASS | 16s | 4 | load_osm_model, save_osm_model | 8 | 325 | 75.7k | $0.0307 | 1 | +| save_model_L2 | PASS | 12s | 4 | load_osm_model, save_osm_model | 8 | 444 | 75.3k | $0.0349 | 1 | +| save_model_L3 | PASS | 15s | 4 | load_osm_model, save_osm_model | 8 | 394 | 75.8k | $0.0319 | 1 | +| add_ev_L1 | PASS | 21s | 4 | load_osm_model, add_ev_load | 8 | 569 | 76.1k | $0.0397 | 1 | +| add_ev_L2 | PASS | 27s | 5 | load_osm_model, list_spaces, add_ev_load | 9 | 959 | 97.4k | $0.0569 | 1 | +| add_ev_L3 | PASS | 19s | 4 | load_osm_model, add_ev_load | 8 | 550 | 75.4k | $0.0417 | 1 | +| list_measures_L1 | PASS | 15s | 3 | list_custom_measures | 7 | 538 | 56.4k | $0.0280 | 1 | +| list_measures_L2 | PASS | 14s | 3 | list_custom_measures | 7 | 597 | 55.8k | $0.0308 | 1 | +| list_measures_L3 | PASS | 15s | 3 | list_custom_measures | 7 | 462 | 56.0k | $0.0276 | 1 | +| create_measure_L1 | PASS | 17s | 3 | create_measure | 7 | 619 | 56.9k | $0.0321 | 1 | +| create_measure_L2 | PASS | 11s | 3 | create_measure | 7 | 439 | 56.8k | $0.0284 | 1 | +| create_measure_L3 | PASS | 16s | 3 | create_measure | 7 | 610 | 56.8k | $0.0315 | 1 | +| test_measure_L1 | FAIL | 14s | 3 | list_custom_measures | 7 | 516 | 56.6k | $0.0305 | 1 | +| 
test_measure_L2 | PASS | 17s | 5 | test_measure, list_files | 11 | 888 | 96.3k | $0.0673 | 1 | +| test_measure_L3 | PASS | 14s | 3 | test_measure | 7 | 347 | 56.5k | $0.0250 | 1 | +| apply_existing_measure_L1 | PASS | 31s | 9 | load_osm_model, list_measure_arguments, apply_measure | 14 | 1.2k | 159.0k | $0.0773 | 1 | +| apply_existing_measure_L2 | PASS | 21s | 4 | load_osm_model, apply_measure | 8 | 456 | 75.4k | $0.0352 | 1 | +| apply_existing_measure_L3 | PASS | 32s | 4 | load_osm_model, apply_measure | 8 | 487 | 76.0k | $0.0338 | 1 | +| replace_terminals_cooled_beam_L1 | PASS | 25s | 6 | load_osm_model, list_air_loops, replace_air_terminals | 12 | 855 | 115.1k | $0.0597 | 1 | +| replace_terminals_cooled_beam_L2 | PASS | 24s | 6 | load_osm_model, list_air_loops, replace_air_terminals | 12 | 812 | 116.3k | $0.0546 | 1 | +| replace_terminals_cooled_beam_L3 | PASS | 18s | 6 | load_osm_model, list_air_loops, replace_air_terminals | 12 | 722 | 116.0k | $0.0542 | 1 | +| replace_terminals_four_pipe_beam_L1 | PASS | 31s | 7 | load_osm_model, list_air_loops, replace_air_terminals, save_osm_model | 13 | 1.3k | 138.7k | $0.0725 | 1 | +| replace_terminals_four_pipe_beam_L2 | PASS | 24s | 5 | load_osm_model, list_air_loops, replace_air_terminals | 9 | 754 | 95.9k | $0.0477 | 1 | +| replace_terminals_four_pipe_beam_L3 | PASS | 24s | 6 | load_osm_model, list_air_loops, replace_air_terminals | 12 | 898 | 116.1k | $0.0571 | 1 | +| measure_replace_terminals_L1 | PASS | 120s | 0 | load_osm_model, list_skills, get_skill, list_air_loops, search_wiring_patterns, search_api, list_plant_loops, create_measure | 0 | 0 | 0 | $0.0000 | 1 | +| measure_replace_terminals_L2 | PASS | 78s | 11 | load_osm_model, search_wiring_patterns, search_api, list_air_loops, list_plant_loops, create_measure, test_measure | 16 | 4.7k | 185.4k | $0.1733 | 1 | +| measure_replace_terminals_L3 | PASS | 32s | 4 | load_osm_model, create_measure | 7 | 2.1k | 57.3k | $0.0613 | 1 | +| zone_equipment_priority_L1 | PASS 
| 51s | 9 | load_osm_model, list_thermal_zones, add_zone_equipment, list_zone_hvac_equipment, set_zone_equipment_priority, set_zone_equipment_priority | 15 | 2.5k | 183.4k | $0.1089 | 1 | +| zone_equipment_priority_L2 | PASS | 66s | 11 | load_osm_model, list_thermal_zones, add_zone_equipment, list_zone_hvac_equipment, get_thermal_zone_details, get_zone_hvac_details, get_air_loop_details, set_zone_equipment_priority | 16 | 3.0k | 210.6k | $0.1351 | 1 | +| zone_equipment_priority_L3 | FAIL | 21s | 11 | load_osm_model, list_thermal_zones, add_zone_equipment, list_zone_hvac_equipment, get_thermal_zone_details, get_zone_hvac_details, get_air_loop_details, set_zone_equipment_priority | 16 | 3.0k | 210.6k | $0.1351 | 1 | +| edit_measure_L1 | FAIL | 2s | 11 | load_osm_model, list_thermal_zones, add_zone_equipment, list_zone_hvac_equipment, get_thermal_zone_details, get_zone_hvac_details, get_air_loop_details, set_zone_equipment_priority | 16 | 3.0k | 210.6k | $0.1351 | 1 | +| edit_measure_L2 | FAIL | 2s | 11 | load_osm_model, list_thermal_zones, add_zone_equipment, list_zone_hvac_equipment, get_thermal_zone_details, get_zone_hvac_details, get_air_loop_details, set_zone_equipment_priority | 16 | 3.0k | 210.6k | $0.1351 | 1 | +| edit_measure_L3 | FAIL | 2s | 11 | load_osm_model, list_thermal_zones, add_zone_equipment, list_zone_hvac_equipment, get_thermal_zone_details, get_zone_hvac_details, get_air_loop_details, set_zone_equipment_priority | 16 | 3.0k | 210.6k | $0.1351 | 1 | + +## Progressive Prompt Analysis + +Pass rates by specificity level per case: + +| Case | L1 (vague) | L2 (moderate) | L3 (explicit) | +|----------------------|------------|---------------|---------------| +| import_floorplan | PASS | PASS | PASS | +| add_hvac | PASS | PASS | PASS | +| view_model | PASS | PASS | PASS | +| set_weather | PASS | PASS | PASS | +| run_qaqc | PASS | PASS | PASS | +| create_building | PASS | PASS | PASS | +| add_pv | PASS | PASS | PASS | +| thermostat | PASS | PASS | PASS | 
+| list_spaces | PASS | PASS | PASS | +| schedules | PASS | PASS | PASS | +| inspect_component | PASS | PASS | PASS | +| modify_component | PASS | PASS | PASS | +| list_dynamic_type | PASS | PASS | PASS | +| floor_area | PASS | PASS | PASS | +| materials | PASS | PASS | PASS | +| thermal_zones | FAIL | PASS | PASS | +| subsurfaces | PASS | PASS | PASS | +| surface_details | PASS | PASS | PASS | +| run_simulation | PASS | PASS | PASS | +| get_eui | PASS | PASS | PASS | +| end_use_breakdown | PASS | PASS | PASS | +| hvac_sizing | PASS | PASS | PASS | +| set_wwr | PASS | PASS | PASS | +| replace_windows | PASS | PASS | PASS | +| construction_details | PASS | PASS | PASS | +| check_loads | PASS | PASS | PASS | +| create_loads | PASS | PASS | PASS | +| create_plant_loop | PASS | PASS | PASS | +| schedule_details | PASS | PASS | PASS | +| space_type_info | PASS | PASS | PASS | +| set_run_period | PASS | PASS | PASS | +| ideal_air | PASS | PASS | PASS | +| save_model | PASS | PASS | PASS | +| add_ev | PASS | PASS | PASS | +| list_measures | PASS | PASS | PASS | +| create_measure | PASS | PASS | PASS | +| test_measure | FAIL | PASS | PASS | +| apply_existing_measure | PASS | PASS | PASS | +| replace_terminals_cooled_beam | PASS | PASS | PASS | +| replace_terminals_four_pipe_beam | PASS | PASS | PASS | +| measure_replace_terminals | PASS | PASS | PASS | +| zone_equipment_priority | PASS | PASS | FAIL | +| edit_measure | FAIL | FAIL | FAIL | + +**Summary:** L1=40/43 | L2=42/43 | L3=41/43 + +## Tool Discovery Overhead + +| Metric | Value | +|--------|-------| +| Avg ToolSearch calls/test | 1.6 | +| Max ToolSearch calls | 6 | +| Tests with 0 ToolSearch | 0/129 | + +## Failure Mode Analysis + +| Mode | Count | Description | +|------|-------|-------------| +| wrong_tool | 6 | MCP tool called but not the expected one | + +## Failed Tests + +- **thermal_zones_L1** (progressive, wrong_tool): 17s, 3 turns, tools: load_osm_model +- **test_measure_L1** (progressive, wrong_tool): 14s, 
3 turns, tools: list_custom_measures +- **zone_equipment_priority_L3** (progressive, wrong_tool): 21s, 11 turns, tools: load_osm_model -> list_thermal_zones -> add_zone_equipment -> list_zone_hvac_equipment -> get_thermal_zone_details -> get_zone_hvac_details -> get_air_loop_details -> set_zone_equipment_priority +- **edit_measure_L1** (progressive, wrong_tool): 2s, 11 turns, tools: load_osm_model -> list_thermal_zones -> add_zone_equipment -> list_zone_hvac_equipment -> get_thermal_zone_details -> get_zone_hvac_details -> get_air_loop_details -> set_zone_equipment_priority +- **edit_measure_L2** (progressive, wrong_tool): 2s, 11 turns, tools: load_osm_model -> list_thermal_zones -> add_zone_equipment -> list_zone_hvac_equipment -> get_thermal_zone_details -> get_zone_hvac_details -> get_air_loop_details -> set_zone_equipment_priority +- **edit_measure_L3** (progressive, wrong_tool): 2s, 11 turns, tools: load_osm_model -> list_thermal_zones -> add_zone_equipment -> list_zone_hvac_equipment -> get_thermal_zone_details -> get_zone_hvac_details -> get_air_loop_details -> set_zone_equipment_priority diff --git a/docs/sweeps/codemode-on-2026-04-05/benchmark.json b/docs/sweeps/codemode-on-2026-04-05/benchmark.json new file mode 100644 index 0000000..ffbf377 --- /dev/null +++ b/docs/sweeps/codemode-on-2026-04-05/benchmark.json @@ -0,0 +1,5051 @@ +{ + "timestamp": "2026-04-05T22:50:04+00:00", + "model": "sonnet", + "retries": 0, + "code_mode": true, + "code_mode_tests": 128, + "total_tests": 129, + "passed": 31, + "failed": 98, + "pass_rate": 24.0, + "total_duration_s": 10101.7, + "total_input_tokens": 1646, + "total_output_tokens": 300118, + "total_cache_read_tokens": 20311882, + "total_cost_usd": 22.3458, + "tiers": { + "progressive": { + "total": 129, + "passed": 31, + "duration_s": 10101.7, + "pass_rate": 24.0 + } + }, + "tests": [ + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[import_floorplan_L1]", + "passed": false, + "duration_s": 120.1, + 
"tier": "progressive", + "attempt": 1, + "num_turns": 0, + "cost_usd": 0.0, + "duration_ms": 0, + "input_tokens": 0, + "output_tokens": 0, + "cache_read_tokens": 0, + "tool_calls": [ + "get_skill", + "list_skills" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "AskUserQuestion", + "Glob", + "mcp__openstudio__search", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ListMcpResourcesTool", + "ToolSearch", + "Glob", + "Read", + "Grep" + ], + "toolsearch_count": 9, + "is_timeout": true, + "code_mode_active": true, + "code_executions": 2, + "failure_mode": "timeout" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[import_floorplan_L2]", + "passed": true, + "duration_s": 50.2, + "tier": "progressive", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.1176363, + "duration_ms": 48096, + "input_tokens": 10, + "output_tokens": 2514, + "cache_read_tokens": 100571, + "tool_calls": [ + "import_floorspacejs" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__get_schema", + "mcp__openstudio__execute", + "mcp__openstudio__search", + "ToolSearch" + ], + "toolsearch_count": 2, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 1 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[import_floorplan_L3]", + "passed": true, + "duration_s": 96.5, + "tier": "progressive", + "attempt": 1, + "num_turns": 8, + "cost_usd": 0.15547335, + "duration_ms": 94262, + "input_tokens": 16, + "output_tokens": 4859, + "cache_read_tokens": 134197, + "tool_calls": [ + "import_floorspacejs" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "mcp__openstudio__search", + "ToolSearch", + "mcp__openstudio__get_schema", + "ToolSearch" + ], + "toolsearch_count": 4, + 
"is_timeout": false, + "code_mode_active": true, + "code_executions": 1 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[add_hvac_L1]", + "passed": false, + "duration_s": 68.9, + "tier": "progressive", + "attempt": 1, + "num_turns": 10, + "cost_usd": 0.1522026, + "duration_ms": 66607, + "input_tokens": 16, + "output_tokens": 3549, + "cache_read_tokens": 156007, + "tool_calls": [ + "load_osm_model", + "load_osm_model" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "Skill", + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "mcp__openstudio__get_schema", + "mcp__openstudio__search", + "ToolSearch", + "mcp__openstudio__execute" + ], + "toolsearch_count": 3, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 2, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[add_hvac_L2]", + "passed": false, + "duration_s": 120.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 0, + "cost_usd": 0.0, + "duration_ms": 0, + "input_tokens": 0, + "output_tokens": 0, + "cache_read_tokens": 0, + "tool_calls": [ + "load_osm_model", + "load_osm_model" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__search", + "mcp__openstudio__search", + "ToolSearch", + "mcp__openstudio__get_schema", + "mcp__openstudio__get_schema", + "Agent", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__get_schema", + "ToolSearch", + "mcp__openstudio__search", + "Bash", + "Bash", + "Bash", + "Bash", + "Bash", + "Bash" + ], + "toolsearch_count": 10, + "is_timeout": true, + "code_mode_active": true, + "code_executions": 2, + "failure_mode": "timeout" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[add_hvac_L3]", + "passed": false, + "duration_s": 95.6, + "tier": 
"progressive", + "attempt": 1, + "num_turns": 9, + "cost_usd": 0.45250574999999993, + "duration_ms": 93408, + "input_tokens": 15, + "output_tokens": 1617, + "cache_read_tokens": 235177, + "tool_calls": [ + "load_osm_model", + "load_osm_model", + "load_osm_model" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "mcp__openstudio__get_schema", + "mcp__openstudio__search", + "mcp__openstudio__search", + "mcp__openstudio__search", + "ToolSearch", + "Agent", + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "mcp__openstudio__execute", + "mcp__openstudio__search", + "mcp__openstudio__get_schema", + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "mcp__openstudio__get_schema", + "Bash" + ], + "toolsearch_count": 6, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 4, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[view_model_L1]", + "passed": false, + "duration_s": 107.7, + "tier": "progressive", + "attempt": 1, + "num_turns": 15, + "cost_usd": 0.24285420000000005, + "duration_ms": 105336, + "input_tokens": 22, + "output_tokens": 5262, + "cache_read_tokens": 287369, + "tool_calls": [ + "load_osm_model", + "load_osm_model" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__search", + "mcp__openstudio__get_schema", + "ToolSearch", + "ToolSearch", + "ToolSearch", + "Bash", + "Bash", + "mcp__openstudio__execute", + "Bash" + ], + "toolsearch_count": 6, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 2, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[view_model_L2]", + "passed": false, + "duration_s": 120.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 0, + "cost_usd": 0.0, + "duration_ms": 0, + "input_tokens": 
0, + "output_tokens": 0, + "cache_read_tokens": 0, + "tool_calls": [ + "load_osm_model", + "load_osm_model" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__search", + "mcp__openstudio__get_schema", + "ToolSearch", + "mcp__openstudio__get_schema", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__execute", + "ListMcpResourcesTool", + "Glob", + "Glob", + "Grep", + "Grep", + "Grep", + "Read", + "Grep", + "Grep", + "Read", + "Bash", + "Bash" + ], + "toolsearch_count": 8, + "is_timeout": true, + "code_mode_active": true, + "code_executions": 2, + "failure_mode": "timeout" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[view_model_L3]", + "passed": false, + "duration_s": 93.6, + "tier": "progressive", + "attempt": 1, + "num_turns": 9, + "cost_usd": 0.3816921, + "duration_ms": 91500, + "input_tokens": 18, + "output_tokens": 3032, + "cache_read_tokens": 166927, + "tool_calls": [ + "load_osm_model", + "load_osm_model" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "mcp__openstudio__search", + "ToolSearch", + "ToolSearch", + "ToolSearch", + "Agent", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__get_schema", + "ToolSearch", + "ListMcpResourcesTool", + "ToolSearch" + ], + "toolsearch_count": 11, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 2, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[set_weather_L1]", + "passed": false, + "duration_s": 120.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 0, + "cost_usd": 0.0, + "duration_ms": 0, + "input_tokens": 0, + "output_tokens": 0, + "cache_read_tokens": 0, + "tool_calls": [ + "load_osm_model" 
+ ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "mcp__openstudio__search", + "Glob", + "Glob", + "Glob", + "Grep", + "Read", + "Read", + "Glob", + "Read", + "Bash", + "Glob", + "Bash", + "Bash", + "Glob", + "Read" + ], + "toolsearch_count": 1, + "is_timeout": true, + "code_mode_active": true, + "code_executions": 1, + "failure_mode": "timeout" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[set_weather_L2]", + "passed": false, + "duration_s": 120.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 0, + "cost_usd": 0.0, + "duration_ms": 0, + "input_tokens": 0, + "output_tokens": 0, + "cache_read_tokens": 0, + "tool_calls": [ + "load_osm_model", + "load_osm_model" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__search", + "mcp__openstudio__get_schema", + "ToolSearch", + "Agent", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__search", + "ToolSearch", + "ToolSearch", + "Bash" + ], + "toolsearch_count": 11, + "is_timeout": true, + "code_mode_active": true, + "code_executions": 2, + "failure_mode": "timeout" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[set_weather_L3]", + "passed": true, + "duration_s": 120.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 0, + "cost_usd": 0.0, + "duration_ms": 0, + "input_tokens": 0, + "output_tokens": 0, + "cache_read_tokens": 0, + "tool_calls": [ + "load_osm_model", + "load_osm_model", + "change_building_location" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__search", + "mcp__openstudio__search", + "ToolSearch", + "ToolSearch", + "ToolSearch", + "ToolSearch", + 
"ToolSearch", + "mcp__openstudio__execute", + "Agent" + ], + "toolsearch_count": 7, + "is_timeout": true, + "code_mode_active": true, + "code_executions": 2 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[run_qaqc_L1]", + "passed": false, + "duration_s": 120.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 0, + "cost_usd": 0.0, + "duration_ms": 0, + "input_tokens": 0, + "output_tokens": 0, + "cache_read_tokens": 0, + "tool_calls": [ + "load_osm_model", + "load_osm_model" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "Skill", + "mcp__openstudio__execute", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__get_schema", + "Agent", + "ToolSearch", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "mcp__openstudio__get_schema", + "mcp__openstudio__search", + "ToolSearch", + "Read", + "Read", + "ToolSearch", + "ToolSearch", + "Bash", + "Bash", + "Bash", + "Bash", + "Bash", + "Glob", + "Bash", + "Bash", + "Bash" + ], + "toolsearch_count": 9, + "is_timeout": true, + "code_mode_active": true, + "code_executions": 2, + "failure_mode": "timeout" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[run_qaqc_L2]", + "passed": false, + "duration_s": 120.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 0, + "cost_usd": 0.0, + "duration_ms": 0, + "input_tokens": 0, + "output_tokens": 0, + "cache_read_tokens": 0, + "tool_calls": [ + "load_osm_model", + "load_osm_model" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "Skill", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__search", + "mcp__openstudio__get_schema", + "Agent", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "mcp__openstudio__get_schema", + "ToolSearch", + "mcp__openstudio__search", + "Bash" + ], + "toolsearch_count": 7, + "is_timeout": true, + "code_mode_active": true, + 
"code_executions": 2, + "failure_mode": "timeout" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[run_qaqc_L3]", + "passed": false, + "duration_s": 59.6, + "tier": "progressive", + "attempt": 1, + "num_turns": 12, + "cost_usd": 0.19914359999999998, + "duration_ms": 57434, + "input_tokens": 16, + "output_tokens": 2950, + "cache_read_tokens": 207727, + "tool_calls": [ + "load_osm_model" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__search", + "mcp__openstudio__get_schema", + "mcp__openstudio__search", + "ToolSearch" + ], + "toolsearch_count": 6, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 1, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[create_building_L1]", + "passed": false, + "duration_s": 120.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 0, + "cost_usd": 0.0, + "duration_ms": 0, + "input_tokens": 0, + "output_tokens": 0, + "cache_read_tokens": 0, + "tool_calls": [ + "list_skills", + "list_weather_files" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "mcp__openstudio__search", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__search", + "mcp__openstudio__get_schema", + "mcp__openstudio__search", + "ToolSearch", + "ToolSearch", + "ToolSearch", + "ToolSearch", + "ToolSearch", + "Agent", + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__search", + "mcp__openstudio__search" + ], + "toolsearch_count": 11, + "is_timeout": true, + "code_mode_active": true, + "code_executions": 3, + "failure_mode": "timeout" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[create_building_L2]", 
+ "passed": true, + "duration_s": 54.5, + "tier": "progressive", + "attempt": 1, + "num_turns": 7, + "cost_usd": 0.1285482, + "duration_ms": 52306, + "input_tokens": 13, + "output_tokens": 2970, + "cache_read_tokens": 121314, + "tool_calls": [ + "create_new_building" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__execute", + "ToolSearch", + "mcp__openstudio__get_schema", + "ToolSearch" + ], + "toolsearch_count": 3, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 1 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[create_building_L3]", + "passed": true, + "duration_s": 78.8, + "tier": "progressive", + "attempt": 1, + "num_turns": 8, + "cost_usd": 0.21664290000000003, + "duration_ms": 76601, + "input_tokens": 14, + "output_tokens": 4309, + "cache_read_tokens": 168328, + "tool_calls": [ + "create_bar_building" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__get_schema", + "mcp__openstudio__execute", + "mcp__openstudio__search", + "ToolSearch", + "ToolSearch", + "ToolSearch" + ], + "toolsearch_count": 4, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 1 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[add_pv_L1]", + "passed": false, + "duration_s": 120.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 0, + "cost_usd": 0.0, + "duration_ms": 0, + "input_tokens": 0, + "output_tokens": 0, + "cache_read_tokens": 0, + "tool_calls": [ + "load_osm_model", + "load_osm_model" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__execute", + "mcp__openstudio__get_schema", + "mcp__openstudio__get_schema", + "ToolSearch", + "Agent", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__search", + "ToolSearch", + "ToolSearch", + "ToolSearch", 
+ "mcp__openstudio__get_schema" + ], + "toolsearch_count": 8, + "is_timeout": true, + "code_mode_active": true, + "code_executions": 2, + "failure_mode": "timeout" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[add_pv_L2]", + "passed": false, + "duration_s": 120.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 0, + "cost_usd": 0.0, + "duration_ms": 0, + "input_tokens": 0, + "output_tokens": 0, + "cache_read_tokens": 0, + "tool_calls": [ + "load_osm_model", + "load_osm_model" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "mcp__openstudio__search", + "mcp__openstudio__get_schema", + "ToolSearch", + "mcp__openstudio__execute", + "mcp__openstudio__get_schema", + "Agent", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__search", + "ToolSearch", + "mcp__openstudio__get_schema", + "ToolSearch", + "Bash", + "Bash", + "Bash", + "Bash", + "Glob", + "Grep" + ], + "toolsearch_count": 7, + "is_timeout": true, + "code_mode_active": true, + "code_executions": 3, + "failure_mode": "timeout" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[add_pv_L3]", + "passed": false, + "duration_s": 56.3, + "tier": "progressive", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.14727825, + "duration_ms": 54062, + "input_tokens": 10, + "output_tokens": 2733, + "cache_read_tokens": 106340, + "tool_calls": [ + "load_osm_model", + "load_osm_model" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__get_schema", + "mcp__openstudio__execute", + "mcp__openstudio__search", + "mcp__openstudio__execute" + ], + "toolsearch_count": 1, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 2, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[thermostat_L1]", + "passed": false, + "duration_s": 120.1, + "tier": 
"progressive", + "attempt": 1, + "num_turns": 0, + "cost_usd": 0.0, + "duration_ms": 0, + "input_tokens": 0, + "output_tokens": 0, + "cache_read_tokens": 0, + "tool_calls": [ + "load_osm_model" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__search", + "mcp__openstudio__get_schema", + "mcp__openstudio__get_schema", + "mcp__openstudio__get_schema", + "mcp__openstudio__get_schema", + "ToolSearch", + "ToolSearch", + "ToolSearch", + "Glob", + "Read", + "Grep", + "Glob", + "Glob", + "Grep", + "Bash" + ], + "toolsearch_count": 5, + "is_timeout": true, + "code_mode_active": true, + "code_executions": 1, + "failure_mode": "timeout" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[thermostat_L2]", + "passed": false, + "duration_s": 50.6, + "tier": "progressive", + "attempt": 1, + "num_turns": 8, + "cost_usd": 0.1299477, + "duration_ms": 48492, + "input_tokens": 15, + "output_tokens": 2795, + "cache_read_tokens": 120609, + "tool_calls": [ + "load_osm_model" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "ToolSearch", + "ToolSearch" + ], + "toolsearch_count": 5, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 1, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[thermostat_L3]", + "passed": false, + "duration_s": 79.7, + "tier": "progressive", + "attempt": 1, + "num_turns": 10, + "cost_usd": 0.21227069999999998, + "duration_ms": 73834, + "input_tokens": 18, + "output_tokens": 3890, + "cache_read_tokens": 209214, + "tool_calls": [ + "load_osm_model", + "load_osm_model" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "mcp__openstudio__get_schema", + "mcp__openstudio__search", + 
"mcp__openstudio__search", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__execute" + ], + "toolsearch_count": 4, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 2, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[list_spaces_L1]", + "passed": false, + "duration_s": 56.4, + "tier": "progressive", + "attempt": 1, + "num_turns": 10, + "cost_usd": 0.17030325000000002, + "duration_ms": 54315, + "input_tokens": 16, + "output_tokens": 2991, + "cache_read_tokens": 167105, + "tool_calls": [ + "load_osm_model" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "mcp__openstudio__get_schema", + "mcp__openstudio__search", + "mcp__openstudio__search", + "mcp__openstudio__get_schema", + "mcp__openstudio__search", + "ToolSearch" + ], + "toolsearch_count": 3, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 1, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[list_spaces_L2]", + "passed": true, + "duration_s": 98.7, + "tier": "progressive", + "attempt": 1, + "num_turns": 12, + "cost_usd": 0.17683635, + "duration_ms": 96487, + "input_tokens": 19, + "output_tokens": 4172, + "cache_read_tokens": 197002, + "tool_calls": [ + "load_osm_model", + "load_osm_model", + "list_spaces" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__search", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__get_schema", + "mcp__openstudio__execute", + "ToolSearch" + ], + "toolsearch_count": 6, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 2 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[list_spaces_L3]", + "passed": false, + "duration_s": 86.4, + "tier": "progressive", + "attempt": 1, + "num_turns": 8, + 
"cost_usd": 0.3117852, + "duration_ms": 84116, + "input_tokens": 14, + "output_tokens": 1316, + "cache_read_tokens": 132892, + "tool_calls": [ + "load_osm_model", + "load_osm_model" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__get_schema", + "Agent", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "mcp__openstudio__search", + "ToolSearch", + "mcp__openstudio__get_schema", + "Bash", + "Bash" + ], + "toolsearch_count": 7, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 2, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[schedules_L1]", + "passed": false, + "duration_s": 120.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 0, + "cost_usd": 0.0, + "duration_ms": 0, + "input_tokens": 0, + "output_tokens": 0, + "cache_read_tokens": 0, + "tool_calls": [ + "load_osm_model", + "load_osm_model" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__search", + "mcp__openstudio__search", + "ToolSearch", + "mcp__openstudio__get_schema", + "ToolSearch", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__execute" + ], + "toolsearch_count": 7, + "is_timeout": true, + "code_mode_active": true, + "code_executions": 2, + "failure_mode": "timeout" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[schedules_L2]", + "passed": false, + "duration_s": 120.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 0, + "cost_usd": 0.0, + "duration_ms": 0, + "input_tokens": 0, + "output_tokens": 0, + "cache_read_tokens": 0, + "tool_calls": [ + "load_osm_model", + "load_osm_model" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + 
"mcp__openstudio__search", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__get_schema", + "mcp__openstudio__get_schema", + "mcp__openstudio__get_schema", + "ToolSearch", + "mcp__openstudio__get_schema", + "Agent", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "mcp__openstudio__search", + "ToolSearch", + "mcp__openstudio__get_schema", + "Bash", + "Bash", + "Bash", + "Bash", + "Bash", + "Bash", + "Bash", + "Glob", + "Grep", + "Bash", + "Bash", + "Grep", + "Read" + ], + "toolsearch_count": 9, + "is_timeout": true, + "code_mode_active": true, + "code_executions": 2, + "failure_mode": "timeout" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[schedules_L3]", + "passed": true, + "duration_s": 120.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 0, + "cost_usd": 0.0, + "duration_ms": 0, + "input_tokens": 0, + "output_tokens": 0, + "cache_read_tokens": 0, + "tool_calls": [ + "load_osm_model", + "list_model_objects", + "load_osm_model", + "list_model_objects" + ], + "num_tool_calls": 4, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__search", + "mcp__openstudio__get_schema", + "mcp__openstudio__get_schema", + "mcp__openstudio__search", + "mcp__openstudio__get_schema", + "mcp__openstudio__get_schema", + "mcp__openstudio__get_schema", + "Agent", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "mcp__openstudio__search", + "Bash", + "mcp__openstudio__search", + "mcp__openstudio__get_schema", + "mcp__openstudio__get_schema", + "mcp__openstudio__get_schema", + "Bash", + "Bash", + "Bash", + "Glob", + "Grep", + "Bash", + "Read", + "Read", + "Read" + ], + "toolsearch_count": 7, + "is_timeout": true, + "code_mode_active": true, + "code_executions": 2 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[inspect_component_L1]", + "passed": false, 
+ "duration_s": 83.2, + "tier": "progressive", + "attempt": 1, + "num_turns": 10, + "cost_usd": 0.33607560000000003, + "duration_ms": 80703, + "input_tokens": 19, + "output_tokens": 1501, + "cache_read_tokens": 214087, + "tool_calls": [ + "load_osm_model", + "load_osm_model" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__search", + "ToolSearch", + "mcp__openstudio__get_schema", + "Agent", + "ToolSearch", + "mcp__openstudio__execute", + "mcp__openstudio__get_schema", + "ToolSearch", + "mcp__openstudio__execute", + "Bash", + "Bash", + "Bash" + ], + "toolsearch_count": 6, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 3, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[inspect_component_L2]", + "passed": false, + "duration_s": 120.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 0, + "cost_usd": 0.0, + "duration_ms": 0, + "input_tokens": 0, + "output_tokens": 0, + "cache_read_tokens": 0, + "tool_calls": [ + "load_osm_model", + "load_osm_model", + "load_osm_model" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__search", + "mcp__openstudio__get_schema", + "Agent", + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "mcp__openstudio__get_schema", + "ToolSearch", + "mcp__openstudio__get_schema", + "mcp__openstudio__execute", + "mcp__openstudio__search", + "Bash", + "Bash", + "Bash", + "Bash", + "Bash", + "Grep", + "Grep", + "Grep", + "Grep", + "Write", + "Bash", + "Bash", + "Write", + "Bash", + "Bash", + "Bash", + "Bash", + "Glob", + "Read" + ], + "toolsearch_count": 7, + "is_timeout": true, + "code_mode_active": true, + "code_executions": 4, + "failure_mode": "timeout" + 
}, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[inspect_component_L3]", + "passed": false, + "duration_s": 90.3, + "tier": "progressive", + "attempt": 1, + "num_turns": 11, + "cost_usd": 0.30140235000000004, + "duration_ms": 88005, + "input_tokens": 18, + "output_tokens": 4359, + "cache_read_tokens": 264782, + "tool_calls": [ + "load_osm_model", + "load_osm_model" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__search", + "mcp__openstudio__get_schema", + "ToolSearch", + "mcp__openstudio__get_schema", + "ToolSearch", + "mcp__openstudio__load_osm_model" + ], + "toolsearch_count": 4, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 1, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[modify_component_L1]", + "passed": false, + "duration_s": 50.6, + "tier": "progressive", + "attempt": 1, + "num_turns": 8, + "cost_usd": 0.19658565000000003, + "duration_ms": 48529, + "input_tokens": 13, + "output_tokens": 2481, + "cache_read_tokens": 179893, + "tool_calls": [ + "load_osm_model", + "load_osm_model" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__search", + "mcp__openstudio__execute", + "mcp__openstudio__get_schema", + "ToolSearch", + "mcp__openstudio__execute" + ], + "toolsearch_count": 2, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 2, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[modify_component_L2]", + "passed": false, + "duration_s": 120.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 0, + "cost_usd": 0.0, + "duration_ms": 0, + "input_tokens": 0, + "output_tokens": 0, + "cache_read_tokens": 0, + "tool_calls": [ + "load_osm_model", + "load_osm_model", + "load_osm_model", + "list_model_objects" + 
], + "num_tool_calls": 4, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__search", + "ToolSearch", + "mcp__openstudio__get_schema", + "Agent", + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__execute", + "Bash", + "Bash", + "Bash", + "Bash", + "Bash", + "Bash", + "Bash", + "Read", + "Bash", + "Bash", + "Read", + "Bash" + ], + "toolsearch_count": 7, + "is_timeout": true, + "code_mode_active": true, + "code_executions": 4, + "failure_mode": "timeout" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[modify_component_L3]", + "passed": false, + "duration_s": 89.7, + "tier": "progressive", + "attempt": 1, + "num_turns": 15, + "cost_usd": 0.2044149, + "duration_ms": 87411, + "input_tokens": 21, + "output_tokens": 4831, + "cache_read_tokens": 242198, + "tool_calls": [ + "load_osm_model", + "load_osm_model", + "load_osm_model", + "list_model_objects" + ], + "num_tool_calls": 4, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__search", + "mcp__openstudio__get_schema", + "mcp__openstudio__get_schema", + "ToolSearch", + "mcp__openstudio__get_schema", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch" + ], + "toolsearch_count": 6, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 2, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[list_dynamic_type_L1]", + "passed": false, + "duration_s": 106.2, + "tier": "progressive", + "attempt": 1, + "num_turns": 10, + "cost_usd": 0.2786322, + "duration_ms": 103617, + "input_tokens": 18, + "output_tokens": 4785, + "cache_read_tokens": 171710, + "tool_calls": [ + "load_osm_model", + 
"load_osm_model" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__search", + "ToolSearch", + "mcp__openstudio__get_schema", + "Agent", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch" + ], + "toolsearch_count": 7, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 2, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[list_dynamic_type_L2]", + "passed": true, + "duration_s": 120.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 0, + "cost_usd": 0.0, + "duration_ms": 0, + "input_tokens": 0, + "output_tokens": 0, + "cache_read_tokens": 0, + "tool_calls": [ + "load_osm_model", + "load_osm_model", + "list_model_objects", + "load_osm_model", + "list_model_objects" + ], + "num_tool_calls": 5, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__search", + "mcp__openstudio__get_schema", + "Agent", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__search", + "mcp__openstudio__get_schema", + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "ToolSearch" + ], + "toolsearch_count": 11, + "is_timeout": true, + "code_mode_active": true, + "code_executions": 3 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[list_dynamic_type_L3]", + "passed": true, + "duration_s": 55.6, + "tier": "progressive", + "attempt": 1, + "num_turns": 9, + "cost_usd": 0.13239435, + "duration_ms": 53277, + "input_tokens": 16, + "output_tokens": 3078, + "cache_read_tokens": 135267, + "tool_calls": [ + "load_osm_model", + "list_model_objects" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + 
"mcp__openstudio__execute", + "ToolSearch", + "mcp__openstudio__search", + "ToolSearch", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__get_schema" + ], + "toolsearch_count": 5, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 1 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[floor_area_L1]", + "passed": true, + "duration_s": 109.9, + "tier": "progressive", + "attempt": 1, + "num_turns": 8, + "cost_usd": 0.44586495000000004, + "duration_ms": 107714, + "input_tokens": 13, + "output_tokens": 1034, + "cache_read_tokens": 148279, + "tool_calls": [ + "load_osm_model", + "load_osm_model", + "get_building_info" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__search", + "mcp__openstudio__get_schema", + "Agent", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__search", + "ToolSearch", + "Bash", + "Bash", + "Glob", + "Bash", + "Bash", + "Bash", + "Bash", + "Bash", + "Bash", + "Bash", + "Grep", + "Grep", + "Bash", + "Bash", + "Bash", + "Bash", + "Bash" + ], + "toolsearch_count": 6, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 2 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[floor_area_L2]", + "passed": true, + "duration_s": 96.6, + "tier": "progressive", + "attempt": 1, + "num_turns": 9, + "cost_usd": 0.4157049, + "duration_ms": 94225, + "input_tokens": 18, + "output_tokens": 1065, + "cache_read_tokens": 165730, + "tool_calls": [ + "load_osm_model", + "load_osm_model", + "get_building_info" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__search", + "ToolSearch", + "mcp__openstudio__get_schema", + "Agent", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__execute", + 
"ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__search", + "ToolSearch", + "Bash", + "Bash", + "Bash", + "Bash", + "Bash", + "Bash", + "Bash" + ], + "toolsearch_count": 7, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 2 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[floor_area_L3]", + "passed": false, + "duration_s": 68.9, + "tier": "progressive", + "attempt": 1, + "num_turns": 9, + "cost_usd": 0.1698897, + "duration_ms": 66785, + "input_tokens": 19, + "output_tokens": 3720, + "cache_read_tokens": 171234, + "tool_calls": [ + "load_osm_model" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__get_schema" + ], + "toolsearch_count": 5, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 1, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[materials_L1]", + "passed": false, + "duration_s": 81.9, + "tier": "progressive", + "attempt": 1, + "num_turns": 14, + "cost_usd": 0.21070994999999998, + "duration_ms": 79449, + "input_tokens": 20, + "output_tokens": 4154, + "cache_read_tokens": 208329, + "tool_calls": [ + "load_osm_model", + "load_osm_model", + "load_osm_model" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__search", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__search", + "ToolSearch", + "mcp__openstudio__get_schema", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__execute" + ], + "toolsearch_count": 5, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 2, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[materials_L2]", + "passed": true, + 
"duration_s": 110.5, + "tier": "progressive", + "attempt": 1, + "num_turns": 13, + "cost_usd": 0.3383051999999999, + "duration_ms": 108299, + "input_tokens": 22, + "output_tokens": 3374, + "cache_read_tokens": 215129, + "tool_calls": [ + "load_osm_model", + "load_osm_model", + "list_materials" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__search", + "ToolSearch", + "mcp__openstudio__get_schema", + "mcp__openstudio__get_schema", + "ToolSearch", + "ToolSearch", + "Agent", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "mcp__openstudio__search", + "Bash" + ], + "toolsearch_count": 9, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 2 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[materials_L3]", + "passed": false, + "duration_s": 118.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 10, + "cost_usd": 0.2068455, + "duration_ms": 116003, + "input_tokens": 20, + "output_tokens": 5958, + "cache_read_tokens": 182460, + "tool_calls": [ + "load_osm_model" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__get_schema", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__get_schema" + ], + "toolsearch_count": 5, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 1, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[thermal_zones_L1]", + "passed": true, + "duration_s": 63.2, + "tier": "progressive", + "attempt": 1, + "num_turns": 11, + "cost_usd": 0.1549779, + "duration_ms": 61054, + "input_tokens": 21, + "output_tokens": 3081, + "cache_read_tokens": 193608, + "tool_calls": [ + "load_osm_model", + "load_osm_model", + "list_thermal_zones" + ], + "num_tool_calls": 
3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "mcp__openstudio__search", + "ToolSearch", + "mcp__openstudio__get_schema", + "ToolSearch", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__execute" + ], + "toolsearch_count": 6, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 2 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[thermal_zones_L2]", + "passed": false, + "duration_s": 120.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 0, + "cost_usd": 0.0, + "duration_ms": 0, + "input_tokens": 0, + "output_tokens": 0, + "cache_read_tokens": 0, + "tool_calls": [ + "load_osm_model" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__search", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__search", + "ToolSearch" + ], + "toolsearch_count": 5, + "is_timeout": true, + "code_mode_active": true, + "code_executions": 1, + "failure_mode": "timeout" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[thermal_zones_L3]", + "passed": false, + "duration_s": 68.0, + "tier": "progressive", + "attempt": 1, + "num_turns": 10, + "cost_usd": 0.2076381, + "duration_ms": 65883, + "input_tokens": 17, + "output_tokens": 2946, + "cache_read_tokens": 213307, + "tool_calls": [ + "load_osm_model" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__search", + "ToolSearch", + "mcp__openstudio__get_schema", + "ToolSearch", + "ToolSearch" + ], + "toolsearch_count": 5, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 1, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[subsurfaces_L1]", + "passed": false, + "duration_s": 78.3, + "tier": "progressive", + 
"attempt": 1, + "num_turns": 8, + "cost_usd": 0.3754559999999999, + "duration_ms": 76084, + "input_tokens": 13, + "output_tokens": 1002, + "cache_read_tokens": 139306, + "tool_calls": [ + "load_osm_model", + "load_osm_model" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__search", + "mcp__openstudio__get_schema", + "Agent", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "mcp__openstudio__search", + "ToolSearch", + "mcp__openstudio__get_schema", + "Bash", + "Bash", + "Bash", + "Bash" + ], + "toolsearch_count": 6, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 2, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[subsurfaces_L2]", + "passed": false, + "duration_s": 59.9, + "tier": "progressive", + "attempt": 1, + "num_turns": 0, + "cost_usd": 0.0, + "duration_ms": 0, + "input_tokens": 0, + "output_tokens": 0, + "cache_read_tokens": 0, + "tool_calls": [], + "num_tool_calls": 0, + "all_tool_calls": [], + "toolsearch_count": 0, + "is_timeout": false, + "code_mode_active": false, + "code_executions": 0, + "failure_mode": "no_mcp_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[subsurfaces_L3]", + "passed": false, + "duration_s": 76.0, + "tier": "progressive", + "attempt": 1, + "num_turns": 10, + "cost_usd": 0.167361, + "duration_ms": 73912, + "input_tokens": 20, + "output_tokens": 3596, + "cache_read_tokens": 180170, + "tool_calls": [ + "load_osm_model", + "load_osm_model" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "ToolSearch", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__execute" + ], + "toolsearch_count": 6, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 2, + 
"failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[surface_details_L1]", + "passed": false, + "duration_s": 120.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 0, + "cost_usd": 0.0, + "duration_ms": 0, + "input_tokens": 0, + "output_tokens": 0, + "cache_read_tokens": 0, + "tool_calls": [ + "load_osm_model", + "load_osm_model" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__search", + "mcp__openstudio__get_schema", + "mcp__openstudio__get_schema", + "ToolSearch", + "ToolSearch", + "Agent", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__search", + "mcp__openstudio__search", + "mcp__openstudio__get_schema", + "ToolSearch", + "Bash", + "Bash" + ], + "toolsearch_count": 10, + "is_timeout": true, + "code_mode_active": true, + "code_executions": 2, + "failure_mode": "timeout" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[surface_details_L2]", + "passed": false, + "duration_s": 80.9, + "tier": "progressive", + "attempt": 1, + "num_turns": 10, + "cost_usd": 0.387804, + "duration_ms": 78760, + "input_tokens": 19, + "output_tokens": 1640, + "cache_read_tokens": 205657, + "tool_calls": [ + "load_osm_model", + "load_osm_model" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__search", + "ToolSearch", + "mcp__openstudio__get_schema", + "Agent", + "ToolSearch", + "mcp__openstudio__execute", + "mcp__openstudio__get_schema", + "ToolSearch", + "mcp__openstudio__execute", + "mcp__openstudio__search", + "Bash" + ], + "toolsearch_count": 6, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 3, + "failure_mode": "wrong_tool" + }, + 
{ + "test_id": "tests/llm/test_06_progressive.py::test_progressive[surface_details_L3]", + "passed": false, + "duration_s": 78.5, + "tier": "progressive", + "attempt": 1, + "num_turns": 11, + "cost_usd": 0.37623330000000005, + "duration_ms": 76375, + "input_tokens": 20, + "output_tokens": 2649, + "cache_read_tokens": 236073, + "tool_calls": [ + "load_osm_model", + "load_osm_model", + "load_osm_model" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__search", + "mcp__openstudio__get_schema", + "Agent", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch" + ], + "toolsearch_count": 8, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 3, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[run_simulation_L1]", + "passed": false, + "duration_s": 96.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 9, + "cost_usd": 0.17223599999999997, + "duration_ms": 93624, + "input_tokens": 14, + "output_tokens": 4897, + "cache_read_tokens": 140480, + "tool_calls": [ + "load_osm_model" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__get_schema" + ], + "toolsearch_count": 4, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 1, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[run_simulation_L2]", + "passed": false, + "duration_s": 85.6, + "tier": "progressive", + "attempt": 1, + "num_turns": 12, + "cost_usd": 0.2507709, + "duration_ms": 83574, + "input_tokens": 24, + "output_tokens": 4385, + "cache_read_tokens": 288538, 
+ "tool_calls": [ + "load_osm_model", + "load_osm_model" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__get_schema", + "mcp__openstudio__get_schema", + "ToolSearch", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__execute" + ], + "toolsearch_count": 6, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 2, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[run_simulation_L3]", + "passed": false, + "duration_s": 144.4, + "tier": "progressive", + "attempt": 1, + "num_turns": 9, + "cost_usd": 0.15699315, + "duration_ms": 142298, + "input_tokens": 15, + "output_tokens": 2412, + "cache_read_tokens": 174398, + "tool_calls": [ + "load_osm_model" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__get_schema", + "mcp__openstudio__search", + "ToolSearch" + ], + "toolsearch_count": 4, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 1, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[get_eui_L1]", + "passed": true, + "duration_s": 93.2, + "tier": "progressive", + "attempt": 1, + "num_turns": 10, + "cost_usd": 0.28026315, + "duration_ms": 91071, + "input_tokens": 22, + "output_tokens": 2714, + "cache_read_tokens": 173969, + "tool_calls": [ + "extract_summary_metrics", + "extract_summary_metrics", + "extract_end_use_breakdown" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "mcp__openstudio__get_schema", + "ToolSearch", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "Agent", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__execute", + "mcp__openstudio__execute", + "ToolSearch", + 
"mcp__openstudio__search", + "ToolSearch", + "Bash" + ], + "toolsearch_count": 9, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 3 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[get_eui_L2]", + "passed": true, + "duration_s": 257.7, + "tier": "progressive", + "attempt": 1, + "num_turns": 40, + "cost_usd": 0.7617760499999999, + "duration_ms": 255438, + "input_tokens": 51, + "output_tokens": 12779, + "cache_read_tokens": 1295331, + "tool_calls": [ + "extract_summary_metrics" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__get_schema", + "mcp__openstudio__get_schema", + "mcp__openstudio__search", + "mcp__openstudio__get_schema", + "ToolSearch", + "mcp__openstudio__get_schema", + "ToolSearch", + "mcp__openstudio__get_schema", + "ToolSearch", + "ToolSearch", + "ToolSearch", + "ListMcpResourcesTool", + "Bash", + "Bash", + "Bash", + "Bash", + "Bash", + "Bash", + "Bash", + "Bash", + "Bash", + "Glob", + "Glob", + "Read", + "Bash", + "Bash", + "Bash", + "Bash", + "Bash", + "Bash", + "Bash", + "Bash", + "Bash", + "Bash", + "Bash" + ], + "toolsearch_count": 8, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 1 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[get_eui_L3]", + "passed": true, + "duration_s": 99.3, + "tier": "progressive", + "attempt": 1, + "num_turns": 12, + "cost_usd": 0.3358731, + "duration_ms": 97110, + "input_tokens": 22, + "output_tokens": 3489, + "cache_read_tokens": 233774, + "tool_calls": [ + "extract_summary_metrics", + "extract_summary_metrics" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__get_schema", + "ToolSearch", + "ToolSearch", + "ToolSearch", + "Agent", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__execute", + 
"ToolSearch", + "mcp__openstudio__search", + "ToolSearch" + ], + "toolsearch_count": 10, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 2 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[end_use_breakdown_L1]", + "passed": true, + "duration_s": 50.6, + "tier": "progressive", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.19547504999999998, + "duration_ms": 48441, + "input_tokens": 12, + "output_tokens": 719, + "cache_read_tokens": 86919, + "tool_calls": [ + "extract_end_use_breakdown", + "extract_end_use_breakdown" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "mcp__openstudio__get_schema", + "Agent", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "mcp__openstudio__search", + "Bash" + ], + "toolsearch_count": 5, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 2 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[end_use_breakdown_L2]", + "passed": true, + "duration_s": 76.2, + "tier": "progressive", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.2444844, + "duration_ms": 73992, + "input_tokens": 12, + "output_tokens": 783, + "cache_read_tokens": 87796, + "tool_calls": [ + "extract_end_use_breakdown", + "extract_end_use_breakdown" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "mcp__openstudio__get_schema", + "Agent", + "Bash", + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "Bash", + "Bash", + "Glob", + "Grep", + "Bash" + ], + "toolsearch_count": 5, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 2 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[end_use_breakdown_L3]", + "passed": true, + "duration_s": 54.0, + "tier": "progressive", + "attempt": 1, + "num_turns": 8, + "cost_usd": 0.13377134999999998, + "duration_ms": 
51580, + "input_tokens": 16, + "output_tokens": 2369, + "cache_read_tokens": 136207, + "tool_calls": [ + "extract_end_use_breakdown" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__get_schema", + "ToolSearch", + "ToolSearch" + ], + "toolsearch_count": 5, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 1 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[hvac_sizing_L1]", + "passed": true, + "duration_s": 57.9, + "tier": "progressive", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.2433072, + "duration_ms": 55753, + "input_tokens": 12, + "output_tokens": 760, + "cache_read_tokens": 95853, + "tool_calls": [ + "extract_hvac_sizing", + "extract_hvac_sizing" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "mcp__openstudio__get_schema", + "Agent", + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "Bash", + "Bash", + "Glob", + "Grep", + "Bash" + ], + "toolsearch_count": 6, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 3 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[hvac_sizing_L2]", + "passed": true, + "duration_s": 58.2, + "tier": "progressive", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.16734795, + "duration_ms": 56058, + "input_tokens": 12, + "output_tokens": 791, + "cache_read_tokens": 95336, + "tool_calls": [ + "extract_hvac_sizing", + "extract_hvac_sizing" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "mcp__openstudio__get_schema", + "Agent", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "mcp__openstudio__search", + "Bash" + ], + "toolsearch_count": 5, + "is_timeout": false, + 
"code_mode_active": true, + "code_executions": 2 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[hvac_sizing_L3]", + "passed": true, + "duration_s": 135.4, + "tier": "progressive", + "attempt": 1, + "num_turns": 19, + "cost_usd": 0.32538749999999994, + "duration_ms": 133183, + "input_tokens": 29, + "output_tokens": 7085, + "cache_read_tokens": 443610, + "tool_calls": [ + "extract_hvac_sizing" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "ToolSearch", + "ToolSearch", + "ToolSearch", + "Bash", + "Bash", + "Bash", + "Bash", + "Bash", + "Bash", + "Bash", + "Bash", + "Bash", + "Bash" + ], + "toolsearch_count": 6, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 1 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[set_wwr_L1]", + "passed": false, + "duration_s": 89.6, + "tier": "progressive", + "attempt": 1, + "num_turns": 10, + "cost_usd": 0.3587286, + "duration_ms": 87034, + "input_tokens": 14, + "output_tokens": 1436, + "cache_read_tokens": 129459, + "tool_calls": [ + "load_osm_model", + "load_osm_model" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "Skill", + "mcp__openstudio__execute", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__search", + "mcp__openstudio__search", + "Agent", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__search", + "mcp__openstudio__get_schema", + "mcp__openstudio__search", + "Bash", + "Bash" + ], + "toolsearch_count": 6, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 2, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[set_wwr_L2]", + "passed": false, + "duration_s": 106.8, + "tier": "progressive", + "attempt": 1, + "num_turns": 13, + "cost_usd": 
0.21568379999999998, + "duration_ms": 104684, + "input_tokens": 18, + "output_tokens": 5969, + "cache_read_tokens": 191416, + "tool_calls": [ + "load_osm_model", + "load_osm_model" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__search", + "mcp__openstudio__search", + "mcp__openstudio__search", + "mcp__openstudio__get_schema", + "ToolSearch", + "mcp__openstudio__execute" + ], + "toolsearch_count": 5, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 2, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[set_wwr_L3]", + "passed": true, + "duration_s": 120.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 0, + "cost_usd": 0.0, + "duration_ms": 0, + "input_tokens": 0, + "output_tokens": 0, + "cache_read_tokens": 0, + "tool_calls": [ + "load_osm_model", + "load_osm_model", + "list_surfaces", + "set_window_to_wall_ratio", + "save_osm_model" + ], + "num_tool_calls": 5, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__search", + "ToolSearch", + "mcp__openstudio__get_schema", + "mcp__openstudio__get_schema", + "Agent", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__search", + "mcp__openstudio__search", + "Read", + "Grep", + "Grep", + "Glob", + "Grep", + "Grep", + "Bash", + "Grep", + "Bash" + ], + "toolsearch_count": 7, + "is_timeout": true, + "code_mode_active": true, + "code_executions": 2 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[replace_windows_L1]", + "passed": false, + "duration_s": 105.5, + "tier": "progressive", + "attempt": 1, + "num_turns": 11, + "cost_usd": 0.3375996, + "duration_ms": 103409, + "input_tokens": 13, + 
"output_tokens": 5485, + "cache_read_tokens": 230577, + "tool_calls": [ + "load_osm_model", + "load_osm_model" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__search", + "mcp__openstudio__execute", + "mcp__openstudio__search", + "mcp__openstudio__search", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__execute", + "ToolSearch" + ], + "toolsearch_count": 3, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 2, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[replace_windows_L2]", + "passed": false, + "duration_s": 120.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 0, + "cost_usd": 0.0, + "duration_ms": 0, + "input_tokens": 0, + "output_tokens": 0, + "cache_read_tokens": 0, + "tool_calls": [ + "load_osm_model", + "load_osm_model" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__search", + "mcp__openstudio__get_schema", + "mcp__openstudio__search", + "mcp__openstudio__search", + "mcp__openstudio__search", + "mcp__openstudio__search", + "mcp__openstudio__search", + "mcp__openstudio__search", + "mcp__openstudio__search", + "mcp__openstudio__search", + "mcp__openstudio__execute", + "ToolSearch", + "mcp__openstudio__get_schema", + "mcp__openstudio__get_schema", + "mcp__openstudio__execute", + "ToolSearch" + ], + "toolsearch_count": 3, + "is_timeout": true, + "code_mode_active": true, + "code_executions": 2, + "failure_mode": "timeout" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[replace_windows_L3]", + "passed": false, + "duration_s": 90.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 12, + "cost_usd": 0.21202695000000002, + "duration_ms": 87854, + "input_tokens": 20, + "output_tokens": 3663, + "cache_read_tokens": 248244, + "tool_calls": [ + "load_osm_model", + "load_osm_model" + ], + 
"num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__get_schema", + "mcp__openstudio__search", + "mcp__openstudio__search", + "mcp__openstudio__get_schema", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__execute" + ], + "toolsearch_count": 5, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 2, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[construction_details_L1]", + "passed": false, + "duration_s": 120.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 0, + "cost_usd": 0.0, + "duration_ms": 0, + "input_tokens": 0, + "output_tokens": 0, + "cache_read_tokens": 0, + "tool_calls": [ + "load_osm_model", + "load_osm_model" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__search", + "mcp__openstudio__get_schema", + "mcp__openstudio__get_schema", + "ToolSearch", + "ToolSearch", + "ToolSearch", + "ToolSearch", + "ToolSearch", + "ListMcpResourcesTool", + "Agent", + "ToolSearch", + "mcp__openstudio__execute", + "mcp__openstudio__get_schema", + "ToolSearch", + "mcp__openstudio__execute" + ], + "toolsearch_count": 9, + "is_timeout": true, + "code_mode_active": true, + "code_executions": 3, + "failure_mode": "timeout" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[construction_details_L2]", + "passed": false, + "duration_s": 120.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 0, + "cost_usd": 0.0, + "duration_ms": 0, + "input_tokens": 0, + "output_tokens": 0, + "cache_read_tokens": 0, + "tool_calls": [ + "load_osm_model", + "load_osm_model" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__search", + 
"mcp__openstudio__get_schema", + "mcp__openstudio__get_schema", + "ToolSearch", + "mcp__openstudio__execute", + "Agent", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__execute", + "mcp__openstudio__get_schema", + "mcp__openstudio__get_schema", + "mcp__openstudio__execute", + "Read", + "Bash", + "Bash", + "Bash", + "Glob", + "Bash", + "Bash" + ], + "toolsearch_count": 6, + "is_timeout": true, + "code_mode_active": true, + "code_executions": 4, + "failure_mode": "timeout" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[construction_details_L3]", + "passed": false, + "duration_s": 90.0, + "tier": "progressive", + "attempt": 1, + "num_turns": 11, + "cost_usd": 0.3163032, + "duration_ms": 87860, + "input_tokens": 20, + "output_tokens": 2117, + "cache_read_tokens": 217134, + "tool_calls": [ + "load_osm_model" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__search", + "ToolSearch", + "mcp__openstudio__get_schema", + "Agent", + "ToolSearch", + "Bash", + "Bash" + ], + "toolsearch_count": 6, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 1, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[check_loads_L1]", + "passed": false, + "duration_s": 120.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 0, + "cost_usd": 0.0, + "duration_ms": 0, + "input_tokens": 0, + "output_tokens": 0, + "cache_read_tokens": 0, + "tool_calls": [ + "load_osm_model", + "load_osm_model", + "load_osm_model" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__search", + "mcp__openstudio__get_schema", + "mcp__openstudio__get_schema", + "mcp__openstudio__get_schema", + "ToolSearch", + "mcp__openstudio__get_schema", + 
"mcp__openstudio__execute", + "ToolSearch", + "Agent", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__search", + "ToolSearch", + "mcp__openstudio__get_schema", + "Bash" + ], + "toolsearch_count": 10, + "is_timeout": true, + "code_mode_active": true, + "code_executions": 3, + "failure_mode": "timeout" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[check_loads_L2]", + "passed": false, + "duration_s": 120.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 0, + "cost_usd": 0.0, + "duration_ms": 0, + "input_tokens": 0, + "output_tokens": 0, + "cache_read_tokens": 0, + "tool_calls": [ + "load_osm_model", + "load_osm_model", + "load_osm_model" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__search", + "mcp__openstudio__get_schema", + "mcp__openstudio__get_schema", + "ToolSearch", + "mcp__openstudio__get_schema", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "Agent", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__get_schema", + "Bash", + "Bash", + "Bash", + "Bash" + ], + "toolsearch_count": 9, + "is_timeout": true, + "code_mode_active": true, + "code_executions": 2, + "failure_mode": "timeout" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[check_loads_L3]", + "passed": false, + "duration_s": 120.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 0, + "cost_usd": 0.0, + "duration_ms": 0, + "input_tokens": 0, + "output_tokens": 0, + "cache_read_tokens": 0, + "tool_calls": [ + "load_osm_model", + "load_osm_model" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + 
"mcp__openstudio__search", + "mcp__openstudio__get_schema", + "mcp__openstudio__get_schema", + "ToolSearch", + "mcp__openstudio__get_schema", + "Agent", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__get_schema", + "mcp__openstudio__search", + "ToolSearch", + "mcp__openstudio__search", + "Bash", + "ListMcpResourcesTool", + "Bash", + "Bash", + "Bash", + "Bash", + "Bash", + "Bash", + "Read" + ], + "toolsearch_count": 9, + "is_timeout": true, + "code_mode_active": true, + "code_executions": 2, + "failure_mode": "timeout" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[create_loads_L1]", + "passed": false, + "duration_s": 68.5, + "tier": "progressive", + "attempt": 1, + "num_turns": 10, + "cost_usd": 0.15885435000000003, + "duration_ms": 65809, + "input_tokens": 15, + "output_tokens": 3464, + "cache_read_tokens": 168327, + "tool_calls": [ + "load_osm_model", + "load_osm_model" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__search", + "mcp__openstudio__execute", + "mcp__openstudio__get_schema", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch" + ], + "toolsearch_count": 4, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 2, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[create_loads_L2]", + "passed": false, + "duration_s": 120.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 0, + "cost_usd": 0.0, + "duration_ms": 0, + "input_tokens": 0, + "output_tokens": 0, + "cache_read_tokens": 0, + "tool_calls": [ + "load_osm_model", + "list_spaces", + "load_osm_model" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__search", + "mcp__openstudio__execute", + "mcp__openstudio__get_schema", + "mcp__openstudio__get_schema", + "ToolSearch", + 
"ToolSearch", + "Bash", + "mcp__openstudio__execute", + "Bash", + "Bash", + "Grep", + "Grep", + "Grep", + "Grep" + ], + "toolsearch_count": 3, + "is_timeout": true, + "code_mode_active": true, + "code_executions": 2, + "failure_mode": "timeout" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[create_loads_L3]", + "passed": false, + "duration_s": 115.5, + "tier": "progressive", + "attempt": 1, + "num_turns": 11, + "cost_usd": 0.22084545, + "duration_ms": 113437, + "input_tokens": 20, + "output_tokens": 6125, + "cache_read_tokens": 191339, + "tool_calls": [ + "load_osm_model" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__search", + "ToolSearch", + "ToolSearch", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__get_schema" + ], + "toolsearch_count": 6, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 1, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[create_plant_loop_L1]", + "passed": false, + "duration_s": 118.4, + "tier": "progressive", + "attempt": 1, + "num_turns": 12, + "cost_usd": 0.2578353, + "duration_ms": 116316, + "input_tokens": 17, + "output_tokens": 5905, + "cache_read_tokens": 261681, + "tool_calls": [ + "load_osm_model", + "load_osm_model" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__execute", + "mcp__openstudio__get_schema", + "mcp__openstudio__get_schema", + "ToolSearch", + "mcp__openstudio__get_schema", + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch" + ], + "toolsearch_count": 5, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 2, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[create_plant_loop_L2]", + "passed": false, + "duration_s": 71.7, + "tier": 
"progressive", + "attempt": 1, + "num_turns": 7, + "cost_usd": 0.13893735000000002, + "duration_ms": 69626, + "input_tokens": 13, + "output_tokens": 3876, + "cache_read_tokens": 120082, + "tool_calls": [ + "load_osm_model" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__execute", + "ToolSearch", + "mcp__openstudio__get_schema", + "ToolSearch" + ], + "toolsearch_count": 3, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 1, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[create_plant_loop_L3]", + "passed": true, + "duration_s": 79.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 9, + "cost_usd": 0.1589352, + "duration_ms": 76924, + "input_tokens": 16, + "output_tokens": 4330, + "cache_read_tokens": 136649, + "tool_calls": [ + "load_osm_model", + "create_plant_loop" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__search", + "ToolSearch", + "mcp__openstudio__get_schema", + "ToolSearch" + ], + "toolsearch_count": 4, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 1 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[schedule_details_L1]", + "passed": false, + "duration_s": 84.3, + "tier": "progressive", + "attempt": 1, + "num_turns": 14, + "cost_usd": 0.18674789999999997, + "duration_ms": 82174, + "input_tokens": 21, + "output_tokens": 4666, + "cache_read_tokens": 199808, + "tool_calls": [ + "load_osm_model" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__search", + "mcp__openstudio__search", + "mcp__openstudio__get_schema", + "ToolSearch", + "ToolSearch", + "ToolSearch", + "ToolSearch", + "ListMcpResourcesTool", + "ToolSearch" + ], + 
"toolsearch_count": 7, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 1, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[schedule_details_L2]", + "passed": false, + "duration_s": 120.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 0, + "cost_usd": 0.0, + "duration_ms": 0, + "input_tokens": 0, + "output_tokens": 0, + "cache_read_tokens": 0, + "tool_calls": [ + "load_osm_model", + "load_osm_model", + "load_osm_model" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__search", + "mcp__openstudio__get_schema", + "mcp__openstudio__get_schema", + "mcp__openstudio__get_schema", + "mcp__openstudio__execute", + "Agent", + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "mcp__openstudio__execute", + "Read", + "Read", + "Read", + "Bash", + "Bash", + "Bash" + ], + "toolsearch_count": 5, + "is_timeout": true, + "code_mode_active": true, + "code_executions": 4, + "failure_mode": "timeout" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[schedule_details_L3]", + "passed": false, + "duration_s": 120.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 0, + "cost_usd": 0.0, + "duration_ms": 0, + "input_tokens": 0, + "output_tokens": 0, + "cache_read_tokens": 0, + "tool_calls": [ + "load_osm_model", + "load_osm_model" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__search", + "mcp__openstudio__get_schema", + "mcp__openstudio__search", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__execute" + ], + "toolsearch_count": 5, + "is_timeout": true, + "code_mode_active": true, + "code_executions": 2, + "failure_mode": "timeout" + }, + { + "test_id": 
"tests/llm/test_06_progressive.py::test_progressive[space_type_info_L1]", + "passed": false, + "duration_s": 78.3, + "tier": "progressive", + "attempt": 1, + "num_turns": 11, + "cost_usd": 0.26106704999999997, + "duration_ms": 75694, + "input_tokens": 18, + "output_tokens": 4127, + "cache_read_tokens": 253506, + "tool_calls": [ + "load_osm_model", + "load_osm_model" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__search", + "mcp__openstudio__search", + "mcp__openstudio__get_schema", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__execute" + ], + "toolsearch_count": 4, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 2, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[space_type_info_L2]", + "passed": false, + "duration_s": 120.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 0, + "cost_usd": 0.0, + "duration_ms": 0, + "input_tokens": 0, + "output_tokens": 0, + "cache_read_tokens": 0, + "tool_calls": [ + "load_osm_model", + "load_osm_model", + "list_model_objects" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__search", + "mcp__openstudio__get_schema", + "Agent", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "ToolSearch", + "ToolSearch" + ], + "toolsearch_count": 9, + "is_timeout": true, + "code_mode_active": true, + "code_executions": 2, + "failure_mode": "timeout" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[space_type_info_L3]", + "passed": false, + "duration_s": 68.7, + "tier": "progressive", + "attempt": 1, + "num_turns": 8, + "cost_usd": 0.14956875, + "duration_ms": 66634, + "input_tokens": 16, + "output_tokens": 3600, + "cache_read_tokens": 137665, + 
"tool_calls": [ + "load_osm_model" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "mcp__openstudio__get_schema", + "ToolSearch", + "ToolSearch", + "ToolSearch" + ], + "toolsearch_count": 5, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 1, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[set_run_period_L1]", + "passed": true, + "duration_s": 104.4, + "tier": "progressive", + "attempt": 1, + "num_turns": 11, + "cost_usd": 0.3013101, + "duration_ms": 102219, + "input_tokens": 16, + "output_tokens": 3305, + "cache_read_tokens": 174952, + "tool_calls": [ + "load_osm_model", + "load_osm_model", + "set_run_period", + "get_run_period" + ], + "num_tool_calls": 4, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__search", + "mcp__openstudio__execute", + "mcp__openstudio__get_schema", + "ToolSearch", + "mcp__openstudio__get_schema", + "ToolSearch", + "mcp__openstudio__get_schema", + "Agent", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__execute", + "mcp__openstudio__get_schema", + "mcp__openstudio__get_schema", + "ToolSearch", + "mcp__openstudio__execute", + "mcp__openstudio__search", + "mcp__openstudio__get_schema" + ], + "toolsearch_count": 6, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 3 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[set_run_period_L2]", + "passed": true, + "duration_s": 120.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 0, + "cost_usd": 0.0, + "duration_ms": 0, + "input_tokens": 0, + "output_tokens": 0, + "cache_read_tokens": 0, + "tool_calls": [ + "load_osm_model", + "load_osm_model", + "set_run_period", + "set_run_period" + ], + "num_tool_calls": 4, + "all_tool_calls": [ + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__search", + "mcp__openstudio__execute", + 
"ToolSearch", + "mcp__openstudio__get_schema", + "Agent", + "Bash", + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "Bash", + "Agent", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "mcp__openstudio__search", + "ToolSearch", + "ToolSearch", + "Agent", + "ToolSearch", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__execute", + "mcp__openstudio__search", + "ToolSearch" + ], + "toolsearch_count": 14, + "is_timeout": true, + "code_mode_active": true, + "code_executions": 4 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[set_run_period_L3]", + "passed": false, + "duration_s": 120.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 0, + "cost_usd": 0.0, + "duration_ms": 0, + "input_tokens": 0, + "output_tokens": 0, + "cache_read_tokens": 0, + "tool_calls": [ + "load_osm_model" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "mcp__openstudio__search", + "ToolSearch", + "mcp__openstudio__get_schema", + "mcp__openstudio__get_schema", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__get_schema" + ], + "toolsearch_count": 5, + "is_timeout": true, + "code_mode_active": true, + "code_executions": 1, + "failure_mode": "timeout" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[ideal_air_L1]", + "passed": true, + "duration_s": 120.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 0, + "cost_usd": 0.0, + "duration_ms": 0, + "input_tokens": 0, + "output_tokens": 0, + "cache_read_tokens": 0, + "tool_calls": [ + "load_osm_model", + "load_osm_model", + "enable_ideal_air_loads", + "load_osm_model" + ], + "num_tool_calls": 4, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__search", + "mcp__openstudio__get_schema", + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "Agent", + 
"ToolSearch", + "mcp__openstudio__execute", + "mcp__openstudio__get_schema", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__search", + "ToolSearch", + "Glob" + ], + "toolsearch_count": 9, + "is_timeout": true, + "code_mode_active": true, + "code_executions": 3 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[ideal_air_L2]", + "passed": true, + "duration_s": 49.2, + "tier": "progressive", + "attempt": 1, + "num_turns": 8, + "cost_usd": 0.20621355, + "duration_ms": 47103, + "input_tokens": 13, + "output_tokens": 2677, + "cache_read_tokens": 186886, + "tool_calls": [ + "load_osm_model", + "enable_ideal_air_loads" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__search", + "mcp__openstudio__execute", + "ToolSearch", + "mcp__openstudio__get_schema", + "ToolSearch" + ], + "toolsearch_count": 3, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 1 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[ideal_air_L3]", + "passed": false, + "duration_s": 82.4, + "tier": "progressive", + "attempt": 1, + "num_turns": 12, + "cost_usd": 0.25744575000000003, + "duration_ms": 80308, + "input_tokens": 17, + "output_tokens": 2625, + "cache_read_tokens": 198457, + "tool_calls": [ + "load_osm_model", + "load_osm_model" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__search", + "ToolSearch", + "mcp__openstudio__get_schema", + "mcp__openstudio__get_schema", + "mcp__openstudio__get_schema", + "Agent", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__search", + "ToolSearch", + "ToolSearch" + ], + "toolsearch_count": 9, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 2, + "failure_mode": "wrong_tool" + }, + { + 
"test_id": "tests/llm/test_06_progressive.py::test_progressive[save_model_L1]", + "passed": false, + "duration_s": 61.4, + "tier": "progressive", + "attempt": 1, + "num_turns": 11, + "cost_usd": 0.2806035, + "duration_ms": 59343, + "input_tokens": 18, + "output_tokens": 3266, + "cache_read_tokens": 219090, + "tool_calls": [ + "load_osm_model" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__search", + "mcp__openstudio__get_schema", + "ToolSearch", + "mcp__openstudio__get_schema", + "ToolSearch" + ], + "toolsearch_count": 5, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 1, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[save_model_L2]", + "passed": false, + "duration_s": 68.3, + "tier": "progressive", + "attempt": 1, + "num_turns": 10, + "cost_usd": 0.18438825, + "duration_ms": 66234, + "input_tokens": 16, + "output_tokens": 2941, + "cache_read_tokens": 213080, + "tool_calls": [ + "load_osm_model", + "load_osm_model" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__get_schema", + "ToolSearch", + "mcp__openstudio__get_schema", + "mcp__openstudio__execute", + "ToolSearch" + ], + "toolsearch_count": 4, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 2, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[save_model_L3]", + "passed": true, + "duration_s": 87.4, + "tier": "progressive", + "attempt": 1, + "num_turns": 14, + "cost_usd": 0.22063679999999997, + "duration_ms": 85145, + "input_tokens": 24, + "output_tokens": 4407, + "cache_read_tokens": 285391, + "tool_calls": [ + "load_osm_model", + "save_osm_model", + "load_osm_model" + ], + "num_tool_calls": 3, + "all_tool_calls": [ 
+ "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__get_schema", + "mcp__openstudio__search", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__execute", + "ListMcpResourcesTool", + "ToolSearch" + ], + "toolsearch_count": 7, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 2 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[add_ev_L1]", + "passed": false, + "duration_s": 120.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 0, + "cost_usd": 0.0, + "duration_ms": 0, + "input_tokens": 0, + "output_tokens": 0, + "cache_read_tokens": 0, + "tool_calls": [ + "load_osm_model", + "load_osm_model", + "load_osm_model" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__execute", + "ToolSearch", + "mcp__openstudio__get_schema", + "mcp__openstudio__get_schema", + "mcp__openstudio__get_schema", + "mcp__openstudio__search", + "mcp__openstudio__search", + "ToolSearch", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__get_schema", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__execute", + "Bash", + "Bash", + "Bash", + "Bash" + ], + "toolsearch_count": 5, + "is_timeout": true, + "code_mode_active": true, + "code_executions": 2, + "failure_mode": "timeout" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[add_ev_L2]", + "passed": false, + "duration_s": 79.9, + "tier": "progressive", + "attempt": 1, + "num_turns": 11, + "cost_usd": 0.178827, + "duration_ms": 77833, + "input_tokens": 13, + "output_tokens": 4210, + "cache_read_tokens": 173660, + "tool_calls": [ + "load_osm_model", + "load_osm_model", + "load_osm_model" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__execute", + "mcp__openstudio__search", + "mcp__openstudio__get_schema", + "mcp__openstudio__execute", + "mcp__openstudio__get_schema", + 
"ToolSearch", + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch" + ], + "toolsearch_count": 4, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 3, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[add_ev_L3]", + "passed": false, + "duration_s": 120.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 0, + "cost_usd": 0.0, + "duration_ms": 0, + "input_tokens": 0, + "output_tokens": 0, + "cache_read_tokens": 0, + "tool_calls": [ + "load_osm_model", + "load_osm_model" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__execute", + "ToolSearch", + "mcp__openstudio__get_schema", + "Agent", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__get_schema", + "Bash", + "Bash", + "Bash", + "Bash", + "Bash", + "Bash", + "Bash", + "Bash", + "Bash", + "Bash", + "Bash", + "Bash", + "Bash", + "Bash", + "Bash", + "Bash" + ], + "toolsearch_count": 6, + "is_timeout": true, + "code_mode_active": true, + "code_executions": 2, + "failure_mode": "timeout" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[list_measures_L1]", + "passed": true, + "duration_s": 46.3, + "tier": "progressive", + "attempt": 1, + "num_turns": 8, + "cost_usd": 0.15551009999999998, + "duration_ms": 44134, + "input_tokens": 14, + "output_tokens": 2300, + "cache_read_tokens": 172552, + "tool_calls": [ + "list_custom_measures", + "list_custom_measures" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__execute" + ], + "toolsearch_count": 4, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 2 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[list_measures_L2]", + 
"passed": true, + "duration_s": 62.9, + "tier": "progressive", + "attempt": 1, + "num_turns": 11, + "cost_usd": 0.23917740000000007, + "duration_ms": 60469, + "input_tokens": 19, + "output_tokens": 2778, + "cache_read_tokens": 268143, + "tool_calls": [ + "list_custom_measures" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__execute", + "mcp__openstudio__get_schema", + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "ListMcpResourcesTool" + ], + "toolsearch_count": 5, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 2 + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[list_measures_L3]", + "passed": false, + "duration_s": 4.7, + "tier": "progressive", + "attempt": 1, + "num_turns": 11, + "cost_usd": 0.23917740000000007, + "duration_ms": 60469, + "input_tokens": 19, + "output_tokens": 2778, + "cache_read_tokens": 268143, + "tool_calls": [ + "list_custom_measures" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__execute", + "mcp__openstudio__get_schema", + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "ListMcpResourcesTool" + ], + "toolsearch_count": 5, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 2, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[create_measure_L1]", + "passed": false, + "duration_s": 2.4, + "tier": "progressive", + "attempt": 1, + "num_turns": 11, + "cost_usd": 0.23917740000000007, + "duration_ms": 60469, + "input_tokens": 19, + "output_tokens": 2778, + "cache_read_tokens": 268143, + "tool_calls": [ + "list_custom_measures" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__execute", + "mcp__openstudio__get_schema", + 
"ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "ListMcpResourcesTool" + ], + "toolsearch_count": 5, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 2, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[create_measure_L2]", + "passed": false, + "duration_s": 2.3, + "tier": "progressive", + "attempt": 1, + "num_turns": 11, + "cost_usd": 0.23917740000000007, + "duration_ms": 60469, + "input_tokens": 19, + "output_tokens": 2778, + "cache_read_tokens": 268143, + "tool_calls": [ + "list_custom_measures" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__execute", + "mcp__openstudio__get_schema", + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "ListMcpResourcesTool" + ], + "toolsearch_count": 5, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 2, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[create_measure_L3]", + "passed": false, + "duration_s": 2.5, + "tier": "progressive", + "attempt": 1, + "num_turns": 11, + "cost_usd": 0.23917740000000007, + "duration_ms": 60469, + "input_tokens": 19, + "output_tokens": 2778, + "cache_read_tokens": 268143, + "tool_calls": [ + "list_custom_measures" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__execute", + "mcp__openstudio__get_schema", + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "ListMcpResourcesTool" + ], + "toolsearch_count": 5, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 2, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[test_measure_L1]", + "passed": false, + "duration_s": 2.4, + "tier": "progressive", + "attempt": 1, + 
"num_turns": 11, + "cost_usd": 0.23917740000000007, + "duration_ms": 60469, + "input_tokens": 19, + "output_tokens": 2778, + "cache_read_tokens": 268143, + "tool_calls": [ + "list_custom_measures" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__execute", + "mcp__openstudio__get_schema", + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "ListMcpResourcesTool" + ], + "toolsearch_count": 5, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 2, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[test_measure_L2]", + "passed": false, + "duration_s": 2.2, + "tier": "progressive", + "attempt": 1, + "num_turns": 11, + "cost_usd": 0.23917740000000007, + "duration_ms": 60469, + "input_tokens": 19, + "output_tokens": 2778, + "cache_read_tokens": 268143, + "tool_calls": [ + "list_custom_measures" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__execute", + "mcp__openstudio__get_schema", + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "ListMcpResourcesTool" + ], + "toolsearch_count": 5, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 2, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[test_measure_L3]", + "passed": false, + "duration_s": 2.3, + "tier": "progressive", + "attempt": 1, + "num_turns": 11, + "cost_usd": 0.23917740000000007, + "duration_ms": 60469, + "input_tokens": 19, + "output_tokens": 2778, + "cache_read_tokens": 268143, + "tool_calls": [ + "list_custom_measures" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__execute", + "mcp__openstudio__get_schema", + "ToolSearch", + "mcp__openstudio__execute", + 
"ToolSearch", + "ToolSearch", + "ListMcpResourcesTool" + ], + "toolsearch_count": 5, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 2, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[apply_existing_measure_L1]", + "passed": false, + "duration_s": 2.3, + "tier": "progressive", + "attempt": 1, + "num_turns": 11, + "cost_usd": 0.23917740000000007, + "duration_ms": 60469, + "input_tokens": 19, + "output_tokens": 2778, + "cache_read_tokens": 268143, + "tool_calls": [ + "list_custom_measures" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__execute", + "mcp__openstudio__get_schema", + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "ListMcpResourcesTool" + ], + "toolsearch_count": 5, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 2, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[apply_existing_measure_L2]", + "passed": false, + "duration_s": 2.3, + "tier": "progressive", + "attempt": 1, + "num_turns": 11, + "cost_usd": 0.23917740000000007, + "duration_ms": 60469, + "input_tokens": 19, + "output_tokens": 2778, + "cache_read_tokens": 268143, + "tool_calls": [ + "list_custom_measures" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__execute", + "mcp__openstudio__get_schema", + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "ListMcpResourcesTool" + ], + "toolsearch_count": 5, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 2, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[apply_existing_measure_L3]", + "passed": false, + "duration_s": 2.2, + "tier": "progressive", + "attempt": 1, + "num_turns": 11, + "cost_usd": 
0.23917740000000007, + "duration_ms": 60469, + "input_tokens": 19, + "output_tokens": 2778, + "cache_read_tokens": 268143, + "tool_calls": [ + "list_custom_measures" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__execute", + "mcp__openstudio__get_schema", + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "ListMcpResourcesTool" + ], + "toolsearch_count": 5, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 2, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[replace_terminals_cooled_beam_L1]", + "passed": false, + "duration_s": 2.2, + "tier": "progressive", + "attempt": 1, + "num_turns": 11, + "cost_usd": 0.23917740000000007, + "duration_ms": 60469, + "input_tokens": 19, + "output_tokens": 2778, + "cache_read_tokens": 268143, + "tool_calls": [ + "list_custom_measures" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__execute", + "mcp__openstudio__get_schema", + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "ListMcpResourcesTool" + ], + "toolsearch_count": 5, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 2, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[replace_terminals_cooled_beam_L2]", + "passed": false, + "duration_s": 2.5, + "tier": "progressive", + "attempt": 1, + "num_turns": 11, + "cost_usd": 0.23917740000000007, + "duration_ms": 60469, + "input_tokens": 19, + "output_tokens": 2778, + "cache_read_tokens": 268143, + "tool_calls": [ + "list_custom_measures" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__execute", + "mcp__openstudio__get_schema", + "ToolSearch", + "mcp__openstudio__execute", + 
"ToolSearch", + "ToolSearch", + "ListMcpResourcesTool" + ], + "toolsearch_count": 5, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 2, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[replace_terminals_cooled_beam_L3]", + "passed": false, + "duration_s": 2.3, + "tier": "progressive", + "attempt": 1, + "num_turns": 11, + "cost_usd": 0.23917740000000007, + "duration_ms": 60469, + "input_tokens": 19, + "output_tokens": 2778, + "cache_read_tokens": 268143, + "tool_calls": [ + "list_custom_measures" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__execute", + "mcp__openstudio__get_schema", + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "ListMcpResourcesTool" + ], + "toolsearch_count": 5, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 2, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[replace_terminals_four_pipe_beam_L1]", + "passed": false, + "duration_s": 2.2, + "tier": "progressive", + "attempt": 1, + "num_turns": 11, + "cost_usd": 0.23917740000000007, + "duration_ms": 60469, + "input_tokens": 19, + "output_tokens": 2778, + "cache_read_tokens": 268143, + "tool_calls": [ + "list_custom_measures" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__execute", + "mcp__openstudio__get_schema", + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "ListMcpResourcesTool" + ], + "toolsearch_count": 5, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 2, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[replace_terminals_four_pipe_beam_L2]", + "passed": false, + "duration_s": 2.3, + "tier": "progressive", + "attempt": 1, + 
"num_turns": 11, + "cost_usd": 0.23917740000000007, + "duration_ms": 60469, + "input_tokens": 19, + "output_tokens": 2778, + "cache_read_tokens": 268143, + "tool_calls": [ + "list_custom_measures" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__execute", + "mcp__openstudio__get_schema", + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "ListMcpResourcesTool" + ], + "toolsearch_count": 5, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 2, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[replace_terminals_four_pipe_beam_L3]", + "passed": false, + "duration_s": 2.4, + "tier": "progressive", + "attempt": 1, + "num_turns": 11, + "cost_usd": 0.23917740000000007, + "duration_ms": 60469, + "input_tokens": 19, + "output_tokens": 2778, + "cache_read_tokens": 268143, + "tool_calls": [ + "list_custom_measures" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__execute", + "mcp__openstudio__get_schema", + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "ListMcpResourcesTool" + ], + "toolsearch_count": 5, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 2, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[measure_replace_terminals_L1]", + "passed": false, + "duration_s": 2.5, + "tier": "progressive", + "attempt": 1, + "num_turns": 11, + "cost_usd": 0.23917740000000007, + "duration_ms": 60469, + "input_tokens": 19, + "output_tokens": 2778, + "cache_read_tokens": 268143, + "tool_calls": [ + "list_custom_measures" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__execute", + "mcp__openstudio__get_schema", + "ToolSearch", + 
"mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "ListMcpResourcesTool" + ], + "toolsearch_count": 5, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 2, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[measure_replace_terminals_L2]", + "passed": false, + "duration_s": 2.3, + "tier": "progressive", + "attempt": 1, + "num_turns": 11, + "cost_usd": 0.23917740000000007, + "duration_ms": 60469, + "input_tokens": 19, + "output_tokens": 2778, + "cache_read_tokens": 268143, + "tool_calls": [ + "list_custom_measures" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__execute", + "mcp__openstudio__get_schema", + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "ListMcpResourcesTool" + ], + "toolsearch_count": 5, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 2, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[measure_replace_terminals_L3]", + "passed": false, + "duration_s": 2.4, + "tier": "progressive", + "attempt": 1, + "num_turns": 11, + "cost_usd": 0.23917740000000007, + "duration_ms": 60469, + "input_tokens": 19, + "output_tokens": 2778, + "cache_read_tokens": 268143, + "tool_calls": [ + "list_custom_measures" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__execute", + "mcp__openstudio__get_schema", + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "ListMcpResourcesTool" + ], + "toolsearch_count": 5, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 2, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[zone_equipment_priority_L1]", + "passed": false, + "duration_s": 2.5, + "tier": "progressive", + 
"attempt": 1, + "num_turns": 11, + "cost_usd": 0.23917740000000007, + "duration_ms": 60469, + "input_tokens": 19, + "output_tokens": 2778, + "cache_read_tokens": 268143, + "tool_calls": [ + "list_custom_measures" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__execute", + "mcp__openstudio__get_schema", + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "ListMcpResourcesTool" + ], + "toolsearch_count": 5, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 2, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[zone_equipment_priority_L2]", + "passed": false, + "duration_s": 2.2, + "tier": "progressive", + "attempt": 1, + "num_turns": 11, + "cost_usd": 0.23917740000000007, + "duration_ms": 60469, + "input_tokens": 19, + "output_tokens": 2778, + "cache_read_tokens": 268143, + "tool_calls": [ + "list_custom_measures" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__execute", + "mcp__openstudio__get_schema", + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "ListMcpResourcesTool" + ], + "toolsearch_count": 5, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 2, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[zone_equipment_priority_L3]", + "passed": false, + "duration_s": 2.2, + "tier": "progressive", + "attempt": 1, + "num_turns": 11, + "cost_usd": 0.23917740000000007, + "duration_ms": 60469, + "input_tokens": 19, + "output_tokens": 2778, + "cache_read_tokens": 268143, + "tool_calls": [ + "list_custom_measures" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__execute", + "mcp__openstudio__get_schema", + "ToolSearch", + 
"mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "ListMcpResourcesTool" + ], + "toolsearch_count": 5, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 2, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[edit_measure_L1]", + "passed": false, + "duration_s": 2.6, + "tier": "progressive", + "attempt": 1, + "num_turns": 11, + "cost_usd": 0.23917740000000007, + "duration_ms": 60469, + "input_tokens": 19, + "output_tokens": 2778, + "cache_read_tokens": 268143, + "tool_calls": [ + "list_custom_measures" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__execute", + "mcp__openstudio__get_schema", + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "ListMcpResourcesTool" + ], + "toolsearch_count": 5, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 2, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[edit_measure_L2]", + "passed": false, + "duration_s": 2.4, + "tier": "progressive", + "attempt": 1, + "num_turns": 11, + "cost_usd": 0.23917740000000007, + "duration_ms": 60469, + "input_tokens": 19, + "output_tokens": 2778, + "cache_read_tokens": 268143, + "tool_calls": [ + "list_custom_measures" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__execute", + "mcp__openstudio__get_schema", + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "ListMcpResourcesTool" + ], + "toolsearch_count": 5, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 2, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[edit_measure_L3]", + "passed": false, + "duration_s": 2.4, + "tier": "progressive", + "attempt": 1, + "num_turns": 11, + "cost_usd": 
0.23917740000000007, + "duration_ms": 60469, + "input_tokens": 19, + "output_tokens": 2778, + "cache_read_tokens": 268143, + "tool_calls": [ + "list_custom_measures" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search", + "mcp__openstudio__execute", + "mcp__openstudio__get_schema", + "ToolSearch", + "mcp__openstudio__execute", + "ToolSearch", + "ToolSearch", + "ListMcpResourcesTool" + ], + "toolsearch_count": 5, + "is_timeout": false, + "code_mode_active": true, + "code_executions": 2, + "failure_mode": "wrong_tool" + } + ] +} \ No newline at end of file diff --git a/docs/sweeps/codemode-on-2026-04-05/benchmark.md b/docs/sweeps/codemode-on-2026-04-05/benchmark.md new file mode 100644 index 0000000..6c121a2 --- /dev/null +++ b/docs/sweeps/codemode-on-2026-04-05/benchmark.md @@ -0,0 +1,317 @@ +# LLM Benchmark Report + +**Date:** 2026-04-05T22:50:04+00:00 +**Model:** sonnet | **Retries:** 0 | **CodeMode:** ON +**Result:** 31/129 passed (24.0%) in 10102s +**Tokens:** 1.6k in + 300.1k out + 20.3M cache | **Cost:** $22.3458 (notional API pricing) + +## Summary by Tier + +| Tier | Passed | Rate | Time | Avg | +|--------|---------|--------|--------|--------| +| progressive | 31/129 | 24.0% | 10102s | 78s | + +## Detailed Results + +### progressive + +| Test | Result | Time | Turns | Tools | In Tok | Out Tok | Cache | Cost | Att | +|-------------------------------------|--------|------|-------|-----------------------------------------------------------------------------------------|--------|---------|--------|---------|-----| +| import_floorplan_L1 | FAIL | 120s | 0 | get_skill, list_skills | 0 | 0 | 0 | $0.0000 | 1 | +| import_floorplan_L2 | PASS | 50s | 6 | import_floorspacejs | 10 | 2.5k | 100.6k | $0.1176 | 1 | +| import_floorplan_L3 | PASS | 96s | 8 | import_floorspacejs | 16 | 4.9k | 134.2k | $0.1555 | 1 | +| add_hvac_L1 | FAIL | 69s | 10 | load_osm_model, load_osm_model | 16 | 3.5k | 156.0k | $0.1522 | 1 | +| 
add_hvac_L2 | FAIL | 120s | 0 | load_osm_model, load_osm_model | 0 | 0 | 0 | $0.0000 | 1 | +| add_hvac_L3 | FAIL | 96s | 9 | load_osm_model, load_osm_model, load_osm_model | 15 | 1.6k | 235.2k | $0.4525 | 1 | +| view_model_L1 | FAIL | 108s | 15 | load_osm_model, load_osm_model | 22 | 5.3k | 287.4k | $0.2429 | 1 | +| view_model_L2 | FAIL | 120s | 0 | load_osm_model, load_osm_model | 0 | 0 | 0 | $0.0000 | 1 | +| view_model_L3 | FAIL | 94s | 9 | load_osm_model, load_osm_model | 18 | 3.0k | 166.9k | $0.3817 | 1 | +| set_weather_L1 | FAIL | 120s | 0 | load_osm_model | 0 | 0 | 0 | $0.0000 | 1 | +| set_weather_L2 | FAIL | 120s | 0 | load_osm_model, load_osm_model | 0 | 0 | 0 | $0.0000 | 1 | +| set_weather_L3 | PASS | 120s | 0 | load_osm_model, load_osm_model, change_building_location | 0 | 0 | 0 | $0.0000 | 1 | +| run_qaqc_L1 | FAIL | 120s | 0 | load_osm_model, load_osm_model | 0 | 0 | 0 | $0.0000 | 1 | +| run_qaqc_L2 | FAIL | 120s | 0 | load_osm_model, load_osm_model | 0 | 0 | 0 | $0.0000 | 1 | +| run_qaqc_L3 | FAIL | 60s | 12 | load_osm_model | 16 | 3.0k | 207.7k | $0.1991 | 1 | +| create_building_L1 | FAIL | 120s | 0 | list_skills, list_weather_files | 0 | 0 | 0 | $0.0000 | 1 | +| create_building_L2 | PASS | 54s | 7 | create_new_building | 13 | 3.0k | 121.3k | $0.1285 | 1 | +| create_building_L3 | PASS | 79s | 8 | create_bar_building | 14 | 4.3k | 168.3k | $0.2166 | 1 | +| add_pv_L1 | FAIL | 120s | 0 | load_osm_model, load_osm_model | 0 | 0 | 0 | $0.0000 | 1 | +| add_pv_L2 | FAIL | 120s | 0 | load_osm_model, load_osm_model | 0 | 0 | 0 | $0.0000 | 1 | +| add_pv_L3 | FAIL | 56s | 6 | load_osm_model, load_osm_model | 10 | 2.7k | 106.3k | $0.1473 | 1 | +| thermostat_L1 | FAIL | 120s | 0 | load_osm_model | 0 | 0 | 0 | $0.0000 | 1 | +| thermostat_L2 | FAIL | 51s | 8 | load_osm_model | 15 | 2.8k | 120.6k | $0.1299 | 1 | +| thermostat_L3 | FAIL | 80s | 10 | load_osm_model, load_osm_model | 18 | 3.9k | 209.2k | $0.2123 | 1 | +| list_spaces_L1 | FAIL | 56s | 10 | load_osm_model 
| 16 | 3.0k | 167.1k | $0.1703 | 1 | +| list_spaces_L2 | PASS | 99s | 12 | load_osm_model, load_osm_model, list_spaces | 19 | 4.2k | 197.0k | $0.1768 | 1 | +| list_spaces_L3 | FAIL | 86s | 8 | load_osm_model, load_osm_model | 14 | 1.3k | 132.9k | $0.3118 | 1 | +| schedules_L1 | FAIL | 120s | 0 | load_osm_model, load_osm_model | 0 | 0 | 0 | $0.0000 | 1 | +| schedules_L2 | FAIL | 120s | 0 | load_osm_model, load_osm_model | 0 | 0 | 0 | $0.0000 | 1 | +| schedules_L3 | PASS | 120s | 0 | load_osm_model, list_model_objects, load_osm_model, list_model_objects | 0 | 0 | 0 | $0.0000 | 1 | +| inspect_component_L1 | FAIL | 83s | 10 | load_osm_model, load_osm_model | 19 | 1.5k | 214.1k | $0.3361 | 1 | +| inspect_component_L2 | FAIL | 120s | 0 | load_osm_model, load_osm_model, load_osm_model | 0 | 0 | 0 | $0.0000 | 1 | +| inspect_component_L3 | FAIL | 90s | 11 | load_osm_model, load_osm_model | 18 | 4.4k | 264.8k | $0.3014 | 1 | +| modify_component_L1 | FAIL | 51s | 8 | load_osm_model, load_osm_model | 13 | 2.5k | 179.9k | $0.1966 | 1 | +| modify_component_L2 | FAIL | 120s | 0 | load_osm_model, load_osm_model, load_osm_model, list_model_objects | 0 | 0 | 0 | $0.0000 | 1 | +| modify_component_L3 | FAIL | 90s | 15 | load_osm_model, load_osm_model, load_osm_model, list_model_objects | 21 | 4.8k | 242.2k | $0.2044 | 1 | +| list_dynamic_type_L1 | FAIL | 106s | 10 | load_osm_model, load_osm_model | 18 | 4.8k | 171.7k | $0.2786 | 1 | +| list_dynamic_type_L2 | PASS | 120s | 0 | load_osm_model, load_osm_model, list_model_objects, load_osm_model, list_model_objects | 0 | 0 | 0 | $0.0000 | 1 | +| list_dynamic_type_L3 | PASS | 56s | 9 | load_osm_model, list_model_objects | 16 | 3.1k | 135.3k | $0.1324 | 1 | +| floor_area_L1 | PASS | 110s | 8 | load_osm_model, load_osm_model, get_building_info | 13 | 1.0k | 148.3k | $0.4459 | 1 | +| floor_area_L2 | PASS | 97s | 9 | load_osm_model, load_osm_model, get_building_info | 18 | 1.1k | 165.7k | $0.4157 | 1 | +| floor_area_L3 | FAIL | 69s | 9 | 
load_osm_model | 19 | 3.7k | 171.2k | $0.1699 | 1 | +| materials_L1 | FAIL | 82s | 14 | load_osm_model, load_osm_model, load_osm_model | 20 | 4.2k | 208.3k | $0.2107 | 1 | +| materials_L2 | PASS | 110s | 13 | load_osm_model, load_osm_model, list_materials | 22 | 3.4k | 215.1k | $0.3383 | 1 | +| materials_L3 | FAIL | 118s | 10 | load_osm_model | 20 | 6.0k | 182.5k | $0.2068 | 1 | +| thermal_zones_L1 | PASS | 63s | 11 | load_osm_model, load_osm_model, list_thermal_zones | 21 | 3.1k | 193.6k | $0.1550 | 1 | +| thermal_zones_L2 | FAIL | 120s | 0 | load_osm_model | 0 | 0 | 0 | $0.0000 | 1 | +| thermal_zones_L3 | FAIL | 68s | 10 | load_osm_model | 17 | 2.9k | 213.3k | $0.2076 | 1 | +| subsurfaces_L1 | FAIL | 78s | 8 | load_osm_model, load_osm_model | 13 | 1.0k | 139.3k | $0.3755 | 1 | +| subsurfaces_L2 | FAIL | 60s | 0 | — | 0 | 0 | 0 | $0.0000 | 1 | +| subsurfaces_L3 | FAIL | 76s | 10 | load_osm_model, load_osm_model | 20 | 3.6k | 180.2k | $0.1674 | 1 | +| surface_details_L1 | FAIL | 120s | 0 | load_osm_model, load_osm_model | 0 | 0 | 0 | $0.0000 | 1 | +| surface_details_L2 | FAIL | 81s | 10 | load_osm_model, load_osm_model | 19 | 1.6k | 205.7k | $0.3878 | 1 | +| surface_details_L3 | FAIL | 78s | 11 | load_osm_model, load_osm_model, load_osm_model | 20 | 2.6k | 236.1k | $0.3762 | 1 | +| run_simulation_L1 | FAIL | 96s | 9 | load_osm_model | 14 | 4.9k | 140.5k | $0.1722 | 1 | +| run_simulation_L2 | FAIL | 86s | 12 | load_osm_model, load_osm_model | 24 | 4.4k | 288.5k | $0.2508 | 1 | +| run_simulation_L3 | FAIL | 144s | 9 | load_osm_model | 15 | 2.4k | 174.4k | $0.1570 | 1 | +| get_eui_L1 | PASS | 93s | 10 | extract_summary_metrics, extract_summary_metrics, extract_end_use_breakdown | 22 | 2.7k | 174.0k | $0.2803 | 1 | +| get_eui_L2 | PASS | 258s | 40 | extract_summary_metrics | 51 | 12.8k | 1.3M | $0.7618 | 1 | +| get_eui_L3 | PASS | 99s | 12 | extract_summary_metrics, extract_summary_metrics | 22 | 3.5k | 233.8k | $0.3359 | 1 | +| end_use_breakdown_L1 | PASS | 51s | 6 | 
extract_end_use_breakdown, extract_end_use_breakdown | 12 | 719 | 86.9k | $0.1955 | 1 | +| end_use_breakdown_L2 | PASS | 76s | 6 | extract_end_use_breakdown, extract_end_use_breakdown | 12 | 783 | 87.8k | $0.2445 | 1 | +| end_use_breakdown_L3 | PASS | 54s | 8 | extract_end_use_breakdown | 16 | 2.4k | 136.2k | $0.1338 | 1 | +| hvac_sizing_L1 | PASS | 58s | 6 | extract_hvac_sizing, extract_hvac_sizing | 12 | 760 | 95.9k | $0.2433 | 1 | +| hvac_sizing_L2 | PASS | 58s | 6 | extract_hvac_sizing, extract_hvac_sizing | 12 | 791 | 95.3k | $0.1673 | 1 | +| hvac_sizing_L3 | PASS | 135s | 19 | extract_hvac_sizing | 29 | 7.1k | 443.6k | $0.3254 | 1 | +| set_wwr_L1 | FAIL | 90s | 10 | load_osm_model, load_osm_model | 14 | 1.4k | 129.5k | $0.3587 | 1 | +| set_wwr_L2 | FAIL | 107s | 13 | load_osm_model, load_osm_model | 18 | 6.0k | 191.4k | $0.2157 | 1 | +| set_wwr_L3 | PASS | 120s | 0 | load_osm_model, load_osm_model, list_surfaces, set_window_to_wall_ratio, save_osm_model | 0 | 0 | 0 | $0.0000 | 1 | +| replace_windows_L1 | FAIL | 106s | 11 | load_osm_model, load_osm_model | 13 | 5.5k | 230.6k | $0.3376 | 1 | +| replace_windows_L2 | FAIL | 120s | 0 | load_osm_model, load_osm_model | 0 | 0 | 0 | $0.0000 | 1 | +| replace_windows_L3 | FAIL | 90s | 12 | load_osm_model, load_osm_model | 20 | 3.7k | 248.2k | $0.2120 | 1 | +| construction_details_L1 | FAIL | 120s | 0 | load_osm_model, load_osm_model | 0 | 0 | 0 | $0.0000 | 1 | +| construction_details_L2 | FAIL | 120s | 0 | load_osm_model, load_osm_model | 0 | 0 | 0 | $0.0000 | 1 | +| construction_details_L3 | FAIL | 90s | 11 | load_osm_model | 20 | 2.1k | 217.1k | $0.3163 | 1 | +| check_loads_L1 | FAIL | 120s | 0 | load_osm_model, load_osm_model, load_osm_model | 0 | 0 | 0 | $0.0000 | 1 | +| check_loads_L2 | FAIL | 120s | 0 | load_osm_model, load_osm_model, load_osm_model | 0 | 0 | 0 | $0.0000 | 1 | +| check_loads_L3 | FAIL | 120s | 0 | load_osm_model, load_osm_model | 0 | 0 | 0 | $0.0000 | 1 | +| create_loads_L1 | FAIL | 68s | 10 | 
load_osm_model, load_osm_model | 15 | 3.5k | 168.3k | $0.1589 | 1 | +| create_loads_L2 | FAIL | 120s | 0 | load_osm_model, list_spaces, load_osm_model | 0 | 0 | 0 | $0.0000 | 1 | +| create_loads_L3 | FAIL | 116s | 11 | load_osm_model | 20 | 6.1k | 191.3k | $0.2208 | 1 | +| create_plant_loop_L1 | FAIL | 118s | 12 | load_osm_model, load_osm_model | 17 | 5.9k | 261.7k | $0.2578 | 1 | +| create_plant_loop_L2 | FAIL | 72s | 7 | load_osm_model | 13 | 3.9k | 120.1k | $0.1389 | 1 | +| create_plant_loop_L3 | PASS | 79s | 9 | load_osm_model, create_plant_loop | 16 | 4.3k | 136.6k | $0.1589 | 1 | +| schedule_details_L1 | FAIL | 84s | 14 | load_osm_model | 21 | 4.7k | 199.8k | $0.1867 | 1 | +| schedule_details_L2 | FAIL | 120s | 0 | load_osm_model, load_osm_model, load_osm_model | 0 | 0 | 0 | $0.0000 | 1 | +| schedule_details_L3 | FAIL | 120s | 0 | load_osm_model, load_osm_model | 0 | 0 | 0 | $0.0000 | 1 | +| space_type_info_L1 | FAIL | 78s | 11 | load_osm_model, load_osm_model | 18 | 4.1k | 253.5k | $0.2611 | 1 | +| space_type_info_L2 | FAIL | 120s | 0 | load_osm_model, load_osm_model, list_model_objects | 0 | 0 | 0 | $0.0000 | 1 | +| space_type_info_L3 | FAIL | 69s | 8 | load_osm_model | 16 | 3.6k | 137.7k | $0.1496 | 1 | +| set_run_period_L1 | PASS | 104s | 11 | load_osm_model, load_osm_model, set_run_period, get_run_period | 16 | 3.3k | 175.0k | $0.3013 | 1 | +| set_run_period_L2 | PASS | 120s | 0 | load_osm_model, load_osm_model, set_run_period, set_run_period | 0 | 0 | 0 | $0.0000 | 1 | +| set_run_period_L3 | FAIL | 120s | 0 | load_osm_model | 0 | 0 | 0 | $0.0000 | 1 | +| ideal_air_L1 | PASS | 120s | 0 | load_osm_model, load_osm_model, enable_ideal_air_loads, load_osm_model | 0 | 0 | 0 | $0.0000 | 1 | +| ideal_air_L2 | PASS | 49s | 8 | load_osm_model, enable_ideal_air_loads | 13 | 2.7k | 186.9k | $0.2062 | 1 | +| ideal_air_L3 | FAIL | 82s | 12 | load_osm_model, load_osm_model | 17 | 2.6k | 198.5k | $0.2574 | 1 | +| save_model_L1 | FAIL | 61s | 11 | load_osm_model | 18 | 
3.3k | 219.1k | $0.2806 | 1 | +| save_model_L2 | FAIL | 68s | 10 | load_osm_model, load_osm_model | 16 | 2.9k | 213.1k | $0.1844 | 1 | +| save_model_L3 | PASS | 87s | 14 | load_osm_model, save_osm_model, load_osm_model | 24 | 4.4k | 285.4k | $0.2206 | 1 | +| add_ev_L1 | FAIL | 120s | 0 | load_osm_model, load_osm_model, load_osm_model | 0 | 0 | 0 | $0.0000 | 1 | +| add_ev_L2 | FAIL | 80s | 11 | load_osm_model, load_osm_model, load_osm_model | 13 | 4.2k | 173.7k | $0.1788 | 1 | +| add_ev_L3 | FAIL | 120s | 0 | load_osm_model, load_osm_model | 0 | 0 | 0 | $0.0000 | 1 | +| list_measures_L1 | PASS | 46s | 8 | list_custom_measures, list_custom_measures | 14 | 2.3k | 172.6k | $0.1555 | 1 | +| list_measures_L2 | PASS | 63s | 11 | list_custom_measures | 19 | 2.8k | 268.1k | $0.2392 | 1 | +| list_measures_L3 | FAIL | 5s | 11 | list_custom_measures | 19 | 2.8k | 268.1k | $0.2392 | 1 | +| create_measure_L1 | FAIL | 2s | 11 | list_custom_measures | 19 | 2.8k | 268.1k | $0.2392 | 1 | +| create_measure_L2 | FAIL | 2s | 11 | list_custom_measures | 19 | 2.8k | 268.1k | $0.2392 | 1 | +| create_measure_L3 | FAIL | 2s | 11 | list_custom_measures | 19 | 2.8k | 268.1k | $0.2392 | 1 | +| test_measure_L1 | FAIL | 2s | 11 | list_custom_measures | 19 | 2.8k | 268.1k | $0.2392 | 1 | +| test_measure_L2 | FAIL | 2s | 11 | list_custom_measures | 19 | 2.8k | 268.1k | $0.2392 | 1 | +| test_measure_L3 | FAIL | 2s | 11 | list_custom_measures | 19 | 2.8k | 268.1k | $0.2392 | 1 | +| apply_existing_measure_L1 | FAIL | 2s | 11 | list_custom_measures | 19 | 2.8k | 268.1k | $0.2392 | 1 | +| apply_existing_measure_L2 | FAIL | 2s | 11 | list_custom_measures | 19 | 2.8k | 268.1k | $0.2392 | 1 | +| apply_existing_measure_L3 | FAIL | 2s | 11 | list_custom_measures | 19 | 2.8k | 268.1k | $0.2392 | 1 | +| replace_terminals_cooled_beam_L1 | FAIL | 2s | 11 | list_custom_measures | 19 | 2.8k | 268.1k | $0.2392 | 1 | +| replace_terminals_cooled_beam_L2 | FAIL | 2s | 11 | list_custom_measures | 19 | 2.8k | 268.1k | 
$0.2392 | 1 | +| replace_terminals_cooled_beam_L3 | FAIL | 2s | 11 | list_custom_measures | 19 | 2.8k | 268.1k | $0.2392 | 1 | +| replace_terminals_four_pipe_beam_L1 | FAIL | 2s | 11 | list_custom_measures | 19 | 2.8k | 268.1k | $0.2392 | 1 | +| replace_terminals_four_pipe_beam_L2 | FAIL | 2s | 11 | list_custom_measures | 19 | 2.8k | 268.1k | $0.2392 | 1 | +| replace_terminals_four_pipe_beam_L3 | FAIL | 2s | 11 | list_custom_measures | 19 | 2.8k | 268.1k | $0.2392 | 1 | +| measure_replace_terminals_L1 | FAIL | 2s | 11 | list_custom_measures | 19 | 2.8k | 268.1k | $0.2392 | 1 | +| measure_replace_terminals_L2 | FAIL | 2s | 11 | list_custom_measures | 19 | 2.8k | 268.1k | $0.2392 | 1 | +| measure_replace_terminals_L3 | FAIL | 2s | 11 | list_custom_measures | 19 | 2.8k | 268.1k | $0.2392 | 1 | +| zone_equipment_priority_L1 | FAIL | 2s | 11 | list_custom_measures | 19 | 2.8k | 268.1k | $0.2392 | 1 | +| zone_equipment_priority_L2 | FAIL | 2s | 11 | list_custom_measures | 19 | 2.8k | 268.1k | $0.2392 | 1 | +| zone_equipment_priority_L3 | FAIL | 2s | 11 | list_custom_measures | 19 | 2.8k | 268.1k | $0.2392 | 1 | +| edit_measure_L1 | FAIL | 3s | 11 | list_custom_measures | 19 | 2.8k | 268.1k | $0.2392 | 1 | +| edit_measure_L2 | FAIL | 2s | 11 | list_custom_measures | 19 | 2.8k | 268.1k | $0.2392 | 1 | +| edit_measure_L3 | FAIL | 2s | 11 | list_custom_measures | 19 | 2.8k | 268.1k | $0.2392 | 1 | + +## Progressive Prompt Analysis + +Pass rates by specificity level per case: + +| Case | L1 (vague) | L2 (moderate) | L3 (explicit) | +|----------------------|------------|---------------|---------------| +| import_floorplan | FAIL | PASS | PASS | +| add_hvac | FAIL | FAIL | FAIL | +| view_model | FAIL | FAIL | FAIL | +| set_weather | FAIL | FAIL | PASS | +| run_qaqc | FAIL | FAIL | FAIL | +| create_building | FAIL | PASS | PASS | +| add_pv | FAIL | FAIL | FAIL | +| thermostat | FAIL | FAIL | FAIL | +| list_spaces | FAIL | PASS | FAIL | +| schedules | FAIL | FAIL | PASS | +| 
inspect_component | FAIL | FAIL | FAIL | +| modify_component | FAIL | FAIL | FAIL | +| list_dynamic_type | FAIL | PASS | PASS | +| floor_area | PASS | PASS | FAIL | +| materials | FAIL | PASS | FAIL | +| thermal_zones | PASS | FAIL | FAIL | +| subsurfaces | FAIL | FAIL | FAIL | +| surface_details | FAIL | FAIL | FAIL | +| run_simulation | FAIL | FAIL | FAIL | +| get_eui | PASS | PASS | PASS | +| end_use_breakdown | PASS | PASS | PASS | +| hvac_sizing | PASS | PASS | PASS | +| set_wwr | FAIL | FAIL | PASS | +| replace_windows | FAIL | FAIL | FAIL | +| construction_details | FAIL | FAIL | FAIL | +| check_loads | FAIL | FAIL | FAIL | +| create_loads | FAIL | FAIL | FAIL | +| create_plant_loop | FAIL | FAIL | PASS | +| schedule_details | FAIL | FAIL | FAIL | +| space_type_info | FAIL | FAIL | FAIL | +| set_run_period | PASS | PASS | FAIL | +| ideal_air | PASS | PASS | FAIL | +| save_model | FAIL | FAIL | PASS | +| add_ev | FAIL | FAIL | FAIL | +| list_measures | PASS | PASS | FAIL | +| create_measure | FAIL | FAIL | FAIL | +| test_measure | FAIL | FAIL | FAIL | +| apply_existing_measure | FAIL | FAIL | FAIL | +| replace_terminals_cooled_beam | FAIL | FAIL | FAIL | +| replace_terminals_four_pipe_beam | FAIL | FAIL | FAIL | +| measure_replace_terminals | FAIL | FAIL | FAIL | +| zone_equipment_priority | FAIL | FAIL | FAIL | +| edit_measure | FAIL | FAIL | FAIL | + +**Summary:** L1=8/43 | L2=12/43 | L3=11/43 + +## Tool Discovery Overhead + +| Metric | Value | +|--------|-------| +| Avg ToolSearch calls/test | 5.8 | +| Max ToolSearch calls | 14 | +| Tests with 0 ToolSearch | 1/129 | + +## Failure Mode Analysis + +| Mode | Count | Description | +|------|-------|-------------| +| wrong_tool | 67 | MCP tool called but not the expected one | +| timeout | 30 | Timed out before completing | +| no_mcp_tool | 1 | No MCP tool called (stuck in builtins) | + +## Failed Tests + +- **import_floorplan_L1** (progressive, timeout): 120s, 0 turns, tools: get_skill -> list_skills +- 
**add_hvac_L1** (progressive, wrong_tool): 69s, 10 turns, tools: load_osm_model -> load_osm_model +- **add_hvac_L2** (progressive, timeout): 120s, 0 turns, tools: load_osm_model -> load_osm_model +- **add_hvac_L3** (progressive, wrong_tool): 96s, 9 turns, tools: load_osm_model -> load_osm_model -> load_osm_model +- **view_model_L1** (progressive, wrong_tool): 108s, 15 turns, tools: load_osm_model -> load_osm_model +- **view_model_L2** (progressive, timeout): 120s, 0 turns, tools: load_osm_model -> load_osm_model +- **view_model_L3** (progressive, wrong_tool): 94s, 9 turns, tools: load_osm_model -> load_osm_model +- **set_weather_L1** (progressive, timeout): 120s, 0 turns, tools: load_osm_model +- **set_weather_L2** (progressive, timeout): 120s, 0 turns, tools: load_osm_model -> load_osm_model +- **run_qaqc_L1** (progressive, timeout): 120s, 0 turns, tools: load_osm_model -> load_osm_model +- **run_qaqc_L2** (progressive, timeout): 120s, 0 turns, tools: load_osm_model -> load_osm_model +- **run_qaqc_L3** (progressive, wrong_tool): 60s, 12 turns, tools: load_osm_model +- **create_building_L1** (progressive, timeout): 120s, 0 turns, tools: list_skills -> list_weather_files +- **add_pv_L1** (progressive, timeout): 120s, 0 turns, tools: load_osm_model -> load_osm_model +- **add_pv_L2** (progressive, timeout): 120s, 0 turns, tools: load_osm_model -> load_osm_model +- **add_pv_L3** (progressive, wrong_tool): 56s, 6 turns, tools: load_osm_model -> load_osm_model +- **thermostat_L1** (progressive, timeout): 120s, 0 turns, tools: load_osm_model +- **thermostat_L2** (progressive, wrong_tool): 51s, 8 turns, tools: load_osm_model +- **thermostat_L3** (progressive, wrong_tool): 80s, 10 turns, tools: load_osm_model -> load_osm_model +- **list_spaces_L1** (progressive, wrong_tool): 56s, 10 turns, tools: load_osm_model +- **list_spaces_L3** (progressive, wrong_tool): 86s, 8 turns, tools: load_osm_model -> load_osm_model +- **schedules_L1** (progressive, timeout): 120s, 0 turns, 
tools: load_osm_model -> load_osm_model +- **schedules_L2** (progressive, timeout): 120s, 0 turns, tools: load_osm_model -> load_osm_model +- **inspect_component_L1** (progressive, wrong_tool): 83s, 10 turns, tools: load_osm_model -> load_osm_model +- **inspect_component_L2** (progressive, timeout): 120s, 0 turns, tools: load_osm_model -> load_osm_model -> load_osm_model +- **inspect_component_L3** (progressive, wrong_tool): 90s, 11 turns, tools: load_osm_model -> load_osm_model +- **modify_component_L1** (progressive, wrong_tool): 51s, 8 turns, tools: load_osm_model -> load_osm_model +- **modify_component_L2** (progressive, timeout): 120s, 0 turns, tools: load_osm_model -> load_osm_model -> load_osm_model -> list_model_objects +- **modify_component_L3** (progressive, wrong_tool): 90s, 15 turns, tools: load_osm_model -> load_osm_model -> load_osm_model -> list_model_objects +- **list_dynamic_type_L1** (progressive, wrong_tool): 106s, 10 turns, tools: load_osm_model -> load_osm_model +- **floor_area_L3** (progressive, wrong_tool): 69s, 9 turns, tools: load_osm_model +- **materials_L1** (progressive, wrong_tool): 82s, 14 turns, tools: load_osm_model -> load_osm_model -> load_osm_model +- **materials_L3** (progressive, wrong_tool): 118s, 10 turns, tools: load_osm_model +- **thermal_zones_L2** (progressive, timeout): 120s, 0 turns, tools: load_osm_model +- **thermal_zones_L3** (progressive, wrong_tool): 68s, 10 turns, tools: load_osm_model +- **subsurfaces_L1** (progressive, wrong_tool): 78s, 8 turns, tools: load_osm_model -> load_osm_model +- **subsurfaces_L2** (progressive, no_mcp_tool): 60s, 0 turns, tools: no tools called +- **subsurfaces_L3** (progressive, wrong_tool): 76s, 10 turns, tools: load_osm_model -> load_osm_model +- **surface_details_L1** (progressive, timeout): 120s, 0 turns, tools: load_osm_model -> load_osm_model +- **surface_details_L2** (progressive, wrong_tool): 81s, 10 turns, tools: load_osm_model -> load_osm_model +- **surface_details_L3** 
(progressive, wrong_tool): 78s, 11 turns, tools: load_osm_model -> load_osm_model -> load_osm_model +- **run_simulation_L1** (progressive, wrong_tool): 96s, 9 turns, tools: load_osm_model +- **run_simulation_L2** (progressive, wrong_tool): 86s, 12 turns, tools: load_osm_model -> load_osm_model +- **run_simulation_L3** (progressive, wrong_tool): 144s, 9 turns, tools: load_osm_model +- **set_wwr_L1** (progressive, wrong_tool): 90s, 10 turns, tools: load_osm_model -> load_osm_model +- **set_wwr_L2** (progressive, wrong_tool): 107s, 13 turns, tools: load_osm_model -> load_osm_model +- **replace_windows_L1** (progressive, wrong_tool): 106s, 11 turns, tools: load_osm_model -> load_osm_model +- **replace_windows_L2** (progressive, timeout): 120s, 0 turns, tools: load_osm_model -> load_osm_model +- **replace_windows_L3** (progressive, wrong_tool): 90s, 12 turns, tools: load_osm_model -> load_osm_model +- **construction_details_L1** (progressive, timeout): 120s, 0 turns, tools: load_osm_model -> load_osm_model +- **construction_details_L2** (progressive, timeout): 120s, 0 turns, tools: load_osm_model -> load_osm_model +- **construction_details_L3** (progressive, wrong_tool): 90s, 11 turns, tools: load_osm_model +- **check_loads_L1** (progressive, timeout): 120s, 0 turns, tools: load_osm_model -> load_osm_model -> load_osm_model +- **check_loads_L2** (progressive, timeout): 120s, 0 turns, tools: load_osm_model -> load_osm_model -> load_osm_model +- **check_loads_L3** (progressive, timeout): 120s, 0 turns, tools: load_osm_model -> load_osm_model +- **create_loads_L1** (progressive, wrong_tool): 68s, 10 turns, tools: load_osm_model -> load_osm_model +- **create_loads_L2** (progressive, timeout): 120s, 0 turns, tools: load_osm_model -> list_spaces -> load_osm_model +- **create_loads_L3** (progressive, wrong_tool): 116s, 11 turns, tools: load_osm_model +- **create_plant_loop_L1** (progressive, wrong_tool): 118s, 12 turns, tools: load_osm_model -> load_osm_model +- 
**create_plant_loop_L2** (progressive, wrong_tool): 72s, 7 turns, tools: load_osm_model +- **schedule_details_L1** (progressive, wrong_tool): 84s, 14 turns, tools: load_osm_model +- **schedule_details_L2** (progressive, timeout): 120s, 0 turns, tools: load_osm_model -> load_osm_model -> load_osm_model +- **schedule_details_L3** (progressive, timeout): 120s, 0 turns, tools: load_osm_model -> load_osm_model +- **space_type_info_L1** (progressive, wrong_tool): 78s, 11 turns, tools: load_osm_model -> load_osm_model +- **space_type_info_L2** (progressive, timeout): 120s, 0 turns, tools: load_osm_model -> load_osm_model -> list_model_objects +- **space_type_info_L3** (progressive, wrong_tool): 69s, 8 turns, tools: load_osm_model +- **set_run_period_L3** (progressive, timeout): 120s, 0 turns, tools: load_osm_model +- **ideal_air_L3** (progressive, wrong_tool): 82s, 12 turns, tools: load_osm_model -> load_osm_model +- **save_model_L1** (progressive, wrong_tool): 61s, 11 turns, tools: load_osm_model +- **save_model_L2** (progressive, wrong_tool): 68s, 10 turns, tools: load_osm_model -> load_osm_model +- **add_ev_L1** (progressive, timeout): 120s, 0 turns, tools: load_osm_model -> load_osm_model -> load_osm_model +- **add_ev_L2** (progressive, wrong_tool): 80s, 11 turns, tools: load_osm_model -> load_osm_model -> load_osm_model +- **add_ev_L3** (progressive, timeout): 120s, 0 turns, tools: load_osm_model -> load_osm_model +- **list_measures_L3** (progressive, wrong_tool): 5s, 11 turns, tools: list_custom_measures +- **create_measure_L1** (progressive, wrong_tool): 2s, 11 turns, tools: list_custom_measures +- **create_measure_L2** (progressive, wrong_tool): 2s, 11 turns, tools: list_custom_measures +- **create_measure_L3** (progressive, wrong_tool): 2s, 11 turns, tools: list_custom_measures +- **test_measure_L1** (progressive, wrong_tool): 2s, 11 turns, tools: list_custom_measures +- **test_measure_L2** (progressive, wrong_tool): 2s, 11 turns, tools: list_custom_measures +- 
**test_measure_L3** (progressive, wrong_tool): 2s, 11 turns, tools: list_custom_measures +- **apply_existing_measure_L1** (progressive, wrong_tool): 2s, 11 turns, tools: list_custom_measures +- **apply_existing_measure_L2** (progressive, wrong_tool): 2s, 11 turns, tools: list_custom_measures +- **apply_existing_measure_L3** (progressive, wrong_tool): 2s, 11 turns, tools: list_custom_measures +- **replace_terminals_cooled_beam_L1** (progressive, wrong_tool): 2s, 11 turns, tools: list_custom_measures +- **replace_terminals_cooled_beam_L2** (progressive, wrong_tool): 2s, 11 turns, tools: list_custom_measures +- **replace_terminals_cooled_beam_L3** (progressive, wrong_tool): 2s, 11 turns, tools: list_custom_measures +- **replace_terminals_four_pipe_beam_L1** (progressive, wrong_tool): 2s, 11 turns, tools: list_custom_measures +- **replace_terminals_four_pipe_beam_L2** (progressive, wrong_tool): 2s, 11 turns, tools: list_custom_measures +- **replace_terminals_four_pipe_beam_L3** (progressive, wrong_tool): 2s, 11 turns, tools: list_custom_measures +- **measure_replace_terminals_L1** (progressive, wrong_tool): 2s, 11 turns, tools: list_custom_measures +- **measure_replace_terminals_L2** (progressive, wrong_tool): 2s, 11 turns, tools: list_custom_measures +- **measure_replace_terminals_L3** (progressive, wrong_tool): 2s, 11 turns, tools: list_custom_measures +- **zone_equipment_priority_L1** (progressive, wrong_tool): 2s, 11 turns, tools: list_custom_measures +- **zone_equipment_priority_L2** (progressive, wrong_tool): 2s, 11 turns, tools: list_custom_measures +- **zone_equipment_priority_L3** (progressive, wrong_tool): 2s, 11 turns, tools: list_custom_measures +- **edit_measure_L1** (progressive, wrong_tool): 3s, 11 turns, tools: list_custom_measures +- **edit_measure_L2** (progressive, wrong_tool): 2s, 11 turns, tools: list_custom_measures +- **edit_measure_L3** (progressive, wrong_tool): 2s, 11 turns, tools: list_custom_measures diff --git 
a/docs/sweeps/haiku-2026-03-28/benchmark.json b/docs/sweeps/haiku-2026-03-28/benchmark.json new file mode 100644 index 0000000..344b2f3 --- /dev/null +++ b/docs/sweeps/haiku-2026-03-28/benchmark.json @@ -0,0 +1,6054 @@ +{ + "timestamp": "2026-03-28T18:32:55+00:00", + "model": "haiku", + "retries": 0, + "total_tests": 180, + "passed": 160, + "failed": 20, + "pass_rate": 88.9, + "total_duration_s": 4774.9, + "total_input_tokens": 8870, + "total_output_tokens": 307749, + "total_cache_read_tokens": 66583856, + "total_cost_usd": 11.211, + "tiers": { + "setup": { + "total": 6, + "passed": 6, + "duration_s": 113.7, + "pass_rate": 100.0 + }, + "tier1": { + "total": 4, + "passed": 4, + "duration_s": 75.9, + "pass_rate": 100.0 + }, + "tier3": { + "total": 26, + "passed": 19, + "duration_s": 1127.4, + "pass_rate": 73.1 + }, + "tier2": { + "total": 37, + "passed": 31, + "duration_s": 1857.0, + "pass_rate": 83.8 + }, + "tier4": { + "total": 3, + "passed": 3, + "duration_s": 71.8, + "pass_rate": 100.0 + }, + "progressive": { + "total": 104, + "passed": 97, + "duration_s": 1529.1, + "pass_rate": 93.3 + } + }, + "tests": [ + { + "test_id": "tests/llm/test_01_setup.py::test_create_baseline_model", + "passed": true, + "duration_s": 14.8, + "tier": "setup", + "attempt": 1, + "num_turns": 2, + "cost_usd": 0.0755098, + "duration_ms": 11773, + "input_tokens": 18, + "output_tokens": 699, + "cache_read_tokens": 67618, + "tool_calls": [ + "create_baseline_osm" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "mcp__openstudio__create_baseline_osm" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_01_setup.py::test_create_baseline_with_hvac", + "passed": true, + "duration_s": 15.0, + "tier": "setup", + "attempt": 1, + "num_turns": 2, + "cost_usd": 0.02596255, + "duration_ms": 12951, + "input_tokens": 18, + "output_tokens": 790, + "cache_read_tokens": 111158, + "tool_calls": [ + "create_baseline_osm" + ], + "num_tool_calls": 1, + "all_tool_calls": [ 
+ "mcp__openstudio__create_baseline_osm" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_01_setup.py::test_create_example_model", + "passed": true, + "duration_s": 8.5, + "tier": "setup", + "attempt": 1, + "num_turns": 2, + "cost_usd": 0.0238726, + "duration_ms": 6325, + "input_tokens": 18, + "output_tokens": 442, + "cache_read_tokens": 111146, + "tool_calls": [ + "create_example_osm" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "mcp__openstudio__create_example_osm" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_01_setup.py::test_load_baseline_model", + "passed": true, + "duration_s": 6.9, + "tier": "setup", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.04039965, + "duration_ms": 4790, + "input_tokens": 26, + "output_tokens": 453, + "cache_read_tokens": 162699, + "tool_calls": [ + "load_osm_model", + "list_thermal_zones" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_thermal_zones" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_01_setup.py::test_run_baseline_simulation", + "passed": true, + "duration_s": 21.1, + "tier": "setup", + "attempt": 1, + "num_turns": 8, + "cost_usd": 0.06312655, + "duration_ms": 18998, + "input_tokens": 58, + "output_tokens": 1381, + "cache_read_tokens": 417048, + "tool_calls": [ + "load_osm_model", + "change_building_location", + "run_simulation", + "get_run_status", + "run_simulation", + "get_run_status", + "get_run_status" + ], + "num_tool_calls": 7, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__change_building_location", + "mcp__openstudio__run_simulation", + "mcp__openstudio__get_run_status", + "mcp__openstudio__run_simulation", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": 
"tests/llm/test_01_setup.py::test_run_retrofit_simulation", + "passed": true, + "duration_s": 47.4, + "tier": "setup", + "attempt": 1, + "num_turns": 9, + "cost_usd": 0.07618575, + "duration_ms": 45309, + "input_tokens": 74, + "output_tokens": 1520, + "cache_read_tokens": 541830, + "tool_calls": [ + "load_osm_model", + "change_building_location", + "adjust_thermostat_setpoints", + "run_simulation", + "get_run_status", + "save_osm_model", + "run_simulation", + "get_run_status" + ], + "num_tool_calls": 8, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__change_building_location", + "mcp__openstudio__adjust_thermostat_setpoints", + "mcp__openstudio__run_simulation", + "mcp__openstudio__get_run_status", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "mcp__openstudio__get_run_status" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_02_tool_selection.py::test_tool_selection_no_model[What is the server status?]", + "passed": true, + "duration_s": 4.8, + "tier": "tier1", + "attempt": 1, + "num_turns": 2, + "cost_usd": 0.0223679, + "duration_ms": 2735, + "input_tokens": 18, + "output_tokens": 196, + "cache_read_tokens": 111124, + "tool_calls": [ + "get_server_status" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "mcp__openstudio__get_server_status" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_02_tool_selection.py::test_tool_selection_no_model[List available skills]", + "passed": true, + "duration_s": 7.4, + "tier": "tier1", + "attempt": 1, + "num_turns": 2, + "cost_usd": 0.03345375, + "duration_ms": 5309, + "input_tokens": 18, + "output_tokens": 418, + "cache_read_tokens": 103070, + "tool_calls": [ + "list_skills" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "mcp__openstudio__list_skills" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": 
"tests/llm/test_02_tool_selection.py::test_tool_selection_no_model[Create a small office building usin]", + "passed": true, + "duration_s": 45.2, + "tier": "tier1", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.0637988, + "duration_ms": 43128, + "input_tokens": 42, + "output_tokens": 1630, + "cache_read_tokens": 305868, + "tool_calls": [ + "create_new_building", + "create_new_building", + "list_weather_files", + "create_new_building" + ], + "num_tool_calls": 4, + "all_tool_calls": [ + "mcp__openstudio__create_new_building", + "mcp__openstudio__create_new_building", + "mcp__openstudio__list_weather_files", + "mcp__openstudio__create_new_building" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_02_tool_selection.py::test_tool_selection_no_model[Create bar geometry for a retail bu]", + "passed": true, + "duration_s": 18.5, + "tier": "tier1", + "attempt": 1, + "num_turns": 2, + "cost_usd": 0.0305611, + "duration_ms": 16545, + "input_tokens": 18, + "output_tokens": 1266, + "cache_read_tokens": 111131, + "tool_calls": [ + "create_bar_building" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "mcp__openstudio__create_bar_building" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_03_eval_cases.py::test_eval_tool_selection[add-hvac:Add HVAC to the model]", + "passed": true, + "duration_s": 14.6, + "tier": "tier3", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.03403955, + "duration_ms": 12541, + "input_tokens": 26, + "output_tokens": 939, + "cache_read_tokens": 171098, + "tool_calls": [ + "load_osm_model", + "get_building_info", + "list_thermal_zones" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__get_building_info", + "mcp__openstudio__list_thermal_zones" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_03_eval_cases.py::test_eval_tool_selection[add-hvac:Set up heating and 
cooling]", + "passed": true, + "duration_s": 18.9, + "tier": "tier3", + "attempt": 1, + "num_turns": 8, + "cost_usd": 0.0605092, + "duration_ms": 16861, + "input_tokens": 50, + "output_tokens": 1544, + "cache_read_tokens": 358792, + "tool_calls": [ + "load_osm_model", + "get_building_info", + "list_thermal_zones", + "get_weather_info", + "list_baseline_systems", + "add_baseline_system", + "save_osm_model" + ], + "num_tool_calls": 7, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__get_building_info", + "mcp__openstudio__list_thermal_zones", + "mcp__openstudio__get_weather_info", + "mcp__openstudio__list_baseline_systems", + "mcp__openstudio__add_baseline_system", + "mcp__openstudio__save_osm_model" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_03_eval_cases.py::test_eval_tool_selection[add-hvac:What HVAC system should I use?]", + "passed": true, + "duration_s": 17.4, + "tier": "tier3", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.03707715, + "duration_ms": 15174, + "input_tokens": 26, + "output_tokens": 1171, + "cache_read_tokens": 171099, + "tool_calls": [ + "load_osm_model", + "get_building_info", + "list_baseline_systems", + "recommend_tools" + ], + "num_tool_calls": 4, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__get_building_info", + "mcp__openstudio__list_baseline_systems", + "mcp__openstudio__recommend_tools" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_03_eval_cases.py::test_eval_tool_selection[add-hvac:Add a VAV system]", + "passed": true, + "duration_s": 19.2, + "tier": "tier3", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.0362682, + "duration_ms": 17093, + "input_tokens": 26, + "output_tokens": 1064, + "cache_read_tokens": 171897, + "tool_calls": [ + "load_osm_model", + "list_thermal_zones", + "add_baseline_system" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + 
"mcp__openstudio__load_osm_model", + "mcp__openstudio__list_thermal_zones", + "mcp__openstudio__add_baseline_system" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_03_eval_cases.py::test_eval_tool_selection[energy-report:Give me a full energy report]", + "passed": false, + "duration_s": 57.3, + "tier": "tier3", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.0362682, + "duration_ms": 17093, + "input_tokens": 26, + "output_tokens": 1064, + "cache_read_tokens": 171897, + "tool_calls": [ + "load_osm_model", + "list_thermal_zones", + "add_baseline_system" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_thermal_zones", + "mcp__openstudio__add_baseline_system" + ], + "toolsearch_count": 0, + "is_timeout": false, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_03_eval_cases.py::test_eval_tool_selection[new-building:Create a small office building]", + "passed": true, + "duration_s": 55.0, + "tier": "tier3", + "attempt": 1, + "num_turns": 7, + "cost_usd": 0.08970605000000001, + "duration_ms": 52890, + "input_tokens": 58, + "output_tokens": 2665, + "cache_read_tokens": 456893, + "tool_calls": [ + "create_new_building", + "create_new_building", + "list_weather_files", + "create_new_building", + "create_bar_building", + "create_baseline_osm" + ], + "num_tool_calls": 6, + "all_tool_calls": [ + "mcp__openstudio__create_new_building", + "mcp__openstudio__create_new_building", + "mcp__openstudio__list_weather_files", + "mcp__openstudio__create_new_building", + "mcp__openstudio__create_bar_building", + "mcp__openstudio__create_baseline_osm" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_03_eval_cases.py::test_eval_tool_selection[new-building:Model a 3-story school]", + "passed": true, + "duration_s": 131.7, + "tier": "tier3", + "attempt": 1, + "num_turns": 10, + "cost_usd": 0.09154014999999999, + "duration_ms": 
129735, + "input_tokens": 58, + "output_tokens": 2707, + "cache_read_tokens": 435309, + "tool_calls": [ + "list_skills", + "get_skill", + "list_weather_files", + "create_new_building", + "get_building_info", + "list_air_loops", + "list_plant_loops", + "view_model" + ], + "num_tool_calls": 8, + "all_tool_calls": [ + "mcp__openstudio__list_skills", + "mcp__openstudio__get_skill", + "AskUserQuestion", + "mcp__openstudio__list_weather_files", + "mcp__openstudio__create_new_building", + "mcp__openstudio__get_building_info", + "mcp__openstudio__list_air_loops", + "mcp__openstudio__list_plant_loops", + "mcp__openstudio__view_model" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_03_eval_cases.py::test_eval_tool_selection[new-building:Create a retail building, 25000 sqf]", + "passed": true, + "duration_s": 71.2, + "tier": "tier3", + "attempt": 1, + "num_turns": 8, + "cost_usd": 0.09511269999999998, + "duration_ms": 69092, + "input_tokens": 66, + "output_tokens": 2892, + "cache_read_tokens": 516317, + "tool_calls": [ + "create_new_building", + "create_new_building", + "create_bar_building", + "create_bar_building", + "list_weather_files", + "create_new_building", + "save_osm_model" + ], + "num_tool_calls": 7, + "all_tool_calls": [ + "mcp__openstudio__create_new_building", + "mcp__openstudio__create_new_building", + "mcp__openstudio__create_bar_building", + "mcp__openstudio__create_bar_building", + "mcp__openstudio__list_weather_files", + "mcp__openstudio__create_new_building", + "mcp__openstudio__save_osm_model" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_03_eval_cases.py::test_eval_tool_selection[new-building:Import the FloorspaceJS floor plan ]", + "passed": true, + "duration_s": 18.1, + "tier": "tier3", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.0420962, + "duration_ms": 16032, + "input_tokens": 34, + "output_tokens": 1149, + "cache_read_tokens": 232722, + "tool_calls": [ + 
"import_floorspacejs", + "list_files", + "import_floorspacejs" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "mcp__openstudio__import_floorspacejs", + "mcp__openstudio__list_files", + "mcp__openstudio__import_floorspacejs" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_03_eval_cases.py::test_eval_tool_selection[new-building:Create a bar building for a medium ]", + "passed": true, + "duration_s": 17.6, + "tier": "tier3", + "attempt": 1, + "num_turns": 2, + "cost_usd": 0.0294487, + "duration_ms": 15540, + "input_tokens": 18, + "output_tokens": 1054, + "cache_read_tokens": 111132, + "tool_calls": [ + "create_bar_building" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "mcp__openstudio__create_bar_building" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_03_eval_cases.py::test_eval_tool_selection[qaqc:Check the model for issues]", + "passed": true, + "duration_s": 18.9, + "tier": "tier3", + "attempt": 1, + "num_turns": 11, + "cost_usd": 0.046837500000000004, + "duration_ms": 16877, + "input_tokens": 34, + "output_tokens": 1196, + "cache_read_tokens": 232010, + "tool_calls": [ + "load_osm_model", + "validate_model", + "get_model_summary", + "get_building_info", + "get_weather_info", + "list_air_loops", + "list_plant_loops", + "list_thermal_zones", + "get_simulation_control", + "get_run_period" + ], + "num_tool_calls": 10, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__validate_model", + "mcp__openstudio__get_model_summary", + "mcp__openstudio__get_building_info", + "mcp__openstudio__get_weather_info", + "mcp__openstudio__list_air_loops", + "mcp__openstudio__list_plant_loops", + "mcp__openstudio__list_thermal_zones", + "mcp__openstudio__get_simulation_control", + "mcp__openstudio__get_run_period" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_03_eval_cases.py::test_eval_tool_selection[qaqc:Validate 
before simulation]", + "passed": false, + "duration_s": 10.5, + "tier": "tier3", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.030294300000000003, + "duration_ms": 8284, + "input_tokens": 26, + "output_tokens": 454, + "cache_read_tokens": 170783, + "tool_calls": [ + "load_osm_model", + "validate_model" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__validate_model" + ], + "toolsearch_count": 0, + "is_timeout": false, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_03_eval_cases.py::test_eval_tool_selection[qaqc:QA/QC the model]", + "passed": false, + "duration_s": 10.0, + "tier": "tier3", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.03122435, + "duration_ms": 7880, + "input_tokens": 26, + "output_tokens": 644, + "cache_read_tokens": 171221, + "tool_calls": [ + "load_osm_model", + "validate_model" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__validate_model" + ], + "toolsearch_count": 0, + "is_timeout": false, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_03_eval_cases.py::test_eval_tool_selection[qaqc:Is my model ready to simulate?]", + "passed": false, + "duration_s": 15.0, + "tier": "tier3", + "attempt": 1, + "num_turns": 8, + "cost_usd": 0.048901749999999994, + "duration_ms": 12585, + "input_tokens": 34, + "output_tokens": 835, + "cache_read_tokens": 222965, + "tool_calls": [ + "load_osm_model", + "validate_model", + "get_weather_info", + "get_building_info", + "get_simulation_control", + "list_air_loops", + "list_plant_loops" + ], + "num_tool_calls": 7, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__validate_model", + "mcp__openstudio__get_weather_info", + "mcp__openstudio__get_building_info", + "mcp__openstudio__get_simulation_control", + "mcp__openstudio__list_air_loops", + "mcp__openstudio__list_plant_loops" + ], + "toolsearch_count": 0, + "is_timeout": 
false, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_03_eval_cases.py::test_eval_tool_selection[retrofit:Compare before and after adding ins]", + "passed": true, + "duration_s": 160.4, + "tier": "tier3", + "attempt": 1, + "num_turns": 41, + "cost_usd": 0.35133839999999994, + "duration_ms": 158073, + "input_tokens": 306, + "output_tokens": 9244, + "cache_read_tokens": 2637274, + "tool_calls": [ + "load_osm_model", + "validate_model", + "run_simulation", + "get_run_status", + "get_run_status", + "get_run_status", + "get_run_status", + "extract_summary_metrics", + "extract_end_use_breakdown", + "list_common_measures", + "create_measure", + "apply_measure", + "edit_measure", + "apply_measure", + "get_skill", + "edit_measure", + "apply_measure", + "save_osm_model", + "run_simulation", + "get_run_status", + "load_osm_model", + "edit_measure", + "apply_measure", + "search_api", + "edit_measure", + "apply_measure", + "list_surfaces", + "edit_measure", + "apply_measure", + "edit_measure", + "apply_measure", + "save_osm_model", + "run_simulation", + "get_run_status", + "get_run_status", + "get_run_status", + "get_run_status", + "extract_summary_metrics", + "extract_end_use_breakdown", + "compare_runs" + ], + "num_tool_calls": 40, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__validate_model", + "mcp__openstudio__run_simulation", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__extract_summary_metrics", + "mcp__openstudio__extract_end_use_breakdown", + "mcp__openstudio__list_common_measures", + "mcp__openstudio__create_measure", + "mcp__openstudio__apply_measure", + "mcp__openstudio__edit_measure", + "mcp__openstudio__apply_measure", + "mcp__openstudio__get_skill", + "mcp__openstudio__edit_measure", + "mcp__openstudio__apply_measure", + "mcp__openstudio__save_osm_model", + 
"mcp__openstudio__run_simulation", + "mcp__openstudio__get_run_status", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__edit_measure", + "mcp__openstudio__apply_measure", + "mcp__openstudio__search_api", + "mcp__openstudio__edit_measure", + "mcp__openstudio__apply_measure", + "mcp__openstudio__list_surfaces", + "mcp__openstudio__edit_measure", + "mcp__openstudio__apply_measure", + "mcp__openstudio__edit_measure", + "mcp__openstudio__apply_measure", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__extract_summary_metrics", + "mcp__openstudio__extract_end_use_breakdown", + "mcp__openstudio__compare_runs" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_03_eval_cases.py::test_eval_tool_selection[retrofit:Do a retrofit analysis]", + "passed": true, + "duration_s": 84.7, + "tier": "tier3", + "attempt": 1, + "num_turns": 52, + "cost_usd": 0.28222174999999994, + "duration_ms": 82529, + "input_tokens": 242, + "output_tokens": 7365, + "cache_read_tokens": 2068535, + "tool_calls": [ + "load_osm_model", + "get_building_info", + "get_model_summary", + "list_air_loops", + "list_plant_loops", + "get_weather_info", + "save_osm_model", + "run_simulation", + "get_run_status", + "get_run_status", + "get_run_status", + "get_run_status", + "get_run_status", + "get_run_status", + "get_run_status", + "get_run_status", + "extract_summary_metrics", + "extract_end_use_breakdown", + "load_osm_model", + "list_surfaces", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "save_osm_model", + "run_simulation", + "load_osm_model", + "replace_air_terminals", + 
"save_osm_model", + "run_simulation", + "load_osm_model", + "add_rooftop_pv", + "save_osm_model", + "run_simulation", + "get_run_status", + "get_run_status", + "get_run_status", + "get_run_status", + "get_run_status", + "get_run_status", + "get_run_status", + "extract_summary_metrics", + "extract_summary_metrics", + "extract_summary_metrics", + "extract_end_use_breakdown", + "extract_end_use_breakdown", + "extract_end_use_breakdown" + ], + "num_tool_calls": 51, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__get_building_info", + "mcp__openstudio__get_model_summary", + "mcp__openstudio__list_air_loops", + "mcp__openstudio__list_plant_loops", + "mcp__openstudio__get_weather_info", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__extract_summary_metrics", + "mcp__openstudio__extract_end_use_breakdown", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_surfaces", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__replace_air_terminals", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__add_rooftop_pv", + "mcp__openstudio__save_osm_model", + 
"mcp__openstudio__run_simulation", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__extract_summary_metrics", + "mcp__openstudio__extract_summary_metrics", + "mcp__openstudio__extract_summary_metrics", + "mcp__openstudio__extract_end_use_breakdown", + "mcp__openstudio__extract_end_use_breakdown", + "mcp__openstudio__extract_end_use_breakdown" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_03_eval_cases.py::test_eval_tool_selection[simulate:Run a simulation]", + "passed": false, + "duration_s": 29.3, + "tier": "tier3", + "attempt": 1, + "num_turns": 52, + "cost_usd": 0.28222174999999994, + "duration_ms": 82529, + "input_tokens": 242, + "output_tokens": 7365, + "cache_read_tokens": 2068535, + "tool_calls": [ + "load_osm_model", + "get_building_info", + "get_model_summary", + "list_air_loops", + "list_plant_loops", + "get_weather_info", + "save_osm_model", + "run_simulation", + "get_run_status", + "get_run_status", + "get_run_status", + "get_run_status", + "get_run_status", + "get_run_status", + "get_run_status", + "get_run_status", + "extract_summary_metrics", + "extract_end_use_breakdown", + "load_osm_model", + "list_surfaces", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "save_osm_model", + "run_simulation", + "load_osm_model", + "replace_air_terminals", + "save_osm_model", + "run_simulation", + "load_osm_model", + "add_rooftop_pv", + "save_osm_model", + "run_simulation", + "get_run_status", + "get_run_status", + "get_run_status", + "get_run_status", + "get_run_status", + "get_run_status", + 
"get_run_status", + "extract_summary_metrics", + "extract_summary_metrics", + "extract_summary_metrics", + "extract_end_use_breakdown", + "extract_end_use_breakdown", + "extract_end_use_breakdown" + ], + "num_tool_calls": 51, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__get_building_info", + "mcp__openstudio__get_model_summary", + "mcp__openstudio__list_air_loops", + "mcp__openstudio__list_plant_loops", + "mcp__openstudio__get_weather_info", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__extract_summary_metrics", + "mcp__openstudio__extract_end_use_breakdown", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_surfaces", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__replace_air_terminals", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__add_rooftop_pv", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + 
"mcp__openstudio__get_run_status", + "mcp__openstudio__extract_summary_metrics", + "mcp__openstudio__extract_summary_metrics", + "mcp__openstudio__extract_summary_metrics", + "mcp__openstudio__extract_end_use_breakdown", + "mcp__openstudio__extract_end_use_breakdown", + "mcp__openstudio__extract_end_use_breakdown" + ], + "toolsearch_count": 0, + "is_timeout": false, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_03_eval_cases.py::test_eval_tool_selection[simulate:Simulate the model]", + "passed": false, + "duration_s": 28.2, + "tier": "tier3", + "attempt": 1, + "num_turns": 52, + "cost_usd": 0.28222174999999994, + "duration_ms": 82529, + "input_tokens": 242, + "output_tokens": 7365, + "cache_read_tokens": 2068535, + "tool_calls": [ + "load_osm_model", + "get_building_info", + "get_model_summary", + "list_air_loops", + "list_plant_loops", + "get_weather_info", + "save_osm_model", + "run_simulation", + "get_run_status", + "get_run_status", + "get_run_status", + "get_run_status", + "get_run_status", + "get_run_status", + "get_run_status", + "get_run_status", + "extract_summary_metrics", + "extract_end_use_breakdown", + "load_osm_model", + "list_surfaces", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "save_osm_model", + "run_simulation", + "load_osm_model", + "replace_air_terminals", + "save_osm_model", + "run_simulation", + "load_osm_model", + "add_rooftop_pv", + "save_osm_model", + "run_simulation", + "get_run_status", + "get_run_status", + "get_run_status", + "get_run_status", + "get_run_status", + "get_run_status", + "get_run_status", + "extract_summary_metrics", + "extract_summary_metrics", + "extract_summary_metrics", + "extract_end_use_breakdown", + "extract_end_use_breakdown", + "extract_end_use_breakdown" + ], + "num_tool_calls": 51, + 
"all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__get_building_info", + "mcp__openstudio__get_model_summary", + "mcp__openstudio__list_air_loops", + "mcp__openstudio__list_plant_loops", + "mcp__openstudio__get_weather_info", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__extract_summary_metrics", + "mcp__openstudio__extract_end_use_breakdown", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_surfaces", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__replace_air_terminals", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__add_rooftop_pv", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__extract_summary_metrics", + "mcp__openstudio__extract_summary_metrics", + "mcp__openstudio__extract_summary_metrics", + "mcp__openstudio__extract_end_use_breakdown", + 
"mcp__openstudio__extract_end_use_breakdown", + "mcp__openstudio__extract_end_use_breakdown" + ], + "toolsearch_count": 0, + "is_timeout": false, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_03_eval_cases.py::test_eval_tool_selection[simulate:Run EnergyPlus]", + "passed": true, + "duration_s": 23.9, + "tier": "tier3", + "attempt": 1, + "num_turns": 12, + "cost_usd": 0.10094320000000001, + "duration_ms": 21852, + "input_tokens": 90, + "output_tokens": 1959, + "cache_read_tokens": 651932, + "tool_calls": [ + "load_osm_model", + "get_weather_info", + "run_simulation", + "get_run_status", + "get_run_status", + "get_run_status", + "get_run_status", + "get_run_status", + "get_run_status", + "extract_summary_metrics", + "extract_end_use_breakdown" + ], + "num_tool_calls": 11, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__get_weather_info", + "mcp__openstudio__run_simulation", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__extract_summary_metrics", + "mcp__openstudio__extract_end_use_breakdown" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_03_eval_cases.py::test_eval_tool_selection[troubleshoot:My simulation failed]", + "passed": false, + "duration_s": 57.9, + "tier": "tier3", + "attempt": 1, + "num_turns": 12, + "cost_usd": 0.10094320000000001, + "duration_ms": 21852, + "input_tokens": 90, + "output_tokens": 1959, + "cache_read_tokens": 651932, + "tool_calls": [ + "load_osm_model", + "get_weather_info", + "run_simulation", + "get_run_status", + "get_run_status", + "get_run_status", + "get_run_status", + "get_run_status", + "get_run_status", + "extract_summary_metrics", + "extract_end_use_breakdown" + ], + "num_tool_calls": 11, + "all_tool_calls": [ + 
"mcp__openstudio__load_osm_model", + "mcp__openstudio__get_weather_info", + "mcp__openstudio__run_simulation", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__extract_summary_metrics", + "mcp__openstudio__extract_end_use_breakdown" + ], + "toolsearch_count": 0, + "is_timeout": false, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_03_eval_cases.py::test_eval_tool_selection[troubleshoot:EUI looks way too high]", + "passed": true, + "duration_s": 98.8, + "tier": "tier3", + "attempt": 1, + "num_turns": 44, + "cost_usd": 0.3259123500000001, + "duration_ms": 96692, + "input_tokens": 266, + "output_tokens": 7334, + "cache_read_tokens": 2409326, + "tool_calls": [ + "load_osm_model", + "extract_summary_metrics", + "get_run_status", + "extract_simulation_errors", + "get_weather_info", + "get_building_info", + "change_building_location", + "save_osm_model", + "validate_model", + "run_simulation", + "get_run_status", + "get_run_status", + "get_run_status", + "get_run_logs", + "get_run_logs", + "get_run_status", + "extract_summary_metrics", + "extract_end_use_breakdown", + "list_air_loops", + "list_plant_loops", + "list_zone_hvac_equipment", + "get_plant_loop_details", + "get_component_properties", + "get_simulation_control", + "extract_hvac_sizing", + "extract_component_sizing", + "get_setpoint_manager_properties", + "extract_component_sizing", + "get_schedule_details", + "get_object_fields", + "list_model_objects", + "list_model_objects", + "get_schedule_details", + "get_object_fields", + "list_thermal_zones", + "get_schedule_details", + "get_schedule_details", + "get_object_fields", + "list_spaces", + "get_space_details", + "get_space_type_details", + "list_model_objects", + "get_load_details" + ], + "num_tool_calls": 43, + "all_tool_calls": [ + 
"mcp__openstudio__load_osm_model", + "mcp__openstudio__extract_summary_metrics", + "mcp__openstudio__get_run_status", + "mcp__openstudio__extract_simulation_errors", + "mcp__openstudio__get_weather_info", + "mcp__openstudio__get_building_info", + "mcp__openstudio__change_building_location", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__validate_model", + "mcp__openstudio__run_simulation", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_logs", + "mcp__openstudio__get_run_logs", + "mcp__openstudio__get_run_status", + "mcp__openstudio__extract_summary_metrics", + "mcp__openstudio__extract_end_use_breakdown", + "mcp__openstudio__list_air_loops", + "mcp__openstudio__list_plant_loops", + "mcp__openstudio__list_zone_hvac_equipment", + "mcp__openstudio__get_plant_loop_details", + "mcp__openstudio__get_component_properties", + "mcp__openstudio__get_simulation_control", + "mcp__openstudio__extract_hvac_sizing", + "mcp__openstudio__extract_component_sizing", + "mcp__openstudio__get_setpoint_manager_properties", + "mcp__openstudio__extract_component_sizing", + "mcp__openstudio__get_schedule_details", + "mcp__openstudio__get_object_fields", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__get_schedule_details", + "mcp__openstudio__get_object_fields", + "mcp__openstudio__list_thermal_zones", + "mcp__openstudio__get_schedule_details", + "mcp__openstudio__get_schedule_details", + "mcp__openstudio__get_object_fields", + "mcp__openstudio__list_spaces", + "mcp__openstudio__get_space_details", + "mcp__openstudio__get_space_type_details", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__get_load_details" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_03_eval_cases.py::test_eval_tool_selection[troubleshoot:Too many unmet hours]", + "passed": true, + "duration_s": 120.1, + "tier": 
"tier3", + "attempt": 1, + "num_turns": 0, + "cost_usd": 0.0, + "duration_ms": 0, + "input_tokens": 0, + "output_tokens": 0, + "cache_read_tokens": 0, + "tool_calls": [ + "load_osm_model", + "extract_summary_metrics", + "get_run_status", + "extract_simulation_errors", + "list_weather_files", + "change_building_location", + "save_osm_model", + "run_simulation", + "get_run_status", + "extract_summary_metrics", + "extract_simulation_errors", + "list_output_variables", + "load_osm_model", + "add_output_meter", + "add_output_meter", + "add_output_variable", + "add_output_variable", + "add_output_variable", + "save_osm_model", + "run_simulation", + "get_run_status", + "extract_summary_metrics", + "get_building_info", + "list_thermal_zones", + "list_air_loops", + "list_plant_loops", + "validate_model", + "get_run_logs", + "change_building_location", + "save_osm_model", + "run_simulation", + "get_run_status", + "get_run_status", + "get_run_status", + "get_run_status", + "extract_summary_metrics", + "extract_end_use_breakdown", + "extract_hvac_sizing", + "extract_component_sizing", + "get_component_properties", + "get_plant_loop_details", + "extract_component_sizing", + "query_timeseries", + "list_output_variables", + "load_osm_model", + "set_component_properties", + "set_component_properties", + "search_api" + ], + "num_tool_calls": 48, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__extract_summary_metrics", + "mcp__openstudio__get_run_status", + "mcp__openstudio__extract_simulation_errors", + "mcp__openstudio__list_weather_files", + "mcp__openstudio__change_building_location", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "mcp__openstudio__get_run_status", + "mcp__openstudio__extract_summary_metrics", + "mcp__openstudio__extract_simulation_errors", + "mcp__openstudio__list_output_variables", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__add_output_meter", + "mcp__openstudio__add_output_meter", + 
"mcp__openstudio__add_output_variable", + "mcp__openstudio__add_output_variable", + "mcp__openstudio__add_output_variable", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "mcp__openstudio__get_run_status", + "mcp__openstudio__extract_summary_metrics", + "mcp__openstudio__get_building_info", + "mcp__openstudio__list_thermal_zones", + "mcp__openstudio__list_air_loops", + "mcp__openstudio__list_plant_loops", + "mcp__openstudio__validate_model", + "mcp__openstudio__get_run_logs", + "mcp__openstudio__change_building_location", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__extract_summary_metrics", + "mcp__openstudio__extract_end_use_breakdown", + "mcp__openstudio__extract_hvac_sizing", + "mcp__openstudio__extract_component_sizing", + "mcp__openstudio__get_component_properties", + "mcp__openstudio__get_plant_loop_details", + "mcp__openstudio__extract_component_sizing", + "mcp__openstudio__query_timeseries", + "mcp__openstudio__list_output_variables", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__set_component_properties", + "mcp__openstudio__set_component_properties", + "mcp__openstudio__search_api" + ], + "toolsearch_count": 0, + "is_timeout": true + }, + { + "test_id": "tests/llm/test_03_eval_cases.py::test_eval_tool_selection[troubleshoot:Why did EnergyPlus crash?]", + "passed": true, + "duration_s": 9.3, + "tier": "tier3", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.02634545, + "duration_ms": 7256, + "input_tokens": 18, + "output_tokens": 713, + "cache_read_tokens": 111187, + "tool_calls": [ + "load_osm_model", + "get_run_status", + "extract_simulation_errors" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__get_run_status", + "mcp__openstudio__extract_simulation_errors" + 
], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_03_eval_cases.py::test_eval_tool_selection[view:Show me the model]", + "passed": true, + "duration_s": 18.8, + "tier": "tier3", + "attempt": 1, + "num_turns": 8, + "cost_usd": 0.0459619, + "duration_ms": 16780, + "input_tokens": 34, + "output_tokens": 1027, + "cache_read_tokens": 232504, + "tool_calls": [ + "load_osm_model", + "get_model_summary", + "get_building_info", + "view_model", + "list_thermal_zones", + "list_air_loops", + "list_plant_loops" + ], + "num_tool_calls": 7, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__get_model_summary", + "mcp__openstudio__get_building_info", + "mcp__openstudio__view_model", + "mcp__openstudio__list_thermal_zones", + "mcp__openstudio__list_air_loops", + "mcp__openstudio__list_plant_loops" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_03_eval_cases.py::test_eval_tool_selection[view:Visualize the building]", + "passed": true, + "duration_s": 12.0, + "tier": "tier3", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.024481500000000003, + "duration_ms": 9955, + "input_tokens": 18, + "output_tokens": 500, + "cache_read_tokens": 111160, + "tool_calls": [ + "load_osm_model", + "view_model" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__view_model" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_03_eval_cases.py::test_eval_tool_selection[view:3D view]", + "passed": true, + "duration_s": 8.6, + "tier": "tier3", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.023931499999999998, + "duration_ms": 6584, + "input_tokens": 18, + "output_tokens": 393, + "cache_read_tokens": 111160, + "tool_calls": [ + "load_osm_model", + "view_model" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__view_model" + ], + "toolsearch_count": 
0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[systemd_fourpipebeam_e2e]", + "passed": true, + "duration_s": 308.5, + "tier": "tier2", + "attempt": 1, + "num_turns": 34, + "cost_usd": 0.3383652, + "duration_ms": 306345, + "input_tokens": 258, + "output_tokens": 9880, + "cache_read_tokens": 2417547, + "tool_calls": [ + "load_osm_model", + "list_weather_files", + "change_building_location", + "save_osm_model", + "run_simulation", + "get_run_status", + "get_run_status", + "extract_summary_metrics", + "list_air_loops", + "list_plant_loops", + "search_wiring_patterns", + "create_measure", + "test_measure", + "edit_measure", + "test_measure", + "apply_measure", + "edit_measure", + "apply_measure", + "save_osm_model", + "run_simulation", + "get_run_status", + "extract_summary_metrics", + "compare_runs", + "copy_file" + ], + "num_tool_calls": 24, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_weather_files", + "Bash", + "Bash", + "Bash", + "Bash", + "Bash", + "mcp__openstudio__change_building_location", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "mcp__openstudio__get_run_status", + "Bash", + "mcp__openstudio__get_run_status", + "mcp__openstudio__extract_summary_metrics", + "mcp__openstudio__list_air_loops", + "mcp__openstudio__list_plant_loops", + "mcp__openstudio__search_wiring_patterns", + "mcp__openstudio__create_measure", + "mcp__openstudio__test_measure", + "mcp__openstudio__edit_measure", + "mcp__openstudio__test_measure", + "Read", + "Bash", + "mcp__openstudio__apply_measure", + "mcp__openstudio__edit_measure", + "mcp__openstudio__apply_measure", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "Bash", + "mcp__openstudio__get_run_status", + "mcp__openstudio__extract_summary_metrics", + "mcp__openstudio__compare_runs", + "mcp__openstudio__copy_file" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": 
"tests/llm/test_04_workflows.py::test_workflow[add_vav_reheat]", + "passed": true, + "duration_s": 23.9, + "tier": "tier2", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.041057350000000006, + "duration_ms": 21867, + "input_tokens": 34, + "output_tokens": 1009, + "cache_read_tokens": 231846, + "tool_calls": [ + "load_osm_model", + "list_thermal_zones", + "add_baseline_system" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_thermal_zones", + "mcp__openstudio__add_baseline_system" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[add_doas]", + "passed": true, + "duration_s": 15.1, + "tier": "tier2", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.05114525, + "duration_ms": 13030, + "input_tokens": 42, + "output_tokens": 1391, + "cache_read_tokens": 294245, + "tool_calls": [ + "load_osm_model", + "list_thermal_zones", + "add_doas_system", + "save_osm_model" + ], + "num_tool_calls": 4, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_thermal_zones", + "mcp__openstudio__add_doas_system", + "mcp__openstudio__save_osm_model" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[add_vrf]", + "passed": true, + "duration_s": 11.4, + "tier": "tier2", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.047175550000000004, + "duration_ms": 9319, + "input_tokens": 42, + "output_tokens": 928, + "cache_read_tokens": 293048, + "tool_calls": [ + "load_osm_model", + "list_thermal_zones", + "add_vrf_system", + "save_osm_model" + ], + "num_tool_calls": 4, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_thermal_zones", + "mcp__openstudio__add_vrf_system", + "mcp__openstudio__save_osm_model" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": 
"tests/llm/test_04_workflows.py::test_workflow[set_weather]", + "passed": true, + "duration_s": 14.8, + "tier": "tier2", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.03265175, + "duration_ms": 12685, + "input_tokens": 26, + "output_tokens": 822, + "cache_read_tokens": 171395, + "tool_calls": [ + "load_osm_model", + "change_building_location" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__change_building_location" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[add_rooftop_pv]", + "passed": true, + "duration_s": 11.8, + "tier": "tier2", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.030936500000000002, + "duration_ms": 9586, + "input_tokens": 26, + "output_tokens": 523, + "cache_read_tokens": 171180, + "tool_calls": [ + "load_osm_model", + "add_rooftop_pv" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__add_rooftop_pv" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[adjust_thermostat]", + "passed": true, + "duration_s": 18.7, + "tier": "tier2", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.031848999999999995, + "duration_ms": 16536, + "input_tokens": 26, + "output_tokens": 702, + "cache_read_tokens": 171280, + "tool_calls": [ + "load_osm_model", + "adjust_thermostat_setpoints" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__adjust_thermostat_setpoints" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[delete_space]", + "passed": true, + "duration_s": 9.0, + "tier": "tier2", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.03872505, + "duration_ms": 6975, + "input_tokens": 34, + "output_tokens": 570, + "cache_read_tokens": 231073, + "tool_calls": [ + 
"load_osm_model", + "list_spaces", + "delete_object" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_spaces", + "mcp__openstudio__delete_object" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[qaqc_check]", + "passed": false, + "duration_s": 22.8, + "tier": "tier2", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.06071900000000001, + "duration_ms": 20779, + "input_tokens": 42, + "output_tokens": 1969, + "cache_read_tokens": 294095, + "tool_calls": [ + "load_osm_model", + "validate_model", + "run_simulation", + "get_run_status", + "extract_simulation_errors" + ], + "num_tool_calls": 5, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__validate_model", + "mcp__openstudio__run_simulation", + "mcp__openstudio__get_run_status", + "mcp__openstudio__extract_simulation_errors" + ], + "toolsearch_count": 0, + "is_timeout": false, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[create_bar_office]", + "passed": true, + "duration_s": 15.3, + "tier": "tier2", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.0372755, + "duration_ms": 13284, + "input_tokens": 26, + "output_tokens": 993, + "cache_read_tokens": 172945, + "tool_calls": [ + "create_bar_building", + "list_spaces" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "mcp__openstudio__create_bar_building", + "mcp__openstudio__list_spaces" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[create_new_building]", + "passed": true, + "duration_s": 52.1, + "tier": "tier2", + "attempt": 1, + "num_turns": 2, + "cost_usd": 0.0302877, + "duration_ms": 50079, + "input_tokens": 18, + "output_tokens": 1512, + "cache_read_tokens": 111197, + "tool_calls": [ + "create_new_building" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + 
"mcp__openstudio__create_new_building" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[bar_then_typical]", + "passed": true, + "duration_s": 50.2, + "tier": "tier2", + "attempt": 1, + "num_turns": 8, + "cost_usd": 0.07290845, + "duration_ms": 47949, + "input_tokens": 66, + "output_tokens": 1716, + "cache_read_tokens": 487237, + "tool_calls": [ + "create_bar_building", + "change_building_location", + "create_typical_building", + "read_file" + ], + "num_tool_calls": 4, + "all_tool_calls": [ + "mcp__openstudio__create_bar_building", + "mcp__openstudio__change_building_location", + "mcp__openstudio__create_typical_building", + "Read", + "Read", + "Read", + "mcp__openstudio__read_file" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[import_floorspacejs]", + "passed": false, + "duration_s": 11.7, + "tier": "tier2", + "attempt": 1, + "num_turns": 1, + "cost_usd": 0.01901225, + "duration_ms": 9211, + "input_tokens": 10, + "output_tokens": 748, + "cache_read_tokens": 51535, + "tool_calls": [], + "num_tool_calls": 0, + "all_tool_calls": [], + "toolsearch_count": 0, + "is_timeout": false, + "failure_mode": "no_mcp_tool" + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[floorspacejs_to_typical]", + "passed": false, + "duration_s": 10.9, + "tier": "tier2", + "attempt": 1, + "num_turns": 2, + "cost_usd": 0.0260502, + "duration_ms": 8719, + "input_tokens": 18, + "output_tokens": 821, + "cache_read_tokens": 111272, + "tool_calls": [ + "import_floorspacejs" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "mcp__openstudio__import_floorspacejs" + ], + "toolsearch_count": 0, + "is_timeout": false, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[manual_geometry_match]", + "passed": true, + "duration_s": 20.5, + "tier": "tier2", + "attempt": 1, + "num_turns": 8, + 
"cost_usd": 0.060654950000000006, + "duration_ms": 18438, + "input_tokens": 50, + "output_tokens": 1917, + "cache_read_tokens": 356162, + "tool_calls": [ + "create_example_osm", + "create_space_from_floor_print", + "create_space_from_floor_print", + "match_surfaces", + "list_surfaces", + "list_surfaces", + "save_osm_model" + ], + "num_tool_calls": 7, + "all_tool_calls": [ + "mcp__openstudio__create_example_osm", + "mcp__openstudio__create_space_from_floor_print", + "mcp__openstudio__create_space_from_floor_print", + "mcp__openstudio__match_surfaces", + "mcp__openstudio__list_surfaces", + "mcp__openstudio__list_surfaces", + "mcp__openstudio__save_osm_model" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[envelope_retrofit]", + "passed": false, + "duration_s": 12.1, + "tier": "tier2", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.029497299999999997, + "duration_ms": 9818, + "input_tokens": 18, + "output_tokens": 1017, + "cache_read_tokens": 111193, + "tool_calls": [ + "load_osm_model", + "list_surfaces", + "list_materials" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_surfaces", + "mcp__openstudio__list_materials" + ], + "toolsearch_count": 0, + "is_timeout": false, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[create_and_assign_loads]", + "passed": false, + "duration_s": 12.4, + "tier": "tier2", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.029107, + "duration_ms": 10295, + "input_tokens": 18, + "output_tokens": 1212, + "cache_read_tokens": 111215, + "tool_calls": [ + "load_osm_model", + "list_spaces" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_spaces" + ], + "toolsearch_count": 0, + "is_timeout": false, + "failure_mode": "wrong_tool" + }, + { + "test_id": 
"tests/llm/test_04_workflows.py::test_workflow[plant_loop_with_boiler]", + "passed": true, + "duration_s": 11.0, + "tier": "tier2", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.03807305, + "duration_ms": 8917, + "input_tokens": 34, + "output_tokens": 728, + "cache_read_tokens": 231453, + "tool_calls": [ + "load_osm_model", + "create_plant_loop", + "add_supply_equipment" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__create_plant_loop", + "mcp__openstudio__add_supply_equipment" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[inspect_and_modify_boiler]", + "passed": true, + "duration_s": 14.7, + "tier": "tier2", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.046854, + "duration_ms": 12680, + "input_tokens": 42, + "output_tokens": 974, + "cache_read_tokens": 292845, + "tool_calls": [ + "load_osm_model", + "list_model_objects", + "get_object_fields", + "set_object_property" + ], + "num_tool_calls": 4, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__get_object_fields", + "mcp__openstudio__set_object_property" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[extract_results_chain]", + "passed": true, + "duration_s": 13.2, + "tier": "tier2", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.033176399999999995, + "duration_ms": 10874, + "input_tokens": 26, + "output_tokens": 791, + "cache_read_tokens": 171379, + "tool_calls": [ + "extract_summary_metrics", + "extract_end_use_breakdown", + "get_run_status", + "extract_simulation_errors" + ], + "num_tool_calls": 4, + "all_tool_calls": [ + "mcp__openstudio__extract_summary_metrics", + "mcp__openstudio__extract_end_use_breakdown", + "mcp__openstudio__get_run_status", + "mcp__openstudio__extract_simulation_errors" + ], + "toolsearch_count": 0, + 
"is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[hvac_chilled_beam_comparison]", + "passed": true, + "duration_s": 41.1, + "tier": "tier2", + "attempt": 1, + "num_turns": 18, + "cost_usd": 0.17254125000000003, + "duration_ms": 39096, + "input_tokens": 146, + "output_tokens": 2918, + "cache_read_tokens": 1184190, + "tool_calls": [ + "load_osm_model", + "list_air_loops", + "replace_air_terminals", + "save_osm_model", + "run_simulation", + "get_run_status", + "extract_simulation_errors", + "list_weather_files", + "change_building_location", + "save_osm_model", + "run_simulation", + "get_run_status", + "get_run_status", + "get_run_status", + "get_run_status", + "get_run_status", + "extract_end_use_breakdown" + ], + "num_tool_calls": 17, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_air_loops", + "mcp__openstudio__replace_air_terminals", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "mcp__openstudio__get_run_status", + "mcp__openstudio__extract_simulation_errors", + "mcp__openstudio__list_weather_files", + "mcp__openstudio__change_building_location", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__extract_end_use_breakdown" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[create_test_apply_measure]", + "passed": true, + "duration_s": 15.4, + "tier": "tier2", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.03527285000000001, + "duration_ms": 13404, + "input_tokens": 26, + "output_tokens": 1186, + "cache_read_tokens": 171806, + "tool_calls": [ + "load_osm_model", + "create_measure", + "test_measure", + "apply_measure" + ], + "num_tool_calls": 4, + "all_tool_calls": [ 
+ "mcp__openstudio__load_osm_model", + "mcp__openstudio__create_measure", + "mcp__openstudio__test_measure", + "mcp__openstudio__apply_measure" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[measure_set_lights_full_chain]", + "passed": true, + "duration_s": 97.0, + "tier": "tier2", + "attempt": 1, + "num_turns": 31, + "cost_usd": 0.23750624999999997, + "duration_ms": 94876, + "input_tokens": 210, + "output_tokens": 4362, + "cache_read_tokens": 1816275, + "tool_calls": [ + "load_osm_model", + "save_osm_model", + "run_simulation", + "get_run_status", + "get_weather_info", + "list_weather_files", + "change_building_location", + "load_osm_model", + "save_osm_model", + "run_simulation", + "get_run_status", + "list_model_objects", + "load_osm_model", + "change_building_location", + "save_osm_model", + "run_simulation", + "get_run_status", + "get_run_status", + "get_run_status", + "extract_summary_metrics", + "load_osm_model", + "create_measure", + "test_measure", + "apply_measure", + "save_osm_model", + "run_simulation", + "get_run_status", + "get_run_status", + "get_run_status", + "extract_summary_metrics" + ], + "num_tool_calls": 30, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_weather_info", + "mcp__openstudio__list_weather_files", + "mcp__openstudio__change_building_location", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "mcp__openstudio__get_run_status", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__change_building_location", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + 
"mcp__openstudio__extract_summary_metrics", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__create_measure", + "mcp__openstudio__test_measure", + "mcp__openstudio__apply_measure", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__extract_summary_metrics" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[measure_set_infiltration_full_chain]", + "passed": true, + "duration_s": 53.7, + "tier": "tier2", + "attempt": 1, + "num_turns": 21, + "cost_usd": 0.15801865, + "duration_ms": 51564, + "input_tokens": 154, + "output_tokens": 3656, + "cache_read_tokens": 1191959, + "tool_calls": [ + "load_osm_model", + "save_osm_model", + "run_simulation", + "get_run_status", + "get_run_status", + "get_run_status", + "get_run_status", + "extract_summary_metrics", + "load_osm_model", + "search_api", + "create_measure", + "test_measure", + "apply_measure", + "save_osm_model", + "run_simulation", + "get_run_status", + "get_run_status", + "get_run_status", + "get_run_status", + "extract_summary_metrics" + ], + "num_tool_calls": 20, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__extract_summary_metrics", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__search_api", + "mcp__openstudio__create_measure", + "mcp__openstudio__test_measure", + "mcp__openstudio__apply_measure", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + 
"mcp__openstudio__extract_summary_metrics" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[measure_replace_terminals_full_chain]", + "passed": false, + "duration_s": 71.4, + "tier": "tier2", + "attempt": 1, + "num_turns": 21, + "cost_usd": 0.15801865, + "duration_ms": 51564, + "input_tokens": 154, + "output_tokens": 3656, + "cache_read_tokens": 1191959, + "tool_calls": [ + "load_osm_model", + "save_osm_model", + "run_simulation", + "get_run_status", + "get_run_status", + "get_run_status", + "get_run_status", + "extract_summary_metrics", + "load_osm_model", + "search_api", + "create_measure", + "test_measure", + "apply_measure", + "save_osm_model", + "run_simulation", + "get_run_status", + "get_run_status", + "get_run_status", + "get_run_status", + "extract_summary_metrics" + ], + "num_tool_calls": 20, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__extract_summary_metrics", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__search_api", + "mcp__openstudio__create_measure", + "mcp__openstudio__test_measure", + "mcp__openstudio__apply_measure", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__extract_summary_metrics" + ], + "toolsearch_count": 0, + "is_timeout": false, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[create_measure_with_args]", + "passed": true, + "duration_s": 87.1, + "tier": "tier2", + "attempt": 1, + "num_turns": 10, + "cost_usd": 0.13052344999999999, + "duration_ms": 85001, + 
"input_tokens": 82, + "output_tokens": 8435, + "cache_read_tokens": 649952, + "tool_calls": [ + "create_measure", + "test_measure", + "edit_measure", + "test_measure", + "create_baseline_osm", + "test_measure", + "apply_measure", + "list_model_objects", + "get_construction_details" + ], + "num_tool_calls": 9, + "all_tool_calls": [ + "mcp__openstudio__create_measure", + "mcp__openstudio__test_measure", + "mcp__openstudio__edit_measure", + "mcp__openstudio__test_measure", + "mcp__openstudio__create_baseline_osm", + "mcp__openstudio__test_measure", + "mcp__openstudio__apply_measure", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__get_construction_details" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[measure_add_baseboards_full_chain]", + "passed": true, + "duration_s": 121.9, + "tier": "tier2", + "attempt": 1, + "num_turns": 24, + "cost_usd": 0.18800334999999999, + "duration_ms": 119799, + "input_tokens": 186, + "output_tokens": 4261, + "cache_read_tokens": 1455936, + "tool_calls": [ + "load_osm_model", + "save_osm_model", + "get_weather_info", + "run_simulation", + "get_run_status", + "get_run_status", + "get_run_status", + "extract_summary_metrics", + "load_osm_model", + "create_measure", + "test_measure", + "edit_measure", + "test_measure", + "search_api", + "edit_measure", + "test_measure", + "apply_measure", + "save_osm_model", + "run_simulation", + "get_run_status", + "extract_summary_metrics" + ], + "num_tool_calls": 21, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__get_weather_info", + "mcp__openstudio__run_simulation", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "Bash", + "mcp__openstudio__get_run_status", + "mcp__openstudio__extract_summary_metrics", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__create_measure", + "mcp__openstudio__test_measure", + 
"mcp__openstudio__edit_measure", + "mcp__openstudio__test_measure", + "mcp__openstudio__search_api", + "mcp__openstudio__edit_measure", + "mcp__openstudio__test_measure", + "mcp__openstudio__apply_measure", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "Bash", + "mcp__openstudio__get_run_status", + "mcp__openstudio__extract_summary_metrics" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[ruby_measure_reduce_plugloads]", + "passed": true, + "duration_s": 184.6, + "tier": "tier2", + "attempt": 1, + "num_turns": 36, + "cost_usd": 0.37527024999999997, + "duration_ms": 182368, + "input_tokens": 282, + "output_tokens": 16755, + "cache_read_tokens": 2488845, + "tool_calls": [ + "load_osm_model", + "save_osm_model", + "get_weather_info", + "run_simulation", + "get_run_status", + "get_run_status", + "get_run_status", + "extract_summary_metrics", + "load_osm_model", + "create_measure", + "test_measure", + "edit_measure", + "test_measure", + "search_api", + "apply_measure", + "search_wiring_patterns", + "edit_measure", + "apply_measure", + "edit_measure", + "apply_measure", + "edit_measure", + "apply_measure", + "search_api", + "edit_measure", + "apply_measure", + "get_run_logs", + "edit_measure", + "apply_measure", + "list_model_objects", + "get_object_fields", + "set_object_property", + "save_osm_model", + "run_simulation", + "get_run_status", + "get_run_status", + "get_run_status" + ], + "num_tool_calls": 36, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__get_weather_info", + "mcp__openstudio__run_simulation", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__extract_summary_metrics", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__create_measure", + "mcp__openstudio__test_measure", + 
"mcp__openstudio__edit_measure", + "mcp__openstudio__test_measure", + "mcp__openstudio__search_api", + "mcp__openstudio__apply_measure", + "mcp__openstudio__search_wiring_patterns", + "mcp__openstudio__edit_measure", + "mcp__openstudio__apply_measure", + "mcp__openstudio__edit_measure", + "mcp__openstudio__apply_measure", + "mcp__openstudio__edit_measure", + "mcp__openstudio__apply_measure", + "mcp__openstudio__search_api", + "mcp__openstudio__edit_measure", + "mcp__openstudio__apply_measure", + "mcp__openstudio__get_run_logs", + "mcp__openstudio__edit_measure", + "mcp__openstudio__apply_measure", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__get_object_fields", + "mcp__openstudio__set_object_property", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[python_measure_reduce_plugloads]", + "passed": true, + "duration_s": 130.4, + "tier": "tier2", + "attempt": 1, + "num_turns": 24, + "cost_usd": 0.21729969999999998, + "duration_ms": 128259, + "input_tokens": 194, + "output_tokens": 7217, + "cache_read_tokens": 1549957, + "tool_calls": [ + "load_osm_model", + "save_osm_model", + "run_simulation", + "get_run_status", + "get_run_status", + "extract_summary_metrics", + "load_osm_model", + "create_measure", + "test_measure", + "edit_measure", + "test_measure", + "search_api", + "search_api", + "edit_measure", + "test_measure", + "apply_measure", + "save_osm_model", + "run_simulation", + "get_run_status", + "extract_summary_metrics" + ], + "num_tool_calls": 20, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "mcp__openstudio__get_run_status", + "TaskOutput", + "Bash", + "mcp__openstudio__get_run_status", + 
"mcp__openstudio__extract_summary_metrics", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__create_measure", + "mcp__openstudio__test_measure", + "mcp__openstudio__edit_measure", + "mcp__openstudio__test_measure", + "mcp__openstudio__search_api", + "mcp__openstudio__search_api", + "mcp__openstudio__edit_measure", + "mcp__openstudio__test_measure", + "mcp__openstudio__apply_measure", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "Bash", + "mcp__openstudio__get_run_status", + "mcp__openstudio__extract_summary_metrics" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[ruby_measure_boiler_efficiency]", + "passed": true, + "duration_s": 62.3, + "tier": "tier2", + "attempt": 1, + "num_turns": 24, + "cost_usd": 0.20399160000000002, + "duration_ms": 60010, + "input_tokens": 178, + "output_tokens": 5644, + "cache_read_tokens": 1397686, + "tool_calls": [ + "load_osm_model", + "save_osm_model", + "run_simulation", + "get_run_status", + "get_run_status", + "get_run_status", + "get_run_status", + "get_run_status", + "extract_summary_metrics", + "load_osm_model", + "create_measure", + "test_measure", + "edit_measure", + "test_measure", + "apply_measure", + "save_osm_model", + "run_simulation", + "get_run_status", + "get_run_status", + "get_run_status", + "get_run_status", + "get_run_status", + "extract_summary_metrics" + ], + "num_tool_calls": 23, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__extract_summary_metrics", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__create_measure", + "mcp__openstudio__test_measure", + "mcp__openstudio__edit_measure", + 
"mcp__openstudio__test_measure", + "mcp__openstudio__apply_measure", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__extract_summary_metrics" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[python_measure_boiler_efficiency]", + "passed": true, + "duration_s": 64.7, + "tier": "tier2", + "attempt": 1, + "num_turns": 24, + "cost_usd": 0.2050478, + "duration_ms": 62643, + "input_tokens": 178, + "output_tokens": 6163, + "cache_read_tokens": 1436348, + "tool_calls": [ + "load_osm_model", + "save_osm_model", + "get_weather_info", + "run_simulation", + "get_run_status", + "get_run_status", + "get_run_status", + "get_run_artifacts", + "extract_summary_metrics", + "load_osm_model", + "create_measure", + "create_measure", + "test_measure", + "edit_measure", + "test_measure", + "apply_measure", + "save_osm_model", + "run_simulation", + "get_run_artifacts", + "get_run_status", + "get_run_artifacts", + "extract_summary_metrics", + "compare_runs" + ], + "num_tool_calls": 23, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__get_weather_info", + "mcp__openstudio__run_simulation", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_artifacts", + "mcp__openstudio__extract_summary_metrics", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__create_measure", + "mcp__openstudio__create_measure", + "mcp__openstudio__test_measure", + "mcp__openstudio__edit_measure", + "mcp__openstudio__test_measure", + "mcp__openstudio__apply_measure", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + 
"mcp__openstudio__get_run_artifacts", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_artifacts", + "mcp__openstudio__extract_summary_metrics", + "mcp__openstudio__compare_runs" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_create_measure_with_args_quality", + "passed": true, + "duration_s": 113.9, + "tier": "tier2", + "attempt": 1, + "num_turns": 16, + "cost_usd": 0.1815693, + "duration_ms": 111816, + "input_tokens": 122, + "output_tokens": 11324, + "cache_read_tokens": 1039448, + "tool_calls": [ + "get_skill", + "create_measure", + "create_baseline_osm", + "test_measure", + "edit_measure", + "test_measure", + "edit_measure", + "test_measure", + "edit_measure", + "test_measure", + "test_measure", + "apply_measure", + "get_surface_details", + "get_construction_details", + "save_osm_model" + ], + "num_tool_calls": 15, + "all_tool_calls": [ + "mcp__openstudio__get_skill", + "mcp__openstudio__create_measure", + "mcp__openstudio__create_baseline_osm", + "mcp__openstudio__test_measure", + "mcp__openstudio__edit_measure", + "mcp__openstudio__test_measure", + "mcp__openstudio__edit_measure", + "mcp__openstudio__test_measure", + "mcp__openstudio__edit_measure", + "mcp__openstudio__test_measure", + "mcp__openstudio__test_measure", + "mcp__openstudio__apply_measure", + "mcp__openstudio__get_surface_details", + "mcp__openstudio__get_construction_details", + "mcp__openstudio__save_osm_model" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_complex_model_multi_query", + "passed": true, + "duration_s": 11.5, + "tier": "tier2", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.02786775, + "duration_ms": 9183, + "input_tokens": 18, + "output_tokens": 854, + "cache_read_tokens": 111235, + "tool_calls": [ + "load_osm_model", + "get_building_info", + "list_air_loops", + "list_plant_loops", + "list_thermal_zones" + ], + "num_tool_calls": 5, 
+ "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__get_building_info", + "mcp__openstudio__list_air_loops", + "mcp__openstudio__list_plant_loops", + "mcp__openstudio__list_thermal_zones" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_measure_reduce_plugloads_quality[Ruby]", + "passed": true, + "duration_s": 56.3, + "tier": "tier2", + "attempt": 1, + "num_turns": 2, + "cost_usd": 0.07629335, + "duration_ms": 54258, + "input_tokens": 18, + "output_tokens": 8894, + "cache_read_tokens": 111241, + "tool_calls": [ + "create_measure" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "mcp__openstudio__create_measure" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_measure_reduce_plugloads_quality[Python]", + "passed": true, + "duration_s": 31.0, + "tier": "tier2", + "attempt": 1, + "num_turns": 2, + "cost_usd": 0.047902099999999996, + "duration_ms": 28938, + "input_tokens": 18, + "output_tokens": 4332, + "cache_read_tokens": 111241, + "tool_calls": [ + "create_measure" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "mcp__openstudio__create_measure" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_measure_boiler_efficiency_quality[Ruby]", + "passed": true, + "duration_s": 31.2, + "tier": "tier2", + "attempt": 1, + "num_turns": 2, + "cost_usd": 0.0485012, + "duration_ms": 29030, + "input_tokens": 18, + "output_tokens": 4424, + "cache_read_tokens": 111257, + "tool_calls": [ + "create_measure" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "mcp__openstudio__create_measure" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_measure_boiler_efficiency_quality[Python]", + "passed": true, + "duration_s": 23.4, + "tier": "tier2", + "attempt": 1, + "num_turns": 2, + "cost_usd": 0.04035745, + 
"duration_ms": 21384, + "input_tokens": 18, + "output_tokens": 3120, + "cache_read_tokens": 111257, + "tool_calls": [ + "create_measure" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "mcp__openstudio__create_measure" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_05_guardrails.py::test_create_uses_mcp_not_raw_idf", + "passed": true, + "duration_s": 40.4, + "tier": "tier4", + "attempt": 1, + "num_turns": 9, + "cost_usd": 0.07487915, + "duration_ms": 38376, + "input_tokens": 58, + "output_tokens": 1316, + "cache_read_tokens": 433249, + "tool_calls": [ + "list_skills", + "get_skill", + "create_new_building", + "list_weather_files", + "create_new_building", + "save_osm_model", + "get_model_summary", + "get_building_info" + ], + "num_tool_calls": 8, + "all_tool_calls": [ + "mcp__openstudio__list_skills", + "mcp__openstudio__get_skill", + "mcp__openstudio__create_new_building", + "mcp__openstudio__list_weather_files", + "mcp__openstudio__create_new_building", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__get_model_summary", + "mcp__openstudio__get_building_info" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_05_guardrails.py::test_no_script_for_results", + "passed": true, + "duration_s": 11.2, + "tier": "tier4", + "attempt": 1, + "num_turns": 2, + "cost_usd": 0.0239035, + "duration_ms": 9079, + "input_tokens": 18, + "output_tokens": 430, + "cache_read_tokens": 111155, + "tool_calls": [ + "extract_summary_metrics" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "mcp__openstudio__extract_summary_metrics" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_05_guardrails.py::test_inspect_component_uses_mcp_not_script", + "passed": true, + "duration_s": 20.2, + "tier": "tier4", + "attempt": 1, + "num_turns": 10, + "cost_usd": 0.0731224, + "duration_ms": 18108, + "input_tokens": 66, + "output_tokens": 1834, + "cache_read_tokens": 
478989, + "tool_calls": [ + "load_osm_model", + "list_model_objects", + "get_component_properties", + "get_object_fields", + "list_model_objects", + "list_model_objects", + "list_model_objects", + "get_component_properties", + "get_object_fields" + ], + "num_tool_calls": 9, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__get_component_properties", + "mcp__openstudio__get_object_fields", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__get_component_properties", + "mcp__openstudio__get_object_fields" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[import_floorplan_L1]", + "passed": false, + "duration_s": 7.2, + "tier": "progressive", + "attempt": 1, + "num_turns": 1, + "cost_usd": 0.01746725, + "duration_ms": 5074, + "input_tokens": 10, + "output_tokens": 445, + "cache_read_tokens": 51535, + "tool_calls": [], + "num_tool_calls": 0, + "all_tool_calls": [], + "toolsearch_count": 0, + "is_timeout": false, + "failure_mode": "no_mcp_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[import_floorplan_L2]", + "passed": true, + "duration_s": 17.6, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.041676149999999995, + "duration_ms": 15598, + "input_tokens": 34, + "output_tokens": 1313, + "cache_read_tokens": 231859, + "tool_calls": [ + "import_floorspacejs", + "list_files", + "import_floorspacejs" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "mcp__openstudio__import_floorspacejs", + "mcp__openstudio__list_files", + "mcp__openstudio__import_floorspacejs" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[import_floorplan_L3]", + "passed": false, + "duration_s": 13.7, + "tier": "progressive", + 
"attempt": 1, + "num_turns": 1, + "cost_usd": 0.020376, + "duration_ms": 11668, + "input_tokens": 10, + "output_tokens": 1021, + "cache_read_tokens": 51535, + "tool_calls": [], + "num_tool_calls": 0, + "all_tool_calls": [], + "toolsearch_count": 0, + "is_timeout": false, + "failure_mode": "no_mcp_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[add_hvac_L1]", + "passed": true, + "duration_s": 19.0, + "tier": "progressive", + "attempt": 1, + "num_turns": 8, + "cost_usd": 0.07127465, + "duration_ms": 16443, + "input_tokens": 58, + "output_tokens": 1486, + "cache_read_tokens": 417529, + "tool_calls": [ + "load_osm_model", + "get_building_info", + "list_thermal_zones", + "add_baseline_system", + "save_osm_model", + "list_air_loops", + "list_plant_loops" + ], + "num_tool_calls": 7, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__get_building_info", + "mcp__openstudio__list_thermal_zones", + "mcp__openstudio__add_baseline_system", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__list_air_loops", + "mcp__openstudio__list_plant_loops" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[add_hvac_L2]", + "passed": true, + "duration_s": 35.2, + "tier": "progressive", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.04765735, + "duration_ms": 10542, + "input_tokens": 42, + "output_tokens": 1010, + "cache_read_tokens": 293591, + "tool_calls": [ + "load_osm_model", + "list_thermal_zones", + "add_baseline_system", + "save_osm_model" + ], + "num_tool_calls": 4, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_thermal_zones", + "mcp__openstudio__add_baseline_system", + "mcp__openstudio__save_osm_model" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[add_hvac_L3]", + "passed": true, + "duration_s": 13.0, + "tier": "progressive", 
+ "attempt": 1, + "num_turns": 5, + "cost_usd": 0.04831125000000001, + "duration_ms": 10914, + "input_tokens": 42, + "output_tokens": 1127, + "cache_read_tokens": 293530, + "tool_calls": [ + "load_osm_model", + "list_thermal_zones", + "add_baseline_system", + "save_osm_model" + ], + "num_tool_calls": 4, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_thermal_zones", + "mcp__openstudio__add_baseline_system", + "mcp__openstudio__save_osm_model" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[view_model_L1]", + "passed": true, + "duration_s": 11.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.0239615, + "duration_ms": 8943, + "input_tokens": 18, + "output_tokens": 391, + "cache_read_tokens": 111160, + "tool_calls": [ + "load_osm_model", + "view_model" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__view_model" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[view_model_L2]", + "passed": true, + "duration_s": 11.3, + "tier": "progressive", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.0308535, + "duration_ms": 9112, + "input_tokens": 26, + "output_tokens": 552, + "cache_read_tokens": 171150, + "tool_calls": [ + "load_osm_model", + "view_model" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__view_model" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[view_model_L3]", + "passed": true, + "duration_s": 11.3, + "tier": "progressive", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.0306171, + "duration_ms": 9241, + "input_tokens": 26, + "output_tokens": 512, + "cache_read_tokens": 171136, + "tool_calls": [ + "load_osm_model", + "view_model" + ], + 
"num_tool_calls": 2, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__view_model" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[set_weather_L1]", + "passed": true, + "duration_s": 19.4, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.04484455, + "duration_ms": 17416, + "input_tokens": 26, + "output_tokens": 1168, + "cache_read_tokens": 178723, + "tool_calls": [ + "load_osm_model", + "list_weather_files", + "change_building_location" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_weather_files", + "mcp__openstudio__change_building_location" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[set_weather_L2]", + "passed": true, + "duration_s": 27.7, + "tier": "progressive", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.05542905000000001, + "duration_ms": 25579, + "input_tokens": 34, + "output_tokens": 1779, + "cache_read_tokens": 240263, + "tool_calls": [ + "load_osm_model", + "change_building_location", + "list_weather_files", + "change_building_location" + ], + "num_tool_calls": 4, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__change_building_location", + "mcp__openstudio__list_weather_files", + "mcp__openstudio__change_building_location" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[set_weather_L3]", + "passed": true, + "duration_s": 21.7, + "tier": "progressive", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.051305800000000006, + "duration_ms": 19590, + "input_tokens": 34, + "output_tokens": 1126, + "cache_read_tokens": 239943, + "tool_calls": [ + "load_osm_model", + "change_building_location", + "list_weather_files", + "change_building_location" + ], + "num_tool_calls": 4, 
+ "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__change_building_location", + "mcp__openstudio__list_weather_files", + "mcp__openstudio__change_building_location" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[run_qaqc_L1]", + "passed": true, + "duration_s": 11.2, + "tier": "progressive", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.02752035, + "duration_ms": 9142, + "input_tokens": 18, + "output_tokens": 950, + "cache_read_tokens": 111161, + "tool_calls": [ + "load_osm_model", + "validate_model", + "get_model_summary", + "get_building_info" + ], + "num_tool_calls": 4, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__validate_model", + "mcp__openstudio__get_model_summary", + "mcp__openstudio__get_building_info" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[run_qaqc_L2]", + "passed": true, + "duration_s": 9.5, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.03202995, + "duration_ms": 7369, + "input_tokens": 26, + "output_tokens": 750, + "cache_read_tokens": 171302, + "tool_calls": [ + "load_osm_model", + "validate_model", + "get_model_summary" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__validate_model", + "mcp__openstudio__get_model_summary" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[run_qaqc_L3]", + "passed": true, + "duration_s": 12.0, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.03277695, + "duration_ms": 9882, + "input_tokens": 26, + "output_tokens": 899, + "cache_read_tokens": 171097, + "tool_calls": [ + "load_osm_model", + "validate_model", + "get_model_summary" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + 
"mcp__openstudio__load_osm_model", + "mcp__openstudio__validate_model", + "mcp__openstudio__get_model_summary" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[create_building_L1]", + "passed": true, + "duration_s": 27.2, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.046520299999999994, + "duration_ms": 25133, + "input_tokens": 34, + "output_tokens": 1459, + "cache_read_tokens": 234988, + "tool_calls": [ + "create_new_building", + "create_new_building", + "create_baseline_osm" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "mcp__openstudio__create_new_building", + "mcp__openstudio__create_new_building", + "mcp__openstudio__create_baseline_osm" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[create_building_L2]", + "passed": false, + "duration_s": 14.7, + "tier": "progressive", + "attempt": 1, + "num_turns": 1, + "cost_usd": 0.02137725, + "duration_ms": 12627, + "input_tokens": 10, + "output_tokens": 1225, + "cache_read_tokens": 51535, + "tool_calls": [], + "num_tool_calls": 0, + "all_tool_calls": [], + "toolsearch_count": 0, + "is_timeout": false, + "failure_mode": "no_mcp_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[create_building_L3]", + "passed": true, + "duration_s": 15.6, + "tier": "progressive", + "attempt": 1, + "num_turns": 2, + "cost_usd": 0.02935685, + "duration_ms": 13591, + "input_tokens": 18, + "output_tokens": 1035, + "cache_read_tokens": 111151, + "tool_calls": [ + "create_bar_building" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "mcp__openstudio__create_bar_building" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[add_pv_L1]", + "passed": true, + "duration_s": 19.2, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 
0.03923165000000001, + "duration_ms": 17170, + "input_tokens": 34, + "output_tokens": 899, + "cache_read_tokens": 231664, + "tool_calls": [ + "load_osm_model", + "add_rooftop_pv", + "save_osm_model" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__add_rooftop_pv", + "mcp__openstudio__save_osm_model" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[add_pv_L2]", + "passed": true, + "duration_s": 18.9, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.03894505, + "duration_ms": 16868, + "input_tokens": 34, + "output_tokens": 821, + "cache_read_tokens": 231748, + "tool_calls": [ + "load_osm_model", + "add_rooftop_pv", + "save_osm_model" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__add_rooftop_pv", + "mcp__openstudio__save_osm_model" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[add_pv_L3]", + "passed": true, + "duration_s": 13.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.022179050000000002, + "duration_ms": 11009, + "input_tokens": 26, + "output_tokens": 625, + "cache_read_tokens": 179268, + "tool_calls": [ + "load_osm_model", + "add_rooftop_pv" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__add_rooftop_pv" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[thermostat_L1]", + "passed": true, + "duration_s": 15.8, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.03772835, + "duration_ms": 13695, + "input_tokens": 34, + "output_tokens": 668, + "cache_read_tokens": 231431, + "tool_calls": [ + "load_osm_model", + "adjust_thermostat_setpoints", + "save_osm_model" + ], + "num_tool_calls": 
3, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__adjust_thermostat_setpoints", + "mcp__openstudio__save_osm_model" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[thermostat_L2]", + "passed": true, + "duration_s": 14.3, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.03187915, + "duration_ms": 12152, + "input_tokens": 26, + "output_tokens": 660, + "cache_read_tokens": 171519, + "tool_calls": [ + "load_osm_model", + "adjust_thermostat_setpoints", + "save_osm_model" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__adjust_thermostat_setpoints", + "mcp__openstudio__save_osm_model" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[thermostat_L3]", + "passed": true, + "duration_s": 13.2, + "tier": "progressive", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.0244659, + "duration_ms": 11158, + "input_tokens": 18, + "output_tokens": 476, + "cache_read_tokens": 111179, + "tool_calls": [ + "load_osm_model", + "adjust_thermostat_setpoints" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__adjust_thermostat_setpoints" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[list_spaces_L1]", + "passed": true, + "duration_s": 8.7, + "tier": "progressive", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.040411550000000004, + "duration_ms": 6637, + "input_tokens": 26, + "output_tokens": 504, + "cache_read_tokens": 162968, + "tool_calls": [ + "load_osm_model", + "list_spaces" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_spaces" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": 
"tests/llm/test_06_progressive.py::test_progressive[list_spaces_L2]", + "passed": true, + "duration_s": 14.4, + "tier": "progressive", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.03336335, + "duration_ms": 12303, + "input_tokens": 26, + "output_tokens": 618, + "cache_read_tokens": 171061, + "tool_calls": [ + "load_osm_model", + "list_spaces" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_spaces" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[list_spaces_L3]", + "passed": true, + "duration_s": 7.3, + "tier": "progressive", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.025943050000000002, + "duration_ms": 5240, + "input_tokens": 18, + "output_tokens": 674, + "cache_read_tokens": 111163, + "tool_calls": [ + "load_osm_model", + "list_spaces" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_spaces" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[schedules_L1]", + "passed": true, + "duration_s": 9.7, + "tier": "progressive", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.03261075, + "duration_ms": 7562, + "input_tokens": 26, + "output_tokens": 750, + "cache_read_tokens": 171060, + "tool_calls": [ + "load_osm_model", + "list_model_objects" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_model_objects" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[schedules_L2]", + "passed": true, + "duration_s": 11.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.0318644, + "duration_ms": 8976, + "input_tokens": 26, + "output_tokens": 596, + "cache_read_tokens": 171084, + "tool_calls": [ + "load_osm_model", + 
"list_model_objects" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_model_objects" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[schedules_L3]", + "passed": true, + "duration_s": 7.3, + "tier": "progressive", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.02505915, + "duration_ms": 5040, + "input_tokens": 18, + "output_tokens": 437, + "cache_read_tokens": 111174, + "tool_calls": [ + "load_osm_model", + "list_model_objects" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_model_objects" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[inspect_component_L1]", + "passed": true, + "duration_s": 9.7, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.0392748, + "duration_ms": 7640, + "input_tokens": 34, + "output_tokens": 588, + "cache_read_tokens": 232183, + "tool_calls": [ + "load_osm_model", + "list_plant_loops", + "get_component_properties" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_plant_loops", + "mcp__openstudio__get_component_properties" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[inspect_component_L2]", + "passed": true, + "duration_s": 13.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.04788495, + "duration_ms": 10850, + "input_tokens": 42, + "output_tokens": 1126, + "cache_read_tokens": 291492, + "tool_calls": [ + "load_osm_model", + "list_model_objects", + "get_component_properties", + "get_object_fields" + ], + "num_tool_calls": 4, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_model_objects", + 
"mcp__openstudio__get_component_properties", + "mcp__openstudio__get_object_fields" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[inspect_component_L3]", + "passed": true, + "duration_s": 12.2, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.0402239, + "duration_ms": 10092, + "input_tokens": 34, + "output_tokens": 936, + "cache_read_tokens": 231399, + "tool_calls": [ + "load_osm_model", + "list_model_objects", + "get_object_fields" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__get_object_fields" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[modify_component_L1]", + "passed": true, + "duration_s": 15.9, + "tier": "progressive", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.05381935000000001, + "duration_ms": 13687, + "input_tokens": 50, + "output_tokens": 890, + "cache_read_tokens": 355881, + "tool_calls": [ + "load_osm_model", + "list_plant_loops", + "get_component_properties", + "set_component_properties", + "save_osm_model" + ], + "num_tool_calls": 5, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_plant_loops", + "mcp__openstudio__get_component_properties", + "mcp__openstudio__set_component_properties", + "mcp__openstudio__save_osm_model" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[modify_component_L2]", + "passed": true, + "duration_s": 10.3, + "tier": "progressive", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.04449565, + "duration_ms": 8148, + "input_tokens": 42, + "output_tokens": 709, + "cache_read_tokens": 291524, + "tool_calls": [ + "load_osm_model", + "list_model_objects", + "set_component_properties", + "save_osm_model" + ], + "num_tool_calls": 
4, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__set_component_properties", + "mcp__openstudio__save_osm_model" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[modify_component_L3]", + "passed": true, + "duration_s": 25.4, + "tier": "progressive", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.05907125, + "duration_ms": 23300, + "input_tokens": 50, + "output_tokens": 1776, + "cache_read_tokens": 354375, + "tool_calls": [ + "load_osm_model", + "list_model_objects", + "set_object_property", + "get_object_fields", + "set_object_property" + ], + "num_tool_calls": 5, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__set_object_property", + "mcp__openstudio__get_object_fields", + "mcp__openstudio__set_object_property" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[list_dynamic_type_L1]", + "passed": true, + "duration_s": 31.2, + "tier": "progressive", + "attempt": 1, + "num_turns": 23, + "cost_usd": 0.09260885, + "duration_ms": 29197, + "input_tokens": 74, + "output_tokens": 2530, + "cache_read_tokens": 567486, + "tool_calls": [ + "load_osm_model", + "list_model_objects", + "list_model_objects", + "list_model_objects", + "list_air_loops", + "list_thermal_zones", + "list_plant_loops", + "get_sizing_system_properties", + "get_sizing_zone_properties", + "get_sizing_properties", + "get_sizing_properties", + "get_sizing_properties", + "get_object_fields", + "get_object_fields", + "get_object_fields", + "get_plant_loop_details", + "get_plant_loop_details", + "get_plant_loop_details", + "get_simulation_control", + "get_run_period", + "list_model_objects", + "get_weather_info" + ], + "num_tool_calls": 22, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + 
"mcp__openstudio__list_model_objects", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__list_air_loops", + "mcp__openstudio__list_thermal_zones", + "mcp__openstudio__list_plant_loops", + "mcp__openstudio__get_sizing_system_properties", + "mcp__openstudio__get_sizing_zone_properties", + "mcp__openstudio__get_sizing_properties", + "mcp__openstudio__get_sizing_properties", + "mcp__openstudio__get_sizing_properties", + "mcp__openstudio__get_object_fields", + "mcp__openstudio__get_object_fields", + "mcp__openstudio__get_object_fields", + "mcp__openstudio__get_plant_loop_details", + "mcp__openstudio__get_plant_loop_details", + "mcp__openstudio__get_plant_loop_details", + "mcp__openstudio__get_simulation_control", + "mcp__openstudio__get_run_period", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__get_weather_info" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[list_dynamic_type_L2]", + "passed": true, + "duration_s": 7.6, + "tier": "progressive", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.024799800000000004, + "duration_ms": 5352, + "input_tokens": 18, + "output_tokens": 578, + "cache_read_tokens": 111168, + "tool_calls": [ + "load_osm_model", + "list_model_objects" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_model_objects" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[list_dynamic_type_L3]", + "passed": true, + "duration_s": 15.6, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.0406127, + "duration_ms": 13308, + "input_tokens": 26, + "output_tokens": 583, + "cache_read_tokens": 163317, + "tool_calls": [ + "load_osm_model", + "list_model_objects", + "list_model_objects" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + 
"mcp__openstudio__load_osm_model", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__list_model_objects" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[floor_area_L1]", + "passed": true, + "duration_s": 9.7, + "tier": "progressive", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.031146600000000003, + "duration_ms": 7571, + "input_tokens": 26, + "output_tokens": 576, + "cache_read_tokens": 171081, + "tool_calls": [ + "load_osm_model", + "get_building_info" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__get_building_info" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[floor_area_L2]", + "passed": true, + "duration_s": 7.4, + "tier": "progressive", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.0302753, + "duration_ms": 5344, + "input_tokens": 26, + "output_tokens": 356, + "cache_read_tokens": 170793, + "tool_calls": [ + "load_osm_model", + "get_building_info" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__get_building_info" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[floor_area_L3]", + "passed": true, + "duration_s": 10.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.030402150000000003, + "duration_ms": 7932, + "input_tokens": 26, + "output_tokens": 367, + "cache_read_tokens": 170799, + "tool_calls": [ + "load_osm_model", + "get_building_info" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__get_building_info" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[materials_L1]", + "passed": true, + "duration_s": 9.1, + "tier": 
"progressive", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.04174725, + "duration_ms": 6840, + "input_tokens": 26, + "output_tokens": 673, + "cache_read_tokens": 162950, + "tool_calls": [ + "load_osm_model", + "list_materials" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_materials" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[materials_L2]", + "passed": true, + "duration_s": 16.5, + "tier": "progressive", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.03225515, + "duration_ms": 14414, + "input_tokens": 26, + "output_tokens": 619, + "cache_read_tokens": 171104, + "tool_calls": [ + "load_osm_model", + "list_materials" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_materials" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[materials_L3]", + "passed": true, + "duration_s": 8.7, + "tier": "progressive", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.0315526, + "duration_ms": 6526, + "input_tokens": 26, + "output_tokens": 493, + "cache_read_tokens": 171066, + "tool_calls": [ + "load_osm_model", + "list_materials" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_materials" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[thermal_zones_L1]", + "passed": false, + "duration_s": 7.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.029987250000000003, + "duration_ms": 5059, + "input_tokens": 26, + "output_tokens": 403, + "cache_read_tokens": 171075, + "tool_calls": [ + "load_osm_model", + "get_model_summary" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + 
"mcp__openstudio__get_model_summary" + ], + "toolsearch_count": 0, + "is_timeout": false, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[thermal_zones_L2]", + "passed": true, + "duration_s": 12.4, + "tier": "progressive", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.0342701, + "duration_ms": 10227, + "input_tokens": 26, + "output_tokens": 933, + "cache_read_tokens": 170791, + "tool_calls": [ + "load_osm_model", + "list_thermal_zones" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_thermal_zones" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[thermal_zones_L3]", + "passed": true, + "duration_s": 9.7, + "tier": "progressive", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.0309107, + "duration_ms": 7545, + "input_tokens": 26, + "output_tokens": 430, + "cache_read_tokens": 170797, + "tool_calls": [ + "load_osm_model", + "list_thermal_zones" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_thermal_zones" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[subsurfaces_L1]", + "passed": true, + "duration_s": 15.8, + "tier": "progressive", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.04333825000000001, + "duration_ms": 13705, + "input_tokens": 42, + "output_tokens": 573, + "cache_read_tokens": 291050, + "tool_calls": [ + "load_osm_model", + "list_subsurfaces", + "list_subsurfaces", + "list_subsurfaces" + ], + "num_tool_calls": 4, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_subsurfaces", + "mcp__openstudio__list_subsurfaces", + "mcp__openstudio__list_subsurfaces" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": 
"tests/llm/test_06_progressive.py::test_progressive[subsurfaces_L2]", + "passed": true, + "duration_s": 8.8, + "tier": "progressive", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.0303261, + "duration_ms": 6621, + "input_tokens": 26, + "output_tokens": 496, + "cache_read_tokens": 171126, + "tool_calls": [ + "load_osm_model", + "list_subsurfaces" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_subsurfaces" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[subsurfaces_L3]", + "passed": true, + "duration_s": 7.9, + "tier": "progressive", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.02978185, + "duration_ms": 5766, + "input_tokens": 26, + "output_tokens": 401, + "cache_read_tokens": 171096, + "tool_calls": [ + "load_osm_model", + "list_subsurfaces" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_subsurfaces" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[surface_details_L1]", + "passed": true, + "duration_s": 11.9, + "tier": "progressive", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.04206575, + "duration_ms": 9841, + "input_tokens": 34, + "output_tokens": 1033, + "cache_read_tokens": 232730, + "tool_calls": [ + "load_osm_model", + "list_surfaces", + "get_surface_details", + "get_construction_details" + ], + "num_tool_calls": 4, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_surfaces", + "mcp__openstudio__get_surface_details", + "mcp__openstudio__get_construction_details" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[surface_details_L2]", + "passed": true, + "duration_s": 14.9, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.0381191, + 
"duration_ms": 12833, + "input_tokens": 34, + "output_tokens": 741, + "cache_read_tokens": 231351, + "tool_calls": [ + "load_osm_model", + "list_surfaces", + "get_surface_details" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_surfaces", + "mcp__openstudio__get_surface_details" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[surface_details_L3]", + "passed": true, + "duration_s": 9.8, + "tier": "progressive", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.03916115, + "duration_ms": 7677, + "input_tokens": 18, + "output_tokens": 651, + "cache_read_tokens": 111169, + "tool_calls": [ + "load_osm_model", + "list_surfaces" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_surfaces" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[run_simulation_L1]", + "passed": true, + "duration_s": 37.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 20, + "cost_usd": 0.1384678, + "duration_ms": 35058, + "input_tokens": 130, + "output_tokens": 2943, + "cache_read_tokens": 1015128, + "tool_calls": [ + "load_osm_model", + "run_simulation", + "get_run_status", + "get_run_logs", + "validate_model", + "list_air_loops", + "enable_ideal_air_loads", + "delete_object", + "delete_object", + "delete_object", + "delete_object", + "save_osm_model", + "load_osm_model", + "run_simulation", + "get_run_status", + "get_run_status", + "get_run_status", + "extract_summary_metrics", + "extract_end_use_breakdown" + ], + "num_tool_calls": 19, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__run_simulation", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_logs", + "mcp__openstudio__validate_model", + "mcp__openstudio__list_air_loops", + 
"mcp__openstudio__enable_ideal_air_loads", + "mcp__openstudio__delete_object", + "mcp__openstudio__delete_object", + "mcp__openstudio__delete_object", + "mcp__openstudio__delete_object", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__run_simulation", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__extract_summary_metrics", + "mcp__openstudio__extract_end_use_breakdown" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[run_simulation_L2]", + "passed": true, + "duration_s": 36.7, + "tier": "progressive", + "attempt": 1, + "num_turns": 8, + "cost_usd": 0.085235, + "duration_ms": 34629, + "input_tokens": 58, + "output_tokens": 2965, + "cache_read_tokens": 457545, + "tool_calls": [ + "load_osm_model", + "list_weather_files", + "change_building_location", + "save_osm_model", + "run_simulation", + "get_run_status", + "extract_simulation_errors" + ], + "num_tool_calls": 7, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_weather_files", + "mcp__openstudio__change_building_location", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "mcp__openstudio__get_run_status", + "mcp__openstudio__extract_simulation_errors" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[run_simulation_L3]", + "passed": true, + "duration_s": 8.4, + "tier": "progressive", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.030881700000000005, + "duration_ms": 6275, + "input_tokens": 26, + "output_tokens": 566, + "cache_read_tokens": 171107, + "tool_calls": [ + "load_osm_model", + "run_simulation" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__run_simulation" + ], + "toolsearch_count": 0, + "is_timeout": 
false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[get_eui_L1]", + "passed": true, + "duration_s": 8.9, + "tier": "progressive", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.0309284, + "duration_ms": 6849, + "input_tokens": 26, + "output_tokens": 536, + "cache_read_tokens": 171174, + "tool_calls": [ + "extract_summary_metrics", + "get_run_status" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "mcp__openstudio__extract_summary_metrics", + "mcp__openstudio__get_run_status" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[get_eui_L2]", + "passed": true, + "duration_s": 10.0, + "tier": "progressive", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.03187205, + "duration_ms": 7937, + "input_tokens": 26, + "output_tokens": 606, + "cache_read_tokens": 170773, + "tool_calls": [ + "extract_summary_metrics", + "get_run_status" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "mcp__openstudio__extract_summary_metrics", + "mcp__openstudio__get_run_status" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[get_eui_L3]", + "passed": true, + "duration_s": 10.4, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.041944550000000004, + "duration_ms": 8244, + "input_tokens": 26, + "output_tokens": 649, + "cache_read_tokens": 163048, + "tool_calls": [ + "extract_summary_metrics", + "get_run_status", + "get_run_logs" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "mcp__openstudio__extract_summary_metrics", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_logs" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[end_use_breakdown_L1]", + "passed": true, + "duration_s": 9.7, + "tier": "progressive", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.0332371, + "duration_ms": 7582, 
+ "input_tokens": 26, + "output_tokens": 655, + "cache_read_tokens": 171036, + "tool_calls": [ + "extract_end_use_breakdown", + "get_run_status", + "get_run_artifacts", + "extract_summary_metrics" + ], + "num_tool_calls": 4, + "all_tool_calls": [ + "mcp__openstudio__extract_end_use_breakdown", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_artifacts", + "mcp__openstudio__extract_summary_metrics" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[end_use_breakdown_L2]", + "passed": true, + "duration_s": 9.8, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.03201785, + "duration_ms": 7719, + "input_tokens": 26, + "output_tokens": 698, + "cache_read_tokens": 171081, + "tool_calls": [ + "extract_end_use_breakdown", + "get_run_status", + "extract_summary_metrics" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "mcp__openstudio__extract_end_use_breakdown", + "mcp__openstudio__get_run_status", + "mcp__openstudio__extract_summary_metrics" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[end_use_breakdown_L3]", + "passed": true, + "duration_s": 12.8, + "tier": "progressive", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.0335339, + "duration_ms": 10684, + "input_tokens": 26, + "output_tokens": 709, + "cache_read_tokens": 171004, + "tool_calls": [ + "extract_end_use_breakdown", + "get_run_status", + "get_run_artifacts", + "extract_summary_metrics" + ], + "num_tool_calls": 4, + "all_tool_calls": [ + "mcp__openstudio__extract_end_use_breakdown", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_artifacts", + "mcp__openstudio__extract_summary_metrics" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[hvac_sizing_L1]", + "passed": false, + "duration_s": 13.6, + "tier": "progressive", + 
"attempt": 1, + "num_turns": 5, + "cost_usd": 0.0335339, + "duration_ms": 10684, + "input_tokens": 26, + "output_tokens": 709, + "cache_read_tokens": 171004, + "tool_calls": [ + "extract_end_use_breakdown", + "get_run_status", + "get_run_artifacts", + "extract_summary_metrics" + ], + "num_tool_calls": 4, + "all_tool_calls": [ + "mcp__openstudio__extract_end_use_breakdown", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_artifacts", + "mcp__openstudio__extract_summary_metrics" + ], + "toolsearch_count": 0, + "is_timeout": false, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[hvac_sizing_L2]", + "passed": true, + "duration_s": 13.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.03983705, + "duration_ms": 10929, + "input_tokens": 34, + "output_tokens": 871, + "cache_read_tokens": 230818, + "tool_calls": [ + "extract_hvac_sizing", + "extract_component_sizing", + "get_run_status", + "extract_simulation_errors" + ], + "num_tool_calls": 4, + "all_tool_calls": [ + "mcp__openstudio__extract_hvac_sizing", + "mcp__openstudio__extract_component_sizing", + "mcp__openstudio__get_run_status", + "mcp__openstudio__extract_simulation_errors" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[hvac_sizing_L3]", + "passed": true, + "duration_s": 7.5, + "tier": "progressive", + "attempt": 1, + "num_turns": 2, + "cost_usd": 0.02359505, + "duration_ms": 5393, + "input_tokens": 18, + "output_tokens": 413, + "cache_read_tokens": 111158, + "tool_calls": [ + "extract_hvac_sizing" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "mcp__openstudio__extract_hvac_sizing" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[set_wwr_L1]", + "passed": true, + "duration_s": 14.4, + "tier": "progressive", + "attempt": 1, + "num_turns": 12, + 
"cost_usd": 0.05160005, + "duration_ms": 12292, + "input_tokens": 42, + "output_tokens": 1495, + "cache_read_tokens": 294793, + "tool_calls": [ + "load_osm_model", + "list_surfaces", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "save_osm_model" + ], + "num_tool_calls": 11, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_surfaces", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__save_osm_model" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[set_wwr_L2]", + "passed": true, + "duration_s": 14.4, + "tier": "progressive", + "attempt": 1, + "num_turns": 12, + "cost_usd": 0.0516678, + "duration_ms": 12299, + "input_tokens": 42, + "output_tokens": 1481, + "cache_read_tokens": 295133, + "tool_calls": [ + "load_osm_model", + "list_surfaces", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "save_osm_model" + ], + "num_tool_calls": 11, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_surfaces", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + 
"mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__save_osm_model" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[set_wwr_L3]", + "passed": true, + "duration_s": 19.8, + "tier": "progressive", + "attempt": 1, + "num_turns": 12, + "cost_usd": 0.0609674, + "duration_ms": 17696, + "input_tokens": 50, + "output_tokens": 2008, + "cache_read_tokens": 355749, + "tool_calls": [ + "load_osm_model", + "list_surfaces", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "list_surfaces", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio" + ], + "num_tool_calls": 11, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_surfaces", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__list_surfaces", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[replace_windows_L1]", + "passed": true, + "duration_s": 33.6, + "tier": "progressive", + "attempt": 1, + "num_turns": 8, + "cost_usd": 0.07077305, + "duration_ms": 31481, + "input_tokens": 58, + "output_tokens": 2794, + "cache_read_tokens": 421313, + "tool_calls": [ + "load_osm_model", + "list_subsurfaces", + "list_model_objects", + "get_construction_details", + "list_common_measures", + "list_measure_arguments", + "list_files" + 
], + "num_tool_calls": 7, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_subsurfaces", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__get_construction_details", + "mcp__openstudio__list_common_measures", + "mcp__openstudio__list_measure_arguments", + "mcp__openstudio__list_files" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[replace_windows_L2]", + "passed": true, + "duration_s": 100.4, + "tier": "progressive", + "attempt": 1, + "num_turns": 24, + "cost_usd": 0.24001455000000002, + "duration_ms": 98389, + "input_tokens": 178, + "output_tokens": 9065, + "cache_read_tokens": 1516553, + "tool_calls": [ + "load_osm_model", + "list_model_objects", + "get_construction_details", + "get_construction_details", + "list_materials", + "list_subsurfaces", + "create_measure", + "test_measure", + "search_api", + "search_wiring_patterns", + "edit_measure", + "test_measure", + "edit_measure", + "test_measure", + "edit_measure", + "test_measure", + "apply_measure", + "search_api", + "edit_measure", + "apply_measure", + "save_osm_model", + "list_subsurfaces", + "get_construction_details" + ], + "num_tool_calls": 23, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__get_construction_details", + "mcp__openstudio__get_construction_details", + "mcp__openstudio__list_materials", + "mcp__openstudio__list_subsurfaces", + "mcp__openstudio__create_measure", + "mcp__openstudio__test_measure", + "mcp__openstudio__search_api", + "mcp__openstudio__search_wiring_patterns", + "mcp__openstudio__edit_measure", + "mcp__openstudio__test_measure", + "mcp__openstudio__edit_measure", + "mcp__openstudio__test_measure", + "mcp__openstudio__edit_measure", + "mcp__openstudio__test_measure", + "mcp__openstudio__apply_measure", + "mcp__openstudio__search_api", + "mcp__openstudio__edit_measure", + 
"mcp__openstudio__apply_measure", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__list_subsurfaces", + "mcp__openstudio__get_construction_details" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[replace_windows_L3]", + "passed": false, + "duration_s": 9.3, + "tier": "progressive", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.03376715, + "duration_ms": 7259, + "input_tokens": 26, + "output_tokens": 826, + "cache_read_tokens": 170799, + "tool_calls": [ + "load_osm_model", + "list_model_objects" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_model_objects" + ], + "toolsearch_count": 0, + "is_timeout": false, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[construction_details_L1]", + "passed": true, + "duration_s": 14.6, + "tier": "progressive", + "attempt": 1, + "num_turns": 7, + "cost_usd": 0.05098955, + "duration_ms": 12474, + "input_tokens": 42, + "output_tokens": 1347, + "cache_read_tokens": 292913, + "tool_calls": [ + "load_osm_model", + "list_surfaces", + "get_construction_details", + "get_object_fields", + "get_object_fields", + "get_object_fields" + ], + "num_tool_calls": 6, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_surfaces", + "mcp__openstudio__get_construction_details", + "mcp__openstudio__get_object_fields", + "mcp__openstudio__get_object_fields", + "mcp__openstudio__get_object_fields" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[construction_details_L2]", + "passed": true, + "duration_s": 14.6, + "tier": "progressive", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.0475043, + "duration_ms": 12530, + "input_tokens": 42, + "output_tokens": 969, + "cache_read_tokens": 291873, + "tool_calls": [ + "load_osm_model", + 
"list_model_objects", + "list_model_objects", + "get_construction_details" + ], + "num_tool_calls": 4, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__get_construction_details" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[construction_details_L3]", + "passed": true, + "duration_s": 11.8, + "tier": "progressive", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.04130905, + "duration_ms": 9748, + "input_tokens": 34, + "output_tokens": 1050, + "cache_read_tokens": 231763, + "tool_calls": [ + "load_osm_model", + "list_model_objects", + "get_construction_details", + "get_construction_details", + "get_construction_details" + ], + "num_tool_calls": 5, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__get_construction_details", + "mcp__openstudio__get_construction_details", + "mcp__openstudio__get_construction_details" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[check_loads_L1]", + "passed": true, + "duration_s": 12.0, + "tier": "progressive", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.045577, + "duration_ms": 9848, + "input_tokens": 42, + "output_tokens": 867, + "cache_read_tokens": 291425, + "tool_calls": [ + "load_osm_model", + "list_spaces", + "get_space_details", + "get_space_type_details" + ], + "num_tool_calls": 4, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_spaces", + "mcp__openstudio__get_space_details", + "mcp__openstudio__get_space_type_details" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[check_loads_L2]", + "passed": true, + "duration_s": 16.4, + "tier": "progressive", + "attempt": 1, + 
"num_turns": 7, + "cost_usd": 0.05379445, + "duration_ms": 14401, + "input_tokens": 50, + "output_tokens": 1009, + "cache_read_tokens": 352832, + "tool_calls": [ + "load_osm_model", + "list_spaces", + "get_space_details", + "get_space_type_details", + "get_load_details", + "get_load_details" + ], + "num_tool_calls": 6, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_spaces", + "mcp__openstudio__get_space_details", + "mcp__openstudio__get_space_type_details", + "mcp__openstudio__get_load_details", + "mcp__openstudio__get_load_details" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[check_loads_L3]", + "passed": true, + "duration_s": 15.0, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.04799565, + "duration_ms": 12854, + "input_tokens": 34, + "output_tokens": 801, + "cache_read_tokens": 223004, + "tool_calls": [ + "load_osm_model", + "list_model_objects", + "get_load_details" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__get_load_details" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[create_loads_L1]", + "passed": true, + "duration_s": 19.8, + "tier": "progressive", + "attempt": 1, + "num_turns": 24, + "cost_usd": 0.0754027, + "duration_ms": 17598, + "input_tokens": 42, + "output_tokens": 2715, + "cache_read_tokens": 289157, + "tool_calls": [ + "load_osm_model", + "list_spaces", + "create_people_definition", + "create_people_definition", + "create_people_definition", + "create_people_definition", + "create_people_definition", + "create_people_definition", + "create_people_definition", + "create_people_definition", + "create_people_definition", + "create_people_definition", + "create_lights_definition", + "create_lights_definition", + 
"create_lights_definition", + "create_lights_definition", + "create_lights_definition", + "create_lights_definition", + "create_lights_definition", + "create_lights_definition", + "create_lights_definition", + "create_lights_definition", + "save_osm_model" + ], + "num_tool_calls": 23, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_spaces", + "mcp__openstudio__create_people_definition", + "mcp__openstudio__create_people_definition", + "mcp__openstudio__create_people_definition", + "mcp__openstudio__create_people_definition", + "mcp__openstudio__create_people_definition", + "mcp__openstudio__create_people_definition", + "mcp__openstudio__create_people_definition", + "mcp__openstudio__create_people_definition", + "mcp__openstudio__create_people_definition", + "mcp__openstudio__create_people_definition", + "mcp__openstudio__create_lights_definition", + "mcp__openstudio__create_lights_definition", + "mcp__openstudio__create_lights_definition", + "mcp__openstudio__create_lights_definition", + "mcp__openstudio__create_lights_definition", + "mcp__openstudio__create_lights_definition", + "mcp__openstudio__create_lights_definition", + "mcp__openstudio__create_lights_definition", + "mcp__openstudio__create_lights_definition", + "mcp__openstudio__create_lights_definition", + "mcp__openstudio__save_osm_model" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[create_loads_L2]", + "passed": true, + "duration_s": 13.7, + "tier": "progressive", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.04949075, + "duration_ms": 11552, + "input_tokens": 42, + "output_tokens": 1288, + "cache_read_tokens": 293575, + "tool_calls": [ + "load_osm_model", + "list_spaces", + "create_people_definition", + "create_lights_definition", + "save_osm_model" + ], + "num_tool_calls": 5, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_spaces", + 
"mcp__openstudio__create_people_definition", + "mcp__openstudio__create_lights_definition", + "mcp__openstudio__save_osm_model" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[create_loads_L3]", + "passed": false, + "duration_s": 15.0, + "tier": "progressive", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.03661685, + "duration_ms": 12904, + "input_tokens": 26, + "output_tokens": 1200, + "cache_read_tokens": 170821, + "tool_calls": [ + "load_osm_model", + "list_spaces" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_spaces" + ], + "toolsearch_count": 0, + "is_timeout": false, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[create_plant_loop_L1]", + "passed": true, + "duration_s": 11.9, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.037090700000000004, + "duration_ms": 9836, + "input_tokens": 34, + "output_tokens": 590, + "cache_read_tokens": 231142, + "tool_calls": [ + "load_osm_model", + "create_plant_loop", + "save_osm_model" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__create_plant_loop", + "mcp__openstudio__save_osm_model" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[create_plant_loop_L2]", + "passed": true, + "duration_s": 9.2, + "tier": "progressive", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.0245851, + "duration_ms": 7021, + "input_tokens": 18, + "output_tokens": 512, + "cache_read_tokens": 111171, + "tool_calls": [ + "load_osm_model", + "create_plant_loop" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__create_plant_loop" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": 
"tests/llm/test_06_progressive.py::test_progressive[create_plant_loop_L3]", + "passed": true, + "duration_s": 8.5, + "tier": "progressive", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.023738850000000002, + "duration_ms": 6387, + "input_tokens": 18, + "output_tokens": 371, + "cache_read_tokens": 111171, + "tool_calls": [ + "load_osm_model", + "create_plant_loop" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__create_plant_loop" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[schedule_details_L1]", + "passed": true, + "duration_s": 31.7, + "tier": "progressive", + "attempt": 1, + "num_turns": 14, + "cost_usd": 0.1242402, + "duration_ms": 29613, + "input_tokens": 90, + "output_tokens": 2469, + "cache_read_tokens": 763127, + "tool_calls": [ + "load_osm_model", + "list_air_loops", + "list_plant_loops", + "list_thermal_zones", + "get_schedule_details", + "get_schedule_details", + "list_model_objects", + "get_schedule_details", + "list_model_objects", + "get_object_fields", + "list_model_objects", + "get_air_loop_details", + "get_component_properties" + ], + "num_tool_calls": 13, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_air_loops", + "mcp__openstudio__list_plant_loops", + "mcp__openstudio__list_thermal_zones", + "mcp__openstudio__get_schedule_details", + "mcp__openstudio__get_schedule_details", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__get_schedule_details", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__get_object_fields", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__get_air_loop_details", + "mcp__openstudio__get_component_properties" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[schedule_details_L2]", + "passed": true, + "duration_s": 13.5, + "tier": 
"progressive", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.0466535, + "duration_ms": 11246, + "input_tokens": 42, + "output_tokens": 910, + "cache_read_tokens": 292940, + "tool_calls": [ + "load_osm_model", + "list_model_objects", + "get_schedule_details", + "get_schedule_details" + ], + "num_tool_calls": 4, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__get_schedule_details", + "mcp__openstudio__get_schedule_details" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[schedule_details_L3]", + "passed": true, + "duration_s": 11.0, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.03876045, + "duration_ms": 8895, + "input_tokens": 34, + "output_tokens": 785, + "cache_read_tokens": 231577, + "tool_calls": [ + "load_osm_model", + "list_model_objects", + "get_schedule_details" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__get_schedule_details" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[space_type_info_L1]", + "passed": true, + "duration_s": 21.6, + "tier": "progressive", + "attempt": 1, + "num_turns": 14, + "cost_usd": 0.07304005, + "duration_ms": 19584, + "input_tokens": 58, + "output_tokens": 1929, + "cache_read_tokens": 423433, + "tool_calls": [ + "load_osm_model", + "list_spaces", + "get_space_type_details", + "get_load_details", + "get_load_details", + "get_load_details", + "get_schedule_details", + "list_model_objects", + "list_model_objects", + "list_model_objects", + "get_object_fields", + "get_object_fields", + "get_object_fields" + ], + "num_tool_calls": 13, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_spaces", + "mcp__openstudio__get_space_type_details", + 
"mcp__openstudio__get_load_details", + "mcp__openstudio__get_load_details", + "mcp__openstudio__get_load_details", + "mcp__openstudio__get_schedule_details", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__get_object_fields", + "mcp__openstudio__get_object_fields", + "mcp__openstudio__get_object_fields" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[space_type_info_L2]", + "passed": true, + "duration_s": 16.0, + "tier": "progressive", + "attempt": 1, + "num_turns": 11, + "cost_usd": 0.04892895, + "duration_ms": 13537, + "input_tokens": 42, + "output_tokens": 1255, + "cache_read_tokens": 292407, + "tool_calls": [ + "load_osm_model", + "list_model_objects", + "get_model_summary", + "get_space_type_details", + "get_load_details", + "get_load_details", + "get_load_details", + "get_schedule_details", + "get_schedule_details", + "get_schedule_details" + ], + "num_tool_calls": 10, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__get_model_summary", + "mcp__openstudio__get_space_type_details", + "mcp__openstudio__get_load_details", + "mcp__openstudio__get_load_details", + "mcp__openstudio__get_load_details", + "mcp__openstudio__get_schedule_details", + "mcp__openstudio__get_schedule_details", + "mcp__openstudio__get_schedule_details" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[space_type_info_L3]", + "passed": true, + "duration_s": 10.4, + "tier": "progressive", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.0391403, + "duration_ms": 8367, + "input_tokens": 34, + "output_tokens": 819, + "cache_read_tokens": 232038, + "tool_calls": [ + "load_osm_model", + "get_model_summary", + "list_model_objects", + "get_space_type_details" + ], + "num_tool_calls": 
4, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__get_model_summary", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__get_space_type_details" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[set_run_period_L1]", + "passed": true, + "duration_s": 6.5, + "tier": "progressive", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.02422575, + "duration_ms": 4408, + "input_tokens": 18, + "output_tokens": 459, + "cache_read_tokens": 111165, + "tool_calls": [ + "load_osm_model", + "set_run_period" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__set_run_period" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[set_run_period_L2]", + "passed": true, + "duration_s": 6.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.0240524, + "duration_ms": 3981, + "input_tokens": 18, + "output_tokens": 426, + "cache_read_tokens": 111169, + "tool_calls": [ + "load_osm_model", + "set_run_period" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__set_run_period" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[set_run_period_L3]", + "passed": true, + "duration_s": 9.4, + "tier": "progressive", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.0307273, + "duration_ms": 7385, + "input_tokens": 26, + "output_tokens": 539, + "cache_read_tokens": 171163, + "tool_calls": [ + "load_osm_model", + "set_run_period" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__set_run_period" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[ideal_air_L1]", + "passed": true, + 
"duration_s": 13.9, + "tier": "progressive", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.03878915, + "duration_ms": 11833, + "input_tokens": 34, + "output_tokens": 760, + "cache_read_tokens": 231389, + "tool_calls": [ + "load_osm_model", + "enable_ideal_air_loads", + "save_osm_model", + "get_model_summary" + ], + "num_tool_calls": 4, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__enable_ideal_air_loads", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__get_model_summary" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[ideal_air_L2]", + "passed": true, + "duration_s": 10.6, + "tier": "progressive", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.024232100000000003, + "duration_ms": 8492, + "input_tokens": 18, + "output_tokens": 472, + "cache_read_tokens": 111166, + "tool_calls": [ + "load_osm_model", + "enable_ideal_air_loads" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__enable_ideal_air_loads" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[ideal_air_L3]", + "passed": true, + "duration_s": 14.3, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.0484898, + "duration_ms": 12221, + "input_tokens": 34, + "output_tokens": 663, + "cache_read_tokens": 223158, + "tool_calls": [ + "load_osm_model", + "enable_ideal_air_loads", + "list_thermal_zones" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__enable_ideal_air_loads", + "mcp__openstudio__list_thermal_zones" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[save_model_L1]", + "passed": true, + "duration_s": 8.2, + "tier": "progressive", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.03015555, + 
"duration_ms": 6152, + "input_tokens": 26, + "output_tokens": 399, + "cache_read_tokens": 170783, + "tool_calls": [ + "load_osm_model", + "save_osm_model" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__save_osm_model" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[save_model_L2]", + "passed": true, + "duration_s": 5.9, + "tier": "progressive", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.02387865, + "duration_ms": 3790, + "input_tokens": 18, + "output_tokens": 404, + "cache_read_tokens": 111169, + "tool_calls": [ + "load_osm_model", + "save_osm_model" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__save_osm_model" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[save_model_L3]", + "passed": true, + "duration_s": 10.2, + "tier": "progressive", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.030628950000000002, + "duration_ms": 8111, + "input_tokens": 26, + "output_tokens": 469, + "cache_read_tokens": 170817, + "tool_calls": [ + "load_osm_model", + "save_osm_model" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__save_osm_model" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[add_ev_L1]", + "passed": true, + "duration_s": 15.0, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.03450405, + "duration_ms": 12867, + "input_tokens": 26, + "output_tokens": 900, + "cache_read_tokens": 172568, + "tool_calls": [ + "load_osm_model", + "add_ev_load", + "save_osm_model" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__add_ev_load", + "mcp__openstudio__save_osm_model" + ], + 
"toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[add_ev_L2]", + "passed": true, + "duration_s": 16.8, + "tier": "progressive", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.041585250000000004, + "duration_ms": 14765, + "input_tokens": 34, + "output_tokens": 1032, + "cache_read_tokens": 233075, + "tool_calls": [ + "load_osm_model", + "get_model_summary", + "add_ev_load", + "save_osm_model" + ], + "num_tool_calls": 4, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__get_model_summary", + "mcp__openstudio__add_ev_load", + "mcp__openstudio__save_osm_model" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[add_ev_L3]", + "passed": true, + "duration_s": 10.9, + "tier": "progressive", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.026012, + "duration_ms": 8778, + "input_tokens": 18, + "output_tokens": 559, + "cache_read_tokens": 111165, + "tool_calls": [ + "load_osm_model", + "add_ev_load" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "mcp__openstudio__load_osm_model", + "mcp__openstudio__add_ev_load" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[list_measures_L1]", + "passed": true, + "duration_s": 6.0, + "tier": "progressive", + "attempt": 1, + "num_turns": 2, + "cost_usd": 0.03302125, + "duration_ms": 3954, + "input_tokens": 18, + "output_tokens": 345, + "cache_read_tokens": 103070, + "tool_calls": [ + "list_custom_measures" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "mcp__openstudio__list_custom_measures" + ], + "toolsearch_count": 0, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[list_measures_L2]", + "passed": true, + "duration_s": 6.5, + "tier": "progressive", + "attempt": 1, + "num_turns": 2, + "cost_usd": 0.034313750000000004, + 
"duration_ms": 4439, + "input_tokens": 18, + "output_tokens": 609, + "cache_read_tokens": 103070, + "tool_calls": [ + "list_custom_measures" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "mcp__openstudio__list_custom_measures" + ], + "toolsearch_count": 0, + "is_timeout": false + } + ] +} \ No newline at end of file diff --git a/docs/sweeps/haiku-2026-03-28/benchmark.md b/docs/sweeps/haiku-2026-03-28/benchmark.md new file mode 100644 index 0000000..df352ec --- /dev/null +++ b/docs/sweeps/haiku-2026-03-28/benchmark.md @@ -0,0 +1,303 @@ +# LLM Benchmark Report + +**Date:** 2026-03-28T18:32:55+00:00 +**Model:** haiku | **Retries:** 0 +**Result:** 160/180 passed (88.9%) in 4775s +**Tokens:** 8.9k in + 307.7k out + 66.6M cache | **Cost:** $11.2110 (notional API pricing) + +## Summary by Tier + +| Tier | Passed | Rate | Time | Avg | +|--------|---------|--------|--------|--------| +| setup | 6/6 | 100.0% | 114s | 19s | +| tier1 | 4/4 | 100.0% | 76s | 19s | +| tier2 | 31/37 | 83.8% | 1857s | 50s | +| tier3 | 19/26 | 73.1% | 1127s | 43s | +| tier4 | 3/3 | 100.0% | 72s | 24s | +| progressive | 97/104 | 93.3% | 1529s | 15s | + +## Detailed Results + +### setup + +| Test | Result | Time | Turns | Tools | In Tok | Out Tok | Cache | Cost | Att | +|--------------------------------|--------|------|-------|-------------------------------------------------------------------------------------------------------------------------------------------------------|--------|---------|--------|---------|-----| +| test_create_baseline_model | PASS | 15s | 2 | create_baseline_osm | 18 | 699 | 67.6k | $0.0755 | 1 | +| test_create_baseline_with_hvac | PASS | 15s | 2 | create_baseline_osm | 18 | 790 | 111.2k | $0.0260 | 1 | +| test_create_example_model | PASS | 8s | 2 | create_example_osm | 18 | 442 | 111.1k | $0.0239 | 1 | +| test_load_baseline_model | PASS | 7s | 3 | load_osm_model, list_thermal_zones | 26 | 453 | 162.7k | $0.0404 | 1 | +| test_run_baseline_simulation | PASS | 21s | 8 | 
load_osm_model, change_building_location, run_simulation, get_run_status, run_simulation, get_run_status, get_run_status | 58 | 1.4k | 417.0k | $0.0631 | 1 | +| test_run_retrofit_simulation | PASS | 47s | 9 | load_osm_model, change_building_location, adjust_thermostat_setpoints, run_simulation, get_run_status, save_osm_model, run_simulation, get_run_status | 74 | 1.5k | 541.8k | $0.0762 | 1 | + +### tier1 + +| Test | Result | Time | Turns | Tools | In Tok | Out Tok | Cache | Cost | Att | +|-------------------------------------|--------|------|-------|-----------------------------------------------------------------------------------|--------|---------|--------|---------|-----| +| What is the server status? | PASS | 5s | 2 | get_server_status | 18 | 196 | 111.1k | $0.0224 | 1 | +| List available skills | PASS | 7s | 2 | list_skills | 18 | 418 | 103.1k | $0.0335 | 1 | +| Create a small office building usin | PASS | 45s | 5 | create_new_building, create_new_building, list_weather_files, create_new_building | 42 | 1.6k | 305.9k | $0.0638 | 1 | +| Create bar geometry for a retail bu | PASS | 18s | 2 | create_bar_building | 18 | 1.3k | 111.1k | $0.0306 | 1 | + +### tier2 + +| Test | Result | Time | Turns | Tools | In Tok | Out Tok | Cache | Cost | Att | +|---------------------------------------|--------|------|-------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------|---------|--------|---------|-----| +| systemd_fourpipebeam_e2e | PASS | 308s | 
34 | load_osm_model, list_weather_files, change_building_location, save_osm_model, run_simulation, get_run_status, get_run_status, extract_summary_metrics, list_air_loops, list_plant_loops, search_wiring_patterns, create_measure, test_measure, edit_measure, test_measure, apply_measure, edit_measure, apply_measure, save_osm_model, run_simulation, get_run_status, extract_summary_metrics, compare_runs, copy_file | 258 | 9.9k | 2.4M | $0.3384 | 1 | +| add_vav_reheat | PASS | 24s | 4 | load_osm_model, list_thermal_zones, add_baseline_system | 34 | 1.0k | 231.8k | $0.0411 | 1 | +| add_doas | PASS | 15s | 5 | load_osm_model, list_thermal_zones, add_doas_system, save_osm_model | 42 | 1.4k | 294.2k | $0.0511 | 1 | +| add_vrf | PASS | 11s | 5 | load_osm_model, list_thermal_zones, add_vrf_system, save_osm_model | 42 | 928 | 293.0k | $0.0472 | 1 | +| set_weather | PASS | 15s | 3 | load_osm_model, change_building_location | 26 | 822 | 171.4k | $0.0327 | 1 | +| add_rooftop_pv | PASS | 12s | 3 | load_osm_model, add_rooftop_pv | 26 | 523 | 171.2k | $0.0309 | 1 | +| adjust_thermostat | PASS | 19s | 3 | load_osm_model, adjust_thermostat_setpoints | 26 | 702 | 171.3k | $0.0318 | 1 | +| delete_space | PASS | 9s | 4 | load_osm_model, list_spaces, delete_object | 34 | 570 | 231.1k | $0.0387 | 1 | +| qaqc_check | FAIL | 23s | 6 | load_osm_model, validate_model, run_simulation, get_run_status, extract_simulation_errors | 42 | 2.0k | 294.1k | $0.0607 | 1 | +| create_bar_office | PASS | 15s | 3 | create_bar_building, list_spaces | 26 | 993 | 172.9k | $0.0373 | 1 | +| create_new_building | PASS | 52s | 2 | create_new_building | 18 | 1.5k | 111.2k | $0.0303 | 1 | +| bar_then_typical | PASS | 50s | 8 | create_bar_building, change_building_location, create_typical_building, read_file | 66 | 1.7k | 487.2k | $0.0729 | 1 | +| import_floorspacejs | FAIL | 12s | 1 | — | 10 | 748 | 51.5k | $0.0190 | 1 | +| floorspacejs_to_typical | FAIL | 11s | 2 | import_floorspacejs | 18 | 821 | 111.3k | $0.0261 | 
1 | +| manual_geometry_match | PASS | 20s | 8 | create_example_osm, create_space_from_floor_print, create_space_from_floor_print, match_surfaces, list_surfaces, list_surfaces, save_osm_model | 50 | 1.9k | 356.2k | $0.0607 | 1 | +| envelope_retrofit | FAIL | 12s | 4 | load_osm_model, list_surfaces, list_materials | 18 | 1.0k | 111.2k | $0.0295 | 1 | +| create_and_assign_loads | FAIL | 12s | 3 | load_osm_model, list_spaces | 18 | 1.2k | 111.2k | $0.0291 | 1 | +| plant_loop_with_boiler | PASS | 11s | 4 | load_osm_model, create_plant_loop, add_supply_equipment | 34 | 728 | 231.5k | $0.0381 | 1 | +| inspect_and_modify_boiler | PASS | 15s | 5 | load_osm_model, list_model_objects, get_object_fields, set_object_property | 42 | 974 | 292.8k | $0.0469 | 1 | +| extract_results_chain | PASS | 13s | 5 | extract_summary_metrics, extract_end_use_breakdown, get_run_status, extract_simulation_errors | 26 | 791 | 171.4k | $0.0332 | 1 | +| hvac_chilled_beam_comparison | PASS | 41s | 18 | load_osm_model, list_air_loops, replace_air_terminals, save_osm_model, run_simulation, get_run_status, extract_simulation_errors, list_weather_files, change_building_location, save_osm_model, run_simulation, get_run_status, get_run_status, get_run_status, get_run_status, get_run_status, extract_end_use_breakdown | 146 | 2.9k | 1.2M | $0.1725 | 1 | +| create_test_apply_measure | PASS | 15s | 5 | load_osm_model, create_measure, test_measure, apply_measure | 26 | 1.2k | 171.8k | $0.0353 | 1 | +| measure_set_lights_full_chain | PASS | 97s | 31 | load_osm_model, save_osm_model, run_simulation, get_run_status, get_weather_info, list_weather_files, change_building_location, load_osm_model, save_osm_model, run_simulation, get_run_status, list_model_objects, load_osm_model, change_building_location, save_osm_model, run_simulation, get_run_status, get_run_status, get_run_status, extract_summary_metrics, load_osm_model, create_measure, test_measure, apply_measure, save_osm_model, run_simulation, get_run_status, 
get_run_status, get_run_status, extract_summary_metrics | 210 | 4.4k | 1.8M | $0.2375 | 1 | +| measure_set_infiltration_full_chain | PASS | 54s | 21 | load_osm_model, save_osm_model, run_simulation, get_run_status, get_run_status, get_run_status, get_run_status, extract_summary_metrics, load_osm_model, search_api, create_measure, test_measure, apply_measure, save_osm_model, run_simulation, get_run_status, get_run_status, get_run_status, get_run_status, extract_summary_metrics | 154 | 3.7k | 1.2M | $0.1580 | 1 | +| measure_replace_terminals_full_chain | FAIL | 71s | 21 | load_osm_model, save_osm_model, run_simulation, get_run_status, get_run_status, get_run_status, get_run_status, extract_summary_metrics, load_osm_model, search_api, create_measure, test_measure, apply_measure, save_osm_model, run_simulation, get_run_status, get_run_status, get_run_status, get_run_status, extract_summary_metrics | 154 | 3.7k | 1.2M | $0.1580 | 1 | +| create_measure_with_args | PASS | 87s | 10 | create_measure, test_measure, edit_measure, test_measure, create_baseline_osm, test_measure, apply_measure, list_model_objects, get_construction_details | 82 | 8.4k | 650.0k | $0.1305 | 1 | +| measure_add_baseboards_full_chain | PASS | 122s | 24 | load_osm_model, save_osm_model, get_weather_info, run_simulation, get_run_status, get_run_status, get_run_status, extract_summary_metrics, load_osm_model, create_measure, test_measure, edit_measure, test_measure, search_api, edit_measure, test_measure, apply_measure, save_osm_model, run_simulation, get_run_status, extract_summary_metrics | 186 | 4.3k | 1.5M | $0.1880 | 1 | +| ruby_measure_reduce_plugloads | PASS | 185s | 36 | load_osm_model, save_osm_model, get_weather_info, run_simulation, get_run_status, get_run_status, get_run_status, extract_summary_metrics, load_osm_model, create_measure, test_measure, edit_measure, test_measure, search_api, apply_measure, search_wiring_patterns, edit_measure, apply_measure, edit_measure, apply_measure, 
edit_measure, apply_measure, search_api, edit_measure, apply_measure, get_run_logs, edit_measure, apply_measure, list_model_objects, get_object_fields, set_object_property, save_osm_model, run_simulation, get_run_status, get_run_status, get_run_status | 282 | 16.8k | 2.5M | $0.3753 | 1 | +| python_measure_reduce_plugloads | PASS | 130s | 24 | load_osm_model, save_osm_model, run_simulation, get_run_status, get_run_status, extract_summary_metrics, load_osm_model, create_measure, test_measure, edit_measure, test_measure, search_api, search_api, edit_measure, test_measure, apply_measure, save_osm_model, run_simulation, get_run_status, extract_summary_metrics | 194 | 7.2k | 1.5M | $0.2173 | 1 | +| ruby_measure_boiler_efficiency | PASS | 62s | 24 | load_osm_model, save_osm_model, run_simulation, get_run_status, get_run_status, get_run_status, get_run_status, get_run_status, extract_summary_metrics, load_osm_model, create_measure, test_measure, edit_measure, test_measure, apply_measure, save_osm_model, run_simulation, get_run_status, get_run_status, get_run_status, get_run_status, get_run_status, extract_summary_metrics | 178 | 5.6k | 1.4M | $0.2040 | 1 | +| python_measure_boiler_efficiency | PASS | 65s | 24 | load_osm_model, save_osm_model, get_weather_info, run_simulation, get_run_status, get_run_status, get_run_status, get_run_artifacts, extract_summary_metrics, load_osm_model, create_measure, create_measure, test_measure, edit_measure, test_measure, apply_measure, save_osm_model, run_simulation, get_run_artifacts, get_run_status, get_run_artifacts, extract_summary_metrics, compare_runs | 178 | 6.2k | 1.4M | $0.2050 | 1 | +| test_create_measure_with_args_quality | PASS | 114s | 16 | get_skill, create_measure, create_baseline_osm, test_measure, edit_measure, test_measure, edit_measure, test_measure, edit_measure, test_measure, test_measure, apply_measure, get_surface_details, get_construction_details, save_osm_model | 122 | 11.3k | 1.0M | $0.1816 | 1 | +| 
test_complex_model_multi_query | PASS | 12s | 6 | load_osm_model, get_building_info, list_air_loops, list_plant_loops, list_thermal_zones | 18 | 854 | 111.2k | $0.0279 | 1 | +| Ruby | PASS | 56s | 2 | create_measure | 18 | 8.9k | 111.2k | $0.0763 | 1 | +| Python | PASS | 31s | 2 | create_measure | 18 | 4.3k | 111.2k | $0.0479 | 1 | +| Ruby | PASS | 31s | 2 | create_measure | 18 | 4.4k | 111.3k | $0.0485 | 1 | +| Python | PASS | 23s | 2 | create_measure | 18 | 3.1k | 111.3k | $0.0404 | 1 | + +### tier3 + +| Test | Result | Time | Turns | Tools | In Tok | Out Tok | Cache | Cost | Att | +|--------------------------------------------------|--------|------|-------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------|---------|--------|---------|-----| +| add-hvac:Add HVAC to the model | PASS | 15s | 4 | load_osm_model, get_building_info, list_thermal_zones | 26 | 939 | 171.1k | $0.0340 | 1 | +| add-hvac:Set up heating and cooling | PASS | 19s | 8 | load_osm_model, get_building_info, list_thermal_zones, get_weather_info, 
list_baseline_systems, add_baseline_system, save_osm_model | 50 | 1.5k | 358.8k | $0.0605 | 1 | +| add-hvac:What HVAC system should I use? | PASS | 17s | 5 | load_osm_model, get_building_info, list_baseline_systems, recommend_tools | 26 | 1.2k | 171.1k | $0.0371 | 1 | +| add-hvac:Add a VAV system | PASS | 19s | 4 | load_osm_model, list_thermal_zones, add_baseline_system | 26 | 1.1k | 171.9k | $0.0363 | 1 | +| energy-report:Give me a full energy report | FAIL | 57s | 4 | load_osm_model, list_thermal_zones, add_baseline_system | 26 | 1.1k | 171.9k | $0.0363 | 1 | +| new-building:Create a small office building | PASS | 55s | 7 | create_new_building, create_new_building, list_weather_files, create_new_building, create_bar_building, create_baseline_osm | 58 | 2.7k | 456.9k | $0.0897 | 1 | +| new-building:Model a 3-story school | PASS | 132s | 10 | list_skills, get_skill, list_weather_files, create_new_building, get_building_info, list_air_loops, list_plant_loops, view_model | 58 | 2.7k | 435.3k | $0.0915 | 1 | +| new-building:Create a retail building, 25000 sqf | PASS | 71s | 8 | create_new_building, create_new_building, create_bar_building, create_bar_building, list_weather_files, create_new_building, save_osm_model | 66 | 2.9k | 516.3k | $0.0951 | 1 | +| new-building:Import the FloorspaceJS floor plan | PASS | 18s | 4 | import_floorspacejs, list_files, import_floorspacejs | 34 | 1.1k | 232.7k | $0.0421 | 1 | +| new-building:Create a bar building for a medium | PASS | 18s | 2 | create_bar_building | 18 | 1.1k | 111.1k | $0.0294 | 1 | +| qaqc:Check the model for issues | PASS | 19s | 11 | load_osm_model, validate_model, get_model_summary, get_building_info, get_weather_info, list_air_loops, list_plant_loops, list_thermal_zones, get_simulation_control, get_run_period | 34 | 1.2k | 232.0k | $0.0468 | 1 | +| qaqc:Validate before simulation | FAIL | 10s | 3 | load_osm_model, validate_model | 26 | 454 | 170.8k | $0.0303 | 1 | +| qaqc:QA/QC the model | FAIL | 10s | 3 | 
load_osm_model, validate_model | 26 | 644 | 171.2k | $0.0312 | 1 | +| qaqc:Is my model ready to simulate? | FAIL | 15s | 8 | load_osm_model, validate_model, get_weather_info, get_building_info, get_simulation_control, list_air_loops, list_plant_loops | 34 | 835 | 223.0k | $0.0489 | 1 | +| retrofit:Compare before and after adding ins | PASS | 160s | 41 | load_osm_model, validate_model, run_simulation, get_run_status, get_run_status, get_run_status, get_run_status, extract_summary_metrics, extract_end_use_breakdown, list_common_measures, create_measure, apply_measure, edit_measure, apply_measure, get_skill, edit_measure, apply_measure, save_osm_model, run_simulation, get_run_status, load_osm_model, edit_measure, apply_measure, search_api, edit_measure, apply_measure, list_surfaces, edit_measure, apply_measure, edit_measure, apply_measure, save_osm_model, run_simulation, get_run_status, get_run_status, get_run_status, get_run_status, extract_summary_metrics, extract_end_use_breakdown, compare_runs | 306 | 9.2k | 2.6M | $0.3513 | 1 | +| retrofit:Do a retrofit analysis | PASS | 85s | 52 | load_osm_model, get_building_info, get_model_summary, list_air_loops, list_plant_loops, get_weather_info, save_osm_model, run_simulation, get_run_status, get_run_status, get_run_status, get_run_status, get_run_status, get_run_status, get_run_status, get_run_status, extract_summary_metrics, extract_end_use_breakdown, load_osm_model, list_surfaces, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, save_osm_model, run_simulation, load_osm_model, replace_air_terminals, save_osm_model, run_simulation, load_osm_model, add_rooftop_pv, save_osm_model, run_simulation, get_run_status, get_run_status, get_run_status, get_run_status, get_run_status, get_run_status, get_run_status, extract_summary_metrics, extract_summary_metrics, 
extract_summary_metrics, extract_end_use_breakdown, extract_end_use_breakdown, extract_end_use_breakdown | 242 | 7.4k | 2.1M | $0.2822 | 1 | +| simulate:Run a simulation | FAIL | 29s | 52 | load_osm_model, get_building_info, get_model_summary, list_air_loops, list_plant_loops, get_weather_info, save_osm_model, run_simulation, get_run_status, get_run_status, get_run_status, get_run_status, get_run_status, get_run_status, get_run_status, get_run_status, extract_summary_metrics, extract_end_use_breakdown, load_osm_model, list_surfaces, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, save_osm_model, run_simulation, load_osm_model, replace_air_terminals, save_osm_model, run_simulation, load_osm_model, add_rooftop_pv, save_osm_model, run_simulation, get_run_status, get_run_status, get_run_status, get_run_status, get_run_status, get_run_status, get_run_status, extract_summary_metrics, extract_summary_metrics, extract_summary_metrics, extract_end_use_breakdown, extract_end_use_breakdown, extract_end_use_breakdown | 242 | 7.4k | 2.1M | $0.2822 | 1 | +| simulate:Simulate the model | FAIL | 28s | 52 | load_osm_model, get_building_info, get_model_summary, list_air_loops, list_plant_loops, get_weather_info, save_osm_model, run_simulation, get_run_status, get_run_status, get_run_status, get_run_status, get_run_status, get_run_status, get_run_status, get_run_status, extract_summary_metrics, extract_end_use_breakdown, load_osm_model, list_surfaces, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, save_osm_model, run_simulation, load_osm_model, replace_air_terminals, save_osm_model, run_simulation, load_osm_model, add_rooftop_pv, save_osm_model, run_simulation, 
get_run_status, get_run_status, get_run_status, get_run_status, get_run_status, get_run_status, get_run_status, extract_summary_metrics, extract_summary_metrics, extract_summary_metrics, extract_end_use_breakdown, extract_end_use_breakdown, extract_end_use_breakdown | 242 | 7.4k | 2.1M | $0.2822 | 1 | +| simulate:Run EnergyPlus | PASS | 24s | 12 | load_osm_model, get_weather_info, run_simulation, get_run_status, get_run_status, get_run_status, get_run_status, get_run_status, get_run_status, extract_summary_metrics, extract_end_use_breakdown | 90 | 2.0k | 651.9k | $0.1009 | 1 | +| troubleshoot:My simulation failed | FAIL | 58s | 12 | load_osm_model, get_weather_info, run_simulation, get_run_status, get_run_status, get_run_status, get_run_status, get_run_status, get_run_status, extract_summary_metrics, extract_end_use_breakdown | 90 | 2.0k | 651.9k | $0.1009 | 1 | +| troubleshoot:EUI looks way too high | PASS | 99s | 44 | load_osm_model, extract_summary_metrics, get_run_status, extract_simulation_errors, get_weather_info, get_building_info, change_building_location, save_osm_model, validate_model, run_simulation, get_run_status, get_run_status, get_run_status, get_run_logs, get_run_logs, get_run_status, extract_summary_metrics, extract_end_use_breakdown, list_air_loops, list_plant_loops, list_zone_hvac_equipment, get_plant_loop_details, get_component_properties, get_simulation_control, extract_hvac_sizing, extract_component_sizing, get_setpoint_manager_properties, extract_component_sizing, get_schedule_details, get_object_fields, list_model_objects, list_model_objects, get_schedule_details, get_object_fields, list_thermal_zones, get_schedule_details, get_schedule_details, get_object_fields, list_spaces, get_space_details, get_space_type_details, list_model_objects, get_load_details | 266 | 7.3k | 2.4M | $0.3259 | 1 | +| troubleshoot:Too many unmet hours | PASS | 120s | 0 | load_osm_model, extract_summary_metrics, get_run_status, extract_simulation_errors, 
list_weather_files, change_building_location, save_osm_model, run_simulation, get_run_status, extract_summary_metrics, extract_simulation_errors, list_output_variables, load_osm_model, add_output_meter, add_output_meter, add_output_variable, add_output_variable, add_output_variable, save_osm_model, run_simulation, get_run_status, extract_summary_metrics, get_building_info, list_thermal_zones, list_air_loops, list_plant_loops, validate_model, get_run_logs, change_building_location, save_osm_model, run_simulation, get_run_status, get_run_status, get_run_status, get_run_status, extract_summary_metrics, extract_end_use_breakdown, extract_hvac_sizing, extract_component_sizing, get_component_properties, get_plant_loop_details, extract_component_sizing, query_timeseries, list_output_variables, load_osm_model, set_component_properties, set_component_properties, search_api | 0 | 0 | 0 | $0.0000 | 1 | +| troubleshoot:Why did EnergyPlus crash? | PASS | 9s | 4 | load_osm_model, get_run_status, extract_simulation_errors | 18 | 713 | 111.2k | $0.0263 | 1 | +| view:Show me the model | PASS | 19s | 8 | load_osm_model, get_model_summary, get_building_info, view_model, list_thermal_zones, list_air_loops, list_plant_loops | 34 | 1.0k | 232.5k | $0.0460 | 1 | +| view:Visualize the building | PASS | 12s | 3 | load_osm_model, view_model | 18 | 500 | 111.2k | $0.0245 | 1 | +| view:3D view | PASS | 9s | 3 | load_osm_model, view_model | 18 | 393 | 111.2k | $0.0239 | 1 | + +### tier4 + +| Test | Result | Time | Turns | Tools | In Tok | Out Tok | Cache | Cost | Att | +|--------------------------------------------|--------|------|-------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------|---------|--------|---------|-----| +| test_create_uses_mcp_not_raw_idf | PASS | 40s | 9 | list_skills, get_skill, create_new_building, list_weather_files, 
create_new_building, save_osm_model, get_model_summary, get_building_info | 58 | 1.3k | 433.2k | $0.0749 | 1 | +| test_no_script_for_results | PASS | 11s | 2 | extract_summary_metrics | 18 | 430 | 111.2k | $0.0239 | 1 | +| test_inspect_component_uses_mcp_not_script | PASS | 20s | 10 | load_osm_model, list_model_objects, get_component_properties, get_object_fields, list_model_objects, list_model_objects, list_model_objects, get_component_properties, get_object_fields | 66 | 1.8k | 479.0k | $0.0731 | 1 | + +### progressive + +| Test | Result | Time | Turns | Tools | In Tok | Out Tok | Cache | Cost | Att | +|-------------------------|--------|------|-------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------|---------|--------|---------|-----| +| import_floorplan_L1 | FAIL | 7s | 1 | — | 10 | 445 | 51.5k | $0.0175 | 1 | +| import_floorplan_L2 | PASS | 18s | 4 | import_floorspacejs, list_files, import_floorspacejs | 34 | 1.3k | 231.9k | $0.0417 | 1 | +| import_floorplan_L3 | FAIL | 14s | 1 | — | 10 | 1.0k | 51.5k | $0.0204 | 1 | +| add_hvac_L1 | PASS | 19s | 8 | load_osm_model, get_building_info, list_thermal_zones, add_baseline_system, save_osm_model, list_air_loops, list_plant_loops | 58 | 1.5k | 417.5k | $0.0713 | 1 | +| add_hvac_L2 | PASS | 35s | 5 | load_osm_model, list_thermal_zones, add_baseline_system, save_osm_model | 42 | 1.0k | 293.6k | $0.0477 | 1 | +| add_hvac_L3 | PASS | 13s | 5 | load_osm_model, list_thermal_zones, add_baseline_system, 
save_osm_model | 42 | 1.1k | 293.5k | $0.0483 | 1 | +| view_model_L1 | PASS | 11s | 3 | load_osm_model, view_model | 18 | 391 | 111.2k | $0.0240 | 1 | +| view_model_L2 | PASS | 11s | 3 | load_osm_model, view_model | 26 | 552 | 171.2k | $0.0309 | 1 | +| view_model_L3 | PASS | 11s | 3 | load_osm_model, view_model | 26 | 512 | 171.1k | $0.0306 | 1 | +| set_weather_L1 | PASS | 19s | 4 | load_osm_model, list_weather_files, change_building_location | 26 | 1.2k | 178.7k | $0.0448 | 1 | +| set_weather_L2 | PASS | 28s | 5 | load_osm_model, change_building_location, list_weather_files, change_building_location | 34 | 1.8k | 240.3k | $0.0554 | 1 | +| set_weather_L3 | PASS | 22s | 5 | load_osm_model, change_building_location, list_weather_files, change_building_location | 34 | 1.1k | 239.9k | $0.0513 | 1 | +| run_qaqc_L1 | PASS | 11s | 5 | load_osm_model, validate_model, get_model_summary, get_building_info | 18 | 950 | 111.2k | $0.0275 | 1 | +| run_qaqc_L2 | PASS | 10s | 4 | load_osm_model, validate_model, get_model_summary | 26 | 750 | 171.3k | $0.0320 | 1 | +| run_qaqc_L3 | PASS | 12s | 4 | load_osm_model, validate_model, get_model_summary | 26 | 899 | 171.1k | $0.0328 | 1 | +| create_building_L1 | PASS | 27s | 4 | create_new_building, create_new_building, create_baseline_osm | 34 | 1.5k | 235.0k | $0.0465 | 1 | +| create_building_L2 | FAIL | 15s | 1 | — | 10 | 1.2k | 51.5k | $0.0214 | 1 | +| create_building_L3 | PASS | 16s | 2 | create_bar_building | 18 | 1.0k | 111.2k | $0.0294 | 1 | +| add_pv_L1 | PASS | 19s | 4 | load_osm_model, add_rooftop_pv, save_osm_model | 34 | 899 | 231.7k | $0.0392 | 1 | +| add_pv_L2 | PASS | 19s | 4 | load_osm_model, add_rooftop_pv, save_osm_model | 34 | 821 | 231.7k | $0.0389 | 1 | +| add_pv_L3 | PASS | 13s | 3 | load_osm_model, add_rooftop_pv | 26 | 625 | 179.3k | $0.0222 | 1 | +| thermostat_L1 | PASS | 16s | 4 | load_osm_model, adjust_thermostat_setpoints, save_osm_model | 34 | 668 | 231.4k | $0.0377 | 1 | +| thermostat_L2 | PASS | 14s | 4 | 
load_osm_model, adjust_thermostat_setpoints, save_osm_model | 26 | 660 | 171.5k | $0.0319 | 1 | +| thermostat_L3 | PASS | 13s | 3 | load_osm_model, adjust_thermostat_setpoints | 18 | 476 | 111.2k | $0.0245 | 1 | +| list_spaces_L1 | PASS | 9s | 3 | load_osm_model, list_spaces | 26 | 504 | 163.0k | $0.0404 | 1 | +| list_spaces_L2 | PASS | 14s | 3 | load_osm_model, list_spaces | 26 | 618 | 171.1k | $0.0334 | 1 | +| list_spaces_L3 | PASS | 7s | 3 | load_osm_model, list_spaces | 18 | 674 | 111.2k | $0.0259 | 1 | +| schedules_L1 | PASS | 10s | 3 | load_osm_model, list_model_objects | 26 | 750 | 171.1k | $0.0326 | 1 | +| schedules_L2 | PASS | 11s | 3 | load_osm_model, list_model_objects | 26 | 596 | 171.1k | $0.0319 | 1 | +| schedules_L3 | PASS | 7s | 3 | load_osm_model, list_model_objects | 18 | 437 | 111.2k | $0.0251 | 1 | +| inspect_component_L1 | PASS | 10s | 4 | load_osm_model, list_plant_loops, get_component_properties | 34 | 588 | 232.2k | $0.0393 | 1 | +| inspect_component_L2 | PASS | 13s | 5 | load_osm_model, list_model_objects, get_component_properties, get_object_fields | 42 | 1.1k | 291.5k | $0.0479 | 1 | +| inspect_component_L3 | PASS | 12s | 4 | load_osm_model, list_model_objects, get_object_fields | 34 | 936 | 231.4k | $0.0402 | 1 | +| modify_component_L1 | PASS | 16s | 6 | load_osm_model, list_plant_loops, get_component_properties, set_component_properties, save_osm_model | 50 | 890 | 355.9k | $0.0538 | 1 | +| modify_component_L2 | PASS | 10s | 5 | load_osm_model, list_model_objects, set_component_properties, save_osm_model | 42 | 709 | 291.5k | $0.0445 | 1 | +| modify_component_L3 | PASS | 25s | 6 | load_osm_model, list_model_objects, set_object_property, get_object_fields, set_object_property | 50 | 1.8k | 354.4k | $0.0591 | 1 | +| list_dynamic_type_L1 | PASS | 31s | 23 | load_osm_model, list_model_objects, list_model_objects, list_model_objects, list_air_loops, list_thermal_zones, list_plant_loops, get_sizing_system_properties, 
get_sizing_zone_properties, get_sizing_properties, get_sizing_properties, get_sizing_properties, get_object_fields, get_object_fields, get_object_fields, get_plant_loop_details, get_plant_loop_details, get_plant_loop_details, get_simulation_control, get_run_period, list_model_objects, get_weather_info | 74 | 2.5k | 567.5k | $0.0926 | 1 | +| list_dynamic_type_L2 | PASS | 8s | 3 | load_osm_model, list_model_objects | 18 | 578 | 111.2k | $0.0248 | 1 | +| list_dynamic_type_L3 | PASS | 16s | 4 | load_osm_model, list_model_objects, list_model_objects | 26 | 583 | 163.3k | $0.0406 | 1 | +| floor_area_L1 | PASS | 10s | 3 | load_osm_model, get_building_info | 26 | 576 | 171.1k | $0.0311 | 1 | +| floor_area_L2 | PASS | 7s | 3 | load_osm_model, get_building_info | 26 | 356 | 170.8k | $0.0303 | 1 | +| floor_area_L3 | PASS | 10s | 3 | load_osm_model, get_building_info | 26 | 367 | 170.8k | $0.0304 | 1 | +| materials_L1 | PASS | 9s | 3 | load_osm_model, list_materials | 26 | 673 | 162.9k | $0.0417 | 1 | +| materials_L2 | PASS | 16s | 3 | load_osm_model, list_materials | 26 | 619 | 171.1k | $0.0323 | 1 | +| materials_L3 | PASS | 9s | 3 | load_osm_model, list_materials | 26 | 493 | 171.1k | $0.0316 | 1 | +| thermal_zones_L1 | FAIL | 7s | 3 | load_osm_model, get_model_summary | 26 | 403 | 171.1k | $0.0300 | 1 | +| thermal_zones_L2 | PASS | 12s | 3 | load_osm_model, list_thermal_zones | 26 | 933 | 170.8k | $0.0343 | 1 | +| thermal_zones_L3 | PASS | 10s | 3 | load_osm_model, list_thermal_zones | 26 | 430 | 170.8k | $0.0309 | 1 | +| subsurfaces_L1 | PASS | 16s | 5 | load_osm_model, list_subsurfaces, list_subsurfaces, list_subsurfaces | 42 | 573 | 291.1k | $0.0433 | 1 | +| subsurfaces_L2 | PASS | 9s | 3 | load_osm_model, list_subsurfaces | 26 | 496 | 171.1k | $0.0303 | 1 | +| subsurfaces_L3 | PASS | 8s | 3 | load_osm_model, list_subsurfaces | 26 | 401 | 171.1k | $0.0298 | 1 | +| surface_details_L1 | PASS | 12s | 5 | load_osm_model, list_surfaces, get_surface_details, 
get_construction_details | 34 | 1.0k | 232.7k | $0.0421 | 1 | +| surface_details_L2 | PASS | 15s | 4 | load_osm_model, list_surfaces, get_surface_details | 34 | 741 | 231.4k | $0.0381 | 1 | +| surface_details_L3 | PASS | 10s | 3 | load_osm_model, list_surfaces | 18 | 651 | 111.2k | $0.0392 | 1 | +| run_simulation_L1 | PASS | 37s | 20 | load_osm_model, run_simulation, get_run_status, get_run_logs, validate_model, list_air_loops, enable_ideal_air_loads, delete_object, delete_object, delete_object, delete_object, save_osm_model, load_osm_model, run_simulation, get_run_status, get_run_status, get_run_status, extract_summary_metrics, extract_end_use_breakdown | 130 | 2.9k | 1.0M | $0.1385 | 1 | +| run_simulation_L2 | PASS | 37s | 8 | load_osm_model, list_weather_files, change_building_location, save_osm_model, run_simulation, get_run_status, extract_simulation_errors | 58 | 3.0k | 457.5k | $0.0852 | 1 | +| run_simulation_L3 | PASS | 8s | 3 | load_osm_model, run_simulation | 26 | 566 | 171.1k | $0.0309 | 1 | +| get_eui_L1 | PASS | 9s | 3 | extract_summary_metrics, get_run_status | 26 | 536 | 171.2k | $0.0309 | 1 | +| get_eui_L2 | PASS | 10s | 3 | extract_summary_metrics, get_run_status | 26 | 606 | 170.8k | $0.0319 | 1 | +| get_eui_L3 | PASS | 10s | 4 | extract_summary_metrics, get_run_status, get_run_logs | 26 | 649 | 163.0k | $0.0419 | 1 | +| end_use_breakdown_L1 | PASS | 10s | 5 | extract_end_use_breakdown, get_run_status, get_run_artifacts, extract_summary_metrics | 26 | 655 | 171.0k | $0.0332 | 1 | +| end_use_breakdown_L2 | PASS | 10s | 4 | extract_end_use_breakdown, get_run_status, extract_summary_metrics | 26 | 698 | 171.1k | $0.0320 | 1 | +| end_use_breakdown_L3 | PASS | 13s | 5 | extract_end_use_breakdown, get_run_status, get_run_artifacts, extract_summary_metrics | 26 | 709 | 171.0k | $0.0335 | 1 | +| hvac_sizing_L1 | FAIL | 14s | 5 | extract_end_use_breakdown, get_run_status, get_run_artifacts, extract_summary_metrics | 26 | 709 | 171.0k | $0.0335 | 1 | +| 
hvac_sizing_L2 | PASS | 13s | 5 | extract_hvac_sizing, extract_component_sizing, get_run_status, extract_simulation_errors | 34 | 871 | 230.8k | $0.0398 | 1 | +| hvac_sizing_L3 | PASS | 8s | 2 | extract_hvac_sizing | 18 | 413 | 111.2k | $0.0236 | 1 | +| set_wwr_L1 | PASS | 14s | 12 | load_osm_model, list_surfaces, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, save_osm_model | 42 | 1.5k | 294.8k | $0.0516 | 1 | +| set_wwr_L2 | PASS | 14s | 12 | load_osm_model, list_surfaces, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, save_osm_model | 42 | 1.5k | 295.1k | $0.0517 | 1 | +| set_wwr_L3 | PASS | 20s | 12 | load_osm_model, list_surfaces, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, list_surfaces, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio | 50 | 2.0k | 355.7k | $0.0610 | 1 | +| replace_windows_L1 | PASS | 34s | 8 | load_osm_model, list_subsurfaces, list_model_objects, get_construction_details, list_common_measures, list_measure_arguments, list_files | 58 | 2.8k | 421.3k | $0.0708 | 1 | +| replace_windows_L2 | PASS | 100s | 24 | load_osm_model, list_model_objects, get_construction_details, get_construction_details, list_materials, list_subsurfaces, create_measure, test_measure, search_api, search_wiring_patterns, edit_measure, test_measure, edit_measure, test_measure, edit_measure, test_measure, apply_measure, search_api, edit_measure, apply_measure, save_osm_model, list_subsurfaces, get_construction_details | 178 | 9.1k | 1.5M | $0.2400 | 1 | +| replace_windows_L3 | FAIL | 9s | 3 | load_osm_model, list_model_objects | 26 | 826 | 
170.8k | $0.0338 | 1 | +| construction_details_L1 | PASS | 15s | 7 | load_osm_model, list_surfaces, get_construction_details, get_object_fields, get_object_fields, get_object_fields | 42 | 1.3k | 292.9k | $0.0510 | 1 | +| construction_details_L2 | PASS | 15s | 5 | load_osm_model, list_model_objects, list_model_objects, get_construction_details | 42 | 969 | 291.9k | $0.0475 | 1 | +| construction_details_L3 | PASS | 12s | 6 | load_osm_model, list_model_objects, get_construction_details, get_construction_details, get_construction_details | 34 | 1.1k | 231.8k | $0.0413 | 1 | +| check_loads_L1 | PASS | 12s | 5 | load_osm_model, list_spaces, get_space_details, get_space_type_details | 42 | 867 | 291.4k | $0.0456 | 1 | +| check_loads_L2 | PASS | 16s | 7 | load_osm_model, list_spaces, get_space_details, get_space_type_details, get_load_details, get_load_details | 50 | 1.0k | 352.8k | $0.0538 | 1 | +| check_loads_L3 | PASS | 15s | 4 | load_osm_model, list_model_objects, get_load_details | 34 | 801 | 223.0k | $0.0480 | 1 | +| create_loads_L1 | PASS | 20s | 24 | load_osm_model, list_spaces, create_people_definition, create_people_definition, create_people_definition, create_people_definition, create_people_definition, create_people_definition, create_people_definition, create_people_definition, create_people_definition, create_people_definition, create_lights_definition, create_lights_definition, create_lights_definition, create_lights_definition, create_lights_definition, create_lights_definition, create_lights_definition, create_lights_definition, create_lights_definition, create_lights_definition, save_osm_model | 42 | 2.7k | 289.2k | $0.0754 | 1 | +| create_loads_L2 | PASS | 14s | 6 | load_osm_model, list_spaces, create_people_definition, create_lights_definition, save_osm_model | 42 | 1.3k | 293.6k | $0.0495 | 1 | +| create_loads_L3 | FAIL | 15s | 3 | load_osm_model, list_spaces | 26 | 1.2k | 170.8k | $0.0366 | 1 | +| create_plant_loop_L1 | PASS | 12s | 4 | 
load_osm_model, create_plant_loop, save_osm_model | 34 | 590 | 231.1k | $0.0371 | 1 | +| create_plant_loop_L2 | PASS | 9s | 3 | load_osm_model, create_plant_loop | 18 | 512 | 111.2k | $0.0246 | 1 | +| create_plant_loop_L3 | PASS | 8s | 3 | load_osm_model, create_plant_loop | 18 | 371 | 111.2k | $0.0237 | 1 | +| schedule_details_L1 | PASS | 32s | 14 | load_osm_model, list_air_loops, list_plant_loops, list_thermal_zones, get_schedule_details, get_schedule_details, list_model_objects, get_schedule_details, list_model_objects, get_object_fields, list_model_objects, get_air_loop_details, get_component_properties | 90 | 2.5k | 763.1k | $0.1242 | 1 | +| schedule_details_L2 | PASS | 14s | 5 | load_osm_model, list_model_objects, get_schedule_details, get_schedule_details | 42 | 910 | 292.9k | $0.0467 | 1 | +| schedule_details_L3 | PASS | 11s | 4 | load_osm_model, list_model_objects, get_schedule_details | 34 | 785 | 231.6k | $0.0388 | 1 | +| space_type_info_L1 | PASS | 22s | 14 | load_osm_model, list_spaces, get_space_type_details, get_load_details, get_load_details, get_load_details, get_schedule_details, list_model_objects, list_model_objects, list_model_objects, get_object_fields, get_object_fields, get_object_fields | 58 | 1.9k | 423.4k | $0.0730 | 1 | +| space_type_info_L2 | PASS | 16s | 11 | load_osm_model, list_model_objects, get_model_summary, get_space_type_details, get_load_details, get_load_details, get_load_details, get_schedule_details, get_schedule_details, get_schedule_details | 42 | 1.3k | 292.4k | $0.0489 | 1 | +| space_type_info_L3 | PASS | 10s | 5 | load_osm_model, get_model_summary, list_model_objects, get_space_type_details | 34 | 819 | 232.0k | $0.0391 | 1 | +| set_run_period_L1 | PASS | 6s | 3 | load_osm_model, set_run_period | 18 | 459 | 111.2k | $0.0242 | 1 | +| set_run_period_L2 | PASS | 6s | 3 | load_osm_model, set_run_period | 18 | 426 | 111.2k | $0.0241 | 1 | +| set_run_period_L3 | PASS | 9s | 3 | load_osm_model, set_run_period | 26 | 539 | 
171.2k | $0.0307 | 1 | +| ideal_air_L1 | PASS | 14s | 5 | load_osm_model, enable_ideal_air_loads, save_osm_model, get_model_summary | 34 | 760 | 231.4k | $0.0388 | 1 | +| ideal_air_L2 | PASS | 11s | 3 | load_osm_model, enable_ideal_air_loads | 18 | 472 | 111.2k | $0.0242 | 1 | +| ideal_air_L3 | PASS | 14s | 4 | load_osm_model, enable_ideal_air_loads, list_thermal_zones | 34 | 663 | 223.2k | $0.0485 | 1 | +| save_model_L1 | PASS | 8s | 3 | load_osm_model, save_osm_model | 26 | 399 | 170.8k | $0.0302 | 1 | +| save_model_L2 | PASS | 6s | 3 | load_osm_model, save_osm_model | 18 | 404 | 111.2k | $0.0239 | 1 | +| save_model_L3 | PASS | 10s | 3 | load_osm_model, save_osm_model | 26 | 469 | 170.8k | $0.0306 | 1 | +| add_ev_L1 | PASS | 15s | 4 | load_osm_model, add_ev_load, save_osm_model | 26 | 900 | 172.6k | $0.0345 | 1 | +| add_ev_L2 | PASS | 17s | 5 | load_osm_model, get_model_summary, add_ev_load, save_osm_model | 34 | 1.0k | 233.1k | $0.0416 | 1 | +| add_ev_L3 | PASS | 11s | 3 | load_osm_model, add_ev_load | 18 | 559 | 111.2k | $0.0260 | 1 | +| list_measures_L1 | PASS | 6s | 2 | list_custom_measures | 18 | 345 | 103.1k | $0.0330 | 1 | +| list_measures_L2 | PASS | 6s | 2 | list_custom_measures | 18 | 609 | 103.1k | $0.0343 | 1 | + +## Progressive Prompt Analysis + +Pass rates by specificity level per case: + +| Case | L1 (vague) | L2 (moderate) | L3 (explicit) | +|----------------------|------------|---------------|---------------| +| import_floorplan | FAIL | PASS | FAIL | +| add_hvac | PASS | PASS | PASS | +| view_model | PASS | PASS | PASS | +| set_weather | PASS | PASS | PASS | +| run_qaqc | PASS | PASS | PASS | +| create_building | PASS | FAIL | PASS | +| add_pv | PASS | PASS | PASS | +| thermostat | PASS | PASS | PASS | +| list_spaces | PASS | PASS | PASS | +| schedules | PASS | PASS | PASS | +| inspect_component | PASS | PASS | PASS | +| modify_component | PASS | PASS | PASS | +| list_dynamic_type | PASS | PASS | PASS | +| floor_area | PASS | PASS | PASS | +| 
materials | PASS | PASS | PASS | +| thermal_zones | FAIL | PASS | PASS | +| subsurfaces | PASS | PASS | PASS | +| surface_details | PASS | PASS | PASS | +| run_simulation | PASS | PASS | PASS | +| get_eui | PASS | PASS | PASS | +| end_use_breakdown | PASS | PASS | PASS | +| hvac_sizing | FAIL | PASS | PASS | +| set_wwr | PASS | PASS | PASS | +| replace_windows | PASS | PASS | FAIL | +| construction_details | PASS | PASS | PASS | +| check_loads | PASS | PASS | PASS | +| create_loads | PASS | PASS | FAIL | +| create_plant_loop | PASS | PASS | PASS | +| schedule_details | PASS | PASS | PASS | +| space_type_info | PASS | PASS | PASS | +| set_run_period | PASS | PASS | PASS | +| ideal_air | PASS | PASS | PASS | +| save_model | PASS | PASS | PASS | +| add_ev | PASS | PASS | PASS | +| list_measures | PASS | PASS | - | + +**Summary:** L1=32/35 | L2=34/35 | L3=31/34 + +## Failure Mode Analysis + +| Mode | Count | Description | +|------|-------|-------------| +| wrong_tool | 16 | MCP tool called but not the expected one | +| no_mcp_tool | 4 | No MCP tool called (stuck in builtins) | + +## Failed Tests + +- **energy-report:Give me a full energy report** (tier3, wrong_tool): 57s, 4 turns, tools: load_osm_model -> list_thermal_zones -> add_baseline_system +- **qaqc:Validate before simulation** (tier3, wrong_tool): 10s, 3 turns, tools: load_osm_model -> validate_model +- **qaqc:QA/QC the model** (tier3, wrong_tool): 10s, 3 turns, tools: load_osm_model -> validate_model +- **qaqc:Is my model ready to simulate?** (tier3, wrong_tool): 15s, 8 turns, tools: load_osm_model -> validate_model -> get_weather_info -> get_building_info -> get_simulation_control -> list_air_loops -> list_plant_loops +- **simulate:Run a simulation** (tier3, wrong_tool): 29s, 52 turns, tools: load_osm_model -> get_building_info -> get_model_summary -> list_air_loops -> list_plant_loops -> get_weather_info -> save_osm_model -> run_simulation -> get_run_status -> get_run_status -> get_run_status -> 
get_run_status -> get_run_status -> get_run_status -> get_run_status -> get_run_status -> extract_summary_metrics -> extract_end_use_breakdown -> load_osm_model -> list_surfaces -> set_window_to_wall_ratio -> set_window_to_wall_ratio -> set_window_to_wall_ratio -> set_window_to_wall_ratio -> set_window_to_wall_ratio -> set_window_to_wall_ratio -> set_window_to_wall_ratio -> set_window_to_wall_ratio -> save_osm_model -> run_simulation -> load_osm_model -> replace_air_terminals -> save_osm_model -> run_simulation -> load_osm_model -> add_rooftop_pv -> save_osm_model -> run_simulation -> get_run_status -> get_run_status -> get_run_status -> get_run_status -> get_run_status -> get_run_status -> get_run_status -> extract_summary_metrics -> extract_summary_metrics -> extract_summary_metrics -> extract_end_use_breakdown -> extract_end_use_breakdown -> extract_end_use_breakdown +- **simulate:Simulate the model** (tier3, wrong_tool): 28s, 52 turns, tools: load_osm_model -> get_building_info -> get_model_summary -> list_air_loops -> list_plant_loops -> get_weather_info -> save_osm_model -> run_simulation -> get_run_status -> get_run_status -> get_run_status -> get_run_status -> get_run_status -> get_run_status -> get_run_status -> get_run_status -> extract_summary_metrics -> extract_end_use_breakdown -> load_osm_model -> list_surfaces -> set_window_to_wall_ratio -> set_window_to_wall_ratio -> set_window_to_wall_ratio -> set_window_to_wall_ratio -> set_window_to_wall_ratio -> set_window_to_wall_ratio -> set_window_to_wall_ratio -> set_window_to_wall_ratio -> save_osm_model -> run_simulation -> load_osm_model -> replace_air_terminals -> save_osm_model -> run_simulation -> load_osm_model -> add_rooftop_pv -> save_osm_model -> run_simulation -> get_run_status -> get_run_status -> get_run_status -> get_run_status -> get_run_status -> get_run_status -> get_run_status -> extract_summary_metrics -> extract_summary_metrics -> extract_summary_metrics -> extract_end_use_breakdown -> 
extract_end_use_breakdown -> extract_end_use_breakdown +- **troubleshoot:My simulation failed** (tier3, wrong_tool): 58s, 12 turns, tools: load_osm_model -> get_weather_info -> run_simulation -> get_run_status -> get_run_status -> get_run_status -> get_run_status -> get_run_status -> get_run_status -> extract_summary_metrics -> extract_end_use_breakdown +- **qaqc_check** (tier2, wrong_tool): 23s, 6 turns, tools: load_osm_model -> validate_model -> run_simulation -> get_run_status -> extract_simulation_errors +- **import_floorspacejs** (tier2, no_mcp_tool): 12s, 1 turns, tools: no tools called +- **floorspacejs_to_typical** (tier2, wrong_tool): 11s, 2 turns, tools: import_floorspacejs +- **envelope_retrofit** (tier2, wrong_tool): 12s, 4 turns, tools: load_osm_model -> list_surfaces -> list_materials +- **create_and_assign_loads** (tier2, wrong_tool): 12s, 3 turns, tools: load_osm_model -> list_spaces +- **measure_replace_terminals_full_chain** (tier2, wrong_tool): 71s, 21 turns, tools: load_osm_model -> save_osm_model -> run_simulation -> get_run_status -> get_run_status -> get_run_status -> get_run_status -> extract_summary_metrics -> load_osm_model -> search_api -> create_measure -> test_measure -> apply_measure -> save_osm_model -> run_simulation -> get_run_status -> get_run_status -> get_run_status -> get_run_status -> extract_summary_metrics +- **import_floorplan_L1** (progressive, no_mcp_tool): 7s, 1 turns, tools: no tools called +- **import_floorplan_L3** (progressive, no_mcp_tool): 14s, 1 turns, tools: no tools called +- **create_building_L2** (progressive, no_mcp_tool): 15s, 1 turns, tools: no tools called +- **thermal_zones_L1** (progressive, wrong_tool): 7s, 3 turns, tools: load_osm_model -> get_model_summary +- **hvac_sizing_L1** (progressive, wrong_tool): 14s, 5 turns, tools: extract_end_use_breakdown -> get_run_status -> get_run_artifacts -> extract_summary_metrics +- **replace_windows_L3** (progressive, wrong_tool): 9s, 3 turns, tools: load_osm_model 
-> list_model_objects +- **create_loads_L3** (progressive, wrong_tool): 15s, 3 turns, tools: load_osm_model -> list_spaces diff --git a/docs/sweeps/haiku-2026-03-28/benchmark_history.json b/docs/sweeps/haiku-2026-03-28/benchmark_history.json new file mode 100644 index 0000000..fa96a73 --- /dev/null +++ b/docs/sweeps/haiku-2026-03-28/benchmark_history.json @@ -0,0 +1,54 @@ +[ + { + "timestamp": "2026-03-28T18:32:55+00:00", + "model": "haiku", + "retries": 0, + "total_tests": 180, + "passed": 160, + "failed": 20, + "pass_rate": 88.9, + "total_duration_s": 4774.9, + "total_input_tokens": 8870, + "total_output_tokens": 307749, + "total_cache_read_tokens": 66583856, + "total_cost_usd": 11.211, + "tiers": { + "setup": { + "total": 6, + "passed": 6, + "duration_s": 113.7, + "pass_rate": 100.0 + }, + "tier1": { + "total": 4, + "passed": 4, + "duration_s": 75.9, + "pass_rate": 100.0 + }, + "tier3": { + "total": 26, + "passed": 19, + "duration_s": 1127.4, + "pass_rate": 73.1 + }, + "tier2": { + "total": 37, + "passed": 31, + "duration_s": 1857.0, + "pass_rate": 83.8 + }, + "tier4": { + "total": 3, + "passed": 3, + "duration_s": 71.8, + "pass_rate": 100.0 + }, + "progressive": { + "total": 104, + "passed": 97, + "duration_s": 1529.1, + "pass_rate": 93.3 + } + } + } +] \ No newline at end of file diff --git a/docs/sweeps/haiku-2026-03-28/sweep.log b/docs/sweeps/haiku-2026-03-28/sweep.log new file mode 100644 index 0000000..a1fa18d --- /dev/null +++ b/docs/sweeps/haiku-2026-03-28/sweep.log @@ -0,0 +1,1292 @@ +============================= test session starts ============================= +platform win32 -- Python 3.13.12, pytest-9.0.2, pluggy-1.6.0 -- C:\Python313\python.exe +cachedir: .pytest_cache +rootdir: C:\projects\openstudio-mcp +configfile: pyproject.toml +plugins: anyio-4.12.1, cov-7.0.0, timeout-2.4.0 +collecting ... 
collected 230 items + +tests/llm/test_01_setup.py::test_create_baseline_model PASSED [ 0%] +tests/llm/test_01_setup.py::test_create_baseline_with_hvac PASSED [ 0%] +tests/llm/test_01_setup.py::test_create_example_model PASSED [ 1%] +tests/llm/test_01_setup.py::test_load_baseline_model PASSED [ 1%] +tests/llm/test_01_setup.py::test_run_baseline_simulation PASSED [ 2%] +tests/llm/test_01_setup.py::test_run_retrofit_simulation PASSED [ 2%] +tests/llm/test_02_tool_selection.py::test_tool_selection_no_model[What is the server status?] PASSED [ 3%] +tests/llm/test_02_tool_selection.py::test_tool_selection_no_model[List available skills] PASSED [ 3%] +tests/llm/test_02_tool_selection.py::test_tool_selection_no_model[Create a small office building usin] PASSED [ 3%] +tests/llm/test_02_tool_selection.py::test_tool_selection_no_model[Create bar geometry for a retail bu] PASSED [ 4%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[add-hvac:Add HVAC to the model] PASSED [ 4%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[add-hvac:Set up heating and cooling] PASSED [ 5%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[add-hvac:What HVAC system should I use?] 
PASSED [ 5%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[add-hvac:Add a VAV system] PASSED [ 6%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[energy-report:Give me a full energy report] FAILED [ 6%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[new-building:Create a small office building] PASSED [ 6%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[new-building:Model a 3-story school] PASSED [ 7%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[new-building:Create a retail building, 25000 sqf] PASSED [ 7%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[new-building:Import the FloorspaceJS floor plan ] PASSED [ 8%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[new-building:Create a bar building for a medium ] PASSED [ 8%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[qaqc:Check the model for issues] PASSED [ 9%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[qaqc:Validate before simulation] FAILED [ 9%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[qaqc:QA/QC the model] FAILED [ 10%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[qaqc:Is my model ready to simulate?] 
FAILED [ 10%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[retrofit:Compare before and after adding ins] PASSED [ 10%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[retrofit:Do a retrofit analysis] PASSED [ 11%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[simulate:Run a simulation] FAILED [ 11%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[simulate:Simulate the model] FAILED [ 12%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[simulate:Run EnergyPlus] PASSED [ 12%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[troubleshoot:My simulation failed] FAILED [ 13%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[troubleshoot:EUI looks way too high] PASSED [ 13%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[troubleshoot:Too many unmet hours] PASSED [ 13%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[troubleshoot:Why did EnergyPlus crash?] PASSED [ 14%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[view:Show me the model] PASSED [ 14%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[view:Visualize the building] PASSED [ 15%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[view:3D view] PASSED [ 15%] +tests/llm/test_04_workflows.py::test_workflow[systemd_fourpipebeam_e2e] PASSED [ 16%] +tests/llm/test_04_workflows.py::test_workflow[add_vav_reheat] PASSED [ 16%] +tests/llm/test_04_workflows.py::test_workflow[add_doas] PASSED [ 16%] +tests/llm/test_04_workflows.py::test_workflow[add_vrf] PASSED [ 17%] +tests/llm/test_04_workflows.py::test_workflow[set_weather] PASSED [ 17%] +tests/llm/test_04_workflows.py::test_workflow[add_rooftop_pv] PASSED [ 18%] +tests/llm/test_04_workflows.py::test_workflow[adjust_thermostat] PASSED [ 18%] +tests/llm/test_04_workflows.py::test_workflow[delete_space] PASSED [ 19%] +tests/llm/test_04_workflows.py::test_workflow[qaqc_check] FAILED [ 19%] 
+tests/llm/test_04_workflows.py::test_workflow[create_bar_office] PASSED [ 20%] +tests/llm/test_04_workflows.py::test_workflow[create_new_building] PASSED [ 20%] +tests/llm/test_04_workflows.py::test_workflow[bar_then_typical] PASSED [ 20%] +tests/llm/test_04_workflows.py::test_workflow[import_floorspacejs] FAILED [ 21%] +tests/llm/test_04_workflows.py::test_workflow[floorspacejs_to_typical] FAILED [ 21%] +tests/llm/test_04_workflows.py::test_workflow[manual_geometry_match] PASSED [ 22%] +tests/llm/test_04_workflows.py::test_workflow[envelope_retrofit] FAILED [ 22%] +tests/llm/test_04_workflows.py::test_workflow[create_and_assign_loads] FAILED [ 23%] +tests/llm/test_04_workflows.py::test_workflow[plant_loop_with_boiler] PASSED [ 23%] +tests/llm/test_04_workflows.py::test_workflow[inspect_and_modify_boiler] PASSED [ 23%] +tests/llm/test_04_workflows.py::test_workflow[extract_results_chain] PASSED [ 24%] +tests/llm/test_04_workflows.py::test_workflow[hvac_chilled_beam_comparison] PASSED [ 24%] +tests/llm/test_04_workflows.py::test_workflow[create_test_apply_measure] PASSED [ 25%] +tests/llm/test_04_workflows.py::test_workflow[measure_set_lights_full_chain] PASSED [ 25%] +tests/llm/test_04_workflows.py::test_workflow[measure_set_infiltration_full_chain] PASSED [ 26%] +tests/llm/test_04_workflows.py::test_workflow[measure_replace_terminals_full_chain] FAILED [ 26%] +tests/llm/test_04_workflows.py::test_workflow[create_measure_with_args] PASSED [ 26%] +tests/llm/test_04_workflows.py::test_workflow[measure_add_baseboards_full_chain] PASSED [ 27%] +tests/llm/test_04_workflows.py::test_workflow[ruby_measure_reduce_plugloads] PASSED [ 27%] +tests/llm/test_04_workflows.py::test_workflow[python_measure_reduce_plugloads] PASSED [ 28%] +tests/llm/test_04_workflows.py::test_workflow[ruby_measure_boiler_efficiency] PASSED [ 28%] +tests/llm/test_04_workflows.py::test_workflow[python_measure_boiler_efficiency] PASSED [ 29%] 
+tests/llm/test_04_workflows.py::test_create_measure_with_args_quality PASSED [ 29%] +tests/llm/test_04_workflows.py::test_complex_model_multi_query PASSED [ 30%] +tests/llm/test_04_workflows.py::test_measure_reduce_plugloads_quality[Ruby] PASSED [ 30%] +tests/llm/test_04_workflows.py::test_measure_reduce_plugloads_quality[Python] PASSED [ 30%] +tests/llm/test_04_workflows.py::test_measure_boiler_efficiency_quality[Ruby] PASSED [ 31%] +tests/llm/test_04_workflows.py::test_measure_boiler_efficiency_quality[Python] PASSED [ 31%] +tests/llm/test_05_guardrails.py::test_create_uses_mcp_not_raw_idf PASSED [ 32%] +tests/llm/test_05_guardrails.py::test_no_script_for_results PASSED [ 32%] +tests/llm/test_05_guardrails.py::test_inspect_component_uses_mcp_not_script PASSED [ 33%] +tests/llm/test_06_progressive.py::test_progressive[import_floorplan_L1] FAILED [ 33%] +tests/llm/test_06_progressive.py::test_progressive[import_floorplan_L2] PASSED [ 33%] +tests/llm/test_06_progressive.py::test_progressive[import_floorplan_L3] FAILED [ 34%] +tests/llm/test_06_progressive.py::test_progressive[add_hvac_L1] PASSED [ 34%] +tests/llm/test_06_progressive.py::test_progressive[add_hvac_L2] PASSED [ 35%] +tests/llm/test_06_progressive.py::test_progressive[add_hvac_L3] PASSED [ 35%] +tests/llm/test_06_progressive.py::test_progressive[view_model_L1] PASSED [ 36%] +tests/llm/test_06_progressive.py::test_progressive[view_model_L2] PASSED [ 36%] +tests/llm/test_06_progressive.py::test_progressive[view_model_L3] PASSED [ 36%] +tests/llm/test_06_progressive.py::test_progressive[set_weather_L1] PASSED [ 37%] +tests/llm/test_06_progressive.py::test_progressive[set_weather_L2] PASSED [ 37%] +tests/llm/test_06_progressive.py::test_progressive[set_weather_L3] PASSED [ 38%] +tests/llm/test_06_progressive.py::test_progressive[run_qaqc_L1] PASSED [ 38%] +tests/llm/test_06_progressive.py::test_progressive[run_qaqc_L2] PASSED [ 39%] +tests/llm/test_06_progressive.py::test_progressive[run_qaqc_L3] PASSED [ 
39%] +tests/llm/test_06_progressive.py::test_progressive[create_building_L1] PASSED [ 40%] +tests/llm/test_06_progressive.py::test_progressive[create_building_L2] FAILED [ 40%] +tests/llm/test_06_progressive.py::test_progressive[create_building_L3] PASSED [ 40%] +tests/llm/test_06_progressive.py::test_progressive[add_pv_L1] PASSED [ 41%] +tests/llm/test_06_progressive.py::test_progressive[add_pv_L2] PASSED [ 41%] +tests/llm/test_06_progressive.py::test_progressive[add_pv_L3] PASSED [ 42%] +tests/llm/test_06_progressive.py::test_progressive[thermostat_L1] PASSED [ 42%] +tests/llm/test_06_progressive.py::test_progressive[thermostat_L2] PASSED [ 43%] +tests/llm/test_06_progressive.py::test_progressive[thermostat_L3] PASSED [ 43%] +tests/llm/test_06_progressive.py::test_progressive[list_spaces_L1] PASSED [ 43%] +tests/llm/test_06_progressive.py::test_progressive[list_spaces_L2] PASSED [ 44%] +tests/llm/test_06_progressive.py::test_progressive[list_spaces_L3] PASSED [ 44%] +tests/llm/test_06_progressive.py::test_progressive[schedules_L1] PASSED [ 45%] +tests/llm/test_06_progressive.py::test_progressive[schedules_L2] PASSED [ 45%] +tests/llm/test_06_progressive.py::test_progressive[schedules_L3] PASSED [ 46%] +tests/llm/test_06_progressive.py::test_progressive[inspect_component_L1] PASSED [ 46%] +tests/llm/test_06_progressive.py::test_progressive[inspect_component_L2] PASSED [ 46%] +tests/llm/test_06_progressive.py::test_progressive[inspect_component_L3] PASSED [ 47%] +tests/llm/test_06_progressive.py::test_progressive[modify_component_L1] PASSED [ 47%] +tests/llm/test_06_progressive.py::test_progressive[modify_component_L2] PASSED [ 48%] +tests/llm/test_06_progressive.py::test_progressive[modify_component_L3] PASSED [ 48%] +tests/llm/test_06_progressive.py::test_progressive[list_dynamic_type_L1] PASSED [ 49%] +tests/llm/test_06_progressive.py::test_progressive[list_dynamic_type_L2] PASSED [ 49%] +tests/llm/test_06_progressive.py::test_progressive[list_dynamic_type_L3] 
PASSED [ 50%] +tests/llm/test_06_progressive.py::test_progressive[floor_area_L1] PASSED [ 50%] +tests/llm/test_06_progressive.py::test_progressive[floor_area_L2] PASSED [ 50%] +tests/llm/test_06_progressive.py::test_progressive[floor_area_L3] PASSED [ 51%] +tests/llm/test_06_progressive.py::test_progressive[materials_L1] PASSED [ 51%] +tests/llm/test_06_progressive.py::test_progressive[materials_L2] PASSED [ 52%] +tests/llm/test_06_progressive.py::test_progressive[materials_L3] PASSED [ 52%] +tests/llm/test_06_progressive.py::test_progressive[thermal_zones_L1] FAILED [ 53%] +tests/llm/test_06_progressive.py::test_progressive[thermal_zones_L2] PASSED [ 53%] +tests/llm/test_06_progressive.py::test_progressive[thermal_zones_L3] PASSED [ 53%] +tests/llm/test_06_progressive.py::test_progressive[subsurfaces_L1] PASSED [ 54%] +tests/llm/test_06_progressive.py::test_progressive[subsurfaces_L2] PASSED [ 54%] +tests/llm/test_06_progressive.py::test_progressive[subsurfaces_L3] PASSED [ 55%] +tests/llm/test_06_progressive.py::test_progressive[surface_details_L1] PASSED [ 55%] +tests/llm/test_06_progressive.py::test_progressive[surface_details_L2] PASSED [ 56%] +tests/llm/test_06_progressive.py::test_progressive[surface_details_L3] PASSED [ 56%] +tests/llm/test_06_progressive.py::test_progressive[run_simulation_L1] PASSED [ 56%] +tests/llm/test_06_progressive.py::test_progressive[run_simulation_L2] PASSED [ 57%] +tests/llm/test_06_progressive.py::test_progressive[run_simulation_L3] PASSED [ 57%] +tests/llm/test_06_progressive.py::test_progressive[get_eui_L1] PASSED [ 58%] +tests/llm/test_06_progressive.py::test_progressive[get_eui_L2] PASSED [ 58%] +tests/llm/test_06_progressive.py::test_progressive[get_eui_L3] PASSED [ 59%] +tests/llm/test_06_progressive.py::test_progressive[end_use_breakdown_L1] PASSED [ 59%] +tests/llm/test_06_progressive.py::test_progressive[end_use_breakdown_L2] PASSED [ 60%] +tests/llm/test_06_progressive.py::test_progressive[end_use_breakdown_L3] PASSED 
[ 60%] +tests/llm/test_06_progressive.py::test_progressive[hvac_sizing_L1] FAILED [ 60%] +tests/llm/test_06_progressive.py::test_progressive[hvac_sizing_L2] PASSED [ 61%] +tests/llm/test_06_progressive.py::test_progressive[hvac_sizing_L3] PASSED [ 61%] +tests/llm/test_06_progressive.py::test_progressive[set_wwr_L1] PASSED [ 62%] +tests/llm/test_06_progressive.py::test_progressive[set_wwr_L2] PASSED [ 62%] +tests/llm/test_06_progressive.py::test_progressive[set_wwr_L3] PASSED [ 63%] +tests/llm/test_06_progressive.py::test_progressive[replace_windows_L1] PASSED [ 63%] +tests/llm/test_06_progressive.py::test_progressive[replace_windows_L2] PASSED [ 63%] +tests/llm/test_06_progressive.py::test_progressive[replace_windows_L3] FAILED [ 64%] +tests/llm/test_06_progressive.py::test_progressive[construction_details_L1] PASSED [ 64%] +tests/llm/test_06_progressive.py::test_progressive[construction_details_L2] PASSED [ 65%] +tests/llm/test_06_progressive.py::test_progressive[construction_details_L3] PASSED [ 65%] +tests/llm/test_06_progressive.py::test_progressive[check_loads_L1] PASSED [ 66%] +tests/llm/test_06_progressive.py::test_progressive[check_loads_L2] PASSED [ 66%] +tests/llm/test_06_progressive.py::test_progressive[check_loads_L3] PASSED [ 66%] +tests/llm/test_06_progressive.py::test_progressive[create_loads_L1] PASSED [ 67%] +tests/llm/test_06_progressive.py::test_progressive[create_loads_L2] PASSED [ 67%] +tests/llm/test_06_progressive.py::test_progressive[create_loads_L3] FAILED [ 68%] +tests/llm/test_06_progressive.py::test_progressive[create_plant_loop_L1] PASSED [ 68%] +tests/llm/test_06_progressive.py::test_progressive[create_plant_loop_L2] PASSED [ 69%] +tests/llm/test_06_progressive.py::test_progressive[create_plant_loop_L3] PASSED [ 69%] +tests/llm/test_06_progressive.py::test_progressive[schedule_details_L1] PASSED [ 70%] +tests/llm/test_06_progressive.py::test_progressive[schedule_details_L2] PASSED [ 70%] 
+tests/llm/test_06_progressive.py::test_progressive[schedule_details_L3] PASSED [ 70%] +tests/llm/test_06_progressive.py::test_progressive[space_type_info_L1] PASSED [ 71%] +tests/llm/test_06_progressive.py::test_progressive[space_type_info_L2] PASSED [ 71%] +tests/llm/test_06_progressive.py::test_progressive[space_type_info_L3] PASSED [ 72%] +tests/llm/test_06_progressive.py::test_progressive[set_run_period_L1] PASSED [ 72%] +tests/llm/test_06_progressive.py::test_progressive[set_run_period_L2] PASSED [ 73%] +tests/llm/test_06_progressive.py::test_progressive[set_run_period_L3] PASSED [ 73%] +tests/llm/test_06_progressive.py::test_progressive[ideal_air_L1] PASSED [ 73%] +tests/llm/test_06_progressive.py::test_progressive[ideal_air_L2] PASSED [ 74%] +tests/llm/test_06_progressive.py::test_progressive[ideal_air_L3] PASSED [ 74%] +tests/llm/test_06_progressive.py::test_progressive[save_model_L1] PASSED [ 75%] +tests/llm/test_06_progressive.py::test_progressive[save_model_L2] PASSED [ 75%] +tests/llm/test_06_progressive.py::test_progressive[save_model_L3] PASSED [ 76%] +tests/llm/test_06_progressive.py::test_progressive[add_ev_L1] PASSED [ 76%] +tests/llm/test_06_progressive.py::test_progressive[add_ev_L2] PASSED [ 76%] +tests/llm/test_06_progressive.py::test_progressive[add_ev_L3] PASSED [ 77%] +tests/llm/test_06_progressive.py::test_progressive[list_measures_L1] PASSED [ 77%] +tests/llm/test_06_progressive.py::test_progressive[list_measures_L2] PASSED [ 78%] +tests/llm/test_06_progressive.py::test_progressive[list_measures_L3] SKIPPED [ 78%] +tests/llm/test_06_progressive.py::test_progressive[create_measure_L1] SKIPPED [ 79%] +tests/llm/test_06_progressive.py::test_progressive[create_measure_L2] SKIPPED [ 79%] +tests/llm/test_06_progressive.py::test_progressive[create_measure_L3] SKIPPED [ 80%] +tests/llm/test_06_progressive.py::test_progressive[test_measure_L1] SKIPPED [ 80%] +tests/llm/test_06_progressive.py::test_progressive[test_measure_L2] SKIPPED [ 80%] 
+tests/llm/test_06_progressive.py::test_progressive[test_measure_L3] SKIPPED [ 81%] +tests/llm/test_06_progressive.py::test_progressive[apply_existing_measure_L1] SKIPPED [ 81%] +tests/llm/test_06_progressive.py::test_progressive[apply_existing_measure_L2] SKIPPED [ 82%] +tests/llm/test_06_progressive.py::test_progressive[apply_existing_measure_L3] SKIPPED [ 82%] +tests/llm/test_06_progressive.py::test_progressive[replace_terminals_cooled_beam_L1] SKIPPED [ 83%] +tests/llm/test_06_progressive.py::test_progressive[replace_terminals_cooled_beam_L2] SKIPPED [ 83%] +tests/llm/test_06_progressive.py::test_progressive[replace_terminals_cooled_beam_L3] SKIPPED [ 83%] +tests/llm/test_06_progressive.py::test_progressive[replace_terminals_four_pipe_beam_L1] SKIPPED [ 84%] +tests/llm/test_06_progressive.py::test_progressive[replace_terminals_four_pipe_beam_L2] SKIPPED [ 84%] +tests/llm/test_06_progressive.py::test_progressive[replace_terminals_four_pipe_beam_L3] SKIPPED [ 85%] +tests/llm/test_06_progressive.py::test_progressive[measure_replace_terminals_L1] SKIPPED [ 85%] +tests/llm/test_06_progressive.py::test_progressive[measure_replace_terminals_L2] SKIPPED [ 86%] +tests/llm/test_06_progressive.py::test_progressive[measure_replace_terminals_L3] SKIPPED [ 86%] +tests/llm/test_06_progressive.py::test_progressive[zone_equipment_priority_L1] SKIPPED [ 86%] +tests/llm/test_06_progressive.py::test_progressive[zone_equipment_priority_L2] SKIPPED [ 87%] +tests/llm/test_06_progressive.py::test_progressive[zone_equipment_priority_L3] SKIPPED [ 87%] +tests/llm/test_06_progressive.py::test_progressive[edit_measure_L1] SKIPPED [ 88%] +tests/llm/test_06_progressive.py::test_progressive[edit_measure_L2] SKIPPED [ 88%] +tests/llm/test_06_progressive.py::test_progressive[edit_measure_L3] SKIPPED [ 89%] +tests/llm/test_07_fourpipe_e2e.py::test_fourpipe_beam_retrofit_e2e SKIPPED [ 89%] +tests/llm/test_08_measure_authoring.py::test_create_measure_with_quoted_description SKIPPED [ 90%] 
+tests/llm/test_08_measure_authoring.py::test_edit_measure_description_with_quotes SKIPPED [ 90%] +tests/llm/test_08_measure_authoring.py::test_measure_xml_intended_software_tool SKIPPED [ 90%] +tests/llm/test_08_measure_authoring.py::test_syntax_error_reported_clearly SKIPPED [ 91%] +tests/llm/test_09_tool_routing.py::test_tool_selection_baseline[create_measure] SKIPPED [ 91%] +tests/llm/test_09_tool_routing.py::test_tool_selection_baseline[view_model] SKIPPED [ 92%] +tests/llm/test_09_tool_routing.py::test_tool_selection_baseline[read_file] SKIPPED [ 92%] +tests/llm/test_09_tool_routing.py::test_tool_selection_baseline[add_baseline_system] SKIPPED [ 93%] +tests/llm/test_09_tool_routing.py::test_tool_selection_baseline_extract_eui SKIPPED [ 93%] +tests/llm/test_09_tool_routing.py::test_visualization_uses_mcp_not_script SKIPPED [ 93%] +tests/llm/test_09_tool_routing.py::test_report_uses_mcp_not_script SKIPPED [ 94%] +tests/llm/test_09_tool_routing.py::test_measure_uses_create_measure_not_create_file SKIPPED [ 94%] +tests/llm/test_09_tool_routing.py::test_read_file_uses_mcp_not_bash SKIPPED [ 95%] +tests/llm/test_09_tool_routing.py::test_hvac_measure_uses_api_reference SKIPPED [ 95%] +tests/llm/test_09_tool_routing.py::test_search_api_for_method_verification SKIPPED [ 96%] +tests/llm/test_09_tool_routing.py::test_search_wiring_patterns_for_hvac_wiring SKIPPED [ 96%] +tests/llm/test_10_confusion_pairs.py::test_qaqc_vs_validate_post_sim SKIPPED [ 96%] +tests/llm/test_10_confusion_pairs.py::test_validate_vs_qaqc_pre_sim SKIPPED [ 97%] +tests/llm/test_10_confusion_pairs.py::test_load_details_vs_space_details SKIPPED [ 97%] +tests/llm/test_10_confusion_pairs.py::test_summary_metrics_vs_end_use SKIPPED [ 98%] +tests/llm/test_10_confusion_pairs.py::test_end_use_vs_summary_metrics SKIPPED [ 98%] +tests/llm/test_10_confusion_pairs.py::test_inspect_osm_vs_model_summary SKIPPED [ 99%] +tests/llm/test_10_confusion_pairs.py::test_create_baseline_vs_new_building SKIPPED [ 99%] 
+tests/llm/test_10_confusion_pairs.py::test_apply_measure_vs_create_measure SKIPPED [100%] +====================================================================== +LLM Benchmark: 160/180 passed (88.9%) | Model: haiku | 4775s +Tokens: 8.9k in + 307.7k out + 66.6M cache | Cost: $11.2110 + setup: 6/6 (100.0%) in 114s + tier1: 4/4 (100.0%) in 76s + tier2: 31/37 (83.8%) in 1857s + tier3: 19/26 (73.1%) in 1127s + tier4: 3/3 (100.0%) in 72s + progressive: 97/104 (93.3%) in 1529s +Failed: energy-report:Give me a full energy report, qaqc:Validate before simulation, qaqc:QA/QC the model, qaqc:Is my model ready to simulate?, simulate:Run a simulation, simulate:Simulate the model, troubleshoot:My simulation failed, qaqc_check, import_floorspacejs, floorspacejs_to_typical, envelope_retrofit, create_and_assign_loads, measure_replace_terminals_full_chain, import_floorplan_L1, import_floorplan_L3, create_building_L2, thermal_zones_L1, hvac_sizing_L1, replace_windows_L3, create_loads_L3 +Report: C:\tmp\llm-sweep-haiku\benchmark.md +History: C:\tmp\llm-sweep-haiku\benchmark_history.json (1 runs) +====================================================================== + + +================================== FAILURES =================================== +____ test_eval_tool_selection[energy-report:Give me a full energy report] _____ + +case = {'expected_tools': ['extract_summary_metrics', 'extract_end_use_breakdown', 'extract_envelope_summary', 'extract_hvac_sizing', 'extract_zone_summary'], 'prompt': 'Give me a full energy report', 'skill': 'energy-report'} + + @pytest.mark.parametrize("case", EVAL_CASES, ids=[_case_id(c) for c in EVAL_CASES]) + def test_eval_tool_selection(case): + """Verify agent calls at least one expected MCP tool for an eval.md prompt.""" + # Validates: Claude selects correct tool from eval.md skill tables for natural language prompts + tier = get_tier() + if tier not in ("all", "1"): + pytest.skip("Tier 1 not selected") + + # Prepend model load for skills that 
need model state + prompt = case["prompt"] + if case["skill"] in NEEDS_MODEL: + if not baseline_model_exists(): + pytest.skip("Baseline model not found � run test_01_setup first") + if case["skill"] == "troubleshoot": + prompt = _troubleshoot_prefix() + prompt.lower() + else: + prompt = LOAD_PREFIX + prompt.lower() + prompt += SUFFIX + + timeout = SLOW_SKILLS.get(case["skill"], 120) +> result = run_claude(prompt, timeout=timeout) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +tests\llm\test_03_eval_cases.py:141: +_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ +tests\llm\runner.py:209: in run_claude + _last_result = _parse_stream_json(result.stdout) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ + +raw = None + + def _parse_stream_json(raw: str) -> ClaudeResult: + """Parse newline-delimited JSON from stream-json output.""" + messages = [] + result_obj = {} + +> for line in raw.strip().splitlines(): + ^^^^^^^^^ +E AttributeError: 'NoneType' object has no attribute 'strip' + +tests\llm\runner.py:218: AttributeError +__________ test_eval_tool_selection[qaqc:Validate before simulation] __________ + +case = {'expected_tools': ['run_qaqc_checks'], 'prompt': 'Validate before simulation', 'skill': 'qaqc'} + + @pytest.mark.parametrize("case", EVAL_CASES, ids=[_case_id(c) for c in EVAL_CASES]) + def test_eval_tool_selection(case): + """Verify agent calls at least one expected MCP tool for an eval.md prompt.""" + # Validates: Claude selects correct tool from eval.md skill tables for natural language prompts + tier = get_tier() + if tier not in ("all", "1"): + pytest.skip("Tier 1 not selected") + + # Prepend model load for skills that need model state + prompt = case["prompt"] + if case["skill"] in NEEDS_MODEL: + if not baseline_model_exists(): + pytest.skip("Baseline model not found � run test_01_setup first") + if case["skill"] == "troubleshoot": + prompt = _troubleshoot_prefix() 
+ prompt.lower() + else: + prompt = LOAD_PREFIX + prompt.lower() + prompt += SUFFIX + + timeout = SLOW_SKILLS.get(case["skill"], 120) + result = run_claude(prompt, timeout=timeout) + tool_names = result.tool_names + + # Merge eval.md expected tools with extra acceptable tools + expected = set(case["expected_tools"]) + expected.update(EXTRA_EXPECTED.get(case["skill"], [])) + +> assert any(t in expected for t in tool_names), ( + f"[{case['skill']}] Expected one of {sorted(expected)}, " + f"got: {tool_names}" + ) +E AssertionError: [qaqc] Expected one of ['get_model_summary', 'inspect_osm_summary', 'run_qaqc_checks'], got: ['load_osm_model', 'validate_model'] +E assert False +E + where False = any(. at 0x000002696ED64EE0>) + +tests\llm\test_03_eval_cases.py:148: AssertionError +_______________ test_eval_tool_selection[qaqc:QA/QC the model] ________________ + +case = {'expected_tools': ['run_qaqc_checks'], 'prompt': 'QA/QC the model', 'skill': 'qaqc'} + + @pytest.mark.parametrize("case", EVAL_CASES, ids=[_case_id(c) for c in EVAL_CASES]) + def test_eval_tool_selection(case): + """Verify agent calls at least one expected MCP tool for an eval.md prompt.""" + # Validates: Claude selects correct tool from eval.md skill tables for natural language prompts + tier = get_tier() + if tier not in ("all", "1"): + pytest.skip("Tier 1 not selected") + + # Prepend model load for skills that need model state + prompt = case["prompt"] + if case["skill"] in NEEDS_MODEL: + if not baseline_model_exists(): + pytest.skip("Baseline model not found � run test_01_setup first") + if case["skill"] == "troubleshoot": + prompt = _troubleshoot_prefix() + prompt.lower() + else: + prompt = LOAD_PREFIX + prompt.lower() + prompt += SUFFIX + + timeout = SLOW_SKILLS.get(case["skill"], 120) + result = run_claude(prompt, timeout=timeout) + tool_names = result.tool_names + + # Merge eval.md expected tools with extra acceptable tools + expected = set(case["expected_tools"]) + 
expected.update(EXTRA_EXPECTED.get(case["skill"], [])) + +> assert any(t in expected for t in tool_names), ( + f"[{case['skill']}] Expected one of {sorted(expected)}, " + f"got: {tool_names}" + ) +E AssertionError: [qaqc] Expected one of ['get_model_summary', 'inspect_osm_summary', 'run_qaqc_checks'], got: ['load_osm_model', 'validate_model'] +E assert False +E + where False = any(. at 0x000002696EE37030>) + +tests\llm\test_03_eval_cases.py:148: AssertionError +________ test_eval_tool_selection[qaqc:Is my model ready to simulate?] ________ + +case = {'expected_tools': ['inspect_osm_summary', 'run_qaqc_checks'], 'prompt': 'Is my model ready to simulate?', 'skill': 'qaqc'} + + @pytest.mark.parametrize("case", EVAL_CASES, ids=[_case_id(c) for c in EVAL_CASES]) + def test_eval_tool_selection(case): + """Verify agent calls at least one expected MCP tool for an eval.md prompt.""" + # Validates: Claude selects correct tool from eval.md skill tables for natural language prompts + tier = get_tier() + if tier not in ("all", "1"): + pytest.skip("Tier 1 not selected") + + # Prepend model load for skills that need model state + prompt = case["prompt"] + if case["skill"] in NEEDS_MODEL: + if not baseline_model_exists(): + pytest.skip("Baseline model not found � run test_01_setup first") + if case["skill"] == "troubleshoot": + prompt = _troubleshoot_prefix() + prompt.lower() + else: + prompt = LOAD_PREFIX + prompt.lower() + prompt += SUFFIX + + timeout = SLOW_SKILLS.get(case["skill"], 120) + result = run_claude(prompt, timeout=timeout) + tool_names = result.tool_names + + # Merge eval.md expected tools with extra acceptable tools + expected = set(case["expected_tools"]) + expected.update(EXTRA_EXPECTED.get(case["skill"], [])) + +> assert any(t in expected for t in tool_names), ( + f"[{case['skill']}] Expected one of {sorted(expected)}, " + f"got: {tool_names}" + ) +E AssertionError: [qaqc] Expected one of ['get_model_summary', 'inspect_osm_summary', 'run_qaqc_checks'], got: 
['load_osm_model', 'validate_model', 'get_weather_info', 'get_building_info', 'get_simulation_control', 'list_air_loops', 'list_plant_loops'] +E assert False +E + where False = any(. at 0x000002696EE6A670>) + +tests\llm\test_03_eval_cases.py:148: AssertionError +_____________ test_eval_tool_selection[simulate:Run a simulation] _____________ + +case = {'expected_tools': ['save_osm_model', 'run_simulation', 'get_run_status'], 'prompt': 'Run a simulation', 'skill': 'simulate'} + + @pytest.mark.parametrize("case", EVAL_CASES, ids=[_case_id(c) for c in EVAL_CASES]) + def test_eval_tool_selection(case): + """Verify agent calls at least one expected MCP tool for an eval.md prompt.""" + # Validates: Claude selects correct tool from eval.md skill tables for natural language prompts + tier = get_tier() + if tier not in ("all", "1"): + pytest.skip("Tier 1 not selected") + + # Prepend model load for skills that need model state + prompt = case["prompt"] + if case["skill"] in NEEDS_MODEL: + if not baseline_model_exists(): + pytest.skip("Baseline model not found � run test_01_setup first") + if case["skill"] == "troubleshoot": + prompt = _troubleshoot_prefix() + prompt.lower() + else: + prompt = LOAD_PREFIX + prompt.lower() + prompt += SUFFIX + + timeout = SLOW_SKILLS.get(case["skill"], 120) +> result = run_claude(prompt, timeout=timeout) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +tests\llm\test_03_eval_cases.py:141: +_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ +tests\llm\runner.py:209: in run_claude + _last_result = _parse_stream_json(result.stdout) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ + +raw = None + + def _parse_stream_json(raw: str) -> ClaudeResult: + """Parse newline-delimited JSON from stream-json output.""" + messages = [] + result_obj = {} + +> for line in raw.strip().splitlines(): + ^^^^^^^^^ +E AttributeError: 'NoneType' object has no attribute 'strip' + 
+tests\llm\runner.py:218: AttributeError +____________ test_eval_tool_selection[simulate:Simulate the model] ____________ + +case = {'expected_tools': ['save_osm_model', 'run_simulation'], 'prompt': 'Simulate the model', 'skill': 'simulate'} + + @pytest.mark.parametrize("case", EVAL_CASES, ids=[_case_id(c) for c in EVAL_CASES]) + def test_eval_tool_selection(case): + """Verify agent calls at least one expected MCP tool for an eval.md prompt.""" + # Validates: Claude selects correct tool from eval.md skill tables for natural language prompts + tier = get_tier() + if tier not in ("all", "1"): + pytest.skip("Tier 1 not selected") + + # Prepend model load for skills that need model state + prompt = case["prompt"] + if case["skill"] in NEEDS_MODEL: + if not baseline_model_exists(): + pytest.skip("Baseline model not found � run test_01_setup first") + if case["skill"] == "troubleshoot": + prompt = _troubleshoot_prefix() + prompt.lower() + else: + prompt = LOAD_PREFIX + prompt.lower() + prompt += SUFFIX + + timeout = SLOW_SKILLS.get(case["skill"], 120) +> result = run_claude(prompt, timeout=timeout) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +tests\llm\test_03_eval_cases.py:141: +_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ +tests\llm\runner.py:209: in run_claude + _last_result = _parse_stream_json(result.stdout) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ + +raw = None + + def _parse_stream_json(raw: str) -> ClaudeResult: + """Parse newline-delimited JSON from stream-json output.""" + messages = [] + result_obj = {} + +> for line in raw.strip().splitlines(): + ^^^^^^^^^ +E AttributeError: 'NoneType' object has no attribute 'strip' + +tests\llm\runner.py:218: AttributeError +_________ test_eval_tool_selection[troubleshoot:My simulation failed] _________ + +case = {'expected_tools': ['get_run_status', 'get_run_logs'], 'prompt': 'My simulation failed', 'skill': 
'troubleshoot'} + + @pytest.mark.parametrize("case", EVAL_CASES, ids=[_case_id(c) for c in EVAL_CASES]) + def test_eval_tool_selection(case): + """Verify agent calls at least one expected MCP tool for an eval.md prompt.""" + # Validates: Claude selects correct tool from eval.md skill tables for natural language prompts + tier = get_tier() + if tier not in ("all", "1"): + pytest.skip("Tier 1 not selected") + + # Prepend model load for skills that need model state + prompt = case["prompt"] + if case["skill"] in NEEDS_MODEL: + if not baseline_model_exists(): + pytest.skip("Baseline model not found � run test_01_setup first") + if case["skill"] == "troubleshoot": + prompt = _troubleshoot_prefix() + prompt.lower() + else: + prompt = LOAD_PREFIX + prompt.lower() + prompt += SUFFIX + + timeout = SLOW_SKILLS.get(case["skill"], 120) +> result = run_claude(prompt, timeout=timeout) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +tests\llm\test_03_eval_cases.py:141: +_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ +tests\llm\runner.py:209: in run_claude + _last_result = _parse_stream_json(result.stdout) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ + +raw = None + + def _parse_stream_json(raw: str) -> ClaudeResult: + """Parse newline-delimited JSON from stream-json output.""" + messages = [] + result_obj = {} + +> for line in raw.strip().splitlines(): + ^^^^^^^^^ +E AttributeError: 'NoneType' object has no attribute 'strip' + +tests\llm\runner.py:218: AttributeError +__________________________ test_workflow[qaqc_check] __________________________ + +case = {'id': 'qaqc_check', 'prompt': 'Load the model at /runs/examples/llm-test-baseline/baseline_model.osm using load_osm_m...s using run_qaqc_checks. 
Use MCP tools only.', 'required_tools': ['load_osm_model', 'run_qaqc_checks'], 'timeout': 120} + + @pytest.mark.parametrize("case", WORKFLOW_CASES, ids=[c["id"] for c in WORKFLOW_CASES]) + def test_workflow(case): + """Agent loads model and completes a multi-step workflow.""" + # Validates: Claude chains all required MCP tools for multi-step BEM workflows + tier = get_tier() + if tier not in ("all", "2"): + pytest.skip("Tier 2 not selected") + + # Build prompt for needs_run cases + prompt = case["prompt"] + if case.get("needs_run"): + run_id = get_sim_run_id() + if not run_id: + pytest.skip("No simulation run_id � run test_01_setup first") + prompt = ( + f"Extract results from simulation run '{run_id}'. " + "First extract summary metrics using extract_summary_metrics. " + "Then extract end use breakdown using extract_end_use_breakdown. " + "Use MCP tools only." + ) + elif BASELINE_HVAC_MODEL in prompt and not baseline_hvac_model_exists(): + pytest.skip("Baseline+HVAC model not found � run test_01_setup first") + elif BASELINE_MODEL in prompt and not baseline_model_exists(): + pytest.skip("Baseline model not found � run test_01_setup first") + + result = run_claude( + prompt, + timeout=case.get("timeout", 120), + max_turns=case.get("max_turns"), + ) + tool_names = result.tool_names + + for tool in case["required_tools"]: +> assert tool in tool_names, ( + f"Required tool '{tool}' not found. Tools: {tool_names}" + ) +E AssertionError: Required tool 'run_qaqc_checks' not found. 
Tools: ['load_osm_model', 'validate_model', 'run_simulation', 'get_run_status', 'extract_simulation_errors'] +E assert 'run_qaqc_checks' in ['load_osm_model', 'validate_model', 'run_simulation', 'get_run_status', 'extract_simulation_errors'] + +tests\llm\test_04_workflows.py:624: AssertionError +_____________________ test_workflow[import_floorspacejs] ______________________ + +case = {'id': 'import_floorspacejs', 'prompt': 'Import the FloorspaceJS JSON file at /test-assets/sddc_office/floorplan.json using import_floorspacejs. Use MCP tools only.', 'required_tools': ['import_floorspacejs'], 'timeout': 120} + + @pytest.mark.parametrize("case", WORKFLOW_CASES, ids=[c["id"] for c in WORKFLOW_CASES]) + def test_workflow(case): + """Agent loads model and completes a multi-step workflow.""" + # Validates: Claude chains all required MCP tools for multi-step BEM workflows + tier = get_tier() + if tier not in ("all", "2"): + pytest.skip("Tier 2 not selected") + + # Build prompt for needs_run cases + prompt = case["prompt"] + if case.get("needs_run"): + run_id = get_sim_run_id() + if not run_id: + pytest.skip("No simulation run_id � run test_01_setup first") + prompt = ( + f"Extract results from simulation run '{run_id}'. " + "First extract summary metrics using extract_summary_metrics. " + "Then extract end use breakdown using extract_end_use_breakdown. " + "Use MCP tools only." + ) + elif BASELINE_HVAC_MODEL in prompt and not baseline_hvac_model_exists(): + pytest.skip("Baseline+HVAC model not found � run test_01_setup first") + elif BASELINE_MODEL in prompt and not baseline_model_exists(): + pytest.skip("Baseline model not found � run test_01_setup first") + + result = run_claude( + prompt, + timeout=case.get("timeout", 120), + max_turns=case.get("max_turns"), + ) + tool_names = result.tool_names + + for tool in case["required_tools"]: +> assert tool in tool_names, ( + f"Required tool '{tool}' not found. 
Tools: {tool_names}" + ) +E AssertionError: Required tool 'import_floorspacejs' not found. Tools: [] +E assert 'import_floorspacejs' in [] + +tests\llm\test_04_workflows.py:624: AssertionError +___________________ test_workflow[floorspacejs_to_typical] ____________________ + +case = {'id': 'floorspacejs_to_typical', 'max_turns': 25, 'prompt': 'Do all 3 steps in order, do not stop early:\nStep 1: Imp...e all 3 steps.', 'required_tools': ['import_floorspacejs', 'change_building_location', 'create_typical_building'], ...} + + @pytest.mark.parametrize("case", WORKFLOW_CASES, ids=[c["id"] for c in WORKFLOW_CASES]) + def test_workflow(case): + """Agent loads model and completes a multi-step workflow.""" + # Validates: Claude chains all required MCP tools for multi-step BEM workflows + tier = get_tier() + if tier not in ("all", "2"): + pytest.skip("Tier 2 not selected") + + # Build prompt for needs_run cases + prompt = case["prompt"] + if case.get("needs_run"): + run_id = get_sim_run_id() + if not run_id: + pytest.skip("No simulation run_id � run test_01_setup first") + prompt = ( + f"Extract results from simulation run '{run_id}'. " + "First extract summary metrics using extract_summary_metrics. " + "Then extract end use breakdown using extract_end_use_breakdown. " + "Use MCP tools only." + ) + elif BASELINE_HVAC_MODEL in prompt and not baseline_hvac_model_exists(): + pytest.skip("Baseline+HVAC model not found � run test_01_setup first") + elif BASELINE_MODEL in prompt and not baseline_model_exists(): + pytest.skip("Baseline model not found � run test_01_setup first") + + result = run_claude( + prompt, + timeout=case.get("timeout", 120), + max_turns=case.get("max_turns"), + ) + tool_names = result.tool_names + + for tool in case["required_tools"]: +> assert tool in tool_names, ( + f"Required tool '{tool}' not found. Tools: {tool_names}" + ) +E AssertionError: Required tool 'change_building_location' not found. 
Tools: ['import_floorspacejs'] +E assert 'change_building_location' in ['import_floorspacejs'] + +tests\llm\test_04_workflows.py:624: AssertionError +______________________ test_workflow[envelope_retrofit] _______________________ + +case = {'id': 'envelope_retrofit', 'prompt': 'Load the model at /runs/examples/llm-test-baseline/baseline_model.osm using loa...ly.', 'required_tools': ['load_osm_model', 'set_window_to_wall_ratio', 'replace_window_constructions'], 'timeout': 180} + + @pytest.mark.parametrize("case", WORKFLOW_CASES, ids=[c["id"] for c in WORKFLOW_CASES]) + def test_workflow(case): + """Agent loads model and completes a multi-step workflow.""" + # Validates: Claude chains all required MCP tools for multi-step BEM workflows + tier = get_tier() + if tier not in ("all", "2"): + pytest.skip("Tier 2 not selected") + + # Build prompt for needs_run cases + prompt = case["prompt"] + if case.get("needs_run"): + run_id = get_sim_run_id() + if not run_id: + pytest.skip("No simulation run_id � run test_01_setup first") + prompt = ( + f"Extract results from simulation run '{run_id}'. " + "First extract summary metrics using extract_summary_metrics. " + "Then extract end use breakdown using extract_end_use_breakdown. " + "Use MCP tools only." + ) + elif BASELINE_HVAC_MODEL in prompt and not baseline_hvac_model_exists(): + pytest.skip("Baseline+HVAC model not found � run test_01_setup first") + elif BASELINE_MODEL in prompt and not baseline_model_exists(): + pytest.skip("Baseline model not found � run test_01_setup first") + + result = run_claude( + prompt, + timeout=case.get("timeout", 120), + max_turns=case.get("max_turns"), + ) + tool_names = result.tool_names + + for tool in case["required_tools"]: +> assert tool in tool_names, ( + f"Required tool '{tool}' not found. Tools: {tool_names}" + ) +E AssertionError: Required tool 'set_window_to_wall_ratio' not found. 
Tools: ['load_osm_model', 'list_surfaces', 'list_materials'] +E assert 'set_window_to_wall_ratio' in ['load_osm_model', 'list_surfaces', 'list_materials'] + +tests\llm\test_04_workflows.py:624: AssertionError +___________________ test_workflow[create_and_assign_loads] ____________________ + +case = {'id': 'create_and_assign_loads', 'prompt': "Load the model at /runs/examples/llm-test-baseline/baseline_model.osm usi...s only.", 'required_tools': ['load_osm_model', 'create_people_definition', 'create_lights_definition'], 'timeout': 120} + + @pytest.mark.parametrize("case", WORKFLOW_CASES, ids=[c["id"] for c in WORKFLOW_CASES]) + def test_workflow(case): + """Agent loads model and completes a multi-step workflow.""" + # Validates: Claude chains all required MCP tools for multi-step BEM workflows + tier = get_tier() + if tier not in ("all", "2"): + pytest.skip("Tier 2 not selected") + + # Build prompt for needs_run cases + prompt = case["prompt"] + if case.get("needs_run"): + run_id = get_sim_run_id() + if not run_id: + pytest.skip("No simulation run_id � run test_01_setup first") + prompt = ( + f"Extract results from simulation run '{run_id}'. " + "First extract summary metrics using extract_summary_metrics. " + "Then extract end use breakdown using extract_end_use_breakdown. " + "Use MCP tools only." + ) + elif BASELINE_HVAC_MODEL in prompt and not baseline_hvac_model_exists(): + pytest.skip("Baseline+HVAC model not found � run test_01_setup first") + elif BASELINE_MODEL in prompt and not baseline_model_exists(): + pytest.skip("Baseline model not found � run test_01_setup first") + + result = run_claude( + prompt, + timeout=case.get("timeout", 120), + max_turns=case.get("max_turns"), + ) + tool_names = result.tool_names + + for tool in case["required_tools"]: +> assert tool in tool_names, ( + f"Required tool '{tool}' not found. Tools: {tool_names}" + ) +E AssertionError: Required tool 'create_people_definition' not found. 
Tools: ['load_osm_model', 'list_spaces'] +E assert 'create_people_definition' in ['load_osm_model', 'list_spaces'] + +tests\llm\test_04_workflows.py:624: AssertionError +_____________ test_workflow[measure_replace_terminals_full_chain] _____________ + +case = {'any_of': ['extract_end_use_breakdown', 'extract_summary_metrics'], 'id': 'measure_replace_terminals_full_chain', 'max_turns': 40, 'min_calls': {'run_simulation': 2}, ...} + + @pytest.mark.parametrize("case", WORKFLOW_CASES, ids=[c["id"] for c in WORKFLOW_CASES]) + def test_workflow(case): + """Agent loads model and completes a multi-step workflow.""" + # Validates: Claude chains all required MCP tools for multi-step BEM workflows + tier = get_tier() + if tier not in ("all", "2"): + pytest.skip("Tier 2 not selected") + + # Build prompt for needs_run cases + prompt = case["prompt"] + if case.get("needs_run"): + run_id = get_sim_run_id() + if not run_id: + pytest.skip("No simulation run_id � run test_01_setup first") + prompt = ( + f"Extract results from simulation run '{run_id}'. " + "First extract summary metrics using extract_summary_metrics. " + "Then extract end use breakdown using extract_end_use_breakdown. " + "Use MCP tools only." 
+ ) + elif BASELINE_HVAC_MODEL in prompt and not baseline_hvac_model_exists(): + pytest.skip("Baseline+HVAC model not found � run test_01_setup first") + elif BASELINE_MODEL in prompt and not baseline_model_exists(): + pytest.skip("Baseline model not found � run test_01_setup first") + +> result = run_claude( + prompt, + timeout=case.get("timeout", 120), + max_turns=case.get("max_turns"), + ) + +tests\llm\test_04_workflows.py:616: +_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ +tests\llm\runner.py:209: in run_claude + _last_result = _parse_stream_json(result.stdout) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ + +raw = None + + def _parse_stream_json(raw: str) -> ClaudeResult: + """Parse newline-delimited JSON from stream-json output.""" + messages = [] + result_obj = {} + +> for line in raw.strip().splitlines(): + ^^^^^^^^^ +E AttributeError: 'NoneType' object has no attribute 'strip' + +tests\llm\runner.py:218: AttributeError +____________________ test_progressive[import_floorplan_L1] ____________________ + +case = {'case_id': 'import_floorplan', 'expected': ['import_floorspacejs'], 'id': 'import_floorplan_L1', 'level': 'L1', ...} + + @pytest.mark.progressive + @pytest.mark.parametrize("case", _FLAT_CASES, ids=[c["id"] for c in _FLAT_CASES]) + def test_progressive(case): + """Test tool discovery at varying prompt specificity levels.""" + # Validates: Claude routes L1/L2/L3 prompts to correct tools � lower levels passing = better discoverability + tier = get_tier() + if tier not in ("all", "1"): + pytest.skip("Tier 1 not selected") + + prompt = case["prompt"] + if case.get("needs_run"): + run_id = get_sim_run_id() + if not run_id: + pytest.skip("No simulation run_id � run test_01_setup first") + prompt = f"Use run_id '{run_id}'. 
" + prompt + elif case.get("needs_hvac"): + if not baseline_hvac_model_exists(): + pytest.skip("Baseline+HVAC model not found � run test_01_setup first") + prompt = LOAD_HVAC + prompt.lower() + elif case["needs_model"]: + if not baseline_model_exists(): + pytest.skip("Baseline model not found � run test_01_setup first") + prompt = LOAD + prompt.lower() + prompt += SUFFIX + + timeout = 300 if case.get("needs_run") or case["case_id"] == "run_simulation" else 120 + result = run_claude(prompt, timeout=timeout) + tool_names = result.tool_names + +> assert any(t in case["expected"] for t in tool_names), ( + f"[{case['case_id']} {case['level']}] " + f"Expected one of {case['expected']}, got: {tool_names}" + ) +E AssertionError: [import_floorplan L1] Expected one of ['import_floorspacejs'], got: [] +E assert False +E + where False = any(. at 0x000002696EEA5540>) + +tests\llm\test_06_progressive.py:481: AssertionError +____________________ test_progressive[import_floorplan_L3] ____________________ + +case = {'case_id': 'import_floorplan', 'expected': ['import_floorspacejs'], 'id': 'import_floorplan_L3', 'level': 'L3', ...} + + @pytest.mark.progressive + @pytest.mark.parametrize("case", _FLAT_CASES, ids=[c["id"] for c in _FLAT_CASES]) + def test_progressive(case): + """Test tool discovery at varying prompt specificity levels.""" + # Validates: Claude routes L1/L2/L3 prompts to correct tools � lower levels passing = better discoverability + tier = get_tier() + if tier not in ("all", "1"): + pytest.skip("Tier 1 not selected") + + prompt = case["prompt"] + if case.get("needs_run"): + run_id = get_sim_run_id() + if not run_id: + pytest.skip("No simulation run_id � run test_01_setup first") + prompt = f"Use run_id '{run_id}'. 
" + prompt + elif case.get("needs_hvac"): + if not baseline_hvac_model_exists(): + pytest.skip("Baseline+HVAC model not found � run test_01_setup first") + prompt = LOAD_HVAC + prompt.lower() + elif case["needs_model"]: + if not baseline_model_exists(): + pytest.skip("Baseline model not found � run test_01_setup first") + prompt = LOAD + prompt.lower() + prompt += SUFFIX + + timeout = 300 if case.get("needs_run") or case["case_id"] == "run_simulation" else 120 + result = run_claude(prompt, timeout=timeout) + tool_names = result.tool_names + +> assert any(t in case["expected"] for t in tool_names), ( + f"[{case['case_id']} {case['level']}] " + f"Expected one of {case['expected']}, got: {tool_names}" + ) +E AssertionError: [import_floorplan L3] Expected one of ['import_floorspacejs'], got: [] +E assert False +E + where False = any(. at 0x000002696EEA6420>) + +tests\llm\test_06_progressive.py:481: AssertionError +____________________ test_progressive[create_building_L2] _____________________ + +case = {'case_id': 'create_building', 'expected': ['create_new_building', 'create_bar_building'], 'id': 'create_building_L2', 'level': 'L2', ...} + + @pytest.mark.progressive + @pytest.mark.parametrize("case", _FLAT_CASES, ids=[c["id"] for c in _FLAT_CASES]) + def test_progressive(case): + """Test tool discovery at varying prompt specificity levels.""" + # Validates: Claude routes L1/L2/L3 prompts to correct tools � lower levels passing = better discoverability + tier = get_tier() + if tier not in ("all", "1"): + pytest.skip("Tier 1 not selected") + + prompt = case["prompt"] + if case.get("needs_run"): + run_id = get_sim_run_id() + if not run_id: + pytest.skip("No simulation run_id � run test_01_setup first") + prompt = f"Use run_id '{run_id}'. 
" + prompt + elif case.get("needs_hvac"): + if not baseline_hvac_model_exists(): + pytest.skip("Baseline+HVAC model not found � run test_01_setup first") + prompt = LOAD_HVAC + prompt.lower() + elif case["needs_model"]: + if not baseline_model_exists(): + pytest.skip("Baseline model not found � run test_01_setup first") + prompt = LOAD + prompt.lower() + prompt += SUFFIX + + timeout = 300 if case.get("needs_run") or case["case_id"] == "run_simulation" else 120 + result = run_claude(prompt, timeout=timeout) + tool_names = result.tool_names + +> assert any(t in case["expected"] for t in tool_names), ( + f"[{case['case_id']} {case['level']}] " + f"Expected one of {case['expected']}, got: {tool_names}" + ) +E AssertionError: [create_building L2] Expected one of ['create_new_building', 'create_bar_building'], got: [] +E assert False +E + where False = any(. at 0x000002696EEA7840>) + +tests\llm\test_06_progressive.py:481: AssertionError +_____________________ test_progressive[thermal_zones_L1] ______________________ + +case = {'case_id': 'thermal_zones', 'expected': ['list_thermal_zones'], 'id': 'thermal_zones_L1', 'level': 'L1', ...} + + @pytest.mark.progressive + @pytest.mark.parametrize("case", _FLAT_CASES, ids=[c["id"] for c in _FLAT_CASES]) + def test_progressive(case): + """Test tool discovery at varying prompt specificity levels.""" + # Validates: Claude routes L1/L2/L3 prompts to correct tools � lower levels passing = better discoverability + tier = get_tier() + if tier not in ("all", "1"): + pytest.skip("Tier 1 not selected") + + prompt = case["prompt"] + if case.get("needs_run"): + run_id = get_sim_run_id() + if not run_id: + pytest.skip("No simulation run_id � run test_01_setup first") + prompt = f"Use run_id '{run_id}'. 
" + prompt + elif case.get("needs_hvac"): + if not baseline_hvac_model_exists(): + pytest.skip("Baseline+HVAC model not found � run test_01_setup first") + prompt = LOAD_HVAC + prompt.lower() + elif case["needs_model"]: + if not baseline_model_exists(): + pytest.skip("Baseline model not found � run test_01_setup first") + prompt = LOAD + prompt.lower() + prompt += SUFFIX + + timeout = 300 if case.get("needs_run") or case["case_id"] == "run_simulation" else 120 + result = run_claude(prompt, timeout=timeout) + tool_names = result.tool_names + +> assert any(t in case["expected"] for t in tool_names), ( + f"[{case['case_id']} {case['level']}] " + f"Expected one of {case['expected']}, got: {tool_names}" + ) +E AssertionError: [thermal_zones L1] Expected one of ['list_thermal_zones'], got: ['load_osm_model', 'get_model_summary'] +E assert False +E + where False = any(. at 0x000002696EEA6C00>) + +tests\llm\test_06_progressive.py:481: AssertionError +______________________ test_progressive[hvac_sizing_L1] _______________________ + +case = {'case_id': 'hvac_sizing', 'expected': ['extract_hvac_sizing', 'extract_component_sizing'], 'id': 'hvac_sizing_L1', 'level': 'L1', ...} + + @pytest.mark.progressive + @pytest.mark.parametrize("case", _FLAT_CASES, ids=[c["id"] for c in _FLAT_CASES]) + def test_progressive(case): + """Test tool discovery at varying prompt specificity levels.""" + # Validates: Claude routes L1/L2/L3 prompts to correct tools � lower levels passing = better discoverability + tier = get_tier() + if tier not in ("all", "1"): + pytest.skip("Tier 1 not selected") + + prompt = case["prompt"] + if case.get("needs_run"): + run_id = get_sim_run_id() + if not run_id: + pytest.skip("No simulation run_id � run test_01_setup first") + prompt = f"Use run_id '{run_id}'. 
" + prompt + elif case.get("needs_hvac"): + if not baseline_hvac_model_exists(): + pytest.skip("Baseline+HVAC model not found � run test_01_setup first") + prompt = LOAD_HVAC + prompt.lower() + elif case["needs_model"]: + if not baseline_model_exists(): + pytest.skip("Baseline model not found � run test_01_setup first") + prompt = LOAD + prompt.lower() + prompt += SUFFIX + + timeout = 300 if case.get("needs_run") or case["case_id"] == "run_simulation" else 120 +> result = run_claude(prompt, timeout=timeout) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +tests\llm\test_06_progressive.py:478: +_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ +tests\llm\runner.py:209: in run_claude + _last_result = _parse_stream_json(result.stdout) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ + +raw = None + + def _parse_stream_json(raw: str) -> ClaudeResult: + """Parse newline-delimited JSON from stream-json output.""" + messages = [] + result_obj = {} + +> for line in raw.strip().splitlines(): + ^^^^^^^^^ +E AttributeError: 'NoneType' object has no attribute 'strip' + +tests\llm\runner.py:218: AttributeError +____________________ test_progressive[replace_windows_L3] _____________________ + +case = {'case_id': 'replace_windows', 'expected': ['replace_window_constructions', 'list_common_measures', 'list_materials', 'get_construction_details'], 'id': 'replace_windows_L3', 'level': 'L3', ...} + + @pytest.mark.progressive + @pytest.mark.parametrize("case", _FLAT_CASES, ids=[c["id"] for c in _FLAT_CASES]) + def test_progressive(case): + """Test tool discovery at varying prompt specificity levels.""" + # Validates: Claude routes L1/L2/L3 prompts to correct tools � lower levels passing = better discoverability + tier = get_tier() + if tier not in ("all", "1"): + pytest.skip("Tier 1 not selected") + + prompt = case["prompt"] + if case.get("needs_run"): + run_id = get_sim_run_id() + if not run_id: 
+ pytest.skip("No simulation run_id � run test_01_setup first") + prompt = f"Use run_id '{run_id}'. " + prompt + elif case.get("needs_hvac"): + if not baseline_hvac_model_exists(): + pytest.skip("Baseline+HVAC model not found � run test_01_setup first") + prompt = LOAD_HVAC + prompt.lower() + elif case["needs_model"]: + if not baseline_model_exists(): + pytest.skip("Baseline model not found � run test_01_setup first") + prompt = LOAD + prompt.lower() + prompt += SUFFIX + + timeout = 300 if case.get("needs_run") or case["case_id"] == "run_simulation" else 120 + result = run_claude(prompt, timeout=timeout) + tool_names = result.tool_names + +> assert any(t in case["expected"] for t in tool_names), ( + f"[{case['case_id']} {case['level']}] " + f"Expected one of {case['expected']}, got: {tool_names}" + ) +E AssertionError: [replace_windows L3] Expected one of ['replace_window_constructions', 'list_common_measures', 'list_materials', 'get_construction_details'], got: ['load_osm_model', 'list_model_objects'] +E assert False +E + where False = any(. at 0x000002696EEA4200>) + +tests\llm\test_06_progressive.py:481: AssertionError +______________________ test_progressive[create_loads_L3] ______________________ + +case = {'case_id': 'create_loads', 'expected': ['create_people_definition', 'create_lights_definition'], 'id': 'create_loads_L3', 'level': 'L3', ...} + + @pytest.mark.progressive + @pytest.mark.parametrize("case", _FLAT_CASES, ids=[c["id"] for c in _FLAT_CASES]) + def test_progressive(case): + """Test tool discovery at varying prompt specificity levels.""" + # Validates: Claude routes L1/L2/L3 prompts to correct tools � lower levels passing = better discoverability + tier = get_tier() + if tier not in ("all", "1"): + pytest.skip("Tier 1 not selected") + + prompt = case["prompt"] + if case.get("needs_run"): + run_id = get_sim_run_id() + if not run_id: + pytest.skip("No simulation run_id � run test_01_setup first") + prompt = f"Use run_id '{run_id}'. 
" + prompt + elif case.get("needs_hvac"): + if not baseline_hvac_model_exists(): + pytest.skip("Baseline+HVAC model not found � run test_01_setup first") + prompt = LOAD_HVAC + prompt.lower() + elif case["needs_model"]: + if not baseline_model_exists(): + pytest.skip("Baseline model not found � run test_01_setup first") + prompt = LOAD + prompt.lower() + prompt += SUFFIX + + timeout = 300 if case.get("needs_run") or case["case_id"] == "run_simulation" else 120 + result = run_claude(prompt, timeout=timeout) + tool_names = result.tool_names + +> assert any(t in case["expected"] for t in tool_names), ( + f"[{case['case_id']} {case['level']}] " + f"Expected one of {case['expected']}, got: {tool_names}" + ) +E AssertionError: [create_loads L3] Expected one of ['create_people_definition', 'create_lights_definition'], got: ['load_osm_model', 'list_spaces'] +E assert False +E + where False = any(. at 0x000002696ED392A0>) + +tests\llm\test_06_progressive.py:481: AssertionError +============================== warnings summary =============================== +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[energy-report:Give me a full energy report] + C:\Python313\Lib\site-packages\_pytest\threadexception.py:58: PytestUnhandledThreadExceptionWarning: Exception in thread Thread-29 (_readerthread) + + Traceback (most recent call last): + File "C:\Python313\Lib\threading.py", line 1044, in _bootstrap_inner + self.run() + ~~~~~~~~^^ + File "C:\Python313\Lib\threading.py", line 995, in run + self._target(*self._args, **self._kwargs) + ~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "C:\Python313\Lib\subprocess.py", line 1615, in _readerthread + buffer.append(fh.read()) + ~~~~~~~^^ + File "C:\Python313\Lib\encodings\cp1252.py", line 23, in decode + return codecs.charmap_decode(input,self.errors,decoding_table)[0] + ~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + UnicodeDecodeError: 'charmap' codec can't decode byte 0x8f in position 68267: character maps to + + 
Enable tracemalloc to get traceback where the object was allocated. + See https://docs.pytest.org/en/stable/how-to/capture-warnings.html#resource-warnings for more info. + warnings.warn(pytest.PytestUnhandledThreadExceptionWarning(msg)) + +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[simulate:Run a simulation] + C:\Python313\Lib\site-packages\_pytest\threadexception.py:58: PytestUnhandledThreadExceptionWarning: Exception in thread Thread-53 (_readerthread) + + Traceback (most recent call last): + File "C:\Python313\Lib\threading.py", line 1044, in _bootstrap_inner + self.run() + ~~~~~~~~^^ + File "C:\Python313\Lib\threading.py", line 995, in run + self._target(*self._args, **self._kwargs) + ~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "C:\Python313\Lib\subprocess.py", line 1615, in _readerthread + buffer.append(fh.read()) + ~~~~~~~^^ + File "C:\Python313\Lib\encodings\cp1252.py", line 23, in decode + return codecs.charmap_decode(input,self.errors,decoding_table)[0] + ~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + UnicodeDecodeError: 'charmap' codec can't decode byte 0x8f in position 48231: character maps to + + Enable tracemalloc to get traceback where the object was allocated. + See https://docs.pytest.org/en/stable/how-to/capture-warnings.html#resource-warnings for more info. 
+ warnings.warn(pytest.PytestUnhandledThreadExceptionWarning(msg)) + +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[simulate:Simulate the model] + C:\Python313\Lib\site-packages\_pytest\threadexception.py:58: PytestUnhandledThreadExceptionWarning: Exception in thread Thread-55 (_readerthread) + + Traceback (most recent call last): + File "C:\Python313\Lib\threading.py", line 1044, in _bootstrap_inner + self.run() + ~~~~~~~~^^ + File "C:\Python313\Lib\threading.py", line 995, in run + self._target(*self._args, **self._kwargs) + ~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "C:\Python313\Lib\subprocess.py", line 1615, in _readerthread + buffer.append(fh.read()) + ~~~~~~~^^ + File "C:\Python313\Lib\encodings\cp1252.py", line 23, in decode + return codecs.charmap_decode(input,self.errors,decoding_table)[0] + ~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + UnicodeDecodeError: 'charmap' codec can't decode byte 0x8f in position 37994: character maps to + + Enable tracemalloc to get traceback where the object was allocated. + See https://docs.pytest.org/en/stable/how-to/capture-warnings.html#resource-warnings for more info. 
+ warnings.warn(pytest.PytestUnhandledThreadExceptionWarning(msg)) + +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[troubleshoot:My simulation failed] + C:\Python313\Lib\site-packages\_pytest\threadexception.py:58: PytestUnhandledThreadExceptionWarning: Exception in thread Thread-59 (_readerthread) + + Traceback (most recent call last): + File "C:\Python313\Lib\threading.py", line 1044, in _bootstrap_inner + self.run() + ~~~~~~~~^^ + File "C:\Python313\Lib\threading.py", line 995, in run + self._target(*self._args, **self._kwargs) + ~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "C:\Python313\Lib\subprocess.py", line 1615, in _readerthread + buffer.append(fh.read()) + ~~~~~~~^^ + File "C:\Python313\Lib\encodings\cp1252.py", line 23, in decode + return codecs.charmap_decode(input,self.errors,decoding_table)[0] + ~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + UnicodeDecodeError: 'charmap' codec can't decode byte 0x8f in position 208042: character maps to + + Enable tracemalloc to get traceback where the object was allocated. + See https://docs.pytest.org/en/stable/how-to/capture-warnings.html#resource-warnings for more info. 
+ warnings.warn(pytest.PytestUnhandledThreadExceptionWarning(msg)) + +tests/llm/test_04_workflows.py::test_workflow[measure_replace_terminals_full_chain] + C:\Python313\Lib\site-packages\_pytest\threadexception.py:58: PytestUnhandledThreadExceptionWarning: Exception in thread Thread-121 (_readerthread) + + Traceback (most recent call last): + File "C:\Python313\Lib\threading.py", line 1044, in _bootstrap_inner + self.run() + ~~~~~~~~^^ + File "C:\Python313\Lib\threading.py", line 995, in run + self._target(*self._args, **self._kwargs) + ~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "C:\Python313\Lib\subprocess.py", line 1615, in _readerthread + buffer.append(fh.read()) + ~~~~~~~^^ + File "C:\Python313\Lib\encodings\cp1252.py", line 23, in decode + return codecs.charmap_decode(input,self.errors,decoding_table)[0] + ~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + UnicodeDecodeError: 'charmap' codec can't decode byte 0x8f in position 140544: character maps to + + Enable tracemalloc to get traceback where the object was allocated. + See https://docs.pytest.org/en/stable/how-to/capture-warnings.html#resource-warnings for more info. 
+ warnings.warn(pytest.PytestUnhandledThreadExceptionWarning(msg)) + +tests/llm/test_06_progressive.py::test_progressive[hvac_sizing_L1] + C:\Python313\Lib\site-packages\_pytest\threadexception.py:58: PytestUnhandledThreadExceptionWarning: Exception in thread Thread-279 (_readerthread) + + Traceback (most recent call last): + File "C:\Python313\Lib\threading.py", line 1044, in _bootstrap_inner + self.run() + ~~~~~~~~^^ + File "C:\Python313\Lib\threading.py", line 995, in run + self._target(*self._args, **self._kwargs) + ~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "C:\Python313\Lib\subprocess.py", line 1615, in _readerthread + buffer.append(fh.read()) + ~~~~~~~^^ + File "C:\Python313\Lib\encodings\cp1252.py", line 23, in decode + return codecs.charmap_decode(input,self.errors,decoding_table)[0] + ~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + UnicodeDecodeError: 'charmap' codec can't decode byte 0x9d in position 37113: character maps to + + Enable tracemalloc to get traceback where the object was allocated. + See https://docs.pytest.org/en/stable/how-to/capture-warnings.html#resource-warnings for more info. + warnings.warn(pytest.PytestUnhandledThreadExceptionWarning(msg)) + +-- Docs: https://docs.pytest.org/en/stable/how-to/capture-warnings.html +=========================== short test summary info =========================== +FAILED tests/llm/test_03_eval_cases.py::test_eval_tool_selection[energy-report:Give me a full energy report] +FAILED tests/llm/test_03_eval_cases.py::test_eval_tool_selection[qaqc:Validate before simulation] +FAILED tests/llm/test_03_eval_cases.py::test_eval_tool_selection[qaqc:QA/QC the model] +FAILED tests/llm/test_03_eval_cases.py::test_eval_tool_selection[qaqc:Is my model ready to simulate?] 
+FAILED tests/llm/test_03_eval_cases.py::test_eval_tool_selection[simulate:Run a simulation] +FAILED tests/llm/test_03_eval_cases.py::test_eval_tool_selection[simulate:Simulate the model] +FAILED tests/llm/test_03_eval_cases.py::test_eval_tool_selection[troubleshoot:My simulation failed] +FAILED tests/llm/test_04_workflows.py::test_workflow[qaqc_check] - AssertionE... +FAILED tests/llm/test_04_workflows.py::test_workflow[import_floorspacejs] - A... +FAILED tests/llm/test_04_workflows.py::test_workflow[floorspacejs_to_typical] +FAILED tests/llm/test_04_workflows.py::test_workflow[envelope_retrofit] - Ass... +FAILED tests/llm/test_04_workflows.py::test_workflow[create_and_assign_loads] +FAILED tests/llm/test_04_workflows.py::test_workflow[measure_replace_terminals_full_chain] +FAILED tests/llm/test_06_progressive.py::test_progressive[import_floorplan_L1] +FAILED tests/llm/test_06_progressive.py::test_progressive[import_floorplan_L3] +FAILED tests/llm/test_06_progressive.py::test_progressive[create_building_L2] +FAILED tests/llm/test_06_progressive.py::test_progressive[thermal_zones_L1] +FAILED tests/llm/test_06_progressive.py::test_progressive[hvac_sizing_L1] - A... +FAILED tests/llm/test_06_progressive.py::test_progressive[replace_windows_L3] +FAILED tests/llm/test_06_progressive.py::test_progressive[create_loads_L3] - ... 
+===== 20 failed, 160 passed, 50 skipped, 6 warnings in 4776.88s (1:19:36) ===== diff --git a/docs/sweeps/opus-2026-03-28/benchmark.json b/docs/sweeps/opus-2026-03-28/benchmark.json new file mode 100644 index 0000000..8d15203 --- /dev/null +++ b/docs/sweeps/opus-2026-03-28/benchmark.json @@ -0,0 +1,5886 @@ +{ + "timestamp": "2026-03-28T21:44:31+00:00", + "model": "opus", + "retries": 0, + "total_tests": 180, + "passed": 170, + "failed": 10, + "pass_rate": 94.4, + "total_duration_s": 11078.5, + "total_input_tokens": 2019, + "total_output_tokens": 164420, + "total_cache_read_tokens": 22609596, + "total_cost_usd": 32.2343, + "tiers": { + "setup": { + "total": 6, + "passed": 6, + "duration_s": 512.4, + "pass_rate": 100.0 + }, + "tier1": { + "total": 4, + "passed": 4, + "duration_s": 135.2, + "pass_rate": 100.0 + }, + "tier3": { + "total": 26, + "passed": 19, + "duration_s": 1860.4, + "pass_rate": 73.1 + }, + "tier2": { + "total": 37, + "passed": 34, + "duration_s": 5343.5, + "pass_rate": 91.9 + }, + "tier4": { + "total": 3, + "passed": 3, + "duration_s": 135.3, + "pass_rate": 100.0 + }, + "progressive": { + "total": 104, + "passed": 104, + "duration_s": 3091.7, + "pass_rate": 100.0 + } + }, + "tests": [ + { + "test_id": "tests/llm/test_01_setup.py::test_create_baseline_model", + "passed": true, + "duration_s": 13.1, + "tier": "setup", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.10332200000000001, + "duration_ms": 10216, + "input_tokens": 7, + "output_tokens": 267, + "cache_read_tokens": 44749, + "tool_calls": [ + "create_baseline_osm" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__create_baseline_osm" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_01_setup.py::test_create_baseline_with_hvac", + "passed": true, + "duration_s": 14.8, + "tier": "setup", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.15514975, + "duration_ms": 12757, + "input_tokens": 7, + "output_tokens": 325, + 
"cache_read_tokens": 36067, + "tool_calls": [ + "create_baseline_osm" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__create_baseline_osm" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_01_setup.py::test_create_example_model", + "passed": true, + "duration_s": 11.8, + "tier": "setup", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.09422950000000001, + "duration_ms": 9710, + "input_tokens": 7, + "output_tokens": 203, + "cache_read_tokens": 45389, + "tool_calls": [ + "create_example_osm" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__create_example_osm" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_01_setup.py::test_load_baseline_model", + "passed": true, + "duration_s": 15.0, + "tier": "setup", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.11391500000000002, + "duration_ms": 12849, + "input_tokens": 8, + "output_tokens": 293, + "cache_read_tokens": 64600, + "tool_calls": [ + "load_osm_model", + "list_thermal_zones" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_thermal_zones" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_01_setup.py::test_run_baseline_simulation", + "passed": true, + "duration_s": 289.8, + "tier": "setup", + "attempt": 1, + "num_turns": 12, + "cost_usd": 0.23695949999999996, + "duration_ms": 287722, + "input_tokens": 18, + "output_tokens": 1306, + "cache_read_tokens": 235314, + "tool_calls": [ + "load_osm_model", + "change_building_location", + "run_simulation", + "get_run_status", + "save_osm_model", + "run_simulation", + "get_run_status" + ], + "num_tool_calls": 7, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__change_building_location", + "mcp__openstudio__run_simulation", + "Bash", + 
"mcp__openstudio__get_run_status", + "ToolSearch", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "Bash", + "mcp__openstudio__get_run_status" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_01_setup.py::test_run_retrofit_simulation", + "passed": true, + "duration_s": 167.9, + "tier": "setup", + "attempt": 1, + "num_turns": 8, + "cost_usd": 0.24028824999999995, + "duration_ms": 165126, + "input_tokens": 12, + "output_tokens": 945, + "cache_read_tokens": 141494, + "tool_calls": [ + "load_osm_model", + "change_building_location", + "adjust_thermostat_setpoints", + "run_simulation", + "get_run_status" + ], + "num_tool_calls": 5, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__change_building_location", + "mcp__openstudio__adjust_thermostat_setpoints", + "mcp__openstudio__run_simulation", + "Bash", + "mcp__openstudio__get_run_status" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_02_tool_selection.py::test_tool_selection_no_model[What is the server status?]", + "passed": true, + "duration_s": 12.2, + "tier": "tier1", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.09057250000000001, + "duration_ms": 9688, + "input_tokens": 7, + "output_tokens": 173, + "cache_read_tokens": 45525, + "tool_calls": [ + "get_server_status" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__get_server_status" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_02_tool_selection.py::test_tool_selection_no_model[List available skills]", + "passed": true, + "duration_s": 14.0, + "tier": "tier1", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.10012824999999999, + "duration_ms": 11963, + "input_tokens": 7, + "output_tokens": 391, + "cache_read_tokens": 45599, + "tool_calls": [ + "list_skills" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + 
"mcp__openstudio__list_skills" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_02_tool_selection.py::test_tool_selection_no_model[Create a small office building usin]", + "passed": true, + "duration_s": 90.1, + "tier": "tier1", + "attempt": 1, + "num_turns": 0, + "cost_usd": 0.0, + "duration_ms": 0, + "input_tokens": 0, + "output_tokens": 0, + "cache_read_tokens": 0, + "tool_calls": [ + "create_new_building", + "create_new_building", + "list_weather_files", + "create_new_building", + "create_new_building", + "create_new_building", + "create_bar_building" + ], + "num_tool_calls": 7, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__create_new_building", + "mcp__openstudio__create_new_building", + "ToolSearch", + "mcp__openstudio__list_weather_files", + "mcp__openstudio__create_new_building", + "mcp__openstudio__create_new_building", + "mcp__openstudio__create_new_building", + "ToolSearch", + "mcp__openstudio__create_bar_building" + ], + "toolsearch_count": 3, + "is_timeout": true + }, + { + "test_id": "tests/llm/test_02_tool_selection.py::test_tool_selection_no_model[Create bar geometry for a retail bu]", + "passed": true, + "duration_s": 18.9, + "tier": "tier1", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.11058100000000001, + "duration_ms": 16833, + "input_tokens": 7, + "output_tokens": 409, + "cache_read_tokens": 46367, + "tool_calls": [ + "create_bar_building" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__create_bar_building" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_03_eval_cases.py::test_eval_tool_selection[add-hvac:Add HVAC to the model]", + "passed": true, + "duration_s": 25.5, + "tier": "tier3", + "attempt": 1, + "num_turns": 7, + "cost_usd": 0.162391, + "duration_ms": 23321, + "input_tokens": 9, + "output_tokens": 889, + "cache_read_tokens": 86342, + "tool_calls": [ + "load_osm_model", + "get_building_info", + 
"list_thermal_zones", + "add_baseline_system" + ], + "num_tool_calls": 4, + "all_tool_calls": [ + "ToolSearch", + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__get_building_info", + "mcp__openstudio__list_thermal_zones", + "mcp__openstudio__add_baseline_system" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_03_eval_cases.py::test_eval_tool_selection[add-hvac:Set up heating and cooling]", + "passed": true, + "duration_s": 27.7, + "tier": "tier3", + "attempt": 1, + "num_turns": 8, + "cost_usd": 0.15196725, + "duration_ms": 25592, + "input_tokens": 13, + "output_tokens": 747, + "cache_read_tokens": 104792, + "tool_calls": [ + "load_osm_model", + "get_building_info", + "list_thermal_zones" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "Skill", + "ToolSearch", + "mcp__openstudio__get_building_info", + "mcp__openstudio__list_thermal_zones" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_03_eval_cases.py::test_eval_tool_selection[add-hvac:What HVAC system should I use?]", + "passed": true, + "duration_s": 29.4, + "tier": "tier3", + "attempt": 1, + "num_turns": 8, + "cost_usd": 0.15607325000000002, + "duration_ms": 27330, + "input_tokens": 13, + "output_tokens": 914, + "cache_read_tokens": 104754, + "tool_calls": [ + "load_osm_model", + "get_building_info", + "list_thermal_zones" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "Skill", + "ToolSearch", + "mcp__openstudio__get_building_info", + "mcp__openstudio__list_thermal_zones" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_03_eval_cases.py::test_eval_tool_selection[add-hvac:Add a VAV system]", + "passed": true, + "duration_s": 23.5, + "tier": "tier3", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.14527174999999998, + "duration_ms": 21438, + 
"input_tokens": 9, + "output_tokens": 704, + "cache_read_tokens": 86691, + "tool_calls": [ + "load_osm_model", + "list_thermal_zones", + "add_baseline_system" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_thermal_zones", + "mcp__openstudio__add_baseline_system" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_03_eval_cases.py::test_eval_tool_selection[energy-report:Give me a full energy report]", + "passed": false, + "duration_s": 120.2, + "tier": "tier3", + "attempt": 1, + "num_turns": 0, + "cost_usd": 0.0, + "duration_ms": 0, + "input_tokens": 0, + "output_tokens": 0, + "cache_read_tokens": 0, + "tool_calls": [ + "load_osm_model", + "get_building_info", + "list_files", + "get_weather_info", + "run_simulation" + ], + "num_tool_calls": 5, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__get_building_info", + "ToolSearch", + "mcp__openstudio__list_files", + "ToolSearch", + "mcp__openstudio__get_weather_info", + "mcp__openstudio__run_simulation", + "Bash" + ], + "toolsearch_count": 5, + "is_timeout": true, + "failure_mode": "timeout" + }, + { + "test_id": "tests/llm/test_03_eval_cases.py::test_eval_tool_selection[new-building:Create a small office building]", + "passed": true, + "duration_s": 180.1, + "tier": "tier3", + "attempt": 1, + "num_turns": 0, + "cost_usd": 0.0, + "duration_ms": 0, + "input_tokens": 0, + "output_tokens": 0, + "cache_read_tokens": 0, + "tool_calls": [ + "create_new_building", + "create_new_building", + "list_weather_files", + "create_new_building", + "create_new_building", + "create_new_building", + "create_bar_building", + "create_example_osm", + "create_bar_building", + "change_building_location", + "create_baseline_osm", + "change_building_location" + ], + "num_tool_calls": 12, + "all_tool_calls": [ + "ToolSearch", + 
"mcp__openstudio__create_new_building", + "mcp__openstudio__create_new_building", + "ToolSearch", + "mcp__openstudio__list_weather_files", + "mcp__openstudio__create_new_building", + "mcp__openstudio__create_new_building", + "mcp__openstudio__create_new_building", + "ToolSearch", + "mcp__openstudio__create_bar_building", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__create_example_osm", + "mcp__openstudio__create_bar_building", + "mcp__openstudio__change_building_location", + "mcp__openstudio__create_baseline_osm", + "mcp__openstudio__change_building_location", + "ToolSearch" + ], + "toolsearch_count": 6, + "is_timeout": true + }, + { + "test_id": "tests/llm/test_03_eval_cases.py::test_eval_tool_selection[new-building:Model a 3-story school]", + "passed": true, + "duration_s": 180.1, + "tier": "tier3", + "attempt": 1, + "num_turns": 0, + "cost_usd": 0.0, + "duration_ms": 0, + "input_tokens": 0, + "output_tokens": 0, + "cache_read_tokens": 0, + "tool_calls": [ + "list_weather_files", + "create_new_building", + "change_building_location", + "change_building_location", + "create_typical_building" + ], + "num_tool_calls": 5, + "all_tool_calls": [ + "ToolSearch", + "ToolSearch", + "mcp__openstudio__list_weather_files", + "mcp__openstudio__create_new_building", + "ToolSearch", + "mcp__openstudio__change_building_location", + "mcp__openstudio__change_building_location", + "mcp__openstudio__create_typical_building", + "Read", + "Bash" + ], + "toolsearch_count": 3, + "is_timeout": true + }, + { + "test_id": "tests/llm/test_03_eval_cases.py::test_eval_tool_selection[new-building:Create a retail building, 25000 sqf]", + "passed": true, + "duration_s": 174.2, + "tier": "tier3", + "attempt": 1, + "num_turns": 15, + "cost_usd": 0.53869725, + "duration_ms": 172212, + "input_tokens": 27, + "output_tokens": 4091, + "cache_read_tokens": 447712, + "tool_calls": [ + "create_new_building", + "create_new_building", + "list_weather_files", + "create_new_building", + 
"create_new_building", + "create_bar_building", + "change_building_location", + "create_typical_building", + "get_building_info" + ], + "num_tool_calls": 9, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__create_new_building", + "mcp__openstudio__create_new_building", + "ToolSearch", + "mcp__openstudio__list_weather_files", + "mcp__openstudio__create_new_building", + "mcp__openstudio__create_new_building", + "ToolSearch", + "mcp__openstudio__create_bar_building", + "ToolSearch", + "mcp__openstudio__change_building_location", + "mcp__openstudio__create_typical_building", + "ToolSearch", + "mcp__openstudio__get_building_info" + ], + "toolsearch_count": 5, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_03_eval_cases.py::test_eval_tool_selection[new-building:Import the FloorspaceJS floor plan ]", + "passed": true, + "duration_s": 38.9, + "tier": "tier3", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.14428275, + "duration_ms": 36602, + "input_tokens": 12, + "output_tokens": 635, + "cache_read_tokens": 103533, + "tool_calls": [ + "import_floorspacejs", + "list_files", + "import_floorspacejs" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__import_floorspacejs", + "ToolSearch", + "mcp__openstudio__list_files", + "mcp__openstudio__import_floorspacejs" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_03_eval_cases.py::test_eval_tool_selection[new-building:Create a bar building for a medium ]", + "passed": true, + "duration_s": 21.9, + "tier": "tier3", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.11689225, + "duration_ms": 19850, + "input_tokens": 7, + "output_tokens": 436, + "cache_read_tokens": 46377, + "tool_calls": [ + "create_bar_building" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__create_bar_building" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": 
"tests/llm/test_03_eval_cases.py::test_eval_tool_selection[qaqc:Check the model for issues]", + "passed": false, + "duration_s": 17.4, + "tier": "tier3", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.11736225, + "duration_ms": 15368, + "input_tokens": 8, + "output_tokens": 404, + "cache_read_tokens": 64857, + "tool_calls": [ + "load_osm_model", + "validate_model" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__validate_model" + ], + "toolsearch_count": 1, + "is_timeout": false, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_03_eval_cases.py::test_eval_tool_selection[qaqc:Validate before simulation]", + "passed": false, + "duration_s": 25.7, + "tier": "tier3", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.108795, + "duration_ms": 23690, + "input_tokens": 8, + "output_tokens": 358, + "cache_read_tokens": 64935, + "tool_calls": [ + "load_osm_model", + "validate_model" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__validate_model" + ], + "toolsearch_count": 1, + "is_timeout": false, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_03_eval_cases.py::test_eval_tool_selection[qaqc:QA/QC the model]", + "passed": false, + "duration_s": 28.3, + "tier": "tier3", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.1273635, + "duration_ms": 26270, + "input_tokens": 11, + "output_tokens": 557, + "cache_read_tokens": 85142, + "tool_calls": [ + "load_osm_model", + "validate_model" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__validate_model" + ], + "toolsearch_count": 2, + "is_timeout": false, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_03_eval_cases.py::test_eval_tool_selection[qaqc:Is my model ready to simulate?]", + "passed": false, + "duration_s": 16.2, + "tier": 
"tier3", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.16788225, + "duration_ms": 14159, + "input_tokens": 8, + "output_tokens": 399, + "cache_read_tokens": 54872, + "tool_calls": [ + "load_osm_model", + "validate_model" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__validate_model" + ], + "toolsearch_count": 1, + "is_timeout": false, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_03_eval_cases.py::test_eval_tool_selection[retrofit:Compare before and after adding ins]", + "passed": true, + "duration_s": 58.3, + "tier": "tier3", + "attempt": 1, + "num_turns": 18, + "cost_usd": 0.329591, + "duration_ms": 56330, + "input_tokens": 24, + "output_tokens": 2315, + "cache_read_tokens": 257767, + "tool_calls": [ + "load_osm_model", + "list_model_objects", + "list_surfaces", + "list_surfaces", + "get_construction_details", + "get_construction_details", + "get_object_fields", + "get_object_fields", + "set_object_property", + "set_object_property", + "get_object_fields", + "get_object_fields" + ], + "num_tool_calls": 12, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__list_surfaces", + "mcp__openstudio__list_surfaces", + "mcp__openstudio__get_construction_details", + "mcp__openstudio__get_construction_details", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__get_object_fields", + "mcp__openstudio__get_object_fields", + "mcp__openstudio__set_object_property", + "mcp__openstudio__set_object_property", + "mcp__openstudio__get_object_fields", + "mcp__openstudio__get_object_fields" + ], + "toolsearch_count": 5, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_03_eval_cases.py::test_eval_tool_selection[retrofit:Do a retrofit analysis]", + "passed": true, + "duration_s": 180.1, + "tier": "tier3", + "attempt": 1, + "num_turns": 0, + "cost_usd": 0.0, + 
"duration_ms": 0, + "input_tokens": 0, + "output_tokens": 0, + "cache_read_tokens": 0, + "tool_calls": [ + "load_osm_model", + "get_building_info", + "get_model_summary", + "list_air_loops", + "list_thermal_zones", + "get_weather_info", + "list_surfaces", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "add_baseline_system", + "save_osm_model", + "run_simulation", + "list_materials", + "get_construction_details", + "get_construction_details", + "get_construction_details", + "list_model_objects", + "get_construction_details", + "get_construction_details", + "get_object_fields", + "get_object_fields", + "save_osm_model", + "set_object_property", + "set_object_property", + "list_model_objects", + "get_load_details", + "list_model_objects", + "get_object_fields", + "set_object_property", + "save_osm_model", + "run_simulation", + "get_run_status", + "get_run_status", + "extract_summary_metrics", + "extract_end_use_breakdown", + "get_run_status", + "extract_summary_metrics", + "extract_end_use_breakdown" + ], + "num_tool_calls": 44, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__get_building_info", + "mcp__openstudio__get_model_summary", + "mcp__openstudio__list_air_loops", + "mcp__openstudio__list_thermal_zones", + "ToolSearch", + "mcp__openstudio__get_weather_info", + "ToolSearch", + "mcp__openstudio__list_surfaces", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + 
"mcp__openstudio__add_baseline_system", + "ToolSearch", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "ToolSearch", + "mcp__openstudio__list_materials", + "mcp__openstudio__get_construction_details", + "mcp__openstudio__get_construction_details", + "mcp__openstudio__get_construction_details", + "ToolSearch", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__get_construction_details", + "mcp__openstudio__get_construction_details", + "ToolSearch", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__get_object_fields", + "mcp__openstudio__get_object_fields", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__set_object_property", + "mcp__openstudio__set_object_property", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__get_load_details", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__get_object_fields", + "mcp__openstudio__set_object_property", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "ToolSearch", + "mcp__openstudio__extract_summary_metrics", + "mcp__openstudio__extract_end_use_breakdown", + "mcp__openstudio__get_run_status", + "mcp__openstudio__extract_summary_metrics", + "mcp__openstudio__extract_end_use_breakdown" + ], + "toolsearch_count": 11, + "is_timeout": true + }, + { + "test_id": "tests/llm/test_03_eval_cases.py::test_eval_tool_selection[simulate:Run a simulation]", + "passed": true, + "duration_s": 120.1, + "tier": "tier3", + "attempt": 1, + "num_turns": 0, + "cost_usd": 0.0, + "duration_ms": 0, + "input_tokens": 0, + "output_tokens": 0, + "cache_read_tokens": 0, + "tool_calls": [ + "load_osm_model", + "run_simulation" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__run_simulation", + "ToolSearch", + "Bash" + ], + "toolsearch_count": 3, + "is_timeout": true + }, + { + "test_id": 
"tests/llm/test_03_eval_cases.py::test_eval_tool_selection[simulate:Simulate the model]", + "passed": true, + "duration_s": 120.1, + "tier": "tier3", + "attempt": 1, + "num_turns": 0, + "cost_usd": 0.0, + "duration_ms": 0, + "input_tokens": 0, + "output_tokens": 0, + "cache_read_tokens": 0, + "tool_calls": [ + "load_osm_model", + "run_simulation" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__run_simulation", + "ToolSearch", + "Bash" + ], + "toolsearch_count": 3, + "is_timeout": true + }, + { + "test_id": "tests/llm/test_03_eval_cases.py::test_eval_tool_selection[simulate:Run EnergyPlus]", + "passed": true, + "duration_s": 120.1, + "tier": "tier3", + "attempt": 1, + "num_turns": 0, + "cost_usd": 0.0, + "duration_ms": 0, + "input_tokens": 0, + "output_tokens": 0, + "cache_read_tokens": 0, + "tool_calls": [ + "load_osm_model", + "run_simulation" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__run_simulation", + "ToolSearch", + "Bash" + ], + "toolsearch_count": 3, + "is_timeout": true + }, + { + "test_id": "tests/llm/test_03_eval_cases.py::test_eval_tool_selection[troubleshoot:My simulation failed]", + "passed": false, + "duration_s": 25.9, + "tier": "tier3", + "attempt": 1, + "num_turns": 7, + "cost_usd": 0.19795775000000002, + "duration_ms": 23844, + "input_tokens": 14, + "output_tokens": 683, + "cache_read_tokens": 105113, + "tool_calls": [ + "load_osm_model", + "extract_simulation_errors", + "list_weather_files" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__extract_simulation_errors", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__list_weather_files" + ], + "toolsearch_count": 3, + "is_timeout": false, + "failure_mode": "wrong_tool" + }, + { + "test_id": 
"tests/llm/test_03_eval_cases.py::test_eval_tool_selection[troubleshoot:EUI looks way too high]", + "passed": true, + "duration_s": 120.1, + "tier": "tier3", + "attempt": 1, + "num_turns": 0, + "cost_usd": 0.0, + "duration_ms": 0, + "input_tokens": 0, + "output_tokens": 0, + "cache_read_tokens": 0, + "tool_calls": [ + "load_osm_model", + "extract_summary_metrics", + "extract_end_use_breakdown", + "extract_simulation_errors", + "get_run_status", + "get_run_artifacts", + "list_weather_files", + "change_building_location", + "save_osm_model", + "save_osm_model", + "run_simulation" + ], + "num_tool_calls": 11, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__extract_summary_metrics", + "mcp__openstudio__extract_end_use_breakdown", + "ToolSearch", + "mcp__openstudio__extract_simulation_errors", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_artifacts", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__list_weather_files", + "mcp__openstudio__change_building_location", + "ToolSearch", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "Bash" + ], + "toolsearch_count": 5, + "is_timeout": true + }, + { + "test_id": "tests/llm/test_03_eval_cases.py::test_eval_tool_selection[troubleshoot:Too many unmet hours]", + "passed": true, + "duration_s": 120.1, + "tier": "tier3", + "attempt": 1, + "num_turns": 0, + "cost_usd": 0.0, + "duration_ms": 0, + "input_tokens": 0, + "output_tokens": 0, + "cache_read_tokens": 0, + "tool_calls": [ + "load_osm_model", + "extract_summary_metrics", + "extract_zone_summary", + "extract_simulation_errors", + "get_run_status", + "list_weather_files", + "change_building_location", + "save_osm_model", + "save_osm_model", + "run_simulation" + ], + "num_tool_calls": 10, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__extract_summary_metrics", + 
"mcp__openstudio__extract_zone_summary", + "ToolSearch", + "mcp__openstudio__extract_simulation_errors", + "mcp__openstudio__get_run_status", + "ToolSearch", + "mcp__openstudio__list_weather_files", + "mcp__openstudio__change_building_location", + "ToolSearch", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "Bash" + ], + "toolsearch_count": 4, + "is_timeout": true + }, + { + "test_id": "tests/llm/test_03_eval_cases.py::test_eval_tool_selection[troubleshoot:Why did EnergyPlus crash?]", + "passed": false, + "duration_s": 17.6, + "tier": "tier3", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.10307150000000001, + "duration_ms": 14969, + "input_tokens": 7, + "output_tokens": 408, + "cache_read_tokens": 45948, + "tool_calls": [ + "load_osm_model", + "extract_simulation_errors" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__extract_simulation_errors" + ], + "toolsearch_count": 1, + "is_timeout": false, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_03_eval_cases.py::test_eval_tool_selection[view:Show me the model]", + "passed": true, + "duration_s": 29.5, + "tier": "tier3", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.135532, + "duration_ms": 27162, + "input_tokens": 12, + "output_tokens": 474, + "cache_read_tokens": 103644, + "tool_calls": [ + "load_osm_model", + "view_model", + "copy_file" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__view_model", + "ToolSearch", + "mcp__openstudio__copy_file" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_03_eval_cases.py::test_eval_tool_selection[view:Visualize the building]", + "passed": true, + "duration_s": 21.8, + "tier": "tier3", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.10845775, + "duration_ms": 19607, + "input_tokens": 8, + 
"output_tokens": 336, + "cache_read_tokens": 64948, + "tool_calls": [ + "load_osm_model", + "view_model" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__view_model" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_03_eval_cases.py::test_eval_tool_selection[view:3D view]", + "passed": true, + "duration_s": 17.6, + "tier": "tier3", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.10862650000000001, + "duration_ms": 15650, + "input_tokens": 8, + "output_tokens": 339, + "cache_read_tokens": 64948, + "tool_calls": [ + "load_osm_model", + "view_model" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__view_model" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[systemd_fourpipebeam_e2e]", + "passed": true, + "duration_s": 300.3, + "tier": "tier2", + "attempt": 1, + "num_turns": 1, + "cost_usd": 0.8532817500000002, + "duration_ms": 6129, + "input_tokens": 3, + "output_tokens": 102, + "cache_read_tokens": 54027, + "tool_calls": [ + "load_osm_model", + "list_weather_files", + "change_building_location", + "list_air_loops", + "save_osm_model", + "list_zone_hvac_equipment", + "list_plant_loops", + "search_wiring_patterns", + "search_api", + "get_skill", + "run_simulation", + "create_measure", + "test_measure", + "get_run_status", + "load_osm_model", + "apply_measure", + "save_osm_model", + "run_simulation", + "get_run_status", + "compare_runs", + "copy_file" + ], + "num_tool_calls": 21, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_weather_files", + "Glob", + "ToolSearch", + "Glob", + "mcp__openstudio__change_building_location", + "mcp__openstudio__list_air_loops", + "ToolSearch", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__list_zone_hvac_equipment", + 
"mcp__openstudio__list_plant_loops", + "mcp__openstudio__search_wiring_patterns", + "mcp__openstudio__search_api", + "mcp__openstudio__get_skill", + "mcp__openstudio__run_simulation", + "mcp__openstudio__create_measure", + "ToolSearch", + "mcp__openstudio__test_measure", + "mcp__openstudio__get_run_status", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__apply_measure", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "Bash", + "mcp__openstudio__get_run_status", + "mcp__openstudio__compare_runs", + "mcp__openstudio__copy_file" + ], + "toolsearch_count": 4, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[add_vav_reheat]", + "passed": true, + "duration_s": 25.6, + "tier": "tier2", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.148536, + "duration_ms": 23486, + "input_tokens": 9, + "output_tokens": 636, + "cache_read_tokens": 85407, + "tool_calls": [ + "load_osm_model", + "list_thermal_zones", + "add_baseline_system" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_thermal_zones", + "mcp__openstudio__add_baseline_system" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[add_doas]", + "passed": true, + "duration_s": 27.0, + "tier": "tier2", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.15967550000000003, + "duration_ms": 24949, + "input_tokens": 12, + "output_tokens": 715, + "cache_read_tokens": 104656, + "tool_calls": [ + "load_osm_model", + "list_thermal_zones", + "add_doas_system" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__list_thermal_zones", + "mcp__openstudio__add_doas_system" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[add_vrf]", + "passed": true, + 
"duration_s": 24.2, + "tier": "tier2", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.15180175, + "duration_ms": 22102, + "input_tokens": 12, + "output_tokens": 645, + "cache_read_tokens": 104571, + "tool_calls": [ + "load_osm_model", + "list_thermal_zones", + "add_vrf_system" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__list_thermal_zones", + "mcp__openstudio__add_vrf_system" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[set_weather]", + "passed": true, + "duration_s": 20.5, + "tier": "tier2", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.11404974999999999, + "duration_ms": 18541, + "input_tokens": 8, + "output_tokens": 431, + "cache_read_tokens": 65557, + "tool_calls": [ + "load_osm_model", + "change_building_location" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__change_building_location" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[add_rooftop_pv]", + "passed": true, + "duration_s": 19.6, + "tier": "tier2", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.11152900000000002, + "duration_ms": 17627, + "input_tokens": 8, + "output_tokens": 380, + "cache_read_tokens": 65203, + "tool_calls": [ + "load_osm_model", + "add_rooftop_pv" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__add_rooftop_pv" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[adjust_thermostat]", + "passed": true, + "duration_s": 17.6, + "tier": "tier2", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.17364625, + "duration_ms": 15528, + "input_tokens": 8, + "output_tokens": 402, + "cache_read_tokens": 54725, + "tool_calls": [ + 
"load_osm_model", + "adjust_thermostat_setpoints" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__adjust_thermostat_setpoints" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[delete_space]", + "passed": true, + "duration_s": 15.3, + "tier": "tier2", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.18533624999999998, + "duration_ms": 13239, + "input_tokens": 9, + "output_tokens": 437, + "cache_read_tokens": 76145, + "tool_calls": [ + "load_osm_model", + "list_spaces", + "delete_object" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_spaces", + "mcp__openstudio__delete_object" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[qaqc_check]", + "passed": true, + "duration_s": 15.6, + "tier": "tier2", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.11308975000000002, + "duration_ms": 13642, + "input_tokens": 8, + "output_tokens": 460, + "cache_read_tokens": 65487, + "tool_calls": [ + "load_osm_model", + "run_qaqc_checks" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__run_qaqc_checks" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[create_bar_office]", + "passed": true, + "duration_s": 20.4, + "tier": "tier2", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.1401155, + "duration_ms": 18376, + "input_tokens": 8, + "output_tokens": 589, + "cache_read_tokens": 68226, + "tool_calls": [ + "create_bar_building", + "list_spaces" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__create_bar_building", + "mcp__openstudio__list_spaces" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + 
"test_id": "tests/llm/test_04_workflows.py::test_workflow[create_new_building]", + "passed": true, + "duration_s": 51.2, + "tier": "tier2", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.10507625000000001, + "duration_ms": 49208, + "input_tokens": 7, + "output_tokens": 421, + "cache_read_tokens": 46620, + "tool_calls": [ + "create_new_building" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__create_new_building" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[bar_then_typical]", + "passed": true, + "duration_s": 60.2, + "tier": "tier2", + "attempt": 1, + "num_turns": 7, + "cost_usd": 0.24585975, + "duration_ms": 58247, + "input_tokens": 11, + "output_tokens": 910, + "cache_read_tokens": 129722, + "tool_calls": [ + "create_bar_building", + "change_building_location", + "create_typical_building" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__create_bar_building", + "mcp__openstudio__change_building_location", + "mcp__openstudio__create_typical_building", + "Read", + "Bash" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[import_floorspacejs]", + "passed": true, + "duration_s": 23.0, + "tier": "tier2", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.143563, + "duration_ms": 20978, + "input_tokens": 12, + "output_tokens": 591, + "cache_read_tokens": 103306, + "tool_calls": [ + "import_floorspacejs", + "list_files", + "import_floorspacejs" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__import_floorspacejs", + "ToolSearch", + "mcp__openstudio__list_files", + "mcp__openstudio__import_floorspacejs" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[floorspacejs_to_typical]", + "passed": true, + "duration_s": 120.8, + "tier": "tier2", + 
"attempt": 1, + "num_turns": 13, + "cost_usd": 0.278638, + "duration_ms": 118613, + "input_tokens": 19, + "output_tokens": 1971, + "cache_read_tokens": 266461, + "tool_calls": [ + "import_floorspacejs", + "list_files", + "import_floorspacejs", + "change_building_location", + "create_typical_building" + ], + "num_tool_calls": 5, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__import_floorspacejs", + "Glob", + "ToolSearch", + "mcp__openstudio__list_files", + "mcp__openstudio__import_floorspacejs", + "mcp__openstudio__change_building_location", + "mcp__openstudio__create_typical_building", + "Read", + "Grep", + "Read", + "Bash" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[manual_geometry_match]", + "passed": true, + "duration_s": 27.2, + "tier": "tier2", + "attempt": 1, + "num_turns": 7, + "cost_usd": 0.16100174999999997, + "duration_ms": 25119, + "input_tokens": 12, + "output_tokens": 886, + "cache_read_tokens": 111121, + "tool_calls": [ + "create_example_osm", + "create_space_from_floor_print", + "create_space_from_floor_print", + "match_surfaces" + ], + "num_tool_calls": 4, + "all_tool_calls": [ + "ToolSearch", + "ToolSearch", + "mcp__openstudio__create_example_osm", + "mcp__openstudio__create_space_from_floor_print", + "mcp__openstudio__create_space_from_floor_print", + "mcp__openstudio__match_surfaces" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[envelope_retrofit]", + "passed": true, + "duration_s": 38.9, + "tier": "tier2", + "attempt": 1, + "num_turns": 14, + "cost_usd": 0.24899050000000003, + "duration_ms": 36774, + "input_tokens": 13, + "output_tokens": 1418, + "cache_read_tokens": 118851, + "tool_calls": [ + "load_osm_model", + "list_surfaces", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + 
"set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "replace_window_constructions" + ], + "num_tool_calls": 11, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__list_surfaces", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__replace_window_constructions" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[create_and_assign_loads]", + "passed": true, + "duration_s": 34.1, + "tier": "tier2", + "attempt": 1, + "num_turns": 7, + "cost_usd": 0.14892375, + "duration_ms": 32067, + "input_tokens": 12, + "output_tokens": 770, + "cache_read_tokens": 106540, + "tool_calls": [ + "load_osm_model", + "list_spaces", + "create_people_definition", + "create_lights_definition" + ], + "num_tool_calls": 4, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__list_spaces", + "mcp__openstudio__create_people_definition", + "mcp__openstudio__create_lights_definition" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[plant_loop_with_boiler]", + "passed": true, + "duration_s": 19.9, + "tier": "tier2", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.13008, + "duration_ms": 17645, + "input_tokens": 9, + "output_tokens": 570, + "cache_read_tokens": 86220, + "tool_calls": [ + "load_osm_model", + "create_plant_loop", + "add_supply_equipment" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + 
"mcp__openstudio__create_plant_loop", + "mcp__openstudio__add_supply_equipment" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[inspect_and_modify_boiler]", + "passed": true, + "duration_s": 27.7, + "tier": "tier2", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.15469724999999998, + "duration_ms": 25633, + "input_tokens": 10, + "output_tokens": 691, + "cache_read_tokens": 109207, + "tool_calls": [ + "load_osm_model", + "list_model_objects", + "get_object_fields", + "set_object_property" + ], + "num_tool_calls": 4, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__get_object_fields", + "mcp__openstudio__set_object_property" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[extract_results_chain]", + "passed": true, + "duration_s": 16.8, + "tier": "tier2", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.10156400000000002, + "duration_ms": 14774, + "input_tokens": 7, + "output_tokens": 413, + "cache_read_tokens": 45958, + "tool_calls": [ + "extract_summary_metrics", + "extract_end_use_breakdown" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__extract_summary_metrics", + "mcp__openstudio__extract_end_use_breakdown" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[hvac_chilled_beam_comparison]", + "passed": false, + "duration_s": 300.1, + "tier": "tier2", + "attempt": 1, + "num_turns": 0, + "cost_usd": 0.0, + "duration_ms": 0, + "input_tokens": 0, + "output_tokens": 0, + "cache_read_tokens": 0, + "tool_calls": [ + "load_osm_model", + "list_air_loops", + "get_air_loop_details", + "replace_air_terminals", + "save_osm_model", + "run_simulation", + "get_run_status", + "list_weather_files", + "change_building_location", + 
"save_osm_model", + "run_simulation" + ], + "num_tool_calls": 11, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_air_loops", + "mcp__openstudio__get_air_loop_details", + "mcp__openstudio__replace_air_terminals", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "ToolSearch", + "Bash", + "mcp__openstudio__get_run_status", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__list_weather_files", + "mcp__openstudio__change_building_location", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "Bash" + ], + "toolsearch_count": 4, + "is_timeout": true, + "failure_mode": "timeout" + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[create_test_apply_measure]", + "passed": true, + "duration_s": 27.1, + "tier": "tier2", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.15245175, + "duration_ms": 24981, + "input_tokens": 10, + "output_tokens": 694, + "cache_read_tokens": 109891, + "tool_calls": [ + "load_osm_model", + "create_measure", + "test_measure", + "apply_measure" + ], + "num_tool_calls": 4, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__create_measure", + "mcp__openstudio__test_measure", + "mcp__openstudio__apply_measure" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[measure_set_lights_full_chain]", + "passed": true, + "duration_s": 506.4, + "tier": "tier2", + "attempt": 1, + "num_turns": 29, + "cost_usd": 0.6606762500000001, + "duration_ms": 504403, + "input_tokens": 36, + "output_tokens": 3999, + "cache_read_tokens": 748080, + "tool_calls": [ + "load_osm_model", + "save_osm_model", + "run_simulation", + "get_run_status", + "list_weather_files", + "change_building_location", + "save_osm_model", + "run_simulation", + "get_run_status", + "extract_summary_metrics", + "load_osm_model", + "create_measure", + "test_measure", + 
"change_building_location", + "apply_measure", + "save_osm_model", + "run_simulation", + "get_run_status", + "extract_summary_metrics", + "compare_runs" + ], + "num_tool_calls": 20, + "all_tool_calls": [ + "ToolSearch", + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "Bash", + "mcp__openstudio__get_run_status", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__list_weather_files", + "mcp__openstudio__change_building_location", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "Bash", + "mcp__openstudio__get_run_status", + "mcp__openstudio__extract_summary_metrics", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__create_measure", + "mcp__openstudio__test_measure", + "mcp__openstudio__change_building_location", + "mcp__openstudio__apply_measure", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "Bash", + "mcp__openstudio__get_run_status", + "mcp__openstudio__extract_summary_metrics", + "mcp__openstudio__compare_runs" + ], + "toolsearch_count": 5, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[measure_set_infiltration_full_chain]", + "passed": true, + "duration_s": 482.2, + "tier": "tier2", + "attempt": 1, + "num_turns": 30, + "cost_usd": 0.6816930000000001, + "duration_ms": 479729, + "input_tokens": 39, + "output_tokens": 3664, + "cache_read_tokens": 814671, + "tool_calls": [ + "load_osm_model", + "save_osm_model", + "run_simulation", + "get_run_status", + "load_osm_model", + "get_weather_info", + "list_weather_files", + "change_building_location", + "save_osm_model", + "run_simulation", + "get_run_status", + "extract_summary_metrics", + "load_osm_model", + "create_measure", + "test_measure", + "change_building_location", + "apply_measure", + "save_osm_model", + "run_simulation", + "get_run_status", + "extract_summary_metrics" + ], + "num_tool_calls": 21, + 
"all_tool_calls": [ + "ToolSearch", + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "ToolSearch", + "Bash", + "mcp__openstudio__get_run_status", + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__get_weather_info", + "ToolSearch", + "mcp__openstudio__list_weather_files", + "mcp__openstudio__change_building_location", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "Bash", + "mcp__openstudio__get_run_status", + "mcp__openstudio__extract_summary_metrics", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__create_measure", + "mcp__openstudio__test_measure", + "mcp__openstudio__change_building_location", + "mcp__openstudio__apply_measure", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "Bash", + "mcp__openstudio__get_run_status", + "mcp__openstudio__extract_summary_metrics" + ], + "toolsearch_count": 5, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[measure_replace_terminals_full_chain]", + "passed": true, + "duration_s": 544.3, + "tier": "tier2", + "attempt": 1, + "num_turns": 39, + "cost_usd": 0.972912, + "duration_ms": 541585, + "input_tokens": 53, + "output_tokens": 6341, + "cache_read_tokens": 1079669, + "tool_calls": [ + "load_osm_model", + "save_osm_model", + "run_simulation", + "get_run_status", + "get_weather_info", + "list_weather_files", + "load_osm_model", + "change_building_location", + "save_osm_model", + "run_simulation", + "get_run_status", + "extract_summary_metrics", + "load_osm_model", + "list_air_loops", + "list_plant_loops", + "search_wiring_patterns", + "search_api", + "create_measure", + "test_measure", + "apply_measure", + "change_building_location", + "save_osm_model", + "run_simulation", + "get_run_status", + "extract_summary_metrics", + "extract_end_use_breakdown", + "extract_end_use_breakdown" + ], + "num_tool_calls": 27, + 
"all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "ToolSearch", + "Bash", + "mcp__openstudio__get_run_status", + "ToolSearch", + "mcp__openstudio__get_weather_info", + "ToolSearch", + "mcp__openstudio__list_weather_files", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__change_building_location", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "Bash", + "mcp__openstudio__get_run_status", + "mcp__openstudio__extract_summary_metrics", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__list_air_loops", + "mcp__openstudio__list_plant_loops", + "ToolSearch", + "mcp__openstudio__search_wiring_patterns", + "mcp__openstudio__search_api", + "mcp__openstudio__create_measure", + "ToolSearch", + "mcp__openstudio__test_measure", + "mcp__openstudio__apply_measure", + "mcp__openstudio__change_building_location", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "Bash", + "mcp__openstudio__get_run_status", + "mcp__openstudio__extract_summary_metrics", + "mcp__openstudio__extract_end_use_breakdown", + "mcp__openstudio__extract_end_use_breakdown" + ], + "toolsearch_count": 8, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[create_measure_with_args]", + "passed": true, + "duration_s": 55.1, + "tier": "tier2", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.17993925, + "duration_ms": 52668, + "input_tokens": 7, + "output_tokens": 2905, + "cache_read_tokens": 46396, + "tool_calls": [ + "create_measure" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__create_measure" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[measure_add_baseboards_full_chain]", + "passed": true, + "duration_s": 512.2, + "tier": "tier2", + "attempt": 1, + 
"num_turns": 33, + "cost_usd": 0.7487729999999998, + "duration_ms": 510066, + "input_tokens": 49, + "output_tokens": 3787, + "cache_read_tokens": 910756, + "tool_calls": [ + "load_osm_model", + "save_osm_model", + "run_simulation", + "get_run_status", + "get_weather_info", + "load_osm_model", + "list_weather_files", + "change_building_location", + "save_osm_model", + "run_simulation", + "get_run_status", + "extract_summary_metrics", + "load_osm_model", + "change_building_location", + "list_thermal_zones", + "create_measure", + "test_measure", + "apply_measure", + "save_osm_model", + "run_simulation", + "get_run_status", + "extract_summary_metrics" + ], + "num_tool_calls": 22, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "ToolSearch", + "Bash", + "mcp__openstudio__get_run_status", + "ToolSearch", + "mcp__openstudio__get_weather_info", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__list_weather_files", + "mcp__openstudio__change_building_location", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "Bash", + "mcp__openstudio__get_run_status", + "mcp__openstudio__extract_summary_metrics", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__change_building_location", + "ToolSearch", + "mcp__openstudio__list_thermal_zones", + "mcp__openstudio__create_measure", + "ToolSearch", + "mcp__openstudio__test_measure", + "mcp__openstudio__apply_measure", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "Bash", + "mcp__openstudio__get_run_status", + "mcp__openstudio__extract_summary_metrics" + ], + "toolsearch_count": 7, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[ruby_measure_reduce_plugloads]", + "passed": true, + "duration_s": 550.7, + "tier": "tier2", + "attempt": 1, + "num_turns": 36, + "cost_usd": 0.8860807499999999, + 
"duration_ms": 548001, + "input_tokens": 51, + "output_tokens": 4926, + "cache_read_tokens": 1094564, + "tool_calls": [ + "load_osm_model", + "save_osm_model", + "run_simulation", + "get_run_status", + "get_weather_info", + "list_weather_files", + "change_building_location", + "save_osm_model", + "run_simulation", + "get_run_status", + "extract_summary_metrics", + "load_osm_model", + "change_building_location", + "get_skill", + "create_measure", + "test_measure", + "read_file", + "edit_measure", + "test_measure", + "apply_measure", + "save_osm_model", + "run_simulation", + "get_run_status", + "extract_summary_metrics" + ], + "num_tool_calls": 24, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "ToolSearch", + "Bash", + "mcp__openstudio__get_run_status", + "ToolSearch", + "mcp__openstudio__get_weather_info", + "ToolSearch", + "mcp__openstudio__list_weather_files", + "mcp__openstudio__change_building_location", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "Bash", + "mcp__openstudio__get_run_status", + "mcp__openstudio__extract_summary_metrics", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__change_building_location", + "ToolSearch", + "mcp__openstudio__get_skill", + "mcp__openstudio__create_measure", + "mcp__openstudio__test_measure", + "Read", + "ToolSearch", + "mcp__openstudio__read_file", + "ToolSearch", + "mcp__openstudio__edit_measure", + "mcp__openstudio__test_measure", + "mcp__openstudio__apply_measure", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "Bash", + "mcp__openstudio__get_run_status", + "mcp__openstudio__extract_summary_metrics" + ], + "toolsearch_count": 7, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[python_measure_reduce_plugloads]", + "passed": true, + "duration_s": 428.9, + "tier": "tier2", + "attempt": 1, + "num_turns": 36, + 
"cost_usd": 0.8973205000000003, + "duration_ms": 426484, + "input_tokens": 55, + "output_tokens": 6145, + "cache_read_tokens": 1050541, + "tool_calls": [ + "load_osm_model", + "save_osm_model", + "run_simulation", + "get_run_status", + "get_weather_info", + "list_weather_files", + "load_osm_model", + "change_building_location", + "save_osm_model", + "run_simulation", + "create_measure", + "test_measure", + "read_file", + "edit_measure", + "test_measure", + "get_run_status", + "extract_summary_metrics", + "load_osm_model", + "change_building_location", + "apply_measure", + "save_osm_model", + "run_simulation", + "get_run_status", + "extract_summary_metrics" + ], + "num_tool_calls": 24, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "ToolSearch", + "Bash", + "mcp__openstudio__get_run_status", + "ToolSearch", + "mcp__openstudio__get_weather_info", + "ToolSearch", + "mcp__openstudio__list_weather_files", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__change_building_location", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "ToolSearch", + "mcp__openstudio__create_measure", + "ToolSearch", + "mcp__openstudio__test_measure", + "ToolSearch", + "mcp__openstudio__read_file", + "ToolSearch", + "mcp__openstudio__edit_measure", + "mcp__openstudio__test_measure", + "mcp__openstudio__get_run_status", + "mcp__openstudio__extract_summary_metrics", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__change_building_location", + "mcp__openstudio__apply_measure", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "Bash", + "mcp__openstudio__get_run_status", + "mcp__openstudio__extract_summary_metrics" + ], + "toolsearch_count": 9, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[ruby_measure_boiler_efficiency]", + "passed": true, + "duration_s": 414.5, + 
"tier": "tier2", + "attempt": 1, + "num_turns": 36, + "cost_usd": 0.9814812500000001, + "duration_ms": 411858, + "input_tokens": 49, + "output_tokens": 7700, + "cache_read_tokens": 1106110, + "tool_calls": [ + "load_osm_model", + "save_osm_model", + "run_simulation", + "get_run_status", + "list_weather_files", + "load_osm_model", + "change_building_location", + "save_osm_model", + "run_simulation", + "get_run_status", + "extract_summary_metrics", + "load_osm_model", + "change_building_location", + "create_measure", + "test_measure", + "read_file", + "create_measure", + "test_measure", + "read_file", + "create_measure", + "test_measure", + "create_measure", + "test_measure", + "apply_measure", + "save_osm_model", + "run_simulation", + "get_run_status" + ], + "num_tool_calls": 27, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "ToolSearch", + "mcp__openstudio__get_run_status", + "ToolSearch", + "mcp__openstudio__list_weather_files", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__change_building_location", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "Bash", + "mcp__openstudio__get_run_status", + "mcp__openstudio__extract_summary_metrics", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__change_building_location", + "mcp__openstudio__create_measure", + "ToolSearch", + "mcp__openstudio__test_measure", + "ToolSearch", + "mcp__openstudio__read_file", + "mcp__openstudio__create_measure", + "mcp__openstudio__test_measure", + "mcp__openstudio__read_file", + "mcp__openstudio__create_measure", + "mcp__openstudio__test_measure", + "mcp__openstudio__create_measure", + "mcp__openstudio__test_measure", + "mcp__openstudio__apply_measure", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "Bash", + "mcp__openstudio__get_run_status" + ], + "toolsearch_count": 6, + "is_timeout": false + }, + 
{ + "test_id": "tests/llm/test_04_workflows.py::test_workflow[python_measure_boiler_efficiency]", + "passed": true, + "duration_s": 431.1, + "tier": "tier2", + "attempt": 1, + "num_turns": 36, + "cost_usd": 0.8719119999999999, + "duration_ms": 428954, + "input_tokens": 55, + "output_tokens": 5588, + "cache_read_tokens": 1038524, + "tool_calls": [ + "load_osm_model", + "save_osm_model", + "run_simulation", + "get_run_status", + "get_weather_info", + "list_weather_files", + "load_osm_model", + "change_building_location", + "save_osm_model", + "run_simulation", + "create_measure", + "test_measure", + "read_file", + "edit_measure", + "test_measure", + "get_run_status", + "extract_summary_metrics", + "load_osm_model", + "change_building_location", + "apply_measure", + "save_osm_model", + "run_simulation", + "get_run_status" + ], + "num_tool_calls": 23, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "ToolSearch", + "Bash", + "mcp__openstudio__get_run_status", + "ToolSearch", + "mcp__openstudio__get_weather_info", + "ToolSearch", + "mcp__openstudio__list_weather_files", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__change_building_location", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "ToolSearch", + "mcp__openstudio__create_measure", + "ToolSearch", + "mcp__openstudio__test_measure", + "Read", + "ToolSearch", + "mcp__openstudio__read_file", + "ToolSearch", + "mcp__openstudio__edit_measure", + "mcp__openstudio__test_measure", + "mcp__openstudio__get_run_status", + "mcp__openstudio__extract_summary_metrics", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__change_building_location", + "mcp__openstudio__apply_measure", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "Bash", + "mcp__openstudio__get_run_status" + ], + "toolsearch_count": 9, + "is_timeout": false + }, + { + 
"test_id": "tests/llm/test_04_workflows.py::test_create_measure_with_args_quality", + "passed": true, + "duration_s": 44.9, + "tier": "tier2", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.10097175, + "duration_ms": 42417, + "input_tokens": 7, + "output_tokens": 2373, + "cache_read_tokens": 57286, + "tool_calls": [ + "create_measure" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__create_measure" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_complex_model_multi_query", + "passed": true, + "duration_s": 22.6, + "tier": "tier2", + "attempt": 1, + "num_turns": 7, + "cost_usd": 0.1311175, + "duration_ms": 20223, + "input_tokens": 8, + "output_tokens": 760, + "cache_read_tokens": 66205, + "tool_calls": [ + "load_osm_model", + "get_building_info", + "list_air_loops", + "list_plant_loops", + "list_thermal_zones" + ], + "num_tool_calls": 5, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__get_building_info", + "mcp__openstudio__list_air_loops", + "mcp__openstudio__list_plant_loops", + "mcp__openstudio__list_thermal_zones" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_measure_reduce_plugloads_quality[Ruby]", + "passed": true, + "duration_s": 27.3, + "tier": "tier2", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.1388915, + "duration_ms": 24909, + "input_tokens": 7, + "output_tokens": 1553, + "cache_read_tokens": 46538, + "tool_calls": [ + "create_measure" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__create_measure" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_measure_reduce_plugloads_quality[Python]", + "passed": true, + "duration_s": 31.0, + "tier": "tier2", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.13806325, + "duration_ms": 28384, + 
"input_tokens": 7, + "output_tokens": 1534, + "cache_read_tokens": 46519, + "tool_calls": [ + "create_measure" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__create_measure" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_measure_boiler_efficiency_quality[Ruby]", + "passed": false, + "duration_s": 28.1, + "tier": "tier2", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.134245, + "duration_ms": 25665, + "input_tokens": 7, + "output_tokens": 1407, + "cache_read_tokens": 46570, + "tool_calls": [ + "create_measure" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__create_measure" + ], + "toolsearch_count": 1, + "is_timeout": false, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_measure_boiler_efficiency_quality[Python]", + "passed": false, + "duration_s": 31.1, + "tier": "tier2", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.1342105, + "duration_ms": 28763, + "input_tokens": 7, + "output_tokens": 1408, + "cache_read_tokens": 46551, + "tool_calls": [ + "create_measure" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__create_measure" + ], + "toolsearch_count": 1, + "is_timeout": false, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_05_guardrails.py::test_create_uses_mcp_not_raw_idf", + "passed": true, + "duration_s": 95.5, + "tier": "tier4", + "attempt": 1, + "num_turns": 10, + "cost_usd": 0.31379250000000003, + "duration_ms": 93455, + "input_tokens": 18, + "output_tokens": 1932, + "cache_read_tokens": 234355, + "tool_calls": [ + "create_new_building", + "list_weather_files", + "create_new_building", + "change_building_location", + "change_building_location", + "create_typical_building" + ], + "num_tool_calls": 6, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__create_new_building", + "ToolSearch", + 
"mcp__openstudio__list_weather_files", + "mcp__openstudio__create_new_building", + "ToolSearch", + "mcp__openstudio__change_building_location", + "mcp__openstudio__change_building_location", + "mcp__openstudio__create_typical_building" + ], + "toolsearch_count": 3, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_05_guardrails.py::test_no_script_for_results", + "passed": true, + "duration_s": 19.1, + "tier": "tier4", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.18825524999999999, + "duration_ms": 16620, + "input_tokens": 11, + "output_tokens": 597, + "cache_read_tokens": 74363, + "tool_calls": [ + "extract_summary_metrics", + "get_run_status", + "extract_simulation_errors" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__extract_summary_metrics", + "ToolSearch", + "mcp__openstudio__get_run_status", + "mcp__openstudio__extract_simulation_errors" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_05_guardrails.py::test_inspect_component_uses_mcp_not_script", + "passed": true, + "duration_s": 20.7, + "tier": "tier4", + "attempt": 1, + "num_turns": 8, + "cost_usd": 0.1426325, + "duration_ms": 18700, + "input_tokens": 9, + "output_tokens": 769, + "cache_read_tokens": 85250, + "tool_calls": [ + "load_osm_model", + "list_model_objects", + "list_model_objects", + "list_model_objects", + "get_component_properties" + ], + "num_tool_calls": 5, + "all_tool_calls": [ + "ToolSearch", + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__get_component_properties" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[import_floorplan_L1]", + "passed": true, + "duration_s": 21.2, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.12466424999999999, + 
"duration_ms": 19067, + "input_tokens": 8, + "output_tokens": 590, + "cache_read_tokens": 66511, + "tool_calls": [ + "list_files", + "import_floorspacejs" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__list_files", + "mcp__openstudio__import_floorspacejs" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[import_floorplan_L2]", + "passed": true, + "duration_s": 26.5, + "tier": "progressive", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.13965575, + "duration_ms": 24221, + "input_tokens": 12, + "output_tokens": 584, + "cache_read_tokens": 104004, + "tool_calls": [ + "import_floorspacejs", + "list_files", + "import_floorspacejs" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__import_floorspacejs", + "ToolSearch", + "mcp__openstudio__list_files", + "mcp__openstudio__import_floorspacejs" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[import_floorplan_L3]", + "passed": true, + "duration_s": 23.4, + "tier": "progressive", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.13958225000000002, + "duration_ms": 21404, + "input_tokens": 12, + "output_tokens": 583, + "cache_read_tokens": 103957, + "tool_calls": [ + "import_floorspacejs", + "list_files", + "import_floorspacejs" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__import_floorspacejs", + "ToolSearch", + "mcp__openstudio__list_files", + "mcp__openstudio__import_floorspacejs" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[add_hvac_L1]", + "passed": true, + "duration_s": 26.3, + "tier": "progressive", + "attempt": 1, + "num_turns": 8, + "cost_usd": 0.1775475, + "duration_ms": 24127, + "input_tokens": 12, + "output_tokens": 1005, + "cache_read_tokens": 107950, + "tool_calls": 
[ + "load_osm_model", + "get_building_info", + "list_thermal_zones", + "add_baseline_system" + ], + "num_tool_calls": 4, + "all_tool_calls": [ + "ToolSearch", + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__get_building_info", + "mcp__openstudio__list_thermal_zones", + "ToolSearch", + "mcp__openstudio__add_baseline_system" + ], + "toolsearch_count": 3, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[add_hvac_L2]", + "passed": true, + "duration_s": 19.5, + "tier": "progressive", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.14333875000000001, + "duration_ms": 17423, + "input_tokens": 9, + "output_tokens": 654, + "cache_read_tokens": 86425, + "tool_calls": [ + "load_osm_model", + "list_thermal_zones", + "add_baseline_system" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_thermal_zones", + "mcp__openstudio__add_baseline_system" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[add_hvac_L3]", + "passed": true, + "duration_s": 19.0, + "tier": "progressive", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.1427125, + "duration_ms": 16978, + "input_tokens": 9, + "output_tokens": 634, + "cache_read_tokens": 86410, + "tool_calls": [ + "load_osm_model", + "list_thermal_zones", + "add_baseline_system" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_thermal_zones", + "mcp__openstudio__add_baseline_system" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[view_model_L1]", + "passed": true, + "duration_s": 22.4, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.1103365, + "duration_ms": 20300, + "input_tokens": 8, + "output_tokens": 405, + "cache_read_tokens": 64968, + 
"tool_calls": [ + "load_osm_model", + "view_model" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__view_model" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[view_model_L2]", + "passed": true, + "duration_s": 17.2, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.1122105, + "duration_ms": 15181, + "input_tokens": 8, + "output_tokens": 371, + "cache_read_tokens": 64516, + "tool_calls": [ + "load_osm_model", + "view_model" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__view_model" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[view_model_L3]", + "passed": true, + "duration_s": 18.7, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.110064, + "duration_ms": 16584, + "input_tokens": 8, + "output_tokens": 391, + "cache_read_tokens": 64998, + "tool_calls": [ + "load_osm_model", + "view_model" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__view_model" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[set_weather_L1]", + "passed": true, + "duration_s": 32.3, + "tier": "progressive", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.19939674999999998, + "duration_ms": 30317, + "input_tokens": 12, + "output_tokens": 864, + "cache_read_tokens": 111536, + "tool_calls": [ + "load_osm_model", + "list_weather_files", + "change_building_location" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__list_weather_files", + "mcp__openstudio__change_building_location" + ], + "toolsearch_count": 2, 
+ "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[set_weather_L2]", + "passed": true, + "duration_s": 47.6, + "tier": "progressive", + "attempt": 1, + "num_turns": 8, + "cost_usd": 0.23362475000000002, + "duration_ms": 45568, + "input_tokens": 14, + "output_tokens": 977, + "cache_read_tokens": 160272, + "tool_calls": [ + "load_osm_model", + "change_building_location", + "list_weather_files", + "change_building_location", + "change_building_location" + ], + "num_tool_calls": 5, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__change_building_location", + "ToolSearch", + "mcp__openstudio__list_weather_files", + "mcp__openstudio__change_building_location", + "mcp__openstudio__change_building_location" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[set_weather_L3]", + "passed": true, + "duration_s": 34.6, + "tier": "progressive", + "attempt": 1, + "num_turns": 7, + "cost_usd": 0.20967625, + "duration_ms": 32472, + "input_tokens": 13, + "output_tokens": 831, + "cache_read_tokens": 133035, + "tool_calls": [ + "load_osm_model", + "change_building_location", + "list_weather_files", + "change_building_location" + ], + "num_tool_calls": 4, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__change_building_location", + "ToolSearch", + "mcp__openstudio__list_weather_files", + "mcp__openstudio__change_building_location" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[run_qaqc_L1]", + "passed": true, + "duration_s": 16.7, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.1125295, + "duration_ms": 14624, + "input_tokens": 8, + "output_tokens": 399, + "cache_read_tokens": 65679, + "tool_calls": [ + "load_osm_model", + "validate_model" + ], + "num_tool_calls": 2, + 
"all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__validate_model" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[run_qaqc_L2]", + "passed": true, + "duration_s": 19.7, + "tier": "progressive", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.12068400000000001, + "duration_ms": 17619, + "input_tokens": 10, + "output_tokens": 550, + "cache_read_tokens": 65293, + "tool_calls": [ + "load_osm_model", + "validate_model" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__validate_model" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[run_qaqc_L3]", + "passed": true, + "duration_s": 17.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.1317815, + "duration_ms": 15025, + "input_tokens": 11, + "output_tokens": 584, + "cache_read_tokens": 85678, + "tool_calls": [ + "load_osm_model", + "inspect_osm_summary", + "validate_model" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__inspect_osm_summary", + "ToolSearch", + "mcp__openstudio__validate_model" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[create_building_L1]", + "passed": true, + "duration_s": 120.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 0, + "cost_usd": 0.0, + "duration_ms": 0, + "input_tokens": 0, + "output_tokens": 0, + "cache_read_tokens": 0, + "tool_calls": [ + "create_new_building", + "create_new_building", + "list_weather_files", + "create_new_building", + "create_bar_building", + "create_example_osm", + "create_bar_building" + ], + "num_tool_calls": 7, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__create_new_building", + 
"mcp__openstudio__create_new_building", + "ToolSearch", + "mcp__openstudio__list_weather_files", + "mcp__openstudio__create_new_building", + "ToolSearch", + "mcp__openstudio__create_bar_building", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__create_example_osm", + "mcp__openstudio__create_bar_building" + ], + "toolsearch_count": 5, + "is_timeout": true + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[create_building_L2]", + "passed": true, + "duration_s": 120.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 0, + "cost_usd": 0.0, + "duration_ms": 0, + "input_tokens": 0, + "output_tokens": 0, + "cache_read_tokens": 0, + "tool_calls": [ + "create_new_building", + "create_new_building", + "list_weather_files", + "create_new_building", + "create_new_building", + "create_bar_building", + "create_example_osm", + "create_bar_building" + ], + "num_tool_calls": 8, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__create_new_building", + "mcp__openstudio__create_new_building", + "ToolSearch", + "mcp__openstudio__list_weather_files", + "mcp__openstudio__create_new_building", + "mcp__openstudio__create_new_building", + "ToolSearch", + "mcp__openstudio__create_bar_building", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__create_example_osm", + "mcp__openstudio__create_bar_building" + ], + "toolsearch_count": 5, + "is_timeout": true + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[create_building_L3]", + "passed": true, + "duration_s": 15.0, + "tier": "progressive", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.11139475000000001, + "duration_ms": 12993, + "input_tokens": 7, + "output_tokens": 372, + "cache_read_tokens": 46407, + "tool_calls": [ + "create_bar_building" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__create_bar_building" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": 
"tests/llm/test_06_progressive.py::test_progressive[add_pv_L1]", + "passed": true, + "duration_s": 22.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.1135575, + "duration_ms": 19987, + "input_tokens": 8, + "output_tokens": 451, + "cache_read_tokens": 65160, + "tool_calls": [ + "load_osm_model", + "add_rooftop_pv" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__add_rooftop_pv" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[add_pv_L2]", + "passed": true, + "duration_s": 18.4, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.11430325, + "duration_ms": 16101, + "input_tokens": 8, + "output_tokens": 368, + "cache_read_tokens": 64614, + "tool_calls": [ + "load_osm_model", + "add_rooftop_pv" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__add_rooftop_pv" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[add_pv_L3]", + "passed": true, + "duration_s": 18.2, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.11166025000000002, + "duration_ms": 15953, + "input_tokens": 8, + "output_tokens": 385, + "cache_read_tokens": 65203, + "tool_calls": [ + "load_osm_model", + "add_rooftop_pv" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__add_rooftop_pv" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[thermostat_L1]", + "passed": true, + "duration_s": 14.8, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.11204025, + "duration_ms": 12831, + "input_tokens": 8, + "output_tokens": 359, + "cache_read_tokens": 65163, + "tool_calls": [ + 
"load_osm_model", + "adjust_thermostat_setpoints" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__adjust_thermostat_setpoints" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[thermostat_L2]", + "passed": true, + "duration_s": 18.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.1153225, + "duration_ms": 16124, + "input_tokens": 8, + "output_tokens": 364, + "cache_read_tokens": 64615, + "tool_calls": [ + "load_osm_model", + "adjust_thermostat_setpoints" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__adjust_thermostat_setpoints" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[thermostat_L3]", + "passed": true, + "duration_s": 14.9, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.11539275000000002, + "duration_ms": 12759, + "input_tokens": 8, + "output_tokens": 368, + "cache_read_tokens": 64643, + "tool_calls": [ + "load_osm_model", + "adjust_thermostat_setpoints" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__adjust_thermostat_setpoints" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[list_spaces_L1]", + "passed": true, + "duration_s": 21.2, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.11688825, + "duration_ms": 19148, + "input_tokens": 8, + "output_tokens": 444, + "cache_read_tokens": 65209, + "tool_calls": [ + "load_osm_model", + "list_spaces" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_spaces" + ], + "toolsearch_count": 1, + "is_timeout": false + 
}, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[list_spaces_L2]", + "passed": true, + "duration_s": 16.7, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.11977299999999999, + "duration_ms": 14365, + "input_tokens": 8, + "output_tokens": 605, + "cache_read_tokens": 65341, + "tool_calls": [ + "load_osm_model", + "list_spaces" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_spaces" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[list_spaces_L3]", + "passed": true, + "duration_s": 18.7, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.17627025, + "duration_ms": 16755, + "input_tokens": 8, + "output_tokens": 584, + "cache_read_tokens": 55423, + "tool_calls": [ + "load_osm_model", + "list_spaces" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_spaces" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[schedules_L1]", + "passed": true, + "duration_s": 19.8, + "tier": "progressive", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.18704849999999998, + "duration_ms": 17837, + "input_tokens": 9, + "output_tokens": 616, + "cache_read_tokens": 75432, + "tool_calls": [ + "load_osm_model", + "list_model_objects", + "list_model_objects", + "list_model_objects" + ], + "num_tool_calls": 4, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__list_model_objects" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[schedules_L2]", + "passed": true, + "duration_s": 16.3, + "tier": "progressive", + "attempt": 
1, + "num_turns": 4, + "cost_usd": 0.11269499999999999, + "duration_ms": 14323, + "input_tokens": 8, + "output_tokens": 389, + "cache_read_tokens": 65610, + "tool_calls": [ + "load_osm_model", + "list_model_objects" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_model_objects" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[schedules_L3]", + "passed": true, + "duration_s": 21.0, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.11303150000000001, + "duration_ms": 18999, + "input_tokens": 8, + "output_tokens": 397, + "cache_read_tokens": 65658, + "tool_calls": [ + "load_osm_model", + "list_model_objects" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_model_objects" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[inspect_component_L1]", + "passed": true, + "duration_s": 24.5, + "tier": "progressive", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.13586225000000002, + "duration_ms": 22510, + "input_tokens": 9, + "output_tokens": 575, + "cache_read_tokens": 86272, + "tool_calls": [ + "load_osm_model", + "list_plant_loops", + "get_component_properties" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_plant_loops", + "mcp__openstudio__get_component_properties" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[inspect_component_L2]", + "passed": true, + "duration_s": 18.9, + "tier": "progressive", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.12637675, + "duration_ms": 16978, + "input_tokens": 9, + "output_tokens": 476, + "cache_read_tokens": 85626, + 
"tool_calls": [ + "load_osm_model", + "list_model_objects", + "get_component_properties" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__get_component_properties" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[inspect_component_L3]", + "passed": true, + "duration_s": 32.6, + "tier": "progressive", + "attempt": 1, + "num_turns": 7, + "cost_usd": 0.16653925, + "duration_ms": 30578, + "input_tokens": 13, + "output_tokens": 821, + "cache_read_tokens": 124286, + "tool_calls": [ + "load_osm_model", + "get_object_fields", + "list_model_objects", + "get_object_fields" + ], + "num_tool_calls": 4, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__get_object_fields", + "ToolSearch", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__get_object_fields" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[modify_component_L1]", + "passed": true, + "duration_s": 20.8, + "tier": "progressive", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.14332725, + "duration_ms": 18614, + "input_tokens": 10, + "output_tokens": 556, + "cache_read_tokens": 105992, + "tool_calls": [ + "load_osm_model", + "list_model_objects", + "get_component_properties", + "set_component_properties" + ], + "num_tool_calls": 4, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__get_component_properties", + "mcp__openstudio__set_component_properties" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[modify_component_L2]", + "passed": true, + "duration_s": 14.4, + "tier": "progressive", + "attempt": 1, + "num_turns": 5, + "cost_usd": 
0.13170875, + "duration_ms": 12278, + "input_tokens": 9, + "output_tokens": 430, + "cache_read_tokens": 84665, + "tool_calls": [ + "load_osm_model", + "list_model_objects", + "set_component_properties" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__set_component_properties" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[modify_component_L3]", + "passed": true, + "duration_s": 13.9, + "tier": "progressive", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.1855645, + "duration_ms": 11668, + "input_tokens": 9, + "output_tokens": 481, + "cache_read_tokens": 76589, + "tool_calls": [ + "load_osm_model", + "list_model_objects", + "set_object_property" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__set_object_property" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[list_dynamic_type_L1]", + "passed": true, + "duration_s": 36.6, + "tier": "progressive", + "attempt": 1, + "num_turns": 10, + "cost_usd": 0.17162049999999998, + "duration_ms": 34589, + "input_tokens": 12, + "output_tokens": 1291, + "cache_read_tokens": 106321, + "tool_calls": [ + "load_osm_model", + "get_simulation_control", + "list_air_loops", + "list_thermal_zones", + "get_sizing_system_properties", + "get_sizing_zone_properties", + "get_sizing_zone_properties" + ], + "num_tool_calls": 7, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__get_simulation_control", + "ToolSearch", + "mcp__openstudio__list_air_loops", + "mcp__openstudio__list_thermal_zones", + "mcp__openstudio__get_sizing_system_properties", + "mcp__openstudio__get_sizing_zone_properties", + 
"mcp__openstudio__get_sizing_zone_properties" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[list_dynamic_type_L2]", + "passed": true, + "duration_s": 14.0, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.1107445, + "duration_ms": 11890, + "input_tokens": 8, + "output_tokens": 360, + "cache_read_tokens": 65584, + "tool_calls": [ + "load_osm_model", + "list_model_objects" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_model_objects" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[list_dynamic_type_L3]", + "passed": true, + "duration_s": 16.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.11193424999999999, + "duration_ms": 14133, + "input_tokens": 8, + "output_tokens": 393, + "cache_read_tokens": 65676, + "tool_calls": [ + "load_osm_model", + "list_model_objects" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_model_objects" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[floor_area_L1]", + "passed": true, + "duration_s": 20.7, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.11259274999999999, + "duration_ms": 18656, + "input_tokens": 8, + "output_tokens": 355, + "cache_read_tokens": 64468, + "tool_calls": [ + "load_osm_model", + "get_building_info" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__get_building_info" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[floor_area_L2]", + "passed": true, + "duration_s": 16.8, + "tier": "progressive", + 
"attempt": 1, + "num_turns": 5, + "cost_usd": 0.12374099999999999, + "duration_ms": 14758, + "input_tokens": 11, + "output_tokens": 333, + "cache_read_tokens": 83122, + "tool_calls": [ + "load_osm_model", + "get_building_info" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__get_building_info" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[floor_area_L3]", + "passed": true, + "duration_s": 16.3, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.11007974999999999, + "duration_ms": 14228, + "input_tokens": 8, + "output_tokens": 347, + "cache_read_tokens": 64917, + "tool_calls": [ + "load_osm_model", + "get_building_info" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__get_building_info" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[materials_L1]", + "passed": true, + "duration_s": 27.6, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.12215625000000001, + "duration_ms": 25544, + "input_tokens": 8, + "output_tokens": 595, + "cache_read_tokens": 64920, + "tool_calls": [ + "load_osm_model", + "list_materials" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_materials" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[materials_L2]", + "passed": true, + "duration_s": 18.2, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.12735749999999998, + "duration_ms": 16274, + "input_tokens": 8, + "output_tokens": 838, + "cache_read_tokens": 65110, + "tool_calls": [ + "load_osm_model", + "list_materials" + ], + "num_tool_calls": 2, + 
"all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_materials" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[materials_L3]", + "passed": true, + "duration_s": 17.2, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.1283505, + "duration_ms": 14931, + "input_tokens": 8, + "output_tokens": 771, + "cache_read_tokens": 64546, + "tool_calls": [ + "load_osm_model", + "list_materials" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_materials" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[thermal_zones_L1]", + "passed": true, + "duration_s": 14.9, + "tier": "progressive", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.117674, + "duration_ms": 12793, + "input_tokens": 10, + "output_tokens": 398, + "cache_read_tokens": 64498, + "tool_calls": [ + "load_osm_model", + "list_thermal_zones" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__list_thermal_zones" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[thermal_zones_L2]", + "passed": true, + "duration_s": 14.4, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.116054, + "duration_ms": 12216, + "input_tokens": 8, + "output_tokens": 463, + "cache_read_tokens": 64978, + "tool_calls": [ + "load_osm_model", + "list_thermal_zones" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_thermal_zones" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[thermal_zones_L3]", + "passed": true, + 
"duration_s": 20.7, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.11521374999999999, + "duration_ms": 18553, + "input_tokens": 8, + "output_tokens": 467, + "cache_read_tokens": 65160, + "tool_calls": [ + "load_osm_model", + "list_thermal_zones" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_thermal_zones" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[subsurfaces_L1]", + "passed": true, + "duration_s": 14.9, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.10964700000000001, + "duration_ms": 12901, + "input_tokens": 8, + "output_tokens": 355, + "cache_read_tokens": 65414, + "tool_calls": [ + "load_osm_model", + "list_subsurfaces" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_subsurfaces" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[subsurfaces_L2]", + "passed": true, + "duration_s": 14.9, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.11474225, + "duration_ms": 12838, + "input_tokens": 8, + "output_tokens": 362, + "cache_read_tokens": 64567, + "tool_calls": [ + "load_osm_model", + "list_subsurfaces" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_subsurfaces" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[subsurfaces_L3]", + "passed": true, + "duration_s": 15.0, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.16969125000000002, + "duration_ms": 12935, + "input_tokens": 8, + "output_tokens": 330, + "cache_read_tokens": 54790, + "tool_calls": [ + "load_osm_model", + "list_subsurfaces" + 
], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_subsurfaces" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[surface_details_L1]", + "passed": true, + "duration_s": 24.5, + "tier": "progressive", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.1396425, + "duration_ms": 22402, + "input_tokens": 11, + "output_tokens": 688, + "cache_read_tokens": 83825, + "tool_calls": [ + "load_osm_model", + "list_surfaces" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__list_surfaces" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[surface_details_L2]", + "passed": true, + "duration_s": 34.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.13446625, + "duration_ms": 32133, + "input_tokens": 9, + "output_tokens": 599, + "cache_read_tokens": 84630, + "tool_calls": [ + "load_osm_model", + "list_surfaces", + "get_surface_details" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_surfaces", + "mcp__openstudio__get_surface_details" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[surface_details_L3]", + "passed": true, + "duration_s": 26.5, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.1923845, + "duration_ms": 24477, + "input_tokens": 8, + "output_tokens": 668, + "cache_read_tokens": 64764, + "tool_calls": [ + "load_osm_model", + "list_surfaces" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_surfaces" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": 
"tests/llm/test_06_progressive.py::test_progressive[run_simulation_L1]", + "passed": true, + "duration_s": 181.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 10, + "cost_usd": 0.199637, + "duration_ms": 179017, + "input_tokens": 18, + "output_tokens": 1041, + "cache_read_tokens": 185619, + "tool_calls": [ + "load_osm_model", + "get_weather_info", + "run_simulation", + "get_run_status", + "extract_summary_metrics" + ], + "num_tool_calls": 5, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__get_weather_info", + "mcp__openstudio__run_simulation", + "ToolSearch", + "Bash", + "mcp__openstudio__get_run_status", + "mcp__openstudio__extract_summary_metrics" + ], + "toolsearch_count": 3, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[run_simulation_L2]", + "passed": true, + "duration_s": 149.2, + "tier": "progressive", + "attempt": 1, + "num_turns": 7, + "cost_usd": 0.15736, + "duration_ms": 146756, + "input_tokens": 13, + "output_tokens": 738, + "cache_read_tokens": 123640, + "tool_calls": [ + "load_osm_model", + "run_simulation", + "get_run_status" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__run_simulation", + "ToolSearch", + "Bash", + "mcp__openstudio__get_run_status" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[run_simulation_L3]", + "passed": true, + "duration_s": 149.9, + "tier": "progressive", + "attempt": 1, + "num_turns": 7, + "cost_usd": 0.15374799999999997, + "duration_ms": 147287, + "input_tokens": 13, + "output_tokens": 696, + "cache_read_tokens": 124016, + "tool_calls": [ + "load_osm_model", + "run_simulation", + "get_run_status" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__run_simulation", + "ToolSearch", 
+ "Bash", + "mcp__openstudio__get_run_status" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[get_eui_L1]", + "passed": true, + "duration_s": 20.9, + "tier": "progressive", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.131038, + "duration_ms": 18104, + "input_tokens": 11, + "output_tokens": 597, + "cache_read_tokens": 84041, + "tool_calls": [ + "extract_summary_metrics", + "extract_end_use_breakdown", + "get_run_status" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__extract_summary_metrics", + "ToolSearch", + "mcp__openstudio__extract_end_use_breakdown", + "mcp__openstudio__get_run_status" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[get_eui_L2]", + "passed": true, + "duration_s": 28.2, + "tier": "progressive", + "attempt": 1, + "num_turns": 8, + "cost_usd": 0.15925625, + "duration_ms": 22353, + "input_tokens": 15, + "output_tokens": 760, + "cache_read_tokens": 123200, + "tool_calls": [ + "extract_summary_metrics", + "extract_end_use_breakdown", + "get_run_status", + "extract_simulation_errors" + ], + "num_tool_calls": 4, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__extract_summary_metrics", + "ToolSearch", + "mcp__openstudio__extract_end_use_breakdown", + "ToolSearch", + "mcp__openstudio__get_run_status", + "mcp__openstudio__extract_simulation_errors" + ], + "toolsearch_count": 3, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[get_eui_L3]", + "passed": true, + "duration_s": 15.7, + "tier": "progressive", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.15201275, + "duration_ms": 13650, + "input_tokens": 7, + "output_tokens": 251, + "cache_read_tokens": 35818, + "tool_calls": [ + "extract_summary_metrics" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + 
"mcp__openstudio__extract_summary_metrics" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[end_use_breakdown_L1]", + "passed": true, + "duration_s": 33.2, + "tier": "progressive", + "attempt": 1, + "num_turns": 11, + "cost_usd": 0.2254745, + "duration_ms": 31252, + "input_tokens": 20, + "output_tokens": 1394, + "cache_read_tokens": 191549, + "tool_calls": [ + "extract_end_use_breakdown", + "extract_summary_metrics", + "get_run_artifacts", + "query_timeseries", + "query_timeseries", + "extract_simulation_errors" + ], + "num_tool_calls": 6, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__extract_end_use_breakdown", + "ToolSearch", + "mcp__openstudio__extract_summary_metrics", + "ToolSearch", + "mcp__openstudio__get_run_artifacts", + "mcp__openstudio__query_timeseries", + "mcp__openstudio__query_timeseries", + "ToolSearch", + "mcp__openstudio__extract_simulation_errors" + ], + "toolsearch_count": 4, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[end_use_breakdown_L2]", + "passed": true, + "duration_s": 27.5, + "tier": "progressive", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.1298555, + "duration_ms": 25486, + "input_tokens": 11, + "output_tokens": 643, + "cache_read_tokens": 83876, + "tool_calls": [ + "extract_end_use_breakdown", + "get_run_status", + "extract_summary_metrics" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__extract_end_use_breakdown", + "ToolSearch", + "mcp__openstudio__get_run_status", + "mcp__openstudio__extract_summary_metrics" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[end_use_breakdown_L3]", + "passed": true, + "duration_s": 13.0, + "tier": "progressive", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.093364, + "duration_ms": 10537, + "input_tokens": 7, + "output_tokens": 241, + 
"cache_read_tokens": 45683, + "tool_calls": [ + "extract_end_use_breakdown" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__extract_end_use_breakdown" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[hvac_sizing_L1]", + "passed": true, + "duration_s": 24.0, + "tier": "progressive", + "attempt": 1, + "num_turns": 8, + "cost_usd": 0.144973, + "duration_ms": 21913, + "input_tokens": 11, + "output_tokens": 908, + "cache_read_tokens": 84586, + "tool_calls": [ + "extract_hvac_sizing", + "extract_component_sizing", + "extract_simulation_errors", + "extract_summary_metrics" + ], + "num_tool_calls": 4, + "all_tool_calls": [ + "ToolSearch", + "ToolSearch", + "mcp__openstudio__extract_hvac_sizing", + "mcp__openstudio__extract_component_sizing", + "ToolSearch", + "mcp__openstudio__extract_simulation_errors", + "mcp__openstudio__extract_summary_metrics" + ], + "toolsearch_count": 3, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[hvac_sizing_L2]", + "passed": true, + "duration_s": 20.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.12572524999999998, + "duration_ms": 18020, + "input_tokens": 11, + "output_tokens": 622, + "cache_read_tokens": 83828, + "tool_calls": [ + "extract_hvac_sizing", + "extract_component_sizing" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__extract_hvac_sizing", + "ToolSearch", + "mcp__openstudio__extract_component_sizing" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[hvac_sizing_L3]", + "passed": true, + "duration_s": 13.5, + "tier": "progressive", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.09654650000000001, + "duration_ms": 11364, + "input_tokens": 7, + "output_tokens": 332, + "cache_read_tokens": 45423, + "tool_calls": [ + 
"extract_hvac_sizing" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__extract_hvac_sizing" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[set_wwr_L1]", + "passed": true, + "duration_s": 33.2, + "tier": "progressive", + "attempt": 1, + "num_turns": 13, + "cost_usd": 0.17280675, + "duration_ms": 30845, + "input_tokens": 12, + "output_tokens": 1187, + "cache_read_tokens": 104506, + "tool_calls": [ + "load_osm_model", + "list_surfaces", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio" + ], + "num_tool_calls": 10, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__list_surfaces", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[set_wwr_L2]", + "passed": true, + "duration_s": 28.3, + "tier": "progressive", + "attempt": 1, + "num_turns": 13, + "cost_usd": 0.17208675, + "duration_ms": 26328, + "input_tokens": 12, + "output_tokens": 1260, + "cache_read_tokens": 105141, + "tool_calls": [ + "load_osm_model", + "list_surfaces", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio" + ], + 
"num_tool_calls": 10, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__list_surfaces", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[set_wwr_L3]", + "passed": true, + "duration_s": 32.6, + "tier": "progressive", + "attempt": 1, + "num_turns": 13, + "cost_usd": 0.17093775, + "duration_ms": 30471, + "input_tokens": 12, + "output_tokens": 1205, + "cache_read_tokens": 105168, + "tool_calls": [ + "load_osm_model", + "list_surfaces", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio" + ], + "num_tool_calls": 10, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__list_surfaces", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[replace_windows_L1]", + "passed": true, + "duration_s": 112.2, + "tier": "progressive", + "attempt": 1, + "num_turns": 26, + "cost_usd": 0.54060125, + 
"duration_ms": 110172, + "input_tokens": 31, + "output_tokens": 4538, + "cache_read_tokens": 467380, + "tool_calls": [ + "load_osm_model", + "list_materials", + "list_subsurfaces", + "list_surfaces", + "list_model_objects", + "list_surfaces", + "search_api", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "create_measure", + "apply_measure", + "apply_measure", + "get_construction_details" + ], + "num_tool_calls": 19, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_materials", + "mcp__openstudio__list_subsurfaces", + "ToolSearch", + "mcp__openstudio__list_surfaces", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__list_surfaces", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__search_api", + "ToolSearch", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__create_measure", + "mcp__openstudio__apply_measure", + "ToolSearch", + "mcp__openstudio__apply_measure", + "mcp__openstudio__get_construction_details" + ], + "toolsearch_count": 6, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[replace_windows_L2]", + "passed": true, + "duration_s": 99.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 30, + "cost_usd": 0.50204025, + "duration_ms": 97071, + "input_tokens": 32, + "output_tokens": 4105, + "cache_read_tokens": 440748, + "tool_calls": [ + "load_osm_model", + "list_subsurfaces", + "list_model_objects", + 
"list_subsurfaces", + "get_construction_details", + "get_construction_details", + "list_materials", + "list_subsurfaces", + "list_surfaces", + "list_surfaces", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "list_materials", + "search_api", + "create_measure", + "apply_measure", + "replace_window_constructions" + ], + "num_tool_calls": 23, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__list_subsurfaces", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__list_subsurfaces", + "mcp__openstudio__get_construction_details", + "mcp__openstudio__get_construction_details", + "mcp__openstudio__list_materials", + "mcp__openstudio__list_subsurfaces", + "ToolSearch", + "mcp__openstudio__list_surfaces", + "mcp__openstudio__list_surfaces", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "ToolSearch", + "mcp__openstudio__list_materials", + "ToolSearch", + "mcp__openstudio__search_api", + "mcp__openstudio__create_measure", + "ToolSearch", + "mcp__openstudio__apply_measure", + "mcp__openstudio__replace_window_constructions" + ], + "toolsearch_count": 6, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[replace_windows_L3]", + "passed": true, + "duration_s": 44.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 20, + "cost_usd": 0.25963375, + "duration_ms": 41715, + "input_tokens": 23, + "output_tokens": 1943, + "cache_read_tokens": 215425, + 
"tool_calls": [ + "load_osm_model", + "list_subsurfaces", + "list_subsurfaces", + "list_model_objects", + "list_surfaces", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "replace_window_constructions" + ], + "num_tool_calls": 14, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__list_subsurfaces", + "mcp__openstudio__list_subsurfaces", + "ToolSearch", + "mcp__openstudio__list_model_objects", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__list_surfaces", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__replace_window_constructions" + ], + "toolsearch_count": 5, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[construction_details_L1]", + "passed": true, + "duration_s": 21.5, + "tier": "progressive", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.14312424999999998, + "duration_ms": 19113, + "input_tokens": 12, + "output_tokens": 631, + "cache_read_tokens": 103841, + "tool_calls": [ + "load_osm_model", + "list_surfaces", + "get_construction_details" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__list_surfaces", + "mcp__openstudio__get_construction_details" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[construction_details_L2]", + "passed": true, + 
"duration_s": 21.0, + "tier": "progressive", + "attempt": 1, + "num_turns": 7, + "cost_usd": 0.138435, + "duration_ms": 18988, + "input_tokens": 9, + "output_tokens": 801, + "cache_read_tokens": 85930, + "tool_calls": [ + "load_osm_model", + "list_model_objects", + "get_construction_details", + "get_construction_details" + ], + "num_tool_calls": 4, + "all_tool_calls": [ + "ToolSearch", + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__get_construction_details", + "mcp__openstudio__get_construction_details" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[construction_details_L3]", + "passed": true, + "duration_s": 24.9, + "tier": "progressive", + "attempt": 1, + "num_turns": 8, + "cost_usd": 0.15538975, + "duration_ms": 22502, + "input_tokens": 12, + "output_tokens": 895, + "cache_read_tokens": 104922, + "tool_calls": [ + "load_osm_model", + "list_model_objects", + "get_construction_details", + "get_construction_details", + "get_construction_details" + ], + "num_tool_calls": 5, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__get_construction_details", + "mcp__openstudio__get_construction_details", + "mcp__openstudio__get_construction_details" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[check_loads_L1]", + "passed": true, + "duration_s": 17.3, + "tier": "progressive", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.13123175, + "duration_ms": 15276, + "input_tokens": 9, + "output_tokens": 439, + "cache_read_tokens": 84136, + "tool_calls": [ + "load_osm_model", + "list_spaces", + "get_space_details" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_spaces", + 
"mcp__openstudio__get_space_details" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[check_loads_L2]", + "passed": true, + "duration_s": 29.7, + "tier": "progressive", + "attempt": 1, + "num_turns": 8, + "cost_usd": 0.16840149999999998, + "duration_ms": 27653, + "input_tokens": 13, + "output_tokens": 889, + "cache_read_tokens": 127098, + "tool_calls": [ + "load_osm_model", + "list_spaces", + "get_space_type_details", + "get_load_details", + "get_load_details" + ], + "num_tool_calls": 5, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_spaces", + "ToolSearch", + "mcp__openstudio__get_space_type_details", + "mcp__openstudio__get_load_details", + "mcp__openstudio__get_load_details" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[check_loads_L3]", + "passed": true, + "duration_s": 25.0, + "tier": "progressive", + "attempt": 1, + "num_turns": 13, + "cost_usd": 0.16143000000000002, + "duration_ms": 22624, + "input_tokens": 12, + "output_tokens": 1131, + "cache_read_tokens": 105290, + "tool_calls": [ + "load_osm_model", + "list_model_objects", + "list_model_objects", + "list_model_objects", + "list_model_objects", + "list_model_objects", + "get_load_details", + "get_load_details", + "get_load_details", + "get_load_details" + ], + "num_tool_calls": 10, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__get_load_details", + "mcp__openstudio__get_load_details", + "mcp__openstudio__get_load_details", + "mcp__openstudio__get_load_details" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": 
"tests/llm/test_06_progressive.py::test_progressive[create_loads_L1]", + "passed": true, + "duration_s": 47.5, + "tier": "progressive", + "attempt": 1, + "num_turns": 25, + "cost_usd": 0.31937475, + "duration_ms": 45414, + "input_tokens": 12, + "output_tokens": 2834, + "cache_read_tokens": 95342, + "tool_calls": [ + "load_osm_model", + "list_spaces", + "create_people_definition", + "create_people_definition", + "create_people_definition", + "create_people_definition", + "create_people_definition", + "create_people_definition", + "create_people_definition", + "create_people_definition", + "create_people_definition", + "create_people_definition", + "create_lights_definition", + "create_lights_definition", + "create_lights_definition", + "create_lights_definition", + "create_lights_definition", + "create_lights_definition", + "create_lights_definition", + "create_lights_definition", + "create_lights_definition", + "create_lights_definition" + ], + "num_tool_calls": 22, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_spaces", + "ToolSearch", + "mcp__openstudio__create_people_definition", + "mcp__openstudio__create_people_definition", + "mcp__openstudio__create_people_definition", + "mcp__openstudio__create_people_definition", + "mcp__openstudio__create_people_definition", + "mcp__openstudio__create_people_definition", + "mcp__openstudio__create_people_definition", + "mcp__openstudio__create_people_definition", + "mcp__openstudio__create_people_definition", + "mcp__openstudio__create_people_definition", + "mcp__openstudio__create_lights_definition", + "mcp__openstudio__create_lights_definition", + "mcp__openstudio__create_lights_definition", + "mcp__openstudio__create_lights_definition", + "mcp__openstudio__create_lights_definition", + "mcp__openstudio__create_lights_definition", + "mcp__openstudio__create_lights_definition", + "mcp__openstudio__create_lights_definition", + "mcp__openstudio__create_lights_definition", 
+ "mcp__openstudio__create_lights_definition" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[create_loads_L2]", + "passed": true, + "duration_s": 43.0, + "tier": "progressive", + "attempt": 1, + "num_turns": 25, + "cost_usd": 0.35772724999999994, + "duration_ms": 40997, + "input_tokens": 12, + "output_tokens": 2688, + "cache_read_tokens": 84197, + "tool_calls": [ + "load_osm_model", + "list_spaces", + "create_people_definition", + "create_people_definition", + "create_people_definition", + "create_people_definition", + "create_people_definition", + "create_people_definition", + "create_people_definition", + "create_people_definition", + "create_people_definition", + "create_people_definition", + "create_lights_definition", + "create_lights_definition", + "create_lights_definition", + "create_lights_definition", + "create_lights_definition", + "create_lights_definition", + "create_lights_definition", + "create_lights_definition", + "create_lights_definition", + "create_lights_definition" + ], + "num_tool_calls": 22, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__list_spaces", + "mcp__openstudio__create_people_definition", + "mcp__openstudio__create_people_definition", + "mcp__openstudio__create_people_definition", + "mcp__openstudio__create_people_definition", + "mcp__openstudio__create_people_definition", + "mcp__openstudio__create_people_definition", + "mcp__openstudio__create_people_definition", + "mcp__openstudio__create_people_definition", + "mcp__openstudio__create_people_definition", + "mcp__openstudio__create_people_definition", + "mcp__openstudio__create_lights_definition", + "mcp__openstudio__create_lights_definition", + "mcp__openstudio__create_lights_definition", + "mcp__openstudio__create_lights_definition", + "mcp__openstudio__create_lights_definition", + "mcp__openstudio__create_lights_definition", + 
"mcp__openstudio__create_lights_definition", + "mcp__openstudio__create_lights_definition", + "mcp__openstudio__create_lights_definition", + "mcp__openstudio__create_lights_definition" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[create_loads_L3]", + "passed": true, + "duration_s": 18.8, + "tier": "progressive", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.19732375000000002, + "duration_ms": 16487, + "input_tokens": 12, + "output_tokens": 604, + "cache_read_tokens": 95065, + "tool_calls": [ + "load_osm_model", + "list_spaces", + "create_people_definition" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__list_spaces", + "mcp__openstudio__create_people_definition" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[create_plant_loop_L1]", + "passed": true, + "duration_s": 15.3, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.11202275, + "duration_ms": 13197, + "input_tokens": 8, + "output_tokens": 410, + "cache_read_tokens": 65528, + "tool_calls": [ + "load_osm_model", + "create_plant_loop" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__create_plant_loop" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[create_plant_loop_L2]", + "passed": true, + "duration_s": 26.9, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.11688350000000002, + "duration_ms": 24836, + "input_tokens": 8, + "output_tokens": 424, + "cache_read_tokens": 64812, + "tool_calls": [ + "load_osm_model", + "create_plant_loop" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + 
"mcp__openstudio__create_plant_loop" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[create_plant_loop_L3]", + "passed": true, + "duration_s": 13.6, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.11534575, + "duration_ms": 11445, + "input_tokens": 8, + "output_tokens": 361, + "cache_read_tokens": 64824, + "tool_calls": [ + "load_osm_model", + "create_plant_loop" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__create_plant_loop" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[schedule_details_L1]", + "passed": true, + "duration_s": 65.6, + "tier": "progressive", + "attempt": 1, + "num_turns": 22, + "cost_usd": 0.4357255, + "duration_ms": 63547, + "input_tokens": 28, + "output_tokens": 2315, + "cache_read_tokens": 338146, + "tool_calls": [ + "load_osm_model", + "list_air_loops", + "list_zone_hvac_equipment", + "list_model_objects", + "list_model_objects", + "list_model_objects", + "get_building_info", + "get_schedule_details", + "get_schedule_details", + "get_schedule_details", + "get_object_fields", + "get_object_fields", + "read_file", + "read_file", + "read_file" + ], + "num_tool_calls": 15, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__list_air_loops", + "ToolSearch", + "mcp__openstudio__list_zone_hvac_equipment", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__get_building_info", + "mcp__openstudio__get_schedule_details", + "mcp__openstudio__get_schedule_details", + "mcp__openstudio__get_schedule_details", + "ToolSearch", + "mcp__openstudio__get_object_fields", + "mcp__openstudio__get_object_fields", + "ToolSearch", + "mcp__openstudio__read_file", + 
"Grep", + "mcp__openstudio__read_file", + "mcp__openstudio__read_file" + ], + "toolsearch_count": 5, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[schedule_details_L2]", + "passed": true, + "duration_s": 37.0, + "tier": "progressive", + "attempt": 1, + "num_turns": 9, + "cost_usd": 0.175, + "duration_ms": 34523, + "input_tokens": 11, + "output_tokens": 1100, + "cache_read_tokens": 125240, + "tool_calls": [ + "load_osm_model", + "list_model_objects", + "list_model_objects", + "list_model_objects", + "list_model_objects", + "get_schedule_details" + ], + "num_tool_calls": 6, + "all_tool_calls": [ + "ToolSearch", + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__get_schedule_details" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[schedule_details_L3]", + "passed": true, + "duration_s": 26.8, + "tier": "progressive", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.14963625000000003, + "duration_ms": 22901, + "input_tokens": 12, + "output_tokens": 700, + "cache_read_tokens": 103940, + "tool_calls": [ + "load_osm_model", + "list_model_objects", + "get_schedule_details" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__get_schedule_details" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[space_type_info_L1]", + "passed": true, + "duration_s": 27.0, + "tier": "progressive", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.15729075, + "duration_ms": 24883, + "input_tokens": 12, + "output_tokens": 658, + "cache_read_tokens": 105324, + "tool_calls": [ + 
"load_osm_model", + "list_spaces", + "get_space_type_details" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__list_spaces", + "mcp__openstudio__get_space_type_details" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[space_type_info_L2]", + "passed": true, + "duration_s": 22.6, + "tier": "progressive", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.1355295, + "duration_ms": 20621, + "input_tokens": 9, + "output_tokens": 602, + "cache_read_tokens": 84669, + "tool_calls": [ + "load_osm_model", + "list_model_objects", + "get_space_type_details" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__get_space_type_details" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[space_type_info_L3]", + "passed": true, + "duration_s": 22.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.130308, + "duration_ms": 20032, + "input_tokens": 9, + "output_tokens": 561, + "cache_read_tokens": 85401, + "tool_calls": [ + "load_osm_model", + "list_model_objects", + "get_space_type_details" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__get_space_type_details" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[set_run_period_L1]", + "passed": true, + "duration_s": 18.8, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.11061200000000002, + "duration_ms": 16739, + "input_tokens": 8, + "output_tokens": 366, + "cache_read_tokens": 65044, + "tool_calls": [ + "load_osm_model", + "set_run_period" + ], + 
"num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__set_run_period" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[set_run_period_L2]", + "passed": true, + "duration_s": 14.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.11324825000000001, + "duration_ms": 11712, + "input_tokens": 8, + "output_tokens": 371, + "cache_read_tokens": 64654, + "tool_calls": [ + "load_osm_model", + "set_run_period" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__set_run_period" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[set_run_period_L3]", + "passed": true, + "duration_s": 12.4, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.17044175, + "duration_ms": 10322, + "input_tokens": 8, + "output_tokens": 347, + "cache_read_tokens": 54541, + "tool_calls": [ + "load_osm_model", + "set_run_period" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__set_run_period" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[ideal_air_L1]", + "passed": true, + "duration_s": 16.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.1072375, + "duration_ms": 14074, + "input_tokens": 8, + "output_tokens": 311, + "cache_read_tokens": 64945, + "tool_calls": [ + "load_osm_model", + "enable_ideal_air_loads" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__enable_ideal_air_loads" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[ideal_air_L2]", + 
"passed": true, + "duration_s": 18.5, + "tier": "progressive", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.13460275, + "duration_ms": 16469, + "input_tokens": 9, + "output_tokens": 478, + "cache_read_tokens": 84203, + "tool_calls": [ + "load_osm_model", + "enable_ideal_air_loads", + "list_thermal_zones" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__enable_ideal_air_loads", + "mcp__openstudio__list_thermal_zones" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[ideal_air_L3]", + "passed": true, + "duration_s": 15.6, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.16585875, + "duration_ms": 13526, + "input_tokens": 8, + "output_tokens": 287, + "cache_read_tokens": 54550, + "tool_calls": [ + "load_osm_model", + "enable_ideal_air_loads" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__enable_ideal_air_loads" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[save_model_L1]", + "passed": true, + "duration_s": 14.0, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.16390849999999998, + "duration_ms": 11938, + "input_tokens": 8, + "output_tokens": 292, + "cache_read_tokens": 55137, + "tool_calls": [ + "load_osm_model", + "save_osm_model" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__save_osm_model" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[save_model_L2]", + "passed": true, + "duration_s": 15.6, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.11027400000000001, + "duration_ms": 13557, + "input_tokens": 8, + "output_tokens": 318, + 
"cache_read_tokens": 64543, + "tool_calls": [ + "load_osm_model", + "save_osm_model" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__save_osm_model" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[save_model_L3]", + "passed": true, + "duration_s": 14.8, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.16733775, + "duration_ms": 12408, + "input_tokens": 8, + "output_tokens": 315, + "cache_read_tokens": 54633, + "tool_calls": [ + "load_osm_model", + "save_osm_model" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__save_osm_model" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[add_ev_L1]", + "passed": true, + "duration_s": 22.2, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.12387125, + "duration_ms": 20217, + "input_tokens": 8, + "output_tokens": 495, + "cache_read_tokens": 64675, + "tool_calls": [ + "load_osm_model", + "add_ev_load" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__add_ev_load" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[add_ev_L2]", + "passed": true, + "duration_s": 20.5, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.123914, + "duration_ms": 17986, + "input_tokens": 8, + "output_tokens": 498, + "cache_read_tokens": 64673, + "tool_calls": [ + "load_osm_model", + "add_ev_load" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__add_ev_load" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": 
"tests/llm/test_06_progressive.py::test_progressive[add_ev_L3]", + "passed": true, + "duration_s": 23.3, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.12089025, + "duration_ms": 21313, + "input_tokens": 8, + "output_tokens": 396, + "cache_read_tokens": 64688, + "tool_calls": [ + "load_osm_model", + "add_ev_load" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__add_ev_load" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[list_measures_L1]", + "passed": true, + "duration_s": 12.8, + "tier": "progressive", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.10011575, + "duration_ms": 10732, + "input_tokens": 7, + "output_tokens": 429, + "cache_read_tokens": 45599, + "tool_calls": [ + "list_custom_measures" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__list_custom_measures" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[list_measures_L2]", + "passed": true, + "duration_s": 15.5, + "tier": "progressive", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.099798, + "duration_ms": 13524, + "input_tokens": 7, + "output_tokens": 416, + "cache_read_tokens": 45601, + "tool_calls": [ + "list_custom_measures" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__list_custom_measures" + ], + "toolsearch_count": 1, + "is_timeout": false + } + ] +} \ No newline at end of file diff --git a/docs/sweeps/opus-2026-03-28/benchmark.md b/docs/sweeps/opus-2026-03-28/benchmark.md new file mode 100644 index 0000000..51da408 --- /dev/null +++ b/docs/sweeps/opus-2026-03-28/benchmark.md @@ -0,0 +1,301 @@ +# LLM Benchmark Report + +**Date:** 2026-03-28T21:44:31+00:00 +**Model:** opus | **Retries:** 0 +**Result:** 170/180 passed (94.4%) in 11078s +**Tokens:** 2.0k in + 
164.4k out + 22.6M cache | **Cost:** $32.2343 (notional API pricing) + +## Summary by Tier + +| Tier | Passed | Rate | Time | Avg | +|--------|---------|--------|--------|--------| +| setup | 6/6 | 100.0% | 512s | 85s | +| tier1 | 4/4 | 100.0% | 135s | 34s | +| tier2 | 34/37 | 91.9% | 5344s | 144s | +| tier3 | 19/26 | 73.1% | 1860s | 72s | +| tier4 | 3/3 | 100.0% | 135s | 45s | +| progressive | 104/104 | 100.0% | 3092s | 30s | + +## Detailed Results + +### setup + +| Test | Result | Time | Turns | Tools | In Tok | Out Tok | Cache | Cost | Att | +|--------------------------------|--------|------|-------|--------------------------------------------------------------------------------------------------------------------------|--------|---------|--------|---------|-----| +| test_create_baseline_model | PASS | 13s | 3 | create_baseline_osm | 7 | 267 | 44.7k | $0.1033 | 1 | +| test_create_baseline_with_hvac | PASS | 15s | 3 | create_baseline_osm | 7 | 325 | 36.1k | $0.1551 | 1 | +| test_create_example_model | PASS | 12s | 3 | create_example_osm | 7 | 203 | 45.4k | $0.0942 | 1 | +| test_load_baseline_model | PASS | 15s | 4 | load_osm_model, list_thermal_zones | 8 | 293 | 64.6k | $0.1139 | 1 | +| test_run_baseline_simulation | PASS | 290s | 12 | load_osm_model, change_building_location, run_simulation, get_run_status, save_osm_model, run_simulation, get_run_status | 18 | 1.3k | 235.3k | $0.2370 | 1 | +| test_run_retrofit_simulation | PASS | 168s | 8 | load_osm_model, change_building_location, adjust_thermostat_setpoints, run_simulation, get_run_status | 12 | 945 | 141.5k | $0.2403 | 1 | + +### tier1 + +| Test | Result | Time | Turns | Tools | In Tok | Out Tok | Cache | Cost | Att | +|-------------------------------------|--------|------|-------|--------------------------------------------------------------------------------------------------------------------------------------------------|--------|---------|-------|---------|-----| +| What is the server status? 
| PASS | 12s | 3 | get_server_status | 7 | 173 | 45.5k | $0.0906 | 1 | +| List available skills | PASS | 14s | 3 | list_skills | 7 | 391 | 45.6k | $0.1001 | 1 | +| Create a small office building usin | PASS | 90s | 0 | create_new_building, create_new_building, list_weather_files, create_new_building, create_new_building, create_new_building, create_bar_building | 0 | 0 | 0 | $0.0000 | 1 | +| Create bar geometry for a retail bu | PASS | 19s | 3 | create_bar_building | 7 | 409 | 46.4k | $0.1106 | 1 | + +### tier2 + +| Test | Result | Time | Turns | Tools | In Tok | Out Tok | Cache | Cost | Att | +|---------------------------------------|--------|------|-------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------|---------|--------|---------|-----| +| systemd_fourpipebeam_e2e | PASS | 300s | 1 | load_osm_model, list_weather_files, change_building_location, list_air_loops, save_osm_model, list_zone_hvac_equipment, list_plant_loops, search_wiring_patterns, search_api, get_skill, run_simulation, create_measure, test_measure, get_run_status, load_osm_model, apply_measure, save_osm_model, run_simulation, get_run_status, compare_runs, copy_file | 3 | 102 | 54.0k | $0.8533 | 1 | +| add_vav_reheat | PASS | 26s | 5 | load_osm_model, list_thermal_zones, add_baseline_system | 9 | 636 | 85.4k | $0.1485 | 1 | +| add_doas | PASS | 27s | 6 | load_osm_model, list_thermal_zones, add_doas_system | 12 | 715 | 104.7k | $0.1597 | 1 | +| add_vrf | PASS | 24s | 6 | load_osm_model, list_thermal_zones, add_vrf_system | 12 | 645 | 104.6k | 
$0.1518 | 1 | +| set_weather | PASS | 20s | 4 | load_osm_model, change_building_location | 8 | 431 | 65.6k | $0.1140 | 1 | +| add_rooftop_pv | PASS | 20s | 4 | load_osm_model, add_rooftop_pv | 8 | 380 | 65.2k | $0.1115 | 1 | +| adjust_thermostat | PASS | 18s | 4 | load_osm_model, adjust_thermostat_setpoints | 8 | 402 | 54.7k | $0.1736 | 1 | +| delete_space | PASS | 15s | 5 | load_osm_model, list_spaces, delete_object | 9 | 437 | 76.1k | $0.1853 | 1 | +| qaqc_check | PASS | 16s | 4 | load_osm_model, run_qaqc_checks | 8 | 460 | 65.5k | $0.1131 | 1 | +| create_bar_office | PASS | 20s | 4 | create_bar_building, list_spaces | 8 | 589 | 68.2k | $0.1401 | 1 | +| create_new_building | PASS | 51s | 3 | create_new_building | 7 | 421 | 46.6k | $0.1051 | 1 | +| bar_then_typical | PASS | 60s | 7 | create_bar_building, change_building_location, create_typical_building | 11 | 910 | 129.7k | $0.2459 | 1 | +| import_floorspacejs | PASS | 23s | 6 | import_floorspacejs, list_files, import_floorspacejs | 12 | 591 | 103.3k | $0.1436 | 1 | +| floorspacejs_to_typical | PASS | 121s | 13 | import_floorspacejs, list_files, import_floorspacejs, change_building_location, create_typical_building | 19 | 2.0k | 266.5k | $0.2786 | 1 | +| manual_geometry_match | PASS | 27s | 7 | create_example_osm, create_space_from_floor_print, create_space_from_floor_print, match_surfaces | 12 | 886 | 111.1k | $0.1610 | 1 | +| envelope_retrofit | PASS | 39s | 14 | load_osm_model, list_surfaces, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, replace_window_constructions | 13 | 1.4k | 118.9k | $0.2490 | 1 | +| create_and_assign_loads | PASS | 34s | 7 | load_osm_model, list_spaces, create_people_definition, create_lights_definition | 12 | 770 | 106.5k | $0.1489 | 1 | +| plant_loop_with_boiler | PASS | 20s | 5 | load_osm_model, create_plant_loop, 
add_supply_equipment | 9 | 570 | 86.2k | $0.1301 | 1 | +| inspect_and_modify_boiler | PASS | 28s | 6 | load_osm_model, list_model_objects, get_object_fields, set_object_property | 10 | 691 | 109.2k | $0.1547 | 1 | +| extract_results_chain | PASS | 17s | 4 | extract_summary_metrics, extract_end_use_breakdown | 7 | 413 | 46.0k | $0.1016 | 1 | +| hvac_chilled_beam_comparison | FAIL | 300s | 0 | load_osm_model, list_air_loops, get_air_loop_details, replace_air_terminals, save_osm_model, run_simulation, get_run_status, list_weather_files, change_building_location, save_osm_model, run_simulation | 0 | 0 | 0 | $0.0000 | 1 | +| create_test_apply_measure | PASS | 27s | 6 | load_osm_model, create_measure, test_measure, apply_measure | 10 | 694 | 109.9k | $0.1525 | 1 | +| measure_set_lights_full_chain | PASS | 506s | 29 | load_osm_model, save_osm_model, run_simulation, get_run_status, list_weather_files, change_building_location, save_osm_model, run_simulation, get_run_status, extract_summary_metrics, load_osm_model, create_measure, test_measure, change_building_location, apply_measure, save_osm_model, run_simulation, get_run_status, extract_summary_metrics, compare_runs | 36 | 4.0k | 748.1k | $0.6607 | 1 | +| measure_set_infiltration_full_chain | PASS | 482s | 30 | load_osm_model, save_osm_model, run_simulation, get_run_status, load_osm_model, get_weather_info, list_weather_files, change_building_location, save_osm_model, run_simulation, get_run_status, extract_summary_metrics, load_osm_model, create_measure, test_measure, change_building_location, apply_measure, save_osm_model, run_simulation, get_run_status, extract_summary_metrics | 39 | 3.7k | 814.7k | $0.6817 | 1 | +| measure_replace_terminals_full_chain | PASS | 544s | 39 | load_osm_model, save_osm_model, run_simulation, get_run_status, get_weather_info, list_weather_files, load_osm_model, change_building_location, save_osm_model, run_simulation, get_run_status, extract_summary_metrics, load_osm_model, list_air_loops, 
list_plant_loops, search_wiring_patterns, search_api, create_measure, test_measure, apply_measure, change_building_location, save_osm_model, run_simulation, get_run_status, extract_summary_metrics, extract_end_use_breakdown, extract_end_use_breakdown | 53 | 6.3k | 1.1M | $0.9729 | 1 | +| create_measure_with_args | PASS | 55s | 3 | create_measure | 7 | 2.9k | 46.4k | $0.1799 | 1 | +| measure_add_baseboards_full_chain | PASS | 512s | 33 | load_osm_model, save_osm_model, run_simulation, get_run_status, get_weather_info, load_osm_model, list_weather_files, change_building_location, save_osm_model, run_simulation, get_run_status, extract_summary_metrics, load_osm_model, change_building_location, list_thermal_zones, create_measure, test_measure, apply_measure, save_osm_model, run_simulation, get_run_status, extract_summary_metrics | 49 | 3.8k | 910.8k | $0.7488 | 1 | +| ruby_measure_reduce_plugloads | PASS | 551s | 36 | load_osm_model, save_osm_model, run_simulation, get_run_status, get_weather_info, list_weather_files, change_building_location, save_osm_model, run_simulation, get_run_status, extract_summary_metrics, load_osm_model, change_building_location, get_skill, create_measure, test_measure, read_file, edit_measure, test_measure, apply_measure, save_osm_model, run_simulation, get_run_status, extract_summary_metrics | 51 | 4.9k | 1.1M | $0.8861 | 1 | +| python_measure_reduce_plugloads | PASS | 429s | 36 | load_osm_model, save_osm_model, run_simulation, get_run_status, get_weather_info, list_weather_files, load_osm_model, change_building_location, save_osm_model, run_simulation, create_measure, test_measure, read_file, edit_measure, test_measure, get_run_status, extract_summary_metrics, load_osm_model, change_building_location, apply_measure, save_osm_model, run_simulation, get_run_status, extract_summary_metrics | 55 | 6.1k | 1.1M | $0.8973 | 1 | +| ruby_measure_boiler_efficiency | PASS | 414s | 36 | load_osm_model, save_osm_model, run_simulation, get_run_status, 
list_weather_files, load_osm_model, change_building_location, save_osm_model, run_simulation, get_run_status, extract_summary_metrics, load_osm_model, change_building_location, create_measure, test_measure, read_file, create_measure, test_measure, read_file, create_measure, test_measure, create_measure, test_measure, apply_measure, save_osm_model, run_simulation, get_run_status | 49 | 7.7k | 1.1M | $0.9815 | 1 | +| python_measure_boiler_efficiency | PASS | 431s | 36 | load_osm_model, save_osm_model, run_simulation, get_run_status, get_weather_info, list_weather_files, load_osm_model, change_building_location, save_osm_model, run_simulation, create_measure, test_measure, read_file, edit_measure, test_measure, get_run_status, extract_summary_metrics, load_osm_model, change_building_location, apply_measure, save_osm_model, run_simulation, get_run_status | 55 | 5.6k | 1.0M | $0.8719 | 1 | +| test_create_measure_with_args_quality | PASS | 45s | 3 | create_measure | 7 | 2.4k | 57.3k | $0.1010 | 1 | +| test_complex_model_multi_query | PASS | 23s | 7 | load_osm_model, get_building_info, list_air_loops, list_plant_loops, list_thermal_zones | 8 | 760 | 66.2k | $0.1311 | 1 | +| Ruby | PASS | 27s | 3 | create_measure | 7 | 1.6k | 46.5k | $0.1389 | 1 | +| Python | PASS | 31s | 3 | create_measure | 7 | 1.5k | 46.5k | $0.1381 | 1 | +| Ruby | FAIL | 28s | 3 | create_measure | 7 | 1.4k | 46.6k | $0.1342 | 1 | +| Python | FAIL | 31s | 3 | create_measure | 7 | 1.4k | 46.6k | $0.1342 | 1 | + +### tier3 + +| Test | Result | Time | Turns | Tools | In Tok | Out Tok | Cache | Cost | Att | 
+|--------------------------------------------------|--------|------|-------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------|---------|--------|---------|-----| +| add-hvac:Add HVAC to the model | PASS | 26s | 7 | load_osm_model, get_building_info, list_thermal_zones, add_baseline_system | 9 | 889 | 86.3k | $0.1624 | 1 | +| add-hvac:Set up heating and cooling | PASS | 28s | 8 | load_osm_model, get_building_info, list_thermal_zones | 13 | 747 | 104.8k | $0.1520 | 1 | +| add-hvac:What HVAC system should I use? 
| PASS | 29s | 8 | load_osm_model, get_building_info, list_thermal_zones | 13 | 914 | 104.8k | $0.1561 | 1 | +| add-hvac:Add a VAV system | PASS | 24s | 6 | load_osm_model, list_thermal_zones, add_baseline_system | 9 | 704 | 86.7k | $0.1453 | 1 | +| energy-report:Give me a full energy report | FAIL | 120s | 0 | load_osm_model, get_building_info, list_files, get_weather_info, run_simulation | 0 | 0 | 0 | $0.0000 | 1 | +| new-building:Create a small office building | PASS | 180s | 0 | create_new_building, create_new_building, list_weather_files, create_new_building, create_new_building, create_new_building, create_bar_building, create_example_osm, create_bar_building, change_building_location, create_baseline_osm, change_building_location | 0 | 0 | 0 | $0.0000 | 1 | +| new-building:Model a 3-story school | PASS | 180s | 0 | list_weather_files, create_new_building, change_building_location, change_building_location, create_typical_building | 0 | 0 | 0 | $0.0000 | 1 | +| new-building:Create a retail building, 25000 sqf | PASS | 174s | 15 | create_new_building, create_new_building, list_weather_files, create_new_building, create_new_building, create_bar_building, change_building_location, create_typical_building, get_building_info | 27 | 4.1k | 447.7k | $0.5387 | 1 | +| new-building:Import the FloorspaceJS floor plan | PASS | 39s | 6 | import_floorspacejs, list_files, import_floorspacejs | 12 | 635 | 103.5k | $0.1443 | 1 | +| new-building:Create a bar building for a medium | PASS | 22s | 3 | create_bar_building | 7 | 436 | 46.4k | $0.1169 | 1 | +| qaqc:Check the model for issues | FAIL | 17s | 4 | load_osm_model, validate_model | 8 | 404 | 64.9k | $0.1174 | 1 | +| qaqc:Validate before simulation | FAIL | 26s | 4 | load_osm_model, validate_model | 8 | 358 | 64.9k | $0.1088 | 1 | +| qaqc:QA/QC the model | FAIL | 28s | 5 | load_osm_model, validate_model | 11 | 557 | 85.1k | $0.1274 | 1 | +| qaqc:Is my model ready to simulate? 
| FAIL | 16s | 4 | load_osm_model, validate_model | 8 | 399 | 54.9k | $0.1679 | 1 | +| retrofit:Compare before and after adding ins | PASS | 58s | 18 | load_osm_model, list_model_objects, list_surfaces, list_surfaces, get_construction_details, get_construction_details, get_object_fields, get_object_fields, set_object_property, set_object_property, get_object_fields, get_object_fields | 24 | 2.3k | 257.8k | $0.3296 | 1 | +| retrofit:Do a retrofit analysis | PASS | 180s | 0 | load_osm_model, get_building_info, get_model_summary, list_air_loops, list_thermal_zones, get_weather_info, list_surfaces, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, add_baseline_system, save_osm_model, run_simulation, list_materials, get_construction_details, get_construction_details, get_construction_details, list_model_objects, get_construction_details, get_construction_details, get_object_fields, get_object_fields, save_osm_model, set_object_property, set_object_property, list_model_objects, get_load_details, list_model_objects, get_object_fields, set_object_property, save_osm_model, run_simulation, get_run_status, get_run_status, extract_summary_metrics, extract_end_use_breakdown, get_run_status, extract_summary_metrics, extract_end_use_breakdown | 0 | 0 | 0 | $0.0000 | 1 | +| simulate:Run a simulation | PASS | 120s | 0 | load_osm_model, run_simulation | 0 | 0 | 0 | $0.0000 | 1 | +| simulate:Simulate the model | PASS | 120s | 0 | load_osm_model, run_simulation | 0 | 0 | 0 | $0.0000 | 1 | +| simulate:Run EnergyPlus | PASS | 120s | 0 | load_osm_model, run_simulation | 0 | 0 | 0 | $0.0000 | 1 | +| troubleshoot:My simulation failed | FAIL | 26s | 7 | load_osm_model, extract_simulation_errors, list_weather_files | 14 | 683 | 105.1k | $0.1980 | 1 | +| troubleshoot:EUI looks way too high | PASS | 120s | 0 | load_osm_model, 
extract_summary_metrics, extract_end_use_breakdown, extract_simulation_errors, get_run_status, get_run_artifacts, list_weather_files, change_building_location, save_osm_model, save_osm_model, run_simulation | 0 | 0 | 0 | $0.0000 | 1 | +| troubleshoot:Too many unmet hours | PASS | 120s | 0 | load_osm_model, extract_summary_metrics, extract_zone_summary, extract_simulation_errors, get_run_status, list_weather_files, change_building_location, save_osm_model, save_osm_model, run_simulation | 0 | 0 | 0 | $0.0000 | 1 | +| troubleshoot:Why did EnergyPlus crash? | FAIL | 18s | 4 | load_osm_model, extract_simulation_errors | 7 | 408 | 45.9k | $0.1031 | 1 | +| view:Show me the model | PASS | 30s | 6 | load_osm_model, view_model, copy_file | 12 | 474 | 103.6k | $0.1355 | 1 | +| view:Visualize the building | PASS | 22s | 4 | load_osm_model, view_model | 8 | 336 | 64.9k | $0.1085 | 1 | +| view:3D view | PASS | 18s | 4 | load_osm_model, view_model | 8 | 339 | 64.9k | $0.1086 | 1 | + +### tier4 + +| Test | Result | Time | Turns | Tools | In Tok | Out Tok | Cache | Cost | Att | +|--------------------------------------------|--------|------|-------|-------------------------------------------------------------------------------------------------------------------------------------------|--------|---------|--------|---------|-----| +| test_create_uses_mcp_not_raw_idf | PASS | 96s | 10 | create_new_building, list_weather_files, create_new_building, change_building_location, change_building_location, create_typical_building | 18 | 1.9k | 234.4k | $0.3138 | 1 | +| test_no_script_for_results | PASS | 19s | 6 | extract_summary_metrics, get_run_status, extract_simulation_errors | 11 | 597 | 74.4k | $0.1883 | 1 | +| test_inspect_component_uses_mcp_not_script | PASS | 21s | 8 | load_osm_model, list_model_objects, list_model_objects, list_model_objects, get_component_properties | 9 | 769 | 85.2k | $0.1426 | 1 | + +### progressive + +| Test | Result | Time | Turns | Tools | In Tok | Out Tok | 
Cache | Cost | Att | +|-------------------------|--------|------|-------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------|---------|--------|---------|-----| +| import_floorplan_L1 | PASS | 21s | 4 | list_files, import_floorspacejs | 8 | 590 | 66.5k | $0.1247 | 1 | +| import_floorplan_L2 | PASS | 26s | 6 | import_floorspacejs, list_files, import_floorspacejs | 12 | 584 | 104.0k | $0.1397 | 1 | +| import_floorplan_L3 | PASS | 23s | 6 | import_floorspacejs, list_files, import_floorspacejs | 12 | 583 | 104.0k | $0.1396 | 1 | +| add_hvac_L1 | PASS | 26s | 8 | load_osm_model, get_building_info, list_thermal_zones, add_baseline_system | 12 | 1.0k | 108.0k | $0.1775 | 1 | +| add_hvac_L2 | PASS | 20s | 5 | load_osm_model, list_thermal_zones, add_baseline_system | 9 | 654 | 86.4k | $0.1433 | 1 | +| add_hvac_L3 | PASS | 19s | 5 | load_osm_model, list_thermal_zones, add_baseline_system | 9 | 634 | 86.4k | $0.1427 | 1 | +| view_model_L1 | PASS | 22s | 4 | load_osm_model, view_model | 8 | 405 | 65.0k | $0.1103 | 1 | +| view_model_L2 | PASS | 17s | 4 | load_osm_model, view_model | 8 | 371 | 64.5k | $0.1122 | 1 | +| view_model_L3 | PASS | 19s | 4 | load_osm_model, view_model | 8 | 391 | 65.0k | $0.1101 | 1 | +| set_weather_L1 | PASS | 32s | 6 | load_osm_model, list_weather_files, change_building_location | 12 | 864 | 111.5k | $0.1994 | 1 | +| set_weather_L2 | PASS | 48s | 8 | load_osm_model, change_building_location, list_weather_files, change_building_location, 
change_building_location | 14 | 977 | 160.3k | $0.2336 | 1 | +| set_weather_L3 | PASS | 35s | 7 | load_osm_model, change_building_location, list_weather_files, change_building_location | 13 | 831 | 133.0k | $0.2097 | 1 | +| run_qaqc_L1 | PASS | 17s | 4 | load_osm_model, validate_model | 8 | 399 | 65.7k | $0.1125 | 1 | +| run_qaqc_L2 | PASS | 20s | 5 | load_osm_model, validate_model | 10 | 550 | 65.3k | $0.1207 | 1 | +| run_qaqc_L3 | PASS | 17s | 6 | load_osm_model, inspect_osm_summary, validate_model | 11 | 584 | 85.7k | $0.1318 | 1 | +| create_building_L1 | PASS | 120s | 0 | create_new_building, create_new_building, list_weather_files, create_new_building, create_bar_building, create_example_osm, create_bar_building | 0 | 0 | 0 | $0.0000 | 1 | +| create_building_L2 | PASS | 120s | 0 | create_new_building, create_new_building, list_weather_files, create_new_building, create_new_building, create_bar_building, create_example_osm, create_bar_building | 0 | 0 | 0 | $0.0000 | 1 | +| create_building_L3 | PASS | 15s | 3 | create_bar_building | 7 | 372 | 46.4k | $0.1114 | 1 | +| add_pv_L1 | PASS | 22s | 4 | load_osm_model, add_rooftop_pv | 8 | 451 | 65.2k | $0.1136 | 1 | +| add_pv_L2 | PASS | 18s | 4 | load_osm_model, add_rooftop_pv | 8 | 368 | 64.6k | $0.1143 | 1 | +| add_pv_L3 | PASS | 18s | 4 | load_osm_model, add_rooftop_pv | 8 | 385 | 65.2k | $0.1117 | 1 | +| thermostat_L1 | PASS | 15s | 4 | load_osm_model, adjust_thermostat_setpoints | 8 | 359 | 65.2k | $0.1120 | 1 | +| thermostat_L2 | PASS | 18s | 4 | load_osm_model, adjust_thermostat_setpoints | 8 | 364 | 64.6k | $0.1153 | 1 | +| thermostat_L3 | PASS | 15s | 4 | load_osm_model, adjust_thermostat_setpoints | 8 | 368 | 64.6k | $0.1154 | 1 | +| list_spaces_L1 | PASS | 21s | 4 | load_osm_model, list_spaces | 8 | 444 | 65.2k | $0.1169 | 1 | +| list_spaces_L2 | PASS | 17s | 4 | load_osm_model, list_spaces | 8 | 605 | 65.3k | $0.1198 | 1 | +| list_spaces_L3 | PASS | 19s | 4 | load_osm_model, list_spaces | 8 | 584 | 55.4k 
| $0.1763 | 1 | +| schedules_L1 | PASS | 20s | 6 | load_osm_model, list_model_objects, list_model_objects, list_model_objects | 9 | 616 | 75.4k | $0.1870 | 1 | +| schedules_L2 | PASS | 16s | 4 | load_osm_model, list_model_objects | 8 | 389 | 65.6k | $0.1127 | 1 | +| schedules_L3 | PASS | 21s | 4 | load_osm_model, list_model_objects | 8 | 397 | 65.7k | $0.1130 | 1 | +| inspect_component_L1 | PASS | 24s | 6 | load_osm_model, list_plant_loops, get_component_properties | 9 | 575 | 86.3k | $0.1359 | 1 | +| inspect_component_L2 | PASS | 19s | 5 | load_osm_model, list_model_objects, get_component_properties | 9 | 476 | 85.6k | $0.1264 | 1 | +| inspect_component_L3 | PASS | 33s | 7 | load_osm_model, get_object_fields, list_model_objects, get_object_fields | 13 | 821 | 124.3k | $0.1665 | 1 | +| modify_component_L1 | PASS | 21s | 6 | load_osm_model, list_model_objects, get_component_properties, set_component_properties | 10 | 556 | 106.0k | $0.1433 | 1 | +| modify_component_L2 | PASS | 14s | 5 | load_osm_model, list_model_objects, set_component_properties | 9 | 430 | 84.7k | $0.1317 | 1 | +| modify_component_L3 | PASS | 14s | 5 | load_osm_model, list_model_objects, set_object_property | 9 | 481 | 76.6k | $0.1856 | 1 | +| list_dynamic_type_L1 | PASS | 37s | 10 | load_osm_model, get_simulation_control, list_air_loops, list_thermal_zones, get_sizing_system_properties, get_sizing_zone_properties, get_sizing_zone_properties | 12 | 1.3k | 106.3k | $0.1716 | 1 | +| list_dynamic_type_L2 | PASS | 14s | 4 | load_osm_model, list_model_objects | 8 | 360 | 65.6k | $0.1107 | 1 | +| list_dynamic_type_L3 | PASS | 16s | 4 | load_osm_model, list_model_objects | 8 | 393 | 65.7k | $0.1119 | 1 | +| floor_area_L1 | PASS | 21s | 4 | load_osm_model, get_building_info | 8 | 355 | 64.5k | $0.1126 | 1 | +| floor_area_L2 | PASS | 17s | 5 | load_osm_model, get_building_info | 11 | 333 | 83.1k | $0.1237 | 1 | +| floor_area_L3 | PASS | 16s | 4 | load_osm_model, get_building_info | 8 | 347 | 64.9k | 
$0.1101 | 1 | +| materials_L1 | PASS | 28s | 4 | load_osm_model, list_materials | 8 | 595 | 64.9k | $0.1222 | 1 | +| materials_L2 | PASS | 18s | 4 | load_osm_model, list_materials | 8 | 838 | 65.1k | $0.1274 | 1 | +| materials_L3 | PASS | 17s | 4 | load_osm_model, list_materials | 8 | 771 | 64.5k | $0.1284 | 1 | +| thermal_zones_L1 | PASS | 15s | 5 | load_osm_model, list_thermal_zones | 10 | 398 | 64.5k | $0.1177 | 1 | +| thermal_zones_L2 | PASS | 14s | 4 | load_osm_model, list_thermal_zones | 8 | 463 | 65.0k | $0.1161 | 1 | +| thermal_zones_L3 | PASS | 21s | 4 | load_osm_model, list_thermal_zones | 8 | 467 | 65.2k | $0.1152 | 1 | +| subsurfaces_L1 | PASS | 15s | 4 | load_osm_model, list_subsurfaces | 8 | 355 | 65.4k | $0.1096 | 1 | +| subsurfaces_L2 | PASS | 15s | 4 | load_osm_model, list_subsurfaces | 8 | 362 | 64.6k | $0.1147 | 1 | +| subsurfaces_L3 | PASS | 15s | 4 | load_osm_model, list_subsurfaces | 8 | 330 | 54.8k | $0.1697 | 1 | +| surface_details_L1 | PASS | 24s | 5 | load_osm_model, list_surfaces | 11 | 688 | 83.8k | $0.1396 | 1 | +| surface_details_L2 | PASS | 34s | 5 | load_osm_model, list_surfaces, get_surface_details | 9 | 599 | 84.6k | $0.1345 | 1 | +| surface_details_L3 | PASS | 26s | 4 | load_osm_model, list_surfaces | 8 | 668 | 64.8k | $0.1924 | 1 | +| run_simulation_L1 | PASS | 181s | 10 | load_osm_model, get_weather_info, run_simulation, get_run_status, extract_summary_metrics | 18 | 1.0k | 185.6k | $0.1996 | 1 | +| run_simulation_L2 | PASS | 149s | 7 | load_osm_model, run_simulation, get_run_status | 13 | 738 | 123.6k | $0.1574 | 1 | +| run_simulation_L3 | PASS | 150s | 7 | load_osm_model, run_simulation, get_run_status | 13 | 696 | 124.0k | $0.1537 | 1 | +| get_eui_L1 | PASS | 21s | 6 | extract_summary_metrics, extract_end_use_breakdown, get_run_status | 11 | 597 | 84.0k | $0.1310 | 1 | +| get_eui_L2 | PASS | 28s | 8 | extract_summary_metrics, extract_end_use_breakdown, get_run_status, extract_simulation_errors | 15 | 760 | 123.2k | $0.1593 | 
1 | +| get_eui_L3 | PASS | 16s | 3 | extract_summary_metrics | 7 | 251 | 35.8k | $0.1520 | 1 | +| end_use_breakdown_L1 | PASS | 33s | 11 | extract_end_use_breakdown, extract_summary_metrics, get_run_artifacts, query_timeseries, query_timeseries, extract_simulation_errors | 20 | 1.4k | 191.5k | $0.2255 | 1 | +| end_use_breakdown_L2 | PASS | 28s | 6 | extract_end_use_breakdown, get_run_status, extract_summary_metrics | 11 | 643 | 83.9k | $0.1299 | 1 | +| end_use_breakdown_L3 | PASS | 13s | 3 | extract_end_use_breakdown | 7 | 241 | 45.7k | $0.0934 | 1 | +| hvac_sizing_L1 | PASS | 24s | 8 | extract_hvac_sizing, extract_component_sizing, extract_simulation_errors, extract_summary_metrics | 11 | 908 | 84.6k | $0.1450 | 1 | +| hvac_sizing_L2 | PASS | 20s | 5 | extract_hvac_sizing, extract_component_sizing | 11 | 622 | 83.8k | $0.1257 | 1 | +| hvac_sizing_L3 | PASS | 14s | 3 | extract_hvac_sizing | 7 | 332 | 45.4k | $0.0965 | 1 | +| set_wwr_L1 | PASS | 33s | 13 | load_osm_model, list_surfaces, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio | 12 | 1.2k | 104.5k | $0.1728 | 1 | +| set_wwr_L2 | PASS | 28s | 13 | load_osm_model, list_surfaces, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio | 12 | 1.3k | 105.1k | $0.1721 | 1 | +| set_wwr_L3 | PASS | 33s | 13 | load_osm_model, list_surfaces, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio | 12 | 1.2k | 105.2k | $0.1709 | 1 | +| replace_windows_L1 | PASS | 112s | 26 | load_osm_model, list_materials, list_subsurfaces, list_surfaces, list_model_objects, 
list_surfaces, search_api, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, create_measure, apply_measure, apply_measure, get_construction_details | 31 | 4.5k | 467.4k | $0.5406 | 1 | +| replace_windows_L2 | PASS | 99s | 30 | load_osm_model, list_subsurfaces, list_model_objects, list_subsurfaces, get_construction_details, get_construction_details, list_materials, list_subsurfaces, list_surfaces, list_surfaces, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, list_materials, search_api, create_measure, apply_measure, replace_window_constructions | 32 | 4.1k | 440.7k | $0.5020 | 1 | +| replace_windows_L3 | PASS | 44s | 20 | load_osm_model, list_subsurfaces, list_subsurfaces, list_model_objects, list_surfaces, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, replace_window_constructions | 23 | 1.9k | 215.4k | $0.2596 | 1 | +| construction_details_L1 | PASS | 22s | 6 | load_osm_model, list_surfaces, get_construction_details | 12 | 631 | 103.8k | $0.1431 | 1 | +| construction_details_L2 | PASS | 21s | 7 | load_osm_model, list_model_objects, get_construction_details, get_construction_details | 9 | 801 | 85.9k | $0.1384 | 1 | +| construction_details_L3 | PASS | 25s | 8 | load_osm_model, list_model_objects, get_construction_details, get_construction_details, get_construction_details | 12 | 895 | 104.9k | $0.1554 | 1 | +| check_loads_L1 | PASS | 17s | 5 | load_osm_model, list_spaces, get_space_details | 9 | 439 | 84.1k | $0.1312 | 1 | +| check_loads_L2 | PASS | 30s | 8 | load_osm_model, list_spaces, 
get_space_type_details, get_load_details, get_load_details | 13 | 889 | 127.1k | $0.1684 | 1 | +| check_loads_L3 | PASS | 25s | 13 | load_osm_model, list_model_objects, list_model_objects, list_model_objects, list_model_objects, list_model_objects, get_load_details, get_load_details, get_load_details, get_load_details | 12 | 1.1k | 105.3k | $0.1614 | 1 | +| create_loads_L1 | PASS | 48s | 25 | load_osm_model, list_spaces, create_people_definition, create_people_definition, create_people_definition, create_people_definition, create_people_definition, create_people_definition, create_people_definition, create_people_definition, create_people_definition, create_people_definition, create_lights_definition, create_lights_definition, create_lights_definition, create_lights_definition, create_lights_definition, create_lights_definition, create_lights_definition, create_lights_definition, create_lights_definition, create_lights_definition | 12 | 2.8k | 95.3k | $0.3194 | 1 | +| create_loads_L2 | PASS | 43s | 25 | load_osm_model, list_spaces, create_people_definition, create_people_definition, create_people_definition, create_people_definition, create_people_definition, create_people_definition, create_people_definition, create_people_definition, create_people_definition, create_people_definition, create_lights_definition, create_lights_definition, create_lights_definition, create_lights_definition, create_lights_definition, create_lights_definition, create_lights_definition, create_lights_definition, create_lights_definition, create_lights_definition | 12 | 2.7k | 84.2k | $0.3577 | 1 | +| create_loads_L3 | PASS | 19s | 6 | load_osm_model, list_spaces, create_people_definition | 12 | 604 | 95.1k | $0.1973 | 1 | +| create_plant_loop_L1 | PASS | 15s | 4 | load_osm_model, create_plant_loop | 8 | 410 | 65.5k | $0.1120 | 1 | +| create_plant_loop_L2 | PASS | 27s | 4 | load_osm_model, create_plant_loop | 8 | 424 | 64.8k | $0.1169 | 1 | +| create_plant_loop_L3 | PASS | 14s | 4 | 
load_osm_model, create_plant_loop | 8 | 361 | 64.8k | $0.1153 | 1 | +| schedule_details_L1 | PASS | 66s | 22 | load_osm_model, list_air_loops, list_zone_hvac_equipment, list_model_objects, list_model_objects, list_model_objects, get_building_info, get_schedule_details, get_schedule_details, get_schedule_details, get_object_fields, get_object_fields, read_file, read_file, read_file | 28 | 2.3k | 338.1k | $0.4357 | 1 | +| schedule_details_L2 | PASS | 37s | 9 | load_osm_model, list_model_objects, list_model_objects, list_model_objects, list_model_objects, get_schedule_details | 11 | 1.1k | 125.2k | $0.1750 | 1 | +| schedule_details_L3 | PASS | 27s | 6 | load_osm_model, list_model_objects, get_schedule_details | 12 | 700 | 103.9k | $0.1496 | 1 | +| space_type_info_L1 | PASS | 27s | 6 | load_osm_model, list_spaces, get_space_type_details | 12 | 658 | 105.3k | $0.1573 | 1 | +| space_type_info_L2 | PASS | 23s | 5 | load_osm_model, list_model_objects, get_space_type_details | 9 | 602 | 84.7k | $0.1355 | 1 | +| space_type_info_L3 | PASS | 22s | 5 | load_osm_model, list_model_objects, get_space_type_details | 9 | 561 | 85.4k | $0.1303 | 1 | +| set_run_period_L1 | PASS | 19s | 4 | load_osm_model, set_run_period | 8 | 366 | 65.0k | $0.1106 | 1 | +| set_run_period_L2 | PASS | 14s | 4 | load_osm_model, set_run_period | 8 | 371 | 64.7k | $0.1132 | 1 | +| set_run_period_L3 | PASS | 12s | 4 | load_osm_model, set_run_period | 8 | 347 | 54.5k | $0.1704 | 1 | +| ideal_air_L1 | PASS | 16s | 4 | load_osm_model, enable_ideal_air_loads | 8 | 311 | 64.9k | $0.1072 | 1 | +| ideal_air_L2 | PASS | 18s | 5 | load_osm_model, enable_ideal_air_loads, list_thermal_zones | 9 | 478 | 84.2k | $0.1346 | 1 | +| ideal_air_L3 | PASS | 16s | 4 | load_osm_model, enable_ideal_air_loads | 8 | 287 | 54.5k | $0.1659 | 1 | +| save_model_L1 | PASS | 14s | 4 | load_osm_model, save_osm_model | 8 | 292 | 55.1k | $0.1639 | 1 | +| save_model_L2 | PASS | 16s | 4 | load_osm_model, save_osm_model | 8 | 318 | 64.5k | 
$0.1103 | 1 | +| save_model_L3 | PASS | 15s | 4 | load_osm_model, save_osm_model | 8 | 315 | 54.6k | $0.1673 | 1 | +| add_ev_L1 | PASS | 22s | 4 | load_osm_model, add_ev_load | 8 | 495 | 64.7k | $0.1239 | 1 | +| add_ev_L2 | PASS | 20s | 4 | load_osm_model, add_ev_load | 8 | 498 | 64.7k | $0.1239 | 1 | +| add_ev_L3 | PASS | 23s | 4 | load_osm_model, add_ev_load | 8 | 396 | 64.7k | $0.1209 | 1 | +| list_measures_L1 | PASS | 13s | 3 | list_custom_measures | 7 | 429 | 45.6k | $0.1001 | 1 | +| list_measures_L2 | PASS | 16s | 3 | list_custom_measures | 7 | 416 | 45.6k | $0.0998 | 1 | + +## Progressive Prompt Analysis + +Pass rates by specificity level per case: + +| Case | L1 (vague) | L2 (moderate) | L3 (explicit) | +|----------------------|------------|---------------|---------------| +| import_floorplan | PASS | PASS | PASS | +| add_hvac | PASS | PASS | PASS | +| view_model | PASS | PASS | PASS | +| set_weather | PASS | PASS | PASS | +| run_qaqc | PASS | PASS | PASS | +| create_building | PASS | PASS | PASS | +| add_pv | PASS | PASS | PASS | +| thermostat | PASS | PASS | PASS | +| list_spaces | PASS | PASS | PASS | +| schedules | PASS | PASS | PASS | +| inspect_component | PASS | PASS | PASS | +| modify_component | PASS | PASS | PASS | +| list_dynamic_type | PASS | PASS | PASS | +| floor_area | PASS | PASS | PASS | +| materials | PASS | PASS | PASS | +| thermal_zones | PASS | PASS | PASS | +| subsurfaces | PASS | PASS | PASS | +| surface_details | PASS | PASS | PASS | +| run_simulation | PASS | PASS | PASS | +| get_eui | PASS | PASS | PASS | +| end_use_breakdown | PASS | PASS | PASS | +| hvac_sizing | PASS | PASS | PASS | +| set_wwr | PASS | PASS | PASS | +| replace_windows | PASS | PASS | PASS | +| construction_details | PASS | PASS | PASS | +| check_loads | PASS | PASS | PASS | +| create_loads | PASS | PASS | PASS | +| create_plant_loop | PASS | PASS | PASS | +| schedule_details | PASS | PASS | PASS | +| space_type_info | PASS | PASS | PASS | +| set_run_period | 
PASS | PASS | PASS | +| ideal_air | PASS | PASS | PASS | +| save_model | PASS | PASS | PASS | +| add_ev | PASS | PASS | PASS | +| list_measures | PASS | PASS | - | + +**Summary:** L1=35/35 | L2=35/35 | L3=34/35 + +## Tool Discovery Overhead + +| Metric | Value | +|--------|-------| +| Avg ToolSearch calls/test | 2.0 | +| Max ToolSearch calls | 11 | +| Tests with 0 ToolSearch | 0/180 | + +## Failure Mode Analysis + +| Mode | Count | Description | +|------|-------|-------------| +| wrong_tool | 8 | MCP tool called but not the expected one | +| timeout | 2 | Timed out before completing | + +## Failed Tests + +- **energy-report:Give me a full energy report** (tier3, timeout): 120s, 0 turns, tools: load_osm_model -> get_building_info -> list_files -> get_weather_info -> run_simulation +- **qaqc:Check the model for issues** (tier3, wrong_tool): 17s, 4 turns, tools: load_osm_model -> validate_model +- **qaqc:Validate before simulation** (tier3, wrong_tool): 26s, 4 turns, tools: load_osm_model -> validate_model +- **qaqc:QA/QC the model** (tier3, wrong_tool): 28s, 5 turns, tools: load_osm_model -> validate_model +- **qaqc:Is my model ready to simulate?** (tier3, wrong_tool): 16s, 4 turns, tools: load_osm_model -> validate_model +- **troubleshoot:My simulation failed** (tier3, wrong_tool): 26s, 7 turns, tools: load_osm_model -> extract_simulation_errors -> list_weather_files +- **troubleshoot:Why did EnergyPlus crash?** (tier3, wrong_tool): 18s, 4 turns, tools: load_osm_model -> extract_simulation_errors +- **hvac_chilled_beam_comparison** (tier2, timeout): 300s, 0 turns, tools: load_osm_model -> list_air_loops -> get_air_loop_details -> replace_air_terminals -> save_osm_model -> run_simulation -> get_run_status -> list_weather_files -> change_building_location -> save_osm_model -> run_simulation +- **Ruby** (tier2, wrong_tool): 28s, 3 turns, tools: create_measure +- **Python** (tier2, wrong_tool): 31s, 3 turns, tools: create_measure diff --git 
a/docs/sweeps/opus-2026-03-28/benchmark_history.json b/docs/sweeps/opus-2026-03-28/benchmark_history.json new file mode 100644 index 0000000..c97ae32 --- /dev/null +++ b/docs/sweeps/opus-2026-03-28/benchmark_history.json @@ -0,0 +1,54 @@ +[ + { + "timestamp": "2026-03-28T21:44:31+00:00", + "model": "opus", + "retries": 0, + "total_tests": 180, + "passed": 170, + "failed": 10, + "pass_rate": 94.4, + "total_duration_s": 11078.5, + "total_input_tokens": 2019, + "total_output_tokens": 164420, + "total_cache_read_tokens": 22609596, + "total_cost_usd": 32.2343, + "tiers": { + "setup": { + "total": 6, + "passed": 6, + "duration_s": 512.4, + "pass_rate": 100.0 + }, + "tier1": { + "total": 4, + "passed": 4, + "duration_s": 135.2, + "pass_rate": 100.0 + }, + "tier3": { + "total": 26, + "passed": 19, + "duration_s": 1860.4, + "pass_rate": 73.1 + }, + "tier2": { + "total": 37, + "passed": 34, + "duration_s": 5343.5, + "pass_rate": 91.9 + }, + "tier4": { + "total": 3, + "passed": 3, + "duration_s": 135.3, + "pass_rate": 100.0 + }, + "progressive": { + "total": 104, + "passed": 104, + "duration_s": 3091.7, + "pass_rate": 100.0 + } + } + } +] \ No newline at end of file diff --git a/docs/sweeps/opus-2026-03-28/sweep.log b/docs/sweeps/opus-2026-03-28/sweep.log new file mode 100644 index 0000000..48afd7b --- /dev/null +++ b/docs/sweeps/opus-2026-03-28/sweep.log @@ -0,0 +1,782 @@ +============================= test session starts ============================= +platform win32 -- Python 3.13.12, pytest-9.0.2, pluggy-1.6.0 -- C:\Python313\python.exe +cachedir: .pytest_cache +rootdir: C:\projects\openstudio-mcp +configfile: pyproject.toml +plugins: anyio-4.12.1, cov-7.0.0, timeout-2.4.0 +collecting ... 
collected 230 items + +tests/llm/test_01_setup.py::test_create_baseline_model PASSED [ 0%] +tests/llm/test_01_setup.py::test_create_baseline_with_hvac PASSED [ 0%] +tests/llm/test_01_setup.py::test_create_example_model PASSED [ 1%] +tests/llm/test_01_setup.py::test_load_baseline_model PASSED [ 1%] +tests/llm/test_01_setup.py::test_run_baseline_simulation PASSED [ 2%] +tests/llm/test_01_setup.py::test_run_retrofit_simulation PASSED [ 2%] +tests/llm/test_02_tool_selection.py::test_tool_selection_no_model[What is the server status?] PASSED [ 3%] +tests/llm/test_02_tool_selection.py::test_tool_selection_no_model[List available skills] PASSED [ 3%] +tests/llm/test_02_tool_selection.py::test_tool_selection_no_model[Create a small office building usin] PASSED [ 3%] +tests/llm/test_02_tool_selection.py::test_tool_selection_no_model[Create bar geometry for a retail bu] PASSED [ 4%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[add-hvac:Add HVAC to the model] PASSED [ 4%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[add-hvac:Set up heating and cooling] PASSED [ 5%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[add-hvac:What HVAC system should I use?] 
PASSED [ 5%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[add-hvac:Add a VAV system] PASSED [ 6%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[energy-report:Give me a full energy report] FAILED [ 6%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[new-building:Create a small office building] PASSED [ 6%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[new-building:Model a 3-story school] PASSED [ 7%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[new-building:Create a retail building, 25000 sqf] PASSED [ 7%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[new-building:Import the FloorspaceJS floor plan ] PASSED [ 8%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[new-building:Create a bar building for a medium ] PASSED [ 8%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[qaqc:Check the model for issues] FAILED [ 9%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[qaqc:Validate before simulation] FAILED [ 9%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[qaqc:QA/QC the model] FAILED [ 10%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[qaqc:Is my model ready to simulate?] 
FAILED [ 10%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[retrofit:Compare before and after adding ins] PASSED [ 10%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[retrofit:Do a retrofit analysis] PASSED [ 11%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[simulate:Run a simulation] PASSED [ 11%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[simulate:Simulate the model] PASSED [ 12%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[simulate:Run EnergyPlus] PASSED [ 12%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[troubleshoot:My simulation failed] FAILED [ 13%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[troubleshoot:EUI looks way too high] PASSED [ 13%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[troubleshoot:Too many unmet hours] PASSED [ 13%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[troubleshoot:Why did EnergyPlus crash?] FAILED [ 14%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[view:Show me the model] PASSED [ 14%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[view:Visualize the building] PASSED [ 15%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[view:3D view] PASSED [ 15%] +tests/llm/test_04_workflows.py::test_workflow[systemd_fourpipebeam_e2e] PASSED [ 16%] +tests/llm/test_04_workflows.py::test_workflow[add_vav_reheat] PASSED [ 16%] +tests/llm/test_04_workflows.py::test_workflow[add_doas] PASSED [ 16%] +tests/llm/test_04_workflows.py::test_workflow[add_vrf] PASSED [ 17%] +tests/llm/test_04_workflows.py::test_workflow[set_weather] PASSED [ 17%] +tests/llm/test_04_workflows.py::test_workflow[add_rooftop_pv] PASSED [ 18%] +tests/llm/test_04_workflows.py::test_workflow[adjust_thermostat] PASSED [ 18%] +tests/llm/test_04_workflows.py::test_workflow[delete_space] PASSED [ 19%] +tests/llm/test_04_workflows.py::test_workflow[qaqc_check] PASSED [ 19%] 
+tests/llm/test_04_workflows.py::test_workflow[create_bar_office] PASSED [ 20%] +tests/llm/test_04_workflows.py::test_workflow[create_new_building] PASSED [ 20%] +tests/llm/test_04_workflows.py::test_workflow[bar_then_typical] PASSED [ 20%] +tests/llm/test_04_workflows.py::test_workflow[import_floorspacejs] PASSED [ 21%] +tests/llm/test_04_workflows.py::test_workflow[floorspacejs_to_typical] PASSED [ 21%] +tests/llm/test_04_workflows.py::test_workflow[manual_geometry_match] PASSED [ 22%] +tests/llm/test_04_workflows.py::test_workflow[envelope_retrofit] PASSED [ 22%] +tests/llm/test_04_workflows.py::test_workflow[create_and_assign_loads] PASSED [ 23%] +tests/llm/test_04_workflows.py::test_workflow[plant_loop_with_boiler] PASSED [ 23%] +tests/llm/test_04_workflows.py::test_workflow[inspect_and_modify_boiler] PASSED [ 23%] +tests/llm/test_04_workflows.py::test_workflow[extract_results_chain] PASSED [ 24%] +tests/llm/test_04_workflows.py::test_workflow[hvac_chilled_beam_comparison] FAILED [ 24%] +tests/llm/test_04_workflows.py::test_workflow[create_test_apply_measure] PASSED [ 25%] +tests/llm/test_04_workflows.py::test_workflow[measure_set_lights_full_chain] PASSED [ 25%] +tests/llm/test_04_workflows.py::test_workflow[measure_set_infiltration_full_chain] PASSED [ 26%] +tests/llm/test_04_workflows.py::test_workflow[measure_replace_terminals_full_chain] PASSED [ 26%] +tests/llm/test_04_workflows.py::test_workflow[create_measure_with_args] PASSED [ 26%] +tests/llm/test_04_workflows.py::test_workflow[measure_add_baseboards_full_chain] PASSED [ 27%] +tests/llm/test_04_workflows.py::test_workflow[ruby_measure_reduce_plugloads] PASSED [ 27%] +tests/llm/test_04_workflows.py::test_workflow[python_measure_reduce_plugloads] PASSED [ 28%] +tests/llm/test_04_workflows.py::test_workflow[ruby_measure_boiler_efficiency] PASSED [ 28%] +tests/llm/test_04_workflows.py::test_workflow[python_measure_boiler_efficiency] PASSED [ 29%] 
+tests/llm/test_04_workflows.py::test_create_measure_with_args_quality PASSED [ 29%] +tests/llm/test_04_workflows.py::test_complex_model_multi_query PASSED [ 30%] +tests/llm/test_04_workflows.py::test_measure_reduce_plugloads_quality[Ruby] PASSED [ 30%] +tests/llm/test_04_workflows.py::test_measure_reduce_plugloads_quality[Python] PASSED [ 30%] +tests/llm/test_04_workflows.py::test_measure_boiler_efficiency_quality[Ruby] FAILED [ 31%] +tests/llm/test_04_workflows.py::test_measure_boiler_efficiency_quality[Python] FAILED [ 31%] +tests/llm/test_05_guardrails.py::test_create_uses_mcp_not_raw_idf PASSED [ 32%] +tests/llm/test_05_guardrails.py::test_no_script_for_results PASSED [ 32%] +tests/llm/test_05_guardrails.py::test_inspect_component_uses_mcp_not_script PASSED [ 33%] +tests/llm/test_06_progressive.py::test_progressive[import_floorplan_L1] PASSED [ 33%] +tests/llm/test_06_progressive.py::test_progressive[import_floorplan_L2] PASSED [ 33%] +tests/llm/test_06_progressive.py::test_progressive[import_floorplan_L3] PASSED [ 34%] +tests/llm/test_06_progressive.py::test_progressive[add_hvac_L1] PASSED [ 34%] +tests/llm/test_06_progressive.py::test_progressive[add_hvac_L2] PASSED [ 35%] +tests/llm/test_06_progressive.py::test_progressive[add_hvac_L3] PASSED [ 35%] +tests/llm/test_06_progressive.py::test_progressive[view_model_L1] PASSED [ 36%] +tests/llm/test_06_progressive.py::test_progressive[view_model_L2] PASSED [ 36%] +tests/llm/test_06_progressive.py::test_progressive[view_model_L3] PASSED [ 36%] +tests/llm/test_06_progressive.py::test_progressive[set_weather_L1] PASSED [ 37%] +tests/llm/test_06_progressive.py::test_progressive[set_weather_L2] PASSED [ 37%] +tests/llm/test_06_progressive.py::test_progressive[set_weather_L3] PASSED [ 38%] +tests/llm/test_06_progressive.py::test_progressive[run_qaqc_L1] PASSED [ 38%] +tests/llm/test_06_progressive.py::test_progressive[run_qaqc_L2] PASSED [ 39%] +tests/llm/test_06_progressive.py::test_progressive[run_qaqc_L3] PASSED [ 
39%] +tests/llm/test_06_progressive.py::test_progressive[create_building_L1] PASSED [ 40%] +tests/llm/test_06_progressive.py::test_progressive[create_building_L2] PASSED [ 40%] +tests/llm/test_06_progressive.py::test_progressive[create_building_L3] PASSED [ 40%] +tests/llm/test_06_progressive.py::test_progressive[add_pv_L1] PASSED [ 41%] +tests/llm/test_06_progressive.py::test_progressive[add_pv_L2] PASSED [ 41%] +tests/llm/test_06_progressive.py::test_progressive[add_pv_L3] PASSED [ 42%] +tests/llm/test_06_progressive.py::test_progressive[thermostat_L1] PASSED [ 42%] +tests/llm/test_06_progressive.py::test_progressive[thermostat_L2] PASSED [ 43%] +tests/llm/test_06_progressive.py::test_progressive[thermostat_L3] PASSED [ 43%] +tests/llm/test_06_progressive.py::test_progressive[list_spaces_L1] PASSED [ 43%] +tests/llm/test_06_progressive.py::test_progressive[list_spaces_L2] PASSED [ 44%] +tests/llm/test_06_progressive.py::test_progressive[list_spaces_L3] PASSED [ 44%] +tests/llm/test_06_progressive.py::test_progressive[schedules_L1] PASSED [ 45%] +tests/llm/test_06_progressive.py::test_progressive[schedules_L2] PASSED [ 45%] +tests/llm/test_06_progressive.py::test_progressive[schedules_L3] PASSED [ 46%] +tests/llm/test_06_progressive.py::test_progressive[inspect_component_L1] PASSED [ 46%] +tests/llm/test_06_progressive.py::test_progressive[inspect_component_L2] PASSED [ 46%] +tests/llm/test_06_progressive.py::test_progressive[inspect_component_L3] PASSED [ 47%] +tests/llm/test_06_progressive.py::test_progressive[modify_component_L1] PASSED [ 47%] +tests/llm/test_06_progressive.py::test_progressive[modify_component_L2] PASSED [ 48%] +tests/llm/test_06_progressive.py::test_progressive[modify_component_L3] PASSED [ 48%] +tests/llm/test_06_progressive.py::test_progressive[list_dynamic_type_L1] PASSED [ 49%] +tests/llm/test_06_progressive.py::test_progressive[list_dynamic_type_L2] PASSED [ 49%] +tests/llm/test_06_progressive.py::test_progressive[list_dynamic_type_L3] 
PASSED [ 50%] +tests/llm/test_06_progressive.py::test_progressive[floor_area_L1] PASSED [ 50%] +tests/llm/test_06_progressive.py::test_progressive[floor_area_L2] PASSED [ 50%] +tests/llm/test_06_progressive.py::test_progressive[floor_area_L3] PASSED [ 51%] +tests/llm/test_06_progressive.py::test_progressive[materials_L1] PASSED [ 51%] +tests/llm/test_06_progressive.py::test_progressive[materials_L2] PASSED [ 52%] +tests/llm/test_06_progressive.py::test_progressive[materials_L3] PASSED [ 52%] +tests/llm/test_06_progressive.py::test_progressive[thermal_zones_L1] PASSED [ 53%] +tests/llm/test_06_progressive.py::test_progressive[thermal_zones_L2] PASSED [ 53%] +tests/llm/test_06_progressive.py::test_progressive[thermal_zones_L3] PASSED [ 53%] +tests/llm/test_06_progressive.py::test_progressive[subsurfaces_L1] PASSED [ 54%] +tests/llm/test_06_progressive.py::test_progressive[subsurfaces_L2] PASSED [ 54%] +tests/llm/test_06_progressive.py::test_progressive[subsurfaces_L3] PASSED [ 55%] +tests/llm/test_06_progressive.py::test_progressive[surface_details_L1] PASSED [ 55%] +tests/llm/test_06_progressive.py::test_progressive[surface_details_L2] PASSED [ 56%] +tests/llm/test_06_progressive.py::test_progressive[surface_details_L3] PASSED [ 56%] +tests/llm/test_06_progressive.py::test_progressive[run_simulation_L1] PASSED [ 56%] +tests/llm/test_06_progressive.py::test_progressive[run_simulation_L2] PASSED [ 57%] +tests/llm/test_06_progressive.py::test_progressive[run_simulation_L3] PASSED [ 57%] +tests/llm/test_06_progressive.py::test_progressive[get_eui_L1] PASSED [ 58%] +tests/llm/test_06_progressive.py::test_progressive[get_eui_L2] PASSED [ 58%] +tests/llm/test_06_progressive.py::test_progressive[get_eui_L3] PASSED [ 59%] +tests/llm/test_06_progressive.py::test_progressive[end_use_breakdown_L1] PASSED [ 59%] +tests/llm/test_06_progressive.py::test_progressive[end_use_breakdown_L2] PASSED [ 60%] +tests/llm/test_06_progressive.py::test_progressive[end_use_breakdown_L3] PASSED 
[ 60%] +tests/llm/test_06_progressive.py::test_progressive[hvac_sizing_L1] PASSED [ 60%] +tests/llm/test_06_progressive.py::test_progressive[hvac_sizing_L2] PASSED [ 61%] +tests/llm/test_06_progressive.py::test_progressive[hvac_sizing_L3] PASSED [ 61%] +tests/llm/test_06_progressive.py::test_progressive[set_wwr_L1] PASSED [ 62%] +tests/llm/test_06_progressive.py::test_progressive[set_wwr_L2] PASSED [ 62%] +tests/llm/test_06_progressive.py::test_progressive[set_wwr_L3] PASSED [ 63%] +tests/llm/test_06_progressive.py::test_progressive[replace_windows_L1] PASSED [ 63%] +tests/llm/test_06_progressive.py::test_progressive[replace_windows_L2] PASSED [ 63%] +tests/llm/test_06_progressive.py::test_progressive[replace_windows_L3] PASSED [ 64%] +tests/llm/test_06_progressive.py::test_progressive[construction_details_L1] PASSED [ 64%] +tests/llm/test_06_progressive.py::test_progressive[construction_details_L2] PASSED [ 65%] +tests/llm/test_06_progressive.py::test_progressive[construction_details_L3] PASSED [ 65%] +tests/llm/test_06_progressive.py::test_progressive[check_loads_L1] PASSED [ 66%] +tests/llm/test_06_progressive.py::test_progressive[check_loads_L2] PASSED [ 66%] +tests/llm/test_06_progressive.py::test_progressive[check_loads_L3] PASSED [ 66%] +tests/llm/test_06_progressive.py::test_progressive[create_loads_L1] PASSED [ 67%] +tests/llm/test_06_progressive.py::test_progressive[create_loads_L2] PASSED [ 67%] +tests/llm/test_06_progressive.py::test_progressive[create_loads_L3] PASSED [ 68%] +tests/llm/test_06_progressive.py::test_progressive[create_plant_loop_L1] PASSED [ 68%] +tests/llm/test_06_progressive.py::test_progressive[create_plant_loop_L2] PASSED [ 69%] +tests/llm/test_06_progressive.py::test_progressive[create_plant_loop_L3] PASSED [ 69%] +tests/llm/test_06_progressive.py::test_progressive[schedule_details_L1] PASSED [ 70%] +tests/llm/test_06_progressive.py::test_progressive[schedule_details_L2] PASSED [ 70%] 
+tests/llm/test_06_progressive.py::test_progressive[schedule_details_L3] PASSED [ 70%] +tests/llm/test_06_progressive.py::test_progressive[space_type_info_L1] PASSED [ 71%] +tests/llm/test_06_progressive.py::test_progressive[space_type_info_L2] PASSED [ 71%] +tests/llm/test_06_progressive.py::test_progressive[space_type_info_L3] PASSED [ 72%] +tests/llm/test_06_progressive.py::test_progressive[set_run_period_L1] PASSED [ 72%] +tests/llm/test_06_progressive.py::test_progressive[set_run_period_L2] PASSED [ 73%] +tests/llm/test_06_progressive.py::test_progressive[set_run_period_L3] PASSED [ 73%] +tests/llm/test_06_progressive.py::test_progressive[ideal_air_L1] PASSED [ 73%] +tests/llm/test_06_progressive.py::test_progressive[ideal_air_L2] PASSED [ 74%] +tests/llm/test_06_progressive.py::test_progressive[ideal_air_L3] PASSED [ 74%] +tests/llm/test_06_progressive.py::test_progressive[save_model_L1] PASSED [ 75%] +tests/llm/test_06_progressive.py::test_progressive[save_model_L2] PASSED [ 75%] +tests/llm/test_06_progressive.py::test_progressive[save_model_L3] PASSED [ 76%] +tests/llm/test_06_progressive.py::test_progressive[add_ev_L1] PASSED [ 76%] +tests/llm/test_06_progressive.py::test_progressive[add_ev_L2] PASSED [ 76%] +tests/llm/test_06_progressive.py::test_progressive[add_ev_L3] PASSED [ 77%] +tests/llm/test_06_progressive.py::test_progressive[list_measures_L1] PASSED [ 77%] +tests/llm/test_06_progressive.py::test_progressive[list_measures_L2] PASSED [ 78%] +tests/llm/test_06_progressive.py::test_progressive[list_measures_L3] SKIPPED [ 78%] +tests/llm/test_06_progressive.py::test_progressive[create_measure_L1] SKIPPED [ 79%] +tests/llm/test_06_progressive.py::test_progressive[create_measure_L2] SKIPPED [ 79%] +tests/llm/test_06_progressive.py::test_progressive[create_measure_L3] SKIPPED [ 80%] +tests/llm/test_06_progressive.py::test_progressive[test_measure_L1] SKIPPED [ 80%] +tests/llm/test_06_progressive.py::test_progressive[test_measure_L2] SKIPPED [ 80%] 
+tests/llm/test_06_progressive.py::test_progressive[test_measure_L3] SKIPPED [ 81%] +tests/llm/test_06_progressive.py::test_progressive[apply_existing_measure_L1] SKIPPED [ 81%] +tests/llm/test_06_progressive.py::test_progressive[apply_existing_measure_L2] SKIPPED [ 82%] +tests/llm/test_06_progressive.py::test_progressive[apply_existing_measure_L3] SKIPPED [ 82%] +tests/llm/test_06_progressive.py::test_progressive[replace_terminals_cooled_beam_L1] SKIPPED [ 83%] +tests/llm/test_06_progressive.py::test_progressive[replace_terminals_cooled_beam_L2] SKIPPED [ 83%] +tests/llm/test_06_progressive.py::test_progressive[replace_terminals_cooled_beam_L3] SKIPPED [ 83%] +tests/llm/test_06_progressive.py::test_progressive[replace_terminals_four_pipe_beam_L1] SKIPPED [ 84%] +tests/llm/test_06_progressive.py::test_progressive[replace_terminals_four_pipe_beam_L2] SKIPPED [ 84%] +tests/llm/test_06_progressive.py::test_progressive[replace_terminals_four_pipe_beam_L3] SKIPPED [ 85%] +tests/llm/test_06_progressive.py::test_progressive[measure_replace_terminals_L1] SKIPPED [ 85%] +tests/llm/test_06_progressive.py::test_progressive[measure_replace_terminals_L2] SKIPPED [ 86%] +tests/llm/test_06_progressive.py::test_progressive[measure_replace_terminals_L3] SKIPPED [ 86%] +tests/llm/test_06_progressive.py::test_progressive[zone_equipment_priority_L1] SKIPPED [ 86%] +tests/llm/test_06_progressive.py::test_progressive[zone_equipment_priority_L2] SKIPPED [ 87%] +tests/llm/test_06_progressive.py::test_progressive[zone_equipment_priority_L3] SKIPPED [ 87%] +tests/llm/test_06_progressive.py::test_progressive[edit_measure_L1] SKIPPED [ 88%] +tests/llm/test_06_progressive.py::test_progressive[edit_measure_L2] SKIPPED [ 88%] +tests/llm/test_06_progressive.py::test_progressive[edit_measure_L3] SKIPPED [ 89%] +tests/llm/test_07_fourpipe_e2e.py::test_fourpipe_beam_retrofit_e2e SKIPPED [ 89%] +tests/llm/test_08_measure_authoring.py::test_create_measure_with_quoted_description SKIPPED [ 90%] 
+tests/llm/test_08_measure_authoring.py::test_edit_measure_description_with_quotes SKIPPED [ 90%] +tests/llm/test_08_measure_authoring.py::test_measure_xml_intended_software_tool SKIPPED [ 90%] +tests/llm/test_08_measure_authoring.py::test_syntax_error_reported_clearly SKIPPED [ 91%] +tests/llm/test_09_tool_routing.py::test_tool_selection_baseline[create_measure] SKIPPED [ 91%] +tests/llm/test_09_tool_routing.py::test_tool_selection_baseline[view_model] SKIPPED [ 92%] +tests/llm/test_09_tool_routing.py::test_tool_selection_baseline[read_file] SKIPPED [ 92%] +tests/llm/test_09_tool_routing.py::test_tool_selection_baseline[add_baseline_system] SKIPPED [ 93%] +tests/llm/test_09_tool_routing.py::test_tool_selection_baseline_extract_eui SKIPPED [ 93%] +tests/llm/test_09_tool_routing.py::test_visualization_uses_mcp_not_script SKIPPED [ 93%] +tests/llm/test_09_tool_routing.py::test_report_uses_mcp_not_script SKIPPED [ 94%] +tests/llm/test_09_tool_routing.py::test_measure_uses_create_measure_not_create_file SKIPPED [ 94%] +tests/llm/test_09_tool_routing.py::test_read_file_uses_mcp_not_bash SKIPPED [ 95%] +tests/llm/test_09_tool_routing.py::test_hvac_measure_uses_api_reference SKIPPED [ 95%] +tests/llm/test_09_tool_routing.py::test_search_api_for_method_verification SKIPPED [ 96%] +tests/llm/test_09_tool_routing.py::test_search_wiring_patterns_for_hvac_wiring SKIPPED [ 96%] +tests/llm/test_10_confusion_pairs.py::test_qaqc_vs_validate_post_sim SKIPPED [ 96%] +tests/llm/test_10_confusion_pairs.py::test_validate_vs_qaqc_pre_sim SKIPPED [ 97%] +tests/llm/test_10_confusion_pairs.py::test_load_details_vs_space_details SKIPPED [ 97%] +tests/llm/test_10_confusion_pairs.py::test_summary_metrics_vs_end_use SKIPPED [ 98%] +tests/llm/test_10_confusion_pairs.py::test_end_use_vs_summary_metrics SKIPPED [ 98%] +tests/llm/test_10_confusion_pairs.py::test_inspect_osm_vs_model_summary SKIPPED [ 99%] +tests/llm/test_10_confusion_pairs.py::test_create_baseline_vs_new_building SKIPPED [ 99%] 
+tests/llm/test_10_confusion_pairs.py::test_apply_measure_vs_create_measure SKIPPED [100%] +====================================================================== +LLM Benchmark: 170/180 passed (94.4%) | Model: opus | 11078s +Tokens: 2.0k in + 164.4k out + 22.6M cache | Cost: $32.2343 + setup: 6/6 (100.0%) in 512s + tier1: 4/4 (100.0%) in 135s + tier2: 34/37 (91.9%) in 5344s + tier3: 19/26 (73.1%) in 1860s + tier4: 3/3 (100.0%) in 135s + progressive: 104/104 (100.0%) in 3092s +Failed: energy-report:Give me a full energy report, qaqc:Check the model for issues, qaqc:Validate before simulation, qaqc:QA/QC the model, qaqc:Is my model ready to simulate?, troubleshoot:My simulation failed, troubleshoot:Why did EnergyPlus crash?, hvac_chilled_beam_comparison, Ruby, Python +Report: C:\tmp\llm-sweep-opus\benchmark.md +History: C:\tmp\llm-sweep-opus\benchmark_history.json (1 runs) +====================================================================== + + +================================== FAILURES =================================== +____ test_eval_tool_selection[energy-report:Give me a full energy report] _____ + +case = {'expected_tools': ['extract_summary_metrics', 'extract_end_use_breakdown', 'extract_envelope_summary', 'extract_hvac_sizing', 'extract_zone_summary'], 'prompt': 'Give me a full energy report', 'skill': 'energy-report'} + + @pytest.mark.parametrize("case", EVAL_CASES, ids=[_case_id(c) for c in EVAL_CASES]) + def test_eval_tool_selection(case): + """Verify agent calls at least one expected MCP tool for an eval.md prompt.""" + # Validates: Claude selects correct tool from eval.md skill tables for natural language prompts + tier = get_tier() + if tier not in ("all", "1"): + pytest.skip("Tier 1 not selected") + + # Prepend model load for skills that need model state + prompt = case["prompt"] + if case["skill"] in NEEDS_MODEL: + if not baseline_model_exists(): + pytest.skip("Baseline model not found � run test_01_setup first") + if case["skill"] == 
"troubleshoot": + prompt = _troubleshoot_prefix() + prompt.lower() + else: + prompt = LOAD_PREFIX + prompt.lower() + prompt += SUFFIX + + timeout = SLOW_SKILLS.get(case["skill"], 120) + result = run_claude(prompt, timeout=timeout) + tool_names = result.tool_names + + # Merge eval.md expected tools with extra acceptable tools + expected = set(case["expected_tools"]) + expected.update(EXTRA_EXPECTED.get(case["skill"], [])) + +> assert any(t in expected for t in tool_names), ( + f"[{case['skill']}] Expected one of {sorted(expected)}, " + f"got: {tool_names}" + ) +E AssertionError: [energy-report] Expected one of ['extract_end_use_breakdown', 'extract_envelope_summary', 'extract_hvac_sizing', 'extract_summary_metrics', 'extract_zone_summary', 'generate_results_report'], got: ['load_osm_model', 'get_building_info', 'list_files', 'get_weather_info', 'run_simulation'] +E assert False +E + where False = any(. at 0x000001C1F3845E50>) + +tests\llm\test_03_eval_cases.py:148: AssertionError +__________ test_eval_tool_selection[qaqc:Check the model for issues] __________ + +case = {'expected_tools': ['run_qaqc_checks', 'inspect_osm_summary'], 'prompt': 'Check the model for issues', 'skill': 'qaqc'} + + @pytest.mark.parametrize("case", EVAL_CASES, ids=[_case_id(c) for c in EVAL_CASES]) + def test_eval_tool_selection(case): + """Verify agent calls at least one expected MCP tool for an eval.md prompt.""" + # Validates: Claude selects correct tool from eval.md skill tables for natural language prompts + tier = get_tier() + if tier not in ("all", "1"): + pytest.skip("Tier 1 not selected") + + # Prepend model load for skills that need model state + prompt = case["prompt"] + if case["skill"] in NEEDS_MODEL: + if not baseline_model_exists(): + pytest.skip("Baseline model not found � run test_01_setup first") + if case["skill"] == "troubleshoot": + prompt = _troubleshoot_prefix() + prompt.lower() + else: + prompt = LOAD_PREFIX + prompt.lower() + prompt += SUFFIX + + timeout = 
SLOW_SKILLS.get(case["skill"], 120) + result = run_claude(prompt, timeout=timeout) + tool_names = result.tool_names + + # Merge eval.md expected tools with extra acceptable tools + expected = set(case["expected_tools"]) + expected.update(EXTRA_EXPECTED.get(case["skill"], [])) + +> assert any(t in expected for t in tool_names), ( + f"[{case['skill']}] Expected one of {sorted(expected)}, " + f"got: {tool_names}" + ) +E AssertionError: [qaqc] Expected one of ['get_model_summary', 'inspect_osm_summary', 'run_qaqc_checks'], got: ['load_osm_model', 'validate_model'] +E assert False +E + where False = any(. at 0x000001C1F386A260>) + +tests\llm\test_03_eval_cases.py:148: AssertionError +__________ test_eval_tool_selection[qaqc:Validate before simulation] __________ + +case = {'expected_tools': ['run_qaqc_checks'], 'prompt': 'Validate before simulation', 'skill': 'qaqc'} + + @pytest.mark.parametrize("case", EVAL_CASES, ids=[_case_id(c) for c in EVAL_CASES]) + def test_eval_tool_selection(case): + """Verify agent calls at least one expected MCP tool for an eval.md prompt.""" + # Validates: Claude selects correct tool from eval.md skill tables for natural language prompts + tier = get_tier() + if tier not in ("all", "1"): + pytest.skip("Tier 1 not selected") + + # Prepend model load for skills that need model state + prompt = case["prompt"] + if case["skill"] in NEEDS_MODEL: + if not baseline_model_exists(): + pytest.skip("Baseline model not found � run test_01_setup first") + if case["skill"] == "troubleshoot": + prompt = _troubleshoot_prefix() + prompt.lower() + else: + prompt = LOAD_PREFIX + prompt.lower() + prompt += SUFFIX + + timeout = SLOW_SKILLS.get(case["skill"], 120) + result = run_claude(prompt, timeout=timeout) + tool_names = result.tool_names + + # Merge eval.md expected tools with extra acceptable tools + expected = set(case["expected_tools"]) + expected.update(EXTRA_EXPECTED.get(case["skill"], [])) + +> assert any(t in expected for t in tool_names), ( + 
f"[{case['skill']}] Expected one of {sorted(expected)}, " + f"got: {tool_names}" + ) +E AssertionError: [qaqc] Expected one of ['get_model_summary', 'inspect_osm_summary', 'run_qaqc_checks'], got: ['load_osm_model', 'validate_model'] +E assert False +E + where False = any(. at 0x000001C1F3844860>) + +tests\llm\test_03_eval_cases.py:148: AssertionError +_______________ test_eval_tool_selection[qaqc:QA/QC the model] ________________ + +case = {'expected_tools': ['run_qaqc_checks'], 'prompt': 'QA/QC the model', 'skill': 'qaqc'} + + @pytest.mark.parametrize("case", EVAL_CASES, ids=[_case_id(c) for c in EVAL_CASES]) + def test_eval_tool_selection(case): + """Verify agent calls at least one expected MCP tool for an eval.md prompt.""" + # Validates: Claude selects correct tool from eval.md skill tables for natural language prompts + tier = get_tier() + if tier not in ("all", "1"): + pytest.skip("Tier 1 not selected") + + # Prepend model load for skills that need model state + prompt = case["prompt"] + if case["skill"] in NEEDS_MODEL: + if not baseline_model_exists(): + pytest.skip("Baseline model not found � run test_01_setup first") + if case["skill"] == "troubleshoot": + prompt = _troubleshoot_prefix() + prompt.lower() + else: + prompt = LOAD_PREFIX + prompt.lower() + prompt += SUFFIX + + timeout = SLOW_SKILLS.get(case["skill"], 120) + result = run_claude(prompt, timeout=timeout) + tool_names = result.tool_names + + # Merge eval.md expected tools with extra acceptable tools + expected = set(case["expected_tools"]) + expected.update(EXTRA_EXPECTED.get(case["skill"], [])) + +> assert any(t in expected for t in tool_names), ( + f"[{case['skill']}] Expected one of {sorted(expected)}, " + f"got: {tool_names}" + ) +E AssertionError: [qaqc] Expected one of ['get_model_summary', 'inspect_osm_summary', 'run_qaqc_checks'], got: ['load_osm_model', 'validate_model'] +E assert False +E + where False = any(. 
at 0x000001C1F386BD30>) + +tests\llm\test_03_eval_cases.py:148: AssertionError +________ test_eval_tool_selection[qaqc:Is my model ready to simulate?] ________ + +case = {'expected_tools': ['inspect_osm_summary', 'run_qaqc_checks'], 'prompt': 'Is my model ready to simulate?', 'skill': 'qaqc'} + + @pytest.mark.parametrize("case", EVAL_CASES, ids=[_case_id(c) for c in EVAL_CASES]) + def test_eval_tool_selection(case): + """Verify agent calls at least one expected MCP tool for an eval.md prompt.""" + # Validates: Claude selects correct tool from eval.md skill tables for natural language prompts + tier = get_tier() + if tier not in ("all", "1"): + pytest.skip("Tier 1 not selected") + + # Prepend model load for skills that need model state + prompt = case["prompt"] + if case["skill"] in NEEDS_MODEL: + if not baseline_model_exists(): + pytest.skip("Baseline model not found � run test_01_setup first") + if case["skill"] == "troubleshoot": + prompt = _troubleshoot_prefix() + prompt.lower() + else: + prompt = LOAD_PREFIX + prompt.lower() + prompt += SUFFIX + + timeout = SLOW_SKILLS.get(case["skill"], 120) + result = run_claude(prompt, timeout=timeout) + tool_names = result.tool_names + + # Merge eval.md expected tools with extra acceptable tools + expected = set(case["expected_tools"]) + expected.update(EXTRA_EXPECTED.get(case["skill"], [])) + +> assert any(t in expected for t in tool_names), ( + f"[{case['skill']}] Expected one of {sorted(expected)}, " + f"got: {tool_names}" + ) +E AssertionError: [qaqc] Expected one of ['get_model_summary', 'inspect_osm_summary', 'run_qaqc_checks'], got: ['load_osm_model', 'validate_model'] +E assert False +E + where False = any(. 
at 0x000001C1F38C0A00>) + +tests\llm\test_03_eval_cases.py:148: AssertionError +_________ test_eval_tool_selection[troubleshoot:My simulation failed] _________ + +case = {'expected_tools': ['get_run_status', 'get_run_logs'], 'prompt': 'My simulation failed', 'skill': 'troubleshoot'} + + @pytest.mark.parametrize("case", EVAL_CASES, ids=[_case_id(c) for c in EVAL_CASES]) + def test_eval_tool_selection(case): + """Verify agent calls at least one expected MCP tool for an eval.md prompt.""" + # Validates: Claude selects correct tool from eval.md skill tables for natural language prompts + tier = get_tier() + if tier not in ("all", "1"): + pytest.skip("Tier 1 not selected") + + # Prepend model load for skills that need model state + prompt = case["prompt"] + if case["skill"] in NEEDS_MODEL: + if not baseline_model_exists(): + pytest.skip("Baseline model not found � run test_01_setup first") + if case["skill"] == "troubleshoot": + prompt = _troubleshoot_prefix() + prompt.lower() + else: + prompt = LOAD_PREFIX + prompt.lower() + prompt += SUFFIX + + timeout = SLOW_SKILLS.get(case["skill"], 120) + result = run_claude(prompt, timeout=timeout) + tool_names = result.tool_names + + # Merge eval.md expected tools with extra acceptable tools + expected = set(case["expected_tools"]) + expected.update(EXTRA_EXPECTED.get(case["skill"], [])) + +> assert any(t in expected for t in tool_names), ( + f"[{case['skill']}] Expected one of {sorted(expected)}, " + f"got: {tool_names}" + ) +E AssertionError: [troubleshoot] Expected one of ['extract_component_sizing', 'extract_summary_metrics', 'get_building_info', 'get_model_summary', 'get_run_logs', 'get_run_status', 'inspect_osm_summary', 'list_files', 'list_thermal_zones', 'run_simulation'], got: ['load_osm_model', 'extract_simulation_errors', 'list_weather_files'] +E assert False +E + where False = any(. 
at 0x000001C1F39097D0>) + +tests\llm\test_03_eval_cases.py:148: AssertionError +______ test_eval_tool_selection[troubleshoot:Why did EnergyPlus crash?] _______ + +case = {'expected_tools': ['get_run_logs'], 'prompt': 'Why did EnergyPlus crash?', 'skill': 'troubleshoot'} + + @pytest.mark.parametrize("case", EVAL_CASES, ids=[_case_id(c) for c in EVAL_CASES]) + def test_eval_tool_selection(case): + """Verify agent calls at least one expected MCP tool for an eval.md prompt.""" + # Validates: Claude selects correct tool from eval.md skill tables for natural language prompts + tier = get_tier() + if tier not in ("all", "1"): + pytest.skip("Tier 1 not selected") + + # Prepend model load for skills that need model state + prompt = case["prompt"] + if case["skill"] in NEEDS_MODEL: + if not baseline_model_exists(): + pytest.skip("Baseline model not found � run test_01_setup first") + if case["skill"] == "troubleshoot": + prompt = _troubleshoot_prefix() + prompt.lower() + else: + prompt = LOAD_PREFIX + prompt.lower() + prompt += SUFFIX + + timeout = SLOW_SKILLS.get(case["skill"], 120) + result = run_claude(prompt, timeout=timeout) + tool_names = result.tool_names + + # Merge eval.md expected tools with extra acceptable tools + expected = set(case["expected_tools"]) + expected.update(EXTRA_EXPECTED.get(case["skill"], [])) + +> assert any(t in expected for t in tool_names), ( + f"[{case['skill']}] Expected one of {sorted(expected)}, " + f"got: {tool_names}" + ) +E AssertionError: [troubleshoot] Expected one of ['extract_component_sizing', 'extract_summary_metrics', 'get_building_info', 'get_model_summary', 'get_run_logs', 'get_run_status', 'inspect_osm_summary', 'list_files', 'list_thermal_zones', 'run_simulation'], got: ['load_osm_model', 'extract_simulation_errors'] +E assert False +E + where False = any(. 
at 0x000001C1F38475E0>) + +tests\llm\test_03_eval_cases.py:148: AssertionError +_________________ test_workflow[hvac_chilled_beam_comparison] _________________ + +case = {'any_of': ['extract_end_use_breakdown', 'extract_summary_metrics'], 'id': 'hvac_chilled_beam_comparison', 'max_turns'...g replace_air_terminals. Save the model and run a simulation. Extract the end use breakdown. Use MCP tools only.', ...} + + @pytest.mark.parametrize("case", WORKFLOW_CASES, ids=[c["id"] for c in WORKFLOW_CASES]) + def test_workflow(case): + """Agent loads model and completes a multi-step workflow.""" + # Validates: Claude chains all required MCP tools for multi-step BEM workflows + tier = get_tier() + if tier not in ("all", "2"): + pytest.skip("Tier 2 not selected") + + # Build prompt for needs_run cases + prompt = case["prompt"] + if case.get("needs_run"): + run_id = get_sim_run_id() + if not run_id: + pytest.skip("No simulation run_id � run test_01_setup first") + prompt = ( + f"Extract results from simulation run '{run_id}'. " + "First extract summary metrics using extract_summary_metrics. " + "Then extract end use breakdown using extract_end_use_breakdown. " + "Use MCP tools only." + ) + elif BASELINE_HVAC_MODEL in prompt and not baseline_hvac_model_exists(): + pytest.skip("Baseline+HVAC model not found � run test_01_setup first") + elif BASELINE_MODEL in prompt and not baseline_model_exists(): + pytest.skip("Baseline model not found � run test_01_setup first") + + result = run_claude( + prompt, + timeout=case.get("timeout", 120), + max_turns=case.get("max_turns"), + ) + tool_names = result.tool_names + + for tool in case["required_tools"]: + assert tool in tool_names, ( + f"Required tool '{tool}' not found. Tools: {tool_names}" + ) + + if "any_of" in case: +> assert any(t in tool_names for t in case["any_of"]), ( + f"None of {case['any_of']} found. Tools: {tool_names}" + ) +E AssertionError: None of ['extract_end_use_breakdown', 'extract_summary_metrics'] found. 
Tools: ['load_osm_model', 'list_air_loops', 'get_air_loop_details', 'replace_air_terminals', 'save_osm_model', 'run_simulation', 'get_run_status', 'list_weather_files', 'change_building_location', 'save_osm_model', 'run_simulation'] +E assert False +E + where False = any(. at 0x000001C1F3916740>) + +tests\llm\test_04_workflows.py:629: AssertionError +________________ test_measure_boiler_efficiency_quality[Ruby] _________________ + +language = 'Ruby' + + @pytest.mark.parametrize("language", ["Ruby", "Python"]) + def test_measure_boiler_efficiency_quality(language): + """LLM creates a well-parameterized boiler efficiency measure.""" + # Validates: Claude creates boiler efficiency measures with Choice/Double/Boolean args and correct body references + tier = get_tier() + if tier not in ("all", "2"): + pytest.skip("Tier 2 not selected") + + prompt = ( + f"Create a {language} ModelMeasure that upgrades hot water boiler " + "efficiency. It must have these arguments:\n" + " - target_efficiency: Double, default 0.95\n" + " - fuel_type_filter: Choice (All, NaturalGas, Electricity)\n" + " - skip_if_above_target: Boolean, default true\n" + "The measure should iterate BoilerHotWater objects, optionally " + "filter by fuel type, skip boilers already at or above the target " + "efficiency if the boolean is set, and call " + "setNominalThermalEfficiency on the rest. " + f"Use create_measure with language {language}. Use MCP tools only." 
+ ) + result = run_claude(prompt, timeout=300, max_turns=15) +> _check_measure_args_quality( + result, + expected_language=language, + expected_arg_types={"Choice", "Double", "Boolean"}, + body_keywords=_BOILER_BODY_KEYWORDS, + label=f"boiler_{language}", + ) + +tests\llm\test_04_workflows.py:926: +_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ + +result = + + def _check_measure_args_quality( + result, *, expected_language, expected_arg_types, + body_keywords, label, + ): + """Shared quality checks for measure-with-args tests. + + Args: + result: ClaudeResult from run_claude + expected_language: "Ruby" or "Python" (case-insensitive match) + expected_arg_types: set of required arg types, e.g. {"Choice", "Double", "Boolean"} + body_keywords: list of strings � at least one must appear in run_body + label: human-readable test label for assertion messages + """ + tool_names = result.tool_names + assert "create_measure" in tool_names, ( + f"[{label}] Missing create_measure. Tools: {tool_names}" + ) + + create_input = _find_create_measure_input(result) + assert create_input, f"[{label}] create_measure call not found in MCP tool calls" + + # Language check + lang = create_input.get("language", "") + assert lang.lower() == expected_language.lower(), ( + f"[{label}] Expected language={expected_language}, got {lang}" + ) + + args = _parse_args(create_input) + run_body = create_input.get("run_body", "") + + # 1. Has arguments + assert args and len(args) > 0, ( + f"[{label}] No arguments � LLM hard-coded all values" + ) + + # 2. Required argument types present + arg_types = {a.get("type", "") for a in args} + for t in expected_arg_types: + assert t in arg_types, ( + f"[{label}] Missing arg type {t}. Types found: {arg_types}" + ) + + # 3. 
Choice arg has values list + for a in args: + if a.get("type") == "Choice": + vals = a.get("values", []) +> assert len(vals) >= 2, ( + f"[{label}] Choice arg '{a.get('name')}' needs >=2 values, " + f"got {vals}" + ) +E AssertionError: [boiler_Ruby] Choice arg 'fuel_type_filter' needs >=2 values, got [] +E assert 0 >= 2 +E + where 0 = len([]) + +tests\llm\test_04_workflows.py:822: AssertionError +_______________ test_measure_boiler_efficiency_quality[Python] ________________ + +language = 'Python' + + @pytest.mark.parametrize("language", ["Ruby", "Python"]) + def test_measure_boiler_efficiency_quality(language): + """LLM creates a well-parameterized boiler efficiency measure.""" + # Validates: Claude creates boiler efficiency measures with Choice/Double/Boolean args and correct body references + tier = get_tier() + if tier not in ("all", "2"): + pytest.skip("Tier 2 not selected") + + prompt = ( + f"Create a {language} ModelMeasure that upgrades hot water boiler " + "efficiency. It must have these arguments:\n" + " - target_efficiency: Double, default 0.95\n" + " - fuel_type_filter: Choice (All, NaturalGas, Electricity)\n" + " - skip_if_above_target: Boolean, default true\n" + "The measure should iterate BoilerHotWater objects, optionally " + "filter by fuel type, skip boilers already at or above the target " + "efficiency if the boolean is set, and call " + "setNominalThermalEfficiency on the rest. " + f"Use create_measure with language {language}. Use MCP tools only." 
+ ) + result = run_claude(prompt, timeout=300, max_turns=15) +> _check_measure_args_quality( + result, + expected_language=language, + expected_arg_types={"Choice", "Double", "Boolean"}, + body_keywords=_BOILER_BODY_KEYWORDS, + label=f"boiler_{language}", + ) + +tests\llm\test_04_workflows.py:926: +_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ + +result = + + def _check_measure_args_quality( + result, *, expected_language, expected_arg_types, + body_keywords, label, + ): + """Shared quality checks for measure-with-args tests. + + Args: + result: ClaudeResult from run_claude + expected_language: "Ruby" or "Python" (case-insensitive match) + expected_arg_types: set of required arg types, e.g. {"Choice", "Double", "Boolean"} + body_keywords: list of strings � at least one must appear in run_body + label: human-readable test label for assertion messages + """ + tool_names = result.tool_names + assert "create_measure" in tool_names, ( + f"[{label}] Missing create_measure. Tools: {tool_names}" + ) + + create_input = _find_create_measure_input(result) + assert create_input, f"[{label}] create_measure call not found in MCP tool calls" + + # Language check + lang = create_input.get("language", "") + assert lang.lower() == expected_language.lower(), ( + f"[{label}] Expected language={expected_language}, got {lang}" + ) + + args = _parse_args(create_input) + run_body = create_input.get("run_body", "") + + # 1. Has arguments + assert args and len(args) > 0, ( + f"[{label}] No arguments � LLM hard-coded all values" + ) + + # 2. Required argument types present + arg_types = {a.get("type", "") for a in args} + for t in expected_arg_types: + assert t in arg_types, ( + f"[{label}] Missing arg type {t}. Types found: {arg_types}" + ) + + # 3. 
Choice arg has values list + for a in args: + if a.get("type") == "Choice": + vals = a.get("values", []) +> assert len(vals) >= 2, ( + f"[{label}] Choice arg '{a.get('name')}' needs >=2 values, " + f"got {vals}" + ) +E AssertionError: [boiler_Python] Choice arg 'fuel_type_filter' needs >=2 values, got [] +E assert 0 >= 2 +E + where 0 = len([]) + +tests\llm\test_04_workflows.py:822: AssertionError +=========================== short test summary info =========================== +FAILED tests/llm/test_03_eval_cases.py::test_eval_tool_selection[energy-report:Give me a full energy report] +FAILED tests/llm/test_03_eval_cases.py::test_eval_tool_selection[qaqc:Check the model for issues] +FAILED tests/llm/test_03_eval_cases.py::test_eval_tool_selection[qaqc:Validate before simulation] +FAILED tests/llm/test_03_eval_cases.py::test_eval_tool_selection[qaqc:QA/QC the model] +FAILED tests/llm/test_03_eval_cases.py::test_eval_tool_selection[qaqc:Is my model ready to simulate?] +FAILED tests/llm/test_03_eval_cases.py::test_eval_tool_selection[troubleshoot:My simulation failed] +FAILED tests/llm/test_03_eval_cases.py::test_eval_tool_selection[troubleshoot:Why did EnergyPlus crash?] 
+FAILED tests/llm/test_04_workflows.py::test_workflow[hvac_chilled_beam_comparison] +FAILED tests/llm/test_04_workflows.py::test_measure_boiler_efficiency_quality[Ruby] +FAILED tests/llm/test_04_workflows.py::test_measure_boiler_efficiency_quality[Python] +========== 10 failed, 170 passed, 50 skipped in 11080.02s (3:04:40) =========== diff --git a/docs/sweeps/sonnet-2026-03-28/benchmark.json b/docs/sweeps/sonnet-2026-03-28/benchmark.json new file mode 100644 index 0000000..e506632 --- /dev/null +++ b/docs/sweeps/sonnet-2026-03-28/benchmark.json @@ -0,0 +1,5819 @@ +{ + "timestamp": "2026-03-28T17:06:27+00:00", + "model": "sonnet", + "retries": 0, + "total_tests": 180, + "passed": 170, + "failed": 10, + "pass_rate": 94.4, + "total_duration_s": 9452.9, + "total_input_tokens": 1959, + "total_output_tokens": 250127, + "total_cache_read_tokens": 20447621, + "total_cost_usd": 18.9595, + "tiers": { + "setup": { + "total": 6, + "passed": 6, + "duration_s": 420.6, + "pass_rate": 100.0 + }, + "tier1": { + "total": 4, + "passed": 4, + "duration_s": 130.0, + "pass_rate": 100.0 + }, + "tier3": { + "total": 26, + "passed": 21, + "duration_s": 1702.9, + "pass_rate": 80.8 + }, + "tier2": { + "total": 37, + "passed": 33, + "duration_s": 3600.4, + "pass_rate": 89.2 + }, + "tier4": { + "total": 3, + "passed": 3, + "duration_s": 202.8, + "pass_rate": 100.0 + }, + "progressive": { + "total": 104, + "passed": 103, + "duration_s": 3396.2, + "pass_rate": 99.0 + } + }, + "tests": [ + { + "test_id": "tests/llm/test_01_setup.py::test_create_baseline_model", + "passed": true, + "duration_s": 11.3, + "tier": "setup", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.06297675, + "duration_ms": 8256, + "input_tokens": 7, + "output_tokens": 330, + "cache_read_tokens": 44515, + "tool_calls": [ + "create_baseline_osm" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__create_baseline_osm" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": 
"tests/llm/test_01_setup.py::test_create_baseline_with_hvac", + "passed": true, + "duration_s": 15.2, + "tier": "setup", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.0600585, + "duration_ms": 13099, + "input_tokens": 7, + "output_tokens": 389, + "cache_read_tokens": 45750, + "tool_calls": [ + "create_baseline_osm" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__create_baseline_osm" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_01_setup.py::test_create_example_model", + "passed": true, + "duration_s": 10.8, + "tier": "setup", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.0571248, + "duration_ms": 8650, + "input_tokens": 7, + "output_tokens": 292, + "cache_read_tokens": 45446, + "tool_calls": [ + "create_example_osm" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__create_example_osm" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_01_setup.py::test_load_baseline_model", + "passed": true, + "duration_s": 13.3, + "tier": "setup", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.07076775, + "duration_ms": 11294, + "input_tokens": 8, + "output_tokens": 412, + "cache_read_tokens": 64350, + "tool_calls": [ + "load_osm_model", + "list_thermal_zones" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_thermal_zones" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_01_setup.py::test_run_baseline_simulation", + "passed": true, + "duration_s": 235.9, + "tier": "setup", + "attempt": 1, + "num_turns": 12, + "cost_usd": 0.1500489, + "duration_ms": 233832, + "input_tokens": 18, + "output_tokens": 1666, + "cache_read_tokens": 236233, + "tool_calls": [ + "load_osm_model", + "change_building_location", + "run_simulation", + "get_run_status", + "save_osm_model", + "run_simulation", + "get_run_status" + 
], + "num_tool_calls": 7, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__change_building_location", + "mcp__openstudio__run_simulation", + "Bash", + "mcp__openstudio__get_run_status", + "ToolSearch", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "Bash", + "mcp__openstudio__get_run_status" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_01_setup.py::test_run_retrofit_simulation", + "passed": true, + "duration_s": 134.1, + "tier": "setup", + "attempt": 1, + "num_turns": 8, + "cost_usd": 0.1210335, + "duration_ms": 131511, + "input_tokens": 12, + "output_tokens": 1536, + "cache_read_tokens": 152450, + "tool_calls": [ + "load_osm_model", + "change_building_location", + "adjust_thermostat_setpoints", + "run_simulation", + "get_run_status" + ], + "num_tool_calls": 5, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__change_building_location", + "mcp__openstudio__adjust_thermostat_setpoints", + "mcp__openstudio__run_simulation", + "Bash", + "mcp__openstudio__get_run_status" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_02_tool_selection.py::test_tool_selection_no_model[What is the server status?]", + "passed": true, + "duration_s": 9.0, + "tier": "tier1", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.056742600000000004, + "duration_ms": 6445, + "input_tokens": 7, + "output_tokens": 270, + "cache_read_tokens": 45072, + "tool_calls": [ + "get_server_status" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__get_server_status" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_02_tool_selection.py::test_tool_selection_no_model[List available skills]", + "passed": true, + "duration_s": 12.6, + "tier": "tier1", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.06104895, + "duration_ms": 10427, + 
"input_tokens": 7, + "output_tokens": 445, + "cache_read_tokens": 45364, + "tool_calls": [ + "list_skills" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__list_skills" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_02_tool_selection.py::test_tool_selection_no_model[Create a small office building usin]", + "passed": true, + "duration_s": 90.1, + "tier": "tier1", + "attempt": 1, + "num_turns": 0, + "cost_usd": 0.0, + "duration_ms": 0, + "input_tokens": 0, + "output_tokens": 0, + "cache_read_tokens": 0, + "tool_calls": [ + "create_new_building", + "list_weather_files", + "create_new_building", + "create_new_building" + ], + "num_tool_calls": 4, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__create_new_building", + "ToolSearch", + "mcp__openstudio__list_weather_files", + "mcp__openstudio__create_new_building", + "mcp__openstudio__create_new_building" + ], + "toolsearch_count": 2, + "is_timeout": true + }, + { + "test_id": "tests/llm/test_02_tool_selection.py::test_tool_selection_no_model[Create bar geometry for a retail bu]", + "passed": true, + "duration_s": 18.3, + "tier": "tier1", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.0693171, + "duration_ms": 16249, + "input_tokens": 7, + "output_tokens": 556, + "cache_read_tokens": 46112, + "tool_calls": [ + "create_bar_building" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__create_bar_building" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_03_eval_cases.py::test_eval_tool_selection[add-hvac:Add HVAC to the model]", + "passed": true, + "duration_s": 42.0, + "tier": "tier3", + "attempt": 1, + "num_turns": 15, + "cost_usd": 0.16269540000000005, + "duration_ms": 39736, + "input_tokens": 23, + "output_tokens": 1858, + "cache_read_tokens": 222863, + "tool_calls": [ + "load_osm_model", + "get_building_info", + "list_thermal_zones", + 
"add_baseline_system", + "list_air_loops", + "list_plant_loops", + "save_osm_model" + ], + "num_tool_calls": 7, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "Skill", + "ToolSearch", + "mcp__openstudio__get_building_info", + "ToolSearch", + "mcp__openstudio__list_thermal_zones", + "ToolSearch", + "mcp__openstudio__add_baseline_system", + "ToolSearch", + "mcp__openstudio__list_air_loops", + "mcp__openstudio__list_plant_loops", + "mcp__openstudio__save_osm_model" + ], + "toolsearch_count": 5, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_03_eval_cases.py::test_eval_tool_selection[add-hvac:Set up heating and cooling]", + "passed": true, + "duration_s": 29.9, + "tier": "tier3", + "attempt": 1, + "num_turns": 8, + "cost_usd": 0.09760005, + "duration_ms": 27883, + "input_tokens": 13, + "output_tokens": 1164, + "cache_read_tokens": 104416, + "tool_calls": [ + "load_osm_model", + "get_building_info", + "list_thermal_zones" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "Skill", + "ToolSearch", + "mcp__openstudio__get_building_info", + "mcp__openstudio__list_thermal_zones" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_03_eval_cases.py::test_eval_tool_selection[add-hvac:What HVAC system should I use?]", + "passed": true, + "duration_s": 53.3, + "tier": "tier3", + "attempt": 1, + "num_turns": 7, + "cost_usd": 0.12399779999999999, + "duration_ms": 51338, + "input_tokens": 10, + "output_tokens": 2861, + "cache_read_tokens": 85801, + "tool_calls": [ + "load_osm_model", + "get_building_info", + "list_thermal_zones" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__get_building_info", + "mcp__openstudio__list_thermal_zones", + "Skill" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": 
"tests/llm/test_03_eval_cases.py::test_eval_tool_selection[add-hvac:Add a VAV system]", + "passed": true, + "duration_s": 17.0, + "tier": "tier3", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.0861888, + "duration_ms": 14988, + "input_tokens": 9, + "output_tokens": 792, + "cache_read_tokens": 86156, + "tool_calls": [ + "load_osm_model", + "list_thermal_zones", + "add_baseline_system" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_thermal_zones", + "mcp__openstudio__add_baseline_system" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_03_eval_cases.py::test_eval_tool_selection[energy-report:Give me a full energy report]", + "passed": false, + "duration_s": 120.2, + "tier": "tier3", + "attempt": 1, + "num_turns": 0, + "cost_usd": 0.0, + "duration_ms": 0, + "input_tokens": 0, + "output_tokens": 0, + "cache_read_tokens": 0, + "tool_calls": [ + "load_osm_model", + "list_files", + "get_building_info", + "get_model_summary", + "get_weather_info", + "run_simulation" + ], + "num_tool_calls": 6, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__list_files", + "mcp__openstudio__get_building_info", + "mcp__openstudio__get_model_summary", + "ToolSearch", + "mcp__openstudio__get_weather_info", + "mcp__openstudio__run_simulation", + "ToolSearch", + "Bash" + ], + "toolsearch_count": 4, + "is_timeout": true, + "failure_mode": "timeout" + }, + { + "test_id": "tests/llm/test_03_eval_cases.py::test_eval_tool_selection[new-building:Create a small office building]", + "passed": true, + "duration_s": 54.8, + "tier": "tier3", + "attempt": 1, + "num_turns": 11, + "cost_usd": 0.17865390000000003, + "duration_ms": 52695, + "input_tokens": 23, + "output_tokens": 1394, + "cache_read_tokens": 244733, + "tool_calls": [ + "list_skills", + "get_skill", + "list_weather_files", + "create_new_building", + 
"save_osm_model" + ], + "num_tool_calls": 5, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__list_skills", + "ToolSearch", + "mcp__openstudio__get_skill", + "ToolSearch", + "mcp__openstudio__list_weather_files", + "ToolSearch", + "mcp__openstudio__create_new_building", + "ToolSearch", + "mcp__openstudio__save_osm_model" + ], + "toolsearch_count": 5, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_03_eval_cases.py::test_eval_tool_selection[new-building:Model a 3-story school]", + "passed": true, + "duration_s": 138.1, + "tier": "tier3", + "attempt": 1, + "num_turns": 11, + "cost_usd": 0.2104218, + "duration_ms": 135955, + "input_tokens": 17, + "output_tokens": 3158, + "cache_read_tokens": 200611, + "tool_calls": [ + "list_skills", + "get_server_status", + "get_skill", + "list_weather_files", + "create_new_building", + "save_osm_model", + "get_model_summary" + ], + "num_tool_calls": 7, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__list_skills", + "mcp__openstudio__get_server_status", + "ToolSearch", + "mcp__openstudio__get_skill", + "ToolSearch", + "mcp__openstudio__list_weather_files", + "mcp__openstudio__create_new_building", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__get_model_summary" + ], + "toolsearch_count": 3, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_03_eval_cases.py::test_eval_tool_selection[new-building:Create a retail building, 25000 sqf]", + "passed": true, + "duration_s": 180.1, + "tier": "tier3", + "attempt": 1, + "num_turns": 0, + "cost_usd": 0.0, + "duration_ms": 0, + "input_tokens": 0, + "output_tokens": 0, + "cache_read_tokens": 0, + "tool_calls": [ + "get_server_status", + "list_skills", + "get_skill", + "list_weather_files", + "create_new_building", + "change_building_location", + "create_typical_building", + "create_typical_building", + "list_thermal_zones", + "add_baseline_system", + "list_baseline_systems" + ], + "num_tool_calls": 11, + "all_tool_calls": [ + "ToolSearch", + 
"mcp__openstudio__get_server_status", + "mcp__openstudio__list_skills", + "ToolSearch", + "mcp__openstudio__get_skill", + "ToolSearch", + "mcp__openstudio__list_weather_files", + "mcp__openstudio__create_new_building", + "ToolSearch", + "mcp__openstudio__change_building_location", + "mcp__openstudio__create_typical_building", + "mcp__openstudio__create_typical_building", + "ToolSearch", + "mcp__openstudio__list_thermal_zones", + "ToolSearch", + "mcp__openstudio__add_baseline_system", + "ToolSearch", + "mcp__openstudio__list_baseline_systems" + ], + "toolsearch_count": 7, + "is_timeout": true + }, + { + "test_id": "tests/llm/test_03_eval_cases.py::test_eval_tool_selection[new-building:Import the FloorspaceJS floor plan ]", + "passed": true, + "duration_s": 24.5, + "tier": "tier3", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.09687345, + "duration_ms": 22502, + "input_tokens": 12, + "output_tokens": 860, + "cache_read_tokens": 106129, + "tool_calls": [ + "import_floorspacejs", + "list_files", + "import_floorspacejs" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__import_floorspacejs", + "ToolSearch", + "mcp__openstudio__list_files", + "mcp__openstudio__import_floorspacejs" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_03_eval_cases.py::test_eval_tool_selection[new-building:Create a bar building for a medium ]", + "passed": true, + "duration_s": 19.5, + "tier": "tier3", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.07245194999999999, + "duration_ms": 17357, + "input_tokens": 7, + "output_tokens": 566, + "cache_read_tokens": 46124, + "tool_calls": [ + "create_bar_building" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__create_bar_building" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_03_eval_cases.py::test_eval_tool_selection[qaqc:Check the model for issues]", + "passed": false, + "duration_s": 
20.7, + "tier": "tier3", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.07575555, + "duration_ms": 18721, + "input_tokens": 11, + "output_tokens": 548, + "cache_read_tokens": 84746, + "tool_calls": [ + "load_osm_model", + "validate_model" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__validate_model" + ], + "toolsearch_count": 2, + "is_timeout": false, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_03_eval_cases.py::test_eval_tool_selection[qaqc:Validate before simulation]", + "passed": false, + "duration_s": 17.5, + "tier": "tier3", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.07751190000000001, + "duration_ms": 15551, + "input_tokens": 11, + "output_tokens": 500, + "cache_read_tokens": 83963, + "tool_calls": [ + "load_osm_model", + "validate_model" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__validate_model" + ], + "toolsearch_count": 2, + "is_timeout": false, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_03_eval_cases.py::test_eval_tool_selection[qaqc:QA/QC the model]", + "passed": true, + "duration_s": 55.9, + "tier": "tier3", + "attempt": 1, + "num_turns": 14, + "cost_usd": 0.15514395, + "duration_ms": 53801, + "input_tokens": 17, + "output_tokens": 2749, + "cache_read_tokens": 149689, + "tool_calls": [ + "load_osm_model", + "validate_model", + "get_model_summary", + "get_building_info", + "list_thermal_zones", + "list_spaces", + "get_weather_info", + "get_run_period" + ], + "num_tool_calls": 8, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "Skill", + "ToolSearch", + "mcp__openstudio__validate_model", + "mcp__openstudio__get_model_summary", + "mcp__openstudio__get_building_info", + "ToolSearch", + "mcp__openstudio__list_thermal_zones", + "mcp__openstudio__list_spaces", + 
"mcp__openstudio__get_weather_info", + "mcp__openstudio__get_run_period" + ], + "toolsearch_count": 3, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_03_eval_cases.py::test_eval_tool_selection[qaqc:Is my model ready to simulate?]", + "passed": true, + "duration_s": 50.6, + "tier": "tier3", + "attempt": 1, + "num_turns": 14, + "cost_usd": 0.1273704, + "duration_ms": 48629, + "input_tokens": 17, + "output_tokens": 1409, + "cache_read_tokens": 148048, + "tool_calls": [ + "load_osm_model", + "validate_model", + "get_model_summary", + "get_building_info", + "get_weather_info", + "get_run_period", + "list_thermal_zones", + "list_spaces" + ], + "num_tool_calls": 8, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "Skill", + "ToolSearch", + "mcp__openstudio__validate_model", + "mcp__openstudio__get_model_summary", + "mcp__openstudio__get_building_info", + "mcp__openstudio__get_weather_info", + "mcp__openstudio__get_run_period", + "ToolSearch", + "mcp__openstudio__list_thermal_zones", + "mcp__openstudio__list_spaces" + ], + "toolsearch_count": 3, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_03_eval_cases.py::test_eval_tool_selection[retrofit:Compare before and after adding ins]", + "passed": true, + "duration_s": 180.1, + "tier": "tier3", + "attempt": 1, + "num_turns": 0, + "cost_usd": 0.0, + "duration_ms": 0, + "input_tokens": 0, + "output_tokens": 0, + "cache_read_tokens": 0, + "tool_calls": [ + "load_osm_model", + "get_model_summary", + "list_model_objects", + "list_common_measures", + "get_construction_details", + "get_construction_details", + "list_comstock_measures", + "list_measure_arguments", + "list_measure_arguments", + "list_measure_arguments", + "list_measure_arguments", + "apply_measure", + "apply_measure", + "get_object_fields", + "get_object_fields", + "list_materials", + "set_object_property", + "set_object_property", + "get_construction_details", + "get_construction_details", + "save_osm_model" + ], + 
"num_tool_calls": 21, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__get_model_summary", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__list_common_measures", + "mcp__openstudio__get_construction_details", + "mcp__openstudio__get_construction_details", + "mcp__openstudio__list_comstock_measures", + "ToolSearch", + "mcp__openstudio__list_measure_arguments", + "mcp__openstudio__list_measure_arguments", + "ToolSearch", + "mcp__openstudio__list_measure_arguments", + "mcp__openstudio__list_measure_arguments", + "mcp__openstudio__apply_measure", + "mcp__openstudio__apply_measure", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__get_object_fields", + "mcp__openstudio__get_object_fields", + "mcp__openstudio__list_materials", + "mcp__openstudio__set_object_property", + "mcp__openstudio__set_object_property", + "mcp__openstudio__get_construction_details", + "mcp__openstudio__get_construction_details", + "ToolSearch", + "mcp__openstudio__save_osm_model" + ], + "toolsearch_count": 7, + "is_timeout": true + }, + { + "test_id": "tests/llm/test_03_eval_cases.py::test_eval_tool_selection[retrofit:Do a retrofit analysis]", + "passed": true, + "duration_s": 180.1, + "tier": "tier3", + "attempt": 1, + "num_turns": 0, + "cost_usd": 0.0, + "duration_ms": 0, + "input_tokens": 0, + "output_tokens": 0, + "cache_read_tokens": 0, + "tool_calls": [ + "load_osm_model", + "list_skills", + "get_skill", + "get_building_info", + "get_model_summary", + "list_air_loops", + "get_weather_info", + "list_model_objects", + "save_osm_model", + "run_simulation", + "adjust_thermostat_setpoints", + "add_rooftop_pv", + "list_model_objects", + "shift_schedule_time", + "save_osm_model", + "get_run_status", + "run_simulation", + "extract_summary_metrics", + "extract_end_use_breakdown", + "get_run_status", + "extract_summary_metrics", + "extract_end_use_breakdown", + "compare_runs", + "generate_results_report", + 
"generate_results_report" + ], + "num_tool_calls": 25, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_skills", + "ToolSearch", + "mcp__openstudio__get_skill", + "mcp__openstudio__get_building_info", + "mcp__openstudio__get_model_summary", + "ToolSearch", + "mcp__openstudio__list_air_loops", + "mcp__openstudio__get_weather_info", + "mcp__openstudio__list_model_objects", + "ToolSearch", + "mcp__openstudio__save_osm_model", + "ToolSearch", + "mcp__openstudio__run_simulation", + "ToolSearch", + "mcp__openstudio__adjust_thermostat_setpoints", + "mcp__openstudio__add_rooftop_pv", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__shift_schedule_time", + "mcp__openstudio__save_osm_model", + "ToolSearch", + "mcp__openstudio__get_run_status", + "mcp__openstudio__run_simulation", + "ToolSearch", + "mcp__openstudio__extract_summary_metrics", + "mcp__openstudio__extract_end_use_breakdown", + "mcp__openstudio__get_run_status", + "mcp__openstudio__extract_summary_metrics", + "mcp__openstudio__extract_end_use_breakdown", + "ToolSearch", + "mcp__openstudio__compare_runs", + "ToolSearch", + "mcp__openstudio__generate_results_report", + "mcp__openstudio__generate_results_report" + ], + "toolsearch_count": 10, + "is_timeout": true + }, + { + "test_id": "tests/llm/test_03_eval_cases.py::test_eval_tool_selection[simulate:Run a simulation]", + "passed": true, + "duration_s": 22.3, + "tier": "tier3", + "attempt": 1, + "num_turns": 7, + "cost_usd": 0.09756165, + "duration_ms": 19822, + "input_tokens": 13, + "output_tokens": 838, + "cache_read_tokens": 123338, + "tool_calls": [ + "load_osm_model", + "run_simulation", + "get_run_status", + "get_run_status" + ], + "num_tool_calls": 4, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__run_simulation", + "ToolSearch", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status" + ], + "toolsearch_count": 2, + "is_timeout": 
false + }, + { + "test_id": "tests/llm/test_03_eval_cases.py::test_eval_tool_selection[simulate:Simulate the model]", + "passed": true, + "duration_s": 116.1, + "tier": "tier3", + "attempt": 1, + "num_turns": 8, + "cost_usd": 0.10370729999999999, + "duration_ms": 113972, + "input_tokens": 14, + "output_tokens": 981, + "cache_read_tokens": 144601, + "tool_calls": [ + "load_osm_model", + "run_simulation", + "get_run_status", + "get_run_status" + ], + "num_tool_calls": 4, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__run_simulation", + "ToolSearch", + "mcp__openstudio__get_run_status", + "Bash", + "mcp__openstudio__get_run_status" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_03_eval_cases.py::test_eval_tool_selection[simulate:Run EnergyPlus]", + "passed": true, + "duration_s": 26.8, + "tier": "tier3", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.08936415000000002, + "duration_ms": 24816, + "input_tokens": 12, + "output_tokens": 916, + "cache_read_tokens": 104373, + "tool_calls": [ + "load_osm_model", + "run_simulation", + "get_run_status" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__run_simulation", + "ToolSearch", + "mcp__openstudio__get_run_status" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_03_eval_cases.py::test_eval_tool_selection[troubleshoot:My simulation failed]", + "passed": false, + "duration_s": 17.3, + "tier": "tier3", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.06485774999999999, + "duration_ms": 15248, + "input_tokens": 7, + "output_tokens": 551, + "cache_read_tokens": 45885, + "tool_calls": [ + "load_osm_model", + "extract_simulation_errors" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__extract_simulation_errors" + ], + "toolsearch_count": 1, + 
"is_timeout": false, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_03_eval_cases.py::test_eval_tool_selection[troubleshoot:EUI looks way too high]", + "passed": true, + "duration_s": 120.1, + "tier": "tier3", + "attempt": 1, + "num_turns": 0, + "cost_usd": 0.0, + "duration_ms": 0, + "input_tokens": 0, + "output_tokens": 0, + "cache_read_tokens": 0, + "tool_calls": [ + "load_osm_model", + "extract_summary_metrics", + "extract_end_use_breakdown", + "get_run_status", + "get_weather_info", + "get_run_logs", + "get_run_logs", + "extract_simulation_errors", + "change_building_location", + "change_building_location", + "save_osm_model", + "save_osm_model", + "run_simulation" + ], + "num_tool_calls": 13, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__extract_summary_metrics", + "mcp__openstudio__extract_end_use_breakdown", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_weather_info", + "mcp__openstudio__get_run_logs", + "mcp__openstudio__get_run_logs", + "mcp__openstudio__extract_simulation_errors", + "mcp__openstudio__change_building_location", + "ToolSearch", + "mcp__openstudio__change_building_location", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "Bash" + ], + "toolsearch_count": 2, + "is_timeout": true + }, + { + "test_id": "tests/llm/test_03_eval_cases.py::test_eval_tool_selection[troubleshoot:Too many unmet hours]", + "passed": true, + "duration_s": 120.2, + "tier": "tier3", + "attempt": 1, + "num_turns": 0, + "cost_usd": 0.0, + "duration_ms": 0, + "input_tokens": 0, + "output_tokens": 0, + "cache_read_tokens": 0, + "tool_calls": [ + "load_osm_model", + "extract_summary_metrics", + "get_run_status", + "list_thermal_zones", + "get_weather_info", + "get_schedule_details", + "get_schedule_details", + "extract_simulation_errors", + "get_run_logs", + "change_building_location", + "save_osm_model", + "save_osm_model", + 
"run_simulation", + "get_run_status" + ], + "num_tool_calls": 14, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__extract_summary_metrics", + "ToolSearch", + "mcp__openstudio__get_run_status", + "mcp__openstudio__list_thermal_zones", + "ToolSearch", + "mcp__openstudio__get_weather_info", + "mcp__openstudio__get_schedule_details", + "mcp__openstudio__get_schedule_details", + "ToolSearch", + "mcp__openstudio__extract_simulation_errors", + "mcp__openstudio__get_run_logs", + "ToolSearch", + "mcp__openstudio__change_building_location", + "ToolSearch", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "mcp__openstudio__get_run_status", + "Bash" + ], + "toolsearch_count": 6, + "is_timeout": true + }, + { + "test_id": "tests/llm/test_03_eval_cases.py::test_eval_tool_selection[troubleshoot:Why did EnergyPlus crash?]", + "passed": false, + "duration_s": 17.0, + "tier": "tier3", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.0647094, + "duration_ms": 14910, + "input_tokens": 7, + "output_tokens": 537, + "cache_read_tokens": 45903, + "tool_calls": [ + "load_osm_model", + "extract_simulation_errors" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__extract_simulation_errors" + ], + "toolsearch_count": 1, + "is_timeout": false, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_03_eval_cases.py::test_eval_tool_selection[view:Show me the model]", + "passed": true, + "duration_s": 23.6, + "tier": "tier3", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.08454645, + "duration_ms": 21304, + "input_tokens": 12, + "output_tokens": 700, + "cache_read_tokens": 103739, + "tool_calls": [ + "load_osm_model", + "view_model", + "copy_file" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__view_model", + "ToolSearch", + 
"mcp__openstudio__copy_file" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_03_eval_cases.py::test_eval_tool_selection[view:Visualize the building]", + "passed": true, + "duration_s": 25.3, + "tier": "tier3", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.08400059999999998, + "duration_ms": 23279, + "input_tokens": 12, + "output_tokens": 676, + "cache_read_tokens": 103707, + "tool_calls": [ + "load_osm_model", + "view_model", + "copy_file" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__view_model", + "ToolSearch", + "mcp__openstudio__copy_file" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_03_eval_cases.py::test_eval_tool_selection[view:3D view]", + "passed": true, + "duration_s": 29.9, + "tier": "tier3", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.08380349999999999, + "duration_ms": 27777, + "input_tokens": 12, + "output_tokens": 615, + "cache_read_tokens": 103350, + "tool_calls": [ + "load_osm_model", + "view_model", + "copy_file" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__view_model", + "ToolSearch", + "mcp__openstudio__copy_file" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[systemd_fourpipebeam_e2e]", + "passed": false, + "duration_s": 577.5, + "tier": "tier2", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.08380349999999999, + "duration_ms": 27777, + "input_tokens": 12, + "output_tokens": 615, + "cache_read_tokens": 103350, + "tool_calls": [ + "load_osm_model", + "view_model", + "copy_file" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__view_model", + "ToolSearch", + "mcp__openstudio__copy_file" + ], + "toolsearch_count": 2, + "is_timeout": false, + 
"failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[add_vav_reheat]", + "passed": true, + "duration_s": 23.3, + "tier": "tier2", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.08598990000000001, + "duration_ms": 20929, + "input_tokens": 9, + "output_tokens": 782, + "cache_read_tokens": 86218, + "tool_calls": [ + "load_osm_model", + "list_thermal_zones", + "add_baseline_system" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_thermal_zones", + "mcp__openstudio__add_baseline_system" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[add_doas]", + "passed": true, + "duration_s": 18.4, + "tier": "tier2", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.08999355, + "duration_ms": 16414, + "input_tokens": 9, + "output_tokens": 747, + "cache_read_tokens": 85101, + "tool_calls": [ + "load_osm_model", + "list_thermal_zones", + "add_doas_system" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_thermal_zones", + "mcp__openstudio__add_doas_system" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[add_vrf]", + "passed": true, + "duration_s": 29.8, + "tier": "tier2", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.09247335000000001, + "duration_ms": 27825, + "input_tokens": 12, + "output_tokens": 856, + "cache_read_tokens": 104987, + "tool_calls": [ + "load_osm_model", + "list_thermal_zones", + "add_vrf_system" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__list_thermal_zones", + "mcp__openstudio__add_vrf_system" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": 
"tests/llm/test_04_workflows.py::test_workflow[set_weather]", + "passed": true, + "duration_s": 22.4, + "tier": "tier2", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.06975285, + "duration_ms": 20376, + "input_tokens": 8, + "output_tokens": 507, + "cache_read_tokens": 65367, + "tool_calls": [ + "load_osm_model", + "change_building_location" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__change_building_location" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[add_rooftop_pv]", + "passed": true, + "duration_s": 17.3, + "tier": "tier2", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.06814695, + "duration_ms": 13335, + "input_tokens": 8, + "output_tokens": 451, + "cache_read_tokens": 64939, + "tool_calls": [ + "load_osm_model", + "add_rooftop_pv" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__add_rooftop_pv" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[adjust_thermostat]", + "passed": true, + "duration_s": 15.2, + "tier": "tier2", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.06806654999999999, + "duration_ms": 13097, + "input_tokens": 8, + "output_tokens": 470, + "cache_read_tokens": 64996, + "tool_calls": [ + "load_osm_model", + "adjust_thermostat_setpoints" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__adjust_thermostat_setpoints" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[delete_space]", + "passed": true, + "duration_s": 15.9, + "tier": "tier2", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.08223345000000001, + "duration_ms": 13806, + "input_tokens": 9, + "output_tokens": 590, + "cache_read_tokens": 
85084, + "tool_calls": [ + "load_osm_model", + "list_spaces", + "delete_object" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_spaces", + "mcp__openstudio__delete_object" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[qaqc_check]", + "passed": true, + "duration_s": 23.1, + "tier": "tier2", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.0753915, + "duration_ms": 21074, + "input_tokens": 8, + "output_tokens": 886, + "cache_read_tokens": 65525, + "tool_calls": [ + "load_osm_model", + "run_qaqc_checks" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__run_qaqc_checks" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[create_bar_office]", + "passed": true, + "duration_s": 22.8, + "tier": "tier2", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.08704455, + "duration_ms": 20807, + "input_tokens": 8, + "output_tokens": 772, + "cache_read_tokens": 67981, + "tool_calls": [ + "create_bar_building", + "list_spaces" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__create_bar_building", + "mcp__openstudio__list_spaces" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[create_new_building]", + "passed": true, + "duration_s": 51.2, + "tier": "tier2", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.06669345, + "duration_ms": 49123, + "input_tokens": 7, + "output_tokens": 624, + "cache_read_tokens": 46404, + "tool_calls": [ + "create_new_building" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__create_new_building" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": 
"tests/llm/test_04_workflows.py::test_workflow[bar_then_typical]", + "passed": true, + "duration_s": 58.0, + "tier": "tier2", + "attempt": 1, + "num_turns": 8, + "cost_usd": 0.13192065, + "duration_ms": 55897, + "input_tokens": 12, + "output_tokens": 1468, + "cache_read_tokens": 163678, + "tool_calls": [ + "create_bar_building", + "change_building_location", + "create_typical_building" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__create_bar_building", + "mcp__openstudio__change_building_location", + "mcp__openstudio__create_typical_building", + "Read", + "Read", + "Bash" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[import_floorspacejs]", + "passed": true, + "duration_s": 25.3, + "tier": "tier2", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.09148275000000002, + "duration_ms": 23347, + "input_tokens": 12, + "output_tokens": 840, + "cache_read_tokens": 104835, + "tool_calls": [ + "import_floorspacejs", + "list_files", + "import_floorspacejs" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__import_floorspacejs", + "ToolSearch", + "mcp__openstudio__list_files", + "mcp__openstudio__import_floorspacejs" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[floorspacejs_to_typical]", + "passed": true, + "duration_s": 91.8, + "tier": "tier2", + "attempt": 1, + "num_turns": 11, + "cost_usd": 0.1541289, + "duration_ms": 89786, + "input_tokens": 17, + "output_tokens": 1951, + "cache_read_tokens": 221443, + "tool_calls": [ + "import_floorspacejs", + "list_files", + "import_floorspacejs", + "change_building_location", + "create_typical_building" + ], + "num_tool_calls": 5, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__import_floorspacejs", + "ToolSearch", + "mcp__openstudio__list_files", + "mcp__openstudio__import_floorspacejs", + 
"mcp__openstudio__change_building_location", + "mcp__openstudio__create_typical_building", + "Read", + "Read", + "Bash" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[manual_geometry_match]", + "passed": true, + "duration_s": 72.8, + "tier": "tier2", + "attempt": 1, + "num_turns": 13, + "cost_usd": 0.18388484999999996, + "duration_ms": 70708, + "input_tokens": 19, + "output_tokens": 3429, + "cache_read_tokens": 228097, + "tool_calls": [ + "get_server_status", + "create_space_from_floor_print", + "create_example_osm", + "create_space_from_floor_print", + "create_space_from_floor_print", + "match_surfaces", + "list_surfaces", + "list_surfaces", + "save_osm_model" + ], + "num_tool_calls": 9, + "all_tool_calls": [ + "ToolSearch", + "ToolSearch", + "mcp__openstudio__get_server_status", + "mcp__openstudio__create_space_from_floor_print", + "ToolSearch", + "mcp__openstudio__create_example_osm", + "mcp__openstudio__create_space_from_floor_print", + "mcp__openstudio__create_space_from_floor_print", + "mcp__openstudio__match_surfaces", + "mcp__openstudio__list_surfaces", + "mcp__openstudio__list_surfaces", + "mcp__openstudio__save_osm_model" + ], + "toolsearch_count": 3, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[envelope_retrofit]", + "passed": true, + "duration_s": 57.6, + "tier": "tier2", + "attempt": 1, + "num_turns": 17, + "cost_usd": 0.16534965, + "duration_ms": 55551, + "input_tokens": 18, + "output_tokens": 2778, + "cache_read_tokens": 202398, + "tool_calls": [ + "load_osm_model", + "list_surfaces", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "replace_window_constructions", + "list_model_objects", + "replace_window_constructions" + ], + 
"num_tool_calls": 13, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__list_surfaces", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__replace_window_constructions", + "ToolSearch", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__replace_window_constructions" + ], + "toolsearch_count": 3, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[create_and_assign_loads]", + "passed": true, + "duration_s": 26.8, + "tier": "tier2", + "attempt": 1, + "num_turns": 7, + "cost_usd": 0.09550664999999997, + "duration_ms": 24748, + "input_tokens": 12, + "output_tokens": 1098, + "cache_read_tokens": 106798, + "tool_calls": [ + "load_osm_model", + "list_spaces", + "create_people_definition", + "create_lights_definition" + ], + "num_tool_calls": 4, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__list_spaces", + "mcp__openstudio__create_people_definition", + "mcp__openstudio__create_lights_definition" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[plant_loop_with_boiler]", + "passed": true, + "duration_s": 19.4, + "tier": "tier2", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.08009145000000001, + "duration_ms": 17280, + "input_tokens": 9, + "output_tokens": 650, + "cache_read_tokens": 85769, + "tool_calls": [ + "load_osm_model", + "create_plant_loop", + "add_supply_equipment" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + 
"mcp__openstudio__create_plant_loop", + "mcp__openstudio__add_supply_equipment" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[inspect_and_modify_boiler]", + "passed": true, + "duration_s": 21.7, + "tier": "tier2", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.09725519999999999, + "duration_ms": 19645, + "input_tokens": 10, + "output_tokens": 913, + "cache_read_tokens": 108834, + "tool_calls": [ + "load_osm_model", + "list_model_objects", + "get_object_fields", + "set_object_property" + ], + "num_tool_calls": 4, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__get_object_fields", + "mcp__openstudio__set_object_property" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[extract_results_chain]", + "passed": true, + "duration_s": 15.8, + "tier": "tier2", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.0638526, + "duration_ms": 13763, + "input_tokens": 7, + "output_tokens": 594, + "cache_read_tokens": 45722, + "tool_calls": [ + "extract_summary_metrics", + "extract_end_use_breakdown" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__extract_summary_metrics", + "mcp__openstudio__extract_end_use_breakdown" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[hvac_chilled_beam_comparison]", + "passed": true, + "duration_s": 108.2, + "tier": "tier2", + "attempt": 1, + "num_turns": 20, + "cost_usd": 0.3183795, + "duration_ms": 106139, + "input_tokens": 30, + "output_tokens": 4252, + "cache_read_tokens": 510165, + "tool_calls": [ + "load_osm_model", + "list_air_loops", + "replace_air_terminals", + "save_osm_model", + "run_simulation", + "get_run_status", + "get_weather_info", + "list_weather_files", + "change_building_location", + 
"save_osm_model", + "save_osm_model", + "run_simulation", + "get_run_status", + "get_run_status", + "extract_end_use_breakdown" + ], + "num_tool_calls": 15, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_air_loops", + "mcp__openstudio__replace_air_terminals", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "ToolSearch", + "mcp__openstudio__get_run_status", + "ToolSearch", + "mcp__openstudio__get_weather_info", + "ToolSearch", + "mcp__openstudio__list_weather_files", + "mcp__openstudio__change_building_location", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__extract_end_use_breakdown" + ], + "toolsearch_count": 4, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[create_test_apply_measure]", + "passed": true, + "duration_s": 23.7, + "tier": "tier2", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.08716665000000001, + "duration_ms": 21589, + "input_tokens": 9, + "output_tokens": 786, + "cache_read_tokens": 89228, + "tool_calls": [ + "load_osm_model", + "create_measure", + "test_measure", + "apply_measure" + ], + "num_tool_calls": 4, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__create_measure", + "mcp__openstudio__test_measure", + "mcp__openstudio__apply_measure" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[measure_set_lights_full_chain]", + "passed": true, + "duration_s": 102.3, + "tier": "tier2", + "attempt": 1, + "num_turns": 26, + "cost_usd": 0.31975365, + "duration_ms": 100225, + "input_tokens": 37, + "output_tokens": 4678, + "cache_read_tokens": 529338, + "tool_calls": [ + "load_osm_model", + "list_skills", + "get_skill", + "get_skill", + "save_osm_model", + 
"get_weather_info", + "run_simulation", + "create_measure", + "get_run_status", + "test_measure", + "extract_summary_metrics", + "load_osm_model", + "apply_measure", + "save_osm_model", + "run_simulation", + "get_run_status", + "get_run_status", + "extract_summary_metrics" + ], + "num_tool_calls": 18, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_skills", + "mcp__openstudio__get_skill", + "mcp__openstudio__get_skill", + "ToolSearch", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__get_weather_info", + "ToolSearch", + "mcp__openstudio__run_simulation", + "ToolSearch", + "mcp__openstudio__create_measure", + "mcp__openstudio__get_run_status", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__test_measure", + "mcp__openstudio__extract_summary_metrics", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__apply_measure", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__extract_summary_metrics" + ], + "toolsearch_count": 7, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[measure_set_infiltration_full_chain]", + "passed": true, + "duration_s": 121.3, + "tier": "tier2", + "attempt": 1, + "num_turns": 22, + "cost_usd": 0.32198025, + "duration_ms": 119295, + "input_tokens": 25, + "output_tokens": 6259, + "cache_read_tokens": 473630, + "tool_calls": [ + "load_osm_model", + "save_osm_model", + "run_simulation", + "create_measure", + "test_measure", + "get_run_status", + "search_api", + "create_measure", + "test_measure", + "extract_summary_metrics", + "load_osm_model", + "apply_measure", + "save_osm_model", + "run_simulation", + "get_run_status", + "get_run_status", + "extract_summary_metrics" + ], + "num_tool_calls": 17, + "all_tool_calls": [ + "ToolSearch", + "ToolSearch", + "mcp__openstudio__load_osm_model", + 
"mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "mcp__openstudio__create_measure", + "mcp__openstudio__test_measure", + "mcp__openstudio__get_run_status", + "ToolSearch", + "mcp__openstudio__search_api", + "Read", + "mcp__openstudio__create_measure", + "mcp__openstudio__test_measure", + "mcp__openstudio__extract_summary_metrics", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__apply_measure", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__extract_summary_metrics" + ], + "toolsearch_count": 3, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[measure_replace_terminals_full_chain]", + "passed": true, + "duration_s": 360.8, + "tier": "tier2", + "attempt": 1, + "num_turns": 28, + "cost_usd": 0.39654480000000003, + "duration_ms": 358541, + "input_tokens": 31, + "output_tokens": 7744, + "cache_read_tokens": 555331, + "tool_calls": [ + "load_osm_model", + "save_osm_model", + "run_simulation", + "get_run_status", + "get_run_status", + "extract_summary_metrics", + "load_osm_model", + "search_wiring_patterns", + "list_air_loops", + "list_plant_loops", + "search_api", + "create_measure", + "test_measure", + "apply_measure", + "save_osm_model", + "run_simulation", + "get_run_status", + "extract_summary_metrics", + "compare_runs", + "extract_end_use_breakdown", + "extract_end_use_breakdown" + ], + "num_tool_calls": 21, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "mcp__openstudio__get_run_status", + "Bash", + "mcp__openstudio__get_run_status", + "ToolSearch", + "mcp__openstudio__extract_summary_metrics", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__search_wiring_patterns", + "mcp__openstudio__list_air_loops", + "mcp__openstudio__list_plant_loops", + 
"mcp__openstudio__search_api", + "mcp__openstudio__create_measure", + "mcp__openstudio__test_measure", + "mcp__openstudio__apply_measure", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "Bash", + "mcp__openstudio__get_run_status", + "mcp__openstudio__extract_summary_metrics", + "ToolSearch", + "mcp__openstudio__compare_runs", + "mcp__openstudio__extract_end_use_breakdown", + "mcp__openstudio__extract_end_use_breakdown" + ], + "toolsearch_count": 4, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[create_measure_with_args]", + "passed": true, + "duration_s": 61.8, + "tier": "tier2", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.13792454999999998, + "duration_ms": 59835, + "input_tokens": 7, + "output_tokens": 4523, + "cache_read_tokens": 46141, + "tool_calls": [ + "create_measure" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__create_measure" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[measure_add_baseboards_full_chain]", + "passed": true, + "duration_s": 107.2, + "tier": "tier2", + "attempt": 1, + "num_turns": 25, + "cost_usd": 0.31477904999999995, + "duration_ms": 105172, + "input_tokens": 32, + "output_tokens": 4812, + "cache_read_tokens": 508706, + "tool_calls": [ + "load_osm_model", + "list_skills", + "get_skill", + "get_skill", + "list_thermal_zones", + "get_weather_info", + "save_osm_model", + "run_simulation", + "create_measure", + "test_measure", + "get_run_status", + "extract_summary_metrics", + "load_osm_model", + "apply_measure", + "save_osm_model", + "run_simulation", + "get_run_status", + "get_run_status", + "extract_summary_metrics" + ], + "num_tool_calls": 19, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_skills", + "mcp__openstudio__get_skill", + "mcp__openstudio__get_skill", + 
"mcp__openstudio__list_thermal_zones", + "mcp__openstudio__get_weather_info", + "ToolSearch", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "ToolSearch", + "mcp__openstudio__create_measure", + "ToolSearch", + "mcp__openstudio__test_measure", + "mcp__openstudio__get_run_status", + "ToolSearch", + "mcp__openstudio__extract_summary_metrics", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__apply_measure", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__extract_summary_metrics" + ], + "toolsearch_count": 5, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[ruby_measure_reduce_plugloads]", + "passed": true, + "duration_s": 417.9, + "tier": "tier2", + "attempt": 1, + "num_turns": 28, + "cost_usd": 0.5765152499999999, + "duration_ms": 415856, + "input_tokens": 42, + "output_tokens": 14504, + "cache_read_tokens": 786310, + "tool_calls": [ + "load_osm_model", + "save_osm_model", + "run_simulation", + "get_run_status", + "extract_summary_metrics", + "load_osm_model", + "create_measure", + "test_measure", + "read_file", + "edit_measure", + "test_measure", + "edit_measure", + "test_measure", + "apply_measure", + "save_osm_model", + "run_simulation", + "get_run_status", + "extract_summary_metrics" + ], + "num_tool_calls": 18, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "ToolSearch", + "Bash", + "mcp__openstudio__get_run_status", + "mcp__openstudio__extract_summary_metrics", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__create_measure", + "mcp__openstudio__test_measure", + "Read", + "ToolSearch", + "mcp__openstudio__read_file", + "ToolSearch", + "mcp__openstudio__edit_measure", + "mcp__openstudio__test_measure", + 
"mcp__openstudio__edit_measure", + "mcp__openstudio__test_measure", + "mcp__openstudio__apply_measure", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "Bash", + "mcp__openstudio__get_run_status", + "mcp__openstudio__extract_summary_metrics" + ], + "toolsearch_count": 6, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[python_measure_reduce_plugloads]", + "passed": true, + "duration_s": 231.0, + "tier": "tier2", + "attempt": 1, + "num_turns": 29, + "cost_usd": 0.6026875499999998, + "duration_ms": 228441, + "input_tokens": 40, + "output_tokens": 13016, + "cache_read_tokens": 837096, + "tool_calls": [ + "load_osm_model", + "save_osm_model", + "run_simulation", + "get_run_status", + "extract_summary_metrics", + "load_osm_model", + "create_measure", + "test_measure", + "read_file", + "read_file", + "edit_measure", + "read_file", + "test_measure", + "edit_measure", + "test_measure", + "edit_measure", + "test_measure", + "apply_measure", + "save_osm_model", + "run_simulation", + "get_run_status", + "extract_summary_metrics", + "compare_runs" + ], + "num_tool_calls": 23, + "all_tool_calls": [ + "ToolSearch", + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "mcp__openstudio__get_run_status", + "mcp__openstudio__extract_summary_metrics", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__create_measure", + "mcp__openstudio__test_measure", + "mcp__openstudio__read_file", + "ToolSearch", + "mcp__openstudio__read_file", + "ToolSearch", + "mcp__openstudio__edit_measure", + "mcp__openstudio__read_file", + "mcp__openstudio__test_measure", + "mcp__openstudio__edit_measure", + "mcp__openstudio__test_measure", + "mcp__openstudio__edit_measure", + "mcp__openstudio__test_measure", + "mcp__openstudio__apply_measure", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + 
"mcp__openstudio__get_run_status", + "mcp__openstudio__extract_summary_metrics", + "mcp__openstudio__compare_runs" + ], + "toolsearch_count": 5, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[ruby_measure_boiler_efficiency]", + "passed": true, + "duration_s": 332.2, + "tier": "tier2", + "attempt": 1, + "num_turns": 26, + "cost_usd": 0.41937660000000004, + "duration_ms": 329853, + "input_tokens": 38, + "output_tokens": 7842, + "cache_read_tokens": 663717, + "tool_calls": [ + "load_osm_model", + "save_osm_model", + "run_simulation", + "get_run_status", + "get_run_status", + "extract_summary_metrics", + "load_osm_model", + "create_measure", + "test_measure", + "read_file", + "edit_measure", + "read_file", + "test_measure", + "apply_measure", + "save_osm_model", + "run_simulation", + "get_run_status", + "extract_summary_metrics" + ], + "num_tool_calls": 18, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "mcp__openstudio__get_run_status", + "Bash", + "mcp__openstudio__get_run_status", + "mcp__openstudio__extract_summary_metrics", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__create_measure", + "mcp__openstudio__test_measure", + "ToolSearch", + "mcp__openstudio__read_file", + "ToolSearch", + "mcp__openstudio__edit_measure", + "mcp__openstudio__read_file", + "mcp__openstudio__test_measure", + "mcp__openstudio__apply_measure", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "Bash", + "mcp__openstudio__get_run_status", + "mcp__openstudio__extract_summary_metrics" + ], + "toolsearch_count": 5, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_workflow[python_measure_boiler_efficiency]", + "passed": true, + "duration_s": 141.7, + "tier": "tier2", + "attempt": 1, + "num_turns": 23, + "cost_usd": 0.3527554500000001, + 
"duration_ms": 139232, + "input_tokens": 27, + "output_tokens": 7243, + "cache_read_tokens": 494244, + "tool_calls": [ + "load_osm_model", + "save_osm_model", + "run_simulation", + "load_osm_model", + "create_measure", + "test_measure", + "get_run_status", + "read_file", + "edit_measure", + "test_measure", + "extract_summary_metrics", + "apply_measure", + "save_osm_model", + "run_simulation", + "get_run_status", + "get_run_status", + "extract_summary_metrics", + "compare_runs" + ], + "num_tool_calls": 18, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__create_measure", + "mcp__openstudio__test_measure", + "mcp__openstudio__get_run_status", + "Read", + "ToolSearch", + "mcp__openstudio__read_file", + "ToolSearch", + "mcp__openstudio__edit_measure", + "mcp__openstudio__test_measure", + "mcp__openstudio__extract_summary_metrics", + "mcp__openstudio__apply_measure", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_status", + "mcp__openstudio__extract_summary_metrics", + "mcp__openstudio__compare_runs" + ], + "toolsearch_count": 3, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_create_measure_with_args_quality", + "passed": true, + "duration_s": 91.9, + "tier": "tier2", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.14079165, + "duration_ms": 89454, + "input_tokens": 7, + "output_tokens": 6501, + "cache_read_tokens": 56073, + "tool_calls": [ + "create_measure" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__create_measure" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_complex_model_multi_query", + "passed": true, + "duration_s": 28.4, + "tier": "tier2", + "attempt": 1, + "num_turns": 8, + 
"cost_usd": 0.09079965000000001, + "duration_ms": 26427, + "input_tokens": 11, + "output_tokens": 1138, + "cache_read_tokens": 84418, + "tool_calls": [ + "load_osm_model", + "get_building_info", + "list_air_loops", + "list_plant_loops", + "list_thermal_zones" + ], + "num_tool_calls": 5, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__get_building_info", + "mcp__openstudio__list_air_loops", + "mcp__openstudio__list_plant_loops", + "mcp__openstudio__list_thermal_zones" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_measure_reduce_plugloads_quality[Ruby]", + "passed": false, + "duration_s": 85.8, + "tier": "tier2", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.17531969999999997, + "duration_ms": 83741, + "input_tokens": 7, + "output_tokens": 6504, + "cache_read_tokens": 46279, + "tool_calls": [ + "create_measure" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__create_measure" + ], + "toolsearch_count": 1, + "is_timeout": false, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_measure_reduce_plugloads_quality[Python]", + "passed": false, + "duration_s": 73.4, + "tier": "tier2", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.14606609999999998, + "duration_ms": 70574, + "input_tokens": 7, + "output_tokens": 4937, + "cache_read_tokens": 46292, + "tool_calls": [ + "create_measure" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__create_measure" + ], + "toolsearch_count": 1, + "is_timeout": false, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_measure_boiler_efficiency_quality[Ruby]", + "passed": false, + "duration_s": 38.1, + "tier": "tier2", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.10128945, + "duration_ms": 35996, + "input_tokens": 7, + "output_tokens": 2547, + 
"cache_read_tokens": 46324, + "tool_calls": [ + "create_measure" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__create_measure" + ], + "toolsearch_count": 1, + "is_timeout": false, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_04_workflows.py::test_measure_boiler_efficiency_quality[Python]", + "passed": true, + "duration_s": 68.6, + "tier": "tier2", + "attempt": 1, + "num_turns": 7, + "cost_usd": 0.17024309999999998, + "duration_ms": 66629, + "input_tokens": 13, + "output_tokens": 4350, + "cache_read_tokens": 140647, + "tool_calls": [ + "create_measure", + "test_measure", + "edit_measure", + "test_measure" + ], + "num_tool_calls": 4, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__create_measure", + "ToolSearch", + "mcp__openstudio__test_measure", + "mcp__openstudio__edit_measure", + "mcp__openstudio__test_measure" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_05_guardrails.py::test_create_uses_mcp_not_raw_idf", + "passed": true, + "duration_s": 165.2, + "tier": "tier4", + "attempt": 1, + "num_turns": 15, + "cost_usd": 0.33836084999999994, + "duration_ms": 163086, + "input_tokens": 21, + "output_tokens": 6127, + "cache_read_tokens": 427847, + "tool_calls": [ + "list_skills", + "get_skill", + "list_weather_files", + "create_new_building", + "create_new_building", + "create_bar_building", + "get_model_summary", + "change_building_location", + "create_typical_building", + "save_osm_model", + "save_osm_model", + "get_model_summary" + ], + "num_tool_calls": 12, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__list_skills", + "mcp__openstudio__get_skill", + "mcp__openstudio__list_weather_files", + "mcp__openstudio__create_new_building", + "mcp__openstudio__create_new_building", + "mcp__openstudio__create_bar_building", + "mcp__openstudio__get_model_summary", + "mcp__openstudio__change_building_location", + 
"mcp__openstudio__create_typical_building", + "mcp__openstudio__save_osm_model", + "ToolSearch", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__get_model_summary" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_05_guardrails.py::test_no_script_for_results", + "passed": true, + "duration_s": 14.0, + "tier": "tier4", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.0584907, + "duration_ms": 11506, + "input_tokens": 7, + "output_tokens": 339, + "cache_read_tokens": 45499, + "tool_calls": [ + "extract_summary_metrics" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__extract_summary_metrics" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_05_guardrails.py::test_inspect_component_uses_mcp_not_script", + "passed": true, + "duration_s": 23.6, + "tier": "tier4", + "attempt": 1, + "num_turns": 8, + "cost_usd": 0.09002055, + "duration_ms": 21585, + "input_tokens": 9, + "output_tokens": 1021, + "cache_read_tokens": 84991, + "tool_calls": [ + "load_osm_model", + "list_model_objects", + "list_model_objects", + "list_model_objects", + "list_model_objects", + "get_component_properties" + ], + "num_tool_calls": 6, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__get_component_properties" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[import_floorplan_L1]", + "passed": true, + "duration_s": 64.3, + "tier": "progressive", + "attempt": 1, + "num_turns": 7, + "cost_usd": 0.1445049, + "duration_ms": 62306, + "input_tokens": 12, + "output_tokens": 2822, + "cache_read_tokens": 114988, + "tool_calls": [ + "list_files", + "list_skills", + "get_skill", + "import_floorspacejs" + 
], + "num_tool_calls": 4, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__list_files", + "mcp__openstudio__list_skills", + "ToolSearch", + "mcp__openstudio__get_skill", + "mcp__openstudio__import_floorspacejs" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[import_floorplan_L2]", + "passed": true, + "duration_s": 22.2, + "tier": "progressive", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.09462285, + "duration_ms": 20236, + "input_tokens": 12, + "output_tokens": 807, + "cache_read_tokens": 103802, + "tool_calls": [ + "import_floorspacejs", + "list_files", + "import_floorspacejs" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__import_floorspacejs", + "ToolSearch", + "mcp__openstudio__list_files", + "mcp__openstudio__import_floorspacejs" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[import_floorplan_L3]", + "passed": true, + "duration_s": 21.8, + "tier": "progressive", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.08967915000000001, + "duration_ms": 19785, + "input_tokens": 12, + "output_tokens": 743, + "cache_read_tokens": 104773, + "tool_calls": [ + "import_floorspacejs", + "list_files", + "import_floorspacejs" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__import_floorspacejs", + "ToolSearch", + "mcp__openstudio__list_files", + "mcp__openstudio__import_floorspacejs" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[add_hvac_L1]", + "passed": true, + "duration_s": 49.5, + "tier": "progressive", + "attempt": 1, + "num_turns": 15, + "cost_usd": 0.16795559999999998, + "duration_ms": 47529, + "input_tokens": 21, + "output_tokens": 2395, + "cache_read_tokens": 203092, + "tool_calls": [ + "load_osm_model", + "list_skills", + "get_building_info", + 
"list_thermal_zones", + "add_baseline_system", + "list_air_loops", + "list_plant_loops", + "save_osm_model" + ], + "num_tool_calls": 8, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_skills", + "Skill", + "mcp__openstudio__get_building_info", + "ToolSearch", + "mcp__openstudio__list_thermal_zones", + "ToolSearch", + "mcp__openstudio__add_baseline_system", + "ToolSearch", + "mcp__openstudio__list_air_loops", + "mcp__openstudio__list_plant_loops", + "mcp__openstudio__save_osm_model" + ], + "toolsearch_count": 4, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[add_hvac_L2]", + "passed": true, + "duration_s": 19.5, + "tier": "progressive", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.08622855, + "duration_ms": 17428, + "input_tokens": 9, + "output_tokens": 799, + "cache_read_tokens": 86201, + "tool_calls": [ + "load_osm_model", + "list_thermal_zones", + "add_baseline_system" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_thermal_zones", + "mcp__openstudio__add_baseline_system" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[add_hvac_L3]", + "passed": true, + "duration_s": 19.9, + "tier": "progressive", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.08987984999999998, + "duration_ms": 17809, + "input_tokens": 9, + "output_tokens": 753, + "cache_read_tokens": 84947, + "tool_calls": [ + "load_osm_model", + "list_thermal_zones", + "add_baseline_system" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_thermal_zones", + "mcp__openstudio__add_baseline_system" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[view_model_L1]", + "passed": true, + "duration_s": 
23.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.08354985000000001, + "duration_ms": 21052, + "input_tokens": 12, + "output_tokens": 648, + "cache_read_tokens": 103667, + "tool_calls": [ + "load_osm_model", + "view_model", + "copy_file" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__view_model", + "ToolSearch", + "mcp__openstudio__copy_file" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[view_model_L2]", + "passed": true, + "duration_s": 16.7, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.06903195, + "duration_ms": 14716, + "input_tokens": 8, + "output_tokens": 467, + "cache_read_tokens": 64214, + "tool_calls": [ + "load_osm_model", + "view_model" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__view_model" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[view_model_L3]", + "passed": true, + "duration_s": 24.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.08450865, + "duration_ms": 22024, + "input_tokens": 12, + "output_tokens": 697, + "cache_read_tokens": 103763, + "tool_calls": [ + "load_osm_model", + "view_model", + "copy_file" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__view_model", + "ToolSearch", + "mcp__openstudio__copy_file" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[set_weather_L1]", + "passed": true, + "duration_s": 37.7, + "tier": "progressive", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.12610919999999998, + "duration_ms": 35751, + "input_tokens": 12, + "output_tokens": 1243, + "cache_read_tokens": 
111469, + "tool_calls": [ + "load_osm_model", + "list_weather_files", + "change_building_location" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__list_weather_files", + "mcp__openstudio__change_building_location" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[set_weather_L2]", + "passed": true, + "duration_s": 46.9, + "tier": "progressive", + "attempt": 1, + "num_turns": 7, + "cost_usd": 0.1486893, + "duration_ms": 44887, + "input_tokens": 13, + "output_tokens": 2052, + "cache_read_tokens": 133451, + "tool_calls": [ + "load_osm_model", + "change_building_location", + "list_weather_files", + "change_building_location" + ], + "num_tool_calls": 4, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__change_building_location", + "ToolSearch", + "mcp__openstudio__list_weather_files", + "mcp__openstudio__change_building_location" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[set_weather_L3]", + "passed": true, + "duration_s": 58.6, + "tier": "progressive", + "attempt": 1, + "num_turns": 7, + "cost_usd": 0.1487019, + "duration_ms": 56666, + "input_tokens": 13, + "output_tokens": 2011, + "cache_read_tokens": 132693, + "tool_calls": [ + "load_osm_model", + "change_building_location", + "list_weather_files", + "change_building_location" + ], + "num_tool_calls": 4, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__change_building_location", + "ToolSearch", + "mcp__openstudio__list_weather_files", + "mcp__openstudio__change_building_location" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[run_qaqc_L1]", + "passed": true, + "duration_s": 17.6, + "tier": "progressive", 
+ "attempt": 1, + "num_turns": 5, + "cost_usd": 0.0773742, + "duration_ms": 15583, + "input_tokens": 11, + "output_tokens": 590, + "cache_read_tokens": 84529, + "tool_calls": [ + "load_osm_model", + "validate_model" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__validate_model" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[run_qaqc_L2]", + "passed": true, + "duration_s": 24.7, + "tier": "progressive", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.08039474999999999, + "duration_ms": 22569, + "input_tokens": 11, + "output_tokens": 792, + "cache_read_tokens": 84785, + "tool_calls": [ + "load_osm_model", + "validate_model", + "run_qaqc_checks" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__validate_model", + "mcp__openstudio__run_qaqc_checks" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[run_qaqc_L3]", + "passed": true, + "duration_s": 24.3, + "tier": "progressive", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.08352795, + "duration_ms": 22300, + "input_tokens": 11, + "output_tokens": 848, + "cache_read_tokens": 85554, + "tool_calls": [ + "load_osm_model", + "inspect_osm_summary", + "validate_model" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__inspect_osm_summary", + "ToolSearch", + "mcp__openstudio__validate_model" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[create_building_L1]", + "passed": true, + "duration_s": 80.5, + "tier": "progressive", + "attempt": 1, + "num_turns": 12, + "cost_usd": 0.2103162, + "duration_ms": 78448, + "input_tokens": 17, + "output_tokens": 
2476, + "cache_read_tokens": 269209, + "tool_calls": [ + "list_skills", + "get_skill", + "list_weather_files", + "create_new_building", + "change_building_location", + "create_typical_building", + "save_osm_model", + "get_model_summary", + "save_osm_model" + ], + "num_tool_calls": 9, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__list_skills", + "mcp__openstudio__get_skill", + "mcp__openstudio__list_weather_files", + "mcp__openstudio__create_new_building", + "mcp__openstudio__change_building_location", + "mcp__openstudio__create_typical_building", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__get_model_summary", + "ToolSearch", + "mcp__openstudio__save_osm_model" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[create_building_L2]", + "passed": true, + "duration_s": 120.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 0, + "cost_usd": 0.0, + "duration_ms": 0, + "input_tokens": 0, + "output_tokens": 0, + "cache_read_tokens": 0, + "tool_calls": [ + "create_new_building", + "create_new_building", + "list_weather_files", + "change_building_location", + "create_typical_building" + ], + "num_tool_calls": 5, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__create_new_building", + "mcp__openstudio__create_new_building", + "ToolSearch", + "mcp__openstudio__list_weather_files", + "mcp__openstudio__change_building_location", + "mcp__openstudio__create_typical_building", + "Read", + "Read", + "Grep", + "Bash" + ], + "toolsearch_count": 2, + "is_timeout": true + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[create_building_L3]", + "passed": true, + "duration_s": 15.9, + "tier": "progressive", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.0683652, + "duration_ms": 13921, + "input_tokens": 7, + "output_tokens": 458, + "cache_read_tokens": 46164, + "tool_calls": [ + "create_bar_building" + ], + "num_tool_calls": 1, + "all_tool_calls": 
[ + "ToolSearch", + "mcp__openstudio__create_bar_building" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[add_pv_L1]", + "passed": true, + "duration_s": 20.4, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.06959565, + "duration_ms": 18383, + "input_tokens": 8, + "output_tokens": 526, + "cache_read_tokens": 64968, + "tool_calls": [ + "load_osm_model", + "add_rooftop_pv" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__add_rooftop_pv" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[add_pv_L2]", + "passed": true, + "duration_s": 20.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.06942614999999999, + "duration_ms": 18101, + "input_tokens": 8, + "output_tokens": 521, + "cache_read_tokens": 64928, + "tool_calls": [ + "load_osm_model", + "add_rooftop_pv" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__add_rooftop_pv" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[add_pv_L3]", + "passed": true, + "duration_s": 16.2, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.06938865, + "duration_ms": 14143, + "input_tokens": 8, + "output_tokens": 412, + "cache_read_tokens": 64303, + "tool_calls": [ + "load_osm_model", + "add_rooftop_pv" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__add_rooftop_pv" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[thermostat_L1]", + "passed": true, + "duration_s": 21.8, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + 
"cost_usd": 0.06753779999999998, + "duration_ms": 19832, + "input_tokens": 8, + "output_tokens": 442, + "cache_read_tokens": 64921, + "tool_calls": [ + "load_osm_model", + "adjust_thermostat_setpoints" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__adjust_thermostat_setpoints" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[thermostat_L2]", + "passed": true, + "duration_s": 15.4, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.0670239, + "duration_ms": 13392, + "input_tokens": 8, + "output_tokens": 413, + "cache_read_tokens": 64958, + "tool_calls": [ + "load_osm_model", + "adjust_thermostat_setpoints" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__adjust_thermostat_setpoints" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[thermostat_L3]", + "passed": true, + "duration_s": 19.5, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.06925200000000001, + "duration_ms": 17344, + "input_tokens": 8, + "output_tokens": 419, + "cache_read_tokens": 64360, + "tool_calls": [ + "load_osm_model", + "adjust_thermostat_setpoints" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__adjust_thermostat_setpoints" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[list_spaces_L1]", + "passed": true, + "duration_s": 16.7, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.07092285, + "duration_ms": 14694, + "input_tokens": 8, + "output_tokens": 533, + "cache_read_tokens": 65092, + "tool_calls": [ + "load_osm_model", + "list_spaces" + ], + "num_tool_calls": 2, + 
"all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_spaces" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[list_spaces_L2]", + "passed": true, + "duration_s": 16.7, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.07579709999999999, + "duration_ms": 14572, + "input_tokens": 8, + "output_tokens": 695, + "cache_read_tokens": 64402, + "tool_calls": [ + "load_osm_model", + "list_spaces" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_spaces" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[list_spaces_L3]", + "passed": true, + "duration_s": 14.4, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.07674344999999999, + "duration_ms": 12373, + "input_tokens": 8, + "output_tokens": 701, + "cache_read_tokens": 64219, + "tool_calls": [ + "load_osm_model", + "list_spaces" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_spaces" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[schedules_L1]", + "passed": true, + "duration_s": 23.4, + "tier": "progressive", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.08337405, + "duration_ms": 21359, + "input_tokens": 9, + "output_tokens": 874, + "cache_read_tokens": 85736, + "tool_calls": [ + "load_osm_model", + "list_model_objects", + "list_model_objects" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__list_model_objects" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": 
"tests/llm/test_06_progressive.py::test_progressive[schedules_L2]", + "passed": true, + "duration_s": 16.8, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.07256354999999999, + "duration_ms": 14832, + "input_tokens": 8, + "output_tokens": 646, + "cache_read_tokens": 65411, + "tool_calls": [ + "load_osm_model", + "list_model_objects" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_model_objects" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[schedules_L3]", + "passed": true, + "duration_s": 17.5, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.0720546, + "duration_ms": 15529, + "input_tokens": 8, + "output_tokens": 613, + "cache_read_tokens": 65402, + "tool_calls": [ + "load_osm_model", + "list_model_objects" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_model_objects" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[inspect_component_L1]", + "passed": true, + "duration_s": 19.6, + "tier": "progressive", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.07759275, + "duration_ms": 17579, + "input_tokens": 9, + "output_tokens": 570, + "cache_read_tokens": 85415, + "tool_calls": [ + "load_osm_model", + "list_model_objects", + "get_component_properties" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__get_component_properties" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[inspect_component_L2]", + "passed": true, + "duration_s": 20.3, + "tier": "progressive", + "attempt": 1, + "num_turns": 5, + "cost_usd": 
0.07857630000000002, + "duration_ms": 18339, + "input_tokens": 9, + "output_tokens": 596, + "cache_read_tokens": 85231, + "tool_calls": [ + "load_osm_model", + "list_model_objects", + "get_component_properties" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__get_component_properties" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[inspect_component_L3]", + "passed": true, + "duration_s": 28.7, + "tier": "progressive", + "attempt": 1, + "num_turns": 7, + "cost_usd": 0.10312274999999999, + "duration_ms": 26657, + "input_tokens": 13, + "output_tokens": 1028, + "cache_read_tokens": 124225, + "tool_calls": [ + "load_osm_model", + "get_object_fields", + "list_model_objects", + "get_object_fields" + ], + "num_tool_calls": 4, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__get_object_fields", + "ToolSearch", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__get_object_fields" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[modify_component_L1]", + "passed": true, + "duration_s": 29.6, + "tier": "progressive", + "attempt": 1, + "num_turns": 8, + "cost_usd": 0.10415264999999999, + "duration_ms": 27573, + "input_tokens": 14, + "output_tokens": 878, + "cache_read_tokens": 147373, + "tool_calls": [ + "load_osm_model", + "list_model_objects", + "get_component_properties", + "set_component_properties", + "save_osm_model" + ], + "num_tool_calls": 5, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__get_component_properties", + "mcp__openstudio__set_component_properties", + "ToolSearch", + "mcp__openstudio__save_osm_model" + ], + "toolsearch_count": 2, + "is_timeout": false 
+ }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[modify_component_L2]", + "passed": true, + "duration_s": 21.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.0785763, + "duration_ms": 19135, + "input_tokens": 9, + "output_tokens": 543, + "cache_read_tokens": 85181, + "tool_calls": [ + "load_osm_model", + "list_model_objects", + "set_component_properties" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__set_component_properties" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[modify_component_L3]", + "passed": true, + "duration_s": 22.3, + "tier": "progressive", + "attempt": 1, + "num_turns": 7, + "cost_usd": 0.09606404999999998, + "duration_ms": 20332, + "input_tokens": 13, + "output_tokens": 859, + "cache_read_tokens": 125546, + "tool_calls": [ + "load_osm_model", + "set_object_property", + "list_model_objects", + "set_object_property" + ], + "num_tool_calls": 4, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__set_object_property", + "ToolSearch", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__set_object_property" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[list_dynamic_type_L1]", + "passed": true, + "duration_s": 33.7, + "tier": "progressive", + "attempt": 1, + "num_turns": 17, + "cost_usd": 0.16678905, + "duration_ms": 31615, + "input_tokens": 12, + "output_tokens": 1914, + "cache_read_tokens": 93206, + "tool_calls": [ + "load_osm_model", + "list_air_loops", + "list_thermal_zones", + "get_sizing_system_properties", + "get_sizing_zone_properties", + "get_sizing_zone_properties", + "get_sizing_zone_properties", + "get_sizing_zone_properties", + "get_sizing_zone_properties", + 
"get_sizing_zone_properties", + "get_sizing_zone_properties", + "get_sizing_zone_properties", + "get_sizing_zone_properties", + "get_sizing_zone_properties" + ], + "num_tool_calls": 14, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__list_air_loops", + "mcp__openstudio__list_thermal_zones", + "mcp__openstudio__get_sizing_system_properties", + "mcp__openstudio__get_sizing_zone_properties", + "mcp__openstudio__get_sizing_zone_properties", + "mcp__openstudio__get_sizing_zone_properties", + "mcp__openstudio__get_sizing_zone_properties", + "mcp__openstudio__get_sizing_zone_properties", + "mcp__openstudio__get_sizing_zone_properties", + "mcp__openstudio__get_sizing_zone_properties", + "mcp__openstudio__get_sizing_zone_properties", + "mcp__openstudio__get_sizing_zone_properties", + "mcp__openstudio__get_sizing_zone_properties" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[list_dynamic_type_L2]", + "passed": true, + "duration_s": 14.9, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.06859530000000001, + "duration_ms": 12701, + "input_tokens": 8, + "output_tokens": 475, + "cache_read_tokens": 65421, + "tool_calls": [ + "load_osm_model", + "list_model_objects" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_model_objects" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[list_dynamic_type_L3]", + "passed": true, + "duration_s": 16.4, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.06932745000000001, + "duration_ms": 14426, + "input_tokens": 8, + "output_tokens": 524, + "cache_read_tokens": 65424, + "tool_calls": [ + "load_osm_model", + "list_model_objects" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + 
"mcp__openstudio__load_osm_model", + "mcp__openstudio__list_model_objects" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[floor_area_L1]", + "passed": true, + "duration_s": 18.0, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.06805889999999999, + "duration_ms": 16028, + "input_tokens": 8, + "output_tokens": 472, + "cache_read_tokens": 64658, + "tool_calls": [ + "load_osm_model", + "get_building_info" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__get_building_info" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[floor_area_L2]", + "passed": true, + "duration_s": 13.7, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.0661569, + "duration_ms": 11645, + "input_tokens": 8, + "output_tokens": 344, + "cache_read_tokens": 64668, + "tool_calls": [ + "load_osm_model", + "get_building_info" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__get_building_info" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[floor_area_L3]", + "passed": true, + "duration_s": 14.6, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.06786375, + "duration_ms": 12593, + "input_tokens": 8, + "output_tokens": 445, + "cache_read_tokens": 64770, + "tool_calls": [ + "load_osm_model", + "get_building_info" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__get_building_info" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[materials_L1]", + "passed": true, + "duration_s": 22.2, + "tier": "progressive", + 
"attempt": 1, + "num_turns": 4, + "cost_usd": 0.07764914999999999, + "duration_ms": 20241, + "input_tokens": 8, + "output_tokens": 857, + "cache_read_tokens": 64688, + "tool_calls": [ + "load_osm_model", + "list_materials" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_materials" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[materials_L2]", + "passed": true, + "duration_s": 19.8, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.0734562, + "duration_ms": 17767, + "input_tokens": 8, + "output_tokens": 617, + "cache_read_tokens": 64874, + "tool_calls": [ + "load_osm_model", + "list_materials" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_materials" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[materials_L3]", + "passed": true, + "duration_s": 20.5, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.0767352, + "duration_ms": 18565, + "input_tokens": 8, + "output_tokens": 840, + "cache_read_tokens": 64879, + "tool_calls": [ + "load_osm_model", + "list_materials" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_materials" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[thermal_zones_L1]", + "passed": false, + "duration_s": 17.3, + "tier": "progressive", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.05851229999999999, + "duration_ms": 15353, + "input_tokens": 7, + "output_tokens": 301, + "cache_read_tokens": 45746, + "tool_calls": [ + "load_osm_model" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + 
"mcp__openstudio__load_osm_model" + ], + "toolsearch_count": 1, + "is_timeout": false, + "failure_mode": "wrong_tool" + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[thermal_zones_L2]", + "passed": true, + "duration_s": 16.2, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.073491, + "duration_ms": 14295, + "input_tokens": 8, + "output_tokens": 730, + "cache_read_tokens": 64990, + "tool_calls": [ + "load_osm_model", + "list_thermal_zones" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_thermal_zones" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[thermal_zones_L3]", + "passed": true, + "duration_s": 17.6, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.07208085, + "duration_ms": 15385, + "input_tokens": 8, + "output_tokens": 641, + "cache_read_tokens": 64977, + "tool_calls": [ + "load_osm_model", + "list_thermal_zones" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_thermal_zones" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[subsurfaces_L1]", + "passed": true, + "duration_s": 13.5, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.06866024999999999, + "duration_ms": 11449, + "input_tokens": 8, + "output_tokens": 378, + "cache_read_tokens": 64425, + "tool_calls": [ + "load_osm_model", + "list_subsurfaces" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_subsurfaces" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[subsurfaces_L2]", + "passed": true, + "duration_s": 13.6, + "tier": "progressive", + 
"attempt": 1, + "num_turns": 4, + "cost_usd": 0.06721425, + "duration_ms": 11572, + "input_tokens": 8, + "output_tokens": 435, + "cache_read_tokens": 65180, + "tool_calls": [ + "load_osm_model", + "list_subsurfaces" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_subsurfaces" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[subsurfaces_L3]", + "passed": true, + "duration_s": 16.2, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.06774825000000001, + "duration_ms": 14177, + "input_tokens": 8, + "output_tokens": 420, + "cache_read_tokens": 65010, + "tool_calls": [ + "load_osm_model", + "list_subsurfaces" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_subsurfaces" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[surface_details_L1]", + "passed": true, + "duration_s": 23.0, + "tier": "progressive", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.09293025, + "duration_ms": 21070, + "input_tokens": 9, + "output_tokens": 935, + "cache_read_tokens": 85640, + "tool_calls": [ + "load_osm_model", + "list_surfaces", + "get_surface_details", + "get_surface_details" + ], + "num_tool_calls": 4, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_surfaces", + "mcp__openstudio__get_surface_details", + "mcp__openstudio__get_surface_details" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[surface_details_L2]", + "passed": true, + "duration_s": 20.5, + "tier": "progressive", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.07988564999999999, + "duration_ms": 18466, + "input_tokens": 9, + "output_tokens": 748, + 
"cache_read_tokens": 85108, + "tool_calls": [ + "load_osm_model", + "list_surfaces", + "get_surface_details" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_surfaces", + "mcp__openstudio__get_surface_details" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[surface_details_L3]", + "passed": true, + "duration_s": 20.8, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.11586345, + "duration_ms": 18808, + "input_tokens": 8, + "output_tokens": 846, + "cache_read_tokens": 65244, + "tool_calls": [ + "load_osm_model", + "list_surfaces" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_surfaces" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[run_simulation_L1]", + "passed": true, + "duration_s": 300.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 0, + "cost_usd": 0.0, + "duration_ms": 0, + "input_tokens": 0, + "output_tokens": 0, + "cache_read_tokens": 0, + "tool_calls": [ + "load_osm_model", + "run_simulation", + "get_run_status", + "extract_simulation_errors", + "validate_model", + "list_air_loops", + "list_thermal_zones", + "delete_object", + "save_osm_model", + "run_simulation", + "get_run_status", + "extract_simulation_errors", + "delete_object", + "delete_object", + "clean_unused_objects", + "save_osm_model", + "save_osm_model", + "run_simulation" + ], + "num_tool_calls": 18, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__run_simulation", + "Bash", + "mcp__openstudio__get_run_status", + "mcp__openstudio__extract_simulation_errors", + "ToolSearch", + "mcp__openstudio__validate_model", + "mcp__openstudio__list_air_loops", + "mcp__openstudio__list_thermal_zones", + "ToolSearch", 
+ "mcp__openstudio__delete_object", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "Bash", + "mcp__openstudio__get_run_status", + "mcp__openstudio__extract_simulation_errors", + "mcp__openstudio__delete_object", + "mcp__openstudio__delete_object", + "mcp__openstudio__clean_unused_objects", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__save_osm_model", + "mcp__openstudio__run_simulation", + "Bash" + ], + "toolsearch_count": 3, + "is_timeout": true + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[run_simulation_L2]", + "passed": true, + "duration_s": 115.9, + "tier": "progressive", + "attempt": 1, + "num_turns": 7, + "cost_usd": 0.09344939999999999, + "duration_ms": 113278, + "input_tokens": 13, + "output_tokens": 803, + "cache_read_tokens": 124193, + "tool_calls": [ + "load_osm_model", + "run_simulation", + "get_run_status" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__run_simulation", + "ToolSearch", + "Bash", + "mcp__openstudio__get_run_status" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[run_simulation_L3]", + "passed": true, + "duration_s": 152.7, + "tier": "progressive", + "attempt": 1, + "num_turns": 1, + "cost_usd": 0.1128954, + "duration_ms": 4680, + "input_tokens": 3, + "output_tokens": 105, + "cache_read_tokens": 20621, + "tool_calls": [ + "load_osm_model", + "run_simulation", + "get_run_status", + "get_run_status" + ], + "num_tool_calls": 4, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__run_simulation", + "ToolSearch", + "mcp__openstudio__get_run_status", + "Bash", + "mcp__openstudio__get_run_status" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[get_eui_L1]", + "passed": true, + "duration_s": 25.4, + "tier": 
"progressive", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.07947314999999999, + "duration_ms": 23338, + "input_tokens": 11, + "output_tokens": 713, + "cache_read_tokens": 84088, + "tool_calls": [ + "extract_summary_metrics", + "get_run_status", + "extract_end_use_breakdown" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__extract_summary_metrics", + "ToolSearch", + "mcp__openstudio__get_run_status", + "mcp__openstudio__extract_end_use_breakdown" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[get_eui_L2]", + "passed": true, + "duration_s": 24.3, + "tier": "progressive", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.0806604, + "duration_ms": 22053, + "input_tokens": 11, + "output_tokens": 705, + "cache_read_tokens": 84008, + "tool_calls": [ + "extract_summary_metrics", + "get_run_status", + "extract_simulation_errors" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__extract_summary_metrics", + "ToolSearch", + "mcp__openstudio__get_run_status", + "mcp__openstudio__extract_simulation_errors" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[get_eui_L3]", + "passed": true, + "duration_s": 14.0, + "tier": "progressive", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.0593364, + "duration_ms": 11834, + "input_tokens": 7, + "output_tokens": 397, + "cache_read_tokens": 45493, + "tool_calls": [ + "extract_summary_metrics" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__extract_summary_metrics" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[end_use_breakdown_L1]", + "passed": true, + "duration_s": 29.7, + "tier": "progressive", + "attempt": 1, + "num_turns": 9, + "cost_usd": 0.10391565, + "duration_ms": 27657, + 
"input_tokens": 15, + "output_tokens": 1064, + "cache_read_tokens": 123698, + "tool_calls": [ + "extract_end_use_breakdown", + "extract_end_use_breakdown", + "extract_summary_metrics", + "get_run_status", + "get_run_artifacts" + ], + "num_tool_calls": 5, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__extract_end_use_breakdown", + "ToolSearch", + "mcp__openstudio__extract_end_use_breakdown", + "mcp__openstudio__extract_summary_metrics", + "ToolSearch", + "mcp__openstudio__get_run_status", + "mcp__openstudio__get_run_artifacts" + ], + "toolsearch_count": 3, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[end_use_breakdown_L2]", + "passed": true, + "duration_s": 21.6, + "tier": "progressive", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.08187105, + "duration_ms": 19667, + "input_tokens": 11, + "output_tokens": 792, + "cache_read_tokens": 83431, + "tool_calls": [ + "extract_end_use_breakdown", + "get_run_status", + "extract_summary_metrics" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__extract_end_use_breakdown", + "ToolSearch", + "mcp__openstudio__get_run_status", + "mcp__openstudio__extract_summary_metrics" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[end_use_breakdown_L3]", + "passed": true, + "duration_s": 14.4, + "tier": "progressive", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.0580248, + "duration_ms": 12323, + "input_tokens": 7, + "output_tokens": 355, + "cache_read_tokens": 45471, + "tool_calls": [ + "extract_end_use_breakdown" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__extract_end_use_breakdown" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[hvac_sizing_L1]", + "passed": true, + "duration_s": 24.5, + "tier": "progressive", + "attempt": 1, + 
"num_turns": 6, + "cost_usd": 0.08245245, + "duration_ms": 22486, + "input_tokens": 11, + "output_tokens": 907, + "cache_read_tokens": 83544, + "tool_calls": [ + "extract_hvac_sizing", + "extract_component_sizing", + "extract_simulation_errors" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__extract_hvac_sizing", + "ToolSearch", + "mcp__openstudio__extract_component_sizing", + "mcp__openstudio__extract_simulation_errors" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[hvac_sizing_L2]", + "passed": true, + "duration_s": 13.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.0595905, + "duration_ms": 11008, + "input_tokens": 7, + "output_tokens": 408, + "cache_read_tokens": 45140, + "tool_calls": [ + "extract_hvac_sizing" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__extract_hvac_sizing" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[hvac_sizing_L3]", + "passed": true, + "duration_s": 14.7, + "tier": "progressive", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.05936940000000001, + "duration_ms": 12549, + "input_tokens": 7, + "output_tokens": 459, + "cache_read_tokens": 45428, + "tool_calls": [ + "extract_hvac_sizing" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__extract_hvac_sizing" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[set_wwr_L1]", + "passed": true, + "duration_s": 27.3, + "tier": "progressive", + "attempt": 1, + "num_turns": 13, + "cost_usd": 0.10800659999999998, + "duration_ms": 25264, + "input_tokens": 12, + "output_tokens": 1515, + "cache_read_tokens": 105077, + "tool_calls": [ + "load_osm_model", + "list_surfaces", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", 
+ "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio" + ], + "num_tool_calls": 10, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__list_surfaces", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[set_wwr_L2]", + "passed": true, + "duration_s": 34.7, + "tier": "progressive", + "attempt": 1, + "num_turns": 15, + "cost_usd": 0.1265748, + "duration_ms": 32660, + "input_tokens": 16, + "output_tokens": 1620, + "cache_read_tokens": 150306, + "tool_calls": [ + "load_osm_model", + "list_surfaces", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "save_osm_model" + ], + "num_tool_calls": 11, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__list_surfaces", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "ToolSearch", + "mcp__openstudio__save_osm_model" + ], + 
"toolsearch_count": 3, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[set_wwr_L3]", + "passed": true, + "duration_s": 29.3, + "tier": "progressive", + "attempt": 1, + "num_turns": 13, + "cost_usd": 0.10844955, + "duration_ms": 27256, + "input_tokens": 12, + "output_tokens": 1556, + "cache_read_tokens": 105066, + "tool_calls": [ + "load_osm_model", + "list_surfaces", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio", + "set_window_to_wall_ratio" + ], + "num_tool_calls": 10, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__list_surfaces", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio", + "mcp__openstudio__set_window_to_wall_ratio" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[replace_windows_L1]", + "passed": true, + "duration_s": 120.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 0, + "cost_usd": 0.0, + "duration_ms": 0, + "input_tokens": 0, + "output_tokens": 0, + "cache_read_tokens": 0, + "tool_calls": [ + "load_osm_model", + "list_model_objects", + "get_construction_details", + "list_model_objects", + "get_construction_details", + "list_common_measures", + "list_measure_arguments", + "list_files", + "list_measure_arguments" + ], + "num_tool_calls": 9, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__get_construction_details", + 
"mcp__openstudio__list_model_objects", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__get_construction_details", + "mcp__openstudio__list_common_measures", + "mcp__openstudio__list_measure_arguments", + "mcp__openstudio__list_files", + "ToolSearch", + "ToolSearch", + "mcp__openstudio__list_measure_arguments" + ], + "toolsearch_count": 6, + "is_timeout": true + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[replace_windows_L2]", + "passed": true, + "duration_s": 36.0, + "tier": "progressive", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.10104434999999999, + "duration_ms": 33465, + "input_tokens": 12, + "output_tokens": 1465, + "cache_read_tokens": 105107, + "tool_calls": [ + "load_osm_model", + "list_model_objects", + "replace_window_constructions" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__replace_window_constructions" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[replace_windows_L3]", + "passed": true, + "duration_s": 36.7, + "tier": "progressive", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.09934740000000002, + "duration_ms": 34665, + "input_tokens": 12, + "output_tokens": 1288, + "cache_read_tokens": 105613, + "tool_calls": [ + "load_osm_model", + "list_model_objects", + "replace_window_constructions" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__replace_window_constructions" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[construction_details_L1]", + "passed": true, + "duration_s": 23.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.07993275, + "duration_ms": 20663, + 
"input_tokens": 9, + "output_tokens": 660, + "cache_read_tokens": 84940, + "tool_calls": [ + "load_osm_model", + "list_surfaces", + "get_construction_details" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_surfaces", + "mcp__openstudio__get_construction_details" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[construction_details_L2]", + "passed": true, + "duration_s": 28.2, + "tier": "progressive", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.0848283, + "duration_ms": 26000, + "input_tokens": 9, + "output_tokens": 804, + "cache_read_tokens": 84421, + "tool_calls": [ + "load_osm_model", + "list_surfaces", + "get_construction_details" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_surfaces", + "mcp__openstudio__get_construction_details" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[construction_details_L3]", + "passed": true, + "duration_s": 38.7, + "tier": "progressive", + "attempt": 1, + "num_turns": 17, + "cost_usd": 0.15897974999999998, + "duration_ms": 36260, + "input_tokens": 12, + "output_tokens": 1916, + "cache_read_tokens": 92825, + "tool_calls": [ + "load_osm_model", + "list_model_objects", + "get_construction_details", + "get_construction_details", + "get_construction_details", + "get_construction_details", + "get_construction_details", + "get_construction_details", + "get_construction_details", + "get_construction_details", + "get_construction_details", + "get_construction_details", + "get_construction_details", + "get_construction_details" + ], + "num_tool_calls": 14, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__list_model_objects", + 
"mcp__openstudio__get_construction_details", + "mcp__openstudio__get_construction_details", + "mcp__openstudio__get_construction_details", + "mcp__openstudio__get_construction_details", + "mcp__openstudio__get_construction_details", + "mcp__openstudio__get_construction_details", + "mcp__openstudio__get_construction_details", + "mcp__openstudio__get_construction_details", + "mcp__openstudio__get_construction_details", + "mcp__openstudio__get_construction_details", + "mcp__openstudio__get_construction_details", + "mcp__openstudio__get_construction_details" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[check_loads_L1]", + "passed": true, + "duration_s": 29.3, + "tier": "progressive", + "attempt": 1, + "num_turns": 8, + "cost_usd": 0.10559834999999998, + "duration_ms": 27223, + "input_tokens": 16, + "output_tokens": 1010, + "cache_read_tokens": 144097, + "tool_calls": [ + "load_osm_model", + "list_spaces", + "get_space_details", + "get_space_type_details" + ], + "num_tool_calls": 4, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__list_spaces", + "mcp__openstudio__get_space_details", + "ToolSearch", + "mcp__openstudio__get_space_type_details" + ], + "toolsearch_count": 3, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[check_loads_L2]", + "passed": true, + "duration_s": 30.8, + "tier": "progressive", + "attempt": 1, + "num_turns": 8, + "cost_usd": 0.10672919999999998, + "duration_ms": 28601, + "input_tokens": 13, + "output_tokens": 1115, + "cache_read_tokens": 126259, + "tool_calls": [ + "load_osm_model", + "list_spaces", + "get_space_type_details", + "get_load_details", + "get_load_details" + ], + "num_tool_calls": 5, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_spaces", + "ToolSearch", + 
"mcp__openstudio__get_space_type_details", + "mcp__openstudio__get_load_details", + "mcp__openstudio__get_load_details" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[check_loads_L3]", + "passed": true, + "duration_s": 33.2, + "tier": "progressive", + "attempt": 1, + "num_turns": 10, + "cost_usd": 0.09556814999999998, + "duration_ms": 31129, + "input_tokens": 12, + "output_tokens": 1144, + "cache_read_tokens": 104828, + "tool_calls": [ + "load_osm_model", + "list_model_objects", + "list_model_objects", + "list_model_objects", + "get_load_details", + "get_load_details", + "get_load_details" + ], + "num_tool_calls": 7, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__get_load_details", + "mcp__openstudio__get_load_details", + "mcp__openstudio__get_load_details" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[create_loads_L1]", + "passed": true, + "duration_s": 84.0, + "tier": "progressive", + "attempt": 1, + "num_turns": 29, + "cost_usd": 0.2728764, + "duration_ms": 81729, + "input_tokens": 17, + "output_tokens": 5277, + "cache_read_tokens": 179268, + "tool_calls": [ + "load_osm_model", + "get_model_summary", + "list_spaces", + "get_space_type_details", + "create_people_definition", + "create_lights_definition", + "create_people_definition", + "create_lights_definition", + "create_people_definition", + "create_lights_definition", + "create_people_definition", + "create_lights_definition", + "create_people_definition", + "create_lights_definition", + "create_people_definition", + "create_lights_definition", + "create_people_definition", + "create_lights_definition", + "create_people_definition", + "create_lights_definition", + 
"create_people_definition", + "create_lights_definition", + "create_people_definition", + "create_lights_definition", + "save_osm_model" + ], + "num_tool_calls": 25, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__get_model_summary", + "mcp__openstudio__list_spaces", + "ToolSearch", + "mcp__openstudio__get_space_type_details", + "mcp__openstudio__create_people_definition", + "mcp__openstudio__create_lights_definition", + "mcp__openstudio__create_people_definition", + "mcp__openstudio__create_lights_definition", + "mcp__openstudio__create_people_definition", + "mcp__openstudio__create_lights_definition", + "mcp__openstudio__create_people_definition", + "mcp__openstudio__create_lights_definition", + "mcp__openstudio__create_people_definition", + "mcp__openstudio__create_lights_definition", + "mcp__openstudio__create_people_definition", + "mcp__openstudio__create_lights_definition", + "mcp__openstudio__create_people_definition", + "mcp__openstudio__create_lights_definition", + "mcp__openstudio__create_people_definition", + "mcp__openstudio__create_lights_definition", + "mcp__openstudio__create_people_definition", + "mcp__openstudio__create_lights_definition", + "mcp__openstudio__create_people_definition", + "mcp__openstudio__create_lights_definition", + "ToolSearch", + "mcp__openstudio__save_osm_model" + ], + "toolsearch_count": 3, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[create_loads_L2]", + "passed": true, + "duration_s": 46.7, + "tier": "progressive", + "attempt": 1, + "num_turns": 24, + "cost_usd": 0.15134925, + "duration_ms": 44341, + "input_tokens": 10, + "output_tokens": 3295, + "cache_read_tokens": 108860, + "tool_calls": [ + "load_osm_model", + "list_spaces", + "create_people_definition", + "create_people_definition", + "create_people_definition", + "create_people_definition", + "create_people_definition", + "create_people_definition", + 
"create_people_definition", + "create_people_definition", + "create_people_definition", + "create_people_definition", + "create_lights_definition", + "create_lights_definition", + "create_lights_definition", + "create_lights_definition", + "create_lights_definition", + "create_lights_definition", + "create_lights_definition", + "create_lights_definition", + "create_lights_definition", + "create_lights_definition" + ], + "num_tool_calls": 22, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_spaces", + "mcp__openstudio__create_people_definition", + "mcp__openstudio__create_people_definition", + "mcp__openstudio__create_people_definition", + "mcp__openstudio__create_people_definition", + "mcp__openstudio__create_people_definition", + "mcp__openstudio__create_people_definition", + "mcp__openstudio__create_people_definition", + "mcp__openstudio__create_people_definition", + "mcp__openstudio__create_people_definition", + "mcp__openstudio__create_people_definition", + "mcp__openstudio__create_lights_definition", + "mcp__openstudio__create_lights_definition", + "mcp__openstudio__create_lights_definition", + "mcp__openstudio__create_lights_definition", + "mcp__openstudio__create_lights_definition", + "mcp__openstudio__create_lights_definition", + "mcp__openstudio__create_lights_definition", + "mcp__openstudio__create_lights_definition", + "mcp__openstudio__create_lights_definition", + "mcp__openstudio__create_lights_definition" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[create_loads_L3]", + "passed": true, + "duration_s": 25.5, + "tier": "progressive", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.08754390000000001, + "duration_ms": 23391, + "input_tokens": 12, + "output_tokens": 752, + "cache_read_tokens": 104693, + "tool_calls": [ + "load_osm_model", + "list_spaces", + "create_people_definition" + ], + "num_tool_calls": 3, + 
"all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__list_spaces", + "mcp__openstudio__create_people_definition" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[create_plant_loop_L1]", + "passed": true, + "duration_s": 15.4, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.0683496, + "duration_ms": 13265, + "input_tokens": 8, + "output_tokens": 466, + "cache_read_tokens": 65302, + "tool_calls": [ + "load_osm_model", + "create_plant_loop" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__create_plant_loop" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[create_plant_loop_L2]", + "passed": true, + "duration_s": 16.2, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.06863055000000001, + "duration_ms": 14062, + "input_tokens": 8, + "output_tokens": 475, + "cache_read_tokens": 65351, + "tool_calls": [ + "load_osm_model", + "create_plant_loop" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__create_plant_loop" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[create_plant_loop_L3]", + "passed": true, + "duration_s": 17.0, + "tier": "progressive", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.0773235, + "duration_ms": 14928, + "input_tokens": 9, + "output_tokens": 627, + "cache_read_tokens": 84830, + "tool_calls": [ + "load_osm_model", + "create_plant_loop", + "create_plant_loop" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__create_plant_loop", + "mcp__openstudio__create_plant_loop" + ], + "toolsearch_count": 1, + 
"is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[schedule_details_L1]", + "passed": true, + "duration_s": 120.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 0, + "cost_usd": 0.0, + "duration_ms": 0, + "input_tokens": 0, + "output_tokens": 0, + "cache_read_tokens": 0, + "tool_calls": [ + "load_osm_model", + "list_air_loops", + "get_air_loop_details", + "get_component_properties", + "get_object_fields", + "get_object_fields", + "list_model_objects", + "list_model_objects", + "list_model_objects", + "get_schedule_details", + "get_schedule_details", + "get_thermal_zone_details", + "get_thermal_zone_details", + "get_object_fields", + "get_object_fields", + "read_file", + "read_file", + "read_file", + "read_file" + ], + "num_tool_calls": 19, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_air_loops", + "ToolSearch", + "mcp__openstudio__get_air_loop_details", + "mcp__openstudio__get_component_properties", + "mcp__openstudio__get_object_fields", + "mcp__openstudio__get_object_fields", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__get_schedule_details", + "mcp__openstudio__get_schedule_details", + "mcp__openstudio__get_thermal_zone_details", + "ToolSearch", + "mcp__openstudio__get_thermal_zone_details", + "mcp__openstudio__get_object_fields", + "mcp__openstudio__get_object_fields", + "mcp__openstudio__read_file", + "ToolSearch", + "mcp__openstudio__read_file", + "Grep", + "Grep", + "Bash", + "Bash", + "Glob", + "mcp__openstudio__read_file", + "mcp__openstudio__read_file" + ], + "toolsearch_count": 4, + "is_timeout": true + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[schedule_details_L2]", + "passed": true, + "duration_s": 62.6, + "tier": "progressive", + "attempt": 1, + "num_turns": 14, + "cost_usd": 0.18472349999999998, + "duration_ms": 59800, 
+ "input_tokens": 22, + "output_tokens": 2603, + "cache_read_tokens": 286150, + "tool_calls": [ + "load_osm_model", + "list_model_objects", + "list_model_objects", + "list_model_objects", + "list_air_loops", + "get_air_loop_details", + "get_component_properties", + "get_schedule_details", + "get_setpoint_manager_properties", + "get_setpoint_manager_properties" + ], + "num_tool_calls": 10, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__list_air_loops", + "mcp__openstudio__get_air_loop_details", + "mcp__openstudio__get_component_properties", + "ToolSearch", + "mcp__openstudio__get_schedule_details", + "mcp__openstudio__get_setpoint_manager_properties", + "ToolSearch", + "mcp__openstudio__get_setpoint_manager_properties" + ], + "toolsearch_count": 3, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[schedule_details_L3]", + "passed": true, + "duration_s": 39.6, + "tier": "progressive", + "attempt": 1, + "num_turns": 9, + "cost_usd": 0.10295834999999999, + "duration_ms": 37560, + "input_tokens": 12, + "output_tokens": 1418, + "cache_read_tokens": 104637, + "tool_calls": [ + "load_osm_model", + "list_model_objects", + "get_schedule_details", + "get_schedule_details", + "get_schedule_details", + "get_schedule_details" + ], + "num_tool_calls": 6, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__get_schedule_details", + "mcp__openstudio__get_schedule_details", + "mcp__openstudio__get_schedule_details", + "mcp__openstudio__get_schedule_details" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[space_type_info_L1]", + "passed": true, + "duration_s": 30.8, + "tier": "progressive", + 
"attempt": 1, + "num_turns": 6, + "cost_usd": 0.0952914, + "duration_ms": 28639, + "input_tokens": 9, + "output_tokens": 1105, + "cache_read_tokens": 87673, + "tool_calls": [ + "load_osm_model", + "get_model_summary", + "list_spaces", + "get_space_type_details" + ], + "num_tool_calls": 4, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__get_model_summary", + "mcp__openstudio__list_spaces", + "mcp__openstudio__get_space_type_details" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[space_type_info_L2]", + "passed": true, + "duration_s": 27.5, + "tier": "progressive", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.0889239, + "duration_ms": 25400, + "input_tokens": 12, + "output_tokens": 884, + "cache_read_tokens": 104268, + "tool_calls": [ + "load_osm_model", + "list_model_objects", + "get_space_type_details" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__get_space_type_details" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[space_type_info_L3]", + "passed": true, + "duration_s": 33.3, + "tier": "progressive", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.09065775, + "duration_ms": 31154, + "input_tokens": 12, + "output_tokens": 941, + "cache_read_tokens": 104235, + "tool_calls": [ + "load_osm_model", + "list_model_objects", + "get_space_type_details" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "ToolSearch", + "mcp__openstudio__list_model_objects", + "mcp__openstudio__get_space_type_details" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[set_run_period_L1]", + "passed": true, + "duration_s": 
17.6, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.0710034, + "duration_ms": 15557, + "input_tokens": 8, + "output_tokens": 488, + "cache_read_tokens": 64148, + "tool_calls": [ + "load_osm_model", + "set_run_period" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__set_run_period" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[set_run_period_L2]", + "passed": true, + "duration_s": 13.9, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.06740805000000001, + "duration_ms": 11846, + "input_tokens": 8, + "output_tokens": 455, + "cache_read_tokens": 65001, + "tool_calls": [ + "load_osm_model", + "set_run_period" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__set_run_period" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[set_run_period_L3]", + "passed": true, + "duration_s": 14.4, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.06847395, + "duration_ms": 11971, + "input_tokens": 8, + "output_tokens": 508, + "cache_read_tokens": 65154, + "tool_calls": [ + "load_osm_model", + "set_run_period" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__set_run_period" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[ideal_air_L1]", + "passed": true, + "duration_s": 27.3, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.07305945, + "duration_ms": 25272, + "input_tokens": 8, + "output_tokens": 881, + "cache_read_tokens": 64689, + "tool_calls": [ + "load_osm_model", + "enable_ideal_air_loads" + ], + "num_tool_calls": 2, + 
"all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__enable_ideal_air_loads" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[ideal_air_L2]", + "passed": true, + "duration_s": 39.9, + "tier": "progressive", + "attempt": 1, + "num_turns": 6, + "cost_usd": 0.09396044999999997, + "duration_ms": 37785, + "input_tokens": 12, + "output_tokens": 1060, + "cache_read_tokens": 103894, + "tool_calls": [ + "load_osm_model", + "enable_ideal_air_loads", + "list_zone_hvac_equipment" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__enable_ideal_air_loads", + "ToolSearch", + "mcp__openstudio__list_zone_hvac_equipment" + ], + "toolsearch_count": 2, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[ideal_air_L3]", + "passed": true, + "duration_s": 22.0, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.0723459, + "duration_ms": 19667, + "input_tokens": 8, + "output_tokens": 725, + "cache_read_tokens": 64198, + "tool_calls": [ + "load_osm_model", + "enable_ideal_air_loads" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__enable_ideal_air_loads" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[save_model_L1]", + "passed": true, + "duration_s": 14.6, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.06526799999999999, + "duration_ms": 12377, + "input_tokens": 8, + "output_tokens": 324, + "cache_read_tokens": 64530, + "tool_calls": [ + "load_osm_model", + "save_osm_model" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__save_osm_model" + ], + "toolsearch_count": 1, + "is_timeout": 
false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[save_model_L2]", + "passed": true, + "duration_s": 15.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.06680895, + "duration_ms": 13014, + "input_tokens": 8, + "output_tokens": 407, + "cache_read_tokens": 64654, + "tool_calls": [ + "load_osm_model", + "save_osm_model" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__save_osm_model" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[save_model_L3]", + "passed": true, + "duration_s": 16.7, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.06702390000000001, + "duration_ms": 14714, + "input_tokens": 8, + "output_tokens": 418, + "cache_read_tokens": 64683, + "tool_calls": [ + "load_osm_model", + "save_osm_model" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__save_osm_model" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[add_ev_L1]", + "passed": true, + "duration_s": 21.6, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.07310235, + "duration_ms": 18943, + "input_tokens": 8, + "output_tokens": 551, + "cache_read_tokens": 65007, + "tool_calls": [ + "load_osm_model", + "add_ev_load" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__add_ev_load" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[add_ev_L2]", + "passed": true, + "duration_s": 30.6, + "tier": "progressive", + "attempt": 1, + "num_turns": 5, + "cost_usd": 0.09090975, + "duration_ms": 28574, + "input_tokens": 9, + "output_tokens": 970, + "cache_read_tokens": 
86080, + "tool_calls": [ + "load_osm_model", + "list_spaces", + "add_ev_load" + ], + "num_tool_calls": 3, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__list_spaces", + "mcp__openstudio__add_ev_load" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[add_ev_L3]", + "passed": true, + "duration_s": 17.1, + "tier": "progressive", + "attempt": 1, + "num_turns": 4, + "cost_usd": 0.0708423, + "duration_ms": 15006, + "input_tokens": 8, + "output_tokens": 421, + "cache_read_tokens": 65061, + "tool_calls": [ + "load_osm_model", + "add_ev_load" + ], + "num_tool_calls": 2, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__load_osm_model", + "mcp__openstudio__add_ev_load" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[list_measures_L1]", + "passed": true, + "duration_s": 15.9, + "tier": "progressive", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.059368950000000004, + "duration_ms": 13885, + "input_tokens": 7, + "output_tokens": 387, + "cache_read_tokens": 45364, + "tool_calls": [ + "list_custom_measures" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__list_custom_measures" + ], + "toolsearch_count": 1, + "is_timeout": false + }, + { + "test_id": "tests/llm/test_06_progressive.py::test_progressive[list_measures_L2]", + "passed": true, + "duration_s": 12.6, + "tier": "progressive", + "attempt": 1, + "num_turns": 3, + "cost_usd": 0.0602949, + "duration_ms": 10466, + "input_tokens": 7, + "output_tokens": 383, + "cache_read_tokens": 45088, + "tool_calls": [ + "list_custom_measures" + ], + "num_tool_calls": 1, + "all_tool_calls": [ + "ToolSearch", + "mcp__openstudio__list_custom_measures" + ], + "toolsearch_count": 1, + "is_timeout": false + } + ] +} \ No newline at end of file diff --git a/docs/sweeps/sonnet-2026-03-28/benchmark.md 
b/docs/sweeps/sonnet-2026-03-28/benchmark.md new file mode 100644 index 0000000..30ce268 --- /dev/null +++ b/docs/sweeps/sonnet-2026-03-28/benchmark.md @@ -0,0 +1,301 @@ +# LLM Benchmark Report + +**Date:** 2026-03-28T17:06:27+00:00 +**Model:** sonnet | **Retries:** 0 +**Result:** 170/180 passed (94.4%) in 9453s +**Tokens:** 2.0k in + 250.1k out + 20.4M cache | **Cost:** $18.9595 (notional API pricing) + +## Summary by Tier + +| Tier | Passed | Rate | Time | Avg | +|--------|---------|--------|--------|--------| +| setup | 6/6 | 100.0% | 421s | 70s | +| tier1 | 4/4 | 100.0% | 130s | 32s | +| tier2 | 33/37 | 89.2% | 3600s | 97s | +| tier3 | 21/26 | 80.8% | 1703s | 65s | +| tier4 | 3/3 | 100.0% | 203s | 68s | +| progressive | 103/104 | 99.0% | 3396s | 33s | + +## Detailed Results + +### setup + +| Test | Result | Time | Turns | Tools | In Tok | Out Tok | Cache | Cost | Att | +|--------------------------------|--------|------|-------|--------------------------------------------------------------------------------------------------------------------------|--------|---------|--------|---------|-----| +| test_create_baseline_model | PASS | 11s | 3 | create_baseline_osm | 7 | 330 | 44.5k | $0.0630 | 1 | +| test_create_baseline_with_hvac | PASS | 15s | 3 | create_baseline_osm | 7 | 389 | 45.8k | $0.0601 | 1 | +| test_create_example_model | PASS | 11s | 3 | create_example_osm | 7 | 292 | 45.4k | $0.0571 | 1 | +| test_load_baseline_model | PASS | 13s | 4 | load_osm_model, list_thermal_zones | 8 | 412 | 64.3k | $0.0708 | 1 | +| test_run_baseline_simulation | PASS | 236s | 12 | load_osm_model, change_building_location, run_simulation, get_run_status, save_osm_model, run_simulation, get_run_status | 18 | 1.7k | 236.2k | $0.1500 | 1 | +| test_run_retrofit_simulation | PASS | 134s | 8 | load_osm_model, change_building_location, adjust_thermostat_setpoints, run_simulation, get_run_status | 12 | 1.5k | 152.4k | $0.1210 | 1 | + +### tier1 + +| Test | Result | Time | Turns | Tools | 
In Tok | Out Tok | Cache | Cost | Att | +|-------------------------------------|--------|------|-------|-----------------------------------------------------------------------------------|--------|---------|-------|---------|-----| +| What is the server status? | PASS | 9s | 3 | get_server_status | 7 | 270 | 45.1k | $0.0567 | 1 | +| List available skills | PASS | 13s | 3 | list_skills | 7 | 445 | 45.4k | $0.0610 | 1 | +| Create a small office building usin | PASS | 90s | 0 | create_new_building, list_weather_files, create_new_building, create_new_building | 0 | 0 | 0 | $0.0000 | 1 | +| Create bar geometry for a retail bu | PASS | 18s | 3 | create_bar_building | 7 | 556 | 46.1k | $0.0693 | 1 | + +### tier2 + +| Test | Result | Time | Turns | Tools | In Tok | Out Tok | Cache | Cost | Att | +|---------------------------------------|--------|------|-------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------|---------|--------|---------|-----| +| systemd_fourpipebeam_e2e | FAIL | 578s | 6 | load_osm_model, view_model, copy_file | 12 | 615 | 103.3k | $0.0838 | 1 | +| add_vav_reheat | PASS | 23s | 5 | load_osm_model, list_thermal_zones, add_baseline_system | 9 | 782 | 86.2k | $0.0860 | 1 | +| add_doas | PASS | 18s | 5 | load_osm_model, list_thermal_zones, add_doas_system | 9 | 747 | 85.1k | $0.0900 | 1 | +| add_vrf | PASS | 30s | 6 | load_osm_model, list_thermal_zones, add_vrf_system | 12 | 856 | 105.0k | $0.0925 | 1 | +| set_weather | PASS | 22s | 4 | load_osm_model, change_building_location | 8 | 507 | 65.4k | $0.0698 | 1 | +| add_rooftop_pv | PASS | 17s | 4 | load_osm_model, add_rooftop_pv | 8 | 451 | 64.9k | $0.0681 | 1 | +| 
adjust_thermostat | PASS | 15s | 4 | load_osm_model, adjust_thermostat_setpoints | 8 | 470 | 65.0k | $0.0681 | 1 | +| delete_space | PASS | 16s | 5 | load_osm_model, list_spaces, delete_object | 9 | 590 | 85.1k | $0.0822 | 1 | +| qaqc_check | PASS | 23s | 4 | load_osm_model, run_qaqc_checks | 8 | 886 | 65.5k | $0.0754 | 1 | +| create_bar_office | PASS | 23s | 4 | create_bar_building, list_spaces | 8 | 772 | 68.0k | $0.0870 | 1 | +| create_new_building | PASS | 51s | 3 | create_new_building | 7 | 624 | 46.4k | $0.0667 | 1 | +| bar_then_typical | PASS | 58s | 8 | create_bar_building, change_building_location, create_typical_building | 12 | 1.5k | 163.7k | $0.1319 | 1 | +| import_floorspacejs | PASS | 25s | 6 | import_floorspacejs, list_files, import_floorspacejs | 12 | 840 | 104.8k | $0.0915 | 1 | +| floorspacejs_to_typical | PASS | 92s | 11 | import_floorspacejs, list_files, import_floorspacejs, change_building_location, create_typical_building | 17 | 2.0k | 221.4k | $0.1541 | 1 | +| manual_geometry_match | PASS | 73s | 13 | get_server_status, create_space_from_floor_print, create_example_osm, create_space_from_floor_print, create_space_from_floor_print, match_surfaces, list_surfaces, list_surfaces, save_osm_model | 19 | 3.4k | 228.1k | $0.1839 | 1 | +| envelope_retrofit | PASS | 58s | 17 | load_osm_model, list_surfaces, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, replace_window_constructions, list_model_objects, replace_window_constructions | 18 | 2.8k | 202.4k | $0.1653 | 1 | +| create_and_assign_loads | PASS | 27s | 7 | load_osm_model, list_spaces, create_people_definition, create_lights_definition | 12 | 1.1k | 106.8k | $0.0955 | 1 | +| plant_loop_with_boiler | PASS | 19s | 5 | load_osm_model, create_plant_loop, add_supply_equipment | 9 | 650 | 85.8k | $0.0801 | 1 | +| inspect_and_modify_boiler | PASS 
| 22s | 6 | load_osm_model, list_model_objects, get_object_fields, set_object_property | 10 | 913 | 108.8k | $0.0973 | 1 | +| extract_results_chain | PASS | 16s | 4 | extract_summary_metrics, extract_end_use_breakdown | 7 | 594 | 45.7k | $0.0639 | 1 | +| hvac_chilled_beam_comparison | PASS | 108s | 20 | load_osm_model, list_air_loops, replace_air_terminals, save_osm_model, run_simulation, get_run_status, get_weather_info, list_weather_files, change_building_location, save_osm_model, save_osm_model, run_simulation, get_run_status, get_run_status, extract_end_use_breakdown | 30 | 4.3k | 510.2k | $0.3184 | 1 | +| create_test_apply_measure | PASS | 24s | 6 | load_osm_model, create_measure, test_measure, apply_measure | 9 | 786 | 89.2k | $0.0872 | 1 | +| measure_set_lights_full_chain | PASS | 102s | 26 | load_osm_model, list_skills, get_skill, get_skill, save_osm_model, get_weather_info, run_simulation, create_measure, get_run_status, test_measure, extract_summary_metrics, load_osm_model, apply_measure, save_osm_model, run_simulation, get_run_status, get_run_status, extract_summary_metrics | 37 | 4.7k | 529.3k | $0.3198 | 1 | +| measure_set_infiltration_full_chain | PASS | 121s | 22 | load_osm_model, save_osm_model, run_simulation, create_measure, test_measure, get_run_status, search_api, create_measure, test_measure, extract_summary_metrics, load_osm_model, apply_measure, save_osm_model, run_simulation, get_run_status, get_run_status, extract_summary_metrics | 25 | 6.3k | 473.6k | $0.3220 | 1 | +| measure_replace_terminals_full_chain | PASS | 361s | 28 | load_osm_model, save_osm_model, run_simulation, get_run_status, get_run_status, extract_summary_metrics, load_osm_model, search_wiring_patterns, list_air_loops, list_plant_loops, search_api, create_measure, test_measure, apply_measure, save_osm_model, run_simulation, get_run_status, extract_summary_metrics, compare_runs, extract_end_use_breakdown, extract_end_use_breakdown | 31 | 7.7k | 555.3k | $0.3965 | 1 | +| 
create_measure_with_args | PASS | 62s | 3 | create_measure | 7 | 4.5k | 46.1k | $0.1379 | 1 | +| measure_add_baseboards_full_chain | PASS | 107s | 25 | load_osm_model, list_skills, get_skill, get_skill, list_thermal_zones, get_weather_info, save_osm_model, run_simulation, create_measure, test_measure, get_run_status, extract_summary_metrics, load_osm_model, apply_measure, save_osm_model, run_simulation, get_run_status, get_run_status, extract_summary_metrics | 32 | 4.8k | 508.7k | $0.3148 | 1 | +| ruby_measure_reduce_plugloads | PASS | 418s | 28 | load_osm_model, save_osm_model, run_simulation, get_run_status, extract_summary_metrics, load_osm_model, create_measure, test_measure, read_file, edit_measure, test_measure, edit_measure, test_measure, apply_measure, save_osm_model, run_simulation, get_run_status, extract_summary_metrics | 42 | 14.5k | 786.3k | $0.5765 | 1 | +| python_measure_reduce_plugloads | PASS | 231s | 29 | load_osm_model, save_osm_model, run_simulation, get_run_status, extract_summary_metrics, load_osm_model, create_measure, test_measure, read_file, read_file, edit_measure, read_file, test_measure, edit_measure, test_measure, edit_measure, test_measure, apply_measure, save_osm_model, run_simulation, get_run_status, extract_summary_metrics, compare_runs | 40 | 13.0k | 837.1k | $0.6027 | 1 | +| ruby_measure_boiler_efficiency | PASS | 332s | 26 | load_osm_model, save_osm_model, run_simulation, get_run_status, get_run_status, extract_summary_metrics, load_osm_model, create_measure, test_measure, read_file, edit_measure, read_file, test_measure, apply_measure, save_osm_model, run_simulation, get_run_status, extract_summary_metrics | 38 | 7.8k | 663.7k | $0.4194 | 1 | +| python_measure_boiler_efficiency | PASS | 142s | 23 | load_osm_model, save_osm_model, run_simulation, load_osm_model, create_measure, test_measure, get_run_status, read_file, edit_measure, test_measure, extract_summary_metrics, apply_measure, save_osm_model, run_simulation, 
get_run_status, get_run_status, extract_summary_metrics, compare_runs | 27 | 7.2k | 494.2k | $0.3528 | 1 | +| test_create_measure_with_args_quality | PASS | 92s | 3 | create_measure | 7 | 6.5k | 56.1k | $0.1408 | 1 | +| test_complex_model_multi_query | PASS | 28s | 8 | load_osm_model, get_building_info, list_air_loops, list_plant_loops, list_thermal_zones | 11 | 1.1k | 84.4k | $0.0908 | 1 | +| Ruby | FAIL | 86s | 3 | create_measure | 7 | 6.5k | 46.3k | $0.1753 | 1 | +| Python | FAIL | 73s | 3 | create_measure | 7 | 4.9k | 46.3k | $0.1461 | 1 | +| Ruby | FAIL | 38s | 3 | create_measure | 7 | 2.5k | 46.3k | $0.1013 | 1 | +| Python | PASS | 69s | 7 | create_measure, test_measure, edit_measure, test_measure | 13 | 4.3k | 140.6k | $0.1702 | 1 | + +### tier3 + +| Test | Result | Time | Turns | Tools | In Tok | Out Tok | Cache | Cost | Att | +|--------------------------------------------------|--------|------|-------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------|---------|--------|---------|-----| +| add-hvac:Add HVAC to the model | PASS | 42s | 15 | load_osm_model, get_building_info, list_thermal_zones, add_baseline_system, list_air_loops, list_plant_loops, save_osm_model | 23 | 1.9k | 222.9k | $0.1627 | 1 | +| add-hvac:Set up heating and cooling | PASS | 30s | 8 | load_osm_model, get_building_info, list_thermal_zones | 13 | 1.2k | 104.4k | $0.0976 | 1 | +| add-hvac:What HVAC system should I use? 
| PASS | 53s | 7 | load_osm_model, get_building_info, list_thermal_zones | 10 | 2.9k | 85.8k | $0.1240 | 1 | +| add-hvac:Add a VAV system | PASS | 17s | 5 | load_osm_model, list_thermal_zones, add_baseline_system | 9 | 792 | 86.2k | $0.0862 | 1 | +| energy-report:Give me a full energy report | FAIL | 120s | 0 | load_osm_model, list_files, get_building_info, get_model_summary, get_weather_info, run_simulation | 0 | 0 | 0 | $0.0000 | 1 | +| new-building:Create a small office building | PASS | 55s | 11 | list_skills, get_skill, list_weather_files, create_new_building, save_osm_model | 23 | 1.4k | 244.7k | $0.1787 | 1 | +| new-building:Model a 3-story school | PASS | 138s | 11 | list_skills, get_server_status, get_skill, list_weather_files, create_new_building, save_osm_model, get_model_summary | 17 | 3.2k | 200.6k | $0.2104 | 1 | +| new-building:Create a retail building, 25000 sqf | PASS | 180s | 0 | get_server_status, list_skills, get_skill, list_weather_files, create_new_building, change_building_location, create_typical_building, create_typical_building, list_thermal_zones, add_baseline_system, list_baseline_systems | 0 | 0 | 0 | $0.0000 | 1 | +| new-building:Import the FloorspaceJS floor plan | PASS | 24s | 6 | import_floorspacejs, list_files, import_floorspacejs | 12 | 860 | 106.1k | $0.0969 | 1 | +| new-building:Create a bar building for a medium | PASS | 20s | 3 | create_bar_building | 7 | 566 | 46.1k | $0.0725 | 1 | +| qaqc:Check the model for issues | FAIL | 21s | 5 | load_osm_model, validate_model | 11 | 548 | 84.7k | $0.0758 | 1 | +| qaqc:Validate before simulation | FAIL | 18s | 5 | load_osm_model, validate_model | 11 | 500 | 84.0k | $0.0775 | 1 | +| qaqc:QA/QC the model | PASS | 56s | 14 | load_osm_model, validate_model, get_model_summary, get_building_info, list_thermal_zones, list_spaces, get_weather_info, get_run_period | 17 | 2.7k | 149.7k | $0.1551 | 1 | +| qaqc:Is my model ready to simulate? 
| PASS | 51s | 14 | load_osm_model, validate_model, get_model_summary, get_building_info, get_weather_info, get_run_period, list_thermal_zones, list_spaces | 17 | 1.4k | 148.0k | $0.1274 | 1 | +| retrofit:Compare before and after adding ins | PASS | 180s | 0 | load_osm_model, get_model_summary, list_model_objects, list_common_measures, get_construction_details, get_construction_details, list_comstock_measures, list_measure_arguments, list_measure_arguments, list_measure_arguments, list_measure_arguments, apply_measure, apply_measure, get_object_fields, get_object_fields, list_materials, set_object_property, set_object_property, get_construction_details, get_construction_details, save_osm_model | 0 | 0 | 0 | $0.0000 | 1 | +| retrofit:Do a retrofit analysis | PASS | 180s | 0 | load_osm_model, list_skills, get_skill, get_building_info, get_model_summary, list_air_loops, get_weather_info, list_model_objects, save_osm_model, run_simulation, adjust_thermostat_setpoints, add_rooftop_pv, list_model_objects, shift_schedule_time, save_osm_model, get_run_status, run_simulation, extract_summary_metrics, extract_end_use_breakdown, get_run_status, extract_summary_metrics, extract_end_use_breakdown, compare_runs, generate_results_report, generate_results_report | 0 | 0 | 0 | $0.0000 | 1 | +| simulate:Run a simulation | PASS | 22s | 7 | load_osm_model, run_simulation, get_run_status, get_run_status | 13 | 838 | 123.3k | $0.0976 | 1 | +| simulate:Simulate the model | PASS | 116s | 8 | load_osm_model, run_simulation, get_run_status, get_run_status | 14 | 981 | 144.6k | $0.1037 | 1 | +| simulate:Run EnergyPlus | PASS | 27s | 6 | load_osm_model, run_simulation, get_run_status | 12 | 916 | 104.4k | $0.0894 | 1 | +| troubleshoot:My simulation failed | FAIL | 17s | 4 | load_osm_model, extract_simulation_errors | 7 | 551 | 45.9k | $0.0649 | 1 | +| troubleshoot:EUI looks way too high | PASS | 120s | 0 | load_osm_model, extract_summary_metrics, extract_end_use_breakdown, get_run_status, 
get_weather_info, get_run_logs, get_run_logs, extract_simulation_errors, change_building_location, change_building_location, save_osm_model, save_osm_model, run_simulation | 0 | 0 | 0 | $0.0000 | 1 | +| troubleshoot:Too many unmet hours | PASS | 120s | 0 | load_osm_model, extract_summary_metrics, get_run_status, list_thermal_zones, get_weather_info, get_schedule_details, get_schedule_details, extract_simulation_errors, get_run_logs, change_building_location, save_osm_model, save_osm_model, run_simulation, get_run_status | 0 | 0 | 0 | $0.0000 | 1 | +| troubleshoot:Why did EnergyPlus crash? | FAIL | 17s | 4 | load_osm_model, extract_simulation_errors | 7 | 537 | 45.9k | $0.0647 | 1 | +| view:Show me the model | PASS | 24s | 6 | load_osm_model, view_model, copy_file | 12 | 700 | 103.7k | $0.0845 | 1 | +| view:Visualize the building | PASS | 25s | 6 | load_osm_model, view_model, copy_file | 12 | 676 | 103.7k | $0.0840 | 1 | +| view:3D view | PASS | 30s | 6 | load_osm_model, view_model, copy_file | 12 | 615 | 103.3k | $0.0838 | 1 | + +### tier4 + +| Test | Result | Time | Turns | Tools | In Tok | Out Tok | Cache | Cost | Att | +|--------------------------------------------|--------|------|-------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------|---------|--------|---------|-----| +| test_create_uses_mcp_not_raw_idf | PASS | 165s | 15 | list_skills, get_skill, list_weather_files, create_new_building, create_new_building, create_bar_building, get_model_summary, change_building_location, create_typical_building, save_osm_model, save_osm_model, get_model_summary | 21 | 6.1k | 427.8k | $0.3384 | 1 | +| test_no_script_for_results | PASS | 14s | 3 | extract_summary_metrics | 7 | 339 | 45.5k | $0.0585 | 1 | +| test_inspect_component_uses_mcp_not_script | PASS | 24s | 8 | load_osm_model, 
list_model_objects, list_model_objects, list_model_objects, list_model_objects, get_component_properties | 9 | 1.0k | 85.0k | $0.0900 | 1 | + +### progressive + +| Test | Result | Time | Turns | Tools | In Tok | Out Tok | Cache | Cost | Att | +|-------------------------|--------|------|-------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------|---------|--------|---------|-----| +| import_floorplan_L1 | PASS | 64s | 7 | list_files, list_skills, get_skill, import_floorspacejs | 12 | 2.8k | 115.0k | $0.1445 | 1 | +| import_floorplan_L2 | PASS | 22s | 6 | import_floorspacejs, list_files, import_floorspacejs | 12 | 807 | 103.8k | $0.0946 | 1 | +| import_floorplan_L3 | PASS | 22s | 6 | import_floorspacejs, list_files, import_floorspacejs | 12 | 743 | 104.8k | $0.0897 | 1 | +| add_hvac_L1 | PASS | 50s | 15 | load_osm_model, list_skills, get_building_info, list_thermal_zones, add_baseline_system, list_air_loops, list_plant_loops, save_osm_model | 21 | 2.4k | 203.1k | $0.1680 | 1 | +| add_hvac_L2 | PASS | 20s | 5 | load_osm_model, list_thermal_zones, add_baseline_system | 9 | 799 | 86.2k | $0.0862 | 1 | +| add_hvac_L3 | PASS | 20s | 5 | load_osm_model, list_thermal_zones, add_baseline_system | 9 | 753 | 84.9k | $0.0899 | 1 | +| view_model_L1 | PASS | 23s | 6 | load_osm_model, view_model, copy_file | 12 | 648 | 103.7k | $0.0835 | 1 | +| view_model_L2 | PASS | 17s | 4 | load_osm_model, view_model | 8 | 467 | 64.2k | 
$0.0690 | 1 | +| view_model_L3 | PASS | 24s | 6 | load_osm_model, view_model, copy_file | 12 | 697 | 103.8k | $0.0845 | 1 | +| set_weather_L1 | PASS | 38s | 6 | load_osm_model, list_weather_files, change_building_location | 12 | 1.2k | 111.5k | $0.1261 | 1 | +| set_weather_L2 | PASS | 47s | 7 | load_osm_model, change_building_location, list_weather_files, change_building_location | 13 | 2.1k | 133.5k | $0.1487 | 1 | +| set_weather_L3 | PASS | 59s | 7 | load_osm_model, change_building_location, list_weather_files, change_building_location | 13 | 2.0k | 132.7k | $0.1487 | 1 | +| run_qaqc_L1 | PASS | 18s | 5 | load_osm_model, validate_model | 11 | 590 | 84.5k | $0.0774 | 1 | +| run_qaqc_L2 | PASS | 25s | 6 | load_osm_model, validate_model, run_qaqc_checks | 11 | 792 | 84.8k | $0.0804 | 1 | +| run_qaqc_L3 | PASS | 24s | 6 | load_osm_model, inspect_osm_summary, validate_model | 11 | 848 | 85.6k | $0.0835 | 1 | +| create_building_L1 | PASS | 80s | 12 | list_skills, get_skill, list_weather_files, create_new_building, change_building_location, create_typical_building, save_osm_model, get_model_summary, save_osm_model | 17 | 2.5k | 269.2k | $0.2103 | 1 | +| create_building_L2 | PASS | 120s | 0 | create_new_building, create_new_building, list_weather_files, change_building_location, create_typical_building | 0 | 0 | 0 | $0.0000 | 1 | +| create_building_L3 | PASS | 16s | 3 | create_bar_building | 7 | 458 | 46.2k | $0.0684 | 1 | +| add_pv_L1 | PASS | 20s | 4 | load_osm_model, add_rooftop_pv | 8 | 526 | 65.0k | $0.0696 | 1 | +| add_pv_L2 | PASS | 20s | 4 | load_osm_model, add_rooftop_pv | 8 | 521 | 64.9k | $0.0694 | 1 | +| add_pv_L3 | PASS | 16s | 4 | load_osm_model, add_rooftop_pv | 8 | 412 | 64.3k | $0.0694 | 1 | +| thermostat_L1 | PASS | 22s | 4 | load_osm_model, adjust_thermostat_setpoints | 8 | 442 | 64.9k | $0.0675 | 1 | +| thermostat_L2 | PASS | 15s | 4 | load_osm_model, adjust_thermostat_setpoints | 8 | 413 | 65.0k | $0.0670 | 1 | +| thermostat_L3 | PASS | 20s | 4 | 
load_osm_model, adjust_thermostat_setpoints | 8 | 419 | 64.4k | $0.0693 | 1 | +| list_spaces_L1 | PASS | 17s | 4 | load_osm_model, list_spaces | 8 | 533 | 65.1k | $0.0709 | 1 | +| list_spaces_L2 | PASS | 17s | 4 | load_osm_model, list_spaces | 8 | 695 | 64.4k | $0.0758 | 1 | +| list_spaces_L3 | PASS | 14s | 4 | load_osm_model, list_spaces | 8 | 701 | 64.2k | $0.0767 | 1 | +| schedules_L1 | PASS | 23s | 5 | load_osm_model, list_model_objects, list_model_objects | 9 | 874 | 85.7k | $0.0834 | 1 | +| schedules_L2 | PASS | 17s | 4 | load_osm_model, list_model_objects | 8 | 646 | 65.4k | $0.0726 | 1 | +| schedules_L3 | PASS | 18s | 4 | load_osm_model, list_model_objects | 8 | 613 | 65.4k | $0.0721 | 1 | +| inspect_component_L1 | PASS | 20s | 5 | load_osm_model, list_model_objects, get_component_properties | 9 | 570 | 85.4k | $0.0776 | 1 | +| inspect_component_L2 | PASS | 20s | 5 | load_osm_model, list_model_objects, get_component_properties | 9 | 596 | 85.2k | $0.0786 | 1 | +| inspect_component_L3 | PASS | 29s | 7 | load_osm_model, get_object_fields, list_model_objects, get_object_fields | 13 | 1.0k | 124.2k | $0.1031 | 1 | +| modify_component_L1 | PASS | 30s | 8 | load_osm_model, list_model_objects, get_component_properties, set_component_properties, save_osm_model | 14 | 878 | 147.4k | $0.1042 | 1 | +| modify_component_L2 | PASS | 21s | 5 | load_osm_model, list_model_objects, set_component_properties | 9 | 543 | 85.2k | $0.0786 | 1 | +| modify_component_L3 | PASS | 22s | 7 | load_osm_model, set_object_property, list_model_objects, set_object_property | 13 | 859 | 125.5k | $0.0961 | 1 | +| list_dynamic_type_L1 | PASS | 34s | 17 | load_osm_model, list_air_loops, list_thermal_zones, get_sizing_system_properties, get_sizing_zone_properties, get_sizing_zone_properties, get_sizing_zone_properties, get_sizing_zone_properties, get_sizing_zone_properties, get_sizing_zone_properties, get_sizing_zone_properties, get_sizing_zone_properties, get_sizing_zone_properties, 
get_sizing_zone_properties | 12 | 1.9k | 93.2k | $0.1668 | 1 | +| list_dynamic_type_L2 | PASS | 15s | 4 | load_osm_model, list_model_objects | 8 | 475 | 65.4k | $0.0686 | 1 | +| list_dynamic_type_L3 | PASS | 16s | 4 | load_osm_model, list_model_objects | 8 | 524 | 65.4k | $0.0693 | 1 | +| floor_area_L1 | PASS | 18s | 4 | load_osm_model, get_building_info | 8 | 472 | 64.7k | $0.0681 | 1 | +| floor_area_L2 | PASS | 14s | 4 | load_osm_model, get_building_info | 8 | 344 | 64.7k | $0.0662 | 1 | +| floor_area_L3 | PASS | 15s | 4 | load_osm_model, get_building_info | 8 | 445 | 64.8k | $0.0679 | 1 | +| materials_L1 | PASS | 22s | 4 | load_osm_model, list_materials | 8 | 857 | 64.7k | $0.0776 | 1 | +| materials_L2 | PASS | 20s | 4 | load_osm_model, list_materials | 8 | 617 | 64.9k | $0.0735 | 1 | +| materials_L3 | PASS | 20s | 4 | load_osm_model, list_materials | 8 | 840 | 64.9k | $0.0767 | 1 | +| thermal_zones_L1 | FAIL | 17s | 3 | load_osm_model | 7 | 301 | 45.7k | $0.0585 | 1 | +| thermal_zones_L2 | PASS | 16s | 4 | load_osm_model, list_thermal_zones | 8 | 730 | 65.0k | $0.0735 | 1 | +| thermal_zones_L3 | PASS | 18s | 4 | load_osm_model, list_thermal_zones | 8 | 641 | 65.0k | $0.0721 | 1 | +| subsurfaces_L1 | PASS | 14s | 4 | load_osm_model, list_subsurfaces | 8 | 378 | 64.4k | $0.0687 | 1 | +| subsurfaces_L2 | PASS | 14s | 4 | load_osm_model, list_subsurfaces | 8 | 435 | 65.2k | $0.0672 | 1 | +| subsurfaces_L3 | PASS | 16s | 4 | load_osm_model, list_subsurfaces | 8 | 420 | 65.0k | $0.0677 | 1 | +| surface_details_L1 | PASS | 23s | 6 | load_osm_model, list_surfaces, get_surface_details, get_surface_details | 9 | 935 | 85.6k | $0.0929 | 1 | +| surface_details_L2 | PASS | 20s | 5 | load_osm_model, list_surfaces, get_surface_details | 9 | 748 | 85.1k | $0.0799 | 1 | +| surface_details_L3 | PASS | 21s | 4 | load_osm_model, list_surfaces | 8 | 846 | 65.2k | $0.1159 | 1 | +| run_simulation_L1 | PASS | 300s | 0 | load_osm_model, run_simulation, get_run_status, 
extract_simulation_errors, validate_model, list_air_loops, list_thermal_zones, delete_object, save_osm_model, run_simulation, get_run_status, extract_simulation_errors, delete_object, delete_object, clean_unused_objects, save_osm_model, save_osm_model, run_simulation | 0 | 0 | 0 | $0.0000 | 1 | +| run_simulation_L2 | PASS | 116s | 7 | load_osm_model, run_simulation, get_run_status | 13 | 803 | 124.2k | $0.0934 | 1 | +| run_simulation_L3 | PASS | 153s | 1 | load_osm_model, run_simulation, get_run_status, get_run_status | 3 | 105 | 20.6k | $0.1129 | 1 | +| get_eui_L1 | PASS | 25s | 6 | extract_summary_metrics, get_run_status, extract_end_use_breakdown | 11 | 713 | 84.1k | $0.0795 | 1 | +| get_eui_L2 | PASS | 24s | 6 | extract_summary_metrics, get_run_status, extract_simulation_errors | 11 | 705 | 84.0k | $0.0807 | 1 | +| get_eui_L3 | PASS | 14s | 3 | extract_summary_metrics | 7 | 397 | 45.5k | $0.0593 | 1 | +| end_use_breakdown_L1 | PASS | 30s | 9 | extract_end_use_breakdown, extract_end_use_breakdown, extract_summary_metrics, get_run_status, get_run_artifacts | 15 | 1.1k | 123.7k | $0.1039 | 1 | +| end_use_breakdown_L2 | PASS | 22s | 6 | extract_end_use_breakdown, get_run_status, extract_summary_metrics | 11 | 792 | 83.4k | $0.0819 | 1 | +| end_use_breakdown_L3 | PASS | 14s | 3 | extract_end_use_breakdown | 7 | 355 | 45.5k | $0.0580 | 1 | +| hvac_sizing_L1 | PASS | 24s | 6 | extract_hvac_sizing, extract_component_sizing, extract_simulation_errors | 11 | 907 | 83.5k | $0.0825 | 1 | +| hvac_sizing_L2 | PASS | 13s | 3 | extract_hvac_sizing | 7 | 408 | 45.1k | $0.0596 | 1 | +| hvac_sizing_L3 | PASS | 15s | 3 | extract_hvac_sizing | 7 | 459 | 45.4k | $0.0594 | 1 | +| set_wwr_L1 | PASS | 27s | 13 | load_osm_model, list_surfaces, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio | 12 | 1.5k | 105.1k | $0.1080 | 1 | +| 
set_wwr_L2 | PASS | 35s | 15 | load_osm_model, list_surfaces, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, save_osm_model | 16 | 1.6k | 150.3k | $0.1266 | 1 | +| set_wwr_L3 | PASS | 29s | 13 | load_osm_model, list_surfaces, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio, set_window_to_wall_ratio | 12 | 1.6k | 105.1k | $0.1084 | 1 | +| replace_windows_L1 | PASS | 120s | 0 | load_osm_model, list_model_objects, get_construction_details, list_model_objects, get_construction_details, list_common_measures, list_measure_arguments, list_files, list_measure_arguments | 0 | 0 | 0 | $0.0000 | 1 | +| replace_windows_L2 | PASS | 36s | 6 | load_osm_model, list_model_objects, replace_window_constructions | 12 | 1.5k | 105.1k | $0.1010 | 1 | +| replace_windows_L3 | PASS | 37s | 6 | load_osm_model, list_model_objects, replace_window_constructions | 12 | 1.3k | 105.6k | $0.0993 | 1 | +| construction_details_L1 | PASS | 23s | 5 | load_osm_model, list_surfaces, get_construction_details | 9 | 660 | 84.9k | $0.0799 | 1 | +| construction_details_L2 | PASS | 28s | 5 | load_osm_model, list_surfaces, get_construction_details | 9 | 804 | 84.4k | $0.0848 | 1 | +| construction_details_L3 | PASS | 39s | 17 | load_osm_model, list_model_objects, get_construction_details, get_construction_details, get_construction_details, get_construction_details, get_construction_details, get_construction_details, get_construction_details, get_construction_details, get_construction_details, get_construction_details, get_construction_details, get_construction_details | 12 | 1.9k | 92.8k | $0.1590 | 1 | +| check_loads_L1 | PASS | 29s | 8 | load_osm_model, list_spaces, get_space_details, get_space_type_details | 16 | 1.0k | 
144.1k | $0.1056 | 1 | +| check_loads_L2 | PASS | 31s | 8 | load_osm_model, list_spaces, get_space_type_details, get_load_details, get_load_details | 13 | 1.1k | 126.3k | $0.1067 | 1 | +| check_loads_L3 | PASS | 33s | 10 | load_osm_model, list_model_objects, list_model_objects, list_model_objects, get_load_details, get_load_details, get_load_details | 12 | 1.1k | 104.8k | $0.0956 | 1 | +| create_loads_L1 | PASS | 84s | 29 | load_osm_model, get_model_summary, list_spaces, get_space_type_details, create_people_definition, create_lights_definition, create_people_definition, create_lights_definition, create_people_definition, create_lights_definition, create_people_definition, create_lights_definition, create_people_definition, create_lights_definition, create_people_definition, create_lights_definition, create_people_definition, create_lights_definition, create_people_definition, create_lights_definition, create_people_definition, create_lights_definition, create_people_definition, create_lights_definition, save_osm_model | 17 | 5.3k | 179.3k | $0.2729 | 1 | +| create_loads_L2 | PASS | 47s | 24 | load_osm_model, list_spaces, create_people_definition, create_people_definition, create_people_definition, create_people_definition, create_people_definition, create_people_definition, create_people_definition, create_people_definition, create_people_definition, create_people_definition, create_lights_definition, create_lights_definition, create_lights_definition, create_lights_definition, create_lights_definition, create_lights_definition, create_lights_definition, create_lights_definition, create_lights_definition, create_lights_definition | 10 | 3.3k | 108.9k | $0.1513 | 1 | +| create_loads_L3 | PASS | 26s | 6 | load_osm_model, list_spaces, create_people_definition | 12 | 752 | 104.7k | $0.0875 | 1 | +| create_plant_loop_L1 | PASS | 15s | 4 | load_osm_model, create_plant_loop | 8 | 466 | 65.3k | $0.0683 | 1 | +| create_plant_loop_L2 | PASS | 16s | 4 | load_osm_model, 
create_plant_loop | 8 | 475 | 65.4k | $0.0686 | 1 | +| create_plant_loop_L3 | PASS | 17s | 5 | load_osm_model, create_plant_loop, create_plant_loop | 9 | 627 | 84.8k | $0.0773 | 1 | +| schedule_details_L1 | PASS | 120s | 0 | load_osm_model, list_air_loops, get_air_loop_details, get_component_properties, get_object_fields, get_object_fields, list_model_objects, list_model_objects, list_model_objects, get_schedule_details, get_schedule_details, get_thermal_zone_details, get_thermal_zone_details, get_object_fields, get_object_fields, read_file, read_file, read_file, read_file | 0 | 0 | 0 | $0.0000 | 1 | +| schedule_details_L2 | PASS | 63s | 14 | load_osm_model, list_model_objects, list_model_objects, list_model_objects, list_air_loops, get_air_loop_details, get_component_properties, get_schedule_details, get_setpoint_manager_properties, get_setpoint_manager_properties | 22 | 2.6k | 286.1k | $0.1847 | 1 | +| schedule_details_L3 | PASS | 40s | 9 | load_osm_model, list_model_objects, get_schedule_details, get_schedule_details, get_schedule_details, get_schedule_details | 12 | 1.4k | 104.6k | $0.1030 | 1 | +| space_type_info_L1 | PASS | 31s | 6 | load_osm_model, get_model_summary, list_spaces, get_space_type_details | 9 | 1.1k | 87.7k | $0.0953 | 1 | +| space_type_info_L2 | PASS | 28s | 6 | load_osm_model, list_model_objects, get_space_type_details | 12 | 884 | 104.3k | $0.0889 | 1 | +| space_type_info_L3 | PASS | 33s | 6 | load_osm_model, list_model_objects, get_space_type_details | 12 | 941 | 104.2k | $0.0907 | 1 | +| set_run_period_L1 | PASS | 18s | 4 | load_osm_model, set_run_period | 8 | 488 | 64.1k | $0.0710 | 1 | +| set_run_period_L2 | PASS | 14s | 4 | load_osm_model, set_run_period | 8 | 455 | 65.0k | $0.0674 | 1 | +| set_run_period_L3 | PASS | 14s | 4 | load_osm_model, set_run_period | 8 | 508 | 65.2k | $0.0685 | 1 | +| ideal_air_L1 | PASS | 27s | 4 | load_osm_model, enable_ideal_air_loads | 8 | 881 | 64.7k | $0.0731 | 1 | +| ideal_air_L2 | PASS | 40s | 6 | 
load_osm_model, enable_ideal_air_loads, list_zone_hvac_equipment | 12 | 1.1k | 103.9k | $0.0940 | 1 | +| ideal_air_L3 | PASS | 22s | 4 | load_osm_model, enable_ideal_air_loads | 8 | 725 | 64.2k | $0.0723 | 1 | +| save_model_L1 | PASS | 15s | 4 | load_osm_model, save_osm_model | 8 | 324 | 64.5k | $0.0653 | 1 | +| save_model_L2 | PASS | 15s | 4 | load_osm_model, save_osm_model | 8 | 407 | 64.7k | $0.0668 | 1 | +| save_model_L3 | PASS | 17s | 4 | load_osm_model, save_osm_model | 8 | 418 | 64.7k | $0.0670 | 1 | +| add_ev_L1 | PASS | 22s | 4 | load_osm_model, add_ev_load | 8 | 551 | 65.0k | $0.0731 | 1 | +| add_ev_L2 | PASS | 31s | 5 | load_osm_model, list_spaces, add_ev_load | 9 | 970 | 86.1k | $0.0909 | 1 | +| add_ev_L3 | PASS | 17s | 4 | load_osm_model, add_ev_load | 8 | 421 | 65.1k | $0.0708 | 1 | +| list_measures_L1 | PASS | 16s | 3 | list_custom_measures | 7 | 387 | 45.4k | $0.0594 | 1 | +| list_measures_L2 | PASS | 13s | 3 | list_custom_measures | 7 | 383 | 45.1k | $0.0603 | 1 | + +## Progressive Prompt Analysis + +Pass rates by specificity level per case: + +| Case | L1 (vague) | L2 (moderate) | L3 (explicit) | +|----------------------|------------|---------------|---------------| +| import_floorplan | PASS | PASS | PASS | +| add_hvac | PASS | PASS | PASS | +| view_model | PASS | PASS | PASS | +| set_weather | PASS | PASS | PASS | +| run_qaqc | PASS | PASS | PASS | +| create_building | PASS | PASS | PASS | +| add_pv | PASS | PASS | PASS | +| thermostat | PASS | PASS | PASS | +| list_spaces | PASS | PASS | PASS | +| schedules | PASS | PASS | PASS | +| inspect_component | PASS | PASS | PASS | +| modify_component | PASS | PASS | PASS | +| list_dynamic_type | PASS | PASS | PASS | +| floor_area | PASS | PASS | PASS | +| materials | PASS | PASS | PASS | +| thermal_zones | FAIL | PASS | PASS | +| subsurfaces | PASS | PASS | PASS | +| surface_details | PASS | PASS | PASS | +| run_simulation | PASS | PASS | PASS | +| get_eui | PASS | PASS | PASS | +| end_use_breakdown | 
PASS | PASS | PASS | +| hvac_sizing | PASS | PASS | PASS | +| set_wwr | PASS | PASS | PASS | +| replace_windows | PASS | PASS | PASS | +| construction_details | PASS | PASS | PASS | +| check_loads | PASS | PASS | PASS | +| create_loads | PASS | PASS | PASS | +| create_plant_loop | PASS | PASS | PASS | +| schedule_details | PASS | PASS | PASS | +| space_type_info | PASS | PASS | PASS | +| set_run_period | PASS | PASS | PASS | +| ideal_air | PASS | PASS | PASS | +| save_model | PASS | PASS | PASS | +| add_ev | PASS | PASS | PASS | +| list_measures | PASS | PASS | - | + +**Summary:** L1=34/35 | L2=35/35 | L3=34/35 + +## Tool Discovery Overhead + +| Metric | Value | +|--------|-------| +| Avg ToolSearch calls/test | 1.9 | +| Max ToolSearch calls | 10 | +| Tests with 0 ToolSearch | 0/180 | + +## Failure Mode Analysis + +| Mode | Count | Description | +|------|-------|-------------| +| wrong_tool | 9 | MCP tool called but not the expected one | +| timeout | 1 | Timed out before completing | + +## Failed Tests + +- **energy-report:Give me a full energy report** (tier3, timeout): 120s, 0 turns, tools: load_osm_model -> list_files -> get_building_info -> get_model_summary -> get_weather_info -> run_simulation +- **qaqc:Check the model for issues** (tier3, wrong_tool): 21s, 5 turns, tools: load_osm_model -> validate_model +- **qaqc:Validate before simulation** (tier3, wrong_tool): 18s, 5 turns, tools: load_osm_model -> validate_model +- **troubleshoot:My simulation failed** (tier3, wrong_tool): 17s, 4 turns, tools: load_osm_model -> extract_simulation_errors +- **troubleshoot:Why did EnergyPlus crash?** (tier3, wrong_tool): 17s, 4 turns, tools: load_osm_model -> extract_simulation_errors +- **systemd_fourpipebeam_e2e** (tier2, wrong_tool): 578s, 6 turns, tools: load_osm_model -> view_model -> copy_file +- **Ruby** (tier2, wrong_tool): 86s, 3 turns, tools: create_measure +- **Python** (tier2, wrong_tool): 73s, 3 turns, tools: create_measure +- **Ruby** (tier2, wrong_tool): 
38s, 3 turns, tools: create_measure +- **thermal_zones_L1** (progressive, wrong_tool): 17s, 3 turns, tools: load_osm_model diff --git a/docs/sweeps/sonnet-2026-03-28/benchmark_history.json b/docs/sweeps/sonnet-2026-03-28/benchmark_history.json new file mode 100644 index 0000000..ffa9c9c --- /dev/null +++ b/docs/sweeps/sonnet-2026-03-28/benchmark_history.json @@ -0,0 +1,54 @@ +[ + { + "timestamp": "2026-03-28T17:06:27+00:00", + "model": "sonnet", + "retries": 0, + "total_tests": 180, + "passed": 170, + "failed": 10, + "pass_rate": 94.4, + "total_duration_s": 9452.9, + "total_input_tokens": 1959, + "total_output_tokens": 250127, + "total_cache_read_tokens": 20447621, + "total_cost_usd": 18.9595, + "tiers": { + "setup": { + "total": 6, + "passed": 6, + "duration_s": 420.6, + "pass_rate": 100.0 + }, + "tier1": { + "total": 4, + "passed": 4, + "duration_s": 130.0, + "pass_rate": 100.0 + }, + "tier3": { + "total": 26, + "passed": 21, + "duration_s": 1702.9, + "pass_rate": 80.8 + }, + "tier2": { + "total": 37, + "passed": 33, + "duration_s": 3600.4, + "pass_rate": 89.2 + }, + "tier4": { + "total": 3, + "passed": 3, + "duration_s": 202.8, + "pass_rate": 100.0 + }, + "progressive": { + "total": 104, + "passed": 103, + "duration_s": 3396.2, + "pass_rate": 99.0 + } + } + } +] \ No newline at end of file diff --git a/docs/sweeps/sonnet-2026-03-28/sweep.log b/docs/sweeps/sonnet-2026-03-28/sweep.log new file mode 100644 index 0000000..e4db65b --- /dev/null +++ b/docs/sweeps/sonnet-2026-03-28/sweep.log @@ -0,0 +1,863 @@ +============================= test session starts ============================= +platform win32 -- Python 3.13.12, pytest-9.0.2, pluggy-1.6.0 -- C:\Python313\python.exe +cachedir: .pytest_cache +rootdir: C:\projects\openstudio-mcp +configfile: pyproject.toml +plugins: anyio-4.12.1, cov-7.0.0, timeout-2.4.0 +collecting ... 
collected 230 items + +tests/llm/test_01_setup.py::test_create_baseline_model PASSED [ 0%] +tests/llm/test_01_setup.py::test_create_baseline_with_hvac PASSED [ 0%] +tests/llm/test_01_setup.py::test_create_example_model PASSED [ 1%] +tests/llm/test_01_setup.py::test_load_baseline_model PASSED [ 1%] +tests/llm/test_01_setup.py::test_run_baseline_simulation PASSED [ 2%] +tests/llm/test_01_setup.py::test_run_retrofit_simulation PASSED [ 2%] +tests/llm/test_02_tool_selection.py::test_tool_selection_no_model[What is the server status?] PASSED [ 3%] +tests/llm/test_02_tool_selection.py::test_tool_selection_no_model[List available skills] PASSED [ 3%] +tests/llm/test_02_tool_selection.py::test_tool_selection_no_model[Create a small office building usin] PASSED [ 3%] +tests/llm/test_02_tool_selection.py::test_tool_selection_no_model[Create bar geometry for a retail bu] PASSED [ 4%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[add-hvac:Add HVAC to the model] PASSED [ 4%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[add-hvac:Set up heating and cooling] PASSED [ 5%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[add-hvac:What HVAC system should I use?] 
PASSED [ 5%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[add-hvac:Add a VAV system] PASSED [ 6%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[energy-report:Give me a full energy report] FAILED [ 6%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[new-building:Create a small office building] PASSED [ 6%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[new-building:Model a 3-story school] PASSED [ 7%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[new-building:Create a retail building, 25000 sqf] PASSED [ 7%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[new-building:Import the FloorspaceJS floor plan ] PASSED [ 8%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[new-building:Create a bar building for a medium ] PASSED [ 8%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[qaqc:Check the model for issues] FAILED [ 9%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[qaqc:Validate before simulation] FAILED [ 9%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[qaqc:QA/QC the model] PASSED [ 10%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[qaqc:Is my model ready to simulate?] 
PASSED [ 10%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[retrofit:Compare before and after adding ins] PASSED [ 10%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[retrofit:Do a retrofit analysis] PASSED [ 11%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[simulate:Run a simulation] PASSED [ 11%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[simulate:Simulate the model] PASSED [ 12%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[simulate:Run EnergyPlus] PASSED [ 12%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[troubleshoot:My simulation failed] FAILED [ 13%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[troubleshoot:EUI looks way too high] PASSED [ 13%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[troubleshoot:Too many unmet hours] PASSED [ 13%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[troubleshoot:Why did EnergyPlus crash?] FAILED [ 14%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[view:Show me the model] PASSED [ 14%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[view:Visualize the building] PASSED [ 15%] +tests/llm/test_03_eval_cases.py::test_eval_tool_selection[view:3D view] PASSED [ 15%] +tests/llm/test_04_workflows.py::test_workflow[systemd_fourpipebeam_e2e] FAILED [ 16%] +tests/llm/test_04_workflows.py::test_workflow[add_vav_reheat] PASSED [ 16%] +tests/llm/test_04_workflows.py::test_workflow[add_doas] PASSED [ 16%] +tests/llm/test_04_workflows.py::test_workflow[add_vrf] PASSED [ 17%] +tests/llm/test_04_workflows.py::test_workflow[set_weather] PASSED [ 17%] +tests/llm/test_04_workflows.py::test_workflow[add_rooftop_pv] PASSED [ 18%] +tests/llm/test_04_workflows.py::test_workflow[adjust_thermostat] PASSED [ 18%] +tests/llm/test_04_workflows.py::test_workflow[delete_space] PASSED [ 19%] +tests/llm/test_04_workflows.py::test_workflow[qaqc_check] PASSED [ 19%] 
+tests/llm/test_04_workflows.py::test_workflow[create_bar_office] PASSED [ 20%] +tests/llm/test_04_workflows.py::test_workflow[create_new_building] PASSED [ 20%] +tests/llm/test_04_workflows.py::test_workflow[bar_then_typical] PASSED [ 20%] +tests/llm/test_04_workflows.py::test_workflow[import_floorspacejs] PASSED [ 21%] +tests/llm/test_04_workflows.py::test_workflow[floorspacejs_to_typical] PASSED [ 21%] +tests/llm/test_04_workflows.py::test_workflow[manual_geometry_match] PASSED [ 22%] +tests/llm/test_04_workflows.py::test_workflow[envelope_retrofit] PASSED [ 22%] +tests/llm/test_04_workflows.py::test_workflow[create_and_assign_loads] PASSED [ 23%] +tests/llm/test_04_workflows.py::test_workflow[plant_loop_with_boiler] PASSED [ 23%] +tests/llm/test_04_workflows.py::test_workflow[inspect_and_modify_boiler] PASSED [ 23%] +tests/llm/test_04_workflows.py::test_workflow[extract_results_chain] PASSED [ 24%] +tests/llm/test_04_workflows.py::test_workflow[hvac_chilled_beam_comparison] PASSED [ 24%] +tests/llm/test_04_workflows.py::test_workflow[create_test_apply_measure] PASSED [ 25%] +tests/llm/test_04_workflows.py::test_workflow[measure_set_lights_full_chain] PASSED [ 25%] +tests/llm/test_04_workflows.py::test_workflow[measure_set_infiltration_full_chain] PASSED [ 26%] +tests/llm/test_04_workflows.py::test_workflow[measure_replace_terminals_full_chain] PASSED [ 26%] +tests/llm/test_04_workflows.py::test_workflow[create_measure_with_args] PASSED [ 26%] +tests/llm/test_04_workflows.py::test_workflow[measure_add_baseboards_full_chain] PASSED [ 27%] +tests/llm/test_04_workflows.py::test_workflow[ruby_measure_reduce_plugloads] PASSED [ 27%] +tests/llm/test_04_workflows.py::test_workflow[python_measure_reduce_plugloads] PASSED [ 28%] +tests/llm/test_04_workflows.py::test_workflow[ruby_measure_boiler_efficiency] PASSED [ 28%] +tests/llm/test_04_workflows.py::test_workflow[python_measure_boiler_efficiency] PASSED [ 29%] 
+tests/llm/test_04_workflows.py::test_create_measure_with_args_quality PASSED [ 29%] +tests/llm/test_04_workflows.py::test_complex_model_multi_query PASSED [ 30%] +tests/llm/test_04_workflows.py::test_measure_reduce_plugloads_quality[Ruby] FAILED [ 30%] +tests/llm/test_04_workflows.py::test_measure_reduce_plugloads_quality[Python] FAILED [ 30%] +tests/llm/test_04_workflows.py::test_measure_boiler_efficiency_quality[Ruby] FAILED [ 31%] +tests/llm/test_04_workflows.py::test_measure_boiler_efficiency_quality[Python] PASSED [ 31%] +tests/llm/test_05_guardrails.py::test_create_uses_mcp_not_raw_idf PASSED [ 32%] +tests/llm/test_05_guardrails.py::test_no_script_for_results PASSED [ 32%] +tests/llm/test_05_guardrails.py::test_inspect_component_uses_mcp_not_script PASSED [ 33%] +tests/llm/test_06_progressive.py::test_progressive[import_floorplan_L1] PASSED [ 33%] +tests/llm/test_06_progressive.py::test_progressive[import_floorplan_L2] PASSED [ 33%] +tests/llm/test_06_progressive.py::test_progressive[import_floorplan_L3] PASSED [ 34%] +tests/llm/test_06_progressive.py::test_progressive[add_hvac_L1] PASSED [ 34%] +tests/llm/test_06_progressive.py::test_progressive[add_hvac_L2] PASSED [ 35%] +tests/llm/test_06_progressive.py::test_progressive[add_hvac_L3] PASSED [ 35%] +tests/llm/test_06_progressive.py::test_progressive[view_model_L1] PASSED [ 36%] +tests/llm/test_06_progressive.py::test_progressive[view_model_L2] PASSED [ 36%] +tests/llm/test_06_progressive.py::test_progressive[view_model_L3] PASSED [ 36%] +tests/llm/test_06_progressive.py::test_progressive[set_weather_L1] PASSED [ 37%] +tests/llm/test_06_progressive.py::test_progressive[set_weather_L2] PASSED [ 37%] +tests/llm/test_06_progressive.py::test_progressive[set_weather_L3] PASSED [ 38%] +tests/llm/test_06_progressive.py::test_progressive[run_qaqc_L1] PASSED [ 38%] +tests/llm/test_06_progressive.py::test_progressive[run_qaqc_L2] PASSED [ 39%] +tests/llm/test_06_progressive.py::test_progressive[run_qaqc_L3] PASSED [ 
39%] +tests/llm/test_06_progressive.py::test_progressive[create_building_L1] PASSED [ 40%] +tests/llm/test_06_progressive.py::test_progressive[create_building_L2] PASSED [ 40%] +tests/llm/test_06_progressive.py::test_progressive[create_building_L3] PASSED [ 40%] +tests/llm/test_06_progressive.py::test_progressive[add_pv_L1] PASSED [ 41%] +tests/llm/test_06_progressive.py::test_progressive[add_pv_L2] PASSED [ 41%] +tests/llm/test_06_progressive.py::test_progressive[add_pv_L3] PASSED [ 42%] +tests/llm/test_06_progressive.py::test_progressive[thermostat_L1] PASSED [ 42%] +tests/llm/test_06_progressive.py::test_progressive[thermostat_L2] PASSED [ 43%] +tests/llm/test_06_progressive.py::test_progressive[thermostat_L3] PASSED [ 43%] +tests/llm/test_06_progressive.py::test_progressive[list_spaces_L1] PASSED [ 43%] +tests/llm/test_06_progressive.py::test_progressive[list_spaces_L2] PASSED [ 44%] +tests/llm/test_06_progressive.py::test_progressive[list_spaces_L3] PASSED [ 44%] +tests/llm/test_06_progressive.py::test_progressive[schedules_L1] PASSED [ 45%] +tests/llm/test_06_progressive.py::test_progressive[schedules_L2] PASSED [ 45%] +tests/llm/test_06_progressive.py::test_progressive[schedules_L3] PASSED [ 46%] +tests/llm/test_06_progressive.py::test_progressive[inspect_component_L1] PASSED [ 46%] +tests/llm/test_06_progressive.py::test_progressive[inspect_component_L2] PASSED [ 46%] +tests/llm/test_06_progressive.py::test_progressive[inspect_component_L3] PASSED [ 47%] +tests/llm/test_06_progressive.py::test_progressive[modify_component_L1] PASSED [ 47%] +tests/llm/test_06_progressive.py::test_progressive[modify_component_L2] PASSED [ 48%] +tests/llm/test_06_progressive.py::test_progressive[modify_component_L3] PASSED [ 48%] +tests/llm/test_06_progressive.py::test_progressive[list_dynamic_type_L1] PASSED [ 49%] +tests/llm/test_06_progressive.py::test_progressive[list_dynamic_type_L2] PASSED [ 49%] +tests/llm/test_06_progressive.py::test_progressive[list_dynamic_type_L3] 
PASSED [ 50%] +tests/llm/test_06_progressive.py::test_progressive[floor_area_L1] PASSED [ 50%] +tests/llm/test_06_progressive.py::test_progressive[floor_area_L2] PASSED [ 50%] +tests/llm/test_06_progressive.py::test_progressive[floor_area_L3] PASSED [ 51%] +tests/llm/test_06_progressive.py::test_progressive[materials_L1] PASSED [ 51%] +tests/llm/test_06_progressive.py::test_progressive[materials_L2] PASSED [ 52%] +tests/llm/test_06_progressive.py::test_progressive[materials_L3] PASSED [ 52%] +tests/llm/test_06_progressive.py::test_progressive[thermal_zones_L1] FAILED [ 53%] +tests/llm/test_06_progressive.py::test_progressive[thermal_zones_L2] PASSED [ 53%] +tests/llm/test_06_progressive.py::test_progressive[thermal_zones_L3] PASSED [ 53%] +tests/llm/test_06_progressive.py::test_progressive[subsurfaces_L1] PASSED [ 54%] +tests/llm/test_06_progressive.py::test_progressive[subsurfaces_L2] PASSED [ 54%] +tests/llm/test_06_progressive.py::test_progressive[subsurfaces_L3] PASSED [ 55%] +tests/llm/test_06_progressive.py::test_progressive[surface_details_L1] PASSED [ 55%] +tests/llm/test_06_progressive.py::test_progressive[surface_details_L2] PASSED [ 56%] +tests/llm/test_06_progressive.py::test_progressive[surface_details_L3] PASSED [ 56%] +tests/llm/test_06_progressive.py::test_progressive[run_simulation_L1] PASSED [ 56%] +tests/llm/test_06_progressive.py::test_progressive[run_simulation_L2] PASSED [ 57%] +tests/llm/test_06_progressive.py::test_progressive[run_simulation_L3] PASSED [ 57%] +tests/llm/test_06_progressive.py::test_progressive[get_eui_L1] PASSED [ 58%] +tests/llm/test_06_progressive.py::test_progressive[get_eui_L2] PASSED [ 58%] +tests/llm/test_06_progressive.py::test_progressive[get_eui_L3] PASSED [ 59%] +tests/llm/test_06_progressive.py::test_progressive[end_use_breakdown_L1] PASSED [ 59%] +tests/llm/test_06_progressive.py::test_progressive[end_use_breakdown_L2] PASSED [ 60%] +tests/llm/test_06_progressive.py::test_progressive[end_use_breakdown_L3] PASSED 
[ 60%] +tests/llm/test_06_progressive.py::test_progressive[hvac_sizing_L1] PASSED [ 60%] +tests/llm/test_06_progressive.py::test_progressive[hvac_sizing_L2] PASSED [ 61%] +tests/llm/test_06_progressive.py::test_progressive[hvac_sizing_L3] PASSED [ 61%] +tests/llm/test_06_progressive.py::test_progressive[set_wwr_L1] PASSED [ 62%] +tests/llm/test_06_progressive.py::test_progressive[set_wwr_L2] PASSED [ 62%] +tests/llm/test_06_progressive.py::test_progressive[set_wwr_L3] PASSED [ 63%] +tests/llm/test_06_progressive.py::test_progressive[replace_windows_L1] PASSED [ 63%] +tests/llm/test_06_progressive.py::test_progressive[replace_windows_L2] PASSED [ 63%] +tests/llm/test_06_progressive.py::test_progressive[replace_windows_L3] PASSED [ 64%] +tests/llm/test_06_progressive.py::test_progressive[construction_details_L1] PASSED [ 64%] +tests/llm/test_06_progressive.py::test_progressive[construction_details_L2] PASSED [ 65%] +tests/llm/test_06_progressive.py::test_progressive[construction_details_L3] PASSED [ 65%] +tests/llm/test_06_progressive.py::test_progressive[check_loads_L1] PASSED [ 66%] +tests/llm/test_06_progressive.py::test_progressive[check_loads_L2] PASSED [ 66%] +tests/llm/test_06_progressive.py::test_progressive[check_loads_L3] PASSED [ 66%] +tests/llm/test_06_progressive.py::test_progressive[create_loads_L1] PASSED [ 67%] +tests/llm/test_06_progressive.py::test_progressive[create_loads_L2] PASSED [ 67%] +tests/llm/test_06_progressive.py::test_progressive[create_loads_L3] PASSED [ 68%] +tests/llm/test_06_progressive.py::test_progressive[create_plant_loop_L1] PASSED [ 68%] +tests/llm/test_06_progressive.py::test_progressive[create_plant_loop_L2] PASSED [ 69%] +tests/llm/test_06_progressive.py::test_progressive[create_plant_loop_L3] PASSED [ 69%] +tests/llm/test_06_progressive.py::test_progressive[schedule_details_L1] PASSED [ 70%] +tests/llm/test_06_progressive.py::test_progressive[schedule_details_L2] PASSED [ 70%] 
+tests/llm/test_06_progressive.py::test_progressive[schedule_details_L3] PASSED [ 70%] +tests/llm/test_06_progressive.py::test_progressive[space_type_info_L1] PASSED [ 71%] +tests/llm/test_06_progressive.py::test_progressive[space_type_info_L2] PASSED [ 71%] +tests/llm/test_06_progressive.py::test_progressive[space_type_info_L3] PASSED [ 72%] +tests/llm/test_06_progressive.py::test_progressive[set_run_period_L1] PASSED [ 72%] +tests/llm/test_06_progressive.py::test_progressive[set_run_period_L2] PASSED [ 73%] +tests/llm/test_06_progressive.py::test_progressive[set_run_period_L3] PASSED [ 73%] +tests/llm/test_06_progressive.py::test_progressive[ideal_air_L1] PASSED [ 73%] +tests/llm/test_06_progressive.py::test_progressive[ideal_air_L2] PASSED [ 74%] +tests/llm/test_06_progressive.py::test_progressive[ideal_air_L3] PASSED [ 74%] +tests/llm/test_06_progressive.py::test_progressive[save_model_L1] PASSED [ 75%] +tests/llm/test_06_progressive.py::test_progressive[save_model_L2] PASSED [ 75%] +tests/llm/test_06_progressive.py::test_progressive[save_model_L3] PASSED [ 76%] +tests/llm/test_06_progressive.py::test_progressive[add_ev_L1] PASSED [ 76%] +tests/llm/test_06_progressive.py::test_progressive[add_ev_L2] PASSED [ 76%] +tests/llm/test_06_progressive.py::test_progressive[add_ev_L3] PASSED [ 77%] +tests/llm/test_06_progressive.py::test_progressive[list_measures_L1] PASSED [ 77%] +tests/llm/test_06_progressive.py::test_progressive[list_measures_L2] PASSED [ 78%] +tests/llm/test_06_progressive.py::test_progressive[list_measures_L3] SKIPPED [ 78%] +tests/llm/test_06_progressive.py::test_progressive[create_measure_L1] SKIPPED [ 79%] +tests/llm/test_06_progressive.py::test_progressive[create_measure_L2] SKIPPED [ 79%] +tests/llm/test_06_progressive.py::test_progressive[create_measure_L3] SKIPPED [ 80%] +tests/llm/test_06_progressive.py::test_progressive[test_measure_L1] SKIPPED [ 80%] +tests/llm/test_06_progressive.py::test_progressive[test_measure_L2] SKIPPED [ 80%] 
+tests/llm/test_06_progressive.py::test_progressive[test_measure_L3] SKIPPED [ 81%] +tests/llm/test_06_progressive.py::test_progressive[apply_existing_measure_L1] SKIPPED [ 81%] +tests/llm/test_06_progressive.py::test_progressive[apply_existing_measure_L2] SKIPPED [ 82%] +tests/llm/test_06_progressive.py::test_progressive[apply_existing_measure_L3] SKIPPED [ 82%] +tests/llm/test_06_progressive.py::test_progressive[replace_terminals_cooled_beam_L1] SKIPPED [ 83%] +tests/llm/test_06_progressive.py::test_progressive[replace_terminals_cooled_beam_L2] SKIPPED [ 83%] +tests/llm/test_06_progressive.py::test_progressive[replace_terminals_cooled_beam_L3] SKIPPED [ 83%] +tests/llm/test_06_progressive.py::test_progressive[replace_terminals_four_pipe_beam_L1] SKIPPED [ 84%] +tests/llm/test_06_progressive.py::test_progressive[replace_terminals_four_pipe_beam_L2] SKIPPED [ 84%] +tests/llm/test_06_progressive.py::test_progressive[replace_terminals_four_pipe_beam_L3] SKIPPED [ 85%] +tests/llm/test_06_progressive.py::test_progressive[measure_replace_terminals_L1] SKIPPED [ 85%] +tests/llm/test_06_progressive.py::test_progressive[measure_replace_terminals_L2] SKIPPED [ 86%] +tests/llm/test_06_progressive.py::test_progressive[measure_replace_terminals_L3] SKIPPED [ 86%] +tests/llm/test_06_progressive.py::test_progressive[zone_equipment_priority_L1] SKIPPED [ 86%] +tests/llm/test_06_progressive.py::test_progressive[zone_equipment_priority_L2] SKIPPED [ 87%] +tests/llm/test_06_progressive.py::test_progressive[zone_equipment_priority_L3] SKIPPED [ 87%] +tests/llm/test_06_progressive.py::test_progressive[edit_measure_L1] SKIPPED [ 88%] +tests/llm/test_06_progressive.py::test_progressive[edit_measure_L2] SKIPPED [ 88%] +tests/llm/test_06_progressive.py::test_progressive[edit_measure_L3] SKIPPED [ 89%] +tests/llm/test_07_fourpipe_e2e.py::test_fourpipe_beam_retrofit_e2e SKIPPED [ 89%] +tests/llm/test_08_measure_authoring.py::test_create_measure_with_quoted_description SKIPPED [ 90%] 
+tests/llm/test_08_measure_authoring.py::test_edit_measure_description_with_quotes SKIPPED [ 90%] +tests/llm/test_08_measure_authoring.py::test_measure_xml_intended_software_tool SKIPPED [ 90%] +tests/llm/test_08_measure_authoring.py::test_syntax_error_reported_clearly SKIPPED [ 91%] +tests/llm/test_09_tool_routing.py::test_tool_selection_baseline[create_measure] SKIPPED [ 91%] +tests/llm/test_09_tool_routing.py::test_tool_selection_baseline[view_model] SKIPPED [ 92%] +tests/llm/test_09_tool_routing.py::test_tool_selection_baseline[read_file] SKIPPED [ 92%] +tests/llm/test_09_tool_routing.py::test_tool_selection_baseline[add_baseline_system] SKIPPED [ 93%] +tests/llm/test_09_tool_routing.py::test_tool_selection_baseline_extract_eui SKIPPED [ 93%] +tests/llm/test_09_tool_routing.py::test_visualization_uses_mcp_not_script SKIPPED [ 93%] +tests/llm/test_09_tool_routing.py::test_report_uses_mcp_not_script SKIPPED [ 94%] +tests/llm/test_09_tool_routing.py::test_measure_uses_create_measure_not_create_file SKIPPED [ 94%] +tests/llm/test_09_tool_routing.py::test_read_file_uses_mcp_not_bash SKIPPED [ 95%] +tests/llm/test_09_tool_routing.py::test_hvac_measure_uses_api_reference SKIPPED [ 95%] +tests/llm/test_09_tool_routing.py::test_search_api_for_method_verification SKIPPED [ 96%] +tests/llm/test_09_tool_routing.py::test_search_wiring_patterns_for_hvac_wiring SKIPPED [ 96%] +tests/llm/test_10_confusion_pairs.py::test_qaqc_vs_validate_post_sim SKIPPED [ 96%] +tests/llm/test_10_confusion_pairs.py::test_validate_vs_qaqc_pre_sim SKIPPED [ 97%] +tests/llm/test_10_confusion_pairs.py::test_load_details_vs_space_details SKIPPED [ 97%] +tests/llm/test_10_confusion_pairs.py::test_summary_metrics_vs_end_use SKIPPED [ 98%] +tests/llm/test_10_confusion_pairs.py::test_end_use_vs_summary_metrics SKIPPED [ 98%] +tests/llm/test_10_confusion_pairs.py::test_inspect_osm_vs_model_summary SKIPPED [ 99%] +tests/llm/test_10_confusion_pairs.py::test_create_baseline_vs_new_building SKIPPED [ 99%] 
+tests/llm/test_10_confusion_pairs.py::test_apply_measure_vs_create_measure SKIPPED [100%] +====================================================================== +LLM Benchmark: 170/180 passed (94.4%) | Model: sonnet | 9453s +Tokens: 2.0k in + 250.1k out + 20.4M cache | Cost: $18.9595 + setup: 6/6 (100.0%) in 421s + tier1: 4/4 (100.0%) in 130s + tier2: 33/37 (89.2%) in 3600s + tier3: 21/26 (80.8%) in 1703s + tier4: 3/3 (100.0%) in 203s + progressive: 103/104 (99.0%) in 3396s +Failed: energy-report:Give me a full energy report, qaqc:Check the model for issues, qaqc:Validate before simulation, troubleshoot:My simulation failed, troubleshoot:Why did EnergyPlus crash?, systemd_fourpipebeam_e2e, Ruby, Python, Ruby, thermal_zones_L1 +Report: C:\tmp\llm-sweep-sonnet\benchmark.md +History: C:\tmp\llm-sweep-sonnet\benchmark_history.json (1 runs) +====================================================================== + + +================================== FAILURES =================================== +____ test_eval_tool_selection[energy-report:Give me a full energy report] _____ + +case = {'expected_tools': ['extract_summary_metrics', 'extract_end_use_breakdown', 'extract_envelope_summary', 'extract_hvac_sizing', 'extract_zone_summary'], 'prompt': 'Give me a full energy report', 'skill': 'energy-report'} + + @pytest.mark.parametrize("case", EVAL_CASES, ids=[_case_id(c) for c in EVAL_CASES]) + def test_eval_tool_selection(case): + """Verify agent calls at least one expected MCP tool for an eval.md prompt.""" + # Validates: Claude selects correct tool from eval.md skill tables for natural language prompts + tier = get_tier() + if tier not in ("all", "1"): + pytest.skip("Tier 1 not selected") + + # Prepend model load for skills that need model state + prompt = case["prompt"] + if case["skill"] in NEEDS_MODEL: + if not baseline_model_exists(): + pytest.skip("Baseline model not found � run test_01_setup first") + if case["skill"] == "troubleshoot": + prompt = 
_troubleshoot_prefix() + prompt.lower() + else: + prompt = LOAD_PREFIX + prompt.lower() + prompt += SUFFIX + + timeout = SLOW_SKILLS.get(case["skill"], 120) + result = run_claude(prompt, timeout=timeout) + tool_names = result.tool_names + + # Merge eval.md expected tools with extra acceptable tools + expected = set(case["expected_tools"]) + expected.update(EXTRA_EXPECTED.get(case["skill"], [])) + +> assert any(t in expected for t in tool_names), ( + f"[{case['skill']}] Expected one of {sorted(expected)}, " + f"got: {tool_names}" + ) +E AssertionError: [energy-report] Expected one of ['extract_end_use_breakdown', 'extract_envelope_summary', 'extract_hvac_sizing', 'extract_summary_metrics', 'extract_zone_summary', 'generate_results_report'], got: ['load_osm_model', 'list_files', 'get_building_info', 'get_model_summary', 'get_weather_info', 'run_simulation'] +E assert False +E + where False = any(. at 0x000001ED066CE260>) + +tests\llm\test_03_eval_cases.py:148: AssertionError +__________ test_eval_tool_selection[qaqc:Check the model for issues] __________ + +case = {'expected_tools': ['run_qaqc_checks', 'inspect_osm_summary'], 'prompt': 'Check the model for issues', 'skill': 'qaqc'} + + @pytest.mark.parametrize("case", EVAL_CASES, ids=[_case_id(c) for c in EVAL_CASES]) + def test_eval_tool_selection(case): + """Verify agent calls at least one expected MCP tool for an eval.md prompt.""" + # Validates: Claude selects correct tool from eval.md skill tables for natural language prompts + tier = get_tier() + if tier not in ("all", "1"): + pytest.skip("Tier 1 not selected") + + # Prepend model load for skills that need model state + prompt = case["prompt"] + if case["skill"] in NEEDS_MODEL: + if not baseline_model_exists(): + pytest.skip("Baseline model not found � run test_01_setup first") + if case["skill"] == "troubleshoot": + prompt = _troubleshoot_prefix() + prompt.lower() + else: + prompt = LOAD_PREFIX + prompt.lower() + prompt += SUFFIX + + timeout = 
SLOW_SKILLS.get(case["skill"], 120) + result = run_claude(prompt, timeout=timeout) + tool_names = result.tool_names + + # Merge eval.md expected tools with extra acceptable tools + expected = set(case["expected_tools"]) + expected.update(EXTRA_EXPECTED.get(case["skill"], [])) + +> assert any(t in expected for t in tool_names), ( + f"[{case['skill']}] Expected one of {sorted(expected)}, " + f"got: {tool_names}" + ) +E AssertionError: [qaqc] Expected one of ['get_model_summary', 'inspect_osm_summary', 'run_qaqc_checks'], got: ['load_osm_model', 'validate_model'] +E assert False +E + where False = any(. at 0x000001ED0670A670>) + +tests\llm\test_03_eval_cases.py:148: AssertionError +__________ test_eval_tool_selection[qaqc:Validate before simulation] __________ + +case = {'expected_tools': ['run_qaqc_checks'], 'prompt': 'Validate before simulation', 'skill': 'qaqc'} + + @pytest.mark.parametrize("case", EVAL_CASES, ids=[_case_id(c) for c in EVAL_CASES]) + def test_eval_tool_selection(case): + """Verify agent calls at least one expected MCP tool for an eval.md prompt.""" + # Validates: Claude selects correct tool from eval.md skill tables for natural language prompts + tier = get_tier() + if tier not in ("all", "1"): + pytest.skip("Tier 1 not selected") + + # Prepend model load for skills that need model state + prompt = case["prompt"] + if case["skill"] in NEEDS_MODEL: + if not baseline_model_exists(): + pytest.skip("Baseline model not found � run test_01_setup first") + if case["skill"] == "troubleshoot": + prompt = _troubleshoot_prefix() + prompt.lower() + else: + prompt = LOAD_PREFIX + prompt.lower() + prompt += SUFFIX + + timeout = SLOW_SKILLS.get(case["skill"], 120) + result = run_claude(prompt, timeout=timeout) + tool_names = result.tool_names + + # Merge eval.md expected tools with extra acceptable tools + expected = set(case["expected_tools"]) + expected.update(EXTRA_EXPECTED.get(case["skill"], [])) + +> assert any(t in expected for t in tool_names), ( + 
f"[{case['skill']}] Expected one of {sorted(expected)}, " + f"got: {tool_names}" + ) +E AssertionError: [qaqc] Expected one of ['get_model_summary', 'inspect_osm_summary', 'run_qaqc_checks'], got: ['load_osm_model', 'validate_model'] +E assert False +E + where False = any(. at 0x000001ED06778AD0>) + +tests\llm\test_03_eval_cases.py:148: AssertionError +_________ test_eval_tool_selection[troubleshoot:My simulation failed] _________ + +case = {'expected_tools': ['get_run_status', 'get_run_logs'], 'prompt': 'My simulation failed', 'skill': 'troubleshoot'} + + @pytest.mark.parametrize("case", EVAL_CASES, ids=[_case_id(c) for c in EVAL_CASES]) + def test_eval_tool_selection(case): + """Verify agent calls at least one expected MCP tool for an eval.md prompt.""" + # Validates: Claude selects correct tool from eval.md skill tables for natural language prompts + tier = get_tier() + if tier not in ("all", "1"): + pytest.skip("Tier 1 not selected") + + # Prepend model load for skills that need model state + prompt = case["prompt"] + if case["skill"] in NEEDS_MODEL: + if not baseline_model_exists(): + pytest.skip("Baseline model not found � run test_01_setup first") + if case["skill"] == "troubleshoot": + prompt = _troubleshoot_prefix() + prompt.lower() + else: + prompt = LOAD_PREFIX + prompt.lower() + prompt += SUFFIX + + timeout = SLOW_SKILLS.get(case["skill"], 120) + result = run_claude(prompt, timeout=timeout) + tool_names = result.tool_names + + # Merge eval.md expected tools with extra acceptable tools + expected = set(case["expected_tools"]) + expected.update(EXTRA_EXPECTED.get(case["skill"], [])) + +> assert any(t in expected for t in tool_names), ( + f"[{case['skill']}] Expected one of {sorted(expected)}, " + f"got: {tool_names}" + ) +E AssertionError: [troubleshoot] Expected one of ['extract_component_sizing', 'extract_summary_metrics', 'get_building_info', 'get_model_summary', 'get_run_logs', 'get_run_status', 'inspect_osm_summary', 'list_files', 
'list_thermal_zones', 'run_simulation'], got: ['load_osm_model', 'extract_simulation_errors'] +E assert False +E + where False = any(. at 0x000001ED0677A5A0>) + +tests\llm\test_03_eval_cases.py:148: AssertionError +______ test_eval_tool_selection[troubleshoot:Why did EnergyPlus crash?] _______ + +case = {'expected_tools': ['get_run_logs'], 'prompt': 'Why did EnergyPlus crash?', 'skill': 'troubleshoot'} + + @pytest.mark.parametrize("case", EVAL_CASES, ids=[_case_id(c) for c in EVAL_CASES]) + def test_eval_tool_selection(case): + """Verify agent calls at least one expected MCP tool for an eval.md prompt.""" + # Validates: Claude selects correct tool from eval.md skill tables for natural language prompts + tier = get_tier() + if tier not in ("all", "1"): + pytest.skip("Tier 1 not selected") + + # Prepend model load for skills that need model state + prompt = case["prompt"] + if case["skill"] in NEEDS_MODEL: + if not baseline_model_exists(): + pytest.skip("Baseline model not found � run test_01_setup first") + if case["skill"] == "troubleshoot": + prompt = _troubleshoot_prefix() + prompt.lower() + else: + prompt = LOAD_PREFIX + prompt.lower() + prompt += SUFFIX + + timeout = SLOW_SKILLS.get(case["skill"], 120) + result = run_claude(prompt, timeout=timeout) + tool_names = result.tool_names + + # Merge eval.md expected tools with extra acceptable tools + expected = set(case["expected_tools"]) + expected.update(EXTRA_EXPECTED.get(case["skill"], [])) + +> assert any(t in expected for t in tool_names), ( + f"[{case['skill']}] Expected one of {sorted(expected)}, " + f"got: {tool_names}" + ) +E AssertionError: [troubleshoot] Expected one of ['extract_component_sizing', 'extract_summary_metrics', 'get_building_info', 'get_model_summary', 'get_run_logs', 'get_run_status', 'inspect_osm_summary', 'list_files', 'list_thermal_zones', 'run_simulation'], got: ['load_osm_model', 'extract_simulation_errors'] +E assert False +E + where False = any(. 
at 0x000001ED0677A810>) + +tests\llm\test_03_eval_cases.py:148: AssertionError +___________________ test_workflow[systemd_fourpipebeam_e2e] ___________________ + +case = {'any_of': ['compare_runs', 'extract_summary_metrics', 'extract_end_use_breakdown'], 'id': 'systemd_fourpipebeam_e2e', 'max_turns': 40, 'min_calls': {'run_simulation': 2}, ...} + + @pytest.mark.parametrize("case", WORKFLOW_CASES, ids=[c["id"] for c in WORKFLOW_CASES]) + def test_workflow(case): + """Agent loads model and completes a multi-step workflow.""" + # Validates: Claude chains all required MCP tools for multi-step BEM workflows + tier = get_tier() + if tier not in ("all", "2"): + pytest.skip("Tier 2 not selected") + + # Build prompt for needs_run cases + prompt = case["prompt"] + if case.get("needs_run"): + run_id = get_sim_run_id() + if not run_id: + pytest.skip("No simulation run_id � run test_01_setup first") + prompt = ( + f"Extract results from simulation run '{run_id}'. " + "First extract summary metrics using extract_summary_metrics. " + "Then extract end use breakdown using extract_end_use_breakdown. " + "Use MCP tools only." 
+ ) + elif BASELINE_HVAC_MODEL in prompt and not baseline_hvac_model_exists(): + pytest.skip("Baseline+HVAC model not found � run test_01_setup first") + elif BASELINE_MODEL in prompt and not baseline_model_exists(): + pytest.skip("Baseline model not found � run test_01_setup first") + +> result = run_claude( + prompt, + timeout=case.get("timeout", 120), + max_turns=case.get("max_turns"), + ) + +tests\llm\test_04_workflows.py:616: +_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ +tests\llm\runner.py:209: in run_claude + _last_result = _parse_stream_json(result.stdout) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ + +raw = None + + def _parse_stream_json(raw: str) -> ClaudeResult: + """Parse newline-delimited JSON from stream-json output.""" + messages = [] + result_obj = {} + +> for line in raw.strip().splitlines(): + ^^^^^^^^^ +E AttributeError: 'NoneType' object has no attribute 'strip' + +tests\llm\runner.py:218: AttributeError +_________________ test_measure_reduce_plugloads_quality[Ruby] _________________ + +language = 'Ruby' + + @pytest.mark.parametrize("language", ["Ruby", "Python"]) + def test_measure_reduce_plugloads_quality(language): + """LLM creates a well-parameterized plug-load reduction measure.""" + # Validates: Claude creates plug-load measures with Choice/Double/Boolean args and correct body references + tier = get_tier() + if tier not in ("all", "2"): + pytest.skip("Tier 2 not selected") + + prompt = ( + f"Create a {language} ModelMeasure that reduces electric equipment " + "power density. 
It must have these arguments:\n" + " - space_type_filter: Choice (All, Office, Corridor, Lobby)\n" + " - reduction_percent: Double, default 25.0\n" + " - skip_empty_spaces: Boolean, default true\n" + "The measure should iterate ElectricEquipmentDefinition objects, " + "check the associated SpaceType name against the filter, " + "and reduce wattsPerSpaceFloorArea by the given percentage. " + f"Use create_measure with language {language}. Use MCP tools only." + ) + result = run_claude(prompt, timeout=300, max_turns=15) +> _check_measure_args_quality( + result, + expected_language=language, + expected_arg_types={"Choice", "Double", "Boolean"}, + body_keywords=_PLUGLOAD_BODY_KEYWORDS, + label=f"plugloads_{language}", + ) + +tests\llm\test_04_workflows.py:885: +_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ + +result = + + def _check_measure_args_quality( + result, *, expected_language, expected_arg_types, + body_keywords, label, + ): + """Shared quality checks for measure-with-args tests. + + Args: + result: ClaudeResult from run_claude + expected_language: "Ruby" or "Python" (case-insensitive match) + expected_arg_types: set of required arg types, e.g. {"Choice", "Double", "Boolean"} + body_keywords: list of strings � at least one must appear in run_body + label: human-readable test label for assertion messages + """ + tool_names = result.tool_names + assert "create_measure" in tool_names, ( + f"[{label}] Missing create_measure. Tools: {tool_names}" + ) + + create_input = _find_create_measure_input(result) + assert create_input, f"[{label}] create_measure call not found in MCP tool calls" + + # Language check + lang = create_input.get("language", "") + assert lang.lower() == expected_language.lower(), ( + f"[{label}] Expected language={expected_language}, got {lang}" + ) + + args = _parse_args(create_input) + run_body = create_input.get("run_body", "") + + # 1. 
Has arguments + assert args and len(args) > 0, ( + f"[{label}] No arguments � LLM hard-coded all values" + ) + + # 2. Required argument types present + arg_types = {a.get("type", "") for a in args} + for t in expected_arg_types: + assert t in arg_types, ( + f"[{label}] Missing arg type {t}. Types found: {arg_types}" + ) + + # 3. Choice arg has values list + for a in args: + if a.get("type") == "Choice": + vals = a.get("values", []) +> assert len(vals) >= 2, ( + f"[{label}] Choice arg '{a.get('name')}' needs >=2 values, " + f"got {vals}" + ) +E AssertionError: [plugloads_Ruby] Choice arg 'space_type_filter' needs >=2 values, got [] +E assert 0 >= 2 +E + where 0 = len([]) + +tests\llm\test_04_workflows.py:822: AssertionError +________________ test_measure_reduce_plugloads_quality[Python] ________________ + +language = 'Python' + + @pytest.mark.parametrize("language", ["Ruby", "Python"]) + def test_measure_reduce_plugloads_quality(language): + """LLM creates a well-parameterized plug-load reduction measure.""" + # Validates: Claude creates plug-load measures with Choice/Double/Boolean args and correct body references + tier = get_tier() + if tier not in ("all", "2"): + pytest.skip("Tier 2 not selected") + + prompt = ( + f"Create a {language} ModelMeasure that reduces electric equipment " + "power density. It must have these arguments:\n" + " - space_type_filter: Choice (All, Office, Corridor, Lobby)\n" + " - reduction_percent: Double, default 25.0\n" + " - skip_empty_spaces: Boolean, default true\n" + "The measure should iterate ElectricEquipmentDefinition objects, " + "check the associated SpaceType name against the filter, " + "and reduce wattsPerSpaceFloorArea by the given percentage. " + f"Use create_measure with language {language}. Use MCP tools only." 
+ ) + result = run_claude(prompt, timeout=300, max_turns=15) +> _check_measure_args_quality( + result, + expected_language=language, + expected_arg_types={"Choice", "Double", "Boolean"}, + body_keywords=_PLUGLOAD_BODY_KEYWORDS, + label=f"plugloads_{language}", + ) + +tests\llm\test_04_workflows.py:885: +_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ + +result = + + def _check_measure_args_quality( + result, *, expected_language, expected_arg_types, + body_keywords, label, + ): + """Shared quality checks for measure-with-args tests. + + Args: + result: ClaudeResult from run_claude + expected_language: "Ruby" or "Python" (case-insensitive match) + expected_arg_types: set of required arg types, e.g. {"Choice", "Double", "Boolean"} + body_keywords: list of strings � at least one must appear in run_body + label: human-readable test label for assertion messages + """ + tool_names = result.tool_names + assert "create_measure" in tool_names, ( + f"[{label}] Missing create_measure. Tools: {tool_names}" + ) + + create_input = _find_create_measure_input(result) + assert create_input, f"[{label}] create_measure call not found in MCP tool calls" + + # Language check + lang = create_input.get("language", "") + assert lang.lower() == expected_language.lower(), ( + f"[{label}] Expected language={expected_language}, got {lang}" + ) + + args = _parse_args(create_input) + run_body = create_input.get("run_body", "") + + # 1. Has arguments + assert args and len(args) > 0, ( + f"[{label}] No arguments � LLM hard-coded all values" + ) + + # 2. Required argument types present + arg_types = {a.get("type", "") for a in args} + for t in expected_arg_types: + assert t in arg_types, ( + f"[{label}] Missing arg type {t}. Types found: {arg_types}" + ) + + # 3. 
Choice arg has values list + for a in args: + if a.get("type") == "Choice": + vals = a.get("values", []) +> assert len(vals) >= 2, ( + f"[{label}] Choice arg '{a.get('name')}' needs >=2 values, " + f"got {vals}" + ) +E AssertionError: [plugloads_Python] Choice arg 'space_type_filter' needs >=2 values, got [] +E assert 0 >= 2 +E + where 0 = len([]) + +tests\llm\test_04_workflows.py:822: AssertionError +________________ test_measure_boiler_efficiency_quality[Ruby] _________________ + +language = 'Ruby' + + @pytest.mark.parametrize("language", ["Ruby", "Python"]) + def test_measure_boiler_efficiency_quality(language): + """LLM creates a well-parameterized boiler efficiency measure.""" + # Validates: Claude creates boiler efficiency measures with Choice/Double/Boolean args and correct body references + tier = get_tier() + if tier not in ("all", "2"): + pytest.skip("Tier 2 not selected") + + prompt = ( + f"Create a {language} ModelMeasure that upgrades hot water boiler " + "efficiency. It must have these arguments:\n" + " - target_efficiency: Double, default 0.95\n" + " - fuel_type_filter: Choice (All, NaturalGas, Electricity)\n" + " - skip_if_above_target: Boolean, default true\n" + "The measure should iterate BoilerHotWater objects, optionally " + "filter by fuel type, skip boilers already at or above the target " + "efficiency if the boolean is set, and call " + "setNominalThermalEfficiency on the rest. " + f"Use create_measure with language {language}. Use MCP tools only." 
+ ) + result = run_claude(prompt, timeout=300, max_turns=15) +> _check_measure_args_quality( + result, + expected_language=language, + expected_arg_types={"Choice", "Double", "Boolean"}, + body_keywords=_BOILER_BODY_KEYWORDS, + label=f"boiler_{language}", + ) + +tests\llm\test_04_workflows.py:926: +_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ + +result = + + def _check_measure_args_quality( + result, *, expected_language, expected_arg_types, + body_keywords, label, + ): + """Shared quality checks for measure-with-args tests. + + Args: + result: ClaudeResult from run_claude + expected_language: "Ruby" or "Python" (case-insensitive match) + expected_arg_types: set of required arg types, e.g. {"Choice", "Double", "Boolean"} + body_keywords: list of strings � at least one must appear in run_body + label: human-readable test label for assertion messages + """ + tool_names = result.tool_names + assert "create_measure" in tool_names, ( + f"[{label}] Missing create_measure. Tools: {tool_names}" + ) + + create_input = _find_create_measure_input(result) + assert create_input, f"[{label}] create_measure call not found in MCP tool calls" + + # Language check + lang = create_input.get("language", "") + assert lang.lower() == expected_language.lower(), ( + f"[{label}] Expected language={expected_language}, got {lang}" + ) + + args = _parse_args(create_input) + run_body = create_input.get("run_body", "") + + # 1. Has arguments + assert args and len(args) > 0, ( + f"[{label}] No arguments � LLM hard-coded all values" + ) + + # 2. Required argument types present + arg_types = {a.get("type", "") for a in args} + for t in expected_arg_types: + assert t in arg_types, ( + f"[{label}] Missing arg type {t}. Types found: {arg_types}" + ) + + # 3. 
Choice arg has values list + for a in args: + if a.get("type") == "Choice": + vals = a.get("values", []) +> assert len(vals) >= 2, ( + f"[{label}] Choice arg '{a.get('name')}' needs >=2 values, " + f"got {vals}" + ) +E AssertionError: [boiler_Ruby] Choice arg 'fuel_type_filter' needs >=2 values, got [] +E assert 0 >= 2 +E + where 0 = len([]) + +tests\llm\test_04_workflows.py:822: AssertionError +_____________________ test_progressive[thermal_zones_L1] ______________________ + +case = {'case_id': 'thermal_zones', 'expected': ['list_thermal_zones'], 'id': 'thermal_zones_L1', 'level': 'L1', ...} + + @pytest.mark.progressive + @pytest.mark.parametrize("case", _FLAT_CASES, ids=[c["id"] for c in _FLAT_CASES]) + def test_progressive(case): + """Test tool discovery at varying prompt specificity levels.""" + # Validates: Claude routes L1/L2/L3 prompts to correct tools — lower levels passing = better discoverability + tier = get_tier() + if tier not in ("all", "1"): + pytest.skip("Tier 1 not selected") + + prompt = case["prompt"] + if case.get("needs_run"): + run_id = get_sim_run_id() + if not run_id: + pytest.skip("No simulation run_id — run test_01_setup first") + prompt = f"Use run_id '{run_id}'. 
" + prompt + elif case.get("needs_hvac"): + if not baseline_hvac_model_exists(): + pytest.skip("Baseline+HVAC model not found — run test_01_setup first") + prompt = LOAD_HVAC + prompt.lower() + elif case["needs_model"]: + if not baseline_model_exists(): + pytest.skip("Baseline model not found — run test_01_setup first") + prompt = LOAD + prompt.lower() + prompt += SUFFIX + + timeout = 300 if case.get("needs_run") or case["case_id"] == "run_simulation" else 120 + result = run_claude(prompt, timeout=timeout) + tool_names = result.tool_names + +> assert any(t in case["expected"] for t in tool_names), ( + f"[{case['case_id']} {case['level']}] " + f"Expected one of {case['expected']}, got: {tool_names}" + ) +E AssertionError: [thermal_zones L1] Expected one of ['list_thermal_zones'], got: ['load_osm_model'] +E assert False +E + where False = any(<generator object test_progressive.<locals>.<genexpr> at 0x000001ED064DBA00>) + +tests\llm\test_06_progressive.py:481: AssertionError +============================== warnings summary =============================== +tests/llm/test_04_workflows.py::test_workflow[systemd_fourpipebeam_e2e] + C:\Python313\Lib\site-packages\_pytest\threadexception.py:58: PytestUnhandledThreadExceptionWarning: Exception in thread Thread-73 (_readerthread) + + Traceback (most recent call last): + File "C:\Python313\Lib\threading.py", line 1044, in _bootstrap_inner + self.run() + ~~~~~~~~^^ + File "C:\Python313\Lib\threading.py", line 995, in run + self._target(*self._args, **self._kwargs) + ~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "C:\Python313\Lib\subprocess.py", line 1615, in _readerthread + buffer.append(fh.read()) + ~~~~~~~^^ + File "C:\Python313\Lib\encodings\cp1252.py", line 23, in decode + return codecs.charmap_decode(input,self.errors,decoding_table)[0] + ~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + UnicodeDecodeError: 'charmap' codec can't decode byte 0x8f in position 422036: character maps to <undefined> + + Enable tracemalloc to get traceback where the object was allocated. 
+ See https://docs.pytest.org/en/stable/how-to/capture-warnings.html#resource-warnings for more info. + warnings.warn(pytest.PytestUnhandledThreadExceptionWarning(msg)) + +-- Docs: https://docs.pytest.org/en/stable/how-to/capture-warnings.html +=========================== short test summary info =========================== +FAILED tests/llm/test_03_eval_cases.py::test_eval_tool_selection[energy-report:Give me a full energy report] +FAILED tests/llm/test_03_eval_cases.py::test_eval_tool_selection[qaqc:Check the model for issues] +FAILED tests/llm/test_03_eval_cases.py::test_eval_tool_selection[qaqc:Validate before simulation] +FAILED tests/llm/test_03_eval_cases.py::test_eval_tool_selection[troubleshoot:My simulation failed] +FAILED tests/llm/test_03_eval_cases.py::test_eval_tool_selection[troubleshoot:Why did EnergyPlus crash?] +FAILED tests/llm/test_04_workflows.py::test_workflow[systemd_fourpipebeam_e2e] +FAILED tests/llm/test_04_workflows.py::test_measure_reduce_plugloads_quality[Ruby] +FAILED tests/llm/test_04_workflows.py::test_measure_reduce_plugloads_quality[Python] +FAILED tests/llm/test_04_workflows.py::test_measure_boiler_efficiency_quality[Ruby] +FAILED tests/llm/test_06_progressive.py::test_progressive[thermal_zones_L1] +===== 10 failed, 170 passed, 50 skipped, 1 warning in 9454.12s (2:37:34) ====== diff --git a/docs/testing/README.md b/docs/testing/README.md new file mode 100644 index 0000000..0e0a53e --- /dev/null +++ b/docs/testing/README.md @@ -0,0 +1,267 @@ +# LLM Agent Testing — openstudio-mcp + +**Technical report on the methodology, implementation, and results of the LLM behavioral test suite for openstudio-mcp, an MCP server exposing ~142 building-energy-modeling tools.** + +The suite runs a real Claude Code agent against a real openstudio-mcp Docker container, measures whether the agent discovers and calls the correct MCP tools from natural-language prompts, and tracks the result over time. 
As of the most recent run (Run 15, 2026-04-05) the suite passes **123/129 (95.3%)** on the progressive diagnostic and **170/180 (94.4%)** on the full-suite cross-model baseline (Run 14, 2026-03-28). + +--- + +## 1. Problem statement + +Unit and integration tests verify that a tool works in isolation — call it with these arguments, assert on the response. They do **not** verify that an LLM agent, reading a user's natural-language request, will discover the right tool out of 142 candidates, choose appropriate arguments, and sequence multiple calls correctly. That is the actual user experience of an MCP server, and it is only measurable end-to-end. + +Failures unique to LLM behavior that only this suite catches: + +- Agent writes raw IDF files via `Bash`/`Edit`/`Write` instead of calling MCP tools (guardrail regression). +- Agent gets stuck in a `list_files` loop instead of calling the right domain tool. +- A tool exists, its code is correct, but its docstring has no discoverable keywords — so the agent never picks it even at moderate prompt specificity. +- A rename or reorganization breaks every natural-language prompt that doesn't include the new name. +- A "confusion pair" — two tools that both plausibly match a prompt — resolves to the wrong one. + +The LLM suite is the only gate that measures agent behavior against a real Claude session hitting a real openstudio-mcp container, and it is the basis for the pass-rate trajectory shown throughout this report. + +--- + +## 2. Architecture + +``` +pytest (tests/llm/conftest.py) + │ + ├─ pytest_runtest_protocol ─→ retry loop (up to LLM_TESTS_RETRIES) + │ + └─ run_claude(prompt, ...) 
(tests/llm/runner.py) + │ + └─ subprocess: claude -p "<prompt>" + --output-format stream-json --verbose + --mcp-config <temp mcp.json> + --max-turns N --model sonnet + │ + ├─ stdout ─── NDJSON stream ───→ _parse_stream_json() + │ │ + │ └─→ ClaudeResult + │ (tool_calls, tokens, cost, + │ num_turns, final_text) + │ + └─ MCP stdio → openstudio-mcp Docker container + ├─ stdout_suppression (SWIG safe) + ├─ 142 MCP tools + └─ shared /runs volume (baseline models) +``` + +### Key implementation points + +| Concern | Where | Detail | +|---|---|---| +| Subprocess spawn | `runner.py:181-239` `run_claude()` | Writes temp `mcp.json`, spawns CLI. Strips `CLAUDECODE` env var (nested `claude -p` fails otherwise). | +| Output parsing | `runner.py:242-261` `_parse_stream_json()` | `--output-format stream-json --verbose` is **mandatory** — plain `json` drops `tool_use` blocks. | +| Tool-call extraction | `runner.py:61-106` `ClaudeResult` | Two views: `tool_calls` (all, incl. builtins like ToolSearch/Bash) and `mcp_tool_calls` (MCP only). | +| Markers & auto-tagging | `conftest.py:42-53, 252-278` | `llm`, `tier1-4`, `stable`, `flaky`, `smoke`, `progressive`, `generic`. Auto-tagged via `FLAKY_TESTS` frozenset. | +| Retry logic | `conftest.py:281-323` | Custom `pytest_runtest_protocol` hook. Each retry consumes one prompt from the budget. | +| Benchmark collection | `conftest.py:342-412, 434-692` | `pytest_runtest_logreport` stores per-test metrics. Session end writes `benchmark.json` / `benchmark.md` / `benchmark_history.json`. | +| Failure classification | `conftest.py:383-390` | `timeout` · `no_mcp_tool` · `wrong_tool`. | +| Prompt budget | `conftest.py` (`LLM_TESTS_MAX_PROMPTS`, default 180) | Hard cap prevents runaway cost during iteration. | +| Skill eval auto-discovery | `eval_parser.py:48-90` | Scrapes "Should trigger" / "Should NOT trigger" tables from `.claude/skills/*/eval.md`. 
| + +### Environment knobs + +| Var | Default | Purpose | +|---|---|---| +| `LLM_TESTS_ENABLED` | unset | Must be `1` to enable the suite | +| `LLM_TESTS_MODEL` | `sonnet` | `sonnet` / `haiku` / `opus` | +| `LLM_TESTS_RETRIES` | `0` | Retry count for non-determinism | +| `LLM_TESTS_MAX_PROMPTS` | `180` | Hard budget cap | +| `LLM_TESTS_TIER` | `all` | `1` / `2` / `3` / `4` / `all` | +| `LLM_TESTS_RUNS_DIR` | `/tmp/llm-test-runs` | Host path mounted as `/runs` in Docker | +| `OSMCP_CODE_MODE` | `0` | FastMCP CodeMode toggle (see §5.7) | + +--- + +## 3. Test taxonomy + +Ten test files, organized by what the agent is asked to do. + +| File | Tier | ~Count | Purpose | Pass‑rate signal | +|---|---|---|---|---| +| `test_01_setup.py` | setup | 6 | Creates baseline/HVAC/example models in `/runs`. All other tests depend on these. Prompts use explicit tool names to minimize non-determinism. | Dependency gate | +| `test_02_tool_selection.py` | tier1 | 4 | Single-tool discovery, **no model state** (e.g. "What is the server status?"). Fastest tests. | Baseline discovery | +| `test_03_eval_cases.py` | tier3 | 26 | Auto-parsed from `.claude/skills/*/eval.md` "Should trigger" tables. Keeps tests DRY and co-located with skill definitions. | Skill discovery | +| `test_04_workflows.py` | tier2 | 37 | Multi-step chains (3-5 MCP calls): load → weather → HVAC → simulate → extract. | Multi-step composition | +| `test_05_guardrails.py` | tier4 | 3 | **Regression gate:** agent must NOT use `Bash`/`Edit`/`Write` to bypass MCP tools. | Safety / bypass | +| `test_06_progressive.py` | progressive | 104-129 | **The core diagnostic.** 43 operations × 3 specificity levels. | Tool description quality | +| `test_07_fourpipe_e2e.py` | tier2 | 1 | Full retrofit on 44-zone SystemD model using natural language (no tool names). Two simulations, 40+ turns, ~5 min. | Real-user session | +| `test_08_measure_authoring.py` | tier2 | 8 | Custom measure create/edit/test/export. 
Regression tests pulled from debug-session JSON exports. | Authoring workflows | +| `test_09_tool_routing.py` | tier4 | 4 | A/B baseline: all 142 tools vs `recommend_tools` routing. Not in CI. | Tool-routing efficiency | +| `test_10_confusion_pairs.py` | tier4 | 8 | Prompts that could reasonably trigger either of two similar tools (`run_qaqc_checks` vs `validate_model`). | Disambiguation | + +### The progressive test pattern (L1 / L2 / L3) + +Each operation is tested with **three prompts of increasing specificity**: + +| Level | Example (add HVAC) | What it measures | +|---|---|---| +| **L1 — vague** | *"Add HVAC to the building"* | Can the agent discover the tool from keyword scraps alone? → **docstring keyword quality** | +| **L2 — moderate** | *"Add a VAV reheat system to all 10 zones"* | With domain context, can the agent pick the right tool among near-neighbors? → **tool discovery / ToolSearch** | +| **L3 — explicit** | *"Use add_baseline_system to add System 7 VAV reheat"* | Given the exact tool name, does the tool work? → **tool code / API correctness** | + +The **gap between levels** is the diagnostic: + +- **L1 fails, L2/L3 pass** → docstring is missing keywords. Fast fix. +- **L2 fails, L3 passes** → tool is hard to discover even with context. Fix ToolSearch indexing or tool name. +- **L3 fails** → tool is broken. Fix the code. +- **All three fail** → a true regression (the tool was working and now isn't). This is the most serious signal — Run 15's `edit_measure` is a current example. + +This decomposition is why the progressive tier is the most useful part of the suite: it points at the cause, not just the symptom. + +--- + +## 4. What gets measured + +Every `run_claude()` call yields a `ClaudeResult`. These fields are written to `benchmark.json`, aggregated into `benchmark.md`, and appended to `benchmark_history.json`. 
+ +**Per test:** `passed` · `attempt` (1 = first try, 2+ = flaky) · `duration_s` · `num_turns` · `num_tool_calls` · `tool_calls` (ordered list) · `input_tokens` / `output_tokens` / `cache_read_tokens` · `cost_usd` (notional — free on Claude Max) · `failure_mode` (timeout / no_mcp_tool / wrong_tool) · `toolsearch_count` · `code_mode_active`. + +**Aggregates:** per-tier pass rate, per-L1/L2/L3 pass rate, token profile by tier, failed-test drill-down with tool sequences, run history (last 50 runs). + +**Explicit gaps (things we don't measure yet):** + +- **Parameter correctness** — a test passes if the right tool is called, even with wrong arguments. +- **First-attempt pass rate** — retries mask flakiness. Only `attempt` captures it, not aggregates. +- **Time-to-first-tool** — slow ToolSearch discovery isn't penalized. +- **Error recovery rate** — when a tool returns `ok:False`, does the agent retry or give up? + +--- + +## 5. Results + +### 5.1 Pass-rate history — 16 runs across one month + +![Run history](plots/run_history.png) + +The blue line traces the pass rate of the sonnet-on-default-config suite across 15 sequential runs from 2026-03-05 to 2026-04-05; the tan bars (right axis) show how many tests each run attempted. Four red-circled letters mark the inflection points that actually moved the number. **A** is the single biggest lever in the entire history: adding anti-loop guidance to the MCP server's `instructions` field drove pass rate from 44.0% to 83.3% between Run 1 and Run 2, a 39-point jump from one prompt change. **B** captures Run 3's targeted tool-description edits (+8pp). **C** at Run 6 is when the progressive tier was introduced, expanding the test space from ~90 to ~160 while holding pass rate steady — a successful stress test of the methodology. **D** at Run 14 is the 2026-03-28 cross-model sweep baseline (the same run is plotted separately in §5.6). 
+ +The red **X** at Run 16 is the FastMCP CodeMode A/B experiment (2026-04-05), which collapses the pass rate to 24.0%. It is drawn as a dashed outlier and excluded from the headline trajectory because it is a controlled experiment, not a regression — the CodeMode feature was behind an `OSMCP_CODE_MODE` toggle, was tested, and was rejected. Full analysis in §5.7. + +Note on run sizes: runs prior to Run 6 predate the progressive tier and total ~90 tests; Runs 6–14 run the full suite of 180 tests (setup + tier1–4 + progressive); Run 15 (2026-04-05 sonnet baseline) and Run 16 (CodeMode A/B) are **progressive-only** at 129 tests. The April 5 runs were scoped to the progressive marker to isolate CodeMode's effect on tool dispatch — setup/tier1–4 add no signal for that question and would have doubled cost and runtime. The 129 vs 104 progressive-test count reflects an expansion of the progressive tier between Run 14 and Run 15 (new L1/L2/L3 cases added). + +From Run 10 onward the main line sits in a tight 94.4%–96.5% band. This is the regime where the low-hanging description and keyword work is mostly done, and each additional change costs more engineering time for less pass-rate movement. The dashed green line at 95% is the operational target; the suite has held at or near it for the last six runs. + +### 5.2 Pass rate by tier — which categories are solid, which need work + +![Tier pass rates](plots/tier_pass_rates.png) + +This chart breaks Run 14 (2026-03-28 sonnet, full suite) into its six tiers. Bar color encodes distance from the 95% target — green is on target, orange is in the warning band (85–94%), red is below 85%. Three tiers are at 100% — `setup` (model-creation prerequisites), `tier1` (single-tool discovery with no model state), and `tier4` (guardrails) — and the monster `progressive` tier is just behind at 103/104 = 99.0%. The weak categories are `tier3` skill-eval cases at 80.8% (21/26) and `tier2` workflows at 89.2% (33/37). 
+ +The tier3 and tier2 failures are almost entirely **confusion pairs** rather than broken code. The `qaqc` vs `validate_model` pair accounts for multiple failures: both tools plausibly answer "check the model for issues", and the agent keeps picking `validate_model` when the test expected `run_qaqc_checks`. The fix is docstring disambiguation, not a code change. Tier 2 workflow failures are similar plus a handful of multi-step chain stalls where the agent runs out of turns before completing the full sequence. The pattern tells us that the remaining headroom on this suite is in description quality and confusion-pair resolution — the tools themselves are largely correct. + +### 5.3 Progressive tier — L1 / L2 / L3 + +![Progressive L1 L2 L3](plots/progressive_l1_l2_l3.png) + +The left panel shows aggregate pass rate across all 43 progressive operations at each specificity level, from Run 15 (2026-04-05, sonnet, progressive-only). The bars rise from 93.0% at L1 (vague) to 97.7% at L2 (moderate), then dip to 95.3% at L3 (explicit). A monotone climb is the expected signature of a healthy suite; the fact that L3 dips slightly below L2 is the noteworthy finding of this run. Two L3 failures account for it: the `edit_measure` case, which fails at all three levels (an actual tool regression, not a description problem), and `zone_equipment_priority_L3`, which fails only at L3. + +The right panel drills into the only four problem cases. Of 43 operations, 39 pass cleanly at all three levels. `thermal_zones_L1` and `test_measure_L1` are single-level failures — the vague prompts are genuinely ambiguous (e.g. "What zones are in this model?" collides with `list_spaces`, `list_thermal_zones`, and `get_model_summary` at L1 precision). `zone_equipment_priority_L3` is a single-level failure at the opposite end: the explicit prompt succeeded previously, so its Run 15 failure is most likely a flaky single-run. **`edit_measure` is the important one**: all three levels fail with the agent stuck calling `add_zone_equipment` instead of `edit_measure`. 
Failure at L3 means the explicit tool name in the prompt is being ignored — that is a routing bug, not a docstring bug, and it is the top item on the follow-up list. + +### 5.4 Token profile — why 180 tests cost $19 + +![Token profile](plots/token_profile.png) + +The left panel, on a log scale, decomposes per-test token usage for Run 14 (2026-03-28 sonnet). The key finding: **cache-read tokens dominate fresh input tokens by a factor of roughly 10,000×**. Tier 1 tests send ~5 fresh input tokens and read ~34k from cache; the worst offender (`tier2` workflows) sends ~16 fresh input tokens and reads ~217k from cache. This is prompt caching at work: Claude Code caches the MCP tool definitions and session prompts and serves them from cache on every subsequent test, so 180 tests that each "send" tens of thousands of tokens of context actually only pay fresh-input cost on the test prompt itself. + +The right panel plots per-test cost and conversation turn count. The relationship is intuitive — single-tool tiers (tier1, tier3, progressive) run ~2–6 turns at roughly $0.05–$0.09 each, while multi-step tiers (tier2 workflows, tier4 guardrails) average 8–11 turns at $0.16–$0.18. `setup` is a moderate outlier on cost because it runs multi-step model creation workflows, but on few tests so the per-test average looks higher than it feels in aggregate. The bottom-line numbers for Run 14: 180 tests, 157 minutes wall clock, ~20M cache-read tokens, ~250k output tokens, **$18.96 notional** (free on Claude Max). The token profile also tells us where CodeMode's premise fails — see §5.7. + +### 5.5 Failure modes — how the failures break down + +![Failure modes](plots/failure_modes.png) + +The left panel classifies Run 14's 10 failures by mode. Nine of ten are `wrong_tool` — the agent called an MCP tool, just not the one the test expected. The specific cluster is revealing: 2× qaqc, 2× troubleshoot, 1× energy-report, 1× systemd e2e workflow, 2× measure quality, 1× miscellaneous. 
The qaqc and troubleshoot failures are confusion pairs (discussed in §5.2); the measure-quality failures are new tests hitting syntax/structure checks; the systemd e2e is a multi-step chain that ran out of wall-clock time. One failure is a pure `timeout`. Zero are `no_mcp_tool` — the agent is never stuck; it is always calling something, just sometimes the wrong thing. + +The right panel shows absolute pass/fail counts across all 16 runs. Run 1's 28 failures on 50 tests is the noisy origin — the rest of the history, despite roughly quadrupling the test count, sits comfortably in the single-digit-failures band with occasional ten-failure peaks. Run 16 (faded bars on the far right) is the CodeMode experiment with 98 failures; its inclusion visualizes how far outside normal operating range the CodeMode transformation pushed the agent. + +### 5.6 Cross-model sweep — sonnet vs haiku vs opus + +![Model comparison](plots/model_comparison.png) + +On 2026-03-28 we ran the identical 180-test suite against three models with zero retries to get an honest first-attempt signal. The left panel combines pass rate (green bars, left axis) and notional cost (blue bars, right axis). Sonnet and Opus tie at 94.4% (170/180) and Haiku trails by 5.5 points at 88.9% (160/180). The cost spread is more dramatic: Haiku $11.21, Sonnet $18.96, Opus $32.23 — Opus costs ~2.9× Haiku for the same pass rate that Sonnet delivers at ~1.7×. Duration scales roughly with cost (80 / 157 / 185 minutes). + +The right panel breaks each model down by tier. Three observations. First, setup / tier1 / tier4 are 100% across all three models — the prerequisites and the well-disambiguated tiers don't discriminate between models. 
Second, tier3 skill-eval cases are the same 73.1% on both Haiku *and* Opus but 80.8% on Sonnet; this is the confusion-pair gap, and interestingly the largest model doesn't help — Opus picks the "wrong" tool of a confusion pair just as often as Haiku does, which means the ambiguity is real, not a capability gap. Third, progressive is near-perfect for all three (Haiku 93.3%, Sonnet 99.0%, Opus 100%) — the L1/L2/L3 progressive design is largely model-agnostic once tool descriptions are good. The operational conclusion from this sweep: **sonnet is the right default**. Opus doesn't earn its price premium, Haiku's tier3/progressive losses exceed its cost savings for our use case. + +### 5.7 FastMCP CodeMode A/B — an experiment that failed cleanly + +![CodeMode A/B](plots/codemode_ab.png) + +On 2026-04-05 we tested FastMCP 3.2.0's CodeMode transform, which collapses the tool catalog behind three meta-tools (search / get_schema / execute) and asks the model to write Python code invoking `call_tool(...)` instead of emitting tool_use blocks directly. The premise of CodeMode is token savings — if tool definitions are huge and always loaded upfront, hiding them behind meta-tools is a win. The result is unambiguous: **CodeMode OFF scored 123/129 (95.3%) on the progressive suite; CodeMode ON scored 31/129 (24.0%), a 71-point regression**. + +The left panel shows the overall drop. The middle panel confirms the regression is structural, not prompt-sensitive: L1, L2, and L3 all collapse by ~70 points. If this were a description-quality problem, L3 would hold. Instead all three levels tank together, which means the failure is in the CodeMode transformation layer itself, not in how the prompts land. The right panel shows the resource multipliers — CodeMode ON cost **2.4× more** ($22.35 vs $9.29), took **2.4× longer** (168 vs 69 minutes), made **3.6× more ToolSearch calls** (5.8 vs 1.6 per test), and generated **2.3× more output tokens** (300k vs 128k). 
Output tokens going *up* is the kicker: CodeMode was supposed to save tokens, and instead the LLM burned more of them writing Python orchestration code than it would have generating plain tool_use blocks. + +The root cause, documented in `docs/knowledge/codemode-benchmark-2026-04-05.md`, is a **double-discovery-layer conflict**. Claude Code already implements deferred tool loading via its own built-in ToolSearch when a tool catalog exceeds 10k tokens. Our 142 tools hit that threshold and get auto-deferred by Claude Code. Adding CodeMode on top creates a second discovery layer the model has to navigate, and the two systems interfere: ToolSearch calls tripled instead of going to zero. CodeMode's token-saving premise also assumes the baseline wastes tokens shipping tool defs upfront — but our Run 14 input-token average is **~10 tokens per test** (see §5.4), because prompt caching is already serving tool definitions from cache. There is no waste to save. + +The feature was kept behind an `OSMCP_CODE_MODE` toggle (default `0`) for future experiments with fewer tools or different clients, but it is not used by the default server config. This experiment is what makes us most confident in the suite: a single 4-hour experiment produced a definitive, quantified rejection of a community-hyped technique. + +--- + +## 6. Lessons that changed how the suite is built + +1. **System prompts are the biggest lever.** Run 1→2 is the evidence: +39 points from one change to `server.py` `instructions`. Before touching individual tool docstrings, audit the server-wide prompt. + +2. **Docstring keywords >> docstring prose.** `add_baseline_system` L1 was failing until we added "HVAC / heating and cooling" to its docstring. Verbose paragraphs don't help; a single matched keyword does. All 142 tool descriptions are now enforced to be ≥40 characters long. + +3. 
**Progressive testing is the best diagnostic tool.** L1/L2/L3 separates three failure classes (description, discovery, code) that binary pass/fail obscures completely. Every tool should have at least one progressive case. + +4. **L1 failures are often structural, not fixable.** "What loads?" is genuinely ambiguous — a good agent asks for clarification. Don't bend a tool description to pass a vague prompt if the agent's alternative behavior is reasonable. + +5. **Multi-step workflows are fragile.** Tier 2 is consistently the lowest. ToolSearch + measure execution eats turns; one stall mid-chain fails the whole test. Keep `max_turns` generous (25+ for 3-tool chains, 40+ for e2e). + +6. **Retries mask flakiness.** Default `LLM_TESTS_RETRIES=0` gives the honest first-attempt signal. Only add retries when CI-like confidence is needed, and track the `attempt` field to see which tests are actually brittle. + +7. **Flaky tests need a promotion path.** The `FLAKY_TESTS` frozenset is the quarantine. Pattern-match by substring. Remove patterns when a test stabilizes across three or more runs. + +8. **Description guidance alone doesn't fix L1 failures.** See [`benchmark-description-guidance.md`](benchmark-description-guidance.md) — ~35 tools got disambiguation/when-to-use/emphasis edits and L1 pass rate **did not move**. The remaining failures were structural. + +9. **NDJSON logs per test are indispensable.** When a test fails, the `.ndjson` log shows the exact tool calls, arguments, error responses, and where the agent got stuck. + +10. **The biggest model isn't always the right default.** Run 14's cross-model sweep shows Opus matching Sonnet on pass rate while costing 1.7× more. Sonnet is the operational default. + +11. **Community-hyped techniques need quantified A/B tests.** The CodeMode experiment in Run 16 took ~4 hours to reject a feature that looked plausible on paper. The same methodology that validates our default config is what lets us reject features confidently. 
+ +--- + +## 7. How to run the suite + +```bash +# Full suite (~100–150 min) +LLM_TESTS_ENABLED=1 pytest tests/llm/ -v + +# Smoke subset (~10 min) +LLM_TESTS_ENABLED=1 pytest tests/llm/ -m smoke -v + +# Progressive tier only (~60 min) +LLM_TESTS_ENABLED=1 pytest tests/llm/ -m progressive -v + +# Iterate on flaky tests (~10 min) +LLM_TESTS_ENABLED=1 pytest tests/llm/ -m flaky -v + +# Single case +LLM_TESTS_ENABLED=1 pytest tests/llm/test_06_progressive.py -k thermostat_L1 -v +``` + +Reports land in `$LLM_TESTS_RUNS_DIR/benchmark.md` / `benchmark.json`. After each run, copy results into [`llm-test-benchmark.md`](llm-test-benchmark.md) to version-control. + +To regenerate every plot in this report from the committed benchmark data: + +```bash +python docs/testing/plots/generate_plots.py +``` + +--- + +## 8. Reference files + +| Doc | What it covers | +|---|---| +| [`llm-test-benchmark.md`](llm-test-benchmark.md) | Raw benchmark data — per-tool L1/L2/L3 matrix, run history table, workflow results, flaky-test log | +| [`frameworks-summary.md`](frameworks-summary.md) | Unit / integration / LLM side-by-side — counts, strengths, weaknesses, improvement ideas | +| [`testing.md`](testing.md) | Contributor guide for unit + integration tests, CI shards, Docker setup, writing new tests | +| [`benchmark-description-guidance.md`](benchmark-description-guidance.md) | Negative-result experiment: ~35 tool description edits that did **not** move L1 pass rate | +| [`llm-testing-methodology.md`](llm-testing-methodology.md) | Earlier deep-dive draft — superseded by this README but kept for the narrative lessons section | +| [`../knowledge/codemode-benchmark-2026-04-05.md`](../knowledge/codemode-benchmark-2026-04-05.md) | Full writeup of the CodeMode A/B experiment referenced in §5.7 | +| [`plots/generate_plots.py`](plots/generate_plots.py) | Reproducible source for every chart in this report | diff --git a/docs/benchmark-description-guidance.md 
b/docs/testing/benchmark-description-guidance.md similarity index 100% rename from docs/benchmark-description-guidance.md rename to docs/testing/benchmark-description-guidance.md diff --git a/docs/testing-frameworks-summary.md b/docs/testing/frameworks-summary.md similarity index 99% rename from docs/testing-frameworks-summary.md rename to docs/testing/frameworks-summary.md index 2c463af..99d57d9 100644 --- a/docs/testing-frameworks-summary.md +++ b/docs/testing/frameworks-summary.md @@ -159,7 +159,7 @@ Written at session end to `LLM_TESTS_RUNS_DIR/`: | `benchmark_history.json` | JSON array | Per-run summary (last 50 runs) for trend tracking | | `ndjson_logs/.ndjson` | NDJSON | Raw Claude CLI stream per test (for debugging tool call sequences) | -Latest results are copied to `docs/llm-test-benchmark.md` for version control. +Latest results are copied to `docs/testing/llm-test-benchmark.md` for version control. ### Strengths @@ -300,4 +300,4 @@ LLM_TESTS_ENABLED=1 pytest tests/llm/ -v # full (~160 tests, 2-3 | `tests/llm/runner.py` | `run_claude()`, NDJSON parsing, `ClaudeResult` | | `tests/llm/eval_parser.py` | Auto-parse skill eval.md into test cases | | `.github/workflows/ci.yml` | CI pipeline, shard definitions | -| `docs/llm-test-benchmark.md` | Latest benchmark results + run history | +| `docs/testing/llm-test-benchmark.md` | Latest benchmark results + run history | diff --git a/docs/llm-test-benchmark.md b/docs/testing/llm-test-benchmark.md similarity index 84% rename from docs/llm-test-benchmark.md rename to docs/testing/llm-test-benchmark.md index 3805911..8cb2a99 100644 --- a/docs/llm-test-benchmark.md +++ b/docs/testing/llm-test-benchmark.md @@ -4,10 +4,21 @@ | Run | Date | Model | Tests | Passed | Rate | Runtime | Notes | |-----|------|-------|-------|--------|------|---------|-------| -| **13** | **2026-03-26** | **sonnet** | **230** | **160** | **95.8%** | **151 min** | **Post #40 fix + test audit. 
7 fail (3 qaqc, 3 measure quality, 1 sim_L1)** | +| **15** | **2026-04-05** | **sonnet** | **129** | **123** | **95.3%** | **69 min** | **Progressive-only re-run, CodeMode A/B baseline. 6 fail — edit_measure L1/L2/L3 regression, thermal_zones_L1, test_measure_L1, zone_equipment_priority_L3.** | +| 14 | 2026-03-28 | sonnet | 180 | 170 | 94.4% | 157 min | Full suite cross-model sweep baseline. 10 fail (eval + workflow). Also ran haiku (160/180 = 88.9%) and opus (170/180 = 94.4%) same day. | +| 13 | 2026-03-26 | sonnet | 230 | 160 | 95.8% | 151 min | Post #40 fix + test audit. 7 fail (3 qaqc, 3 measure quality, 1 sim_L1). | *Cost is notional API pricing from Claude Code CLI — free on Claude Max.* +## Cross-Run Experiments + +Two comparative runs on 2026-03-28 and 2026-04-05: + +| Experiment | Date | Variants | Finding | +|---|---|---|---| +| Cross-model sweep | 2026-03-28 | haiku / sonnet / opus, same 180-test suite | haiku 88.9% / sonnet 94.4% / opus 94.4%. Opus matches sonnet but costs ~1.7×. Haiku is 40% cheaper at the cost of 5.5pp. | +| FastMCP CodeMode A/B | 2026-04-05 | CodeMode OFF / ON, same 129 progressive tests | OFF 95.3% / ON **24.0%** — 71pp regression. See [`../knowledge/codemode-benchmark-2026-04-05.md`](../knowledge/codemode-benchmark-2026-04-05.md). | + ## Per-Tool Discovery Matrix One row per progressive case. L1=vague, L2=moderate, L3=explicit. @@ -126,8 +137,12 @@ One row per progressive case. L1=vague, L2=moderate, L3=explicit. | 11 | 2026-03-20 | 171 | 164 | 95.9% | — | Full suite with ToolSearch + wiring recipes + enriched descriptions. 12/12 test_09 pass. 7 failures all known flaky (replace_windows_L1 new — agent called search_api instead). | | 12 | 2026-03-20 | 170 | 163 | 95.9% | — | Post description enrichment (all 142 tools ≥40 char). Same 7 flaky failures. No regression. | | 13 | 2026-03-26 | 230 | 160 | 95.8% | — | Post #40 fix + test audit. 63 skipped (test structure). 7 fail: 3 qaqc tier2, 3 measure quality, 1 run_simulation_L1. 
Previously flaky L1s (import_floorplan, list_dynamic_type, check_loads, thermostat, set_wwr, schedule_details, create_loads) ALL passed. | +| 14 | 2026-03-28 | 180 | 170 | 94.4% | $18.96 | Cross-model sweep baseline (sonnet). 157 min. 10 fail: 9 wrong_tool (2× qaqc, 2× troubleshoot, 1× energy-report, 1× systemd_e2e, 2× measure quality, 1× misc) + 1 timeout. Haiku same day: 160/180 = 88.9%, $11.21, 80 min. Opus same day: 170/180 = 94.4%, $32.23, 185 min. | +| 15 | 2026-04-05 | 129 | 123 | 95.3% | $9.29 | CodeMode A/B baseline (OFF). Progressive-only suite (43 cases × 3). 69 min. 6 fail: edit_measure L1/L2/L3 (all 3 → tool regression), thermal_zones_L1, test_measure_L1, zone_equipment_priority_L3. L1=93.0%, L2=97.7%, L3=95.3%. | +| 16 | 2026-04-05 | 129 | 31 | **24.0%** | $22.35 | **CodeMode A/B experiment (ON) — 71pp regression.** 168 min. 67 wrong_tool + 30 timeout + 1 no_mcp_tool. Feature kept as opt-in toggle, NOT default. See `docs/knowledge/codemode-benchmark-2026-04-05.md`. | *Run 8 = combined results from two separate targeted runs (measure authoring 13/15 + cooled beam 10/10).* +*Run 16 is an experimental outlier (CodeMode ON) and is excluded from the main pass-rate timeline in plots.* ## Tool Verification Failures @@ -189,4 +204,4 @@ LLM_TESTS_ENABLED=1 pytest tests/llm/test_06_progressive.py -k "thermostat_L1" - ``` Reports written to `LLM_TESTS_RUNS_DIR/benchmark.md` and `benchmark.json`. -After running, copy to `docs/llm-test-benchmark.md`. +After running, copy to `docs/testing/llm-test-benchmark.md`. diff --git a/docs/testing/llm-testing-methodology.md b/docs/testing/llm-testing-methodology.md new file mode 100644 index 0000000..b630f92 --- /dev/null +++ b/docs/testing/llm-testing-methodology.md @@ -0,0 +1,276 @@ +# LLM Testing Methodology, Implementation & Results + +**openstudio-mcp** — behavioral testing of an MCP server with ~142 tools, where a real LLM agent drives the tests end-to-end. + +> **TL;DR** — 160/167 tests passing (**95.8%**) in Run 13. 
Core methodology: each tool tested at three prompt specificity levels (L1 vague / L2 moderate / L3 explicit). Pass-rate gap between levels isolates tool-description problems from tool-design problems. System prompt is the single biggest lever (44% → 83% in one run). + +--- + +## 1. Why LLM tests exist + +Unit and integration tests verify that MCP tools work in isolation. They don't verify that an LLM agent, given a natural-language request, will **discover and call the correct tool** — the actual user experience. + +Examples of failures only LLM tests catch: +- Agent writes raw IDF files to bypass MCP tools (guardrail regression) +- Agent loops on `list_files` forever instead of calling the right tool +- A tool exists but has a docstring so vague the agent never picks it +- A "correct but surprising" rename breaks discovery for every prompt that doesn't mention the new name + +The LLM suite is the only gate that measures agent behavior end-to-end against a real Claude session hitting a real openstudio-mcp Docker container. + +--- + +## 2. Architecture + +``` +pytest (tests/llm/conftest.py) + │ + ├─ pytest_runtest_protocol ─→ retry loop (up to LLM_TESTS_RETRIES) + │ + └─ run_claude(prompt, ...) (tests/llm/runner.py) + │ + └─ subprocess: claude -p "<prompt>" + --output-format stream-json --verbose + --mcp-config <mcp.json> + --max-turns N --model sonnet + │ + ├─ stdout ──── NDJSON stream ────→ _parse_stream_json() + │ │ + │ └─→ ClaudeResult + │ (tool_calls, tokens, cost, + │ num_turns, final_text) + │ + └─ MCP stdio → openstudio-mcp Docker container + ├─ stdio_suppression wrapping + ├─ 142 MCP tools + └─ shared /runs volume (baseline models) +``` + +### Key implementation points + +| Concern | Where | Detail | +|---|---|---| +| Subprocess spawn | `runner.py:181-239` `run_claude()` | Writes temp `mcp.json`, spawns CLI. Strips `CLAUDECODE` env var (nested `claude -p` fails otherwise). 
| +| Output parsing | `runner.py:242-261` `_parse_stream_json()` | `--output-format stream-json --verbose` is **mandatory** — plain `json` drops `tool_use` blocks. | +| Tool-call extraction | `runner.py:61-106` `ClaudeResult` | Two views: `tool_calls` (all, inc. builtins like ToolSearch/Bash) and `mcp_tool_calls` (MCP-only). | +| Markers & auto-tagging | `conftest.py:42-53, 252-278` | `llm`, `tier1-4`, `stable`, `flaky`, `smoke`, `progressive`, `generic`. Auto-tagged via `FLAKY_TESTS` frozenset. | +| Retry logic | `conftest.py:281-323` | Custom `pytest_runtest_protocol` hook. Each retry consumes one prompt from the budget. | +| Benchmark collection | `conftest.py:342-412, 434-692` | `pytest_runtest_logreport` stores per-test metrics. Session end writes `benchmark.json` / `benchmark.md` / `benchmark_history.json`. | +| Failure classification | `conftest.py:383-390` | `timeout` · `no_mcp_tool` · `wrong_tool`. | +| Prompt budget | `conftest.py` `LLM_TESTS_MAX_PROMPTS` (default 180) | Hard cap prevents runaway cost during iteration. | +| Skill eval auto-discovery | `eval_parser.py:48-90` | Scrapes "Should trigger" / "Should NOT trigger" tables from `.claude/skills/*/eval.md`. | + +### Environment knobs + +| Var | Default | Purpose | +|---|---|---| +| `LLM_TESTS_ENABLED` | unset | Must be `1` to enable the suite | +| `LLM_TESTS_MODEL` | `sonnet` | `sonnet` / `haiku` / `opus` | +| `LLM_TESTS_RETRIES` | `0` | Retry count for non-determinism | +| `LLM_TESTS_MAX_PROMPTS` | `180` | Hard budget cap | +| `LLM_TESTS_TIER` | `all` | `1`/`2`/`3`/`4`/`all` | +| `LLM_TESTS_RUNS_DIR` | `/tmp/llm-test-runs` | Host path mounted as `/runs` in Docker | + +--- + +## 3. Test taxonomy + +Ten test files, organized by what the agent is asked to do. + +| File | Tier | ~Count | Purpose | Pass‑rate signal | +|---|---|---|---|---| +| `test_01_setup.py` | setup | 5 | Creates baseline/HVAC/example models in `/runs`. All other tests depend on these. 
Prompts use explicit tool names to minimize non-determinism. | Dependency gate | +| `test_02_tool_selection.py` | tier1 | 4 | Single-tool discovery, **no model state** (e.g., "What is the server status?"). Fastest tests. | Baseline discovery | +| `test_03_eval_cases.py` | tier3 | 26 | Auto-parsed from `.claude/skills/*/eval.md` "Should trigger" tables. Keeps tests DRY and co-located with skill definitions. | Skill discovery | +| `test_04_workflows.py` | tier2 | 19 | Multi-step chains (3-5 MCP calls): load → weather → HVAC → simulate → extract. | Multi-step composition | +| `test_05_guardrails.py` | tier4 | 3 | **Regression gate**: agent must **NOT** use `Bash`/`Edit`/`Write` to bypass MCP tools. | Safety/bypass | +| `test_06_progressive.py` | progressive | 110 | **The core diagnostic.** 34+ operations × 3 specificity levels. | Tool description quality | +| `test_07_fourpipe_e2e.py` | tier2 | 1 | Full retrofit on 44-zone SystemD model using natural language (no tool names). Two simulations, 40+ turns, ~5 min. | Real-user session | +| `test_08_measure_authoring.py` | tier2 | 8 | Custom measure create/edit/test/export. Regression tests pulled from debug-session JSON exports. | Authoring workflows | +| `test_09_tool_routing.py` | tier4 | 4 | A/B baseline: all 139 tools vs. `recommend_tools` routing. Not in CI. | Tool-routing efficiency | +| `test_10_confusion_pairs.py` | tier4 | 8 | Prompts that could reasonably trigger either of two similar tools (`run_qaqc_checks` vs `validate_model`). | Disambiguation | + +### The progressive test pattern (L1 / L2 / L3) + +Each operation is tested with **three prompts of increasing specificity**: + +| Level | Example (add HVAC) | What it measures | +|---|---|---| +| **L1 — vague** | *"Add HVAC to the building"* | Can the agent discover the tool from keyword scraps alone? 
→ **docstring keyword quality** | +| **L2 — moderate** | *"Add a VAV reheat system to all 10 zones"* | With domain context, can the agent pick the right tool among near-neighbors? → **tool discovery / ToolSearch** | +| **L3 — explicit** | *"Use add_baseline_system to add System 7 VAV reheat"* | Given the exact tool name, does the tool work? → **tool code / API correctness** | + +The **gap between levels** is the diagnostic: + +- **L1 fails, L2/L3 pass** → docstring is missing keywords. Fast fix. (Example: adding "HVAC / heating and cooling" to `add_baseline_system` made L1 pass immediately in Run 3.) +- **L2 fails, L3 passes** → tool is hard to discover even with context. Fix ToolSearch indexing or tool name. +- **L3 fails** → tool is broken. Fix the code. + +This decomposition is why the progressive tier is the most useful part of the suite — it points at the *cause*, not just the symptom. + +--- + +## 4. What gets measured + +Every `run_claude()` call yields a `ClaudeResult` object. These fields are written to `benchmark.json`, aggregated into `benchmark.md`, and appended to `benchmark_history.json`. + +**Per test:** + +| Metric | Source | Meaning | +|---|---|---| +| `passed` | pytest outcome | Binary, *after* retries | +| `attempt` | retry hook | 1 = first try, 2+ = flaky | +| `duration_s` | wall clock | Includes Docker spawn + LLM inference | +| `num_turns` | CLI result | Conversation turns. High = looping. 
| +| `num_tool_calls` | NDJSON | Total MCP tools invoked | +| `tool_calls` | NDJSON | Ordered list — primary assertion target | +| `input_tokens` | CLI usage | Fresh tokens to model | +| `output_tokens` | CLI usage | Tokens generated | +| `cache_read_tokens` | CLI usage | Served from prompt cache (high = tool defs cached) | +| `cost_usd` | CLI result | **Notional** — free on Claude Max | +| `failure_mode` | `conftest.py:383-390` | `timeout` / `no_mcp_tool` / `wrong_tool` | + +**Aggregates:** per-tier pass rate, per-L1/L2/L3 pass rate, token profile by tier, failed-test drill-down with tool sequences, run history (last 50 runs). + +**Explicit gaps (things we don't measure yet):** + +- **Parameter correctness** — a test passes if the right tool is called, even with wrong arguments. +- **First-attempt pass rate** — retries mask flakiness. Only `attempt` captures it, not aggregates. +- **Time-to-first-tool** — slow ToolSearch discovery isn't penalized. +- **Cross-vendor comparison** — every run uses a Claude model (sonnet by default; haiku and opus were swept once, on 2026-03-28 — see [`llm-test-benchmark.md`](llm-test-benchmark.md)). No GPT-4 / Gemini data to validate model-agnostic tool descriptions. +- **Error recovery rate** — when a tool returns `ok:False`, does the agent retry or give up? + +--- + +## 5. 
Results + +### Run history — 13 runs, 2026-03-05 to 2026-03-26 + +![Run history](plots/run_history.png) + +| Run | Date | Tests | Passed | Rate | Key change | +|---|---|---|---|---|---| +| 1 | 03-05 | 50 | 22 | **44.0%** | Baseline — no system prompt, wrong model path | +| 2 | 03-06 | 90 | 75 | **83.3%** | **+system prompt (anti-loop), model path fix, pre-check** → +39pp | +| 3 | 03-07 | 90 | 82 | **91.1%** | +tool description improvements → +8pp | +| 4 | 03-07 | 90 | 84 | 93.3% | Stability run (no code changes) | +| 5 | 03-10 | 107 | 103 | 96.3% | +generic access tests, cleanup | +| 6 | 03-11 | 159 | 153 | 96.2% | **+progressive tier (L1/L2/L3)**, workflows, sim setup | +| 7 | 03-12 | 159 | 155 | **97.5%** | Test consolidation (no tool changes) — high-water mark | +| 8 | 03-13 | 25 | 23 | 92.0% | Measure authoring + cooled beam (targeted runs) | +| 9a/b | 03-19 | 9 | 9 | 100% | Tool-routing A/B baseline (9 cases, neutral delta) | +| 10 | 03-19 | 172 | 166 | 96.5% | Full regression: tags, `recommend_tools`, search_api, docstrings — no regressions | +| 11 | 03-20 | 171 | 164 | 95.9% | +ToolSearch + wiring recipes + enriched descriptions. 7 flaky. | +| 12 | 03-20 | 170 | 163 | 95.9% | Description enrichment (all 142 tools ≥40 char). Same 7 flaky. | +| **13** | **03-26** | **230** | **160** | **95.8%** | **Post #40 fix + test audit. 63 skipped. 7 fail. Previously-flaky L1s all passing.** | + +The two big inflections are the **system prompt** (Run 1→2, +39pp) and **progressive-tier introduction** (Run 5→6, which massively expanded the test space without dropping pass rate). Everything since Run 10 sits in the 95.8-96.5% band — a regime where improvements are marginal and noise dominates. + +### Per-tier pass rate — Run 13 + +![Tier pass rates](plots/tier_pass_rates.png) + +- **setup / tier1 / tier4: 100%** — prerequisites, single-tool discovery, and guardrails are solid. +- **progressive: 98%** (108/110) — the biggest category and the most diagnostic. 
+- **tier3 skill evals: 92%** — 63 additional tests skipped due to test structure issues (these will reappear in future runs). +- **tier2 workflows: 84%** — lowest tier. Three failures are all `run_qaqc_checks` not being called for validation prompts, i.e. a confusion pair with `validate_model`. Multi-step chains are inherently more fragile than single-tool tests. + +### Progressive tier — L1 / L2 / L3 + +![Progressive L1 L2 L3](plots/progressive_l1_l2_l3.png) + +**Left:** aggregate pass rate across 42 progressive cases. L1 93% → L2 95% → L3 100%. The monotone climb is the expected signature of a healthy suite: explicit prompts always succeed, so L3 failures mean broken tools; vague prompts fail more, and the magnitude of the gap tells you how docstring-dependent discovery is. + +**Right:** the only cases that don't pass all three levels. All others are 3/3. + +| Case | Status | Root cause | +|---|---|---| +| import_floorplan | Now passing at all levels | Was flaky — no file path in vague prompt, agent correctly asks for one | +| list_dynamic_type | Now passing | "What sizing parameters?" was too vague; agent used explicit sizing tools | +| check_loads | Now passing | "What loads?" → agent inspected spaces instead of calling `get_load_details` | +| thermostat | Now passing | "Change thermostat settings" needs direction (up/down, by how much) | +| **run_simulation** | **L1 FAIL (Run 13)** | "Run a simulation" genuinely too vague — agent hesitates on a bare prompt | +| **export_measure** | **L1 & L2 FAIL** | Agent can't discover `export_measure` without the explicit name — durable description gap | + +The `export_measure` case is the best example of a real bug the methodology catches: the tool works at L3 (so the implementation is fine), the docstring has keywords, but Claude still doesn't pick it over `list_custom_measures` + `list_files`. Fix is on the tool/description side, not the test. 
+ +### Token profile by tier + +![Token profile](plots/token_profile.png) + +**Left panel (log scale):** cache-read tokens dominate by 2-3 orders of magnitude. Each invocation loads ~27-50K tokens of tool definitions, and Claude's prompt cache serves them on subsequent tests. This is why a 172-test run only costs ~$12 of notional API pricing — the fresh-token footprint per test is tiny (10-30 in, 400-2800 out). + +**Right panel:** cost and turn count per tier. Single-tool tests ≈ 3 turns, $0.06. The cooled-beam comparison workflow is a 22-turn outlier because it runs two full simulations and recovers from sim errors mid-session — it's the only test that costs >$0.10 per run. + +### Failure modes — Run 13 + +![Failure modes](plots/failure_modes.png) + +**Left:** the 7 Run-13 failures fit three buckets. + +| Mode | Count | Cases | +|---|---|---| +| `no_mcp_tool` — agent didn't call any MCP tool | 3 | qaqc tier2 (agent used `validate_model` instead of `run_qaqc_checks`) | +| `wrong_tool` — MCP tool called but not the expected one | 1 | `run_simulation_L1` (intermittent) | +| Measure-quality assertions (new tests) | 3 | measure authoring syntax/structure checks | + +The qaqc cluster is the most interesting: both tools legitimately "check the model", and `validate_model` is a defensible answer. This is a **confusion pair** that needs docstring disambiguation, not a bug. + +**Right:** absolute pass/fail counts by run. Run 1's 28 failures stand out; runs 5-13 are in a stable <10-failure regime despite the test count roughly quadrupling. + +--- + +## 6. Lessons that changed how the suite is built + +1. **System prompts are the biggest lever.** Adding anti-loop guidance to `server.py` `instructions` was a single change that took pass rate from 44% → 83%. Before touching individual tool docstrings, audit the server-wide prompt. + +2. **Docstring keywords >> docstring prose.** `add_baseline_system` L1 was failing until we added "HVAC / heating and cooling" to its docstring. 
A verbose paragraph doesn't help. A single matched keyword does. All 142 tool descriptions are now enforced to be ≥40 chars. + +3. **Progressive testing is the best diagnostic tool.** L1/L2/L3 separates three failure classes (description, discovery, code) that a binary pass/fail obscures completely. Every tool should have at least one progressive case. + +4. **L1 failures are often structural, not fixable.** "What loads?" is genuinely ambiguous — a good agent asks for clarification. Don't bend a tool description to pass a vague prompt if the agent's alternative behavior is reasonable. + +5. **Multi-step workflows are fragile.** Tier 2 is consistently the lowest. ToolSearch + measure execution eats turns; one stall mid-chain fails the whole test. Keep `max_turns` generous (25+ for 3-tool chains, 40+ for e2e). + +6. **Retries mask flakiness.** Default `LLM_TESTS_RETRIES=0` gives you the honest first-attempt signal. Only add retries when you need CI-like confidence — and track the `attempt` field to see which tests are actually brittle. + +7. **Flaky tests need a promotion path.** The `FLAKY_TESTS` frozenset is the quarantine. Pattern-match by substring. Remove patterns when a test stabilizes across 3+ runs. Don't let the list grow indefinitely. + +8. **Description guidance alone doesn't fix L1 failures.** See [`benchmark-description-guidance.md`](benchmark-description-guidance.md) — ~35 tools got disambiguation/when-to-use/emphasis edits and L1 pass rate **did not move**. The remaining failures were structural. + +9. **NDJSON logs per test are indispensable.** When a test fails, the `.ndjson` log shows the exact tool calls, arguments, error responses, and where the agent got stuck. Clearing them per run keeps disk usage sane. + +10. **Stable/flaky classification beats "just run more tests".** Iterating on `-m flaky` (~18 tests, ~10 min) is the right inner loop. Running the full suite is reserved for final validation. + +--- + +## 7. 
Running the suite + +```bash +# Full suite (~100-150 min) +LLM_TESTS_ENABLED=1 pytest tests/llm/ -v + +# Smoke subset (~12 tests, ~10 min) +LLM_TESTS_ENABLED=1 pytest tests/llm/ -m smoke -v + +# Progressive tier only (~60 min) +LLM_TESTS_ENABLED=1 pytest tests/llm/ -m progressive -v + +# Iterate on flaky tests (~10 min) +LLM_TESTS_ENABLED=1 pytest tests/llm/ -m flaky -v + +# Single case +LLM_TESTS_ENABLED=1 pytest tests/llm/test_06_progressive.py -k thermostat_L1 -v +``` + +Reports land in `$LLM_TESTS_RUNS_DIR/benchmark.md` / `benchmark.json`. After each run, copy results into [`llm-test-benchmark.md`](llm-test-benchmark.md) to check into version control. + +--- + +## 8. See also + +- [`llm-test-benchmark.md`](llm-test-benchmark.md) — raw benchmark data, per-tool matrix, run history +- [`frameworks-summary.md`](frameworks-summary.md) — unit/integration/LLM side-by-side, strengths & gaps +- [`benchmark-description-guidance.md`](benchmark-description-guidance.md) — negative-result experiment: description edits that didn't move the needle +- [`testing.md`](testing.md) — general testing guide (unit + integration + CI) +- [`plots/generate_plots.py`](plots/generate_plots.py) — reproduce every chart in this doc (`python docs/testing/plots/generate_plots.py`) diff --git a/docs/testing/plots/codemode_ab.png b/docs/testing/plots/codemode_ab.png new file mode 100644 index 0000000..e85fab5 Binary files /dev/null and b/docs/testing/plots/codemode_ab.png differ diff --git a/docs/testing/plots/failure_modes.png b/docs/testing/plots/failure_modes.png new file mode 100644 index 0000000..52914e5 Binary files /dev/null and b/docs/testing/plots/failure_modes.png differ diff --git a/docs/testing/plots/generate_plots.py b/docs/testing/plots/generate_plots.py new file mode 100644 index 0000000..584dd2f --- /dev/null +++ b/docs/testing/plots/generate_plots.py @@ -0,0 +1,591 @@ +"""Generate LLM test benchmark plots. 
+ +Data sources: +- Runs 1-13: docs/testing/llm-test-benchmark.md run-history table +- Run 14 (2026-03-28): docs/sweeps/sonnet-2026-03-28/benchmark.json (+ haiku/opus) +- Run 15 (2026-04-05): docs/sweeps/codemode-off-2026-04-05/benchmark.json +- Run 16 (2026-04-05): docs/sweeps/codemode-on-2026-04-05/benchmark.json (experiment) + +Run from repo root: + python docs/testing/plots/generate_plots.py +""" +from __future__ import annotations + +import matplotlib + +matplotlib.use("Agg") +import matplotlib.patches as mpatches +import matplotlib.pyplot as plt +import numpy as np +from matplotlib.lines import Line2D +from pathlib import Path + +OUT = Path(__file__).parent + +plt.rcParams.update( + { + "font.size": 10, + "axes.titlesize": 12, + "axes.titleweight": "bold", + "axes.spines.top": False, + "axes.spines.right": False, + "figure.dpi": 120, + } +) + +COLOR_PASS = "#2e7d32" # green +COLOR_WARN = "#ef6c00" # orange +COLOR_FAIL = "#c62828" # red +COLOR_LINE = "#1565c0" # blue +COLOR_ALT = "#7b1fa2" # purple +COLOR_EXP = "#546e7a" # blue-gray (experimental) + + +# -------------------------------------------------------------------- # +# 1. Run history timeline # +# -------------------------------------------------------------------- # +def run_history() -> None: + # Runs 1-15 are the main sonnet progression. Run 16 (CodeMode ON) is + # plotted as an experimental outlier in a different color. 
+ runs = list(range(1, 16)) + rates = [ + 44.0, 83.3, 91.1, 93.3, 96.3, 96.2, 97.5, 92.0, 100.0, 96.5, + 95.9, 95.9, 95.8, + 94.4, # Run 14: 2026-03-28 sonnet 170/180 (full suite) + 95.3, # Run 15: 2026-04-05 codemode-OFF 123/129 (progressive-only) + ] + tests = [ + 50, 90, 90, 90, 107, 159, 159, 25, 9, 172, 171, 170, 230, + 180, 129, + ] + dates = [ + "03-05", "03-06", "03-07", "03-07", "03-10", "03-11", "03-12", + "03-13", "03-19", "03-19", "03-20", "03-20", "03-26", + "03-28", "04-05", + ] + + # Experimental outlier: Run 16 April 5 CodeMode ON + exp_run = 16 + exp_rate = 24.0 + exp_tests = 129 + + inflections = [ + (2, 83.3, "A"), + (3, 91.1, "B"), + (6, 96.2, "C"), + (14, 94.4, "D"), + ] + inflection_labels = { + "A": "+system prompt (anti-loop guidance)", + "B": "+tool description improvements", + "C": "+progressive tier introduced (L1/L2/L3)", + "D": "cross-model sweep (sonnet/haiku/opus)", + } + + fig, ax1 = plt.subplots(figsize=(13, 6.5)) + + ax2 = ax1.twinx() + all_runs = runs + [exp_run] + all_tests = tests + [exp_tests] + bar_h = ax2.bar(all_runs, all_tests, alpha=0.18, color=COLOR_WARN, + zorder=1, width=0.6, label="Tests run (right axis)") + ax2.set_ylabel("Tests run (bars)", color=COLOR_WARN) + ax2.tick_params(axis="y", labelcolor=COLOR_WARN) + ax2.set_ylim(0, max(all_tests) * 1.45) + ax2.spines["top"].set_visible(False) + + line_h, = ax1.plot(runs, rates, marker="o", linewidth=2.5, markersize=9, + color=COLOR_LINE, zorder=3, + label="Pass rate — sonnet, default config") + ax1.fill_between(runs, rates, alpha=0.08, color=COLOR_LINE, zorder=2) + + # Experimental point + dashed connector + exp_h = ax1.scatter([exp_run], [exp_rate], marker="X", s=170, + color=COLOR_FAIL, zorder=4, + label="Run 16 — CodeMode ON (A/B experiment, excluded from main line)") + ax1.plot([runs[-1], exp_run], [rates[-1], exp_rate], + linestyle=":", color=COLOR_FAIL, linewidth=1.5, alpha=0.6, zorder=3) + ax1.text(exp_run, exp_rate - 3, "CodeMode ON\n24.0% (outlier)", + 
ha="center", va="top", fontsize=8.5, color=COLOR_FAIL, fontweight="bold") + + target_h = ax1.axhline(95, color=COLOR_PASS, linestyle="--", alpha=0.6, + linewidth=1.5, label="95% target") + + for run_idx, rate, letter in inflections: + ax1.scatter(run_idx, rate, s=260, facecolor="white", + edgecolor=COLOR_FAIL, linewidth=2, zorder=5) + ax1.text(run_idx, rate, letter, ha="center", va="center", + fontsize=10, fontweight="bold", color=COLOR_FAIL, zorder=6) + + ax1.set_xlabel("Run # (date below)") + ax1.set_ylabel("Pass rate (%)", color=COLOR_LINE) + ax1.set_ylim(18, 110) + xticks = all_runs + ax1.set_xticks(xticks) + xlabels = [f"{r}\n{d}" for r, d in zip(runs, dates)] + ["16\n04-05"] + ax1.set_xticklabels(xlabels, fontsize=8.5) + ax1.tick_params(axis="y", labelcolor=COLOR_LINE) + ax1.grid(axis="y", alpha=0.3, linestyle="--") + + legend_items = [line_h, exp_h, bar_h, target_h] + for letter, text in inflection_labels.items(): + legend_items.append( + Line2D([0], [0], marker="o", markerfacecolor="white", + markeredgecolor=COLOR_FAIL, markersize=10, linewidth=0, + label=f"{letter}: {text}") + ) + ax1.legend(handles=legend_items, loc="lower center", fontsize=8.3, + framealpha=0.95, ncol=2) + + ax1.set_title("LLM Test Suite Pass Rate — Run History " + "(Runs 1–16, 2026-03-05 → 2026-04-05)") + fig.tight_layout() + fig.savefig(OUT / "run_history.png", bbox_inches="tight") + plt.close(fig) + + +# -------------------------------------------------------------------- # +# 2. 
Progressive L1/L2/L3 — Run 15 (2026-04-05 codemode-OFF) # +# -------------------------------------------------------------------- # +def progressive_l1_l2_l3() -> None: + levels = ["L1\n(vague)", "L2\n(moderate)", "L3\n(explicit)"] + passed = [40, 42, 41] + total = 43 + rates = [p / total * 100 for p in passed] + + fig, (ax_a, ax_b) = plt.subplots(1, 2, figsize=(14, 6), + gridspec_kw={"width_ratios": [1, 1.5]}) + + bars = ax_a.bar(levels, rates, color=[COLOR_FAIL, COLOR_WARN, COLOR_PASS], + edgecolor="black", linewidth=0.5) + for bar, p in zip(bars, passed): + ax_a.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 1.5, + f"{p}/{total}\n({bar.get_height():.1f}%)", + ha="center", va="bottom", fontsize=10, fontweight="bold") + ax_a.set_ylabel("Pass rate (%)") + ax_a.set_ylim(0, 118) + ax_a.set_title("Progressive Tier Pass Rate by Prompt Specificity\n" + "Run 15 (2026-04-05, sonnet) — 43 operations × 3 levels") + ax_a.grid(axis="y", alpha=0.3, linestyle="--") + ax_a.axhline(100, color="gray", linestyle=":", alpha=0.4) + + level_legend = [ + mpatches.Patch(color=COLOR_FAIL, label="L1 — vague keywords only"), + mpatches.Patch(color=COLOR_WARN, label="L2 — moderate domain context"), + mpatches.Patch(color=COLOR_PASS, label="L3 — explicit tool name"), + ] + ax_a.legend(handles=level_legend, loc="lower left", fontsize=8.5, framealpha=0.95) + + # Right: Run 15 problem cases (the only 6 failures / 129 tests) + cases = [ + ("thermal_zones", 0, 1, 1), # L1 fail + ("test_measure", 0, 1, 1), # L1 fail + ("zone_equipment_priority", 1, 1, 0), # L3 fail + ("edit_measure", 0, 0, 0), # all 3 fail (regression) + ] + names = [c[0] for c in cases] + l1 = [c[1] for c in cases] + l2 = [c[2] for c in cases] + l3 = [c[3] for c in cases] + x = np.arange(len(names)) + w = 0.26 + ax_b.bar(x - w, l1, w, label="L1 (vague)", color=COLOR_FAIL, + edgecolor="black", linewidth=0.3) + ax_b.bar(x, l2, w, label="L2 (moderate)", color=COLOR_WARN, + edgecolor="black", linewidth=0.3) + ax_b.bar(x + 
w, l3, w, label="L3 (explicit)", color=COLOR_PASS, + edgecolor="black", linewidth=0.3) + ax_b.set_xticks(x) + ax_b.set_xticklabels(names, rotation=12, ha="right", fontsize=9) + ax_b.set_ylim(0, 1.35) + ax_b.set_yticks([0, 1]) + ax_b.set_yticklabels(["FAIL", "PASS"]) + ax_b.set_title("Problem Cases — Run 15 failures\n" + "(39/43 operations pass all 3 levels; edit_measure is an all-level regression)") + ax_b.legend(loc="upper right", fontsize=8.5, framealpha=0.95) + ax_b.grid(axis="y", alpha=0.3, linestyle="--") + + fig.tight_layout() + fig.savefig(OUT / "progressive_l1_l2_l3.png", bbox_inches="tight") + plt.close(fig) + + +# -------------------------------------------------------------------- # +# 3. Tier pass rates — Run 14 (2026-03-28 sonnet full suite) # +# -------------------------------------------------------------------- # +def tier_pass_rates() -> None: + tiers = ["setup", "tier1\n(no model)", "tier2\n(workflows)", "tier3\n(skill evals)", + "tier4\n(guardrails)", "progressive\n(L1/L2/L3)"] + # Run 14: 2026-03-28 sonnet + passed = [6, 4, 33, 21, 3, 103] + total = [6, 4, 37, 26, 3, 104] + rates = [p / t * 100 for p, t in zip(passed, total)] + + fig, ax = plt.subplots(figsize=(12, 6)) + colors = [COLOR_PASS if r >= 95 else (COLOR_WARN if r >= 85 else COLOR_FAIL) for r in rates] + bars = ax.bar(tiers, rates, color=colors, edgecolor="black", linewidth=0.5) + + for bar, p, t in zip(bars, passed, total): + ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 1.3, + f"{p}/{t}\n({bar.get_height():.1f}%)", + ha="center", va="bottom", fontsize=9.5, fontweight="bold") + + target_h = ax.axhline(95, color=COLOR_PASS, linestyle="--", alpha=0.6, + linewidth=1.5, label="95% target") + + ax.set_ylabel("Pass rate (%)") + ax.set_ylim(0, 118) + ax.set_title("LLM Test Pass Rate by Tier — Run 14 (2026-03-28, sonnet)\n" + "170/180 = 94.4% overall, full suite incl. 
expanded progressive tier") + ax.grid(axis="y", alpha=0.3, linestyle="--") + + color_legend = [ + mpatches.Patch(color=COLOR_PASS, label="≥ 95% (on target)"), + mpatches.Patch(color=COLOR_WARN, label="85–94% (warning)"), + mpatches.Patch(color=COLOR_FAIL, label="< 85% (attention)"), + target_h, + ] + ax.legend(handles=color_legend, loc="lower right", fontsize=9, framealpha=0.95) + + fig.tight_layout() + fig.savefig(OUT / "tier_pass_rates.png", bbox_inches="tight") + plt.close(fig) + + +# -------------------------------------------------------------------- # +# 4. Token profile — from 2026-03-28 sonnet per-tier averages # +# -------------------------------------------------------------------- # +def token_profile() -> None: + tiers = ["setup", "tier1", "tier2", "tier3", "tier4", "progressive"] + # Per-test averages (actual values from sonnet-2026-03-28/benchmark.json) + input_tok = [10, 5, 16, 10, 12, 10] + output_tok = [771, 318, 3315, 910, 2496, 869] + cache_tok = [98_124, 34_137, 216_796, 89_930, 186_112, 84_657] + cost = [0.087, 0.047, 0.179, 0.082, 0.162, 0.087] + turns = [5.5, 2.2, 10.5, 5.8, 8.7, 5.9] + + fig, axes = plt.subplots(1, 2, figsize=(14, 6)) + + x = np.arange(len(tiers)) + axes[0].bar(x, cache_tok, color="#90caf9", edgecolor="black", linewidth=0.3, + label="cache-read (tool defs served from cache)") + axes[0].bar(x, output_tok, bottom=cache_tok, color=COLOR_WARN, + edgecolor="black", linewidth=0.3, label="output (model-generated)") + axes[0].bar(x, input_tok, + bottom=[c + o for c, o in zip(cache_tok, output_tok)], + color=COLOR_LINE, edgecolor="black", linewidth=0.3, + label="input (fresh tokens sent)") + axes[0].set_xticks(x) + axes[0].set_xticklabels(tiers, fontsize=9) + axes[0].set_ylabel("Tokens per test (log scale)") + axes[0].set_yscale("log") + axes[0].set_title("Token Profile by Tier — per-test averages\n" + "Run 14 (2026-03-28 sonnet) — cache-read dominates by 100×+") + axes[0].legend(loc="upper left", fontsize=9, framealpha=0.95) + 
axes[0].grid(axis="y", alpha=0.3, linestyle="--", which="both") + + ax_r = axes[1] + ax_r2 = ax_r.twinx() + bars_cost = ax_r.bar(x - 0.2, cost, 0.4, color=COLOR_LINE, + edgecolor="black", linewidth=0.3, + label="notional cost per test (USD, left)") + bars_turns = ax_r2.bar(x + 0.2, turns, 0.4, color=COLOR_WARN, + edgecolor="black", linewidth=0.3, + label="avg conversation turns (right)") + for bar, c in zip(bars_cost, cost): + ax_r.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.004, + f"${c:.2f}", ha="center", va="bottom", fontsize=8) + for bar, t in zip(bars_turns, turns): + ax_r2.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.2, + f"{t:.1f}", ha="center", va="bottom", fontsize=8) + ax_r.set_xticks(x) + ax_r.set_xticklabels(tiers, fontsize=9) + ax_r.set_ylabel("Notional cost per test (USD)", color=COLOR_LINE) + ax_r.tick_params(axis="y", labelcolor=COLOR_LINE) + ax_r2.set_ylabel("Avg turns per test", color=COLOR_WARN) + ax_r2.tick_params(axis="y", labelcolor=COLOR_WARN) + ax_r.set_title("Cost & Turn Count by Tier\n" + "(free on Claude Max — cost is notional API pricing)") + ax_r.set_ylim(0, max(cost) * 1.3) + ax_r2.set_ylim(0, max(turns) * 1.3) + + h1, l1 = ax_r.get_legend_handles_labels() + h2, l2 = ax_r2.get_legend_handles_labels() + ax_r.legend(h1 + h2, l1 + l2, loc="upper left", fontsize=9, framealpha=0.95) + + fig.tight_layout() + fig.savefig(OUT / "token_profile.png", bbox_inches="tight") + plt.close(fig) + + +# -------------------------------------------------------------------- # +# 5. 
Failure modes — Run 14 (full suite) + historical stacked # +# -------------------------------------------------------------------- # +def failure_modes() -> None: + # Run 14 (2026-03-28 sonnet) failure modes + modes_short = ["wrong_tool", "timeout", "no_mcp_tool"] + counts = [9, 1, 0] + descriptions = [ + "eval + workflow:\n2× qaqc, 2× troubleshoot\n1× energy-report,\n1× e2e workflow,\n2× measure quality,\n1× misc", + "1× systemd\nfourpipebeam e2e\n(exceeded wall clock)", + "—", + ] + + fig, (ax_a, ax_b) = plt.subplots(1, 2, figsize=(14, 6), + gridspec_kw={"width_ratios": [1, 1.3]}) + + colors = [COLOR_FAIL, COLOR_WARN, COLOR_ALT] + bars = ax_a.bar(modes_short, counts, color=colors, edgecolor="black", linewidth=0.5) + for bar, d in zip(bars, descriptions): + if bar.get_height() > 0: + ax_a.text(bar.get_x() + bar.get_width() / 2, bar.get_height() / 2, + d, ha="center", va="center", + fontsize=8.5, color="white", fontweight="bold") + else: + ax_a.text(bar.get_x() + bar.get_width() / 2, 0.2, + "0", ha="center", va="bottom", + fontsize=9, color="black") + ax_a.set_ylabel("Failure count") + ax_a.set_title("Run 14 Failures by Mode\n" + "(10 failed / 180 attempted = 94.4% pass)") + ax_a.set_ylim(0, max(counts) + 2) + ax_a.grid(axis="y", alpha=0.3, linestyle="--") + + mode_legend = [ + mpatches.Patch(color=COLOR_FAIL, + label="wrong_tool: MCP tool called, but not expected one"), + mpatches.Patch(color=COLOR_WARN, + label="timeout: exceeded wall clock before finishing"), + mpatches.Patch(color=COLOR_ALT, + label="no_mcp_tool: agent called no MCP tool at all"), + ] + ax_a.legend(handles=mode_legend, loc="upper right", fontsize=8, framealpha=0.95) + + # Right: historical pass/fail stacked + runs = list(range(1, 17)) + passed = [22, 75, 82, 84, 103, 153, 155, 23, 9, 166, 164, 163, 160, 170, 123, 31] + total = [50, 90, 90, 90, 107, 159, 159, 25, 9, 172, 171, 170, 167, 180, 129, 129] + failed = [t - p for p, t in zip(passed, total)] + + # Run 16 is experimental — shade 
differently + regular = 15 + ax_b.bar(runs[:regular], passed[:regular], label="passed", + color=COLOR_PASS, edgecolor="black", linewidth=0.3) + ax_b.bar(runs[:regular], failed[:regular], bottom=passed[:regular], + label="failed", color=COLOR_FAIL, edgecolor="black", linewidth=0.3) + # Run 16 (CodeMode ON) in muted colors + ax_b.bar([runs[regular]], [passed[regular]], color=COLOR_PASS, + edgecolor="black", linewidth=0.3, alpha=0.4, + label="passed (experiment)") + ax_b.bar([runs[regular]], [failed[regular]], bottom=[passed[regular]], + color=COLOR_FAIL, edgecolor="black", linewidth=0.3, alpha=0.4, + label="failed (experiment)") + + for r, p, f in zip(runs, passed, failed): + if f > 0: + ax_b.text(r, p + f + 3, str(f), ha="center", va="bottom", + fontsize=8, color=COLOR_FAIL, fontweight="bold") + + ax_b.set_xticks(runs) + ax_b.set_xlabel("Run #") + ax_b.set_ylabel("Test count (attempted)") + ax_b.set_title("Pass / Fail Absolute Counts by Run (1–16)\n" + "failure count labeled above each bar; Run 16 = CodeMode ON experiment") + ax_b.legend(loc="upper left", fontsize=8.5, framealpha=0.95) + ax_b.grid(axis="y", alpha=0.3, linestyle="--") + + fig.tight_layout() + fig.savefig(OUT / "failure_modes.png", bbox_inches="tight") + plt.close(fig) + + +# -------------------------------------------------------------------- # +# 6. 
NEW: Model comparison (2026-03-28 sonnet/haiku/opus sweep) # +# -------------------------------------------------------------------- # +def model_comparison() -> None: + models = ["haiku", "sonnet", "opus"] + passed = [160, 170, 170] + total = 180 + rates = [p / total * 100 for p in passed] + cost = [11.21, 18.96, 32.23] + duration_min = [79.6, 157.5, 184.6] + + # Per-tier breakdowns + tiers = ["setup", "tier1", "tier2", "tier3", "tier4", "progressive"] + sonnet_t = [100.0, 100.0, 89.2, 80.8, 100.0, 99.0] + haiku_t = [100.0, 100.0, 83.8, 73.1, 100.0, 93.3] + opus_t = [100.0, 100.0, 91.9, 73.1, 100.0, 100.0] + + fig, (ax_a, ax_b) = plt.subplots(1, 2, figsize=(14, 6), + gridspec_kw={"width_ratios": [1, 1.4]}) + + # Left: overall pass rate + cost + x = np.arange(len(models)) + w = 0.38 + ax_b2 = ax_a.twinx() + bars_pass = ax_a.bar(x - w/2, rates, w, color=COLOR_PASS, + edgecolor="black", linewidth=0.4, + label="pass rate (left)") + bars_cost = ax_b2.bar(x + w/2, cost, w, color=COLOR_LINE, + edgecolor="black", linewidth=0.4, + label="notional cost USD (right)") + for bar, p, r in zip(bars_pass, passed, rates): + ax_a.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 1, + f"{p}/{total}\n{r:.1f}%", ha="center", va="bottom", + fontsize=9, fontweight="bold") + for bar, c, d in zip(bars_cost, cost, duration_min): + ax_b2.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.5, + f"${c:.2f}\n{d:.0f} min", ha="center", va="bottom", + fontsize=8.5) + ax_a.set_xticks(x) + ax_a.set_xticklabels(models) + ax_a.set_ylabel("Pass rate (%)", color=COLOR_PASS) + ax_a.tick_params(axis="y", labelcolor=COLOR_PASS) + ax_b2.set_ylabel("Notional cost (USD)", color=COLOR_LINE) + ax_b2.tick_params(axis="y", labelcolor=COLOR_LINE) + ax_a.set_ylim(0, 115) + ax_b2.set_ylim(0, max(cost) * 1.35) + ax_a.set_title("Cross-Model Sweep — 2026-03-28\n" + "Same 180-test suite, retries=0, identical tool definitions") + ax_a.grid(axis="y", alpha=0.3, linestyle="--") + + h1, l1 = 
ax_a.get_legend_handles_labels() + h2, l2 = ax_b2.get_legend_handles_labels() + ax_a.legend(h1 + h2, l1 + l2, loc="upper center", + bbox_to_anchor=(0.5, -0.08), fontsize=9, + framealpha=0.95, ncol=2) + + # Right: per-tier comparison + x2 = np.arange(len(tiers)) + w2 = 0.26 + ax_b.bar(x2 - w2, haiku_t, w2, label="haiku", + color="#90caf9", edgecolor="black", linewidth=0.3) + ax_b.bar(x2, sonnet_t, w2, label="sonnet", + color=COLOR_LINE, edgecolor="black", linewidth=0.3) + ax_b.bar(x2 + w2, opus_t, w2, label="opus", + color=COLOR_ALT, edgecolor="black", linewidth=0.3) + ax_b.axhline(95, color=COLOR_PASS, linestyle="--", alpha=0.5, label="95% target") + ax_b.set_xticks(x2) + ax_b.set_xticklabels(tiers, fontsize=9) + ax_b.set_ylabel("Pass rate (%)") + ax_b.set_ylim(0, 115) + ax_b.set_title("Per-Tier Pass Rate by Model\n" + "(tier3 skill evals hit all 3 models — disambiguation gap)") + ax_b.legend(loc="upper center", bbox_to_anchor=(0.5, -0.08), + fontsize=9, framealpha=0.95, ncol=4) + ax_b.grid(axis="y", alpha=0.3, linestyle="--") + + fig.tight_layout() + fig.savefig(OUT / "model_comparison.png", bbox_inches="tight") + plt.close(fig) + + +# -------------------------------------------------------------------- # +# 7. 
NEW: CodeMode A/B experiment (2026-04-05) # +# -------------------------------------------------------------------- # +def codemode_ab() -> None: + labels = ["CodeMode OFF\n(baseline)", "CodeMode ON\n(experiment)"] + + # Top-level + passed = [123, 31] + total = 129 + rates = [p / total * 100 for p in passed] + + # L1/L2/L3 breakdown + l1_rates = [93.0, 18.6] + l2_rates = [97.7, 27.9] + l3_rates = [95.3, 25.6] + + # Cost / duration / ToolSearch + cost = [9.29, 22.35] + duration_min = [69, 168] + toolsearch = [1.6, 5.8] + output_tok = [127_859, 300_118] + + fig, axes = plt.subplots(1, 3, figsize=(16, 5.5), + gridspec_kw={"width_ratios": [1, 1.4, 1.4]}) + + # Left: overall pass rate + ax = axes[0] + colors = [COLOR_PASS, COLOR_FAIL] + bars = ax.bar(labels, rates, color=colors, edgecolor="black", linewidth=0.5) + for bar, p, r in zip(bars, passed, rates): + ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 2, + f"{p}/{total}\n{r:.1f}%", ha="center", va="bottom", + fontsize=10, fontweight="bold") + ax.axhline(95, color=COLOR_PASS, linestyle="--", alpha=0.5, label="95% target") + ax.set_ylabel("Pass rate (%)") + ax.set_ylim(0, 118) + ax.set_title("Overall Pass Rate\n(same 129-test progressive suite)") + ax.grid(axis="y", alpha=0.3, linestyle="--") + ax.legend(loc="upper right", fontsize=9, framealpha=0.95) + + # Middle: L1/L2/L3 by condition + ax = axes[1] + x = np.arange(2) + w = 0.26 + ax.bar(x - w, l1_rates, w, label="L1 (vague)", + color=COLOR_FAIL, edgecolor="black", linewidth=0.3) + ax.bar(x, l2_rates, w, label="L2 (moderate)", + color=COLOR_WARN, edgecolor="black", linewidth=0.3) + ax.bar(x + w, l3_rates, w, label="L3 (explicit)", + color=COLOR_PASS, edgecolor="black", linewidth=0.3) + for i, (a, b, c) in enumerate(zip(l1_rates, l2_rates, l3_rates)): + ax.text(i - w, a + 1.5, f"{a:.0f}%", ha="center", fontsize=8) + ax.text(i, b + 1.5, f"{b:.0f}%", ha="center", fontsize=8) + ax.text(i + w, c + 1.5, f"{c:.0f}%", ha="center", fontsize=8) + 
ax.set_xticks(x) + ax.set_xticklabels(labels) + ax.set_ylabel("Pass rate (%)") + ax.set_ylim(0, 115) + ax.set_title("Pass Rate by Specificity Level\n(CodeMode regresses ~70pp at every level)") + ax.legend(loc="upper right", fontsize=8.5, framealpha=0.95) + ax.grid(axis="y", alpha=0.3, linestyle="--") + + # Right: cost / duration / toolsearch calls + ax = axes[2] + metrics = ["cost\n(USD)", "duration\n(min)", "ToolSearch\ncalls/test", "output\ntokens (k)"] + off_vals = [9.29, 69, 1.6, 127.9] + on_vals = [22.35, 168, 5.8, 300.1] + # Normalize each metric so bars are comparable on one axis + off_norm = [1.0, 1.0, 1.0, 1.0] + on_norm = [o / f for o, f in zip(on_vals, off_vals)] + x = np.arange(len(metrics)) + w = 0.38 + ax.bar(x - w/2, off_norm, w, color=COLOR_PASS, + edgecolor="black", linewidth=0.3, label="CodeMode OFF (baseline = 1×)") + bars_on = ax.bar(x + w/2, on_norm, w, color=COLOR_FAIL, + edgecolor="black", linewidth=0.3, label="CodeMode ON") + for bar, on_v, off_v in zip(bars_on, on_vals, off_vals): + ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.05, + f"{bar.get_height():.2f}×\n({on_v:.0f} vs {off_v:.0f})", + ha="center", va="bottom", fontsize=8) + ax.set_xticks(x) + ax.set_xticklabels(metrics, fontsize=9) + ax.set_ylabel("Relative to CodeMode OFF (= 1.0)") + ax.set_title("Resource Cost Multipliers\n(CodeMode ON is worse on every metric)") + ax.set_ylim(0, max(on_norm) * 1.4) + ax.axhline(1, color="gray", linestyle=":", alpha=0.5) + ax.legend(loc="upper left", fontsize=9, framealpha=0.95) + ax.grid(axis="y", alpha=0.3, linestyle="--") + + fig.suptitle("FastMCP CodeMode A/B Experiment — 2026-04-05 (sonnet, 129 progressive tests)", + fontsize=13, fontweight="bold", y=1.02) + fig.tight_layout() + fig.savefig(OUT / "codemode_ab.png", bbox_inches="tight") + plt.close(fig) + + +def main() -> None: + run_history() + progressive_l1_l2_l3() + tier_pass_rates() + token_profile() + failure_modes() + model_comparison() + codemode_ab() + print(f"Wrote 7 
plots to {OUT}") + + +if __name__ == "__main__": + main() diff --git a/docs/testing/plots/model_comparison.png b/docs/testing/plots/model_comparison.png new file mode 100644 index 0000000..c5b9909 Binary files /dev/null and b/docs/testing/plots/model_comparison.png differ diff --git a/docs/testing/plots/progressive_l1_l2_l3.png b/docs/testing/plots/progressive_l1_l2_l3.png new file mode 100644 index 0000000..c8d10a6 Binary files /dev/null and b/docs/testing/plots/progressive_l1_l2_l3.png differ diff --git a/docs/testing/plots/run_history.png b/docs/testing/plots/run_history.png new file mode 100644 index 0000000..4ec79bb Binary files /dev/null and b/docs/testing/plots/run_history.png differ diff --git a/docs/testing/plots/tier_pass_rates.png b/docs/testing/plots/tier_pass_rates.png new file mode 100644 index 0000000..db1dc21 Binary files /dev/null and b/docs/testing/plots/tier_pass_rates.png differ diff --git a/docs/testing/plots/token_profile.png b/docs/testing/plots/token_profile.png new file mode 100644 index 0000000..6de5a0b Binary files /dev/null and b/docs/testing/plots/token_profile.png differ diff --git a/docs/testing.md b/docs/testing/testing.md similarity index 100% rename from docs/testing.md rename to docs/testing/testing.md diff --git a/docs/tool-discovery-research.md b/docs/tool-discovery-research.md deleted file mode 100644 index 34ddcfa..0000000 --- a/docs/tool-discovery-research.md +++ /dev/null @@ -1,195 +0,0 @@ -# Tool Discovery & Lazy Loading Research - -**Date:** 2026-03-19 -**Context:** 142 MCP tools causing LLM tool selection degradation (FM1) - -## Problem (Resolved) - -RAG-MCP paper (arxiv:2505.03275) shows selection accuracy drops to 13.6% -at 100+ tools. Initially our LLM tests couldn't discover new tools — -root cause was stale Docker image (ToolSearch indexes at build time). -After Docker rebuild + enriched descriptions, all tools discoverable. -LLM tests 12/12 pass. - -## Approaches Investigated - -### 1. 
Anthropic Tool Search (`defer_loading`) — Most Promising - -Mark tools with `defer_loading: true` — excluded from initial context. -Claude sees only a built-in "Tool Search Tool" (~500 tokens) + always-loaded -tools. When it needs a capability, it searches tool names/descriptions/arg -names and loads matched tools (typically 3-5) into context. - -**Results from Anthropic benchmarks:** -- 85% context reduction -- Opus 4: 49% → 74% accuracy -- Opus 4.5: 79.5% → 88.1% accuracy - -**MCP integration:** -```json -{ - "mcpServers": { - "openstudio": { - "command": "openstudio-mcp", - "toolConfiguration": { - "default_config": { "defer_loading": true }, - "configs": { - "load_osm_model": { "defer_loading": false }, - "save_osm_model": { "defer_loading": false } - } - } - } - } -} -``` - -**Status:** Need to test if Claude Desktop/Code support `defer_loading` -for MCP servers. Works for direct API calls. - -Sources: -- https://platform.claude.com/docs/en/agents-and-tools/tool-use/tool-search-tool -- https://www.anthropic.com/engineering/advanced-tool-use -- https://unified.to/blog/scaling_mcp_tools_with_anthropic_defer_loading - -### 2. FastMCP Namespace Activation (v3.x) - -Tags + `mcp.disable(tags={"hvac"})` at init hides tools from `tools/list`. -Agent calls activation tool → `ctx.enable_components(tags={"namespace:hvac"})` -→ tools appear. Sends `tools/list_changed` notification automatically. - -```python -server = FastMCP("openstudio-mcp") - -@server.tool(tags={"namespace:hvac"}) -def add_baseline_system(...): ... - -@server.tool -async def activate_hvac(ctx: Context) -> str: - await ctx.enable_components(tags={"namespace:hvac"}) - return "HVAC tools activated" - -server.disable(tags={"namespace:hvac"}) # hidden at init -``` - -**Problem:** Claude Desktop and Claude Code do NOT support -`tools/list_changed` notification. Hidden tools stay hidden forever. 
- -**Client support for `tools/list_changed`:** -- Supported: Cursor, VS Code Copilot, Windsurf, Glama, Kilo Code -- NOT supported: Claude Desktop, Claude Code, Cline, Claude.ai - -Source: github.com/apify/mcp-client-capabilities - -### 3. LlamaIndex ObjectIndex + ToolRetriever - -Embed tool descriptions into VectorStoreIndex. At query time, retrieve -top-k most relevant tools via cosine similarity. Only those signatures -get passed to the LLM. - -```python -from llama_index.core.objects import ObjectIndex -obj_index = ObjectIndex.from_objects(all_tools, index_cls=VectorStoreIndex) -agent = FunctionAgent( - tool_retriever=obj_index.as_retriever(similarity_top_k=5), - llm=llm -) -``` - -Not applicable for MCP servers (no control over client-side tool injection). -Useful if building a custom agent that calls MCP tools programmatically. - -### 4. Multi-Agent Routing (LangChain/CrewAI/AutoGen) - -Router LLM classifies query into domain → sub-agent with 5-10 tools handles -it. Each sub-agent sees only its domain's tools. - -High effort, requires architecture change. Not applicable to single MCP -server serving Claude Desktop. - -### 5. Semantic Router MCP (openclaw-mcp-router) - -Single MCP gateway that: -1. Indexes all tools from downstream MCP servers (embeddings in LanceDB) -2. Exposes `mcp_search(query)` returning top-K relevant tools -3. Exposes `mcp_call(tool_name, params)` to execute - -Replaces tens of thousands of schema tokens with 5-tool search results. -Interesting but adds infrastructure complexity. - -### 6. Tool Consolidation - -Merge related tools to reduce count. e.g. all `extract_*` into one with -a `what` parameter. Reduces tool count but loses discoverability of -specific capabilities. 
- -## RAG-MCP Paper Key Numbers - -| Tool Pool Size | Selection Accuracy | -|---------------|-------------------| -| ≤30 tools | >90% | -| 31-70 tools | Degraded (semantic overlap) | -| 100+ tools | 13.6% (baseline), 43% (with retrieval) | - -## What We Built (Phases 1-3) - -- `recommend_tools` meta-tool: keyword routing to 9 groups -- Tags on all 142 tools -- Docstring hardening for bypass-prone tools -- `search_api` + `search_wiring_patterns` for HVAC measure authoring - -**Result:** 96.5% pass rate on existing tests (no regression). New tools -are discoverable via ToolSearch after Docker rebuild. LLM tests 12/12 pass. - -## Claude Code ToolSearch Testing (2026-03-19) - -Claude Code has `ENABLE_TOOL_SEARCH` (default: auto at 10% context threshold). -When active, MCP tools are deferred and discovered via ToolSearch. - -**Test results with `ENABLE_TOOL_SEARCH=true`:** - -| ToolSearch Query | Found our tool? | What it found instead | -|-----------------|----------------|----------------------| -| "search_api" | NO | "No matching deferred tools found" | -| "search" | NO | WebSearch, ExitPlanMode, TodoWrite | -| "api reference" | NO | WebFetch, TodoWrite, WebSearch | -| "SDK classes methods" | NO | LSP, create_measure, get_object_fields | -| "search_wiring" | NO | (empty) | -| "HVAC wiring recipe" | NO | list_zone_hvac_equipment, get_zone_hvac_details | -| "wiring patterns" | NO | create_measure (docstring mentions wiring) | - -**Conclusion:** ToolSearch cannot find `search_api` or `search_wiring_patterns` -with any query. The deferred tool mechanism works (ToolSearch runs, finds other -MCP tools like `create_measure` and `get_object_fields`) but our new tools are -invisible to it. 
Possible causes: -- Tool descriptions not matching ToolSearch's internal index/embedding -- Tool names with underscores may not tokenize well for matching -- ToolSearch may prioritize tools with longer/richer descriptions - -**Root cause found:** ToolSearch indexes tools at Docker image build time. -Volume-mounted code registers new tools at runtime, but ToolSearch's index -is stale. **Docker rebuild fixes everything.** - -After `docker build`: - -| Query | Finds tool? | Position | -|-------|------------|----------| -| "search_api" | search_api | 1st | -| "SDK methods" | search_api | 1st | -| "wiring patterns" | search_wiring_patterns | 1st | -| "four pipe beam wiring" | search_wiring_patterns | 1st | -| "HVAC recipe" | search_wiring_patterns | 4th | -| "recommend tools" | recommend_tools | 1st | - -Enriched descriptions also helped — added use cases, examples, and -keyword-rich text to match likely search queries. - -## Recommendation - -1. **ToolSearch works** — all tools discoverable after Docker rebuild - with enriched descriptions -2. **Always rebuild Docker** after adding new tools (CI does this already) -3. **Enriched descriptions matter** — include use cases, examples, and - keywords that match natural language queries -4. **LLM tests pass** — 12/12 after rebuild (including search_api + search_wiring_patterns discovery) -5. 
**Phase 4 (lazy loading) not needed** — ToolSearch handles the - discovery problem when properly indexed diff --git a/mcp_server/config.py b/mcp_server/config.py index 9be77f9..3425f9a 100644 --- a/mcp_server/config.py +++ b/mcp_server/config.py @@ -27,6 +27,8 @@ def _safe_int(env_val: str, default: int) -> int: INPUT_ROOT = Path(os.environ.get("OPENSTUDIO_MCP_INPUT_ROOT", "/inputs")).resolve() +ENABLE_CODE_MODE = os.environ.get("OSMCP_CODE_MODE", "").lower() in ("1", "true") + ALLOWED_PATH_ROOTS = [ Path("/repo").resolve(), RUN_ROOT, diff --git a/mcp_server/server.py b/mcp_server/server.py index 6d912e3..77941a0 100644 --- a/mcp_server/server.py +++ b/mcp_server/server.py @@ -2,12 +2,12 @@ from fastmcp import FastMCP +from mcp_server.config import ENABLE_CODE_MODE from mcp_server.skills import register_all_skills -from mcp_server.stdout_suppression import create_suppression_middleware +from mcp_server.stdout_suppression import redirect_c_stdout_to_stderr mcp = FastMCP( "openstudio-mcp", - middleware=[create_suppression_middleware()], instructions=( "Building energy simulation server (OpenStudio SDK) with 142 tools for " "creating, modifying, simulating, and analyzing building energy models. " @@ -47,8 +47,13 @@ register_all_skills(mcp) +if ENABLE_CODE_MODE: + from fastmcp.experimental.transforms.code_mode import CodeMode + mcp.add_transform(CodeMode()) + def main(): + redirect_c_stdout_to_stderr() mcp.run() diff --git a/mcp_server/stdout_suppression.py b/mcp_server/stdout_suppression.py index 9b51539..9fe33b8 100644 --- a/mcp_server/stdout_suppression.py +++ b/mcp_server/stdout_suppression.py @@ -1,84 +1,63 @@ -"""Utilities for suppressing unwanted stdout from OpenStudio Python bindings. +"""Redirect C-level stdout to stderr to protect MCP JSON-RPC protocol. -The OpenStudio SWIG bindings print memory leak warnings to stdout: -"swig/python detected a memory leak of type 'openstudio::model::Model *', no destructor found." 
+OpenStudio's SWIG bindings and C++ geometry engine write directly to +C stdout (fd 1): memory leak warnings, Polyhedron diagnostics, etc. +These corrupt the JSON-RPC stream that MCP clients read from stdout. -This pollutes the MCP JSON-RPC protocol which requires clean stdout. -We redirect these warnings to stderr instead. +Strategy: at process startup, permanently redirect fd 1 to stderr so +ALL C-level writes go there harmlessly. Then replace Python's +sys.stdout with a wrapper around the saved original fd so FastMCP's +stdio transport still writes JSON-RPC to the real client pipe. + +This is done once — no per-call suppression, no races, no missed callsites. """ from __future__ import annotations import atexit import contextlib +import io import os import sys -@contextlib.contextmanager -def suppress_openstudio_warnings(): - """Temporarily redirect stdout to stderr to suppress OpenStudio SWIG warnings. +def redirect_c_stdout_to_stderr(): + """Permanently redirect C-level stdout (fd 1) to stderr. - This ensures the MCP JSON-RPC protocol on stdout remains clean. - Works at both Python and C level by redirecting file descriptors. + Must be called before FastMCP's stdio_server() captures sys.stdout. + After this call: + - C code (printf, SWIG, OpenStudio internals) -> fd 1 -> stderr + - Python sys.stdout -> saved fd -> real MCP client pipe """ - # Save original file descriptors - stdout_fd = sys.stdout.fileno() - stderr_fd = sys.stderr.fileno() - - # Duplicate the current stdout FD to restore later - saved_stdout_fd = os.dup(stdout_fd) - - # Flush Python-level buffers before redirecting - sys.stdout.flush() - sys.stderr.flush() - - try: - # Redirect stdout (fd 1) to stderr (fd 2) at OS level - # This catches C-level fprintf(stdout, ...) 
from SWIG - os.dup2(stderr_fd, stdout_fd) - - yield + stdout_fd = sys.stdout.fileno() # 1 + stderr_fd = sys.stderr.fileno() # 2 - finally: - # Flush again before restoring - sys.stdout.flush() - sys.stderr.flush() + # Save the real stdout pipe (to MCP client) as a new fd + saved_fd = os.dup(stdout_fd) - # Restore original stdout - os.dup2(saved_stdout_fd, stdout_fd) - os.close(saved_stdout_fd) + # Point fd 1 at stderr — all future C-level printf goes here + os.dup2(stderr_fd, stdout_fd) + # Build a new Python stdout that writes to the saved fd. + # Line buffering so each JSON-RPC message flushes immediately. + binary = io.open(saved_fd, "wb", closefd=True) + text = io.TextIOWrapper(binary, encoding="utf-8", line_buffering=True) + sys.stdout = text -def create_suppression_middleware(): - """Create a FastMCP middleware that wraps ALL tool calls in stdout suppression. - Returns a Middleware instance. Factory function avoids importing fastmcp - at module level (this module is also used by model_manager which loads - before the server). - """ - from fastmcp.server.middleware import Middleware - - class _StdoutSuppressionMiddleware(Middleware): - async def on_call_tool(self, context, call_next): - with suppress_openstudio_warnings(): - return await call_next(context) - - return _StdoutSuppressionMiddleware() +# Retain context-manager API so model_manager.py imports don't break. +# Now a no-op since fd 1 is permanently redirected. +@contextlib.contextmanager +def suppress_openstudio_warnings(): + """No-op — fd 1 is permanently redirected at startup.""" + yield def _redirect_stdout_to_stderr_at_exit(): - """Redirect stdout to stderr during Python cleanup to catch SWIG warnings. - - OpenStudio prints memory leak warnings when models are garbage-collected - during Python interpreter shutdown. This redirects those to stderr. 
- """ + """Safety net: ensure fd 1 points to stderr during interpreter shutdown.""" try: - stdout_fd = 1 # sys.stdout might be None at exit - stderr_fd = 2 - os.dup2(stderr_fd, stdout_fd) + os.dup2(2, 1) except Exception: - pass # Silently ignore errors during shutdown + pass -# Register the cleanup handler to run before Python exits atexit.register(_redirect_stdout_to_stderr_at_exit) diff --git a/pyproject.toml b/pyproject.toml index 4db802c..dadf638 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ version = "0.8.2" description = "Thin MCP server around OpenStudio CLI with async runs and testable outputs." requires-python = ">=3.11" dependencies = [ - "fastmcp>=0.4.0", + "fastmcp>=3.1.0,<4.0", "pydantic>=2.6", "psutil>=5.9", "jsonschema>=4.21", diff --git a/tests/llm/README.md b/tests/llm/README.md index ec90924..ef7faa4 100644 --- a/tests/llm/README.md +++ b/tests/llm/README.md @@ -20,8 +20,8 @@ LLM_TESTS_ENABLED=1 pytest "tests/llm/test_04_workflows.py::test_workflow[bar_th # Run only tier 1 (tool selection, fastest — ~5 min) LLM_TESTS_ENABLED=1 LLM_TESTS_TIER=1 pytest tests/llm/ -v -# Reduce retries for faster iteration (default: 2) -LLM_TESTS_ENABLED=1 LLM_TESTS_RETRIES=0 pytest tests/llm/ -v +# Add retries for CI-like confidence (default: 0) +LLM_TESTS_ENABLED=1 LLM_TESTS_RETRIES=2 pytest tests/llm/ -v ``` ## Prerequisites @@ -35,7 +35,7 @@ LLM_TESTS_ENABLED=1 LLM_TESTS_RETRIES=0 pytest tests/llm/ -v | Variable | Default | Description | |----------|---------|-------------| | `LLM_TESTS_ENABLED` | (unset) | Set to `1` to enable tests | -| `LLM_TESTS_RETRIES` | `2` | Retry count for flaky LLM tests | +| `LLM_TESTS_RETRIES` | `0` | Retry count for flaky LLM tests | | `LLM_TESTS_TIER` | `all` | Filter: `1`, `2`, `3`, `4`, or `all` | | `LLM_TESTS_MODEL` | `sonnet` | Model: `sonnet`, `haiku`, `opus` | | `LLM_TESTS_MAX_PROMPTS` | `180` | Hard cap on Claude invocations per run | @@ -111,7 +111,7 @@ Each test invocation loads ~27K tokens of tool 
definitions (134 tools). Full sui - **`haiku` model** uses less quota: `LLM_TESTS_MODEL=haiku` (lower pass rate) ### Retries -Default 2 retries handles ~80% pass-rate LLM non-determinism. Set `LLM_TESTS_RETRIES=0` when iterating on a single test to get fast feedback. Set to `1` for a quick check, `2-3` for CI-like confidence. +Default 0 retries (single attempt) gives first-attempt signal for model comparison. Set `LLM_TESTS_RETRIES=2` for CI-like confidence with non-deterministic tests. ### Benchmark reports After each run, benchmark data is written to `LLM_TESTS_RUNS_DIR`: diff --git a/tests/llm/conftest.py b/tests/llm/conftest.py index d51dd83..5e9ac80 100644 --- a/tests/llm/conftest.py +++ b/tests/llm/conftest.py @@ -13,7 +13,7 @@ LLM_TESTS_ENABLED — set to "1" to enable LLM tests (default: disabled) LLM_TESTS_MAX_PROMPTS — hard cap on Claude invocations per run (default: 180) LLM_TESTS_TIER — filter to run specific tier: "1", "2", "3", "4", or "all" - LLM_TESTS_RETRIES — retry count for failed tests (default: 2) + LLM_TESTS_RETRIES — retry count for failed tests (default: 0) LLM_TESTS_MODEL — model to use: "sonnet", "haiku", "opus" (default: "sonnet") LLM_TESTS_RUNS_DIR — host path for /runs volume mount (default: /tmp/llm-test-runs) @@ -217,7 +217,7 @@ def get_tier() -> str: # not block the suite. The retry hook re-runs failed tests up to MAX_RETRIES # times before reporting a final failure. This is similar to pytest-rerunfailures # but implemented as a custom hook to avoid an extra dependency. 
-MAX_RETRIES = int(os.environ.get("LLM_TESTS_RETRIES", "2")) +MAX_RETRIES = int(os.environ.get("LLM_TESTS_RETRIES", "0")) def _is_flaky(nodeid: str) -> bool: @@ -379,14 +379,27 @@ def pytest_runtest_logreport(report): from .runner import _last_result stats = _last_result.stats if _last_result else {} - _benchmark_results.append({ + # Classify failure mode for failed tests + failure_mode = None + if not report.passed and _last_result: + if _last_result.is_error and "Timed out" in _last_result.final_text: + failure_mode = "timeout" + elif not _last_result.tool_names: + failure_mode = "no_mcp_tool" + else: + failure_mode = "wrong_tool" + + entry = { "test_id": report.nodeid, "passed": report.passed, "duration_s": round(duration, 1), "tier": tier, "attempt": attempt, **stats, - }) + } + if failure_mode: + entry["failure_mode"] = failure_mode + _benchmark_results.append(entry) # Persist NDJSON log for debugging if _last_result and _last_result.raw_ndjson: @@ -450,10 +463,15 @@ def pytest_sessionfinish(session, exitstatus): model = os.environ.get("LLM_TESTS_MODEL", "sonnet") ts = datetime.now(timezone.utc).isoformat(timespec="seconds") + code_mode = os.environ.get("LLM_TESTS_CODE_MODE", "0") + code_mode_tests = sum(1 for r in _benchmark_results if r.get("code_mode_active")) + summary = { "timestamp": ts, "model": model, "retries": MAX_RETRIES, + "code_mode": code_mode == "1", + "code_mode_tests": code_mode_tests, "total_tests": total, "passed": passed, "failed": total - passed, @@ -477,7 +495,9 @@ def pytest_sessionfinish(session, exitstatus): md.append(f"# LLM Benchmark Report") md.append(f"") md.append(f"**Date:** {ts} ") - md.append(f"**Model:** {model} | **Retries:** {MAX_RETRIES} ") + cm_label = "ON" if code_mode == "1" else "OFF" + md.append(f"**Model:** {model} | **Retries:** {MAX_RETRIES} " + f"| **CodeMode:** {cm_label} ") md.append(f"**Result:** {passed}/{total} passed ({pass_rate}%) " f"in {total_time:.0f}s ") md.append(f"**Tokens:** {_fmt_tokens(total_input)} 
in " @@ -590,15 +610,49 @@ def _fmt_row(vals): f"L2={l2_pass}/{l_total} | L3={l3_pass}/{l_total}") md.append("") - # Failed tests detail + # ToolSearch overhead analysis + ts_counts = [r.get("toolsearch_count", 0) for r in _benchmark_results] + if any(ts_counts): + avg_ts = sum(ts_counts) / len(ts_counts) if ts_counts else 0 + max_ts = max(ts_counts) if ts_counts else 0 + zero_ts = sum(1 for c in ts_counts if c == 0) + md.append("## Tool Discovery Overhead") + md.append("") + md.append(f"| Metric | Value |") + md.append(f"|--------|-------|") + md.append(f"| Avg ToolSearch calls/test | {avg_ts:.1f} |") + md.append(f"| Max ToolSearch calls | {max_ts} |") + md.append(f"| Tests with 0 ToolSearch | {zero_ts}/{len(ts_counts)} |") + md.append("") + + # Failure mode analysis failed_tests = [r for r in _benchmark_results if not r["passed"]] if failed_tests: + modes = {} + for r in failed_tests: + m = r.get("failure_mode", "unknown") + modes[m] = modes.get(m, 0) + 1 + md.append("## Failure Mode Analysis") + md.append("") + md.append("| Mode | Count | Description |") + md.append("|------|-------|-------------|") + mode_desc = { + "wrong_tool": "MCP tool called but not the expected one", + "no_mcp_tool": "No MCP tool called (stuck in builtins)", + "timeout": "Timed out before completing", + "unknown": "Failure mode not classified", + } + for m, count in sorted(modes.items(), key=lambda x: -x[1]): + md.append(f"| {m} | {count} | {mode_desc.get(m, '')} |") + md.append("") + md.append("## Failed Tests") md.append("") for r in failed_tests: name = _short_test_id(r["test_id"]) tools = " -> ".join(r.get("tool_calls", [])) or "no tools called" - md.append(f"- **{name}** ({r['tier']}): {r['duration_s']:.0f}s, " + mode = r.get("failure_mode", "?") + md.append(f"- **{name}** ({r['tier']}, {mode}): {r['duration_s']:.0f}s, " f"{r.get('num_turns', '?')} turns, tools: {tools}") md.append("") diff --git a/tests/llm/runner.py b/tests/llm/runner.py index cf6db59..3732e54 100644 --- 
a/tests/llm/runner.py +++ b/tests/llm/runner.py @@ -26,6 +26,7 @@ import json import os +import re import subprocess import tempfile from pathlib import Path @@ -75,11 +76,34 @@ def mcp_tool_calls(self) -> list[dict]: """Only MCP tool calls (excluding ToolSearch, Bash, etc.).""" return [c for c in self.tool_calls if c["tool"] not in BUILTIN_TOOLS] + @property + def code_mode_tool_calls(self) -> list[str]: + """Extract tool names from CodeMode execute calls.""" + names = [] + for c in self.mcp_tool_calls: + stripped = c["tool"].removeprefix("mcp__openstudio__") + if stripped == "execute": + code = c["input"].get("code", "") + for m in re.finditer(r'call_tool\(["\'](\w+)["\']', code): + names.append(m.group(1)) + return names + @property def tool_names(self) -> list[str]: - """MCP tool names with mcp__openstudio__ prefix stripped.""" + """MCP tool names with mcp__openstudio__ prefix stripped. + + Includes tools called inside CodeMode execute blocks. + """ prefix = "mcp__openstudio__" - return [c["tool"].removeprefix(prefix) for c in self.mcp_tool_calls] + # CodeMode meta-tools (search, get_schema, execute) excluded from + # domain tool list — only the real tools they invoke count. 
+ code_mode_meta = frozenset({"search", "get_schema", "execute"}) + direct = [ + c["tool"].removeprefix(prefix) + for c in self.mcp_tool_calls + if c["tool"].removeprefix(prefix) not in code_mode_meta + ] + return direct + self.code_mode_tool_calls @property def all_tool_names(self) -> list[str]: @@ -122,6 +146,11 @@ def cache_read_tokens(self) -> int: usage = self.result.get("usage", {}) return usage.get("cache_read_input_tokens", 0) + @property + def toolsearch_count(self) -> int: + """Number of ToolSearch calls — proxy for tool discovery overhead.""" + return sum(1 for c in self.tool_calls if c["tool"] == "ToolSearch") + @property def stats(self) -> dict: """Summary stats for benchmarking.""" @@ -134,6 +163,14 @@ def stats(self) -> dict: "cache_read_tokens": self.cache_read_tokens, "tool_calls": self.tool_names, "num_tool_calls": len(self.tool_names), + "all_tool_calls": self.all_tool_names, + "toolsearch_count": self.toolsearch_count, + "is_timeout": self.is_error and "Timed out" in self.final_text, + "code_mode_active": bool(self.code_mode_tool_calls), + "code_executions": sum( + 1 for c in self.mcp_tool_calls + if c["tool"].removeprefix("mcp__openstudio__") == "execute" + ), } @@ -202,12 +239,12 @@ def run_claude( return _last_result -def _parse_stream_json(raw: str) -> ClaudeResult: +def _parse_stream_json(raw: str | None) -> ClaudeResult: """Parse newline-delimited JSON from stream-json output.""" messages = [] result_obj = {} - for line in raw.strip().splitlines(): + for line in (raw or "").strip().splitlines(): line = line.strip() if not line: continue @@ -230,6 +267,7 @@ def _write_mcp_config() -> Path: runs_dir = os.environ.get("LLM_TESTS_RUNS_DIR", _default_runs) assets_dir = str(Path(__file__).resolve().parents[1] / "assets") + code_mode = os.environ.get("LLM_TESTS_CODE_MODE", "0") config = { "mcpServers": { "openstudio": { @@ -240,6 +278,7 @@ def _write_mcp_config() -> Path: "-v", f"{assets_dir}:/test-assets:ro", "-v", f"{assets_dir}:/inputs:ro", 
"-e", "OPENSTUDIO_MCP_MODE=prod", + "-e", f"OSMCP_CODE_MODE={code_mode}", "openstudio-mcp:dev", "openstudio-mcp", ], diff --git a/tests/test_concurrent_tools.py b/tests/test_concurrent_tools.py new file mode 100644 index 0000000..54d0181 --- /dev/null +++ b/tests/test_concurrent_tools.py @@ -0,0 +1,113 @@ +"""Regression test for issue #42: stdout suppression race condition. + +The global FastMCP middleware held os.dup2() on fd 1 (stdout->stderr) for +the entire tool call. FastMCP dispatches sync tools via +anyio.to_thread.run_sync, so two tools CAN run concurrently. When Thread A +held the redirect, Thread B's JSON-RPC response goes to stderr and the +client receives nothing -> MCP error -32001 timeout. + +This test fires a slow tool (create_baseline_osm, several seconds) and a +fast tool (get_server_status, near-instant) concurrently. On buggy code, +get_server_status's response is lost -> timeout. After the fix, both return. +""" +import asyncio +import pytest + +from conftest import integration_enabled, server_params, unwrap +from mcp import ClientSession +from mcp.client.stdio import stdio_client + + +@pytest.mark.integration +def test_concurrent_tool_calls_both_respond(): + # Regression: issue #42 — concurrent tool calls lost responses due to + # global stdout suppression middleware redirecting fd 1 for entire tool duration. 
+ if not integration_enabled(): + pytest.skip("Set RUN_OPENSTUDIO_INTEGRATION=1 to enable MCP integration tests.") + + async def _run(): + async with stdio_client(server_params()) as (read, write): + async with ClientSession(read, write) as session: + await session.initialize() + + # --- Arrange --- + # Fire slow tool first + baseline_task = asyncio.create_task( + session.call_tool("create_baseline_osm", { + "name": "concurrent_race_test", "num_floors": 1, + }) + ) + # Small delay so baseline_osm enters its execution window + await asyncio.sleep(0.5) + + # --- Act --- + # Fire fast tool while slow tool holds middleware fd redirect + status_task = asyncio.create_task( + session.call_tool("get_server_status", {}) + ) + + # --- Assert --- + # 30s timeout: get_server_status should return in <1s. + # If it times out, the race condition is present — the response + # went to stderr and the client never received it. + try: + baseline_res, status_res = await asyncio.wait_for( + asyncio.gather(baseline_task, status_task), + timeout=30, + ) + except asyncio.TimeoutError: + pytest.fail( + "Concurrent tool call timed out — stdout suppression race " + "condition is present (issue #42). get_server_status response " + "was likely written to stderr while create_baseline_osm held " + "the fd 1 redirect." + ) + + baseline = unwrap(baseline_res) + status = unwrap(status_res) + + assert baseline.get("ok") is True, f"create_baseline_osm failed: {baseline}" + assert status.get("ok") is True, f"get_server_status failed: {status}" + assert "run_root" in status, f"status missing expected keys: {status}" + + asyncio.run(_run()) + + +@pytest.mark.integration +def test_concurrent_fast_tools_both_respond(): + # Regression: issue #42 — even two fast tools can race if both enter + # the middleware's fd redirect window simultaneously. 
+ if not integration_enabled(): + pytest.skip("Set RUN_OPENSTUDIO_INTEGRATION=1 to enable MCP integration tests.") + + async def _run(): + async with stdio_client(server_params()) as (read, write): + async with ClientSession(read, write) as session: + await session.initialize() + + # Fire two fast tools concurrently + task_a = asyncio.create_task( + session.call_tool("get_server_status", {}) + ) + task_b = asyncio.create_task( + session.call_tool("get_server_status", {}) + ) + + try: + res_a, res_b = await asyncio.wait_for( + asyncio.gather(task_a, task_b), + timeout=15, + ) + except asyncio.TimeoutError: + pytest.fail( + "Concurrent fast tool calls timed out — stdout suppression " + "race condition (issue #42)." + ) + + a = unwrap(res_a) + b = unwrap(res_b) + + assert a.get("ok") is True, f"First get_server_status failed: {a}" + assert b.get("ok") is True, f"Second get_server_status failed: {b}" + + asyncio.run(_run())