diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 0f4fb7b..074efea 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -81,8 +81,8 @@ jobs: EXTRA_ENV="" ;; 5) - # HVAC supply wiring simulation smoke tests (DOAS, radiant, district, beams) - FILES="tests/test_hvac_supply_sim.py" + # HVAC supply wiring simulation smoke tests (DOAS, radiant, district, beams) + concurrent regression + FILES="tests/test_hvac_supply_sim.py tests/test_concurrent_tools.py" EXTRA_ENV="" ;; esac diff --git a/mcp_server/server.py b/mcp_server/server.py index 6cb2eec..79b180b 100644 --- a/mcp_server/server.py +++ b/mcp_server/server.py @@ -3,6 +3,7 @@ from fastmcp import FastMCP from mcp_server.skills import register_all_skills +from mcp_server.stdout_suppression import redirect_c_stdout_to_stderr mcp = FastMCP("openstudio-mcp") @@ -10,6 +11,7 @@ def main(): + redirect_c_stdout_to_stderr() mcp.run() diff --git a/mcp_server/stdout_suppression.py b/mcp_server/stdout_suppression.py index 5cc8032..9fe33b8 100644 --- a/mcp_server/stdout_suppression.py +++ b/mcp_server/stdout_suppression.py @@ -1,67 +1,63 @@ -"""Utilities for suppressing unwanted stdout from OpenStudio Python bindings. +"""Redirect C-level stdout to stderr to protect MCP JSON-RPC protocol. -The OpenStudio SWIG bindings print memory leak warnings to stdout: -"swig/python detected a memory leak of type 'openstudio::model::Model *', no destructor found." +OpenStudio's SWIG bindings and C++ geometry engine write directly to +C stdout (fd 1): memory leak warnings, Polyhedron diagnostics, etc. +These corrupt the JSON-RPC stream that MCP clients read from stdout. -This pollutes the MCP JSON-RPC protocol which requires clean stdout. -We redirect these warnings to stderr instead. +Strategy: at process startup, permanently redirect fd 1 to stderr so +ALL C-level writes go there harmlessly.
Then replace Python's +sys.stdout with a wrapper around the saved original fd so FastMCP's +stdio transport still writes JSON-RPC to the real client pipe. + +This is done once — no per-call suppression, no races, no missed callsites. """ from __future__ import annotations import atexit import contextlib +import io import os import sys -@contextlib.contextmanager -def suppress_openstudio_warnings(): - """Temporarily redirect stdout to stderr to suppress OpenStudio SWIG warnings. +def redirect_c_stdout_to_stderr(): + """Permanently redirect C-level stdout (fd 1) to stderr. - This ensures the MCP JSON-RPC protocol on stdout remains clean. - Works at both Python and C level by redirecting file descriptors. + Must be called before FastMCP's stdio_server() captures sys.stdout. + After this call: + - C code (printf, SWIG, OpenStudio internals) -> fd 1 -> stderr + - Python sys.stdout -> saved fd -> real MCP client pipe """ - # Save original file descriptors - stdout_fd = sys.stdout.fileno() - stderr_fd = sys.stderr.fileno() + stdout_fd = sys.stdout.fileno() # 1 + stderr_fd = sys.stderr.fileno() # 2 - # Duplicate the current stdout FD to restore later - saved_stdout_fd = os.dup(stdout_fd) + # Save the real stdout pipe (to MCP client) as a new fd + saved_fd = os.dup(stdout_fd) - # Flush Python-level buffers before redirecting - sys.stdout.flush() - sys.stderr.flush() - - try: - # Redirect stdout (fd 1) to stderr (fd 2) at OS level - # This catches C-level fprintf(stdout, ...) from SWIG - os.dup2(stderr_fd, stdout_fd) + # Point fd 1 at stderr — all future C-level printf goes here + os.dup2(stderr_fd, stdout_fd) - yield + # Build a new Python stdout that writes to the saved fd. + # Line buffering so each JSON-RPC message flushes immediately.
+ binary = io.open(saved_fd, "wb", closefd=True) + text = io.TextIOWrapper(binary, encoding="utf-8", line_buffering=True) + sys.stdout = text - finally: - # Flush again before restoring - sys.stdout.flush() - sys.stderr.flush() - # Restore original stdout - os.dup2(saved_stdout_fd, stdout_fd) - os.close(saved_stdout_fd) +# Retain context-manager API so model_manager.py imports don't break. +# Now a no-op since fd 1 is permanently redirected. +@contextlib.contextmanager +def suppress_openstudio_warnings(): + """No-op — fd 1 is permanently redirected at startup.""" + yield def _redirect_stdout_to_stderr_at_exit(): - """Redirect stdout to stderr during Python cleanup to catch SWIG warnings. - - OpenStudio prints memory leak warnings when models are garbage-collected - during Python interpreter shutdown. This redirects those to stderr. - """ + """Safety net: ensure fd 1 points to stderr during interpreter shutdown.""" try: - stdout_fd = 1 # sys.stdout might be None at exit - stderr_fd = 2 - os.dup2(stderr_fd, stdout_fd) + os.dup2(2, 1) except Exception: - pass # Silently ignore errors during shutdown + pass -# Register the cleanup handler to run before Python exits atexit.register(_redirect_stdout_to_stderr_at_exit) diff --git a/tests/test_concurrent_tools.py b/tests/test_concurrent_tools.py new file mode 100644 index 0000000..fcdfeda --- /dev/null +++ b/tests/test_concurrent_tools.py @@ -0,0 +1,113 @@ +"""Regression test for issue #42: stdout suppression race condition. + +The global FastMCP middleware held os.dup2() on fd 1 (stdout→stderr) for +the entire tool call. FastMCP dispatches sync tools via +anyio.to_thread.run_sync, so two tools CAN run concurrently. When Thread A +held the redirect, Thread B's JSON-RPC response goes to stderr and the +client receives nothing → MCP error -32001 timeout. + +This test fires a slow tool (create_baseline_osm, several seconds) and a +fast tool (get_server_status, near-instant) concurrently.
On buggy code, +get_server_status's response is lost → timeout. After the fix, both return. """ +import asyncio +import pytest + +from conftest import integration_enabled, server_params, unwrap +from mcp import ClientSession +from mcp.client.stdio import stdio_client + + +@pytest.mark.integration +def test_concurrent_tool_calls_both_respond(): + # Regression: issue #42 — concurrent tool calls lost responses due to + # global stdout suppression middleware redirecting fd 1 for entire tool duration. + if not integration_enabled(): + pytest.skip("Set RUN_OPENSTUDIO_INTEGRATION=1 to enable MCP integration tests.") + + async def _run(): + async with stdio_client(server_params()) as (read, write): + async with ClientSession(read, write) as session: + await session.initialize() + + # --- Arrange --- + # Fire slow tool first + baseline_task = asyncio.create_task( + session.call_tool("create_baseline_osm", { + "name": "concurrent_race_test", "num_floors": 1, + }) + ) + # Small delay so baseline_osm enters its execution window + await asyncio.sleep(0.5) + + # --- Act --- + # Fire fast tool while slow tool holds middleware fd redirect + status_task = asyncio.create_task( + session.call_tool("get_server_status", {}) + ) + + # --- Assert --- + # 30s timeout: get_server_status should return in <1s. + # If it times out, the race condition is present — the response + # went to stderr and the client never received it. + try: + baseline_res, status_res = await asyncio.wait_for( + asyncio.gather(baseline_task, status_task), + timeout=30, + ) + except asyncio.TimeoutError: + pytest.fail( + "Concurrent tool call timed out — stdout suppression race " + "condition is present (issue #42). get_server_status response " + "was likely written to stderr while create_baseline_osm held " + "the fd 1 redirect."
+ ) + + baseline = unwrap(baseline_res) + status = unwrap(status_res) + + assert baseline.get("ok") is True, f"create_baseline_osm failed: {baseline}" + assert status.get("ok") is True, f"get_server_status failed: {status}" + assert "run_root" in status, f"status missing expected keys: {status}" + + asyncio.run(_run()) + + +@pytest.mark.integration +def test_concurrent_fast_tools_both_respond(): + # Regression: issue #42 — even two fast tools can race if both enter + # the middleware's fd redirect window simultaneously. + if not integration_enabled(): + pytest.skip("Set RUN_OPENSTUDIO_INTEGRATION=1 to enable MCP integration tests.") + + async def _run(): + async with stdio_client(server_params()) as (read, write): + async with ClientSession(read, write) as session: + await session.initialize() + + # Fire two fast tools concurrently + task_a = asyncio.create_task( + session.call_tool("get_server_status", {}) + ) + task_b = asyncio.create_task( + session.call_tool("get_server_status", {}) + ) + + try: + res_a, res_b = await asyncio.wait_for( + asyncio.gather(task_a, task_b), + timeout=15, + ) + except asyncio.TimeoutError: + pytest.fail( + "Concurrent fast tool calls timed out — stdout suppression " + "race condition (issue #42)." + ) + + a = unwrap(res_a) + b = unwrap(res_b) + + assert a.get("ok") is True, f"First get_server_status failed: {a}" + assert b.get("ok") is True, f"Second get_server_status failed: {b}" + + asyncio.run(_run())