Skip to content
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
File renamed without changes.
2 changes: 1 addition & 1 deletion massgen/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@
from .message_templates import MessageTemplates, get_templates
from .orchestrator import Orchestrator, create_orchestrator

__version__ = "0.1.71"
__version__ = "0.1.72"
__author__ = "MassGen Contributors"


Expand Down
42 changes: 40 additions & 2 deletions massgen/backend/chat_completions.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,11 @@
CustomToolChunk,
ToolExecutionConfig,
)
from .llm_circuit_breaker import (
CircuitBreakerOpenError,
LLMCircuitBreaker,
LLMCircuitBreakerConfig,
)


class ChatCompletionsBackend(StreamingBufferMixin, CustomToolAndMCPBackend):
Expand All @@ -58,6 +63,8 @@ class ChatCompletionsBackend(StreamingBufferMixin, CustomToolAndMCPBackend):
"""

def __init__(self, api_key: str | None = None, **kwargs):
# Extract circuit breaker config before passing to super
cb_config = self._build_circuit_breaker_config(kwargs)
super().__init__(api_key, **kwargs)
# Backend name is already set in MCPBackend, but we may need to override it
self.backend_name = self.get_provider_name()
Expand All @@ -72,6 +79,27 @@ def __init__(self, api_key: str | None = None, **kwargs):
self._stream_usage_received: bool = True # True = no pending estimation needed
# Track reasoning state for streaming (needed for reasoning_done transition)
self._reasoning_active: bool = False
self.circuit_breaker = LLMCircuitBreaker(
config=cb_config,
backend_name=self.get_provider_name(),
)

@staticmethod
def _build_circuit_breaker_config(
    kwargs: dict[str, Any],
) -> LLMCircuitBreakerConfig:
    """Build a circuit-breaker config from ``llm_circuit_breaker_*`` kwargs.

    Every key of the form ``llm_circuit_breaker_<param>`` is removed from
    *kwargs* (mutated in place, so the remaining kwargs can be forwarded to
    the parent constructor untouched) and passed as ``<param>`` to
    LLMCircuitBreakerConfig.
    """
    prefix = "llm_circuit_breaker_"
    matched = [key for key in kwargs if key.startswith(prefix)]
    cb_kwargs: dict[str, Any] = {key[len(prefix) :]: kwargs.pop(key) for key in matched}
    return LLMCircuitBreakerConfig(**cb_kwargs)

def finalize_token_tracking(self) -> None:
"""Finalize token tracking by estimating tokens for any interrupted streams.
Expand Down Expand Up @@ -276,9 +304,19 @@ async def _stream_with_custom_and_mcp_tools(
model=model,
operation="stream",
) as llm_span:
# Start streaming - wrap in try/except for context length errors
# Start streaming - wrap with circuit breaker + context length handling
try:
stream = await client.chat.completions.create(**api_params)

async def _make_api_call():
return await client.chat.completions.create(**api_params)

stream = await self.circuit_breaker.call_with_retry(
_make_api_call,
agent_id=agent_id,
)
except CircuitBreakerOpenError:
self.end_api_call_timing(success=False, error="circuit_breaker_open")
raise
except Exception as e:
if is_context_length_error(e) and not _compression_retry:
# Context length exceeded on initial request - compress and retry
Expand Down
52 changes: 51 additions & 1 deletion massgen/backend/gemini.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,11 @@
PostEvaluationResponse,
VoteOnlyCoordinationResponse,
)
from .llm_circuit_breaker import (
CircuitBreakerOpenError,
LLMCircuitBreaker,
LLMCircuitBreakerConfig,
)
from .rate_limiter import GlobalRateLimiter


Expand Down Expand Up @@ -247,6 +252,9 @@ def __init__(self, api_key: str | None = None, **kwargs):
# Store Gemini-specific API key before calling parent init
gemini_api_key = api_key or os.getenv("GOOGLE_API_KEY") or os.getenv("GEMINI_API_KEY")

# Extract circuit breaker config before other kwargs processing
cb_config = self._build_circuit_breaker_config(kwargs)

# Extract and remove enable_rate_limit and backoff config
enable_rate_limit = kwargs.pop("enable_rate_limit", False)
model_name = kwargs.get("model", "")
Expand Down Expand Up @@ -293,6 +301,12 @@ def __init__(self, api_key: str | None = None, **kwargs):
self.backoff_retry_count = 0
self.backoff_total_delay = 0.0

# LLM circuit breaker (opt-in, default disabled)
self.circuit_breaker = LLMCircuitBreaker(
config=cb_config,
backend_name="gemini",
)

# Initialize multi-dimensional rate limiter for Gemini API
# Supports RPM (Requests Per Minute), TPM (Tokens Per Minute), RPD (Requests Per Day)
# Configuration loaded from massgen/config/rate_limits.yaml
Expand Down Expand Up @@ -335,6 +349,23 @@ def __init__(self, api_key: str | None = None, **kwargs):
self.rate_limiter = None
logger.info(f"[Gemini] Rate limiting disabled for '{model_name}'")

@staticmethod
def _build_circuit_breaker_config(
    kwargs: dict[str, Any],
) -> LLMCircuitBreakerConfig:
    """Pop ``llm_circuit_breaker_*`` entries out of *kwargs* into a config.

    *kwargs* is mutated in place: each matching key is removed so it is not
    forwarded to the base backend, and its suffix becomes a constructor
    argument of LLMCircuitBreakerConfig.
    """
    prefix = "llm_circuit_breaker_"
    cb_kwargs: dict[str, Any] = {}
    # Snapshot the keys first so we can pop from kwargs while iterating.
    for key in list(kwargs):
        if key.startswith(prefix):
            cb_kwargs[key[len(prefix) :]] = kwargs.pop(key)
    return LLMCircuitBreakerConfig(**cb_kwargs)

def _normalize_and_resolve_tool_name(self, tool_name: str) -> str:
"""Normalize Gemini tool names and resolve MCP aliases.

Expand Down Expand Up @@ -777,6 +808,11 @@ async def stream_with_tools(self, messages: list[dict[str, Any]], tools: list[di
last_response_with_candidates = None

cfg = self.backoff_config

# Circuit breaker gate
if self.circuit_breaker.should_block():
raise CircuitBreakerOpenError("Circuit breaker is open for gemini")

first_token_recorded = False
for stream_attempt in range(1, cfg.max_attempts + 1):
try:
Expand Down Expand Up @@ -863,6 +899,7 @@ async def stream_with_tools(self, messages: list[dict[str, Any]], tools: list[di

# End API call timing on successful completion
self.end_api_call_timing(success=True)
self.circuit_breaker.record_success()
break

except Exception as stream_exc:
Expand All @@ -873,6 +910,10 @@ async def stream_with_tools(self, messages: list[dict[str, Any]], tools: list[di

if not is_retryable or stream_attempt >= cfg.max_attempts:
if is_retryable:
self.circuit_breaker.record_failure(
error_type=f"exhausted_{status_code or 'unknown'}",
error_message=f"Max retries exhausted: {error_msg[:200]}",
)
yield StreamChunk(
type="error",
error=f"⚠️ Rate limit exceeded after {cfg.max_attempts} retries. Please try again later.",
Expand Down Expand Up @@ -1443,6 +1484,10 @@ def tool_config_for_call(call: dict[str, Any]) -> ToolExecutionConfig:
cont_first_token_recorded = False

# Retry for continuation with backoff
# Circuit breaker gate
if self.circuit_breaker.should_block():
raise CircuitBreakerOpenError("Circuit breaker is open for gemini")

for cont_attempt in range(1, cfg.max_attempts + 1):
try:
# Start API call timing for continuation
Expand Down Expand Up @@ -1519,16 +1564,21 @@ def tool_config_for_call(call: dict[str, Any]) -> ToolExecutionConfig:

# End API call timing on successful completion
self.end_api_call_timing(success=True)
self.circuit_breaker.record_success()
break

except Exception as cont_exc:
# End API call timing with failure
self.end_api_call_timing(success=False, error=str(cont_exc))
is_retryable, status_code, _ = _is_retryable_gemini_error(cont_exc, cfg.retry_statuses)
is_retryable, status_code, error_msg = _is_retryable_gemini_error(cont_exc, cfg.retry_statuses)

if not is_retryable or cont_attempt >= cfg.max_attempts:
# Yield user-friendly error before raising
if is_retryable:
self.circuit_breaker.record_failure(
error_type=f"exhausted_{status_code or 'unknown'}",
error_message=f"Max retries exhausted: {error_msg[:200]}",
)
yield StreamChunk(
type="error",
error=f"⚠️ Rate limit exceeded after {cfg.max_attempts} retries. Please try again later.",
Expand Down
55 changes: 53 additions & 2 deletions massgen/backend/response.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,11 @@
ToolExecutionConfig,
UploadFileError,
)
from .llm_circuit_breaker import (
CircuitBreakerOpenError,
LLMCircuitBreaker,
LLMCircuitBreakerConfig,
)


class _WSEvent:
Expand Down Expand Up @@ -85,6 +90,8 @@ class ResponseBackend(StreamingBufferMixin, CustomToolAndMCPBackend):
"""Backend using the standard Response API format with multimodal support."""

def __init__(self, api_key: str | None = None, **kwargs):
# Extract circuit breaker config before passing to super
cb_config = self._build_circuit_breaker_config(kwargs)
super().__init__(api_key, **kwargs)
self.api_key = api_key or os.getenv("OPENAI_API_KEY")
self.formatter = ResponseFormatter()
Expand All @@ -107,6 +114,27 @@ def __init__(self, api_key: str | None = None, **kwargs):
self._uploaded_file_ids: list[str] = []

# Note: _streaming_buffer is provided by StreamingBufferMixin
self.circuit_breaker = LLMCircuitBreaker(
config=cb_config,
backend_name="response_api",
)

@staticmethod
def _build_circuit_breaker_config(
    kwargs: dict[str, Any],
) -> LLMCircuitBreakerConfig:
    """Extract ``llm_circuit_breaker_``-prefixed settings into a config object.

    Matching keys are deleted from *kwargs* in place (so super().__init__
    never sees them); their un-prefixed names are handed to
    LLMCircuitBreakerConfig as keyword arguments.
    """
    prefix = "llm_circuit_breaker_"
    strip = len(prefix)
    cb_kwargs: dict[str, Any] = {}
    for key in tuple(kwargs.keys()):
        if not key.startswith(prefix):
            continue
        cb_kwargs[key[strip:]] = kwargs.pop(key)
    return LLMCircuitBreakerConfig(**cb_kwargs)

def supports_upload_files(self) -> bool:
return True
Expand Down Expand Up @@ -244,12 +272,20 @@ async def _stream_without_custom_and_mcp_tools(
_compression_retry = kwargs.get("_compression_retry", False)
ws_transport = kwargs.get("_ws_transport")

# Start API call timing for non-MCP path
model = api_params.get("model", "unknown")
self.start_api_call_timing(model)

try:
stream = await self._create_response_stream(
api_params,
client,
ws_transport,
agent_id=agent_id,
)
except CircuitBreakerOpenError:
self.end_api_call_timing(success=False, error="circuit_breaker_open")
raise
except Exception as e:
# Debug: Catch input[N].content format errors and print the problematic message
error_str = str(e)
Expand All @@ -271,6 +307,7 @@ async def _stream_without_custom_and_mcp_tools(
from ._context_errors import is_context_length_error

if is_context_length_error(e) and not _compression_retry:
self.end_api_call_timing(success=False, error=str(e))
logger.warning(
f"[{self.get_provider_name()}] Context length exceeded, " f"attempting compression recovery...",
)
Expand Down Expand Up @@ -307,6 +344,7 @@ async def _stream_without_custom_and_mcp_tools(
api_params,
client,
ws_transport,
agent_id=agent_id,
)

# Notify user that compression succeeded
Expand All @@ -323,6 +361,7 @@ async def _stream_without_custom_and_mcp_tools(
f"[{self.get_provider_name()}] Compression recovery successful via summarization " f"({input_count} items)",
)
else:
self.end_api_call_timing(success=False, error=str(e))
raise

async for chunk in self._process_stream(stream, all_params, agent_id):
Expand Down Expand Up @@ -471,7 +510,11 @@ async def _stream_with_custom_and_mcp_tools(
api_params,
client,
ws_transport,
agent_id=agent_id,
)
except CircuitBreakerOpenError:
self.end_api_call_timing(success=False, error="circuit_breaker_open")
raise
except Exception as e:
# Debug: Catch input[N].content format errors and print the problematic message
error_str = str(e)
Expand Down Expand Up @@ -533,6 +576,7 @@ async def _stream_with_custom_and_mcp_tools(
api_params,
client,
ws_transport,
agent_id=agent_id,
)

# Notify user that compression succeeded
Expand Down Expand Up @@ -1758,12 +1802,19 @@ def extract_tool_result_content(self, tool_result_message: dict[str, Any]) -> st
"""Extract content from OpenAI Responses API tool result message."""
return tool_result_message.get("output", "")

async def _create_response_stream(self, api_params, client, ws_transport=None):
async def _create_response_stream(self, api_params, client, ws_transport=None, agent_id=None):
"""Create a response stream via HTTP or websocket transport."""
if ws_transport is not None and ws_transport.is_connected:
logger.debug("[WebSocket] Sending response.create via WebSocket")
return self._ws_event_stream(ws_transport, api_params)
return await client.responses.create(**api_params)

async def _make_api_call():
return await client.responses.create(**api_params)

return await self.circuit_breaker.call_with_retry(
_make_api_call,
agent_id=agent_id,
)

async def _ws_event_stream(self, ws_transport, api_params):
"""Wrap websocket JSON events as objects matching SDK stream chunks."""
Expand Down
Loading
Loading