Commit 00b1e09
Merge branch 'feat/reasoning-timeout' into 'main'

Feat/reasoning timeout

See merge request proserve/genaiid/innovation-assets/hive!4

2 parents (c97d831 + 473be42), commit 00b1e09

6 files changed: 444 additions & 158 deletions

README.md

Lines changed: 21 additions & 1 deletion

````diff
@@ -236,7 +236,27 @@ print(response)
 
 You can also apply the verifier from the previous stage in this inference method, applying it independently to each revision from each model.
 
-### 4) Optimisation
+### 4) Reasoning Timeout
+
+When using reflection or multi-model debate, you can set `max_reasoning_seconds` to cap the total inference time. If the time limit is reached before all reflection rounds complete, the library returns the best answer available at that point. The initial model call always executes; the timeout applies only to the subsequent reflection rounds.
+
+```python
+from bhive import Hive, HiveConfig
+
+bhive_client = Hive()
+bhive_config = HiveConfig(
+    bedrock_model_ids=["anthropic.claude-haiku-4-5-20251001-v1:0"],
+    num_reflections=5,
+    max_reasoning_seconds=10.0,  # return best answer after 10s even if reflections remain
+)
+messages = [{"role": "user", "content": [{"text": "What is 2 + 2?"}]}]
+response = bhive_client.converse(messages, bhive_config)
+print(response)
+```
+
+This is useful for latency-sensitive applications where you want to allow extra reasoning time when available but need a hard upper bound.
+
+### 5) Optimisation
 
 If you are not sure which exact hyperparameter configuration will suit your needs, you can use the hyperparameter optimisation functionality. Here, you can define a set of ranges for the inference parameters such as the Amazon Bedrock models or rounds of reflection and these will be evaluated against a test dataset. You can also specify a budget constraining the maximum cost ($) and maximum latency (seconds) per example.
 
````
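Because `max_reasoning_seconds` composes with the existing verifier and multi-model debate options, one config can put a hard bound on an entire debate. A hypothetical sketch (the toy `check_arithmetic` verifier and the second model id are illustrative, not part of this commit):

```python
from bhive import Hive, HiveConfig

def check_arithmetic(answer: str) -> str:
    # Toy verifier: returns extra context that is appended to the reflection/debate prompt.
    return "Looks correct." if "4" in answer else "The arithmetic may be wrong; re-check it."

bhive_client = Hive()
bhive_config = HiveConfig(
    bedrock_model_ids=[
        "anthropic.claude-haiku-4-5-20251001-v1:0",
        "amazon.nova-lite-v1:0",  # illustrative second model to trigger debate
    ],
    num_reflections=3,
    verifier=check_arithmetic,
    max_reasoning_seconds=30.0,  # hard cap across all debate rounds
)
messages = [{"role": "user", "content": [{"text": "What is 2 + 2?"}]}]
response = bhive_client.converse(messages, bhive_config)
print(response)
```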

pyproject.toml

Lines changed: 1 addition & 1 deletion

```diff
@@ -1,6 +1,6 @@
 [project]
 name = "bee_hive"
-version = "0.7.12"
+version = "0.8.0"
 description = "Library for enabling inference-time-compute augmentations in Bedrock"
 authors = [
     {name = "Jack Butler", email = "jackbtlr@amazon.co.uk"},
```

src/bhive/client.py

Lines changed: 1 addition & 22 deletions

```diff
@@ -97,28 +97,7 @@ def converse(
         logger.info(f"Starting inference with {config=} and {converse_kwargs=}")
 
         _converse_func = functools.partial(self._converse, **converse_kwargs)
-        response: str | list[str] | None = None
-        if config.single_model_single_call:
-            # single model call
-            response, chatlog = inference.single_model_single_call(config, chatlog, _converse_func)
-
-        elif config.multi_model_single_call:
-            # multi model / single round debate
-            response, chatlog = inference.multi_model_single_call(
-                config, chatlog, _converse_func, message
-            )
-
-        elif config.single_model_multi_call:
-            # single model but reflection
-            response, chatlog = inference.single_model_multi_call(
-                config, chatlog, _converse_func, message
-            )
-
-        else:
-            # multi model + multi round debate
-            response, chatlog = inference.multi_model_multi_call(
-                config, chatlog, _converse_func, message
-            )
+        response, chatlog = inference.run_inference(config, chatlog, _converse_func, message)
 
         # parsing structured outputs
         parsed_response = None
```
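With the four-way branch gone, mode selection now lives inside `inference.run_inference`, driven by the same two signals the removed `HiveConfig` properties encoded: `n_models` (one model vs. several) and `num_reflections` (zero vs. more). The single-call modes fall out naturally as the first loop iteration with no further rounds.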

src/bhive/config.py

Lines changed: 2 additions & 13 deletions

```diff
@@ -28,6 +28,7 @@ class HiveConfig(pydantic.BaseModel):
     verifier: Callable[[str], str] | None = None
     use_prompt_caching: bool = False
     output_model: type[pydantic.BaseModel] | None = None
+    max_reasoning_seconds: float | None = pydantic.Field(default=None, gt=0)
 
     @pydantic.field_validator("bedrock_model_ids")
     @classmethod
@@ -42,7 +43,7 @@ def validate_configuration(self: "HiveConfig") -> "HiveConfig":
             logger.warning("We recommend a final aggregator_model when using multiple models.")
         if self.aggregator_model_id and self.n_models == 1:
             logger.warning("No need for an aggregator_model when using a single model.")
-        if self.single_model_single_call and self.verifier:
+        if self.n_models == 1 and self.no_reflections and self.verifier:
             raise ValueError("verifier cannot be provided when using a single model call.")
         if self.use_prompt_caching:
             logger.warning("Cache read / write pricing is approximate but may not be exact.")
@@ -56,18 +57,6 @@ def n_models(self) -> int:
     def no_reflections(self) -> bool:
         return self.num_reflections == 0
 
-    @property
-    def single_model_single_call(self) -> bool:
-        return self.n_models == 1 and self.no_reflections
-
-    @property
-    def multi_model_single_call(self) -> bool:
-        return self.n_models > 1 and self.no_reflections
-
-    @property
-    def single_model_multi_call(self) -> bool:
-        return self.n_models == 1 and not self.no_reflections
-
 
 class TrialConfig(pydantic.BaseModel):
     """Configuration class for Hive trials, managing trial settings and validation."""
```

src/bhive/inference.py

Lines changed: 68 additions & 121 deletions

```diff
@@ -3,6 +3,7 @@
 SPDX-License-Identifier: Apache-2.0
 """
 
+import time
 from typing import Callable
 
 from loguru import logger
@@ -12,6 +13,71 @@
 from bhive.utils import parallel_bedrock_exec
 
 
+def run_inference(
+    config: HiveConfig, chatlog: chat.ChatLog, _converse_func: Callable, message: str | None = None
+) -> tuple[str | list[str], chat.ChatLog]:
+    is_single = config.n_models == 1
+    start_time = time.monotonic()
+
+    for n_reflect in range(config.num_reflections + 1):
+        if n_reflect > 0:
+            if config.max_reasoning_seconds is not None:
+                elapsed = time.monotonic() - start_time
+                if elapsed >= config.max_reasoning_seconds:
+                    logger.info(
+                        f"Exiting early at round {n_reflect}/{config.num_reflections} "
+                        f"after {elapsed:.1f}s (limit: {config.max_reasoning_seconds}s)"
+                    )
+                    break
+        if n_reflect > 0:
+            if is_single:
+                reflect_msg = prompt.reflect + "\n"
+                if config.verifier:
+                    past_answer = chatlog.get_last_answer()
+                    reflect_msg += apply_verification(past_answer, config.verifier)  # type: ignore[arg-type]
+                if message:
+                    reflect_msg += f"\nAs a reminder, the original question is {message}"
+                chatlog.add_user_msg(reflect_msg, invoke_index=0)
+            else:
+                for index in range(config.n_models):
+                    recent_other_answers = chatlog.get_recent_other_answers(index)
+                    debate_msg = prompt.debate
+                    for recent_ans in recent_other_answers:
+                        answer_text = recent_ans["content"][0]["text"]
+                        debate_msg += f"\n\nOne agent response: ```{answer_text}```"
+                        if config.verifier:
+                            debate_msg += apply_verification(answer_text, config.verifier)
+                    debate_msg += f"\n\n {prompt.careful}\n"
+                    if message:
+                        debate_msg += f"\nAs a reminder, the original question is {message}"
+                    chatlog.add_user_msg(debate_msg, index)
+
+        if is_single:
+            modelid = config.bedrock_model_ids[0]
+            response = _converse_func(model_id=modelid, messages=chatlog.history[0].chat_history)
+            _record_response(chatlog, 0, modelid, response)
+        else:
+            responses = parallel_bedrock_exec(_converse_func, chathistory=chatlog.history)
+            for (index, modelid), response in responses.items():
+                _record_response(chatlog, index, modelid, response)
+
+    if config.aggregator_model_id:
+        chatlog = aggregate_last_responses(config, chatlog, _converse_func, message)
+
+    return chatlog.get_last_answer(), chatlog
+
+
+def _record_response(
+    chatlog: chat.ChatLog, index: int, modelid: str, response: chat.ConverseResponse
+):
+    chatlog.add_assistant_msg(response.answer, index)
+    if response.thinking:
+        chatlog.add_thinking_trace(response.thinking, index)
+    chatlog.update_stats(modelid, response)
+    chatlog.add_stop_reason(response.stopReason)
+    chatlog.add_trace(response.trace)
+
+
 def aggregate_last_responses(
     config: HiveConfig, chatlog: chat.ChatLog, _converse_func: Callable, message: str | None = None
 ) -> chat.ChatLog:
@@ -30,131 +96,12 @@ def aggregate_last_responses(
     logger.info(f"Aggregating a final response using {config.aggregator_model_id=}")
     response: chat.ConverseResponse = _converse_func(config.aggregator_model_id, [fmt_msg])
 
-    chatlog.add_assistant_msg(response.answer, 0)
-    if response.thinking:
-        chatlog.add_thinking_trace(response.thinking, 0)
-    chatlog.update_stats(config.aggregator_model_id, response)
-    chatlog.add_stop_reason(response.stopReason)
-    chatlog.add_trace(response.trace)
+    _record_response(chatlog, 0, config.aggregator_model_id, response)
 
     return chatlog
 
 
-def single_model_single_call(
-    config: HiveConfig, chatlog: chat.ChatLog, _converse_func: Callable
-) -> tuple[str, chat.ChatLog]:
-    modelid = config.bedrock_model_ids[0]
-    logger.info(f"Calling {modelid} with no self-reflection")
-    response: chat.ConverseResponse = _converse_func(
-        model_id=modelid, messages=chatlog.history[0].chat_history
-    )
-    chatlog.add_assistant_msg(response.answer, 0)
-    if response.thinking:
-        chatlog.add_thinking_trace(response.thinking, 0)
-    chatlog.update_stats(modelid, response)
-    chatlog.add_stop_reason(response.stopReason)
-    chatlog.add_trace(response.trace)
-
-    return response.answer, chatlog
-
-
-def multi_model_single_call(
-    config: HiveConfig, chatlog: chat.ChatLog, _converse_func: Callable, message: str | None = None
-) -> tuple[str | list[str], chat.ChatLog]:
-    logger.info(f"Calling {config.bedrock_model_ids} with no self-reflection")
-    responses: dict[tuple[int, str], chat.ConverseResponse] = parallel_bedrock_exec(
-        _converse_func, chathistory=chatlog.history
-    )
-    for (index, modelid), response in responses.items():
-        chatlog.add_assistant_msg(response.answer, index)
-        if response.thinking:
-            chatlog.add_thinking_trace(response.thinking, index)
-        chatlog.update_stats(modelid, response)
-        chatlog.add_stop_reason(response.stopReason)
-        chatlog.add_trace(response.trace)
-
-    if config.aggregator_model_id:
-        # aggregate an answer
-        chatlog = aggregate_last_responses(config, chatlog, _converse_func, message)
-    return chatlog.get_last_answer(), chatlog
-
-
-def single_model_multi_call(
-    config: HiveConfig, chatlog: chat.ChatLog, _converse_func: Callable, message: str | None = None
-) -> tuple[str, chat.ChatLog]:
-    modelid = config.bedrock_model_ids[0]
-    logger.info(f"Calling {modelid} with {config.num_reflections} rounds of self-reflection")
-    for n_reflect in range(config.num_reflections + 1):
-        if 0 < n_reflect:
-            reflect_msg = prompt.reflect + "\n"
-            if config.verifier:
-                past_answer = chatlog.get_last_answer()
-                assert isinstance(past_answer, str), (
-                    "Received multiple responds when doing a single model call"
-                )
-                reflect_msg += apply_verification(past_answer, config.verifier)
-            if message:
-                reflect_msg += f"\nAs a reminder, the original question is {message}"
-            chatlog.add_user_msg(reflect_msg, invoke_index=0)
-        response: chat.ConverseResponse = _converse_func(
-            model_id=modelid, messages=chatlog.history[0].chat_history
-        )
-        chatlog.add_assistant_msg(response.answer, invoke_index=0)
-        if response.thinking:
-            chatlog.add_thinking_trace(response.thinking, invoke_index=0)
-        chatlog.update_stats(modelid, response)
-        chatlog.add_stop_reason(response.stopReason)
-        chatlog.add_trace(response.trace)
-
-    return response.answer, chatlog
-
-
-def multi_model_multi_call(
-    config: HiveConfig, chatlog: chat.ChatLog, _converse_func: Callable, message: str | None = None
-) -> tuple[str | list[str], chat.ChatLog]:
-    logger.info(f"Calling {config.bedrock_model_ids} with {config.num_reflections} rounds")
-    for n_reflect in range(config.num_reflections + 1):
-        if 0 < n_reflect:
-            # consider others & debate
-            for index, model_log in enumerate(chatlog.history):
-                recent_other_answers = chatlog.get_recent_other_answers(index)
-                debate_msg = prompt.debate
-                for recent_ans in recent_other_answers:
-                    # NOTE could alternatively summarise messages
-                    answer_text = recent_ans["content"][0]["text"]
-                    debate_msg += f"\n\nOne agent response: ```{answer_text}```"
-                    if config.verifier:
-                        debate_msg += apply_verification(answer_text, config.verifier)
-                debate_msg += f"\n\n {prompt.careful}\n"
-                if message:
-                    debate_msg += f"\nAs a reminder, the original question is {message}"
-                logger.debug(f"Sending request to {model_log.modelid}:\n{debate_msg}")
-                chatlog.add_user_msg(debate_msg, index)
-
-        logger.info(
-            f"Fetching debate #{n_reflect + 1} answers from all {config.bedrock_model_ids=}"
-        )
-        responses: dict[tuple[int, str], chat.ConverseResponse] = parallel_bedrock_exec(
-            _converse_func, chathistory=chatlog.history
-        )
-        for (index, modelid), response in responses.items():
-            chatlog.add_assistant_msg(response.answer, index)
-            if response.thinking:
-                chatlog.add_thinking_trace(response.thinking, index)
-            chatlog.update_stats(modelid, response)
-            chatlog.add_stop_reason(response.stopReason)
-            chatlog.add_trace(response.trace)
-
-    if config.aggregator_model_id:
-        # aggregate an answer
-        chatlog = aggregate_last_responses(config, chatlog, _converse_func, message)
-    return chatlog.get_last_answer(), chatlog
-
-
 def apply_verification(past_answer: str, verifier: Callable[[str], str]) -> str:
-    # Applies a verification function to add more context
     verifier_context = verifier(past_answer)
     logger.debug(f"External verification function returned: {verifier_context}")
-    return (
-        f"\nAn external verification function has added context to this answer: {verifier_context}"
-    )
+    return f"An external verifier has added the following to this answer: {verifier_context}"
```
