Fix budget tracking: use cumulative cost with handler deltas

le-big-mac · claude · le-big-mac · commit b5bfb111e00a · 2026-03-01T17:47:47.000Z
Replace _child_cost with a single _cumulative_cost running total.
Handler cost deltas are synced via _update_handler_cost in
_completion_turn before code execution, so subcalls see an accurate
remaining budget. Child costs are added directly in _subcall.

Budget tests exercise the real _completion_turn flow rather than
setting _cumulative_cost directly; the child-cost test simulates the
subcall's spend through an execute_code side effect.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/rlm/core/rlm.py b/rlm/core/rlm.py
@@ -152,6 +152,7 @@ def __init__(
 
         # Tracking (cumulative across all calls including children)
         self._cumulative_cost: float = 0.0
+        self._last_handler_cost: float = 0.0  # Last known handler cost, for computing deltas
         self._consecutive_errors: int = 0
         self._last_error: str | None = None
         self._best_partial_answer: str | None = None
@@ -487,11 +488,8 @@ def _check_iteration_limits(
                 ),
             )
 
-        # Check budget
+        # Check budget (handler cost already synced in _completion_turn)
         if self.max_budget is not None:
-            current_usage = lm_handler.get_usage_summary()
-            current_cost = current_usage.total_cost or 0.0
-            self._cumulative_cost = current_cost
             if self._cumulative_cost > self.max_budget:
                 self.verbose.print_budget_exceeded(self._cumulative_cost, self.max_budget)
                 raise BudgetExceededError(
@@ -582,6 +580,16 @@ def _compact_history(
         ]
         return new_history
 
+    def _update_handler_cost(self, lm_handler: LMHandler) -> None:
+        """Update _cumulative_cost with the handler's cost delta since last check."""
+        if self.max_budget is None:
+            return
+        current_usage = lm_handler.get_usage_summary()
+        handler_cost = current_usage.total_cost or 0.0
+        delta = handler_cost - self._last_handler_cost
+        self._cumulative_cost += delta
+        self._last_handler_cost = handler_cost
+
     def _completion_turn(
         self,
         prompt: str | dict[str, Any],
@@ -594,6 +602,11 @@ def _completion_turn(
         """
         iter_start = time.perf_counter()
         response = lm_handler.completion(prompt)
+
+        # Update cumulative cost with the handler's LLM call BEFORE code
+        # execution so that subcalls see accurate remaining budget.
+        self._update_handler_cost(lm_handler)
+
         code_block_strs = find_code_blocks(response)
         code_blocks = []
 
@@ -771,12 +784,13 @@ def _subcall(self, prompt: str, model: str | None = None) -> RLMChatCompletion:
         )
         try:
             result = child.completion(prompt, root_prompt=None)
-            # Track child's cost in parent's cumulative cost
+            # Add child's cost to cumulative total immediately so subsequent
+            # subcalls in the same code block see accurate remaining budget.
             if result.usage_summary and result.usage_summary.total_cost:
                 self._cumulative_cost += result.usage_summary.total_cost
             return result
         except BudgetExceededError as e:
-            # Propagate child's spending to parent
+            # Record whatever the child actually spent
             self._cumulative_cost += e.spent
             error_msg = f"Budget exceeded - {e}"
             return RLMChatCompletion(
diff --git a/tests/test_depth_metadata.py b/tests/test_depth_metadata.py
@@ -251,34 +251,100 @@ def test_error_count_resets_on_success(self):
         assert rlm._consecutive_errors == 0
 
     def test_budget_check_raises(self):
-        """_check_iteration_limits should raise BudgetExceededError when budget exceeded."""
-        from rlm.core.types import RLMIteration
+        """_completion_turn syncs handler cost; _check_iteration_limits detects overspend."""
+        from rlm.core.types import REPLResult
 
-        rlm = RLM(
+        rlm_inst = RLM(
             backend="openai",
             backend_kwargs={"model_name": "test"},
             max_budget=0.01,
         )
 
+        # Mock handler: completion returns no code blocks, handler spent $0.05
         mock_handler = Mock()
+        mock_handler.completion.return_value = "No code to run."
         mock_handler.get_usage_summary.return_value = UsageSummary(
             model_usage_summaries={
                 "test": ModelUsageSummary(
-                    total_calls=10,
-                    total_input_tokens=10000,
-                    total_output_tokens=10000,
-                    total_cost=0.05,
+                    total_calls=1, total_input_tokens=0, total_output_tokens=0, total_cost=0.05
                 )
             }
         )
 
-        iteration = RLMIteration(prompt="test", response="code", code_blocks=[])
+        mock_env = Mock()
+
+        # _completion_turn calls _update_handler_cost → adds $0.05 to _cumulative_cost
+        iteration = rlm_inst._completion_turn(
+            prompt=[{"role": "user", "content": "test"}],
+            lm_handler=mock_handler,
+            environment=mock_env,
+        )
+
+        assert rlm_inst._cumulative_cost == 0.05
 
         with pytest.raises(BudgetExceededError) as exc_info:
-            rlm._check_iteration_limits(iteration, 0, mock_handler)
-        assert exc_info.value.spent > 0.01
+            rlm_inst._check_iteration_limits(iteration, 0, mock_handler)
+        assert exc_info.value.spent == 0.05
         assert exc_info.value.budget == 0.01
 
+    def test_budget_includes_child_cost_after_iteration(self):
+        """Regression: _cumulative_cost must include both handler and child subcall costs.
+
+        Exercises the real flow: _completion_turn calls _update_handler_cost
+        (syncing handler cost delta into _cumulative_cost), then executes code
+        blocks where _subcall adds child cost to _cumulative_cost.
+        _check_iteration_limits should see the accumulated total.
+        """
+        from rlm.core.types import REPLResult
+
+        rlm_inst = RLM(
+            backend="openai",
+            backend_kwargs={"model_name": "test"},
+            max_budget=5.0,
+        )
+
+        # Mock handler: completion returns a response with a code block,
+        # handler spent $1.0
+        mock_handler = Mock()
+        mock_handler.completion.return_value = (
+            "Running subcall\n```repl\nrlm_query('hello')\n```"
+        )
+        mock_handler.get_usage_summary.return_value = UsageSummary(
+            model_usage_summaries={
+                "test": ModelUsageSummary(
+                    total_calls=1, total_input_tokens=0, total_output_tokens=0, total_cost=1.0
+                )
+            }
+        )
+
+        # Mock environment: execute_code simulates _subcall adding $9 child cost
+        mock_env = Mock()
+
+        def execute_with_child_cost(code_str):
+            rlm_inst._cumulative_cost += 9.0
+            return REPLResult(stdout="", stderr="", locals={})
+
+        mock_env.execute_code.side_effect = execute_with_child_cost
+
+        # _completion_turn:
+        # 1. lm_handler.completion() → response with code block
+        # 2. _update_handler_cost() → adds $1 handler delta
+        # 3. execute_code() → child adds $9 via side effect
+        iteration = rlm_inst._completion_turn(
+            prompt=[{"role": "user", "content": "test"}],
+            lm_handler=mock_handler,
+            environment=mock_env,
+        )
+
+        # Total: $1 (handler) + $9 (child) = $10 > $5 budget
+        assert rlm_inst._cumulative_cost == 10.0
+
+        with pytest.raises(BudgetExceededError) as exc_info:
+            rlm_inst._check_iteration_limits(iteration, 0, mock_handler)
+
+        assert exc_info.value.spent == 10.0
+        assert exc_info.value.budget == 5.0
+
     def test_token_limit_check_raises(self):
         """_check_iteration_limits should raise TokenLimitExceededError when tokens exceeded."""
         from rlm.core.types import RLMIteration