Skip to content

Commit b5bfb11

Browse files
le-big-mac and claude committed
Fix budget tracking: use cumulative cost with handler deltas
Replace _child_cost with a single _cumulative_cost running total.

Handler cost deltas are synced via _update_handler_cost in _completion_turn before code execution, so subcalls see an accurate remaining budget. Child costs are added directly in _subcall. Budget tests exercise the real _completion_turn flow rather than setting _cumulative_cost directly.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 413c373 commit b5bfb11

File tree

2 files changed

+96
-16
lines changed

2 files changed

+96
-16
lines changed

rlm/core/rlm.py

Lines changed: 20 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,7 @@ def __init__(
152152

153153
# Tracking (cumulative across all calls including children)
154154
self._cumulative_cost: float = 0.0
155+
self._last_handler_cost: float = 0.0 # Last known handler cost, for computing deltas
155156
self._consecutive_errors: int = 0
156157
self._last_error: str | None = None
157158
self._best_partial_answer: str | None = None
@@ -487,11 +488,8 @@ def _check_iteration_limits(
487488
),
488489
)
489490

490-
# Check budget
491+
# Check budget (handler cost already synced in _completion_turn)
491492
if self.max_budget is not None:
492-
current_usage = lm_handler.get_usage_summary()
493-
current_cost = current_usage.total_cost or 0.0
494-
self._cumulative_cost = current_cost
495493
if self._cumulative_cost > self.max_budget:
496494
self.verbose.print_budget_exceeded(self._cumulative_cost, self.max_budget)
497495
raise BudgetExceededError(
@@ -582,6 +580,16 @@ def _compact_history(
582580
]
583581
return new_history
584582

583+
def _update_handler_cost(self, lm_handler: LMHandler) -> None:
584+
"""Update _cumulative_cost with the handler's cost delta since last check."""
585+
if self.max_budget is None:
586+
return
587+
current_usage = lm_handler.get_usage_summary()
588+
handler_cost = current_usage.total_cost or 0.0
589+
delta = handler_cost - self._last_handler_cost
590+
self._cumulative_cost += delta
591+
self._last_handler_cost = handler_cost
592+
585593
def _completion_turn(
586594
self,
587595
prompt: str | dict[str, Any],
@@ -594,6 +602,11 @@ def _completion_turn(
594602
"""
595603
iter_start = time.perf_counter()
596604
response = lm_handler.completion(prompt)
605+
606+
# Update cumulative cost with the handler's LLM call BEFORE code
607+
# execution so that subcalls see accurate remaining budget.
608+
self._update_handler_cost(lm_handler)
609+
597610
code_block_strs = find_code_blocks(response)
598611
code_blocks = []
599612

@@ -771,12 +784,13 @@ def _subcall(self, prompt: str, model: str | None = None) -> RLMChatCompletion:
771784
)
772785
try:
773786
result = child.completion(prompt, root_prompt=None)
774-
# Track child's cost in parent's cumulative cost
787+
# Add child's cost to cumulative total immediately so subsequent
788+
# subcalls in the same code block see accurate remaining budget.
775789
if result.usage_summary and result.usage_summary.total_cost:
776790
self._cumulative_cost += result.usage_summary.total_cost
777791
return result
778792
except BudgetExceededError as e:
779-
# Propagate child's spending to parent
793+
# Record whatever the child actually spent
780794
self._cumulative_cost += e.spent
781795
error_msg = f"Budget exceeded - {e}"
782796
return RLMChatCompletion(

tests/test_depth_metadata.py

Lines changed: 76 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -251,34 +251,100 @@ def test_error_count_resets_on_success(self):
251251
assert rlm._consecutive_errors == 0
252252

253253
def test_budget_check_raises(self):
254-
"""_check_iteration_limits should raise BudgetExceededError when budget exceeded."""
255-
from rlm.core.types import RLMIteration
254+
"""_completion_turn syncs handler cost; _check_iteration_limits detects overspend."""
255+
from rlm.core.types import REPLResult
256256

257-
rlm = RLM(
257+
rlm_inst = RLM(
258258
backend="openai",
259259
backend_kwargs={"model_name": "test"},
260260
max_budget=0.01,
261261
)
262262

263+
# Mock handler: completion returns no code blocks, handler spent $0.05
263264
mock_handler = Mock()
265+
mock_handler.completion.return_value = "No code to run."
264266
mock_handler.get_usage_summary.return_value = UsageSummary(
265267
model_usage_summaries={
266268
"test": ModelUsageSummary(
267-
total_calls=10,
268-
total_input_tokens=10000,
269-
total_output_tokens=10000,
270-
total_cost=0.05,
269+
total_calls=1, total_input_tokens=0, total_output_tokens=0, total_cost=0.05
271270
)
272271
}
273272
)
274273

275-
iteration = RLMIteration(prompt="test", response="code", code_blocks=[])
274+
mock_env = Mock()
275+
276+
# _completion_turn calls _update_handler_cost → adds $0.05 to _cumulative_cost
277+
iteration = rlm_inst._completion_turn(
278+
prompt=[{"role": "user", "content": "test"}],
279+
lm_handler=mock_handler,
280+
environment=mock_env,
281+
)
282+
283+
assert rlm_inst._cumulative_cost == 0.05
276284

277285
with pytest.raises(BudgetExceededError) as exc_info:
278-
rlm._check_iteration_limits(iteration, 0, mock_handler)
279-
assert exc_info.value.spent > 0.01
286+
rlm_inst._check_iteration_limits(iteration, 0, mock_handler)
287+
assert exc_info.value.spent == 0.05
280288
assert exc_info.value.budget == 0.01
281289

290+
def test_budget_includes_child_cost_after_iteration(self):
291+
"""Regression: _cumulative_cost must include both handler and child subcall costs.
292+
293+
Exercises the real flow: _completion_turn calls _update_handler_cost
294+
(syncing handler cost delta into _cumulative_cost), then executes code
295+
blocks where _subcall adds child cost to _cumulative_cost.
296+
_check_iteration_limits should see the accumulated total.
297+
"""
298+
from rlm.core.types import REPLResult
299+
300+
rlm_inst = RLM(
301+
backend="openai",
302+
backend_kwargs={"model_name": "test"},
303+
max_budget=5.0,
304+
)
305+
306+
# Mock handler: completion returns a response with a code block,
307+
# handler spent $1.0
308+
mock_handler = Mock()
309+
mock_handler.completion.return_value = (
310+
"Running subcall\n```repl\nrlm_query('hello')\n```"
311+
)
312+
mock_handler.get_usage_summary.return_value = UsageSummary(
313+
model_usage_summaries={
314+
"test": ModelUsageSummary(
315+
total_calls=1, total_input_tokens=0, total_output_tokens=0, total_cost=1.0
316+
)
317+
}
318+
)
319+
320+
# Mock environment: execute_code simulates _subcall adding $9 child cost
321+
mock_env = Mock()
322+
323+
def execute_with_child_cost(code_str):
324+
rlm_inst._cumulative_cost += 9.0
325+
return REPLResult(stdout="", stderr="", locals={})
326+
327+
mock_env.execute_code.side_effect = execute_with_child_cost
328+
329+
# _completion_turn:
330+
# 1. lm_handler.completion() → response with code block
331+
# 2. _update_handler_cost() → adds $1 handler delta
332+
# 3. execute_code() → child adds $9 via side effect
333+
iteration = rlm_inst._completion_turn(
334+
prompt=[{"role": "user", "content": "test"}],
335+
lm_handler=mock_handler,
336+
environment=mock_env,
337+
)
338+
339+
# Total: $1 (handler) + $9 (child) = $10 > $5 budget
340+
assert rlm_inst._cumulative_cost == 10.0
341+
342+
with pytest.raises(BudgetExceededError) as exc_info:
343+
rlm_inst._check_iteration_limits(iteration, 0, mock_handler)
344+
345+
assert exc_info.value.spent == 10.0
346+
assert exc_info.value.budget == 5.0
347+
282348
def test_token_limit_check_raises(self):
283349
"""_check_iteration_limits should raise TokenLimitExceededError when tokens exceeded."""
284350
from rlm.core.types import RLMIteration

0 commit comments

Comments
 (0)