Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 16 additions & 1 deletion crates/ccpa-arena/src/bin/ccpa-arena-bench.rs
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,16 @@ struct Cli {
/// Only meaningful when `--compliance-enforced` is set.
#[arg(long, default_value_t = 3)]
max_consecutive_compliance_failures: u32,

/// M292 Agent-Text-Loop cap. Session terminates with
/// `ArenaOutcome::AgentTextLoop` when the agent emits text-only
/// turns (no `<tool_call>`) this many turns in a row. `0`
/// (default) disables the detector — preserves M287/M291 baseline.
/// Motivation: `V1_004` sub-bench B observed 20 consecutive text-only
/// turns on `Qwen3-Coder-30B`; bailing at e.g. 5 saves ~15 turns ×
/// ~72s/turn = ~18min/fixture × 20 fixtures ≈ 6hr of bench wall.
#[arg(long, default_value_t = 0)]
max_consecutive_text_turns: u32,
}

#[derive(Debug, Serialize)]
Expand Down Expand Up @@ -135,11 +145,16 @@ fn main() {
cli.wall_seconds,
cli.oracle_check_interval,
);
let mut session = if cli.compliance_enforced {
let session_with_compliance = if cli.compliance_enforced {
session_base.with_compliance(cli.max_consecutive_compliance_failures)
} else {
session_base
};
let mut session = if cli.max_consecutive_text_turns > 0 {
session_with_compliance.with_max_consecutive_text_turns(cli.max_consecutive_text_turns)
} else {
session_with_compliance
};
let outcome = session.run(&prompt, &oracle);

// Recovery observed: at least one turn's BashOutput had a non-zero
Expand Down
260 changes: 260 additions & 0 deletions crates/ccpa-arena/src/session.rs
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,23 @@ pub enum ArenaOutcome {
/// SHA triggered the trap (equals the configured cap).
consecutive_count: u32,
},
/// M292 (`V1_004` sub-bench B follow-up): the session was terminated
/// because the agent emitted text-only turns (no `<tool_call>`
/// invocation) for N turns in a row without making any file-system
/// or shell-command action. Distinct from `OracleFailedAfterMaxTurns`
/// because we stop EARLY when no tool calls are detected — the agent
/// is "talking but not acting." Empirically observed on `Qwen3-Coder-30B`
/// sub-bench B (M291), where the model emitted prose + Markdown code
/// blocks for all 20 turns without ever invoking `file_edit` /
/// `file_write`. Detector is opt-in (default cap=0 = disabled).
AgentTextLoop {
/// How many consecutive text-only turns triggered the trap
/// (equals the configured cap).
consecutive_text_turns: u32,
/// First 200 chars of the most recent text turn (for diagnostic;
/// reveals what the model was saying when it was stuck talking).
last_text_excerpt: String,
},
}

impl ArenaOutcome {
Expand Down Expand Up @@ -155,6 +172,12 @@ pub struct ArenaSession<D: ArenaDriver> {
/// terminates with [`ArenaOutcome::ComplianceTrap`] to save token
/// costs. Default: 3.
max_consecutive_compliance_failures: u32,
/// M292 Agent-Text-Loop cap. If the agent emits text-only turns (no
/// `tool_call`) for this many turns in a row, the session terminates
/// with [`ArenaOutcome::AgentTextLoop`]. `0` (default) disables the
/// detector (preserves M287/M291 baseline behavior). Configured via
/// [`Self::with_max_consecutive_text_turns`].
max_consecutive_text_turns: u32,
}

impl<D: ArenaDriver> ArenaSession<D> {
Expand Down Expand Up @@ -185,6 +208,7 @@ impl<D: ArenaDriver> ArenaSession<D> {
oracle_check_interval,
compliance_enforced: false,
max_consecutive_compliance_failures: 3,
max_consecutive_text_turns: 0,
}
}

Expand Down Expand Up @@ -217,6 +241,32 @@ impl<D: ArenaDriver> ArenaSession<D> {
self.max_consecutive_compliance_failures
}

/// Builder-style setter that enables the M292 Agent-Text-Loop detector.
///
/// Consumes `self`, stores `cap` as the session's consecutive-text-turn
/// limit, and returns the updated session. Each turn where the agent
/// emits text only (no `<tool_call>`) increments the consecutive-text
/// counter; any non-text invocation resets it. When the counter reaches
/// `cap`, the session terminates with [`ArenaOutcome::AgentTextLoop`].
/// `cap=0` disables the detector (the default — preserves baseline
/// behavior).
///
/// Motivated by `V1_004` sub-bench B (paiml/claude-code-parity-apr
/// M291) on `Qwen3-Coder-30B-A3B` where the model emitted text-only
/// turns for all 20 turns of `max_turns` without ever invoking a
/// tool — the existing `OracleFailedAfterMaxTurns` outcome
/// conflated "wrong answer after 20 turns of work" with "no work at
/// all for 20 turns." This detector separates the two.
#[must_use]
pub fn with_max_consecutive_text_turns(mut self, cap: u32) -> Self {
self.max_consecutive_text_turns = cap;
self
}

/// Returns the configured Agent-Text-Loop cap, in turns.
///
/// This is the value stored in the session's
/// `max_consecutive_text_turns` field. `0` means the detector is
/// disabled.
#[must_use]
pub fn max_consecutive_text_turns(&self) -> u32 {
self.max_consecutive_text_turns
}

/// Get the underlying driver. Test-only accessor for inspecting
/// driver state after a session.
#[must_use]
Expand Down Expand Up @@ -273,6 +323,7 @@ impl<D: ArenaDriver> ArenaSession<D> {
pub fn run(&mut self, prompt: &str, oracle: &OracleCmd) -> ArenaOutcome {
let start = Instant::now();
let mut compliance_trap: ComplianceTrapState = ComplianceTrapState::default();
let mut text_loop: AgentTextLoopState = AgentTextLoopState::default();
for turn in 1..=self.max_turns {
// (1) Wall-clock budget check.
if start.elapsed().as_secs() >= self.max_wall_seconds {
Expand Down Expand Up @@ -331,6 +382,24 @@ impl<D: ArenaDriver> ArenaSession<D> {
}
}

// (4c) M292 Agent-Text-Loop detection. Opt-in: disabled when
// `max_consecutive_text_turns == 0` (preserves M287/M291
// baseline). When enabled, counts consecutive text-only turns
// and bails with `AgentTextLoop` at the cap to save token
// costs on agents that talk-but-don't-act.
if self.max_consecutive_text_turns > 0 {
if let Some(trap_outcome) =
text_loop.observe(&invocation, self.max_consecutive_text_turns)
{
self.history.push(TurnRecord {
turn,
invocation,
result,
});
return trap_outcome;
}
}

// (5) Append to history.
self.history.push(TurnRecord {
turn,
Expand Down Expand Up @@ -427,6 +496,47 @@ impl ComplianceTrapState {
}
}

/// Rolling state for the M292 Agent-Text-Loop detector. Counts
/// consecutive text-only invocations; any non-Text invocation resets
/// the counter.
#[derive(Debug, Default)]
struct AgentTextLoopState {
/// Number of `ToolInvocation::Text` turns observed in a row since the
/// last non-text invocation. Incremented with saturation; reset to 0
/// by any other invocation variant.
consecutive: u32,
/// Excerpt of the most recent text turn (first 200 chars, with `…`
/// appended when the text was longer). Cleared whenever the counter
/// is reset.
last_text: String,
}

impl AgentTextLoopState {
    /// Feed one turn's [`ToolInvocation`] into the detector.
    ///
    /// * A `ToolInvocation::Text` turn increments the consecutive-text
    ///   counter (saturating) and records an excerpt of its content
    ///   (first 200 chars).
    /// * Any other invocation resets the counter to 0 and clears the
    ///   stored excerpt.
    ///
    /// Returns `Some(ArenaOutcome::AgentTextLoop { .. })` once the
    /// counter reaches `cap`, otherwise `None`.
    ///
    /// `cap == 0` means "detector disabled": the call returns `None`
    /// without touching the state. Callers are still expected to skip
    /// the call entirely in that case, but guarding here keeps a stray
    /// `cap = 0` from trapping on the very first text turn (since
    /// `consecutive >= 0` is always true).
    fn observe(&mut self, invocation: &ToolInvocation, cap: u32) -> Option<ArenaOutcome> {
        if cap == 0 {
            return None;
        }
        let ToolInvocation::Text { content } = invocation else {
            // The agent acted: the text streak is broken.
            self.consecutive = 0;
            self.last_text.clear();
            return None;
        };
        self.consecutive = self.consecutive.saturating_add(1);
        self.last_text = excerpt(content, 200);
        if self.consecutive >= cap {
            Some(ArenaOutcome::AgentTextLoop {
                consecutive_text_turns: self.consecutive,
                last_text_excerpt: self.last_text.clone(),
            })
        } else {
            None
        }
    }
}

/// Return the first `max` chars of `s`, appending `…` when `s` was longer.
///
/// If `s` has at most `max` chars it is returned unchanged. Otherwise the
/// result is the first `max` chars followed by a single `…`, so it can
/// be up to `max + 1` chars long. The cut always falls on a char
/// boundary.
///
/// Only the first `max + 1` chars are inspected, so the cost does not
/// grow with the length of a very long `s`.
fn excerpt(s: &str, max: usize) -> String {
    // `nth(max)` yields the char at index `max` only when the string has
    // more than `max` chars; its byte offset is exactly where to cut.
    match s.char_indices().nth(max) {
        None => s.to_owned(),
        Some((cut, _)) => {
            let mut out = String::with_capacity(cut + '…'.len_utf8());
            out.push_str(&s[..cut]);
            out.push('…');
            out
        }
    }
}

#[cfg(test)]
#[allow(
clippy::expect_used,
Expand Down Expand Up @@ -985,4 +1095,154 @@ mod tests {
ArenaOutcome::OracleFailedAfterMaxTurns { turns: 2, .. }
));
}

// ====================================================================
// M292 Agent-Text-Loop detector
// ====================================================================

#[test]
fn agent_text_loop_state_increments_on_text() {
    // Replaying the same text turn: the first two stay under cap=3 and
    // only bump the counter; the third one trips the trap.
    let mut state = AgentTextLoopState::default();
    let text_turn = ToolInvocation::Text {
        content: String::from("hello"),
    };
    for expected_count in 1..=2_u32 {
        assert!(state.observe(&text_turn, 3).is_none());
        assert_eq!(state.consecutive, expected_count);
    }
    let outcome = state.observe(&text_turn, 3);
    let trapped_at_three = matches!(
        outcome,
        Some(ArenaOutcome::AgentTextLoop {
            consecutive_text_turns: 3,
            ..
        })
    );
    assert!(
        trapped_at_three,
        "expected AgentTextLoop at cap=3, got {outcome:?}"
    );
}

#[test]
fn agent_text_loop_state_resets_on_non_text() {
    let mut state = AgentTextLoopState::default();
    let chatter = ToolInvocation::Text {
        content: String::from("talk"),
    };
    let action = ToolInvocation::Bash {
        command: String::from("echo"),
    };
    // Two text turns in a row: one short of cap=3, so no trap yet.
    for _ in 0..2 {
        assert!(state.observe(&chatter, 3).is_none());
    }
    // A tool invocation wipes both the counter and the stored excerpt.
    assert!(state.observe(&action, 3).is_none());
    assert_eq!(state.consecutive, 0);
    assert!(state.last_text.is_empty());
    // The streak starts over from 1 on the next text turn.
    assert!(state.observe(&chatter, 3).is_none());
    assert_eq!(state.consecutive, 1);
}

#[test]
fn agent_text_loop_state_excerpt_truncates_long_text() {
    // A 500-char text turn must be stored as exactly the first 200 chars
    // plus a trailing ellipsis. cap=100 keeps the trap from firing so
    // only the stored excerpt is under test.
    let mut s = AgentTextLoopState::default();
    let inv = ToolInvocation::Text {
        content: "x".repeat(500),
    };
    assert!(s.observe(&inv, 100).is_none());
    // Exact length: 200 kept chars + 1 ellipsis. A looser `<=` bound
    // would also accept an excerpt that dropped most of the text.
    assert_eq!(
        s.last_text.chars().count(),
        201,
        "excerpt was: {}",
        s.last_text
    );
    assert!(
        s.last_text.ends_with('…'),
        "expected ellipsis, got: {}",
        s.last_text
    );
}

#[test]
fn run_agent_text_loop_disabled_by_default_preserves_baseline() {
    // The session is built without `with_max_consecutive_text_turns`, so
    // the detector stays off and three text-only turns simply exhaust
    // `max_turns` — the M287/M291 baseline outcome.
    let workdir = tempfile::tempdir().expect("tempdir");
    let script = vec![mk_text_turn("just chatter"); 3];
    let mut session = ArenaSession::new(
        MockDriver::new("test", script),
        workdir.path().to_path_buf(),
        3,
        900,
        0,
    );
    let oracle = OracleCmd::new("false", "PASS");
    let outcome = session.run("text-only", &oracle);
    let hit_max_turns = matches!(
        outcome,
        ArenaOutcome::OracleFailedAfterMaxTurns { turns: 3, .. }
    );
    assert!(hit_max_turns);
}

#[test]
fn run_agent_text_loop_fires_at_cap_when_enabled() {
    // Five scripted text-only turns with cap=3: the session must bail
    // with AgentTextLoop on turn 3 and never run the remaining two.
    let workdir = tempfile::tempdir().expect("tempdir");
    let script = vec![mk_text_turn("turn 1"); 5];
    let mut session = ArenaSession::new(
        MockDriver::new("test", script),
        workdir.path().to_path_buf(),
        5,
        900,
        0,
    )
    .with_max_consecutive_text_turns(3);
    let oracle = OracleCmd::new("false", "PASS");
    let outcome = session.run("text-only", &oracle);
    assert!(
        matches!(
            outcome,
            ArenaOutcome::AgentTextLoop {
                consecutive_text_turns: 3,
                ..
            }
        ),
        "expected AgentTextLoop(3), got {outcome:?}"
    );
    // The trapping turn itself is recorded, so history holds 3 entries.
    assert_eq!(session.history().len(), 3);
}

#[test]
fn run_agent_text_loop_resets_counter_on_tool_use() {
    // Script: text, text, bash, text, text, bash. With cap=3 the streak
    // never exceeds two because each bash turn resets it, so the trap
    // must not fire and the session runs out its six turns.
    let script = vec![
        mk_text_turn("talk 1"),
        mk_text_turn("talk 2"),
        mk_bash_turn("echo act"),
        mk_text_turn("talk 3"),
        mk_text_turn("talk 4"),
        mk_bash_turn("echo PASS"),
    ];
    let workdir = tempfile::tempdir().expect("tempdir");
    // oracle_check_interval=0 so the bash echo doesn't trigger an
    // EndTurn oracle pass.
    let mut session = ArenaSession::new(
        MockDriver::new("test", script),
        workdir.path().to_path_buf(),
        6,
        900,
        0,
    )
    .with_max_consecutive_text_turns(3);
    let oracle = OracleCmd::new("false", "PASS");
    let outcome = session.run("intermittent", &oracle);
    // Expect the max-turns outcome, not AgentTextLoop.
    let ran_to_max_turns = matches!(
        outcome,
        ArenaOutcome::OracleFailedAfterMaxTurns { turns: 6, .. }
    );
    assert!(
        ran_to_max_turns,
        "expected OracleFailedAfterMaxTurns(6), got {outcome:?}"
    );
}

#[test]
fn with_max_consecutive_text_turns_accessor_returns_configured_cap() {
    // The value passed to the builder must come back out of the getter.
    let configured_cap = mk_session_default()
        .with_max_consecutive_text_turns(5)
        .max_consecutive_text_turns();
    assert_eq!(configured_cap, 5);
}

#[test]
fn max_consecutive_text_turns_default_is_zero_disabled() {
    // A freshly built session has the detector off (cap == 0).
    let default_cap = mk_session_default().max_consecutive_text_turns();
    assert_eq!(default_cap, 0);
}
}
Loading
Loading