Skip to content

Commit 448383c

Browse files
refactor: remove Responses API, consolidate to Chat Completions (#272)
* fix: strip reasoning from LLM responses and persist assistant messages reliably - Filter out `type: "reasoning"` output items from NEAR AI Responses API parsing so chain-of-thought never reaches the UI (nearai.rs) - Rewrite clean_response with regex-based tag stripping that is code-aware (preserves tags inside fenced blocks and inline backticks), supports 9+ tag names (think, thought, reasoning, reflection, etc.), handles <final> extraction, pipe-delimited tags, and case/whitespace tolerance (reasoning.rs) - Add Reasoning::complete() helper so all non-agentic LLM call sites (summarize, suggest, heartbeat, compaction) get automatic response cleaning; thread SafetyLayer through to those callers - Change persist_turn from fire-and-forget tokio::spawn to awaited async so both user and assistant messages are written before returning, preventing data loss on shutdown/restart - Pass input_count through seed_response_chain so response chaining delta calculation is accurate after thread hydration on restart - Make NearAiResponse.usage optional and preserve response_id in alt response path for chaining continuity - Persist session token to DB during onboarding wizard so runtime loads it without legacy-key fallback; suppress spurious warning on fresh installs - Fix dev tool double-registration when builder already registers them - Load dotenv/ironclaw env for doctor and status subcommands - Reduce startup log noise (demote info→debug for skills, remove redundant info lines) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * Nudge to not loop over tools continuously * refactor: remove Responses API, consolidate NEAR AI to Chat Completions only The Responses API provider (nearai.rs, 1278 lines) added significant complexity (response chaining state machine, delta message calculation, previous_response_id persistence) for marginal benefit. 
This consolidates to the Chat Completions API only, upgrading NearAiChatProvider with dual auth (session token + API key) and 401 retry for session token renewal. - Delete src/llm/nearai.rs (Responses API provider) - Upgrade nearai_chat.rs with SessionManager, dual auth, flexible list_models - Remove response_id from CompletionResponse and ToolCompletionResponse - Remove seed_response_chain/get_response_chain_id from LlmProvider trait - Remove response chain persistence from agent (thread_ops, session) - Remove NearAiApiMode enum and NEARAI_API_MODE config - Clean up all wrapper providers (retry, circuit_breaker, failover, cache) - Update documentation (CLAUDE.md, .env.example) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * feat: runtime log level control via gateway UI and URL parameter Add server-side log level switching using tracing_subscriber::reload::Layer so the EnvFilter can be swapped at runtime without restarting. Expose via GET/PUT /api/logs/level endpoints, a "Server: LEVEL" dropdown in the logs toolbar, and a ?log_level=debug URL parameter for one-click activation. Also applies cargo fmt to pre-existing files (llm/, tests/). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 7df356c commit 448383c

38 files changed

Lines changed: 1319 additions & 1773 deletions

.env.example

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -6,21 +6,19 @@ DATABASE_POOL_SIZE=10
66
# LLM_BACKEND=nearai # default
77
# Possible values: nearai, ollama, openai_compatible, openai, anthropic, tinfoil
88

9-
# === NEAR AI Chat (Responses API, session token auth) ===
10-
# Default mode. Uses browser OAuth (GitHub/Google) on first run.
11-
# Session token stored in ~/.ironclaw/session.json automatically.
12-
# For hosting providers: set NEARAI_SESSION_TOKEN env var directly.
9+
# === NEAR AI (Chat Completions API) ===
10+
# Two auth modes:
11+
# 1. Session token (default): Uses browser OAuth (GitHub/Google) on first run.
12+
# Session token stored in ~/.ironclaw/session.json automatically.
13+
# Base URL defaults to https://private.near.ai
14+
# 2. API key: Set NEARAI_API_KEY to use API key auth from cloud.near.ai.
15+
# Base URL defaults to https://cloud-api.near.ai
1316
NEARAI_MODEL=zai-org/GLM-5-FP8
1417
NEARAI_BASE_URL=https://private.near.ai
1518
NEARAI_AUTH_URL=https://private.near.ai
1619
# NEARAI_SESSION_TOKEN=sess_... # hosting providers: set this
1720
# NEARAI_SESSION_PATH=~/.ironclaw/session.json # optional, default shown
18-
19-
# === NEAR AI Cloud (Chat Completions API, API key auth) ===
20-
# Auto-selected when NEARAI_API_KEY is set. Get a key from cloud.near.ai.
21-
# NEARAI_API_KEY=...
22-
# NEARAI_BASE_URL=https://cloud-api.near.ai # default for cloud mode
23-
# NEARAI_API_MODE=chat_completions # auto-detected from API key
21+
# NEARAI_API_KEY=... # API key from cloud.near.ai
2422

2523
# Local LLM Providers (Ollama, LM Studio, vLLM, LiteLLM)
2624

CLAUDE.md

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -120,8 +120,7 @@ src/
120120
├── llm/ # LLM integration (multi-provider)
121121
│ ├── mod.rs # Provider factory, LlmBackend enum
122122
│ ├── provider.rs # LlmProvider trait, message types
123-
│ ├── nearai.rs # NEAR AI Responses API provider
124-
│ ├── nearai_chat.rs # NEAR AI Chat Completions fallback
123+
│ ├── nearai_chat.rs # NEAR AI Chat Completions provider (session token + API key auth)
125124
│ ├── reasoning.rs # Planning, tool selection, evaluation
126125
│ ├── session.rs # Session token management with auto-renewal
127126
│ ├── circuit_breaker.rs # Circuit breaker for provider failures
@@ -339,13 +338,12 @@ LIBSQL_PATH=~/.ironclaw/ironclaw.db # libSQL local path (default)
339338
# LIBSQL_AUTH_TOKEN=xxx # Required with LIBSQL_URL
340339

341340
# NEAR AI (when LLM_BACKEND=nearai, the default)
342-
# Two modes: "NEAR AI Chat" (session token) or "NEAR AI Cloud" (API key)
343-
# NEAR AI Chat (Responses API, default):
344-
NEARAI_SESSION_TOKEN=sess_... # session token for chat-api
341+
# Two auth modes: session token (default) or API key
342+
# Session token auth (default): uses browser OAuth on first run
343+
NEARAI_SESSION_TOKEN=sess_... # hosting providers: set this
345344
NEARAI_BASE_URL=https://private.near.ai
346-
# NEAR AI Cloud (Chat Completions API, auto-selected when API key is set):
345+
# API key auth: set NEARAI_API_KEY, base URL defaults to cloud-api.near.ai
347346
# NEARAI_API_KEY=... # API key from cloud.near.ai
348-
# NEARAI_BASE_URL=https://cloud-api.near.ai
349347
NEARAI_MODEL=claude-3-5-sonnet-20241022
350348

351349
# Agent settings
@@ -408,11 +406,9 @@ TINFOIL_MODEL=kimi-k2-5 # Default model
408406

409407
IronClaw supports multiple LLM backends via the `LLM_BACKEND` env var: `nearai` (default), `openai`, `anthropic`, `ollama`, `openai_compatible`, and `tinfoil`.
410408

411-
**NEAR AI Chat** -- Uses the NEAR AI Responses API (`https://private.near.ai/v1/responses`). Authenticates with session tokens (`sess_xxx`) obtained via browser OAuth (GitHub/Google). Supports response chaining (delta-only follow-up messages) for efficient multi-turn conversations. This is the default mode when no `NEARAI_API_KEY` is set. Set `NEARAI_SESSION_TOKEN` env var for hosting providers that inject tokens via environment. Configure with `NEARAI_BASE_URL` (default: `https://private.near.ai`).
409+
**NEAR AI** -- Uses the Chat Completions API with dual auth support. Session token auth (default): authenticates with session tokens (`sess_xxx`) obtained via browser OAuth (GitHub/Google), base URL defaults to `https://private.near.ai`. API key auth: set `NEARAI_API_KEY` (from `cloud.near.ai`), base URL defaults to `https://cloud-api.near.ai`. Both modes use the same Chat Completions endpoint. Tool messages are flattened to plain text for compatibility. Set `NEARAI_SESSION_TOKEN` env var for hosting providers that inject tokens via environment.
412410

413-
**NEAR AI Cloud** -- Uses the OpenAI-compatible Chat Completions API (`https://cloud-api.near.ai/v1/chat/completions`). Authenticates with API keys from `cloud.near.ai`. Auto-selected when `NEARAI_API_KEY` is set (or explicitly via `NEARAI_API_MODE=chat_completions`). Tool messages are flattened to plain text for compatibility. Configure with `NEARAI_API_KEY` and `NEARAI_BASE_URL` (default: `https://cloud-api.near.ai`).
414-
415-
**Tinfoil** -- Private inference via `https://inference.tinfoil.sh/v1`. Runs models inside hardware-attested TEEs so neither Tinfoil nor the cloud provider can see prompts or responses. Uses the OpenAI-compatible Chat Completions API only (not the Responses API, so tool calls are adapted to chat format). Configure with `TINFOIL_API_KEY` and `TINFOIL_MODEL` (default: `kimi-k2-5`).
411+
**Tinfoil** -- Private inference via `https://inference.tinfoil.sh/v1`. Runs models inside hardware-attested TEEs so neither Tinfoil nor the cloud provider can see prompts or responses. Uses the OpenAI-compatible Chat Completions API. Configure with `TINFOIL_API_KEY` and `TINFOIL_MODEL` (default: `kimi-k2-5`).
416412

417413
## Database
418414

benchmarks/src/instrumented_llm.rs

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -182,7 +182,6 @@ mod tests {
182182
input_tokens: 100,
183183
output_tokens: 50,
184184
finish_reason: FinishReason::Stop,
185-
response_id: None,
186185
})
187186
}
188187

@@ -196,7 +195,6 @@ mod tests {
196195
input_tokens: 200,
197196
output_tokens: 100,
198197
finish_reason: FinishReason::Stop,
199-
response_id: None,
200198
})
201199
}
202200
}

src/agent/agent_loop.rs

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -358,10 +358,6 @@ impl Agent {
358358
}
359359
});
360360

361-
tracing::info!(
362-
"Heartbeat enabled with {}s interval",
363-
hb_config.interval_secs
364-
);
365361
let hygiene = self
366362
.hygiene_config
367363
.as_ref()
@@ -373,6 +369,7 @@ impl Agent {
373369
hygiene,
374370
workspace.clone(),
375371
self.cheap_llm().clone(),
372+
self.safety().clone(),
376373
Some(notify_tx),
377374
))
378375
} else {

src/agent/commands.rs

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ use crate::agent::submission::SubmissionResult;
1313
use crate::agent::{Agent, MessageIntent};
1414
use crate::channels::{IncomingMessage, StatusUpdate};
1515
use crate::error::Error;
16-
use crate::llm::ChatMessage;
16+
use crate::llm::{ChatMessage, Reasoning};
1717

1818
impl Agent {
1919
/// Handle job-related intents without turn tracking.
@@ -235,6 +235,7 @@ impl Agent {
235235
crate::workspace::hygiene::HygieneConfig::default(),
236236
workspace.clone(),
237237
self.llm().clone(),
238+
self.safety().clone(),
238239
);
239240

240241
match runner.check_heartbeat().await {
@@ -295,10 +296,11 @@ impl Agent {
295296
.with_max_tokens(512)
296297
.with_temperature(0.3);
297298

298-
match self.llm().complete(request).await {
299-
Ok(response) => Ok(SubmissionResult::response(format!(
299+
let reasoning = Reasoning::new(self.llm().clone(), self.safety().clone());
300+
match reasoning.complete(request).await {
301+
Ok((text, _usage)) => Ok(SubmissionResult::response(format!(
300302
"Thread Summary:\n\n{}",
301-
response.content.trim()
303+
text.trim()
302304
))),
303305
Err(e) => Ok(SubmissionResult::error(format!("Summarize failed: {}", e))),
304306
}
@@ -342,10 +344,11 @@ impl Agent {
342344
.with_max_tokens(512)
343345
.with_temperature(0.5);
344346

345-
match self.llm().complete(request).await {
346-
Ok(response) => Ok(SubmissionResult::response(format!(
347+
let reasoning = Reasoning::new(self.llm().clone(), self.safety().clone());
348+
match reasoning.complete(request).await {
349+
Ok((text, _usage)) => Ok(SubmissionResult::response(format!(
347350
"Suggested Next Steps:\n\n{}",
348-
response.content.trim()
351+
text.trim()
349352
))),
350353
Err(e) => Ok(SubmissionResult::error(format!("Suggest failed: {}", e))),
351354
}

src/agent/compaction.rs

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,8 @@ use chrono::Utc;
1212
use crate::agent::context_monitor::{CompactionStrategy, ContextBreakdown};
1313
use crate::agent::session::Thread;
1414
use crate::error::Error;
15-
use crate::llm::{ChatMessage, CompletionRequest, LlmProvider};
15+
use crate::llm::{ChatMessage, CompletionRequest, LlmProvider, Reasoning};
16+
use crate::safety::SafetyLayer;
1617
use crate::workspace::Workspace;
1718

1819
/// Result of a compaction operation.
@@ -33,12 +34,13 @@ pub struct CompactionResult {
3334
/// Compacts conversation context to stay within limits.
3435
pub struct ContextCompactor {
3536
llm: Arc<dyn LlmProvider>,
37+
safety: Arc<SafetyLayer>,
3638
}
3739

3840
impl ContextCompactor {
3941
/// Create a new context compactor.
40-
pub fn new(llm: Arc<dyn LlmProvider>) -> Self {
41-
Self { llm }
42+
pub fn new(llm: Arc<dyn LlmProvider>, safety: Arc<SafetyLayer>) -> Self {
43+
Self { llm, safety }
4244
}
4345

4446
/// Compact a thread's context using the given strategy.
@@ -231,8 +233,9 @@ Be brief but capture all important details. Use bullet points."#,
231233
.with_max_tokens(1024)
232234
.with_temperature(0.3);
233235

234-
let response = self.llm.complete(request).await?;
235-
Ok(response.content)
236+
let reasoning = Reasoning::new(self.llm.clone(), self.safety.clone());
237+
let (text, _) = reasoning.complete(request).await?;
238+
Ok(text)
236239
}
237240

238241
/// Write a summary to the workspace daily log.

src/agent/dispatcher.rs

Lines changed: 32 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -109,10 +109,17 @@ impl Agent {
109109
let job_ctx = JobContext::with_user(&message.user_id, "chat", "Interactive chat session");
110110

111111
const MAX_TOOL_ITERATIONS: usize = 10;
112+
// Force a text-only response on the last iteration to guarantee termination
113+
// instead of hard-erroring. The penultimate iteration also gets a nudge
114+
// message so the LLM knows it should wrap up.
115+
const FORCE_TEXT_AT: usize = MAX_TOOL_ITERATIONS;
116+
const NUDGE_AT: usize = MAX_TOOL_ITERATIONS - 1;
112117
let mut iteration = 0;
113118
loop {
114119
iteration += 1;
115-
if iteration > MAX_TOOL_ITERATIONS {
120+
// Hard ceiling one past the forced-text iteration (should never be reached
121+
// since FORCE_TEXT_AT guarantees a text response, but kept as a safety net).
122+
if iteration > MAX_TOOL_ITERATIONS + 1 {
116123
return Err(crate::error::LlmError::InvalidResponse {
117124
provider: "agent".to_string(),
118125
reason: format!("Exceeded maximum tool iterations ({})", MAX_TOOL_ITERATIONS),
@@ -143,6 +150,19 @@ impl Agent {
143150
.into());
144151
}
145152

153+
// Inject a nudge message when approaching the iteration limit so the
154+
// LLM is aware it should produce a final answer on the next turn.
155+
if iteration == NUDGE_AT {
156+
context_messages.push(ChatMessage::system(
157+
"You are approaching the tool call limit. \
158+
Provide your best final answer on the next response \
159+
using the information you have gathered so far. \
160+
Do not call any more tools.",
161+
));
162+
}
163+
164+
let force_text = iteration >= FORCE_TEXT_AT;
165+
146166
// Refresh tool definitions each iteration so newly built tools become visible
147167
let tool_defs = self.tools().tool_definitions().await;
148168

@@ -162,15 +182,24 @@ impl Agent {
162182
tool_defs
163183
};
164184

165-
// Call LLM with current context
166-
let context = ReasoningContext::new()
185+
// Call LLM with current context; force_text drops tools to guarantee a
186+
// text response on the final iteration.
187+
let mut context = ReasoningContext::new()
167188
.with_messages(context_messages.clone())
168189
.with_tools(tool_defs)
169190
.with_metadata({
170191
let mut m = std::collections::HashMap::new();
171192
m.insert("thread_id".to_string(), thread_id.to_string());
172193
m
173194
});
195+
context.force_text = force_text;
196+
197+
if force_text {
198+
tracing::info!(
199+
iteration,
200+
"Forcing text-only response (iteration limit reached)"
201+
);
202+
}
174203

175204
let output = reasoning.respond_with_tools(&context).await?;
176205

@@ -799,7 +828,6 @@ mod tests {
799828
input_tokens: 0,
800829
output_tokens: 0,
801830
finish_reason: FinishReason::Stop,
802-
response_id: None,
803831
})
804832
}
805833

@@ -813,7 +841,6 @@ mod tests {
813841
input_tokens: 0,
814842
output_tokens: 0,
815843
finish_reason: FinishReason::Stop,
816-
response_id: None,
817844
})
818845
}
819846
}

src/agent/heartbeat.rs

Lines changed: 11 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,8 @@ use std::time::Duration;
2929
use tokio::sync::mpsc;
3030

3131
use crate::channels::OutgoingResponse;
32-
use crate::llm::{ChatMessage, CompletionRequest, FinishReason, LlmProvider};
32+
use crate::llm::{ChatMessage, CompletionRequest, LlmProvider, Reasoning};
33+
use crate::safety::SafetyLayer;
3334
use crate::workspace::Workspace;
3435
use crate::workspace::hygiene::HygieneConfig;
3536

@@ -100,6 +101,7 @@ pub struct HeartbeatRunner {
100101
hygiene_config: HygieneConfig,
101102
workspace: Arc<Workspace>,
102103
llm: Arc<dyn LlmProvider>,
104+
safety: Arc<SafetyLayer>,
103105
response_tx: Option<mpsc::Sender<OutgoingResponse>>,
104106
consecutive_failures: u32,
105107
}
@@ -111,12 +113,14 @@ impl HeartbeatRunner {
111113
hygiene_config: HygieneConfig,
112114
workspace: Arc<Workspace>,
113115
llm: Arc<dyn LlmProvider>,
116+
safety: Arc<SafetyLayer>,
114117
) -> Self {
115118
Self {
116119
config,
117120
hygiene_config,
118121
workspace,
119122
llm,
123+
safety,
120124
response_tx: None,
121125
consecutive_failures: 0,
122126
}
@@ -258,25 +262,18 @@ impl HeartbeatRunner {
258262
.with_max_tokens(max_tokens)
259263
.with_temperature(0.3);
260264

261-
let response = match self.llm.complete(request).await {
265+
let reasoning = Reasoning::new(self.llm.clone(), self.safety.clone());
266+
let (content, _usage) = match reasoning.complete(request).await {
262267
Ok(r) => r,
263268
Err(e) => return HeartbeatResult::Failed(format!("LLM call failed: {}", e)),
264269
};
265270

266-
let content = response.content.trim();
271+
let content = content.trim();
267272

268273
// Guard against empty content. Reasoning models (e.g. GLM-4.7) may
269274
// burn all output tokens on chain-of-thought and return content: null.
270275
if content.is_empty() {
271-
return if response.finish_reason == FinishReason::Length {
272-
HeartbeatResult::Failed(
273-
"LLM response was truncated (finish_reason=length) with no content. \
274-
The model may have exhausted its token budget on reasoning."
275-
.to_string(),
276-
)
277-
} else {
278-
HeartbeatResult::Failed("LLM returned empty content.".to_string())
279-
};
276+
return HeartbeatResult::Failed("LLM returned empty content.".to_string());
280277
}
281278

282279
// Check if nothing needs attention
@@ -355,9 +352,10 @@ pub fn spawn_heartbeat(
355352
hygiene_config: HygieneConfig,
356353
workspace: Arc<Workspace>,
357354
llm: Arc<dyn LlmProvider>,
355+
safety: Arc<SafetyLayer>,
358356
response_tx: Option<mpsc::Sender<OutgoingResponse>>,
359357
) -> tokio::task::JoinHandle<()> {
360-
let mut runner = HeartbeatRunner::new(config, hygiene_config, workspace, llm);
358+
let mut runner = HeartbeatRunner::new(config, hygiene_config, workspace, llm, safety);
361359
if let Some(tx) = response_tx {
362360
runner = runner.with_response_channel(tx);
363361
}

src/agent/session.rs

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -185,10 +185,6 @@ pub struct Thread {
185185
/// Pending auth token request (thread is in auth mode).
186186
#[serde(default)]
187187
pub pending_auth: Option<PendingAuth>,
188-
/// Last NEAR AI response ID for response chaining. Persisted to DB
189-
/// metadata so we can resume chaining across restarts.
190-
#[serde(default)]
191-
pub last_response_id: Option<String>,
192188
}
193189

194190
impl Thread {
@@ -205,7 +201,6 @@ impl Thread {
205201
metadata: serde_json::Value::Null,
206202
pending_approval: None,
207203
pending_auth: None,
208-
last_response_id: None,
209204
}
210205
}
211206

@@ -222,7 +217,6 @@ impl Thread {
222217
metadata: serde_json::Value::Null,
223218
pending_approval: None,
224219
pending_auth: None,
225-
last_response_id: None,
226220
}
227221
}
228222

@@ -863,7 +857,6 @@ mod tests {
863857

864858
thread.start_turn("hello");
865859
thread.complete_turn("world");
866-
thread.last_response_id = Some("resp_abc123".to_string());
867860

868861
let json = serde_json::to_string(&thread).unwrap();
869862
let restored: Thread = serde_json::from_str(&json).unwrap();
@@ -873,7 +866,6 @@ mod tests {
873866
assert_eq!(restored.turns.len(), 1);
874867
assert_eq!(restored.turns[0].user_input, "hello");
875868
assert_eq!(restored.turns[0].response, Some("world".to_string()));
876-
assert_eq!(restored.last_response_id, Some("resp_abc123".to_string()));
877869
}
878870

879871
#[test]

0 commit comments

Comments
 (0)