Skip to content

Commit 448383c

Browse files
refactor: remove Responses API, consolidate to Chat Completions (#272)
* fix: strip reasoning from LLM responses and persist assistant messages reliably - Filter out `type: "reasoning"` output items from NEAR AI Responses API parsing so chain-of-thought never reaches the UI (nearai.rs) - Rewrite clean_response with regex-based tag stripping that is code-aware (preserves tags inside fenced blocks and inline backticks), supports 9+ tag names (think, thought, reasoning, reflection, etc.), handles <final> extraction, pipe-delimited tags, and case/whitespace tolerance (reasoning.rs) - Add Reasoning::complete() helper so all non-agentic LLM call sites (summarize, suggest, heartbeat, compaction) get automatic response cleaning; thread SafetyLayer through to those callers - Change persist_turn from fire-and-forget tokio::spawn to awaited async so both user and assistant messages are written before returning, preventing data loss on shutdown/restart - Pass input_count through seed_response_chain so response chaining delta calculation is accurate after thread hydration on restart - Make NearAiResponse.usage optional and preserve response_id in alt response path for chaining continuity - Persist session token to DB during onboarding wizard so runtime loads it without legacy-key fallback; suppress spurious warning on fresh installs - Fix dev tool double-registration when builder already registers them - Load dotenv/ironclaw env for doctor and status subcommands - Reduce startup log noise (demote info→debug for skills, remove redundant info lines) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * Nudge to not loop over tools continuously * refactor: remove Responses API, consolidate NEAR AI to Chat Completions only The Responses API provider (nearai.rs, 1278 lines) added significant complexity (response chaining state machine, delta message calculation, previous_response_id persistence) for marginal benefit. 
This consolidates to the Chat Completions API only, upgrading NearAiChatProvider with dual auth (session token + API key) and 401 retry for session token renewal. - Delete src/llm/nearai.rs (Responses API provider) - Upgrade nearai_chat.rs with SessionManager, dual auth, flexible list_models - Remove response_id from CompletionResponse and ToolCompletionResponse - Remove seed_response_chain/get_response_chain_id from LlmProvider trait - Remove response chain persistence from agent (thread_ops, session) - Remove NearAiApiMode enum and NEARAI_API_MODE config - Clean up all wrapper providers (retry, circuit_breaker, failover, cache) - Update documentation (CLAUDE.md, .env.example) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * feat: runtime log level control via gateway UI and URL parameter Add server-side log level switching using tracing_subscriber::reload::Layer so the EnvFilter can be swapped at runtime without restarting. Expose via GET/PUT /api/logs/level endpoints, a "Server: LEVEL" dropdown in the logs toolbar, and a ?log_level=debug URL parameter for one-click activation. Also applies cargo fmt to pre-existing files (llm/, tests/). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 7df356c commit 448383c

38 files changed

Lines changed: 1319 additions & 1773 deletions

.env.example

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -6,21 +6,19 @@ DATABASE_POOL_SIZE=10
66
# LLM_BACKEND=nearai # default
77
# Possible values: nearai, ollama, openai_compatible, openai, anthropic, tinfoil
88

9-
# === NEAR AI Chat (Responses API, session token auth) ===
10-
# Default mode. Uses browser OAuth (GitHub/Google) on first run.
11-
# Session token stored in ~/.ironclaw/session.json automatically.
12-
# For hosting providers: set NEARAI_SESSION_TOKEN env var directly.
9+
# === NEAR AI (Chat Completions API) ===
10+
# Two auth modes:
11+
# 1. Session token (default): Uses browser OAuth (GitHub/Google) on first run.
12+
# Session token stored in ~/.ironclaw/session.json automatically.
13+
# Base URL defaults to https://private.near.ai
14+
# 2. API key: Set NEARAI_API_KEY to use API key auth from cloud.near.ai.
15+
# Base URL defaults to https://cloud-api.near.ai
1316
NEARAI_MODEL=zai-org/GLM-5-FP8
1417
NEARAI_BASE_URL=https://private.near.ai
1518
NEARAI_AUTH_URL=https://private.near.ai
1619
# NEARAI_SESSION_TOKEN=sess_... # hosting providers: set this
1720
# NEARAI_SESSION_PATH=~/.ironclaw/session.json # optional, default shown
18-
19-
# === NEAR AI Cloud (Chat Completions API, API key auth) ===
20-
# Auto-selected when NEARAI_API_KEY is set. Get a key from cloud.near.ai.
21-
# NEARAI_API_KEY=...
22-
# NEARAI_BASE_URL=https://cloud-api.near.ai # default for cloud mode
23-
# NEARAI_API_MODE=chat_completions # auto-detected from API key
21+
# NEARAI_API_KEY=... # API key from cloud.near.ai
2422

2523
# Local LLM Providers (Ollama, LM Studio, vLLM, LiteLLM)
2624

CLAUDE.md

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -120,8 +120,7 @@ src/
120120
├── llm/ # LLM integration (multi-provider)
121121
│ ├── mod.rs # Provider factory, LlmBackend enum
122122
│ ├── provider.rs # LlmProvider trait, message types
123-
│ ├── nearai.rs # NEAR AI Responses API provider
124-
│ ├── nearai_chat.rs # NEAR AI Chat Completions fallback
123+
│ ├── nearai_chat.rs # NEAR AI Chat Completions provider (session token + API key auth)
125124
│ ├── reasoning.rs # Planning, tool selection, evaluation
126125
│ ├── session.rs # Session token management with auto-renewal
127126
│ ├── circuit_breaker.rs # Circuit breaker for provider failures
@@ -339,13 +338,12 @@ LIBSQL_PATH=~/.ironclaw/ironclaw.db # libSQL local path (default)
339338
# LIBSQL_AUTH_TOKEN=xxx # Required with LIBSQL_URL
340339

341340
# NEAR AI (when LLM_BACKEND=nearai, the default)
342-
# Two modes: "NEAR AI Chat" (session token) or "NEAR AI Cloud" (API key)
343-
# NEAR AI Chat (Responses API, default):
344-
NEARAI_SESSION_TOKEN=sess_... # session token for chat-api
341+
# Two auth modes: session token (default) or API key
342+
# Session token auth (default): uses browser OAuth on first run
343+
NEARAI_SESSION_TOKEN=sess_... # hosting providers: set this
345344
NEARAI_BASE_URL=https://private.near.ai
346-
# NEAR AI Cloud (Chat Completions API, auto-selected when API key is set):
345+
# API key auth: set NEARAI_API_KEY, base URL defaults to cloud-api.near.ai
347346
# NEARAI_API_KEY=... # API key from cloud.near.ai
348-
# NEARAI_BASE_URL=https://cloud-api.near.ai
349347
NEARAI_MODEL=claude-3-5-sonnet-20241022
350348

351349
# Agent settings
@@ -408,11 +406,9 @@ TINFOIL_MODEL=kimi-k2-5 # Default model
408406

409407
IronClaw supports multiple LLM backends via the `LLM_BACKEND` env var: `nearai` (default), `openai`, `anthropic`, `ollama`, `openai_compatible`, and `tinfoil`.
410408

411-
**NEAR AI Chat** -- Uses the NEAR AI Responses API (`https://private.near.ai/v1/responses`). Authenticates with session tokens (`sess_xxx`) obtained via browser OAuth (GitHub/Google). Supports response chaining (delta-only follow-up messages) for efficient multi-turn conversations. This is the default mode when no `NEARAI_API_KEY` is set. Set `NEARAI_SESSION_TOKEN` env var for hosting providers that inject tokens via environment. Configure with `NEARAI_BASE_URL` (default: `https://private.near.ai`).
409+
**NEAR AI** -- Uses the Chat Completions API with dual auth support. Session token auth (default): authenticates with session tokens (`sess_xxx`) obtained via browser OAuth (GitHub/Google), base URL defaults to `https://private.near.ai`. API key auth: set `NEARAI_API_KEY` (from `cloud.near.ai`), base URL defaults to `https://cloud-api.near.ai`. Both modes use the same Chat Completions endpoint. Tool messages are flattened to plain text for compatibility. Set `NEARAI_SESSION_TOKEN` env var for hosting providers that inject tokens via environment.
412410

413-
**NEAR AI Cloud** -- Uses the OpenAI-compatible Chat Completions API (`https://cloud-api.near.ai/v1/chat/completions`). Authenticates with API keys from `cloud.near.ai`. Auto-selected when `NEARAI_API_KEY` is set (or explicitly via `NEARAI_API_MODE=chat_completions`). Tool messages are flattened to plain text for compatibility. Configure with `NEARAI_API_KEY` and `NEARAI_BASE_URL` (default: `https://cloud-api.near.ai`).
414-
415-
**Tinfoil** -- Private inference via `https://inference.tinfoil.sh/v1`. Runs models inside hardware-attested TEEs so neither Tinfoil nor the cloud provider can see prompts or responses. Uses the OpenAI-compatible Chat Completions API only (not the Responses API, so tool calls are adapted to chat format). Configure with `TINFOIL_API_KEY` and `TINFOIL_MODEL` (default: `kimi-k2-5`).
411+
**Tinfoil** -- Private inference via `https://inference.tinfoil.sh/v1`. Runs models inside hardware-attested TEEs so neither Tinfoil nor the cloud provider can see prompts or responses. Uses the OpenAI-compatible Chat Completions API. Configure with `TINFOIL_API_KEY` and `TINFOIL_MODEL` (default: `kimi-k2-5`).
416412

417413
## Database
418414

benchmarks/src/instrumented_llm.rs

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -182,7 +182,6 @@ mod tests {
182182
input_tokens: 100,
183183
output_tokens: 50,
184184
finish_reason: FinishReason::Stop,
185-
response_id: None,
186185
})
187186
}
188187

@@ -196,7 +195,6 @@ mod tests {
196195
input_tokens: 200,
197196
output_tokens: 100,
198197
finish_reason: FinishReason::Stop,
199-
response_id: None,
200198
})
201199
}
202200
}

src/agent/agent_loop.rs

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -358,10 +358,6 @@ impl Agent {
358358
}
359359
});
360360

361-
tracing::info!(
362-
"Heartbeat enabled with {}s interval",
363-
hb_config.interval_secs
364-
);
365361
let hygiene = self
366362
.hygiene_config
367363
.as_ref()
@@ -373,6 +369,7 @@ impl Agent {
373369
hygiene,
374370
workspace.clone(),
375371
self.cheap_llm().clone(),
372+
self.safety().clone(),
376373
Some(notify_tx),
377374
))
378375
} else {

src/agent/commands.rs

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ use crate::agent::submission::SubmissionResult;
1313
use crate::agent::{Agent, MessageIntent};
1414
use crate::channels::{IncomingMessage, StatusUpdate};
1515
use crate::error::Error;
16-
use crate::llm::ChatMessage;
16+
use crate::llm::{ChatMessage, Reasoning};
1717

1818
impl Agent {
1919
/// Handle job-related intents without turn tracking.
@@ -235,6 +235,7 @@ impl Agent {
235235
crate::workspace::hygiene::HygieneConfig::default(),
236236
workspace.clone(),
237237
self.llm().clone(),
238+
self.safety().clone(),
238239
);
239240

240241
match runner.check_heartbeat().await {
@@ -295,10 +296,11 @@ impl Agent {
295296
.with_max_tokens(512)
296297
.with_temperature(0.3);
297298

298-
match self.llm().complete(request).await {
299-
Ok(response) => Ok(SubmissionResult::response(format!(
299+
let reasoning = Reasoning::new(self.llm().clone(), self.safety().clone());
300+
match reasoning.complete(request).await {
301+
Ok((text, _usage)) => Ok(SubmissionResult::response(format!(
300302
"Thread Summary:\n\n{}",
301-
response.content.trim()
303+
text.trim()
302304
))),
303305
Err(e) => Ok(SubmissionResult::error(format!("Summarize failed: {}", e))),
304306
}
@@ -342,10 +344,11 @@ impl Agent {
342344
.with_max_tokens(512)
343345
.with_temperature(0.5);
344346

345-
match self.llm().complete(request).await {
346-
Ok(response) => Ok(SubmissionResult::response(format!(
347+
let reasoning = Reasoning::new(self.llm().clone(), self.safety().clone());
348+
match reasoning.complete(request).await {
349+
Ok((text, _usage)) => Ok(SubmissionResult::response(format!(
347350
"Suggested Next Steps:\n\n{}",
348-
response.content.trim()
351+
text.trim()
349352
))),
350353
Err(e) => Ok(SubmissionResult::error(format!("Suggest failed: {}", e))),
351354
}

src/agent/compaction.rs

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,8 @@ use chrono::Utc;
1212
use crate::agent::context_monitor::{CompactionStrategy, ContextBreakdown};
1313
use crate::agent::session::Thread;
1414
use crate::error::Error;
15-
use crate::llm::{ChatMessage, CompletionRequest, LlmProvider};
15+
use crate::llm::{ChatMessage, CompletionRequest, LlmProvider, Reasoning};
16+
use crate::safety::SafetyLayer;
1617
use crate::workspace::Workspace;
1718

1819
/// Result of a compaction operation.
@@ -33,12 +34,13 @@ pub struct CompactionResult {
3334
/// Compacts conversation context to stay within limits.
3435
pub struct ContextCompactor {
3536
llm: Arc<dyn LlmProvider>,
37+
safety: Arc<SafetyLayer>,
3638
}
3739

3840
impl ContextCompactor {
3941
/// Create a new context compactor.
40-
pub fn new(llm: Arc<dyn LlmProvider>) -> Self {
41-
Self { llm }
42+
pub fn new(llm: Arc<dyn LlmProvider>, safety: Arc<SafetyLayer>) -> Self {
43+
Self { llm, safety }
4244
}
4345

4446
/// Compact a thread's context using the given strategy.
@@ -231,8 +233,9 @@ Be brief but capture all important details. Use bullet points."#,
231233
.with_max_tokens(1024)
232234
.with_temperature(0.3);
233235

234-
let response = self.llm.complete(request).await?;
235-
Ok(response.content)
236+
let reasoning = Reasoning::new(self.llm.clone(), self.safety.clone());
237+
let (text, _) = reasoning.complete(request).await?;
238+
Ok(text)
236239
}
237240

238241
/// Write a summary to the workspace daily log.

src/agent/dispatcher.rs

Lines changed: 32 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -109,10 +109,17 @@ impl Agent {
109109
let job_ctx = JobContext::with_user(&message.user_id, "chat", "Interactive chat session");
110110

111111
const MAX_TOOL_ITERATIONS: usize = 10;
112+
// Force a text-only response on the last iteration to guarantee termination
113+
// instead of hard-erroring. The penultimate iteration also gets a nudge
114+
// message so the LLM knows it should wrap up.
115+
const FORCE_TEXT_AT: usize = MAX_TOOL_ITERATIONS;
116+
const NUDGE_AT: usize = MAX_TOOL_ITERATIONS - 1;
112117
let mut iteration = 0;
113118
loop {
114119
iteration += 1;
115-
if iteration > MAX_TOOL_ITERATIONS {
120+
// Hard ceiling one past the forced-text iteration (should never be reached
121+
// since FORCE_TEXT_AT guarantees a text response, but kept as a safety net).
122+
if iteration > MAX_TOOL_ITERATIONS + 1 {
116123
return Err(crate::error::LlmError::InvalidResponse {
117124
provider: "agent".to_string(),
118125
reason: format!("Exceeded maximum tool iterations ({})", MAX_TOOL_ITERATIONS),
@@ -143,6 +150,19 @@ impl Agent {
143150
.into());
144151
}
145152

153+
// Inject a nudge message when approaching the iteration limit so the
154+
// LLM is aware it should produce a final answer on the next turn.
155+
if iteration == NUDGE_AT {
156+
context_messages.push(ChatMessage::system(
157+
"You are approaching the tool call limit. \
158+
Provide your best final answer on the next response \
159+
using the information you have gathered so far. \
160+
Do not call any more tools.",
161+
));
162+
}
163+
164+
let force_text = iteration >= FORCE_TEXT_AT;
165+
146166
// Refresh tool definitions each iteration so newly built tools become visible
147167
let tool_defs = self.tools().tool_definitions().await;
148168

@@ -162,15 +182,24 @@ impl Agent {
162182
tool_defs
163183
};
164184

165-
// Call LLM with current context
166-
let context = ReasoningContext::new()
185+
// Call LLM with current context; force_text drops tools to guarantee a
186+
// text response on the final iteration.
187+
let mut context = ReasoningContext::new()
167188
.with_messages(context_messages.clone())
168189
.with_tools(tool_defs)
169190
.with_metadata({
170191
let mut m = std::collections::HashMap::new();
171192
m.insert("thread_id".to_string(), thread_id.to_string());
172193
m
173194
});
195+
context.force_text = force_text;
196+
197+
if force_text {
198+
tracing::info!(
199+
iteration,
200+
"Forcing text-only response (iteration limit reached)"
201+
);
202+
}
174203

175204
let output = reasoning.respond_with_tools(&context).await?;
176205

@@ -799,7 +828,6 @@ mod tests {
799828
input_tokens: 0,
800829
output_tokens: 0,
801830
finish_reason: FinishReason::Stop,
802-
response_id: None,
803831
})
804832
}
805833

@@ -813,7 +841,6 @@ mod tests {
813841
input_tokens: 0,
814842
output_tokens: 0,
815843
finish_reason: FinishReason::Stop,
816-
response_id: None,
817844
})
818845
}
819846
}

src/agent/heartbeat.rs

Lines changed: 11 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,8 @@ use std::time::Duration;
2929
use tokio::sync::mpsc;
3030

3131
use crate::channels::OutgoingResponse;
32-
use crate::llm::{ChatMessage, CompletionRequest, FinishReason, LlmProvider};
32+
use crate::llm::{ChatMessage, CompletionRequest, LlmProvider, Reasoning};
33+
use crate::safety::SafetyLayer;
3334
use crate::workspace::Workspace;
3435
use crate::workspace::hygiene::HygieneConfig;
3536

@@ -100,6 +101,7 @@ pub struct HeartbeatRunner {
100101
hygiene_config: HygieneConfig,
101102
workspace: Arc<Workspace>,
102103
llm: Arc<dyn LlmProvider>,
104+
safety: Arc<SafetyLayer>,
103105
response_tx: Option<mpsc::Sender<OutgoingResponse>>,
104106
consecutive_failures: u32,
105107
}
@@ -111,12 +113,14 @@ impl HeartbeatRunner {
111113
hygiene_config: HygieneConfig,
112114
workspace: Arc<Workspace>,
113115
llm: Arc<dyn LlmProvider>,
116+
safety: Arc<SafetyLayer>,
114117
) -> Self {
115118
Self {
116119
config,
117120
hygiene_config,
118121
workspace,
119122
llm,
123+
safety,
120124
response_tx: None,
121125
consecutive_failures: 0,
122126
}
@@ -258,25 +262,18 @@ impl HeartbeatRunner {
258262
.with_max_tokens(max_tokens)
259263
.with_temperature(0.3);
260264

261-
let response = match self.llm.complete(request).await {
265+
let reasoning = Reasoning::new(self.llm.clone(), self.safety.clone());
266+
let (content, _usage) = match reasoning.complete(request).await {
262267
Ok(r) => r,
263268
Err(e) => return HeartbeatResult::Failed(format!("LLM call failed: {}", e)),
264269
};
265270

266-
let content = response.content.trim();
271+
let content = content.trim();
267272

268273
// Guard against empty content. Reasoning models (e.g. GLM-4.7) may
269274
// burn all output tokens on chain-of-thought and return content: null.
270275
if content.is_empty() {
271-
return if response.finish_reason == FinishReason::Length {
272-
HeartbeatResult::Failed(
273-
"LLM response was truncated (finish_reason=length) with no content. \
274-
The model may have exhausted its token budget on reasoning."
275-
.to_string(),
276-
)
277-
} else {
278-
HeartbeatResult::Failed("LLM returned empty content.".to_string())
279-
};
276+
return HeartbeatResult::Failed("LLM returned empty content.".to_string());
280277
}
281278

282279
// Check if nothing needs attention
@@ -355,9 +352,10 @@ pub fn spawn_heartbeat(
355352
hygiene_config: HygieneConfig,
356353
workspace: Arc<Workspace>,
357354
llm: Arc<dyn LlmProvider>,
355+
safety: Arc<SafetyLayer>,
358356
response_tx: Option<mpsc::Sender<OutgoingResponse>>,
359357
) -> tokio::task::JoinHandle<()> {
360-
let mut runner = HeartbeatRunner::new(config, hygiene_config, workspace, llm);
358+
let mut runner = HeartbeatRunner::new(config, hygiene_config, workspace, llm, safety);
361359
if let Some(tx) = response_tx {
362360
runner = runner.with_response_channel(tx);
363361
}

src/agent/session.rs

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -185,10 +185,6 @@ pub struct Thread {
185185
/// Pending auth token request (thread is in auth mode).
186186
#[serde(default)]
187187
pub pending_auth: Option<PendingAuth>,
188-
/// Last NEAR AI response ID for response chaining. Persisted to DB
189-
/// metadata so we can resume chaining across restarts.
190-
#[serde(default)]
191-
pub last_response_id: Option<String>,
192188
}
193189

194190
impl Thread {
@@ -205,7 +201,6 @@ impl Thread {
205201
metadata: serde_json::Value::Null,
206202
pending_approval: None,
207203
pending_auth: None,
208-
last_response_id: None,
209204
}
210205
}
211206

@@ -222,7 +217,6 @@ impl Thread {
222217
metadata: serde_json::Value::Null,
223218
pending_approval: None,
224219
pending_auth: None,
225-
last_response_id: None,
226220
}
227221
}
228222

@@ -863,7 +857,6 @@ mod tests {
863857

864858
thread.start_turn("hello");
865859
thread.complete_turn("world");
866-
thread.last_response_id = Some("resp_abc123".to_string());
867860

868861
let json = serde_json::to_string(&thread).unwrap();
869862
let restored: Thread = serde_json::from_str(&json).unwrap();
@@ -873,7 +866,6 @@ mod tests {
873866
assert_eq!(restored.turns.len(), 1);
874867
assert_eq!(restored.turns[0].user_input, "hello");
875868
assert_eq!(restored.turns[0].response, Some("world".to_string()));
876-
assert_eq!(restored.last_response_id, Some("resp_abc123".to_string()));
877869
}
878870

879871
#[test]

0 commit comments

Comments
 (0)