mozilla-ai · aittalam · Feb 18, 2026 · Feb 17, 2026 · Feb 18, 2026 · Feb 18, 2026
diff --git a/llamafile/chatbot.h b/llamafile/chatbot.h
@@ -21,6 +21,8 @@
 #include <vector>
 #include <signal.h>
 
+#include "chat.h"
+
 #define DEFAULT_SYSTEM_PROMPT \
     "A chat between a curious human and an artificial intelligence assistant. " \
     "The assistant gives helpful, detailed, and polite answers to the " \
@@ -66,6 +68,8 @@ extern llama_model *g_model;
 extern std::vector<int> g_history;
 extern volatile sig_atomic_t g_got_sigint;
 extern bool g_interrupted_exit;
+extern common_chat_templates_ptr g_chat_templates;
+extern common_chat_syntax g_chat_syntax;
 
 int main(int, char **);
 

diff --git a/llamafile/chatbot_logo.cpp b/llamafile/chatbot_logo.cpp
@@ -47,8 +47,10 @@ static void print_logo(const char16_t *s) {
 }
 
 void logo(char **argv) {
-    if (llamafile_has(argv, "--nologo"))
+    if (llamafile_has(argv, "--nologo")) {
+        FLAG_nologo = true;
         return;
+    }
     if (llamafile_has(argv, "--ascii")) {
         printf("\
  _ _                        __ _ _\n\

diff --git a/llamafile/chatbot_main.cpp b/llamafile/chatbot_main.cpp
@@ -20,13 +20,16 @@
 #include <cosmo.h>
 #include <cstdio>
 #include <cstdlib>
+#include <cstring>
+#include <exception>
 #include <limits.h>
 #include <signal.h>
 #include <string>
 #include <unistd.h>
 #include <vector>
 
 #include "arg.h"
+#include "chat.h"
 #include "common.h"
 #include "llama.h"
 #include "log.h"
@@ -53,6 +56,8 @@ common_sampler *g_sampler = nullptr;    // sampler context
 mtmd_context *g_mtmd = nullptr;         // multimodal context
 llama_model *g_model = nullptr;
 llama_context *g_ctx = nullptr;
+common_chat_templates_ptr g_chat_templates;  // chat template handler
+common_chat_syntax g_chat_syntax;            // chat syntax for parsing
 
 // Static storage for params
 static common_params s_params;
@@ -170,15 +175,14 @@ int main(int argc, char **argv) {
     if (g_params->n_ctx < g_params->n_batch)
         g_params->n_batch = g_params->n_ctx;
 
-    // Print info
+    // Print info (format line is added later after template detection)
     if (!FLAG_nologo) {
         printf(BOLD "software" UNBOLD ": llamafile " LLAMAFILE_VERSION_STRING "\n"
                BOLD "model" UNBOLD ":    %s\n",
                basename(g_params->model.path).c_str());
         if (is_base_model())
             printf(BOLD "mode" UNBOLD ":     RAW TEXT COMPLETION (base model)\n");
         printf(BOLD "compute" UNBOLD ":  %s\n", describe_compute().c_str());
-        printf("\n");
     }
 
     print_ephemeral("initializing context...");
@@ -216,6 +220,47 @@ int main(int argc, char **argv) {
         }
     }
 
+    // Initialize chat templates for output parsing (e.g., gpt-oss think mode).
+    // Apply the template once to a dummy message to detect the model's output format.
+    if (!is_base_model()) {
+        g_chat_templates = common_chat_templates_init(g_model, g_params->chat_template);
+        if (g_chat_templates) {
+            // Provide a minimal dummy message (same approach as common_chat_verify_template)
+            common_chat_msg dummy_msg;
+            dummy_msg.role = "user";
+            dummy_msg.content = "test";
+
+            common_chat_templates_inputs inputs;
+            inputs.messages = {dummy_msg};
+            inputs.use_jinja = true;
+
+            try {
+                auto chat_params = common_chat_templates_apply(g_chat_templates.get(), inputs);
+                g_chat_syntax.format = chat_params.format;
+                g_chat_syntax.thinking_forced_open = chat_params.thinking_forced_open;
+
+                // Enable reasoning extraction for all chat models, like llama.cpp CLI/server does.
+                // Parsers handle models without think mode gracefully - if there's no <think> or
+                // similar tags in the output, no reasoning gets extracted.
+                g_chat_syntax.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
+                g_chat_syntax.reasoning_in_content = false;
+
+                // Print detected format
+                if (!FLAG_nologo && g_chat_syntax.format != COMMON_CHAT_FORMAT_CONTENT_ONLY) {
+                    printf(BOLD "format" UNBOLD ":   %s\n", common_chat_format_name(g_chat_syntax.format));
+                }
+            } catch (const std::exception &e) {
+                // Template application failed, fall back to content-only parsing
+                LOG_DBG("chat template application failed: %s\n", e.what());
+            }
+        }
+    }
+
+    // Ensure there's a blank line after info block
+    if (!FLAG_nologo) {
+        printf("\n");
+    }
+
     // Run the REPL
     repl();
 

diff --git a/llamafile/chatbot_repl.cpp b/llamafile/chatbot_repl.cpp
@@ -22,6 +22,7 @@
 #include <cstdio>
 #include <string_view>
 
+#include "chat.h"
 #include "common.h"
 #include "llama.h"
 #include "sampling.h"
@@ -213,8 +214,18 @@ void repl() {
             free(line);
             continue;
         }
+        // Use the chat parser for any detected format other than content-only (e.g., gpt-oss think mode)
+        const bool use_chat_parser = g_chat_syntax.format != COMMON_CHAT_FORMAT_CONTENT_ONLY;
+        std::string raw_output;           // Accumulates raw token output
+        common_chat_msg prev_msg;         // Previous parse result for diff computation
+        bool in_reasoning = false;        // Track if we're currently in reasoning mode
+
         for (;;) {
             if (g_got_sigint) {
+                if (in_reasoning) {
+                    print(UNBOLD);
+                    in_reasoning = false;
+                }
                 eval_token(llamafile_token_eot(g_model));
                 break;
             }
@@ -224,11 +235,61 @@ void repl() {
                 break;
             if (llama_vocab_is_eog(llama_model_get_vocab(g_model), id))
                 break;
-            std::string s;
-            bleeder.feed(&s, token_to_piece(g_ctx, id, g_params->special));
-            print(s);
-            fflush(stdout);
+
+            if (use_chat_parser) {
+                // For chat parsing, we need special tokens to detect patterns like <|channel|>
+                std::string token_str = token_to_piece(g_ctx, id, /*special=*/true);
+
+                // Accumulate raw output for parsing
+                raw_output += token_str;
+
+                // Parse incrementally
+                auto msg = common_chat_parse(raw_output, /*is_partial=*/true, g_chat_syntax);
+
+                // Compute diffs to find new content
+                auto diffs = common_chat_msg_diff::compute_diffs(prev_msg, msg);
+
+                for (const auto &diff : diffs) {
+                    // Display reasoning content in dim style
+                    if (!diff.reasoning_content_delta.empty()) {
+                        if (!in_reasoning) {
+                            print(FAINT);
+                            in_reasoning = true;
+                        }
+                        std::string s;
+                        bleeder.feed(&s, diff.reasoning_content_delta);
+                        print(s);
+                    }
+                    // Display final content normally
+                    if (!diff.content_delta.empty()) {
+                        if (in_reasoning) {
+                            print(UNBOLD);
+                            print("\n\n");  // Blank line between reasoning and content
+                            in_reasoning = false;
+                        }
+                        std::string s;
+                        bleeder.feed(&s, diff.content_delta);
+                        print(s);
+                    }
+                }
+
+                prev_msg = msg;
+                fflush(stdout);
+            } else {
+                // No chat parsing - direct output
+                std::string token_str = token_to_piece(g_ctx, id, g_params->special);
+                std::string s;
+                bleeder.feed(&s, token_str);
+                print(s);
+                fflush(stdout);
+            }
         }
+
+        // End reasoning mode if still active
+        if (in_reasoning) {
+            print(UNBOLD);
+        }
+
         g_got_sigint = 0;
         free(line);
         std::string s;