Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions llamafile/chatbot.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
#include <vector>
#include <signal.h>

#include "chat.h"

#define DEFAULT_SYSTEM_PROMPT \
"A chat between a curious human and an artificial intelligence assistant. " \
"The assistant gives helpful, detailed, and polite answers to the " \
Expand Down Expand Up @@ -66,6 +68,8 @@ extern llama_model *g_model;
extern std::vector<int> g_history;
// Interrupt flag: the REPL's generation loop checks it before each step and
// the REPL resets it to 0 once the response is finished.
extern volatile sig_atomic_t g_got_sigint;
extern bool g_interrupted_exit;
// Chat template handle, defined in chatbot_main.cpp. main() assigns it from
// common_chat_templates_init() when the model is not a base model; otherwise
// it stays default-constructed.
extern common_chat_templates_ptr g_chat_templates;
// Output-parsing settings, defined in chatbot_main.cpp. main() fills in
// format, thinking_forced_open, reasoning_format and reasoning_in_content
// after applying the template; repl() reads .format to decide whether to run
// the chat parser and passes the whole struct to common_chat_parse().
extern common_chat_syntax g_chat_syntax;

int main(int, char **);

Expand Down
4 changes: 3 additions & 1 deletion llamafile/chatbot_logo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,10 @@ static void print_logo(const char16_t *s) {
}

void logo(char **argv) {
if (llamafile_has(argv, "--nologo"))
if (llamafile_has(argv, "--nologo")) {
FLAG_nologo = true;
return;
}
if (llamafile_has(argv, "--ascii")) {
printf("\
_ _ __ _ _\n\
Expand Down
49 changes: 47 additions & 2 deletions llamafile/chatbot_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,16 @@
#include <cosmo.h>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <exception>
#include <limits.h>
#include <signal.h>
#include <string>
#include <unistd.h>
#include <vector>

#include "arg.h"
#include "chat.h"
#include "common.h"
#include "llama.h"
#include "log.h"
Expand All @@ -53,6 +56,8 @@ common_sampler *g_sampler = nullptr; // sampler context
mtmd_context *g_mtmd = nullptr; // multimodal context
llama_model *g_model = nullptr;
llama_context *g_ctx = nullptr;
common_chat_templates_ptr g_chat_templates; // chat template handler
common_chat_syntax g_chat_syntax; // chat syntax for parsing

// Static storage for params
static common_params s_params;
Expand Down Expand Up @@ -170,15 +175,14 @@ int main(int argc, char **argv) {
if (g_params->n_ctx < g_params->n_batch)
g_params->n_batch = g_params->n_ctx;

// Print info
// Print info (format line is added later after template detection)
if (!FLAG_nologo) {
printf(BOLD "software" UNBOLD ": llamafile " LLAMAFILE_VERSION_STRING "\n"
BOLD "model" UNBOLD ": %s\n",
basename(g_params->model.path).c_str());
if (is_base_model())
printf(BOLD "mode" UNBOLD ": RAW TEXT COMPLETION (base model)\n");
printf(BOLD "compute" UNBOLD ": %s\n", describe_compute().c_str());
printf("\n");
}

print_ephemeral("initializing context...");
Expand Down Expand Up @@ -216,6 +220,47 @@ int main(int argc, char **argv) {
}
}

// Initialize chat templates for output parsing (e.g., gpt-oss think mode).
// Uses the same approach as common_chat_verify_template(): render the template
// once with a dummy message. Only the detected format and the
// thinking_forced_open flag are kept from the result; the rendered prompt
// itself is not used here.
// Base models skip this entirely, so g_chat_templates and g_chat_syntax keep
// their default-constructed values for them.
if (!is_base_model()) {
g_chat_templates = common_chat_templates_init(g_model, g_params->chat_template);
if (g_chat_templates) {
// Provide a minimal dummy message (same approach as common_chat_verify_template)
common_chat_msg dummy_msg;
dummy_msg.role = "user";
dummy_msg.content = "test";

common_chat_templates_inputs inputs;
inputs.messages = {dummy_msg};
inputs.use_jinja = true;

try {
// May throw if the template cannot be applied; handled by the catch below.
auto chat_params = common_chat_templates_apply(g_chat_templates.get(), inputs);
g_chat_syntax.format = chat_params.format;
g_chat_syntax.thinking_forced_open = chat_params.thinking_forced_open;

// Enable reasoning extraction for all chat models, like llama.cpp CLI/server does.
// Parsers handle models without think mode gracefully - if there's no <think> or
// similar tags in the output, no reasoning gets extracted.
g_chat_syntax.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
g_chat_syntax.reasoning_in_content = false;

// Print the detected format as one more line of the startup info block.
// Suppressed with --nologo and when the format is plain content-only.
if (!FLAG_nologo && g_chat_syntax.format != COMMON_CHAT_FORMAT_CONTENT_ONLY) {
printf(BOLD "format" UNBOLD ": %s\n", common_chat_format_name(g_chat_syntax.format));
}
} catch (const std::exception &e) {
// Template application failed: log at debug level and leave g_chat_syntax
// untouched, since the throw happens before any field is assigned.
// NOTE(review): the intended fallback to content-only parsing relies on
// common_chat_syntax defaulting format to COMMON_CHAT_FORMAT_CONTENT_ONLY
// - confirm against its declaration.
LOG_DBG("chat template application failed: %s\n", e.what());
}
}
}

// Ensure there's a blank line after info block
if (!FLAG_nologo) {
printf("\n");
}

// Run the REPL
repl();

Expand Down
69 changes: 65 additions & 4 deletions llamafile/chatbot_repl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include <cstdio>
#include <string_view>

#include "chat.h"
#include "common.h"
#include "llama.h"
#include "sampling.h"
Expand Down Expand Up @@ -213,8 +214,18 @@ void repl() {
free(line);
continue;
}
// Check if we should use chat parsing (for think mode models like gpt-oss)
const bool use_chat_parser = g_chat_syntax.format != COMMON_CHAT_FORMAT_CONTENT_ONLY;
std::string raw_output; // Accumulates raw token output
common_chat_msg prev_msg; // Previous parse result for diff computation
bool in_reasoning = false; // Track if we're currently in reasoning mode

for (;;) {
if (g_got_sigint) {
if (in_reasoning) {
print(UNBOLD);
in_reasoning = false;
}
eval_token(llamafile_token_eot(g_model));
break;
}
Expand All @@ -224,11 +235,61 @@ void repl() {
break;
if (llama_vocab_is_eog(llama_model_get_vocab(g_model), id))
break;
std::string s;
bleeder.feed(&s, token_to_piece(g_ctx, id, g_params->special));
print(s);
fflush(stdout);

// Emit the text for one sampled token (id). Two paths:
//  - chat-parser path: the raw token text is accumulated and re-parsed so that
//    reasoning text and final content can be shown in different styles;
//  - direct path: the token text is passed straight through to the terminal.
// Both paths route text through bleeder.feed() before print() and flush stdout.
if (use_chat_parser) {
// For chat parsing, we need special tokens to detect patterns like <|channel|>
// (so special=true is forced here instead of using g_params->special).
std::string token_str = token_to_piece(g_ctx, id, /*special=*/true);

// Accumulate raw output for parsing
raw_output += token_str;

// Parse incrementally: the whole accumulated output is parsed again on
// every token, in partial mode.
// NOTE(review): per-token cost therefore grows with the response length,
// and nothing in the visible hunk re-parses with is_partial=false after
// generation ends - confirm the partial parser cannot withhold trailing text.
auto msg = common_chat_parse(raw_output, /*is_partial=*/true, g_chat_syntax);

// Compute diffs against the previous parse result to find new content
auto diffs = common_chat_msg_diff::compute_diffs(prev_msg, msg);

for (const auto &diff : diffs) {
// Display reasoning content in dim style; FAINT is emitted only once,
// when the first reasoning text appears.
if (!diff.reasoning_content_delta.empty()) {
if (!in_reasoning) {
print(FAINT);
in_reasoning = true;
}
std::string s;
bleeder.feed(&s, diff.reasoning_content_delta);
print(s);
}
// Display final content normally; the first content text after reasoning
// ends the dim style.
if (!diff.content_delta.empty()) {
if (in_reasoning) {
print(UNBOLD);
print("\n\n"); // Two newlines: blank line between reasoning and content
in_reasoning = false;
}
std::string s;
bleeder.feed(&s, diff.content_delta);
print(s);
}
}

// Remember this parse result so the next token's diff only yields new text.
prev_msg = msg;
fflush(stdout);
} else {
// No chat parsing - direct output, honoring the user's special-token setting
std::string token_str = token_to_piece(g_ctx, id, g_params->special);
std::string s;
bleeder.feed(&s, token_str);
print(s);
fflush(stdout);
}
}

// End reasoning mode if still active
if (in_reasoning) {
print(UNBOLD);
}

g_got_sigint = 0;
free(line);
std::string s;
Expand Down