mozilla-ai · aittalam · Mar 6, 2026 · Feb 5, 2026 · Mar 3, 2026 · Mar 3, 2026
diff --git a/docs/example_llamafiles.md b/docs/example_llamafiles.md
@@ -3,7 +3,7 @@ try out llamafile with different kinds of LLMs.
 
 | Model                   | Size     | License                                                                                                                            | llamafile                                                                                                                                                                                      | other quants                                                                        |
 | ---                     | ---      | ---                                                                                                                                | ---                                                                                                                                                                                            | ---                                                                                 |
-| LLaMA 3.2 1B Instruct   | 1.11 GB  | [LLaMA 3.2](https://huggingface.co/Mozilla/Llama-3.2-1B-Instruct-llamafile/blob/main/LICENSE)                                      | [Llama-3.2-1B-Instruct.Q6\_K.llamafile](https://huggingface.co/Mozilla/Llama-3.2-1B-Instruct-llamafile/blob/main/Llama-3.2-1B-Instruct.Q6_K.llamafile?download=true)                           | [See HF repo](https://huggingface.co/Mozilla/Llama-3.2-1B-Instruct-llamafile)       |
+| LLaMA 3.2 1B Instruct   | 1.11 GB  | [LLaMA 3.2](https://huggingface.co/Mozilla/Llama-3.2-1B-Instruct-llamafile/blob/main/LICENSE)                                      | [Llama-3.2-1B-Instruct-Q6\_K.llamafile](https://huggingface.co/Mozilla/Llama-3.2-1B-Instruct-llamafile/blob/main/Llama-3.2-1B-Instruct-Q6_K.llamafile?download=true)                           | [See HF repo](https://huggingface.co/Mozilla/Llama-3.2-1B-Instruct-llamafile)       |
 | LLaMA 3.2 3B Instruct   | 2.62 GB  | [LLaMA 3.2](https://huggingface.co/Mozilla/Llama-3.2-3B-Instruct-llamafile/blob/main/LICENSE)                                      | [Llama-3.2-3B-Instruct.Q6\_K.llamafile](https://huggingface.co/Mozilla/Llama-3.2-3B-Instruct-llamafile/blob/main/Llama-3.2-3B-Instruct.Q6_K.llamafile?download=true)                           | [See HF repo](https://huggingface.co/Mozilla/Llama-3.2-3B-Instruct-llamafile)       |
 | LLaMA 3.1 8B Instruct   | 5.23 GB  | [LLaMA 3.1](https://huggingface.co/Mozilla/Meta-Llama-3.1-8B-Instruct-llamafile/blob/main/LICENSE)                                 | [Llama-3.1-8B-Instruct.Q4\_K\_M.llamafile](https://huggingface.co/Mozilla/Meta-Llama-3.1-8B-Instruct-llamafile/resolve/main/Meta-Llama-3.1-8B-Instruct.Q4_K_M.llamafile?download=true)         | [See HF repo](https://huggingface.co/Mozilla/Meta-Llama-3.1-8B-Instruct-llamafile)  |
 | Gemma 3 1B Instruct     | 1.32 GB  | [Gemma 3](https://ai.google.dev/gemma/terms)                                                                                       | [gemma-3-1b-it.Q6\_K.llamafile](https://huggingface.co/Mozilla/gemma-3-1b-it-llamafile/resolve/main/google_gemma-3-1b-it-Q6_K.llamafile?download=true)                                         | [See HF repo](https://huggingface.co/Mozilla/gemma-3-1b-it-llamafile)               |

diff --git a/docs/support.md b/docs/support.md
@@ -9,7 +9,7 @@ stock install:
 - Windows 10+ (AMD64 only)
 - FreeBSD 13+
 - NetBSD 9.2+ (AMD64 only)
-- OpenBSD 7+ (AMD64 only)
+- OpenBSD 7.0 to 7.4 (AMD64 only)
 
 On Windows, llamafile runs as a native portable executable. On UNIX
 systems, llamafile extracts a small loader program named `ape` to

diff --git a/llama.cpp b/llama.cpp
diff --git a/llama.cpp.patches/README.md b/llama.cpp.patches/README.md
@@ -91,17 +91,12 @@ These patches integrate llamafile's file handling APIs for loading models from b
 |-------|-------------|
 | `tools_server_server.cpp.patch` | Refactors `main()` to `server_main()` for llamafile integration; adds Metal backend trigger, cosmo_args support, TUI mode handling, and proper exit for Metal async logging |
 
-### Vendor Library Fixes
-
-| Patch | Description |
-|-------|-------------|
-| `vendor_miniaudio_miniaudio.h.patch` | Removes `__COSMOPOLITAN__` from Windows platform detection (Cosmopolitan handles this at runtime) |
-
 ### Miscellaneous
 
 | Patch | Description |
 |-------|-------------|
 | `common_chat.cpp.patch` | Fixes C++ type conversion: explicitly wraps `inputs.messages` in `std::optional<json>()` for Deepseek v3.1 template |
+| `ggml_src_ggml-backend-reg.cpp.patch` | Suppresses debug log noise for non-existent backend search paths (irrelevant for llamafile's DSO loading approach) |
 
 ## Creating New Patches
 

diff --git a/llama.cpp.patches/llamafile-files/BUILD.mk b/llama.cpp.patches/llamafile-files/BUILD.mk
@@ -77,6 +77,7 @@ LLAMA_SRCS_CPP := \
 	llama.cpp/src/models/dream.cpp \
 	llama.cpp/src/models/ernie4-5-moe.cpp \
 	llama.cpp/src/models/ernie4-5.cpp \
+	llama.cpp/src/models/eurobert.cpp \
 	llama.cpp/src/models/exaone.cpp \
 	llama.cpp/src/models/exaone4.cpp \
 	llama.cpp/src/models/exaone-moe.cpp \

diff --git a/llama.cpp.patches/patches/common_chat.cpp.patch b/llama.cpp.patches/patches/common_chat.cpp.patch
@@ -1,7 +1,7 @@
 diff --git a/common/chat.cpp b/common/chat.cpp
 --- a/llama.cpp/common/chat.cpp
 +++ b/llama.cpp/common/chat.cpp
-@@ -1791,7 +1791,7 @@ static common_chat_params common_chat_params_init_deepseek_v3_1(const common_cha
+@@ -1795,7 +1795,7 @@ static common_chat_params common_chat_params_init_deepseek_v3_1(const common_cha
      };
 
      auto prompt = apply(tmpl, inputs,

diff --git a/llama.cpp.patches/patches/ggml_src_gguf.cpp.patch b/llama.cpp.patches/patches/ggml_src_gguf.cpp.patch
@@ -12,18 +12,22 @@ diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp
  #include <cinttypes>
  #include <cstddef>
  #include <cstdint>
-@@ -284,14 +288,103 @@ struct gguf_reader {
-     bool read(void * dst, const size_t size) const {
-         return fread(dst, 1, size, file) == size;
+@@ -358,18 +362,107 @@ struct gguf_reader {
+         return nread == size;
      }
-+
+ 
 +    size_t tell() const {
-+        return ftell(file);
++        return gguf_ftell(file);
 +    }
 +
 +    bool seek(size_t offset, int whence) const {
-+        return fseek(file, offset, whence) == 0;
++        return gguf_fseek(file, offset, whence) == 0;
 +    }
++
+ private:
+     FILE * file;
+
+     mutable uint64_t nbytes_remain;
  };
 
 +#ifdef COSMOCC
@@ -118,7 +122,7 @@ diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp
      if (is_array) {
          std::vector<T> value;
          try {
-@@ -316,8 +409,8 @@ bool gguf_read_emplace_helper(const struct gguf_reader & gr, std::vector<struct
+@@ -394,8 +487,8 @@ bool gguf_read_emplace_helper(const struct gguf_reader & gr, std::vector<struct
      return true;
  }
 
@@ -129,24 +133,24 @@ diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp
      struct gguf_context * ctx = new gguf_context;
 
      bool ok = true;
-@@ -618,14 +711,14 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
+@@ -696,14 +789,14 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
      GGML_ASSERT(int64_t(ctx->info.size()) == n_tensors);
 
      // we require the data section to be aligned, so take into account any padding
--    if (fseek(file, GGML_PAD(ftell(file), ctx->alignment), SEEK_SET) != 0) {
+-    if (gguf_fseek(file, GGML_PAD(gguf_ftell(file), ctx->alignment), SEEK_SET) != 0) {
 +    if (!gr.seek(GGML_PAD(gr.tell(), ctx->alignment), SEEK_SET)) {
          GGML_LOG_ERROR("%s: failed to seek to beginning of data section\n", __func__);
          gguf_free(ctx);
          return nullptr;
      }
 
      // store the current file offset - this is where the data section starts
--    ctx->offset = ftell(file);
+-    ctx->offset = gguf_ftell(file);
 +    ctx->offset = gr.tell();
 
      // compute the total size of the data section, taking into account the alignment
      {
-@@ -738,7 +831,27 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
+@@ -840,7 +933,27 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
      return ctx;
  }
 
@@ -174,7 +178,7 @@ diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp
      FILE * file = ggml_fopen(fname, "rb");
 
      if (!file) {
-@@ -749,6 +862,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
+@@ -851,6 +964,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
      struct gguf_context * result = gguf_init_from_file_impl(file, params);
      fclose(file);
      return result;

diff --git a/llama.cpp.patches/patches/tools_server_server.cpp.patch b/llama.cpp.patches/patches/tools_server_server.cpp.patch
@@ -1,7 +1,7 @@
 diff --git a/tools/server/server.cpp b/tools/server/server.cpp
 --- a/llama.cpp/tools/server/server.cpp
 +++ b/llama.cpp/tools/server/server.cpp
-@@ -16,6 +16,11 @@
+@@ -17,6 +17,11 @@
  #include <windows.h>
  #endif
 
@@ -13,18 +13,18 @@ diff --git a/tools/server/server.cpp b/tools/server/server.cpp
  static std::function<void(int)> shutdown_handler;
  static std::atomic_flag is_terminating = ATOMIC_FLAG_INIT;
 
-@@ -66,7 +71,8 @@ static server_http_context::handler_t ex_wrapper(server_http_context::handler_t
+@@ -67,7 +72,8 @@ static server_http_context::handler_t ex_wrapper(server_http_context::handler_t
      };
  }
 
 -int main(int argc, char ** argv) {
 +// Core server logic - can be called from llamafile main.cpp or standalone main()
 +int server_main(int argc, char ** argv) {
-     // own arguments required by this example
-     common_params params;
+     std::setlocale(LC_NUMERIC, "C");
 
-@@ -95,6 +101,10 @@ int main(int argc, char ** argv) {
-         params.model_alias = params.model.name;
+     // own arguments required by this example
+@@ -98,6 +104,10 @@ int main(int argc, char ** argv) {
+         params.model_alias.insert(params.model.name);
      }
 
 +#ifdef COSMOCC
@@ -34,7 +34,7 @@ diff --git a/tools/server/server.cpp b/tools/server/server.cpp
      common_init();
 
      // struct that contains llama context and inference
-@@ -318,5 +328,43 @@ int main(int argc, char ** argv) {
+@@ -322,5 +332,43 @@ int main(int argc, char ** argv) {
          }
      }
 
@@ -48,7 +48,7 @@ diff --git a/tools/server/server.cpp b/tools/server/server.cpp
 +#else
      return 0;
 +#endif
-+}
+ }
 +
 +// Standalone entry point for llama-server executable
 +// Not compiled when building as part of llamafile TUI (which has its own main)
@@ -76,6 +76,5 @@ diff --git a/tools/server/server.cpp b/tools/server/server.cpp
 +    }
 +#endif
 +    return server_main(argc, argv);
- }
++}
 +#endif
-\ No newline at end of file
diff --git a/llama.cpp.patches/patches/vendor_cpp-httplib_httplib.cpp.patch b/llama.cpp.patches/patches/vendor_cpp-httplib_httplib.cpp.patch
@@ -1,19 +1,18 @@
 diff --git a/vendor/cpp-httplib/httplib.cpp b/vendor/cpp-httplib/httplib.cpp
 --- a/llama.cpp/vendor/cpp-httplib/httplib.cpp
 +++ b/llama.cpp/vendor/cpp-httplib/httplib.cpp
-@@ -5372,8 +5372,13 @@ void ThreadPool::worker::operator()() {
-     {
-       std::unique_lock<std::mutex> lock(pool_.mutex_);
-
--      pool_.cond_.wait(lock,
--                       [&] { return !pool_.jobs_.empty() || pool_.shutdown_; });
-+      // Use wait_for() instead of wait() to work around a
-+      // Cosmopolitan libc bug where untimed futex waits on XNU
-+      // (macOS) expire after ~72 minutes, causing
-+      // condition_variable::wait() to throw ETIMEDOUT.
-+      while (pool_.jobs_.empty() && !pool_.shutdown_) {
-+        pool_.cond_.wait_for(lock, std::chrono::seconds(30));
-+      }
-
-       if (pool_.shutdown_ && pool_.jobs_.empty()) { break; }
+@@ -6043,7 +6043,13 @@ void ThreadPool::worker(bool is_dynamic) {
+           break;
+         }
+       } else {
+-        cond_.wait(lock, [&] { return !jobs_.empty() || shutdown_; });
++        // Use wait_for() instead of wait() to work around a
++        // Cosmopolitan libc bug where untimed futex waits on XNU
++        // (macOS) expire after ~72 minutes, causing
++        // condition_variable::wait() to throw ETIMEDOUT.
++        while (jobs_.empty() && !shutdown_) {
++          cond_.wait_for(lock, std::chrono::seconds(30));
++        }
+       }
 
+       idle_thread_count_--;
diff --git a/llama.cpp.patches/patches/vendor_miniaudio_miniaudio.h.patch b/llama.cpp.patches/patches/vendor_miniaudio_miniaudio.h.patch
diff --git a/llamafile/args.cpp b/llamafile/args.cpp
@@ -42,6 +42,16 @@ LlamafileArgs parse_llamafile_args(int argc, char** argv) {
     // This reads --gpu and -ngl flags to set FLAG_gpu
     llamafile_early_gpu_init(argv);
 
+    // Capture -p/--prompt value before filtering (needed for combined mode
+    // where SERVER parsing excludes -p)
+    // Note: Loop does not break early; if multiple -p flags are given,
+    // the last occurrence wins (intentional for override flexibility)
+    for (int i = 0; i < argc; ++i) {
+        if ((strcmp(argv[i], "-p") == 0 || strcmp(argv[i], "--prompt") == 0) && i + 1 < argc) {
+            args.system_prompt = argv[i + 1];
+        }
+    }
+
     // Determine execution mode from flags
     // Priority: explicit flags override defaults
     if (llamafile_has(argv, "--server")) {

diff --git a/llamafile/args.h b/llamafile/args.h
@@ -17,6 +17,8 @@
 
 #pragma once
 
+#include <string>
+
 namespace lf {
 
 // Program execution modes
@@ -35,6 +37,10 @@ struct LlamafileArgs {
     int llama_argc = 0;
     char** llama_argv = nullptr;
 
+    // System prompt captured from -p/--prompt (needed for combined mode where
+    // SERVER parsing excludes -p)
+    std::string system_prompt;
+
     // Note: Llamafile-specific flags are stored in FLAG_* globals (llamafile.h):
     //   --verbose  -> FLAG_verbose
     //   --nothink  -> FLAG_nothink

diff --git a/llamafile/chatbot.h b/llamafile/chatbot.h
@@ -62,14 +62,15 @@ extern mtmd_context *g_mtmd;          // multimodal context (replaces g_clip)
 extern enum Role g_role;
 extern common_params *g_params;       // pointer to params (replaces gpt_params)
 extern common_sampler *g_sampler;     // sampler context (new)
-extern int g_system_prompt_tokens;
+extern std::vector<common_chat_msg> g_messages;  // chat message history
 extern llama_context *g_ctx;
 extern llama_model *g_model;
 extern std::vector<int> g_history;
 extern volatile sig_atomic_t g_got_sigint;
 extern bool g_interrupted_exit;
 extern common_chat_templates_ptr g_chat_templates;
 extern common_chat_parser_params g_chat_syntax;
+extern std::string g_pending_file_content;  // accumulated /upload content awaiting user message
 
 // Original entry point: loads its own model
 int main(int argc, char **argv);