mozilla-ai
diff --git a/llama.cpp b/llama.cpp
diff --git a/llama.cpp.patches/README.md b/llama.cpp.patches/README.md
Lines changed: 1 addition & 1 deletion
diff --git a/llama.cpp.patches/llamafile-files/BUILD.mk b/llama.cpp.patches/llamafile-files/BUILD.mk
Lines changed: 18 additions & 5 deletions
diff --git a/llama.cpp.patches/patches/common_arg.cpp.patch b/llama.cpp.patches/patches/common_arg.cpp.patch
Lines changed: 1 addition & 1 deletion
diff --git a/llama.cpp.patches/patches/common_chat.cpp.patch b/llama.cpp.patches/patches/common_chat.cpp.patch
Lines changed: 0 additions & 12 deletions
diff --git a/llama.cpp.patches/patches/common_common.cpp.patch b/llama.cpp.patches/patches/common_common.cpp.patch
Lines changed: 2 additions & 2 deletions
diff --git a/llama.cpp.patches/patches/common_download.cpp.patch b/llama.cpp.patches/patches/common_download.cpp.patch
Lines changed: 1 addition & 1 deletion
diff --git a/llama.cpp.patches/patches/ggml_src_ggml-backend-reg.cpp.patch b/llama.cpp.patches/patches/ggml_src_ggml-backend-reg.cpp.patch
Lines changed: 1 addition & 1 deletion
diff --git a/llama.cpp.patches/patches/ggml_src_ggml-cpu_repack.cpp.patch b/llama.cpp.patches/patches/ggml_src_ggml-cpu_repack.cpp.patch
Lines changed: 3 additions & 3 deletions
@@ -83,6 +83,7 @@ Cosmopolitan libc has specific behaviors with condition variables and signals th
 | Patch | Description |
 |-------|-------------|
 | `common_log.cpp.patch` | Adds `#include <csignal>`; blocks `SIGINT`/`SIGTERM` on logger thread via `pthread_sigmask` to prevent `EINTR` exceptions; replaces `cv.wait()` with `wait_for(30s)` loop to work around XNU futex timeout bug (~72 minute expiry) |
+| `tools_server_server-models.cpp.patch` | Adds `#include <csignal>`; blocks `SIGINT`/`SIGTERM` on stopping thread; replaces `cv.wait()` with `wait_for(30s)` loops in `unload_lru`, `stopping_thread`, and `wait_until_loading_finished` |
 | `tools_server_server-queue.cpp.patch` | Adds missing includes (`<cerrno>`, `<system_error>`, `<csignal>`); blocks `SIGINT`/`SIGTERM` on queue thread; replaces `wait()` with `wait_for()` loops in three locations (`wait_until_no_sleep`, main loop, `recv`) |
 | `vendor_cpp-httplib_httplib.cpp.patch` | Fixes httplib thread pool with `wait_for()` instead of `wait()` for XNU futex compatibility |
 
@@ -116,7 +117,6 @@ These patches integrate llamafile's file handling APIs for loading models from b
 
 | Patch | Description |
 |-------|-------------|
-| `common_chat.cpp.patch` | Fixes C++ type conversion: explicitly wraps `inputs.messages` in `std::optional<json>()` for Deepseek v3.1 template |
 | `ggml_src_ggml-backend-reg.cpp.patch` | Suppresses debug log noise for non-existent backend search paths (irrelevant for llamafile's DSO loading approach) |
 | `ggml_src_ggml-vulkan_ggml-vulkan.cpp.patch` | Fixes unsigned integer underflow in `ggml_backend_vk_get_device_memory` where Vulkan's `heapUsage` can exceed `heapBudget` (clamps to zero instead of wrapping) |
 
 
@@ -88,6 +88,7 @@ LLAMA_SRCS_CPP := \
 	llama.cpp/src/models/gemma2-iswa.cpp \
 	llama.cpp/src/models/gemma3.cpp \
 	llama.cpp/src/models/gemma3n-iswa.cpp \
+	llama.cpp/src/models/gemma4-iswa.cpp \
 	llama.cpp/src/models/glm4-moe.cpp \
 	llama.cpp/src/models/glm4.cpp \
 	llama.cpp/src/models/gpt2.cpp \
@@ -198,14 +199,16 @@ LLAMA_OBJS := $(LLAMA_SRCS_CPP:%.cpp=o/$(MODE)/%.cpp.o)
 
 COMMON_SRCS_CPP := \
 	llama.cpp/common/arg.cpp \
-	llama.cpp/common/chat-parser-xml-toolcall.cpp \
-	llama.cpp/common/chat-parser.cpp \
+	llama.cpp/common/chat-auto-parser-generator.cpp \
+	llama.cpp/common/chat-auto-parser-helpers.cpp \
+	llama.cpp/common/chat-diff-analyzer.cpp \
 	llama.cpp/common/chat-peg-parser.cpp \
 	llama.cpp/common/chat.cpp \
 	llama.cpp/common/common.cpp \
 	llama.cpp/common/console.cpp \
 	llama.cpp/common/debug.cpp \
 	llama.cpp/common/download.cpp \
+	llama.cpp/common/hf-cache.cpp \
 	llama.cpp/common/jinja/caps.cpp \
 	llama.cpp/common/jinja/lexer.cpp \
 	llama.cpp/common/jinja/parser.cpp \
@@ -222,6 +225,7 @@ COMMON_SRCS_CPP := \
 	llama.cpp/common/ngram-mod.cpp \
 	llama.cpp/common/peg-parser.cpp \
 	llama.cpp/common/preset.cpp \
+	llama.cpp/common/reasoning-budget.cpp \
 	llama.cpp/common/regex-partial.cpp \
 	llama.cpp/common/sampling.cpp \
 	llama.cpp/common/speculative.cpp \
@@ -273,9 +277,13 @@ MTMD_SRCS_CPP := \
 	llama.cpp/tools/mtmd/mtmd.cpp \
 	llama.cpp/tools/mtmd/mtmd-helper.cpp \
 	llama.cpp/tools/mtmd/mtmd-audio.cpp \
+	llama.cpp/tools/mtmd/mtmd-image.cpp \
 	llama.cpp/tools/mtmd/models/cogvlm.cpp \
+	llama.cpp/tools/mtmd/models/deepseekocr.cpp \
 	llama.cpp/tools/mtmd/models/conformer.cpp \
+	llama.cpp/tools/mtmd/models/gemma4v.cpp \
 	llama.cpp/tools/mtmd/models/glm4v.cpp \
+	llama.cpp/tools/mtmd/models/hunyuanocr.cpp \
 	llama.cpp/tools/mtmd/models/internvl.cpp \
 	llama.cpp/tools/mtmd/models/kimik25.cpp \
 	llama.cpp/tools/mtmd/models/kimivl.cpp \
@@ -289,6 +297,7 @@ MTMD_SRCS_CPP := \
 	llama.cpp/tools/mtmd/models/qwen2vl.cpp \
 	llama.cpp/tools/mtmd/models/qwen3vl.cpp \
 	llama.cpp/tools/mtmd/models/siglip.cpp \
+	llama.cpp/tools/mtmd/models/step3vl.cpp \
 	llama.cpp/tools/mtmd/models/whisper-enc.cpp \
 	llama.cpp/tools/mtmd/models/youtuvl.cpp
 
@@ -316,7 +325,9 @@ o/$(MODE)/llama.cpp/tools/server/%.hpp: llama.cpp/tools/server/public/%
 	@echo 'unsigned int $(VARNAME)_len = sizeof($(VARNAME));' >> $@
 
 SERVER_ASSETS := \
-	o/$(MODE)/llama.cpp/tools/server/index.html.gz.hpp \
+	o/$(MODE)/llama.cpp/tools/server/index.html.hpp \
+	o/$(MODE)/llama.cpp/tools/server/bundle.js.hpp \
+	o/$(MODE)/llama.cpp/tools/server/bundle.css.hpp \
 	o/$(MODE)/llama.cpp/tools/server/loading.html.hpp
 
 # ==============================================================================
@@ -336,7 +347,8 @@ TOOL_SERVER_SRCS := \
 	llama.cpp/tools/server/server-http.cpp \
 	llama.cpp/tools/server/server-models.cpp \
 	llama.cpp/tools/server/server-queue.cpp \
-	llama.cpp/tools/server/server-task.cpp
+	llama.cpp/tools/server/server-task.cpp \
+	llama.cpp/tools/server/server-tools.cpp
 
 # Tool object files
 TOOL_QUANTIZE_OBJS := $(TOOL_QUANTIZE_SRCS:%.cpp=o/$(MODE)/%.cpp.o)
@@ -373,8 +385,9 @@ $(TOOL_PERPLEXITY_OBJS) $(TOOL_BENCH_OBJS) $(TOOL_SERVER_OBJS) $(MTMD_OBJS): \
 		-iquote o/$(MODE)/llama.cpp/tools/server \
 		-isystem llama.cpp/vendor
 
-# Server needs llamafile headers for Metal support
+# Server needs llamafile headers for Metal support; LLAMA_BUILD_WEBUI is defined for the web UI
 $(TOOL_SERVER_OBJS): private CPPFLAGS += -iquote llamafile
+$(TOOL_SERVER_OBJS): private CCFLAGS += -DLLAMA_BUILD_WEBUI
 
 # Version definitions
 $(GGML_OBJS): private CCFLAGS += \
 
@@ -1,7 +1,7 @@
 diff --git a/common/arg.cpp b/common/arg.cpp
 --- a/llama.cpp/common/arg.cpp
 +++ b/llama.cpp/common/arg.cpp
-@@ -36,6 +36,8 @@
+@@ -37,6 +37,8 @@
  #ifndef __EMSCRIPTEN__
  #ifdef __linux__
  #include <linux/limits.h>
 
@@ -1,7 +1,7 @@
 diff --git a/common/common.cpp b/common/common.cpp
 --- a/llama.cpp/common/common.cpp
 +++ b/llama.cpp/common/common.cpp
-@@ -874,6 +874,16 @@ std::string fs_get_cache_directory() {
+@@ -970,6 +970,16 @@ std::string fs_get_cache_directory() {
          cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");
  #elif defined(_WIN32)
          cache_directory = std::getenv("LOCALAPPDATA");
@@ -18,7 +18,7 @@ diff --git a/common/common.cpp b/common/common.cpp
  #elif defined(__EMSCRIPTEN__)
          GGML_ABORT("not implemented on this platform");
  #else
-@@ -1050,10 +1060,31 @@ common_init_result::common_init_result(common_params & params) :
+@@ -1146,10 +1156,31 @@ common_init_result::common_init_result(common_params & params) :
 
      if (params.fit_params) {
          LOG_INF("%s: fitting params to device memory, for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on\n", __func__);
 
@@ -1,7 +1,7 @@
 diff --git a/common/download.cpp b/common/download.cpp
 --- a/llama.cpp/common/download.cpp
 +++ b/llama.cpp/common/download.cpp
-@@ -24,6 +24,8 @@
+@@ -25,6 +25,8 @@
  #ifndef __EMSCRIPTEN__
  #ifdef __linux__
  #include <linux/limits.h>
 
@@ -1,7 +1,7 @@
 diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
 --- a/llama.cpp/ggml/src/ggml-backend-reg.cpp
 +++ b/llama.cpp/ggml/src/ggml-backend-reg.cpp
-@@ -478,7 +478,7 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
+@@ -485,7 +485,7 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
              if (ec) {
                  GGML_LOG_DEBUG("%s: posix_stat(%s) failure, error-message: %s\n", __func__, path_str(search_path).c_str(), ec.message().c_str());
              } else {
 
@@ -1,7 +1,7 @@
 diff --git a/ggml/src/ggml-cpu/repack.cpp b/ggml/src/ggml-cpu/repack.cpp
 --- a/llama.cpp/ggml/src/ggml-cpu/repack.cpp
 +++ b/llama.cpp/ggml/src/ggml-cpu/repack.cpp
-@@ -3521,14 +3521,14 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
+@@ -4723,14 +4723,14 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
      return nullptr;
  }
 
@@ -18,7 +18,7 @@ diff --git a/ggml/src/ggml-cpu/repack.cpp b/ggml/src/ggml-cpu/repack.cpp
                                                         const void * data, size_t offset, size_t size) {
      GGML_ASSERT(offset == 0);
      GGML_ASSERT(size == ggml_nbytes(tensor));
-@@ -3540,13 +3540,13 @@ static void ggml_backend_cpu_repack_buffer_set_tensor(ggml_backend_buffer_t buff
+@@ -4742,13 +4742,13 @@ static void ggml_backend_cpu_repack_buffer_set_tensor(ggml_backend_buffer_t buff
      GGML_UNUSED(buffer);
  }
 
@@ -34,7 +34,7 @@ diff --git a/ggml/src/ggml-cpu/repack.cpp b/ggml/src/ggml-cpu/repack.cpp
      ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
 
      if (buffer == nullptr) {
-@@ -3561,7 +3561,7 @@ static ggml_backend_buffer_t ggml_backend_cpu_repack_buffer_type_alloc_buffer(gg
+@@ -4763,7 +4763,7 @@ static ggml_backend_buffer_t ggml_backend_cpu_repack_buffer_type_alloc_buffer(gg
      return buffer;
  }