Skip to content

Commit bec4d28

Browse files
authored
Update llama.cpp submodule to 7b8443ac7 (#951)
* Update llama.cpp submodule to 7b8443ac7

* Update llama.cpp patches and BUILD.mk for 7b8443ac7

  Refresh the patches against the new submodule head and add a new ggml-backend-meta.cpp patch to annotate its callbacks with GGML_CALL, matching the existing buffer/device/backend interface struct typedefs.

  Update BUILD.mk for upstream file additions/renames: common/fit.cpp, ggml-backend-meta.cpp, server-chat.cpp, four new mtmd models, and the llama-iswa/t5-dec/t5-enc renames. Add a private CPPFLAGS rule so the single-prefix build-info.cpp.o (built directly by tests) can find the new build-info.h header. Add server-chat.cpp.o to llamafile main exe deps.

* Add ggml-backend-meta.cpp to GPU runtime build scripts

  Upstream's ggml-backend.cpp now references ggml_backend_buffer_is_meta (lines 133 and 2006) and ggml-alloc.c references ggml_backend_buft_is_meta (line 1240). Both functions are defined in the new ggml-backend-meta.cpp, which upstream made part of ggml-base. Without this, the runtime-built GPU DSOs (ggml-cuda.so/.dll, ggml-rocm.dll, ggml-vulkan.dll) and the on-the-fly Metal dylib build would link with undefined references.

  Updated:
  - llamafile/build-functions.sh (Linux CUDA + ROCm via cuda.sh / rocm.sh)
  - llamafile/cuda.bat, llamafile/cuda_parallel.bat
  - llamafile/rocm.bat, llamafile/rocm_parallel.bat
  - llamafile/vulkan.bat
  - llamafile/metal.c (yoink + extracted-files map + compile list)
  - llamafile/BUILD.mk (add ggml-backend-meta.cpp.zip.o to LLAMAFILE_METAL_SOURCES)

* Bundle ggml-cpp.h for Metal runtime compile

  ggml-backend-meta.cpp #includes "ggml-cpp.h", which wasn't in the bundle because no previously bundled source needed it. Without this, on macOS the on-the-fly Metal dylib compile fails with:

      ~/.llamafile/v/X.Y.Z/ggml-backend-meta.cpp:6:10: fatal error: 'ggml-cpp.h' file not found

  Yoink + extract-map + LLAMAFILE_METAL_SOURCES updated.
* vulkan.sh: probe and require spirv-headers explicitly

  Since llama.cpp PR #21572, ggml-vulkan.cpp #includes a SPIR-V header to emit OpCapability/OpExtension/OpExecutionMode in compiled shaders. The script previously only passed -I for ggml include paths, relying on the default compiler search path. When spirv-headers isn't installed, the build fails deep in the source with cryptic "'spv' is not a class or namespace" errors instead of a clear missing-dependency message.

  Probe the same cascade ggml-vulkan.cpp uses (plus VULKAN_SDK) and pass the matching -I. Fail early with install instructions otherwise.

  Also add spirv-headers to the glslc-not-found install hints, since the two are typically needed together.
1 parent 8105c87 commit bec4d28

31 files changed

Lines changed: 542 additions & 185 deletions

llama.cpp

llama.cpp.patches/llamafile-files/BUILD.mk

Lines changed: 14 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -24,6 +24,7 @@ GGML_SRCS_C := \
2424

2525
GGML_SRCS_CPP := \
2626
llama.cpp/ggml/src/ggml-backend-dl.cpp \
27+
llama.cpp/ggml/src/ggml-backend-meta.cpp \
2728
llama.cpp/ggml/src/ggml-backend-reg.cpp \
2829
llama.cpp/ggml/src/ggml-backend.cpp \
2930
llama.cpp/ggml/src/ggml-opt.cpp \
@@ -108,7 +109,7 @@ LLAMA_SRCS_CPP := \
108109
llama.cpp/src/models/lfm2.cpp \
109110
llama.cpp/src/models/llada-moe.cpp \
110111
llama.cpp/src/models/llada.cpp \
111-
llama.cpp/src/models/llama-iswa.cpp \
112+
llama.cpp/src/models/llama4.cpp \
112113
llama.cpp/src/models/llama.cpp \
113114
llama.cpp/src/models/maincoder.cpp \
114115
llama.cpp/src/models/mamba.cpp \
@@ -160,8 +161,8 @@ LLAMA_SRCS_CPP := \
160161
llama.cpp/src/models/starcoder.cpp \
161162
llama.cpp/src/models/step35-iswa.cpp \
162163
llama.cpp/src/models/starcoder2.cpp \
163-
llama.cpp/src/models/t5-dec.cpp \
164-
llama.cpp/src/models/t5-enc.cpp \
164+
llama.cpp/src/models/t5.cpp \
165+
llama.cpp/src/models/t5encoder.cpp \
165166
llama.cpp/src/models/wavtokenizer-dec.cpp \
166167
llama.cpp/src/models/xverse.cpp \
167168
llama.cpp/src/llama-adapter.cpp \
@@ -208,6 +209,7 @@ COMMON_SRCS_CPP := \
208209
llama.cpp/common/console.cpp \
209210
llama.cpp/common/debug.cpp \
210211
llama.cpp/common/download.cpp \
212+
llama.cpp/common/fit.cpp \
211213
llama.cpp/common/hf-cache.cpp \
212214
llama.cpp/common/jinja/caps.cpp \
213215
llama.cpp/common/jinja/lexer.cpp \
@@ -249,6 +251,10 @@ COMMON_SRCS_CPP += o/$(MODE)/llama.cpp/common/build-info.cpp
249251

250252
COMMON_OBJS := $(COMMON_SRCS_CPP:%.cpp=o/$(MODE)/%.cpp.o)
251253

254+
# build-info.cpp #includes "build-info.h" from llama.cpp/common; tests build the
255+
# single-prefix object directly via the generic rule, so add the include path.
256+
o/$(MODE)/llama.cpp/common/build-info.cpp.o: private CPPFLAGS += -iquote llama.cpp/common
257+
252258
# ==============================================================================
253259
# Additional support files
254260
# ==============================================================================
@@ -281,6 +287,8 @@ MTMD_SRCS_CPP := \
281287
llama.cpp/tools/mtmd/models/cogvlm.cpp \
282288
llama.cpp/tools/mtmd/models/deepseekocr.cpp \
283289
llama.cpp/tools/mtmd/models/conformer.cpp \
290+
llama.cpp/tools/mtmd/models/dotsocr.cpp \
291+
llama.cpp/tools/mtmd/models/gemma4a.cpp \
284292
llama.cpp/tools/mtmd/models/gemma4v.cpp \
285293
llama.cpp/tools/mtmd/models/glm4v.cpp \
286294
llama.cpp/tools/mtmd/models/hunyuanocr.cpp \
@@ -295,10 +303,12 @@ MTMD_SRCS_CPP := \
295303
llama.cpp/tools/mtmd/models/paddleocr.cpp \
296304
llama.cpp/tools/mtmd/models/pixtral.cpp \
297305
llama.cpp/tools/mtmd/models/qwen2vl.cpp \
306+
llama.cpp/tools/mtmd/models/qwen3a.cpp \
298307
llama.cpp/tools/mtmd/models/qwen3vl.cpp \
299308
llama.cpp/tools/mtmd/models/siglip.cpp \
300309
llama.cpp/tools/mtmd/models/step3vl.cpp \
301310
llama.cpp/tools/mtmd/models/whisper-enc.cpp \
311+
llama.cpp/tools/mtmd/models/yasa2.cpp \
302312
llama.cpp/tools/mtmd/models/youtuvl.cpp
303313

304314
MTMD_OBJS := $(MTMD_SRCS_CPP:%.cpp=o/$(MODE)/%.cpp.o)
@@ -342,6 +352,7 @@ TOOL_BENCH_SRCS := llama.cpp/tools/llama-bench/llama-bench.cpp
342352

343353
TOOL_SERVER_SRCS := \
344354
llama.cpp/tools/server/server.cpp \
355+
llama.cpp/tools/server/server-chat.cpp \
345356
llama.cpp/tools/server/server-common.cpp \
346357
llama.cpp/tools/server/server-context.cpp \
347358
llama.cpp/tools/server/server-http.cpp \

llama.cpp.patches/patches/common_arg.cpp.patch

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
diff --git a/common/arg.cpp b/common/arg.cpp
22
--- a/llama.cpp/common/arg.cpp
33
+++ b/llama.cpp/common/arg.cpp
4-
@@ -37,6 +37,8 @@
4+
@@ -38,6 +38,8 @@
55
#ifndef __EMSCRIPTEN__
66
#ifdef __linux__
77
#include <linux/limits.h>

llama.cpp.patches/patches/common_common.cpp.patch

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
diff --git a/common/common.cpp b/common/common.cpp
22
--- a/llama.cpp/common/common.cpp
33
+++ b/llama.cpp/common/common.cpp
4-
@@ -970,6 +970,16 @@ std::string fs_get_cache_directory() {
4+
@@ -972,6 +972,16 @@ std::string fs_get_cache_directory() {
55
cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");
66
#elif defined(_WIN32)
77
cache_directory = std::getenv("LOCALAPPDATA");
@@ -18,7 +18,7 @@ diff --git a/common/common.cpp b/common/common.cpp
1818
#elif defined(__EMSCRIPTEN__)
1919
GGML_ABORT("not implemented on this platform");
2020
#else
21-
@@ -1146,10 +1156,31 @@ common_init_result::common_init_result(common_params & params) :
21+
@@ -1148,10 +1158,31 @@ common_init_result::common_init_result(common_params & params) :
2222

2323
if (params.fit_params) {
2424
LOG_INF("%s: fitting params to device memory, for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on\n", __func__);
@@ -43,7 +43,7 @@ diff --git a/common/common.cpp b/common/common.cpp
4343
+ }
4444
+ }
4545
+
46-
llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
46+
common_fit_params(params.model.path.c_str(), &mparams, &cparams,
4747
params.tensor_split,
4848
params.tensor_buft_overrides.data(),
4949
- params.fit_params_target.data(),

llama.cpp.patches/patches/common_download.cpp.patch

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
diff --git a/common/download.cpp b/common/download.cpp
22
--- a/llama.cpp/common/download.cpp
33
+++ b/llama.cpp/common/download.cpp
4-
@@ -25,6 +25,8 @@
4+
@@ -26,6 +26,8 @@
55
#ifndef __EMSCRIPTEN__
66
#ifdef __linux__
77
#include <linux/limits.h>

llama.cpp.patches/patches/common_log.cpp.patch

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,7 @@ diff --git a/common/log.cpp b/common/log.cpp
99
#endif // defined(_WIN32)
1010

1111
int common_log_verbosity_thold = LOG_DEFAULT_LLAMA;
12-
@@ -257,10 +258,27 @@ public:
12+
@@ -261,10 +262,27 @@ public:
1313
running = true;
1414

1515
thrd = std::thread([this]() {

llama.cpp.patches/patches/ggml_include_ggml-backend.h.patch

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -18,10 +18,10 @@ diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
1818
#ifdef __cplusplus
1919
extern "C" {
2020
#endif
21-
@@ -197,19 +207,19 @@ extern "C" {
22-
// Common functions that may be obtained using ggml_backend_reg_get_proc_address
21+
@@ -208,19 +218,19 @@ extern "C" {
22+
typedef bool (*ggml_backend_comm_allreduce_tensor_t)(void * comm_ctx, struct ggml_tensor ** tensors);
2323

24-
// Split buffer type for tensor parallelism
24+
// Split buffer type for tensor parallelism (old)
2525
- typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(int main_device, const float * tensor_split);
2626
+ typedef ggml_backend_buffer_type_t (GGML_CALL *ggml_backend_split_buffer_type_t)(int main_device, const float * tensor_split);
2727
// Set the number of threads for the backend

llama.cpp.patches/patches/ggml_include_ggml-cuda.h.patch

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,16 +1,16 @@
11
diff --git a/ggml/include/ggml-cuda.h b/ggml/include/ggml-cuda.h
22
--- a/llama.cpp/ggml/include/ggml-cuda.h
33
+++ b/llama.cpp/ggml/include/ggml-cuda.h
4-
@@ -28,7 +28,7 @@ GGML_BACKEND_API bool ggml_backend_is_cuda(ggml_backend_t backend);
5-
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
4+
@@ -31,7 +31,7 @@ GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int de
5+
GGML_BACKEND_API bool ggml_backend_cuda_allreduce_tensor(ggml_backend_t * backends, struct ggml_tensor ** tensors, size_t n_backends);
66

77
// split tensor buffer that splits matrices by rows across multiple devices
88
-GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split);
99
+GGML_BACKEND_API ggml_backend_buffer_type_t GGML_CALL ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split);
1010

1111
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
1212
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
13-
@@ -37,8 +37,8 @@ GGML_BACKEND_API int ggml_backend_cuda_get_device_count(void);
13+
@@ -40,8 +40,8 @@ GGML_BACKEND_API int ggml_backend_cuda_get_device_count(void);
1414
GGML_BACKEND_API void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
1515
GGML_BACKEND_API void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
1616

llama.cpp.patches/patches/ggml_src_ggml-backend-impl.h.patch

Lines changed: 18 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -25,7 +25,7 @@ diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h
2525
};
2626

2727
struct ggml_backend_buffer_type {
28-
@@ -39,22 +39,26 @@ extern "C" {
28+
@@ -39,26 +39,30 @@ extern "C" {
2929
//
3030

3131
struct ggml_backend_buffer_i {
@@ -46,6 +46,12 @@ diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h
4646
+ void (GGML_CALL *memset_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);
4747
+ void (GGML_CALL *set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
4848
+ void (GGML_CALL *get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
49+
// (optional) 2d data copies
50+
- void (*set_tensor_2d)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data);
51+
- void (*get_tensor_2d)(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data);
52+
+ void (GGML_CALL *set_tensor_2d)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data);
53+
+ void (GGML_CALL *get_tensor_2d)(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data);
54+
4955
// (optional) tensor copy: dst is in the buffer, src may be in any buffer, including buffers from a different backend (return false if not supported)
5056
- bool (*cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst);
5157
+ bool (GGML_CALL *cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst);
@@ -62,7 +68,7 @@ diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h
6268
};
6369

6470
struct ggml_backend_buffer {
65-
@@ -85,38 +89,38 @@ extern "C" {
71+
@@ -103,40 +107,40 @@ extern "C" {
6672
//
6773

6874
struct ggml_backend_i {
@@ -73,11 +79,15 @@ diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h
7379
+ void (GGML_CALL *free)(ggml_backend_t backend);
7480

7581
// (optional) asynchronous tensor data access
76-
- void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
77-
- void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
82+
- void (*set_tensor_async) (ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
83+
- void (*get_tensor_async) (ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
84+
- void (*set_tensor_2d_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data);
85+
- void (*get_tensor_2d_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data);
7886
- bool (*cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
79-
+ void (GGML_CALL *set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
80-
+ void (GGML_CALL *get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
87+
+ void (GGML_CALL *set_tensor_async) (ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
88+
+ void (GGML_CALL *get_tensor_async) (ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
89+
+ void (GGML_CALL *set_tensor_2d_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data);
90+
+ void (GGML_CALL *get_tensor_2d_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data);
8191
+ bool (GGML_CALL *cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
8292

8393
// (optional) complete all pending operations (required if the backend supports async operations)
@@ -115,7 +125,7 @@ diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h
115125
};
116126

117127
struct ggml_backend {
118-
@@ -139,46 +143,46 @@ extern "C" {
128+
@@ -159,46 +163,46 @@ extern "C" {
119129
// the current functions to obtain the properties can remain, since they are more convenient for often used properties
120130
struct ggml_backend_device_i {
121131
// device name: short identifier for this device, such as "CPU" or "CUDA0"
@@ -177,7 +187,7 @@ diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h
177187
};
178188

179189
struct ggml_backend_device {
180-
@@ -192,15 +196,15 @@ extern "C" {
190+
@@ -212,15 +216,15 @@ extern "C" {
181191
//
182192

183193
struct ggml_backend_reg_i {

0 commit comments

Comments
 (0)