From debd8a3455077d05a5d98a3041f26067bc2cb4d9 Mon Sep 17 00:00:00 2001 From: Davide Eynard Date: Wed, 8 Apr 2026 19:50:06 +0200 Subject: [PATCH 1/7] CUDA13 fixes, find vcvarsall.bat automatically --- llamafile/cuda.bat | 24 ++++++++++++++++++++++-- llamafile/cuda_parallel.bat | 25 ++++++++++++++++++++++--- llamafile/rocm.bat | 21 ++++++++++++++++++++- llamafile/rocm_parallel.bat | 21 ++++++++++++++++++++- llamafile/vulkan.bat | 19 +++++++++++++++---- 5 files changed, 99 insertions(+), 11 deletions(-) diff --git a/llamafile/cuda.bat b/llamafile/cuda.bat index 32055b9666..7168d9759c 100644 --- a/llamafile/cuda.bat +++ b/llamafile/cuda.bat @@ -36,6 +36,25 @@ echo Unknown option: %~1 exit /b 1 :done_args +:: -------- find Visual Studio / Build Tools -------- +where cl >nul 2>&1 +if errorlevel 1 ( + set "VSWHERE=%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" + if not exist "!VSWHERE!" ( + echo Error: cl.exe not found in PATH and vswhere.exe not found + echo Please run from a Visual Studio Developer Command Prompt + exit /b 1 + ) + for /f "usebackq tokens=*" %%i in (`"!VSWHERE!" 
-latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath`) do ( + set "VS_PATH=%%i" + ) + if not defined VS_PATH ( + echo Error: Visual Studio with C++ tools not found + exit /b 1 + ) + call "!VS_PATH!\VC\Auxiliary\Build\vcvarsall.bat" x64 +) + set "LLAMA_CPP_DIR=%REPO_DIR%\llama.cpp" set "GGML_CUDA_DIR=%LLAMA_CPP_DIR%\ggml\src\ggml-cuda" set "GGML_SRC_DIR=%LLAMA_CPP_DIR%\ggml\src" @@ -115,7 +134,8 @@ if "%USE_CUBLAS%"=="0" set "COMMON_FLAGS=%COMMON_FLAGS% -I%BUILD_DIR%" set "COMMON_FLAGS=%COMMON_FLAGS% -I%GGML_INC_DIR% -I%GGML_SRC_DIR% -I%GGML_CUDA_DIR%" set "COMMON_FLAGS=%COMMON_FLAGS% --forward-unknown-to-host-compiler" set "COMMON_FLAGS=%COMMON_FLAGS% --std=c++17" -set "COMMON_FLAGS=%COMMON_FLAGS% -Xcompiler="/nologo /EHsc /O2 /GR /MT /std:c++17"" +set "COMMON_FLAGS=%COMMON_FLAGS% -Xcompiler="/nologo /EHsc /O2 /GR /MT /std:c++17 /Zc:preprocessor"" +set "COMMON_FLAGS=%COMMON_FLAGS% -diag-suppress 177 -diag-suppress 221 -diag-suppress 550" set "COMMON_FLAGS=%COMMON_FLAGS% -DNDEBUG -DGGML_BUILD=1 -DGGML_SHARED=1 -DGGML_BACKEND_SHARED=1 -DGGML_BACKEND_BUILD=1 -DGGML_MULTIPLATFORM" set "COMMON_FLAGS=%COMMON_FLAGS% %BLAS_DEFINE%" @@ -209,7 +229,7 @@ echo. :: -------- compile core GGML sources with host compiler -------- echo Compiling core GGML sources... 
-set "HOST_FLAGS=/nologo /EHsc /O2 /GR /MT /DNDEBUG" +set "HOST_FLAGS=/nologo /EHsc /O2 /GR /MT /Zc:preprocessor /DNDEBUG" set "HOST_FLAGS=%HOST_FLAGS% /DGGML_BUILD=1 /DGGML_SHARED=1 /DGGML_BACKEND_SHARED=1 /DGGML_BACKEND_BUILD=1 /DGGML_MULTIPLATFORM" set "HOST_FLAGS=%HOST_FLAGS% /DGGML_VERSION=\"!GGML_VERSION!\" /DGGML_COMMIT=\"!GGML_COMMIT!\"" set "HOST_FLAGS=%HOST_FLAGS% /I"%GGML_INC_DIR%" /I"%GGML_SRC_DIR%"" diff --git a/llamafile/cuda_parallel.bat b/llamafile/cuda_parallel.bat index 67905b163c..50ed849326 100644 --- a/llamafile/cuda_parallel.bat +++ b/llamafile/cuda_parallel.bat @@ -47,6 +47,25 @@ echo Unknown option: %~1 exit /b 1 :done_args +:: -------- find Visual Studio / Build Tools -------- +where cl >nul 2>&1 +if errorlevel 1 ( + set "VSWHERE=%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" + if not exist "!VSWHERE!" ( + echo Error: cl.exe not found in PATH and vswhere.exe not found + echo Please run from a Visual Studio Developer Command Prompt + exit /b 1 + ) + for /f "usebackq tokens=*" %%i in (`"!VSWHERE!" 
-latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath`) do ( + set "VS_PATH=%%i" + ) + if not defined VS_PATH ( + echo Error: Visual Studio with C++ tools not found + exit /b 1 + ) + call "!VS_PATH!\VC\Auxiliary\Build\vcvarsall.bat" x64 +) + set "LLAMA_CPP_DIR=%REPO_DIR%\llama.cpp" set "GGML_CUDA_DIR=%LLAMA_CPP_DIR%\ggml\src\ggml-cuda" set "GGML_SRC_DIR=%LLAMA_CPP_DIR%\ggml\src" @@ -137,8 +156,8 @@ if "%USE_CUBLAS%"=="0" set "COMMON_FLAGS=%COMMON_FLAGS% -I%BUILD_DIR%" set "COMMON_FLAGS=%COMMON_FLAGS% -I%GGML_INC_DIR% -I%GGML_SRC_DIR% -I%GGML_CUDA_DIR%" set "COMMON_FLAGS=%COMMON_FLAGS% --forward-unknown-to-host-compiler" set "COMMON_FLAGS=%COMMON_FLAGS% --std=c++17" -set "COMMON_FLAGS=%COMMON_FLAGS% -Xcompiler="/nologo /EHsc /O2 /GR /MT /std:c++17"" -set "COMMON_FLAGS=%COMMON_FLAGS% -diag-suppress 177 -diag-suppress 221" +set "COMMON_FLAGS=%COMMON_FLAGS% -Xcompiler="/nologo /EHsc /O2 /GR /MT /std:c++17 /Zc:preprocessor"" +set "COMMON_FLAGS=%COMMON_FLAGS% -diag-suppress 177 -diag-suppress 221 -diag-suppress 550" set "COMMON_FLAGS=%COMMON_FLAGS% -DNDEBUG -DGGML_BUILD=1 -DGGML_SHARED=1 -DGGML_BACKEND_SHARED=1 -DGGML_BACKEND_BUILD=1 -DGGML_MULTIPLATFORM" set "COMMON_FLAGS=%COMMON_FLAGS% %BLAS_DEFINE%" @@ -229,7 +248,7 @@ echo. :: -------- compile core GGML sources with host compiler -------- echo Compiling core GGML sources... 
-set "HOST_FLAGS=/nologo /EHsc /O2 /GR /MT /DNDEBUG" +set "HOST_FLAGS=/nologo /EHsc /O2 /GR /MT /Zc:preprocessor /DNDEBUG" set "HOST_FLAGS=%HOST_FLAGS% /DGGML_BUILD=1 /DGGML_SHARED=1 /DGGML_BACKEND_SHARED=1 /DGGML_BACKEND_BUILD=1 /DGGML_MULTIPLATFORM" set "HOST_FLAGS=%HOST_FLAGS% /DGGML_VERSION=\"!GGML_VERSION!\" /DGGML_COMMIT=\"!GGML_COMMIT!\"" set "HOST_FLAGS=%HOST_FLAGS% /I"%GGML_INC_DIR%" /I"%GGML_SRC_DIR%"" diff --git a/llamafile/rocm.bat b/llamafile/rocm.bat index db51fe2a97..c136a82a2b 100644 --- a/llamafile/rocm.bat +++ b/llamafile/rocm.bat @@ -33,6 +33,25 @@ echo Unknown option: %~1 exit /b 1 :done_args +:: -------- find Visual Studio / Build Tools -------- +where cl >nul 2>&1 +if errorlevel 1 ( + set "VSWHERE=%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" + if not exist "!VSWHERE!" ( + echo Error: cl.exe not found in PATH and vswhere.exe not found + echo Please run from a Visual Studio Developer Command Prompt + exit /b 1 + ) + for /f "usebackq tokens=*" %%i in (`"!VSWHERE!" -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath`) do ( + set "VS_PATH=%%i" + ) + if not defined VS_PATH ( + echo Error: Visual Studio with C++ tools not found + exit /b 1 + ) + call "!VS_PATH!\VC\Auxiliary\Build\vcvarsall.bat" x64 +) + set "LLAMA_CPP_DIR=%REPO_DIR%\llama.cpp" set "GGML_CUDA_DIR=%LLAMA_CPP_DIR%\ggml\src\ggml-cuda" set "GGML_SRC_DIR=%LLAMA_CPP_DIR%\ggml\src" @@ -196,7 +215,7 @@ echo. :: -------- compile core GGML sources with host compiler -------- echo Compiling core GGML sources... 
-set "HOST_FLAGS=/nologo /EHsc /O2 /GR /MT /DNDEBUG" +set "HOST_FLAGS=/nologo /EHsc /O2 /GR /MT /Zc:preprocessor /DNDEBUG" set "HOST_FLAGS=%HOST_FLAGS% /DGGML_BUILD=1 /DGGML_SHARED=1 /DGGML_BACKEND_SHARED=1 /DGGML_BACKEND_BUILD=1 /DGGML_MULTIPLATFORM" set "HOST_FLAGS=%HOST_FLAGS% /DGGML_VERSION=\"!GGML_VERSION!\" /DGGML_COMMIT=\"!GGML_COMMIT!\"" set "HOST_FLAGS=%HOST_FLAGS% /I"%GGML_INC_DIR%" /I"%GGML_SRC_DIR%"" diff --git a/llamafile/rocm_parallel.bat b/llamafile/rocm_parallel.bat index fdeda1ffcb..ee1c441cb2 100644 --- a/llamafile/rocm_parallel.bat +++ b/llamafile/rocm_parallel.bat @@ -44,6 +44,25 @@ echo Unknown option: %~1 exit /b 1 :done_args +:: -------- find Visual Studio / Build Tools -------- +where cl >nul 2>&1 +if errorlevel 1 ( + set "VSWHERE=%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" + if not exist "!VSWHERE!" ( + echo Error: cl.exe not found in PATH and vswhere.exe not found + echo Please run from a Visual Studio Developer Command Prompt + exit /b 1 + ) + for /f "usebackq tokens=*" %%i in (`"!VSWHERE!" -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath`) do ( + set "VS_PATH=%%i" + ) + if not defined VS_PATH ( + echo Error: Visual Studio with C++ tools not found + exit /b 1 + ) + call "!VS_PATH!\VC\Auxiliary\Build\vcvarsall.bat" x64 +) + set "LLAMA_CPP_DIR=%REPO_DIR%\llama.cpp" set "GGML_CUDA_DIR=%LLAMA_CPP_DIR%\ggml\src\ggml-cuda" set "GGML_SRC_DIR=%LLAMA_CPP_DIR%\ggml\src" @@ -215,7 +234,7 @@ echo. :: -------- compile core GGML sources with host compiler -------- echo Compiling core GGML sources... 
-set "HOST_FLAGS=/nologo /EHsc /O2 /GR /MT /DNDEBUG" +set "HOST_FLAGS=/nologo /EHsc /O2 /GR /MT /Zc:preprocessor /DNDEBUG" set "HOST_FLAGS=%HOST_FLAGS% /DGGML_BUILD=1 /DGGML_SHARED=1 /DGGML_BACKEND_SHARED=1 /DGGML_BACKEND_BUILD=1 /DGGML_MULTIPLATFORM" set "HOST_FLAGS=%HOST_FLAGS% /DGGML_VERSION=\"!GGML_VERSION!\" /DGGML_COMMIT=\"!GGML_COMMIT!\"" set "HOST_FLAGS=%HOST_FLAGS% /I"%GGML_INC_DIR%" /I"%GGML_SRC_DIR%"" diff --git a/llamafile/vulkan.bat b/llamafile/vulkan.bat index 8feead1546..d7dc46ed8c 100644 --- a/llamafile/vulkan.bat +++ b/llamafile/vulkan.bat @@ -93,12 +93,23 @@ if not exist "%VULKAN_SDK%\Lib\vulkan-1.lib" ( exit /b 1 ) -:: -------- check MSVC -------- +:: -------- find Visual Studio / Build Tools -------- where cl >nul 2>&1 if errorlevel 1 ( - echo Error: cl.exe not found in PATH - echo Please run from a Visual Studio Developer Command Prompt - exit /b 1 + set "VSWHERE=%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" + if not exist "!VSWHERE!" ( + echo Error: cl.exe not found in PATH and vswhere.exe not found + echo Please run from a Visual Studio Developer Command Prompt + exit /b 1 + ) + for /f "usebackq tokens=*" %%i in (`"!VSWHERE!" 
-latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath`) do ( + set "VS_PATH=%%i" + ) + if not defined VS_PATH ( + echo Error: Visual Studio with C++ tools not found + exit /b 1 + ) + call "!VS_PATH!\VC\Auxiliary\Build\vcvarsall.bat" x64 ) :: -------- build parallel job runner -------- From ace7f5176ae16c916a2d499d4663f0f724f0a941 Mon Sep 17 00:00:00 2001 From: Davide Eynard Date: Thu, 9 Apr 2026 13:14:16 +0200 Subject: [PATCH 2/7] Got to a working (albeit slow) vulkan version on CUDA13 --- ...ggml_src_ggml-vulkan_ggml-vulkan.cpp.patch | 100 +++++++++++++----- llamafile/vulkan.bat | 4 +- 2 files changed, 74 insertions(+), 30 deletions(-) diff --git a/llama.cpp.patches/patches/ggml_src_ggml-vulkan_ggml-vulkan.cpp.patch b/llama.cpp.patches/patches/ggml_src_ggml-vulkan_ggml-vulkan.cpp.patch index 71210c635d..a9fd9bc47a 100644 --- a/llama.cpp.patches/patches/ggml_src_ggml-vulkan_ggml-vulkan.cpp.patch +++ b/llama.cpp.patches/patches/ggml_src_ggml-vulkan_ggml-vulkan.cpp.patch @@ -27,7 +27,51 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu static VkDeviceSize ggml_vk_get_max_buffer_range(const ggml_backend_vk_context * ctx, const vk_buffer &buf, const VkDeviceSize offset) { const VkDeviceSize range = std::min(VkDeviceSize{buf->size - offset}, -@@ -13222,20 +13222,28 @@ static bool ggml_backend_buffer_is_vk(ggml_backend_buffer_t buffer) { +@@ -3341,20 +3341,9 @@ static void ggml_vk_load_shaders(vk_device& device) { + if (!pipeline->needed || pipeline->compiled) { + continue; + } +- // TODO: We're no longer benefitting from the async compiles (shaders are +- // compiled individually, as needed) and this complexity can be removed. 
+- { +- // wait until fewer than N compiles are in progress +- uint32_t N = std::max(1u, std::thread::hardware_concurrency()); +- std::unique_lock guard(compile_count_mutex); +- while (compile_count >= N) { +- compile_count_cond.wait(guard); +- } +- compile_count++; +- } +- +- compiles.push_back(std::async(ggml_vk_create_pipeline_func, std::ref(device), std::ref(pipeline), spv_size, spv_data, entrypoint, +- parameter_count, wg_denoms, specialization_constants, disable_robustness, require_full_subgroups, required_subgroup_size)); ++ // Compile synchronously to avoid threading issues in cross-module DLL loading ++ ggml_vk_create_pipeline_func(device, pipeline, spv_size, spv_data, entrypoint, ++ parameter_count, wg_denoms, specialization_constants, disable_robustness, require_full_subgroups, required_subgroup_size); + } + }; + +@@ -7606,10 +7595,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context& + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; + +- VK_LOG_DEBUG("ggml_vk_mul_mat_vec_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; +- std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; +- std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3]; +- std::cerr << ")),)"); ++ 
VK_LOG_DEBUG("ggml_vk_mul_mat_vec_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3] << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3] << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << ")),)"); + GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16); // NOLINT + GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT + +@@ -8030,6 +8016,7 @@ static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context& subctx, c + ggml_tensor * dst = cgraph->nodes[node_idx]; + ggml_tensor * src0 = dst->src[0]; + ggml_tensor * src1 = dst->src[1]; ++ (long long)dst->ne[0], (long long)dst->ne[1], (long long)dst->ne[2], (long long)dst->ne[3], + VK_LOG_DEBUG("ggml_vk_mul_mat(" << src0 << ", " << src1 << ", " << dst << ")"); + + // Handle huge A matrix by splitting the M dimensions. 
This works well for convolution use cases +@@ -13222,20 +13209,28 @@ static bool ggml_backend_buffer_is_vk(ggml_backend_buffer_t buffer) { return buffer->buft->iface.get_name == ggml_backend_vk_buffer_type_name; } @@ -59,7 +103,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu VK_LOG_DEBUG("ggml_backend_vk_buffer_init_tensor(" << buffer << " (" << buffer->context << "), " << tensor << ")"); if (tensor->view_src != nullptr) { GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft); -@@ -13243,7 +13251,7 @@ static enum ggml_status ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t +@@ -13243,7 +13238,7 @@ static enum ggml_status ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t return GGML_STATUS_SUCCESS; } @@ -68,7 +112,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu VK_LOG_DEBUG("ggml_backend_vk_buffer_memset_tensor(" << buffer << ", " << tensor << ", " << value << ", " << offset << ", " << size << ")"); ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)buffer->context; vk_buffer buf = buf_ctx->dev_buffer; -@@ -13252,7 +13260,7 @@ static void ggml_backend_vk_buffer_memset_tensor(ggml_backend_buffer_t buffer, g +@@ -13252,7 +13247,7 @@ static void ggml_backend_vk_buffer_memset_tensor(ggml_backend_buffer_t buffer, g ggml_vk_buffer_memset(buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, val32, size); } @@ -77,7 +121,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu VK_LOG_DEBUG("ggml_backend_vk_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")"); ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)buffer->context; vk_buffer buf = buf_ctx->dev_buffer; -@@ -13260,7 +13268,7 @@ static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml +@@ -13260,7 +13255,7 @@ static void 
ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml ggml_vk_buffer_write(buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, size); } @@ -86,7 +130,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu VK_LOG_DEBUG("ggml_backend_vk_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")"); ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)buffer->context; -@@ -13269,7 +13277,7 @@ static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t buffer, cons +@@ -13269,7 +13264,7 @@ static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t buffer, cons ggml_vk_buffer_read(buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, size); } @@ -95,7 +139,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu if (ggml_backend_buffer_is_vk(src->buffer)) { ggml_backend_vk_buffer_context * src_buf_ctx = (ggml_backend_vk_buffer_context *)src->buffer->context; ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context; -@@ -13286,7 +13294,7 @@ static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t buffer, cons +@@ -13286,7 +13281,7 @@ static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t buffer, cons UNUSED(buffer); } @@ -104,7 +148,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context; ggml_vk_buffer_memset(ctx->dev_buffer, 0, value, buffer->size); -@@ -13302,16 +13310,17 @@ static ggml_backend_buffer_i ggml_backend_vk_buffer_interface = { +@@ -13302,16 +13297,17 @@ static ggml_backend_buffer_i ggml_backend_vk_buffer_interface = { /* .cpy_tensor = */ ggml_backend_vk_buffer_cpy_tensor, /* .clear = */ ggml_backend_vk_buffer_clear, /* .reset = */ NULL, @@ -124,7 +168,7 @@ diff --git 
a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu VK_LOG_MEMORY("ggml_backend_vk_buffer_type_alloc_buffer(" << size << ")"); ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context; -@@ -13327,17 +13336,17 @@ static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(ggml_backe +@@ -13327,17 +13323,17 @@ static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(ggml_backe return ggml_backend_buffer_init(buft, ggml_backend_vk_buffer_interface, bufctx, size); } @@ -145,7 +189,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu return ggml_nbytes(tensor); UNUSED(buft); -@@ -13355,24 +13364,24 @@ ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num) { +@@ -13355,24 +13351,24 @@ ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num) { // host buffer type @@ -174,7 +218,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu VK_LOG_MEMORY("ggml_backend_vk_host_buffer_type_alloc_buffer(" << size << ")"); size += 32; // Behave like the CPU buffer type -@@ -13388,19 +13397,20 @@ static ggml_backend_buffer_t ggml_backend_vk_host_buffer_type_alloc_buffer(ggml_ +@@ -13388,19 +13384,20 @@ static ggml_backend_buffer_t ggml_backend_vk_host_buffer_type_alloc_buffer(ggml_ ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size); buffer->buft = buft; buffer->iface.free_buffer = ggml_backend_vk_host_buffer_free_buffer; @@ -197,7 +241,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu return vk_instance.devices[0]->suballocation_block_size; UNUSED(buft); -@@ -13432,13 +13442,13 @@ ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() { +@@ -13432,13 +13429,13 @@ ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() { // backend @@ -213,7 +257,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu 
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; VK_LOG_DEBUG("ggml_backend_vk_free(" << ctx->name << ")"); -@@ -13454,7 +13464,7 @@ static ggml_backend_buffer_type_t ggml_backend_vk_get_default_buffer_type(ggml_b +@@ -13454,7 +13451,7 @@ static ggml_backend_buffer_type_t ggml_backend_vk_get_default_buffer_type(ggml_b return &ctx->device->buffer_type; } @@ -222,7 +266,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu VK_LOG_DEBUG("ggml_backend_vk_set_tensor_async(" << size << ")"); ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_get_default_buffer_type(backend) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type"); -@@ -13497,7 +13507,7 @@ static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor +@@ -13497,7 +13494,7 @@ static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor } } @@ -231,7 +275,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu VK_LOG_DEBUG("ggml_backend_vk_get_tensor_async(" << size << ")"); ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_get_default_buffer_type(backend) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type"); -@@ -13527,7 +13537,7 @@ static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_ +@@ -13527,7 +13524,7 @@ static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_ } } @@ -240,7 +284,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu VK_LOG_DEBUG("ggml_backend_vk_cpy_tensor_async()"); ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend_dst->context; -@@ -13639,7 +13649,7 @@ static void ggml_vk_synchronize(ggml_backend_vk_context * ctx) { +@@ 
-13639,7 +13636,7 @@ static void ggml_vk_synchronize(ggml_backend_vk_context * ctx) { } } @@ -249,7 +293,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu VK_LOG_DEBUG("ggml_backend_vk_synchronize()"); ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; -@@ -14061,7 +14071,7 @@ static int32_t find_first_set(uint32_t x) { +@@ -14061,7 +14058,7 @@ static int32_t find_first_set(uint32_t x) { return ret; } @@ -258,7 +302,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu VK_LOG_DEBUG("ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)"); ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; -@@ -14441,7 +14451,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg +@@ -14441,7 +14438,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg } // Sort the graph for improved parallelism. @@ -267,7 +311,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu { VK_LOG_DEBUG("ggml_vk_graph_optimize(" << graph->n_nodes << " nodes)"); ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; -@@ -14679,7 +14689,7 @@ static void ggml_vk_graph_optimize(ggml_backend_t backend, struct ggml_cgraph * +@@ -14679,7 +14676,7 @@ static void ggml_vk_graph_optimize(ggml_backend_t backend, struct ggml_cgraph * } } @@ -276,7 +320,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu VK_LOG_DEBUG("ggml_backend_vk_event_record(backend=" << backend << ", event=" << event << ")"); ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; vk_event *vkev = (vk_event *)event->context; -@@ -14702,7 +14712,7 @@ static void ggml_backend_vk_event_record(ggml_backend_t backend, ggml_backend_ev +@@ -14702,7 +14699,7 @@ static void ggml_backend_vk_event_record(ggml_backend_t backend, ggml_backend_ev 
ctx->compute_ctx.reset(); } @@ -285,7 +329,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu VK_LOG_DEBUG("ggml_backend_vk_event_wait(backend=" << backend << ", event=" << event << ")"); ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; vk_event *vkev = (vk_event *)event->context; -@@ -14796,7 +14806,10 @@ void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total +@@ -14796,7 +14793,10 @@ void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total *total += heap.size; if (membudget_supported && i < budgetprops.heapUsage.size()) { @@ -297,7 +341,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu } else { *free += heap.size; } -@@ -14864,38 +14877,38 @@ struct ggml_backend_vk_device_context { +@@ -14864,38 +14864,38 @@ struct ggml_backend_vk_device_context { int op_offload_min_batch_size; }; @@ -343,7 +387,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context; props->name = ggml_backend_vk_device_get_name(dev); -@@ -14911,13 +14924,13 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml +@@ -14911,13 +14911,13 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml }; } @@ -359,7 +403,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context; const vk_device& device = ggml_vk_get_device(ctx->device); -@@ -15445,7 +15458,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm +@@ -15445,7 +15445,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm UNUSED(dev); } @@ -368,7 +412,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu if 
(buft->iface.get_name != ggml_backend_vk_buffer_type_name) { return false; } -@@ -15471,13 +15484,13 @@ static int64_t ggml_vk_get_op_batch_size(const ggml_tensor * op) { +@@ -15471,13 +15471,13 @@ static int64_t ggml_vk_get_op_batch_size(const ggml_tensor * op) { } } @@ -384,7 +428,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context; auto device = ggml_vk_get_device(ctx->device); -@@ -15497,7 +15510,7 @@ static ggml_backend_event_t ggml_backend_vk_device_event_new(ggml_backend_dev_t +@@ -15497,7 +15497,7 @@ static ggml_backend_event_t ggml_backend_vk_device_event_new(ggml_backend_dev_t }; } @@ -393,7 +437,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context; auto device = ggml_vk_get_device(ctx->device); -@@ -15509,7 +15522,7 @@ static void ggml_backend_vk_device_event_free(ggml_backend_dev_t dev, ggml_backe +@@ -15509,7 +15509,7 @@ static void ggml_backend_vk_device_event_free(ggml_backend_dev_t dev, ggml_backe delete event; } @@ -402,7 +446,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu VK_LOG_DEBUG("ggml_backend_vk_device_event_synchronize(backend=" << dev << ", event=" << event << ")"); ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context; auto device = ggml_vk_get_device(ctx->device); -@@ -15543,7 +15556,7 @@ static vk_buffer ggml_vk_buffer_from_host_ptr(vk_device & device, void * ptr, si +@@ -15543,7 +15543,7 @@ static vk_buffer ggml_vk_buffer_from_host_ptr(vk_device & device, void * ptr, si return buf; } @@ -411,7 +455,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu VK_LOG_DEBUG("ggml_backend_vk_device_buffer_from_host_ptr(backend=" << dev << ", ptr=" << ptr << ", size=" << size << ")"); 
GGML_UNUSED(max_tensor_size); -@@ -15581,17 +15594,17 @@ static const struct ggml_backend_device_i ggml_backend_vk_device_i = { +@@ -15581,17 +15581,17 @@ static const struct ggml_backend_device_i ggml_backend_vk_device_i = { /* .event_synchronize = */ ggml_backend_vk_device_event_synchronize, }; diff --git a/llamafile/vulkan.bat b/llamafile/vulkan.bat index d7dc46ed8c..4cb9d19d8f 100644 --- a/llamafile/vulkan.bat +++ b/llamafile/vulkan.bat @@ -221,7 +221,7 @@ echo. :: ======================================================================== echo Phase 4: Compiling shader C++ files... -set "CXX_FLAGS=/c /nologo /EHsc /O2 /GR /MT /std:c++17" +set "CXX_FLAGS=/c /nologo /EHsc /O2 /GR /MT /std:c++17 /Zc:preprocessor" set "CXX_FLAGS=%CXX_FLAGS% /I"%GGML_INC_DIR%" /I"%GGML_SRC_DIR%" /I"%BUILD_DIR%"" set "CXX_FLAGS=%CXX_FLAGS% /DNDEBUG /DGGML_BUILD=1 /DGGML_SHARED=1 /DGGML_BACKEND_SHARED=1 /DGGML_BACKEND_BUILD=1 /DGGML_MULTIPLATFORM" @@ -272,7 +272,7 @@ echo. :: ======================================================================== echo Phase 6: Compiling core GGML sources... 
-set "HOST_FLAGS=/nologo /EHsc /O2 /GR /MT /DNDEBUG" +set "HOST_FLAGS=/nologo /EHsc /O2 /GR /MT /Zc:preprocessor /DNDEBUG" set "HOST_FLAGS=%HOST_FLAGS% /DGGML_BUILD=1 /DGGML_SHARED=1 /DGGML_BACKEND_SHARED=1 /DGGML_BACKEND_BUILD=1 /DGGML_MULTIPLATFORM" set "HOST_FLAGS=%HOST_FLAGS% /DGGML_VERSION=\"!GGML_VERSION!\" /DGGML_COMMIT=\"!GGML_COMMIT!\"" set "HOST_FLAGS=%HOST_FLAGS% /I"%GGML_INC_DIR%" /I"%GGML_SRC_DIR%"" From 53d5c73b761bdeeeaadb76f73558bf753913685f Mon Sep 17 00:00:00 2001 From: Davide Eynard Date: Fri, 10 Apr 2026 21:45:27 +0100 Subject: [PATCH 3/7] Re-added synchronous compilation after merge --- ...ggml_src_ggml-vulkan_ggml-vulkan.cpp.patch | 80 ++++++++++++------- 1 file changed, 52 insertions(+), 28 deletions(-) diff --git a/llama.cpp.patches/patches/ggml_src_ggml-vulkan_ggml-vulkan.cpp.patch b/llama.cpp.patches/patches/ggml_src_ggml-vulkan_ggml-vulkan.cpp.patch index 79d955f4b0..0aa8e007ad 100644 --- a/llama.cpp.patches/patches/ggml_src_ggml-vulkan_ggml-vulkan.cpp.patch +++ b/llama.cpp.patches/patches/ggml_src_ggml-vulkan_ggml-vulkan.cpp.patch @@ -27,7 +27,31 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu static VkDeviceSize ggml_vk_get_max_buffer_range(const ggml_backend_vk_context * ctx, const vk_buffer &buf, const VkDeviceSize offset) { const VkDeviceSize range = std::min(VkDeviceSize{buf->size - offset}, -@@ -13428,20 +13428,28 @@ static bool ggml_backend_buffer_is_vk(ggml_backend_buffer_t buffer) { +@@ -3391,20 +3391,9 @@ static void ggml_vk_load_shaders(vk_device& device) { + if (!pipeline->needed || pipeline->compiled) { + continue; + } +- // TODO: We're no longer benefitting from the async compiles (shaders are +- // compiled individually, as needed) and this complexity can be removed. 
+- { +- // wait until fewer than N compiles are in progress +- uint32_t N = std::max(1u, std::thread::hardware_concurrency()); +- std::unique_lock guard(compile_count_mutex); +- while (compile_count >= N) { +- compile_count_cond.wait(guard); +- } +- compile_count++; +- } +- +- compiles.push_back(std::async(ggml_vk_create_pipeline_func, std::ref(device), std::ref(pipeline), spv_size, spv_data, entrypoint, +- parameter_count, wg_denoms, specialization_constants, disable_robustness, require_full_subgroups, required_subgroup_size)); ++ // Compile synchronously to avoid threading issues in cross-module DLL loading ++ ggml_vk_create_pipeline_func(device, pipeline, spv_size, spv_data, entrypoint, ++ parameter_count, wg_denoms, specialization_constants, disable_robustness, require_full_subgroups, required_subgroup_size); + } + }; + +@@ -13428,20 +13417,28 @@ static bool ggml_backend_buffer_is_vk(ggml_backend_buffer_t buffer) { return buffer->buft->iface.get_name == ggml_backend_vk_buffer_type_name; } @@ -59,7 +83,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu VK_LOG_DEBUG("ggml_backend_vk_buffer_init_tensor(" << buffer << " (" << buffer->context << "), " << tensor << ")"); if (tensor->view_src != nullptr) { GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft); -@@ -13449,7 +13457,7 @@ static enum ggml_status ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t +@@ -13449,7 +13446,7 @@ static enum ggml_status ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t return GGML_STATUS_SUCCESS; } @@ -68,7 +92,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu VK_LOG_DEBUG("ggml_backend_vk_buffer_memset_tensor(" << buffer << ", " << tensor << ", " << value << ", " << offset << ", " << size << ")"); ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)buffer->context; vk_buffer buf = buf_ctx->dev_buffer; -@@ -13462,7 +13470,7 @@ static void 
ggml_backend_vk_buffer_memset_tensor(ggml_backend_buffer_t buffer, g +@@ -13462,7 +13459,7 @@ static void ggml_backend_vk_buffer_memset_tensor(ggml_backend_buffer_t buffer, g ggml_vk_buffer_memset(buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, val32, size); } @@ -77,7 +101,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu VK_LOG_DEBUG("ggml_backend_vk_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")"); ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)buffer->context; vk_buffer buf = buf_ctx->dev_buffer; -@@ -13474,7 +13482,7 @@ static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml +@@ -13474,7 +13471,7 @@ static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml ggml_vk_buffer_write(buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, size); } @@ -86,7 +110,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu VK_LOG_DEBUG("ggml_backend_vk_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")"); ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)buffer->context; -@@ -13487,7 +13495,7 @@ static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t buffer, cons +@@ -13487,7 +13484,7 @@ static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t buffer, cons ggml_vk_buffer_read(buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, size); } @@ -95,7 +119,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu if (ggml_nbytes(src) == 0) { return true; } -@@ -13508,7 +13516,7 @@ static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t buffer, cons +@@ -13508,7 +13505,7 @@ static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t buffer, cons UNUSED(buffer); } @@ -104,7 +128,7 @@ diff --git 
a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context; ggml_vk_buffer_memset(ctx->dev_buffer, 0, value, buffer->size); -@@ -13524,16 +13532,17 @@ static ggml_backend_buffer_i ggml_backend_vk_buffer_interface = { +@@ -13524,16 +13521,17 @@ static ggml_backend_buffer_i ggml_backend_vk_buffer_interface = { /* .cpy_tensor = */ ggml_backend_vk_buffer_cpy_tensor, /* .clear = */ ggml_backend_vk_buffer_clear, /* .reset = */ NULL, @@ -124,7 +148,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu VK_LOG_MEMORY("ggml_backend_vk_buffer_type_alloc_buffer(" << size << ")"); ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context; -@@ -13549,17 +13558,17 @@ static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(ggml_backe +@@ -13549,17 +13547,17 @@ static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(ggml_backe return ggml_backend_buffer_init(buft, ggml_backend_vk_buffer_interface, bufctx, size); } @@ -145,7 +169,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu return ggml_nbytes(tensor); UNUSED(buft); -@@ -13577,24 +13586,24 @@ ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num) { +@@ -13577,24 +13575,24 @@ ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num) { // host buffer type @@ -174,7 +198,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu VK_LOG_MEMORY("ggml_backend_vk_host_buffer_type_alloc_buffer(" << size << ")"); size += 32; // Behave like the CPU buffer type -@@ -13610,19 +13619,20 @@ static ggml_backend_buffer_t ggml_backend_vk_host_buffer_type_alloc_buffer(ggml_ +@@ -13610,19 +13608,20 @@ static ggml_backend_buffer_t ggml_backend_vk_host_buffer_type_alloc_buffer(ggml_ ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size); 
buffer->buft = buft; buffer->iface.free_buffer = ggml_backend_vk_host_buffer_free_buffer; @@ -197,7 +221,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu return vk_instance.devices[0]->suballocation_block_size; UNUSED(buft); -@@ -13654,13 +13664,13 @@ ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() { +@@ -13654,13 +13653,13 @@ ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() { // backend @@ -213,7 +237,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; VK_LOG_DEBUG("ggml_backend_vk_free(" << ctx->name << ")"); -@@ -13676,7 +13686,7 @@ static ggml_backend_buffer_type_t ggml_backend_vk_get_default_buffer_type(ggml_b +@@ -13676,7 +13675,7 @@ static ggml_backend_buffer_type_t ggml_backend_vk_get_default_buffer_type(ggml_b return &ctx->device->buffer_type; } @@ -222,7 +246,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu VK_LOG_DEBUG("ggml_backend_vk_set_tensor_async(" << size << ")"); ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_get_default_buffer_type(backend) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type"); -@@ -13723,7 +13733,7 @@ static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor +@@ -13723,7 +13722,7 @@ static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor } } @@ -231,7 +255,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu VK_LOG_DEBUG("ggml_backend_vk_get_tensor_async(" << size << ")"); ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_get_default_buffer_type(backend) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer 
type"); -@@ -13757,7 +13767,7 @@ static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_ +@@ -13757,7 +13756,7 @@ static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_ } } @@ -240,7 +264,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu VK_LOG_DEBUG("ggml_backend_vk_cpy_tensor_async(" << src << " -> " << dst << ", size=" << ggml_nbytes(src) << ")"); ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend_dst->context; -@@ -13882,7 +13892,7 @@ static void ggml_vk_synchronize(ggml_backend_vk_context * ctx) { +@@ -13882,7 +13881,7 @@ static void ggml_vk_synchronize(ggml_backend_vk_context * ctx) { } } @@ -249,7 +273,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu VK_LOG_DEBUG("ggml_backend_vk_synchronize()"); ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; -@@ -14304,7 +14314,7 @@ static int32_t find_first_set(uint32_t x) { +@@ -14304,7 +14303,7 @@ static int32_t find_first_set(uint32_t x) { return ret; } @@ -258,7 +282,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu VK_LOG_DEBUG("ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)"); ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; -@@ -14684,7 +14694,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg +@@ -14684,7 +14683,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg } // Sort the graph for improved parallelism. 
@@ -267,7 +291,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu { VK_LOG_DEBUG("ggml_vk_graph_optimize(" << graph->n_nodes << " nodes)"); ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; -@@ -14922,7 +14932,7 @@ static void ggml_vk_graph_optimize(ggml_backend_t backend, struct ggml_cgraph * +@@ -14922,7 +14921,7 @@ static void ggml_vk_graph_optimize(ggml_backend_t backend, struct ggml_cgraph * } } @@ -276,7 +300,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu VK_LOG_DEBUG("ggml_backend_vk_event_record(backend=" << backend << ", event=" << event << ")"); ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; vk_event *vkev = (vk_event *)event->context; -@@ -14960,7 +14970,7 @@ static void ggml_backend_vk_event_record(ggml_backend_t backend, ggml_backend_ev +@@ -14960,7 +14959,7 @@ static void ggml_backend_vk_event_record(ggml_backend_t backend, ggml_backend_ev ctx->compute_ctx.reset(); } @@ -285,7 +309,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu VK_LOG_DEBUG("ggml_backend_vk_event_wait(backend=" << backend << ", event=" << event << ")"); ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; vk_event *vkev = (vk_event *)event->context; -@@ -15055,7 +15065,10 @@ void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total +@@ -15055,7 +15054,10 @@ void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total *total += heap.size; if (membudget_supported && i < budgetprops.heapUsage.size()) { @@ -297,7 +321,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu } else { *free += heap.size; } -@@ -15123,38 +15136,38 @@ struct ggml_backend_vk_device_context { +@@ -15123,38 +15125,38 @@ struct ggml_backend_vk_device_context { int op_offload_min_batch_size; }; @@ -343,7 +367,7 @@ diff --git 
a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context; props->name = ggml_backend_vk_device_get_name(dev); -@@ -15170,13 +15183,13 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml +@@ -15170,13 +15172,13 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml }; } @@ -359,7 +383,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context; const vk_device& device = ggml_vk_get_device(ctx->device); -@@ -15714,7 +15727,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm +@@ -15714,7 +15716,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm UNUSED(dev); } @@ -368,7 +392,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu if (buft->iface.get_name != ggml_backend_vk_buffer_type_name) { return false; } -@@ -15740,13 +15753,13 @@ static int64_t ggml_vk_get_op_batch_size(const ggml_tensor * op) { +@@ -15740,13 +15742,13 @@ static int64_t ggml_vk_get_op_batch_size(const ggml_tensor * op) { } } @@ -384,7 +408,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context; auto device = ggml_vk_get_device(ctx->device); -@@ -15769,7 +15782,7 @@ static ggml_backend_event_t ggml_backend_vk_device_event_new(ggml_backend_dev_t +@@ -15769,7 +15771,7 @@ static ggml_backend_event_t ggml_backend_vk_device_event_new(ggml_backend_dev_t }; } @@ -393,7 +417,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context; auto device = ggml_vk_get_device(ctx->device); -@@ -15789,7 +15802,7 @@ static void 
ggml_backend_vk_device_event_free(ggml_backend_dev_t dev, ggml_backe +@@ -15789,7 +15791,7 @@ static void ggml_backend_vk_device_event_free(ggml_backend_dev_t dev, ggml_backe delete event; } @@ -402,7 +426,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu VK_LOG_DEBUG("ggml_backend_vk_device_event_synchronize(backend=" << dev << ", event=" << event << ")"); ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context; auto device = ggml_vk_get_device(ctx->device); -@@ -15846,7 +15859,7 @@ static vk_buffer ggml_vk_buffer_from_host_ptr(vk_device & device, void * ptr, si +@@ -15846,7 +15848,7 @@ static vk_buffer ggml_vk_buffer_from_host_ptr(vk_device & device, void * ptr, si return buf; } @@ -411,7 +435,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu VK_LOG_DEBUG("ggml_backend_vk_device_buffer_from_host_ptr(backend=" << dev << ", ptr=" << ptr << ", size=" << size << ")"); GGML_UNUSED(max_tensor_size); -@@ -15884,17 +15897,17 @@ static const struct ggml_backend_device_i ggml_backend_vk_device_i = { +@@ -15884,17 +15886,17 @@ static const struct ggml_backend_device_i ggml_backend_vk_device_i = { /* .event_synchronize = */ ggml_backend_vk_device_event_synchronize, }; From 84290caabffd0e466676cfc4e53e52f0a8e31705 Mon Sep 17 00:00:00 2001 From: Davide Eynard Date: Fri, 17 Apr 2026 09:28:31 +0000 Subject: [PATCH 4/7] Cosmetic fix to patch --- .../patches/ggml_src_ggml-vulkan_ggml-vulkan.cpp.patch | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama.cpp.patches/patches/ggml_src_ggml-vulkan_ggml-vulkan.cpp.patch b/llama.cpp.patches/patches/ggml_src_ggml-vulkan_ggml-vulkan.cpp.patch index 0aa8e007ad..eccd9238ba 100644 --- a/llama.cpp.patches/patches/ggml_src_ggml-vulkan_ggml-vulkan.cpp.patch +++ b/llama.cpp.patches/patches/ggml_src_ggml-vulkan_ggml-vulkan.cpp.patch @@ -47,7 +47,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp 
b/ggml/src/ggml-vulkan/ggml-vu - parameter_count, wg_denoms, specialization_constants, disable_robustness, require_full_subgroups, required_subgroup_size)); + // Compile synchronously to avoid threading issues in cross-module DLL loading + ggml_vk_create_pipeline_func(device, pipeline, spv_size, spv_data, entrypoint, -+ parameter_count, wg_denoms, specialization_constants, disable_robustness, require_full_subgroups, required_subgroup_size); ++ parameter_count, wg_denoms, specialization_constants, disable_robustness, require_full_subgroups, required_subgroup_size); } }; From ec56f4146b0a0c0cbd3ae7fd3b3d2207b88f4701 Mon Sep 17 00:00:00 2001 From: Davide Eynard Date: Fri, 17 Apr 2026 10:51:01 +0000 Subject: [PATCH 5/7] Rewrote info logs for consistency / to work with --verbose --- llamafile/cuda.c | 41 +++++++++++++++++++--------------------- llamafile/llamafile.c | 24 +++++++++++++++++------ llamafile/llamafile.h | 7 +++++++ llamafile/metal.c | 44 +++++++++++++++++++++---------------------- llamafile/vulkan.c | 35 ++++++++++++++++------------------ 5 files changed, 82 insertions(+), 69 deletions(-) diff --git a/llamafile/cuda.c b/llamafile/cuda.c index c6e4297ebd..e2fc017fa2 100644 --- a/llamafile/cuda.c +++ b/llamafile/cuda.c @@ -97,7 +97,8 @@ static bool LinkCuda(const char *dso) { void *lib = cosmo_dlopen(dso, RTLD_LAZY); if (!lib) { char *err = cosmo_dlerror(); - fprintf(stderr, "cuda: %s: failed to load library\n", err ? err : "unknown error"); + llamafile_info("cuda", "failed to load library %s: %s", + dso, err ? err : "unknown error"); return false; } @@ -142,7 +143,8 @@ static bool LinkCuda(const char *dso) { if (!ok) { char *err = cosmo_dlerror(); - fprintf(stderr, "cuda: %s: not all symbols could be imported\n", err ? err : "unknown error"); + llamafile_info("cuda", "could not import all symbols from %s: %s", + dso, err ? 
err : "unknown error"); memset(&g_cuda.backend_init, 0, sizeof(g_cuda.backend_init)); memset(&g_cuda.backend_reg, 0, sizeof(g_cuda.backend_reg)); memset(&g_cuda.get_device_count, 0, sizeof(g_cuda.get_device_count)); @@ -197,12 +199,10 @@ static bool ImportCudaImpl(void) { } // No pre-built DSO found - if (FLAG_verbose) { - fprintf(stderr, "cuda: no pre-built GPU library found\n"); - fprintf(stderr, "cuda: to enable GPU support, build with:\n"); - fprintf(stderr, "cuda: llamafile/cuda.sh (for NVIDIA)\n"); - fprintf(stderr, "cuda: llamafile/rocm.sh (for AMD)\n"); - } + llamafile_info("cuda", "no pre-built GPU library found"); + llamafile_info("cuda", "to enable GPU support, build with:"); + llamafile_info("cuda", " llamafile/cuda.sh (for NVIDIA)"); + llamafile_info("cuda", " llamafile/rocm.sh (for AMD)"); return false; RegisterBackend: @@ -225,9 +225,8 @@ static bool ImportCudaImpl(void) { reg = g_cuda.backend_reg.default_abi(); if (reg) { ggml_backend_register(reg); - if (FLAG_verbose) - fprintf(stderr, "cuda: %s backend registered with GGML\n", - g_cuda.is_amd ? "ROCm" : "CUDA"); + llamafile_info("cuda", "%s backend registered with GGML", + g_cuda.is_amd ? "ROCm" : "CUDA"); } } @@ -237,17 +236,15 @@ static bool ImportCudaImpl(void) { static void ImportCuda(void) { if (ImportCudaImpl()) { g_cuda.supported = true; - if (FLAG_verbose) { - fprintf(stderr, "cuda: %s GPU support successfully loaded\n", - g_cuda.is_amd ? "AMD ROCm" : "NVIDIA CUDA"); - if (g_cuda.get_device_count.default_abi || g_cuda.get_device_count.windows_abi) { - int count; - if (IsWindows()) - count = g_cuda.get_device_count.windows_abi(); - else - count = g_cuda.get_device_count.default_abi(); - fprintf(stderr, "cuda: found %d GPU device(s)\n", count); - } + llamafile_info("cuda", "%s GPU support successfully loaded", + g_cuda.is_amd ? 
"AMD ROCm" : "NVIDIA CUDA"); + if (g_cuda.get_device_count.default_abi || g_cuda.get_device_count.windows_abi) { + int count; + if (IsWindows()) + count = g_cuda.get_device_count.windows_abi(); + else + count = g_cuda.get_device_count.default_abi(); + llamafile_info("cuda", "found %d GPU device(s)", count); } } else if (FLAG_gpu == LLAMAFILE_GPU_NVIDIA || FLAG_gpu == LLAMAFILE_GPU_AMD) { fprintf(stderr, "fatal error: support for --gpu %s was explicitly requested, " diff --git a/llamafile/llamafile.c b/llamafile/llamafile.c index 348045e213..d29e551c24 100644 --- a/llamafile/llamafile.c +++ b/llamafile/llamafile.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -730,9 +731,9 @@ bool llamafile_try_load_prebuilt_dso(const char *name, const char *backend_name, break; } + llamafile_info(backend_name, "probing library %s (bundled)", extracted); if (link_fn(extracted)) { - if (FLAG_verbose) - fprintf(stderr, "%s: loaded bundled %s\n", backend_name, name); + llamafile_info(backend_name, "loaded bundled library %s", extracted); return true; } } @@ -741,9 +742,9 @@ bool llamafile_try_load_prebuilt_dso(const char *name, const char *backend_name, llamafile_get_app_dir(app_dir, PATH_MAX); snprintf(dso, PATH_MAX, "%s%s", app_dir, name); if (llamafile_file_exists(dso)) { + llamafile_info(backend_name, "probing library %s (app directory)", dso); if (link_fn(dso)) { - if (FLAG_verbose) - fprintf(stderr, "%s: loaded %s from app directory\n", backend_name, name); + llamafile_info(backend_name, "loaded library %s from app directory", dso); return true; } } @@ -753,9 +754,9 @@ bool llamafile_try_load_prebuilt_dso(const char *name, const char *backend_name, if (home && *home) { snprintf(dso, PATH_MAX, "%s/%s", home, name); if (llamafile_file_exists(dso)) { + llamafile_info(backend_name, "probing library %s (home directory)", dso); if (link_fn(dso)) { - if (FLAG_verbose) - fprintf(stderr, "%s: loaded %s from home directory\n", backend_name, name); + 
llamafile_info(backend_name, "loaded library %s from home directory", dso); return true; } } @@ -774,6 +775,17 @@ void llamafile_log_callback_null(int level, const char *text, void *user_data) { (void)user_data; } +void llamafile_info(const char *backend, const char *fmt, ...) { + if (!FLAG_verbose) + return; + fprintf(stderr, "%s: INFO: ", backend); + va_list ap; + va_start(ap, fmt); + vfprintf(stderr, fmt, ap); + va_end(ap); + fputc('\n', stderr); +} + // ============================================================================== // GPU support // ============================================================================== diff --git a/llamafile/llamafile.h b/llamafile/llamafile.h index 534c0f1a4f..b3b45410c5 100644 --- a/llamafile/llamafile.h +++ b/llamafile/llamafile.h @@ -120,6 +120,13 @@ typedef void (*llamafile_log_callback)(int level, const char *text, void *user_d // No-op log callback to disable logging (defined in llamafile.c) void llamafile_log_callback_null(int level, const char *text, void *user_data); +// Print an INFO-level diagnostic tagged with a backend name. +// No-op unless FLAG_verbose is set. Adds the ": INFO: " prefix +// and a trailing newline, so callers pass only the message body. +// Defined in llamafile.c. +void llamafile_info(const char *backend, const char *fmt, ...) 
+ __attribute__((format(printf, 2, 3))); + // Set logging callback for Metal dylib (defined in metal.c) // Pass a no-op callback to disable logging void llamafile_metal_log_set(llamafile_log_callback log_callback, void *user_data); diff --git a/llamafile/metal.c b/llamafile/metal.c index 68ae1fa397..778ef1b02f 100644 --- a/llamafile/metal.c +++ b/llamafile/metal.c @@ -276,8 +276,7 @@ static bool PreprocessMetalShader(const char *app_dir) { free(impl_content); free(metal_content); - if (FLAG_verbose) - fprintf(stderr, "metal: preprocessed %s\n", metal_path); + llamafile_info("metal", "preprocessed %s", metal_path); return true; } @@ -293,8 +292,7 @@ static bool BuildMetal(const char *dso) { // Since we use versioned paths, source updates come with new versions struct stat dso_stat; if (stat(dso, &dso_stat) == 0 && !FLAG_recompile) { - if (FLAG_verbose) - fprintf(stderr, "metal: using cached %s\n", dso); + llamafile_info("metal", "using cached %s", dso); return true; } @@ -359,8 +357,7 @@ static bool BuildMetal(const char *dso) { // Compile dynamic shared object if (needs_rebuild || FLAG_recompile) { - if (FLAG_verbose) - fprintf(stderr, "metal: building ggml-metal.dylib with xcode...\n"); + llamafile_info("metal", "building ggml-metal.dylib with xcode..."); char tmpdso[PATH_MAX]; snprintf(tmpdso, PATH_MAX, "%s.XXXXXX", dso); @@ -435,10 +432,12 @@ static bool BuildMetal(const char *dso) { args[argc] = NULL; if (FLAG_verbose) { - fprintf(stderr, "metal: executing: cc"); - for (int j = 1; args[j]; j++) - fprintf(stderr, " %s", args[j]); - fprintf(stderr, "\n"); + char cmd[4096]; + size_t off = 0; + off += snprintf(cmd + off, sizeof(cmd) - off, "executing: cc"); + for (int j = 1; args[j] && off < sizeof(cmd); j++) + off += snprintf(cmd + off, sizeof(cmd) - off, " %s", args[j]); + llamafile_info("metal", "%s", cmd); } int pid, ws; @@ -499,10 +498,12 @@ static bool BuildMetal(const char *dso) { args[argc] = NULL; if (FLAG_verbose) { - fprintf(stderr, "metal: executing: 
cc"); - for (int j = 1; args[j]; j++) - fprintf(stderr, " %s", args[j]); - fprintf(stderr, "\n"); + char cmd[4096]; + size_t off = 0; + off += snprintf(cmd + off, sizeof(cmd) - off, "executing: cc"); + for (int j = 1; args[j] && off < sizeof(cmd); j++) + off += snprintf(cmd + off, sizeof(cmd) - off, " %s", args[j]); + llamafile_info("metal", "%s", cmd); } int pid, ws; @@ -538,8 +539,7 @@ static bool BuildMetal(const char *dso) { return false; } - if (FLAG_verbose) - fprintf(stderr, "metal: successfully built %s\n", dso); + llamafile_info("metal", "successfully built %s", dso); } return true; @@ -550,7 +550,8 @@ static bool LinkMetal(const char *dso) { void *lib = cosmo_dlopen(dso, RTLD_LAZY); if (!lib) { char *err = cosmo_dlerror(); - fprintf(stderr, "metal: %s: failed to load library\n", err ? err : "unknown error"); + llamafile_info("metal", "failed to load library %s: %s", + dso, err ? err : "unknown error"); return false; } @@ -570,7 +571,8 @@ static bool LinkMetal(const char *dso) { if (!ok) { char *err = cosmo_dlerror(); - fprintf(stderr, "metal: %s: not all symbols could be imported\n", err ? err : "unknown error"); + llamafile_info("metal", "could not import all symbols from %s: %s", + dso, err ? 
err : "unknown error"); cosmo_dlclose(lib); return false; } @@ -617,8 +619,7 @@ static bool ImportMetalImpl(void) { ggml_backend_reg_t reg = g_metal.backend_metal_reg(); if (reg) { ggml_backend_register(reg); - if (FLAG_verbose) - fprintf(stderr, "metal: Metal backend registered with GGML\n"); + llamafile_info("metal", "Metal backend registered with GGML"); } } return true; @@ -630,8 +631,7 @@ static bool ImportMetalImpl(void) { static void ImportMetal(void) { if (ImportMetalImpl()) { g_metal.supported = true; - if (FLAG_verbose) - fprintf(stderr, "metal: Apple Metal GPU support successfully loaded\n"); + llamafile_info("metal", "Apple Metal GPU support successfully loaded"); } else if (FLAG_gpu == LLAMAFILE_GPU_APPLE) { fprintf(stderr, "fatal error: support for --gpu %s was explicitly requested, " "but it wasn't available\n", llamafile_describe_gpu()); diff --git a/llamafile/vulkan.c b/llamafile/vulkan.c index 386657d4d8..91d4a2fcdb 100644 --- a/llamafile/vulkan.c +++ b/llamafile/vulkan.c @@ -93,7 +93,8 @@ static bool LinkVulkan(const char *dso) { void *lib = cosmo_dlopen(dso, RTLD_LAZY); if (!lib) { char *err = cosmo_dlerror(); - fprintf(stderr, "vulkan: %s: failed to load library\n", err ? err : "unknown error"); + llamafile_info("vulkan", "failed to load library %s: %s", + dso, err ? err : "unknown error"); return false; } @@ -138,7 +139,8 @@ static bool LinkVulkan(const char *dso) { if (!ok) { char *err = cosmo_dlerror(); - fprintf(stderr, "vulkan: %s: not all symbols could be imported\n", err ? err : "unknown error"); + llamafile_info("vulkan", "could not import all symbols from %s: %s", + dso, err ? 
err : "unknown error"); memset(&g_vulkan.backend_init, 0, sizeof(g_vulkan.backend_init)); memset(&g_vulkan.backend_reg, 0, sizeof(g_vulkan.backend_reg)); memset(&g_vulkan.get_device_count, 0, sizeof(g_vulkan.get_device_count)); @@ -173,11 +175,9 @@ static bool ImportVulkanImpl(void) { // Try to load pre-built DSO if (!llamafile_try_load_prebuilt_dso(vulkan_dso, "vulkan", LinkVulkan)) { // No pre-built DSO found - if (FLAG_verbose) { - fprintf(stderr, "vulkan: no pre-built GPU library found\n"); - fprintf(stderr, "vulkan: to enable Vulkan support, build with:\n"); - fprintf(stderr, "vulkan: llamafile/vulkan.sh\n"); - } + llamafile_info("vulkan", "no pre-built GPU library found"); + llamafile_info("vulkan", "to enable Vulkan support, build with:"); + llamafile_info("vulkan", " llamafile/vulkan.sh"); return false; } @@ -200,8 +200,7 @@ static bool ImportVulkanImpl(void) { reg = g_vulkan.backend_reg.default_abi(); if (reg) { ggml_backend_register(reg); - if (FLAG_verbose) - fprintf(stderr, "vulkan: Vulkan backend registered with GGML\n"); + llamafile_info("vulkan", "Vulkan backend registered with GGML"); } } @@ -211,16 +210,14 @@ static bool ImportVulkanImpl(void) { static void ImportVulkan(void) { if (ImportVulkanImpl()) { g_vulkan.supported = true; - if (FLAG_verbose) { - fprintf(stderr, "vulkan: Vulkan GPU support successfully loaded\n"); - if (g_vulkan.get_device_count.default_abi || g_vulkan.get_device_count.windows_abi) { - int count; - if (IsWindows()) - count = g_vulkan.get_device_count.windows_abi(); - else - count = g_vulkan.get_device_count.default_abi(); - fprintf(stderr, "vulkan: found %d GPU device(s)\n", count); - } + llamafile_info("vulkan", "Vulkan GPU support successfully loaded"); + if (g_vulkan.get_device_count.default_abi || g_vulkan.get_device_count.windows_abi) { + int count; + if (IsWindows()) + count = g_vulkan.get_device_count.windows_abi(); + else + count = g_vulkan.get_device_count.default_abi(); + llamafile_info("vulkan", "found %d GPU 
device(s)", count); } } else if (FLAG_gpu == LLAMAFILE_GPU_VULKAN) { fprintf(stderr, "fatal error: support for --gpu vulkan was explicitly requested, " From 86e8378581e5dcd5e072828ccda29ffdb03afdfd Mon Sep 17 00:00:00 2001 From: Davide Eynard Date: Fri, 17 Apr 2026 11:31:51 +0000 Subject: [PATCH 6/7] Brought compile_count back in --- ...ggml_src_ggml-vulkan_ggml-vulkan.cpp.patch | 69 ++++++++++--------- 1 file changed, 36 insertions(+), 33 deletions(-) diff --git a/llama.cpp.patches/patches/ggml_src_ggml-vulkan_ggml-vulkan.cpp.patch b/llama.cpp.patches/patches/ggml_src_ggml-vulkan_ggml-vulkan.cpp.patch index eccd9238ba..81b8e2ce0a 100644 --- a/llama.cpp.patches/patches/ggml_src_ggml-vulkan_ggml-vulkan.cpp.patch +++ b/llama.cpp.patches/patches/ggml_src_ggml-vulkan_ggml-vulkan.cpp.patch @@ -27,31 +27,34 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu static VkDeviceSize ggml_vk_get_max_buffer_range(const ggml_backend_vk_context * ctx, const vk_buffer &buf, const VkDeviceSize offset) { const VkDeviceSize range = std::min(VkDeviceSize{buf->size - offset}, -@@ -3391,20 +3391,9 @@ static void ggml_vk_load_shaders(vk_device& device) { +@@ -3391,20 +3391,15 @@ static void ggml_vk_load_shaders(vk_device& device) { if (!pipeline->needed || pipeline->compiled) { continue; } - // TODO: We're no longer benefitting from the async compiles (shaders are - // compiled individually, as needed) and this complexity can be removed. -- { ++ // Compile synchronously to avoid threading issues in cross-module DLL loading. ++ // ggml_vk_create_pipeline_func asserts compile_count > 0 and decrements it ++ // on completion, so we still need to increment it here. 
+ { - // wait until fewer than N compiles are in progress - uint32_t N = std::max(1u, std::thread::hardware_concurrency()); - std::unique_lock guard(compile_count_mutex); - while (compile_count >= N) { - compile_count_cond.wait(guard); - } -- compile_count++; -- } ++ std::lock_guard guard(compile_count_mutex); + compile_count++; + } - - compiles.push_back(std::async(ggml_vk_create_pipeline_func, std::ref(device), std::ref(pipeline), spv_size, spv_data, entrypoint, - parameter_count, wg_denoms, specialization_constants, disable_robustness, require_full_subgroups, required_subgroup_size)); -+ // Compile synchronously to avoid threading issues in cross-module DLL loading + ggml_vk_create_pipeline_func(device, pipeline, spv_size, spv_data, entrypoint, + parameter_count, wg_denoms, specialization_constants, disable_robustness, require_full_subgroups, required_subgroup_size); } }; -@@ -13428,20 +13417,28 @@ static bool ggml_backend_buffer_is_vk(ggml_backend_buffer_t buffer) { +@@ -13428,20 +13423,28 @@ static bool ggml_backend_buffer_is_vk(ggml_backend_buffer_t buffer) { return buffer->buft->iface.get_name == ggml_backend_vk_buffer_type_name; } @@ -83,7 +86,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu VK_LOG_DEBUG("ggml_backend_vk_buffer_init_tensor(" << buffer << " (" << buffer->context << "), " << tensor << ")"); if (tensor->view_src != nullptr) { GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft); -@@ -13449,7 +13446,7 @@ static enum ggml_status ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t +@@ -13449,7 +13452,7 @@ static enum ggml_status ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t return GGML_STATUS_SUCCESS; } @@ -92,7 +95,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu VK_LOG_DEBUG("ggml_backend_vk_buffer_memset_tensor(" << buffer << ", " << tensor << ", " << value << ", " << offset << ", " << size << ")"); ggml_backend_vk_buffer_context * buf_ctx = 
(ggml_backend_vk_buffer_context *)buffer->context; vk_buffer buf = buf_ctx->dev_buffer; -@@ -13462,7 +13459,7 @@ static void ggml_backend_vk_buffer_memset_tensor(ggml_backend_buffer_t buffer, g +@@ -13462,7 +13465,7 @@ static void ggml_backend_vk_buffer_memset_tensor(ggml_backend_buffer_t buffer, g ggml_vk_buffer_memset(buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, val32, size); } @@ -101,7 +104,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu VK_LOG_DEBUG("ggml_backend_vk_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")"); ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)buffer->context; vk_buffer buf = buf_ctx->dev_buffer; -@@ -13474,7 +13471,7 @@ static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml +@@ -13474,7 +13477,7 @@ static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml ggml_vk_buffer_write(buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, size); } @@ -110,7 +113,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu VK_LOG_DEBUG("ggml_backend_vk_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")"); ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)buffer->context; -@@ -13487,7 +13484,7 @@ static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t buffer, cons +@@ -13487,7 +13490,7 @@ static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t buffer, cons ggml_vk_buffer_read(buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, size); } @@ -119,7 +122,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu if (ggml_nbytes(src) == 0) { return true; } -@@ -13508,7 +13505,7 @@ static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t buffer, cons +@@ -13508,7 +13511,7 @@ static 
bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t buffer, cons UNUSED(buffer); } @@ -128,7 +131,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context; ggml_vk_buffer_memset(ctx->dev_buffer, 0, value, buffer->size); -@@ -13524,16 +13521,17 @@ static ggml_backend_buffer_i ggml_backend_vk_buffer_interface = { +@@ -13524,16 +13527,17 @@ static ggml_backend_buffer_i ggml_backend_vk_buffer_interface = { /* .cpy_tensor = */ ggml_backend_vk_buffer_cpy_tensor, /* .clear = */ ggml_backend_vk_buffer_clear, /* .reset = */ NULL, @@ -148,7 +151,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu VK_LOG_MEMORY("ggml_backend_vk_buffer_type_alloc_buffer(" << size << ")"); ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context; -@@ -13549,17 +13547,17 @@ static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(ggml_backe +@@ -13549,17 +13553,17 @@ static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(ggml_backe return ggml_backend_buffer_init(buft, ggml_backend_vk_buffer_interface, bufctx, size); } @@ -169,7 +172,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu return ggml_nbytes(tensor); UNUSED(buft); -@@ -13577,24 +13575,24 @@ ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num) { +@@ -13577,24 +13581,24 @@ ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num) { // host buffer type @@ -198,7 +201,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu VK_LOG_MEMORY("ggml_backend_vk_host_buffer_type_alloc_buffer(" << size << ")"); size += 32; // Behave like the CPU buffer type -@@ -13610,19 +13608,20 @@ static ggml_backend_buffer_t ggml_backend_vk_host_buffer_type_alloc_buffer(ggml_ +@@ -13610,19 +13614,20 @@ static ggml_backend_buffer_t 
ggml_backend_vk_host_buffer_type_alloc_buffer(ggml_ ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size); buffer->buft = buft; buffer->iface.free_buffer = ggml_backend_vk_host_buffer_free_buffer; @@ -221,7 +224,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu return vk_instance.devices[0]->suballocation_block_size; UNUSED(buft); -@@ -13654,13 +13653,13 @@ ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() { +@@ -13654,13 +13659,13 @@ ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() { // backend @@ -237,7 +240,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; VK_LOG_DEBUG("ggml_backend_vk_free(" << ctx->name << ")"); -@@ -13676,7 +13675,7 @@ static ggml_backend_buffer_type_t ggml_backend_vk_get_default_buffer_type(ggml_b +@@ -13676,7 +13681,7 @@ static ggml_backend_buffer_type_t ggml_backend_vk_get_default_buffer_type(ggml_b return &ctx->device->buffer_type; } @@ -246,7 +249,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu VK_LOG_DEBUG("ggml_backend_vk_set_tensor_async(" << size << ")"); ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_get_default_buffer_type(backend) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type"); -@@ -13723,7 +13722,7 @@ static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor +@@ -13723,7 +13728,7 @@ static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor } } @@ -255,7 +258,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu VK_LOG_DEBUG("ggml_backend_vk_get_tensor_async(" << size << ")"); ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; GGML_ASSERT((tensor->buffer->buft == 
ggml_backend_vk_get_default_buffer_type(backend) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type"); -@@ -13757,7 +13756,7 @@ static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_ +@@ -13757,7 +13762,7 @@ static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_ } } @@ -264,7 +267,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu VK_LOG_DEBUG("ggml_backend_vk_cpy_tensor_async(" << src << " -> " << dst << ", size=" << ggml_nbytes(src) << ")"); ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend_dst->context; -@@ -13882,7 +13881,7 @@ static void ggml_vk_synchronize(ggml_backend_vk_context * ctx) { +@@ -13882,7 +13887,7 @@ static void ggml_vk_synchronize(ggml_backend_vk_context * ctx) { } } @@ -273,7 +276,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu VK_LOG_DEBUG("ggml_backend_vk_synchronize()"); ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; -@@ -14304,7 +14303,7 @@ static int32_t find_first_set(uint32_t x) { +@@ -14304,7 +14309,7 @@ static int32_t find_first_set(uint32_t x) { return ret; } @@ -282,7 +285,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu VK_LOG_DEBUG("ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)"); ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; -@@ -14684,7 +14683,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg +@@ -14684,7 +14689,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg } // Sort the graph for improved parallelism. 
@@ -291,7 +294,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu { VK_LOG_DEBUG("ggml_vk_graph_optimize(" << graph->n_nodes << " nodes)"); ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; -@@ -14922,7 +14921,7 @@ static void ggml_vk_graph_optimize(ggml_backend_t backend, struct ggml_cgraph * +@@ -14922,7 +14927,7 @@ static void ggml_vk_graph_optimize(ggml_backend_t backend, struct ggml_cgraph * } } @@ -300,7 +303,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu VK_LOG_DEBUG("ggml_backend_vk_event_record(backend=" << backend << ", event=" << event << ")"); ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; vk_event *vkev = (vk_event *)event->context; -@@ -14960,7 +14959,7 @@ static void ggml_backend_vk_event_record(ggml_backend_t backend, ggml_backend_ev +@@ -14960,7 +14965,7 @@ static void ggml_backend_vk_event_record(ggml_backend_t backend, ggml_backend_ev ctx->compute_ctx.reset(); } @@ -309,7 +312,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu VK_LOG_DEBUG("ggml_backend_vk_event_wait(backend=" << backend << ", event=" << event << ")"); ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; vk_event *vkev = (vk_event *)event->context; -@@ -15055,7 +15054,10 @@ void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total +@@ -15055,7 +15060,10 @@ void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total *total += heap.size; if (membudget_supported && i < budgetprops.heapUsage.size()) { @@ -321,7 +324,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu } else { *free += heap.size; } -@@ -15123,38 +15125,38 @@ struct ggml_backend_vk_device_context { +@@ -15123,38 +15131,38 @@ struct ggml_backend_vk_device_context { int op_offload_min_batch_size; }; @@ -367,7 +370,7 @@ diff --git 
a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context; props->name = ggml_backend_vk_device_get_name(dev); -@@ -15170,13 +15172,13 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml +@@ -15170,13 +15178,13 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml }; } @@ -383,7 +386,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context; const vk_device& device = ggml_vk_get_device(ctx->device); -@@ -15714,7 +15716,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm +@@ -15714,7 +15722,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm UNUSED(dev); } @@ -392,7 +395,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu if (buft->iface.get_name != ggml_backend_vk_buffer_type_name) { return false; } -@@ -15740,13 +15742,13 @@ static int64_t ggml_vk_get_op_batch_size(const ggml_tensor * op) { +@@ -15740,13 +15748,13 @@ static int64_t ggml_vk_get_op_batch_size(const ggml_tensor * op) { } } @@ -408,7 +411,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context; auto device = ggml_vk_get_device(ctx->device); -@@ -15769,7 +15771,7 @@ static ggml_backend_event_t ggml_backend_vk_device_event_new(ggml_backend_dev_t +@@ -15769,7 +15777,7 @@ static ggml_backend_event_t ggml_backend_vk_device_event_new(ggml_backend_dev_t }; } @@ -417,7 +420,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context; auto device = ggml_vk_get_device(ctx->device); -@@ -15789,7 +15791,7 @@ static void 
ggml_backend_vk_device_event_free(ggml_backend_dev_t dev, ggml_backe +@@ -15789,7 +15797,7 @@ static void ggml_backend_vk_device_event_free(ggml_backend_dev_t dev, ggml_backe delete event; } @@ -426,7 +429,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu VK_LOG_DEBUG("ggml_backend_vk_device_event_synchronize(backend=" << dev << ", event=" << event << ")"); ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context; auto device = ggml_vk_get_device(ctx->device); -@@ -15846,7 +15848,7 @@ static vk_buffer ggml_vk_buffer_from_host_ptr(vk_device & device, void * ptr, si +@@ -15846,7 +15854,7 @@ static vk_buffer ggml_vk_buffer_from_host_ptr(vk_device & device, void * ptr, si return buf; } @@ -435,7 +438,7 @@ diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vu VK_LOG_DEBUG("ggml_backend_vk_device_buffer_from_host_ptr(backend=" << dev << ", ptr=" << ptr << ", size=" << size << ")"); GGML_UNUSED(max_tensor_size); -@@ -15884,17 +15886,17 @@ static const struct ggml_backend_device_i ggml_backend_vk_device_i = { +@@ -15884,17 +15892,17 @@ static const struct ggml_backend_device_i ggml_backend_vk_device_i = { /* .event_synchronize = */ ggml_backend_vk_device_event_synchronize, }; From 25b835ed7c9d05cd22f446e52d1593fd1f461ea0 Mon Sep 17 00:00:00 2001 From: Davide Eynard Date: Fri, 17 Apr 2026 11:47:34 +0000 Subject: [PATCH 7/7] Updated docs removing ref to VS Developer Command Prompt --- docs/building_dlls.md | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/docs/building_dlls.md b/docs/building_dlls.md index 5baf3ebadb..ca6455e5c2 100644 --- a/docs/building_dlls.md +++ b/docs/building_dlls.md @@ -37,11 +37,16 @@ make setup # In the Windows terminal After the repo is set up, you can build the cuda / rocm / vulkan DLLs as follows. 
+The .bat files that run the builds are in the `llamafile` directory and accept the following +parameters: -- from powershell, open a Visual Studio 2022 Developer Command Prompt -``` - cmd /k "`"C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\VC\Auxiliary\Build\vcvarsall.bat`" x64" -``` +- `--clean` to restart a build from scratch +- `--output` to provide a custom output filename for the dll (default is `ggml-xxxx.dll` in the current directory, +where xxxx is one of cuda, rocm, vulkan) +- only for the cuda libraries, you also have the `--cublas` option to link the library against NVIDIA's cuBLAS instead of tinyBLAS + +Also note that for cuda and rocm libraries there are `*_parallel.bat` scripts that should run faster +by parallelizing compilation and making fuller use of your machine's compute resources. Here's how you call the build scripts: - cd to the llamafile dir and start CUDA parallel build (this will run for a while...) ```