mozilla-ai · aittalam · Apr 17, 2026 · Apr 8, 2026 · Apr 9, 2026 · Apr 10, 2026
diff --git a/docs/building_dlls.md b/docs/building_dlls.md
@@ -37,11 +37,16 @@ make setup
 # In the Windows terminal
 
 After the repo is set up, you can build the cuda / rocm / vulkan DLLs as follows.
+The .bat files to run the builds are in the `llamafile` directory and accept the following
+parameters:
 
-- from powershell, open a Visual Studio 2022 Developer Command Prompt
-```
- cmd /k "`"C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\VC\Auxiliary\Build\vcvarsall.bat`" x64"
-```
+- `--clean` to restart a build from scratch
+- `--output` to provide a custom output filename for the DLL (default is `ggml-xxxx.dll` in the current directory,
+  where xxxx is one of cuda, rocm, vulkan)
+- only for the cuda libraries, you also have the `--cublas` option to link the library against NVIDIA's cublas instead of tinyblas
+
+Also note that for the cuda and rocm libraries there are `*_parallel.bat` scripts, which should run faster
+by parallelizing compilation across your available compute. Here's how you call the build scripts:
 
 - cd to the llamafile dir and start CUDA parallel build (this will run for a while...)
 ```

diff --git a/llama.cpp.patches/patches/ggml_src_ggml-vulkan_ggml-vulkan.cpp.patch b/llama.cpp.patches/patches/ggml_src_ggml-vulkan_ggml-vulkan.cpp.patch
diff --git a/llamafile/cuda.bat b/llamafile/cuda.bat
@@ -36,6 +36,36 @@ echo Unknown option: %~1
 exit /b 1
 :done_args
 
+:: -------- find Visual Studio / Build Tools --------
+:: When cl.exe is not already on PATH, ask vswhere for the newest install
+:: that ships the x64 C++ toolchain and load its environment via vcvarsall.
+where cl >nul 2>&1
+if errorlevel 1 (
+    set "VSWHERE=%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe"
+    if not exist "!VSWHERE!" (
+        echo Error: cl.exe not found in PATH and vswhere.exe not found
+        echo Please run from a Visual Studio Developer Command Prompt
+        exit /b 1
+    )
+    rem Forget any VS_PATH inherited from the caller so the check below
+    rem reflects only what vswhere reports.
+    set "VS_PATH="
+    for /f "usebackq tokens=*" %%i in (`"!VSWHERE!" -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath`) do (
+        set "VS_PATH=%%i"
+    )
+    if not defined VS_PATH (
+        echo Error: Visual Studio with C++ tools not found
+        exit /b 1
+    )
+    call "!VS_PATH!\VC\Auxiliary\Build\vcvarsall.bat" x64
+    rem Confirm the environment really provides the compiler now.
+    where cl >nul 2>&1
+    if errorlevel 1 (
+        echo Error: cl.exe still not available after running vcvarsall.bat
+        exit /b 1
+    )
+)
+
 set "LLAMA_CPP_DIR=%REPO_DIR%\llama.cpp"
 set "GGML_CUDA_DIR=%LLAMA_CPP_DIR%\ggml\src\ggml-cuda"
 set "GGML_SRC_DIR=%LLAMA_CPP_DIR%\ggml\src"
@@ -115,7 +134,8 @@ if "%USE_CUBLAS%"=="0" set "COMMON_FLAGS=%COMMON_FLAGS% -I%BUILD_DIR%"
 set "COMMON_FLAGS=%COMMON_FLAGS% -I%GGML_INC_DIR% -I%GGML_SRC_DIR% -I%GGML_CUDA_DIR%"
 set "COMMON_FLAGS=%COMMON_FLAGS% --forward-unknown-to-host-compiler"
 set "COMMON_FLAGS=%COMMON_FLAGS% --std=c++17"
-set "COMMON_FLAGS=%COMMON_FLAGS% -Xcompiler="/nologo /EHsc /O2 /GR /MT /std:c++17""
+set "COMMON_FLAGS=%COMMON_FLAGS% -Xcompiler="/nologo /EHsc /O2 /GR /MT /std:c++17 /Zc:preprocessor""
+set "COMMON_FLAGS=%COMMON_FLAGS% -diag-suppress 177 -diag-suppress 221 -diag-suppress 550"
 set "COMMON_FLAGS=%COMMON_FLAGS% -DNDEBUG -DGGML_BUILD=1 -DGGML_SHARED=1 -DGGML_BACKEND_SHARED=1 -DGGML_BACKEND_BUILD=1 -DGGML_MULTIPLATFORM"
 set "COMMON_FLAGS=%COMMON_FLAGS% %BLAS_DEFINE%"
 
@@ -209,7 +229,7 @@ echo.
 :: -------- compile core GGML sources with host compiler --------
 echo Compiling core GGML sources...
 
-set "HOST_FLAGS=/nologo /EHsc /O2 /GR /MT /DNDEBUG"
+set "HOST_FLAGS=/nologo /EHsc /O2 /GR /MT /Zc:preprocessor /DNDEBUG"
 set "HOST_FLAGS=%HOST_FLAGS% /DGGML_BUILD=1 /DGGML_SHARED=1 /DGGML_BACKEND_SHARED=1 /DGGML_BACKEND_BUILD=1 /DGGML_MULTIPLATFORM"
 set "HOST_FLAGS=%HOST_FLAGS% /DGGML_VERSION=\"!GGML_VERSION!\" /DGGML_COMMIT=\"!GGML_COMMIT!\""
 set "HOST_FLAGS=%HOST_FLAGS% /I"%GGML_INC_DIR%" /I"%GGML_SRC_DIR%""

diff --git a/llamafile/cuda.c b/llamafile/cuda.c
@@ -97,7 +97,8 @@ static bool LinkCuda(const char *dso) {
     void *lib = cosmo_dlopen(dso, RTLD_LAZY);
     if (!lib) {
         char *err = cosmo_dlerror();
-        fprintf(stderr, "cuda: %s: failed to load library\n", err ? err : "unknown error");
+        llamafile_info("cuda", "failed to load library %s: %s",
+                       dso, err ? err : "unknown error");
         return false;
     }
 
@@ -142,7 +143,8 @@ static bool LinkCuda(const char *dso) {
 
     if (!ok) {
         char *err = cosmo_dlerror();
-        fprintf(stderr, "cuda: %s: not all symbols could be imported\n", err ? err : "unknown error");
+        llamafile_info("cuda", "could not import all symbols from %s: %s",
+                       dso, err ? err : "unknown error");
         memset(&g_cuda.backend_init, 0, sizeof(g_cuda.backend_init));
         memset(&g_cuda.backend_reg, 0, sizeof(g_cuda.backend_reg));
         memset(&g_cuda.get_device_count, 0, sizeof(g_cuda.get_device_count));
@@ -197,12 +199,10 @@ static bool ImportCudaImpl(void) {
     }
 
     // No pre-built DSO found
-    if (FLAG_verbose) {
-        fprintf(stderr, "cuda: no pre-built GPU library found\n");
-        fprintf(stderr, "cuda: to enable GPU support, build with:\n");
-        fprintf(stderr, "cuda:   llamafile/cuda.sh   (for NVIDIA)\n");
-        fprintf(stderr, "cuda:   llamafile/rocm.sh   (for AMD)\n");
-    }
+    llamafile_info("cuda", "no pre-built GPU library found");
+    llamafile_info("cuda", "to enable GPU support, build with:");
+    llamafile_info("cuda", "  llamafile/cuda.sh   (for NVIDIA)");
+    llamafile_info("cuda", "  llamafile/rocm.sh   (for AMD)");
     return false;
 
 RegisterBackend:
@@ -225,9 +225,8 @@ static bool ImportCudaImpl(void) {
             reg = g_cuda.backend_reg.default_abi();
         if (reg) {
             ggml_backend_register(reg);
-            if (FLAG_verbose)
-                fprintf(stderr, "cuda: %s backend registered with GGML\n",
-                        g_cuda.is_amd ? "ROCm" : "CUDA");
+            llamafile_info("cuda", "%s backend registered with GGML",
+                           g_cuda.is_amd ? "ROCm" : "CUDA");
         }
     }
 
@@ -237,17 +236,15 @@ static bool ImportCudaImpl(void) {
 static void ImportCuda(void) {
     if (ImportCudaImpl()) {
         g_cuda.supported = true;
-        if (FLAG_verbose) {
-            fprintf(stderr, "cuda: %s GPU support successfully loaded\n",
-                    g_cuda.is_amd ? "AMD ROCm" : "NVIDIA CUDA");
-            if (g_cuda.get_device_count.default_abi || g_cuda.get_device_count.windows_abi) {
-                int count;
-                if (IsWindows())
-                    count = g_cuda.get_device_count.windows_abi();
-                else
-                    count = g_cuda.get_device_count.default_abi();
-                fprintf(stderr, "cuda: found %d GPU device(s)\n", count);
-            }
+        llamafile_info("cuda", "%s GPU support successfully loaded",
+                       g_cuda.is_amd ? "AMD ROCm" : "NVIDIA CUDA");
+        if (g_cuda.get_device_count.default_abi || g_cuda.get_device_count.windows_abi) {
+            int count;
+            if (IsWindows())
+                count = g_cuda.get_device_count.windows_abi();
+            else
+                count = g_cuda.get_device_count.default_abi();
+            llamafile_info("cuda", "found %d GPU device(s)", count);
         }
     } else if (FLAG_gpu == LLAMAFILE_GPU_NVIDIA || FLAG_gpu == LLAMAFILE_GPU_AMD) {
         fprintf(stderr, "fatal error: support for --gpu %s was explicitly requested, "

diff --git a/llamafile/cuda_parallel.bat b/llamafile/cuda_parallel.bat
@@ -47,6 +47,36 @@ echo Unknown option: %~1
 exit /b 1
 :done_args
 
+:: -------- find Visual Studio / Build Tools --------
+:: When cl.exe is not already on PATH, ask vswhere for the newest install
+:: that ships the x64 C++ toolchain and load its environment via vcvarsall.
+where cl >nul 2>&1
+if errorlevel 1 (
+    set "VSWHERE=%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe"
+    if not exist "!VSWHERE!" (
+        echo Error: cl.exe not found in PATH and vswhere.exe not found
+        echo Please run from a Visual Studio Developer Command Prompt
+        exit /b 1
+    )
+    rem Forget any VS_PATH inherited from the caller so the check below
+    rem reflects only what vswhere reports.
+    set "VS_PATH="
+    for /f "usebackq tokens=*" %%i in (`"!VSWHERE!" -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath`) do (
+        set "VS_PATH=%%i"
+    )
+    if not defined VS_PATH (
+        echo Error: Visual Studio with C++ tools not found
+        exit /b 1
+    )
+    call "!VS_PATH!\VC\Auxiliary\Build\vcvarsall.bat" x64
+    rem Confirm the environment really provides the compiler now.
+    where cl >nul 2>&1
+    if errorlevel 1 (
+        echo Error: cl.exe still not available after running vcvarsall.bat
+        exit /b 1
+    )
+)
+
 set "LLAMA_CPP_DIR=%REPO_DIR%\llama.cpp"
 set "GGML_CUDA_DIR=%LLAMA_CPP_DIR%\ggml\src\ggml-cuda"
 set "GGML_SRC_DIR=%LLAMA_CPP_DIR%\ggml\src"
@@ -137,8 +156,8 @@ if "%USE_CUBLAS%"=="0" set "COMMON_FLAGS=%COMMON_FLAGS% -I%BUILD_DIR%"
 set "COMMON_FLAGS=%COMMON_FLAGS% -I%GGML_INC_DIR% -I%GGML_SRC_DIR% -I%GGML_CUDA_DIR%"
 set "COMMON_FLAGS=%COMMON_FLAGS% --forward-unknown-to-host-compiler"
 set "COMMON_FLAGS=%COMMON_FLAGS% --std=c++17"
-set "COMMON_FLAGS=%COMMON_FLAGS% -Xcompiler="/nologo /EHsc /O2 /GR /MT /std:c++17""
-set "COMMON_FLAGS=%COMMON_FLAGS% -diag-suppress 177 -diag-suppress 221"
+set "COMMON_FLAGS=%COMMON_FLAGS% -Xcompiler="/nologo /EHsc /O2 /GR /MT /std:c++17 /Zc:preprocessor""
+set "COMMON_FLAGS=%COMMON_FLAGS% -diag-suppress 177 -diag-suppress 221 -diag-suppress 550"
 set "COMMON_FLAGS=%COMMON_FLAGS% -DNDEBUG -DGGML_BUILD=1 -DGGML_SHARED=1 -DGGML_BACKEND_SHARED=1 -DGGML_BACKEND_BUILD=1 -DGGML_MULTIPLATFORM"
 set "COMMON_FLAGS=%COMMON_FLAGS% %BLAS_DEFINE%"
 
@@ -229,7 +248,7 @@ echo.
 :: -------- compile core GGML sources with host compiler --------
 echo Compiling core GGML sources...
 
-set "HOST_FLAGS=/nologo /EHsc /O2 /GR /MT /DNDEBUG"
+set "HOST_FLAGS=/nologo /EHsc /O2 /GR /MT /Zc:preprocessor /DNDEBUG"
 set "HOST_FLAGS=%HOST_FLAGS% /DGGML_BUILD=1 /DGGML_SHARED=1 /DGGML_BACKEND_SHARED=1 /DGGML_BACKEND_BUILD=1 /DGGML_MULTIPLATFORM"
 set "HOST_FLAGS=%HOST_FLAGS% /DGGML_VERSION=\"!GGML_VERSION!\" /DGGML_COMMIT=\"!GGML_COMMIT!\""
 set "HOST_FLAGS=%HOST_FLAGS% /I"%GGML_INC_DIR%" /I"%GGML_SRC_DIR%""

diff --git a/llamafile/llamafile.c b/llamafile/llamafile.c
@@ -27,6 +27,7 @@
 #include <limits.h>
 #include <stdatomic.h>
 #include <stdint.h>
+#include <stdarg.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -730,9 +731,9 @@ bool llamafile_try_load_prebuilt_dso(const char *name, const char *backend_name,
             break;
         }
 
+        llamafile_info(backend_name, "probing library %s (bundled)", extracted);
         if (link_fn(extracted)) {
-            if (FLAG_verbose)
-                fprintf(stderr, "%s: loaded bundled %s\n", backend_name, name);
+            llamafile_info(backend_name, "loaded bundled library %s", extracted);
             return true;
         }
     }
@@ -741,9 +742,9 @@ bool llamafile_try_load_prebuilt_dso(const char *name, const char *backend_name,
     llamafile_get_app_dir(app_dir, PATH_MAX);
     snprintf(dso, PATH_MAX, "%s%s", app_dir, name);
     if (llamafile_file_exists(dso)) {
+        llamafile_info(backend_name, "probing library %s (app directory)", dso);
         if (link_fn(dso)) {
-            if (FLAG_verbose)
-                fprintf(stderr, "%s: loaded %s from app directory\n", backend_name, name);
+            llamafile_info(backend_name, "loaded library %s from app directory", dso);
             return true;
         }
     }
@@ -753,9 +754,9 @@ bool llamafile_try_load_prebuilt_dso(const char *name, const char *backend_name,
     if (home && *home) {
         snprintf(dso, PATH_MAX, "%s/%s", home, name);
         if (llamafile_file_exists(dso)) {
+            llamafile_info(backend_name, "probing library %s (home directory)", dso);
             if (link_fn(dso)) {
-                if (FLAG_verbose)
-                    fprintf(stderr, "%s: loaded %s from home directory\n", backend_name, name);
+                llamafile_info(backend_name, "loaded library %s from home directory", dso);
                 return true;
             }
         }
@@ -774,6 +775,24 @@ void llamafile_log_callback_null(int level, const char *text, void *user_data) {
     (void)user_data;
 }
 
+// Prints "<backend>: INFO: <message>\n" to stderr when FLAG_verbose is set;
+// otherwise does nothing. `fmt` and the variadic arguments follow printf
+// conventions and should not include a trailing newline.
+void llamafile_info(const char *backend, const char *fmt, ...) {
+    if (!FLAG_verbose)
+        return;
+    va_list ap;
+    va_start(ap, fmt);
+    // The line is assembled from three stdio calls; hold the stream lock
+    // across them so output from other threads cannot land mid-line.
+    flockfile(stderr);
+    fprintf(stderr, "%s: INFO: ", backend);
+    vfprintf(stderr, fmt, ap);
+    fputc('\n', stderr);
+    funlockfile(stderr);
+    va_end(ap);
+}
+
 // ==============================================================================
 // GPU support
 // ==============================================================================

diff --git a/llamafile/llamafile.h b/llamafile/llamafile.h
@@ -120,6 +120,13 @@ typedef void (*llamafile_log_callback)(int level, const char *text, void *user_d
 // No-op log callback to disable logging (defined in llamafile.c)
 void llamafile_log_callback_null(int level, const char *text, void *user_data);
 
+// Print an INFO-level diagnostic tagged with a backend name.
+// No-op unless FLAG_verbose is set. Adds the "<backend>: INFO: " prefix
+// and a trailing newline, so callers pass only the message body.
+// Defined in llamafile.c.
+void llamafile_info(const char *backend, const char *fmt, ...)
+    __attribute__((format(printf, 2, 3)));
+
 // Set logging callback for Metal dylib (defined in metal.c)
 // Pass a no-op callback to disable logging
 void llamafile_metal_log_set(llamafile_log_callback log_callback, void *user_data);

diff --git a/llamafile/metal.c b/llamafile/metal.c
@@ -276,8 +276,7 @@ static bool PreprocessMetalShader(const char *app_dir) {
     free(impl_content);
     free(metal_content);
 
-    if (FLAG_verbose)
-        fprintf(stderr, "metal: preprocessed %s\n", metal_path);
+    llamafile_info("metal", "preprocessed %s", metal_path);
 
     return true;
 }
@@ -293,8 +292,7 @@ static bool BuildMetal(const char *dso) {
     // Since we use versioned paths, source updates come with new versions
     struct stat dso_stat;
     if (stat(dso, &dso_stat) == 0 && !FLAG_recompile) {
-        if (FLAG_verbose)
-            fprintf(stderr, "metal: using cached %s\n", dso);
+        llamafile_info("metal", "using cached %s", dso);
         return true;
     }
 
@@ -359,8 +357,7 @@ static bool BuildMetal(const char *dso) {
 
     // Compile dynamic shared object
     if (needs_rebuild || FLAG_recompile) {
-        if (FLAG_verbose)
-            fprintf(stderr, "metal: building ggml-metal.dylib with xcode...\n");
+        llamafile_info("metal", "building ggml-metal.dylib with xcode...");
 
         char tmpdso[PATH_MAX];
         snprintf(tmpdso, PATH_MAX, "%s.XXXXXX", dso);
@@ -435,10 +432,12 @@ static bool BuildMetal(const char *dso) {
             args[argc] = NULL;
 
             if (FLAG_verbose) {
-                fprintf(stderr, "metal: executing: cc");
-                for (int j = 1; args[j]; j++)
-                    fprintf(stderr, " %s", args[j]);
-                fprintf(stderr, "\n");
+                char cmd[4096];
+                size_t off = 0;
+                off += snprintf(cmd + off, sizeof(cmd) - off, "executing: cc");
+                for (int j = 1; args[j] && off < sizeof(cmd); j++)
+                    off += snprintf(cmd + off, sizeof(cmd) - off, " %s", args[j]);
+                llamafile_info("metal", "%s", cmd);
             }
 
             int pid, ws;
@@ -499,10 +498,12 @@ static bool BuildMetal(const char *dso) {
             args[argc] = NULL;
 
             if (FLAG_verbose) {
-                fprintf(stderr, "metal: executing: cc");
-                for (int j = 1; args[j]; j++)
-                    fprintf(stderr, " %s", args[j]);
-                fprintf(stderr, "\n");
+                char cmd[4096];
+                size_t off = 0;
+                off += snprintf(cmd + off, sizeof(cmd) - off, "executing: cc");
+                for (int j = 1; args[j] && off < sizeof(cmd); j++)
+                    off += snprintf(cmd + off, sizeof(cmd) - off, " %s", args[j]);
+                llamafile_info("metal", "%s", cmd);
             }
 
             int pid, ws;
@@ -538,8 +539,7 @@ static bool BuildMetal(const char *dso) {
             return false;
         }
 
-        if (FLAG_verbose)
-            fprintf(stderr, "metal: successfully built %s\n", dso);
+        llamafile_info("metal", "successfully built %s", dso);
     }
 
     return true;
@@ -550,7 +550,8 @@ static bool LinkMetal(const char *dso) {
     void *lib = cosmo_dlopen(dso, RTLD_LAZY);
     if (!lib) {
         char *err = cosmo_dlerror();
-        fprintf(stderr, "metal: %s: failed to load library\n", err ? err : "unknown error");
+        llamafile_info("metal", "failed to load library %s: %s",
+                       dso, err ? err : "unknown error");
         return false;
     }
 
@@ -570,7 +571,8 @@ static bool LinkMetal(const char *dso) {
 
     if (!ok) {
         char *err = cosmo_dlerror();
-        fprintf(stderr, "metal: %s: not all symbols could be imported\n", err ? err : "unknown error");
+        llamafile_info("metal", "could not import all symbols from %s: %s",
+                       dso, err ? err : "unknown error");
         cosmo_dlclose(lib);
         return false;
     }
@@ -617,8 +619,7 @@ static bool ImportMetalImpl(void) {
                 ggml_backend_reg_t reg = g_metal.backend_metal_reg();
                 if (reg) {
                     ggml_backend_register(reg);
-                    if (FLAG_verbose)
-                        fprintf(stderr, "metal: Metal backend registered with GGML\n");
+                    llamafile_info("metal", "Metal backend registered with GGML");
                 }
             }
             return true;
@@ -630,8 +631,7 @@ static bool ImportMetalImpl(void) {
 static void ImportMetal(void) {
     if (ImportMetalImpl()) {
         g_metal.supported = true;
-        if (FLAG_verbose)
-            fprintf(stderr, "metal: Apple Metal GPU support successfully loaded\n");
+        llamafile_info("metal", "Apple Metal GPU support successfully loaded");
     } else if (FLAG_gpu == LLAMAFILE_GPU_APPLE) {
         fprintf(stderr, "fatal error: support for --gpu %s was explicitly requested, "
                 "but it wasn't available\n", llamafile_describe_gpu());