From c250d5c1448b4b9919bb580223081abec707ee1f Mon Sep 17 00:00:00 2001
From: Davide Eynard <davide.eynard@gmail.com>
Date: Thu, 30 Apr 2026 11:26:38 +0000
Subject: [PATCH 1/3] New release version: update version.h and docs

---
 README.md                  |  6 +++---
 docs/example_llamafiles.md | 28 ++++++++++++++--------------
 docs/index.md              |  4 ++--
 docs/quickstart.md         |  6 +++---
 docs/support.md            |  4 ++--
 llamafile/version.h        |  2 +-
 6 files changed, 25 insertions(+), 25 deletions(-)

diff --git a/README.md b/README.md
index 5ce0fcf504..57ff701b94 100644
--- a/README.md
+++ b/README.md
@@ -24,7 +24,7 @@ locally on most operating systems and CPU archiectures, with no installation.
 llamafile also includes **[whisperfile](https://docs.mozilla.ai/llamafile/whisperfile)**, a single-file speech-to-text tool built on [whisper.cpp](https://github.com/ggerganov/whisper.cpp) and the same Cosmopolitan packaging. It supports transcription and translation of audio files across all the same platforms, with no installation required.
 
 
-## v0.10.0
+## v0.10.*
 
 **llamafile versions starting from 0.10.0 use a new build system**, aimed at keeping our code more easily 
 aligned with the latest versions of llama.cpp. This means they support more recent models and functionalities,
@@ -32,7 +32,7 @@ but at the same time they might be missing some of
 the features you were accustomed to (check out [this doc](README_0.10.0.md) for a high-level description of what has been done). If you liked
 the "classic experience" more, you will always be able to access the previous versions from our
 [releases](https://github.com/mozilla-ai/llamafile/releases) page. Our pre-built llamafiles always
-show which version of the server they have been bundled with ([0.9.* example](https://huggingface.co/mozilla-ai/llava-v1.5-7b-llamafile), [0.10.* example](https://huggingface.co/mozilla-ai/llamafile_0.10.0)), so you will always know
+show which version of the server they have been bundled with ([0.9.* example](https://huggingface.co/mozilla-ai/llava-v1.5-7b-llamafile), [0.10.* example](https://huggingface.co/mozilla-ai/llamafile_0.10)), so you will always know
 which version of the software you are downloading.
 
 
@@ -47,7 +47,7 @@ Download and run your first llamafile in minutes:
 
 ```sh
 # Download an example model (Qwen3.5 0.8B)
-curl -LO https://huggingface.co/mozilla-ai/llamafile_0.10.0/resolve/main/Qwen3.5-0.8B-Q8_0.llamafile
+curl -LO https://huggingface.co/mozilla-ai/llamafile_0.10/resolve/main/Qwen3.5-0.8B-Q8_0.llamafile
 
 # Make it executable (macOS/Linux/BSD)
 chmod +x Qwen3.5-0.8B-Q8_0.llamafile
diff --git a/docs/example_llamafiles.md b/docs/example_llamafiles.md
index eb08800170..519e107b41 100644
--- a/docs/example_llamafiles.md
+++ b/docs/example_llamafiles.md
@@ -1,24 +1,24 @@
 We provide example llamafiles for a variety of models, so you can easily try out llamafile 
 with different kinds of LLMs. The following table lists llamafiles bundled with the latest
-available version of the server (v0.10.0). The smaller the file is, the more easily it will
+available version of the server (v0.10.*). The smaller the file is, the more easily it will
 run on your computer, even if no GPU is present (as a reference, Qwen3.5 0.8B Q8 generates
 text on a Raspberry Pi5 at ~8 tokens/sec).
 
 | Model | Size | License | llamafile |
 | --- | --- | --- | --- |
-| [Qwen3.5 0.8B](https://huggingface.co/Qwen/Qwen3.5-0.8B) Q8_0 | 1.6 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | [Qwen3.5-0.8B-Q8_0.llamafile](https://huggingface.co/mozilla-ai/llamafile_0.10.0/resolve/main/Qwen3.5-0.8B-Q8_0.llamafile) |
-| [Qwen3.5 2B](https://huggingface.co/Qwen/Qwen3.5-2B) Q8_0 | 3.2 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | [Qwen3.5-2B-Q8_0.llamafile](https://huggingface.co/mozilla-ai/llamafile_0.10.0/resolve/main/Qwen3.5-2B-Q8_0.llamafile) |
-| [Ministral 3 3B Instruct 2512](https://huggingface.co/mistralai/Ministral-3-3B-Instruct-2512) Q4_K_M | 3.4 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | [Ministral-3-3B-Instruct-2512-Q4_K_M.llamafile](https://huggingface.co/mozilla-ai/llamafile_0.10.0/resolve/main/Ministral-3-3B-Instruct-2512-Q4_K_M.llamafile) |
-| [Qwen3.5 4B](https://huggingface.co/Qwen/Qwen3.5-4B) Q5_K_S | 4.1 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | [Qwen3.5-4B-Q5_K_S.llamafile](https://huggingface.co/mozilla-ai/llamafile_0.10.0/resolve/main/Qwen3.5-4B-Q5_K_S.llamafile) |
-| [llava v1.6 mistral 7b](https://huggingface.co/liuhaotian/llava-v1.6-mistral-7b) Q4_K_M | 5.3 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | [llava-v1.6-mistral-7b-Q4_K_M.llamafile](https://huggingface.co/mozilla-ai/llamafile_0.10.0/resolve/main/llava-v1.6-mistral-7b-Q4_K_M.llamafile) |
-| [Apertus 8B Instruct 2509](https://huggingface.co/swiss-ai/Apertus-8B-Instruct-2509) | 5.9 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | [Apertus-8B-Instruct-2509.llamafile](https://huggingface.co/mozilla-ai/llamafile_0.10.0/resolve/main/Apertus-8B-Instruct-2509.llamafile) |
-| [Qwen3.5 9B](https://huggingface.co/Qwen/Qwen3.5-9B) Q5_K_S | 7.4 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | [Qwen3.5-9B-Q5_K_S.llamafile](https://huggingface.co/mozilla-ai/llamafile_0.10.0/resolve/main/Qwen3.5-9B-Q5_K_S.llamafile) |
-| [Ministral 3 3B Instruct 2512](https://huggingface.co/mistralai/Ministral-3-3B-Instruct-2512) BF16 | 7.8 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | [Ministral-3-3B-Instruct-2512-BF16.llamafile](https://huggingface.co/mozilla-ai/llamafile_0.10.0/resolve/main/Ministral-3-3B-Instruct-2512-BF16.llamafile) |
-| [llava v1.6 mistral 7b](https://huggingface.co/liuhaotian/llava-v1.6-mistral-7b) Q8_0 | 8.4 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | [llava-v1.6-mistral-7b-Q8_0.llamafile](https://huggingface.co/mozilla-ai/llamafile_0.10.0/resolve/main/llava-v1.6-mistral-7b-Q8_0.llamafile) |
-| [gpt-oss 20b](https://huggingface.co/openai/gpt-oss-20b) mxfp4 | 12 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | [gpt-oss-20b-mxfp4.llamafile](https://huggingface.co/mozilla-ai/llamafile_0.10.0/resolve/main/gpt-oss-20b-mxfp4.llamafile) |
-| [gpt-oss 20b](https://huggingface.co/openai/gpt-oss-20b) Q5_K_S | 12 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | [gpt-oss-20b-Q5_K_S.llamafile](https://huggingface.co/mozilla-ai/llamafile_0.10.0/resolve/main/gpt-oss-20b-Q5_K_S.llamafile) |
-| [LFM2 24B A2B](https://huggingface.co/LiquidAI/LFM2-24B-A2B) Q5_K_M | 16 GB | [lfm1.0](https://huggingface.co/LiquidAI/LFM2-24B-A2B/blob/main/LICENSE) | [LFM2-24B-A2B-Q5_K_M.llamafile](https://huggingface.co/mozilla-ai/llamafile_0.10.0/resolve/main/LFM2-24B-A2B-Q5_K_M.llamafile) |
-| [Qwen3.5 27B](https://huggingface.co/Qwen/Qwen3.5-27B) Q5_K_S | 19 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | [Qwen3.5-27B-Q5_K_S.llamafile](https://huggingface.co/mozilla-ai/llamafile_0.10.0/resolve/main/Qwen3.5-27B-Q5_K_S.llamafile) |
+| [Qwen3.5 0.8B](https://huggingface.co/Qwen/Qwen3.5-0.8B) Q8_0 | 1.6 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | [Qwen3.5-0.8B-Q8_0.llamafile](https://huggingface.co/mozilla-ai/llamafile_0.10/resolve/main/Qwen3.5-0.8B-Q8_0.llamafile) |
+| [Qwen3.5 2B](https://huggingface.co/Qwen/Qwen3.5-2B) Q8_0 | 3.2 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | [Qwen3.5-2B-Q8_0.llamafile](https://huggingface.co/mozilla-ai/llamafile_0.10/resolve/main/Qwen3.5-2B-Q8_0.llamafile) |
+| [Ministral 3 3B Instruct 2512](https://huggingface.co/mistralai/Ministral-3-3B-Instruct-2512) Q4_K_M | 3.4 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | [Ministral-3-3B-Instruct-2512-Q4_K_M.llamafile](https://huggingface.co/mozilla-ai/llamafile_0.10/resolve/main/Ministral-3-3B-Instruct-2512-Q4_K_M.llamafile) |
+| [Qwen3.5 4B](https://huggingface.co/Qwen/Qwen3.5-4B) Q5_K_S | 4.1 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | [Qwen3.5-4B-Q5_K_S.llamafile](https://huggingface.co/mozilla-ai/llamafile_0.10/resolve/main/Qwen3.5-4B-Q5_K_S.llamafile) |
+| [llava v1.6 mistral 7b](https://huggingface.co/liuhaotian/llava-v1.6-mistral-7b) Q4_K_M | 5.3 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | [llava-v1.6-mistral-7b-Q4_K_M.llamafile](https://huggingface.co/mozilla-ai/llamafile_0.10/resolve/main/llava-v1.6-mistral-7b-Q4_K_M.llamafile) |
+| [Apertus 8B Instruct 2509](https://huggingface.co/swiss-ai/Apertus-8B-Instruct-2509) | 5.9 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | [Apertus-8B-Instruct-2509.llamafile](https://huggingface.co/mozilla-ai/llamafile_0.10/resolve/main/Apertus-8B-Instruct-2509.llamafile) |
+| [Qwen3.5 9B](https://huggingface.co/Qwen/Qwen3.5-9B) Q5_K_S | 7.4 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | [Qwen3.5-9B-Q5_K_S.llamafile](https://huggingface.co/mozilla-ai/llamafile_0.10/resolve/main/Qwen3.5-9B-Q5_K_S.llamafile) |
+| [Ministral 3 3B Instruct 2512](https://huggingface.co/mistralai/Ministral-3-3B-Instruct-2512) BF16 | 7.8 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | [Ministral-3-3B-Instruct-2512-BF16.llamafile](https://huggingface.co/mozilla-ai/llamafile_0.10/resolve/main/Ministral-3-3B-Instruct-2512-BF16.llamafile) |
+| [llava v1.6 mistral 7b](https://huggingface.co/liuhaotian/llava-v1.6-mistral-7b) Q8_0 | 8.4 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | [llava-v1.6-mistral-7b-Q8_0.llamafile](https://huggingface.co/mozilla-ai/llamafile_0.10/resolve/main/llava-v1.6-mistral-7b-Q8_0.llamafile) |
+| [gpt-oss 20b](https://huggingface.co/openai/gpt-oss-20b) mxfp4 | 12 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | [gpt-oss-20b-mxfp4.llamafile](https://huggingface.co/mozilla-ai/llamafile_0.10/resolve/main/gpt-oss-20b-mxfp4.llamafile) |
+| [gpt-oss 20b](https://huggingface.co/openai/gpt-oss-20b) Q5_K_S | 12 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | [gpt-oss-20b-Q5_K_S.llamafile](https://huggingface.co/mozilla-ai/llamafile_0.10/resolve/main/gpt-oss-20b-Q5_K_S.llamafile) |
+| [LFM2 24B A2B](https://huggingface.co/LiquidAI/LFM2-24B-A2B) Q5_K_M | 16 GB | [lfm1.0](https://huggingface.co/LiquidAI/LFM2-24B-A2B/blob/main/LICENSE) | [LFM2-24B-A2B-Q5_K_M.llamafile](https://huggingface.co/mozilla-ai/llamafile_0.10/resolve/main/LFM2-24B-A2B-Q5_K_M.llamafile) |
+| [Qwen3.5 27B](https://huggingface.co/Qwen/Qwen3.5-27B) Q5_K_S | 19 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | [Qwen3.5-27B-Q5_K_S.llamafile](https://huggingface.co/mozilla-ai/llamafile_0.10/resolve/main/Qwen3.5-27B-Q5_K_S.llamafile) |
 
 ## Legacy llamafiles
 
diff --git a/docs/index.md b/docs/index.md
index 9574237661..8b514f7b58 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -24,7 +24,7 @@ locally on most operating systems and CPU archiectures, with no installation.
 llamafile also includes **[whisperfile](whisperfile/index.md)**, a single-file speech-to-text tool built on [whisper.cpp](https://github.com/ggerganov/whisper.cpp) and the same Cosmopolitan packaging. It supports transcription and translation of audio files across all the same platforms, with no installation required.
 
 
-## v0.10.0
+## v0.10.*
 
 **llamafile versions starting from 0.10.0 use a new build system**, aimed at keeping our code more easily 
 aligned with the latest versions of llama.cpp. This means they support more recent models and functionalities,
@@ -32,7 +32,7 @@ but at the same time they might be missing some of
 the features you were accustomed to (check out [this doc](https://github.com/mozilla-ai/llamafile/blob/main/README_0.10.0.md) for a high-level description of what has been done). If you liked
 the "classic experience" more, you will always be able to access the previous versions from our
 [releases](https://github.com/mozilla-ai/llamafile/releases) page. Our pre-built llamafiles always
-show which version of the server they have been bundled with ([0.9.* example](https://huggingface.co/mozilla-ai/llava-v1.5-7b-llamafile), [0.10.* example](https://huggingface.co/mozilla-ai/llamafile_0.10.0)), so you will always know
+show which version of the server they have been bundled with ([0.9.* example](https://huggingface.co/mozilla-ai/llava-v1.5-7b-llamafile), [0.10.* example](https://huggingface.co/mozilla-ai/llamafile_0.10)), so you will always know
 which version of the software you are downloading.
 
 
diff --git a/docs/quickstart.md b/docs/quickstart.md
index ae77dd886a..f7e22654b1 100644
--- a/docs/quickstart.md
+++ b/docs/quickstart.md
@@ -14,7 +14,7 @@ other hand, you have powerful hardware and/or GPUs, [feel free to choose](exampl
 larger and more expressive models which should provide more accurate
 responses.
 
-1. Download [Qwen3.5-0.8B-Q8_0.llamafile](https://huggingface.co/mozilla-ai/llamafile_0.10.0/resolve/main/Qwen3.5-0.8B-Q8_0.llamafile) (1.77 GB).
+1. Download [Qwen3.5-0.8B-Q8_0.llamafile](https://huggingface.co/mozilla-ai/llamafile_0.10/resolve/main/Qwen3.5-0.8B-Q8_0.llamafile) (1.77 GB).
 
 2. Open your computer's terminal.
 
@@ -175,7 +175,7 @@ enable you to work around Windows' 4GB executable file size limit.
 For Windows users, here's an example for the gpt-oss LLM (whose size is >12GB):
 
 ```sh
-curl -L -o llamafile.exe https://huggingface.co/mozilla-ai/llamafile_0.10.0/resolve/main/llamafile_0.10.0
+curl -L -o llamafile.exe https://huggingface.co/mozilla-ai/llamafile_0.10/resolve/main/llamafile_0.10.1
 curl -L -o gpt-oss.gguf https://huggingface.co/unsloth/gpt-oss-20b-GGUF/resolve/main/gpt-oss-20b-Q5_K_S.gguf
 ./llamafile.exe -m gpt-oss.gguf
 ```
@@ -207,4 +207,4 @@ cd ~/.ollama/models/blobs
 llamafile -m sha256-00e1317cbf74d901080d7100f57580ba8dd8de57203072dc6f668324ba545f29
 ```
 **Note** that Ollama's GGUF weights do not always work with llama.cpp (see e.g. [here](https://forums.developer.nvidia.com/t/nemotron-3-super-120b-on-gb10-llama-cpp-sm-121-build-ollama-gguf-incompatibility-fix/363459)), 
-and as llamafile relies on llama.cpp this trick might not always work for you.
\ No newline at end of file
+and as llamafile relies on llama.cpp this trick might not always work for you.
diff --git a/docs/support.md b/docs/support.md
index 784fec2fab..28fc419940 100644
--- a/docs/support.md
+++ b/docs/support.md
@@ -81,6 +81,6 @@ In the event that GPU support couldn't be compiled and dynamically
 linked on the fly for any reason, llamafile will fall back to CPU
 inference.
 
-**NOTE** that the 0.10.0 build of llamafile has not been tested on all
+**NOTE** that the 0.10.* build of llamafile has not been tested on all
 GPUs/platforms yet, so we welcome your feedback both whether there are
-any issues or if everything runs smoothly on your specific setup!
\ No newline at end of file
+any issues or if everything runs smoothly on your specific setup!
diff --git a/llamafile/version.h b/llamafile/version.h
index 91429ea53b..bfbf3c78c4 100644
--- a/llamafile/version.h
+++ b/llamafile/version.h
@@ -20,7 +20,7 @@
 
 #define LLAMAFILE_MAJOR 0
 #define LLAMAFILE_MINOR 10
-#define LLAMAFILE_PATCH 0
+#define LLAMAFILE_PATCH 1
 #define LLAMAFILE_VERSION \
     (100000000 * LLAMAFILE_MAJOR + 1000000 * LLAMAFILE_MINOR + LLAMAFILE_PATCH)
 

From bb8158115d45b7e254f9e056a26630158ac677e5 Mon Sep 17 00:00:00 2001
From: Davide Eynard <davide.eynard@gmail.com>
Date: Fri, 1 May 2026 10:47:13 +0000
Subject: [PATCH 2/3] Added probe for GPU libs

---
 llamafile/cuda.c | 76 +++++++++++++++++++++++++++++++++++-------------
 1 file changed, 55 insertions(+), 21 deletions(-)

diff --git a/llamafile/cuda.c b/llamafile/cuda.c
index e2fc017fa2..1e53865391 100644
--- a/llamafile/cuda.c
+++ b/llamafile/cuda.c
@@ -158,6 +158,55 @@ static bool LinkCuda(const char *dso) {
     return true;
 }
 
+static void UnlinkCuda(void) {  // Close the loaded GPU DSO, if any, and clear the entry-point slots below.
+    if (g_cuda.lib_handle) {
+        cosmo_dlclose(g_cuda.lib_handle);
+        g_cuda.lib_handle = NULL;  // a repeated call then skips the close
+    }
+    memset(&g_cuda.backend_init, 0, sizeof(g_cuda.backend_init));
+    memset(&g_cuda.backend_reg, 0, sizeof(g_cuda.backend_reg));  // zeroes the whole slot, not one ABI member
+    memset(&g_cuda.get_device_count, 0, sizeof(g_cuda.get_device_count));
+    memset(&g_cuda.get_device_description, 0, sizeof(g_cuda.get_device_description));
+    memset(&g_cuda.log_set, 0, sizeof(g_cuda.log_set));  // so default_abi/windows_abi checks see it as unset
+}
+
+static bool TryGpuBackend(const char *dso, bool is_amd) {  // Load dso, probe it for devices, record is_amd on success.
+    if (!llamafile_try_load_prebuilt_dso(dso, "cuda", LinkCuda))
+        return false;  // NOTE(review): no UnlinkCuda here; assumes a failed load leaves g_cuda clean - confirm
+
+    // Suppress the DSO's ggml logging before we touch any function that
+    // triggers ggml_cuda_init() (e.g. get_device_count). Without this, a
+    // failed init on the wrong backend would print a confusing error to
+    // stderr even when --verbose is not set.
+    if (!FLAG_verbose && (g_cuda.log_set.default_abi || g_cuda.log_set.windows_abi)) {
+        if (IsWindows())  // call through the member that matches the host ABI
+            g_cuda.log_set.windows_abi(llamafile_log_callback_null, NULL);
+        else
+            g_cuda.log_set.default_abi(llamafile_log_callback_null, NULL);
+    }
+
+    // Verify the backend has at least one device before committing. The DSO
+    // loads fine even when no compatible hardware is present, so we must
+    // probe device count to avoid registering a 0-device backend (which
+    // would then prevent fallback to other GPU backends in AUTO mode).
+    if (g_cuda.get_device_count.default_abi || g_cuda.get_device_count.windows_abi) {
+        int count;
+        if (IsWindows())
+            count = g_cuda.get_device_count.windows_abi();
+        else
+            count = g_cuda.get_device_count.default_abi();
+        if (count <= 0) {
+            llamafile_info("cuda", "%s library loaded but no devices detected; trying next backend",
+                           is_amd ? "ROCm" : "CUDA");
+            UnlinkCuda();
+            return false;
+        }
+    }
+
+    g_cuda.is_amd = is_amd;
+    return true;  // ImportCudaImpl jumps to RegisterBackend on this result
+}
+
 static bool ImportCudaImpl(void) {
     // Skip on Apple Silicon (use Metal instead)
     if (IsXnuSilicon()) {
@@ -168,9 +217,7 @@ static bool ImportCudaImpl(void) {
     switch (FLAG_gpu) {
     case LLAMAFILE_GPU_AUTO:
     case LLAMAFILE_GPU_NVIDIA:
-        break;
     case LLAMAFILE_GPU_AMD:
-        g_cuda.is_amd = true;
         break;
     default:
         return false;
@@ -183,19 +230,16 @@ static bool ImportCudaImpl(void) {
     snprintf(cuda_dso, sizeof(cuda_dso), "ggml-cuda.%s", ext);
     snprintf(rocm_dso, sizeof(rocm_dso), "ggml-rocm.%s", ext);
 
-    // Try to load pre-built DSO
-    if (FLAG_gpu == LLAMAFILE_GPU_AMD || FLAG_gpu == LLAMAFILE_GPU_AUTO) {
-        if (llamafile_try_load_prebuilt_dso(rocm_dso, "cuda", LinkCuda)) {
-            g_cuda.is_amd = true;
+    // In AUTO mode, prefer CUDA over ROCm: it covers the common NVIDIA case
+    // and lets ROCm be the fallback when CUDA is absent or has no devices.
+    if (FLAG_gpu == LLAMAFILE_GPU_NVIDIA || FLAG_gpu == LLAMAFILE_GPU_AUTO) {
+        if (TryGpuBackend(cuda_dso, false))
             goto RegisterBackend;
-        }
     }
 
-    if (FLAG_gpu == LLAMAFILE_GPU_NVIDIA || FLAG_gpu == LLAMAFILE_GPU_AUTO) {
-        if (llamafile_try_load_prebuilt_dso(cuda_dso, "cuda", LinkCuda)) {
-            g_cuda.is_amd = false;
+    if (FLAG_gpu == LLAMAFILE_GPU_AMD || FLAG_gpu == LLAMAFILE_GPU_AUTO) {
+        if (TryGpuBackend(rocm_dso, true))
             goto RegisterBackend;
-        }
     }
 
     // No pre-built DSO found
@@ -206,16 +250,6 @@ static bool ImportCudaImpl(void) {
     return false;
 
 RegisterBackend:
-    // Suppress DSO's ggml logging before backend registration, which triggers
-    // ggml_cuda_init() inside the DSO. Without this, CUDA device enumeration
-    // messages appear even when --verbose is not set.
-    if (!FLAG_verbose && (g_cuda.log_set.default_abi || g_cuda.log_set.windows_abi)) {
-        if (IsWindows())
-            g_cuda.log_set.windows_abi(llamafile_log_callback_null, NULL);
-        else
-            g_cuda.log_set.default_abi(llamafile_log_callback_null, NULL);
-    }
-
     // Register the CUDA backend with GGML
     if (g_cuda.backend_reg.default_abi || g_cuda.backend_reg.windows_abi) {
         ggml_backend_reg_t reg;

From e398c0291c75d2c9ee0d0079b9b403a346faca57 Mon Sep 17 00:00:00 2001
From: Davide Eynard <davide.eynard@gmail.com>
Date: Mon, 4 May 2026 11:04:03 +0100
Subject: [PATCH 3/3] Addressed PR review

---
 llamafile/cuda.c | 39 ++++++++++++++++++---------------------
 1 file changed, 18 insertions(+), 21 deletions(-)

diff --git a/llamafile/cuda.c b/llamafile/cuda.c
index 1e53865391..e5dc4f27a9 100644
--- a/llamafile/cuda.c
+++ b/llamafile/cuda.c
@@ -120,8 +120,9 @@ static bool LinkCuda(const char *dso) {
     else
         *(void **)(&g_cuda.backend_reg.default_abi) = sym;
 
-    // Optional - don't fail if not found
+    // Required: called unguarded by TryGpuBackend (0-device rejection) and ImportCuda
     sym = cosmo_dlsym(lib, "ggml_backend_cuda_get_device_count");
+    ok &= (sym != NULL);
     if (IsWindows())
         *(void **)(&g_cuda.get_device_count.windows_abi) = sym;
     else
@@ -189,18 +190,16 @@ static bool TryGpuBackend(const char *dso, bool is_amd) {
     // loads fine even when no compatible hardware is present, so we must
     // probe device count to avoid registering a 0-device backend (which
     // would then prevent fallback to other GPU backends in AUTO mode).
-    if (g_cuda.get_device_count.default_abi || g_cuda.get_device_count.windows_abi) {
-        int count;
-        if (IsWindows())
-            count = g_cuda.get_device_count.windows_abi();
-        else
-            count = g_cuda.get_device_count.default_abi();
-        if (count <= 0) {
-            llamafile_info("cuda", "%s library loaded but no devices detected; trying next backend",
-                           is_amd ? "ROCm" : "CUDA");
-            UnlinkCuda();
-            return false;
-        }
+    int count;
+    if (IsWindows())
+        count = g_cuda.get_device_count.windows_abi();
+    else
+        count = g_cuda.get_device_count.default_abi();
+    if (count <= 0) {
+        llamafile_info("cuda", "%s library loaded but no devices detected; trying next backend",
+                       is_amd ? "ROCm" : "CUDA");
+        UnlinkCuda();
+        return false;
     }
 
     g_cuda.is_amd = is_amd;
@@ -272,14 +271,12 @@ static void ImportCuda(void) {
         g_cuda.supported = true;
         llamafile_info("cuda", "%s GPU support successfully loaded",
                        g_cuda.is_amd ? "AMD ROCm" : "NVIDIA CUDA");
-        if (g_cuda.get_device_count.default_abi || g_cuda.get_device_count.windows_abi) {
-            int count;
-            if (IsWindows())
-                count = g_cuda.get_device_count.windows_abi();
-            else
-                count = g_cuda.get_device_count.default_abi();
-            llamafile_info("cuda", "found %d GPU device(s)", count);
-        }
+        int count;
+        if (IsWindows())
+            count = g_cuda.get_device_count.windows_abi();
+        else
+            count = g_cuda.get_device_count.default_abi();
+        llamafile_info("cuda", "found %d GPU device(s)", count);
     } else if (FLAG_gpu == LLAMAFILE_GPU_NVIDIA || FLAG_gpu == LLAMAFILE_GPU_AMD) {
         fprintf(stderr, "fatal error: support for --gpu %s was explicitly requested, "
                 "but it wasn't available\n", llamafile_describe_gpu());