From c250d5c1448b4b9919bb580223081abec707ee1f Mon Sep 17 00:00:00 2001 From: Davide Eynard Date: Thu, 30 Apr 2026 11:26:38 +0000 Subject: [PATCH 1/3] New release version: update version.h and docs --- README.md | 6 +++--- docs/example_llamafiles.md | 28 ++++++++++++++-------------- docs/index.md | 4 ++-- docs/quickstart.md | 6 +++--- docs/support.md | 4 ++-- llamafile/version.h | 2 +- 6 files changed, 25 insertions(+), 25 deletions(-) diff --git a/README.md b/README.md index 5ce0fcf504..57ff701b94 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,7 @@ locally on most operating systems and CPU archiectures, with no installation. llamafile also includes **[whisperfile](https://docs.mozilla.ai/llamafile/whisperfile)**, a single-file speech-to-text tool built on [whisper.cpp](https://github.com/ggerganov/whisper.cpp) and the same Cosmopolitan packaging. It supports transcription and translation of audio files across all the same platforms, with no installation required. -## v0.10.0 +## v0.10.* **llamafile versions starting from 0.10.0 use a new build system**, aimed at keeping our code more easily aligned with the latest versions of llama.cpp. This means they support more recent models and functionalities, @@ -32,7 +32,7 @@ but at the same time they might be missing some of the features you were accustomed to (check out [this doc](README_0.10.0.md) for a high-level description of what has been done). If you liked the "classic experience" more, you will always be able to access the previous versions from our [releases](https://github.com/mozilla-ai/llamafile/releases) page. 
Our pre-built llamafiles always -show which version of the server they have been bundled with ([0.9.* example](https://huggingface.co/mozilla-ai/llava-v1.5-7b-llamafile), [0.10.* example](https://huggingface.co/mozilla-ai/llamafile_0.10.0)), so you will always know +show which version of the server they have been bundled with ([0.9.* example](https://huggingface.co/mozilla-ai/llava-v1.5-7b-llamafile), [0.10.* example](https://huggingface.co/mozilla-ai/llamafile_0.10)), so you will always know which version of the software you are downloading. @@ -47,7 +47,7 @@ Download and run your first llamafile in minutes: ```sh # Download an example model (Qwen3.5 0.8B) -curl -LO https://huggingface.co/mozilla-ai/llamafile_0.10.0/resolve/main/Qwen3.5-0.8B-Q8_0.llamafile +curl -LO https://huggingface.co/mozilla-ai/llamafile_0.10/resolve/main/Qwen3.5-0.8B-Q8_0.llamafile # Make it executable (macOS/Linux/BSD) chmod +x Qwen3.5-0.8B-Q8_0.llamafile diff --git a/docs/example_llamafiles.md b/docs/example_llamafiles.md index eb08800170..519e107b41 100644 --- a/docs/example_llamafiles.md +++ b/docs/example_llamafiles.md @@ -1,24 +1,24 @@ We provide example llamafiles for a variety of models, so you can easily try out llamafile with different kinds of LLMs. The following table lists llamafiles bundled with the latest -available version of the server (v0.10.0). The smaller the file is, the more easily it will +available version of the server (v0.10.*). The smaller the file is, the more easily it will run on your computer, even if no GPU is present (as a reference, Qwen3.5 0.8B Q8 generates text on a Raspberry Pi5 at ~8 tokens/sec). 
| Model | Size | License | llamafile | | --- | --- | --- | --- | -| [Qwen3.5 0.8B](https://huggingface.co/Qwen/Qwen3.5-0.8B) Q8_0 | 1.6 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | [Qwen3.5-0.8B-Q8_0.llamafile](https://huggingface.co/mozilla-ai/llamafile_0.10.0/resolve/main/Qwen3.5-0.8B-Q8_0.llamafile) | -| [Qwen3.5 2B](https://huggingface.co/Qwen/Qwen3.5-2B) Q8_0 | 3.2 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | [Qwen3.5-2B-Q8_0.llamafile](https://huggingface.co/mozilla-ai/llamafile_0.10.0/resolve/main/Qwen3.5-2B-Q8_0.llamafile) | -| [Ministral 3 3B Instruct 2512](https://huggingface.co/mistralai/Ministral-3-3B-Instruct-2512) Q4_K_M | 3.4 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | [Ministral-3-3B-Instruct-2512-Q4_K_M.llamafile](https://huggingface.co/mozilla-ai/llamafile_0.10.0/resolve/main/Ministral-3-3B-Instruct-2512-Q4_K_M.llamafile) | -| [Qwen3.5 4B](https://huggingface.co/Qwen/Qwen3.5-4B) Q5_K_S | 4.1 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | [Qwen3.5-4B-Q5_K_S.llamafile](https://huggingface.co/mozilla-ai/llamafile_0.10.0/resolve/main/Qwen3.5-4B-Q5_K_S.llamafile) | -| [llava v1.6 mistral 7b](https://huggingface.co/liuhaotian/llava-v1.6-mistral-7b) Q4_K_M | 5.3 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | [llava-v1.6-mistral-7b-Q4_K_M.llamafile](https://huggingface.co/mozilla-ai/llamafile_0.10.0/resolve/main/llava-v1.6-mistral-7b-Q4_K_M.llamafile) | -| [Apertus 8B Instruct 2509](https://huggingface.co/swiss-ai/Apertus-8B-Instruct-2509) | 5.9 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | [Apertus-8B-Instruct-2509.llamafile](https://huggingface.co/mozilla-ai/llamafile_0.10.0/resolve/main/Apertus-8B-Instruct-2509.llamafile) | -| [Qwen3.5 9B](https://huggingface.co/Qwen/Qwen3.5-9B) Q5_K_S | 7.4 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | 
[Qwen3.5-9B-Q5_K_S.llamafile](https://huggingface.co/mozilla-ai/llamafile_0.10.0/resolve/main/Qwen3.5-9B-Q5_K_S.llamafile) | -| [Ministral 3 3B Instruct 2512](https://huggingface.co/mistralai/Ministral-3-3B-Instruct-2512) BF16 | 7.8 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | [Ministral-3-3B-Instruct-2512-BF16.llamafile](https://huggingface.co/mozilla-ai/llamafile_0.10.0/resolve/main/Ministral-3-3B-Instruct-2512-BF16.llamafile) | -| [llava v1.6 mistral 7b](https://huggingface.co/liuhaotian/llava-v1.6-mistral-7b) Q8_0 | 8.4 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | [llava-v1.6-mistral-7b-Q8_0.llamafile](https://huggingface.co/mozilla-ai/llamafile_0.10.0/resolve/main/llava-v1.6-mistral-7b-Q8_0.llamafile) | -| [gpt-oss 20b](https://huggingface.co/openai/gpt-oss-20b) mxfp4 | 12 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | [gpt-oss-20b-mxfp4.llamafile](https://huggingface.co/mozilla-ai/llamafile_0.10.0/resolve/main/gpt-oss-20b-mxfp4.llamafile) | -| [gpt-oss 20b](https://huggingface.co/openai/gpt-oss-20b) Q5_K_S | 12 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | [gpt-oss-20b-Q5_K_S.llamafile](https://huggingface.co/mozilla-ai/llamafile_0.10.0/resolve/main/gpt-oss-20b-Q5_K_S.llamafile) | -| [LFM2 24B A2B](https://huggingface.co/LiquidAI/LFM2-24B-A2B) Q5_K_M | 16 GB | [lfm1.0](https://huggingface.co/LiquidAI/LFM2-24B-A2B/blob/main/LICENSE) | [LFM2-24B-A2B-Q5_K_M.llamafile](https://huggingface.co/mozilla-ai/llamafile_0.10.0/resolve/main/LFM2-24B-A2B-Q5_K_M.llamafile) | -| [Qwen3.5 27B](https://huggingface.co/Qwen/Qwen3.5-27B) Q5_K_S | 19 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | [Qwen3.5-27B-Q5_K_S.llamafile](https://huggingface.co/mozilla-ai/llamafile_0.10.0/resolve/main/Qwen3.5-27B-Q5_K_S.llamafile) | +| [Qwen3.5 0.8B](https://huggingface.co/Qwen/Qwen3.5-0.8B) Q8_0 | 1.6 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | 
[Qwen3.5-0.8B-Q8_0.llamafile](https://huggingface.co/mozilla-ai/llamafile_0.10/resolve/main/Qwen3.5-0.8B-Q8_0.llamafile) | +| [Qwen3.5 2B](https://huggingface.co/Qwen/Qwen3.5-2B) Q8_0 | 3.2 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | [Qwen3.5-2B-Q8_0.llamafile](https://huggingface.co/mozilla-ai/llamafile_0.10/resolve/main/Qwen3.5-2B-Q8_0.llamafile) | +| [Ministral 3 3B Instruct 2512](https://huggingface.co/mistralai/Ministral-3-3B-Instruct-2512) Q4_K_M | 3.4 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | [Ministral-3-3B-Instruct-2512-Q4_K_M.llamafile](https://huggingface.co/mozilla-ai/llamafile_0.10/resolve/main/Ministral-3-3B-Instruct-2512-Q4_K_M.llamafile) | +| [Qwen3.5 4B](https://huggingface.co/Qwen/Qwen3.5-4B) Q5_K_S | 4.1 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | [Qwen3.5-4B-Q5_K_S.llamafile](https://huggingface.co/mozilla-ai/llamafile_0.10/resolve/main/Qwen3.5-4B-Q5_K_S.llamafile) | +| [llava v1.6 mistral 7b](https://huggingface.co/liuhaotian/llava-v1.6-mistral-7b) Q4_K_M | 5.3 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | [llava-v1.6-mistral-7b-Q4_K_M.llamafile](https://huggingface.co/mozilla-ai/llamafile_0.10/resolve/main/llava-v1.6-mistral-7b-Q4_K_M.llamafile) | +| [Apertus 8B Instruct 2509](https://huggingface.co/swiss-ai/Apertus-8B-Instruct-2509) | 5.9 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | [Apertus-8B-Instruct-2509.llamafile](https://huggingface.co/mozilla-ai/llamafile_0.10/resolve/main/Apertus-8B-Instruct-2509.llamafile) | +| [Qwen3.5 9B](https://huggingface.co/Qwen/Qwen3.5-9B) Q5_K_S | 7.4 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | [Qwen3.5-9B-Q5_K_S.llamafile](https://huggingface.co/mozilla-ai/llamafile_0.10/resolve/main/Qwen3.5-9B-Q5_K_S.llamafile) | +| [Ministral 3 3B Instruct 2512](https://huggingface.co/mistralai/Ministral-3-3B-Instruct-2512) BF16 | 7.8 GB | [Apache 
2.0](https://choosealicense.com/licenses/apache-2.0/) | [Ministral-3-3B-Instruct-2512-BF16.llamafile](https://huggingface.co/mozilla-ai/llamafile_0.10/resolve/main/Ministral-3-3B-Instruct-2512-BF16.llamafile) | +| [llava v1.6 mistral 7b](https://huggingface.co/liuhaotian/llava-v1.6-mistral-7b) Q8_0 | 8.4 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | [llava-v1.6-mistral-7b-Q8_0.llamafile](https://huggingface.co/mozilla-ai/llamafile_0.10/resolve/main/llava-v1.6-mistral-7b-Q8_0.llamafile) | +| [gpt-oss 20b](https://huggingface.co/openai/gpt-oss-20b) mxfp4 | 12 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | [gpt-oss-20b-mxfp4.llamafile](https://huggingface.co/mozilla-ai/llamafile_0.10/resolve/main/gpt-oss-20b-mxfp4.llamafile) | +| [gpt-oss 20b](https://huggingface.co/openai/gpt-oss-20b) Q5_K_S | 12 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | [gpt-oss-20b-Q5_K_S.llamafile](https://huggingface.co/mozilla-ai/llamafile_0.10/resolve/main/gpt-oss-20b-Q5_K_S.llamafile) | +| [LFM2 24B A2B](https://huggingface.co/LiquidAI/LFM2-24B-A2B) Q5_K_M | 16 GB | [lfm1.0](https://huggingface.co/LiquidAI/LFM2-24B-A2B/blob/main/LICENSE) | [LFM2-24B-A2B-Q5_K_M.llamafile](https://huggingface.co/mozilla-ai/llamafile_0.10/resolve/main/LFM2-24B-A2B-Q5_K_M.llamafile) | +| [Qwen3.5 27B](https://huggingface.co/Qwen/Qwen3.5-27B) Q5_K_S | 19 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | [Qwen3.5-27B-Q5_K_S.llamafile](https://huggingface.co/mozilla-ai/llamafile_0.10/resolve/main/Qwen3.5-27B-Q5_K_S.llamafile) | ## Legacy llamafiles diff --git a/docs/index.md b/docs/index.md index 9574237661..8b514f7b58 100644 --- a/docs/index.md +++ b/docs/index.md @@ -24,7 +24,7 @@ locally on most operating systems and CPU archiectures, with no installation. 
llamafile also includes **[whisperfile](whisperfile/index.md)**, a single-file speech-to-text tool built on [whisper.cpp](https://github.com/ggerganov/whisper.cpp) and the same Cosmopolitan packaging. It supports transcription and translation of audio files across all the same platforms, with no installation required. -## v0.10.0 +## v0.10.* **llamafile versions starting from 0.10.0 use a new build system**, aimed at keeping our code more easily aligned with the latest versions of llama.cpp. This means they support more recent models and functionalities, @@ -32,7 +32,7 @@ but at the same time they might be missing some of the features you were accustomed to (check out [this doc](https://github.com/mozilla-ai/llamafile/blob/main/README_0.10.0.md) for a high-level description of what has been done). If you liked the "classic experience" more, you will always be able to access the previous versions from our [releases](https://github.com/mozilla-ai/llamafile/releases) page. Our pre-built llamafiles always -show which version of the server they have been bundled with ([0.9.* example](https://huggingface.co/mozilla-ai/llava-v1.5-7b-llamafile), [0.10.* example](https://huggingface.co/mozilla-ai/llamafile_0.10.0)), so you will always know +show which version of the server they have been bundled with ([0.9.* example](https://huggingface.co/mozilla-ai/llava-v1.5-7b-llamafile), [0.10.* example](https://huggingface.co/mozilla-ai/llamafile_0.10)), so you will always know which version of the software you are downloading. diff --git a/docs/quickstart.md b/docs/quickstart.md index ae77dd886a..f7e22654b1 100644 --- a/docs/quickstart.md +++ b/docs/quickstart.md @@ -14,7 +14,7 @@ other hand, you have powerful hardware and/or GPUs, [feel free to choose](exampl larger and more expressive models which should provide more accurate responses. -1. Download [Qwen3.5-0.8B-Q8_0.llamafile](https://huggingface.co/mozilla-ai/llamafile_0.10.0/resolve/main/Qwen3.5-0.8B-Q8_0.llamafile) (1.77 GB). 
+1. Download [Qwen3.5-0.8B-Q8_0.llamafile](https://huggingface.co/mozilla-ai/llamafile_0.10/resolve/main/Qwen3.5-0.8B-Q8_0.llamafile) (1.77 GB). 2. Open your computer's terminal. @@ -175,7 +175,7 @@ enable you to work around Windows' 4GB executable file size limit. For Windows users, here's an example for the gpt-oss LLM (whose size is >12GB): ```sh -curl -L -o llamafile.exe https://huggingface.co/mozilla-ai/llamafile_0.10.0/resolve/main/llamafile_0.10.0 +curl -L -o llamafile.exe https://huggingface.co/mozilla-ai/llamafile_0.10/resolve/main/llamafile_0.10.1 curl -L -o gpt-oss.gguf https://huggingface.co/unsloth/gpt-oss-20b-GGUF/resolve/main/gpt-oss-20b-Q5_K_S.gguf ./llamafile.exe -m gpt-oss.gguf ``` @@ -207,4 +207,4 @@ cd ~/.ollama/models/blobs llamafile -m sha256-00e1317cbf74d901080d7100f57580ba8dd8de57203072dc6f668324ba545f29 ``` **Note** that Ollama's GGUF weights do not always work with llama.cpp (see e.g. [here](https://forums.developer.nvidia.com/t/nemotron-3-super-120b-on-gb10-llama-cpp-sm-121-build-ollama-gguf-incompatibility-fix/363459)), -and as llamafile relies on llama.cpp this trick might not always work for you. \ No newline at end of file +and as llamafile relies on llama.cpp this trick might not always work for you. diff --git a/docs/support.md b/docs/support.md index 784fec2fab..28fc419940 100644 --- a/docs/support.md +++ b/docs/support.md @@ -81,6 +81,6 @@ In the event that GPU support couldn't be compiled and dynamically linked on the fly for any reason, llamafile will fall back to CPU inference. -**NOTE** that the 0.10.0 build of llamafile has not been tested on all +**NOTE** that 0.10.* builds of llamafile have not been tested on all GPUs/platforms yet, so we welcome your feedback both whether there are -any issues or if everything runs smoothly on your specific setup! \ No newline at end of file +any issues or if everything runs smoothly on your specific setup! 
diff --git a/llamafile/version.h b/llamafile/version.h index 91429ea53b..bfbf3c78c4 100644 --- a/llamafile/version.h +++ b/llamafile/version.h @@ -20,7 +20,7 @@ #define LLAMAFILE_MAJOR 0 #define LLAMAFILE_MINOR 10 -#define LLAMAFILE_PATCH 0 +#define LLAMAFILE_PATCH 1 #define LLAMAFILE_VERSION \ (100000000 * LLAMAFILE_MAJOR + 1000000 * LLAMAFILE_MINOR + LLAMAFILE_PATCH) From bb8158115d45b7e254f9e056a26630158ac677e5 Mon Sep 17 00:00:00 2001 From: Davide Eynard Date: Fri, 1 May 2026 10:47:13 +0000 Subject: [PATCH 2/3] Added probe for GPU libs --- llamafile/cuda.c | 76 +++++++++++++++++++++++++++++++++++------------- 1 file changed, 55 insertions(+), 21 deletions(-) diff --git a/llamafile/cuda.c b/llamafile/cuda.c index e2fc017fa2..1e53865391 100644 --- a/llamafile/cuda.c +++ b/llamafile/cuda.c @@ -158,6 +158,55 @@ static bool LinkCuda(const char *dso) { return true; } +static void UnlinkCuda(void) { + if (g_cuda.lib_handle) { + cosmo_dlclose(g_cuda.lib_handle); + g_cuda.lib_handle = NULL; + } + memset(&g_cuda.backend_init, 0, sizeof(g_cuda.backend_init)); + memset(&g_cuda.backend_reg, 0, sizeof(g_cuda.backend_reg)); + memset(&g_cuda.get_device_count, 0, sizeof(g_cuda.get_device_count)); + memset(&g_cuda.get_device_description, 0, sizeof(g_cuda.get_device_description)); + memset(&g_cuda.log_set, 0, sizeof(g_cuda.log_set)); +} + +static bool TryGpuBackend(const char *dso, bool is_amd) { + if (!llamafile_try_load_prebuilt_dso(dso, "cuda", LinkCuda)) + return false; + + // Suppress the DSO's ggml logging before we touch any function that + // triggers ggml_cuda_init() (e.g. get_device_count). Without this, a + // failed init on the wrong backend would print a confusing error to + // stderr even when --verbose is not set. 
+ if (!FLAG_verbose && (g_cuda.log_set.default_abi || g_cuda.log_set.windows_abi)) { + if (IsWindows()) + g_cuda.log_set.windows_abi(llamafile_log_callback_null, NULL); + else + g_cuda.log_set.default_abi(llamafile_log_callback_null, NULL); + } + + // Verify the backend has at least one device before committing. The DSO + // loads fine even when no compatible hardware is present, so we must + // probe device count to avoid registering a 0-device backend (which + // would then prevent fallback to other GPU backends in AUTO mode). + if (g_cuda.get_device_count.default_abi || g_cuda.get_device_count.windows_abi) { + int count; + if (IsWindows()) + count = g_cuda.get_device_count.windows_abi(); + else + count = g_cuda.get_device_count.default_abi(); + if (count <= 0) { + llamafile_info("cuda", "%s library loaded but no devices detected; trying next backend", + is_amd ? "ROCm" : "CUDA"); + UnlinkCuda(); + return false; + } + } + + g_cuda.is_amd = is_amd; + return true; +} + static bool ImportCudaImpl(void) { // Skip on Apple Silicon (use Metal instead) if (IsXnuSilicon()) { @@ -168,9 +217,7 @@ static bool ImportCudaImpl(void) { switch (FLAG_gpu) { case LLAMAFILE_GPU_AUTO: case LLAMAFILE_GPU_NVIDIA: - break; case LLAMAFILE_GPU_AMD: - g_cuda.is_amd = true; break; default: return false; @@ -183,19 +230,16 @@ static bool ImportCudaImpl(void) { snprintf(cuda_dso, sizeof(cuda_dso), "ggml-cuda.%s", ext); snprintf(rocm_dso, sizeof(rocm_dso), "ggml-rocm.%s", ext); - // Try to load pre-built DSO - if (FLAG_gpu == LLAMAFILE_GPU_AMD || FLAG_gpu == LLAMAFILE_GPU_AUTO) { - if (llamafile_try_load_prebuilt_dso(rocm_dso, "cuda", LinkCuda)) { - g_cuda.is_amd = true; + // In AUTO mode, prefer CUDA over ROCm: it covers the common NVIDIA case + // and lets ROCm be the fallback when CUDA is absent or has no devices. 
+ if (FLAG_gpu == LLAMAFILE_GPU_NVIDIA || FLAG_gpu == LLAMAFILE_GPU_AUTO) { + if (TryGpuBackend(cuda_dso, false)) goto RegisterBackend; - } } - if (FLAG_gpu == LLAMAFILE_GPU_NVIDIA || FLAG_gpu == LLAMAFILE_GPU_AUTO) { - if (llamafile_try_load_prebuilt_dso(cuda_dso, "cuda", LinkCuda)) { - g_cuda.is_amd = false; + if (FLAG_gpu == LLAMAFILE_GPU_AMD || FLAG_gpu == LLAMAFILE_GPU_AUTO) { + if (TryGpuBackend(rocm_dso, true)) goto RegisterBackend; - } } // No pre-built DSO found @@ -206,16 +250,6 @@ static bool ImportCudaImpl(void) { return false; RegisterBackend: - // Suppress DSO's ggml logging before backend registration, which triggers - // ggml_cuda_init() inside the DSO. Without this, CUDA device enumeration - // messages appear even when --verbose is not set. - if (!FLAG_verbose && (g_cuda.log_set.default_abi || g_cuda.log_set.windows_abi)) { - if (IsWindows()) - g_cuda.log_set.windows_abi(llamafile_log_callback_null, NULL); - else - g_cuda.log_set.default_abi(llamafile_log_callback_null, NULL); - } - // Register the CUDA backend with GGML if (g_cuda.backend_reg.default_abi || g_cuda.backend_reg.windows_abi) { ggml_backend_reg_t reg; From e398c0291c75d2c9ee0d0079b9b403a346faca57 Mon Sep 17 00:00:00 2001 From: Davide Eynard Date: Mon, 4 May 2026 11:04:03 +0100 Subject: [PATCH 3/3] Addressed PR review --- llamafile/cuda.c | 39 ++++++++++++++++++--------------------- 1 file changed, 18 insertions(+), 21 deletions(-) diff --git a/llamafile/cuda.c b/llamafile/cuda.c index 1e53865391..e5dc4f27a9 100644 --- a/llamafile/cuda.c +++ b/llamafile/cuda.c @@ -120,8 +120,9 @@ static bool LinkCuda(const char *dso) { else *(void **)(&g_cuda.backend_reg.default_abi) = sym; - // Optional - don't fail if not found + // Required: TryGpuBackend uses this to reject 0-device DSOs sym = cosmo_dlsym(lib, "ggml_backend_cuda_get_device_count"); + ok &= (sym != NULL); if (IsWindows()) *(void **)(&g_cuda.get_device_count.windows_abi) = sym; else @@ -189,18 +190,16 @@ static bool 
TryGpuBackend(const char *dso, bool is_amd) { // loads fine even when no compatible hardware is present, so we must // probe device count to avoid registering a 0-device backend (which // would then prevent fallback to other GPU backends in AUTO mode). - if (g_cuda.get_device_count.default_abi || g_cuda.get_device_count.windows_abi) { - int count; - if (IsWindows()) - count = g_cuda.get_device_count.windows_abi(); - else - count = g_cuda.get_device_count.default_abi(); - if (count <= 0) { - llamafile_info("cuda", "%s library loaded but no devices detected; trying next backend", - is_amd ? "ROCm" : "CUDA"); - UnlinkCuda(); - return false; - } + int count; + if (IsWindows()) + count = g_cuda.get_device_count.windows_abi(); + else + count = g_cuda.get_device_count.default_abi(); + if (count <= 0) { + llamafile_info("cuda", "%s library loaded but no devices detected; trying next backend", + is_amd ? "ROCm" : "CUDA"); + UnlinkCuda(); + return false; } g_cuda.is_amd = is_amd; @@ -272,14 +271,12 @@ static void ImportCuda(void) { g_cuda.supported = true; llamafile_info("cuda", "%s GPU support successfully loaded", g_cuda.is_amd ? "AMD ROCm" : "NVIDIA CUDA"); - if (g_cuda.get_device_count.default_abi || g_cuda.get_device_count.windows_abi) { - int count; - if (IsWindows()) - count = g_cuda.get_device_count.windows_abi(); - else - count = g_cuda.get_device_count.default_abi(); - llamafile_info("cuda", "found %d GPU device(s)", count); - } + int count; + if (IsWindows()) + count = g_cuda.get_device_count.windows_abi(); + else + count = g_cuda.get_device_count.default_abi(); + llamafile_info("cuda", "found %d GPU device(s)", count); } else if (FLAG_gpu == LLAMAFILE_GPU_NVIDIA || FLAG_gpu == LLAMAFILE_GPU_AMD) { fprintf(stderr, "fatal error: support for --gpu %s was explicitly requested, " "but it wasn't available\n", llamafile_describe_gpu());