mozilla-ai · aittalam · Jun 2, 2026 · Jun 2, 2026 · Jun 2, 2026 · Jun 2, 2026
diff --git a/diffusionfile/BUILD.mk b/diffusionfile/BUILD.mk
@@ -76,9 +76,7 @@ DIFFUSIONFILE_CPPFLAGS := \
 
 DIFFUSIONFILE_LLAMAFILE_OBJS := \
 	o/$(MODE)/llamafile/llamafile.o \
-	o/$(MODE)/llamafile/metal.o \
-	o/$(MODE)/llamafile/cuda.o \
-	o/$(MODE)/llamafile/vulkan.o \
+	o/$(MODE)/llamafile/gpu.a \
 	o/$(MODE)/llamafile/zip.o \
 	o/$(MODE)/llamafile/check_cpu.o
 

diff --git a/docs/skills/llamafile/SKILL.md b/docs/skills/llamafile/SKILL.md
@@ -1,7 +1,7 @@
 ---
 name: llamafile
 description: This skill should be used when the user asks to "build llamafile", "rebuild llamafile", "run llamafile", "run llamafile tests", "debug llamafile", "set up llamafile", "update patches", "fix patch conflict", "update llama.cpp", "pull latest llama.cpp", "sync upstream llama.cpp", "reset submodules", "write a test for llamafile", "how does llamafile work", "llamafile architecture", or needs guidance on the llamafile build system, patch workflow, submodule integration, cosmocc toolchain, or development practices.
-version: 0.1.3
+version: 0.1.4
 ---
 
 # Llamafile Development Guide
@@ -120,6 +120,10 @@ Outputs: `o/$(MODE)/package/file.o`
 
 Binaries include both x86_64 and aarch64 code paths with runtime CPU feature detection (AVX, AVX2, AVX-512, ARM NEON).
 
+### GPU Backend Loaders
+
+Dynamically loaded backends that export the ggml C ABI (CUDA, ROCm, Vulkan) all go through the shared probe core in `llamafile/gpu_backend.c`. Each backend is just a `GpuBackendDesc` plus a link thunk. The core runs load → log-suppress → **device-count gate** (reject DSOs that report 0 devices so AUTO falls back) → register, and wraps the foreign probe call in a SIGSEGV/SIGABRT crash guard, because driver init can fault across the cosmo/ms_abi boundary (issue #988). Metal stays separate by design (runtime-compiled, no ms_abi split, no device gate). When adding or changing a backend: route it through the core, keep the gate, and add a case to `tests/gpu_backend_test.cpp`. A more detailed design doc is maintained separately.
+
 ## Main Executables
 
 After building, find binaries in `o/$(MODE)/`:

diff --git a/llama.cpp.patches/llamafile-files/BUILD.mk b/llama.cpp.patches/llamafile-files/BUILD.mk
@@ -426,9 +426,7 @@ UI_GEN_OBJ := $(UI_CPP_GEN:%.cpp=%.cpp.o)
 # llamafile objects are used to add dynamic GPU support (Metal, CUDA, ROCm, Vulkan)
 TOOL_LLAMAFILE_OBJS := \
 	o/$(MODE)/llamafile/llamafile.o \
-	o/$(MODE)/llamafile/metal.o \
-	o/$(MODE)/llamafile/cuda.o \
-	o/$(MODE)/llamafile/vulkan.o \
+	o/$(MODE)/llamafile/gpu.a \
 	o/$(MODE)/llamafile/zip.o
 
 # Server objects depend on the llamafile bridge header and on the

diff --git a/llamafile/BUILD.mk b/llamafile/BUILD.mk
@@ -121,6 +121,7 @@ LLAMAFILE_HIGHLIGHT_SRCS := \
 LLAMAFILE_SRCS_C := \
 	llamafile/bestline.c \
 	llamafile/cuda.c \
+	llamafile/gpu_backend.c \
 	llamafile/llamafile.c \
 	llamafile/metal.c \
 	llamafile/vulkan.c \
@@ -334,6 +335,24 @@ o/$(MODE)/llamafile/%.o: llamafile/%.c
 	@mkdir -p $(@D)
 	$(CC) $(CFLAGS) $(LLAMAFILE_CPPFLAGS) -c -o $@ $<
 
+# ==============================================================================
+# GPU backend archive
+# ==============================================================================
+# One archive bundling the runtime GPU loaders (CUDA/ROCm, Vulkan, Metal) and
+# their shared probe core (gpu_backend.o). The non-llamafile executables
+# (whisperfile, diffusionfile, the llama.cpp tools) pull these in via
+# llamafile_has_gpu(). Linking the archive instead of listing each object means
+# a new GPU backend object is listed once below rather than in every consumer's
+# BUILD.mk. This is a tidiness change only: every consumer references
+# llamafile_has_gpu(), so the linker extracts all members and output is unchanged.
+LLAMAFILE_GPU_OBJS := \
+	o/$(MODE)/llamafile/cuda.o \
+	o/$(MODE)/llamafile/gpu_backend.o \
+	o/$(MODE)/llamafile/metal.o \
+	o/$(MODE)/llamafile/vulkan.o
+
+o/$(MODE)/llamafile/gpu.a: $(LLAMAFILE_GPU_OBJS)
+
 o/$(MODE)/llamafile/%.o: llamafile/%.cpp
 	@mkdir -p $(@D)
 	$(CXX) $(CXXFLAGS) $(LLAMAFILE_CPPFLAGS) -c -o $@ $<