@@ -158,6 +158,55 @@ static bool LinkCuda(const char *dso) {
158158 return true;
159159}
160160
161+ static void UnlinkCuda (void ) {
162+ if (g_cuda .lib_handle ) {
163+ cosmo_dlclose (g_cuda .lib_handle );
164+ g_cuda .lib_handle = NULL ;
165+ }
166+ memset (& g_cuda .backend_init , 0 , sizeof (g_cuda .backend_init ));
167+ memset (& g_cuda .backend_reg , 0 , sizeof (g_cuda .backend_reg ));
168+ memset (& g_cuda .get_device_count , 0 , sizeof (g_cuda .get_device_count ));
169+ memset (& g_cuda .get_device_description , 0 , sizeof (g_cuda .get_device_description ));
170+ memset (& g_cuda .log_set , 0 , sizeof (g_cuda .log_set ));
171+ }
172+
173+ static bool TryGpuBackend (const char * dso , bool is_amd ) {
174+ if (!llamafile_try_load_prebuilt_dso (dso , "cuda" , LinkCuda ))
175+ return false;
176+
177+ // Suppress the DSO's ggml logging before we touch any function that
178+ // triggers ggml_cuda_init() (e.g. get_device_count). Without this, a
179+ // failed init on the wrong backend would print a confusing error to
180+ // stderr even when --verbose is not set.
181+ if (!FLAG_verbose && (g_cuda .log_set .default_abi || g_cuda .log_set .windows_abi )) {
182+ if (IsWindows ())
183+ g_cuda .log_set .windows_abi (llamafile_log_callback_null , NULL );
184+ else
185+ g_cuda .log_set .default_abi (llamafile_log_callback_null , NULL );
186+ }
187+
188+ // Verify the backend has at least one device before committing. The DSO
189+ // loads fine even when no compatible hardware is present, so we must
190+ // probe device count to avoid registering a 0-device backend (which
191+ // would then prevent fallback to other GPU backends in AUTO mode).
192+ if (g_cuda .get_device_count .default_abi || g_cuda .get_device_count .windows_abi ) {
193+ int count ;
194+ if (IsWindows ())
195+ count = g_cuda .get_device_count .windows_abi ();
196+ else
197+ count = g_cuda .get_device_count .default_abi ();
198+ if (count <= 0 ) {
199+ llamafile_info ("cuda" , "%s library loaded but no devices detected; trying next backend" ,
200+ is_amd ? "ROCm" : "CUDA" );
201+ UnlinkCuda ();
202+ return false;
203+ }
204+ }
205+
206+ g_cuda .is_amd = is_amd ;
207+ return true;
208+ }
209+
161210static bool ImportCudaImpl (void ) {
162211 // Skip on Apple Silicon (use Metal instead)
163212 if (IsXnuSilicon ()) {
@@ -168,9 +217,7 @@ static bool ImportCudaImpl(void) {
168217 switch (FLAG_gpu ) {
169218 case LLAMAFILE_GPU_AUTO :
170219 case LLAMAFILE_GPU_NVIDIA :
171- break ;
172220 case LLAMAFILE_GPU_AMD :
173- g_cuda .is_amd = true;
174221 break ;
175222 default :
176223 return false;
@@ -183,19 +230,16 @@ static bool ImportCudaImpl(void) {
183230 snprintf (cuda_dso , sizeof (cuda_dso ), "ggml-cuda.%s" , ext );
184231 snprintf (rocm_dso , sizeof (rocm_dso ), "ggml-rocm.%s" , ext );
185232
186- // Try to load pre-built DSO
187- if ( FLAG_gpu == LLAMAFILE_GPU_AMD || FLAG_gpu == LLAMAFILE_GPU_AUTO ) {
188- if ( llamafile_try_load_prebuilt_dso ( rocm_dso , "cuda" , LinkCuda ) ) {
189- g_cuda . is_amd = true;
233+ // In AUTO mode, prefer CUDA over ROCm: it covers the common NVIDIA case
234+ // and lets ROCm be the fallback when CUDA is absent or has no devices.
235+ if ( FLAG_gpu == LLAMAFILE_GPU_NVIDIA || FLAG_gpu == LLAMAFILE_GPU_AUTO ) {
236+ if ( TryGpuBackend ( cuda_dso , false))
190237 goto RegisterBackend ;
191- }
192238 }
193239
194- if (FLAG_gpu == LLAMAFILE_GPU_NVIDIA || FLAG_gpu == LLAMAFILE_GPU_AUTO ) {
195- if (llamafile_try_load_prebuilt_dso (cuda_dso , "cuda" , LinkCuda )) {
196- g_cuda .is_amd = false;
240+ if (FLAG_gpu == LLAMAFILE_GPU_AMD || FLAG_gpu == LLAMAFILE_GPU_AUTO ) {
241+ if (TryGpuBackend (rocm_dso , true))
197242 goto RegisterBackend ;
198- }
199243 }
200244
201245 // No pre-built DSO found
@@ -206,16 +250,6 @@ static bool ImportCudaImpl(void) {
206250 return false;
207251
208252RegisterBackend :
209- // Suppress DSO's ggml logging before backend registration, which triggers
210- // ggml_cuda_init() inside the DSO. Without this, CUDA device enumeration
211- // messages appear even when --verbose is not set.
212- if (!FLAG_verbose && (g_cuda .log_set .default_abi || g_cuda .log_set .windows_abi )) {
213- if (IsWindows ())
214- g_cuda .log_set .windows_abi (llamafile_log_callback_null , NULL );
215- else
216- g_cuda .log_set .default_abi (llamafile_log_callback_null , NULL );
217- }
218-
219253 // Register the CUDA backend with GGML
220254 if (g_cuda .backend_reg .default_abi || g_cuda .backend_reg .windows_abi ) {
221255 ggml_backend_reg_t reg ;
0 commit comments