Reduce cuda dylibs size (#963)

aittalam · claude · web-flow · commit 3dd200539c6c · 2026-05-13T09:18:32.000+01:00
* Reduce CUDA library size with opt-in build flags Reimplements the size-reduction ideas from PR #921 on top of llamacpp-7b8443a, and adds the upstream guards the original PR omitted so --no-iq-quants actually works at link time. New flags on llamafile/cuda.sh (all off by default): --minimize-size umbrella: enables the four flags below --minimal-archs virtual PTX for sm_75/sm_90, real SASS for 80/86/89 --no-iq-quants drop 8 mmq-instance-iq*.cu + define GGML_CUDA_NO_IQ_QUANTS --strip strip --strip-unneeded after link --compress --compress-mode=size (requires CUDA >= 12.8) build-functions.sh: collect_gpu_sources now categorizes template instances and includes only the 3 default fattn-vec quant combos (f16-f16, q4_0-q4_0, q8_0-q8_0), matching upstream CMake's default. Optional NO_IQ_QUANTS argument excludes mmq-instance-iq*.cu. New patches in llama.cpp.patches/patches/ wrap IQ-quant dispatch sites in #ifndef GGML_CUDA_NO_IQ_QUANTS so --no-iq-quants links cleanly without the excluded template instantiations: mmq.cu / mmq.cuh / mmvq.cu / convert.cu / cpy.cu Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> * Mirror cuda.sh size-reduction flags in cuda.bat and cuda_parallel.bat Adds the same option surface to both Windows build scripts: --minimize-size umbrella: enables the four flags below --minimal-archs virtual PTX for sm_75/sm_90, real SASS for 80/86/89 --no-iq-quants drop 8 mmq-instance-iq*.cu + define GGML_CUDA_NO_IQ_QUANTS --strip no-op on Windows (debug info lives in a separate .pdb); accepted for parity with cuda.sh --compress --compress-mode=size (requires CUDA >= 12.8) --fa-all-quants compile all fattn-vec quant combos and define GGML_CUDA_FA_ALL_QUANTS Source collection is restructured into the same 5 categories the bash script uses, defaulting to the 3 common fattn-vec combos (f16-f16, q4_0-q4_0, q8_0-q8_0) and gating mmq-instance-iq* under --no-iq-quants. 
CUDA version parsing now captures both major and minor for the >=12.8 compress check, with a safe 0 fallback when nvcc's output can't be parsed. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/llama.cpp.patches/patches/ggml_src_ggml-cuda_convert.cu.patch b/llama.cpp.patches/patches/ggml_src_ggml-cuda_convert.cu.patch
@@ -0,0 +1,35 @@
+diff --git a/ggml/src/ggml-cuda/convert.cu b/ggml/src/ggml-cuda/convert.cu
+--- a/llama.cpp/ggml/src/ggml-cuda/convert.cu
++++ b/llama.cpp/ggml/src/ggml-cuda/convert.cu
+@@ -736,6 +736,7 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
+             return dequantize_row_q5_K_cuda;
+         case GGML_TYPE_Q6_K:
+             return dequantize_row_q6_K_cuda;
++#ifndef GGML_CUDA_NO_IQ_QUANTS
+         case GGML_TYPE_IQ2_XXS:
+             return dequantize_row_iq2_xxs_cuda;
+         case GGML_TYPE_IQ2_XS:
+@@ -754,6 +755,7 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
+             return dequantize_row_iq4_xs_cuda;
+         case GGML_TYPE_IQ3_S:
+             return dequantize_row_iq3_s_cuda;
++#endif // GGML_CUDA_NO_IQ_QUANTS
+         case GGML_TYPE_MXFP4:
+             return dequantize_row_mxfp4_cuda;
+         case GGML_TYPE_NVFP4:
+@@ -791,6 +793,7 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
+             return dequantize_row_q5_K_cuda;
+         case GGML_TYPE_Q6_K:
+             return dequantize_row_q6_K_cuda;
++#ifndef GGML_CUDA_NO_IQ_QUANTS
+         case GGML_TYPE_IQ2_XXS:
+             return dequantize_row_iq2_xxs_cuda;
+         case GGML_TYPE_IQ2_XS:
+@@ -809,6 +812,7 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
+             return dequantize_row_iq4_xs_cuda;
+         case GGML_TYPE_IQ3_S:
+             return dequantize_row_iq3_s_cuda;
++#endif // GGML_CUDA_NO_IQ_QUANTS
+         case GGML_TYPE_MXFP4:
+             return dequantize_row_mxfp4_cuda;
+         case GGML_TYPE_NVFP4:
diff --git a/llama.cpp.patches/patches/ggml_src_ggml-cuda_cpy.cu.patch b/llama.cpp.patches/patches/ggml_src_ggml-cuda_cpy.cu.patch
@@ -0,0 +1,31 @@
+diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu
+--- a/llama.cpp/ggml/src/ggml-cuda/cpy.cu
++++ b/llama.cpp/ggml/src/ggml-cuda/cpy.cu
+@@ -360,6 +360,7 @@ static void ggml_cpy_q5_1_f32_cuda(
+         ne10, ne11, ne12, nb10, nb11, nb12, nb13);
+ }
+ 
++#ifndef GGML_CUDA_NO_IQ_QUANTS
+ static void ggml_cpy_f32_iq4_nl_cuda(
+     const char * cx, char * cdst, const int64_t ne,
+     const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
+@@ -371,6 +372,7 @@ static void ggml_cpy_f32_iq4_nl_cuda(
+     cpy_f32_q<cpy_blck_f32_iq4_nl, QK4_NL><<<num_blocks, 1, 0, stream>>>
+         (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
+ }
++#endif // GGML_CUDA_NO_IQ_QUANTS
+ 
+ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1) {
+     const int64_t ne = ggml_nelements(src0);
+@@ -465,9 +467,11 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
+     } else if (src0->type == GGML_TYPE_Q5_0 && src1->type == GGML_TYPE_F32) {
+         ggml_cpy_q5_0_f32_cuda
+                 (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
++#ifndef GGML_CUDA_NO_IQ_QUANTS
+     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_IQ4_NL) {
+         ggml_cpy_f32_iq4_nl_cuda
+                 (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
++#endif // GGML_CUDA_NO_IQ_QUANTS
+     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_1) {
+         ggml_cpy_f32_q5_1_cuda
+                 (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
diff --git a/llama.cpp.patches/patches/ggml_src_ggml-cuda_mmq.cu.patch b/llama.cpp.patches/patches/ggml_src_ggml-cuda_mmq.cu.patch
@@ -0,0 +1,47 @@
+diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu
+--- a/llama.cpp/ggml/src/ggml-cuda/mmq.cu
++++ b/llama.cpp/ggml/src/ggml-cuda/mmq.cu
+@@ -44,6 +44,7 @@ static void ggml_cuda_mul_mat_q_switch_type(ggml_backend_cuda_context & ctx, con
+         case GGML_TYPE_Q6_K:
+             mul_mat_q_case<GGML_TYPE_Q6_K>(ctx, args, stream);
+             break;
++#ifndef GGML_CUDA_NO_IQ_QUANTS
+         case GGML_TYPE_IQ2_XXS:
+             mul_mat_q_case<GGML_TYPE_IQ2_XXS>(ctx, args, stream);
+             break;
+@@ -68,6 +69,7 @@ static void ggml_cuda_mul_mat_q_switch_type(ggml_backend_cuda_context & ctx, con
+         case GGML_TYPE_IQ4_NL:
+             mul_mat_q_case<GGML_TYPE_IQ4_NL>(ctx, args, stream);
+             break;
++#endif // GGML_CUDA_NO_IQ_QUANTS
+         default:
+             GGML_ABORT("fatal error");
+             break;
+@@ -286,6 +288,7 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t
+         case GGML_TYPE_Q4_K:
+         case GGML_TYPE_Q5_K:
+         case GGML_TYPE_Q6_K:
++#ifndef GGML_CUDA_NO_IQ_QUANTS
+         case GGML_TYPE_IQ2_XXS:
+         case GGML_TYPE_IQ2_XS:
+         case GGML_TYPE_IQ2_S:
+@@ -294,6 +297,7 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t
+         case GGML_TYPE_IQ1_S:
+         case GGML_TYPE_IQ4_XS:
+         case GGML_TYPE_IQ4_NL:
++#endif // GGML_CUDA_NO_IQ_QUANTS
+             mmq_supported = true;
+             break;
+         default:
+@@ -356,9 +360,11 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t
+                     return ne11 <= 128;
+                 case GGML_TYPE_Q6_K:
+                     return ne11 <= (GGML_CUDA_CC_IS_RDNA3_0(cc) ? 128 : 256);
++#ifndef GGML_CUDA_NO_IQ_QUANTS
+                 case GGML_TYPE_IQ2_XS:
+                 case GGML_TYPE_IQ2_S:
+                     return GGML_CUDA_CC_IS_RDNA3_5(cc) || ne11 <= 128;
++#endif // GGML_CUDA_NO_IQ_QUANTS
+                 default:
+                     return true;
+             }
diff --git a/llama.cpp.patches/patches/ggml_src_ggml-cuda_mmq.cuh.patch b/llama.cpp.patches/patches/ggml_src_ggml-cuda_mmq.cuh.patch
@@ -0,0 +1,19 @@
+diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh
+--- a/llama.cpp/ggml/src/ggml-cuda/mmq.cuh
++++ b/llama.cpp/ggml/src/ggml-cuda/mmq.cuh
+@@ -4088,6 +4088,7 @@ extern DECL_MMQ_CASE(GGML_TYPE_Q3_K);
+ extern DECL_MMQ_CASE(GGML_TYPE_Q4_K);
+ extern DECL_MMQ_CASE(GGML_TYPE_Q5_K);
+ extern DECL_MMQ_CASE(GGML_TYPE_Q6_K);
++#ifndef GGML_CUDA_NO_IQ_QUANTS
+ extern DECL_MMQ_CASE(GGML_TYPE_IQ2_XXS);
+ extern DECL_MMQ_CASE(GGML_TYPE_IQ2_XS);
+ extern DECL_MMQ_CASE(GGML_TYPE_IQ2_S);
+@@ -4096,6 +4097,7 @@ extern DECL_MMQ_CASE(GGML_TYPE_IQ3_S);
+ extern DECL_MMQ_CASE(GGML_TYPE_IQ1_S);
+ extern DECL_MMQ_CASE(GGML_TYPE_IQ4_NL);
+ extern DECL_MMQ_CASE(GGML_TYPE_IQ4_XS);
++#endif // GGML_CUDA_NO_IQ_QUANTS
+ 
+ // -------------------------------------------------------------------------------------------------------------------------
+ 
diff --git a/llama.cpp.patches/patches/ggml_src_ggml-cuda_mmvq.cu.patch b/llama.cpp.patches/patches/ggml_src_ggml-cuda_mmvq.cu.patch
@@ -0,0 +1,35 @@
+diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu
+--- a/llama.cpp/ggml/src/ggml-cuda/mmvq.cu
++++ b/llama.cpp/ggml/src/ggml-cuda/mmvq.cu
+@@ -22,6 +22,7 @@ static constexpr __device__ vec_dot_q_cuda_t get_vec_dot_q_cuda(ggml_type type)
+         case GGML_TYPE_Q4_K:    return vec_dot_q4_K_q8_1;
+         case GGML_TYPE_Q5_K:    return vec_dot_q5_K_q8_1;
+         case GGML_TYPE_Q6_K:    return vec_dot_q6_K_q8_1;
++#ifndef GGML_CUDA_NO_IQ_QUANTS
+         case GGML_TYPE_IQ2_XXS: return vec_dot_iq2_xxs_q8_1;
+         case GGML_TYPE_IQ2_XS:  return vec_dot_iq2_xs_q8_1;
+         case GGML_TYPE_IQ2_S:   return vec_dot_iq2_s_q8_1;
+@@ -31,6 +32,7 @@ static constexpr __device__ vec_dot_q_cuda_t get_vec_dot_q_cuda(ggml_type type)
+         case GGML_TYPE_IQ4_NL:  return vec_dot_iq4_nl_q8_1;
+         case GGML_TYPE_IQ4_XS:  return vec_dot_iq4_xs_q8_1;
+         case GGML_TYPE_IQ3_S:   return vec_dot_iq3_s_q8_1;
++#endif // GGML_CUDA_NO_IQ_QUANTS
+         default:                return nullptr;
+     }
+ }
+@@ -50,6 +52,7 @@ static constexpr __host__ __device__ int get_vdr_mmvq(ggml_type type) {
+         case GGML_TYPE_Q4_K:    return VDR_Q4_K_Q8_1_MMVQ;
+         case GGML_TYPE_Q5_K:    return VDR_Q5_K_Q8_1_MMVQ;
+         case GGML_TYPE_Q6_K:    return VDR_Q6_K_Q8_1_MMVQ;
++#ifndef GGML_CUDA_NO_IQ_QUANTS
+         case GGML_TYPE_IQ2_XXS: return VDR_IQ2_XXS_Q8_1_MMVQ;
+         case GGML_TYPE_IQ2_XS:  return VDR_IQ2_XS_Q8_1_MMVQ;
+         case GGML_TYPE_IQ2_S:   return VDR_IQ2_S_Q8_1_MMVQ;
+@@ -57,6 +60,7 @@ static constexpr __host__ __device__ int get_vdr_mmvq(ggml_type type) {
+         case GGML_TYPE_IQ3_S:   return VDR_IQ3_S_Q8_1_MMVQ;
+         case GGML_TYPE_IQ4_NL:  return VDR_IQ4_NL_Q8_1_MMVQ;
+         case GGML_TYPE_IQ4_XS:  return VDR_IQ4_XS_Q8_1_MMVQ;
++#endif // GGML_CUDA_NO_IQ_QUANTS
+         default:                return 1;
+     }
+ }
diff --git a/llamafile/build-functions.sh b/llamafile/build-functions.sh
@@ -98,17 +98,64 @@ setup_build_dir() {
     mkdir -p "$build_dir"
 }
 
-# Collect CUDA/HIP source files
+# Collect CUDA/HIP source files with selective template inclusion
 # Sets: CUDA_SOURCES, NUM_SOURCES
-# Args: $1 = GGML_CUDA_DIR, $2 = extra sources (optional, e.g., tinyblas.cu path)
+# Args: $1 = GGML_CUDA_DIR
+#       $2 = caller-supplied sources prepended to the list (e.g., tinyblas.cu
+#            for the default TinyBLAS build; empty for the --cublas build)
+#       $3 = NO_IQ_QUANTS (optional, "1" to exclude IQ quant MMQ templates)
+#       $4 = FA_ALL_QUANTS (optional, "1" to include all fattn-vec quant combos
+#            instead of the 4 default ones; mirrors upstream's GGML_CUDA_FA_ALL_QUANTS)
 collect_gpu_sources() {
     local ggml_cuda_dir="$1"
-    local extra_sources="$2"
+    local caller_sources="$2"
+    local no_iq_quants="${3:-0}"
+    local fa_all_quants="${4:-0}"
 
-    CUDA_SOURCES="$extra_sources"
+    CUDA_SOURCES="$caller_sources"
 
-    for f in "$ggml_cuda_dir"/*.cu "$ggml_cuda_dir/template-instances"/*.cu; do
+    # 1. Main CUDA sources (always included)
+    for f in "$ggml_cuda_dir"/*.cu; do
+        [ -f "$f" ] && CUDA_SOURCES="$CUDA_SOURCES $f"
+    done
+
+    local ti_dir="$ggml_cuda_dir/template-instances"
+
+    # 2. fattn-mma and fattn-tile instances (always included)
+    for f in "$ti_dir"/fattn-mma-*.cu "$ti_dir"/fattn-tile-*.cu; do
+        [ -f "$f" ] && CUDA_SOURCES="$CUDA_SOURCES $f"
+    done
+
+    # 3. fattn-vec: default to the 4 common quant combos (f16-f16, q4_0-q4_0,
+    #    q8_0-q8_0, bf16-bf16), matching upstream CMake. With FA_ALL_QUANTS=1
+    #    include all fattn-vec instances (mirrors upstream's
+    #    GGML_CUDA_FA_ALL_QUANTS opt-in).
+    if [ "$fa_all_quants" = "1" ]; then
+        for f in "$ti_dir"/fattn-vec-instance-*.cu; do
+            [ -f "$f" ] && CUDA_SOURCES="$CUDA_SOURCES $f"
+        done
+    else
+        for f in "$ti_dir"/fattn-vec-instance-f16-f16.cu \
+                 "$ti_dir"/fattn-vec-instance-q4_0-q4_0.cu \
+                 "$ti_dir"/fattn-vec-instance-q8_0-q8_0.cu \
+                 "$ti_dir"/fattn-vec-instance-bf16-bf16.cu; do
+            [ -f "$f" ] && CUDA_SOURCES="$CUDA_SOURCES $f"
+        done
+    fi
+
+    # 4. mmf instances (always included)
+    for f in "$ti_dir"/mmf-*.cu; do
+        [ -f "$f" ] && CUDA_SOURCES="$CUDA_SOURCES $f"
+    done
+
+    # 5. mmq instances: include all, but optionally exclude IQ quant templates
+    for f in "$ti_dir"/mmq-*.cu; do
         if [ -f "$f" ]; then
+            if [ "$no_iq_quants" = "1" ]; then
+                case "$(basename "$f")" in
+                    mmq-instance-iq*) continue ;;
+                esac
+            fi
             CUDA_SOURCES="$CUDA_SOURCES $f"
         fi
     done
diff --git a/llamafile/cuda.bat b/llamafile/cuda.bat
diff --git a/llamafile/cuda.sh b/llamafile/cuda.sh
diff --git a/llamafile/cuda_parallel.bat b/llamafile/cuda_parallel.bat