chore(gguf): support prebuilt llama.cpp binaries + add Q3_K_M for 8 GB VRAM

javierdejesusda · javierdejesusda · commit 881409f15e90 · 2026-04-29T02:23:46.000+02:00
scripts/build_gguf.sh now resolves the convert script and the
llama-quantize binary independently:
  - LLAMACPP_REPO points at a cloned llama.cpp checkout (provides
    convert_hf_to_gguf.py).
  - LLAMACPP_BIN points at the directory holding the quantize binary;
    the script probes both the bare and the .exe filename there.
  - LLAMACPP (legacy) still works as an alias for LLAMACPP_REPO so the
    prior single-env-var docs do not break.
This lets the build run against either a from-source build (Linux MI300X)
or the official ggml-org Windows prebuilt zip without forking the script.

QUANTS now also produces Q3_K_M (~6.5 GB) ahead of Q4_K_M / Q5_K_M /
Q6_K / Q8_0. Q3_K_M is the new headline 8 GB-class consumer target
(RTX 4070 Laptop, RTX 3060 Ti); Q4_K_M still ships for 12-16 GB cards.
README and model-card quantization tables updated to reflect the new
hardware tier breakdown.
diff --git a/README.md b/README.md
@@ -178,8 +178,12 @@ generation work is never wasted.
 - **Training (one-shot).** Single AMD Instinct MI300X (192 GB HBM3) on ROCm 7.0.
   Full-parameter SFT of a 14B Qwen1 model at sequence length 8192 does not fit on
   80 GB-class hardware; the MI300X is not optional for the training path.
-- **Consumer inference.** The Q4_K_M GGUF (≈9.45 GB) targets a single
-  RTX 4060 Ti 16 GB via llama.cpp. Pass-1 per-section context fits at 4-6K tokens.
+- **Consumer inference.** The **Q3_K_M GGUF (≈6.5 GB)** is the
+  recommended 8 GB-class target (RTX 4070 Laptop, RTX 3060 Ti, etc.) and
+  fits fully in VRAM at Pass-1 4-6K context. The Q4_K_M GGUF (≈9.45 GB)
+  is shipped for 12-16 GB cards (RTX 4060 Ti 16 GB, RTX 3080); on 8 GB
+  cards Q4_K_M still runs via partial GPU offload (`llama-cli -ngl 25`).
+  Larger Q5_K_M / Q6_K / Q8_0 quants ship for prosumer / dual-GPU rigs.
 - **Demo / research inference.** Any ROCm or CUDA host with the BF16 checkpoint;
   the `MemoCriticAgent` is pure orchestration and adds zero VRAM cost beyond the
   base model.
diff --git a/docs/model-card.md b/docs/model-card.md
@@ -206,12 +206,13 @@ checkpoint by `scripts/build_gguf.sh`, which calls llama.cpp's
 target quant. See the script's prereq header for the required
 llama.cpp checkout and disk-budget notes.
 
-| Quant     | Approx. size | Intended hardware                | Target throughput (tok/s) |
-|-----------|--------------|----------------------------------|---------------------------|
-| Q4_K_M    | ~9.45 GB     | 16 GB consumer GPU (RTX 4060 Ti) | ≥ 18                      |
-| Q5_K_M    | ~10.5 GB     | 16-24 GB consumer GPU            | TBD                       |
-| Q6_K      | ~12.1 GB     | 24 GB+ consumer or prosumer      | TBD                       |
-| Q8_0      | ~15.7 GB     | 24 GB+ prosumer / dual-GPU CPU offload | TBD                 |
+| Quant     | Approx. size | Intended hardware                                       | Target throughput (tok/s) |
+|-----------|--------------|---------------------------------------------------------|---------------------------|
+| Q3_K_M    | ~6.5 GB      | 8 GB consumer GPU (RTX 4070 Laptop, RTX 3060 Ti)        | TBD                       |
+| Q4_K_M    | ~9.45 GB     | 12-16 GB consumer GPU (RTX 4060 Ti 16 GB, RTX 3080)     | TBD                       |
+| Q5_K_M    | ~10.5 GB     | 16-24 GB consumer GPU                                   | TBD                       |
+| Q6_K      | ~12.1 GB     | 24 GB+ consumer or prosumer                             | TBD                       |
+| Q8_0      | ~15.7 GB     | 24 GB+ prosumer / dual-GPU CPU offload                  | TBD                       |
 
 Pass-1 per-section context of 4-6K tokens is the supported consumer
 operating point; longer contexts require the BF16 checkpoint served via
diff --git a/scripts/build_gguf.sh b/scripts/build_gguf.sh
@@ -2,20 +2,33 @@
 #
 # Build the YuhoLens-14B GGUF release set from a HuggingFace checkpoint.
 #
-# Required tools (operator must install before running):
-#   - python (matching the llama.cpp checkout's environment)
-#   - llama.cpp cloned and built with the quantize binary:
-#       git clone https://github.com/ggerganov/llama.cpp ../llama.cpp
-#       cd ../llama.cpp && cmake -B build && cmake --build build --target llama-quantize
-#       pip install -r ../llama.cpp/requirements.txt
-#   - At least 80 GB free on the target disk (f16 + four quants for a 14B model).
+# Two-directory layout:
+#   - LLAMACPP_REPO: the cloned llama.cpp repo (provides convert_hf_to_gguf.py).
+#   - LLAMACPP_BIN:  the directory holding the llama-quantize binary. May be
+#                    the repo's build/bin (when llama.cpp is built from source)
+#                    OR a flat directory of prebuilt Windows binaries (which
+#                    is what ggml-org publishes on the GitHub releases page).
+#
+#   When LLAMACPP_BIN is unset or empty it defaults to "$LLAMACPP_REPO/build/bin".
+#   Both the bare ("llama-quantize") and the .exe ("llama-quantize.exe") name
+#   are probed, so the same script works on Linux source builds and on
+#   Windows prebuilt-binary checkouts.
+#
+# Required tools the operator must install BEFORE running:
+#   - python with `gguf` and `safetensors` packages (pip install gguf
+#     safetensors).
+#   - llama.cpp cloned somewhere readable (default ../llama.cpp).
+#   - A llama-quantize binary, either built from source or unzipped from
+#     the official prebuilt Windows release.
+#   - At least 80 GB free disk for a 14B model (f16 intermediate + 5 quants).
 #
 # Usage:
 #   scripts/build_gguf.sh <checkpoint_dir> [output_dir]
 #
 # Defaults:
-#   - LLAMACPP env var overrides the llama.cpp path (default: ../llama.cpp).
 #   - output_dir defaults to <checkpoint_dir> when omitted.
+#   - LLAMACPP_REPO defaults to the legacy $LLAMACPP if set, else ../llama.cpp.
+#   - LLAMACPP_BIN defaults to $LLAMACPP_REPO/build/bin.
 #
 # This script does not run automatically. Operator runs it after the HF
 # checkpoint is downloaded locally; the resulting GGUFs are uploaded to
@@ -38,31 +51,47 @@ fi
 
 mkdir -p "$OUT_DIR"
 
-LLAMACPP="${LLAMACPP:-../llama.cpp}"
-CONVERT_SCRIPT="$LLAMACPP/convert_hf_to_gguf.py"
-QUANT_BIN="$LLAMACPP/build/bin/llama-quantize"
+# Back-compat: legacy LLAMACPP env var maps to LLAMACPP_REPO.
+LLAMACPP_REPO="${LLAMACPP_REPO:-${LLAMACPP:-../llama.cpp}}"
+LLAMACPP_BIN="${LLAMACPP_BIN:-$LLAMACPP_REPO/build/bin}"
+CONVERT_SCRIPT="$LLAMACPP_REPO/convert_hf_to_gguf.py"
+
+resolve_quant_bin() {
+  for candidate in "$LLAMACPP_BIN/llama-quantize" "$LLAMACPP_BIN/llama-quantize.exe"; do
+    if [[ -f "$candidate" ]]; then
+      echo "$candidate"
+      return 0
+    fi
+  done
+  return 1
+}
 
 if [[ ! -f "$CONVERT_SCRIPT" ]]; then
   echo "error: convert script not found at $CONVERT_SCRIPT" >&2
-  echo "       set LLAMACPP=/path/to/llama.cpp or clone llama.cpp at ../llama.cpp" >&2
+  echo "       set LLAMACPP_REPO=/path/to/llama.cpp clone" >&2
   exit 66
 fi
 
-if [[ ! -x "$QUANT_BIN" ]]; then
-  echo "error: llama-quantize binary not found at $QUANT_BIN" >&2
-  echo "       build llama.cpp first: cmake -B build && cmake --build build --target llama-quantize" >&2
+if ! QUANT_BIN="$(resolve_quant_bin)"; then
+  echo "error: llama-quantize binary not found in $LLAMACPP_BIN" >&2
+  echo "       set LLAMACPP_BIN to the directory containing llama-quantize(.exe)" >&2
+  echo "       (build/bin/ for a source build, or the unzip dir for prebuilts)" >&2
   exit 66
 fi
 
 OUT_F16="${OUT_DIR%/}/yuholens-14b-f16.gguf"
 
+echo "[gguf] using convert script: $CONVERT_SCRIPT"
+echo "[gguf] using quantize bin:   $QUANT_BIN"
 echo "[gguf] converting $CKPT -> $OUT_F16"
 python "$CONVERT_SCRIPT" \
   --outfile "$OUT_F16" \
   --outtype f16 \
   "$CKPT"
 
-QUANTS=("Q4_K_M" "Q5_K_M" "Q6_K" "Q8_0")
+# Q3_K_M is the 8 GB consumer headline quant; everything from Q4_K_M up
+# wants 10 GB+ VRAM or partial CPU offload at runtime.
+QUANTS=("Q3_K_M" "Q4_K_M" "Q5_K_M" "Q6_K" "Q8_0")
 for quant in "${QUANTS[@]}"; do
   out="${OUT_DIR%/}/yuholens-14b-${quant}.gguf"
   echo "[gguf] quantising $quant -> $out"