Skip to content

Commit 78e07c0

Browse files
committed
Reduced the model's quantization precision (Q4_K_M → IQ3_M) to shrink image size
Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>
1 parent 551f033 commit 78e07c0

File tree

2 files changed

+6
-6
lines changed

2 files changed

+6
-6
lines changed

containers/lightspeed-rag/Containerfile.apple-silicon

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -57,11 +57,11 @@ RUN CMAKE_ARGS="-DGGML_VULKAN=on" pip3 install --no-cache-dir --verbose llama-cp
5757
# Install huggingface-hub for reliable model downloads (PRESERVED)
5858
RUN pip3 install --no-cache-dir huggingface-hub
5959

60-
# Download LLM and embedding models (3B model for better performance)
60+
# Download LLM model (IQ3_M quantization for maximum size reduction)
6161
RUN mkdir -p /app/models && \
62-
python3 -c "import huggingface_hub; huggingface_hub.hf_hub_download(repo_id='bartowski/Llama-3.2-3B-Instruct-GGUF', filename='Llama-3.2-3B-Instruct-Q4_K_M.gguf', local_dir='/app/models', local_dir_use_symlinks=False)" && \
62+
python3 -c "import huggingface_hub; huggingface_hub.hf_hub_download(repo_id='bartowski/Llama-3.2-3B-Instruct-GGUF', filename='Llama-3.2-3B-Instruct-IQ3_M.gguf', local_dir='/app/models', local_dir_use_symlinks=False)" && \
6363
ls -la /app/models/ && \
64-
[ $(stat -c%s /app/models/Llama-3.2-3B-Instruct-Q4_K_M.gguf) -gt 1500000000 ] || (echo "Model download failed - file too small" && exit 1)
64+
[ $(stat -c%s /app/models/Llama-3.2-3B-Instruct-IQ3_M.gguf) -gt 1200000000 ] || (echo "Model download failed - file too small" && exit 1)
6565

6666
# Pre-download embedding models (UPDATED to use all-MiniLM-L6-v2 primarily)
6767
RUN mkdir -p /root/.cache/huggingface/transformers && \

containers/lightspeed-rag/Containerfile.nvidia

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -49,11 +49,11 @@ RUN /app/venv/bin/pip install --no-cache-dir -r /tmp/krkn-lightspeed/requirement
4949
RUN CMAKE_ARGS="-DLLAMA_CUDA=on" /app/venv/bin/pip install --no-cache-dir --upgrade llama-cpp-python || \
5050
/app/venv/bin/pip install --no-cache-dir llama-cpp-python
5151

52-
# Download Llama 3.2 3B model
52+
# Download Llama 3.2 3B model (IQ3_M quantization for maximum size reduction)
5353
RUN mkdir -p /app/models && \
54-
/app/venv/bin/python -c "import huggingface_hub; huggingface_hub.hf_hub_download(repo_id='bartowski/Llama-3.2-3B-Instruct-GGUF', filename='Llama-3.2-3B-Instruct-Q4_K_M.gguf', local_dir='/app/models', local_dir_use_symlinks=False)" && \
54+
/app/venv/bin/python -c "import huggingface_hub; huggingface_hub.hf_hub_download(repo_id='bartowski/Llama-3.2-3B-Instruct-GGUF', filename='Llama-3.2-3B-Instruct-IQ3_M.gguf', local_dir='/app/models', local_dir_use_symlinks=False)" && \
5555
ls -la /app/models/ && \
56-
[ $(stat -c%s /app/models/Llama-3.2-3B-Instruct-Q4_K_M.gguf) -gt 1500000000 ] || (echo "Model download failed - file too small" && exit 1)
56+
[ $(stat -c%s /app/models/Llama-3.2-3B-Instruct-IQ3_M.gguf) -gt 1200000000 ] || (echo "Model download failed - file too small" && exit 1)
5757

5858
# Download sentence transformer model (all-MiniLM-L6-v2)
5959
RUN mkdir -p /root/.cache/huggingface/transformers && \

0 commit comments

Comments (0)