Skip to content

Commit 78e07c0

Browse files
committed
Reduced the model's quantization precision (Q4_K_M → IQ3_M) to shrink image size
Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>
1 parent 551f033 commit 78e07c0

File tree

2 files changed

+6
-6
lines changed

2 files changed

+6
-6
lines changed

containers/lightspeed-rag/Containerfile.apple-silicon

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -57,11 +57,11 @@ RUN CMAKE_ARGS="-DGGML_VULKAN=on" pip3 install --no-cache-dir --verbose llama-cp
5757
# Install huggingface-hub for reliable model downloads (PRESERVED)
5858
RUN pip3 install --no-cache-dir huggingface-hub
5959

60-
# Download LLM and embedding models (3B model for better performance)
60+
# Download LLM model (IQ3_M quantization for maximum size reduction)
6161
RUN mkdir -p /app/models && \
62-
python3 -c "import huggingface_hub; huggingface_hub.hf_hub_download(repo_id='bartowski/Llama-3.2-3B-Instruct-GGUF', filename='Llama-3.2-3B-Instruct-Q4_K_M.gguf', local_dir='/app/models', local_dir_use_symlinks=False)" && \
62+
python3 -c "import huggingface_hub; huggingface_hub.hf_hub_download(repo_id='bartowski/Llama-3.2-3B-Instruct-GGUF', filename='Llama-3.2-3B-Instruct-IQ3_M.gguf', local_dir='/app/models', local_dir_use_symlinks=False)" && \
6363
ls -la /app/models/ && \
64-
[ $(stat -c%s /app/models/Llama-3.2-3B-Instruct-Q4_K_M.gguf) -gt 1500000000 ] || (echo "Model download failed - file too small" && exit 1)
64+
[ $(stat -c%s /app/models/Llama-3.2-3B-Instruct-IQ3_M.gguf) -gt 1200000000 ] || (echo "Model download failed - file too small" && exit 1)
6565

6666
# Pre-download embedding models (UPDATED to use all-MiniLM-L6-v2 primarily)
6767
RUN mkdir -p /root/.cache/huggingface/transformers && \

containers/lightspeed-rag/Containerfile.nvidia

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -49,11 +49,11 @@ RUN /app/venv/bin/pip install --no-cache-dir -r /tmp/krkn-lightspeed/requirement
4949
RUN CMAKE_ARGS="-DLLAMA_CUDA=on" /app/venv/bin/pip install --no-cache-dir --upgrade llama-cpp-python || \
5050
/app/venv/bin/pip install --no-cache-dir llama-cpp-python
5151

52-
# Download Llama 3.2 3B model
52+
# Download Llama 3.2 3B model (IQ3_M quantization for maximum size reduction)
5353
RUN mkdir -p /app/models && \
54-
/app/venv/bin/python -c "import huggingface_hub; huggingface_hub.hf_hub_download(repo_id='bartowski/Llama-3.2-3B-Instruct-GGUF', filename='Llama-3.2-3B-Instruct-Q4_K_M.gguf', local_dir='/app/models', local_dir_use_symlinks=False)" && \
54+
/app/venv/bin/python -c "import huggingface_hub; huggingface_hub.hf_hub_download(repo_id='bartowski/Llama-3.2-3B-Instruct-GGUF', filename='Llama-3.2-3B-Instruct-IQ3_M.gguf', local_dir='/app/models', local_dir_use_symlinks=False)" && \
5555
ls -la /app/models/ && \
56-
[ $(stat -c%s /app/models/Llama-3.2-3B-Instruct-Q4_K_M.gguf) -gt 1500000000 ] || (echo "Model download failed - file too small" && exit 1)
56+
[ $(stat -c%s /app/models/Llama-3.2-3B-Instruct-IQ3_M.gguf) -gt 1200000000 ] || (echo "Model download failed - file too small" && exit 1)
5757

5858
# Download sentence transformer model (all-MiniLM-L6-v2)
5959
RUN mkdir -p /root/.cache/huggingface/transformers && \

0 commit comments

Comments (0)