diff --git a/Dockerfile b/Dockerfile
index 0988daf58..3bcc39c48 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,10 +1,13 @@
# Rust builder
+
FROM lukemathwalker/cargo-chef:latest-rust-1.83 AS chef
+
WORKDIR /usr/src
ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse
FROM chef as planner
+
COPY Cargo.toml Cargo.toml
COPY rust-toolchain.toml rust-toolchain.toml
COPY proto proto
@@ -49,21 +52,35 @@ ARG TARGETPLATFORM
ENV PATH /opt/conda/bin:$PATH
+# For build-time CUDA memory resilience
+ENV PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True"
+
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
build-essential \
ca-certificates \
ccache \
curl \
- git && \
- rm -rf /var/lib/apt/lists/*
+ git \
+ ninja-build \
+ wget \
+ && rm -rf /var/lib/apt/lists/*
+
+# Add these lines to install a *newer* CMake version
+RUN apt-get update && \
+    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends gnupg lsb-release software-properties-common && \
+ wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null && \
+ echo "deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ $(lsb_release -cs) main" | tee /etc/apt/sources.list.d/kitware.list >/dev/null && \
+    rm -f /etc/apt/sources.list.d/cmake.list && \
+    apt-get update && \
+    apt-get install -y --no-install-recommends cmake && \
+    rm -rf /var/lib/apt/lists/*
# Install conda
# translating Docker's TARGETPLATFORM into mamba arches
RUN case ${TARGETPLATFORM} in \
- "linux/arm64") MAMBA_ARCH=aarch64 ;; \
- *) MAMBA_ARCH=x86_64 ;; \
+ "linux/arm64") MAMBA_ARCH=aarch64 ;; \
+ *) MAMBA_ARCH=x86_64 ;; \
esac && \
- curl -fsSL -v -o ~/mambaforge.sh -O "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh"
+ curl -fsSL -v -o ~/mambaforge.sh -O "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh"
RUN chmod +x ~/mambaforge.sh && \
bash ~/mambaforge.sh -b -p /opt/conda && \
rm ~/mambaforge.sh
@@ -71,94 +88,104 @@ RUN chmod +x ~/mambaforge.sh && \
# Install pytorch
# On arm64 we exit with an error code
RUN case ${TARGETPLATFORM} in \
- "linux/arm64") exit 1 ;; \
- *) /opt/conda/bin/conda update -y conda && \
- /opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -c "${CUDA_CHANNEL}" -y "python=${PYTHON_VERSION}" "pytorch=$PYTORCH_VERSION" "pytorch-cuda=$(echo $CUDA_VERSION | cut -d'.' -f 1-2)" ;; \
+ "linux/arm64") exit 1 ;; \
+ *) /opt/conda/bin/conda update -y conda && \
+ /opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -c "${CUDA_CHANNEL}" -y "python=${PYTHON_VERSION}" "pytorch=$PYTORCH_VERSION" "pytorch-cuda=$(echo $CUDA_VERSION | cut -d'.' -f 1-2)" ;; \
esac && \
/opt/conda/bin/conda clean -ya
# CUDA kernels builder image
FROM pytorch-install as kernel-builder
-ARG MAX_JOBS=2
+# This environment variable controls the number of parallel compilation jobs for CUDA kernels.
+# It is set to a conservative value (2) by default for stability on machines
+# with limited RAM relative to CPU cores, preventing Out-Of-Memory (OOM) crashes during build.
+#
+# You can adjust this value to optimize build speed based on your system's RAM:
+# - If you have more RAM (e.g., 96GB+), you can increase this value (e.g., to 16, 24, or 32)
+# to significantly speed up the build. Always monitor RAM usage (htop) to avoid OOM crashes.
+# - If you encounter OOM errors even with this value, try reducing it further to 1.
+ENV MAX_JOBS=2
-RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
- ninja-build cmake \
- && rm -rf /var/lib/apt/lists/*
+RUN pip install setuptools_scm --no-cache-dir
# Build Flash Attention CUDA kernels
FROM kernel-builder as flash-att-builder
+
WORKDIR /usr/src
COPY server/Makefile-flash-att Makefile
-RUN make build-flash-attention
+RUN make build-flash-attention -j${MAX_JOBS}
# Build Flash Attention v2 CUDA kernels
FROM kernel-builder as flash-att-v2-builder
+
WORKDIR /usr/src
COPY server/Makefile-flash-att-v2 Makefile
-RUN make build-flash-attention-v2-cuda
+RUN make build-flash-attention-v2-cuda -j${MAX_JOBS}
# Build Transformers exllama kernels
FROM kernel-builder as exllama-kernels-builder
+
WORKDIR /usr/src
COPY server/exllama_kernels/ .
-RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build
+RUN MAX_JOBS=${MAX_JOBS} TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build
# Build Transformers exllama kernels
FROM kernel-builder as exllamav2-kernels-builder
+
WORKDIR /usr/src
COPY server/exllamav2_kernels/ .
-RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build
+RUN MAX_JOBS=${MAX_JOBS} TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build
# Build Transformers awq kernels
FROM kernel-builder as awq-kernels-builder
+
WORKDIR /usr/src
COPY server/Makefile-awq Makefile
-RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-awq
+RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-awq -j${MAX_JOBS}
# Build Transformers CUDA kernels
FROM kernel-builder as custom-kernels-builder
+
WORKDIR /usr/src
COPY server/custom_kernels/ .
# Build specific version of transformers
-RUN python setup.py build
+RUN MAX_JOBS=${MAX_JOBS} python setup.py build
# Build vllm CUDA kernels
FROM kernel-builder as vllm-builder
+
WORKDIR /usr/src
-RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
- wget \
- && rm -rf /var/lib/apt/lists/*
-RUN DEBIAN_FRONTEND=noninteractive apt purge -y --auto-remove cmake
-RUN wget 'https://github.com/Kitware/CMake/releases/download/v3.30.0/cmake-3.30.0-linux-x86_64.tar.gz'
-RUN tar xzvf 'cmake-3.30.0-linux-x86_64.tar.gz'
-RUN ln -s "$(pwd)/cmake-3.30.0-linux-x86_64/bin/cmake" /usr/local/bin/cmake
ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
COPY server/Makefile-vllm Makefile
# Build specific version of vllm
-RUN make build-vllm-cuda
+RUN make build-vllm-cuda -j${MAX_JOBS}
# Build megablocks kernels
FROM kernel-builder as megablocks-kernels-builder
+
WORKDIR /usr/src
COPY server/Makefile-megablocks Makefile
ENV TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX"
-RUN make build-megablocks
+RUN make build-megablocks -j${MAX_JOBS}
# Build punica CUDA kernels
FROM kernel-builder as punica-builder
WORKDIR /usr/src
+
COPY server/punica_kernels/ .
# Build specific version of punica
ENV TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX"
-RUN python setup.py build
+RUN MAX_JOBS=${MAX_JOBS} python setup.py build
# Build eetq kernels
FROM kernel-builder as eetq-kernels-builder
+
WORKDIR /usr/src
COPY server/Makefile-eetq Makefile
# Build specific version of transformers
-RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-eetq
+RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-eetq -j${MAX_JOBS}
# LoRAX base image
FROM nvidia/cuda:12.4.0-base-ubuntu22.04 as base
@@ -185,32 +212,36 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins
COPY --from=pytorch-install /opt/conda /opt/conda
# Copy build artifacts from flash attention builder
-COPY --from=flash-att-builder /usr/src/flash-attention/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
-COPY --from=flash-att-builder /usr/src/flash-attention/csrc/layer_norm/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
-COPY --from=flash-att-builder /usr/src/flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+COPY --from=flash-att-builder /usr/src/flash-attention/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+COPY --from=flash-att-builder /usr/src/flash-attention/csrc/layer_norm/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+COPY --from=flash-att-builder /usr/src/flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
# Copy build artifacts from flash attention v2 builder
-COPY --from=flash-att-v2-builder /usr/src/flash-attention-v2/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+COPY --from=flash-att-v2-builder /usr/src/flash-attention-v2/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
# Copy build artifacts from custom kernels builder
-COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+
# Copy build artifacts from exllama kernels builder
-COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+
# Copy build artifacts from exllamav2 kernels builder
-COPY --from=exllamav2-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+COPY --from=exllamav2-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+
# Copy build artifacts from awq kernels builder
-COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+
# Copy builds artifacts from vllm builder
-COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
# Copy builds artifacts from punica builder
-COPY --from=punica-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+COPY --from=punica-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
# Copy build artifacts from megablocks builder
-COPY --from=megablocks-kernels-builder /usr/src/megablocks/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+COPY --from=megablocks-kernels-builder /usr/src/megablocks/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
# Copy build artifacts from eetq builder
-COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
# Install flash-attention dependencies
RUN pip install einops --no-cache-dir
@@ -238,7 +269,6 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins
g++ \
&& rm -rf /var/lib/apt/lists/*
-
# Final image
FROM base
LABEL source="https://github.com/predibase/lorax"
@@ -250,7 +280,6 @@ RUN chmod +x entrypoint.sh
COPY sync.sh sync.sh
RUN chmod +x sync.sh
-
RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" && \
unzip awscliv2.zip && \
sudo ./aws/install && \
diff --git a/README.md b/README.md
index 8998cfbdc..d6d8582a5 100644
--- a/README.md
+++ b/README.md
@@ -16,6 +16,19 @@ _LoRAX: Multi-LoRA inference server that scales to 1000s of fine-tuned LLMs_
LoRAX (LoRA eXchange) is a framework that allows users to serve thousands of fine-tuned models on a single GPU, dramatically reducing the cost of serving without compromising on throughput or latency.
+---
+
+**π Start Here: For a Robust & Reliable LoRAX Deployment**
+
+While this `README.md` provides a general overview, setting up a performant LoRAX server involves specific hardware, software, and environment configurations. To ensure a smooth, "impossible-to-fail" deployment experience, we highly recommend consulting our detailed **[LoRAX Deployment Playbook](lorax_deployment_playbook.md)**. This guide covers:
+
+* **[Bulletproof Host System Setup](lorax_deployment_playbook.md#phase-1-host-setup):** NVIDIA drivers, Docker, `nvidia-container-toolkit`, and crucial user permissions.
+* **[GPU VRAM Considerations](lorax_deployment_playbook.md#phase-2-deploy-lorax):** Understanding LLM memory requirements and selecting compatible models for your hardware.
+* **[Pre-Built vs. Source Deployment](lorax_deployment_playbook.md#phase-2-deploy-lorax):** Choosing the fastest path or building from source with all CUDA kernels.
+* **[Common Pitfalls & Troubleshooting](lorax_deployment_playbook.md#troubleshooting-guide):** Solutions for Hugging Face authentication, model download stalls, and more.
+
+---
+
## π Table of contents
- [π Table of contents](#-table-of-contents)
@@ -59,6 +72,9 @@ Base models can be loaded in fp16 or quantized with `bitsandbytes`, [GPT-Q](http
Supported adapters include LoRA adapters trained using the [PEFT](https://github.com/huggingface/peft) and [Ludwig](https://ludwig.ai/) libraries. Any of the linear layers in the model can be adapted via LoRA and loaded in LoRAX.
+**βοΈ Model Compatibility & VRAM:** Selecting the right model for your GPU's VRAM is crucial. Not all quantized models are plug-and-play due to varying toolchains. For detailed guidance on VRAM limitations and troubleshooting quantized model errors (e.g., `CUDA out of memory`, `RuntimeError`), refer to **[Phase 2: Deploy LoRAX](lorax_deployment_playbook.md#phase-2-deploy-lorax)** in the LoRAX Deployment Playbook.
+
+
## πββοΈ Getting Started
We recommend starting with our pre-built Docker image to avoid compiling custom CUDA kernels and other dependencies.
@@ -72,6 +88,8 @@ The minimum system requirements need to run LoRAX include:
- Linux OS
- Docker (for this guide)
+**π¨ Critical Setup Note:** Meeting these requirements can be complex. For a step-by-step, verified guide on installing GPU drivers, Docker Engine, and `nvidia-container-toolkit` (including essential user permissions), please follow **[Phase 1: Host Setup](lorax_deployment_playbook.md#phase-1-host-setup)** in the LoRAX Deployment Playbook. Incorrect setup here is the most common cause of deployment failures.
+
### Launch LoRAX Server
#### Prerequisites
@@ -80,6 +98,8 @@ Then
- `sudo systemctl daemon-reload`
- `sudo systemctl restart docker`
+**π‘ For the most reliable and fully explained `docker run` command, including critical flags (`-e HUGGING_FACE_HUB_TOKEN`, `--user`), model selection based on GPU VRAM, and troubleshooting common issues like model download stalls or quantized model compatibility, refer to our comprehensive guide: [Phase 2: Deploy LoRAX](lorax_deployment_playbook.md#phase-2-deploy-lorax) and [Phase 3: Test the API](lorax_deployment_playbook.md#phase-3-test-the-api) in the LoRAX Deployment Playbook.**
+
```shell
model=mistralai/Mistral-7B-Instruct-v0.1
volume=$PWD/data
diff --git a/lorax_deployment_playbook.md b/lorax_deployment_playbook.md
new file mode 100644
index 000000000..d31da4390
--- /dev/null
+++ b/lorax_deployment_playbook.md
@@ -0,0 +1,542 @@
+# π LoRAX Deployment Playbook
+
+Welcome to the **LoRAX Deployment Playbook**! This guide is designed for **first-time operators** setting up a **LoRAX server** on a fresh **Ubuntu 22.04** GPU host with **sudo** access. We'll walk you through each step, explain *why* it matters, and provide quick fixes for common issues. Let's get your **LoRAX server** up and running! π
+
+> **Goal:** Deploy a working **LoRAX server** with a chosen model, understand the process, and troubleshoot issues fast.
+
+---
+
+## π Overview
+
+To deploy **LoRAX**, you need these components in order:
+
+1. **GPU Driver** β Verify `nvidia-smi` works on the host.
+2. **Docker Engine** β Ensure the user is in the `docker` group.
+3. **NVIDIA Container Runtime** β Make GPUs accessible inside containers.
+4. **LoRAX Container** β Pull or build the container image.
+5. **Model Files** β Download or cache model files.
+6. **API** β Confirm the server is listening and passes a basic inference test.
+
+> **Quick Sanity Check:** Stop at the first failure in this sequence:
+> - **A.** Run `nvidia-smi` on the host.
+> - **B.** Test GPU access in a container: `docker run --rm --gpus all nvidia/cuda:12.4.0-base-ubuntu22.04 nvidia-smi`.
+> - **C.** Launch **LoRAX** with `MODEL_ID=mistralai/Mistral-7B-Instruct-v0.1` (the pre-built image is recommended for this check).
+> - **D.** Test the API with `curl`.
+> - **E.** Scale up to a larger model.
+
+---
+
+## Phase 1: Host Setup
+
+Before diving into installations, let's quickly check if your system already has the necessary components. Run the `Check` command for each step. If it passes, you can **skip** the corresponding installation section. If it fails, expand the "Installation Guide" to proceed.
+
+### 1. Check NVIDIA Driver β
+
+Ensure your **NVIDIA driver** is working correctly.
+
+```bash
+nvidia-smi
+```
+**Success:** Displays a table with the driver version and GPU details.
+
+Click to expand: Common Failures & Troubleshooting
+
+- *`command not found`* β Driver not installed or PATH issue.
+- *"NVIDIA-SMI has failed"* β Kernel module mismatch or Secure Boot blocking.
+
+
+
+
+Click to expand: NVIDIA Driver Installation Guide
+
+Installing NVIDIA drivers can be complex and varies greatly by OS and GPU. **We strongly recommend following the official NVIDIA documentation for your specific GPU and Linux distribution.** Example: [NVIDIA Drivers Downloads](https://www.nvidia.com/Download/index.aspx).
+
+
+
+---
+
+### 2. Check Docker Engine Installation π³
+
+Run this command to check if Docker is installed and running:
+
+```bash
+if command -v docker >/dev/null 2>&1 && docker info >/dev/null 2>&1; then
+    echo "Docker Engine: Installed and running. ✅"
+else
+    echo "Docker Engine: NOT detected or NOT running. ❌"
+fi
+```
+**Success:** `Docker Engine: Installed and running. ✅`
+
+Click to expand: Common Failures & Troubleshooting
+
+- `Docker Engine: NOT detected or NOT running. ❌`
+- *GPG/repo errors ("NO_PUBKEY", "Unsigned")* β Key issue; redo key setup.
+- *Architecture mismatch* on non-x86 hosts.
+
+
+
+
+Click to expand: Install Docker Engine
+
+Set up **Docker** to run containers on **Ubuntu 22.04**.
+
+```bash
+sudo apt-get purge -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin
+sudo apt-get autoremove -y --purge
+sudo rm -rf /var/lib/docker /var/lib/containerd
+
+sudo apt update
+sudo apt install -y ca-certificates curl
+sudo install -m 0755 -d /etc/apt/keyrings
+sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
+sudo chmod a+r /etc/apt/keyrings/docker.asc
+echo \
+ "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
+ $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \
+ sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
+
+sudo apt update
+sudo apt install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin
+```
+
+**What This Does:**
+- Updates package metadata.
+- Installs tools for HTTPS repositories.
+- Sets up Docker's GPG key and repository.
+- Installs **Docker Engine**, CLI, and plugins.
+
+**Success:** Run `docker --version` and `systemctl status docker` (should show *active (running)*).
+**Common Failures:**
+- GPG/repo errors ("NO_PUBKEY", "Unsigned") β Key issue; redo key setup.
+- Architecture mismatch on non-x86 hosts.
+
+> **Fix:** Re-run key download steps and `apt update`.
+
+
+
+---
+
+### 3. Check NVIDIA Container Toolkit π§
+
+Run this command to verify GPU access within a container (requires Docker and Toolkit):
+
+```bash
+docker run --rm --gpus all nvidia/cuda:12.4.0-base-ubuntu22.04 nvidia-smi
+```
+**Success:** Displays GPU details (similar to `nvidia-smi` on host).
+
+Click to expand: Common Failures & Troubleshooting
+
+- *"Unknown runtime specified nvidia"* or *"Could not select device driver"* β Toolkit not correctly installed or configured.
+
+
+
+
+Click to expand: Install NVIDIA Container Toolkit
+
+Enable GPU access inside **Docker containers**.
+
+```bash
+# SHORT, FORCEFUL NVIDIA TOOLKIT INSTALL FOR UBUNTU 22.04 (Vast Mystery Box)
+set -euo pipefail
+
+# -- CRITICAL CHECKS --
+[[ "$(lsb_release -rs)" = "22.04" ]] || echo "[WARNING] Not Ubuntu 22.04. You WILL break stuff."
+command -v docker >/dev/null || { echo "[FATAL] Docker not found."; exit 1; }
+
+# -- FORCE OVERWRITE EXISTING GPG KEY --
+sudo rm -f /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
+
+# -- ADD REPO & KEY (no prompt) --
+curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | \
+ sudo gpg --yes --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
+
+curl -fsSL https://nvidia.github.io/libnvidia-container/ubuntu22.04/libnvidia-container.list \
+| sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#' \
+| sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list > /dev/null
+
+# -- INSTALL --
+sudo apt-get update
+sudo apt-get install -y nvidia-container-toolkit
+
+# -- CONFIGURE --
+sudo nvidia-ctk runtime configure --runtime=docker
+sudo systemctl restart docker
+
+# -- SANITY TEST --
+docker run --rm --gpus all nvidia/cuda:12.4.0-base-ubuntu22.04 nvidia-smi \
+|| echo "[FATAL] Docker can't see your GPU. Drivers likely broken. Try 'nvidia-smi' on host."
+```
+
+**What This Does:**
+- Adds the NVIDIA Container Toolkit repository.
+- Installs the toolkit and configures Docker to use NVIDIA GPUs.
+
+**Success:** Check `/etc/docker/daemon.json` for `runtimes.nvidia`. Test with a CUDA container (Step 5).
+**Common Failures:**
+- `nvidia-ctk: command not found` β Installation failed; redo apt steps.
+- "Could not select device driver" β Runtime misconfigured; re-run configure and restart.
+
+> **Fix:** Re-run the toolkit installation and configuration steps.
+
+
+---
+
+### 4. Check User in Docker Group π€
+
+Run this command to check if your user is already in the 'docker' group:
+
+```bash
+groups | grep -q docker && echo "User is in the docker group." || echo "User is NOT in the docker group. Permissions needed."
+```
+**Success:** `User is in the docker group.`
+
+Click to expand: Common Failures & Troubleshooting
+
+- `User is NOT in the docker group. Permissions needed.`
+- *Commands still require `sudo`* β Log out and back in.
+
+
+
+
+Click to expand: Add User to Docker Group
+
+Allow running **Docker** commands without `sudo`.
+
+```bash
+sudo usermod -aG docker $USER
+newgrp docker
+```
+
+**Success:** `groups` shows `docker`; `docker ps` works without `sudo`.
+**Common Failure:** Commands still require `sudo` β Log out and back in.
+
+> **Tip:** Log out and log back in to apply group changes.
+
+
+
+---
+
+### 5. Hugging Face Authentication π
+
+Some models on Hugging Face require authentication to download. This is especially true for "gated" models like Mistral, Llama, and other proprietary models. You'll need a **Hugging Face Hub Token** to access these models.
+
+**What is a Hugging Face Hub Token?**
+A personal access token that acts like a password for programmatic access to Hugging Face. It allows LoRAX to download models on your behalf.
+
+Run this command to check if your `HUGGING_FACE_HUB_TOKEN` is already set as an environment variable:
+
+```bash
+if [ -n "$HUGGING_FACE_HUB_TOKEN" ]; then
+    echo "HUGGING_FACE_HUB_TOKEN is set. ✅"
+else
+    echo "HUGGING_FACE_HUB_TOKEN is NOT set. ❌"
+fi
+```
+**Success:** `HUGGING_FACE_HUB_TOKEN is set. ✅`
+
+Click to expand: Common Failures & Troubleshooting
+
+- `HUGGING_FACE_HUB_TOKEN is NOT set. ❌` → Token missing or not exported correctly.
+
+
+
+
+Click to expand: Set up HUGGING_FACE_HUB_TOKEN
+
+#### Get Your Hugging Face Token
+
+1. **Visit the token page:** Go to [huggingface.co/settings/tokens](https://huggingface.co/settings/tokens)
+2. **Generate a new token:**
+ - Click "New token"
+ - Give it a name (e.g., "LoRAX Deployment")
+ - Select "Read" role (sufficient for downloading models)
+ - Click "Generate token"
+3. **Copy the token:** It will look like `hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx`
+4. **Request model access:** For gated models, visit their Hugging Face page and click "Request access" (e.g., [Mistral-7B-Instruct-v0.3](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3))
+
+#### Set the Environment Variable
+
+Add the token to your shell configuration so it's available for Docker:
+
+```bash
+# Add this line to your ~/.bashrc or ~/.zshrc file
+export HUGGING_FACE_HUB_TOKEN='hf_YOUR_TOKEN_HERE'
+
+# Reload your shell configuration
+source ~/.bashrc # or source ~/.zshrc if using zsh
+
+# Verify it's set
+echo $HUGGING_FACE_HUB_TOKEN
+```
+
+> **Important:** Replace `hf_YOUR_TOKEN_HERE` with your actual token. The Docker container will pick up this environment variable when passed with the `-e` flag.
+
+> **Note:** For public models like `gpt2`, you don't need a token, but having one set up allows you to easily switch to gated models later.
+
+
+
+---
+
+## Phase 2: Deploy LoRAX
+
+You can deploy LoRAX using either the **pre-built image** or by **building from source**. Both methods now support the same set of models:
+- `meta-llama/Llama-3.2-3B-Instruct`
+- `mistralai/Mistral-7B-Instruct-v0.1`
+- `meta-llama/Meta-Llama-3-8B-Instruct`
+
+Choose your deployment path:
+- **(A) Pre-built Image** β Fastest option, recommended for most users.
+- **(B) Build from Source** β For custom changes or unreleased patches.
+
+### 1. (Option A) Pull the Pre-built Image
+
+```bash
+docker pull ghcr.io/predibase/lorax:main
+```
+
+### 1. (Option B) Build the Image from Source
+
+Want to build LoRAX from source for custom changes or the latest patches? Follow these steps:
+
+```bash
+# 1. Clone the repository (if you haven't already)
+# NOTE: This guide uses a battle-tested branch of the LoRAX repository
+# that includes fixes for common on-premise deployment issues (e.g., build-time
+# dependencies and submodule initialization). Once these fixes are
+# merged upstream, you can use the official `predibase/lorax.git` repository.
+git clone -b feat/deployment-playbook-enhancements https://github.com/minhkhoango/lorax.git
+cd lorax
+# 2. Initialize submodules
+git submodule update --init --recursive
+```
+
+> **Tip: Speed Up Your Build!**
+>
+> By default, the Dockerfile uses `MAX_JOBS=2` to avoid out-of-memory (OOM) errors on machines with limited RAM. If you have a lot of RAM (e.g., 64GB, 96GB, or more), you can **dramatically speed up the build** by increasing this value.
+>
+> **How to adjust build speed:**
+> 1. Open your `Dockerfile` at the root of your cloned repository (`~/lorax/Dockerfile`) in your editor.
+> 2. Locate the line:
+> ```Dockerfile
+> ENV MAX_JOBS=2
+> ```
+> (This line is typically found around line 90 in the `Dockerfile` within the `kernel-builder` stage, but verify its exact location).
+> 3. Change `2` to a higher number (e.g., `16`, `24`, or `32`) if your system has enough RAM.
+> 4. Save your `Dockerfile` and rebuild the image.
+>
+> *Not sure how much RAM you have? Run `htop` or `free -h` in your terminal. If you run out of memory during build, lower `MAX_JOBS` and try again!*
+
+Now, build your Docker image:
+
+```bash
+export DOCKER_BUILDKIT=1
+docker build -t my-lorax-server -f Dockerfile .
+```
+
+---
+
+### 2. Choose Your Model & Run the Container
+
+Refer to the table below to select a model that fits your hardware and requirements:
+
+| **Model** | **Params** | **VRAM (FP16/BF16)** | **Notes** |
+|-----------|------------|-----------------------|-----------|
+| `meta-llama/Llama-3.2-3B-Instruct` | 3B | ~7 GB | Good for 8GB+ GPUs |
+| `mistralai/Mistral-7B-Instruct-v0.1` | 7B | ~14β15 GB | Needs 16β24 GB VRAM. |
+| `meta-llama/Meta-Llama-3-8B-Instruct` | 8B | ~16 GB | Tight on 16 GB; better with 24 GB. |
+
+> **VRAM Tips:**
+> - Keep **10β15% VRAM free** for KV cache and overhead.
+> - **6β8 GB GPUs**: Stick to quantized or smaller models.
+> - **12β16 GB GPUs**: Comfortable for 7B; tight for 8B.
+> - **24 GB+ GPUs**: Suitable for 13B or multi-instance setups.
+
+#### Run the Container
+
+Set your desired model and image name (see below):
+
+```bash
+MODEL_ID="meta-llama/Llama-3.2-3B-Instruct" # or mistralai/Mistral-7B-Instruct-v0.1, meta-llama/Meta-Llama-3-8B-Instruct
+SHARDED_MODEL="false" # Set to 'true' for sharded (multi-GPU) models like 70B
+PORT=80 # Host port to access the LoRAX server
+
+# For pre-built image:
+IMAGE_NAME="ghcr.io/predibase/lorax:main"
+# For source-built image:
+# IMAGE_NAME="my-lorax-server"
+
+docker run --rm \
+ --name lorax \
+ --gpus all \
+ -e HUGGING_FACE_HUB_TOKEN="$HUGGING_FACE_HUB_TOKEN" \
+ -e TRANSFORMERS_CACHE=/data \
+ -v "$HOME/lorax_model_cache":/data \
+ -v "$HOME/lorax_outlines_cache":/root/.cache/outlines \
+ --user "$(id -u):$(id -g)" \
+ -p ${PORT}:80 \
+ $IMAGE_NAME \
+ --model-id "$MODEL_ID" \
+ --sharded "$SHARDED_MODEL"
+```
+
+
+Click to expand: Explanation of Docker Run Flags
+
+**What This Does:**
+- `docker run --rm --name lorax`: Starts a new container, removes it on exit, and names it `lorax`.
+- `--gpus all`: Grants the container access to all available GPUs.
+- `-e HUGGING_FACE_HUB_TOKEN`: Passes your Hugging Face authentication token.
+- `-v "$HOME/lorax_model_cache":/data`: Mounts a local directory for persistent model caching.
+- `-v "$HOME/lorax_outlines_cache":/root/.cache/outlines`: Mounts cache for Outlines library.
+- `--user "$(id -u):$(id -g)"`: Runs the container process as your host user for permission consistency.
+- `-p ${PORT}:80`: Maps the container's internal port 80 to your specified host port.
+- `$IMAGE_NAME`: Specifies the Docker image to use (pre-built or source-built).
+- `--model-id "$MODEL_ID"`: Sets the Hugging Face model to load.
+- `--sharded "$SHARDED_MODEL"`: Configures for multi-GPU sharding if set to `true`.
+
+
+
+---
+
+## Phase 3: Test the API
+
+Once logs show the server is ready, test the **LoRAX API**.
+
+**Example Inference:**
+
+```bash
+curl 127.0.0.1:80/generate \
+ -X POST \
+ -d '{ "inputs": "[INST] What LLM model are you? [/INST]", "parameters": { "max_new_tokens": 64 } }' \
+ -H 'Content-Type: application/json'
+```
+
+If you're using a base model that supports LoRA adapters (like Mistral-7B) and have an adapter ID, you can test prompting a specific fine-tuned adapter.
+
+```bash
+curl 127.0.0.1:80/generate \
+ -X POST \
+ -d '{
+ "inputs": "[INST] Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May? [/INST]",
+ "parameters": {
+ "max_new_tokens": 64,
+ "adapter_id": "vineetsharma/qlora-adapter-Mistral-7B-Instruct-v0.1-gsm8k"
+ }
+ }' \
+ -H 'Content-Type: application/json'
+```
+
+Note: Replace vineetsharma/qlora-adapter-Mistral-7B-Instruct-v0.1-gsm8k with an adapter_id that is compatible with
+your chosen base model.
+
+**Success:** Logs show model download/cache hit and βModel loadedβ; health endpoint responds.
+
+Click to expand: Common Failures during API Test
+
+**Common Failures:** Refer to the Comprehensive Troubleshooting Guide below.
+
+
+
+
+## Troubleshooting Guide
+
+
+Click to expand: Comprehensive Troubleshooting Guide
+
+**Format:** [Stage] Symptom β Cause β Fix
+
+- **[Host]** `nvidia-smi` fails β Driver issue β Check `dmesg | grep -i nvidia | tail -n5`; reinstall driver or fix Secure Boot.
+- **[Container]** βCould not select device driverβ β Runtime misconfigured β Verify `/etc/docker/daemon.json`; redo toolkit setup.
+- **[Docker]** Cache permission denied β Root-owned files β Run `sudo chown -R $(id -u):$(id -g) $HOME/lorax_model_cache`.
+- **[Model Load]** CUDA OOM β Model too large β Check `nvidia-smi`; use smaller/quantized model.
+- **[Model Load]** Download stalls β Network issue β Use manual download workaround.
+- **[Model Load]** `RuntimeError: weight not found` or **`TypeError`** → Model or quantization incompatibility with the pre-built image. For detailed fixes, see the "Model Compatibility Beyond Mistral-7B (Build from Source)" section below.
+- **[Download]** `UserWarning: Not enough free disk space` or `No space left on device` (during model download/caching):** The mounted model cache directory has insufficient space. Check `df -h $HOME/lorax_model_cache`, then `rm -rf` unused model folders. Consider larger disk if needed.
+- **[Performance]** Slow first call β Warmup overhead β Send a short warmup prompt.
+- **[Performance]** Low GPU usage (<30%) β Small batches β Enable batching or increase concurrency.
+- **[Stability]** Exit code 137 β Host OOM β Check `dmesg | tail`; reduce model size.
+
+
+
+
+
+
+Model Compatibility Beyond Mistral-7B (Build from Source)
+
+**Common Issues & Solutions:**
+
+* **`TypeError: TensorParallelColumnLinear.load_multi() got an unexpected keyword argument 'fan_in_fan_out'` (for `gpt2`):**
+ * **Cause:** This error is specific to `gpt2`'s `Conv1D` layer architecture and an API mismatch with the `vLLM` integration in LoRAX's custom modeling.
+ * **Fix:** Ensure your `vLLM` is pinned to a compatible version/commit in `server/Makefile-vllm` (e.g., `v0.7.3` or specific fixes like `9985d06add07a4cc691dc54a7e34f54205c04d40` if explicitly needed). Rebuild your Docker image. The `--model-impl transformers` flag, while a workaround in some TGI contexts, is not supported by `lorax-launcher`.
+
+* **`ImportError: No module named 'msgspec'` (for `Qwen` models or others using newer `vLLM` features):**
+ * **Cause:** The `vLLM` version integrated in your build may require the `msgspec` Python library, which is not a default dependency.
+ * **Fix:** Add `msgspec` to your `server/requirements.txt` file and rebuild your Docker image with `--no-cache` to ensure the new dependency is installed.
+
+* **`RuntimeError: weight transformer.wte.weight does not exist` (for `bigcode/starcoder2-3b`):**
+ * **Cause:** This indicates a specific naming convention or structural mismatch for certain weight files within the `bigcode/starcoder2-3b` checkpoint that LoRAX's `FlashSantacoderModel` is trying to load.
+ * **Fix:** This often requires deeper debugging of the model's weight structure or changes within `lorax_server/models/custom_modeling/flash_santacoder_modeling.py`. Consider this model a known edge case that may require specific code adjustments beyond standard dependency management.
+
+
+
+---
+
+## π§Ή Cleanup & Reset
+
+
+Click to expand: Cleanup & Reset Your Environment
+
+```bash
+docker stop lorax
+docker system prune -f
+rm -rf $HOME/lorax_model_cache/*
+sudo chown -R $(id -u):$(id -g) $HOME/lorax_model_cache
+```
+
+
+
+---
+
+## π Quick Command Recap
+
+```bash
+# Check GPU access
+nvidia-smi
+docker run --rm --gpus all nvidia/cuda:12.4.0-base-ubuntu22.04 nvidia-smi
+
+# Pull and run LoRAX (Pre-built Image)
+MODEL_ID="mistralai/Mistral-7B-Instruct-v0.1"; \
+docker run --rm --name lorax --gpus all -e HUGGING_FACE_HUB_TOKEN="$HUGGING_FACE_HUB_TOKEN" \
+ -e TRANSFORMERS_CACHE=/data -v "$HOME/lorax_model_cache":/data \
+ -v "$HOME/lorax_outlines_cache":/root/.cache/outlines \
+ --user "$(id -u):$(id -g)" -p 80:80 \
+ ghcr.io/predibase/lorax:main --model-id "$MODEL_ID" --sharded false
+
+# Test the API
+curl 127.0.0.1:80/generate \
+ -X POST \
+ -d '{ "inputs": "[INST] What LLM model are you? [/INST]", "parameters": { "max_new_tokens": 64 } }' \
+ -H 'Content-Type: application/json'
+```
+
+
+---
+
+## π Next Steps
+
+
+Click to expand: Beyond Basic Deployment (Next Steps)
+
+- **Monitoring:** Add logging/metrics with Prometheus or parse stdout.
+- **Security:** Set up a reverse proxy (nginx/traefik) with TLS for public access.
+- **Automation:** Create health/warmup scripts (e.g., systemd or Docker Compose).
+- **Reliability:** Add watchdog with `Restart=on-failure` (systemd or Docker policies).
+
+
+
+---
+
+**Happy Deploying!** π
+
diff --git a/server/requirements.txt b/server/requirements.txt
index 036f3be8a..c808e2032 100644
--- a/server/requirements.txt
+++ b/server/requirements.txt
@@ -32,19 +32,6 @@ mpmath==1.3.0 ; python_version >= "3.9" and python_version < "4.0"
multidict==6.1.0 ; python_version >= "3.9" and python_version < "4.0"
networkx==3.2.1 ; python_version >= "3.9" and python_version < "4.0"
numpy==1.26.4 ; python_version >= "3.9" and python_version < "4.0"
-nvidia-cublas-cu12==12.1.3.1 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "4.0"
-nvidia-cuda-cupti-cu12==12.1.105 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "4.0"
-nvidia-cuda-nvrtc-cu12==12.1.105 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "4.0"
-nvidia-cuda-runtime-cu12==12.1.105 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "4.0"
-nvidia-cudnn-cu12==9.1.0.70 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "4.0"
-nvidia-cufft-cu12==11.0.2.54 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "4.0"
-nvidia-curand-cu12==10.3.2.106 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "4.0"
-nvidia-cusolver-cu12==11.4.5.107 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "4.0"
-nvidia-cusparse-cu12==12.1.0.106 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "4.0"
-nvidia-ml-py==12.570.86 ; python_version >= "3.9" and python_version < "4.0"
-nvidia-nccl-cu12==2.20.5 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "4.0"
-nvidia-nvjitlink-cu12==12.8.61 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "4.0"
-nvidia-nvtx-cu12==12.1.105 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "4.0"
opentelemetry-api==1.21.0 ; python_version >= "3.9" and python_version < "4.0"
opentelemetry-exporter-otlp-proto-common==1.21.0 ; python_version >= "3.9" and python_version < "4.0"
opentelemetry-exporter-otlp-proto-grpc==1.21.0 ; python_version >= "3.9" and python_version < "4.0"
@@ -74,7 +61,6 @@ stanford-stk==0.7.1 ; python_version >= "3.9" and python_version < "4.0" and sys
sympy==1.13.3 ; python_version >= "3.9" and python_version < "4.0"
tiktoken==0.5.2 ; python_version >= "3.9" and python_version < "4.0"
tokenizers==0.21.0 ; python_version >= "3.9" and python_version < "4.0"
-torch==2.6.0 ; python_version >= "3.9" and python_version < "4.0"
tqdm==4.67.1 ; python_version >= "3.9" and python_version < "4.0"
transformers==4.49.0 ; python_version >= "3.9" and python_version < "4.0"
triton==3.0.0 ; python_version >= "3.9" and sys_platform == "linux" and python_version < "4.0" or python_version >= "3.9" and python_version < "3.13" and platform_machine == "x86_64" and platform_system == "Linux"