
Commit cdfa114

feat: Pre-load ML models during image build
Pre-download semantic-router ML models (~18GB) during bootc image build
instead of on first boot. This eliminates SSH timeouts and first-boot delays
caused by model downloads.

Changes:

- Add preload-models.sh script to download models during build
- Mount /var/cache/vllm-sr as hostPath in semantic-router deployment
- Update HF_HOME env vars to use pre-cached directory
- Increase default VM resources to 16GB RAM / 8 vCPUs

The final image size increases to ~24GB, but VMs boot with fully operational
semantic routing immediately.

pre-commit.check-secrets: ENABLED
1 parent 9056f86 commit cdfa114

File tree (5 files changed: +108 −15 lines)

- Containerfile
- README.md
- manifests/semantic-router/overlays/full/deployment.yaml
- scripts/preload-models.sh
- scripts/start-bootc-vm.sh

Containerfile

Lines changed: 10 additions & 1 deletion
```diff
@@ -78,7 +78,16 @@ COPY config/templates/ /etc/semantic-router/templates/
 COPY config/llm-router-dashboard.json /etc/semantic-router/
 COPY scripts/configure-semantic-router.sh /usr/local/bin/
 COPY scripts/setup-gpu-operator.sh /usr/local/bin/
-RUN chmod +x /usr/local/bin/configure-semantic-router.sh /usr/local/bin/setup-gpu-operator.sh
+COPY scripts/preload-models.sh /usr/local/bin/
+RUN chmod +x /usr/local/bin/configure-semantic-router.sh \
+    /usr/local/bin/setup-gpu-operator.sh \
+    /usr/local/bin/preload-models.sh
+
+# ─────────────────────────────────────────────────────────────────────────────
+# Pre-download semantic-router ML models (~18GB)
+# ─────────────────────────────────────────────────────────────────────────────
+# This downloads models during image build to avoid first-boot delays
+RUN /usr/local/bin/preload-models.sh /var/cache/vllm-sr
 
 # ─────────────────────────────────────────────────────────────────────────────
 # Helm — needed to install the NVIDIA GPU Operator post-boot (GPU builds only)
```

README.md

Lines changed: 11 additions & 4 deletions
````diff
@@ -64,8 +64,14 @@ podman build --build-arg ENABLE_GPU=false -t hybrid-inference-bootc:latest -f Co
 ```
 
 The `ENABLE_GPU=false` build skips the NVIDIA container toolkit, local SLM
-manifests, vLLM image pre-pull, and Helm. The resulting image is smaller and
-builds on any host without NVIDIA repos.
+manifests, vLLM image pre-pull, and Helm. The resulting image builds on any
+host without NVIDIA repos.
+
+**ML Model Pre-loading:** The image build pre-downloads semantic-router ML
+models (~18GB including jailbreak detection, PII detection, and domain
+classification models) to eliminate first-boot download delays. This increases
+the final image size to ~24GB but ensures VMs boot with fully operational
+semantic routing immediately.
 
 CI builds run automatically on push to `main` and publish multi-arch
 (amd64 + arm64) manifest lists to
@@ -75,9 +81,10 @@ CI builds run automatically on push to `main` and publish multi-arch
 ## First Boot
 
 > [!NOTE]
-> On first boot, infrastructure pods may show `CreateContainerConfigError`
+> On first boot, infrastructure pods may briefly show `CreateContainerConfigError`
 > (waiting for ConfigMap/Secret). If built with GPU support, the vLLM SLM
-> pod will show `Pending` (waiting for GPU resources). This is expected.
+> pod will show `Pending` (waiting for GPU resources). semantic-router pods
+> start immediately since ML models are pre-loaded during image build.
 
 
 ### 1. Boot the image
````

manifests/semantic-router/overlays/full/deployment.yaml

Lines changed: 9 additions & 8 deletions
```diff
@@ -50,11 +50,11 @@ spec:
           protocol: TCP
         env:
         - name: HOME
-          value: "/tmp"
+          value: "/var/cache/vllm-sr"
         - name: HF_HOME
-          value: "/tmp/hf-cache"
+          value: "/var/cache/vllm-sr/huggingface"
         - name: HUGGINGFACE_HUB_CACHE
-          value: "/tmp/hf-cache"
+          value: "/var/cache/vllm-sr/huggingface"
         - name: LITELLM_API_KEY
           valueFrom:
             secretKeyRef:
@@ -70,8 +70,8 @@ spec:
           subPath: config.yaml
         - name: models
          mountPath: /app/models
-        - name: hf-cache
-          mountPath: /tmp/hf-cache
+        - name: model-cache
+          mountPath: /var/cache/vllm-sr
         - name: vllm-sr-workdir
           mountPath: /app/.vllm-sr
         - name: envoy-config
@@ -113,9 +113,10 @@ spec:
       - name: models
        emptyDir:
           sizeLimit: 25Gi
-      - name: hf-cache
-        emptyDir:
-          sizeLimit: 25Gi
+      - name: model-cache
+        hostPath:
+          path: /var/cache/vllm-sr
+          type: Directory
       - name: vllm-sr-workdir
         emptyDir: {}
       - name: envoy-config
```
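With `type: Directory`, the kubelet refuses to start the pod if `/var/cache/vllm-sr` does not already exist on the node, which suits this commit: a missing cache means the build-time preload was skipped, and failing loudly beats silently re-downloading ~18GB. For reference, a hedged sketch of the lenient alternative (not part of this commit):

```yaml
# Hypothetical variant: DirectoryOrCreate would create an empty directory
# if the path is missing, and models would then download at runtime.
- name: model-cache
  hostPath:
    path: /var/cache/vllm-sr
    type: DirectoryOrCreate
```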

scripts/preload-models.sh

Lines changed: 76 additions & 0 deletions
```bash
#!/usr/bin/bash
# preload-models.sh - Pre-download semantic-router ML models during image build
#
# This script runs a temporary vllm-sr container to download all ML models
# (~18GB) so they don't need to be downloaded on first boot.

set -euo pipefail

CACHE_DIR="${1:-/var/cache/vllm-sr}"
CONTAINER_NAME="vllm-sr-preload-$$"

echo "Creating cache directory at ${CACHE_DIR}..."
mkdir -p "${CACHE_DIR}"

# Create a minimal config that will trigger model downloads
TEMP_CONFIG=$(mktemp)
cat > "${TEMP_CONFIG}" <<'EOF'
version: v0.3
listeners:
  - name: "api"
    address: "0.0.0.0"
    port: 8801
    timeout: "300s"
providers:
  defaults:
    default_model: "test-model"
  models:
    - name: "test-model"
      backend_refs:
        - name: "local"
          weight: 1
      endpoint: "localhost:8000"
      protocol: "http"
      api_key: "test"
routing:
  modelCards:
    - name: "test-model"
signals:
  domains:
    - name: "other"
      description: "Test"
      mmlu_categories: ["other"]
decisions:
  - name: "default"
    description: "Default"
    priority: 1
    rules:
      operator: "OR"
      conditions:
        - type: "domain"
          name: "other"
    modelRefs:
      - model: "test-model"
        use_reasoning: false
EOF

echo "Starting vllm-sr container to pre-download models..."
podman run --rm \
  --name "${CONTAINER_NAME}" \
  -v "${CACHE_DIR}":/root/.cache:Z \
  -v "${TEMP_CONFIG}":/tmp/config.yaml:ro,Z \
  --env VLLM_SR_RUNTIME_CONFIG_PATH=/tmp/config.yaml \
  ghcr.io/vllm-project/semantic-router/vllm-sr:latest \
  timeout 300 /app/start-router.sh /tmp/config.yaml /app/.vllm-sr || true

rm -f "${TEMP_CONFIG}"

# Verify models were downloaded
if [[ -d "${CACHE_DIR}/huggingface" ]]; then
  CACHE_SIZE=$(du -sh "${CACHE_DIR}" | cut -f1)
  echo "✓ Models cached successfully (${CACHE_SIZE})"
  echo "Cache contents:"
  ls -lh "${CACHE_DIR}"
else
  echo "⚠ Warning: Models may not have been fully cached"
fi
```
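Note that the script's final check only prints a warning, so a failed download would still produce a "successful" image build. A minimal sketch of a stricter variant (the `verify_cache` helper name is hypothetical, not part of the commit) that returns a non-zero status, which would fail the Containerfile `RUN` step:

```bash
#!/usr/bin/bash
# Hypothetical helper: same check as the script's final step, but with a
# non-zero return code so a build can fail fast when the cache is missing.
verify_cache() {
  local cache_dir="$1"
  if [[ -d "${cache_dir}/huggingface" ]]; then
    echo "✓ Models cached successfully ($(du -sh "${cache_dir}" | cut -f1))"
    return 0
  fi
  echo "⚠ Models may not have been fully cached in ${cache_dir}" >&2
  return 1
}
```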

scripts/start-bootc-vm.sh

Lines changed: 2 additions & 2 deletions
```diff
@@ -37,8 +37,8 @@ for arg in "$@"; do
   esac
 done
 
-RAM=8192
-VCPUS=4
+RAM=16384
+VCPUS=8
 DISK_SIZE=100
 
 VM_NAME="${VM_NAME:-bootc-vm-$(date +%Y%m%d%H%M%S)}"
```
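The raised defaults are hard-coded, while `VM_NAME` in the same script already honors an environment override. A sketch of applying that existing pattern to the resource knobs too, so smaller hosts could still dial them back down (an assumption, not part of this commit):

```bash
#!/usr/bin/bash
# Hypothetical variant: keep the new defaults but allow environment
# overrides, mirroring the script's VM_NAME="${VM_NAME:-...}" pattern.
RAM="${RAM:-16384}"            # MiB; raised from 8192 by this commit
VCPUS="${VCPUS:-8}"            # raised from 4
DISK_SIZE="${DISK_SIZE:-100}"  # GB; unchanged, still fits the ~24GB image
echo "RAM=${RAM} VCPUS=${VCPUS} DISK_SIZE=${DISK_SIZE}"
```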
