Skip to content

Commit cb78229

Browse files
committed
Improve load time by 7x by using fastsafetensors
1 parent e4c81d9 commit cb78229

File tree

2 files changed

+2
-2
lines changed

2 files changed

+2
-2
lines changed

run-nemotron-v2-VL/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ RUN export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
2828
RUN python use_existing_torch.py && \
2929
pip install -r requirements/build.txt && \
3030
pip install --no-build-isolation -e .
31-
31+
RUN pip install "fastsafetensors>=0.1.10"
3232
# Expose port 8000 for vLLM API server
3333
EXPOSE 8000
3434

run-nemotron-v2-VL/run.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,4 +10,4 @@ docker run --runtime nvidia --gpus all \
1010
--env "HUGGING_FACE_HUB_TOKEN=<YOUR_HUGGINGFACE_TOKEN>" \
1111
--env "TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas" \
1212
vllm:25.10 \
13-
vllm serve nvidia/Nemotron-Nano-VL-12B-V2-FP4-QAD --trust-remote-code --quantization modelopt_fp4 --max-model-len 24000 --gpu-memory-utilization 0.3
13+
vllm serve nvidia/Nemotron-Nano-VL-12B-V2-FP4-QAD --load-format fastsafetensors --trust-remote-code --quantization modelopt_fp4 --max-model-len 24000 --gpu-memory-utilization 0.3

0 commit comments

Comments
 (0)