Skip to content

Commit cb78229

Browse files
committed
Improve load time by 7x by using fastsafetensors
1 parent e4c81d9 commit cb78229

File tree

2 files changed

+2
-2
lines changed

2 files changed

+2
-2
lines changed

run-nemotron-v2-VL/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ RUN export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
2828
RUN python use_existing_torch.py && \
2929
pip install -r requirements/build.txt && \
3030
pip install --no-build-isolation -e .
31-
31+
RUN pip install "fastsafetensors>=0.1.10"
3232
# Expose port 8000 for vLLM API server
3333
EXPOSE 8000
3434

run-nemotron-v2-VL/run.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,4 +10,4 @@ docker run --runtime nvidia --gpus all \
1010
--env "HUGGING_FACE_HUB_TOKEN=<YOUR_HUGGINGFACE_TOKEN>" \
1111
--env "TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas" \
1212
vllm:25.10 \
13-
vllm serve nvidia/Nemotron-Nano-VL-12B-V2-FP4-QAD --trust-remote-code --quantization modelopt_fp4 --max-model-len 24000 --gpu-memory-utilization 0.3
13+
vllm serve nvidia/Nemotron-Nano-VL-12B-V2-FP4-QAD --load-format fastsafetensors --trust-remote-code --quantization modelopt_fp4 --max-model-len 24000 --gpu-memory-utilization 0.3

0 commit comments

Comments
 (0)