2 files changed (+0, −12 lines)
 # Launch Docker container with NVIDIA GPU support
 docker run --runtime nvidia --gpus all \
-    # Increase memory lock and stack size limits for GPU operations
     --ulimit memlock=-1 --ulimit stack=67108864 \
-    # Mount HuggingFace cache to avoid re-downloading models
     -v ~/.cache/huggingface:/root/.cache/huggingface \
-    # Mount vLLM cache for compiled kernels and other artifacts
     -v ~/.cache/vllm:/root/.cache/vllm \
-    # Expose port 8000 for vLLM API server
     -p 8000:8000 \
-    # Set HuggingFace token for model access
     --env "HUGGING_FACE_HUB_TOKEN=<YOUR_HUGGINGFACE_TOKEN>" \
-    # Set Triton PTXAS compiler path for CUDA compilation
     --env "TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas" \
-    # Use the vllm:25.10 image built by build.sh
     vllm:25.10 \
-    # Serve Nemotron Nano VL 12B model with FP4 quantization
-    # --trust-remote-code: Allow execution of custom model code
-    # --quantization modelopt_fp4: Use FP4 quantization for reduced memory
-    # --max-model-len 24000: Set maximum sequence length
-    # --gpu-memory-utilization 0.3: Use 30% of GPU memory for model weights
     vllm serve nvidia/Nemotron-Nano-VL-12B-V2-FP4-QAD --trust-remote-code --quantization modelopt_fp4 --max-model-len 24000 --gpu-memory-utilization 0.3
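With the inline comments removed, the command is a single valid invocation: a `#` comment placed between backslash-continued lines ends the continuation and splits the command.

Once the container is up, you can verify the deployment with a request to vLLM's OpenAI-compatible API. The following is a minimal smoke test, assuming the server is reachable on localhost:8000 from the host; the image URL is a placeholder to replace with your own.

 # Query the served model through vLLM's OpenAI-compatible endpoint.
 # The model name must match the one passed to `vllm serve` above;
 # the image URL below is only an illustrative placeholder.
 curl http://localhost:8000/v1/chat/completions \
     -H "Content-Type: application/json" \
     -d '{
           "model": "nvidia/Nemotron-Nano-VL-12B-V2-FP4-QAD",
           "messages": [
             {
               "role": "user",
               "content": [
                 {"type": "text", "text": "Describe this image in one sentence."},
                 {"type": "image_url", "image_url": {"url": "https://example.com/sample.jpg"}}
               ]
             }
           ],
           "max_tokens": 128
         }'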