# =============================================================================
# Petronella Technology Group - vLLM Configuration for RTX 6000 Pro Blackwell
# =============================================================================
#
# NVIDIA RTX 6000 Pro (Blackwell) Specifications:
# - 96 GB GDDR7 VRAM per GPU
# - 2,016 GB/s memory bandwidth
# - PCIe 5.0 x16 (~64 GB/s per direction)
# - No NVLink (PCIe only) -- use Pipeline Parallelism
# - FP8 Tensor Core support (5th-gen Tensor Cores)
# - 24,064 CUDA cores, 188 RT cores (4th gen), 752 Tensor cores (5th gen)
#
# This config is optimized for the RTX 6000 Pro's strengths:
# - High VRAM per GPU means fewer GPUs needed
# - GDDR7 bandwidth supports large KV caches
# - PCIe interconnect means PP is mandatory for multi-GPU
# - FP8 support enables quantized large models
#
# GPU Count Recommendations (see the commented launch example after this header):
# 1x RTX 6000 Pro (96GB): Llama 3.1 70B (FP8), Qwen2.5 72B (FP8), Mixtral 8x7B
# 2x RTX 6000 Pro (192GB): Llama 3.1 70B (BF16, 128K ctx), Mixtral 8x22B (FP8)
# 4x RTX 6000 Pro (384GB): Llama 3.1 405B (INT4/AWQ), Mixtral 8x22B (BF16)
# 8x RTX 6000 Pro (768GB): Llama 3.1 405B (FP8), DeepSeek-V3 (FP8)
#
# Docs: https://petronellatech.com/hardware/rtx-6000-pro-blackwell-multi-gpu-vllm/
# =============================================================================
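
# Example launch (illustrative sketch -- assumes a vLLM build whose
# `vllm serve` accepts a YAML config file via --config; otherwise pass the
# same keys as individual CLI flags):
#
#   VLLM_API_KEY=<your-key> vllm serve meta-llama/Llama-3.1-70B-Instruct \
#       --config vllm-rtx6000-pro.yaml
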
# --- Model Configuration ---
model: meta-llama/Llama-3.1-70B-Instruct
dtype: bfloat16
quantization: null # Set to "fp8" for 405B models
trust_remote_code: true
# --- Parallelism ---
# RTX 6000 Pro uses PCIe -- always use pipeline parallelism
tensor_parallel_size: 1
pipeline_parallel_size: 2 # Adjust: 1, 2, 4, or 8
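
# Illustrative 4-GPU scale-out under the same PCIe-only constraint (commented
# out; the values are assumptions for an FP8 405B-class model, not tested
# settings):
# tensor_parallel_size: 1
# pipeline_parallel_size: 4
# quantization: "fp8"
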
# --- Memory Configuration ---
# 96GB GDDR7 allows aggressive memory utilization
gpu_memory_utilization: 0.92
max_model_len: 65536 # 64K tokens with 2x GPUs
swap_space: 8 # GiB of CPU memory for swapping
kv_cache_dtype: auto # "auto" follows the model dtype; set "fp8" to roughly halve KV cache size
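
# Rough KV cache sizing behind the worked examples at the bottom of this file
# (assumes Llama 3.1 70B: 80 layers, 8 KV heads, head_dim 128, BF16 = 2 bytes):
#   bytes per token = 2 (K+V) * 80 * 8 * 128 * 2 ≈ 0.33 MB
#   32K tokens ≈ 10.5 GB, 64K tokens ≈ 21 GB (roughly halved with kv_cache_dtype "fp8")
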
# --- Scheduling (PP-optimized) ---
# Higher max_num_seqs fills pipeline bubbles more effectively
max_num_seqs: 512
max_num_batched_tokens: 65536
enable_chunked_prefill: true
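
# Pipeline-bubble rule of thumb (GPipe-style estimate, not a vLLM-specific figure):
#   idle fraction ≈ (p - 1) / (m + p - 1), with p pipeline stages and m micro-batches
#   PP=2 with 1 micro-batch in flight -> ~50% idle; with 8 in flight -> ~11% idle,
#   which is why a generous max_num_seqs matters on this topology.
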
# --- Performance ---
# Eager mode can help on PCIe systems (skips CUDA graph capture overhead)
enforce_eager: false # Set true if you see CUDA graph compilation issues
disable_log_requests: true
disable_log_stats: false # Keep stats for monitoring
# --- Serving ---
host: 0.0.0.0
port: 8000
api_key: null # Set via VLLM_API_KEY env var
# --- RTX 6000 Pro Specific Optimizations ---
# GDDR7 has higher bandwidth than GDDR6X, enabling larger batches
# The 96GB VRAM allows models that previously needed 2x GPUs on 48GB cards
# Example: Llama 3.1 70B on 1x RTX 6000 Pro (96GB)
#   Model weights (BF16): ~140GB -- DOES NOT FIT on 1 GPU
#   Model weights (FP8): ~70GB -- fits with room for a ~26GB KV cache
#   KV cache at 32K ctx: ~10GB
#   Overhead: ~5GB
#   Total: ~85GB of 96GB = 88% utilization
#
# Example: Llama 3.1 70B on 2x RTX 6000 Pro (192GB total)
#   Model weights (BF16): ~140GB (70GB per GPU with PP=2)
#   KV cache at 65K ctx: ~20GB per GPU
#   Overhead: ~5GB per GPU
#   Total per GPU: ~95GB of 96GB -- exceeds the 0.92 gpu_memory_utilization
#   budget (~88GB usable), so the KV cache is sized down to fit; reduce
#   max_model_len if you hit KV cache capacity errors
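
# Single-GPU FP8 variant matching the first worked example above (illustrative
# overrides only -- treat the exact values as assumptions to verify locally):
# quantization: "fp8"
# pipeline_parallel_size: 1
# max_model_len: 32768
# gpu_memory_utilization: 0.90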