# =============================================================================
# Petronella Technology Group - vLLM Configuration for RTX 6000 Pro Blackwell
# =============================================================================
#
# NVIDIA RTX 6000 Pro (Blackwell) Specifications:
# - 96 GB GDDR7 VRAM per GPU
# - 2,016 GB/s memory bandwidth
# - PCIe 5.0 x16 (~64 GB/s per direction)
# - No NVLink (PCIe only) -- use Pipeline Parallelism
# - FP8 Tensor Core support (5th-gen Tensor Cores)
# - 24,064 CUDA cores, 188 RT cores (4th gen), 752 Tensor cores (5th gen)
#
# This config is optimized for the RTX 6000 Pro's strengths:
# - High VRAM per GPU means fewer GPUs needed
# - GDDR7 bandwidth supports large KV caches
# - PCIe interconnect means PP is mandatory for multi-GPU
# - FP8 support enables quantized large models
#
# GPU Count Recommendations (see the commented launch example after this header):
# 1x RTX 6000 Pro (96GB): Llama 3.1 70B (FP8), Qwen2.5 72B (FP8), Mixtral 8x7B
# 2x RTX 6000 Pro (192GB): Llama 3.1 70B (BF16, 128K ctx), Mixtral 8x22B (FP8)
# 4x RTX 6000 Pro (384GB): Llama 3.1 405B (INT4/AWQ), Mixtral 8x22B (BF16)
# 8x RTX 6000 Pro (768GB): Llama 3.1 405B (FP8), DeepSeek-V3 (FP8)
#
# Docs: https://petronellatech.com/hardware/rtx-6000-pro-blackwell-multi-gpu-vllm/
# =============================================================================
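
# Example launch (illustrative sketch -- assumes a vLLM build whose
# `vllm serve` accepts a YAML config file via --config; otherwise pass the
# same keys as individual CLI flags):
#
#   VLLM_API_KEY=<your-key> vllm serve meta-llama/Llama-3.1-70B-Instruct \
#       --config vllm-rtx6000-pro.yaml
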
# --- Model Configuration ---
model: meta-llama/Llama-3.1-70B-Instruct
dtype: bfloat16
quantization: null # Set to "fp8" for 405B models
trust_remote_code: true
# --- Parallelism ---
# RTX 6000 Pro uses PCIe -- always use pipeline parallelism
tensor_parallel_size: 1
pipeline_parallel_size: 2 # Adjust: 1, 2, 4, or 8
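
# Illustrative 4-GPU scale-out under the same PCIe-only constraint (commented
# out; the values are assumptions for an FP8 405B-class model, not tested
# settings):
# tensor_parallel_size: 1
# pipeline_parallel_size: 4
# quantization: "fp8"
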
# --- Memory Configuration ---
# 96GB GDDR7 allows aggressive memory utilization
gpu_memory_utilization: 0.92
max_model_len: 65536 # 64K tokens with 2x GPUs
swap_space: 8 # GiB of CPU memory for swapping
kv_cache_dtype: auto # "auto" follows the model dtype; set "fp8" to roughly halve KV cache size
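
# Rough KV cache sizing behind the worked examples at the bottom of this file
# (assumes Llama 3.1 70B: 80 layers, 8 KV heads, head_dim 128, BF16 = 2 bytes):
#   bytes per token = 2 (K+V) * 80 * 8 * 128 * 2 ≈ 0.33 MB
#   32K tokens ≈ 10.5 GB, 64K tokens ≈ 21 GB (roughly halved with kv_cache_dtype "fp8")
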
# --- Scheduling (PP-optimized) ---
# Higher max_num_seqs fills pipeline bubbles more effectively
max_num_seqs: 512
max_num_batched_tokens: 65536
enable_chunked_prefill: true
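
# Pipeline-bubble rule of thumb (GPipe-style estimate, not a vLLM-specific figure):
#   idle fraction ≈ (p - 1) / (m + p - 1), with p pipeline stages and m micro-batches
#   PP=2 with 1 micro-batch in flight -> ~50% idle; with 8 in flight -> ~11% idle,
#   which is why a generous max_num_seqs matters on this topology.
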
# --- Performance ---
# Eager mode can help on PCIe systems (skips CUDA graph capture overhead)
enforce_eager: false # Set true if you see CUDA graph compilation issues
disable_log_requests: true
disable_log_stats: false # Keep stats for monitoring
# --- Serving ---
host: 0.0.0.0
port: 8000
api_key: null # Set via VLLM_API_KEY env var
# --- RTX 6000 Pro Specific Optimizations ---
# GDDR7 has higher bandwidth than GDDR6X, enabling larger batches
# The 96GB VRAM allows models that previously needed 2x GPUs on 48GB cards
# Example: Llama 3.1 70B on 1x RTX 6000 Pro (96GB)
#   Model weights (BF16): ~140GB -- DOES NOT FIT on 1 GPU
#   Model weights (FP8): ~70GB -- fits with room for a ~26GB KV cache
#   KV cache at 32K ctx: ~10GB
#   Overhead: ~5GB
#   Total: ~85GB of 96GB = 88% utilization
#
# Example: Llama 3.1 70B on 2x RTX 6000 Pro (192GB total)
#   Model weights (BF16): ~140GB (70GB per GPU with PP=2)
#   KV cache at 65K ctx: ~20GB per GPU
#   Overhead: ~5GB per GPU
#   Total per GPU: ~95GB of 96GB -- exceeds the 0.92 gpu_memory_utilization
#   budget (~88GB usable), so the KV cache is sized down to fit; reduce
#   max_model_len if you hit KV cache capacity errors
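
# Single-GPU FP8 variant matching the first worked example above (illustrative
# overrides only -- treat the exact values as assumptions to verify locally):
# quantization: "fp8"
# pipeline_parallel_size: 1
# max_model_len: 32768
# gpu_memory_utilization: 0.90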