-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcheck_nccl_status.py
More file actions
158 lines (129 loc) · 5.17 KB
/
check_nccl_status.py
File metadata and controls
158 lines (129 loc) · 5.17 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
#!/usr/bin/env python3
"""
Script to check NCCL availability and show multi-GPU configuration.
This script demonstrates what would happen with multiple GPUs.
"""
import sys
def check_nccl_status():
    """Check NCCL installation and GPU setup.

    Prints a diagnostic report to stdout: PyTorch/CUDA versions, the GPUs
    visible to torch, whether the NCCL backend is available, and multi-GPU
    (tensor parallelism) configuration advice.

    Returns:
        bool: True when the check completed, False when an unexpected error
        occurred (its traceback is printed).
    """
    print("=" * 60)
    print("NCCL and Multi-GPU Status Check")
    print("=" * 60)
    try:
        import torch

        # GPU count
        gpu_count = torch.cuda.device_count()
        print(f"\nPyTorch version: {torch.__version__}")
        print(f"CUDA available: {torch.cuda.is_available()}")
        print(f"GPU count: {gpu_count}")

        # List all GPUs
        if gpu_count > 0:
            for i in range(gpu_count):
                name = torch.cuda.get_device_name(i)
                mem_gb = torch.cuda.get_device_properties(i).total_memory / (1024**3)
                print(f" GPU {i}: {name} ({mem_gb:.1f} GB)")

        # NCCL availability - try different approaches
        print()
        nccl_available = False
        try:
            # Preferred: ask the distributed package directly.
            # Narrow exceptions (was a bare `except:`, which would also
            # swallow KeyboardInterrupt/SystemExit).
            nccl_available = torch.distributed.is_nccl_available()
            print(f"NCCL backend available: {nccl_available}")
        except (AttributeError, RuntimeError):
            # Fallback: check whether the NCCL module can be imported at all.
            try:
                import torch.cuda.nccl  # noqa: F401 -- presence check only
                nccl_available = True
                print("NCCL module available: True")
            except ImportError:
                print("NCCL not available")

        # NCCL version (best effort; not every torch build exposes it)
        if nccl_available:
            try:
                version = torch.cuda.nccl.version()
                print(f"NCCL version: {version}")
            except (AttributeError, RuntimeError):
                pass

        # Check for an installed NCCL distribution (e.g. nvidia-nccl-cu12).
        # importlib.metadata is the stdlib replacement for the deprecated
        # pkg_resources API used previously.
        try:
            from importlib.metadata import distributions

            nccl_pkgs = sorted(
                {dist.metadata["Name"] for dist in distributions()
                 if dist.metadata["Name"] and "nccl" in dist.metadata["Name"].lower()}
            )
            if nccl_pkgs:
                print(f"NCCL package: {nccl_pkgs[0]}")
        except Exception:
            # Purely informational; ignore any metadata lookup failure.
            pass

        # Multi-GPU recommendations
        print("\n" + "=" * 60)
        print("Multi-GPU Configuration")
        print("=" * 60)
        if gpu_count == 1:
            print("\nSingle GPU detected (NCCL not needed)")
            print("\nCurrent mode: Single-GPU inference")
            print(" - No inter-GPU communication")
            print(" - NCCL installed but idle")
            print(" - Maximum model size limited to GPU memory")
            print("\nTo use NCCL with tensor parallelism:")
            print(" 1. Request multiple GPUs from your cluster")
            print(" Example (SLURM): salloc --gres=gpu:4")
            print(" 2. Use vLLM with tensor_parallel_size:")
            print(" llm = LLM(model='...', tensor_parallel_size=4)")
        elif gpu_count >= 2:
            print(f"\n{gpu_count} GPUs detected - NCCL will be used automatically!")
            print("\nExample tensor parallelism configurations:")
            for tp_size in (2, 4, 8):
                if tp_size <= gpu_count:
                    print(f"\n {tp_size} GPUs:")
                    print(" llm = LLM(")
                    print(" model='meta-llama/Llama-2-70b-hf',")
                    print(f" tensor_parallel_size={tp_size}")
                    print(" )")
            print("\nNCCL will handle all inter-GPU communication")
            print("Can run much larger models than single GPU")
        print("\n" + "=" * 60)
        return True
    except Exception as e:
        print(f"\nError checking NCCL status: {e}")
        import traceback
        traceback.print_exc()
        return False
def show_example_configs():
    """Print example vLLM configurations matching the available GPU count."""
    import torch

    gpu_count = torch.cuda.device_count()
    print("\nExample vLLM Configurations")
    print("=" * 60)
    # Example setups keyed by the minimum number of GPUs each one requires.
    examples = {
        1: [
            ("Small model", "facebook/opt-125m", 1),
            ("Medium model", "facebook/opt-13b", 1),
            ("Large model (quantized)", "meta-llama/Llama-2-70b-hf (8-bit)", 1),
        ],
        4: [
            ("Large model", "meta-llama/Llama-2-70b-hf", 4),
            ("Very large model", "meta-llama/Llama-2-405b-hf", 4),
        ],
        8: [
            ("Massive model", "meta-llama/Llama-2-405b-hf", 8),
        ],
    }
    for required_gpus in sorted(examples):
        # Skip tiers that need more GPUs than this machine has.
        if required_gpus > gpu_count:
            continue
        print(f"\nWith {required_gpus} GPU(s) available:")
        for desc, model, tp_size in examples[required_gpus]:
            print(f" • {desc}: {model}")
            if tp_size > 1:
                print(f" → Use tensor_parallel_size={tp_size}")
                print(" → NCCL handles GPU communication")
def main():
    """Run the NCCL status check and return a process exit code (0/1)."""
    print(f"\nPython executable: {sys.executable}\n")
    ok = check_nccl_status()
    # Bail out early on failure; the error has already been printed.
    if not ok:
        return 1
    show_example_configs()
    print("\n" + "=" * 60)
    print("For more details, see: nccl_and_multi_gpu.md")
    print("=" * 60 + "\n")
    return 0
if __name__ == "__main__":
    # SystemExit(main()) is equivalent to sys.exit(main()).
    raise SystemExit(main())