-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcheck_nccl_status.py
More file actions
158 lines (129 loc) · 5.17 KB
/
check_nccl_status.py
File metadata and controls
158 lines (129 loc) · 5.17 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
#!/usr/bin/env python3
"""
Script to check NCCL availability and show multi-GPU configuration.
This script demonstrates what would happen with multiple GPUs.
"""
import sys
def check_nccl_status():
    """Check NCCL installation and GPU setup.

    Prints a diagnostic report to stdout: PyTorch/CUDA versions, the GPUs
    visible to torch, whether the NCCL backend is available, and multi-GPU
    (tensor parallelism) configuration advice.

    Returns:
        bool: True when the check completed, False when an unexpected error
        occurred (its traceback is printed).
    """
    print("=" * 60)
    print("NCCL and Multi-GPU Status Check")
    print("=" * 60)
    try:
        import torch

        # GPU count
        gpu_count = torch.cuda.device_count()
        print(f"\nPyTorch version: {torch.__version__}")
        print(f"CUDA available: {torch.cuda.is_available()}")
        print(f"GPU count: {gpu_count}")

        # List all GPUs
        if gpu_count > 0:
            for i in range(gpu_count):
                name = torch.cuda.get_device_name(i)
                mem_gb = torch.cuda.get_device_properties(i).total_memory / (1024**3)
                print(f" GPU {i}: {name} ({mem_gb:.1f} GB)")

        # NCCL availability - try different approaches
        print()
        nccl_available = False
        try:
            # Preferred: ask the distributed package directly.
            # Narrow exceptions (was a bare `except:`, which would also
            # swallow KeyboardInterrupt/SystemExit).
            nccl_available = torch.distributed.is_nccl_available()
            print(f"NCCL backend available: {nccl_available}")
        except (AttributeError, RuntimeError):
            # Fallback: check whether the NCCL module can be imported at all.
            try:
                import torch.cuda.nccl  # noqa: F401 -- presence check only
                nccl_available = True
                print("NCCL module available: True")
            except ImportError:
                print("NCCL not available")

        # NCCL version (best effort; not every torch build exposes it)
        if nccl_available:
            try:
                version = torch.cuda.nccl.version()
                print(f"NCCL version: {version}")
            except (AttributeError, RuntimeError):
                pass

        # Check for an installed NCCL distribution (e.g. nvidia-nccl-cu12).
        # importlib.metadata is the stdlib replacement for the deprecated
        # pkg_resources API used previously.
        try:
            from importlib.metadata import distributions

            nccl_pkgs = sorted(
                {dist.metadata["Name"] for dist in distributions()
                 if dist.metadata["Name"] and "nccl" in dist.metadata["Name"].lower()}
            )
            if nccl_pkgs:
                print(f"NCCL package: {nccl_pkgs[0]}")
        except Exception:
            # Purely informational; ignore any metadata lookup failure.
            pass

        # Multi-GPU recommendations
        print("\n" + "=" * 60)
        print("Multi-GPU Configuration")
        print("=" * 60)
        if gpu_count == 1:
            print("\nSingle GPU detected (NCCL not needed)")
            print("\nCurrent mode: Single-GPU inference")
            print(" - No inter-GPU communication")
            print(" - NCCL installed but idle")
            print(" - Maximum model size limited to GPU memory")
            print("\nTo use NCCL with tensor parallelism:")
            print(" 1. Request multiple GPUs from your cluster")
            print(" Example (SLURM): salloc --gres=gpu:4")
            print(" 2. Use vLLM with tensor_parallel_size:")
            print(" llm = LLM(model='...', tensor_parallel_size=4)")
        elif gpu_count >= 2:
            print(f"\n{gpu_count} GPUs detected - NCCL will be used automatically!")
            print("\nExample tensor parallelism configurations:")
            for tp_size in (2, 4, 8):
                if tp_size <= gpu_count:
                    print(f"\n {tp_size} GPUs:")
                    print(" llm = LLM(")
                    print(" model='meta-llama/Llama-2-70b-hf',")
                    print(f" tensor_parallel_size={tp_size}")
                    print(" )")
            print("\nNCCL will handle all inter-GPU communication")
            print("Can run much larger models than single GPU")
        print("\n" + "=" * 60)
        return True
    except Exception as e:
        print(f"\nError checking NCCL status: {e}")
        import traceback
        traceback.print_exc()
        return False
def show_example_configs():
    """Print example vLLM configurations matching the available GPU count."""
    import torch

    gpu_count = torch.cuda.device_count()
    print("\nExample vLLM Configurations")
    print("=" * 60)
    # Example setups keyed by the minimum number of GPUs each one requires.
    examples = {
        1: [
            ("Small model", "facebook/opt-125m", 1),
            ("Medium model", "facebook/opt-13b", 1),
            ("Large model (quantized)", "meta-llama/Llama-2-70b-hf (8-bit)", 1),
        ],
        4: [
            ("Large model", "meta-llama/Llama-2-70b-hf", 4),
            ("Very large model", "meta-llama/Llama-2-405b-hf", 4),
        ],
        8: [
            ("Massive model", "meta-llama/Llama-2-405b-hf", 8),
        ],
    }
    for required_gpus in sorted(examples):
        # Skip tiers that need more GPUs than this machine has.
        if required_gpus > gpu_count:
            continue
        print(f"\nWith {required_gpus} GPU(s) available:")
        for desc, model, tp_size in examples[required_gpus]:
            print(f" • {desc}: {model}")
            if tp_size > 1:
                print(f" → Use tensor_parallel_size={tp_size}")
                print(" → NCCL handles GPU communication")
def main():
    """Run the NCCL status check and return a process exit code (0/1)."""
    print(f"\nPython executable: {sys.executable}\n")
    ok = check_nccl_status()
    # Bail out early on failure; the error has already been printed.
    if not ok:
        return 1
    show_example_configs()
    print("\n" + "=" * 60)
    print("For more details, see: nccl_and_multi_gpu.md")
    print("=" * 60 + "\n")
    return 0
if __name__ == "__main__":
    # SystemExit(main()) is equivalent to sys.exit(main()).
    raise SystemExit(main())