-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbatch_processing_100.py
More file actions
122 lines (116 loc) · 4.48 KB
/
batch_processing_100.py
File metadata and controls
122 lines (116 loc) · 4.48 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import asyncio
from openai import AsyncOpenAI
# Async OpenAI-compatible client pointed at a local inference server
# (presumably vLLM or similar on port 8000 — confirm). The api_key is a
# placeholder; local OpenAI-compatible servers typically don't validate it.
client = AsyncOpenAI(base_url="http://localhost:8000/v1", api_key="token")
async def send_request(prompt):
    """Send one chat-completion request for *prompt* and return the reply text.

    Uses the module-level ``client``; any API/network error propagates to the
    caller unchanged.
    """
    completion = await client.chat.completions.create(
        model="neuralmagic/Meta-Llama-3.1-405B-Instruct-FP8",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=100,
    )
    # Only the first (and, without n>1, only) choice is of interest.
    first_choice = completion.choices[0]
    return first_choice.message.content
async def run_batch():
    """Fan out every prompt to the server concurrently and print each reply.

    All requests are launched at once via ``asyncio.gather``; results come
    back in prompt order, numbered from 1.
    """
    prompts = [
        "Explain quantum entanglement in one sentence.",
        "Write a haiku about H200 GPUs.",
        "What is the square root of 144?",
        "Translate 'Artificial Intelligence' to Persian.",
        "Who won the Nobel Prize in Physics in 1921?",
        "Explain the difference between PCIe and NVLink.",
        "What is gradient descent?",
        "Write a short poem about distributed systems.",
        "What is tail latency in distributed computing?",
        "Explain what a straggler is in GPU training.",
        "What does RDMA stand for?",
        "Explain CUDA in simple terms.",
        "What is transformer attention?",
        "Summarize the theory of relativity in two sentences.",
        "What is the capital of Japan?",
        "Explain the purpose of a KV cache in LLM inference.",
        "What is memory bandwidth?",
        "Define overfitting in machine learning.",
        "What is mixed precision training?",
        "Explain SHARP in networking.",
        "What is the derivative of x^2?",
        "Explain Lustre striping.",
        "What is InfiniBand?",
        "Difference between Ethernet and InfiniBand?",
        "Explain model quantization.",
        "What is FP8?",
        "Explain tensor parallelism.",
        "What is pipeline parallelism?",
        "Define FSDP.",
        "What is DDP in PyTorch?",
        "Explain torch.compile.",
        "What is Triton language?",
        "Define zero-shot learning.",
        "What is a hyperparameter?",
        "Explain self-supervised learning.",
        "What is reinforcement learning?",
        "Define PPO algorithm briefly.",
        "Explain gradient clipping.",
        "What is a CUDA kernel?",
        "Explain GPU memory hierarchy.",
        "What is VRAM?",
        "Define NUMA.",
        "Explain latency vs throughput.",
        "What is load balancing?",
        "Explain admission control.",
        "What is a bottleneck in systems?",
        "Explain scaling laws in LLMs.",
        "What is the difference between RAM and cache?",
        "What is vector embedding?",
        "Explain cosine similarity.",
        "What is backpropagation?",
        "What is stochastic gradient descent?",
        "Define batch normalization.",
        "Explain dropout.",
        "What is model distillation?",
        "Explain pruning in neural networks.",
        "What is sparse modeling?",
        "Define FLOPS.",
        "Explain energy efficiency in AI.",
        "What is dynamic voltage scaling?",
        "What is Green AI?",
        "Explain checkpointing in training.",
        "What is fault tolerance?",
        "Define SLURM.",
        "What is job scheduling?",
        "Explain object storage.",
        "Difference between NFS and object storage?",
        "What is WebDataset?",
        "Explain streaming datasets.",
        "What is a dataloader?",
        "Define epoch in ML training.",
        "What is a seed in experiments?",
        "Explain reproducibility in ML.",
        "What is profiling?",
        "Define GPU utilization.",
        "Explain kernel fusion.",
        "What is JIT compilation?",
        "Explain graph compilation.",
        "What is ONNX?",
        "Define model serving.",
        "Explain inference batching.",
        "What is microbatching?",
        "Explain autoregressive decoding.",
        "What is beam search?",
        "Explain temperature in sampling.",
        "What is top-p sampling?",
        "Define hallucination in LLMs.",
        "Explain fine-tuning.",
        "What is LoRA?",
        "Define transfer learning.",
        "Explain domain adaptation.",
        "What is active learning?",
        "Define federated learning.",
        "Explain multi-agent systems.",
        "What is neural architecture search?",
        "Explain HPC.",
        "What is exascale computing?",
        "Explain cloud vs on-prem AI infrastructure.",
        "Write one sentence about the future of AI.",
    ]
    # Launch every request up front; gather preserves prompt order.
    tasks = [send_request(p) for p in prompts]
    replies = await asyncio.gather(*tasks)
    for idx, reply in enumerate(replies, start=1):
        print(f"Response {idx}: {reply}\n")
# Entry-point guard: run the batch only when executed as a script, so the
# module can be imported (e.g. for testing) without firing 95 requests.
if __name__ == "__main__":
    asyncio.run(run_batch())