#!/usr/bin/env -S python -u

import argparse

import torch
import vllm

import fmwork

# Plain namespaces: `par` receives the parsed CLI arguments and
# `var` holds derived state (size lists and the vLLM engine).
class var: pass
class par: pass
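
# Example invocation (the model path is illustrative; substitute your own):
#
#   ./driver -m /models/my-model -i 1024 -o 128 -b 1,2,4 -t 1
#
# -i, -o, and -b each accept a comma-separated list of integers; the
# driver benchmarks every combination (see runs() below).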

def main():
    params()  # parse and echo CLI parameters
    llm()     # build the vLLM engine
    runs()    # sweep all input/output/batch size combinations
    done()

def params():
    fmwork.banner('PARAMS')

    parser = argparse.ArgumentParser()
    parser.add_argument('-m', '--model_path', type=str, required=True)
    parser.add_argument('-i', '--input_size', type=str, required=True)
    parser.add_argument('-o', '--output_size', type=str, required=True)
    parser.add_argument('-b', '--batch_size', type=str, required=True)
    parser.add_argument('-t', '--tensor_parallel', type=int, required=True)
    parser.add_argument('-r', '--reps', type=int, default=3)
    parser.add_argument('-d', '--dtype', type=str, default='auto')
    parser.add_argument('-q', '--quantization', type=str, default=None)
    parser.add_argument('-k', '--kv_cache_dtype', type=str, default='auto')
    parser.add_argument('-s', '--num_scheduler_steps', type=int, default=1)
    parser.add_argument('-M', '--max_num_seqs', type=int, default=1024)
    parser.add_argument('-u', '--gpu_memory_utilization', type=float, default=0.95)
    parser.add_argument('-e', '--enforce_eager', action='store_true')
    parser.parse_args(namespace=par)

    # Echo every parameter, name-padded for aligned output.
    attrs = [attr for attr in dir(par)
             if not attr.startswith('__') and not attr.endswith('__')]
    pad = max(len(attr) for attr in attrs)
    for attr in sorted(attrs):
        print('%-*s = %s' % (pad, attr, getattr(par, attr)))

    # Expand the comma-separated size lists into lists of ints.
    var.input_sizes  = list(map(int, par.input_size.split(',')))
    var.output_sizes = list(map(int, par.output_size.split(',')))
    var.batch_sizes  = list(map(int, par.batch_size.split(',')))

def llm():
    fmwork.banner('LLM')
    var.llm = vllm.LLM(
        model                  = par.model_path,
        tensor_parallel_size   = par.tensor_parallel,
        dtype                  = par.dtype,
        quantization           = par.quantization,
        kv_cache_dtype         = par.kv_cache_dtype,
        enforce_eager          = par.enforce_eager,
        num_scheduler_steps    = par.num_scheduler_steps,
        max_num_seqs           = par.max_num_seqs,  # was parsed but never passed through
        gpu_memory_utilization = par.gpu_memory_utilization,
        max_model_len          = max(var.input_sizes) + max(var.output_sizes),
        trust_remote_code      = True,
    )
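
# max_model_len above is sized to the largest input plus the largest output
# in the sweep, so a single engine instance can serve every combination
# without being rebuilt between runs.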

def runs():
    # Full cross product of the requested batch, input, and output sizes.
    for batch_size in var.batch_sizes:
        for input_size in var.input_sizes:
            for output_size in var.output_sizes:
                run(input_size, output_size, batch_size)
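
# For example, -b 1,4 with -i 1024 and -o 128 produces two runs:
# (input 1024, output 128, batch 1) and (input 1024, output 128, batch 4).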

def run(input_size, output_size, batch_size):
    fmwork.banner(
        'RUN',
        input_size, '/',
        output_size, '/',
        batch_size, '/',
        par.tensor_parallel,
    )

    # Synthetic prompts: batch_size sequences of input_size token ids each.
    input_batch = fmwork.input_generator(
        par.model_path,
        input_size, batch_size,
        return_tensors='np',
    )

    # Force exactly output_size generated tokens per sequence.
    sampling_params = vllm.SamplingParams(
        max_tokens = output_size,
        ignore_eos = True,
    )

    kwargs = {
        'prompt_token_ids' : input_batch,
        'sampling_params'  : sampling_params,
        'use_tqdm'         : False,
    }

    # Time `reps` repetitions; fmwork.t0/t1 bracket each generate() call.
    fmwork.reset()
    for rep in range(par.reps):
        fmwork.t0()
        var.llm.generate(**kwargs)
        torch.cuda.synchronize()
        fmwork.t1(
            rep, par.reps,
            input_size, output_size, batch_size,
            par.tensor_parallel)

def done():
    fmwork.banner('DONE')

if __name__ == '__main__':
    main()