 PORT = int(os.environ.get("VLLM_PORT", "8000"))
 BASE_URL = f"http://{HOST}:{PORT}/v1"
 
+
 class SUT:
     def __init__(
         self,
@@ -64,7 +65,7 @@ def __init__(
6465 "temperature" : 0.0 ,
6566 "max_tokens" : 1024 ,
6667 }
67-
68+
6869 if scenario == "offline" :
6970 from vllm import SamplingParams
7071 from transformers import AutoProcessor
@@ -109,23 +110,23 @@ def process_queries(self):
         prompts = []
         for item in qitems:
             question = self.data_object.prompts[item.index]
-
-            placeholders = [{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{b64img}"}} for b64img in self.data_object.images[item.index]]
+
+            placeholders = [{"type": "image_url", "image_url": {
+                "url": f"data:image/png;base64,{b64img}"}} for b64img in self.data_object.images[item.index]]
             messages = [
                 {"role": "system", "content": "You are a helpful assistant."},
-                {"role": "user", "content": [*placeholders, {"type": "text", "text": question}]},
+                {"role": "user", "content": [
+                    *placeholders, {"type": "text", "text": question}]},
             ]
-
+
             prompt = self.processor.apply_chat_template(
                 messages, tokenize=False, add_generation_prompt=True
             )
             prompts.append({
                 "prompt": prompt,
                 "multi_modal_data": {"image": self.data_object.images[item.index]}
             })
-
-
-
+
         tik2 = time.time()
         outputs = self.model.generate(
             prompts=prompts, sampling_params=self.sampling_params
@@ -168,10 +169,10 @@ def load_model(self):
         from vllm import LLM
         log.info("Loading model...")
         self.model = LLM(
-                self.model_path,
-                dtype=self.dtype,
-                tensor_parallel_size=self.tensor_parallel_size,
-                )
+            self.model_path,
+            dtype=self.dtype,
+            tensor_parallel_size=self.tensor_parallel_size,
+        )
         log.info("Loaded model")
 
     def get_sut(self):
@@ -199,7 +200,6 @@ def issue_queries(self, query_samples):
     def flush_queries(self):
         pass
 
-
     def __del__(self):
         pass
 
@@ -231,11 +231,9 @@ def __init__(
             api_key="EMPTY"
         )
 
-
     def start(self):
         pass
 
-
     async def _issue_one(
         self,
         sample: Dict[str, Any],
@@ -244,7 +242,8 @@ async def _issue_one(
         log.info("CALLED _issue_one")
         """Send one streaming chat.completion request and record timings."""
 
-        contents = [{"type": "text", "text": self.data_object.prompts[sample.index]}]
+        contents = [
+            {"type": "text", "text": self.data_object.prompts[sample.index]}]
         for img_b64 in self.data_object.images[sample.index]:
             contents.append({
                 "type": "image_url",
@@ -274,32 +273,40 @@ async def _issue_one(
             text = getattr(delta, "content", None)
             if text:
                 if ttft_set is False:
-                    text_int32 = np.array([ord(c) for c in text], dtype=np.int32)
+                    text_int32 = np.array([ord(c)
+                                           for c in text], dtype=np.int32)
                     response_data = array.array("B", text_int32.tobytes())
                     bi = response_data.buffer_info()
-                    response = [lg.QuerySampleResponse(sample.id, bi[0], bi[1])]
+                    response = [
+                        lg.QuerySampleResponse(
+                            sample.id, bi[0], bi[1])]
                     lg.FirstTokenComplete(response)
                     ttft_set = True
                 out.append(text)
 
-        # when the stream ends, total latency
+        # when the stream ends, total latency
         final_tokens = "".join(out)
-        final_tokens_int32 = np.array([ord(c) for c in final_tokens], dtype=np.int32)
+        final_tokens_int32 = np.array(
+            [ord(c) for c in final_tokens], dtype=np.int32)
         n_tokens = len(final_tokens_int32)
         response_array = array.array("B", final_tokens_int32.tobytes())
         bi = response_array.buffer_info()
-        response = [lg.QuerySampleResponse(sample.id, bi[0], bi[1], n_tokens)]
+        response = [
+            lg.QuerySampleResponse(
+                sample.id,
+                bi[0],
+                bi[1],
+                n_tokens)]
         lg.QuerySamplesComplete(response)
 
-
     async def _issue_queries_async(self, query_samples):
         """Async internal version used by the sync wrapper."""
-        log.info(f"CALLED _issue_queries_async, num workers: {self.num_workers}")
+        log.info(
+            f"CALLED _issue_queries_async, num workers: {self.num_workers}")
         semaphore = asyncio.Semaphore(self.num_workers)
         tasks = [self._issue_one(s, semaphore) for s in query_samples]
         return await asyncio.gather(*tasks)
 
-
     def issue_queries(self, query_samples):
         try:
             loop = asyncio.get_running_loop()
@@ -314,4 +321,4 @@ def issue_queries(self, query_samples):
             asyncio.run(self._issue_queries_async(query_samples))
 
     def stop(self):
-        pass
+        pass