11# adapted from https://github.com/TIGER-AI-Lab/MMLU-Pro/blob/main/evaluate_from_api.py
2-
3- import os
42import re
53import random
6- from tqdm import tqdm
7- from datasets import load_dataset
84import argparse
9- from benchmarks .backend_request_func import async_request_openai_chat_completions , RequestFuncInput
105import asyncio
116
7+ from gllm import LLM
128
13- API_KEY = "EMPTY"
14- random . seed ( 12345 )
9+ from tqdm import tqdm
10+ from datasets import load_dataset
1511
12+ random .seed (12345 )
1613
1714def load_mmlu_pro ():
1815 dataset = load_dataset ("TIGER-Lab/MMLU-Pro" )
@@ -83,7 +80,7 @@ def extract_final(text):
8380 return None
8481
8582
86- def single_request (api_url , single_question , cot_examples_dict , pbar ):
83+ def single_request (single_question , cot_examples_dict ):
8784 category = single_question ["category" ]
8885 cot_examples = cot_examples_dict [category ]
8986 question = single_question ["question" ]
@@ -95,43 +92,44 @@ def single_request(api_url, single_question, cot_examples_dict, pbar):
9592 prompt += format_example (each ["question" ],
9693 each ["options" ], each ["cot_content" ])
9794 input_text = format_example (question , options )
98-
9995 prompt = prompt + input_text
100-
101- request_func_input = RequestFuncInput (prompt = prompt ,
102- api_url = api_url ,
103- prompt_len = len (prompt ),
104- output_len = args .output_len ,
105- model = args .model ,
106- )
107- return async_request_openai_chat_completions (request_func_input = request_func_input , pbar = pbar )
96+
97+ return prompt
10898
10999
110100
111101async def evaluate (subjects ):
112- api_url = f"http://{ args .host } :{ args .port } /v1/chat/completions"
113102 test_df , dev_df = load_mmlu_pro ()
114103 if not subjects :
115104 subjects = list (test_df .keys ())
116105 print ("assigned subjects" , subjects )
117106 category_record = {'total' :{'#correct' :0 ,'#wrong' :0 }}
118107
119- print (f"Sending requests ..." )
120- pbar = tqdm ()
121- tasks = []
108+ llm = LLM (model_path = args .model ,
109+ gpu_memory_util = args .gpu_memory_util ,
110+ kvthresh = args .kvthresh ,
111+ pp_size = args .pp ,
112+ tp_size = args .tp ,
113+ enable_prefix_caching = True ,
114+ use_thinking = False )
115+
116+ print (f"generating requests ..." )
117+ prompts = []
122118 test_data_total = []
123119 for subject in subjects :
124120 test_data = test_df [subject ][:args .num_per_sub ]
125121 test_data_total .extend (test_data )
126122 for each in test_data :
127- tasks .append (single_request (api_url , each , dev_df , pbar ))
128- pbar .total = len (tasks )
129- completions = await asyncio .gather (* tasks )
130- pbar .close ()
123+ prompts .append (single_request (each , dev_df ))
124+
125+ seqs = llm .generate (prompts , output_lens = [args .output_len for i in range (len (prompts ))])
126+
127+ outputs = [seq .output for seq in seqs ]
128+
131129 print (f"Processing completions ..." )
132- for idx , each in tqdm (enumerate (test_data_total ),total = len (tasks )):
130+ for idx , each in tqdm (enumerate (test_data_total ),total = len (prompts )):
133131 label = each ["answer" ]
134- response = completions [idx ]. generated_text
132+ response = outputs [idx ]
135133 response = response .replace ('**' , '' )
136134 pred = extract_answer (response )
137135 category = each ["category" ]
@@ -162,8 +160,10 @@ async def evaluate(subjects):
162160 parser .add_argument ("--assigned_subjects" , "-a" , type = str , default = "all" ,
163161 help = "business, law, psychology, biology, chemistry, history, other, health, "
164162 "economics, math, physics, computer science, philosophy, engineering" )
165- parser .add_argument ("--host" , type = str , default = '0.0.0.0' )
166- parser .add_argument ("--port" , type = int , default = 8000 )
163+ parser .add_argument ("--tp" , type = int , default = 1 )
164+ parser .add_argument ("--pp" , type = int , default = 1 )
165+ parser .add_argument ('--gpu-memory-util' , type = float , default = 0.9 )
166+ parser .add_argument ('--kvthresh' , type = float , default = 0.2 )
167167 parser .add_argument ("--output-len" , type = int , default = 1024 )
168168 parser .add_argument ("--num-per-sub" , type = int , default = 100 )
169169 assigned_subjects = []
0 commit comments