@@ -517,6 +517,7 @@ def load_model(
     stream_interval: int = 1,
     max_tokens: int = 32768,
     force_mllm: bool = False,
+    gpu_memory_utilization: float = 0.90,
 ):
     """
     Load a model (auto-detects MLLM vs LLM).
@@ -546,6 +547,7 @@ def load_model(
         scheduler_config=scheduler_config,
         stream_interval=stream_interval,
         force_mllm=force_mllm,
+        gpu_memory_utilization=gpu_memory_utilization,
     )
     # BatchedEngine will be started in lifespan (uvicorn's event loop)
     # Just log for now
@@ -1231,10 +1233,22 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
         f"prompt_chars={prompt_len} prompt_preview={prompt_preview!r}"
     )

+    # Resolve repetition penalty for completions
+    comp_rep_penalty = request.repetition_penalty
+    if comp_rep_penalty is None and request.frequency_penalty:
+        comp_rep_penalty = 1.0 + request.frequency_penalty
+    if comp_rep_penalty is None and request.presence_penalty:
+        comp_rep_penalty = 1.0 + request.presence_penalty
+
     if request.stream:
         return StreamingResponse(
             _disconnect_guard(
-                stream_completion(engine, prompts[0], request),
+                stream_completion(
+                    engine,
+                    prompts[0],
+                    request,
+                    repetition_penalty=comp_rep_penalty,
+                ),
                 raw_request,
             ),
             media_type="text/event-stream",
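The precedence is: an explicit `repetition_penalty` wins, then `frequency_penalty`, then `presence_penalty`, with the OpenAI-style additive penalties mapped onto a multiplicative penalty (1.0 meaning no penalty) via `1.0 + penalty`. A minimal standalone sketch of the same resolution logic; the helper name is invented, since the handlers above inline it:

```python
def resolve_repetition_penalty(
    repetition_penalty: float | None,
    frequency_penalty: float | None,
    presence_penalty: float | None,
) -> float | None:
    """Mirror of the inline precedence: explicit > frequency > presence."""
    if repetition_penalty is not None:
        return repetition_penalty
    if frequency_penalty:  # truthiness check, so 0.0 falls through
        return 1.0 + frequency_penalty
    if presence_penalty:
        return 1.0 + presence_penalty
    return None

assert resolve_repetition_penalty(None, 0.5, None) == 1.5
assert resolve_repetition_penalty(1.2, 0.5, 0.3) == 1.2   # explicit value wins
assert resolve_repetition_penalty(None, None, None) is None
```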
@@ -1248,14 +1262,16 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
     total_prompt_tokens = 0

     for i, prompt in enumerate(prompts):
+        gen_kwargs = {
+            "max_tokens": request.max_tokens or _default_max_tokens,
+            "temperature": _resolve_temperature(request.temperature),
+            "top_p": _resolve_top_p(request.top_p),
+            "stop": request.stop,
+        }
+        if comp_rep_penalty is not None:
+            gen_kwargs["repetition_penalty"] = comp_rep_penalty
         output = await _wait_with_disconnect(
-            engine.generate(
-                prompt=prompt,
-                max_tokens=request.max_tokens or _default_max_tokens,
-                temperature=_resolve_temperature(request.temperature),
-                top_p=_resolve_top_p(request.top_p),
-                stop=request.stop,
-            ),
+            engine.generate(prompt=prompt, **gen_kwargs),
             raw_request,
             timeout=timeout,
         )
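Building `gen_kwargs` once and adding the penalty key only when resolved keeps the call site stable: if no penalty was requested, the key is simply absent and the engine's own default applies, rather than forwarding `None` to an engine that may not accept it. An illustration with a stand-in `generate` whose signature is invented for the example:

```python
def generate(prompt, max_tokens, temperature=1.0, top_p=1.0,
             stop=None, repetition_penalty=1.0):
    # Stand-in for engine.generate; 1.0 conventionally means "no penalty".
    return repetition_penalty

gen_kwargs = {"max_tokens": 64, "temperature": 0.7, "top_p": 0.9, "stop": None}
rep = None
if rep is not None:
    gen_kwargs["repetition_penalty"] = rep

# Key omitted -> the engine default is used, instead of forwarding None.
assert generate("hi", **gen_kwargs) == 1.0
```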
@@ -1387,12 +1403,21 @@ async def create_chat_completion(request: ChatCompletionRequest, raw_request: Request):
     # Inject JSON instruction into messages
     messages = _inject_json_instruction(messages, json_instruction)

+    # Resolve repetition penalty: explicit > frequency_penalty > presence_penalty
+    rep_penalty = request.repetition_penalty
+    if rep_penalty is None and request.frequency_penalty:
+        rep_penalty = 1.0 + request.frequency_penalty
+    if rep_penalty is None and request.presence_penalty:
+        rep_penalty = 1.0 + request.presence_penalty
+
     # Prepare kwargs
     chat_kwargs = {
         "max_tokens": request.max_tokens or _default_max_tokens,
         "temperature": _resolve_temperature(request.temperature),
         "top_p": _resolve_top_p(request.top_p),
     }
+    if rep_penalty is not None:
+        chat_kwargs["repetition_penalty"] = rep_penalty

     # Add multimodal content
     if has_media:
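With this in place, an OpenAI-style chat request that only sets `frequency_penalty` would now reach the engine as `repetition_penalty=1.3`. A client-side sketch, assuming the usual OpenAI-compatible route and response shape; host, port, and model name are placeholders:

```python
import requests

resp = requests.post(
    "http://localhost:8000/v1/chat/completions",  # assumed host/port/route
    json={
        "model": "local-model",  # placeholder
        "messages": [{"role": "user", "content": "Write a haiku about caching."}],
        "frequency_penalty": 0.3,  # resolved server-side to repetition_penalty=1.3
    },
)
print(resp.json()["choices"][0]["message"]["content"])
```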
@@ -1862,15 +1887,18 @@ async def stream_completion(
     engine: BaseEngine,
     prompt: str,
     request: CompletionRequest,
+    repetition_penalty: float | None = None,
 ) -> AsyncIterator[str]:
     """Stream completion response."""
-    async for output in engine.stream_generate(
-        prompt=prompt,
-        max_tokens=request.max_tokens or _default_max_tokens,
-        temperature=_resolve_temperature(request.temperature),
-        top_p=_resolve_top_p(request.top_p),
-        stop=request.stop,
-    ):
+    gen_kwargs = {
+        "max_tokens": request.max_tokens or _default_max_tokens,
+        "temperature": _resolve_temperature(request.temperature),
+        "top_p": _resolve_top_p(request.top_p),
+        "stop": request.stop,
+    }
+    if repetition_penalty is not None:
+        gen_kwargs["repetition_penalty"] = repetition_penalty
+    async for output in engine.stream_generate(prompt=prompt, **gen_kwargs):
         data = {
             "id": f"cmpl-{uuid.uuid4().hex[:8]}",
             "object": "text_completion",