1919from jinja2 import Template
2020from ray import serve
2121
22+ from matrix .app_server .llm .ray_serve_vllm import BaseDeployment
2223from matrix .common .cluster_info import ClusterInfo
2324from matrix .utils .ray import Action , get_ray_address , kill_matrix_actors
2425
132133 "quantization" : "compressed-tensors" ,
133134 "use_v1_engine" : "true" ,
134135 },
136+ "unsloth/mistral-7b-instruct-v0.2-bnb-4bit" : {
137+ "name" : "unsloth-mistral-7B" ,
138+ "tensor-parallel-size" : 1 ,
139+ "pipeline-parallel-size" : 1 ,
140+ "enable-prefix-caching" : True ,
141+ "max_ongoing_requests" : 256 ,
142+ "max-model-len" : 32768 ,
143+ "gpu-memory-utilization" : 0.4 ,
144+ "enable-lora" : True ,
145+ "quantization" : "bitsandbytes" ,
146+ "load-format" : "bitsandbytes" ,
147+ "max_lora_rank" : 32 ,
148+ },
135149}
136150
137151non_model_params = [
@@ -372,14 +386,6 @@ def get_yaml_for_deployment(
372386 yaml_str += "\n " + yaml .dump ([found_app [0 ]], indent = 2 , sort_keys = False )
373387 continue
374388
375- unknown = {
376- k : v
377- for k , v in app .items ()
378- if k not in non_model_params
379- and not hasattr (AsyncEngineArgs , k .replace ("-" , "_" ))
380- }
381- assert not unknown , f"unknown vllm model args { unknown } "
382-
383389 app_type = app .get ("app_type" , "llm" )
384390 assert app_type in [
385391 "llm" ,
@@ -397,6 +403,19 @@ def get_yaml_for_deployment(
397403 if "max_replica" not in app :
398404 app ["max_replica" ] = app ["min_replica" ]
399405
406+ if app_type in ["llm" , "sglang_llm" ]:
407+ unknown = {
408+ k : v
409+ for k , v in app .items ()
410+ if k not in non_model_params
411+ and not hasattr (AsyncEngineArgs , k .replace ("-" , "_" ))
412+ and not hasattr (BaseDeployment , k .replace ("-" , "_" ))
413+ }
414+ assert not unknown , f"unknown vllm model args { unknown } "
415+ else :
416+ unknown = {k : v for k , v in app .items () if k not in non_model_params }
417+ assert not unknown , f"unknown { app_type } model args { unknown } "
418+
400419 if app_type in ["llm" , "sglang_llm" ]:
401420 update_vllm_app_params (app )
402421 yaml_str += Template (vllm_app_template ).render (
0 commit comments