Skip to content

Commit 032b149

Browse files
author
yli1 user
committed
enable lora modules
1 parent 680e71f commit 032b149

File tree

2 files changed

+41
-8
lines changed

2 files changed

+41
-8
lines changed

matrix/app_server/deploy_utils.py

Lines changed: 27 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
from jinja2 import Template
2020
from ray import serve
2121

22+
from matrix.app_server.llm.ray_serve_vllm import BaseDeployment
2223
from matrix.common.cluster_info import ClusterInfo
2324
from matrix.utils.ray import Action, get_ray_address, kill_matrix_actors
2425

@@ -132,6 +133,19 @@
132133
"quantization": "compressed-tensors",
133134
"use_v1_engine": "true",
134135
},
136+
"unsloth/mistral-7b-instruct-v0.2-bnb-4bit": {
137+
"name": "unsloth-mistral-7B",
138+
"tensor-parallel-size": 1,
139+
"pipeline-parallel-size": 1,
140+
"enable-prefix-caching": True,
141+
"max_ongoing_requests": 256,
142+
"max-model-len": 32768,
143+
"gpu-memory-utilization": 0.4,
144+
"enable-lora": True,
145+
"quantization": "bitsandbytes",
146+
"load-format": "bitsandbytes",
147+
"max_lora_rank": 32,
148+
},
135149
}
136150

137151
non_model_params = [
@@ -372,14 +386,6 @@ def get_yaml_for_deployment(
372386
yaml_str += "\n" + yaml.dump([found_app[0]], indent=2, sort_keys=False)
373387
continue
374388

375-
unknown = {
376-
k: v
377-
for k, v in app.items()
378-
if k not in non_model_params
379-
and not hasattr(AsyncEngineArgs, k.replace("-", "_"))
380-
}
381-
assert not unknown, f"unknown vllm model args {unknown}"
382-
383389
app_type = app.get("app_type", "llm")
384390
assert app_type in [
385391
"llm",
@@ -397,6 +403,19 @@ def get_yaml_for_deployment(
397403
if "max_replica" not in app:
398404
app["max_replica"] = app["min_replica"]
399405

406+
if app_type in ["llm", "sglang_llm"]:
407+
unknown = {
408+
k: v
409+
for k, v in app.items()
410+
if k not in non_model_params
411+
and not hasattr(AsyncEngineArgs, k.replace("-", "_"))
412+
and not hasattr(BaseDeployment, k.replace("-", "_"))
413+
}
414+
assert not unknown, f"unknown vllm model args {unknown}"
415+
else:
416+
unknown = {k: v for k, v in app.items() if k not in non_model_params}
417+
assert not unknown, f"unknown {app_type} model args {unknown}"
418+
400419
if app_type in ["llm", "sglang_llm"]:
401420
update_vllm_app_params(app)
402421
yaml_str += Template(vllm_app_template).render(

matrix/app_server/llm/ray_serve_vllm.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -380,6 +380,13 @@ async def CreateChatCompletion(self, request):
380380
)
381381
logger.debug(f"Request: {chat}")
382382
try:
383+
if (
384+
self.openai_serving_chat.models.static_lora_modules
385+
and len(self.openai_serving_chat.models.lora_requests) == 0
386+
):
387+
# only needed for LoRA modules, on vllm >= v0.7.0
388+
# due to https://github.com/vllm-project/vllm/commit/ac2f3f7fee93cf9cd97c0078e362feab7b6c8299
389+
await self.openai_serving_chat.models.init_static_loras()
383390
generator = await self.openai_serving_chat.create_chat_completion(chat)
384391
if isinstance(generator, ErrorResponse):
385392
status_code = self.http_to_grpc_status(generator.code)
@@ -417,6 +424,13 @@ async def CreateCompletion(self, request):
417424
)
418425
logger.debug(f"Request: {completion_request}")
419426
try:
427+
if (
428+
self.openai_serving_chat.models.static_lora_modules
429+
and len(self.openai_serving_chat.models.lora_requests) == 0
430+
):
431+
# only needed for LoRA modules, on vllm >= v0.7.0
432+
# due to https://github.com/vllm-project/vllm/commit/ac2f3f7fee93cf9cd97c0078e362feab7b6c8299
433+
await self.openai_serving_chat.models.init_static_loras()
420434
generator = await self.openai_serving_completion.create_completion(
421435
completion_request,
422436
Request( # this Request is purely a dummy; it was made optional in vllm's recent pull https://github.com/vllm-project/vllm/pull/12503

0 commit comments

Comments
 (0)