[Serve.llm] Refactor LLMServer and LLMEngine to not diverge too much from vllm chat formatting logic #52597

Open · wants to merge 30 commits into base: master
Changes from 6 commits

Commits (30)
b670ce5
refactor
kouroshHakha Apr 25, 2025
28cd858
wip
kouroshHakha Apr 25, 2025
37d624d
lint
kouroshHakha Apr 25, 2025
1799bf5
wip
kouroshHakha Apr 25, 2025
6594f34
wip
kouroshHakha Apr 27, 2025
ff2bb07
wip
kouroshHakha Apr 28, 2025
be35a11
wip
kouroshHakha Apr 28, 2025
e1045f0
Merge branch 'master' into kh/fix-vlm-chat-template
kouroshHakha Apr 28, 2025
9da0c2a
fixed tests
kouroshHakha Apr 28, 2025
f5b4958
fixed release tests
kouroshHakha Apr 28, 2025
7ccd0d4
wip
kouroshHakha Apr 29, 2025
5804ba5
Update python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engin…
kouroshHakha Apr 29, 2025
3101e81
Update python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engin…
kouroshHakha Apr 29, 2025
b6eed94
removed serve context stuff
kouroshHakha Apr 29, 2025
2ed151e
Merge branch 'kh/fix-vlm-chat-template' of https://github.com/kourosh…
kouroshHakha Apr 29, 2025
d74febc
wip
kouroshHakha Apr 29, 2025
93ee153
wip
kouroshHakha Apr 29, 2025
a4b34f3
wip
kouroshHakha Apr 29, 2025
d4d2a81
wip
kouroshHakha Apr 29, 2025
be06a5c
fixed test
kouroshHakha Apr 29, 2025
db40d54
Fixed tests
kouroshHakha Apr 29, 2025
fd43abd
wip
kouroshHakha Apr 29, 2025
c1c4f2b
wip
kouroshHakha Apr 29, 2025
0afb5f6
wip
kouroshHakha Apr 29, 2025
f473e27
wip
kouroshHakha Apr 29, 2025
d1e4164
wip
kouroshHakha Apr 30, 2025
89e61c5
wip
kouroshHakha Apr 30, 2025
84f39fe
wip
kouroshHakha Apr 30, 2025
830a4b8
wip
kouroshHakha Apr 30, 2025
916acea
Merge branch 'master' into kh/fix-vlm-chat-template
kouroshHakha Apr 30, 2025
14 changes: 10 additions & 4 deletions python/ray/llm/_internal/serve/configs/prompt_formats.py
@@ -24,7 +24,7 @@


 class Text(BaseModel):
-    field: str = "text"
+    # field: str = "text"
     type: str = "text"
     text: str

@@ -35,18 +35,24 @@ class Text(BaseModel):
 # This is to support the "content" content type in the prompt format, as opposite of
 # the "text" content from the above which most other model uses.
 class Content(BaseModel):
-    field: str = "text"
+    # field: str = "text"
     type: str = "text"
     content: str


 class Image(BaseModel):
-    field: str = "image_url"
-    image_url: Dict
+    # field: str = "image_url"
+    type: str = "image_url"
+    image_url: Union[Dict, str]

     @field_validator("image_url")
     @classmethod
     def check_image_url(cls, value):
+        # image_url can be a string as well:
+        # https://platform.openai.com/docs/guides/images-vision?api-mode=responses&format=url
+        if isinstance(value, str):
+            return value
+
         if "url" not in value or not value["url"] or not isinstance(value["url"], str):
             raise ValueError(
                 # TODO(xwjiang): Link to doc.
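
For readers skimming the diff: the Image change above widens image_url from a Dict to Union[Dict, str] and short-circuits the validator for plain URL strings. Below is a minimal standalone sketch of the resulting behavior; it re-declares a look-alike pydantic model instead of importing Ray internals, so the class is illustrative only.

# Illustrative mirror of the Image model after this PR; not the Ray class itself.
from typing import Dict, Union

from pydantic import BaseModel, field_validator


class Image(BaseModel):
    type: str = "image_url"
    image_url: Union[Dict, str]

    @field_validator("image_url")
    @classmethod
    def check_image_url(cls, value):
        # Plain URL strings pass through, matching the OpenAI vision format.
        if isinstance(value, str):
            return value
        # The dict form still requires a non-empty string under "url".
        if "url" not in value or not value["url"] or not isinstance(value["url"], str):
            raise ValueError("image_url must be a URL string or a dict with a 'url' key")
        return value


# Both forms now validate:
Image(image_url="https://example.com/cat.png")
Image(image_url={"url": "https://example.com/cat.png"})
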
1 change: 1 addition & 0 deletions python/ray/llm/_internal/serve/configs/server_models.py
@@ -945,3 +945,4 @@ class GenerationRequest(BaseModelExtended):
     prompt: Union[str, List[int], List[str]]
     request_id: Union[str, List[str]]
     sampling_params: Optional[Union[SamplingParams, List[SamplingParams]]] = None
+    stream: bool = False
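
The new stream flag defaults to False, so existing GenerationRequest call sites are unaffected and streaming callers opt in explicitly. A hedged sketch of constructing one follows; the import path is taken from the diff, and it assumes Ray is installed with the llm extras and that the fields shown in this hunk are enough to construct the model.

# Sketch only: constructing a GenerationRequest with the new stream flag.
from ray.llm._internal.serve.configs.server_models import GenerationRequest

request = GenerationRequest(
    prompt="What is the capital of France?",
    request_id="req-123",
    stream=True,  # new field in this PR; defaults to False
)
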
59 changes: 59 additions & 0 deletions python/ray/llm/_internal/serve/deployments/llm/llm_engine.py
@@ -0,0 +1,59 @@
+from typing import AsyncGenerator, Optional
+
+from ray.llm._internal.serve.configs.server_models import (
+    Prompt,
+    LLMRawResponse,
+    LLMConfig,
+    GenerationRequest,
+    DiskMultiplexConfig,
+)
+
+
+import abc
+
+
+class LLMEngine(abc.ABC):
+    """Base class for all LLM engines"""
+
+    def __init__(self, llm_config: LLMConfig):
+        self._llm_config = llm_config
+
+    @abc.abstractmethod
+    async def start(self):
+        """Start the engine"""
+        pass
+
+    @abc.abstractmethod
+    async def prepare_request(
+        self,
+        request_id: str,
+        prompt: Prompt,
+        stream: bool,
+        disk_lora_model: Optional[DiskMultiplexConfig] = None,
+        **kwargs,
+    ) -> GenerationRequest:
+        """Prepare an EngineRequest for the engine"""
+        pass
+
+    @abc.abstractmethod
+    async def generate(
+        self, request: GenerationRequest
+    ) -> AsyncGenerator[LLMRawResponse, None]:
+        """Generate an LLMRawResponse stream"""
+        pass
+
+    async def check_health(self):
+        """Check the health of the engine"""
+        pass
+
+    async def sleep(self):
+        """Puts the engine to sleep"""
+        pass
+
+    async def wakeup(self):
+        """Wakes up the engine"""
+        pass
+
+    def shutdown(self):
+        """Shuts down the engine"""
+        pass
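
To make the contract of the new base class concrete, here is a hypothetical toy engine that subclasses it. Only the imported names come from this PR; EchoEngine itself and the generated_text field used in the final yield are illustrative assumptions, and the sketch assumes the GenerationRequest fields shown earlier are sufficient to construct the model.

# Hypothetical sketch: a toy engine implementing the LLMEngine interface.
from typing import AsyncGenerator, Optional

from ray.llm._internal.serve.configs.server_models import (
    Prompt,
    LLMRawResponse,
    GenerationRequest,
    DiskMultiplexConfig,
)
from ray.llm._internal.serve.deployments.llm.llm_engine import LLMEngine


class EchoEngine(LLMEngine):
    async def start(self):
        # A real engine would start vLLM (or another backend) here.
        pass

    async def prepare_request(
        self,
        request_id: str,
        prompt: Prompt,
        stream: bool,
        disk_lora_model: Optional[DiskMultiplexConfig] = None,
        **kwargs,
    ) -> GenerationRequest:
        # A real engine would apply the chat template and resolve images here,
        # which is the point of the refactor: formatting lives in the engine.
        return GenerationRequest(
            prompt=str(prompt), request_id=request_id, stream=stream
        )

    async def generate(
        self, request: GenerationRequest
    ) -> AsyncGenerator[LLMRawResponse, None]:
        # Echo the prompt back as a single chunk; the field name is an assumption.
        yield LLMRawResponse(generated_text=str(request.prompt))

As the llm_server.py diff below shows, the request path only calls prepare_request and generate, which is what moves the chat-formatting logic out of the server and behind the engine interface.
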
68 changes: 17 additions & 51 deletions python/ray/llm/_internal/serve/deployments/llm/llm_server.py
@@ -4,7 +4,6 @@
 from typing import AsyncGenerator, Dict, Any, Optional, Type, Union

 # Third-party imports
-from pydantic import ValidationError as PydanticValidationError
 from ray import serve
 from ray._common.utils import import_attr

@@ -15,9 +14,6 @@
     ENGINE_START_TIMEOUT_S,
     RAYLLM_VLLM_ENGINE_CLS_ENV,
 )
-from ray.llm._internal.serve.configs.error_handling import (
-    ValidationErrorWithPydantic,
-)
 from ray.llm._internal.serve.configs.openai_api_models import (
     ChatCompletionLogProb,
     ChatCompletionLogProbs,
@@ -52,10 +48,6 @@
     LoraModelLoader,
 )
 from ray.llm._internal.serve.deployments.llm.vllm.vllm_engine import VLLMEngine
-from ray.llm._internal.serve.deployments.llm.vllm.vllm_models import (
-    VLLMGenerationRequest,
-    VLLMSamplingParams,
-)
 from ray.llm._internal.serve.deployments.utils.error_handling_utils import (
     StreamingErrorHandler,
 )
@@ -511,50 +503,24 @@ async def _predict(
         """

         logger.info(f"Received streaming request {request_id}")
-        try:
-            multiplexed_model_id = serve.get_multiplexed_model_id()
-
-            if multiplexed_model_id:
-                assert (
-                    self._llm_config.lora_config is not None
-                ), "Must setup lora config for multiplexed requests."
-                disk_lora_model = await self._disk_lora_model(multiplexed_model_id)
-            else:
-                disk_lora_model = None
-
-            prompt_output = self._llm_config.prompt_format.generate_prompt(prompt)
-
-            sampling_params = VLLMSamplingParams.from_prompt(prompt)
-            prompt_text = prompt_output.text
-            image_input = prompt_output.image
-            image = []
-            if not self._llm_config.supports_vision and image_input:
-                raise RuntimeError(
-                    "You provided image input while the engine is not set up to handle images. "
-                    "Did you forget to set `input_modality` to image in yaml file?"
-                )
+        multiplexed_model_id = serve.get_multiplexed_model_id()
+
+        if multiplexed_model_id:
+            assert (
+                self._llm_config.lora_config is not None
+            ), "Must setup lora config for multiplexed requests."
+            disk_lora_model = await self._disk_lora_model(multiplexed_model_id)
+        else:
+            disk_lora_model = None
+
+        llm_request = await self.engine.prepare_request(
+            request_id=request_id,
+            prompt=prompt,
+            stream=stream,
+            disk_lora_model=disk_lora_model,
+        )

-            if self._llm_config.supports_vision and image_input:
-                for _image in image_input:
-                    image_url = _image.image_url
-                    image.append(await self.image_retriever.get(image_url))
-
-            request_params = {
-                "prompt": prompt_text,
-                "request_id": request_id,
-                "sampling_params": sampling_params,
-                "disk_multiplex_config": disk_lora_model,
-                "serve_request_context": serve.context._serve_request_context.get(),
-            }
-            if image:
-                request_params["multi_modal_data"] = {"image": image}
-            vllm_request = VLLMGenerationRequest(**request_params)
-        except PydanticValidationError as e:
-            # Wrap the PydanticValidationError in a ValidationErrorWithPydantic
-            # so that it can be used in a RayActorError
-            # See https://github.com/ray-project/ray/issues/43401
-            raise ValidationErrorWithPydantic(e) from None
-        async for llm_response in self.engine.generate(vllm_request, stream):
+        async for llm_response in self.engine.generate(llm_request):
             yield llm_response

     async def chat(self, request: ChatCompletionRequest) -> LLMChatResponse:
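
Net effect of the hunk above: _predict no longer builds a VLLMGenerationRequest or fetches images itself. Roughly, the per-request flow reduces to the following paraphrased sketch (not the literal source):

# Paraphrased sketch of the new _predict flow: prompt and image handling move
# behind engine.prepare_request, so the server only resolves the LoRA adapter
# and streams whatever the engine yields.
async def _predict(self, request_id, prompt, stream):
    multiplexed_model_id = serve.get_multiplexed_model_id()
    disk_lora_model = (
        await self._disk_lora_model(multiplexed_model_id)
        if multiplexed_model_id
        else None
    )
    llm_request = await self.engine.prepare_request(
        request_id=request_id,
        prompt=prompt,
        stream=stream,
        disk_lora_model=disk_lora_model,
    )
    async for llm_response in self.engine.generate(llm_request):
        yield llm_response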