[Serve.llm] Refactor LLMServer and LLMEngine to not diverge too much from vllm chat formatting logic #52597

Open · wants to merge 30 commits into base: master
Changes from 6 commits

Commits (30)
b670ce5
refactor
kouroshHakha Apr 25, 2025
28cd858
wip
kouroshHakha Apr 25, 2025
37d624d
lint
kouroshHakha Apr 25, 2025
1799bf5
wip
kouroshHakha Apr 25, 2025
6594f34
wip
kouroshHakha Apr 27, 2025
ff2bb07
wip
kouroshHakha Apr 28, 2025
be35a11
wip
kouroshHakha Apr 28, 2025
e1045f0
Merge branch 'master' into kh/fix-vlm-chat-template
kouroshHakha Apr 28, 2025
9da0c2a
fixed tests
kouroshHakha Apr 28, 2025
f5b4958
fixed release tests
kouroshHakha Apr 28, 2025
7ccd0d4
wip
kouroshHakha Apr 29, 2025
5804ba5
Update python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engin…
kouroshHakha Apr 29, 2025
3101e81
Update python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engin…
kouroshHakha Apr 29, 2025
b6eed94
removed serve context stuff
kouroshHakha Apr 29, 2025
2ed151e
Merge branch 'kh/fix-vlm-chat-template' of https://github.com/kourosh…
kouroshHakha Apr 29, 2025
d74febc
wip
kouroshHakha Apr 29, 2025
93ee153
wip
kouroshHakha Apr 29, 2025
a4b34f3
wip
kouroshHakha Apr 29, 2025
d4d2a81
wip
kouroshHakha Apr 29, 2025
be06a5c
fixed test
kouroshHakha Apr 29, 2025
db40d54
Fixed tests
kouroshHakha Apr 29, 2025
fd43abd
wip
kouroshHakha Apr 29, 2025
c1c4f2b
wip
kouroshHakha Apr 29, 2025
0afb5f6
wip
kouroshHakha Apr 29, 2025
f473e27
wip
kouroshHakha Apr 29, 2025
d1e4164
wip
kouroshHakha Apr 30, 2025
89e61c5
wip
kouroshHakha Apr 30, 2025
84f39fe
wip
kouroshHakha Apr 30, 2025
830a4b8
wip
kouroshHakha Apr 30, 2025
916acea
Merge branch 'master' into kh/fix-vlm-chat-template
kouroshHakha Apr 30, 2025
14 changes: 10 additions & 4 deletions python/ray/llm/_internal/serve/configs/prompt_formats.py
@@ -24,7 +24,7 @@


 class Text(BaseModel):
-    field: str = "text"
+    # field: str = "text"
     type: str = "text"
     text: str

@@ -35,18 +35,24 @@ class Text(BaseModel):
 # This is to support the "content" content type in the prompt format, as opposite of
 # the "text" content from the above which most other model uses.
 class Content(BaseModel):
-    field: str = "text"
+    # field: str = "text"
     type: str = "text"
     content: str


 class Image(BaseModel):
-    field: str = "image_url"
-    image_url: Dict
+    # field: str = "image_url"
+    type: str = "image_url"
+    image_url: Union[Dict, str]

     @field_validator("image_url")
     @classmethod
     def check_image_url(cls, value):
+        # image_url can be a string as well:
+        # https://platform.openai.com/docs/guides/images-vision?api-mode=responses&format=url
+        if isinstance(value, str):
+            return value
+
         if "url" not in value or not value["url"] or not isinstance(value["url"], str):
             raise ValueError(
                 # TODO(xwjiang): Link to doc.
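
For readers skimming the diff: the Image change above widens image_url from a Dict to Union[Dict, str] and short-circuits the validator for plain URL strings. Below is a minimal standalone sketch of the resulting behavior; it re-declares a look-alike pydantic model instead of importing Ray internals, so the class is illustrative only.

# Illustrative mirror of the Image model after this PR; not the Ray class itself.
from typing import Dict, Union

from pydantic import BaseModel, field_validator


class Image(BaseModel):
    type: str = "image_url"
    image_url: Union[Dict, str]

    @field_validator("image_url")
    @classmethod
    def check_image_url(cls, value):
        # Plain URL strings pass through, matching the OpenAI vision format.
        if isinstance(value, str):
            return value
        # The dict form still requires a non-empty string under "url".
        if "url" not in value or not value["url"] or not isinstance(value["url"], str):
            raise ValueError("image_url must be a URL string or a dict with a 'url' key")
        return value


# Both forms now validate:
Image(image_url="https://example.com/cat.png")
Image(image_url={"url": "https://example.com/cat.png"})
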
1 change: 1 addition & 0 deletions python/ray/llm/_internal/serve/configs/server_models.py
@@ -945,3 +945,4 @@ class GenerationRequest(BaseModelExtended):
     prompt: Union[str, List[int], List[str]]
     request_id: Union[str, List[str]]
     sampling_params: Optional[Union[SamplingParams, List[SamplingParams]]] = None
+    stream: bool = False
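
The new stream flag defaults to False, so existing GenerationRequest call sites are unaffected and streaming callers opt in explicitly. A hedged sketch of constructing one follows; the import path is taken from the diff, and it assumes Ray is installed with the llm extras and that the fields shown in this hunk are enough to construct the model.

# Sketch only: constructing a GenerationRequest with the new stream flag.
from ray.llm._internal.serve.configs.server_models import GenerationRequest

request = GenerationRequest(
    prompt="What is the capital of France?",
    request_id="req-123",
    stream=True,  # new field in this PR; defaults to False
)
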
59 changes: 59 additions & 0 deletions python/ray/llm/_internal/serve/deployments/llm/llm_engine.py
@@ -0,0 +1,59 @@
+from typing import AsyncGenerator, Optional
+
+from ray.llm._internal.serve.configs.server_models import (
+    Prompt,
+    LLMRawResponse,
+    LLMConfig,
+    GenerationRequest,
+    DiskMultiplexConfig,
+)
+
+
+import abc
+
+
+class LLMEngine(abc.ABC):
+    """Base class for all LLM engines"""
+
+    def __init__(self, llm_config: LLMConfig):
+        self._llm_config = llm_config
+
+    @abc.abstractmethod
+    async def start(self):
+        """Start the engine"""
+        pass
+
+    @abc.abstractmethod
+    async def prepare_request(
+        self,
+        request_id: str,
+        prompt: Prompt,
+        stream: bool,
+        disk_lora_model: Optional[DiskMultiplexConfig] = None,
+        **kwargs,
+    ) -> GenerationRequest:
+        """Prepare an EngineRequest for the engine"""
+        pass
+
+    @abc.abstractmethod
+    async def generate(
+        self, request: GenerationRequest
+    ) -> AsyncGenerator[LLMRawResponse, None]:
+        """Generate an LLMRawResponse stream"""
+        pass
+
+    async def check_health(self):
+        """Check the health of the engine"""
+        pass
+
+    async def sleep(self):
+        """Puts the engine to sleep"""
+        pass
+
+    async def wakeup(self):
+        """Wakes up the engine"""
+        pass
+
+    def shutdown(self):
+        """Shuts down the engine"""
+        pass
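
To make the contract of the new base class concrete, here is a hypothetical toy engine that subclasses it. Only the imported names come from this PR; EchoEngine itself and the generated_text field used in the final yield are illustrative assumptions, and the sketch assumes the GenerationRequest fields shown earlier are sufficient to construct the model.

# Hypothetical sketch: a toy engine implementing the LLMEngine interface.
from typing import AsyncGenerator, Optional

from ray.llm._internal.serve.configs.server_models import (
    Prompt,
    LLMRawResponse,
    GenerationRequest,
    DiskMultiplexConfig,
)
from ray.llm._internal.serve.deployments.llm.llm_engine import LLMEngine


class EchoEngine(LLMEngine):
    async def start(self):
        # A real engine would start vLLM (or another backend) here.
        pass

    async def prepare_request(
        self,
        request_id: str,
        prompt: Prompt,
        stream: bool,
        disk_lora_model: Optional[DiskMultiplexConfig] = None,
        **kwargs,
    ) -> GenerationRequest:
        # A real engine would apply the chat template and resolve images here,
        # which is the point of the refactor: formatting lives in the engine.
        return GenerationRequest(
            prompt=str(prompt), request_id=request_id, stream=stream
        )

    async def generate(
        self, request: GenerationRequest
    ) -> AsyncGenerator[LLMRawResponse, None]:
        # Echo the prompt back as a single chunk; the field name is an assumption.
        yield LLMRawResponse(generated_text=str(request.prompt))

As the llm_server.py diff below shows, the request path only calls prepare_request and generate, which is what moves the chat-formatting logic out of the server and behind the engine interface.
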
68 changes: 17 additions & 51 deletions python/ray/llm/_internal/serve/deployments/llm/llm_server.py
@@ -4,7 +4,6 @@
 from typing import AsyncGenerator, Dict, Any, Optional, Type, Union

 # Third-party imports
-from pydantic import ValidationError as PydanticValidationError
 from ray import serve
 from ray._common.utils import import_attr

@@ -15,9 +14,6 @@
     ENGINE_START_TIMEOUT_S,
     RAYLLM_VLLM_ENGINE_CLS_ENV,
 )
-from ray.llm._internal.serve.configs.error_handling import (
-    ValidationErrorWithPydantic,
-)
 from ray.llm._internal.serve.configs.openai_api_models import (
     ChatCompletionLogProb,
     ChatCompletionLogProbs,
@@ -52,10 +48,6 @@
     LoraModelLoader,
 )
 from ray.llm._internal.serve.deployments.llm.vllm.vllm_engine import VLLMEngine
-from ray.llm._internal.serve.deployments.llm.vllm.vllm_models import (
-    VLLMGenerationRequest,
-    VLLMSamplingParams,
-)
 from ray.llm._internal.serve.deployments.utils.error_handling_utils import (
     StreamingErrorHandler,
 )
@@ -511,50 +503,24 @@ async def _predict(
         """

         logger.info(f"Received streaming request {request_id}")
-        try:
-            multiplexed_model_id = serve.get_multiplexed_model_id()
-
-            if multiplexed_model_id:
-                assert (
-                    self._llm_config.lora_config is not None
-                ), "Must setup lora config for multiplexed requests."
-                disk_lora_model = await self._disk_lora_model(multiplexed_model_id)
-            else:
-                disk_lora_model = None
-
-            prompt_output = self._llm_config.prompt_format.generate_prompt(prompt)
-
-            sampling_params = VLLMSamplingParams.from_prompt(prompt)
-            prompt_text = prompt_output.text
-            image_input = prompt_output.image
-            image = []
-            if not self._llm_config.supports_vision and image_input:
-                raise RuntimeError(
-                    "You provided image input while the engine is not set up to handle images. "
-                    "Did you forget to set `input_modality` to image in yaml file?"
-                )
+        multiplexed_model_id = serve.get_multiplexed_model_id()
+
+        if multiplexed_model_id:
+            assert (
+                self._llm_config.lora_config is not None
+            ), "Must setup lora config for multiplexed requests."
+            disk_lora_model = await self._disk_lora_model(multiplexed_model_id)
+        else:
+            disk_lora_model = None
+
+        llm_request = await self.engine.prepare_request(
+            request_id=request_id,
+            prompt=prompt,
+            stream=stream,
+            disk_lora_model=disk_lora_model,
+        )

-            if self._llm_config.supports_vision and image_input:
-                for _image in image_input:
-                    image_url = _image.image_url
-                    image.append(await self.image_retriever.get(image_url))
-
-            request_params = {
-                "prompt": prompt_text,
-                "request_id": request_id,
-                "sampling_params": sampling_params,
-                "disk_multiplex_config": disk_lora_model,
-                "serve_request_context": serve.context._serve_request_context.get(),
-            }
-            if image:
-                request_params["multi_modal_data"] = {"image": image}
-            vllm_request = VLLMGenerationRequest(**request_params)
-        except PydanticValidationError as e:
-            # Wrap the PydanticValidationError in a ValidationErrorWithPydantic
-            # so that it can be used in a RayActorError
-            # See https://github.com/ray-project/ray/issues/43401
-            raise ValidationErrorWithPydantic(e) from None
-        async for llm_response in self.engine.generate(vllm_request, stream):
+        async for llm_response in self.engine.generate(llm_request):
             yield llm_response

     async def chat(self, request: ChatCompletionRequest) -> LLMChatResponse:
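
Net effect of the hunk above: _predict no longer builds a VLLMGenerationRequest or fetches images itself. Roughly, the per-request flow reduces to the following paraphrased sketch (not the literal source):

# Paraphrased sketch of the new _predict flow: prompt and image handling move
# behind engine.prepare_request, so the server only resolves the LoRA adapter
# and streams whatever the engine yields.
async def _predict(self, request_id, prompt, stream):
    multiplexed_model_id = serve.get_multiplexed_model_id()
    disk_lora_model = (
        await self._disk_lora_model(multiplexed_model_id)
        if multiplexed_model_id
        else None
    )
    llm_request = await self.engine.prepare_request(
        request_id=request_id,
        prompt=prompt,
        stream=stream,
        disk_lora_model=disk_lora_model,
    )
    async for llm_response in self.engine.generate(llm_request):
        yield llm_response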