1212# See the License for the specific language governing permissions and
1313# limitations under the License.
1414
15- from typing import Any , Dict , Optional , Union , AsyncGenerator
16- import torch
1715from argparse import Namespace
18- from fastapi import Request
16+ from typing import Any , Dict , Optional , Union , AsyncGenerator
1917from http import HTTPStatus
2018
21- from kserve .protocol .rest .openai .errors import create_error_response
22- from kserve .protocol .rest .openai import OpenAIEncoderModel , OpenAIGenerativeModel
23- from kserve .protocol .rest .openai .types import (
24- Completion ,
25- ChatCompletion ,
26- CompletionRequest ,
27- ChatCompletionRequest ,
28- EmbeddingRequest ,
29- Embedding ,
30- ErrorResponse ,
31- )
32-
33- import vllm .envs as envs
19+ import torch
20+ from fastapi import Request
3421from vllm import AsyncEngineArgs
22+ import vllm .envs as envs
3523from vllm .entrypoints .logger import RequestLogger
3624from vllm .engine .protocol import EngineClient
3725from vllm .entrypoints .openai .serving_completion import OpenAIServingCompletion
3826from vllm .entrypoints .openai .serving_chat import OpenAIServingChat
3927from vllm .entrypoints .openai .serving_embedding import OpenAIServingEmbedding
28+ from vllm .entrypoints .openai .serving_score import ServingScores
4029from vllm .entrypoints .openai .tool_parsers import ToolParserManager
4130from vllm .entrypoints .openai .serving_models import BaseModelPath , OpenAIServingModels
4231from vllm .entrypoints .openai .cli_args import validate_parsed_serve_args
4332from vllm .entrypoints .chat_utils import load_chat_template
4433from vllm .entrypoints .openai .protocol import ErrorResponse as engineError
4534from vllm .entrypoints .openai .reasoning_parsers import ReasoningParserManager
35+
36+ from kserve .protocol .rest .openai .errors import create_error_response
37+ from kserve .protocol .rest .openai import (
38+ OpenAIEncoderModel ,
39+ OpenAIGenerativeModel ,
40+ )
41+ from kserve .protocol .rest .openai .types import (
42+ Completion ,
43+ ChatCompletion ,
44+ CompletionRequest ,
45+ ChatCompletionRequest ,
46+ EmbeddingRequest ,
47+ Embedding ,
48+ ErrorResponse ,
49+ RerankRequest ,
50+ Rerank ,
51+ )
4652from .utils import build_async_engine_client_from_engine_args , build_vllm_engine_args
4753
4854
@@ -53,7 +59,11 @@ class VLLMModel(
5359 vllm_engine_args : AsyncEngineArgs = None
5460 args : Namespace = None
5561 ready : bool = False
62+ openai_serving_models : Optional [OpenAIServingModels ] = None
5663 openai_serving_completion : Optional [OpenAIServingCompletion ] = None
64+ openai_serving_chat : Optional [OpenAIServingChat ] = None
65+ openai_serving_embedding : Optional [OpenAIServingEmbedding ] = None
66+ serving_reranking : Optional [ServingScores ] = None
5767
5868 def __init__ (
5969 self ,
@@ -68,6 +78,9 @@ def __init__(
6878 self .vllm_engine_args = engine_args
6979 self .request_logger = request_logger
7080 self .model_name = model_name
81+ self .base_model_paths = []
82+ self .log_stats = True
83+ self .model_config = None
7184
7285 async def start_engine (self ):
7386 if self .args .tool_parser_plugin and len (self .args .tool_parser_plugin ) > 3 :
@@ -169,6 +182,17 @@ async def start_engine(self):
169182 else None
170183 )
171184
185+ self .serving_reranking = (
186+ ServingScores (
187+ self .engine_client ,
188+ self .model_config ,
189+ self .openai_serving_models ,
190+ request_logger = self .request_logger ,
191+ )
192+ if self .model_config .task == "score"
193+ else None
194+ )
195+
172196 self .ready = True
173197 return self .ready
174198
@@ -201,6 +225,11 @@ async def create_completion(
201225 raw_request : Optional [Request ] = None ,
202226 context : Optional [Dict [str , Any ]] = None ,
203227 ) -> Union [AsyncGenerator [str , None ], Completion , ErrorResponse ]:
228+ if self .openai_serving_completion is None :
229+ return create_error_response (
230+ message = "The model does not support Completions API" ,
231+ status_code = HTTPStatus .BAD_REQUEST ,
232+ )
204233 response = await self .openai_serving_completion .create_completion (
205234 request , raw_request
206235 )
@@ -221,6 +250,11 @@ async def create_chat_completion(
221250 raw_request : Optional [Request ] = None ,
222251 context : Optional [Dict [str , Any ]] = None ,
223252 ) -> Union [AsyncGenerator [str , None ], ChatCompletion , ErrorResponse ]:
253+ if self .openai_serving_chat is None :
254+ return create_error_response (
255+ message = "The model does not support Chat Completions API" ,
256+ status_code = HTTPStatus .BAD_REQUEST ,
257+ )
224258 response = await self .openai_serving_chat .create_chat_completion (
225259 request , raw_request
226260 )
@@ -241,6 +275,11 @@ async def create_embedding(
241275 raw_request : Optional [Request ] = None ,
242276 context : Optional [Dict [str , Any ]] = None ,
243277 ) -> Union [AsyncGenerator [str , None ], Embedding , ErrorResponse ]:
278+ if self .openai_serving_embedding is None :
279+ return create_error_response (
280+ message = "The model does not support Embeddings API" ,
281+ status_code = HTTPStatus .BAD_REQUEST ,
282+ )
244283 response = await self .openai_serving_embedding .create_embedding (
245284 request , raw_request
246285 )
@@ -254,3 +293,26 @@ async def create_embedding(
254293 )
255294
256295 return response
296+
297+ async def create_rerank (
298+ self ,
299+ request : RerankRequest ,
300+ raw_request : Optional [Request ] = None ,
301+ context : Optional [Dict [str , Any ]] = None ,
302+ ) -> Union [AsyncGenerator [str , None ], Rerank , ErrorResponse ]:
303+ if self .serving_reranking is None :
304+ return create_error_response (
305+ message = "The model does not support Rerank API" ,
306+ status_code = HTTPStatus .BAD_REQUEST ,
307+ )
308+ response = await self .serving_reranking .do_rerank (request , raw_request )
309+
310+ if isinstance (response , engineError ):
311+ return create_error_response (
312+ message = response .message ,
313+ err_type = response .type ,
314+ param = response .param ,
315+ status_code = HTTPStatus (response .code ),
316+ )
317+
318+ return response