etalab-ia
diff --git a/api/endpoints/chat.py b/api/endpoints/chat.py
Lines changed: 1 addition & 0 deletions
diff --git a/api/helpers/models/_modelregistry.py b/api/helpers/models/_modelregistry.py
Lines changed: 1 addition & 1 deletion
diff --git a/api/infrastructure/http/model/_mistralmodelhttpclient.py b/api/infrastructure/http/model/_mistralmodelhttpclient.py
Lines changed: 44 additions & 33 deletions
diff --git a/api/infrastructure/http/model/_modelhttpclient.py b/api/infrastructure/http/model/_modelhttpclient.py
Lines changed: 2 additions & 0 deletions
diff --git a/api/infrastructure/http/model/_vllmmodelhttpclient.py b/api/infrastructure/http/model/_vllmmodelhttpclient.py
Lines changed: 9 additions & 1 deletion
diff --git a/api/infrastructure/model/_modelprovidergateway.py b/api/infrastructure/model/_modelprovidergateway.py
Lines changed: 7 additions & 1 deletion
diff --git a/api/schemas/chat.py b/api/schemas/chat.py
Lines changed: 2 additions & 2 deletions
@@ -58,6 +58,7 @@ async def chat_completions(
     request_context: ContextVar[RequestContext] = Depends(get_request_context),
 ) -> JSONResponse | StreamingResponseWithStatusCode:
     """Creates a model response for the given chat conversation."""
+
     model_provider = await model_registry.get_model_provider(
         model=body.model,
         endpoint=EndpointRoute.CHAT_COMPLETIONS,
 
@@ -129,7 +129,7 @@ async def setup(self, models: list[ModelConfiguration], postgres_session: AsyncS
                 )
                 logger.info(f"Router {model.name} are created (id: {router_id})")
             except RouterAlreadyExistsException:
-                continue
+                pass
             except RouterAliasAlreadyExistsException:
                 continue
             except Exception as e:
 
@@ -2,7 +2,7 @@
 
 from mistralai.client.models import AudioChunk, ChatCompletionRequest, TextChunk, UserMessage
 
-from api.infrastructure.fastapi.schemas.models import ModelsResponse
+from api.infrastructure.fastapi.schemas.models import ModelResponse, ModelsResponse
 from api.infrastructure.http.model._modelhttpclient import ModelHttpClient, ModelHttpClientEndpoints
 from api.schemas.audio import AudioTranscription
 from api.schemas.core.models import RequestContent
@@ -14,31 +14,28 @@ class MistralModelHttpClient(ModelHttpClient):
     # request formatting
     @staticmethod
     def format_chat_completion_request(request_content: RequestContent) -> RequestContent:
-        try:
-            request_content.body = ChatCompletionRequest(**request_content.body).model_dump(by_alias=True)
-        except Exception:
-            # apply a minimal formatting and ignore error to let the provider raise the correct 422 error
-            # see https://docs.mistral.ai/api#operation-chat_completion_v1_chat_completions_post
-            request_content.body = {
-                "frequency_penalty": request_content.body.get("frequency_penalty") or 0.0,
-                "max_tokens": request_content.body.get("max_tokens"),
-                "messages": request_content.body.get("messages"),
-                "model": request_content.body.get("model"),
-                "n": request_content.body.get("n"),
-                "parallel_tool_calls": request_content.body.get("parallel_tool_calls") or False,
-                "prediction": request_content.body.get("prediction") or {},
-                "presence_penalty": request_content.body.get("presence_penalty") or 0.0,
-                "prompt_mode": request_content.body.get("prompt_mode"),
-                "random_seed": request_content.body.get("random_seed") or request_content.body.get("seed"),
-                "response_format": request_content.body.get("response_format") or {"type": "text"},
-                "safe_prompt": request_content.body.get("safe_prompt") or False,
-                "stop": request_content.body.get("stop") or [],
-                "stream": request_content.body.get("stream") or False,
-                "temperature": request_content.body.get("temperature"),
-                "tool_choice": request_content.body.get("tool_choice"),
-                "tools": request_content.body.get("tools"),
-                "top_p": request_content.body.get("top_p") or 1.0,
-            }
+        # see https://docs.mistral.ai/api#operation-chat_completion_v1_chat_completions_post
+        request_content.body = {
+            "frequency_penalty": request_content.body.get("frequency_penalty") or 0.0,
+            "max_tokens": request_content.body.get("max_tokens"),
+            "messages": request_content.body.get("messages"),
+            "model": request_content.body.get("model"),
+            "n": request_content.body.get("n"),
+            "parallel_tool_calls": request_content.body.get("parallel_tool_calls") or False,
+            "prediction": request_content.body.get("prediction") or {},
+            "presence_penalty": request_content.body.get("presence_penalty") or 0.0,
+            "prompt_mode": request_content.body.get("prompt_mode"),
+            "random_seed": request_content.body.get("random_seed") or request_content.body.get("seed"),
+            "response_format": request_content.body.get("response_format") or {"type": "text"},
+            "safe_prompt": request_content.body.get("safe_prompt") or False,
+            "stop": request_content.body.get("stop") or [],
+            "stream": request_content.body.get("stream") or False,
+            "temperature": request_content.body.get("temperature"),
+            "tool_choice": request_content.body.get("tool_choice"),
+            "tools": request_content.body.get("tools"),
+            "top_p": request_content.body.get("top_p") or 1.0,
+        }
+
         return request_content
 
     @staticmethod
@@ -57,16 +54,30 @@ def format_audio_transcription_request(request_content: RequestContent) -> Reque
         ).model_dump()
         request_content.files = {}
         request_content.form = {}
+
         return request_content
 
     # response formatting
     @staticmethod
-    def format_client_response_to_models_response(request_content: RequestContent, response_data: dict) -> ModelsResponse:
-        for model in response_data.get("data", []):
-            model.update({"type": None})
-        return ModelsResponse(**response_data)
+    def format_response_to_models_response(request_content: RequestContent, response_data: dict) -> ModelsResponse:
+        return ModelsResponse(
+            data=[
+                ModelResponse(
+                    id=model.get("id"),
+                    created=model.get("created"),
+                    owned_by=model.get("owned_by"),
+                    max_context_length=model.get("max_context_length"),
+                    aliases=model.get("aliases", []),
+                )
+                for model in response_data.get("data", [])
+            ]
+        )
 
     @staticmethod
-    def format_client_response_to_audio_transcription_response(request_content: RequestContent, response_data: dict) -> AudioTranscription:
-        text = response_data["choices"][0]["message"]["content"]
-        return AudioTranscription(text=text)
+    def format_response_to_audio_transcription_response(request_content: RequestContent, response_data: dict) -> AudioTranscription:
+        return AudioTranscription(
+            id=response_data["id"],
+            model=response_data["model"],
+            text=response_data["choices"][0]["message"]["content"],
+            usage=response_data["usage"],
+        )
@@ -93,6 +93,8 @@ def format_audio_transcription_request(self, request_content: RequestContent) ->
     @staticmethod
     def format_chat_completion_request(request_content: RequestContent) -> RequestContent:
         """This method can be overridden by children clients to format the chat completion request."""
+        # @TODO: setup default temperature by model (default=1.0)
+        # @TODO: catch stream options to avoid double usage computation
         return request_content
 
     @staticmethod
 
@@ -9,5 +9,13 @@ class VllmModelHttpClient(ModelHttpClient):
     # response formatting
     @staticmethod
     def format_response_to_models_response(request_content: RequestContent, response_data: dict) -> ModelsResponse:
-        data = [ModelResponse(max_context_length=model.get("max_model_len"), **model) for model in response_data.get("helpers/data", [])]
+        data = [
+            ModelResponse(
+                id=model.get("id"),
+                created=model.get("created"),
+                owned_by=model.get("owned_by"),
+                max_context_length=model.get("max_model_len"),
+            )
+            for model in response_data.get("data", [])
+        ]
         return ModelsResponse(data=data)
@@ -39,5 +39,11 @@ def _build_client(provider_type, url, key, timeout, model_name) -> ModelHttpClie
         }
 
         return provider_class[provider_type](
-            url=url, key=key, timeout=timeout, model_name=model_name, model_active_params=None, model_total_params=None, model_hosting_zone=None
+            url=url,
+            key=key,
+            timeout=timeout,
+            model_name=model_name,
+            model_active_params=None,
+            model_total_params=None,
+            model_hosting_zone=None,
         )
@@ -37,8 +37,8 @@ class CreateChatCompletion(BaseModel):
     stop: str | list[str] | None = Field(default_factory=list, description="Up to 4 sequences where the API will stop generating further tokens.")  # fmt: off
     stream: Literal[True, False] | None = Field(default=False, description="If set, partial message deltas will be sent. Tokens will be sent as data-only server-sent events as they become available, with the stream terminated by a data: [DONE] message.")  # fmt: off
     stream_options: Any | None = Field(default=None, description="Options for streaming response. Only set this when you set `stream: true`.")  # fmt: off
-    temperature: float | None = Field(default=0.7, description="What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. We generally recommend altering this or `top_p` but not both.")  # fmt: off
-    top_p: float | None = Field(default=1, description="An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the tokens with top_p probability mass. So 0.1 means only the tokens comprising the top 10% probability mass are considered.<br>We generally recommend altering this or `temperature` but not both.")  # fmt: off
+    temperature: float | None = Field(default=None, description="What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. We generally recommend altering this or `top_p` but not both.")  # fmt: off
+    top_p: float | None = Field(default=None, description="An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the tokens with top_p probability mass. So 0.1 means only the tokens comprising the top 10% probability mass are considered.<br>We generally recommend altering this or `temperature` but not both.")  # fmt: off
     tools: Annotated[list[dict| SearchTool] | None, Field(description="A list of tools the model may call. Currently, only functions are supported as a tool. Support function calling and build-in tools (currently only SearchTool). Use this to provide a list of functions the model may generate JSON inputs for.")] | None = Field(default=None)  # fmt: off
     tool_choice: Any = Field(default="none", description="Controls which (if any) tool is called by the model. `none` means the model will not call any tool and instead generates a message. `auto` means the model can pick between generating a message or calling one or more tools. `required` means the model must call one or more tools. Specifying a particular tool via `{\"type\": \"function\", \"function\": {\"name\": \"my_function\"}}` forces the model to call that tool.<br>`none` is the default when no tools are present. `auto` is the default if tools are present.")  # fmt: off
     parallel_tool_calls: bool | None = Field(default=False, description="Whether to call tools in parallel or sequentially. If true, the model will call tools in parallel. If false, the model will call tools sequentially. If None, the model will call tools in parallel if the model supports it, otherwise it will call tools sequentially.")  # fmt: off
Original file line number	Diff line number	Diff line change
`@@ -129,7 +129,7 @@ async def setup(self, models: list[ModelConfiguration], postgres_session: AsyncS`
`129`	`129`	`)`
`130`	`130`	`logger.info(f"Router {model.name} are created (id: {router_id})")`
`131`	`131`	`except RouterAlreadyExistsException:`
`132`		`- continue`
	`132`	`+ pass`
`133`	`133`	`except RouterAliasAlreadyExistsException:`
`134`	`134`	`continue`
`135`	`135`	`except Exception as e:`
Original file line number	Diff line number	Diff line change
`@@ -39,5 +39,11 @@ def _build_client(provider_type, url, key, timeout, model_name) -> ModelHttpClie`
`39`	`39`	`}`
`40`	`40`
`41`	`41`	`return provider_class[provider_type](`
`42`		`- url=url, key=key, timeout=timeout, model_name=model_name, model_active_params=None, model_total_params=None, model_hosting_zone=None`
	`42`	`+ url=url,`
	`43`	`+ key=key,`
	`44`	`+ timeout=timeout,`
	`45`	`+ model_name=model_name,`
	`46`	`+ model_active_params=None,`
	`47`	`+ model_total_params=None,`
	`48`	`+ model_hosting_zone=None,`
`43`	`49`	`)`