feat(completions): implement legacy /v1/completions endpoint (#683)

Evrard-Nil · web-flow · commit 186898de9f85 · 2026-05-28T10:42:57.000+02:00
* feat(completions): implement legacy /v1/completions endpoint

Replace the NOT_IMPLEMENTED stub with a working text-completions handler.
The prompt is translated into a single user chat message, sent through the
existing completion service, and the chat result is reshaped back into the
OpenAI `object: "text_completion"` format. Supports both streaming (SSE
text deltas + [DONE]) and non-streaming, carries the Inference-Id header
and usage passthrough (including cached_tokens), and registers the route
and OpenAPI path. Adds unit tests for the response/chunk transforms.

Note: because the prompt is sent as a chat message the backend applies its
chat template, and the synthesized response is not byte-verifiable against
attestation (unlike /v1/chat/completions).

* fix(completions): address review on /v1/completions

- OpenAPI: response body is CompletionResponse (not ChatCompletionResponse);
  derive ToSchema on CompletionResponse/CompletionChoice and register them.
- CompletionRequest.extra was missing #[serde(flatten)], so a normal request
  ({"model","prompt"}) failed to deserialize and unknown fields were dropped.
- Reject E2E-encryption headers and auto-redact opt-in with 400 instead of
  silently bypassing them (the response reshape is incompatible with passing
  the provider's encrypted/un-redacted bytes through).
- Warn when the Inference-Id can't be extracted from the first stream chunk,
  matching chat_completions.
- Add deserialize tests for the flatten fix.

* fix(completions): handle advertised legacy params on /v1/completions

Per review: CompletionRequest advertises echo/logprobs/best_of/
presence_penalty/frequency_penalty but the endpoint silently dropped them.

- Forward presence_penalty/frequency_penalty to the provider via `extra`
  (standard sampling params the chat backend accepts; no typed slot on the
  service request).
- Reject echo / logprobs / best_of>1 with 400 unsupported_parameter — they
  have no equivalent under the translate-to-chat path.
- Add tests for the rejection helper and penalty forwarding.

* fix(completions): accept OpenAI one-or-many prompt/stop shapes

Per review: {"stop":"\n"} and {"prompt":["a","b"]} failed JSON extraction
(framework 422) before the handler could return a clean error.

- `stop` now accepts string or array (StopSequences), normalized to Vec.
- `prompt` now accepts string / string[] / int[] / int[][] (CompletionPrompt)
  so all OpenAI shapes deserialize; single-string is served, batch and
  token-id prompts are rejected with 400 unsupported_parameter (no mapping
  under translate-to-chat).
- Remove the now-unused (and now type-incompatible) From<CompletionRequest>
  for CompletionParams; the native text-completion path it fed is never wired.
- Register the new schemas in OpenAPI; add deserialization tests for the
  one-or-many shapes.
diff --git a/crates/api/src/conversions.rs b/crates/api/src/conversions.rs
@@ -1,6 +1,6 @@
 use crate::{middleware::AuthenticatedUser, models::*};
 use inference_providers::{
-    ChatCompletionParams, ChatMessage, CompletionParams, FinishReason, MessageRole, TokenUsage,
+    ChatCompletionParams, ChatMessage, FinishReason, MessageRole, TokenUsage,
 };
 use services::completions::CompletionError;
 
@@ -119,31 +119,6 @@ impl From<ChatCompletionRequest> for ChatCompletionParams {
     }
 }
 
-impl From<CompletionRequest> for CompletionParams {
-    fn from(req: CompletionRequest) -> Self {
-        Self {
-            model: req.model,
-            prompt: req.prompt,
-            max_tokens: req.max_tokens,
-            temperature: req.temperature,
-            top_p: req.top_p,
-            n: req.n,
-            stream: req.stream,
-            stop: req.stop,
-            frequency_penalty: req.frequency_penalty,
-            presence_penalty: req.presence_penalty,
-            logit_bias: None,
-            logprobs: req.logprobs,
-            echo: req.echo,
-            best_of: req.best_of,
-            seed: None,
-            user: None,
-            suffix: None,
-            stream_options: None,
-        }
-    }
-}
-
 impl From<ChatMessage> for crate::models::Message {
     fn from(msg: ChatMessage) -> Self {
         let content = msg.content.and_then(|v| {
diff --git a/crates/api/src/lib.rs b/crates/api/src/lib.rs
@@ -18,8 +18,8 @@ use crate::{
         },
         billing::{get_billing_costs, BillingRouteState},
         completions::{
-            audio_transcriptions, chat_completions, embeddings, image_edits, image_generations,
-            models, privacy_classify, privacy_redact, rerank, score,
+            audio_transcriptions, chat_completions, completions, embeddings, image_edits,
+            image_generations, models, privacy_classify, privacy_redact, rerank, score,
         },
         conversations,
         feature_requests::{
@@ -1057,6 +1057,7 @@ pub fn build_completion_routes(
     // Use default body limit (~2 MB) since they only accept JSON
     let text_inference_routes = Router::new()
         .route("/chat/completions", post(chat_completions))
+        .route("/completions", post(completions))
         .route("/images/generations", post(image_generations))
         .route("/audio/transcriptions", post(audio_transcriptions))
         .route("/rerank", post(rerank))
diff --git a/crates/api/src/models.rs b/crates/api/src/models.rs
@@ -173,10 +173,57 @@ pub struct ChatChoice {
     pub finish_reason: Option<String>, // "stop", "length", "content_filter"
 }
 
+/// OpenAI `/v1/completions` `prompt`: a single string, a batch of strings, or
+/// token-ID array(s). This endpoint serves only the single-string form; the
+/// other shapes still deserialize (so the handler can return a clean 400 rather
+/// than a framework deserialization error) and are rejected there.
+#[derive(Debug, Clone, Deserialize, ToSchema)]
+#[serde(untagged)]
+pub enum CompletionPrompt {
+    Text(String),
+    Strings(Vec<String>),
+    Tokens(Vec<i64>),
+    TokenBatches(Vec<Vec<i64>>),
+}
+
+impl CompletionPrompt {
+    /// Resolve to the single text prompt this endpoint supports, or an error
+    /// message explaining why the shape is unsupported.
+    pub fn single_text(&self) -> Result<&str, &'static str> {
+        match self {
+            CompletionPrompt::Text(s) => Ok(s),
+            CompletionPrompt::Strings(v) if v.len() == 1 => Ok(&v[0]),
+            CompletionPrompt::Strings(_) => Err(
+                "array (batch) prompts are not supported on /v1/completions; send a single string prompt",
+            ),
+            CompletionPrompt::Tokens(_) | CompletionPrompt::TokenBatches(_) => Err(
+                "token-id prompts are not supported on /v1/completions; send a string prompt",
+            ),
+        }
+    }
+}
+
+/// OpenAI `stop`: either a single string or an array of strings.
+#[derive(Debug, Clone, Deserialize, ToSchema)]
+#[serde(untagged)]
+pub enum StopSequences {
+    Single(String),
+    Many(Vec<String>),
+}
+
+impl StopSequences {
+    pub fn into_vec(self) -> Vec<String> {
+        match self {
+            StopSequences::Single(s) => vec![s],
+            StopSequences::Many(v) => v,
+        }
+    }
+}
+
 #[derive(Debug, Deserialize, ToSchema)]
 pub struct CompletionRequest {
     pub model: String,
-    pub prompt: String,
+    pub prompt: CompletionPrompt,
     pub max_tokens: Option<i64>,
     #[serde(default = "default_temperature")]
     pub temperature: Option<f32>,
@@ -187,15 +234,16 @@ pub struct CompletionRequest {
     pub stream: Option<bool>,
     pub logprobs: Option<i64>,
     pub echo: Option<bool>,
-    pub stop: Option<Vec<String>>,
+    pub stop: Option<StopSequences>,
     pub presence_penalty: Option<f32>,
     pub frequency_penalty: Option<f32>,
     pub best_of: Option<i64>,
 
+    #[serde(flatten)]
     pub extra: std::collections::HashMap<String, serde_json::Value>,
 }
 
-#[derive(Debug, Serialize)]
+#[derive(Debug, Serialize, ToSchema)]
 pub struct CompletionResponse {
     pub id: String,
     pub object: String, // "text_completion"
@@ -792,7 +840,7 @@ pub struct TopProvider {
     pub is_moderated: bool,
 }
 
-#[derive(Debug, Serialize)]
+#[derive(Debug, Serialize, ToSchema)]
 pub struct CompletionChoice {
     pub index: i64,
     pub text: String,
@@ -1016,9 +1064,8 @@ impl CompletionRequest {
             return Err("model is required".to_string());
         }
 
-        if self.prompt.is_empty() {
-            return Err("prompt is required".to_string());
-        }
+        // `prompt` shape is resolved in the handler (single_text) so unsupported
+        // shapes get a 400 unsupported_parameter rather than a generic error.
 
         if let Some(temp) = self.temperature {
             if !(0.0..=2.0).contains(&temp) {
diff --git a/crates/api/src/openapi.rs b/crates/api/src/openapi.rs
@@ -51,7 +51,7 @@ use utoipa::{Modify, OpenApi};
         crate::routes::completions::score,
         crate::routes::completions::privacy_classify,
         crate::routes::completions::privacy_redact,
-        // crate::routes::completions::completions,
+        crate::routes::completions::completions,
         crate::routes::completions::models,
         // Model endpoints (public model catalog)
         crate::routes::models::list_models,
@@ -165,7 +165,8 @@ use utoipa::{Modify, OpenApi};
             crate::routes::health::HealthResponse,
             // Core API models
             ChatCompletionRequest, ChatCompletionResponse, Message, CompletionUsage,
-            CompletionRequest, ModelsResponse, ModelInfo, ModelPricing, TopProvider, ErrorResponse,
+            CompletionRequest, CompletionPrompt, StopSequences, CompletionResponse,
+            CompletionChoice, ModelsResponse, ModelInfo, ModelPricing, TopProvider, ErrorResponse,
             // Image generation models
             ImageGenerationRequest, ImageGenerationResponse, ImageData,
             // Audio transcription models
diff --git a/crates/api/src/routes/completions.rs b/crates/api/src/routes/completions.rs