Skip to content

Commit 186898d

Browse files
authored
feat(completions): implement legacy /v1/completions endpoint (#683)
* feat(completions): implement legacy /v1/completions endpoint Replace the NOT_IMPLEMENTED stub with a working text-completions handler. The prompt is translated into a single user chat message, sent through the existing completion service, and the chat result is reshaped back into the OpenAI `object: "text_completion"` format. Supports both streaming (SSE text deltas + [DONE]) and non-streaming, carries the Inference-Id header and usage passthrough (including cached_tokens), and registers the route and OpenAPI path. Adds unit tests for the response/chunk transforms. Note: because the prompt is sent as a chat message the backend applies its chat template, and the synthesized response is not byte-verifiable against attestation (unlike /v1/chat/completions). * fix(completions): address review on /v1/completions - OpenAPI: response body is CompletionResponse (not ChatCompletionResponse); derive ToSchema on CompletionResponse/CompletionChoice and register them. - CompletionRequest.extra was missing #[serde(flatten)], so a normal request ({"model","prompt"}) failed to deserialize and unknown fields were dropped. - Reject E2E-encryption headers and auto-redact opt-in with 400 instead of silently bypassing them (the response reshape is incompatible with passing the provider's encrypted/un-redacted bytes through). - Warn when the Inference-Id can't be extracted from the first stream chunk, matching chat_completions. - Add deserialize tests for the flatten fix. * fix(completions): handle advertised legacy params on /v1/completions Per review: CompletionRequest advertises echo/logprobs/best_of/presence_penalty/frequency_penalty but the endpoint silently dropped them. - Forward presence_penalty/frequency_penalty to the provider via `extra` (standard sampling params the chat backend accepts; no typed slot on the service request). - Reject echo / logprobs / best_of>1 with 400 unsupported_parameter — they have no equivalent under the translate-to-chat path. 
- Add tests for the rejection helper and penalty forwarding. * fix(completions): accept OpenAI one-or-many prompt/stop shapes Per review: {"stop":"\n"} and {"prompt":["a","b"]} failed JSON extraction (framework 422) before the handler could return a clean error. - `stop` now accepts string or array (StopSequences), normalized to Vec. - `prompt` now accepts string / string[] / int[] / int[][] (CompletionPrompt) so all OpenAI shapes deserialize; single-string is served, batch and token-id prompts are rejected with 400 unsupported_parameter (no mapping under translate-to-chat). - Remove the now-unused (and now type-incompatible) From<CompletionRequest> for CompletionParams; the native text-completion path it fed is never wired. - Register the new schemas in OpenAPI; add deserialization tests for the one-or-many shapes.
1 parent 9575972 commit 186898d

5 files changed

Lines changed: 642 additions & 75 deletions

File tree

crates/api/src/conversions.rs

Lines changed: 1 addition & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
use crate::{middleware::AuthenticatedUser, models::*};
22
use inference_providers::{
3-
ChatCompletionParams, ChatMessage, CompletionParams, FinishReason, MessageRole, TokenUsage,
3+
ChatCompletionParams, ChatMessage, FinishReason, MessageRole, TokenUsage,
44
};
55
use services::completions::CompletionError;
66

@@ -119,31 +119,6 @@ impl From<ChatCompletionRequest> for ChatCompletionParams {
119119
}
120120
}
121121

122-
impl From<CompletionRequest> for CompletionParams {
123-
fn from(req: CompletionRequest) -> Self {
124-
Self {
125-
model: req.model,
126-
prompt: req.prompt,
127-
max_tokens: req.max_tokens,
128-
temperature: req.temperature,
129-
top_p: req.top_p,
130-
n: req.n,
131-
stream: req.stream,
132-
stop: req.stop,
133-
frequency_penalty: req.frequency_penalty,
134-
presence_penalty: req.presence_penalty,
135-
logit_bias: None,
136-
logprobs: req.logprobs,
137-
echo: req.echo,
138-
best_of: req.best_of,
139-
seed: None,
140-
user: None,
141-
suffix: None,
142-
stream_options: None,
143-
}
144-
}
145-
}
146-
147122
impl From<ChatMessage> for crate::models::Message {
148123
fn from(msg: ChatMessage) -> Self {
149124
let content = msg.content.and_then(|v| {

crates/api/src/lib.rs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,8 @@ use crate::{
1818
},
1919
billing::{get_billing_costs, BillingRouteState},
2020
completions::{
21-
audio_transcriptions, chat_completions, embeddings, image_edits, image_generations,
22-
models, privacy_classify, privacy_redact, rerank, score,
21+
audio_transcriptions, chat_completions, completions, embeddings, image_edits,
22+
image_generations, models, privacy_classify, privacy_redact, rerank, score,
2323
},
2424
conversations,
2525
feature_requests::{
@@ -1057,6 +1057,7 @@ pub fn build_completion_routes(
10571057
// Use default body limit (~2 MB) since they only accept JSON
10581058
let text_inference_routes = Router::new()
10591059
.route("/chat/completions", post(chat_completions))
1060+
.route("/completions", post(completions))
10601061
.route("/images/generations", post(image_generations))
10611062
.route("/audio/transcriptions", post(audio_transcriptions))
10621063
.route("/rerank", post(rerank))

crates/api/src/models.rs

Lines changed: 54 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -173,10 +173,57 @@ pub struct ChatChoice {
173173
pub finish_reason: Option<String>, // "stop", "length", "content_filter"
174174
}
175175

176+
/// OpenAI `/v1/completions` `prompt`: a single string, a batch of strings, or
177+
/// token-ID array(s). This endpoint serves only the single-string form; the
178+
/// other shapes still deserialize (so the handler can return a clean 400 rather
179+
/// than a framework deserialization error) and are rejected there.
180+
#[derive(Debug, Clone, Deserialize, ToSchema)]
181+
#[serde(untagged)]
182+
pub enum CompletionPrompt {
183+
Text(String),
184+
Strings(Vec<String>),
185+
Tokens(Vec<i64>),
186+
TokenBatches(Vec<Vec<i64>>),
187+
}
188+
189+
impl CompletionPrompt {
190+
/// Resolve to the single text prompt this endpoint supports, or an error
191+
/// message explaining why the shape is unsupported.
192+
pub fn single_text(&self) -> Result<&str, &'static str> {
193+
match self {
194+
CompletionPrompt::Text(s) => Ok(s),
195+
CompletionPrompt::Strings(v) if v.len() == 1 => Ok(&v[0]),
196+
CompletionPrompt::Strings(_) => Err(
197+
"array (batch) prompts are not supported on /v1/completions; send a single string prompt",
198+
),
199+
CompletionPrompt::Tokens(_) | CompletionPrompt::TokenBatches(_) => Err(
200+
"token-id prompts are not supported on /v1/completions; send a string prompt",
201+
),
202+
}
203+
}
204+
}
205+
206+
/// OpenAI `stop`: either a single string or an array of strings.
207+
#[derive(Debug, Clone, Deserialize, ToSchema)]
208+
#[serde(untagged)]
209+
pub enum StopSequences {
210+
Single(String),
211+
Many(Vec<String>),
212+
}
213+
214+
impl StopSequences {
215+
pub fn into_vec(self) -> Vec<String> {
216+
match self {
217+
StopSequences::Single(s) => vec![s],
218+
StopSequences::Many(v) => v,
219+
}
220+
}
221+
}
222+
176223
#[derive(Debug, Deserialize, ToSchema)]
177224
pub struct CompletionRequest {
178225
pub model: String,
179-
pub prompt: String,
226+
pub prompt: CompletionPrompt,
180227
pub max_tokens: Option<i64>,
181228
#[serde(default = "default_temperature")]
182229
pub temperature: Option<f32>,
@@ -187,15 +234,16 @@ pub struct CompletionRequest {
187234
pub stream: Option<bool>,
188235
pub logprobs: Option<i64>,
189236
pub echo: Option<bool>,
190-
pub stop: Option<Vec<String>>,
237+
pub stop: Option<StopSequences>,
191238
pub presence_penalty: Option<f32>,
192239
pub frequency_penalty: Option<f32>,
193240
pub best_of: Option<i64>,
194241

242+
#[serde(flatten)]
195243
pub extra: std::collections::HashMap<String, serde_json::Value>,
196244
}
197245

198-
#[derive(Debug, Serialize)]
246+
#[derive(Debug, Serialize, ToSchema)]
199247
pub struct CompletionResponse {
200248
pub id: String,
201249
pub object: String, // "text_completion"
@@ -792,7 +840,7 @@ pub struct TopProvider {
792840
pub is_moderated: bool,
793841
}
794842

795-
#[derive(Debug, Serialize)]
843+
#[derive(Debug, Serialize, ToSchema)]
796844
pub struct CompletionChoice {
797845
pub index: i64,
798846
pub text: String,
@@ -1016,9 +1064,8 @@ impl CompletionRequest {
10161064
return Err("model is required".to_string());
10171065
}
10181066

1019-
if self.prompt.is_empty() {
1020-
return Err("prompt is required".to_string());
1021-
}
1067+
// `prompt` shape is resolved in the handler (single_text) so unsupported
1068+
// shapes get a 400 unsupported_parameter rather than a generic error.
10221069

10231070
if let Some(temp) = self.temperature {
10241071
if !(0.0..=2.0).contains(&temp) {

crates/api/src/openapi.rs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ use utoipa::{Modify, OpenApi};
5151
crate::routes::completions::score,
5252
crate::routes::completions::privacy_classify,
5353
crate::routes::completions::privacy_redact,
54-
// crate::routes::completions::completions,
54+
crate::routes::completions::completions,
5555
crate::routes::completions::models,
5656
// Model endpoints (public model catalog)
5757
crate::routes::models::list_models,
@@ -165,7 +165,8 @@ use utoipa::{Modify, OpenApi};
165165
crate::routes::health::HealthResponse,
166166
// Core API models
167167
ChatCompletionRequest, ChatCompletionResponse, Message, CompletionUsage,
168-
CompletionRequest, ModelsResponse, ModelInfo, ModelPricing, TopProvider, ErrorResponse,
168+
CompletionRequest, CompletionPrompt, StopSequences, CompletionResponse,
169+
CompletionChoice, ModelsResponse, ModelInfo, ModelPricing, TopProvider, ErrorResponse,
169170
// Image generation models
170171
ImageGenerationRequest, ImageGenerationResponse, ImageData,
171172
// Audio transcription models

0 commit comments

Comments
 (0)