Skip to content

Commit efd3d6b

Browse files
feat: [ENG-3733] Image output support on AI Gateway (#5409)
* added image pro preview pricing
* image output support in one turn
* add top k
* update sdk and add image gen config

Co-authored-by: Justin <justintorre75@gmail.com>
1 parent 125f5b8 commit efd3d6b

File tree

16 files changed

+388
-41
lines changed

16 files changed

+388
-41
lines changed

packages/__tests__/cost/__snapshots__/registrySnapshots.test.ts.snap

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8751,8 +8751,8 @@ exports[`Registry Snapshots verify registry state 1`] = `
87518751
"claude-3.5-haiku:anthropic:*",
87528752
],
87538753
"totalArchivedConfigs": 0,
8754-
"totalEndpoints": 255,
8755-
"totalModelProviderConfigs": 255,
8754+
"totalEndpoints": 256,
8755+
"totalModelProviderConfigs": 256,
87568756
"totalModelsWithPtb": 95,
87578757
"totalProviders": 21,
87588758
}

packages/llm-mapper/transform/providers/google/response/toOpenai.ts

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import {
66
OpenAIToolCall,
77
OpenAIUsage,
88
OpenAIReasoningDetail,
9+
ChatCompletionContentPartImage,
910
} from "../../../types/openai";
1011
import {
1112
GoogleCandidate,
@@ -56,13 +57,15 @@ function mapCandidates(candidates: GoogleCandidate[]): OpenAIChoice[] {
5657
let tool_calls: OpenAIToolCall[] = [];
5758
let reasoning: string | undefined;
5859
let reasoning_details: OpenAIReasoningDetail[] | undefined;
60+
let images: ChatCompletionContentPartImage[] | undefined;
5961

6062
if (candidate.content) {
6163
const extracted = extractContent(candidate.content);
6264
content = extracted.content;
6365
tool_calls = extracted.tool_calls;
6466
reasoning = extracted.reasoning;
6567
reasoning_details = extracted.reasoning_details;
68+
images = extracted.images;
6669
}
6770

6871
return {
@@ -73,6 +76,7 @@ function mapCandidates(candidates: GoogleCandidate[]): OpenAIChoice[] {
7376
...(reasoning && { reasoning }),
7477
...(reasoning_details && reasoning_details.length > 0 && { reasoning_details }),
7578
...(tool_calls.length > 0 && { tool_calls }),
79+
...(images && images.length > 0 && { images }),
7680
} as OpenAIResponseMessage,
7781
finish_reason: mapGoogleFinishReason(candidate.finishReason),
7882
logprobs: null,
@@ -88,6 +92,7 @@ interface ExtractedContent {
8892
tool_calls: OpenAIToolCall[];
8993
reasoning?: string;
9094
reasoning_details?: OpenAIReasoningDetail[];
95+
images?: ChatCompletionContentPartImage[];
9196
}
9297

9398
/**
@@ -106,6 +111,7 @@ function extractContent(
106111
const textParts: string[] = [];
107112
const thinkingTexts: string[] = [];
108113
const toolCalls: OpenAIToolCall[] = [];
114+
const imageParts: ChatCompletionContentPartImage[] = [];
109115
let collectedSignature: string | undefined;
110116

111117
for (const block of contents) {
@@ -127,6 +133,15 @@ function extractContent(
127133

128134
if (part.functionCall) {
129135
toolCalls.push(mapToolCall(part.functionCall, toolCalls.length));
136+
} else if (part.inlineData) {
137+
const mimeType = part.inlineData.mimeType || "image/png";
138+
const dataUri = `data:${mimeType};base64,${part.inlineData.data}`;
139+
imageParts.push({
140+
type: "image_url",
141+
image_url: {
142+
url: dataUri,
143+
},
144+
});
130145
} else if (part.text) {
131146
// Check if this is a thinking part (Google uses thought: true)
132147
if (part.thought === true) {
@@ -155,6 +170,11 @@ function extractContent(
155170
}));
156171
}
157172

173+
// Add images if image parts were found
174+
if (imageParts.length > 0) {
175+
result.images = imageParts;
176+
}
177+
158178
return result;
159179
}
160180

packages/llm-mapper/transform/providers/google/streamedResponse/toOpenai.ts

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,31 @@ export class GoogleToOpenAIStreamConverter {
136136
);
137137

138138
this.toolCallIndex += 1;
139+
} else if (part.inlineData) {
140+
// Handle image output from Google's image generation models
141+
const mimeType = part.inlineData.mimeType || "image/png";
142+
const dataUri = `data:${mimeType};base64,${part.inlineData.data}`;
143+
chunks.push(
144+
this.createChunk({
145+
choices: [
146+
{
147+
index: candidate.index ?? 0,
148+
delta: {
149+
images: [
150+
{
151+
type: "image_url",
152+
image_url: {
153+
url: dataUri,
154+
},
155+
},
156+
],
157+
},
158+
logprobs: null,
159+
finish_reason: null,
160+
},
161+
],
162+
})
163+
);
139164
} else if (part.text) {
140165
// Check if this is a thinking part (Google uses thought: true)
141166
if (part.thought === true) {

packages/llm-mapper/transform/providers/openai/request/toAnthropic.ts

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -379,6 +379,11 @@ function mapMessages(
379379
};
380380

381381
if (message.role === "assistant" || message.role === "user") {
382+
// Check if assistant message has images - not supported by Anthropic
383+
if (message.role === "assistant" && message.images && message.images.length > 0) {
384+
throw new Error("Image outputs in assistant messages are not supported by Anthropic");
385+
}
386+
382387
const processedToolCallContent: AnthropicContentBlock[] = [];
383388

384389
if (message.role === "assistant" && message.tool_calls) {

packages/llm-mapper/transform/providers/openai/request/toGoogle.ts

Lines changed: 58 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,14 @@
1-
import { ExtendedHeliconeChatCreateParams, GeminiContent, GeminiGenerateContentRequest, GeminiGenerationConfig, GeminiPart, GeminiThinkingConfig, GeminiTool, GeminiToolConfig, GoogleReasoningOptions, ChatCompletionMessage } from "../../../types/google";
1+
import { GeminiContent, GeminiGenerateContentRequest, GeminiGenerationConfig, GeminiImageConfig, GeminiPart, GeminiThinkingConfig, GeminiTool, GeminiToolConfig, GoogleReasoningOptions } from "../../../types/google";
22
import {
33
HeliconeChatCompletionContentPart,
44
HeliconeChatCreateParams,
5+
HeliconeChatCompletionMessageParam,
6+
HeliconeImageGenerationConfig,
57
} from "@helicone-package/prompts/types";
68
import { ChatCompletionTool } from "openai/resources/chat/completions";
79

10+
type ChatCompletionMessage = NonNullable<HeliconeChatCreateParams["messages"]>[number];
11+
812
export function toGoogle(
913
openAIBody: HeliconeChatCreateParams
1014
): GeminiGenerateContentRequest {
@@ -63,6 +67,9 @@ export function toGoogle(
6367
if (message.role === "assistant") {
6468
const toolCallParts = mapToolCallsToParts(message);
6569
parts.push(...toolCallParts);
70+
71+
const imageParts = mapImagesToParts(message);
72+
parts.push(...imageParts);
6673
}
6774

6875
if (parts.length === 0) {
@@ -107,16 +114,13 @@ export function toGoogle(
107114
function buildGenerationConfig(
108115
body: HeliconeChatCreateParams
109116
): GeminiGenerationConfig | undefined {
110-
const bodyWithExtensions = body as ExtendedHeliconeChatCreateParams;
111-
112117
const getNumberOrUndefined = (
113118
value?: number | null
114119
): number | undefined => {
115120
return typeof value === "number" ? value : undefined;
116121
};
117122

118123
const maxOutputTokens =
119-
getNumberOrUndefined(bodyWithExtensions.max_output_tokens) ??
120124
getNumberOrUndefined(body.max_completion_tokens) ??
121125
getNumberOrUndefined(body.max_tokens) ??
122126
undefined;
@@ -137,7 +141,7 @@ function buildGenerationConfig(
137141
if (topP !== undefined) {
138142
config.topP = topP;
139143
}
140-
const topK = getNumberOrUndefined(bodyWithExtensions.top_k);
144+
const topK = getNumberOrUndefined(body.top_k);
141145
if (topK !== undefined) {
142146
config.topK = topK;
143147
}
@@ -166,6 +170,11 @@ function buildGenerationConfig(
166170
config.thinkingConfig = thinkingConfig;
167171
}
168172

173+
const imageConfig = buildImageConfig(body);
174+
if (imageConfig) {
175+
config.imageConfig = imageConfig;
176+
}
177+
169178
return Object.keys(config).length > 0 ? config : undefined;
170179
}
171180

@@ -264,6 +273,20 @@ function buildThinkingConfig(
264273
return thinkingConfig;
265274
}
266275

276+
function buildImageConfig(body: HeliconeChatCreateParams): GeminiImageConfig | undefined {
277+
const heliconeImageConfig = body.image_generation;
278+
279+
if (heliconeImageConfig === undefined) {
280+
return undefined;
281+
}
282+
283+
const imageConfig: GeminiImageConfig = {
284+
aspectRatio: heliconeImageConfig.aspect_ratio,
285+
imageSize: heliconeImageConfig.image_size
286+
};
287+
return imageConfig;
288+
}
289+
267290
function buildTools(body: HeliconeChatCreateParams): GeminiTool[] | undefined {
268291
if (!body.tools || body.tools.length === 0) {
269292
return undefined;
@@ -499,3 +522,33 @@ function parseArguments(
499522
return { raw: value };
500523
}
501524
}
525+
526+
/**
527+
* Maps images from assistant messages to Gemini inlineData parts.
528+
* This handles image outputs that were generated by previous model responses.
529+
*/
530+
function mapImagesToParts(message: HeliconeChatCompletionMessageParam): GeminiPart[] {
531+
const parts: GeminiPart[] = [];
532+
533+
if (!message.images) {
534+
return parts;
535+
}
536+
537+
for (const image of message.images) {
538+
if (image.type === "image_url" && image.image_url?.url) {
539+
const dataUri = image.image_url.url;
540+
if (dataUri.startsWith("data:")) {
541+
const [meta, data] = dataUri.split(",");
542+
let mimeType = meta.split(";")[0].replace("data:", "");
543+
if (!mimeType || mimeType.trim() === "") {
544+
mimeType = "image/png";
545+
}
546+
parts.push({
547+
inlineData: { mimeType, data },
548+
});
549+
}
550+
}
551+
}
552+
553+
return parts;
554+
}

packages/llm-mapper/transform/providers/responses/openai/response/toResponses.ts

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
1-
import { OpenAIResponseBody } from "../../../../types/openai";
1+
import { OpenAIResponseBody, ChatCompletionContentPartImage } from "../../../../types/openai";
22
import {
33
ResponsesResponseBody,
44
ResponsesMessageOutputItem,
55
ResponsesUsage,
66
ResponsesReasoningItem,
7+
ResponsesOutputContentPart,
78
} from "../../../../types/responses";
89

910
export function toResponses(body: OpenAIResponseBody): ResponsesResponseBody {
@@ -31,13 +32,36 @@ export function toResponses(body: OpenAIResponseBody): ResponsesResponseBody {
3132
output.push(reasoningItem);
3233
}
3334

35+
// Build message content from text and images
36+
const messageContent: ResponsesOutputContentPart[] = [];
37+
3438
if (message?.content) {
39+
messageContent.push({
40+
type: "output_text",
41+
text: message.content,
42+
annotations: message.annotations ?? [],
43+
});
44+
}
45+
46+
// Map images to output_image parts
47+
if (message?.images && message.images.length > 0) {
48+
for (const img of message.images) {
49+
messageContent.push({
50+
type: "output_image",
51+
image_url: img.image_url?.url || "",
52+
detail: img.image_url?.detail,
53+
});
54+
}
55+
}
56+
57+
// Only create a message output item if we have content
58+
if (messageContent.length > 0) {
3559
const msg: ResponsesMessageOutputItem = {
3660
id: `msg_${Math.random().toString(36).slice(2, 10)}`,
3761
type: "message",
3862
status: "completed",
3963
role: "assistant",
40-
content: [{ type: "output_text", text: message.content, annotations: message.annotations ?? [] }],
64+
content: messageContent,
4165
};
4266
output.push(msg);
4367
}

packages/llm-mapper/transform/providers/responses/request/toChatCompletions.ts

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,13 +29,20 @@ function convertContentParts(
2929
return { type: "text", text: p.text };
3030
case "input_image": {
3131
if (p.image_url) {
32-
return { type: "image_url", image_url: { url: p.image_url } };
32+
return { type: "image_url", image_url: { url: p.image_url, detail: p.detail } };
3333
}
3434
// Chat Completions does not support file_id for images directly
3535
throw new Error(
3636
"input_image with file_id is not supported by Chat Completions"
3737
);
3838
}
39+
// Handle output_image when responses output is fed back as input
40+
case "output_image": {
41+
if (p.image_url) {
42+
return { type: "image_url", image_url: { url: p.image_url, detail: p.detail } };
43+
}
44+
throw new Error("output_image missing image_url");
45+
}
3946
case "input_file":
4047
// Chat Completions API does not support arbitrary files as message parts
4148
throw new Error("input_file is not supported by Chat Completions");
@@ -215,6 +222,7 @@ export function toChatCompletions(
215222
max_tokens: body.max_output_tokens,
216223
temperature: body.temperature,
217224
top_p: body.top_p,
225+
top_k: body.top_k,
218226
n: body.n,
219227
stream: body.stream,
220228
tools,
@@ -234,6 +242,7 @@ export function toChatCompletions(
234242
stream_options: body.stream_options,
235243
// Context editing passthrough (only supported by Anthropic - will be stripped for other providers)
236244
context_editing: body.context_editing,
245+
image_generation: body.image_generation,
237246
// Deprecated passthroughs (supported by Chat Completions clients)
238247
function_call: body.function_call,
239248
functions: body.functions,

0 commit comments

Comments (0)