Skip to content

Commit 30c866d

Browse files
authored
fix(openai-shim): preserve tool result images and local token caps (#659)
Keep tool-result images as real image_url parts for OpenAI-compatible requests and use max_tokens for local providers like Ollama and LM Studio.
1 parent f6a4455 commit 30c866d

2 files changed

Lines changed: 242 additions & 17 deletions

File tree

src/services/api/openaiShim.test.ts

Lines changed: 201 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -403,6 +403,97 @@ test('preserves usage from final OpenAI stream chunk with empty choices', async
403403
expect(usageEvent?.usage?.output_tokens).toBe(45)
404404
})
405405

406+
test('uses max_tokens instead of max_completion_tokens for local providers', async () => {
407+
process.env.OPENAI_BASE_URL = 'http://localhost:11434/v1'
408+
409+
globalThis.fetch = (async (_input, init) => {
410+
const body = JSON.parse(String(init?.body))
411+
expect(body.max_tokens).toBe(64)
412+
expect(body.max_completion_tokens).toBeUndefined()
413+
expect(body.stream_options).toBeUndefined()
414+
415+
return new Response(
416+
JSON.stringify({
417+
id: 'chatcmpl-1',
418+
model: 'llama3.1:8b',
419+
choices: [
420+
{
421+
message: {
422+
role: 'assistant',
423+
content: 'hello',
424+
},
425+
finish_reason: 'stop',
426+
},
427+
],
428+
usage: {
429+
prompt_tokens: 5,
430+
completion_tokens: 1,
431+
total_tokens: 6,
432+
},
433+
}),
434+
{
435+
headers: {
436+
'Content-Type': 'application/json',
437+
},
438+
},
439+
)
440+
}) as FetchType
441+
442+
const client = createOpenAIShimClient({}) as OpenAIShimClient
443+
444+
await client.beta.messages.create({
445+
model: 'llama3.1:8b',
446+
messages: [{ role: 'user', content: 'hello' }],
447+
max_tokens: 64,
448+
stream: false,
449+
})
450+
})
451+
452+
test('keeps max_completion_tokens for non-local non-github providers', async () => {
453+
process.env.OPENAI_BASE_URL = 'https://api.openai.com/v1'
454+
455+
globalThis.fetch = (async (_input, init) => {
456+
const body = JSON.parse(String(init?.body))
457+
expect(body.max_completion_tokens).toBe(64)
458+
expect(body.max_tokens).toBeUndefined()
459+
460+
return new Response(
461+
JSON.stringify({
462+
id: 'chatcmpl-1',
463+
model: 'gpt-4o',
464+
choices: [
465+
{
466+
message: {
467+
role: 'assistant',
468+
content: 'hello',
469+
},
470+
finish_reason: 'stop',
471+
},
472+
],
473+
usage: {
474+
prompt_tokens: 5,
475+
completion_tokens: 1,
476+
total_tokens: 6,
477+
},
478+
}),
479+
{
480+
headers: {
481+
'Content-Type': 'application/json',
482+
},
483+
},
484+
)
485+
}) as FetchType
486+
487+
const client = createOpenAIShimClient({}) as OpenAIShimClient
488+
489+
await client.beta.messages.create({
490+
model: 'gpt-4o',
491+
messages: [{ role: 'user', content: 'hello' }],
492+
max_tokens: 64,
493+
stream: false,
494+
})
495+
})
496+
406497
test('preserves Gemini tool call extra_content in follow-up requests', async () => {
407498
let requestBody: Record<string, unknown> | undefined
408499

@@ -689,9 +780,117 @@ test('preserves image tool results as placeholders in follow-up requests', async
689780

690781
const toolMessage = (requestBody?.messages as Array<Record<string, unknown>>).find(
691782
message => message.role === 'tool',
692-
) as { content?: string } | undefined
783+
) as {
784+
content?: Array<{
785+
type: string
786+
text?: string
787+
image_url?: { url: string }
788+
}> | string
789+
} | undefined
790+
791+
expect(Array.isArray(toolMessage?.content)).toBe(true)
792+
const parts = toolMessage?.content as Array<{
793+
type: string
794+
text?: string
795+
image_url?: { url: string }
796+
}>
797+
const imagePart = parts.find(part => part.type === 'image_url')
798+
expect(imagePart?.image_url?.url).toBe('data:image/png;base64,ZmFrZQ==')
799+
})
693800

694-
expect(toolMessage?.content).toContain('[image:image/png]')
801+
test('preserves mixed text and image tool results as multipart content', async () => {
802+
let requestBody: Record<string, unknown> | undefined
803+
804+
globalThis.fetch = (async (_input, init) => {
805+
requestBody = JSON.parse(String(init?.body))
806+
807+
return new Response(
808+
JSON.stringify({
809+
id: 'chatcmpl-1',
810+
model: 'gpt-4o',
811+
choices: [
812+
{
813+
message: {
814+
role: 'assistant',
815+
content: 'done',
816+
},
817+
finish_reason: 'stop',
818+
},
819+
],
820+
usage: {
821+
prompt_tokens: 12,
822+
completion_tokens: 4,
823+
total_tokens: 16,
824+
},
825+
}),
826+
{
827+
headers: {
828+
'Content-Type': 'application/json',
829+
},
830+
},
831+
)
832+
}) as FetchType
833+
834+
const client = createOpenAIShimClient({}) as OpenAIShimClient
835+
836+
await client.beta.messages.create({
837+
model: 'gpt-4o',
838+
system: 'test system',
839+
messages: [
840+
{ role: 'user', content: 'Read this screenshot' },
841+
{
842+
role: 'assistant',
843+
content: [
844+
{
845+
type: 'tool_use',
846+
id: 'call_image_2',
847+
name: 'Read',
848+
input: { file_path: 'C:\\temp\\screenshot.png' },
849+
},
850+
],
851+
},
852+
{
853+
role: 'user',
854+
content: [
855+
{
856+
type: 'tool_result',
857+
tool_use_id: 'call_image_2',
858+
content: [
859+
{ type: 'text', text: 'Screenshot captured' },
860+
{
861+
type: 'image',
862+
source: {
863+
type: 'base64',
864+
media_type: 'image/png',
865+
data: 'ZmFrZQ==',
866+
},
867+
},
868+
],
869+
},
870+
],
871+
},
872+
],
873+
max_tokens: 64,
874+
stream: false,
875+
})
876+
877+
const toolMessage = (requestBody?.messages as Array<Record<string, unknown>>).find(
878+
message => message.role === 'tool',
879+
) as {
880+
content?: Array<{
881+
type: string
882+
text?: string
883+
image_url?: { url: string }
884+
}>
885+
} | undefined
886+
887+
expect(Array.isArray(toolMessage?.content)).toBe(true)
888+
const parts = toolMessage?.content ?? []
889+
expect(parts[0]).toEqual({ type: 'text', text: 'Screenshot captured' })
890+
expect(parts[1]).toEqual({
891+
type: 'image_url',
892+
image_url: { url: 'data:image/png;base64,ZmFrZQ==' },
893+
})
695894
})
696895

697896
test('uses GEMINI_ACCESS_TOKEN for Gemini OpenAI-compatible requests', async () => {

src/services/api/openaiShim.ts

Lines changed: 41 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -176,35 +176,61 @@ function convertSystemPrompt(
176176
return String(system)
177177
}
178178

179-
function convertToolResultContent(content: unknown): string {
180-
if (typeof content === 'string') return content
181-
if (!Array.isArray(content)) return JSON.stringify(content ?? '')
179+
function convertToolResultContent(
180+
content: unknown,
181+
isError?: boolean,
182+
): string | Array<{ type: string; text?: string; image_url?: { url: string } }> {
183+
if (typeof content === 'string') {
184+
return isError ? `Error: ${content}` : content
185+
}
186+
if (!Array.isArray(content)) {
187+
const text = JSON.stringify(content ?? '')
188+
return isError ? `Error: ${text}` : text
189+
}
182190

183-
const chunks: string[] = []
191+
const parts: Array<{
192+
type: string
193+
text?: string
194+
image_url?: { url: string }
195+
}> = []
184196
for (const block of content) {
185197
if (block?.type === 'text' && typeof block.text === 'string') {
186-
chunks.push(block.text)
198+
parts.push({ type: 'text', text: block.text })
187199
continue
188200
}
189201

190202
if (block?.type === 'image') {
191203
const source = block.source
192204
if (source?.type === 'url' && source.url) {
193-
chunks.push(`[Image](${source.url})`)
194-
} else if (source?.type === 'base64') {
195-
chunks.push(`[image:${source.media_type ?? 'unknown'}]`)
196-
} else {
197-
chunks.push('[image]')
205+
parts.push({ type: 'image_url', image_url: { url: source.url } })
206+
} else if (source?.type === 'base64' && source.media_type && source.data) {
207+
parts.push({
208+
type: 'image_url',
209+
image_url: {
210+
url: `data:${source.media_type};base64,${source.data}`,
211+
},
212+
})
198213
}
199214
continue
200215
}
201216

202217
if (typeof block?.text === 'string') {
203-
chunks.push(block.text)
218+
parts.push({ type: 'text', text: block.text })
204219
}
205220
}
206221

207-
return chunks.join('\n')
222+
if (parts.length === 0) return ''
223+
if (parts.length === 1 && parts[0].type === 'text') {
224+
const text = parts[0].text ?? ''
225+
return isError ? `Error: ${text}` : text
226+
}
227+
if (isError && parts[0]?.type === 'text') {
228+
parts[0] = { ...parts[0], text: `Error: ${parts[0].text ?? ''}` }
229+
} else if (isError) {
230+
parts.unshift({ type: 'text', text: 'Error:' })
231+
}
232+
233+
return parts
208234
}
209235

210236
function convertContentBlocks(
@@ -292,11 +318,10 @@ function convertMessages(
292318

293319
// Emit tool results as tool messages
294320
for (const tr of toolResults) {
295-
const trContent = convertToolResultContent(tr.content)
296321
result.push({
297322
role: 'tool',
298323
tool_call_id: tr.tool_use_id ?? 'unknown',
299-
content: tr.is_error ? `Error: ${trContent}` : trContent,
324+
content: convertToolResultContent(tr.content, tr.is_error),
300325
})
301326
}
302327

@@ -1216,12 +1241,13 @@ class OpenAIShimMessages {
12161241

12171242
const isGithub = isGithubModelsMode()
12181243
const isMistral = isMistralMode()
1244+
const isLocal = isLocalProviderUrl(request.baseUrl)
12191245

12201246
const githubEndpointType = getGithubEndpointType(request.baseUrl)
12211247
const isGithubCopilot = isGithub && githubEndpointType === 'copilot'
12221248
const isGithubModels = isGithub && (githubEndpointType === 'models' || githubEndpointType === 'custom')
12231249

1224-
if ((isGithub || isMistral) && body.max_completion_tokens !== undefined) {
1250+
if ((isGithub || isMistral || isLocal) && body.max_completion_tokens !== undefined) {
12251251
body.max_tokens = body.max_completion_tokens
12261252
delete body.max_completion_tokens
12271253
}

0 commit comments

Comments (0)