|
| 1 | +/** |
| 2 | + * Binary content persistence for web_fetch. |
| 3 | + * |
| 4 | + * When a fetched URL returns binary data (PDFs, images, archives, etc.) the |
| 5 | + * raw bytes are written to disk with a mime-derived extension so the agent can |
| 6 | + * inspect the file later. The decoded text is still summarized inline; the |
| 7 | + * saved file is a supplement, not a replacement. |
| 8 | + */ |
| 9 | +import { existsSync, mkdirSync, writeFileSync } from 'node:fs'; |
| 10 | +import { dirname, join } from 'node:path'; |
| 11 | +import { dexterPath } from '../../utils/paths.js'; |
| 12 | + |
/**
 * Output directory for persisted binary downloads, obtained from
 * `dexterPath('web-fetch')`.
 */
const WEB_FETCH_OUTPUT_DIR: string = dexterPath('web-fetch');
| 15 | + |
/**
 * Media-type prefixes that are always handled as text.
 *
 * Matching is by prefix (`startsWith`), so `'text/'` covers the whole `text`
 * top-level type while the remaining entries whitelist textual formats that
 * live under `application/*` or `image/*`.
 */
const TEXTUAL_CONTENT_TYPES: readonly string[] = [
  'text/',
  'application/json',
  'application/xml',
  'application/xhtml+xml',
  'application/javascript',
  'application/ecmascript',
  'application/ld+json',
  'application/rss+xml',
  'application/atom+xml',
  'image/svg+xml',
];
| 30 | + |
/**
 * Media type -> file extension (without the leading dot).
 *
 * Gives persisted downloads a real extension so downstream tools (and the
 * user) can open them. Keys must be lower-case media types with no
 * parameters, because lookups are done on the normalized content type.
 * Types that are not listed here are saved with a generic `bin` extension.
 */
const MIME_TO_EXTENSION: Record<string, string> = {
  // Documents
  'application/pdf': 'pdf',
  'application/rtf': 'rtf',
  'application/epub+zip': 'epub',
  'application/msword': 'doc',
  'application/vnd.ms-excel': 'xls',
  'application/vnd.ms-powerpoint': 'ppt',
  'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
  'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'xlsx',
  'application/vnd.openxmlformats-officedocument.presentationml.presentation': 'pptx',

  // Text-like formats
  'application/json': 'json',
  'text/csv': 'csv',
  'text/plain': 'txt',
  'text/html': 'html',
  'text/markdown': 'md',

  // Archives
  'application/zip': 'zip',
  'application/gzip': 'gz',
  'application/x-tar': 'tar',
  'application/x-7z-compressed': '7z',

  // Audio
  'audio/mpeg': 'mp3',
  'audio/wav': 'wav',
  'audio/ogg': 'ogg',
  'audio/flac': 'flac',
  'audio/aac': 'aac',
  'audio/mp4': 'm4a',

  // Video
  'video/mp4': 'mp4',
  'video/webm': 'webm',
  'video/quicktime': 'mov',

  // Images
  'image/png': 'png',
  'image/jpeg': 'jpg',
  'image/gif': 'gif',
  'image/webp': 'webp',
  'image/svg+xml': 'svg',
  'image/bmp': 'bmp',
  'image/tiff': 'tiff',
  'image/avif': 'avif',
  'image/x-icon': 'ico',
  'image/vnd.microsoft.icon': 'ico',
};
| 57 | + |
/**
 * Reduce a Content-Type header value to its bare media type: everything
 * before the first `;` (i.e. without parameters such as `charset`), with
 * surrounding whitespace removed and lower-cased.
 */
function normalizeContentType(contentType: string): string {
  const paramsStart = contentType.indexOf(';');
  const mediaType = paramsStart === -1 ? contentType : contentType.slice(0, paramsStart);
  return mediaType.trim().toLowerCase();
}
| 61 | + |
// Structured-syntax suffixes (RFC 6839) whose payload is text, e.g.
// `application/problem+json` or `application/soap+xml`.
const TEXTUAL_STRUCTURED_SUFFIXES: readonly string[] = ['+json', '+xml'];

/**
 * Whether a content type should be treated as binary (saved to disk) rather
 * than decoded as text.
 *
 * A type is textual when its normalized media type starts with one of the
 * `TEXTUAL_CONTENT_TYPES` prefixes, or ends with a textual structured-syntax
 * suffix (`+json`, `+xml`). Anything else is considered binary.
 *
 * @param contentType - Raw Content-Type header value; parameters and casing
 *   are ignored.
 * @returns `true` for binary content; `false` for textual content and for an
 *   empty or missing media type.
 */
export function isBinaryContentType(contentType: string): boolean {
  const normalized = normalizeContentType(contentType);
  if (!normalized) {
    // No usable media type: do not classify the response as binary.
    return false;
  }
  const isTextual =
    TEXTUAL_CONTENT_TYPES.some((prefix) => normalized.startsWith(prefix)) ||
    TEXTUAL_STRUCTURED_SUFFIXES.some((suffix) => normalized.endsWith(suffix));
  return !isTextual;
}
| 76 | + |
/**
 * Pick a file extension (no leading dot) for a Content-Type header value.
 *
 * @param contentType - Raw Content-Type header value; it is normalized before
 *   the lookup.
 * @returns The extension registered in `MIME_TO_EXTENSION`, or `'bin'` when
 *   the media type is not listed.
 */
function extensionForContentType(contentType: string): string {
  const normalized = normalizeContentType(contentType);
  // Own-property check: a plain-object lookup also resolves inherited keys, so
  // a header value of `constructor` or `__proto__` would yield a non-string
  // that `??` does not replace and that would end up in the file name.
  if (!Object.prototype.hasOwnProperty.call(MIME_TO_EXTENSION, normalized)) {
    return 'bin';
  }
  return MIME_TO_EXTENSION[normalized] ?? 'bin';
}
| 81 | + |
/** Successful write: the path of the saved file and the number of bytes written. */
type PersistSuccess = { filepath: string; size: number };

/** Failed write: a human-readable description of what went wrong. */
type PersistFailure = { error: string };

/** Outcome of persisting a binary download: either success details or an error. */
export type PersistResult = PersistSuccess | PersistFailure;
| 83 | + |
/**
 * Persist raw binary bytes to the web-fetch output directory.
 *
 * The file is written to `<WEB_FETCH_OUTPUT_DIR>/<id>.<extension>`, where the
 * extension is derived from `contentType` (falling back to `bin`). Missing
 * parent directories are created. This function does not throw: any failure
 * while building the path or writing the file is reported in the result.
 *
 * @param buffer - Raw bytes to write.
 * @param contentType - Content-Type header value used to choose the extension.
 * @param id - File name stem for the saved file.
 * @returns `{ filepath, size }` on success (`size` is `buffer.length`), or
 *   `{ error }` carrying the failure message.
 */
export function persistBinaryContent(
  buffer: Buffer,
  contentType: string,
  id: string,
): PersistResult {
  try {
    const extension = extensionForContentType(contentType);
    // NOTE(review): `id` is interpolated into the path as-is; this assumes
    // callers pass a safe file name stem (no `..` segments) — confirm.
    const filepath = join(WEB_FETCH_OUTPUT_DIR, `${id}.${extension}`);
    // `recursive: true` makes mkdirSync succeed when the directory already
    // exists, so no prior existence check is needed; skipping it also avoids
    // a check-then-create race.
    mkdirSync(dirname(filepath), { recursive: true });
    writeFileSync(filepath, buffer);
    return { filepath, size: buffer.length };
  } catch (error) {
    return { error: error instanceof Error ? error.message : String(error) };
  }
}
0 commit comments