-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathroute.ts
More file actions
452 lines (401 loc) · 17.6 KB
/
route.ts
File metadata and controls
452 lines (401 loc) · 17.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
import { createAnthropic } from "@ai-sdk/anthropic";
import { gateway } from "@ai-sdk/gateway";
import {
streamText,
generateText,
convertToModelMessages,
pruneMessages,
tool,
stepCountIs,
type UIMessage,
type LanguageModel,
} from "ai";
import { z } from "zod";
import { buildSystemPrompt } from "../../../lib/agent/context";
import { SYSTEM_PROMPTS } from "../../../lib/generated/system-prompts";
import {
MAX_MESSAGE_CHARS,
CHAT_MODEL_ID,
CHAT_CONFIG,
CHAT_RESUME_CONFIG,
CHAT_REQUEST_LIMITS,
RATE_LIMIT_CONFIG,
RESUME_DOWNLOAD_PATHS,
} from "../../../lib/constants";
// Next.js requires segment config exports to be static literals, so this value
// cannot be derived from a shared constant. 120s accommodates long streamed responses.
export const maxDuration = 120;
// ─── Model Provider ─────────────────────────────────────────────────────────
// When AI_GATEWAY_API_KEY is set, requests route through the Vercel AI Gateway;
// otherwise the direct Anthropic SDK client below is used (see getModel).
const useGateway = !!process.env.AI_GATEWAY_API_KEY;
// Required beta header for 1-hour prompt cache TTL (extended-cache-ttl feature).
// Without this header, Anthropic ignores the ttl field and the cache_control block
// may be silently dropped — resulting in no caching at all.
// See: https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching
const anthropic = createAnthropic({
  headers: {
    "anthropic-beta": "extended-cache-ttl-2025-04-11",
  },
});
/**
 * Resolve the language model for a given Anthropic model id.
 *
 * Routes through the Vercel AI Gateway when AI_GATEWAY_API_KEY is configured;
 * otherwise uses the direct Anthropic SDK client.
 */
function getModel(modelId: string): LanguageModel {
  if (!useGateway) {
    console.log(`[chat] Using direct Anthropic SDK for model: ${modelId}`);
    return anthropic(modelId) as LanguageModel;
  }
  console.log(`[chat] Using AI Gateway for model: anthropic/${modelId}`);
  return gateway(`anthropic/${modelId}`) as LanguageModel;
}
// ─── Request Limits (re-export for tests) ───────────────────────────────────
// Re-exported so unit tests can import the limits from this route module directly.
export { CHAT_REQUEST_LIMITS } from "../../../lib/constants";
// Local aliases for the two limits checked on every request in POST below.
const { maxMessages: MAX_MESSAGES, maxBodyBytes: MAX_BODY_BYTES } = CHAT_REQUEST_LIMITS;
/**
 * Input schema for the generate_tailored_resume tool. Exported so tests can
 * validate tool-call payloads against the same limits the route enforces.
 */
export const generateTailoredResumeInputSchema = z.object({
  jobDescription: z
    .string()
    .max(
      CHAT_REQUEST_LIMITS.maxJobDescriptionChars,
      `Job description must be under ${CHAT_REQUEST_LIMITS.maxJobDescriptionChars} characters`,
    )
    .describe("The job description or role requirements to tailor the resume for"),
  emphasisAreas: z
    .array(
      // Coerce each item to string before length validation.
      z.coerce
        .string()
        .max(
          CHAT_REQUEST_LIMITS.maxEmphasisChars,
          `Each emphasis area must be under ${CHAT_REQUEST_LIMITS.maxEmphasisChars} characters`,
        ),
    )
    .max(
      CHAT_REQUEST_LIMITS.maxEmphasisItems,
      `Maximum ${CHAT_REQUEST_LIMITS.maxEmphasisItems} emphasis areas`,
    )
    // .nullish() accepts null as well as undefined; the tool maps null → undefined
    // before calling buildTailoredResumePrompt.
    .nullish()
    .describe("Specific areas to emphasize (e.g., 'AI/ML', 'healthcare', 'leadership')"),
});
// The get_resume_links tool takes no arguments.
export const getResumeLinksInputSchema = z.object({});
/**
 * Build the user prompt for tailored-resume generation.
 *
 * Untrusted job input is wrapped in XML tags so prompts treat it as data
 * rather than instructions. Pass chatFormat=true to append an output-format
 * constraint sized for the chat bubble (~500 words).
 */
export function buildTailoredResumePrompt(
  jobDescription: string,
  emphasisAreas?: string[],
  chatFormat?: boolean,
): string {
  const constraint = chatFormat
    ? `\n\nFormat for chat display — maximum 500 words:\n- Summary: 3–4 sentences\n- Experience: 3 most relevant positions, 2–3 bullets each (action verb + metric)\n- Skills: single compact line\nOmit publications, certifications, education, and older roles unless critical for this role.`
    : "";
  const segments = [
    `Generate a tailored resume for the following job description.${constraint}`,
    "<job_description>",
    jobDescription,
    "</job_description>",
  ];
  if (emphasisAreas?.length) {
    segments.push("<emphasis_areas>", emphasisAreas.join(", "), "</emphasis_areas>");
  }
  return segments.join("\n");
}
// ─── Rate Limiting (Upstash + in-memory fallback) ───────────────────────────
// Lazily-initialized limiter; stays null until initRateLimit() first runs.
let ratelimit: { limit: (key: string) => Promise<{ success: boolean }> } | null = null;
// Per-key request timestamps backing the in-process fallback limiter.
const memoryStore = new Map<string, number[]>();
/**
 * Sliding-window limiter over the in-process Map.
 * Returns true when the request is allowed, false when the key is over quota.
 */
function memoryRateLimit(key: string): boolean {
  const currentTime = Date.now();
  const inWindow = (memoryStore.get(key) ?? []).filter(
    (stamp) => currentTime - stamp < RATE_LIMIT_CONFIG.windowMs,
  );
  const allowed = inWindow.length < RATE_LIMIT_CONFIG.maxRequests;
  // Only allowed requests consume quota; either way, persist the pruned window.
  if (allowed) inWindow.push(currentTime);
  memoryStore.set(key, inWindow);
  return allowed;
}
// Periodic cleanup so stale rate-limit entries don't accumulate (every 5 minutes).
// A flag on globalThis guards against registering duplicate intervals when the
// module is re-evaluated (e.g. dev-mode hot reload).
if (typeof globalThis !== "undefined") {
  const globalRef = globalThis as unknown as { _rateLimitCleanup?: boolean };
  if (!globalRef._rateLimitCleanup) {
    globalRef._rateLimitCleanup = true;
    const sweep = () => {
      const now = Date.now();
      for (const [key, stamps] of memoryStore) {
        const fresh = stamps.filter((t) => now - t < RATE_LIMIT_CONFIG.windowMs);
        if (fresh.length > 0) memoryStore.set(key, fresh);
        else memoryStore.delete(key);
      }
    };
    // unref (where available) keeps the timer from holding the process open.
    setInterval(sweep, 5 * 60_000).unref?.();
  }
}
/**
 * Initialize the rate limiter exactly once (memoized in the module-level
 * `ratelimit` variable).
 *
 * Uses Upstash Redis with a sliding window when Redis credentials are present
 * in the environment; otherwise — or when Redis initialization throws — falls
 * back to the in-process memoryRateLimit.
 */
async function initRateLimit() {
  if (ratelimit !== null) return ratelimit;
  const memoryFallback = {
    limit: async (key: string) => ({ success: memoryRateLimit(key) }),
  };
  try {
    const hasRedisEnv =
      (process.env.UPSTASH_REDIS_REST_URL && process.env.UPSTASH_REDIS_REST_TOKEN) ||
      (process.env.KV_REST_API_URL && process.env.KV_REST_API_TOKEN);
    if (!hasRedisEnv) {
      ratelimit = memoryFallback;
      return ratelimit;
    }
    // Dynamic imports keep @upstash packages out of the bundle when unused.
    const { Ratelimit } = await import("@upstash/ratelimit");
    const { Redis } = await import("@upstash/redis");
    ratelimit = new Ratelimit({
      redis: Redis.fromEnv(),
      limiter: Ratelimit.slidingWindow(
        RATE_LIMIT_CONFIG.maxRequests,
        `${RATE_LIMIT_CONFIG.windowMs / 1000} s`,
      ),
      analytics: true,
      prefix: RATE_LIMIT_CONFIG.prefix,
    });
  } catch (err) {
    console.warn("[rate-limit] Upstash Redis init failed, using in-memory fallback:", err);
    ratelimit = memoryFallback;
  }
  return ratelimit;
}
// ─── Cached System Prompts ──────────────────────────────────────────────────
// Runtime-built prompts memoized per mode; only consulted when the pre-generated
// SYSTEM_PROMPTS file does not cover the requested mode (dev fallback).
const promptCache = new Map<string, string>();
/**
 * Return the system prompt for a chat mode.
 *
 * Prefers the pre-built prompt committed alongside career-data.json and
 * regenerated by `npm run build:prompts` (zero I/O, byte-identical across
 * instances). Falls back to building at runtime and caching the result.
 *
 * @throws Error when neither source can produce a prompt for the mode.
 */
function getSystemPrompt(mode: "chat" | "tools" | "resume-generator"): string {
  const prebuilt = SYSTEM_PROMPTS[mode];
  if (prebuilt) return prebuilt;

  let prompt = promptCache.get(mode);
  if (prompt) return prompt;

  prompt = buildSystemPrompt(mode);
  if (!prompt) {
    throw new Error(
      `Failed to build system prompt for mode "${mode}". Ensure the pipeline has been run (npm run pipeline).`,
    );
  }
  promptCache.set(mode, prompt);
  return prompt;
}
// ─── Input Validation Helpers ───────────────────────────────────────────────
/** Sum the lengths of all text parts in a message; non-text parts count 0. */
function messageTextLength(msg: UIMessage): number {
  if (!Array.isArray(msg.parts)) return 0;
  return msg.parts.reduce(
    (sum, part) => (part.type === "text" ? sum + part.text.length : sum),
    0,
  );
}
/**
 * Reject any message whose combined text length exceeds MAX_MESSAGE_CHARS.
 * Returns a human-readable error for the first offending message (1-based
 * position), or null when every message is within bounds.
 */
function validateMessages(messages: UIMessage[]): string | null {
  let position = 0;
  for (const message of messages) {
    position += 1;
    if (messageTextLength(message) > MAX_MESSAGE_CHARS) {
      return `Message ${position} exceeds maximum length (${MAX_MESSAGE_CHARS} characters)`;
    }
  }
  return null;
}
// ─── Route Handler ──────────────────────────────────────────────────────────
/**
 * POST /api/chat — validate the request, rate-limit the caller, then stream a
 * model response (resume tools are enabled only in "chat" mode).
 *
 * Body shape: { messages: UIMessage[]; mode?: "chat" | "tools" }.
 * Returns plain-text 4xx/5xx errors, or a UI message stream on success.
 */
export async function POST(request: Request) {
  // Reject non-JSON payloads before doing any other work.
  const contentType = request.headers.get("content-type");
  if (!contentType?.includes("application/json")) {
    return new Response("Content-Type must be application/json", { status: 415 });
  }
  // Rate limit before reading the body. Key preference: x-real-ip, then the
  // first x-forwarded-for hop, then a user-agent-derived pseudo-key so clients
  // with neither header still get coarse limiting.
  const rl = await initRateLimit();
  const forwarded = request.headers.get("x-forwarded-for")?.split(",")[0]?.trim();
  const ip =
    request.headers.get("x-real-ip") ??
    forwarded ??
    `anon-${(request.headers.get("user-agent") ?? "").slice(0, 32)}`;
  const { success } = await rl.limit(ip);
  if (!success) {
    return new Response("Too many requests. Please try again in a minute.", {
      status: 429,
      headers: { "Retry-After": String(Math.ceil(RATE_LIMIT_CONFIG.windowMs / 1000)) },
    });
  }
  // Cheap size pre-check from the header before buffering the body.
  const contentLength = request.headers.get("content-length");
  if (contentLength && parseInt(contentLength, 10) > MAX_BODY_BYTES) {
    return new Response("Request body too large", { status: 413 });
  }
  let body: { messages?: unknown; mode?: unknown };
  try {
    const rawText = await request.text();
    // NOTE(review): .length counts UTF-16 code units, not bytes — a multi-byte
    // body can exceed MAX_BODY_BYTES in raw bytes; acceptable as a coarse guard.
    if (rawText.length > MAX_BODY_BYTES) {
      return new Response("Request body too large", { status: 413 });
    }
    body = JSON.parse(rawText);
  } catch {
    return new Response("Invalid JSON in request body", { status: 400 });
  }
  // JSON.parse can yield null or a primitive; require an object envelope.
  if (body === null || typeof body !== "object") {
    return new Response("Request body must be a JSON object", { status: 400 });
  }
  const { messages, mode } = body as {
    messages: UIMessage[];
    mode?: "chat" | "tools";
  };
  if (!messages || !Array.isArray(messages) || messages.length === 0) {
    return new Response("Messages array is required", { status: 400 });
  }
  if (messages.length > MAX_MESSAGES) {
    return new Response(`Too many messages (max ${MAX_MESSAGES})`, { status: 400 });
  }
  const msgError = validateMessages(messages);
  if (msgError) {
    return new Response(msgError, { status: 400 });
  }
  // Unknown modes fall back to "chat" rather than erroring.
  const validMode = mode === "tools" ? "tools" : "chat";
  // Fail fast with 503 when neither the gateway nor a direct API key is configured.
  if (!useGateway && !process.env.ANTHROPIC_API_KEY) {
    console.error("[chat] ANTHROPIC_API_KEY is not set");
    return new Response("AI service is not configured. Please try again later.", { status: 503 });
  }
  let systemPrompt: string;
  try {
    systemPrompt = getSystemPrompt(validMode);
  } catch (err) {
    console.error("[chat] Failed to build system prompt:", err);
    return new Response("Service temporarily unavailable. Please try again later.", {
      status: 503,
    });
  }
  const rawModelMessages = await convertToModelMessages(messages);
  // Keep tool-call/result pairs intact ("none") so multi-turn tool exchanges
  // remain valid for Anthropic's API. Pruning tool calls with "before-last-message"
  // strips the tool-call block from prior assistant messages, which either leaves an
  // empty content array (rejected by Anthropic) or an orphaned tool-result — both
  // cause silent stream failures on the second tailored-resume request. Token budget
  // is bounded by MAX_MESSAGES (50) and MAX_BODY_BYTES (256KB). Reasoning blocks are
  // still pruned as they are large and not needed for conversation continuity.
  const modelMessages = pruneMessages({
    messages: rawModelMessages,
    toolCalls: "none",
    reasoning: "before-last-message",
  }).filter((m) => m.content.length > 0); // never send empty content arrays to Anthropic
  // Tool definitions are only attached in "chat" mode; "tools" mode streams
  // without them — NOTE(review): the mode name suggests the opposite; confirm intent.
  const chatTools =
    validMode === "chat"
      ? {
          generate_tailored_resume: tool({
            description:
              "Generate a tailored version of Paul Prae's resume optimized for a specific job description. Use when a recruiter provides a JD or asks for a customized resume.",
            inputSchema: generateTailoredResumeInputSchema,
            execute: async ({ jobDescription, emphasisAreas }) => {
              try {
                // Truncate as a safety net in case the model produces verbose content
                // that approaches schema limits. null → undefined for buildTailoredResumePrompt.
                const safeJD = jobDescription.slice(0, CHAT_REQUEST_LIMITS.maxJobDescriptionChars);
                const safeEmphasis = emphasisAreas ?? undefined;
                const resumeSystemPrompt = getSystemPrompt("resume-generator");
                const userPrompt = buildTailoredResumePrompt(safeJD, safeEmphasis, true);
                const { text, finishReason } = await generateText({
                  model: getModel(CHAT_MODEL_ID),
                  // Cache control must be on the system message content block, not top-level
                  // providerOptions — the @ai-sdk/anthropic provider only applies cache_control
                  // to the Anthropic API when it is set via per-message providerOptions.
                  system: {
                    role: "system",
                    content: resumeSystemPrompt,
                    providerOptions: {
                      anthropic: { cacheControl: { type: "ephemeral", ttl: "1h" } },
                    },
                  },
                  prompt: userPrompt,
                  maxOutputTokens: CHAT_RESUME_CONFIG.maxOutputTokens,
                  temperature: CHAT_RESUME_CONFIG.temperature,
                });
                // Guard against degenerate generations before returning them to the chat.
                if (!text || text.length < 100) {
                  return {
                    error:
                      "Resume generation produced insufficient output. Please try rephrasing your job description.",
                  };
                }
                // "length" means the output hit maxOutputTokens and was cut off.
                if (finishReason === "length") {
                  return {
                    error:
                      "The tailored resume was too long to display in chat. Please try a shorter job description, or download the full resume via the PDF link in the navigation bar.",
                  };
                }
                return {
                  tailoredResume: text,
                  standardResumeLinks: RESUME_DOWNLOAD_PATHS,
                  instructions:
                    "Present the tailored resume below, formatted as markdown. Tell the user: copy it with the copy button on this message; download the full standard resume via the PDF link in the navigation bar.",
                };
              } catch (err) {
                // Tool errors are returned as data (not thrown) so the model can
                // relay a friendly message instead of breaking the stream.
                console.error("[tool:generate_tailored_resume]", err);
                return {
                  error:
                    "Resume generation failed. This may be due to high demand. Please try again in a moment.",
                };
              }
            },
          }),
          get_resume_links: tool({
            description:
              "Get download links for Paul Prae's resume in various formats. Use when someone asks to download or view the resume.",
            inputSchema: getResumeLinksInputSchema,
            execute: async () => RESUME_DOWNLOAD_PATHS,
          }),
        }
      : undefined;
  try {
    // Prompt caching: cache_control must be on the system message content block.
    // Passing cacheControl via top-level providerOptions puts it on the API request
    // root, which Anthropic ignores for caching. Per-message providerOptions is the
    // correct approach — the SDK only writes cache_control to the content block when
    // the system message is passed as a SystemModelMessage object (not a string).
    const result = streamText({
      model: getModel(CHAT_MODEL_ID),
      system: {
        role: "system",
        content: systemPrompt,
        providerOptions: {
          anthropic: { cacheControl: { type: "ephemeral", ttl: "1h" } },
        },
      },
      messages: modelMessages,
      maxOutputTokens: CHAT_CONFIG.maxOutputTokens,
      temperature:
        validMode === "tools" ? CHAT_CONFIG.toolsTemperature : CHAT_CONFIG.chatTemperature,
      tools: chatTools,
      // With tools attached, allow up to 2 steps (tool call + final answer);
      // otherwise a single step.
      stopWhen: chatTools ? stepCountIs(2) : stepCountIs(1),
      onError({ error }) {
        // Normalize non-Error throwables before logging.
        const errObj = error instanceof Error ? error : new Error(String(error));
        console.error(
          `[chat] Stream error (${useGateway ? "gateway" : "direct"}):`,
          errObj.message,
          errObj.cause ?? "",
          errObj.stack ?? "",
        );
      },
      onFinish({ usage, providerMetadata }) {
        // Log cache metrics to confirm 1h TTL is active (not 5m default).
        // In Vercel logs: look for ephemeral_1h > 0 (1h write) vs ephemeral_5m > 0 (5m fallback).
        // providerMetadata.anthropic.usage contains the raw Anthropic API response fields.
        const anthropicRaw = (providerMetadata?.anthropic as Record<string, unknown> | undefined)
          ?.usage as Record<string, unknown> | undefined;
        console.log(
          "[chat] cache_tokens:",
          JSON.stringify({
            cache_read: usage.inputTokenDetails?.cacheReadTokens ?? 0,
            cache_write: usage.inputTokenDetails?.cacheWriteTokens ?? 0,
            ephemeral_1h: anthropicRaw?.ephemeral_1h_input_tokens ?? "n/a",
            ephemeral_5m: anthropicRaw?.ephemeral_5m_input_tokens ?? "n/a",
          }),
        );
      },
    });
    // Clients get a generic message; details stay in the server logs above.
    return result.toUIMessageStreamResponse({
      onError: () => "An error occurred while generating a response. Please try again.",
    });
  } catch (err) {
    console.error(`[chat] ${useGateway ? "Gateway" : "Anthropic"} API error:`, err);
    // Surface upstream 429/529 with tailored copy; anything else becomes a 500-range error.
    const status =
      err instanceof Error && "status" in err ? (err as { status: number }).status : 500;
    const message =
      status === 429
        ? "The AI service is rate limited. Please try again in a moment."
        : status === 529
          ? "The AI service is temporarily overloaded. Please try again in a moment."
          : "An error occurred while generating a response. Please try again.";
    return new Response(message, { status: status >= 400 ? status : 500 });
  }
}