diff --git a/db/db.ts b/db/db.ts index ca6e853df3e..e203c54a38e 100644 --- a/db/db.ts +++ b/db/db.ts @@ -1238,3 +1238,57 @@ export const getUniqueTopicCount = async ( if (!count) throw new Error("Failed to get unique topic count") return count } + +/** + * Data structure for topics to be indexed in AI Search + */ +export interface TopicForAISearch { + id: number + name: string + slug: string + excerpt: string + markdown: string +} + +/** + * Get topics (tags with associated topic pages) for AI Search indexing. + * Returns tags that have a matching published topic page (TopicPage or LinearTopicPage). + */ +export async function getTopicsForAISearch( + knex: KnexReadonlyTransaction +): Promise { + const rows = await knexRaw<{ + id: number + name: string + slug: string + excerpt: string | null + markdown: string | null + }>( + knex, + `-- sql + SELECT + t.id, + t.name, + t.slug, + JSON_UNQUOTE(JSON_EXTRACT(p.content, '$.excerpt')) AS excerpt, + p.markdown + FROM tags t + JOIN posts_gdocs p ON t.slug = p.slug + WHERE + t.slug IS NOT NULL + AND p.published = 1 + AND p.type IN (:types) + ORDER BY t.name ASC`, + { + types: [OwidGdocType.TopicPage, OwidGdocType.LinearTopicPage], + } + ) + + return rows.map((row) => ({ + id: row.id, + name: row.name, + slug: row.slug, + excerpt: row.excerpt ?? "", + markdown: row.markdown ?? "", + })) +} diff --git a/functions/_common/env.ts b/functions/_common/env.ts index 6e13b0ea954..7c99f0929fb 100644 --- a/functions/_common/env.ts +++ b/functions/_common/env.ts @@ -1,5 +1,8 @@ export interface Env { ASSETS: Fetcher + AI: Ai + VECTORIZE_TOPICS: Vectorize + VECTORIZE_VOCABULARY: Vectorize url: URL GRAPHER_CONFIG_R2_BUCKET_URL: string GRAPHER_CONFIG_R2_BUCKET_FALLBACK_URL: string diff --git a/functions/api/ai-search/keywords/index.ts b/functions/api/ai-search/keywords/index.ts new file mode 100644 index 00000000000..2039889319c --- /dev/null +++ b/functions/api/ai-search/keywords/index.ts @@ -0,0 +1,346 @@ +import { Env } from "../../../_common/env.js" +import { validateQueryParams, COMMON_SEARCH_PARAMS } from "../utils.js" + +const LLM_MODEL = "@cf/meta/llama-3.1-8b-instruct-fast" +const EMBEDDING_MODEL = "@cf/baai/bge-base-en-v1.5" + +const VALID_PARAMS = new Set([ + COMMON_SEARCH_PARAMS.QUERY, + "limit", + "source", + "llm_filter", +]) + +const DEFAULT_LIMIT = 3 +const MAX_LIMIT = 10 +const VOCABULARY_CANDIDATES = 20 + +interface VocabularyCandidate { + keyword: string + topic_slug: string + topic_name: string + score: number +} + +interface KeywordsResponse { + query: string + keywords: string[] + timing: { + total_ms: number + search_ms?: number + llm_ms?: number + } + vocabulary?: { + candidates?: VocabularyCandidate[] + matched: number + } +} + +/** + * Generate embedding for a query using Workers AI + */ +async function generateQueryEmbedding( + ai: Ai, + query: string +): Promise { + interface EmbeddingResponse { + data: number[][] + } + + const response = (await ai.run(EMBEDDING_MODEL, { + text: [query], + })) as EmbeddingResponse + + return response.data[0] +} + +/** + * Search vocabulary using semantic similarity + */ +async function searchVocabulary( + env: Env, + query: string, + limit: number +): Promise { + const queryEmbedding = await generateQueryEmbedding(env.AI, query) + + const results = await env.VECTORIZE_VOCABULARY.query(queryEmbedding, { + topK: limit, + returnMetadata: "all", + }) + + return results.matches.map((match) => { + const metadata = match.metadata as unknown as { + keyword: string + topic_slug: string + topic_name: string + } + + return { + keyword: metadata.keyword, + topic_slug: metadata.topic_slug, + topic_name: metadata.topic_name, + score: match.score, + } + }) +} + +/** + * Use LLM to pick the most relevant keywords from vocabulary candidates. + */ +async function selectKeywordsWithLLM( + env: Env, + query: string, + vocabularyCandidates: VocabularyCandidate[], + limit: number +): Promise { + const vocabularyList = vocabularyCandidates + .map((c) => `- ${c.keyword}`) + .join("\n") + + const userMessage = `Rewrite this query into 1-${limit} keywords for searching Our World in Data charts. + +AVAILABLE VOCABULARY (you MUST choose from these terms ONLY): +${vocabularyList} + +Rules: +- Use ONLY terms from the vocabulary list above +- Select 1-${limit} most relevant terms +- Translate user's intent to exact vocabulary matches +- Do NOT return the original query "${query}" itself as a keyword +- Return ONLY a JSON array of strings + +Query: "${query}"` + + const response = (await ( + env.AI.run as ( + model: string, + options: object + ) => Promise<{ response?: string } | string> + )(LLM_MODEL, { + messages: [{ role: "user", content: userMessage }], + temperature: 0, + max_tokens: 200, + })) as { response?: string } | string + + const text = + typeof response === "string" ? response : response.response || "" + if (!text) { + throw new Error("Keywords LLM selection: empty response from model") + } + + console.log("Keywords LLM selection response:", text) + + const jsonMatch = text.match(/\[[\s\S]*?\]/) + if (!jsonMatch) { + throw new Error( + `Keywords LLM selection: no JSON array found in response: ${text}` + ) + } + + const keywords = JSON.parse(jsonMatch[0]) as string[] + if (!Array.isArray(keywords)) { + throw new Error( + `Keywords LLM selection: invalid array in response: ${jsonMatch[0]}` + ) + } + + return keywords +} + +/** + * Use LLM to generate keywords directly (no vocabulary constraints). + */ +async function generateKeywordsWithLLM( + env: Env, + query: string +): Promise { + const userMessage = `Rewrite this query into 1-5 keywords for searching Our World in Data charts. + +Rules: +- Use terms from OWID chart titles (e.g., "GDP per capita", "CO2 emissions", "fish supply") +- For vague terms, translate to metrics (e.g., "richer" → "GDP per capita") +- Do NOT add "rate", "level", "index" unless in original query +- NEVER include geographic names (countries, continents, regions) +- Do NOT return the original query "${query}" itself as a keyword + +Return ONLY a JSON array of strings. + +Query: "${query}" +` + + const response = (await ( + env.AI.run as ( + model: string, + options: object + ) => Promise<{ response?: string } | string> + )(LLM_MODEL, { + messages: [{ role: "user", content: userMessage }], + temperature: 0, + max_tokens: 200, + })) as { response?: string } | string + + const text = + typeof response === "string" ? response : response.response || "" + if (!text) { + throw new Error("Keywords LLM generation: empty response from model") + } + + console.log("Keywords LLM generation response:", text) + + const jsonMatch = text.match(/\[[\s\S]*?\]/) + if (!jsonMatch) { + throw new Error( + `Keywords LLM generation: no JSON array found in response: ${text}` + ) + } + + const keywords = JSON.parse(jsonMatch[0]) as string[] + if (!Array.isArray(keywords)) { + throw new Error( + `Keywords LLM generation: invalid array in response: ${jsonMatch[0]}` + ) + } + + return keywords +} + +export const onRequestGet: PagesFunction = async (context) => { + const { env, request } = context + const url = new URL(request.url) + const startTime = Date.now() + + try { + const validationError = validateQueryParams(url, VALID_PARAMS) + if (validationError) return validationError + + const query = url.searchParams.get(COMMON_SEARCH_PARAMS.QUERY) || "" + + if (!query) { + return new Response( + JSON.stringify({ + error: "Missing query parameter", + details: 'Please provide a "q" parameter', + }), + { + status: 400, + headers: { + "Content-Type": "application/json", + "Access-Control-Allow-Origin": "*", + }, + } + ) + } + + const limit = Math.min( + Math.max( + 1, + parseInt( + url.searchParams.get("limit") || DEFAULT_LIMIT.toString() + ) + ), + MAX_LIMIT + ) + + // source=semantic (default): use pre-indexed vocabulary via Vectorize + // source=llm: use LLM directly without vocabulary constraints + const source = (url.searchParams.get("source") || "semantic") as + | "semantic" + | "llm" + + // llm_filter (default true): use LLM to pick best keywords from candidates + const llmFilter = url.searchParams.get("llm_filter") !== "false" + + let keywords: string[] + let searchMs: number | undefined + let llmMs: number | undefined + let vocabularyCandidates: VocabularyCandidate[] | undefined + + if (source === "semantic") { + // Step 1: Semantic search to find vocabulary candidates + const searchStart = Date.now() + vocabularyCandidates = await searchVocabulary( + env, + query, + VOCABULARY_CANDIDATES + ) + searchMs = Date.now() - searchStart + + if (llmFilter) { + // Step 2: LLM picks the best keywords from candidates + const llmStart = Date.now() + keywords = await selectKeywordsWithLLM( + env, + query, + vocabularyCandidates, + limit + ) + llmMs = Date.now() - llmStart + } else { + // No LLM — return top semantic matches directly + keywords = vocabularyCandidates + .map((c) => c.keyword) + .slice(0, limit) + } + } else { + // source=llm — generate keywords directly without vocabulary + const llmStart = Date.now() + keywords = await generateKeywordsWithLLM(env, query) + llmMs = Date.now() - llmStart + } + + // Filter out keywords identical to the query + const queryLower = query.toLowerCase().trim() + keywords = keywords.filter( + (kw) => kw.toLowerCase().trim() !== queryLower + ) + + const endTime = Date.now() + + console.log( + `[AI Search keywords] query="${query}" | total=${endTime - startTime}ms | keywords=${keywords.join(", ")}` + ) + + const response: KeywordsResponse = { + query, + keywords, + timing: { + total_ms: endTime - startTime, + search_ms: searchMs, + llm_ms: llmMs, + }, + } + + if (source === "semantic" && vocabularyCandidates) { + response.vocabulary = { + candidates: vocabularyCandidates, + matched: vocabularyCandidates.length, + } + } + + return new Response(JSON.stringify(response, null, 2), { + headers: { + "Content-Type": "application/json", + "Cache-Control": "public, max-age=3600", + "Access-Control-Allow-Origin": "*", + }, + }) + } catch (error) { + console.error("Keywords endpoint error:", error) + + return new Response( + JSON.stringify({ + error: "Keyword suggestion failed", + message: + error instanceof Error ? error.message : "Unknown error", + }), + { + status: 500, + headers: { + "Content-Type": "application/json", + "Access-Control-Allow-Origin": "*", + }, + } + ) + } +} diff --git a/functions/api/ai-search/topics/index.ts b/functions/api/ai-search/topics/index.ts new file mode 100644 index 00000000000..8de2bb75610 --- /dev/null +++ b/functions/api/ai-search/topics/index.ts @@ -0,0 +1,424 @@ +import { Env } from "../../../_common/env.js" +import { + getBaseUrl, + validateQueryParams, + COMMON_SEARCH_PARAMS, +} from "../utils.js" + +const DEFAULT_LIMIT = 10 +const MAX_LIMIT = 50 +const SEMANTIC_CANDIDATES = 10 +const EMBEDDING_MODEL = "@cf/baai/bge-base-en-v1.5" +const LLM_MODEL = "@cf/meta/llama-3.1-8b-instruct-fast" +const DATASETTE_URL = "https://datasette-public.owid.io/owid.json" + +interface TopicData { + id: number + name: string + slug: string + excerpt: string +} + +interface DatasetteResponse { + ok: boolean + rows: Array<[number, string, string, string | null]> + error?: string | null +} + +// NOTE: Fetching topics from Datasette on cold start adds ~200-500ms latency. +// If this becomes a performance issue, consider switching back to a static +// topics.json file (generate with: npx tsx functions/scripts/generateTopicsJson.ts). +let topicsCache: TopicData[] | null = null +let topicsListCache: string | null = null + +async function getTopics(): Promise { + if (topicsCache) return topicsCache + + const sql = ` + SELECT t.id, t.name, t.slug, p.content->>'$.excerpt' AS excerpt + FROM tags t + JOIN posts_gdocs p ON t.slug = p.slug + WHERE t.slug IS NOT NULL + AND p.published = 1 + AND p.type IN ('topic-page', 'linear-topic-page') + ORDER BY t.name ASC + ` + const url = `${DATASETTE_URL}?sql=${encodeURIComponent(sql)}` + const response = await fetch(url) + + if (!response.ok) { + throw new Error( + `Datasette request failed: ${response.status} ${response.statusText}` + ) + } + + const data = (await response.json()) as DatasetteResponse + if (!data.ok || data.error) { + throw new Error( + `Datasette query error: ${data.error || "Unknown error"}` + ) + } + + topicsCache = data.rows.map(([id, name, slug, excerpt]) => ({ + id, + name, + slug, + excerpt: excerpt || "", + })) + return topicsCache +} + +async function getTopicsList(): Promise { + if (topicsListCache) return topicsListCache + const topics = await getTopics() + topicsListCache = topics.map((t) => `- ${t.name}`).join("\n") + return topicsListCache +} + +/** + * Topic hit for the API response + */ +interface TopicHit { + id: number + name: string + slug: string + excerpt: string + url: string + __position: number + score: number +} + +/** + * API response format + */ +interface TopicsApiResponse { + query: string + hits: TopicHit[] + nbHits: number + source: "semantic" | "llm" + timing_ms: number +} + +/** + * Generate embedding for a query using Workers AI + */ +async function generateQueryEmbedding( + ai: Ai, + query: string +): Promise { + interface EmbeddingResponse { + data: number[][] + } + + const response = (await ai.run(EMBEDDING_MODEL, { + text: [query], + })) as EmbeddingResponse + + return response.data[0] +} + +/** + * Search topics using Vectorize + */ +async function searchTopicsWithVectorize( + env: Env, + query: string, + limit: number, + baseUrl: string +): Promise { + // Generate embedding for the query + const queryEmbedding = await generateQueryEmbedding(env.AI, query) + + // Query Vectorize with the embedding + const results = await env.VECTORIZE_TOPICS.query(queryEmbedding, { + topK: limit, + returnMetadata: "all", + }) + + // Transform results to TopicHit format + const hits: TopicHit[] = results.matches.map((match, index) => { + const metadata = match.metadata as unknown as { + id: number + name: string + slug: string + excerpt: string + } + + return { + id: metadata.id, + name: metadata.name, + slug: metadata.slug, + excerpt: metadata.excerpt || "", + url: `${baseUrl}/${metadata.slug}`, + __position: index + 1, + score: match.score, + } + }) + + return hits +} + +/** + * Use LLM to filter semantic topic candidates to only strongly relevant ones. + */ +async function filterTopicsWithLLM( + env: Env, + query: string, + candidates: TopicHit[], + limit: number +): Promise { + const candidateList = candidates.map((c) => `- ${c.name}`).join("\n") + + const userMessage = `Given this search query: "${query}" + +These topic candidates were found via semantic search: +${candidateList} + +Select ONLY topics that are DIRECTLY and STRONGLY related to the query. +- Exclude topics that are only tangentially or loosely related +- Better to return fewer highly relevant topics than many weak ones +- If none are strongly relevant, return an empty array [] +- Maximum ${limit} topics +- Return ONLY a JSON array of topic names` + + const response = (await ( + env.AI.run as ( + model: string, + options: object + ) => Promise<{ response?: string } | string> + )(LLM_MODEL, { + messages: [{ role: "user", content: userMessage }], + temperature: 0, + max_tokens: 300, + })) as { response?: string } | string + + const text = + typeof response === "string" ? response : response.response || "" + if (!text) return candidates.slice(0, limit) + + console.log("Topics LLM filter response:", text) + + const jsonMatch = text.match(/\[[\s\S]*?\]/) + if (!jsonMatch) return candidates.slice(0, limit) + + try { + const selectedNames = JSON.parse(jsonMatch[0]) as string[] + const nameSet = new Set(selectedNames) + const filtered = candidates.filter((c) => nameSet.has(c.name)) + return filtered.slice(0, limit) + } catch { + return candidates.slice(0, limit) + } +} + +/** + * Recommend topics using LLM + */ +async function recommendTopicsWithLLM( + env: Env, + query: string, + limit: number, + baseUrl: string +): Promise { + const topicsList = await getTopicsList() + const systemMessage = `Here are all available topics:\n${topicsList}` + const userMessage = `Given this query: "${query}" + +Recommend ONLY topics that are DIRECTLY and STRONGLY related to this query. +- Only include topics where the connection is obvious and immediate +- If a topic is only tangentially or indirectly related, exclude it +- Better to return 1-3 highly relevant topics than many loosely related ones +- If no topics are strongly relevant, return an empty array [] +- Maximum ${limit} topics +- Return ONLY a JSON array of topic names` + + let response: any + try { + // Try llama-3.1-8b-instruct-fast which is more commonly available + response = await env.AI.run( + "@cf/meta/llama-3.1-8b-instruct-fast" as any, + { + messages: [ + { role: "system", content: systemMessage }, + { role: "user", content: userMessage }, + ], + temperature: 0.1, + max_tokens: 500, + } + ) + } catch (error) { + console.error("LLM API error:", error) + return [] + } + + // Parse LLM response - extract JSON array + const text = + typeof response === "string" ? response : response.response || "" + if (!text || typeof text !== "string") { + console.log("LLM response type issue:", typeof response, response) + return [] + } + + console.log("LLM response text:", text) + + const jsonMatch = text.match(/\[[\s\S]*?\]/) + if (!jsonMatch) { + console.log("No JSON array found in LLM response") + return [] + } + + try { + const recommendedNames = JSON.parse(jsonMatch[0]) as string[] + console.log("Recommended names:", recommendedNames) + + // Map names back to topic objects + const topics = await getTopics() + const hits = recommendedNames + .map((name, index) => { + const topic = topics.find((t) => t.name === name) + if (!topic) return null + return { + id: topic.id, + name: topic.name, + slug: topic.slug, + excerpt: topic.excerpt, + url: `${baseUrl}/${topic.slug}`, + __position: index + 1, + score: 1.0 - index * 0.05, // Synthetic score + } + }) + .filter((hit): hit is TopicHit => hit !== null) + .slice(0, limit) + + return hits + } catch { + return [] + } +} + +// Valid query parameter names for this endpoint +const VALID_PARAMS = new Set([ + COMMON_SEARCH_PARAMS.QUERY, // "q" + "limit", + "source", + "llm_filter", +]) + +export const onRequestGet: PagesFunction = async (context) => { + const { env, request } = context + const url = new URL(request.url) + const baseUrl = getBaseUrl(request) + + const startTime = performance.now() + + try { + // Validate query parameters - reject unknown params to catch typos + const validationError = validateQueryParams(url, VALID_PARAMS) + if (validationError) return validationError + + // Parse query parameter + const query = url.searchParams.get(COMMON_SEARCH_PARAMS.QUERY) || "" + + // source=semantic (default): use Vectorize embeddings + // source=llm: use LLM to recommend topics + const source = (url.searchParams.get("source") || "semantic") as + | "semantic" + | "llm" + if (source !== "semantic" && source !== "llm") { + return new Response( + JSON.stringify({ + error: "Invalid source parameter", + details: "source must be 'semantic' or 'llm'", + }), + { + status: 400, + headers: { + "Content-Type": "application/json", + "Access-Control-Allow-Origin": "*", + }, + } + ) + } + + // Parse limit parameter + const limit = Math.min( + Math.max( + 1, + parseInt( + url.searchParams.get("limit") || DEFAULT_LIMIT.toString() + ) + ), + MAX_LIMIT + ) + + // llm_filter (default true): use LLM to filter semantic candidates + const llmFilter = url.searchParams.get("llm_filter") !== "false" + + let response: TopicsApiResponse + + if (source === "llm") { + // LLM-based recommendations + const hits = await recommendTopicsWithLLM( + env, + query, + limit, + baseUrl + ) + const timing_ms = Math.round(performance.now() - startTime) + response = { + query, + hits, + nbHits: hits.length, + source: "llm", + timing_ms, + } + } else { + // Semantic search via Vectorize + const candidateLimit = llmFilter ? SEMANTIC_CANDIDATES : limit + const candidates = await searchTopicsWithVectorize( + env, + query, + candidateLimit, + baseUrl + ) + + const hits = llmFilter + ? await filterTopicsWithLLM(env, query, candidates, limit) + : candidates + + const timing_ms = Math.round(performance.now() - startTime) + response = { + query, + hits, + nbHits: hits.length, + source: "semantic", + timing_ms, + } + } + + return new Response(JSON.stringify(response, null, 2), { + headers: { + "Content-Type": "application/json", + "Cache-Control": "public, max-age=60", + "Access-Control-Allow-Origin": "*", + }, + }) + } catch (error) { + console.error("AI Search topics error:", error) + + return new Response( + JSON.stringify({ + error: "AI Search failed", + message: + error instanceof Error ? error.message : "Unknown error", + }), + { + status: 500, + headers: { + "Content-Type": "application/json", + "Access-Control-Allow-Origin": "*", + }, + } + ) + } +} diff --git a/functions/api/ai-search/utils.ts b/functions/api/ai-search/utils.ts new file mode 100644 index 00000000000..1a4e13afc44 --- /dev/null +++ b/functions/api/ai-search/utils.ts @@ -0,0 +1,345 @@ +import { + SearchUrlParam, + ChartRecordType, + GrapherTabName, +} from "@ourworldindata/types" + +/** + * Determine base URL from forwarded headers (when behind proxy) or fall back to request URL origin. + * Checks X-Forwarded-Host and X-Forwarded-Proto headers that reverse proxies typically set. + */ +export function getBaseUrl(request: Request): string { + const forwardedHost = request.headers.get("X-Forwarded-Host") + const forwardedProto = request.headers.get("X-Forwarded-Proto") || "https" + if (forwardedHost) { + return `${forwardedProto}://${forwardedHost}` + } + return new URL(request.url).origin +} + +/** + * Validate query parameters against a set of valid parameter names. + * Returns an error Response if invalid parameters are found, otherwise null. + */ +export function validateQueryParams( + url: URL, + validParams: Set +): Response | null { + const invalidParams = [...url.searchParams.keys()].filter( + (key) => !validParams.has(key) + ) + + if (invalidParams.length > 0) { + return new Response( + JSON.stringify({ + error: "Invalid query parameters", + details: `Unknown parameters: ${invalidParams.join(", ")}. Valid parameters are: ${[...validParams].join(", ")}`, + }), + { + status: 400, + headers: { + "Content-Type": "application/json", + "Access-Control-Allow-Origin": "*", + }, + } + ) + } + + return null +} + +/** + * Common valid query parameters shared across AI search endpoints + */ +export const COMMON_SEARCH_PARAMS = { + QUERY: SearchUrlParam.QUERY, // "q" + COUNTRY: SearchUrlParam.COUNTRY, // "countries" + TOPIC: SearchUrlParam.TOPIC, // "topics" + REQUIRE_ALL_COUNTRIES: SearchUrlParam.REQUIRE_ALL_COUNTRIES, // "requireAllCountries" +} as const + +/** + * AI Search response shape from Cloudflare + */ +export interface AISearchResult { + file_id: string + filename: string + score: number + attributes: Record + content: Array<{ + type: string + text: string + }> +} + +export interface AISearchResponse { + data: AISearchResult[] + has_more: boolean +} + +/** + * AI Search aiSearch() response shape (includes generated response) + */ +export interface AISearchAnswerResponse extends AISearchResponse { + response: string + search_query: string +} + +/** + * Streaming chunk from aiSearch() with stream: true + * The stream is NDJSON format with partial response and source data + */ +export interface AISearchStreamChunk { + response?: string // Partial generated text + data?: AISearchResult[] // Source documents (typically in first chunk) + search_query?: string + has_more?: boolean +} + +/** + * Minimal chart info for LLM context (keeps token usage low). + */ +export interface ChartInfoForLLM { + title: string + slug: string + subtitle?: string +} + +/** + * Full chart info matching EnrichedSearchChartHit from /api/search. + * Used for API responses to enable frontend reuse. + * Note: publishedAt/updatedAt are retrieved from Algolia but not in SearchChartHit type. + */ +export interface ChartInfo { + title: string + slug: string + url: string + subtitle?: string + variantName?: string + type: ChartRecordType + queryParams?: string + availableEntities: string[] + originalAvailableEntities?: string[] + availableTabs: GrapherTabName[] + publishedAt?: string + updatedAt?: string +} + +/** + * Extract text content from Cloudflare Workers AI response. + * Handles various response formats: string, { response }, { text }, { content }, + * and OpenAI-compatible { choices[0].message.content }. + */ +export function extractTextFromCFResponse(response: unknown): string { + if (typeof response === "string") { + return response + } + + if (response && typeof response === "object") { + const resp = response as Record + + // CF models may return array directly or as response field + if (Array.isArray(resp.response)) { + return JSON.stringify(resp.response) + } + if (typeof resp.response === "string") { + return resp.response + } + if (typeof resp.text === "string") { + return resp.text + } + if (typeof resp.content === "string") { + return resp.content + } + + // OpenAI-like format + if (resp.choices && Array.isArray(resp.choices) && resp.choices[0]) { + const choice = resp.choices[0] as Record + if (choice.message && typeof choice.message === "object") { + const msg = choice.message as Record + if (typeof msg.content === "string") { + return msg.content + } + } + } + } + + return "" +} + +/** + * Extract a JSON array from LLM text output. + * Returns the parsed array or null if parsing fails. + */ +export function extractJsonArray(text: string): T[] | null { + try { + const jsonMatch = text.match(/\[[\s\S]*\]/) + if (jsonMatch) { + const parsed = JSON.parse(jsonMatch[0]) + if (Array.isArray(parsed)) { + return parsed as T[] + } + } + } catch { + // Parsing failed + } + return null +} + +// ============================================================================= +// Semantic Search +// ============================================================================= + +/** Name of the AI Search instance in Cloudflare dashboard */ +const AI_SEARCH_INSTANCE_NAME = "search-charts" + +/** Chart metadata stored in R2 */ +interface ChartData { + type: string + slug: string + variantName: string + availableTabs: string[] + queryParams: string + publishedAt: string + updatedAt: string + views_7d: number + views_14d: number + views_365d: number + fmRank?: number + tag1?: string + tag2?: string + tag3?: string + tag4?: string +} + +function extractObjectIDFromFilename(filename: string): string { + return filename + .replace(/^(charts|explorers|mdim)\//, "") + .replace(/\.md$/, "") +} + +function extractTitleFromContent(text: string): string { + const match = text.match(/^#\s+(.+)$/m) + return match?.[1] ?? "" +} + +function extractSubtitleFromContent(text: string): string | undefined { + const lines = text.split("\n") + for (const rawLine of lines) { + const line = rawLine.trim() + if ( + !line || + line.startsWith("#") || + line.startsWith("**") || + line.startsWith("- ") + ) { + continue + } + return line + } + return undefined +} + +function parseChartData(result: AISearchResult): Partial { + const fileAttr = result.attributes.file as + | { chartdata?: string; type?: string; slug?: string } + | undefined + + const chartdataStr = fileAttr?.chartdata + if (chartdataStr && typeof chartdataStr === "string") { + try { + const decoded = atob(chartdataStr) + return JSON.parse(decoded) as ChartData + } catch { + // Fall through to legacy handling + } + } + + return { + type: (fileAttr?.type as string) ?? "chart", + slug: fileAttr?.slug as string | undefined, + } +} + +function calculateCombinedScore( + aiSearchScore: number, + fmRank: number | undefined, + views7d: number | undefined +): number { + const relevanceScore = aiSearchScore + const fmBoost = fmRank ? Math.max(0, 0.33 - fmRank * 0.03) : 0 + const viewsBoost = views7d ? 0.01 * Math.log10(views7d + 1) : 0 + return relevanceScore + fmBoost + viewsBoost +} + +/** + * Transform a single AI Search result to ChartInfo format. + */ +function transformAISearchResultToChartInfo( + result: AISearchResult, + baseUrl: string +): ChartInfo { + const objectID = extractObjectIDFromFilename(result.filename) + const text = result.content[0]?.text ?? "" + const title = extractTitleFromContent(text) + const subtitle = extractSubtitleFromContent(text) + const chartData = parseChartData(result) + + const slug = chartData.slug || objectID + const isExplorer = chartData.type === "explorerView" + const urlPath = isExplorer ? "explorers" : "grapher" + const queryParams = chartData.queryParams || "" + const url = `${baseUrl}/${urlPath}/${slug}${queryParams}` + + const type = (chartData.type as ChartRecordType) ?? "chart" + const availableTabs = (chartData.availableTabs as GrapherTabName[]) ?? [] + + return { + title, + slug, + url, + subtitle, + variantName: chartData.variantName, + type, + queryParams: chartData.queryParams, + availableEntities: [], + availableTabs, + publishedAt: chartData.publishedAt, + updatedAt: chartData.updatedAt, + } +} + +/** + * Perform a single semantic search using Cloudflare AI Search. + * Returns charts sorted by combined score (semantic relevance + FM rank + pageviews). + */ +export async function searchChartsSemantic( + ai: Ai, + query: string, + maxResults: number, + baseUrl: string +): Promise<{ query: string; charts: ChartInfo[] }> { + const results = (await ai.autorag(AI_SEARCH_INSTANCE_NAME).search({ + query, + max_num_results: Math.max(maxResults, Math.min(maxResults + 10, 50)), + ranking_options: { score_threshold: 0.1 }, + })) as AISearchResponse + + // Transform and score results + const scoredCharts = results.data.map((result) => { + const chart = transformAISearchResultToChartInfo(result, baseUrl) + const chartData = parseChartData(result) + const score = calculateCombinedScore( + result.score, + chartData.fmRank, + chartData.views_7d + ) + return { chart, score } + }) + + // Sort by score and take top results + scoredCharts.sort((a, b) => b.score - a.score) + const charts = scoredCharts.slice(0, maxResults).map((sc) => sc.chart) + + return { query, charts } +} diff --git a/functions/scripts/indexTopics.ts b/functions/scripts/indexTopics.ts new file mode 100644 index 00000000000..6c77e962a0e --- /dev/null +++ b/functions/scripts/indexTopics.ts @@ -0,0 +1,243 @@ +/** + * Cloudflare Worker script to index topics into Vectorize. + * + * This script: + * 1. Fetches topics from Datasette (public MySQL read-only API) + * 2. Generates embeddings using Workers AI + * 3. Upserts vectors with metadata to Vectorize + * + * Usage: + * wrangler dev --remote functions/scripts/indexTopics.ts + * # or deploy and invoke via HTTP: + * wrangler deploy functions/scripts/indexTopics.ts --name index-topics + * curl "https://index-topics..workers.dev" + * curl "https://index-topics..workers.dev?slug=climate-change" + */ + +import { Env } from "../_common/env.js" + +interface TopicData { + id: number + name: string + slug: string + excerpt: string | null + markdown: string | null +} + +interface DatasetteResponse { + ok: boolean + rows: Array<[number, string, string, string | null, string | null]> + error?: string | null +} + +const EMBEDDING_MODEL = "@cf/baai/bge-base-en-v1.5" +const EMBEDDING_BATCH_SIZE = 100 +const DATASETTE_URL = "https://datasette-public.owid.io/owid.json" + +/** + * Build the SQL query to fetch topics from Datasette. + * Matches the logic in db.getTopicsForAISearch() + * Note: Datasette uses DuckDB, so we use DuckDB JSON syntax (->>) instead of MySQL's JSON_UNQUOTE(JSON_EXTRACT()) + */ +function buildTopicsQuery(slugFilter?: string): string { + const baseQuery = ` + SELECT + t.id, + t.name, + t.slug, + p.content->>'$.excerpt' AS excerpt, + p.markdown + FROM tags t + JOIN posts_gdocs p ON t.slug = p.slug + WHERE + t.slug IS NOT NULL + AND p.published = 1 + AND p.type IN ('topic-page', 'linear-topic-page') + ` + + if (slugFilter) { + return `${baseQuery} AND t.slug = '${slugFilter}' ORDER BY t.name ASC` + } + + return `${baseQuery} ORDER BY t.name ASC` +} + +/** + * Fetch topics from Datasette API + */ +async function fetchTopicsFromDatasette( + slugFilter?: string +): Promise { + const sql = buildTopicsQuery(slugFilter) + const url = `${DATASETTE_URL}?sql=${encodeURIComponent(sql)}` + + console.log("Fetching topics from Datasette...") + const response = await fetch(url) + + if (!response.ok) { + throw new Error( + `Datasette request failed: ${response.status} ${response.statusText}` + ) + } + + const data = (await response.json()) as DatasetteResponse + + // Check for Datasette-level errors + if (!data.ok || data.error) { + throw new Error( + `Datasette query error: ${data.error || "Unknown error"}` + ) + } + + // Map rows to TopicData objects + return data.rows.map(([id, name, slug, excerpt, markdown]) => ({ + id, + name, + slug, + excerpt, + markdown, + })) +} + +/** + * Generate embeddings for texts using Workers AI. + * Supports batch processing. + */ +async function generateEmbeddings( + ai: Ai, + texts: string[] +): Promise { + interface EmbeddingResponse { + data: number[][] + } + + const response = (await ai.run(EMBEDDING_MODEL, { + text: texts, + })) as EmbeddingResponse + + return response.data +} + +/** + * Upsert vectors to Vectorize with metadata + */ +async function upsertTopicsToVectorize( + vectorize: Vectorize, + topics: TopicData[], + embeddings: number[][] +): Promise { + const vectors = topics.map((topic, i) => ({ + id: topic.slug, // Use slug as vector ID for easy lookup + values: embeddings[i], + metadata: { + id: topic.id, + name: topic.name, + slug: topic.slug, + excerpt: topic.excerpt || "", + }, + })) + + await vectorize.upsert(vectors) +} + +/** + * Main handler for the Cloudflare Worker + */ +async function handleRequest(request: Request, env: Env): Promise { + const url = new URL(request.url) + + // Support slug filter via query param + const slugFilter = url.searchParams.get("slug") || undefined + + try { + const startTime = Date.now() + + // Step 1: Fetch topics from Datasette + const topics = await fetchTopicsFromDatasette(slugFilter) + + if (topics.length === 0) { + return new Response( + JSON.stringify({ + error: `No topics found${slugFilter ? ` with slug: ${slugFilter}` : ""}`, + }), + { + status: 404, + headers: { "Content-Type": "application/json" }, + } + ) + } + + console.log(`Fetched ${topics.length} topics from Datasette`) + + // Step 2: Generate embeddings in batches + const allEmbeddings: number[][] = [] + + for (let i = 0; i < topics.length; i += EMBEDDING_BATCH_SIZE) { + const batch = topics.slice(i, i + EMBEDDING_BATCH_SIZE) + const batchTexts = batch.map((t) => t.name) + + console.log( + `Generating embeddings for batch ${Math.floor(i / EMBEDDING_BATCH_SIZE) + 1}/${Math.ceil(topics.length / EMBEDDING_BATCH_SIZE)}...` + ) + + const batchEmbeddings = await generateEmbeddings(env.AI, batchTexts) + allEmbeddings.push(...batchEmbeddings) + } + + console.log(`Generated ${allEmbeddings.length} embeddings`) + + // Step 3: Upsert to Vectorize in batches + for (let i = 0; i < topics.length; i += EMBEDDING_BATCH_SIZE) { + const batch = topics.slice(i, i + EMBEDDING_BATCH_SIZE) + const batchEmbeddings = allEmbeddings.slice( + i, + i + EMBEDDING_BATCH_SIZE + ) + + console.log( + `Upserting batch ${Math.floor(i / EMBEDDING_BATCH_SIZE) + 1}/${Math.ceil(topics.length / EMBEDDING_BATCH_SIZE)} to Vectorize...` + ) + + await upsertTopicsToVectorize( + env.VECTORIZE_TOPICS, + batch, + batchEmbeddings + ) + } + + const elapsedTime = Date.now() - startTime + + return new Response( + JSON.stringify({ + success: true, + message: `Successfully indexed ${topics.length} topics`, + topics: topics.map((t) => ({ id: t.id, slug: t.slug })), + elapsed_ms: elapsedTime, + }), + { + status: 200, + headers: { "Content-Type": "application/json" }, + } + ) + } catch (error) { + console.error("Error indexing topics:", error) + + return new Response( + JSON.stringify({ + error: "Failed to index topics", + details: error instanceof Error ? error.message : String(error), + }), + { + status: 500, + headers: { "Content-Type": "application/json" }, + } + ) + } +} + +// Export as ES module worker +export default { + async fetch(request: Request, env: Env): Promise { + return handleRequest(request, env) + }, +} diff --git a/functions/scripts/indexVocabulary.ts b/functions/scripts/indexVocabulary.ts new file mode 100644 index 00000000000..408941d9391 --- /dev/null +++ b/functions/scripts/indexVocabulary.ts @@ -0,0 +1,268 @@ +/** + * Cloudflare Worker script to index vocabulary keywords into Vectorize. + * + * This script: + * 1. Fetches vocabulary data from https://owid-public.owid.io/topic_vocabulary.json + * 2. Generates embeddings using Workers AI + * 3. Upserts vectors with metadata to Vectorize + * + * Usage: + * wrangler dev --remote functions/scripts/indexVocabulary.ts + * # or deploy and invoke via HTTP: + * wrangler deploy functions/scripts/indexVocabulary.ts --name index-vocabulary + * curl "https://index-vocabulary..workers.dev" + */ + +import { Env } from "../_common/env.js" + +const VOCABULARY_URL = "https://owid-public.owid.io/topic_vocabulary.json" + +interface TopicVocabulary { + topic_slug: string + topic_name: string + keywords: string[] + stats: { + num_charts_texts: number + num_keywords: number + input_tokens: number + output_tokens: number + total_cost_usd: number + } +} + +interface VocabularyData { + [topicSlug: string]: TopicVocabulary +} + +const EMBEDDING_MODEL = "@cf/baai/bge-base-en-v1.5" +const EMBEDDING_BATCH_SIZE = 100 + +/** + * Fetch vocabulary data from the public URL + */ +async function fetchVocabularyData(): Promise { + const response = await fetch(VOCABULARY_URL) + if (!response.ok) { + throw new Error( + `Failed to fetch vocabulary data: ${response.status} ${response.statusText}` + ) + } + return (await response.json()) as VocabularyData +} + +/** + * Normalize a keyword for consistent matching + */ +function normalizeKeyword(keyword: string): string { + return keyword.toLowerCase().trim() +} + +/** + * Generate a unique ID from topic and keyword for use as vector ID + * Vectorize has a 64-byte limit, so we hash to ensure short IDs + * Format: {topic_slug_prefix}-{hash} + */ +function generateVectorId(topicSlug: string, keyword: string): string { + // Generate hash from the full combined string for uniqueness + const combined = `${topicSlug}::${keyword}` + let hash = 0 + for (let i = 0; i < combined.length; i++) { + hash = ((hash << 5) - hash + combined.charCodeAt(i)) | 0 + } + // Convert to positive hex string + const hashStr = (hash >>> 0).toString(16).padStart(8, "0") + + // Use first 50 chars of topic slug (leaving room for hash and separator) + // Format: {topic-slug-prefix}-{hash} (max 50 + 1 + 8 = 59 bytes) + const topicPrefix = topicSlug.slice(0, 50) + return `${topicPrefix}-${hashStr}` +} + +/** + * Generate embeddings for keywords using Workers AI + */ +async function generateEmbeddings( + ai: Ai, + keywords: string[] +): Promise { + interface EmbeddingResponse { + data: number[][] + } + + const response = (await ai.run(EMBEDDING_MODEL, { + text: keywords, + })) as EmbeddingResponse + + return response.data +} + +/** + * Upsert keywords to Vectorize with metadata + */ +async function upsertKeywordsToVectorize( + vectorize: Vectorize, + keywords: Array<{ + keyword: string + topic_slug: string + topic_name: string + }>, + embeddings: number[][] +): Promise { + const vectors = keywords.map((item, i) => ({ + id: generateVectorId(item.topic_slug, item.keyword), + values: embeddings[i], + metadata: { + keyword: item.keyword, + topic_slug: item.topic_slug, + topic_name: item.topic_name, + normalized: normalizeKeyword(item.keyword), + }, + })) + + console.log( + `Upserting ${vectors.length} vectors. First ID: ${vectors[0]?.id}, embedding length: ${vectors[0]?.values.length}` + ) + + const result = await vectorize.upsert(vectors) + console.log(`Upsert result:`, result) +} + +/** + * Main handler for the Cloudflare Worker + */ +async function handleRequest(request: Request, env: Env): Promise { + const url = new URL(request.url) + + // Support topic filter via query param + const topicFilter = url.searchParams.get("topic") || undefined + + try { + const startTime = Date.now() + + // Fetch vocabulary data from public URL + const vocabularyData = await fetchVocabularyData() + + // Filter vocabulary by topic if specified + const filteredVocabulary = topicFilter + ? Object.fromEntries( + Object.entries(vocabularyData).filter( + ([slug]) => slug === topicFilter + ) + ) + : vocabularyData + + // Flatten vocabulary into keyword-topic pairs + const keywordPairs: Array<{ + keyword: string + topic_slug: string + topic_name: string + }> = [] + for (const [topicSlug, topicData] of Object.entries( + filteredVocabulary + )) { + for (const keyword of topicData.keywords) { + keywordPairs.push({ + keyword, + topic_slug: topicSlug, + topic_name: topicData.topic_name, + }) + } + } + + if (keywordPairs.length === 0) { + return new Response( + JSON.stringify({ + error: `No keywords found${topicFilter ? ` for topic: ${topicFilter}` : ""}`, + }), + { + status: 404, + headers: { "Content-Type": "application/json" }, + } + ) + } + + console.log( + `Indexing ${keywordPairs.length} keywords across ${Object.keys(filteredVocabulary).length} topics` + ) + + // Generate embeddings in batches + const allEmbeddings: number[][] = [] + + for (let i = 0; i < keywordPairs.length; i += EMBEDDING_BATCH_SIZE) { + const batch = keywordPairs.slice(i, i + EMBEDDING_BATCH_SIZE) + const batchKeywords = batch.map((kp) => kp.keyword) + + console.log( + `Generating embeddings for batch ${Math.floor(i / EMBEDDING_BATCH_SIZE) + 1}/${Math.ceil(keywordPairs.length / EMBEDDING_BATCH_SIZE)}...` + ) + + const batchEmbeddings = await generateEmbeddings( + env.AI, + batchKeywords + ) + allEmbeddings.push(...batchEmbeddings) + } + + console.log(`Generated ${allEmbeddings.length} embeddings`) + + // Upsert to Vectorize in batches (same as topics indexing) + for (let i = 0; i < keywordPairs.length; i += EMBEDDING_BATCH_SIZE) { + const batch = keywordPairs.slice(i, i + EMBEDDING_BATCH_SIZE) + const batchEmbeddings = allEmbeddings.slice( + i, + i + EMBEDDING_BATCH_SIZE + ) + + console.log( + `Upserting batch ${Math.floor(i / EMBEDDING_BATCH_SIZE) + 1}/${Math.ceil(keywordPairs.length / EMBEDDING_BATCH_SIZE)} to Vectorize...` + ) + + await upsertKeywordsToVectorize( + env.VECTORIZE_VOCABULARY, + batch, + batchEmbeddings + ) + } + + const elapsedTime = Date.now() - startTime + + return new Response( + JSON.stringify({ + success: true, + message: `Successfully indexed ${keywordPairs.length} keywords`, + topics: Object.keys(filteredVocabulary), + keywords_per_topic: Object.fromEntries( + Object.entries(filteredVocabulary).map(([slug, data]) => [ + slug, + data.keywords.length, + ]) + ), + elapsed_ms: elapsedTime, + }), + { + status: 200, + headers: { "Content-Type": "application/json" }, + } + ) + } catch (error) { + console.error("Error indexing vocabulary:", error) + + return new Response( + JSON.stringify({ + error: "Failed to index vocabulary", + details: error instanceof Error ? error.message : String(error), + }), + { + status: 500, + headers: { "Content-Type": "application/json" }, + } + ) + } +} + +// Export as ES module worker +export default { + async fetch(request: Request, env: Env): Promise { + return handleRequest(request, env) + }, +} diff --git a/site/owid.scss b/site/owid.scss index 893f60701e3..437b277e30f 100644 --- a/site/owid.scss +++ b/site/owid.scss @@ -79,6 +79,8 @@ @import "./search/SearchResultHeader.scss"; @import "./search/SearchNoResults.scss"; @import "./search/SearchDetectedFilters.scss"; +@import "./search/SearchSuggestedTopics.scss"; +@import "./search/SearchSuggestedKeywords.scss"; @import "./search/skeletons.scss"; @import "./search/SearchResultHeaderSkeleton.scss"; @import "./search/SearchWritingResultsSkeleton.scss"; diff --git a/site/search/Search.tsx b/site/search/Search.tsx index 99e6d084bec..f4d4adc11c1 100644 --- a/site/search/Search.tsx +++ b/site/search/Search.tsx @@ -30,6 +30,8 @@ import { SearchTemplatesData } from "./SearchTemplatesData.js" import { SearchTemplatesWriting } from "./SearchTemplatesWriting.js" import { SearchNoResults } from "./SearchNoResults.js" import { SearchDetectedFilters } from "./SearchDetectedFilters.js" +import { SearchSuggestedTopics } from "./SearchSuggestedTopics.js" +import { SearchSuggestedKeywords } from "./SearchSuggestedKeywords.js" import { buildSynonymMap } from "./synonymUtils.js" import { SiteAnalytics } from "../SiteAnalytics.js" import { PoweredBy } from "react-instantsearch" @@ -122,6 +124,8 @@ export const Search = ({ + +
diff --git a/site/search/SearchNoResults.tsx b/site/search/SearchNoResults.tsx index dc840e8b888..72adbae7edd 100644 --- a/site/search/SearchNoResults.tsx +++ b/site/search/SearchNoResults.tsx @@ -8,11 +8,10 @@ export const SearchNoResults = () => { className="search-no-results__icon" icon={faSearch} /> -

- There are no results for this query. -

+

No results found for this query.

- Try searching for something else or removing some filters. + Try exploring a suggested topic above, or search for something + else.

) diff --git a/site/search/SearchSuggestedKeywords.scss b/site/search/SearchSuggestedKeywords.scss new file mode 100644 index 00000000000..24aed455dd2 --- /dev/null +++ b/site/search/SearchSuggestedKeywords.scss @@ -0,0 +1,21 @@ +.search-suggested-keywords { + display: flex; + flex-wrap: wrap; + gap: 8px; + margin-top: 8px; + align-items: center; +} + +.search-suggested-keywords__label { + @include body-3-medium; + font-style: italic; + color: $blue-60; +} + +.search-suggested-keyword-button { + background: transparent; + border: none; + padding: 0; + cursor: pointer; + text-align: left; +} diff --git a/site/search/SearchSuggestedKeywords.tsx b/site/search/SearchSuggestedKeywords.tsx new file mode 100644 index 00000000000..91a3091d1c0 --- /dev/null +++ b/site/search/SearchSuggestedKeywords.tsx @@ -0,0 +1,58 @@ +import { useQuery } from "@tanstack/react-query" +import { useSearchContext } from "./SearchContext.js" +import { SearchFilterPill } from "./SearchFilterPill.js" +import { FontAwesomeIcon } from "@fortawesome/react-fontawesome" +import { faSearch } from "@fortawesome/free-solid-svg-icons" + +interface RewriteResponse { + keywords: string[] +} + +async function fetchSuggestedKeywords(query: string): Promise { + const params = new URLSearchParams({ q: query }) + const response = await fetch(`/api/ai-search/keywords?${params}`) + return response.json() +} + +export const SearchSuggestedKeywords = () => { + const { + state: { query }, + actions: { setQuery }, + } = useSearchContext() + + const enabled = query.length > 0 + + const { data } = useQuery({ + queryKey: ["suggestedKeywords", query], + queryFn: () => fetchSuggestedKeywords(query), + enabled, + staleTime: 60_000, + }) + + const keywords = data?.keywords + if (!keywords?.length) return null + + return ( +
+ Try also + {keywords.map((keyword) => ( + + ))} +
+ ) +} diff --git a/site/search/SearchSuggestedTopics.scss b/site/search/SearchSuggestedTopics.scss new file mode 100644 index 00000000000..40905a2f09a --- /dev/null +++ b/site/search/SearchSuggestedTopics.scss @@ -0,0 +1,21 @@ +.search-suggested-topics { + display: flex; + flex-wrap: wrap; + gap: 8px; + margin-top: 8px; + align-items: center; +} + +.search-suggested-topics__label { + @include body-3-medium; + font-style: italic; + color: $blue-60; +} + +.search-suggested-topic-button { + background: transparent; + border: none; + padding: 0; + cursor: pointer; + text-align: left; +} diff --git a/site/search/SearchSuggestedTopics.tsx b/site/search/SearchSuggestedTopics.tsx new file mode 100644 index 00000000000..928fff30534 --- /dev/null +++ b/site/search/SearchSuggestedTopics.tsx @@ -0,0 +1,66 @@ +import { useQuery } from "@tanstack/react-query" +import { FilterType } from "@ourworldindata/types" +import { useSearchContext } from "./SearchContext.js" +import { getFilterIcon, getFilterNamesOfType } from "./searchUtils.js" +import { SearchFilterPill } from "./SearchFilterPill.js" + +interface TopicHit { + name: string + slug: string + score: number +} + +interface TopicsApiResponse { + hits: TopicHit[] +} + +async function fetchSuggestedTopics(query: string): Promise { + const params = new URLSearchParams({ q: query, limit: "3" }) + const response = await fetch(`/api/ai-search/topics?${params}`) + return response.json() +} + +export const SearchSuggestedTopics = () => { + const { + state: { filters, query }, + actions: { setTopicAndClearQuery }, + } = useSearchContext() + + const hasTopicFilter = + getFilterNamesOfType(filters, FilterType.TOPIC).size > 0 + + const enabled = query.length > 0 && !hasTopicFilter + + const { data } = useQuery({ + queryKey: ["suggestedTopics", query], + queryFn: () => fetchSuggestedTopics(query), + enabled, + staleTime: 60_000, + }) + + const hits = data?.hits + if (!hits?.length) return null + + return ( +
+ Browse topic + {hits.map((hit) => ( + + ))} +
+ ) +} diff --git a/wrangler.jsonc b/wrangler.jsonc index a4211739af0..711592ea613 100644 --- a/wrangler.jsonc +++ b/wrangler.jsonc @@ -14,6 +14,26 @@ "pages_build_output_dir": "./localBake", + // Workers AI binding for search + "ai": { + "binding": "AI", + "remote": true, + }, + + // Vectorize bindings for semantic search + "vectorize": [ + { + "binding": "VECTORIZE_TOPICS", + "index_name": "topics", + "remote": true, + }, + { + "binding": "VECTORIZE_VOCABULARY", + "index_name": "vocabulary", + "remote": true, + }, + ], + "r2_buckets": [ { "bucket_name": "owid-user-surveys-staging", @@ -31,6 +51,9 @@ }, "env": { "preview": { + "ai": { + "binding": "AI", + }, "r2_buckets": [ { "bucket_name": "owid-user-surveys-staging", @@ -53,6 +76,9 @@ }, "production": { // Overrides for CF production deployment + "ai": { + "binding": "AI", + }, "compatibility_date": "2025-05-05", "r2_buckets": [ {