diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index 1de11777..80dce092 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -1,9 +1,9 @@ --- name: Bug report about: Create an issue to help us fix bugs -title: '' +title: "" labels: bug -assignees: '' +assignees: "" --- **Describe the bug** diff --git a/.github/ISSUE_TEMPLATE/custom.md b/.github/ISSUE_TEMPLATE/custom.md index 96a47352..babf9b2c 100644 --- a/.github/ISSUE_TEMPLATE/custom.md +++ b/.github/ISSUE_TEMPLATE/custom.md @@ -1,7 +1,7 @@ --- name: Custom issue template about: Describe this issue template's purpose here. -title: '' -labels: '' -assignees: '' +title: "" +labels: "" +assignees: "" --- diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md index 5f0a04ce..d883b8f2 100644 --- a/.github/ISSUE_TEMPLATE/feature_request.md +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -1,9 +1,9 @@ --- name: Feature request about: Suggest an idea for this project -title: '' +title: "" labels: enhancement -assignees: '' +assignees: "" --- **Is your feature request related to a problem? Please describe.** diff --git a/.github/scripts/starklings-evaluate.js b/.github/scripts/starklings-evaluate.js index 7b0ac31a..85756d62 100644 --- a/.github/scripts/starklings-evaluate.js +++ b/.github/scripts/starklings-evaluate.js @@ -1,6 +1,6 @@ -const fs = require('fs'); -const { execSync } = require('child_process'); -const path = require('path'); +const fs = require("fs"); +const { execSync } = require("child_process"); +const path = require("path"); // Configuration de débogage const DEBUG = true; @@ -20,8 +20,8 @@ function parseInfoToml(infoPath) { throw new Error(`info.toml not found at: ${infoPath}`); } - const content = fs.readFileSync(infoPath, 'utf8'); - const lines = content.split('\n'); + const content = fs.readFileSync(infoPath, "utf8"); + const lines = content.split("\n"); const categories = {}; let currentCategory = null; @@ -34,19 +34,19 @@ function parseInfoToml(infoPath) { const cleanLine = line.trim(); // Détecter les catégories - if (cleanLine.startsWith('# ') && !cleanLine.startsWith('##')) { + if (cleanLine.startsWith("# ") && !cleanLine.startsWith("##")) { currentCategory = cleanLine.substring(2).trim(); categories[currentCategory] = []; continue; } - if (cleanLine.startsWith('[[exercises]]')) { + if (cleanLine.startsWith("[[exercises]]")) { if (currentExercise) { if (hintLines.length > 0) { currentExercise.hint = hintLines - .join('\n') - .replace(/^"""/, '') - .replace(/"""$/, ''); + .join("\n") + .replace(/^"""/, "") + .replace(/"""$/, ""); } if (currentCategory) { categories[currentCategory].push(currentExercise); @@ -57,25 +57,25 @@ function parseInfoToml(infoPath) { hintLines = []; } else if (cleanLine.startsWith('hint = """')) { collectingHint = true; - hintLines.push(cleanLine.replace('hint = """', '').trim()); + hintLines.push(cleanLine.replace('hint = """', "").trim()); } else if (collectingHint) { if (cleanLine.endsWith('"""')) { - hintLines.push(cleanLine.replace('"""', '').trim()); + hintLines.push(cleanLine.replace('"""', "").trim()); collectingHint = false; } else { hintLines.push(cleanLine); } - } else if (cleanLine.startsWith('name = ')) { + } else if (cleanLine.startsWith("name = ")) { const match = cleanLine.match(/name = "(.+)"/); if (match) { currentExercise.name = match[1]; } - } else if (cleanLine.startsWith('path = ')) { + } else if (cleanLine.startsWith("path = ")) { const match = cleanLine.match(/path = "(.+)"/); if (match) { currentExercise.path = match[1]; } - } else if (cleanLine.startsWith('mode = ')) { + } else if (cleanLine.startsWith("mode = ")) { const match = cleanLine.match(/mode = "(.+)"/); if (match) { currentExercise.mode = match[1]; @@ -86,7 +86,7 @@ function parseInfoToml(infoPath) { // N'oublie pas le dernier exercice if (currentExercise) { if (hintLines.length > 0) { - currentExercise.hint = hintLines.join('\n').replace(/"""$/, ''); + currentExercise.hint = hintLines.join("\n").replace(/"""$/, ""); } if (currentCategory) { categories[currentCategory].push(currentExercise); @@ -97,16 +97,16 @@ function parseInfoToml(infoPath) { } async function testServerConnection() { - log('Testing server connection...'); + log("Testing server connection..."); try { - const response = await fetch('http://localhost:3001/', { - method: 'GET', + const response = await fetch("http://localhost:3001/", { + method: "GET", timeout: 5000, }); if (response.ok) { - log('✅ Server connection successful'); + log("✅ Server connection successful"); return true; } else { log(`❌ Server responded with status: ${response.status}`); @@ -139,7 +139,7 @@ STDERR: ${errorFeedback.stderr} Please analyze the error and fix the code accordingly. Exercise: ${exercise.name} -${exercise.hint ? `Hint: ${exercise.hint}` : ''} +${exercise.hint ? `Hint: ${exercise.hint}` : ""} Instructions: 1. Carefully analyze the compilation error above @@ -157,7 +157,7 @@ Please provide only the corrected code, without any additional explanation or ma prompt = `You are solving a Cairo programming exercise. Exercise: ${exercise.name} -${exercise.hint ? `Hint: ${exercise.hint}` : ''} +${exercise.hint ? `Hint: ${exercise.hint}` : ""} Instructions: 1. Read and understand the exercise requirements @@ -173,8 +173,8 @@ Please provide only the corrected code, without any additional explanation or ma } const requestBody = { - model: 'cairo-coder', - messages: [{ role: 'user', content: prompt }], + model: "cairo-coder", + messages: [{ role: "user", content: prompt }], stream: false, }; @@ -185,11 +185,11 @@ Please provide only the corrected code, without any additional explanation or ma ); const response = await fetch( - 'http://localhost:3001/v1/chat/completions', + "http://localhost:3001/v1/chat/completions", { - method: 'POST', + method: "POST", headers: { - 'Content-Type': 'application/json', + "Content-Type": "application/json", }, body: JSON.stringify(requestBody), timeout: 120000, // 2 minutes @@ -209,9 +209,9 @@ Please provide only the corrected code, without any additional explanation or ma if (SAVE_RESPONSES) { const responseFile = path.join( __dirname, - '..', - '..', - 'debug', + "..", + "..", + "debug", `${exercise.name}_response_attempt${attemptNumber}.json`, ); fs.mkdirSync(path.dirname(responseFile), { recursive: true }); @@ -227,7 +227,7 @@ Please provide only the corrected code, without any additional explanation or ma ); return cleanCode; } else { - throw new Error('Invalid response format from API'); + throw new Error("Invalid response format from API"); } } catch (error) { log( @@ -259,16 +259,16 @@ async function testExerciseWithFeedback( log(`❌ Exercise file not found: ${exercisePath}`); return { success: false, - error: { message: 'File not found', type: 'FILE_ERROR' }, + error: { message: "File not found", type: "FILE_ERROR" }, attempts: 0, }; } // Lire le contenu original - const originalContent = fs.readFileSync(exercisePath, 'utf8'); + const originalContent = fs.readFileSync(exercisePath, "utf8"); // Sauvegarder l'original - const backupPath = exercisePath + '.backup'; + const backupPath = exercisePath + ".backup"; fs.writeFileSync(backupPath, originalContent); let lastError = null; @@ -305,9 +305,9 @@ async function testExerciseWithFeedback( if (SAVE_RESPONSES && runNumber === RUN_NUMBER) { const solutionFile = path.join( __dirname, - '..', - '..', - 'debug', + "..", + "..", + "debug", `${exercise.name}_solution_attempt${attemptNumber}.cairo`, ); fs.mkdirSync(path.dirname(solutionFile), { recursive: true }); @@ -323,9 +323,9 @@ async function testExerciseWithFeedback( `cargo run --bin starklings run ${exercise.name}`, { cwd: starklingsPath, - stdio: 'pipe', + stdio: "pipe", timeout: 300000, - encoding: 'utf8', + encoding: "utf8", }, ); @@ -343,26 +343,26 @@ async function testExerciseWithFeedback( ); log(`Error code: ${error.status}`); log( - `stdout: ${error.stdout ? error.stdout.substring(0, 500) : 'none'}`, + `stdout: ${error.stdout ? error.stdout.substring(0, 500) : "none"}`, ); log( - `stderr: ${error.stderr ? error.stderr.substring(0, 500) : 'none'}`, + `stderr: ${error.stderr ? error.stderr.substring(0, 500) : "none"}`, ); // Formater l'erreur pour le feedback lastError = { exitCode: error.status, - stdout: error.stdout || '', - stderr: error.stderr || '', + stdout: error.stdout || "", + stderr: error.stderr || "", }; // Sauvegarder les erreurs pour chaque tentative si c'est le dernier run if (SAVE_RESPONSES && runNumber === RUN_NUMBER) { const errorFile = path.join( __dirname, - '..', - '..', - 'debug', + "..", + "..", + "debug", `${exercise.name}_error_attempt${attemptNumber}.txt`, ); fs.writeFileSync( @@ -400,7 +400,7 @@ async function testExerciseWithFeedback( if (attemptNumber === MAX_FEEDBACK_ATTEMPTS + 1) { return { success: false, - error: { message: apiError.message, type: 'API_ERROR' }, + error: { message: apiError.message, type: "API_ERROR" }, attempts: attemptNumber, finalAttempt: attemptNumber, }; @@ -469,9 +469,9 @@ async function processCategoryWorker( } } - const statusEmoji = result.success ? '✅' : '❌'; + const statusEmoji = result.success ? "✅" : "❌"; const attemptInfo = - result.attempts > 1 ? ` (${result.attempts} attempts)` : ''; + result.attempts > 1 ? ` (${result.attempts} attempts)` : ""; log(`[${categoryName}] ${exercise.name}: ${statusEmoji}${attemptInfo}`); } @@ -486,10 +486,10 @@ async function processCategoryWorker( const reportPath = path.join( __dirname, - '..', - '..', - 'debug', - `${categoryName.toLowerCase().replace(/\s+/g, '_')}_report_run${runNumber}.json`, + "..", + "..", + "debug", + `${categoryName.toLowerCase().replace(/\s+/g, "_")}_report_run${runNumber}.json`, ); fs.writeFileSync(reportPath, JSON.stringify(categoryResults, null, 2)); @@ -508,8 +508,8 @@ function extractCairoCode(generatedResponse) { // Extraire le contenu du premier bloc de code trouvé const codeBlock = matches[0]; const codeContent = codeBlock - .replace(/```(?:cairo|rust|)?\s*\n/, '') - .replace(/\n```$/, ''); + .replace(/```(?:cairo|rust|)?\s*\n/, "") + .replace(/\n```$/, ""); return codeContent.trim(); } @@ -519,7 +519,7 @@ function extractCairoCode(generatedResponse) { function generateConsolidatedReport(allResults) { if (allResults.length === 0) { - return { error: 'No successful runs' }; + return { error: "No successful runs" }; } // Taux de réussite global @@ -587,7 +587,7 @@ function generateConsolidatedReport(allResults) { categoryAverages[category] = { successRate: (rates.reduce((sum, rate) => sum + rate, 0) / rates.length).toFixed(1) + - '%', + "%", averageAttempts: ( attempts.reduce((sum, att) => sum + att, 0) / attempts.length ).toFixed(1), @@ -616,8 +616,8 @@ function generateConsolidatedReport(allResults) { run: run.runNumber, attempts: exercise.attempts || 1, finalAttempt: exercise.finalAttempt || 1, - type: exercise.error.type || 'COMPILATION_ERROR', - message: exercise.error.message || 'Compilation failed', + type: exercise.error.type || "COMPILATION_ERROR", + message: exercise.error.message || "Compilation failed", stdout: exercise.error.stdout ? exercise.error.stdout.substring(0, 500) : null, @@ -633,13 +633,13 @@ function generateConsolidatedReport(allResults) { return { summary: { totalRuns: allResults.length, - globalSuccessRate: averageSuccessRate + '%', + globalSuccessRate: averageSuccessRate + "%", averageAttemptsPerExercise: averageAttemptsPerExercise, totalFeedbackSuccesses: totalFeedbackSuccesses, feedbackSuccessRate: totalExercises > 0 - ? ((totalFeedbackSuccesses / totalExercises) * 100).toFixed(1) + '%' - : '0%', + ? ((totalFeedbackSuccesses / totalExercises) * 100).toFixed(1) + "%" + : "0%", }, categorySuccessRates: categoryAverages, exerciseErrorsByCategory: exerciseErrorsByCategory, @@ -647,28 +647,28 @@ function generateConsolidatedReport(allResults) { } async function runSingleTest(runNumber) { - const starklingsPath = path.join(process.cwd(), 'starklings'); - const infoPath = path.join(starklingsPath, 'info.toml'); + const starklingsPath = path.join(process.cwd(), "starklings"); + const infoPath = path.join(starklingsPath, "info.toml"); if (!fs.existsSync(starklingsPath)) { - throw new Error('Starklings directory not found'); + throw new Error("Starklings directory not found"); } if (!fs.existsSync(infoPath)) { - throw new Error('info.toml not found in starklings directory'); + throw new Error("info.toml not found in starklings directory"); } // Tester la connexion au serveur const serverOk = await testServerConnection(); if (!serverOk) { - throw new Error('Server is not accessible'); + throw new Error("Server is not accessible"); } // Parser les exercices par catégorie const categories = parseInfoToml(infoPath); if (Object.keys(categories).length === 0) { - throw new Error('No categories found'); + throw new Error("No categories found"); } // Filtrer à une seule catégorie si demandé @@ -697,7 +697,7 @@ async function runSingleTest(runNumber) { } // Créer le dossier de debug - const debugDir = path.join(__dirname, '..', '..', 'debug'); + const debugDir = path.join(__dirname, "..", "..", "debug"); fs.mkdirSync(debugDir, { recursive: true }); // Calculer le total d'exercices @@ -788,11 +788,11 @@ async function main() { } // Générer le rapport consolidé - const debugDir = path.join(__dirname, '..', '..', 'debug'); + const debugDir = path.join(__dirname, "..", "..", "debug"); const consolidatedReport = generateConsolidatedReport(allResults); const consolidatedReportPath = path.join( debugDir, - 'consolidated_report.json', + "consolidated_report.json", ); fs.writeFileSync( consolidatedReportPath, @@ -841,6 +841,6 @@ async function main() { } main().catch((error) => { - console.error('❌ Fatal error:', error); + console.error("❌ Fatal error:", error); process.exit(1); }); diff --git a/.github/workflows/publish-image.yml b/.github/workflows/publish-image.yml index dc728b5f..f09cf172 100644 --- a/.github/workflows/publish-image.yml +++ b/.github/workflows/publish-image.yml @@ -7,7 +7,7 @@ on: description: Release tag to add to images required: false type: string - default: '' + default: "" workflow_dispatch: permissions: diff --git a/docker-compose.yml b/docker-compose.yml index a2fef630..e5a88f8b 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,7 +1,7 @@ services: postgres: image: pgvector/pgvector:pg17 - container_name: 'postgres' + container_name: "postgres" shm_size: 1g env_file: - path: .env @@ -18,7 +18,7 @@ services: build: context: . dockerfile: backend.dockerfile - container_name: 'cairo-coder-backend' + container_name: "cairo-coder-backend" env_file: - path: .env required: true @@ -33,7 +33,7 @@ services: ingester: platform: linux/amd64 - container_name: 'cairo-coder-ingester' + container_name: "cairo-coder-ingester" env_file: - path: .env required: true diff --git a/ingesters/__tests__/StarknetBlogIngester.test.ts b/ingesters/__tests__/StarknetBlogIngester.test.ts new file mode 100644 index 00000000..687ff47a --- /dev/null +++ b/ingesters/__tests__/StarknetBlogIngester.test.ts @@ -0,0 +1,372 @@ +import axios from 'axios'; +import { afterEach, describe, expect, it, vi } from 'bun:test'; +import { + StarknetBlogIngester, + __testing, +} from '../src/ingesters/StarknetBlogIngester'; +import { type BookPageDto } from '../src/utils/types'; + +const BASE_URL = 'https://www.starknet.io/blog'; +const SITEMAP_URL = 'https://www.starknet.io/sitemap.xml'; + +type MockResponse = { + status: number; + data: string; + headers: Record; +}; + +class TestStarknetBlogIngester extends StarknetBlogIngester { + public exposedDownloadAndExtractDocs(): Promise { + return this.downloadAndExtractDocs(); + } + + public exposedCreateChunks( + pages: BookPageDto[], + ): ReturnType { + return this.createChunks(pages); + } +} + +const buildHtml = (options: { + title: string; + metaDate?: string; + timeDate?: string; + headerTimeDate?: string; + jsonLdDate?: string; + bodyText?: string; +}): string => { + const { title, metaDate, timeDate, headerTimeDate, jsonLdDate, bodyText } = + options; + return ` + + + ${title} + ${metaDate ? `` : ''} + ${jsonLdDate ? `` : ''} + + + ${headerTimeDate ? `
` : ''} +
+

${title}

+ ${timeDate ? `` : ''} +

${bodyText ?? 'Body content goes here.'}

+

Join our newsletter

+

Subscribe for updates.

+

May also interest you

+

Other posts.

+
+ +`; +}; + +const buildSitemap = ( + urls: string[], +): string => ` + +${urls.map((url) => ` ${url}`).join('\n')} +`; + +const mockAxiosGet = (responses: Map) => { + return vi + .spyOn(axios, 'get') + .mockImplementation(async (url: string | any) => { + const key = typeof url === 'string' ? url : String(url); + const response = responses.get(key); + if (response) { + return response as any; + } + + return { + status: 404, + data: '', + headers: { 'content-type': 'text/html' }, + } as any; + }); +}; + +describe('StarknetBlogIngester (crawler)', () => { + afterEach(() => { + vi.restoreAllMocks(); + }); + + it('filters to 2025/2026 posts and strips boilerplate sections', async () => { + const sitemap = buildSitemap([ + 'https://starknet.io/blog/post-2025', + 'https://www.starknet.io/blog/post-2026', + 'https://www.starknet.io/blog/post-2024', + 'https://www.starknet.io/blog', + 'https://www.starknet.io/blog/tag/something', + ]); + + const responses = new Map([ + [ + SITEMAP_URL, + { + status: 200, + data: sitemap, + headers: { 'content-type': 'application/xml' }, + }, + ], + [ + 'https://www.starknet.io/blog/post-2025', + { + status: 200, + data: buildHtml({ + title: 'Post 2025', + metaDate: '2025-05-01T00:00:00Z', + }), + headers: { 'content-type': 'text/html' }, + }, + ], + [ + 'https://www.starknet.io/blog/post-2026', + { + status: 200, + data: buildHtml({ + title: 'Post 2026', + metaDate: '2026-06-01T00:00:00Z', + }), + headers: { 'content-type': 'text/html' }, + }, + ], + [ + 'https://www.starknet.io/blog/post-2024', + { + status: 200, + data: buildHtml({ + title: 'Post 2024', + metaDate: '2024-03-01T00:00:00Z', + }), + headers: { 'content-type': 'text/html' }, + }, + ], + ]); + + mockAxiosGet(responses); + + const ingester = new TestStarknetBlogIngester(); + const pages = await ingester.exposedDownloadAndExtractDocs(); + + expect(pages.map((page) => page.name).sort()).toEqual([ + 'post-2025', + 'post-2026', + ]); + + pages.forEach((page) => { + expect(page.content).not.toContain('Join our newsletter'); + expect(page.content).not.toContain('May also interest you'); + expect(page.content.toLowerCase()).not.toContain('newsletter'); + expect(page.content.toLowerCase()).not.toContain('may also interest you'); + expect(page.content).not.toMatch(/(^|\n)#+\s*Authors?\b/i); + expect(page.content.startsWith('# ')).toBe(true); + }); + }); + + it.each([ + { + label: 'meta tag', + html: buildHtml({ + title: 'Meta Date', + metaDate: '2025-02-10T00:00:00Z', + }), + }, + { + label: 'time element', + html: buildHtml({ + title: 'Time Date', + timeDate: '2026-04-12T00:00:00Z', + }), + }, + { + label: 'json-ld', + html: buildHtml({ + title: 'JsonLd Date', + jsonLdDate: '2025-11-01T00:00:00Z', + }), + }, + { + label: 'header time element', + html: buildHtml({ + title: 'Header Time', + headerTimeDate: '2025-08-09T00:00:00Z', + }), + }, + { + label: 'markdown text', + html: buildHtml({ + title: 'Text Date', + bodyText: 'Apr 3, 2025 · 3 min read', + }), + }, + ])('includes posts when year is detected via $label', async ({ html }) => { + const sitemap = buildSitemap(['https://www.starknet.io/blog/year-test']); + const responses = new Map([ + [ + SITEMAP_URL, + { + status: 200, + data: sitemap, + headers: { 'content-type': 'application/xml' }, + }, + ], + [ + 'https://www.starknet.io/blog/year-test', + { + status: 200, + data: html, + headers: { 'content-type': 'text/html' }, + }, + ], + ]); + + mockAxiosGet(responses); + + const ingester = new TestStarknetBlogIngester(); + const pages = await ingester.exposedDownloadAndExtractDocs(); + + expect(pages).toHaveLength(1); + expect(pages[0]?.name).toBe('year-test'); + expect(pages[0]?.content).not.toContain('Join our newsletter'); + expect(pages[0]?.content).not.toContain('May also interest you'); + expect(pages[0]?.content?.toLowerCase()).not.toContain('newsletter'); + expect(pages[0]?.content?.toLowerCase()).not.toContain( + 'may also interest you', + ); + expect(pages[0]?.content).not.toMatch(/(^|\n)#+\s*Authors?\b/i); + }); + + it('creates chunks with page-scoped source links and stable IDs', async () => { + const ingester = new TestStarknetBlogIngester(); + const pages: BookPageDto[] = [ + { + name: 'posts/2025/hello-world', + content: '# Hello World\n\nSome content here.', + }, + { + name: 'scaling-bitcoin', + content: + '# Scaling Bitcoin\n\n## Overview\n\nContent about Bitcoin.\n\n## Technical Details\n\nMore details here.', + }, + ]; + + const chunks = await ingester.exposedCreateChunks(pages); + + expect(chunks.length).toBeGreaterThan(0); + chunks.forEach((chunk) => { + // Verify sourceLink is never undefined or empty + expect(chunk.metadata.sourceLink).toBeDefined(); + expect(chunk.metadata.sourceLink).not.toBe(''); + expect(chunk.metadata.sourceLink).toContain( + 'https://www.starknet.io/blog/', + ); + + // Verify sourceLink matches the page name + expect(chunk.metadata.sourceLink).toContain(chunk.metadata.name); + + // Verify uniqueId starts with the correct prefix + expect(chunk.metadata.uniqueId).toContain('starknet-blog-'); + }); + + // Test specific page sources + const helloWorldChunks = chunks.filter( + (c) => c.metadata.name === 'posts/2025/hello-world', + ); + expect(helloWorldChunks.length).toBeGreaterThan(0); + helloWorldChunks.forEach((chunk) => { + expect(chunk.metadata.sourceLink).toBe( + 'https://www.starknet.io/blog/posts/2025/hello-world', + ); + }); + + const scalingBitcoinChunks = chunks.filter( + (c) => c.metadata.name === 'scaling-bitcoin', + ); + expect(scalingBitcoinChunks.length).toBeGreaterThan(0); + scalingBitcoinChunks.forEach((chunk) => { + expect(chunk.metadata.sourceLink).toBe( + 'https://www.starknet.io/blog/scaling-bitcoin', + ); + }); + }); +}); + +describe('StarknetBlogIngester (real page integration)', () => { + const REAL_PAGE_URL = + 'https://www.starknet.io/blog/starknet-2025-year-in-review'; + const REAL_PAGE_URL_SLASH = `${REAL_PAGE_URL}/`; + const REAL_PAGE_NAME = 'starknet-2025-year-in-review'; + + it( + 'processes real page through ingester extraction logic', + async () => { + const realResponse = await axios.get(REAL_PAGE_URL_SLASH, { + headers: { 'User-Agent': 'cairo-coder-ingester-test' }, + timeout: 30000, + }); + + expect(realResponse.status).toBe(200); + const html = realResponse.data as string; + expect(html).toContain('Starknet'); + + vi.spyOn(axios, 'get').mockImplementation( + async (url: string | any, config?: any) => { + const key = typeof url === 'string' ? url : String(url); + if (key === SITEMAP_URL) { + return { + status: 200, + data: buildSitemap([REAL_PAGE_URL_SLASH]), + headers: { 'content-type': 'application/xml' }, + } as any; + } + + if (key === REAL_PAGE_URL || key === REAL_PAGE_URL_SLASH) { + return { + status: 200, + data: html, + headers: { 'content-type': 'text/html' }, + } as any; + } + + return { + status: 404, + data: '', + headers: { 'content-type': 'text/html' }, + } as any; + }, + ); + + const ingester = new TestStarknetBlogIngester(); + const pages = await ingester.exposedDownloadAndExtractDocs(); + const page = pages.find((entry) => entry.name === REAL_PAGE_NAME); + + const { markdown, title, publishedYear } = __testing.extractContent( + html, + REAL_PAGE_URL, + ); + const cleaned = __testing.cleanBlogMarkdown(markdown); + const expectedContent = __testing.ensureTitleInMarkdown(title, cleaned); + + expect(title).toContain('Starknet'); + expect(publishedYear).toBe(2025); + + expect(page).toBeDefined(); + expect(page?.content.startsWith('# ')).toBe(true); + expect(page?.content).toContain('Starknet'); + expect(page?.content).toContain('2025'); + expect(page?.content).not.toContain('Join our newsletter'); + expect(page?.content).not.toContain('May also interest you'); + expect(page?.content.toLowerCase()).not.toContain('newsletter'); + expect(page?.content.toLowerCase()).not.toContain( + 'may also interest you', + ); + expect(page?.content).not.toMatch(/(^|\n)#+\s*Authors?\b/i); + expect(page?.content).toBe(expectedContent); + expect(expectedContent.toLowerCase()).not.toContain('newsletter'); + expect(expectedContent.toLowerCase()).not.toContain( + 'may also interest you', + ); + expect(expectedContent).not.toMatch(/(^|\n)#+\s*Authors?\b/i); + }, + { timeout: 30000 }, + ); +}); diff --git a/ingesters/bun.lock b/ingesters/bun.lock index eb47321b..d4aa78e2 100644 --- a/ingesters/bun.lock +++ b/ingesters/bun.lock @@ -15,9 +15,11 @@ "adm-zip": "^0.5.16", "asciidoctor": "^3.0.4", "axios": "^1.7.9", + "cheerio": "^1.0.0-rc.12", "dotenv": "^16.4.7", "downdoc": "1.0.2-stable", "lunr": "^2.3.9", + "node-html-markdown": "^2.0.0", "pg": "^8.14.1", "winston": "^3.17.0", }, @@ -150,6 +152,8 @@ "binary-search": ["binary-search@1.3.6", "", {}, "sha512-nbE1WxOTTrUWIfsfZ4aHGYu5DOuNkbxGokjV6Z2kxfJK3uaAb8zNK1muzOeipoLHZjInT4Br88BHpzevc681xA=="], + "boolbase": ["boolbase@1.0.0", "", {}, "sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww=="], + "brace-expansion": ["brace-expansion@2.0.2", "", { "dependencies": { "balanced-match": "^1.0.0" } }, "sha512-Jt0vHyM+jmUBqojB7E1NIYadt0vI0Qxjxd2TErW94wDz+E2LAm5vKMXXwg6ZZBTHPuUlDgQHKXvjGBdfcF1ZDQ=="], "braces": ["braces@3.0.3", "", { "dependencies": { "fill-range": "^7.1.1" } }, "sha512-yQbXgO/OSZVD2IsiLlro+7Hf6Q18EJrKSEsdoMzKePKXct3gvD8oLcOQdIzGupr5Fj+EDe8gO/lxc1BzfMpxvA=="], @@ -166,6 +170,10 @@ "character-parser": ["character-parser@2.2.0", "", { "dependencies": { "is-regex": "^1.0.3" } }, "sha512-+UqJQjFEFaTAs3bNsF2j2kEN1baG/zghZbdqoYEDxGZtJo9LBzl1A+m0D4n3qKx8N2FNv8/Xp6yV9mQmBuptaw=="], + "cheerio": ["cheerio@1.1.2", "", { "dependencies": { "cheerio-select": "^2.1.0", "dom-serializer": "^2.0.0", "domhandler": "^5.0.3", "domutils": "^3.2.2", "encoding-sniffer": "^0.2.1", "htmlparser2": "^10.0.0", "parse5": "^7.3.0", "parse5-htmlparser2-tree-adapter": "^7.1.0", "parse5-parser-stream": "^7.1.2", "undici": "^7.12.0", "whatwg-mimetype": "^4.0.0" } }, "sha512-IkxPpb5rS/d1IiLbHMgfPuS0FgiWTtFIm/Nj+2woXDLTZ7fOT2eqzgYbdMlLweqlHbsZjxEChoVK+7iph7jyQg=="], + + "cheerio-select": ["cheerio-select@2.1.0", "", { "dependencies": { "boolbase": "^1.0.0", "css-select": "^5.1.0", "css-what": "^6.1.0", "domelementtype": "^2.3.0", "domhandler": "^5.0.3", "domutils": "^3.0.1" } }, "sha512-9v9kG0LvzrlcungtnJtpGNxY+fzECQKhK4EGJX2vByejiMX84MFNQw4UxPJl3bFbTMw+Dfs37XaIkCwTZfLh4g=="], + "ci-info": ["ci-info@4.3.0", "", {}, "sha512-l+2bNRMiQgcfILUi33labAZYIWlH1kWDp+ecNo5iisRKrbm0xcRyCww71/YU0Fkw0mAFpz9bJayXPjey6vkmaQ=="], "cliui": ["cliui@7.0.4", "", { "dependencies": { "string-width": "^4.2.0", "strip-ansi": "^6.0.0", "wrap-ansi": "^7.0.0" } }, "sha512-OcRE68cOsVMXp1Yvonl/fzkQOyjLSu/8bhPDfQt0e0/Eb283TKP20Fs2MqoPsr9SwA595rRCA+QMzYc9nBP+JQ=="], @@ -186,6 +194,10 @@ "constantinople": ["constantinople@4.0.1", "", { "dependencies": { "@babel/parser": "^7.6.0", "@babel/types": "^7.6.1" } }, "sha512-vCrqcSIq4//Gx74TXXCGnHpulY1dskqLTFGDmhrGxzeXL8lF8kvXv6mpNWlJj1uD4DW23D4ljAqbY4RRaaUZIw=="], + "css-select": ["css-select@5.2.2", "", { "dependencies": { "boolbase": "^1.0.0", "css-what": "^6.1.0", "domhandler": "^5.0.2", "domutils": "^3.0.1", "nth-check": "^2.0.1" } }, "sha512-TizTzUddG/xYLA3NXodFM0fSbNizXjOKhqiQQwvhlspadZokn1KDy0NZFS0wuEubIYAV5/c1/lAr0TaaFXEXzw=="], + + "css-what": ["css-what@6.2.2", "", {}, "sha512-u/O3vwbptzhMs3L1fQE82ZSLHQQfto5gyZzwteVIEyeaY5Fc7R4dapF/BvRoSYFeqfBk4m0V1Vafq5Pjv25wvA=="], + "csstype": ["csstype@3.1.3", "", {}, "sha512-M1uQkMl8rQK/szD0LNhtqxIPLpimGm8sOBwU7lLnCpSbTyY3yeU1Vc7l4KT5zT4s/yOxHH5O7tIuuLOCnLADRw=="], "decamelize": ["decamelize@1.2.0", "", {}, "sha512-z2S+W9X73hAUUki+N+9Za2lBlun89zigOyGrsax+KUQ6wKW4ZoWpEYBkGhQjwAjjDCkWxhY0VKEhk8wzY7F5cA=="], @@ -214,6 +226,8 @@ "enabled": ["enabled@2.0.0", "", {}, "sha512-AKrN98kuwOzMIdAizXGI86UFBoo26CL21UM763y1h/GMSJ4/OHU9k2YlsmBpyScFo/wbLzWQJBMCW4+IO3/+OQ=="], + "encoding-sniffer": ["encoding-sniffer@0.2.1", "", { "dependencies": { "iconv-lite": "^0.6.3", "whatwg-encoding": "^3.1.1" } }, "sha512-5gvq20T6vfpekVtqrYQsSCFZ1wEg5+wW0/QaZMWkFr6BqD3NfKs0rLCx4rrVlSWJeZb5NBJgVLswK/w2MWU+Gw=="], + "entities": ["entities@4.5.0", "", {}, "sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw=="], "es-define-property": ["es-define-property@1.0.1", "", {}, "sha512-e3nRfgfUZ4rNGL232gUgX06QNyyez04KdjFrF+LTRoOXmrOgFKDg4BCdsjW8EnT69eqdYGmRpJwiPVYNrCaW3g=="], @@ -276,10 +290,14 @@ "hasown": ["hasown@2.0.2", "", { "dependencies": { "function-bind": "^1.1.2" } }, "sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ=="], + "he": ["he@1.2.0", "", { "bin": { "he": "bin/he" } }, "sha512-F/1DnUGPopORZi0ni+CvrCgHQ5FyEAHRLSApuYWMmrbSwoN2Mn/7k+Gl38gJnR7yyDZk6WLXwiGod1JOWNDKGw=="], + "htmlparser2": ["htmlparser2@9.1.0", "", { "dependencies": { "domelementtype": "^2.3.0", "domhandler": "^5.0.3", "domutils": "^3.1.0", "entities": "^4.5.0" } }, "sha512-5zfg6mHUoaer/97TxnGpxmbR7zJtPwIYFMZ/H5ucTlPZhKvtum05yiPK3Mgai3a0DyVxv7qYqoweaEd2nrYQzQ=="], "humanize-ms": ["humanize-ms@1.2.1", "", { "dependencies": { "ms": "^2.0.0" } }, "sha512-Fl70vYtsAFb/C06PTS9dZBo7ihau+Tu/DNCk/OyHhea07S+aeMWpFFkUaXRa8fI+ScZbEI8dfSxwY7gxZ9SAVQ=="], + "iconv-lite": ["iconv-lite@0.6.3", "", { "dependencies": { "safer-buffer": ">= 2.1.2 < 3.0.0" } }, "sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw=="], + "inflight": ["inflight@1.0.6", "", { "dependencies": { "once": "^1.3.0", "wrappy": "1" } }, "sha512-k92I/b08q4wvFscXCLvqfsHCrjrF7yiXsQuIVvVE7N82W3+aqpzuUdBbfhWcy/FZR3/4IgflMgKLOsvPDrGCJA=="], "inherits": ["inherits@2.0.4", "", {}, "sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ=="], @@ -364,6 +382,12 @@ "node-fetch": ["node-fetch@2.7.0", "", { "dependencies": { "whatwg-url": "^5.0.0" }, "peerDependencies": { "encoding": "^0.1.0" }, "optionalPeers": ["encoding"] }, "sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A=="], + "node-html-markdown": ["node-html-markdown@2.0.0", "", { "dependencies": { "node-html-parser": "^6.1.13" } }, "sha512-DqUC3GGP7pwSYxS93SwHoP+qCw78xcMP6C6H2DuC8rPD2AweJRjBzQb5SdXpKtDlqAQ7hVotJcfhgU7hU5Gthw=="], + + "node-html-parser": ["node-html-parser@6.1.13", "", { "dependencies": { "css-select": "^5.1.0", "he": "1.2.0" } }, "sha512-qIsTMOY4C/dAa5Q5vsobRpOOvPfC4pB61UVW2uSwZNUp0QU/jCekTal1vMmbO0DgdHeLUJpv/ARmDqErVxA3Sg=="], + + "nth-check": ["nth-check@2.1.1", "", { "dependencies": { "boolbase": "^1.0.0" } }, "sha512-lqjrjmaOoAnWfMmBPL+XNnynZh2+swxiX3WUE0s4yEHI6m+AwrK2UZOimIRl3X/4QctVqS8AiZjFqyOGrMXb/w=="], + "num-sort": ["num-sort@2.1.0", "", {}, "sha512-1MQz1Ed8z2yckoBeSfkQHHO9K1yDRxxtotKSJ9yvcTUUxSvfvzEq5GwBrjjHEpMlq/k5gvXdmJ1SbYxWtpNoVg=="], "nunjucks": ["nunjucks@3.2.4", "", { "dependencies": { "a-sync-waterfall": "^1.0.0", "asap": "^2.0.3", "commander": "^5.1.0" }, "peerDependencies": { "chokidar": "^3.3.0" }, "optionalPeers": ["chokidar"], "bin": { "nunjucks-precompile": "bin/precompile" } }, "sha512-26XRV6BhkgK0VOxfbU5cQI+ICFUtMLixv1noZn1tGU38kQH5A5nmmbk/O45xdyBhD1esk47nKrY0mvQpZIhRjQ=="], @@ -384,6 +408,12 @@ "p-timeout": ["p-timeout@3.2.0", "", { "dependencies": { "p-finally": "^1.0.0" } }, "sha512-rhIwUycgwwKcP9yTOOFK/AKsAopjjCakVqLHePO3CC6Mir1Z99xT+R63jZxAT5lFZLa2inS5h+ZS2GvR99/FBg=="], + "parse5": ["parse5@7.3.0", "", { "dependencies": { "entities": "^6.0.0" } }, "sha512-IInvU7fabl34qmi9gY8XOVxhYyMyuH2xUNpb2q8/Y+7552KlejkRvqvD19nMoUW/uQGGbqNpA6Tufu5FL5BZgw=="], + + "parse5-htmlparser2-tree-adapter": ["parse5-htmlparser2-tree-adapter@7.1.0", "", { "dependencies": { "domhandler": "^5.0.3", "parse5": "^7.0.0" } }, "sha512-ruw5xyKs6lrpo9x9rCZqZZnIUntICjQAd0Wsmp396Ul9lN/h+ifgVV1x1gZHi8euej6wTfpqX8j+BFQxF0NS/g=="], + + "parse5-parser-stream": ["parse5-parser-stream@7.1.2", "", { "dependencies": { "parse5": "^7.0.0" } }, "sha512-JyeQc9iwFLn5TbvvqACIF/VXG6abODeB3Fwmv/TGdLk2LfbWkaySGY72at4+Ty7EkPZj854u4CrICqNk2qIbow=="], + "path-parse": ["path-parse@1.0.7", "", {}, "sha512-LDJzPVEEEPR+y48z93A0Ed0yXb8pAByGWo/k5YYdYgpY2/2EsOsksJrq7lOHxryrVOn1ejG6oAp8ahvOIQD8sw=="], "pg": ["pg@8.16.3", "", { "dependencies": { "pg-connection-string": "^2.9.1", "pg-pool": "^3.10.1", "pg-protocol": "^1.10.3", "pg-types": "2.2.0", "pgpass": "1.0.5" }, "optionalDependencies": { "pg-cloudflare": "^1.2.7" }, "peerDependencies": { "pg-native": ">=3.0.1" }, "optionalPeers": ["pg-native"] }, "sha512-enxc1h0jA/aq5oSDMvqyW3q89ra6XIIDZgCX9vkMrnz5DFTw/Ny3Li2lFQ+pt3L6MCgm/5o2o8HW9hiJji+xvw=="], @@ -460,6 +490,8 @@ "safe-stable-stringify": ["safe-stable-stringify@2.5.0", "", {}, "sha512-b3rppTKm9T+PsVCBEOUR46GWI7fdOs00VKZ1+9c1EWDaDMvjQc6tUwuFyIprgGgTcWoVHSKrU8H31ZHA2e0RHA=="], + "safer-buffer": ["safer-buffer@2.1.2", "", {}, "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg=="], + "semver": ["semver@7.7.2", "", { "bin": { "semver": "bin/semver.js" } }, "sha512-RF0Fw+rO5AMf9MAyaRXI4AV0Ulj5lMHqVxxdSgiVbixSCXoEmmX/jk0CuJw4+3SqroYO9VoUh+HcuJivvtJemA=="], "simple-wcswidth": ["simple-wcswidth@1.1.2", "", {}, "sha512-j7piyCjAeTDSjzTSQ7DokZtMNwNlEAyxqSZeCS+CXH7fJ4jx3FuJ/mTW3mE+6JLs4VJBbcll0Kjn+KXI5t21Iw=="], @@ -498,6 +530,8 @@ "uglify-js": ["uglify-js@3.19.3", "", { "bin": { "uglifyjs": "bin/uglifyjs" } }, "sha512-v3Xu+yuwBXisp6QYTcH4UbH+xYJXqnq2m/LtQVWKWzYc1iehYnLixoQDN9FH6/j9/oybfd6W9Ghwkl8+UMKTKQ=="], + "undici": ["undici@7.16.0", "", {}, "sha512-QEg3HPMll0o3t2ourKwOeUAZ159Kn9mx5pnzHRQO8+Wixmh88YdZRiIwat0iNzNNXn0yoEtXJqFpyW7eM8BV7g=="], + "undici-types": ["undici-types@7.13.0", "", {}, "sha512-Ov2Rr9Sx+fRgagJ5AX0qvItZG/JKKoBRAVITs1zk7IqZGTJUwgUr7qoYBpWwakpWilTZFM98rG/AFRocu10iIQ=="], "unxhr": ["unxhr@1.2.0", "", {}, "sha512-6cGpm8NFXPD9QbSNx0cD2giy7teZ6xOkCUH3U89WKVkL9N9rBrWjlCwhR94Re18ZlAop4MOc3WU1M3Hv/bgpIw=="], @@ -512,6 +546,10 @@ "webidl-conversions": ["webidl-conversions@3.0.1", "", {}, "sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ=="], + "whatwg-encoding": ["whatwg-encoding@3.1.1", "", { "dependencies": { "iconv-lite": "0.6.3" } }, "sha512-6qN4hJdMwfYBtE3YBTTHhoeuUrDBPZmbQaxWAqSALV/MeEnR5z1xd8UKud2RAkFoPkmB+hli1TZSnyi84xz1vQ=="], + + "whatwg-mimetype": ["whatwg-mimetype@4.0.0", "", {}, "sha512-QaKxh0eNIi2mE9p2vEdzfagOKHCcj1pJ56EEHGQOVxp8r9/iszLUUV7v89x9O1p/T+NlTM5W7jW6+cz4Fq1YVg=="], + "whatwg-url": ["whatwg-url@5.0.0", "", { "dependencies": { "tr46": "~0.0.3", "webidl-conversions": "^3.0.0" } }, "sha512-saE57nupxk6v3HY35+jzBwYa0rKSy0XR8JSxZPwgLr7ys0IBzhGviA1/TUGJLmSVqs8pb9AnvICXEuOHLprYTw=="], "winston": ["winston@3.18.3", "", { "dependencies": { "@colors/colors": "^1.6.0", "@dabh/diagnostics": "^2.0.8", "async": "^3.2.3", "is-stream": "^2.0.0", "logform": "^2.7.0", "one-time": "^1.0.0", "readable-stream": "^3.4.0", "safe-stable-stringify": "^2.3.1", "stack-trace": "0.0.x", "triple-beam": "^1.3.0", "winston-transport": "^4.9.0" } }, "sha512-NoBZauFNNWENgsnC9YpgyYwOVrl2m58PpQ8lNHjV3kosGs7KJ7Npk9pCUE+WJlawVSe8mykWDKWFSVfs3QO9ww=="], @@ -552,6 +590,8 @@ "chalk/ansi-styles": ["ansi-styles@4.3.0", "", { "dependencies": { "color-convert": "^2.0.1" } }, "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg=="], + "cheerio/htmlparser2": ["htmlparser2@10.0.0", "", { "dependencies": { "domelementtype": "^2.3.0", "domhandler": "^5.0.3", "domutils": "^3.2.1", "entities": "^6.0.0" } }, "sha512-TwAZM+zE5Tq3lrEHvOlvwgj1XLWQCtaaibSN11Q+gGBAS7Y1uZSWwXXRe4iF6OXnaq1riyQAPFOBtYc77Mxq0g=="], + "color/color-convert": ["color-convert@3.1.2", "", { "dependencies": { "color-name": "^2.0.0" } }, "sha512-UNqkvCDXstVck3kdowtOTWROIJQwafjOfXSmddoDrXo4cewMKmusCeF22Q24zvjR8nwWib/3S/dfyzPItPEiJg=="], "color-string/color-name": ["color-name@2.0.2", "", {}, "sha512-9vEt7gE16EW7Eu7pvZnR0abW9z6ufzhXxGXZEVU9IqPdlsUiMwJeJfRtq0zePUmnbHGT9zajca7mX8zgoayo4A=="], @@ -564,6 +604,8 @@ "openai/@types/node": ["@types/node@18.19.129", "", { "dependencies": { "undici-types": "~5.26.4" } }, "sha512-hrmi5jWt2w60ayox3iIXwpMEnfUvOLJCRtrOPbHtH15nTjvO7uhnelvrdAs0dO0/zl5DZ3ZbahiaXEVb54ca/A=="], + "parse5/entities": ["entities@6.0.1", "", {}, "sha512-aN97NXWF6AWBTahfVOIrB/NShkzi5H7F9r1s9mD3cDj4Ko5f2qhhVoYMibXF7GlLveb/D2ioWay8lxI97Ven3g=="], + "wrap-ansi/ansi-styles": ["ansi-styles@4.3.0", "", { "dependencies": { "color-convert": "^2.0.1" } }, "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg=="], "@jest/pattern/@types/node/undici-types": ["undici-types@6.21.0", "", {}, "sha512-iwDZqg0QAGrg9Rav5H4n0M64c3mkR59cJ6wQp+7C4nI0gsmExaedaYLNO44eT4AtBBwjbTiGPMlt2Md0T9H9JQ=="], @@ -578,6 +620,8 @@ "bun-types/@types/node/undici-types": ["undici-types@6.21.0", "", {}, "sha512-iwDZqg0QAGrg9Rav5H4n0M64c3mkR59cJ6wQp+7C4nI0gsmExaedaYLNO44eT4AtBBwjbTiGPMlt2Md0T9H9JQ=="], + "cheerio/htmlparser2/entities": ["entities@6.0.1", "", {}, "sha512-aN97NXWF6AWBTahfVOIrB/NShkzi5H7F9r1s9mD3cDj4Ko5f2qhhVoYMibXF7GlLveb/D2ioWay8lxI97Ven3g=="], + "color/color-convert/color-name": ["color-name@2.0.2", "", {}, "sha512-9vEt7gE16EW7Eu7pvZnR0abW9z6ufzhXxGXZEVU9IqPdlsUiMwJeJfRtq0zePUmnbHGT9zajca7mX8zgoayo4A=="], "jest-mock/@types/node/undici-types": ["undici-types@6.21.0", "", {}, "sha512-iwDZqg0QAGrg9Rav5H4n0M64c3mkR59cJ6wQp+7C4nI0gsmExaedaYLNO44eT4AtBBwjbTiGPMlt2Md0T9H9JQ=="], diff --git a/ingesters/config/sources.json b/ingesters/config/sources.json index bc2aebcc..af4127a1 100644 --- a/ingesters/config/sources.json +++ b/ingesters/config/sources.json @@ -73,7 +73,7 @@ "chunkOverlap": 512, "baseUrl": "https://docs.openzeppelin.com/contracts-cairo", "urlSuffix": "", - "useUrlMapping": false + "useUrlMapping": true } }, "corelib_docs": { @@ -131,7 +131,7 @@ "fileExtensions": [".md"], "chunkSize": 4096, "chunkOverlap": 512, - "baseUrl": "https://starknet.io/blog", + "baseUrl": "https://www.starknet.io/blog", "urlSuffix": "", "useUrlMapping": false } diff --git a/ingesters/package.json b/ingesters/package.json index a7725e04..1517884b 100644 --- a/ingesters/package.json +++ b/ingesters/package.json @@ -18,9 +18,11 @@ "adm-zip": "^0.5.16", "asciidoctor": "^3.0.4", "axios": "^1.7.9", + "cheerio": "^1.0.0-rc.12", "dotenv": "^16.4.7", "downdoc": "1.0.2-stable", "lunr": "^2.3.9", + "node-html-markdown": "^2.0.0", "pg": "^8.14.1", "winston": "^3.17.0" }, diff --git a/ingesters/src/ingesters/StarknetBlogIngester.ts b/ingesters/src/ingesters/StarknetBlogIngester.ts index d5671d21..3394754f 100644 --- a/ingesters/src/ingesters/StarknetBlogIngester.ts +++ b/ingesters/src/ingesters/StarknetBlogIngester.ts @@ -1,32 +1,183 @@ -import { type BookConfig } from '../utils/types'; -import { MarkdownIngester } from './MarkdownIngester'; -import { type BookChunk, DocumentSource } from '../types'; +import axios from 'axios'; +import { load, type Cheerio } from 'cheerio'; +import type { AnyNode, Element } from 'domhandler'; +import { NodeHtmlMarkdown } from 'node-html-markdown'; import { Document } from '@langchain/core/documents'; -import { VectorStore } from '../db/postgresVectorStore'; -import { type VectorStoreUpdateOptions } from '../utils/vectorStoreUtils'; +import { gunzipSync } from 'zlib'; +import { type BookChunk, DocumentSource } from '../types'; +import { type BookConfig, type BookPageDto } from '../utils/types'; +import { MarkdownIngester } from './MarkdownIngester'; import { logger } from '../utils/logger'; -import * as fs from 'fs/promises'; -import * as path from 'path'; import { calculateHash } from '../utils/contentUtils'; import { RecursiveMarkdownSplitter, type SplitOptions, } from '../utils/RecursiveMarkdownSplitter'; -import { getPythonPath } from '../utils/paths'; +import { getTempDir, getPythonPath } from '../utils/paths'; +import * as fs from 'fs/promises'; + +const USER_AGENT = 'cairo-coder-ingester'; +const CONCURRENCY = 4; +const MAX_RETRIES = 5; +const TIMEOUT_MS = 30_000; +const REQUEST_DELAY_MS = 300; +const REQUEST_JITTER_MS = 200; +const MAX_CRAWL_PAGES = 200; +const ALLOWED_YEARS = new Set([2025, 2026]); +const MIN_RETRY_DELAY_MS = 1_000; +const MAX_RETRY_DELAY_MS = 60_000; +const MIN_MARKDOWN_LENGTH = 30; +const MAX_SITEMAP_DEPTH = 5; +let globalBackoffUntil = 0; + +// Cache for URLs that are confirmed NOT 2025/2026 blog posts +// This avoids re-fetching and re-processing pages on subsequent runs +const EXCLUDED_URLS_CACHE_FILE = getPythonPath( + 'src', + 'cairo_coder_tools', + 'ingestion', + 'generated', + 'starknet-blog-excluded-urls.json', +); + +/** + * Simple cache for excluded URLs (non-2025/2026 blog posts). + * Persists to disk so subsequent runs skip already-checked pages. + */ +class ExcludedUrlsCache { + private urls: Set = new Set(); + private dirty = false; + + async load(): Promise { + try { + const content = await fs.readFile(EXCLUDED_URLS_CACHE_FILE, 'utf8'); + const data = JSON.parse(content); + if (Array.isArray(data.excludedUrls)) { + this.urls = new Set(data.excludedUrls); + logger.info(`Loaded ${this.urls.size} excluded URLs from cache`); + } + } catch { + // Cache file doesn't exist or is invalid - start fresh + this.urls = new Set(); + logger.debug('No existing excluded URLs cache found, starting fresh'); + } + } + + async save(): Promise { + if (!this.dirty) return; + try { + const data = { + updatedAt: new Date().toISOString(), + excludedUrls: Array.from(this.urls).sort(), + }; + await fs.writeFile( + EXCLUDED_URLS_CACHE_FILE, + JSON.stringify(data, null, 2), + ); + this.dirty = false; + logger.info(`Saved ${this.urls.size} excluded URLs to cache`); + } catch (error) { + logger.warn(`Failed to save excluded URLs cache: ${String(error)}`); + } + } + + has(url: string): boolean { + return this.urls.has(url); + } + + add(url: string): void { + if (!this.urls.has(url)) { + this.urls.add(url); + this.dirty = true; + } + } + + get size(): number { + return this.urls.size; + } +} + +// Global cache instance +const excludedUrlsCache = new ExcludedUrlsCache(); + +const DEFAULT_EXCLUDE_PATTERNS: RegExp[] = [ + /\/admin/i, + /\/api\//i, + /\/login/i, + /\/search/i, + /\/tag\//i, + /\/category\//i, + /\/author\//i, + /\/user\//i, + /\/wp-admin/i, + /\/wp-content/i, + /\/wp-includes/i, + /\/_next\//i, + /\/static\//i, + /\/assets\//i, + /\/js\//i, + /\/css\//i, + /\/images\//i, + /\/feed/i, + /\/rss/i, + /\/atom/i, + /\/sitemap/i, + /\/robots\.txt/i, + /\bmailto:/i, + /\btel:/i, + /#/, // fragments handled separately, but keep as guard + /\.css$/i, + /\/video\/?$/i, +]; + +const MAIN_CONTENT_SELECTORS = [ + 'main', + 'article', + '[role="main"]', + '.content', + '.doc-content', + '.markdown-body', + '.docs-content', + '.documentation', + '.post-content', + '.entry-content', + '.page-content', + '#content', + '.container-fluid', + '.container', + '.wrapper', +]; + +const BOILERPLATE_KEYWORDS = [ + 'navbar', + 'sidebar', + 'nav-bar', + 'side-bar', + 'menu', + 'toc', + 'breadcrumb', + 'footer', + 'header', +]; + +const PUBLISHED_META_SELECTORS = [ + 'meta[property="article:published_time"]', + 'meta[name="article:published_time"]', + 'meta[property="article:published"]', + 'meta[name="publish_date"]', + 'meta[name="pubdate"]', + 'meta[name="date"]', + 'meta[property="og:pubdate"]', +]; + +const MONTH_REGEX = + /(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec)[a-z]*\s+\d{1,2},\s+(\d{4})/i; /** - * Ingester for Starknet blog posts documentation - * - * This ingester processes pre-summarized Starknet blog posts from the generated - * summary file, chunks them using the RecursiveMarkdownSplitter, and stores them - * in the vector database for retrieval. + * Ingester for Starknet blog posts (2025/2026). */ export class StarknetBlogIngester extends MarkdownIngester { - /** - * Constructor for the Starknet Blog ingester - */ constructor() { - // Define the configuration for the Starknet Blog const config: BookConfig = { repoOwner: 'starknet', repoName: 'starknet-blog', @@ -35,123 +186,767 @@ export class StarknetBlogIngester extends MarkdownIngester { chunkOverlap: 512, baseUrl: 'https://www.starknet.io/blog', urlSuffix: '', - useUrlMapping: false, + useUrlMapping: true, }; super(config, DocumentSource.STARKNET_BLOG); } - /** - * Read the pre-summarized Starknet blog documentation file - */ - async readSummaryFile(): Promise { - const summaryPath = getPythonPath( - 'src', - 'cairo_coder_tools', - 'ingestion', - 'generated', - 'starknet-blog.md', + protected override async downloadAndExtractDocs(): Promise { + logger.info('Crawling Starknet blog posts for 2025-2026'); + + // Reset global backoff state for fresh crawl + globalBackoffUntil = 0; + + // Load excluded URLs cache to skip already-checked pages + await excludedUrlsCache.load(); + + const baseUrl = this.config.baseUrl; + const discoveredUrls = await discoverUrls(baseUrl); + + if (discoveredUrls.length === 0) { + throw new Error('No URLs discovered for Starknet blog crawl'); + } + + const filteredUrls = filterUrls(discoveredUrls, baseUrl); + + if (filteredUrls.length === 0) { + throw new Error('No URLs remaining after Starknet blog filtering'); + } + + // Filter out URLs that are already in the excluded cache + const urlsToProcess = filteredUrls.filter( + (url) => !excludedUrlsCache.has(url), ); + const skippedFromCache = filteredUrls.length - urlsToProcess.length; - logger.info(`Reading Starknet blog summary from ${summaryPath}`); - const text = await fs.readFile(summaryPath, 'utf-8'); - return text; - } - - /** - * Chunk the blog summary file using RecursiveMarkdownSplitter - * - * This function takes the markdown content and splits it using a recursive - * strategy that respects headers, code blocks, and maintains overlap between chunks. - * - * @param text - The markdown content to chunk - * @returns Promise[]> - Array of document chunks - */ - async chunkSummaryFile(text: string): Promise[]> { - // Configure the splitter with appropriate settings + if (skippedFromCache > 0) { + logger.info( + `Skipping ${skippedFromCache} URLs from cache, processing ${urlsToProcess.length} URLs`, + ); + } else { + logger.info(`Processing ${urlsToProcess.length} Starknet blog URLs`); + } + + const results = await mapWithConcurrency( + urlsToProcess, + CONCURRENCY, + async (url) => { + const page = await fetchAndProcessPage(url, baseUrl); + if (!page) { + return null; + } + return page; + }, + ); + + const pages = results.filter((page): page is BookPageDto => page !== null); + + // Save updated cache + await excludedUrlsCache.save(); + + logger.info(`Collected ${pages.length} Starknet blog posts for ingestion`); + return pages; + } + + protected override async createChunks( + pages: BookPageDto[], + ): Promise[]> { + logger.info('Creating chunks from Starknet blog pages'); + + const chunks: Document[] = []; const splitOptions: SplitOptions = { maxChars: 2048, minChars: 500, overlap: 256, - headerLevels: [1, 2, 3], // Split on H1/H2/H3 (title uses deepest) + headerLevels: [1, 2, 3], preserveCodeBlocks: true, - idPrefix: 'starknet-blog', trim: true, }; - // Create the splitter and split the content - const splitter = new RecursiveMarkdownSplitter(splitOptions); - const chunks = splitter.splitMarkdownToChunks(text); + for (const page of pages) { + const pageId = sanitizePageId(page.name); + const splitter = new RecursiveMarkdownSplitter({ + ...splitOptions, + idPrefix: `starknet-blog-${pageId}`, + }); + const pageChunks = splitter.splitMarkdownToChunks(page.content); + const pageSource = this.buildSourceLink(page.name); - logger.info( - `Created ${chunks.length} chunks using RecursiveMarkdownSplitter`, - ); + pageChunks.forEach((chunk) => { + const contentHash = calculateHash(chunk.content); + const uniqueId = chunk.meta.uniqueId; + const sourceLink = pageSource; - // Convert chunks to Document format - const localChunks: Document[] = chunks.map((chunk) => { - const contentHash = calculateHash(chunk.content); - - return new Document({ - pageContent: chunk.content, - metadata: { - name: chunk.meta.title, - title: chunk.meta.title, - chunkNumber: chunk.meta.chunkNumber, // Already 0-based - contentHash: contentHash, - uniqueId: chunk.meta.uniqueId, - sourceLink: chunk.meta.sourceLink || this.config.baseUrl, - source: this.source, - }, + logger.debug( + `Creating chunk for ${page.name}: title="${chunk.meta.title || page.name}", sourceLink="${sourceLink}"`, + ); + + chunks.push( + new Document({ + pageContent: chunk.content, + metadata: { + name: page.name, + title: chunk.meta.title || page.name, + chunkNumber: chunk.meta.chunkNumber, + contentHash, + uniqueId, + sourceLink, + source: this.source, + }, + }), + ); }); + } + + logger.info(`Created ${chunks.length} chunks from Starknet blog pages`); + return chunks; + } + + protected getExtractDir(): string { + return getTempDir('starknet-blog'); + } + + protected override async cleanupDownloadedFiles(): Promise { + logger.info('No cleanup needed - Starknet blog crawl is in-memory'); + } + + private buildSourceLink(pageName: string): string { + const baseUrl = this.config.baseUrl.replace(/\/$/, ''); + const trimmed = pageName.replace(/^\/+/, ''); + return trimmed ? `${baseUrl}/${trimmed}` : baseUrl; + } +} + +async function discoverUrls(baseUrl: string): Promise { + const sitemapUrls = await discoverUrlsFromSitemap(baseUrl); + if (sitemapUrls.length > 0) { + return sitemapUrls; + } + + return discoverUrlsByCrawling(baseUrl); +} + +async function discoverUrlsFromSitemap(baseUrl: string): Promise { + const sitemapUrl = new URL('/sitemap.xml', baseUrl).toString(); + logger.info(`Checking for sitemap at ${sitemapUrl}`); + + const urls = await parseSitemap(sitemapUrl); + const base = new URL(baseUrl); + const seen = new Set(); + const validUrls: string[] = []; + + for (const url of urls) { + if (!url) continue; + const processed = processUrl(url, base, baseUrl); + if (!processed || seen.has(processed)) continue; + seen.add(processed); + validUrls.push(processed); + } + + logger.info(`Found ${validUrls.length} valid URLs from sitemap`); + return validUrls; +} + +async function parseSitemap( + sitemapUrl: string, + depth = 0, + visited = new Set(), +): Promise { + if (depth > MAX_SITEMAP_DEPTH) { + logger.warn(`Sitemap recursion depth exceeded for ${sitemapUrl}`); + return []; + } + + const normalizedUrl = sitemapUrl.toLowerCase(); + if (visited.has(normalizedUrl)) { + logger.debug(`Skipping already visited sitemap: ${sitemapUrl}`); + return []; + } + visited.add(normalizedUrl); + + const sitemapContent = await fetchSitemap(sitemapUrl); + if (!sitemapContent) { + return []; + } + + const locMatches = sitemapContent.matchAll(/\s*([^<\s]+)\s*<\/loc>/gi); + const locs = Array.from(locMatches, (match) => decodeXml(match[1] ?? '')); + + if (/sitemapindex/i.test(sitemapContent)) { + const nestedUrls: string[] = []; + for (const loc of locs) { + if (!loc) continue; + const nested = await parseSitemap(loc, depth + 1, visited); + nestedUrls.push(...nested); + } + return nestedUrls; + } + + return locs; +} + +async function fetchSitemap(url: string): Promise { + try { + const response = await axios.get(url, { + headers: { 'User-Agent': USER_AGENT }, + timeout: TIMEOUT_MS, + responseType: 'arraybuffer', + validateStatus: () => true, }); - return localChunks; + if (response.status >= 200 && response.status < 300) { + const contentType = response.headers['content-type'] ?? ''; + const isGzip = /gzip/i.test(contentType) || url.endsWith('.gz'); + const data = response.data; + let buffer: Buffer; + if (Buffer.isBuffer(data)) { + buffer = data; + } else if (typeof data === 'string') { + buffer = Buffer.from(data); + } else { + buffer = Buffer.from(data as ArrayBuffer); + } + if (isGzip) { + try { + return gunzipSync(buffer).toString('utf8'); + } catch (error) { + logger.debug(`Failed to gunzip sitemap ${url}: ${String(error)}`); + return null; + } + } + return buffer.toString('utf8'); + } + } catch (error) { + logger.debug(`Failed to fetch sitemap ${url}: ${String(error)}`); } - /** - * Starknet Blog specific processing based on the pre-summarized markdown file - * @param vectorStore - */ - public override async process( - vectorStore: VectorStore, - options?: VectorStoreUpdateOptions, - ): Promise { - try { - // 1. Read the pre-summarized documentation - const text = await this.readSummaryFile(); + return null; +} - // 2. Create chunks from the documentation - const chunks = await this.chunkSummaryFile(text); +async function discoverUrlsByCrawling(baseUrl: string): Promise { + logger.info('Falling back to crawling for URL discovery'); + const base = new URL(baseUrl); + const visited = new Set(); + const startUrl = processUrl(baseUrl, base, baseUrl); + if (!startUrl) return []; - logger.info( - `Created ${chunks.length} chunks from Starknet blog documentation`, - ); + const queue: string[] = [startUrl]; + visited.add(startUrl); + + while (queue.length > 0 && visited.size < MAX_CRAWL_PAGES) { + const current = queue.shift(); + if (!current) continue; + + const html = await fetchHtml(current); + if (!html) continue; + + const $ = load(html); + $('a[href], link[href]').each((_, element) => { + if (visited.size >= MAX_CRAWL_PAGES) return; + + const href = $(element).attr('href'); + if (!href) return; + + const processed = processUrl(href, base, baseUrl, current); + if (!processed || visited.has(processed)) return; + + visited.add(processed); + queue.push(processed); + }); + } + + logger.info(`Discovered ${visited.size} pages by crawling`); + return Array.from(visited); +} + +function filterUrls(urls: string[], baseUrl: string): string[] { + const base = new URL(baseUrl); + const seen = new Set(); + const filtered: string[] = []; + + for (const url of urls) { + const processed = processUrl(url, base, baseUrl); + if (!processed || !isBlogPostPath(processed, baseUrl)) continue; + if (seen.has(processed)) continue; + seen.add(processed); + filtered.push(processed); + } + + return filtered.sort(); +} + +async function fetchAndProcessPage( + url: string, + baseUrl: string, +): Promise { + const html = await fetchHtml(url); + if (!html) { + logger.debug(`Skipping ${url}: failed to fetch HTML`); + return null; + } - // 3. Update the vector store with the chunks - await this.updateVectorStore(vectorStore, chunks, options); + const { markdown, title, publishedYear } = extractContent(html, url); + + if (!publishedYear || !ALLOWED_YEARS.has(publishedYear)) { + logger.debug(`Skipping ${url}: not a 2025/2026 blog post`); + // Add to cache so we don't re-check this URL on future runs + excludedUrlsCache.add(url); + return null; + } + + const cleaned = cleanBlogMarkdown(markdown); + if (!cleaned || cleaned.length < MIN_MARKDOWN_LENGTH) { + logger.debug(`Skipping ${url}: extracted markdown too small`); + return null; + } + + const content = ensureTitleInMarkdown(title, cleaned); + const pageName = buildPageName(url, baseUrl); + + return { + name: pageName, + content, + }; +} - // 4. Clean up any temporary files (no temp files in this case) - await this.cleanupDownloadedFiles(); +async function fetchHtml(url: string): Promise { + let lastError = 'Unknown error'; + + for (let attempt = 0; attempt < MAX_RETRIES; attempt += 1) { + try { + await waitForGlobalBackoff(); + const response = await axios.get(url, { + headers: { 'User-Agent': USER_AGENT }, + timeout: TIMEOUT_MS, + validateStatus: () => true, + }); + + const contentType = response.headers['content-type'] ?? ''; + if (response.status === 200 && contentType.includes('text/html')) { + await sleep(REQUEST_DELAY_MS + randomJitter(REQUEST_JITTER_MS)); + return response.data as string; + } + + if (response.status === 429 || response.status >= 500) { + lastError = `Status ${response.status}`; + const retryAfter = response.headers['retry-after']; + const delayMs = computeRetryDelay(retryAfter, attempt); + scheduleGlobalBackoff(delayMs); + await sleep(delayMs); + continue; + } + + // Log non-retryable failures (404, 403, etc.) for debugging stale sitemap entries + if (response.status !== 200) { + logger.debug(`Non-retryable status ${response.status} for ${url}`); + } else if (!contentType.includes('text/html')) { + logger.debug(`Non-HTML content-type "${contentType}" for ${url}`); + } + return null; } catch (error) { - this.handleError(error); + lastError = String(error); + const delayMs = computeRetryDelay(undefined, attempt); + scheduleGlobalBackoff(delayMs); + await sleep(delayMs); } } - /** - * Get the directory path for extracting files - * - * @returns string - Path to the extract directory - */ - protected getExtractDir(): string { - const { getTempDir } = require('../utils/paths'); - return getTempDir('starknet-blog'); + logger.debug( + `Failed to fetch ${url} after ${MAX_RETRIES} attempts: ${lastError}`, + ); + return null; +} + +function extractContent( + html: string, + url: string, +): { markdown: string; title: string; publishedYear: number | null } { + const $ = load(html); + + const title = + $('meta[property="og:title"]').attr('content')?.trim() || + $('title').first().text().trim() || + $('h1').first().text().trim() || + url; + + const publishedYear = extractPublishedYearFromDom($); + + $( + 'script, style, noscript, nav, header, footer, aside, img, svg, iframe', + ).remove(); + + // Remove boilerplate elements (but never remove html, head, or body) + $('*') + .not('html, head, body') + .each((_, element) => { + const node = $(element); + const idClass = + `${node.attr('id') ?? ''} ${node.attr('class') ?? ''}`.toLowerCase(); + if (BOILERPLATE_KEYWORDS.some((keyword) => idClass.includes(keyword))) { + node.remove(); + } + }); + + let mainContent: Cheerio | null = null; + + // Try main content selectors first + for (const selector of MAIN_CONTENT_SELECTORS) { + const element = $(selector).first(); + if (element.length && element.text().trim().length > 100) { + mainContent = element; + break; + } } - /** - * Override cleanupDownloadedFiles since we don't download anything - */ - protected override async cleanupDownloadedFiles(): Promise { - // No cleanup needed as we're reading from a local file - logger.info('No cleanup needed - using local summary file'); + // Fallback: find largest content div (excluding nav/sidebar/etc) + if (!mainContent) { + let bestDiv = null; + let bestLength = 0; + $('div').each((_, element) => { + const node = $(element); + const textLength = node.text().trim().length; + if (textLength < 200) return; + + const idClass = + `${node.attr('id') ?? ''} ${node.attr('class') ?? ''}`.toLowerCase(); + if ( + ['nav', 'menu', 'sidebar', 'header', 'footer'].some((kw) => + idClass.includes(kw), + ) + ) { + return; + } + + if (textLength > bestLength) { + bestLength = textLength; + bestDiv = node; + } + }); + mainContent = bestDiv; + } + + // Last resort: use body + mainContent = mainContent ?? $('body').first(); + + const htmlFragment = mainContent ? ($.html(mainContent) ?? '') : ''; + const markdown = NodeHtmlMarkdown.translate(htmlFragment); + const normalizedMarkdown = normalizeMarkdown(markdown); + const finalPublishedYear = + publishedYear ?? extractPublishedYearFromMarkdown(normalizedMarkdown); + + return { + markdown: normalizedMarkdown, + title, + publishedYear: finalPublishedYear, + }; +} + +function extractPublishedYearFromDom( + $: ReturnType, +): number | null { + // Try meta tags + for (const selector of PUBLISHED_META_SELECTORS) { + const year = parseYear($(selector).attr('content')); + if (year) return year; + } + + // Try