diff --git a/.changeset/wide-hairs-fail.md b/.changeset/wide-hairs-fail.md new file mode 100644 index 0000000000..c28dcd3072 --- /dev/null +++ b/.changeset/wide-hairs-fail.md @@ -0,0 +1,5 @@ +--- +"parser": major +--- + +Add a parsing script to the parser app. Store parsed information locally. Sanitize URLs into a filesystem-compatible format for use as file names. diff --git a/apps/parser/.env.default b/apps/parser/.env.default new file mode 100644 index 0000000000..492d2bbc9c --- /dev/null +++ b/apps/parser/.env.default @@ -0,0 +1,9 @@ +# Default environment variables for the parser CLI +# Root URL to start parsing from +URL="https://example.com" + +# Maximum recursion depth (integer or null for unlimited) +DEPTH=null + +# Name of the vector index bucket/folder where parsed artifacts are stored +CHB_INDEX_ID="parser-vector-index-name" diff --git a/apps/parser/README.md b/apps/parser/README.md new file mode 100644 index 0000000000..f53d32acf0 --- /dev/null +++ b/apps/parser/README.md @@ -0,0 +1,92 @@ + +# Parser Utilities + +This package provides a TypeScript CLI tool for recursively crawling a website, extracting structured metadata from each page, and saving the results in a predictable directory structure. + +## Features + +- **Recursive website parsing**: Visits all reachable pages up to a configurable depth. +- **Structured output**: Saves each page's metadata as a JSON file. +- **Configurable**: Reads settings from environment variables or a `.env` file. + +--- + +## Getting Started + +1. **Install dependencies:** + ```bash + npm install + ``` +2. **Type-check the project:** + ```bash + npm run compile + ``` +3. **Build the project:** + ```bash + npm run build + ``` + +--- + +## Usage + +### 1. 
Configure Environment Variables + +You can provide configuration in two ways: + +#### a) Using a `.env` file (recommended) + +Create a `.env` file in the `apps/parser` directory with the following content: + +``` +URL=https://example.com +CHB_INDEX_ID=name_of_your_choice +# DEPTH=2 # Optional, defaults to null +``` + +#### b) Using command-line variables + +```bash +URL=https://example.com DEPTH=2 CHB_INDEX_ID=name_of_your_choice npm run parse +``` + +### 2. Run the Parser + +```bash +npm run parse +``` + +--- + +## Environment Variables + +- **`URL`** (required): The root page to start parsing from. +- **`CHB_INDEX_ID`** (required): The base directory for storing parsed data. Output will be saved as `<CHB_INDEX_ID>/parsing/<sanitized-base-url>/`. +- **`DEPTH`** (optional, default: `null`, i.e. unlimited): Maximum recursion depth for crawling links. + +**Note:** The parser first reads these variables from the environment. If `URL`, `DEPTH`, or `CHB_INDEX_ID` is missing, it loads the missing values from `.env` in the `apps/parser` directory. + +--- + +## Output Structure + +Each visited page is saved as a JSON file: + +``` +<CHB_INDEX_ID>/parsing/<sanitized-base-url>/<sanitized-path>.json +``` + +- `<sanitized-base-url>` and `<sanitized-path>` are filesystem-safe versions of the URL components (illegal characters replaced with `-`). +- This structure ensures output is predictable, easy to diff, and human-readable. + +--- + +## Testing + +Run tests with: + +```bash +npm run test +``` + +Tests will build the project and then execute Jest to ensure the CLI behaves as expected. 
diff --git a/apps/parser/jest.config.ts b/apps/parser/jest.config.ts new file mode 100644 index 0000000000..8d2a799c00 --- /dev/null +++ b/apps/parser/jest.config.ts @@ -0,0 +1,14 @@ +import type { Config } from "jest"; + +const config: Config = { + rootDir: __dirname, + testRegex: "tests/.*\\.test\\.ts$", + transform: { + "^.+\\.ts$": ["ts-jest", { tsconfig: "tsconfig.json" }], + }, + testEnvironment: "node", + clearMocks: true, + verbose: false, +}; + +export default config; diff --git a/apps/parser/package.json b/apps/parser/package.json index 955399a6f3..a54b55c5ce 100644 --- a/apps/parser/package.json +++ b/apps/parser/package.json @@ -1,10 +1,28 @@ { "name": "parser", - "version": "0.1.0", + "version": "1.0.0", "private": true, - "scripts": {}, + "scripts": { + "clean": "shx rm -rf dist", + "compile": "tsc --project tsconfig.json", + "build": "npm run clean && tsc --project tsconfig.build.json", + "parse": "npm run build && node dist/main.js", + "test": "npm run build && jest -i" + }, "dependencies": { - "puppeteer": "^24.37.1" + "node-fetch": "^3.3.2", + "puppeteer": "^24.37.1", + "puppeteer-extra": "^3.3.6", + "puppeteer-extra-plugin-stealth": "^2.11.2", + "xml2js": "^0.6.2" + }, + "devDependencies": { + "@types/jest": "^29.5.1", + "@types/node": "18.16.*", + "@types/xml2js": "^0.4.11", + "jest": "^29.5.0", + "shx": "^0.3.4", + "ts-jest": "^29.1.1", + "typescript": "5.1.6" } } - diff --git a/apps/parser/src/helpers/date-format.ts b/apps/parser/src/helpers/date-format.ts new file mode 100644 index 0000000000..b7dc757633 --- /dev/null +++ b/apps/parser/src/helpers/date-format.ts @@ -0,0 +1,7 @@ +export function toIsoOrNull(value: string | null): string | null { + if (!value) { + return null; + } + const date = new Date(value); + return Number.isNaN(date.getTime()) ? 
null : date.toISOString(); +} diff --git a/apps/parser/src/helpers/metadata-handling.ts b/apps/parser/src/helpers/metadata-handling.ts new file mode 100644 index 0000000000..be36dbb4c7 --- /dev/null +++ b/apps/parser/src/helpers/metadata-handling.ts @@ -0,0 +1,66 @@ +import { ParsedMetadata } from "../modules/types"; +import { toIsoOrNull } from "./date-format"; + +export const extractDocumentMetadata = (): ParsedMetadata => { + const getMeta = (name: string): string | null => { + return ( + document.querySelector(`meta[name="${name}"]`)?.getAttribute("content") || + document + .querySelector(`meta[property="${name}"]`) + ?.getAttribute("content") || + null + ); + }; + const metaTitle = getMeta("og:title") || getMeta("twitter:title"); + const documentTitle = document.title?.trim(); + const normalizedTitle = documentTitle?.length + ? documentTitle + : metaTitle || ""; + const normalizeText = (value: string | null | undefined): string => { + return value ? value.replace(/\s+/g, " ").trim() : ""; + }; + const mainText = normalizeText(document.querySelector("main")?.innerText); + const iframeTexts = Array.from(document.querySelectorAll("iframe")) + .map((frame) => { + try { + return normalizeText(frame.contentDocument?.body?.innerText ?? ""); + } catch (_error) { + return ""; + } + }) + .filter((text) => text.length > 0); + const prioritizedTextParts = [mainText, ...iframeTexts].filter( + (text) => text.length > 0, + ); + const prioritizedText = prioritizedTextParts.join("\n\n").trim(); + const fallbackBody = normalizeText(document.body?.innerText ?? ""); + const bodyText = + prioritizedText.length >= 120 ? 
prioritizedText : fallbackBody; + return { + title: normalizedTitle, + url: window.location.href, + bodyText, + lang: document.documentElement.lang || getMeta("og:locale") || null, + keywords: getMeta("keywords") || getMeta("news_keywords"), + datePublished: + getMeta("article:published_time") || + getMeta("date") || + getMeta("publish-date"), + lastModified: + document.lastModified !== "01/01/1970 00:00:00" + ? document.lastModified + : getMeta("article:modified_time"), + }; +}; + +export function serializeMetadata(raw: ParsedMetadata): ParsedMetadata { + return { + url: raw.url, + title: raw.title, + bodyText: raw.bodyText, + lang: raw.lang, + keywords: raw.keywords, + datePublished: toIsoOrNull(raw.datePublished), + lastModified: toIsoOrNull(raw.lastModified), + }; +} diff --git a/apps/parser/src/helpers/url-handling.ts b/apps/parser/src/helpers/url-handling.ts new file mode 100644 index 0000000000..f10f89f555 --- /dev/null +++ b/apps/parser/src/helpers/url-handling.ts @@ -0,0 +1,209 @@ +import { SanitizeOptions } from "../modules/types"; +import crypto from "crypto"; + +const ILLEGAL_RE = /[\/\?<>\\:\*\|"]/g; +const CONTROL_RE = /[\x00-\x1f\x80-\x9f]/g; +const RESERVED_RE = /^\.+$/; +const WINDOWS_RESERVED_RE = /^(con|prn|aux|nul|com[0-9]|lpt[0-9])$/i; +const WINDOWS_TRAILING_RE = /[\. 
]+$/; +const DEFAULT_REPLACEMENT = "-"; + +let BASE_SCOPE: string; + +export function setBaseScope(scope: string): void { + BASE_SCOPE = scope; +} + +export function sanitizeUrlAsFilename( + url: string, + options?: SanitizeOptions, +): string { + if (!url) { + console.warn( + `Missing input url, sanitizing as default "${DEFAULT_REPLACEMENT}"`, + ); + return DEFAULT_REPLACEMENT; + } + let filenameBase = url; + if (filenameBase === BASE_SCOPE) { + filenameBase = new URL(filenameBase).hostname.replace(/^www\./, ""); + } else { + const pathAndSearch = url.replace(BASE_SCOPE, "").replace(/^\/+/, ""); + if (!pathAndSearch || pathAndSearch === "/") { + filenameBase = url.split("/").filter(Boolean).pop() || url; + } else { + filenameBase = pathAndSearch; + } + } + + const replacement = validReplacementOrDefault( + options?.replacement ?? DEFAULT_REPLACEMENT, + ); + let sanitized = filenameBase + .replace(/\/$/, "") + .replace(ILLEGAL_RE, replacement) + .replace(CONTROL_RE, replacement) + .replace(RESERVED_RE, replacement) + .replace(WINDOWS_RESERVED_RE, replacement) + .replace(WINDOWS_TRAILING_RE, replacement) + .trim(); + if (sanitized.length === 0) { + return replacement; + } + const trimmedName = sanitized.replace(/^[-_]+/, "") || sanitized; + if ( + options?.lengthThreshold && + trimmedName.length > options.lengthThreshold + ) { + const hash = crypto + .createHash("sha1") + .update(url) + .digest("hex") + .slice(0, 9); + const prefix = trimmedName.slice(0, options.lengthThreshold - 10); + return `${prefix}_${hash}`; + } + return trimmedName; +} + +export function sanitizeUrlAsDirectoryName( + url: string, + options?: SanitizeOptions, +): string { + let filenameBase = url; + filenameBase = filenameBase.replace(/^(https?:\/\/)?(www\.)?/, ""); + const replacement = validReplacementOrDefault( + options?.replacement ?? 
DEFAULT_REPLACEMENT, + ); + let sanitized = filenameBase + .replace(/\/$/, "") + .replace(ILLEGAL_RE, replacement) + .replace(CONTROL_RE, replacement) + .replace(RESERVED_RE, replacement) + .replace(WINDOWS_RESERVED_RE, replacement) + .replace(WINDOWS_TRAILING_RE, replacement) + .trim(); + if (sanitized.length === 0) { + return replacement; + } + const trimmedName = sanitized.replace(/^[-_]+/, "") || sanitized; + if ( + options?.lengthThreshold && + trimmedName.length > options.lengthThreshold + ) { + const hash = crypto + .createHash("sha1") + .update(url) + .digest("hex") + .slice(0, 9); + const prefix = trimmedName.slice(0, options.lengthThreshold - 10); + return `${prefix}_${hash}`; + } + return trimmedName; +} + +function validReplacementOrDefault(candidate: string): string { + if (!candidate) { + console.warn( + `Missing replacement character, using default "${DEFAULT_REPLACEMENT}"`, + ); + return DEFAULT_REPLACEMENT; + } + if ( + /[\/\?<>\\:\*\|"]/u.test(candidate) || + /[\x00-\x1f\x80-\x9f]/u.test(candidate) + ) { + console.warn( + `Invalid replacement character: "${candidate}", using default "${DEFAULT_REPLACEMENT}"`, + ); + return DEFAULT_REPLACEMENT; + } + return candidate; +} + +export const RemoveAnchorsFromUrl = (rawUrl: string): string => { + try { + const parsed = new URL(rawUrl); + parsed.hash = ""; + if (parsed.pathname.length > 1 && parsed.pathname.endsWith("/")) { + parsed.pathname = parsed.pathname.slice(0, -1); + } + const serialized = parsed.toString(); + if (parsed.pathname === "/" && !parsed.search) { + return serialized.endsWith("/") ? 
serialized.slice(0, -1) : serialized; + } + return serialized; + } catch (error) { + console.warn(`Failed to parse URL: ${rawUrl}`, error); + return rawUrl; + } +}; + +export function deriveSubPath( + targetUrl: string, + baseUrl: string, + sanitizedBaseUrl: string, +): string { + const base = new URL(baseUrl); + const target = new URL(targetUrl); + let relPath = target.pathname; + if (base.pathname !== "/" && relPath.startsWith(base.pathname)) { + relPath = relPath.slice(base.pathname.length); + if (!relPath.startsWith("/")) relPath = "/" + relPath; + } + if ( + RemoveAnchorsFromUrl(targetUrl) === sanitizedBaseUrl || + relPath === "/" || + relPath === "" + ) { + return "/"; + } + return `${relPath}${target.search}${target.hash}` || "/"; +} + +export function isWithinScope( + url: string, + scope: string, + validDomainVariants: string[] = [], +): boolean { + if (!scope) { + return true; + } + // TODO: This function could be generalized to better handle edge cases. For now it performs a basic check to see if the URL is within the same domain or valid subdomain variants as the scope. 
+ try { + const urlObj = new URL(url); + const scopeObj = new URL(scope); + const urlDomain = urlObj.hostname.replace(/^www\./, ""); + const scopeDomain = scopeObj.hostname.replace(/^www\./, ""); + if (urlDomain === scopeDomain) { + return true; + } + const urlParts = urlDomain.split("."); + const scopeParts = scopeDomain.split("."); + if (urlParts.length > scopeParts.length) { + const subdomain = urlParts[0]; + const domainWithoutSubdomain = urlParts.slice(1).join("."); + if ( + domainWithoutSubdomain === scopeDomain && + validDomainVariants.includes(subdomain) + ) { + return true; + } + } + return false; + } catch (_error) { + return false; + } +} + +export function buildVisitKey(rawUrl: string): string { + try { + const url = new URL(rawUrl); + url.hash = ""; + url.hostname = url.hostname.replace(/^www\./, ""); + return RemoveAnchorsFromUrl(url.toString()); + } catch (error) { + console.warn(`Failed to build visit key for URL: ${rawUrl}`, error); + return rawUrl; + } +} diff --git a/apps/parser/src/main.ts b/apps/parser/src/main.ts new file mode 100644 index 0000000000..93356fc65a --- /dev/null +++ b/apps/parser/src/main.ts @@ -0,0 +1,43 @@ +import puppeteer from "puppeteer-extra"; +import StealthPlugin from "puppeteer-extra-plugin-stealth"; +import { resolveEnv } from "./modules/config"; +import { ensureDirectory } from "./modules/output"; +import { handleError } from "./modules/errors"; +import { exploreAndParsePages } from "./modules/parser"; +import { ParsedNode, ParsedMetadata } from "./modules/types"; +import { + RemoveAnchorsFromUrl, + buildVisitKey, + setBaseScope, +} from "./helpers/url-handling"; +import { assertReachable } from "./modules/network"; + +puppeteer.use(StealthPlugin()); + +const env = resolveEnv(); +const parsedPages = new Map(); +const scheduledPages = new Set(); +export const OUTPUT_DIRECTORY = env.outputDirectory; +export const MAX_DEPTH = env.maxDepth; +export const BASE_SCOPE = RemoveAnchorsFromUrl(env.baseUrl); +export const 
BASE_HOST_TOKEN = new URL(env.baseUrl).hostname + .replace(/^www\./, "") + .toLowerCase(); +export const VALID_DOMAIN_VARIANTS = env.validDomainVariants || []; + +setBaseScope(BASE_SCOPE); + +void (async () => { + try { + await assertReachable(env.baseUrl); + ensureDirectory(env.outputDirectory); + const browser = await puppeteer.launch({ headless: true }); + const root: ParsedNode = { url: env.baseUrl }; + scheduledPages.add(buildVisitKey(env.baseUrl)); + await exploreAndParsePages(browser, root, 0, parsedPages, scheduledPages); + await browser.close(); + console.log(`Parsing complete! Data saved to ${env.outputDirectory}`); + } catch (error) { + handleError(error); + } +})(); diff --git a/apps/parser/src/modules/config.ts b/apps/parser/src/modules/config.ts new file mode 100644 index 0000000000..dd535992c8 --- /dev/null +++ b/apps/parser/src/modules/config.ts @@ -0,0 +1,70 @@ +import path from "node:path"; +import { EnvConfig } from "./types"; +import { + RemoveAnchorsFromUrl, + sanitizeUrlAsDirectoryName, +} from "../helpers/url-handling"; +import * as dotenv from "dotenv"; + +const DEFAULT_DEPTH = null; + +export function resolveEnv(): EnvConfig { + let baseUrl = process.env.URL?.trim(); + let depth = process.env.DEPTH?.trim(); + let vectorIndexName = process.env.CHB_INDEX_ID?.trim(); + let validDomainVariants = process.env.validDomainVariants?.trim(); + if (!baseUrl || !depth || !vectorIndexName) { + const parserHome = path.resolve(__dirname, "../../"); + dotenv.config({ path: path.join(parserHome, ".env") }); + baseUrl = baseUrl || process.env.URL?.trim(); + depth = depth || process.env.DEPTH?.trim(); + vectorIndexName = vectorIndexName || process.env.CHB_INDEX_ID?.trim(); + validDomainVariants = + validDomainVariants || process.env.validDomainVariants?.trim(); + } + if (!baseUrl) { + throw new Error( + "Missing required URL. 
Set URL in environment or .env file.", + ); + } + const sanitizedBaseUrl = RemoveAnchorsFromUrl(baseUrl); + const parsedDepth = Number.parseInt(depth ?? `${DEFAULT_DEPTH}`, 10); + const maxDepth = Number.isNaN(parsedDepth) + ? DEFAULT_DEPTH + : Math.max(parsedDepth, 0); + const outputDirectory = generateOutputDirectoryPath( + vectorIndexName, + sanitizedBaseUrl, + ); + let parsedValidDomainVariants: string[] = []; + if (validDomainVariants) { + try { + parsedValidDomainVariants = JSON.parse(validDomainVariants); + if (!Array.isArray(parsedValidDomainVariants)) { + parsedValidDomainVariants = []; + } + } catch (_error) { + parsedValidDomainVariants = []; + } + } + return { + baseUrl, + sanitizedBaseUrl, + outputDirectory, + maxDepth, + validDomainVariants: parsedValidDomainVariants, + }; +} + +function generateOutputDirectoryPath( + vectorIndexName: string | undefined, + sanitizedBaseUrl: string, +): string { + const safeBaseSegment = sanitizeUrlAsDirectoryName(sanitizedBaseUrl, { + replacement: "-", + }); + if (!vectorIndexName) { + return `output/${safeBaseSegment}`; + } + return path.join(vectorIndexName, "parsing", safeBaseSegment); +} diff --git a/apps/parser/src/modules/dom-actions.ts b/apps/parser/src/modules/dom-actions.ts new file mode 100644 index 0000000000..5779726f72 --- /dev/null +++ b/apps/parser/src/modules/dom-actions.ts @@ -0,0 +1,44 @@ +import type { Page } from "puppeteer"; + +const TOGGLE_SELECTORS = [ + "[data-toggle]", + '[data-testid="accordion-toggle"]', + "[aria-expanded]", + ".accordion button", + ".accordion-toggle", + ".accordion-trigger", + ".faq-item button", + ".collapse-toggle", + ".MuiButtonBase-root[aria-expanded]", +]; + +export async function expandInteractiveSections(page: Page): Promise { + await page.evaluate((selectors) => { + document.querySelectorAll("details").forEach((element) => { + (element as HTMLDetailsElement).open = true; + }); + selectors.forEach((selector) => { + document.querySelectorAll(selector).forEach((node) 
=> { + const target = node as HTMLElement; + if (!target || target.getAttribute("data-expanded") === "true") { + return; + } + const ariaExpanded = target.getAttribute("aria-expanded"); + const isToggleButton = + target.tagName === "BUTTON" || + target.getAttribute("role") === "button"; + const isCollapsed = + ariaExpanded === "false" || target.classList.contains("collapsed"); + const shouldClick = + (isToggleButton && ariaExpanded !== "true") || + isCollapsed || + selector === "[data-toggle]"; + if (shouldClick) { + target.click(); + target.setAttribute("data-expanded", "true"); + } + }); + }); + }, TOGGLE_SELECTORS); + await new Promise((resolve) => setTimeout(resolve, 250)); +} diff --git a/apps/parser/src/modules/errors.ts b/apps/parser/src/modules/errors.ts new file mode 100644 index 0000000000..094c3ad6b9 --- /dev/null +++ b/apps/parser/src/modules/errors.ts @@ -0,0 +1,4 @@ +export function handleError(error: unknown) { + console.error("Parser terminated with an error:", error); + process.exitCode = 1; +} diff --git a/apps/parser/src/modules/network.ts b/apps/parser/src/modules/network.ts new file mode 100644 index 0000000000..e5e06f2d56 --- /dev/null +++ b/apps/parser/src/modules/network.ts @@ -0,0 +1,47 @@ +const REQUEST_TIMEOUT_MS = 10_000; + +export async function assertReachable( + url: string, + timeoutMs: number = REQUEST_TIMEOUT_MS, +): Promise { + const controller = new AbortController(); + const timeoutId = setTimeout(() => controller.abort(), timeoutMs); + try { + const res = await fetch(url, { + method: "GET", + headers: { + "User-Agent": + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", + }, + signal: controller.signal, + }); + const text = await res.text(); + if (isCloudflareChallenge(text)) { + console.warn( + "Cloudflare protection detected, skipping reachability check.", + ); + return; + } + if (!res.ok && res.status !== 405) { + throw new Error(`Status ${res.status}`); + } + 
} catch (error) { + if ((error as Error).name === "AbortError") { + throw new Error(`Target ${url} is unreachable: request timed out`); + } + throw new Error( + `Target ${url} is unreachable: ${(error as Error).message}`, + ); + } finally { + clearTimeout(timeoutId); + } +} + +function isCloudflareChallenge(html: string): boolean { + return /cloudflare|just a moment|verify you are human/i.test(html); +} + +async function fetch(input: any, init?: any): Promise { + const { default: nodeFetch } = await import("node-fetch"); + return nodeFetch(input, init); +} diff --git a/apps/parser/src/modules/output.ts b/apps/parser/src/modules/output.ts new file mode 100644 index 0000000000..0829b5baec --- /dev/null +++ b/apps/parser/src/modules/output.ts @@ -0,0 +1,29 @@ +import { mkdirSync } from "node:fs"; +import { writeFile } from "node:fs/promises"; +import { join } from "node:path"; +import { ParsedMetadata } from "./types"; +import { sanitizeUrlAsFilename } from "../helpers/url-handling"; +import { BASE_HOST_TOKEN } from "../main"; + +const FILENAME_LENGTH_THRESHOLD = 250; + +export function ensureDirectory(dir: string) { + mkdirSync(dir, { recursive: true }); +} + +export async function persistSnapshot( + snapshot: ParsedMetadata, + outputDirectory: string, +): Promise { + let finalName = sanitizeUrlAsFilename(snapshot.url, { + lengthThreshold: FILENAME_LENGTH_THRESHOLD, + }); + if (finalName.replace(/^www\./, "") === BASE_HOST_TOKEN) { + finalName = "index"; + } + await saveMetadata(outputDirectory, `${finalName}.json`, snapshot); +} + +async function saveMetadata(dir: string, filename: string, metadata: object) { + await writeFile(join(dir, filename), JSON.stringify(metadata, null, 2)); +} diff --git a/apps/parser/src/modules/parser.ts b/apps/parser/src/modules/parser.ts new file mode 100644 index 0000000000..83cc3c85ec --- /dev/null +++ b/apps/parser/src/modules/parser.ts @@ -0,0 +1,164 @@ +import { Browser, Page } from "puppeteer"; +import { ParsedNode, ParsedMetadata 
} from "./types"; +import { + RemoveAnchorsFromUrl, + isWithinScope, + buildVisitKey, +} from "../helpers/url-handling"; +import { expandInteractiveSections } from "./dom-actions"; +import { persistSnapshot } from "./output"; +import { + OUTPUT_DIRECTORY, + MAX_DEPTH, + VALID_DOMAIN_VARIANTS, + BASE_HOST_TOKEN, + BASE_SCOPE, +} from "../main"; +import { + extractDocumentMetadata, + serializeMetadata, +} from "../helpers/metadata-handling"; + +const NAVIGATION_TIMEOUT_MS = 30_000; +const PAGE_NAVIGATION_OPTIONS = { + waitUntil: "networkidle2" as const, +}; + +export async function exploreAndParsePages( + browser: Browser, + node: ParsedNode, + depth: number, + parsedPages: Map, + scheduledPages: Set, +): Promise { + const visitKey = buildVisitKey(node.url); + scheduledPages.delete(visitKey); + if (parsedPages.has(visitKey) || (MAX_DEPTH !== null && depth > MAX_DEPTH)) { + return; + } + const normalizedUrl = RemoveAnchorsFromUrl(node.url); + if (!isWithinScope(normalizedUrl, BASE_SCOPE, VALID_DOMAIN_VARIANTS)) { + return; + } + const metadata = await generatePageParsedMetadata(browser, node.url); + if (!metadata) return; + parsedPages.set(visitKey, metadata); + node.title = metadata.title; + node.bodyText = metadata.bodyText; + node.lang = metadata.lang; + node.keywords = metadata.keywords; + node.datePublished = metadata.datePublished; + node.lastModified = metadata.lastModified; + let page; + let anchors: string[] = []; + try { + page = await browser.newPage(); + await page.goto(node.url, { + ...PAGE_NAVIGATION_OPTIONS, + timeout: NAVIGATION_TIMEOUT_MS, + }); + await expandInteractiveSections(page); + anchors = (await page.evaluate((allowedToken: string) => { + const anchors = Array.from(document.querySelectorAll("a[href]")); + const iframeSources = Array.from( + document.querySelectorAll("iframe[src]"), + ); + const unique = new Set(); + for (const anchor of anchors) { + const href = (anchor as HTMLAnchorElement).href; + if (!href || !href.startsWith("http")) 
continue; + try { + const target = new URL(href, window.location.href); + const normalizedHref = target.href.toLowerCase(); + if (allowedToken && !normalizedHref.includes(allowedToken)) continue; + if (target.href === window.location.href) continue; + unique.add(target.href); + } catch (error) { + console.warn(`Failed to parse anchor href: ${href}`, error); + } + } + for (const frame of iframeSources) { + const src = (frame as HTMLIFrameElement).src; + if (!src || !src.startsWith("http")) { + continue; + } + try { + const target = new URL(src, window.location.href); + const normalizedSrc = target.href.toLowerCase(); + if (allowedToken && !normalizedSrc.includes(allowedToken)) continue; + unique.add(target.href); + } catch (error) { + console.warn(`Failed to parse iframe src: ${src}`, error); + } + } + return Array.from(unique); + }, BASE_HOST_TOKEN)) as string[]; + } catch (error) { + console.warn(`Failed to extract anchors from ${node.url}`, error); + } finally { + if (page) await page.close(); + } + const nextChildren: ParsedNode[] = []; + let newLinksCount = 0; + for (const href of anchors) { + const normalized = RemoveAnchorsFromUrl(href); + const visitCandidate = buildVisitKey(normalized); + if (parsedPages.has(visitCandidate) || scheduledPages.has(visitCandidate)) + continue; + const lowerNormalized = normalized.toLowerCase(); + if (BASE_HOST_TOKEN && !lowerNormalized.includes(BASE_HOST_TOKEN)) { + continue; + } + if (!isWithinScope(normalized, BASE_SCOPE, VALID_DOMAIN_VARIANTS)) { + continue; + } + scheduledPages.add(visitCandidate); + newLinksCount += 1; + nextChildren.push({ url: normalized }); + } + node.children = nextChildren; + const totalKnown = parsedPages.size + scheduledPages.size; + console.log( + `Completed parsing of page ${ + node.url + }. Found ${newLinksCount} new links. 
Progress: ${ + parsedPages.size + }/${totalKnown} (${((parsedPages.size / totalKnown) * 100).toFixed(2)}%)`, + ); + if (!node.children || (MAX_DEPTH !== null && depth >= MAX_DEPTH)) return; + for (const child of node.children) { + await exploreAndParsePages( + browser, + child, + depth + 1, + parsedPages, + scheduledPages, + ); + } +} + +async function generatePageParsedMetadata( + browser: Browser, + url: string, +): Promise { + let page: Page | undefined; + try { + page = await browser.newPage(); + await page.goto(url, { + waitUntil: "networkidle2", + timeout: NAVIGATION_TIMEOUT_MS, + }); + await expandInteractiveSections(page); + const rawMetadata = await page.evaluate(extractDocumentMetadata); + const snapshot = serializeMetadata(rawMetadata); + await persistSnapshot(snapshot, OUTPUT_DIRECTORY); + return snapshot; + } catch (error) { + console.error(`Error while parsing ${url}:`, (error as Error).message); + return null; + } finally { + if (page) { + await page.close(); + } + } +} diff --git a/apps/parser/src/modules/types.ts b/apps/parser/src/modules/types.ts new file mode 100644 index 0000000000..d86044201e --- /dev/null +++ b/apps/parser/src/modules/types.ts @@ -0,0 +1,33 @@ +export type EnvConfig = { + readonly baseUrl: string; + readonly sanitizedBaseUrl: string; + readonly outputDirectory: string; + readonly maxDepth: number | null; + readonly validDomainVariants?: string[]; +}; + +export type SanitizeOptions = { + readonly replacement?: string; + readonly lengthThreshold?: number; +}; + +export type ParsedMetadata = { + readonly url: string; + readonly title: string; + readonly bodyText: string; + readonly lang: string | null; + readonly keywords: string | null; + readonly datePublished: string | null; + readonly lastModified: string | null; +}; + +export type ParsedNode = { + readonly url: string; + title?: string; + bodyText?: string; + lang?: string | null; + keywords?: string | null; + datePublished?: string | null; + lastModified?: string | null; + 
children?: ParsedNode[]; +}; diff --git a/apps/parser/tests/parser.error-handling.test.ts b/apps/parser/tests/parser.error-handling.test.ts new file mode 100644 index 0000000000..95f34c29d8 --- /dev/null +++ b/apps/parser/tests/parser.error-handling.test.ts @@ -0,0 +1,56 @@ +import { execFile, ExecFileException } from "node:child_process"; +import path from "node:path"; +import { promisify } from "node:util"; + +type ExecFileResult = { + readonly stdout: string; + readonly stderr: string; +}; + +const execFileAsync = promisify(execFile); +const parserScript = path.resolve(__dirname, "../dist/parser.js"); +const nonExistingHost = "http://nonexistent-url-1234567890.com"; +const unreachableHost = "http://127.0.0.1:9"; + +jest.setTimeout(60_000); + +describe("Parser error handling", () => { + it("handles non-resolving URLs gracefully", async () => { + const result = await captureParserError(nonExistingHost); + console.log("Non-resolving URL result:", result); + expect(result).not.toBe("Parser unexpectedly succeeded"); + expect( + /ENOTFOUND|EAI_AGAIN|getaddrinfo|unreachable|error/i.test(result), + ).toBe(true); + }); + + it("handles unreachable hosts gracefully", async () => { + const result = await captureParserError(unreachableHost); + console.log("Unreachable host result:", result); + expect(result).not.toBe("Parser unexpectedly succeeded"); + expect(/ECONNREFUSED|connect|unreachable|error/i.test(result)).toBe(true); + }); +}); + +async function captureParserError(url: string): Promise { + try { + await execFileAsync("node", [parserScript], { + env: { + ...process.env, + URL: url, + CHB_INDEX_ID: "test-parseer-vector-index", + }, + timeout: 30_000, + }); + return "Parser unexpectedly succeeded"; + } catch (error) { + const execError = error as ExecFileException & ExecFileResult; + if (typeof execError.stderr === "string" && execError.stderr.length > 0) { + return execError.stderr; + } + if (execError.code) { + return `${execError.code}`; + } + return (error as 
Error).message || "Unknown error"; + } +} diff --git a/apps/parser/tests/parser.sanitize-filename.test.ts b/apps/parser/tests/parser.sanitize-filename.test.ts new file mode 100644 index 0000000000..4f3af145bc --- /dev/null +++ b/apps/parser/tests/parser.sanitize-filename.test.ts @@ -0,0 +1,106 @@ +import { sanitizeUrlAsFilename } from "../src/helpers/url-handling"; + +describe("sanitizeUrlAsFilename", () => { + it("sanitizes URL paths with query strings and fragments", () => { + expect( + sanitizeUrlAsFilename("/docs/guide?version=1.0&lang=en#section"), + ).toBe("docs-guide-version=1.0&lang=en#section"); + }); + + it("sanitizes URL paths with special characters", () => { + expect(sanitizeUrlAsFilename("/api/v1/users:profile")).toBe( + "api-v1-users-profile", + ); + }); + + it("sanitizes control characters in URL paths", () => { + expect(sanitizeUrlAsFilename("/path\u0000with\u0001control.html")).toBe( + "path-with-control.html", + ); + }); + + it('replaces reserved names "." and ".."', () => { + expect(sanitizeUrlAsFilename(".")).toBe("-"); + expect(sanitizeUrlAsFilename("..")).toBe("-"); + }); + + it("replaces Windows reserved names", () => { + const reserved = [ + "con", + "prn", + "aux", + "nul", + "COM1", + "LPT1", + "com9", + "lpt9", + ]; + for (const name of reserved) { + expect(sanitizeUrlAsFilename(name)).toBe("-"); + expect(sanitizeUrlAsFilename(name.toUpperCase())).toBe("-"); + } + }); + + it("returns default replacement for empty input", () => { + expect(sanitizeUrlAsFilename("")).toBe("-"); + expect(sanitizeUrlAsFilename(null as unknown as string)).toBe("-"); + expect(sanitizeUrlAsFilename(undefined as unknown as string)).toBe("-"); + }); + + it("uses custom replacement option for URL paths", () => { + expect(sanitizeUrlAsFilename("/api/users/123", { replacement: "_" })).toBe( + "api_users_123", + ); + }); + + it("falls back to default replacement for invalid replacement chars", () => { + expect( + sanitizeUrlAsFilename("/docs/guide?ref=x", { 
replacement: "/" }), + ).toBe("docs-guide-ref=x"); + expect( + sanitizeUrlAsFilename("/docs/guide", { replacement: "\u0000" }), + ).toBe("docs-guide"); + }); + + it("appends hash suffix for URLs exceeding length threshold", () => { + const longUrl = "/docs/" + "very-long-path-segment/".repeat(20); + const result = sanitizeUrlAsFilename(longUrl, { + lengthThreshold: 255, + }); + expect(result.length).toBe(255); + }); + + it("trims leading dashes from URL paths", () => { + expect(sanitizeUrlAsFilename("/////path")).toBe("path"); + }); + + it("returns unchanged valid URL path", () => { + expect(sanitizeUrlAsFilename("api-reference.html")).toBe( + "api-reference.html", + ); + }); + + it("ignores illegal characters in replacement option", () => { + expect(sanitizeUrlAsFilename("/docs/guide", { replacement: "*" })).toBe( + "docs-guide", + ); + }); + + it("ignores control chars in replacement option", () => { + expect( + sanitizeUrlAsFilename("/docs/guide", { replacement: "\u0000" }), + ).toBe("docs-guide"); + }); + + it("sanitizes URL with subdomain path", () => { + expect(sanitizeUrlAsFilename("showcase.example.com/docs-guide")).toBe( + "showcase.example.com-docs-guide", + ); + }); + + it("handles URL with numbers and hyphens", () => { + expect(sanitizeUrlAsFilename("/api/v2/resource-123")).toBe( + "api-v2-resource-123", + ); + }); +}); diff --git a/apps/parser/tsconfig.build.json b/apps/parser/tsconfig.build.json new file mode 100644 index 0000000000..7d00017721 --- /dev/null +++ b/apps/parser/tsconfig.build.json @@ -0,0 +1,12 @@ +{ + "extends": "./tsconfig.json", + "compilerOptions": { + "noEmit": false, + "outDir": "dist", + "rootDir": "src", + "types": ["node"], + "sourceMap": true, + "declaration": false + }, + "include": ["src"] +} diff --git a/apps/parser/tsconfig.json b/apps/parser/tsconfig.json new file mode 100644 index 0000000000..59418dfa05 --- /dev/null +++ b/apps/parser/tsconfig.json @@ -0,0 +1,16 @@ +{ + "extends": "../../tsconfig.json", + 
"compilerOptions": { + "target": "es2021", + "module": "commonjs", + "moduleResolution": "node", + "lib": ["es2022", "dom"], + "esModuleInterop": true, + "strict": true, + "noEmit": true, + "resolveJsonModule": true, + "types": ["node", "jest"], + "forceConsistentCasingInFileNames": true + }, + "include": ["src", "tests"] +} diff --git a/package-lock.json b/package-lock.json index ef7082ee78..45a9e9bf87 100644 --- a/package-lock.json +++ b/package-lock.json @@ -640,9 +640,56 @@ "license": "MIT" }, "apps/parser": { - "version": "0.1.0", + "version": "1.0.0", "dependencies": { - "puppeteer": "^24.37.1" + "node-fetch": "^3.3.2", + "puppeteer": "^24.37.1", + "puppeteer-extra": "^3.3.6", + "puppeteer-extra-plugin-stealth": "^2.11.2", + "xml2js": "^0.6.2" + }, + "devDependencies": { + "@types/jest": "^29.5.1", + "@types/node": "18.16.*", + "@types/xml2js": "^0.4.11", + "jest": "^29.5.0", + "shx": "^0.3.4", + "ts-jest": "^29.1.1", + "typescript": "5.1.6" + } + }, + "apps/parser/node_modules/@types/node": { + "version": "18.16.20", + "resolved": "https://registry.npmjs.org/@types/node/-/node-18.16.20.tgz", + "integrity": "sha512-nL54VfDjThdP2UXJXZao5wp76CDiDw4zSRO8d4Tk7UgDqNKGKVEQB0/t3ti63NS+YNNkIQDvwEAF04BO+WYu7Q==", + "dev": true, + "license": "MIT" + }, + "apps/parser/node_modules/data-uri-to-buffer": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/data-uri-to-buffer/-/data-uri-to-buffer-4.0.1.tgz", + "integrity": "sha512-0R9ikRb668HB7QDxT1vkpuUBtqc53YyAwMwGeUFKRojY/NWKvdZ+9UYtRfGmhqNbRkTSVpMbmyhXipFFv2cb/A==", + "license": "MIT", + "engines": { + "node": ">= 12" + } + }, + "apps/parser/node_modules/node-fetch": { + "version": "3.3.2", + "resolved": "https://registry.npmjs.org/node-fetch/-/node-fetch-3.3.2.tgz", + "integrity": "sha512-dRB78srN/l6gqWulah9SrxeYnxeddIG30+GOqK/9OlLVyLg3HPnr6SqOWTWOXKRwC2eGYCkZ59NNuSgvSrpgOA==", + "license": "MIT", + "dependencies": { + "data-uri-to-buffer": "^4.0.0", + "fetch-blob": "^3.1.4", + "formdata-polyfill": 
"^4.0.10" + }, + "engines": { + "node": "^12.20.0 || ^14.13.1 || >=16.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/node-fetch" } }, "apps/storybook-app": { @@ -22126,6 +22173,16 @@ "dev": true, "license": "MIT" }, + "node_modules/@types/xml2js": { + "version": "0.4.14", + "resolved": "https://registry.npmjs.org/@types/xml2js/-/xml2js-0.4.14.tgz", + "integrity": "sha512-4YnrRemBShWRO2QjvUin8ESA41rH+9nQGLUGZV/1IDhi3SL9OhdpNC/MrulTWuptXKwhx/aDxE7toV0f/ypIXQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/node": "*" + } + }, "node_modules/@types/yargs": { "version": "17.0.35", "resolved": "https://registry.npmjs.org/@types/yargs/-/yargs-17.0.35.tgz", @@ -23438,6 +23495,15 @@ "node": ">= 0.4" } }, + "node_modules/arr-union": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/arr-union/-/arr-union-3.1.0.tgz", + "integrity": "sha512-sKpyeERZ02v1FeCZT8lrfJq5u6goHCtpTAzPwJYe7c8SPFOboNjNg1vz2L4VTn9T4PQxEx13TbXLmYUcS6Ug7Q==", + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/array-buffer-byte-length": { "version": "1.0.2", "resolved": "https://registry.npmjs.org/array-buffer-byte-length/-/array-buffer-byte-length-1.0.2.tgz", @@ -25359,6 +25425,22 @@ "node": ">=12" } }, + "node_modules/clone-deep": { + "version": "0.2.4", + "resolved": "https://registry.npmjs.org/clone-deep/-/clone-deep-0.2.4.tgz", + "integrity": "sha512-we+NuQo2DHhSl+DP6jlUiAhyAjBQrYnpOk15rN6c6JSPScjiCLh8IbSU+VTcph6YS3o7mASE8a0+gbZ7ChLpgg==", + "license": "MIT", + "dependencies": { + "for-own": "^0.1.3", + "is-plain-object": "^2.0.1", + "kind-of": "^3.0.2", + "lazy-cache": "^1.0.3", + "shallow-clone": "^0.1.2" + }, + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/cloudfront-functions": { "resolved": "apps/cloudfront-functions", "link": true @@ -29139,6 +29221,29 @@ } } }, + "node_modules/fetch-blob": { + "version": "3.2.0", + "resolved": 
"https://registry.npmjs.org/fetch-blob/-/fetch-blob-3.2.0.tgz", + "integrity": "sha512-7yAQpD2UMJzLi1Dqv7qFYnPbaPx7ZfFK6PiIxQ4PfkGPyNyl2Ugx+a/umUonmKqjhM4DnfbMvdX6otXq83soQQ==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/jimmywarting" + }, + { + "type": "paypal", + "url": "https://paypal.me/jimmywarting" + } + ], + "license": "MIT", + "dependencies": { + "node-domexception": "^1.0.0", + "web-streams-polyfill": "^3.0.3" + }, + "engines": { + "node": "^12.20 || >= 14.13" + } + }, "node_modules/fflate": { "version": "0.7.3", "resolved": "https://registry.npmjs.org/fflate/-/fflate-0.7.3.tgz", @@ -29394,6 +29499,27 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/for-in": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/for-in/-/for-in-1.0.2.tgz", + "integrity": "sha512-7EwmXrOjyL+ChxMhmG5lnW9MPt1aIeZEwKhQzoBUdTV0N3zuwWDZYVJatDvZ2OyzPUvdIAZDsCetk3coyMfcnQ==", + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/for-own": { + "version": "0.1.5", + "resolved": "https://registry.npmjs.org/for-own/-/for-own-0.1.5.tgz", + "integrity": "sha512-SKmowqGTJoPzLO1T0BBJpkfp3EMacCMOuH40hOUbrbzElVktk4DioXVM99QkLCyKoiuOmyjgcWMpVz2xjE7LZw==", + "license": "MIT", + "dependencies": { + "for-in": "^1.0.1" + }, + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/foreground-child": { "version": "3.3.1", "resolved": "https://registry.npmjs.org/foreground-child/-/foreground-child-3.3.1.tgz", @@ -29580,6 +29706,18 @@ "node": ">=0.4.x" } }, + "node_modules/formdata-polyfill": { + "version": "4.0.10", + "resolved": "https://registry.npmjs.org/formdata-polyfill/-/formdata-polyfill-4.0.10.tgz", + "integrity": "sha512-buewHzMvYL29jdeQTVILecSaZKnt/RJWjoZCF5OW60Z67/GmSLBkOFM7qh1PI3zFNtJbaZL5eQu1vLfazOwj4g==", + "license": "MIT", + "dependencies": { + "fetch-blob": "^3.1.2" + }, + "engines": { + "node": ">=12.20.0" + } + }, "node_modules/forwarded": { "version": "0.2.0", "resolved": 
"https://registry.npmjs.org/forwarded/-/forwarded-0.2.0.tgz", @@ -30941,6 +31079,12 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/is-buffer": { + "version": "1.1.6", + "resolved": "https://registry.npmjs.org/is-buffer/-/is-buffer-1.1.6.tgz", + "integrity": "sha512-NcdALwpXkTm5Zvvbk7owOUSvVvBKDgKP5/ewfXEznmQFfs4ZRmanOeKBTjRVjka3QFoN6XJ+9F3USqfHqTaU5w==", + "license": "MIT" + }, "node_modules/is-bun-module": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/is-bun-module/-/is-bun-module-2.0.0.tgz", @@ -31036,6 +31180,15 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/is-extendable": { + "version": "0.1.1", + "resolved": "https://registry.npmjs.org/is-extendable/-/is-extendable-0.1.1.tgz", + "integrity": "sha512-5BMULNob1vgFX6EjQw5izWDxrecWK9AM72rugNr0TFldMOi0fj6Jk+zeKIt0xGj4cEfQIJth4w3OKWOJ4f+AFw==", + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/is-extglob": { "version": "2.1.1", "resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-2.1.1.tgz", @@ -31213,6 +31366,18 @@ "node": ">=8" } }, + "node_modules/is-plain-object": { + "version": "2.0.4", + "resolved": "https://registry.npmjs.org/is-plain-object/-/is-plain-object-2.0.4.tgz", + "integrity": "sha512-h5PpgXkWitc38BBMYawTYMWJHFZJVnBquFE57xFpjB8pJFiF6gZ+bU+WyI/yqXiFR5mdLsgYNaPe8uao6Uv9Og==", + "license": "MIT", + "dependencies": { + "isobject": "^3.0.1" + }, + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/is-potential-custom-element-name": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/is-potential-custom-element-name/-/is-potential-custom-element-name-1.0.1.tgz", @@ -31421,6 +31586,15 @@ "integrity": "sha512-RHxMLp9lnKHGHRng9QFhRCMbYAcVpn69smSGcq3f36xjgVVWThj4qqLbTLlq7Ssj8B+fIQ1EuCEGI2lKsyQeIw==", "license": "ISC" }, + "node_modules/isobject": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/isobject/-/isobject-3.0.1.tgz", + "integrity": 
"sha512-WhB9zCku7EGTj/HQQRz5aUQEUeoQZH2bWcltRErOpymJ4boYE6wL9Tbr23krRPSZ+C5zqNSrSw+Cc7sZZ4b7vg==", + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/isomorphic-unfetch": { "version": "3.1.0", "resolved": "https://registry.npmjs.org/isomorphic-unfetch/-/isomorphic-unfetch-3.1.0.tgz", @@ -33121,6 +33295,18 @@ "resolved": "https://registry.npmjs.org/khroma/-/khroma-2.1.0.tgz", "integrity": "sha512-Ls993zuzfayK269Svk9hzpeGUKob/sIgZzyHYdjQoAdQetRKpOLj+k/QQQ/6Qi0Yz65mlROrfd+Ev+1+7dz9Kw==" }, + "node_modules/kind-of": { + "version": "3.2.2", + "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", + "integrity": "sha512-NOW9QQXMoZGg/oqnVNoNTTIFEIid1627WCffUBJEdMxYApq7mNE7CpzucIPc+ZQg25Phej7IJSmX3hO+oblOtQ==", + "license": "MIT", + "dependencies": { + "is-buffer": "^1.1.5" + }, + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/kleur": { "version": "3.0.3", "resolved": "https://registry.npmjs.org/kleur/-/kleur-3.0.3.tgz", @@ -33154,6 +33340,15 @@ "integrity": "sha512-8h2oVEZNktL4BH2JCOI90iD1yXwL6iNW7KcCKT2QZgQJR2vbqDsldCTPRU9NifTCqHZci57XvQQ15YTu+sTYPg==", "license": "MIT" }, + "node_modules/lazy-cache": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/lazy-cache/-/lazy-cache-1.0.4.tgz", + "integrity": "sha512-RE2g0b5VGZsOCFOCgP7omTRYFqydmZkBwl5oNnQ1lDYC57uyO9KqNnNVxT7COSHTxrRCWVcAVOcbjk+tvh/rgQ==", + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/lazy-universal-dotenv": { "version": "4.0.0", "resolved": "https://registry.npmjs.org/lazy-universal-dotenv/-/lazy-universal-dotenv-4.0.0.tgz", @@ -33586,6 +33781,20 @@ "integrity": "sha512-IAeFvcOnV9V0Yk+bFhYR07O3yNina9ANIN5MoXBKYJ/RLYPurd2d0yw14MDhpr9/momp0WofT1bPUh3hkzdi/g==", "license": "MIT" }, + "node_modules/merge-deep": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/merge-deep/-/merge-deep-3.0.3.tgz", + "integrity": 
"sha512-qtmzAS6t6grwEkNrunqTBdn0qKwFgNWvlxUbAV8es9M7Ot1EbyApytCnvE0jALPa46ZpKDUo527kKiaWplmlFA==", + "license": "MIT", + "dependencies": { + "arr-union": "^3.1.0", + "clone-deep": "^0.2.4", + "kind-of": "^3.0.2" + }, + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/merge-descriptors": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/merge-descriptors/-/merge-descriptors-2.0.0.tgz", @@ -34248,6 +34457,28 @@ "integrity": "sha512-vKivATfr97l2/QBCYAkXYDbrIWPM2IIKEl7YPhjCvKlG3kE2gm+uBo6nEXK3M5/Ffh/FLpKExzOQ3JJoJGFKBw==", "license": "MIT" }, + "node_modules/mixin-object": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/mixin-object/-/mixin-object-2.0.1.tgz", + "integrity": "sha512-ALGF1Jt9ouehcaXaHhn6t1yGWRqGaHkPFndtFVHfZXOvkIZ/yoGaSi0AHVTafb3ZBGg4dr/bDwnaEKqCXzchMA==", + "license": "MIT", + "dependencies": { + "for-in": "^0.1.3", + "is-extendable": "^0.1.1" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/mixin-object/node_modules/for-in": { + "version": "0.1.8", + "resolved": "https://registry.npmjs.org/for-in/-/for-in-0.1.8.tgz", + "integrity": "sha512-F0to7vbBSHP8E3l6dCjxNOLuSFAACIxFy3UehTUlG7svlXi37HHsDkyVcHo0Pq8QwrE+pXvWSVX3ZT1T9wAZ9g==", + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/mjml": { "version": "4.18.0", "resolved": "https://registry.npmjs.org/mjml/-/mjml-4.18.0.tgz", @@ -36614,6 +36845,214 @@ "node": ">=18" } }, + "node_modules/puppeteer-extra": { + "version": "3.3.6", + "resolved": "https://registry.npmjs.org/puppeteer-extra/-/puppeteer-extra-3.3.6.tgz", + "integrity": "sha512-rsLBE/6mMxAjlLd06LuGacrukP2bqbzKCLzV1vrhHFavqQE/taQ2UXv3H5P0Ls7nsrASa+6x3bDbXHpqMwq+7A==", + "license": "MIT", + "dependencies": { + "@types/debug": "^4.1.0", + "debug": "^4.1.1", + "deepmerge": "^4.2.2" + }, + "engines": { + "node": ">=8" + }, + "peerDependencies": { + "@types/puppeteer": "*", + "puppeteer": "*", + "puppeteer-core": "*" + }, + "peerDependenciesMeta": { + "@types/puppeteer": 
{ + "optional": true + }, + "puppeteer": { + "optional": true + }, + "puppeteer-core": { + "optional": true + } + } + }, + "node_modules/puppeteer-extra-plugin": { + "version": "3.2.3", + "resolved": "https://registry.npmjs.org/puppeteer-extra-plugin/-/puppeteer-extra-plugin-3.2.3.tgz", + "integrity": "sha512-6RNy0e6pH8vaS3akPIKGg28xcryKscczt4wIl0ePciZENGE2yoaQJNd17UiEbdmh5/6WW6dPcfRWT9lxBwCi2Q==", + "license": "MIT", + "dependencies": { + "@types/debug": "^4.1.0", + "debug": "^4.1.1", + "merge-deep": "^3.0.1" + }, + "engines": { + "node": ">=9.11.2" + }, + "peerDependencies": { + "playwright-extra": "*", + "puppeteer-extra": "*" + }, + "peerDependenciesMeta": { + "playwright-extra": { + "optional": true + }, + "puppeteer-extra": { + "optional": true + } + } + }, + "node_modules/puppeteer-extra-plugin-stealth": { + "version": "2.11.2", + "resolved": "https://registry.npmjs.org/puppeteer-extra-plugin-stealth/-/puppeteer-extra-plugin-stealth-2.11.2.tgz", + "integrity": "sha512-bUemM5XmTj9i2ZerBzsk2AN5is0wHMNE6K0hXBzBXOzP5m5G3Wl0RHhiqKeHToe/uIH8AoZiGhc1tCkLZQPKTQ==", + "license": "MIT", + "dependencies": { + "debug": "^4.1.1", + "puppeteer-extra-plugin": "^3.2.3", + "puppeteer-extra-plugin-user-preferences": "^2.4.1" + }, + "engines": { + "node": ">=8" + }, + "peerDependencies": { + "playwright-extra": "*", + "puppeteer-extra": "*" + }, + "peerDependenciesMeta": { + "playwright-extra": { + "optional": true + }, + "puppeteer-extra": { + "optional": true + } + } + }, + "node_modules/puppeteer-extra-plugin-user-data-dir": { + "version": "2.4.1", + "resolved": "https://registry.npmjs.org/puppeteer-extra-plugin-user-data-dir/-/puppeteer-extra-plugin-user-data-dir-2.4.1.tgz", + "integrity": "sha512-kH1GnCcqEDoBXO7epAse4TBPJh9tEpVEK/vkedKfjOVOhZAvLkHGc9swMs5ChrJbRnf8Hdpug6TJlEuimXNQ+g==", + "license": "MIT", + "dependencies": { + "debug": "^4.1.1", + "fs-extra": "^10.0.0", + "puppeteer-extra-plugin": "^3.2.3", + "rimraf": "^3.0.2" + }, + "engines": { + "node": ">=8" + }, + 
"peerDependencies": { + "playwright-extra": "*", + "puppeteer-extra": "*" + }, + "peerDependenciesMeta": { + "playwright-extra": { + "optional": true + }, + "puppeteer-extra": { + "optional": true + } + } + }, + "node_modules/puppeteer-extra-plugin-user-data-dir/node_modules/fs-extra": { + "version": "10.1.0", + "resolved": "https://registry.npmjs.org/fs-extra/-/fs-extra-10.1.0.tgz", + "integrity": "sha512-oRXApq54ETRj4eMiFzGnHWGy+zo5raudjuxN0b8H7s/RU2oW0Wvsx9O0ACRN/kRq9E8Vu/ReskGB5o3ji+FzHQ==", + "license": "MIT", + "dependencies": { + "graceful-fs": "^4.2.0", + "jsonfile": "^6.0.1", + "universalify": "^2.0.0" + }, + "engines": { + "node": ">=12" + } + }, + "node_modules/puppeteer-extra-plugin-user-data-dir/node_modules/glob": { + "version": "7.2.3", + "resolved": "https://registry.npmjs.org/glob/-/glob-7.2.3.tgz", + "integrity": "sha512-nFR0zLpU2YCaRxwoCJvL6UvCH2JFyFVIvwTLsIf21AuHlMskA1hhTdk+LlYJtOlYt9v6dvszD2BGRqBL+iQK9Q==", + "deprecated": "Old versions of glob are not supported, and contain widely publicized security vulnerabilities, which have been fixed in the current version. Please update. 
Support for old versions may be purchased (at exorbitant rates) by contacting i@izs.me", + "license": "ISC", + "dependencies": { + "fs.realpath": "^1.0.0", + "inflight": "^1.0.4", + "inherits": "2", + "minimatch": "^3.1.1", + "once": "^1.3.0", + "path-is-absolute": "^1.0.0" + }, + "engines": { + "node": "*" + }, + "funding": { + "url": "https://github.com/sponsors/isaacs" + } + }, + "node_modules/puppeteer-extra-plugin-user-data-dir/node_modules/jsonfile": { + "version": "6.2.0", + "resolved": "https://registry.npmjs.org/jsonfile/-/jsonfile-6.2.0.tgz", + "integrity": "sha512-FGuPw30AdOIUTRMC2OMRtQV+jkVj2cfPqSeWXv1NEAJ1qZ5zb1X6z1mFhbfOB/iy3ssJCD+3KuZ8r8C3uVFlAg==", + "license": "MIT", + "dependencies": { + "universalify": "^2.0.0" + }, + "optionalDependencies": { + "graceful-fs": "^4.1.6" + } + }, + "node_modules/puppeteer-extra-plugin-user-data-dir/node_modules/rimraf": { + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/rimraf/-/rimraf-3.0.2.tgz", + "integrity": "sha512-JZkJMZkAGFFPP2YqXZXPbMlMBgsxzE8ILs4lMIX/2o0L9UBw9O/Y3o6wFw/i9YLapcUJWwqbi3kdxIPdC62TIA==", + "deprecated": "Rimraf versions prior to v4 are no longer supported", + "license": "ISC", + "dependencies": { + "glob": "^7.1.3" + }, + "bin": { + "rimraf": "bin.js" + }, + "funding": { + "url": "https://github.com/sponsors/isaacs" + } + }, + "node_modules/puppeteer-extra-plugin-user-data-dir/node_modules/universalify": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/universalify/-/universalify-2.0.1.tgz", + "integrity": "sha512-gptHNQghINnc/vTGIk0SOFGFNXw7JVrlRUtConJRlvaw6DuX0wO5Jeko9sWrMBhh+PsYAZ7oXAiOnf/UKogyiw==", + "license": "MIT", + "engines": { + "node": ">= 10.0.0" + } + }, + "node_modules/puppeteer-extra-plugin-user-preferences": { + "version": "2.4.1", + "resolved": "https://registry.npmjs.org/puppeteer-extra-plugin-user-preferences/-/puppeteer-extra-plugin-user-preferences-2.4.1.tgz", + "integrity": 
"sha512-i1oAZxRbc1bk8MZufKCruCEC3CCafO9RKMkkodZltI4OqibLFXF3tj6HZ4LZ9C5vCXZjYcDWazgtY69mnmrQ9A==", + "license": "MIT", + "dependencies": { + "debug": "^4.1.1", + "deepmerge": "^4.2.2", + "puppeteer-extra-plugin": "^3.2.3", + "puppeteer-extra-plugin-user-data-dir": "^2.4.1" + }, + "engines": { + "node": ">=8" + }, + "peerDependencies": { + "playwright-extra": "*", + "puppeteer-extra": "*" + }, + "peerDependenciesMeta": { + "playwright-extra": { + "optional": true + }, + "puppeteer-extra": { + "optional": true + } + } + }, "node_modules/puppeteer/node_modules/cosmiconfig": { "version": "9.0.0", "resolved": "https://registry.npmjs.org/cosmiconfig/-/cosmiconfig-9.0.0.tgz", @@ -38865,6 +39304,42 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/shallow-clone": { + "version": "0.1.2", + "resolved": "https://registry.npmjs.org/shallow-clone/-/shallow-clone-0.1.2.tgz", + "integrity": "sha512-J1zdXCky5GmNnuauESROVu31MQSnLoYvlyEn6j2Ztk6Q5EHFIhxkMhYcv6vuDzl2XEzoRr856QwzMgWM/TmZgw==", + "license": "MIT", + "dependencies": { + "is-extendable": "^0.1.1", + "kind-of": "^2.0.1", + "lazy-cache": "^0.2.3", + "mixin-object": "^2.0.1" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/shallow-clone/node_modules/kind-of": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-2.0.1.tgz", + "integrity": "sha512-0u8i1NZ/mg0b+W3MGGw5I7+6Eib2nx72S/QvXa0hYjEkjTknYmEYQJwGu3mLC0BrhtJjtQafTkyRUQ75Kx0LVg==", + "license": "MIT", + "dependencies": { + "is-buffer": "^1.0.2" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/shallow-clone/node_modules/lazy-cache": { + "version": "0.2.7", + "resolved": "https://registry.npmjs.org/lazy-cache/-/lazy-cache-0.2.7.tgz", + "integrity": "sha512-gkX52wvU/R8DVMMt78ATVPFMJqfW8FPz1GZ1sVHBVQHmu/WvhIWE4cE1GBzhJNFicDeYhnwp6Rl35BcAIM3YOQ==", + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/sharp": { "version": "0.34.5", "resolved": 
"https://registry.npmjs.org/sharp/-/sharp-0.34.5.tgz",