1-
2- import puppeteer from 'puppeteer-extra' ;
3- import StealthPlugin from 'puppeteer-extra-plugin-stealth' ;
4- import { Browser , Page } from 'puppeteer' ;
5- import { resolveEnv } from './modules/config' ;
6- import { ensureDirectory , saveMetadata } from './modules/output' ;
7- import { handleError } from './modules/errors' ;
8- import { parsePages } from './modules/crawler' ;
9- import { expandInteractiveSections } from './modules/dom-actions' ;
10- import { ParsedNode , ParsedMetadata } from './modules/types' ;
11- import { sanitizeUrlAsFilename , UrlWithoutAnchors , deriveSubPath } from './helpers/url-handling' ;
12- import { assertReachable } from './modules/network' ;
13- import { toIsoOrNull } from './helpers/date-format' ;
14- import crypto from 'crypto' ;
1+ import puppeteer from "puppeteer-extra" ;
2+ import StealthPlugin from "puppeteer-extra-plugin-stealth" ;
3+ import { Browser , Page } from "puppeteer" ;
4+ import { resolveEnv } from "./modules/config" ;
5+ import { ensureDirectory , saveMetadata } from "./modules/output" ;
6+ import { handleError } from "./modules/errors" ;
7+ import { buildVisitKey , parsePages } from "./modules/crawler" ;
8+ import { expandInteractiveSections } from "./modules/dom-actions" ;
9+ import { ParsedNode , ParsedMetadata } from "./modules/types" ;
10+ import {
11+ sanitizeUrlAsFilename ,
12+ UrlWithoutAnchors ,
13+ deriveSubPath ,
14+ } from "./helpers/url-handling" ;
15+ import { assertReachable } from "./modules/network" ;
16+ import { toIsoOrNull } from "./helpers/date-format" ;
17+ import crypto from "crypto" ;
1518
1619puppeteer . use ( StealthPlugin ( ) ) ;
1720
@@ -21,6 +24,7 @@ const FILENAME_LENGTH_THRESHOLD = 255;
2124
2225const env = resolveEnv ( ) ;
2326const parsedPages = new Map < string , ParsedMetadata > ( ) ;
27+ const scheduledPages = new Set < string > ( ) ;
2428
2529void ( async ( ) => {
2630 try {
@@ -31,18 +35,22 @@ void (async () => {
3135 const baseUrlObject = new URL ( env . baseUrl ) ;
3236 const baseOrigin = baseUrlObject . origin ;
3337 const baseScope = UrlWithoutAnchors ( env . baseUrl ) ;
34- const baseHostToken = baseUrlObject . hostname . replace ( / ^ w w w \. / , '' ) . toLowerCase ( ) ;
38+ const baseHostToken = baseUrlObject . hostname
39+ . replace ( / ^ w w w \. / , "" )
40+ . toLowerCase ( ) ;
41+ scheduledPages . add ( buildVisitKey ( env . baseUrl ) ) ;
3542 await parsePages (
3643 browser ,
3744 root ,
3845 0 ,
3946 env . maxDepth ,
4047 parsedPages ,
48+ scheduledPages ,
4149 parsePageFn ,
4250 baseOrigin ,
4351 baseScope ,
4452 baseHostToken ,
45- NAVIGATION_TIMEOUT_MS
53+ NAVIGATION_TIMEOUT_MS ,
4654 ) ;
4755 await browser . close ( ) ;
4856 console . log ( `Parsing complete! Data saved to ${ env . outputDirectory } ` ) ;
@@ -51,11 +59,17 @@ void (async () => {
5159 }
5260} ) ( ) ;
5361
54- async function parsePageFn ( browser : Browser , url : string ) : Promise < ParsedMetadata | null > {
62+ async function parsePageFn (
63+ browser : Browser ,
64+ url : string ,
65+ ) : Promise < ParsedMetadata | null > {
5566 let page : Page | undefined ;
5667 try {
5768 page = await browser . newPage ( ) ;
58- await page . goto ( url , { waitUntil : 'networkidle2' , timeout : NAVIGATION_TIMEOUT_MS } ) ;
69+ await page . goto ( url , {
70+ waitUntil : "networkidle2" ,
71+ timeout : NAVIGATION_TIMEOUT_MS ,
72+ } ) ;
5973 await expandInteractiveSections ( page ) ;
6074 const rawMetadata = await page . evaluate ( extractDocumentMetadata ) ;
6175 const snapshot = serializeMetadata ( rawMetadata ) ;
@@ -71,15 +85,28 @@ async function parsePageFn(browser: Browser, url: string): Promise<ParsedMetadat
7185 }
7286}
7387
74- async function persistSnapshot ( snapshot : ParsedMetadata , FILENAME_LENGTH_THRESHOLD : number ) : Promise < void > {
75- const subPath = deriveSubPath ( snapshot . url , env . baseUrl , env . sanitizedBaseUrl ) ;
76- const preferredName = subPath === '/' ? 'root' : subPath ;
77- const sanitizedName = sanitizeUrlAsFilename ( preferredName , { replacement : '-' } ) ;
78- const trimmedName = sanitizedName . replace ( / ^ [ - _ ] + / , '' ) || sanitizedName ;
88+ async function persistSnapshot (
89+ snapshot : ParsedMetadata ,
90+ FILENAME_LENGTH_THRESHOLD : number ,
91+ ) : Promise < void > {
92+ const subPath = deriveSubPath (
93+ snapshot . url ,
94+ env . baseUrl ,
95+ env . sanitizedBaseUrl ,
96+ ) ;
97+ const preferredName = subPath === "/" ? "root" : subPath ;
98+ const sanitizedName = sanitizeUrlAsFilename ( preferredName , {
99+ replacement : "-" ,
100+ } ) ;
101+ const trimmedName = sanitizedName . replace ( / ^ [ - _ ] + / , "" ) || sanitizedName ;
79102 let finalName = trimmedName ;
80103 if ( trimmedName . length > FILENAME_LENGTH_THRESHOLD ) {
81104 const normalizedUrl = UrlWithoutAnchors ( snapshot . url ) ;
82- const hash = crypto . createHash ( 'sha1' ) . update ( normalizedUrl ) . digest ( 'hex' ) . slice ( 0 , 10 ) ;
105+ const hash = crypto
106+ . createHash ( "sha1" )
107+ . update ( normalizedUrl )
108+ . digest ( "hex" )
109+ . slice ( 0 , 10 ) ;
83110 const prefix = trimmedName . slice ( 0 , 240 ) ;
84111 finalName = `${ prefix } _${ hash } ` ;
85112 }
@@ -101,42 +128,51 @@ function serializeMetadata(raw: ParsedMetadata): ParsedMetadata {
/**
 * Collects title, text and descriptive metadata from the current document.
 *
 * This function is passed to `page.evaluate` by `parsePageFn`, so it runs inside
 * the page rather than in Node. It must therefore stay self-contained: every
 * helper it uses is declared inside its own body.
 *
 * @returns A metadata snapshot of the document:
 *   - `title`: trimmed `document.title`, else `og:title` / `twitter:title`, else `""`.
 *   - `url`: `window.location.href`.
 *   - `bodyText`: whitespace-normalized `<main>` text plus readable iframe text when
 *     that is at least 120 characters long, otherwise the whole `<body>` text.
 *   - `lang`, `keywords`, `datePublished`, `lastModified`: taken from the document
 *     or from `<meta>` tags, `null` when nothing is found.
 */
const extractDocumentMetadata = (): ParsedMetadata => {
  // Reads a <meta> tag's content by `name`, then by `property`; empty counts as missing.
  const getMeta = (name: string): string | null => {
    return (
      document.querySelector(`meta[name="${name}"]`)?.getAttribute("content") ||
      document
        .querySelector(`meta[property="${name}"]`)
        ?.getAttribute("content") ||
      null
    );
  };

  // Collapses every whitespace run to a single space and trims the ends.
  const normalizeText = (value: string | null | undefined): string => {
    return value ? value.replace(/\s+/g, " ").trim() : "";
  };

  // The document's own title wins; social-card titles are only a fallback.
  const metaTitle = getMeta("og:title") || getMeta("twitter:title");
  const documentTitle = document.title?.trim();
  const normalizedTitle = documentTitle?.length
    ? documentTitle
    : metaTitle || "";

  const mainText = normalizeText(document.querySelector("main")?.innerText);
  const iframeTexts = Array.from(document.querySelectorAll("iframe"))
    .map((frame) => {
      try {
        return normalizeText(frame.contentDocument?.body?.innerText ?? "");
      } catch (_error) {
        // A frame whose document cannot be read contributes no text.
        return "";
      }
    })
    .filter((text) => text.length > 0);

  const prioritizedTextParts = [mainText, ...iframeTexts].filter(
    (text) => text.length > 0,
  );
  const prioritizedText = prioritizedTextParts.join("\n\n").trim();
  const fallbackBody = normalizeText(document.body?.innerText ?? "");
  // Fewer than 120 characters of main/iframe text: use the whole body instead.
  const bodyText =
    prioritizedText.length >= 120 ? prioritizedText : fallbackBody;

  return {
    title: normalizedTitle,
    url: window.location.href,
    bodyText,
    lang: document.documentElement.lang || getMeta("og:locale") || null,
    keywords: getMeta("keywords") || getMeta("news_keywords"),
    datePublished:
      getMeta("article:published_time") ||
      getMeta("date") ||
      getMeta("publish-date"),
    // The literal epoch timestamp is treated as "no value"; fall back to the meta tag.
    lastModified:
      document.lastModified !== "01/01/1970 00:00:00"
        ? document.lastModified
        : getMeta("article:modified_time"),
  };
};
0 commit comments