1- import { Browser } from ' puppeteer' ;
2- import { ParseNode , ParseMetadata } from ' ./types' ;
3- import { UrlWithoutAnchors } from ' ../helpers/url-handling' ;
4- import { expandInteractiveSections } from ' ./dom-actions' ;
1+ import { Browser } from " puppeteer" ;
2+ import { ParsedNode , ParsedMetadata } from " ./types" ;
3+ import { UrlWithoutAnchors } from " ../helpers/url-handling" ;
4+ import { expandInteractiveSections } from " ./dom-actions" ;
55
66export async function parsePages (
77 browser : Browser ,
8- node : ParseNode ,
8+ node : ParsedNode ,
99 depth : number ,
1010 maxDepth : number ,
11- parsedPages : Map < string , ParseMetadata > ,
12- parsePageFn : ( browser : Browser , url : string ) => Promise < ParseMetadata | null > ,
11+ parsedPages : Map < string , ParsedMetadata > ,
12+ parsePageFn : (
13+ browser : Browser ,
14+ url : string ,
15+ ) => Promise < ParsedMetadata | null > ,
1316 baseOrigin : string ,
1417 baseScope : string ,
1518 baseHostToken : string ,
16- navigationTimeout = 30000
19+ navigationTimeout = 30000 ,
1720) : Promise < void > {
1821 const visitKey = buildVisitKey ( node . url ) ;
1922 if ( parsedPages . has ( visitKey ) || depth > maxDepth ) {
2023 return ;
2124 }
22-
2325 const normalizedUrl = UrlWithoutAnchors ( node . url ) ;
2426 if ( ! isWithinScope ( normalizedUrl , baseScope , baseHostToken ) ) {
2527 return ;
2628 }
27-
2829 const metadata = await parsePageFn ( browser , node . url ) ;
2930 if ( ! metadata ) return ;
30-
3131 parsedPages . set ( visitKey , metadata ) ;
3232 node . title = metadata . title ;
3333 node . bodyText = metadata . bodyText ;
3434 node . lang = metadata . lang ;
3535 node . keywords = metadata . keywords ;
3636 node . datePublished = metadata . datePublished ;
3737 node . lastModified = metadata . lastModified ;
38-
3938 let page ;
4039 let anchors : string [ ] = [ ] ;
4140 try {
4241 page = await browser . newPage ( ) ;
43- await page . goto ( node . url , { waitUntil : 'networkidle2' , timeout : navigationTimeout } ) ;
42+ await page . goto ( node . url , {
43+ waitUntil : "networkidle2" ,
44+ timeout : navigationTimeout ,
45+ } ) ;
4446 await expandInteractiveSections ( page ) ;
45- anchors = await page . evaluate ( ( allowedToken : string ) => {
46- const anchors = Array . from ( document . querySelectorAll ( 'a[href]' ) ) ;
47- const iframeSources = Array . from ( document . querySelectorAll ( 'iframe[src]' ) ) ;
47+ anchors = ( await page . evaluate ( ( allowedToken : string ) => {
48+ const anchors = Array . from ( document . querySelectorAll ( "a[href]" ) ) ;
49+ const iframeSources = Array . from (
50+ document . querySelectorAll ( "iframe[src]" ) ,
51+ ) ;
4852 const unique = new Set < string > ( ) ;
4953 for ( const anchor of anchors ) {
5054 const href = ( anchor as HTMLAnchorElement ) . href ;
51- if ( ! href || ! href . startsWith ( ' http' ) ) continue ;
55+ if ( ! href || ! href . startsWith ( " http" ) ) continue ;
5256 try {
5357 const target = new URL ( href , window . location . href ) ;
5458 const normalizedHref = target . href . toLowerCase ( ) ;
@@ -59,10 +63,9 @@ export async function parsePages(
5963 console . warn ( `Failed to parse anchor href: ${ href } ` , error ) ;
6064 }
6165 }
62-
6366 for ( const frame of iframeSources ) {
6467 const src = ( frame as HTMLIFrameElement ) . src ;
65- if ( ! src || ! src . startsWith ( ' http' ) ) {
68+ if ( ! src || ! src . startsWith ( " http" ) ) {
6669 continue ;
6770 }
6871 try {
@@ -75,20 +78,19 @@ export async function parsePages(
7578 }
7679 }
7780 return Array . from ( unique ) ;
78- } , baseHostToken ) as string [ ] ;
81+ } , baseHostToken ) ) as string [ ] ;
7982 } catch ( error ) {
8083 console . warn ( `Failed to extract anchors from ${ node . url } ` , error ) ;
8184 } finally {
8285 if ( page ) await page . close ( ) ;
8386 }
84-
85-
8687 const scheduled = new Set < string > ( ) ;
87- const nextChildren : ParseNode [ ] = [ ] ;
88+ const nextChildren : ParsedNode [ ] = [ ] ;
8889 for ( const href of anchors ) {
8990 const normalized = UrlWithoutAnchors ( href ) ;
9091 const visitCandidate = buildVisitKey ( href ) ;
91- if ( parsedPages . has ( visitCandidate ) || scheduled . has ( visitCandidate ) ) continue ;
92+ if ( parsedPages . has ( visitCandidate ) || scheduled . has ( visitCandidate ) )
93+ continue ;
9294 const lowerNormalized = normalized . toLowerCase ( ) ;
9395 if ( baseHostToken && ! lowerNormalized . includes ( baseHostToken ) ) {
9496 continue ;
@@ -100,7 +102,6 @@ export async function parsePages(
100102 nextChildren . push ( { url : href } ) ;
101103 }
102104 node . children = nextChildren ;
103-
104105 if ( ! node . children || depth >= maxDepth ) return ;
105106 for ( const child of node . children ) {
106107 await parsePages (
@@ -112,7 +113,7 @@ export async function parsePages(
112113 parsePageFn ,
113114 baseOrigin ,
114115 baseScope ,
115- baseHostToken
116+ baseHostToken ,
116117 ) ;
117118 }
118119}
@@ -133,13 +134,13 @@ function isWithinScope(url: string, scope: string, hostToken: string): boolean {
133134 return false ;
134135 }
135136 const nextChar = lowerUrl . charAt ( lowerScope . length ) ;
136- return nextChar === '/' || nextChar === '?' || nextChar === '#' ;
137+ return nextChar === "/" || nextChar === "?" || nextChar === "#" ;
137138}
138139
139140export function buildVisitKey ( rawUrl : string ) : string {
140141 try {
141142 const url = new URL ( rawUrl ) ;
142- url . hash = '' ;
143+ url . hash = "" ;
143144 return UrlWithoutAnchors ( url . toString ( ) ) ;
144145 } catch ( _error ) {
145146 return rawUrl ;
0 commit comments