1- import { createPlaywrightRouter , log } from "crawlee" ;
21import { readFileSync } from "node:fs" ;
2+ import { createPlaywrightRouter , log } from "crawlee" ;
33import { markJobPageDone , markListPageDone } from "./progress.js" ;
44
55function normalizeUrl ( raw : string | null | undefined ) : string | null {
@@ -17,16 +17,15 @@ function normalizeUrl(raw: string | null | undefined): string | null {
1717
1818function getExistingJobUrlSet ( ) : Set < string > {
1919 const filePath = process . env . JOBOPS_EXISTING_JOB_URLS_FILE ;
20- const raw =
21- filePath
22- ? ( ( ) => {
23- try {
24- return readFileSync ( filePath , "utf-8" ) ;
25- } catch {
26- return null ;
27- }
28- } ) ( )
29- : process . env . JOBOPS_EXISTING_JOB_URLS ;
20+ const raw = filePath
21+ ? ( ( ) => {
22+ try {
23+ return readFileSync ( filePath , "utf-8" ) ;
24+ } catch {
25+ return null ;
26+ }
27+ } ) ( )
28+ : process . env . JOBOPS_EXISTING_JOB_URLS ;
3029
3130 if ( ! raw ) return new Set ( ) ;
3231 try {
@@ -41,12 +40,16 @@ function getExistingJobUrlSet(): Set<string> {
4140 }
4241}
4342
44- const SKIP_APPLY_FOR_EXISTING = process . env . JOBOPS_SKIP_APPLY_FOR_EXISTING === "1" ;
43+ const SKIP_APPLY_FOR_EXISTING =
44+ process . env . JOBOPS_SKIP_APPLY_FOR_EXISTING === "1" ;
4545const EXISTING_JOB_URLS = getExistingJobUrlSet ( ) ;
4646
4747// Global counters for max jobs per search term
4848const jobCounts = new Map < string , number > ( ) ;
49- const MAX_JOBS_PER_TERM = parseInt ( process . env . GRADCRACKER_MAX_JOBS_PER_TERM || "0" , 10 ) ;
49+ const MAX_JOBS_PER_TERM = parseInt (
50+ process . env . GRADCRACKER_MAX_JOBS_PER_TERM || "0" ,
51+ 10 ,
52+ ) ;
5053
5154interface Job {
5255 title : string | null ;
@@ -72,7 +75,9 @@ router.addHandler(
7275 if ( MAX_JOBS_PER_TERM > 0 ) {
7376 const currentCount = jobCounts . get ( role ) || 0 ;
7477 if ( currentCount >= MAX_JOBS_PER_TERM ) {
75- log . info ( `Max jobs (${ MAX_JOBS_PER_TERM } ) already enqueued for role "${ role } ". Skipping list page.` ) ;
78+ log . info (
79+ `Max jobs (${ MAX_JOBS_PER_TERM } ) already enqueued for role "${ role } ". Skipping list page.` ,
80+ ) ;
7681 markListPageDone ( {
7782 currentUrl : request . url ,
7883 jobCardsFound : 0 ,
@@ -120,7 +125,8 @@ router.addHandler(
120125 let disciplines : string | null = null ;
121126 try {
122127 const disciplinesEl = article . locator ( "h3" ) ;
123- disciplines = ( await disciplinesEl . textContent ( { timeout : 1000 } ) ) ?. trim ( ) ?? null ;
128+ disciplines =
129+ ( await disciplinesEl . textContent ( { timeout : 1000 } ) ) ?. trim ( ) ?? null ;
124130 } catch {
125131 // h3 not found or timed out - that's okay, disciplines is optional
126132 }
@@ -195,8 +201,10 @@ router.addHandler(
195201 if ( MAX_JOBS_PER_TERM > 0 ) {
196202 const currentCount = jobCounts . get ( role ) || 0 ;
197203 if ( currentCount >= MAX_JOBS_PER_TERM ) {
198- log . info ( `Reached max jobs limit (${ MAX_JOBS_PER_TERM } ) for role "${ role } " while processing list. Stopping.` ) ;
199- break ;
204+ log . info (
205+ `Reached max jobs limit (${ MAX_JOBS_PER_TERM } ) for role "${ role } " while processing list. Stopping.` ,
206+ ) ;
207+ break ;
200208 }
201209 jobCounts . set ( role , currentCount + 1 ) ;
202210 }
@@ -205,7 +213,7 @@ router.addHandler(
205213 urls : [ jobUrl ] ,
206214 userData : {
207215 ...jobs [ jobs . length - 1 ] ,
208- label : "gradcracker-single-job-page"
216+ label : "gradcracker-single-job-page" ,
209217 } ,
210218 } ) ;
211219 enqueuedJobs ++ ;
@@ -216,7 +224,7 @@ router.addHandler(
216224 log . info ( `Extracted ${ jobs . length } jobs` ) ;
217225 if ( SKIP_APPLY_FOR_EXISTING && skippedKnownJobs > 0 ) {
218226 log . info (
219- `Skipping ${ skippedKnownJobs } already-known job pages; enqueued ${ enqueuedJobs } new job pages.`
227+ `Skipping ${ skippedKnownJobs } already-known job pages; enqueued ${ enqueuedJobs } new job pages.` ,
220228 ) ;
221229 }
222230
@@ -226,7 +234,7 @@ router.addHandler(
226234 jobPagesEnqueued : enqueuedJobs ,
227235 jobPagesSkipped : skippedKnownJobs ,
228236 } ) ;
229- }
237+ } ,
230238) ;
231239
232240router . addHandler (
@@ -261,7 +269,9 @@ router.addHandler(
261269
262270 // Prefer page-scoped popup detection. Using the browser context's "page" event
263271 // can accidentally capture unrelated pages created by other concurrent requests.
264- const popupPromise = page . waitForEvent ( "popup" , { timeout : 8000 } ) . catch ( ( ) => null ) ;
272+ const popupPromise = page
273+ . waitForEvent ( "popup" , { timeout : 8000 } )
274+ . catch ( ( ) => null ) ;
265275 const navigationPromise = page
266276 . waitForNavigation ( { timeout : 8000 , waitUntil : "domcontentloaded" } )
267277 . catch ( ( ) => null ) ;
@@ -271,7 +281,12 @@ router.addHandler(
271281 await applyButton . click ( ) ;
272282
273283 // Wait for URL to stabilize (same URL for 3 consecutive checks)
274- const waitForUrlStable = async ( targetPage : typeof page , maxWaitMs = 10000 , checkIntervalMs = 100 , requiredStableChecks = 3 ) => {
284+ const waitForUrlStable = async (
285+ targetPage : typeof page ,
286+ maxWaitMs = 10000 ,
287+ checkIntervalMs = 100 ,
288+ requiredStableChecks = 3 ,
289+ ) => {
275290 let lastUrl = targetPage . url ( ) ;
276291 let stableCount = 0 ;
277292 const startTime = Date . now ( ) ;
@@ -298,11 +313,15 @@ router.addHandler(
298313 const targetPage = maybePopup ?? page ;
299314
300315 if ( maybePopup ) {
301- await maybePopup . waitForLoadState ( "domcontentloaded" , { timeout : 15000 } ) . catch ( ( ) => null ) ;
316+ await maybePopup
317+ . waitForLoadState ( "domcontentloaded" , { timeout : 15000 } )
318+ . catch ( ( ) => null ) ;
302319 // If the popup initially opens as about:blank, give it a moment to redirect.
303320 if ( maybePopup . url ( ) === "about:blank" ) {
304321 await maybePopup
305- . waitForURL ( ( u ) => u . toString ( ) !== "about:blank" , { timeout : 15000 } )
322+ . waitForURL ( ( u ) => u . toString ( ) !== "about:blank" , {
323+ timeout : 15000 ,
324+ } )
306325 . catch ( ( ) => null ) ;
307326 }
308327 } else {
@@ -317,7 +336,7 @@ router.addHandler(
317336
318337 if ( applicationLink === originalUrl ) {
319338 log . info (
320- `Apply click did not change URL (still Gradcracker): ${ applicationLink } `
339+ `Apply click did not change URL (still Gradcracker): ${ applicationLink } ` ,
321340 ) ;
322341 } else {
323342 log . info ( `Captured application URL: ${ applicationLink } ` ) ;
@@ -342,5 +361,5 @@ router.addHandler(
342361 } ) ;
343362
344363 markJobPageDone ( { currentUrl : request . url } ) ;
345- }
364+ } ,
346365) ;
0 commit comments