Skip to content

Commit 4ffaf06

Browse files
committed
in extractors
1 parent aaab9b5 commit 4ffaf06

9 files changed

Lines changed: 630 additions & 569 deletions

File tree

biome.json

Lines changed: 25 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1,34 +1,27 @@
11
{
2-
"$schema": "https://biomejs.dev/schemas/2.3.12/schema.json",
3-
"formatter": {
4-
"indentStyle": "space",
5-
"indentWidth": 2
6-
},
7-
"files": {
8-
"includes": [
9-
"**",
10-
"!!**/dist"
11-
]
12-
},
13-
"css": {
14-
"parser": {
15-
"tailwindDirectives": true
2+
"$schema": "https://biomejs.dev/schemas/2.3.12/schema.json",
3+
"formatter": {
4+
"indentStyle": "space",
5+
"indentWidth": 2
6+
},
7+
"files": {
8+
"includes": ["**", "!!**/dist"]
9+
},
10+
"css": {
11+
"parser": {
12+
"tailwindDirectives": true
13+
}
14+
},
15+
"overrides": [
16+
{
17+
"includes": ["**/*.test.ts", "**/*.test.tsx", "**/test-utils.ts"],
18+
"linter": {
19+
"rules": {
20+
"suspicious": {
21+
"noExplicitAny": "off"
22+
}
1623
}
17-
},
18-
"overrides": [
19-
{
20-
"includes": [
21-
"**/*.test.ts",
22-
"**/*.test.tsx",
23-
"**/test-utils.ts"
24-
],
25-
"linter": {
26-
"rules": {
27-
"suspicious": {
28-
"noExplicitAny": "off"
29-
}
30-
}
31-
}
32-
}
33-
]
34-
}
24+
}
25+
}
26+
]
27+
}
Lines changed: 28 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,30 +1,30 @@
11
{
2-
"name": "job-flow",
3-
"version": "0.0.1",
4-
"type": "module",
5-
"description": "This is an example of a Crawlee project.",
6-
"dependencies": {
7-
"camoufox-js": "^0.8.0",
8-
"crawlee": "^3.0.0",
9-
"playwright": "*"
10-
},
11-
"devDependencies": {
12-
"@apify/tsconfig": "^0.1.0",
13-
"@types/fs-extra": "^11",
14-
"@types/node": "^24.0.0",
15-
"fs-extra": "^11.3.0",
16-
"tsx": "^4.4.0",
17-
"typescript": "~5.9.0"
18-
},
19-
"scripts": {
20-
"start": "npm run start:dev",
21-
"start:prod": "node dist/main.js",
22-
"start:dev": "tsx src/main.ts",
23-
"build": "tsc",
24-
"test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1",
25-
"get-binaries": "camoufox-js fetch",
26-
"postinstall": "npm run get-binaries"
27-
},
28-
"author": "It's not you it's me",
29-
"license": "ISC"
2+
"name": "job-flow",
3+
"version": "0.0.1",
4+
"type": "module",
5+
"description": "This is an example of a Crawlee project.",
6+
"dependencies": {
7+
"camoufox-js": "^0.8.0",
8+
"crawlee": "^3.0.0",
9+
"playwright": "*"
10+
},
11+
"devDependencies": {
12+
"@apify/tsconfig": "^0.1.0",
13+
"@types/fs-extra": "^11",
14+
"@types/node": "^24.0.0",
15+
"fs-extra": "^11.3.0",
16+
"tsx": "^4.4.0",
17+
"typescript": "~5.9.0"
18+
},
19+
"scripts": {
20+
"start": "npm run start:dev",
21+
"start:prod": "node dist/main.js",
22+
"start:dev": "tsx src/main.ts",
23+
"build": "tsc",
24+
"test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1",
25+
"get-binaries": "camoufox-js fetch",
26+
"postinstall": "npm run get-binaries"
27+
},
28+
"author": "It's not you it's me",
29+
"license": "ISC"
3030
}

extractors/gradcracker/src/main.ts

Lines changed: 11 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,8 @@
22
import { launchOptions } from "camoufox-js";
33
import { PlaywrightCrawler } from "crawlee";
44
import { firefox } from "playwright";
5-
6-
import { router } from "./routes.js";
75
import { initJobOpsProgress } from "./progress.js";
6+
import { router } from "./routes.js";
87

98
// locations
109
const locations = [
@@ -17,10 +16,7 @@ const locations = [
1716
];
1817

1918
// roles
20-
const defaultRoles = [
21-
"web-development",
22-
"software-systems",
23-
];
19+
const defaultRoles = ["web-development", "software-systems"];
2420

2521
let roles = defaultRoles;
2622
const envRolesRaw = process.env.GRADCRACKER_SEARCH_TERMS;
@@ -29,15 +25,16 @@ if (envRolesRaw) {
2925
try {
3026
const parsed = JSON.parse(envRolesRaw) as string[];
3127
if (Array.isArray(parsed) && parsed.length > 0) {
32-
roles = parsed.map(term =>
33-
term.toLowerCase()
34-
.replace(/[^a-z0-9]+/g, '-')
35-
.replace(/^-+|-+$/g, '')
28+
roles = parsed.map((term) =>
29+
term
30+
.toLowerCase()
31+
.replace(/[^a-z0-9]+/g, "-")
32+
.replace(/^-+|-+$/g, ""),
3633
);
37-
console.log(`Using configured search terms: ${roles.join(', ')}`);
34+
console.log(`Using configured search terms: ${roles.join(", ")}`);
3835
}
3936
} catch (e) {
40-
console.warn('Failed to parse GRADCRACKER_SEARCH_TERMS', e);
37+
console.warn("Failed to parse GRADCRACKER_SEARCH_TERMS", e);
4138
}
4239
}
4340

@@ -46,12 +43,12 @@ const gradcrackerUrls = locations.flatMap((location) => {
4643
return roles.map((role) => {
4744
return {
4845
url: `https://www.gradcracker.com/search/computing-technology/${role}-graduate-jobs-in-${location}?order=dateAdded`,
49-
role
46+
role,
5047
};
5148
});
5249
});
5350

54-
console.log(`Total gradcracker URLs: ${gradcrackerUrls.length}`)
51+
console.log(`Total gradcracker URLs: ${gradcrackerUrls.length}`);
5552

5653
const startUrls = gradcrackerUrls.map(({ url, role }) => ({
5754
url,

extractors/gradcracker/src/progress.ts

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ interface JobOpsCrawlProgressState {
2626
const PROGRESS_PREFIX = "JOBOPS_PROGRESS ";
2727
const isEnabled = () => process.env.JOBOPS_EMIT_PROGRESS === "1";
2828

29-
let state: JobOpsCrawlProgressState = {
29+
const state: JobOpsCrawlProgressState = {
3030
listPagesProcessed: 0,
3131
jobCardsFound: 0,
3232
jobPagesEnqueued: 0,
@@ -80,4 +80,3 @@ export function markJobPageDone(params: { currentUrl: string }): void {
8080
state.currentUrl = params.currentUrl;
8181
emit();
8282
}
83-

extractors/gradcracker/src/routes.ts

Lines changed: 45 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
import { createPlaywrightRouter, log } from "crawlee";
21
import { readFileSync } from "node:fs";
2+
import { createPlaywrightRouter, log } from "crawlee";
33
import { markJobPageDone, markListPageDone } from "./progress.js";
44

55
function normalizeUrl(raw: string | null | undefined): string | null {
@@ -17,16 +17,15 @@ function normalizeUrl(raw: string | null | undefined): string | null {
1717

1818
function getExistingJobUrlSet(): Set<string> {
1919
const filePath = process.env.JOBOPS_EXISTING_JOB_URLS_FILE;
20-
const raw =
21-
filePath
22-
? (() => {
23-
try {
24-
return readFileSync(filePath, "utf-8");
25-
} catch {
26-
return null;
27-
}
28-
})()
29-
: process.env.JOBOPS_EXISTING_JOB_URLS;
20+
const raw = filePath
21+
? (() => {
22+
try {
23+
return readFileSync(filePath, "utf-8");
24+
} catch {
25+
return null;
26+
}
27+
})()
28+
: process.env.JOBOPS_EXISTING_JOB_URLS;
3029

3130
if (!raw) return new Set();
3231
try {
@@ -41,12 +40,16 @@ function getExistingJobUrlSet(): Set<string> {
4140
}
4241
}
4342

44-
const SKIP_APPLY_FOR_EXISTING = process.env.JOBOPS_SKIP_APPLY_FOR_EXISTING === "1";
43+
const SKIP_APPLY_FOR_EXISTING =
44+
process.env.JOBOPS_SKIP_APPLY_FOR_EXISTING === "1";
4545
const EXISTING_JOB_URLS = getExistingJobUrlSet();
4646

4747
// Global counters for max jobs per search term
4848
const jobCounts = new Map<string, number>();
49-
const MAX_JOBS_PER_TERM = parseInt(process.env.GRADCRACKER_MAX_JOBS_PER_TERM || "0", 10);
49+
const MAX_JOBS_PER_TERM = parseInt(
50+
process.env.GRADCRACKER_MAX_JOBS_PER_TERM || "0",
51+
10,
52+
);
5053

5154
interface Job {
5255
title: string | null;
@@ -72,7 +75,9 @@ router.addHandler(
7275
if (MAX_JOBS_PER_TERM > 0) {
7376
const currentCount = jobCounts.get(role) || 0;
7477
if (currentCount >= MAX_JOBS_PER_TERM) {
75-
log.info(`Max jobs (${MAX_JOBS_PER_TERM}) already enqueued for role "${role}". Skipping list page.`);
78+
log.info(
79+
`Max jobs (${MAX_JOBS_PER_TERM}) already enqueued for role "${role}". Skipping list page.`,
80+
);
7681
markListPageDone({
7782
currentUrl: request.url,
7883
jobCardsFound: 0,
@@ -120,7 +125,8 @@ router.addHandler(
120125
let disciplines: string | null = null;
121126
try {
122127
const disciplinesEl = article.locator("h3");
123-
disciplines = (await disciplinesEl.textContent({ timeout: 1000 }))?.trim() ?? null;
128+
disciplines =
129+
(await disciplinesEl.textContent({ timeout: 1000 }))?.trim() ?? null;
124130
} catch {
125131
// h3 not found or timed out - that's okay, disciplines is optional
126132
}
@@ -195,8 +201,10 @@ router.addHandler(
195201
if (MAX_JOBS_PER_TERM > 0) {
196202
const currentCount = jobCounts.get(role) || 0;
197203
if (currentCount >= MAX_JOBS_PER_TERM) {
198-
log.info(`Reached max jobs limit (${MAX_JOBS_PER_TERM}) for role "${role}" while processing list. Stopping.`);
199-
break;
204+
log.info(
205+
`Reached max jobs limit (${MAX_JOBS_PER_TERM}) for role "${role}" while processing list. Stopping.`,
206+
);
207+
break;
200208
}
201209
jobCounts.set(role, currentCount + 1);
202210
}
@@ -205,7 +213,7 @@ router.addHandler(
205213
urls: [jobUrl],
206214
userData: {
207215
...jobs[jobs.length - 1],
208-
label: "gradcracker-single-job-page"
216+
label: "gradcracker-single-job-page",
209217
},
210218
});
211219
enqueuedJobs++;
@@ -216,7 +224,7 @@ router.addHandler(
216224
log.info(`Extracted ${jobs.length} jobs`);
217225
if (SKIP_APPLY_FOR_EXISTING && skippedKnownJobs > 0) {
218226
log.info(
219-
`Skipping ${skippedKnownJobs} already-known job pages; enqueued ${enqueuedJobs} new job pages.`
227+
`Skipping ${skippedKnownJobs} already-known job pages; enqueued ${enqueuedJobs} new job pages.`,
220228
);
221229
}
222230

@@ -226,7 +234,7 @@ router.addHandler(
226234
jobPagesEnqueued: enqueuedJobs,
227235
jobPagesSkipped: skippedKnownJobs,
228236
});
229-
}
237+
},
230238
);
231239

232240
router.addHandler(
@@ -261,7 +269,9 @@ router.addHandler(
261269

262270
// Prefer page-scoped popup detection. Using the browser context's "page" event
263271
// can accidentally capture unrelated pages created by other concurrent requests.
264-
const popupPromise = page.waitForEvent("popup", { timeout: 8000 }).catch(() => null);
272+
const popupPromise = page
273+
.waitForEvent("popup", { timeout: 8000 })
274+
.catch(() => null);
265275
const navigationPromise = page
266276
.waitForNavigation({ timeout: 8000, waitUntil: "domcontentloaded" })
267277
.catch(() => null);
@@ -271,7 +281,12 @@ router.addHandler(
271281
await applyButton.click();
272282

273283
// Wait for URL to stabilize (same URL for 3 consecutive checks)
274-
const waitForUrlStable = async (targetPage: typeof page, maxWaitMs = 10000, checkIntervalMs = 100, requiredStableChecks = 3) => {
284+
const waitForUrlStable = async (
285+
targetPage: typeof page,
286+
maxWaitMs = 10000,
287+
checkIntervalMs = 100,
288+
requiredStableChecks = 3,
289+
) => {
275290
let lastUrl = targetPage.url();
276291
let stableCount = 0;
277292
const startTime = Date.now();
@@ -298,11 +313,15 @@ router.addHandler(
298313
const targetPage = maybePopup ?? page;
299314

300315
if (maybePopup) {
301-
await maybePopup.waitForLoadState("domcontentloaded", { timeout: 15000 }).catch(() => null);
316+
await maybePopup
317+
.waitForLoadState("domcontentloaded", { timeout: 15000 })
318+
.catch(() => null);
302319
// If the popup initially opens as about:blank, give it a moment to redirect.
303320
if (maybePopup.url() === "about:blank") {
304321
await maybePopup
305-
.waitForURL((u) => u.toString() !== "about:blank", { timeout: 15000 })
322+
.waitForURL((u) => u.toString() !== "about:blank", {
323+
timeout: 15000,
324+
})
306325
.catch(() => null);
307326
}
308327
} else {
@@ -317,7 +336,7 @@ router.addHandler(
317336

318337
if (applicationLink === originalUrl) {
319338
log.info(
320-
`Apply click did not change URL (still Gradcracker): ${applicationLink}`
339+
`Apply click did not change URL (still Gradcracker): ${applicationLink}`,
321340
);
322341
} else {
323342
log.info(`Captured application URL: ${applicationLink}`);
@@ -342,5 +361,5 @@ router.addHandler(
342361
});
343362

344363
markJobPageDone({ currentUrl: request.url });
345-
}
364+
},
346365
);
Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
11
{
2-
"extends": "@apify/tsconfig",
3-
"compilerOptions": {
4-
"module": "NodeNext",
5-
"moduleResolution": "NodeNext",
6-
"target": "ES2022",
7-
"outDir": "dist",
8-
"noUnusedLocals": false,
9-
"lib": ["DOM"]
10-
},
11-
"include": ["./src/**/*"]
2+
"extends": "@apify/tsconfig",
3+
"compilerOptions": {
4+
"module": "NodeNext",
5+
"moduleResolution": "NodeNext",
6+
"target": "ES2022",
7+
"outDir": "dist",
8+
"noUnusedLocals": false,
9+
"lib": ["DOM"]
10+
},
11+
"include": ["./src/**/*"]
1212
}

0 commit comments

Comments
 (0)