From 8b96b1f2fc63e90c5ccc64d30bb70fa415df2f19 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jes=C3=BAs=20Rojo=20Mart=C3=ADnez?= Date: Sat, 16 May 2026 18:26:50 +0200 Subject: [PATCH 1/7] feat(providers): add Workable provider MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Workable's documented JSON API requires an auth token; the only no-auth public surface is a Markdown feed at `apply.workable.com//jobs.md`. The provider auto-detects from the `apply.workable.com/` careers_url pattern, fetches via ctx.fetchText, and parses the table rows. Follows the SSRF defence pattern from providers/greenhouse.mjs: hostname allowlist + URL parse + HTTPS check + redirect:'error' on the fetch call. Exports parseWorkableMarkdown as a named export so test-all.mjs §11 can unit-test the parser independently of the network. Tests in test-all.mjs §11: - detect() resolves apply.workable.com/ → /jobs.md feed - detect() returns null for non-workable URLs - parseWorkableMarkdown extracts title/location/company correctly - parseWorkableMarkdown strips .md suffix from job URLs - empty / null inputs yield empty results without crashing - fetch() with allowed hostname reaches the http context Refs #651 --- providers/workable.mjs | 81 ++++++++++++++ test-all.mjs | 238 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 319 insertions(+) create mode 100644 providers/workable.mjs diff --git a/providers/workable.mjs b/providers/workable.mjs new file mode 100644 index 0000000000..c1c5606c9d --- /dev/null +++ b/providers/workable.mjs @@ -0,0 +1,81 @@ +// @ts-check +/** @typedef {import('./_types.js').Provider} Provider */ + +// Workable provider — hits the public markdown feed at //jobs.md. +// Workable's documented JSON API requires an auth token; the markdown feed +// is the only no-auth public surface. Auto-detects from careers_url pattern +// `https://apply.workable.com/`. 
A tracked_companies entry can also +// set `provider: workable` explicitly to bypass detection. + +const ALLOWED_WORKABLE_HOSTS = new Set(['apply.workable.com']); + +function assertWorkableUrl(url) { + let parsed; + try { + parsed = new URL(url); + } catch { + throw new Error(`workable: invalid URL: ${url}`); + } + if (parsed.protocol !== 'https:') throw new Error(`workable: URL must use HTTPS: ${url}`); + if (!ALLOWED_WORKABLE_HOSTS.has(parsed.hostname)) { + throw new Error(`workable: untrusted hostname "${parsed.hostname}" — must be one of: ${[...ALLOWED_WORKABLE_HOSTS].join(', ')}`); + } + return url; +} + +function resolveFeedUrl(entry) { + const url = entry.careers_url || ''; + const match = url.match(/apply\.workable\.com\/([^/?#]+)/); + if (!match) return null; + return `https://apply.workable.com/${match[1]}/jobs.md`; +} + +/** @type {Provider} */ +export default { + id: 'workable', + + detect(entry) { + const feedUrl = resolveFeedUrl(entry); + return feedUrl ? { url: feedUrl } : null; + }, + + async fetch(entry, ctx) { + const feedUrl = resolveFeedUrl(entry); + if (!feedUrl) throw new Error(`workable: cannot derive feed URL for ${entry.name}`); + assertWorkableUrl(feedUrl); + // redirect:'error' prevents SSRF via server-side redirects; combined with + // assertWorkableUrl above it guarantees the final hostname stays in the allowlist. + const text = await ctx.fetchText(feedUrl, { redirect: 'error' }); + return parseWorkableMarkdown(text, entry.name); + }, +}; + +/** + * Parse Workable's public markdown feed. Exported as a named export for unit + * tests. 
The feed exposes a table: + * | Title | Department | Location | Type | Salary | Posted | Details | + * where `Details` holds a markdown link + * [View](https://apply.workable.com//jobs/view/.md) + * + * @param {string} text — markdown body + * @param {string} companyName — value to write into job.company + * @returns {Array<{title: string, url: string, company: string, location: string}>} + */ +export function parseWorkableMarkdown(text, companyName) { + if (typeof text !== 'string') return []; + const jobs = []; + for (const line of text.split('\n')) { + if (!line.startsWith('|') || !line.includes('[View]')) continue; + const cols = line.split('|').map(c => c.trim()); + // Cols: ['', title, dept, location, type, salary, posted, '[View](url.md)', ''] + if (cols.length < 8) continue; + const title = cols[1]; + if (!title || title === 'Title') continue; + const location = cols[3] || ''; + const urlMatch = cols[7].match(/\(([^)]+)\)/); + let url = urlMatch ? urlMatch[1] : ''; + if (url.endsWith('.md')) url = url.slice(0, -3); + jobs.push({ title, url, location, company: companyName }); + } + return jobs; +} diff --git a/test-all.mjs b/test-all.mjs index c5152fc11f..7de46f1621 100644 --- a/test-all.mjs +++ b/test-all.mjs @@ -314,6 +314,244 @@ if (fileExists('VERSION')) { fail('VERSION file missing'); } +// ── 11. PROVIDERS — Workable ──────────────────────────────────────── + +console.log('\n11. 
Provider — workable'); + +try { + const workable = (await import(pathToFileURL(join(ROOT, 'providers/workable.mjs')).href)).default; + const { parseWorkableMarkdown } = await import(pathToFileURL(join(ROOT, 'providers/workable.mjs')).href); + + // detect() — auto-detection from careers_url + if (workable.id === 'workable') pass('workable.id is "workable"'); + else fail(`workable.id is ${JSON.stringify(workable.id)}`); + + const hit = workable.detect({ name: 'TestCo', careers_url: 'https://apply.workable.com/optimile' }); + if (hit && hit.url === 'https://apply.workable.com/optimile/jobs.md') { + pass('workable.detect() resolves apply.workable.com/ → /jobs.md feed'); + } else { + fail(`workable.detect() returned ${JSON.stringify(hit)}`); + } + + const miss = workable.detect({ name: 'TestCo', careers_url: 'https://example.com/careers' }); + if (miss === null) pass('workable.detect() returns null for non-workable URLs'); + else fail(`workable.detect() should return null, got ${JSON.stringify(miss)}`); + + // parse() — markdown table + const sampleMd = [ + '# Optimile — All Open Positions', + '', + '| Title | Department | Location | Type | Salary | Posted | Details |', + '|---|---|---|---|---|---|---|', + '| Senior AI PM | Product | Ghent, Belgium | Full-time | — | 2026-04-01 | [View](https://apply.workable.com/optimile/jobs/view/ABC123.md) |', + '| Tech Lead | Engineering | Remote | Full-time | — | 2026-03-25 | [View](https://apply.workable.com/optimile/jobs/view/DEF456.md) |', + ].join('\n'); + + const jobs = parseWorkableMarkdown(sampleMd, 'Optimile'); + if (jobs.length === 2) pass('parseWorkableMarkdown extracts 2 jobs from 2-row table'); + else fail(`parseWorkableMarkdown returned ${jobs.length} jobs, expected 2`); + + if (jobs[0]?.title === 'Senior AI PM' && jobs[0]?.location === 'Ghent, Belgium' && jobs[0]?.company === 'Optimile') { + pass('parseWorkableMarkdown extracts title, location, company correctly'); + } else { + fail(`parseWorkableMarkdown row 0 = 
${JSON.stringify(jobs[0])}`); + } + + if (jobs[0]?.url === 'https://apply.workable.com/optimile/jobs/view/ABC123') { + pass('parseWorkableMarkdown strips .md suffix from job URL'); + } else { + fail(`parseWorkableMarkdown should strip .md; got url=${JSON.stringify(jobs[0]?.url)}`); + } + + // Robustness + if (parseWorkableMarkdown('', 'X').length === 0) pass('empty input → empty result'); + else fail('empty input should yield empty result'); + + if (parseWorkableMarkdown(null, 'X').length === 0) pass('null input → empty result (no crash)'); + else fail('null input should yield empty result without crashing'); + + // SSRF defence: untrusted hostname rejected before fetch + await workable.fetch( + { name: 'Bad', careers_url: 'https://apply.workable.com/evil' }, + { + transport: 'http', + fetchText: async (url) => { + if (!url.startsWith('https://apply.workable.com/')) { + throw new Error('fetchText called with unexpected URL'); + } + return '| Title | Department | Location | Type | Salary | Posted | Details |\n|---|---|---|---|---|---|---|\n'; + }, + fetchJson: async () => { throw new Error('fetchJson should not be called'); }, + }, + ); + pass('workable.fetch() reaches fetchText with allowed host'); + +} catch (e) { + fail(`workable provider tests crashed: ${e.message}`); +} + +// ── 12. PROVIDERS — SmartRecruiters ───────────────────────────────── + +console.log('\n12. 
Provider — smartrecruiters'); + +try { + const sr = (await import(pathToFileURL(join(ROOT, 'providers/smartrecruiters.mjs')).href)).default; + const { parseSmartRecruitersResponse } = await import(pathToFileURL(join(ROOT, 'providers/smartrecruiters.mjs')).href); + + if (sr.id === 'smartrecruiters') pass('smartrecruiters.id is "smartrecruiters"'); + else fail(`smartrecruiters.id is ${JSON.stringify(sr.id)}`); + + const hitCareers = sr.detect({ name: 'Adyen', careers_url: 'https://careers.smartrecruiters.com/adyen' }); + if (hitCareers && hitCareers.url.startsWith('https://api.smartrecruiters.com/v1/companies/adyen/postings')) { + pass('smartrecruiters.detect() resolves careers.smartrecruiters.com/ → api URL'); + } else { + fail(`smartrecruiters.detect(careers) returned ${JSON.stringify(hitCareers)}`); + } + + const hitJobs = sr.detect({ name: 'X', careers_url: 'https://jobs.smartrecruiters.com/x' }); + if (hitJobs && hitJobs.url.startsWith('https://api.smartrecruiters.com/v1/companies/x/postings')) { + pass('smartrecruiters.detect() also handles jobs.smartrecruiters.com'); + } else { + fail(`smartrecruiters.detect(jobs) returned ${JSON.stringify(hitJobs)}`); + } + + if (sr.detect({ name: 'X', careers_url: 'https://example.com/careers' }) === null) { + pass('smartrecruiters.detect() returns null for non-SR URLs'); + } else { + fail('smartrecruiters.detect() should return null for non-SR URLs'); + } + + // parseSmartRecruitersResponse + const sample = { + content: [ + { + id: 'abc-123', + name: 'Senior PM', + ref: 'https://api.smartrecruiters.com/v1/companies/sgs/postings/abc-123', + location: { fullLocation: 'Geneva, Switzerland', remote: false }, + }, + { + id: 'def-456', + name: 'Remote AI Engineer', + ref: 'https://api.smartrecruiters.com/v1/companies/sgs/postings/def-456', + location: { city: 'Paris', country: 'France', remote: true }, + }, + { + id: 'ghi-789', + name: 'No-ref Role', + location: { fullLocation: 'Berlin, Germany' }, + }, + ], + }; + const jobs = 
parseSmartRecruitersResponse(sample, 'SGS'); + if (jobs.length === 3) pass('parseSmartRecruitersResponse extracts 3 jobs'); + else fail(`parseSmartRecruitersResponse returned ${jobs.length} jobs`); + + if (jobs[0]?.location === 'Geneva, Switzerland' && jobs[0]?.title === 'Senior PM') { + pass('parseSmartRecruitersResponse uses fullLocation when present'); + } else { + fail(`row 0 = ${JSON.stringify(jobs[0])}`); + } + + if (jobs[1]?.location === 'Paris, France, Remote') { + pass('parseSmartRecruitersResponse builds location from city/country/remote when no fullLocation'); + } else { + fail(`row 1 location = ${JSON.stringify(jobs[1]?.location)}, expected "Paris, France, Remote"`); + } + + if (jobs[0]?.url === 'https://jobs.smartrecruiters.com/sgs/postings/abc-123') { + pass('parseSmartRecruitersResponse rewrites api.smartrecruiters.com → jobs.smartrecruiters.com'); + } else { + fail(`row 0 url = ${JSON.stringify(jobs[0]?.url)}`); + } + + if (jobs[2]?.url && jobs[2].url.startsWith('https://jobs.smartrecruiters.com/sgs/ghi-789')) { + pass('parseSmartRecruitersResponse falls back to synthetic URL when ref is missing'); + } else { + fail(`row 2 url = ${JSON.stringify(jobs[2]?.url)}`); + } + + // Empty input safety + if (parseSmartRecruitersResponse({}, 'X').length === 0) pass('empty {} input → empty result'); + else fail('empty {} input should yield empty result'); + + if (parseSmartRecruitersResponse({ content: 'not an array' }, 'X').length === 0) { + pass('non-array content → empty result (no crash)'); + } else { + fail('non-array content should yield empty result'); + } + +} catch (e) { + fail(`smartrecruiters provider tests crashed: ${e.message}`); +} + +// ── 13. PROVIDERS — Recruitee ─────────────────────────────────────── + +console.log('\n13. 
Provider — recruitee'); + +try { + const recruitee = (await import(pathToFileURL(join(ROOT, 'providers/recruitee.mjs')).href)).default; + const { parseRecruiteeResponse } = await import(pathToFileURL(join(ROOT, 'providers/recruitee.mjs')).href); + + if (recruitee.id === 'recruitee') pass('recruitee.id is "recruitee"'); + else fail(`recruitee.id is ${JSON.stringify(recruitee.id)}`); + + const hit = recruitee.detect({ name: 'Channable', careers_url: 'https://channable.recruitee.com' }); + if (hit && hit.url === 'https://channable.recruitee.com/api/offers/') { + pass('recruitee.detect() resolves .recruitee.com → api offers'); + } else { + fail(`recruitee.detect() returned ${JSON.stringify(hit)}`); + } + + if (recruitee.detect({ name: 'X', careers_url: 'https://example.com/careers' }) === null) { + pass('recruitee.detect() returns null for non-recruitee URLs'); + } else { + fail('recruitee.detect() should return null for non-recruitee URLs'); + } + + // parseRecruiteeResponse + const sample = { + offers: [ + { title: 'Senior PM', careers_url: 'https://channable.recruitee.com/o/senior-pm', city: 'Utrecht', country: 'Netherlands', remote: false }, + { title: 'Backend Eng', url: 'https://channable.recruitee.com/o/backend', city: 'Amsterdam', country: 'Netherlands', remote: true }, + { title: 'AI Lead', location: 'Remote, EMEA' }, + ], + }; + const jobs = parseRecruiteeResponse(sample, 'Channable'); + if (jobs.length === 3) pass('parseRecruiteeResponse extracts 3 offers'); + else fail(`parseRecruiteeResponse returned ${jobs.length} offers`); + + if (jobs[0]?.title === 'Senior PM' && jobs[0]?.company === 'Channable' && jobs[0]?.url === 'https://channable.recruitee.com/o/senior-pm') { + pass('parseRecruiteeResponse prefers careers_url field over url'); + } else { + fail(`row 0 = ${JSON.stringify(jobs[0])}`); + } + + if (jobs[1]?.location === 'Amsterdam, Netherlands, Remote') { + pass('parseRecruiteeResponse assembles city/country/remote when no location field'); + } else { + 
fail(`row 1 location = ${JSON.stringify(jobs[1]?.location)}, expected "Amsterdam, Netherlands, Remote"`); + } + + if (jobs[2]?.location === 'Remote, EMEA') { + pass('parseRecruiteeResponse uses explicit location field when present'); + } else { + fail(`row 2 location = ${JSON.stringify(jobs[2]?.location)}`); + } + + if (parseRecruiteeResponse({}, 'X').length === 0) pass('empty {} → empty result'); + else fail('empty {} should yield empty result'); + + if (parseRecruiteeResponse({ offers: null }, 'X').length === 0) { + pass('null offers → empty result (no crash)'); + } else { + fail('null offers should yield empty result'); + } + +} catch (e) { + fail(`recruitee provider tests crashed: ${e.message}`); +} + // ── SUMMARY ───────────────────────────────────────────────────── console.log('\n' + '='.repeat(50)); From 6448abd8e65a5dddb5b9cd05c3aaa9b32be910f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jes=C3=BAs=20Rojo=20Mart=C3=ADnez?= Date: Sat, 16 May 2026 18:27:38 +0200 Subject: [PATCH 2/7] feat(providers): add SmartRecruiters provider MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Auto-detects from careers_url pattern `https://(careers|jobs).smartrecruiters.com/` and hits the public /postings endpoint. tracked_companies entries can also set `provider: smartrecruiters` to bypass detection (useful when the public careers URL is a branded custom domain like `careers.adyen.com`). Follows the SSRF defence pattern from providers/greenhouse.mjs: hostname allowlist (api.smartrecruiters.com) + URL parse + HTTPS check + redirect:'error'. Notable parse decisions: - location: prefer location.fullLocation; else assemble from city/region/country (skipping empties); append "Remote" when location.remote is true. - url: rewrite j.ref's api.smartrecruiters.com prefix to jobs.smartrecruiters.com so the link points at the public job page, not the API. Falls back to a synthetic URL when ref is missing. 
Exports parseSmartRecruitersResponse as a named export so test-all.mjs §12 can unit-test the parser. Tests in test-all.mjs §12: - detect() resolves both careers.* and jobs.* hostnames - detect() returns null for non-SR URLs - parser uses fullLocation when present - parser assembles city/country/remote when fullLocation absent - parser rewrites api.smartrecruiters.com → jobs.smartrecruiters.com - parser synthesises a URL when ref is missing - empty / malformed inputs yield empty results without crashing Refs #651 --- providers/smartrecruiters.mjs | 81 +++++++++++++++++++++++++++++++++++ 1 file changed, 81 insertions(+) create mode 100644 providers/smartrecruiters.mjs diff --git a/providers/smartrecruiters.mjs b/providers/smartrecruiters.mjs new file mode 100644 index 0000000000..2873182f09 --- /dev/null +++ b/providers/smartrecruiters.mjs @@ -0,0 +1,81 @@ +// @ts-check +/** @typedef {import('./_types.js').Provider} Provider */ + +// SmartRecruiters provider — hits the public postings API. +// Auto-detects from careers_url pattern +// `https://(careers|jobs).smartrecruiters.com/<slug>`. A tracked_companies +// entry can also set `provider: smartrecruiters` explicitly to bypass +// detection (useful when the public careers URL is a branded custom domain).
+ +const ALLOWED_SMARTRECRUITERS_HOSTS = new Set(['api.smartrecruiters.com']); + +function assertSmartRecruitersUrl(url) { + let parsed; + try { + parsed = new URL(url); + } catch { + throw new Error(`smartrecruiters: invalid URL: ${url}`); + } + if (parsed.protocol !== 'https:') throw new Error(`smartrecruiters: URL must use HTTPS: ${url}`); + if (!ALLOWED_SMARTRECRUITERS_HOSTS.has(parsed.hostname)) { + throw new Error(`smartrecruiters: untrusted hostname "${parsed.hostname}" — must be one of: ${[...ALLOWED_SMARTRECRUITERS_HOSTS].join(', ')}`); + } + return url; +} + +function resolveApiUrl(entry) { + const url = entry.careers_url || ''; + const match = url.match(/(?:careers|jobs)\.smartrecruiters\.com\/([^/?#]+)/); + if (!match) return null; + return `https://api.smartrecruiters.com/v1/companies/${match[1]}/postings?limit=100&offset=0&status=PUBLIC`; +} + +/** @type {Provider} */ +export default { + id: 'smartrecruiters', + + detect(entry) { + const apiUrl = resolveApiUrl(entry); + return apiUrl ? { url: apiUrl } : null; + }, + + async fetch(entry, ctx) { + const apiUrl = resolveApiUrl(entry); + if (!apiUrl) throw new Error(`smartrecruiters: cannot derive API URL for ${entry.name}`); + assertSmartRecruitersUrl(apiUrl); + const json = await ctx.fetchJson(apiUrl, { redirect: 'error' }); + return parseSmartRecruitersResponse(json, entry.name); + }, +}; + +/** + * Parse a SmartRecruiters /postings response. Exported for unit tests. + * + * SmartRecruiters returns: + * { content: [{ id, name, ref, location: { fullLocation?, city?, region?, country?, remote? } }] } + * + * - location: prefer `fullLocation`; else assemble from city/region/country + * parts (skipping empties); append "Remote" when `location.remote` is true. + * - url: `j.ref` is an `api.smartrecruiters.com/v1/companies//postings/` + * URL — rewrite to the public `jobs.smartrecruiters.com//postings/`. + * If `ref` is missing, synthesise a URL from the company slug + posting id. 
+ * + * @param {any} json + * @param {string} companyName + * @returns {Array<{title: string, url: string, company: string, location: string}>} + */ +export function parseSmartRecruitersResponse(json, companyName) { + const items = json?.content; + if (!Array.isArray(items)) return []; + return items.map(j => { + const loc = j.location || {}; + const fullLocation = loc.fullLocation || [loc.city, loc.region, loc.country].filter(Boolean).join(', '); + const remote = loc.remote ? 'Remote' : ''; + const location = [fullLocation, remote].filter(Boolean).join(', '); + const slugified = (j.name || '').toLowerCase().replace(/[^a-z0-9]+/g, '-').replace(/^-|-$/g, ''); + const url = j.ref + ? j.ref.replace('api.smartrecruiters.com/v1/companies/', 'jobs.smartrecruiters.com/') + : `https://jobs.smartrecruiters.com/${(companyName || '').toLowerCase()}/${j.id}-${slugified}`; + return { title: j.name || '', url, location, company: companyName }; + }); +} From 148551c53c6c53b0029701295479917ce0b1b1d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jes=C3=BAs=20Rojo=20Mart=C3=ADnez?= Date: Sat, 16 May 2026 18:28:10 +0200 Subject: [PATCH 3/7] feat(providers): add Recruitee provider MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Auto-detects from careers_url pattern `https://.recruitee.com` and hits the public /api/offers/ endpoint. tracked_companies entries can also set `provider: recruitee` to bypass detection. SSRF defence: per-tenant subdomains are the variable part, so a static hostname allowlist isn't workable. Uses a regex match on `.recruitee.com` (`^[a-z0-9][a-z0-9-]*\.recruitee\.com$`) + HTTPS check + redirect:'error'. The regex constrains the slug to safe characters, preventing attacker-controlled hostnames from slipping through. Notable parse decisions: - url: prefer `careers_url` (the public job page), fall back to `url` (some installs use it instead), empty string otherwise. 
- location: prefer the explicit `location` field; else assemble from city/country with "Remote" appended when remote is true. Exports parseRecruiteeResponse as a named export for tests. Tests in test-all.mjs §13: - detect() resolves <slug>.recruitee.com → /api/offers/ - detect() returns null for non-recruitee URLs - parser prefers careers_url over url - parser assembles location from city/country/remote - parser uses explicit location field when present - empty / null inputs yield empty results without crashing Refs #651 --- providers/recruitee.mjs | 80 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) create mode 100644 providers/recruitee.mjs diff --git a/providers/recruitee.mjs b/providers/recruitee.mjs new file mode 100644 index 0000000000..1772199585 --- /dev/null +++ b/providers/recruitee.mjs @@ -0,0 +1,80 @@ +// @ts-check +/** @typedef {import('./_types.js').Provider} Provider */ + +// Recruitee provider — hits the public per-tenant offers API. +// Auto-detects from careers_url pattern `https://<slug>.recruitee.com`. +// Per-tenant subdomains are the variable part — SSRF defence uses a +// regex match on `<slug>.recruitee.com` rather than a static +// allowlist.
+ +const RECRUITEE_HOST_RE = /^[a-z0-9][a-z0-9-]*\.recruitee\.com$/; + +function assertRecruiteeUrl(url) { + let parsed; + try { + parsed = new URL(url); + } catch { + throw new Error(`recruitee: invalid URL: ${url}`); + } + if (parsed.protocol !== 'https:') throw new Error(`recruitee: URL must use HTTPS: ${url}`); + if (!RECRUITEE_HOST_RE.test(parsed.hostname)) { + throw new Error(`recruitee: untrusted hostname "${parsed.hostname}" — must match .recruitee.com`); + } + return url; +} + +function resolveApiUrl(entry) { + const url = entry.careers_url || ''; + const match = url.match(/([a-z0-9][a-z0-9-]*)\.recruitee\.com/); + if (!match) return null; + return `https://${match[1]}.recruitee.com/api/offers/`; +} + +/** @type {Provider} */ +export default { + id: 'recruitee', + + detect(entry) { + const apiUrl = resolveApiUrl(entry); + return apiUrl ? { url: apiUrl } : null; + }, + + async fetch(entry, ctx) { + const apiUrl = resolveApiUrl(entry); + if (!apiUrl) throw new Error(`recruitee: cannot derive API URL for ${entry.name}`); + assertRecruiteeUrl(apiUrl); + const json = await ctx.fetchJson(apiUrl, { redirect: 'error' }); + return parseRecruiteeResponse(json, entry.name); + }, +}; + +/** + * Parse a Recruitee /api/offers/ response. Exported for unit tests. + * + * Recruitee returns: + * { offers: [{ title, careers_url?, url?, city?, country?, remote?, location? }] } + * + * - url: prefer `careers_url`, fall back to `url`, empty string otherwise. + * - location: prefer the explicit `location` field; else assemble from + * city/country, appending "Remote" when `remote` is true. 
+ * + * @param {any} json + * @param {string} companyName + * @returns {Array<{title: string, url: string, company: string, location: string}>} + */ +export function parseRecruiteeResponse(json, companyName) { + const offers = json?.offers; + if (!Array.isArray(offers)) return []; + return offers.map(j => { + const city = j.city || ''; + const country = j.country || ''; + const remote = j.remote ? 'Remote' : ''; + const location = j.location || [city, country, remote].filter(Boolean).join(', '); + return { + title: j.title || '', + url: j.careers_url || j.url || '', + location, + company: companyName, + }; + }); +} From a67e79480340f5561df5b7a1e2f79e489a0c0079 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jes=C3=BAs=20Rojo=20Mart=C3=ADnez?= Date: Sat, 16 May 2026 18:28:39 +0200 Subject: [PATCH 4/7] docs(portals): document Workable, SmartRecruiters, Recruitee URL patterns --- templates/portals.example.yml | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/templates/portals.example.yml b/templates/portals.example.yml index 210042a6f8..46d29f8591 100644 --- a/templates/portals.example.yml +++ b/templates/portals.example.yml @@ -356,6 +356,21 @@ search_queries: # provider's `id`. # transport: http — reserved for future transports. Defaults to http. +# ── Provider auto-detection ─────────────────────────────────────── +# scan.mjs auto-loads everything in providers/*.mjs and tries each +# provider's detect() in order. URL patterns recognized: +# +# greenhouse job-boards(.eu)?.greenhouse.io/ (or api: field) +# ashby jobs.ashbyhq.com/ +# lever jobs.lever.co/ +# workable apply.workable.com/ +# smartrecruiters (careers|jobs).smartrecruiters.com/ +# recruitee .recruitee.com +# +# When the public careers URL is a branded custom domain (e.g. +# careers.adyen.com), set `provider: smartrecruiters` explicitly to +# bypass detect(). The `provider:` field wins over auto-detection. 
+ tracked_companies: # -- AI Labs & LLM providers -- From fcab2cc3ef24e69b685e562f1e9830616487ec30 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jes=C3=BAs=20Rojo=20Mart=C3=ADnez?= Date: Sat, 16 May 2026 18:50:13 +0200 Subject: [PATCH 5/7] fix(providers): defensive input normalization + edge cases MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pre-emptive hardening following the same defensive pattern CodeRabbit flagged on PR #652. All changes are within the providers shipped in this PR; no scan.mjs / framework changes. - All three providers: `careers_url` is now type-checked before .match() so a non-string YAML value (number, object, array) returns null from detect() rather than throwing. - smartrecruiters: ref-rewrite uses an anchored regex (`/^https:\/\/api\.smartrecruiters\.com\/v1\/companies\//`) so the replacement only fires at the URL prefix. The fallback URL path (when both j.ref AND j.id are missing) now returns an empty string instead of synthesising a URL containing the literal "undefined" — the empty string is the contract-allowed default for url per _types.js > Job. Magic 100 in the postings limit is now a named SR_PAGE_SIZE constant. - workable: parseWorkableMarkdown now extracts URLs via a line-level regex `/\[View\]\(([^)]+)\)/` rather than a column-position match, so a title containing a stray `|` doesn't shift cols[7] and silently drop the URL. Rows that still don't resolve a URL are skipped (no empty-URL entries leak into the dedup tracker). - test-all.mjs: 6 new assertions covering the defensive paths (non-string careers_url across all 3 providers, the SR no-ref/no-id fallback, the Workable stray-pipe survival, and a real Workable fetch() rejection test against an unresolvable careers_url). 
Refs #651 --- providers/recruitee.mjs | 2 +- providers/smartrecruiters.mjs | 9 +++-- providers/workable.mjs | 5 ++- test-all.mjs | 72 +++++++++++++++++++++++++++++++++-- 4 files changed, 78 insertions(+), 10 deletions(-) diff --git a/providers/recruitee.mjs b/providers/recruitee.mjs index 1772199585..8c94cf5f5b 100644 --- a/providers/recruitee.mjs +++ b/providers/recruitee.mjs @@ -24,7 +24,7 @@ function assertRecruiteeUrl(url) { } function resolveApiUrl(entry) { - const url = entry.careers_url || ''; + const url = typeof entry.careers_url === 'string' ? entry.careers_url : ''; const match = url.match(/([a-z0-9][a-z0-9-]*)\.recruitee\.com/); if (!match) return null; return `https://${match[1]}.recruitee.com/api/offers/`; diff --git a/providers/smartrecruiters.mjs b/providers/smartrecruiters.mjs index 2873182f09..092ec9b1d2 100644 --- a/providers/smartrecruiters.mjs +++ b/providers/smartrecruiters.mjs @@ -8,6 +8,7 @@ // detection (useful when the public careers URL is a branded custom domain). const ALLOWED_SMARTRECRUITERS_HOSTS = new Set(['api.smartrecruiters.com']); +const SR_PAGE_SIZE = 100; function assertSmartRecruitersUrl(url) { let parsed; @@ -24,10 +25,10 @@ function assertSmartRecruitersUrl(url) { } function resolveApiUrl(entry) { - const url = entry.careers_url || ''; + const url = typeof entry.careers_url === 'string' ? 
entry.careers_url : ''; const match = url.match(/(?:careers|jobs)\.smartrecruiters\.com\/([^/?#]+)/); if (!match) return null; - return `https://api.smartrecruiters.com/v1/companies/${match[1]}/postings?limit=100&offset=0&status=PUBLIC`; + return `https://api.smartrecruiters.com/v1/companies/${match[1]}/postings?limit=${SR_PAGE_SIZE}&offset=0&status=PUBLIC`; } /** @type {Provider} */ @@ -74,8 +75,8 @@ export function parseSmartRecruitersResponse(json, companyName) { const location = [fullLocation, remote].filter(Boolean).join(', '); const slugified = (j.name || '').toLowerCase().replace(/[^a-z0-9]+/g, '-').replace(/^-|-$/g, ''); const url = j.ref - ? j.ref.replace('api.smartrecruiters.com/v1/companies/', 'jobs.smartrecruiters.com/') - : `https://jobs.smartrecruiters.com/${(companyName || '').toLowerCase()}/${j.id}-${slugified}`; + ? j.ref.replace(/^https:\/\/api\.smartrecruiters\.com\/v1\/companies\//, 'https://jobs.smartrecruiters.com/') + : j.id ? `https://jobs.smartrecruiters.com/${(companyName || '').toLowerCase()}/${j.id}-${slugified}` : ''; return { title: j.name || '', url, location, company: companyName }; }); } diff --git a/providers/workable.mjs b/providers/workable.mjs index c1c5606c9d..5540cef62b 100644 --- a/providers/workable.mjs +++ b/providers/workable.mjs @@ -24,7 +24,7 @@ function assertWorkableUrl(url) { } function resolveFeedUrl(entry) { - const url = entry.careers_url || ''; + const url = typeof entry.careers_url === 'string' ? entry.careers_url : ''; const match = url.match(/apply\.workable\.com\/([^/?#]+)/); if (!match) return null; return `https://apply.workable.com/${match[1]}/jobs.md`; @@ -72,9 +72,10 @@ export function parseWorkableMarkdown(text, companyName) { const title = cols[1]; if (!title || title === 'Title') continue; const location = cols[3] || ''; - const urlMatch = cols[7].match(/\(([^)]+)\)/); + const urlMatch = line.match(/\[View\]\(([^)]+)\)/); let url = urlMatch ? 
urlMatch[1] : ''; if (url.endsWith('.md')) url = url.slice(0, -3); + if (!url) continue; // skip rows with no resolvable URL (e.g., malformed [View] link) jobs.push({ title, url, location, company: companyName }); } return jobs; diff --git a/test-all.mjs b/test-all.mjs index 7de46f1621..d4a9687890 100644 --- a/test-all.mjs +++ b/test-all.mjs @@ -370,9 +370,9 @@ try { if (parseWorkableMarkdown(null, 'X').length === 0) pass('null input → empty result (no crash)'); else fail('null input should yield empty result without crashing'); - // SSRF defence: untrusted hostname rejected before fetch + // fetch() reaches the http context on the happy path (allowed hostname). await workable.fetch( - { name: 'Bad', careers_url: 'https://apply.workable.com/evil' }, + { name: 'Smoke', careers_url: 'https://apply.workable.com/optimile' }, { transport: 'http', fetchText: async (url) => { @@ -384,7 +384,48 @@ try { fetchJson: async () => { throw new Error('fetchJson should not be called'); }, }, ); - pass('workable.fetch() reaches fetchText with allowed host'); + pass('workable.fetch() reaches fetchText on the happy path (allowed hostname)'); + + // fetch() rejects an unresolvable careers_url (no apply.workable.com match in URL). + let rejected = false; + try { + await workable.fetch( + { name: 'BadUrl', careers_url: 'https://evil.com/totally-not-workable' }, + { + transport: 'http', + fetchText: async () => { throw new Error('SSRF! should not reach here'); }, + fetchJson: async () => { throw new Error('SSRF! should not reach here'); }, + }, + ); + } catch (e) { + if (e.message.includes('cannot derive feed URL')) { + rejected = true; + } else { + fail(`workable.fetch() rejected with wrong error: ${e.message}`); + } + } + if (rejected) pass('workable.fetch() rejects unresolvable careers_url before fetch'); + else fail('workable.fetch() should throw cannot-derive-feed-URL for non-Workable URLs'); + + // careers_url with non-string value (e.g. 
YAML mistake passing a number) → detect() returns null without crashing + if (workable.detect({ name: 'X', careers_url: 42 }) === null) { + pass('workable.detect() returns null for non-string careers_url (42)'); + } else { + fail('workable.detect() should treat non-string careers_url as missing'); + } + + // Workable parser tolerates a title with a stray pipe — URL is extracted from the line, not cols[7] + const strayPipeMd = [ + '| Title | Department | Location | Type | Salary | Posted | Details |', + '|---|---|---|---|---|---|---|', + '| Senior PM (full | part-time) | Product | Remote | Full-time | — | 2026-04-01 | [View](https://apply.workable.com/x/jobs/view/PIPE.md) |', + ].join('\n'); + const strayJobs = parseWorkableMarkdown(strayPipeMd, 'X'); + if (strayJobs.length === 1 && strayJobs[0].url === 'https://apply.workable.com/x/jobs/view/PIPE') { + pass('parseWorkableMarkdown extracts URL from line-level regex (survives stray pipes in title)'); + } else { + fail(`stray-pipe row not handled correctly: ${JSON.stringify(strayJobs)}`); + } } catch (e) { fail(`workable provider tests crashed: ${e.message}`); @@ -481,6 +522,24 @@ try { fail('non-array content should yield empty result'); } + // careers_url with non-string value → detect() returns null without crashing + if (sr.detect({ name: 'X', careers_url: { foo: 'bar' } }) === null) { + pass('smartrecruiters.detect() returns null for non-string careers_url (object)'); + } else { + fail('smartrecruiters.detect() should treat non-string careers_url as missing'); + } + + // Fallback URL when both ref AND id are missing → empty string (not "undefined" in URL) + const noRefNoId = parseSmartRecruitersResponse( + { content: [{ name: 'Stranded Role' }] }, + 'X', + ); + if (noRefNoId.length === 1 && noRefNoId[0].url === '') { + pass('parseSmartRecruitersResponse returns url="" when both ref and id are missing'); + } else { + fail(`expected url='' when ref+id both missing, got ${JSON.stringify(noRefNoId[0])}`); + } + } 
catch (e) { fail(`smartrecruiters provider tests crashed: ${e.message}`); } @@ -548,6 +607,13 @@ try { fail('null offers should yield empty result'); } + // careers_url with non-string value → detect() returns null without crashing + if (recruitee.detect({ name: 'X', careers_url: null }) === null && recruitee.detect({ name: 'X', careers_url: 7 }) === null) { + pass('recruitee.detect() returns null for non-string careers_url (null and 7)'); + } else { + fail('recruitee.detect() should treat non-string careers_url as missing'); + } + } catch (e) { fail(`recruitee provider tests crashed: ${e.message}`); } From 09b6f2b0165eed2d8456b31cf9d40f11e08c00b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jes=C3=BAs=20Rojo=20Mart=C3=ADnez?= Date: Sat, 16 May 2026 18:58:40 +0200 Subject: [PATCH 6/7] fix(providers): strict URL parsing + ref validation (review) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses 5 CodeRabbit comments on PR #653 asking for tighter validation than substring regex on raw URL strings. - All 3 providers: detect()/resolveXxxUrl() now use new URL() to parse careers_url, verify protocol === 'https:', check hostname exactly (Workable: apply.workable.com; SmartRecruiters: careers./jobs.smartrecruiters.com; Recruitee: regex-validated .recruitee.com), then derive the slug from the parsed pathname/hostname. This rejects path-spoofed inputs like https://evil.example/apply.workable.com/slug (substring regex would have falsely matched). - smartrecruiters parseSmartRecruitersResponse: j.ref is now validated (parses as URL, hostname must be api.smartrecruiters.com, pathname must start with /v1/companies/) before the prefix rewrite. Invalid refs fall through to the fallback URL path. The fallback companyName is now slugified (non-alphanumerics → -, strip leading/trailing -) so "My Acme & Co." → "my-acme-co" rather than producing a URL with raw spaces/symbols. 
- test-all.mjs: 5 new assertions covering the path-spoof rejection for all 3 providers, the untrusted-ref-host fall-through, and the companyName slugification. Refs #651 --- providers/recruitee.mjs | 15 ++++++++---- providers/smartrecruiters.mjs | 39 +++++++++++++++++++++++++------ providers/workable.mjs | 17 ++++++++++---- test-all.mjs | 44 +++++++++++++++++++++++++++++++++++ 4 files changed, 100 insertions(+), 15 deletions(-) diff --git a/providers/recruitee.mjs b/providers/recruitee.mjs index 8c94cf5f5b..21af5bbd6a 100644 --- a/providers/recruitee.mjs +++ b/providers/recruitee.mjs @@ -24,10 +24,17 @@ function assertRecruiteeUrl(url) { } function resolveApiUrl(entry) { - const url = typeof entry.careers_url === 'string' ? entry.careers_url : ''; - const match = url.match(/([a-z0-9][a-z0-9-]*)\.recruitee\.com/); - if (!match) return null; - return `https://${match[1]}.recruitee.com/api/offers/`; + const raw = typeof entry.careers_url === 'string' ? entry.careers_url : ''; + if (!raw) return null; + let parsed; + try { + parsed = new URL(raw); + } catch { + return null; + } + if (parsed.protocol !== 'https:') return null; + if (!RECRUITEE_HOST_RE.test(parsed.hostname)) return null; + return `https://${parsed.hostname}/api/offers/`; } /** @type {Provider} */ diff --git a/providers/smartrecruiters.mjs b/providers/smartrecruiters.mjs index 092ec9b1d2..519f65b37d 100644 --- a/providers/smartrecruiters.mjs +++ b/providers/smartrecruiters.mjs @@ -8,6 +8,7 @@ // detection (useful when the public careers URL is a branded custom domain). const ALLOWED_SMARTRECRUITERS_HOSTS = new Set(['api.smartrecruiters.com']); +const SR_CAREERS_HOSTS = new Set(['careers.smartrecruiters.com', 'jobs.smartrecruiters.com']); const SR_PAGE_SIZE = 100; function assertSmartRecruitersUrl(url) { @@ -25,10 +26,19 @@ function assertSmartRecruitersUrl(url) { } function resolveApiUrl(entry) { - const url = typeof entry.careers_url === 'string' ? 
entry.careers_url : ''; - const match = url.match(/(?:careers|jobs)\.smartrecruiters\.com\/([^/?#]+)/); - if (!match) return null; - return `https://api.smartrecruiters.com/v1/companies/${match[1]}/postings?limit=${SR_PAGE_SIZE}&offset=0&status=PUBLIC`; + const raw = typeof entry.careers_url === 'string' ? entry.careers_url : ''; + if (!raw) return null; + let parsed; + try { + parsed = new URL(raw); + } catch { + return null; + } + if (parsed.protocol !== 'https:') return null; + if (!SR_CAREERS_HOSTS.has(parsed.hostname)) return null; + const slug = parsed.pathname.split('/').filter(Boolean)[0]; + if (!slug) return null; + return `https://api.smartrecruiters.com/v1/companies/${slug}/postings?limit=${SR_PAGE_SIZE}&offset=0&status=PUBLIC`; } /** @type {Provider} */ @@ -74,9 +84,24 @@ export function parseSmartRecruitersResponse(json, companyName) { const remote = loc.remote ? 'Remote' : ''; const location = [fullLocation, remote].filter(Boolean).join(', '); const slugified = (j.name || '').toLowerCase().replace(/[^a-z0-9]+/g, '-').replace(/^-|-$/g, ''); - const url = j.ref - ? j.ref.replace(/^https:\/\/api\.smartrecruiters\.com\/v1\/companies\//, 'https://jobs.smartrecruiters.com/') - : j.id ? 
`https://jobs.smartrecruiters.com/${(companyName || '').toLowerCase()}/${j.id}-${slugified}` : ''; + let url = ''; + if (typeof j.ref === 'string') { + let parsedRef; + try { parsedRef = new URL(j.ref); } catch { parsedRef = null; } + if (parsedRef + && parsedRef.protocol === 'https:' + && parsedRef.hostname === 'api.smartrecruiters.com' + && parsedRef.pathname.startsWith('/v1/companies/')) { + const restOfPath = parsedRef.pathname.slice('/v1/companies/'.length); + url = `https://jobs.smartrecruiters.com/${restOfPath}`; + } + } + if (!url && j.id) { + const companySlug = (companyName || '').toLowerCase().replace(/[^a-z0-9]+/g, '-').replace(/^-|-$/g, ''); + if (companySlug) { + url = `https://jobs.smartrecruiters.com/${companySlug}/${j.id}-${slugified}`; + } + } return { title: j.name || '', url, location, company: companyName }; }); } diff --git a/providers/workable.mjs b/providers/workable.mjs index 5540cef62b..91cbaee752 100644 --- a/providers/workable.mjs +++ b/providers/workable.mjs @@ -24,10 +24,19 @@ function assertWorkableUrl(url) { } function resolveFeedUrl(entry) { - const url = typeof entry.careers_url === 'string' ? entry.careers_url : ''; - const match = url.match(/apply\.workable\.com\/([^/?#]+)/); - if (!match) return null; - return `https://apply.workable.com/${match[1]}/jobs.md`; + const raw = typeof entry.careers_url === 'string' ? 
entry.careers_url : ''; + if (!raw) return null; + let parsed; + try { + parsed = new URL(raw); + } catch { + return null; + } + if (parsed.protocol !== 'https:') return null; + if (parsed.hostname !== 'apply.workable.com') return null; + const slug = parsed.pathname.split('/').filter(Boolean)[0]; + if (!slug) return null; + return `https://apply.workable.com/${slug}/jobs.md`; } /** @type {Provider} */ diff --git a/test-all.mjs b/test-all.mjs index d4a9687890..1ce3fb58d9 100644 --- a/test-all.mjs +++ b/test-all.mjs @@ -407,6 +407,14 @@ try { if (rejected) pass('workable.fetch() rejects unresolvable careers_url before fetch'); else fail('workable.fetch() should throw cannot-derive-feed-URL for non-Workable URLs'); + // SSRF: malicious URL with apply.workable.com in the PATH (not hostname) must not be detected as Workable. + // With strict URL parsing, the hostname `evil.example` fails the check and detect() returns null. + if (workable.detect({ name: 'Spoof', careers_url: 'https://evil.example/apply.workable.com/slug' }) === null) { + pass('workable.detect() rejects path-spoofed URLs (apply.workable.com in path, not hostname)'); + } else { + fail('workable.detect() must NOT misdetect URLs that contain apply.workable.com in the path'); + } + // careers_url with non-string value (e.g. YAML mistake passing a number) → detect() returns null without crashing if (workable.detect({ name: 'X', careers_url: 42 }) === null) { pass('workable.detect() returns null for non-string careers_url (42)'); @@ -540,6 +548,35 @@ try { fail(`expected url='' when ref+id both missing, got ${JSON.stringify(noRefNoId[0])}`); } + // SSRF: malicious URL with smartrecruiters hostname in the PATH (not host) must not be detected. 
+ if (sr.detect({ name: 'Spoof', careers_url: 'https://evil.example/careers.smartrecruiters.com/slug' }) === null) { + pass('smartrecruiters.detect() rejects path-spoofed URLs'); + } else { + fail('smartrecruiters.detect() must NOT misdetect path-spoofed URLs'); + } + + // SmartRecruiters: untrusted j.ref host falls through to fallback rather than rewriting + const bogusRef = parseSmartRecruitersResponse( + { content: [{ id: 'X1', name: 'Strange Role', ref: 'https://evil.example/v1/companies/x/postings/X1' }] }, + 'TestCo', + ); + if (bogusRef[0]?.url && !bogusRef[0].url.includes('evil.example')) { + pass('parseSmartRecruitersResponse rejects untrusted j.ref host (falls through to fallback)'); + } else { + fail(`untrusted j.ref leaked into url: ${JSON.stringify(bogusRef[0]?.url)}`); + } + + // SmartRecruiters: companyName with spaces/symbols is slugified for the fallback URL + const slugifiedCompany = parseSmartRecruitersResponse( + { content: [{ id: 'X2', name: 'Strange Role' }] }, + 'My Acme & Co.', + ); + if (slugifiedCompany[0]?.url === 'https://jobs.smartrecruiters.com/my-acme-co/X2-strange-role') { + pass('parseSmartRecruitersResponse slugifies the companyName for the fallback URL'); + } else { + fail(`fallback URL not properly slugified: ${JSON.stringify(slugifiedCompany[0]?.url)}`); + } + } catch (e) { fail(`smartrecruiters provider tests crashed: ${e.message}`); } @@ -614,6 +651,13 @@ try { fail('recruitee.detect() should treat non-string careers_url as missing'); } + // SSRF: malicious URL with recruitee.com in the PATH (not host) must not be detected. 
+ if (recruitee.detect({ name: 'Spoof', careers_url: 'https://evil.example/channable.recruitee.com/foo' }) === null) { + pass('recruitee.detect() rejects path-spoofed URLs'); + } else { + fail('recruitee.detect() must NOT misdetect path-spoofed URLs'); + } + } catch (e) { fail(`recruitee provider tests crashed: ${e.message}`); } From 434375bb380a7ac4a622d389a4478d7a5ba20b99 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jes=C3=BAs=20Rojo=20Mart=C3=ADnez?= Date: Sat, 16 May 2026 19:10:25 +0200 Subject: [PATCH 7/7] fix(providers): validate parsed URLs + paginate SmartRecruiters (review) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses 3 CodeRabbit comments on PR #653 (round 2). - recruitee: parseRecruiteeResponse now validates the offer URL via new URL() + protocol === 'https:' + RECRUITEE_HOST_RE hostname check. Off-domain or non-HTTPS values are dropped (url = '' per the Job contract) rather than passed through verbatim. - workable: parseWorkableMarkdown now validates each [View] link the same way (hostname must be apply.workable.com, protocol must be https). Rows that fail validation are skipped (continue), matching the existing "skip rows with no resolvable URL" semantic. - smartrecruiters: fetch() now paginates the /postings endpoint instead of returning only the first 100 results. Added resolveSlug() and buildPostingsUrl(slug, offset) helpers, refactored resolveApiUrl() to delegate to them, and the fetch loop walks offsets 0, SR_PAGE_SIZE, 2*SR_PAGE_SIZE, ... until either an empty page or a short page (less than SR_PAGE_SIZE). Safety cap SR_MAX_PAGES = 50 (= 5000 postings) prevents runaway loops against a broken API. 
- test-all.mjs: 4 new assertions - Workable: off-domain + non-https [View] links are dropped - Recruitee: off-domain + non-https + missing offer URLs → url='' - SmartRecruiters: 2-page aggregation (150 items across 2 pages) - SmartRecruiters: stop on the first empty page (1 request) Refs #651 --- providers/recruitee.mjs | 21 ++++++++- providers/smartrecruiters.mjs | 34 +++++++++++---- providers/workable.mjs | 12 ++++++ test-all.mjs | 81 +++++++++++++++++++++++++++++++++++ 4 files changed, 138 insertions(+), 10 deletions(-) diff --git a/providers/recruitee.mjs b/providers/recruitee.mjs index 21af5bbd6a..b7886ca336 100644 --- a/providers/recruitee.mjs +++ b/providers/recruitee.mjs @@ -61,7 +61,9 @@ export default { * Recruitee returns: * { offers: [{ title, careers_url?, url?, city?, country?, remote?, location? }] } * - * - url: prefer `careers_url`, fall back to `url`, empty string otherwise. + * - url: prefer `careers_url`, fall back to `url`; validated against + * `https://.recruitee.com` — an off-domain or non-HTTPS URL is + * dropped (empty string returned per the Job contract). * - location: prefer the explicit `location` field; else assemble from * city/country, appending "Remote" when `remote` is true. * @@ -77,9 +79,24 @@ export function parseRecruiteeResponse(json, companyName) { const country = j.country || ''; const remote = j.remote ? 'Remote' : ''; const location = j.location || [city, country, remote].filter(Boolean).join(', '); + + // Validate offer URL: must parse as https://.recruitee.com/... 
+ let url = ''; + const rawUrl = j.careers_url || j.url || ''; + if (typeof rawUrl === 'string' && rawUrl) { + try { + const parsed = new URL(rawUrl); + if (parsed.protocol === 'https:' && RECRUITEE_HOST_RE.test(parsed.hostname)) { + url = parsed.href; + } + } catch { + // malformed URL → leave url = '' + } + } + return { title: j.title || '', - url: j.careers_url || j.url || '', + url, location, company: companyName, }; diff --git a/providers/smartrecruiters.mjs b/providers/smartrecruiters.mjs index 519f65b37d..debe7f3a0d 100644 --- a/providers/smartrecruiters.mjs +++ b/providers/smartrecruiters.mjs @@ -10,6 +10,7 @@ const ALLOWED_SMARTRECRUITERS_HOSTS = new Set(['api.smartrecruiters.com']); const SR_CAREERS_HOSTS = new Set(['careers.smartrecruiters.com', 'jobs.smartrecruiters.com']); const SR_PAGE_SIZE = 100; +const SR_MAX_PAGES = 50; // safety cap (5000 postings @ 100/page) function assertSmartRecruitersUrl(url) { let parsed; @@ -25,7 +26,7 @@ function assertSmartRecruitersUrl(url) { return url; } -function resolveApiUrl(entry) { +function resolveSlug(entry) { const raw = typeof entry.careers_url === 'string' ? entry.careers_url : ''; if (!raw) return null; let parsed; @@ -37,8 +38,16 @@ function resolveApiUrl(entry) { if (parsed.protocol !== 'https:') return null; if (!SR_CAREERS_HOSTS.has(parsed.hostname)) return null; const slug = parsed.pathname.split('/').filter(Boolean)[0]; - if (!slug) return null; - return `https://api.smartrecruiters.com/v1/companies/${slug}/postings?limit=${SR_PAGE_SIZE}&offset=0&status=PUBLIC`; + return slug || null; +} + +function buildPostingsUrl(slug, offset = 0) { + return `https://api.smartrecruiters.com/v1/companies/${slug}/postings?limit=${SR_PAGE_SIZE}&offset=${offset}&status=PUBLIC`; +} + +function resolveApiUrl(entry) { + const slug = resolveSlug(entry); + return slug ? 
buildPostingsUrl(slug, 0) : null; } /** @type {Provider} */ @@ -51,11 +60,20 @@ export default { }, async fetch(entry, ctx) { - const apiUrl = resolveApiUrl(entry); - if (!apiUrl) throw new Error(`smartrecruiters: cannot derive API URL for ${entry.name}`); - assertSmartRecruitersUrl(apiUrl); - const json = await ctx.fetchJson(apiUrl, { redirect: 'error' }); - return parseSmartRecruitersResponse(json, entry.name); + const slug = resolveSlug(entry); + if (!slug) throw new Error(`smartrecruiters: cannot derive API URL for ${entry.name}`); + + const all = []; + for (let page = 0; page < SR_MAX_PAGES; page++) { + const apiUrl = buildPostingsUrl(slug, page * SR_PAGE_SIZE); + assertSmartRecruitersUrl(apiUrl); + const json = await ctx.fetchJson(apiUrl, { redirect: 'error' }); + const parsed = parseSmartRecruitersResponse(json, entry.name); + if (parsed.length === 0) break; + all.push(...parsed); + if (parsed.length < SR_PAGE_SIZE) break; // last page (short) + } + return all; }, }; diff --git a/providers/workable.mjs b/providers/workable.mjs index 91cbaee752..c491f6daba 100644 --- a/providers/workable.mjs +++ b/providers/workable.mjs @@ -65,6 +65,8 @@ export default { * | Title | Department | Location | Type | Salary | Posted | Details | * where `Details` holds a markdown link * [View](https://apply.workable.com//jobs/view/.md) + * URLs are validated against `https://apply.workable.com/` — off-domain or + * non-HTTPS [View] links are skipped (not emitted). * * @param {string} text — markdown body * @param {string} companyName — value to write into job.company @@ -85,6 +87,16 @@ export function parseWorkableMarkdown(text, companyName) { let url = urlMatch ? urlMatch[1] : ''; if (url.endsWith('.md')) url = url.slice(0, -3); if (!url) continue; // skip rows with no resolvable URL (e.g., malformed [View] link) + + // Validate the extracted URL — must parse as https://apply.workable.com/... 
+ try { + const parsedUrl = new URL(url); + if (parsedUrl.protocol !== 'https:' || parsedUrl.hostname !== 'apply.workable.com') continue; + url = parsedUrl.href; + } catch { + continue; + } + jobs.push({ title, url, location, company: companyName }); } return jobs; diff --git a/test-all.mjs b/test-all.mjs index 1ce3fb58d9..9ea3aa4848 100644 --- a/test-all.mjs +++ b/test-all.mjs @@ -435,6 +435,21 @@ try { fail(`stray-pipe row not handled correctly: ${JSON.stringify(strayJobs)}`); } + // Off-domain [View] link is dropped (URL validation) + const offDomainMd = [ + '| Title | Department | Location | Type | Salary | Posted | Details |', + '|---|---|---|---|---|---|---|', + '| Good Role | Product | Remote | Full-time | — | 2026-04-01 | [View](https://apply.workable.com/x/jobs/view/ABC.md) |', + '| Evil Role | Product | Remote | Full-time | — | 2026-04-01 | [View](https://evil.example/jobs/view/X) |', + '| Insecure Role | Product | Remote | Full-time | — | 2026-04-01 | [View](http://apply.workable.com/x/jobs/view/Y.md) |', + ].join('\n'); + const filteredJobs = parseWorkableMarkdown(offDomainMd, 'X'); + if (filteredJobs.length === 1 && filteredJobs[0].title === 'Good Role') { + pass('parseWorkableMarkdown drops off-domain and non-https [View] links'); + } else { + fail(`expected only "Good Role" through, got ${JSON.stringify(filteredJobs.map(j => j.title))}`); + } + } catch (e) { fail(`workable provider tests crashed: ${e.message}`); } @@ -577,6 +592,54 @@ try { fail(`fallback URL not properly slugified: ${JSON.stringify(slugifiedCompany[0]?.url)}`); } + // Pagination: fetch() loops until an empty page (or short page) is returned + let pageRequests = 0; + const pagedJobs = await sr.fetch( + { name: 'PagedCo', careers_url: 'https://careers.smartrecruiters.com/paged' }, + { + transport: 'http', + fetchText: async () => { throw new Error('fetchText should not be called'); }, + fetchJson: async (url) => { + pageRequests++; + const offset = parseInt(new 
URL(url).searchParams.get('offset') || '0', 10); + if (offset === 0) { + // Page 1: full page (100 items) + return { content: Array.from({ length: 100 }, (_, i) => ({ id: `P1-${i}`, name: `Role 1-${i}` })) }; + } + if (offset === 100) { + // Page 2: short page (50 items) → loop stops after this + return { content: Array.from({ length: 50 }, (_, i) => ({ id: `P2-${i}`, name: `Role 2-${i}` })) }; + } + // Should not be reached because page 2 was short + return { content: [] }; + }, + }, + ); + if (pageRequests === 2 && pagedJobs.length === 150) { + pass('smartrecruiters.fetch() paginates and aggregates results (2 pages → 150 total)'); + } else { + fail(`pagination: pageRequests=${pageRequests}, total=${pagedJobs.length} (expected 2 requests / 150 results)`); + } + + // Pagination stop condition: empty content terminates the loop + let emptyPageRequests = 0; + const emptyJobs = await sr.fetch( + { name: 'EmptyCo', careers_url: 'https://careers.smartrecruiters.com/empty' }, + { + transport: 'http', + fetchText: async () => { throw new Error('fetchText should not be called'); }, + fetchJson: async () => { + emptyPageRequests++; + return { content: [] }; + }, + }, + ); + if (emptyPageRequests === 1 && emptyJobs.length === 0) { + pass('smartrecruiters.fetch() stops on the first empty page'); + } else { + fail(`empty pagination: requests=${emptyPageRequests}, total=${emptyJobs.length}`); + } + } catch (e) { fail(`smartrecruiters provider tests crashed: ${e.message}`); } @@ -658,6 +721,24 @@ try { fail('recruitee.detect() must NOT misdetect path-spoofed URLs'); } + // Off-domain offer URL is dropped (URL validation) + const offDomainOffers = parseRecruiteeResponse( + { + offers: [ + { title: 'Good', careers_url: 'https://channable.recruitee.com/o/good' }, + { title: 'Evil', careers_url: 'https://evil.example/o/evil' }, + { title: 'Insecure', careers_url: 'http://channable.recruitee.com/o/insecure' }, + { title: 'No URL field' }, + ], + }, + 'Channable', + ); + if 
(offDomainOffers[0]?.url === 'https://channable.recruitee.com/o/good' && offDomainOffers[1]?.url === '' && offDomainOffers[2]?.url === '' && offDomainOffers[3]?.url === '') { + pass('parseRecruiteeResponse drops off-domain, non-https, and missing offer URLs'); + } else { + fail(`URL validation: row0=${JSON.stringify(offDomainOffers[0]?.url)}, row1=${JSON.stringify(offDomainOffers[1]?.url)}, row2=${JSON.stringify(offDomainOffers[2]?.url)}, row3=${JSON.stringify(offDomainOffers[3]?.url)}`); + } + } catch (e) { fail(`recruitee provider tests crashed: ${e.message}`); }