From 8b96b1f2fc63e90c5ccc64d30bb70fa415df2f19 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jes=C3=BAs=20Rojo=20Mart=C3=ADnez?=
 <jrojomartinez@gmail.com>
Date: Sat, 16 May 2026 18:26:50 +0200
Subject: [PATCH 1/7] feat(providers): add Workable provider
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Workable's documented JSON API requires an auth token; the only
no-auth public surface is a Markdown feed at
`apply.workable.com/<slug>/jobs.md`. The provider auto-detects from
the `apply.workable.com/<slug>` careers_url pattern, fetches via
ctx.fetchText, and parses the table rows.

Follows the SSRF defence pattern from providers/greenhouse.mjs:
hostname allowlist + URL parse + HTTPS check + redirect:'error' on
the fetch call.

Exports parseWorkableMarkdown as a named export so test-all.mjs §11
can unit-test the parser independently of the network.

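Note: this commit's test-all.mjs diff also adds the §12 (SmartRecruiters)
and §13 (Recruitee) test sections. They import providers/smartrecruiters.mjs
and providers/recruitee.mjs, which are only created in PATCH 2/7 and
PATCH 3/7, so those two sections cannot load their provider until the
later patches are applied.

Tests in test-all.mjs §11: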
  - detect() resolves apply.workable.com/<slug> → /jobs.md feed
  - detect() returns null for non-workable URLs
  - parseWorkableMarkdown extracts title/location/company correctly
  - parseWorkableMarkdown strips .md suffix from job URLs
  - empty / null inputs yield empty results without crashing
  - fetch() with allowed hostname reaches the http context

Refs #651
---
 providers/workable.mjs |  81 ++++++++++++++
 test-all.mjs           | 238 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 319 insertions(+)
 create mode 100644 providers/workable.mjs
diff --git a/providers/workable.mjs b/providers/workable.mjs
new file mode 100644
index 0000000000..c1c5606c9d
--- /dev/null
+++ b/providers/workable.mjs
@@ -0,0 +1,81 @@
+// @ts-check
+/** @typedef {import('./_types.js').Provider} Provider */
+
+// Workable provider — hits the public markdown feed at /<slug>/jobs.md.
+// Workable's documented JSON API requires an auth token; the markdown feed
+// is the only no-auth public surface. Auto-detects from careers_url pattern
+// `https://apply.workable.com/<slug>`. A tracked_companies entry can also
+// set `provider: workable` explicitly to bypass detection.
+
+const ALLOWED_WORKABLE_HOSTS = new Set(['apply.workable.com']);
+
+function assertWorkableUrl(url) {
+  let parsed;
+  try {
+    parsed = new URL(url);
+  } catch {
+    throw new Error(`workable: invalid URL: ${url}`);
+  }
+  if (parsed.protocol !== 'https:') throw new Error(`workable: URL must use HTTPS: ${url}`);
+  if (!ALLOWED_WORKABLE_HOSTS.has(parsed.hostname)) {
+    throw new Error(`workable: untrusted hostname "${parsed.hostname}" — must be one of: ${[...ALLOWED_WORKABLE_HOSTS].join(', ')}`);
+  }
+  return url;
+}
+
+function resolveFeedUrl(entry) {
+  const url = entry.careers_url || '';
+  const match = url.match(/apply\.workable\.com\/([^/?#]+)/);
+  if (!match) return null;
+  return `https://apply.workable.com/${match[1]}/jobs.md`;
+}
+
+/** @type {Provider} */
+export default {
+  id: 'workable',
+
+  detect(entry) {
+    const feedUrl = resolveFeedUrl(entry);
+    return feedUrl ? { url: feedUrl } : null;
+  },
+
+  async fetch(entry, ctx) {
+    const feedUrl = resolveFeedUrl(entry);
+    if (!feedUrl) throw new Error(`workable: cannot derive feed URL for ${entry.name}`);
+    assertWorkableUrl(feedUrl);
+    // redirect:'error' prevents SSRF via server-side redirects; combined with
+    // assertWorkableUrl above it guarantees the final hostname stays in the allowlist.
+    const text = await ctx.fetchText(feedUrl, { redirect: 'error' });
+    return parseWorkableMarkdown(text, entry.name);
+  },
+};
+
+/**
+ * Parse Workable's public markdown feed. Exported as a named export for unit
+ * tests. The feed exposes a table:
+ *   | Title | Department | Location | Type | Salary | Posted | Details |
+ * where `Details` holds a markdown link
+ *   [View](https://apply.workable.com/<slug>/jobs/view/<id>.md)
+ *
+ * @param {string} text — markdown body
+ * @param {string} companyName — value to write into job.company
+ * @returns {Array<{title: string, url: string, company: string, location: string}>}
+ */
+export function parseWorkableMarkdown(text, companyName) {
+  if (typeof text !== 'string') return [];
+  const jobs = [];
+  for (const line of text.split('\n')) {
+    if (!line.startsWith('|') || !line.includes('[View]')) continue;
+    const cols = line.split('|').map(c => c.trim());
+    // Cols: ['', title, dept, location, type, salary, posted, '[View](url.md)', '']
+    if (cols.length < 8) continue;
+    const title = cols[1];
+    if (!title || title === 'Title') continue;
+    const location = cols[3] || '';
+    const urlMatch = cols[7].match(/\(([^)]+)\)/);
+    let url = urlMatch ? urlMatch[1] : '';
+    if (url.endsWith('.md')) url = url.slice(0, -3);
+    jobs.push({ title, url, location, company: companyName });
+  }
+  return jobs;
+}
diff --git a/test-all.mjs b/test-all.mjs
index c5152fc11f..7de46f1621 100644
--- a/test-all.mjs
+++ b/test-all.mjs
@@ -314,6 +314,244 @@ if (fileExists('VERSION')) {
   fail('VERSION file missing');
 }
 
+// ── 11. PROVIDERS — Workable ────────────────────────────────────────
+
+console.log('\n11. Provider — workable');
+
+try {
+  const workable = (await import(pathToFileURL(join(ROOT, 'providers/workable.mjs')).href)).default;
+  const { parseWorkableMarkdown } = await import(pathToFileURL(join(ROOT, 'providers/workable.mjs')).href);
+
+  // detect() — auto-detection from careers_url
+  if (workable.id === 'workable') pass('workable.id is "workable"');
+  else fail(`workable.id is ${JSON.stringify(workable.id)}`);
+
+  const hit = workable.detect({ name: 'TestCo', careers_url: 'https://apply.workable.com/optimile' });
+  if (hit && hit.url === 'https://apply.workable.com/optimile/jobs.md') {
+    pass('workable.detect() resolves apply.workable.com/<slug> → /jobs.md feed');
+  } else {
+    fail(`workable.detect() returned ${JSON.stringify(hit)}`);
+  }
+
+  const miss = workable.detect({ name: 'TestCo', careers_url: 'https://example.com/careers' });
+  if (miss === null) pass('workable.detect() returns null for non-workable URLs');
+  else fail(`workable.detect() should return null, got ${JSON.stringify(miss)}`);
+
+  // parse() — markdown table
+  const sampleMd = [
+    '# Optimile — All Open Positions',
+    '',
+    '| Title | Department | Location | Type | Salary | Posted | Details |',
+    '|---|---|---|---|---|---|---|',
+    '| Senior AI PM | Product | Ghent, Belgium | Full-time | — | 2026-04-01 | [View](https://apply.workable.com/optimile/jobs/view/ABC123.md) |',
+    '| Tech Lead | Engineering | Remote | Full-time | — | 2026-03-25 | [View](https://apply.workable.com/optimile/jobs/view/DEF456.md) |',
+  ].join('\n');
+
+  const jobs = parseWorkableMarkdown(sampleMd, 'Optimile');
+  if (jobs.length === 2) pass('parseWorkableMarkdown extracts 2 jobs from 2-row table');
+  else fail(`parseWorkableMarkdown returned ${jobs.length} jobs, expected 2`);
+
+  if (jobs[0]?.title === 'Senior AI PM' && jobs[0]?.location === 'Ghent, Belgium' && jobs[0]?.company === 'Optimile') {
+    pass('parseWorkableMarkdown extracts title, location, company correctly');
+  } else {
+    fail(`parseWorkableMarkdown row 0 = ${JSON.stringify(jobs[0])}`);
+  }
+
+  if (jobs[0]?.url === 'https://apply.workable.com/optimile/jobs/view/ABC123') {
+    pass('parseWorkableMarkdown strips .md suffix from job URL');
+  } else {
+    fail(`parseWorkableMarkdown should strip .md; got url=${JSON.stringify(jobs[0]?.url)}`);
+  }
+
+  // Robustness
+  if (parseWorkableMarkdown('', 'X').length === 0) pass('empty input → empty result');
+  else fail('empty input should yield empty result');
+
+  if (parseWorkableMarkdown(null, 'X').length === 0) pass('null input → empty result (no crash)');
+  else fail('null input should yield empty result without crashing');
+
+  // SSRF defence: untrusted hostname rejected before fetch
+  await workable.fetch(
+    { name: 'Bad', careers_url: 'https://apply.workable.com/evil' },
+    {
+      transport: 'http',
+      fetchText: async (url) => {
+        if (!url.startsWith('https://apply.workable.com/')) {
+          throw new Error('fetchText called with unexpected URL');
+        }
+        return '| Title | Department | Location | Type | Salary | Posted | Details |\n|---|---|---|---|---|---|---|\n';
+      },
+      fetchJson: async () => { throw new Error('fetchJson should not be called'); },
+    },
+  );
+  pass('workable.fetch() reaches fetchText with allowed host');
+
+} catch (e) {
+  fail(`workable provider tests crashed: ${e.message}`);
+}
+
+// ── 12. PROVIDERS — SmartRecruiters ─────────────────────────────────
+
+console.log('\n12. Provider — smartrecruiters');
+
+try {
+  const sr = (await import(pathToFileURL(join(ROOT, 'providers/smartrecruiters.mjs')).href)).default;
+  const { parseSmartRecruitersResponse } = await import(pathToFileURL(join(ROOT, 'providers/smartrecruiters.mjs')).href);
+
+  if (sr.id === 'smartrecruiters') pass('smartrecruiters.id is "smartrecruiters"');
+  else fail(`smartrecruiters.id is ${JSON.stringify(sr.id)}`);
+
+  const hitCareers = sr.detect({ name: 'Adyen', careers_url: 'https://careers.smartrecruiters.com/adyen' });
+  if (hitCareers && hitCareers.url.startsWith('https://api.smartrecruiters.com/v1/companies/adyen/postings')) {
+    pass('smartrecruiters.detect() resolves careers.smartrecruiters.com/<slug> → api URL');
+  } else {
+    fail(`smartrecruiters.detect(careers) returned ${JSON.stringify(hitCareers)}`);
+  }
+
+  const hitJobs = sr.detect({ name: 'X', careers_url: 'https://jobs.smartrecruiters.com/x' });
+  if (hitJobs && hitJobs.url.startsWith('https://api.smartrecruiters.com/v1/companies/x/postings')) {
+    pass('smartrecruiters.detect() also handles jobs.smartrecruiters.com');
+  } else {
+    fail(`smartrecruiters.detect(jobs) returned ${JSON.stringify(hitJobs)}`);
+  }
+
+  if (sr.detect({ name: 'X', careers_url: 'https://example.com/careers' }) === null) {
+    pass('smartrecruiters.detect() returns null for non-SR URLs');
+  } else {
+    fail('smartrecruiters.detect() should return null for non-SR URLs');
+  }
+
+  // parseSmartRecruitersResponse
+  const sample = {
+    content: [
+      {
+        id: 'abc-123',
+        name: 'Senior PM',
+        ref: 'https://api.smartrecruiters.com/v1/companies/sgs/postings/abc-123',
+        location: { fullLocation: 'Geneva, Switzerland', remote: false },
+      },
+      {
+        id: 'def-456',
+        name: 'Remote AI Engineer',
+        ref: 'https://api.smartrecruiters.com/v1/companies/sgs/postings/def-456',
+        location: { city: 'Paris', country: 'France', remote: true },
+      },
+      {
+        id: 'ghi-789',
+        name: 'No-ref Role',
+        location: { fullLocation: 'Berlin, Germany' },
+      },
+    ],
+  };
+  const jobs = parseSmartRecruitersResponse(sample, 'SGS');
+  if (jobs.length === 3) pass('parseSmartRecruitersResponse extracts 3 jobs');
+  else fail(`parseSmartRecruitersResponse returned ${jobs.length} jobs`);
+
+  if (jobs[0]?.location === 'Geneva, Switzerland' && jobs[0]?.title === 'Senior PM') {
+    pass('parseSmartRecruitersResponse uses fullLocation when present');
+  } else {
+    fail(`row 0 = ${JSON.stringify(jobs[0])}`);
+  }
+
+  if (jobs[1]?.location === 'Paris, France, Remote') {
+    pass('parseSmartRecruitersResponse builds location from city/country/remote when no fullLocation');
+  } else {
+    fail(`row 1 location = ${JSON.stringify(jobs[1]?.location)}, expected "Paris, France, Remote"`);
+  }
+
+  if (jobs[0]?.url === 'https://jobs.smartrecruiters.com/sgs/postings/abc-123') {
+    pass('parseSmartRecruitersResponse rewrites api.smartrecruiters.com → jobs.smartrecruiters.com');
+  } else {
+    fail(`row 0 url = ${JSON.stringify(jobs[0]?.url)}`);
+  }
+
+  if (jobs[2]?.url && jobs[2].url.startsWith('https://jobs.smartrecruiters.com/sgs/ghi-789')) {
+    pass('parseSmartRecruitersResponse falls back to synthetic URL when ref is missing');
+  } else {
+    fail(`row 2 url = ${JSON.stringify(jobs[2]?.url)}`);
+  }
+
+  // Empty input safety
+  if (parseSmartRecruitersResponse({}, 'X').length === 0) pass('empty {} input → empty result');
+  else fail('empty {} input should yield empty result');
+
+  if (parseSmartRecruitersResponse({ content: 'not an array' }, 'X').length === 0) {
+    pass('non-array content → empty result (no crash)');
+  } else {
+    fail('non-array content should yield empty result');
+  }
+
+} catch (e) {
+  fail(`smartrecruiters provider tests crashed: ${e.message}`);
+}
+
+// ── 13. PROVIDERS — Recruitee ───────────────────────────────────────
+
+console.log('\n13. Provider — recruitee');
+
+try {
+  const recruitee = (await import(pathToFileURL(join(ROOT, 'providers/recruitee.mjs')).href)).default;
+  const { parseRecruiteeResponse } = await import(pathToFileURL(join(ROOT, 'providers/recruitee.mjs')).href);
+
+  if (recruitee.id === 'recruitee') pass('recruitee.id is "recruitee"');
+  else fail(`recruitee.id is ${JSON.stringify(recruitee.id)}`);
+
+  const hit = recruitee.detect({ name: 'Channable', careers_url: 'https://channable.recruitee.com' });
+  if (hit && hit.url === 'https://channable.recruitee.com/api/offers/') {
+    pass('recruitee.detect() resolves <slug>.recruitee.com → api offers');
+  } else {
+    fail(`recruitee.detect() returned ${JSON.stringify(hit)}`);
+  }
+
+  if (recruitee.detect({ name: 'X', careers_url: 'https://example.com/careers' }) === null) {
+    pass('recruitee.detect() returns null for non-recruitee URLs');
+  } else {
+    fail('recruitee.detect() should return null for non-recruitee URLs');
+  }
+
+  // parseRecruiteeResponse
+  const sample = {
+    offers: [
+      { title: 'Senior PM', careers_url: 'https://channable.recruitee.com/o/senior-pm', city: 'Utrecht', country: 'Netherlands', remote: false },
+      { title: 'Backend Eng', url: 'https://channable.recruitee.com/o/backend', city: 'Amsterdam', country: 'Netherlands', remote: true },
+      { title: 'AI Lead', location: 'Remote, EMEA' },
+    ],
+  };
+  const jobs = parseRecruiteeResponse(sample, 'Channable');
+  if (jobs.length === 3) pass('parseRecruiteeResponse extracts 3 offers');
+  else fail(`parseRecruiteeResponse returned ${jobs.length} offers`);
+
+  if (jobs[0]?.title === 'Senior PM' && jobs[0]?.company === 'Channable' && jobs[0]?.url === 'https://channable.recruitee.com/o/senior-pm') {
+    pass('parseRecruiteeResponse prefers careers_url field over url');
+  } else {
+    fail(`row 0 = ${JSON.stringify(jobs[0])}`);
+  }
+
+  if (jobs[1]?.location === 'Amsterdam, Netherlands, Remote') {
+    pass('parseRecruiteeResponse assembles city/country/remote when no location field');
+  } else {
+    fail(`row 1 location = ${JSON.stringify(jobs[1]?.location)}, expected "Amsterdam, Netherlands, Remote"`);
+  }
+
+  if (jobs[2]?.location === 'Remote, EMEA') {
+    pass('parseRecruiteeResponse uses explicit location field when present');
+  } else {
+    fail(`row 2 location = ${JSON.stringify(jobs[2]?.location)}`);
+  }
+
+  if (parseRecruiteeResponse({}, 'X').length === 0) pass('empty {} → empty result');
+  else fail('empty {} should yield empty result');
+
+  if (parseRecruiteeResponse({ offers: null }, 'X').length === 0) {
+    pass('null offers → empty result (no crash)');
+  } else {
+    fail('null offers should yield empty result');
+  }
+
+} catch (e) {
+  fail(`recruitee provider tests crashed: ${e.message}`);
+}
+
 // ── SUMMARY ─────────────────────────────────────────────────────
 
 console.log('\n' + '='.repeat(50));

From 6448abd8e65a5dddb5b9cd05c3aaa9b32be910f5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jes=C3=BAs=20Rojo=20Mart=C3=ADnez?=
 <jrojomartinez@gmail.com>
Date: Sat, 16 May 2026 18:27:38 +0200
Subject: [PATCH 2/7] feat(providers): add SmartRecruiters provider
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Auto-detects from careers_url pattern
`https://(careers|jobs).smartrecruiters.com/<slug>` and hits the
public /postings endpoint. tracked_companies entries can also set
`provider: smartrecruiters` to bypass detection (useful when the
public careers URL is a branded custom domain like `careers.adyen.com`).

Follows the SSRF defence pattern from providers/greenhouse.mjs:
hostname allowlist (api.smartrecruiters.com) + URL parse + HTTPS
check + redirect:'error'.

Notable parse decisions:
  - location: prefer location.fullLocation; else assemble from
    city/region/country (skipping empties); append "Remote" when
    location.remote is true.
  - url: rewrite j.ref's api.smartrecruiters.com prefix to
    jobs.smartrecruiters.com so the link points at the public job
    page, not the API. Falls back to a synthetic URL when ref is
    missing.

Exports parseSmartRecruitersResponse as a named export so
test-all.mjs §12 can unit-test the parser.

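Tests in test-all.mjs §12 (the test code itself was added in PATCH 1/7;
this commit only adds the provider module they exercise):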
  - detect() resolves both careers.* and jobs.* hostnames
  - detect() returns null for non-SR URLs
  - parser uses fullLocation when present
  - parser assembles city/country/remote when fullLocation absent
  - parser rewrites api.smartrecruiters.com → jobs.smartrecruiters.com
  - parser synthesises a URL when ref is missing
  - empty / malformed inputs yield empty results without crashing

Refs #651
---
 providers/smartrecruiters.mjs | 81 +++++++++++++++++++++++++++++++++++
 1 file changed, 81 insertions(+)
 create mode 100644 providers/smartrecruiters.mjs

diff --git a/providers/smartrecruiters.mjs b/providers/smartrecruiters.mjs
new file mode 100644
index 0000000000..2873182f09
--- /dev/null
+++ b/providers/smartrecruiters.mjs
@@ -0,0 +1,81 @@
+// @ts-check
+/** @typedef {import('./_types.js').Provider} Provider */
+
+// SmartRecruiters provider — hits the public postings API.
+// Auto-detects from careers_url pattern
+// `https://(careers|jobs).smartrecruiters.com/<slug>`. A tracked_companies
+// entry can also set `provider: smartrecruiters` explicitly to bypass
+// detection (useful when the public careers URL is a branded custom domain).
+
+const ALLOWED_SMARTRECRUITERS_HOSTS = new Set(['api.smartrecruiters.com']);
+
+function assertSmartRecruitersUrl(url) {
+  let parsed;
+  try {
+    parsed = new URL(url);
+  } catch {
+    throw new Error(`smartrecruiters: invalid URL: ${url}`);
+  }
+  if (parsed.protocol !== 'https:') throw new Error(`smartrecruiters: URL must use HTTPS: ${url}`);
+  if (!ALLOWED_SMARTRECRUITERS_HOSTS.has(parsed.hostname)) {
+    throw new Error(`smartrecruiters: untrusted hostname "${parsed.hostname}" — must be one of: ${[...ALLOWED_SMARTRECRUITERS_HOSTS].join(', ')}`);
+  }
+  return url;
+}
+
+function resolveApiUrl(entry) {
+  const url = entry.careers_url || '';
+  const match = url.match(/(?:careers|jobs)\.smartrecruiters\.com\/([^/?#]+)/);
+  if (!match) return null;
+  return `https://api.smartrecruiters.com/v1/companies/${match[1]}/postings?limit=100&offset=0&status=PUBLIC`;
+}
+
+/** @type {Provider} */
+export default {
+  id: 'smartrecruiters',
+
+  detect(entry) {
+    const apiUrl = resolveApiUrl(entry);
+    return apiUrl ? { url: apiUrl } : null;
+  },
+
+  async fetch(entry, ctx) {
+    const apiUrl = resolveApiUrl(entry);
+    if (!apiUrl) throw new Error(`smartrecruiters: cannot derive API URL for ${entry.name}`);
+    assertSmartRecruitersUrl(apiUrl);
+    const json = await ctx.fetchJson(apiUrl, { redirect: 'error' });
+    return parseSmartRecruitersResponse(json, entry.name);
+  },
+};
+
+/**
+ * Parse a SmartRecruiters /postings response. Exported for unit tests.
+ *
+ * SmartRecruiters returns:
+ *   { content: [{ id, name, ref, location: { fullLocation?, city?, region?, country?, remote? } }] }
+ *
+ * - location: prefer `fullLocation`; else assemble from city/region/country
+ *   parts (skipping empties); append "Remote" when `location.remote` is true.
+ * - url: `j.ref` is an `api.smartrecruiters.com/v1/companies/<slug>/postings/<id>`
+ *   URL — rewrite to the public `jobs.smartrecruiters.com/<slug>/postings/<id>`.
+ *   If `ref` is missing, synthesise a URL from the company slug + posting id.
+ *
+ * @param {any} json
+ * @param {string} companyName
+ * @returns {Array<{title: string, url: string, company: string, location: string}>}
+ */
+export function parseSmartRecruitersResponse(json, companyName) {
+  const items = json?.content;
+  if (!Array.isArray(items)) return [];
+  return items.map(j => {
+    const loc = j.location || {};
+    const fullLocation = loc.fullLocation || [loc.city, loc.region, loc.country].filter(Boolean).join(', ');
+    const remote = loc.remote ? 'Remote' : '';
+    const location = [fullLocation, remote].filter(Boolean).join(', ');
+    const slugified = (j.name || '').toLowerCase().replace(/[^a-z0-9]+/g, '-').replace(/^-|-$/g, '');
+    const url = j.ref
+      ? j.ref.replace('api.smartrecruiters.com/v1/companies/', 'jobs.smartrecruiters.com/')
+      : `https://jobs.smartrecruiters.com/${(companyName || '').toLowerCase()}/${j.id}-${slugified}`;
+    return { title: j.name || '', url, location, company: companyName };
+  });
+}

From 148551c53c6c53b0029701295479917ce0b1b1d4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jes=C3=BAs=20Rojo=20Mart=C3=ADnez?=
 <jrojomartinez@gmail.com>
Date: Sat, 16 May 2026 18:28:10 +0200
Subject: [PATCH 3/7] feat(providers): add Recruitee provider
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Auto-detects from careers_url pattern `https://<slug>.recruitee.com`
and hits the public /api/offers/ endpoint. tracked_companies entries
can also set `provider: recruitee` to bypass detection.

SSRF defence: per-tenant subdomains are the variable part, so a
static hostname allowlist isn't workable. Uses a regex match on
`<safe-slug>.recruitee.com` (`^[a-z0-9][a-z0-9-]*\.recruitee\.com$`)
+ HTTPS check + redirect:'error'. The regex constrains the slug to
safe characters, preventing attacker-controlled hostnames from
slipping through.

Notable parse decisions:
  - url: prefer `careers_url` (the public job page), fall back to
    `url` (some installs use it instead), empty string otherwise.
  - location: prefer the explicit `location` field; else assemble
    from city/country with "Remote" appended when remote is true.

Exports parseRecruiteeResponse as a named export for tests.

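Tests in test-all.mjs §13 (the test code itself was added in PATCH 1/7;
this commit only adds the provider module they exercise):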
  - detect() resolves <slug>.recruitee.com → /api/offers/
  - detect() returns null for non-recruitee URLs
  - parser prefers careers_url over url
  - parser assembles location from city/country/remote
  - parser uses explicit location field when present
  - empty / null inputs yield empty results without crashing

Refs #651
---
 providers/recruitee.mjs | 80 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 80 insertions(+)
 create mode 100644 providers/recruitee.mjs

diff --git a/providers/recruitee.mjs b/providers/recruitee.mjs
new file mode 100644
index 0000000000..1772199585
--- /dev/null
+++ b/providers/recruitee.mjs
@@ -0,0 +1,80 @@
+// @ts-check
+/** @typedef {import('./_types.js').Provider} Provider */
+
+// Recruitee provider — hits the public per-tenant offers API.
+// Auto-detects from careers_url pattern `https://<slug>.recruitee.com`.
+// Per-tenant subdomains are the variable part — SSRF defence uses a
+// regex match on `<safe-slug>.recruitee.com` rather than a static
+// allowlist.
+
+const RECRUITEE_HOST_RE = /^[a-z0-9][a-z0-9-]*\.recruitee\.com$/;
+
+function assertRecruiteeUrl(url) {
+  let parsed;
+  try {
+    parsed = new URL(url);
+  } catch {
+    throw new Error(`recruitee: invalid URL: ${url}`);
+  }
+  if (parsed.protocol !== 'https:') throw new Error(`recruitee: URL must use HTTPS: ${url}`);
+  if (!RECRUITEE_HOST_RE.test(parsed.hostname)) {
+    throw new Error(`recruitee: untrusted hostname "${parsed.hostname}" — must match <slug>.recruitee.com`);
+  }
+  return url;
+}
+
+function resolveApiUrl(entry) {
+  const url = entry.careers_url || '';
+  const match = url.match(/([a-z0-9][a-z0-9-]*)\.recruitee\.com/);
+  if (!match) return null;
+  return `https://${match[1]}.recruitee.com/api/offers/`;
+}
+
+/** @type {Provider} */
+export default {
+  id: 'recruitee',
+
+  detect(entry) {
+    const apiUrl = resolveApiUrl(entry);
+    return apiUrl ? { url: apiUrl } : null;
+  },
+
+  async fetch(entry, ctx) {
+    const apiUrl = resolveApiUrl(entry);
+    if (!apiUrl) throw new Error(`recruitee: cannot derive API URL for ${entry.name}`);
+    assertRecruiteeUrl(apiUrl);
+    const json = await ctx.fetchJson(apiUrl, { redirect: 'error' });
+    return parseRecruiteeResponse(json, entry.name);
+  },
+};
+
+/**
+ * Parse a Recruitee /api/offers/ response. Exported for unit tests.
+ *
+ * Recruitee returns:
+ *   { offers: [{ title, careers_url?, url?, city?, country?, remote?, location? }] }
+ *
+ * - url: prefer `careers_url`, fall back to `url`, empty string otherwise.
+ * - location: prefer the explicit `location` field; else assemble from
+ *   city/country, appending "Remote" when `remote` is true.
+ *
+ * @param {any} json
+ * @param {string} companyName
+ * @returns {Array<{title: string, url: string, company: string, location: string}>}
+ */
+export function parseRecruiteeResponse(json, companyName) {
+  const offers = json?.offers;
+  if (!Array.isArray(offers)) return [];
+  return offers.map(j => {
+    const city = j.city || '';
+    const country = j.country || '';
+    const remote = j.remote ? 'Remote' : '';
+    const location = j.location || [city, country, remote].filter(Boolean).join(', ');
+    return {
+      title: j.title || '',
+      url: j.careers_url || j.url || '',
+      location,
+      company: companyName,
+    };
+  });
+}

From a67e79480340f5561df5b7a1e2f79e489a0c0079 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jes=C3=BAs=20Rojo=20Mart=C3=ADnez?=
 <jrojomartinez@gmail.com>
Date: Sat, 16 May 2026 18:28:39 +0200
Subject: [PATCH 4/7] docs(portals): document Workable, SmartRecruiters,
 Recruitee URL patterns

---
 templates/portals.example.yml | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/templates/portals.example.yml b/templates/portals.example.yml
index 210042a6f8..46d29f8591 100644
--- a/templates/portals.example.yml
+++ b/templates/portals.example.yml
@@ -356,6 +356,21 @@ search_queries:
 #                          provider's `id`.
 #     transport: http    — reserved for future transports. Defaults to http.
 
+# ── Provider auto-detection ───────────────────────────────────────
+# scan.mjs auto-loads everything in providers/*.mjs and tries each
+# provider's detect() in order. URL patterns recognized:
+#
+#   greenhouse      job-boards(.eu)?.greenhouse.io/<slug>  (or api: field)
+#   ashby           jobs.ashbyhq.com/<slug>
+#   lever           jobs.lever.co/<slug>
+#   workable        apply.workable.com/<slug>
+#   smartrecruiters (careers|jobs).smartrecruiters.com/<slug>
+#   recruitee       <slug>.recruitee.com
+#
+# When the public careers URL is a branded custom domain (e.g.
+# careers.adyen.com), set `provider: smartrecruiters` explicitly to
+# bypass detect(). The `provider:` field wins over auto-detection.
+
 tracked_companies:
 
   # -- AI Labs & LLM providers --

From fcab2cc3ef24e69b685e562f1e9830616487ec30 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jes=C3=BAs=20Rojo=20Mart=C3=ADnez?=
 <jrojomartinez@gmail.com>
Date: Sat, 16 May 2026 18:50:13 +0200
Subject: [PATCH 5/7] fix(providers): defensive input normalization + edge
 cases
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Pre-emptive hardening following the same defensive pattern CodeRabbit
flagged on PR #652. All changes are within the providers shipped in
this PR; no scan.mjs / framework changes.

- All three providers: `careers_url` is now type-checked before .match()
  so a non-string YAML value (number, object, array) returns null from
  detect() rather than throwing.

- smartrecruiters: ref-rewrite uses an anchored regex
  (`/^https:\/\/api\.smartrecruiters\.com\/v1\/companies\//`) so the
  replacement only fires at the URL prefix. The fallback URL path (when
  both j.ref AND j.id are missing) now returns an empty string instead
  of synthesising a URL containing the literal "undefined" — the empty
  string is the contract-allowed default for url per _types.js > Job.
  Magic 100 in the postings limit is now a named SR_PAGE_SIZE constant.

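- workable: parseWorkableMarkdown now extracts URLs via a line-level
  regex `/\[View\]\(([^)]+)\)/` rather than a column-position match,
  so a title containing a stray `|` doesn't shift cols[7] and silently
  drop the URL. Title and location are still read by column position
  (cols[1] and cols[3]), so such a row keeps its URL but will carry a
  truncated title and a shifted location. Rows that still don't resolve
  a URL are skipped (no empty-URL entries leak into the dedup tracker).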

- test-all.mjs: 6 new assertions covering the defensive paths
  (non-string careers_url across all 3 providers, the SR no-ref/no-id
  fallback, the Workable stray-pipe survival, and a real Workable
  fetch() rejection test against an unresolvable careers_url).

Refs #651
---
 providers/recruitee.mjs       |  2 +-
 providers/smartrecruiters.mjs |  9 +++--
 providers/workable.mjs        |  5 ++-
 test-all.mjs                  | 72 +++++++++++++++++++++++++++++++++--
 4 files changed, 78 insertions(+), 10 deletions(-)

diff --git a/providers/recruitee.mjs b/providers/recruitee.mjs
index 1772199585..8c94cf5f5b 100644
--- a/providers/recruitee.mjs
+++ b/providers/recruitee.mjs
@@ -24,7 +24,7 @@ function assertRecruiteeUrl(url) {
 }
 
 function resolveApiUrl(entry) {
-  const url = entry.careers_url || '';
+  const url = typeof entry.careers_url === 'string' ? entry.careers_url : '';
   const match = url.match(/([a-z0-9][a-z0-9-]*)\.recruitee\.com/);
   if (!match) return null;
   return `https://${match[1]}.recruitee.com/api/offers/`;
diff --git a/providers/smartrecruiters.mjs b/providers/smartrecruiters.mjs
index 2873182f09..092ec9b1d2 100644
--- a/providers/smartrecruiters.mjs
+++ b/providers/smartrecruiters.mjs
@@ -8,6 +8,7 @@
 // detection (useful when the public careers URL is a branded custom domain).
 
 const ALLOWED_SMARTRECRUITERS_HOSTS = new Set(['api.smartrecruiters.com']);
+const SR_PAGE_SIZE = 100;
 
 function assertSmartRecruitersUrl(url) {
   let parsed;
@@ -24,10 +25,10 @@ function assertSmartRecruitersUrl(url) {
 }
 
 function resolveApiUrl(entry) {
-  const url = entry.careers_url || '';
+  const url = typeof entry.careers_url === 'string' ? entry.careers_url : '';
   const match = url.match(/(?:careers|jobs)\.smartrecruiters\.com\/([^/?#]+)/);
   if (!match) return null;
-  return `https://api.smartrecruiters.com/v1/companies/${match[1]}/postings?limit=100&offset=0&status=PUBLIC`;
+  return `https://api.smartrecruiters.com/v1/companies/${match[1]}/postings?limit=${SR_PAGE_SIZE}&offset=0&status=PUBLIC`;
 }
 
 /** @type {Provider} */
@@ -74,8 +75,8 @@ export function parseSmartRecruitersResponse(json, companyName) {
     const location = [fullLocation, remote].filter(Boolean).join(', ');
     const slugified = (j.name || '').toLowerCase().replace(/[^a-z0-9]+/g, '-').replace(/^-|-$/g, '');
     const url = j.ref
-      ? j.ref.replace('api.smartrecruiters.com/v1/companies/', 'jobs.smartrecruiters.com/')
-      : `https://jobs.smartrecruiters.com/${(companyName || '').toLowerCase()}/${j.id}-${slugified}`;
+      ? j.ref.replace(/^https:\/\/api\.smartrecruiters\.com\/v1\/companies\//, 'https://jobs.smartrecruiters.com/')
+      : j.id ? `https://jobs.smartrecruiters.com/${(companyName || '').toLowerCase()}/${j.id}-${slugified}` : '';
     return { title: j.name || '', url, location, company: companyName };
   });
 }
diff --git a/providers/workable.mjs b/providers/workable.mjs
index c1c5606c9d..5540cef62b 100644
--- a/providers/workable.mjs
+++ b/providers/workable.mjs
@@ -24,7 +24,7 @@ function assertWorkableUrl(url) {
 }
 
 function resolveFeedUrl(entry) {
-  const url = entry.careers_url || '';
+  const url = typeof entry.careers_url === 'string' ? entry.careers_url : '';
   const match = url.match(/apply\.workable\.com\/([^/?#]+)/);
   if (!match) return null;
   return `https://apply.workable.com/${match[1]}/jobs.md`;
@@ -72,9 +72,10 @@ export function parseWorkableMarkdown(text, companyName) {
     const title = cols[1];
     if (!title || title === 'Title') continue;
     const location = cols[3] || '';
-    const urlMatch = cols[7].match(/\(([^)]+)\)/);
+    const urlMatch = line.match(/\[View\]\(([^)]+)\)/);
     let url = urlMatch ? urlMatch[1] : '';
     if (url.endsWith('.md')) url = url.slice(0, -3);
+    if (!url) continue;  // skip rows with no resolvable URL (e.g., malformed [View] link)
     jobs.push({ title, url, location, company: companyName });
   }
   return jobs;
diff --git a/test-all.mjs b/test-all.mjs
index 7de46f1621..d4a9687890 100644
--- a/test-all.mjs
+++ b/test-all.mjs
@@ -370,9 +370,9 @@ try {
   if (parseWorkableMarkdown(null, 'X').length === 0) pass('null input → empty result (no crash)');
   else fail('null input should yield empty result without crashing');
 
-  // SSRF defence: untrusted hostname rejected before fetch
+  // fetch() reaches the http context on the happy path (allowed hostname).
   await workable.fetch(
-    { name: 'Bad', careers_url: 'https://apply.workable.com/evil' },
+    { name: 'Smoke', careers_url: 'https://apply.workable.com/optimile' },
     {
       transport: 'http',
       fetchText: async (url) => {
@@ -384,7 +384,48 @@ try {
       fetchJson: async () => { throw new Error('fetchJson should not be called'); },
     },
   );
-  pass('workable.fetch() reaches fetchText with allowed host');
+  pass('workable.fetch() reaches fetchText on the happy path (allowed hostname)');
+
+  // fetch() rejects an unresolvable careers_url (no apply.workable.com match in URL).
+  let rejected = false;
+  try {
+    await workable.fetch(
+      { name: 'BadUrl', careers_url: 'https://evil.com/totally-not-workable' },
+      {
+        transport: 'http',
+        fetchText: async () => { throw new Error('SSRF! should not reach here'); },
+        fetchJson: async () => { throw new Error('SSRF! should not reach here'); },
+      },
+    );
+  } catch (e) {
+    if (e.message.includes('cannot derive feed URL')) {
+      rejected = true;
+    } else {
+      fail(`workable.fetch() rejected with wrong error: ${e.message}`);
+    }
+  }
+  if (rejected) pass('workable.fetch() rejects unresolvable careers_url before fetch');
+  else fail('workable.fetch() should throw cannot-derive-feed-URL for non-Workable URLs');
+
+  // careers_url with non-string value (e.g. YAML mistake passing a number) → detect() returns null without crashing
+  if (workable.detect({ name: 'X', careers_url: 42 }) === null) {
+    pass('workable.detect() returns null for non-string careers_url (42)');
+  } else {
+    fail('workable.detect() should treat non-string careers_url as missing');
+  }
+
+  // Workable parser tolerates a title with a stray pipe — URL is extracted from the line, not cols[7]
+  const strayPipeMd = [
+    '| Title | Department | Location | Type | Salary | Posted | Details |',
+    '|---|---|---|---|---|---|---|',
+    '| Senior PM (full | part-time) | Product | Remote | Full-time | — | 2026-04-01 | [View](https://apply.workable.com/x/jobs/view/PIPE.md) |',
+  ].join('\n');
+  const strayJobs = parseWorkableMarkdown(strayPipeMd, 'X');
+  if (strayJobs.length === 1 && strayJobs[0].url === 'https://apply.workable.com/x/jobs/view/PIPE') {
+    pass('parseWorkableMarkdown extracts URL from line-level regex (survives stray pipes in title)');
+  } else {
+    fail(`stray-pipe row not handled correctly: ${JSON.stringify(strayJobs)}`);
+  }
 
 } catch (e) {
   fail(`workable provider tests crashed: ${e.message}`);
@@ -481,6 +522,24 @@ try {
     fail('non-array content should yield empty result');
   }
 
+  // careers_url with non-string value → detect() returns null without crashing
+  if (sr.detect({ name: 'X', careers_url: { foo: 'bar' } }) === null) {
+    pass('smartrecruiters.detect() returns null for non-string careers_url (object)');
+  } else {
+    fail('smartrecruiters.detect() should treat non-string careers_url as missing');
+  }
+
+  // Fallback URL when both ref AND id are missing → empty string (not "undefined" in URL)
+  const noRefNoId = parseSmartRecruitersResponse(
+    { content: [{ name: 'Stranded Role' }] },
+    'X',
+  );
+  if (noRefNoId.length === 1 && noRefNoId[0].url === '') {
+    pass('parseSmartRecruitersResponse returns url="" when both ref and id are missing');
+  } else {
+    fail(`expected url='' when ref+id both missing, got ${JSON.stringify(noRefNoId[0])}`);
+  }
+
 } catch (e) {
   fail(`smartrecruiters provider tests crashed: ${e.message}`);
 }
@@ -548,6 +607,13 @@ try {
     fail('null offers should yield empty result');
   }
 
+  // careers_url with non-string value → detect() returns null without crashing
+  if (recruitee.detect({ name: 'X', careers_url: null }) === null && recruitee.detect({ name: 'X', careers_url: 7 }) === null) {
+    pass('recruitee.detect() returns null for non-string careers_url (null and 7)');
+  } else {
+    fail('recruitee.detect() should treat non-string careers_url as missing');
+  }
+
 } catch (e) {
   fail(`recruitee provider tests crashed: ${e.message}`);
 }

From 09b6f2b0165eed2d8456b31cf9d40f11e08c00b9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jes=C3=BAs=20Rojo=20Mart=C3=ADnez?=
 <jrojomartinez@gmail.com>
Date: Sat, 16 May 2026 18:58:40 +0200
Subject: [PATCH 6/7] fix(providers): strict URL parsing + ref validation
 (review)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Addresses 5 CodeRabbit comments on PR #653 asking for tighter
validation than substring regex on raw URL strings.

- All 3 providers: detect()/resolveXxxUrl() now use new URL() to
  parse careers_url, verify protocol === 'https:', check hostname
  exactly (Workable: apply.workable.com; SmartRecruiters:
  careers./jobs.smartrecruiters.com; Recruitee: regex-validated
  <slug>.recruitee.com), then derive the slug from the parsed
  pathname/hostname. This rejects path-spoofed inputs like
  https://evil.example/apply.workable.com/slug (substring regex
  would have falsely matched).

- smartrecruiters parseSmartRecruitersResponse: j.ref is now
  validated (parses as URL, hostname must be api.smartrecruiters.com,
  pathname must start with /v1/companies/) before the prefix rewrite.
  Invalid refs fall through to the fallback URL path. The fallback
  companyName is now slugified (non-alphanumerics → -, strip
  leading/trailing -) so "My Acme & Co." → "my-acme-co" rather than
  producing a URL with raw spaces/symbols.

- test-all.mjs: 5 new assertions covering the path-spoof rejection
  for all 3 providers, the untrusted-ref-host fall-through, and the
  companyName slugification.

Refs #651
---
 providers/recruitee.mjs       | 15 ++++++++----
 providers/smartrecruiters.mjs | 39 +++++++++++++++++++++++++------
 providers/workable.mjs        | 17 ++++++++++----
 test-all.mjs                  | 44 +++++++++++++++++++++++++++++++++++
 4 files changed, 100 insertions(+), 15 deletions(-)

diff --git a/providers/recruitee.mjs b/providers/recruitee.mjs
index 8c94cf5f5b..21af5bbd6a 100644
--- a/providers/recruitee.mjs
+++ b/providers/recruitee.mjs
@@ -24,10 +24,17 @@ function assertRecruiteeUrl(url) {
 }
 
 function resolveApiUrl(entry) {
-  const url = typeof entry.careers_url === 'string' ? entry.careers_url : '';
-  const match = url.match(/([a-z0-9][a-z0-9-]*)\.recruitee\.com/);
-  if (!match) return null;
-  return `https://${match[1]}.recruitee.com/api/offers/`;
+  const raw = typeof entry.careers_url === 'string' ? entry.careers_url : '';
+  if (!raw) return null;
+  let parsed;
+  try {
+    parsed = new URL(raw);
+  } catch {
+    return null;
+  }
+  if (parsed.protocol !== 'https:') return null;
+  if (!RECRUITEE_HOST_RE.test(parsed.hostname)) return null;
+  return `https://${parsed.hostname}/api/offers/`;
 }
 
 /** @type {Provider} */
diff --git a/providers/smartrecruiters.mjs b/providers/smartrecruiters.mjs
index 092ec9b1d2..519f65b37d 100644
--- a/providers/smartrecruiters.mjs
+++ b/providers/smartrecruiters.mjs
@@ -8,6 +8,7 @@
 // detection (useful when the public careers URL is a branded custom domain).
 
 const ALLOWED_SMARTRECRUITERS_HOSTS = new Set(['api.smartrecruiters.com']);
+const SR_CAREERS_HOSTS = new Set(['careers.smartrecruiters.com', 'jobs.smartrecruiters.com']);
 const SR_PAGE_SIZE = 100;
 
 function assertSmartRecruitersUrl(url) {
@@ -25,10 +26,19 @@ function assertSmartRecruitersUrl(url) {
 }
 
 function resolveApiUrl(entry) {
-  const url = typeof entry.careers_url === 'string' ? entry.careers_url : '';
-  const match = url.match(/(?:careers|jobs)\.smartrecruiters\.com\/([^/?#]+)/);
-  if (!match) return null;
-  return `https://api.smartrecruiters.com/v1/companies/${match[1]}/postings?limit=${SR_PAGE_SIZE}&offset=0&status=PUBLIC`;
+  const raw = typeof entry.careers_url === 'string' ? entry.careers_url : '';
+  if (!raw) return null;
+  let parsed;
+  try {
+    parsed = new URL(raw);
+  } catch {
+    return null;
+  }
+  if (parsed.protocol !== 'https:') return null;
+  if (!SR_CAREERS_HOSTS.has(parsed.hostname)) return null;
+  const slug = parsed.pathname.split('/').filter(Boolean)[0];
+  if (!slug) return null;
+  return `https://api.smartrecruiters.com/v1/companies/${slug}/postings?limit=${SR_PAGE_SIZE}&offset=0&status=PUBLIC`;
 }
 
 /** @type {Provider} */
@@ -74,9 +84,24 @@ export function parseSmartRecruitersResponse(json, companyName) {
     const remote = loc.remote ? 'Remote' : '';
     const location = [fullLocation, remote].filter(Boolean).join(', ');
     const slugified = (j.name || '').toLowerCase().replace(/[^a-z0-9]+/g, '-').replace(/^-|-$/g, '');
-    const url = j.ref
-      ? j.ref.replace(/^https:\/\/api\.smartrecruiters\.com\/v1\/companies\//, 'https://jobs.smartrecruiters.com/')
-      : j.id ? `https://jobs.smartrecruiters.com/${(companyName || '').toLowerCase()}/${j.id}-${slugified}` : '';
+    let url = '';
+    if (typeof j.ref === 'string') {
+      let parsedRef;
+      try { parsedRef = new URL(j.ref); } catch { parsedRef = null; }
+      if (parsedRef
+          && parsedRef.protocol === 'https:'
+          && parsedRef.hostname === 'api.smartrecruiters.com'
+          && parsedRef.pathname.startsWith('/v1/companies/')) {
+        const restOfPath = parsedRef.pathname.slice('/v1/companies/'.length);
+        url = `https://jobs.smartrecruiters.com/${restOfPath}`;
+      }
+    }
+    if (!url && j.id) {
+      const companySlug = (companyName || '').toLowerCase().replace(/[^a-z0-9]+/g, '-').replace(/^-|-$/g, '');
+      if (companySlug) {
+        url = `https://jobs.smartrecruiters.com/${companySlug}/${j.id}-${slugified}`;
+      }
+    }
     return { title: j.name || '', url, location, company: companyName };
   });
 }
diff --git a/providers/workable.mjs b/providers/workable.mjs
index 5540cef62b..91cbaee752 100644
--- a/providers/workable.mjs
+++ b/providers/workable.mjs
@@ -24,10 +24,19 @@ function assertWorkableUrl(url) {
 }
 
 function resolveFeedUrl(entry) {
-  const url = typeof entry.careers_url === 'string' ? entry.careers_url : '';
-  const match = url.match(/apply\.workable\.com\/([^/?#]+)/);
-  if (!match) return null;
-  return `https://apply.workable.com/${match[1]}/jobs.md`;
+  const raw = typeof entry.careers_url === 'string' ? entry.careers_url : '';
+  if (!raw) return null;
+  let parsed;
+  try {
+    parsed = new URL(raw);
+  } catch {
+    return null;
+  }
+  if (parsed.protocol !== 'https:') return null;
+  if (parsed.hostname !== 'apply.workable.com') return null;
+  const slug = parsed.pathname.split('/').filter(Boolean)[0];
+  if (!slug) return null;
+  return `https://apply.workable.com/${slug}/jobs.md`;
 }
 
 /** @type {Provider} */
diff --git a/test-all.mjs b/test-all.mjs
index d4a9687890..1ce3fb58d9 100644
--- a/test-all.mjs
+++ b/test-all.mjs
@@ -407,6 +407,14 @@ try {
   if (rejected) pass('workable.fetch() rejects unresolvable careers_url before fetch');
   else fail('workable.fetch() should throw cannot-derive-feed-URL for non-Workable URLs');
 
+  // SSRF: malicious URL with apply.workable.com in the PATH (not hostname) must not be detected as Workable.
+  // With strict URL parsing, the hostname `evil.example` fails the check and detect() returns null.
+  if (workable.detect({ name: 'Spoof', careers_url: 'https://evil.example/apply.workable.com/slug' }) === null) {
+    pass('workable.detect() rejects path-spoofed URLs (apply.workable.com in path, not hostname)');
+  } else {
+    fail('workable.detect() must NOT misdetect URLs that contain apply.workable.com in the path');
+  }
+
   // careers_url with non-string value (e.g. YAML mistake passing a number) → detect() returns null without crashing
   if (workable.detect({ name: 'X', careers_url: 42 }) === null) {
     pass('workable.detect() returns null for non-string careers_url (42)');
@@ -540,6 +548,35 @@ try {
     fail(`expected url='' when ref+id both missing, got ${JSON.stringify(noRefNoId[0])}`);
   }
 
+  // SSRF: malicious URL with smartrecruiters hostname in the PATH (not host) must not be detected.
+  if (sr.detect({ name: 'Spoof', careers_url: 'https://evil.example/careers.smartrecruiters.com/slug' }) === null) {
+    pass('smartrecruiters.detect() rejects path-spoofed URLs');
+  } else {
+    fail('smartrecruiters.detect() must NOT misdetect path-spoofed URLs');
+  }
+
+  // SmartRecruiters: untrusted j.ref host falls through to fallback rather than rewriting
+  const bogusRef = parseSmartRecruitersResponse(
+    { content: [{ id: 'X1', name: 'Strange Role', ref: 'https://evil.example/v1/companies/x/postings/X1' }] },
+    'TestCo',
+  );
+  if (bogusRef[0]?.url && !bogusRef[0].url.includes('evil.example')) {
+    pass('parseSmartRecruitersResponse rejects untrusted j.ref host (falls through to fallback)');
+  } else {
+    fail(`untrusted j.ref leaked into url: ${JSON.stringify(bogusRef[0]?.url)}`);
+  }
+
+  // SmartRecruiters: companyName with spaces/symbols is slugified for the fallback URL
+  const slugifiedCompany = parseSmartRecruitersResponse(
+    { content: [{ id: 'X2', name: 'Strange Role' }] },
+    'My Acme & Co.',
+  );
+  if (slugifiedCompany[0]?.url === 'https://jobs.smartrecruiters.com/my-acme-co/X2-strange-role') {
+    pass('parseSmartRecruitersResponse slugifies the companyName for the fallback URL');
+  } else {
+    fail(`fallback URL not properly slugified: ${JSON.stringify(slugifiedCompany[0]?.url)}`);
+  }
+
 } catch (e) {
   fail(`smartrecruiters provider tests crashed: ${e.message}`);
 }
@@ -614,6 +651,13 @@ try {
     fail('recruitee.detect() should treat non-string careers_url as missing');
   }
 
+  // SSRF: malicious URL with recruitee.com in the PATH (not host) must not be detected.
+  if (recruitee.detect({ name: 'Spoof', careers_url: 'https://evil.example/channable.recruitee.com/foo' }) === null) {
+    pass('recruitee.detect() rejects path-spoofed URLs');
+  } else {
+    fail('recruitee.detect() must NOT misdetect path-spoofed URLs');
+  }
+
 } catch (e) {
   fail(`recruitee provider tests crashed: ${e.message}`);
 }

From 434375bb380a7ac4a622d389a4478d7a5ba20b99 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jes=C3=BAs=20Rojo=20Mart=C3=ADnez?=
 <jrojomartinez@gmail.com>
Date: Sat, 16 May 2026 19:10:25 +0200
Subject: [PATCH 7/7] fix(providers): validate parsed URLs + paginate
 SmartRecruiters (review)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Addresses 3 CodeRabbit comments on PR #653 (round 2).

- recruitee: parseRecruiteeResponse now validates the offer URL via
  new URL() + protocol === 'https:' + RECRUITEE_HOST_RE hostname
  check. Off-domain or non-HTTPS values are dropped (url = '' per
  the Job contract) rather than passed through verbatim.

- workable: parseWorkableMarkdown now validates each [View] link
  the same way (hostname must be apply.workable.com, protocol
  must be https). Rows that fail validation are skipped (continue),
  matching the existing "skip rows with no resolvable URL" semantics.

- smartrecruiters: fetch() now paginates the /postings endpoint
  instead of returning only the first 100 results. Added
  resolveSlug() and buildPostingsUrl(slug, offset) helpers,
  refactored resolveApiUrl() to delegate to them, and the fetch
  loop walks offsets 0, SR_PAGE_SIZE, 2*SR_PAGE_SIZE, ... until it
  receives either an empty page or a short page (fewer than
  SR_PAGE_SIZE items). A safety cap of SR_MAX_PAGES = 50 (= 5000
  postings) prevents runaway loops against a broken API.

- test-all.mjs: 4 new assertions
  - Workable: off-domain + non-https [View] links are dropped
  - Recruitee: off-domain + non-https + missing offer URLs → url=''
  - SmartRecruiters: 2-page aggregation (150 items across 2 pages)
  - SmartRecruiters: stop on the first empty page (1 request)

Refs #651
---
 providers/recruitee.mjs       | 21 ++++++++-
 providers/smartrecruiters.mjs | 34 +++++++++++----
 providers/workable.mjs        | 12 ++++++
 test-all.mjs                  | 81 +++++++++++++++++++++++++++++++++++
 4 files changed, 138 insertions(+), 10 deletions(-)

diff --git a/providers/recruitee.mjs b/providers/recruitee.mjs
index 21af5bbd6a..b7886ca336 100644
--- a/providers/recruitee.mjs
+++ b/providers/recruitee.mjs
@@ -61,7 +61,9 @@ export default {
  * Recruitee returns:
  *   { offers: [{ title, careers_url?, url?, city?, country?, remote?, location? }] }
  *
- * - url: prefer `careers_url`, fall back to `url`, empty string otherwise.
+ * - url: prefer `careers_url`, fall back to `url`; validated against
+ *   `https://<safe-slug>.recruitee.com` — an off-domain or non-HTTPS URL is
+ *   dropped (empty string returned per the Job contract).
  * - location: prefer the explicit `location` field; else assemble from
  *   city/country, appending "Remote" when `remote` is true.
  *
@@ -77,9 +79,24 @@ export function parseRecruiteeResponse(json, companyName) {
     const country = j.country || '';
     const remote = j.remote ? 'Remote' : '';
     const location = j.location || [city, country, remote].filter(Boolean).join(', ');
+
+    // Validate offer URL: must parse as https://<safe-slug>.recruitee.com/...
+    let url = '';
+    const rawUrl = j.careers_url || j.url || '';
+    if (typeof rawUrl === 'string' && rawUrl) {
+      try {
+        const parsed = new URL(rawUrl);
+        if (parsed.protocol === 'https:' && RECRUITEE_HOST_RE.test(parsed.hostname)) {
+          url = parsed.href;
+        }
+      } catch {
+        // malformed URL → leave url = ''
+      }
+    }
+
     return {
       title: j.title || '',
-      url: j.careers_url || j.url || '',
+      url,
       location,
       company: companyName,
     };
diff --git a/providers/smartrecruiters.mjs b/providers/smartrecruiters.mjs
index 519f65b37d..debe7f3a0d 100644
--- a/providers/smartrecruiters.mjs
+++ b/providers/smartrecruiters.mjs
@@ -10,6 +10,7 @@
 const ALLOWED_SMARTRECRUITERS_HOSTS = new Set(['api.smartrecruiters.com']);
 const SR_CAREERS_HOSTS = new Set(['careers.smartrecruiters.com', 'jobs.smartrecruiters.com']);
 const SR_PAGE_SIZE = 100;
+const SR_MAX_PAGES = 50;  // safety cap (5000 postings @ 100/page)
 
 function assertSmartRecruitersUrl(url) {
   let parsed;
@@ -25,7 +26,7 @@ function assertSmartRecruitersUrl(url) {
   return url;
 }
 
-function resolveApiUrl(entry) {
+function resolveSlug(entry) {
   const raw = typeof entry.careers_url === 'string' ? entry.careers_url : '';
   if (!raw) return null;
   let parsed;
@@ -37,8 +38,16 @@ function resolveApiUrl(entry) {
   if (parsed.protocol !== 'https:') return null;
   if (!SR_CAREERS_HOSTS.has(parsed.hostname)) return null;
   const slug = parsed.pathname.split('/').filter(Boolean)[0];
-  if (!slug) return null;
-  return `https://api.smartrecruiters.com/v1/companies/${slug}/postings?limit=${SR_PAGE_SIZE}&offset=0&status=PUBLIC`;
+  return slug || null;
+}
+
+function buildPostingsUrl(slug, offset = 0) {
+  return `https://api.smartrecruiters.com/v1/companies/${slug}/postings?limit=${SR_PAGE_SIZE}&offset=${offset}&status=PUBLIC`;
+}
+
+function resolveApiUrl(entry) {
+  const slug = resolveSlug(entry);
+  return slug ? buildPostingsUrl(slug, 0) : null;
 }
 
 /** @type {Provider} */
@@ -51,11 +60,20 @@ export default {
   },
 
   async fetch(entry, ctx) {
-    const apiUrl = resolveApiUrl(entry);
-    if (!apiUrl) throw new Error(`smartrecruiters: cannot derive API URL for ${entry.name}`);
-    assertSmartRecruitersUrl(apiUrl);
-    const json = await ctx.fetchJson(apiUrl, { redirect: 'error' });
-    return parseSmartRecruitersResponse(json, entry.name);
+    const slug = resolveSlug(entry);
+    if (!slug) throw new Error(`smartrecruiters: cannot derive API URL for ${entry.name}`);
+
+    const all = [];
+    for (let page = 0; page < SR_MAX_PAGES; page++) {
+      const apiUrl = buildPostingsUrl(slug, page * SR_PAGE_SIZE);
+      assertSmartRecruitersUrl(apiUrl);
+      const json = await ctx.fetchJson(apiUrl, { redirect: 'error' });
+      const parsed = parseSmartRecruitersResponse(json, entry.name);
+      if (parsed.length === 0) break;
+      all.push(...parsed);
+      if (parsed.length < SR_PAGE_SIZE) break;  // last page (short)
+    }
+    return all;
   },
 };
 
diff --git a/providers/workable.mjs b/providers/workable.mjs
index 91cbaee752..c491f6daba 100644
--- a/providers/workable.mjs
+++ b/providers/workable.mjs
@@ -65,6 +65,8 @@ export default {
  *   | Title | Department | Location | Type | Salary | Posted | Details |
  * where `Details` holds a markdown link
  *   [View](https://apply.workable.com/<slug>/jobs/view/<id>.md)
+ * URLs are validated against `https://apply.workable.com/` — off-domain or
+ * non-HTTPS [View] links are skipped (not emitted).
  *
  * @param {string} text — markdown body
  * @param {string} companyName — value to write into job.company
@@ -85,6 +87,16 @@ export function parseWorkableMarkdown(text, companyName) {
     let url = urlMatch ? urlMatch[1] : '';
     if (url.endsWith('.md')) url = url.slice(0, -3);
     if (!url) continue;  // skip rows with no resolvable URL (e.g., malformed [View] link)
+
+    // Validate the extracted URL — must parse as https://apply.workable.com/...
+    try {
+      const parsedUrl = new URL(url);
+      if (parsedUrl.protocol !== 'https:' || parsedUrl.hostname !== 'apply.workable.com') continue;
+      url = parsedUrl.href;
+    } catch {
+      continue;
+    }
+
     jobs.push({ title, url, location, company: companyName });
   }
   return jobs;
diff --git a/test-all.mjs b/test-all.mjs
index 1ce3fb58d9..9ea3aa4848 100644
--- a/test-all.mjs
+++ b/test-all.mjs
@@ -435,6 +435,21 @@ try {
     fail(`stray-pipe row not handled correctly: ${JSON.stringify(strayJobs)}`);
   }
 
+  // Off-domain [View] link is dropped (URL validation)
+  const offDomainMd = [
+    '| Title | Department | Location | Type | Salary | Posted | Details |',
+    '|---|---|---|---|---|---|---|',
+    '| Good Role | Product | Remote | Full-time | — | 2026-04-01 | [View](https://apply.workable.com/x/jobs/view/ABC.md) |',
+    '| Evil Role | Product | Remote | Full-time | — | 2026-04-01 | [View](https://evil.example/jobs/view/X) |',
+    '| Insecure Role | Product | Remote | Full-time | — | 2026-04-01 | [View](http://apply.workable.com/x/jobs/view/Y.md) |',
+  ].join('\n');
+  const filteredJobs = parseWorkableMarkdown(offDomainMd, 'X');
+  if (filteredJobs.length === 1 && filteredJobs[0].title === 'Good Role') {
+    pass('parseWorkableMarkdown drops off-domain and non-https [View] links');
+  } else {
+    fail(`expected only "Good Role" through, got ${JSON.stringify(filteredJobs.map(j => j.title))}`);
+  }
+
 } catch (e) {
   fail(`workable provider tests crashed: ${e.message}`);
 }
@@ -577,6 +592,54 @@ try {
     fail(`fallback URL not properly slugified: ${JSON.stringify(slugifiedCompany[0]?.url)}`);
   }
 
+  // Pagination: fetch() loops until an empty page (or short page) is returned
+  let pageRequests = 0;
+  const pagedJobs = await sr.fetch(
+    { name: 'PagedCo', careers_url: 'https://careers.smartrecruiters.com/paged' },
+    {
+      transport: 'http',
+      fetchText: async () => { throw new Error('fetchText should not be called'); },
+      fetchJson: async (url) => {
+        pageRequests++;
+        const offset = parseInt(new URL(url).searchParams.get('offset') || '0', 10);
+        if (offset === 0) {
+          // Page 1: full page (100 items)
+          return { content: Array.from({ length: 100 }, (_, i) => ({ id: `P1-${i}`, name: `Role 1-${i}` })) };
+        }
+        if (offset === 100) {
+          // Page 2: short page (50 items) → loop stops after this
+          return { content: Array.from({ length: 50 }, (_, i) => ({ id: `P2-${i}`, name: `Role 2-${i}` })) };
+        }
+        // Should not be reached because page 2 was short
+        return { content: [] };
+      },
+    },
+  );
+  if (pageRequests === 2 && pagedJobs.length === 150) {
+    pass('smartrecruiters.fetch() paginates and aggregates results (2 pages → 150 total)');
+  } else {
+    fail(`pagination: pageRequests=${pageRequests}, total=${pagedJobs.length} (expected 2 requests / 150 results)`);
+  }
+
+  // Pagination stop condition: empty content terminates the loop
+  let emptyPageRequests = 0;
+  const emptyJobs = await sr.fetch(
+    { name: 'EmptyCo', careers_url: 'https://careers.smartrecruiters.com/empty' },
+    {
+      transport: 'http',
+      fetchText: async () => { throw new Error('fetchText should not be called'); },
+      fetchJson: async () => {
+        emptyPageRequests++;
+        return { content: [] };
+      },
+    },
+  );
+  if (emptyPageRequests === 1 && emptyJobs.length === 0) {
+    pass('smartrecruiters.fetch() stops on the first empty page');
+  } else {
+    fail(`empty pagination: requests=${emptyPageRequests}, total=${emptyJobs.length}`);
+  }
+
 } catch (e) {
   fail(`smartrecruiters provider tests crashed: ${e.message}`);
 }
@@ -658,6 +721,24 @@ try {
     fail('recruitee.detect() must NOT misdetect path-spoofed URLs');
   }
 
+  // Off-domain offer URL is dropped (URL validation)
+  const offDomainOffers = parseRecruiteeResponse(
+    {
+      offers: [
+        { title: 'Good', careers_url: 'https://channable.recruitee.com/o/good' },
+        { title: 'Evil', careers_url: 'https://evil.example/o/evil' },
+        { title: 'Insecure', careers_url: 'http://channable.recruitee.com/o/insecure' },
+        { title: 'No URL field' },
+      ],
+    },
+    'Channable',
+  );
+  if (offDomainOffers[0]?.url === 'https://channable.recruitee.com/o/good' && offDomainOffers[1]?.url === '' && offDomainOffers[2]?.url === '' && offDomainOffers[3]?.url === '') {
+    pass('parseRecruiteeResponse drops off-domain, non-https, and missing offer URLs');
+  } else {
+    fail(`URL validation: row0=${JSON.stringify(offDomainOffers[0]?.url)}, row1=${JSON.stringify(offDomainOffers[1]?.url)}, row2=${JSON.stringify(offDomainOffers[2]?.url)}, row3=${JSON.stringify(offDomainOffers[3]?.url)}`);
+  }
+
 } catch (e) {
   fail(`recruitee provider tests crashed: ${e.message}`);
 }