From 2187fbf2331e8ea67c8dc45fe9f16ecefa87db82 Mon Sep 17 00:00:00 2001 From: axmanalad Date: Thu, 24 Jul 2025 13:04:36 -0700 Subject: [PATCH 01/10] fix: correct clean script for core package --- packages/core/package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/core/package.json b/packages/core/package.json index 118be74f99c9..9e1247c7e84e 100644 --- a/packages/core/package.json +++ b/packages/core/package.json @@ -45,7 +45,7 @@ "homepage": "https://crawlee.dev", "scripts": { "build": "yarn clean && yarn compile && yarn copy", - "clean": "rimraf ./dist", + "clean": "rimraf packages/core/dist", "compile": "tsc -p tsconfig.build.json && gen-esm-wrapper ./dist/index.js ./dist/index.mjs", "copy": "tsx ../../scripts/copy.ts" }, From f16799cf6dce1e94512ffda92c64825379b7a76f Mon Sep 17 00:00:00 2001 From: axmanalad Date: Thu, 24 Jul 2025 14:50:49 -0700 Subject: [PATCH 02/10] feat: add option "allowedSubdomains" for same-domain strategy to filter subdomains --- .../core/src/enqueue_links/enqueue_links.ts | 69 ++++++++++++++----- 1 file changed, 53 insertions(+), 16 deletions(-) diff --git a/packages/core/src/enqueue_links/enqueue_links.ts b/packages/core/src/enqueue_links/enqueue_links.ts index 8c9106389a7b..67695999651f 100644 --- a/packages/core/src/enqueue_links/enqueue_links.ts +++ b/packages/core/src/enqueue_links/enqueue_links.ts @@ -1,7 +1,7 @@ import type { BatchAddRequestsResult, Dictionary } from '@crawlee/types'; import { type RobotsTxtFile } from '@crawlee/utils'; import ow from 'ow'; -import { getDomain } from 'tldts'; +import { getDomain, getSubdomain } from 'tldts'; import type { SetRequired } from 'type-fest'; import log from '@apify/log'; @@ -67,6 +67,14 @@ export interface EnqueueLinksOptions extends RequestQueueOperationOptions { */ baseUrl?: string; + /** + * An array of allowed subdomains to be used for matching URLs. + * + * Note that this option is only used when the `strategy` is set to `same-domain`. 
+ * @default ["*"] + */ + allowedSubdomains?: readonly string[]; + /** * An array of glob pattern strings or plain objects * containing glob pattern strings matching the URLs to be enqueued. @@ -156,8 +164,8 @@ export interface EnqueueLinksOptions extends RequestQueueOperationOptions { * Depending on the strategy you select, we will only check certain parts of the URLs found. Here is a diagram of each URL part and their name: * * ```md - * Protocol Domain - * ┌────┐ ┌─────────┐ + * Protocol Sub Domain + * ┌────┐ ┌─────┐ ┌─────────┐ * https://example.crawlee.dev/... * │ └─────────────────┤ * │ Hostname │ @@ -198,8 +206,8 @@ export interface EnqueueLinksOptions extends RequestQueueOperationOptions { * Depending on the strategy you select, we will only check certain parts of the URLs found. Here is a diagram of each URL part and their name: * * ```md - * Protocol Domain - * ┌────┐ ┌─────────┐ + * Protocol Sub Domain + * ┌────┐ ┌─────┐ ┌─────────┐ * https://example.crawlee.dev/... * │ └─────────────────┤ * │ Hostname │ @@ -232,6 +240,11 @@ export enum EnqueueStrategy { * Matches any URLs that have the same domain as the base URL. * For example, `https://wow.an.example.com` and `https://example.com` will both be matched for a base url of * `https://example.com`. + * + * Also matches filtered subdomains if `allowedSubdomains` is provided instead of any subdomain. + * For example, if `allowedSubdomains` is set to `['wow', 'nice']` and the base URL is `https://example.com`, then + * `https://wow.example.com`, `https://nice.example.com`, and `https://example.com` will be matched, but + * `https://bar.example.com` will not. * * > This strategy will match both `http` and `https` protocols regardless of the base URL protocol. 
*/ @@ -313,6 +326,7 @@ export async function enqueueLinks( transformRequestFunction: ow.optional.function, strategy: ow.optional.string.oneOf(Object.values(EnqueueStrategy)), waitForAllRequestsToBeAdded: ow.optional.boolean, + allowedSubdomains: ow.optional.array.ofType(ow.string), }), ); @@ -329,6 +343,7 @@ export async function enqueueLinks( waitForAllRequestsToBeAdded, robotsTxtFile, onSkippedRequest, + allowedSubdomains, } = options; const urlExcludePatternObjects: UrlPatternObject[] = []; @@ -374,18 +389,40 @@ export async function enqueueLinks( enqueueStrategyPatterns.push({ glob: ignoreHttpSchema(`${url.origin}/**`) }); break; case EnqueueStrategy.SameDomain: { - // Get the actual hostname from the base url - const baseUrlHostname = getDomain(url.hostname, { mixedInputs: false }); - - if (baseUrlHostname) { - // We have a hostname, so we can use it to match all links on the page that point to it and any subdomains of it - url.hostname = baseUrlHostname; - enqueueStrategyPatterns.push( - { glob: ignoreHttpSchema(`${url.origin.replace(baseUrlHostname, `*.${baseUrlHostname}`)}/**`) }, - { glob: ignoreHttpSchema(`${url.origin}/**`) }, - ); + // Get the actual domain and subdomain from the base url + const baseUrlDomain = getDomain(url.hostname, { mixedInputs: false }); + const baseUrlSubdomain = getSubdomain(url.hostname); + const subList = allowedSubdomains ?? ['*']; + + if (baseUrlDomain) { + // We have a domain, so we can use it to match all links on the page that point to it and any subdomains of it + if (subList.includes('*')) { + url.hostname = baseUrlDomain; + enqueueStrategyPatterns.push( + { glob: ignoreHttpSchema(`${url.origin.replace(baseUrlDomain, `*.${baseUrlDomain}`)}/**`) }, + { glob: ignoreHttpSchema(`${url.origin}/**`) }, // Only base URL, no subdomains + ); + } else if (subList.length > 0) { + // Defaults to always include subdomain of original URL if it exists. 
+ enqueueStrategyPatterns.push({ glob: ignoreHttpSchema(`${url.origin}/**`) }); // Original URL + // User decides to filter by specific subdomains, so we can match them. + for (const subdomain of subList) { + // Prevents the original URL's subdomain from being pushed again as a duplicate pattern + if (subdomain && subdomain !== baseUrlSubdomain) { + const filteredSubdomainUrl = new URL(url.origin); + filteredSubdomainUrl.hostname = `${subdomain}.${baseUrlDomain}`; + enqueueStrategyPatterns.push( + { glob: ignoreHttpSchema(`${filteredSubdomainUrl.origin}/**`) }, // Base URL with filtered subdomain + ); + } + } + url.hostname = baseUrlDomain; + enqueueStrategyPatterns.push( + { glob: ignoreHttpSchema(`${url.origin}/**`) }, // Only base URL, no subdomains + ); + } } else { - // We don't have a hostname (can happen for ips for instance), so reproduce the same behavior + // We don't have a domain (can happen for ips for instance), so reproduce the same behavior // as SameDomainAndSubdomain enqueueStrategyPatterns.push({ glob: ignoreHttpSchema(`${url.origin}/**`) }); } From 4804903678a2cb7249b4b112b2b5438a1d02b427 Mon Sep 17 00:00:00 2001 From: SalvadorN Date: Thu, 24 Jul 2025 16:06:32 -0700 Subject: [PATCH 03/10] chore(enqueueLinks_test): Added unit tests for the same-domain allowedSubdomains param --- test/core/enqueue_links/enqueue_links.test.ts | 327 ++++++++++++++++++ 1 file changed, 327 insertions(+) diff --git a/test/core/enqueue_links/enqueue_links.test.ts b/test/core/enqueue_links/enqueue_links.test.ts index fbf34c604973..8f0163f6aede 100644 --- a/test/core/enqueue_links/enqueue_links.test.ts +++ b/test/core/enqueue_links/enqueue_links.test.ts @@ -44,6 +44,25 @@ const HTML = ` `; +const HTML_WITH_SUBDOMAINS = ` + + + Base domain + Base domain HTTP + WWW subdomain + WWW subdomain HTTP + API subdomain + Blog subdomain + CDN subdomain + Admin subdomain + Complex subdomain API-V2 + Complex subdomain user-dashboard + Different domain + Different domain with subdomain + + +`; + 
function createRequestQueueMock() { const enqueued: Source[] = []; const requestQueue = new RequestQueue({ id: 'xxx', client: apifyClient }); @@ -572,6 +591,53 @@ describe('enqueueLinks()', () => { expect(enqueued[2].method).toBe('GET'); expect(enqueued[2].userData!.foo).toBe('bar'); }); + + test('allowedSubdomains works with browser crawler and specific subdomains', async () => { + await page.setContent(HTML_WITH_SUBDOMAINS); + + const { enqueued, requestQueue } = createRequestQueueMock(); + await browserCrawlerEnqueueLinks({ + options: { + baseUrl: 'https://example.com', + strategy: EnqueueStrategy.SameDomain, + allowedSubdomains: ['www', 'api'] + }, + page, + requestQueue, + originalRequestUrl: 'https://example.com', + }); + + expect(enqueued).toHaveLength(5); + expect(enqueued.map(r => r.url)).toEqual([ + 'https://example.com/page1', + 'http://example.com/page2', + 'https://www.example.com/page3', + 'http://www.example.com/page4', + 'https://api.example.com/page5', + ]); + }); + + test('allowedSubdomains works with browser crawler and empty array', async () => { + await page.setContent(HTML_WITH_SUBDOMAINS); + + const { enqueued, requestQueue } = createRequestQueueMock(); + await browserCrawlerEnqueueLinks({ + options: { + baseUrl: 'https://example.com', + strategy: EnqueueStrategy.SameDomain, + allowedSubdomains: [''] + }, + page, + requestQueue, + originalRequestUrl: 'https://example.com', + }); + + expect(enqueued).toHaveLength(2); + expect(enqueued.map(r => r.url)).toEqual([ + 'https://example.com/page1', + 'http://example.com/page2', + ]); + }); }); describe('using Cheerio', () => { @@ -1027,4 +1093,265 @@ describe('enqueueLinks()', () => { } }); }); + + describe('allowedSubdomains parameter', () => { + let $: CheerioRoot; + + beforeEach(async () => { + $ = load(HTML_WITH_SUBDOMAINS); + }); + + afterEach(async () => { + $ = null!; + }); + + test('works with explicit wildcard in allowedSubdomains', async () => { + const { enqueued, requestQueue } = 
createRequestQueueMock(); + await cheerioCrawlerEnqueueLinks({ + options: { + baseUrl: 'https://example.com', + strategy: EnqueueStrategy.SameDomain, + allowedSubdomains: ['*'] + }, + $, + requestQueue, + originalRequestUrl: 'https://example.com', + }); + + expect(enqueued).toHaveLength(10); + expect(enqueued.map(r => r.url)).toEqual([ + 'https://example.com/page1', + 'http://example.com/page2', + 'https://www.example.com/page3', + 'http://www.example.com/page4', + 'https://api.example.com/page5', + 'https://blog.example.com/page6', + 'https://cdn.example.com/page7', + 'https://admin.example.com/page8', + 'https://api-v2.example.com/page9', + 'https://user-dashboard.example.com/page10', + ]); + }); + + test('works with specific allowed subdomains', async () => { + const { enqueued, requestQueue } = createRequestQueueMock(); + await cheerioCrawlerEnqueueLinks({ + options: { + baseUrl: 'https://example.com', + strategy: EnqueueStrategy.SameDomain, + allowedSubdomains: ['www', 'api'] + }, + $, + requestQueue, + originalRequestUrl: 'https://example.com', + }); + + expect(enqueued).toHaveLength(5); + expect(enqueued.map(r => r.url)).toEqual([ + 'https://example.com/page1', + 'http://example.com/page2', + 'https://www.example.com/page3', + 'http://www.example.com/page4', + 'https://api.example.com/page5', + ]); + }); + + test('works with empty allowedSubdomains array', async () => { + const { enqueued, requestQueue } = createRequestQueueMock(); + await cheerioCrawlerEnqueueLinks({ + options: { + baseUrl: 'https://example.com', + strategy: EnqueueStrategy.SameDomain, + allowedSubdomains: [''] + }, + $, + requestQueue, + originalRequestUrl: 'https://example.com', + }); + + expect(enqueued).toHaveLength(2); + expect(enqueued.map(r => r.url)).toEqual([ + 'https://example.com/page1', + 'http://example.com/page2', + ]); + }); + + test('handles duplicate subdomains in allowedSubdomains', async () => { + const { enqueued, requestQueue } = createRequestQueueMock(); + await 
cheerioCrawlerEnqueueLinks({ + options: { + baseUrl: 'https://example.com', + strategy: EnqueueStrategy.SameDomain, + allowedSubdomains: ['www', 'api', 'www', 'api', 'blog'] + }, + $, + requestQueue, + originalRequestUrl: 'https://example.com', + }); + + expect(enqueued).toHaveLength(6); + expect(enqueued.map(r => r.url)).toEqual([ + 'https://example.com/page1', + 'http://example.com/page2', + 'https://www.example.com/page3', + 'http://www.example.com/page4', + 'https://api.example.com/page5', + 'https://blog.example.com/page6', + ]); + }); + + test('handles empty string subdomains in allowedSubdomains', async () => { + const { enqueued, requestQueue } = createRequestQueueMock(); + await cheerioCrawlerEnqueueLinks({ + options: { + baseUrl: 'https://example.com', + strategy: EnqueueStrategy.SameDomain, + allowedSubdomains: ['www', '', 'api', ' ', 'blog'] + }, + $, + requestQueue, + originalRequestUrl: 'https://example.com', + }); + + expect(enqueued).toHaveLength(6); + expect(enqueued.map(r => r.url)).toEqual([ + 'https://example.com/page1', + 'http://example.com/page2', + 'https://www.example.com/page3', + 'http://www.example.com/page4', + 'https://api.example.com/page5', + 'https://blog.example.com/page6', + ]); + }); + + test('works when baseUrl has subdomain and allowedSubdomains filters it', async () => { + const { enqueued, requestQueue } = createRequestQueueMock(); + await cheerioCrawlerEnqueueLinks({ + options: { + baseUrl: 'https://api.example.com', + strategy: EnqueueStrategy.SameDomain, + allowedSubdomains: ['www', 'blog'] + }, + $, + requestQueue, + originalRequestUrl: 'https://api.example.com', + }); + + expect(enqueued).toHaveLength(6); + expect(enqueued.map(r => r.url)).toEqual([ + 'https://example.com/page1', + 'http://example.com/page2', + 'https://www.example.com/page3', + 'http://www.example.com/page4', + 'https://api.example.com/page5', + 'https://blog.example.com/page6', + ]); + }); + + test('works when baseUrl has subdomain that matches 
allowedSubdomains', async () => { + const { enqueued, requestQueue } = createRequestQueueMock(); + await cheerioCrawlerEnqueueLinks({ + options: { + baseUrl: 'https://www.example.com', + strategy: EnqueueStrategy.SameDomain, + allowedSubdomains: ['www', 'api'] + }, + $, + requestQueue, + originalRequestUrl: 'https://www.example.com', + }); + + expect(enqueued).toHaveLength(5); + expect(enqueued.map(r => r.url)).toEqual([ + 'https://example.com/page1', + 'http://example.com/page2', + 'https://www.example.com/page3', + 'http://www.example.com/page4', + 'https://api.example.com/page5', + ]); + }); + + test('ignores allowedSubdomains when strategy is not same-domain', async () => { + const { enqueued, requestQueue } = createRequestQueueMock(); + await cheerioCrawlerEnqueueLinks({ + options: { + baseUrl: 'https://example.com', + strategy: EnqueueStrategy.SameHostname, + allowedSubdomains: ['www', 'api'] + }, + $, + requestQueue, + originalRequestUrl: 'https://example.com', + }); + + expect(enqueued).toHaveLength(2); + expect(enqueued[0].url).toBe('https://example.com/page1'); + expect(enqueued[1].url).toBe('http://example.com/page2'); + }); + + test('works with mixed protocol filtering', async () => { + const $mixed = load(HTML_WITH_SUBDOMAINS); + + const { enqueued, requestQueue } = createRequestQueueMock(); + await cheerioCrawlerEnqueueLinks({ + options: { + baseUrl: 'https://example.com', + strategy: EnqueueStrategy.SameDomain, + allowedSubdomains: ['www'] + }, + $: $mixed, + requestQueue, + originalRequestUrl: 'https://example.com', + }); + + expect(enqueued).toHaveLength(4); + expect(enqueued.map(r => r.url)).toEqual([ + 'https://example.com/page1', + 'http://example.com/page2', + 'https://www.example.com/page3', + 'http://www.example.com/page4', + ]); + }); + + test('works with complex subdomain names', async () => { + const $complex = load(HTML_WITH_SUBDOMAINS); + + const { enqueued, requestQueue } = createRequestQueueMock(); + await cheerioCrawlerEnqueueLinks({ 
+ options: { + baseUrl: 'https://example.com', + strategy: EnqueueStrategy.SameDomain, + allowedSubdomains: ['api-v2', 'user-dashboard'] + }, + $: $complex, + requestQueue, + originalRequestUrl: 'https://example.com', + }); + + expect(enqueued).toHaveLength(4); + expect(enqueued.map(r => r.url)).toEqual([ + 'https://example.com/page1', + 'http://example.com/page2', + 'https://api-v2.example.com/page9', + 'https://user-dashboard.example.com/page10', + ]); + }); + + test('validation works with allowedSubdomains parameter', async () => { + const { enqueued, requestQueue } = createRequestQueueMock(); + + await expect( + cheerioCrawlerEnqueueLinks({ + options: { + baseUrl: 'https://example.com', + strategy: EnqueueStrategy.SameDomain, + allowedSubdomains: ['www', 'api'] + }, + $, + requestQueue, + originalRequestUrl: 'https://example.com', + }) + ).resolves.not.toThrow(); + }); + }); }); From fb6dd88b67f42e1380c7c0931dd352935a3f0cb5 Mon Sep 17 00:00:00 2001 From: axmanalad Date: Thu, 24 Jul 2025 17:00:03 -0700 Subject: [PATCH 04/10] fix(enqueueLinks): empty allowedSubdomains runs default same-domain - Fixed missing comma formatting in tests --- .../core/src/enqueue_links/enqueue_links.ts | 2 +- test/core/enqueue_links/enqueue_links.test.ts | 28 +++++++++---------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/packages/core/src/enqueue_links/enqueue_links.ts b/packages/core/src/enqueue_links/enqueue_links.ts index 67695999651f..ac3f4056b1e2 100644 --- a/packages/core/src/enqueue_links/enqueue_links.ts +++ b/packages/core/src/enqueue_links/enqueue_links.ts @@ -396,7 +396,7 @@ export async function enqueueLinks( if (baseUrlDomain) { // We have a domain, so we can use it to match all links on the page that point to it and any subdomains of it - if (subList.includes('*')) { + if (subList.includes('*') || subList.length === 0) { url.hostname = baseUrlDomain; enqueueStrategyPatterns.push( { glob: ignoreHttpSchema(`${url.origin.replace(baseUrlDomain, 
`*.${baseUrlDomain}`)}/**`) }, diff --git a/test/core/enqueue_links/enqueue_links.test.ts b/test/core/enqueue_links/enqueue_links.test.ts index 8f0163f6aede..40b15c273f03 100644 --- a/test/core/enqueue_links/enqueue_links.test.ts +++ b/test/core/enqueue_links/enqueue_links.test.ts @@ -600,7 +600,7 @@ describe('enqueueLinks()', () => { options: { baseUrl: 'https://example.com', strategy: EnqueueStrategy.SameDomain, - allowedSubdomains: ['www', 'api'] + allowedSubdomains: ['www', 'api'], }, page, requestQueue, @@ -625,7 +625,7 @@ describe('enqueueLinks()', () => { options: { baseUrl: 'https://example.com', strategy: EnqueueStrategy.SameDomain, - allowedSubdomains: [''] + allowedSubdomains: [''], }, page, requestQueue, @@ -1111,7 +1111,7 @@ describe('enqueueLinks()', () => { options: { baseUrl: 'https://example.com', strategy: EnqueueStrategy.SameDomain, - allowedSubdomains: ['*'] + allowedSubdomains: ['*'], }, $, requestQueue, @@ -1139,7 +1139,7 @@ describe('enqueueLinks()', () => { options: { baseUrl: 'https://example.com', strategy: EnqueueStrategy.SameDomain, - allowedSubdomains: ['www', 'api'] + allowedSubdomains: ['www', 'api'], }, $, requestQueue, @@ -1162,7 +1162,7 @@ describe('enqueueLinks()', () => { options: { baseUrl: 'https://example.com', strategy: EnqueueStrategy.SameDomain, - allowedSubdomains: [''] + allowedSubdomains: [''], }, $, requestQueue, @@ -1182,7 +1182,7 @@ describe('enqueueLinks()', () => { options: { baseUrl: 'https://example.com', strategy: EnqueueStrategy.SameDomain, - allowedSubdomains: ['www', 'api', 'www', 'api', 'blog'] + allowedSubdomains: ['www', 'api', 'www', 'api', 'blog'], }, $, requestQueue, @@ -1206,7 +1206,7 @@ describe('enqueueLinks()', () => { options: { baseUrl: 'https://example.com', strategy: EnqueueStrategy.SameDomain, - allowedSubdomains: ['www', '', 'api', ' ', 'blog'] + allowedSubdomains: ['www', '', 'api', ' ', 'blog'], }, $, requestQueue, @@ -1230,7 +1230,7 @@ describe('enqueueLinks()', () => { options: { baseUrl: 
'https://api.example.com', strategy: EnqueueStrategy.SameDomain, - allowedSubdomains: ['www', 'blog'] + allowedSubdomains: ['www', 'blog'], }, $, requestQueue, @@ -1254,7 +1254,7 @@ describe('enqueueLinks()', () => { options: { baseUrl: 'https://www.example.com', strategy: EnqueueStrategy.SameDomain, - allowedSubdomains: ['www', 'api'] + allowedSubdomains: ['www', 'api'], }, $, requestQueue, @@ -1277,7 +1277,7 @@ describe('enqueueLinks()', () => { options: { baseUrl: 'https://example.com', strategy: EnqueueStrategy.SameHostname, - allowedSubdomains: ['www', 'api'] + allowedSubdomains: ['www', 'api'], }, $, requestQueue, @@ -1297,7 +1297,7 @@ describe('enqueueLinks()', () => { options: { baseUrl: 'https://example.com', strategy: EnqueueStrategy.SameDomain, - allowedSubdomains: ['www'] + allowedSubdomains: ['www'], }, $: $mixed, requestQueue, @@ -1321,7 +1321,7 @@ describe('enqueueLinks()', () => { options: { baseUrl: 'https://example.com', strategy: EnqueueStrategy.SameDomain, - allowedSubdomains: ['api-v2', 'user-dashboard'] + allowedSubdomains: ['api-v2', 'user-dashboard'], }, $: $complex, requestQueue, @@ -1345,12 +1345,12 @@ describe('enqueueLinks()', () => { options: { baseUrl: 'https://example.com', strategy: EnqueueStrategy.SameDomain, - allowedSubdomains: ['www', 'api'] + allowedSubdomains: ['www', 'api'], }, $, requestQueue, originalRequestUrl: 'https://example.com', - }) + }), ).resolves.not.toThrow(); }); }); From 484473a4597cd501c18164dc2dd4ea31e9ff05b3 Mon Sep 17 00:00:00 2001 From: Bao Truong Date: Fri, 25 Jul 2025 09:52:45 -0700 Subject: [PATCH 05/10] adding documentation --- docs/introduction/03-adding-urls.mdx | 48 ++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/docs/introduction/03-adding-urls.mdx b/docs/introduction/03-adding-urls.mdx index 387b86fb9450..e223866c3567 100644 --- a/docs/introduction/03-adding-urls.mdx +++ b/docs/introduction/03-adding-urls.mdx @@ -112,6 +112,44 @@ await enqueueLinks({ }); ``` +### 
Controlling subdomain access + +When using the `same-domain` strategy, you can further control which subdomains are allowed using the `allowedSubdomains` option. By default with `same-domain`, all subdomains are included, but you can specify only certain subdomains: + +```ts +// Allow only specific subdomains with same-domain strategy +await enqueueLinks({ + strategy: 'same-domain', + allowedSubdomains: ['blog', 'shop', 'support'] +}); + +// Allow all subdomains (default same-domain behavior) +await enqueueLinks({ + strategy: 'same-domain', + allowedSubdomains: ['*'] +}); + +// Allow only the main domain (no subdomains) with same-domain strategy +await enqueueLinks({ + strategy: 'same-domain', + allowedSubdomains: [] +}); +``` + +For example, if your base URL is `https://example.com` and you use `strategy: 'same-domain'` with `allowedSubdomains: ['blog', 'shop']`, the crawler will enqueue links from: +- `https://example.com` (main domain) +- `https://blog.example.com` (allowed subdomain) +- `https://shop.example.com` (allowed subdomain) + +But it will **not** enqueue links from: +- `https://api.example.com` (subdomain not in allowed list) +- `https://docs.example.com` (subdomain not in allowed list) + +:::note +The `allowedSubdomains` option only works when you explicitly set `strategy: 'same-domain'`. It has no effect with the default `same-hostname` strategy or other strategies. +::: + + When you run the code, you will see the crawler log the **title** of the first page, then the **enqueueing** message showing number of URLs, followed by the **title** of the first enqueued page and so on and so on. 
## Skipping duplicate URLs @@ -144,6 +182,16 @@ await enqueueLinks({ }); ``` +You can also combine the `same-domain` strategy with `allowedSubdomains` for precise control: + +```ts +await enqueueLinks({ + strategy: 'same-domain', + allowedSubdomains: ['www', 'mobile'], // Only allow www and mobile subdomains + globs: ['http?(s)://**.example.com/products/*'] // Additional URL filtering +}); +``` + ### Transform requests To have absolute control, we have the `transformRequestFunction`. Just before a new `Request` is constructed and enqueued to the `RequestQueue`, this function can be used to skip it or modify its contents such as `userData`, `payload` or, most importantly, `uniqueKey`. This is useful when you need to enqueue multiple requests to the queue, and these requests share the same URL, but differ in methods or payloads. Another use case is to dynamically update or create the `userData`. From dcac61d5e80346e3cb8a8ad89e9f16efc9dc3894 Mon Sep 17 00:00:00 2001 From: axmanalad Date: Fri, 25 Jul 2025 10:47:28 -0700 Subject: [PATCH 06/10] docs(enqueueLinks): correct allowedSubdomains code for main domain only --- docs/introduction/03-adding-urls.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/introduction/03-adding-urls.mdx b/docs/introduction/03-adding-urls.mdx index e223866c3567..6117a7c5dafb 100644 --- a/docs/introduction/03-adding-urls.mdx +++ b/docs/introduction/03-adding-urls.mdx @@ -132,7 +132,7 @@ await enqueueLinks({ // Allow only the main domain (no subdomains) with same-domain strategy await enqueueLinks({ strategy: 'same-domain', - allowedSubdomains: [] + allowedSubdomains: [''] }); ``` From 3714c7751e0ea43909810c11129ac01d1479a089 Mon Sep 17 00:00:00 2001 From: axmanalad Date: Fri, 25 Jul 2025 12:19:04 -0700 Subject: [PATCH 07/10] style(enqueueLinks): fix linting and clean up whitespace in comments and test cases --- .../core/src/enqueue_links/enqueue_links.ts | 4 +- test/core/enqueue_links/enqueue_links.test.ts | 64 
+++++++++---------- 2 files changed, 31 insertions(+), 37 deletions(-) diff --git a/packages/core/src/enqueue_links/enqueue_links.ts b/packages/core/src/enqueue_links/enqueue_links.ts index ac3f4056b1e2..bee8629a37bc 100644 --- a/packages/core/src/enqueue_links/enqueue_links.ts +++ b/packages/core/src/enqueue_links/enqueue_links.ts @@ -69,7 +69,7 @@ export interface EnqueueLinksOptions extends RequestQueueOperationOptions { /** * An array of allowed subdomains to be used for matching URLs. - * + * * Note that this option is only used when the `strategy` is set to `same-domain`. * @default ["*"] */ @@ -240,7 +240,7 @@ export enum EnqueueStrategy { * Matches any URLs that have the same domain as the base URL. * For example, `https://wow.an.example.com` and `https://example.com` will both be matched for a base url of * `https://example.com`. - * + * * Also matches filtered subdomains if `allowedSubdomains` is provided instead of any subdomain. * For example, if `allowedSubdomains` is set to `['wow', 'nice']` and the base URL is `https://example.com`, then * `https://wow.example.com`, `https://nice.example.com`, and `https://example.com` will be matched, but diff --git a/test/core/enqueue_links/enqueue_links.test.ts b/test/core/enqueue_links/enqueue_links.test.ts index 40b15c273f03..e9f0d4350ca6 100644 --- a/test/core/enqueue_links/enqueue_links.test.ts +++ b/test/core/enqueue_links/enqueue_links.test.ts @@ -594,10 +594,10 @@ describe('enqueueLinks()', () => { test('allowedSubdomains works with browser crawler and specific subdomains', async () => { await page.setContent(HTML_WITH_SUBDOMAINS); - + const { enqueued, requestQueue } = createRequestQueueMock(); await browserCrawlerEnqueueLinks({ - options: { + options: { baseUrl: 'https://example.com', strategy: EnqueueStrategy.SameDomain, allowedSubdomains: ['www', 'api'], @@ -608,7 +608,7 @@ describe('enqueueLinks()', () => { }); expect(enqueued).toHaveLength(5); - expect(enqueued.map(r => r.url)).toEqual([ + 
expect(enqueued.map((r) => r.url)).toEqual([ 'https://example.com/page1', 'http://example.com/page2', 'https://www.example.com/page3', @@ -619,10 +619,10 @@ describe('enqueueLinks()', () => { test('allowedSubdomains works with browser crawler and empty array', async () => { await page.setContent(HTML_WITH_SUBDOMAINS); - + const { enqueued, requestQueue } = createRequestQueueMock(); await browserCrawlerEnqueueLinks({ - options: { + options: { baseUrl: 'https://example.com', strategy: EnqueueStrategy.SameDomain, allowedSubdomains: [''], @@ -633,10 +633,7 @@ describe('enqueueLinks()', () => { }); expect(enqueued).toHaveLength(2); - expect(enqueued.map(r => r.url)).toEqual([ - 'https://example.com/page1', - 'http://example.com/page2', - ]); + expect(enqueued.map((r) => r.url)).toEqual(['https://example.com/page1', 'http://example.com/page2']); }); }); @@ -1108,7 +1105,7 @@ describe('enqueueLinks()', () => { test('works with explicit wildcard in allowedSubdomains', async () => { const { enqueued, requestQueue } = createRequestQueueMock(); await cheerioCrawlerEnqueueLinks({ - options: { + options: { baseUrl: 'https://example.com', strategy: EnqueueStrategy.SameDomain, allowedSubdomains: ['*'], @@ -1119,7 +1116,7 @@ describe('enqueueLinks()', () => { }); expect(enqueued).toHaveLength(10); - expect(enqueued.map(r => r.url)).toEqual([ + expect(enqueued.map((r) => r.url)).toEqual([ 'https://example.com/page1', 'http://example.com/page2', 'https://www.example.com/page3', @@ -1136,7 +1133,7 @@ describe('enqueueLinks()', () => { test('works with specific allowed subdomains', async () => { const { enqueued, requestQueue } = createRequestQueueMock(); await cheerioCrawlerEnqueueLinks({ - options: { + options: { baseUrl: 'https://example.com', strategy: EnqueueStrategy.SameDomain, allowedSubdomains: ['www', 'api'], @@ -1147,7 +1144,7 @@ describe('enqueueLinks()', () => { }); expect(enqueued).toHaveLength(5); - expect(enqueued.map(r => r.url)).toEqual([ + expect(enqueued.map((r) => 
r.url)).toEqual([ 'https://example.com/page1', 'http://example.com/page2', 'https://www.example.com/page3', @@ -1159,7 +1156,7 @@ describe('enqueueLinks()', () => { test('works with empty allowedSubdomains array', async () => { const { enqueued, requestQueue } = createRequestQueueMock(); await cheerioCrawlerEnqueueLinks({ - options: { + options: { baseUrl: 'https://example.com', strategy: EnqueueStrategy.SameDomain, allowedSubdomains: [''], @@ -1170,16 +1167,13 @@ describe('enqueueLinks()', () => { }); expect(enqueued).toHaveLength(2); - expect(enqueued.map(r => r.url)).toEqual([ - 'https://example.com/page1', - 'http://example.com/page2', - ]); + expect(enqueued.map((r) => r.url)).toEqual(['https://example.com/page1', 'http://example.com/page2']); }); test('handles duplicate subdomains in allowedSubdomains', async () => { const { enqueued, requestQueue } = createRequestQueueMock(); await cheerioCrawlerEnqueueLinks({ - options: { + options: { baseUrl: 'https://example.com', strategy: EnqueueStrategy.SameDomain, allowedSubdomains: ['www', 'api', 'www', 'api', 'blog'], @@ -1190,7 +1184,7 @@ describe('enqueueLinks()', () => { }); expect(enqueued).toHaveLength(6); - expect(enqueued.map(r => r.url)).toEqual([ + expect(enqueued.map((r) => r.url)).toEqual([ 'https://example.com/page1', 'http://example.com/page2', 'https://www.example.com/page3', @@ -1203,7 +1197,7 @@ describe('enqueueLinks()', () => { test('handles empty string subdomains in allowedSubdomains', async () => { const { enqueued, requestQueue } = createRequestQueueMock(); await cheerioCrawlerEnqueueLinks({ - options: { + options: { baseUrl: 'https://example.com', strategy: EnqueueStrategy.SameDomain, allowedSubdomains: ['www', '', 'api', ' ', 'blog'], @@ -1214,7 +1208,7 @@ describe('enqueueLinks()', () => { }); expect(enqueued).toHaveLength(6); - expect(enqueued.map(r => r.url)).toEqual([ + expect(enqueued.map((r) => r.url)).toEqual([ 'https://example.com/page1', 'http://example.com/page2', 
'https://www.example.com/page3', @@ -1227,7 +1221,7 @@ describe('enqueueLinks()', () => { test('works when baseUrl has subdomain and allowedSubdomains filters it', async () => { const { enqueued, requestQueue } = createRequestQueueMock(); await cheerioCrawlerEnqueueLinks({ - options: { + options: { baseUrl: 'https://api.example.com', strategy: EnqueueStrategy.SameDomain, allowedSubdomains: ['www', 'blog'], @@ -1238,7 +1232,7 @@ describe('enqueueLinks()', () => { }); expect(enqueued).toHaveLength(6); - expect(enqueued.map(r => r.url)).toEqual([ + expect(enqueued.map((r) => r.url)).toEqual([ 'https://example.com/page1', 'http://example.com/page2', 'https://www.example.com/page3', @@ -1251,7 +1245,7 @@ describe('enqueueLinks()', () => { test('works when baseUrl has subdomain that matches allowedSubdomains', async () => { const { enqueued, requestQueue } = createRequestQueueMock(); await cheerioCrawlerEnqueueLinks({ - options: { + options: { baseUrl: 'https://www.example.com', strategy: EnqueueStrategy.SameDomain, allowedSubdomains: ['www', 'api'], @@ -1262,7 +1256,7 @@ describe('enqueueLinks()', () => { }); expect(enqueued).toHaveLength(5); - expect(enqueued.map(r => r.url)).toEqual([ + expect(enqueued.map((r) => r.url)).toEqual([ 'https://example.com/page1', 'http://example.com/page2', 'https://www.example.com/page3', @@ -1274,7 +1268,7 @@ describe('enqueueLinks()', () => { test('ignores allowedSubdomains when strategy is not same-domain', async () => { const { enqueued, requestQueue } = createRequestQueueMock(); await cheerioCrawlerEnqueueLinks({ - options: { + options: { baseUrl: 'https://example.com', strategy: EnqueueStrategy.SameHostname, allowedSubdomains: ['www', 'api'], @@ -1291,10 +1285,10 @@ describe('enqueueLinks()', () => { test('works with mixed protocol filtering', async () => { const $mixed = load(HTML_WITH_SUBDOMAINS); - + const { enqueued, requestQueue } = createRequestQueueMock(); await cheerioCrawlerEnqueueLinks({ - options: { + options: { baseUrl: 
'https://example.com', strategy: EnqueueStrategy.SameDomain, allowedSubdomains: ['www'], @@ -1305,7 +1299,7 @@ describe('enqueueLinks()', () => { }); expect(enqueued).toHaveLength(4); - expect(enqueued.map(r => r.url)).toEqual([ + expect(enqueued.map((r) => r.url)).toEqual([ 'https://example.com/page1', 'http://example.com/page2', 'https://www.example.com/page3', @@ -1315,10 +1309,10 @@ describe('enqueueLinks()', () => { test('works with complex subdomain names', async () => { const $complex = load(HTML_WITH_SUBDOMAINS); - + const { enqueued, requestQueue } = createRequestQueueMock(); await cheerioCrawlerEnqueueLinks({ - options: { + options: { baseUrl: 'https://example.com', strategy: EnqueueStrategy.SameDomain, allowedSubdomains: ['api-v2', 'user-dashboard'], @@ -1329,7 +1323,7 @@ describe('enqueueLinks()', () => { }); expect(enqueued).toHaveLength(4); - expect(enqueued.map(r => r.url)).toEqual([ + expect(enqueued.map((r) => r.url)).toEqual([ 'https://example.com/page1', 'http://example.com/page2', 'https://api-v2.example.com/page9', @@ -1339,10 +1333,10 @@ describe('enqueueLinks()', () => { test('validation works with allowedSubdomains parameter', async () => { const { enqueued, requestQueue } = createRequestQueueMock(); - + await expect( cheerioCrawlerEnqueueLinks({ - options: { + options: { baseUrl: 'https://example.com', strategy: EnqueueStrategy.SameDomain, allowedSubdomains: ['www', 'api'], From 4d202a1d9a743cf02d755dee726074f50a9c73a1 Mon Sep 17 00:00:00 2001 From: axmanalad Date: Fri, 25 Jul 2025 12:20:20 -0700 Subject: [PATCH 08/10] Revert "fix: correct clean script for core package" This reverts commit 2187fbf2331e8ea67c8dc45fe9f16ecefa87db82. 
---
 packages/core/package.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/packages/core/package.json b/packages/core/package.json
index 9e1247c7e84e..118be74f99c9 100644
--- a/packages/core/package.json
+++ b/packages/core/package.json
@@ -45,7 +45,7 @@
   "homepage": "https://crawlee.dev",
   "scripts": {
     "build": "yarn clean && yarn compile && yarn copy",
-    "clean": "rimraf packages/core/dist",
+    "clean": "rimraf ./dist",
     "compile": "tsc -p tsconfig.build.json && gen-esm-wrapper ./dist/index.js ./dist/index.mjs",
     "copy": "tsx ../../scripts/copy.ts"
   },

From 8902b0ffc39dfb3021e8693b333649b3aec9e711 Mon Sep 17 00:00:00 2001
From: axmanalad
Date: Fri, 25 Jul 2025 14:47:16 -0700
Subject: [PATCH 09/10] docs(adding-urls): move run result message back into
 "Filtering links to same domain"

---
 docs/introduction/03-adding-urls.mdx | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/docs/introduction/03-adding-urls.mdx b/docs/introduction/03-adding-urls.mdx
index 6117a7c5dafb..3d0d2434de63 100644
--- a/docs/introduction/03-adding-urls.mdx
+++ b/docs/introduction/03-adding-urls.mdx
@@ -112,6 +112,8 @@ await enqueueLinks({
 });
 ```
 
+When you run the code, you will see the crawler log the **title** of the first page, then the **enqueueing** message showing the number of URLs, followed by the **title** of the first enqueued page and so on and so on.
+
 ### Controlling subdomain access
 
 When using the `same-domain` strategy, you can further control which subdomains are allowed using the `allowedSubdomains` option. By default with `same-domain`, all subdomains are included, but you can specify only certain subdomains:
@@ -149,9 +151,6 @@ But it will **not** enqueue links from:
 
 The `allowedSubdomains` option only works when you explicitly set `strategy: 'same-domain'`. It has no effect with the default `same-hostname` strategy or other strategies.
::: - -When you run the code, you will see the crawler log the **title** of the first page, then the **enqueueing** message showing number of URLs, followed by the **title** of the first enqueued page and so on and so on. - ## Skipping duplicate URLs Skipping of duplicate URLs is critical, because visiting the same page multiple times would lead to duplicate results. This is automatically handled by the `RequestQueue` which deduplicates requests using their `uniqueKey`. This `uniqueKey` is automatically generated from the request's URL by lowercasing the URL, lexically ordering query parameters, removing fragments and a few other tweaks that ensure the queue only includes unique URLs. From a76b3a6371c480889824e6414656596a51b4d5fb Mon Sep 17 00:00:00 2001 From: axmanalad Date: Fri, 25 Jul 2025 15:15:14 -0700 Subject: [PATCH 10/10] fix(enqueueLinks): simplify subdomain filtering in 'same-domain' strat --- packages/core/src/enqueue_links/enqueue_links.ts | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/packages/core/src/enqueue_links/enqueue_links.ts b/packages/core/src/enqueue_links/enqueue_links.ts index bee8629a37bc..2cb50539e59f 100644 --- a/packages/core/src/enqueue_links/enqueue_links.ts +++ b/packages/core/src/enqueue_links/enqueue_links.ts @@ -402,18 +402,17 @@ export async function enqueueLinks( { glob: ignoreHttpSchema(`${url.origin.replace(baseUrlDomain, `*.${baseUrlDomain}`)}/**`) }, { glob: ignoreHttpSchema(`${url.origin}/**`) }, // Only base URL, no subdomains ); - } else if (subList.length > 0) { + } else { // Defaults to always include subdomain of original URL if it exists. enqueueStrategyPatterns.push({ glob: ignoreHttpSchema(`${url.origin}/**`) }); // Original URL // User decides to filter by specific subdomains, so we can match them. 
for (const subdomain of subList) { - // Prevents duplicate subdomain from original URL to be pushed as a pattern if (subdomain && subdomain !== baseUrlSubdomain) { const filteredSubdomainUrl = new URL(url.origin); filteredSubdomainUrl.hostname = `${subdomain}.${baseUrlDomain}`; - enqueueStrategyPatterns.push( - { glob: ignoreHttpSchema(`${filteredSubdomainUrl.origin}/**`) }, // Base URL with filtered subdomain - ); + enqueueStrategyPatterns.push({ + glob: ignoreHttpSchema(`${filteredSubdomainUrl.origin}/**`), + }); } } url.hostname = baseUrlDomain;