Skip to content

Commit 7f3cc4e

Browse files
committed
feat: propagate router route map to crawler & context request methods
When a typed router is used, the route map now also types the request inputs:

- handler context: `ctx.addRequests` and `ctx.enqueueLinks` require the `userData` shape matching the request's `label` (and reject unknown labels); this is driven by the router, so it works for every crawler type.
- crawler instance: `Routes` is inferred from the `requestHandler` option and used to type `crawler.addRequests`/`crawler.run` for the non-browser crawlers (Basic/Http/Cheerio/JSDOM/LinkeDOM).

Unlabeled requests keep loose `userData` (they hit the default handler). All typing is backwards compatible via the open-map default.

Note: crawler-instance `addRequests` typing is not yet wired for the browser crawlers (Playwright/Puppeteer/Stagehand) — their `requestHandler` is redefined in `BrowserCrawlerOptions`, which breaks generic inference through the hierarchy; their handler-context methods are still fully typed via the router.
1 parent 4b70a48 commit 7f3cc4e

8 files changed

Lines changed: 191 additions & 26 deletions

File tree

packages/basic-crawler/src/internals/basic-crawler.ts

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ import type {
2828
StatisticsOptions,
2929
StatisticState,
3030
StorageIdentifier,
31+
TypedRequestsLike,
3132
} from '@crawlee/core';
3233
import {
3334
AutoscaledPool,
@@ -112,7 +113,7 @@ export type ErrorHandler<
112113

113114
export interface StatusMessageCallbackParams<
114115
Context extends CrawlingContext = BasicCrawlingContext,
115-
Crawler extends BasicCrawler<any> = BasicCrawler<Context>,
116+
Crawler extends BasicCrawler<any, any, any, any> = BasicCrawler<Context>,
116117
> {
117118
state: StatisticState;
118119
crawler: Crawler;
@@ -122,7 +123,7 @@ export interface StatusMessageCallbackParams<
122123

123124
export type StatusMessageCallback<
124125
Context extends CrawlingContext = BasicCrawlingContext,
125-
Crawler extends BasicCrawler<any> = BasicCrawler<Context>,
126+
Crawler extends BasicCrawler<any, any, any, any> = BasicCrawler<Context>,
126127
> = (params: StatusMessageCallbackParams<Context, Crawler>) => Awaitable<void>;
127128

128129
export type RequireContextPipeline<
@@ -136,6 +137,7 @@ export interface BasicCrawlerOptions<
136137
Context extends CrawlingContext = CrawlingContext,
137138
ContextExtension = Dictionary<never>,
138139
ExtendedContext extends Context = Context & ContextExtension,
140+
Routes extends Record<keyof Routes, Dictionary> = Record<string, GetUserDataFromRequest<Context['request']>>,
139141
> {
140142
/**
141143
* User-provided function that performs the logic of the crawler. It is called for each URL to crawl.
@@ -154,7 +156,7 @@ export interface BasicCrawlerOptions<
154156
* The exceptions are logged to the request using the
155157
* {@apilink Request.pushErrorMessage|`Request.pushErrorMessage()`} function.
156158
*/
157-
requestHandler?: RequestHandler<ExtendedContext>;
159+
requestHandler?: RouterHandler<ExtendedContext, Routes> | RequestHandler<ExtendedContext>;
158160

159161
/**
160162
* Allows the user to extend the crawling context passed to the request handler with custom functionality.
@@ -514,6 +516,7 @@ export class BasicCrawler<
514516
Context extends CrawlingContext = CrawlingContext,
515517
ContextExtension = Dictionary<never>,
516518
ExtendedContext extends Context = Context & ContextExtension,
519+
Routes extends Record<keyof Routes, Dictionary> = Record<string, GetUserDataFromRequest<Context['request']>>,
517520
> {
518521
protected static readonly CRAWLEE_STATE_KEY = 'CRAWLEE_STATE';
519522

@@ -582,7 +585,10 @@ export class BasicCrawler<
582585
* Default {@apilink Router} instance that will be used if we don't specify any {@apilink BasicCrawlerOptions.requestHandler|`requestHandler`}.
583586
* See {@apilink Router.addHandler|`router.addHandler()`} and {@apilink Router.addDefaultHandler|`router.addDefaultHandler()`}.
584587
*/
585-
readonly router: RouterHandler<Context> = Router.create<Context>();
588+
readonly router: RouterHandler<Context, Routes> = Router.create<Context>() as unknown as RouterHandler<
589+
Context,
590+
Routes
591+
>;
586592

587593
private _basicContextPipeline?: ContextPipeline<{ request: Request }, CrawlingContext>;
588594

@@ -707,7 +713,7 @@ export class BasicCrawler<
707713
* All `BasicCrawler` parameters are passed via an options object.
708714
*/
709715
constructor(
710-
options: BasicCrawlerOptions<Context, ContextExtension, ExtendedContext> &
716+
options: BasicCrawlerOptions<Context, ContextExtension, ExtendedContext, Routes> &
711717
RequireContextPipeline<CrawlingContext, Context> = {} as any, // cast because the constructor logic handles missing `contextPipelineBuilder` - the type is just for DX
712718
) {
713719
ow(options, 'BasicCrawlerOptions', ow.object.exactShape(BasicCrawler.optionsShape));
@@ -1265,7 +1271,7 @@ export class BasicCrawler<
12651271
* @param [requests] The requests to add.
12661272
* @param [options] Options for the request queue.
12671273
*/
1268-
async run(requests?: RequestsLike, options?: CrawlerRunOptions): Promise<FinalStatistics> {
1274+
async run(requests?: TypedRequestsLike<Routes>, options?: CrawlerRunOptions): Promise<FinalStatistics> {
12691275
if (this.running) {
12701276
throw new Error(
12711277
'This crawler instance is already running, you can add more requests to it via `crawler.addRequests()`.',
@@ -1536,7 +1542,7 @@ export class BasicCrawler<
15361542
* @param options Options for the request queue
15371543
*/
15381544
async addRequests(
1539-
requests: ReadonlyDeep<RequestsLike>,
1545+
requests: ReadonlyDeep<TypedRequestsLike<Routes>>,
15401546
options: CrawlerAddRequestsOptions = {},
15411547
): Promise<CrawlerAddRequestsResult> {
15421548
await this.getRequestManager();

packages/cheerio-crawler/src/internals/cheerio-crawler.ts

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,8 @@ export interface CheerioCrawlerOptions<
3737
ExtendedContext extends CheerioCrawlingContext = CheerioCrawlingContext & ContextExtension,
3838
UserData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler
3939
JSONData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler
40-
> extends HttpCrawlerOptions<CheerioCrawlingContext<UserData, JSONData>, ContextExtension, ExtendedContext> {}
40+
Routes extends Record<keyof Routes, Dictionary> = Record<string, UserData>,
41+
> extends HttpCrawlerOptions<CheerioCrawlingContext<UserData, JSONData>, ContextExtension, ExtendedContext, Routes> {}
4142

4243
export type CheerioHook<
4344
UserData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler
@@ -182,11 +183,15 @@ export type CheerioRequestHandler<
182183
export class CheerioCrawler<
183184
ContextExtension = Dictionary<never>,
184185
ExtendedContext extends CheerioCrawlingContext = CheerioCrawlingContext & ContextExtension,
185-
> extends HttpCrawler<CheerioCrawlingContext, ContextExtension, ExtendedContext> {
186+
Routes extends Record<keyof Routes, Dictionary> = Record<
187+
string,
188+
GetUserDataFromRequest<CheerioCrawlingContext['request']>
189+
>,
190+
> extends HttpCrawler<CheerioCrawlingContext, ContextExtension, ExtendedContext, Routes> {
186191
/**
187192
* All `CheerioCrawler` parameters are passed via an options object.
188193
*/
189-
constructor(options?: CheerioCrawlerOptions<ContextExtension, ExtendedContext>) {
194+
constructor(options?: CheerioCrawlerOptions<ContextExtension, ExtendedContext, any, any, Routes>) {
190195
const { contextPipelineBuilder, ...rest } = options ?? {};
191196

192197
super({

packages/core/src/crawlers/crawler_commons.ts

Lines changed: 68 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ import type { ReadonlyDeep, SetRequired } from 'type-fest';
44
import type { Configuration } from '../configuration.js';
55
import type { EnqueueLinksOptions } from '../enqueue_links/enqueue_links.js';
66
import type { CrawleeLogger } from '../log.js';
7-
import type { Request, Source } from '../request.js';
7+
import type { Request, RequestOptions, Source } from '../request.js';
88
import type { Dataset } from '../storages/dataset.js';
99
import { KeyValueStore, type RecordOptions } from '../storages/key_value_store.js';
1010
import type { RequestQueueOperationOptions } from '../storages/request_queue.js';
@@ -13,6 +13,73 @@ import type { StorageIdentifier } from '../storages/storage_instance_manager.js'
1313
/** @internal */
1414
export type IsAny<T> = 0 extends 1 & T ? true : false;
1515

16+
/**
17+
* A request input (URL string, request-options object, or {@apilink Request}) whose `userData` is typed
18+
* according to its `label`, based on a router's route map.
19+
*
20+
* When the route map is open (the default `Record<string, ...>`), this is just the regular loose
21+
* {@apilink Source} input. When the map declares concrete labels, providing a `label` requires the matching
22+
* `userData` shape and rejects labels not present in the map; unlabeled requests keep loose `userData`.
23+
*/
24+
export type LabeledSource<Routes extends Record<keyof Routes, Dictionary>> = string extends keyof Routes
25+
? string | Source
26+
:
27+
| string
28+
| Request
29+
| ({ requestsFromUrl?: string; regex?: RegExp } & (
30+
| {
31+
[Label in keyof Routes & string]: Omit<Partial<RequestOptions<Routes[Label]>>, 'label'> & {
32+
label: Label;
33+
};
34+
}[keyof Routes & string]
35+
| (Omit<Partial<RequestOptions>, 'label'> & { label?: undefined })
36+
));
37+
38+
/**
39+
* The iterable/array of {@apilink LabeledSource} inputs accepted by the label-aware `addRequests`/`run`
40+
* methods of a crawler bound to a typed router.
41+
*/
42+
export type TypedRequestsLike<Routes extends Record<keyof Routes, Dictionary>> =
43+
| AsyncIterable<LabeledSource<Routes>>
44+
| Iterable<LabeledSource<Routes>>
45+
| LabeledSource<Routes>[];
46+
47+
/**
48+
* The label-aware `addRequests` method signature exposed on a request handler's context when the crawler is
49+
* bound to a typed router. Mirrors {@apilink RestrictedCrawlingContext.addRequests} with typed sources.
50+
*/
51+
export type TypedContextAddRequests<Routes extends Record<keyof Routes, Dictionary>> = (
52+
requestsLike: ReadonlyDeep<LabeledSource<Routes>[]>,
53+
options?: ReadonlyDeep<RequestQueueOperationOptions>,
54+
) => Promise<void>;
55+
56+
/**
57+
* An `enqueueLinks`-options object with its `label`/`userData` retyped according to a router's route map: a
58+
* declared `label` requires the matching `userData` shape (unknown labels are rejected), while unlabeled
59+
* calls keep loose `userData`. Returns the options unchanged when the route map is open (the default).
60+
*/
61+
type TypedEnqueueLinksOptions<Options, Routes extends Record<keyof Routes, Dictionary>> = string extends keyof Routes
62+
? Options
63+
: Omit<Options, 'label' | 'userData'> &
64+
(
65+
| { [Label in keyof Routes & string]: { label: Label; userData?: Routes[Label] } }[keyof Routes & string]
66+
| { label?: undefined; userData?: Dictionary }
67+
);
68+
69+
/**
70+
* Transforms a context's existing `enqueueLinks` method so that the `label`/`userData` in its options follow
71+
* the router's route map, while preserving everything else about the signature (argument optionality and
72+
* return type, which differ between crawler types).
73+
*/
74+
export type TypedContextEnqueueLinks<
75+
EnqueueLinks,
76+
Routes extends Record<keyof Routes, Dictionary>,
77+
> = EnqueueLinks extends (options?: infer Options) => infer Result
78+
? (options?: TypedEnqueueLinksOptions<Options, Routes>) => Result
79+
: EnqueueLinks extends (options: infer Options) => infer Result
80+
? (options: TypedEnqueueLinksOptions<Options, Routes>) => Result
81+
: EnqueueLinks;
82+
1683
/** @internal */
1784
export type WithRequired<T, K extends keyof T> = T & { [P in K]-?: T[P] };
1885

packages/core/src/router.ts

Lines changed: 22 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,34 @@
11
import type { Dictionary } from '@crawlee/types';
22
import type { StandardSchemaV1 } from '@standard-schema/spec';
33

4-
import type { CrawlingContext, LoadedRequest, RestrictedCrawlingContext } from './crawlers/crawler_commons.js';
4+
import type {
5+
CrawlingContext,
6+
LoadedRequest,
7+
RestrictedCrawlingContext,
8+
TypedContextAddRequests,
9+
TypedContextEnqueueLinks,
10+
} from './crawlers/crawler_commons.js';
511
import { MissingRouteError, RequestValidationError } from './errors.js';
612
import type { Request } from './request.js';
713
import type { Awaitable } from './typedefs.js';
814

915
const defaultRoute = Symbol('default-route');
1016

1117
/**
12-
* The crawling context received by a route handler, with `request.userData` narrowed to `UserData`.
18+
* The crawling context received by a route handler, with `request.userData` narrowed to `UserData`, and
19+
* `addRequests`/`enqueueLinks` typed according to the router's route map (`Routes`) so that enqueuing a
20+
* request under a declared label requires the matching `userData` shape.
1321
*/
14-
export type RouterHandlerContext<Context, UserData extends Dictionary> = Omit<Context, 'request'> & {
22+
export type RouterHandlerContext<
23+
Context,
24+
UserData extends Dictionary,
25+
Routes extends Record<keyof Routes, Dictionary>,
26+
> = Omit<Context, 'request' | 'addRequests' | 'enqueueLinks'> & {
1527
request: LoadedRequest<Request<UserData>>;
16-
};
28+
addRequests: TypedContextAddRequests<Routes>;
29+
} & (Context extends { enqueueLinks: infer EnqueueLinks }
30+
? { enqueueLinks: TypedContextEnqueueLinks<EnqueueLinks, Routes> }
31+
: {});
1732

1833
/**
1934
* The set of labels accepted by {@apilink Router.addHandler}. When the router declares a concrete
@@ -185,7 +200,7 @@ export class Router<
185200
*/
186201
addHandler<Label extends keyof Routes & string>(
187202
label: Label,
188-
handler: (ctx: RouterHandlerContext<Context, Routes[Label]>) => Awaitable<void>,
203+
handler: (ctx: RouterHandlerContext<Context, Routes[Label], Routes>) => Awaitable<void>,
189204
): void;
190205

191206
/**
@@ -195,7 +210,7 @@ export class Router<
195210
*/
196211
addHandler<UserData extends Dictionary = GetUserDataFromRequest<Context['request']>>(
197212
label: RouterLabel<Routes>,
198-
handler: (ctx: RouterHandlerContext<Context, UserData>) => Awaitable<void>,
213+
handler: (ctx: RouterHandlerContext<Context, UserData, Routes>) => Awaitable<void>,
199214
): void;
200215

201216
addHandler(label: string | symbol, handler: (ctx: any) => Awaitable<void>): void {
@@ -209,7 +224,7 @@ export class Router<
209224
* (loosely typed by default). Pass an explicit `UserData` type argument to narrow it.
210225
*/
211226
addDefaultHandler<UserData extends Dictionary = GetUserDataFromRequest<Context['request']>>(
212-
handler: (ctx: RouterHandlerContext<Context, UserData>) => Awaitable<void>,
227+
handler: (ctx: RouterHandlerContext<Context, UserData, Routes>) => Awaitable<void>,
213228
) {
214229
this.validate(defaultRoute);
215230
this.routes.set(defaultRoute, handler);

packages/http-crawler/src/internals/http-crawler.ts

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,8 @@ export interface HttpCrawlerOptions<
6464
Context extends InternalHttpCrawlingContext = InternalHttpCrawlingContext,
6565
ContextExtension = Dictionary<never>,
6666
ExtendedContext extends Context = Context & ContextExtension,
67-
> extends BasicCrawlerOptions<Context, ContextExtension, ExtendedContext> {
67+
Routes extends Record<keyof Routes, Dictionary> = Record<string, GetUserDataFromRequest<Context['request']>>,
68+
> extends BasicCrawlerOptions<Context, ContextExtension, ExtendedContext, Routes> {
6869
/**
6970
* Timeout in which the HTTP request to the resource needs to finish, given in seconds.
7071
*/
@@ -315,7 +316,8 @@ export class HttpCrawler<
315316
Context extends InternalHttpCrawlingContext<any, any> = InternalHttpCrawlingContext,
316317
ContextExtension = Dictionary<never>,
317318
ExtendedContext extends Context = Context & ContextExtension,
318-
> extends BasicCrawler<Context, ContextExtension, ExtendedContext> {
319+
Routes extends Record<keyof Routes, Dictionary> = Record<string, GetUserDataFromRequest<Context['request']>>,
320+
> extends BasicCrawler<Context, ContextExtension, ExtendedContext, Routes> {
319321
protected preNavigationHooks: InternalHttpHook<CrawlingContext>[];
320322
protected postNavigationHooks: ((
321323
crawlingContext: CrawlingContextWithResponse,

packages/jsdom-crawler/src/internals/jsdom-crawler.ts

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,8 @@ export interface JSDOMCrawlerOptions<
4141
ExtendedContext extends JSDOMCrawlingContext = JSDOMCrawlingContext & ContextExtension,
4242
UserData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler
4343
JSONData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler
44-
> extends HttpCrawlerOptions<JSDOMCrawlingContext<UserData, JSONData>, ContextExtension, ExtendedContext> {
44+
Routes extends Record<keyof Routes, Dictionary> = Record<string, UserData>,
45+
> extends HttpCrawlerOptions<JSDOMCrawlingContext<UserData, JSONData>, ContextExtension, ExtendedContext, Routes> {
4546
/**
4647
* Download and run scripts.
4748
*/
@@ -186,7 +187,11 @@ const resources = new ResourceLoader({
186187
export class JSDOMCrawler<
187188
ContextExtension = Dictionary<never>,
188189
ExtendedContext extends JSDOMCrawlingContext = JSDOMCrawlingContext & ContextExtension,
189-
> extends HttpCrawler<JSDOMCrawlingContext, ContextExtension, ExtendedContext> {
190+
Routes extends Record<keyof Routes, Dictionary> = Record<
191+
string,
192+
GetUserDataFromRequest<JSDOMCrawlingContext['request']>
193+
>,
194+
> extends HttpCrawler<JSDOMCrawlingContext, ContextExtension, ExtendedContext, Routes> {
190195
protected static override optionsShape = {
191196
...HttpCrawler.optionsShape,
192197
runScripts: ow.optional.boolean,
@@ -197,7 +202,7 @@ export class JSDOMCrawler<
197202
protected hideInternalConsole: boolean;
198203
protected virtualConsole: VirtualConsole | null = null;
199204

200-
constructor(options: JSDOMCrawlerOptions<ContextExtension, ExtendedContext> = {}) {
205+
constructor(options: JSDOMCrawlerOptions<ContextExtension, ExtendedContext, any, any, Routes> = {}) {
201206
const { runScripts = false, hideInternalConsole = false, contextPipelineBuilder, ...httpOptions } = options;
202207

203208
super({

packages/linkedom-crawler/src/internals/linkedom-crawler.ts

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,8 @@ export interface LinkeDOMCrawlerOptions<
3737
ExtendedContext extends LinkeDOMCrawlingContext = LinkeDOMCrawlingContext & ContextExtension,
3838
UserData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler
3939
JSONData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler
40-
> extends HttpCrawlerOptions<LinkeDOMCrawlingContext<UserData, JSONData>, ContextExtension, ExtendedContext> {}
40+
Routes extends Record<keyof Routes, Dictionary> = Record<string, UserData>,
41+
> extends HttpCrawlerOptions<LinkeDOMCrawlingContext<UserData, JSONData>, ContextExtension, ExtendedContext, Routes> {}
4142

4243
export interface LinkeDOMCrawlerEnqueueLinksOptions extends Omit<EnqueueLinksOptions, 'urls' | 'requestManager'> {}
4344

@@ -169,10 +170,14 @@ export type LinkeDOMRequestHandler<
169170
export class LinkeDOMCrawler<
170171
ContextExtension = Dictionary<never>,
171172
ExtendedContext extends LinkeDOMCrawlingContext = LinkeDOMCrawlingContext & ContextExtension,
172-
> extends HttpCrawler<LinkeDOMCrawlingContext, ContextExtension, ExtendedContext> {
173+
Routes extends Record<keyof Routes, Dictionary> = Record<
174+
string,
175+
GetUserDataFromRequest<LinkeDOMCrawlingContext['request']>
176+
>,
177+
> extends HttpCrawler<LinkeDOMCrawlingContext, ContextExtension, ExtendedContext, Routes> {
173178
private static parser = new DOMParser();
174179

175-
constructor(options: LinkeDOMCrawlerOptions<ContextExtension, ExtendedContext>) {
180+
constructor(options: LinkeDOMCrawlerOptions<ContextExtension, ExtendedContext, any, any, Routes>) {
176181
const { contextPipelineBuilder, ...rest } = options;
177182

178183
super({

0 commit comments

Comments
 (0)