Skip to content

Commit 1110f07

Browse files
committed
feat: schema-validated router labels via standard-schema
Introduce per-label typing of `request.userData` for the router, in two layers:

- A `label -> userData` map can be passed as the router's `Routes` type argument, typing `request.userData` per label and rejecting unknown labels at compile time. Backwards compatible (default is an open map).
- A per-label Standard Schema map (Zod, Valibot, ArkType, …) passed to `Router.create`/`createXRouter` both infers the `userData` types and validates them at runtime before the handler runs, replacing `request.userData` with the parsed value. Invalid requests throw a new non-retryable `RequestValidationError`.

Adds the types-only `@standard-schema/spec` dependency to `@crawlee/core`.

Relates to #3082
1 parent f1f0959 commit 1110f07

15 files changed

Lines changed: 407 additions & 40 deletions

File tree

packages/basic-crawler/src/internals/basic-crawler.ts

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@ import type {
2121
RequestTransform,
2222
RouterHandler,
2323
RouterRoutes,
24+
RouteSchemas,
25+
RoutesFromSchemas,
2426
SkippedRequestCallback,
2527
Source,
2628
StatisticsOptions,
@@ -2348,6 +2350,12 @@ export interface CrawlerRunOptions extends CrawlerAddRequestsOptions {
23482350
/**
 * Creates a router whose handlers receive a {@apilink BasicCrawlingContext}, optionally pre-populated
 * from a `label -> handler` map. Delegates to {@apilink Router.create}.
 *
 * The type arguments are, in order: `Context`, `UserData` and `Routes` (the `label -> userData` map).
 * When `Routes` is omitted it defaults to an open map in which every label uses `UserData`.
 */
export function createBasicRouter<
    Context extends BasicCrawlingContext = BasicCrawlingContext,
    UserData extends Dictionary = GetUserDataFromRequest<Context['request']>,
    Routes extends Record<keyof Routes, Dictionary> = Record<string, UserData>,
>(routes?: RouterRoutes<Context, Routes>): RouterHandler<Context, Routes>;
/**
 * Creates a router from a map of per-label schemas. The `request.userData` type of every label is
 * derived from its schema through {@apilink RoutesFromSchemas}.
 */
export function createBasicRouter<
    Context extends BasicCrawlingContext = BasicCrawlingContext,
    const Schemas extends RouteSchemas = RouteSchemas,
>(schemas: Schemas): RouterHandler<Context, RoutesFromSchemas<Schemas>>;
export function createBasicRouter(routesOrSchemas?: any): any {
    // Both overloads forward their single argument untouched to `Router.create`.
    return Router.create(routesOrSchemas);
}

packages/cheerio-crawler/src/internals/cheerio-crawler.ts

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,10 @@ import type {
88
InternalHttpHook,
99
IRequestManager,
1010
RequestHandler,
11+
RouterHandler,
1112
RouterRoutes,
13+
RouteSchemas,
14+
RoutesFromSchemas,
1215
SkippedRequestCallback,
1316
} from '@crawlee/http';
1417
import {
@@ -364,6 +367,12 @@ export async function cheerioCrawlerEnqueueLinks(
364367
/**
 * Creates a router whose handlers receive a {@apilink CheerioCrawlingContext}, optionally pre-populated
 * from a `label -> handler` map. Delegates to {@apilink Router.create}.
 *
 * The type arguments are, in order: `Context`, `UserData` and `Routes` (the `label -> userData` map).
 * When `Routes` is omitted it defaults to an open map in which every label uses `UserData`.
 */
export function createCheerioRouter<
    Context extends CheerioCrawlingContext = CheerioCrawlingContext,
    UserData extends Dictionary = GetUserDataFromRequest<Context['request']>,
    Routes extends Record<keyof Routes, Dictionary> = Record<string, UserData>,
>(routes?: RouterRoutes<Context, Routes>): RouterHandler<Context, Routes>;
/**
 * Creates a router from a map of per-label schemas. The `request.userData` type of every label is
 * derived from its schema through {@apilink RoutesFromSchemas}.
 */
export function createCheerioRouter<
    Context extends CheerioCrawlingContext = CheerioCrawlingContext,
    const Schemas extends RouteSchemas = RouteSchemas,
>(schemas: Schemas): RouterHandler<Context, RoutesFromSchemas<Schemas>>;
export function createCheerioRouter(routesOrSchemas?: any): any {
    // Both overloads forward their single argument untouched to `Router.create`.
    return Router.create(routesOrSchemas);
}

packages/core/package.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@
5757
"@crawlee/types": "workspace:*",
5858
"@crawlee/utils": "workspace:*",
5959
"@sapphire/async-queue": "^1.5.5",
60+
"@standard-schema/spec": "^1.0.0",
6061
"@vladfrangu/async_event_emitter": "^2.4.6",
6162
"csv-stringify": "^6.5.2",
6263
"json5": "^2.2.3",

packages/core/src/errors.ts

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,32 @@ export class CriticalError extends NonRetryableError {}
1616
*/
1717
export class MissingRouteError extends CriticalError {}
1818

19+
/**
 * Thrown when a request's `userData` does not match the schema registered for its label
 * (see {@apilink RouteSchemas}).
 *
 * As the `userData` does not change between attempts, this error is non-retryable.
 */
export class RequestValidationError extends NonRetryableError {
    /**
     * @param label Label of the request whose `userData` failed validation.
     * @param issues Issues reported by the schema. Each issue becomes one `- path: message` line of the
     *   error message; the `path: ` prefix is omitted when the issue has no (or an empty) path.
     */
    constructor(
        readonly label: string | symbol,
        readonly issues: readonly {
            readonly message: string;
            readonly path?: readonly (PropertyKey | { key: PropertyKey })[];
        }[],
    ) {
        const details = issues
            .map((issue) => {
                const path = (issue.path ?? [])
                    // A segment is either a bare key or a `{ key }` wrapper. Keys may be symbols, which
                    // `Array.prototype.join` cannot convert implicitly (it throws a `TypeError`), so
                    // every key is stringified explicitly before joining.
                    .map((segment) => String(typeof segment === 'object' ? segment.key : segment))
                    .join('.');
                return `- ${path ? `${path}: ` : ''}${issue.message}`;
            })
            .join('\n');

        super(`Request userData for label '${String(label)}' failed schema validation:\n${details}`);
    }
}
44+
1945
/**
2046
* Errors of `RetryRequestError` type will always be retried by the crawler.
2147
*

packages/core/src/router.ts

Lines changed: 191 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,73 @@
11
import type { Dictionary } from '@crawlee/types';
2+
import type { StandardSchemaV1 } from '@standard-schema/spec';
23

34
import type { CrawlingContext, LoadedRequest, RestrictedCrawlingContext } from './crawlers/crawler_commons.js';
4-
import { MissingRouteError } from './errors.js';
5+
import { MissingRouteError, RequestValidationError } from './errors.js';
56
import type { Request } from './request.js';
67
import type { Awaitable } from './typedefs.js';
78

89
const defaultRoute = Symbol('default-route');
910

11+
/**
 * Describes, for every request label, the shape of `request.userData` carried by requests with that
 * label. Supply it as the `Routes` type argument of {@apilink Router} (or of a `createXRouter` factory)
 * to have `request.userData` typed per label and labels checked in {@apilink Router.addHandler}.
 *
 * ```ts
 * interface MyRoutes {
 *     PRODUCT: { sku: string; price: number };
 *     CATEGORY: { categoryId: string };
 * }
 * ```
 */
export type RouteMap = { [label: string]: Dictionary };
24+
25+
/**
 * Associates request labels with the [Standard Schema](https://standardschema.dev) (Zod, Valibot,
 * ArkType, …) that validates `request.userData` for that label. Hand it to {@apilink Router.create} or a
 * `createXRouter` factory to both derive the per-label `request.userData` types and have them validated
 * at runtime before the matching handler runs.
 */
export type RouteSchemas = { [label: string]: StandardSchemaV1 };
32+
33+
/**
 * Turns a {@apilink RouteSchemas} map into the matching {@apilink RouteMap} (label → `userData` type) by
 * taking the inferred output type of every schema. A schema whose output type is not assignable to
 * {@apilink Dictionary} is mapped to a plain {@apilink Dictionary} instead.
 */
export type RoutesFromSchemas<Schemas extends RouteSchemas> = {
    [Key in keyof Schemas]: StandardSchemaV1.InferOutput<Schemas[Key]> extends Dictionary
        ? StandardSchemaV1.InferOutput<Schemas[Key]>
        : Dictionary;
};
42+
43+
/**
 * Crawling context as seen by a route handler: identical to `Context`, except that `request` is
 * replaced by a loaded request whose `userData` is typed as `UserData`.
 */
export type RouterHandlerContext<Context, UserData extends Dictionary> = Omit<Context, 'request'> & {
    request: LoadedRequest<Request<UserData>>;
};
49+
50+
/**
 * Labels that {@apilink Router.addHandler} accepts for a given route map.
 *
 * - Open map (`string` is a key of `Routes`, as in the default `Record<string, ...>`): any string or
 *   symbol, which preserves the original behaviour.
 * - Concrete {@apilink RouteMap} (e.g. `{ PRODUCT: ...; CATEGORY: ... }`): only the declared string
 *   labels, plus symbols — any other label is a compile-time error.
 */
export type RouterLabel<Routes extends Record<keyof Routes, Dictionary>> = string extends keyof Routes
    ? string | symbol
    : (keyof Routes & string) | symbol;
59+
1060
/**
 * A {@apilink Router} that can additionally be invoked as a function with a crawling context.
 * `Routes` is the router's `label -> userData` map; by default it is an open map in which every label
 * uses the `userData` type of the context's request.
 */
export interface RouterHandler<
    Context extends Omit<RestrictedCrawlingContext, 'enqueueLinks'> = CrawlingContext,
    Routes extends Record<keyof Routes, Dictionary> = Record<string, GetUserDataFromRequest<Context['request']>>,
> extends Router<Context, Routes> {
    (ctx: Context): Awaitable<void>;
}
1566

1667
/** Extracts the `userData` type argument of a {@apilink Request} type; resolves to `never` for any other type. */
export type GetUserDataFromRequest<T> = T extends Request<infer UserData> ? UserData : never;
1768

18-
/**
 * A `label -> handler` map: every label of `Routes` is mapped to a handler whose `request.userData` is
 * typed with the `userData` shape declared for that label.
 *
 * NOTE(review): handlers here receive a plain `Request`, whereas `Router.addHandler` hands out a
 * `LoadedRequest` — confirm whether the difference is intentional.
 */
export type RouterRoutes<Context, Routes extends Record<keyof Routes, Dictionary>> = {
    [Key in keyof Routes]: (
        ctx: Omit<Context, 'request'> & { request: Request<Routes[Key]> },
    ) => Awaitable<void>;
};
2172

2273
/**
@@ -83,9 +134,57 @@ export type RouterRoutes<Context, UserData extends Dictionary> = {
83134
* ctx.log.info('...');
84135
* });
85136
* ```
137+
*
138+
* ## Typed labels
139+
*
140+
* To get `request.userData` typed per label, declare a {@apilink RouteMap} and pass it as the second
141+
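* type argument of {@apilink Router} — or as the third one, after `Context` and `UserData`, of {@apilink Router.create} and the `createXRouter` factories. The label passed to {@apilink Router.addHandler} then drives the type of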
142+
* `request.userData`, and unknown labels are rejected at compile time:
143+
*
144+
* ```ts
145+
* import { createCheerioRouter, CheerioCrawlingContext } from 'crawlee';
146+
*
147+
* interface Routes {
148+
* PRODUCT: { sku: string; price: number };
149+
* CATEGORY: { categoryId: string };
150+
* }
151+
*
152+
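* // The factory's type arguments are `Context`, `UserData`, `Routes` — the route map goes third.
* const router = createCheerioRouter<CheerioCrawlingContext, Routes[keyof Routes], Routes>();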
153+
*
154+
* router.addHandler('PRODUCT', async ({ request }) => {
155+
* request.userData.sku; // string
156+
* request.userData.price; // number
157+
* });
158+
*
159+
* router.addHandler('TYPO', async () => {}); // compile error: not a known label
160+
* ```
161+
*
162+
* ## Schema-validated labels
163+
*
164+
* Passing a [Standard Schema](https://standardschema.dev) per label both infers the `request.userData`
165+
* types *and* validates them at runtime before the handler runs (replacing `request.userData` with the
166+
* parsed value). A failing request throws a {@apilink RequestValidationError}.
167+
*
168+
* ```ts
169+
* import { z } from 'zod';
170+
* import { createCheerioRouter } from 'crawlee';
171+
*
172+
* const router = createCheerioRouter({
173+
* PRODUCT: z.object({ sku: z.string(), price: z.number() }),
174+
* CATEGORY: z.object({ categoryId: z.string() }),
175+
* });
176+
*
177+
* router.addHandler('PRODUCT', async ({ request }) => {
178+
* request.userData.price; // number, inferred from the schema and validated at runtime
179+
* });
180+
* ```
86181
*/
87-
export class Router<Context extends Omit<RestrictedCrawlingContext, 'enqueueLinks'>> {
182+
export class Router<
183+
Context extends Omit<RestrictedCrawlingContext, 'enqueueLinks'>,
184+
Routes extends Record<keyof Routes, Dictionary> = Record<string, GetUserDataFromRequest<Context['request']>>,
185+
> {
88186
private readonly routes: Map<string | symbol, (ctx: any) => Awaitable<void>> = new Map();
187+
private readonly schemas: Map<string | symbol, StandardSchemaV1> = new Map();
89188
private readonly middlewares: ((ctx: Context) => Awaitable<void>)[] = [];
90189

91190
/**
@@ -95,26 +194,52 @@ export class Router<Context extends Omit<RestrictedCrawlingContext, 'enqueueLink
95194
protected constructor() {}
96195

97196
/**
 * Registers a route handler for one of the labels declared in the router's {@apilink RouteMap}.
 * `request.userData` is typed with the shape declared for that label.
 */
addHandler<Label extends keyof Routes & string>(
    label: Label,
    handler: (ctx: RouterHandlerContext<Context, Routes[Label]>) => Awaitable<void>,
): void;

/**
 * Registers a route handler with an explicitly chosen `request.userData` type (by default the
 * `userData` type of the context's request). The label must satisfy {@apilink RouterLabel}: any string
 * or symbol when the route map is open, otherwise a declared label or a symbol.
 */
addHandler<UserData extends Dictionary = GetUserDataFromRequest<Context['request']>>(
    label: RouterLabel<Routes>,
    handler: (ctx: RouterHandlerContext<Context, UserData>) => Awaitable<void>,
): void;

addHandler(label: string | symbol, handler: (ctx: any) => Awaitable<void>): void {
    // `validate` runs first, so a rejected label never reaches the registry.
    this.validate(label);
    this.routes.set(label, handler);
}
107218

108219
/**
 * Registers the default route handler. Unless `UserData` is given explicitly, `request.userData` is
 * typed as the union of all `userData` shapes declared in the router's {@apilink RouteMap}.
 */
addDefaultHandler<UserData extends Dictionary = Routes[keyof Routes]>(
    handler: (ctx: RouterHandlerContext<Context, UserData>) => Awaitable<void>,
) {
    // The default handler is stored in the same registry as labelled handlers, under a private symbol.
    this.validate(defaultRoute);
    this.routes.set(defaultRoute, handler);
}
117229

230+
/**
 * Registers {@apilink RouteSchemas|Standard Schema} validators for the given labels. Entries without a
 * schema are skipped; a schema registered earlier for the same label is overwritten.
 *
 * Before a matching route handler runs, `request.userData` is validated against the label's schema and
 * replaced with the parsed value; a failing request throws a {@apilink RequestValidationError}.
 */
addSchemas(schemas: Partial<Record<keyof Routes & string, StandardSchemaV1>>) {
    Object.entries(schemas).forEach(([label, schema]) => {
        // `Partial` allows labels that are present but hold no schema — nothing to register for those.
        if (!schema) {
            return;
        }

        this.schemas.set(label, schema as StandardSchemaV1);
    });
}
242+
118243
/**
119244
* Registers a middleware that will be fired before the matching route handler.
120245
* Multiple middlewares can be registered, they will be fired in the same order.
@@ -142,6 +267,27 @@ export class Router<Context extends Omit<RestrictedCrawlingContext, 'enqueueLink
142267
);
143268
}
144269

270+
/**
 * Runs the schema registered for the request's label (if there is one) against `request.userData` and
 * stores the parsed value back on the request. Requests without a label, or whose label has no schema,
 * are left untouched.
 *
 * @throws {RequestValidationError} When the schema reports issues.
 */
private async validateRequest(context: Context) {
    const { request } = context;
    const { label } = request;

    if (label == null) {
        return;
    }

    const schema = this.schemas.get(label);
    if (!schema) {
        return;
    }

    // Awaited so that a schema returning its result asynchronously is handled as well.
    const result = await schema['~standard'].validate(request.userData);
    if (result.issues) {
        throw new RequestValidationError(label, result.issues);
    }

    // NOTE(review): the parsed value replaces `userData` wholesale, so any key the schema drops is
    // lost — confirm nothing downstream relies on such keys.
    request.userData = result.value as Dictionary;
}
290+
145291
/**
146292
* Throws when the label already exists in our registry.
147293
*/
@@ -174,27 +320,55 @@ export class Router<Context extends Omit<RestrictedCrawlingContext, 'enqueueLink
174320
* });
175321
* await crawler.run();
176322
* ```
323+
*
324+
* Passing a {@apilink RouteSchemas|Standard Schema} per label instead of handlers infers the
325+
* `request.userData` types and validates them at runtime:
326+
*
327+
* ```ts
328+
* import { z } from 'zod';
329+
*
330+
* const router = Router.create({
331+
* PRODUCT: z.object({ sku: z.string() }),
332+
* });
333+
* ```
177334
*/
178335
static create<
179336
Context extends Omit<RestrictedCrawlingContext, 'enqueueLinks'> = CrawlingContext,
180337
UserData extends Dictionary = GetUserDataFromRequest<Context['request']>,
181-
>(routes?: RouterRoutes<Context, UserData>): RouterHandler<Context> {
182-
const router = new Router<Context>();
338+
Routes extends Record<keyof Routes, Dictionary> = Record<string, UserData>,
339+
>(routes?: RouterRoutes<Context, Routes>): RouterHandler<Context, Routes>;
340+
341+
static create<
342+
Context extends Omit<RestrictedCrawlingContext, 'enqueueLinks'> = CrawlingContext,
343+
const Schemas extends RouteSchemas = RouteSchemas,
344+
>(schemas: Schemas): RouterHandler<Context, RoutesFromSchemas<Schemas>>;
345+
346+
static create<Context extends Omit<RestrictedCrawlingContext, 'enqueueLinks'> = CrawlingContext>(
347+
routesOrSchemas?: Record<string, ((ctx: any) => Awaitable<void>) | StandardSchemaV1>,
348+
): RouterHandler<Context, any> {
349+
const router = new Router<Context, any>();
183350
const obj = Object.create(Function.prototype);
184351

185352
obj.addHandler = router.addHandler.bind(router);
186353
obj.addDefaultHandler = router.addDefaultHandler.bind(router);
354+
obj.addSchemas = router.addSchemas.bind(router);
187355
obj.getHandler = router.getHandler.bind(router);
188356
obj.use = router.use.bind(router);
189357

190-
for (const [label, handler] of Object.entries(routes ?? {})) {
191-
router.addHandler(label, handler);
358+
for (const [label, value] of Object.entries(routesOrSchemas ?? {})) {
359+
if (typeof value === 'function') {
360+
router.addHandler(label as keyof Context & string, value as (ctx: any) => Awaitable<void>);
361+
} else {
362+
router.schemas.set(label, value);
363+
}
192364
}
193365

194366
const func = async function (context: Context) {
195367
const { url, loadedUrl, label } = context.request;
196368
context.log.debug('Page opened.', { label, url: loadedUrl ?? url });
197369

370+
await router.validateRequest(context);
371+
198372
for (const middleware of router.middlewares) {
199373
await middleware(context);
200374
}
@@ -204,6 +378,6 @@ export class Router<Context extends Omit<RestrictedCrawlingContext, 'enqueueLink
204378

205379
Object.setPrototypeOf(func, obj);
206380

207-
return func as unknown as RouterHandler<Context>;
381+
return func as unknown as RouterHandler<Context, any>;
208382
}
209383
}

0 commit comments

Comments
 (0)