Skip to content

Commit 4145eee

Browse files
authored
feat!: use native Request / Response interface (#3163)
Phasing out `got-scraping`-specific interfaces in favour of the native `fetch` API. Related to #3071
1 parent 7d0c1fd commit 4145eee

23 files changed

Lines changed: 316 additions & 473 deletions

File tree

docs/upgrading/upgrading_v4.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,3 +94,16 @@ This experimental option relied on an outdated manifest version for browser exte
9494
## Available resource detection
9595

9696
In v3, we introduced a new way to detect available resources for the crawler, available via the `systemInfoV2` flag. In v4, this is the default way to detect available resources. The old behavior has been removed completely, along with the `systemInfoV2` flag.
97+
98+
## `HttpClient` instances return `Response` objects
99+
100+
The methods of `HttpClient` instances were changed to return [native `Response` objects](https://developer.mozilla.org/en-US/docs/Web/API/Response) instead of custom `HttpResponse` objects.
101+
102+
## `CrawlingContext.response` is now of type `Response`
103+
104+
The `CrawlingContext.response` property is now of type [`Response`](https://developer.mozilla.org/en-US/docs/Web/API/Response) instead of `HttpResponse`. `CrawlingContext.sendRequest` method now returns `Response` objects as well.
105+
106+
## Crawling context in the `FileDownload` crawler no longer includes `body` and `stream` properties
107+
108+
The crawling context in the `FileDownload` crawler no longer includes the `body` and `stream` properties. These can be accessed directly via the `response` property instead, e.g. `context.response.bytes()` or `context.response.body`.
109+

packages/basic-crawler/src/internals/basic-crawler.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1405,7 +1405,7 @@ export class BasicCrawler<
14051405
},
14061406
pushData: this.pushData.bind(this),
14071407
useState: this.useState.bind(this),
1408-
sendRequest: createSendRequest(this.httpClient, request!, session),
1408+
sendRequest: createSendRequest(this.httpClient, request!, session) as CrawlingContext['sendRequest'],
14091409
getKeyValueStore: async (idOrName?: string) => KeyValueStore.open(idOrName, { config: this.config }),
14101410
registerDeferredCleanup: (cleanup) => {
14111411
deferredCleanup.push(cleanup);

packages/basic-crawler/src/internals/send-request.ts

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@ import {
55
type Request,
66
type Session,
77
} from '@crawlee/core';
8-
import type { Method, Response as GotResponse } from 'got-scraping';
98

109
/**
1110
* Prepares a function to be used as the `sendRequest` context helper.
@@ -17,10 +16,7 @@ import type { Method, Response as GotResponse } from 'got-scraping';
1716
* @param getProxyUrl A function that will return the proxy URL that should be used for handling the request.
1817
*/
1918
export function createSendRequest(httpClient: BaseHttpClient, originRequest: Request, session: Session | undefined) {
20-
return async <Response = string>(
21-
// TODO the type information here (and in crawler_commons) is outright wrong... for BC - replace this with generic HttpResponse in v4
22-
overrideOptions: Partial<HttpRequestOptions> = {},
23-
): Promise<GotResponse<Response>> => {
19+
return async (overrideOptions: Partial<HttpRequestOptions> = {}): Promise<Response> => {
2420
const cookieJar = session
2521
? {
2622
getCookieString: async (url: string) => session.getCookieString(url),
@@ -31,7 +27,7 @@ export function createSendRequest(httpClient: BaseHttpClient, originRequest: Req
3127

3228
const requestOptions = processHttpRequestOptions({
3329
url: originRequest.url,
34-
method: originRequest.method as Method, // Narrow type to omit CONNECT
30+
method: originRequest.method,
3531
headers: originRequest.headers,
3632
proxyUrl: session?.proxyInfo?.url,
3733
sessionToken: session,
@@ -43,6 +39,6 @@ export function createSendRequest(httpClient: BaseHttpClient, originRequest: Req
4339
// Fill in body as the last step - `processHttpRequestOptions` may use either `body`, `json` or `form` so we cannot override it beforehand
4440
requestOptions.body ??= originRequest.payload;
4541

46-
return httpClient.sendRequest<any>(requestOptions) as unknown as GotResponse<Response>;
42+
return httpClient.sendRequest(requestOptions);
4743
};
4844
}

packages/core/src/cookie_utils.ts

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -12,16 +12,14 @@ export interface ResponseLike {
1212
/**
1313
* @internal
1414
*/
15-
export function getCookiesFromResponse(response: ResponseLike): Cookie[] {
16-
const headers = typeof response.headers === 'function' ? response.headers() : response.headers;
17-
const cookieHeader = headers?.['set-cookie'] || '';
15+
export function getCookiesFromResponse(response: Response): Cookie[] {
16+
const headers = response.headers;
17+
const cookieHeaders = headers.getSetCookie();
1818

1919
try {
20-
return Array.isArray(cookieHeader)
21-
? cookieHeader.map((cookie) => Cookie.parse(cookie)!)
22-
: [Cookie.parse(cookieHeader)!];
20+
return cookieHeaders.map((cookie) => Cookie.parse(cookie)!);
2321
} catch (e) {
24-
throw new CookieParseError(cookieHeader);
22+
throw new CookieParseError(cookieHeaders);
2523
}
2624
}
2725

packages/core/src/crawlers/crawler_commons.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import type { Dictionary } from '@crawlee/types';
2-
import type { OptionsInit, Response as GotResponse } from 'got-scraping';
2+
import type { OptionsInit } from 'got-scraping';
33
import type { ReadonlyDeep, SetRequired } from 'type-fest';
44

55
import type { Configuration } from '../configuration.js';
@@ -156,7 +156,7 @@ export interface CrawlingContext<UserData extends Dictionary = Dictionary> exten
156156
* },
157157
* ```
158158
*/
159-
sendRequest<Response = string>(overrideOptions?: Partial<OptionsInit>): Promise<GotResponse<Response>>;
159+
sendRequest(overrideOptions?: Partial<OptionsInit>): Promise<Response>;
160160

161161
/**
162162
* Register a function to be called at the very end of the request handling process. This is useful for resources that should be accessible to error handlers, for instance.

packages/core/src/http_clients/base-http-client.ts

Lines changed: 13 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import type { Readable } from 'node:stream';
22

3+
import type { AllowedHttpMethods } from '@crawlee/types';
34
import { applySearchParams, type SearchParams } from '@crawlee/utils';
45

56
import type { FormDataLike } from './form-data-like.js';
@@ -15,24 +16,6 @@ type Timeout =
1516
}
1617
| { request: number };
1718

18-
type Method =
19-
| 'GET'
20-
| 'POST'
21-
| 'PUT'
22-
| 'PATCH'
23-
| 'HEAD'
24-
| 'DELETE'
25-
| 'OPTIONS'
26-
| 'TRACE'
27-
| 'get'
28-
| 'post'
29-
| 'put'
30-
| 'patch'
31-
| 'head'
32-
| 'delete'
33-
| 'options'
34-
| 'trace';
35-
3619
/**
3720
* Maps permitted values of the `responseType` option on {@apilink HttpRequest} to the types that they produce.
3821
*/
@@ -79,7 +62,7 @@ export interface HttpRequest<TResponseType extends keyof ResponseTypes = 'text'>
7962
[k: string]: unknown; // TODO BC with got - remove in 4.0
8063

8164
url: string | URL;
82-
method?: Method;
65+
method?: AllowedHttpMethods;
8366
headers?: SimpleHeaders;
8467
body?: string | Buffer | Readable | Generator | AsyncGenerator | FormDataLike;
8568

@@ -146,6 +129,14 @@ interface HttpResponseWithoutBody<TResponseType extends keyof ResponseTypes = ke
146129
request: HttpRequest<TResponseType>;
147130
}
148131

132+
export class ResponseWithUrl extends Response {
133+
override url: string;
134+
constructor(body: BodyInit | null, init: ResponseInit & { url?: string }) {
135+
super(body, init);
136+
this.url = init.url ?? '';
137+
}
138+
}
139+
149140
/**
150141
* HTTP response data as returned by the {@apilink BaseHttpClient.sendRequest} method.
151142
*/
@@ -169,7 +160,7 @@ export interface StreamingHttpResponse extends HttpResponseWithoutBody {
169160
* Type of a function called when an HTTP redirect takes place. It is allowed to mutate the `updatedRequest` argument.
170161
*/
171162
export type RedirectHandler = (
172-
redirectResponse: BaseHttpResponseData,
163+
redirectResponse: Response,
173164
updatedRequest: { url?: string | URL; headers: SimpleHeaders },
174165
) => void;
175166

@@ -182,12 +173,12 @@ export interface BaseHttpClient {
182173
*/
183174
sendRequest<TResponseType extends keyof ResponseTypes = 'text'>(
184175
request: HttpRequest<TResponseType>,
185-
): Promise<HttpResponse<TResponseType>>;
176+
): Promise<Response>;
186177

187178
/**
188179
* Perform an HTTP Request and return after the response headers are received. The body may be read from a stream contained in the response.
189180
*/
190-
stream(request: HttpRequest, onRedirect?: RedirectHandler): Promise<StreamingHttpResponse>;
181+
stream(request: HttpRequest, onRedirect?: RedirectHandler): Promise<Response>;
191182
}
192183

193184
/**

packages/core/src/http_clients/got-scraping-http-client.ts

Lines changed: 62 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,40 @@
1+
import { Readable } from 'node:stream';
2+
13
import type { Options, PlainResponse } from 'got-scraping';
24
import { gotScraping } from 'got-scraping';
35

4-
import type {
5-
BaseHttpClient,
6-
HttpRequest,
7-
HttpResponse,
8-
RedirectHandler,
9-
ResponseTypes,
10-
StreamingHttpResponse,
6+
import {
7+
type BaseHttpClient,
8+
type HttpRequest,
9+
type RedirectHandler,
10+
type ResponseTypes,
11+
ResponseWithUrl,
1112
} from './base-http-client.js';
1213

1314
/**
1415
* A HTTP client implementation based on the `got-scraping` library.
1516
*/
1617
export class GotScrapingHttpClient implements BaseHttpClient {
18+
/**
19+
* Type guard that validates the HTTP method (excluding CONNECT).
20+
* @param request - The HTTP request to validate
21+
*/
22+
private validateRequest<TResponseType extends keyof ResponseTypes, T extends HttpRequest<TResponseType>>(
23+
request: T,
24+
): request is T & { method: Exclude<T['method'], 'CONNECT' | 'connect'> } {
25+
return !['CONNECT', 'connect'].includes(request.method!);
26+
}
27+
1728
/**
1829
* @inheritDoc
1930
*/
2031
async sendRequest<TResponseType extends keyof ResponseTypes>(
2132
request: HttpRequest<TResponseType>,
22-
): Promise<HttpResponse<TResponseType>> {
33+
): Promise<Response> {
34+
if (!this.validateRequest(request)) {
35+
throw new Error(`The HTTP method CONNECT is not supported by the GotScrapingHttpClient.`);
36+
}
37+
2338
const gotResult = await gotScraping({
2439
...request,
2540
retry: {
@@ -28,23 +43,45 @@ export class GotScrapingHttpClient implements BaseHttpClient {
2843
},
2944
});
3045

31-
return {
32-
...gotResult,
33-
body: gotResult.body as ResponseTypes[TResponseType],
34-
request: { url: request.url, ...gotResult.request },
35-
};
46+
const parsedHeaders = Object.entries(gotResult.headers)
47+
.map(([key, value]) => {
48+
if (value === undefined) return [];
49+
50+
if (Array.isArray(value)) {
51+
return value.map((v) => [key, v]);
52+
}
53+
54+
return [[key, value]];
55+
})
56+
.flat() as [string, string][];
57+
58+
return new ResponseWithUrl(new Uint8Array(gotResult.rawBody), {
59+
headers: new Headers(parsedHeaders),
60+
status: gotResult.statusCode,
61+
statusText: gotResult.statusMessage ?? '',
62+
url: gotResult.url,
63+
});
3664
}
3765

3866
/**
3967
* @inheritDoc
4068
*/
41-
async stream(request: HttpRequest, handleRedirect?: RedirectHandler): Promise<StreamingHttpResponse> {
69+
async stream(request: HttpRequest, handleRedirect?: RedirectHandler): Promise<Response> {
70+
if (!this.validateRequest(request)) {
71+
throw new Error(`The HTTP method CONNECT is not supported by the GotScrapingHttpClient.`);
72+
}
4273
// eslint-disable-next-line no-async-promise-executor
4374
return new Promise(async (resolve, reject) => {
4475
const stream = gotScraping({ ...request, isStream: true });
4576

46-
stream.on('redirect', (updatedOptions: Options, redirectResponse: PlainResponse) => {
47-
handleRedirect?.(redirectResponse, updatedOptions);
77+
stream.on('redirect', (updatedOptions: Options, redirectResponse: any) => {
78+
const nativeRedirectResponse = new ResponseWithUrl(redirectResponse.rawBody, {
79+
headers: redirectResponse.headers,
80+
status: redirectResponse.statusCode,
81+
statusText: redirectResponse.statusMessage,
82+
url: redirectResponse.url,
83+
});
84+
handleRedirect?.(nativeRedirectResponse, updatedOptions);
4885
});
4986

5087
// We need to end the stream for DELETE requests, otherwise it will hang.
@@ -55,37 +92,15 @@ export class GotScrapingHttpClient implements BaseHttpClient {
5592
stream.on('error', reject);
5693

5794
stream.on('response', (response: PlainResponse) => {
58-
const result: StreamingHttpResponse = {
59-
stream,
60-
request,
61-
redirectUrls: response.redirectUrls,
62-
url: response.url,
63-
ip: response.ip,
64-
statusCode: response.statusCode,
65-
headers: response.headers,
66-
trailers: response.trailers,
67-
complete: response.complete,
68-
get downloadProgress() {
69-
return stream.downloadProgress;
70-
},
71-
get uploadProgress() {
72-
return stream.uploadProgress;
73-
},
74-
};
75-
76-
Object.assign(result, response); // TODO BC - remove in 4.0
77-
78-
resolve(result);
79-
80-
stream.on('end', () => {
81-
result.complete = response.complete;
82-
83-
result.trailers ??= {};
84-
Object.assign(result.trailers, response.trailers);
85-
86-
(result as any).rawTrailers ??= []; // TODO BC - remove in 4.0
87-
Object.assign((result as any).rawTrailers, response.rawTrailers);
88-
});
95+
// Cast shouldn't be needed here, undici might have a different `ReadableStream` type
96+
resolve(
97+
new ResponseWithUrl(Readable.toWeb(stream) as any, {
98+
status: response.statusCode,
99+
statusText: response.statusMessage ?? '',
100+
headers: response.headers as HeadersInit,
101+
url: response.url,
102+
}),
103+
);
89104
});
90105
});
91106
}

packages/core/src/request.ts

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ export enum RequestState {
7979
* ```
8080
* @category Sources
8181
*/
82-
export class Request<UserData extends Dictionary = Dictionary> {
82+
class CrawleeRequest<UserData extends Dictionary = Dictionary> {
8383
/** Request ID */
8484
id?: string;
8585

@@ -193,7 +193,8 @@ export class Request<UserData extends Dictionary = Dictionary> {
193193
this.url = url;
194194
this.loadedUrl = loadedUrl;
195195
this.uniqueKey =
196-
uniqueKey || Request.computeUniqueKey({ url, method, payload, keepUrlFragment, useExtendedUniqueKey });
196+
uniqueKey ||
197+
CrawleeRequest.computeUniqueKey({ url, method, payload, keepUrlFragment, useExtendedUniqueKey });
197198
this.method = method;
198199
this.payload = payload;
199200
this.noRetry = noRetry;
@@ -255,6 +256,18 @@ export class Request<UserData extends Dictionary = Dictionary> {
255256
}
256257
}
257258

259+
/**
260+
* Converts the Crawlee Request object to a `fetch` API Request object.
261+
* @returns The native `fetch` API Request object.
262+
*/
263+
public intoFetchAPIRequest(): Request {
264+
return new Request(this.url, {
265+
method: this.method,
266+
headers: this.headers,
267+
body: this.payload,
268+
});
269+
}
270+
258271
/** Tells the crawler processing this request to skip the navigation and process the request directly. */
259272
get skipNavigation(): boolean {
260273
return this.userData.__crawlee?.skipNavigation ?? false;
@@ -398,7 +411,7 @@ export class Request<UserData extends Dictionary = Dictionary> {
398411
}
399412
return normalizedUrl;
400413
}
401-
const payloadHash = payload ? Request.hashPayload(payload) : '';
414+
const payloadHash = payload ? CrawleeRequest.hashPayload(payload) : '';
402415
return `${normalizedMethod}(${payloadHash}):${normalizedUrl}`;
403416
}
404417

@@ -526,10 +539,12 @@ interface ComputeUniqueKeyOptions {
526539
useExtendedUniqueKey?: boolean;
527540
}
528541

529-
export type Source = (Partial<RequestOptions> & { requestsFromUrl?: string; regex?: RegExp }) | Request;
542+
export type Source = (Partial<RequestOptions> & { requestsFromUrl?: string; regex?: RegExp }) | CrawleeRequest;
530543

531544
/** @internal */
532545
export interface InternalSource {
533546
requestsFromUrl: string;
534547
regex?: RegExp;
535548
}
549+
550+
export { CrawleeRequest as Request };

0 commit comments

Comments
 (0)