Skip to content
11 changes: 11 additions & 0 deletions packages/core/src/http_clients/got-scraping-http-client.ts
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,17 @@ export class GotScrapingHttpClient implements BaseHttpClient {
stream.on('error', reject);

stream.on('response', (response: PlainResponse) => {
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The 'response' event is emitted once the HTTP headers arrive, which is a long time after the TLS connection has been established (and the unexpected_message alert has likely been received).

// Handle socket errors to prevent unhandled TLS errors from crashing the process.
// TLS errors are emitted on the underlying socket (response.socket), not the got stream.
// Without this handler, errors like ERR_SSL_SSLV3_ALERT_UNEXPECTED_MESSAGE crash the process.
// Using `once` to avoid memory leaks from accumulated listeners on pooled sockets.
const socket = (response as any).socket;
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, this is unnecessary even for higher-level errors (like the closed socket in the new test). Internal got error handling already handles these errors.

See that the test suite will pass even without these changes here.

// NOTE(review): per review feedback this guard may be ineffective for TLS
// handshake failures — the 'response' event fires only once HTTP headers
// arrive, long after the TLS connection was established (and any
// unexpected_message alert likely received). got's internal error handling
// reportedly already covers later socket errors. TODO confirm whether this
// handler is reachable/needed before relying on it.
if (socket && typeof socket.once === 'function') {
    socket.once('error', (err: Error) => {
        // Surface the socket-level failure through the got stream.
        stream.destroy(err);
    });
}

const result: StreamingHttpResponse = {
stream,
request,
Expand Down
140 changes: 140 additions & 0 deletions test/core/crawlers/socket_error_handling.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
import http from 'node:http';
import type { AddressInfo, Socket } from 'node:net';

import { GotScrapingHttpClient, HttpCrawler } from '@crawlee/http';
import { MemoryStorageEmulator } from 'test/shared/MemoryStorageEmulator';

// Local HTTP server used as the crawl target; created in beforeAll.
let server: http.Server;
// Base URL ("http://127.0.0.1:<port>") of the test server, set once it listens.
let url: string;

// Every live connection to the test server, so afterAll can force-close stragglers.
const sockets = new Set<Socket>();

// Path -> handler table consulted by the test server's request listener.
const router = new Map<string, http.RequestListener>();

// Healthy endpoint: serves a small, complete HTML page.
router.set('/ok', (_req, res) => {
    const page = '<html><body>OK</body></html>';
    res.setHeader('content-type', 'text/html; charset=utf-8');
    res.end(page);
});

// Sends headers plus a partial body, then kills the connection to simulate a
// mid-response failure — the TLS-error scenario where the socket dies after
// the 'response' event has already fired on the client.
router.set('/destroy-socket-after-headers', (req, res) => {
    res.setHeader('content-type', 'text/html; charset=utf-8');
    // Advertise far more bytes than will ever be sent.
    res.setHeader('content-length', '10000');
    res.write('<html>');

    // Give the client time to receive the headers before tearing the socket down.
    const teardownDelayMs = 50;
    setTimeout(() => req.socket.destroy(), teardownDelayMs);
});

beforeAll(async () => {
    // Tiny router-driven HTTP server: dispatches by pathname, 404s anything
    // unknown, and hard-drops the connection if the handler itself throws.
    server = http.createServer((request, response) => {
        try {
            const { pathname } = new URL(request.url!, 'http://localhost');
            const handle = router.get(pathname);
            if (!handle) {
                response.statusCode = 404;
                response.end();
                return;
            }
            handle(request, response);
        } catch {
            response.destroy();
        }
    });

    // Track open sockets so afterAll can destroy any that outlive their requests.
    server.on('connection', (socket) => {
        sockets.add(socket);
        socket.on('close', () => sockets.delete(socket));
    });

    // Listen on an OS-assigned port and record the resulting base URL.
    await new Promise<void>((resolve) => {
        server.listen(() => {
            const { port } = server.address() as AddressInfo;
            url = `http://127.0.0.1:${port}`;
            resolve();
        });
    });
});

// In-memory storage backend so crawler runs leave no artifacts on disk.
const localStorageEmulator = new MemoryStorageEmulator();

beforeEach(async () => {
    // Reset storage state between tests so runs stay independent.
    await localStorageEmulator.init();
});

afterAll(async () => {
for (const socket of sockets) {
socket.destroy();
}
await new Promise((resolve) => server.close(resolve));
await localStorageEmulator.destroy();
});

describe('HttpCrawler socket error handling', () => {
    // Every crawler below runs with no retries and single concurrency so each
    // request's outcome (success body / failure error) is recorded exactly once.

    test('should handle mid-response socket destruction gracefully without crashing', async () => {
        const failures: Error[] = [];

        const crawler = new HttpCrawler({
            httpClient: new GotScrapingHttpClient(),
            maxRequestRetries: 0,
            maxConcurrency: 1,
            requestHandler: () => {
                // Intentionally empty — the request is expected to fail before this runs.
            },
            failedRequestHandler: (_ctx, error) => {
                failures.push(error as Error);
            },
        });

        await crawler.run([`${url}/destroy-socket-after-headers`]);

        // Reaching this line at all means the process survived the socket error;
        // the failed request must have been routed to failedRequestHandler.
        expect(failures.length).toBe(1);
    });

    test('normal requests still work correctly', async () => {
        const bodies: string[] = [];

        const crawler = new HttpCrawler({
            httpClient: new GotScrapingHttpClient(),
            maxRequestRetries: 0,
            maxConcurrency: 1,
            requestHandler: ({ body }) => {
                bodies.push(body.toString());
            },
        });

        await crawler.run([`${url}/ok`]);

        expect(bodies.length).toBe(1);
        expect(bodies[0]).toContain('OK');
    });

    test('crawler recovers after socket error and processes next request', async () => {
        const bodies: string[] = [];
        const failures: Error[] = [];

        const crawler = new HttpCrawler({
            httpClient: new GotScrapingHttpClient(),
            maxRequestRetries: 0,
            maxConcurrency: 1,
            requestHandler: ({ body }) => {
                bodies.push(body.toString());
            },
            failedRequestHandler: (_ctx, error) => {
                failures.push(error as Error);
            },
        });

        await crawler.run([`${url}/destroy-socket-after-headers`, `${url}/ok`]);

        // The broken endpoint fails, the healthy one succeeds — and the
        // process itself never crashes in between.
        expect(failures.length).toBe(1);
        expect(bodies.length).toBe(1);
        expect(bodies[0]).toContain('OK');
    });
});
Loading