Skip to content

Commit 73dfaef

Browse files
authored
Merge pull request #170 from Aure7138/proxy
feat: proxy support
2 parents b62a002 + cf03b8a commit 73dfaef

File tree

4 files changed

+17
-3
lines changed

4 files changed

+17
-3
lines changed

CHANGELOG.md

+1-2
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,8 @@
11
# [1.5.0](https://github.com/BuilderIO/gpt-crawler/compare/v1.4.0...v1.5.0) (2024-07-05)
22

3-
43
### Features
54

6-
* git clone depth limit in docker ([87767db](https://github.com/BuilderIO/gpt-crawler/commit/87767dbda99b3259d44ec2c02dceb3a59bb2ca3c))
5+
- git clone depth limit in docker ([87767db](https://github.com/BuilderIO/gpt-crawler/commit/87767dbda99b3259d44ec2c02dceb3a59bb2ca3c))
76

87
# [1.4.0](https://github.com/BuilderIO/gpt-crawler/compare/v1.3.0...v1.4.0) (2024-01-15)
98

config.ts

+1
Original file line numberDiff line numberDiff line change
@@ -6,4 +6,5 @@ export const defaultConfig: Config = {
66
maxPagesToCrawl: 50,
77
outputFileName: "output.json",
88
maxTokens: 2000000,
9+
// proxyUrls: ["http://username:password@proxyserver:port"], // socks5://username:password@proxyserver:port
910
};

src/config.ts

+4
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,10 @@ export const configSchema = z.object({
8585
* @example 5000
8686
*/
8787
maxTokens: z.number().int().positive().optional(),
88+
/** Optional proxy server
89+
* @example ['http://username:password@proxyserver:port', 'socks5://username:password@proxyserver:port']
90+
*/
91+
proxyUrls: z.array(z.string()).optional(),
8892
});
8993

9094
export type Config = z.infer<typeof configSchema>;

src/core.ts

+11-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,10 @@
11
// For more information, see https://crawlee.dev/
2-
import { Configuration, PlaywrightCrawler, downloadListOfUrls } from "crawlee";
2+
import {
3+
Configuration,
4+
PlaywrightCrawler,
5+
ProxyConfiguration,
6+
downloadListOfUrls,
7+
} from "crawlee";
38
import { readFile, writeFile } from "fs/promises";
49
import { glob } from "glob";
510
import { Config, configSchema } from "./config.js";
@@ -54,8 +59,13 @@ export async function crawl(config: Config) {
5459
if (process.env.NO_CRAWL !== "true") {
5560
// PlaywrightCrawler crawls the web using a headless
5661
// browser controlled by the Playwright library.
62+
const proxyConfiguration = new ProxyConfiguration({
63+
proxyUrls: config.proxyUrls,
64+
});
65+
5766
crawler = new PlaywrightCrawler(
5867
{
68+
proxyConfiguration,
5969
// Use the requestHandler to process each of the crawled pages.
6070
async requestHandler({ request, page, enqueueLinks, log, pushData }) {
6171
const title = await page.title();

0 commit comments

Comments (0)