Commit 892cd9d

Merge pull request #148 from kaibadash/issues/147

fix: #147 Set `purgeOnStart: true` to process multiple sites as a server

2 parents: 6a417bf + 5a2a565

File tree: 2 files changed, +80 −76 lines
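
What the fix does: Crawlee's `PlaywrightCrawler` constructor accepts an optional second argument, a `Configuration` instance. Passing one with `purgeOnStart: true` purges the default storages (the request queue and dataset under ./storage) before each run, so a long-lived server process that crawls several sites in sequence does not replay or skip URLs left over from the previous site. A minimal sketch of the pattern, separate from the project code (the function name, handler body, and option values are illustrative, not the project's):

import { Configuration, PlaywrightCrawler } from "crawlee";

// Sketch: a crawler whose default storages are purged at the start of
// every run, so back-to-back crawls in one process stay independent.
export async function crawlOnce(startUrl: string): Promise<void> {
  const crawler = new PlaywrightCrawler(
    {
      async requestHandler({ request, page, enqueueLinks, pushData }) {
        // Store the page title, then follow links from the page.
        await pushData({ title: await page.title(), url: request.loadedUrl });
        await enqueueLinks();
      },
      maxRequestsPerCrawl: 10, // illustrative cap
    },
    // Per-crawler Configuration: purge ./storage before this run starts.
    new Configuration({ purgeOnStart: true }),
  );
  await crawler.run([startUrl]);
}

Scoping the option to a per-crawler `Configuration`, rather than mutating the global configuration, keeps the purge behavior local to this crawler instance.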

CHANGELOG.md

+2 −4

@@ -1,14 +1,12 @@
 # [1.4.0](https://github.com/BuilderIO/gpt-crawler/compare/v1.3.0...v1.4.0) (2024-01-15)
 
-
 ### Bug Fixes
 
-* linting ([0f4e58b](https://github.com/BuilderIO/gpt-crawler/commit/0f4e58b400eab312e7b595d7a2472bae93055415))
-
+- linting ([0f4e58b](https://github.com/BuilderIO/gpt-crawler/commit/0f4e58b400eab312e7b595d7a2472bae93055415))
 
 ### Features
 
-* add server api readme docs ([717e625](https://github.com/BuilderIO/gpt-crawler/commit/717e625f47257bdbd96437acb7242bcd28c233ba))
+- add server api readme docs ([717e625](https://github.com/BuilderIO/gpt-crawler/commit/717e625f47257bdbd96437acb7242bcd28c233ba))
 
 # [1.3.0](https://github.com/BuilderIO/gpt-crawler/compare/v1.2.1...v1.3.0) (2024-01-06)
 
src/core.ts

+78 −72

@@ -1,5 +1,5 @@
 // For more information, see https://crawlee.dev/
-import { PlaywrightCrawler, downloadListOfUrls } from "crawlee";
+import { Configuration, PlaywrightCrawler, downloadListOfUrls } from "crawlee";
 import { readFile, writeFile } from "fs/promises";
 import { glob } from "glob";
 import { Config, configSchema } from "./config.js";
@@ -54,83 +54,89 @@ export async function crawl(config: Config) {
   if (process.env.NO_CRAWL !== "true") {
     // PlaywrightCrawler crawls the web using a headless
     // browser controlled by the Playwright library.
-    crawler = new PlaywrightCrawler({
-      // Use the requestHandler to process each of the crawled pages.
-      async requestHandler({ request, page, enqueueLinks, log, pushData }) {
-        const title = await page.title();
-        pageCounter++;
-        log.info(
-          `Crawling: Page ${pageCounter} / ${config.maxPagesToCrawl} - URL: ${request.loadedUrl}...`,
-        );
-
-        // Use custom handling for XPath selector
-        if (config.selector) {
-          if (config.selector.startsWith("/")) {
-            await waitForXPath(
-              page,
-              config.selector,
-              config.waitForSelectorTimeout ?? 1000,
-            );
-          } else {
-            await page.waitForSelector(config.selector, {
-              timeout: config.waitForSelectorTimeout ?? 1000,
-            });
-          }
-        }
+    crawler = new PlaywrightCrawler(
+      {
+        // Use the requestHandler to process each of the crawled pages.
+        async requestHandler({ request, page, enqueueLinks, log, pushData }) {
+          const title = await page.title();
+          pageCounter++;
+          log.info(
+            `Crawling: Page ${pageCounter} / ${config.maxPagesToCrawl} - URL: ${request.loadedUrl}...`,
+          );
 
-        const html = await getPageHtml(page, config.selector);
+          // Use custom handling for XPath selector
+          if (config.selector) {
+            if (config.selector.startsWith("/")) {
+              await waitForXPath(
+                page,
+                config.selector,
+                config.waitForSelectorTimeout ?? 1000,
+              );
+            } else {
+              await page.waitForSelector(config.selector, {
+                timeout: config.waitForSelectorTimeout ?? 1000,
+              });
+            }
+          }
 
-        // Save results as JSON to ./storage/datasets/default
-        await pushData({ title, url: request.loadedUrl, html });
+          const html = await getPageHtml(page, config.selector);
 
-        if (config.onVisitPage) {
-          await config.onVisitPage({ page, pushData });
-        }
+          // Save results as JSON to ./storage/datasets/default
+          await pushData({ title, url: request.loadedUrl, html });
 
-        // Extract links from the current page
-        // and add them to the crawling queue.
-        await enqueueLinks({
-          globs:
-            typeof config.match === "string" ? [config.match] : config.match,
-          exclude:
-            typeof config.exclude === "string"
-              ? [config.exclude]
-              : config.exclude ?? [],
-        });
-      },
-      // Comment this option to scrape the full website.
-      maxRequestsPerCrawl: config.maxPagesToCrawl,
-      // Uncomment this option to see the browser window.
-      // headless: false,
-      preNavigationHooks: [
-        // Abort requests for certain resource types
-        async ({ request, page, log }) => {
-          // If there are no resource exclusions, return
-          const RESOURCE_EXCLUSTIONS = config.resourceExclusions ?? [];
-          if (RESOURCE_EXCLUSTIONS.length === 0) {
-            return;
-          }
-          if (config.cookie) {
-            const cookies = (
-              Array.isArray(config.cookie) ? config.cookie : [config.cookie]
-            ).map((cookie) => {
-              return {
-                name: cookie.name,
-                value: cookie.value,
-                url: request.loadedUrl,
-              };
-            });
-            await page.context().addCookies(cookies);
+          if (config.onVisitPage) {
+            await config.onVisitPage({ page, pushData });
           }
-          await page.route(`**\/*.{${RESOURCE_EXCLUSTIONS.join()}}`, (route) =>
-            route.abort("aborted"),
-          );
-          log.info(
-            `Aborting requests for as this is a resource excluded route`,
-          );
+
+          // Extract links from the current page
+          // and add them to the crawling queue.
+          await enqueueLinks({
+            globs:
+              typeof config.match === "string" ? [config.match] : config.match,
+            exclude:
+              typeof config.exclude === "string"
+                ? [config.exclude]
+                : config.exclude ?? [],
+          });
         },
-      ],
-    });
+        // Comment this option to scrape the full website.
+        maxRequestsPerCrawl: config.maxPagesToCrawl,
+        // Uncomment this option to see the browser window.
+        // headless: false,
+        preNavigationHooks: [
+          // Abort requests for certain resource types
+          async ({ request, page, log }) => {
+            // If there are no resource exclusions, return
+            const RESOURCE_EXCLUSTIONS = config.resourceExclusions ?? [];
+            if (RESOURCE_EXCLUSTIONS.length === 0) {
+              return;
+            }
+            if (config.cookie) {
+              const cookies = (
+                Array.isArray(config.cookie) ? config.cookie : [config.cookie]
+              ).map((cookie) => {
+                return {
+                  name: cookie.name,
+                  value: cookie.value,
+                  url: request.loadedUrl,
+                };
+              });
+              await page.context().addCookies(cookies);
+            }
+            await page.route(
+              `**\/*.{${RESOURCE_EXCLUSTIONS.join()}}`,
+              (route) => route.abort("aborted"),
+            );
+            log.info(
+              `Aborting requests for as this is a resource excluded route`,
+            );
+          },
+        ],
+      },
+      new Configuration({
+        purgeOnStart: true,
+      }),
+    );
 
     const isUrlASitemap = /sitemap.*\.xml$/.test(config.url);
 
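For context, the `preNavigationHooks` entry that moved in this hunk aborts requests for excluded resource types via Playwright's `page.route`. A standalone sketch of that mechanism, outside Crawlee (the extension list and target URL are illustrative):

import { chromium } from "playwright";

// Sketch: abort any request whose URL ends in an excluded extension,
// so pages load without assets the crawler does not need.
async function demoResourceExclusion(): Promise<void> {
  const exclusions = ["png", "jpg", "jpeg", "gif", "svg", "css", "woff2"];
  const browser = await chromium.launch();
  const page = await browser.newPage();
  // Brace alternation in the glob matches any one of the extensions.
  await page.route(`**/*.{${exclusions.join()}}`, (route) =>
    route.abort("aborted"),
  );
  await page.goto("https://example.com");
  await browser.close();
}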