|
1 | 1 | // For more information, see https://crawlee.dev/
|
2 |
| -import { PlaywrightCrawler, downloadListOfUrls } from "crawlee"; |
| 2 | +import { Configuration, PlaywrightCrawler, downloadListOfUrls } from "crawlee"; |
3 | 3 | import { readFile, writeFile } from "fs/promises";
|
4 | 4 | import { glob } from "glob";
|
5 | 5 | import { Config, configSchema } from "./config.js";
|
@@ -54,83 +54,89 @@ export async function crawl(config: Config) {
|
54 | 54 | if (process.env.NO_CRAWL !== "true") {
|
55 | 55 | // PlaywrightCrawler crawls the web using a headless
|
56 | 56 | // browser controlled by the Playwright library.
|
57 |
| - crawler = new PlaywrightCrawler({ |
58 |
| - // Use the requestHandler to process each of the crawled pages. |
59 |
| - async requestHandler({ request, page, enqueueLinks, log, pushData }) { |
60 |
| - const title = await page.title(); |
61 |
| - pageCounter++; |
62 |
| - log.info( |
63 |
| - `Crawling: Page ${pageCounter} / ${config.maxPagesToCrawl} - URL: ${request.loadedUrl}...`, |
64 |
| - ); |
65 |
| - |
66 |
| - // Use custom handling for XPath selector |
67 |
| - if (config.selector) { |
68 |
| - if (config.selector.startsWith("/")) { |
69 |
| - await waitForXPath( |
70 |
| - page, |
71 |
| - config.selector, |
72 |
| - config.waitForSelectorTimeout ?? 1000, |
73 |
| - ); |
74 |
| - } else { |
75 |
| - await page.waitForSelector(config.selector, { |
76 |
| - timeout: config.waitForSelectorTimeout ?? 1000, |
77 |
| - }); |
78 |
| - } |
79 |
| - } |
| 57 | + crawler = new PlaywrightCrawler( |
| 58 | + { |
| 59 | + // Use the requestHandler to process each of the crawled pages. |
| 60 | + async requestHandler({ request, page, enqueueLinks, log, pushData }) { |
| 61 | + const title = await page.title(); |
| 62 | + pageCounter++; |
| 63 | + log.info( |
| 64 | + `Crawling: Page ${pageCounter} / ${config.maxPagesToCrawl} - URL: ${request.loadedUrl}...`, |
| 65 | + ); |
80 | 66 |
|
81 |
| - const html = await getPageHtml(page, config.selector); |
| 67 | + // Use custom handling for XPath selector |
| 68 | + if (config.selector) { |
| 69 | + if (config.selector.startsWith("/")) { |
| 70 | + await waitForXPath( |
| 71 | + page, |
| 72 | + config.selector, |
| 73 | + config.waitForSelectorTimeout ?? 1000, |
| 74 | + ); |
| 75 | + } else { |
| 76 | + await page.waitForSelector(config.selector, { |
| 77 | + timeout: config.waitForSelectorTimeout ?? 1000, |
| 78 | + }); |
| 79 | + } |
| 80 | + } |
82 | 81 |
|
83 |
| - // Save results as JSON to ./storage/datasets/default |
84 |
| - await pushData({ title, url: request.loadedUrl, html }); |
| 82 | + const html = await getPageHtml(page, config.selector); |
85 | 83 |
|
86 |
| - if (config.onVisitPage) { |
87 |
| - await config.onVisitPage({ page, pushData }); |
88 |
| - } |
| 84 | + // Save results as JSON to ./storage/datasets/default |
| 85 | + await pushData({ title, url: request.loadedUrl, html }); |
89 | 86 |
|
90 |
| - // Extract links from the current page |
91 |
| - // and add them to the crawling queue. |
92 |
| - await enqueueLinks({ |
93 |
| - globs: |
94 |
| - typeof config.match === "string" ? [config.match] : config.match, |
95 |
| - exclude: |
96 |
| - typeof config.exclude === "string" |
97 |
| - ? [config.exclude] |
98 |
| - : config.exclude ?? [], |
99 |
| - }); |
100 |
| - }, |
101 |
| - // Comment this option to scrape the full website. |
102 |
| - maxRequestsPerCrawl: config.maxPagesToCrawl, |
103 |
| - // Uncomment this option to see the browser window. |
104 |
| - // headless: false, |
105 |
| - preNavigationHooks: [ |
106 |
| - // Abort requests for certain resource types |
107 |
| - async ({ request, page, log }) => { |
108 |
| - // If there are no resource exclusions, return |
109 |
| - const RESOURCE_EXCLUSTIONS = config.resourceExclusions ?? []; |
110 |
| - if (RESOURCE_EXCLUSTIONS.length === 0) { |
111 |
| - return; |
112 |
| - } |
113 |
| - if (config.cookie) { |
114 |
| - const cookies = ( |
115 |
| - Array.isArray(config.cookie) ? config.cookie : [config.cookie] |
116 |
| - ).map((cookie) => { |
117 |
| - return { |
118 |
| - name: cookie.name, |
119 |
| - value: cookie.value, |
120 |
| - url: request.loadedUrl, |
121 |
| - }; |
122 |
| - }); |
123 |
| - await page.context().addCookies(cookies); |
| 87 | + if (config.onVisitPage) { |
| 88 | + await config.onVisitPage({ page, pushData }); |
124 | 89 | }
|
125 |
| - await page.route(`**\/*.{${RESOURCE_EXCLUSTIONS.join()}}`, (route) => |
126 |
| - route.abort("aborted"), |
127 |
| - ); |
128 |
| - log.info( |
129 |
| - `Aborting requests for as this is a resource excluded route`, |
130 |
| - ); |
| 90 | + |
| 91 | + // Extract links from the current page |
| 92 | + // and add them to the crawling queue. |
| 93 | + await enqueueLinks({ |
| 94 | + globs: |
| 95 | + typeof config.match === "string" ? [config.match] : config.match, |
| 96 | + exclude: |
| 97 | + typeof config.exclude === "string" |
| 98 | + ? [config.exclude] |
| 99 | + : config.exclude ?? [], |
| 100 | + }); |
131 | 101 | },
|
132 |
| - ], |
133 |
| - }); |
| 102 | + // Comment this option to scrape the full website. |
| 103 | + maxRequestsPerCrawl: config.maxPagesToCrawl, |
| 104 | + // Uncomment this option to see the browser window. |
| 105 | + // headless: false, |
| 106 | + preNavigationHooks: [ |
| 107 | + // Abort requests for certain resource types |
| 108 | + async ({ request, page, log }) => { |
| 109 | + // If there are no resource exclusions, return |
| 110 | + const RESOURCE_EXCLUSTIONS = config.resourceExclusions ?? []; |
| 111 | + if (RESOURCE_EXCLUSTIONS.length === 0) { |
| 112 | + return; |
| 113 | + } |
| 114 | + if (config.cookie) { |
| 115 | + const cookies = ( |
| 116 | + Array.isArray(config.cookie) ? config.cookie : [config.cookie] |
| 117 | + ).map((cookie) => { |
| 118 | + return { |
| 119 | + name: cookie.name, |
| 120 | + value: cookie.value, |
| 121 | + url: request.loadedUrl, |
| 122 | + }; |
| 123 | + }); |
| 124 | + await page.context().addCookies(cookies); |
| 125 | + } |
| 126 | + await page.route( |
| 127 | + `**\/*.{${RESOURCE_EXCLUSTIONS.join()}}`, |
| 128 | + (route) => route.abort("aborted"), |
| 129 | + ); |
| 130 | + log.info( |
| 131 | + `Aborting requests for as this is a resource excluded route`, |
| 132 | + ); |
| 133 | + }, |
| 134 | + ], |
| 135 | + }, |
| 136 | + new Configuration({ |
| 137 | + purgeOnStart: true, |
| 138 | + }), |
| 139 | + ); |
134 | 140 |
|
135 | 141 | const isUrlASitemap = /sitemap.*\.xml$/.test(config.url);
|
136 | 142 |
|
|
0 commit comments