Skip to content

Commit f983d36

Browse files
committed
fix: shrink HTML with cheerio
1 parent 5ff9938 commit f983d36

File tree

4 files changed

+67
-80
lines changed

4 files changed

+67
-80
lines changed

code/package-lock.json

Lines changed: 54 additions & 49 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

code/package.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
"ajv-formats": "^2.1.1",
1414
"apify": "^3.1.16",
1515
"apify-client": "^2.9.3",
16+
"cheerio": "^1.0.0",
1617
"crawlee": "^3.8.1",
1718
"gpt-3-encoder": "^1.1.4",
1819
"joplin-turndown-plugin-gfm": "^1.0.12",

code/src/processors.ts

Lines changed: 11 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,50 +1,31 @@
1+
import { load } from 'cheerio';
12
import { encode } from 'gpt-3-encoder';
2-
import { Page } from 'playwright';
33

44
import { htmlToMarkdownProcessor } from './markdown.js';
55

66
const JSON_REGEX = /\{(?:[^{}]|())*\}/;
77

88
/**
99
* Shrinks HTML by removing css targeted elements and extra spaces
10-
* @param html
1110
*/
1211
export const shrinkHtml = async (
1312
html: string,
14-
page: Page,
1513
options: { removeLinkUrls: boolean; removeElementsCssSelector?: string },
1614
) => {
1715
const { removeElementsCssSelector, removeLinkUrls } = options;
1816

19-
const stripped = await page.evaluate(
20-
// eslint-disable-next-line @typescript-eslint/no-shadow
21-
([unstripped, removeSelector, removeLinkUrls]) => {
22-
const doc = new DOMParser().parseFromString(unstripped, 'text/html');
23-
if (removeSelector) {
24-
const elements = doc.querySelectorAll(removeSelector);
25-
for (const element of elements) {
26-
// there have been some cases when the page's own scripts cause errors and running this line
27-
// causes them to reemerge, so wrap in try/catch
28-
try {
29-
element.remove();
30-
} catch (err) {
31-
/* ignore */
32-
}
33-
}
34-
}
17+
const $ = load(html);
3518

36-
if (removeLinkUrls) {
37-
const linkEls = doc.querySelectorAll('a');
38-
for (const linkEl of linkEls) {
39-
linkEl.removeAttribute('href');
40-
}
41-
}
19+
if (removeElementsCssSelector) {
20+
$(removeElementsCssSelector).map((_, el) => $(el).remove());
21+
}
22+
if (removeLinkUrls) {
23+
$('a').map((_, el) => $(el).removeAttr('href'));
24+
}
4225

43-
return doc.documentElement.outerHTML;
44-
},
45-
[html, removeElementsCssSelector, removeLinkUrls] as const,
46-
);
47-
return stripped.replace(/\s{2,}/g, ' ') // remove extra spaces
26+
const stripped = $.html();
27+
return stripped
28+
.replace(/\s{2,}/g, ' ') // remove extra spaces
4829
.replace(/>\s+</g, '><'); // remove all spaces between tags
4930
};
5031

code/src/routes/crawl-route.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,7 @@ export const crawlRoute = async (context: PlaywrightCrawlingContext) => {
114114
originContentHtml = await page.content();
115115
}
116116

117-
const shrunkHtml = await shrinkHtml(originContentHtml, page, { removeLinkUrls, removeElementsCssSelector });
117+
const shrunkHtml = await shrinkHtml(originContentHtml, { removeLinkUrls, removeElementsCssSelector });
118118
const originPageContent = pageFormat === PAGE_FORMAT.MARKDOWN ? htmlToMarkdown(shrunkHtml) : shrunkHtml;
119119

120120
const instructionTokenLength = getNumberOfTextTokens(instructions);

0 commit comments

Comments
 (0)