Skip to content

Commit

Permalink
fix: shrink HTML with cheerio
Browse files Browse the repository at this point in the history
  • Loading branch information
Patai5 committed Sep 21, 2024
1 parent 5ff9938 commit f983d36
Show file tree
Hide file tree
Showing 4 changed files with 67 additions and 80 deletions.
103 changes: 54 additions & 49 deletions code/package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions code/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
"ajv-formats": "^2.1.1",
"apify": "^3.1.16",
"apify-client": "^2.9.3",
"cheerio": "^1.0.0",
"crawlee": "^3.8.1",
"gpt-3-encoder": "^1.1.4",
"joplin-turndown-plugin-gfm": "^1.0.12",
Expand Down
41 changes: 11 additions & 30 deletions code/src/processors.ts
Original file line number Diff line number Diff line change
@@ -1,50 +1,31 @@
import { load } from 'cheerio';
import { encode } from 'gpt-3-encoder';
import { Page } from 'playwright';

import { htmlToMarkdownProcessor } from './markdown.js';

// Matches the first `{ ... }` span in a string whose interior contains no further braces.
// NOTE(review): the empty capture group `()` only ever matches the empty string, so it adds
// no support for nested objects — it looks like a leftover from a recursive pattern; confirm.
const JSON_REGEX = /\{(?:[^{}]|())*\}/;

/**
* Shrinks an HTML string: optionally removes every element matching a CSS selector,
* optionally strips the `href` attribute from all `<a>` elements, then collapses runs
* of two or more whitespace characters into a single space and removes whitespace
* found between adjacent tags.
* @param html HTML markup to shrink.
* @param options `removeElementsCssSelector` selects the elements to delete (nothing is
* removed when it is empty or omitted); `removeLinkUrls` toggles stripping of link hrefs.
* @returns Promise resolving to the shrunk HTML string.
*/
export const shrinkHtml = async (
html: string,
page: Page,
options: { removeLinkUrls: boolean; removeElementsCssSelector?: string },
) => {
const { removeElementsCssSelector, removeLinkUrls } = options;

const stripped = await page.evaluate(
// eslint-disable-next-line @typescript-eslint/no-shadow
([unstripped, removeSelector, removeLinkUrls]) => {
const doc = new DOMParser().parseFromString(unstripped, 'text/html');
if (removeSelector) {
const elements = doc.querySelectorAll(removeSelector);
for (const element of elements) {
// There have been cases where the page's own scripts cause errors and running this line
// causes them to re-emerge, so the removal is wrapped in try/catch.
try {
element.remove();
} catch (err) {
/* ignore */
}
}
}
const $ = load(html);

if (removeLinkUrls) {
const linkEls = doc.querySelectorAll('a');
for (const linkEl of linkEls) {
linkEl.removeAttribute('href');
}
}
if (removeElementsCssSelector) {
// NOTE(review): `.map` is used here and below only for its side effects; the mapped result is discarded.
$(removeElementsCssSelector).map((_, el) => $(el).remove());
}
if (removeLinkUrls) {
$('a').map((_, el) => $(el).removeAttr('href'));
}

return doc.documentElement.outerHTML;
},
[html, removeElementsCssSelector, removeLinkUrls] as const,
);
return stripped.replace(/\s{2,}/g, ' ') // remove extra spaces
const stripped = $.html();
return stripped
.replace(/\s{2,}/g, ' ') // remove extra spaces
.replace(/>\s+</g, '><'); // remove all spaces between tags
};

Expand Down
2 changes: 1 addition & 1 deletion code/src/routes/crawl-route.ts
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ export const crawlRoute = async (context: PlaywrightCrawlingContext) => {
originContentHtml = await page.content();
}

const shrunkHtml = await shrinkHtml(originContentHtml, page, { removeLinkUrls, removeElementsCssSelector });
const shrunkHtml = await shrinkHtml(originContentHtml, { removeLinkUrls, removeElementsCssSelector });
const originPageContent = pageFormat === PAGE_FORMAT.MARKDOWN ? htmlToMarkdown(shrunkHtml) : shrunkHtml;

const instructionTokenLength = getNumberOfTextTokens(instructions);
Expand Down

0 comments on commit f983d36

Please sign in to comment.