Skip to content

Commit 83fe2b1

Browse files
committed
feat: implement 'remove elements css selector'
1 parent 3c446a4 commit 83fe2b1

File tree

8 files changed

+56
-31
lines changed

8 files changed

+56
-31
lines changed

actors/extended-gpt-scraper/.actor/input_schema.json

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,14 @@
7575
"editor": "textfield",
7676
"prefill": ""
7777
},
78+
"removeElementsCssSelector": {
79+
"title": "Remove HTML elements (CSS selector)",
80+
"type": "string",
81+
"description": "A CSS selector matching HTML elements that will be removed from the DOM before it is sent for GPT processing. This is useful for skipping irrelevant page content and saving GPT input tokens. \n\nBy default, the Actor removes elements that are usually unwanted, such as scripts, styles and inline SVG images. You can disable the removal by setting this value to a non-existent CSS selector like `dummy_keep_everything`.",
82+
"editor": "textarea",
83+
"default": "script, style, noscript, path, svg, xlink",
84+
"prefill": "script, style, noscript, path, svg, xlink"
85+
},
7886
"maxCrawlingDepth": {
7987
"title": "Max crawling depth",
8088
"type": "integer",

actors/gpt-scraper/.actor/input_schema.json

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,14 @@
5858
"editor": "textfield",
5959
"prefill": ""
6060
},
61+
"removeElementsCssSelector": {
62+
"title": "Remove HTML elements (CSS selector)",
63+
"type": "string",
64+
"description": "A CSS selector matching HTML elements that will be removed from the DOM before it is sent for GPT processing. This is useful for skipping irrelevant page content and saving GPT input tokens. \n\nBy default, the Actor removes elements that are usually unwanted, such as scripts, styles and inline SVG images. You can disable the removal by setting this value to a non-existent CSS selector like `dummy_keep_everything`.",
65+
"editor": "textarea",
66+
"default": "script, style, noscript, path, svg, xlink",
67+
"prefill": "script, style, noscript, path, svg, xlink"
68+
},
6169
"maxCrawlingDepth": {
6270
"title": "Max crawling depth",
6371
"type": "integer",

packages/gpt-scraper-core/package.json

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,6 @@
2626
"cheerio": "^1.0.0-rc.12",
2727
"crawlee": "^3.0.0",
2828
"gpt-3-encoder": "^1.1.4",
29-
"html-to-text": "^9.0.5",
3029
"joplin-turndown-plugin-gfm": "^1.0.12",
3130
"langchain": "^0.0.197-rc.1",
3231
"openai": "^3.3.0",

packages/gpt-scraper-core/src/crawler.ts

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ import { getModelByName } from './models/models.js';
88
import { tryWrapInOpenaiError } from './models/openai.js';
99
import { getNumberOfTextTokens, htmlToMarkdown, maybeShortsTextByTokenLength, shrinkHtml } from './processors.js';
1010
import { Input, PAGE_FORMAT } from './types/input.js';
11-
import { parseInput, validateInput } from './input.js';
11+
import { parseInput, validateInput, validateInputCssSelectors } from './input.js';
1212
import { OpenaiAPIError } from './errors.js';
1313
import { OpenAIModelSettings } from './types/models.js';
1414

@@ -85,6 +85,9 @@ export const createCrawler = async ({ input }: { input: Input }) => {
8585
const state = await crawler.useState({ pageOutputted: 0 } as State);
8686
const url = request.loadedUrl || request.url;
8787

88+
const isFirstPage = state.pageOutputted === 0;
89+
if (isFirstPage) await validateInputCssSelectors(input, page);
90+
8891
if (input.maxPagesPerCrawl && state.pageOutputted >= input.maxPagesPerCrawl) {
8992
log.info(`Reached max pages per run (${input.maxPagesPerCrawl}), skipping URL ${url}.`);
9093
await Actor.exit(`Finished! Reached max pages per run (${input.maxPagesPerCrawl}).`);
@@ -128,7 +131,8 @@ export const createCrawler = async ({ input }: { input: Input }) => {
128131
originContentHtml = await page.content();
129132
}
130133

131-
const originPageContent = pageFormat === PAGE_FORMAT.MARKDOWN ? htmlToMarkdown(originContentHtml) : await shrinkHtml(originContentHtml, page);
134+
const shrunkHtml = await shrinkHtml(originContentHtml, page, input.removeElementsCssSelector);
135+
const originPageContent = pageFormat === PAGE_FORMAT.MARKDOWN ? htmlToMarkdown(shrunkHtml) : shrunkHtml;
132136

133137
const instructionTokenLength = getNumberOfTextTokens(input.instructions);
134138

packages/gpt-scraper-core/src/input.ts

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import { Actor } from 'apify';
22
import { Cookie } from 'crawlee';
3+
import { Page } from 'playwright';
34
import { Input } from './types/input';
45

56
export const HTML_TAGS_TO_IGNORE = ['script', 'style', 'noscript'];
@@ -39,6 +40,17 @@ export const validateInput = async (input: Input) => {
3940
if (initialCookies) await validateInitialCookies(initialCookies);
4041
};
4142

43+
/**
 * Validates every selector-typed input option (`linkSelector`, `targetSelector`
 * and `removeElementsCssSelector`) against a live page.
 *
 * CSS selectors need to be validated in the browser context, so the caller is
 * expected to pass a page that is already open; the validation is meant to run
 * on the first page of the crawl.
 *
 * @param input Actor input holding the selector options.
 * @param page Page used to evaluate the selectors.
 */
export const validateInputCssSelectors = async (input: Input, page: Page) => {
    const selectorsByInputName = [
        ['linkSelector', input.linkSelector],
        ['targetSelector', input.targetSelector],
        ['removeElementsCssSelector', input.removeElementsCssSelector],
    ] as const;

    // Checked one after another, in declaration order.
    for (const [inputName, selector] of selectorsByInputName) {
        await validateInputCssSelector(selector, inputName, page);
    }
};
53+
4254
const parseNumberInRange = async (
4355
property: unknown,
4456
propertyName: string,
@@ -81,3 +93,13 @@ const validateInitialCookies = async (cookies: unknown): Promise<Cookie[]> => {
8193

8294
return cookies as Cookie[];
8395
};
96+
97+
/**
 * Checks a single selector by querying it on the given page. A selector that
 * was not provided (`undefined` or `null`) is skipped. When the query throws,
 * the run is failed through `Actor.fail` with an `INVALID INPUT` message that
 * names the offending input field and echoes the selector.
 *
 * @param selector Selector to check; `undefined`/`null` means "not set".
 * @param inputName Name of the input field, used only in the failure message.
 * @param page Page on which the selector is queried.
 */
const validateInputCssSelector = async (selector: string | undefined | null, inputName: string, page: Page) => {
    // `== null` matches both `null` and `undefined`.
    if (selector == null) return;

    let selectorIsValid = true;
    try {
        await page.$(selector);
    } catch {
        selectorIsValid = false;
    }

    if (!selectorIsValid) {
        throw await Actor.fail(`INVALID INPUT: '${inputName}' is not a valid CSS selector! Got '${selector}'`);
    }
};

packages/gpt-scraper-core/src/processors.ts

Lines changed: 10 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,41 +1,25 @@
11
import { encode } from 'gpt-3-encoder';
2-
import { convert } from 'html-to-text';
32
import { Page } from 'playwright';
43
import { htmlToMarkdownProcessor } from './markdown.js';
5-
import { HTML_TAGS_TO_IGNORE } from './input.js';
64

75
const JSON_REGEX = /\{(?:[^{}]|())*\}/;
86

97
/**
10-
* Converts HTML to text
8+
* Shrinks HTML by removing css targeted elements and extra spaces
119
* @param html
1210
*/
13-
export const htmlToText = (html: string) => {
14-
const options: any = {
15-
wordwrap: false,
16-
selectors: HTML_TAGS_TO_IGNORE.map((tag) => ({ selector: tag, format: 'skip' })),
17-
// ignoreHref: true, // ignore href targets
18-
};
19-
const text = convert(html, options);
20-
return text
21-
.replace(/\n{2,}/g, '\n\n'); // remove extra new lines
22-
};
23-
24-
/**
25-
* Shrinks HTML by removing script, style and no script tags and whitespaces
26-
* @param html
27-
*/
28-
export const shrinkHtml = async (html: string, page: Page) => {
29-
const stripped = await page.evaluate((unstripped) => {
30-
const doc = new DOMParser().parseFromString(unstripped, 'text/html');
31-
for (const tag of ['script', 'style', 'noscript', 'path', 'xlink']) {
32-
const elements = doc.querySelectorAll(tag);
11+
export const shrinkHtml = async (html: string, page: Page, removeElementsCssSelector?: string) => {
12+
const stripped = await page.evaluate(
13+
([unstripped, removeElementsCssSelector]) => {
14+
const doc = new DOMParser().parseFromString(unstripped, 'text/html');
15+
const elements = doc.querySelectorAll(removeElementsCssSelector || '');
3316
for (const element of elements) {
3417
element.remove();
3518
}
36-
}
37-
return doc.documentElement.outerHTML;
38-
}, html);
19+
return doc.documentElement.outerHTML;
20+
},
21+
[html, removeElementsCssSelector] as const,
22+
);
3923
return stripped.replace(/\s{2,}/g, ' ') // remove extra spaces
4024
.replace(/>\s+</g, '><'); // remove all spaces between tags
4125
};
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
1-
export { HTML_TAGS_TO_IGNORE, Input, PAGE_FORMAT } from './input.js';
1+
export { Input, PAGE_FORMAT } from './input.js';
22
export { OpenAIModelSettings } from './models.js';

packages/gpt-scraper-core/src/types/input.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,9 @@ export interface Input extends OpenAIModelSettings {
2222
pageFormatInRequest?: PAGE_FORMAT;
2323
saveSnapshots?: boolean;
2424
initialCookies?: Cookie[];
25+
removeElementsCssSelector?: string;
2526
}
2627

27-
export const HTML_TAGS_TO_IGNORE = ['script', 'style', 'noscript'];
2828
export enum PAGE_FORMAT {
2929
HTML = 'HTML',
3030
MARKDOWN = 'Markdown',

0 commit comments

Comments
 (0)