Skip to content

Commit

Permalink
feat: implement 'remove elements css selector'
Browse files Browse the repository at this point in the history
  • Loading branch information
Patai5 committed Dec 28, 2023
1 parent 3c446a4 commit 83fe2b1
Show file tree
Hide file tree
Showing 8 changed files with 56 additions and 31 deletions.
8 changes: 8 additions & 0 deletions actors/extended-gpt-scraper/.actor/input_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,14 @@
"editor": "textfield",
"prefill": ""
},
"removeElementsCssSelector": {
"title": "Remove HTML elements (CSS selector)",
"type": "string",
"description": "A CSS selector matching HTML elements that will be removed from the DOM, before sending it to GPT processing. This is useful to skip irrelevant page content and save on GPT input tokens. \n\nBy default, the Actor removes usually unwanted elements like scripts, styles and inline images. You can disable the removal by setting this value to some non-existent CSS selector like `dummy_keep_everything`.",
"editor": "textarea",
"default": "script, style, noscript, path, svg, xlink",
"prefill": "script, style, noscript, path, svg, xlink"
},
"maxCrawlingDepth": {
"title": "Max crawling depth",
"type": "integer",
Expand Down
8 changes: 8 additions & 0 deletions actors/gpt-scraper/.actor/input_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,14 @@
"editor": "textfield",
"prefill": ""
},
"removeElementsCssSelector": {
"title": "Remove HTML elements (CSS selector)",
"type": "string",
"description": "A CSS selector matching HTML elements that will be removed from the DOM, before sending it to GPT processing. This is useful to skip irrelevant page content and save on GPT input tokens. \n\nBy default, the Actor removes usually unwanted elements like scripts, styles and inline images. You can disable the removal by setting this value to some non-existent CSS selector like `dummy_keep_everything`.",
"editor": "textarea",
"default": "script, style, noscript, path, svg, xlink",
"prefill": "script, style, noscript, path, svg, xlink"
},
"maxCrawlingDepth": {
"title": "Max crawling depth",
"type": "integer",
Expand Down
1 change: 0 additions & 1 deletion packages/gpt-scraper-core/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@
"cheerio": "^1.0.0-rc.12",
"crawlee": "^3.0.0",
"gpt-3-encoder": "^1.1.4",
"html-to-text": "^9.0.5",
"joplin-turndown-plugin-gfm": "^1.0.12",
"langchain": "^0.0.197-rc.1",
"openai": "^3.3.0",
Expand Down
8 changes: 6 additions & 2 deletions packages/gpt-scraper-core/src/crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ import { getModelByName } from './models/models.js';
import { tryWrapInOpenaiError } from './models/openai.js';
import { getNumberOfTextTokens, htmlToMarkdown, maybeShortsTextByTokenLength, shrinkHtml } from './processors.js';
import { Input, PAGE_FORMAT } from './types/input.js';
import { parseInput, validateInput } from './input.js';
import { parseInput, validateInput, validateInputCssSelectors } from './input.js';
import { OpenaiAPIError } from './errors.js';
import { OpenAIModelSettings } from './types/models.js';

Expand Down Expand Up @@ -85,6 +85,9 @@ export const createCrawler = async ({ input }: { input: Input }) => {
const state = await crawler.useState({ pageOutputted: 0 } as State);
const url = request.loadedUrl || request.url;

const isFirstPage = state.pageOutputted === 0;
if (isFirstPage) await validateInputCssSelectors(input, page);

if (input.maxPagesPerCrawl && state.pageOutputted >= input.maxPagesPerCrawl) {
log.info(`Reached max pages per run (${input.maxPagesPerCrawl}), skipping URL ${url}.`);
await Actor.exit(`Finished! Reached max pages per run (${input.maxPagesPerCrawl}).`);
Expand Down Expand Up @@ -128,7 +131,8 @@ export const createCrawler = async ({ input }: { input: Input }) => {
originContentHtml = await page.content();
}

const originPageContent = pageFormat === PAGE_FORMAT.MARKDOWN ? htmlToMarkdown(originContentHtml) : await shrinkHtml(originContentHtml, page);
const shrunkHtml = await shrinkHtml(originContentHtml, page, input.removeElementsCssSelector);
const originPageContent = pageFormat === PAGE_FORMAT.MARKDOWN ? htmlToMarkdown(shrunkHtml) : shrunkHtml;

const instructionTokenLength = getNumberOfTextTokens(input.instructions);

Expand Down
22 changes: 22 additions & 0 deletions packages/gpt-scraper-core/src/input.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import { Actor } from 'apify';
import { Cookie } from 'crawlee';
import { Page } from 'playwright';
import { Input } from './types/input';

export const HTML_TAGS_TO_IGNORE = ['script', 'style', 'noscript'];
Expand Down Expand Up @@ -39,6 +40,17 @@ export const validateInput = async (input: Input) => {
if (initialCookies) await validateInitialCookies(initialCookies);
};

/**
 * Validates every CSS selector input (`linkSelector`, `targetSelector`, `removeElementsCssSelector`).
 * CSS selectors need to be validated in the browser context, so the validation is done on the first page.
 *
 * @param input Actor input holding the selectors to check
 * @param page page used to evaluate each selector
 */
export const validateInputCssSelectors = async (input: Input, page: Page) => {
    const selectorInputNames = ['linkSelector', 'targetSelector', 'removeElementsCssSelector'] as const;

    // Checked one at a time, in this order, so the first invalid selector is the one that gets reported.
    for (const inputName of selectorInputNames) {
        await validateInputCssSelector(input[inputName], inputName, page);
    }
};

const parseNumberInRange = async (
property: unknown,
propertyName: string,
Expand Down Expand Up @@ -81,3 +93,13 @@ const validateInitialCookies = async (cookies: unknown): Promise<Cookie[]> => {

return cookies as Cookie[];
};

/**
 * Validates a single selector input by asking the page to resolve it.
 * When the lookup throws, the run is failed through `Actor.fail` with an "INVALID INPUT" message.
 *
 * NOTE(review): `page.$` presumably accepts any selector syntax Playwright understands, not only
 * plain CSS - confirm that this is strict enough for selectors later passed to `querySelectorAll`.
 *
 * @param selector selector to validate; `undefined` and `null` are skipped
 * @param inputName name of the input field, used in the failure message
 * @param page page used to evaluate the selector
 */
const validateInputCssSelector = async (selector: string | undefined | null, inputName: string, page: Page) => {
    // Nothing to validate when the input was not provided at all.
    if (selector == null) return;

    try {
        // The lookup result is irrelevant; only whether the selector can be evaluated matters.
        await page.$(selector);
    } catch {
        throw await Actor.fail(`INVALID INPUT: '${inputName}' is not a valid CSS selector! Got '${selector}'`);
    }
};
36 changes: 10 additions & 26 deletions packages/gpt-scraper-core/src/processors.ts
Original file line number Diff line number Diff line change
@@ -1,41 +1,25 @@
import { encode } from 'gpt-3-encoder';
import { convert } from 'html-to-text';
import { Page } from 'playwright';
import { htmlToMarkdownProcessor } from './markdown.js';
import { HTML_TAGS_TO_IGNORE } from './input.js';

const JSON_REGEX = /\{(?:[^{}]|())*\}/;

/**
* Converts HTML to text
* Shrinks HTML by removing css targeted elements and extra spaces
* @param html
*/
export const htmlToText = (html: string) => {
    // One "skip" rule per ignored tag, so the content of those tags is left out of the text output.
    const skipRules = HTML_TAGS_TO_IGNORE.map((tag) => ({ selector: tag, format: 'skip' }));
    const options: any = {
        wordwrap: false,
        selectors: skipRules,
        // ignoreHref: true, // ignore href targets
    };

    // Collapse every run of two or more newlines into exactly one blank line.
    return convert(html, options).replace(/\n{2,}/g, '\n\n');
};

/**
 * Shrinks HTML by removing the elements matched by a CSS selector and collapsing extra whitespace.
 *
 * The removal runs in the browser through `page.evaluate`: the markup is parsed into a separate
 * document with `DOMParser`, the matched elements are removed from it and the result is serialized.
 *
 * @param html HTML markup to shrink
 * @param page page whose browser context is used to parse and strip the markup
 * @param removeElementsCssSelector CSS selector of the elements to remove; when it is missing or
 * blank, no elements are removed
 * @returns the serialized document with repeated whitespace and whitespace between tags removed
 */
export const shrinkHtml = async (html: string, page: Page, removeElementsCssSelector?: string) => {
    const stripped = await page.evaluate(
        // The callback parameters use their own names so they do not shadow the outer arguments.
        ([unstripped, selector]) => {
            const doc = new DOMParser().parseFromString(unstripped, 'text/html');
            const trimmedSelector = selector?.trim();
            // `querySelectorAll('')` throws a SyntaxError, so removal is skipped without a selector.
            if (trimmedSelector) {
                for (const element of doc.querySelectorAll(trimmedSelector)) {
                    element.remove();
                }
            }
            return doc.documentElement.outerHTML;
        },
        [html, removeElementsCssSelector] as const,
    );
    return stripped.replace(/\s{2,}/g, ' ') // remove extra spaces
        .replace(/>\s+</g, '><'); // remove all spaces between tags
};
Expand Down
2 changes: 1 addition & 1 deletion packages/gpt-scraper-core/src/types/index.ts
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
export { HTML_TAGS_TO_IGNORE, Input, PAGE_FORMAT } from './input.js';
export { Input, PAGE_FORMAT } from './input.js';
export { OpenAIModelSettings } from './models.js';
2 changes: 1 addition & 1 deletion packages/gpt-scraper-core/src/types/input.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,9 @@ export interface Input extends OpenAIModelSettings {
pageFormatInRequest?: PAGE_FORMAT;
saveSnapshots?: boolean;
initialCookies?: Cookie[];
removeElementsCssSelector?: string;
}

export const HTML_TAGS_TO_IGNORE = ['script', 'style', 'noscript'];
export enum PAGE_FORMAT {
HTML = 'HTML',
MARKDOWN = 'Markdown',
Expand Down

0 comments on commit 83fe2b1

Please sign in to comment.