fix: remove key value stores from config
Patai5 committed Feb 18, 2024
1 parent 684425e commit 3af94de
Showing 4 changed files with 31 additions and 32 deletions.
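In short, this commit stops passing live crawlee resource handles through the parsed configuration. parseConfiguration now returns plain Request objects instead of a RequestList and no longer opens a KeyValueStore; the crawler enqueues those requests itself via crawler.addRequests(), and the crawl route opens the default key-value store on demand.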
18 changes: 8 additions & 10 deletions packages/gpt-scraper-core/src/configuration.ts
@@ -1,14 +1,14 @@
 import { AnySchema } from 'ajv';
 import addFormats from 'ajv-formats';
-import Ajv2020 from 'ajv/dist/2020';
+import Ajv2020 from 'ajv/dist/2020.js';
 import { Actor } from 'apify';
-import { Cookie, KeyValueStore, RequestList, log } from 'crawlee';
+import { Cookie, RequestList, log } from 'crawlee';
 import { Page } from 'playwright';
-import { getModelByName } from './models/models';
-import { OpenAIModelHandler } from './models/openai';
+import { getModelByName } from './models/models.js';
+import { OpenAIModelHandler } from './models/openai.js';
 import { OpenAIModelSettings } from './types';
-import { Config } from './types/config';
-import { Input, PAGE_FORMAT } from './types/input';
+import { Config } from './types/config.js';
+import { Input, PAGE_FORMAT } from './types/input.js';
 
 /**
  * Parses the Actor's input into a config object and validates it. Throws an Actor fail if the input is invalid.
@@ -39,8 +39,7 @@ export const parseConfiguration = async (input: Input): Promise<Config> => {
 
     const proxyConfiguration = await Actor.createProxyConfiguration(input.proxyConfiguration);
 
-    const requestList = await RequestList.open('start-urls', startUrls);
-    const kvStore = await KeyValueStore.open();
+    const { requests } = await RequestList.open({ sources: startUrls });
 
     // make sure to change 0 (unlimited) to a very high number, because this is used in arithmetics and comparisons
     const maxCrawlingDepth = input.maxCrawlingDepth || 999999;
@@ -51,7 +50,7 @@ export const parseConfiguration = async (input: Input): Promise<Config> => {
         includeUrlGlobs,
         initialCookies,
         instructions,
-        kvStore,
+        requests,
         linkSelector,
         maxCrawlingDepth,
         maxPagesPerCrawl,
@@ -60,7 +59,6 @@ export const parseConfiguration = async (input: Input): Promise<Config> => {
         pageFormat: pageFormatInRequest,
         proxyConfiguration,
         removeElementsCssSelector,
-        requestList,
         saveSnapshots,
         schema,
         skipGptGlobs,
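The key change above is that RequestList.open() is now called with an options object and only its plain requests array is kept, so the config stays free of live resource handles. A minimal standalone sketch of the new pattern (the example URL is a stand-in, not from this diff):

    import { RequestList } from 'crawlee';

    // Build the list, then keep only the plain Request objects it produced.
    // Request instances are simple data holders (url, userData, ...), so they
    // can sit in a plain config object, unlike the RequestList itself.
    const { requests } = await RequestList.open({
        sources: [{ url: 'https://example.com' }],
    });

    console.log(requests.map((request) => request.url));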
5 changes: 3 additions & 2 deletions packages/gpt-scraper-core/src/crawler.ts
@@ -9,7 +9,7 @@ import { ERROR_TYPE } from './utils.js';
 export const createCrawler = async ({ input }: { input: Input }) => {
     const config = await parseConfiguration(input);
 
-    const { maxPagesPerCrawl, requestList, proxyConfiguration } = config;
+    const { maxPagesPerCrawl, proxyConfiguration, requests } = config;
 
     const crawler = new PlaywrightCrawler({
         launchContext: {
@@ -22,7 +22,6 @@ export const createCrawler = async ({ input }: { input: Input }) => {
         requestHandlerTimeoutSecs: 3 * 60,
         proxyConfiguration,
         maxRequestsPerCrawl: maxPagesPerCrawl,
-        requestList,
         requestHandler: crawlRoute,
         preNavigationHooks: [
             async () => {
@@ -69,6 +68,8 @@ export const createCrawler = async ({ input }: { input: Input }) => {
     const defaultCrawlerState = { pagesOpened: 0, config };
     await crawler.useState<CrawlerState>(defaultCrawlerState);
 
+    await crawler.addRequests(requests);
+
     // @ts-expect-error patching
     const oldCrawlerLogError = crawler.log.error.bind(crawler.log);
     // @ts-expect-error patching
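Instead of wiring a RequestList into the crawler options, the crawler now receives its start requests after construction. If I read the crawlee API right, crawler.addRequests() enqueues them into the crawler's default request queue, the same queue that later enqueueLinks() calls feed. A rough sketch:

    import { PlaywrightCrawler, Request } from 'crawlee';

    const requests: Request[] = []; // produced by parseConfiguration() in this repo

    const crawler = new PlaywrightCrawler({
        // no `requestList` option anymore; start requests are added below
        requestHandler: async ({ request, log }) => log.info(request.url),
    });

    // Enqueue the start requests into the crawler's default request queue.
    await crawler.addRequests(requests);
    await crawler.run();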
35 changes: 18 additions & 17 deletions packages/gpt-scraper-core/src/routes/crawl-route.ts
@@ -1,35 +1,36 @@
 import { Actor } from 'apify';
-import { Dataset, NonRetryableError, PlaywrightCrawlingContext, log, utils } from 'crawlee';
-import { validateInputCssSelectors } from '../configuration';
-import { ERROR_OCCURRED_MESSAGE, NonRetryableOpenaiAPIError, OpenaiAPIErrorToExitActor } from '../errors';
-import { getNumberOfTextTokens, htmlToMarkdown, maybeShortsTextByTokenLength, shrinkHtml } from '../processors';
-import { PAGE_FORMAT } from '../types';
-import { CrawlerState } from '../types/crawler-state';
-import { ERROR_TYPE, doesUrlMatchGlobs } from '../utils';
+import { Dataset, KeyValueStore, NonRetryableError, PlaywrightCrawlingContext, log, utils } from 'crawlee';
+import { validateInputCssSelectors } from '../configuration.js';
+import { ERROR_OCCURRED_MESSAGE, NonRetryableOpenaiAPIError, OpenaiAPIErrorToExitActor } from '../errors.js';
+import { getNumberOfTextTokens, htmlToMarkdown, maybeShortsTextByTokenLength, shrinkHtml } from '../processors.js';
+import { CrawlerState } from '../types/crawler-state.js';
+import { PAGE_FORMAT } from '../types/input.js';
+import { ERROR_TYPE, doesUrlMatchGlobs } from '../utils.js';
 
 /**
  * The main crawling route. Enqueues new URLs and processes the page by calling the GPT model.
  */
 export const crawlRoute = async (context: PlaywrightCrawlingContext) => {
     const { request, page, enqueueLinks, closeCookieModals, crawler } = context;
 
+    const kvStore = await KeyValueStore.open();
+
     const state = await crawler.useState<CrawlerState>();
     const {
-        maxPagesPerCrawl,
-        maxCrawlingDepth,
-        includeUrlGlobs,
-        linkSelector,
         excludeUrlGlobs,
-        skipGptGlobs,
-        targetSelector,
-        removeElementsCssSelector,
-        pageFormat,
+        includeUrlGlobs,
         instructions,
+        linkSelector,
+        maxCrawlingDepth,
+        maxPagesPerCrawl,
         model,
+        modelSettings,
+        pageFormat,
+        removeElementsCssSelector,
         saveSnapshots,
-        kvStore,
         schema,
-        modelSettings,
+        skipGptGlobs,
+        targetSelector,
     } = state.config;
 
     const { depth = 0 } = request.userData;
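Rather than receiving the store through state.config, the route now opens it locally. KeyValueStore.open() with no arguments returns the run's default key-value store, and as far as I understand crawlee caches opened store instances, so calling it once per handled page should be cheap (that caching detail is my reading of the library, not something this diff shows). A minimal sketch:

    import { KeyValueStore } from 'crawlee';

    // Open the default key-value store from inside a request handler.
    const kvStore = await KeyValueStore.open();
    await kvStore.setValue('example-snapshot', { html: '<html></html>' });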
5 changes: 2 additions & 3 deletions packages/gpt-scraper-core/src/types/config.ts
@@ -1,5 +1,5 @@
 import { AnySchema } from 'ajv';
-import { Cookie, GlobInput, KeyValueStore, ProxyConfiguration, RequestList } from 'crawlee';
+import { Cookie, GlobInput, ProxyConfiguration, Request } from 'crawlee';
 import { OpenAIModelHandler } from '../models/openai';
 import { PAGE_FORMAT } from './input';
 import { OpenAIModelSettings } from './models';
@@ -12,7 +12,6 @@ export interface Config {
     includeUrlGlobs?: GlobInput[];
     initialCookies?: Cookie[];
     instructions: string;
-    kvStore: KeyValueStore;
     linkSelector?: string;
     maxCrawlingDepth: number;
     maxPagesPerCrawl: number;
@@ -21,7 +20,7 @@ export interface Config {
     pageFormat: PAGE_FORMAT;
     proxyConfiguration?: ProxyConfiguration;
     removeElementsCssSelector?: string;
-    requestList: RequestList;
+    requests: Request[];
     saveSnapshots: boolean;
     schema?: AnySchema;
     skipGptGlobs?: GlobInput[];
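With KeyValueStore and RequestList gone, Config appears to be pure data plus a few crawlee value types, which fits the crawler.useState() pattern in crawler.ts above, where the config is stored in shared crawler state. A hypothetical illustration of why Request[] is the friendlier shape (the round-trip below is my example, not from the diff):

    import { Request } from 'crawlee';

    // A Request is a plain-data class, so a config holding `requests: Request[]`
    // survives a JSON round-trip in a way a live KeyValueStore handle would not.
    const request = new Request({ url: 'https://example.com' });
    console.log(JSON.parse(JSON.stringify(request)).url); // 'https://example.com'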
