Commit 3af94de

fix: remove key value stores from config

1 parent 684425e

4 files changed: +31 additions, -32 deletions

packages/gpt-scraper-core/src/configuration.ts

Lines changed: 8 additions & 10 deletions

@@ -1,14 +1,14 @@
 import { AnySchema } from 'ajv';
 import addFormats from 'ajv-formats';
-import Ajv2020 from 'ajv/dist/2020';
+import Ajv2020 from 'ajv/dist/2020.js';
 import { Actor } from 'apify';
-import { Cookie, KeyValueStore, RequestList, log } from 'crawlee';
+import { Cookie, RequestList, log } from 'crawlee';
 import { Page } from 'playwright';
-import { getModelByName } from './models/models';
-import { OpenAIModelHandler } from './models/openai';
+import { getModelByName } from './models/models.js';
+import { OpenAIModelHandler } from './models/openai.js';
 import { OpenAIModelSettings } from './types';
-import { Config } from './types/config';
-import { Input, PAGE_FORMAT } from './types/input';
+import { Config } from './types/config.js';
+import { Input, PAGE_FORMAT } from './types/input.js';
 
 /**
  * Parses the Actor's input into a config object and validates it. Throws an Actor fail if the input is invalid.
@@ -39,8 +39,7 @@ export const parseConfiguration = async (input: Input): Promise<Config> => {
 
     const proxyConfiguration = await Actor.createProxyConfiguration(input.proxyConfiguration);
 
-    const requestList = await RequestList.open('start-urls', startUrls);
-    const kvStore = await KeyValueStore.open();
+    const { requests } = await RequestList.open({ sources: startUrls });
 
     // make sure to change 0 (unlimited) to a very high number, because this is used in arithmetics and comparisons
     const maxCrawlingDepth = input.maxCrawlingDepth || 999999;
@@ -51,7 +50,7 @@ export const parseConfiguration = async (input: Input): Promise<Config> => {
         includeUrlGlobs,
         initialCookies,
         instructions,
-        kvStore,
+        requests,
         linkSelector,
         maxCrawlingDepth,
         maxPagesPerCrawl,
@@ -60,7 +59,6 @@ export const parseConfiguration = async (input: Input): Promise<Config> => {
         pageFormat: pageFormatInRequest,
         proxyConfiguration,
         removeElementsCssSelector,
-        requestList,
         saveSnapshots,
         schema,
         skipGptGlobs,
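
Note on the change above: the parsed config no longer holds a live RequestList instance (nor an eagerly opened default KeyValueStore); it now carries the resolved Request objects as plain data. A minimal sketch of that extraction step, assuming startUrls is the usual array of request sources (the helper name is illustrative, not from the commit):

// Illustrative helper (not part of the commit): resolve start URLs into
// plain Request objects that can live inside a serializable config.
import { Request, RequestList } from 'crawlee';

export const resolveStartRequests = async (startUrls: { url: string }[]): Promise<Request[]> => {
    // RequestList.open() materializes the sources; destructuring `requests`
    // keeps the resolved Request array and discards the list instance itself.
    const { requests } = await RequestList.open({ sources: startUrls });
    return requests;
};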

packages/gpt-scraper-core/src/crawler.ts

Lines changed: 3 additions & 2 deletions

@@ -9,7 +9,7 @@ import { ERROR_TYPE } from './utils.js';
 export const createCrawler = async ({ input }: { input: Input }) => {
     const config = await parseConfiguration(input);
 
-    const { maxPagesPerCrawl, requestList, proxyConfiguration } = config;
+    const { maxPagesPerCrawl, proxyConfiguration, requests } = config;
 
     const crawler = new PlaywrightCrawler({
         launchContext: {
@@ -22,7 +22,6 @@ export const createCrawler = async ({ input }: { input: Input }) => {
         requestHandlerTimeoutSecs: 3 * 60,
         proxyConfiguration,
         maxRequestsPerCrawl: maxPagesPerCrawl,
-        requestList,
         requestHandler: crawlRoute,
         preNavigationHooks: [
             async () => {
@@ -69,6 +68,8 @@ export const createCrawler = async ({ input }: { input: Input }) => {
     const defaultCrawlerState = { pagesOpened: 0, config };
     await crawler.useState<CrawlerState>(defaultCrawlerState);
 
+    await crawler.addRequests(requests);
+
     // @ts-expect-error patching
     const oldCrawlerLogError = crawler.log.error.bind(crawler.log);
     // @ts-expect-error patching
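
For context, a condensed sketch of the new wiring (identifiers and import paths are the ones visible in the diffs on this page; surrounding options are trimmed): the start requests are no longer passed via the requestList crawler option but enqueued into the crawler's default request queue after its state is initialized.

// Condensed sketch, not the full createCrawler from the commit.
import { PlaywrightCrawler } from 'crawlee';
import { parseConfiguration } from './configuration.js';
import { crawlRoute } from './routes/crawl-route.js';
import { CrawlerState } from './types/crawler-state.js';
import { Input } from './types/input.js';

export const createCrawler = async ({ input }: { input: Input }) => {
    const config = await parseConfiguration(input);
    const { maxPagesPerCrawl, proxyConfiguration, requests } = config;

    const crawler = new PlaywrightCrawler({
        proxyConfiguration,
        maxRequestsPerCrawl: maxPagesPerCrawl,
        requestHandler: crawlRoute,
        // note: no `requestList` option here anymore
    });

    // Shared state is initialized first, then the start requests are
    // enqueued into the crawler's default request queue.
    await crawler.useState<CrawlerState>({ pagesOpened: 0, config });
    await crawler.addRequests(requests);

    return crawler;
};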

packages/gpt-scraper-core/src/routes/crawl-route.ts

Lines changed: 18 additions & 17 deletions

@@ -1,35 +1,36 @@
 import { Actor } from 'apify';
-import { Dataset, NonRetryableError, PlaywrightCrawlingContext, log, utils } from 'crawlee';
-import { validateInputCssSelectors } from '../configuration';
-import { ERROR_OCCURRED_MESSAGE, NonRetryableOpenaiAPIError, OpenaiAPIErrorToExitActor } from '../errors';
-import { getNumberOfTextTokens, htmlToMarkdown, maybeShortsTextByTokenLength, shrinkHtml } from '../processors';
-import { PAGE_FORMAT } from '../types';
-import { CrawlerState } from '../types/crawler-state';
-import { ERROR_TYPE, doesUrlMatchGlobs } from '../utils';
+import { Dataset, KeyValueStore, NonRetryableError, PlaywrightCrawlingContext, log, utils } from 'crawlee';
+import { validateInputCssSelectors } from '../configuration.js';
+import { ERROR_OCCURRED_MESSAGE, NonRetryableOpenaiAPIError, OpenaiAPIErrorToExitActor } from '../errors.js';
+import { getNumberOfTextTokens, htmlToMarkdown, maybeShortsTextByTokenLength, shrinkHtml } from '../processors.js';
+import { CrawlerState } from '../types/crawler-state.js';
+import { PAGE_FORMAT } from '../types/input.js';
+import { ERROR_TYPE, doesUrlMatchGlobs } from '../utils.js';
 
 /**
  * The main crawling route. Enqueues new URLs and processes the page by calling the GPT model.
  */
 export const crawlRoute = async (context: PlaywrightCrawlingContext) => {
     const { request, page, enqueueLinks, closeCookieModals, crawler } = context;
 
+    const kvStore = await KeyValueStore.open();
+
     const state = await crawler.useState<CrawlerState>();
     const {
-        maxPagesPerCrawl,
-        maxCrawlingDepth,
-        includeUrlGlobs,
-        linkSelector,
         excludeUrlGlobs,
-        skipGptGlobs,
-        targetSelector,
-        removeElementsCssSelector,
-        pageFormat,
+        includeUrlGlobs,
         instructions,
+        linkSelector,
+        maxCrawlingDepth,
+        maxPagesPerCrawl,
         model,
+        modelSettings,
+        pageFormat,
+        removeElementsCssSelector,
         saveSnapshots,
-        kvStore,
         schema,
-        modelSettings,
+        skipGptGlobs,
+        targetSelector,
     } = state.config;
 
     const { depth = 0 } = request.userData;
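
The kvStore the route previously received through the config is now opened where it is used. KeyValueStore.open() with no arguments returns the default store, and Crawlee caches opened stores, so doing this per request should add no meaningful overhead. Sketched in isolation (the handler body is trimmed):

import { KeyValueStore, PlaywrightCrawlingContext } from 'crawlee';

// Trimmed sketch of the route's new store access:
export const crawlRoute = async (context: PlaywrightCrawlingContext) => {
    const { crawler } = context;

    // Default key-value store, opened lazily inside the handler instead of
    // being threaded through the config object.
    const kvStore = await KeyValueStore.open();

    const state = await crawler.useState();
    // ... destructure state.config and use kvStore (e.g. for saving
    // snapshots) exactly as before.
};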

packages/gpt-scraper-core/src/types/config.ts

Lines changed: 2 additions & 3 deletions

@@ -1,5 +1,5 @@
 import { AnySchema } from 'ajv';
-import { Cookie, GlobInput, KeyValueStore, ProxyConfiguration, RequestList } from 'crawlee';
+import { Cookie, GlobInput, ProxyConfiguration, Request } from 'crawlee';
 import { OpenAIModelHandler } from '../models/openai';
 import { PAGE_FORMAT } from './input';
 import { OpenAIModelSettings } from './models';
@@ -12,7 +12,6 @@ export interface Config {
     includeUrlGlobs?: GlobInput[];
     initialCookies?: Cookie[];
     instructions: string;
-    kvStore: KeyValueStore;
     linkSelector?: string;
     maxCrawlingDepth: number;
     maxPagesPerCrawl: number;
@@ -21,7 +20,7 @@ export interface Config {
     pageFormat: PAGE_FORMAT;
     proxyConfiguration?: ProxyConfiguration;
     removeElementsCssSelector?: string;
-    requestList: RequestList;
+    requests: Request[];
     saveSnapshots: boolean;
     schema?: AnySchema;
     skipGptGlobs?: GlobInput[];
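
A likely motivation for both replacements, reading the diffs together: the whole config object is stored in crawler state via useState (see crawler.ts above), and persisted state should be plain, serializable data. Request[] serializes cleanly, while RequestList and KeyValueStore are live handles backed by storage clients, so they are kept out of the config and obtained where needed instead.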
