
Commit f3f7495

fix: add +1 check for max pages and improve the counter keeping
1 parent 3a53a8b commit f3f7495

3 files changed: +33 −5 lines changed


packages/gpt-scraper-core/src/crawler.ts

Lines changed: 25 additions & 4 deletions

@@ -10,7 +10,7 @@ import { Input, PAGE_FORMAT } from './types/input.js';
 import { parseInput, validateInput, validateInputCssSelectors } from './input.js';
 import { NonRetryableOpenaiAPIError } from './errors.js';
 import { OpenAIModelSettings } from './types/models.js';
-import { doesUrlMatchGlobs } from './utils.js';
+import { doesUrlMatchGlobs, ERROR_TYPE } from './utils.js';

 interface State {
     pagesOpened: number;
@@ -87,7 +87,7 @@ export const createCrawler = async ({ input }: { input: Input }) => {
             const state = await crawler.useState<State>(DEFAULT_STATE);
             if (state.pagesOpened >= input.maxPagesPerCrawl) {
                 const err = new NonRetryableError('Skipping this page');
-                err.name = 'LimitError';
+                err.name = ERROR_TYPE.LIMIT_ERROR;
                 throw err;
             }
         },
@@ -97,7 +97,27 @@ export const createCrawler = async ({ input }: { input: Input }) => {
         const { depth = 0 } = request.userData;
         const state = await crawler.useState<State>(DEFAULT_STATE);
         const isFirstPage = state.pagesOpened === 0;
-        state.pagesOpened++;
+        // Check explicitly (by the request key) whether this request has already dealt with the
+        // counters, so that requests inheriting userData via `...userData` don't slip past it.
+        if (request.userData.wasOpenedKey !== request.uniqueKey) {
+            if (state.pagesOpened >= input.maxPagesPerCrawl) {
+                // The check in the preNavigationHook helps prevent extra requests, but because the
+                // counters are incremented only later, in a different async function, a race
+                // condition may occur when multiple pages are opened at the same time. Repeating
+                // the check here, synchronously and right before touching the counters below,
+                // ensures that this race condition is avoided.
+                const err = new NonRetryableError('Skipping this page');
+                err.name = ERROR_TYPE.LIMIT_ERROR;
+                throw err;
+            }
+            // Increment the counter only once per page (guaranteed by the outer `if`). Also, do not
+            // increment in the preNavigationHook, because the page might not exist, and it should
+            // not be counted before a successful navigation.
+            state.pagesOpened++;
+            // This flag is used in the limit checks: a page that was already allowed to open skips
+            // the `pagesOpened` bookkeeping, which takes care of possible retries.
+            request.userData.wasOpenedKey = request.uniqueKey;
+        }
         const url = request.loadedUrl || request.url;

         if (isFirstPage) await validateInputCssSelectors(input, page);
@@ -117,6 +137,7 @@ export const createCrawler = async ({ input }: { input: Input }) => {
                 userData: {
                     depth: depth + 1,
                 },
+                limit: input.maxPagesPerCrawl - state.pagesOpened,
             });
             const enqueuedLinks = processedRequests.filter(({ wasAlreadyPresent }) => !wasAlreadyPresent);
             const alreadyPresentLinksCount = processedRequests.length - enqueuedLinks.length;
@@ -240,7 +261,7 @@ export const createCrawler = async ({ input }: { input: Input }) => {
         },

         async failedRequestHandler({ request }, error: Error) {
-            if (error.name === 'LimitError') {
+            if (error.name === ERROR_TYPE.LIMIT_ERROR) {
                 return;
             }
             const errorMessage = error.message || 'no error';
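
To see why the handler repeats the limit check synchronously, consider a minimal standalone sketch (hypothetical names, not the Actor's code). JavaScript handlers only interleave at `await` points, so a check-and-increment sequence with no `await` in between cannot be raced by another in-flight handler, and the `wasOpenedKey` flag keeps retries of an already-counted page from bumping the counter twice:

// Minimal sketch, assuming a single shared state object; `State`, `MAX_PAGES`, and
// `handlePage` are hypothetical names for illustration.
interface State {
    pagesOpened: number;
}

const state: State = { pagesOpened: 0 };
const MAX_PAGES = 3;

class LimitError extends Error {
    constructor(message: string) {
        super(message);
        this.name = 'LimitError';
    }
}

const handlePage = async (request: { uniqueKey: string; userData: Record<string, unknown> }) => {
    // Skip the bookkeeping entirely if this request was already counted (e.g. a retry).
    if (request.userData.wasOpenedKey !== request.uniqueKey) {
        // There is no `await` between this check and the increment below, so two
        // concurrent handlers cannot both pass the check before either increments.
        if (state.pagesOpened >= MAX_PAGES) throw new LimitError('Skipping this page');
        state.pagesOpened++;
        request.userData.wasOpenedKey = request.uniqueKey; // mark as counted for retries
    }
    // ... navigation and page processing may await freely from here on ...
};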

packages/gpt-scraper-core/src/input.ts

Lines changed: 4 additions & 1 deletion

@@ -6,7 +6,7 @@ import { Input } from './types/input';
 /**
  * Parses the Actor input. Throws an Actor fail if the input is invalid.
  */
-export const parseInput = async (input: Input) => {
+export const parseInput = async (input: Input): Promise<Input> => {
     // OpenAI defaults to 1, but we want the crawlers to be deterministic
     const temperatureOptions = { default: 0, range: { min: 0, max: 2 } };
     const temperature = await parseNumberInRange(input.temperature, 'temperature', temperatureOptions);
@@ -22,6 +22,9 @@ export const parseInput = async (input: Input) => {

     return {
         ...input,
+        // make sure to change 0 (unlimited) to a very high number, because these are used in arithmetic and comparisons
+        maxPagesPerCrawl: input.maxPagesPerCrawl || 999999,
+        maxCrawlingDepth: input.maxCrawlingDepth || 999999,
         temperature,
         topP,
         frequencyPenalty,
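
These defaults matter because `maxPagesPerCrawl` is now used in arithmetic, e.g. the `limit: input.maxPagesPerCrawl - state.pagesOpened` option added in crawler.ts above; a literal 0 meaning "unlimited" would produce a negative limit there. A minimal sketch of the coercion (the helper name is hypothetical):

// Hypothetical helper illustrating the 0-means-unlimited coercion.
const UNLIMITED = 999999;

const normalizeLimit = (value?: number): number => value || UNLIMITED;

console.log(normalizeLimit(0));         // 999999 — 0 (unlimited) becomes a very high number
console.log(normalizeLimit(undefined)); // 999999 — a missing value behaves the same way
console.log(normalizeLimit(50));        // 50 — explicit limits pass through unchanged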

packages/gpt-scraper-core/src/utils.ts

Lines changed: 4 additions & 0 deletions

@@ -10,3 +10,7 @@ const doesUrlMatchGlob = (url: string, glob: GlobInput): boolean => {

     return minimatch(url, globString, { nocase: true });
 };
+
+export enum ERROR_TYPE {
+    LIMIT_ERROR = 'LimitError',
+}
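
Moving the error name into a shared enum means the throw sites in crawler.ts and the filter in failedRequestHandler compare against the same constant instead of repeating a string literal. A minimal usage sketch (plain `Error` stands in for crawlee's `NonRetryableError`; the helper names are hypothetical):

import { ERROR_TYPE } from './utils.js';

// Hypothetical helpers showing both sides of the contract.
const throwLimitError = (): never => {
    const err = new Error('Skipping this page');
    err.name = ERROR_TYPE.LIMIT_ERROR;
    throw err;
};

const isLimitError = (error: Error): boolean => error.name === ERROR_TYPE.LIMIT_ERROR;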
