fix: add +1 check for max pages and improve the counter keeping
foxt451 committed Jan 10, 2024
1 parent 3a53a8b commit f3f7495
Showing 3 changed files with 33 additions and 5 deletions.
29 changes: 25 additions & 4 deletions packages/gpt-scraper-core/src/crawler.ts
@@ -10,7 +10,7 @@ import { Input, PAGE_FORMAT } from './types/input.js';
import { parseInput, validateInput, validateInputCssSelectors } from './input.js';
import { NonRetryableOpenaiAPIError } from './errors.js';
import { OpenAIModelSettings } from './types/models.js';
import { doesUrlMatchGlobs } from './utils.js';
import { doesUrlMatchGlobs, ERROR_TYPE } from './utils.js';

interface State {
pagesOpened: number;
@@ -87,7 +87,7 @@ export const createCrawler = async ({ input }: { input: Input }) => {
const state = await crawler.useState<State>(DEFAULT_STATE);
if (state.pagesOpened >= input.maxPagesPerCrawl) {
const err = new NonRetryableError('Skipping this page');
err.name = 'LimitError';
err.name = ERROR_TYPE.LIMIT_ERROR;
throw err;
}
},
@@ -97,7 +97,27 @@ export const createCrawler = async ({ input }: { input: Input }) => {
const { depth = 0 } = request.userData;
const state = await crawler.useState<State>(DEFAULT_STATE);
const isFirstPage = state.pagesOpened === 0;
state.pagesOpened++;
// explicitly check whether this request has already been counted, keyed by its uniqueKey,
// so that requests inheriting userData via `...userData` cannot pass the check by accident
if (request.userData.wasOpenedKey !== request.uniqueKey) {
if (state.pagesOpened >= input.maxPagesPerCrawl) {
// the check in the preNavigationHook helps prevent extra requests,
// but because the counter is only incremented later, in a different async function,
// a race condition can occur when multiple pages are opened at the same time;
// double-checking here, synchronously right before updating the counter below,
// avoids that race condition
const err = new NonRetryableError('Skipping this page');
err.name = ERROR_TYPE.LIMIT_ERROR;
throw err;
}
// increment the counter only once per page (guaranteed by the outer `if`);
// also, do not increment in the preNavigationHook, because navigation may still fail,
// and a page should not be counted before it is successfully opened
state.pagesOpened++;
// this flag feeds the limit check above: a page that was already allowed to open skips
// the `pagesOpened` check, so retries of the same request are not counted twice
request.userData.wasOpenedKey = request.uniqueKey;
}
const url = request.loadedUrl || request.url;

if (isFirstPage) await validateInputCssSelectors(input, page);
@@ -117,6 +137,7 @@ export const createCrawler = async ({ input }: { input: Input }) => {
userData: {
depth: depth + 1,
},
limit: input.maxPagesPerCrawl - state.pagesOpened,
});
const enqueuedLinks = processedRequests.filter(({ wasAlreadyPresent }) => !wasAlreadyPresent);
const alreadyPresentLinksCount = processedRequests.length - enqueuedLinks.length;
@@ -240,7 +261,7 @@ export const createCrawler = async ({ input }: { input: Input }) => {
},

async failedRequestHandler({ request }, error: Error) {
if (error.name === 'LimitError') {
if (error.name === ERROR_TYPE.LIMIT_ERROR) {
return;
}
const errorMessage = error.message || 'no error';
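The heart of this change is a check-then-increment pattern. The preNavigationHook check cheaply rejects requests once the limit is reached, but only the re-check in the requestHandler is race-free, because no `await` separates the comparison from the increment. A minimal standalone sketch of the same idea (the `State` shape, `MAX_PAGES`, and `handlePage` are illustrative names, not Crawlee's actual API):

interface State {
    pagesOpened: number;
}

interface FakeRequest {
    uniqueKey: string;
    userData: Record<string, unknown>;
}

const state: State = { pagesOpened: 0 };
const MAX_PAGES = 5;

const handlePage = (request: FakeRequest): void => {
    // skip requests that were already counted (retries keep the same uniqueKey)
    if (request.userData.wasOpenedKey !== request.uniqueKey) {
        // check and increment with no `await` in between: on Node's single-threaded
        // event loop, two handlers cannot interleave inside a synchronous block
        if (state.pagesOpened >= MAX_PAGES) {
            const err = new Error('Skipping this page');
            err.name = 'LimitError';
            throw err;
        }
        state.pagesOpened++;
        request.userData.wasOpenedKey = request.uniqueKey;
    }
    // ... process the page ...
};

The new `limit: input.maxPagesPerCrawl - state.pagesOpened` option passed to `enqueueLinks` complements this by never queueing more links than the remaining page budget allows.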
5 changes: 4 additions & 1 deletion packages/gpt-scraper-core/src/input.ts
@@ -6,7 +6,7 @@ import { Input } from './types/input';
/**
* Parses the Actor input. Throws an Actor fail if the input is invalid.
*/
export const parseInput = async (input: Input) => {
export const parseInput = async (input: Input): Promise<Input> => {
// OpenAI defaults to 1, but we want the crawlers to be deterministic
const temperatureOptions = { default: 0, range: { min: 0, max: 2 } };
const temperature = await parseNumberInRange(input.temperature, 'temperature', temperatureOptions);
@@ -22,6 +22,9 @@ export const parseInput = async (input: Input) => {

return {
...input,
// replace 0 (unlimited) with a very high number, because these values are used in arithmetic and comparisons
maxPagesPerCrawl: input.maxPagesPerCrawl || 999999,
maxCrawlingDepth: input.maxCrawlingDepth || 999999,
temperature,
topP,
frequencyPenalty,
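Normalizing 0 ("unlimited") to a large sentinel at parse time keeps downstream expressions such as `input.maxPagesPerCrawl - state.pagesOpened` meaningful without special-casing at every use site. A small illustration of the `||` normalization (a standalone sketch; `normalizeLimit` is a hypothetical helper, not the Actor's code):

const UNLIMITED = 999999;

// hypothetical helper mirroring the normalization inside parseInput
const normalizeLimit = (value: number | undefined): number => value || UNLIMITED;

console.log(normalizeLimit(0));         // 999999: "unlimited" becomes a usable upper bound
console.log(normalizeLimit(undefined)); // 999999: a missing field gets the same default
console.log(normalizeLimit(25));        // 25: explicit limits pass through unchanged

Because `||` treats every falsy value the same way, `undefined` (and `NaN`) is coerced to the sentinel as well, which is the desired behavior for optional input fields.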
4 changes: 4 additions & 0 deletions packages/gpt-scraper-core/src/utils.ts
@@ -10,3 +10,7 @@ const doesUrlMatchGlob = (url: string, glob: GlobInput): boolean => {

return minimatch(url, globString, { nocase: true });
};

export enum ERROR_TYPE {
LIMIT_ERROR = 'LimitError',
}
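
Extracting the 'LimitError' string into an enum ties the throw sites in the hooks to the check in failedRequestHandler, so the two cannot silently drift apart the way duplicated string literals can. A self-contained sketch of the round trip (simplified from the diff above; `stopBecauseLimitReached` is an illustrative name):

enum ERROR_TYPE {
    LIMIT_ERROR = 'LimitError',
}

// marking the error by name lets the failure handler recognize it later
const stopBecauseLimitReached = (): never => {
    const err = new Error('Skipping this page');
    err.name = ERROR_TYPE.LIMIT_ERROR;
    throw err;
};

try {
    stopBecauseLimitReached();
} catch (error) {
    if ((error as Error).name === ERROR_TYPE.LIMIT_ERROR) {
        console.log('hit maxPagesPerCrawl; expected, not a real failure');
    } else {
        throw error; // anything else is a genuine error
    }
}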
