@@ -10,7 +10,7 @@ import { Input, PAGE_FORMAT } from './types/input.js';
 import { parseInput, validateInput, validateInputCssSelectors } from './input.js';
 import { NonRetryableOpenaiAPIError } from './errors.js';
 import { OpenAIModelSettings } from './types/models.js';
-import { doesUrlMatchGlobs } from './utils.js';
+import { doesUrlMatchGlobs, ERROR_TYPE } from './utils.js';
 
 interface State {
     pagesOpened: number;
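
The new import assumes that `./utils.js` now also exports an `ERROR_TYPE` constant with at least a `LIMIT_ERROR` member. Its definition is not part of this diff; a minimal sketch of a shape that would satisfy the usages below, keeping the old 'LimitError' string as the value so that anything matching on the error name keeps working:

// utils.ts - hypothetical definition, not shown in this diff
export const ERROR_TYPE = {
    LIMIT_ERROR: 'LimitError',
} as const;
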
@@ -87,7 +87,7 @@ export const createCrawler = async ({ input }: { input: Input }) => {
         const state = await crawler.useState<State>(DEFAULT_STATE);
         if (state.pagesOpened >= input.maxPagesPerCrawl) {
             const err = new NonRetryableError('Skipping this page');
-            err.name = 'LimitError';
+            err.name = ERROR_TYPE.LIMIT_ERROR;
             throw err;
         }
     },
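
Throwing a `NonRetryableError` from the hook makes Crawlee fail the request immediately instead of retrying it, and tagging it with a recognizable name lets the `failedRequestHandler` further down tell these intentional skips apart from real failures. A self-contained sketch of the pattern, assuming `NonRetryableError` is the class exported by `crawlee` and the `ERROR_TYPE` shape sketched above (`throwLimitError` and `isLimitError` are hypothetical helpers, not part of the diff):

import { NonRetryableError } from 'crawlee';

const ERROR_TYPE = { LIMIT_ERROR: 'LimitError' } as const; // assumed shape, see above

// throw side: abort the request without retries, tagged so it can be recognized later
const throwLimitError = (): never => {
    const err = new NonRetryableError('Skipping this page');
    err.name = ERROR_TYPE.LIMIT_ERROR;
    throw err;
};

// check side: an intentional skip is not a real failure
const isLimitError = (error: Error): boolean => error.name === ERROR_TYPE.LIMIT_ERROR;
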
@@ -97,7 +97,27 @@ export const createCrawler = async ({ input }: { input: Input }) => {
         const { depth = 0 } = request.userData;
         const state = await crawler.useState<State>(DEFAULT_STATE);
         const isFirstPage = state.pagesOpened === 0;
-        state.pagesOpened++;
+        // check explicitly, keyed by the request's unique key, whether this request has already dealt
+        // with the counters, so that requests inheriting userData via `...userData` cannot pass by accident
+        if (request.userData.wasOpenedKey !== request.uniqueKey) {
+            if (state.pagesOpened >= input.maxPagesPerCrawl) {
+                // the check in the preNavigationHook helps to prevent extra requests, but because
+                // the counter is incremented only later, in a different async function, a race
+                // condition may occur when multiple pages are opened at the same time; repeating
+                // the check here, synchronously and right before touching the counter below,
+                // closes that window
+                const err = new NonRetryableError('Skipping this page');
+                err.name = ERROR_TYPE.LIMIT_ERROR;
+                throw err;
+            }
+            // increment the counter only once per page (guaranteed by the outer `if`); it is
+            // also not incremented in the preNavigationHook, because a page that fails to
+            // navigate should not be counted
+            state.pagesOpened++;
+            // this flag feeds the limit checks above: a page that was already allowed to open
+            // skips the `pagesOpened` check, which takes care of possible retries
+            request.userData.wasOpenedKey = request.uniqueKey;
+        }
         const url = request.loadedUrl || request.url;
 
         if (isFirstPage) await validateInputCssSelectors(input, page);
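
The race the new comments guard against is easiest to see in isolation. A minimal, self-contained model (names and timings are illustrative, not from the diff) in which the limit check and the counter increment are separated by an `await`, just like the preNavigationHook check and the requestHandler increment:

// both calls pass the check before either increments the counter
const state = { pagesOpened: 0 };
const maxPagesPerCrawl = 1;

const racyOpen = async (id: string): Promise<void> => {
    if (state.pagesOpened >= maxPagesPerCrawl) {
        console.log(`${id}: skipped`);
        return;
    }
    await new Promise((resolve) => setTimeout(resolve, 10)); // stands in for navigation
    state.pagesOpened++; // A and B both get here: 2 pages opened despite the limit of 1
    console.log(`${id}: opened, total ${state.pagesOpened}`);
};

await Promise.all([racyOpen('A'), racyOpen('B')]);

The second, synchronous check in the diff sits immediately before the increment with no `await` in between, so the pair runs as one uninterruptible step of the event loop and the window above disappears.
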
@@ -117,6 +137,7 @@ export const createCrawler = async ({ input }: { input: Input }) => {
             userData: {
                 depth: depth + 1,
             },
+            limit: input.maxPagesPerCrawl - state.pagesOpened,
         });
         const enqueuedLinks = processedRequests.filter(({ wasAlreadyPresent }) => !wasAlreadyPresent);
         const alreadyPresentLinksCount = processedRequests.length - enqueuedLinks.length;
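
`enqueueLinks` accepts a `limit` option that caps how many of the discovered links are actually enqueued; computing it as the remaining page budget keeps the queue from filling up with requests that the limit checks above would only reject. A sketch of the arithmetic (values are illustrative):

// with maxPagesPerCrawl = 10 and 4 pages already opened,
// at most 10 - 4 = 6 further links are enqueued from this page
const remainingBudget = input.maxPagesPerCrawl - state.pagesOpened;

Note that the budget is recomputed per page from the current counter, so concurrently handled pages can each claim the same remaining budget; that is presumably why the runtime checks in the handler stay in place.
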
@@ -240,7 +261,7 @@ export const createCrawler = async ({ input }: { input: Input }) => {
         },
 
         async failedRequestHandler({ request }, error: Error) {
-            if (error.name === 'LimitError') {
+            if (error.name === ERROR_TYPE.LIMIT_ERROR) {
                 return;
             }
             const errorMessage = error.message || 'no error';