@@ -377,7 +377,7 @@ def __init__(self, validator, file_loader_kwargs=None,
377377 browser_config_kwargs = None , crawl_strategy_kwargs = None ,
378378 crawler_config_kwargs = None , cte_kwargs = None ,
379379 extra_url_filters = None , include_external = False ,
380- url_scorer = None , max_pages = 100 ):
380+ url_scorer = None , max_pages = 100 , page_limit = None ):
381381 """
382382
383383 Parameters
@@ -428,9 +428,16 @@ def __init__(self, validator, file_loader_kwargs=None,
428428 :meth:`ELMLinkScorer.score` method to score the URLs.
429429 By default, ``None``.
430430 max_pages : int, optional
431- Maximum number of pages to crawl. By default, ``100``.
431+ Maximum number of **successful** pages to crawl.
432+ By default, ``100``.
433+ page_limit : int, optional
434+ Maximum number of pages to crawl regardless of success
435+ status. If ``None``, a page limit of 2 * `max_pages` is
436+ used. To set no limit (not recommended), use ``math.inf``.
437+ By default, ``None``.
432438 """
433439 self .validator = validator
440+ self .page_limit = page_limit or 2 * max_pages
434441
435442 flk = {"verify_ssl" : False }
436443 flk .update (file_loader_kwargs or {})
@@ -510,10 +517,17 @@ async def run(self, base_url, termination_callback=None,
510517 out_docs = []
511518 should_stop = (termination_callback
512519 or ELMWebsiteCrawlingStrategy .found_enough_docs )
520+ page_count = 0
513521 async with AsyncWebCrawler (config = self .browser_config ) as crawler :
514522 crawl_results = await crawler .arun (base_url , config = self .config )
515523 async with aclosing (crawl_results ) as agen :
516524 async for result in agen :
525+ page_count += 1
526+ if page_count > self .page_limit :
527+ logger .debug ("Exiting crawl due to page limit" )
528+ break
529+ if not result .success :
530+ continue
517531 results .append (result )
518532 logger .debug ("Crawled %s" , result .url )
519533 if on_result_hook :
0 commit comments