Skip to content

Commit 0020ee9

Browse files
authored
Merge pull request #66 from NREL/pp/crawl_updates
Web Crawl updates
2 parents ad1ec4f + 27e4a68 commit 0020ee9

2 files changed

Lines changed: 17 additions & 3 deletions

File tree

elm/version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,4 @@
22
ELM version number
33
"""
44

5-
__version__ = "0.0.22"
5+
__version__ = "0.0.23"

elm/web/website_crawl.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -377,7 +377,7 @@ def __init__(self, validator, file_loader_kwargs=None,
377377
browser_config_kwargs=None, crawl_strategy_kwargs=None,
378378
crawler_config_kwargs=None, cte_kwargs=None,
379379
extra_url_filters=None, include_external=False,
380-
url_scorer=None, max_pages=100):
380+
url_scorer=None, max_pages=100, page_limit=None):
381381
"""
382382
383383
Parameters
@@ -428,9 +428,16 @@ def __init__(self, validator, file_loader_kwargs=None,
428428
:meth:`ELMLinkScorer.score` method to score the URLs.
429429
By default, ``None``.
430430
max_pages : int, optional
431-
Maximum number of pages to crawl. By default, ``100``.
431+
Maximum number of **successful** pages to crawl.
432+
By default, ``100``.
433+
page_limit : int, optional
434+
Maximum number of pages to crawl regardless of success
435+
status. If ``None``, a page limit of ``2 * max_pages`` is
436+
used. To set no limit (not recommended), use ``math.inf``.
437+
By default, ``None``.
432438
"""
433439
self.validator = validator
440+
self.page_limit = page_limit or 2 * max_pages
434441

435442
flk = {"verify_ssl": False}
436443
flk.update(file_loader_kwargs or {})
@@ -510,10 +517,17 @@ async def run(self, base_url, termination_callback=None,
510517
out_docs = []
511518
should_stop = (termination_callback
512519
or ELMWebsiteCrawlingStrategy.found_enough_docs)
520+
page_count = 0
513521
async with AsyncWebCrawler(config=self.browser_config) as crawler:
514522
crawl_results = await crawler.arun(base_url, config=self.config)
515523
async with aclosing(crawl_results) as agen:
516524
async for result in agen:
525+
page_count += 1
526+
if page_count > self.page_limit:
527+
logger.debug("Exiting crawl due to page limit")
528+
break
529+
if not result.success:
530+
continue
517531
results.append(result)
518532
logger.debug("Crawled %s", result.url)
519533
if on_result_hook:

0 commit comments

Comments
 (0)