
2025 APR Bug fixes #1107

Open · wants to merge 23 commits into base: next

Changes from all commits (23 commits):
7b9aabc
fix(crawler): ensure max_pages limit is respected during batch proces…
ntohidi Apr 14, 2025
1f3b125
docs(cli): add Crawl4AI CLI installation instructions to the CLI guide
ntohidi Apr 14, 2025
05085b6
fix(requirements): add fake-useragent to requirements
ntohidi Apr 15, 2025
0ec3c4a
fix(crawler): handle navigation aborts during file downloads in Async…
ntohidi Apr 17, 2025
0886153
fix(async_playwright_crawler): improve segment handling and viewport …
ntohidi Apr 17, 2025
14a3145
fix(docs): update browser-crawler-config example to include LLMConten…
ntohidi Apr 21, 2025
094201a
Merge next + resolve conflicts
aravindkarnam Apr 23, 2025
039be1b
feat: add pdf2image dependency to requirements
ntohidi Apr 30, 2025
1d6a2b9
fix(crawler): surface real redirect status codes and keep redirect ch…
ntohidi Apr 30, 2025
e0cd3e1
fix(crawler): initialize captured_console variable for local file pro…
ntohidi May 2, 2025
39e3b79
Merge branch 'next' into 2025-APR-1
aravindkarnam May 7, 2025
12783fa
fix(dependencies): update pillow version constraint to allow newer re…
ntohidi May 7, 2025
eebb8c8
fix(requirements): add PyPDF2 dependency for PDF processing
ntohidi May 7, 2025
2b17f23
docs: update direct passing of content_filter to CrawlerRunConfig and…
aravindkarnam May 7, 2025
ee93acb
fix(async_playwright_crawler): use config directly instead of self.co…
ntohidi May 7, 2025
f6e25e2
fix: check_robots_txt to support wildcard rules ref: #699
aravindkarnam May 7, 2025
c1041b9
fix: exclude_external_images flag simply discards elements ref:https:…
aravindkarnam May 7, 2025
1af3d1c
Merge branch '2025-APR-1' of https://github.com/unclecode/crawl4ai in…
ntohidi May 8, 2025
98a56e6
Merge next branch
aravindkarnam May 13, 2025
25d97d5
fix(dependencies): remove duplicated aiofiles from project dependenci…
ntohidi May 13, 2025
260e2dc
fix(browser): create browser config before launching managed browser …
ntohidi May 13, 2025
137556b
fix the EXTRACT to match the styling of the other methods
medo94my May 14, 2025
a55c2b3
refactor(logging): update extraction logging to use url_status method
Ahmed-Tawfik94 May 19, 2025
60 changes: 56 additions & 4 deletions crawl4ai/async_crawler_strategy.py
@@ -445,6 +445,7 @@ async def crawl(
return await self._crawl_web(url, config)

elif url.startswith("file://"):
captured_console = None
# Process local file
local_file_path = url[7:] # Remove 'file://' prefix
if not os.path.exists(local_file_path):
@@ -741,18 +742,49 @@ def log_consol(
)
redirected_url = page.url
except Error as e:
raise RuntimeError(f"Failed on navigating ACS-GOTO:\n{str(e)}")
# Allow navigation to be aborted when downloading files
# This is expected behavior for downloads in some browser engines
if 'net::ERR_ABORTED' in str(e) and self.browser_config.accept_downloads:
self.logger.info(
message=f"Navigation aborted, likely due to file download: {url}",
tag="GOTO",
params={"url": url},
)
response = None
else:
raise RuntimeError(f"Failed on navigating ACS-GOTO:\n{str(e)}")

await self.execute_hook(
"after_goto", page, context=context, url=url, response=response, config=config
)
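
Reviewer note: a minimal sketch of the call pattern this branch is meant to support. It assumes the public `AsyncWebCrawler` / `BrowserConfig` API shown elsewhere in this PR's docs; the download URL is a placeholder. With `accept_downloads=True`, a navigation aborted by a file download is now logged and tolerated instead of raising.

```python
# Sketch only (not part of this PR): crawl a direct file URL with downloads
# enabled so that net::ERR_ABORTED is treated as a download, not a failure.
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig

async def main():
    browser_conf = BrowserConfig(accept_downloads=True)
    async with AsyncWebCrawler(config=browser_conf) as crawler:
        result = await crawler.arun(
            url="https://example.com/files/report.pdf",  # placeholder URL
            config=CrawlerRunConfig(),
        )
        print(result.success, result.status_code)

asyncio.run(main())
```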

# ──────────────────────────────────────────────────────────────
# Walk the redirect chain. Playwright returns only the last
# hop, so we trace the `request.redirected_from` links until the
# first response that differs from the final one and surface its
# status-code.
# ──────────────────────────────────────────────────────────────
if response is None:
status_code = 200
response_headers = {}
else:
status_code = response.status
response_headers = response.headers
first_resp = response
req = response.request
while req and req.redirected_from:
prev_req = req.redirected_from
prev_resp = await prev_req.response()
if prev_resp: # keep earliest
first_resp = prev_resp
req = prev_req

status_code = first_resp.status
response_headers = first_resp.headers
# if response is None:
# status_code = 200
# response_headers = {}
# else:
# status_code = response.status
# response_headers = response.headers

else:
status_code = 200
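
Reviewer note: the redirect walk above can be read in isolation. This is a sketch that mirrors the hunk, assuming a Playwright `Response`: follow `request.redirected_from` back to the earliest hop that still has a response and report its status and headers.

```python
# Sketch mirroring the hunk above: surface the first hop's status code
# (e.g. 301/302) instead of the final 200 after a redirect chain.
from typing import Optional, Tuple
from playwright.async_api import Response

async def first_hop_status(response: Optional[Response]) -> Tuple[int, dict]:
    if response is None:
        return 200, {}                    # aborted navigation: fall back to defaults
    first_resp = response
    req = response.request
    while req and req.redirected_from:
        prev_req = req.redirected_from
        prev_resp = await prev_req.response()
        if prev_resp:                     # keep the earliest response still retrievable
            first_resp = prev_resp
        req = prev_req
    return first_resp.status, first_resp.headers
```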
@@ -1432,12 +1464,32 @@ async def take_screenshot_scroller(self, page: Page, **kwargs) -> str:
num_segments = (page_height // viewport_height) + 1
for i in range(num_segments):
y_offset = i * viewport_height
# Special handling for the last segment
if i == num_segments - 1:
last_part_height = page_height % viewport_height

# If page_height is an exact multiple of viewport_height,
# we don't need an extra segment
if last_part_height == 0:
# Skip last segment if page height is exact multiple of viewport
break

# Adjust viewport to exactly match the remaining content height
await page.set_viewport_size({"width": page_width, "height": last_part_height})

await page.evaluate(f"window.scrollTo(0, {y_offset})")
await asyncio.sleep(0.01) # wait for render
seg_shot = await page.screenshot(full_page=False)

# Capture the current segment
# Note: Using compression options (format, quality) would go here
seg_shot = await page.screenshot(full_page=False, type="jpeg", quality=85)
# seg_shot = await page.screenshot(full_page=False)
img = Image.open(BytesIO(seg_shot)).convert("RGB")
segments.append(img)

# Reset viewport to original size after capturing segments
await page.set_viewport_size({"width": page_width, "height": viewport_height})

total_height = sum(img.height for img in segments)
stitched = Image.new("RGB", (segments[0].width, total_height))
offset = 0
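Reviewer note: the segment arithmetic in `take_screenshot_scroller` boils down to a small plan that can be checked on its own. A sketch, with illustrative numbers:

```python
# Sketch of the scroll-and-stitch segment plan used above.
def plan_segments(page_height: int, viewport_height: int) -> list:
    """Return (y_offset, capture_height) pairs that cover the full page."""
    segments = []
    num_segments = (page_height // viewport_height) + 1
    for i in range(num_segments):
        y_offset = i * viewport_height
        if i == num_segments - 1:
            last_part_height = page_height % viewport_height
            if last_part_height == 0:     # exact multiple: no extra segment needed
                break
            segments.append((y_offset, last_part_height))
        else:
            segments.append((y_offset, viewport_height))
    return segments

# A 2500 px page with a 1080 px viewport needs three captures:
print(plan_segments(2500, 1080))  # [(0, 1080), (1080, 1080), (2160, 340)]
```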
27 changes: 15 additions & 12 deletions crawl4ai/async_webcrawler.py
@@ -360,7 +360,7 @@ async def arun(
pdf_data=pdf_data,
verbose=config.verbose,
is_raw_html=True if url.startswith("raw:") else False,
redirected_url=async_response.redirected_url,
redirected_url=async_response.redirected_url,
**kwargs,
)

@@ -503,7 +503,7 @@ async def aprocess_html(
tables = media.pop("tables", [])
links = result.links.model_dump()
metadata = result.metadata

fit_html = preprocess_html_for_schema(html_content=html, text_threshold= 500, max_size= 300_000)

################################
@@ -585,11 +585,13 @@ async def aprocess_html(
# Choose content based on input_format
content_format = config.extraction_strategy.input_format
if content_format == "fit_markdown" and not markdown_result.fit_markdown:
self.logger.warning(
message="Fit markdown requested but not available. Falling back to raw markdown.",
tag="EXTRACT",
params={"url": _url},
)

self.logger.url_status(
url=_url,
success=bool(html),
timing=time.perf_counter() - t1,
tag="EXTRACT",
)
content_format = "markdown"

content = {
Expand All @@ -613,11 +615,12 @@ async def aprocess_html(
)

# Log extraction completion
self.logger.info(
message="Completed for {url:.50}... | Time: {timing}s",
tag="EXTRACT",
params={"url": _url, "timing": time.perf_counter() - t1},
)
self.logger.url_status(
url=_url,
success=bool(html),
timing=time.perf_counter() - t1,
tag="EXTRACT",
)

# Apply HTML formatting if requested
if config.prettiify:
11 changes: 10 additions & 1 deletion crawl4ai/browser_profiler.py
@@ -615,9 +615,18 @@ async def launch_standalone_browser(self,
self.logger.info(f"Debugging port: {debugging_port}", tag="CDP")
self.logger.info(f"Headless mode: {headless}", tag="CDP")

# create browser config
browser_config = BrowserConfig(
browser_type=browser_type,
headless=headless,
user_data_dir=profile_path,
debugging_port=debugging_port,
verbose=True
)

# Create managed browser instance
managed_browser = ManagedBrowser(
browser_type=browser_type,
browser_config=browser_config,
user_data_dir=profile_path,
headless=headless,
logger=self.logger,
19 changes: 12 additions & 7 deletions crawl4ai/content_scraping_strategy.py
@@ -718,13 +718,18 @@ def _process_element(

# Check flag if we should remove external images
if kwargs.get("exclude_external_images", False):
element.decompose()
return False
# src_url_base = src.split('/')[2]
# url_base = url.split('/')[2]
# if url_base not in src_url_base:
# element.decompose()
# return False
# Handle relative URLs (which are always from the same domain)
if not src.startswith('http') and not src.startswith('//'):
return True # Keep relative URLs

# For absolute URLs, compare the base domains using the existing function
src_base_domain = get_base_domain(src)
url_base_domain = get_base_domain(url)

# If the domains don't match and both are valid, the image is external
if src_base_domain and url_base_domain and src_base_domain != url_base_domain:
element.decompose()
return False

# if kwargs.get('exclude_social_media_links', False):
# if image_src_base_domain in exclude_social_media_domains:
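Reviewer note: a standalone sketch of the new domain check. `base_domain` below is a simplified stand-in for the project's `get_base_domain` helper (no public-suffix handling); relative URLs are kept, and absolute URLs are dropped only when their base domain differs from the page's.

```python
# Sketch: decide whether an <img src=...> counts as external to the page.
from urllib.parse import urlparse

def base_domain(url: str) -> str:
    # Naive stand-in for the real get_base_domain: last two host labels.
    netloc = urlparse(url).netloc.lower()
    parts = netloc.split(".")
    return ".".join(parts[-2:]) if len(parts) >= 2 else netloc

def is_external_image(src: str, page_url: str) -> bool:
    if not src.startswith("http") and not src.startswith("//"):
        return False                       # relative URLs stay (same domain)
    src_dom, page_dom = base_domain(src), base_domain(page_url)
    return bool(src_dom and page_dom and src_dom != page_dom)

print(is_external_image("/logo.png", "https://example.com/post"))                         # False
print(is_external_image("https://cdn.thirdparty.net/x.png", "https://example.com/post"))  # True
```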
12 changes: 12 additions & 0 deletions crawl4ai/deep_crawling/bff_strategy.py
@@ -150,6 +150,14 @@ async def _arun_best_first(
self.logger.info(f"Max pages limit ({self.max_pages}) reached, stopping crawl")
break

# Calculate how many more URLs we can process in this batch
remaining = self.max_pages - self._pages_crawled
batch_size = min(BATCH_SIZE, remaining)
if batch_size <= 0:
# No more pages to crawl
self.logger.info(f"Max pages limit ({self.max_pages}) reached, stopping crawl")
break

batch: List[Tuple[float, int, str, Optional[str]]] = []
# Retrieve up to BATCH_SIZE items from the priority queue.
for _ in range(BATCH_SIZE):
@@ -184,6 +192,10 @@ async def _arun_best_first(
# Count only successful crawls toward max_pages limit
if result.success:
self._pages_crawled += 1
# Check if we've reached the limit during batch processing
if self._pages_crawled >= self.max_pages:
self.logger.info(f"Max pages limit ({self.max_pages}) reached during batch, stopping crawl")
break # Exit the generator

yield result

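Reviewer note: the queue-draining half of this fix reduces to clamping each batch by the remaining page budget; the mid-batch early exit that pairs with it is sketched after the DFS section below. A minimal, self-checking sketch with illustrative names:

```python
# Sketch: cap each batch by the remaining max_pages budget.
BATCH_SIZE = 10  # illustrative; the strategy defines its own constant

def next_batch_size(max_pages: int, pages_crawled: int, batch_size: int = BATCH_SIZE) -> int:
    """How many URLs may still be pulled from the queue (0 means stop)."""
    remaining = max_pages - pages_crawled
    return max(0, min(batch_size, remaining))

assert next_batch_size(max_pages=25, pages_crawled=0) == 10
assert next_batch_size(max_pages=25, pages_crawled=20) == 5
assert next_batch_size(max_pages=25, pages_crawled=25) == 0  # budget exhausted
```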
9 changes: 9 additions & 0 deletions crawl4ai/deep_crawling/bfs_strategy.py
@@ -157,6 +157,11 @@ async def _arun_batch(
results: List[CrawlResult] = []

while current_level and not self._cancel_event.is_set():
# Check if we've already reached max_pages before starting a new level
if self._pages_crawled >= self.max_pages:
self.logger.info(f"Max pages limit ({self.max_pages}) reached, stopping crawl")
break

next_level: List[Tuple[str, Optional[str]]] = []
urls = [url for url, _ in current_level]

@@ -221,6 +226,10 @@ async def _arun_stream(
# Count only successful crawls
if result.success:
self._pages_crawled += 1
# Check if we've reached the limit during batch processing
if self._pages_crawled >= self.max_pages:
self.logger.info(f"Max pages limit ({self.max_pages}) reached during batch, stopping crawl")
break # Exit the generator

results_count += 1
yield result
8 changes: 8 additions & 0 deletions crawl4ai/deep_crawling/dfs_strategy.py
@@ -49,6 +49,10 @@ async def _arun_batch(
# Count only successful crawls toward max_pages limit
if result.success:
self._pages_crawled += 1
# Check if we've reached the limit during batch processing
if self._pages_crawled >= self.max_pages:
self.logger.info(f"Max pages limit ({self.max_pages}) reached during batch, stopping crawl")
break # Exit the generator

# Only discover links from successful crawls
new_links: List[Tuple[str, Optional[str]]] = []
@@ -94,6 +98,10 @@ async def _arun_stream(
# and only discover links from successful crawls
if result.success:
self._pages_crawled += 1
# Check if we've reached the limit during batch processing
if self._pages_crawled >= self.max_pages:
self.logger.info(f"Max pages limit ({self.max_pages}) reached during batch, stopping crawl")
break # Exit the generator

new_links: List[Tuple[str, Optional[str]]] = []
await self.link_discovery(result, url, depth, visited, new_links, depths)
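Reviewer note: the streaming variants of all three deep-crawl strategies (best-first, BFS, DFS) now share the same early-exit rule: count only successful results and leave the generator once the page budget is reached. A sketch with a stub result type standing in for `CrawlResult`:

```python
# Sketch of the shared early-exit rule in the streaming strategies.
from dataclasses import dataclass
from typing import AsyncIterator

@dataclass
class StubResult:                     # stand-in for crawl4ai's CrawlResult
    url: str
    success: bool

async def limited_stream(
    results: AsyncIterator[StubResult], max_pages: int
) -> AsyncIterator[StubResult]:
    pages_crawled = 0
    async for result in results:
        if result.success:
            pages_crawled += 1
            if pages_crawled >= max_pages:
                # Mirrors the hunks above: the result that hits the limit
                # ends the stream rather than being yielded.
                break
        yield result
```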
25 changes: 24 additions & 1 deletion crawl4ai/utils.py
@@ -42,6 +42,29 @@
from collections import deque
from typing import Generator, Iterable

# Monkey patch to fix wildcard handling in urllib.robotparser
from urllib.robotparser import RuleLine
import re

original_applies_to = RuleLine.applies_to

def patched_applies_to(self, filename):
# Handle wildcards in paths
if '*' in self.path or '%2A' in self.path or self.path in ("*", "%2A"):
pattern = self.path.replace('%2A', '*')
pattern = re.escape(pattern).replace('\\*', '.*')
pattern = '^' + pattern
if pattern.endswith('\\$'):
pattern = pattern[:-2] + '$'
try:
return bool(re.match(pattern, filename))
except re.error:
return original_applies_to(self, filename)
return original_applies_to(self, filename)

RuleLine.applies_to = patched_applies_to
# Monkey patch ends

def chunk_documents(
documents: Iterable[str],
chunk_token_threshold: int,
@@ -303,7 +326,7 @@ async def can_fetch(self, url: str, user_agent: str = "*") -> bool:
robots_url = f"{scheme}://{domain}/robots.txt"

async with aiohttp.ClientSession() as session:
async with session.get(robots_url, timeout=2) as response:
async with session.get(robots_url, timeout=2, ssl=False) as response:
if response.status == 200:
rules = await response.text()
self._cache_rules(domain, rules)
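Reviewer note: a hedged usage sketch of the `RuleLine` patch. Importing `crawl4ai.utils` applies it as a side effect (the patch sits at module level); the unpatched parser does a literal prefix match, so the wildcard rule below would not block the first URL.

```python
# Sketch: wildcard Disallow rules are honoured once the patch is in place.
from urllib.robotparser import RobotFileParser
import crawl4ai.utils  # noqa: F401  (module import applies the monkey patch)

robots_txt = """
User-agent: *
Disallow: /private/*/reports
"""

parser = RobotFileParser()
parser.parse(robots_txt.splitlines())

print(parser.can_fetch("*", "https://example.com/private/2024/reports"))  # expected: False
print(parser.can_fetch("*", "https://example.com/public/page"))           # expected: True
```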
11 changes: 8 additions & 3 deletions deploy/docker/c4ai-doc-context.md
@@ -403,7 +403,7 @@ async def main():

md_generator = DefaultMarkdownGenerator(
content_filter=filter,
options={"ignore_links": True}
options={"ignore_links": True})

# 4) Crawler run config: skip cache, use extraction
run_conf = CrawlerRunConfig(
@@ -4152,7 +4152,7 @@ prune_filter = PruningContentFilter(
For intelligent content filtering and high-quality markdown generation, you can use the **LLMContentFilter**. This filter leverages LLMs to generate relevant markdown while preserving the original content's meaning and structure:

```python
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig, DefaultMarkdownGenerator
from crawl4ai.content_filter_strategy import LLMContentFilter

async def main():
@@ -4175,8 +4175,13 @@ async def main():
verbose=True
)

md_generator = DefaultMarkdownGenerator(
content_filter=filter,
options={"ignore_links": True}
)

config = CrawlerRunConfig(
content_filter=filter
markdown_generator=md_generator
)

async with AsyncWebCrawler() as crawler:
9 changes: 5 additions & 4 deletions docs/md_v2/core/browser-crawler-config.md
@@ -273,7 +273,7 @@ In a typical scenario, you define **one** `BrowserConfig` for your crawler sessi

```python
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig, LLMContentFilter, DefaultMarkdownGenerator
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy

async def main():
@@ -298,7 +298,7 @@ async def main():
# 3) Example LLM content filtering

gemini_config = LLMConfig(
provider="gemini/gemini-1.5-pro"
provider="gemini/gemini-1.5-pro",
api_token = "env:GEMINI_API_TOKEN"
)

@@ -322,8 +322,9 @@
)

md_generator = DefaultMarkdownGenerator(
content_filter=filter,
options={"ignore_links": True}
content_filter=filter,
options={"ignore_links": True}
)

# 4) Crawler run config: skip cache, use extraction
run_conf = CrawlerRunConfig(
3 changes: 3 additions & 0 deletions docs/md_v2/core/cli.md
@@ -17,6 +17,9 @@
- [Configuration Reference](#configuration-reference)
- [Best Practices & Tips](#best-practices--tips)

## Installation
The Crawl4AI CLI will be installed automatically when you install the library.
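
Reviewer note: a hedged example of a fresh install; the `crawl4ai-setup` step is taken from the project's install instructions and sets up the Playwright browsers.

```bash
# Install the library; the `crwl` CLI ships with it.
pip install crawl4ai

# One-time post-install step (downloads the Playwright browsers).
crawl4ai-setup

# Confirm the CLI is on PATH.
crwl --help
```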

## Basic Usage

The Crawl4AI CLI (`crwl`) provides a simple interface to the Crawl4AI library: