Skip to content

Commit 8157670

Browse files
committed
Fix crawler attempting to navigate to binary files
- Add is_binary_file() method to URLHandler to detect 40+ binary extensions
- Update RecursiveCrawlStrategy to filter binary URLs before crawl queue
- Add comprehensive unit tests for binary file detection
- Prevents net::ERR_ABORTED errors when crawler encounters ZIP, PDF, etc.

This fixes the issue where the crawler was treating binary file URLs (like .zip downloads) as navigable web pages, causing errors in crawl4ai.
1 parent ad1b8bf commit 8157670

File tree

3 files changed

+179
-1
lines changed

3 files changed

+179
-1
lines changed

python/src/server/services/crawling/helpers/url_handler.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,54 @@ def is_txt(url: str) -> bool:
4848
logger.warning(f"Error checking if URL is text file: {e}")
4949
return False
5050

51+
@staticmethod
52+
def is_binary_file(url: str) -> bool:
53+
"""
54+
Check if a URL points to a binary file that shouldn't be crawled.
55+
56+
Args:
57+
url: URL to check
58+
59+
Returns:
60+
True if URL is a binary file, False otherwise
61+
"""
62+
try:
63+
# Remove query parameters and fragments for cleaner extension checking
64+
parsed = urlparse(url)
65+
path = parsed.path.lower()
66+
67+
# Comprehensive list of binary and non-HTML file extensions
68+
binary_extensions = {
69+
# Archives
70+
'.zip', '.tar', '.gz', '.rar', '.7z', '.bz2', '.xz', '.tgz',
71+
# Executables and installers
72+
'.exe', '.dmg', '.pkg', '.deb', '.rpm', '.msi', '.app', '.appimage',
73+
# Documents (non-HTML)
74+
'.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.odt', '.ods',
75+
# Images
76+
'.jpg', '.jpeg', '.png', '.gif', '.svg', '.webp', '.ico', '.bmp', '.tiff',
77+
# Audio/Video
78+
'.mp3', '.mp4', '.avi', '.mov', '.wmv', '.flv', '.webm', '.mkv', '.wav', '.flac',
79+
# Data files
80+
'.csv', '.sql', '.db', '.sqlite',
81+
# Binary data
82+
'.iso', '.img', '.bin', '.dat',
83+
# Development files (usually not meant to be crawled as pages)
84+
'.wasm', '.pyc', '.jar', '.war', '.class', '.dll', '.so', '.dylib'
85+
}
86+
87+
# Check if the path ends with any binary extension
88+
for ext in binary_extensions:
89+
if path.endswith(ext):
90+
logger.debug(f"Skipping binary file: {url} (matched extension: {ext})")
91+
return True
92+
93+
return False
94+
except Exception as e:
95+
logger.warning(f"Error checking if URL is binary file: {e}")
96+
# In case of error, don't skip the URL (safer to attempt crawl than miss content)
97+
return False
98+
5199
@staticmethod
52100
def transform_github_url(url: str) -> str:
53101
"""

python/src/server/services/crawling/strategies/recursive.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from crawl4ai import CrawlerRunConfig, CacheMode, MemoryAdaptiveDispatcher
1111
from ....config.logfire_config import get_logger
1212
from ...credential_service import credential_service
13+
from ..helpers.url_handler import URLHandler
1314

1415
logger = get_logger(__name__)
1516

@@ -27,6 +28,7 @@ def __init__(self, crawler, markdown_generator):
2728
"""
2829
self.crawler = crawler
2930
self.markdown_generator = markdown_generator
31+
self.url_handler = URLHandler()
3032

3133
async def crawl_recursive_with_progress(
3234
self,
@@ -190,8 +192,11 @@ def normalize_url(url):
190192
# Find internal links for next depth
191193
for link in result.links.get("internal", []):
192194
next_url = normalize_url(link["href"])
193-
if next_url not in visited:
195+
# Skip binary files and already visited URLs
196+
if next_url not in visited and not self.url_handler.is_binary_file(next_url):
194197
next_level_urls.add(next_url)
198+
elif self.url_handler.is_binary_file(next_url):
199+
logger.debug(f"Skipping binary file from crawl queue: {next_url}")
195200
else:
196201
logger.warning(f"Failed to crawl {original_url}: {getattr(result, 'error_message', 'Unknown error')}")
197202

python/tests/test_url_handler.py

Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
"""Unit tests for URLHandler class."""
2+
import pytest
3+
from src.server.services.crawling.helpers.url_handler import URLHandler
4+
5+
6+
class TestURLHandler:
    """Unit tests covering URLHandler's URL-classification helpers."""

    def test_is_binary_file_archives(self):
        """Archive downloads must be flagged as binary."""
        handler = URLHandler()

        for url in (
            "https://example.com/file.zip",
            "https://example.com/archive.tar.gz",
            "https://example.com/compressed.rar",
            "https://example.com/package.7z",
            "https://example.com/backup.tgz",
        ):
            assert handler.is_binary_file(url) is True

    def test_is_binary_file_executables(self):
        """Executables and installers must be flagged as binary."""
        handler = URLHandler()

        for url in (
            "https://example.com/setup.exe",
            "https://example.com/installer.dmg",
            "https://example.com/package.deb",
            "https://example.com/app.msi",
            "https://example.com/program.appimage",
        ):
            assert handler.is_binary_file(url) is True

    def test_is_binary_file_documents(self):
        """Office/PDF documents must be flagged as binary."""
        handler = URLHandler()

        for url in (
            "https://example.com/document.pdf",
            "https://example.com/report.docx",
            "https://example.com/spreadsheet.xlsx",
            "https://example.com/presentation.pptx",
        ):
            assert handler.is_binary_file(url) is True

    def test_is_binary_file_media(self):
        """Images, audio, and video must be flagged as binary."""
        handler = URLHandler()

        image_urls = (
            "https://example.com/photo.jpg",
            "https://example.com/image.png",
            "https://example.com/icon.svg",
            "https://example.com/favicon.ico",
        )
        av_urls = (
            "https://example.com/song.mp3",
            "https://example.com/video.mp4",
            "https://example.com/movie.mkv",
        )
        for url in image_urls + av_urls:
            assert handler.is_binary_file(url) is True

    def test_is_binary_file_case_insensitive(self):
        """Upper-case extensions must be detected too."""
        handler = URLHandler()

        for url in (
            "https://example.com/FILE.ZIP",
            "https://example.com/Document.PDF",
            "https://example.com/Image.PNG",
        ):
            assert handler.is_binary_file(url) is True

    def test_is_binary_file_with_query_params(self):
        """Query strings and fragments must not mask the extension."""
        handler = URLHandler()

        for url in (
            "https://example.com/file.zip?version=1.0",
            "https://example.com/document.pdf?download=true",
            "https://example.com/image.png#section",
        ):
            assert handler.is_binary_file(url) is True

    def test_is_binary_file_html_pages(self):
        """Ordinary web pages must never be classified as binary."""
        handler = URLHandler()

        for url in (
            "https://example.com/",
            "https://example.com/index.html",
            "https://example.com/page",
            "https://example.com/blog/post",
            "https://example.com/about.htm",
            "https://example.com/contact.php",
        ):
            assert handler.is_binary_file(url) is False

    def test_is_binary_file_edge_cases(self):
        """Dotted paths without binary extensions, plus a real-world URL."""
        handler = URLHandler()

        # Dots in the path that are not binary extensions must not match;
        # JS files are left crawlable on purpose.
        assert handler.is_binary_file("https://example.com/v1.0/api") is False
        assert handler.is_binary_file("https://example.com/jquery.min.js") is False

        # The exact URL that originally triggered the crawler error.
        zip_url = "https://docs.crawl4ai.com/apps/crawl4ai-assistant/crawl4ai-assistant-v1.3.0.zip"
        assert handler.is_binary_file(zip_url) is True

    def test_is_sitemap(self):
        """Sitemap URLs are recognized; regular pages are not."""
        handler = URLHandler()

        for url in (
            "https://example.com/sitemap.xml",
            "https://example.com/path/sitemap.xml",
            "https://example.com/sitemap/index.xml",
        ):
            assert handler.is_sitemap(url) is True
        assert handler.is_sitemap("https://example.com/regular-page") is False

    def test_is_txt(self):
        """Plain-text URLs are recognized; other extensions are not."""
        handler = URLHandler()

        assert handler.is_txt("https://example.com/robots.txt") is True
        assert handler.is_txt("https://example.com/readme.txt") is True
        assert handler.is_txt("https://example.com/file.pdf") is False

    def test_transform_github_url(self):
        """GitHub blob URLs become raw URLs; everything else is untouched."""
        handler = URLHandler()

        # Blob URL -> raw.githubusercontent.com equivalent.
        original = "https://github.com/owner/repo/blob/main/file.py"
        expected = "https://raw.githubusercontent.com/owner/repo/main/file.py"
        assert handler.transform_github_url(original) == expected

        # Non-blob GitHub URL passes through unchanged.
        non_blob = "https://github.com/owner/repo"
        assert handler.transform_github_url(non_blob) == non_blob

        # Non-GitHub URL passes through unchanged.
        other = "https://example.com/file"
        assert handler.transform_github_url(other) == other

0 commit comments

Comments
 (0)