Skip to content

Commit 8743c05

Browse files
authored
Merge pull request coleam00#218 from coleam00/fix/filter-binary-files-from-crawl
Fix crawler attempting to navigate to binary files
2 parents f96a9a4 + 8157670 commit 8743c05

File tree

3 files changed

+179
-1
lines changed

3 files changed

+179
-1
lines changed

python/src/server/services/crawling/helpers/url_handler.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,54 @@ def is_txt(url: str) -> bool:
4848
logger.warning(f"Error checking if URL is text file: {e}")
4949
return False
5050

51+
@staticmethod
52+
def is_binary_file(url: str) -> bool:
53+
"""
54+
Check if a URL points to a binary file that shouldn't be crawled.
55+
56+
Args:
57+
url: URL to check
58+
59+
Returns:
60+
True if URL is a binary file, False otherwise
61+
"""
62+
try:
63+
# Remove query parameters and fragments for cleaner extension checking
64+
parsed = urlparse(url)
65+
path = parsed.path.lower()
66+
67+
# Comprehensive list of binary and non-HTML file extensions
68+
binary_extensions = {
69+
# Archives
70+
'.zip', '.tar', '.gz', '.rar', '.7z', '.bz2', '.xz', '.tgz',
71+
# Executables and installers
72+
'.exe', '.dmg', '.pkg', '.deb', '.rpm', '.msi', '.app', '.appimage',
73+
# Documents (non-HTML)
74+
'.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.odt', '.ods',
75+
# Images
76+
'.jpg', '.jpeg', '.png', '.gif', '.svg', '.webp', '.ico', '.bmp', '.tiff',
77+
# Audio/Video
78+
'.mp3', '.mp4', '.avi', '.mov', '.wmv', '.flv', '.webm', '.mkv', '.wav', '.flac',
79+
# Data files
80+
'.csv', '.sql', '.db', '.sqlite',
81+
# Binary data
82+
'.iso', '.img', '.bin', '.dat',
83+
# Development files (usually not meant to be crawled as pages)
84+
'.wasm', '.pyc', '.jar', '.war', '.class', '.dll', '.so', '.dylib'
85+
}
86+
87+
# Check if the path ends with any binary extension
88+
for ext in binary_extensions:
89+
if path.endswith(ext):
90+
logger.debug(f"Skipping binary file: {url} (matched extension: {ext})")
91+
return True
92+
93+
return False
94+
except Exception as e:
95+
logger.warning(f"Error checking if URL is binary file: {e}")
96+
# In case of error, don't skip the URL (safer to attempt crawl than miss content)
97+
return False
98+
5199
@staticmethod
52100
def transform_github_url(url: str) -> str:
53101
"""

python/src/server/services/crawling/strategies/recursive.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from crawl4ai import CrawlerRunConfig, CacheMode, MemoryAdaptiveDispatcher
1111
from ....config.logfire_config import get_logger
1212
from ...credential_service import credential_service
13+
from ..helpers.url_handler import URLHandler
1314

1415
logger = get_logger(__name__)
1516

@@ -27,6 +28,7 @@ def __init__(self, crawler, markdown_generator):
2728
"""
2829
self.crawler = crawler
2930
self.markdown_generator = markdown_generator
31+
self.url_handler = URLHandler()
3032

3133
async def crawl_recursive_with_progress(
3234
self,
@@ -195,8 +197,11 @@ def normalize_url(url):
195197
# Find internal links for next depth
196198
for link in result.links.get("internal", []):
197199
next_url = normalize_url(link["href"])
198-
if next_url not in visited:
200+
# Skip binary files and already visited URLs
201+
if next_url not in visited and not self.url_handler.is_binary_file(next_url):
199202
next_level_urls.add(next_url)
203+
elif self.url_handler.is_binary_file(next_url):
204+
logger.debug(f"Skipping binary file from crawl queue: {next_url}")
200205
else:
201206
logger.warning(f"Failed to crawl {original_url}: {getattr(result, 'error_message', 'Unknown error')}")
202207

python/tests/test_url_handler.py

Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
"""Unit tests for URLHandler class."""
2+
import pytest
3+
from src.server.services.crawling.helpers.url_handler import URLHandler
4+
5+
6+
class TestURLHandler:
    """Exercise URLHandler's URL classification and rewriting helpers."""

    def test_is_binary_file_archives(self):
        """Archive downloads are reported as binary."""
        handler = URLHandler()

        # Should detect various archive formats
        archive_urls = (
            "https://example.com/file.zip",
            "https://example.com/archive.tar.gz",
            "https://example.com/compressed.rar",
            "https://example.com/package.7z",
            "https://example.com/backup.tgz",
        )
        for url in archive_urls:
            assert handler.is_binary_file(url) is True

    def test_is_binary_file_executables(self):
        """Executables and installers are reported as binary."""
        handler = URLHandler()

        installer_urls = (
            "https://example.com/setup.exe",
            "https://example.com/installer.dmg",
            "https://example.com/package.deb",
            "https://example.com/app.msi",
            "https://example.com/program.appimage",
        )
        for url in installer_urls:
            assert handler.is_binary_file(url) is True

    def test_is_binary_file_documents(self):
        """Office-style and PDF documents are reported as binary."""
        handler = URLHandler()

        document_urls = (
            "https://example.com/document.pdf",
            "https://example.com/report.docx",
            "https://example.com/spreadsheet.xlsx",
            "https://example.com/presentation.pptx",
        )
        for url in document_urls:
            assert handler.is_binary_file(url) is True

    def test_is_binary_file_media(self):
        """Image, audio and video files are reported as binary."""
        handler = URLHandler()

        # Images
        image_urls = (
            "https://example.com/photo.jpg",
            "https://example.com/image.png",
            "https://example.com/icon.svg",
            "https://example.com/favicon.ico",
        )
        for url in image_urls:
            assert handler.is_binary_file(url) is True

        # Audio/Video
        av_urls = (
            "https://example.com/song.mp3",
            "https://example.com/video.mp4",
            "https://example.com/movie.mkv",
        )
        for url in av_urls:
            assert handler.is_binary_file(url) is True

    def test_is_binary_file_case_insensitive(self):
        """Upper- and mixed-case extensions are still recognised."""
        handler = URLHandler()

        mixed_case_urls = (
            "https://example.com/FILE.ZIP",
            "https://example.com/Document.PDF",
            "https://example.com/Image.PNG",
        )
        for url in mixed_case_urls:
            assert handler.is_binary_file(url) is True

    def test_is_binary_file_with_query_params(self):
        """Query strings and fragments do not hide the extension."""
        handler = URLHandler()

        decorated_urls = (
            "https://example.com/file.zip?version=1.0",
            "https://example.com/document.pdf?download=true",
            "https://example.com/image.png#section",
        )
        for url in decorated_urls:
            assert handler.is_binary_file(url) is True

    def test_is_binary_file_html_pages(self):
        """Ordinary pages are not mistaken for binary files."""
        handler = URLHandler()

        # Regular HTML pages should not be detected as binary
        page_urls = (
            "https://example.com/",
            "https://example.com/index.html",
            "https://example.com/page",
            "https://example.com/blog/post",
            "https://example.com/about.htm",
            "https://example.com/contact.php",
        )
        for url in page_urls:
            assert handler.is_binary_file(url) is False

    def test_is_binary_file_edge_cases(self):
        """Dots inside the path and a real-world download URL."""
        handler = URLHandler()

        # URLs with periods in path but not file extensions
        dotted_but_crawlable = (
            "https://example.com/v1.0/api",
            "https://example.com/jquery.min.js",  # JS files might be crawlable
        )
        for url in dotted_but_crawlable:
            assert handler.is_binary_file(url) is False

        # Real-world example from the error
        real_world_zip = (
            "https://docs.crawl4ai.com/apps/crawl4ai-assistant/"
            "crawl4ai-assistant-v1.3.0.zip"
        )
        assert handler.is_binary_file(real_world_zip) is True

    def test_is_sitemap(self):
        """Sitemap URLs are recognised; regular pages are not."""
        handler = URLHandler()

        sitemap_urls = (
            "https://example.com/sitemap.xml",
            "https://example.com/path/sitemap.xml",
            "https://example.com/sitemap/index.xml",
        )
        for url in sitemap_urls:
            assert handler.is_sitemap(url) is True

        assert handler.is_sitemap("https://example.com/regular-page") is False

    def test_is_txt(self):
        """Plain-text URLs are recognised; other file types are not."""
        handler = URLHandler()

        for url in (
            "https://example.com/robots.txt",
            "https://example.com/readme.txt",
        ):
            assert handler.is_txt(url) is True

        assert handler.is_txt("https://example.com/file.pdf") is False

    def test_transform_github_url(self):
        """GitHub blob URLs become raw URLs; everything else is untouched."""
        handler = URLHandler()

        expected_by_input = {
            # Should transform GitHub blob URLs to raw URLs
            "https://github.com/owner/repo/blob/main/file.py":
                "https://raw.githubusercontent.com/owner/repo/main/file.py",
            # Should not transform non-blob URLs
            "https://github.com/owner/repo": "https://github.com/owner/repo",
            # Should not transform non-GitHub URLs
            "https://example.com/file": "https://example.com/file",
        }
        for source_url, expected in expected_by_input.items():
            assert handler.transform_github_url(source_url) == expected

0 commit comments

Comments
 (0)