1+ """Unit tests for URLHandler class."""
2+ import pytest
3+ from src .server .services .crawling .helpers .url_handler import URLHandler
4+
5+
class TestURLHandler:
    """Checks for URLHandler's URL classification and rewriting helpers."""

    def test_is_binary_file_archives(self):
        """Archive URLs are expected to be reported as binary."""
        url_handler = URLHandler()

        # A spread of common archive extensions, including a double
        # extension (.tar.gz).
        archive_urls = (
            "https://example.com/file.zip",
            "https://example.com/archive.tar.gz",
            "https://example.com/compressed.rar",
            "https://example.com/package.7z",
            "https://example.com/backup.tgz",
        )
        for url in archive_urls:
            assert url_handler.is_binary_file(url) is True

    def test_is_binary_file_executables(self):
        """Executable and installer URLs are expected to be reported as binary."""
        url_handler = URLHandler()

        installer_urls = (
            "https://example.com/setup.exe",
            "https://example.com/installer.dmg",
            "https://example.com/package.deb",
            "https://example.com/app.msi",
            "https://example.com/program.appimage",
        )
        for url in installer_urls:
            assert url_handler.is_binary_file(url) is True

    def test_is_binary_file_documents(self):
        """Office/PDF document URLs are expected to be reported as binary."""
        url_handler = URLHandler()

        document_urls = (
            "https://example.com/document.pdf",
            "https://example.com/report.docx",
            "https://example.com/spreadsheet.xlsx",
            "https://example.com/presentation.pptx",
        )
        for url in document_urls:
            assert url_handler.is_binary_file(url) is True

    def test_is_binary_file_media(self):
        """Image, audio and video URLs are expected to be reported as binary."""
        url_handler = URLHandler()

        media_urls = (
            # Images
            "https://example.com/photo.jpg",
            "https://example.com/image.png",
            "https://example.com/icon.svg",
            "https://example.com/favicon.ico",
            # Audio/Video
            "https://example.com/song.mp3",
            "https://example.com/video.mp4",
            "https://example.com/movie.mkv",
        )
        for url in media_urls:
            assert url_handler.is_binary_file(url) is True

    def test_is_binary_file_case_insensitive(self):
        """Upper- and mixed-case extensions must be recognised as well."""
        url_handler = URLHandler()

        shouting_urls = (
            "https://example.com/FILE.ZIP",
            "https://example.com/Document.PDF",
            "https://example.com/Image.PNG",
        )
        for url in shouting_urls:
            assert url_handler.is_binary_file(url) is True

    def test_is_binary_file_with_query_params(self):
        """A trailing query string or fragment must not hide the extension."""
        url_handler = URLHandler()

        decorated_urls = (
            "https://example.com/file.zip?version=1.0",
            "https://example.com/document.pdf?download=true",
            "https://example.com/image.png#section",
        )
        for url in decorated_urls:
            assert url_handler.is_binary_file(url) is True

    def test_is_binary_file_html_pages(self):
        """Ordinary page URLs must not be reported as binary."""
        url_handler = URLHandler()

        # Site root, extension-less paths, and typical page extensions.
        page_urls = (
            "https://example.com/",
            "https://example.com/index.html",
            "https://example.com/page",
            "https://example.com/blog/post",
            "https://example.com/about.htm",
            "https://example.com/contact.php",
        )
        for url in page_urls:
            assert url_handler.is_binary_file(url) is False

    def test_is_binary_file_edge_cases(self):
        """Dots in the path and a real-world versioned archive name."""
        url_handler = URLHandler()

        expectations = (
            # Periods in the path that are not file extensions
            ("https://example.com/v1.0/api", False),
            # JS files might be crawlable
            ("https://example.com/jquery.min.js", False),
            # Real-world example from the error
            (
                "https://docs.crawl4ai.com/apps/crawl4ai-assistant/crawl4ai-assistant-v1.3.0.zip",
                True,
            ),
        )
        for url, expected in expectations:
            assert url_handler.is_binary_file(url) is expected

    def test_is_sitemap(self):
        """Sitemap URLs are recognised; a regular page is not."""
        url_handler = URLHandler()

        expectations = (
            ("https://example.com/sitemap.xml", True),
            ("https://example.com/path/sitemap.xml", True),
            ("https://example.com/sitemap/index.xml", True),
            ("https://example.com/regular-page", False),
        )
        for url, expected in expectations:
            assert url_handler.is_sitemap(url) is expected

    def test_is_txt(self):
        """Only .txt URLs count as text files."""
        url_handler = URLHandler()

        expectations = (
            ("https://example.com/robots.txt", True),
            ("https://example.com/readme.txt", True),
            ("https://example.com/file.pdf", False),
        )
        for url, expected in expectations:
            assert url_handler.is_txt(url) is expected

    def test_transform_github_url(self):
        """GitHub blob URLs become raw URLs; everything else is returned as-is."""
        url_handler = URLHandler()

        passthrough_repo = "https://github.com/owner/repo"
        passthrough_other = "https://example.com/file"
        rewrites = (
            # Blob URLs are rewritten to their raw.githubusercontent.com form
            (
                "https://github.com/owner/repo/blob/main/file.py",
                "https://raw.githubusercontent.com/owner/repo/main/file.py",
            ),
            # Non-blob GitHub URLs come back unchanged
            (passthrough_repo, passthrough_repo),
            # Non-GitHub URLs come back unchanged
            (passthrough_other, passthrough_other),
        )
        for source_url, expected_url in rewrites:
            assert url_handler.transform_github_url(source_url) == expected_url
0 commit comments