@@ -28,17 +28,32 @@ def mark_link_visited(self, url):
2828def test_is_valid_link ():
2929 db = DummyDB ()
3030 scraper = Scraper (
31- base_url = 'https://example.com' , exclude_patterns = ['/exclude' ], db_manager = db
31+ base_url = 'https://example.com' ,
32+ exclude_patterns = ['/exclude' ],
33+ include_url_patterns = [],
34+ db_manager = db ,
3235 )
3336 assert scraper .is_valid_link ('https://example.com/page' )
3437 assert not scraper .is_valid_link ('https://example.com/exclude/page' )
3538 assert not scraper .is_valid_link ('https://other.com/' )
3639
40+ include_scraper = Scraper (
41+ base_url = 'https://example.com' ,
42+ exclude_patterns = [],
43+ include_url_patterns = ['/docs' ],
44+ db_manager = db ,
45+ )
46+ assert include_scraper .is_valid_link ('https://example.com/docs/page' )
47+ assert not include_scraper .is_valid_link ('https://example.com/blog' )
48+
3749
3850def test_fetch_links ():
3951 db = DummyDB ()
4052 scraper = Scraper (
41- base_url = 'https://example.com' , exclude_patterns = ['/exclude' ], db_manager = db
53+ base_url = 'https://example.com' ,
54+ exclude_patterns = ['/exclude' ],
55+ include_url_patterns = [],
56+ db_manager = db ,
4257 )
4358 html = '''<html><body>
4459 <a href="https://example.com/page1">1</a>
@@ -49,6 +64,23 @@ def test_fetch_links():
4964 assert links == {'https://example.com/page1' , 'https://example.com/page2' }
5065
5166
67+ def test_fetch_links_includes_only_matching_patterns ():
68+ db = DummyDB ()
69+ scraper = Scraper (
70+ base_url = 'https://example.com' ,
71+ exclude_patterns = [],
72+ include_url_patterns = ['/page1' ],
73+ db_manager = db ,
74+ )
75+ html = '''<html><body>
76+ <a href="https://example.com/page1">1</a>
77+ <a href="/page2">2</a>
78+ <a href="https://example.com/page3">3</a>
79+ </body></html>'''
80+ links = scraper .fetch_links (url = 'https://example.com' , html = html )
81+ assert links == {'https://example.com/page1' }
82+
83+
5284
5385
5486@patch ('os.remove' )
@@ -60,7 +92,12 @@ def test_scrape_page_parses_content_and_metadata(mock_tempfile, mock_os_remove):
6092 mock_tempfile .return_value .__enter__ .return_value = mock_file
6193
6294 db = DummyDB ()
63- scraper = Scraper (base_url = 'http://example.com' , exclude_patterns = [], db_manager = db )
95+ scraper = Scraper (
96+ base_url = 'http://example.com' ,
97+ exclude_patterns = [],
98+ include_url_patterns = [],
99+ db_manager = db ,
100+ )
64101 html = '<html><head><title>Test</title></head><body><p>Hello</p></body></html>'
65102
66103 # Act
@@ -83,7 +120,12 @@ def test_scrape_page_with_markitdown(mock_tempfile, mock_os_remove):
83120 mock_tempfile .return_value .__enter__ .return_value = mock_file
84121
85122 db = DummyDB ()
86- scraper = Scraper (base_url = 'http://example.com' , exclude_patterns = [], db_manager = db )
123+ scraper = Scraper (
124+ base_url = 'http://example.com' ,
125+ exclude_patterns = [],
126+ include_url_patterns = [],
127+ db_manager = db ,
128+ )
87129 html = (
88130 '<html><head><title>Test</title></head><body><h1>A Title</h1>'
89131 '<p>This is a paragraph with <strong>bold</strong> text.</p></body></html>'
@@ -121,6 +163,7 @@ def test_scrape_page_include_exclude(mock_tempfile, mock_os_remove):
121163 scraper = Scraper (
122164 base_url = 'http://example.com' ,
123165 exclude_patterns = [],
166+ include_url_patterns = [],
124167 db_manager = db ,
125168 include_filters = ['p' ],
126169 exclude_filters = ['.remove' ],
@@ -181,7 +224,12 @@ def get_all_pages(self):
181224
182225def test_start_scraping_process (monkeypatch ):
183226 db = ListDB ()
184- scraper = Scraper (base_url = 'http://example.com' , exclude_patterns = [], db_manager = db )
227+ scraper = Scraper (
228+ base_url = 'http://example.com' ,
229+ exclude_patterns = [],
230+ include_url_patterns = [],
231+ db_manager = db ,
232+ )
185233
186234 monkeypatch .setattr (Scraper , 'fetch_links' , lambda self , url , html = None : set ())
187235 monkeypatch .setattr (
@@ -219,7 +267,11 @@ def test_scraper_proxy_initialization(monkeypatch):
219267 db = DummyDB ()
220268 monkeypatch .setattr (Scraper , '_test_proxy' , lambda self : None )
221269 scraper = Scraper (
222- base_url = 'http://example.com' , exclude_patterns = [], db_manager = db , proxy = 'http://proxy:8080'
270+ base_url = 'http://example.com' ,
271+ exclude_patterns = [],
272+ include_url_patterns = [],
273+ db_manager = db ,
274+ proxy = 'http://proxy:8080'
223275 )
224276 assert scraper .session .proxies .get ('http' ) == 'http://proxy:8080'
225277 assert scraper .session .proxies .get ('https' ) == 'http://proxy:8080'
@@ -230,7 +282,11 @@ def test_scraper_socks_proxy_initialization(monkeypatch):
230282 proxy = 'socks5://localhost:9050'
231283 monkeypatch .setattr (Scraper , '_test_proxy' , lambda self : None )
232284 scraper = Scraper (
233- base_url = 'http://example.com' , exclude_patterns = [], db_manager = db , proxy = proxy
285+ base_url = 'http://example.com' ,
286+ exclude_patterns = [],
287+ include_url_patterns = [],
288+ db_manager = db ,
289+ proxy = proxy
234290 )
235291 assert scraper .session .proxies .get ('http' ) == proxy
236292 assert scraper .session .proxies .get ('https' ) == proxy
@@ -244,13 +300,22 @@ def fake_head(self, url, timeout=5):
244300 monkeypatch .setattr (requests .Session , 'head' , fake_head )
245301 with pytest .raises (ValueError ):
246302 Scraper (
247- base_url = 'http://example.com' , exclude_patterns = [], db_manager = db , proxy = 'http://proxy:8080'
303+ base_url = 'http://example.com' ,
304+ exclude_patterns = [],
305+ include_url_patterns = [],
306+ db_manager = db ,
307+ proxy = 'http://proxy:8080'
248308 )
249309
250310
251311def test_scrape_page_returns_none_for_empty_content (monkeypatch ):
252312 db = DummyDB ()
253- scraper = Scraper (base_url = 'http://example.com' , exclude_patterns = [], db_manager = db )
313+ scraper = Scraper (
314+ base_url = 'http://example.com' ,
315+ exclude_patterns = [],
316+ include_url_patterns = [],
317+ db_manager = db ,
318+ )
254319 html = '<html><body></body></html>'
255320
256321 with patch ('crawler_to_md.scraper.MarkItDown' ) as mock_markdown :
@@ -266,6 +331,7 @@ def test_start_scraping_excludes_invalid_urls(monkeypatch):
266331 scraper = Scraper (
267332 base_url = 'http://example.com' ,
268333 exclude_patterns = ['/exclude' ],
334+ include_url_patterns = [],
269335 db_manager = db ,
270336 )
271337
@@ -310,6 +376,7 @@ def test_start_scraping_filters_discovered_links(monkeypatch):
310376 scraper = Scraper (
311377 base_url = 'http://example.com' ,
312378 exclude_patterns = ['/exclude' ],
379+ include_url_patterns = [],
313380 db_manager = db ,
314381 )
315382
0 commit comments