@@ -218,3 +218,47 @@ def test_scrape_page_returns_none_for_empty_content(monkeypatch):
218218
219219 assert content is None
220220 assert metadata is None
221+
222+
def test_start_scraping_excludes_invalid_urls(monkeypatch):
    """Verify that a URL matching an exclude pattern is absent from ``db.links``.

    A ``Scraper`` is built with ``exclude_patterns=['/exclude']`` and a
    ``ListDB`` backend. Link discovery, page scraping, the session's ``get``
    and ``tqdm.tqdm`` are all replaced with in-test stubs, then
    ``start_scraping`` is run over three URLs, one of which contains
    ``/exclude``. The test asserts only that the excluded URL is not in
    ``db.links`` afterwards.
    """
    store = ListDB()
    scraper = Scraper(
        base_url='http://example.com',
        exclude_patterns=['/exclude'],
        db_manager=store,
    )

    def fake_fetch_links(self, url, html=None):
        # Report no outgoing links for any page.
        return set()

    def fake_scrape_page(self, html, url):
        # Return fixed markdown plus metadata that echoes the URL back.
        return ('# MD', {'url': url})

    monkeypatch.setattr(Scraper, 'fetch_links', fake_fetch_links)
    monkeypatch.setattr(Scraper, 'scrape_page', fake_scrape_page)

    class DummyResp:
        # Canned response: status 200 with an empty HTML document.
        status_code = 200
        headers = {'content-type': 'text/html'}
        content = b'<html></html>'
        text = '<html></html>'

    def fake_get(url):
        # Every request yields a fresh canned response.
        return DummyResp()

    monkeypatch.setattr(scraper.session, 'get', fake_get)

    class DummyTqdm:
        # Progress-bar stand-in: records ``total`` and ignores every call.
        def __init__(self, *a, **k):
            self.total = k.get('total', 0)

        def update(self, n):
            pass

        def refresh(self):
            pass

        def close(self):
            pass

    def fake_tqdm(*a, **k):
        return DummyTqdm(*a, **k)

    monkeypatch.setattr(tqdm, 'tqdm', fake_tqdm)

    excluded_url = 'http://example.com/exclude/page'
    candidates = [
        'http://example.com/page1',
        excluded_url,
        'http://example.com/page2',
    ]

    scraper.start_scraping(urls_list=candidates)

    assert excluded_url not in store.links
0 commit comments