@@ -218,3 +218,91 @@ def test_scrape_page_returns_none_for_empty_content(monkeypatch):
218218
219219 assert content is None
220220 assert metadata is None
221+
222+
def test_start_scraping_excludes_invalid_urls(monkeypatch):
    """An excluded URL supplied through ``urls_list`` must not end up in ``db.links``.

    The scraper is built with ``exclude_patterns=['/exclude']`` and handed
    three explicit URLs, one of which contains ``/exclude``. Link discovery,
    page conversion, the HTTP session and the progress bar are all replaced
    with stubs so the test performs no network access.
    """

    class FakeResponse:
        """Canned HTTP 200 response carrying an empty HTML document."""

        status_code = 200
        headers = {'content-type': 'text/html'}
        content = b'<html></html>'
        text = '<html></html>'

    class SilentProgress:
        """Progress-bar stand-in: remembers ``total`` and ignores every call."""

        def __init__(self, *args, **kwargs):
            self.total = kwargs.get('total', 0)

        def update(self, n):
            return None

        def refresh(self):
            return None

        def close(self):
            return None

    def no_links(self, url, html=None):
        # Link discovery finds nothing, so only the supplied URLs are in play.
        return set()

    def canned_page(self, html, url):
        # Every page converts to the same markdown, tagged with its own URL.
        return '# MD', {'url': url}

    def fake_get(url):
        return FakeResponse()

    def make_progress(*args, **kwargs):
        return SilentProgress(*args, **kwargs)

    store = ListDB()
    scraper = Scraper(
        base_url='http://example.com',
        exclude_patterns=['/exclude'],
        db_manager=store,
    )

    monkeypatch.setattr(Scraper, 'fetch_links', no_links)
    monkeypatch.setattr(Scraper, 'scrape_page', canned_page)
    monkeypatch.setattr(scraper.session, 'get', fake_get)
    monkeypatch.setattr(tqdm, 'tqdm', make_progress)

    excluded_url = 'http://example.com/exclude/page'
    requested = [
        'http://example.com/page1',
        excluded_url,
        'http://example.com/page2',
    ]

    scraper.start_scraping(urls_list=requested)

    assert excluded_url not in store.links
265+
266+
def test_start_scraping_filters_discovered_links(monkeypatch):
    """A link found in fetched HTML that matches an exclude pattern must not end up in ``db.links``.

    The stubbed session returns one HTML page containing three anchors, one
    of which points at ``/exclude/page``. ``fetch_links`` is left unpatched
    here; only page conversion, the HTTP session and the progress bar are
    replaced with stubs.
    """
    page_source = ''.join(
        [
            '<html><body>',
            '<a href="/page1">1</a>',
            '<a href="/exclude/page">2</a>',
            '<a href="/page2">3</a>',
            '</body></html>',
        ]
    )

    class FakeResponse:
        """Canned HTTP 200 response whose text is the three-link page above."""

        status_code = 200
        headers = {'content-type': 'text/html'}
        text = page_source

    class SilentProgress:
        """Progress-bar stand-in: remembers ``total`` and ignores every call."""

        def __init__(self, *args, **kwargs):
            self.total = kwargs.get('total', 0)

        def update(self, n):
            return None

        def refresh(self):
            return None

        def close(self):
            return None

    def fake_get(url):
        return FakeResponse()

    def canned_page(self, html, url):
        # Every page converts to the same markdown, tagged with its own URL.
        return '# MD', {'url': url}

    def make_progress(*args, **kwargs):
        return SilentProgress(*args, **kwargs)

    store = ListDB()
    scraper = Scraper(
        base_url='http://example.com',
        exclude_patterns=['/exclude'],
        db_manager=store,
    )

    monkeypatch.setattr(scraper.session, 'get', fake_get)
    monkeypatch.setattr(Scraper, 'scrape_page', canned_page)
    monkeypatch.setattr(tqdm, 'tqdm', make_progress)

    scraper.start_scraping(url='http://example.com')

    assert 'http://example.com/exclude/page' not in store.links
0 commit comments