@@ -47,3 +47,72 @@ def test_scrape_page_parses_content_and_metadata():
4747 assert 'Hello' in content
4848 assert metadata .get ('title' ) == 'Test'
4949 assert metadata .get ('url' ) == 'http://example.com/test'
50+
51+ import requests
52+ import tqdm
53+
class ListDB(DummyDB):
    """In-memory database fake backed by plain Python containers.

    Links are stored in insertion order in a list, visited URLs in a set,
    and stored pages as ``(url, content, metadata)`` tuples.
    """

    def __init__(self):
        # The base-class initializer is not called here (unchanged from the
        # original); DummyDB is defined outside this view.
        self.links = []
        self.visited = set()
        self.pages = []

    def insert_link(self, url, visited=False):
        """Record one URL or a list of URLs, skipping those already known.

        Args:
            url: A single URL, or a list of URLs.
            visited: When true, every newly added URL is also marked as
                visited. URLs that were already present are left untouched.

        Returns:
            True if at least one new URL was added, False otherwise.
        """
        candidates = url if isinstance(url, list) else [url]
        inserted = False
        for candidate in candidates:
            if candidate in self.links:
                continue
            self.links.append(candidate)
            if visited:
                # Honor the flag so a pre-visited link never shows up in
                # get_unvisited_links().
                self.visited.add(candidate)
            inserted = True
        return inserted

    def get_unvisited_links(self):
        """Return the not-yet-visited URLs as one-element tuples, in insertion order."""
        # One-element tuples are kept as in the original; presumably they
        # mirror database rows - verify against the real DB manager.
        return [(u,) for u in self.links if u not in self.visited]

    def mark_link_visited(self, url):
        """Mark ``url`` as visited."""
        self.visited.add(url)

    def get_links_count(self):
        """Return the number of distinct URLs recorded so far."""
        return len(self.links)

    def get_visited_links_count(self):
        """Return the number of URLs marked as visited."""
        return len(self.visited)

    def insert_page(self, url, content, metadata):
        """Store a page as a ``(url, content, metadata)`` tuple."""
        self.pages.append((url, content, metadata))

    def get_all_pages(self):
        """Return the list of stored ``(url, content, metadata)`` tuples."""
        return self.pages
86+
87+
def test_start_scraping_process(monkeypatch):
    """Run ``start_scraping`` on a single URL with all external effects stubbed.

    Link discovery, page parsing, the HTTP request and the progress bar are
    replaced with stubs, so the test only checks the bookkeeping done against
    the database: the seed URL is recorded, marked visited, and stored as a
    page.
    """
    db = ListDB()
    scraper = Scraper(base_url='http://example.com', exclude_patterns=[], db_manager=db)

    # No outgoing links are reported, so the seed URL is the only link the
    # stubbed fetch_links can contribute.
    monkeypatch.setattr(Scraper, 'fetch_links', lambda self, url, html=None: set())
    monkeypatch.setattr(Scraper, 'scrape_page', lambda self, html, url: ('# MD', {'url': url}))

    class DummyResp:
        """Minimal stand-in for an HTTP response carrying an empty HTML page."""

        status_code = 200
        headers = {'content-type': 'text/html'}
        content = b'<html></html>'

    # Accept extra positional/keyword arguments (e.g. timeout=, headers=) the
    # way the real requests.get does, so the stub does not raise TypeError if
    # the scraper passes them.
    monkeypatch.setattr(requests, 'get', lambda url, *args, **kwargs: DummyResp())

    class DummyTqdm:
        """No-op progress bar exposing only the members this test stubs."""

        def __init__(self, *a, **k):
            self.total = k.get('total', 0)

        def update(self, n=1):
            # n defaults to 1 to match the real tqdm.update signature.
            pass

        def refresh(self):
            pass

        def close(self):
            pass

    # NOTE(review): this replaces the attribute on the tqdm module only. If
    # the scraper module did `from tqdm import tqdm`, its own reference is
    # unaffected - confirm against the scraper's imports.
    monkeypatch.setattr(tqdm, 'tqdm', lambda *a, **k: DummyTqdm(*a, **k))

    scraper.start_scraping(url='http://example.com/page')

    assert db.get_links_count() == 1
    assert db.get_visited_links_count() == 1
    # Fail with a clear assertion instead of an IndexError when nothing was stored.
    assert db.pages
    assert db.pages[0][0] == 'http://example.com/page'
0 commit comments