Skip to content

Commit 9ecb8cb

Browse files
committed
fix(scraper): filter invalid urls without mutation
1 parent dc63ef5 commit 9ecb8cb

2 files changed

Lines changed: 49 additions & 3 deletions

File tree

crawler_to_md/scraper.py

Lines changed: 5 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -198,14 +198,16 @@ def start_scraping(self, url=None, urls_list=[]):
198198
"""
199199
# Validate and insert the provided URLs into the database
200200
if urls_list:
201-
# Iterate through the list to check for valid URLs
201+
# Build a new list of valid URLs without modifying the original list
202+
validated_urls = []
202203
for url_item in urls_list:
203204
if not self.is_valid_link(url_item):
204205
logger.warning(f"Skipping invalid URL: {url_item}")
205-
urls_list.remove(url_item) # Remove invalid URLs from the list
206+
continue
207+
validated_urls.append(url_item)
206208

207209
# Insert the validated list of URLs into the database
208-
self.db_manager.insert_link(urls_list)
210+
self.db_manager.insert_link(validated_urls)
209211
elif url:
210212
# Insert a single URL if provided and valid
211213
self.db_manager.insert_link(url)

tests/test_scraper.py

Lines changed: 44 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -218,3 +218,47 @@ def test_scrape_page_returns_none_for_empty_content(monkeypatch):
218218

219219
assert content is None
220220
assert metadata is None
221+
222+
223+
def test_start_scraping_excludes_invalid_urls(monkeypatch):
    """Check that start_scraping drops invalid URLs and leaves its input intact.

    Two *adjacent* excluded URLs are used on purpose: the previous
    implementation called ``urls_list.remove(url_item)`` while iterating over
    ``urls_list``, which makes the iterator skip the element right after each
    removal. A single excluded URL between two valid ones does not expose
    that, so it would pass against the old code as well.
    """
    db = ListDB()
    scraper = Scraper(
        base_url='http://example.com',
        exclude_patterns=['/exclude'],
        db_manager=db,
    )

    # Stub out link discovery and page conversion so only the URL filtering
    # in start_scraping decides what ends up in the database.
    monkeypatch.setattr(Scraper, 'fetch_links', lambda self, url, html=None: set())
    monkeypatch.setattr(
        Scraper, 'scrape_page', lambda self, html, url: ('# MD', {'url': url})
    )

    class DummyResp:
        # Minimal stand-in for the HTTP response returned by session.get.
        status_code = 200
        headers = {'content-type': 'text/html'}
        content = b'<html></html>'
        text = '<html></html>'

    monkeypatch.setattr(scraper.session, 'get', lambda url: DummyResp())

    class DummyTqdm:
        # No-op progress bar replacing tqdm.tqdm for the duration of the test.
        def __init__(self, *a, **k):
            self.total = k.get('total', 0)

        def update(self, n):
            pass

        def refresh(self):
            pass

        def close(self):
            pass

    monkeypatch.setattr(tqdm, 'tqdm', lambda *a, **k: DummyTqdm(*a, **k))

    # NOTE(review): assumes every URL containing '/exclude' is rejected by
    # is_valid_link, as the original single-URL case already relied on.
    urls = [
        'http://example.com/page1',
        'http://example.com/exclude/page',
        'http://example.com/exclude/other',
        'http://example.com/page2',
    ]
    original_urls = list(urls)

    scraper.start_scraping(urls_list=urls)

    assert 'http://example.com/exclude/page' not in db.links
    # The second of two adjacent invalid URLs is the one the old
    # remove-while-iterating loop let through.
    assert 'http://example.com/exclude/other' not in db.links
    # The caller's list must not be modified by start_scraping.
    assert urls == original_urls
    # NOTE(review): consider also asserting that page1/page2 are present once
    # the element type stored in ListDB.links is confirmed.

0 commit comments

Comments
 (0)