Skip to content

Commit 0e99a97

Browse files
authored
Merge pull request #53 from obeone/codex/corrige-l-option-e
fix(scraper): filter invalid urls without mutation
2 parents: dc63ef5 + 70e610b; commit: 0e99a97

2 files changed

Lines changed: 97 additions & 6 deletions

File tree

crawler_to_md/scraper.py

Lines changed: 9 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -186,7 +186,7 @@ def scrape_page(self, html, url):
186186
logger.error(f"Error scraping {url}: {e}")
187187
return None, None
188188

189-
def start_scraping(self, url=None, urls_list=[]):
189+
def start_scraping(self, url=None, urls_list=None):
190190
"""
191191
Initiates the scraping process for a single URL or a list of URLs.
192192
It validates URLs, logs the scraping process, and manages the
@@ -197,15 +197,18 @@ def start_scraping(self, url=None, urls_list=[]):
197197
urls_list (list, optional): A list of URLs to scrape.
198198
"""
199199
# Validate and insert the provided URLs into the database
200-
if urls_list:
201-
# Iterate through the list to check for valid URLs
202-
for url_item in urls_list:
200+
urls = urls_list or []
201+
if urls:
202+
# Build a new list of valid URLs without modifying the original list
203+
validated_urls = []
204+
for url_item in urls:
203205
if not self.is_valid_link(url_item):
204206
logger.warning(f"Skipping invalid URL: {url_item}")
205-
urls_list.remove(url_item) # Remove invalid URLs from the list
207+
continue
208+
validated_urls.append(url_item)
206209

207210
# Insert the validated list of URLs into the database
208-
self.db_manager.insert_link(urls_list)
211+
self.db_manager.insert_link(validated_urls)
209212
elif url:
210213
# Insert a single URL if provided and valid
211214
self.db_manager.insert_link(url)

tests/test_scraper.py

Lines changed: 88 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -218,3 +218,91 @@ def test_scrape_page_returns_none_for_empty_content(monkeypatch):
218218

219219
assert content is None
220220
assert metadata is None
221+
222+
223+
def test_start_scraping_excludes_invalid_urls(monkeypatch):
224+
db = ListDB()
225+
scraper = Scraper(
226+
base_url='http://example.com',
227+
exclude_patterns=['/exclude'],
228+
db_manager=db,
229+
)
230+
231+
monkeypatch.setattr(Scraper, 'fetch_links', lambda self, url, html=None: set())
232+
monkeypatch.setattr(
233+
Scraper, 'scrape_page', lambda self, html, url: ('# MD', {'url': url})
234+
)
235+
236+
class DummyResp:
237+
status_code = 200
238+
headers = {'content-type': 'text/html'}
239+
content = b'<html></html>'
240+
text = '<html></html>'
241+
242+
monkeypatch.setattr(scraper.session, 'get', lambda url: DummyResp())
243+
244+
class DummyTqdm:
245+
def __init__(self, *a, **k):
246+
self.total = k.get('total', 0)
247+
def update(self, n):
248+
pass
249+
def refresh(self):
250+
pass
251+
def close(self):
252+
pass
253+
254+
monkeypatch.setattr(tqdm, 'tqdm', lambda *a, **k: DummyTqdm(*a, **k))
255+
256+
urls = [
257+
'http://example.com/page1',
258+
'http://example.com/exclude/page',
259+
'http://example.com/page2',
260+
]
261+
262+
scraper.start_scraping(urls_list=urls)
263+
264+
assert 'http://example.com/exclude/page' not in db.links
265+
266+
267+
def test_start_scraping_filters_discovered_links(monkeypatch):
268+
db = ListDB()
269+
scraper = Scraper(
270+
base_url='http://example.com',
271+
exclude_patterns=['/exclude'],
272+
db_manager=db,
273+
)
274+
275+
html = (
276+
'<html><body>'
277+
'<a href="/page1">1</a>'
278+
'<a href="/exclude/page">2</a>'
279+
'<a href="/page2">3</a>'
280+
'</body></html>'
281+
)
282+
283+
class DummyResp:
284+
status_code = 200
285+
headers = {'content-type': 'text/html'}
286+
text = html
287+
288+
monkeypatch.setattr(scraper.session, 'get', lambda url: DummyResp())
289+
290+
monkeypatch.setattr(
291+
Scraper, 'scrape_page', lambda self, html, url: ('# MD', {'url': url})
292+
)
293+
294+
class DummyTqdm:
295+
def __init__(self, *a, **k):
296+
self.total = k.get('total', 0)
297+
def update(self, n):
298+
pass
299+
def refresh(self):
300+
pass
301+
def close(self):
302+
pass
303+
304+
monkeypatch.setattr(tqdm, 'tqdm', lambda *a, **k: DummyTqdm(*a, **k))
305+
306+
scraper.start_scraping(url='http://example.com')
307+
308+
assert 'http://example.com/exclude/page' not in db.links

0 commit comments

Comments (0)