fix(scraper): filter invalid urls without mutation

obeone · obeone · commit 9ecb8cb48571 · 2025-07-13T17:51:59.000+02:00
diff --git a/crawler_to_md/scraper.py b/crawler_to_md/scraper.py
@@ -198,14 +198,16 @@ def start_scraping(self, url=None, urls_list=[]):
         """
         # Validate and insert the provided URLs into the database
         if urls_list:
-            # Iterate through the list to check for valid URLs
+            # Build a new list of valid URLs without modifying the original list
+            validated_urls = []
             for url_item in urls_list:
                 if not self.is_valid_link(url_item):
                     logger.warning(f"Skipping invalid URL: {url_item}")
-                    urls_list.remove(url_item)  # Remove invalid URLs from the list
+                    continue
+                validated_urls.append(url_item)
 
             # Insert the validated list of URLs into the database
-            self.db_manager.insert_link(urls_list)
+            self.db_manager.insert_link(validated_urls)
         elif url:
             # Insert a single URL if provided and valid
             self.db_manager.insert_link(url)
diff --git a/tests/test_scraper.py b/tests/test_scraper.py
@@ -218,3 +218,47 @@ def test_scrape_page_returns_none_for_empty_content(monkeypatch):
 
     assert content is None
     assert metadata is None
+
+
+def test_start_scraping_excludes_invalid_urls(monkeypatch):  # invalid URL must not be stored
+    db = ListDB()  # helper defined outside this hunk; its `.links` is asserted on below
+    scraper = Scraper(
+        base_url='http://example.com',
+        exclude_patterns=['/exclude'],  # substring of the middle URL in `urls` below
+        db_manager=db,
+    )
+    # Stub fetch_links so it always returns an empty set.
+    monkeypatch.setattr(Scraper, 'fetch_links', lambda self, url, html=None: set())
+    monkeypatch.setattr(  # stub scrape_page with a canned (content, metadata) pair
+        Scraper, 'scrape_page', lambda self, html, url: ('# MD', {'url': url})
+    )
+    # Canned 200 text/html response returned by the patched session.get below.
+    class DummyResp:
+        status_code = 200
+        headers = {'content-type': 'text/html'}
+        content = b'<html></html>'
+        text = '<html></html>'
+
+    monkeypatch.setattr(scraper.session, 'get', lambda url: DummyResp())
+    # No-op progress bar; it only remembers the `total` keyword argument.
+    class DummyTqdm:
+        def __init__(self, *a, **k):
+            self.total = k.get('total', 0)
+        def update(self, n):
+            pass
+        def refresh(self):
+            pass
+        def close(self):
+            pass
+    # Replace tqdm.tqdm with the no-op stand-in defined above.
+    monkeypatch.setattr(tqdm, 'tqdm', lambda *a, **k: DummyTqdm(*a, **k))
+    # The middle entry contains the '/exclude' pattern passed to Scraper above.
+    urls = [
+        'http://example.com/page1',
+        'http://example.com/exclude/page',
+        'http://example.com/page2',
+    ]
+
+    scraper.start_scraping(urls_list=urls)
+    # The URL containing '/exclude' must not appear in db.links afterwards.
+    assert 'http://example.com/exclude/page' not in db.links