Merge pull request #53 from obeone/codex/corrige-l-option-e

obeone · web-flow · commit 0e99a9759051 · 2025-07-13T18:02:54.000+02:00
fix(scraper): filter invalid urls without mutation
diff --git a/crawler_to_md/scraper.py b/crawler_to_md/scraper.py
@@ -186,7 +186,7 @@ def scrape_page(self, html, url):
             logger.error(f"Error scraping {url}: {e}")
             return None, None
 
-    def start_scraping(self, url=None, urls_list=[]):
+    def start_scraping(self, url=None, urls_list=None):
         """
         Initiates the scraping process for a single URL or a list of URLs.
         It validates URLs, logs the scraping process, and manages the
@@ -197,15 +197,18 @@ def start_scraping(self, url=None, urls_list=[]):
             urls_list (list, optional): A list of URLs to scrape.
         """
         # Validate and insert the provided URLs into the database
-        if urls_list:
-            # Iterate through the list to check for valid URLs
-            for url_item in urls_list:
+        urls = urls_list or []
+        if urls:
+            # Build a new list of valid URLs without modifying the original list
+            validated_urls = []
+            for url_item in urls:
                 if not self.is_valid_link(url_item):
                     logger.warning(f"Skipping invalid URL: {url_item}")
-                    urls_list.remove(url_item)  # Remove invalid URLs from the list
+                    continue
+                validated_urls.append(url_item)
 
             # Insert the validated list of URLs into the database
-            self.db_manager.insert_link(urls_list)
+            self.db_manager.insert_link(validated_urls)
         elif url:
             # Insert a single URL if provided and valid
             self.db_manager.insert_link(url)
diff --git a/tests/test_scraper.py b/tests/test_scraper.py
@@ -218,3 +218,91 @@ def test_scrape_page_returns_none_for_empty_content(monkeypatch):
 
     assert content is None
     assert metadata is None
+
+
+def test_start_scraping_excludes_invalid_urls(monkeypatch):
+    db = ListDB()  # db manager object defined outside this hunk; the final assertion inspects db.links
+    scraper = Scraper(
+        base_url='http://example.com',
+        exclude_patterns=['/exclude'],  # the middle entry of `urls` below contains this pattern
+        db_manager=db,
+    )
+
+    monkeypatch.setattr(Scraper, 'fetch_links', lambda self, url, html=None: set())  # stub: no links are discovered
+    monkeypatch.setattr(  # stub: every page scrapes to fixed markdown plus metadata holding its URL
+        Scraper, 'scrape_page', lambda self, html, url: ('# MD', {'url': url})
+    )
+
+    class DummyResp:  # canned response object handed back by the patched session.get below
+        status_code = 200
+        headers = {'content-type': 'text/html'}
+        content = b'<html></html>'
+        text = '<html></html>'
+
+    monkeypatch.setattr(scraper.session, 'get', lambda url: DummyResp())  # session.get now returns DummyResp for any URL
+
+    class DummyTqdm:  # replacement for tqdm.tqdm whose update/refresh/close do nothing
+        def __init__(self, *a, **k):
+            self.total = k.get('total', 0)  # keep a `total` attribute, defaulting to 0 when not passed
+        def update(self, n):
+            pass
+        def refresh(self):
+            pass
+        def close(self):
+            pass
+
+    monkeypatch.setattr(tqdm, 'tqdm', lambda *a, **k: DummyTqdm(*a, **k))  # swap tqdm.tqdm for the no-op class
+
+    urls = [
+        'http://example.com/page1',
+        'http://example.com/exclude/page',  # the URL the assertion expects to be filtered out
+        'http://example.com/page2',
+    ]
+
+    scraper.start_scraping(urls_list=urls)  # exercise the urls_list entry path
+
+    assert 'http://example.com/exclude/page' not in db.links  # the excluded URL must not have been stored
+
+
+def test_start_scraping_filters_discovered_links(monkeypatch):
+    db = ListDB()  # db manager object defined outside this hunk; the final assertion inspects db.links
+    scraper = Scraper(
+        base_url='http://example.com',
+        exclude_patterns=['/exclude'],  # the second anchor in `html` below contains this pattern
+        db_manager=db,
+    )
+
+    html = (  # page body served for every request: three relative anchors, one under /exclude
+        '<html><body>'
+        '<a href="/page1">1</a>'
+        '<a href="/exclude/page">2</a>'
+        '<a href="/page2">3</a>'
+        '</body></html>'
+    )
+
+    class DummyResp:  # canned response; NOTE(review): defines no `content` attribute, unlike the DummyResp in the previous test - confirm it is not needed
+        status_code = 200
+        headers = {'content-type': 'text/html'}
+        text = html
+
+    monkeypatch.setattr(scraper.session, 'get', lambda url: DummyResp())  # session.get now returns DummyResp for any URL
+
+    monkeypatch.setattr(  # stub scrape_page only; fetch_links is left unpatched in this test
+        Scraper, 'scrape_page', lambda self, html, url: ('# MD', {'url': url})
+    )
+
+    class DummyTqdm:  # replacement for tqdm.tqdm whose update/refresh/close do nothing
+        def __init__(self, *a, **k):
+            self.total = k.get('total', 0)  # keep a `total` attribute, defaulting to 0 when not passed
+        def update(self, n):
+            pass
+        def refresh(self):
+            pass
+        def close(self):
+            pass
+
+    monkeypatch.setattr(tqdm, 'tqdm', lambda *a, **k: DummyTqdm(*a, **k))  # swap tqdm.tqdm for the no-op class
+
+    scraper.start_scraping(url='http://example.com')  # exercise the single-url entry path
+
+    assert 'http://example.com/exclude/page' not in db.links  # absolute form of the /exclude/page href must not have been stored