refactor(scraper): replace trafilatura with markitdown for HTML-to-Markdown conversion

obeone · obeone · commit 9cf3f5173054 · 2025-07-05T09:06:33.000+02:00
- Removed the trafilatura dependency and its usage in scraper.py; also dropped the mdformat, coloredlogs, and `from curses import meta` imports from scraper.py.
- Added markitdown as a dependency in requirements.txt and integrated it for Markdown conversion.
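- Updated content extraction logic to use markitdown and to extract the page title with BeautifulSoup; the returned metadata now contains only `title`, and page bodies are read from `response.text` instead of `response.content`.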
- Adjusted tests to mock markitdown usage and verify new scraping workflow.
diff --git a/requirements.txt b/requirements.txt
@@ -5,6 +5,6 @@ mdformat_frontmatter==2.0.8
 mdformat_tables==1.0.0
 requests==2.32.3
 tqdm==4.67.1
-trafilatura==2.0.0
+markitdown==0.1.1
 coloredlogs==15.0.1
 beautifulsoup4==4.13.4
diff --git a/src/scraper.py b/src/scraper.py
@@ -1,15 +1,14 @@
-from curses import meta
 import requests
 from bs4 import BeautifulSoup
 from urllib.parse import urljoin, urldefrag
 from . import log_setup
-import trafilatura
-import mdformat
+from markitdown import MarkItDown
 import json
 from .database_manager import DatabaseManager
 from tqdm import tqdm
-import coloredlogs
 import time
+import tempfile
+import os
 
 
 logger = log_setup.get_logger()
@@ -79,7 +78,7 @@ def fetch_links(self, url, html=None):
                     )
                     return []
                 else:
-                    content = response.content
+                    content = response.text
             else:
                 content = html
 
@@ -115,23 +114,22 @@ def scrape_page(self, html, url):
         logger.info(f"Scraping page {url}")
 
         try:
-            metadata = trafilatura.metadata.extract_metadata(filecontent=html, default_url=url).as_dict()
+            # Parse the content using BeautifulSoup
+            soup = BeautifulSoup(html, "html.parser")
+            
+            # Extract title from the page
+            title = soup.title.string if soup.title else ""
+            
+            metadata = {"title": title}
+            
+            # Convert the HTML to Markdown
+            with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix=".html") as tmp:
+                tmp.write(html)
+                tmp_path = tmp.name
             
-            if "body" in metadata:
-                metadata.pop("body")
-            if "commentsbody" in metadata:
-                metadata.pop("commentsbody")
+            markdown = str(MarkItDown().convert(tmp_path))
             
-            markdown = (
-                trafilatura.extract(
-                    html,
-                    output_format="markdown",
-                    include_formatting=True,
-                    include_links=True,
-                    include_tables=True,
-                )
-                or ""
-            )
+            os.remove(tmp_path)
 
             logger.debug(f"Successfully scraped content and metadata from {url}")
             return markdown, metadata
@@ -229,7 +227,7 @@ def start_scraping(self, url=None, urls_list=[]):
                     continue
 
                 # Extract the HTML content from the response
-                html = response.content
+                html = response.text
 
                 # Scrape the page for content and metadata
                 content, metadata = self.scrape_page(html, url)
diff --git a/tests/test_scraper.py b/tests/test_scraper.py
@@ -39,14 +39,52 @@ def test_fetch_links():
     assert links == {'https://example.com/page1', 'https://example.com/page2'}
 
 
-def test_scrape_page_parses_content_and_metadata():
+from unittest.mock import patch, MagicMock
+
+...
+
+@patch('os.remove')
+@patch('tempfile.NamedTemporaryFile')
+def test_scrape_page_parses_content_and_metadata(mock_tempfile, mock_os_remove):
+    # Arrange
+    mock_file = MagicMock()
+    mock_file.name = "dummy_path"
+    mock_tempfile.return_value.__enter__.return_value = mock_file
+
     db = DummyDB()
     scraper = Scraper(base_url='http://example.com', exclude_patterns=[], db_manager=db)
     html = '<html><head><title>Test</title></head><body><p>Hello</p></body></html>'
-    content, metadata = scraper.scrape_page(html, 'http://example.com/test')
+    
+    # Act
+    with patch('src.scraper.MarkItDown') as mock_markdown:
+        mock_markdown.return_value.convert.return_value = "Hello"
+        content, metadata = scraper.scrape_page(html, 'http://example.com/test')
+
+    # Assert
     assert 'Hello' in content
     assert metadata.get('title') == 'Test'
-    assert metadata.get('url') == 'http://example.com/test'
+
+@patch('os.remove')
+@patch('tempfile.NamedTemporaryFile')
+def test_scrape_page_with_markitdown(mock_tempfile, mock_os_remove):
+    # Arrange
+    mock_file = MagicMock()
+    mock_file.name = "dummy_path"
+    mock_tempfile.return_value.__enter__.return_value = mock_file
+    
+    db = DummyDB()
+    scraper = Scraper(base_url='http://example.com', exclude_patterns=[], db_manager=db)
+    html = '<html><head><title>Test</title></head><body><h1>A Title</h1><p>This is a paragraph with <strong>bold</strong> text.</p></body></html>'
+    
+    # Act
+    with patch('src.scraper.MarkItDown') as mock_markdown:
+        mock_markdown.return_value.convert.return_value = "# A Title\n\nThis is a paragraph with **bold** text."
+        content, metadata = scraper.scrape_page(html, 'http://example.com/test')
+
+    # Assert
+    assert content == '# A Title\n\nThis is a paragraph with **bold** text.'
+    assert metadata.get('title') == 'Test'
+
 
 import requests
 import tqdm
@@ -96,6 +134,7 @@ class DummyResp:
         status_code = 200
         headers = {'content-type': 'text/html'}
         content = b'<html></html>'
+        text = '<html></html>'
 
     monkeypatch.setattr(requests, 'get', lambda url: DummyResp())