Skip to content

Commit 9cf3f51

Browse files
committed
refactor(scraper): replace trafilatura with markitdown for HTML to Markdown conversion
- Removed trafilatura dependency and its usage in scraper.py.
- Added markitdown as a dependency in requirements.txt and integrated it for Markdown conversion.
- Updated content extraction logic to use markitdown and extract page titles with BeautifulSoup.
- Adjusted tests to mock markitdown usage and verify new scraping workflow.
1 parent 30cee92 commit 9cf3f51

3 files changed

Lines changed: 62 additions & 25 deletions

File tree

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,6 @@ mdformat_frontmatter==2.0.8
55
mdformat_tables==1.0.0
66
requests==2.32.3
77
tqdm==4.67.1
8-
trafilatura==2.0.0
8+
markitdown==0.1.1
99
coloredlogs==15.0.1
1010
beautifulsoup4==4.13.4

src/scraper.py

Lines changed: 19 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -1,15 +1,14 @@
1-
from curses import meta
21
import requests
32
from bs4 import BeautifulSoup
43
from urllib.parse import urljoin, urldefrag
54
from . import log_setup
6-
import trafilatura
7-
import mdformat
5+
from markitdown import MarkItDown
86
import json
97
from .database_manager import DatabaseManager
108
from tqdm import tqdm
11-
import coloredlogs
129
import time
10+
import tempfile
11+
import os
1312

1413

1514
logger = log_setup.get_logger()
@@ -79,7 +78,7 @@ def fetch_links(self, url, html=None):
7978
)
8079
return []
8180
else:
82-
content = response.content
81+
content = response.text
8382
else:
8483
content = html
8584

@@ -115,23 +114,22 @@ def scrape_page(self, html, url):
115114
logger.info(f"Scraping page {url}")
116115

117116
try:
118-
metadata = trafilatura.metadata.extract_metadata(filecontent=html, default_url=url).as_dict()
117+
# Parse the content using BeautifulSoup
118+
soup = BeautifulSoup(html, "html.parser")
119+
120+
# Extract title from the page
121+
title = soup.title.string if soup.title else ""
122+
123+
metadata = {"title": title}
124+
125+
# Convert the HTML to Markdown
126+
with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix=".html") as tmp:
127+
tmp.write(html)
128+
tmp_path = tmp.name
119129

120-
if "body" in metadata:
121-
metadata.pop("body")
122-
if "commentsbody" in metadata:
123-
metadata.pop("commentsbody")
130+
markdown = str(MarkItDown().convert(tmp_path))
124131

125-
markdown = (
126-
trafilatura.extract(
127-
html,
128-
output_format="markdown",
129-
include_formatting=True,
130-
include_links=True,
131-
include_tables=True,
132-
)
133-
or ""
134-
)
132+
os.remove(tmp_path)
135133

136134
logger.debug(f"Successfully scraped content and metadata from {url}")
137135
return markdown, metadata
@@ -229,7 +227,7 @@ def start_scraping(self, url=None, urls_list=[]):
229227
continue
230228

231229
# Extract the HTML content from the response
232-
html = response.content
230+
html = response.text
233231

234232
# Scrape the page for content and metadata
235233
content, metadata = self.scrape_page(html, url)

tests/test_scraper.py

Lines changed: 42 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -39,14 +39,52 @@ def test_fetch_links():
3939
assert links == {'https://example.com/page1', 'https://example.com/page2'}
4040

4141

42-
def test_scrape_page_parses_content_and_metadata():
42+
from unittest.mock import patch, MagicMock
43+
44+
...
45+
46+
@patch('os.remove')
47+
@patch('tempfile.NamedTemporaryFile')
48+
def test_scrape_page_parses_content_and_metadata(mock_tempfile, mock_os_remove):
49+
# Arrange
50+
mock_file = MagicMock()
51+
mock_file.name = "dummy_path"
52+
mock_tempfile.return_value.__enter__.return_value = mock_file
53+
4354
db = DummyDB()
4455
scraper = Scraper(base_url='http://example.com', exclude_patterns=[], db_manager=db)
4556
html = '<html><head><title>Test</title></head><body><p>Hello</p></body></html>'
46-
content, metadata = scraper.scrape_page(html, 'http://example.com/test')
57+
58+
# Act
59+
with patch('src.scraper.MarkItDown') as mock_markdown:
60+
mock_markdown.return_value.convert.return_value = "Hello"
61+
content, metadata = scraper.scrape_page(html, 'http://example.com/test')
62+
63+
# Assert
4764
assert 'Hello' in content
4865
assert metadata.get('title') == 'Test'
49-
assert metadata.get('url') == 'http://example.com/test'
66+
67+
@patch('os.remove')
68+
@patch('tempfile.NamedTemporaryFile')
69+
def test_scrape_page_with_markitdown(mock_tempfile, mock_os_remove):
70+
# Arrange
71+
mock_file = MagicMock()
72+
mock_file.name = "dummy_path"
73+
mock_tempfile.return_value.__enter__.return_value = mock_file
74+
75+
db = DummyDB()
76+
scraper = Scraper(base_url='http://example.com', exclude_patterns=[], db_manager=db)
77+
html = '<html><head><title>Test</title></head><body><h1>A Title</h1><p>This is a paragraph with <strong>bold</strong> text.</p></body></html>'
78+
79+
# Act
80+
with patch('src.scraper.MarkItDown') as mock_markdown:
81+
mock_markdown.return_value.convert.return_value = "# A Title\n\nThis is a paragraph with **bold** text."
82+
content, metadata = scraper.scrape_page(html, 'http://example.com/test')
83+
84+
# Assert
85+
assert content == '# A Title\n\nThis is a paragraph with **bold** text.'
86+
assert metadata.get('title') == 'Test'
87+
5088

5189
import requests
5290
import tqdm
@@ -96,6 +134,7 @@ class DummyResp:
96134
status_code = 200
97135
headers = {'content-type': 'text/html'}
98136
content = b'<html></html>'
137+
text = '<html></html>'
99138

100139
monkeypatch.setattr(requests, 'get', lambda url: DummyResp())
101140

0 commit comments

Comments
 (0)