crawler-to-md/tests/test_scraper.py at d0bc49e0ee8f01273e8c9a0f5a72c74d143b1c1d · obeone/crawler-to-md · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
from unittest.mock import MagicMock, patch

import pytest
import requests
import tqdm

from crawler_to_md.database_manager import DatabaseManager
from crawler_to_md.scraper import Scraper


class DummyDB(DatabaseManager):
    def __init__(self):
        pass

    def __del__(self):
        pass

    def insert_link(self, url, visited=False):
        return True

    def get_unvisited_links(self):
        return []

    def mark_link_visited(self, url):
        pass


def test_is_valid_link():
    db = DummyDB()
    scraper = Scraper(
        base_url='https://example.com', exclude_patterns=['/exclude'], db_manager=db
    )
    assert scraper.is_valid_link('https://example.com/page')
    assert not scraper.is_valid_link('https://example.com/exclude/page')
    assert not scraper.is_valid_link('https://other.com/')


def test_fetch_links():
    db = DummyDB()
    scraper = Scraper(
        base_url='https://example.com', exclude_patterns=['/exclude'], db_manager=db
    )
    html = '''<html><body>
    <a href="https://example.com/page1">1</a>
    <a href="/page2">2</a>
    <a href="https://example.com/exclude/hidden">3</a>
    </body></html>'''
    links = scraper.fetch_links(url='https://example.com', html=html)
    assert links == {'https://example.com/page1', 'https://example.com/page2'}


@patch('os.remove')
@patch('tempfile.NamedTemporaryFile')
def test_scrape_page_parses_content_and_metadata(mock_tempfile, mock_os_remove):
    # Arrange
    mock_file = MagicMock()
    mock_file.name = "dummy_path"
    mock_tempfile.return_value.__enter__.return_value = mock_file

    db = DummyDB()
    scraper = Scraper(base_url='http://example.com', exclude_patterns=[], db_manager=db)
    html = '<html><head><title>Test</title></head><body><p>Hello</p></body></html>'

    # Act
    with patch('crawler_to_md.scraper.MarkItDown') as mock_markdown:
        mock_markdown.return_value.convert.return_value = "Hello"
        content, metadata = scraper.scrape_page(html, 'http://example.com/test')

    # Assert
    assert content is not None
    assert 'Hello' in content
    assert metadata is not None
    assert metadata.get('title') == 'Test'

@patch('os.remove')
@patch('tempfile.NamedTemporaryFile')
def test_scrape_page_with_markitdown(mock_tempfile, mock_os_remove):
    # Arrange
    mock_file = MagicMock()
    mock_file.name = "dummy_path"
    mock_tempfile.return_value.__enter__.return_value = mock_file

    db = DummyDB()
    scraper = Scraper(base_url='http://example.com', exclude_patterns=[], db_manager=db)
    html = (
        '<html><head><title>Test</title></head><body><h1>A Title</h1>'
        '<p>This is a paragraph with <strong>bold</strong> text.</p></body></html>'
    )

    # Act
    with patch('crawler_to_md.scraper.MarkItDown') as mock_markdown:
        mock_markdown.return_value.convert.return_value = (
            "# A Title\n\nThis is a paragraph with **bold** text."
        )
        content, metadata = scraper.scrape_page(html, 'http://example.com/test')

    # Assert
    assert content is not None
    assert content == '# A Title\n\nThis is a paragraph with **bold** text.'
    assert metadata is not None
    assert metadata.get('title') == 'Test'


class ListDB(DummyDB):
    def __init__(self):
        self.links = []
        self.visited = set()
        self.pages = []

    def insert_link(self, url, visited=False):
        urls = url if isinstance(url, list) else [url]
        inserted = False
        for u in urls:
            if u not in self.links:
                self.links.append(u)
                inserted = True
        return inserted

    def get_unvisited_links(self):
        return [(u,) for u in self.links if u not in self.visited]

    def mark_link_visited(self, url):
        self.visited.add(url)

    def get_links_count(self):
        return len(self.links)

    def get_visited_links_count(self):
        return len(self.visited)

    def insert_page(self, url, content, metadata):
        self.pages.append((url, content, metadata))

    def get_all_pages(self):
        return self.pages


def test_start_scraping_process(monkeypatch):
    db = ListDB()
    scraper = Scraper(base_url='http://example.com', exclude_patterns=[], db_manager=db)

    monkeypatch.setattr(Scraper, 'fetch_links', lambda self, url, html=None: set())
    monkeypatch.setattr(
        Scraper, 'scrape_page', lambda self, html, url: ('# MD', {'url': url})
    )

    class DummyResp:
        status_code = 200
        headers = {'content-type': 'text/html'}
        content = b'<html></html>'
        text = '<html></html>'

    monkeypatch.setattr(scraper.session, 'get', lambda url: DummyResp())

    class DummyTqdm:
        def __init__(self, *a, **k):
            self.total = k.get('total', 0)
        def update(self, n):
            pass
        def refresh(self):
            pass
        def close(self):
            pass

    monkeypatch.setattr(tqdm, 'tqdm', lambda *a, **k: DummyTqdm(*a, **k))

    scraper.start_scraping(url='http://example.com/page')

    assert db.get_links_count() == 1
    assert db.get_visited_links_count() == 1
    assert db.pages[0][0] == 'http://example.com/page'


def test_scraper_proxy_initialization(monkeypatch):
    db = DummyDB()
    monkeypatch.setattr(Scraper, '_test_proxy', lambda self: None)
    scraper = Scraper(
        base_url='http://example.com', exclude_patterns=[], db_manager=db, proxy='http://proxy:8080'
    )
    assert scraper.session.proxies.get('http') == 'http://proxy:8080'
    assert scraper.session.proxies.get('https') == 'http://proxy:8080'


def test_scraper_socks_proxy_initialization(monkeypatch):
    db = DummyDB()
    proxy = 'socks5://localhost:9050'
    monkeypatch.setattr(Scraper, '_test_proxy', lambda self: None)
    scraper = Scraper(
        base_url='http://example.com', exclude_patterns=[], db_manager=db, proxy=proxy
    )
    assert scraper.session.proxies.get('http') == proxy
    assert scraper.session.proxies.get('https') == proxy


def test_scraper_proxy_failure_detection(monkeypatch):
    db = DummyDB()
    def fake_head(self, url, timeout=5):
        raise requests.exceptions.ProxyError("fail")

    monkeypatch.setattr(requests.Session, 'head', fake_head)
    with pytest.raises(ValueError):
        Scraper(
            base_url='http://example.com', exclude_patterns=[], db_manager=db, proxy='http://proxy:8080'
        )