Skip to content

Commit 883440d

Browse files
committed
Expand test coverage
1 parent 11b0802 commit 883440d

4 files changed

Lines changed: 121 additions & 0 deletions

File tree

tests/test_database_manager.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,3 +22,15 @@ def test_database_operations():
2222
db.insert_page('http://example.com', 'content', '{}')
2323
pages = db.get_all_pages()
2424
assert pages == [('http://example.com', 'content', '{}')]
25+
26+
27+
def test_insert_link_duplicates_and_list():
    """Check how ``insert_link`` reports duplicates for single URLs and lists."""
    manager = DatabaseManager(':memory:')

    # The first insertion of a URL succeeds; repeating it reports False.
    first_attempt = manager.insert_link('http://a')
    assert first_attempt is True
    second_attempt = manager.insert_link('http://a')
    assert second_attempt is False

    # A list holding one new and one already-known URL still reports True.
    mixed_batch = manager.insert_link(['http://b', 'http://a'])
    assert mixed_batch is True

    # Only the two distinct URLs are stored, and both are still unvisited.
    assert manager.get_links_count() == 2
    expected_rows = {('http://a',), ('http://b',)}
    assert set(manager.get_unvisited_links()) == expected_rows

tests/test_export_manager.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,3 +72,34 @@ def test_export_individual_markdown(tmp_path):
7272
expected = tmp_path / 'files' / 'example.com' / 'path' / 'page.md'
7373
assert expected.exists()
7474
assert output_folder == str(tmp_path / 'files')
75+
76+
77+
def test_adjust_headers_upper_limit():
    """Check that ``_adjust_headers`` never emits more than six ``#`` marks."""
    exporter = ExportManager(DatabaseManager(':memory:'))
    adjusted = exporter._adjust_headers('###### H6\n####### H7', level_increment=1)

    for header_line in adjusted.split('\n'):
        if not header_line.startswith('#'):
            continue
        # The first whitespace-separated token is the run of hash marks.
        hash_run = header_line.split()[0]
        assert len(hash_run) <= 6
85+
86+
87+
def test_concatenate_skips_none_content():
    """Check that a page stored with ``None`` content is absent from the merged markdown."""
    store = DatabaseManager(':memory:')
    for page_url, body in (('http://a', None), ('http://b', '# T')):
        store.insert_page(page_url, body, '{}')

    exporter = ExportManager(store, title='Top')
    merged = exporter._concatenate_markdown(store.get_all_pages())

    # Only the page that actually has content should be referenced.
    assert 'URL: http://a' not in merged
    assert 'URL: http://b' in merged
95+
96+
97+
def test_export_to_json_skips_none(tmp_path):
    """Check that ``export_to_json`` omits pages whose content is ``None``.

    Args:
        tmp_path: Directory path provided to the test; the JSON export is
            written to ``out.json`` inside it.
    """
    db = DatabaseManager(':memory:')
    db.insert_page('http://a', None, '{}')
    db.insert_page('http://b', '# T', '{}')
    exporter = ExportManager(db)
    json_path = tmp_path / 'out.json'
    exporter.export_to_json(str(json_path))

    # Read the export back inside a context manager so the file handle is
    # closed deterministically instead of being left to the garbage collector.
    with open(json_path, 'r', encoding='utf-8') as handle:
        data = json.load(handle)

    # Separate assertions make it clear which expectation failed.
    assert len(data) == 1
    assert data[0]['url'] == 'http://b'

tests/test_scraper.py

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,3 +47,72 @@ def test_scrape_page_parses_content_and_metadata():
4747
assert 'Hello' in content
4848
assert metadata.get('title') == 'Test'
4949
assert metadata.get('url') == 'http://example.com/test'
50+
51+
import requests
52+
import tqdm
53+
54+
class ListDB(DummyDB):
    """In-memory database stand-in that keeps links and pages in plain containers."""

    def __init__(self):
        # The parent initializer is not called here; only these three
        # attributes are set up.
        self.pages = []
        self.visited = set()
        self.links = []

    def insert_link(self, url, visited=False):
        """Store one URL or a list of URLs; return True if any was new.

        The ``visited`` argument is accepted but not used.
        """
        candidates = url if isinstance(url, list) else [url]
        added_any = False
        for candidate in candidates:
            if candidate in self.links:
                continue
            self.links.append(candidate)
            added_any = True
        return added_any

    def get_unvisited_links(self):
        """Return a one-element tuple for each stored link not yet marked visited."""
        pending = []
        for link in self.links:
            if link not in self.visited:
                pending.append((link,))
        return pending

    def mark_link_visited(self, url):
        """Record ``url`` as visited."""
        self.visited.add(url)

    def get_links_count(self):
        """Return the number of stored links."""
        total = len(self.links)
        return total

    def get_visited_links_count(self):
        """Return the number of links marked as visited."""
        total = len(self.visited)
        return total

    def insert_page(self, url, content, metadata):
        """Append a ``(url, content, metadata)`` record to the page list."""
        record = (url, content, metadata)
        self.pages.append(record)

    def get_all_pages(self):
        """Return the list of stored page records (the list itself, not a copy)."""
        return self.pages
86+
87+
88+
def test_start_scraping_process(monkeypatch):
    """Drive ``Scraper.start_scraping`` against in-memory stubs.

    ``Scraper.fetch_links``, ``Scraper.scrape_page``, ``requests.get`` and
    ``tqdm.tqdm`` are replaced with stubs before the scraper is started.

    Args:
        monkeypatch: pytest fixture used to install the stubs.
    """
    db = ListDB()
    scraper = Scraper(base_url='http://example.com', exclude_patterns=[], db_manager=db)

    # Report no outgoing links, so the crawl is expected to stop after the
    # start URL; page parsing returns fixed markdown plus the URL as metadata.
    monkeypatch.setattr(Scraper, 'fetch_links', lambda self, url, html=None: set())
    monkeypatch.setattr(Scraper, 'scrape_page', lambda self, html, url: ('# MD', {'url': url}))

    class DummyResp:
        """Minimal canned HTTP response describing an empty HTML page."""

        status_code = 200
        headers = {'content-type': 'text/html'}
        content = b'<html></html>'

    # Accept any positional/keyword arguments: the real requests.get takes
    # optional keyword arguments (timeout, headers, ...), and a stub that only
    # accepts ``url`` would raise TypeError if the scraper passes any of them.
    monkeypatch.setattr(requests, 'get', lambda *args, **kwargs: DummyResp())

    class DummyTqdm:
        """No-op progress bar that only remembers the requested total."""

        def __init__(self, *a, **k):
            self.total = k.get('total', 0)

        def update(self, n):
            pass

        def refresh(self):
            pass

        def close(self):
            pass

    monkeypatch.setattr(tqdm, 'tqdm', lambda *a, **k: DummyTqdm(*a, **k))

    scraper.start_scraping(url='http://example.com/page')

    # Exactly one link was stored and visited, and its page was saved.
    assert db.get_links_count() == 1
    assert db.get_visited_links_count() == 1
    assert db.pages[0][0] == 'http://example.com/page'

tests/test_utils.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,3 +18,12 @@ def test_url_dirname():
1818

1919
def test_deduplicate_list():
    """Check that ``deduplicate_list`` turns ``[1, 2, 2, 3, 1]`` into ``[1, 2, 3]``."""
    values = [1, 2, 2, 3, 1]
    result = utils.deduplicate_list(values)
    assert result == [1, 2, 3]
21+
22+
23+
def test_randomstring_special_chars():
    """Check the filename produced for a string with punctuation and a space."""
    raw_name = 'a!@ b$c#'
    filename = utils.randomstring_to_filename(raw_name)
    assert filename == 'a_bc'
25+
26+
27+
def test_url_to_filename_invalid():
    """Check that ``url_to_filename`` raises ``ValueError`` for a non-string input."""
    not_a_url = 123
    with pytest.raises(ValueError):
        utils.url_to_filename(not_a_url)

0 commit comments

Comments
 (0)