Expand test coverage

obeone · obeone · commit 883440d0494b · 2025-06-08T06:45:36.000+02:00
diff --git a/tests/test_database_manager.py b/tests/test_database_manager.py
@@ -22,3 +22,15 @@ def test_database_operations():
         db.insert_page('http://example.com', 'content', '{}')
         pages = db.get_all_pages()
         assert pages == [('http://example.com', 'content', '{}')]
+
+
+def test_insert_link_duplicates_and_list():
+    """Check insert_link return values and de-duplication for single URLs and lists."""
+    store = DatabaseManager(':memory:')
+    assert store.insert_link('http://a') is True
+    assert store.insert_link('http://a') is False  # same URL again: nothing new
+    # a list holding one new and one already-known URL still counts as an insertion
+    assert store.insert_link(['http://b', 'http://a']) is True
+    assert store.get_links_count() == 2  # the duplicate was not stored twice
+    remaining = set(store.get_unvisited_links())
+    assert remaining == {('http://a',), ('http://b',)}
diff --git a/tests/test_export_manager.py b/tests/test_export_manager.py
@@ -72,3 +72,34 @@ def test_export_individual_markdown(tmp_path):
     expected = tmp_path / 'files' / 'example.com' / 'path' / 'page.md'
     assert expected.exists()
     assert output_folder == str(tmp_path / 'files')
+
+
+def test_adjust_headers_upper_limit():
+    """No header line may carry more than six hashes after the level increment."""
+    exporter = ExportManager(DatabaseManager(':memory:'))
+    content = '###### H6\n####### H7'
+    adjusted = exporter._adjust_headers(content, level_increment=1)
+    headers = [line for line in adjusted.split('\n') if line.startswith('#')]
+    # require at least one header so all() cannot pass vacuously on an empty list
+    assert headers and all(len(line.split()[0]) <= 6 for line in headers)
+
+
+def test_concatenate_skips_none_content():
+    """A page stored with None content must not appear in the concatenated markdown."""
+    store = DatabaseManager(':memory:')
+    for url, body in (('http://a', None), ('http://b', '# T')):
+        store.insert_page(url, body, '{}')
+    merged = ExportManager(store, title='Top')._concatenate_markdown(store.get_all_pages())
+    assert 'URL: http://a' not in merged
+    assert 'URL: http://b' in merged
+
+
+def test_export_to_json_skips_none(tmp_path):
+    """The JSON export should contain only the page whose content is not None."""
+    db = DatabaseManager(':memory:')
+    db.insert_page('http://a', None, '{}')
+    db.insert_page('http://b', '# T', '{}')
+    json_path = tmp_path / 'out.json'
+    ExportManager(db).export_to_json(str(json_path))
+    data = json.loads(json_path.read_text(encoding='utf-8'))  # read_text closes the file
+    assert len(data) == 1 and data[0]['url'] == 'http://b'
diff --git a/tests/test_scraper.py b/tests/test_scraper.py
@@ -47,3 +47,72 @@ def test_scrape_page_parses_content_and_metadata():
     assert 'Hello' in content
     assert metadata.get('title') == 'Test'
     assert metadata.get('url') == 'http://example.com/test'
+
+import requests
+import tqdm
+
+class ListDB(DummyDB):
+    """In-memory stand-in for the database: links in a list, visited URLs in a set."""
+    def __init__(self):
+        self.links, self.visited, self.pages = [], set(), []
+
+    def insert_link(self, url, visited=False):
+        # accepts one URL or a list of URLs; True when at least one was new
+        added = False
+        for candidate in (url if isinstance(url, list) else [url]):
+            if candidate in self.links:
+                continue
+            self.links.append(candidate)
+            added = True
+        return added
+
+    def get_unvisited_links(self):
+        return [(link,) for link in self.links if link not in self.visited]
+
+    def mark_link_visited(self, url):
+        self.visited.add(url)
+
+    def get_links_count(self):
+        return len(self.links)
+
+    def get_visited_links_count(self):
+        return len(self.visited)
+
+    def insert_page(self, url, content, metadata):
+        self.pages.append((url, content, metadata))
+
+    def get_all_pages(self):
+        return self.pages
+
+
+def test_start_scraping_process(monkeypatch):
+    """Scraping one page with no outgoing links should record one visited link and one page."""
+    db = ListDB()
+    scraper = Scraper(base_url='http://example.com', exclude_patterns=[], db_manager=db)
+    monkeypatch.setattr(Scraper, 'fetch_links', lambda self, url, html=None: set())
+    monkeypatch.setattr(Scraper, 'scrape_page', lambda self, html, url: ('# MD', {'url': url}))
+
+    class DummyResp:
+        status_code = 200
+        headers = {'content-type': 'text/html'}
+        content = b'<html></html>'
+
+    # accept any call signature so extra arguments (e.g. a timeout) cannot break the stub
+    monkeypatch.setattr(requests, 'get', lambda *args, **kwargs: DummyResp())
+
+    class DummyTqdm:
+        def __init__(self, *a, **k):
+            self.total = k.get('total', 0)
+        def update(self, n):
+            pass
+        def refresh(self):
+            pass
+        def close(self):
+            pass
+
+    monkeypatch.setattr(tqdm, 'tqdm', lambda *a, **k: DummyTqdm(*a, **k))
+
+    scraper.start_scraping(url='http://example.com/page')
+    assert db.get_links_count() == 1
+    assert db.get_visited_links_count() == 1
+    assert db.pages[0][0] == 'http://example.com/page'
diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -18,3 +18,12 @@ def test_url_dirname():
 
 def test_deduplicate_list():
     assert utils.deduplicate_list([1, 2, 2, 3, 1]) == [1, 2, 3]
+
+
+def test_randomstring_special_chars():
+    assert utils.randomstring_to_filename('a!@ b$c#') == 'a_bc'  # expected: punctuation gone, space -> '_'
+
+
+def test_url_to_filename_invalid():
+    with pytest.raises(ValueError):  # the test passes only if ValueError is raised
+        utils.url_to_filename(123)  # an int instead of a URL string