obeone · obeone · Jun 8, 2025 · Jun 8, 2025 · Jun 8, 2025
diff --git a/tests/__init__.py b/tests/__init__.py
diff --git a/tests/test_database_manager.py b/tests/test_database_manager.py
@@ -0,0 +1,36 @@
+import os
+import tempfile
+from src.database_manager import DatabaseManager
+
+
+def test_database_operations():
+    """Round-trip one link and one page through a database file in a temp dir."""
+    url = 'http://example.com'
+    with tempfile.TemporaryDirectory() as workdir:
+        store = DatabaseManager(os.path.join(workdir, 'test.db'))
+
+        # A new link is accepted, counted once, and still pending.
+        assert store.insert_link(url) is True
+        assert store.get_links_count() == 1
+        assert store.get_unvisited_links() == [(url,)]
+
+        # Marking it visited empties the unvisited list.
+        store.mark_link_visited(url)
+        assert store.get_visited_links_count() == 1
+        assert store.get_unvisited_links() == []
+
+        # A stored page is read back as one (url, content, metadata) row.
+        store.insert_page(url, 'content', '{}')
+        assert store.get_all_pages() == [(url, 'content', '{}')]
+
+
+def test_insert_link_duplicates_and_list():
+    """Check insert_link's return value for new, repeated and mixed input."""
+    store = DatabaseManager(':memory:')
+    first, second = 'http://a', 'http://b'
+    assert store.insert_link(first) is True
+    assert store.insert_link(first) is False  # repeat of a single link
+    assert store.insert_link([second, first]) is True  # one new, one repeat
+    # Only the two distinct URLs are stored, and neither has been visited.
+    assert store.get_links_count() == 2
+    assert set(store.get_unvisited_links()) == {(first,), (second,)}
diff --git a/tests/test_export_manager.py b/tests/test_export_manager.py
@@ -0,0 +1,105 @@
+import os
+import tempfile
+import json
+from src.database_manager import DatabaseManager
+from src.export_manager import ExportManager
+
+
+def create_populated_db(tmpdir):
+    """Build a DatabaseManager on tmpdir/db.sqlite with one visited link and one page."""
+    populated = DatabaseManager(os.path.join(tmpdir, 'db.sqlite'))
+    populated.insert_link('http://example.com')
+    populated.mark_link_visited('http://example.com')
+    populated.insert_page('http://example.com', '# Title\nParagraph', json.dumps({'author': 'John'}))
+    return populated
+
+
+def test_export_markdown_and_json():
+    """Export a one-page database to Markdown and JSON and inspect both files."""
+    with tempfile.TemporaryDirectory() as workdir:
+        source = create_populated_db(workdir)
+        exporter = ExportManager(source, title='My Title')
+        markdown_file = os.path.join(workdir, 'out.md')
+        json_file = os.path.join(workdir, 'out.json')
+
+        exporter.export_to_markdown(markdown_file)
+        exporter.export_to_json(json_file)
+        assert os.path.exists(markdown_file)
+        assert os.path.exists(json_file)
+
+        with open(markdown_file, 'r', encoding='utf-8') as handle:
+            markdown = handle.read()
+        assert markdown.startswith('# My Title')
+        assert '## Title' in markdown
+        assert 'URL: http://example.com' in markdown
+
+        with open(json_file, 'r', encoding='utf-8') as handle:
+            first_page = json.load(handle)[0]
+        assert first_page['url'] == 'http://example.com'
+        assert 'Title' in first_page['content']
+        assert first_page['metadata']['author'] == 'John'
+
+
+def test_adjust_headers_and_cleanup():
+    """Expect headers one level deeper and a triple newline collapsed to a double."""
+    store = DatabaseManager(':memory:')
+    exporter = ExportManager(store, title='T')
+    shifted = exporter._adjust_headers('# H1\n## H2', level_increment=1)
+    assert '## H1' in shifted
+    assert '### H2' in shifted
+    squeezed = exporter._cleanup_markdown('A\n\n\nB')
+    assert squeezed == 'A\n\nB'
+
+
+def test_concatenate_markdown_filters_metadata():
+    """Metadata with a value is rendered; a null-valued key is left out."""
+    store = DatabaseManager(':memory:')
+    store.insert_page('http://a', '# T1', json.dumps({'keep': 'x'}))
+    store.insert_page('http://b', '# T2', json.dumps({'drop': None}))
+    exporter = ExportManager(store, title='Head')
+    merged = exporter._concatenate_markdown(store.get_all_pages())
+    assert merged.startswith('# Head')
+    assert 'URL: http://a' in merged and 'T1' in merged
+    assert 'keep: x' in merged and 'drop:' not in merged
+
+
+def test_export_individual_markdown(tmp_path):
+    """A page URL maps to files/<host>/<path>.md under the output directory."""
+    store = DatabaseManager(str(tmp_path / 'db.sqlite'))
+    store.insert_page('http://example.com/path/page', '# P', '{}')
+    exporter = ExportManager(store)
+    files_dir = tmp_path / 'files'
+    returned = exporter.export_individual_markdown(str(tmp_path))
+    assert (files_dir / 'example.com' / 'path' / 'page.md').exists()
+    assert returned == str(files_dir)
+
+
+def test_adjust_headers_upper_limit():
+    """Header lines must never end up with more than six leading hashes."""
+    db = DatabaseManager(':memory:')
+    exporter = ExportManager(db)
+    adjusted = exporter._adjust_headers('###### H6\n####### H7', level_increment=1)
+    headers = [line for line in adjusted.split('\n') if line.startswith('#')]
+    # NOTE(review): passes vacuously if no line starts with '#' - consider asserting headers.
+    assert all(len(line.split()[0]) <= 6 for line in headers)
+
+
+def test_concatenate_skips_none_content():
+    """A page stored with None content must not appear in the merged Markdown."""
+    store = DatabaseManager(':memory:')
+    for url, body in (('http://a', None), ('http://b', '# T')):
+        store.insert_page(url, body, '{}')
+    merged = ExportManager(store, title='Top')._concatenate_markdown(store.get_all_pages())
+    assert 'URL: http://a' not in merged
+    assert 'URL: http://b' in merged
+
+
+def test_export_to_json_skips_none(tmp_path):
+    """Only the page that has content should appear in the JSON export."""
+    db = DatabaseManager(':memory:')
+    db.insert_page('http://a', None, '{}')
+    db.insert_page('http://b', '# T', '{}')
+    json_path = tmp_path / 'out.json'
+    ExportManager(db).export_to_json(str(json_path))
+    data = json.loads(json_path.read_text(encoding='utf-8'))  # no dangling file handle
+    assert len(data) == 1 and data[0]['url'] == 'http://b'
diff --git a/tests/test_scraper.py b/tests/test_scraper.py
@@ -0,0 +1,118 @@
+from src.database_manager import DatabaseManager
+from src.scraper import Scraper
+
+
+class DummyDB(DatabaseManager):  # inert stand-in: the parent __init__ never runs
+    def __init__(self):
+        """Create no state and do not call DatabaseManager.__init__."""
+
+    def __del__(self):
+        """Override any inherited finaliser with a no-op."""
+
+    def insert_link(self, url, visited=False):
+        return True  # every insert is reported as successful; nothing is stored
+
+    def get_unvisited_links(self):
+        return []  # there is never anything pending
+
+    def mark_link_visited(self, url):
+        """Ignore the request; no visit state is kept."""
+
+
+def test_is_valid_link():
+    """Accept a same-site URL; reject an excluded path and a foreign host."""
+    crawler = Scraper(base_url='https://example.com', exclude_patterns=['/exclude'], db_manager=DummyDB())
+    assert crawler.is_valid_link('https://example.com/page')
+    assert not crawler.is_valid_link('https://example.com/exclude/page')
+    assert not crawler.is_valid_link('https://other.com/')
+
+
+def test_fetch_links():
+    """Absolute and relative anchors are returned; the excluded one is dropped."""
+    crawler = Scraper(base_url='https://example.com', exclude_patterns=['/exclude'], db_manager=DummyDB())
+    markup = '''<html><body>
+    <a href="https://example.com/page1">1</a>
+    <a href="/page2">2</a>
+    <a href="https://example.com/exclude/hidden">3</a>
+    </body></html>'''
+    found = crawler.fetch_links(url='https://example.com', html=markup)
+    assert found == {'https://example.com/page1', 'https://example.com/page2'}
+
+
+def test_scrape_page_parses_content_and_metadata():
+    """scrape_page yields content containing the body text plus title/url metadata."""
+    crawler = Scraper(base_url='http://example.com', exclude_patterns=[], db_manager=DummyDB())
+    page_url = 'http://example.com/test'
+    markup = '<html><head><title>Test</title></head><body><p>Hello</p></body></html>'
+    body, meta = crawler.scrape_page(markup, page_url)
+    assert 'Hello' in body
+    assert (meta.get('title'), meta.get('url')) == ('Test', page_url)
+
+import requests
+import tqdm
+
+class ListDB(DummyDB):
+    """List-backed fake database that records links, visits and pages."""
+    def __init__(self):
+        self.links, self.pages = [], []  # URLs in insertion order / stored page rows
+        self.visited = set()
+
+    def insert_link(self, url, visited=False):
+        """Append each unseen URL (single value or list); True if any was new."""
+        candidates = url if isinstance(url, list) else [url]
+        before = len(self.links)
+        for candidate in candidates:
+            if candidate not in self.links:
+                self.links.append(candidate)
+        return len(self.links) > before
+
+    def get_unvisited_links(self):
+        return [(link,) for link in self.links if link not in self.visited]  # 1-tuples
+
+    def mark_link_visited(self, url):
+        self.visited.add(url)  # set membership, so repeated marks count once
+
+    def get_links_count(self):
+        return len(self.links)  # every stored link, visited or not
+
+    def get_visited_links_count(self):
+        return len(self.visited)  # distinct URLs marked visited
+
+    def insert_page(self, url, content, metadata):
+        self.pages.append((url, content, metadata))  # kept as a 3-tuple row
+
+    def get_all_pages(self):
+        return self.pages  # the live list itself, not a copy
+
+
+def test_start_scraping_process(monkeypatch):
+    """Crawl one stubbed page and check it is recorded as visited and stored."""
+    db = ListDB()
+    scraper = Scraper(base_url='http://example.com', exclude_patterns=[], db_manager=db)
+    monkeypatch.setattr(Scraper, 'fetch_links', lambda self, url, html=None: set())
+    monkeypatch.setattr(Scraper, 'scrape_page', lambda self, html, url: ('# MD', {'url': url}))
+
+    class DummyResp:
+        status_code = 200
+        headers = {'content-type': 'text/html'}
+        content = b'<html></html>'
+
+    # Accept any arguments: requests.get() also takes keywords such as timeout=.
+    monkeypatch.setattr(requests, 'get', lambda *args, **kwargs: DummyResp())
+
+    class DummyTqdm:
+        def __init__(self, *a, **k):
+            self.total = k.get('total', 0)
+        def update(self, n):
+            pass
+        def refresh(self):
+            pass
+        def close(self):
+            pass
+
+    monkeypatch.setattr(tqdm, 'tqdm', DummyTqdm)
+
+    scraper.start_scraping(url='http://example.com/page')
+    assert db.get_links_count() == 1
+    assert db.get_visited_links_count() == 1
+    assert db.pages[0][0] == 'http://example.com/page'
diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -0,0 +1,29 @@
+import pytest
+from src import utils
+
+
+def test_randomstring_to_filename():  # expects the space replaced by '_' and '!' removed
+    assert utils.randomstring_to_filename('Hello World!') == 'Hello_World'
+
+
+def test_url_to_filename():  # expects the scheme dropped and '.' and '/' turned into '_'
+    result = utils.url_to_filename('https://example.com/path/index.html')
+    assert result == 'example_com_path_index_html'
+
+
+def test_url_dirname():  # last segment is cut off; a URL ending in '/' comes back unchanged
+    assert utils.url_dirname('https://example.com/path/page') == 'https://example.com/path/'
+    assert utils.url_dirname('https://example.com/path/page/') == 'https://example.com/path/page/'
+
+
+def test_deduplicate_list():  # duplicates dropped, first-occurrence order kept
+    assert utils.deduplicate_list([1, 2, 2, 3, 1]) == [1, 2, 3]
+
+
+def test_randomstring_special_chars():  # punctuation is removed; the space becomes '_'
+    assert utils.randomstring_to_filename('a!@ b$c#') == 'a_bc'
+
+
+def test_url_to_filename_invalid():  # a non-string argument (here an int) must raise
+    with pytest.raises(ValueError):
+        utils.url_to_filename(123)