diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_database_manager.py b/tests/test_database_manager.py
new file mode 100644
index 0000000..4ccc337
--- /dev/null
+++ b/tests/test_database_manager.py
@@ -0,0 +1,38 @@
+import os
+import tempfile
+from src.database_manager import DatabaseManager
+
+
+def test_database_operations():
+    """Round-trip a link and a page through an on-disk database."""
+    with tempfile.TemporaryDirectory() as tmpdir:
+        db_path = os.path.join(tmpdir, 'test.db')
+        db = DatabaseManager(db_path)
+
+        # Insert link and verify count
+        assert db.insert_link('http://example.com') is True
+        assert db.get_links_count() == 1
+        assert db.get_unvisited_links() == [('http://example.com',)]
+
+        # Mark link visited
+        db.mark_link_visited('http://example.com')
+        assert db.get_visited_links_count() == 1
+        assert db.get_unvisited_links() == []
+
+        # Insert page and read back
+        db.insert_page('http://example.com', 'content', '{}')
+        pages = db.get_all_pages()
+        assert pages == [('http://example.com', 'content', '{}')]
+
+
+def test_insert_link_duplicates_and_list():
+    """Check insert_link return values for duplicates and list input."""
+    db = DatabaseManager(':memory:')
+    assert db.insert_link('http://a') is True
+    # duplicate single link should return False
+    assert db.insert_link('http://a') is False
+    # insert list with one new and one duplicate
+    assert db.insert_link(['http://b', 'http://a']) is True
+    # total links should be 2
+    assert db.get_links_count() == 2
+    assert set(db.get_unvisited_links()) == {('http://a',), ('http://b',)}
diff --git a/tests/test_export_manager.py b/tests/test_export_manager.py
new file mode 100644
index 0000000..b522fb6
--- /dev/null
+++ b/tests/test_export_manager.py
@@ -0,0 +1,114 @@
+import os
+import tempfile
+import json
+from src.database_manager import DatabaseManager
+from src.export_manager import ExportManager
+
+
+def create_populated_db(tmpdir):
+    """Create a database in tmpdir holding one visited link and one page."""
+    db_path = os.path.join(tmpdir, 'db.sqlite')
+    db = DatabaseManager(db_path)
+    db.insert_link('http://example.com')
+    db.mark_link_visited('http://example.com')
+    db.insert_page('http://example.com', '# Title\nParagraph', json.dumps({'author': 'John'}))
+    return db
+
+
+def test_export_markdown_and_json():
+    """Export to Markdown and JSON files and check their contents."""
+    with tempfile.TemporaryDirectory() as tmpdir:
+        db = create_populated_db(tmpdir)
+        exporter = ExportManager(db, title='My Title')
+        md_path = os.path.join(tmpdir, 'out.md')
+        json_path = os.path.join(tmpdir, 'out.json')
+
+        exporter.export_to_markdown(md_path)
+        exporter.export_to_json(json_path)
+
+        assert os.path.exists(md_path)
+        assert os.path.exists(json_path)
+
+        with open(md_path, 'r', encoding='utf-8') as f:
+            content = f.read()
+        assert content.startswith('# My Title')
+        assert '## Title' in content
+        assert 'URL: http://example.com' in content
+
+        with open(json_path, 'r', encoding='utf-8') as f:
+            data = json.load(f)
+        assert data[0]['url'] == 'http://example.com'
+        assert 'Title' in data[0]['content']
+        assert data[0]['metadata']['author'] == 'John'
+
+
+def test_adjust_headers_and_cleanup():
+    """Demote headers by one level and collapse extra blank lines."""
+    db = DatabaseManager(':memory:')
+    exporter = ExportManager(db, title='T')
+    content = '# H1\n## H2'
+    adjusted = exporter._adjust_headers(content, level_increment=1)
+    assert '## H1' in adjusted
+    assert '### H2' in adjusted
+    cleaned = exporter._cleanup_markdown('A\n\n\nB')
+    assert cleaned == 'A\n\nB'
+
+
+def test_concatenate_markdown_filters_metadata():
+    """Keep metadata entries with values and omit those that are None."""
+    db = DatabaseManager(':memory:')
+    db.insert_page('http://a', '# T1', json.dumps({'keep': 'x'}))
+    db.insert_page('http://b', '# T2', json.dumps({'drop': None}))
+    exporter = ExportManager(db, title='Head')
+    result = exporter._concatenate_markdown(db.get_all_pages())
+    assert result.startswith('# Head')
+    assert 'URL: http://a' in result and 'T1' in result
+    assert 'keep: x' in result
+    assert 'drop:' not in result
+
+
+def test_export_individual_markdown(tmp_path):
+    """Write one Markdown file per page under files/<host>/<path>.md."""
+    db_path = tmp_path / 'db.sqlite'
+    db = DatabaseManager(str(db_path))
+    db.insert_page('http://example.com/path/page', '# P', '{}')
+    exporter = ExportManager(db)
+    output_folder = exporter.export_individual_markdown(str(tmp_path))
+    expected = tmp_path / 'files' / 'example.com' / 'path' / 'page.md'
+    assert expected.exists()
+    assert output_folder == str(tmp_path / 'files')
+
+
+def test_adjust_headers_upper_limit():
+    """Header lines never exceed six hashes after adjustment."""
+    db = DatabaseManager(':memory:')
+    exporter = ExportManager(db)
+    content = '###### H6\n####### H7'
+    adjusted = exporter._adjust_headers(content, level_increment=1)
+    lines = [line for line in adjusted.split('\n') if line.startswith('#')]
+    # both lines should not exceed 6 hashes
+    assert all(len(line.split()[0]) <= 6 for line in lines)
+
+
+def test_concatenate_skips_none_content():
+    """Pages whose content is None are left out of the concatenation."""
+    db = DatabaseManager(':memory:')
+    db.insert_page('http://a', None, '{}')
+    db.insert_page('http://b', '# T', '{}')
+    exporter = ExportManager(db, title='Top')
+    content = exporter._concatenate_markdown(db.get_all_pages())
+    assert 'URL: http://a' not in content
+    assert 'URL: http://b' in content
+
+
+def test_export_to_json_skips_none(tmp_path):
+    """Pages whose content is None are left out of the JSON export."""
+    db = DatabaseManager(':memory:')
+    db.insert_page('http://a', None, '{}')
+    db.insert_page('http://b', '# T', '{}')
+    exporter = ExportManager(db)
+    json_path = tmp_path / 'out.json'
+    exporter.export_to_json(str(json_path))
+    with open(json_path, 'r', encoding='utf-8') as f:
+        data = json.load(f)
+    assert len(data) == 1 and data[0]['url'] == 'http://b'
diff --git a/tests/test_scraper.py b/tests/test_scraper.py
new file mode 100644
index 0000000..9ad0f7e
--- /dev/null
+++ b/tests/test_scraper.py
@@ -0,0 +1,131 @@
+import requests
+import tqdm
+
+from src.database_manager import DatabaseManager
+from src.scraper import Scraper
+
+
+class DummyDB(DatabaseManager):
+    """DatabaseManager stub that skips setup/teardown and stores nothing."""
+
+    def __init__(self):
+        pass
+
+    def __del__(self):
+        pass
+
+    def insert_link(self, url, visited=False):
+        return True
+
+    def get_unvisited_links(self):
+        return []
+
+    def mark_link_visited(self, url):
+        pass
+
+
+def test_is_valid_link():
+    """Accept same-site URLs; reject excluded paths and other hosts."""
+    db = DummyDB()
+    scraper = Scraper(base_url='https://example.com', exclude_patterns=['/exclude'], db_manager=db)
+    assert scraper.is_valid_link('https://example.com/page')
+    assert not scraper.is_valid_link('https://example.com/exclude/page')
+    assert not scraper.is_valid_link('https://other.com/')
+
+
+def test_fetch_links():
+    """Return only the valid links found in the supplied HTML."""
+    db = DummyDB()
+    scraper = Scraper(base_url='https://example.com', exclude_patterns=['/exclude'], db_manager=db)
+    # NOTE(review): the anchor markup was missing from this fixture; the
+    # hrefs are reconstructed from the assertion below - confirm.
+    html = '''
+    <a href="https://example.com/page1">1</a>
+    <a href="https://example.com/page2">2</a>
+    <a href="https://example.com/exclude/page3">3</a>
+    '''
+    links = scraper.fetch_links(url='https://example.com', html=html)
+    assert links == {'https://example.com/page1', 'https://example.com/page2'}
+
+
+def test_scrape_page_parses_content_and_metadata():
+    """Extract body text plus title and url metadata from a page."""
+    db = DummyDB()
+    scraper = Scraper(base_url='http://example.com', exclude_patterns=[], db_manager=db)
+    # NOTE(review): the HTML tags were missing from this fixture; the
+    # markup is reconstructed from the assertions below - confirm.
+    html = '<html><head><title>Test</title></head><body><p>Hello</p></body></html>'
+    content, metadata = scraper.scrape_page(html, 'http://example.com/test')
+    assert 'Hello' in content
+    assert metadata.get('title') == 'Test'
+    assert metadata.get('url') == 'http://example.com/test'
+
+
+class ListDB(DummyDB):
+    """In-memory DummyDB that records links, visited URLs and pages."""
+
+    def __init__(self):
+        self.links = []
+        self.visited = set()
+        self.pages = []
+
+    def insert_link(self, url, visited=False):
+        urls = url if isinstance(url, list) else [url]
+        inserted = False
+        for u in urls:
+            if u not in self.links:
+                self.links.append(u)
+                inserted = True
+        return inserted
+
+    def get_unvisited_links(self):
+        return [(u,) for u in self.links if u not in self.visited]
+
+    def mark_link_visited(self, url):
+        self.visited.add(url)
+
+    def get_links_count(self):
+        return len(self.links)
+
+    def get_visited_links_count(self):
+        return len(self.visited)
+
+    def insert_page(self, url, content, metadata):
+        self.pages.append((url, content, metadata))
+
+    def get_all_pages(self):
+        return self.pages
+
+
+def test_start_scraping_process(monkeypatch):
+    """Scrape one page with network and progress bar stubbed out."""
+    db = ListDB()
+    scraper = Scraper(base_url='http://example.com', exclude_patterns=[], db_manager=db)
+
+    monkeypatch.setattr(Scraper, 'fetch_links', lambda self, url, html=None: set())
+    monkeypatch.setattr(Scraper, 'scrape_page', lambda self, html, url: ('# MD', {'url': url}))
+
+    class DummyResp:
+        status_code = 200
+        headers = {'content-type': 'text/html'}
+        content = b''
+
+    monkeypatch.setattr(requests, 'get', lambda url: DummyResp())
+
+    class DummyTqdm:
+        def __init__(self, *a, **k):
+            self.total = k.get('total', 0)
+        def update(self, n):
+            pass
+        def refresh(self):
+            pass
+        def close(self):
+            pass
+
+    monkeypatch.setattr(tqdm, 'tqdm', lambda *a, **k: DummyTqdm(*a, **k))
+
+    scraper.start_scraping(url='http://example.com/page')
+
+    assert db.get_links_count() == 1
+    assert db.get_visited_links_count() == 1
+    assert db.pages[0][0] == 'http://example.com/page'
diff --git a/tests/test_utils.py b/tests/test_utils.py
new file mode 100644
index 0000000..b51bcbe
--- /dev/null
+++ b/tests/test_utils.py
@@ -0,0 +1,35 @@
+import pytest
+from src import utils
+
+
+def test_randomstring_to_filename():
+    """Spaces become underscores and punctuation is dropped."""
+    assert utils.randomstring_to_filename('Hello World!') == 'Hello_World'
+
+
+def test_url_to_filename():
+    """Flatten a URL into an underscore-separated name without the scheme."""
+    result = utils.url_to_filename('https://example.com/path/index.html')
+    assert result == 'example_com_path_index_html'
+
+
+def test_url_dirname():
+    """Return the URL up to and including the last slash."""
+    assert utils.url_dirname('https://example.com/path/page') == 'https://example.com/path/'
+    assert utils.url_dirname('https://example.com/path/page/') == 'https://example.com/path/page/'
+
+
+def test_deduplicate_list():
+    """Remove repeats while keeping first-seen order."""
+    assert utils.deduplicate_list([1, 2, 2, 3, 1]) == [1, 2, 3]
+
+
+def test_randomstring_special_chars():
+    """Strip a mix of special characters from the input."""
+    assert utils.randomstring_to_filename('a!@ b$c#') == 'a_bc'
+
+
+def test_url_to_filename_invalid():
+    """Raise ValueError for a non-string URL."""
+    with pytest.raises(ValueError):
+        utils.url_to_filename(123)