Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file added tests/__init__.py
Empty file.
36 changes: 36 additions & 0 deletions tests/test_database_manager.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import os
import tempfile
from src.database_manager import DatabaseManager


def test_database_operations():
    """Walk one link and one page through a database stored in a temp directory."""
    url = 'http://example.com'
    with tempfile.TemporaryDirectory() as workdir:
        manager = DatabaseManager(os.path.join(workdir, 'test.db'))

        # A freshly inserted link is counted and listed as unvisited.
        inserted = manager.insert_link(url)
        assert inserted is True
        assert manager.get_links_count() == 1
        assert manager.get_unvisited_links() == [(url,)]

        # After marking it visited, the unvisited list is empty again.
        manager.mark_link_visited(url)
        assert manager.get_visited_links_count() == 1
        assert manager.get_unvisited_links() == []

        # A stored page comes back as a (url, content, metadata) row.
        manager.insert_page(url, 'content', '{}')
        stored_pages = manager.get_all_pages()
        assert stored_pages == [(url, 'content', '{}')]


def test_insert_link_duplicates_and_list():
    """Check insert_link results for a duplicate and for list input."""
    store = DatabaseManager(':memory:')

    first_insert = store.insert_link('http://a')
    assert first_insert is True

    # Re-inserting the same single link is expected to report False.
    repeat_insert = store.insert_link('http://a')
    assert repeat_insert is False

    # A list with one new and one already-known link still reports True.
    mixed_insert = store.insert_link(['http://b', 'http://a'])
    assert mixed_insert is True

    # Only the two distinct links should be stored.
    assert store.get_links_count() == 2
    expected_rows = {('http://a',), ('http://b',)}
    assert set(store.get_unvisited_links()) == expected_rows
105 changes: 105 additions & 0 deletions tests/test_export_manager.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
import os
import tempfile
import json
from src.database_manager import DatabaseManager
from src.export_manager import ExportManager


def create_populated_db(tmpdir):
    """Return a DatabaseManager stored under *tmpdir* with one link and one page.

    The link 'http://example.com' is inserted and marked visited, then a page
    for the same URL is stored with Markdown content and JSON metadata.
    """
    url = 'http://example.com'
    database = DatabaseManager(os.path.join(tmpdir, 'db.sqlite'))
    database.insert_link(url)
    database.mark_link_visited(url)
    metadata_json = json.dumps({'author': 'John'})
    database.insert_page(url, '# Title\nParagraph', metadata_json)
    return database


def test_export_markdown_and_json():
    """Export the populated database to Markdown and JSON and inspect both files."""
    with tempfile.TemporaryDirectory() as workdir:
        database = create_populated_db(workdir)
        exporter = ExportManager(database, title='My Title')
        markdown_file = os.path.join(workdir, 'out.md')
        json_file = os.path.join(workdir, 'out.json')

        exporter.export_to_markdown(markdown_file)
        exporter.export_to_json(json_file)

        # Both exports must have produced a file on disk.
        assert os.path.exists(markdown_file)
        assert os.path.exists(json_file)

        with open(markdown_file, 'r', encoding='utf-8') as handle:
            markdown_text = handle.read()
        assert markdown_text.startswith('# My Title')
        assert '## Title' in markdown_text
        assert 'URL: http://example.com' in markdown_text

        with open(json_file, 'r', encoding='utf-8') as handle:
            records = json.load(handle)
        first_record = records[0]
        assert first_record['url'] == 'http://example.com'
        assert 'Title' in first_record['content']
        assert first_record['metadata']['author'] == 'John'


def test_adjust_headers_and_cleanup():
    """Check header shifting by one level and collapsing of repeated blank lines."""
    database = DatabaseManager(':memory:')
    exporter = ExportManager(database, title='T')

    shifted = exporter._adjust_headers('# H1\n## H2', level_increment=1)
    assert '## H1' in shifted
    assert '### H2' in shifted

    # Three consecutive newlines are expected to collapse to two.
    collapsed = exporter._cleanup_markdown('A\n\n\nB')
    assert collapsed == 'A\n\nB'


def test_concatenate_markdown_filters_metadata():
    """Metadata with a value is rendered; metadata whose value is None is not."""
    database = DatabaseManager(':memory:')
    page_rows = (
        ('http://a', '# T1', {'keep': 'x'}),
        ('http://b', '# T2', {'drop': None}),
    )
    for page_url, body, metadata in page_rows:
        database.insert_page(page_url, body, json.dumps(metadata))

    exporter = ExportManager(database, title='Head')
    combined = exporter._concatenate_markdown(database.get_all_pages())

    assert combined.startswith('# Head')
    assert 'URL: http://a' in combined
    assert 'T1' in combined
    assert 'keep: x' in combined
    assert 'drop:' not in combined


def test_export_individual_markdown(tmp_path):
    """A page is written to files/<host>/<path>.md and the files folder is returned."""
    database = DatabaseManager(str(tmp_path / 'db.sqlite'))
    database.insert_page('http://example.com/path/page', '# P', '{}')
    exporter = ExportManager(database)

    files_root = tmp_path / 'files'
    returned_folder = exporter.export_individual_markdown(str(tmp_path))

    page_file = files_root / 'example.com' / 'path' / 'page.md'
    assert page_file.exists()
    assert returned_folder == str(files_root)


def test_adjust_headers_upper_limit():
    """Header lines must never carry more than six hashes after adjustment.

    The input holds a six-hash header and a seven-hash line; both are passed
    through ``_adjust_headers`` with ``level_increment=1``.
    """
    db = DatabaseManager(':memory:')
    exporter = ExportManager(db)
    content = '###### H6\n####### H7'
    adjusted = exporter._adjust_headers(content, level_increment=1)
    header_lines = [line for line in adjusted.split('\n') if line.startswith('#')]
    # all() is True for an empty iterable, so without this guard the test
    # would pass even if every header line vanished from the output.
    assert header_lines
    # both lines should not exceed 6 hashes
    assert all(len(line.split()[0]) <= 6 for line in header_lines)


def test_concatenate_skips_none_content():
    """A page stored with None content must be absent from the concatenated output."""
    database = DatabaseManager(':memory:')
    for page_url, body in (('http://a', None), ('http://b', '# T')):
        database.insert_page(page_url, body, '{}')

    exporter = ExportManager(database, title='Top')
    combined = exporter._concatenate_markdown(database.get_all_pages())

    assert 'URL: http://a' not in combined
    assert 'URL: http://b' in combined


def test_export_to_json_skips_none(tmp_path):
    """The JSON export must contain only the page whose content is not None."""
    db = DatabaseManager(':memory:')
    db.insert_page('http://a', None, '{}')
    db.insert_page('http://b', '# T', '{}')
    exporter = ExportManager(db)
    json_path = tmp_path / 'out.json'
    exporter.export_to_json(str(json_path))
    # Read through a context manager so the file handle is closed
    # deterministically instead of being left to the garbage collector.
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    # Separate asserts so a failure shows which expectation broke.
    assert len(data) == 1
    assert data[0]['url'] == 'http://b'
118 changes: 118 additions & 0 deletions tests/test_scraper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
from src.database_manager import DatabaseManager
from src.scraper import Scraper


class DummyDB(DatabaseManager):
    """DatabaseManager subclass whose overridden methods are inert stubs.

    The initializer and finalizer are overridden with empty bodies, so the
    parent initializer is never run for instances of this class.
    """

    def __init__(self):
        """Do nothing; the parent initializer is deliberately not called."""

    def __del__(self):
        """Do nothing on finalization."""

    def insert_link(self, url, visited=False):
        """Report success for every insert without storing anything."""
        return True

    def get_unvisited_links(self):
        """Return an empty list: this stub never has pending links."""
        return []

    def mark_link_visited(self, url):
        """Ignore the request."""


def test_is_valid_link():
    """Same-site links pass; excluded paths and other hosts are rejected."""
    stub_db = DummyDB()
    scraper = Scraper(
        base_url='https://example.com',
        exclude_patterns=['/exclude'],
        db_manager=stub_db,
    )

    accepted = scraper.is_valid_link('https://example.com/page')
    assert accepted

    # One URL matching an exclude pattern, one on a different host.
    excluded = scraper.is_valid_link('https://example.com/exclude/page')
    assert not excluded
    foreign = scraper.is_valid_link('https://other.com/')
    assert not foreign


def test_fetch_links():
    """Absolute and relative links are collected; the excluded link is dropped."""
    stub_db = DummyDB()
    scraper = Scraper(
        base_url='https://example.com',
        exclude_patterns=['/exclude'],
        db_manager=stub_db,
    )
    # Adjacent literals concatenate to the same markup, one anchor per line.
    markup = (
        '<html><body>\n'
        '<a href="https://example.com/page1">1</a>\n'
        '<a href="/page2">2</a>\n'
        '<a href="https://example.com/exclude/hidden">3</a>\n'
        '</body></html>'
    )
    found = scraper.fetch_links(url='https://example.com', html=markup)
    expected = {'https://example.com/page1', 'https://example.com/page2'}
    assert found == expected


def test_scrape_page_parses_content_and_metadata():
    """scrape_page returns body text plus title and url metadata."""
    page_url = 'http://example.com/test'
    stub_db = DummyDB()
    scraper = Scraper(base_url='http://example.com', exclude_patterns=[], db_manager=stub_db)
    markup = '<html><head><title>Test</title></head><body><p>Hello</p></body></html>'

    scraped = scraper.scrape_page(markup, page_url)
    body, meta = scraped

    assert 'Hello' in body
    assert meta.get('title') == 'Test'
    assert meta.get('url') == page_url

import requests
import tqdm

class ListDB(DummyDB):
    """DummyDB variant that records links, visits and pages in plain containers."""

    def __init__(self):
        """Start with no links, no visited URLs and no pages."""
        self.pages = []
        self.visited = set()
        self.links = []

    def insert_link(self, url, visited=False):
        """Store one URL or a list of URLs; return True if any was new.

        The ``visited`` argument is accepted but not used.
        """
        candidates = url if isinstance(url, list) else [url]
        added_any = False
        for candidate in candidates:
            if candidate in self.links:
                continue
            self.links.append(candidate)
            added_any = True
        return added_any

    def get_unvisited_links(self):
        """Return one-element tuples for each stored link not yet marked visited."""
        pending = []
        for link in self.links:
            if link not in self.visited:
                pending.append((link,))
        return pending

    def mark_link_visited(self, url):
        """Record *url* as visited."""
        self.visited.add(url)

    def get_links_count(self):
        """Return how many links are stored."""
        return len(self.links)

    def get_visited_links_count(self):
        """Return how many distinct URLs were marked visited."""
        return len(self.visited)

    def insert_page(self, url, content, metadata):
        """Append a (url, content, metadata) tuple to the page list."""
        row = (url, content, metadata)
        self.pages.append(row)

    def get_all_pages(self):
        """Return the list of stored page tuples."""
        return self.pages


def test_start_scraping_process(monkeypatch):
    """Run start_scraping on one URL with network, parsing and progress bar stubbed.

    ``fetch_links`` and ``scrape_page`` are replaced on the Scraper class,
    ``requests.get`` returns a canned response, and ``tqdm.tqdm`` is swapped
    for an inert object. The single start URL should end up stored, visited
    and saved as a page.
    """
    db = ListDB()
    scraper = Scraper(base_url='http://example.com', exclude_patterns=[], db_manager=db)

    monkeypatch.setattr(Scraper, 'fetch_links', lambda self, url, html=None: set())
    monkeypatch.setattr(Scraper, 'scrape_page', lambda self, html, url: ('# MD', {'url': url}))

    class DummyResp:
        # Minimal canned response: only the attributes set here are available.
        status_code = 200
        headers = {'content-type': 'text/html'}
        content = b'<html></html>'

    # Accept any extra positional/keyword arguments (e.g. timeout=, headers=)
    # so the stub does not raise TypeError if the caller passes them.
    monkeypatch.setattr(requests, 'get', lambda url, *args, **kwargs: DummyResp())

    class DummyTqdm:
        # Inert replacement for the progress bar.
        def __init__(self, *a, **k):
            self.total = k.get('total', 0)

        def update(self, n=1):
            # Default mirrors the real tqdm.update(n=1) signature.
            pass

        def refresh(self):
            pass

        def close(self):
            pass

    monkeypatch.setattr(tqdm, 'tqdm', lambda *a, **k: DummyTqdm(*a, **k))

    scraper.start_scraping(url='http://example.com/page')

    assert db.get_links_count() == 1
    assert db.get_visited_links_count() == 1
    assert db.pages[0][0] == 'http://example.com/page'
29 changes: 29 additions & 0 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
import pytest
from src import utils


def test_randomstring_to_filename():
    """A space becomes an underscore and the trailing '!' is dropped."""
    filename = utils.randomstring_to_filename('Hello World!')
    assert filename == 'Hello_World'


def test_url_to_filename():
    """The scheme is dropped and separators in the URL become underscores."""
    source_url = 'https://example.com/path/index.html'
    assert utils.url_to_filename(source_url) == 'example_com_path_index_html'


def test_url_dirname():
    """The last path segment is stripped unless the URL already ends with '/'."""
    without_slash = utils.url_dirname('https://example.com/path/page')
    assert without_slash == 'https://example.com/path/'

    with_slash = utils.url_dirname('https://example.com/path/page/')
    assert with_slash == 'https://example.com/path/page/'


def test_deduplicate_list():
    """Duplicates are removed while first-seen order is kept."""
    deduplicated = utils.deduplicate_list([1, 2, 2, 3, 1])
    assert deduplicated == [1, 2, 3]


def test_randomstring_special_chars():
    """Punctuation is removed and the single space becomes an underscore."""
    filename = utils.randomstring_to_filename('a!@ b$c#')
    assert filename == 'a_bc'


def test_url_to_filename_invalid():
    """Passing a non-string (an int) must raise ValueError."""
    not_a_url = 123
    with pytest.raises(ValueError):
        utils.url_to_filename(not_a_url)