diff --git a/README.md b/README.md index d5d352a..df1eaae 100644 --- a/README.md +++ b/README.md @@ -52,7 +52,7 @@ pip install . Start scraping with the following command: ```shell -crawler-to-md --url [--output-folder ./output] [--cache-folder ./cache] [--base-url ] [--exclude ] [--title ] [--urls-file <URLS_FILE>] [-p <PROXY_URL>] +crawler-to-md --url <URL> [--output-folder ./output] [--cache-folder ./cache] [--overwrite-cache|-w] [--base-url <BASE_URL>] [--exclude <KEYWORD_IN_URL>] [--title <TITLE>] [--urls-file <URLS_FILE>] [-p <PROXY_URL>] ``` Options: @@ -61,6 +61,7 @@ Options: - `--urls-file`: Path to a file containing URLs to scrape, one URL per line. If '-', read from stdin. 📝 - `--output-folder`, `-o`: Where to save Markdown files (default: `./output`). 📂 - `--cache-folder`, `-c`: Where to store the database (default: `./cache`). 💾 +- `--overwrite-cache`, `-w`: Overwrite existing cache database before scraping. 🧹 - `--base-url`, `-b`: Filter links by base URL (default: URL's base). 🔎 - `--title`, `-t`: Final title of the markdown file. Defaults to the URL. 🏷️ - `--exclude`, `-e`: Exclude URLs containing this string (repeatable). 
โŒ diff --git a/crawler_to_md/cli.py b/crawler_to_md/cli.py index d7bef9a..86cde6c 100644 --- a/crawler_to_md/cli.py +++ b/crawler_to_md/cli.py @@ -50,6 +50,13 @@ def main(): help="Cache folder for storing database", default="~/.cache/crawler-to-md", ) + parser.add_argument( + "--overwrite-cache", + "-w", + action="store_true", + help="Overwrite existing cache database if present", + default=False, + ) parser.add_argument( "--base-url", "-b", @@ -169,6 +176,13 @@ def main(): db_path = os.path.join( args.cache_folder, utils.url_to_filename(first_url) + ".sqlite" ) + if args.overwrite_cache and os.path.exists(db_path): + logger.info(f"Removing existing cache database at {db_path}") + try: + os.remove(db_path) + except OSError as e: + logger.error(f"Failed to remove cache database at {db_path}: {e}") + sys.exit(1) db_manager = DatabaseManager(db_path) logger.info("DatabaseManager initialized.") diff --git a/tests/test_cli.py b/tests/test_cli.py index e772616..22416ff 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1,8 +1,11 @@ +import os +import sqlite3 import sys import pytest -from crawler_to_md import cli +from crawler_to_md import cli, utils +from crawler_to_md.database_manager import DatabaseManager from crawler_to_md.export_manager import ExportManager from crawler_to_md.scraper import Scraper @@ -177,3 +180,69 @@ def fake_init(*a, **k): with pytest.raises(SystemExit): cli.main() + +def test_cli_overwrite_cache(monkeypatch, tmp_path): + captured = {} + + def fake_init(self, db_path): + captured['exists'] = os.path.exists(db_path) + self.conn = sqlite3.connect(':memory:') + + monkeypatch.setattr(DatabaseManager, '__init__', fake_init) + monkeypatch.setattr(ExportManager, 'export_to_markdown', lambda *a, **k: None) + monkeypatch.setattr(ExportManager, 'export_to_json', lambda *a, **k: None) + monkeypatch.setattr(Scraper, 'start_scraping', lambda *a, **k: None) + + cache_folder = tmp_path / 'cache' + db_name = utils.url_to_filename('http://example.com') 
+ '.sqlite' + db_path = cache_folder / db_name + cache_folder.mkdir() + db_path.write_text('dummy') + + args = [ + 'prog', + '--url', + 'http://example.com', + '--output-folder', + str(tmp_path), + '--cache-folder', + str(cache_folder), + '--overwrite-cache', + ] + monkeypatch.setattr(sys, 'argv', args) + cli.main() + assert captured.get('exists') is False + + +def test_cli_overwrite_cache_short_option(monkeypatch, tmp_path): + captured = {} + + def fake_init(self, db_path): + captured['exists'] = os.path.exists(db_path) + self.conn = sqlite3.connect(':memory:') + + monkeypatch.setattr(DatabaseManager, '__init__', fake_init) + monkeypatch.setattr(ExportManager, 'export_to_markdown', lambda *a, **k: None) + monkeypatch.setattr(ExportManager, 'export_to_json', lambda *a, **k: None) + monkeypatch.setattr(Scraper, 'start_scraping', lambda *a, **k: None) + + cache_folder = tmp_path / 'cache' + db_name = utils.url_to_filename('http://example.com') + '.sqlite' + db_path = cache_folder / db_name + cache_folder.mkdir() + db_path.write_text('dummy') + + args = [ + 'prog', + '--url', + 'http://example.com', + '--output-folder', + str(tmp_path), + '--cache-folder', + str(cache_folder), + '-w', + ] + monkeypatch.setattr(sys, 'argv', args) + cli.main() + assert captured.get('exists') is False +