From ef6e96bf991e2f60815a13c1fd238805537279fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gr=C3=A9goire=20Compagnon?= <obeone@obeone.org> Date: Sun, 13 Jul 2025 17:42:47 +0200 Subject: [PATCH 1/2] feat(cli): add -w short option for overwrite cache --- README.md | 3 +- crawler_to_md/cli.py | 10 +++++++ tests/test_cli.py | 71 +++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 82 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index d5d352a..df1eaae 100644 --- a/README.md +++ b/README.md @@ -52,7 +52,7 @@ pip install . Start scraping with the following command: ```shell -crawler-to-md --url <URL> [--output-folder ./output] [--cache-folder ./cache] [--base-url <BASE_URL>] [--exclude <KEYWORD_IN_URL>] [--title <TITLE>] [--urls-file <URLS_FILE>] [-p <PROXY_URL>] +crawler-to-md --url <URL> [--output-folder ./output] [--cache-folder ./cache] [--overwrite-cache|-w] [--base-url <BASE_URL>] [--exclude <KEYWORD_IN_URL>] [--title <TITLE>] [--urls-file <URLS_FILE>] [-p <PROXY_URL>] ``` Options: @@ -61,6 +61,7 @@ Options: - `--urls-file`: Path to a file containing URLs to scrape, one URL per line. If '-', read from stdin. 📁 - `--output-folder`, `-o`: Where to save Markdown files (default: `./output`). 📂 - `--cache-folder`, `-c`: Where to store the database (default: `./cache`). 💾 +- `--overwrite-cache`, `-w`: Overwrite existing cache database before scraping. 🧹 - `--base-url`, `-b`: Filter links by base URL (default: URL's base). 🔎 - `--title`, `-t`: Final title of the markdown file. Defaults to the URL. 🏷️ - `--exclude`, `-e`: Exclude URLs containing this string (repeatable). 
โŒ diff --git a/crawler_to_md/cli.py b/crawler_to_md/cli.py index d7bef9a..d3f8034 100644 --- a/crawler_to_md/cli.py +++ b/crawler_to_md/cli.py @@ -50,6 +50,13 @@ def main(): help="Cache folder for storing database", default="~/.cache/crawler-to-md", ) + parser.add_argument( + "--overwrite-cache", + "-w", + action="store_true", + help="Overwrite existing cache database if present", + default=False, + ) parser.add_argument( "--base-url", "-b", @@ -169,6 +176,9 @@ def main(): db_path = os.path.join( args.cache_folder, utils.url_to_filename(first_url) + ".sqlite" ) + if args.overwrite_cache and os.path.exists(db_path): + logger.info(f"Removing existing cache database at {db_path}") + os.remove(db_path) db_manager = DatabaseManager(db_path) logger.info("DatabaseManager initialized.") diff --git a/tests/test_cli.py b/tests/test_cli.py index e772616..22416ff 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1,8 +1,11 @@ +import os +import sqlite3 import sys import pytest -from crawler_to_md import cli +from crawler_to_md import cli, utils +from crawler_to_md.database_manager import DatabaseManager from crawler_to_md.export_manager import ExportManager from crawler_to_md.scraper import Scraper @@ -177,3 +180,69 @@ def fake_init(*a, **k): with pytest.raises(SystemExit): cli.main() + +def test_cli_overwrite_cache(monkeypatch, tmp_path): + captured = {} + + def fake_init(self, db_path): + captured['exists'] = os.path.exists(db_path) + self.conn = sqlite3.connect(':memory:') + + monkeypatch.setattr(DatabaseManager, '__init__', fake_init) + monkeypatch.setattr(ExportManager, 'export_to_markdown', lambda *a, **k: None) + monkeypatch.setattr(ExportManager, 'export_to_json', lambda *a, **k: None) + monkeypatch.setattr(Scraper, 'start_scraping', lambda *a, **k: None) + + cache_folder = tmp_path / 'cache' + db_name = utils.url_to_filename('http://example.com') + '.sqlite' + db_path = cache_folder / db_name + cache_folder.mkdir() + db_path.write_text('dummy') + + args = [ 
+ 'prog', + '--url', + 'http://example.com', + '--output-folder', + str(tmp_path), + '--cache-folder', + str(cache_folder), + '--overwrite-cache', + ] + monkeypatch.setattr(sys, 'argv', args) + cli.main() + assert captured.get('exists') is False + + +def test_cli_overwrite_cache_short_option(monkeypatch, tmp_path): + captured = {} + + def fake_init(self, db_path): + captured['exists'] = os.path.exists(db_path) + self.conn = sqlite3.connect(':memory:') + + monkeypatch.setattr(DatabaseManager, '__init__', fake_init) + monkeypatch.setattr(ExportManager, 'export_to_markdown', lambda *a, **k: None) + monkeypatch.setattr(ExportManager, 'export_to_json', lambda *a, **k: None) + monkeypatch.setattr(Scraper, 'start_scraping', lambda *a, **k: None) + + cache_folder = tmp_path / 'cache' + db_name = utils.url_to_filename('http://example.com') + '.sqlite' + db_path = cache_folder / db_name + cache_folder.mkdir() + db_path.write_text('dummy') + + args = [ + 'prog', + '--url', + 'http://example.com', + '--output-folder', + str(tmp_path), + '--cache-folder', + str(cache_folder), + '-w', + ] + monkeypatch.setattr(sys, 'argv', args) + cli.main() + assert captured.get('exists') is False + From 48e081586314cade960407580a35b396011b3e1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gr=C3=A9goire=20Compagnon?= <obeone@obeone.org> Date: Sun, 13 Jul 2025 17:45:43 +0200 Subject: [PATCH 2/2] Update crawler_to_md/cli.py Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> --- crawler_to_md/cli.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/crawler_to_md/cli.py b/crawler_to_md/cli.py index d3f8034..86cde6c 100644 --- a/crawler_to_md/cli.py +++ b/crawler_to_md/cli.py @@ -178,7 +178,11 @@ def main(): ) if args.overwrite_cache and os.path.exists(db_path): logger.info(f"Removing existing cache database at {db_path}") - os.remove(db_path) + try: + os.remove(db_path) + except OSError as e: + logger.error(f"Failed to remove cache database at 
{db_path}: {e}") + sys.exit(1) db_manager = DatabaseManager(db_path) logger.info("DatabaseManager initialized.")