Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ pip install .
Start scraping with the following command:

```shell
crawler-to-md --url <URL> [--output-folder ./output] [--cache-folder ./cache] [--base-url <BASE_URL>] [--exclude <KEYWORD_IN_URL>] [--title <TITLE>] [--urls-file <URLS_FILE>] [-p <PROXY_URL>]
crawler-to-md --url <URL> [--output-folder ./output] [--cache-folder ./cache] [--overwrite-cache|-w] [--base-url <BASE_URL>] [--exclude <KEYWORD_IN_URL>] [--title <TITLE>] [--urls-file <URLS_FILE>] [-p <PROXY_URL>]
```

Options:
Expand All @@ -61,6 +61,7 @@ Options:
- `--urls-file`: Path to a file containing URLs to scrape, one URL per line. If '-', read from stdin. 📁
- `--output-folder`, `-o`: Where to save Markdown files (default: `./output`). 📂
- `--cache-folder`, `-c`: Where to store the database (default: `~/.cache/crawler-to-md`). 💾
- `--overwrite-cache`, `-w`: Overwrite existing cache database before scraping. 🧹
- `--base-url`, `-b`: Filter links by base URL (default: URL's base). 🔎
- `--title`, `-t`: Final title of the markdown file. Defaults to the URL. 🏷️
- `--exclude`, `-e`: Exclude URLs containing this string (repeatable). ❌
Expand Down
10 changes: 10 additions & 0 deletions crawler_to_md/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,13 @@ def main():
help="Cache folder for storing database",
default="~/.cache/crawler-to-md",
)
parser.add_argument(
"--overwrite-cache",
"-w",
action="store_true",
help="Overwrite existing cache database if present",
default=False,
)
parser.add_argument(
"--base-url",
"-b",
Expand Down Expand Up @@ -169,6 +176,9 @@ def main():
db_path = os.path.join(
args.cache_folder, utils.url_to_filename(first_url) + ".sqlite"
)
if args.overwrite_cache and os.path.exists(db_path):
logger.info(f"Removing existing cache database at {db_path}")
os.remove(db_path)
Comment thread
obeone marked this conversation as resolved.
Outdated
db_manager = DatabaseManager(db_path)
logger.info("DatabaseManager initialized.")

Expand Down
71 changes: 70 additions & 1 deletion tests/test_cli.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
import os
import sqlite3
import sys

import pytest

from crawler_to_md import cli
from crawler_to_md import cli, utils
from crawler_to_md.database_manager import DatabaseManager
from crawler_to_md.export_manager import ExportManager
from crawler_to_md.scraper import Scraper

Expand Down Expand Up @@ -177,3 +180,69 @@ def fake_init(*a, **k):
with pytest.raises(SystemExit):
cli.main()


def test_cli_overwrite_cache(monkeypatch, tmp_path):
    """Check that ``--overwrite-cache`` deletes a stale cache database.

    A dummy database file is written at the path the CLI derives for the
    URL. ``DatabaseManager.__init__`` is replaced by a probe that records
    whether that file still exists when the manager is constructed; with
    the flag set, it must already be gone.
    """
    url = 'http://example.com'
    seen = {}

    def fake_init(self, db_path):
        # Record the on-disk state at construction time, then give the
        # instance an in-memory connection in place of the real setup.
        seen['exists'] = os.path.exists(db_path)
        self.conn = sqlite3.connect(':memory:')

    def _noop(*a, **k):
        return None

    # Pre-populate the cache folder with a file the CLI should remove.
    cache_folder = tmp_path / 'cache'
    cache_folder.mkdir()
    stale_db = cache_folder / (utils.url_to_filename(url) + '.sqlite')
    stale_db.write_text('dummy')

    # Stub out everything that would scrape or export.
    monkeypatch.setattr(DatabaseManager, '__init__', fake_init)
    for method_name in ('export_to_markdown', 'export_to_json'):
        monkeypatch.setattr(ExportManager, method_name, _noop)
    monkeypatch.setattr(Scraper, 'start_scraping', _noop)

    monkeypatch.setattr(
        sys,
        'argv',
        [
            'prog',
            '--url', url,
            '--output-folder', str(tmp_path),
            '--cache-folder', str(cache_folder),
            '--overwrite-cache',
        ],
    )

    cli.main()

    assert seen.get('exists') is False


def test_cli_overwrite_cache_short_option(monkeypatch, tmp_path):
    """Check that the short flag ``-w`` deletes a stale cache database.

    Same scenario as the long-option test: a dummy database file is
    created up front, and a probe installed as ``DatabaseManager.__init__``
    records whether the file still exists when the manager is constructed.
    """
    url = 'http://example.com'
    seen = {}

    def fake_init(self, db_path):
        # Capture whether the cache file survived until construction.
        seen['exists'] = os.path.exists(db_path)
        self.conn = sqlite3.connect(':memory:')

    def _noop(*a, **k):
        return None

    # Create the cache file that the CLI is expected to delete.
    cache_folder = tmp_path / 'cache'
    cache_folder.mkdir()
    stale_db = cache_folder / (utils.url_to_filename(url) + '.sqlite')
    stale_db.write_text('dummy')

    # Neutralise scraping and exporting so only the cache handling runs.
    monkeypatch.setattr(DatabaseManager, '__init__', fake_init)
    for method_name in ('export_to_markdown', 'export_to_json'):
        monkeypatch.setattr(ExportManager, method_name, _noop)
    monkeypatch.setattr(Scraper, 'start_scraping', _noop)

    monkeypatch.setattr(
        sys,
        'argv',
        [
            'prog',
            '--url', url,
            '--output-folder', str(tmp_path),
            '--cache-folder', str(cache_folder),
            '-w',
        ],
    )

    cli.main()

    assert seen.get('exists') is False