Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ crawler-to-md --url https://www.example.com
- Exports each page as an individual Markdown file if `--export-individual` is used. 📝
- Uses SQLite for efficient data management. 📊
- Configurable via command-line arguments. ⚙️
- Include or exclude specific HTML elements using CSS-like selectors (#id, .class, tag) before Markdown conversion. 🧩
- Docker support. 🐳

## 📋 Requirements
Expand All @@ -52,7 +53,7 @@ pip install .
Start scraping with the following command:

```shell
crawler-to-md --url <URL> [--output-folder ./output] [--cache-folder ./cache] [--overwrite-cache|-w] [--base-url <BASE_URL>] [--exclude <KEYWORD_IN_URL>] [--title <TITLE>] [--urls-file <URLS_FILE>] [-p <PROXY_URL>]
crawler-to-md --url <URL> [--output-folder ./output] [--cache-folder ./cache] [--overwrite-cache|-w] [--base-url <BASE_URL>] [--exclude-url <KEYWORD_IN_URL>] [--title <TITLE>] [--urls-file <URLS_FILE>] [-p <PROXY_URL>] [--include <SELECTOR>] [--exclude <SELECTOR>]
```

Options:
Expand All @@ -64,11 +65,13 @@ Options:
- `--overwrite-cache`, `-w`: Overwrite existing cache database before scraping. 🧹
- `--base-url`, `-b`: Filter links by base URL (default: URL's base). 🔎
- `--title`, `-t`: Final title of the markdown file. Defaults to the URL. 🏷️
- `--exclude`, `-e`: Exclude URLs containing this string (repeatable). ❌
- `--exclude-url`, `-e`: Exclude URLs containing this string (repeatable). ❌
- `--export-individual`, `-ei`: Export each page as an individual Markdown file. 📝
- `--rate-limit`, `-rl`: Maximum number of requests per minute (default: 0, no rate limit). ⏱️
- `--delay`, `-d`: Delay between requests in seconds (default: 0, no delay). 🕒
- `--proxy`, `-p`: Proxy URL for HTTP or SOCKS requests. 🌐
- `--include`, `-i`: CSS-like selector (#id, .class, tag) to include before Markdown conversion (repeatable). ✅
- `--exclude`, `-x`: CSS-like selector (#id, .class, tag) to exclude before Markdown conversion (repeatable). 🚫

One of the `--url` or `--urls-file` options is required.

Expand Down
26 changes: 24 additions & 2 deletions crawler_to_md/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ def main():
help="Final title of the markdown file. Defaults to the URL",
)
parser.add_argument(
"--exclude",
"--exclude-url",
"-e",
action="append",
help="Exclude URLs containing this string",
Expand Down Expand Up @@ -113,6 +113,26 @@ def main():
help="Disable generation of the compiled JSON file",
default=False,
)
parser.add_argument(
"--include",
"-i",
action="append",
help=(
"CSS-like selector (#id, .class, tag) to include before Markdown "
"conversion. Repeatable."
),
default=[],
)
parser.add_argument(
"--exclude",
"-x",
action="append",
help=(
"CSS-like selector (#id, .class, tag) to exclude before Markdown "
"conversion. Repeatable."
),
default=[],
)

try:
import argcomplete
Expand Down Expand Up @@ -189,11 +209,13 @@ def main():
try:
scraper = Scraper(
base_url=args.base_url,
exclude_patterns=args.exclude,
exclude_patterns=args.exclude_url,
db_manager=db_manager,
rate_limit=args.rate_limit,
delay=args.delay,
proxy=args.proxy,
include_filters=args.include,
exclude_filters=args.exclude,
)
except ValueError as exc:
parser.error(str(exc))
Expand Down
59 changes: 55 additions & 4 deletions crawler_to_md/scraper.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import copy
import json
import os
import tempfile
Expand Down Expand Up @@ -25,18 +26,23 @@ def __init__(
rate_limit=0,
delay=0,
proxy=None,
include_filters=None,
exclude_filters=None,
):
"""
Initialize the Scraper object.
Log the initialization process.
Initialize the Scraper object and log the initialization process.

Args:
base_url (str): The base URL to start scraping from.
exclude_patterns (list): List of patterns to exclude from scraping.
exclude_patterns (list): List of URL patterns to exclude from scraping.
db_manager (DatabaseManager): The database manager object.
rate_limit (int): Maximum number of requests per minute.
delay (float): Delay between requests in seconds.
proxy (str, optional): Proxy URL for HTTP or SOCKS requests.
include_filters (list, optional): CSS-like selectors (#id, .class, tag)
of elements to include before Markdown conversion.
exclude_filters (list, optional): CSS-like selectors (#id, .class, tag)
of elements to exclude before Markdown conversion.

Raises:
ValueError: If a proxy is provided but unreachable.
Expand All @@ -52,6 +58,9 @@ def __init__(
self.session.proxies.update({"http": proxy, "https": proxy})
self.proxy = proxy

self.include_filters = include_filters or []
self.exclude_filters = exclude_filters or []

if proxy:
self._test_proxy()

Expand All @@ -67,6 +76,24 @@ def _test_proxy(self):
except requests.RequestException as exc:
raise ValueError(f"Proxy unreachable: {exc}") from exc

def _find_elements(self, soup: BeautifulSoup, selector: str):
"""
Locate elements in the soup using a CSS-like selector.

Args:
soup (BeautifulSoup): Parsed HTML document.
selector (str): Selector in the form of '#id', '.class', or tag name.

Returns:
list[Tag]: List of matching elements.
"""
if selector.startswith("#"):
element = soup.find(id=selector[1:])
return [element] if element else []
if selector.startswith("."):
return soup.find_all(class_=selector[1:])
return soup.find_all(selector)

def is_valid_link(self, link):
"""
Check if the given link is valid for scraping.
Expand Down Expand Up @@ -157,16 +184,40 @@ def scrape_page(self, html, url):
# Parse the content using BeautifulSoup
soup = BeautifulSoup(html, "html.parser")

if self.include_filters:
# Create a new soup to hold the included elements
new_soup = BeautifulSoup("", "html.parser")
# Ensure the new soup has a body tag if it's a full HTML document
if soup.find("body"):
body = new_soup.new_tag("body")
new_soup.append(body)
else:
body = new_soup

elements = []
for selector in self.include_filters:
elements.extend(self._find_elements(soup, selector))

# Append a copy of each element to the new soup to maintain structure
for el in elements:
body.append(copy.copy(el))
soup = new_soup

for selector in self.exclude_filters:
for element in self._find_elements(soup, selector):
element.decompose()

# Extract title from the page
title = soup.title.string if soup.title else ""

metadata = {"title": title}

filtered_html = str(soup)
# Convert the HTML to Markdown
with tempfile.NamedTemporaryFile(
mode="w+", delete=False, suffix=".html"
) as tmp:
tmp.write(html)
tmp.write(filtered_html)
tmp_path = tmp.name

markdown = str(MarkItDown().convert(tmp_path))
Expand Down
136 changes: 136 additions & 0 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,15 @@ def fake_init(
rate_limit=0,
delay=0,
proxy=None,
include_filters=None,
exclude_filters=None,
):
"""
Fake initializer to capture proxy argument.

Args:
proxy (str, optional): Proxy URL.
"""
captured['proxy'] = proxy

monkeypatch.setattr(Scraper, '__init__', fake_init)
Expand Down Expand Up @@ -98,7 +106,15 @@ def fake_init(
rate_limit=0,
delay=0,
proxy=None,
include_filters=None,
exclude_filters=None,
):
"""
Fake initializer to capture proxy argument.

Args:
proxy (str, optional): Proxy URL.
"""
captured['proxy'] = proxy

monkeypatch.setattr(Scraper, '__init__', fake_init)
Expand Down Expand Up @@ -134,7 +150,15 @@ def fake_init(
rate_limit=0,
delay=0,
proxy=None,
include_filters=None,
exclude_filters=None,
):
"""
Fake initializer to capture proxy argument.

Args:
proxy (str, optional): Proxy URL.
"""
captured['proxy'] = proxy

monkeypatch.setattr(Scraper, '__init__', fake_init)
Expand Down Expand Up @@ -181,6 +205,118 @@ def fake_init(*a, **k):
cli.main()


def test_cli_include_exclude_options(monkeypatch, tmp_path):
    """
    Check that the long --include / --exclude options reach the Scraper.

    The Scraper constructor is replaced by a recorder, and the crawl and
    export entry points are replaced by no-ops, so running ``cli.main()``
    only exercises argument parsing and wiring.

    Args:
        monkeypatch (pytest.MonkeyPatch): Pytest monkeypatch fixture.
        tmp_path (pathlib.Path): Temporary path for tests.
    """
    captured = {}

    def record_filters(
        self,
        base_url,
        exclude_patterns,
        db_manager,
        rate_limit=0,
        delay=0,
        proxy=None,
        include_filters=None,
        exclude_filters=None,
    ):
        """
        Stand-in constructor that records the selector arguments.

        Args:
            include_filters (list, optional): Selectors to include.
            exclude_filters (list, optional): Selectors to exclude.
        """
        captured.update(
            include_filters=include_filters,
            exclude_filters=exclude_filters,
        )

    def do_nothing(*args, **kwargs):
        """Accept any call and return None."""
        return None

    monkeypatch.setattr(Scraper, '__init__', record_filters)
    monkeypatch.setattr(Scraper, 'start_scraping', do_nothing)
    for export_method in ('export_to_markdown', 'export_to_json'):
        monkeypatch.setattr(ExportManager, export_method, do_nothing)

    cache_dir = tmp_path / 'cache'
    argv = ['prog', '--url', 'http://example.com']
    argv += ['--output-folder', str(tmp_path)]
    argv += ['--cache-folder', str(cache_dir)]
    argv += ['--include', 'p']
    argv += ['--exclude', '.remove']
    monkeypatch.setattr(sys, 'argv', argv)

    cli.main()

    assert captured.get('include_filters') == ['p']
    assert captured.get('exclude_filters') == ['.remove']


def test_cli_include_exclude_short_options(monkeypatch, tmp_path):
    """
    Check that the short -i / -x options map to include and exclude selectors.

    The Scraper constructor is replaced by a recorder, and the crawl and
    export entry points are replaced by no-ops, so running ``cli.main()``
    only exercises argument parsing and wiring.

    Args:
        monkeypatch (pytest.MonkeyPatch): Pytest monkeypatch fixture.
        tmp_path (pathlib.Path): Temporary path for tests.
    """
    captured = {}

    def record_filters(
        self,
        base_url,
        exclude_patterns,
        db_manager,
        rate_limit=0,
        delay=0,
        proxy=None,
        include_filters=None,
        exclude_filters=None,
    ):
        """
        Stand-in constructor that records selectors given via short options.

        Args:
            include_filters (list, optional): Selectors to include.
            exclude_filters (list, optional): Selectors to exclude.
        """
        captured.update(
            include_filters=include_filters,
            exclude_filters=exclude_filters,
        )

    def do_nothing(*args, **kwargs):
        """Accept any call and return None."""
        return None

    monkeypatch.setattr(Scraper, '__init__', record_filters)
    monkeypatch.setattr(Scraper, 'start_scraping', do_nothing)
    for export_method in ('export_to_markdown', 'export_to_json'):
        monkeypatch.setattr(ExportManager, export_method, do_nothing)

    cache_dir = tmp_path / 'cache'
    argv = ['prog', '--url', 'http://example.com']
    argv += ['--output-folder', str(tmp_path)]
    argv += ['--cache-folder', str(cache_dir)]
    argv += ['-i', '#keep']
    argv += ['-x', 'span']
    monkeypatch.setattr(sys, 'argv', argv)

    cli.main()

    assert captured.get('include_filters') == ['#keep']
    assert captured.get('exclude_filters') == ['span']


def test_cli_overwrite_cache(monkeypatch, tmp_path):
captured = {}

Expand Down
Loading