Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ pip install .
Start scraping with the following command:

```shell
crawler-to-md --url <URL> [--output-folder ./output] [--cache-folder ./cache] [--base-url <BASE_URL>] [--exclude <KEYWORD_IN_URL>] [--title <TITLE>] [--urls-file <URLS_FILE>]
crawler-to-md --url <URL> [--output-folder ./output] [--cache-folder ./cache] [--base-url <BASE_URL>] [--exclude <KEYWORD_IN_URL>] [--title <TITLE>] [--urls-file <URLS_FILE>] [-p <PROXY_URL>]
```

Options:
Expand All @@ -67,6 +67,7 @@ Options:
- `--export-individual`, `-ei`: Export each page as an individual Markdown file. 📝
- `--rate-limit`, `-rl`: Maximum number of requests per minute (default: 0, no rate limit). ⏱️
- `--delay`, `-d`: Delay between requests in seconds (default: 0, no delay). 🕒
- `--proxy`, `-p`: Proxy URL for HTTP or SOCKS requests. 🌐

One of the `--url` or `--urls-file` options is required.

Expand Down
24 changes: 17 additions & 7 deletions crawler_to_md/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,12 @@ def main():
help="Delay between requests in seconds",
default=0,
)
parser.add_argument(
"--proxy",
"-p",
help="Proxy URL for HTTP or SOCKS requests",
default=None,
)
parser.add_argument(
"--no-markdown",
action="store_true",
Expand Down Expand Up @@ -166,13 +172,17 @@ def main():
db_manager = DatabaseManager(db_path)
logger.info("DatabaseManager initialized.")

scraper = Scraper(
base_url=args.base_url,
exclude_patterns=args.exclude,
db_manager=db_manager,
rate_limit=args.rate_limit,
delay=args.delay,
)
try:
scraper = Scraper(
base_url=args.base_url,
exclude_patterns=args.exclude,
db_manager=db_manager,
rate_limit=args.rate_limit,
delay=args.delay,
proxy=args.proxy,
)
except ValueError as exc:
parser.error(str(exc))
logger.info("Scraper initialized.")

# Start the scraping process
Expand Down
36 changes: 33 additions & 3 deletions crawler_to_md/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ def __init__(
db_manager: DatabaseManager,
rate_limit=0,
delay=0,
proxy=None,
):
"""
Initialize the Scraper object.
Expand All @@ -35,13 +36,36 @@ def __init__(
db_manager (DatabaseManager): The database manager object.
rate_limit (int): Maximum number of requests per minute.
delay (float): Delay between requests in seconds.
proxy (str, optional): Proxy URL for HTTP or SOCKS requests.

Raises:
ValueError: If a proxy is provided but unreachable.
"""
logger.debug(f"Initializing Scraper with base URL: {base_url}")
self.base_url = base_url
self.exclude_patterns = exclude_patterns or []
self.db_manager = db_manager
self.rate_limit = rate_limit
self.delay = delay
self.session = requests.Session()
if proxy:
self.session.proxies.update({"http": proxy, "https": proxy})
self.proxy = proxy

if proxy:
self._test_proxy()

def _test_proxy(self):
    """
    Probe the base URL through the session to check the configured proxy.

    Sends a single HEAD request to ``self.base_url`` with a 5-second
    timeout using ``self.session``.

    Raises:
        ValueError: If the request raises ``requests.RequestException``.
    """
    try:
        self.session.head(self.base_url, timeout=5)
    except requests.RequestException as err:
        # NOTE(review): every RequestException is reported as a proxy
        # failure, including errors caused by the base URL itself.
        reason = f"Proxy unreachable: {err}"
        raise ValueError(reason) from err

def is_valid_link(self, link):
"""
Expand Down Expand Up @@ -79,7 +103,7 @@ def fetch_links(self, url, html=None):
try:
if not html:
# Send a GET request to the URL
response = requests.get(url)
response = self.session.get(url)
if response.status_code != 200:
logger.warning(
f"Failed to fetch {url} with status code {response.status_code}"
Expand Down Expand Up @@ -149,7 +173,13 @@ def scrape_page(self, html, url):

os.remove(tmp_path)

logger.debug(f"Successfully scraped content and metadata from {url}")
if not markdown.strip():
logger.warning("No content scraped from %s", url)
return None, None

logger.debug(
"Successfully scraped content and metadata from %s", url
)
return markdown, metadata

except Exception as e:
Expand Down Expand Up @@ -233,7 +263,7 @@ def start_scraping(self, url=None, urls_list=[]):
url = link[0] # Extract the URL from the link tuple

# Attempt to fetch the page content
response = requests.get(url)
response = self.session.get(url)

# Increment request count for rate limiting
request_count += 1
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ dependencies = [
"mdformat_footnote==0.1.1",
"mdformat_frontmatter==2.0.8",
"mdformat_tables==1.0.0",
"requests==2.32.4",
"requests[socks]==2.32.4",
"tqdm==4.67.1",
"markitdown==0.1.2",
"coloredlogs==15.0.1",
Expand Down
133 changes: 133 additions & 0 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
import sys

import pytest

from crawler_to_md import cli
from crawler_to_md.export_manager import ExportManager
from crawler_to_md.scraper import Scraper
Expand Down Expand Up @@ -44,3 +47,133 @@ def test_cli_disable_exports(monkeypatch, tmp_path):
assert calls["md"] is False
assert calls["json"] is False


def test_cli_proxy_option(monkeypatch, tmp_path):
    """The long ``--proxy`` flag is passed to ``Scraper`` unchanged."""
    proxy_url = 'http://proxy:8080'
    seen = {}

    def recording_init(
        self,
        base_url,
        exclude_patterns,
        db_manager,
        rate_limit=0,
        delay=0,
        proxy=None,
    ):
        # Stand-in constructor: only remember which proxy was supplied.
        seen['proxy'] = proxy

    def noop(*a, **k):
        return None

    monkeypatch.setattr(Scraper, '__init__', recording_init)
    # Replace crawling and both exporters with no-ops.
    monkeypatch.setattr(Scraper, 'start_scraping', noop)
    for exporter in ('export_to_markdown', 'export_to_json'):
        monkeypatch.setattr(ExportManager, exporter, noop)

    argv = ['prog', '--url', 'http://example.com']
    argv += ['--output-folder', str(tmp_path)]
    argv += ['--cache-folder', str(tmp_path / 'cache')]
    argv += ['--proxy', proxy_url]
    monkeypatch.setattr(sys, 'argv', argv)

    cli.main()

    assert seen.get('proxy') == proxy_url


def test_cli_proxy_short_option(monkeypatch, tmp_path):
    """The short ``-p`` flag is passed to ``Scraper`` unchanged."""
    proxy_url = 'http://proxy:8080'
    seen = {}

    def recording_init(
        self,
        base_url,
        exclude_patterns,
        db_manager,
        rate_limit=0,
        delay=0,
        proxy=None,
    ):
        # Stand-in constructor: only remember which proxy was supplied.
        seen['proxy'] = proxy

    def noop(*a, **k):
        return None

    monkeypatch.setattr(Scraper, '__init__', recording_init)
    # Replace crawling and both exporters with no-ops.
    monkeypatch.setattr(Scraper, 'start_scraping', noop)
    for exporter in ('export_to_markdown', 'export_to_json'):
        monkeypatch.setattr(ExportManager, exporter, noop)

    argv = ['prog', '--url', 'http://example.com']
    argv += ['--output-folder', str(tmp_path)]
    argv += ['--cache-folder', str(tmp_path / 'cache')]
    argv += ['-p', proxy_url]
    monkeypatch.setattr(sys, 'argv', argv)

    cli.main()

    assert seen.get('proxy') == proxy_url


def test_cli_socks_proxy(monkeypatch, tmp_path):
    """A ``socks5://`` proxy URL is passed to ``Scraper`` unchanged."""
    proxy_url = 'socks5://localhost:9050'
    seen = {}

    def recording_init(
        self,
        base_url,
        exclude_patterns,
        db_manager,
        rate_limit=0,
        delay=0,
        proxy=None,
    ):
        # Stand-in constructor: only remember which proxy was supplied.
        seen['proxy'] = proxy

    def noop(*a, **k):
        return None

    monkeypatch.setattr(Scraper, '__init__', recording_init)
    # Replace crawling and both exporters with no-ops.
    monkeypatch.setattr(Scraper, 'start_scraping', noop)
    for exporter in ('export_to_markdown', 'export_to_json'):
        monkeypatch.setattr(ExportManager, exporter, noop)

    argv = ['prog', '--url', 'http://example.com']
    argv += ['--output-folder', str(tmp_path)]
    argv += ['--cache-folder', str(tmp_path / 'cache')]
    argv += ['--proxy', proxy_url]
    monkeypatch.setattr(sys, 'argv', argv)

    cli.main()

    assert seen.get('proxy') == proxy_url


def test_cli_proxy_error(monkeypatch, tmp_path):
    """A ``ValueError`` from ``Scraper`` ends the CLI with a usage error.

    ``Scraper.__init__`` is replaced with a stub that always raises
    ``ValueError``; ``cli.main()`` is then expected to exit via
    ``SystemExit`` with status 2.
    """

    def fake_init(*a, **k):
        raise ValueError('Proxy unreachable')

    monkeypatch.setattr(Scraper, '__init__', fake_init)
    cache_folder = tmp_path / 'cache'
    args = [
        'prog',
        '--url',
        'http://example.com',
        '--output-folder',
        str(tmp_path),
        '--cache-folder',
        str(cache_folder),
        '--proxy',
        'http://proxy:8080',
    ]
    monkeypatch.setattr(sys, 'argv', args)
    with pytest.raises(SystemExit) as exc_info:
        cli.main()
    # argparse's parser.error() terminates with status 2. Checking the code
    # rules out a clean exit (0/None) or an unrelated sys.exit() in main().
    assert exc_info.value.code == 2

49 changes: 48 additions & 1 deletion tests/test_scraper.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from unittest.mock import MagicMock, patch

import pytest
import requests
import tqdm

Expand Down Expand Up @@ -152,7 +153,7 @@ class DummyResp:
content = b'<html></html>'
text = '<html></html>'

monkeypatch.setattr(requests, 'get', lambda url: DummyResp())
monkeypatch.setattr(scraper.session, 'get', lambda url: DummyResp())

class DummyTqdm:
def __init__(self, *a, **k):
Expand All @@ -171,3 +172,49 @@ def close(self):
assert db.get_links_count() == 1
assert db.get_visited_links_count() == 1
assert db.pages[0][0] == 'http://example.com/page'


def test_scraper_proxy_initialization(monkeypatch):
    """An HTTP proxy URL is installed for both schemes on the session."""
    # Disable the reachability probe so construction stays offline.
    monkeypatch.setattr(Scraper, '_test_proxy', lambda self: None)
    scraper = Scraper(
        base_url='http://example.com',
        exclude_patterns=[],
        db_manager=DummyDB(),
        proxy='http://proxy:8080',
    )
    configured = scraper.session.proxies
    assert configured.get('http') == 'http://proxy:8080'
    assert configured.get('https') == 'http://proxy:8080'


def test_scraper_socks_proxy_initialization(monkeypatch):
    """A SOCKS proxy URL is installed for both schemes on the session."""
    socks_url = 'socks5://localhost:9050'
    # Disable the reachability probe so construction stays offline.
    monkeypatch.setattr(Scraper, '_test_proxy', lambda self: None)
    scraper = Scraper(
        base_url='http://example.com',
        exclude_patterns=[],
        db_manager=DummyDB(),
        proxy=socks_url,
    )
    configured = scraper.session.proxies
    assert configured.get('http') == socks_url
    assert configured.get('https') == socks_url


def test_scraper_proxy_failure_detection(monkeypatch):
    """Constructing a ``Scraper`` raises ``ValueError`` when HEAD fails."""

    def failing_head(self, url, timeout=5):
        # Simulate the session being unable to go through the proxy.
        raise requests.exceptions.ProxyError("fail")

    monkeypatch.setattr(requests.Session, 'head', failing_head)

    scraper_kwargs = {
        'base_url': 'http://example.com',
        'exclude_patterns': [],
        'db_manager': DummyDB(),
        'proxy': 'http://proxy:8080',
    }
    with pytest.raises(ValueError):
        Scraper(**scraper_kwargs)


def test_scrape_page_returns_none_for_empty_content(monkeypatch):
    """``scrape_page`` returns ``(None, None)`` when conversion yields ''."""
    scraper = Scraper(
        base_url='http://example.com',
        exclude_patterns=[],
        db_manager=DummyDB(),
    )
    empty_page = '<html><body></body></html>'

    with patch('crawler_to_md.scraper.MarkItDown') as markitdown_cls:
        # Force the converter to produce an empty string.
        markitdown_cls.return_value.convert.return_value = ''
        content, metadata = scraper.scrape_page(
            empty_page, 'http://example.com/empty'
        )

    assert content is None
    assert metadata is None