Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ pip install .
Start scraping with the following command:

```shell
crawler-to-md --url <URL> [--output-folder ./output] [--cache-folder ./cache] [--base-url <BASE_URL>] [--exclude <KEYWORD_IN_URL>] [--title <TITLE>] [--urls-file <URLS_FILE>]
crawler-to-md --url <URL> [--output-folder ./output] [--cache-folder ./cache] [--base-url <BASE_URL>] [--exclude <KEYWORD_IN_URL>] [--title <TITLE>] [--urls-file <URLS_FILE>] [--proxy <PROXY_URL>]
```

Options:
Expand All @@ -67,6 +67,7 @@ Options:
- `--export-individual`, `-ei`: Export each page as an individual Markdown file. 📝
- `--rate-limit`, `-rl`: Maximum number of requests per minute (default: 0, no rate limit). ⏱️
- `--delay`, `-d`: Delay between requests in seconds (default: 0, no delay). 🕒
- `--proxy`, `-p`: URL of an HTTP or SOCKS proxy to send all requests through (e.g. `socks5://localhost:9050`). 🌐

One of the `--url` or `--urls-file` options is required.

Expand Down
24 changes: 17 additions & 7 deletions crawler_to_md/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,12 @@ def main():
help="Delay between requests in seconds",
default=0,
)
parser.add_argument(
"--proxy",
"-p",
help="Proxy URL for HTTP or SOCKS requests",
default=None,
)
parser.add_argument(
"--no-markdown",
action="store_true",
Expand Down Expand Up @@ -166,13 +172,17 @@ def main():
db_manager = DatabaseManager(db_path)
logger.info("DatabaseManager initialized.")

scraper = Scraper(
base_url=args.base_url,
exclude_patterns=args.exclude,
db_manager=db_manager,
rate_limit=args.rate_limit,
delay=args.delay,
)
try:
scraper = Scraper(
base_url=args.base_url,
exclude_patterns=args.exclude,
db_manager=db_manager,
rate_limit=args.rate_limit,
delay=args.delay,
proxy=args.proxy,
)
except ValueError as exc:
parser.error(str(exc))
logger.info("Scraper initialized.")

# Start the scraping process
Expand Down
28 changes: 26 additions & 2 deletions crawler_to_md/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ def __init__(
db_manager: DatabaseManager,
rate_limit=0,
delay=0,
proxy=None,
):
"""
Initialize the Scraper object.
Expand All @@ -35,13 +36,36 @@ def __init__(
db_manager (DatabaseManager): The database manager object.
rate_limit (int): Maximum number of requests per minute.
delay (float): Delay between requests in seconds.
proxy (str, optional): Proxy URL for HTTP or SOCKS requests.

Raises:
ValueError: If a proxy is provided but unreachable.
"""
logger.debug(f"Initializing Scraper with base URL: {base_url}")
self.base_url = base_url
self.exclude_patterns = exclude_patterns or []
self.db_manager = db_manager
self.rate_limit = rate_limit
self.delay = delay
self.session = requests.Session()
if proxy:
self.session.proxies.update({"http": proxy, "https": proxy})
self.proxy = proxy

if proxy:
self._test_proxy()

def _test_proxy(self, timeout=5):
    """
    Ensure the configured proxy is reachable.

    Sends one HEAD request for the base URL through ``self.session`` and
    converts any request failure into a ``ValueError``.

    Args:
        timeout (float): Seconds to wait for the probe request before
            giving up. Defaults to 5.

    Raises:
        ValueError: If no base URL is configured, or if the probe request
            for the base URL fails.
    """
    if not self.base_url:
        # With nothing to probe, fail with an accurate message instead of
        # letting an invalid-URL error be reported as a proxy problem.
        raise ValueError("Cannot test proxy: no base URL configured")

    try:
        self.session.head(self.base_url, timeout=timeout)
    except requests.RequestException as exc:
        # Callers handle ValueError, so keep the original cause chained.
        raise ValueError(f"Proxy unreachable: {exc}") from exc

def is_valid_link(self, link):
"""
Expand Down Expand Up @@ -79,7 +103,7 @@ def fetch_links(self, url, html=None):
try:
if not html:
# Send a GET request to the URL
response = requests.get(url)
response = self.session.get(url)
if response.status_code != 200:
logger.warning(
f"Failed to fetch {url} with status code {response.status_code}"
Expand Down Expand Up @@ -233,7 +257,7 @@ def start_scraping(self, url=None, urls_list=[]):
url = link[0] # Extract the URL from the link tuple

# Attempt to fetch the page content
response = requests.get(url)
response = self.session.get(url)

# Increment request count for rate limiting
request_count += 1
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ dependencies = [
"mdformat_footnote==0.1.1",
"mdformat_frontmatter==2.0.8",
"mdformat_tables==1.0.0",
"requests==2.32.4",
"requests[socks]==2.32.4",
"tqdm==4.67.1",
"markitdown==0.1.2",
"coloredlogs==15.0.1",
Expand Down
133 changes: 133 additions & 0 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
import sys

import pytest

from crawler_to_md import cli
from crawler_to_md.export_manager import ExportManager
from crawler_to_md.scraper import Scraper
Expand Down Expand Up @@ -44,3 +47,133 @@ def test_cli_disable_exports(monkeypatch, tmp_path):
assert calls["md"] is False
assert calls["json"] is False


def test_cli_proxy_option(monkeypatch, tmp_path):
    """The long ``--proxy`` flag is passed to ``Scraper`` as ``proxy``."""
    captured = {}

    def record_proxy(
        self,
        base_url,
        exclude_patterns,
        db_manager,
        rate_limit=0,
        delay=0,
        proxy=None,
    ):
        captured['proxy'] = proxy

    def noop(*args, **kwargs):
        return None

    # Replace the scraper constructor, scraping and export methods with stubs.
    monkeypatch.setattr(Scraper, '__init__', record_proxy)
    monkeypatch.setattr(Scraper, 'start_scraping', noop)
    for export_method in ('export_to_markdown', 'export_to_json'):
        monkeypatch.setattr(ExportManager, export_method, noop)

    argv = [
        'prog',
        '--url', 'http://example.com',
        '--output-folder', str(tmp_path),
        '--cache-folder', str(tmp_path / 'cache'),
        '--proxy', 'http://proxy:8080',
    ]
    monkeypatch.setattr(sys, 'argv', argv)

    cli.main()

    assert captured.get('proxy') == 'http://proxy:8080'


def test_cli_proxy_short_option(monkeypatch, tmp_path):
    """The short ``-p`` flag is passed to ``Scraper`` as ``proxy``."""
    captured = {}

    def record_proxy(
        self,
        base_url,
        exclude_patterns,
        db_manager,
        rate_limit=0,
        delay=0,
        proxy=None,
    ):
        captured['proxy'] = proxy

    def noop(*args, **kwargs):
        return None

    # Replace the scraper constructor, scraping and export methods with stubs.
    monkeypatch.setattr(Scraper, '__init__', record_proxy)
    monkeypatch.setattr(Scraper, 'start_scraping', noop)
    for export_method in ('export_to_markdown', 'export_to_json'):
        monkeypatch.setattr(ExportManager, export_method, noop)

    argv = [
        'prog',
        '--url', 'http://example.com',
        '--output-folder', str(tmp_path),
        '--cache-folder', str(tmp_path / 'cache'),
        '-p', 'http://proxy:8080',
    ]
    monkeypatch.setattr(sys, 'argv', argv)

    cli.main()

    assert captured.get('proxy') == 'http://proxy:8080'


def test_cli_socks_proxy(monkeypatch, tmp_path):
    """A ``socks5://`` proxy URL is passed to ``Scraper`` unchanged."""
    captured = {}

    def record_proxy(
        self,
        base_url,
        exclude_patterns,
        db_manager,
        rate_limit=0,
        delay=0,
        proxy=None,
    ):
        captured['proxy'] = proxy

    def noop(*args, **kwargs):
        return None

    # Replace the scraper constructor, scraping and export methods with stubs.
    monkeypatch.setattr(Scraper, '__init__', record_proxy)
    monkeypatch.setattr(Scraper, 'start_scraping', noop)
    for export_method in ('export_to_markdown', 'export_to_json'):
        monkeypatch.setattr(ExportManager, export_method, noop)

    argv = [
        'prog',
        '--url', 'http://example.com',
        '--output-folder', str(tmp_path),
        '--cache-folder', str(tmp_path / 'cache'),
        '--proxy', 'socks5://localhost:9050',
    ]
    monkeypatch.setattr(sys, 'argv', argv)

    cli.main()

    assert captured.get('proxy') == 'socks5://localhost:9050'


def test_cli_proxy_error(monkeypatch, tmp_path, capsys):
    """A ``ValueError`` from ``Scraper`` is reported as a CLI usage error."""

    def fake_init(*a, **k):
        raise ValueError('Proxy unreachable')

    monkeypatch.setattr(Scraper, '__init__', fake_init)
    cache_folder = tmp_path / 'cache'
    args = [
        'prog',
        '--url',
        'http://example.com',
        '--output-folder',
        str(tmp_path),
        '--cache-folder',
        str(cache_folder),
        '--proxy',
        'http://proxy:8080',
    ]
    monkeypatch.setattr(sys, 'argv', args)
    with pytest.raises(SystemExit) as excinfo:
        cli.main()

    # argparse also exits on unrelated problems such as an unrecognised flag,
    # so a bare SystemExit check could pass without the proxy error path
    # running. Pin the exit status and the reported message as well.
    assert excinfo.value.code == 2
    assert 'Proxy unreachable' in capsys.readouterr().err

36 changes: 35 additions & 1 deletion tests/test_scraper.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from unittest.mock import MagicMock, patch

import pytest
import requests
import tqdm

Expand Down Expand Up @@ -152,7 +153,7 @@ class DummyResp:
content = b'<html></html>'
text = '<html></html>'

monkeypatch.setattr(requests, 'get', lambda url: DummyResp())
monkeypatch.setattr(scraper.session, 'get', lambda url: DummyResp())

class DummyTqdm:
def __init__(self, *a, **k):
Expand All @@ -171,3 +172,36 @@ def close(self):
assert db.get_links_count() == 1
assert db.get_visited_links_count() == 1
assert db.pages[0][0] == 'http://example.com/page'


def test_scraper_proxy_initialization(monkeypatch):
    """An HTTP proxy URL is set for both the http and https session schemes."""
    proxy_url = 'http://proxy:8080'
    # Stub the reachability probe so construction does not call it for real.
    monkeypatch.setattr(Scraper, '_test_proxy', lambda self: None)

    scraper = Scraper(
        base_url='http://example.com',
        exclude_patterns=[],
        db_manager=DummyDB(),
        proxy=proxy_url,
    )

    session_proxies = scraper.session.proxies
    assert session_proxies.get('http') == proxy_url
    assert session_proxies.get('https') == proxy_url


def test_scraper_socks_proxy_initialization(monkeypatch):
    """A SOCKS proxy URL is set for both the http and https session schemes."""
    socks_url = 'socks5://localhost:9050'
    # Stub the reachability probe so construction does not call it for real.
    monkeypatch.setattr(Scraper, '_test_proxy', lambda self: None)

    scraper = Scraper(
        base_url='http://example.com',
        exclude_patterns=[],
        db_manager=DummyDB(),
        proxy=socks_url,
    )

    for scheme in ('http', 'https'):
        assert scraper.session.proxies.get(scheme) == socks_url


def test_scraper_proxy_failure_detection(monkeypatch):
    """A failing HEAD probe makes the ``Scraper`` constructor raise ``ValueError``."""
    dummy_db = DummyDB()

    def failing_head(self, url, timeout=5):
        raise requests.exceptions.ProxyError("fail")

    # Patch at class level so the session created in the constructor uses it.
    monkeypatch.setattr(requests.Session, 'head', failing_head)

    scraper_kwargs = {
        'base_url': 'http://example.com',
        'exclude_patterns': [],
        'db_manager': dummy_db,
        'proxy': 'http://proxy:8080',
    }
    with pytest.raises(ValueError):
        Scraper(**scraper_kwargs)