Skip to content

Commit d0bc49e

Browse files
committed
feat(proxy): validate proxy and add short flag
1 parent 9483b05 commit d0bc49e

5 files changed

Lines changed: 110 additions & 12 deletions

File tree

README.md

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -52,7 +52,7 @@ pip install .
5252
Start scraping with the following command:
5353

5454
```shell
55-
crawler-to-md --url <URL> [--output-folder ./output] [--cache-folder ./cache] [--base-url <BASE_URL>] [--exclude <KEYWORD_IN_URL>] [--title <TITLE>] [--urls-file <URLS_FILE>] [--proxy <PROXY_URL>]
55+
crawler-to-md --url <URL> [--output-folder ./output] [--cache-folder ./cache] [--base-url <BASE_URL>] [--exclude <KEYWORD_IN_URL>] [--title <TITLE>] [--urls-file <URLS_FILE>] [-p <PROXY_URL>]
5656
```
5757

5858
Options:
@@ -67,7 +67,7 @@ Options:
6767
- `--export-individual`, `-ei`: Export each page as an individual Markdown file. 📝
6868
- `--rate-limit`, `-rl`: Maximum number of requests per minute (default: 0, no rate limit). ⏱️
6969
- `--delay`, `-d`: Delay between requests in seconds (default: 0, no delay). 🕒
70-
- `--proxy`: Proxy URL for HTTP or SOCKS requests. 🌐
70+
- `--proxy`, `-p`: Proxy URL for HTTP or SOCKS requests. 🌐
7171

7272
One of the `--url` or `--urls-file` options is required.
7373

crawler_to_md/cli.py

Lines changed: 12 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -90,6 +90,7 @@ def main():
9090
)
9191
parser.add_argument(
9292
"--proxy",
93+
"-p",
9394
help="Proxy URL for HTTP or SOCKS requests",
9495
default=None,
9596
)
@@ -171,14 +172,17 @@ def main():
171172
db_manager = DatabaseManager(db_path)
172173
logger.info("DatabaseManager initialized.")
173174

174-
scraper = Scraper(
175-
base_url=args.base_url,
176-
exclude_patterns=args.exclude,
177-
db_manager=db_manager,
178-
rate_limit=args.rate_limit,
179-
delay=args.delay,
180-
proxy=args.proxy,
181-
)
175+
try:
176+
scraper = Scraper(
177+
base_url=args.base_url,
178+
exclude_patterns=args.exclude,
179+
db_manager=db_manager,
180+
rate_limit=args.rate_limit,
181+
delay=args.delay,
182+
proxy=args.proxy,
183+
)
184+
except ValueError as exc:
185+
parser.error(str(exc))
182186
logger.info("Scraper initialized.")
183187

184188
# Start the scraping process

crawler_to_md/scraper.py

Lines changed: 18 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -37,6 +37,9 @@ def __init__(
3737
rate_limit (int): Maximum number of requests per minute.
3838
delay (float): Delay between requests in seconds.
3939
proxy (str, optional): Proxy URL for HTTP or SOCKS requests.
40+
41+
Raises:
42+
ValueError: If a proxy is provided but unreachable.
4043
"""
4144
logger.debug(f"Initializing Scraper with base URL: {base_url}")
4245
self.base_url = base_url
@@ -49,6 +52,21 @@ def __init__(
4952
self.session.proxies.update({"http": proxy, "https": proxy})
5053
self.proxy = proxy
5154

55+
if proxy:
56+
self._test_proxy()
57+
58+
def _test_proxy(self):
59+
"""
60+
Ensure the configured proxy is reachable.
61+
62+
Raises:
63+
ValueError: If the proxy cannot fetch the base URL.
64+
"""
65+
try:
66+
self.session.head(self.base_url, timeout=5)
67+
except requests.RequestException as exc:
68+
raise ValueError(f"Proxy unreachable: {exc}") from exc
69+
5270
def is_valid_link(self, link):
5371
"""
5472
Check if the given link is valid for scraping.

tests/test_cli.py

Lines changed: 60 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,7 @@
11
import sys
22

3+
import pytest
4+
35
from crawler_to_md import cli
46
from crawler_to_md.export_manager import ExportManager
57
from crawler_to_md.scraper import Scraper
@@ -82,6 +84,42 @@ def fake_init(
8284
assert captured.get('proxy') == 'http://proxy:8080'
8385

8486

87+
def test_cli_proxy_short_option(monkeypatch, tmp_path):
88+
captured = {}
89+
90+
def fake_init(
91+
self,
92+
base_url,
93+
exclude_patterns,
94+
db_manager,
95+
rate_limit=0,
96+
delay=0,
97+
proxy=None,
98+
):
99+
captured['proxy'] = proxy
100+
101+
monkeypatch.setattr(Scraper, '__init__', fake_init)
102+
monkeypatch.setattr(Scraper, 'start_scraping', lambda *a, **k: None)
103+
monkeypatch.setattr(ExportManager, 'export_to_markdown', lambda *a, **k: None)
104+
monkeypatch.setattr(ExportManager, 'export_to_json', lambda *a, **k: None)
105+
106+
cache_folder = tmp_path / 'cache'
107+
args = [
108+
'prog',
109+
'--url',
110+
'http://example.com',
111+
'--output-folder',
112+
str(tmp_path),
113+
'--cache-folder',
114+
str(cache_folder),
115+
'-p',
116+
'http://proxy:8080',
117+
]
118+
monkeypatch.setattr(sys, 'argv', args)
119+
cli.main()
120+
assert captured.get('proxy') == 'http://proxy:8080'
121+
122+
85123
def test_cli_socks_proxy(monkeypatch, tmp_path):
86124
captured = {}
87125

@@ -117,3 +155,25 @@ def fake_init(
117155
cli.main()
118156
assert captured.get('proxy') == 'socks5://localhost:9050'
119157

158+
159+
def test_cli_proxy_error(monkeypatch, tmp_path):
160+
def fake_init(*a, **k):
161+
raise ValueError('Proxy unreachable')
162+
163+
monkeypatch.setattr(Scraper, '__init__', fake_init)
164+
cache_folder = tmp_path / 'cache'
165+
args = [
166+
'prog',
167+
'--url',
168+
'http://example.com',
169+
'--output-folder',
170+
str(tmp_path),
171+
'--cache-folder',
172+
str(cache_folder),
173+
'--proxy',
174+
'http://proxy:8080',
175+
]
176+
monkeypatch.setattr(sys, 'argv', args)
177+
with pytest.raises(SystemExit):
178+
cli.main()
179+

tests/test_scraper.py

Lines changed: 18 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,7 @@
11
from unittest.mock import MagicMock, patch
22

3+
import pytest
4+
import requests
35
import tqdm
46

57
from crawler_to_md.database_manager import DatabaseManager
@@ -172,20 +174,34 @@ def close(self):
172174
assert db.pages[0][0] == 'http://example.com/page'
173175

174176

175-
def test_scraper_proxy_initialization():
177+
def test_scraper_proxy_initialization(monkeypatch):
176178
db = DummyDB()
179+
monkeypatch.setattr(Scraper, '_test_proxy', lambda self: None)
177180
scraper = Scraper(
178181
base_url='http://example.com', exclude_patterns=[], db_manager=db, proxy='http://proxy:8080'
179182
)
180183
assert scraper.session.proxies.get('http') == 'http://proxy:8080'
181184
assert scraper.session.proxies.get('https') == 'http://proxy:8080'
182185

183186

184-
def test_scraper_socks_proxy_initialization():
187+
def test_scraper_socks_proxy_initialization(monkeypatch):
185188
db = DummyDB()
186189
proxy = 'socks5://localhost:9050'
190+
monkeypatch.setattr(Scraper, '_test_proxy', lambda self: None)
187191
scraper = Scraper(
188192
base_url='http://example.com', exclude_patterns=[], db_manager=db, proxy=proxy
189193
)
190194
assert scraper.session.proxies.get('http') == proxy
191195
assert scraper.session.proxies.get('https') == proxy
196+
197+
198+
def test_scraper_proxy_failure_detection(monkeypatch):
199+
db = DummyDB()
200+
def fake_head(self, url, timeout=5):
201+
raise requests.exceptions.ProxyError("fail")
202+
203+
monkeypatch.setattr(requests.Session, 'head', fake_head)
204+
with pytest.raises(ValueError):
205+
Scraper(
206+
base_url='http://example.com', exclude_patterns=[], db_manager=db, proxy='http://proxy:8080'
207+
)

0 commit comments

Comments
 (0)