Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ pip install .
Start scraping with the following command:

```shell
crawler-to-md --url <URL> [--output-folder ./output] [--cache-folder ./cache] [--base-url <BASE_URL>] [--exclude <KEYWORD_IN_URL>] [--title <TITLE>] [--urls-file <URLS_FILE>]
crawler-to-md --url <URL> [--output-folder ./output] [--cache-folder ./cache] [--base-url <BASE_URL>] [--exclude <KEYWORD_IN_URL>] [--title <TITLE>] [--urls-file <URLS_FILE>] [--proxy <PROXY_URL>]
```

Options:
Expand All @@ -67,6 +67,7 @@ Options:
- `--export-individual`, `-ei`: Export each page as an individual Markdown file. 📝
- `--rate-limit`, `-rl`: Maximum number of requests per minute (default: 0, no rate limit). ⏱️
- `--delay`, `-d`: Delay between requests in seconds (default: 0, no delay). 🕒
- `--proxy`: Proxy URL used for both HTTP and HTTPS requests. 🌐

One of the `--url` or `--urls-file` options is required.

Expand Down
6 changes: 6 additions & 0 deletions crawler_to_md/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,11 @@ def main():
help="Delay between requests in seconds",
default=0,
)
parser.add_argument(
"--proxy",
help="Proxy URL for HTTP requests",
default=None,
)
parser.add_argument(
"--no-markdown",
action="store_true",
Expand Down Expand Up @@ -172,6 +177,7 @@ def main():
db_manager=db_manager,
rate_limit=args.rate_limit,
delay=args.delay,
proxy=args.proxy,
)
logger.info("Scraper initialized.")

Expand Down
10 changes: 8 additions & 2 deletions crawler_to_md/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ def __init__(
db_manager: DatabaseManager,
rate_limit=0,
delay=0,
proxy=None,
):
"""
Initialize the Scraper object.
Expand All @@ -35,13 +36,18 @@ def __init__(
db_manager (DatabaseManager): The database manager object.
rate_limit (int): Maximum number of requests per minute.
delay (float): Delay between requests in seconds.
proxy (str, optional): Proxy URL for HTTP requests.
"""
logger.debug(f"Initializing Scraper with base URL: {base_url}")
self.base_url = base_url
self.exclude_patterns = exclude_patterns or []
self.db_manager = db_manager
self.rate_limit = rate_limit
self.delay = delay
self.session = requests.Session()
if proxy:
self.session.proxies.update({"http": proxy, "https": proxy})
self.proxy = proxy

def is_valid_link(self, link):
"""
Expand Down Expand Up @@ -79,7 +85,7 @@ def fetch_links(self, url, html=None):
try:
if not html:
# Send a GET request to the URL
response = requests.get(url)
response = self.session.get(url)
if response.status_code != 200:
logger.warning(
f"Failed to fetch {url} with status code {response.status_code}"
Expand Down Expand Up @@ -233,7 +239,7 @@ def start_scraping(self, url=None, urls_list=[]):
url = link[0] # Extract the URL from the link tuple

# Attempt to fetch the page content
response = requests.get(url)
response = self.session.get(url)

# Increment request count for rate limiting
request_count += 1
Expand Down
36 changes: 36 additions & 0 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,3 +44,39 @@ def test_cli_disable_exports(monkeypatch, tmp_path):
assert calls["md"] is False
assert calls["json"] is False


def test_cli_proxy_option(monkeypatch, tmp_path):
    """Check that the value given to ``--proxy`` reaches ``Scraper.__init__``.

    ``Scraper.__init__`` is swapped for a stub that records the ``proxy``
    argument, and the scraping/export entry points are replaced with no-ops,
    so ``cli.main()`` runs without doing any crawling or exporting.
    """
    proxy_url = 'http://proxy:8080'
    captured = {}

    def fake_init(
        self,
        base_url,
        exclude_patterns,
        db_manager,
        rate_limit=0,
        delay=0,
        proxy=None,
    ):
        # Only the argument under test is recorded; no scraper state is set.
        captured['proxy'] = proxy

    def do_nothing(*a, **k):
        return None

    # Neutralise everything the CLI would otherwise run for real.
    monkeypatch.setattr(Scraper, '__init__', fake_init)
    monkeypatch.setattr(Scraper, 'start_scraping', do_nothing)
    monkeypatch.setattr(ExportManager, 'export_to_markdown', do_nothing)
    monkeypatch.setattr(ExportManager, 'export_to_json', do_nothing)

    # Simulated command line: program name followed by option/value pairs.
    command_line = ['prog']
    command_line += ['--url', 'http://example.com']
    command_line += ['--output-folder', str(tmp_path)]
    command_line += ['--cache-folder', str(tmp_path / 'cache')]
    command_line += ['--proxy', proxy_url]
    monkeypatch.setattr(sys, 'argv', command_line)

    cli.main()

    assert captured.get('proxy') == proxy_url

11 changes: 10 additions & 1 deletion tests/test_scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ class DummyResp:
content = b'<html></html>'
text = '<html></html>'

monkeypatch.setattr(requests, 'get', lambda url: DummyResp())
monkeypatch.setattr(scraper.session, 'get', lambda url: DummyResp())

class DummyTqdm:
def __init__(self, *a, **k):
Expand All @@ -171,3 +171,12 @@ def close(self):
assert db.get_links_count() == 1
assert db.get_visited_links_count() == 1
assert db.pages[0][0] == 'http://example.com/page'


def test_scraper_proxy_initialization():
    """Check that a proxy passed to ``Scraper`` is set for http and https."""
    proxy_url = 'http://proxy:8080'
    scraper = Scraper(
        base_url='http://example.com',
        exclude_patterns=[],
        db_manager=DummyDB(),
        proxy=proxy_url,
    )

    session_proxies = scraper.session.proxies
    # Both schemes are expected to route through the same proxy URL.
    for scheme in ('http', 'https'):
        assert session_proxies.get(scheme) == proxy_url