obeone · obeone · Jul 9, 2025 · Jul 9, 2025 · Jul 9, 2025 · Jul 9, 2025
diff --git a/README.md b/README.md
@@ -52,7 +52,7 @@ pip install .
 Start scraping with the following command:
 
 ```shell
-crawler-to-md --url <URL> [--output-folder ./output] [--cache-folder ./cache] [--base-url <BASE_URL>] [--exclude <KEYWORD_IN_URL>] [--title <TITLE>] [--urls-file <URLS_FILE>]
+crawler-to-md --url <URL> [--output-folder ./output] [--cache-folder ./cache] [--base-url <BASE_URL>] [--exclude <KEYWORD_IN_URL>] [--title <TITLE>] [--urls-file <URLS_FILE>] [--proxy <PROXY_URL>]
 ```
 
 Options:
@@ -67,6 +67,7 @@ Options:
 - `--export-individual`, `-ei`: Export each page as an individual Markdown file. 📝
 - `--rate-limit`, `-rl`: Maximum number of requests per minute (default: 0, no rate limit). ⏱️
 - `--delay`, `-d`: Delay between requests in seconds (default: 0, no delay). 🕒
+- `--proxy`: Proxy URL used for both HTTP and HTTPS requests. 🌐
 
 One of the `--url` or `--urls-file` options is required.
 

diff --git a/crawler_to_md/cli.py b/crawler_to_md/cli.py
@@ -88,6 +88,11 @@ def main():
         help="Delay between requests in seconds",
         default=0,
     )
+    parser.add_argument(
+        "--proxy",
+        help="Proxy URL for HTTP requests",
+        default=None,
+    )
     parser.add_argument(
         "--no-markdown",
         action="store_true",
@@ -172,6 +177,7 @@ def main():
         db_manager=db_manager,
         rate_limit=args.rate_limit,
         delay=args.delay,
+        proxy=args.proxy,
     )
     logger.info("Scraper initialized.")
 

diff --git a/crawler_to_md/scraper.py b/crawler_to_md/scraper.py
@@ -24,6 +24,7 @@ def __init__(
         db_manager: DatabaseManager,
         rate_limit=0,
         delay=0,
+        proxy=None,
     ):
         """
         Initialize the Scraper object.
@@ -35,13 +36,18 @@ def __init__(
             db_manager (DatabaseManager): The database manager object.
             rate_limit (int): Maximum number of requests per minute.
             delay (float): Delay between requests in seconds.
+            proxy (str, optional): Proxy URL for HTTP requests.
         """
         logger.debug(f"Initializing Scraper with base URL: {base_url}")
         self.base_url = base_url
         self.exclude_patterns = exclude_patterns or []
         self.db_manager = db_manager
         self.rate_limit = rate_limit
         self.delay = delay
+        self.session = requests.Session()
+        if proxy:
+            self.session.proxies.update({"http": proxy, "https": proxy})
+        self.proxy = proxy
 
     def is_valid_link(self, link):
         """
@@ -79,7 +85,7 @@ def fetch_links(self, url, html=None):
         try:
             if not html:
                 # Send a GET request to the URL
-                response = requests.get(url)
+                response = self.session.get(url)
                 if response.status_code != 200:
                     logger.warning(
                         f"Failed to fetch {url} with status code {response.status_code}"
@@ -233,7 +239,7 @@ def start_scraping(self, url=None, urls_list=[]):
                 url = link[0]  # Extract the URL from the link tuple
 
                 # Attempt to fetch the page content
-                response = requests.get(url)
+                response = self.session.get(url)
 
                 # Increment request count for rate limiting
                 request_count += 1

diff --git a/tests/test_cli.py b/tests/test_cli.py
@@ -44,3 +44,39 @@ def test_cli_disable_exports(monkeypatch, tmp_path):
     assert calls["md"] is False
     assert calls["json"] is False
 
+
+def test_cli_proxy_option(monkeypatch, tmp_path):
+    """Check that the ``--proxy`` value reaches ``Scraper.__init__``.
+
+    The scraper constructor, ``start_scraping`` and both export methods are
+    replaced with stubs; the test then inspects the ``proxy`` argument that
+    ``cli.main()`` handed to the constructor.
+    """
+    proxy_url = 'http://proxy:8080'
+    seen = {}
+
+    # Stand-in constructor: records the proxy argument and does nothing else.
+    def record_proxy(
+        self, base_url, exclude_patterns, db_manager, rate_limit=0, delay=0, proxy=None
+    ):
+        seen['proxy'] = proxy
+
+    def noop(*a, **k):
+        return None
+
+    monkeypatch.setattr(Scraper, '__init__', record_proxy)
+    monkeypatch.setattr(Scraper, 'start_scraping', noop)
+    for exporter in ('export_to_markdown', 'export_to_json'):
+        monkeypatch.setattr(ExportManager, exporter, noop)
+
+    cache_dir = tmp_path / 'cache'
+    argv = ['prog', '--url', 'http://example.com']
+    argv += ['--output-folder', str(tmp_path)]
+    argv += ['--cache-folder', str(cache_dir)]
+    argv += ['--proxy', proxy_url]
+    monkeypatch.setattr(sys, 'argv', argv)
+
+    cli.main()
+
+    assert seen.get('proxy') == proxy_url
+
diff --git a/tests/test_scraper.py b/tests/test_scraper.py
@@ -152,7 +152,7 @@ class DummyResp:
         content = b'<html></html>'
         text = '<html></html>'
 
-    monkeypatch.setattr(requests, 'get', lambda url: DummyResp())
+    monkeypatch.setattr(scraper.session, 'get', lambda url: DummyResp())
 
     class DummyTqdm:
         def __init__(self, *a, **k):
@@ -171,3 +171,12 @@ def close(self):
     assert db.get_links_count() == 1
     assert db.get_visited_links_count() == 1
     assert db.pages[0][0] == 'http://example.com/page'
+
+
+def test_scraper_proxy_initialization():
+    """A proxy URL passed to Scraper is set for both schemes on its session."""
+    proxy_url = 'http://proxy:8080'
+    options = {'exclude_patterns': [], 'db_manager': DummyDB(), 'proxy': proxy_url}
+    session = Scraper(base_url='http://example.com', **options).session
+    for scheme in ('http', 'https'):
+        assert session.proxies.get(scheme) == proxy_url