obeone · obeone · Jul 9, 2025 · Jul 9, 2025 · Jul 9, 2025 · Jul 9, 2025
diff --git a/README.md b/README.md
@@ -52,7 +52,7 @@ pip install .
 Start scraping with the following command:
 
 ```shell
-crawler-to-md --url <URL> [--output-folder ./output] [--cache-folder ./cache] [--base-url <BASE_URL>] [--exclude <KEYWORD_IN_URL>] [--title <TITLE>] [--urls-file <URLS_FILE>]
+crawler-to-md --url <URL> [--output-folder ./output] [--cache-folder ./cache] [--base-url <BASE_URL>] [--exclude <KEYWORD_IN_URL>] [--title <TITLE>] [--urls-file <URLS_FILE>] [--proxy <PROXY_URL>]
 ```
 
 Options:
@@ -67,6 +67,7 @@ Options:
 - `--export-individual`, `-ei`: Export each page as an individual Markdown file. 📝
 - `--rate-limit`, `-rl`: Maximum number of requests per minute (default: 0, no rate limit). ⏱️
 - `--delay`, `-d`: Delay between requests in seconds (default: 0, no delay). 🕒
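+- `--proxy`, `-p`: Proxy URL used for all HTTP and HTTPS requests; HTTP and SOCKS proxies are supported (e.g. `http://proxy:8080` or `socks5://localhost:9050`). The proxy is tested at startup, and the command exits with an error if the test request fails. 🌐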
 
 One of the `--url` or `--urls-file` options is required.
 

diff --git a/crawler_to_md/cli.py b/crawler_to_md/cli.py
@@ -88,6 +88,12 @@ def main():
         help="Delay between requests in seconds",
         default=0,
     )
+    parser.add_argument(
+        "--proxy",
+        "-p",
+        help="Proxy URL for HTTP or SOCKS requests",
+        default=None,
+    )
     parser.add_argument(
         "--no-markdown",
         action="store_true",
@@ -166,13 +172,17 @@ def main():
     db_manager = DatabaseManager(db_path)
     logger.info("DatabaseManager initialized.")
 
-    scraper = Scraper(
-        base_url=args.base_url,
-        exclude_patterns=args.exclude,
-        db_manager=db_manager,
-        rate_limit=args.rate_limit,
-        delay=args.delay,
-    )
+    try:
+        scraper = Scraper(
+            base_url=args.base_url,
+            exclude_patterns=args.exclude,
+            db_manager=db_manager,
+            rate_limit=args.rate_limit,
+            delay=args.delay,
+            proxy=args.proxy,
+        )
+    except ValueError as exc:
+        parser.error(str(exc))
     logger.info("Scraper initialized.")
 
     # Start the scraping process

diff --git a/crawler_to_md/scraper.py b/crawler_to_md/scraper.py
@@ -24,6 +24,7 @@ def __init__(
         db_manager: DatabaseManager,
         rate_limit=0,
         delay=0,
+        proxy=None,
     ):
         """
         Initialize the Scraper object.
@@ -35,13 +36,36 @@ def __init__(
             db_manager (DatabaseManager): The database manager object.
             rate_limit (int): Maximum number of requests per minute.
             delay (float): Delay between requests in seconds.
+            proxy (str, optional): Proxy URL for HTTP or SOCKS requests.
+
+        Raises:
+            ValueError: If a proxy is provided but unreachable.
         """
         logger.debug(f"Initializing Scraper with base URL: {base_url}")
         self.base_url = base_url
         self.exclude_patterns = exclude_patterns or []
         self.db_manager = db_manager
         self.rate_limit = rate_limit
         self.delay = delay
+        self.session = requests.Session()
+        if proxy:
+            self.session.proxies.update({"http": proxy, "https": proxy})
+        self.proxy = proxy
+
+        if proxy:
+            self._test_proxy()
+
+    def _test_proxy(self):
+        """
+        Ensure the configured proxy is reachable.
+
+        Raises:
+            ValueError: If the proxy cannot fetch the base URL.
+        """
+        try:
+            self.session.head(self.base_url, timeout=5)
+        except requests.RequestException as exc:
+            raise ValueError(f"Proxy unreachable: {exc}") from exc
 
     def is_valid_link(self, link):
         """
@@ -79,7 +103,7 @@ def fetch_links(self, url, html=None):
         try:
             if not html:
                 # Send a GET request to the URL
-                response = requests.get(url)
+                response = self.session.get(url)
                 if response.status_code != 200:
                     logger.warning(
                         f"Failed to fetch {url} with status code {response.status_code}"
@@ -149,7 +173,13 @@ def scrape_page(self, html, url):
 
             os.remove(tmp_path)
 
-            logger.debug(f"Successfully scraped content and metadata from {url}")
+            if not markdown.strip():
+                logger.warning("No content scraped from %s", url)
+                return None, None
+
+            logger.debug(
+                "Successfully scraped content and metadata from %s", url
+            )
             return markdown, metadata
 
         except Exception as e:
@@ -233,7 +263,7 @@ def start_scraping(self, url=None, urls_list=[]):
                 url = link[0]  # Extract the URL from the link tuple
 
                 # Attempt to fetch the page content
-                response = requests.get(url)
+                response = self.session.get(url)
 
                 # Increment request count for rate limiting
                 request_count += 1

diff --git a/pyproject.toml b/pyproject.toml
@@ -27,7 +27,7 @@ dependencies = [
     "mdformat_footnote==0.1.1",
     "mdformat_frontmatter==2.0.8",
     "mdformat_tables==1.0.0",
-    "requests==2.32.4",
+    "requests[socks]==2.32.4",
     "tqdm==4.67.1",
     "markitdown==0.1.2",
     "coloredlogs==15.0.1",

diff --git a/tests/test_cli.py b/tests/test_cli.py
@@ -1,4 +1,7 @@
 import sys
+
+import pytest
+
 from crawler_to_md import cli
 from crawler_to_md.export_manager import ExportManager
 from crawler_to_md.scraper import Scraper
@@ -44,3 +47,133 @@ def test_cli_disable_exports(monkeypatch, tmp_path):
     assert calls["md"] is False
     assert calls["json"] is False
 
+
+def test_cli_proxy_option(monkeypatch, tmp_path):
+    captured = {}
+
+    def fake_init(
+        self,
+        base_url,
+        exclude_patterns,
+        db_manager,
+        rate_limit=0,
+        delay=0,
+        proxy=None,
+    ):
+        captured['proxy'] = proxy
+
+    monkeypatch.setattr(Scraper, '__init__', fake_init)
+    monkeypatch.setattr(Scraper, 'start_scraping', lambda *a, **k: None)
+    monkeypatch.setattr(ExportManager, 'export_to_markdown', lambda *a, **k: None)
+    monkeypatch.setattr(ExportManager, 'export_to_json', lambda *a, **k: None)
+
+    cache_folder = tmp_path / 'cache'
+    args = [
+        'prog',
+        '--url',
+        'http://example.com',
+        '--output-folder',
+        str(tmp_path),
+        '--cache-folder',
+        str(cache_folder),
+        '--proxy',
+        'http://proxy:8080',
+    ]
+    monkeypatch.setattr(sys, 'argv', args)
+    cli.main()
+    assert captured.get('proxy') == 'http://proxy:8080'
+
+
+def test_cli_proxy_short_option(monkeypatch, tmp_path):
+    captured = {}
+
+    def fake_init(
+        self,
+        base_url,
+        exclude_patterns,
+        db_manager,
+        rate_limit=0,
+        delay=0,
+        proxy=None,
+    ):
+        captured['proxy'] = proxy
+
+    monkeypatch.setattr(Scraper, '__init__', fake_init)
+    monkeypatch.setattr(Scraper, 'start_scraping', lambda *a, **k: None)
+    monkeypatch.setattr(ExportManager, 'export_to_markdown', lambda *a, **k: None)
+    monkeypatch.setattr(ExportManager, 'export_to_json', lambda *a, **k: None)
+
+    cache_folder = tmp_path / 'cache'
+    args = [
+        'prog',
+        '--url',
+        'http://example.com',
+        '--output-folder',
+        str(tmp_path),
+        '--cache-folder',
+        str(cache_folder),
+        '-p',
+        'http://proxy:8080',
+    ]
+    monkeypatch.setattr(sys, 'argv', args)
+    cli.main()
+    assert captured.get('proxy') == 'http://proxy:8080'
+
+
+def test_cli_socks_proxy(monkeypatch, tmp_path):
+    captured = {}
+
+    def fake_init(
+        self,
+        base_url,
+        exclude_patterns,
+        db_manager,
+        rate_limit=0,
+        delay=0,
+        proxy=None,
+    ):
+        captured['proxy'] = proxy
+
+    monkeypatch.setattr(Scraper, '__init__', fake_init)
+    monkeypatch.setattr(Scraper, 'start_scraping', lambda *a, **k: None)
+    monkeypatch.setattr(ExportManager, 'export_to_markdown', lambda *a, **k: None)
+    monkeypatch.setattr(ExportManager, 'export_to_json', lambda *a, **k: None)
+
+    cache_folder = tmp_path / 'cache'
+    args = [
+        'prog',
+        '--url',
+        'http://example.com',
+        '--output-folder',
+        str(tmp_path),
+        '--cache-folder',
+        str(cache_folder),
+        '--proxy',
+        'socks5://localhost:9050',
+    ]
+    monkeypatch.setattr(sys, 'argv', args)
+    cli.main()
+    assert captured.get('proxy') == 'socks5://localhost:9050'
+
+
+def test_cli_proxy_error(monkeypatch, tmp_path):
+    def fake_init(*a, **k):
+        raise ValueError('Proxy unreachable')
+
+    monkeypatch.setattr(Scraper, '__init__', fake_init)
+    cache_folder = tmp_path / 'cache'
+    args = [
+        'prog',
+        '--url',
+        'http://example.com',
+        '--output-folder',
+        str(tmp_path),
+        '--cache-folder',
+        str(cache_folder),
+        '--proxy',
+        'http://proxy:8080',
+    ]
+    monkeypatch.setattr(sys, 'argv', args)
+    with pytest.raises(SystemExit):
+        cli.main()
+
diff --git a/tests/test_scraper.py b/tests/test_scraper.py
@@ -1,5 +1,6 @@
 from unittest.mock import MagicMock, patch
 
+import pytest
 import requests
 import tqdm
 
@@ -152,7 +153,7 @@ class DummyResp:
         content = b'<html></html>'
         text = '<html></html>'
 
-    monkeypatch.setattr(requests, 'get', lambda url: DummyResp())
+    monkeypatch.setattr(scraper.session, 'get', lambda url: DummyResp())
 
     class DummyTqdm:
         def __init__(self, *a, **k):
@@ -171,3 +172,49 @@ def close(self):
     assert db.get_links_count() == 1
     assert db.get_visited_links_count() == 1
     assert db.pages[0][0] == 'http://example.com/page'
+
+
+def test_scraper_proxy_initialization(monkeypatch):
+    db = DummyDB()
+    monkeypatch.setattr(Scraper, '_test_proxy', lambda self: None)
+    scraper = Scraper(
+        base_url='http://example.com', exclude_patterns=[], db_manager=db, proxy='http://proxy:8080'
+    )
+    assert scraper.session.proxies.get('http') == 'http://proxy:8080'
+    assert scraper.session.proxies.get('https') == 'http://proxy:8080'
+
+
+def test_scraper_socks_proxy_initialization(monkeypatch):
+    db = DummyDB()
+    proxy = 'socks5://localhost:9050'
+    monkeypatch.setattr(Scraper, '_test_proxy', lambda self: None)
+    scraper = Scraper(
+        base_url='http://example.com', exclude_patterns=[], db_manager=db, proxy=proxy
+    )
+    assert scraper.session.proxies.get('http') == proxy
+    assert scraper.session.proxies.get('https') == proxy
+
+
+def test_scraper_proxy_failure_detection(monkeypatch):
+    db = DummyDB()
+    def fake_head(self, url, timeout=5):
+        raise requests.exceptions.ProxyError("fail")
+
+    monkeypatch.setattr(requests.Session, 'head', fake_head)
+    with pytest.raises(ValueError):
+        Scraper(
+            base_url='http://example.com', exclude_patterns=[], db_manager=db, proxy='http://proxy:8080'
+        )
+
+
+def test_scrape_page_returns_none_for_empty_content(monkeypatch):
+    db = DummyDB()
+    scraper = Scraper(base_url='http://example.com', exclude_patterns=[], db_manager=db)
+    html = '<html><body></body></html>'
+
+    with patch('crawler_to_md.scraper.MarkItDown') as mock_markdown:
+        mock_markdown.return_value.convert.return_value = ''
+        content, metadata = scraper.scrape_page(html, 'http://example.com/empty')
+
+    assert content is None
+    assert metadata is None