obeone · obeone · Aug 6, 2025 · Aug 5, 2025 · Aug 5, 2025
diff --git a/README.md b/README.md
@@ -35,6 +35,7 @@ crawler-to-md --url https://www.example.com
 - Exports each page as an individual Markdown file if `--export-individual` is used. 📝
 - Uses SQLite for efficient data management. 📊
 - Configurable via command-line arguments. ⚙️
+- Include or exclude specific HTML elements using CSS-like selectors (#id, .class, tag) before Markdown conversion. 🧩
 - Docker support. 🐳
 
 ## 📋 Requirements
@@ -52,7 +53,7 @@ pip install .
 Start scraping with the following command:
 
 ```shell
-crawler-to-md --url <URL> [--output-folder ./output] [--cache-folder ./cache] [--overwrite-cache|-w] [--base-url <BASE_URL>] [--exclude <KEYWORD_IN_URL>] [--title <TITLE>] [--urls-file <URLS_FILE>] [-p <PROXY_URL>]
+crawler-to-md --url <URL> [--output-folder ./output] [--cache-folder ./cache] [--overwrite-cache|-w] [--base-url <BASE_URL>] [--exclude-url <KEYWORD_IN_URL>] [--title <TITLE>] [--urls-file <URLS_FILE>] [-p <PROXY_URL>]
 ```
 
 Options:
@@ -64,11 +65,13 @@ Options:
 - `--overwrite-cache`, `-w`: Overwrite existing cache database before scraping. 🧹
 - `--base-url`, `-b`: Filter links by base URL (default: URL's base). 🔎
 - `--title`, `-t`: Final title of the markdown file. Defaults to the URL. 🏷️
-- `--exclude`, `-e`: Exclude URLs containing this string (repeatable). ❌
+- `--exclude-url`, `-e`: Exclude URLs containing this string (repeatable). ❌
 - `--export-individual`, `-ei`: Export each page as an individual Markdown file. 📝
 - `--rate-limit`, `-rl`: Maximum number of requests per minute (default: 0, no rate limit). ⏱️
 - `--delay`, `-d`: Delay between requests in seconds (default: 0, no delay). 🕒
 - `--proxy`, `-p`: Proxy URL for HTTP or SOCKS requests. 🌐
+- `--include`, `-i`: Keep only elements matching this CSS-like selector (#id, .class, or tag name) before Markdown conversion (repeatable). ✅
+- `--exclude`, `-x`: Remove elements matching this CSS-like selector (#id, .class, or tag name) before Markdown conversion; applied after `--include` (repeatable). 🚫
 
 One of the `--url` or `--urls-file` options is required.
 

diff --git a/crawler_to_md/cli.py b/crawler_to_md/cli.py
@@ -68,7 +68,7 @@ def main():
         help="Final title of the markdown file. Defaults to the URL",
     )
     parser.add_argument(
-        "--exclude",
+        "--exclude-url",
         "-e",
         action="append",
         help="Exclude URLs containing this string",
@@ -113,6 +113,26 @@ def main():
         help="Disable generation of the compiled JSON file",
         default=False,
     )
+    parser.add_argument(
+        "--include",
+        "-i",
+        action="append",
+        help=(
+            "CSS-like selector (#id, .class, tag) to include before Markdown "
+            "conversion. Repeatable."
+        ),
+        default=[],
+    )
+    parser.add_argument(
+        "--exclude",
+        "-x",
+        action="append",
+        help=(
+            "CSS-like selector (#id, .class, tag) to exclude before Markdown "
+            "conversion. Repeatable."
+        ),
+        default=[],
+    )
 
     try:
         import argcomplete
@@ -189,11 +209,13 @@ def main():
     try:
         scraper = Scraper(
             base_url=args.base_url,
-            exclude_patterns=args.exclude,
+            exclude_patterns=args.exclude_url,
             db_manager=db_manager,
             rate_limit=args.rate_limit,
             delay=args.delay,
             proxy=args.proxy,
+            include_filters=args.include,
+            exclude_filters=args.exclude,
         )
     except ValueError as exc:
         parser.error(str(exc))

diff --git a/crawler_to_md/scraper.py b/crawler_to_md/scraper.py
@@ -1,3 +1,4 @@
+import copy
 import json
 import os
 import tempfile
@@ -25,18 +26,23 @@ def __init__(
         rate_limit=0,
         delay=0,
         proxy=None,
+        include_filters=None,
+        exclude_filters=None,
     ):
         """
-        Initialize the Scraper object.
-        Log the initialization process.
+        Initialize the Scraper object and log the initialization process.
 
         Args:
             base_url (str): The base URL to start scraping from.
-            exclude_patterns (list): List of patterns to exclude from scraping.
+            exclude_patterns (list): List of URL patterns to exclude from scraping.
             db_manager (DatabaseManager): The database manager object.
             rate_limit (int): Maximum number of requests per minute.
             delay (float): Delay between requests in seconds.
             proxy (str, optional): Proxy URL for HTTP or SOCKS requests.
+            include_filters (list, optional): CSS-like selectors (#id, .class, tag)
+                of elements to include before Markdown conversion.
+            exclude_filters (list, optional): CSS-like selectors (#id, .class, tag)
+                of elements to exclude before Markdown conversion.
 
         Raises:
             ValueError: If a proxy is provided but unreachable.
@@ -52,6 +58,9 @@ def __init__(
             self.session.proxies.update({"http": proxy, "https": proxy})
         self.proxy = proxy
 
+        self.include_filters = include_filters or []
+        self.exclude_filters = exclude_filters or []
+
         if proxy:
             self._test_proxy()
 
@@ -67,6 +76,24 @@ def _test_proxy(self):
         except requests.RequestException as exc:
             raise ValueError(f"Proxy unreachable: {exc}") from exc
 
+    def _find_elements(self, soup: BeautifulSoup, selector: str):
+        """
+        Locate elements in the soup matching one simple CSS-like selector.
+
+        Args:
+            soup (BeautifulSoup): Parsed HTML document.
+            selector (str): Exactly one of '#id', '.class', or a bare tag name.
+
+        Returns:
+            list[Tag]: Matching elements ('#id' yields at most one element).
+        """
+        if selector.startswith("#"):
+            element = soup.find(id=selector[1:])
+            return [element] if element else []
+        if selector.startswith("."):
+            return soup.find_all(class_=selector[1:])
+        return soup.find_all(selector)
+
     def is_valid_link(self, link):
         """
         Check if the given link is valid for scraping.
@@ -157,16 +184,40 @@ def scrape_page(self, html, url):
             # Parse the content using BeautifulSoup
             soup = BeautifulSoup(html, "html.parser")
 
+            if self.include_filters:
+                # Create a new soup to hold the included elements
+                new_soup = BeautifulSoup("", "html.parser")
+                # Ensure the new soup has a body tag if it's a full HTML document
+                if soup.find("body"):
+                    body = new_soup.new_tag("body")
+                    new_soup.append(body)
+                else:
+                    body = new_soup
+
+                elements = []
+                for selector in self.include_filters:
+                    elements.extend(self._find_elements(soup, selector))
+
+                # Append a copy of each element to the new soup to maintain structure
+                for el in elements:
+                    body.append(copy.copy(el))
+                soup = new_soup
+
+            for selector in self.exclude_filters:
+                for element in self._find_elements(soup, selector):
+                    element.decompose()
+
             # Extract title from the page
             title = soup.title.string if soup.title else ""
 
             metadata = {"title": title}
 
+            filtered_html = str(soup)
             # Convert the HTML to Markdown
             with tempfile.NamedTemporaryFile(
                 mode="w+", delete=False, suffix=".html"
             ) as tmp:
-                tmp.write(html)
+                tmp.write(filtered_html)
                 tmp_path = tmp.name
 
             markdown = str(MarkItDown().convert(tmp_path))

diff --git a/tests/test_cli.py b/tests/test_cli.py
@@ -62,7 +62,15 @@ def fake_init(
         rate_limit=0,
         delay=0,
         proxy=None,
+        include_filters=None,
+        exclude_filters=None,
     ):
+        """
+        Fake initializer to capture proxy argument.
+
+        Args:
+            proxy (str, optional): Proxy URL.
+        """
         captured['proxy'] = proxy
 
     monkeypatch.setattr(Scraper, '__init__', fake_init)
@@ -98,7 +106,15 @@ def fake_init(
         rate_limit=0,
         delay=0,
         proxy=None,
+        include_filters=None,
+        exclude_filters=None,
     ):
+        """
+        Fake initializer to capture proxy argument.
+
+        Args:
+            proxy (str, optional): Proxy URL.
+        """
         captured['proxy'] = proxy
 
     monkeypatch.setattr(Scraper, '__init__', fake_init)
@@ -134,7 +150,15 @@ def fake_init(
         rate_limit=0,
         delay=0,
         proxy=None,
+        include_filters=None,
+        exclude_filters=None,
     ):
+        """
+        Fake initializer to capture proxy argument.
+
+        Args:
+            proxy (str, optional): Proxy URL.
+        """
         captured['proxy'] = proxy
 
     monkeypatch.setattr(Scraper, '__init__', fake_init)
@@ -181,6 +205,118 @@ def fake_init(*a, **k):
         cli.main()
 
 
+def test_cli_include_exclude_options(monkeypatch, tmp_path):
+    """
+    Ensure CLI passes include and exclude options to the scraper.
+
+    Args:
+        monkeypatch (pytest.MonkeyPatch): Pytest monkeypatch fixture.
+        tmp_path (pathlib.Path): Temporary path for tests.
+    """
+    captured = {}
+
+    def fake_init(
+        self,
+        base_url,
+        exclude_patterns,
+        db_manager,
+        rate_limit=0,
+        delay=0,
+        proxy=None,
+        include_filters=None,
+        exclude_filters=None,
+    ):
+        """
+        Fake initializer to capture include/exclude arguments.
+
+        Args:
+            include_filters (list, optional): Selectors to include.
+            exclude_filters (list, optional): Selectors to exclude.
+        """
+        captured['include_filters'] = include_filters
+        captured['exclude_filters'] = exclude_filters
+
+    monkeypatch.setattr(Scraper, '__init__', fake_init)
+    monkeypatch.setattr(Scraper, 'start_scraping', lambda *a, **k: None)
+    monkeypatch.setattr(ExportManager, 'export_to_markdown', lambda *a, **k: None)
+    monkeypatch.setattr(ExportManager, 'export_to_json', lambda *a, **k: None)
+
+    cache_folder = tmp_path / 'cache'
+    args = [
+        'prog',
+        '--url',
+        'http://example.com',
+        '--output-folder',
+        str(tmp_path),
+        '--cache-folder',
+        str(cache_folder),
+        '--include',
+        'p',
+        '--exclude',
+        '.remove',
+    ]
+    monkeypatch.setattr(sys, 'argv', args)
+    cli.main()
+    assert captured.get('include_filters') == ['p']
+    assert captured.get('exclude_filters') == ['.remove']
+
+
+def test_cli_include_exclude_short_options(monkeypatch, tmp_path):
+    """
+    Ensure short CLI options map to include and exclude selectors.
+
+    Args:
+        monkeypatch (pytest.MonkeyPatch): Pytest monkeypatch fixture.
+        tmp_path (pathlib.Path): Temporary path for tests.
+    """
+    captured = {}
+
+    def fake_init(
+        self,
+        base_url,
+        exclude_patterns,
+        db_manager,
+        rate_limit=0,
+        delay=0,
+        proxy=None,
+        include_filters=None,
+        exclude_filters=None,
+    ):
+        """
+        Capture include and exclude selectors from short options.
+
+        Args:
+            include_filters (list, optional): Selectors to include.
+            exclude_filters (list, optional): Selectors to exclude.
+        """
+        captured['include_filters'] = include_filters
+        captured['exclude_filters'] = exclude_filters
+
+    monkeypatch.setattr(Scraper, '__init__', fake_init)
+    monkeypatch.setattr(Scraper, 'start_scraping', lambda *a, **k: None)
+    monkeypatch.setattr(ExportManager, 'export_to_markdown', lambda *a, **k: None)
+    monkeypatch.setattr(ExportManager, 'export_to_json', lambda *a, **k: None)
+
+    cache_folder = tmp_path / 'cache'
+    args = [
+        'prog',
+        '--url',
+        'http://example.com',
+        '--output-folder',
+        str(tmp_path),
+        '--cache-folder',
+        str(cache_folder),
+        '-i',
+        '#keep',
+        '-x',
+        'span',
+    ]
+    monkeypatch.setattr(sys, 'argv', args)
+    cli.main()
+    assert captured.get('include_filters') == ['#keep']
+    assert captured.get('exclude_filters') == ['span']
+
+
 def test_cli_overwrite_cache(monkeypatch, tmp_path):
     captured = {}