diff --git a/README.md b/README.md index 922322e..c0b5d69 100644 --- a/README.md +++ b/README.md @@ -35,6 +35,7 @@ crawler-to-md --url https://www.example.com - Exports each page as an individual Markdown file if `--export-individual` is used. 📝 - Uses SQLite for efficient data management. 📊 - Configurable via command-line arguments. ⚙️ +- Include or exclude specific HTML elements using CSS-like selectors (#id, .class, tag) during Markdown conversion. 🧩 - Docker support. 🐳 ## 📋 Requirements @@ -52,7 +53,7 @@ pip install . Start scraping with the following command: ```shell -crawler-to-md --url [--output-folder ./output] [--cache-folder ./cache] [--overwrite-cache|-w] [--base-url ] [--exclude ] [--title ] [--urls-file <URLS_FILE>] [-p <PROXY_URL>] +crawler-to-md --url <URL> [--output-folder ./output] [--cache-folder ./cache] [--overwrite-cache|-w] [--base-url <BASE_URL>] [--exclude-url <KEYWORD_IN_URL>] [--title <TITLE>] [--urls-file <URLS_FILE>] [-p <PROXY_URL>] ``` Options: @@ -64,11 +65,13 @@ Options: - `--overwrite-cache`, `-w`: Overwrite existing cache database before scraping. 🧹 - `--base-url`, `-b`: Filter links by base URL (default: URL's base). 🔎 - `--title`, `-t`: Final title of the markdown file. Defaults to the URL. 🏷️ -- `--exclude`, `-e`: Exclude URLs containing this string (repeatable). ❌ +- `--exclude-url`, `-e`: Exclude URLs containing this string (repeatable). ❌ - `--export-individual`, `-ei`: Export each page as an individual Markdown file. 📝 - `--rate-limit`, `-rl`: Maximum number of requests per minute (default: 0, no rate limit). ⏱️ - `--delay`, `-d`: Delay between requests in seconds (default: 0, no delay). 🕒 - `--proxy`, `-p`: Proxy URL for HTTP or SOCKS requests. 🌐 +- `--include`, `-i`: CSS-like selector (#id, .class, tag) to include before Markdown conversion (repeatable). ✅ +- `--exclude`, `-x`: CSS-like selector (#id, .class, tag) to exclude before Markdown conversion (repeatable). 
🚫 One of the `--url` or `--urls-file` options is required. diff --git a/crawler_to_md/cli.py b/crawler_to_md/cli.py index 86cde6c..16d93db 100644 --- a/crawler_to_md/cli.py +++ b/crawler_to_md/cli.py @@ -68,7 +68,7 @@ def main(): help="Final title of the markdown file. Defaults to the URL", ) parser.add_argument( - "--exclude", + "--exclude-url", "-e", action="append", help="Exclude URLs containing this string", @@ -113,6 +113,26 @@ def main(): help="Disable generation of the compiled JSON file", default=False, ) + parser.add_argument( + "--include", + "-i", + action="append", + help=( + "CSS-like selector (#id, .class, tag) to include before Markdown " + "conversion. Repeatable." + ), + default=[], + ) + parser.add_argument( + "--exclude", + "-x", + action="append", + help=( + "CSS-like selector (#id, .class, tag) to exclude before Markdown " + "conversion. Repeatable." + ), + default=[], + ) try: import argcomplete @@ -189,11 +209,13 @@ def main(): try: scraper = Scraper( base_url=args.base_url, - exclude_patterns=args.exclude, + exclude_patterns=args.exclude_url, db_manager=db_manager, rate_limit=args.rate_limit, delay=args.delay, proxy=args.proxy, + include_filters=args.include, + exclude_filters=args.exclude, ) except ValueError as exc: parser.error(str(exc)) diff --git a/crawler_to_md/scraper.py b/crawler_to_md/scraper.py index 112a31b..f35053c 100644 --- a/crawler_to_md/scraper.py +++ b/crawler_to_md/scraper.py @@ -1,3 +1,4 @@ +import copy import json import os import tempfile @@ -25,18 +26,23 @@ def __init__( rate_limit=0, delay=0, proxy=None, + include_filters=None, + exclude_filters=None, ): """ - Initialize the Scraper object. - Log the initialization process. + Initialize the Scraper object and log the initialization process. Args: base_url (str): The base URL to start scraping from. - exclude_patterns (list): List of patterns to exclude from scraping. + exclude_patterns (list): List of URL patterns to exclude from scraping. 
db_manager (DatabaseManager): The database manager object. rate_limit (int): Maximum number of requests per minute. delay (float): Delay between requests in seconds. proxy (str, optional): Proxy URL for HTTP or SOCKS requests. + include_filters (list, optional): CSS-like selectors (#id, .class, tag) + of elements to include before Markdown conversion. + exclude_filters (list, optional): CSS-like selectors (#id, .class, tag) + of elements to exclude before Markdown conversion. Raises: ValueError: If a proxy is provided but unreachable. @@ -52,6 +58,9 @@ def __init__( self.session.proxies.update({"http": proxy, "https": proxy}) self.proxy = proxy + self.include_filters = include_filters or [] + self.exclude_filters = exclude_filters or [] + if proxy: self._test_proxy() @@ -67,6 +76,24 @@ def _test_proxy(self): except requests.RequestException as exc: raise ValueError(f"Proxy unreachable: {exc}") from exc + def _find_elements(self, soup: BeautifulSoup, selector: str): + """ + Locate elements in the soup using a CSS-like selector. + + Args: + soup (BeautifulSoup): Parsed HTML document. + selector (str): Selector in the form of '#id', '.class', or tag name. + + Returns: + list[Tag]: List of matching elements. + """ + if selector.startswith("#"): + element = soup.find(id=selector[1:]) + return [element] if element else [] + if selector.startswith("."): + return soup.find_all(class_=selector[1:]) + return soup.find_all(selector) + def is_valid_link(self, link): """ Check if the given link is valid for scraping. 
@@ -157,16 +184,40 @@ def scrape_page(self, html, url): # Parse the content using BeautifulSoup soup = BeautifulSoup(html, "html.parser") + if self.include_filters: + # Create a new soup to hold the included elements + new_soup = BeautifulSoup("", "html.parser") + # Ensure the new soup has a body tag if it's a full HTML document + if soup.find("body"): + body = new_soup.new_tag("body") + new_soup.append(body) + else: + body = new_soup + + elements = [] + for selector in self.include_filters: + elements.extend(self._find_elements(soup, selector)) + + # Append a copy of each element to the new soup to maintain structure + for el in elements: + body.append(copy.copy(el)) + soup = new_soup + + for selector in self.exclude_filters: + for element in self._find_elements(soup, selector): + element.decompose() + # Extract title from the page title = soup.title.string if soup.title else "" metadata = {"title": title} + filtered_html = str(soup) # Convert the HTML to Markdown with tempfile.NamedTemporaryFile( mode="w+", delete=False, suffix=".html" ) as tmp: - tmp.write(html) + tmp.write(filtered_html) tmp_path = tmp.name markdown = str(MarkItDown().convert(tmp_path)) diff --git a/tests/test_cli.py b/tests/test_cli.py index 22416ff..013c2cd 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -62,7 +62,15 @@ def fake_init( rate_limit=0, delay=0, proxy=None, + include_filters=None, + exclude_filters=None, ): + """ + Fake initializer to capture proxy argument. + + Args: + proxy (str, optional): Proxy URL. + """ captured['proxy'] = proxy monkeypatch.setattr(Scraper, '__init__', fake_init) @@ -98,7 +106,15 @@ def fake_init( rate_limit=0, delay=0, proxy=None, + include_filters=None, + exclude_filters=None, ): + """ + Fake initializer to capture proxy argument. + + Args: + proxy (str, optional): Proxy URL. 
+ """ captured['proxy'] = proxy monkeypatch.setattr(Scraper, '__init__', fake_init) @@ -134,7 +150,15 @@ def fake_init( rate_limit=0, delay=0, proxy=None, + include_filters=None, + exclude_filters=None, ): + """ + Fake initializer to capture proxy argument. + + Args: + proxy (str, optional): Proxy URL. + """ captured['proxy'] = proxy monkeypatch.setattr(Scraper, '__init__', fake_init) @@ -181,6 +205,118 @@ def fake_init(*a, **k): cli.main() +def test_cli_include_exclude_options(monkeypatch, tmp_path): + """ + Ensure CLI passes include and exclude options to the scraper. + + Args: + monkeypatch (pytest.MonkeyPatch): Pytest monkeypatch fixture. + tmp_path (pathlib.Path): Temporary path for tests. + """ + captured = {} + + def fake_init( + self, + base_url, + exclude_patterns, + db_manager, + rate_limit=0, + delay=0, + proxy=None, + include_filters=None, + exclude_filters=None, + ): + """ + Fake initializer to capture include/exclude arguments. + + Args: + include_filters (list, optional): Selectors to include. + exclude_filters (list, optional): Selectors to exclude. + """ + captured['include_filters'] = include_filters + captured['exclude_filters'] = exclude_filters + + monkeypatch.setattr(Scraper, '__init__', fake_init) + monkeypatch.setattr(Scraper, 'start_scraping', lambda *a, **k: None) + monkeypatch.setattr(ExportManager, 'export_to_markdown', lambda *a, **k: None) + monkeypatch.setattr(ExportManager, 'export_to_json', lambda *a, **k: None) + + cache_folder = tmp_path / 'cache' + args = [ + 'prog', + '--url', + 'http://example.com', + '--output-folder', + str(tmp_path), + '--cache-folder', + str(cache_folder), + '--include', + 'p', + '--exclude', + '.remove', + ] + monkeypatch.setattr(sys, 'argv', args) + cli.main() + assert captured.get('include_filters') == ['p'] + assert captured.get('exclude_filters') == ['.remove'] + + +def test_cli_include_exclude_short_options(monkeypatch, tmp_path): + """ + Ensure short CLI options map to include and exclude selectors. 
+ + Args: + monkeypatch (pytest.MonkeyPatch): Pytest monkeypatch fixture. + tmp_path (pathlib.Path): Temporary path for tests. + """ + captured = {} + + def fake_init( + self, + base_url, + exclude_patterns, + db_manager, + rate_limit=0, + delay=0, + proxy=None, + include_filters=None, + exclude_filters=None, + ): + """ + Capture include and exclude selectors from short options. + + Args: + include_filters (list, optional): Selectors to include. + exclude_filters (list, optional): Selectors to exclude. + """ + captured['include_filters'] = include_filters + captured['exclude_filters'] = exclude_filters + + monkeypatch.setattr(Scraper, '__init__', fake_init) + monkeypatch.setattr(Scraper, 'start_scraping', lambda *a, **k: None) + monkeypatch.setattr(ExportManager, 'export_to_markdown', lambda *a, **k: None) + monkeypatch.setattr(ExportManager, 'export_to_json', lambda *a, **k: None) + + cache_folder = tmp_path / 'cache' + args = [ + 'prog', + '--url', + 'http://example.com', + '--output-folder', + str(tmp_path), + '--cache-folder', + str(cache_folder), + '-i', + '#keep', + '-x', + 'span', + ] + monkeypatch.setattr(sys, 'argv', args) + cli.main() + assert captured.get('include_filters') == ['#keep'] + assert captured.get('exclude_filters') == ['span'] + + def test_cli_overwrite_cache(monkeypatch, tmp_path): captured = {} diff --git a/tests/test_scraper.py b/tests/test_scraper.py index ccb8b2d..c2b1799 100644 --- a/tests/test_scraper.py +++ b/tests/test_scraper.py @@ -103,6 +103,47 @@ def test_scrape_page_with_markitdown(mock_tempfile, mock_os_remove): assert metadata.get('title') == 'Test' +@patch('os.remove') +@patch('tempfile.NamedTemporaryFile') +def test_scrape_page_include_exclude(mock_tempfile, mock_os_remove): + """ + Verify include and exclude selectors filter HTML before conversion. + + Args: + mock_tempfile (MagicMock): Mock for NamedTemporaryFile. + mock_os_remove (MagicMock): Mock for os.remove. 
+ """ + mock_file = MagicMock() + mock_file.name = "dummy_path" + mock_tempfile.return_value.__enter__.return_value = mock_file + + db = DummyDB() + scraper = Scraper( + base_url='http://example.com', + exclude_patterns=[], + db_manager=db, + include_filters=['p'], + exclude_filters=['.remove'], + ) + html = ( + '<html><body><p class="keep">Keep</p>' + '<p class="remove">Remove</p><span>Ignore</span></body></html>' + ) + + with patch('crawler_to_md.scraper.MarkItDown') as mock_markdown: + def convert_side_effect(path): + """Return the HTML written to the temporary file.""" + return mock_file.write.call_args[0][0] + + mock_markdown.return_value.convert.side_effect = convert_side_effect + content, metadata = scraper.scrape_page(html, 'http://example.com/test') + + assert 'Keep' in content + assert 'Remove' not in content + assert 'Ignore' not in content + assert metadata.get('title') == '' + + class ListDB(DummyDB): def __init__(self):