obeone · coderabbitai · Jul 9, 2025
diff --git a/crawler_to_md/cli.py b/crawler_to_md/cli.py
@@ -19,13 +19,9 @@
 
 def main():
     """
-    Main function to start the web scraper application.
-
-    This function parses command line arguments, initializes necessary components,
-    and manages the scraping and exporting process.
-
-    Raises:
-        ValueError: If neither a URL nor a URLs file is provided.
+    Parses command-line arguments and orchestrates the web scraping, data storage, and export process.
+
+    This function serves as the main entry point for the web scraper application. It handles argument parsing, initializes required components, manages the scraping workflow, and exports the results to Markdown and JSON formats as specified by the user. The function ensures necessary directories exist, validates input, and provides user feedback on output locations.
     """
     logger.info("Starting the web scraper application.")
 

diff --git a/crawler_to_md/scraper.py b/crawler_to_md/scraper.py
@@ -27,16 +27,15 @@ def __init__(
         proxy=None,
     ):
         """
-        Initialize the Scraper object.
-        Log the initialization process.
-
-        Args:
-            base_url (str): The base URL to start scraping from.
-            exclude_patterns (list): List of patterns to exclude from scraping.
-            db_manager (DatabaseManager): The database manager object.
-            rate_limit (int): Maximum number of requests per minute.
-            delay (float): Delay between requests in seconds.
-            proxy (str, optional): Proxy URL for HTTP requests.
+        Initializes a Scraper instance with base URL, exclusion patterns, database manager, and optional rate limiting, delay, and proxy settings.
+
+        Parameters:
+            base_url (str): The root URL from which scraping begins.
+            exclude_patterns (list): URL patterns to exclude from scraping.
+            db_manager (DatabaseManager): The database manager object.
+            rate_limit (int): Maximum number of requests allowed per minute.
+            delay (float): Time in seconds to wait between requests.
+            proxy (str, optional): Proxy URL to route HTTP requests through.
         """
         logger.debug(f"Initializing Scraper with base URL: {base_url}")
         self.base_url = base_url
@@ -51,14 +49,15 @@ def __init__(
 
     def is_valid_link(self, link):
         """
-        Check if the given link is valid for scraping.
-        Log the result of the validation.
-
-        Args:
-            link (str): The link to be checked.
-
+        Determine whether a given URL is eligible for scraping based on the base URL and exclusion patterns.
+
+        A link is considered valid if it starts with the configured base URL and does not contain any of the specified exclusion patterns.
+
+        Parameters:
+            link (str): The URL to validate.
+
         Returns:
-            bool: True if the link is valid, False otherwise.
+            bool: True if the link is valid for scraping; False otherwise.
         """
         valid = True
         if self.base_url and not link.startswith(self.base_url):
@@ -71,15 +70,16 @@ def is_valid_link(self, link):
 
     def fetch_links(self, url, html=None):
         """
-        Fetch all valid links from the given URL.
-        Log the fetching process and outcome.
-
-        Args:
-            url (str): The URL to fetch links from.
-            html (str, optional): The HTML content of the page.
-
+        Retrieve all valid links from a specified URL or provided HTML content.
+
+        If HTML is not provided, the method fetches the page content using an HTTP GET request. It extracts and resolves all anchor tag links, removes URL fragments, and filters them using the link validation logic. Returns a set of valid links found on the page, or an empty collection if the request fails.
+
+        Parameters:
+            url (str): The URL to extract links from.
+            html (str, optional): HTML content to parse instead of fetching from the URL.
+
         Returns:
-            set: Set of valid links found on the page.
+            set: A set of valid, filtered links found on the page, or an empty set if none are found or on error.
         """
         logger.debug(f"Fetching links from {url}")
         try:
@@ -164,14 +164,10 @@ def scrape_page(self, html, url):
 
     def start_scraping(self, url=None, urls_list=[]):
         """
-        Initiates the scraping process for a single URL or a list of URLs.
-        It validates URLs, logs the scraping process, and manages the
-        progress of scraping through the database.
-
-        Args:
-            url (str, optional): A single URL to start scraping from. Defaults to None.
-            urls_list (list, optional): A list of URLs to scrape.
-        """
+        Starts the web scraping process from a given URL or list of URLs, managing progress, rate limiting, and database integration.
+
+        If a list of URLs is provided, only valid URLs are inserted into the database; otherwise, a single URL is used as the starting point. The method iteratively fetches unvisited links from the database, retrieves and processes each page, stores scraped content and metadata, and discovers new links to continue scraping (unless a predefined list is used). Progress is tracked with a progress bar, and rate limiting and delays are enforced as configured. The process continues until all discovered links have been visited.
+        """
         # Validate and insert the provided URLs into the database
         if urls_list:
             # Iterate through the list to check for valid URLs

diff --git a/tests/test_cli.py b/tests/test_cli.py
@@ -40,12 +40,20 @@ def test_cli_default_exports(monkeypatch, tmp_path):
 
 
 def test_cli_disable_exports(monkeypatch, tmp_path):
+    """
+    Test that disabling both markdown and JSON exports via CLI flags prevents export methods from being called.
+    """
     calls = _run_cli(monkeypatch, tmp_path, ["--no-markdown", "--no-json"])
     assert calls["md"] is False
     assert calls["json"] is False
 
 
 def test_cli_proxy_option(monkeypatch, tmp_path):
+    """
+    Test that the CLI correctly passes the --proxy option to the Scraper constructor.
+
+    Verifies that when the CLI is invoked with a proxy URL, the Scraper instance receives the correct proxy argument.
+    """
     captured = {}
 
     def fake_init(
@@ -57,6 +65,17 @@ def fake_init(
         delay=0,
         proxy=None,
     ):
+        """
+        A replacement initializer for the Scraper class that captures the value of the 'proxy' argument for testing purposes.
+
+        Parameters:
+            base_url: The base URL for scraping.
+            exclude_patterns: Patterns to exclude from scraping.
+            db_manager: Database manager instance.
+            rate_limit: Optional rate limit for requests.
+            delay: Optional delay between requests.
+            proxy: Proxy URL to be captured for verification in tests.
+        """
         captured['proxy'] = proxy
 
     monkeypatch.setattr(Scraper, '__init__', fake_init)

diff --git a/tests/test_scraper.py b/tests/test_scraper.py
@@ -138,6 +138,11 @@ def get_all_pages(self):
 
 
 def test_start_scraping_process(monkeypatch):
+    """
+    Test the complete scraping process, verifying link tracking, page storage, and integration with mocked dependencies.
+
+    This test ensures that the `Scraper.start_scraping()` method correctly inserts and marks links as visited, stores scraped page content, and interacts properly with mocked HTTP requests and progress tracking.
+    """
     db = ListDB()
     scraper = Scraper(base_url='http://example.com', exclude_patterns=[], db_manager=db)
 
@@ -156,6 +161,12 @@ class DummyResp:
 
     class DummyTqdm:
         def __init__(self, *a, **k):
+            """
+            Initialize the dummy progress bar with an optional total count.
+
+            Parameters:
+                total (int, optional): The total number of items to track. Defaults to 0.
+            """
             self.total = k.get('total', 0)
         def update(self, n):
             pass
@@ -174,6 +185,9 @@ def close(self):
 
 
 def test_scraper_proxy_initialization():
+    """
+    Test that the Scraper initializes its session with the correct HTTP and HTTPS proxy settings when a proxy URL is provided.
+    """
     db = DummyDB()
     scraper = Scraper(
         base_url='http://example.com', exclude_patterns=[], db_manager=db, proxy='http://proxy:8080'