Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 3 additions & 7 deletions crawler_to_md/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,9 @@

def main():
"""
Main function to start the web scraper application.

This function parses command line arguments, initializes necessary components,
and manages the scraping and exporting process.

Raises:
ValueError: If neither a URL nor a URLs file is provided.
Parses command-line arguments and orchestrates the web scraping, data storage, and export process.

This function serves as the main entry point for the web scraper application. It handles argument parsing, initializes required components, manages the scraping workflow, and exports the results to Markdown and JSON formats as specified by the user. The function ensures necessary directories exist, validates input, and provides user feedback on output locations.
"""
logger.info("Starting the web scraper application.")

Expand Down
61 changes: 28 additions & 33 deletions crawler_to_md/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,16 +27,14 @@ def __init__(
proxy=None,
):
"""
Initialize the Scraper object.
Log the initialization process.

Args:
base_url (str): The base URL to start scraping from.
exclude_patterns (list): List of patterns to exclude from scraping.
db_manager (DatabaseManager): The database manager object.
rate_limit (int): Maximum number of requests per minute.
delay (float): Delay between requests in seconds.
proxy (str, optional): Proxy URL for HTTP requests.
Initializes a Scraper instance with base URL, exclusion patterns, database manager, and optional rate limiting, delay, and proxy settings.

Parameters:
base_url (str): The root URL from which scraping begins.
exclude_patterns (list): URL patterns to exclude from scraping.
db_manager (DatabaseManager): The database manager object.
rate_limit (int): Maximum number of requests allowed per minute.
delay (float): Time in seconds to wait between requests.
proxy (str, optional): Proxy URL to route HTTP requests through.
"""
logger.debug(f"Initializing Scraper with base URL: {base_url}")
self.base_url = base_url
Expand All @@ -51,14 +49,15 @@ def __init__(

def is_valid_link(self, link):
"""
Check if the given link is valid for scraping.
Log the result of the validation.

Args:
link (str): The link to be checked.

Determine whether a given URL is eligible for scraping based on the base URL and exclusion patterns.

A link is considered valid if it starts with the configured base URL (the prefix check is skipped when no base URL is set) and does not contain any of the specified exclusion patterns.

Parameters:
link (str): The URL to validate.

Returns:
bool: True if the link is valid, False otherwise.
bool: True if the link is valid for scraping; False otherwise.
"""
valid = True
if self.base_url and not link.startswith(self.base_url):
Expand All @@ -71,15 +70,16 @@ def is_valid_link(self, link):

def fetch_links(self, url, html=None):
"""
Fetch all valid links from the given URL.
Log the fetching process and outcome.

Args:
url (str): The URL to fetch links from.
html (str, optional): The HTML content of the page.

Retrieve all valid links from a specified URL or provided HTML content.

If HTML is not provided, the method fetches the page content using an HTTP GET request. Extracts and resolves all anchor tag links, removes URL fragments, and filters them using the link validation logic. Returns a set of valid links found on the page. Returns an empty list if the request fails.

Parameters:
url (str): The URL to extract links from.
html (str, optional): HTML content to parse instead of fetching from the URL.

Returns:
set: Set of valid links found on the page.
set: A set of valid, filtered links found on the page, or an empty set if none are found or on error.
"""
logger.debug(f"Fetching links from {url}")
try:
Expand Down Expand Up @@ -164,14 +164,9 @@ def scrape_page(self, html, url):

def start_scraping(self, url=None, urls_list=[]):
"""
Initiates the scraping process for a single URL or a list of URLs.
It validates URLs, logs the scraping process, and manages the
progress of scraping through the database.

Args:
url (str, optional): A single URL to start scraping from. Defaults to None.
urls_list (list, optional): A list of URLs to scrape.
"""
Starts the web scraping process from a given URL or list of URLs, managing progress, rate limiting, and database integration.

If a list of URLs is provided, only valid URLs are inserted into the database; otherwise, a single URL is used as the starting point. The method iteratively fetches unvisited links from the database, retrieves and processes each page, stores scraped content and metadata, and discovers new links to continue scraping (unless a predefined list is used). Progress is tracked with a progress bar, and rate limiting and delays are enforced as configured. The process continues until all discovered links have been visited.
# Validate and insert the provided URLs into the database
if urls_list:
# Iterate through the list to check for valid URLs
Expand Down
19 changes: 19 additions & 0 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,12 +40,20 @@ def test_cli_default_exports(monkeypatch, tmp_path):


def test_cli_disable_exports(monkeypatch, tmp_path):
    """
    Check that passing both --no-markdown and --no-json disables both exports.

    The CLI is driven through the `_run_cli` helper; the "md" and "json"
    entries of the mapping it returns must both be exactly False.
    """
    cli_flags = ["--no-markdown", "--no-json"]
    export_calls = _run_cli(monkeypatch, tmp_path, cli_flags)
    # Same order as before: markdown first, then JSON.
    for export_kind in ("md", "json"):
        assert export_calls[export_kind] is False


def test_cli_proxy_option(monkeypatch, tmp_path):
"""
Test that the CLI correctly passes the --proxy option to the Scraper constructor.

Verifies that when the CLI is invoked with a proxy URL, the Scraper instance receives the correct proxy argument.
"""
captured = {}

def fake_init(
Expand All @@ -57,6 +65,17 @@ def fake_init(
delay=0,
proxy=None,
):
"""
A replacement initializer for the Scraper class that captures the value of the 'proxy' argument for testing purposes.

Parameters:
base_url: The base URL for scraping.
exclude_patterns: Patterns to exclude from scraping.
db_manager: Database manager instance.
rate_limit: Optional rate limit for requests.
delay: Optional delay between requests.
proxy: Proxy URL to be captured for verification in tests.
"""
captured['proxy'] = proxy

monkeypatch.setattr(Scraper, '__init__', fake_init)
Expand Down
14 changes: 14 additions & 0 deletions tests/test_scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,11 @@ def get_all_pages(self):


def test_start_scraping_process(monkeypatch):
"""
Test the complete scraping process, verifying link tracking, page storage, and integration with mocked dependencies.

This test ensures that the `Scraper.start_scraping()` method correctly inserts and marks links as visited, stores scraped page content, and interacts properly with mocked HTTP requests and progress tracking.
"""
db = ListDB()
scraper = Scraper(base_url='http://example.com', exclude_patterns=[], db_manager=db)

Expand All @@ -156,6 +161,12 @@ class DummyResp:

class DummyTqdm:
def __init__(self, *a, **k):
"""
Initialize the dummy progress bar with an optional total count.

Parameters:
total (int, optional): The total number of items to track. Defaults to 0.
"""
self.total = k.get('total', 0)
def update(self, n):
pass
Expand All @@ -174,6 +185,9 @@ def close(self):


def test_scraper_proxy_initialization():
"""
Test that the Scraper initializes its session with the correct HTTP and HTTPS proxy settings when a proxy URL is provided.
"""
db = DummyDB()
scraper = Scraper(
base_url='http://example.com', exclude_patterns=[], db_manager=db, proxy='http://proxy:8080'
Expand Down