Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 19 additions & 22 deletions src/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,15 +57,16 @@ def is_valid_link(self, link):

def fetch_links(self, url, html=None):
"""
Fetch all valid links from the given URL.
Log the fetching process and outcome.

Args:
url (str): The URL to fetch links from.
html (str, optional): The HTML content of the page.

Retrieve all valid links from a specified URL or provided HTML content.

If HTML content is not provided, sends a GET request to the URL and parses the response. Extracts anchor tags, normalizes and filters links based on validity criteria, and returns a set of valid links. Returns an empty list if the request fails.

Parameters:
url (str): The URL to extract links from.
html (str, optional): HTML content to parse instead of fetching from the URL.

Returns:
set: Set of valid links found on the page.
set: A set of valid, normalized links found on the page.
"""
logger.debug(f"Fetching links from {url}")
try:
Expand Down Expand Up @@ -101,15 +102,14 @@ def fetch_links(self, url, html=None):

def scrape_page(self, html, url):
"""
Scrape the content and metadata from the given URL.
Log the scraping process and outcome.

Args:
html (str): The HTML content of the page.
url (str): The URL to scrape.

Extracts the main content and page title from HTML, converting the content to Markdown format.

Parameters:
html (str): The HTML content of the page.
url (str): The URL of the page being scraped.

Returns:
tuple: A tuple containing the extracted content and metadata of the page.
tuple: A tuple containing the Markdown-formatted content (str) and a metadata dictionary with the page title. Returns (None, None) if an error occurs during extraction.
"""
logger.info(f"Scraping page {url}")

Expand Down Expand Up @@ -140,12 +140,9 @@ def scrape_page(self, html, url):

def start_scraping(self, url=None, urls_list=[]):
"""
Initiates the scraping process for a single URL or a list of URLs. It validates URLs,
logs the scraping process, and manages the progress of scraping through the database.

Args:
url (str, optional): A single URL to start scraping from. Defaults to None.
urls_list (list, optional): A list of URLs to scrape. Defaults to an empty list.
Start the scraping workflow from a single URL or a list of URLs, managing link validation, progress tracking, rate limiting, and database integration.

If a list of URLs is provided, only valid URLs are processed. The method iteratively fetches unvisited links from the database, retrieves their content, extracts and stores page data and metadata, discovers new links (unless working from a predefined list), and marks links as visited. Progress is tracked with a progress bar, and optional rate limiting and request delays are enforced.
"""
# Validate and insert the provided URLs into the database
if urls_list:
Expand Down
18 changes: 18 additions & 0 deletions tests/test_scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,11 @@ def test_is_valid_link():


def test_fetch_links():
"""
Test that `fetch_links` extracts and filters valid links from HTML content.

Verifies that only links within the base URL and not matching the exclude pattern are returned.
"""
db = DummyDB()
scraper = Scraper(base_url='https://example.com', exclude_patterns=['/exclude'], db_manager=db)
html = '''<html><body>
Expand All @@ -47,6 +52,11 @@ def test_fetch_links():
@patch('tempfile.NamedTemporaryFile')
def test_scrape_page_parses_content_and_metadata(mock_tempfile, mock_os_remove):
# Arrange
"""
Test that `scrape_page` correctly parses HTML content, extracts metadata, and uses the markdown converter.

Mocks file handling and the markdown converter to verify that the returned content contains the expected text and the metadata includes the correct page title.
"""
mock_file = MagicMock()
mock_file.name = "dummy_path"
mock_tempfile.return_value.__enter__.return_value = mock_file
Expand All @@ -68,6 +78,9 @@ def test_scrape_page_parses_content_and_metadata(mock_tempfile, mock_os_remove):
@patch('tempfile.NamedTemporaryFile')
def test_scrape_page_with_markitdown(mock_tempfile, mock_os_remove):
# Arrange
"""
Test that `scrape_page` correctly converts HTML to markdown using MarkItDown and extracts the page title as metadata.
"""
mock_file = MagicMock()
mock_file.name = "dummy_path"
mock_tempfile.return_value.__enter__.return_value = mock_file
Expand Down Expand Up @@ -124,6 +137,11 @@ def get_all_pages(self):


def test_start_scraping_process(monkeypatch):
"""
Test that the scraping process inserts, visits, and stores a page correctly using mocked dependencies.

This test verifies that the scraper's `start_scraping` method interacts with the database as expected when all external dependencies (network requests, link extraction, page scraping, and progress bar) are mocked. It asserts that one link is inserted, marked as visited, and the page is stored with the correct URL.
"""
db = ListDB()
scraper = Scraper(base_url='http://example.com', exclude_patterns=[], db_manager=db)

Expand Down
Loading