Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/build-and-publish.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ jobs:
tags: |
ghcr.io/obeone/crawler-to-md:latest
docker.io/obeoneorg/crawler-to-md:latest
platforms: linux/amd64,linux/arm64,linux/i386
platforms: linux/amd64,linux/arm64

- name: Set up cosign
if: github.event_name == 'push' && github.ref == 'refs/heads/main'
Expand Down
103 changes: 80 additions & 23 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,28 +19,70 @@
def main():
"""
Main function to start the web scraper application.

This function parses command line arguments, initializes necessary components,
and manages the scraping and exporting process.

Raises:
ValueError: If neither a URL nor a URLs file is provided.
"""
logger.info("Starting the web scraper application.")

# Parse command line arguments
parser = argparse.ArgumentParser(description="Web Scraper to Markdown")
parser.add_argument("--url", "-u", help="Base URL to start scraping")
parser.add_argument("--urls-file", help="Path to a file containing URLs to scrape, one URL per line. If '-', read from stdin.")
parser.add_argument("--output-folder", "-o", help="Output folder for the markdown file", default="./output")
parser.add_argument("--cache-folder", "-c", help="Cache folder for storing database", default="./cache")
parser.add_argument("--base-url", "-b", help="Base URL for filtering links. Defaults to the URL base")
parser.add_argument("--title", "-t", help="Final title of the markdown file. Defaults to the URL")
parser.add_argument("--exclude", "-e", action="append", help="Exclude URLs containing this string", default=[])
parser.add_argument("--export-individual", "-ei", action="store_true", help="Export each page as an individual Markdown file", default=False)
parser.add_argument("--rate-limit", "-rl", type=int, help="Maximum number of requests per minute", default=0)
parser.add_argument("--delay", "-d", type=float, help="Delay between requests in seconds", default=0)
parser.add_argument(
"--urls-file",
help="Path to a file containing URLs to scrape, one URL per line. If '-', read from stdin.",
)
parser.add_argument(
"--output-folder", "-o", help="Output folder for the markdown file", default="./output"
)
parser.add_argument(
"--cache-folder", "-c", help="Cache folder for storing database", default="./cache"
)
parser.add_argument(
"--base-url", "-b", help="Base URL for filtering links. Defaults to the URL base"
)
parser.add_argument(
"--title", "-t", help="Final title of the markdown file. Defaults to the URL"
)
parser.add_argument(
"--exclude",
"-e",
action="append",
help="Exclude URLs containing this string",
default=[],
)
parser.add_argument(
"--export-individual",
"-ei",
action="store_true",
help="Export each page as an individual Markdown file",
default=False,
)
parser.add_argument(
"--rate-limit",
"-rl",
type=int,
help="Maximum number of requests per minute",
default=0,
)
parser.add_argument(
"--delay",
"-d",
type=float,
help="Delay between requests in seconds",
default=0,
)

try:
import argcomplete

argcomplete.autocomplete(parser)
except ImportError:
pass

args = parser.parse_args()
logger.debug(f"Command line arguments parsed: {args}")

Expand All @@ -52,21 +94,23 @@ def main():
else:
with open(args.urls_file, "r") as file:
urls_list = [line.strip() for line in file.readlines()]

urls_list = utils.deduplicate_list(urls_list)
args.url = None # Ensure args.url is defined even if not used
else:
urls_list = []

if not args.url and not urls_list:
raise ValueError("No URL provided. Please provide either --url or --urls-file.")

output = os.path.join(args.output_folder, utils.url_to_filename(args.url) if args.url else utils.url_to_filename(urls_list[0]))

output = os.path.join(
args.output_folder,
utils.url_to_filename(args.url) if args.url else utils.url_to_filename(urls_list[0]),
)

# Create the output folder if it does not exist
if not os.path.exists(output):
logger.info(f"Creating output folder at {output}")

os.makedirs(output)

# Create the cache folder if it does not exist
Expand All @@ -86,10 +130,21 @@ def main():
logger.debug(f"No title provided. Setting title to {args.title}")

# Initialize managers
db_manager = DatabaseManager(os.path.join(args.cache_folder, utils.url_to_filename(args.url if args.url else urls_list[0]) + ".sqlite"))
db_manager = DatabaseManager(
os.path.join(
args.cache_folder,
utils.url_to_filename(args.url if args.url else urls_list[0]) + ".sqlite",
)
)
logger.info("DatabaseManager initialized.")

scraper = Scraper(base_url=args.base_url, exclude_patterns=args.exclude, db_manager=db_manager, rate_limit=args.rate_limit, delay=args.delay)
scraper = Scraper(
base_url=args.base_url,
exclude_patterns=args.exclude,
db_manager=db_manager,
rate_limit=args.rate_limit,
delay=args.delay,
)
logger.info("Scraper initialized.")

# Start the scraping process
Expand All @@ -101,18 +156,20 @@ def main():
# After the scraping process is completed in the main function
export_manager = ExportManager(db_manager, args.title)
logger.info("ExportManager initialized.")

export_manager.export_to_markdown(os.path.join(output, f"{output_name}.md"))
logger.info("Export to markdown completed.")

export_manager.export_to_json(os.path.join(output, f"{output_name}.json"))
logger.info("Export to JSON completed.")

if args.export_individual:
logger.info("Export of individual pages...")
output_folder_ei = export_manager.export_individual_markdown(output_folder=output, base_url=args.base_url if args.base_url else None)
output_folder_ei = export_manager.export_individual_markdown(
output_folder=output, base_url=args.base_url if args.base_url else None
)
logger.info("Export of individual Markdown files completed.")

markdown_path = os.path.join(output, f"{output_name}.md")
json_path = os.path.join(output, f"{output_name}.json")
print(f"\033[94m Markdown file generated at: \033[0m", markdown_path)
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,6 @@ mdformat_frontmatter==2.0.8
mdformat_tables==1.0.0
requests==2.32.3
tqdm==4.67.1
trafilatura==2.0.0
markitdown==0.1.2
coloredlogs==15.0.1
beautifulsoup4==4.13.4
40 changes: 19 additions & 21 deletions src/scraper.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,14 @@
from curses import meta
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urldefrag
from . import log_setup
import trafilatura
import mdformat
from markitdown import MarkItDown
import json
from .database_manager import DatabaseManager
from tqdm import tqdm
import coloredlogs
import time
import tempfile
import os


logger = log_setup.get_logger()
Expand Down Expand Up @@ -79,7 +78,7 @@ def fetch_links(self, url, html=None):
)
return []
else:
content = response.content
content = response.text
else:
content = html

Expand Down Expand Up @@ -115,23 +114,22 @@ def scrape_page(self, html, url):
logger.info(f"Scraping page {url}")

try:
metadata = trafilatura.metadata.extract_metadata(filecontent=html, default_url=url).as_dict()
# Parse the content using BeautifulSoup
soup = BeautifulSoup(html, "html.parser")

# Extract title from the page
title = soup.title.string if soup.title else ""

metadata = {"title": title}

# Convert the HTML to Markdown
with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix=".html") as tmp:
tmp.write(html)
tmp_path = tmp.name

if "body" in metadata:
metadata.pop("body")
if "commentsbody" in metadata:
metadata.pop("commentsbody")
markdown = str(MarkItDown().convert(tmp_path))

markdown = (
trafilatura.extract(
html,
output_format="markdown",
include_formatting=True,
include_links=True,
include_tables=True,
)
or ""
)
os.remove(tmp_path)

logger.debug(f"Successfully scraped content and metadata from {url}")
return markdown, metadata
Expand Down Expand Up @@ -229,7 +227,7 @@ def start_scraping(self, url=None, urls_list=[]):
continue

# Extract the HTML content from the response
html = response.content
html = response.text

# Scrape the page for content and metadata
content, metadata = self.scrape_page(html, url)
Expand Down
45 changes: 42 additions & 3 deletions tests/test_scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,14 +39,52 @@ def test_fetch_links():
assert links == {'https://example.com/page1', 'https://example.com/page2'}


def test_scrape_page_parses_content_and_metadata():
from unittest.mock import patch, MagicMock

...

# Checks that Scraper.scrape_page returns the converted page content together
# with metadata, with os.remove, tempfile.NamedTemporaryFile and MarkItDown mocked.
# patch decorators are applied bottom-up, so the NamedTemporaryFile mock is the
# first positional argument and the os.remove mock is the second.
@patch('os.remove')
@patch('tempfile.NamedTemporaryFile')
def test_scrape_page_parses_content_and_metadata(mock_tempfile, mock_os_remove):
# Arrange
# Stand-in for the object yielded by the NamedTemporaryFile context manager;
# only its .name attribute is configured here.
mock_file = MagicMock()
mock_file.name = "dummy_path"
mock_tempfile.return_value.__enter__.return_value = mock_file

db = DummyDB()
scraper = Scraper(base_url='http://example.com', exclude_patterns=[], db_manager=db)
html = '<html><head><title>Test</title></head><body><p>Hello</p></body></html>'
# NOTE(review): this call runs before MarkItDown is patched and is repeated
# inside the "with" block below; its results are overwritten there. It looks
# like a leftover from the removed side of the diff - confirm and drop if so.
content, metadata = scraper.scrape_page(html, 'http://example.com/test')

# Act
# Replace the MarkItDown name looked up in src.scraper so convert() returns a fixed string.
with patch('src.scraper.MarkItDown') as mock_markdown:
mock_markdown.return_value.convert.return_value = "Hello"
content, metadata = scraper.scrape_page(html, 'http://example.com/test')

# Assert
assert 'Hello' in content
assert metadata.get('title') == 'Test'
# NOTE(review): the scrape_page body shown in this change builds metadata as
# {"title": title} only - confirm that a 'url' key is still expected here.
assert metadata.get('url') == 'http://example.com/test'

@patch('os.remove')
@patch('tempfile.NamedTemporaryFile')
def test_scrape_page_with_markitdown(mock_tempfile, mock_os_remove):
    """Check that scrape_page returns the patched MarkItDown output and the page title.

    The temporary file and ``os.remove`` are mocked so nothing touches disk,
    and ``src.scraper.MarkItDown`` is patched to return a fixed Markdown string.
    """
    # The context manager returned by NamedTemporaryFile yields this stub.
    fake_tmp = MagicMock()
    fake_tmp.name = "dummy_path"
    mock_tempfile.return_value.__enter__.return_value = fake_tmp

    expected_markdown = "# A Title\n\nThis is a paragraph with **bold** text."
    page_html = '<html><head><title>Test</title></head><body><h1>A Title</h1><p>This is a paragraph with <strong>bold</strong> text.</p></body></html>'
    scraper = Scraper(
        base_url='http://example.com',
        exclude_patterns=[],
        db_manager=DummyDB(),
    )

    # Run the scrape with the converter swapped for a canned result.
    with patch('src.scraper.MarkItDown') as mock_markdown:
        mock_markdown.return_value.convert.return_value = expected_markdown
        content, metadata = scraper.scrape_page(page_html, 'http://example.com/test')

    # The content must be exactly the converter's output; the title comes from <title>.
    assert content == '# A Title\n\nThis is a paragraph with **bold** text.'
    assert metadata.get('title') == 'Test'


import requests
import tqdm
Expand Down Expand Up @@ -96,6 +134,7 @@ class DummyResp:
status_code = 200
headers = {'content-type': 'text/html'}
content = b'<html></html>'
text = '<html></html>'

monkeypatch.setattr(requests, 'get', lambda url: DummyResp())

Expand Down