Skip to content

Commit 17dd781

Browse files
📝 Add docstrings to codex/ajouter-option-pour-utiliser-un-proxy
Docstring generation was requested by @obeone in #50 (comment). The following files were modified: `crawler_to_md/cli.py`, `crawler_to_md/scraper.py`, `tests/test_cli.py`, `tests/test_scraper.py`.
1 parent af7778a commit 17dd781

4 files changed

Lines changed: 64 additions & 40 deletions

File tree

crawler_to_md/cli.py

Lines changed: 3 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -19,13 +19,9 @@
1919

2020
def main():
2121
"""
22-
Main function to start the web scraper application.
23-
24-
This function parses command line arguments, initializes necessary components,
25-
and manages the scraping and exporting process.
26-
27-
Raises:
28-
ValueError: If neither a URL nor a URLs file is provided.
22+
Parses command-line arguments and orchestrates the web scraping, data storage, and export process.
23+
24+
This function serves as the main entry point for the web scraper application. It handles argument parsing, initializes required components, manages the scraping workflow, and exports the results to Markdown and JSON formats as specified by the user. The function ensures necessary directories exist, validates input, and provides user feedback on output locations.
2925
"""
3026
logger.info("Starting the web scraper application.")
3127

crawler_to_md/scraper.py

Lines changed: 28 additions & 33 deletions
Original file line number | Diff line number | Diff line change
@@ -27,16 +27,14 @@ def __init__(
2727
proxy=None,
2828
):
2929
"""
30-
Initialize the Scraper object.
31-
Log the initialization process.
32-
33-
Args:
34-
base_url (str): The base URL to start scraping from.
35-
exclude_patterns (list): List of patterns to exclude from scraping.
36-
db_manager (DatabaseManager): The database manager object.
37-
rate_limit (int): Maximum number of requests per minute.
38-
delay (float): Delay between requests in seconds.
39-
proxy (str, optional): Proxy URL for HTTP requests.
30+
Initializes a Scraper instance with base URL, exclusion patterns, database manager, and optional rate limiting, delay, and proxy settings.
31+
32+
Parameters:
33+
base_url (str): The root URL from which scraping begins.
34+
exclude_patterns (list): URL patterns to exclude from scraping.
35+
rate_limit (int): Maximum number of requests allowed per minute.
36+
delay (float): Time in seconds to wait between requests.
37+
proxy (str, optional): Proxy URL to route HTTP requests through.
4038
"""
4139
logger.debug(f"Initializing Scraper with base URL: {base_url}")
4240
self.base_url = base_url
@@ -51,14 +49,15 @@ def __init__(
5149

5250
def is_valid_link(self, link):
5351
"""
54-
Check if the given link is valid for scraping.
55-
Log the result of the validation.
56-
57-
Args:
58-
link (str): The link to be checked.
59-
52+
Determine whether a given URL is eligible for scraping based on the base URL and exclusion patterns.
53+
54+
A link is considered valid if it starts with the configured base URL and does not contain any of the specified exclusion patterns.
55+
56+
Parameters:
57+
link (str): The URL to validate.
58+
6059
Returns:
61-
bool: True if the link is valid, False otherwise.
60+
bool: True if the link is valid for scraping; False otherwise.
6261
"""
6362
valid = True
6463
if self.base_url and not link.startswith(self.base_url):
@@ -71,15 +70,16 @@ def is_valid_link(self, link):
7170

7271
def fetch_links(self, url, html=None):
7372
"""
74-
Fetch all valid links from the given URL.
75-
Log the fetching process and outcome.
76-
77-
Args:
78-
url (str): The URL to fetch links from.
79-
html (str, optional): The HTML content of the page.
80-
73+
Retrieve all valid links from a specified URL or provided HTML content.
74+
75+
If HTML is not provided, the method fetches the page content using an HTTP GET request. It then extracts and resolves all anchor tag links, removes URL fragments, and filters them using the link validation logic. The result is a set of valid links found on the page; if the request fails, the result is empty.
76+
77+
Parameters:
78+
url (str): The URL to extract links from.
79+
html (str, optional): HTML content to parse instead of fetching from the URL.
80+
8181
Returns:
82-
set: Set of valid links found on the page.
82+
set: A set of valid, filtered links found on the page, or an empty set if none are found or on error.
8383
"""
8484
logger.debug(f"Fetching links from {url}")
8585
try:
@@ -164,14 +164,9 @@ def scrape_page(self, html, url):
164164

165165
def start_scraping(self, url=None, urls_list=[]):
166166
"""
167-
Initiates the scraping process for a single URL or a list of URLs.
168-
It validates URLs, logs the scraping process, and manages the
169-
progress of scraping through the database.
170-
171-
Args:
172-
url (str, optional): A single URL to start scraping from. Defaults to None.
173-
urls_list (list, optional): A list of URLs to scrape.
174-
"""
167+
Starts the web scraping process from a given URL or list of URLs, managing progress, rate limiting, and database integration.
168+
169+
If a list of URLs is provided, only valid URLs are inserted into the database; otherwise, a single URL is used as the starting point. The method iteratively fetches unvisited links from the database, retrieves and processes each page, stores scraped content and metadata, and discovers new links to continue scraping (unless a predefined list is used). Progress is tracked with a progress bar, and rate limiting and delays are enforced as configured. The process continues until all discovered links have been visited.
175170
# Validate and insert the provided URLs into the database
176171
if urls_list:
177172
# Iterate through the list to check for valid URLs

tests/test_cli.py

Lines changed: 19 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -40,12 +40,20 @@ def test_cli_default_exports(monkeypatch, tmp_path):
4040

4141

4242
def test_cli_disable_exports(monkeypatch, tmp_path):
43+
"""
44+
Test that disabling both markdown and JSON exports via CLI flags prevents export methods from being called.
45+
"""
4346
calls = _run_cli(monkeypatch, tmp_path, ["--no-markdown", "--no-json"])
4447
assert calls["md"] is False
4548
assert calls["json"] is False
4649

4750

4851
def test_cli_proxy_option(monkeypatch, tmp_path):
52+
"""
53+
Test that the CLI correctly passes the --proxy option to the Scraper constructor.
54+
55+
Verifies that when the CLI is invoked with a proxy URL, the Scraper instance receives the correct proxy argument.
56+
"""
4957
captured = {}
5058

5159
def fake_init(
@@ -57,6 +65,17 @@ def fake_init(
5765
delay=0,
5866
proxy=None,
5967
):
68+
"""
69+
A replacement initializer for the Scraper class that captures the value of the 'proxy' argument for testing purposes.
70+
71+
Parameters:
72+
base_url: The base URL for scraping.
73+
exclude_patterns: Patterns to exclude from scraping.
74+
db_manager: Database manager instance.
75+
rate_limit: Optional rate limit for requests.
76+
delay: Optional delay between requests.
77+
proxy: Proxy URL to be captured for verification in tests.
78+
"""
6079
captured['proxy'] = proxy
6180

6281
monkeypatch.setattr(Scraper, '__init__', fake_init)

tests/test_scraper.py

Lines changed: 14 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -138,6 +138,11 @@ def get_all_pages(self):
138138

139139

140140
def test_start_scraping_process(monkeypatch):
141+
"""
142+
Test the complete scraping process, verifying link tracking, page storage, and integration with mocked dependencies.
143+
144+
This test ensures that the `Scraper.start_scraping()` method correctly inserts and marks links as visited, stores scraped page content, and interacts properly with mocked HTTP requests and progress tracking.
145+
"""
141146
db = ListDB()
142147
scraper = Scraper(base_url='http://example.com', exclude_patterns=[], db_manager=db)
143148

@@ -156,6 +161,12 @@ class DummyResp:
156161

157162
class DummyTqdm:
158163
def __init__(self, *a, **k):
164+
"""
165+
Initialize the dummy progress bar with an optional total count.
166+
167+
Parameters:
168+
total (int, optional): The total number of items to track. Defaults to 0.
169+
"""
159170
self.total = k.get('total', 0)
160171
def update(self, n):
161172
pass
@@ -174,6 +185,9 @@ def close(self):
174185

175186

176187
def test_scraper_proxy_initialization():
188+
"""
189+
Test that the Scraper initializes its session with the correct HTTP and HTTPS proxy settings when a proxy URL is provided.
190+
"""
177191
db = DummyDB()
178192
scraper = Scraper(
179193
base_url='http://example.com', exclude_patterns=[], db_manager=db, proxy='http://proxy:8080'

0 commit comments

Comments
 (0)