Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 9 additions & 6 deletions crawler_to_md/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ def scrape_page(self, html, url):
logger.error(f"Error scraping {url}: {e}")
return None, None

def start_scraping(self, url=None, urls_list=[]):
def start_scraping(self, url=None, urls_list=None):
"""
Initiates the scraping process for a single URL or a list of URLs.
It validates URLs, logs the scraping process, and manages the
Expand All @@ -197,15 +197,18 @@ def start_scraping(self, url=None, urls_list=[]):
urls_list (list, optional): A list of URLs to scrape.
"""
# Validate and insert the provided URLs into the database
if urls_list:
# Iterate through the list to check for valid URLs
for url_item in urls_list:
urls = urls_list or []
if urls:
# Build a new list of valid URLs without modifying the original list
validated_urls = []
for url_item in urls:
if not self.is_valid_link(url_item):
logger.warning(f"Skipping invalid URL: {url_item}")
urls_list.remove(url_item) # Remove invalid URLs from the list
continue
validated_urls.append(url_item)

# Insert the validated list of URLs into the database
self.db_manager.insert_link(urls_list)
self.db_manager.insert_link(validated_urls)
elif url:
# Insert a single URL if provided and valid
self.db_manager.insert_link(url)
Expand Down
88 changes: 88 additions & 0 deletions tests/test_scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,3 +218,91 @@ def test_scrape_page_returns_none_for_empty_content(monkeypatch):

assert content is None
assert metadata is None


def test_start_scraping_excludes_invalid_urls(monkeypatch):
    """Seed list case: the '/exclude' URL must be absent from the link store.

    The scraper is configured with ``exclude_patterns=['/exclude']`` and is
    handed three seed URLs, one of which contains that pattern. Link
    discovery, page conversion, HTTP access and the progress bar are all
    stubbed, so only the handling of the seed list is exercised.
    """
    link_store = ListDB()
    crawler = Scraper(
        base_url='http://example.com',
        exclude_patterns=['/exclude'],
        db_manager=link_store,
    )

    # Stub link discovery (no links found) and page conversion (canned output).
    def fake_fetch_links(self, url, html=None):
        return set()

    def fake_scrape_page(self, html, url):
        return '# MD', {'url': url}

    monkeypatch.setattr(Scraper, 'fetch_links', fake_fetch_links)
    monkeypatch.setattr(Scraper, 'scrape_page', fake_scrape_page)

    # Canned HTTP response returned for every request made via the session.
    class StubResponse:
        status_code = 200
        headers = {'content-type': 'text/html'}
        content = b'<html></html>'
        text = '<html></html>'

    def fake_get(url):
        return StubResponse()

    monkeypatch.setattr(crawler.session, 'get', fake_get)

    # Silent stand-in for the tqdm progress bar.
    class SilentBar:
        def __init__(self, *args, **kwargs):
            self.total = kwargs.get('total', 0)

        def update(self, n):
            pass

        def refresh(self):
            pass

        def close(self):
            pass

    def make_bar(*args, **kwargs):
        return SilentBar(*args, **kwargs)

    monkeypatch.setattr(tqdm, 'tqdm', make_bar)

    seed_urls = [
        'http://example.com/page1',
        'http://example.com/exclude/page',
        'http://example.com/page2',
    ]

    crawler.start_scraping(urls_list=seed_urls)

    assert 'http://example.com/exclude/page' not in link_store.links


def test_start_scraping_filters_discovered_links(monkeypatch):
    """Single-URL case: the '/exclude' link must be absent from the link store.

    The stubbed HTTP response serves a page with three anchors, one of which
    points at ``/exclude/page``. ``fetch_links`` is NOT patched here, so the
    scraper's own link handling runs against that HTML (presumably extracting
    and filtering the anchors -- confirm against ``Scraper.fetch_links``).
    """
    link_store = ListDB()
    crawler = Scraper(
        base_url='http://example.com',
        exclude_patterns=['/exclude'],
        db_manager=link_store,
    )

    # Page body served for every request; contains one excluded link.
    page_html = ''.join(
        [
            '<html><body>',
            '<a href="/page1">1</a>',
            '<a href="/exclude/page">2</a>',
            '<a href="/page2">3</a>',
            '</body></html>',
        ]
    )

    class StubResponse:
        status_code = 200
        headers = {'content-type': 'text/html'}
        text = page_html

    def fake_get(url):
        return StubResponse()

    monkeypatch.setattr(crawler.session, 'get', fake_get)

    # Page conversion is replaced with canned output.
    def fake_scrape_page(self, html, url):
        return '# MD', {'url': url}

    monkeypatch.setattr(Scraper, 'scrape_page', fake_scrape_page)

    # Silent stand-in for the tqdm progress bar.
    class SilentBar:
        def __init__(self, *args, **kwargs):
            self.total = kwargs.get('total', 0)

        def update(self, n):
            pass

        def refresh(self):
            pass

        def close(self):
            pass

    def make_bar(*args, **kwargs):
        return SilentBar(*args, **kwargs)

    monkeypatch.setattr(tqdm, 'tqdm', make_bar)

    crawler.start_scraping(url='http://example.com')

    assert 'http://example.com/exclude/page' not in link_store.links