Skip to content

Commit daca214

Browse files
authored
Merge pull request #63 from obeone/codex/add-include-url-argument-for-filtering
Add include URL CLI filter
2 parents 583e719 + 67010a4 commit daca214

4 files changed

Lines changed: 148 additions & 9 deletions

File tree

crawler_to_md/cli.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,13 @@ def main():
7474
help="Exclude URLs containing this string",
7575
default=[],
7676
)
77+
parser.add_argument(
78+
"--include-url",
79+
"-I",
80+
action="append",
81+
help="Include only URLs containing this string",
82+
default=[],
83+
)
7784
parser.add_argument(
7885
"--export-individual",
7986
"-ei",
@@ -210,6 +217,7 @@ def main():
210217
scraper = Scraper(
211218
base_url=args.base_url,
212219
exclude_patterns=args.exclude_url,
220+
include_url_patterns=args.include_url,
213221
db_manager=db_manager,
214222
rate_limit=args.rate_limit,
215223
delay=args.delay,

crawler_to_md/scraper.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ def __init__(
2222
self,
2323
base_url,
2424
exclude_patterns,
25+
include_url_patterns,
2526
db_manager: DatabaseManager,
2627
rate_limit=0,
2728
delay=0,
@@ -35,6 +36,7 @@ def __init__(
3536
Args:
3637
base_url (str): The base URL to start scraping from.
3738
exclude_patterns (list): List of URL patterns to exclude from scraping.
39+
include_url_patterns (list): List of URL substrings; when non-empty, a link is scraped only if it contains at least one of them.
3840
db_manager (DatabaseManager): The database manager object.
3941
rate_limit (int): Maximum number of requests per minute.
4042
delay (float): Delay between requests in seconds.
@@ -50,6 +52,7 @@ def __init__(
5052
logger.debug(f"Initializing Scraper with base URL: {base_url}")
5153
self.base_url = base_url
5254
self.exclude_patterns = exclude_patterns or []
55+
self.include_url_patterns = include_url_patterns or []
5356
self.db_manager = db_manager
5457
self.rate_limit = rate_limit
5558
self.delay = delay
@@ -108,6 +111,10 @@ def is_valid_link(self, link):
108111
valid = True
109112
if self.base_url and not link.startswith(self.base_url):
110113
valid = False
114+
if self.include_url_patterns and not any(
115+
pattern in link for pattern in self.include_url_patterns
116+
):
117+
valid = False
111118
for pattern in self.exclude_patterns:
112119
if pattern in link:
113120
valid = False

tests/test_cli.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ def fake_init(
5858
self,
5959
base_url,
6060
exclude_patterns,
61+
include_url_patterns,
6162
db_manager,
6263
rate_limit=0,
6364
delay=0,
@@ -102,6 +103,7 @@ def fake_init(
102103
self,
103104
base_url,
104105
exclude_patterns,
106+
include_url_patterns,
105107
db_manager,
106108
rate_limit=0,
107109
delay=0,
@@ -146,6 +148,7 @@ def fake_init(
146148
self,
147149
base_url,
148150
exclude_patterns,
151+
include_url_patterns,
149152
db_manager,
150153
rate_limit=0,
151154
delay=0,
@@ -219,6 +222,7 @@ def fake_init(
219222
self,
220223
base_url,
221224
exclude_patterns,
225+
include_url_patterns,
222226
db_manager,
223227
rate_limit=0,
224228
delay=0,
@@ -275,6 +279,7 @@ def fake_init(
275279
self,
276280
base_url,
277281
exclude_patterns,
282+
include_url_patterns,
278283
db_manager,
279284
rate_limit=0,
280285
delay=0,
@@ -317,6 +322,58 @@ def fake_init(
317322
assert captured.get('exclude_filters') == ['span']
318323

319324

325+
def test_cli_include_url_option(monkeypatch, tmp_path):
326+
"""
327+
Ensure CLI passes include URL filters to the scraper.
328+
329+
Args:
330+
monkeypatch (pytest.MonkeyPatch): Pytest monkeypatch fixture.
331+
tmp_path (pathlib.Path): Temporary path for tests.
332+
"""
333+
captured = {}
334+
335+
def fake_init(
336+
self,
337+
base_url,
338+
exclude_patterns,
339+
include_url_patterns,
340+
db_manager,
341+
rate_limit=0,
342+
delay=0,
343+
proxy=None,
344+
include_filters=None,
345+
exclude_filters=None,
346+
):
347+
"""
348+
Capture include URL patterns argument.
349+
350+
Args:
351+
include_url_patterns (list): URL substrings to include.
352+
"""
353+
captured['include_url_patterns'] = include_url_patterns
354+
355+
monkeypatch.setattr(Scraper, '__init__', fake_init)
356+
monkeypatch.setattr(Scraper, 'start_scraping', lambda *a, **k: None)
357+
monkeypatch.setattr(ExportManager, 'export_to_markdown', lambda *a, **k: None)
358+
monkeypatch.setattr(ExportManager, 'export_to_json', lambda *a, **k: None)
359+
360+
cache_folder = tmp_path / 'cache'
361+
args = [
362+
'prog',
363+
'--url',
364+
'http://example.com',
365+
'--output-folder',
366+
str(tmp_path),
367+
'--cache-folder',
368+
str(cache_folder),
369+
'--include-url',
370+
'/blog',
371+
]
372+
monkeypatch.setattr(sys, 'argv', args)
373+
cli.main()
374+
assert captured.get('include_url_patterns') == ['/blog']
375+
376+
320377
def test_cli_overwrite_cache(monkeypatch, tmp_path):
321378
captured = {}
322379

tests/test_scraper.py

Lines changed: 76 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -28,17 +28,32 @@ def mark_link_visited(self, url):
2828
def test_is_valid_link():
2929
db = DummyDB()
3030
scraper = Scraper(
31-
base_url='https://example.com', exclude_patterns=['/exclude'], db_manager=db
31+
base_url='https://example.com',
32+
exclude_patterns=['/exclude'],
33+
include_url_patterns=[],
34+
db_manager=db,
3235
)
3336
assert scraper.is_valid_link('https://example.com/page')
3437
assert not scraper.is_valid_link('https://example.com/exclude/page')
3538
assert not scraper.is_valid_link('https://other.com/')
3639

40+
include_scraper = Scraper(
41+
base_url='https://example.com',
42+
exclude_patterns=[],
43+
include_url_patterns=['/docs'],
44+
db_manager=db,
45+
)
46+
assert include_scraper.is_valid_link('https://example.com/docs/page')
47+
assert not include_scraper.is_valid_link('https://example.com/blog')
48+
3749

3850
def test_fetch_links():
3951
db = DummyDB()
4052
scraper = Scraper(
41-
base_url='https://example.com', exclude_patterns=['/exclude'], db_manager=db
53+
base_url='https://example.com',
54+
exclude_patterns=['/exclude'],
55+
include_url_patterns=[],
56+
db_manager=db,
4257
)
4358
html = '''<html><body>
4459
<a href="https://example.com/page1">1</a>
@@ -49,6 +64,23 @@ def test_fetch_links():
4964
assert links == {'https://example.com/page1', 'https://example.com/page2'}
5065

5166

67+
def test_fetch_links_includes_only_matching_patterns():
68+
db = DummyDB()
69+
scraper = Scraper(
70+
base_url='https://example.com',
71+
exclude_patterns=[],
72+
include_url_patterns=['/page1'],
73+
db_manager=db,
74+
)
75+
html = '''<html><body>
76+
<a href="https://example.com/page1">1</a>
77+
<a href="/page2">2</a>
78+
<a href="https://example.com/page3">3</a>
79+
</body></html>'''
80+
links = scraper.fetch_links(url='https://example.com', html=html)
81+
assert links == {'https://example.com/page1'}
82+
83+
5284

5385

5486
@patch('os.remove')
@@ -60,7 +92,12 @@ def test_scrape_page_parses_content_and_metadata(mock_tempfile, mock_os_remove):
6092
mock_tempfile.return_value.__enter__.return_value = mock_file
6193

6294
db = DummyDB()
63-
scraper = Scraper(base_url='http://example.com', exclude_patterns=[], db_manager=db)
95+
scraper = Scraper(
96+
base_url='http://example.com',
97+
exclude_patterns=[],
98+
include_url_patterns=[],
99+
db_manager=db,
100+
)
64101
html = '<html><head><title>Test</title></head><body><p>Hello</p></body></html>'
65102

66103
# Act
@@ -83,7 +120,12 @@ def test_scrape_page_with_markitdown(mock_tempfile, mock_os_remove):
83120
mock_tempfile.return_value.__enter__.return_value = mock_file
84121

85122
db = DummyDB()
86-
scraper = Scraper(base_url='http://example.com', exclude_patterns=[], db_manager=db)
123+
scraper = Scraper(
124+
base_url='http://example.com',
125+
exclude_patterns=[],
126+
include_url_patterns=[],
127+
db_manager=db,
128+
)
87129
html = (
88130
'<html><head><title>Test</title></head><body><h1>A Title</h1>'
89131
'<p>This is a paragraph with <strong>bold</strong> text.</p></body></html>'
@@ -121,6 +163,7 @@ def test_scrape_page_include_exclude(mock_tempfile, mock_os_remove):
121163
scraper = Scraper(
122164
base_url='http://example.com',
123165
exclude_patterns=[],
166+
include_url_patterns=[],
124167
db_manager=db,
125168
include_filters=['p'],
126169
exclude_filters=['.remove'],
@@ -181,7 +224,12 @@ def get_all_pages(self):
181224

182225
def test_start_scraping_process(monkeypatch):
183226
db = ListDB()
184-
scraper = Scraper(base_url='http://example.com', exclude_patterns=[], db_manager=db)
227+
scraper = Scraper(
228+
base_url='http://example.com',
229+
exclude_patterns=[],
230+
include_url_patterns=[],
231+
db_manager=db,
232+
)
185233

186234
monkeypatch.setattr(Scraper, 'fetch_links', lambda self, url, html=None: set())
187235
monkeypatch.setattr(
@@ -219,7 +267,11 @@ def test_scraper_proxy_initialization(monkeypatch):
219267
db = DummyDB()
220268
monkeypatch.setattr(Scraper, '_test_proxy', lambda self: None)
221269
scraper = Scraper(
222-
base_url='http://example.com', exclude_patterns=[], db_manager=db, proxy='http://proxy:8080'
270+
base_url='http://example.com',
271+
exclude_patterns=[],
272+
include_url_patterns=[],
273+
db_manager=db,
274+
proxy='http://proxy:8080'
223275
)
224276
assert scraper.session.proxies.get('http') == 'http://proxy:8080'
225277
assert scraper.session.proxies.get('https') == 'http://proxy:8080'
@@ -230,7 +282,11 @@ def test_scraper_socks_proxy_initialization(monkeypatch):
230282
proxy = 'socks5://localhost:9050'
231283
monkeypatch.setattr(Scraper, '_test_proxy', lambda self: None)
232284
scraper = Scraper(
233-
base_url='http://example.com', exclude_patterns=[], db_manager=db, proxy=proxy
285+
base_url='http://example.com',
286+
exclude_patterns=[],
287+
include_url_patterns=[],
288+
db_manager=db,
289+
proxy=proxy
234290
)
235291
assert scraper.session.proxies.get('http') == proxy
236292
assert scraper.session.proxies.get('https') == proxy
@@ -244,13 +300,22 @@ def fake_head(self, url, timeout=5):
244300
monkeypatch.setattr(requests.Session, 'head', fake_head)
245301
with pytest.raises(ValueError):
246302
Scraper(
247-
base_url='http://example.com', exclude_patterns=[], db_manager=db, proxy='http://proxy:8080'
303+
base_url='http://example.com',
304+
exclude_patterns=[],
305+
include_url_patterns=[],
306+
db_manager=db,
307+
proxy='http://proxy:8080'
248308
)
249309

250310

251311
def test_scrape_page_returns_none_for_empty_content(monkeypatch):
252312
db = DummyDB()
253-
scraper = Scraper(base_url='http://example.com', exclude_patterns=[], db_manager=db)
313+
scraper = Scraper(
314+
base_url='http://example.com',
315+
exclude_patterns=[],
316+
include_url_patterns=[],
317+
db_manager=db,
318+
)
254319
html = '<html><body></body></html>'
255320

256321
with patch('crawler_to_md.scraper.MarkItDown') as mock_markdown:
@@ -266,6 +331,7 @@ def test_start_scraping_excludes_invalid_urls(monkeypatch):
266331
scraper = Scraper(
267332
base_url='http://example.com',
268333
exclude_patterns=['/exclude'],
334+
include_url_patterns=[],
269335
db_manager=db,
270336
)
271337

@@ -310,6 +376,7 @@ def test_start_scraping_filters_discovered_links(monkeypatch):
310376
scraper = Scraper(
311377
base_url='http://example.com',
312378
exclude_patterns=['/exclude'],
379+
include_url_patterns=[],
313380
db_manager=db,
314381
)
315382

0 commit comments

Comments
 (0)