Skip to content

Commit e5f089f

Browse files
committed
restructured load
1 parent 088d5e1 commit e5f089f

File tree

4 files changed

+148
-77
lines changed

4 files changed

+148
-77
lines changed

mcp_web_tools/loaders.py

Lines changed: 119 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,87 @@
1616
logger = logging.getLogger(__name__)
1717

1818

19+
def _fetch_with_trafilatura(url: str) -> tuple[str | None, str | None]:
    """Download *url* using trafilatura's fetcher.

    Returns:
        ``(html, "trafilatura")`` when a non-empty document is returned,
        otherwise ``(None, None)``. Exceptions raised while fetching are
        logged and reported as a miss instead of being propagated.
    """
    document = None
    try:
        logger.info("Attempting to fetch %s with trafilatura", url)
        document = trafilatura.fetch_url(url)
    except Exception as exc:
        logger.error("Error fetching page with trafilatura: %s", exc)
    # An empty or missing document counts as a failed fetch.
    return (document, "trafilatura") if document else (None, None)
28+
29+
30+
async def _fetch_with_zendriver(url: str) -> tuple[str | None, str | None]:
    """Load *url* in a headless zendriver browser and return the page HTML.

    Returns:
        ``(html, "zendriver")`` when the page yields non-empty content,
        otherwise ``(None, None)``. Errors while driving the browser are
        logged as warnings and reported as a miss.
    """
    browser = None
    try:
        browser = await zd.start(headless=True, sandbox=False)
        page = await browser.get(url)
        await page.wait_for_ready_state("complete", timeout=5)
        await page.wait(t=1)  # Allow dynamic content to settle
        html = await page.get_content()
        if html:
            return html, "zendriver"
    except Exception as exc:
        logger.warning("Error fetching page with zendriver: %s", exc)
    finally:
        # Always attempt to shut the browser down, even after a failure above.
        if browser:
            try:
                await browser.stop()
            except Exception as exc:
                # Shutdown stays best-effort, but leave a trace for debugging
                # instead of discarding the error silently.
                logger.debug("Ignoring error while stopping zendriver: %s", exc)
    return None, None
49+
50+
51+
async def _fetch_html(url: str) -> tuple[str | None, str | None]:
    """Fetch the HTML for *url*, trying trafilatura first and zendriver second.

    Returns:
        ``(html, provider)`` from the first fetcher that yields content, or
        whatever the zendriver fallback returns (``(None, None)`` on failure).
    """
    # The trafilatura fetch is a synchronous call; run it in a worker thread
    # so it does not block the event loop and so a surrounding
    # ``asyncio.timeout`` can stop waiting on it. The thread itself is not
    # cancelled on timeout; it simply finishes in the background.
    html, provider = await asyncio.to_thread(_fetch_with_trafilatura, url)
    if html:
        return html, provider
    return await _fetch_with_zendriver(url)
56+
57+
58+
def _extract_markdown(html: str) -> tuple[str | None, str | None]:
    """Convert *html* to markdown with trafilatura.

    Returns:
        ``(markdown, None)`` on success, ``(None, None)`` when extraction
        produced nothing, or ``(None, error_message)`` when trafilatura
        raised; the exception is logged before the message is returned.
    """
    extract_options = {
        "output_format": "markdown",
        "include_images": True,
        "include_links": True,
    }
    try:
        markdown = trafilatura.extract(html, **extract_options)
    except Exception as exc:
        logger.error("Error extracting content with trafilatura: %s", exc)
        return None, f"Error: Failed to extract readable content: {exc}"
    # Normalise any empty result to None so callers can test a single value.
    return (markdown or None), None
72+
73+
74+
def _format_frontmatter(
75+
*,
76+
fetched: str | None,
77+
extracted: str | None,
78+
start: int,
79+
end: int,
80+
length: int,
81+
) -> str:
82+
lines = ["---"]
83+
lines.append(f"fetched: {fetched or 'unknown'}")
84+
lines.append(f"extracted: {extracted or 'none'}")
85+
lines.append(f"start: {start}")
86+
lines.append(f"end: {end}")
87+
lines.append(f"length: {length}")
88+
lines.append("---")
89+
return "\n".join(lines) + "\n\n"
90+
91+
92+
def _slice_text(text: str, start: int, limit: int) -> tuple[str, int]:
93+
if limit <= 0:
94+
end = len(text)
95+
else:
96+
end = min(start + limit, len(text))
97+
return text[start:end], end
98+
99+
19100
async def load_webpage(
20101
url: str, limit: int = 10_000, offset: int = 0, raw: bool = False
21102
) -> str:
@@ -31,83 +112,55 @@ async def load_webpage(
31112
"""
32113
try:
33114
async with asyncio.timeout(10):
34-
# Initialize html, provider and browser to None
35-
html = None
36-
provider = None # Will be set to 'trafilatura' or 'zendriver'
37-
browser = None
38-
39-
try:
40-
logger.info(f"Attempting to fetch {url} with trafilatura")
41-
html = trafilatura.fetch_url(url)
42-
if html:
43-
provider = "trafilatura"
44-
except Exception as e:
45-
logger.error(f"Error fetching page with trafilatura: {str(e)}")
46-
47-
if not html:
48-
try:
49-
browser = await zd.start(headless=True, sandbox=False)
50-
page = await browser.get(url)
51-
await page.wait_for_ready_state("complete", timeout=5)
52-
await page.wait(t=1) # Wait a bit for dynamic content
53-
html = await page.get_content()
54-
if html:
55-
provider = "zendriver"
56-
except Exception as e:
57-
logger.warning(
58-
f"Error fetching page with zendriver: {str(e)}, trying trafilatura next"
59-
)
60-
finally:
61-
# Ensure browser is closed even if an error occurs
62-
if browser:
63-
try:
64-
await browser.stop()
65-
except Exception:
66-
pass # Ignore errors during browser closing
67-
68-
# If both methods failed, return error
115+
html, fetch_provider = await _fetch_html(url)
69116
if not html:
70117
logger.error(
71-
f"Failed to retrieve content from {url} using both zendriver and trafilatura"
118+
"Failed to retrieve content from %s using both zendriver and trafilatura",
119+
url,
72120
)
73121
return f"Error: Failed to retrieve page content from {url} using multiple methods"
74122

75-
if raw:
76-
note = f"_Fetched via: {provider}_\n\n" if provider else ""
77-
res = html[offset : offset + limit]
78-
res += f"\n\n---Showing {offset} to {min(offset + limit, len(html))} out of {len(html)} characters.---"
79-
return note + res
123+
extraction_provider: str | None = None
124+
warning: str | None = None
125+
source = html
80126

81-
try:
82-
content = trafilatura.extract(
83-
html,
84-
output_format="markdown",
85-
include_images=True,
86-
include_links=True,
87-
)
88-
except Exception as e:
89-
logger.error(f"Error extracting content with trafilatura: {str(e)}")
90-
return f"Error: Failed to extract readable content: {str(e)}"
91-
92-
if not content:
93-
logger.warning(f"Failed to extract content from {url}")
94-
# Fallback to raw HTML with a warning
95-
note = f"_Fetched via: {provider}_\n\n" if provider else ""
96-
return (
97-
f"{note}Warning: Could not extract readable content from {url}. "
98-
f"Showing raw HTML instead.\n\n{html[offset : offset + limit]}"
99-
)
127+
if raw:
128+
extraction_provider = "raw"
129+
else:
130+
content, extraction_error = _extract_markdown(html)
131+
if extraction_error:
132+
return extraction_error
133+
if content:
134+
source = content
135+
extraction_provider = "trafilatura"
136+
else:
137+
extraction_provider = "raw"
138+
warning = (
139+
f"Warning: Could not extract readable content from {url}. "
140+
"Showing raw HTML instead."
141+
)
100142

101-
note = f"_Fetched via: {provider}_\n\n" if provider else ""
102-
res = content[offset : offset + limit]
103-
res += f"\n\n---Showing {offset} to {min(offset + limit, len(content))} out of {len(content)} characters.---"
104-
return note + res
143+
total_length = len(source)
144+
content_slice, slice_end = _slice_text(source, offset, limit)
145+
frontmatter = _format_frontmatter(
146+
fetched=fetch_provider,
147+
extracted=extraction_provider,
148+
start=offset,
149+
end=slice_end,
150+
length=total_length,
151+
)
152+
153+
parts = [frontmatter]
154+
if warning:
155+
parts.append(f"{warning}\n\n")
156+
parts.append(content_slice)
157+
return "".join(parts)
105158

106159
except asyncio.TimeoutError:
107-
logger.error(f"Request timed out after 10 seconds for URL: {url}")
160+
logger.error("Request timed out after 10 seconds for URL: %s", url)
108161
return f"Error: Request timed out after 10 seconds for URL: {url}"
109162
except Exception as e:
110-
logger.error(f"Error loading page: {str(e)}")
163+
logger.error("Error loading page: %s", e)
111164
return f"Error loading page: {str(e)}"
112165

113166

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "mcp-web-tools"
3-
version = "0.8.2"
3+
version = "0.8.3"
44
description = "A powerful MCP server to equip LLMs with web access, search, and content extraction capabilities"
55
readme = "README.md"
66
authors = [

tests/test_loaders.py

Lines changed: 27 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,14 @@ async def test_load_webpage_uses_trafilatura_first():
2424
):
2525
result = await load_webpage("https://example.com/page", limit=100)
2626

27-
assert result.startswith("_Fetched via: trafilatura_")
28-
assert "Hello from page" in result
29-
assert "---Showing 0 to" in result
27+
frontmatter, body = result.split("\n\n", 1)
28+
assert frontmatter.startswith("---")
29+
assert "fetched: trafilatura" in frontmatter
30+
assert "extracted: trafilatura" in frontmatter
31+
assert "start: 0" in frontmatter
32+
assert f"end: {len(markdown)}" in frontmatter
33+
assert f"length: {len(markdown)}" in frontmatter
34+
assert body.strip() == "Hello from page"
3035
mock_fetch.assert_called_once_with("https://example.com/page")
3136
mock_extract.assert_called_once()
3237
mock_start.assert_not_called()
@@ -55,8 +60,13 @@ async def test_load_webpage_falls_back_to_zendriver():
5560
mock_start.return_value = browser
5661
result = await load_webpage("https://example.com/page")
5762

58-
assert result.startswith("_Fetched via: zendriver_")
59-
assert "Zendriver content" in result
63+
frontmatter, body = result.split("\n\n", 1)
64+
assert "fetched: zendriver" in frontmatter
65+
assert "extracted: trafilatura" in frontmatter
66+
assert "start: 0" in frontmatter
67+
assert f"end: {len(markdown)}" in frontmatter
68+
assert f"length: {len(markdown)}" in frontmatter
69+
assert body.strip() == "Zendriver content"
6070
assert browser.get.await_args_list[0].args == ("https://example.com/page",)
6171
page.wait_for_ready_state.assert_awaited_once_with("complete", timeout=5)
6272
page.wait.assert_awaited_once_with(t=1)
@@ -76,8 +86,13 @@ async def test_load_webpage_raw_html_short_circuits_extraction():
7686
):
7787
result = await load_webpage("https://example.com/raw", limit=5, raw=True)
7888

79-
assert result.startswith("_Fetched via: trafilatura_")
80-
assert "<html" in result
89+
frontmatter, body = result.split("\n\n", 1)
90+
assert "fetched: trafilatura" in frontmatter
91+
assert "extracted: raw" in frontmatter
92+
assert "start: 0" in frontmatter
93+
assert "end: 5" in frontmatter
94+
assert f"length: {len(html)}" in frontmatter
95+
assert body.startswith("<html")
8196
assert mock_extract.call_count == 0
8297
mock_start.assert_not_called()
8398

@@ -92,8 +107,11 @@ async def test_load_webpage_returns_warning_when_extraction_empty():
92107
):
93108
result = await load_webpage("https://example.com/empty")
94109

95-
assert "Warning: Could not extract readable content" in result
96-
assert "<html><body>No extract</body></html>" in result
110+
frontmatter, body = result.split("\n\n", 1)
111+
assert "extracted: raw" in frontmatter
112+
assert f"length: {len(html)}" in frontmatter
113+
assert "Warning: Could not extract readable content" in body
114+
assert "<html><body>No extract</body></html>" in body
97115

98116

99117
@pytest.mark.asyncio

uv.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)