Skip to content

Commit 1b52388

Browse files
committed
feat: update document toolkit with crawl4ai
1 parent eb7cb00 commit 1b52388

File tree

4 files changed

+59
-9
lines changed

4 files changed

+59
-9
lines changed

owl/utils/document_toolkit.py

Lines changed: 22 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,8 @@
3030
import xmltodict
3131
import nest_asyncio
3232
import traceback
33-
import html2text
33+
import asyncio
34+
from crawl4ai import AsyncWebCrawler
3435

3536
nest_asyncio.apply()
3637

@@ -237,20 +238,32 @@ def _extract_webpage_content(self, url: str) -> str:
237238

238239
return str(data["data"][0]["markdown"])
239240
else:
240-
logger.warning("Firecrawl API key is not set. Use html2text to extract the content of the webpage.")
241-
return self._extract_webpage_content_with_html2text(url)
241+
logger.warning("Firecrawl API key is not set. Use crawl4ai to extract the content of the webpage.")
242+
return self._extract_webpage_content_with_crawl4ai(url)
242243

243244

244-
def _extract_webpage_content_with_html2text(self, url: str) -> str:
245-
r"""Extract the content of a webpage using html2text."""
246-
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
245+
def _extract_webpage_content_with_crawl4ai(self, url: str) -> str:
246+
r"""Extract the content of a webpage using crawl4ai."""
247247
try:
248-
response = requests.get(url, headers={"User-Agent": user_agent})
249-
response.raise_for_status()
250-
return html2text.html2text(response.text)
248+
# Use asyncio.run to execute the async function
249+
return asyncio.run(self._async_extract_webpage_content_with_crawl4ai(url))
251250
except Exception as e:
252251
logger.error(f"Error while extracting the content of the webpage: {e}")
253252
return "Error while extracting the content of the webpage."
253+
254+
async def _async_extract_webpage_content_with_crawl4ai(self, url: str) -> str:
    r"""Async helper method to extract webpage content using crawl4ai.

    Args:
        url (str): The URL of the webpage to extract content from.

    Returns:
        str: The extracted markdown content on success; otherwise a
            human-readable fallback message ("No content found on the
            webpage." or "Error while extracting the content of the
            webpage.") — errors are reported as strings, never raised.
    """
    try:
        # AsyncWebCrawler is used as an async context manager so the
        # underlying browser/session is started and torn down around
        # the single crawl below.
        async with AsyncWebCrawler(verbose=False) as crawler:
            result = await crawler.arun(url=url)
            # NOTE(review): only the markdown field of the crawl result
            # is used; an empty/None markdown is treated as "no content".
            if result.markdown:
                return result.markdown
            else:
                logger.warning("No markdown content extracted from the webpage.")
                return "No content found on the webpage."
    except Exception as e:
        # Broad catch at this boundary: any network/browser failure is
        # logged and converted to a message string for the sync caller.
        logger.error(f"Error while extracting the content of the webpage with crawl4ai: {e}")
        return "Error while extracting the content of the webpage."
254267

255268

256269
def _download_file(self, url: str):

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ dependencies = [
2828
"mcp-server-fetch==2025.1.17",
2929
"xmltodict>=0.14.2",
3030
"firecrawl>=2.5.3",
31+
"crawl4ai>=0.3.0",
3132
"mistralai>=1.7.0",
3233
"retry==0.9.2",
3334
]

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,4 +5,5 @@ mcp-simple-arxiv==0.2.2
55
mcp-server-fetch==2025.1.17
66
xmltodict>=0.14.2
77
firecrawl>=2.5.3
8+
crawl4ai>=0.3.0
89
retry==0.9.2

uv.lock

Lines changed: 35 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments (0)