Skip to content

Commit 1b52388

Browse files
committed
feat: update document toolkit with crawl4ai
1 parent eb7cb00 commit 1b52388

File tree

4 files changed

+59
-9
lines changed

4 files changed

+59
-9
lines changed

owl/utils/document_toolkit.py

Lines changed: 22 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,8 @@
3030
import xmltodict
3131
import nest_asyncio
3232
import traceback
33-
import html2text
33+
import asyncio
34+
from crawl4ai import AsyncWebCrawler
3435

3536
nest_asyncio.apply()
3637

@@ -237,20 +238,32 @@ def _extract_webpage_content(self, url: str) -> str:
237238

238239
return str(data["data"][0]["markdown"])
239240
else:
240-
logger.warning("Firecrawl API key is not set. Use html2text to extract the content of the webpage.")
241-
return self._extract_webpage_content_with_html2text(url)
241+
logger.warning("Firecrawl API key is not set. Use crawl4ai to extract the content of the webpage.")
242+
return self._extract_webpage_content_with_crawl4ai(url)
242243

243244

244-
def _extract_webpage_content_with_html2text(self, url: str) -> str:
245-
r"""Extract the content of a webpage using html2text."""
246-
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
245+
def _extract_webpage_content_with_crawl4ai(self, url: str) -> str:
246+
r"""Extract the content of a webpage using crawl4ai."""
247247
try:
248-
response = requests.get(url, headers={"User-Agent": user_agent})
249-
response.raise_for_status()
250-
return html2text.html2text(response.text)
248+
# Use asyncio.run to execute the async function
249+
return asyncio.run(self._async_extract_webpage_content_with_crawl4ai(url))
251250
except Exception as e:
252251
logger.error(f"Error while extracting the content of the webpage: {e}")
253252
return "Error while extracting the content of the webpage."
253+
254+
async def _async_extract_webpage_content_with_crawl4ai(self, url: str) -> str:
    r"""Async helper method to extract webpage content using crawl4ai.

    Args:
        url (str): The URL of the webpage to extract content from.

    Returns:
        str: The extracted markdown content on success; otherwise a
            human-readable fallback message ("No content found on the
            webpage." or "Error while extracting the content of the
            webpage.") — errors are reported as strings, never raised.
    """
    try:
        # AsyncWebCrawler is used as an async context manager so the
        # underlying browser/session is started and torn down around
        # the single crawl below.
        async with AsyncWebCrawler(verbose=False) as crawler:
            result = await crawler.arun(url=url)
            # NOTE(review): only the markdown field of the crawl result
            # is used; an empty/None markdown is treated as "no content".
            if result.markdown:
                return result.markdown
            else:
                logger.warning("No markdown content extracted from the webpage.")
                return "No content found on the webpage."
    except Exception as e:
        # Broad catch at this boundary: any network/browser failure is
        # logged and converted to a message string for the sync caller.
        logger.error(f"Error while extracting the content of the webpage with crawl4ai: {e}")
        return "Error while extracting the content of the webpage."
254267

255268

256269
def _download_file(self, url: str):

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ dependencies = [
2828
"mcp-server-fetch==2025.1.17",
2929
"xmltodict>=0.14.2",
3030
"firecrawl>=2.5.3",
31+
"crawl4ai>=0.3.0",
3132
"mistralai>=1.7.0",
3233
"retry==0.9.2",
3334
]

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,4 +5,5 @@ mcp-simple-arxiv==0.2.2
55
mcp-server-fetch==2025.1.17
66
xmltodict>=0.14.2
77
firecrawl>=2.5.3
8+
crawl4ai>=0.3.0
89
retry==0.9.2

uv.lock

Lines changed: 35 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments (0)