@@ -30,7 +30,8 @@
 import xmltodict
 import nest_asyncio
 import traceback
-import html2text
+import asyncio
+from crawl4ai import AsyncWebCrawler
 
 nest_asyncio.apply()
 
@@ -237,20 +238,32 @@ def _extract_webpage_content(self, url: str) -> str:
 
             return str(data["data"][0]["markdown"])
         else:
-            logger.warning("Firecrawl API key is not set. Use html2text to extract the content of the webpage.")
-            return self._extract_webpage_content_with_html2text(url)
+            logger.warning("Firecrawl API key is not set. Using crawl4ai to extract the content of the webpage.")
+            return self._extract_webpage_content_with_crawl4ai(url)
 
 
-    def _extract_webpage_content_with_html2text(self, url: str) -> str:
-        r"""Extract the content of a webpage using html2text."""
-        user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
+    def _extract_webpage_content_with_crawl4ai(self, url: str) -> str:
+        r"""Extract the content of a webpage using crawl4ai."""
         try:
-            response = requests.get(url, headers={"User-Agent": user_agent})
-            response.raise_for_status()
-            return html2text.html2text(response.text)
+            # Use asyncio.run to execute the async helper; nest_asyncio.apply() above makes this safe inside a running loop.
+            return asyncio.run(self._async_extract_webpage_content_with_crawl4ai(url))
         except Exception as e:
             logger.error(f"Error while extracting the content of the webpage: {e}")
             return "Error while extracting the content of the webpage."
+
+    async def _async_extract_webpage_content_with_crawl4ai(self, url: str) -> str:
+        r"""Async helper method to extract webpage content using crawl4ai."""
+        try:
+            async with AsyncWebCrawler(verbose=False) as crawler:
+                result = await crawler.arun(url=url)
+                if result.markdown:
+                    return result.markdown
+                else:
+                    logger.warning("No markdown content extracted from the webpage.")
+                    return "No content found on the webpage."
+        except Exception as e:
+            logger.error(f"Error while extracting the content of the webpage with crawl4ai: {e}")
+            return "Error while extracting the content of the webpage."
 
 
     def _download_file(self, url: str):
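For reference, a minimal standalone sketch of the new extraction path: the AsyncWebCrawler context manager, arun(), and result.markdown are used exactly as in the diff above, while the helper name fetch_markdown and the example URL are illustrative, not part of the change.

import asyncio
from crawl4ai import AsyncWebCrawler

async def fetch_markdown(url: str) -> str:
    # Mirrors _async_extract_webpage_content_with_crawl4ai above:
    # crawl4ai renders the page and exposes a markdown conversion of it.
    async with AsyncWebCrawler(verbose=False) as crawler:
        result = await crawler.arun(url=url)
        return result.markdown or "No content found on the webpage."

if __name__ == "__main__":
    # Example URL is illustrative only.
    print(asyncio.run(fetch_markdown("https://example.com")))

Outside the toolkit class there is no running event loop, so plain asyncio.run suffices here; the nest_asyncio.apply() call in the diff is only needed when the method may be invoked from code that is already inside a loop.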