Skip to content

Commit 2e69143

Browse files
committed
Fix asyncio problems with crawl4ai extractor
1 parent a586021 commit 2e69143

File tree

3 files changed

+10
-20
lines changed

3 files changed

+10
-20
lines changed

pyproject.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "neuralnoise"
3-
version = "1.2.0"
3+
version = "1.3.0"
44
description = "An AI-powered podcast studio that uses multiple AI agents working together."
55
authors = [
66
{ name = "Leonardo Piñeyro", email = "[email protected]" }

src/neuralnoise/extract.py

+8-18
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ def __init__(
2929
self.url = url
3030
self.css_selector = css_selector
3131

32-
async def acrawl(self, url: str, css_selector: str | None = None):
32+
async def crawl(self, url: str, css_selector: str | None = None):
3333
async with AsyncWebCrawler(verbose=True) as crawler:
3434
result = await crawler.arun(
3535
url,
@@ -38,9 +38,6 @@ async def acrawl(self, url: str, css_selector: str | None = None):
3838

3939
return result
4040

41-
def crawl(self, url: str, css_selector: str | None = None):
42-
return asyncio.run(self.acrawl(url, css_selector))
43-
4441
def _process_result(self, result: CrawlResult):
4542
if result.markdown is None:
4643
raise ValueError(f"No valid content found at {self.url}")
@@ -52,25 +49,14 @@ def _process_result(self, result: CrawlResult):
5249

5350
return Document(page_content=result.markdown, metadata=metadata)
5451

55-
def lazy_load(self) -> Iterator[Document]:
56-
"""Load HTML document into document objects."""
57-
# First attempt loading with CSS selector if provided
58-
result = self.crawl(self.url, self.css_selector)
59-
60-
# Second attempt loading without CSS selector if first attempt failed
61-
if result.markdown is None and self.css_selector is not None:
62-
result = self.crawl(self.url)
63-
64-
yield self._process_result(result)
65-
6652
async def alazy_load(self) -> AsyncIterator[Document]:
6753
"""Load HTML document into document objects."""
6854
# First attempt loading with CSS selector if provided
69-
result = await self.acrawl(self.url, self.css_selector)
55+
result = await self.crawl(self.url, self.css_selector)
7056

7157
# Second attempt loading without CSS selector if first attempt failed
7258
if result.markdown is None and self.css_selector is not None:
73-
result = self.crawl(self.url)
59+
result = await self.crawl(self.url)
7460

7561
yield self._process_result(result)
7662

@@ -126,7 +112,11 @@ async def _extract_single_source(
126112
logger.info(f"Extracting content from {extract_from}")
127113
loader = get_best_loader(extract_from)
128114

129-
docs = await loader.aload() if use_async else loader.load()
115+
docs = (
116+
await loader.aload()
117+
if use_async or isinstance(loader, Crawl4AILoader)
118+
else loader.load()
119+
)
130120

131121
content_parts = []
132122
for doc in docs:

uv.lock

+1-1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments (0)