Skip to content

Commit 2e69143

Browse files
committed
Fix asyncio problems with crawl4ai extractor
1 parent a586021 commit 2e69143

File tree

3 files changed

+10
-20
lines changed

3 files changed

+10
-20
lines changed

pyproject.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "neuralnoise"
3-
version = "1.2.0"
3+
version = "1.3.0"
44
description = "An AI-powered podcast studio that uses multiple AI agents working together."
55
authors = [
66
{ name = "Leonardo Piñeyro", email = "[email protected]" }

src/neuralnoise/extract.py

+8-18
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ def __init__(
2929
self.url = url
3030
self.css_selector = css_selector
3131

32-
async def acrawl(self, url: str, css_selector: str | None = None):
32+
async def crawl(self, url: str, css_selector: str | None = None):
3333
async with AsyncWebCrawler(verbose=True) as crawler:
3434
result = await crawler.arun(
3535
url,
@@ -38,9 +38,6 @@ async def acrawl(self, url: str, css_selector: str | None = None):
3838

3939
return result
4040

41-
def crawl(self, url: str, css_selector: str | None = None):
42-
return asyncio.run(self.acrawl(url, css_selector))
43-
4441
def _process_result(self, result: CrawlResult):
4542
if result.markdown is None:
4643
raise ValueError(f"No valid content found at {self.url}")
@@ -52,25 +49,14 @@ def _process_result(self, result: CrawlResult):
5249

5350
return Document(page_content=result.markdown, metadata=metadata)
5451

55-
def lazy_load(self) -> Iterator[Document]:
56-
"""Load HTML document into document objects."""
57-
# First attempt loading with CSS selector if provided
58-
result = self.crawl(self.url, self.css_selector)
59-
60-
# Second attempt loading without CSS selector if first attempt failed
61-
if result.markdown is None and self.css_selector is not None:
62-
result = self.crawl(self.url)
63-
64-
yield self._process_result(result)
65-
6652
async def alazy_load(self) -> AsyncIterator[Document]:
6753
"""Load HTML document into document objects."""
6854
# First attempt loading with CSS selector if provided
69-
result = await self.acrawl(self.url, self.css_selector)
55+
result = await self.crawl(self.url, self.css_selector)
7056

7157
# Second attempt loading without CSS selector if first attempt failed
7258
if result.markdown is None and self.css_selector is not None:
73-
result = self.crawl(self.url)
59+
result = await self.crawl(self.url)
7460

7561
yield self._process_result(result)
7662

@@ -126,7 +112,11 @@ async def _extract_single_source(
126112
logger.info(f"Extracting content from {extract_from}")
127113
loader = get_best_loader(extract_from)
128114

129-
docs = await loader.aload() if use_async else loader.load()
115+
docs = (
116+
await loader.aload()
117+
if use_async or isinstance(loader, Crawl4AILoader)
118+
else loader.load()
119+
)
130120

131121
content_parts = []
132122
for doc in docs:

uv.lock

+1-1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments (0)