
Commit a586021

Added async support for extract content and Jupyter Notebook example

1 parent 1d50dc8

File tree

7 files changed (+3235 -43 lines)


README.md (+2)

@@ -1,6 +1,8 @@
 # NeuralNoise: The AI Podcast Studio
 
 <p align="center">
+  <a href="https://colab.research.google.com/drive/1-1aaRFoxJL03oUn7IB0DcfxFeWq7Vw5n?usp=sharing" alt="Open in Google Colab">
+    <img src="https://colab.research.google.com/assets/colab-badge.svg" /></a>
   <a href="https://github.com/badges/shields/pulse" alt="Activity">
     <img src="https://img.shields.io/github/commit-activity/m/leopiney/neuralnoise" /></a>
   <a href="https://pypi.python.org/pypi/neuralnoise" alt="PyPI - Latest version">

examples/01_basics_notebook.ipynb (+3,151)

Large diffs are not rendered by default.

pyproject.toml (+2 -1)

@@ -1,6 +1,6 @@
 [project]
 name = "neuralnoise"
-version = "1.1.0"
+version = "1.2.0"
 description = "An AI-powered podcast studio that uses multiple AI agents working together."
 authors = [
     { name = "Leonardo Piñeyro", email = "[email protected]" }
@@ -45,6 +45,7 @@ dependencies = [
     "python-dotenv>=1.0.1",
     "requests>=2.32.3",
     "tabulate>=0.9.0",
+    "tqdm>=4.66.5",
     "typer>=0.12.5",
     "youtube-transcript-api>=0.6.2",
 ]

src/neuralnoise/__init__.py (+2 -2)

@@ -1,4 +1,4 @@
-from neuralnoise.extract import extract_content
+from neuralnoise.extract import extract_content, aextract_content
 from neuralnoise.studio import create_podcast_episode
 
-__all__ = ["create_podcast_episode", "extract_content"]
+__all__ = ["create_podcast_episode", "extract_content", "aextract_content"]

src/neuralnoise/extract.py (+68 -33)

@@ -1,12 +1,13 @@
+import asyncio
 import logging
 import os
-from asyncio import run
 from pathlib import Path
 from tempfile import NamedTemporaryFile
 from textwrap import dedent
-from typing import Iterator
+from typing import AsyncIterator, Iterator
 
 import requests  # type: ignore
+from crawl4ai import AsyncWebCrawler, CrawlResult
 from langchain_community.document_loaders import (
     BSHTMLLoader,
     PyMuPDFLoader,
@@ -28,9 +29,7 @@ def __init__(
         self.url = url
         self.css_selector = css_selector
 
-    async def crawl(self, url: str, css_selector: str | None = None):
-        from crawl4ai import AsyncWebCrawler
-
+    async def acrawl(self, url: str, css_selector: str | None = None):
         async with AsyncWebCrawler(verbose=True) as crawler:
             result = await crawler.arun(
                 url,
@@ -39,15 +38,10 @@ async def crawl(self, url: str, css_selector: str | None = None):
 
         return result
 
-    def lazy_load(self) -> Iterator[Document]:
-        """Load HTML document into document objects."""
-        # First attempt loading with CSS selector if provided
-        result = run(self.crawl(self.url, self.css_selector))
-
-        # Second attempt loading without CSS selector if first attempt failed
-        if result.markdown is None and self.css_selector is not None:
-            result = run(self.crawl(self.url))
+    def crawl(self, url: str, css_selector: str | None = None):
+        return asyncio.run(self.acrawl(url, css_selector))
 
+    def _process_result(self, result: CrawlResult):
         if result.markdown is None:
             raise ValueError(f"No valid content found at {self.url}")
 
@@ -56,7 +50,29 @@ def lazy_load(self) -> Iterator[Document]:
             "source": self.url,
         }
 
-        yield Document(page_content=result.markdown, metadata=metadata)
+        return Document(page_content=result.markdown, metadata=metadata)
+
+    def lazy_load(self) -> Iterator[Document]:
+        """Load HTML document into document objects."""
+        # First attempt loading with CSS selector if provided
+        result = self.crawl(self.url, self.css_selector)
+
+        # Second attempt loading without CSS selector if first attempt failed
+        if result.markdown is None and self.css_selector is not None:
+            result = self.crawl(self.url)
+
+        yield self._process_result(result)
+
+    async def alazy_load(self) -> AsyncIterator[Document]:
+        """Load HTML document into document objects."""
+        # First attempt loading with CSS selector if provided
+        result = await self.acrawl(self.url, self.css_selector)
+
+        # Second attempt loading without CSS selector if first attempt failed
+        if result.markdown is None and self.css_selector is not None:
+            result = await self.acrawl(self.url)
+
+        yield self._process_result(result)
 
 
 def get_best_loader(extract_from: str | Path) -> BaseLoader:
@@ -76,17 +92,16 @@ def get_best_loader(extract_from: str | Path) -> BaseLoader:
     except Exception:
         logger.warning(
             dedent("""
-            Crawl4AI web loader is not available but it's recommended for
-            better results. Install `pip install neuralnoise[crawl4ai]` to
-            use it, or `pip install crawl4ai` to install it.
+            The Crawl4AI web loader didn't work, but it's recommended for
+            better results. Install it with `pip install crawl4ai`.
 
             Once installed, make sure to follow the instructions in their
             repo: https://github.com/unclecode/crawl4ai
 
-            For example, you should run `playwright install` to install
-            utils for the crawlers to work.
+            For example, you might need to run `playwright install` to
+            install utils for the crawlers to work.
 
-            Using the default web loader now.
+            Falling back to the default BeautifulSoup web loader.
             """)
         )
 
@@ -104,27 +119,47 @@ def get_best_loader(extract_from: str | Path) -> BaseLoader:
     raise ValueError("Invalid input")
 
 
-def extract_content_from_source(extract_from: str | Path) -> str:
+async def _extract_single_source(
+    extract_from: str | Path, use_async: bool = True
+) -> str:
+    """Extract content from a single source with unified async/sync handling."""
     logger.info(f"Extracting content from {extract_from}")
     loader = get_best_loader(extract_from)
-    docs = loader.load()
-    content = ""
 
+    docs = await loader.aload() if use_async else loader.load()
+
+    content_parts = []
     for doc in docs:
         if doc.metadata.get("title"):
-            content += f"\n\n# {doc.metadata['title']}\n\n"
-        content += doc.page_content.strip()
+            content_parts.append(f"\n\n# {doc.metadata['title']}\n\n")
+        content_parts.append(doc.page_content.strip())
 
-    return content
+    return "".join(content_parts)
 
 
-def extract_content(
+async def _extract_multiple_sources(
+    sources: list[str | Path] | list[str] | list[Path], use_async: bool = True
+) -> str:
+    """Extract content from multiple sources and wrap them in document tags."""
+    contents = await asyncio.gather(
+        *[_extract_single_source(source, use_async=use_async) for source in sources]
+    )
+
+    return "\n\n".join(f"<document>\n{content}\n</document>" for content in contents)
+
+
+# Public API functions
+async def aextract_content(
     extract_from: str | Path | list[str] | list[Path] | list[str | Path],
 ) -> str:
-    if not isinstance(extract_from, list):
-        extract_from = [extract_from]
+    """Async version of content extraction."""
+    sources = [extract_from] if not isinstance(extract_from, list) else extract_from
+    return await _extract_multiple_sources(sources, use_async=True)
 
-    return "\n\n".join(
-        f"<document>\n{extract_content_from_source(item)}\n</document>"
-        for item in extract_from
-    )
+
+def extract_content(
+    extract_from: str | Path | list[str] | list[Path] | list[str | Path],
+) -> str:
+    """Sync version of content extraction."""
+    sources = [extract_from] if not isinstance(extract_from, list) else extract_from
+    return asyncio.run(_extract_multiple_sources(sources, use_async=False))
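
Together with the `__init__.py` change, this gives the package matching sync and async entry points. A minimal usage sketch (the file name and URLs below are placeholders, not from this commit); note that `extract_content` wraps the async pipeline in `asyncio.run`, so in an environment where an event loop is already running, such as the Jupyter notebook added by this commit, `aextract_content` should be awaited instead:

import asyncio

from neuralnoise import aextract_content, extract_content

# Sync entry point: for plain scripts with no running event loop.
text = extract_content("notes.pdf")  # placeholder local file

# Async entry point: a list of sources is processed concurrently
# (asyncio.gather in _extract_multiple_sources), and each result is
# wrapped in <document> tags in the combined output.
async def main() -> None:
    combined = await aextract_content([
        "https://example.com/article",  # placeholder URL
        "https://example.com/other",    # placeholder URL
    ])
    print(combined[:200])

asyncio.run(main())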

src/neuralnoise/studio/create.py (+7 -6)

@@ -6,7 +6,7 @@
 
 from pydub import AudioSegment
 from pydub.effects import normalize
-from rich.progress import track
+from tqdm import tqdm
 
 from neuralnoise.studio import PodcastStudio
 from neuralnoise.tts import generate_audio_segment
@@ -33,10 +33,9 @@ def create_podcast_episode_from_script(
 
     audio_segments = []
 
-    for section_id, segment in track(
+    for section_id, segment in tqdm(
         script_segments,
-        description="Generating audio segments...",
-        total=len(script_segments),
+        desc="Generating audio segments",
     ):
         speaker = config.speakers[segment["speaker"]]
         content = segment["content"]
@@ -73,7 +72,7 @@ def create_podcast_episode(
     config_path: str | Path | None = None,
     format: Literal["wav", "mp3", "ogg"] = "wav",
     only_script: bool = False,
-):
+) -> AudioSegment | None:
     # Create output directory
     output_dir = Path("output") / name
    output_dir.mkdir(parents=True, exist_ok=True)
@@ -101,7 +100,7 @@
     script_path.write_text(json.dumps(script, ensure_ascii=False))
 
     if only_script:
-        return
+        return None
 
     # Generate audio segments and create the podcast
     logger.info("🎙️ Recording podcast episode")
@@ -113,3 +112,5 @@
     podcast.export(podcast_filepath, format=format)
 
     logger.info("✅ Podcast generation complete")
+
+    return podcast
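
Because `create_podcast_episode` now returns the assembled `AudioSegment` (or `None` when `only_script=True`), callers can keep working with the audio in memory. A sketch under the assumption that `name` is the first positional parameter, as suggested by the `output_dir = Path("output") / name` line; the function's remaining parameters are not shown in this diff:

from neuralnoise import create_podcast_episode

# Hypothetical call: only parameters visible in the diff are used here.
podcast = create_podcast_episode(
    "my-episode",  # assumed episode name; output goes to output/my-episode
    format="mp3",
    only_script=False,
)

if podcast is not None:
    # pydub reports AudioSegment length in milliseconds
    print(f"Episode duration: {len(podcast) / 1000:.1f} s")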

uv.lock (+3 -1)

Some generated files are not rendered by default.
