+import asyncio
 import logging
 import os
-from asyncio import run
 from pathlib import Path
 from tempfile import NamedTemporaryFile
 from textwrap import dedent
-from typing import Iterator
+from typing import AsyncIterator, Iterator

 import requests  # type: ignore
+from crawl4ai import AsyncWebCrawler, CrawlResult
 from langchain_community.document_loaders import (
     BSHTMLLoader,
     PyMuPDFLoader,
@@ -28,9 +29,7 @@ def __init__(
         self.url = url
         self.css_selector = css_selector

-    async def crawl(self, url: str, css_selector: str | None = None):
-        from crawl4ai import AsyncWebCrawler
-
+    async def acrawl(self, url: str, css_selector: str | None = None):
         async with AsyncWebCrawler(verbose=True) as crawler:
             result = await crawler.arun(
                 url,
@@ -39,15 +38,10 @@ async def crawl(self, url: str, css_selector: str | None = None):

         return result

-    def lazy_load(self) -> Iterator[Document]:
-        """Load HTML document into document objects."""
-        # First attempt loading with CSS selector if provided
-        result = run(self.crawl(self.url, self.css_selector))
-
-        # Second attempt loading without CSS selector if first attempt failed
-        if result.markdown is None and self.css_selector is not None:
-            result = run(self.crawl(self.url))
+    def crawl(self, url: str, css_selector: str | None = None):
+        return asyncio.run(self.acrawl(url, css_selector))

+    def _process_result(self, result: CrawlResult):
         if result.markdown is None:
             raise ValueError(f"No valid content found at {self.url}")

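The new crawl() above is a thin synchronous facade over acrawl(). One caveat worth keeping in mind: asyncio.run() raises RuntimeError when called from a thread that already has a running event loop, so the facade is only safe from synchronous call sites; async callers should await acrawl() directly. A minimal standalone sketch of the pattern (the function bodies are illustrative stand-ins, not the commit's code):

import asyncio

async def acrawl(url: str) -> str:
    # Stand-in for the real crawler round-trip.
    await asyncio.sleep(0)
    return f"<markdown for {url}>"

def crawl(url: str) -> str:
    # Sync facade: fine from plain synchronous code...
    return asyncio.run(acrawl(url))

async def main() -> None:
    # ...but inside a running loop, await the coroutine directly;
    # calling crawl() here would raise RuntimeError.
    print(await acrawl("https://example.com"))

print(crawl("https://example.com"))
asyncio.run(main())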
@@ -56,7 +50,29 @@ def lazy_load(self) -> Iterator[Document]:
             "source": self.url,
         }

-        yield Document(page_content=result.markdown, metadata=metadata)
+        return Document(page_content=result.markdown, metadata=metadata)
+
+    def lazy_load(self) -> Iterator[Document]:
+        """Load HTML document into document objects."""
+        # First attempt loading with CSS selector if provided
+        result = self.crawl(self.url, self.css_selector)
+
+        # Second attempt loading without CSS selector if first attempt failed
+        if result.markdown is None and self.css_selector is not None:
+            result = self.crawl(self.url)
+
+        yield self._process_result(result)
+
+    async def alazy_load(self) -> AsyncIterator[Document]:
+        """Load HTML document into document objects."""
+        # First attempt loading with CSS selector if provided
+        result = await self.acrawl(self.url, self.css_selector)
+
+        # Second attempt loading without CSS selector if first attempt failed
+        if result.markdown is None and self.css_selector is not None:
+            result = await self.acrawl(self.url)
+
+        yield self._process_result(result)


 def get_best_loader(extract_from: str | Path) -> BaseLoader:
@@ -76,17 +92,16 @@ def get_best_loader(extract_from: str | Path) -> BaseLoader:
     except Exception:
         logger.warning(
             dedent("""
-            Crawl4AI web loader is not available but it's recommended for
-            better results. Install `pip install neuralnoise[crawl4ai]` to
-            use it, or `pip install crawl4ai` to install it.
+            Crawl4AI web loader didn't work. However, it's recommended for
+            better results. Install it with `pip install crawl4ai`.

             Once installed, make sure to follow the instructions in their
             repo: https://github.com/unclecode/crawl4ai

-            For example, you should run `playwright install` to install
-            utils for the crawlers to work.
+            For example, you might need to run `playwright install` to
+            install utils for the crawlers to work.

-            Using the default web loader now.
+            Falling back to the default web loader based on BeautifulSoup.
             """)
         )

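For context, this message lives in the fallback branch of the loader selection: the Crawl4AI-backed loader is tried first, and on any failure the BeautifulSoup-based BSHTMLLoader takes over. Since BSHTMLLoader reads local files, the fallback presumably downloads the page first, which is consistent with the requests and NamedTemporaryFile imports at the top of the module. A hedged sketch of that fallback shape (not the commit's exact code, most of which sits outside this hunk):

import requests
from tempfile import NamedTemporaryFile
from langchain_community.document_loaders import BSHTMLLoader

def fallback_html_loader(url: str) -> BSHTMLLoader:
    # BSHTMLLoader expects a file path, so fetch the page to a temp file first.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    with NamedTemporaryFile(
        mode="w", suffix=".html", encoding="utf-8", delete=False
    ) as f:
        f.write(response.text)
    return BSHTMLLoader(f.name)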
@@ -104,27 +119,47 @@ def get_best_loader(extract_from: str | Path) -> BaseLoader:
     raise ValueError("Invalid input")


-def extract_content_from_source(extract_from: str | Path) -> str:
+async def _extract_single_source(
+    extract_from: str | Path, use_async: bool = True
+) -> str:
+    """Extract content from a single source with unified async/sync handling."""
     logger.info(f"Extracting content from {extract_from}")
     loader = get_best_loader(extract_from)
-    docs = loader.load()
-    content = ""

+    docs = await loader.aload() if use_async else loader.load()
+
+    content_parts = []
     for doc in docs:
         if doc.metadata.get("title"):
-            content += f"\n\n# {doc.metadata['title']}\n\n"
-        content += doc.page_content.strip()
+            content_parts.append(f"\n\n# {doc.metadata['title']}\n\n")
+        content_parts.append(doc.page_content.strip())

-    return content
+    return "".join(content_parts)


-def extract_content(
+async def _extract_multiple_sources(
+    sources: list[str | Path] | list[str] | list[Path], use_async: bool = True
+) -> str:
+    """Extract content from multiple sources and wrap them in document tags."""
+    contents = await asyncio.gather(
+        *[_extract_single_source(source, use_async=use_async) for source in sources]
+    )
+
+    return "\n\n".join(f"<document>\n{content}\n</document>" for content in contents)
+
+
+# Public API functions
+async def aextract_content(
     extract_from: str | Path | list[str] | list[Path] | list[str | Path],
 ) -> str:
-    if not isinstance(extract_from, list):
-        extract_from = [extract_from]
+    """Async version of content extraction."""
+    sources = [extract_from] if not isinstance(extract_from, list) else extract_from
+    return await _extract_multiple_sources(sources, use_async=True)

-    return "\n\n".join(
-        f"<document>\n{extract_content_from_source(item)}\n</document>"
-        for item in extract_from
-    )
+
+def extract_content(
+    extract_from: str | Path | list[str] | list[Path] | list[str | Path],
+) -> str:
+    """Sync version of content extraction."""
+    sources = [extract_from] if not isinstance(extract_from, list) else extract_from
+    return asyncio.run(_extract_multiple_sources(sources, use_async=False))
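Taken together, the commit leaves two public entry points with the same signature. A minimal usage sketch follows; the import path neuralnoise.extract is an assumption inferred from the `pip install neuralnoise[crawl4ai]` hint in the old warning text, and the URL and file name are placeholders:

import asyncio

from neuralnoise.extract import aextract_content, extract_content  # assumed path

# Sync entry point: drives the pipeline through asyncio.run() internally,
# so it must not be called from inside an already-running event loop.
print(extract_content("https://example.com")[:200])

# Async entry point: sources are fetched concurrently via asyncio.gather(),
# and each result is wrapped in <document> tags.
async def main() -> None:
    merged = await aextract_content(["https://example.com", "notes.html"])
    print(merged[:200])

asyncio.run(main())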