@@ -29,7 +29,7 @@ def __init__(
         self.url = url
         self.css_selector = css_selector
 
-    async def acrawl(self, url: str, css_selector: str | None = None):
+    async def crawl(self, url: str, css_selector: str | None = None):
         async with AsyncWebCrawler(verbose=True) as crawler:
             result = await crawler.arun(
                 url,
@@ -38,9 +38,6 @@ async def acrawl(self, url: str, css_selector: str | None = None):
 
         return result
 
-    def crawl(self, url: str, css_selector: str | None = None):
-        return asyncio.run(self.acrawl(url, css_selector))
-
     def _process_result(self, result: CrawlResult):
         if result.markdown is None:
             raise ValueError(f"No valid content found at {self.url}")
@@ -52,25 +49,14 @@ def _process_result(self, result: CrawlResult):
 
         return Document(page_content=result.markdown, metadata=metadata)
 
-    def lazy_load(self) -> Iterator[Document]:
-        """Load HTML document into document objects."""
-        # First attempt loading with CSS selector if provided
-        result = self.crawl(self.url, self.css_selector)
-
-        # Second attempt loading without CSS selector if first attempt failed
-        if result.markdown is None and self.css_selector is not None:
-            result = self.crawl(self.url)
-
-        yield self._process_result(result)
-
     async def alazy_load(self) -> AsyncIterator[Document]:
         """Load HTML document into document objects."""
         # First attempt loading with CSS selector if provided
-        result = await self.acrawl(self.url, self.css_selector)
+        result = await self.crawl(self.url, self.css_selector)
 
         # Second attempt loading without CSS selector if first attempt failed
         if result.markdown is None and self.css_selector is not None:
-            result = self.crawl(self.url)
+            result = await self.crawl(self.url)
 
         yield self._process_result(result)
 
@@ -126,7 +112,11 @@ async def _extract_single_source(
     logger.info(f"Extracting content from {extract_from}")
    loader = get_best_loader(extract_from)
 
-    docs = await loader.aload() if use_async else loader.load()
+    docs = (
+        await loader.aload()
+        if use_async or isinstance(loader, Crawl4AILoader)
+        else loader.load()
+    )
 
     content_parts = []
     for doc in docs:
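For reference, a minimal sketch of how a caller would consume the loader after this change, now that the synchronous crawl()/lazy_load() wrappers are removed and only the async path remains. The class name Crawl4AILoader and its url/css_selector constructor arguments are taken from the hunks above; the import path and the URL/selector values are placeholders, not part of the commit.

    # Usage sketch (assumptions: Crawl4AILoader is the class edited above;
    # the import path below is hypothetical).
    import asyncio

    from your_package.loaders import Crawl4AILoader  # hypothetical import path

    async def main() -> None:
        # Constructor arguments match the __init__ shown in the first hunk.
        loader = Crawl4AILoader(url="https://example.com", css_selector="article")
        # alazy_load() is an async generator, so it is consumed with `async for`.
        docs = [doc async for doc in loader.alazy_load()]
        print(docs[0].page_content[:200])

    asyncio.run(main())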