+import asyncio
 import logging
 import os
-from asyncio import run
 from pathlib import Path
 from tempfile import NamedTemporaryFile
 from textwrap import dedent
-from typing import Iterator
+from typing import AsyncIterator, Iterator

 import requests  # type: ignore
+from crawl4ai import AsyncWebCrawler, CrawlResult
 from langchain_community.document_loaders import (
     BSHTMLLoader,
     PyMuPDFLoader,
@@ -28,9 +29,7 @@ def __init__(
         self.url = url
         self.css_selector = css_selector

-    async def crawl(self, url: str, css_selector: str | None = None):
-        from crawl4ai import AsyncWebCrawler
-
+    async def acrawl(self, url: str, css_selector: str | None = None):
         async with AsyncWebCrawler(verbose=True) as crawler:
             result = await crawler.arun(
                 url,
@@ -39,15 +38,10 @@ async def crawl(self, url: str, css_selector: str | None = None):

         return result

-    def lazy_load(self) -> Iterator[Document]:
-        """Load HTML document into document objects."""
-        # First attempt loading with CSS selector if provided
-        result = run(self.crawl(self.url, self.css_selector))
-
-        # Second attempt loading without CSS selector if first attempt failed
-        if result.markdown is None and self.css_selector is not None:
-            result = run(self.crawl(self.url))
+    def crawl(self, url: str, css_selector: str | None = None):
+        return asyncio.run(self.acrawl(url, css_selector))

+    def _process_result(self, result: CrawlResult):
         if result.markdown is None:
             raise ValueError(f"No valid content found at {self.url}")

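The sync `crawl` wrapper above is convenient for plain synchronous callers, but `asyncio.run` refuses to start a loop when one is already running, which is why the async paths further down await `acrawl` directly. A minimal sketch of that caveat, assuming `loader` is an instance of the web loader class (its name is not visible in this excerpt):

# Caveat sketch: asyncio.run() cannot be nested inside a running event loop.
# `loader` is assumed to be an instance of the web loader class above.
async def fetch_in_async_context(loader):
    # loader.crawl(...) would call asyncio.run() inside this running loop and
    # raise "asyncio.run() cannot be called from a running event loop".
    return await loader.acrawl(loader.url, loader.css_selector)

def fetch_in_sync_context(loader):
    # No loop is running here, so the asyncio.run()-based wrapper is fine.
    return loader.crawl(loader.url, loader.css_selector)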
@@ -56,7 +50,29 @@ def lazy_load(self) -> Iterator[Document]:
             "source": self.url,
         }

-        yield Document(page_content=result.markdown, metadata=metadata)
+        return Document(page_content=result.markdown, metadata=metadata)
+
+    def lazy_load(self) -> Iterator[Document]:
+        """Load HTML document into document objects."""
+        # First attempt loading with CSS selector if provided
+        result = self.crawl(self.url, self.css_selector)
+
+        # Second attempt loading without CSS selector if first attempt failed
+        if result.markdown is None and self.css_selector is not None:
+            result = self.crawl(self.url)
+
+        yield self._process_result(result)
+
+    async def alazy_load(self) -> AsyncIterator[Document]:
+        """Load HTML document into document objects."""
+        # First attempt loading with CSS selector if provided
+        result = await self.acrawl(self.url, self.css_selector)
+
+        # Second attempt loading without CSS selector if first attempt failed
+        if result.markdown is None and self.css_selector is not None:
+            result = await self.acrawl(self.url)
+
+        yield self._process_result(result)


 def get_best_loader(extract_from: str | Path) -> BaseLoader:
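A minimal consumption sketch for the two loading paths added above; `loader` again stands for an instance of the web loader class, and the `Document` import shown here is the usual langchain-core one assumed to back this module:

# Sketch: consuming the sync and async loading paths.
from langchain_core.documents import Document

def load_sync(loader) -> list[Document]:
    # lazy_load() blocks: crawl() -> asyncio.run(acrawl()).
    return list(loader.lazy_load())

async def load_async(loader) -> list[Document]:
    # alazy_load() awaits acrawl() directly, so it is safe inside a running loop.
    return [doc async for doc in loader.alazy_load()]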
@@ -76,17 +92,16 @@ def get_best_loader(extract_from: str | Path) -> BaseLoader:
     except Exception:
         logger.warning(
             dedent("""
-            Crawl4AI web loader is not available but it's recommended for
-            better results. Install `pip install neuralnoise[crawl4ai]` to
-            use it, or `pip install crawl4ai` to install it.
+            Crawl4AI web loader didn't work. However, it's recommended for
+            better results. Install it with `pip install crawl4ai`.

             Once installed, make sure to follow the instructions in their
             repo: https://github.com/unclecode/crawl4ai

-            For example, you should run `playwright install` to install
-            utils for the crawlers to work.
+            For example, you might need to run `playwright install` to
+            install utils for the crawlers to work.

-            Using the default web loader now.
+            Falling back to the default web loader (BeautifulSoup).
            """)
        )

@@ -104,27 +119,47 @@ def get_best_loader(extract_from: str | Path) -> BaseLoader:
        raise ValueError("Invalid input")


-def extract_content_from_source(extract_from: str | Path) -> str:
+async def _extract_single_source(
+    extract_from: str | Path, use_async: bool = True
+) -> str:
+    """Extract content from a single source with unified async/sync handling."""
     logger.info(f"Extracting content from {extract_from}")
     loader = get_best_loader(extract_from)
-    docs = loader.load()
-    content = ""

+    docs = await loader.aload() if use_async else loader.load()
+
+    content_parts = []
     for doc in docs:
         if doc.metadata.get("title"):
-            content += f"\n\n# {doc.metadata['title']}\n\n"
-        content += doc.page_content.strip()
+            content_parts.append(f"\n\n# {doc.metadata['title']}\n\n")
+        content_parts.append(doc.page_content.strip())

-    return content
+    return "".join(content_parts)


-def extract_content(
+async def _extract_multiple_sources(
+    sources: list[str | Path] | list[str] | list[Path], use_async: bool = True
+) -> str:
+    """Extract content from multiple sources and wrap them in document tags."""
+    contents = await asyncio.gather(
+        *[_extract_single_source(source, use_async=use_async) for source in sources]
+    )
+
+    return "\n\n".join(f"<document>\n{content}\n</document>" for content in contents)
+
+
+# Public API functions
+async def aextract_content(
     extract_from: str | Path | list[str] | list[Path] | list[str | Path],
 ) -> str:
-    if not isinstance(extract_from, list):
-        extract_from = [extract_from]
+    """Async version of content extraction."""
+    sources = [extract_from] if not isinstance(extract_from, list) else extract_from
+    return await _extract_multiple_sources(sources, use_async=True)

-    return "\n\n".join(
-        f"<document>\n{extract_content_from_source(item)}\n</document>"
-        for item in extract_from
-    )
+
+def extract_content(
+    extract_from: str | Path | list[str] | list[Path] | list[str | Path],
+) -> str:
+    """Sync version of content extraction."""
+    sources = [extract_from] if not isinstance(extract_from, list) else extract_from
+    return asyncio.run(_extract_multiple_sources(sources, use_async=False))
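A hedged usage sketch for the new public entry points; the source URLs/paths are illustrative placeholders, and the two functions are assumed to be imported from the module shown in this diff:

# Sketch: calling the new sync and async extraction APIs.
import asyncio

sources = ["https://example.com/article", "notes/report.pdf"]  # placeholders

# Sync entry point: starts its own event loop internally via asyncio.run().
combined = extract_content(sources)
print(combined.count("<document>"))  # one <document> wrapper per source

# Async entry point: sources are processed concurrently through asyncio.gather().
async def main() -> None:
    combined = await aextract_content(sources)
    print(combined.count("<document>"))

asyncio.run(main())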