1616logger = logging .getLogger (__name__ )
1717
1818
def _fetch_with_trafilatura(url: str) -> tuple[str | None, str | None]:
    """Fetch the raw HTML of *url* using trafilatura.

    Returns:
        ``(html, "trafilatura")`` when a non-empty page was fetched,
        ``(None, None)`` otherwise (empty response or any exception).
    """
    try:
        logger.info("Attempting to fetch %s with trafilatura", url)
        html = trafilatura.fetch_url(url)
        if html:
            return html, "trafilatura"
    except Exception as exc:
        # Recoverable: _fetch_html() falls back to zendriver next, so this
        # is a warning, not a terminal error (matching the zendriver helper).
        logger.warning("Error fetching page with trafilatura: %s", exc)
    return None, None
28+
29+
async def _fetch_with_zendriver(url: str) -> tuple[str | None, str | None]:
    """Fetch the raw HTML of *url* by driving a headless browser via zendriver.

    Returns:
        ``(html, "zendriver")`` when a non-empty page was retrieved,
        ``(None, None)`` otherwise. The browser is always stopped, even on
        failure; stop errors are deliberately ignored.
    """
    session = None
    try:
        session = await zd.start(headless=True, sandbox=False)
        page = await session.get(url)
        await page.wait_for_ready_state("complete", timeout=5)
        await page.wait(t=1)  # Allow dynamic content to settle
        markup = await page.get_content()
        if markup:
            return markup, "zendriver"
    except Exception as exc:
        logger.warning("Error fetching page with zendriver: %s", exc)
    finally:
        if session is not None:
            try:
                await session.stop()
            except Exception:
                pass  # Best-effort cleanup; nothing useful to do here.
    return None, None
49+
50+
async def _fetch_html(url: str) -> tuple[str | None, str | None]:
    """Fetch *url*, preferring trafilatura and falling back to zendriver.

    Returns ``(html, provider_name)`` on success, ``(None, None)`` when both
    fetch strategies fail.
    """
    html, provider = _fetch_with_trafilatura(url)
    if not html:
        html, provider = await _fetch_with_zendriver(url)
    return html, provider
56+
57+
def _extract_markdown(html: str) -> tuple[str | None, str | None]:
    """Extract readable markdown content from *html* with trafilatura.

    Returns:
        ``(content, None)`` on successful extraction,
        ``(None, None)`` when trafilatura found nothing to extract, and
        ``(None, error_message)`` when extraction raised — the message is a
        user-facing "Error: ..." string suitable for returning directly.
    """
    try:
        extracted = trafilatura.extract(
            html,
            output_format="markdown",
            include_images=True,
            include_links=True,
        )
    except Exception as exc:
        logger.error("Error extracting content with trafilatura: %s", exc)
        return None, f"Error: Failed to extract readable content: {exc}"
    return (extracted, None) if extracted else (None, None)
72+
73+
74+ def _format_frontmatter (
75+ * ,
76+ fetched : str | None ,
77+ extracted : str | None ,
78+ start : int ,
79+ end : int ,
80+ length : int ,
81+ ) -> str :
82+ lines = ["---" ]
83+ lines .append (f"fetched: { fetched or 'unknown' } " )
84+ lines .append (f"extracted: { extracted or 'none' } " )
85+ lines .append (f"start: { start } " )
86+ lines .append (f"end: { end } " )
87+ lines .append (f"length: { length } " )
88+ lines .append ("---" )
89+ return "\n " .join (lines ) + "\n \n "
90+
91+
92+ def _slice_text (text : str , start : int , limit : int ) -> tuple [str , int ]:
93+ if limit <= 0 :
94+ end = len (text )
95+ else :
96+ end = min (start + limit , len (text ))
97+ return text [start :end ], end
98+
99+
async def load_webpage(
    url: str, limit: int = 10_000, offset: int = 0, raw: bool = False
) -> str:
    """Fetch a webpage and return a slice of its content with a frontmatter header.

    The page is fetched with trafilatura first, then zendriver as a fallback.
    Unless ``raw`` is true, the HTML is converted to markdown; when markdown
    extraction yields nothing, the raw HTML is shown with a warning. The whole
    operation is bounded by a 10-second timeout.

    Args:
        url: Address of the page to load.
        limit: Maximum number of characters to return; <= 0 means no limit.
        offset: Character offset into the content at which the slice starts.
        raw: When true, skip markdown extraction and return raw HTML.

    Returns:
        A frontmatter header (fetch/extraction provider, slice bounds, total
        length) followed by the content slice, or an ``"Error: ..."`` string
        on failure. NOTE(review): original docstring elided in the diff —
        confirm wording against the repository version.
    """
    try:
        async with asyncio.timeout(10):
            html, fetch_provider = await _fetch_html(url)
            if not html:
                logger.error(
                    "Failed to retrieve content from %s using both zendriver and trafilatura",
                    url,
                )
                return f"Error: Failed to retrieve page content from {url} using multiple methods"

            extraction_provider: str | None = None
            warning: str | None = None
            source = html

            if raw:
                extraction_provider = "raw"
            else:
                content, extraction_error = _extract_markdown(html)
                if extraction_error:
                    return extraction_error
                if content:
                    source = content
                    extraction_provider = "trafilatura"
                else:
                    # Extraction found nothing readable: fall back to raw HTML.
                    # Log it too — the pre-refactor code did, and the refactor
                    # dropped the log while keeping the user-facing warning.
                    logger.warning("Failed to extract content from %s", url)
                    extraction_provider = "raw"
                    warning = (
                        f"Warning: Could not extract readable content from {url}. "
                        "Showing raw HTML instead."
                    )

            total_length = len(source)
            content_slice, slice_end = _slice_text(source, offset, limit)
            frontmatter = _format_frontmatter(
                fetched=fetch_provider,
                extracted=extraction_provider,
                start=offset,
                end=slice_end,
                length=total_length,
            )

            parts = [frontmatter]
            if warning:
                parts.append(f"{warning}\n\n")
            parts.append(content_slice)
            return "".join(parts)

    except asyncio.TimeoutError:
        logger.error("Request timed out after 10 seconds for URL: %s", url)
        return f"Error: Request timed out after 10 seconds for URL: {url}"
    except Exception as e:
        # logger.exception keeps the traceback, which the plain error() lost.
        logger.exception("Error loading page: %s", e)
        return f"Error loading page: {str(e)}"
113166
0 commit comments