1+ import asyncio
12import warnings
23from typing import Awaitable , Callable , Literal , TypedDict
34
@@ -21,11 +22,10 @@ class ProxySettings(TypedDict, total=False):
2122class PageLoader :
2223 def __init__ (
2324 self ,
24- browser : Literal [ "firefox" , "chromium" ] = "firefox" ,
25+ browser : Browser | None = None
2526 ):
26- self ._browser_id = browser
2727 self .playwright : Playwright | None = None
28- self .browser : Browser | None = None
28+ self .browser : Browser | None = browser
2929 self .context : BrowserContext | None = None
3030 self .page : Page | None = None
3131
@@ -36,10 +36,7 @@ async def new_browser(self) -> None:
3636 if self .browser :
3737 await self .browser .close ()
3838
39- if self ._browser_id == "firefox" :
40- self .browser = await self .playwright .firefox .launch (headless = True )
41- else :
42- self .browser = await self .playwright .chromium .launch (headless = True )
39+ self .browser = await self .playwright .firefox .launch (headless = True )
4340
4441 async def stealth (self , page : Page ) -> Page :
4542 user_agent = await self .page .evaluate ("navigator.userAgent" )
@@ -70,23 +67,76 @@ async def create_session(
async def fetch_page(
    self,
    url: str,
    scrolls_limit: int = 0,
    load_state: Literal[
        "domcontentloaded", "load", "networkidle"
    ] = "domcontentloaded",
    playwright_script: Callable[[Page], Awaitable[Page]] | None = None,
    *,
    scroll_pause: float = 2.0,
) -> str:
    """Navigate to *url* and return the page markup as a string.

    Args:
        url: Address passed to ``self.page.goto``.
        scrolls_limit: Maximum number of scroll-to-bottom rounds. When it is
            zero or negative no scrolling is done and the result is a single
            ``self.page.content()`` snapshot.
        load_state: Load state waited for after navigation.
        playwright_script: Optional coroutine function that receives the
            current page and returns the page to continue with.
        scroll_pause: Seconds to wait after each scroll before taking a
            snapshot.

    Returns:
        Without scrolling, one content snapshot. With scrolling, every
        snapshot taken after each scroll concatenated in order, followed by
        the ``outerHTML`` of element nodes removed from the DOM while
        scrolling. NOTE: the scrolling result is several snapshots joined
        together, so it is not a single well-formed document.
    """
    await self.page.goto(url)
    await self.page.wait_for_load_state(load_state)

    if playwright_script:
        self.page = await playwright_script(self.page)

    # Without scrolling there is nothing to accumulate: return the page as it
    # is. (Joining an empty snapshot list would otherwise yield an empty
    # string instead of the page content.)
    if scrolls_limit <= 0:
        return await self.page.content()

    # Record elements removed from the DOM while scrolling, so content that
    # the page drops as it loads more is not lost.
    await self.page.evaluate(
        """
        window.removedContent = [];
        const observer = new MutationObserver((mutations) => {
            mutations.forEach(mutation => {
                if (mutation.removedNodes.length > 0) {
                    mutation.removedNodes.forEach(node => {
                        if (node.nodeType === 1) { // Only store element nodes
                            window.removedContent.push(node.outerHTML);
                        }
                    });
                }
            });
        });
        observer.observe(document.body, { childList: true, subtree: true });
        """
    )

    captured_content: list[str] = []
    last_height = 0

    for _ in range(scrolls_limit):
        # Scroll down to the bottom of the page and give it time to load.
        await self.page.evaluate("window.scrollTo(0, document.body.scrollHeight);")
        await asyncio.sleep(scroll_pause)

        captured_content.append(await self.page.content())

        # An unchanged scroll height means no new content was loaded.
        new_height = await self.page.evaluate("document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    removed_content = await self.page.evaluate("window.removedContent.join('')")

    return "".join(captured_content) + removed_content
86135
87136 async def load_content (
88137 self ,
89138 url : str ,
139+ scrolls_limit : int = 0 ,
90140 proxy_settings : ProxySettings | None = None ,
91141 load_state : Literal [
92142 "domcontentloaded" , "load" , "networkidle"
@@ -95,46 +145,13 @@ async def load_content(
95145 ):
96146 await self .create_session (proxy_settings = proxy_settings )
97147 return await self .fetch_page (
98- url = url , load_state = load_state , playwright_script = playwright_script
148+ url = url ,
149+ scrolls_limit = scrolls_limit ,
150+ load_state = load_state ,
151+ playwright_script = playwright_script ,
99152 )
100153
async def close(self) -> None:
    """Close the browser and stop the Playwright instance held by the loader.

    Does nothing when ``self.playwright`` is unset. Otherwise the browser is
    closed if one is present, Playwright is stopped even if closing the
    browser raises, and both attributes are reset to ``None`` so a repeated
    call is a no-op.
    """
    if not self.playwright:
        return

    try:
        # A browser may never have been started for this loader.
        if self.browser:
            await self.browser.close()
    finally:
        # stop() is awaited here; the fix assumes self.playwright comes from
        # the async Playwright API, where stop() is a coroutine.
        await self.playwright.stop()
        self.browser = None
        self.playwright = None
105-
106-
107- async def fetch_page_content (
108- url : str ,
109- proxy_settings : ProxySettings | None = None ,
110- browser : str = "firefox" ,
111- ) -> str :
112- warnings .warn (
113- "fetch_page_content is deprecated and will be removed" ,
114- DeprecationWarning ,
115- )
116- async with async_playwright () as p :
117- # Launch the browser
118- if browser == "firefox" :
119- browser = await p .firefox .launch (headless = True )
120- else :
121- browser = await p .chromium .launch (headless = True )
122- # Open a new browser context
123- context = await browser .new_context (proxy = proxy_settings )
124- # Open a new page
125- page = await context .new_page ()
126- await stealth_async (page )
127-
128- # Navigate to the URL
129- # await page.route("**/*.{png,jpg,jpeg}", lambda route: route.abort()) # Can speed up requests
130- await page .goto (url )
131-
132- # Wait for the content to be dynamically loaded
133- await page .wait_for_load_state ("domcontentloaded" )
134- # Get the page content
135- content = await page .content ()
136-
137- # Close the browser
138- await browser .close ()
139-
140- return content
0 commit comments