Skip to content

Commit 338c375

Browse files
authored
Merge pull request #18 from danyathecoder/main
Add infinite page scrolling
2 parents 0fbba0d + 08deefb commit 338c375

File tree

10 files changed

+190
-62
lines changed

10 files changed

+190
-62
lines changed

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -87,10 +87,10 @@ Before you run `Parsera` as command line tool don't forget to put your `OPENAI_A
8787
### Usage
8888

8989
You can configure elements to parse using `JSON string` or `FILE`.
90-
Optionally, you can provide `FILE` to write output.
90+
Optionally, you can provide `FILE` to write output and the number of `SCROLLS` you want to perform on the page.
9191

9292
```sh
93-
python -m parsera.main URL {--scheme '{"title":"h1"}' | --file FILENAME} [--output FILENAME]
93+
python -m parsera.main URL {--scheme '{"title":"h1"}' | --file FILENAME} [--scrolls SCROLLS] [--output FILENAME]
9494
```
9595

9696
## Running in Docker

docker-compose.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,4 +10,4 @@ services:
1010
volumes:
1111
- ./scheme.json:/app/scheme.json
1212
- ./output:/app/output
13-
command: ["$URL", "--file", "$FILE", "--output", "$OUTPUT"]
13+
command: ["$URL", "--file", "$FILE", "--output", "$OUTPUT", "--scrolls", "$SCROLLS"]

docs/features/custom-browser.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
## Custom browser usage
2+
3+
You can set up a Playwright browser with custom parameters for development purposes and use it with Parsera!
4+
5+
```python
6+
async with async_playwright() as p:
7+
browser = await p.firefox.launch(headless=False, slow_mo=100)
8+
loader = PageLoader(browser=browser)
9+
content = await loader.load_content(url=url, scrolls_limit=10)
10+
return content
11+
```
12+
13+
[Check out full example](https://github.com/raznem/parsera/tree/main/examples/infinite_page_scrolling.py)

docs/features/docker.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ You can get access to the CLI or development environment using Docker.
1515
URL=https://parsera.org
1616
FILE=/app/scheme.json
1717
OUTPUT=/app/output/result.json
18+
SCROLLS=5
1819
```
1920

2021
2. Create `scheme.json` file with the parsing scheme in the repository root directory.

docs/features/scrolling.md

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
## Page scrolling
2+
3+
[Parsera library](https://github.com/raznem/parsera) can scroll pages now! To do this, simply set the `scrolls_limit` parameter.
4+
5+
This parameter is available in the `run` and `arun` methods of the `Parsera` and `ParseraScript` classes.
6+
7+
Check out the example below:
8+
```python
9+
async def get_reddit_info():
10+
model = GPT4oMiniModel()
11+
12+
# This script is executed after the url is opened
13+
async def pw_script(page: Page) -> Page:
14+
await page.wait_for_timeout(1000) # Wait one second for page to load
15+
return page
16+
17+
parsera = ParseraScript(model=model)
18+
return await parsera.arun(
19+
url="https://www.reddit.com/",
20+
elements={
21+
"post name": "post description"
22+
},
23+
playwright_script=pw_script,
24+
        scrolls_limit=10
25+
)
26+
```
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
import asyncio
2+
3+
from playwright.async_api import async_playwright
4+
5+
from parsera.page import PageLoader
6+
7+
"""
8+
Here's an example of how you can load a webpage of any length, even with custom browser parameters!
9+
"""
10+
11+
12+
async def main(url):
13+
async with async_playwright() as p:
14+
browser = await p.firefox.launch(headless=False, slow_mo=100)
15+
loader = PageLoader(browser=browser)
16+
content = await loader.load_content(url=url, scrolls_limit=10)
17+
return content
18+
19+
20+
if __name__ == "__main__":
21+
URL = "https://www.reddit.com/"
22+
result = asyncio.run(main(URL))
23+
print(result)

mkdocs.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@ nav:
3030
- Custom models: features/custom-models.md
3131
- Proxy: features/proxy.md
3232
- Custom playwright: features/custom-playwright.md
33+
- Custom browser: features/custom-browser.md
34+
- Scrolling: features/scrolling.md
3335
- Extractors: features/extractors.md
3436
- Docker: features/docker.md
3537
- API:

parsera/main.py

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -64,8 +64,8 @@ def fancy_parser():
6464
epilog=Fore.YELLOW
6565
+ "Example usage:\n"
6666
+ Style.RESET_ALL
67-
+ ' python -m parsera.main https://example.com --scheme \'{"title":"h1"}\'\n'
68-
+ " python -m parsera.main https://example.com --file path/to/elements.json",
67+
+ ' python -m parsera.main https://example.com --scrolls 5 --scheme \'{"title":"h1"}\'\n '
68+
+ " python -m parsera.main https://example.com --scrolls 5 --file path/to/elements.json",
6969
)
7070

7171
# URL argument
@@ -85,6 +85,17 @@ def fancy_parser():
8585
required=False,
8686
)
8787

88+
# Scrolls argument
89+
parser.add_argument(
90+
"--scrolls",
91+
type=int,
92+
help=Fore.GREEN
93+
+ "Add amount of scrolls for the page on the url."
94+
+ Style.RESET_ALL,
95+
required=False,
96+
default=0
97+
)
98+
8899
# File argument (with validation for file)
89100
parser.add_argument(
90101
"--file",
@@ -107,7 +118,7 @@ def fancy_parser():
107118
return parser.parse_args()
108119

109120

110-
async def get_url_data(url, scheme):
121+
async def get_url_data(url, scheme, scrolls):
111122
model = GPT4oMiniModel()
112123

113124
# This script is executed after the url is opened
@@ -117,7 +128,7 @@ async def repeating_script(page: Page) -> Page:
117128

118129
parsera = ParseraScript(model=model)
119130
return await parsera.arun(
120-
url=url, elements=scheme, playwright_script=repeating_script
131+
url=url, elements=scheme, playwright_script=repeating_script, scrolls_limit=scrolls
121132
)
122133

123134

@@ -142,11 +153,13 @@ async def repeating_script(page: Page) -> Page:
142153
)
143154
if args.file:
144155
print(Fore.CYAN + "Scheme (from file):" + Style.RESET_ALL, args.file)
156+
if args.scrolls:
157+
print(Fore.CYAN + "Amount of scrolls on the page:" + Style.RESET_ALL, args.scrolls)
145158

146159
# Determine the scheme to use (from scheme argument or file)
147160
scheme = args.scheme if args.scheme else args.file
148161

149-
result = asyncio.run(get_url_data(args.url, scheme))
162+
result = asyncio.run(get_url_data(args.url, scheme, args.scrolls))
150163

151164
# Print the result to the console
152165
print(Fore.GREEN + "Parsed result:" + Style.RESET_ALL, result)

parsera/page.py

Lines changed: 63 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import asyncio
12
import warnings
23
from typing import Awaitable, Callable, Literal, TypedDict
34

@@ -21,11 +22,10 @@ class ProxySettings(TypedDict, total=False):
2122
class PageLoader:
2223
def __init__(
2324
self,
24-
browser: Literal["firefox", "chromium"] = "firefox",
25+
browser: Browser | None = None
2526
):
26-
self._browser_id = browser
2727
self.playwright: Playwright | None = None
28-
self.browser: Browser | None = None
28+
self.browser: Browser | None = browser
2929
self.context: BrowserContext | None = None
3030
self.page: Page | None = None
3131

@@ -36,10 +36,7 @@ async def new_browser(self) -> None:
3636
if self.browser:
3737
await self.browser.close()
3838

39-
if self._browser_id == "firefox":
40-
self.browser = await self.playwright.firefox.launch(headless=True)
41-
else:
42-
self.browser = await self.playwright.chromium.launch(headless=True)
39+
self.browser = await self.playwright.firefox.launch(headless=True)
4340

4441
async def stealth(self, page: Page) -> Page:
4542
user_agent = await self.page.evaluate("navigator.userAgent")
@@ -70,23 +67,76 @@ async def create_session(
7067
async def fetch_page(
7168
self,
7269
url: str,
70+
scrolls_limit: int = 0,
7371
load_state: Literal[
7472
"domcontentloaded", "load", "networkidle"
7573
] = "domcontentloaded",
7674
playwright_script: Callable[[Page], Awaitable[Page]] | None = None,
7775
) -> None:
7876
# Navigate to the URL
79-
# await page.route("**/*.{png,jpg,jpeg}", lambda route: route.abort()) # Can speed up requests
8077
await self.page.goto(url)
8178
await self.page.wait_for_load_state(load_state)
79+
8280
if playwright_script:
8381
self.page = await playwright_script(self.page)
8482

85-
return await self.page.content()
83+
# Start tracking removed content with MutationObserver
84+
await self.page.evaluate(
85+
"""
86+
window.removedContent = [];
87+
const observer = new MutationObserver((mutations) => {
88+
mutations.forEach(mutation => {
89+
if (mutation.removedNodes.length > 0) {
90+
mutation.removedNodes.forEach(node => {
91+
if (node.nodeType === 1) { // Only store element nodes
92+
window.removedContent.push(node.outerHTML);
93+
}
94+
});
95+
}
96+
});
97+
});
98+
observer.observe(document.body, { childList: true, subtree: true });
99+
"""
100+
)
101+
102+
# Function to perform the scrolling
103+
scrolls = 0
104+
last_height = 0
105+
captured_content = []
106+
107+
while scrolls < scrolls_limit:
108+
# Scroll down to the bottom of the page
109+
await self.page.evaluate("window.scrollTo(0, document.body.scrollHeight);")
110+
111+
# Wait for page to load
112+
await asyncio.sleep(2)
113+
114+
# Capture current visible content and append to the list
115+
current_content = await self.page.content()
116+
captured_content.append(current_content)
117+
118+
# Check current scroll height
119+
new_height = await self.page.evaluate("document.body.scrollHeight")
120+
121+
# Break if no new content is loaded (based on scroll height)
122+
if new_height == last_height:
123+
break
124+
125+
last_height = new_height
126+
scrolls += 1
127+
128+
# Fetch removed content if any
129+
removed_content = await self.page.evaluate("window.removedContent.join('')")
130+
131+
# Combine all the captured content, including removed elements
132+
final_content = "".join(captured_content) + removed_content
133+
134+
return final_content
86135

87136
async def load_content(
88137
self,
89138
url: str,
139+
scrolls_limit: int = 0,
90140
proxy_settings: ProxySettings | None = None,
91141
load_state: Literal[
92142
"domcontentloaded", "load", "networkidle"
@@ -95,46 +145,13 @@ async def load_content(
95145
):
96146
await self.create_session(proxy_settings=proxy_settings)
97147
return await self.fetch_page(
98-
url=url, load_state=load_state, playwright_script=playwright_script
148+
url=url,
149+
scrolls_limit=scrolls_limit,
150+
load_state=load_state,
151+
playwright_script=playwright_script,
99152
)
100153

101154
async def close(self) -> None:
102155
if self.playwright:
103156
await self.browser.close()
104157
self.playwright.stop()
105-
106-
107-
async def fetch_page_content(
108-
url: str,
109-
proxy_settings: ProxySettings | None = None,
110-
browser: str = "firefox",
111-
) -> str:
112-
warnings.warn(
113-
"fetch_page_content is deprecated and will be removed",
114-
DeprecationWarning,
115-
)
116-
async with async_playwright() as p:
117-
# Launch the browser
118-
if browser == "firefox":
119-
browser = await p.firefox.launch(headless=True)
120-
else:
121-
browser = await p.chromium.launch(headless=True)
122-
# Open a new browser context
123-
context = await browser.new_context(proxy=proxy_settings)
124-
# Open a new page
125-
page = await context.new_page()
126-
await stealth_async(page)
127-
128-
# Navigate to the URL
129-
# await page.route("**/*.{png,jpg,jpeg}", lambda route: route.abort()) # Can speed up requests
130-
await page.goto(url)
131-
132-
# Wait for the content to be dynamically loaded
133-
await page.wait_for_load_state("domcontentloaded")
134-
# Get the page content
135-
content = await page.content()
136-
137-
# Close the browser
138-
await browser.close()
139-
140-
return content

0 commit comments

Comments
 (0)