Skip to content

Commit 338c375

Browse files
authored
Merge pull request #18 from danyathecoder/main
Add infinite page scrolling
2 parents 0fbba0d + 08deefb commit 338c375

File tree

10 files changed

+190
-62
lines changed

10 files changed

+190
-62
lines changed

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -87,10 +87,10 @@ Before you run `Parsera` as command line tool don't forget to put your `OPENAI_A
8787
### Usage
8888

8989
You can configure elements to parse using `JSON string` or `FILE`.
90-
Optionally, you can provide `FILE` to write output.
90+
Optionally, you can provide `FILE` to write output and the number of `SCROLLS` you want to perform on the page.
9191

9292
```sh
93-
python -m parsera.main URL {--scheme '{"title":"h1"}' | --file FILENAME} [--output FILENAME]
93+
python -m parsera.main URL {--scheme '{"title":"h1"}' | --file FILENAME} [--scrolls SCROLLS] [--output FILENAME]
9494
```
9595

9696
## Running in Docker

docker-compose.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,4 +10,4 @@ services:
1010
volumes:
1111
- ./scheme.json:/app/scheme.json
1212
- ./output:/app/output
13-
command: ["$URL", "--file", "$FILE", "--output", "$OUTPUT"]
13+
command: ["$URL", "--file", "$FILE", "--output", "$OUTPUT", "--scrolls", "$SCROLLS"]

docs/features/custom-browser.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
## Custom browser usage
2+
3+
You can set up a Playwright browser with custom parameters for development purposes and use it with Parsera!
4+
5+
```python
6+
async with async_playwright() as p:
7+
browser = await p.firefox.launch(headless=False, slow_mo=100)
8+
loader = PageLoader(browser=browser)
9+
content = await loader.load_content(url=url, scrolls_limit=10)
10+
return content
11+
```
12+
13+
[Check out full example](https://github.com/raznem/parsera/tree/main/examples/infinite_page_scrolling.py)

docs/features/docker.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ You can get access to the CLI or development environment using Docker.
1515
URL=https://parsera.org
1616
FILE=/app/scheme.json
1717
OUTPUT=/app/output/result.json
18+
SCROLLS=5
1819
```
1920

2021
2. Create `scheme.json` file with the parsing scheme in the repository root directory.

docs/features/scrolling.md

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
## Page scrolling
2+
3+
[Parsera library](https://github.com/raznem/parsera) can scroll pages now! To do this, simply set the `scrolls_limit` parameter.
4+
5+
This parameter is available in the `run` and `arun` methods of the `Parsera` and `ParseraScript` classes.
6+
7+
Check out the example below:
8+
```python
9+
async def get_reddit_info():
10+
model = GPT4oMiniModel()
11+
12+
# This script is executed after the url is opened
13+
async def pw_script(page: Page) -> Page:
14+
await page.wait_for_timeout(1000) # Wait one second for page to load
15+
return page
16+
17+
parsera = ParseraScript(model=model)
18+
return await parsera.arun(
19+
url="https://www.reddit.com/",
20+
elements={
21+
"post name": "post description"
22+
},
23+
playwright_script=pw_script,
24+
        scrolls_limit=10
25+
)
26+
```
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
import asyncio
2+
3+
from playwright.async_api import async_playwright
4+
5+
from parsera.page import PageLoader
6+
7+
"""
8+
Here's an example of how you can load a webpage of any length, even with custom browser parameters!
9+
"""
10+
11+
12+
async def main(url):
13+
async with async_playwright() as p:
14+
browser = await p.firefox.launch(headless=False, slow_mo=100)
15+
loader = PageLoader(browser=browser)
16+
content = await loader.load_content(url=url, scrolls_limit=10)
17+
return content
18+
19+
20+
if __name__ == "__main__":
21+
URL = "https://www.reddit.com/"
22+
result = asyncio.run(main(URL))
23+
print(result)

mkdocs.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@ nav:
3030
- Custom models: features/custom-models.md
3131
- Proxy: features/proxy.md
3232
- Custom playwright: features/custom-playwright.md
33+
- Custom browser: features/custom-browser.md
34+
- Scrolling: features/scrolling.md
3335
- Extractors: features/extractors.md
3436
- Docker: features/docker.md
3537
- API:

parsera/main.py

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -64,8 +64,8 @@ def fancy_parser():
6464
epilog=Fore.YELLOW
6565
+ "Example usage:\n"
6666
+ Style.RESET_ALL
67-
+ ' python -m parsera.main https://example.com --scheme \'{"title":"h1"}\'\n'
68-
+ " python -m parsera.main https://example.com --file path/to/elements.json",
67+
+ ' python -m parsera.main https://example.com --scrolls 5 --scheme \'{"title":"h1"}\'\n '
68+
+ " python -m parsera.main https://example.com --scrolls 5 --file path/to/elements.json",
6969
)
7070

7171
# URL argument
@@ -85,6 +85,17 @@ def fancy_parser():
8585
required=False,
8686
)
8787

88+
# Scrolls argument
89+
parser.add_argument(
90+
"--scrolls",
91+
type=int,
92+
help=Fore.GREEN
93+
+ "Add amount of scrolls for the page on the url."
94+
+ Style.RESET_ALL,
95+
required=False,
96+
default=0
97+
)
98+
8899
# File argument (with validation for file)
89100
parser.add_argument(
90101
"--file",
@@ -107,7 +118,7 @@ def fancy_parser():
107118
return parser.parse_args()
108119

109120

110-
async def get_url_data(url, scheme):
121+
async def get_url_data(url, scheme, scrolls):
111122
model = GPT4oMiniModel()
112123

113124
# This script is executed after the url is opened
@@ -117,7 +128,7 @@ async def repeating_script(page: Page) -> Page:
117128

118129
parsera = ParseraScript(model=model)
119130
return await parsera.arun(
120-
url=url, elements=scheme, playwright_script=repeating_script
131+
url=url, elements=scheme, playwright_script=repeating_script, scrolls_limit=scrolls
121132
)
122133

123134

@@ -142,11 +153,13 @@ async def repeating_script(page: Page) -> Page:
142153
)
143154
if args.file:
144155
print(Fore.CYAN + "Scheme (from file):" + Style.RESET_ALL, args.file)
156+
if args.scrolls:
157+
print(Fore.CYAN + "Amount of scrolls on the page:" + Style.RESET_ALL, args.scrolls)
145158

146159
# Determine the scheme to use (from scheme argument or file)
147160
scheme = args.scheme if args.scheme else args.file
148161

149-
result = asyncio.run(get_url_data(args.url, scheme))
162+
result = asyncio.run(get_url_data(args.url, scheme, args.scrolls))
150163

151164
# Print the result to the console
152165
print(Fore.GREEN + "Parsed result:" + Style.RESET_ALL, result)

parsera/page.py

Lines changed: 63 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import asyncio
12
import warnings
23
from typing import Awaitable, Callable, Literal, TypedDict
34

@@ -21,11 +22,10 @@ class ProxySettings(TypedDict, total=False):
2122
class PageLoader:
2223
def __init__(
2324
self,
24-
browser: Literal["firefox", "chromium"] = "firefox",
25+
browser: Browser | None = None
2526
):
26-
self._browser_id = browser
2727
self.playwright: Playwright | None = None
28-
self.browser: Browser | None = None
28+
self.browser: Browser | None = browser
2929
self.context: BrowserContext | None = None
3030
self.page: Page | None = None
3131

@@ -36,10 +36,7 @@ async def new_browser(self) -> None:
3636
if self.browser:
3737
await self.browser.close()
3838

39-
if self._browser_id == "firefox":
40-
self.browser = await self.playwright.firefox.launch(headless=True)
41-
else:
42-
self.browser = await self.playwright.chromium.launch(headless=True)
39+
self.browser = await self.playwright.firefox.launch(headless=True)
4340

4441
async def stealth(self, page: Page) -> Page:
4542
user_agent = await self.page.evaluate("navigator.userAgent")
@@ -70,23 +67,76 @@ async def create_session(
7067
async def fetch_page(
7168
self,
7269
url: str,
70+
scrolls_limit: int = 0,
7371
load_state: Literal[
7472
"domcontentloaded", "load", "networkidle"
7573
] = "domcontentloaded",
7674
playwright_script: Callable[[Page], Awaitable[Page]] | None = None,
7775
) -> None:
7876
# Navigate to the URL
79-
# await page.route("**/*.{png,jpg,jpeg}", lambda route: route.abort()) # Can speed up requests
8077
await self.page.goto(url)
8178
await self.page.wait_for_load_state(load_state)
79+
8280
if playwright_script:
8381
self.page = await playwright_script(self.page)
8482

85-
return await self.page.content()
83+
# Start tracking removed content with MutationObserver
84+
await self.page.evaluate(
85+
"""
86+
window.removedContent = [];
87+
const observer = new MutationObserver((mutations) => {
88+
mutations.forEach(mutation => {
89+
if (mutation.removedNodes.length > 0) {
90+
mutation.removedNodes.forEach(node => {
91+
if (node.nodeType === 1) { // Only store element nodes
92+
window.removedContent.push(node.outerHTML);
93+
}
94+
});
95+
}
96+
});
97+
});
98+
observer.observe(document.body, { childList: true, subtree: true });
99+
"""
100+
)
101+
102+
# Function to perform the scrolling
103+
scrolls = 0
104+
last_height = 0
105+
captured_content = []
106+
107+
while scrolls < scrolls_limit:
108+
# Scroll down to the bottom of the page
109+
await self.page.evaluate("window.scrollTo(0, document.body.scrollHeight);")
110+
111+
# Wait for page to load
112+
await asyncio.sleep(2)
113+
114+
# Capture current visible content and append to the list
115+
current_content = await self.page.content()
116+
captured_content.append(current_content)
117+
118+
# Check current scroll height
119+
new_height = await self.page.evaluate("document.body.scrollHeight")
120+
121+
# Break if no new content is loaded (based on scroll height)
122+
if new_height == last_height:
123+
break
124+
125+
last_height = new_height
126+
scrolls += 1
127+
128+
# Fetch removed content if any
129+
removed_content = await self.page.evaluate("window.removedContent.join('')")
130+
131+
# Combine all the captured content, including removed elements
132+
final_content = "".join(captured_content) + removed_content
133+
134+
return final_content
86135

87136
async def load_content(
88137
self,
89138
url: str,
139+
scrolls_limit: int = 0,
90140
proxy_settings: ProxySettings | None = None,
91141
load_state: Literal[
92142
"domcontentloaded", "load", "networkidle"
@@ -95,46 +145,13 @@ async def load_content(
95145
):
96146
await self.create_session(proxy_settings=proxy_settings)
97147
return await self.fetch_page(
98-
url=url, load_state=load_state, playwright_script=playwright_script
148+
url=url,
149+
scrolls_limit=scrolls_limit,
150+
load_state=load_state,
151+
playwright_script=playwright_script,
99152
)
100153

101154
async def close(self) -> None:
102155
if self.playwright:
103156
await self.browser.close()
104157
self.playwright.stop()
105-
106-
107-
async def fetch_page_content(
108-
url: str,
109-
proxy_settings: ProxySettings | None = None,
110-
browser: str = "firefox",
111-
) -> str:
112-
warnings.warn(
113-
"fetch_page_content is deprecated and will be removed",
114-
DeprecationWarning,
115-
)
116-
async with async_playwright() as p:
117-
# Launch the browser
118-
if browser == "firefox":
119-
browser = await p.firefox.launch(headless=True)
120-
else:
121-
browser = await p.chromium.launch(headless=True)
122-
# Open a new browser context
123-
context = await browser.new_context(proxy=proxy_settings)
124-
# Open a new page
125-
page = await context.new_page()
126-
await stealth_async(page)
127-
128-
# Navigate to the URL
129-
# await page.route("**/*.{png,jpg,jpeg}", lambda route: route.abort()) # Can speed up requests
130-
await page.goto(url)
131-
132-
# Wait for the content to be dynamically loaded
133-
await page.wait_for_load_state("domcontentloaded")
134-
# Get the page content
135-
content = await page.content()
136-
137-
# Close the browser
138-
await browser.close()
139-
140-
return content

0 commit comments

Comments
 (0)