Skip to content

Commit 1b594ae

Browse files
committed
Proxy support and Firefox as default engine
1 parent 9deb134 commit 1b594ae

File tree

3 files changed: 36 additions and 9 deletions

3 files changed: 36 additions and 9 deletions

parsera/main.py

Lines changed: 17 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -14,16 +14,27 @@ def __init__(self, model: BaseChatModel | None = None):
1414
else:
1515
self.model = model
1616

17-
async def _run(self, url: str, elements: dict) -> dict:
18-
content = await fetch_page_content(url=url)
17+
async def _run(
18+
self, url: str, elements: dict, proxy_settings: dict | None = None
19+
) -> dict:
20+
if proxy_settings:
21+
content = await fetch_page_content(url=url, proxy_settings=proxy_settings)
22+
else:
23+
content = await fetch_page_content(url=url)
1924
extractor = TabularExtractor(
2025
elements=elements, model=self.model, content=content
2126
)
2227
result = await extractor.run()
2328
return result
2429

25-
def run(self, url: str, elements: dict) -> dict:
26-
return asyncio.run(self._run(url=url, elements=elements))
30+
def run(self, url: str, elements: dict, proxy_settings: dict | None = None) -> dict:
31+
return asyncio.run(
32+
self._run(url=url, elements=elements, proxy_settings=proxy_settings)
33+
)
2734

28-
async def arun(self, url: str, elements: dict) -> dict:
29-
return await self._run(url=url, elements=elements)
35+
async def arun(
36+
self, url: str, elements: dict, proxy_settings: dict | None = None
37+
) -> dict:
38+
return await self._run(
39+
url=url, elements=elements, proxy_settings=proxy_settings
40+
)

parsera/page.py

Lines changed: 18 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,11 +1,27 @@
1+
from typing import TypedDict
2+
13
from playwright.async_api import async_playwright
24
from playwright_stealth import stealth_async
35

46

5-
async def fetch_page_content(url: str) -> str:
7+
class ProxySettings(TypedDict, total=False):
8+
server: str
9+
bypass: str | None = None
10+
username: str | None = None
11+
password: str | None = None
12+
13+
14+
async def fetch_page_content(
15+
url: str,
16+
proxy_settings: ProxySettings | None = None,
17+
browser: str = "firefox",
18+
) -> str:
619
async with async_playwright() as p:
720
# Launch the browser
8-
browser = await p.chromium.launch(headless=True)
21+
if browser == "firefox":
22+
browser = await p.firefox.launch(headless=True, proxy=proxy_settings)
23+
else:
24+
browser = await p.chromium.launch(headless=True, proxy=proxy_settings)
925
# Open a new browser context
1026
context = await browser.new_context()
1127
# Open a new page

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "parsera"
3-
version = "0.1.2"
3+
version = "0.1.3"
44
description = "Lightweight library for scraping web-sites with LLMs"
55
authors = ["Mikhail Zanka <raznem@gmail.com>"]
66
license = "GPL-2.0-or-later"

0 commit comments

Comments
 (0)