Skip to content
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
0b8fca9
feat: add pydantic as required dependency
thalissonvs Mar 22, 2026
a11380c
feat(extractor): add extraction exception hierarchy
thalissonvs Mar 22, 2026
e84cc1a
feat(extractor): add Field descriptor and ExtractionMetadata
thalissonvs Mar 22, 2026
0bb1bd3
feat(extractor): add ExtractionModel base class
thalissonvs Mar 22, 2026
c852fe4
feat(extractor): add extraction engine with CSS/XPath support
thalissonvs Mar 22, 2026
9ac2221
feat(extractor): add module public API exports
thalissonvs Mar 22, 2026
5f36762
feat(extractor): integrate extract and extract_all into Tab
thalissonvs Mar 22, 2026
e21b1a3
test(extractor): add integration tests with real browser
thalissonvs Mar 22, 2026
219267e
docs(extractor): add usage example with quotes.toscrape.com
thalissonvs Mar 22, 2026
a20858b
Revert "docs(extractor): add usage example with quotes.toscrape.com"
thalissonvs Mar 22, 2026
48e26ce
chore(deps): update dependencies and add new packages to poetry.lock
thalissonvs Mar 22, 2026
4c9ed4d
fix(extractor): resolve mypy type errors
thalissonvs Mar 22, 2026
510b461
style: apply ruff formatting
thalissonvs Mar 22, 2026
533603c
refactor(extractor): use asyncio.gather for concurrent field extraction
thalissonvs Mar 22, 2026
dffc2ac
test(extractor): add concurrent extraction tests
thalissonvs Mar 22, 2026
67a7421
docs: restructure README with extraction showcase and updated positio…
thalissonvs Mar 22, 2026
af7cc3c
docs: update landing pages with extractor examples in all languages
thalissonvs Mar 22, 2026
35f4898
docs(extractor): add structured extraction guide in en, pt, zh
thalissonvs Mar 22, 2026
16eb0ef
fix(extractor): correct coroutine type annotation for mypy
thalissonvs Mar 22, 2026
b4408b4
fix(test): filter only DeprecationWarning in interval deprecated test
thalissonvs Mar 22, 2026
597a914
refactor(extractor): parallelize list field extraction with asyncio.g…
thalissonvs Mar 22, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
165 changes: 163 additions & 2 deletions poetry.lock

Large diffs are not rendered by default.

64 changes: 64 additions & 0 deletions pydoll/browser/tab.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
Callable,
Optional,
TypeAlias,
TypeVar,
Union,
cast,
overload,
Expand Down Expand Up @@ -86,6 +87,8 @@
rewrite_html_urls,
)

from pydoll.extractor.engine import ExtractionEngine

Comment thread
thalissonvs marked this conversation as resolved.
Outdated
if TYPE_CHECKING:
from pydoll.browser.chromium.base import Browser
from pydoll.protocol.base import EmptyResponse, Response
Expand Down Expand Up @@ -124,6 +127,8 @@

IFrame: TypeAlias = 'Tab'

# Generic type variable: ties the model class passed to Tab.extract /
# Tab.extract_all (``type[T]``) to the instance type those methods return.
T = TypeVar('T')

Comment thread
coderabbitai[bot] marked this conversation as resolved.
Outdated
# Domain used to recognise Cloudflare challenge iframes.
_CLOUDFLARE_CHALLENGE_DOMAIN = 'challenges.cloudflare.com'
# CSS selector matching any iframe whose ``src`` contains the challenge domain.
_CLOUDFLARE_IFRAME_SELECTOR = f'iframe[src*="{_CLOUDFLARE_CHALLENGE_DOMAIN}"]'
# CSS selector for the challenge checkbox element
# (presumably located inside the challenge iframe; verify against usage).
_CLOUDFLARE_CHECKBOX_SELECTOR = 'span.cb-i'
Expand Down Expand Up @@ -176,6 +181,7 @@ def __init__(
self._scroll: Optional[ScrollAPI] = None
self._keyboard: Optional[KeyboardAPI] = None
self._mouse: MouseAPI = MouseAPI(self)
self._extraction_engine: Optional[ExtractionEngine] = None
logger.debug(
(
f'Tab initialized: target_id={self._target_id}, '
Expand Down Expand Up @@ -255,6 +261,64 @@ def mouse(self) -> MouseAPI:
"""
return self._mouse

@property
def _extractor(self) -> ExtractionEngine:
    """Return the tab's extraction engine, creating it on first access.

    The engine is built with this tab as its only argument and cached in
    ``self._extraction_engine`` so later accesses reuse the same instance.
    """
    engine = self._extraction_engine
    if engine is None:
        # First access: build the engine and remember it for next time.
        engine = ExtractionEngine(self)
        self._extraction_engine = engine
    return engine

async def extract(
    self,
    model: type[T],
    *,
    scope: Optional[str] = None,
    timeout: int = 0,
) -> T:
    """Populate a single typed model with data extracted from the page.

    The call is delegated unchanged to the tab's extraction engine.

    Args:
        model: ExtractionModel subclass that describes what to extract.
        scope: Optional CSS/XPath selector restricting the region of the
            page that is searched.
        timeout: Seconds to wait for elements (0 = no wait).

    Returns:
        An instance of ``model`` filled with the extracted data.

    Raises:
        FieldExtractionFailed: If a required field cannot be extracted.
        InvalidExtractionModel: If the model definition is invalid.
    """
    engine = self._extractor
    return await engine.extract(model, scope=scope, timeout=timeout)

async def extract_all(
    self,
    model: type[T],
    *,
    scope: str,
    timeout: int = 0,
    limit: Optional[int] = None,
) -> list[T]:
    """Build one model instance per repeated container found on the page.

    Every element matched by ``scope`` yields one instance of ``model``;
    the model's fields are resolved relative to that container. The call
    is delegated unchanged to the tab's extraction engine.

    Args:
        model: ExtractionModel subclass that describes what to extract.
        scope: CSS/XPath selector of the repeated container (required).
        timeout: Seconds to wait for elements (0 = no wait).
        limit: Maximum number of items to extract (None = all).

    Returns:
        A list of populated model instances.
    """
    engine = self._extractor
    return await engine.extract_all(
        model,
        scope=scope,
        timeout=timeout,
        limit=limit,
    )

@property
def intercept_file_chooser_dialog_enabled(self) -> bool:
"""Whether file chooser dialog interception is active."""
Expand Down
16 changes: 16 additions & 0 deletions pydoll/extractor/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
from pydoll.extractor.exceptions import (
ExtractionException,
FieldExtractionFailed,
InvalidExtractionModel,
)
from pydoll.extractor.field import ExtractionMetadata, Field
from pydoll.extractor.model import ExtractionModel

# Public API of the extractor package: the exception types, the field
# helpers and the model base class re-exported from its submodules.
__all__ = [
'ExtractionException',
'ExtractionMetadata',
'ExtractionModel',
'Field',
'FieldExtractionFailed',
'InvalidExtractionModel',
]
Loading
Loading