autoscrape-labs · thalissonvs · Mar 23, 2026 · Mar 22, 2026 · Mar 22, 2026 · Mar 22, 2026
diff --git a/poetry.lock b/poetry.lock
diff --git a/pydoll/browser/tab.py b/pydoll/browser/tab.py
@@ -20,6 +20,7 @@
     Callable,
     Optional,
     TypeAlias,
+    TypeVar,
     Union,
     cast,
     overload,
@@ -86,6 +87,8 @@
     rewrite_html_urls,
 )
 
+from pydoll.extractor.engine import ExtractionEngine
+
 if TYPE_CHECKING:
     from pydoll.browser.chromium.base import Browser
     from pydoll.protocol.base import EmptyResponse, Response
@@ -124,6 +127,8 @@
 
 IFrame: TypeAlias = 'Tab'
 
+T = TypeVar('T')  # ties the `model` argument of extract()/extract_all() to the result type
+
 _CLOUDFLARE_CHALLENGE_DOMAIN = 'challenges.cloudflare.com'
 _CLOUDFLARE_IFRAME_SELECTOR = f'iframe[src*="{_CLOUDFLARE_CHALLENGE_DOMAIN}"]'
 _CLOUDFLARE_CHECKBOX_SELECTOR = 'span.cb-i'
@@ -176,6 +181,7 @@ def __init__(
         self._scroll: Optional[ScrollAPI] = None
         self._keyboard: Optional[KeyboardAPI] = None
         self._mouse: MouseAPI = MouseAPI(self)
+        self._extraction_engine: Optional[ExtractionEngine] = None
         logger.debug(
             (
                 f'Tab initialized: target_id={self._target_id}, '
@@ -255,6 +261,64 @@ def mouse(self) -> MouseAPI:
         """
         return self._mouse
 
+    @property
+    def _extractor(self) -> ExtractionEngine:
+        """Extraction engine bound to this tab, created on first access."""
+        if (engine := self._extraction_engine) is None:
+            engine = self._extraction_engine = ExtractionEngine(self)
+        return engine
+
+    async def extract(
+        self,
+        model: type[T],
+        *,
+        scope: Optional[str] = None,
+        timeout: int = 0,
+    ) -> T:
+        """Populate ``model`` with structured data read from the page.
+
+        Args:
+            model: ExtractionModel subclass that describes what to extract.
+            scope: CSS/XPath selector restricting extraction to one region.
+            timeout: Seconds to wait for elements; 0 disables waiting.
+
+        Returns:
+            Instance of ``model`` filled with the extracted values.
+
+        Raises:
+            FieldExtractionFailed: A required field could not be extracted.
+            InvalidExtractionModel: The model definition is invalid.
+        """
+        engine = self._extractor
+        result = await engine.extract(model, scope=scope, timeout=timeout)
+        return result
+
+    async def extract_all(
+        self,
+        model: type[T],
+        *,
+        scope: str,
+        timeout: int = 0,
+        limit: Optional[int] = None,
+    ) -> list[T]:
+        """Build one model instance per repeated container on the page.
+
+        Every element matched by the scope selector yields one instance;
+        field lookups are resolved relative to that container element.
+
+        Args:
+            model: ExtractionModel subclass that describes what to extract.
+            scope: CSS/XPath selector of the repeated container (required).
+            timeout: Seconds to wait for elements; 0 disables waiting.
+            limit: Upper bound on the number of items; None extracts all.
+
+        Returns:
+            List holding one populated model instance per matched container.
+        """
+        run = self._extractor.extract_all
+        items = await run(model, scope=scope, timeout=timeout, limit=limit)
+        return items
+
     @property
     def intercept_file_chooser_dialog_enabled(self) -> bool:
         """Whether file chooser dialog interception is active."""

diff --git a/pydoll/extractor/__init__.py b/pydoll/extractor/__init__.py
@@ -0,0 +1,16 @@
+from pydoll.extractor.exceptions import (
+    ExtractionException,
+    FieldExtractionFailed,
+    InvalidExtractionModel,
+)
+from pydoll.extractor.field import ExtractionMetadata, Field
+from pydoll.extractor.model import ExtractionModel
+
+__all__ = [
+    'ExtractionException',
+    'ExtractionMetadata',
+    'ExtractionModel',
+    'Field',
+    'FieldExtractionFailed',
+    'InvalidExtractionModel',
+]