Den 422 fix timeout of extract and get element (#68)

charlesmaddock · web-flow · commit 90092c48debd · 2024-10-29T14:49:06.000+01:00
* Added markdown mixin to page and improved markdown function. Also improved wait_for.

* Added strict timeout to cache.

* Fixed incorrect timeout and updated docstrings. Built sync.
diff --git a/dendrite_sdk/async_api/_core/dendrite_page.py b/dendrite_sdk/async_api/_core/dendrite_page.py
@@ -34,6 +34,7 @@
 from dendrite_sdk.async_api._core.mixin.fill_fields import FillFieldsMixin
 from dendrite_sdk.async_api._core.mixin.get_element import GetElementMixin
 from dendrite_sdk.async_api._core.mixin.keyboard import KeyboardMixin
+from dendrite_sdk.async_api._core.mixin.markdown import MarkdownMixin
 from dendrite_sdk.async_api._core.mixin.wait_for import WaitForMixin
 from dendrite_sdk.async_api._core.models.page_information import PageInformation
 
@@ -54,6 +55,7 @@
 
 
 class AsyncPage(
+    MarkdownMixin,
     ExtractionMixin,
     WaitForMixin,
     AskMixin,
diff --git a/dendrite_sdk/async_api/_core/mixin/extract.py b/dendrite_sdk/async_api/_core/mixin/extract.py
@@ -18,6 +18,9 @@
 from loguru import logger
 
 
+CACHE_TIMEOUT = 5
+
+
 class ExtractionMixin(DendritePageProtocol):
     """
     Mixin that provides extraction functionality for web pages.
@@ -103,7 +106,9 @@ async def extract(
             prompt (Optional[str]): The prompt to describe the information to extract.
             type_spec (Optional[TypeSpec], optional): The type specification for the extracted data.
             use_cache (bool, optional): Whether to use cached results. Defaults to True.
-            timeout (int, optional): The maximum time to wait for extraction in seconds. Defaults to 180 seconds, which is 3 minutes.
+            timeout (int, optional): Maximum time in milliseconds for the entire operation. If use_cache=True,
+                up to 5000ms will be spent attempting to use cached scripts before falling back to the
+                extraction agent for the remaining time that will attempt to generate a new script. Defaults to 15000 (15 seconds).
 
         Returns:
             ExtractResponse: The extracted data wrapped in a ExtractResponse object.
@@ -129,8 +134,7 @@ async def extract(
         # Check if a script exists in the cache
         if use_cache:
             cache_available = await check_if_extract_cache_available(
-                self,
-                prompt, json_schema
+                self, prompt, json_schema
             )
 
             if cache_available:
@@ -139,8 +143,8 @@ async def extract(
                     self,
                     prompt,
                     json_schema,
+                    remaining_timeout=CACHE_TIMEOUT,
                     only_use_cache=True,
-                    remaining_timeout=timeout - (time.time() - start_time),
                 )
                 if result:
                     return convert_and_return_result(result, type_spec)
@@ -152,8 +156,8 @@ async def extract(
             self,
             prompt,
             json_schema,
-            only_use_cache=False,
             remaining_timeout=timeout - (time.time() - start_time),
+            only_use_cache=False,
         )
 
         if result:
@@ -162,6 +166,7 @@ async def extract(
         logger.error(f"Extraction failed after {time.time() - start_time:.2f} seconds")
         return None
 
+
 async def check_if_extract_cache_available(
     obj: DendritePageProtocol, prompt: str, json_schema: Optional[JsonSchema]
 ) -> bool:
@@ -178,12 +183,13 @@ async def check_if_extract_cache_available(
     )
     return cache_response.exists
 
+
 async def attempt_extraction_with_backoff(
     obj: DendritePageProtocol,
     prompt: str,
     json_schema: Optional[JsonSchema],
-    only_use_cache: bool = False,
     remaining_timeout: float = 180.0,
+    only_use_cache: bool = False,
 ) -> Optional[ExtractResponse]:
     TIMEOUT_INTERVAL: List[float] = [0.15, 0.45, 1.0, 2.0, 4.0, 8.0]
     total_elapsed_time = 0
@@ -234,6 +240,7 @@ async def attempt_extraction_with_backoff(
     )
     return None
 
+
 def convert_and_return_result(
     res: ExtractResponse, type_spec: Optional[TypeSpec]
 ) -> TypeSpec:
diff --git a/dendrite_sdk/async_api/_core/mixin/get_element.py b/dendrite_sdk/async_api/_core/mixin/get_element.py
@@ -13,6 +13,9 @@
 from dendrite_sdk.async_api._core.models.api_config import APIConfig
 
 
+CACHE_TIMEOUT = 5
+
+
 class GetElementMixin(DendritePageProtocol):
     @overload
     async def get_elements(
@@ -28,7 +31,9 @@ async def get_elements(
         Args:
             prompt_or_elements (str): The prompt describing the elements to be retrieved.
             use_cache (bool, optional): Whether to use cached results. Defaults to True.
-            timeout (int, optional): The total timeout (in milliseconds) until the last request is sent to the API. Defaults to 15000 (15 seconds).
+            timeout (int, optional): Maximum time in milliseconds for the entire operation. If use_cache=True,
+                up to 5000ms will be spent attempting to use cached selectors before falling back to the
+                find element agent for the remaining time. Defaults to 15000 (15 seconds).
             context (str, optional): Additional context for the retrieval. Defaults to an empty string.
 
         Returns:
@@ -49,7 +54,9 @@ async def get_elements(
         Args:
             prompt_or_elements (Dict[str, str]): A dictionary where keys are field names and values are prompts describing the elements to be retrieved.
             use_cache (bool, optional): Whether to use cached results. Defaults to True.
-            timeout (int, optional): The total timeout (in milliseconds) until the last request is sent to the API. Defaults to 3000.
+            timeout (int, optional): Maximum time in milliseconds for the entire operation. If use_cache=True,
+                up to 5000ms will be spent attempting to use cached selectors before falling back to the
+                find element agent for the remaining time. Defaults to 15000 (15 seconds).
             context (str, optional): Additional context for the retrieval. Defaults to an empty string.
 
         Returns:
@@ -72,7 +79,9 @@ async def get_elements(
         Args:
             prompt_or_elements (Union[str, Dict[str, str]]): The prompt or dictionary of prompts for element retrieval.
             use_cache (bool, optional): Whether to use cached results. Defaults to True.
-            timeout (int, optional): The total timeout (in milliseconds) until the last request is sent to the API. Defaults to 3000.
+            timeout (int, optional): Maximum time in milliseconds for the entire operation. If use_cache=True,
+                up to 5000ms will be spent attempting to use cached selectors before falling back to the
+                find element agent for the remaining time. Defaults to 15000 (15 seconds).
             context (str, optional): Additional context for the retrieval. Defaults to an empty string.
 
         Returns:
@@ -86,11 +95,9 @@ async def get_elements(
             prompt_or_elements,
             only_one=False,
             use_cache=use_cache,
-            timeout=timeout,
+            timeout=timeout / 1000,
         )
 
-        raise ValueError("Prompt must be either a string prompt or a dictionary")
-
     async def get_element(
         self,
         prompt: str,
@@ -103,7 +110,9 @@ async def get_element(
         Args:
             prompt (str): The prompt describing the element to be retrieved.
             use_cache (bool, optional): Whether to use cached results. Defaults to True.
-            timeout (int, optional): The total timeout (in milliseconds) until the last request is sent to the API. Defaults to 15000 (15 seconds).
+            timeout (int, optional): Maximum time in milliseconds for the entire operation. If use_cache=True,
+                up to 5000ms will be spent attempting to use cached selectors before falling back to the
+                find element agent for the remaining time. Defaults to 15000 (15 seconds).
 
         Returns:
             AsyncElement: The retrieved element.
@@ -112,7 +121,7 @@ async def get_element(
             prompt,
             only_one=True,
             use_cache=use_cache,
-            timeout=timeout,
+            timeout=timeout / 1000,
         )
 
     @overload
@@ -130,7 +139,9 @@ async def _get_element(
             prompt (Union[str, Dict[str, str]]): The prompt describing the element to be retrieved.
             only_one (Literal[True]): Indicates that only one element should be retrieved.
             use_cache (bool): Whether to use cached results.
-            timeout: The total timeout (in milliseconds) until the last request is sent to the API.
+            timeout (int, optional): Maximum time in milliseconds for the entire operation. If use_cache=True,
+                up to 5000ms will be spent attempting to use cached selectors before falling back to the
+                find element agent for the remaining time. Defaults to 15000 (15 seconds).
 
         Returns:
             AsyncElement: The retrieved element.
@@ -151,7 +162,9 @@ async def _get_element(
             prompt (str): The prompt describing the elements to be retrieved.
             only_one (Literal[False]): Indicates that multiple elements should be retrieved.
             use_cache (bool): Whether to use cached results.
-            timeout: The total timeout (in milliseconds) until the last request is sent to the API.
+            timeout (int, optional): Maximum time in milliseconds for the entire operation. If use_cache=True,
+                up to 5000ms will be spent attempting to use cached selectors before falling back to the
+                find element agent for the remaining time. Defaults to 15000 (15 seconds).
 
         Returns:
             List[AsyncElement]: A list of retrieved elements.
@@ -177,7 +190,9 @@ async def _get_element(
             prompt_or_elements (Union[str, Dict[str, str]]): The prompt or dictionary of prompts for element retrieval.
             only_one (bool): Whether to retrieve only one element or a list of elements.
             use_cache (bool): Whether to use cached results.
-            timeout (float): The total timeout (in seconds) for the entire operation.
+            timeout (int, optional): Maximum time in milliseconds for the entire operation. If use_cache=True,
+                up to 5000ms will be spent attempting to use cached selectors before falling back to the
+                find element agent for the remaining time. Defaults to 15000 (15 seconds).
 
         Returns:
             Union[AsyncElement, List[AsyncElement], AsyncElementsResponse]: The retrieved element, list of elements, or response object.
@@ -187,7 +202,7 @@ async def _get_element(
         start_time = time.time()
 
         # First, let's check if there is a cached selector
-        cache_available = await test_if_cache_available(self,prompt_or_elements)
+        cache_available = await test_if_cache_available(self, prompt_or_elements)
 
         # If we have cached elements, attempt to use them with an exponentation backoff
         if cache_available and use_cache == True:
@@ -197,8 +212,8 @@ async def _get_element(
                 prompt_or_elements,
                 only_one,
                 api_config,
+                remaining_timeout=CACHE_TIMEOUT,
                 only_use_cache=True,
-                remaining_timeout=timeout - (time.time() - start_time),
             )
             if res:
                 return res
@@ -216,8 +231,8 @@ async def _get_element(
             prompt_or_elements,
             only_one,
             api_config,
-            only_use_cache=False,
             remaining_timeout=timeout - (time.time() - start_time),
+            only_use_cache=False,
         )
         if res:
             return res
@@ -227,6 +242,7 @@ async def _get_element(
         )
         return None
 
+
 async def test_if_cache_available(
     obj: DendritePageProtocol,
     prompt_or_elements: Union[str, Dict[str, str]],
@@ -240,13 +256,14 @@ async def test_if_cache_available(
     cache_available = await obj._get_browser_api_client().check_selector_cache(dto)
     return cache_available.exists
 
+
 async def attempt_with_backoff(
     obj: DendritePageProtocol,
     prompt_or_elements: Union[str, Dict[str, str]],
     only_one: bool,
     api_config: APIConfig,
+    remaining_timeout: float,
     only_use_cache: bool = False,
-    remaining_timeout: float = 15.0,
 ) -> Union[Optional[AsyncElement], List[AsyncElement], AsyncElementsResponse]:
     TIMEOUT_INTERVAL: List[float] = [0.15, 0.45, 1.0, 2.0, 4.0, 8.0]
     total_elapsed_time = 0
@@ -294,6 +311,7 @@ async def attempt_with_backoff(
     logger.error(f"All attempts failed after {total_elapsed_time:.2f} seconds")
     return None
 
+
 async def get_elements_from_selectors(
     obj: DendritePageProtocol, res: GetElementResponse, only_one: bool
 ) -> Union[Optional[AsyncElement], List[AsyncElement], AsyncElementsResponse]:
@@ -302,9 +320,7 @@ async def get_elements_from_selectors(
         for key, selectors in res.selectors.items():
             for selector in selectors:
                 page = await obj._get_page()
-                dendrite_elements = await page._get_all_elements_from_selector(
-                    selector
-                )
+                dendrite_elements = await page._get_all_elements_from_selector(selector)
                 if len(dendrite_elements) > 0:
                     result[key] = dendrite_elements[0]
                     break
diff --git a/dendrite_sdk/async_api/_core/mixin/markdown.py b/dendrite_sdk/async_api/_core/mixin/markdown.py
@@ -1,4 +1,7 @@
 from typing import Optional
+from bs4 import BeautifulSoup
+import re
+
 from dendrite_sdk.async_api._core.mixin.extract import ExtractionMixin
 from dendrite_sdk.async_api._core.protocol.page_protocol import DendritePageProtocol
 
@@ -10,8 +13,14 @@ async def markdown(self, prompt: Optional[str] = None):
         page = await self._get_page()
         page_information = await page.get_page_information()
         if prompt:
-            extract_prompt = f"Extract and return the html for this requested section of the website:\n\n{prompt}"
-            res = await self.extract(extract_prompt, str)
-            return md(res, heading_style="ATX")
+            extract_prompt = f"Create a script that returns the HTML from one element from the DOM that best matches this requested section of the website.\n\nDescription of section: '{prompt}'\n\nWe will be converting your returned HTML to markdown, so just return ONE stringified HTML element and nothing else. It's OK if extra information is present. Example script: 'response_data = soup.find('tag', {{'attribute': 'value'}}).prettify()'"
+            res = await self.extract(extract_prompt)
+            markdown_text = md(res)
+            # Remove excessive newlines (3 or more) and replace with 2 newlines
+            cleaned_markdown = re.sub(r"\n{3,}", "\n\n", markdown_text)
+            return cleaned_markdown
         else:
-            return md(page_information.raw_html)
+            markdown_text = md(page_information.raw_html)
+            # Remove excessive newlines (3 or more) and replace with 2 newlines
+            cleaned_markdown = re.sub(r"\n{3,}", "\n\n", markdown_text)
+            return cleaned_markdown
diff --git a/dendrite_sdk/async_api/_core/mixin/wait_for.py b/dendrite_sdk/async_api/_core/mixin/wait_for.py
@@ -47,7 +47,7 @@ async def wait_for(
 
             page = await self._get_page()
             page_information = await page.get_page_information()
-            prompt_with_instruction = f"Prompt: '{prompt}'\n\nReturn a boolean that determines if the requested information or thing is available on the page."
+            prompt_with_instruction = f"Prompt: '{prompt}'\n\nReturn a boolean that determines if the requested information or thing is available on the page. {round(page_information.time_since_frame_navigated, 2)} seconds have passed since the page first loaded."
 
             try:
                 res = await self.ask(prompt_with_instruction, bool)
diff --git a/dendrite_sdk/sync_api/_core/dendrite_browser.py b/dendrite_sdk/sync_api/_core/dendrite_browser.py
@@ -333,7 +333,7 @@ def _get_active_page_manager(self) -> PageManager:
             Exception: If there is an issue launching the browser or retrieving the PageManager.
         """
         if not self._active_page_manager:
-            _, _, active_page_manager = self._launch()
+            (_, _, active_page_manager) = self._launch()
             return active_page_manager
         return self._active_page_manager
 
diff --git a/dendrite_sdk/sync_api/_core/dendrite_page.py b/dendrite_sdk/sync_api/_core/dendrite_page.py
@@ -16,6 +16,7 @@
 from dendrite_sdk.sync_api._core.mixin.fill_fields import FillFieldsMixin
 from dendrite_sdk.sync_api._core.mixin.get_element import GetElementMixin
 from dendrite_sdk.sync_api._core.mixin.keyboard import KeyboardMixin
+from dendrite_sdk.sync_api._core.mixin.markdown import MarkdownMixin
 from dendrite_sdk.sync_api._core.mixin.wait_for import WaitForMixin
 from dendrite_sdk.sync_api._core.models.page_information import PageInformation
 
@@ -27,6 +28,7 @@
 
 
 class Page(
+    MarkdownMixin,
     ExtractionMixin,
     WaitForMixin,
     AskMixin,
diff --git a/dendrite_sdk/sync_api/_core/mixin/extract.py b/dendrite_sdk/sync_api/_core/mixin/extract.py
@@ -17,6 +17,8 @@
 from dendrite_sdk.sync_api._core._managers.navigation_tracker import NavigationTracker
 from loguru import logger
 
+CACHE_TIMEOUT = 5
+
 
 class ExtractionMixin(DendritePageProtocol):
     """
@@ -103,7 +105,9 @@ def extract(
             prompt (Optional[str]): The prompt to describe the information to extract.
             type_spec (Optional[TypeSpec], optional): The type specification for the extracted data.
             use_cache (bool, optional): Whether to use cached results. Defaults to True.
-            timeout (int, optional): The maximum time to wait for extraction in seconds. Defaults to 180 seconds, which is 3 minutes.
+            timeout (int, optional): Maximum time in milliseconds for the entire operation. If use_cache=True,
+                up to 5000ms will be spent attempting to use cached scripts before falling back to the
+                extraction agent for the remaining time that will attempt to generate a new script. Defaults to 15000 (15 seconds).
 
         Returns:
             ExtractResponse: The extracted data wrapped in a ExtractResponse object.
@@ -131,8 +135,8 @@ def extract(
                     self,
                     prompt,
                     json_schema,
+                    remaining_timeout=CACHE_TIMEOUT,
                     only_use_cache=True,
-                    remaining_timeout=timeout - (time.time() - start_time),
                 )
                 if result:
                     return convert_and_return_result(result, type_spec)
@@ -143,8 +147,8 @@ def extract(
             self,
             prompt,
             json_schema,
-            only_use_cache=False,
             remaining_timeout=timeout - (time.time() - start_time),
+            only_use_cache=False,
         )
         if result:
             return convert_and_return_result(result, type_spec)
@@ -173,8 +177,8 @@ def attempt_extraction_with_backoff(
     obj: DendritePageProtocol,
     prompt: str,
     json_schema: Optional[JsonSchema],
-    only_use_cache: bool = False,
     remaining_timeout: float = 180.0,
+    only_use_cache: bool = False,
 ) -> Optional[ExtractResponse]:
     TIMEOUT_INTERVAL: List[float] = [0.15, 0.45, 1.0, 2.0, 4.0, 8.0]
     total_elapsed_time = 0
diff --git a/dendrite_sdk/sync_api/_core/mixin/get_element.py b/dendrite_sdk/sync_api/_core/mixin/get_element.py
diff --git a/dendrite_sdk/sync_api/_core/mixin/markdown.py b/dendrite_sdk/sync_api/_core/mixin/markdown.py
diff --git a/dendrite_sdk/sync_api/_core/mixin/wait_for.py b/dendrite_sdk/sync_api/_core/mixin/wait_for.py
diff --git a/dendrite_sdk/sync_api/_dom/util/mild_strip.py b/dendrite_sdk/sync_api/_dom/util/mild_strip.py