Skip to content

Commit 90092c4

Browse files
Den 422 fix timeout of extract and get element (#68)
* Added markdown mixin to page and improved markdown function. Also improved wait_for. * Added strict timeout to cache. * Fixed incorrect timeout and updated docstrings. Built sync.
1 parent 9218f9b commit 90092c4

File tree

12 files changed

+117
-53
lines changed

12 files changed

+117
-53
lines changed

dendrite_sdk/async_api/_core/dendrite_page.py

+2
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
from dendrite_sdk.async_api._core.mixin.fill_fields import FillFieldsMixin
3535
from dendrite_sdk.async_api._core.mixin.get_element import GetElementMixin
3636
from dendrite_sdk.async_api._core.mixin.keyboard import KeyboardMixin
37+
from dendrite_sdk.async_api._core.mixin.markdown import MarkdownMixin
3738
from dendrite_sdk.async_api._core.mixin.wait_for import WaitForMixin
3839
from dendrite_sdk.async_api._core.models.page_information import PageInformation
3940

@@ -54,6 +55,7 @@
5455

5556

5657
class AsyncPage(
58+
MarkdownMixin,
5759
ExtractionMixin,
5860
WaitForMixin,
5961
AskMixin,

dendrite_sdk/async_api/_core/mixin/extract.py

+13-6
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,9 @@
1818
from loguru import logger
1919

2020

21+
CACHE_TIMEOUT = 5
22+
23+
2124
class ExtractionMixin(DendritePageProtocol):
2225
"""
2326
Mixin that provides extraction functionality for web pages.
@@ -103,7 +106,9 @@ async def extract(
103106
prompt (Optional[str]): The prompt to describe the information to extract.
104107
type_spec (Optional[TypeSpec], optional): The type specification for the extracted data.
105108
use_cache (bool, optional): Whether to use cached results. Defaults to True.
106-
timeout (int, optional): The maximum time to wait for extraction in seconds. Defaults to 180 seconds, which is 3 minutes.
109+
timeout (int, optional): Maximum time in milliseconds for the entire operation. If use_cache=True,
110+
up to 5000ms will be spent attempting to use cached scripts before falling back to the
111+
extraction agent for the remaining time that will attempt to generate a new script. Defaults to 15000 (15 seconds).
107112
108113
Returns:
109114
ExtractResponse: The extracted data wrapped in a ExtractResponse object.
@@ -129,8 +134,7 @@ async def extract(
129134
# Check if a script exists in the cache
130135
if use_cache:
131136
cache_available = await check_if_extract_cache_available(
132-
self,
133-
prompt, json_schema
137+
self, prompt, json_schema
134138
)
135139

136140
if cache_available:
@@ -139,8 +143,8 @@ async def extract(
139143
self,
140144
prompt,
141145
json_schema,
146+
remaining_timeout=CACHE_TIMEOUT,
142147
only_use_cache=True,
143-
remaining_timeout=timeout - (time.time() - start_time),
144148
)
145149
if result:
146150
return convert_and_return_result(result, type_spec)
@@ -152,8 +156,8 @@ async def extract(
152156
self,
153157
prompt,
154158
json_schema,
155-
only_use_cache=False,
156159
remaining_timeout=timeout - (time.time() - start_time),
160+
only_use_cache=False,
157161
)
158162

159163
if result:
@@ -162,6 +166,7 @@ async def extract(
162166
logger.error(f"Extraction failed after {time.time() - start_time:.2f} seconds")
163167
return None
164168

169+
165170
async def check_if_extract_cache_available(
166171
obj: DendritePageProtocol, prompt: str, json_schema: Optional[JsonSchema]
167172
) -> bool:
@@ -178,12 +183,13 @@ async def check_if_extract_cache_available(
178183
)
179184
return cache_response.exists
180185

186+
181187
async def attempt_extraction_with_backoff(
182188
obj: DendritePageProtocol,
183189
prompt: str,
184190
json_schema: Optional[JsonSchema],
185-
only_use_cache: bool = False,
186191
remaining_timeout: float = 180.0,
192+
only_use_cache: bool = False,
187193
) -> Optional[ExtractResponse]:
188194
TIMEOUT_INTERVAL: List[float] = [0.15, 0.45, 1.0, 2.0, 4.0, 8.0]
189195
total_elapsed_time = 0
@@ -234,6 +240,7 @@ async def attempt_extraction_with_backoff(
234240
)
235241
return None
236242

243+
237244
def convert_and_return_result(
238245
res: ExtractResponse, type_spec: Optional[TypeSpec]
239246
) -> TypeSpec:

dendrite_sdk/async_api/_core/mixin/get_element.py

+34-18
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,9 @@
1313
from dendrite_sdk.async_api._core.models.api_config import APIConfig
1414

1515

16+
CACHE_TIMEOUT = 5
17+
18+
1619
class GetElementMixin(DendritePageProtocol):
1720
@overload
1821
async def get_elements(
@@ -28,7 +31,9 @@ async def get_elements(
2831
Args:
2932
prompt_or_elements (str): The prompt describing the elements to be retrieved.
3033
use_cache (bool, optional): Whether to use cached results. Defaults to True.
31-
timeout (int, optional): The total timeout (in milliseconds) until the last request is sent to the API. Defaults to 15000 (15 seconds).
34+
timeout (int, optional): Maximum time in milliseconds for the entire operation. If use_cache=True,
35+
up to 5000ms will be spent attempting to use cached selectors before falling back to the
36+
find element agent for the remaining time. Defaults to 15000 (15 seconds).
3237
context (str, optional): Additional context for the retrieval. Defaults to an empty string.
3338
3439
Returns:
@@ -49,7 +54,9 @@ async def get_elements(
4954
Args:
5055
prompt_or_elements (Dict[str, str]): A dictionary where keys are field names and values are prompts describing the elements to be retrieved.
5156
use_cache (bool, optional): Whether to use cached results. Defaults to True.
52-
timeout (int, optional): The total timeout (in milliseconds) until the last request is sent to the API. Defaults to 3000.
57+
timeout (int, optional): Maximum time in milliseconds for the entire operation. If use_cache=True,
58+
up to 5000ms will be spent attempting to use cached selectors before falling back to the
59+
find element agent for the remaining time. Defaults to 15000 (15 seconds).
5360
context (str, optional): Additional context for the retrieval. Defaults to an empty string.
5461
5562
Returns:
@@ -72,7 +79,9 @@ async def get_elements(
7279
Args:
7380
prompt_or_elements (Union[str, Dict[str, str]]): The prompt or dictionary of prompts for element retrieval.
7481
use_cache (bool, optional): Whether to use cached results. Defaults to True.
75-
timeout (int, optional): The total timeout (in milliseconds) until the last request is sent to the API. Defaults to 3000.
82+
timeout (int, optional): Maximum time in milliseconds for the entire operation. If use_cache=True,
83+
up to 5000ms will be spent attempting to use cached selectors before falling back to the
84+
find element agent for the remaining time. Defaults to 15000 (15 seconds).
7685
context (str, optional): Additional context for the retrieval. Defaults to an empty string.
7786
7887
Returns:
@@ -86,11 +95,9 @@ async def get_elements(
8695
prompt_or_elements,
8796
only_one=False,
8897
use_cache=use_cache,
89-
timeout=timeout,
98+
timeout=timeout / 1000,
9099
)
91100

92-
raise ValueError("Prompt must be either a string prompt or a dictionary")
93-
94101
async def get_element(
95102
self,
96103
prompt: str,
@@ -103,7 +110,9 @@ async def get_element(
103110
Args:
104111
prompt (str): The prompt describing the element to be retrieved.
105112
use_cache (bool, optional): Whether to use cached results. Defaults to True.
106-
timeout (int, optional): The total timeout (in milliseconds) until the last request is sent to the API. Defaults to 15000 (15 seconds).
113+
timeout (int, optional): Maximum time in milliseconds for the entire operation. If use_cache=True,
114+
up to 5000ms will be spent attempting to use cached selectors before falling back to the
115+
find element agent for the remaining time. Defaults to 15000 (15 seconds).
107116
108117
Returns:
109118
AsyncElement: The retrieved element.
@@ -112,7 +121,7 @@ async def get_element(
112121
prompt,
113122
only_one=True,
114123
use_cache=use_cache,
115-
timeout=timeout,
124+
timeout=timeout / 1000,
116125
)
117126

118127
@overload
@@ -130,7 +139,9 @@ async def _get_element(
130139
prompt (Union[str, Dict[str, str]]): The prompt describing the element to be retrieved.
131140
only_one (Literal[True]): Indicates that only one element should be retrieved.
132141
use_cache (bool): Whether to use cached results.
133-
timeout: The total timeout (in milliseconds) until the last request is sent to the API.
142+
timeout (int, optional): Maximum time in milliseconds for the entire operation. If use_cache=True,
143+
up to 5000ms will be spent attempting to use cached selectors before falling back to the
144+
find element agent for the remaining time. Defaults to 15000 (15 seconds).
134145
135146
Returns:
136147
AsyncElement: The retrieved element.
@@ -151,7 +162,9 @@ async def _get_element(
151162
prompt (str): The prompt describing the elements to be retrieved.
152163
only_one (Literal[False]): Indicates that multiple elements should be retrieved.
153164
use_cache (bool): Whether to use cached results.
154-
timeout: The total timeout (in milliseconds) until the last request is sent to the API.
165+
timeout (int, optional): Maximum time in milliseconds for the entire operation. If use_cache=True,
166+
up to 5000ms will be spent attempting to use cached selectors before falling back to the
167+
find element agent for the remaining time. Defaults to 15000 (15 seconds).
155168
156169
Returns:
157170
List[AsyncElement]: A list of retrieved elements.
@@ -177,7 +190,9 @@ async def _get_element(
177190
prompt_or_elements (Union[str, Dict[str, str]]): The prompt or dictionary of prompts for element retrieval.
178191
only_one (bool): Whether to retrieve only one element or a list of elements.
179192
use_cache (bool): Whether to use cached results.
180-
timeout (float): The total timeout (in seconds) for the entire operation.
193+
timeout (int, optional): Maximum time in milliseconds for the entire operation. If use_cache=True,
194+
up to 5000ms will be spent attempting to use cached selectors before falling back to the
195+
find element agent for the remaining time. Defaults to 15000 (15 seconds).
181196
182197
Returns:
183198
Union[AsyncElement, List[AsyncElement], AsyncElementsResponse]: The retrieved element, list of elements, or response object.
@@ -187,7 +202,7 @@ async def _get_element(
187202
start_time = time.time()
188203

189204
# First, let's check if there is a cached selector
190-
cache_available = await test_if_cache_available(self,prompt_or_elements)
205+
cache_available = await test_if_cache_available(self, prompt_or_elements)
191206

192207
# If we have cached elements, attempt to use them with an exponential backoff
193208
if cache_available and use_cache == True:
@@ -197,8 +212,8 @@ async def _get_element(
197212
prompt_or_elements,
198213
only_one,
199214
api_config,
215+
remaining_timeout=CACHE_TIMEOUT,
200216
only_use_cache=True,
201-
remaining_timeout=timeout - (time.time() - start_time),
202217
)
203218
if res:
204219
return res
@@ -216,8 +231,8 @@ async def _get_element(
216231
prompt_or_elements,
217232
only_one,
218233
api_config,
219-
only_use_cache=False,
220234
remaining_timeout=timeout - (time.time() - start_time),
235+
only_use_cache=False,
221236
)
222237
if res:
223238
return res
@@ -227,6 +242,7 @@ async def _get_element(
227242
)
228243
return None
229244

245+
230246
async def test_if_cache_available(
231247
obj: DendritePageProtocol,
232248
prompt_or_elements: Union[str, Dict[str, str]],
@@ -240,13 +256,14 @@ async def test_if_cache_available(
240256
cache_available = await obj._get_browser_api_client().check_selector_cache(dto)
241257
return cache_available.exists
242258

259+
243260
async def attempt_with_backoff(
244261
obj: DendritePageProtocol,
245262
prompt_or_elements: Union[str, Dict[str, str]],
246263
only_one: bool,
247264
api_config: APIConfig,
265+
remaining_timeout: float,
248266
only_use_cache: bool = False,
249-
remaining_timeout: float = 15.0,
250267
) -> Union[Optional[AsyncElement], List[AsyncElement], AsyncElementsResponse]:
251268
TIMEOUT_INTERVAL: List[float] = [0.15, 0.45, 1.0, 2.0, 4.0, 8.0]
252269
total_elapsed_time = 0
@@ -294,6 +311,7 @@ async def attempt_with_backoff(
294311
logger.error(f"All attempts failed after {total_elapsed_time:.2f} seconds")
295312
return None
296313

314+
297315
async def get_elements_from_selectors(
298316
obj: DendritePageProtocol, res: GetElementResponse, only_one: bool
299317
) -> Union[Optional[AsyncElement], List[AsyncElement], AsyncElementsResponse]:
@@ -302,9 +320,7 @@ async def get_elements_from_selectors(
302320
for key, selectors in res.selectors.items():
303321
for selector in selectors:
304322
page = await obj._get_page()
305-
dendrite_elements = await page._get_all_elements_from_selector(
306-
selector
307-
)
323+
dendrite_elements = await page._get_all_elements_from_selector(selector)
308324
if len(dendrite_elements) > 0:
309325
result[key] = dendrite_elements[0]
310326
break
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
11
from typing import Optional
2+
from bs4 import BeautifulSoup
3+
import re
4+
25
from dendrite_sdk.async_api._core.mixin.extract import ExtractionMixin
36
from dendrite_sdk.async_api._core.protocol.page_protocol import DendritePageProtocol
47

@@ -10,8 +13,14 @@ async def markdown(self, prompt: Optional[str] = None):
1013
page = await self._get_page()
1114
page_information = await page.get_page_information()
1215
if prompt:
13-
extract_prompt = f"Extract and return the html for this requested section of the website:\n\n{prompt}"
14-
res = await self.extract(extract_prompt, str)
15-
return md(res, heading_style="ATX")
16+
extract_prompt = f"Create a script that returns the HTML from one element from the DOM that best matches this requested section of the website.\n\nDescription of section: '{prompt}'\n\nWe will be converting your returned HTML to markdown, so just return ONE stringified HTML element and nothing else. It's OK if extra information is present. Example script: 'response_data = soup.find('tag', {{'attribute': 'value'}}).prettify()'"
17+
res = await self.extract(extract_prompt)
18+
markdown_text = md(res)
19+
# Remove excessive newlines (3 or more) and replace with 2 newlines
20+
cleaned_markdown = re.sub(r"\n{3,}", "\n\n", markdown_text)
21+
return cleaned_markdown
1622
else:
17-
return md(page_information.raw_html)
23+
markdown_text = md(page_information.raw_html)
24+
# Remove excessive newlines (3 or more) and replace with 2 newlines
25+
cleaned_markdown = re.sub(r"\n{3,}", "\n\n", markdown_text)
26+
return cleaned_markdown

dendrite_sdk/async_api/_core/mixin/wait_for.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ async def wait_for(
4747

4848
page = await self._get_page()
4949
page_information = await page.get_page_information()
50-
prompt_with_instruction = f"Prompt: '{prompt}'\n\nReturn a boolean that determines if the requested information or thing is available on the page."
50+
prompt_with_instruction = f"Prompt: '{prompt}'\n\nReturn a boolean that determines if the requested information or thing is available on the page. {round(page_information.time_since_frame_navigated, 2)} seconds have passed since the page first loaded."
5151

5252
try:
5353
res = await self.ask(prompt_with_instruction, bool)

dendrite_sdk/sync_api/_core/dendrite_browser.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -333,7 +333,7 @@ def _get_active_page_manager(self) -> PageManager:
333333
Exception: If there is an issue launching the browser or retrieving the PageManager.
334334
"""
335335
if not self._active_page_manager:
336-
_, _, active_page_manager = self._launch()
336+
(_, _, active_page_manager) = self._launch()
337337
return active_page_manager
338338
return self._active_page_manager
339339

dendrite_sdk/sync_api/_core/dendrite_page.py

+2
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
from dendrite_sdk.sync_api._core.mixin.fill_fields import FillFieldsMixin
1717
from dendrite_sdk.sync_api._core.mixin.get_element import GetElementMixin
1818
from dendrite_sdk.sync_api._core.mixin.keyboard import KeyboardMixin
19+
from dendrite_sdk.sync_api._core.mixin.markdown import MarkdownMixin
1920
from dendrite_sdk.sync_api._core.mixin.wait_for import WaitForMixin
2021
from dendrite_sdk.sync_api._core.models.page_information import PageInformation
2122

@@ -27,6 +28,7 @@
2728

2829

2930
class Page(
31+
MarkdownMixin,
3032
ExtractionMixin,
3133
WaitForMixin,
3234
AskMixin,

dendrite_sdk/sync_api/_core/mixin/extract.py

+8-4
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@
1717
from dendrite_sdk.sync_api._core._managers.navigation_tracker import NavigationTracker
1818
from loguru import logger
1919

20+
CACHE_TIMEOUT = 5
21+
2022

2123
class ExtractionMixin(DendritePageProtocol):
2224
"""
@@ -103,7 +105,9 @@ def extract(
103105
prompt (Optional[str]): The prompt to describe the information to extract.
104106
type_spec (Optional[TypeSpec], optional): The type specification for the extracted data.
105107
use_cache (bool, optional): Whether to use cached results. Defaults to True.
106-
timeout (int, optional): The maximum time to wait for extraction in seconds. Defaults to 180 seconds, which is 3 minutes.
108+
timeout (int, optional): Maximum time in milliseconds for the entire operation. If use_cache=True,
109+
up to 5000ms will be spent attempting to use cached scripts before falling back to the
110+
extraction agent for the remaining time that will attempt to generate a new script. Defaults to 15000 (15 seconds).
107111
108112
Returns:
109113
ExtractResponse: The extracted data wrapped in a ExtractResponse object.
@@ -131,8 +135,8 @@ def extract(
131135
self,
132136
prompt,
133137
json_schema,
138+
remaining_timeout=CACHE_TIMEOUT,
134139
only_use_cache=True,
135-
remaining_timeout=timeout - (time.time() - start_time),
136140
)
137141
if result:
138142
return convert_and_return_result(result, type_spec)
@@ -143,8 +147,8 @@ def extract(
143147
self,
144148
prompt,
145149
json_schema,
146-
only_use_cache=False,
147150
remaining_timeout=timeout - (time.time() - start_time),
151+
only_use_cache=False,
148152
)
149153
if result:
150154
return convert_and_return_result(result, type_spec)
@@ -173,8 +177,8 @@ def attempt_extraction_with_backoff(
173177
obj: DendritePageProtocol,
174178
prompt: str,
175179
json_schema: Optional[JsonSchema],
176-
only_use_cache: bool = False,
177180
remaining_timeout: float = 180.0,
181+
only_use_cache: bool = False,
178182
) -> Optional[ExtractResponse]:
179183
TIMEOUT_INTERVAL: List[float] = [0.15, 0.45, 1.0, 2.0, 4.0, 8.0]
180184
total_elapsed_time = 0

0 commit comments

Comments
 (0)