Skip to content

Commit 84877a7

Browse files
authored
Den 444 Improve cache speed (#85)
1 parent 7f6a1af commit 84877a7

File tree

7 files changed

+192
-99
lines changed

7 files changed

+192
-99
lines changed

dendrite/__init__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818

1919
fmt = "<green>{time: HH:mm:ss.SSS}</green> | <level>{level: <8}</level>- <level>{message}</level>"
2020

21-
logger.add(sys.stderr, level="WARNING", format=fmt)
21+
logger.add(sys.stderr, level="DEBUG", format=fmt)
2222

2323

2424
__all__ = [

dendrite/async_api/_core/_utils.py

+81-47
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
1-
from typing import Union, List, TYPE_CHECKING
2-
from playwright.async_api import FrameLocator, ElementHandle, Error
1+
from typing import Optional, Union, List, TYPE_CHECKING
2+
from playwright.async_api import FrameLocator, ElementHandle, Error, Frame
33
from bs4 import BeautifulSoup
44
from loguru import logger
55

6+
from dendrite.async_api._api.response.get_element_response import GetElementResponse
67
from dendrite.async_api._core._type_spec import PlaywrightPage
78
from dendrite.async_api._core.dendrite_element import AsyncElement
9+
from dendrite.async_api._core.models.response import AsyncElementsResponse
810

911
if TYPE_CHECKING:
1012
from dendrite.async_api._core.dendrite_page import AsyncPage
@@ -18,51 +20,40 @@
1820
async def expand_iframes(
1921
page: PlaywrightPage,
2022
page_soup: BeautifulSoup,
21-
iframe_path: str = "",
22-
frame: Union[ElementHandle, None] = None,
2323
):
24-
25-
if frame is None:
26-
iframes = await page.query_selector_all("iframe")
27-
else:
28-
content_frame = await frame.content_frame()
29-
if not content_frame:
30-
return
31-
iframes = await content_frame.query_selector_all("iframe")
32-
for iframe in iframes:
33-
# TODO: check whether the iframe lacks a document or body; if so, skip it
34-
iframe_id = await iframe.get_attribute("d-id")
24+
async def get_iframe_path(frame: Frame):
25+
path_parts = []
26+
current_frame = frame
27+
while current_frame.parent_frame is not None:
28+
iframe_element = await current_frame.frame_element()
29+
iframe_id = await iframe_element.get_attribute("d-id")
30+
if iframe_id is None:
31+
# If any iframe_id in the path is None, we cannot build the path
32+
return None
33+
path_parts.insert(0, iframe_id)
34+
current_frame = current_frame.parent_frame
35+
return "|".join(path_parts)
36+
37+
for frame in page.frames:
38+
if frame.parent_frame is None:
39+
continue # Skip the main frame
40+
iframe_element = await frame.frame_element()
41+
iframe_id = await iframe_element.get_attribute("d-id")
3542
if iframe_id is None:
3643
continue
37-
38-
new_iframe_path = ""
39-
if iframe_path:
40-
new_iframe_path = f"{iframe_path}|"
41-
new_iframe_path = f"{new_iframe_path}{iframe_id}"
42-
44+
iframe_path = await get_iframe_path(frame)
45+
if iframe_path is None:
46+
continue
4347
try:
44-
content_frame = await iframe.content_frame()
45-
46-
if content_frame is None:
47-
continue
48-
49-
await content_frame.evaluate(
50-
GENERATE_DENDRITE_IDS_IFRAME_SCRIPT, {"frame_path": new_iframe_path}
48+
await frame.evaluate(
49+
GENERATE_DENDRITE_IDS_IFRAME_SCRIPT, {"frame_path": iframe_path}
5150
)
52-
53-
frame_content = await content_frame.content()
54-
55-
frame_tree = BeautifulSoup(frame_content, "html.parser")
51+
frame_content = await frame.content()
52+
frame_tree = BeautifulSoup(frame_content, "lxml")
5653
mild_strip_in_place(frame_tree)
5754
merge_iframe_to_page(iframe_id, page_soup, frame_tree)
58-
await expand_iframes(
59-
page,
60-
page_soup,
61-
new_iframe_path,
62-
iframe,
63-
)
6455
except Error as e:
65-
logger.debug(f"Error getting content frame for iframe {iframe_id}: {e}")
56+
logger.debug(f"Error processing frame {iframe_id}: {e}")
6657
continue
6758

6859

@@ -79,11 +70,54 @@ def merge_iframe_to_page(
7970
iframe_element.replace_with(iframe)
8071

8172

82-
def get_frame_context(
83-
page: PlaywrightPage, iframe_path: str
84-
) -> Union[FrameLocator, PlaywrightPage]:
85-
iframe_path_list = iframe_path.split("|")
86-
frame_context = page
87-
for iframe_id in iframe_path_list:
88-
frame_context = frame_context.frame_locator(f"[tf623_id='{iframe_id}']")
89-
return frame_context
73+
async def _get_all_elements_from_selector_soup(
    selector: str, soup: BeautifulSoup, page: "AsyncPage"
) -> List[AsyncElement]:
    """
    Build an AsyncElement for every node in ``soup`` that matches ``selector``.

    Args:
        selector: CSS selector evaluated against ``soup`` with ``soup.select``.
        soup: Parsed page snapshot to search.
        page: Page used to resolve the locator context of each node and to
            supply the browser and API client passed to each AsyncElement.

    Returns:
        One AsyncElement per matching node that carries a non-empty ``d-id``
        attribute, in the order ``soup.select`` yields them. Nodes without a
        ``d-id`` are skipped.
    """
    dendrite_elements: List[AsyncElement] = []

    for element in soup.select(selector):
        d_id = element.get("d-id", "")

        # The attribute value may come back as a list; reduce it to a single
        # id *before* it is interpolated, so the XPath never contains a list repr.
        if isinstance(d_id, list):
            d_id = d_id[0] if d_id else ""

        # Skip early: without an id there is nothing to locate, so do not
        # resolve a context or build a locator for this node.
        if not d_id:
            continue

        context = page._get_context(element)
        locator = context.locator(f"xpath=//*[@d-id='{d_id}']")
        dendrite_elements.append(
            AsyncElement(d_id, locator, page.dendrite_browser, page._browser_api_client)
        )

    return dendrite_elements
95+
96+
97+
async def get_elements_from_selectors_soup(
98+
page: "AsyncPage",
99+
soup: BeautifulSoup,
100+
res: GetElementResponse,
101+
only_one: bool,
102+
) -> Union[Optional[AsyncElement], List[AsyncElement], AsyncElementsResponse]:
103+
if isinstance(res.selectors, dict):
104+
result = {}
105+
for key, selectors in res.selectors.items():
106+
for selector in selectors:
107+
dendrite_elements = await _get_all_elements_from_selector_soup(
108+
selector, soup, page
109+
)
110+
if len(dendrite_elements) > 0:
111+
result[key] = dendrite_elements[0]
112+
break
113+
return AsyncElementsResponse(result)
114+
elif isinstance(res.selectors, list):
115+
for selector in reversed(res.selectors):
116+
dendrite_elements = await _get_all_elements_from_selector_soup(
117+
selector, soup, page
118+
)
119+
120+
if len(dendrite_elements) > 0:
121+
return dendrite_elements[0] if only_one else dendrite_elements
122+
123+
return None

dendrite/async_api/_core/dendrite_page.py

+10
Original file line numberDiff line numberDiff line change
@@ -357,8 +357,18 @@ async def _get_soup(self) -> BeautifulSoup:
357357
page_source = await self.playwright_page.content()
358358
soup = BeautifulSoup(page_source, "lxml")
359359
await self._expand_iframes(soup)
360+
self._previous_soup = soup
360361
return soup
361362

363+
async def _get_previous_soup(self) -> BeautifulSoup:
364+
"""
365+
Retrieves the page source generated by the latest _get_soup() call as a Beautiful soup object. If it hasn't been called yet, it will call it.
366+
"""
367+
368+
if self._previous_soup is None:
369+
return await self._get_soup()
370+
return self._previous_soup
371+
362372
async def _expand_iframes(self, page_source: BeautifulSoup):
363373
"""
364374
Expands iframes in the given page source to make their content accessible.

dendrite/async_api/_core/mixin/get_element.py

+11-7
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from dendrite.async_api._api.dto.get_elements_dto import GetElementsDTO
88
from dendrite.async_api._api.response.get_element_response import GetElementResponse
99
from dendrite.async_api._api.dto.get_elements_dto import CheckSelectorCacheDTO
10+
from dendrite.async_api._core._utils import get_elements_from_selectors_soup
1011
from dendrite.async_api._core.dendrite_element import AsyncElement
1112
from dendrite.async_api._core.models.response import AsyncElementsResponse
1213
from dendrite.async_api._core.protocol.page_protocol import DendritePageProtocol
@@ -202,7 +203,10 @@ async def _get_element(
202203
start_time = time.time()
203204

204205
# First, let's check if there is a cached selector
205-
cache_available = await test_if_cache_available(self, prompt_or_elements)
206+
page = await self._get_page()
207+
cache_available = await test_if_cache_available(
208+
self, prompt_or_elements, page.url
209+
)
206210

207211
# If we have cached elements, attempt to use them with an exponential backoff
208212
if cache_available and use_cache == True:
@@ -244,16 +248,14 @@ async def _get_element(
244248

245249

246250
async def test_if_cache_available(
247-
obj: DendritePageProtocol,
248-
prompt_or_elements: Union[str, Dict[str, str]],
251+
obj: DendritePageProtocol, prompt_or_elements: Union[str, Dict[str, str]], url: str
249252
) -> bool:
250-
page = await obj._get_page()
251-
page_information = await page.get_page_information(include_screenshot=False)
252253
dto = CheckSelectorCacheDTO(
253-
url=page_information.url,
254+
url=url,
254255
prompt=prompt_or_elements,
255256
)
256257
cache_available = await obj._get_browser_api_client().check_selector_cache(dto)
258+
257259
return cache_available.exists
258260

259261

@@ -297,7 +299,9 @@ async def attempt_with_backoff(
297299
return None
298300

299301
if res.status == "success":
300-
response = await get_elements_from_selectors(obj, res, only_one)
302+
response = await get_elements_from_selectors_soup(
303+
page, await page._get_previous_soup(), res, only_one
304+
)
301305
if response:
302306
return response
303307

dendrite/sync_api/_core/_utils.py

+72-38
Original file line numberDiff line numberDiff line change
@@ -1,51 +1,52 @@
1-
from typing import Union, List, TYPE_CHECKING
2-
from playwright.sync_api import FrameLocator, ElementHandle, Error
1+
from typing import Optional, Union, List, TYPE_CHECKING
2+
from playwright.sync_api import FrameLocator, ElementHandle, Error, Frame
33
from bs4 import BeautifulSoup
44
from loguru import logger
5+
from dendrite.sync_api._api.response.get_element_response import GetElementResponse
56
from dendrite.sync_api._core._type_spec import PlaywrightPage
67
from dendrite.sync_api._core.dendrite_element import Element
8+
from dendrite.sync_api._core.models.response import ElementsResponse
79

810
if TYPE_CHECKING:
911
from dendrite.sync_api._core.dendrite_page import Page
1012
from dendrite.sync_api._core._js import GENERATE_DENDRITE_IDS_IFRAME_SCRIPT
1113
from dendrite.sync_api._dom.util.mild_strip import mild_strip_in_place
1214

1315

14-
def expand_iframes(
15-
page: PlaywrightPage,
16-
page_soup: BeautifulSoup,
17-
iframe_path: str = "",
18-
frame: Union[ElementHandle, None] = None,
19-
):
20-
if frame is None:
21-
iframes = page.query_selector_all("iframe")
22-
else:
23-
content_frame = frame.content_frame()
24-
if not content_frame:
25-
return
26-
iframes = content_frame.query_selector_all("iframe")
27-
for iframe in iframes:
28-
iframe_id = iframe.get_attribute("d-id")
16+
def expand_iframes(page: PlaywrightPage, page_soup: BeautifulSoup):
17+
18+
def get_iframe_path(frame: Frame):
19+
path_parts = []
20+
current_frame = frame
21+
while current_frame.parent_frame is not None:
22+
iframe_element = current_frame.frame_element()
23+
iframe_id = iframe_element.get_attribute("d-id")
24+
if iframe_id is None:
25+
return None
26+
path_parts.insert(0, iframe_id)
27+
current_frame = current_frame.parent_frame
28+
return "|".join(path_parts)
29+
30+
for frame in page.frames:
31+
if frame.parent_frame is None:
32+
continue
33+
iframe_element = frame.frame_element()
34+
iframe_id = iframe_element.get_attribute("d-id")
2935
if iframe_id is None:
3036
continue
31-
new_iframe_path = ""
32-
if iframe_path:
33-
new_iframe_path = f"{iframe_path}|"
34-
new_iframe_path = f"{new_iframe_path}{iframe_id}"
37+
iframe_path = get_iframe_path(frame)
38+
if iframe_path is None:
39+
continue
3540
try:
36-
content_frame = iframe.content_frame()
37-
if content_frame is None:
38-
continue
39-
content_frame.evaluate(
40-
GENERATE_DENDRITE_IDS_IFRAME_SCRIPT, {"frame_path": new_iframe_path}
41+
frame.evaluate(
42+
GENERATE_DENDRITE_IDS_IFRAME_SCRIPT, {"frame_path": iframe_path}
4143
)
42-
frame_content = content_frame.content()
43-
frame_tree = BeautifulSoup(frame_content, "html.parser")
44+
frame_content = frame.content()
45+
frame_tree = BeautifulSoup(frame_content, "lxml")
4446
mild_strip_in_place(frame_tree)
4547
merge_iframe_to_page(iframe_id, page_soup, frame_tree)
46-
expand_iframes(page, page_soup, new_iframe_path, iframe)
4748
except Error as e:
48-
logger.debug(f"Error getting content frame for iframe {iframe_id}: {e}")
49+
logger.debug(f"Error processing frame {iframe_id}: {e}")
4950
continue
5051

5152

@@ -57,11 +58,44 @@ def merge_iframe_to_page(iframe_id: str, page: BeautifulSoup, iframe: BeautifulS
5758
iframe_element.replace_with(iframe)
5859

5960

60-
def get_frame_context(
61-
page: PlaywrightPage, iframe_path: str
62-
) -> Union[FrameLocator, PlaywrightPage]:
63-
iframe_path_list = iframe_path.split("|")
64-
frame_context = page
65-
for iframe_id in iframe_path_list:
66-
frame_context = frame_context.frame_locator(f"[tf623_id='{iframe_id}']")
67-
return frame_context
61+
def _get_all_elements_from_selector_soup(
62+
selector: str, soup: BeautifulSoup, page: "Page"
63+
) -> List[Element]:
64+
dendrite_elements: List[Element] = []
65+
elements = soup.select(selector)
66+
for element in elements:
67+
frame = page._get_context(element)
68+
d_id = element.get("d-id", "")
69+
locator = frame.locator(f"xpath=//*[@d-id='{d_id}']")
70+
if not d_id:
71+
continue
72+
if isinstance(d_id, list):
73+
d_id = d_id[0]
74+
dendrite_elements.append(
75+
Element(d_id, locator, page.dendrite_browser, page._browser_api_client)
76+
)
77+
return dendrite_elements
78+
79+
80+
def get_elements_from_selectors_soup(
81+
page: "Page", soup: BeautifulSoup, res: GetElementResponse, only_one: bool
82+
) -> Union[Optional[Element], List[Element], ElementsResponse]:
83+
if isinstance(res.selectors, dict):
84+
result = {}
85+
for key, selectors in res.selectors.items():
86+
for selector in selectors:
87+
dendrite_elements = _get_all_elements_from_selector_soup(
88+
selector, soup, page
89+
)
90+
if len(dendrite_elements) > 0:
91+
result[key] = dendrite_elements[0]
92+
break
93+
return ElementsResponse(result)
94+
elif isinstance(res.selectors, list):
95+
for selector in reversed(res.selectors):
96+
dendrite_elements = _get_all_elements_from_selector_soup(
97+
selector, soup, page
98+
)
99+
if len(dendrite_elements) > 0:
100+
return dendrite_elements[0] if only_one else dendrite_elements
101+
return None

dendrite/sync_api/_core/dendrite_page.py

+9
Original file line numberDiff line numberDiff line change
@@ -299,8 +299,17 @@ def _get_soup(self) -> BeautifulSoup:
299299
page_source = self.playwright_page.content()
300300
soup = BeautifulSoup(page_source, "lxml")
301301
self._expand_iframes(soup)
302+
self._previous_soup = soup
302303
return soup
303304

305+
def _get_previous_soup(self) -> BeautifulSoup:
306+
"""
307+
Retrieves the page source generated by the latest _get_soup() call as a Beautiful soup object. If it hasn't been called yet, it will call it.
308+
"""
309+
if self._previous_soup is None:
310+
return self._get_soup()
311+
return self._previous_soup
312+
304313
def _expand_iframes(self, page_source: BeautifulSoup):
305314
"""
306315
Expands iframes in the given page source to make their content accessible.

0 commit comments

Comments
 (0)