Skip to content

Commit

Permalink
feat(extract-data): extract data from same-origin iframe (#258)
Browse files Browse the repository at this point in the history
* feat: extract data from same-origin iframe

* fix: ci snapshot

* fix: extracting timeout error

* fix: ci timeout

* fix: update assets

* feat: set default size of yaml as 1920x1080

* chore: update default viewport size

---------

Co-authored-by: zhouxiao.shaw <[email protected]>
  • Loading branch information
yuyutaotao and zhoushaw authored Jan 22, 2025
1 parent 37d8aad commit 57f6786
Show file tree
Hide file tree
Showing 19 changed files with 277 additions and 117 deletions.
2 changes: 1 addition & 1 deletion apps/site/docs/en/faq.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ There are some limitations with Midscene. We are still working on them.

1. The interaction types are limited to only tap, type, keyboard press, and scroll.
2. LLMs are not 100% stable. Even GPT-4o can't return the right answer all the time. Following the [Prompting Tips](./prompting-tips) will help improve stability.
3. Since we use JavaScript to retrieve elements from the page, the elements inside the iframe cannot be accessed.
3. Since we use JavaScript to retrieve elements from the page, elements inside cross-origin iframes cannot be accessed.
4. We cannot access the native elements of Chrome, like the right-click context menu or file upload dialog.
5. Do not use Midscene to bypass CAPTCHA. Some LLM services are set to decline requests that involve CAPTCHA-solving (e.g., OpenAI), while the DOM of some CAPTCHA pages is not accessible by regular web scraping methods. Therefore, using Midscene to bypass CAPTCHA is not a reliable method.

Expand Down
2 changes: 1 addition & 1 deletion apps/site/docs/zh/faq.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ Midscene 存在一些局限性,我们仍在努力改进。

1. 交互类型有限:目前仅支持点击、输入、键盘和滚动操作。
2. 稳定性风险:即使是 GPT-4o 也无法确保 100% 返回正确答案。遵循 [编写提示词的技巧](./prompting-tips) 可以帮助提高 SDK 稳定性。
3. 元素访问受限:由于我们使用 JavaScript 从页面提取元素,所以无法访问 iframe 内部的元素。
3. 元素访问受限:由于我们使用 JavaScript 从页面提取元素,所以无法访问跨域 iframe 内部的元素。
4. 无法访问 Chrome 原生元素:无法访问右键菜单、文件上传对话框等。
5. 无法绕过验证码:有些 LLM 服务会拒绝涉及验证码解决的请求(例如 OpenAI),而有些验证码页面的 DOM 无法通过常规的网页抓取方法访问。因此,使用 Midscene 绕过验证码不是一个可靠的方法。

Expand Down
1 change: 1 addition & 0 deletions packages/web-integration/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,7 @@
"devtools-protocol": "0.0.1380148",
"dotenv": "16.4.5",
"fs-extra": "11.2.0",
"http-server": "14.1.1",
"js-sha256": "0.11.0",
"js-yaml": "4.1.0",
"playwright": "1.44.1",
Expand Down
127 changes: 73 additions & 54 deletions packages/web-integration/src/extractor/util.ts
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,11 @@ function selectorForValue(val: number | string): string {
export function setDataForNode(
node: HTMLElement | Node,
nodeHash: string,
setToParentNode = false,
setToParentNode: boolean, // should be false for default
currentWindow: typeof window,
): string {
const taskId = taskIdKey;
if (!(node instanceof Element)) {
if (!(node instanceof currentWindow.HTMLElement)) {
return '';
}
if (!taskId) {
Expand All @@ -47,7 +48,7 @@ export function setDataForNode(
const selector = selectorForValue(nodeHash);
if (getDebugMode()) {
if (setToParentNode) {
if (node.parentNode instanceof HTMLElement) {
if (node.parentNode instanceof currentWindow.HTMLElement) {
node.parentNode.setAttribute(taskIdKey, nodeHash.toString());
}
} else {
Expand All @@ -57,17 +58,25 @@ export function setDataForNode(
return selector;
}

function isElementPartiallyInViewport(rect: ReturnType<typeof getRect>) {
function isElementPartiallyInViewport(
rect: ReturnType<typeof getRect>,
currentWindow: typeof window,
currentDocument: typeof document,
) {
const elementHeight = rect.height;
const elementWidth = rect.width;

const viewportRect = {
left: 0,
top: 0,
width: window.innerWidth || document.documentElement.clientWidth,
height: window.innerHeight || document.documentElement.clientHeight,
right: window.innerWidth || document.documentElement.clientWidth,
bottom: window.innerHeight || document.documentElement.clientHeight,
width:
currentWindow.innerWidth || currentDocument.documentElement.clientWidth,
height:
currentWindow.innerHeight || currentDocument.documentElement.clientHeight,
right:
currentWindow.innerWidth || currentDocument.documentElement.clientWidth,
bottom:
currentWindow.innerHeight || currentDocument.documentElement.clientHeight,
x: 0,
y: 0,
zoom: 1,
Expand All @@ -84,17 +93,20 @@ function isElementPartiallyInViewport(rect: ReturnType<typeof getRect>) {
return visibleArea / totalArea >= 2 / 3;
}

export function getPseudoElementContent(element: Node): {
export function getPseudoElementContent(
element: Node,
currentWindow: typeof window,
): {
before: string;
after: string;
} {
if (!(element instanceof HTMLElement)) {
if (!(element instanceof currentWindow.HTMLElement)) {
return { before: '', after: '' };
}
const beforeContent = window
const beforeContent = currentWindow
.getComputedStyle(element, '::before')
.getPropertyValue('content');
const afterContent = window
const afterContent = currentWindow
.getComputedStyle(element, '::after')
.getPropertyValue('content');
return {
Expand All @@ -103,8 +115,11 @@ export function getPseudoElementContent(element: Node): {
};
}

export function hasOverflowY(element: HTMLElement): boolean {
const style = window.getComputedStyle(element);
export function hasOverflowY(
element: HTMLElement,
currentWindow: typeof window,
): boolean {
const style = currentWindow.getComputedStyle(element);
return (
style.overflowY === 'scroll' ||
style.overflowY === 'auto' ||
Expand Down Expand Up @@ -149,18 +164,22 @@ export function overlappedRect(
return null;
}

export function getRect(el: HTMLElement | Node, baseZoom = 1): ExtractedRect {
export function getRect(
el: HTMLElement | Node,
baseZoom: number, // base zoom
currentWindow: typeof window,
): ExtractedRect {
let originalRect: DOMRect;
let newZoom = 1;
if (!(el instanceof HTMLElement)) {
const range = document.createRange();
if (!(el instanceof currentWindow.HTMLElement)) {
const range = currentWindow.document.createRange();
range.selectNodeContents(el);
originalRect = range.getBoundingClientRect();
} else {
originalRect = el.getBoundingClientRect();
// from Chrome v128, the API would return differently https://docs.google.com/document/d/1AcnDShjT-kEuRaMchZPm5uaIgNZ4OiYtM4JI9qiV8Po/edit
if (!('currentCSSZoom' in el)) {
newZoom = Number.parseFloat(window.getComputedStyle(el).zoom) || 1;
newZoom = Number.parseFloat(currentWindow.getComputedStyle(el).zoom) || 1;
}
}

Expand All @@ -179,13 +198,17 @@ export function getRect(el: HTMLElement | Node, baseZoom = 1): ExtractedRect {
};
}

const isElementCovered = (el: HTMLElement | Node, rect: ExtractedRect) => {
const isElementCovered = (
el: HTMLElement | Node,
rect: ExtractedRect,
currentWindow: typeof window,
) => {
// Gets the center coordinates of the element
const x = rect.left + rect.width / 2;
const y = rect.top + rect.height / 2;

// Gets the element above that point
const topElement = document.elementFromPoint(x, y);
const topElement = currentWindow.document.elementFromPoint(x, y);
if (!topElement) {
return false; // usually because it's outside the screen
}
Expand All @@ -201,7 +224,7 @@ const isElementCovered = (el: HTMLElement | Node, rect: ExtractedRect) => {
return false;
}

const rectOfTopElement = getRect(topElement as HTMLElement, 1);
const rectOfTopElement = getRect(topElement as HTMLElement, 1, currentWindow);

// get the remaining area of the base element
const overlapRect = overlappedRect(rect, rectOfTopElement);
Expand Down Expand Up @@ -232,6 +255,8 @@ const isElementCovered = (el: HTMLElement | Node, rect: ExtractedRect) => {

export function visibleRect(
el: HTMLElement | Node | null,
currentWindow: typeof window,
currentDocument: typeof document,
baseZoom = 1,
):
| { left: number; top: number; width: number; height: number; zoom: number }
Expand All @@ -242,16 +267,16 @@ export function visibleRect(
}

if (
!(el instanceof HTMLElement) &&
!(el instanceof currentWindow.HTMLElement) &&
el.nodeType !== Node.TEXT_NODE &&
el.nodeName.toLowerCase() !== 'svg'
) {
logger(el, 'Element is not in the DOM hierarchy');
return false;
}

if (el instanceof HTMLElement) {
const style = window.getComputedStyle(el);
if (el instanceof currentWindow.HTMLElement) {
const style = currentWindow.getComputedStyle(el);
if (
style.display === 'none' ||
style.visibility === 'hidden' ||
Expand All @@ -262,7 +287,7 @@ export function visibleRect(
}
}

const rect = getRect(el, baseZoom);
const rect = getRect(el, baseZoom, currentWindow);

if (rect.width === 0 && rect.height === 0) {
logger(el, 'Element has no size');
Expand All @@ -271,18 +296,24 @@ export function visibleRect(

// check if the element is covered by another element
// if the element is zoomed, the coverage check should be done with the original zoom
if (baseZoom === 1 && isElementCovered(el, rect)) {
if (baseZoom === 1 && isElementCovered(el, rect, currentWindow)) {
return false;
}

const scrollLeft = window.pageXOffset || document.documentElement.scrollLeft;
const scrollTop = window.pageYOffset || document.documentElement.scrollTop;
const scrollLeft =
currentWindow.pageXOffset || currentDocument.documentElement.scrollLeft;
const scrollTop =
currentWindow.pageYOffset || currentDocument.documentElement.scrollTop;
const viewportWidth =
window.innerWidth || document.documentElement.clientWidth;
currentWindow.innerWidth || currentDocument.documentElement.clientWidth;
const viewportHeight =
window.innerHeight || document.documentElement.clientHeight;
currentWindow.innerHeight || currentDocument.documentElement.clientHeight;

const isPartiallyInViewport = isElementPartiallyInViewport(rect);
const isPartiallyInViewport = isElementPartiallyInViewport(
rect,
currentWindow,
currentDocument,
);

if (!isPartiallyInViewport) {
logger(el, 'Element is completely outside the viewport', {
Expand All @@ -297,14 +328,14 @@ export function visibleRect(

// check if the element is hidden by an ancestor
let parent: HTMLElement | Node | null = el;
while (parent && parent !== document.body) {
if (!(parent instanceof HTMLElement)) {
while (parent && parent !== currentDocument.body) {
if (!(parent instanceof currentWindow.HTMLElement)) {
parent = parent.parentElement;
continue;
}
const parentStyle = window.getComputedStyle(parent);
const parentStyle = currentWindow.getComputedStyle(parent);
if (parentStyle.overflow === 'hidden') {
const parentRect = getRect(parent, 1);
const parentRect = getRect(parent, 1, currentWindow);
const tolerance = 10;

if (
Expand Down Expand Up @@ -348,23 +379,6 @@ export function validTextNodeContent(node: Node): string | false {
return false;
}

// const everyChildNodeIsText = Array.from(node.childNodes).every((child) => {
// const tagName = ((child as HTMLElement).tagName || '').toLowerCase();
// if (
// tagName === 'script' ||
// tagName === 'style' ||
// tagName === 'link' ||
// tagName !== '#text'
// ) {
// return false;
// }
// return true;
// });

// if (!everyChildNodeIsText) {
// return false;
// }

const content = node.textContent || (node as HTMLElement).innerText;
if (content && !/^\s*$/.test(content)) {
return content.trim();
Expand All @@ -375,8 +389,13 @@ export function validTextNodeContent(node: Node): string | false {

export function getNodeAttributes(
node: HTMLElement | Node,
currentWindow: typeof window,
): Record<string, string> {
if (!node || !(node instanceof HTMLElement) || !node.attributes) {
if (
!node ||
!(node instanceof currentWindow.HTMLElement) ||
!node.attributes
) {
return {};
}

Expand Down Expand Up @@ -464,7 +483,7 @@ export function setExtractTextWithPositionOnWindow() {
}
}

export function getDocument(): HTMLElement {
/**
 * Returns the root container taken from the global `document`.
 *
 * Yields `document.body` when it is truthy; otherwise falls back to the
 * `document` object itself.
 *
 * NOTE(review): the fallback value is a `Document`, not an `HTMLElement`,
 * even though the declared return type is `HTMLElement` — confirm callers
 * tolerate that case.
 *
 * @returns `document.body`, or `document` when the body is not available.
 */
export function getTopDocument(): HTMLElement {
  return document.body || document;
}
Loading

0 comments on commit 57f6786

Please sign in to comment.