Commit daecca0

update 30 march 2025
1 parent ce5699a commit daecca0

File tree

6 files changed: +220 −9 lines changed


README.md

Lines changed: 1 addition & 1 deletion

@@ -49,7 +49,7 @@ If you find our work useful, please consider citing:
 
 The **[Searchable Paper Page](https://vyokky.github.io/LLM-Brained-GUI-Agents-Survey/)** is a web-based interface that allows you to search and filter through the papers in our survey. You can also view the papers by category, platform, and date.
 
-Last updated: **March 1st, 2025**.
+Last updated: **March 30th, 2025**.
 
 ---
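The README above mentions filtering papers by category, platform, and date. A minimal sketch of such a filter over the JSON data files in this commit (the `filter_papers` helper is a hypothetical illustration, not code from the repository; the entry shape matches `data/benchmark.json`):

```python
from datetime import datetime

def filter_papers(entries, platform=None, since=None):
    """Return entries whose Platform contains `platform` and whose Date
    (e.g. "March 2025") is not earlier than `since`."""
    out = []
    for e in entries:
        if platform and platform not in e.get("Platform", ""):
            continue
        if since and datetime.strptime(e["Date"], "%B %Y") < since:
            continue
        out.append(e)
    return out

# Two sample entries taken from this commit's data/benchmark.json additions.
sample = [
    {"Name": "WebGames: Challenging General-Purpose Web-Browsing AI Agents",
     "Platform": "Web", "Date": "February 2025"},
    {"Name": "AutoEval: A Practical Framework for Autonomous Evaluation of Mobile Agents",
     "Platform": "Mobile Android", "Date": "March 2025"},
]

web_only = filter_papers(sample, platform="Web")
print([e["Name"] for e in web_only])  # only the WebGames entry
```

The `"%B %Y"` format matches the "Month YYYY" date convention used throughout these JSON files.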

data/benchmark.json

Lines changed: 34 additions & 2 deletions

@@ -409,7 +409,7 @@
   },
   {
     "Name": "Beyond Pass or Fail: A Multi-dimensional Benchmark for Mobile UI Navigation",
-    "Platform": "Android",
+    "Platform": "Mobile Android",
     "Date": "January 2025",
     "Paper_Url": "https://arxiv.org/abs/2501.02863",
     "Highlight": "Provides a fully automated benchmarking suite and introduces a multi-dimensional evaluation framework.",
@@ -422,5 +422,37 @@
     "Paper_Url": "https://arxiv.org/abs/2502.08047",
     "Highlight": "First GUI benchmark designed to evaluate dynamic GUI interactions by incorporating various initial states.",
     "Code_Url": ""
+  },
+  {
+    "Name": "AEIA-MN: Evaluating the Robustness of Multimodal LLM-Powered Mobile Agents Against Active Environmental Injection Attacks",
+    "Platform": "Mobile Android",
+    "Date": "February 2025",
+    "Paper_Url": "https://arxiv.org/abs/2502.13053",
+    "Highlight": "Introduces the Active Environment Injection Attack (AEIA) framework that actively manipulates environmental elements (e.g., notifications) in mobile operating systems to mislead multimodal LLM-powered agents.",
+    "Code_Url": ""
+  },
+  {
+    "Name": "WebGames: Challenging General-Purpose Web-Browsing AI Agents",
+    "Platform": "Web",
+    "Date": "February 2025",
+    "Paper_Url": "https://arxiv.org/abs/2502.18356",
+    "Highlight": "A comprehensive benchmark designed to evaluate the capabilities of general-purpose web-browsing AI agents through 50+ interactive challenges. It uniquely provides a hermetic testing environment with verifiable ground-truth solutions.",
+    "Code_Url": "https://github.com/convergence-ai/webgames"
+  },
+  {
+    "Name": "AutoEval: A Practical Framework for Autonomous Evaluation of Mobile Agents",
+    "Platform": "Mobile Android",
+    "Date": "March 2025",
+    "Paper_Url": "https://arxiv.org/abs/2503.02403",
+    "Highlight": "Introduces a fully autonomous evaluation framework for mobile agents, eliminating the need for manual task reward signal definition and extensive evaluation code development.",
+    "Code_Url": ""
+  },
+  {
+    "Name": "SafeArena: Evaluating the Safety of Autonomous Web Agents",
+    "Platform": "Web",
+    "Date": "March 2025",
+    "Paper_Url": "https://arxiv.org/abs/2503.04957",
+    "Highlight": "The first benchmark specifically designed to evaluate the deliberate misuse of web agents by testing their ability to complete both safe and harmful tasks.",
+    "Code_Url": "https://safearena.github.io"
   }
-]
+]
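All of these data files share the same six-key entry shape, so a small consistency check can catch problems like the missing trailing periods and stray spaces fixed elsewhere in this commit. A sketch (the `check_entry` helper and its rules are assumptions inferred from the entries above, not part of the repository):

```python
import re

# Keys every entry in data/*.json appears to carry.
REQUIRED_KEYS = {"Name", "Platform", "Date", "Paper_Url", "Highlight", "Code_Url"}
# Dates in these files follow the "Month YYYY" convention.
DATE_RE = re.compile(
    r"^(January|February|March|April|May|June|July|August|"
    r"September|October|November|December) \d{4}$"
)

def check_entry(entry):
    """Return a list of problems found in one paper entry (empty if clean)."""
    problems = []
    missing = REQUIRED_KEYS - entry.keys()
    if missing:
        problems.append(f"missing keys: {sorted(missing)}")
    if not DATE_RE.match(entry.get("Date", "")):
        problems.append(f"bad date: {entry.get('Date')!r}")
    if entry.get("Paper_Url") and not entry["Paper_Url"].startswith("https://"):
        problems.append(f"bad paper URL: {entry['Paper_Url']!r}")
    return problems

# One of the entries added to data/benchmark.json in this commit.
entry = {
    "Name": "SafeArena: Evaluating the Safety of Autonomous Web Agents",
    "Platform": "Web",
    "Date": "March 2025",
    "Paper_Url": "https://arxiv.org/abs/2503.04957",
    "Highlight": "The first benchmark specifically designed to evaluate the deliberate misuse of web agents by testing their ability to complete both safe and harmful tasks.",
    "Code_Url": "https://safearena.github.io",
}
print(check_entry(entry))  # []
```

Running this over each array before committing would flag, for example, the capitalized `Https://` URL left in `data/framework.json`.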

data/dataset.json

Lines changed: 9 additions & 1 deletion

@@ -262,6 +262,14 @@
     "Paper_Url": "https://arxiv.org/abs/2404.16048",
     "Highlight": "Integrates images, action sequences, task descriptions, and spatial grounding into a unified dataset.",
     "Code_Url": "https://github.com/superagi/GUIDE"
+  },
+  {
+    "Name": "Explorer: Scaling Exploration-driven Web Trajectory Synthesis for Multimodal Web Agents",
+    "Platform": "Web",
+    "Date": "February 2025",
+    "Paper_Url": "https://arxiv.org/abs/2502.11357",
+    "Highlight": "Largest-scale web trajectory dataset to date; dynamically explores web pages to create contextually relevant tasks.",
+    "Code_Url": ""
   }
 
-]
+]

data/framework.json

Lines changed: 92 additions & 1 deletion

@@ -494,5 +494,96 @@
     "Paper_Url": "https://dl.acm.org/doi/abs/10.1145/3716132",
     "Highlight": "Enables UI automation through free-form textual prompts, eliminating the need for users to script automation tasks.",
     "Code_Url": "Https://github.com/PromptRPA/Prompt2TaskDataset"
+  },
+  {
+    "Name": "Symbiotic Cooperation for Web Agents: Harnessing Complementary Strengths of Large and Small LLMs",
+    "Platform": "Web",
+    "Date": "February 2025",
+    "Paper_Url": "https://arxiv.org/abs/2502.07942",
+    "Highlight": "Introduces an iterative, symbiotic learning process between large and small LLMs for web automation, enhancing both data synthesis and task performance through speculative data synthesis, multi-task learning, and privacy-preserving hybrid modes.",
+    "Code_Url": ""
+  },
+  {
+    "Name": "PC-Agent: A Hierarchical Multi-Agent Collaboration Framework for Complex Task Automation on PC",
+    "Platform": "Windows computers",
+    "Date": "February 2025",
+    "Paper_Url": "https://arxiv.org/abs/2502.14282",
+    "Highlight": "PC-Agent's hierarchical multi-agent design enables efficient decomposition of complex PC tasks. Its Active Perception Module enhances fine-grained GUI understanding by combining accessibility structures, OCR, and intention grounding.",
+    "Code_Url": "https://github.com/X-PLUG/MobileAgent/tree/main/PC-Agent"
+  },
+  {
+    "Name": "Mobile-Agent-V: Learning Mobile Device Operation Through Video-Guided Multi-Agent Collaboration",
+    "Platform": "Android",
+    "Date": "February 2025",
+    "Paper_Url": "https://arxiv.org/abs/2502.17110",
+    "Highlight": "Introduces video-guided learning, allowing the agent to acquire operational knowledge efficiently.",
+    "Code_Url": "https://github.com/X-PLUG/MobileAgent"
+  },
+  {
+    "Name": "MobileSteward: Integrating Multiple App-Oriented Agents with Self-Evolution to Automate Cross-App Instructions",
+    "Platform": "Android",
+    "Date": "February 2025",
+    "Paper_Url": "https://arxiv.org/abs/2502.16796",
+    "Highlight": "Introduces an app-oriented multi-agent framework with self-evolution, overcoming the complexity of cross-app interactions by dynamically recruiting specialized agents.",
+    "Code_Url": "https://github.com/XiaoMi/MobileSteward"
+  },
+  {
+    "Name": "Programming with Pixels: Computer-Use Meets Software Engineering",
+    "Platform": "Computers",
+    "Date": "February 2025",
+    "Paper_Url": "https://arxiv.org/abs/2502.18525",
+    "Highlight": "Shifts software engineering agents from API-based tool interactions to direct GUI-based computer use, allowing agents to interact with an IDE as a human developer would.",
+    "Code_Url": "https://programmingwithpixels.com"
+  },
+  {
+    "Name": "AppAgentX: Evolving GUI Agents as Proficient Smartphone Users",
+    "Platform": "Mobile Android",
+    "Date": "March 2025",
+    "Paper_Url": "https://arxiv.org/abs/2503.02268",
+    "Highlight": "Introduces an evolutionary mechanism that enables dynamic learning from past interactions and replaces inefficient low-level operations with high-level actions.",
+    "Code_Url": "https://appagentx.github.io/"
+  },
+  {
+    "Name": "LiteWebAgent: The Open-Source Suite for VLM-Based Web-Agent Applications",
+    "Platform": "Web",
+    "Date": "March 2025",
+    "Paper_Url": "https://arxiv.org/abs/2503.02950",
+    "Highlight": "First open-source, production-ready web agent integrating tree search for multi-step task execution.",
+    "Code_Url": "https://github.com/PathOnAI/LiteWebAgent"
+  },
+  {
+    "Name": "CHOP: Mobile Operating Assistant with Constrained High-frequency Optimized Subtask Planning",
+    "Platform": "Mobile Android",
+    "Date": "March 2025",
+    "Paper_Url": "https://arxiv.org/abs/2503.03743",
+    "Highlight": "Introduces a basis subtask framework, where subtasks are predefined based on human task decomposition patterns, ensuring better executability and efficiency.",
+    "Code_Url": "https://github.com/Yuqi-Zhou/CHOP"
+  },
+  {
+    "Name": "Enhancing Language Multi-Agent Learning with Multi-Agent Credit Re-Assignment for Interactive Environment Generalization",
+    "Platform": "Mobile Android, Web",
+    "Date": "February 2025",
+    "Paper_Url": "https://arxiv.org/abs/2502.14496",
+    "Highlight": "A multi-agent reinforcement learning framework that introduces a Credit Re-Assignment (CR) strategy, using LLMs instead of environment-specific rewards to enhance performance and generalization.",
+    "Code_Url": "https://github.com/THUNLP-MT/CollabUIAgents"
+  },
+  {
+    "Name": "Automating the enterprise with foundation models",
+    "Platform": "Web",
+    "Date": "May 2024",
+    "Paper_Url": "https://arxiv.org/abs/2405.03710",
+    "Highlight": "Eliminates the high setup costs, brittle execution, and burdensome maintenance associated with traditional RPA by learning from video and text documentation.",
+    "Code_Url": "https://github.com/HazyResearch/eclair-agents"
+  },
+  {
+    "Name": "Towards Ethical and Personalized Web Navigation Agents: A Framework for User-Aligned Task Execution",
+    "Platform": "Web",
+    "Date": "March 2025",
+    "Paper_Url": "https://dl.acm.org/doi/abs/10.1145/3701551.3707420",
+    "Highlight": "Proposes user-aligned task execution, where the agent adapts to individual user preferences in an ethical manner.",
+    "Code_Url": "/"
   }
-]
+
+
+]

data/gui-testing.json

Lines changed: 26 additions & 1 deletion

@@ -102,5 +102,30 @@
     "Paper_Url": "https://arxiv.org/abs/2411.17933",
     "Highlight": "Leverages multimodal LLMs to perform UI test transfers without requiring source code access",
     "Code_Url": ""
+  },
+  {
+    "Name": "UXAgent: An LLM Agent-Based Usability Testing Framework for Web Design",
+    "Platform": "Web platforms",
+    "Date": "February 2025",
+    "Paper_Url": "https://arxiv.org/abs/2502.12561",
+    "Highlight": "Enables LLM-powered automated usability testing by simulating thousands of user interactions, collecting both qualitative and quantitative data, and providing researchers with early feedback before real-user studies.",
+    "Code_Url": "https://uxagent.hailab.io"
+  },
+  {
+    "Name": "Guardian: A Runtime Framework for LLM-Based UI Exploration",
+    "Platform": "Mobile Android",
+    "Date": "September 2024",
+    "Paper_Url": "https://dl.acm.org/doi/abs/10.1145/3650212.3680334",
+    "Highlight": "Autonomously explores mobile applications, interacting with the UI to validate core functionalities.",
+    "Code_Url": ""
+  },
+  {
+    "Name": "Test-Agent: A Multimodal App Automation Testing Framework Based on the Large Language Model",
+    "Platform": "Mobile Android, iOS, Harmony OS",
+    "Date": "October 2024",
+    "Paper_Url": "https://ieeexplore.ieee.org/abstract/document/10778901/",
+    "Highlight": "Eliminates the need for pre-written test scripts by leveraging LLMs and multimodal perception to generate and execute test cases automatically.",
+    "Code_Url": ""
   }
-]
+]
+
data/models.json

Lines changed: 58 additions & 3 deletions

@@ -308,9 +308,64 @@
     "Platform": "Web, Desktop (Windows, MacOS, Linux), Mobile (Android, iOS)",
     "Date": "May 2024",
     "Paper_Url": "https://arxiv.org/abs/2410.05243",
-    "Highlight": "Utilizes hierarchical screen parsing and spatially enhanced element descriptions to enhance LVLMs without additional training.",
+    "Highlight": "A universal GUI grounding model that relies solely on vision, eliminating the need for text-based representations.",
     "Code_Url": ""
+  },
+  {
+    "Name": "VSC-RL: Advancing Autonomous Vision-Language Agents with Variational Subgoal-Conditioned Reinforcement Learning",
+    "Platform": "Mobile",
+    "Date": "February 2025",
+    "Paper_Url": "https://arxiv.org/abs/2502.07949",
+    "Highlight": "Addresses sparse-reward, long-horizon tasks for RL by autonomously breaking a complicated goal into subgoals.",
+    "Code_Url": "https://ai-agents-2030.github.io/VSC-RL"
+  },
+  {
+    "Name": "Magma: A Foundation Model for Multimodal AI Agents",
+    "Platform": "Web, Mobile, Desktop, Robotics",
+    "Date": "February 2025",
+    "Paper_Url": "https://arxiv.org/abs/2502.13130",
+    "Highlight": "Jointly trains on heterogeneous datasets, enabling generalization across digital and physical tasks.",
+    "Code_Url": "https://microsoft.github.io/Magma/"
+  },
+  {
+    "Name": "Digi-Q: Learning Q-Value Functions for Training Device-Control Agents",
+    "Platform": "Mobile Android",
+    "Date": "February 2025",
+    "Paper_Url": "https://arxiv.org/abs/2502.15760",
+    "Highlight": "Introduces a VLM-based Q-function for GUI agent training, enabling reinforcement learning without online interactions.",
+    "Code_Url": "https://github.com/DigiRL-agent/digiq"
+  },
+  {
+    "Name": "VEM: Environment-Free Exploration for Training GUI Agent with Value Environment Model",
+    "Platform": "Mobile Android",
+    "Date": "February 2025",
+    "Paper_Url": "https://arxiv.org/abs/2502.18906",
+    "Highlight": "Unlike traditional RL methods that require environment interactions, VEM enables training purely on offline data with a Value Environment Model.",
+    "Code_Url": "https://github.com/microsoft/GUI-Agent-RL"
+  },
+  {
+    "Name": "AutoGUI: Scaling GUI Grounding with Automatic Functionality Annotations from LLMs",
+    "Platform": "Web, Mobile",
+    "Date": "February 2025",
+    "Paper_Url": "https://arxiv.org/abs/2502.01977",
+    "Highlight": "Automatically labels UI elements based on interaction-induced changes, making it scalable and high-quality.",
+    "Code_Url": "https://autogui-project.github.io/"
+  },
+  {
+    "Name": "Smoothing Grounding and Reasoning for MLLM-Powered GUI Agents with Query-Oriented Pivot Tasks",
+    "Platform": "Mobile Android",
+    "Date": "March 2025",
+    "Paper_Url": "https://arxiv.org/abs/2503.00401",
+    "Highlight": "Improves reasoning without requiring large-scale training data.",
+    "Code_Url": "https://github.com/ZrW00/GUIPivot"
+  },
+  {
+    "Name": "WinClick: GUI Grounding with Multimodal Large Language Models",
+    "Platform": "Windows Computer",
+    "Date": "March 2025",
+    "Paper_Url": "https://arxiv.org/abs/2503.04730",
+    "Highlight": "The first GUI grounding model specifically tailored for Windows.",
+    "Code_Url": "https://github.com/zackhuiiiii/WinSpot"
   }
+]
 
-
-]
