Merge branch 'cli_stream_fix_media_comp' into cli_activeskill

tallate · tallate · commit 0efc7448be9b · 2026-03-09T17:01:34.000+08:00
diff --git a/aworld-cli/src/aworld_cli/inner_plugins/smllc/agents/aworld_agent.py b/aworld-cli/src/aworld_cli/inner_plugins/smllc/agents/aworld_agent.py
@@ -65,6 +65,7 @@ def _build_beijing_date_line() -> str:
 *   `developer`: a sub-agent that can develop apps/code/html/website and laterimprove this developed apps/code/html/website according to the suggestions from the `evaluator`, by using terminal and other professional tools.
 *   `evaluator`: a sub-agent that can evaluate the apps/code/html/website's (developed by the `developer`) performance, user experience, and so on, and present professional suggestions to the `developer` for the apps/code/html/website improvement.
 *   `terminal`: A tool set that can execute terminal commands. **Path restriction:** Do not `cd` to other directories; always operate from the current working directory. When operating on files, always use explicit relative or absolute paths. **Timeout requirement:** You MUST always set a reasonable `timeout` (in seconds) when calling the terminal tool; do not rely on defaults for long-running commands—choose an appropriate timeout based on the expected duration (e.g., 60–120 seconds for builds, 30–60 for quick commands).
+*   `media_comprehension`: a sub-agent specialized in understanding images, audio, and video files. Cannot process: documents (.pdf, e.g. report.pdf), spreadsheets (.xlsx/.csv, e.g. data.xlsx), presentations (.pptx, e.g. slides.pptx), code (.py/.js/.ts, e.g. main.py), archives (.zip/.tar/.rar, e.g. backup.zip), executables (.exe/.bin, e.g. app.exe), databases (.db/.sqlite, e.g. users.db), structured data (.json/.xml/.yaml, e.g. config.json), web pages (.html/.htm, e.g. index.html).
 
 ## 4. Available Skills
 *    Please be aware that if you need to have access to a particular skill to help you to complete the task, you MUST use the appropriate `SKILL_tool` to activate the skill, which returns you the exact skill content.
diff --git a/aworld-cli/src/aworld_cli/inner_plugins/smllc/agents/developer/prompt.txt b/aworld-cli/src/aworld_cli/inner_plugins/smllc/agents/developer/prompt.txt
@@ -11,6 +11,7 @@ You analyze target codebases, identify key modules and entry points, and apply p
 **You must only use one tool call per turn.** Do not chain commands.
 **Prohibit one-shot large file creation:** You must never create or modify large files in a single operation. Large files must always be output in segments. Each segment must not exceed 50 lines or 5,000 characters. This applies to both Mode 1 (creation via `terminal`) and Mode 2 (modification via `CAST_CODER.search_replace`).
 **Do Not Use browser_take_screenshot:** You Must Not use browser_take_screenshot, since this tool call will return very large files which will block the task.
+**Do Not Use Interactive Commands:** You may use non-interactive alternatives (e.g. --yes, -y, CI=1, DEBIAN_FRONTEND=noninteractive) or different tools.
 
 ## 🔄 Core Workflow: Operating Modes
 You will operate in one of two modes, determined by the user's request. You must identify the correct mode at the beginning of the task and follow its specific workflow.
diff --git a/aworld-cli/src/aworld_cli/inner_plugins/smllc/agents/evaluator/prompt.txt b/aworld-cli/src/aworld_cli/inner_plugins/smllc/agents/evaluator/prompt.txt
@@ -26,4 +26,5 @@ You are equipped with multiple assistants. It is your job to know which to use a
 - **Honest Capability Assessment:** If a user's request is beyond the combined capabilities of your available assistants, you must terminate the task and clearly explain to the user why it cannot be completed.
 - **Working Directory:** Always treat the current directory as your working directory for all actions: run shell commands from it, and use it (or paths under it) for any temporary or output files when such operations are permitted (e.g. non-code tasks). You MUST NOT redirect work or temporary files to /tmp; Always use the current directory so outputs stay with the user's context.
 - **Do Not Delete Files:** You MUST NOT use the `terminal_tool` to rm -rf any file, since this will delete the file from the system. except the ms-playwrightmodule installation case.
-- **Do Not Use browser_take_screenshot:** You Must Not use browser_take_screenshot, since this tool call will return very large files which will block the task.
+- **Do Not Use browser_take_screenshot:** You Must Not use browser_take_screenshot, since this tool call will return very large files which will block the task.
+- **Do Not Use Interactive Commands:** You may use non-interactive alternatives (e.g. --yes, -y, CI=1, DEBIAN_FRONTEND=noninteractive) or different tools.
diff --git a/aworld-cli/src/aworld_cli/inner_plugins/smllc/agents/media_comprehension/media_comprehension.py b/aworld-cli/src/aworld_cli/inner_plugins/smllc/agents/media_comprehension/media_comprehension.py
@@ -61,6 +61,8 @@ async def async_policy(self, observation: Observation, info: Dict[str, Any] = {}
 - Images: Recognize, describe, and interpret visual content.
 - Audio: Transcribe speech and analyze audio content.
 - Video: Understand video content, analyze scenes, and perform multimodal comprehension.
+
+Cannot process (do NOT delegate to this agent): Documents (.pdf, e.g. report.pdf), spreadsheets (.xlsx/.csv, e.g. data.xlsx), presentations (.pptx, e.g. slides.pptx), code/scripts (.py/.js/.ts, e.g. main.py), archives (.zip/.tar/.rar, e.g. backup.zip), executables (.exe/.bin, e.g. app.exe), databases (.db/.sqlite, e.g. users.db), structured data (.json/.xml/.yaml, e.g. config.json), web pages (.html/.htm, e.g. index.html).
 """
 )
 def build_media_comprehension_swarm():
diff --git a/examples/gaia/mcp_collections/tools/terminal.py b/examples/gaia/mcp_collections/tools/terminal.py
@@ -104,10 +104,7 @@ def __init__(self, arguments: ActionArguments) -> None:
         # Interactive-only commands (block stdin, cannot run non-interactively)
         self.interactive_command_patterns = [
             r"(?:^|\s)(vim|vi|nano|emacs)(?:\s|$)",
-            r"(?:^|\s)(less|more)(?:\s|$)",
-            r"(?:^|\s)(top|htop)(?:\s|$)",
-            r"(?:^|\s)(ftp|telnet)(?:\s|$)",
-            r"(?:^|\s)(python3?|bash)\s+-i\b",
+            r"(?:^|\s)(ftp|telnet)(?:\s|$)"
         ]
 
         # Get current platform info
@@ -147,9 +144,12 @@ def _check_interactive_command(self, command: str) -> tuple[bool, str | None]:
             Tuple of (is_allowed, reason_if_forbidden)
         """
         for pattern in self.interactive_command_patterns:
-            if re.search(pattern, command, re.IGNORECASE):
+            m = re.search(pattern, command, re.IGNORECASE)
+            if m:
+                forbidden_cmd = m.group(1)
                 return False, (
-                    "Interactive commands are not allowed. Use non-interactive alternatives "
+                    f"Interactive commands are not allowed (forbidden: {forbidden_cmd}). "
+                    "Use non-interactive alternatives "
                     "(e.g. --yes, -y, CI=1, DEBIAN_FRONTEND=noninteractive) or different tools."
                 )
         return True, None

Original file line number	Diff line number	Diff line change
`@@ -61,6 +61,8 @@ async def async_policy(self, observation: Observation, info: Dict[str, Any] = {}`
`61`	`61`	`- Images: Recognize, describe, and interpret visual content.`
`62`	`62`	`- Audio: Transcribe speech and analyze audio content.`
`63`	`63`	`- Video: Understand video content, analyze scenes, and perform multimodal comprehension.`
	`64`	`+`
	`65`	`+Cannot process (do NOT delegate to this agent): Documents (.pdf, e.g. report.pdf), spreadsheets (.xlsx/.csv, e.g. data.xlsx), presentations (.pptx, e.g. slides.pptx), code/scripts (.py/.js/.ts, e.g. main.py), archives (.zip/.tar/.rar, e.g. backup.zip), executables (.exe/.bin, e.g. app.exe), databases (.db/.sqlite, e.g. users.db), structured data (.json/.xml/.yaml, e.g. config.json), web pages (.html/.htm, e.g. index.html).`
`64`	`66`	`"""`
`65`	`67`	`)`
`66`	`68`	`def build_media_comprehension_swarm():`