[cli]: 1. limit image size to read

tallate · tallate · commit 072a0347a103 · 2026-03-09T19:54:55.000+08:00
diff --git a/aworld-cli/src/aworld_cli/inner_plugins/smllc/agents/media_comprehension/prompt.txt b/aworld-cli/src/aworld_cli/inner_plugins/smllc/agents/media_comprehension/prompt.txt
@@ -4,15 +4,71 @@ Your mission is to read media files, comprehend their content, and respond to us
 
 ## Core Operational Workflow
 You must tackle every user request by following this workflow:
-1.  **Read File First:** Use the `CAST_SEARCH__read_file` tool to read the file content. For image/audio/video files, the tool will return the content (e.g., base64-encoded data or metadata) that you can interpret.
+1.  **Read File First:** Use the `CAST_SEARCH__read_file` tool to read the file content. For image/audio/video files, the tool will return the content (e.g., base64-encoded data or metadata) that you can interpret. **For images:** You MUST check file size first; if >50KB, compress to under 50KB before reading.
 2.  **Install Dependencies:** Before understanding, install any required dependencies (e.g., ffmpeg, whisper, Python packages) via `terminal_tool` if they are not already available.
 3.  **Understand Content:** Analyze and comprehend the media content—recognize visual elements in images, transcribe or summarize audio, understand video scenes.
 4.  **Respond to User:** Based on your understanding and the user's specific requests (e.g., description, analysis, comparison, extraction), provide a clear and helpful response.
 5.  **Iterate if Needed:** If the user has follow-up questions or additional requests, repeat the process until the request is fully resolved.
 
 ## File Type Process Methods
 ### Image
-* Directly use `CAST_SEARCH__read_file` to read the file; the model will identify and interpret the content.
+* Before reading, you MUST check the file size and compress if needed. Use `CAST_SEARCH__read_file` to read the (possibly compressed) file; the model will identify and interpret the content.
+
+#### Image Processing Workflow
+**Step 1: Detect Image File and Check Size**
+```bash
+# Check file size in bytes (BSD/macOS `stat -f%z` first, GNU/Linux `stat -c%s` as the fallback)
+stat -f%z <image_file> 2>/dev/null || stat -c%s <image_file>
+# Or: ls -l <image_file>
+```
+Threshold: 50KB (51200 bytes). If file size > 50KB, you MUST compress before reading.
+
+**Step 2: Compress if Over 50KB**
+If the image exceeds 50KB, compress it to under 50KB using the `terminal_tool` before calling `CAST_SEARCH__read_file`. Save the compressed file to a new path (e.g. `image_compressed.jpg`) in the current directory.
+
+*Python Script (compress_image.py):*
+```python
+from PIL import Image
+import os
+import sys
+
+def compress_to_under_50kb(path, max_kb=50):
+    size_kb = os.path.getsize(path) / 1024
+    if size_kb <= max_kb:
+        print(path)  # no compression needed
+        return path
+    img = Image.open(path)
+    if img.mode in ('RGBA', 'LA', 'P'):
+        img = img.convert('RGB')
+    base, ext = os.path.splitext(path)
+    out_path = f"{base}_compressed.jpg"
+    quality = 85
+    while quality >= 10:
+        img.save(out_path, 'JPEG', quality=quality, optimize=True)
+        if os.path.getsize(out_path) / 1024 <= max_kb:
+            print(out_path)
+            return out_path
+        quality -= 15
+    # If still too large, resize
+    w, h = img.size
+    for scale in [0.75, 0.5, 0.25]:
+        new_size = (int(w * scale), int(h * scale))
+        img.resize(new_size, Image.Resampling.LANCZOS).save(out_path, 'JPEG', quality=70, optimize=True)
+        if os.path.getsize(out_path) / 1024 <= max_kb:
+            print(out_path)
+            return out_path
+    print(out_path)
+    return out_path
+
+compress_to_under_50kb(sys.argv[1])
+```
+```bash
+pip install Pillow -q
+python compress_image.py <image_file>
+```
+
+**Step 3: Read and Analyze**
+Use `CAST_SEARCH__read_file` on the original file (if ≤50KB) or the compressed output file (if >50KB).
 
 ### Audio
 * Do NOT use `CAST_SEARCH__read_file` to read audio file content; use the `terminal_tool` to analyze audio files.
@@ -243,6 +299,7 @@ You are equipped with multiple assistants. It is your job to know which to use a
 
 ## Critical Guardrails
 - **Read First:** For any media file the user refers to, you MUST use `read_file` to read its content before analyzing or responding.
+- **Image Size Limit:** For image files, you MUST check the file size and compress to under 50KB before reading if the file exceeds 50KB.
 - **One Tool Per Step:** You MUST call only one tool at a time. Do not chain multiple tool calls in a single response.
 - **Honest Capability Assessment:** If a user's request is beyond the combined capabilities of your available assistants, you must terminate the task and clearly explain to the user why it cannot be completed.
 - **Working Directory:** Always treat the current directory as your working directory for all actions: run shell commands from it, and use it (or paths under it) for any temporary or output files when such operations are permitted (e.g. non-code tasks). You MUST NOT redirect work or temporary files to /tmp; Always use the current directory so outputs stay with the user's context.
diff --git a/aworld/experimental/cast/tools/cast_search_tool.py b/aworld/experimental/cast/tools/cast_search_tool.py
@@ -10,10 +10,26 @@
 """
 
 import json
+import os
 import traceback
 from pathlib import Path
 from typing import Dict, List, Optional, Any, Union, Tuple
 
+# Multimedia extensions (image, audio, video) - must match searchers._get_multimedia_mime_type
+_MULTIMEDIA_EXTENSIONS = frozenset({
+    '.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp', '.ico', '.tiff', '.tif',
+    '.mp3', '.wav', '.ogg', '.m4a', '.flac', '.aac',
+    '.mp4', '.webm', '.avi', '.mov', '.mkv', '.m4v',
+})
+
+# Multimedia file size limit: default 50KB; set CAST_MEDIA_SIZE_LIMIT_KB to override (e.g. 100 for 100KB)
+def _get_media_size_limit_bytes() -> int:
+    try:
+        kb = int(os.environ.get("CAST_MEDIA_SIZE_LIMIT_KB", "50"))
+        return max(1, kb) * 1024
+    except (ValueError, TypeError):
+        return 50 * 1024
+
 from aworld.config import ToolConfig
 from aworld.core.common import Observation, ActionModel, ActionResult, ToolActionInfo, ParamInfo
 from aworld.core.context.amni import AmniContext
@@ -530,6 +546,14 @@ async def _glob_search(self,
             logger.error(f"Glob search failed: {e}")
             raise
 
+    def _resolve_file_path(self, file_path: Union[str, Path]) -> Path:
+        """Resolve file path relative to search root."""
+        p = Path(file_path)
+        if p.is_absolute():
+            return p
+        root = self._root_path or (self.acast.search_engine.root_path if self.acast.search_engine else None) or Path.cwd()
+        return Path(root) / file_path
+
     async def _read_file(self,
                          file_path: Union[str, Path],
                          limit: int = 2000,
@@ -553,6 +577,19 @@ async def _read_file(self,
             >>> print(result.output)
         """
         try:
+            resolved_path = self._resolve_file_path(file_path)
+            if resolved_path.exists():
+                ext = resolved_path.suffix.lower()
+                if ext in _MULTIMEDIA_EXTENSIONS:
+                    size_bytes = resolved_path.stat().st_size
+                    limit_bytes = _get_media_size_limit_bytes()
+                    if size_bytes > limit_bytes:
+                        limit_kb = limit_bytes // 1024
+                        raise ValueError(
+                            f"Multimedia file size ({size_bytes} bytes) exceeds limit ({limit_kb}KB). "
+                            f"File must be smaller than {limit_kb}KB. "
+                            "Compress the file before reading."
+                        )
             result = await self.acast.read(
                 file_path=file_path,
                 limit=limit,