inclusionAI · JasonTse1989 · Mar 11, 2026 · Mar 10, 2026 · Mar 10, 2026 · gemini-code-assist
diff --git a/aworld-cli/src/aworld_cli/console.py b/aworld-cli/src/aworld_cli/console.py
@@ -220,7 +220,7 @@ async def _edit_models_config(self, config, current_config: dict):
         else:
             diff_cfg.pop('base_url', None)
 
-        current_diff_provider = diff_cfg.get('provider', 'openai')
+        current_diff_provider = diff_cfg.get('provider', 'together_video')
         diff_provider = Prompt.ask("  DIFFUSION_PROVIDER", default=current_diff_provider)
         if diff_provider:
             diff_cfg['provider'] = diff_provider

diff --git a/aworld-cli/src/aworld_cli/core/config.py b/aworld-cli/src/aworld_cli/core/config.py
@@ -254,7 +254,7 @@ def _apply_diffusion_models_config(models_config: Dict[str, Any]) -> None:
     if not provider:
         provider = (os.environ.get('DIFFUSION_PROVIDER') or '').strip()
     if not provider:
-        provider = 'openai'
+        provider = 'together_video'
     if temperature is None:
         env_temp = (os.environ.get('DIFFUSION_TEMPERATURE') or '').strip()
         if env_temp:

diff --git a/aworld-cli/src/aworld_cli/history.py b/aworld-cli/src/aworld_cli/history.py
@@ -326,7 +326,7 @@ def format_history_display(self, session_id: Optional[str] = None, limit: int =
                 output_tok = token_stats.get("output_tokens", 0)
                 total_tok = token_stats.get("total_tokens", 0)
                 duration = token_stats.get("duration_seconds")
-                tot_str = f" ↑{input_tok} / ↓{output_tok}"
+                tot_str = f" in {input_tok} / out {output_tok}"
                 if duration is not None and duration > 0:
                     tot_str += f" ({duration:.1f}s)"
                 by_model = token_stats.get("by_model") or {}
@@ -338,7 +338,7 @@ def format_history_display(self, session_id: Optional[str] = None, limit: int =
                         mo = mstats.get("output_tokens", 0)
                         md = mstats.get("duration_seconds", 0)
                         rnd = mstats.get("rounds", 1)
-                        ms = f" ↑{mi} / ↓{mo}"
+                        ms = f" in {mi} / out {mo}"
                         if md and md > 0:
                             ms += f" ({md:.1f}s)"
                         rnd_str = f" ({rnd} rounds)" if rnd > 0 else ""

diff --git a/aworld-cli/src/aworld_cli/inner_plugins/smllc/agents/aworld_agent.py b/aworld-cli/src/aworld_cli/inner_plugins/smllc/agents/aworld_agent.py
@@ -65,14 +65,11 @@ def _build_beijing_date_line() -> str:
 *   `developer`: a sub-agent that can develop apps/code/html/website and laterimprove this developed apps/code/html/website according to the suggestions from the `evaluator`, by using terminal and other professional tools.
 *   `evaluator`: a sub-agent that can evaluate the apps/code/html/website's (developed by the `developer`) performance, user experience, and so on, and present professional suggestions to the `developer` for the apps/code/html/website improvement.
 *   `terminal`: A tool set that can execute terminal commands. **Path restriction:** Do not `cd` to other directories; always operate from the current working directory. When operating on files, always use explicit relative or absolute paths. **Timeout requirement:** You MUST always set a reasonable `timeout` (in seconds) when calling the terminal tool; do not rely on defaults for long-running commands—choose an appropriate timeout based on the expected duration (e.g., 60–120 seconds for builds, 30–60 for quick commands).
-*   `media_comprehension`: Sub-agent for understanding images, audio, and video files.
-    - **When to invoke:** Only when the user explicitly states they want to analyze, read, or comprehend such media (e.g., "帮我看看这张图", "分析这段音频", "解读这个视频").
-    - **Cannot process:** documents (.pdf), spreadsheets (.xlsx/.csv), presentations (.pptx), code (.py/.js/.ts), archives (.zip/.tar/.rar), executables (.exe/.bin), databases (.db/.sqlite), structured data (.json/.xml/.yaml), web pages (.html/.htm).
 *   `video_creator`: Sub-agent for creating videos from images, audio, and text.
     - **When to invoke:** All video creation tasks MUST be routed to `video_creator`.
     - **Call params:** `content` (required: prompt text); `info` (optional, JSON string).
-    - **Example info:** `{"image_url": "<path_or_base64>", "resolution": "720p", "duration": 5, "fps": 24, "output_dir": "./output"}`; `duration` must be ≤ 5 seconds.
-    - **Supported info keys:** image_url, resolution, duration, fps, poll, poll_interval, poll_timeout, download_video, output_dir.
+    - **Example info:** `{"image_url": "<path_or_base64>", "reference_images": ["<path1>", "<path2>"], "resolution": "720p", "duration": 5, "fps": 24, "output_dir": "./output"}`; `duration` must be ≤ 5 seconds.
+    - **Supported info keys:** image_url, reference_images (list of paths/URLs/base64), resolution, duration, fps, poll, poll_interval, poll_timeout, download_video, output_dir.
 
 ## 4. Available Skills
 *    Please be aware that if you need to have access to a particular skill to help you to complete the task, you MUST use the appropriate `SKILL_tool` to activate the skill, which returns you the exact skill content.

diff --git a/aworld-cli/src/aworld_cli/inner_plugins/smllc/agents/video_creator/video_creator.py b/aworld-cli/src/aworld_cli/inner_plugins/smllc/agents/video_creator/video_creator.py
@@ -69,8 +69,8 @@ async def async_policy(self, observation: Observation, info: Dict[str, Any] = {}
 **Invocation format (MUST follow when calling):**
 - `content`: Required. The video generation prompt (text description of what to create).
 - `info`: Optional JSON string. Use when passing image/video params, e.g.:
-  {"image_url": "<data_path_or_base64_string>", "resolution": "720p", "duration": 5, "fps": 24, "output_dir": "./output"}
-  Supported keys: image_url, resolution, duration (must be ≤ 5 seconds), fps, poll, poll_interval, poll_timeout, download_video, output_dir.
+  {"image_url": "<data_path_or_base64_string>", "reference_images": ["<path1>", "<path2>"], "resolution": "720p", "duration": 5, "fps": 24, "output_dir": "./output"}
+  Supported keys: image_url, reference_images (list of paths/URLs/base64), resolution, duration (must be ≤ 5 seconds), fps, poll, poll_interval, poll_timeout, download_video, output_dir.
 """
 )
 def build_video_creator_swarm():

diff --git a/aworld/agents/video_agent.py b/aworld/agents/video_agent.py
@@ -193,6 +193,12 @@ async def async_policy(
         # Resolve video parameters (observation.info overrides instance defaults)
         image_url: Optional[str] = obs_info.pop("image_url", None)
         image_url = _resolve_image_url_to_base64(image_url)
+        reference_images = obs_info.pop("reference_images", None)
+        if reference_images:
+            reference_images = [
+                _resolve_image_url_to_base64(image_url)
+                for image_url in reference_images
+            ]
-            reference_images = [
-                _resolve_image_url_to_base64(image_url)
-                for image_url in reference_images
-            ]
+            reference_images = [
+                _resolve_image_url_to_base64(ref_image_url)
+                for ref_image_url in reference_images
+            ]
-            reference_images = [
-                _resolve_image_url_to_base64(image_url)
-                for image_url in reference_images
-            ]
+            reference_images = [
+                _resolve_image_url_to_base64(ref_image_url)
+                for ref_image_url in reference_images
+            ]
         resolution: Optional[str] = obs_info.pop("resolution", self.default_resolution)
         duration: Optional[float] = obs_info.pop("duration", self.default_duration)
         fps: Optional[int] = obs_info.pop("fps", self.default_fps)
@@ -204,6 +210,8 @@ async def async_policy(
 
         # Any remaining keys in obs_info are forwarded to the provider
         extra_kwargs = obs_info
+        if reference_images:
+            extra_kwargs["reference_images"] = reference_images
 
         logger.info(
             f"[VideoAgent:{self.id()}] Generating video: "

diff --git a/aworld/models/ant_video_provider.py b/aworld/models/ant_video_provider.py
@@ -44,6 +44,8 @@ def is_terminal_status(self, status_raw): return status_raw in {"succeeded", "fa
 """
 
 import abc
+import base64
+import mimetypes
 import os
 import re
 import time
@@ -80,6 +82,88 @@ def is_terminal_status(self, status_raw): return status_raw in {"succeeded", "fa
     "failed":     "failed",
 }
 
+# Veo accepts only these image MIME types. Anything else detected by
+# _parse_image_for_veo_payload is replaced with image/jpeg (with a warning).
+_VEO_ALLOWED_IMAGE_MIMES: frozenset = frozenset({"image/jpeg", "image/png", "image/webp"})
+
+# Image magic bytes for MIME inference (leading signature -> mime_type).
+# WebP is intentionally absent: its "WEBP" tag sits at offset 8 inside a RIFF
+# container, so _infer_image_mime_from_bytes checks it separately.
+# GIF is recognised here but is not a member of _VEO_ALLOWED_IMAGE_MIMES.
+_IMAGE_MAGIC: Dict[bytes, str] = {
+    b"\xff\xd8\xff": "image/jpeg",
+    b"\x89PNG\r\n\x1a\n": "image/png",
+    b"GIF87a": "image/gif",
+    b"GIF89a": "image/gif",
+}
+
+
+def _infer_image_mime_from_bytes(data: bytes) -> str:
+    """Infer an image MIME type from the leading bytes of *data*.
+
+    Args:
+        data: Raw (already base64-decoded) image bytes. A short prefix is
+            enough, but WebP detection needs at least 12 bytes.
+
+    Returns:
+        "image/webp" for a RIFF container tagged WEBP, the matching
+        ``_IMAGE_MAGIC`` value for a known leading signature, otherwise
+        "image/jpeg".
+    """
+    # WebP cannot be matched by prefix alone: the "WEBP" tag sits at offset 8.
+    # A too-short slice simply compares unequal, so no length check is needed.
+    if data.startswith(b"RIFF") and data[8:12] == b"WEBP":
+        return "image/webp"
+    for sig, mime in _IMAGE_MAGIC.items():
+        if data.startswith(sig):
+            return mime
+    return "image/jpeg"  # fallback when no known signature matches
+
+
+def _parse_image_for_veo_payload(
+    image_data: Optional[str],
+    image_path: Optional[str],
+) -> Optional[Tuple[str, str]]:
+    """Parse image input into (base64_str, mime_type) for Veo bytesBase64Encoded payload.
+
+    Sources are tried in this order:
+
+    - ``image_data`` as a data URL (``data:image/xxx;base64,yyy``): the MIME
+      type comes from the prefix and the base64 from the body.
+    - ``image_data`` as raw base64: the MIME type is inferred from the
+      decoded magic bytes.
+    - ``image_data`` as an http(s) URL: not inlined here; ``image_path`` is
+      used instead when provided.
+    - ``image_path``: the file is read as base64; the MIME type is guessed
+      from the extension, falling back to magic bytes.
+
+    Args:
+        image_data: Data URL, raw base64 string, http(s) URL, or None.
+        image_path: Local file path used when ``image_data`` yields no base64.
+
+    Returns:
+        ``(base64_str, mime_type)`` with ``mime_type`` always a member of
+        ``_VEO_ALLOWED_IMAGE_MIMES``, or None when no image could be resolved.
+    """
+    b64_str: Optional[str] = None
+    mime: Optional[str] = None
+
+    s = image_data.strip() if image_data else ""
+    # http(s) URLs are skipped so that image_path (if any) is used below.
+    if s and not s.startswith(("http://", "https://")):
+        if s.startswith("data:") and ";base64," in s:
+            prefix, _, b64_part = s.partition(";base64,")
+            b64_str = b64_part
+            if prefix.lower().startswith("data:image/"):
+                # Keep the "image/" part: "data:image/png;charset=x" -> "image/png".
+                # Slicing off "data:image/" and using the bare subtype would
+                # never match the allowed set and force every image to jpeg.
+                subtype = prefix[len("data:image/"):].split(";")[0].strip().lower()
+                mime = f"image/{subtype}" if subtype else "image/jpeg"
+            else:
+                mime = "image/jpeg"
+        else:
+            b64_str = s  # raw base64; mime is inferred from the bytes below
+
+    if not b64_str and image_path:
+        b64_str = VideoGenProviderBase.read_file_as_base64(image_path)
+        mime_guess, _ = mimetypes.guess_type(image_path)
+        # Trust the extension only when it names an image type; otherwise sniff.
+        mime = mime_guess if mime_guess and mime_guess.startswith("image/") else None
+
+    if not b64_str:
+        return None
+
+    if not mime:
+        try:
+            # Decode only a short, re-padded prefix: enough for the signatures.
+            chunk = b64_str[:32]
+            pad = (4 - len(chunk) % 4) % 4
+            raw = base64.b64decode(chunk + "=" * pad)
+        except ValueError:
+            # binascii.Error subclasses ValueError; undecodable input -> jpeg.
+            mime = "image/jpeg"
+        else:
+            mime = _infer_image_mime_from_bytes(raw)
+
+    # Veo only accepts image/jpeg, image/png, image/webp
+    if mime not in _VEO_ALLOWED_IMAGE_MIMES:
+        logger.warning(
+            "[VeoAdapter] image mime %r not in allowed set %s; using image/jpeg",
+            mime,
+            sorted(_VEO_ALLOWED_IMAGE_MIMES),
+        )
+        mime = "image/jpeg"
+
+    return (b64_str, mime)
+
 
 # ---------------------------------------------------------------------------
 # ModelAdapter — base class for per-vendor payload/response logic
@@ -788,9 +872,9 @@ class VeoAdapter(ModelAdapter):
 
     # Veo resolution strings (width x height)
     _RESOLUTION_MAP: Dict[VideoResolution, str] = {
-        VideoResolution.RES_480P:  "854x480",
-        VideoResolution.RES_720P:  "1280x720",
-        VideoResolution.RES_1080P: "1920x1080",
+        VideoResolution.RES_480P:  "480p",
+        VideoResolution.RES_720P:  "720p",
+        VideoResolution.RES_1080P: "1080p",
     }
 
     # ------------------------------------------------------------------
@@ -806,15 +890,13 @@ def build_submit_payload(self,
 
         instance: Dict[str, Any] = {"prompt": request.prompt or ""}
 
-        # Image-to-video: Veo accepts an image in the instance (base64 or URL)
+        # Image-to-video: Veo accepts an image in the instance (base64 + mimeType)
         is_image2video = False
-        image_data: Optional[str] = request.image_url
-        if not image_data and request.image_path:
-            b64        = VideoGenProviderBase.read_file_as_base64(request.image_path)
-            image_data = f"data:image/jpeg;base64,{b64}"
-        if image_data:
+        parsed = _parse_image_for_veo_payload(request.image_url, request.image_path)
+        if parsed:
+            b64_str, mime_type = parsed
             is_image2video = True
-            instance["image"] = {"bytesBase64Encoded": image_data}
+            instance["image"] = {"bytesBase64Encoded": b64_str, "mimeType": mime_type}
 
         parameters: Dict[str, Any] = {}