Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion aworld-cli/src/aworld_cli/console.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,7 +220,7 @@ async def _edit_models_config(self, config, current_config: dict):
else:
diff_cfg.pop('base_url', None)

current_diff_provider = diff_cfg.get('provider', 'openai')
current_diff_provider = diff_cfg.get('provider', 'together_video')
diff_provider = Prompt.ask(" DIFFUSION_PROVIDER", default=current_diff_provider)
if diff_provider:
diff_cfg['provider'] = diff_provider
Expand Down
2 changes: 1 addition & 1 deletion aworld-cli/src/aworld_cli/core/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,7 +254,7 @@ def _apply_diffusion_models_config(models_config: Dict[str, Any]) -> None:
if not provider:
provider = (os.environ.get('DIFFUSION_PROVIDER') or '').strip()
if not provider:
provider = 'openai'
provider = 'together_video'
if temperature is None:
env_temp = (os.environ.get('DIFFUSION_TEMPERATURE') or '').strip()
if env_temp:
Expand Down
4 changes: 2 additions & 2 deletions aworld-cli/src/aworld_cli/history.py
Original file line number Diff line number Diff line change
Expand Up @@ -326,7 +326,7 @@ def format_history_display(self, session_id: Optional[str] = None, limit: int =
output_tok = token_stats.get("output_tokens", 0)
total_tok = token_stats.get("total_tokens", 0)
duration = token_stats.get("duration_seconds")
tot_str = f" {input_tok} / {output_tok}"
tot_str = f" in {input_tok} / out {output_tok}"
if duration is not None and duration > 0:
tot_str += f" ({duration:.1f}s)"
by_model = token_stats.get("by_model") or {}
Expand All @@ -338,7 +338,7 @@ def format_history_display(self, session_id: Optional[str] = None, limit: int =
mo = mstats.get("output_tokens", 0)
md = mstats.get("duration_seconds", 0)
rnd = mstats.get("rounds", 1)
ms = f" {mi} / {mo}"
ms = f" in {mi} / out {mo}"
if md and md > 0:
ms += f" ({md:.1f}s)"
rnd_str = f" ({rnd} rounds)" if rnd > 0 else ""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,14 +65,11 @@ def _build_beijing_date_line() -> str:
* `developer`: a sub-agent that can develop apps/code/html/website and laterimprove this developed apps/code/html/website according to the suggestions from the `evaluator`, by using terminal and other professional tools.
* `evaluator`: a sub-agent that can evaluate the apps/code/html/website's (developed by the `developer`) performance, user experience, and so on, and present professional suggestions to the `developer` for the apps/code/html/website improvement.
* `terminal`: A tool set that can execute terminal commands. **Path restriction:** Do not `cd` to other directories; always operate from the current working directory. When operating on files, always use explicit relative or absolute paths. **Timeout requirement:** You MUST always set a reasonable `timeout` (in seconds) when calling the terminal tool; do not rely on defaults for long-running commands—choose an appropriate timeout based on the expected duration (e.g., 60–120 seconds for builds, 30–60 for quick commands).
* `media_comprehension`: Sub-agent for understanding images, audio, and video files.
- **When to invoke:** Only when the user explicitly states they want to analyze, read, or comprehend such media (e.g., "帮我看看这张图", "分析这段音频", "解读这个视频").
- **Cannot process:** documents (.pdf), spreadsheets (.xlsx/.csv), presentations (.pptx), code (.py/.js/.ts), archives (.zip/.tar/.rar), executables (.exe/.bin), databases (.db/.sqlite), structured data (.json/.xml/.yaml), web pages (.html/.htm).
* `video_creator`: Sub-agent for creating videos from images, audio, and text.
- **When to invoke:** All video creation tasks MUST be routed to `video_creator`.
- **Call params:** `content` (required: prompt text); `info` (optional, JSON string).
- **Example info:** `{"image_url": "<path_or_base64>", "resolution": "720p", "duration": 5, "fps": 24, "output_dir": "./output"}`; `duration` must be ≤ 5 seconds.
- **Supported info keys:** image_url, resolution, duration, fps, poll, poll_interval, poll_timeout, download_video, output_dir.
- **Example info:** `{"image_url": "<path_or_base64>", "reference_images": ["<path1>", "<path2>"], "resolution": "720p", "duration": 5, "fps": 24, "output_dir": "./output"}`; `duration` must be ≤ 5 seconds.
- **Supported info keys:** image_url, reference_images (list of paths/URLs/base64), resolution, duration, fps, poll, poll_interval, poll_timeout, download_video, output_dir.

## 4. Available Skills
* Please be aware that if you need to have access to a particular skill to help you to complete the task, you MUST use the appropriate `SKILL_tool` to activate the skill, which returns you the exact skill content.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,8 @@ async def async_policy(self, observation: Observation, info: Dict[str, Any] = {}
**Invocation format (MUST follow when calling):**
- `content`: Required. The video generation prompt (text description of what to create).
- `info`: Optional JSON string. Use when passing image/video params, e.g.:
{"image_url": "<data_path_or_base64_string>", "resolution": "720p", "duration": 5, "fps": 24, "output_dir": "./output"}
Supported keys: image_url, resolution, duration (must be ≤ 5 seconds), fps, poll, poll_interval, poll_timeout, download_video, output_dir.
{"image_url": "<data_path_or_base64_string>", "reference_images": ["<path1>", "<path2>"], "resolution": "720p", "duration": 5, "fps": 24, "output_dir": "./output"}
Supported keys: image_url, reference_images (list of paths/URLs/base64), resolution, duration (must be ≤ 5 seconds), fps, poll, poll_interval, poll_timeout, download_video, output_dir.
"""
)
def build_video_creator_swarm():
Expand Down
8 changes: 8 additions & 0 deletions aworld/agents/video_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,12 @@ async def async_policy(
# Resolve video parameters (observation.info overrides instance defaults)
image_url: Optional[str] = obs_info.pop("image_url", None)
image_url = _resolve_image_url_to_base64(image_url)
reference_images = obs_info.pop("reference_images", None)
if reference_images:
reference_images = [
_resolve_image_url_to_base64(image_url)
for image_url in reference_images
]
Comment on lines +198 to +201
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The loop variable image_url in this list comprehension shadows the outer variable image_url defined on line 194. While this is functionally correct in Python 3 due to list comprehensions having their own scope, it is very confusing to read and maintain. A future reader could easily mistake this for a bug where the outer image_url is being used repeatedly. To improve clarity, please use a different name for the loop variable.

Suggested change
reference_images = [
_resolve_image_url_to_base64(image_url)
for image_url in reference_images
]
reference_images = [
_resolve_image_url_to_base64(ref_image_url)
for ref_image_url in reference_images
]

resolution: Optional[str] = obs_info.pop("resolution", self.default_resolution)
duration: Optional[float] = obs_info.pop("duration", self.default_duration)
fps: Optional[int] = obs_info.pop("fps", self.default_fps)
Expand All @@ -204,6 +210,8 @@ async def async_policy(

# Any remaining keys in obs_info are forwarded to the provider
extra_kwargs = obs_info
if reference_images:
extra_kwargs["reference_images"] = reference_images

logger.info(
f"[VideoAgent:{self.id()}] Generating video: "
Expand Down
102 changes: 92 additions & 10 deletions aworld/models/ant_video_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ def is_terminal_status(self, status_raw): return status_raw in {"succeeded", "fa
"""

import abc
import base64
import mimetypes
import os
import re
import time
Expand Down Expand Up @@ -80,6 +82,88 @@ def is_terminal_status(self, status_raw): return status_raw in {"succeeded", "fa
"failed": "failed",
}

# Veo accepts only these image MIME types.
_VEO_ALLOWED_IMAGE_MIMES: frozenset = frozenset({"image/jpeg", "image/png", "image/webp"})

# Leading magic bytes -> MIME type, matched with a plain prefix test.
# WebP is intentionally not listed: its "WEBP" marker sits at offset 8 inside
# a RIFF container, so it needs the dedicated check in
# _infer_image_mime_from_bytes rather than a prefix comparison.
_IMAGE_MAGIC: Dict[bytes, str] = {
    b"\xff\xd8\xff": "image/jpeg",
    b"\x89PNG\r\n\x1a\n": "image/png",
    b"GIF87a": "image/gif",
    b"GIF89a": "image/gif",
}


def _infer_image_mime_from_bytes(data: bytes) -> str:
    """Infer an image MIME type from the leading bytes of ``data``.

    Args:
        data: Raw image bytes. Only the first 12 bytes are inspected, so a
            short prefix of the file is sufficient.

    Returns:
        ``"image/webp"`` for a RIFF container tagged ``WEBP``, otherwise the
        MIME type of the first matching signature in ``_IMAGE_MAGIC``, or
        ``"image/jpeg"`` when nothing matches (including empty input).
    """
    # WebP layout: b"RIFF" + 4-byte size + b"WEBP".
    if len(data) >= 12 and data.startswith(b"RIFF") and data[8:12] == b"WEBP":
        return "image/webp"
    for sig, mime in _IMAGE_MAGIC.items():
        if data.startswith(sig):
            return mime
    return "image/jpeg"  # fallback for unknown or truncated input


def _parse_image_for_veo_payload(
    image_data: Optional[str],
    image_path: Optional[str],
) -> Optional[Tuple[str, str]]:
    """Parse image input into (base64_str, mime_type) for a Veo bytesBase64Encoded payload.

    Resolution order:

    - ``image_data`` is an http(s) URL: ignored here; ``image_path`` is used
      if available.
    - ``image_data`` is a data URL (``data:image/xxx;base64,yyy``): the MIME
      type is taken from the prefix and the base64 from the body.
    - ``image_data`` is anything else: treated as raw base64; the MIME type is
      inferred from the decoded magic bytes.
    - ``image_path`` (only when no base64 was obtained above): the file is
      read and base64-encoded; the MIME type comes from the file extension
      via ``mimetypes``, falling back to the magic bytes.

    Args:
        image_data: URL, data URL, or raw base64 string; may be ``None``.
        image_path: Local file path used when ``image_data`` yields no bytes.

    Returns:
        ``(base64_str, mime_type)``, or ``None`` when no image bytes could be
        obtained. ``mime_type`` is always a member of
        ``_VEO_ALLOWED_IMAGE_MIMES``; anything else is replaced by
        ``"image/jpeg"`` after logging a warning.
    """
    b64_str: Optional[str] = None
    mime: Optional[str] = None

    if image_data:
        s = image_data.strip()
        if s.startswith(("http://", "https://")):
            # Remote URLs cannot be inlined as bytes; fall through to
            # image_path if available.
            pass
        elif s.startswith("data:") and ";base64," in s:
            prefix, _, b64_str = s.partition(";base64,")
            # "data:image/png;charset=x" -> "image/png". Keep the full
            # type/subtype: the allowed set holds full MIME types, so the
            # bare subtype ("png") would never match.
            media_type = prefix[len("data:"):].split(";", 1)[0].strip().lower()
            if media_type.startswith("image/") and media_type != "image/":
                mime = media_type
            else:
                mime = "image/jpeg"
        else:
            b64_str = s  # raw base64; MIME is inferred from the bytes below

    if not b64_str and image_path:
        # SECURITY NOTE: image_path is read as-is. When it originates from
        # untrusted input (e.g. LLM tool-call arguments), the caller must
        # restrict it to an allowed directory first; otherwise arbitrary
        # local files could be read and sent to the provider.
        b64_str = VideoGenProviderBase.read_file_as_base64(image_path)
        mime_guess, _ = mimetypes.guess_type(image_path)
        if mime_guess and mime_guess.startswith("image/"):
            mime = mime_guess
        else:
            mime = None  # infer from bytes

    if not b64_str:
        return None

    if not mime:
        try:
            # 32 base64 chars decode to 24 bytes, enough for every signature
            # checked (WebP needs 12).
            chunk = b64_str[:32]
            pad = (4 - len(chunk) % 4) % 4
            raw = base64.b64decode(chunk + "=" * pad)
        except ValueError:
            # binascii.Error (malformed base64) is a ValueError subclass;
            # non-ASCII input raises ValueError directly.
            mime = "image/jpeg"
        else:
            mime = _infer_image_mime_from_bytes(raw)

    # Veo only accepts image/jpeg, image/png, image/webp
    if mime not in _VEO_ALLOWED_IMAGE_MIMES:
        logger.warning(
            "[VeoAdapter] image mime %r not in allowed set %s; using image/jpeg",
            mime,
            sorted(_VEO_ALLOWED_IMAGE_MIMES),
        )
        mime = "image/jpeg"

    return (b64_str, mime)


# ---------------------------------------------------------------------------
# ModelAdapter — base class for per-vendor payload/response logic
Expand Down Expand Up @@ -788,9 +872,9 @@ class VeoAdapter(ModelAdapter):

# Veo resolution strings (width x height)
_RESOLUTION_MAP: Dict[VideoResolution, str] = {
VideoResolution.RES_480P: "854x480",
VideoResolution.RES_720P: "1280x720",
VideoResolution.RES_1080P: "1920x1080",
VideoResolution.RES_480P: "480p",
VideoResolution.RES_720P: "720p",
VideoResolution.RES_1080P: "1080p",
}

# ------------------------------------------------------------------
Expand All @@ -806,15 +890,13 @@ def build_submit_payload(self,

instance: Dict[str, Any] = {"prompt": request.prompt or ""}

# Image-to-video: Veo accepts an image in the instance (base64 or URL)
# Image-to-video: Veo accepts an image in the instance (base64 + mimeType)
is_image2video = False
image_data: Optional[str] = request.image_url
if not image_data and request.image_path:
b64 = VideoGenProviderBase.read_file_as_base64(request.image_path)
image_data = f"data:image/jpeg;base64,{b64}"
if image_data:
parsed = _parse_image_for_veo_payload(request.image_url, request.image_path)
if parsed:
b64_str, mime_type = parsed
is_image2video = True
instance["image"] = {"bytesBase64Encoded": image_data}
instance["image"] = {"bytesBase64Encoded": b64_str, "mimeType": mime_type}

parameters: Dict[str, Any] = {}

Expand Down