diff --git a/src/agentscope_runtime/tools/__init__.py b/src/agentscope_runtime/tools/__init__.py index 8eb3d6017..63d7786a4 100644 --- a/src/agentscope_runtime/tools/__init__.py +++ b/src/agentscope_runtime/tools/__init__.py @@ -60,6 +60,21 @@ from .generations.image_generation_wan25 import ( ImageGenerationWan25, ) +from .generations.image_generation_wan26 import ( + ImageGenerationWan26, +) +from .generations.async_image_to_video_wan26 import ( + ImageToVideoWan26Submit, +) +from .generations.async_text_to_video_wan26 import ( + TextToVideoWan26Submit, +) +from .generations.fetch_wan import ( + WanVideoFetch, +) +from .generations.qwen_image_edit_new import ( + QwenImageEditNew, +) class McpServerMeta(BaseModel): @@ -102,7 +117,11 @@ class McpServerMeta(BaseModel): ), "modelstudio_qwen_image": McpServerMeta( instructions="基于通义千问大模型的智能图像生成服务,提供高质量的图像处理和编辑功能", - components=[QwenImageGen, QwenImageEdit], + components=[ + QwenImageGen, + QwenImageEdit, + QwenImageEditNew, + ], ), "modelstudio_web_search": McpServerMeta( instructions="提供实时互联网搜索服务,提供准确及时的信息检索功能", @@ -116,4 +135,42 @@ class McpServerMeta(BaseModel): instructions="基于通义千问大模型的语音合成服务,支持多种语言语音合成功能", components=[QwenTextToSpeech], ), + "modelstudio_wan_multimodal": McpServerMeta( + instructions=( + "通义万相(Wan)多模态生成统一服务,支持文本/图像/语音到图像或视频的多种AI生成能力," + "包括图像生成、编辑、风格迁移、文生视频、图生视频、数字人表演等。" + "当前支持 anx-style-repaint-v1、wan2.1、wan2.2、wan2.5、wan2.6 模型版本" + "(wan2.1 仅用于基础图像编辑,wanx-style-repaint-v1仅用于人体风格重绘),各版本能力如下:\n" + "- 文本生成图像:wan2.2、wan2.5、wan2.6 均支持,优先使用 wan2.6(画质最优),其次 wan2.5\n" + "- 图像编辑:wan2.1(基础)、wan2.5、wan2.6 支持,优先使用 wan2.6\n" + "- 文本/图像生成视频:wan2.2、wan2.5、wan2.6 均支持,但能力逐代增强:\n" + " - 视频时长:wan2.2 仅支持 5 秒;wan2.5 支持 5 或 10 秒;wan2.6 支持 5、10 或 15 秒\n" # noqa + " - 音频能力:支持自动配音或传入自定义音频实现声画同步(仅 wan2.5 和 wan2.6 支持)\n" + " - 多镜头叙事:可生成包含多个镜头的视频,并在切换时保持主体一致性(仅 wan2.6 支持)\n" + "- 数字人生成(音频驱动人物视频):基于单张人物图像与音频,生成自然说话、唱歌或表演视频;" + "支持肖像、半身或全身画面,不限画幅比例;由 wan2.2 提供基础支持,wan2.5/2.6 支持更高质量与音频同步\n" + "注意:异步视频仅提交生成任务,需配合的 Fetch 
工具获取结果。\n" + "注意:不同任务对模型版本有严格依赖,请务必结合具体工具描述中的[模型版本]信息进行调用。" + ), + components=[ + # 基于通义万相大模型的智能图像生成服务,提供高质量的图像处理和编辑功能 + ImageGeneration, # wan2.2-t2i 文生图 + ImageEdit, # wan2.1-edit 图生图 + ImageStyleRepaint, # wan2.2-repaint 图风格迁移 + # 基于通义万相大模型提供AI视频生成服务,支持文本到视频、图像到视频和语音到视频的多模态生成功能 + TextToVideoSubmit, # wan2.2-t2v 文生视频提交 + ImageToVideoSubmit, # wan2.2-i2v 图生视频提交 + SpeechToVideoSubmit, # wan2.2-s2v + # 基于通义万相大模型2.5版本提供的图像和视频生成服务 + ImageGenerationWan25, # wan2.5 文生图 + ImageEditWan25, # wan2.5 图生图 + TextToVideoWan25Submit, # wan2.5 文生视频提交 + ImageToVideoWan25Submit, # wan2.5 图生视频提交 + # 基于通义万相2.6大模型的智能图像生成服务,提供高质量的图像处理和编辑功能 + ImageGenerationWan26, # wanx2.6-t2i 文生图 + ImageToVideoWan26Submit, # wan2.6-i2v 图生视频提交 + TextToVideoWan26Submit, # wan2.6-t2v 文生视频提交 + WanVideoFetch, # wan 所有异步视频任务结果查询 + ], + ), } diff --git a/src/agentscope_runtime/tools/generations/async_image_to_video.py b/src/agentscope_runtime/tools/generations/async_image_to_video.py index d100aaec0..842bbf4ba 100644 --- a/src/agentscope_runtime/tools/generations/async_image_to_video.py +++ b/src/agentscope_runtime/tools/generations/async_image_to_video.py @@ -101,8 +101,9 @@ class ImageToVideoSubmit( name: str = "modelstudio_image_to_video_submit_task" description: str = ( - "通义万相-图生视频模型的异步任务提交工具。根据首帧图像和文本提示词,生成时长为5秒的无声视频。" - "同时支持特效模板,可添加“魔法悬浮”、“气球膨胀”等效果,适用于创意视频制作、娱乐特效展示等场景。" + "[版本: wan2.2] 通义万相图生视频模型(wan2.2-i2v-flash)异步任务提交工具。基于单张首帧图像和文本提示,生成一段5秒的无声动态视频。\n" # noqa + "支持分辨率:480P、720P、1080P;不支持音频(无声视频)。\n" + "提供特效模板(如“魔法悬浮”、“气球膨胀”),适用于创意视频制作、娱乐特效展示等场景。\n" ) @trace(trace_type="AIGC", trace_name="image_to_video_submit") diff --git a/src/agentscope_runtime/tools/generations/async_image_to_video_wan25.py b/src/agentscope_runtime/tools/generations/async_image_to_video_wan25.py index 0833f633b..a0bd5a804 100644 --- a/src/agentscope_runtime/tools/generations/async_image_to_video_wan25.py +++ b/src/agentscope_runtime/tools/generations/async_image_to_video_wan25.py @@ -111,8 +111,9 @@ class ImageToVideoWan25Submit( 
name: str = "modelstudio_image_to_video_wan25_submit_task" description: str = ( - "通义万相-图生视频模型的异步任务提交工具。根据首帧图像和文本提示词,生成时长为5秒的无声视频。" - "同时支持特效模板,可添加“魔法悬浮”、“气球膨胀”等效果,适用于创意视频制作、娱乐特效展示等场景。" + "[版本: wan2.5] 通义万相图生视频模型(wan2.5-i2v)异步提交工具。基于单张首帧图像和文本提示,生成一段流畅的有声视频。\n" # noqa + "支持视频时长:5秒或10秒;分辨率:480P、720P、1080P;支持自动配音或传入自定义音频,实现音画同步。\n" + "提供特效模板(如“魔法悬浮”、“气球膨胀”),适用于创意视频、娱乐特效等场景。\n" ) @trace(trace_type="AIGC", trace_name="image_to_video_wan25_submit") diff --git a/src/agentscope_runtime/tools/generations/async_image_to_video_wan26.py b/src/agentscope_runtime/tools/generations/async_image_to_video_wan26.py new file mode 100644 index 000000000..34f44e2fb --- /dev/null +++ b/src/agentscope_runtime/tools/generations/async_image_to_video_wan26.py @@ -0,0 +1,290 @@ +# -*- coding: utf-8 -*- +# pylint:disable=abstract-method, deprecated-module, wrong-import-order + +import os +import uuid +from http import HTTPStatus +from typing import Any, Optional + +from dashscope.aigc.video_synthesis import AioVideoSynthesis +from mcp.server.fastmcp import Context +from pydantic import BaseModel, Field + +from ..base import Tool +from ..utils.api_key_util import get_api_key, ApiNames +from ...engine.tracing import trace, TracingUtil + + +class ImageToVideoWan26SubmitInput(BaseModel): + """ + Input model for submitting an image-to-video task using wan2.6-i2v. 
+ """ + + image_url: str = Field( + ..., + description="首帧图像的公网可访问URL,支持 JPG/PNG 格式,Base64编码径", + ) + prompt: Optional[str] = Field( + default=None, + description="正向提示词,描述希望视频中发生的动作或变化,例如“镜头缓慢推进,风吹动树叶”。", + ) + negative_prompt: Optional[str] = Field( + default=None, + description="反向提示词,用于排除不希望出现的内容,例如“模糊、闪烁、变形、水印”。", + ) + audio_url: Optional[str] = Field( + default=None, + description="自定义音频文件的公网URL。参数优先级:audio_url > audio。", + ) + audio: Optional[bool] = Field( + default=None, + description="是否自动生成配音。仅在 audio_url 未提供时生效。", + ) + template: Optional[str] = Field( + default=None, + description="视频特效模板,如:squish(解压捏捏)、flying(魔法悬浮)、carousel(时光木马)等。", + ) + resolution: Optional[str] = Field( + default=None, + description="视频分辨率,可选值:'720P'、'1080P'。默认为 '1080P'。", + ) + duration: Optional[int] = Field( + default=None, + description="视频时长(秒),可选值:5、10、15。默认为 5。", + ) + prompt_extend: Optional[bool] = Field( + default=None, + description=" Prompt 智能改写。开启后可提升生成效果,并使 shot_type 生效," + "默认值为 true:开启智能改写。false:不开启智能改写。", + ) + shot_type: Optional[str] = Field( + default=None, + description="镜头类型,仅在 prompt_extend=true 时生效。" + "可选值:'single'(单镜头,默认)、'multi'(多镜头切换)。" + "参数优先级高于 prompt 中的描述。", + ) + watermark: Optional[bool] = Field( + default=None, + description="是否在视频中添加水印(如“AI生成”标识)。默认不添加。", + ) + seed: Optional[int] = Field( + default=None, + description="随机种子,用于结果复现。", + ) + ctx: Optional[Context] = Field( + default=None, + description="HTTP request context containing headers for mcp only, " + "don't generate it", + ) + + +class ImageToVideoWan26SubmitOutput(BaseModel): + """ + Output of the image-to-video task submission. 
+ """ + + task_id: str = Field( + title="Task ID", + description="异步任务的唯一标识符。", + ) + task_status: str = Field( + title="Task Status", + description="视频生成的任务状态,PENDING:任务排队中,RUNNING:任务处理中,SUCCEEDED:任务执行成功," + "FAILED:任务执行失败,CANCELED:任务取消成功,UNKNOWN:任务不存在或状态未知", + ) + request_id: Optional[str] = Field( + default=None, + title="Request ID", + description="本次请求的唯一ID,可用于日志追踪。", + ) + + +class ImageToVideoWan26Submit( + Tool[ImageToVideoWan26SubmitInput, ImageToVideoWan26SubmitOutput], +): + """ + Submit an image-to-video generation task using the wan2.6-i2v model. + """ + + name: str = "modelstudio_image_to_video_wan26_submit_task" + description: str = ( + "[版本: wan2.6] 通义万相图生视频模型(wan2.6-i2v)异步任务提交工具。基于单张首帧图像和文本提示,生成一段流畅的有声视频。\n" # noqa + "支持视频时长:5秒、10秒或15秒;分辨率:720P、1080P;支持自动配音或传入自定义音频,实现音画同步。\n" + "独家支持多镜头叙事:可生成包含多个镜头的视频,并在镜头切换时保持主体一致性。\n" + "提供特效模板(如“魔法悬浮”、“气球膨胀”),适用于创意视频制作、娱乐特效展示等场景。\n" + ) + + @trace(trace_type="AIGC", trace_name="image_to_video_wan26_submit") + async def arun( + self, + args: ImageToVideoWan26SubmitInput, + **kwargs: Any, + ) -> ImageToVideoWan26SubmitOutput: + trace_event = kwargs.pop("trace_event", None) + request_id = TracingUtil.get_request_id() + + try: + api_key = get_api_key(ApiNames.dashscope_api_key, **kwargs) + except AssertionError as e: + raise ValueError("Please set valid DASHSCOPE_API_KEY!") from e + + model_name = kwargs.get( + "model_name", + os.getenv("IMAGE_TO_VIDEO_MODEL_NAME", "wan2.6-i2v"), + ) + + # 构建 parameters(全部为可选参数) + parameters = {} + if args.audio is not None: + parameters["audio"] = args.audio + if args.resolution: + parameters["resolution"] = args.resolution + if args.duration is not None: + parameters["duration"] = args.duration + if args.prompt_extend is not None: + parameters["prompt_extend"] = args.prompt_extend + if args.shot_type: + parameters["shot_type"] = args.shot_type + if args.watermark is not None: + parameters["watermark"] = args.watermark + if args.seed is not None: + parameters["seed"] = args.seed + 
aio_video_synthesis = AioVideoSynthesis() + + # ⚠️ 关键修正:DashScope SDK 要求使用 img_url,不是 input + response = await aio_video_synthesis.async_call( + model=model_name, + api_key=api_key, + img_url=args.image_url, # ✅ 正确参数名 + prompt=args.prompt, + negative_prompt=args.negative_prompt, + audio_url=args.audio_url, + template=args.template, + **parameters, + ) + + if trace_event: + trace_event.on_log( + "", + **{ + "step_suffix": "results", + "payload": { + "request_id": request_id, + "submit_task": response, + }, + }, + ) + + if ( + response.status_code != HTTPStatus.OK + or not response.output + or response.output.task_status in ["FAILED", "CANCELED"] + ): + raise RuntimeError( + f"Failed to submit image-to-video task: {response}", + ) + + request_id = response.request_id or request_id or str(uuid.uuid4()) + + return ImageToVideoWan26SubmitOutput( + request_id=request_id, + task_id=response.output.task_id, + task_status=response.output.task_status, + ) + + +# ========== Fetch 部分保持不变(仅微调描述) ========== + + +class ImageToVideoWan26FetchInput(BaseModel): # noqa + task_id: str = Field( + title="Task ID", + description="要查询的视频生成任务ID。", + ) + ctx: Optional[Context] = Field( + default=None, + description="HTTP request context containing headers for mcp only, " + "don't generate it", + ) + + +class ImageToVideoWan26FetchOutput(BaseModel): + video_url: str = Field( + title="Video URL", + description="生成视频的公网可访问URL(MP4格式)。", + ) + task_id: str = Field( + title="Task ID", + description="任务ID,与输入一致。", + ) + task_status: str = Field( + title="Task Status", + description="任务最终状态,成功时为 SUCCEEDED。", + ) + request_id: Optional[str] = Field( + default=None, + title="Request ID", + description="请求ID,用于追踪。", + ) + + +class ImageToVideoWan26Fetch( + Tool[ImageToVideoWan26FetchInput, ImageToVideoWan26FetchOutput], +): + name: str = "modelstudio_image_to_video_wan26_fetch_result" + description: str = ( + "查询通义万相 wan2.6-i2v 图生视频任务的结果。" + "输入 Task ID,返回生成的视频 URL 及任务状态。" + "请在提交任务后轮询此接口,直到任务状态变为 
SUCCEEDED。" + ) + + @trace(trace_type="AIGC", trace_name="image_to_video_wan26_fetch") + async def arun( + self, + args: ImageToVideoWan26FetchInput, + **kwargs: Any, + ) -> ImageToVideoWan26FetchOutput: + trace_event = kwargs.pop("trace_event", None) + request_id = TracingUtil.get_request_id() + + try: + api_key = get_api_key(ApiNames.dashscope_api_key, **kwargs) + except AssertionError as e: + raise ValueError("Please set valid DASHSCOPE_API_KEY!") from e + + aio_video_synthesis = AioVideoSynthesis() + + response = await aio_video_synthesis.fetch( + api_key=api_key, + task=args.task_id, + ) + + if trace_event: + trace_event.on_log( + "", + **{ + "step_suffix": "results", + "payload": { + "request_id": response.request_id, + "fetch_result": response, + }, + }, + ) + + if ( + response.status_code != HTTPStatus.OK + or not response.output + or response.output.task_status in ["FAILED", "CANCELED"] + ): + raise RuntimeError( + f"Failed to fetch image-to-video result: {response}", + ) + + request_id = response.request_id or request_id or str(uuid.uuid4()) + + return ImageToVideoWan26FetchOutput( + video_url=response.output.video_url, + task_id=response.output.task_id, + task_status=response.output.task_status, + request_id=request_id, + ) diff --git a/src/agentscope_runtime/tools/generations/async_speech_to_video.py b/src/agentscope_runtime/tools/generations/async_speech_to_video.py index 3b14f1406..949d6b768 100644 --- a/src/agentscope_runtime/tools/generations/async_speech_to_video.py +++ b/src/agentscope_runtime/tools/generations/async_speech_to_video.py @@ -79,9 +79,9 @@ class SpeechToVideoSubmit( name: str = "modelstudio_speech_to_video_submit_task" description: str = ( - "数字人wan2.2-s2v模型的异步任务提交工具。能基于单张图片和音频,生成动作自然的说话、" - "唱歌或表演视频。通过输入的人声音频,驱动静态图片中的人物实现口型、表情和动作与音频同步。" - "支持说话、唱歌、表演三种对口型场景,支持真人及卡通人物,提供480P、720P两档分辨率选项。" + "[版本: wan2.2] 通义万相语音驱动视频模型(wan2.2-s2v)异步任务提交工具。基于单张人物图像和一段音频,生成动作自然的说话、唱歌或表演视频。\n" # noqa + "支持肖像、半身或全身人物图像,不限画幅比例;视频时长固定为5秒,为有声视频(音频即输入源)。\n" + 
"适用于数字人播报、虚拟表演等场景。\n" ) @staticmethod diff --git a/src/agentscope_runtime/tools/generations/async_text_to_video.py b/src/agentscope_runtime/tools/generations/async_text_to_video.py index fc8274244..d7c92a6a9 100644 --- a/src/agentscope_runtime/tools/generations/async_text_to_video.py +++ b/src/agentscope_runtime/tools/generations/async_text_to_video.py @@ -84,8 +84,9 @@ class TextToVideoSubmit( name: str = "modelstudio_text_to_video_submit_task" description: str = ( - "通义万相-文生视频模型的异步任务提交工具。可根据文本生成5秒无声视频,支持 480P、720P、1080P 多种分辨率档位," - "并在各档位下提供多个具体尺寸选项,以适配不同业务场景。" + "[版本: wan2.2] 通义万相文生视频模型(wan2.2-t2v-plus)" + "异步任务提交工具。基于文本提示生成一段固定5秒的高清无声视频。\n" + "支持分辨率:480P、1080P;不支持音频、不支持口型同步、不支持超过5秒的视频。\n" ) @trace(trace_type="AIGC", trace_name="text_to_video_submit") diff --git a/src/agentscope_runtime/tools/generations/async_text_to_video_wan25.py b/src/agentscope_runtime/tools/generations/async_text_to_video_wan25.py index fca97a978..8adc2e1e4 100644 --- a/src/agentscope_runtime/tools/generations/async_text_to_video_wan25.py +++ b/src/agentscope_runtime/tools/generations/async_text_to_video_wan25.py @@ -94,8 +94,10 @@ class TextToVideoWan25Submit( name: str = "modelstudio_text_to_video_wan25_submit_task" description: str = ( - "通义万相-文生视频模型的异步任务提交工具。可根据文本生成5秒或10秒有声视频,支持 480P、720P、1080P 多种分辨率档位," - "支持自动配音,或传入自定义音频文件,实现音画同步。" + "[版本: wan2.5] 通义万相文生视频模型(wan2.5-t2v-preview)" + "异步任务提交工具。基于文本提示生成一段流畅的有声视频。\n" + "支持视频时长:5秒或10秒;分辨率:480P、720P、1080P;支持自动配音或传入自定义音频,实现音画同步。\n" + "不支持多镜头叙事(仅 wan2.6 支持)。\n" ) @trace(trace_type="AIGC", trace_name="text_to_video_wan25_submit") diff --git a/src/agentscope_runtime/tools/generations/async_text_to_video_wan26.py b/src/agentscope_runtime/tools/generations/async_text_to_video_wan26.py new file mode 100644 index 000000000..4cb580eb6 --- /dev/null +++ b/src/agentscope_runtime/tools/generations/async_text_to_video_wan26.py @@ -0,0 +1,295 @@ +# -*- coding: utf-8 -*- +# pylint:disable=abstract-method, deprecated-module, wrong-import-order + +import os 
+import uuid +from http import HTTPStatus +from typing import Any, Optional + +from dashscope.aigc.video_synthesis import AioVideoSynthesis +from mcp.server.fastmcp import Context +from pydantic import BaseModel, Field + +from ..base import Tool +from ..utils.api_key_util import get_api_key, ApiNames +from ...engine.tracing import trace, TracingUtil + + +class TextToVideoWan26SubmitInput(BaseModel): + """ + Input model for text-to-video generation submission using wan2.6-t2v. + """ + + prompt: str = Field( + ..., + description="正向提示词,描述希望生成的视频内容,例如“一只宇航员猫在火星上跳舞”", + ) + negative_prompt: Optional[str] = Field( + default=None, + description="反向提示词,描述不希望出现在视频中的内容,例如“模糊、水印、文字、变形”", + ) + audio_url: Optional[str] = Field( + default=None, + description="自定义音频文件URL,模型将使用该音频生成视频。" + "参数优先级:audio_url > audio,仅在 audio_url 为空时 audio 生效。", + ) + audio: Optional[bool] = Field( + default=None, + description="是否自动生成音频。" + "参数优先级:audio_url > audio,仅在 audio_url 为空时 audio 生效。", + ) + resolution: Optional[str] = Field( + default=None, + description="视频分辨率,例如:720p、1080p 等(具体支持值请参考文档)", + ) + duration: Optional[int] = Field( + default=None, + description="视频生成时长,单位为秒,通常为5秒", + ) + prompt_extend: Optional[bool] = Field( + default=None, + description="是否开启prompt智能改写,开启后使用大模型对输入prompt进行智能优化", + ) + shot_type: Optional[str] = Field( + default=None, + description="镜头类型,仅在 prompt_extend=true 时生效。" + "可选值:'single'(单镜头,默认)、'multi'(多镜头切换)。" + "参数优先级高于 prompt 中的描述。", + ) + watermark: Optional[bool] = Field( + default=None, + description="是否添加水印,默认不设置", + ) + seed: Optional[int] = Field( + default=None, + description="随机种子,用于结果复现。", + ) + ctx: Optional[Context] = Field( + default=None, + description="HTTP request context containing headers " + "for mcp only, don't generate it", + ) + + +class TextToVideoWan26SubmitOutput(BaseModel): + """ + Output model for text-to-video generation submission. 
+ """ + + task_id: str = Field( + title="Task ID", + description="视频生成的任务ID", + ) + + task_status: str = Field( + title="Task Status", + description="任务状态:PENDING(排队中)、RUNNING(处理中)、SUCCEEDED(成功)、" + "FAILED(失败)、CANCELED(已取消)、UNKNOWN(未知)", + ) + + request_id: Optional[str] = Field( + default=None, + title="Request ID", + description="请求ID,用于追踪", + ) + + +class TextToVideoWan26Submit( + Tool[TextToVideoWan26SubmitInput, TextToVideoWan26SubmitOutput], +): + """ + Service for submitting text-to-video + generation tasks using Wan 2.6 T2V model. + """ + + name: str = "modelstudio_text_to_video_wan26_submit_task" + description: str = ( + "[版本: wan2.6] 通义万相文生视频模型(wan2.6-t2v)异步任务提交工具。基于纯文本提示生成一段流畅的有声视频。\n" + "支持视频时长:5秒、10秒或15秒;分辨率:720P、1080P;支持自动配音或传入自定义音频,实现音画同步。\n" + "独家支持多镜头叙事:可生成包含多个镜头的视频,并在镜头切换时保持主体一致性。\n" + ) + + @trace(trace_type="AIGC", trace_name="text_to_video_wan26_submit") + async def arun( + self, + args: TextToVideoWan26SubmitInput, + **kwargs: Any, + ) -> TextToVideoWan26SubmitOutput: + trace_event = kwargs.pop("trace_event", None) + request_id = TracingUtil.get_request_id() + + try: + api_key = get_api_key(ApiNames.dashscope_api_key, **kwargs) + except AssertionError as e: + raise ValueError("Please set valid DASHSCOPE_API_KEY!") from e + + model_name = kwargs.get( + "model_name", + os.getenv("TEXT_TO_VIDEO_MODEL_NAME", "wan2.6-t2v"), + ) + + parameters = {} + if args.audio is not None: + parameters["audio"] = args.audio + if args.resolution: + parameters["resolution"] = args.resolution + if args.duration is not None: + parameters["duration"] = args.duration + if args.prompt_extend is not None: + parameters["prompt_extend"] = args.prompt_extend + if args.watermark is not None: + parameters["watermark"] = args.watermark + if args.shot_type: + parameters["shot_type"] = args.shot_type + if args.seed is not None: + parameters["seed"] = args.seed + aio_video_synthesis = AioVideoSynthesis() + + response = await aio_video_synthesis.async_call( + model=model_name, + 
api_key=api_key, + prompt=args.prompt, + negative_prompt=args.negative_prompt, + audio_url=args.audio_url, + **parameters, + ) + + if trace_event: + trace_event.on_log( + "", + **{ + "step_suffix": "results", + "payload": { + "request_id": request_id, + "submit_task": response, + }, + }, + ) + + if ( + response.status_code != HTTPStatus.OK + or not response.output + or response.output.task_status in ["FAILED", "CANCELED"] + ): + raise RuntimeError( + f"Failed to submit text-to-video task: {response}", + ) + + if not request_id: + request_id = response.request_id or str(uuid.uuid4()) + + result = TextToVideoWan26SubmitOutput( + request_id=request_id, + task_id=response.output.task_id, + task_status=response.output.task_status, + ) + return result + + +class TextToVideoWan26FetchInput(BaseModel): + """ + Input model for fetching text-to-video generation results. + """ + + task_id: str = Field( + title="Task ID", + description="视频生成的任务ID", + ) + ctx: Optional[Context] = Field( + default=None, + description="HTTP request context containing headers " + "for mcp only, don't generate it", + ) + + +class TextToVideoWan26FetchOutput(BaseModel): + """ + Output model for fetching text-to-video generation results. + """ + + video_url: str = Field( + title="Video URL", + description="生成的视频公网可访问URL", + ) + + task_id: str = Field( + title="Task ID", + description="视频生成的任务ID", + ) + + task_status: str = Field( + title="Task Status", + description="任务状态:PENDING、RUNNING、SUCCEEDED、FAILED、CANCELED、UNKNOWN", + ) + + request_id: Optional[str] = Field( + default=None, + title="Request ID", + description="请求ID", + ) + + +class TextToVideoWan26Fetch( + Tool[TextToVideoWan26FetchInput, TextToVideoWan26FetchOutput], +): + """ + Service for fetching text-to-video generation results. 
+ """ + + name: str = "modelstudio_text_to_video_wan26_fetch_result" + description: str = ( + "通义万相-文生视频模型(wan2.6-t2v)的异步任务结果查询工具,根据Task ID查询生成的视频URL。" + ) + + @trace(trace_type="AIGC", trace_name="text_to_video_wan26_fetch") + async def arun( + self, + args: TextToVideoWan26FetchInput, + **kwargs: Any, + ) -> TextToVideoWan26FetchOutput: + trace_event = kwargs.pop("trace_event", None) + request_id = TracingUtil.get_request_id() + + try: + api_key = get_api_key(ApiNames.dashscope_api_key, **kwargs) + except AssertionError as e: + raise ValueError("Please set valid DASHSCOPE_API_KEY!") from e + + aio_video_synthesis = AioVideoSynthesis() + + response = await aio_video_synthesis.fetch( + api_key=api_key, + task=args.task_id, + ) + + if trace_event: + trace_event.on_log( + "", + **{ + "step_suffix": "results", + "payload": { + "request_id": response.request_id, + "fetch_result": response, + }, + }, + ) + + if ( + response.status_code != HTTPStatus.OK + or not response.output + or response.output.task_status in ["FAILED", "CANCELED"] + ): + raise RuntimeError( + f"Failed to fetch text-to-video result: {response}", + ) + + if not request_id: + request_id = response.request_id or str(uuid.uuid4()) + + result = TextToVideoWan26FetchOutput( + video_url=response.output.video_url, + task_id=response.output.task_id, + task_status=response.output.task_status, + request_id=request_id, + ) + return result diff --git a/src/agentscope_runtime/tools/generations/fetch_wan.py b/src/agentscope_runtime/tools/generations/fetch_wan.py new file mode 100644 index 000000000..53c6d45ae --- /dev/null +++ b/src/agentscope_runtime/tools/generations/fetch_wan.py @@ -0,0 +1,121 @@ +# -*- coding: utf-8 -*- +# pylint:disable=abstract-method, deprecated-module, wrong-import-order +import uuid +from http import HTTPStatus +from typing import Any, Optional + +from dashscope.aigc.video_synthesis import AioVideoSynthesis +from mcp.server.fastmcp import Context +from pydantic import BaseModel, Field + 
+from ..base import Tool +from ..utils.api_key_util import get_api_key, ApiNames +from ...engine.tracing import trace, TracingUtil + + +class WanVideoFetchInput(BaseModel): + """ + Input for fetching any Tongyi Wanxiang video generation task result. + """ + + task_id: str = Field( + ..., + title="Task ID", + description="通义万相(Wan)视频生成任务返回的任务ID,适用于文生视频、图生视频等所有异步视频任务", + ) + ctx: Optional[Context] = Field( + default=None, + description="HTTP request context for MCP internal " + "use only — do not provide manually.", + ) + + +class WanVideoFetchOutput(BaseModel): + """ + Output of the Wan video task fetch result. + """ + + video_url: str = Field( + ..., + title="Video URL", + description="生成的视频公网可访问url", + ) + task_id: str = Field( + ..., + title="Task ID", + description="对应的任务ID", + ) + task_status: str = Field( + ..., + title="Task Status", + description="任务状态:SUCCEEDED(成功)、FAILED(失败)、" + "CANCELED(取消)、PENDING/RUNNING(进行中)", + ) + request_id: Optional[str] = Field( + default=None, + title="Request ID", + description="本次查询请求的唯一标识", + ) + + +class WanVideoFetch(Tool[WanVideoFetchInput, WanVideoFetchOutput]): + """ + Universal fetch tool for all Tongyi Wanxiang (Wan) video generation tasks. 
+ """ + + name: str = "modelstudio_wan_video_fetch_result" + description: str = ( + "通义万相(Wan)异步任务结果查询工具,根据Task ID查询生成的视频URL。适用于文生视频、图生视频等所有异步视频任务" + ) + + @trace(trace_type="AIGC", trace_name="wan_video_fetch") + async def arun( + self, + args: WanVideoFetchInput, + **kwargs: Any, + ) -> WanVideoFetchOutput: + trace_event = kwargs.pop("trace_event", None) + request_id = TracingUtil.get_request_id() + + try: + api_key = get_api_key(ApiNames.dashscope_api_key, **kwargs) + except AssertionError as e: + raise ValueError("Please set valid DASHSCOPE_API_KEY!") from e + + aio_video_synthesis = AioVideoSynthesis() + + response = await aio_video_synthesis.fetch( + api_key=api_key, + task=args.task_id, + ) + + if trace_event: + trace_event.on_log( + "", + **{ + "step_suffix": "results", + "payload": { + "request_id": response.request_id, + "fetch_result": response, + }, + }, + ) + + if ( + response.status_code != HTTPStatus.OK + or not response.output + or getattr(response.output, "task_status", None) + in ["FAILED", "CANCELED"] + ): + raise RuntimeError(f"Failed to fetch Wan video result: {response}") + + final_request_id = ( + request_id or response.request_id or str(uuid.uuid4()) + ) + + return WanVideoFetchOutput( + video_url=response.output.video_url, + task_id=response.output.task_id, + task_status=response.output.task_status, + request_id=final_request_id, + ) diff --git a/src/agentscope_runtime/tools/generations/image_edit.py b/src/agentscope_runtime/tools/generations/image_edit.py index ee52ae922..fb73857b5 100644 --- a/src/agentscope_runtime/tools/generations/image_edit.py +++ b/src/agentscope_runtime/tools/generations/image_edit.py @@ -82,7 +82,11 @@ class ImageEdit(Tool[ImageGenInput, ImageGenOutput]): """ name: str = "modelstudio_image_edit" - description: str = "AI图像编辑(图生图)服务,输入原图URL、编辑功能、文本描述和分辨率," "返回编辑后的图片URL。" + description: str = ( + "[版本: wan2.1] 通义万相图生图模型(wan2.1-imageedit)。AI图像编辑服务," + "输入一张图像和文本描述,生成语义一致的编辑结果,并返回图片URL。\n" + 
"仅支持单张输入图像,不支持多图输入。支持功能包括:图像风格化、局部风格化、文本引导编辑、去水印、画布扩展、超分辨率、上色、涂鸦等。\n" + ) @trace(trace_type="AIGC", trace_name="image_edit") async def arun(self, args: ImageGenInput, **kwargs: Any) -> ImageGenOutput: diff --git a/src/agentscope_runtime/tools/generations/image_edit_wan25.py b/src/agentscope_runtime/tools/generations/image_edit_wan25.py index 8a639ec82..ee2e755d8 100644 --- a/src/agentscope_runtime/tools/generations/image_edit_wan25.py +++ b/src/agentscope_runtime/tools/generations/image_edit_wan25.py @@ -68,7 +68,12 @@ class ImageEditWan25(Tool[ImageGenInput, ImageGenOutput]): """ name: str = "modelstudio_image_edit_wan25" - description: str = "AI图像编辑(图生图)服务,输入原图URL、编辑功能、文本描述和分辨率,返回编辑后的图片URL。" + description: str = ( + "[版本: wan2.5] 通义万相图生图模型(wan2.5-imageedit)。AI图像编辑服务," + "输入一张图像和文本描述,生成语义一致的编辑结果,并返回图片URL。\n" + "支持单张输入图像,支持多图融合或跨图主体一致性控制。\n" + "支持功能包括:图像风格化、局部风格化、文本引导编辑、去水印、画布扩展、超分辨率、上色、涂鸦等。\n" + ) @trace(trace_type="AIGC", trace_name="image_edit_wan25") async def arun(self, args: ImageGenInput, **kwargs: Any) -> ImageGenOutput: diff --git a/src/agentscope_runtime/tools/generations/image_generation.py b/src/agentscope_runtime/tools/generations/image_generation.py index c6c5d438f..5cc1da313 100644 --- a/src/agentscope_runtime/tools/generations/image_generation.py +++ b/src/agentscope_runtime/tools/generations/image_generation.py @@ -73,7 +73,12 @@ class ImageGeneration(Tool[ImageGenInput, ImageGenOutput]): """ name: str = "modelstudio_image_gen" - description: str = "AI绘画(图像生成)服务,输入文本描述和图像分辨率,返回根据文本信息绘制的图片URL。" + description: str = ( + "[版本: wan2.2] 通义万相文生图模型(wan2.2-t2i-flash)。" + "AI绘画服务,专注于快速生成概念图、插画、设计素材等。\n" + "此为轻量级 flash 版本,优先保障生成速度,适用于对响应时效要求高、画质要求适中的场景。\n" + "支持固定分辨率(如 1024×1024),提供正向/反向提示词、智能 prompt 改写功能。输入文本描述,返回生成图像的URL。\n" + ) @trace(trace_type="AIGC", trace_name="image_generation") async def arun(self, args: ImageGenInput, **kwargs: Any) -> ImageGenOutput: diff --git a/src/agentscope_runtime/tools/generations/image_generation_wan25.py 
b/src/agentscope_runtime/tools/generations/image_generation_wan25.py index a6e76ef96..e3579ede2 100644 --- a/src/agentscope_runtime/tools/generations/image_generation_wan25.py +++ b/src/agentscope_runtime/tools/generations/image_generation_wan25.py @@ -72,7 +72,13 @@ class ImageGenerationWan25(Tool[ImageGenInput, ImageGenOutput]): """ name: str = "modelstudio_image_gen_wan25" - description: str = "AI绘画(图像生成)服务,输入文本描述和图像分辨率,返回根据文本信息绘制的图片URL。" + description: str = ( + "[版本: wan2.5] 通义万相文生图模型(wan2.5-t2i-preview)。" + "AI绘画服务,根据文本描述生成高质量图像,并返回图片URL。\n" + "支持自定义分辨率:图像面积介于 768×768 至 1440×1440 像素之间," + "可在该范围内自由设置宽高比(例如 768×2700)。\n" + "可生成摄影、动漫、油画等多种艺术风格,支持中英文复杂语义理解与文本渲染。\n" + ) @trace(trace_type="AIGC", trace_name="image_generation_wan25") async def arun(self, args: ImageGenInput, **kwargs: Any) -> ImageGenOutput: diff --git a/src/agentscope_runtime/tools/generations/image_generation_wan26.py b/src/agentscope_runtime/tools/generations/image_generation_wan26.py new file mode 100644 index 000000000..2022d1465 --- /dev/null +++ b/src/agentscope_runtime/tools/generations/image_generation_wan26.py @@ -0,0 +1,204 @@ +# -*- coding: utf-8 -*- +# pylint:disable=abstract-method, deprecated-module, wrong-import-order +# pylint:disable=too-many-nested-blocks, too-many-branches, too-many-statements + +import uuid +from typing import Any, Optional + +from dashscope import AioMultiModalConversation +from mcp.server.fastmcp import Context +from pydantic import BaseModel, Field + +from ..base import Tool +from ..utils.api_key_util import get_api_key, ApiNames +from ...engine.tracing import trace, TraceType, TracingUtil + + +class ImageGenerationWan26Input(BaseModel): + """ + Input schema for Wanx 2.6 text-to-image generation. 
+ """ + + prompt: str = Field( + ..., + description="正向提示词,描述期望生成的图像内容,建议详细且清晰。超过800字符将被截断。", + ) + negative_prompt: Optional[str] = Field( + default=None, + description="反向提示词,描述不希望出现的内容,如低质量、模糊、文字等。超过500字符将被截断。", + ) + size: Optional[str] = Field( + default=None, + description="输出图像的分辨率。默认值是1280*1280,可不填。", + ) + prompt_extend: Optional[bool] = Field( + default=None, + description="是否开启 Prompt 智能改写。将使用大模型优化正向提示词。true: 开启(默认),false:不开启。", + ) + n: int = Field( + default=1, + description="生成图片数量,取值范围1~4,默认为1。", + ) + style: Optional[str] = Field( + default=None, + description="艺术风格,如 'photography', 'anime', 'oil-painting' 等。", + ) + seed: Optional[int] = Field( + default=None, + description="随机种子,用于结果复现。", + ) + watermark: Optional[bool] = Field( + default=None, + description="是否添加阿里云水印,默认不添加。", + ) + ctx: Optional[Context] = Field( + default=None, + description="HTTP request context for " + "MCP internal use only, do not generate it.", + ) + + +class ImageGenerationWan26Output(BaseModel): + """ + Output schema for Wanx 2.6 text-to-image generation. + """ + + results: list[str] = Field( + title="Results", + description="生成的图片URL列表。", + ) + request_id: Optional[str] = Field( + default=None, + title="Request ID", + description="本次请求的唯一标识。", + ) + + +class ImageGenerationWan26( + Tool[ImageGenerationWan26Input, ImageGenerationWan26Output], +): + """ + Wanx 2.6 Text-to-Image Generation Tool. + Uses the 'wanx2.6-t2i' model from DashScope + to generate high-quality images from text. 
+ """ + + name: str = "modelstudio_wanx26_image_generation" + description: str = ( + "[版本: wan2.6] 通义万相文生图模型(wanx2.6-t2i)。AI绘画服务,根据文本描述生成高质量图像,并返回图片URL。\n" + "新功能包括图像编辑和图文混合输出,满足更多样化的生成与集成需求。\n" + "支持自定义分辨率:图像面积介于 768×768 至 1440×1440 像素之间," + "允许在该范围内自由调整宽高比(例如 768×2700)。\n" + ) + + @trace(trace_type=TraceType.AIGC, trace_name="wanx26_image_generation") + async def arun( + self, + args: ImageGenerationWan26Input, + **kwargs: Any, + ) -> ImageGenerationWan26Output: + trace_event = kwargs.pop("trace_event", None) + request_id = TracingUtil.get_request_id() + + try: + api_key = get_api_key(ApiNames.dashscope_api_key, **kwargs) + except AssertionError as e: + raise ValueError("Please set valid DASHSCOPE_API_KEY!") from e + + model_name = "wan2.6-t2i" + + messages = [ + { + "role": "user", + "content": [{"text": args.prompt}], + }, + ] + + # Normalize watermark + if args.watermark is not None: + if isinstance(args.watermark, str): + args.watermark = args.watermark.strip().lower() in ( + "true", + "1", + ) + else: + args.watermark = bool(args.watermark) + + parameters = {} + if args.negative_prompt: + parameters["negative_prompt"] = args.negative_prompt + if args.size and args.size != "1024*1024": + parameters["size"] = args.size + if args.n != 1: + parameters["n"] = args.n + if args.style: + parameters["style"] = args.style + if args.seed is not None: + parameters["seed"] = args.seed + if args.watermark is not None: + parameters["watermark"] = args.watermark + if args.prompt_extend is not None: + parameters["prompt_extend"] = args.prompt_extend + + try: + response = await AioMultiModalConversation.call( + api_key=api_key, + model=model_name, + messages=messages, + **parameters, + ) + except Exception as e: + raise RuntimeError( + f"Failed to call Wanx 2.6 image generation API: {str(e)}", + ) from e + + if response.status_code != 200 or not response.output: + raise RuntimeError(f"Wanx 2.6 image generation failed: {response}") + + results = [] + try: + if 
hasattr(response, "output") and response.output: + choices = getattr(response.output, "choices", []) + if choices: + message = getattr(choices[0], "message", {}) + content = getattr(message, "content", []) + if isinstance(content, list): + for item in content: + if isinstance(item, dict) and "image" in item: + results.append(item["image"]) + elif isinstance(content, str): + results.append(content) + elif isinstance(content, dict) and "image" in content: + results.append(content["image"]) + except Exception as e: + raise RuntimeError( + f"Failed to parse Wanx 2.6 API response: {str(e)}", + ) from e + + if not results: + raise RuntimeError(f"No image URLs found in response: {response}") + + if not request_id: + request_id = getattr(response, "request_id", None) or str( + uuid.uuid4(), + ) + + if trace_event: + trace_event.on_log( + "", + **{ + "step_suffix": "results", + "payload": { + "request_id": request_id, + "wanx26_image_generation_result": { + "status_code": response.status_code, + "results": results, + }, + }, + }, + ) + + return ImageGenerationWan26Output( + results=results, + request_id=request_id, + ) diff --git a/src/agentscope_runtime/tools/generations/image_style_repaint.py b/src/agentscope_runtime/tools/generations/image_style_repaint.py index 3feb438ae..0210464e0 100644 --- a/src/agentscope_runtime/tools/generations/image_style_repaint.py +++ b/src/agentscope_runtime/tools/generations/image_style_repaint.py @@ -74,7 +74,11 @@ class ImageStyleRepaint( """ name: str = "modelstudio_image_style_repaint" - description: str = "人像风格重绘服务,输入原始图像和风格数据(索引或参考图像),返回重绘后的图像。" + description: str = ( + "[模型: wanx-style-repaint-v1] 人像风格重绘服务。支持将人物照片转换为多种预设或自定义的艺术风格。\n" + "输入:一张人物图像 + 风格数据(可为风格索引或参考图像);输出:风格化后的人像图像URL。\n" + "适用于艺术创作、个性化头像、视觉设计等场景。\n" + ) def __init__(self, name: str = None, description: str = None): super().__init__(name=name, description=description) diff --git a/src/agentscope_runtime/tools/generations/qwen_image_edit.py 
# -*- coding: utf-8 -*-
# pylint:disable=abstract-method, deprecated-module, wrong-import-order
# pylint:disable=too-many-nested-blocks, too-many-branches, unused-import

import os
import uuid
from typing import Any, Optional, List

from dashscope import AioMultiModalConversation
from mcp.server.fastmcp import Context
from pydantic import BaseModel, Field

from ..base import Tool
from ..utils.api_key_util import get_api_key, ApiNames
from ...engine.tracing import trace, TracingUtil


class QwenImageEditInputNew(BaseModel):
    """
    Qwen Image Edit Input New(Supports single or multiple images for fusion)
    """

    # Publicly reachable HTTP(S) image URLs; see description for format limits.
    image_urls: list[str] = Field(
        ...,
        description="输入图像的URL地址列表,每个URL需为公网可访问地址,支持 HTTP 或 "
        "HTTPS 协议。格式:JPG、JPEG、PNG、BMP、TIFF、WEBP,分辨率[384,"
        "3072],大小不超过10MB。URL不能包含中文字符。",
    )
    prompt: str = Field(
        ...,
        description="正向提示词,用来描述生成图像中期望包含的元素和视觉特点, 超过800个字符自动截断",
    )
    negative_prompt: Optional[str] = Field(
        default=None,
        description="反向提示词,用来描述不希望在画面中看到的内容,可以对画面进行限制,超过500个字符自动截断",
    )
    watermark: Optional[bool] = Field(
        default=None,
        description="是否添加水印,默认不设置",
    )
    ctx: Optional[Context] = Field(
        default=None,
        description="HTTP request context containing headers for mcp only, "
        "don't generate it",
    )


class QwenImageEditOutputNew(BaseModel):
    """
    Qwen Image Edit Output New
    """

    results: list[str] = Field(
        title="Results",
        description="输出的融合后图片url列表,仅包含1个URL",
    )
    request_id: Optional[str] = Field(
        default=None,
        title="Request ID",
        description="请求ID",
    )


class QwenImageEditNew(Tool[QwenImageEditInputNew, QwenImageEditOutputNew]):
    """
    Qwen Image Edit Tool for AI-powered image editing.
    Supports single or multiple images for fusion.
    """

    name: str = "modelstudio_qwen_image_edit_new"
    description: str = (
        "通义千问-多图融合模型,基于 qwen-image-edit,支持将多张图像按提示词语义融合为一张新图。"
        "可用于风格混合、场景合成、元素组合等复杂图像生成任务。"
    )

    @trace(trace_type="AIGC", trace_name="qwen_image_edit_new")
    async def arun(
        self,
        args: QwenImageEditInputNew,
        **kwargs: Any,
    ) -> QwenImageEditOutputNew:
        """Qwen Image Edit using MultiModalConversation API

        This method uses DashScope's MultiModalConversation service to edit
        images based on text prompts. The API supports various image editing
        operations through natural language instructions.

        Args:
            args: QwenImageEditInputNew containing image_urls, prompt,
                watermark, and negative_prompt.
            **kwargs: Additional keyword arguments including request_id,
                trace_event, model_name, api_key.

        Returns:
            QwenImageEditOutputNew containing
            the edited image URL and request ID.

        Raises:
            ValueError: If DASHSCOPE_API_KEY is not set or invalid.
            RuntimeError: If the API call fails or returns an error.
        """
        trace_event = kwargs.pop("trace_event", None)
        request_id = TracingUtil.get_request_id()

        try:
            api_key = get_api_key(ApiNames.dashscope_api_key, **kwargs)
        except AssertionError as e:
            raise ValueError("Please set valid DASHSCOPE_API_KEY!") from e

        # Model name may be overridden per call or via environment variable.
        model_name = kwargs.get(
            "model_name",
            os.getenv("QWEN_IMAGE_EDIT_MODEL_NAME", "qwen-image-edit"),
        )

        # One image part per URL, followed by the text instruction —
        # the message shape expected by MultiModalConversation.
        content = [{"image": url} for url in args.image_urls]
        content.append({"text": args.prompt})

        messages = [
            {
                "role": "user",
                "content": content,
            },
        ]

        # Tolerate loosely-typed input: coerce watermark to a real bool
        # ("true"/"1" strings count as True).
        if args.watermark is not None:
            if isinstance(args.watermark, str):
                args.watermark = args.watermark.strip().lower() in (
                    "true",
                    "1",
                )
            else:
                args.watermark = bool(args.watermark)

        parameters = {}
        if args.negative_prompt:
            parameters["negative_prompt"] = args.negative_prompt
        if args.watermark is not None:
            parameters["watermark"] = args.watermark

        # Call the AioMultiModalConversation API asynchronously
        try:
            response = await AioMultiModalConversation.call(
                api_key=api_key,
                model=model_name,
                messages=messages,
                **parameters,
            )
        except Exception as e:
            raise RuntimeError(
                f"Failed to call Qwen Image Edit API: {str(e)}",
            ) from e

        # Check response status
        if response.status_code != 200 or not response.output:
            raise RuntimeError(f"Failed to generate: {response}")

        # Extract the edited image URLs from
        # output.choices[0].message.content; the SDK may return the content
        # as a list of parts, a dict, or a bare string.
        results = []
        try:
            if hasattr(response, "output") and response.output:
                choices = getattr(response.output, "choices", [])
                if choices:
                    message = getattr(choices[0], "message", {})
                    if hasattr(message, "content"):
                        content = message.content
                        if isinstance(content, list):
                            # Look for image content in the list
                            for item in content:
                                if isinstance(item, dict) and "image" in item:
                                    results.append(item["image"])
                        elif isinstance(content, str):
                            results.append(content)
                        elif isinstance(content, dict) and "image" in content:
                            results.append(content["image"])
        except Exception as e:
            raise RuntimeError(
                f"Failed to parse response from Qwen Image Edit API: {str(e)}",
            ) from e

        # Fix: this check used to sit inside the try-block above, so its
        # RuntimeError was swallowed by `except Exception` and re-raised as a
        # misleading "Failed to parse response" error. Keeping it outside
        # reports the real failure (matches the sibling generation tools).
        if not results:
            raise RuntimeError(
                f"Could not extract edited image URLs from response: "
                f"{response}",
            )

        # Fix: get_request_id() may yield None as well as ""; use the falsy
        # check so both fall back to the service id / a fresh UUID.
        if not request_id:
            request_id = getattr(response, "request_id", None) or str(
                uuid.uuid4(),
            )

        # Log trace event if provided
        if trace_event:
            trace_event.on_log(
                "",
                **{
                    "step_suffix": "results",
                    "payload": {
                        "request_id": request_id,
                        "qwen_image_edit_result": {
                            "status_code": response.status_code,
                            "results": results,
                        },
                    },
                },
            )

        return QwenImageEditOutputNew(
            results=results,
            request_id=request_id,
        )