Skip to content

Commit 928b957

Browse files
committed
feat(tool): Add video multiModal tools
- Add dashscope_text_to_video, dashscope_image_to_video, dashscope_first_and_last_frame_image_to_video and dashscope_video_to_text tools. - Add multiModal tool example. - Update tool.md doc.
1 parent aa62e7c commit 928b957

File tree

7 files changed

+2149
-75
lines changed

7 files changed

+2149
-75
lines changed

agentscope-core/src/main/java/io/agentscope/core/tool/multimodal/DashScopeMultiModalTool.java

Lines changed: 668 additions & 3 deletions
Large diffs are not rendered by default.

agentscope-core/src/test/java/io/agentscope/core/tool/multimodal/DashScopeMultiModalToolE2ETest.java

Lines changed: 330 additions & 18 deletions
Large diffs are not rendered by default.

agentscope-core/src/test/java/io/agentscope/core/tool/multimodal/DashScopeMultiModalToolTest.java

Lines changed: 942 additions & 54 deletions
Large diffs are not rendered by default.
2.83 MB
Binary file not shown.
Lines changed: 197 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,197 @@
1+
/*
2+
* Copyright 2024-2026 the original author or authors.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
package io.agentscope.examples.quickstart;
17+
18+
import io.agentscope.core.ReActAgent;
19+
import io.agentscope.core.formatter.dashscope.DashScopeChatFormatter;
20+
import io.agentscope.core.hook.Hook;
21+
import io.agentscope.core.hook.HookEvent;
22+
import io.agentscope.core.hook.PostActingEvent;
23+
import io.agentscope.core.hook.PreActingEvent;
24+
import io.agentscope.core.memory.InMemoryMemory;
25+
import io.agentscope.core.message.AudioBlock;
26+
import io.agentscope.core.message.Base64Source;
27+
import io.agentscope.core.message.ContentBlock;
28+
import io.agentscope.core.message.ImageBlock;
29+
import io.agentscope.core.message.Source;
30+
import io.agentscope.core.message.TextBlock;
31+
import io.agentscope.core.message.ToolResultBlock;
32+
import io.agentscope.core.message.URLSource;
33+
import io.agentscope.core.message.VideoBlock;
34+
import io.agentscope.core.model.DashScopeChatModel;
35+
import io.agentscope.core.tool.Toolkit;
36+
import io.agentscope.core.tool.multimodal.DashScopeMultiModalTool;
37+
import java.util.List;
38+
import reactor.core.publisher.Mono;
39+
40+
/**
41+
* MultiModalToolExample - Demonstrates how to equip an Agent with DashScope multimodal (image, audio and video) tools.
42+
*/
43+
public class MultiModalToolExample {
44+
45+
/**
 * Entry point of the example.
 *
 * <p>Steps, in order: print a welcome banner, obtain a DashScope API key through
 * {@code ExampleUtils.getDashScopeApiKey()}, register a {@code DashScopeMultiModalTool}
 * in a fresh {@code Toolkit}, build a {@code ReActAgent} on top of a
 * {@code DashScopeChatModel}, print sample prompts, and hand the agent to
 * {@code ExampleUtils.startChat(agent)}.
 *
 * @param args command-line arguments; not read by this method
 * @throws Exception declared by the signature; which call raises it is not visible here
 */
public static void main(String[] args) throws Exception {
46+
// Print welcome message (title + two-line description)
47+
ExampleUtils.printWelcome(
48+
"MultiModal Tool Calling Example",
49+
"This example demonstrates how to equip an Agent with multimodal tools.\n"
50+
+ "The agent has image, audio and video multimodal tools.");
51+
52+
// Get API key; the same key is passed to both the tool and the chat model below
53+
String apiKey = ExampleUtils.getDashScopeApiKey();
54+
55+
// Create and register tools, then print the (hard-coded) list of tool names
56+
Toolkit toolkit = new Toolkit();
57+
toolkit.registerTool(new DashScopeMultiModalTool(apiKey));
58+
printRegisterTools();
59+
60+
// Create Agent with tools: system prompt, chat model, logging hook, toolkit and in-memory memory
61+
ReActAgent agent =
62+
ReActAgent.builder()
63+
.name("MultiModalToolAgent")
64+
.sysPrompt(
65+
"You are a helpful assistant with access to multimodal"
66+
+ " tools. Use tools when needed to answer questions"
67+
+ " accurately. Always explain what you're doing when using"
68+
+ " tools.")
69+
.model(
70+
DashScopeChatModel.builder()
71+
.apiKey(apiKey)
72+
.modelName("qwen-plus")
73+
.stream(true)
74+
.enableThinking(false)
75+
.formatter(new DashScopeChatFormatter())
76+
.build())
77+
.hook(new ToolCallHook())
78+
.toolkit(toolkit)
79+
.memory(new InMemoryMemory())
80+
.build();
81+
82+
// Show one sample prompt per tool before the chat starts
83+
printExamplePrompts();
84+
// Hand the configured agent to the shared chat helper
85+
ExampleUtils.startChat(agent);
86+
}
86+
87+
/**
 * Prints the names and one-line descriptions of the multimodal tools to standard output.
 *
 * <p>The list is a literal text block, not something read from a {@code Toolkit}, so it has
 * to be kept in sync by hand with the tools that are actually registered.
 */
private static void printRegisterTools() {
88+
// Static description of the eight dashscope_* tools (image, audio and video).
String registeredTools =
89+
"""
90+
Registered tools:
91+
- dashscope_text_to_image: Generate image(s) based on the given text.
92+
- dashscope_image_to_text: Generate text based on the given images.
93+
- dashscope_text_to_audio: Convert the given text to audio.
94+
- dashscope_audio_to_text: Convert the given audio to text.
95+
- dashscope_text_to_video: Generate video based on the given text prompt.
96+
- dashscope_image_to_video: Generate a video from a single input image and an optional text prompt.
97+
- dashscope_first_and_last_frame_image_to_video: Generate video transitioning from a first frame to a last frame and an optional text prompt.
98+
- dashscope_video_to_text: Analyze video and generate a text description or answer questions based on the video content.
99+
""";
100+
101+
System.out.println(registeredTools);
102+
// println of "\n" emits the newline in the string plus the line terminator: visual spacing only
System.out.println("\n");
103+
}
104+
105+
/**
 * Prints one sample user prompt for each of the eight dashscope_* tools to standard output.
 *
 * <p>Each entry is a bracketed tool name followed by a prompt; the prompts that need media
 * input embed a sample URL. The text is a literal and is printed verbatim.
 */
private static void printExamplePrompts() {
106+
// Static catalogue of prompts: "[tool name]:" followed by the sentence to type.
String examplePrompts =
107+
"""
108+
Example Prompts:
109+
[dashscope_text_to_image]:
110+
Generate a black dog image url.
111+
[dashscope_image_to_text]:
112+
Describe the image url of 'https://dashscope.oss-cn-beijing.aliyuncs.com/images/tiger.png'.
113+
[dashscope_text_to_audio]:
114+
Convert the texts of 'hello, qwen!' to audio url.
115+
[dashscope_audio_to_text]:
116+
Convert the audio url of 'https://dashscope.oss-cn-beijing.aliyuncs.com/samples/audio/paraformer/hello_world_male2.wav' to text.
117+
[dashscope_text_to_video]:
118+
Generate a smart cat is running in the moonlight video.
119+
[dashscope_image_to_video]:
120+
Generate a video that a tiger is running in moonlight based on the image url of 'https://dashscope.oss-cn-beijing.aliyuncs.com/images/tiger.png'.
121+
[dashscope_first_and_last_frame_image_to_video]:
122+
Generate a video that a black kitten curiously looking at the sky based on the first frame image url of 'https://wanx.alicdn.com/material/20250318/first_frame.png' and the last frame image url of 'https://wanx.alicdn.com/material/20250318/last_frame.png'.
123+
[dashscope_video_to_text]:
124+
Describe the video url of 'https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241115/cqqkru/1.mp4'.
125+
""";
126+
System.out.println(examplePrompts);
127+
// println of "\n" emits the newline in the string plus the line terminator: visual spacing only
System.out.println("\n");
128+
}
129+
130+
/**
 * Hook that logs tool activity to standard output.
 *
 * <p>For a {@code PreActingEvent} it prints the tool name and input. For a
 * {@code PostActingEvent} it prints every image, audio, video or text content block of the
 * tool result. Events of any other type are passed through without output.
 */
static class ToolCallHook implements Hook {
131+
132+
/**
 * Logs the event if it is a pre- or post-acting event and returns it unchanged.
 *
 * @param event the hook event delivered by the agent
 * @return a {@code Mono} that emits the same {@code event} instance
 */
@Override
133+
public <T extends HookEvent> Mono<T> onEvent(T event) {
134+
// Before the tool runs: print which tool is being called and with what input.
if (event instanceof PreActingEvent preActing) {
135+
System.out.println(
136+
"\n[HOOK] PreActingEvent - Tool: "
137+
+ preActing.getToolUse().getName()
138+
+ ", Input: "
139+
+ preActing.getToolUse().getInput());
140+
141+
} else if (event instanceof PostActingEvent postActingEvent) {
142+
// NOTE(review): toolResult is dereferenced without a null check - confirm
// getToolResult() never returns null.
ToolResultBlock toolResult = postActingEvent.getToolResult();
143+
List<ContentBlock> contentBlocks = toolResult.getOutput();
144+
// A null or empty output prints nothing at all, not even the trailing blank lines.
if (contentBlocks != null && !contentBlocks.isEmpty()) {
145+
// Each block gets its own "[HOOK] PostActingEvent - Tool Result" header.
// Block types other than Image/Audio/Video/Text are skipped silently, as are
// media sources that are neither URLSource nor Base64Source.
for (ContentBlock cb : contentBlocks) {
146+
if (cb instanceof ImageBlock ib) {
147+
Source source = ib.getSource();
148+
if (source instanceof URLSource urlSource) {
149+
System.out.println(
150+
"\n[HOOK] PostActingEvent - Tool Result: \nImage URL: "
151+
+ urlSource.getUrl());
152+
} else if (source instanceof Base64Source base64Source) {
153+
// Prints the complete Base64 payload (presumably large for real media).
System.out.println(
154+
"\n"
155+
+ "[HOOK] PostActingEvent - Tool Result: \n"
156+
+ "Image Base64 data: "
157+
+ base64Source.getData());
158+
}
159+
} else if (cb instanceof AudioBlock ab) {
160+
Source source = ab.getSource();
161+
if (source instanceof URLSource urlSource) {
162+
System.out.println(
163+
"\n[HOOK] PostActingEvent - Tool Result: \nAudio URL: "
164+
+ urlSource.getUrl());
165+
} else if (source instanceof Base64Source base64Source) {
166+
System.out.println(
167+
"\n"
168+
+ "[HOOK] PostActingEvent - Tool Result: \n"
169+
+ "Audio Base64 data: "
170+
+ base64Source.getData());
171+
}
172+
} else if (cb instanceof VideoBlock vb) {
173+
Source source = vb.getSource();
174+
if (source instanceof URLSource urlSource) {
175+
System.out.println(
176+
"\n[HOOK] PostActingEvent - Tool Result: \nVideo URL: "
177+
+ urlSource.getUrl());
178+
} else if (source instanceof Base64Source base64Source) {
179+
System.out.println(
180+
"\n"
181+
+ "[HOOK] PostActingEvent - Tool Result: \n"
182+
+ "Video Base64 data: "
183+
+ base64Source.getData());
184+
}
185+
} else if (cb instanceof TextBlock tb) {
186+
System.out.println(
187+
"\n[HOOK] PostActingEvent - Tool Result: \nText: "
188+
+ tb.getText());
189+
}
190+
}
191+
// Blank-line separator after all blocks of one tool result.
System.out.println("\n");
192+
}
193+
}
194+
// The event is never modified; it is handed back as-is.
return Mono.just(event);
195+
}
196+
}
197+
}

docs/en/task/tool.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -370,3 +370,9 @@ boolean isExternal = toolkit.isExternalTool("query_database"); // true
370370
```
371371

372372
The call flow is the same as Tool Suspend: LLM calls → returns `TOOL_SUSPENDED` → external execution → provide result to resume.
373+
374+
## Complete Examples
375+
376+
- **Tool Call Example**: [ToolCallingExample.java](https://github.com/agentscope-ai/agentscope-java/blob/main/agentscope-examples/quickstart/src/main/java/io/agentscope/examples/quickstart/ToolCallingExample.java)
377+
- **Tool Group Example**: [ToolGroupExample.java](https://github.com/agentscope-ai/agentscope-java/blob/main/agentscope-examples/quickstart/src/main/java/io/agentscope/examples/quickstart/ToolGroupExample.java)
378+
- **MultiModal Tool Example**: [MultiModalToolExample.java](https://github.com/agentscope-ai/agentscope-java/blob/main/agentscope-examples/quickstart/src/main/java/io/agentscope/examples/quickstart/MultiModalToolExample.java)

docs/zh/task/tool.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -370,3 +370,9 @@ boolean isExternal = toolkit.isExternalTool("query_database"); // true
370370
```
371371

372372
调用流程与工具挂起相同:LLM 调用 → 返回 `TOOL_SUSPENDED` → 外部执行 → 提供结果恢复。
373+
374+
## 完整示例
375+
376+
- **工具调用示例**: [ToolCallingExample.java](https://github.com/agentscope-ai/agentscope-java/blob/main/agentscope-examples/quickstart/src/main/java/io/agentscope/examples/quickstart/ToolCallingExample.java)
377+
- **工具组示例**: [ToolGroupExample.java](https://github.com/agentscope-ai/agentscope-java/blob/main/agentscope-examples/quickstart/src/main/java/io/agentscope/examples/quickstart/ToolGroupExample.java)
378+
- **多模态工具示例**: [MultiModalToolExample.java](https://github.com/agentscope-ai/agentscope-java/blob/main/agentscope-examples/quickstart/src/main/java/io/agentscope/examples/quickstart/MultiModalToolExample.java)

0 commit comments

Comments
 (0)