Skip to content

Commit ccba3cc

Browse files
authored
refactor(tool): wrap media content with descriptive tags in ReadMediaFile (#744)
Signed-off-by: Richard Chien <[email protected]>
1 parent a8d65d9 commit ccba3cc

File tree

10 files changed

+75
-34
lines changed

10 files changed

+75
-34
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ Only write entries that are worth mentioning to users.
1212
## Unreleased
1313

1414
- Auth: Fix authentication issue during agent turns
15+
- Tool: Wrap media content with descriptive tags in `ReadMediaFile` for better path traceability
1516

1617
## 1.2 (2026-01-27)
1718

docs/en/release-notes/changelog.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ This page documents the changes in each Kimi Code CLI release.
55
## Unreleased
66

77
- Auth: Fix authentication issue during agent turns
8+
- Tool: Wrap media content with descriptive tags in `ReadMediaFile` for better path traceability
89

910
## 1.2 (2026-01-27)
1011

docs/zh/release-notes/changelog.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,8 @@
44

55
## 未发布
66

7-
- Auth: 修复 Agent 轮次期间的认证问题
7+
- Auth:修复 Agent 轮次期间的认证问题
8+
- Tool:为 `ReadMediaFile` 中的媒体内容添加描述性标签,提高路径可追溯性
89

910
## 1.2 (2026-01-27)
1011

src/kimi_cli/tools/file/read_media.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from kimi_cli.tools import SkipThisTool
1313
from kimi_cli.tools.file.utils import MEDIA_SNIFF_BYTES, FileType, detect_file_type
1414
from kimi_cli.tools.utils import load_desc_jinja
15+
from kimi_cli.utils.media_tags import wrap_media_part
1516
from kimi_cli.utils.path import is_within_directory
1617
from kimi_cli.wire.types import ImageURLPart, VideoURLPart
1718

@@ -86,7 +87,7 @@ async def _validate_path(self, path: KaosPath) -> ToolError | None:
8687
async def _read_media(self, path: KaosPath, file_type: FileType) -> ToolReturnValue:
8788
assert file_type.kind in ("image", "video")
8889

89-
media_id = str(path)
90+
media_path = str(path)
9091
stat = await path.stat()
9192
size = stat.st_size
9293
if size == 0:
@@ -107,7 +108,8 @@ async def _read_media(self, path: KaosPath, file_type: FileType) -> ToolReturnVa
107108
case "image":
108109
data = await path.read_bytes()
109110
data_url = _to_data_url(file_type.mime_type, data)
110-
part = ImageURLPart(image_url=ImageURLPart.ImageURL(url=data_url, id=media_id))
111+
part = ImageURLPart(image_url=ImageURLPart.ImageURL(url=data_url))
112+
wrapped = wrap_media_part(part, tag="image", attrs={"path": media_path})
111113
image_size = _extract_image_size(data)
112114
case "video":
113115
data = await path.read_bytes()
@@ -116,10 +118,11 @@ async def _read_media(self, path: KaosPath, file_type: FileType) -> ToolReturnVa
116118
data=data,
117119
mime_type=file_type.mime_type,
118120
)
119-
part.video_url.id = media_id
121+
wrapped = wrap_media_part(part, tag="video", attrs={"path": media_path})
120122
else:
121123
data_url = _to_data_url(file_type.mime_type, data)
122-
part = VideoURLPart(video_url=VideoURLPart.VideoURL(url=data_url, id=media_id))
124+
part = VideoURLPart(video_url=VideoURLPart.VideoURL(url=data_url))
125+
wrapped = wrap_media_part(part, tag="video", attrs={"path": media_path})
123126
image_size = None
124127

125128
size_hint = ""
@@ -132,7 +135,7 @@ async def _read_media(self, path: KaosPath, file_type: FileType) -> ToolReturnVa
132135
"before continuing."
133136
)
134137
return ToolOk(
135-
output=part,
138+
output=wrapped,
136139
message=(
137140
f"Loaded {file_type.kind} file `{path}` "
138141
f"({file_type.mime_type}, {size} bytes{size_hint}).{note}"

src/kimi_cli/ui/shell/debug.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -51,8 +51,7 @@ def _format_content_part(part: ContentPart) -> Text | Panel | Group:
5151

5252
case ImageURLPart(image_url=img):
5353
url_display = img.url[:80] + "..." if len(img.url) > 80 else img.url
54-
id_text = f" (id: {img.id})" if img.id else ""
55-
return Text(f"[Image{id_text}] {url_display}", style="blue")
54+
return Text(f"[Image] {url_display}", style="blue")
5655

5756
case AudioURLPart(audio_url=audio):
5857
url_display = audio.url[:80] + "..." if len(audio.url) > 80 else audio.url
@@ -61,8 +60,7 @@ def _format_content_part(part: ContentPart) -> Text | Panel | Group:
6160

6261
case VideoURLPart(video_url=video):
6362
url_display = video.url[:80] + "..." if len(video.url) > 80 else video.url
64-
id_text = f" (id: {video.id})" if video.id else ""
65-
return Text(f"[Video{id_text}] {url_display}", style="blue")
63+
return Text(f"[Video] {url_display}", style="blue")
6664

6765
case _:
6866
return Text(f"[Unknown content type: {type(part).__name__}]", style="red")

src/kimi_cli/ui/shell/prompt.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@
4747
from kimi_cli.ui.shell.console import console
4848
from kimi_cli.utils.clipboard import grab_image_from_clipboard, is_clipboard_available
4949
from kimi_cli.utils.logging import logger
50+
from kimi_cli.utils.media_tags import wrap_media_part
5051
from kimi_cli.utils.slashcmd import SlashCommand
5152
from kimi_cli.utils.string import random_string
5253
from kimi_cli.wire.types import ContentPart, ImageURLPart, TextPart
@@ -508,12 +509,11 @@ def _guess_image_mime(path: Path) -> str:
508509
return "image/png"
509510

510511

511-
def _build_image_part(image_bytes: bytes, image_id: str, mime_type: str) -> ImageURLPart:
512+
def _build_image_part(image_bytes: bytes, mime_type: str) -> ImageURLPart:
512513
image_base64 = base64.b64encode(image_bytes).decode("ascii")
513514
return ImageURLPart(
514515
image_url=ImageURLPart.ImageURL(
515516
url=f"data:{mime_type};base64,{image_base64}",
516-
id=image_id,
517517
)
518518
)
519519

@@ -607,16 +607,17 @@ def load_bytes(
607607
)
608608
return None
609609

610-
def load_content_part(
610+
def load_content_parts(
611611
self, kind: CachedAttachmentKind, attachment_id: str
612-
) -> ContentPart | None:
612+
) -> list[ContentPart] | None:
613613
if kind == "image":
614614
payload = self.load_bytes(kind, attachment_id)
615615
if payload is None:
616616
return None
617617
path, image_bytes = payload
618618
mime_type = _guess_image_mime(path)
619-
return _build_image_part(image_bytes, str(path), mime_type)
619+
part = _build_image_part(image_bytes, mime_type)
620+
return wrap_media_part(part, tag="image", attrs={"path": str(path)})
620621
return None
621622

622623

@@ -841,9 +842,9 @@ async def prompt(self) -> UserInput:
841842
attachment_kind = _parse_attachment_kind(match.group("type"))
842843
part = None
843844
if attachment_kind is not None:
844-
part = self._attachment_cache.load_content_part(attachment_kind, attachment_id)
845+
part = self._attachment_cache.load_content_parts(attachment_kind, attachment_id)
845846
if part is not None:
846-
content.append(part)
847+
content.extend(part)
847848
else:
848849
logger.warning(
849850
"Attachment placeholder found but no matching attachment part: {placeholder}",

src/kimi_cli/utils/media_tags.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
from __future__ import annotations
2+
3+
from collections.abc import Mapping
4+
from html import escape
5+
6+
from kimi_cli.wire.types import ContentPart, TextPart
7+
8+
9+
def _format_tag(tag: str, attrs: Mapping[str, str | None] | None = None) -> str:
10+
if not attrs:
11+
return f"<{tag}>"
12+
rendered: list[str] = []
13+
for key, value in sorted(attrs.items()):
14+
if not value:
15+
continue
16+
rendered.append(f'{key}="{escape(str(value), quote=True)}"')
17+
if not rendered:
18+
return f"<{tag}>"
19+
return f"<{tag} " + " ".join(rendered) + ">"
20+
21+
22+
def wrap_media_part(
23+
part: ContentPart, *, tag: str, attrs: Mapping[str, str | None] | None = None
24+
) -> list[ContentPart]:
25+
return [
26+
TextPart(text=_format_tag(tag, attrs)),
27+
part,
28+
TextPart(text=f"</{tag}>"),
29+
]

src/kimi_cli/utils/message.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,14 +13,12 @@ def message_stringify(message: Message) -> str:
1313
if isinstance(part, TextPart):
1414
parts.append(part.text)
1515
elif isinstance(part, ImageURLPart):
16-
suffix = f":{part.image_url.id}" if part.image_url.id else ""
17-
parts.append(f"[image{suffix}]")
16+
parts.append("[image]")
1817
elif isinstance(part, AudioURLPart):
1918
suffix = f":{part.audio_url.id}" if part.audio_url.id else ""
2019
parts.append(f"[audio{suffix}]")
2120
elif isinstance(part, VideoURLPart):
22-
suffix = f":{part.video_url.id}" if part.video_url.id else ""
23-
parts.append(f"[video{suffix}]")
21+
parts.append("[video]")
2422
else:
2523
parts.append(f"[{part.type}]")
2624
return "".join(parts)

tests/test_attachment_cache.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from PIL import Image
66

77
from kimi_cli.ui.shell.prompt import AttachmentCache, _parse_attachment_kind
8-
from kimi_cli.wire.types import ImageURLPart
8+
from kimi_cli.wire.types import ImageURLPart, TextPart
99

1010

1111
def _make_image() -> Image.Image:
@@ -21,12 +21,15 @@ def test_attachment_cache_roundtrip(tmp_path) -> None:
2121
assert cached.path.exists()
2222
assert cached.path.parent == tmp_path / "images"
2323

24-
part = cache.load_content_part("image", cached.attachment_id)
25-
assert isinstance(part, ImageURLPart)
26-
assert part.image_url.id == str(cached.path)
27-
assert part.image_url.url.startswith("data:image/png;base64,")
24+
parts = cache.load_content_parts("image", cached.attachment_id)
25+
assert parts is not None
26+
assert len(parts) == 3
27+
assert parts[0] == TextPart(text=f'<image path="{cached.path}">')
28+
assert isinstance(parts[1], ImageURLPart)
29+
assert parts[2] == TextPart(text="</image>")
30+
assert parts[1].image_url.url.startswith("data:image/png;base64,")
2831

29-
encoded = part.image_url.url.split(",", 1)[1]
32+
encoded = parts[1].image_url.url.split(",", 1)[1]
3033
assert base64.b64decode(encoded).startswith(b"\x89PNG")
3134

3235

tests/tools/test_read_media_file.py

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
from kimi_cli.llm import ModelCapability
1313
from kimi_cli.soul.agent import Runtime
1414
from kimi_cli.tools.file.read_media import Params, ReadMediaFile
15-
from kimi_cli.wire.types import ImageURLPart, VideoURLPart
15+
from kimi_cli.wire.types import ImageURLPart, TextPart, VideoURLPart
1616

1717

1818
async def test_read_image_file(read_media_file_tool: ReadMediaFile, temp_work_dir: KaosPath):
@@ -25,8 +25,10 @@ async def test_read_image_file(read_media_file_tool: ReadMediaFile, temp_work_di
2525

2626
assert not result.is_error
2727
assert isinstance(result.output, list)
28-
assert len(result.output) == 1
29-
part = result.output[0]
28+
assert len(result.output) == 3
29+
assert result.output[0] == TextPart(text=f'<image path="{image_file}">')
30+
assert result.output[2] == TextPart(text="</image>")
31+
part = result.output[1]
3032
assert isinstance(part, ImageURLPart)
3133
assert part.image_url.url.startswith("data:image/png;base64,")
3234
assert result.message == snapshot(
@@ -50,8 +52,10 @@ async def test_read_extensionless_image_file(
5052

5153
assert not result.is_error
5254
assert isinstance(result.output, list)
53-
assert len(result.output) == 1
54-
part = result.output[0]
55+
assert len(result.output) == 3
56+
assert result.output[0] == TextPart(text=f'<image path="{image_file}">')
57+
assert result.output[2] == TextPart(text="</image>")
58+
part = result.output[1]
5559
assert isinstance(part, ImageURLPart)
5660
assert part.image_url.url.startswith("data:image/png;base64,")
5761
assert result.message == snapshot(
@@ -97,8 +101,10 @@ async def test_read_video_file(read_media_file_tool: ReadMediaFile, temp_work_di
97101

98102
assert not result.is_error
99103
assert isinstance(result.output, list)
100-
assert len(result.output) == 1
101-
part = result.output[0]
104+
assert len(result.output) == 3
105+
assert result.output[0] == TextPart(text=f'<video path="{video_file}">')
106+
assert result.output[2] == TextPart(text="</video>")
107+
part = result.output[1]
102108
assert isinstance(part, VideoURLPart)
103109
assert part.video_url.url.startswith("data:video/mp4;base64,")
104110
assert result.message == snapshot(

0 commit comments

Comments
 (0)