Skip to content

Commit 84b956f

Browse files
committed
Add possibility to use multimodal messages in chat domain
1 parent 50a6708 commit 84b956f

File tree

8 files changed

+222
-12
lines changed

8 files changed

+222
-12
lines changed

examples/async/chat/example.png

23.4 KB
Loading

examples/async/chat/multimodal.py

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
#!/usr/bin/env python3
2+
3+
from __future__ import annotations
4+
5+
import asyncio
6+
import base64
7+
import pathlib
8+
9+
from yandex_cloud_ml_sdk import AsyncYCloudML
10+
11+
12+
def get_image_base64(image_path: pathlib.Path | None = None) -> str:
    """Return the base64-encoded contents of an image file as a str.

    :param image_path: path of the image to encode; when omitted, falls
        back to the bundled ``example.png`` next to this script, which
        preserves the original no-argument behavior.
    :returns: ASCII base64 text suitable for a ``data:`` URI payload.
    """
    if image_path is None:
        image_path = pathlib.Path(__file__).parent / 'example.png'
    image_data = image_path.read_bytes()
    # b64encode returns bytes; decode to str for embedding in an f-string URI
    return base64.b64encode(image_data).decode('utf-8')
17+
18+
19+
async def main() -> None:
    """Send a multimodal (text + image) chat request and print the reply."""
    sdk = AsyncYCloudML(folder_id='b1ghsjum2v37c2un8h64')
    sdk.setup_default_logging()

    # Currently this is the only model that supports image processing.
    model = sdk.chat.completions('gemma-3-27b-it')

    # A multimodal message mixes text and image parts inside one 'content' list.
    image_part = {
        'type': 'image_url',
        'image_url': {'url': f'data:image/png;base64,{get_image_base64()}'},
    }
    text_part = {
        'type': 'text', 'text': "What is depicted in the following image",
    }
    messages = [{'role': 'user', 'content': [text_part, image_part]}]

    result = await model.run(messages)
    print(result.text)


if __name__ == '__main__':
    asyncio.run(main())

examples/sync/chat/example.png

23.4 KB
Loading

examples/sync/chat/multimodal.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
#!/usr/bin/env python3
2+
3+
from __future__ import annotations
4+
5+
import base64
6+
import pathlib
7+
8+
from yandex_cloud_ml_sdk import YCloudML
9+
10+
11+
def get_image_base64(image_path: pathlib.Path | None = None) -> str:
    """Return the base64-encoded contents of an image file as a str.

    :param image_path: path of the image to encode; when omitted, falls
        back to the bundled ``example.png`` next to this script, which
        preserves the original no-argument behavior.
    :returns: ASCII base64 text suitable for a ``data:`` URI payload.
    """
    if image_path is None:
        image_path = pathlib.Path(__file__).parent / 'example.png'
    image_data = image_path.read_bytes()
    # b64encode returns bytes; decode to str for embedding in an f-string URI
    return base64.b64encode(image_data).decode('utf-8')
16+
17+
18+
def main() -> None:
    """Send a multimodal (text + image) chat request and print the reply."""
    sdk = YCloudML(folder_id='b1ghsjum2v37c2un8h64')
    sdk.setup_default_logging()

    # Currently this is the only model that supports image processing.
    model = sdk.chat.completions('gemma-3-27b-it')

    # A multimodal message mixes text and image parts inside one 'content' list.
    image_part = {
        'type': 'image_url',
        'image_url': {'url': f'data:image/png;base64,{get_image_base64()}'},
    }
    text_part = {
        'type': 'text', 'text': "What is depicted in the following image",
    }
    messages = [{'role': 'user', 'content': [text_part, image_part]}]

    result = model.run(messages)
    print(result.text)


if __name__ == '__main__':
    main()

src/yandex_cloud_ml_sdk/_chat/completions/message.py

Lines changed: 36 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from __future__ import annotations
22

3-
from collections.abc import Iterable
4-
from typing import TypedDict, Union, cast
3+
from collections.abc import Iterable, Sequence
4+
from typing import Literal, TypedDict, Union, cast
55

66
from typing_extensions import NotRequired, Required
77

@@ -20,7 +20,26 @@ class ChatFunctionResultMessageDict(TypedDict):
2020
content: Required[str]
2121

2222

23-
ChatCompletionsMessageType = Union[MessageType, ChatFunctionResultMessageDict, MessageInputType]
23+
class ImageUrlDict(TypedDict):
    """Payload of an ``image_url`` content part: where the image lives.

    In the bundled examples ``url`` holds a ``data:image/png;base64,...``
    URI; presumably plain http(s) URLs are accepted too — TODO confirm
    against the service API.
    """
    url: str
25+
26+
27+
class ImageUrlContent(TypedDict):
    """Image part of a multimodal message ``content`` list.

    ``type`` is the discriminator that distinguishes this part from a
    text part; the actual image location sits in ``image_url``.
    """
    type: Literal['image_url']
    image_url: ImageUrlDict
30+
31+
32+
class TextContent(TypedDict):
    """Text part of a multimodal message ``content`` list.

    ``type`` is the discriminator that distinguishes this part from an
    image part; the message text itself sits in ``text``.
    """
    type: Literal['text']
    text: str
35+
36+
37+
class MultimodalMessageDict(TypedDict):
38+
role: NotRequired[str]
39+
content: Sequence[ImageUrlDict | TextContent]
40+
41+
42+
ChatCompletionsMessageType = Union[MessageType, ChatFunctionResultMessageDict, MessageInputType, MultimodalMessageDict]
2443
ChatMessageInputType = Union[ChatCompletionsMessageType, Iterable[ChatCompletionsMessageType]]
2544

2645

@@ -43,41 +62,46 @@ def message_to_json(message: ChatCompletionsMessageType, tool_name_ids: dict[str
4362
"content": message.text,
4463
"role": message.role,
4564
}
65+
4666
if isinstance(message, dict):
47-
text = message.get('text') or message.get('content', '')
67+
role: str | None = message.get('role')
68+
content: Sequence | str | None = message.get('content') # type: ignore[assignment]
69+
if isinstance(content, Sequence):
70+
return {
71+
'role': role or 'user',
72+
'content': list(content),
73+
}
74+
75+
text: str | None = message.get('text') or content # type: ignore[assignment]
4876
assert isinstance(text, str)
4977

5078
if tool_call_id := message.get('tool_call_id'):
5179
assert isinstance(tool_call_id, str)
5280
message = cast(ChatFunctionResultMessageDict, message)
53-
role = message.get('role', 'tool')
5481
return {
55-
'role': role,
82+
'role': role or 'tool',
5683
'content': text,
5784
'tool_call_id': tool_call_id,
5885
}
5986

6087
if tool_calls := message.get('tool_calls'):
6188
tool_calls = cast(JsonObject, tool_calls)
62-
role = message.get('role', 'assistant')
6389
return {
6490
'tool_calls': tool_calls,
65-
'role': role,
91+
'role': role or 'assistant',
6692
}
6793

6894
if text:
6995
message = cast(TextMessageDict, message)
70-
role = message.get('role', 'user')
7196
return {
7297
'content': text,
73-
'role': role
98+
'role': role or 'user'
7499
}
75100

76101
if tool_results := message.get('tool_results'):
77102
assert isinstance(tool_results, list)
78103
message = cast(FunctionResultMessageDict, message)
79104

80-
role = message.get('role', 'tool')
81105
result: list[JsonObject] = []
82106
for tool_result in tool_results:
83107
tool_result = cast(ToolResultDictType, tool_result)
@@ -91,7 +115,7 @@ def message_to_json(message: ChatCompletionsMessageType, tool_name_ids: dict[str
91115
)
92116

93117
result.append({
94-
'role': role,
118+
'role': role or 'tool',
95119
'content': content,
96120
'tool_call_id': id_,
97121
})

tests/chat/cassettes/test_completions/test_multimodal.yaml

Lines changed: 56 additions & 0 deletions
Large diffs are not rendered by default.

tests/chat/example.png

23.4 KB
Loading

tests/chat/test_completions.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
from __future__ import annotations
22

3+
import base64
34
import json
5+
import pathlib
46
from typing import cast
57

68
import pytest
@@ -359,3 +361,30 @@ async def test_tool_choice(async_sdk: AsyncYCloudML, tool, schema) -> None:
359361
model = model.configure(tool_choice=None) # type: ignore[arg-type]
360362
result = await model.run(message)
361363
assert result.status.name == 'TOOL_CALLS'
364+
365+
366+
async def test_multimodal(async_sdk: AsyncYCloudML) -> None:
    """Smoke-test a text+image request against the chat completions domain."""
    model = async_sdk.chat.completions('gemma-3-27b-it')

    image_bytes = (pathlib.Path(__file__).parent / 'example.png').read_bytes()
    image = base64.b64encode(image_bytes).decode('utf-8')

    text_part = {
        'type': 'text', 'text': "What is depicted in the following image",
    }
    image_part = {
        'type': 'image_url',
        'image_url': {'url': f'data:image/png;base64,{image}'},
    }
    request = [{'role': 'user', 'content': [text_part, image_part]}]

    result = await model.run(request)
    assert 'bricks' in result.text

0 commit comments

Comments
 (0)