یکپارچه‌سازی چندرسانه‌ای

برنامه‌های چندرسانه‌ای در هوش مصنوعی اهمیت فزاینده‌ای پیدا کرده‌اند و امکان تعاملات غنی‌تر و انجام وظایف پیچیده‌تر را فراهم می‌کنند. پروتکل زمینه مدل (MCP) چارچوبی برای ساخت برنامه‌های چندرسانه‌ای ارائه می‌دهد که می‌توانند انواع مختلف داده‌ها مانند متن، تصویر و صدا را مدیریت کنند.

MCP نه تنها از تعاملات مبتنی بر متن پشتیبانی می‌کند، بلکه قابلیت‌های چندرسانه‌ای را نیز فراهم می‌آورد و به مدل‌ها اجازه می‌دهد با تصاویر، صدا و سایر انواع داده‌ها کار کنند.

مقدمه

در این درس، یاد می‌گیرید چگونه یک برنامه چندرسانه‌ای بسازید.

اهداف یادگیری

تا پایان این درس، قادر خواهید بود:

گزینه‌های چندرسانه‌ای را درک کنید.
یک برنامه چندرسانه‌ای پیاده‌سازی کنید.

معماری پشتیبانی چندرسانه‌ای

پیاده‌سازی‌های MCP چندرسانه‌ای معمولاً شامل موارد زیر است:

تجزیه‌کننده‌های مخصوص هر رسانه: اجزایی که انواع مختلف رسانه را به فرمت‌هایی تبدیل می‌کنند که مدل قادر به پردازش آن‌ها باشد.
ابزارهای مخصوص هر رسانه: ابزارهای ویژه‌ای که برای مدیریت رسانه‌های خاص طراحی شده‌اند (تحلیل تصویر، پردازش صدا)
مدیریت یکپارچه زمینه: سیستمی برای حفظ زمینه در میان رسانه‌های مختلف
تولید پاسخ: توانایی تولید پاسخ‌هایی که ممکن است شامل چندین رسانه باشند.

مثال چندرسانه‌ای: تحلیل تصویر

در مثال زیر، تصویری را تحلیل کرده و اطلاعاتی استخراج خواهیم کرد.

پیاده‌سازی C#

using ModelContextProtocol.SDK.Server;
using ModelContextProtocol.SDK.Server.Tools;
using ModelContextProtocol.SDK.Server.Content;
using System.Text.Json;
using System.IO;
using System.Threading.Tasks;
using System.Collections.Generic;

namespace MultiModalMcpExample
{
    // Tool for image analysis
    /// <summary>
    /// MCP tool that downloads an image from a URL and runs one of several analyses on it
    /// through the injected <see cref="IImageAnalysisService"/>.
    /// </summary>
    public class ImageAnalysisTool : ITool
    {
        // One client for the whole process: creating an HttpClient per request
        // can exhaust sockets under load. Fully qualified because the file does
        // not import System.Net.Http.
        private static readonly System.Net.Http.HttpClient s_httpClient = new System.Net.Http.HttpClient();

        private readonly IImageAnalysisService _imageService;

        /// <summary>Creates the tool around the service that performs the actual analysis.</summary>
        /// <param name="imageService">Analysis backend; must not be null.</param>
        /// <exception cref="System.ArgumentNullException"><paramref name="imageService"/> is null.</exception>
        public ImageAnalysisTool(IImageAnalysisService imageService)
        {
            _imageService = imageService ?? throw new System.ArgumentNullException(nameof(imageService));
        }

        /// <summary>Tool name exposed in the tool definition.</summary>
        public string Name => "imageAnalysis";

        /// <summary>Human-readable description exposed in the tool definition.</summary>
        public string Description => "Analyzes image content and extracts information";

        /// <summary>
        /// Describes the tool's parameters: a required <c>imageUrl</c> and an optional
        /// <c>analysisType</c> that defaults to <c>general</c>.
        /// </summary>
        public ToolDefinition GetDefinition()
        {
            return new ToolDefinition
            {
                Name = Name,
                Description = Description,
                Parameters = new Dictionary<string, ParameterDefinition>
                {
                    ["imageUrl"] = new ParameterDefinition
                    {
                        Type = ParameterType.String,
                        Description = "URL to the image to analyze"
                    },
                    ["analysisType"] = new ParameterDefinition
                    {
                        Type = ParameterType.String,
                        Description = "Type of analysis to perform",
                        Enum = new[] { "general", "objects", "text", "faces" },
                        Default = "general"
                    }
                },
                Required = new[] { "imageUrl" }
            };
        }

        /// <summary>
        /// Downloads the image referenced by <c>imageUrl</c>, runs the requested analysis and
        /// returns the JSON-serialized result as a single text content item.
        /// </summary>
        /// <param name="parameters">Tool arguments; <c>imageUrl</c> is required, <c>analysisType</c> is optional.</param>
        /// <returns>
        /// A response with <c>IsError = false</c> and the serialized analysis, or a response with
        /// <c>IsError = true</c> and an explanatory message when <c>imageUrl</c> is missing or is
        /// not an absolute http/https URL.
        /// </returns>
        public async Task<ToolResponse> ExecuteAsync(IDictionary<string, object> parameters)
        {
            // Report bad input as a tool error instead of leaking KeyNotFoundException /
            // NullReferenceException to the caller.
            if (parameters == null
                || !parameters.TryGetValue("imageUrl", out var rawUrl)
                || string.IsNullOrWhiteSpace(rawUrl?.ToString()))
            {
                return CreateErrorResponse("The 'imageUrl' parameter is required.");
            }

            string imageUrl = rawUrl.ToString();

            // The URL comes from the caller, so only fetch absolute http(s) addresses.
            if (!System.Uri.TryCreate(imageUrl, System.UriKind.Absolute, out var imageUri)
                || (imageUri.Scheme != System.Uri.UriSchemeHttp && imageUri.Scheme != System.Uri.UriSchemeHttps))
            {
                return CreateErrorResponse("The 'imageUrl' parameter must be an absolute http or https URL.");
            }

            // Single lookup; a missing or null value falls back to the documented default.
            string analysisType = parameters.TryGetValue("analysisType", out var rawType)
                ? rawType?.ToString() ?? "general"
                : "general";

            // Download or access the image
            byte[] imageData = await DownloadImageAsync(imageUrl);

            // Analyze based on the requested analysis type
            var analysisResult = analysisType switch
            {
                "objects" => await _imageService.DetectObjectsAsync(imageData),
                "text" => await _imageService.RecognizeTextAsync(imageData),
                "faces" => await _imageService.DetectFacesAsync(imageData),
                _ => await _imageService.AnalyzeGeneralAsync(imageData) // Default general analysis
            };

            // Return structured result as a ToolResponse
            // Format follows the MCP specification for content structure
            var content = new List<ContentItem>
            {
                new ContentItem
                {
                    Type = ContentType.Text,
                    Text = JsonSerializer.Serialize(analysisResult)
                }
            };

            return new ToolResponse
            {
                Content = content,
                IsError = false
            };
        }

        /// <summary>Fetches the raw bytes at <paramref name="url"/> using the shared HTTP client.</summary>
        private async Task<byte[]> DownloadImageAsync(string url)
        {
            return await s_httpClient.GetByteArrayAsync(url);
        }

        /// <summary>Builds a response flagged as an error that carries <paramref name="message"/> as text.</summary>
        private static ToolResponse CreateErrorResponse(string message)
        {
            return new ToolResponse
            {
                Content = new List<ContentItem>
                {
                    new ContentItem
                    {
                        Type = ContentType.Text,
                        Text = message
                    }
                },
                IsError = true
            };
        }
    }
    
    // Multi-modal MCP server with image and text processing
    public class MultiModalMcpServer
    {
        /// <summary>
        /// Builds the sample server, prepares multi-modal options and registers the tools.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        public static async Task Main(string[] args)
        {
            // Create an MCP server
            var server = new McpServer(
                name: "Multi-Modal MCP Server",
                version: "1.0.0"
            );

            // Configure server for multi-modal support
            // NOTE(review): these options are built but never handed to the server in this
            // sample; pass them through whatever configuration entry point the SDK offers.
            var serverOptions = new McpServerOptions
            {
                MaxRequestSize = 10 * 1024 * 1024, // 10MB for larger payloads like images
                SupportedContentTypes = new[]
                {
                    "image/jpeg",
                    "image/png",
                    "text/plain",
                    "application/json"
                }
            };

            // Create image analysis service
            var imageService = new ComputerVisionService();

            // Register image analysis tools
            server.AddTool(new ImageAnalysisTool(imageService));

            // Register the remaining tools on the same server instance. The sample previously
            // called an undefined `services` collection and registered ImageAnalysisTool twice.
            // Assumes both tools have parameterless constructors - TODO confirm.
            server.AddTool(new TextAnalysisTool());
            server.AddTool(new DocumentGenerationTool()); // Tool that can generate documents with text and images

            // NOTE(review): nothing here starts the server; add the SDK's start/run call
            // (awaited) before relying on this entry point.
        }
    }
}

در مثال بالا، ما:

یک ImageAnalysisTool ایجاد کردیم که می‌تواند تصاویر را با استفاده از سرویس فرضی IImageAnalysisService تحلیل کند.
سرور MCP را برای مدیریت درخواست‌های بزرگ‌تر و پشتیبانی از نوع محتوای تصویر پیکربندی کردیم.
ابزار تحلیل تصویر را در سرور ثبت کردیم.
متدی برای دانلود تصاویر از یک URL و تحلیل آن‌ها بر اساس نوع درخواست شده (اشیاء، متن، چهره‌ها و غیره) پیاده‌سازی کردیم.
نتایج ساختاریافته را در قالبی مطابق با مشخصات MCP بازگرداندیم.

مثال چندرسانه‌ای: پردازش صدا

پردازش صدا یکی دیگر از رسانه‌های رایج در برنامه‌های چندرسانه‌ای است. در ادامه نمونه‌ای از نحوه پیاده‌سازی ابزار رونویسی صدا که می‌تواند فایل‌های صوتی را پردازش کرده و رونویسی‌ها را بازگرداند، آورده شده است.

پیاده‌سازی Java

package com.example.mcp.multimodal;

import com.mcp.server.McpServer;
import com.mcp.tools.Tool;
import com.mcp.tools.ToolRequest;
import com.mcp.tools.ToolResponse;
import com.mcp.tools.ToolExecutionException;
import com.example.audio.AudioProcessor;

import java.util.Base64;
import java.util.HashMap;
import java.util.Map;

// Audio transcription tool
/**
 * MCP tool that turns speech audio into text.
 *
 * <p>The audio can be supplied either as a URL ({@code audioUrl}) or inline as
 * Base64 ({@code audioData}); transcription itself is delegated to the injected
 * {@link AudioProcessor}.
 */
public class AudioTranscriptionTool implements Tool {
    private final AudioProcessor audioProcessor;

    /**
     * @param audioProcessor backend that performs the transcription
     */
    public AudioTranscriptionTool(AudioProcessor audioProcessor) {
        this.audioProcessor = audioProcessor;
    }

    @Override
    public String getName() {
        return "audioTranscription";
    }

    @Override
    public String getDescription() {
        return "Transcribes speech from audio files to text";
    }

    /**
     * Builds the JSON-schema-style description of the tool's parameters.
     *
     * @return a map with {@code type} and {@code properties} entries
     */
    @Override
    public Object getSchema() {
        Map<String, Object> schema = new HashMap<>();
        schema.put("type", "object");

        Map<String, Object> properties = new HashMap<>();

        Map<String, Object> audioUrl = new HashMap<>();
        audioUrl.put("type", "string");
        audioUrl.put("description", "URL to the audio file to transcribe");

        Map<String, Object> audioData = new HashMap<>();
        audioData.put("type", "string");
        audioData.put("description", "Base64-encoded audio data (alternative to URL)");

        Map<String, Object> language = new HashMap<>();
        language.put("type", "string");
        language.put("description", "Language code (e.g., 'en-US', 'es-ES')");
        language.put("default", "en-US");

        properties.put("audioUrl", audioUrl);
        properties.put("audioData", audioData);
        properties.put("language", language);

        schema.put("properties", properties);

        // Deliberately no "required" list: audioUrl and audioData are alternatives,
        // so requiring audioUrl would rule out the inline-data path that execute()
        // supports. execute() rejects requests that carry neither.

        return schema;
    }

    /**
     * Resolves the audio bytes from the request and transcribes them.
     *
     * @param request tool request; must carry {@code audioUrl} or {@code audioData},
     *                and may carry {@code language} (defaults to {@code en-US})
     * @return a response whose result is the map returned by the audio processor
     * @throws ToolExecutionException if neither audio source is present, or if
     *                                downloading, decoding or transcription fails
     */
    @Override
    public ToolResponse execute(ToolRequest request) {
        try {
            byte[] audioData;
            String language = request.getParameters().has("language") ?
                request.getParameters().get("language").asText() : "en-US";

            // Get audio either from URL or direct data; the URL wins when both are sent
            if (request.getParameters().has("audioUrl")) {
                String audioUrl = request.getParameters().get("audioUrl").asText();
                audioData = downloadAudio(audioUrl);
            } else if (request.getParameters().has("audioData")) {
                String base64Audio = request.getParameters().get("audioData").asText();
                audioData = Base64.getDecoder().decode(base64Audio);
            } else {
                throw new ToolExecutionException("Either audioUrl or audioData must be provided");
            }

            // Process audio and transcribe
            Map<String, Object> transcriptionResult = audioProcessor.transcribe(audioData, language);

            // Return transcription result
            return new ToolResponse.Builder()
                .setResult(transcriptionResult)
                .build();
        } catch (ToolExecutionException ex) {
            // Already a tool-level error with its own message; do not wrap it a second time.
            throw ex;
        } catch (Exception ex) {
            throw new ToolExecutionException("Audio transcription failed: " + ex.getMessage(), ex);
        }
    }

    /**
     * Reads the complete body found at {@code url}.
     *
     * <p>Types are fully qualified so the file's import list stays untouched.
     * The download has no timeout or size cap; add both before exposing this
     * to untrusted callers.
     *
     * @param url absolute URL of the audio file
     * @return the downloaded bytes
     * @throws java.io.IOException if the URL is malformed or the stream cannot be read
     */
    private byte[] downloadAudio(String url) throws java.io.IOException {
        try (java.io.InputStream in = java.net.URI.create(url).toURL().openStream()) {
            return in.readAllBytes();
        }
    }
}

// Main application with audio and other modalities
/**
 * Entry point: creates the processors, builds the MCP server, registers the
 * audio, image and video tools, and starts the server.
 */
public class MultiModalApplication {
    public static void main(String[] args) {
        // Processors that back the audio and image tools
        AudioProcessor speechProcessor = new AudioProcessor();
        ImageProcessor visionProcessor = new ImageProcessor();

        // 20MB for audio/video content
        final int maxRequestBytes = 20 * 1024 * 1024;

        // Build the server in one fluent chain
        McpServer mcpServer = new McpServer.Builder()
            .setName("Multi-Modal MCP Server")
            .setVersion("1.0.0")
            .setPort(5000)
            .setMaxRequestSize(maxRequestBytes)
            .build();

        // Register one tool per modality, creating each right before it is registered
        AudioTranscriptionTool transcriptionTool = new AudioTranscriptionTool(speechProcessor);
        mcpServer.registerTool(transcriptionTool);

        ImageAnalysisTool imageTool = new ImageAnalysisTool(visionProcessor);
        mcpServer.registerTool(imageTool);

        VideoProcessingTool videoTool = new VideoProcessingTool();
        mcpServer.registerTool(videoTool);

        // Start the server, then report it on stdout
        mcpServer.start();
        System.out.println("Multi-Modal MCP Server started on port 5000");
    }
}

در مثال بالا، ما:

یک AudioTranscriptionTool ایجاد کردیم که می‌تواند فایل‌های صوتی را رونویسی کند.
ساختار ابزار را طوری تعریف کردیم که یا URL یا داده‌های صوتی کدگذاری شده به صورت base64 را بپذیرد.
متد execute را برای پردازش صدا و رونویسی پیاده‌سازی کردیم.
سرور MCP را برای مدیریت درخواست‌های چندرسانه‌ای، شامل پردازش صدا و تصویر، پیکربندی کردیم.
ابزار رونویسی صدا را در سرور ثبت کردیم.
متدی برای دانلود فایل‌های صوتی از URL یا رمزگشایی داده‌های صوتی base64 پیاده‌سازی کردیم.
از سرویس AudioProcessor برای انجام منطق واقعی رونویسی استفاده کردیم.
سرور MCP را برای دریافت درخواست‌ها راه‌اندازی کردیم.

مثال چندرسانه‌ای: تولید پاسخ چندرسانه‌ای

پیاده‌سازی Python

from mcp_server import McpServer
from mcp_tools import Tool, ToolRequest, ToolResponse, ToolExecutionException
import base64
from PIL import Image
import io
import requests
import json
from typing import Dict, Any, List, Optional

# Image generation tool
class ImageGenerationTool(Tool):
    """MCP tool that produces a PNG image for a text prompt.

    The image is returned Base64-encoded together with the parameters used
    to generate it.
    """

    def get_name(self):
        """Return the tool name."""
        return "imageGeneration"

    def get_description(self):
        """Return the human-readable tool description."""
        return "Generates images based on text descriptions"

    def get_schema(self):
        """Return the parameter schema; only ``prompt`` is required."""
        return {
            "type": "object",
            "properties": {
                "prompt": {
                    "type": "string",
                    "description": "Text description of the image to generate"
                },
                "style": {
                    "type": "string",
                    "enum": ["realistic", "artistic", "cartoon", "sketch"],
                    "default": "realistic"
                },
                "width": {
                    "type": "integer",
                    "default": 512
                },
                "height": {
                    "type": "integer",
                    "default": 512
                }
            },
            "required": ["prompt"]
        }

    async def execute_async(self, request: ToolRequest) -> ToolResponse:
        """Generate an image for the request and return it as Base64 PNG.

        Args:
            request: Tool request whose ``parameters`` mapping holds ``prompt``
                and optionally ``style``, ``width`` and ``height``.

        Returns:
            A ToolResponse whose result contains ``imageBase64``, ``format``,
            ``width``, ``height``, ``generationPrompt`` and ``style``.

        Raises:
            ToolExecutionException: If the prompt is missing or blank, a
                dimension is not a positive integer, or generation/encoding
                fails. The original error is chained as the cause.
        """
        try:
            # Extract parameters
            prompt = request.parameters.get("prompt")
            style = request.parameters.get("style", "realistic")
            width = request.parameters.get("width", 512)
            height = request.parameters.get("height", 512)

            # "prompt" is required by the schema; fail with a clear message
            # instead of passing None on to the generator.
            if not isinstance(prompt, str) or not prompt.strip():
                raise ValueError("'prompt' must be a non-empty string")

            # bool is a subclass of int, so exclude it explicitly.
            for label, value in (("width", width), ("height", height)):
                if isinstance(value, bool) or not isinstance(value, int) or value <= 0:
                    raise ValueError(f"'{label}' must be a positive integer")

            # Generate image using external service (example implementation)
            image_data = await self._generate_image(prompt, style, width, height)

            # Convert image to base64 for response
            buffered = io.BytesIO()
            image_data.save(buffered, format="PNG")
            img_str = base64.b64encode(buffered.getvalue()).decode()

            # Return result with both the image and metadata
            return ToolResponse(
                result={
                    "imageBase64": img_str,
                    "format": "image/png",
                    "width": width,
                    "height": height,
                    "generationPrompt": prompt,
                    "style": style
                }
            )
        except Exception as e:
            # Chain explicitly so the underlying failure is kept as __cause__.
            raise ToolExecutionException(f"Image generation failed: {str(e)}") from e

    async def _generate_image(self, prompt: str, style: str, width: int, height: int) -> Image.Image:
        """
        This would call an actual image generation API
        Simplified placeholder implementation

        Currently ignores ``prompt`` and ``style`` and returns a solid-colour
        RGB image of the requested size.
        """
        # Return a placeholder image or call actual image generation API
        # For this example, we'll create a simple colored image
        image = Image.new('RGB', (width, height), color=(73, 109, 137))
        return image

# Multi-modal response handler
class MultiModalResponseHandler:
    """Builds responses that pair text with optionally generated images."""

    def __init__(self, mcp_client):
        # Client used to invoke the "imageGeneration" tool
        self.client = mcp_client

    async def create_multi_modal_response(self,
                                         text_content: str,
                                         generate_images: bool = False,
                                         image_prompts: Optional[List[str]] = None) -> Dict[str, Any]:
        """
        Return ``{"text": ..., "images": [...]}`` for the given text.

        Images are produced only when ``generate_images`` is true and
        ``image_prompts`` is non-empty; each prompt is sent, in order, to the
        "imageGeneration" tool as a 512x512 "realistic" request.
        """
        images: List[Dict[str, Any]] = []
        payload: Dict[str, Any] = {"text": text_content, "images": images}

        # Nothing to generate: hand back the text-only response
        if not (generate_images and image_prompts):
            return payload

        for prompt in image_prompts:
            tool_arguments = {
                "prompt": prompt,
                "style": "realistic",
                "width": 512,
                "height": 512,
            }
            outcome = await self.client.execute_tool("imageGeneration", tool_arguments)

            entry = {
                "imageData": outcome.result["imageBase64"],
                "format": outcome.result["format"],
                "prompt": prompt,
            }
            images.append(entry)

        return payload

# Main application
async def main():
    """Create the multi-modal MCP server, register its tools and start it."""
    # Server settings gathered in one place
    server_settings = {
        "name": "Multi-Modal MCP Server",
        "version": "1.0.0",
        "port": 5000,
    }
    mcp_server = McpServer(**server_settings)

    # One tool per modality: image generation, audio analysis, video frames
    mcp_server.register_tool(ImageGenerationTool())
    mcp_server.register_tool(AudioAnalysisTool())
    mcp_server.register_tool(VideoFrameExtractionTool())

    # Start the server, then announce it
    await mcp_server.start()
    print("Multi-Modal MCP Server running on port 5000")


# Run the async entry point only when executed as a script
if __name__ == "__main__":
    import asyncio
    asyncio.run(main())

مرحله بعد

5.3 OAuth 2

سلب مسئولیت:
این سند با استفاده از سرویس ترجمه هوش مصنوعی Co-op Translator ترجمه شده است. در حالی که ما در تلاش برای دقت هستیم، لطفاً توجه داشته باشید که ترجمه‌های خودکار ممکن است حاوی خطاها یا نواقصی باشند. سند اصلی به زبان بومی خود باید به عنوان منبع معتبر در نظر گرفته شود. برای اطلاعات حیاتی، ترجمه حرفه‌ای انسانی توصیه می‌شود. ما مسئول هیچ گونه سوءتفاهم یا تفسیر نادرستی که از استفاده این ترجمه ناشی شود، نیستیم.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

یکپارچه‌سازی چندرسانه‌ای

مقدمه

اهداف یادگیری

معماری پشتیبانی چندرسانه‌ای

مثال چندرسانه‌ای: تحلیل تصویر

پیاده‌سازی C#

مثال چندرسانه‌ای: پردازش صدا

پیاده‌سازی Java

مثال چندرسانه‌ای: تولید پاسخ چندرسانه‌ای

پیاده‌سازی Python

مرحله بعد

FilesExpand file tree

README.md

Latest commit

History

README.md

File metadata and controls

یکپارچه‌سازی چندرسانه‌ای

مقدمه

اهداف یادگیری

معماری پشتیبانی چندرسانه‌ای

مثال چندرسانه‌ای: تحلیل تصویر

پیاده‌سازی C#

مثال چندرسانه‌ای: پردازش صدا

پیاده‌سازی Java

مثال چندرسانه‌ای: تولید پاسخ چندرسانه‌ای

پیاده‌سازی Python

مرحله بعد