# Copyright (c) 2023 - 2025, AG2ai, Inc., AG2ai open-source projects maintainers and core contributors
#
# SPDX-License-Identifier: Apache-2.0
import asyncio
from copy import deepcopy
from typing import Any, Optional, Union
from pydantic import BaseModel, Field
from .... import (
Agent,
ConversableAgent,
UpdateSystemMessage,
)
from ....doc_utils import export_module
from ....oai.client import OpenAIWrapper
__all__ = ["EvaluationAgent"]
@export_module("autogen.agents.contrib")
class EvaluationAgent(ConversableAgent):
"""Utilises multiple agents, evaluating their performance then selecting and returning the best one.
The agent follows the internal process:
1. Synthesize the task from the input using an LLM
2. Ask each agent to respond to the task (asynchronously)
3. Evaluator evaluates and selects the response
4. Return the selected response
You must pass in at least two agents.
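    Example (an illustrative sketch only; the model name, API-key handling, agent names, and the
    EvaluationAgent import path are assumptions inferred from the @export_module decorator, not
    taken from the library documentation):
        import os
        from autogen import ConversableAgent
        from autogen.agents.contrib import EvaluationAgent
        llm_config = {"config_list": [{"model": "gpt-4o", "api_key": os.environ["OPENAI_API_KEY"]}]}
        concise = ConversableAgent(name="concise_writer", llm_config=llm_config, system_message="Answer as briefly as possible.")
        detailed = ConversableAgent(name="detailed_writer", llm_config=llm_config, system_message="Answer with thorough detail.")
        evaluator = EvaluationAgent(name="evaluation_agent", llm_config=llm_config, agents=[concise, detailed])
        user = ConversableAgent(name="user", human_input_mode="NEVER")
        result = user.initiate_chat(recipient=evaluator, message="Explain what a mutex is.", max_turns=1)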
"""
# Evaluator agent system message, cannot be overridden
DEFAULT_EVALUATOR_MESSAGE = (
"You are responsible for evaluating and selecting the best response from a set of agents. "
"Each agent, identified by a name, will be given a chance to respond. "
"Remember, you are only evaluating and it's not your opinion so don't add your judgement, change responses, or decline to respond. "
"Evaluation Criteria:\n[evaluation_guidance]\n"
"[agent_outputs]"
)
# Default evaluation guidance, can be overridden
    DEFAULT_EVALUATION_GUIDANCE = (
"1. Carefully review each approach and result\n"
"2. Evaluate each solution based on criteria appropriate to the task\n"
"3. Select the absolute best response\n"
"4. You must select a response as the best response"
)
# Default reply template for the EvaluationAgent, can be overridden
DEFAULT_REPLY_TEMPLATE = "AGENT '[agent_name]' RESPONSE SELECTED.\n\nREASON:\n[reason]\n\nRESPONSE:\n[response]"
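    # With illustrative values substituted (hypothetical agent and text, not from the source),
    # the default template produces a reply like:
    #   AGENT 'concise_writer' RESPONSE SELECTED.
    #
    #   REASON:
    #   Most direct and accurate answer to the task.
    #
    #   RESPONSE:
    #   A mutex is a lock that ensures only one thread enters a critical section at a time.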
def __init__(
self,
*,
llm_config: dict[str, Any],
agents: list[ConversableAgent],
response_instructions: Optional[str] = None,
evaluation_guidance: Optional[str] = None,
reply_template: Optional[str] = None,
async_responses: bool = True,
silent: bool = False,
**kwargs: Any,
) -> None:
"""Initialize the EvaluationAgent.
Args:
llm_config (dict[str, Any]): LLM Configuration for the internal synthesizer and evaluator agents.
agents (list[ConversableAgent]): List of agents that will provide their responses for evaluation.
            response_instructions (Optional[str]): Instructions for the agents on how to respond to the task. This will be appended to the end of the synthesized task message.
            evaluation_guidance (Optional[str]): Guidance on how to evaluate the agents, used by the internal evaluator agent.
Default is:
"1. Carefully review each approach and result\n2. Evaluate each solution based on criteria appropriate to the task\n3. Select the absolute best response\n4. You must select a response as the best response"
            reply_template (Optional[str]): Template for the reply to be generated by the EvaluationAgent.
Three placeholders are available for substitution: [agent_name], [reason], and [response].
Default is:
"AGENT '[agent_name]' RESPONSE SELECTED.\n\nREASON:\n[reason]\n\nRESPONSE:\n[response]"
async_responses (bool): Whether to gather responses asynchronously. Default is True.
            silent (bool): Whether to silence the agent's internal conversations. Default is False, meaning all internal conversations will be visible.
**kwargs (Any): Additional keyword arguments to pass to the base class.
"""
assert len(agents) > 1, "EvaluationAgent requires at least two agents for evaluation."
assert llm_config, "EvaluationAgent requires llm_config for the internal synthesizer and evaluator agents."
# Initialise the base class, ignoring llm_config as we'll put that on internal agents
super().__init__(**kwargs)
# Store custom parameters
self._evaluation_agents = agents
self._evaluation_response_instructions = response_instructions
self._evaluation_llm_config = llm_config
        self._evaluation_guidance = evaluation_guidance if evaluation_guidance else self.DEFAULT_EVALUATION_GUIDANCE
self._evaluation_reply_template = reply_template if reply_template else self.DEFAULT_REPLY_TEMPLATE
self._evaluation_silent = silent
self._evaluation_async = async_responses
# Register our reply function for evaluation with the agent
# This will be the agent's only reply function
self.register_reply(
trigger=[Agent, None], reply_func=self._generate_evaluate_reply, remove_other_reply_funcs=True
)
# Class used internally to get the string result from a function or, if it fails, what we should return
class FunctionStringResult(BaseModel):
result: str = ""
success: bool = True
# SYNTHESIZING TASK
# Structured Output for the synthesizer agent
class EvaluationTask(BaseModel):
task: str = Field(description="The task to be solved by the agents.")
clarification_needed: Optional[str] = Field(
description="If the task is not clear, describe clarity needed. Only ask if absolutely critical."
)
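    # The synthesizer replies with JSON matching EvaluationTask, e.g. (illustrative values only):
    #   {"task": "Write a haiku about autumn", "clarification_needed": null}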
# Consolidate messages from the outside chat for the synthesizer to determine the task from
def _consolidate_messages(self, messages: Optional[Union[list[dict[str, Any]], str]]) -> str: # type: ignore[type-arg]
"""Consolidates the external chat's messages for the Synthesizer to analyse"""
if isinstance(messages, str):
return messages
elif isinstance(messages, list) and len(messages) > 0:
            # Loop through the messages and consolidate them, noting that some may be tool calls and some may be tool responses, which we'll ignore.
            # If a message has both content and name, it is combined as "name:\ncontent\n\n"
consolidated_message = ""
for message in messages:
if "content" in message and "name" in message:
consolidated_message += f"{message['name']}:\n{message['content']}\n\n"
return consolidated_message.strip()
else:
raise NotImplementedError("Invalid messages format. Must be a list of messages or a string.")
def _create_synthesizer(self) -> None:
"""Create the internal synthesizer agent."""
# Add the response_format to the agent
synthesizer_llm_config = deepcopy(self._evaluation_llm_config)
synthesizer_llm_config["response_format"] = EvaluationAgent.EvaluationTask
self._synthesizer_agent = ConversableAgent(
name="evaluationagent_synthesizer",
llm_config=synthesizer_llm_config,
system_message=(
"Analyze the messages and determine the task being asked to be solved and reply with it, keeping it as close to word-to-word as possible. "
"If clarification is needed, provide details on the clarity needed. No other information is being be provided to the respondents."
),
)
def _synthesize_task(self, user_agent: ConversableAgent, messages: list[dict[str, Any]]) -> FunctionStringResult:
"""Synthesize the task from the outside messages."""
self._create_synthesizer()
        consolidated_incoming_messages = self._consolidate_messages(messages)
        synthesized_result = user_agent.initiate_chat(
            recipient=self._synthesizer_agent,
            message=consolidated_incoming_messages,
max_turns=1,
silent=self._evaluation_silent,
)
# Evaluate the result of the task synthesis
try:
            evaluation_task = EvaluationAgent.EvaluationTask.model_validate_json(synthesized_result.summary)
except Exception as e:
return EvaluationAgent.FunctionStringResult(
result=f"EvaluationAgent was unable to determine the task: {e}", success=False
)
if not evaluation_task.task:
return EvaluationAgent.FunctionStringResult(
result="EvaluationAgent was unable to determine the task.", success=False
)
if evaluation_task.clarification_needed:
return EvaluationAgent.FunctionStringResult(
result=f"I need clarity on the task: {evaluation_task.clarification_needed}", success=False
)
return EvaluationAgent.FunctionStringResult(result=evaluation_task.task)
# GATHER RESPONSES
def _compile_responses_nested_chat(self, task: str) -> list[dict[str, Any]]:
"""Compile the nested chat for the responses part of the evaluation process."""
nested_chats = []
for i, agent in enumerate(self._evaluation_agents):
agent_dict = {
"recipient": agent,
"chat_id": agent.name,
"message": f"Please provide your response to the task:\n\n{task}",
"summary_method": "last_msg",
"max_turns": 1,
# Exclude all chat results before this one from being carried over (so the agent only sees the task)
"finished_chat_indexes_to_exclude_from_carryover": [] if i == 0 else list(range(i)),
}
nested_chats.append(agent_dict)
return nested_chats
def _compile_nested_responses(
self, sender: ConversableAgent, recipient: ConversableAgent, summary_args: dict[str, Any]
) -> str:
response: str = ""
self._evaluation_agent_responses: dict[str, str] = {}
for agent in self._evaluation_agents:
if recipient.chat_messages[agent] and len(recipient.chat_messages[agent]) == 2:
agent_response = recipient.chat_messages[agent][-1]
response += f"AGENT '{agent.name}' RESPONSE:\n{agent_response['content']}\n\n" + "-" * 50 + "\n\n"
self._evaluation_agent_responses[agent.name] = agent_response["content"]
else:
return "" # At least one of the agents didn't respond, abort.
return response
def _gather_responses(self, user_agent: ConversableAgent, task: str) -> FunctionStringResult:
"""Gather responses from all agents for the task."""
gathering_agent = ConversableAgent(
name="evaluation_gather",
)
# Create the nested chats for all the agents to respond
responses_nested_chat = self._compile_responses_nested_chat(task=task)
# Associate that with the gathering_agent
gathering_agent.register_nested_chats(
chat_queue=responses_nested_chat,
position=0,
use_async=self._evaluation_async,
trigger=Agent, # Any agent sender will trigger this
)
if self._evaluation_async:
# Asynchronously get the responses
responses_result = asyncio.run(
user_agent.a_initiate_chat(
recipient=gathering_agent,
max_turns=1,
message="", # Prevent it trying to get user input
silent=self._evaluation_silent,
summary_method=self._compile_nested_responses,
)
)
else:
# Synchronously get the responses
responses_result = user_agent.initiate_chat(
recipient=gathering_agent,
max_turns=1,
message="", # Prevent it trying to get user input
silent=self._evaluation_silent,
summary_method=self._compile_nested_responses,
)
if responses_result.summary == "":
return EvaluationAgent.FunctionStringResult(
result="EvaluationAgent was unable to gather responses from all agents.", success=False
)
# Compiled responses
return EvaluationAgent.FunctionStringResult(result=responses_result.summary)
# EVALUATOR AGENT
# Structured Output for the evaluator agent
class NominatedResponse(BaseModel):
agent_name: str = Field(description="Name of agent that provided the response.")
response: str = Field(description="Exact, word-for-word, response selected.")
reason: str = Field(description="Brief reason why it was the best response.")
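    # The evaluator replies with JSON matching NominatedResponse, e.g. (illustrative values only):
    #   {"agent_name": "concise_writer", "response": "A mutex is ...", "reason": "Most direct and accurate."}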
def _generate_evaluator_system_message(self, agent: ConversableAgent, messages: list[dict[str, Any]]) -> str:
"""Generate the system message for the internal evaluator agent."""
# Substitute the evaluation guidance into the system message
return EvaluationAgent.DEFAULT_EVALUATOR_MESSAGE.replace("[evaluation_guidance]", self._evaluation_guidance)
def _create_evaluator(self) -> None:
"""Create the internal evaluator agent."""
# Add the response_format to the agent
evaluator_llm_config = deepcopy(self._evaluation_llm_config)
evaluator_llm_config["response_format"] = EvaluationAgent.NominatedResponse
self._evaluator_agent = ConversableAgent(
name="evaluationagent_evaluator",
llm_config=evaluator_llm_config,
update_agent_state_before_reply=[UpdateSystemMessage(self._generate_evaluator_system_message)],
)
# Inner evaluation process
def _generate_evaluate_reply(
self,
agent: ConversableAgent,
messages: Optional[list[dict[str, Any]]] = None,
sender: Optional[Agent] = None,
config: Optional[OpenAIWrapper] = None,
) -> tuple[bool, Union[str, dict[str, Any]]]:
if not messages:
return True, {"content": "EvaluationAgent requires messages to evaluate, please reply with a task."}
# Supplemental agent used for chatting with internal agents
user_agent = ConversableAgent(
name="evaluation_user",
human_input_mode="NEVER",
)
# 1. Synthesize the task from the input
synthesized_task = self._synthesize_task(user_agent, messages)
if not synthesized_task.success:
return True, {"content": synthesized_task.result}
task = synthesized_task.result
if self._evaluation_response_instructions:
task += f"\n\n{self._evaluation_response_instructions}"
# 2. Each agent gives their response using an asynchronous nested chat
gather_compiled_responses = self._gather_responses(user_agent, task)
if not gather_compiled_responses.success:
return True, {"content": gather_compiled_responses.result}
compiled_responses = gather_compiled_responses.result
# 3. Evaluator evaluates and selects the response
self._create_evaluator()
evaluation = user_agent.initiate_chat(
recipient=self._evaluator_agent, message=compiled_responses, max_turns=1, silent=self._evaluation_silent
)
# Extract the nominated response
try:
nominated_response = EvaluationAgent.NominatedResponse.model_validate_json(evaluation.summary)
except Exception as e:
return True, {"content": f"EvaluationAgent was unable to select the best response: {e}"}
if not nominated_response.response:
return True, {"content": "EvaluationAgent was unable to select a response."}
# Ensure the nominated agent name exists
if nominated_response.agent_name not in [a.name for a in self._evaluation_agents]:
return True, {"content": "EvaluationAgent provided an invalid agent name when selecting a response."}
        # Use the agent's original response rather than the copy in the structured output,
        # so the text is returned exactly as it was originally produced
agent_response = self._evaluation_agent_responses[nominated_response.agent_name]
# Compile the response and return it using the self._evaluation_reply_template
compiled_reply = (
self._evaluation_reply_template.replace("[agent_name]", nominated_response.agent_name)
.replace("[reason]", nominated_response.reason)
.replace("[response]", agent_response)
)
# 4. Return the selected response in the specified compiled format
return True, compiled_reply