Add LLM as a Judge component

Ololoshechkin · Ololoshechkin · commit 08bbf81add85 · 2025-09-26T04:12:02.000+02:00
diff --git a/agents/agents-ext/src/commonMain/kotlin/ai/koog/agents/ext/agent/LLMAsAJudge.kt b/agents/agents-ext/src/commonMain/kotlin/ai/koog/agents/ext/agent/LLMAsAJudge.kt
@@ -0,0 +1,111 @@
+package ai.koog.agents.ext.agent
+
+import ai.koog.agents.core.dsl.builder.AIAgentNodeDelegate
+import ai.koog.agents.core.dsl.builder.AIAgentSubgraphBuilderBase
+import ai.koog.agents.core.tools.annotations.LLMDescription
+import ai.koog.prompt.dsl.prompt
+import ai.koog.prompt.executor.clients.openai.OpenAIModels
+import ai.koog.prompt.llm.LLModel
+import ai.koog.prompt.message.Message
+import ai.koog.prompt.structure.StructureFixingParser
+import kotlinx.serialization.Serializable
+
+/**
+ * Structured verdict that [llmAsAJudge] requests from the LLM.
+ *
+ * NOTE(review): `feedback` is described to the LLM as optional, yet it is declared as a
+ * non-null property without a default value — confirm that this is intended.
+ */
+@Serializable
+@LLMDescription("Result of the evaluation")
+internal data class CriticResultFromLLM(
+    @property:LLMDescription("Was the plan correct?")
+    val isCorrect: Boolean,
+    @property:LLMDescription(
+        "Optional feedback about the plan. Only needed if `isCorrect == false` and if plan needs adjustments."
+    )
+    val feedback: String,
+)
+
+/**
+ * Represents the result of a critique or feedback process, as produced by [llmAsAJudge].
+ *
+ * @property successful The judge's verdict: `true` when the LLM reported the evaluated work as correct.
+ * @property feedback A textual message with the feedback the LLM returned alongside its verdict.
+ * @property input The input the judging node received, passed through unchanged.
+ */
+public data class CriticResult(
+    val successful: Boolean,
+    val feedback: String,
+    val input: String
+)
+
+/**
+ * A method to utilize a language model (LLM) as a critic or judge for evaluating tasks with context-aware feedback.
+ *
+ * The node temporarily replaces the session prompt with a dedicated "critic" prompt: [task] becomes the system
+ * instruction and the whole previous history is flattened into a single XML-tagged user message. It then requests
+ * a structured verdict from the LLM. The original prompt and model are restored afterwards, including when the
+ * structured request fails; in that case the failure raised by `getOrThrow()` propagates to the caller.
+ *
+ * @param llmModel The optional language model to override the default model during the session. If `null`, the default model will be used.
+ * @param task The task or instruction to be presented to the language model for critical evaluation.
+ * @return A node delegate that takes a `String` input and produces a [CriticResult] holding the verdict,
+ *   the feedback and the unchanged node input.
+ */
+public fun AIAgentSubgraphBuilderBase<*, *>.llmAsAJudge(
+    llmModel: LLModel? = null,
+    task: String
+): AIAgentNodeDelegate<String, CriticResult> = node<String, CriticResult> { nodeInput ->
+    llm.writeSession {
+        // Snapshot the session state so it can be put back once the judge has answered.
+        val initialPrompt = prompt.copy()
+        val initialModel = model
+
+        prompt = prompt("critic") {
+            // Combine all history into one message with XML tags
+            // to prevent LLM from continuing answering in a tool_call -> tool_result pattern
+            val combinedMessage = buildString {
+                append("<previous_conversation>\n")
+                initialPrompt.messages.forEach { message ->
+                    when (message) {
+                        // NOTE(review): system messages are emitted under the <user> tag, same as user
+                        // messages — confirm this is intentional and not a copy-paste slip.
+                        is Message.System -> append("<user>\n${message.content}\n</user>\n")
+                        is Message.User -> append("<user>\n${message.content}\n</user>\n")
+                        is Message.Assistant -> append("<assistant>\n${message.content}\n</assistant>\n")
+                        is Message.Tool.Call -> append(
+                            "<tool_call tool=${message.tool}>\n${message.content}\n</tool_call>\n"
+                        )
+
+                        is Message.Tool.Result -> append(
+                            "<tool_result tool=${message.tool}>\n${message.content}\n</tool_result>\n"
+                        )
+                    }
+                }
+                append("</previous_conversation>\n")
+            }
+
+            // Put Critic Task as a System instruction
+            system(task)
+            // And rest of the history -- in a combined XML message
+            user(combinedMessage)
+        }
+
+        if (llmModel != null) {
+            model = llmModel
+        }
+
+        // The critic prompt/model are temporary: restore the session state in `finally` so that a failed
+        // structured request does not leave the session pointing at the critic prompt.
+        val result = try {
+            requestLLMStructured<CriticResultFromLLM>(
+                // optional field -- recommended for LLM awareness and reliability of the output
+                examples = listOf(
+                    CriticResultFromLLM(
+                        isCorrect = true,
+                        feedback = "All good"
+                    ),
+                    CriticResultFromLLM(
+                        isCorrect = false,
+                        feedback = "Following parts of the plan have problems: *, *, *. Please consider changing ..."
+                    )
+                ),
+                // optional field -- recommended for reliability of the format
+                // NOTE(review): the fixing model is hard-coded to an OpenAI model regardless of `llmModel` —
+                // confirm this is acceptable for setups without an OpenAI client.
+                fixingParser = StructureFixingParser(
+                    fixingModel = OpenAIModels.CostOptimized.GPT4oMini,
+                    retries = 3,
+                )
+            ).getOrThrow().structure
+        } finally {
+            prompt = initialPrompt
+            model = initialModel
+        }
+
+        CriticResult(
+            successful = result.isCorrect,
+            feedback = result.feedback,
+            input = nodeInput
+        )
+    }
+}