Merge pull request #726 from alphagov/2996-add-coherence-metric

davidgisbey · web-flow · commit 4a878b4ae84c · 2026-01-06T10:22:45.000Z
Add Coherence metric
diff --git a/lib/auto_evaluation.rb b/lib/auto_evaluation.rb
@@ -0,0 +1,9 @@
+module AutoEvaluation
+  ScoreResult = Data.define(
+    :score,
+    :reason,
+    :success,
+    :llm_responses,
+    :metrics,
+  )
+end
diff --git a/lib/auto_evaluation/answer_relevancy.rb b/lib/auto_evaluation/answer_relevancy.rb
@@ -1,12 +1,4 @@
 class AutoEvaluation::AnswerRelevancy
-  Result = Data.define(
-    :score,
-    :reason,
-    :success,
-    :llm_responses,
-    :metrics,
-  )
-
   THRESHOLD = 0.5
 
   def self.call(...) = new(...).call
@@ -55,7 +47,7 @@ def call
       question_message:, verdicts:, score:,
     )
 
-    Result.new(
+    AutoEvaluation::ScoreResult.new(
       score:,
       reason:,
       success: score >= THRESHOLD,
@@ -78,7 +70,7 @@ def calculate_score(verdicts)
   end
 
   def build_maximum_score_result(reason:, llm_responses:, metrics:)
-    Result.new(
+    AutoEvaluation::ScoreResult.new(
       score: 1.0,
       reason:,
       success: true,
diff --git a/lib/auto_evaluation/bedrock_openai_oss_invoke.rb b/lib/auto_evaluation/bedrock_openai_oss_invoke.rb
@@ -1,5 +1,6 @@
 module AutoEvaluation
   class BedrockOpenAIOssInvoke
+    class InvalidToolCallSchemaError < StandardError; end
     Result = Data.define(
       :evaluation_data,
       :llm_response,
@@ -33,12 +34,14 @@ def call
         }.to_json,
       )
       parsed_response = JSON.parse(response.body.read)
-      parsed_structured_output = JSON.parse(
+      parsed_tool_output = JSON.parse(
         parsed_response["choices"][0]["message"]["tool_calls"][0]["function"]["arguments"],
       )
 
+      validate_tool_output_against_schema(parsed_tool_output)
+
       Result.new(
-        evaluation_data: parsed_structured_output,
+        evaluation_data: parsed_tool_output,
         llm_response: parsed_response,
         metrics: build_metrics(start_time, parsed_response),
       )
@@ -57,5 +60,12 @@ def build_metrics(start_time, response)
         model: response["model"],
       }
     end
+
+    def validate_tool_output_against_schema(tool_output)
+      schema = tools.dig(0, "function", "parameters")
+      JSON::Validator.validate!(schema, tool_output)
+    rescue JSON::Schema::ValidationError => e
+      raise InvalidToolCallSchemaError, "Tool call response does not match schema: #{e.message}"
+    end
   end
 end
diff --git a/lib/auto_evaluation/coherence.rb b/lib/auto_evaluation/coherence.rb
@@ -0,0 +1,52 @@
+module AutoEvaluation
+  class Coherence
+    THRESHOLD = 0.75
+
+    def self.call(...) = new(...).call
+
+    def initialize(question_message:, answer_message:)
+      @question_message = question_message
+      @answer_message = answer_message
+    end
+
+    def call
+      result = BedrockOpenAIOssInvoke.call(user_prompt, tools)
+      score = normalise_rubric_score(result.evaluation_data.fetch("score"))
+
+      AutoEvaluation::ScoreResult.new(
+        score:,
+        reason: result.evaluation_data.fetch("reason").strip,
+        success: score >= THRESHOLD,
+        llm_responses: { coherence: result.llm_response },
+        metrics: { coherence: result.metrics },
+      )
+    end
+
+  private
+
+    attr_reader :question_message, :answer_message
+
+    def llm_prompts
+      Prompts.config.coherence
+    end
+
+    def user_prompt
+      sprintf(
+        llm_prompts.fetch(:user_prompt),
+        answer: answer_message,
+        question: question_message,
+      )
+    end
+
+    def tools
+      [llm_prompts.fetch(:tool_spec)]
+    end
+
+    def normalise_rubric_score(rubric_score)
+      min_rubric_score = llm_prompts.fetch(:config).fetch(:min_rubric_score)
+      max_rubric_score = llm_prompts.fetch(:config).fetch(:max_rubric_score)
+
+      (rubric_score.to_d - min_rubric_score) / (max_rubric_score - min_rubric_score)
+    end
+  end
+end
diff --git a/lib/auto_evaluation/evaluate_answer_from_question_message.rb b/lib/auto_evaluation/evaluate_answer_from_question_message.rb
@@ -0,0 +1,35 @@
+module AutoEvaluation
+  class EvaluateAnswerFromQuestionMessage
+    class TaskFailedError < StandardError; end
+
+    def self.call(...) = new(...).call
+
+    def initialize(evaluation_class:, question_message:)
+      @evaluation_class = evaluation_class
+      @question_message = question_message
+    end
+
+    def call
+      question = Question.new(message: question_message, conversation: Conversation.new)
+      answer = AnswerComposition::PipelineRunner.call(question:, pipeline: [
+        AnswerComposition::Pipeline::SearchResultFetcher,
+        AnswerComposition::Pipeline::Claude::StructuredAnswerComposer,
+      ])
+
+      if answer.status =~ /^error/
+        error_message = "Answer has an error status: #{answer.status} " \
+                        "and error message: #{answer.error_message}"
+        raise TaskFailedError, error_message
+      end
+
+      evaluation_class.call(
+        question_message:,
+        answer_message: answer.message,
+      )
+    end
+
+  private
+
+    attr_reader :evaluation_class, :question_message
+  end
+end
diff --git a/lib/tasks/evaluation.rake b/lib/tasks/evaluation.rake
@@ -177,24 +177,31 @@ namespace :evaluation do
   task generate_answer_relevancy_evaluation: :environment do
     raise "Requires an INPUT env var" if ENV["INPUT"].blank?
 
-    question = Question.new(message: ENV["INPUT"], conversation: Conversation.new)
-
-    answer = AnswerComposition::PipelineRunner.call(question:, pipeline: [
-      AnswerComposition::Pipeline::Claude::QuestionRouter,
-      AnswerComposition::Pipeline::SearchResultFetcher,
-      AnswerComposition::Pipeline::Claude::StructuredAnswerComposer,
-    ])
-
-    if answer.status =~ /^error/
-      warn "Warning: answer has an error status: #{answer.status}"
-      abort(answer.error_message)
+    begin
+      result = AutoEvaluation::EvaluateAnswerFromQuestionMessage.call(
+        evaluation_class: AutoEvaluation::AnswerRelevancy,
+        question_message: ENV["INPUT"],
+      )
+
+      puts result.to_json
+    rescue AutoEvaluation::EvaluateAnswerFromQuestionMessage::TaskFailedError => e
+      abort e.message
     end
+  end
 
-    result = AutoEvaluation::AnswerRelevancy.call(
-      question_message: answer.rephrased_question || question.message,
-      answer_message: answer.message,
-    )
+  desc "Run answer coherence evaluation for a user input"
+  task generate_coherence_evaluation: :environment do
+    raise "Requires an INPUT env var" if ENV["INPUT"].blank?
 
-    puts(result.to_json)
+    begin
+      result = AutoEvaluation::EvaluateAnswerFromQuestionMessage.call(
+        evaluation_class: AutoEvaluation::Coherence,
+        question_message: ENV["INPUT"],
+      )
+
+      puts result.to_json
+    rescue AutoEvaluation::EvaluateAnswerFromQuestionMessage::TaskFailedError => e
+      abort e.message
+    end
   end
 end
diff --git a/spec/factories/auto_evaluation_score_result.rb b/spec/factories/auto_evaluation_score_result.rb
@@ -0,0 +1,13 @@
+FactoryBot.define do
+  factory :auto_evaluation_score_result, class: "AutoEvaluation::ScoreResult" do
+    skip_create
+
+    score { 0.85.to_d }
+    reason { "Most statements are relevant." }
+    success { true }
+    llm_responses { {} }
+    metrics { {} }
+
+    initialize_with { new(**attributes) }
+  end
+end
diff --git a/spec/lib/auto_evaluation/answer_relevancy/verdicts_generator_spec.rb b/spec/lib/auto_evaluation/answer_relevancy/verdicts_generator_spec.rb
@@ -4,8 +4,8 @@
     let(:statements) { ["Statement one.", "Statement two."] }
     let(:verdicts) do
       [
-        { "verdict" => "Yes" },
-        { "verdict" => "No", "reason" => "The statement is irrelevant." },
+        { "verdict" => "yes" },
+        { "verdict" => "no", "reason" => "The statement is irrelevant." },
       ]
     end
     let(:verdicts_json) do
diff --git a/spec/lib/auto_evaluation/answer_relevancy_spec.rb b/spec/lib/auto_evaluation/answer_relevancy_spec.rb
@@ -23,8 +23,8 @@
 
     let(:verdicts) do
       [
-        { "verdict" => "Yes" },
-        { "verdict" => "No", "reason" => "The statement is irrelevant." },
+        { "verdict" => "yes" },
+        { "verdict" => "no", "reason" => "The statement is irrelevant." },
       ]
     end
     let(:verdicts_json) { { verdicts: }.to_json }
@@ -90,7 +90,7 @@
         reason: shared_expected_metrics_attributes,
       }
       expect(result)
-        .to be_a(described_class::Result)
+        .to be_a(AutoEvaluation::ScoreResult)
         .and have_attributes(
           score: 0.5,
           reason:,
@@ -104,7 +104,7 @@
       let(:verdicts) do
         [
           { "verdict" => "idk", "reason" => "Cannot determine relevance." },
-          { "verdict" => "No", "reason" => "The statement is irrelevant." },
+          { "verdict" => "no", "reason" => "The statement is irrelevant." },
         ]
       end
 
@@ -130,7 +130,7 @@
         )
 
         expect(result)
-          .to be_a(described_class::Result)
+          .to be_a(AutoEvaluation::ScoreResult)
           .and have_attributes(
             score: 1.0,
             reason: "No statements were extracted from the answer.",
@@ -154,7 +154,7 @@
         )
 
         expect(result)
-          .to be_a(described_class::Result)
+          .to be_a(AutoEvaluation::ScoreResult)
           .and have_attributes(
             score: 1.0,
             reason: "No verdicts were generated for the extracted statements.",
@@ -172,7 +172,7 @@
     end
 
     context "when verdicts are generated and none have a 'no' verdict" do
-      let(:verdicts_json) { { verdicts: [{ "verdict" => "Yes" }, { "verdict" => "Yes" }] }.to_json }
+      let(:verdicts_json) { { verdicts: [{ "verdict" => "yes" }, { "verdict" => "yes" }] }.to_json }
 
       it "returns a result object with the expected attributes" do
         allow(Clock).to receive(:monotonic_time).and_return(200.0, 202.0, 204.0, 206.0)
@@ -183,7 +183,7 @@
         )
 
         expect(result)
-          .to be_a(described_class::Result)
+          .to be_a(AutoEvaluation::ScoreResult)
           .and have_attributes(
             score: 1.0,
             reason: "The response fully addressed the input with no irrelevant statements.",
diff --git a/spec/lib/auto_evaluation/bedrock_openai_oss_invoke_spec.rb b/spec/lib/auto_evaluation/bedrock_openai_oss_invoke_spec.rb
@@ -4,18 +4,18 @@
     let(:tools) do
       [
         {
-          type: "function",
-          function: {
-            name: "test_schema",
-            description: "A test JSON schema",
-            schema: {
-              type: "object",
-              properties: {
-                response: { type: "string" },
+          "type" => "function",
+          "function" => {
+            "name" => "test_schema",
+            "description" => "A test JSON schema",
+            "parameters" => {
+              "type" => "object",
+              "properties" => {
+                "response" => { "type" => "string" },
               },
-              required: %w[response],
+              "required" => %w[response],
             },
-            strict: true,
+            "strict" => true,
           },
         },
       ]
@@ -55,5 +55,20 @@
         },
       )
     end
+
+    it "raises an error if the response does not conform to the schema" do
+      bedrock_invoke_model_openai_oss_tool_call(
+        user_message,
+        tools,
+        { "invalid_key" => "This does not conform to the schema." }.to_json,
+      )
+
+      expect {
+        described_class.call(user_message, tools)
+      }.to raise_error(
+        described_class::InvalidToolCallSchemaError,
+        /The property '#\/' did not contain a required property of 'response'/,
+      )
+    end
   end
 end
diff --git a/spec/lib/auto_evaluation/coherence_spec.rb b/spec/lib/auto_evaluation/coherence_spec.rb
diff --git a/spec/lib/auto_evaluation/evaluate_answer_from_question_message_spec.rb b/spec/lib/auto_evaluation/evaluate_answer_from_question_message_spec.rb
diff --git a/spec/lib/tasks/evaluation_spec.rb b/spec/lib/tasks/evaluation_spec.rb

Original file line number	Diff line number	Diff line change
`@@ -4,8 +4,8 @@`
`4`	`4`	`let(:statements) { ["Statement one.", "Statement two."] }`
`5`	`5`	`let(:verdicts) do`
`6`	`6`	`[`
`7`		`- { "verdict" => "Yes" },`
`8`		`- { "verdict" => "No", "reason" => "The statement is irrelevant." },`
	`7`	`+ { "verdict" => "yes" },`
	`8`	`+ { "verdict" => "no", "reason" => "The statement is irrelevant." },`
`9`	`9`	`]`
`10`	`10`	`end`
`11`	`11`	`let(:verdicts_json) do`