From 1d1f131f2aede12d321e54e51f3e4d66b8a75721 Mon Sep 17 00:00:00 2001
From: David Gisbey <david.gisbey@digital.cabinet-office.gov.uk>
Date: Tue, 6 Jan 2026 13:15:54 +0000
Subject: [PATCH] Update metrics to take answer model as argument

When I added the EvaluateAnswerFromQuestionMessage class, I made a faulty
assumption that the metric inputs would be a question and answer message.

However, the remaining two metrics take an answer message and the sources
it used. This commit updates the AnswerRelevancy and Coherence metrics to
take the answer model as an argument. Each class then decides what it
wants to do with it, allowing us more flexibility.

I've updated the EvaluateAnswerFromQuestionMessage class to reflect this
change by calling the evaluation class with the answer.
---
 lib/auto_evaluation/answer_relevancy.rb       | 13 ++++---
 lib/auto_evaluation/coherence.rb              | 13 ++++---
 .../evaluate_answer_from_question_message.rb  |  5 +--
 .../auto_evaluation/answer_relevancy_spec.rb  | 37 +++++++++----------
 spec/lib/auto_evaluation/coherence_spec.rb    | 29 +++++++++++----
 ...luate_answer_from_question_message_spec.rb |  5 +--
 6 files changed, 56 insertions(+), 46 deletions(-)

diff --git a/lib/auto_evaluation/answer_relevancy.rb b/lib/auto_evaluation/answer_relevancy.rb
index 31d99e8ec..d07d82d0a 100644
--- a/lib/auto_evaluation/answer_relevancy.rb
+++ b/lib/auto_evaluation/answer_relevancy.rb
@@ -3,15 +3,14 @@ class AutoEvaluation::AnswerRelevancy
 
   def self.call(...) = new(...).call
 
-  def initialize(question_message:, answer_message:)
-    @question_message = question_message
-    @answer_message = answer_message
+  def initialize(answer)
+    @answer = answer
     @llm_responses = {}
     @metrics = {}
   end
 
   def call
-    statements, llm_responses[:statements], metrics[:statements] = StatementGenerator.call(answer_message:)
+    statements, llm_responses[:statements], metrics[:statements] = StatementGenerator.call(answer_message: answer.message)
 
     if statements.empty?
       return build_maximum_score_result(
@@ -58,9 +57,13 @@ def call
 
 private
 
-  attr_reader :question_message, :answer_message
+  attr_reader :answer
   attr_accessor :llm_responses, :metrics
 
+  def question_message
+    answer.rephrased_question || answer.question.message
+  end
+
   def calculate_score(verdicts)
     verdict_count = verdicts.count
     return 1.0 if verdict_count.zero?
diff --git a/lib/auto_evaluation/coherence.rb b/lib/auto_evaluation/coherence.rb
index 22b2d9e81..fbac8ab96 100644
--- a/lib/auto_evaluation/coherence.rb
+++ b/lib/auto_evaluation/coherence.rb
@@ -4,9 +4,8 @@ class Coherence
 
     def self.call(...) = new(...).call
 
-    def initialize(question_message:, answer_message:)
-      @question_message = question_message
-      @answer_message = answer_message
+    def initialize(answer)
+      @answer = answer
     end
 
     def call
@@ -24,7 +23,7 @@ def call
 
   private
 
-    attr_reader :question_message, :answer_message
+    attr_reader :answer
 
     def llm_prompts
       Prompts.config.coherence
@@ -33,7 +32,7 @@ def llm_prompts
     def user_prompt
       sprintf(
         llm_prompts.fetch(:user_prompt),
-        answer: answer_message,
+        answer: answer.message,
         question: question_message,
       )
     end
@@ -48,5 +47,9 @@ def normalise_rubric_score(rubric_score)
 
       (rubric_score.to_d - min_rubric_score) / (max_rubric_score - min_rubric_score)
     end
+
+    def question_message
+      answer.rephrased_question || answer.question.message
+    end
   end
 end
diff --git a/lib/auto_evaluation/evaluate_answer_from_question_message.rb b/lib/auto_evaluation/evaluate_answer_from_question_message.rb
index 0db613faf..3516c3bd9 100644
--- a/lib/auto_evaluation/evaluate_answer_from_question_message.rb
+++ b/lib/auto_evaluation/evaluate_answer_from_question_message.rb
@@ -22,10 +22,7 @@ def call
         raise TaskFailedError, error_message
       end
 
-      evaluation_class.call(
-        question_message:,
-        answer_message: answer.message,
-      )
+      evaluation_class.call(answer)
     end
 
   private
diff --git a/spec/lib/auto_evaluation/answer_relevancy_spec.rb b/spec/lib/auto_evaluation/answer_relevancy_spec.rb
index 6eeefb8aa..3375bb272 100644
--- a/spec/lib/auto_evaluation/answer_relevancy_spec.rb
+++ b/spec/lib/auto_evaluation/answer_relevancy_spec.rb
@@ -3,6 +3,8 @@
     let(:prompts) { AutoEvaluation::Prompts.config.answer_relevancy }
     let(:question_message) { "This is a test question message." }
     let(:answer_message) { "This is a test answer message." }
+    let(:question) { build(:question, message: question_message) }
+    let(:answer) { build(:answer, question:, message: answer_message) }
 
     let(:statements) { ["This is the first statement.", "This is the second statement."] }
     let(:statements_json) { { statements: }.to_json }
@@ -67,10 +69,7 @@
       allow(Clock).to receive(:monotonic_time)
                   .and_return(200.0, 202.0, 204.0, 206.0, 208.0, 210.0)
 
-      result = described_class.call(
-        question_message:,
-        answer_message:,
-      )
+      result = described_class.call(answer)
 
       expected_llm_responses = {
         statements: JSON.parse(statements_stub.response.body),
@@ -100,6 +99,16 @@
         )
     end
 
+    context "when the answer has a rephrased question" do
+      let(:question_message) { "This is a rephrased test question." }
+      let(:answer) { build(:answer, message: answer_message, rephrased_question: question_message) }
+
+      it "uses the rephrased question in the prompt" do
+        result = described_class.call(answer)
+        expect(result.reason).to eq(reason)
+      end
+    end
+
     context "when 'idk' verdicts are present" do
       let(:verdicts) do
         [
@@ -109,10 +118,7 @@
       end
 
       it "treats 'idk' verdicts as positive in the score" do
-        result = described_class.call(
-          question_message:,
-          answer_message:,
-        )
+        result = described_class.call(answer)
 
         expect(result.score).to eq(0.5)
       end
@@ -124,10 +130,7 @@
       it "returns a result object with the expected attributes" do
         allow(Clock).to receive(:monotonic_time).and_return(200.0, 202.0)
 
-        result = described_class.call(
-          question_message:,
-          answer_message:,
-        )
+        result = described_class.call(answer)
 
         expect(result)
           .to be_a(AutoEvaluation::ScoreResult)
@@ -148,10 +151,7 @@
         allow(Clock).to receive(:monotonic_time)
                     .and_return(200.0, 202.0, 204.0, 206.0)
 
-        result = described_class.call(
-          question_message:,
-          answer_message:,
-        )
+        result = described_class.call(answer)
 
         expect(result)
           .to be_a(AutoEvaluation::ScoreResult)
@@ -177,10 +177,7 @@
       it "returns a result object with the expected attributes" do
         allow(Clock).to receive(:monotonic_time).and_return(200.0, 202.0, 204.0, 206.0)
 
-        result = described_class.call(
-          question_message:,
-          answer_message:,
-        )
+        result = described_class.call(answer)
 
         expect(result)
           .to be_a(AutoEvaluation::ScoreResult)
diff --git a/spec/lib/auto_evaluation/coherence_spec.rb b/spec/lib/auto_evaluation/coherence_spec.rb
index 41df0907f..1f77a906d 100644
--- a/spec/lib/auto_evaluation/coherence_spec.rb
+++ b/spec/lib/auto_evaluation/coherence_spec.rb
@@ -1,6 +1,8 @@
 RSpec.describe AutoEvaluation::Coherence, :aws_credentials_stubbed do
   describe ".call" do
     let(:prompts) { AutoEvaluation::Prompts.config.coherence }
+    let(:question) { build(:question, message: question_message) }
+    let(:answer) { build(:answer, question:, message: answer_message) }
     let(:question_message) { "This is a test question message." }
     let(:answer_message) { "This is a test answer message." }
     let(:reason) { "This is the reason for the score." }
@@ -22,10 +24,7 @@
         response_json,
       )
 
-      result = described_class.call(
-        question_message:,
-        answer_message:,
-      )
+      result = described_class.call(answer)
 
       expected_metrics = {
         coherence: {
@@ -62,14 +61,28 @@
           response_json,
         )
 
-        result = described_class.call(
-          question_message:,
-          answer_message:,
-        )
+        result = described_class.call(answer)
 
         expect(result.score).to eq(expected_score)
         expect(result.success).to eq(expected_score >= described_class::THRESHOLD)
       end
     end
+
+    context "when the answer has a rephrased question" do
+      let(:question_message) { "This is a rephrased test question." }
+      let(:answer) { build(:answer, message: answer_message, rephrased_question: question_message) }
+
+      it "uses the rephrased question in the prompt" do
+        stub = bedrock_invoke_model_openai_oss_tool_call(
+          user_prompt,
+          tools,
+          response_json,
+        )
+
+        described_class.call(answer)
+
+        expect(stub).to have_been_requested
+      end
+    end
   end
 end
diff --git a/spec/lib/auto_evaluation/evaluate_answer_from_question_message_spec.rb b/spec/lib/auto_evaluation/evaluate_answer_from_question_message_spec.rb
index f90533271..a39217594 100644
--- a/spec/lib/auto_evaluation/evaluate_answer_from_question_message_spec.rb
+++ b/spec/lib/auto_evaluation/evaluate_answer_from_question_message_spec.rb
@@ -27,10 +27,7 @@
       question_message:,
     )
 
-    expect(evaluation_klass).to have_received(:call).with(
-      question_message:,
-      answer_message: answer.message,
-    )
+    expect(evaluation_klass).to have_received(:call).with(answer)
   end
 
   it "returns the AutoEvaluation::ScoreResult generated by the evaluation class" do