Update metrics to take answer model as argument

davidgisbey · davidgisbey · commit 1d1f131f2aed · 2026-01-06T13:21:29.000Z
When I added the EvaluateAnswerFromQuestionMessage class, I wrongly assumed
that every metric's inputs would be a question message and an answer message.

However, the remaining two metrics take an answer message and the answer's
used sources. This commit updates the AnswerRelevancy and Coherence metrics to
take the answer model as an argument. Each metric class then decides what it
needs from the answer, giving us more flexibility.

I've updated the EvaluateAnswerFromQuestionMessage class to reflect this
change by calling the evaluation class with the answer.
diff --git a/lib/auto_evaluation/answer_relevancy.rb b/lib/auto_evaluation/answer_relevancy.rb
@@ -3,15 +3,14 @@ class AutoEvaluation::AnswerRelevancy
 
   def self.call(...) = new(...).call
 
-  def initialize(question_message:, answer_message:)
-    @question_message = question_message
-    @answer_message = answer_message
+  def initialize(answer)
+    @answer = answer
     @llm_responses = {}
     @metrics = {}
   end
 
   def call
-    statements, llm_responses[:statements], metrics[:statements] = StatementGenerator.call(answer_message:)
+    statements, llm_responses[:statements], metrics[:statements] = StatementGenerator.call(answer_message: answer.message)
 
     if statements.empty?
       return build_maximum_score_result(
@@ -58,9 +57,13 @@ def call
 
 private
 
-  attr_reader :question_message, :answer_message
+  attr_reader :answer
   attr_accessor :llm_responses, :metrics
 
+  def question_message
+    answer.rephrased_question || answer.question.message
+  end
+
   def calculate_score(verdicts)
     verdict_count = verdicts.count
     return 1.0 if verdict_count.zero?
diff --git a/lib/auto_evaluation/coherence.rb b/lib/auto_evaluation/coherence.rb
@@ -4,9 +4,8 @@ class Coherence
 
     def self.call(...) = new(...).call
 
-    def initialize(question_message:, answer_message:)
-      @question_message = question_message
-      @answer_message = answer_message
+    def initialize(answer)
+      @answer = answer
     end
 
     def call
@@ -24,7 +23,7 @@ def call
 
   private
 
-    attr_reader :question_message, :answer_message
+    attr_reader :answer
 
     def llm_prompts
       Prompts.config.coherence
@@ -33,7 +32,7 @@ def llm_prompts
     def user_prompt
       sprintf(
         llm_prompts.fetch(:user_prompt),
-        answer: answer_message,
+        answer: answer.message,
         question: question_message,
       )
     end
@@ -48,5 +47,9 @@ def normalise_rubric_score(rubric_score)
 
       (rubric_score.to_d - min_rubric_score) / (max_rubric_score - min_rubric_score)
     end
+
+    def question_message
+      answer.rephrased_question || answer.question.message
+    end
   end
 end
diff --git a/lib/auto_evaluation/evaluate_answer_from_question_message.rb b/lib/auto_evaluation/evaluate_answer_from_question_message.rb
@@ -22,10 +22,7 @@ def call
         raise TaskFailedError, error_message
       end
 
-      evaluation_class.call(
-        question_message:,
-        answer_message: answer.message,
-      )
+      evaluation_class.call(answer)
     end
 
   private
diff --git a/spec/lib/auto_evaluation/answer_relevancy_spec.rb b/spec/lib/auto_evaluation/answer_relevancy_spec.rb
@@ -3,6 +3,8 @@
     let(:prompts) { AutoEvaluation::Prompts.config.answer_relevancy }
     let(:question_message) { "This is a test question message." }
     let(:answer_message) { "This is a test answer message." }
+    let(:question) { build(:question, message: question_message) }
+    let(:answer) { build(:answer, question:, message: answer_message) }
 
     let(:statements) { ["This is the first statement.", "This is the second statement."] }
     let(:statements_json) { { statements: }.to_json }
@@ -67,10 +69,7 @@
       allow(Clock).to receive(:monotonic_time)
                   .and_return(200.0, 202.0, 204.0, 206.0, 208.0, 210.0)
 
-      result = described_class.call(
-        question_message:,
-        answer_message:,
-      )
+      result = described_class.call(answer)
 
       expected_llm_responses = {
         statements: JSON.parse(statements_stub.response.body),
@@ -100,6 +99,16 @@
         )
     end
 
+    context "when the answer has a rephrased question" do
+      let(:question_message) { "This is a rephrased test question." }
+      let(:answer) { build(:answer, message: answer_message, rephrased_question: question_message) }
+
+      it "uses the rephrased question in the prompt" do
+        result = described_class.call(answer)
+        expect(result.reason).to eq(reason)
+      end
+    end
+
     context "when 'idk' verdicts are present" do
       let(:verdicts) do
         [
@@ -109,10 +118,7 @@
       end
 
       it "treats 'idk' verdicts as positive in the score" do
-        result = described_class.call(
-          question_message:,
-          answer_message:,
-        )
+        result = described_class.call(answer)
 
         expect(result.score).to eq(0.5)
       end
@@ -124,10 +130,7 @@
       it "returns a result object with the expected attributes" do
         allow(Clock).to receive(:monotonic_time).and_return(200.0, 202.0)
 
-        result = described_class.call(
-          question_message:,
-          answer_message:,
-        )
+        result = described_class.call(answer)
 
         expect(result)
           .to be_a(AutoEvaluation::ScoreResult)
@@ -148,10 +151,7 @@
         allow(Clock).to receive(:monotonic_time)
                     .and_return(200.0, 202.0, 204.0, 206.0)
 
-        result = described_class.call(
-          question_message:,
-          answer_message:,
-        )
+        result = described_class.call(answer)
 
         expect(result)
           .to be_a(AutoEvaluation::ScoreResult)
@@ -177,10 +177,7 @@
       it "returns a result object with the expected attributes" do
         allow(Clock).to receive(:monotonic_time).and_return(200.0, 202.0, 204.0, 206.0)
 
-        result = described_class.call(
-          question_message:,
-          answer_message:,
-        )
+        result = described_class.call(answer)
 
         expect(result)
           .to be_a(AutoEvaluation::ScoreResult)
diff --git a/spec/lib/auto_evaluation/coherence_spec.rb b/spec/lib/auto_evaluation/coherence_spec.rb
@@ -1,6 +1,8 @@
 RSpec.describe AutoEvaluation::Coherence, :aws_credentials_stubbed do
   describe ".call" do
     let(:prompts) { AutoEvaluation::Prompts.config.coherence }
+    let(:question) { build(:question, message: question_message) }
+    let(:answer) { build(:answer,  question:, message: answer_message) }
     let(:question_message) { "This is a test question message." }
     let(:answer_message) { "This is a test answer message." }
     let(:reason) { "This is the reason for the score." }
@@ -22,10 +24,7 @@
         response_json,
       )
 
-      result = described_class.call(
-        question_message:,
-        answer_message:,
-      )
+      result = described_class.call(answer)
 
       expected_metrics = {
         coherence: {
@@ -62,14 +61,28 @@
           response_json,
         )
 
-        result = described_class.call(
-          question_message:,
-          answer_message:,
-        )
+        result = described_class.call(answer)
 
         expect(result.score).to eq(expected_score)
         expect(result.success).to eq(expected_score >= described_class::THRESHOLD)
       end
     end
+
+    context "when the answer has a rephrased question" do
+      let(:question_message) { "This is a rephrased test question." }
+      let(:answer) { build(:answer, message: answer_message, rephrased_question: question_message) }
+
+      it "uses the rephrased question in the prompt" do
+        stub = bedrock_invoke_model_openai_oss_tool_call(
+          user_prompt,
+          tools,
+          response_json,
+        )
+
+        described_class.call(answer)
+
+        expect(stub).to have_been_requested
+      end
+    end
   end
 end
diff --git a/spec/lib/auto_evaluation/evaluate_answer_from_question_message_spec.rb b/spec/lib/auto_evaluation/evaluate_answer_from_question_message_spec.rb
@@ -27,10 +27,7 @@
       question_message:,
     )
 
-    expect(evaluation_klass).to have_received(:call).with(
-      question_message:,
-      answer_message: answer.message,
-    )
+    expect(evaluation_klass).to have_received(:call).with(answer)
   end
 
   it "returns the AutoEvaluation::ScoreResult generated by the evaluation class" do