From 1d1f131f2aede12d321e54e51f3e4d66b8a75721 Mon Sep 17 00:00:00 2001 From: David Gisbey Date: Tue, 6 Jan 2026 13:15:54 +0000 Subject: [PATCH] Update metrics to take answer model as argument When I added the EvaluateAnswerFromQuestionMessage class, I made a faulty assumption that the metric inputs would be a question and answer message. However, the remaining two metrics take an answer message and used sources. This commit updates the AnswerRelevancy and Coherence metrics to take the answer model as an argument. The class then decides what it wants to do with it, allowing us more flexibility. I've updated the EvaluateAnswerFromQuestionMessage class to reflect this change by calling the evaluation class with the answer. --- lib/auto_evaluation/answer_relevancy.rb | 13 ++++--- lib/auto_evaluation/coherence.rb | 13 ++++--- .../evaluate_answer_from_question_message.rb | 5 +-- .../auto_evaluation/answer_relevancy_spec.rb | 37 +++++++++---------- spec/lib/auto_evaluation/coherence_spec.rb | 29 +++++++++++---- ...luate_answer_from_question_message_spec.rb | 5 +-- 6 files changed, 56 insertions(+), 46 deletions(-) diff --git a/lib/auto_evaluation/answer_relevancy.rb b/lib/auto_evaluation/answer_relevancy.rb index 31d99e8ec..d07d82d0a 100644 --- a/lib/auto_evaluation/answer_relevancy.rb +++ b/lib/auto_evaluation/answer_relevancy.rb @@ -3,15 +3,14 @@ class AutoEvaluation::AnswerRelevancy def self.call(...) = new(...).call - def initialize(question_message:, answer_message:) - @question_message = question_message - @answer_message = answer_message + def initialize(answer) + @answer = answer @llm_responses = {} @metrics = {} end def call - statements, llm_responses[:statements], metrics[:statements] = StatementGenerator.call(answer_message:) + statements, llm_responses[:statements], metrics[:statements] = StatementGenerator.call(answer_message: answer.message) if statements.empty? return build_maximum_score_result( @@ -58,9 +57,13 @@ def call private - attr_reader :question_message, :answer_message + attr_reader :answer attr_accessor :llm_responses, :metrics + def question_message + answer.rephrased_question || answer.question.message + end + def calculate_score(verdicts) verdict_count = verdicts.count return 1.0 if verdict_count.zero? diff --git a/lib/auto_evaluation/coherence.rb b/lib/auto_evaluation/coherence.rb index 22b2d9e81..fbac8ab96 100644 --- a/lib/auto_evaluation/coherence.rb +++ b/lib/auto_evaluation/coherence.rb @@ -4,9 +4,8 @@ class Coherence def self.call(...) = new(...).call - def initialize(question_message:, answer_message:) - @question_message = question_message - @answer_message = answer_message + def initialize(answer) + @answer = answer end def call @@ -24,7 +23,7 @@ def call private - attr_reader :question_message, :answer_message + attr_reader :answer def llm_prompts Prompts.config.coherence @@ -33,7 +32,7 @@ def llm_prompts def user_prompt sprintf( llm_prompts.fetch(:user_prompt), - answer: answer_message, + answer: answer.message, question: question_message, ) end @@ -48,5 +47,9 @@ def normalise_rubric_score(rubric_score) (rubric_score.to_d - min_rubric_score) / (max_rubric_score - min_rubric_score) end + + def question_message + answer.rephrased_question || answer.question.message + end end end diff --git a/lib/auto_evaluation/evaluate_answer_from_question_message.rb b/lib/auto_evaluation/evaluate_answer_from_question_message.rb index 0db613faf..3516c3bd9 100644 --- a/lib/auto_evaluation/evaluate_answer_from_question_message.rb +++ b/lib/auto_evaluation/evaluate_answer_from_question_message.rb @@ -22,10 +22,7 @@ def call raise TaskFailedError, error_message end - evaluation_class.call( - question_message:, - answer_message: answer.message, - ) + evaluation_class.call(answer) end private diff --git a/spec/lib/auto_evaluation/answer_relevancy_spec.rb b/spec/lib/auto_evaluation/answer_relevancy_spec.rb index 6eeefb8aa..3375bb272 100644 --- a/spec/lib/auto_evaluation/answer_relevancy_spec.rb +++ b/spec/lib/auto_evaluation/answer_relevancy_spec.rb @@ -3,6 +3,8 @@ let(:prompts) { AutoEvaluation::Prompts.config.answer_relevancy } let(:question_message) { "This is a test question message." } let(:answer_message) { "This is a test answer message." } + let(:question) { build(:question, message: question_message) } + let(:answer) { build(:answer, question:, message: answer_message) } let(:statements) { ["This is the first statement.", "This is the second statement."] } let(:statements_json) { { statements: }.to_json } @@ -67,10 +69,7 @@ allow(Clock).to receive(:monotonic_time) .and_return(200.0, 202.0, 204.0, 206.0, 208.0, 210.0) - result = described_class.call( - question_message:, - answer_message:, - ) + result = described_class.call(answer) expected_llm_responses = { statements: JSON.parse(statements_stub.response.body), @@ -100,6 +99,16 @@ ) end + context "when the answer has a rephrased question" do + let(:question_message) { "This is a rephrased test question." } + let(:answer) { build(:answer, message: answer_message, rephrased_question: question_message) } + + it "uses the rephrased question in the prompt" do + result = described_class.call(answer) + expect(result.reason).to eq(reason) + end + end + context "when 'idk' verdicts are present" do let(:verdicts) do [ @@ -109,10 +118,7 @@ end it "treats 'idk' verdicts as positive in the score" do - result = described_class.call( - question_message:, - answer_message:, - ) + result = described_class.call(answer) expect(result.score).to eq(0.5) end @@ -124,10 +130,7 @@ it "returns a result object with the expected attributes" do allow(Clock).to receive(:monotonic_time).and_return(200.0, 202.0) - result = described_class.call( - question_message:, - answer_message:, - ) + result = described_class.call(answer) expect(result) .to be_a(AutoEvaluation::ScoreResult) @@ -148,10 +151,7 @@ allow(Clock).to receive(:monotonic_time) .and_return(200.0, 202.0, 204.0, 206.0) - result = described_class.call( - question_message:, - answer_message:, - ) + result = described_class.call(answer) expect(result) .to be_a(AutoEvaluation::ScoreResult) @@ -177,10 +177,7 @@ it "returns a result object with the expected attributes" do allow(Clock).to receive(:monotonic_time).and_return(200.0, 202.0, 204.0, 206.0) - result = described_class.call( - question_message:, - answer_message:, - ) + result = described_class.call(answer) expect(result) .to be_a(AutoEvaluation::ScoreResult) diff --git a/spec/lib/auto_evaluation/coherence_spec.rb b/spec/lib/auto_evaluation/coherence_spec.rb index 41df0907f..1f77a906d 100644 --- a/spec/lib/auto_evaluation/coherence_spec.rb +++ b/spec/lib/auto_evaluation/coherence_spec.rb @@ -1,6 +1,8 @@ RSpec.describe AutoEvaluation::Coherence, :aws_credentials_stubbed do describe ".call" do let(:prompts) { AutoEvaluation::Prompts.config.coherence } + let(:question) { build(:question, message: question_message) } + let(:answer) { build(:answer, question:, message: answer_message) } let(:question_message) { "This is a test question message." } let(:answer_message) { "This is a test answer message." } let(:reason) { "This is the reason for the score." } @@ -22,10 +24,7 @@ response_json, ) - result = described_class.call( - question_message:, - answer_message:, - ) + result = described_class.call(answer) expected_metrics = { coherence: { @@ -62,14 +61,28 @@ response_json, ) - result = described_class.call( - question_message:, - answer_message:, - ) + result = described_class.call(answer) expect(result.score).to eq(expected_score) expect(result.success).to eq(expected_score >= described_class::THRESHOLD) end end + + context "when the answer has a rephrased question" do + let(:question_message) { "This is a rephrased test question." } + let(:answer) { build(:answer, message: answer_message, rephrased_question: question_message) } + + it "uses the rephrased question in the prompt" do + stub = bedrock_invoke_model_openai_oss_tool_call( + user_prompt, + tools, + response_json, + ) + + described_class.call(answer) + + expect(stub).to have_been_requested + end + end end end diff --git a/spec/lib/auto_evaluation/evaluate_answer_from_question_message_spec.rb b/spec/lib/auto_evaluation/evaluate_answer_from_question_message_spec.rb index f90533271..a39217594 100644 --- a/spec/lib/auto_evaluation/evaluate_answer_from_question_message_spec.rb +++ b/spec/lib/auto_evaluation/evaluate_answer_from_question_message_spec.rb @@ -27,10 +27,7 @@ question_message:, ) - expect(evaluation_klass).to have_received(:call).with( - question_message:, - answer_message: answer.message, - ) + expect(evaluation_klass).to have_received(:call).with(answer) end it "returns the AutoEvaluation::ScoreResult generated by the evaluation class" do