Skip to content

Commit 1d1f131

Browse files
committed
Update metrics to take answer model as argument
When I added the EvaluateAnswerFromQuestionMessage class, I made the faulty assumption that every metric would take a question message and an answer message as input. However, the remaining two metrics take an answer message and the sources used. This commit updates the AnswerRelevancy and Coherence metrics to take the answer model as an argument. Each metric class then decides what it needs from the answer, which gives us more flexibility. I've updated the EvaluateAnswerFromQuestionMessage class to reflect this change by calling the evaluation class with the answer.
1 parent c863cfa commit 1d1f131

6 files changed

Lines changed: 56 additions & 46 deletions

File tree

lib/auto_evaluation/answer_relevancy.rb

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,15 +3,14 @@ class AutoEvaluation::AnswerRelevancy
33

44
def self.call(...) = new(...).call
55

6-
def initialize(question_message:, answer_message:)
7-
@question_message = question_message
8-
@answer_message = answer_message
6+
def initialize(answer)
7+
@answer = answer
98
@llm_responses = {}
109
@metrics = {}
1110
end
1211

1312
def call
14-
statements, llm_responses[:statements], metrics[:statements] = StatementGenerator.call(answer_message:)
13+
statements, llm_responses[:statements], metrics[:statements] = StatementGenerator.call(answer_message: answer.message)
1514

1615
if statements.empty?
1716
return build_maximum_score_result(
@@ -58,9 +57,13 @@ def call
5857

5958
private
6059

61-
attr_reader :question_message, :answer_message
60+
attr_reader :answer
6261
attr_accessor :llm_responses, :metrics
6362

63+
def question_message
64+
answer.rephrased_question || answer.question.message
65+
end
66+
6467
def calculate_score(verdicts)
6568
verdict_count = verdicts.count
6669
return 1.0 if verdict_count.zero?

lib/auto_evaluation/coherence.rb

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,8 @@ class Coherence
44

55
def self.call(...) = new(...).call
66

7-
def initialize(question_message:, answer_message:)
8-
@question_message = question_message
9-
@answer_message = answer_message
7+
def initialize(answer)
8+
@answer = answer
109
end
1110

1211
def call
@@ -24,7 +23,7 @@ def call
2423

2524
private
2625

27-
attr_reader :question_message, :answer_message
26+
attr_reader :answer
2827

2928
def llm_prompts
3029
Prompts.config.coherence
@@ -33,7 +32,7 @@ def llm_prompts
3332
def user_prompt
3433
sprintf(
3534
llm_prompts.fetch(:user_prompt),
36-
answer: answer_message,
35+
answer: answer.message,
3736
question: question_message,
3837
)
3938
end
@@ -48,5 +47,9 @@ def normalise_rubric_score(rubric_score)
4847

4948
(rubric_score.to_d - min_rubric_score) / (max_rubric_score - min_rubric_score)
5049
end
50+
51+
def question_message
52+
answer.rephrased_question || answer.question.message
53+
end
5154
end
5255
end

lib/auto_evaluation/evaluate_answer_from_question_message.rb

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,10 +22,7 @@ def call
2222
raise TaskFailedError, error_message
2323
end
2424

25-
evaluation_class.call(
26-
question_message:,
27-
answer_message: answer.message,
28-
)
25+
evaluation_class.call(answer)
2926
end
3027

3128
private

spec/lib/auto_evaluation/answer_relevancy_spec.rb

Lines changed: 17 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
let(:prompts) { AutoEvaluation::Prompts.config.answer_relevancy }
44
let(:question_message) { "This is a test question message." }
55
let(:answer_message) { "This is a test answer message." }
6+
let(:question) { build(:question, message: question_message) }
7+
let(:answer) { build(:answer, question:, message: answer_message) }
68

79
let(:statements) { ["This is the first statement.", "This is the second statement."] }
810
let(:statements_json) { { statements: }.to_json }
@@ -67,10 +69,7 @@
6769
allow(Clock).to receive(:monotonic_time)
6870
.and_return(200.0, 202.0, 204.0, 206.0, 208.0, 210.0)
6971

70-
result = described_class.call(
71-
question_message:,
72-
answer_message:,
73-
)
72+
result = described_class.call(answer)
7473

7574
expected_llm_responses = {
7675
statements: JSON.parse(statements_stub.response.body),
@@ -100,6 +99,16 @@
10099
)
101100
end
102101

102+
context "when the answer has a rephrased question" do
103+
let(:question_message) { "This is a rephrased test question." }
104+
let(:answer) { build(:answer, message: answer_message, rephrased_question: question_message) }
105+
106+
it "uses the rephrased question in the prompt" do
107+
result = described_class.call(answer)
108+
expect(result.reason).to eq(reason)
109+
end
110+
end
111+
103112
context "when 'idk' verdicts are present" do
104113
let(:verdicts) do
105114
[
@@ -109,10 +118,7 @@
109118
end
110119

111120
it "treats 'idk' verdicts as positive in the score" do
112-
result = described_class.call(
113-
question_message:,
114-
answer_message:,
115-
)
121+
result = described_class.call(answer)
116122

117123
expect(result.score).to eq(0.5)
118124
end
@@ -124,10 +130,7 @@
124130
it "returns a result object with the expected attributes" do
125131
allow(Clock).to receive(:monotonic_time).and_return(200.0, 202.0)
126132

127-
result = described_class.call(
128-
question_message:,
129-
answer_message:,
130-
)
133+
result = described_class.call(answer)
131134

132135
expect(result)
133136
.to be_a(AutoEvaluation::ScoreResult)
@@ -148,10 +151,7 @@
148151
allow(Clock).to receive(:monotonic_time)
149152
.and_return(200.0, 202.0, 204.0, 206.0)
150153

151-
result = described_class.call(
152-
question_message:,
153-
answer_message:,
154-
)
154+
result = described_class.call(answer)
155155

156156
expect(result)
157157
.to be_a(AutoEvaluation::ScoreResult)
@@ -177,10 +177,7 @@
177177
it "returns a result object with the expected attributes" do
178178
allow(Clock).to receive(:monotonic_time).and_return(200.0, 202.0, 204.0, 206.0)
179179

180-
result = described_class.call(
181-
question_message:,
182-
answer_message:,
183-
)
180+
result = described_class.call(answer)
184181

185182
expect(result)
186183
.to be_a(AutoEvaluation::ScoreResult)

spec/lib/auto_evaluation/coherence_spec.rb

Lines changed: 21 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
RSpec.describe AutoEvaluation::Coherence, :aws_credentials_stubbed do
22
describe ".call" do
33
let(:prompts) { AutoEvaluation::Prompts.config.coherence }
4+
let(:question) { build(:question, message: question_message) }
5+
let(:answer) { build(:answer, question:, message: answer_message) }
46
let(:question_message) { "This is a test question message." }
57
let(:answer_message) { "This is a test answer message." }
68
let(:reason) { "This is the reason for the score." }
@@ -22,10 +24,7 @@
2224
response_json,
2325
)
2426

25-
result = described_class.call(
26-
question_message:,
27-
answer_message:,
28-
)
27+
result = described_class.call(answer)
2928

3029
expected_metrics = {
3130
coherence: {
@@ -62,14 +61,28 @@
6261
response_json,
6362
)
6463

65-
result = described_class.call(
66-
question_message:,
67-
answer_message:,
68-
)
64+
result = described_class.call(answer)
6965

7066
expect(result.score).to eq(expected_score)
7167
expect(result.success).to eq(expected_score >= described_class::THRESHOLD)
7268
end
7369
end
70+
71+
context "when the answer has a rephrased question" do
72+
let(:question_message) { "This is a rephrased test question." }
73+
let(:answer) { build(:answer, message: answer_message, rephrased_question: question_message) }
74+
75+
it "uses the rephrased question in the prompt" do
76+
stub = bedrock_invoke_model_openai_oss_tool_call(
77+
user_prompt,
78+
tools,
79+
response_json,
80+
)
81+
82+
described_class.call(answer)
83+
84+
expect(stub).to have_been_requested
85+
end
86+
end
7487
end
7588
end

spec/lib/auto_evaluation/evaluate_answer_from_question_message_spec.rb

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,10 +27,7 @@
2727
question_message:,
2828
)
2929

30-
expect(evaluation_klass).to have_received(:call).with(
31-
question_message:,
32-
answer_message: answer.message,
33-
)
30+
expect(evaluation_klass).to have_received(:call).with(answer)
3431
end
3532

3633
it "returns the AutoEvaluation::ScoreResult generated by the evaluation class" do

0 commit comments

Comments
 (0)