Skip to content

Commit 1434d83

Browse files
authored
Merge pull request #749 from alphagov/update-metrics-to-take-answer-model
Update metrics to take answer model as argument
2 parents f46d3e6 + 1d1f131 commit 1434d83

6 files changed

Lines changed: 56 additions & 46 deletions

File tree

lib/auto_evaluation/answer_relevancy.rb

Lines changed: 8 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -3,15 +3,14 @@ class AutoEvaluation::AnswerRelevancy
33

44
def self.call(...) = new(...).call
55

6-
def initialize(question_message:, answer_message:)
7-
@question_message = question_message
8-
@answer_message = answer_message
6+
def initialize(answer)
7+
@answer = answer
98
@llm_responses = {}
109
@metrics = {}
1110
end
1211

1312
def call
14-
statements, llm_responses[:statements], metrics[:statements] = StatementGenerator.call(answer_message:)
13+
statements, llm_responses[:statements], metrics[:statements] = StatementGenerator.call(answer_message: answer.message)
1514

1615
if statements.empty?
1716
return build_maximum_score_result(
@@ -58,9 +57,13 @@ def call
5857

5958
private
6059

61-
attr_reader :question_message, :answer_message
60+
attr_reader :answer
6261
attr_accessor :llm_responses, :metrics
6362

63+
def question_message
64+
answer.rephrased_question || answer.question.message
65+
end
66+
6467
def calculate_score(verdicts)
6568
verdict_count = verdicts.count
6669
return 1.0 if verdict_count.zero?

lib/auto_evaluation/coherence.rb

Lines changed: 8 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -4,9 +4,8 @@ class Coherence
44

55
def self.call(...) = new(...).call
66

7-
def initialize(question_message:, answer_message:)
8-
@question_message = question_message
9-
@answer_message = answer_message
7+
def initialize(answer)
8+
@answer = answer
109
end
1110

1211
def call
@@ -24,7 +23,7 @@ def call
2423

2524
private
2625

27-
attr_reader :question_message, :answer_message
26+
attr_reader :answer
2827

2928
def llm_prompts
3029
Prompts.config.coherence
@@ -33,7 +32,7 @@ def llm_prompts
3332
def user_prompt
3433
sprintf(
3534
llm_prompts.fetch(:user_prompt),
36-
answer: answer_message,
35+
answer: answer.message,
3736
question: question_message,
3837
)
3938
end
@@ -48,5 +47,9 @@ def normalise_rubric_score(rubric_score)
4847

4948
(rubric_score.to_d - min_rubric_score) / (max_rubric_score - min_rubric_score)
5049
end
50+
51+
def question_message
52+
answer.rephrased_question || answer.question.message
53+
end
5154
end
5255
end

lib/auto_evaluation/evaluate_answer_from_question_message.rb

Lines changed: 1 addition & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -22,10 +22,7 @@ def call
2222
raise TaskFailedError, error_message
2323
end
2424

25-
evaluation_class.call(
26-
question_message:,
27-
answer_message: answer.message,
28-
)
25+
evaluation_class.call(answer)
2926
end
3027

3128
private

spec/lib/auto_evaluation/answer_relevancy_spec.rb

Lines changed: 17 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,8 @@
33
let(:prompts) { AutoEvaluation::Prompts.config.answer_relevancy }
44
let(:question_message) { "This is a test question message." }
55
let(:answer_message) { "This is a test answer message." }
6+
let(:question) { build(:question, message: question_message) }
7+
let(:answer) { build(:answer, question:, message: answer_message) }
68

79
let(:statements) { ["This is the first statement.", "This is the second statement."] }
810
let(:statements_json) { { statements: }.to_json }
@@ -67,10 +69,7 @@
6769
allow(Clock).to receive(:monotonic_time)
6870
.and_return(200.0, 202.0, 204.0, 206.0, 208.0, 210.0)
6971

70-
result = described_class.call(
71-
question_message:,
72-
answer_message:,
73-
)
72+
result = described_class.call(answer)
7473

7574
expected_llm_responses = {
7675
statements: JSON.parse(statements_stub.response.body),
@@ -100,6 +99,16 @@
10099
)
101100
end
102101

102+
context "when the answer has a rephrased question" do
103+
let(:question_message) { "This is a rephrased test question." }
104+
let(:answer) { build(:answer, message: answer_message, rephrased_question: question_message) }
105+
106+
it "uses the rephrased question in the prompt" do
107+
result = described_class.call(answer)
108+
expect(result.reason).to eq(reason)
109+
end
110+
end
111+
103112
context "when 'idk' verdicts are present" do
104113
let(:verdicts) do
105114
[
@@ -109,10 +118,7 @@
109118
end
110119

111120
it "treats 'idk' verdicts as positive in the score" do
112-
result = described_class.call(
113-
question_message:,
114-
answer_message:,
115-
)
121+
result = described_class.call(answer)
116122

117123
expect(result.score).to eq(0.5)
118124
end
@@ -124,10 +130,7 @@
124130
it "returns a result object with the expected attributes" do
125131
allow(Clock).to receive(:monotonic_time).and_return(200.0, 202.0)
126132

127-
result = described_class.call(
128-
question_message:,
129-
answer_message:,
130-
)
133+
result = described_class.call(answer)
131134

132135
expect(result)
133136
.to be_a(AutoEvaluation::ScoreResult)
@@ -148,10 +151,7 @@
148151
allow(Clock).to receive(:monotonic_time)
149152
.and_return(200.0, 202.0, 204.0, 206.0)
150153

151-
result = described_class.call(
152-
question_message:,
153-
answer_message:,
154-
)
154+
result = described_class.call(answer)
155155

156156
expect(result)
157157
.to be_a(AutoEvaluation::ScoreResult)
@@ -177,10 +177,7 @@
177177
it "returns a result object with the expected attributes" do
178178
allow(Clock).to receive(:monotonic_time).and_return(200.0, 202.0, 204.0, 206.0)
179179

180-
result = described_class.call(
181-
question_message:,
182-
answer_message:,
183-
)
180+
result = described_class.call(answer)
184181

185182
expect(result)
186183
.to be_a(AutoEvaluation::ScoreResult)

spec/lib/auto_evaluation/coherence_spec.rb

Lines changed: 21 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,8 @@
11
RSpec.describe AutoEvaluation::Coherence, :aws_credentials_stubbed do
22
describe ".call" do
33
let(:prompts) { AutoEvaluation::Prompts.config.coherence }
4+
let(:question) { build(:question, message: question_message) }
5+
let(:answer) { build(:answer, question:, message: answer_message) }
46
let(:question_message) { "This is a test question message." }
57
let(:answer_message) { "This is a test answer message." }
68
let(:reason) { "This is the reason for the score." }
@@ -22,10 +24,7 @@
2224
response_json,
2325
)
2426

25-
result = described_class.call(
26-
question_message:,
27-
answer_message:,
28-
)
27+
result = described_class.call(answer)
2928

3029
expected_metrics = {
3130
coherence: {
@@ -62,14 +61,28 @@
6261
response_json,
6362
)
6463

65-
result = described_class.call(
66-
question_message:,
67-
answer_message:,
68-
)
64+
result = described_class.call(answer)
6965

7066
expect(result.score).to eq(expected_score)
7167
expect(result.success).to eq(expected_score >= described_class::THRESHOLD)
7268
end
7369
end
70+
71+
context "when the answer has a rephrased question" do
72+
let(:question_message) { "This is a rephrased test question." }
73+
let(:answer) { build(:answer, message: answer_message, rephrased_question: question_message) }
74+
75+
it "uses the rephrased question in the prompt" do
76+
stub = bedrock_invoke_model_openai_oss_tool_call(
77+
user_prompt,
78+
tools,
79+
response_json,
80+
)
81+
82+
described_class.call(answer)
83+
84+
expect(stub).to have_been_requested
85+
end
86+
end
7487
end
7588
end

spec/lib/auto_evaluation/evaluate_answer_from_question_message_spec.rb

Lines changed: 1 addition & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -27,10 +27,7 @@
2727
question_message:,
2828
)
2929

30-
expect(evaluation_klass).to have_received(:call).with(
31-
question_message:,
32-
answer_message: answer.message,
33-
)
30+
expect(evaluation_klass).to have_received(:call).with(answer)
3431
end
3532

3633
it "returns the AutoEvaluation::ScoreResult generated by the evaluation class" do

0 commit comments

Comments
 (0)