Skip to content

Commit e4988f2

Browse files
committed
Add AutoEvaluation::GenerateMetricResult
We're adding a few metrics. Each of these requires a basic rake task that can be called with an INPUT environment variable. The task will generate a ScoreResult for the given metric and print it as JSON.

This adds the AutoEvaluation::GenerateMetricResult class, which:
- takes a question_message and a metric class as arguments,
- generates an answer using the question_message,
- calls the metric class with the question and answer to get a ScoreResult,
- returns the result as JSON.

If the generated answer has an error status, it logs a warning and returns the error message instead. This does mean that we no longer abort the rake tasks with a message on STDERR and instead just print the error message to STDOUT, but I think that's probably fine. Happy to discuss if people disagree though.
1 parent 3ce120d commit e4988f2

4 files changed

Lines changed: 126 additions & 88 deletions

File tree

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
module AutoEvaluation
  # Generates an answer for a question and scores it with a single
  # auto-evaluation metric.
  #
  # The answer is produced by running the search + structured-answer pipeline
  # for the given question message. The metric class is then called with the
  # question message and the answer message, and its result is serialised to
  # JSON.
  class GenerateMetricResult
    # Convenience entry point: forwards all arguments to #initialize and
    # immediately runs #call.
    def self.call(...) = new(...).call

    # @param question_message [String] the user question to answer and score.
    # @param metric_class [#call, nil] metric to run; called with
    #   +question_message:+ and +answer_message:+ keywords.
    # @param metric_name [String, Symbol, nil] snake_case name of a metric
    #   class inside the AutoEvaluation namespace (e.g. "answer_relevancy").
    #   Only consulted when +metric_class+ is not given.
    # @raise [ArgumentError] when neither metric_class nor metric_name is given.
    # @raise [NameError] when metric_name does not match a constant in
    #   AutoEvaluation.
    def initialize(question_message:, metric_class: nil, metric_name: nil)
      @question_message = question_message
      @metric_class = metric_class || metric_class_for(metric_name)
    end

    # @return [String] the metric result as JSON, or the answer's error
    #   message when answer generation ended in an error status (a warning is
    #   logged in that case and the metric is not run).
    def call
      # Build the question from the message we were given rather than from the
      # process environment, so the answer always matches the scored question.
      question = Question.new(message: question_message, conversation: Conversation.new)
      answer = AnswerComposition::PipelineRunner.call(question:, pipeline: [
        AnswerComposition::Pipeline::SearchResultFetcher,
        AnswerComposition::Pipeline::Claude::StructuredAnswerComposer,
      ])

      # to_s so the prefix check works whether status is a String, a Symbol
      # or nil.
      if answer.status.to_s.start_with?("error")
        logger.warn "Warning: answer has an error status: #{answer.status}"
        return answer.error_message
      end

      metric_class
        .call(question_message:, answer_message: answer.message)
        .to_json
    end

    # Logger used for warnings; resolved on each call so a reconfigured
    # Rails.logger is always picked up.
    def logger = Rails.logger

    private

    attr_reader :metric_class, :question_message

    # Resolves a snake_case metric name to the matching class in this
    # namespace, e.g. "answer_relevancy" => AutoEvaluation::AnswerRelevancy.
    def metric_class_for(metric_name)
      name = metric_name.to_s
      raise ArgumentError, "Requires a metric_class or a metric_name" if name.empty?

      # inherit = false: only accept constants defined under AutoEvaluation.
      AutoEvaluation.const_get(name.split("_").map(&:capitalize).join, false)
    end
  end
end

lib/tasks/evaluation.rake

Lines changed: 10 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -177,47 +177,23 @@ namespace :evaluation do
177177
task generate_answer_relevancy_evaluation: :environment do
178178
raise "Requires an INPUT env var" if ENV["INPUT"].blank?
179179

180-
question = Question.new(message: ENV["INPUT"], conversation: Conversation.new)
181-
182-
answer = AnswerComposition::PipelineRunner.call(question:, pipeline: [
183-
AnswerComposition::Pipeline::SearchResultFetcher,
184-
AnswerComposition::Pipeline::Claude::StructuredAnswerComposer,
185-
])
186-
187-
if answer.status =~ /^error/
188-
warn "Warning: answer has an error status: #{answer.status}"
189-
abort(answer.error_message)
190-
end
191-
192-
result = AutoEvaluation::AnswerRelevancy.call(
193-
question_message: answer.rephrased_question || question.message,
194-
answer_message: answer.message,
180+
puts(
181+
AutoEvaluation::GenerateMetricResult.call(
182+
metric_name: "answer_relevancy",
183+
question_message: ENV["INPUT"],
184+
)
195185
)
196-
197-
puts(result.to_json)
198186
end
199187

200188
desc "Run answer coherence evaluation for a user input"
201189
task generate_coherence_evaluation: :environment do
202190
raise "Requires an INPUT env var" if ENV["INPUT"].blank?
203191

204-
question = Question.new(message: ENV["INPUT"], conversation: Conversation.new)
205-
206-
answer = AnswerComposition::PipelineRunner.call(question:, pipeline: [
207-
AnswerComposition::Pipeline::SearchResultFetcher,
208-
AnswerComposition::Pipeline::Claude::StructuredAnswerComposer,
209-
])
210-
211-
if answer.status =~ /^error/
212-
warn "Warning: answer has an error status: #{answer.status}"
213-
abort(answer.error_message)
214-
end
215-
216-
result = AutoEvaluation::Coherence.call(
217-
question_message: answer.rephrased_question || question.message,
218-
answer_message: answer.message,
192+
puts(
193+
AutoEvaluation::GenerateMetricResult.call(
194+
metric_name: "coherence",
195+
question_message: ENV["INPUT"],
196+
)
219197
)
220-
221-
puts(result.to_json)
222198
end
223199
end
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
# Specs for AutoEvaluation::GenerateMetricResult. Answer generation is stubbed
# at AnswerComposition::PipelineRunner, and each metric class is stubbed to
# return a fixed ScoreResult.
RSpec.describe AutoEvaluation::GenerateMetricResult, :aws_credentials_stubbed do
  let(:answer) { build(:answer) }
  let(:question_message) { "What is the capital of France?" }
  let(:evaluation_result) do
    AutoEvaluation::ScoreResult.new(
      score: 0.85,
      reason: "The answer is mostly correct but misses some details.",
      success: true,
      llm_responses: [],
      metrics: {},
    )
  end

  before do
    # Return the `answer` under test when the expected pipeline is run.
    allow(AnswerComposition::PipelineRunner)
      .to receive(:call)
      .with(
        question: an_instance_of(Question),
        pipeline: [
          AnswerComposition::Pipeline::SearchResultFetcher,
          AnswerComposition::Pipeline::Claude::StructuredAnswerComposer,
        ],
      )
      .and_return(answer)
  end

  # The same behaviour is expected for every supported metric class.
  [
    AutoEvaluation::AnswerRelevancy,
    AutoEvaluation::Coherence,
  ].each do |metric_class|
    context "when passed the #{metric_class} metric class" do
      before do
        allow(metric_class)
          .to receive(:call)
          .and_return(evaluation_result)
      end

      it "calls #{metric_class} with the correct parameters" do
        # The return value is not needed here; only the call to the metric is
        # asserted.
        described_class.call(
          metric_class: metric_class,
          question_message:,
        )

        expect(metric_class).to have_received(:call).with(
          question_message:,
          answer_message: answer.message,
        )
      end

      it "returns the metrics ScoreResult as JSON" do
        result = described_class.call(
          metric_class: metric_class,
          question_message:,
        )
        expect(result).to eq(evaluation_result.to_json)
      end
    end
  end

  context "when the generated answer has an error status" do
    let(:answer) { build(:answer, status: :error_answer_service_error, error_message: "Contrived error.") }

    it "logs a warning and returns the error message" do
      expect(Rails.logger).to receive(:warn).with("Warning: answer has an error status: error_answer_service_error")

      result = described_class.call(
        metric_class: AutoEvaluation::Coherence,
        question_message:,
      )
      expect(result).to eq(answer.error_message)
    end
  end
end

spec/lib/tasks/evaluation_spec.rb

Lines changed: 8 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -30,35 +30,24 @@
3030
end
3131

3232
shared_examples "a task that returns a ScoreResult" do
33+
let(:question_message) { "What is the current VAT rate?" }
3334
let(:answer) { build(:answer) }
34-
let(:evaluation_result) do
35-
AutoEvaluation::ScoreResult.new(
36-
score: 0.7,
37-
reason: "Most statements are relevant.",
38-
success: true,
39-
llm_responses: {},
40-
metrics: {},
41-
)
42-
end
4335

4436
before do
4537
Rake::Task[task_name].reenable
4638

47-
allow(AnswerComposition::PipelineRunner)
39+
allow(AutoEvaluation::GenerateMetricResult)
4840
.to receive(:call)
4941
.with(
50-
question: an_instance_of(Question),
51-
pipeline: [
52-
AnswerComposition::Pipeline::SearchResultFetcher,
53-
AnswerComposition::Pipeline::Claude::StructuredAnswerComposer,
54-
],
42+
metric_name:,
43+
question_message:,
5544
)
56-
.and_return(answer)
45+
.and_return(evaluation_result.to_json)
5746
end
5847

5948
it_behaves_like "a task requiring an input"
6049

61-
it "outputs the evaluation result as JSON to stdout" do
50+
it "outputs the evaluation result as to stdout" do
6251
ClimateControl.modify(INPUT: question_message) do
6352
expected_result_output = {
6453
score: evaluation_result.score,
@@ -72,20 +61,6 @@
7261
.to output("#{expected_result_output}\n").to_stdout
7362
end
7463
end
75-
76-
context "when an answer has an error status" do
77-
let(:error_message) { "Answer generation failed" }
78-
let(:answer) { build(:answer, status: :error_answer_service_error, error_message:) }
79-
80-
it "warns the user and outputs the error message" do
81-
ClimateControl.modify(INPUT: question_message) do
82-
expected_stderr = "Warning: answer has an error status: #{answer.status}\n#{error_message}\n"
83-
expect { Rake::Task[task_name].invoke }
84-
.to raise_error(SystemExit)
85-
.and output(expected_stderr).to_stderr
86-
end
87-
end
88-
end
8964
end
9065

9166
describe "generate_answer" do
@@ -582,37 +557,16 @@
582557
end
583558

584559
describe "generate_answer_relevancy_evaluation" do
585-
let(:question_message) { "What is the current VAT rate?" }
586-
587560
it_behaves_like "a task that returns a ScoreResult" do
588561
let(:task_name) { "evaluation:generate_answer_relevancy_evaluation" }
589-
590-
before do
591-
allow(AutoEvaluation::AnswerRelevancy)
592-
.to receive(:call)
593-
.with(
594-
question_message:,
595-
answer_message: answer.message,
596-
)
597-
.and_return(evaluation_result)
598-
end
562+
let(:metric_name) { "answer_relevancy" }
599563
end
600564
end
601565

602566
describe "generate_coherence_evaluation" do
603567
it_behaves_like "a task that returns a ScoreResult" do
604-
let(:question_message) { "What is the current VAT rate?" }
605568
let(:task_name) { "evaluation:generate_coherence_evaluation" }
606-
607-
before do
608-
allow(AutoEvaluation::Coherence)
609-
.to receive(:call)
610-
.with(
611-
question_message:,
612-
answer_message: answer.message,
613-
)
614-
.and_return(evaluation_result)
615-
end
569+
let(:metric_name) { "coherence" }
616570
end
617571
end
618572
end

0 commit comments

Comments
 (0)