Skip to content

Commit 9a95c26

Browse files
committed
Add coherence metric auto-evaluation task
This adds a new Rake task to generate a coherence evaluation for a given question. Much like the answer relevancy task it: 1. generates an answer for the input question using the existing answer composition pipeline 2. evaluates the coherence of the generated answer against the question using AutoEvaluation::Coherence 3. outputs the result JSON to stdout 4. handles error answers appropriately. Because there's so much shared functionality, I've added a shared example to the existing evaluation_spec to reduce duplication between the two tasks. Once all the metrics are ported, we might want to consider updating this to have a single rake task that takes the metric as an argument rather than separate tasks for each metric. I've held off on doing this for now just to make sure all the rake tasks do have shared logic with the exception of the metric called. I'm pretty sure they will though.
1 parent adb4021 commit 9a95c26

2 files changed

Lines changed: 107 additions & 60 deletions

File tree

lib/tasks/evaluation.rake

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -180,7 +180,6 @@ namespace :evaluation do
180180
question = Question.new(message: ENV["INPUT"], conversation: Conversation.new)
181181

182182
answer = AnswerComposition::PipelineRunner.call(question:, pipeline: [
183-
AnswerComposition::Pipeline::Claude::QuestionRouter,
184183
AnswerComposition::Pipeline::SearchResultFetcher,
185184
AnswerComposition::Pipeline::Claude::StructuredAnswerComposer,
186185
])
@@ -197,4 +196,28 @@ namespace :evaluation do
197196

198197
puts(result.to_json)
199198
end
199+
200+
# Rake task: builds a Question from ENV["INPUT"], runs the answer composition
# pipeline, scores the resulting answer with AutoEvaluation::Coherence and
# prints the evaluation result as JSON on stdout.
desc "Run answer coherence evaluation for a user input"
201+
task generate_coherence_evaluation: :environment do
202+
# Fail fast when INPUT is missing or blank.
raise "Requires an INPUT env var" if ENV["INPUT"].blank?
203+
204+
question = Question.new(message: ENV["INPUT"], conversation: Conversation.new)
205+
206+
# The pipeline passed here is SearchResultFetcher followed by
# Claude::StructuredAnswerComposer.
answer = AnswerComposition::PipelineRunner.call(question:, pipeline: [
207+
AnswerComposition::Pipeline::SearchResultFetcher,
208+
AnswerComposition::Pipeline::Claude::StructuredAnswerComposer,
209+
])
210+
211+
# Any status beginning with "error" stops the task: the warning and then the
# answer's error message go to stderr, and abort exits the process.
if answer.status =~ /^error/
212+
warn "Warning: answer has an error status: #{answer.status}"
213+
abort(answer.error_message)
214+
end
215+
216+
# Evaluate against the rephrased question when the answer has one, otherwise
# against the original question message.
result = AutoEvaluation::Coherence.call(
217+
question_message: answer.rephrased_question || question.message,
218+
answer_message: answer.message,
219+
)
220+
221+
puts(result.to_json)
222+
end
200223
end

spec/lib/tasks/evaluation_spec.rb

Lines changed: 83 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,65 @@
2929
end
3030
end
3131

32+
# Shared examples for the auto-evaluation rake tasks. The including group must
# define `task_name` and `question_message`; both are referenced below but not
# defined here. The including groups visible in this file also stub their
# metric class to return `evaluation_result`.
shared_examples "an auto-evaluation generate task" do
33+
let(:answer) { build(:answer) }
34+
# Canned evaluation result whose fields are compared against the task's JSON output.
let(:evaluation_result) do
35+
AutoEvaluation::Result.new(
36+
score: 0.7,
37+
reason: "Most statements are relevant.",
38+
success: true,
39+
llm_responses: {},
40+
metrics: {},
41+
)
42+
end
43+
44+
before do
45+
# Re-enable the task so each example can invoke it again.
Rake::Task[task_name].reenable
46+
47+
# Stub the pipeline runner; the `with` constraint pins the exact pipeline
# steps the task is expected to pass.
allow(AnswerComposition::PipelineRunner)
48+
.to receive(:call)
49+
.with(
50+
question: an_instance_of(Question),
51+
pipeline: [
52+
AnswerComposition::Pipeline::SearchResultFetcher,
53+
AnswerComposition::Pipeline::Claude::StructuredAnswerComposer,
54+
],
55+
)
56+
.and_return(answer)
57+
end
58+
59+
it_behaves_like "a task requiring an input"
60+
61+
it "outputs the evaluation result as JSON to stdout" do
62+
ClimateControl.modify(INPUT: question_message) do
63+
expected_result_output = {
64+
score: evaluation_result.score,
65+
reason: evaluation_result.reason,
66+
success: evaluation_result.success,
67+
llm_responses: evaluation_result.llm_responses,
68+
metrics: evaluation_result.metrics,
69+
}.to_json
70+
71+
# The expectation includes a trailing newline after the JSON document.
expect { Rake::Task[task_name].invoke }
72+
.to output("#{expected_result_output}\n").to_stdout
73+
end
74+
end
75+
76+
context "when an answer has an error status" do
77+
let(:error_message) { "Answer generation failed" }
78+
# Overrides the outer `answer` with one carrying an error status and message.
let(:answer) { build(:answer, status: :error_answer_service_error, error_message:) }
79+
80+
it "warns the user and outputs the error message" do
81+
ClimateControl.modify(INPUT: question_message) do
82+
# Expect the warning line first, then the error message, both on stderr,
# with the task terminating via SystemExit.
expected_stderr = "Warning: answer has an error status: #{answer.status}\n#{error_message}\n"
83+
expect { Rake::Task[task_name].invoke }
84+
.to raise_error(SystemExit)
85+
.and output(expected_stderr).to_stderr
86+
end
87+
end
88+
end
89+
end
90+
3291
describe "generate_answer" do
3392
let(:task_name) { "evaluation:generate_answer" }
3493
let(:input) { "What is the current VAT rate?" }
@@ -523,71 +582,36 @@
523582
end
524583

525584
describe "generate_answer_relevancy_evaluation" do
526-
let(:task_name) { "evaluation:generate_answer_relevancy_evaluation" }
527585
let(:question_message) { "What is the current VAT rate?" }
528-
let(:answer) { build(:answer) }
529-
let(:evaluation_result) do
530-
AutoEvaluation::Result.new(
531-
score: 0.7,
532-
reason: "Most statements are relevant.",
533-
success: true,
534-
llm_responses: {},
535-
metrics: {},
536-
)
537-
end
538586

539-
before do
540-
Rake::Task[task_name].reenable
541-
542-
allow(AnswerComposition::PipelineRunner)
543-
.to receive(:call)
544-
.with(
545-
question: an_instance_of(Question),
546-
pipeline: [
547-
AnswerComposition::Pipeline::Claude::QuestionRouter,
548-
AnswerComposition::Pipeline::SearchResultFetcher,
549-
AnswerComposition::Pipeline::Claude::StructuredAnswerComposer,
550-
],
551-
)
552-
.and_return(answer)
553-
554-
allow(AutoEvaluation::AnswerRelevancy)
555-
.to receive(:call)
556-
.with(
557-
question_message:,
558-
answer_message: answer.message,
559-
)
560-
.and_return(evaluation_result)
561-
end
587+
it_behaves_like "an auto-evaluation generate task" do
588+
let(:task_name) { "evaluation:generate_answer_relevancy_evaluation" }
562589

563-
it_behaves_like "a task requiring an input"
564-
565-
it "outputs the evaluation result as JSON to stdout" do
566-
ClimateControl.modify(INPUT: question_message) do
567-
expected_result_output = {
568-
score: evaluation_result.score,
569-
reason: evaluation_result.reason,
570-
success: evaluation_result.success,
571-
llm_responses: evaluation_result.llm_responses,
572-
metrics: evaluation_result.metrics,
573-
}.to_json
574-
575-
expect { Rake::Task[task_name].invoke }
576-
.to output("#{expected_result_output}\n").to_stdout
590+
before do
591+
allow(AutoEvaluation::AnswerRelevancy)
592+
.to receive(:call)
593+
.with(
594+
question_message:,
595+
answer_message: answer.message,
596+
)
597+
.and_return(evaluation_result)
577598
end
578599
end
600+
end
579601

580-
context "when an answer has an error status" do
581-
let(:error_message) { "Answer generation failed" }
582-
let(:answer) { build(:answer, status: :error_answer_service_error, error_message:) }
583-
584-
it "warns the user and outputs the error message" do
585-
ClimateControl.modify(INPUT: question_message) do
586-
expected_stderr = "Warning: answer has an error status: #{answer.status}\n#{error_message}\n"
587-
expect { Rake::Task[task_name].invoke }
588-
.to raise_error(SystemExit)
589-
.and output(expected_stderr).to_stderr
590-
end
602+
# Runs the shared auto-evaluation examples against the coherence rake task,
# stubbing AutoEvaluation::Coherence so no real evaluation is performed.
describe "generate_coherence_evaluation" do
603+
it_behaves_like "an auto-evaluation generate task" do
604+
let(:question_message) { "What is the current VAT rate?" }
605+
let(:task_name) { "evaluation:generate_coherence_evaluation" }
606+
607+
before do
608+
# NOTE(review): the task passes `answer.rephrased_question || question.message`
# as question_message, so this stub presumably only matches when the built
# answer has no rephrased question — confirm against the answer factory.
allow(AutoEvaluation::Coherence)
609+
.to receive(:call)
610+
.with(
611+
question_message: question_message,
612+
answer_message: answer.message,
613+
)
614+
.and_return(evaluation_result)
591615
end
592616
end
593617
end

0 commit comments

Comments
 (0)