Add coherence metric auto-evaluation task

davidgisbey · davidgisbey · commit 9a95c262df11 · 2025-12-22T14:43:42.000Z
This adds a new Rake task to generate a coherence evaluation for a
given question. Much like the answer relevancy task, it:

1. generates an answer for the input question using the existing
   answer composition pipeline
2. evaluates the coherence of the generated answer against the question
   using AutoEvaluation::Coherence
3. outputs the result json to stdout
4. handles error answers appropriately

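Because there's so much shared functionality, I've added a shared example
to the existing evaluation_spec to reduce duplication between the two tasks.
This also removes the QuestionRouter step from the answer relevancy task's
pipeline, so both tasks now run the same pipeline.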

Once all the metrics are ported, we might want to consider updating this
so that we have a single Rake task that takes the metric as an argument
rather than separate tasks for each metric.

I've held off on doing this for now just to make sure all the rake tasks
do have shared logic with the exception of the metric called. I'm pretty
sure they will though.
diff --git a/lib/tasks/evaluation.rake b/lib/tasks/evaluation.rake
@@ -180,7 +180,6 @@ namespace :evaluation do
     question = Question.new(message: ENV["INPUT"], conversation: Conversation.new)
 
     answer = AnswerComposition::PipelineRunner.call(question:, pipeline: [
-      AnswerComposition::Pipeline::Claude::QuestionRouter,
       AnswerComposition::Pipeline::SearchResultFetcher,
       AnswerComposition::Pipeline::Claude::StructuredAnswerComposer,
     ])
@@ -197,4 +196,28 @@ namespace :evaluation do
 
     puts(result.to_json)
   end
+
+  desc "Run answer coherence evaluation for a user input"
+  task generate_coherence_evaluation: :environment do
+    raise "Requires an INPUT env var" if ENV["INPUT"].blank?
+
+    question = Question.new(message: ENV["INPUT"], conversation: Conversation.new)
+
+    answer = AnswerComposition::PipelineRunner.call(question:, pipeline: [
+      AnswerComposition::Pipeline::SearchResultFetcher,
+      AnswerComposition::Pipeline::Claude::StructuredAnswerComposer,
+    ])
+
+    if answer.status =~ /^error/
+      warn "Warning: answer has an error status: #{answer.status}"
+      abort(answer.error_message)
+    end
+
+    result = AutoEvaluation::Coherence.call(
+      question_message: answer.rephrased_question || question.message,
+      answer_message: answer.message,
+    )
+
+    puts(result.to_json)
+  end
 end
diff --git a/spec/lib/tasks/evaluation_spec.rb b/spec/lib/tasks/evaluation_spec.rb
@@ -29,6 +29,65 @@
     end
   end
 
+  shared_examples "an auto-evaluation generate task" do
+    let(:answer) { build(:answer) }
+    let(:evaluation_result) do
+      AutoEvaluation::Result.new(
+        score: 0.7,
+        reason: "Most statements are relevant.",
+        success: true,
+        llm_responses: {},
+        metrics: {},
+      )
+    end
+
+    before do
+      Rake::Task[task_name].reenable
+
+      allow(AnswerComposition::PipelineRunner)
+        .to receive(:call)
+        .with(
+          question: an_instance_of(Question),
+          pipeline: [
+            AnswerComposition::Pipeline::SearchResultFetcher,
+            AnswerComposition::Pipeline::Claude::StructuredAnswerComposer,
+          ],
+        )
+        .and_return(answer)
+    end
+
+    it_behaves_like "a task requiring an input"
+
+    it "outputs the evaluation result as JSON to stdout" do
+      ClimateControl.modify(INPUT: question_message) do
+        expected_result_output = {
+          score: evaluation_result.score,
+          reason: evaluation_result.reason,
+          success: evaluation_result.success,
+          llm_responses: evaluation_result.llm_responses,
+          metrics: evaluation_result.metrics,
+        }.to_json
+
+        expect { Rake::Task[task_name].invoke }
+          .to output("#{expected_result_output}\n").to_stdout
+      end
+    end
+
+    context "when an answer has an error status" do
+      let(:error_message) { "Answer generation failed" }
+      let(:answer) { build(:answer, status: :error_answer_service_error, error_message:) }
+
+      it "warns the user and outputs the error message" do
+        ClimateControl.modify(INPUT: question_message) do
+          expected_stderr = "Warning: answer has an error status: #{answer.status}\n#{error_message}\n"
+          expect { Rake::Task[task_name].invoke }
+            .to raise_error(SystemExit)
+            .and output(expected_stderr).to_stderr
+        end
+      end
+    end
+  end
+
   describe "generate_answer" do
     let(:task_name) { "evaluation:generate_answer" }
     let(:input) { "What is the current VAT rate?" }
@@ -523,71 +582,36 @@
   end
 
   describe "generate_answer_relevancy_evaluation" do
-    let(:task_name) { "evaluation:generate_answer_relevancy_evaluation" }
     let(:question_message) { "What is the current VAT rate?" }
-    let(:answer) { build(:answer) }
-    let(:evaluation_result) do
-      AutoEvaluation::Result.new(
-        score: 0.7,
-        reason: "Most statements are relevant.",
-        success: true,
-        llm_responses: {},
-        metrics: {},
-      )
-    end
 
-    before do
-      Rake::Task[task_name].reenable
-
-      allow(AnswerComposition::PipelineRunner)
-        .to receive(:call)
-        .with(
-          question: an_instance_of(Question),
-          pipeline: [
-            AnswerComposition::Pipeline::Claude::QuestionRouter,
-            AnswerComposition::Pipeline::SearchResultFetcher,
-            AnswerComposition::Pipeline::Claude::StructuredAnswerComposer,
-          ],
-        )
-        .and_return(answer)
-
-      allow(AutoEvaluation::AnswerRelevancy)
-        .to receive(:call)
-        .with(
-          question_message:,
-          answer_message: answer.message,
-        )
-        .and_return(evaluation_result)
-    end
+    it_behaves_like "an auto-evaluation generate task" do
+      let(:task_name) { "evaluation:generate_answer_relevancy_evaluation" }
 
-    it_behaves_like "a task requiring an input"
-
-    it "outputs the evaluation result as JSON to stdout" do
-      ClimateControl.modify(INPUT: question_message) do
-        expected_result_output = {
-          score: evaluation_result.score,
-          reason: evaluation_result.reason,
-          success: evaluation_result.success,
-          llm_responses: evaluation_result.llm_responses,
-          metrics: evaluation_result.metrics,
-        }.to_json
-
-        expect { Rake::Task[task_name].invoke }
-          .to output("#{expected_result_output}\n").to_stdout
+      before do
+        allow(AutoEvaluation::AnswerRelevancy)
+          .to receive(:call)
+          .with(
+            question_message:,
+            answer_message: answer.message,
+          )
+          .and_return(evaluation_result)
       end
     end
+  end
 
-    context "when an answer has an error status" do
-      let(:error_message) { "Answer generation failed" }
-      let(:answer) { build(:answer, status: :error_answer_service_error, error_message:) }
-
-      it "warns the user and outputs the error message" do
-        ClimateControl.modify(INPUT: question_message) do
-          expected_stderr = "Warning: answer has an error status: #{answer.status}\n#{error_message}\n"
-          expect { Rake::Task[task_name].invoke }
-            .to raise_error(SystemExit)
-            .and output(expected_stderr).to_stderr
-        end
+  describe "generate_coherence_evaluation" do
+    it_behaves_like "an auto-evaluation generate task" do
+      let(:question_message) { "What is the current VAT rate?" }
+      let(:task_name) { "evaluation:generate_coherence_evaluation" }
+
+      before do
+        allow(AutoEvaluation::Coherence)
+          .to receive(:call)
+          .with(
+            question_message:,
+            answer_message: answer.message,
+          )
+          .and_return(evaluation_result)
       end
     end
   end