Add AutoEvaluation::GenerateMetricResult

davidgisbey · davidgisbey · commit e4988f2c0ee2 · 2025-12-30T16:46:09.000Z
We're adding a few metrics. Each of these requires a basic rake task
that can be called with an INPUT environment variable. The task will
generate a ScoreResult for the given metric and print it as JSON.

This adds the AutoEvaluation::GenerateMetricResult class which:

- takes a question_message and a metric class as arguments
- generates an answer using the question_message
- calls the metric class with the question and answer to get a ScoreResult
- returns the result as JSON

If the generated answer has an error status, it logs a warning and
returns the error message instead.

This does mean that the rake tasks no longer abort and write the error to
STDERR; they just print the error message to STDOUT. I think that's
probably fine, but I'm happy to discuss if people disagree.
diff --git a/lib/auto_evaluation/generate_metric_result.rb b/lib/auto_evaluation/generate_metric_result.rb
@@ -0,0 +1,55 @@
+module AutoEvaluation
+  # Generates an answer for a question and scores it with an auto-evaluation metric.
+  #
+  # Runs the answer composition pipeline (the SearchResultFetcher and Claude
+  # StructuredAnswerComposer steps) for the given question message, then calls
+  # the metric class with the question message and the generated answer message.
+  class GenerateMetricResult
+    delegate :logger, to: Rails
+
+    def self.call(...) = new(...).call
+
+    # @param question_message [String] the user question to answer and evaluate
+    # @param metric_class [Class, nil] metric responding to
+    #   .call(question_message:, answer_message:)
+    # @param metric_name [String, Symbol, nil] underscored metric name, e.g.
+    #   "answer_relevancy" resolves to AutoEvaluation::AnswerRelevancy. Used
+    #   only when metric_class is not given.
+    # @raise [ArgumentError] if no metric is given or metric_name is unknown
+    def initialize(question_message:, metric_class: nil, metric_name: nil)
+      @metric_class = metric_class || resolve_metric_class(metric_name)
+      @question_message = question_message
+    end
+
+    # @return [String] the metric result serialised as JSON, or the answer's
+    #   error message when answer generation ended in an error status
+    def call
+      # Build the question from the argument rather than ENV["INPUT"] so the
+      # class evaluates the message it was actually given.
+      question = Question.new(message: question_message, conversation: Conversation.new)
+      answer = AnswerComposition::PipelineRunner.call(question:, pipeline: [
+        AnswerComposition::Pipeline::SearchResultFetcher,
+        AnswerComposition::Pipeline::Claude::StructuredAnswerComposer,
+      ])
+
+      if answer.status.to_s.start_with?("error")
+        logger.warn "Warning: answer has an error status: #{answer.status}"
+        return answer.error_message
+      end
+
+      metric_class.call(question_message:, answer_message: answer.message).to_json
+    end
+
+  private
+
+    attr_reader :metric_class, :question_message
+
+    # Maps an underscored metric name onto a constant in the AutoEvaluation namespace.
+    def resolve_metric_class(metric_name)
+      raise ArgumentError, "Requires a metric_class or metric_name" if metric_name.blank?
+
+      resolved = "AutoEvaluation::#{metric_name.to_s.camelize}".safe_constantize
+      resolved || raise(ArgumentError, "Unknown metric: #{metric_name}")
+    end
+  end
+end
diff --git a/lib/tasks/evaluation.rake b/lib/tasks/evaluation.rake
@@ -177,47 +177,23 @@ namespace :evaluation do
   task generate_answer_relevancy_evaluation: :environment do
     raise "Requires an INPUT env var" if ENV["INPUT"].blank?
 
-    question = Question.new(message: ENV["INPUT"], conversation: Conversation.new)
-
-    answer = AnswerComposition::PipelineRunner.call(question:, pipeline: [
-      AnswerComposition::Pipeline::SearchResultFetcher,
-      AnswerComposition::Pipeline::Claude::StructuredAnswerComposer,
-    ])
-
-    if answer.status =~ /^error/
-      warn "Warning: answer has an error status: #{answer.status}"
-      abort(answer.error_message)
-    end
-
-    result = AutoEvaluation::AnswerRelevancy.call(
-      question_message: answer.rephrased_question || question.message,
-      answer_message: answer.message,
+    puts(
+      AutoEvaluation::GenerateMetricResult.call(
+        metric_name: "answer_relevancy",
+        question_message: ENV["INPUT"],
+      )
     )
-
-    puts(result.to_json)
   end
 
   desc "Run answer coherence evaluation for a user input"
   task generate_coherence_evaluation: :environment do
     raise "Requires an INPUT env var" if ENV["INPUT"].blank?
 
-    question = Question.new(message: ENV["INPUT"], conversation: Conversation.new)
-
-    answer = AnswerComposition::PipelineRunner.call(question:, pipeline: [
-      AnswerComposition::Pipeline::SearchResultFetcher,
-      AnswerComposition::Pipeline::Claude::StructuredAnswerComposer,
-    ])
-
-    if answer.status =~ /^error/
-      warn "Warning: answer has an error status: #{answer.status}"
-      abort(answer.error_message)
-    end
-
-    result = AutoEvaluation::Coherence.call(
-      question_message: answer.rephrased_question || question.message,
-      answer_message: answer.message,
+    puts(
+      AutoEvaluation::GenerateMetricResult.call(
+        metric_name: "coherence",
+        question_message: ENV["INPUT"],
+      )
     )
-
-    puts(result.to_json)
   end
 end
diff --git a/spec/lib/auto_evaluation/generate_metric_result_spec.rb b/spec/lib/auto_evaluation/generate_metric_result_spec.rb
@@ -0,0 +1,75 @@
+RSpec.describe AutoEvaluation::GenerateMetricResult, :aws_credentials_stubbed do
+  let(:answer) { build(:answer) }
+  let(:question_message) { "What is the capital of France?" }
+  let(:evaluation_result) do
+    AutoEvaluation::ScoreResult.new(
+      score: 0.85,
+      reason: "The answer is mostly correct but misses some details.",
+      success: true,
+      llm_responses: [],
+      metrics: {},
+    )
+  end
+
+  # Stub the answer pipeline so every example receives the prepared answer.
+  before do
+    allow(AnswerComposition::PipelineRunner)
+      .to receive(:call)
+      .with(
+        question: an_instance_of(Question),
+        pipeline: [
+          AnswerComposition::Pipeline::SearchResultFetcher,
+          AnswerComposition::Pipeline::Claude::StructuredAnswerComposer,
+        ],
+      )
+      .and_return(answer)
+  end
+
+  [
+    AutoEvaluation::AnswerRelevancy,
+    AutoEvaluation::Coherence,
+  ].each do |metric_class|
+    context "when passed the #{metric_class} metric class" do
+      before do
+        allow(metric_class)
+          .to receive(:call)
+          .and_return(evaluation_result)
+      end
+
+      it "calls #{metric_class} with the correct parameters" do
+        described_class.call(
+          metric_class:,
+          question_message:,
+        )
+
+        expect(metric_class).to have_received(:call).with(
+          question_message:,
+          answer_message: answer.message,
+        )
+      end
+
+      it "returns the metrics ScoreResult as JSON" do
+        result = described_class.call(
+          metric_class:,
+          question_message:,
+        )
+        expect(result).to eq(evaluation_result.to_json)
+      end
+    end
+  end
+
+  context "when the generated answer has an error status" do
+    let(:answer) { build(:answer, status: :error_answer_service_error, error_message: "Contrived error.") }
+
+    it "logs a warning and returns the error message" do
+      # The message expectation has to be registered before the call under test.
+      expect(Rails.logger).to receive(:warn).with("Warning: answer has an error status: error_answer_service_error")
+
+      result = described_class.call(
+        metric_class: AutoEvaluation::Coherence,
+        question_message:,
+      )
+      expect(result).to eq(answer.error_message)
+    end
+  end
+end
diff --git a/spec/lib/tasks/evaluation_spec.rb b/spec/lib/tasks/evaluation_spec.rb
@@ -30,35 +30,24 @@
   end
 
   shared_examples "a task that returns a ScoreResult" do
+    let(:question_message) { "What is the current VAT rate?" }
     let(:answer) { build(:answer) }
-    let(:evaluation_result) do
-      AutoEvaluation::ScoreResult.new(
-        score: 0.7,
-        reason: "Most statements are relevant.",
-        success: true,
-        llm_responses: {},
-        metrics: {},
-      )
-    end
 
     before do
       Rake::Task[task_name].reenable
 
-      allow(AnswerComposition::PipelineRunner)
+      allow(AutoEvaluation::GenerateMetricResult)
         .to receive(:call)
         .with(
-          question: an_instance_of(Question),
-          pipeline: [
-            AnswerComposition::Pipeline::SearchResultFetcher,
-            AnswerComposition::Pipeline::Claude::StructuredAnswerComposer,
-          ],
+          metric_name:,
+          question_message:,
         )
-        .and_return(answer)
+        .and_return(evaluation_result.to_json)
     end
 
     it_behaves_like "a task requiring an input"
 
-    it "outputs the evaluation result as JSON to stdout" do
+    it "outputs the evaluation result as JSON to stdout" do
       ClimateControl.modify(INPUT: question_message) do
         expected_result_output = {
           score: evaluation_result.score,
@@ -72,20 +61,6 @@
           .to output("#{expected_result_output}\n").to_stdout
       end
     end
-
-    context "when an answer has an error status" do
-      let(:error_message) { "Answer generation failed" }
-      let(:answer) { build(:answer, status: :error_answer_service_error, error_message:) }
-
-      it "warns the user and outputs the error message" do
-        ClimateControl.modify(INPUT: question_message) do
-          expected_stderr = "Warning: answer has an error status: #{answer.status}\n#{error_message}\n"
-          expect { Rake::Task[task_name].invoke }
-            .to raise_error(SystemExit)
-            .and output(expected_stderr).to_stderr
-        end
-      end
-    end
   end
 
   describe "generate_answer" do
@@ -582,37 +557,16 @@
   end
 
   describe "generate_answer_relevancy_evaluation" do
-    let(:question_message) { "What is the current VAT rate?" }
-
     it_behaves_like "a task that returns a ScoreResult" do
       let(:task_name) { "evaluation:generate_answer_relevancy_evaluation" }
-
-      before do
-        allow(AutoEvaluation::AnswerRelevancy)
-          .to receive(:call)
-          .with(
-            question_message:,
-            answer_message: answer.message,
-          )
-          .and_return(evaluation_result)
-      end
+      let(:metric_name) { "answer_relevancy" }
     end
   end
 
   describe "generate_coherence_evaluation" do
     it_behaves_like "a task that returns a ScoreResult" do
-      let(:question_message) { "What is the current VAT rate?" }
       let(:task_name) { "evaluation:generate_coherence_evaluation" }
-
-      before do
-        allow(AutoEvaluation::Coherence)
-          .to receive(:call)
-          .with(
-            question_message:,
-            answer_message: answer.message,
-          )
-          .and_return(evaluation_result)
-      end
+      let(:metric_name) { "coherence" }
     end
   end
 end