Move TopicTagger into AutoEvaluation module

davidgisbey · davidgisbey · commit 7cdcb5806e84 · 2025-12-19T16:00:46.000Z
I know this isn't strictly part of AutoEvaluation, but we will be using
it as part of our analysis for auto-evaluation, as we will be aggregating
metric scores across topics.

For that reason I think it makes sense to move it into the AutoEvaluation
module, particularly as we've grouped topics with the other metrics in
jobs and models.
diff --git a/app/jobs/answer_analysis/answer_topics_job.rb b/app/jobs/answer_analysis/answer_topics_job.rb
@@ -12,7 +12,7 @@ def perform(answer_id)
       end
       return if quota_limit_reached?
 
-      result = TopicTagger.call(answer.rephrased_question || answer.question.message)
+      result = AutoEvaluation::TopicTagger.call(answer.rephrased_question || answer.question.message)
       topics = answer.build_topics(
         primary_topic: result.primary_topic,
         secondary_topic: result.secondary_topic,
diff --git a/lib/auto_evaluation/topic_tagger.rb b/lib/auto_evaluation/topic_tagger.rb
@@ -0,0 +1,83 @@
+module AutoEvaluation
+  class TopicTagger # Asks a Claude model via Bedrock to pick a primary and secondary topic for a user question.
+    Result = Data.define(:primary_topic, # Value object returned by #call.
+                         :secondary_topic,
+                         :metrics, # Hash produced by #build_metrics.
+                         :llm_response) # The response converted with #to_h.
+
+    def self.call(...) = new(...).call # Convenience entry point: forwards all arguments to .new, then runs #call.
+
+    def initialize(user_question) # user_question becomes the content of the single user message (see #messages).
+      @user_question = user_question
+    end
+
+    def call # Sends one request and returns a Result; nothing is rescued here, so client errors propagate.
+      start_time = Clock.monotonic_time # Start point for the duration metric.
+      response = anthropic_bedrock_client.messages.create(
+        messages:,
+        system: [
+          { type: "text", text: system_prompt, cache_control: { type: "ephemeral" } }, # cache_control presumably enables prompt caching of the system prompt; confirm against the API docs.
+        ],
+        model: BedrockModels.model_id(:claude_sonnet),
+        tools:,
+        tool_choice: { type: "tool", name: tools.first[:name] }, # Forces a call to the only configured tool. NOTE(review): symbol key here vs string keys on the config; assumes indifferent access, confirm.
+        **inference_config,
+      )
+
+      Result.new(
+        primary_topic: response[:content][0][:input][:primary_topic], # Assumes the first content block is the tool call (presumably guaranteed by tool_choice); confirm.
+        secondary_topic: response[:content][0][:input][:secondary_topic],
+        metrics: build_metrics(response, start_time),
+        llm_response: response.to_h,
+      )
+    end
+
+  private
+
+    attr_reader :user_question
+
+    def anthropic_bedrock_client # Memoised per instance.
+      @anthropic_bedrock_client ||= Anthropic::BedrockClient.new(
+        aws_region: ENV["CLAUDE_AWS_REGION"], # nil when the env var is unset.
+      )
+    end
+
+    def build_metrics(response, start_time) # Collects timing, token usage and model name for this request.
+      {
+        duration: Clock.monotonic_time - start_time, # Elapsed since start_time; the unit depends on Clock (not shown here).
+        llm_prompt_tokens: BedrockModels.claude_total_prompt_tokens(response[:usage]),
+        llm_completion_tokens: response[:usage][:output_tokens],
+        llm_cached_tokens: response[:usage][:cache_read_input_tokens],
+        model: response[:model],
+      }
+    end
+
+    def system_prompt # Read from the topic tagger config.
+      topic_tagger_config["system_prompt"]
+    end
+
+    def messages # A single user turn containing the question.
+      [
+        {
+          role: "user",
+          content: user_question,
+        },
+      ]
+    end
+
+    def topic_tagger_config # Prompt and tool spec come from the govuk_chat_private configuration.
+      Rails.configuration.govuk_chat_private.llm_prompts.claude.topic_tagger
+    end
+
+    def inference_config # Splatted into the messages.create call as keyword arguments.
+      {
+        max_tokens: 4096,
+        temperature: 0.0, # Presumably zero to keep tagging as repeatable as possible; confirm intent.
+      }
+    end
+
+    def tools # Single-element list wrapping the configured tool spec.
+      [topic_tagger_config["tool_spec"]]
+    end
+  end
+end
diff --git a/lib/tasks/evaluation.rake b/lib/tasks/evaluation.rake
@@ -119,7 +119,7 @@ namespace :evaluation do
   task generate_topics_for_question: :environment do
     raise "Requires an INPUT env var" if ENV["INPUT"].blank?
 
-    result = TopicTagger.call(ENV["INPUT"])
+    result = AutoEvaluation::TopicTagger.call(ENV["INPUT"])
 
     puts(result.to_json)
   end
diff --git a/lib/topic_tagger.rb b/lib/topic_tagger.rb
diff --git a/spec/jobs/answer_analysis/answer_topics_job_spec.rb b/spec/jobs/answer_analysis/answer_topics_job_spec.rb
@@ -3,7 +3,7 @@
   let(:answer) { create(:answer) }
   let(:question) { answer.question }
   let(:topic_tagger_result) do
-    TopicTagger::Result.new(
+    AutoEvaluation::TopicTagger::Result.new(
       primary_topic: "business",
       secondary_topic: "benefits",
       metrics: {
@@ -16,21 +16,21 @@
     )
   end
 
-  before { allow(TopicTagger).to receive(:call).and_return(topic_tagger_result) }
+  before { allow(AutoEvaluation::TopicTagger).to receive(:call).and_return(topic_tagger_result) }
 
   it_behaves_like "a job in queue", "default"
-  it_behaves_like "a job that adheres to the metric quota", TopicTagger
+  it_behaves_like "a job that adheres to the metric quota", AutoEvaluation::TopicTagger
   it_behaves_like "a job that retries on service errors", Anthropic::Errors::APIError do
     before do
-      allow(TopicTagger).to receive(:call)
+      allow(AutoEvaluation::TopicTagger).to receive(:call)
                         .and_raise(Anthropic::Errors::APIError.new(url: "url"))
     end
   end
 
   describe "#perform" do
-    it "calls the TopicTagger with the answer message" do
+    it "calls the AutoEvaluation::TopicTagger with the answer message" do
       described_class.new.perform(answer.id)
-      expect(TopicTagger).to have_received(:call).with(question.message)
+      expect(AutoEvaluation::TopicTagger).to have_received(:call).with(question.message)
     end
 
     it "creates topics for the answer based of the returned result" do
@@ -49,10 +49,10 @@
     context "when the answer has a rephrased_question" do
       let(:rephrased_question) { "This is a rephrased_question" }
 
-      it "calls the TopicTagger with the rephrased question" do
+      it "calls the AutoEvaluation::TopicTagger with the rephrased question" do
         answer = create(:answer, rephrased_question: rephrased_question)
         described_class.new.perform(answer.id)
-        expect(TopicTagger).to have_received(:call).with(rephrased_question)
+        expect(AutoEvaluation::TopicTagger).to have_received(:call).with(rephrased_question)
       end
     end
 
@@ -67,9 +67,9 @@
         described_class.new.perform(answer_id)
       end
 
-      it "doesn't call the TopicTagger" do
+      it "doesn't call the AutoEvaluation::TopicTagger" do
         described_class.new.perform(answer_id)
-        expect(TopicTagger).not_to have_received(:call)
+        expect(AutoEvaluation::TopicTagger).not_to have_received(:call)
       end
     end
 
@@ -96,8 +96,8 @@
         described_class.new.perform(answer.id)
       end
 
-      it "does not call the TopicTagger" do
-        expect(TopicTagger).not_to receive(:call)
+      it "does not call the AutoEvaluation::TopicTagger" do
+        expect(AutoEvaluation::TopicTagger).not_to receive(:call)
         described_class.new.perform(answer.id)
       end
     end
diff --git a/spec/lib/auto_evaluation/topic_tagger_spec.rb b/spec/lib/auto_evaluation/topic_tagger_spec.rb
@@ -1,4 +1,4 @@
-RSpec.describe TopicTagger, :aws_credentials_stubbed do
+RSpec.describe AutoEvaluation::TopicTagger, :aws_credentials_stubbed do
   describe ".call" do
     let(:message) { "This is a test message." }
 
@@ -7,7 +7,7 @@
     it "returns a results object with the expected topics" do
       result = described_class.call(message)
       expect(result)
-        .to be_a(TopicTagger::Result)
+        .to be_a(AutoEvaluation::TopicTagger::Result)
         .and have_attributes(
           primary_topic: "business",
           secondary_topic: "benefits",
diff --git a/spec/lib/tasks/evaluation_spec.rb b/spec/lib/tasks/evaluation_spec.rb
@@ -384,13 +384,13 @@
 
     it "outputs the response as JSON to stdout" do
       ClimateControl.modify(INPUT: input) do
-        result = TopicTagger::Result.new(
+        result = AutoEvaluation::TopicTagger::Result.new(
           primary_topic: "tax",
           secondary_topic: "benefits",
           metrics: {},
           llm_response: {},
         )
-        allow(TopicTagger).to receive(:call).with(input).and_return(result)
+        allow(AutoEvaluation::TopicTagger).to receive(:call).with(input).and_return(result)
 
         expect { Rake::Task[task_name].invoke }
           .to output("#{result.to_json}\n").to_stdout