From a3c1e4bb839a8c6d8b3502e86ea5ad14dcf6e3c7 Mon Sep 17 00:00:00 2001
From: David Gisbey <david.gisbey@digital.cabinet-office.gov.uk>
Date: Tue, 16 Dec 2025 10:47:52 +0000
Subject: [PATCH 1/7] Add AnswerRelevancyAggregate & AnswerRelevancyRun

This adds a migration to create the two new tables needed to store answer
relevancy metrics. It also adds the corresponding models and factories.

We will need to record multiple LLM responses and metrics for each
run, so I've included the LlmCallsRecordable module in the AnswerRelevancyRun
model.
---
 app/models/answer.rb                          |  1 +
 .../answer_relevancy_aggregate.rb             | 11 +++++++++
 .../answer_analysis/answer_relevancy_run.rb   | 11 +++++++++
 ...51216092915_add_answer_relevancy_tables.rb | 18 +++++++++++++++
 db/schema.rb                                  | 23 ++++++++++++++++++-
 .../answer_relevancy_aggregate_factory.rb     |  6 +++++
 .../factories/answer_relevancy_run_factory.rb |  7 ++++++
 .../answer_relevancy_run_spec.rb              |  5 ++++
 8 files changed, 81 insertions(+), 1 deletion(-)
 create mode 100644 app/models/answer_analysis/answer_relevancy_aggregate.rb
 create mode 100644 app/models/answer_analysis/answer_relevancy_run.rb
 create mode 100644 db/migrate/20251216092915_add_answer_relevancy_tables.rb
 create mode 100644 spec/factories/answer_relevancy_aggregate_factory.rb
 create mode 100644 spec/factories/answer_relevancy_run_factory.rb
 create mode 100644 spec/models/answer_analysis/answer_relevancy_run_spec.rb

diff --git a/app/models/answer.rb b/app/models/answer.rb
index f062f5960..e27b742e8 100644
--- a/app/models/answer.rb
+++ b/app/models/answer.rb
@@ -55,6 +55,7 @@ def self.response_for_question_routing_label(label)
   has_many :sources, -> { order(relevancy: :asc) }, class_name: "AnswerSource"
   has_one :feedback, class_name: "AnswerFeedback"
   has_one :topics, class_name: "AnswerAnalysis::Topics"
+  has_one :answer_relevancy_aggregate, class_name: "AnswerAnalysis::AnswerRelevancyAggregate"
 
   enum :status,
        {
diff --git a/app/models/answer_analysis/answer_relevancy_aggregate.rb b/app/models/answer_analysis/answer_relevancy_aggregate.rb
new file mode 100644
index 000000000..8c578e4bb
--- /dev/null
+++ b/app/models/answer_analysis/answer_relevancy_aggregate.rb
@@ -0,0 +1,11 @@
+module AnswerAnalysis
+  class AnswerRelevancyAggregate < ApplicationRecord
+    self.table_name = "answer_analysis_answer_relevancy_aggregates"
+
+    belongs_to :answer
+    has_many :runs,
+             -> { order(:created_at) },
+             class_name: "AnswerAnalysis::AnswerRelevancyRun",
+             foreign_key: :answer_analysis_answer_relevancy_aggregate_id
+  end
+end
diff --git a/app/models/answer_analysis/answer_relevancy_run.rb b/app/models/answer_analysis/answer_relevancy_run.rb
new file mode 100644
index 000000000..212ab36ff
--- /dev/null
+++ b/app/models/answer_analysis/answer_relevancy_run.rb
@@ -0,0 +1,11 @@
+module AnswerAnalysis
+  class AnswerRelevancyRun < ApplicationRecord
+    include LlmCallsRecordable
+
+    self.table_name = "answer_analysis_answer_relevancy_runs"
+
+    belongs_to :aggregate,
+               class_name: "AnswerAnalysis::AnswerRelevancyAggregate",
+               foreign_key: :answer_analysis_answer_relevancy_aggregate_id
+  end
+end
diff --git a/db/migrate/20251216092915_add_answer_relevancy_tables.rb b/db/migrate/20251216092915_add_answer_relevancy_tables.rb
new file mode 100644
index 000000000..900041796
--- /dev/null
+++ b/db/migrate/20251216092915_add_answer_relevancy_tables.rb
@@ -0,0 +1,18 @@
+class AddAnswerRelevancyTables < ActiveRecord::Migration[8.0]
+  def change
+    create_table :answer_analysis_answer_relevancy_aggregates, id: :uuid do |t|
+      t.decimal :mean_score, null: false
+      t.references :answer, type: :uuid, null: false, foreign_key: { on_delete: :cascade }, index: { unique: true }
+      t.timestamps
+    end
+
+    create_table :answer_analysis_answer_relevancy_runs, id: :uuid do |t|
+      t.decimal :score, null: false
+      t.string :reason, null: false
+      t.jsonb :llm_responses
+      t.jsonb :metrics
+      t.references :answer_analysis_answer_relevancy_aggregate, type: :uuid, null: false, foreign_key: { on_delete: :cascade }
+      t.timestamps
+    end
+  end
+end
diff --git a/db/schema.rb b/db/schema.rb
index a48c8be54..7e6a83f29 100644
--- a/db/schema.rb
+++ b/db/schema.rb
@@ -10,7 +10,7 @@
 #
 # It's strongly recommended that you check this file into your version control system.
 
-ActiveRecord::Schema[8.0].define(version: 2025_12_15_161508) do
+ActiveRecord::Schema[8.0].define(version: 2025_12_16_092915) do
   # These are extensions that must be enabled in order to support this database
   enable_extension "citext"
   enable_extension "pg_catalog.plpgsql"
@@ -24,6 +24,25 @@
   create_enum "guardrails_status", ["pass", "fail", "error"]
   create_enum "question_routing_label", ["about_mps", "advice_opinions_predictions", "character_fun", "genuine_rag", "gov_transparency", "greetings", "harmful_vulgar_controversy", "multi_questions", "negative_acknowledgement", "non_english", "personal_info", "positive_acknowledgement", "vague_acronym_grammar", "unclear_intent", "requires_account_data", "about_chat"]
 
+  create_table "answer_analysis_answer_relevancy_aggregates", id: :uuid, default: -> { "gen_random_uuid()" }, force: :cascade do |t|
+    t.decimal "mean_score", null: false
+    t.uuid "answer_id", null: false
+    t.datetime "created_at", null: false
+    t.datetime "updated_at", null: false
+    t.index ["answer_id"], name: "index_answer_analysis_answer_relevancy_aggregates_on_answer_id", unique: true
+  end
+
+  create_table "answer_analysis_answer_relevancy_runs", id: :uuid, default: -> { "gen_random_uuid()" }, force: :cascade do |t|
+    t.decimal "score", null: false
+    t.string "reason", null: false
+    t.jsonb "llm_responses"
+    t.jsonb "metrics"
+    t.uuid "answer_analysis_answer_relevancy_aggregate_id", null: false
+    t.datetime "created_at", null: false
+    t.datetime "updated_at", null: false
+    t.index ["answer_analysis_answer_relevancy_aggregate_id"], name: "idx_on_answer_analysis_answer_relevancy_aggregate_i_d9d79a637a"
+  end
+
   create_table "answer_analysis_topics", id: :uuid, default: -> { "gen_random_uuid()" }, force: :cascade do |t|
     t.string "primary_topic"
     t.string "secondary_topic"
@@ -170,6 +189,8 @@
     t.datetime "updated_at", null: false
   end
 
+  add_foreign_key "answer_analysis_answer_relevancy_aggregates", "answers", on_delete: :cascade
+  add_foreign_key "answer_analysis_answer_relevancy_runs", "answer_analysis_answer_relevancy_aggregates", on_delete: :cascade
   add_foreign_key "answer_analysis_topics", "answers", on_delete: :cascade
   add_foreign_key "answer_feedback", "answers", on_delete: :cascade
   add_foreign_key "answer_sources", "answer_source_chunks", on_delete: :restrict
diff --git a/spec/factories/answer_relevancy_aggregate_factory.rb b/spec/factories/answer_relevancy_aggregate_factory.rb
new file mode 100644
index 000000000..bd7fb500c
--- /dev/null
+++ b/spec/factories/answer_relevancy_aggregate_factory.rb
@@ -0,0 +1,6 @@
+FactoryBot.define do
+  factory :answer_relevancy_aggregate, class: "AnswerAnalysis::AnswerRelevancyAggregate" do
+    answer
+    mean_score { 0.5 }
+  end
+end
diff --git a/spec/factories/answer_relevancy_run_factory.rb b/spec/factories/answer_relevancy_run_factory.rb
new file mode 100644
index 000000000..a5f8bdcd3
--- /dev/null
+++ b/spec/factories/answer_relevancy_run_factory.rb
@@ -0,0 +1,7 @@
+FactoryBot.define do
+  factory :answer_relevancy_run, class: "AnswerAnalysis::AnswerRelevancyRun" do
+    association :aggregate, factory: :answer_relevancy_aggregate
+    score { 0.5 }
+    reason { "The answer was okay." }
+  end
+end
diff --git a/spec/models/answer_analysis/answer_relevancy_run_spec.rb b/spec/models/answer_analysis/answer_relevancy_run_spec.rb
new file mode 100644
index 000000000..28129c625
--- /dev/null
+++ b/spec/models/answer_analysis/answer_relevancy_run_spec.rb
@@ -0,0 +1,5 @@
+RSpec.describe AnswerAnalysis::AnswerRelevancyRun do
+  include_examples "llm calls recordable" do
+    let(:model) { build(:answer_relevancy_run) }
+  end
+end

From 73780fdd8bf1faa8379720fcb05a3f3fe2730724 Mon Sep 17 00:00:00 2001
From: David Gisbey <david.gisbey@digital.cabinet-office.gov.uk>
Date: Wed, 17 Dec 2025 14:30:39 +0000
Subject: [PATCH 2/7] Add additional bedrock stub to stub all answer relevancy
 calls

We're going to need to stub out these calls in multiple places so it makes
sense to have a single method that does all the stubbing for us.

I've also prepended stub_ to bedrock_invoke_model_openai_oss_tool_call.
All other stubs have this so it makes sense to be consistent.
---
 .../answer_relevancy/reason_generator_spec.rb |  2 +-
 .../statement_generator_spec.rb               |  2 +-
 .../verdicts_generator_spec.rb                |  2 +-
 .../auto_evaluation/answer_relevancy_spec.rb  | 56 ++++---------------
 .../bedrock_openai_oss_invoke_spec.rb         |  4 +-
 spec/lib/auto_evaluation/coherence_spec.rb    |  6 +-
 spec/support/stub_bedrock.rb                  | 51 ++++++++++++++++-
 7 files changed, 68 insertions(+), 55 deletions(-)

diff --git a/spec/lib/auto_evaluation/answer_relevancy/reason_generator_spec.rb b/spec/lib/auto_evaluation/answer_relevancy/reason_generator_spec.rb
index 5827c716f..c90314d47 100644
--- a/spec/lib/auto_evaluation/answer_relevancy/reason_generator_spec.rb
+++ b/spec/lib/auto_evaluation/answer_relevancy/reason_generator_spec.rb
@@ -23,7 +23,7 @@
     end
     let(:tools) { [prompts.fetch(:tool_spec)] }
     let!(:stub_bedrock) do
-      bedrock_invoke_model_openai_oss_tool_call(
+      stub_bedrock_invoke_model_openai_oss_tool_call(
         user_prompt,
         tools,
         reason_json,
diff --git a/spec/lib/auto_evaluation/answer_relevancy/statement_generator_spec.rb b/spec/lib/auto_evaluation/answer_relevancy/statement_generator_spec.rb
index 5cb63ac4f..bbd7b66a2 100644
--- a/spec/lib/auto_evaluation/answer_relevancy/statement_generator_spec.rb
+++ b/spec/lib/auto_evaluation/answer_relevancy/statement_generator_spec.rb
@@ -14,7 +14,7 @@
     end
     let(:tools) { [prompts.fetch(:tool_spec)] }
     let!(:stub_bedrock) do
-      bedrock_invoke_model_openai_oss_tool_call(
+      stub_bedrock_invoke_model_openai_oss_tool_call(
         user_prompt,
         tools,
         statements_json,
diff --git a/spec/lib/auto_evaluation/answer_relevancy/verdicts_generator_spec.rb b/spec/lib/auto_evaluation/answer_relevancy/verdicts_generator_spec.rb
index 2878b02f7..7c62709e7 100644
--- a/spec/lib/auto_evaluation/answer_relevancy/verdicts_generator_spec.rb
+++ b/spec/lib/auto_evaluation/answer_relevancy/verdicts_generator_spec.rb
@@ -21,7 +21,7 @@
     end
     let(:tools) { [prompts.fetch(:tool_spec)] }
     let!(:stub_bedrock) do
-      bedrock_invoke_model_openai_oss_tool_call(
+      stub_bedrock_invoke_model_openai_oss_tool_call(
         user_prompt,
         tools,
         verdicts_json,
diff --git a/spec/lib/auto_evaluation/answer_relevancy_spec.rb b/spec/lib/auto_evaluation/answer_relevancy_spec.rb
index 3375bb272..83ee4c09f 100644
--- a/spec/lib/auto_evaluation/answer_relevancy_spec.rb
+++ b/spec/lib/auto_evaluation/answer_relevancy_spec.rb
@@ -1,6 +1,5 @@
 RSpec.describe AutoEvaluation::AnswerRelevancy, :aws_credentials_stubbed do
   describe ".call" do
-    let(:prompts) { AutoEvaluation::Prompts.config.answer_relevancy }
     let(:question_message) { "This is a test question message." }
     let(:answer_message) { "This is a test answer message." }
     let(:question) { build(:question, message: question_message) }
@@ -8,21 +7,6 @@
 
     let(:statements) { ["This is the first statement.", "This is the second statement."] }
     let(:statements_json) { { statements: }.to_json }
-    let(:user_prompt_statements) do
-      sprintf(
-        prompts.fetch(:statements).fetch(:user_prompt),
-        answer: answer_message,
-      )
-    end
-    let(:statements_tools) { [prompts.fetch(:statements).fetch(:tool_spec)] }
-    let!(:statements_stub) do
-      bedrock_invoke_model_openai_oss_tool_call(
-        user_prompt_statements,
-        statements_tools,
-        statements_json,
-      )
-    end
-
     let(:verdicts) do
       [
         { "verdict" => "yes" },
@@ -30,40 +14,20 @@
       ]
     end
     let(:verdicts_json) { { verdicts: }.to_json }
-    let(:user_prompt_verdicts) do
-      sprintf(
-        prompts.fetch(:verdicts).fetch(:user_prompt),
-        question: question_message,
-        statements:,
-      )
-    end
-    let(:verdicts_tools) { [prompts.fetch(:verdicts).fetch(:tool_spec)] }
-    let!(:verdicts_stub) do
-      bedrock_invoke_model_openai_oss_tool_call(
-        user_prompt_verdicts,
-        verdicts_tools,
-        verdicts_json,
-      )
-    end
-
     let(:reason) { "This is the reason for the score." }
     let(:reason_json) { { reason: }.to_json }
-    let(:user_prompt_reason) do
-      sprintf(
-        prompts.fetch(:reason).fetch(:user_prompt),
-        score: 0.5,
-        unsuccessful_verdicts_reasons: ["The statement is irrelevant."],
-        question: question_message,
-      )
-    end
-    let(:reason_tools) { [prompts.fetch(:reason).fetch(:tool_spec)] }
-    let!(:reason_stub) do
-      bedrock_invoke_model_openai_oss_tool_call(
-        user_prompt_reason,
-        reason_tools,
-        reason_json,
+    let!(:answer_relevancy_stubs) do
+      stub_bedrock_invoke_model_openai_oss_answer_relevancy(
+        question_message:,
+        answer_message:,
+        statements_json:,
+        verdicts_json:,
+        reason_json:,
       )
     end
+    let(:statements_stub) { answer_relevancy_stubs[:statements] }
+    let(:verdicts_stub) { answer_relevancy_stubs[:verdicts] }
+    let(:reason_stub) { answer_relevancy_stubs[:reason] }
 
     it "returns a results object with the expected attributes" do
       allow(Clock).to receive(:monotonic_time)
diff --git a/spec/lib/auto_evaluation/bedrock_openai_oss_invoke_spec.rb b/spec/lib/auto_evaluation/bedrock_openai_oss_invoke_spec.rb
index f93cf1897..64e646953 100644
--- a/spec/lib/auto_evaluation/bedrock_openai_oss_invoke_spec.rb
+++ b/spec/lib/auto_evaluation/bedrock_openai_oss_invoke_spec.rb
@@ -21,7 +21,7 @@
       ]
     end
     let!(:stub) do
-      bedrock_invoke_model_openai_oss_tool_call(
+      stub_bedrock_invoke_model_openai_oss_tool_call(
         user_message,
         tools,
         { "response" => "Expected response." }.to_json,
@@ -57,7 +57,7 @@
     end
 
     it "raises an error if the response does not conform to the schema" do
-      bedrock_invoke_model_openai_oss_tool_call(
+      stub_bedrock_invoke_model_openai_oss_tool_call(
         user_message,
         tools,
         { "invalid_key" => "This does not conform to the schema." }.to_json,
diff --git a/spec/lib/auto_evaluation/coherence_spec.rb b/spec/lib/auto_evaluation/coherence_spec.rb
index 1f77a906d..f1a7b1799 100644
--- a/spec/lib/auto_evaluation/coherence_spec.rb
+++ b/spec/lib/auto_evaluation/coherence_spec.rb
@@ -18,7 +18,7 @@
 
     it "returns a results object with the expected attributes" do
       allow(Clock).to receive(:monotonic_time).and_return(200.0, 202.0)
-      stub = bedrock_invoke_model_openai_oss_tool_call(
+      stub = stub_bedrock_invoke_model_openai_oss_tool_call(
         user_prompt,
         tools,
         response_json,
@@ -55,7 +55,7 @@
         5 => 1.0,
       }.each do |rubric_score, expected_score|
         response_json = { score: rubric_score, reason: }.to_json
-        bedrock_invoke_model_openai_oss_tool_call(
+        stub_bedrock_invoke_model_openai_oss_tool_call(
           user_prompt,
           tools,
           response_json,
@@ -73,7 +73,7 @@
       let(:answer) { build(:answer, message: answer_message, rephrased_question: question_message) }
 
       it "uses the rephrased question in the prompt" do
-        stub = bedrock_invoke_model_openai_oss_tool_call(
+        stub = stub_bedrock_invoke_model_openai_oss_tool_call(
           user_prompt,
           tools,
           response_json,
diff --git a/spec/support/stub_bedrock.rb b/spec/support/stub_bedrock.rb
index c30747b7f..e6432b05a 100644
--- a/spec/support/stub_bedrock.rb
+++ b/spec/support/stub_bedrock.rb
@@ -47,7 +47,7 @@ def mock_titan_embedding(text, dimensions: Search::ChunkedContentRepository::TIT
     dimensions.times.map { random_generator.rand }
   end
 
-  def bedrock_invoke_model_openai_oss_tool_call(user_message, tools, content)
+  def stub_bedrock_invoke_model_openai_oss_tool_call(user_message, tools, content)
     request_body = {
       include_reasoning: false,
       messages: [
@@ -85,4 +85,53 @@ def bedrock_invoke_model_openai_oss_tool_call(user_message, tools, content)
       endpoint_regex: OPENAI_GPT_OSS_ENDPOINT_REGEX,
     )
   end
+
+  def stub_bedrock_invoke_model_openai_oss_answer_relevancy(question_message:,
+                                                            answer_message:,
+                                                            statements_json: { statements: ["Statement."] }.to_json,
+                                                            verdicts_json: { verdicts: [{ "verdict" => "yes" }] }.to_json,
+                                                            reason_json: { reason: "This is the reason for the score." }.to_json)
+    prompts = AutoEvaluation::Prompts.config.answer_relevancy
+
+    statements_user_prompt = sprintf(
+      prompts.fetch(:statements).fetch(:user_prompt),
+      answer: answer_message,
+    )
+    verdicts_user_prompt = sprintf(
+      prompts.fetch(:verdicts).fetch(:user_prompt),
+      question: question_message,
+      statements: JSON.parse(statements_json).fetch("statements"),
+    )
+    reason_user_prompt = sprintf(
+      prompts.fetch(:reason).fetch(:user_prompt),
+      score: 0.5,
+      unsuccessful_verdicts_reasons: ["The statement is irrelevant."],
+      question: question_message,
+    )
+
+    statements_tools = [prompts.fetch(:statements).fetch(:tool_spec)]
+    verdicts_tools = [prompts.fetch(:verdicts).fetch(:tool_spec)]
+    reason_tools = [prompts.fetch(:reason).fetch(:tool_spec)]
+
+    stubs = {}
+    stubs[:statements] = stub_bedrock_invoke_model_openai_oss_tool_call(
+      statements_user_prompt,
+      statements_tools,
+      statements_json,
+    )
+
+    stubs[:verdicts] = stub_bedrock_invoke_model_openai_oss_tool_call(
+      verdicts_user_prompt,
+      verdicts_tools,
+      verdicts_json,
+    )
+
+    stubs[:reason] = stub_bedrock_invoke_model_openai_oss_tool_call(
+      reason_user_prompt,
+      reason_tools,
+      reason_json,
+    )
+
+    stubs
+  end
 end

From f8fb85e3b40e3998fb20d75433dca3913d545f2f Mon Sep 17 00:00:00 2001
From: David Gisbey <david.gisbey@digital.cabinet-office.gov.uk>
Date: Mon, 5 Jan 2026 14:05:23 +0000
Subject: [PATCH 3/7] Add auto_evaluation_results_creatable concern

This adds a concern to encapsulate the logic for creating aggregate and run
records for metrics. It will be called from the various evaluation jobs
that require wisdom of the crowd.
---
 .../answer_relevancy_aggregate.rb             |  2 ++
 .../auto_evaluation_results_creatable.rb      | 27 ++++++++++++++++
 .../answer_relevancy_aggregate_spec.rb        |  5 +++
 ...o_evaluation_results_creatable_examples.rb | 31 +++++++++++++++++++
 4 files changed, 65 insertions(+)
 create mode 100644 app/models/concerns/auto_evaluation_results_creatable.rb
 create mode 100644 spec/models/answer_analysis/answer_relevancy_aggregate_spec.rb
 create mode 100644 spec/support/auto_evaluation_results_creatable_examples.rb

diff --git a/app/models/answer_analysis/answer_relevancy_aggregate.rb b/app/models/answer_analysis/answer_relevancy_aggregate.rb
index 8c578e4bb..891433893 100644
--- a/app/models/answer_analysis/answer_relevancy_aggregate.rb
+++ b/app/models/answer_analysis/answer_relevancy_aggregate.rb
@@ -1,5 +1,7 @@
 module AnswerAnalysis
   class AnswerRelevancyAggregate < ApplicationRecord
+    include AutoEvaluationResultsCreatable
+
     self.table_name = "answer_analysis_answer_relevancy_aggregates"
 
     belongs_to :answer
diff --git a/app/models/concerns/auto_evaluation_results_creatable.rb b/app/models/concerns/auto_evaluation_results_creatable.rb
new file mode 100644
index 000000000..c323d6939
--- /dev/null
+++ b/app/models/concerns/auto_evaluation_results_creatable.rb
@@ -0,0 +1,27 @@
+module AutoEvaluationResultsCreatable
+  extend ActiveSupport::Concern
+
+  class_methods do
+    def create_mean_aggregate_and_score_runs(answer, results)
+      mean_score = results.map { |result| result.score.to_d }.sum / results.size
+      aggregate = new(answer:, mean_score:)
+
+      results.each do |result|
+        run = aggregate.runs.build(
+          aggregate:,
+          score: result.score,
+          reason: result.reason,
+        )
+
+        result.llm_responses.stringify_keys.each do |name, llm_response|
+          run.assign_llm_response(name, llm_response)
+        end
+        result.metrics.stringify_keys.each do |name, metrics|
+          run.assign_metrics(name, metrics)
+        end
+      end
+
+      aggregate.save!
+    end
+  end
+end
diff --git a/spec/models/answer_analysis/answer_relevancy_aggregate_spec.rb b/spec/models/answer_analysis/answer_relevancy_aggregate_spec.rb
new file mode 100644
index 000000000..07f173ab0
--- /dev/null
+++ b/spec/models/answer_analysis/answer_relevancy_aggregate_spec.rb
@@ -0,0 +1,5 @@
+RSpec.describe AnswerAnalysis::AnswerRelevancyAggregate do
+  include_examples "auto_evaluation results creatable",
+                   :answer_relevancy_aggregate,
+                   AnswerAnalysis::AnswerRelevancyRun
+end
diff --git a/spec/support/auto_evaluation_results_creatable_examples.rb b/spec/support/auto_evaluation_results_creatable_examples.rb
new file mode 100644
index 000000000..9d019a8c3
--- /dev/null
+++ b/spec/support/auto_evaluation_results_creatable_examples.rb
@@ -0,0 +1,31 @@
+shared_examples "auto_evaluation results creatable" do |aggregate_association, run_class|
+  describe ".create_mean_aggregate_and_score_runs" do
+    let(:first_run_result) { build(:auto_evaluation_score_result, score: 0.8) }
+    let(:second_run_result) { build(:auto_evaluation_score_result, score: 0.9) }
+    let(:results) { [first_run_result, second_run_result] }
+    let(:answer) { create(:answer) }
+    let(:answer_id) { answer.id }
+
+    it "creates an aggregate with correct mean score" do
+      answer = Answer.includes(aggregate_association).find(answer_id)
+      expect { described_class.create_mean_aggregate_and_score_runs(answer, results) }
+        .to change(described_class, :count).by(1)
+      # Read the aggregate through the association passed in, so the examples work for any including spec.
+      answer = Answer.includes(aggregate_association).find(answer_id)
+      expect(answer.public_send(aggregate_association).mean_score).to eq(0.85)
+    end
+
+    it "creates runs with correct attributes and associations" do
+      answer = Answer.includes("#{aggregate_association}": :runs).find(answer_id)
+
+      expect {
+        described_class.create_mean_aggregate_and_score_runs(answer, results)
+      }.to change(run_class, :count).by(2)
+
+      first_run, second_run = answer.reload.public_send(aggregate_association).runs.order(:created_at)
+
+      expect(first_run).to have_attributes(first_run_result.to_h.except(:success))
+      expect(second_run).to have_attributes(second_run_result.to_h.except(:success))
+    end
+  end
+end

From ccbda92e25991864a0ed5dd709e14d672290d9ac Mon Sep 17 00:00:00 2001
From: David Gisbey <david.gisbey@digital.cabinet-office.gov.uk>
Date: Tue, 6 Jan 2026 10:33:09 +0000
Subject: [PATCH 4/7] Add AnswerAnalysis AnswerRelevancy and Base jobs

This adds the BaseJob and AnswerRelevancyJob. The AnswerRelevancyJob
handles:
- making calls to the AnswerRelevancy class
- compiling the results
- calling AnswerRelevancyAggregate.create_mean_aggregate_and_score_runs to
  delegate record creation to the AnswerRelevancyAggregate model

The BaseJob is used to store shared functionality for future metric jobs.
The next commit will integrate this job into the analysis workflow.

As part of this commit I've updated the ScoreResult factory to use a
sequence to build unique attributes for the reason, llm_responses and
metrics fields. This ensures that we are correctly persisting all the
attributes returned from the evaluation classes.

I've also updated the answer relevancy scoring method to use BigDecimal
as part of this commit. Without this I was forced to use round(2) in
the tests to avoid rounding issues caused by floats.
---
 .../answer_analysis/answer_relevancy_job.rb   |  24 +++
 app/jobs/answer_analysis/base_job.rb          |  19 +++
 lib/auto_evaluation/answer_relevancy.rb       |   2 +-
 .../factories/auto_evaluation_score_result.rb |   6 +-
 .../answer_relevancy_job_spec.rb              | 153 ++++++++++++++++++
 5 files changed, 200 insertions(+), 4 deletions(-)
 create mode 100644 app/jobs/answer_analysis/answer_relevancy_job.rb
 create mode 100644 app/jobs/answer_analysis/base_job.rb
 create mode 100644 spec/jobs/answer_analysis/answer_relevancy_job_spec.rb

diff --git a/app/jobs/answer_analysis/answer_relevancy_job.rb b/app/jobs/answer_analysis/answer_relevancy_job.rb
new file mode 100644
index 000000000..e9b983c01
--- /dev/null
+++ b/app/jobs/answer_analysis/answer_relevancy_job.rb
@@ -0,0 +1,24 @@
+module AnswerAnalysis
+  class AnswerRelevancyJob < BaseJob
+    def perform(answer_id)
+      return unless eligible_for_answer_analysis?(answer_id)
+
+      answer = Answer.includes(:question, :answer_relevancy_aggregate).find(answer_id)
+      return logger.warn(aggregate_exists_warn_message(answer.id)) if answer.answer_relevancy_aggregate.present?
+
+      results = NUMBER_OF_RUNS.times.map { AutoEvaluation::AnswerRelevancy.call(answer) }
+
+      begin
+        AnswerAnalysis::AnswerRelevancyAggregate.create_mean_aggregate_and_score_runs(answer, results)
+      rescue ActiveRecord::RecordNotUnique
+        logger.warn(aggregate_exists_warn_message(answer.id))
+      end
+    end
+
+  private
+
+    def aggregate_exists_warn_message(answer_id)
+      "Answer #{answer_id} has already been evaluated for relevancy"
+    end
+  end
+end
diff --git a/app/jobs/answer_analysis/base_job.rb b/app/jobs/answer_analysis/base_job.rb
new file mode 100644
index 000000000..508fc7b6f
--- /dev/null
+++ b/app/jobs/answer_analysis/base_job.rb
@@ -0,0 +1,19 @@
+module AnswerAnalysis
+  class BaseJob < ApplicationJob
+    NUMBER_OF_RUNS = 3
+    MAX_RETRIES = 5
+    retry_on Aws::Errors::ServiceError, wait: 1.minute, attempts: MAX_RETRIES
+
+  private
+
+    def eligible_for_answer_analysis?(answer_id)
+      eligible = Answer.status_answered.exists?(id: answer_id)
+
+      unless eligible
+        logger.warn("Couldn't find an answer #{answer_id} that was eligible for auto-evaluation")
+      end
+
+      eligible
+    end
+  end
+end
diff --git a/lib/auto_evaluation/answer_relevancy.rb b/lib/auto_evaluation/answer_relevancy.rb
index d07d82d0a..fde51c8e8 100644
--- a/lib/auto_evaluation/answer_relevancy.rb
+++ b/lib/auto_evaluation/answer_relevancy.rb
@@ -69,7 +69,7 @@ def calculate_score(verdicts)
     return 1.0 if verdict_count.zero?
 
     relevant_count = verdicts.count { |verdict| verdict["verdict"].strip.downcase != "no" }
-    relevant_count.to_f / verdict_count
+    relevant_count.to_d / verdict_count
   end
 
   def build_maximum_score_result(reason:, llm_responses:, metrics:)
diff --git a/spec/factories/auto_evaluation_score_result.rb b/spec/factories/auto_evaluation_score_result.rb
index cfb4af097..8bc5f3781 100644
--- a/spec/factories/auto_evaluation_score_result.rb
+++ b/spec/factories/auto_evaluation_score_result.rb
@@ -3,10 +3,10 @@
     skip_create
 
     score { 0.85.to_d }
-    reason { "Most statements are relevant." }
+    sequence(:reason) { |n| "Reason #{n}" }
     success { true }
-    llm_responses { {} }
-    metrics { {} }
+    sequence(:llm_responses) { |n| { "llm_response" => { "reason" => "Reason #{n}" } } }
+    sequence(:metrics) { |n| { "llm_response" => { "duration" => n } } }
 
     initialize_with { new(**attributes) }
   end
diff --git a/spec/jobs/answer_analysis/answer_relevancy_job_spec.rb b/spec/jobs/answer_analysis/answer_relevancy_job_spec.rb
new file mode 100644
index 000000000..f96b15d23
--- /dev/null
+++ b/spec/jobs/answer_analysis/answer_relevancy_job_spec.rb
@@ -0,0 +1,153 @@
+RSpec.describe AnswerAnalysis::AnswerRelevancyJob do
+  include ActiveJob::TestHelper
+
+  let(:answer) { create(:answer) }
+  let(:question) { answer.question }
+  let(:results) do
+    [
+      build(:auto_evaluation_score_result, score: 0.8),
+      build(:auto_evaluation_score_result, score: 0.7),
+      build(:auto_evaluation_score_result, score: 0.9),
+    ]
+  end
+
+  before do
+    allow(AutoEvaluation::AnswerRelevancy)
+      .to receive(:call).and_return(*results)
+  end
+
+  it_behaves_like "a job in queue", "default"
+
+  describe "#perform" do
+    it "calls AutoEvaluation::AnswerRelevancy the configured number of times with the correct arguments" do
+      described_class.new.perform(answer.id)
+
+      expect(AutoEvaluation::AnswerRelevancy)
+        .to have_received(:call)
+        .with(answer)
+        .exactly(AnswerAnalysis::BaseJob::NUMBER_OF_RUNS).times
+    end
+
+    it "creates answer relevancy aggregate with the correct score" do
+      expect {
+        described_class.new.perform(answer.id)
+      }.to change(AnswerAnalysis::AnswerRelevancyAggregate, :count).by(1)
+      answer = Answer.includes(:answer_relevancy_aggregate)
+                     .find(AnswerAnalysis::AnswerRelevancyAggregate.last.answer_id)
+      expect(answer.answer_relevancy_aggregate.mean_score).to eq(0.8)
+    end
+
+    it "creates answer relevancy runs for each result" do
+      expect {
+        described_class.new.perform(answer.id)
+      }.to change(AnswerAnalysis::AnswerRelevancyRun, :count).by(results.count)
+
+      answer = Answer.includes(answer_relevancy_aggregate: :runs)
+                     .find(AnswerAnalysis::AnswerRelevancyAggregate.last.answer_id)
+
+      results.each_with_index do |result, index|
+        expect(answer.answer_relevancy_aggregate.runs[index])
+          .to have_attributes(result.to_h.except(:success))
+      end
+    end
+
+    context "when the answer has a rephrased_question" do
+      let(:rephrased_question) { "This is a rephrased_question" }
+
+      it "passes the rephrased question to AutoEvaluation::AnswerRelevancy as the question_message" do
+        answer = create(:answer, rephrased_question: rephrased_question)
+
+        described_class.new.perform(answer.id)
+
+        expect(AutoEvaluation::AnswerRelevancy)
+          .to have_received(:call)
+          .with(answer)
+          .exactly(AnswerAnalysis::BaseJob::NUMBER_OF_RUNS).times
+      end
+    end
+
+    context "when the answer does not exist" do
+      let(:answer_id) { 999 }
+
+      it "logs a warning" do
+        expect(described_class.logger)
+          .to receive(:warn)
+          .with("Couldn't find an answer 999 that was eligible for auto-evaluation")
+
+        described_class.new.perform(answer_id)
+      end
+
+      it "doesn't call AutoEvaluation::AnswerRelevancy" do
+        described_class.new.perform(answer_id)
+        expect(AutoEvaluation::AnswerRelevancy).not_to have_received(:call)
+      end
+    end
+
+    context "when answer relevancy has already been evaluated" do
+      let(:aggregate) { create(:answer_relevancy_aggregate) }
+      let(:answer) { aggregate.answer }
+
+      it "logs a warning" do
+        expect(described_class.logger)
+          .to receive(:warn)
+          .with("Answer #{answer.id} has already been evaluated for relevancy")
+
+        described_class.new.perform(answer.id)
+      end
+
+      it "doesn't call AutoEvaluation::AnswerRelevancy" do
+        described_class.new.perform(answer.id)
+        expect(AutoEvaluation::AnswerRelevancy).not_to have_received(:call)
+      end
+    end
+
+    context "when aggregate data is persisted mid job" do
+      before do
+        allow(AnswerAnalysis::AnswerRelevancyAggregate)
+          .to receive(:create_mean_aggregate_and_score_runs)
+          .with(answer, anything)
+          .and_raise(ActiveRecord::RecordNotUnique)
+      end
+
+      it "logs a warning" do
+        expect(described_class.logger)
+          .to receive(:warn)
+          .with("Answer #{answer.id} has already been evaluated for relevancy")
+
+        described_class.new.perform(answer.id)
+      end
+    end
+
+    context "when the AnswerRelevancy metric raises an Aws::Errors::ServiceError" do
+      it "retries the job the max number of times" do
+        allow(AutoEvaluation::AnswerRelevancy)
+          .to receive(:call)
+          .and_raise(Aws::Errors::ServiceError.new(nil, "error"))
+
+        described_class.perform_later(answer.id)
+
+        assert_performed_jobs described_class::MAX_RETRIES do
+          expect { perform_enqueued_jobs }
+            .to raise_error(Aws::Errors::ServiceError)
+        end
+      end
+    end
+
+    context "when the answer is not eligible for auto-evaluation" do
+      let(:answer) { create(:answer, status: Answer.statuses.except(:answered).keys.sample) }
+
+      it "logs a warning message" do
+        expect(described_class.logger)
+          .to receive(:warn)
+          .with("Couldn't find an answer #{answer.id} that was eligible for auto-evaluation")
+
+        described_class.new.perform(answer.id)
+      end
+
+      it "does not call AutoEvaluation::AnswerRelevancy" do
+        expect(AutoEvaluation::AnswerRelevancy).not_to receive(:call)
+        described_class.new.perform(answer.id)
+      end
+    end
+  end
+end

From 9ace3164ea844a7bd1d98656f6df152d02a34898 Mon Sep 17 00:00:00 2001
From: David Gisbey <david.gisbey@digital.cabinet-office.gov.uk>
Date: Tue, 6 Jan 2026 10:41:48 +0000
Subject: [PATCH 5/7] Integrate Answer Relevancy Analysis into analysis
 workflow

This updates the compose answer job to call the answer relevancy job
after an answer has been successfully composed and persisted.
---
 app/jobs/compose_answer_job.rb                            | 7 ++++++-
 spec/jobs/compose_answer_job_spec.rb                      | 6 ++++++
 spec/requests/api/v1/conversation_flow_spec.rb            | 1 +
 spec/system/conversation_js_features_spec.rb              | 4 ++++
 .../conversation_with_claude_structured_answer_spec.rb    | 8 ++++++++
 ...nversation_with_open_ai_with_structured_answer_spec.rb | 8 ++++++++
 .../user_conversation_activity_is_shown_in_admin_spec.rb  | 4 ++++
 7 files changed, 37 insertions(+), 1 deletion(-)

diff --git a/app/jobs/compose_answer_job.rb b/app/jobs/compose_answer_job.rb
index e3a97c442..e4a9863fb 100644
--- a/app/jobs/compose_answer_job.rb
+++ b/app/jobs/compose_answer_job.rb
@@ -14,6 +14,11 @@ def perform(question_id)
       logger.warn("Already an answer created for #{question_id}")
     end
 
-    AnswerAnalysis::TagTopicsJob.perform_later(answer.id) if answer.persisted?
+    if answer.persisted?
+      # TODO: Once we've added a few metrics we should move these to a single job that
+      # kicks off all analysis jobs.
+      AnswerAnalysis::TagTopicsJob.perform_later(answer.id)
+      AnswerAnalysis::AnswerRelevancyJob.perform_later(answer.id)
+    end
   end
 end
diff --git a/spec/jobs/compose_answer_job_spec.rb b/spec/jobs/compose_answer_job_spec.rb
index e7e09fd6a..24d907807 100644
--- a/spec/jobs/compose_answer_job_spec.rb
+++ b/spec/jobs/compose_answer_job_spec.rb
@@ -6,6 +6,7 @@
   before do
     allow(AnswerComposition::Composer).to receive(:call).and_return(returned_answer)
     allow(AnswerAnalysis::TagTopicsJob).to receive(:perform_later)
+    allow(AnswerAnalysis::AnswerRelevancyJob).to receive(:perform_later)
   end
 
   it_behaves_like "a job in queue", "answer"
@@ -22,6 +23,11 @@
       expect(AnswerAnalysis::TagTopicsJob).to have_received(:perform_later).with(returned_answer.id)
     end
 
+    it "calls the AnswerAnalysis::AnswerRelevancyJob with the answer_id" do
+      described_class.new.perform(question.id)
+      expect(AnswerAnalysis::AnswerRelevancyJob).to have_received(:perform_later).with(returned_answer.id)
+    end
+
     context "when the question has already been answered" do
       let(:question) { create(:question, :with_answer) }
 
diff --git a/spec/requests/api/v1/conversation_flow_spec.rb b/spec/requests/api/v1/conversation_flow_spec.rb
index c6a6e7b90..214d05688 100644
--- a/spec/requests/api/v1/conversation_flow_spec.rb
+++ b/spec/requests/api/v1/conversation_flow_spec.rb
@@ -74,6 +74,7 @@ def when_i_create_a_conversation
       )
     end
     allow(AnswerAnalysis::TagTopicsJob).to receive(:perform_later)
+    allow(AnswerAnalysis::AnswerRelevancyJob).to receive(:perform_later)
 
     post api_v1_create_conversation_path,
          params: { user_question: "What is the capital of France?" },
diff --git a/spec/system/conversation_js_features_spec.rb b/spec/system/conversation_js_features_spec.rb
index 7d1893393..c2e8bba1f 100644
--- a/spec/system/conversation_js_features_spec.rb
+++ b/spec/system/conversation_js_features_spec.rb
@@ -282,6 +282,10 @@ def stubs_for_mock_answer(question,
 
     stub_claude_output_guardrails(answer)
     stub_claude_messages_topic_tagger(question)
+    stub_bedrock_invoke_model_openai_oss_answer_relevancy(
+      question_message: question,
+      answer_message: answer,
+    )
   end
 
   def then_i_cant_see_the_clear_chat_link
diff --git a/spec/system/conversation_with_claude_structured_answer_spec.rb b/spec/system/conversation_with_claude_structured_answer_spec.rb
index 911f97ff5..c6bc2fb0e 100644
--- a/spec/system/conversation_with_claude_structured_answer_spec.rb
+++ b/spec/system/conversation_with_claude_structured_answer_spec.rb
@@ -51,6 +51,10 @@ def when_the_first_answer_is_generated
     stub_claude_structured_answer(@first_question, @first_answer)
     stub_claude_output_guardrails(@first_answer, "False | None")
     stub_claude_messages_topic_tagger(@first_question)
+    stub_bedrock_invoke_model_openai_oss_answer_relevancy(
+      question_message: @first_question,
+      answer_message: @first_answer,
+    )
 
     execute_queued_sidekiq_jobs
   end
@@ -83,6 +87,10 @@ def when_the_second_answer_is_generated
     stub_claude_structured_answer(rephrased_question, @second_answer)
     stub_claude_output_guardrails(@second_answer, "False | None")
     stub_claude_messages_topic_tagger(rephrased_question)
+    stub_bedrock_invoke_model_openai_oss_answer_relevancy(
+      question_message: rephrased_question,
+      answer_message: @second_answer,
+    )
 
     execute_queued_sidekiq_jobs
   end
diff --git a/spec/system/conversation_with_open_ai_with_structured_answer_spec.rb b/spec/system/conversation_with_open_ai_with_structured_answer_spec.rb
index cc157d940..4def8e862 100644
--- a/spec/system/conversation_with_open_ai_with_structured_answer_spec.rb
+++ b/spec/system/conversation_with_open_ai_with_structured_answer_spec.rb
@@ -55,6 +55,10 @@ def when_the_first_answer_is_generated
     )
     stub_openai_output_guardrail("Lots of tax.")
     stub_claude_messages_topic_tagger(@first_question)
+    stub_bedrock_invoke_model_openai_oss_answer_relevancy(
+      question_message: @first_question,
+      answer_message: "Lots of tax.",
+    )
 
     execute_queued_sidekiq_jobs
   end
@@ -75,6 +79,10 @@ def when_the_second_answer_is_generated
     )
     stub_openai_output_guardrail("Even more tax.")
     stub_claude_messages_topic_tagger(rephrased_question)
+    stub_bedrock_invoke_model_openai_oss_answer_relevancy(
+      question_message: rephrased_question,
+      answer_message: "Even more tax.",
+    )
 
     execute_queued_sidekiq_jobs
   end
diff --git a/spec/system/user_conversation_activity_is_shown_in_admin_spec.rb b/spec/system/user_conversation_activity_is_shown_in_admin_spec.rb
index 06caf458c..5176eb45f 100644
--- a/spec/system/user_conversation_activity_is_shown_in_admin_spec.rb
+++ b/spec/system/user_conversation_activity_is_shown_in_admin_spec.rb
@@ -52,6 +52,10 @@ def and_the_answer_is_generated
     stub_claude_structured_answer(@question, @answer)
     stub_claude_output_guardrails(@answer, "False | None")
     stub_claude_messages_topic_tagger(@question)
+    stub_bedrock_invoke_model_openai_oss_answer_relevancy(
+      question_message: @question,
+      answer_message: @answer,
+    )
 
     execute_queued_sidekiq_jobs
   end

From 6f363628b756796c4d12a3499b84de85e4e56683 Mon Sep 17 00:00:00 2001
From: David Gisbey <david.gisbey@digital.cabinet-office.gov.uk>
Date: Wed, 17 Dec 2025 17:52:15 +0000
Subject: [PATCH 6/7] Expose answer relevancy metrics in admin UI

I've added an additional tab for answer relevancy metrics in the admin
interface on the question show page.

My thinking is that if we don't split the metrics out into their own
tabs, the page will get incredibly noisy. Separate tabs make it easier
to navigate.

Due to this, I've renamed the analysis tab to topics.
---
 app/controllers/admin/questions_controller.rb |  2 +-
 app/models/answer.rb                          |  4 +
 .../admin/questions/_analysis_tab.html.erb    | 80 +++++++++++--------
 ...generic_aggregate_auto_evaluation.html.erb | 75 +++++++++++++++++
 app/views/admin/questions/show.html.erb       |  3 +-
 spec/models/answer_spec.rb                    | 19 +++++
 spec/requests/admin/questions_spec.rb         | 56 ++++++++++++-
 ...rsation_activity_is_shown_in_admin_spec.rb |  5 ++
 8 files changed, 206 insertions(+), 38 deletions(-)
 create mode 100644 app/views/admin/questions/_generic_aggregate_auto_evaluation.html.erb

diff --git a/app/controllers/admin/questions_controller.rb b/app/controllers/admin/questions_controller.rb
index 6fa05a276..229e9b894 100644
--- a/app/controllers/admin/questions_controller.rb
+++ b/app/controllers/admin/questions_controller.rb
@@ -7,7 +7,7 @@ def index
   def show
     question_scope = Question.includes(
       conversation: :signon_user,
-      answer: [{ sources: :chunk }, :feedback, :topics],
+      answer: [{ sources: :chunk }, :feedback, :topics, { answer_relevancy_aggregate: :runs }],
     )
 
     @question = question_scope.find(params[:id])
diff --git a/app/models/answer.rb b/app/models/answer.rb
index e27b742e8..6c74f931e 100644
--- a/app/models/answer.rb
+++ b/app/models/answer.rb
@@ -197,4 +197,8 @@ def group_used_answer_sources_by_base_path
       }
     end
   end
+
+  def has_analysis?
+    topics.present? || answer_relevancy_aggregate.present?
+  end
 end
diff --git a/app/views/admin/questions/_analysis_tab.html.erb b/app/views/admin/questions/_analysis_tab.html.erb
index 21d3d23cd..337dc664e 100644
--- a/app/views/admin/questions/_analysis_tab.html.erb
+++ b/app/views/admin/questions/_analysis_tab.html.erb
@@ -1,41 +1,51 @@
-<%= render "govuk_publishing_components/components/summary_list", {
-  items: [
-    {
-      field: "Primary topic",
-      value: topics.primary_topic&.humanize,
-    },
-    {
-      field: "Secondary topic",
-      value: topics.secondary_topic&.humanize,
-    },
-  ],
-} %>
+<% if topics.present? %>
+  <%= render "govuk_publishing_components/components/summary_list", {
+    title: "Topics",
+    heading_size: "l",
+    heading_level: 2,
+    margin_bottom: 4,
+    items: [
+      {
+        field: "Primary topic",
+        value: topics.primary_topic.humanize,
+      },
+      {
+        field: "Secondary topic",
+        value: topics.secondary_topic&.humanize,
+      },
+    ],
+  } %>
 
-<% if topics.llm_responses.present? %>
-  <%= render "govuk_publishing_components/components/details", {
-    title: "LLM responses",
-  } do %>
-    <% topics.llm_responses.each do |namespace, response| %>
-      <h3 class="govuk-heading-m"><%= namespace %></h3>
-      <p class="govuk-body">
-        <%= render("components/code_snippet", content: JSON.pretty_generate(response)) %>
-      </p>
+  <% if topics.llm_responses.present? %>
+    <%= render "govuk_publishing_components/components/details", {
+      title: "LLM responses",
+    } do %>
+      <% topics.llm_responses.each do |namespace, response| %>
+        <h3 class="govuk-heading-m"><%= namespace %></h3>
+        <p class="govuk-body">
+          <%= render("components/code_snippet", content: JSON.pretty_generate(response)) %>
+        </p>
+      <% end %>
     <% end %>
   <% end %>
-<% end %>
 
-<% if topics.metrics.present? %>
-  <%= render "govuk_publishing_components/components/details", {
-    title: "Metrics",
-  } do %>
-    <%= render "govuk_publishing_components/components/summary_list", {
-      items: topics.metrics.map do |metric, value|
-        {
-          field: metric,
-          value: value,
-        }
-      end,
-      borderless: true,
-    } %>
+  <% if topics.metrics.present? %>
+    <%= render "govuk_publishing_components/components/details", {
+      title: "Metrics",
+    } do %>
+      <%= render "govuk_publishing_components/components/summary_list", {
+        items: topics.metrics.map do |metric, value|
+          {
+            field: metric,
+            value: value,
+          }
+        end,
+        borderless: true,
+      } %>
+    <% end %>
   <% end %>
 <% end %>
+
+<% if answer_relevancy_aggregate.present? %>
+  <%= render "generic_aggregate_auto_evaluation", aggregate: answer_relevancy_aggregate, title: "Answer relevancy" %>
+<% end %>
diff --git a/app/views/admin/questions/_generic_aggregate_auto_evaluation.html.erb b/app/views/admin/questions/_generic_aggregate_auto_evaluation.html.erb
new file mode 100644
index 000000000..f84f4868f
--- /dev/null
+++ b/app/views/admin/questions/_generic_aggregate_auto_evaluation.html.erb
@@ -0,0 +1,75 @@
+<%
+  items = [
+    {
+      field: "Mean score",
+      value: aggregate.mean_score,
+    },
+  ]
+
+  items += aggregate.runs.flat_map.with_index(1) do |run, index|
+    [
+      { field: "Run #{index} score", value: run.score },
+      { field: "Run #{index} reason", value: run.reason },
+    ]
+  end
+%>
+
+<%= render "govuk_publishing_components/components/summary_list", {
+  title:,
+  heading_level: 2,
+  margin_bottom: 4,
+  heading_size: "l",
+  items: items,
+} %>
+
+<%= render "govuk_publishing_components/components/details", {
+  title: "LLM responses",
+} do %>
+  <% aggregate.runs.each.with_index(1) do |run, index| %>
+    <%= render "govuk_publishing_components/components/heading", {
+      text: "Run #{index}",
+      font_size: "m",
+      heading_level: 2,
+      margin_bottom: 4,
+    } %>
+
+    <% run.llm_responses.each do |namespace, response| %>
+      <%= render "govuk_publishing_components/components/heading", {
+        text: namespace.capitalize,
+        font_size: "s",
+        heading_level: 3,
+      } %>
+
+      <p class="govuk-body">
+        <%= render("components/code_snippet", content: JSON.pretty_generate(response)) %>
+      </p>
+    <% end %>
+  <% end %>
+<% end %>
+
+<%= render "govuk_publishing_components/components/details", {
+  title: "Metrics",
+} do %>
+  <% aggregate.runs.each.with_index(1) do |run, index| %>
+    <%= render "govuk_publishing_components/components/heading", {
+      text: "Run #{index}",
+      font_size: "m",
+      heading_level: 2,
+    } %>
+
+    <% run.metrics.sort.each do |namespace, metrics| %>
+      <%= render "govuk_publishing_components/components/summary_list", {
+        title: namespace.capitalize,
+        items: metrics.map do |metric, value|
+          {
+            field: metric,
+            value: value,
+          }
+        end,
+        borderless: true,
+        heading_size: "s",
+        margin_bottom: 6,
+      } %>
+    <% end %>
+  <% end %>
+<% end %>
diff --git a/app/views/admin/questions/show.html.erb b/app/views/admin/questions/show.html.erb
index 724f6cff5..0dd09c4dc 100644
--- a/app/views/admin/questions/show.html.erb
+++ b/app/views/admin/questions/show.html.erb
@@ -39,8 +39,9 @@ content_for(:active_navigation_item, admin_questions_path)
     content: render(
       "analysis_tab",
       topics: @answer.topics,
+      answer_relevancy_aggregate: @answer.answer_relevancy_aggregate,
     ),
-  } if @answer&.topics.present?
+  } if @answer&.has_analysis?
 %>
 
 <div class="govuk-grid-row">
diff --git a/spec/models/answer_spec.rb b/spec/models/answer_spec.rb
index 95f4cfb14..ac019bbea 100644
--- a/spec/models/answer_spec.rb
+++ b/spec/models/answer_spec.rb
@@ -353,4 +353,23 @@
       end
     end
   end
+
+  describe "#has_analysis?" do
+    it "returns true if topics are present" do
+      answer = build(:answer, :with_topics)
+      expect(answer.has_analysis?).to be(true)
+    end
+
+    it "returns true if answer_relevancy_aggregate is present" do
+      answer = build(
+        :answer, answer_relevancy_aggregate: build(:answer_relevancy_aggregate)
+      )
+      expect(answer.has_analysis?).to be(true)
+    end
+
+    it "returns false if no analysis is present" do
+      answer = build(:answer)
+      expect(answer.has_analysis?).to be(false)
+    end
+  end
 end
diff --git a/spec/requests/admin/questions_spec.rb b/spec/requests/admin/questions_spec.rb
index 26cbf26e4..28cb117d0 100644
--- a/spec/requests/admin/questions_spec.rb
+++ b/spec/requests/admin/questions_spec.rb
@@ -284,7 +284,7 @@
         .and have_content('"id": "call_dqGpbb39drQDafLsjDLtnbGD"')
     end
 
-    it "doesn't render the tabs component when there is no analysis" do
+    it "doesn't render the tabs component when there are no topics or auto-eval aggregate data" do
       question = create(:question, :with_answer)
       get admin_show_question_path(question)
 
@@ -361,6 +361,60 @@
          .and have_selector("#analysis-tab", text: topics.secondary_topic.capitalize)
       end
     end
+
+    context "when answer relevancy aggregate data is present" do
+      let(:run) do
+        create(
+          :answer_relevancy_run,
+          score: 0.85,
+          reason: "The answer is relevant to the question.",
+          llm_responses: {
+            "statements" => { "statements" => ["The answer is relevant."] },
+            "verdicts" => { "verdicts" => [{ "verdict" => "yes" }] },
+          },
+          metrics: {
+            "statements" => { duration: 1.55556 },
+            "verdicts" => { duration: 1.44445 },
+          },
+        )
+      end
+      let!(:aggregate) do
+        create(
+          :answer_relevancy_aggregate,
+          runs: [run],
+        )
+      end
+      let(:question) { aggregate.answer.question }
+
+      it "renders the answer relevancy aggregate and run details" do
+        get admin_show_question_path(question)
+
+        expect(response.body.squish)
+          .to have_content("Answer relevancy")
+          .and have_content("Run 1 score")
+          .and have_content("0.85")
+          .and have_content("Run 1 reason")
+          .and have_content("The answer is relevant to the question.")
+      end
+
+      it "renders the runs llm responses" do
+        get admin_show_question_path(question)
+
+        expect(response.body.squish)
+          .to have_content('{ "statements": [ "The answer is relevant." ] }')
+          .and have_content('{ "verdicts": [ { "verdict": "yes" } ] }')
+      end
+
+      it "renders the runs metrics" do
+        get admin_show_question_path(question)
+
+        expect(response.body.squish)
+          .to have_content("Statements")
+          .and have_content(/duration.*1\.55556/)
+          .and have_content("Verdicts")
+          .and have_content(/duration.*1\.44445/)
+      end
+    end
   end
 
   def expect_unprocessable_content_with_date_errors
diff --git a/spec/system/user_conversation_activity_is_shown_in_admin_spec.rb b/spec/system/user_conversation_activity_is_shown_in_admin_spec.rb
index 5176eb45f..6072a4b2c 100644
--- a/spec/system/user_conversation_activity_is_shown_in_admin_spec.rb
+++ b/spec/system/user_conversation_activity_is_shown_in_admin_spec.rb
@@ -18,6 +18,7 @@
 
     when_i_click_the_analysis_tab
     then_i_see_the_topics_have_been_tagged
+    and_i_see_the_answer_relevancy_statistics
     and_i_dont_see_the_answer
   end
 
@@ -105,4 +106,8 @@ def then_i_see_the_topics_have_been_tagged
   def and_i_dont_see_the_answer
     expect(page).not_to have_content(@answer)
   end
+
+  def and_i_see_the_answer_relevancy_statistics
+    expect(page).to have_content(/Mean score.*1.0/)
+  end
 end

From 4cabe13801c8ebb71d66f8b4c39dce333dbe533e Mon Sep 17 00:00:00 2001
From: David Gisbey <david.gisbey@digital.cabinet-office.gov.uk>
Date: Wed, 7 Jan 2026 09:24:21 +0000
Subject: [PATCH 7/7] Add Answer#question_used

We've got a few places in our codebase where we want to use the rephrased
question if it exists, otherwise fall back to the original question message
in our LLM calls.

This adds the Answer#question_used method to encapsulate that logic, and updates
all relevant places to use this new method.

I've removed the tests that were specifically checking for the rephrased
question logic in the metrics, since that is now covered by the new method.
---
 app/jobs/answer_analysis/tag_topics_job.rb      |  2 +-
 app/models/answer.rb                            |  4 ++++
 lib/auto_evaluation/answer_relevancy.rb         |  2 +-
 lib/auto_evaluation/coherence.rb                |  2 +-
 .../jobs/answer_analysis/tag_topics_job_spec.rb | 10 ----------
 .../auto_evaluation/answer_relevancy_spec.rb    | 10 ----------
 spec/lib/auto_evaluation/coherence_spec.rb      | 17 -----------------
 spec/models/answer_spec.rb                      | 14 ++++++++++++++
 8 files changed, 21 insertions(+), 40 deletions(-)

diff --git a/app/jobs/answer_analysis/tag_topics_job.rb b/app/jobs/answer_analysis/tag_topics_job.rb
index c49bbb933..0093a39d0 100644
--- a/app/jobs/answer_analysis/tag_topics_job.rb
+++ b/app/jobs/answer_analysis/tag_topics_job.rb
@@ -12,7 +12,7 @@ def perform(answer_id)
         return logger.info("Answer #{answer_id} is not eligible for topic analysis")
       end
 
-      result = AutoEvaluation::TopicTagger.call(answer.rephrased_question || answer.question.message)
+      result = AutoEvaluation::TopicTagger.call(answer.question_used)
 
       topics = answer.build_topics(
         primary_topic: result.primary_topic,
diff --git a/app/models/answer.rb b/app/models/answer.rb
index 6c74f931e..2b3de6e3e 100644
--- a/app/models/answer.rb
+++ b/app/models/answer.rb
@@ -201,4 +201,8 @@ def group_used_answer_sources_by_base_path
   def has_analysis?
     topics.present? || answer_relevancy_aggregate.present?
   end
+
+  def question_used
+    rephrased_question || question.message
+  end
 end
diff --git a/lib/auto_evaluation/answer_relevancy.rb b/lib/auto_evaluation/answer_relevancy.rb
index fde51c8e8..88508216e 100644
--- a/lib/auto_evaluation/answer_relevancy.rb
+++ b/lib/auto_evaluation/answer_relevancy.rb
@@ -61,7 +61,7 @@ def call
   attr_accessor :llm_responses, :metrics
 
   def question_message
-    answer.rephrased_question || answer.question.message
+    answer.question_used
   end
 
   def calculate_score(verdicts)
diff --git a/lib/auto_evaluation/coherence.rb b/lib/auto_evaluation/coherence.rb
index fbac8ab96..1674eeb2d 100644
--- a/lib/auto_evaluation/coherence.rb
+++ b/lib/auto_evaluation/coherence.rb
@@ -49,7 +49,7 @@ def normalise_rubric_score(rubric_score)
     end
 
     def question_message
-      answer.rephrased_question || answer.question.message
+      answer.question_used
     end
   end
 end
diff --git a/spec/jobs/answer_analysis/tag_topics_job_spec.rb b/spec/jobs/answer_analysis/tag_topics_job_spec.rb
index 5e19377e2..f57770c20 100644
--- a/spec/jobs/answer_analysis/tag_topics_job_spec.rb
+++ b/spec/jobs/answer_analysis/tag_topics_job_spec.rb
@@ -39,16 +39,6 @@
         )
     end
 
-    context "when the answer has a rephrased_question" do
-      let(:rephrased_question) { "This is a rephrased_question" }
-
-      it "calls the AutoEvaluation::TopicTagger with the rephrased question" do
-        answer = create(:answer, rephrased_question: rephrased_question)
-        described_class.new.perform(answer.id)
-        expect(AutoEvaluation::TopicTagger).to have_received(:call).with(rephrased_question)
-      end
-    end
-
     context "when the answer does not exist" do
       let(:answer_id) { 999 }
 
diff --git a/spec/lib/auto_evaluation/answer_relevancy_spec.rb b/spec/lib/auto_evaluation/answer_relevancy_spec.rb
index 83ee4c09f..194a57a66 100644
--- a/spec/lib/auto_evaluation/answer_relevancy_spec.rb
+++ b/spec/lib/auto_evaluation/answer_relevancy_spec.rb
@@ -63,16 +63,6 @@
         )
     end
 
-    context "when the answer has a rephrased question" do
-      let(:question_message) { "This is a rephrased test question." }
-      let(:answer) { build(:answer, message: answer_message, rephrased_question: question_message) }
-
-      it "uses the rephrased question in the prompt" do
-        result = described_class.call(answer)
-        expect(result.reason).to eq(reason)
-      end
-    end
-
     context "when 'idk' verdicts are present" do
       let(:verdicts) do
         [
diff --git a/spec/lib/auto_evaluation/coherence_spec.rb b/spec/lib/auto_evaluation/coherence_spec.rb
index f1a7b1799..b1b2f0023 100644
--- a/spec/lib/auto_evaluation/coherence_spec.rb
+++ b/spec/lib/auto_evaluation/coherence_spec.rb
@@ -67,22 +67,5 @@
         expect(result.success).to eq(expected_score >= described_class::THRESHOLD)
       end
     end
-
-    context "when the answer has a rephrased question" do
-      let(:question_message) { "This is a rephrased test question." }
-      let(:answer) { build(:answer, message: answer_message, rephrased_question: question_message) }
-
-      it "uses the rephrased question in the prompt" do
-        stub = stub_bedrock_invoke_model_openai_oss_tool_call(
-          user_prompt,
-          tools,
-          response_json,
-        )
-
-        described_class.call(answer)
-
-        expect(stub).to have_been_requested
-      end
-    end
   end
 end
diff --git a/spec/models/answer_spec.rb b/spec/models/answer_spec.rb
index ac019bbea..6908131dc 100644
--- a/spec/models/answer_spec.rb
+++ b/spec/models/answer_spec.rb
@@ -372,4 +372,18 @@
       expect(answer.has_analysis?).to be(false)
     end
   end
+
+  describe "#question_used" do
+    let(:question) { build(:question, message: "Original question") }
+
+    it "returns the rephrased question if present" do
+      answer = build(:answer, question:, rephrased_question: "Rephrased question")
+      expect(answer.question_used).to eq("Rephrased question")
+    end
+
+    it "returns the original question message if no rephrased question is present" do
+      answer = build(:answer, question:, rephrased_question: nil)
+      expect(answer.question_used).to eq("Original question")
+    end
+  end
 end