From a3c1e4bb839a8c6d8b3502e86ea5ad14dcf6e3c7 Mon Sep 17 00:00:00 2001 From: David Gisbey Date: Tue, 16 Dec 2025 10:47:52 +0000 Subject: [PATCH 1/7] Add AnswerRelevancyAggregate & AnswerRelevancyRun This adds a migration for the two new tables needed to store answer relevancy metrics. It also adds the corresponding models and factories. We will need to record multiple LLM responses and metrics for each run, so I've included the LlmCallsRecordable module in the AnswerRelevancyRun model. --- app/models/answer.rb | 1 + .../answer_relevancy_aggregate.rb | 11 +++++++++ .../answer_analysis/answer_relevancy_run.rb | 11 +++++++++ ...51216092915_add_answer_relevancy_tables.rb | 18 +++++++++++++++ db/schema.rb | 23 ++++++++++++++++++- .../answer_relevancy_aggregate_factory.rb | 6 +++++ .../factories/answer_relevancy_run_factory.rb | 7 ++++++ .../answer_relevancy_run_spec.rb | 5 ++++ 8 files changed, 81 insertions(+), 1 deletion(-) create mode 100644 app/models/answer_analysis/answer_relevancy_aggregate.rb create mode 100644 app/models/answer_analysis/answer_relevancy_run.rb create mode 100644 db/migrate/20251216092915_add_answer_relevancy_tables.rb create mode 100644 spec/factories/answer_relevancy_aggregate_factory.rb create mode 100644 spec/factories/answer_relevancy_run_factory.rb create mode 100644 spec/models/answer_analysis/answer_relevancy_run_spec.rb diff --git a/app/models/answer.rb b/app/models/answer.rb index f062f5960..e27b742e8 100644 --- a/app/models/answer.rb +++ b/app/models/answer.rb @@ -55,6 +55,7 @@ def self.response_for_question_routing_label(label) has_many :sources, -> { order(relevancy: :asc) }, class_name: "AnswerSource" has_one :feedback, class_name: "AnswerFeedback" has_one :topics, class_name: "AnswerAnalysis::Topics" + has_one :answer_relevancy_aggregate, class_name: "AnswerAnalysis::AnswerRelevancyAggregate" enum :status, { diff --git a/app/models/answer_analysis/answer_relevancy_aggregate.rb 
b/app/models/answer_analysis/answer_relevancy_aggregate.rb new file mode 100644 index 000000000..8c578e4bb --- /dev/null +++ b/app/models/answer_analysis/answer_relevancy_aggregate.rb @@ -0,0 +1,11 @@ +module AnswerAnalysis + class AnswerRelevancyAggregate < ApplicationRecord + self.table_name = "answer_analysis_answer_relevancy_aggregates" + + belongs_to :answer + has_many :runs, + -> { order(:created_at) }, + class_name: "AnswerAnalysis::AnswerRelevancyRun", + foreign_key: :answer_analysis_answer_relevancy_aggregate_id + end +end diff --git a/app/models/answer_analysis/answer_relevancy_run.rb b/app/models/answer_analysis/answer_relevancy_run.rb new file mode 100644 index 000000000..212ab36ff --- /dev/null +++ b/app/models/answer_analysis/answer_relevancy_run.rb @@ -0,0 +1,11 @@ +module AnswerAnalysis + class AnswerRelevancyRun < ApplicationRecord + include LlmCallsRecordable + + self.table_name = "answer_analysis_answer_relevancy_runs" + + belongs_to :aggregate, + class_name: "AnswerAnalysis::AnswerRelevancyAggregate", + foreign_key: :answer_analysis_answer_relevancy_aggregate_id + end +end diff --git a/db/migrate/20251216092915_add_answer_relevancy_tables.rb b/db/migrate/20251216092915_add_answer_relevancy_tables.rb new file mode 100644 index 000000000..900041796 --- /dev/null +++ b/db/migrate/20251216092915_add_answer_relevancy_tables.rb @@ -0,0 +1,18 @@ +class AddAnswerRelevancyTables < ActiveRecord::Migration[8.0] + def change + create_table :answer_analysis_answer_relevancy_aggregates, id: :uuid do |t| + t.decimal :mean_score, null: false + t.references :answer, type: :uuid, null: false, foreign_key: { on_delete: :cascade }, index: { unique: true } + t.timestamps + end + + create_table :answer_analysis_answer_relevancy_runs, id: :uuid do |t| + t.decimal :score, null: false + t.string :reason, null: false + t.jsonb :llm_responses + t.jsonb :metrics + t.references :answer_analysis_answer_relevancy_aggregate, type: :uuid, null: false, foreign_key: { on_delete: 
:cascade } + t.timestamps + end + end +end diff --git a/db/schema.rb b/db/schema.rb index a48c8be54..7e6a83f29 100644 --- a/db/schema.rb +++ b/db/schema.rb @@ -10,7 +10,7 @@ # # It's strongly recommended that you check this file into your version control system. -ActiveRecord::Schema[8.0].define(version: 2025_12_15_161508) do +ActiveRecord::Schema[8.0].define(version: 2025_12_16_092915) do # These are extensions that must be enabled in order to support this database enable_extension "citext" enable_extension "pg_catalog.plpgsql" @@ -24,6 +24,25 @@ create_enum "guardrails_status", ["pass", "fail", "error"] create_enum "question_routing_label", ["about_mps", "advice_opinions_predictions", "character_fun", "genuine_rag", "gov_transparency", "greetings", "harmful_vulgar_controversy", "multi_questions", "negative_acknowledgement", "non_english", "personal_info", "positive_acknowledgement", "vague_acronym_grammar", "unclear_intent", "requires_account_data", "about_chat"] + create_table "answer_analysis_answer_relevancy_aggregates", id: :uuid, default: -> { "gen_random_uuid()" }, force: :cascade do |t| + t.decimal "mean_score", null: false + t.uuid "answer_id", null: false + t.datetime "created_at", null: false + t.datetime "updated_at", null: false + t.index ["answer_id"], name: "index_answer_analysis_answer_relevancy_aggregates_on_answer_id", unique: true + end + + create_table "answer_analysis_answer_relevancy_runs", id: :uuid, default: -> { "gen_random_uuid()" }, force: :cascade do |t| + t.decimal "score", null: false + t.string "reason", null: false + t.jsonb "llm_responses" + t.jsonb "metrics" + t.uuid "answer_analysis_answer_relevancy_aggregate_id", null: false + t.datetime "created_at", null: false + t.datetime "updated_at", null: false + t.index ["answer_analysis_answer_relevancy_aggregate_id"], name: "idx_on_answer_analysis_answer_relevancy_aggregate_i_d9d79a637a" + end + create_table "answer_analysis_topics", id: :uuid, default: -> { "gen_random_uuid()" }, 
force: :cascade do |t| t.string "primary_topic" t.string "secondary_topic" @@ -170,6 +189,8 @@ t.datetime "updated_at", null: false end + add_foreign_key "answer_analysis_answer_relevancy_aggregates", "answers", on_delete: :cascade + add_foreign_key "answer_analysis_answer_relevancy_runs", "answer_analysis_answer_relevancy_aggregates", on_delete: :cascade add_foreign_key "answer_analysis_topics", "answers", on_delete: :cascade add_foreign_key "answer_feedback", "answers", on_delete: :cascade add_foreign_key "answer_sources", "answer_source_chunks", on_delete: :restrict diff --git a/spec/factories/answer_relevancy_aggregate_factory.rb b/spec/factories/answer_relevancy_aggregate_factory.rb new file mode 100644 index 000000000..bd7fb500c --- /dev/null +++ b/spec/factories/answer_relevancy_aggregate_factory.rb @@ -0,0 +1,6 @@ +FactoryBot.define do + factory :answer_relevancy_aggregate, class: "AnswerAnalysis::AnswerRelevancyAggregate" do + answer + mean_score { 0.5 } + end +end diff --git a/spec/factories/answer_relevancy_run_factory.rb b/spec/factories/answer_relevancy_run_factory.rb new file mode 100644 index 000000000..a5f8bdcd3 --- /dev/null +++ b/spec/factories/answer_relevancy_run_factory.rb @@ -0,0 +1,7 @@ +FactoryBot.define do + factory :answer_relevancy_run, class: "AnswerAnalysis::AnswerRelevancyRun" do + association :aggregate, factory: :answer_relevancy_aggregate + score { 0.5 } + reason { "The answer was okay." 
} + end +end diff --git a/spec/models/answer_analysis/answer_relevancy_run_spec.rb b/spec/models/answer_analysis/answer_relevancy_run_spec.rb new file mode 100644 index 000000000..28129c625 --- /dev/null +++ b/spec/models/answer_analysis/answer_relevancy_run_spec.rb @@ -0,0 +1,5 @@ +RSpec.describe AnswerAnalysis::AnswerRelevancyRun do + include_examples "llm calls recordable" do + let(:model) { build(:answer_relevancy_run) } + end +end From 73780fdd8bf1faa8379720fcb05a3f3fe2730724 Mon Sep 17 00:00:00 2001 From: David Gisbey Date: Wed, 17 Dec 2025 14:30:39 +0000 Subject: [PATCH 2/7] Add additional bedrock stub to stub all answer relevancy calls We're going to need to stub out these calls in multiple places so it makes sense to have a single method that does all the stubbing for us. I've also prepended stub_ to bedrock_invoke_model_openai_oss_tool_call. All other stubs have this so it makes sense to be consistent. --- .../answer_relevancy/reason_generator_spec.rb | 2 +- .../statement_generator_spec.rb | 2 +- .../verdicts_generator_spec.rb | 2 +- .../auto_evaluation/answer_relevancy_spec.rb | 56 ++++--------------- .../bedrock_openai_oss_invoke_spec.rb | 4 +- spec/lib/auto_evaluation/coherence_spec.rb | 6 +- spec/support/stub_bedrock.rb | 51 ++++++++++++++++- 7 files changed, 68 insertions(+), 55 deletions(-) diff --git a/spec/lib/auto_evaluation/answer_relevancy/reason_generator_spec.rb b/spec/lib/auto_evaluation/answer_relevancy/reason_generator_spec.rb index 5827c716f..c90314d47 100644 --- a/spec/lib/auto_evaluation/answer_relevancy/reason_generator_spec.rb +++ b/spec/lib/auto_evaluation/answer_relevancy/reason_generator_spec.rb @@ -23,7 +23,7 @@ end let(:tools) { [prompts.fetch(:tool_spec)] } let!(:stub_bedrock) do - bedrock_invoke_model_openai_oss_tool_call( + stub_bedrock_invoke_model_openai_oss_tool_call( user_prompt, tools, reason_json, diff --git a/spec/lib/auto_evaluation/answer_relevancy/statement_generator_spec.rb 
b/spec/lib/auto_evaluation/answer_relevancy/statement_generator_spec.rb index 5cb63ac4f..bbd7b66a2 100644 --- a/spec/lib/auto_evaluation/answer_relevancy/statement_generator_spec.rb +++ b/spec/lib/auto_evaluation/answer_relevancy/statement_generator_spec.rb @@ -14,7 +14,7 @@ end let(:tools) { [prompts.fetch(:tool_spec)] } let!(:stub_bedrock) do - bedrock_invoke_model_openai_oss_tool_call( + stub_bedrock_invoke_model_openai_oss_tool_call( user_prompt, tools, statements_json, diff --git a/spec/lib/auto_evaluation/answer_relevancy/verdicts_generator_spec.rb b/spec/lib/auto_evaluation/answer_relevancy/verdicts_generator_spec.rb index 2878b02f7..7c62709e7 100644 --- a/spec/lib/auto_evaluation/answer_relevancy/verdicts_generator_spec.rb +++ b/spec/lib/auto_evaluation/answer_relevancy/verdicts_generator_spec.rb @@ -21,7 +21,7 @@ end let(:tools) { [prompts.fetch(:tool_spec)] } let!(:stub_bedrock) do - bedrock_invoke_model_openai_oss_tool_call( + stub_bedrock_invoke_model_openai_oss_tool_call( user_prompt, tools, verdicts_json, diff --git a/spec/lib/auto_evaluation/answer_relevancy_spec.rb b/spec/lib/auto_evaluation/answer_relevancy_spec.rb index 3375bb272..83ee4c09f 100644 --- a/spec/lib/auto_evaluation/answer_relevancy_spec.rb +++ b/spec/lib/auto_evaluation/answer_relevancy_spec.rb @@ -1,6 +1,5 @@ RSpec.describe AutoEvaluation::AnswerRelevancy, :aws_credentials_stubbed do describe ".call" do - let(:prompts) { AutoEvaluation::Prompts.config.answer_relevancy } let(:question_message) { "This is a test question message." } let(:answer_message) { "This is a test answer message." 
} let(:question) { build(:question, message: question_message) } @@ -8,21 +7,6 @@ let(:statements) { ["This is the first statement.", "This is the second statement."] } let(:statements_json) { { statements: }.to_json } - let(:user_prompt_statements) do - sprintf( - prompts.fetch(:statements).fetch(:user_prompt), - answer: answer_message, - ) - end - let(:statements_tools) { [prompts.fetch(:statements).fetch(:tool_spec)] } - let!(:statements_stub) do - bedrock_invoke_model_openai_oss_tool_call( - user_prompt_statements, - statements_tools, - statements_json, - ) - end - let(:verdicts) do [ { "verdict" => "yes" }, @@ -30,40 +14,20 @@ ] end let(:verdicts_json) { { verdicts: }.to_json } - let(:user_prompt_verdicts) do - sprintf( - prompts.fetch(:verdicts).fetch(:user_prompt), - question: question_message, - statements:, - ) - end - let(:verdicts_tools) { [prompts.fetch(:verdicts).fetch(:tool_spec)] } - let!(:verdicts_stub) do - bedrock_invoke_model_openai_oss_tool_call( - user_prompt_verdicts, - verdicts_tools, - verdicts_json, - ) - end - let(:reason) { "This is the reason for the score." 
} let(:reason_json) { { reason: }.to_json } - let(:user_prompt_reason) do - sprintf( - prompts.fetch(:reason).fetch(:user_prompt), - score: 0.5, - unsuccessful_verdicts_reasons: ["The statement is irrelevant."], - question: question_message, - ) - end - let(:reason_tools) { [prompts.fetch(:reason).fetch(:tool_spec)] } - let!(:reason_stub) do - bedrock_invoke_model_openai_oss_tool_call( - user_prompt_reason, - reason_tools, - reason_json, + let!(:answer_relevancy_stubs) do + stub_bedrock_invoke_model_openai_oss_answer_relevancy( + question_message:, + answer_message:, + statements_json:, + verdicts_json:, + reason_json:, ) end + let(:statements_stub) { answer_relevancy_stubs[:statements] } + let(:verdicts_stub) { answer_relevancy_stubs[:verdicts] } + let(:reason_stub) { answer_relevancy_stubs[:reason] } it "returns a results object with the expected attributes" do allow(Clock).to receive(:monotonic_time) diff --git a/spec/lib/auto_evaluation/bedrock_openai_oss_invoke_spec.rb b/spec/lib/auto_evaluation/bedrock_openai_oss_invoke_spec.rb index f93cf1897..64e646953 100644 --- a/spec/lib/auto_evaluation/bedrock_openai_oss_invoke_spec.rb +++ b/spec/lib/auto_evaluation/bedrock_openai_oss_invoke_spec.rb @@ -21,7 +21,7 @@ ] end let!(:stub) do - bedrock_invoke_model_openai_oss_tool_call( + stub_bedrock_invoke_model_openai_oss_tool_call( user_message, tools, { "response" => "Expected response." }.to_json, @@ -57,7 +57,7 @@ end it "raises an error if the response does not conform to the schema" do - bedrock_invoke_model_openai_oss_tool_call( + stub_bedrock_invoke_model_openai_oss_tool_call( user_message, tools, { "invalid_key" => "This does not conform to the schema." 
}.to_json, diff --git a/spec/lib/auto_evaluation/coherence_spec.rb b/spec/lib/auto_evaluation/coherence_spec.rb index 1f77a906d..f1a7b1799 100644 --- a/spec/lib/auto_evaluation/coherence_spec.rb +++ b/spec/lib/auto_evaluation/coherence_spec.rb @@ -18,7 +18,7 @@ it "returns a results object with the expected attributes" do allow(Clock).to receive(:monotonic_time).and_return(200.0, 202.0) - stub = bedrock_invoke_model_openai_oss_tool_call( + stub = stub_bedrock_invoke_model_openai_oss_tool_call( user_prompt, tools, response_json, @@ -55,7 +55,7 @@ 5 => 1.0, }.each do |rubric_score, expected_score| response_json = { score: rubric_score, reason: }.to_json - bedrock_invoke_model_openai_oss_tool_call( + stub_bedrock_invoke_model_openai_oss_tool_call( user_prompt, tools, response_json, @@ -73,7 +73,7 @@ let(:answer) { build(:answer, message: answer_message, rephrased_question: question_message) } it "uses the rephrased question in the prompt" do - stub = bedrock_invoke_model_openai_oss_tool_call( + stub = stub_bedrock_invoke_model_openai_oss_tool_call( user_prompt, tools, response_json, diff --git a/spec/support/stub_bedrock.rb b/spec/support/stub_bedrock.rb index c30747b7f..e6432b05a 100644 --- a/spec/support/stub_bedrock.rb +++ b/spec/support/stub_bedrock.rb @@ -47,7 +47,7 @@ def mock_titan_embedding(text, dimensions: Search::ChunkedContentRepository::TIT dimensions.times.map { random_generator.rand } end - def bedrock_invoke_model_openai_oss_tool_call(user_message, tools, content) + def stub_bedrock_invoke_model_openai_oss_tool_call(user_message, tools, content) request_body = { include_reasoning: false, messages: [ @@ -85,4 +85,53 @@ def bedrock_invoke_model_openai_oss_tool_call(user_message, tools, content) endpoint_regex: OPENAI_GPT_OSS_ENDPOINT_REGEX, ) end + + def stub_bedrock_invoke_model_openai_oss_answer_relevancy(question_message:, + answer_message:, + statements_json: { statements: ["Statement."] }.to_json, + verdicts_json: { verdicts: [{ "verdict" => "yes" 
}] }.to_json, + reason_json: { reason: "This is the reason for the score." }.to_json) + prompts = AutoEvaluation::Prompts.config.answer_relevancy + + statements_user_prompt = sprintf( + prompts.fetch(:statements).fetch(:user_prompt), + answer: answer_message, + ) + verdicts_user_prompt = sprintf( + prompts.fetch(:verdicts).fetch(:user_prompt), + question: question_message, + statements: JSON.parse(statements_json).fetch("statements"), + ) + reason_user_prompt = sprintf( + prompts.fetch(:reason).fetch(:user_prompt), + score: 0.5, + unsuccessful_verdicts_reasons: ["The statement is irrelevant."], + question: question_message, + ) + + statements_tools = [prompts.fetch(:statements).fetch(:tool_spec)] + verdicts_tools = [prompts.fetch(:verdicts).fetch(:tool_spec)] + reason_tools = [prompts.fetch(:reason).fetch(:tool_spec)] + + stubs = {} + stubs[:statements] = stub_bedrock_invoke_model_openai_oss_tool_call( + statements_user_prompt, + statements_tools, + statements_json, + ) + + stubs[:verdicts] = stub_bedrock_invoke_model_openai_oss_tool_call( + verdicts_user_prompt, + verdicts_tools, + verdicts_json, + ) + + stubs[:reason] = stub_bedrock_invoke_model_openai_oss_tool_call( + reason_user_prompt, + reason_tools, + reason_json, + ) + + stubs + end end From f8fb85e3b40e3998fb20d75433dca3913d545f2f Mon Sep 17 00:00:00 2001 From: David Gisbey Date: Mon, 5 Jan 2026 14:05:23 +0000 Subject: [PATCH 3/7] Add auto_evaluation_results_creatable concern This adds a concern to encapsulate the logic for creating aggregate and run records for metrics. It will be called from the various evaluation jobs that require wisdom of the crowd. 
--- .../answer_relevancy_aggregate.rb | 2 ++ .../auto_evaluation_results_creatable.rb | 27 ++++++++++++++++ .../answer_relevancy_aggregate_spec.rb | 5 +++ ...o_evaluation_results_creatable_examples.rb | 31 +++++++++++++++++++ 4 files changed, 65 insertions(+) create mode 100644 app/models/concerns/auto_evaluation_results_creatable.rb create mode 100644 spec/models/answer_analysis/answer_relevancy_aggregate_spec.rb create mode 100644 spec/support/auto_evaluation_results_creatable_examples.rb diff --git a/app/models/answer_analysis/answer_relevancy_aggregate.rb b/app/models/answer_analysis/answer_relevancy_aggregate.rb index 8c578e4bb..891433893 100644 --- a/app/models/answer_analysis/answer_relevancy_aggregate.rb +++ b/app/models/answer_analysis/answer_relevancy_aggregate.rb @@ -1,5 +1,7 @@ module AnswerAnalysis class AnswerRelevancyAggregate < ApplicationRecord + include AutoEvaluationResultsCreatable + self.table_name = "answer_analysis_answer_relevancy_aggregates" belongs_to :answer diff --git a/app/models/concerns/auto_evaluation_results_creatable.rb b/app/models/concerns/auto_evaluation_results_creatable.rb new file mode 100644 index 000000000..c323d6939 --- /dev/null +++ b/app/models/concerns/auto_evaluation_results_creatable.rb @@ -0,0 +1,27 @@ +module AutoEvaluationResultsCreatable + extend ActiveSupport::Concern + + class_methods do + def create_mean_aggregate_and_score_runs(answer, results) + mean_score = results.map { |result| result.score.to_d }.sum / results.size + aggregate = new(answer:, mean_score:) + + results.each do |result| + run = aggregate.runs.build( + aggregate:, + score: result.score, + reason: result.reason, + ) + + result.llm_responses.stringify_keys.each do |name, llm_response| + run.assign_llm_response(name, llm_response) + end + result.metrics.stringify_keys.each do |name, metrics| + run.assign_metrics(name, metrics) + end + end + + aggregate.save! 
+ end + end +end diff --git a/spec/models/answer_analysis/answer_relevancy_aggregate_spec.rb b/spec/models/answer_analysis/answer_relevancy_aggregate_spec.rb new file mode 100644 index 000000000..07f173ab0 --- /dev/null +++ b/spec/models/answer_analysis/answer_relevancy_aggregate_spec.rb @@ -0,0 +1,5 @@ +RSpec.describe AnswerAnalysis::AnswerRelevancyAggregate do + include_examples "auto_evaluation results creatable", + :answer_relevancy_aggregate, + AnswerAnalysis::AnswerRelevancyRun +end diff --git a/spec/support/auto_evaluation_results_creatable_examples.rb b/spec/support/auto_evaluation_results_creatable_examples.rb new file mode 100644 index 000000000..9d019a8c3 --- /dev/null +++ b/spec/support/auto_evaluation_results_creatable_examples.rb @@ -0,0 +1,31 @@ +shared_examples "auto_evaluation results creatable" do |aggregate_association, run_class| + describe ".create_mean_aggregate_and_score_runs" do + let(:first_run_result) { build(:auto_evaluation_score_result, score: 0.8) } + let(:second_run_result) { build(:auto_evaluation_score_result, score: 0.9) } + let(:results) { [first_run_result, second_run_result] } + let(:answer) { create(:answer) } + let(:answer_id) { answer.id } + + it "creates an aggregate with correct mean score" do + answer = Answer.includes(aggregate_association).find(answer_id) + expect { described_class.create_mean_aggregate_and_score_runs(answer, results) } + .to change(described_class, :count).by(1) + + answer = Answer.includes(aggregate_association).find(answer_id) + expect(answer.answer_relevancy_aggregate.mean_score).to eq(0.85) + end + + it "creates runs with correct attributes and associations" do + answer = Answer.includes("#{aggregate_association}": :runs).find(answer_id) + + expect { + described_class.create_mean_aggregate_and_score_runs(answer, results) + }.to change(run_class, :count).by(2) + + first_run, second_run = answer.reload.public_send(aggregate_association).runs.order(:created_at) + + expect(first_run).to 
have_attributes(first_run_result.to_h.except(:success)) + expect(second_run).to have_attributes(second_run_result.to_h.except(:success)) + end + end +end From ccbda92e25991864a0ed5dd709e14d672290d9ac Mon Sep 17 00:00:00 2001 From: David Gisbey Date: Tue, 6 Jan 2026 10:33:09 +0000 Subject: [PATCH 4/7] Add AnswerAnalysis AnswerRelevancy and Base jobs This adds the BaseJob and AnswerRelevancyJob. The AnswerRelevancyJob handles: - making calls to the AnswerRelevancy class - compiling the results - calling AnswerRelevancyAggregate.create_mean_aggregate_and_score_runs to delegate record creation to the AutoEvaluationResultsCreatable concern The BaseJob is used to store shared functionality for future metric jobs. The next commit will integrate this job into the analysis workflow. As part of this commit I've updated the ScoreResult factory to use a sequence to build unique attributes for the reason, llm_responses and metrics fields. This ensures that we are correctly persisting all the attributes returned from the evaluation classes. I've also updated the answer relevancy scoring method to use BigDecimal as part of this commit. Without this I was forced to use round(2) in the tests to avoid rounding issues caused by floats. 
--- .../answer_analysis/answer_relevancy_job.rb | 24 +++ app/jobs/answer_analysis/base_job.rb | 19 +++ lib/auto_evaluation/answer_relevancy.rb | 2 +- .../factories/auto_evaluation_score_result.rb | 6 +- .../answer_relevancy_job_spec.rb | 153 ++++++++++++++++++ 5 files changed, 200 insertions(+), 4 deletions(-) create mode 100644 app/jobs/answer_analysis/answer_relevancy_job.rb create mode 100644 app/jobs/answer_analysis/base_job.rb create mode 100644 spec/jobs/answer_analysis/answer_relevancy_job_spec.rb diff --git a/app/jobs/answer_analysis/answer_relevancy_job.rb b/app/jobs/answer_analysis/answer_relevancy_job.rb new file mode 100644 index 000000000..e9b983c01 --- /dev/null +++ b/app/jobs/answer_analysis/answer_relevancy_job.rb @@ -0,0 +1,24 @@ +module AnswerAnalysis + class AnswerRelevancyJob < BaseJob + def perform(answer_id) + return unless eligible_for_answer_analysis?(answer_id) + + answer = Answer.includes(:question, :answer_relevancy_aggregate).find(answer_id) + return logger.warn(aggregate_exists_warn_message(answer.id)) if answer.answer_relevancy_aggregate.present? 
+ + results = NUMBER_OF_RUNS.times.map { AutoEvaluation::AnswerRelevancy.call(answer) } + + begin + AnswerAnalysis::AnswerRelevancyAggregate.create_mean_aggregate_and_score_runs(answer, results) + rescue ActiveRecord::RecordNotUnique + logger.warn(aggregate_exists_warn_message(answer.id)) + end + end + + private + + def aggregate_exists_warn_message(answer_id) + "Answer #{answer_id} has already been evaluated for relevancy" + end + end +end diff --git a/app/jobs/answer_analysis/base_job.rb b/app/jobs/answer_analysis/base_job.rb new file mode 100644 index 000000000..508fc7b6f --- /dev/null +++ b/app/jobs/answer_analysis/base_job.rb @@ -0,0 +1,19 @@ +module AnswerAnalysis + class BaseJob < ApplicationJob + NUMBER_OF_RUNS = 3 + MAX_RETRIES = 5 + retry_on Aws::Errors::ServiceError, wait: 1.minute, attempts: MAX_RETRIES + + private + + def eligible_for_answer_analysis?(answer_id) + eligible = Answer.status_answered.exists?(id: answer_id) + + unless eligible + logger.warn("Couldn't find an answer #{answer_id} that was eligible for auto-evaluation") + end + + eligible + end + end +end diff --git a/lib/auto_evaluation/answer_relevancy.rb b/lib/auto_evaluation/answer_relevancy.rb index d07d82d0a..fde51c8e8 100644 --- a/lib/auto_evaluation/answer_relevancy.rb +++ b/lib/auto_evaluation/answer_relevancy.rb @@ -69,7 +69,7 @@ def calculate_score(verdicts) return 1.0 if verdict_count.zero? relevant_count = verdicts.count { |verdict| verdict["verdict"].strip.downcase != "no" } - relevant_count.to_f / verdict_count + relevant_count.to_d / verdict_count end def build_maximum_score_result(reason:, llm_responses:, metrics:) diff --git a/spec/factories/auto_evaluation_score_result.rb b/spec/factories/auto_evaluation_score_result.rb index cfb4af097..8bc5f3781 100644 --- a/spec/factories/auto_evaluation_score_result.rb +++ b/spec/factories/auto_evaluation_score_result.rb @@ -3,10 +3,10 @@ skip_create score { 0.85.to_d } - reason { "Most statements are relevant." 
} + sequence(:reason) { |n| "Reason #{n}" } success { true } - llm_responses { {} } - metrics { {} } + sequence(:llm_responses) { |n| { "llm_response" => { "reason" => "Reason #{n}" } } } + sequence(:metrics) { |n| { "llm_response" => { "duration" => n } } } initialize_with { new(**attributes) } end diff --git a/spec/jobs/answer_analysis/answer_relevancy_job_spec.rb b/spec/jobs/answer_analysis/answer_relevancy_job_spec.rb new file mode 100644 index 000000000..f96b15d23 --- /dev/null +++ b/spec/jobs/answer_analysis/answer_relevancy_job_spec.rb @@ -0,0 +1,153 @@ +RSpec.describe AnswerAnalysis::AnswerRelevancyJob do + include ActiveJob::TestHelper + + let(:answer) { create(:answer) } + let(:question) { answer.question } + let(:results) do + [ + build(:auto_evaluation_score_result, score: 0.8), + build(:auto_evaluation_score_result, score: 0.7), + build(:auto_evaluation_score_result, score: 0.9), + ] + end + + before do + allow(AutoEvaluation::AnswerRelevancy) + .to receive(:call).and_return(*results) + end + + it_behaves_like "a job in queue", "default" + + describe "#perform" do + it "calls AutoEvaluation::AnswerRelevancy the configured number of times with the correct arguments" do + described_class.new.perform(answer.id) + + expect(AutoEvaluation::AnswerRelevancy) + .to have_received(:call) + .with(answer) + .exactly(AnswerAnalysis::BaseJob::NUMBER_OF_RUNS).times + end + + it "creates answer relevancy aggregate with the correct score" do + expect { + described_class.new.perform(answer.id) + }.to change(AnswerAnalysis::AnswerRelevancyAggregate, :count).by(1) + answer = Answer.includes(:answer_relevancy_aggregate) + .find(AnswerAnalysis::AnswerRelevancyAggregate.last.answer_id) + expect(answer.answer_relevancy_aggregate.mean_score).to eq(0.8) + end + + it "creates answer relevancy runs for each result" do + expect { + described_class.new.perform(answer.id) + }.to change(AnswerAnalysis::AnswerRelevancyRun, :count).by(results.count) + + answer = 
Answer.includes(answer_relevancy_aggregate: :runs) + .find(AnswerAnalysis::AnswerRelevancyAggregate.last.answer_id) + + results.each_with_index do |result, index| + expect(answer.answer_relevancy_aggregate.runs[index]) + .to have_attributes(result.to_h.except(:success)) + end + end + + context "when the answer has a rephrased_question" do + let(:rephrased_question) { "This is a rephrased_question" } + + it "passes the rephrased question to AutoEvaluation::AnswerRelevancy as the question_message" do + answer = create(:answer, rephrased_question: rephrased_question) + + described_class.new.perform(answer.id) + + expect(AutoEvaluation::AnswerRelevancy) + .to have_received(:call) + .with(answer) + .exactly(AnswerAnalysis::BaseJob::NUMBER_OF_RUNS).times + end + end + + context "when the answer does not exist" do + let(:answer_id) { 999 } + + it "logs a warning" do + expect(described_class.logger) + .to receive(:warn) + .with("Couldn't find an answer 999 that was eligible for auto-evaluation") + + described_class.new.perform(answer_id) + end + + it "doesn't call AutoEvaluation::AnswerRelevancy" do + described_class.new.perform(answer_id) + expect(AutoEvaluation::AnswerRelevancy).not_to have_received(:call) + end + end + + context "when answer relevancy has already been evaluated" do + let(:aggregate) { create(:answer_relevancy_aggregate) } + let(:answer) { aggregate.answer } + + it "logs a warning" do + expect(described_class.logger) + .to receive(:warn) + .with("Answer #{answer.id} has already been evaluated for relevancy") + + described_class.new.perform(answer.id) + end + + it "doesn't call AutoEvaluation::AnswerRelevancy" do + described_class.new.perform(answer.id) + expect(AutoEvaluation::AnswerRelevancy).not_to have_received(:call) + end + end + + context "when aggregate data is persisted mid job" do + before do + allow(AnswerAnalysis::AnswerRelevancyAggregate) + .to receive(:create_mean_aggregate_and_score_runs) + .with(answer, anything) + 
.and_raise(ActiveRecord::RecordNotUnique) + end + + it "logs a warning" do + expect(described_class.logger) + .to receive(:warn) + .with("Answer #{answer.id} has already been evaluated for relevancy") + + described_class.new.perform(answer.id) + end + end + + context "when the AnswerRelevancy metric raises an Aws::Errors::ServiceError" do + it "retries the job the max number of times" do + allow(AutoEvaluation::AnswerRelevancy) + .to receive(:call) + .and_raise(Aws::Errors::ServiceError.new(nil, "error")) + + described_class.perform_later(answer.id) + + assert_performed_jobs described_class::MAX_RETRIES do + expect { perform_enqueued_jobs } + .to raise_error(Aws::Errors::ServiceError) + end + end + end + + context "when the answer is not eligible for auto-evaluation" do + let(:answer) { create(:answer, status: Answer.statuses.except(:answered).keys.sample) } + + it "logs a warning message" do + expect(described_class.logger) + .to receive(:warn) + .with("Couldn't find an answer #{answer.id} that was eligible for auto-evaluation") + + described_class.new.perform(answer.id) + end + + it "does not call AutoEvaluation::AnswerRelevancy" do + expect(AutoEvaluation::AnswerRelevancy).not_to receive(:call) + described_class.new.perform(answer.id) + end + end + end +end From 9ace3164ea844a7bd1d98656f6df152d02a34898 Mon Sep 17 00:00:00 2001 From: David Gisbey Date: Tue, 6 Jan 2026 10:41:48 +0000 Subject: [PATCH 5/7] Integrate Answer Relevancy Analysis into analysis workflow This updates the compose answer job to call the answer relevancy job after an answer has been successfully composed and persisted. 
--- app/jobs/compose_answer_job.rb | 7 ++++++- spec/jobs/compose_answer_job_spec.rb | 6 ++++++ spec/requests/api/v1/conversation_flow_spec.rb | 1 + spec/system/conversation_js_features_spec.rb | 4 ++++ .../conversation_with_claude_structured_answer_spec.rb | 8 ++++++++ ...nversation_with_open_ai_with_structured_answer_spec.rb | 8 ++++++++ .../user_conversation_activity_is_shown_in_admin_spec.rb | 4 ++++ 7 files changed, 37 insertions(+), 1 deletion(-) diff --git a/app/jobs/compose_answer_job.rb b/app/jobs/compose_answer_job.rb index e3a97c442..e4a9863fb 100644 --- a/app/jobs/compose_answer_job.rb +++ b/app/jobs/compose_answer_job.rb @@ -14,6 +14,11 @@ def perform(question_id) logger.warn("Already an answer created for #{question_id}") end - AnswerAnalysis::TagTopicsJob.perform_later(answer.id) if answer.persisted? + if answer.persisted? + # TODO: Once we've added a few metrics we should move these to a single job that + # kicks off all analysis jobs. + AnswerAnalysis::TagTopicsJob.perform_later(answer.id) + AnswerAnalysis::AnswerRelevancyJob.perform_later(answer.id) + end end end diff --git a/spec/jobs/compose_answer_job_spec.rb b/spec/jobs/compose_answer_job_spec.rb index e7e09fd6a..24d907807 100644 --- a/spec/jobs/compose_answer_job_spec.rb +++ b/spec/jobs/compose_answer_job_spec.rb @@ -6,6 +6,7 @@ before do allow(AnswerComposition::Composer).to receive(:call).and_return(returned_answer) allow(AnswerAnalysis::TagTopicsJob).to receive(:perform_later) + allow(AnswerAnalysis::AnswerRelevancyJob).to receive(:perform_later) end it_behaves_like "a job in queue", "answer" @@ -22,6 +23,11 @@ expect(AnswerAnalysis::TagTopicsJob).to have_received(:perform_later).with(returned_answer.id) end + it "calls the AnswerAnalysis::AnswerRelevancyJob with the answer_id" do + described_class.new.perform(question.id) + expect(AnswerAnalysis::AnswerRelevancyJob).to have_received(:perform_later).with(returned_answer.id) + end + context "when the question has already been answered" do 
let(:question) { create(:question, :with_answer) } diff --git a/spec/requests/api/v1/conversation_flow_spec.rb b/spec/requests/api/v1/conversation_flow_spec.rb index c6a6e7b90..214d05688 100644 --- a/spec/requests/api/v1/conversation_flow_spec.rb +++ b/spec/requests/api/v1/conversation_flow_spec.rb @@ -74,6 +74,7 @@ def when_i_create_a_conversation ) end allow(AnswerAnalysis::TagTopicsJob).to receive(:perform_later) + allow(AnswerAnalysis::AnswerRelevancyJob).to receive(:perform_later) post api_v1_create_conversation_path, params: { user_question: "What is the capital of France?" }, diff --git a/spec/system/conversation_js_features_spec.rb b/spec/system/conversation_js_features_spec.rb index 7d1893393..c2e8bba1f 100644 --- a/spec/system/conversation_js_features_spec.rb +++ b/spec/system/conversation_js_features_spec.rb @@ -282,6 +282,10 @@ def stubs_for_mock_answer(question, stub_claude_output_guardrails(answer) stub_claude_messages_topic_tagger(question) + stub_bedrock_invoke_model_openai_oss_answer_relevancy( + question_message: question, + answer_message: answer, + ) end def then_i_cant_see_the_clear_chat_link diff --git a/spec/system/conversation_with_claude_structured_answer_spec.rb b/spec/system/conversation_with_claude_structured_answer_spec.rb index 911f97ff5..c6bc2fb0e 100644 --- a/spec/system/conversation_with_claude_structured_answer_spec.rb +++ b/spec/system/conversation_with_claude_structured_answer_spec.rb @@ -51,6 +51,10 @@ def when_the_first_answer_is_generated stub_claude_structured_answer(@first_question, @first_answer) stub_claude_output_guardrails(@first_answer, "False | None") stub_claude_messages_topic_tagger(@first_question) + stub_bedrock_invoke_model_openai_oss_answer_relevancy( + question_message: @first_question, + answer_message: @first_answer, + ) execute_queued_sidekiq_jobs end @@ -83,6 +87,10 @@ def when_the_second_answer_is_generated stub_claude_structured_answer(rephrased_question, @second_answer) 
stub_claude_output_guardrails(@second_answer, "False | None") stub_claude_messages_topic_tagger(rephrased_question) + stub_bedrock_invoke_model_openai_oss_answer_relevancy( + question_message: rephrased_question, + answer_message: @second_answer, + ) execute_queued_sidekiq_jobs end diff --git a/spec/system/conversation_with_open_ai_with_structured_answer_spec.rb b/spec/system/conversation_with_open_ai_with_structured_answer_spec.rb index cc157d940..4def8e862 100644 --- a/spec/system/conversation_with_open_ai_with_structured_answer_spec.rb +++ b/spec/system/conversation_with_open_ai_with_structured_answer_spec.rb @@ -55,6 +55,10 @@ def when_the_first_answer_is_generated ) stub_openai_output_guardrail("Lots of tax.") stub_claude_messages_topic_tagger(@first_question) + stub_bedrock_invoke_model_openai_oss_answer_relevancy( + question_message: @first_question, + answer_message: "Lots of tax.", + ) execute_queued_sidekiq_jobs end @@ -75,6 +79,10 @@ def when_the_second_answer_is_generated ) stub_openai_output_guardrail("Even more tax.") stub_claude_messages_topic_tagger(rephrased_question) + stub_bedrock_invoke_model_openai_oss_answer_relevancy( + question_message: rephrased_question, + answer_message: "Even more tax.", + ) execute_queued_sidekiq_jobs end diff --git a/spec/system/user_conversation_activity_is_shown_in_admin_spec.rb b/spec/system/user_conversation_activity_is_shown_in_admin_spec.rb index 06caf458c..5176eb45f 100644 --- a/spec/system/user_conversation_activity_is_shown_in_admin_spec.rb +++ b/spec/system/user_conversation_activity_is_shown_in_admin_spec.rb @@ -52,6 +52,10 @@ def and_the_answer_is_generated stub_claude_structured_answer(@question, @answer) stub_claude_output_guardrails(@answer, "False | None") stub_claude_messages_topic_tagger(@question) + stub_bedrock_invoke_model_openai_oss_answer_relevancy( + question_message: @question, + answer_message: @answer, + ) execute_queued_sidekiq_jobs end From 6f363628b756796c4d12a3499b84de85e4e56683 Mon Sep 
17 00:00:00 2001 From: David Gisbey Date: Wed, 17 Dec 2025 17:52:15 +0000 Subject: [PATCH 6/7] Expose answer relevancy metrics in admin UI I've added an additional tab for answer relevancy metrics in the admin interface on the question show page. My thinking is that if we don't split out the metrics into their own tabs then the page will get incredibly noisy. This makes it easier to navigate. Due to this, I've renamed the analysis tab to topics. --- app/controllers/admin/questions_controller.rb | 2 +- app/models/answer.rb | 4 + .../admin/questions/_analysis_tab.html.erb | 80 +++++++++++-------- ...generic_aggregate_auto_evaluation.html.erb | 75 +++++++++++++++++ app/views/admin/questions/show.html.erb | 3 +- spec/models/answer_spec.rb | 19 +++++ spec/requests/admin/questions_spec.rb | 56 ++++++++++++- ...rsation_activity_is_shown_in_admin_spec.rb | 5 ++ 8 files changed, 206 insertions(+), 38 deletions(-) create mode 100644 app/views/admin/questions/_generic_aggregate_auto_evaluation.html.erb diff --git a/app/controllers/admin/questions_controller.rb b/app/controllers/admin/questions_controller.rb index 6fa05a276..229e9b894 100644 --- a/app/controllers/admin/questions_controller.rb +++ b/app/controllers/admin/questions_controller.rb @@ -7,7 +7,7 @@ def index def show question_scope = Question.includes( conversation: :signon_user, - answer: [{ sources: :chunk }, :feedback, :topics], + answer: [{ sources: :chunk }, :feedback, :topics, { answer_relevancy_aggregate: :runs }], ) @question = question_scope.find(params[:id]) diff --git a/app/models/answer.rb b/app/models/answer.rb index e27b742e8..6c74f931e 100644 --- a/app/models/answer.rb +++ b/app/models/answer.rb @@ -197,4 +197,8 @@ def group_used_answer_sources_by_base_path } end end + + def has_analysis? + topics.present? || answer_relevancy_aggregate.present? 
+ end end diff --git a/app/views/admin/questions/_analysis_tab.html.erb b/app/views/admin/questions/_analysis_tab.html.erb index 21d3d23cd..337dc664e 100644 --- a/app/views/admin/questions/_analysis_tab.html.erb +++ b/app/views/admin/questions/_analysis_tab.html.erb @@ -1,41 +1,51 @@ -<%= render "govuk_publishing_components/components/summary_list", { - items: [ - { - field: "Primary topic", - value: topics.primary_topic&.humanize, - }, - { - field: "Secondary topic", - value: topics.secondary_topic&.humanize, - }, - ], -} %> +<% if topics.present? %> + <%= render "govuk_publishing_components/components/summary_list", { + title: "Topics", + heading_size: "l", + heading_level: 2, + margin_bottom: 4, + items: [ + { + field: "Primary topic", + value: topics.primary_topic.humanize, + }, + { + field: "Secondary topic", + value: topics.secondary_topic&.humanize, + }, + ], + } %> -<% if topics.llm_responses.present? %> - <%= render "govuk_publishing_components/components/details", { - title: "LLM responses", - } do %> - <% topics.llm_responses.each do |namespace, response| %> -

<%= namespace %>

-

- <%= render("components/code_snippet", content: JSON.pretty_generate(response)) %> -

+ <% if topics.llm_responses.present? %> + <%= render "govuk_publishing_components/components/details", { + title: "LLM responses", + } do %> + <% topics.llm_responses.each do |namespace, response| %> +

<%= namespace %>

+

+ <%= render("components/code_snippet", content: JSON.pretty_generate(response)) %> +

+ <% end %> <% end %> <% end %> -<% end %> -<% if topics.metrics.present? %> - <%= render "govuk_publishing_components/components/details", { - title: "Metrics", - } do %> - <%= render "govuk_publishing_components/components/summary_list", { - items: topics.metrics.map do |metric, value| - { - field: metric, - value: value, - } - end, - borderless: true, - } %> + <% if topics.metrics.present? %> + <%= render "govuk_publishing_components/components/details", { + title: "Metrics", + } do %> + <%= render "govuk_publishing_components/components/summary_list", { + items: topics.metrics.map do |metric, value| + { + field: metric, + value: value, + } + end, + borderless: true, + } %> + <% end %> <% end %> <% end %> + +<% if answer_relevancy_aggregate.present? %> + <%= render "generic_aggregate_auto_evaluation", aggregate: answer_relevancy_aggregate, title: "Answer relevancy" %> +<% end %> diff --git a/app/views/admin/questions/_generic_aggregate_auto_evaluation.html.erb b/app/views/admin/questions/_generic_aggregate_auto_evaluation.html.erb new file mode 100644 index 000000000..f84f4868f --- /dev/null +++ b/app/views/admin/questions/_generic_aggregate_auto_evaluation.html.erb @@ -0,0 +1,75 @@ +<% + items = [ + { + field: "Mean score", + value: aggregate.mean_score, + }, + ] + + items += aggregate.runs.flat_map.with_index(1) do |run, index| + [ + { field: "Run #{index} score", value: run.score }, + { field: "Run #{index} reason", value: run.reason }, + ] + end +%> + +<%= render "govuk_publishing_components/components/summary_list", { + title:, + heading_level: 2, + margin_bottom: 4, + heading_size: "l", + items: items, +} %> + +<%= render "govuk_publishing_components/components/details", { + title: "LLM responses", +} do %> + <% aggregate.runs.each.with_index(1) do |run, index| %> + <%= render "govuk_publishing_components/components/heading", { + text: "Run #{index}", + font_size: "m", + heading_level: 2, + margin_bottom: 4, + } %> + + <% run.llm_responses.each do 
|namespace, response| %> + <%= render "govuk_publishing_components/components/heading", { + text: namespace.capitalize, + font_size: "s", + heading_level: 3, + } %> + +

+ <%= render("components/code_snippet", content: JSON.pretty_generate(response)) %> +

+ <% end %> + <% end %> +<% end %> + +<%= render "govuk_publishing_components/components/details", { + title: "Metrics", +} do %> + <% aggregate.runs.each.with_index(1) do |run, index| %> + <%= render "govuk_publishing_components/components/heading", { + text: "Run #{index}", + font_size: "m", + heading_level: 2, + } %> + + <% run.metrics.sort.each do |namespace, metrics| %> + <%= render "govuk_publishing_components/components/summary_list", { + title: namespace.capitalize, + items: metrics.map do |metric, value| + { + field: metric, + value: value, + } + end, + borderless: true, + heading_size: "s", + margin_bottom: 6, + } %> + <% end %> + <% end %> +<% end %> diff --git a/app/views/admin/questions/show.html.erb b/app/views/admin/questions/show.html.erb index 724f6cff5..0dd09c4dc 100644 --- a/app/views/admin/questions/show.html.erb +++ b/app/views/admin/questions/show.html.erb @@ -39,8 +39,9 @@ content_for(:active_navigation_item, admin_questions_path) content: render( "analysis_tab", topics: @answer.topics, + answer_relevancy_aggregate: @answer.answer_relevancy_aggregate, ), - } if @answer&.topics.present? + } if @answer&.has_analysis? %>
diff --git a/spec/models/answer_spec.rb b/spec/models/answer_spec.rb index 95f4cfb14..ac019bbea 100644 --- a/spec/models/answer_spec.rb +++ b/spec/models/answer_spec.rb @@ -353,4 +353,23 @@ end end end + + describe "#has_analysis?" do + it "returns true if topics are present" do + answer = build(:answer, :with_topics) + expect(answer.has_analysis?).to be(true) + end + + it "returns true if answer_relevancy_aggregate is present" do + answer = build( + :answer, answer_relevancy_aggregate: build(:answer_relevancy_aggregate) + ) + expect(answer.has_analysis?).to be(true) + end + + it "returns false if no analysis is present" do + answer = build(:answer) + expect(answer.has_analysis?).to be(false) + end + end end diff --git a/spec/requests/admin/questions_spec.rb b/spec/requests/admin/questions_spec.rb index 26cbf26e4..28cb117d0 100644 --- a/spec/requests/admin/questions_spec.rb +++ b/spec/requests/admin/questions_spec.rb @@ -284,7 +284,7 @@ .and have_content('"id": "call_dqGpbb39drQDafLsjDLtnbGD"') end - it "doesn't render the tabs component when there is no analysis" do + it "doesn't render the tabs component when there are no topics or auto-eval aggregate data" do question = create(:question, :with_answer) get admin_show_question_path(question) @@ -361,6 +361,60 @@ .and have_selector("#analysis-tab", text: topics.secondary_topic.capitalize) end end + + context "when answer relevancy aggregate data is present" do + let(:run) do + create( + :answer_relevancy_run, + score: 0.85, + reason: "The answer is relevant to the question.", + llm_responses: { + "statements" => { "statements" => ["The answer is relevant."] }, + "verdicts" => { "verdicts" => [{ "verdict" => "yes" }] }, + }, + metrics: { + "statements" => { duration: 1.55556 }, + "verdicts" => { duration: 1.44445 }, + }, + ) + end + let!(:aggregate) do + create( + :answer_relevancy_aggregate, + runs: [run], + ) + end + let(:question) { aggregate.answer.question } + + it "renders the answer relevancy aggregate and 
run details" do + get admin_show_question_path(question) + + expect(response.body.squish) + .to have_content("Answer relevancy") + .and have_content("Run 1 score") + .and have_content("0.85") + .and have_content("Run 1 reason") + .and have_content("The answer is relevant to the question.") + end + + it "renders the runs llm responses" do + get admin_show_question_path(question) + + expect(response.body.squish) + .to have_content('{ "statements": [ "The answer is relevant." ] }') + .and have_content('{ "verdicts": [ { "verdict": "yes" } ] }') + end + + it "renders the runs metrics" do + get admin_show_question_path(question) + + expect(response.body.squish) + .to have_content("Statements") + .and have_content(/duration.*1\.55556/) + .and have_content("Verdicts") + .and have_content(/duration.*1\.44445/) + end + end end def expect_unprocessable_content_with_date_errors diff --git a/spec/system/user_conversation_activity_is_shown_in_admin_spec.rb b/spec/system/user_conversation_activity_is_shown_in_admin_spec.rb index 5176eb45f..6072a4b2c 100644 --- a/spec/system/user_conversation_activity_is_shown_in_admin_spec.rb +++ b/spec/system/user_conversation_activity_is_shown_in_admin_spec.rb @@ -18,6 +18,7 @@ when_i_click_the_analysis_tab then_i_see_the_topics_have_been_tagged + and_i_see_the_answer_relevancy_statistics and_i_dont_see_the_answer end @@ -105,4 +106,8 @@ def then_i_see_the_topics_have_been_tagged def and_i_dont_see_the_answer expect(page).not_to have_content(@answer) end + + def and_i_see_the_answer_relevancy_statistics + expect(page).to have_content(/Mean score.*1.0/) + end end From 4cabe13801c8ebb71d66f8b4c39dce333dbe533e Mon Sep 17 00:00:00 2001 From: David Gisbey Date: Wed, 7 Jan 2026 09:24:21 +0000 Subject: [PATCH 7/7] Add Answer#question_used We've got a few places in our codebase where we want to use the rephrased question if it exists, otherwise fall back to the original question message in our LLM calls. 
This adds the Answer#question_used method to encapsulate that logic, and updates all relevant places to use this new method. I've removed the tests that were specifically checking for the rephrased question logic in the metrics, since that is now covered by the new method. --- app/jobs/answer_analysis/tag_topics_job.rb | 2 +- app/models/answer.rb | 4 ++++ lib/auto_evaluation/answer_relevancy.rb | 2 +- lib/auto_evaluation/coherence.rb | 2 +- .../jobs/answer_analysis/tag_topics_job_spec.rb | 10 ---------- .../auto_evaluation/answer_relevancy_spec.rb | 10 ---------- spec/lib/auto_evaluation/coherence_spec.rb | 17 ----------------- spec/models/answer_spec.rb | 14 ++++++++++++++ 8 files changed, 21 insertions(+), 40 deletions(-) diff --git a/app/jobs/answer_analysis/tag_topics_job.rb b/app/jobs/answer_analysis/tag_topics_job.rb index c49bbb933..0093a39d0 100644 --- a/app/jobs/answer_analysis/tag_topics_job.rb +++ b/app/jobs/answer_analysis/tag_topics_job.rb @@ -12,7 +12,7 @@ def perform(answer_id) return logger.info("Answer #{answer_id} is not eligible for topic analysis") end - result = AutoEvaluation::TopicTagger.call(answer.rephrased_question || answer.question.message) + result = AutoEvaluation::TopicTagger.call(answer.question_used) topics = answer.build_topics( primary_topic: result.primary_topic, diff --git a/app/models/answer.rb b/app/models/answer.rb index 6c74f931e..2b3de6e3e 100644 --- a/app/models/answer.rb +++ b/app/models/answer.rb @@ -201,4 +201,8 @@ def group_used_answer_sources_by_base_path def has_analysis? topics.present? || answer_relevancy_aggregate.present? 
end + + def question_used + rephrased_question || question.message + end end diff --git a/lib/auto_evaluation/answer_relevancy.rb b/lib/auto_evaluation/answer_relevancy.rb index fde51c8e8..88508216e 100644 --- a/lib/auto_evaluation/answer_relevancy.rb +++ b/lib/auto_evaluation/answer_relevancy.rb @@ -61,7 +61,7 @@ def call attr_accessor :llm_responses, :metrics def question_message - answer.rephrased_question || answer.question.message + answer.question_used end def calculate_score(verdicts) diff --git a/lib/auto_evaluation/coherence.rb b/lib/auto_evaluation/coherence.rb index fbac8ab96..1674eeb2d 100644 --- a/lib/auto_evaluation/coherence.rb +++ b/lib/auto_evaluation/coherence.rb @@ -49,7 +49,7 @@ def normalise_rubric_score(rubric_score) end def question_message - answer.rephrased_question || answer.question.message + answer.question_used end end end diff --git a/spec/jobs/answer_analysis/tag_topics_job_spec.rb b/spec/jobs/answer_analysis/tag_topics_job_spec.rb index 5e19377e2..f57770c20 100644 --- a/spec/jobs/answer_analysis/tag_topics_job_spec.rb +++ b/spec/jobs/answer_analysis/tag_topics_job_spec.rb @@ -39,16 +39,6 @@ ) end - context "when the answer has a rephrased_question" do - let(:rephrased_question) { "This is a rephrased_question" } - - it "calls the AutoEvaluation::TopicTagger with the rephrased question" do - answer = create(:answer, rephrased_question: rephrased_question) - described_class.new.perform(answer.id) - expect(AutoEvaluation::TopicTagger).to have_received(:call).with(rephrased_question) - end - end - context "when the answer does not exist" do let(:answer_id) { 999 } diff --git a/spec/lib/auto_evaluation/answer_relevancy_spec.rb b/spec/lib/auto_evaluation/answer_relevancy_spec.rb index 83ee4c09f..194a57a66 100644 --- a/spec/lib/auto_evaluation/answer_relevancy_spec.rb +++ b/spec/lib/auto_evaluation/answer_relevancy_spec.rb @@ -63,16 +63,6 @@ ) end - context "when the answer has a rephrased question" do - let(:question_message) { "This is a 
rephrased test question." } - let(:answer) { build(:answer, message: answer_message, rephrased_question: question_message) } - - it "uses the rephrased question in the prompt" do - result = described_class.call(answer) - expect(result.reason).to eq(reason) - end - end - context "when 'idk' verdicts are present" do let(:verdicts) do [ diff --git a/spec/lib/auto_evaluation/coherence_spec.rb b/spec/lib/auto_evaluation/coherence_spec.rb index f1a7b1799..b1b2f0023 100644 --- a/spec/lib/auto_evaluation/coherence_spec.rb +++ b/spec/lib/auto_evaluation/coherence_spec.rb @@ -67,22 +67,5 @@ expect(result.success).to eq(expected_score >= described_class::THRESHOLD) end end - - context "when the answer has a rephrased question" do - let(:question_message) { "This is a rephrased test question." } - let(:answer) { build(:answer, message: answer_message, rephrased_question: question_message) } - - it "uses the rephrased question in the prompt" do - stub = stub_bedrock_invoke_model_openai_oss_tool_call( - user_prompt, - tools, - response_json, - ) - - described_class.call(answer) - - expect(stub).to have_been_requested - end - end end end diff --git a/spec/models/answer_spec.rb b/spec/models/answer_spec.rb index ac019bbea..6908131dc 100644 --- a/spec/models/answer_spec.rb +++ b/spec/models/answer_spec.rb @@ -372,4 +372,18 @@ expect(answer.has_analysis?).to be(false) end end + + describe "#question_used" do + let(:question) { build(:question, message: "Original question") } + + it "returns the rephrased question if present" do + answer = build(:answer, question:, rephrased_question: "Rephrased question") + expect(answer.question_used).to eq("Rephrased question") + end + + it "returns the original question message if no rephrased question is present" do + answer = build(:answer, question:, rephrased_question: nil) + expect(answer.question_used).to eq("Original question") + end + end end