Skip to content

Commit b0372c2

Browse files
committed
Integrate Answer Relevancy Analysis into analysis workflow
This adds the BaseMetricjob and AnswerRelevancyJob. The AnswerRelevancyJobs handles: - making calls to the AnswerRelevancy class - compiling the results - creating a AnswerRelevancyAggregate record to store the mean score - calling the AnswerRelevancyAggregate#create_run_from_result method to delegate run creation to the AutoEvaluationMetricRun model The BaseMetricJob is used to store shard functionality for future metric jobs. I've moved logic for saving the records into a concern so it can be reused by other metrics.
1 parent 24ecf68 commit b0372c2

14 files changed

Lines changed: 397 additions & 1 deletion
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
module AnswerAnalysis
2+
class AnswerRelevancyJob < BaseMetricJob
3+
def perform(answer_id)
4+
return if warning_logged?(answer_id)
5+
6+
answer = Answer.includes(:question, :answer_relevancy_aggregate).find_by(id: answer_id)
7+
return logger.warn(aggregate_exists_warn_message(answer.id)) if answer.answer_relevancy_aggregate.present?
8+
9+
results = []
10+
NUMBER_OF_RUNS.times do
11+
results << AutoEvaluation::AnswerRelevancy.call(
12+
question_message: answer.rephrased_question || answer.question.message,
13+
answer_message: answer.message,
14+
)
15+
end
16+
17+
begin
18+
AnswerAnalysis::AnswerRelevancyAggregate.create_mean_aggregate_and_score_runs(answer, results)
19+
rescue ActiveRecord::RecordNotUnique
20+
logger.warn(aggregate_exists_warn_message(answer.id))
21+
end
22+
end
23+
24+
private
25+
26+
def aggregate_exists_warn_message(answer_id)
27+
"Answer #{answer_id} has already been evaluated for relevancy"
28+
end
29+
end
30+
end
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
module AnswerAnalysis
2+
class BaseMetricJob < ApplicationJob
3+
NUMBER_OF_RUNS = 3
4+
MAX_RETRIES = 5
5+
retry_on Aws::Errors::ServiceError, wait: 1.minute, attempts: MAX_RETRIES
6+
7+
private
8+
9+
def warning_logged?(answer_id)
10+
answer = Answer.find_by(id: answer_id)
11+
unless answer
12+
logger.warn("No answer found for #{answer_id}")
13+
return true
14+
end
15+
16+
unless answer.status_answered?
17+
logger.info("Answer #{answer.id} is not eligible for auto-evaluation")
18+
return true
19+
end
20+
21+
false
22+
end
23+
end
24+
end

app/jobs/compose_answer_job.rb

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,11 @@ def perform(question_id)
1414
logger.warn("Already an answer created for #{question_id}")
1515
end
1616

17-
AnswerTopicsJob.perform_later(answer.id) if answer.persisted?
17+
if answer.persisted?
18+
# TODO: Once we've added a few metrics we should move these to a single job that
19+
# kicks off all analysis jobs.
20+
AnswerTopicsJob.perform_later(answer.id)
21+
AnswerAnalysis::AnswerRelevancyJob.perform_later(answer.id)
22+
end
1823
end
1924
end

app/models/answer_analysis/answer_relevancy_aggregate.rb

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
module AnswerAnalysis
22
class AnswerRelevancyAggregate < ApplicationRecord
3+
include AnalysisResultsCreatable
4+
35
belongs_to :answer
46
has_many :runs, class_name: "AnswerAnalysis::AnswerRelevancyRun"
57
end
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
module AnalysisResultsCreatable
2+
extend ActiveSupport::Concern
3+
4+
included do
5+
def self.create_mean_aggregate_and_score_runs(answer, results)
6+
mean_score = results.sum(&:score) / results.size.to_f
7+
aggregate = create!(answer:, mean_score:)
8+
9+
results.each do |result|
10+
run = aggregate.runs.build(
11+
score: result.score,
12+
reason: result.reason,
13+
)
14+
15+
result.llm_responses.stringify_keys.each do |name, llm_response|
16+
run.assign_llm_response(name, llm_response)
17+
end
18+
result.metrics.stringify_keys.each do |name, metrics|
19+
run.assign_metrics(name, metrics)
20+
end
21+
end
22+
23+
aggregate.save!
24+
end
25+
end
26+
end
Lines changed: 203 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,203 @@
1+
RSpec.describe AnswerAnalysis::AnswerRelevancyJob do
2+
include ActiveJob::TestHelper
3+
4+
let(:answer) { create(:answer) }
5+
let(:question) { answer.question }
6+
let(:first_result) do
7+
AutoEvaluation::AnswerRelevancy::Result.new(
8+
score: 0.8,
9+
reason: "The first reason.",
10+
success: true,
11+
llm_responses: {
12+
"response_1" => { "content" => "LLM response content 1" },
13+
"response_2" => { "content" => "LLM response content 2" },
14+
},
15+
metrics: {
16+
"metric_1" => { "detail" => "Metric detail 1" },
17+
"metric_2" => { "detail" => "Metric detail 2" },
18+
},
19+
)
20+
end
21+
let(:second_result) do
22+
AutoEvaluation::AnswerRelevancy::Result.new(
23+
score: 0.7,
24+
reason: "The second reason.",
25+
success: true,
26+
llm_responses: {
27+
"response_3" => { "content" => "LLM response content 3" },
28+
"response_4" => { "content" => "LLM response content 4" },
29+
},
30+
metrics: {
31+
"metric_3" => { "detail" => "Metric detail 3" },
32+
"metric_4" => { "detail" => "Metric detail 4" },
33+
},
34+
)
35+
end
36+
let(:third_result) do
37+
AutoEvaluation::AnswerRelevancy::Result.new(
38+
score: 0.9,
39+
reason: "The third reason.",
40+
success: true,
41+
llm_responses: {
42+
"response_5" => { "content" => "LLM response content 5" },
43+
"response_6" => { "content" => "LLM response content 6" },
44+
},
45+
metrics: {
46+
"metric_5" => { "detail" => "Metric detail 5" },
47+
"metric_6" => { "detail" => "Metric detail 6" },
48+
},
49+
)
50+
end
51+
52+
before do
53+
allow(AutoEvaluation::AnswerRelevancy)
54+
.to receive(:call).and_return(first_result, second_result, third_result)
55+
stub_const("AnswerAnalysis::BaseMetricJob::NUMBER_OF_RUNS", 3)
56+
end
57+
58+
it_behaves_like "a job in queue", "default"
59+
60+
describe "#perform" do
61+
it "calls AutoEvaluation::AnswerRelevancy the configured number of times with the correct arguments" do
62+
described_class.new.perform(answer.id)
63+
64+
expect(AutoEvaluation::AnswerRelevancy)
65+
.to have_received(:call)
66+
.with(
67+
question_message: question.message,
68+
answer_message: answer.message,
69+
)
70+
.exactly(3).times
71+
end
72+
73+
it "creates answer relevancy aggregate with the correct score" do
74+
expect {
75+
described_class.new.perform(answer.id)
76+
}.to change(AnswerAnalysis::AnswerRelevancyAggregate, :count).by(1)
77+
answer = Answer.includes(:answer_relevancy_aggregate)
78+
.find(AnswerAnalysis::AnswerRelevancyAggregate.last.answer_id)
79+
expect(answer.answer_relevancy_aggregate.mean_score.round(2)).to eq(0.8)
80+
end
81+
82+
it "creates answer relevancy runs for each result" do
83+
expect {
84+
described_class.new.perform(answer.id)
85+
}.to change(AnswerAnalysis::AnswerRelevancyRun, :count).by(3)
86+
87+
answer = Answer.includes(answer_relevancy_aggregate: :runs)
88+
.find(AnswerAnalysis::AnswerRelevancyAggregate.last.answer_id)
89+
90+
[first_result, second_result, third_result].each_with_index do |result, index|
91+
expect(answer.answer_relevancy_aggregate.runs.sort_by(&:created_at)[index])
92+
.to have_attributes(
93+
score: result.score.round(2),
94+
reason: result.reason,
95+
llm_responses: result.llm_responses,
96+
metrics: result.metrics,
97+
)
98+
end
99+
end
100+
101+
context "when the answer has a rephrased_question" do
102+
let(:rephrased_question) { "This is a rephrased_question" }
103+
104+
it "passes the rephrased question to AutoEvaluation::AnswerRelevancy as the question_message" do
105+
answer = create(:answer, rephrased_question: rephrased_question)
106+
107+
described_class.new.perform(answer.id)
108+
109+
expect(AutoEvaluation::AnswerRelevancy)
110+
.to have_received(:call)
111+
.with(
112+
question_message: rephrased_question,
113+
answer_message: answer.message,
114+
)
115+
.exactly(3).times
116+
end
117+
end
118+
119+
context "when aggegate data is persisted mid job" do
120+
before do
121+
allow(AnswerAnalysis::AnswerRelevancyAggregate)
122+
.to receive(:create_mean_aggregate_and_score_runs)
123+
.with(answer, anything)
124+
.and_raise(ActiveRecord::RecordNotUnique)
125+
end
126+
127+
it "logs a warning" do
128+
expect(described_class.logger)
129+
.to receive(:warn)
130+
.with("Answer #{answer.id} has already been evaluated for relevancy")
131+
132+
described_class.new.perform(answer.id)
133+
end
134+
end
135+
136+
context "when the answer does not exist" do
137+
let(:answer_id) { 999 }
138+
139+
it "logs a warning" do
140+
expect(described_class.logger)
141+
.to receive(:warn)
142+
.with("No answer found for #{answer_id}")
143+
144+
described_class.new.perform(answer_id)
145+
end
146+
147+
it "doesn't call AutoEvaluation::AnswerRelevancy" do
148+
described_class.new.perform(answer_id)
149+
expect(AutoEvaluation::AnswerRelevancy).not_to have_received(:call)
150+
end
151+
end
152+
153+
context "when answer relevancy has already been evaluated" do
154+
let(:aggregate) { create(:answer_relevancy_aggregate) }
155+
let(:answer) { aggregate.answer }
156+
157+
it "logs a warning" do
158+
expect(described_class.logger)
159+
.to receive(:warn)
160+
.with("Answer #{answer.id} has already been evaluated for relevancy")
161+
162+
described_class.new.perform(answer.id)
163+
end
164+
165+
it "doesn't call AutoEvaluation::AnswerRelevancy" do
166+
described_class.new.perform(answer.id)
167+
expect(AutoEvaluation::AnswerRelevancy).not_to have_received(:call)
168+
end
169+
end
170+
171+
context "when the AnswerRelevancy metric raises an Aws::Errors::ServiceError" do
172+
it "retries the job the max number of times" do
173+
allow(AutoEvaluation::AnswerRelevancy).to receive(:call)
174+
.and_raise(Aws::Errors::ServiceError.new(nil, "error"))
175+
176+
(described_class::MAX_RETRIES - 1).times do
177+
described_class.perform_later(answer.id)
178+
expect { perform_enqueued_jobs }.not_to raise_error
179+
end
180+
181+
described_class.perform_later(answer.id)
182+
expect { perform_enqueued_jobs }.to raise_error(Aws::Errors::ServiceError)
183+
end
184+
end
185+
186+
context "when the answer is not eligible for auto-evaluation" do
187+
let(:answer) { create(:answer, status: Answer.statuses.except(:answered).keys.sample) }
188+
189+
it "logs an info message" do
190+
expect(described_class.logger)
191+
.to receive(:info)
192+
.with("Answer #{answer.id} is not eligible for auto-evaluation")
193+
194+
described_class.new.perform(answer.id)
195+
end
196+
197+
it "does not call AutoEvaluation::AnswerRelevancy" do
198+
expect(AutoEvaluation::AnswerRelevancy).not_to receive(:call)
199+
described_class.new.perform(answer.id)
200+
end
201+
end
202+
end
203+
end

spec/jobs/compose_answer_job_spec.rb

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
before do
77
allow(AnswerComposition::Composer).to receive(:call).and_return(returned_answer)
88
allow(AnswerTopicsJob).to receive(:perform_later)
9+
allow(AnswerAnalysis::AnswerRelevancyJob).to receive(:perform_later)
910
end
1011

1112
it_behaves_like "a job in queue", "answer"
@@ -22,6 +23,11 @@
2223
expect(AnswerTopicsJob).to have_received(:perform_later).with(returned_answer.id)
2324
end
2425

26+
it "calls the AnswerAnalysis::AnswerRelevancyJob with the answer_id" do
27+
described_class.new.perform(question.id)
28+
expect(AnswerAnalysis::AnswerRelevancyJob).to have_received(:perform_later).with(returned_answer.id)
29+
end
30+
2531
context "when the question has already been answered" do
2632
let(:question) { create(:question, :with_answer) }
2733

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
RSpec.describe AnswerAnalysis::AnswerRelevancyAggregate do
2+
include_examples "analysis results creatable",
3+
:answer_relevancy_aggregate,
4+
AnswerAnalysis::AnswerRelevancyRun,
5+
AutoEvaluation::AnswerRelevancy::Result
6+
end

spec/requests/api/v1/conversation_flow_spec.rb

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@ def when_i_create_a_conversation
7474
)
7575
end
7676
allow(AnswerTopicsJob).to receive(:perform_later)
77+
allow(AnswerAnalysis::AnswerRelevancyJob).to receive(:perform_later)
7778

7879
post api_v1_create_conversation_path,
7980
params: { user_question: "What is the capital of France?" },

0 commit comments

Comments
 (0)