Skip to content

Commit 28829ab

Browse files
committed
Get basic implementation working
1 parent d024092 commit 28829ab

8 files changed

Lines changed: 337 additions & 1 deletion

File tree

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
module AnswerAnalysis
  # Evaluates how relevant an answer is to its question and stores the outcome.
  #
  # The evaluation is executed BaseMetricJob::NUMBER_OF_RUNS times. The mean of
  # the run scores is stored on an AnswerRelevancyAggregate and every individual
  # result is stored as a run belonging to that aggregate.
  class AnswerRelevancyJob < BaseMetricJob
    # @param answer_id [Object] id used to look up the Answer to evaluate
    # @return [void]
    def perform(answer_id)
      answer = Answer.includes(:question, :answer_relevancy_aggregate).find_by(id: answer_id)
      # log_warnings returns true (after logging) when the answer is missing or
      # is not in the "answered" status.
      return if log_warnings(answer, answer_id)
      return logger.warn(aggregate_exists_warn_message(answer.id)) if answer.answer_relevancy_aggregate.present?

      results = Array.new(BaseMetricJob::NUMBER_OF_RUNS) do
        AutoEvaluation::AnswerRelevancy.call(
          # Prefer the rephrased question when the answer has one.
          question_message: answer.rephrased_question || answer.question.message,
          answer_message: answer.message,
        )
      end

      # Checked again after the evaluation calls; presumably this guards
      # against another job persisting an aggregate while they were running.
      if AnswerAnalysis::AnswerRelevancyAggregate.exists?(answer_id: answer.id)
        return logger.warn(aggregate_exists_warn_message(answer.id))
      end

      mean_score = results.sum(&:score) / results.size.to_f

      # Persist the aggregate and all of its runs atomically. Without the
      # transaction, an error while saving a run would leave a partially
      # populated aggregate behind, and the guards above would then skip every
      # later attempt to evaluate this answer.
      AnswerAnalysis::AnswerRelevancyAggregate.transaction do
        aggregate = answer.create_answer_relevancy_aggregate!(score: mean_score)
        results.each { |result| aggregate.create_run_from_result(result) }
      end
    end

  private

    # Builds the message logged when an aggregate already exists for an answer.
    def aggregate_exists_warn_message(answer_id)
      "Answer #{answer_id} has already been evaluated for relevancy"
    end
  end
end
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
module AnswerAnalysis
  # Common behaviour for jobs that compute an auto-evaluation metric for an
  # answer: shared constants, the retry policy and an eligibility check.
  class BaseMetricJob < ApplicationJob
    # How many times a metric job repeats its evaluation.
    NUMBER_OF_RUNS = 3
    # Passed to retry_on as the `attempts:` option.
    MAX_RETRIES = 5

    retry_on Aws::Errors::ServiceError, wait: 1.minute, attempts: MAX_RETRIES

  private

    # Logs why an answer cannot be evaluated, if that is the case.
    #
    # @param answer [Answer, nil] the looked-up record, or nil when not found
    # @param answer_id [Object] the id that was used for the lookup
    # @return [Boolean] true when something was logged and the caller should
    #   stop, false when the answer can be evaluated
    def log_warnings(answer, answer_id)
      if answer
        return false if answer.status_answered?

        logger.info("Answer #{answer.id} is not eligible for auto-evaluation")
      else
        logger.warn("No answer found for #{answer_id}")
      end

      true
    end
  end
end

app/jobs/compose_answer_job.rb

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,9 @@ def perform(question_id)
1414
logger.warn("Already an answer created for #{question_id}")
1515
end
1616

17-
AnswerTopicsJob.perform_later(answer.id) if answer.persisted?
17+
if answer.persisted?
18+
AnswerTopicsJob.perform_later(answer.id)
19+
AnswerRelevancyJob.perform_later(answer.id)
20+
end
1821
end
1922
end

app/models/answer_analysis/answer_relevancy_aggregate.rb

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,5 +2,21 @@ module AnswerAnalysis
22
class AnswerRelevancyAggregate < ApplicationRecord
  belongs_to :answer
  has_many :runs, class_name: "AnswerAnalysis::AnswerRelevancyRun"

  # Builds and saves a run for this aggregate from a single evaluation result.
  #
  # The run takes its score and reason from the result. Each entry of the
  # result's llm_responses and metrics is handed to the run, keyed by the
  # stringified name, before the run is saved.
  #
  # @param result [#score, #reason, #llm_responses, #metrics] evaluation result
  # @return [true] the return value of save!
  # @raise whatever save! raises when the run cannot be saved
  def create_run_from_result(result)
    new_run = runs.build(score: result.score, reason: result.reason)

    result.llm_responses.stringify_keys.each_pair do |key, response|
      new_run.assign_llm_response(key, response)
    end

    result.metrics.stringify_keys.each_pair do |key, values|
      new_run.assign_metrics(key, values)
    end

    new_run.save!
  end
end
622
end

app/models/auto_evaluation_metric_run.rb

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
class AutoEvaluationMetricRun < ApplicationRecord
2+
include LlmCallsRecordable
3+
24
belongs_to :metric_aggregate,
35
class_name: "AutoEvaluationMetricAggregate",
46
foreign_key: :auto_evaluation_metric_aggregate_id
Lines changed: 210 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,210 @@
1+
RSpec.describe AnswerAnalysis::AnswerRelevancyJob do
  include ActiveJob::TestHelper

  let(:answer) { create(:answer) }
  let(:question) { answer.question }
  let(:first_result) do
    AutoEvaluation::AnswerRelevancy::Result.new(
      score: 0.8,
      reason: "The first reason.",
      success: true,
      llm_responses: {
        "response_1" => { "content" => "LLM response content 1" },
        "response_2" => { "content" => "LLM response content 2" },
      },
      metrics: {
        "metric_1" => { "detail" => "Metric detail 1" },
        "metric_2" => { "detail" => "Metric detail 2" },
      },
    )
  end
  let(:second_result) do
    AutoEvaluation::AnswerRelevancy::Result.new(
      score: 0.7,
      reason: "The second reason.",
      success: true,
      llm_responses: {
        "response_3" => { "content" => "LLM response content 3" },
        "response_4" => { "content" => "LLM response content 4" },
      },
      metrics: {
        "metric_3" => { "detail" => "Metric detail 3" },
        "metric_4" => { "detail" => "Metric detail 4" },
      },
    )
  end
  let(:third_result) do
    AutoEvaluation::AnswerRelevancy::Result.new(
      score: 0.9,
      reason: "The third reason.",
      success: true,
      llm_responses: {
        "response_5" => { "content" => "LLM response content 5" },
        "response_6" => { "content" => "LLM response content 6" },
      },
      metrics: {
        "metric_5" => { "detail" => "Metric detail 5" },
        "metric_6" => { "detail" => "Metric detail 6" },
      },
    )
  end

  before do
    # Each evaluation call returns the next result in turn.
    allow(AutoEvaluation::AnswerRelevancy)
      .to receive(:call).and_return(first_result, second_result, third_result)
    stub_const("AnswerAnalysis::BaseMetricJob::NUMBER_OF_RUNS", 3)
  end

  it_behaves_like "a job in queue", "default"

  describe "#perform" do
    it "calls AutoEvaluation::AnswerRelevancy the configured number of times with the correct arguments" do
      described_class.new.perform(answer.id)

      expect(AutoEvaluation::AnswerRelevancy)
        .to have_received(:call)
        .with(
          question_message: question.message,
          answer_message: answer.message,
        )
        .exactly(3).times
    end

    it "creates answer relevancy aggregate with the correct score" do
      expect {
        described_class.new.perform(answer.id)
      }.to change(AnswerAnalysis::AnswerRelevancyAggregate, :count).by(1)

      # Named differently from the `answer` helper so the helper is not shadowed.
      evaluated_answer = Answer.includes(:answer_relevancy_aggregate)
                               .find(AnswerAnalysis::AnswerRelevancyAggregate.last.answer_id)
      # Mean of the stubbed scores: (0.8 + 0.7 + 0.9) / 3
      expect(evaluated_answer.answer_relevancy_aggregate.score.round(2)).to eq(0.8)
    end

    it "creates answer relevancy runs for each result" do
      expect {
        described_class.new.perform(answer.id)
      }.to change(AnswerAnalysis::AnswerRelevancyRun, :count).by(3)

      evaluated_answer = Answer.includes(answer_relevancy_aggregate: :runs)
                               .find(AnswerAnalysis::AnswerRelevancyAggregate.last.answer_id)

      expected_runs = [first_result, second_result, third_result].map do |result|
        have_attributes(
          score: result.score.round(2),
          reason: result.reason,
          llm_responses: result.llm_responses,
          metrics: result.metrics,
        )
      end

      # The runs association declares no ordering, so match the runs without
      # depending on the order in which they are returned.
      expect(evaluated_answer.answer_relevancy_aggregate.runs.to_a)
        .to contain_exactly(*expected_runs)
    end

    context "when the answer has a rephrased_question" do
      let(:rephrased_question) { "This is a rephrased_question" }
      let(:answer) { create(:answer, rephrased_question: rephrased_question) }

      it "passes the rephrased question to AutoEvaluation::AnswerRelevancy as the question_message" do
        described_class.new.perform(answer.id)

        expect(AutoEvaluation::AnswerRelevancy)
          .to have_received(:call)
          .with(
            question_message: rephrased_question,
            answer_message: answer.message,
          )
          .exactly(3).times
      end
    end

    context "when aggregate data is persisted mid job" do
      before do
        # Simulates an aggregate appearing between the initial lookup and the
        # second existence check made by the job.
        allow(AnswerAnalysis::AnswerRelevancyAggregate)
          .to receive(:exists?)
          .with(answer_id: answer.id)
          .and_return(true)
      end

      it "logs a warning" do
        expect(described_class.logger)
          .to receive(:warn)
          .with("Answer #{answer.id} has already been evaluated for relevancy")

        described_class.new.perform(answer.id)
      end

      it "doesn't create an aggregate or runs" do
        expect {
          described_class.new.perform(answer.id)
        }.to not_change(AnswerAnalysis::AnswerRelevancyAggregate, :count)
         .and not_change(AnswerAnalysis::AnswerRelevancyRun, :count)
      end
    end

    context "when the answer does not exist" do
      let(:answer_id) { 999 }

      it "logs a warning" do
        expect(described_class.logger)
          .to receive(:warn)
          .with("No answer found for #{answer_id}")

        described_class.new.perform(answer_id)
      end

      it "doesn't call AutoEvaluation::AnswerRelevancy" do
        described_class.new.perform(answer_id)
        expect(AutoEvaluation::AnswerRelevancy).not_to have_received(:call)
      end
    end

    context "when answer relevancy has already been evaluated" do
      let(:aggregate) { create(:answer_relevancy_aggregate) }
      let(:answer) { aggregate.answer }

      it "logs a warning" do
        expect(described_class.logger)
          .to receive(:warn)
          .with("Answer #{answer.id} has already been evaluated for relevancy")

        described_class.new.perform(answer.id)
      end

      it "doesn't call AutoEvaluation::AnswerRelevancy" do
        described_class.new.perform(answer.id)
        expect(AutoEvaluation::AnswerRelevancy).not_to have_received(:call)
      end
    end

    context "when the AnswerRelevancy metric raises an Aws::Errors::ServiceError" do
      it "retries the job the max number of times" do
        allow(AutoEvaluation::AnswerRelevancy).to receive(:call)
          .and_raise(Aws::Errors::ServiceError.new(nil, "error"))

        (described_class::MAX_RETRIES - 1).times do
          described_class.perform_later(answer.id)
          expect { perform_enqueued_jobs }.not_to raise_error
        end

        described_class.perform_later(answer.id)
        expect { perform_enqueued_jobs }.to raise_error(Aws::Errors::ServiceError)
      end
    end

    context "when the answer is not eligible for auto-evaluation" do
      let(:answer) { create(:answer, status: Answer.statuses.except(:answered).keys.sample) }

      it "logs an info message" do
        expect(described_class.logger)
          .to receive(:info)
          .with("Answer #{answer.id} is not eligible for auto-evaluation")

        described_class.new.perform(answer.id)
      end

      it "does not call AutoEvaluation::AnswerRelevancy" do
        described_class.new.perform(answer.id)
        expect(AutoEvaluation::AnswerRelevancy).not_to have_received(:call)
      end
    end
  end
end

spec/jobs/compose_answer_job_spec.rb

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
before do
77
allow(AnswerComposition::Composer).to receive(:call).and_return(returned_answer)
88
allow(AnswerTopicsJob).to receive(:perform_later)
9+
allow(AnswerRelevancyJob).to receive(:perform_later)
910
end
1011

1112
it_behaves_like "a job in queue", "answer"
@@ -22,6 +23,11 @@
2223
expect(AnswerTopicsJob).to have_received(:perform_later).with(returned_answer.id)
2324
end
2425

26+
it "calls the AnswerRelevancyJob with the answer_id" do
27+
described_class.new.perform(question.id)
28+
expect(AnswerRelevancyJob).to have_received(:perform_later).with(returned_answer.id)
29+
end
30+
2531
context "when the question has already been answered" do
2632
let(:question) { create(:question, :with_answer) }
2733

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
RSpec.describe AnswerAnalysis::AnswerRelevancyAggregate do
  describe "#create_run_from_result" do
    # Fixture builders return a fresh hash on every call, so the hashes given
    # to the result and the hashes used in the expectation are separate objects.
    def llm_responses_fixture
      {
        "response_1" => { "content" => "LLM response content 1" },
        "response_2" => { "content" => "LLM response content 2" },
      }
    end

    def metrics_fixture
      {
        "metric_1" => { "detail" => "Metric detail 1" },
        "metric_2" => { "detail" => "Metric detail 2" },
      }
    end

    let(:aggregate) { create(:answer_relevancy_aggregate) }
    let(:result) do
      AutoEvaluation::AnswerRelevancy::Result.new(
        score: 0.85,
        reason: "The answer is relevant to the question.",
        success: true,
        llm_responses: llm_responses_fixture,
        metrics: metrics_fixture,
      )
    end

    it "creates a run with correct attributes and associations" do
      expect { aggregate.create_run_from_result(result) }
        .to change { aggregate.runs.count }.by(1)

      created_run = aggregate.runs.strict_loading(false).last

      expect(created_run).to have_attributes(
        score: 0.85,
        reason: "The answer is relevant to the question.",
        llm_responses: llm_responses_fixture,
        metrics: metrics_fixture,
      )
    end
  end
end

0 commit comments

Comments
 (0)