Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 8 additions & 5 deletions lib/auto_evaluation/answer_relevancy.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,14 @@ class AutoEvaluation::AnswerRelevancy

# Convenience entry point: forwards every argument unchanged to `new` and
# immediately invokes `call` on the freshly built instance.
#
# @return whatever the instance-level `call` returns
def self.call(...)
  new(...).call
end

def initialize(question_message:, answer_message:)
@question_message = question_message
@answer_message = answer_message
# Stores the answer under evaluation and prepares the empty collectors that
# `call` fills in as it runs (e.g. the `:statements` entries).
#
# @param answer [Object] the answer to evaluate; this class reads
#   `answer.message`, `answer.rephrased_question` and `answer.question.message`
def initialize(answer)
  # Per-step collectors, keyed by step name.
  @llm_responses = {}
  @metrics = {}
  @answer = answer
end

def call
statements, llm_responses[:statements], metrics[:statements] = StatementGenerator.call(answer_message:)
statements, llm_responses[:statements], metrics[:statements] = StatementGenerator.call(answer_message: answer.message)

if statements.empty?
return build_maximum_score_result(
Expand Down Expand Up @@ -58,9 +57,13 @@ def call

private

attr_reader :question_message, :answer_message
attr_reader :answer
attr_accessor :llm_responses, :metrics

# Returns the question text to evaluate the answer against.
#
# Prefers the answer's rephrased question and falls back to the original
# question's message. In Ruby an empty string is truthy, so a plain `||`
# would pick a blank rephrased question; blank values (nil, "" or
# whitespace-only) are therefore treated as absent.
#
# @return [String] the rephrased question when present, otherwise the
#   original question message
def question_message
  rephrased = answer.rephrased_question
  return rephrased if rephrased && !rephrased.to_s.strip.empty?

  answer.question.message
end

def calculate_score(verdicts)
verdict_count = verdicts.count
return 1.0 if verdict_count.zero?
Expand Down
13 changes: 8 additions & 5 deletions lib/auto_evaluation/coherence.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,8 @@ class Coherence

# Convenience entry point: forwards every argument unchanged to `new` and
# immediately invokes `call` on the freshly built instance.
#
# @return whatever the instance-level `call` returns
def self.call(...)
  new(...).call
end

def initialize(question_message:, answer_message:)
@question_message = question_message
@answer_message = answer_message
# Stores the answer under evaluation.
#
# @param answer [Object] the answer to evaluate; this class reads
#   `answer.message`, `answer.rephrased_question` and `answer.question.message`
def initialize(answer)
@answer = answer
end

def call
Expand All @@ -24,7 +23,7 @@ def call

private

attr_reader :question_message, :answer_message
attr_reader :answer

def llm_prompts
Prompts.config.coherence
Expand All @@ -33,7 +32,7 @@ def llm_prompts
# Builds the user prompt by filling the configured `:user_prompt` template
# with the answer text and the question text.
#
# NOTE(review): the two `answer:` entries below look like the removed and
# added sides of a diff hunk shown together; presumably only
# `answer: answer.message` is meant to remain — confirm against the real file.
def user_prompt
sprintf(
llm_prompts.fetch(:user_prompt),
answer: answer_message,
answer: answer.message,
question: question_message,
)
end
Expand All @@ -48,5 +47,9 @@ def normalise_rubric_score(rubric_score)

(rubric_score.to_d - min_rubric_score) / (max_rubric_score - min_rubric_score)
end

# Returns the question text that is interpolated into the user prompt.
#
# Prefers the answer's rephrased question and falls back to the original
# question's message. In Ruby an empty string is truthy, so a plain `||`
# would pick a blank rephrased question; blank values (nil, "" or
# whitespace-only) are therefore treated as absent.
#
# @return [String] the rephrased question when present, otherwise the
#   original question message
def question_message
  rephrased = answer.rephrased_question
  return rephrased if rephrased && !rephrased.to_s.strip.empty?

  answer.question.message
end
end
end
5 changes: 1 addition & 4 deletions lib/auto_evaluation/evaluate_answer_from_question_message.rb
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,7 @@ def call
raise TaskFailedError, error_message
end

evaluation_class.call(
question_message:,
answer_message: answer.message,
)
evaluation_class.call(answer)
end

private
Expand Down
37 changes: 17 additions & 20 deletions spec/lib/auto_evaluation/answer_relevancy_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
let(:prompts) { AutoEvaluation::Prompts.config.answer_relevancy }
let(:question_message) { "This is a test question message." }
let(:answer_message) { "This is a test answer message." }
let(:question) { build(:question, message: question_message) }
let(:answer) { build(:answer, question:, message: answer_message) }

let(:statements) { ["This is the first statement.", "This is the second statement."] }
let(:statements_json) { { statements: }.to_json }
Expand Down Expand Up @@ -67,10 +69,7 @@
allow(Clock).to receive(:monotonic_time)
.and_return(200.0, 202.0, 204.0, 206.0, 208.0, 210.0)

result = described_class.call(
question_message:,
answer_message:,
)
result = described_class.call(answer)

expected_llm_responses = {
statements: JSON.parse(statements_stub.response.body),
Expand Down Expand Up @@ -100,6 +99,16 @@
)
end

context "when the answer has a rephrased question" do
  # Override the shared question text so anything derived from
  # `question_message` sees the rephrased wording.
  let(:question_message) { "This is a rephrased test question." }
  let(:answer) do
    build(:answer, message: answer_message, rephrased_question: question_message)
  end

  # NOTE(review): only the returned reason is asserted here; presumably the
  # LLM stubs defined outside this view are keyed on `question_message`, so a
  # prompt built from the wrong question would fail to match — confirm.
  it "uses the rephrased question in the prompt" do
    expect(described_class.call(answer).reason).to eq(reason)
  end
end

context "when 'idk' verdicts are present" do
let(:verdicts) do
[
Expand All @@ -109,10 +118,7 @@
end

it "treats 'idk' verdicts as positive in the score" do
result = described_class.call(
question_message:,
answer_message:,
)
result = described_class.call(answer)

expect(result.score).to eq(0.5)
end
Expand All @@ -124,10 +130,7 @@
it "returns a result object with the expected attributes" do
allow(Clock).to receive(:monotonic_time).and_return(200.0, 202.0)

result = described_class.call(
question_message:,
answer_message:,
)
result = described_class.call(answer)

expect(result)
.to be_a(AutoEvaluation::ScoreResult)
Expand All @@ -148,10 +151,7 @@
allow(Clock).to receive(:monotonic_time)
.and_return(200.0, 202.0, 204.0, 206.0)

result = described_class.call(
question_message:,
answer_message:,
)
result = described_class.call(answer)

expect(result)
.to be_a(AutoEvaluation::ScoreResult)
Expand All @@ -177,10 +177,7 @@
it "returns a result object with the expected attributes" do
allow(Clock).to receive(:monotonic_time).and_return(200.0, 202.0, 204.0, 206.0)

result = described_class.call(
question_message:,
answer_message:,
)
result = described_class.call(answer)

expect(result)
.to be_a(AutoEvaluation::ScoreResult)
Expand Down
29 changes: 21 additions & 8 deletions spec/lib/auto_evaluation/coherence_spec.rb
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
RSpec.describe AutoEvaluation::Coherence, :aws_credentials_stubbed do
describe ".call" do
let(:prompts) { AutoEvaluation::Prompts.config.coherence }
let(:question) { build(:question, message: question_message) }
let(:answer) { build(:answer, question:, message: answer_message) }
let(:question_message) { "This is a test question message." }
let(:answer_message) { "This is a test answer message." }
let(:reason) { "This is the reason for the score." }
Expand All @@ -22,10 +24,7 @@
response_json,
)

result = described_class.call(
question_message:,
answer_message:,
)
result = described_class.call(answer)

expected_metrics = {
coherence: {
Expand Down Expand Up @@ -62,14 +61,28 @@
response_json,
)

result = described_class.call(
question_message:,
answer_message:,
)
result = described_class.call(answer)

expect(result.score).to eq(expected_score)
expect(result.success).to eq(expected_score >= described_class::THRESHOLD)
end
end

context "when the answer has a rephrased question" do
  # Override the shared question text so anything derived from
  # `question_message` sees the rephrased wording.
  let(:question_message) { "This is a rephrased test question." }
  let(:answer) do
    build(:answer, message: answer_message, rephrased_question: question_message)
  end

  it "uses the rephrased question in the prompt" do
    # Register the expected request first, then run the evaluation and check
    # that the registered request was actually made.
    bedrock_request = bedrock_invoke_model_openai_oss_tool_call(user_prompt, tools, response_json)

    described_class.call(answer)

    expect(bedrock_request).to have_been_requested
  end
end
end
end
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,7 @@
question_message:,
)

expect(evaluation_klass).to have_received(:call).with(
question_message:,
answer_message: answer.message,
)
expect(evaluation_klass).to have_received(:call).with(answer)
end

it "returns the AutoEvaluation::ScoreResult generated by the evaluation class" do
Expand Down