Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions lib/auto_evaluation.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
module AutoEvaluation
  # Immutable value object shared by the auto-evaluation metrics as their
  # return type.
  #
  # Members (all required, passed as keywords):
  #   score         - numeric score produced by the evaluation
  #   reason        - textual explanation accompanying the score
  #   success       - whether the evaluation counts as a pass
  #   llm_responses - LLM responses collected while evaluating
  #   metrics       - metrics collected while evaluating
  ScoreResult = Data.define(*%i[score reason success llm_responses metrics])
end
12 changes: 2 additions & 10 deletions lib/auto_evaluation/answer_relevancy.rb
Original file line number Diff line number Diff line change
@@ -1,12 +1,4 @@
class AutoEvaluation::AnswerRelevancy
Result = Data.define(
:score,
:reason,
:success,
:llm_responses,
:metrics,
)

THRESHOLD = 0.5

def self.call(...) = new(...).call
Expand Down Expand Up @@ -55,7 +47,7 @@ def call
question_message:, verdicts:, score:,
)

Result.new(
AutoEvaluation::ScoreResult.new(
score:,
reason:,
success: score >= THRESHOLD,
Expand All @@ -78,7 +70,7 @@ def calculate_score(verdicts)
end

def build_maximum_score_result(reason:, llm_responses:, metrics:)
Result.new(
AutoEvaluation::ScoreResult.new(
score: 1.0,
reason:,
success: true,
Expand Down
14 changes: 12 additions & 2 deletions lib/auto_evaluation/bedrock_openai_oss_invoke.rb
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
module AutoEvaluation
class BedrockOpenAIOssInvoke
class InvalidToolCallSchemaError < StandardError; end
Result = Data.define(
:evaluation_data,
:llm_response,
Expand Down Expand Up @@ -33,12 +34,14 @@ def call
}.to_json,
)
parsed_response = JSON.parse(response.body.read)
parsed_structured_output = JSON.parse(
parsed_tool_output = JSON.parse(
parsed_response["choices"][0]["message"]["tool_calls"][0]["function"]["arguments"],
)

validate_tool_output_against_schema(parsed_tool_output)

Result.new(
evaluation_data: parsed_structured_output,
evaluation_data: parsed_tool_output,
llm_response: parsed_response,
metrics: build_metrics(start_time, parsed_response),
)
Expand All @@ -57,5 +60,12 @@ def build_metrics(start_time, response)
model: response["model"],
}
end

# Checks the parsed tool-call arguments against the JSON schema declared under
# "function" => "parameters" of the first entry in `tools`.
#
# tool_output - the parsed arguments returned by the model's tool call.
#
# Returns whatever JSON::Validator.validate! returns when validation passes.
# Raises InvalidToolCallSchemaError (wrapping the validator's message) when the
# validator reports a JSON::Schema::ValidationError.
def validate_tool_output_against_schema(tool_output)
  parameters_schema = tools.dig(0, "function", "parameters")
  JSON::Validator.validate!(parameters_schema, tool_output)
rescue JSON::Schema::ValidationError => validation_error
  failure_message = "Tool call response does not match schema: #{validation_error.message}"
  raise InvalidToolCallSchemaError, failure_message
end
end
end
52 changes: 52 additions & 0 deletions lib/auto_evaluation/coherence.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
module AutoEvaluation
  # Evaluates an answer for coherence by sending a prompt (built from the
  # question and the answer) plus a tool spec to BedrockOpenAIOssInvoke, then
  # converting the rubric score it returns into an AutoEvaluation::ScoreResult.
  class Coherence
    # Minimum normalised score for the result to be marked as a success.
    THRESHOLD = 0.75

    # Convenience entry point: builds an instance and runs it.
    def self.call(...) = new(...).call

    # question_message - the question text given to the prompt.
    # answer_message   - the answer text given to the prompt.
    def initialize(question_message:, answer_message:)
      @question_message = question_message
      @answer_message = answer_message
    end

    # Runs the evaluation.
    #
    # Returns an AutoEvaluation::ScoreResult whose score is the normalised
    # rubric score, whose reason is the whitespace-stripped "reason" from the
    # evaluation data, and whose llm_responses/metrics are keyed by :coherence.
    # Raises KeyError if the evaluation data lacks "score" or "reason".
    def call
      invocation = BedrockOpenAIOssInvoke.call(user_prompt, tools)
      evaluation = invocation.evaluation_data

      score = normalise_rubric_score(evaluation.fetch("score"))
      reason = evaluation.fetch("reason").strip
      success = score >= THRESHOLD

      AutoEvaluation::ScoreResult.new(
        score:,
        reason:,
        success:,
        llm_responses: { coherence: invocation.llm_response },
        metrics: { coherence: invocation.metrics },
      )
    end

    private

    attr_reader :question_message, :answer_message

    # Prompt configuration for this metric.
    def llm_prompts
      Prompts.config.coherence
    end

    # Fills the configured user prompt template; the answer and question are
    # supplied as named format arguments.
    def user_prompt
      template = llm_prompts.fetch(:user_prompt)
      format(template, answer: answer_message, question: question_message)
    end

    # The configured tool spec, wrapped in an array as the list of tools.
    def tools
      tool_spec = llm_prompts.fetch(:tool_spec)
      [tool_spec]
    end

    # Rescales a rubric score relative to the configured minimum and maximum
    # rubric scores: the minimum maps to 0 and the maximum maps to 1.
    def normalise_rubric_score(rubric_score)
      lowest = llm_prompts.fetch(:config).fetch(:min_rubric_score)
      highest = llm_prompts.fetch(:config).fetch(:max_rubric_score)

      distance_from_lowest = rubric_score.to_d - lowest
      rubric_range = highest - lowest
      distance_from_lowest / rubric_range
    end
  end
end
35 changes: 35 additions & 0 deletions lib/auto_evaluation/evaluate_answer_from_question_message.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
module AutoEvaluation
  # Generates an answer for a question message by running an answer
  # composition pipeline, then hands the question and the generated answer to
  # the given evaluation class.
  class EvaluateAnswerFromQuestionMessage
    # Raised when the generated answer has an error status, so no evaluation
    # can be run against it.
    class TaskFailedError < StandardError; end

    # Convenience entry point: builds an instance and runs it.
    def self.call(...) = new(...).call

    # evaluation_class - object responding to
    #                    call(question_message:, answer_message:).
    # question_message - the question text to generate an answer for.
    def initialize(evaluation_class:, question_message:)
      @evaluation_class = evaluation_class
      @question_message = question_message
    end

    # Composes an answer and evaluates it.
    #
    # Returns whatever evaluation_class.call returns.
    # Raises TaskFailedError if the answer's status begins with "error".
    def call
      answer = compose_answer
      raise TaskFailedError, failure_message(answer) if errored?(answer)

      evaluation_class.call(
        question_message:,
        answer_message: answer.message,
      )
    end

    private

    attr_reader :evaluation_class, :question_message

    # Runs the pipeline (SearchResultFetcher followed by the Claude
    # StructuredAnswerComposer) for a newly instantiated Question that wraps
    # the message in a new Conversation.
    def compose_answer
      question = Question.new(message: question_message, conversation: Conversation.new)

      AnswerComposition::PipelineRunner.call(question:, pipeline: [
        AnswerComposition::Pipeline::SearchResultFetcher,
        AnswerComposition::Pipeline::Claude::StructuredAnswerComposer,
      ])
    end

    # True when the status as a whole starts with "error". A plain prefix
    # check is used rather than a `^`-anchored regex, which would also match
    # "error" at the start of any later line; to_s keeps a nil status from
    # raising and treats it as not errored.
    def errored?(answer)
      answer.status.to_s.start_with?("error")
    end

    # Message carried by TaskFailedError for an errored answer.
    def failure_message(answer)
      "Answer has an error status: #{answer.status} " \
        "and error message: #{answer.error_message}"
    end
  end
end
39 changes: 23 additions & 16 deletions lib/tasks/evaluation.rake
Original file line number Diff line number Diff line change
Expand Up @@ -177,24 +177,31 @@ namespace :evaluation do
task generate_answer_relevancy_evaluation: :environment do
raise "Requires an INPUT env var" if ENV["INPUT"].blank?

question = Question.new(message: ENV["INPUT"], conversation: Conversation.new)

answer = AnswerComposition::PipelineRunner.call(question:, pipeline: [
AnswerComposition::Pipeline::Claude::QuestionRouter,
AnswerComposition::Pipeline::SearchResultFetcher,
AnswerComposition::Pipeline::Claude::StructuredAnswerComposer,
])

if answer.status =~ /^error/
warn "Warning: answer has an error status: #{answer.status}"
abort(answer.error_message)
begin
result = AutoEvaluation::EvaluateAnswerFromQuestionMessage.call(
evaluation_class: AutoEvaluation::AnswerRelevancy,
question_message: ENV["INPUT"],
)

puts result.to_json
rescue AutoEvaluation::EvaluateAnswerFromQuestionMessage::TaskFailedError => e
abort e.message
end
end

result = AutoEvaluation::AnswerRelevancy.call(
question_message: answer.rephrased_question || question.message,
answer_message: answer.message,
)
desc "Run answer coherence evaluation for a user input"
task generate_coherence_evaluation: :environment do
raise "Requires an INPUT env var" if ENV["INPUT"].blank?

puts(result.to_json)
begin
result = AutoEvaluation::EvaluateAnswerFromQuestionMessage.call(
evaluation_class: AutoEvaluation::Coherence,
question_message: ENV["INPUT"],
)

puts result.to_json
rescue AutoEvaluation::EvaluateAnswerFromQuestionMessage::TaskFailedError => e
abort e.message
end
end
end
13 changes: 13 additions & 0 deletions spec/factories/auto_evaluation_score_result.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
FactoryBot.define do
factory :auto_evaluation_score_result, class: "AutoEvaluation::ScoreResult" do
Comment thread
davidgisbey marked this conversation as resolved.
skip_create

score { 0.85.to_d }
reason { "Most statements are relevant." }
success { true }
llm_responses { {} }
metrics { {} }

initialize_with { new(**attributes) }
end
end
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
let(:statements) { ["Statement one.", "Statement two."] }
let(:verdicts) do
[
{ "verdict" => "Yes" },
{ "verdict" => "No", "reason" => "The statement is irrelevant." },
{ "verdict" => "yes" },
{ "verdict" => "no", "reason" => "The statement is irrelevant." },
]
end
let(:verdicts_json) do
Expand Down
16 changes: 8 additions & 8 deletions spec/lib/auto_evaluation/answer_relevancy_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,8 @@

let(:verdicts) do
[
{ "verdict" => "Yes" },
{ "verdict" => "No", "reason" => "The statement is irrelevant." },
{ "verdict" => "yes" },
{ "verdict" => "no", "reason" => "The statement is irrelevant." },
]
end
let(:verdicts_json) { { verdicts: }.to_json }
Expand Down Expand Up @@ -90,7 +90,7 @@
reason: shared_expected_metrics_attributes,
}
expect(result)
.to be_a(described_class::Result)
.to be_a(AutoEvaluation::ScoreResult)
.and have_attributes(
score: 0.5,
reason:,
Expand All @@ -104,7 +104,7 @@
let(:verdicts) do
[
{ "verdict" => "idk", "reason" => "Cannot determine relevance." },
{ "verdict" => "No", "reason" => "The statement is irrelevant." },
{ "verdict" => "no", "reason" => "The statement is irrelevant." },
]
end

Expand All @@ -130,7 +130,7 @@
)

expect(result)
.to be_a(described_class::Result)
.to be_a(AutoEvaluation::ScoreResult)
.and have_attributes(
score: 1.0,
reason: "No statements were extracted from the answer.",
Expand All @@ -154,7 +154,7 @@
)

expect(result)
.to be_a(described_class::Result)
.to be_a(AutoEvaluation::ScoreResult)
.and have_attributes(
score: 1.0,
reason: "No verdicts were generated for the extracted statements.",
Expand All @@ -172,7 +172,7 @@
end

context "when verdicts are generated and none have a 'no' verdict" do
let(:verdicts_json) { { verdicts: [{ "verdict" => "Yes" }, { "verdict" => "Yes" }] }.to_json }
let(:verdicts_json) { { verdicts: [{ "verdict" => "yes" }, { "verdict" => "yes" }] }.to_json }

it "returns a result object with the expected attributes" do
allow(Clock).to receive(:monotonic_time).and_return(200.0, 202.0, 204.0, 206.0)
Expand All @@ -183,7 +183,7 @@
)

expect(result)
.to be_a(described_class::Result)
.to be_a(AutoEvaluation::ScoreResult)
.and have_attributes(
score: 1.0,
reason: "The response fully addressed the input with no irrelevant statements.",
Expand Down
35 changes: 25 additions & 10 deletions spec/lib/auto_evaluation/bedrock_openai_oss_invoke_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,18 @@
let(:tools) do
[
{
type: "function",
function: {
name: "test_schema",
description: "A test JSON schema",
schema: {
type: "object",
properties: {
response: { type: "string" },
"type" => "function",
"function" => {
"name" => "test_schema",
"description" => "A test JSON schema",
"parameters" => {
"type" => "object",
"properties" => {
"response" => { "type" => "string" },
},
required: %w[response],
"required" => %w[response],
},
strict: true,
"strict" => true,
},
},
]
Expand Down Expand Up @@ -55,5 +55,20 @@
},
)
end

it "raises an error if the response does not conform to the schema" do
bedrock_invoke_model_openai_oss_tool_call(
user_message,
tools,
{ "invalid_key" => "This does not conform to the schema." }.to_json,
)

expect {
described_class.call(user_message, tools)
}.to raise_error(
described_class::InvalidToolCallSchemaError,
/The property '#\/' did not contain a required property of 'response'/,
)
end
end
end
Loading