From a444c34cde6621d8dfd45ef93e92b5c816c0d362 Mon Sep 17 00:00:00 2001 From: Chae Cramb Date: Mon, 29 Dec 2025 16:46:11 +0000 Subject: [PATCH 1/2] Add Faithfulness metric This adds the Faithfulness metric to the auto-evaluation module. It follows the established Ruby patterns from the AnswerRelevancy metric, using BedrockOpenAIOssInvoke to make tool calls to the LLM. The metric evaluates whether the AI's answer is faithful to the retrieval context through a multi-step process: 1. Extract truths from the retrieval context 2. Extract claims from the answer 3. Generate verdicts comparing claims against truths 4. Calculate score and generate a reason The score is calculated as the proportion of claims that don't contradict the retrieval context. Verdicts of "yes" and "idk" are treated as faithful (non-contradictory), while only "no" verdicts count against the score. This follows the DeepEval implementation. The metric returns early with a perfect score (1.0) when: - No claims are extracted from the answer - No truths are extracted from the retrieval context - No verdicts are generated - No verdict is "no", i.e. every verdict is "yes" or "idk" (no contradictions found) --- lib/auto_evaluation/faithfulness.rb | 101 +++++++++++ .../faithfulness/claims_generator.rb | 35 ++++ .../faithfulness/reason_generator.rb | 42 +++++ .../faithfulness/truths_generator.rb | 35 ++++ .../faithfulness/verdicts_generator.rb | 37 ++++ .../faithfulness/claims_generator_spec.rb | 44 +++++ .../faithfulness/reason_generator_spec.rb | 52 ++++++ .../faithfulness/truths_generator_spec.rb | 44 +++++ .../faithfulness/verdicts_generator_spec.rb | 51 ++++++ spec/lib/auto_evaluation/faithfulness_spec.rb | 166 ++++++++++++++++++ spec/support/stub_bedrock.rb | 74 ++++++++ 11 files changed, 681 insertions(+) create mode 100644 lib/auto_evaluation/faithfulness.rb create mode 100644 lib/auto_evaluation/faithfulness/claims_generator.rb create mode 100644 lib/auto_evaluation/faithfulness/reason_generator.rb create mode 100644 
lib/auto_evaluation/faithfulness/truths_generator.rb create mode 100644 lib/auto_evaluation/faithfulness/verdicts_generator.rb create mode 100644 spec/lib/auto_evaluation/faithfulness/claims_generator_spec.rb create mode 100644 spec/lib/auto_evaluation/faithfulness/reason_generator_spec.rb create mode 100644 spec/lib/auto_evaluation/faithfulness/truths_generator_spec.rb create mode 100644 spec/lib/auto_evaluation/faithfulness/verdicts_generator_spec.rb create mode 100644 spec/lib/auto_evaluation/faithfulness_spec.rb diff --git a/lib/auto_evaluation/faithfulness.rb b/lib/auto_evaluation/faithfulness.rb new file mode 100644 index 000000000..344d96fa1 --- /dev/null +++ b/lib/auto_evaluation/faithfulness.rb @@ -0,0 +1,101 @@ +class AutoEvaluation::Faithfulness + THRESHOLD = 0.5 + + def self.call(...) = new(...).call + + def initialize(answer) + @answer = answer + @llm_responses = {} + @metrics = {} + end + + def call + truths, llm_responses[:truths], metrics[:truths] = TruthsGenerator.call(retrieval_context:) + + if truths.empty? + return build_maximum_score_result( + reason: "No truths were extracted from the retrieval context.", + llm_responses:, + metrics:, + ) + end + + claims, llm_responses[:claims], metrics[:claims] = ClaimsGenerator.call(answer_message:) + + if claims.empty? + return build_maximum_score_result( + reason: "No claims were extracted from the answer.", + llm_responses:, + metrics:, + ) + end + + verdicts, llm_responses[:verdicts], metrics[:verdicts] = VerdictsGenerator.call( + claims:, truths:, + ) + + if verdicts.empty? + return build_maximum_score_result( + reason: "No verdicts were generated for the extracted claims.", + llm_responses:, + metrics:, + ) + end + + if verdicts.none? 
{ |verdict| verdict["verdict"].strip.downcase == "no" } + return build_maximum_score_result( + reason: "The response is fully supported by the retrieval context.", + llm_responses:, + metrics:, + ) + end + + score = calculate_score(verdicts) + + reason, llm_responses[:reason], metrics[:reason] = ReasonGenerator.call( + score: score.round(2), verdicts:, + ) + + AutoEvaluation::ScoreResult.new( + score:, + reason:, + success: score >= THRESHOLD, + llm_responses:, + metrics:, + ) + end + +private + + attr_reader :answer + attr_accessor :llm_responses, :metrics + + def answer_message + answer.message + end + + def retrieval_context + used_sources.map(&:plain_content).join("\n\n") + end + + def calculate_score(verdicts) + return 1.0 if verdicts.empty? + + faithful_count = verdicts.count { |verdict| verdict["verdict"].strip.downcase != "no" } + faithful_count.to_d / verdicts.count + end + + def used_sources + answer.sources.used + end + + def build_maximum_score_result(reason:, llm_responses:, metrics:) + AutoEvaluation::ScoreResult.new( + score: 1.0, + reason:, + success: true, + llm_responses:, + metrics:, + ) + end +end diff --git a/lib/auto_evaluation/faithfulness/claims_generator.rb b/lib/auto_evaluation/faithfulness/claims_generator.rb new file mode 100644 index 000000000..d75610cea --- /dev/null +++ b/lib/auto_evaluation/faithfulness/claims_generator.rb @@ -0,0 +1,35 @@ +module AutoEvaluation + class Faithfulness::ClaimsGenerator + def self.call(...) 
= new(...).call + + def initialize(answer_message:) + @answer_message = answer_message + end + + def call + result = BedrockOpenAIOssInvoke.call(user_prompt, tools) + [result.evaluation_data.fetch("claims"), result.llm_response, result.metrics] + end + + private + + attr_reader :answer_message + + def llm_prompts + Prompts.config + .faithfulness + .fetch(:claims) + end + + def user_prompt + sprintf( + llm_prompts.fetch(:user_prompt), + answer: answer_message, + ) + end + + def tools + [llm_prompts.fetch(:tool_spec)] + end + end +end diff --git a/lib/auto_evaluation/faithfulness/reason_generator.rb b/lib/auto_evaluation/faithfulness/reason_generator.rb new file mode 100644 index 000000000..abc9cad70 --- /dev/null +++ b/lib/auto_evaluation/faithfulness/reason_generator.rb @@ -0,0 +1,42 @@ +module AutoEvaluation + class Faithfulness::ReasonGenerator + def self.call(...) = new(...).call + + def initialize(score:, verdicts:) + @score = score + @verdicts = verdicts + end + + def call + result = BedrockOpenAIOssInvoke.call(user_prompt, tools) + [result.evaluation_data.fetch("reason"), result.llm_response, result.metrics] + end + + private + + attr_reader :score, :verdicts + + def llm_prompts + Prompts.config + .faithfulness + .fetch(:reason) + end + + def user_prompt + sprintf( + llm_prompts.fetch(:user_prompt), + score:, + contradictions:, + ) + end + + def tools + [llm_prompts.fetch(:tool_spec)] + end + + def contradictions + verdicts.select { |verdict| verdict["verdict"].strip.downcase == "no" } + .map { |verdict| verdict["reason"] } + end + end +end diff --git a/lib/auto_evaluation/faithfulness/truths_generator.rb b/lib/auto_evaluation/faithfulness/truths_generator.rb new file mode 100644 index 000000000..b74adb05a --- /dev/null +++ b/lib/auto_evaluation/faithfulness/truths_generator.rb @@ -0,0 +1,35 @@ +module AutoEvaluation + class Faithfulness::TruthsGenerator + def self.call(...) 
= new(...).call + + def initialize(retrieval_context:) + @retrieval_context = retrieval_context + end + + def call + result = BedrockOpenAIOssInvoke.call(user_prompt, tools) + [result.evaluation_data.fetch("truths"), result.llm_response, result.metrics] + end + + private + + attr_reader :retrieval_context + + def llm_prompts + Prompts.config + .faithfulness + .fetch(:truths) + end + + def user_prompt + sprintf( + llm_prompts.fetch(:user_prompt), + retrieval_context:, + ) + end + + def tools + [llm_prompts.fetch(:tool_spec)] + end + end +end diff --git a/lib/auto_evaluation/faithfulness/verdicts_generator.rb b/lib/auto_evaluation/faithfulness/verdicts_generator.rb new file mode 100644 index 000000000..e01f486dd --- /dev/null +++ b/lib/auto_evaluation/faithfulness/verdicts_generator.rb @@ -0,0 +1,37 @@ +module AutoEvaluation + class Faithfulness::VerdictsGenerator + def self.call(...) = new(...).call + + def initialize(claims:, truths:) + @claims = claims + @truths = truths + end + + def call + result = BedrockOpenAIOssInvoke.call(user_prompt, tools) + [result.evaluation_data.fetch("verdicts"), result.llm_response, result.metrics] + end + + private + + attr_reader :claims, :truths + + def llm_prompts + Prompts.config + .faithfulness + .fetch(:verdicts) + end + + def user_prompt + sprintf( + llm_prompts.fetch(:user_prompt), + claims:, + retrieval_context: truths.join("\n\n"), + ) + end + + def tools + [llm_prompts.fetch(:tool_spec)] + end + end +end diff --git a/spec/lib/auto_evaluation/faithfulness/claims_generator_spec.rb b/spec/lib/auto_evaluation/faithfulness/claims_generator_spec.rb new file mode 100644 index 000000000..f073a4d94 --- /dev/null +++ b/spec/lib/auto_evaluation/faithfulness/claims_generator_spec.rb @@ -0,0 +1,44 @@ +RSpec.describe AutoEvaluation::Faithfulness::ClaimsGenerator, :aws_credentials_stubbed do + describe ".call" do + let(:answer_message) { "Einstein won the Nobel Prize in 1968 for the photoelectric effect." 
} + let(:claims) { ["Einstein won the Nobel Prize in 1968.", "Einstein won the Nobel Prize for the photoelectric effect."] } + let(:claims_json) do + { claims: }.to_json + end + let(:prompts) { AutoEvaluation::Prompts.config.faithfulness.fetch(:claims) } + let(:user_prompt) do + sprintf( + prompts.fetch(:user_prompt), + answer: answer_message, + ) + end + let(:tools) { [prompts.fetch(:tool_spec)] } + let!(:stub_bedrock) do + stub_bedrock_invoke_model_openai_oss_tool_call( + user_prompt, + tools, + claims_json, + ) + end + + it "returns an array with the claims, llm_response, and metrics" do + allow(Clock).to receive(:monotonic_time).and_return(200.0, 202.0) + + result = described_class.call(answer_message:) + + expected_llm_response = JSON.parse(stub_bedrock.response.body) + expected_metrics = { + duration: 2.0, + model: AutoEvaluation::BedrockOpenAIOssInvoke::MODEL, + llm_prompt_tokens: 25, + llm_completion_tokens: 35, + llm_cached_tokens: nil, + } + expect(result).to contain_exactly( + claims, + expected_llm_response, + expected_metrics, + ) + end + end +end diff --git a/spec/lib/auto_evaluation/faithfulness/reason_generator_spec.rb b/spec/lib/auto_evaluation/faithfulness/reason_generator_spec.rb new file mode 100644 index 000000000..be5e6913c --- /dev/null +++ b/spec/lib/auto_evaluation/faithfulness/reason_generator_spec.rb @@ -0,0 +1,52 @@ +RSpec.describe AutoEvaluation::Faithfulness::ReasonGenerator, :aws_credentials_stubbed do + describe ".call" do + let(:score) { 0.5 } + let(:verdicts) do + [ + { "verdict" => "no", "reason" => "The retrieval context states Einstein won in 1921, not 1968." }, + { "verdict" => "yes" }, + ] + end + let(:contradictions) { ["The retrieval context states Einstein won in 1921, not 1968."] } + let(:reason) { "The score is 0.5 because the answer incorrectly stated the year Einstein won the Nobel Prize." 
} + let(:reason_json) do + { reason: }.to_json + end + let(:prompts) { AutoEvaluation::Prompts.config.faithfulness.fetch(:reason) } + let(:user_prompt) do + sprintf( + prompts.fetch(:user_prompt), + score:, + contradictions:, + ) + end + let(:tools) { [prompts.fetch(:tool_spec)] } + let!(:stub_bedrock) do + stub_bedrock_invoke_model_openai_oss_tool_call( + user_prompt, + tools, + reason_json, + ) + end + + it "returns an array with the reason, llm_response, and metrics" do + allow(Clock).to receive(:monotonic_time).and_return(200.0, 202.0) + + result = described_class.call(score:, verdicts:) + + expected_llm_response = JSON.parse(stub_bedrock.response.body) + expected_metrics = { + duration: 2.0, + model: AutoEvaluation::BedrockOpenAIOssInvoke::MODEL, + llm_prompt_tokens: 25, + llm_completion_tokens: 35, + llm_cached_tokens: nil, + } + expect(result).to contain_exactly( + reason, + expected_llm_response, + expected_metrics, + ) + end + end +end diff --git a/spec/lib/auto_evaluation/faithfulness/truths_generator_spec.rb b/spec/lib/auto_evaluation/faithfulness/truths_generator_spec.rb new file mode 100644 index 000000000..c698b8a12 --- /dev/null +++ b/spec/lib/auto_evaluation/faithfulness/truths_generator_spec.rb @@ -0,0 +1,44 @@ +RSpec.describe AutoEvaluation::Faithfulness::TruthsGenerator, :aws_credentials_stubbed do + describe ".call" do + let(:retrieval_context) { "Einstein won the Nobel Prize in 1921 for the photoelectric effect." 
} + let(:truths) { ["Einstein won the Nobel Prize in 1921.", "Einstein won the Nobel Prize for the photoelectric effect."] } + let(:truths_json) do + { truths: }.to_json + end + let(:prompts) { AutoEvaluation::Prompts.config.faithfulness.fetch(:truths) } + let(:user_prompt) do + sprintf( + prompts.fetch(:user_prompt), + retrieval_context:, + ) + end + let(:tools) { [prompts.fetch(:tool_spec)] } + let!(:stub_bedrock) do + stub_bedrock_invoke_model_openai_oss_tool_call( + user_prompt, + tools, + truths_json, + ) + end + + it "returns an array with the truths, llm_response, and metrics" do + allow(Clock).to receive(:monotonic_time).and_return(200.0, 202.0) + + result = described_class.call(retrieval_context:) + + expected_llm_response = JSON.parse(stub_bedrock.response.body) + expected_metrics = { + duration: 2.0, + model: AutoEvaluation::BedrockOpenAIOssInvoke::MODEL, + llm_prompt_tokens: 25, + llm_completion_tokens: 35, + llm_cached_tokens: nil, + } + expect(result).to contain_exactly( + truths, + expected_llm_response, + expected_metrics, + ) + end + end +end diff --git a/spec/lib/auto_evaluation/faithfulness/verdicts_generator_spec.rb b/spec/lib/auto_evaluation/faithfulness/verdicts_generator_spec.rb new file mode 100644 index 000000000..f401795f5 --- /dev/null +++ b/spec/lib/auto_evaluation/faithfulness/verdicts_generator_spec.rb @@ -0,0 +1,51 @@ +RSpec.describe AutoEvaluation::Faithfulness::VerdictsGenerator, :aws_credentials_stubbed do + describe ".call" do + let(:claims) { ["Einstein won the Nobel Prize in 1968.", "Einstein won the Nobel Prize for the photoelectric effect."] } + let(:truths) { ["Einstein won the Nobel Prize in 1921 for the photoelectric effect."] } + let(:verdicts) do + [ + { "verdict" => "no", "reason" => "The retrieval context states Einstein won in 1921, not 1968." 
}, + { "verdict" => "yes" }, + ] + end + let(:verdicts_json) do + { verdicts: }.to_json + end + let(:prompts) { AutoEvaluation::Prompts.config.faithfulness.fetch(:verdicts) } + let(:user_prompt) do + sprintf( + prompts.fetch(:user_prompt), + claims:, + retrieval_context: truths.join("\n\n"), + ) + end + let(:tools) { [prompts.fetch(:tool_spec)] } + let!(:stub_bedrock) do + stub_bedrock_invoke_model_openai_oss_tool_call( + user_prompt, + tools, + verdicts_json, + ) + end + + it "returns an array with the verdicts, llm_response, and metrics" do + allow(Clock).to receive(:monotonic_time).and_return(200.0, 202.0) + + result = described_class.call(claims:, truths:) + + expected_llm_response = JSON.parse(stub_bedrock.response.body) + expected_metrics = { + duration: 2.0, + model: AutoEvaluation::BedrockOpenAIOssInvoke::MODEL, + llm_prompt_tokens: 25, + llm_completion_tokens: 35, + llm_cached_tokens: nil, + } + expect(result).to contain_exactly( + verdicts, + expected_llm_response, + expected_metrics, + ) + end + end +end diff --git a/spec/lib/auto_evaluation/faithfulness_spec.rb b/spec/lib/auto_evaluation/faithfulness_spec.rb new file mode 100644 index 000000000..27c6cab91 --- /dev/null +++ b/spec/lib/auto_evaluation/faithfulness_spec.rb @@ -0,0 +1,166 @@ +RSpec.describe AutoEvaluation::Faithfulness, :aws_credentials_stubbed do + describe ".call" do + let(:answer_message) { "Einstein won the Nobel Prize in 1968 for the photoelectric effect." } + let(:retrieval_context) { "Einstein won the Nobel Prize in 1921 for the photoelectric effect." 
} + let(:question) { build(:question, message: "When did Einstein win the Nobel Prize?") } + let(:chunk) { create(:answer_source_chunk, plain_content: retrieval_context) } + let(:answer) { create(:answer, question:, message: answer_message, sources: [create(:answer_source, chunk:, used: true)]) } + + let(:truths) { ["Einstein won the Nobel Prize in 1921.", "Einstein won the Nobel Prize for the photoelectric effect."] } + let(:truths_json) { { truths: }.to_json } + let(:claims) { ["Einstein won the Nobel Prize in 1968.", "Einstein won the Nobel Prize for the photoelectric effect."] } + let(:claims_json) { { claims: }.to_json } + let(:verdicts) do + [ + { "verdict" => "no", "reason" => "The retrieval context states Einstein won in 1921, not 1968." }, + { "verdict" => "yes" }, + ] + end + let(:verdicts_json) { { verdicts: }.to_json } + let(:reason) { "The score is 0.5 because the answer incorrectly stated the year Einstein won the Nobel Prize." } + let(:reason_json) { { reason: }.to_json } + + let!(:faithfulness_stubs) do + stub_bedrock_invoke_model_openai_oss_faithfulness( + retrieval_context:, + answer_message:, + truths_json:, + claims_json:, + verdicts_json:, + reason_json:, + ) + end + let(:truths_stub) { faithfulness_stubs[:truths] } + let(:claims_stub) { faithfulness_stubs[:claims] } + let(:verdicts_stub) { faithfulness_stubs[:verdicts] } + let(:reason_stub) { faithfulness_stubs[:reason] } + + it "returns a results object with the expected attributes" do + allow(Clock).to receive(:monotonic_time) + .and_return(200.0, 202.0, 204.0, 206.0, 208.0, 210.0, 212.0, 214.0) + + result = described_class.call(answer) + + expected_llm_responses = { + truths: JSON.parse(truths_stub.response.body), + claims: JSON.parse(claims_stub.response.body), + verdicts: JSON.parse(verdicts_stub.response.body), + reason: JSON.parse(reason_stub.response.body), + } + shared_expected_metrics_attributes = { + duration: 2.0, + model: AutoEvaluation::BedrockOpenAIOssInvoke::MODEL, + 
llm_prompt_tokens: 25, + llm_completion_tokens: 35, + llm_cached_tokens: nil, + } + expected_metrics = { + truths: shared_expected_metrics_attributes, + claims: shared_expected_metrics_attributes, + verdicts: shared_expected_metrics_attributes, + reason: shared_expected_metrics_attributes, + } + expect(result) + .to be_a(AutoEvaluation::ScoreResult) + .and have_attributes( + score: 0.5, + reason:, + success: true, + llm_responses: expected_llm_responses, + metrics: expected_metrics, + ) + end + + context "when 'idk' verdicts are present alongside 'no' verdicts" do + let(:verdicts) do + [ + { "verdict" => "idk", "reason" => "Cannot determine if correct." }, + { "verdict" => "no", "reason" => "The retrieval context states Einstein won in 1921, not 1968." }, + ] + end + + it "treats 'idk' verdicts as faithful (not contradictions)" do + result = described_class.call(answer) + + expect(result.score).to eq(0.5) + end + end + + context "when no truths are extracted from the retrieval context" do + let(:truths_json) { { truths: [] }.to_json } + + it "returns early with score 1.0 and skips claims, verdicts and reason LLM calls" do + allow(Clock).to receive(:monotonic_time).and_return(200.0, 202.0) + + result = described_class.call(answer) + + expect(result) + .to be_a(AutoEvaluation::ScoreResult) + .and have_attributes( + score: 1.0, + reason: "No truths were extracted from the retrieval context.", + success: true, + ) + expect(result.llm_responses.keys).to contain_exactly(:truths) + expect(result.metrics.keys).to contain_exactly(:truths) + end + end + + context "when no claims are extracted from the answer" do + let(:claims_json) { { claims: [] }.to_json } + + it "returns early with score 1.0 and skips verdicts and reason LLM calls" do + allow(Clock).to receive(:monotonic_time).and_return(200.0, 202.0, 204.0, 206.0) + + result = described_class.call(answer) + + expect(result) + .to be_a(AutoEvaluation::ScoreResult) + .and have_attributes( + score: 1.0, + reason: "No claims 
were extracted from the answer.", + success: true, + ) + expect(result.llm_responses.keys).to contain_exactly(:truths, :claims) + expect(result.metrics.keys).to contain_exactly(:truths, :claims) + end + end + + context "when all verdicts are faithful (no 'no' verdicts)" do + let(:verdicts_json) { { verdicts: [{ "verdict" => "yes" }, { "verdict" => "idk" }] }.to_json } + + it "returns early with score 1.0 and skips reason LLM call" do + allow(Clock).to receive(:monotonic_time).and_return(200.0, 202.0, 204.0, 206.0, 208.0, 210.0) + + result = described_class.call(answer) + + expect(result) + .to be_a(AutoEvaluation::ScoreResult) + .and have_attributes( + score: 1.0, + reason: "The response is fully supported by the retrieval context.", + success: true, + ) + expect(result.llm_responses.keys).to contain_exactly(:truths, :claims, :verdicts) + expect(result.metrics.keys).to contain_exactly(:truths, :claims, :verdicts) + end + end + + context "when score is below threshold" do + let(:verdicts) do + [ + { "verdict" => "no", "reason" => "Contradiction 1" }, + { "verdict" => "no", "reason" => "Contradiction 2" }, + { "verdict" => "yes" }, + ] + end + + it "returns success: false" do + result = described_class.call(answer) + + expect(result.success).to be false + expect(result.score).to be_within(0.01).of(0.33) + end + end + end +end diff --git a/spec/support/stub_bedrock.rb b/spec/support/stub_bedrock.rb index e6432b05a..cd1123640 100644 --- a/spec/support/stub_bedrock.rb +++ b/spec/support/stub_bedrock.rb @@ -134,4 +134,78 @@ def stub_bedrock_invoke_model_openai_oss_answer_relevancy(question_message:, stubs end + + def stub_bedrock_invoke_model_openai_oss_faithfulness(retrieval_context:, + answer_message:, + truths_json: { truths: ["Truth."] }.to_json, + claims_json: { claims: ["Claim."] }.to_json, + verdicts_json: { verdicts: [{ "verdict" => "yes" }] }.to_json, + reason_json: { reason: "This is the reason for the score." 
}.to_json) + prompts = AutoEvaluation::Prompts.config.faithfulness + + truths = JSON.parse(truths_json).fetch("truths") + claims = JSON.parse(claims_json).fetch("claims") + verdicts = JSON.parse(verdicts_json).fetch("verdicts") + + score = if verdicts.empty? + 1.0 + else + faithful_count = verdicts.count { |v| v["verdict"].strip.downcase != "no" } + (faithful_count.to_d / verdicts.count).round(2).to_f + end + + contradictions = verdicts.select { |v| v["verdict"].strip.downcase == "no" } + .map { |v| v["reason"] } + + truths_user_prompt = sprintf( + prompts.fetch(:truths).fetch(:user_prompt), + retrieval_context:, + ) + claims_user_prompt = sprintf( + prompts.fetch(:claims).fetch(:user_prompt), + answer: answer_message, + ) + verdicts_user_prompt = sprintf( + prompts.fetch(:verdicts).fetch(:user_prompt), + claims:, + retrieval_context: truths.join("\n\n"), + ) + reason_user_prompt = sprintf( + prompts.fetch(:reason).fetch(:user_prompt), + score:, + contradictions:, + ) + + truths_tools = [prompts.fetch(:truths).fetch(:tool_spec)] + claims_tools = [prompts.fetch(:claims).fetch(:tool_spec)] + verdicts_tools = [prompts.fetch(:verdicts).fetch(:tool_spec)] + reason_tools = [prompts.fetch(:reason).fetch(:tool_spec)] + + stubs = {} + stubs[:truths] = stub_bedrock_invoke_model_openai_oss_tool_call( + truths_user_prompt, + truths_tools, + truths_json, + ) + + stubs[:claims] = stub_bedrock_invoke_model_openai_oss_tool_call( + claims_user_prompt, + claims_tools, + claims_json, + ) + + stubs[:verdicts] = stub_bedrock_invoke_model_openai_oss_tool_call( + verdicts_user_prompt, + verdicts_tools, + verdicts_json, + ) + + stubs[:reason] = stub_bedrock_invoke_model_openai_oss_tool_call( + reason_user_prompt, + reason_tools, + reason_json, + ) + + stubs + end end From e75c28f06788bf66a52e052b83562cdcaa5d1dc1 Mon Sep 17 00:00:00 2001 From: Chae Cramb Date: Mon, 29 Dec 2025 16:48:11 +0000 Subject: [PATCH 2/2] Add faithfulness metric auto-evaluation task This adds a new Rake task to 
generate a faithfulness evaluation for a given question. Like the answer relevancy and coherence tasks, it: 1. generates an answer for the input question using the existing answer composition pipeline 2. evaluates the faithfulness of the generated answer against the retrieval context using AutoEvaluation::Faithfulness 3. outputs the result JSON to stdout 4. handles error answers appropriately The key difference from the other metrics is that faithfulness evaluates the answer against the retrieval context (the sources used to generate the answer) rather than the original question. The retrieval context is extracted from the answer's used sources joined with double newlines, matching the DeepEval approach. --- lib/tasks/evaluation.rake | 16 ++++++++++++++++ spec/lib/tasks/evaluation_spec.rb | 7 +++++++ 2 files changed, 23 insertions(+) diff --git a/lib/tasks/evaluation.rake b/lib/tasks/evaluation.rake index 4f165fb25..21886d04a 100644 --- a/lib/tasks/evaluation.rake +++ b/lib/tasks/evaluation.rake @@ -204,4 +204,20 @@ namespace :evaluation do abort e.message end end + + desc "Run faithfulness evaluation for a user input" + task generate_faithfulness_evaluation: :environment do + raise "Requires an INPUT env var" if ENV["INPUT"].blank? 
+ + begin + result = AutoEvaluation::EvaluateAnswerFromQuestionMessage.call( + evaluation_class: AutoEvaluation::Faithfulness, + question_message: ENV["INPUT"], + ) + + puts result.to_json + rescue AutoEvaluation::EvaluateAnswerFromQuestionMessage::TaskFailedError => e + abort e.message + end + end end diff --git a/spec/lib/tasks/evaluation_spec.rb b/spec/lib/tasks/evaluation_spec.rb index da009aede..b6299898d 100644 --- a/spec/lib/tasks/evaluation_spec.rb +++ b/spec/lib/tasks/evaluation_spec.rb @@ -588,4 +588,11 @@ let(:evaluation_class) { AutoEvaluation::Coherence } end end + + describe "generate_faithfulness_evaluation" do + it_behaves_like "a task that returns a ScoreResult" do + let(:task_name) { "evaluation:generate_faithfulness_evaluation" } + let(:evaluation_class) { AutoEvaluation::Faithfulness } + end + end end