diff --git a/lib/auto_evaluation/faithfulness.rb b/lib/auto_evaluation/faithfulness.rb index c02272b90..95a7ee1ba 100644 --- a/lib/auto_evaluation/faithfulness.rb +++ b/lib/auto_evaluation/faithfulness.rb @@ -49,7 +49,7 @@ def answer_message end def retrieval_context - used_sources.map(&:plain_content).join("\n\n") + answer.sources.map { |source| format_chunk_for_evaluation(source.chunk) }.join("\n") end def calculate_score(verdicts) @@ -57,10 +57,6 @@ def calculate_score(verdicts) faithful_count.to_d / verdicts.count end - def used_sources - answer.sources.select(&:used) - end - def build_error_result(error_message) AutoEvaluation::Result.new( status: "error", @@ -79,4 +75,13 @@ def build_result_with_score(score, reason) metrics:, ) end + + def format_chunk_for_evaluation(chunk) + <<~STRING + #{chunk.title} + #{chunk.heading_hierarchy.join(' > ')} + #{chunk.description} + #{chunk.html_content} + STRING + end end diff --git a/spec/lib/auto_evaluation/faithfulness_spec.rb b/spec/lib/auto_evaluation/faithfulness_spec.rb index f172dac7e..2d56095e5 100644 --- a/spec/lib/auto_evaluation/faithfulness_spec.rb +++ b/spec/lib/auto_evaluation/faithfulness_spec.rb @@ -1,12 +1,26 @@ RSpec.describe AutoEvaluation::Faithfulness, :aws_credentials_stubbed do describe ".call" do let(:answer_message) { "Einstein won the Nobel Prize in 1968 for the photoelectric effect." } - let(:retrieval_context) { "Einstein won the Nobel Prize in 1921 for the photoelectric effect." } + let(:used_chunk_context) { "Einstein won the Nobel Prize in 1921 for the photoelectric effect." 
} let(:question) { build(:question, message: "When did Einstein win the Nobel Prize?") } - let(:chunk) { build(:answer_source_chunk, plain_content: retrieval_context) } - let(:used_source) { build(:answer_source, used: true, chunk:) } - let(:answer) { build(:answer, question:, message: answer_message, sources: [used_source]) } - + let(:used_chunk) { build(:answer_source_chunk, plain_content: used_chunk_context) } + let(:unused_chunk) { build(:answer_source_chunk, plain_content: "Some other context.") } + let(:used_source) { build(:answer_source, used: true, chunk: used_chunk) } + let(:unused_source) { build(:answer_source, used: false, chunk: unused_chunk) } + let(:answer) { build(:answer, question:, message: answer_message, sources: [used_source, unused_source]) } + let(:retrieval_context) do + <<~STRING + #{used_chunk.title} + #{used_chunk.heading_hierarchy.join(' > ')} + #{used_chunk.description} + #{used_chunk.html_content} + + #{unused_chunk.title} + #{unused_chunk.heading_hierarchy.join(' > ')} + #{unused_chunk.description} + #{unused_chunk.html_content} + STRING + end let(:truths) { ["Einstein won the Nobel Prize in 1921.", "Einstein won the Nobel Prize for the photoelectric effect."] } let(:claims) { ["Einstein won the Nobel Prize in 1968.", "Einstein won the Nobel Prize for the photoelectric effect."] } let(:verdicts) do diff --git a/spec/support/system_spec_helpers.rb b/spec/support/system_spec_helpers.rb index 6aa271317..fa762bae6 100644 --- a/spec/support/system_spec_helpers.rb +++ b/spec/support/system_spec_helpers.rb @@ -60,8 +60,16 @@ def stubs_for_mock_answer(question, question_message: question, answer_message: answer, ) + + retrieval_context = <<~STRING + Title + Heading 1 > Heading 2 + Description +

Some content

+ STRING + stub_bedrock_invoke_model_openai_oss_faithfulness( - retrieval_context: "Some content", + retrieval_context: retrieval_context, answer_message: answer, ) stub_bedrock_invoke_model_openai_oss_coherence(