-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathfaithfulness.rb
More file actions
87 lines (67 loc) · 2.26 KB
/
faithfulness.rb
File metadata and controls
87 lines (67 loc) · 2.26 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# Evaluates how faithful an answer is to the content it was generated from.
#
# The pipeline, in order:
#   1. extract "truths" from the answer's source chunks (the retrieval context),
#   2. extract "claims" from the answer message,
#   3. obtain verdicts for the claims against the truths,
#   4. score the answer as the share of verdicts that are not "no".
#
# Each generator's return value is destructured into three parts: the
# extracted items, an entry stored in +llm_responses+ and an entry stored in
# +metrics+ (presumably the raw LLM output and usage data - confirm against
# the generator classes). Both hashes are handed to every
# AutoEvaluation::Result built here, so partial data is kept on early exits.
class AutoEvaluation::Faithfulness
  # Scores at or above this value are reported as "success", lower as "failure".
  THRESHOLD = 0.5

  # Convenience entry point: forwards all arguments to +new+ and runs #call.
  def self.call(...)
    new(...).call
  end

  # @param answer the answer under evaluation; this class reads its
  #   +message+ and its +sources+ (each source exposing a +chunk+).
  def initialize(answer)
    @answer = answer
    @llm_responses = {}
    @metrics = {}
  end

  # Runs the evaluation pipeline.
  #
  # @return [AutoEvaluation::Result] an "error" result when any stage yields
  #   nothing or an InvalidLlmResponseError is raised; otherwise a
  #   "success"/"failure" result carrying the score and a reason.
  def call
    context_truths, llm_responses[:truths], metrics[:truths] =
      TruthsGenerator.call(retrieval_context:)
    if context_truths.empty?
      return build_error_result("No truths were extracted from the retrieval context.")
    end

    answer_claims, llm_responses[:claims], metrics[:claims] =
      ClaimsGenerator.call(answer_message:)
    if answer_claims.empty?
      return build_error_result("No claims were extracted from the answer.")
    end

    verdicts, llm_responses[:verdicts], metrics[:verdicts] =
      VerdictsGenerator.call(claims: answer_claims, truths: context_truths)
    if verdicts.empty?
      return build_error_result("No verdicts were generated for the extracted claims.")
    end

    # With no "no" verdict the score is a fixed 1.0 and the reason generator
    # is not called at all.
    if verdicts.none? { |entry| negative_verdict?(entry) }
      return build_result_with_score(1.0, "The response is fully supported by the retrieval context.")
    end

    score = calculate_score(verdicts)
    reason, llm_responses[:reason], metrics[:reason] =
      ReasonGenerator.call(score: score.round(2), verdicts:)

    build_result_with_score(score, reason)
  rescue AutoEvaluation::BedrockOpenAIOssInvoke::InvalidLlmResponseError => error
    # Only this error class is converted into a result; anything else propagates.
    build_error_result(error.message)
  end

  private

  attr_reader :answer
  attr_accessor :llm_responses, :metrics

  # Text of the answer that claims are extracted from.
  def answer_message = answer.message

  # All source chunks of the answer, each formatted as text and joined with
  # a newline between them.
  def retrieval_context
    formatted_chunks = answer.sources.map do |source|
      format_chunk_for_evaluation(source.chunk)
    end
    formatted_chunks.join("\n")
  end

  # True when the entry's "verdict" value, trimmed and lowercased, is "no".
  def negative_verdict?(verdict_entry)
    verdict_entry["verdict"].strip.downcase == "no"
  end

  # Share of verdicts that are not "no". The count is converted with +to_d+
  # before dividing, so the quotient is not truncated by integer division.
  def calculate_score(verdicts)
    supported_total = verdicts.count { |entry| !negative_verdict?(entry) }
    supported_total.to_d / verdicts.count
  end

  # Result for a run that could not produce a score; includes whatever LLM
  # responses and metrics were collected before the failure.
  def build_error_result(error_message)
    AutoEvaluation::Result.new(status: "error", error_message:, llm_responses:, metrics:)
  end

  # Result for a scored run; the status is derived by comparing the score
  # with THRESHOLD.
  def build_result_with_score(score, reason)
    status = score >= THRESHOLD ? "success" : "failure"
    AutoEvaluation::Result.new(status:, score:, reason:, llm_responses:, metrics:)
  end

  # Renders one chunk as four newline-terminated lines: title, heading
  # hierarchy joined with " > ", description and HTML content.
  def format_chunk_for_evaluation(chunk)
    <<~CHUNK
      #{chunk.title}
      #{chunk.heading_hierarchy.join(' > ')}
      #{chunk.description}
      #{chunk.html_content}
    CHUNK
  end
end