Skip to content

Commit 056b438

Browse files
committed
Return pass/fail status for each output guardrail
Output guardrail results now include a hash keyed by guardrail name with a boolean pass/fail flag, rather than an array containing only the names of the guardrails that failed. This gives the evaluation tool (1) the exact number of guardrail definitions, as well as their names. Without these details, they would have to be hardcoded in the evaluation codebase, which could lead to discrepancies whenever the guardrail definitions are updated. 1: alphagov/govuk-chat-evaluation#24
1 parent f040171 commit 056b438

5 files changed

Lines changed: 40 additions & 21 deletions

File tree

lib/answer_composition/pipeline/answer_guardrails.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ def call(context)
99
context.abort_pipeline!(
1010
message: Answer::CannedResponses::ANSWER_GUARDRAILS_FAILED_MESSAGE,
1111
status: "guardrails_answer",
12-
answer_guardrails_failures: response.guardrails,
12+
answer_guardrails_failures: response.triggered_guardrails,
1313
answer_guardrails_status: :fail,
1414
metrics: { guardrail_name => build_metrics(start_time, response) },
1515
)

lib/answer_composition/pipeline/question_routing_guardrails.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ def call(context)
1111
context.answer.assign_attributes(
1212
message: Answer::CannedResponses::QUESTION_ROUTING_GUARDRAILS_FAILED_MESSAGE,
1313
status: "guardrails_question_routing",
14-
question_routing_guardrails_failures: response.guardrails,
14+
question_routing_guardrails_failures: response.triggered_guardrails,
1515
)
1616
end
1717

lib/guardrails/multiple_checker.rb

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,14 @@
11
module Guardrails
22
class MultipleChecker
33
Result = Data.define(:triggered, :guardrails, :llm_response, :llm_guardrail_result,
4-
:llm_prompt_tokens, :llm_completion_tokens, :llm_cached_tokens)
4+
:llm_prompt_tokens, :llm_completion_tokens, :llm_cached_tokens) do
5+
def triggered_guardrails
6+
return [] unless guardrails
7+
8+
guardrails.select { |_, v| v }.keys
9+
end
10+
end
11+
512
class ResponseError < StandardError
613
attr_reader :llm_response, :llm_prompt_tokens, :llm_completion_tokens, :llm_cached_tokens
714

@@ -94,7 +101,7 @@ def parse_response(llm_response:, llm_guardrail_result:, llm_prompt_tokens:, llm
94101

95102
parts = llm_guardrail_result.split(" | ")
96103
triggered = parts.first.chomp == "True"
97-
guardrails = triggered ? extract_guardrails(parts.second) : []
104+
guardrails = to_guardrail_hash(parts.second)
98105

99106
Result.new(
100107
llm_response: llm_response,
@@ -122,9 +129,12 @@ def response_pattern
122129
end
123130
end
124131

125-
def extract_guardrails(parts)
126-
guardrail_numbers = parts.scan(/\d+/).map(&:to_i)
127-
prompt.guardrails.select { |guardrail| guardrail.key.in?(guardrail_numbers) }.map(&:name)
132+
def to_guardrail_hash(parts)
133+
triggered_guardrail_numbers = parts.scan(/\d+/).map(&:to_i)
134+
135+
prompt.guardrails.each_with_object({}) do |guardrail, guardrails_hash|
136+
guardrails_hash[guardrail.name.to_sym] = triggered_guardrail_numbers.include?(guardrail.key)
137+
end
128138
end
129139
end
130140
end

spec/factories/output_guardrail_result_factory.rb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,13 @@
1818

1919
trait :pass do
2020
triggered { false }
21-
guardrails { [] }
21+
guardrails { { political: false, appropriate_language: false } }
2222
llm_guardrail_result { "False | None" }
2323
end
2424

2525
trait :fail do
2626
triggered { true }
27-
guardrails { %w[political] }
27+
guardrails { { political: true, appropriate_language: false } }
2828
llm_guardrail_result { 'True | "3"' }
2929
end
3030
end

spec/lib/guardrails/multiple_checker_spec.rb

Lines changed: 21 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -8,18 +8,19 @@
88
let(:guardrail_response_hash) do
99
{
1010
llm_response: {
11-
"message" => {
12-
"role" => "assistant",
13-
"content" => "False | None",
11+
message: {
12+
role: "assistant",
13+
content: "False | None",
1414
},
15-
"finish_reason" => "stop",
15+
finish_reason: "stop",
1616
},
1717
llm_guardrail_result: "False | None",
1818
llm_prompt_tokens: 13,
1919
llm_completion_tokens: 7,
2020
llm_cached_tokens: 10,
2121
}
2222
end
23+
let(:guardrail_result) { build(:guardrails_multiple_checker_result, :pass) }
2324

2425
it "raises an error if the llm_provider is unknown" do
2526
expect { described_class.call(input, llm_prompt_name, :unknown_provider) }
@@ -33,11 +34,10 @@
3334
guardrails_config = {
3435
system_prompt: "{guardrails} {date}",
3536
user_prompt: "{input}",
36-
guardrails: %w[costs personal unique_answer_guardrail],
37+
guardrails: %w[political appropriate_language],
3738
guardrail_definitions: {
38-
"costs" => "This is a costs guardrail",
39-
"personal" => "This is a personal guardrail",
40-
"unique_answer_guardrail" => "This is a unique answer guardrail",
39+
"political" => "This is a political guardrail",
40+
"appropriate_language" => "This is an appropriate language guardrail",
4141
},
4242
}.with_indifferent_access
4343

@@ -49,6 +49,11 @@
4949
described_class.call(input, llm_prompt_name, llm_provider)
5050
expect(Guardrails::OpenAI::MultipleChecker).to have_received(:call).with(input, instance_of(Guardrails::MultipleChecker::Prompt))
5151
end
52+
53+
it "returns the guardrail result" do
54+
result = described_class.call(input, llm_prompt_name, llm_provider)
55+
expect(result).to eq(guardrail_result)
56+
end
5257
end
5358

5459
context "when the llm_provider is :claude" do
@@ -58,11 +63,10 @@
5863
guardrails_config = {
5964
system_prompt: "{guardrails} {date}",
6065
user_prompt: "{input}",
61-
guardrails: %w[costs personal unique_answer_guardrail],
66+
guardrails: %w[political appropriate_language],
6267
guardrail_definitions: {
63-
"costs" => "This is a costs guardrail",
64-
"personal" => "This is a personal guardrail",
65-
"unique_answer_guardrail" => "This is a unique answer guardrail",
68+
"political" => "This is a political guardrail",
69+
"appropriate_language" => "This is an appropriate language guardrail",
6670
},
6771
}.with_indifferent_access
6872

@@ -75,6 +79,11 @@
7579
expect(Guardrails::Claude::MultipleChecker).to have_received(:call).with(input, instance_of(Guardrails::MultipleChecker::Prompt))
7680
end
7781

82+
it "returns the guardrail result" do
83+
result = described_class.call(input, llm_prompt_name, llm_provider)
84+
expect(result).to eq(guardrail_result)
85+
end
86+
7887
context "when the response format is incorrect" do
7988
it "throws a ResponseError" do
8089
guardrail_result = 'False | "1, 2"'

0 commit comments

Comments
 (0)