Skip to content

Commit a589392

Browse files
authored
Merge pull request #168 from alphagov/add-guardrail-names-for-eval
Add guardrail names to output guardrail evaluation task
2 parents c870877 + 056b438 commit a589392

5 files changed

Lines changed: 40 additions & 21 deletions

File tree

lib/answer_composition/pipeline/answer_guardrails.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ def call(context)
99
context.abort_pipeline!(
1010
message: Answer::CannedResponses::ANSWER_GUARDRAILS_FAILED_MESSAGE,
1111
status: "guardrails_answer",
12-
answer_guardrails_failures: response.guardrails,
12+
answer_guardrails_failures: response.triggered_guardrails,
1313
answer_guardrails_status: :fail,
1414
metrics: { guardrail_name => build_metrics(start_time, response) },
1515
)

lib/answer_composition/pipeline/question_routing_guardrails.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ def call(context)
1111
context.answer.assign_attributes(
1212
message: Answer::CannedResponses::QUESTION_ROUTING_GUARDRAILS_FAILED_MESSAGE,
1313
status: "guardrails_question_routing",
14-
question_routing_guardrails_failures: response.guardrails,
14+
question_routing_guardrails_failures: response.triggered_guardrails,
1515
)
1616
end
1717

lib/guardrails/multiple_checker.rb

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,14 @@
11
module Guardrails
22
class MultipleChecker
33
Result = Data.define(:triggered, :guardrails, :llm_response, :llm_guardrail_result,
4-
:llm_prompt_tokens, :llm_completion_tokens, :llm_cached_tokens)
4+
:llm_prompt_tokens, :llm_completion_tokens, :llm_cached_tokens) do
5+
def triggered_guardrails
6+
return [] unless guardrails
7+
8+
guardrails.select { |_, v| v }.keys
9+
end
10+
end
11+
512
class ResponseError < StandardError
613
attr_reader :llm_response, :llm_prompt_tokens, :llm_completion_tokens, :llm_cached_tokens
714

@@ -94,7 +101,7 @@ def parse_response(llm_response:, llm_guardrail_result:, llm_prompt_tokens:, llm
94101

95102
parts = llm_guardrail_result.split(" | ")
96103
triggered = parts.first.chomp == "True"
97-
guardrails = triggered ? extract_guardrails(parts.second) : []
104+
guardrails = to_guardrail_hash(parts.second)
98105

99106
Result.new(
100107
llm_response: llm_response,
@@ -122,9 +129,12 @@ def response_pattern
122129
end
123130
end
124131

125-
def extract_guardrails(parts)
126-
guardrail_numbers = parts.scan(/\d+/).map(&:to_i)
127-
prompt.guardrails.select { |guardrail| guardrail.key.in?(guardrail_numbers) }.map(&:name)
132+
def to_guardrail_hash(parts)
133+
triggered_guardrail_numbers = parts.scan(/\d+/).map(&:to_i)
134+
135+
prompt.guardrails.each_with_object({}) do |guardrail, guardrails_hash|
136+
guardrails_hash[guardrail.name.to_sym] = triggered_guardrail_numbers.include?(guardrail.key)
137+
end
128138
end
129139
end
130140
end

spec/factories/output_guardrail_result_factory.rb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,13 @@
1818

1919
trait :pass do
2020
triggered { false }
21-
guardrails { [] }
21+
guardrails { { political: false, appropriate_language: false } }
2222
llm_guardrail_result { "False | None" }
2323
end
2424

2525
trait :fail do
2626
triggered { true }
27-
guardrails { %w[political] }
27+
guardrails { { political: true, appropriate_language: false } }
2828
llm_guardrail_result { 'True | "3"' }
2929
end
3030
end

spec/lib/guardrails/multiple_checker_spec.rb

Lines changed: 21 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -8,18 +8,19 @@
88
let(:guardrail_response_hash) do
99
{
1010
llm_response: {
11-
"message" => {
12-
"role" => "assistant",
13-
"content" => "False | None",
11+
message: {
12+
role: "assistant",
13+
content: "False | None",
1414
},
15-
"finish_reason" => "stop",
15+
finish_reason: "stop",
1616
},
1717
llm_guardrail_result: "False | None",
1818
llm_prompt_tokens: 13,
1919
llm_completion_tokens: 7,
2020
llm_cached_tokens: 10,
2121
}
2222
end
23+
let(:guardrail_result) { build(:guardrails_multiple_checker_result, :pass) }
2324

2425
it "raises an error if the llm_provider is unknown" do
2526
expect { described_class.call(input, llm_prompt_name, :unknown_provider) }
@@ -33,11 +34,10 @@
3334
guardrails_config = {
3435
system_prompt: "{guardrails} {date}",
3536
user_prompt: "{input}",
36-
guardrails: %w[costs personal unique_answer_guardrail],
37+
guardrails: %w[political appropriate_language],
3738
guardrail_definitions: {
38-
"costs" => "This is a costs guardrail",
39-
"personal" => "This is a personal guardrail",
40-
"unique_answer_guardrail" => "This is a unique answer guardrail",
39+
"political" => "This is a political guardrail",
40+
"appropriate_language" => "This is an appropriate language guardrail",
4141
},
4242
}.with_indifferent_access
4343

@@ -49,6 +49,11 @@
4949
described_class.call(input, llm_prompt_name, llm_provider)
5050
expect(Guardrails::OpenAI::MultipleChecker).to have_received(:call).with(input, instance_of(Guardrails::MultipleChecker::Prompt))
5151
end
52+
53+
it "returns the guardrail result" do
54+
result = described_class.call(input, llm_prompt_name, llm_provider)
55+
expect(result).to eq(guardrail_result)
56+
end
5257
end
5358

5459
context "when the llm_provider is :claude" do
@@ -58,11 +63,10 @@
5863
guardrails_config = {
5964
system_prompt: "{guardrails} {date}",
6065
user_prompt: "{input}",
61-
guardrails: %w[costs personal unique_answer_guardrail],
66+
guardrails: %w[political appropriate_language],
6267
guardrail_definitions: {
63-
"costs" => "This is a costs guardrail",
64-
"personal" => "This is a personal guardrail",
65-
"unique_answer_guardrail" => "This is a unique answer guardrail",
68+
"political" => "This is a political guardrail",
69+
"appropriate_language" => "This is an appropriate language guardrail",
6670
},
6771
}.with_indifferent_access
6872

@@ -75,6 +79,11 @@
7579
expect(Guardrails::Claude::MultipleChecker).to have_received(:call).with(input, instance_of(Guardrails::MultipleChecker::Prompt))
7680
end
7781

82+
it "returns the guardrail result" do
83+
result = described_class.call(input, llm_prompt_name, llm_provider)
84+
expect(result).to eq(guardrail_result)
85+
end
86+
7887
context "when the response format is incorrect" do
7988
it "throws a ResponseError" do
8089
guardrail_result = 'False | "1, 2"'

0 commit comments

Comments
 (0)