Add schema validation to BedrockOpenAIOssInvoke

davidgisbey · davidgisbey · commit c05b0a6d72de · 2025-12-31T11:04:43.000Z
This updates the BedrockOpenAIOssInvoke class to include JSON schema validation
for the structured output received from the LLM.

It uses the first tool's schema since all of our auto-eval metrics
use a single tool. If this is to change we could consider passing the
schema in via the method parameters down the line.

While doing this I noticed that I hadn't consistently adhered to
the schemas in some test cases, so I've updated those.
diff --git a/lib/auto_evaluation/bedrock_openai_oss_invoke.rb b/lib/auto_evaluation/bedrock_openai_oss_invoke.rb
@@ -1,5 +1,6 @@
 module AutoEvaluation
   class BedrockOpenAIOssInvoke
+    class InvalidToolCallSchemaError < StandardError; end
     Result = Data.define(
       :evaluation_data,
       :llm_response,
@@ -33,12 +34,14 @@ def call
         }.to_json,
       )
       parsed_response = JSON.parse(response.body.read)
-      parsed_structured_output = JSON.parse(
+      parsed_tool_output = JSON.parse(
         parsed_response["choices"][0]["message"]["tool_calls"][0]["function"]["arguments"],
       )
 
+      validate_tool_output_against_schema(parsed_tool_output)
+
       Result.new(
-        evaluation_data: parsed_structured_output,
+        evaluation_data: parsed_tool_output,
         llm_response: parsed_response,
         metrics: build_metrics(start_time, parsed_response),
       )
@@ -57,5 +60,12 @@ def build_metrics(start_time, response)
         model: response["model"],
       }
     end
+
+    def validate_tool_output_against_schema(tool_output)
+      schema = tools.dig(0, "function", "parameters")
+      JSON::Validator.validate!(schema, tool_output)
+    rescue JSON::Schema::ValidationError => e
+      raise InvalidToolCallSchemaError, "Tool call response does not match schema: #{e.message}"
+    end
   end
 end
diff --git a/spec/lib/auto_evaluation/answer_relevancy/verdicts_generator_spec.rb b/spec/lib/auto_evaluation/answer_relevancy/verdicts_generator_spec.rb
@@ -4,8 +4,8 @@
     let(:statements) { ["Statement one.", "Statement two."] }
     let(:verdicts) do
       [
-        { "verdict" => "Yes" },
-        { "verdict" => "No", "reason" => "The statement is irrelevant." },
+        { "verdict" => "yes" },
+        { "verdict" => "no", "reason" => "The statement is irrelevant." },
       ]
     end
     let(:verdicts_json) do
diff --git a/spec/lib/auto_evaluation/answer_relevancy_spec.rb b/spec/lib/auto_evaluation/answer_relevancy_spec.rb
@@ -23,8 +23,8 @@
 
     let(:verdicts) do
       [
-        { "verdict" => "Yes" },
-        { "verdict" => "No", "reason" => "The statement is irrelevant." },
+        { "verdict" => "yes" },
+        { "verdict" => "no", "reason" => "The statement is irrelevant." },
       ]
     end
     let(:verdicts_json) { { verdicts: }.to_json }
@@ -104,7 +104,7 @@
       let(:verdicts) do
         [
           { "verdict" => "idk", "reason" => "Cannot determine relevance." },
-          { "verdict" => "No", "reason" => "The statement is irrelevant." },
+          { "verdict" => "no", "reason" => "The statement is irrelevant." },
         ]
       end
 
@@ -172,7 +172,7 @@
     end
 
     context "when verdicts are generated and none have a 'no' verdict" do
-      let(:verdicts_json) { { verdicts: [{ "verdict" => "Yes" }, { "verdict" => "Yes" }] }.to_json }
+      let(:verdicts_json) { { verdicts: [{ "verdict" => "yes" }, { "verdict" => "yes" }] }.to_json }
 
       it "returns a result object with the expected attributes" do
         allow(Clock).to receive(:monotonic_time).and_return(200.0, 202.0, 204.0, 206.0)
diff --git a/spec/lib/auto_evaluation/bedrock_openai_oss_invoke_spec.rb b/spec/lib/auto_evaluation/bedrock_openai_oss_invoke_spec.rb
@@ -4,18 +4,18 @@
     let(:tools) do
       [
         {
-          type: "function",
-          function: {
-            name: "test_schema",
-            description: "A test JSON schema",
-            schema: {
-              type: "object",
-              properties: {
-                response: { type: "string" },
+          "type" => "function",
+          "function" => {
+            "name" => "test_schema",
+            "description" => "A test JSON schema",
+            "parameters" => {
+              "type" => "object",
+              "properties" => {
+                "response" => { "type" => "string" },
               },
-              required: %w[response],
+              "required" => %w[response],
             },
-            strict: true,
+            "strict" => true,
           },
         },
       ]
@@ -55,5 +55,20 @@
         },
       )
     end
+
+    it "raises an error if the response does not conform to the schema" do
+      bedrock_invoke_model_openai_oss_tool_call(
+        user_message,
+        tools,
+        { "invalid_key" => "This does not conform to the schema." }.to_json,
+      )
+
+      expect {
+        described_class.call(user_message, tools)
+      }.to raise_error(
+        described_class::InvalidToolCallSchemaError,
+        /The property '#\/' did not contain a required property of 'response'/,
+      )
+    end
   end
 end
diff --git a/spec/lib/auto_evaluation/coherence_spec.rb b/spec/lib/auto_evaluation/coherence_spec.rb
@@ -4,7 +4,7 @@
     let(:question_message) { "This is a test question message." }
     let(:answer_message) { "This is a test answer message." }
     let(:reason) { "This is the reason for the score." }
-    let(:response_json) { { score: 3.0, reason: }.to_json }
+    let(:response_json) { { score: 3, reason: }.to_json }
     let(:user_prompt) do
       sprintf(
         prompts.fetch(:user_prompt),
@@ -49,11 +49,11 @@
 
     it "returns the correct score and success for each rubric score" do
       {
-        1.0 => 0.0,
-        2.0 => 0.25,
-        3.0 => 0.5,
-        4.0 => 0.75,
-        5.0 => 1.0,
+        1 => 0.0,
+        2 => 0.25,
+        3 => 0.5,
+        4 => 0.75,
+        5 => 1.0,
       }.each do |rubric_score, expected_score|
         response_json = { score: rubric_score, reason: }.to_json
         bedrock_invoke_model_openai_oss_tool_call(

Original file line number	Diff line number	Diff line change
`@@ -4,8 +4,8 @@`
`4`	`4`	`let(:statements) { ["Statement one.", "Statement two."] }`
`5`	`5`	`let(:verdicts) do`
`6`	`6`	`[`
`7`		`- { "verdict" => "Yes" },`
`8`		`- { "verdict" => "No", "reason" => "The statement is irrelevant." },`
	`7`	`+ { "verdict" => "yes" },`
	`8`	`+ { "verdict" => "no", "reason" => "The statement is irrelevant." },`
`9`	`9`	`]`
`10`	`10`	`end`
`11`	`11`	`let(:verdicts_json) do`
Original file line number	Diff line number	Diff line change
`@@ -23,8 +23,8 @@`
`23`	`23`
`24`	`24`	`let(:verdicts) do`
`25`	`25`	`[`
`26`		`- { "verdict" => "Yes" },`
`27`		`- { "verdict" => "No", "reason" => "The statement is irrelevant." },`
	`26`	`+ { "verdict" => "yes" },`
	`27`	`+ { "verdict" => "no", "reason" => "The statement is irrelevant." },`
`28`	`28`	`]`
`29`	`29`	`end`
`30`	`30`	`let(:verdicts_json) { { verdicts: }.to_json }`
`@@ -104,7 +104,7 @@`
`104`	`104`	`let(:verdicts) do`
`105`	`105`	`[`
`106`	`106`	`{ "verdict" => "idk", "reason" => "Cannot determine relevance." },`
`107`		`- { "verdict" => "No", "reason" => "The statement is irrelevant." },`
	`107`	`+ { "verdict" => "no", "reason" => "The statement is irrelevant." },`
`108`	`108`	`]`
`109`	`109`	`end`
`110`	`110`
`@@ -172,7 +172,7 @@`
`172`	`172`	`end`
`173`	`173`
`174`	`174`	`context "when verdicts are generated and none have a 'no' verdict" do`
`175`		`- let(:verdicts_json) { { verdicts: [{ "verdict" => "Yes" }, { "verdict" => "Yes" }] }.to_json }`
	`175`	`+ let(:verdicts_json) { { verdicts: [{ "verdict" => "yes" }, { "verdict" => "yes" }] }.to_json }`
`176`	`176`
`177`	`177`	`it "returns a result object with the expected attributes" do`
`178`	`178`	`allow(Clock).to receive(:monotonic_time).and_return(200.0, 202.0, 204.0, 206.0)`