spiceai · lukekim · Mar 3, 2026 · Mar 3, 2026 · Mar 3, 2026 · Mar 3, 2026
diff --git a/.github/actions/parse-llm-test-result/action.yml b/.github/actions/parse-llm-test-result/action.yml
@@ -50,6 +50,26 @@ runs:
         # Also get just the last 50 lines for focused matching (LLM typically puts result at end)
         TAIL_OUTPUT=$(tail -50 "$OUTPUT_FILE" | tr '[:upper:]' '[:lower:]' | tr -s '[:space:]' ' ')
 
+        # Require an explicit verdict on the last non-blank line of the LLM output (trailing spaces, tabs and CRs are ignored)
+        FINAL_LINE=$(awk '{sub(/[ \t\r]+$/, "")} NF{line=$0} END{print line}' "$OUTPUT_FILE" | tr -d '\r')
+        FINAL_LINE_LOWER=$(printf '%s\n' "$FINAL_LINE" | tr '[:upper:]' '[:lower:]')
+        # Exact 'TEST PASSED' => passed; a case-insensitive 'TEST FAILED:' prefix => failed; anything else => error
+        if [ "$FINAL_LINE" = "TEST PASSED" ]; then
+          RESULT="passed"
+          REASON=""
+        elif printf '%s\n' "$FINAL_LINE_LOWER" | grep -q '^test failed:'; then
+          RESULT="failed"
+          # Strip the prefix with the same case-insensitive match used for detection, so e.g. 'Test failed:x' does not leak the prefix into REASON
+          REASON=$(printf '%s\n' "$FINAL_LINE" | sed -E 's/^[Tt][Ee][Ss][Tt] [Ff][Aa][Ii][Ll][Ee][Dd]:[[:space:]]*//')
+          # Normalize output mismatch reasons to explicit recipe-update wording
+          if printf '%s\n' "$FINAL_LINE_LOWER" | grep -Eq 'mismatch|did not match|does not match|unexpected output|output differs'; then
+            REASON="Spice output does not match the recipe documentation; recipe needs updating. ${REASON}"
+          fi
+        else
+          RESULT="error"
+          REASON="Missing strict final verdict line. Last line must be exactly 'TEST PASSED' or start with 'TEST FAILED:'."
+        fi
+
         # === FAIL PATTERNS (check first - fail takes priority) ===
         # These patterns indicate test failure
         FAIL_PATTERNS=(
@@ -84,66 +104,71 @@ runs:
           "timeout"
         )
 
-        RESULT=""
-        REASON=""
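+        # NOTE(review): the strict verdict block above assigns RESULT on every branch (passed/failed/error), so this guard always matches and the fallback heuristics in the else branch are currently unreachable.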
+        if [ "$RESULT" = "passed" ] || [ "$RESULT" = "failed" ] || [ "$RESULT" = "error" ]; then
+          :
+        else
+          RESULT=""
+          REASON=""
 
-        # Check for explicit fail patterns in the tail (most relevant)
-        for pattern in "${FAIL_PATTERNS[@]}"; do
-          if echo "$TAIL_OUTPUT" | grep -qi "$pattern"; then
-            RESULT="failed"
-            # Try to extract the reason after "failed:" or similar
-            REASON=$(echo "$TAIL_OUTPUT" | grep -oi "failed[: ]*[^.]*" | tail -1 || echo "Test failed")
-            break
-          fi
-        done
-
-        # If no fail found, check for error patterns
-        if [ -z "$RESULT" ]; then
-          for pattern in "${ERROR_PATTERNS[@]}"; do
+          # Check for explicit fail patterns in the tail (most relevant)
+          for pattern in "${FAIL_PATTERNS[@]}"; do
             if echo "$TAIL_OUTPUT" | grep -qi "$pattern"; then
-              RESULT="error"
-              REASON="Test inconclusive or timed out"
+              RESULT="failed"
+              # Try to extract the reason after "failed:" or similar
+              REASON=$(echo "$TAIL_OUTPUT" | grep -oi "failed[: ]*[^.]*" | tail -1 || echo "Test failed")
               break
             fi
           done
-        fi
 
-        # If no fail/error found, check for pass patterns
-        if [ -z "$RESULT" ]; then
-          for pattern in "${PASS_PATTERNS[@]}"; do
-            if echo "$TAIL_OUTPUT" | grep -qi "$pattern"; then
-              RESULT="passed"
-              REASON=""
-              break
-            fi
-          done
-        fi
+          # If no fail found, check for error patterns
+          if [ -z "$RESULT" ]; then
+            for pattern in "${ERROR_PATTERNS[@]}"; do
+              if echo "$TAIL_OUTPUT" | grep -qi "$pattern"; then
+                RESULT="error"
+                REASON="Test inconclusive or timed out"
+                break
+              fi
+            done
+          fi
 
-        # If still no result, check the full output as fallback
-        if [ -z "$RESULT" ]; then
-          for pattern in "${FAIL_PATTERNS[@]}"; do
-            if echo "$NORMALIZED_OUTPUT" | grep -qi "$pattern"; then
-              RESULT="failed"
-              REASON="Test failed (detected in full output)"
-              break
-            fi
-          done
-        fi
+          # If no fail/error found, check for pass patterns
+          if [ -z "$RESULT" ]; then
+            for pattern in "${PASS_PATTERNS[@]}"; do
+              if echo "$TAIL_OUTPUT" | grep -qi "$pattern"; then
+                RESULT="passed"
+                REASON=""
+                break
+              fi
+            done
+          fi
 
-        if [ -z "$RESULT" ]; then
-          for pattern in "${PASS_PATTERNS[@]}"; do
-            if echo "$NORMALIZED_OUTPUT" | grep -qi "$pattern"; then
-              RESULT="passed"
-              REASON=""
-              break
-            fi
-          done
-        fi
+          # If still no result, check the full output as fallback
+          if [ -z "$RESULT" ]; then
+            for pattern in "${FAIL_PATTERNS[@]}"; do
+              if echo "$NORMALIZED_OUTPUT" | grep -qi "$pattern"; then
+                RESULT="failed"
+                REASON="Test failed (detected in full output)"
+                break
+              fi
+            done
+          fi
 
-        # Default to error if we couldn't determine result
-        if [ -z "$RESULT" ]; then
-          RESULT="error"
-          REASON="Could not determine test result from LLM output"
+          if [ -z "$RESULT" ]; then
+            for pattern in "${PASS_PATTERNS[@]}"; do
+              if echo "$NORMALIZED_OUTPUT" | grep -qi "$pattern"; then
+                RESULT="passed"
+                REASON=""
+                break
+              fi
+            done
+          fi
+
+          # Default to error if we couldn't determine result
+          if [ -z "$RESULT" ]; then
+            RESULT="error"
+            REASON="Could not determine test result from LLM output"
+          fi
         fi
 
         # Output results
@@ -154,6 +179,7 @@ runs:
         if [ "$DEBUG" = "true" ]; then
           echo "--- Parse LLM Test Result Debug ---"
           echo "File size: $FILE_SIZE bytes"
+          echo "Final line: $FINAL_LINE"
           echo "Result: $RESULT"
           echo "Reason: $REASON"
           echo ""

diff --git a/.github/workflows/codex-test-ai.yml b/.github/workflows/codex-test-ai.yml
@@ -29,5 +29,6 @@ jobs:
         This recipe demonstrates the ai() SQL function for LLM integration.
         Requires SPICE_OPENAI_API_KEY to be set in .env file.
         Test with simple ai() queries as shown in the README.
+        Do not skip steps; any Spice/runtime error or output mismatch must fail and indicates the recipe needs updating.
     secrets:
       OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
diff --git a/.github/workflows/codex-test-arrow.yml b/.github/workflows/codex-test-arrow.yml
@@ -27,5 +27,6 @@ jobs:
       requires_secrets: false
       additional_instructions: |
         This recipe demonstrates using Apache Arrow as an in-memory acceleration engine.
+        Do not skip steps; any Spice/runtime error or output mismatch must fail and indicates the recipe needs updating.
     secrets:
       OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
diff --git a/.github/workflows/codex-test-deepseek.yml b/.github/workflows/codex-test-deepseek.yml
@@ -28,5 +28,6 @@ jobs:
       additional_instructions: |
         This recipe demonstrates using DeepSeek models.
         Requires DeepSeek API key.
+        Do not skip steps; any Spice/runtime error or output mismatch must fail and indicates the recipe needs updating.
     secrets:
       OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
diff --git a/.github/workflows/codex-test-duckdb-connector.yml b/.github/workflows/codex-test-duckdb-connector.yml
@@ -28,5 +28,6 @@ jobs:
       additional_instructions: |
         This recipe demonstrates using DuckDB as a data source.
         The DuckDB database file may need to be created or downloaded.
+        Do not skip steps; any Spice/runtime error or output mismatch must fail and indicates the recipe needs updating.
     secrets:
       OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
diff --git a/.github/workflows/codex-test-evals.yml b/.github/workflows/codex-test-evals.yml
@@ -28,5 +28,6 @@ jobs:
       additional_instructions: |
         This recipe demonstrates language model evaluations.
         Requires SPICE_OPENAI_API_KEY.
+        Do not skip steps; any Spice/runtime error, config error, or output mismatch must fail and indicates the recipe needs updating.
     secrets:
       OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
diff --git a/.github/workflows/codex-test-file.yml b/.github/workflows/codex-test-file.yml
@@ -29,5 +29,6 @@ jobs:
         This recipe demonstrates using local files (Parquet, CSV, Markdown) as data sources.
         You may need to download sample data files as shown in the README.
         Test the Parquet file example first as it's the simplest.
+        Do not skip steps; any Spice/runtime error or output mismatch must fail and indicates the recipe needs updating.
     secrets:
       OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
diff --git a/.github/workflows/codex-test-hashed-partitioning.yml b/.github/workflows/codex-test-hashed-partitioning.yml
@@ -27,5 +27,6 @@ jobs:
       requires_secrets: false
       additional_instructions: |
         This recipe demonstrates hashed partitioning with DuckDB.
+        Do not skip steps; any Spice/runtime error or output mismatch must fail and indicates the recipe needs updating.
     secrets:
       OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
diff --git a/.github/workflows/codex-test-http.yml b/.github/workflows/codex-test-http.yml
@@ -28,5 +28,6 @@ jobs:
       additional_instructions: |
         This recipe demonstrates using HTTP endpoints as data sources.
         Uses public APIs that don't require authentication.
+        Do not skip steps; any Spice/runtime error or output mismatch must fail and indicates the recipe needs updating.
     secrets:
       OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
diff --git a/.github/workflows/codex-test-llm-judge.yml b/.github/workflows/codex-test-llm-judge.yml
@@ -28,5 +28,6 @@ jobs:
       additional_instructions: |
         This recipe demonstrates LLM as a judge for evaluations.
         Requires SPICE_OPENAI_API_KEY.
+        Do not skip steps; any Spice/runtime error or output mismatch must fail and indicates the recipe needs updating.
     secrets:
       OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
diff --git a/.github/workflows/codex-test-llm-memory.yml b/.github/workflows/codex-test-llm-memory.yml
@@ -28,5 +28,6 @@ jobs:
       additional_instructions: |
         This recipe demonstrates persistent memory for language models.
         Requires SPICE_OPENAI_API_KEY.
+        Do not skip steps; any Spice/runtime error or output mismatch must fail and indicates the recipe needs updating.
     secrets:
       OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
diff --git a/.github/workflows/codex-test-localpod.yml b/.github/workflows/codex-test-localpod.yml
@@ -27,5 +27,7 @@ jobs:
       requires_secrets: false
       additional_instructions: |
         This recipe demonstrates using local pod configurations.
+        After running `./generate_data.sh`, wait up to 30 seconds for the documented refresh to apply before concluding failure on `SELECT COUNT(*) FROM local_time_series;`.
+        Do not skip steps; any Spice/runtime error or output mismatch must fail and indicates the recipe needs updating.
     secrets:
       OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
diff --git a/.github/workflows/codex-test-mcp.yml b/.github/workflows/codex-test-mcp.yml
@@ -27,5 +27,6 @@ jobs:
       requires_secrets: false
       additional_instructions: |
         This recipe demonstrates Model Context Protocol integration.
+        Do not skip steps; any Spice/runtime error or output mismatch must fail and indicates the recipe needs updating.
     secrets:
       OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
diff --git a/.github/workflows/codex-test-models-openai.yml b/.github/workflows/codex-test-models-openai.yml
@@ -28,5 +28,6 @@ jobs:
       additional_instructions: |
         This recipe demonstrates using OpenAI LLM and embedding models.
         Requires SPICE_OPENAI_API_KEY.
+        Do not skip steps; any Spice/runtime error or output mismatch must fail and indicates the recipe needs updating.
     secrets:
       OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
diff --git a/.github/workflows/codex-test-openai-responses-api.yml b/.github/workflows/codex-test-openai-responses-api.yml
@@ -28,5 +28,6 @@ jobs:
       additional_instructions: |
         This recipe demonstrates OpenAI's Responses API with Spice.
         Requires OPENAI_API_KEY.
+        Do not skip steps; any Spice/runtime error or output mismatch must fail and indicates the recipe needs updating.
     secrets:
       OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
diff --git a/.github/workflows/codex-test-openai-sdk.yml b/.github/workflows/codex-test-openai-sdk.yml
@@ -28,5 +28,6 @@ jobs:
       additional_instructions: |
         This recipe demonstrates using the OpenAI SDK with Spice-hosted models.
         Requires OPENAI_API_KEY.
+        Do not skip steps; any Spice/runtime error or output mismatch must fail and indicates the recipe needs updating.
     secrets:
       OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}