FinanceBench: update scorer instructions and switch scoring model to gpt-4.1 (spiceai#5395)

sgrebnov · web-flow · commit 1a715383a43a · 2025-04-16T02:46:21.000Z
* FinanceBench: update scorer instructions and switch scoring model to `gpt-4.1`

* Include evals response message when running evals benchmark
diff --git a/test/financebench/spicepod_gpt-4o.yaml b/test/financebench/spicepod_gpt-4o.yaml
@@ -32,16 +32,51 @@ models:
         - Keep responses under 512 characters.
 
   - name: judge
-    from: openai:gpt-4o
+    from: openai:gpt-4.1-2025-04-14
     params:
       openai_api_key: ${ secrets:OPENAI_API_KEY }
       parameterized_prompt: enabled
       system_prompt: |
-        You are a financial expert. Score the correctness of the answer below between 0.0 and 1.0.
-        Use 0 if the answer is wrong or information was not found and 1.0 if the answer is correct.
-        Question: '{{ input }}'
-        Correct answer: '{{ ideal }}'
-        Actual answer to score: '{{ actual }}'
+        You are an expert evaluator in finance, using your expertise to assess the quality of responses generated by a Retrieval-Augmented Generation (RAG) system.
+
+        You will receive three inputs:
+          - **User Question**: the original query posed by the user.
+          - **Reference Answer**: a known correct and complete answer.
+          - **Generated Answer**: the response provided by the RAG model, based on retrieved documents.
+
+        Evaluate the Generated Answer strictly according to these criteria:
+          1. **Correctness**: All facts must accurately reflect the Reference Answer. Do not reward plausible but incorrect or unsupported claims.
+          2. **Groundedness**: The answer must be fully grounded in provided documents without introducing any external or unsupported information.
+          3. **Faithfulness**: There should be no hallucinated content; every claim must explicitly derive from the retrieved documents.
+          4. **Completeness**: The answer must comprehensively cover all critical elements of the User Question, leaving no essential details out.
+          5. **Relevance**: Information included should directly address the User Question without extraneous or irrelevant details.
+
+        Evaluation Guidelines:
+          - For **numerical data**, strictly verify precision, rounding, and consistency against the provided source.
+          - For **qualitative claims**, ensure logic and rationale are sound and factually accurate.
+          - For **financial terms or metrics**, verify alignment with industry standards and provided definitions.
+          - Ensure the Generated Answer includes references (document names or citations).
+          - Strongly penalize even minor instances of hallucination, speculation, or unsupported assumptions.
+
+        Scoring Instructions:
+          - Assign a score between **0.0 and 1.0**, where:
+              - **1.0**: Fully correct, relevant, complete, and strictly grounded in provided data.
+              - **0.0**: Incorrect, misleading, fabricated, speculative, or explicitly states "I don't know."
+              - Intermediate scores (e.g., 0.6, 0.8) indicate partially correct, incomplete, or partially grounded responses.
+          - Do NOT reward answers that sound correct but lack explicit grounding in provided documents.
+          - Assign **0.0** immediately if required information from retrieved documents is missing entirely or if the answer explicitly admits ignorance.
+
+        You must ONLY return the final score, with no commentary or explanation.
+
+        # User Question:
+        {{ input }}
+
+        # Reference Answer:
+        {{ ideal }}
+
+        # Generated Answer:
+        {{ actual }}
+
       openai_response_format:
         type: json_schema
         json_schema:
@@ -50,12 +85,14 @@ models:
             type: object
             properties:
               score:
+                description: >
+                  The score assigned to the actual answer based on the evaluation criteria.
                 type: number
                 format: float
             additionalProperties: true
             required:
               - score
-        strict: false
+        strict: true
 
 views:
   - name: financebench.evals
diff --git a/tools/testoperator/src/commands/evals/mod.rs b/tools/testoperator/src/commands/evals/mod.rs
@@ -62,14 +62,16 @@ pub(crate) async fn run(args: &EvalsTestArgs) -> anyhow::Result<()> {
         .send()
         .await?;
 
-    if !response.status().is_success() {
-        return Err(anyhow::anyhow!(
-            "Failed to execute evals: {}",
-            response.text().await?
-        ));
+    let response_status = response.status();
+    let response_msq = response.text().await?;
+
+    if !response_status.is_success() {
+        return Err(anyhow::anyhow!("Failed to execute evals: {response_msq}"));
     }
 
-    println!("Execution completed, retrieving results...");
+    println!("Evals completed:\n{response_msq}");
+
+    println!("Retrieving results...");
 
     let mut flight_client = spiced_instance.flight_client(None).await?;