feat(autofix-evals): o3-mini for evals + rca eval scorer prompt (#1869)
o3-mini for scoring, plus a new prompt for root cause scoring that should fix issues
we were seeing, such as:
- Mistaking the expected root cause for the predicted one
- Not being critical enough of the root cause
- Being _too strict_

Also adds a new metric for how helpful the predicted root cause is (a minimal sketch of the new scoring flow follows the changed-files summary below).

![CleanShot 2025-02-04 at 15 40 06](https://github.com/user-attachments/assets/5e138b3e-eb04-469d-bdd6-9380a33caabe)
jennmueng authored Feb 4, 2025
1 parent 41b98bf commit d31fb5a
Showing 4 changed files with 60 additions and 21 deletions.
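
Before the diffs, here is a minimal, hypothetical sketch of the scoring flow this commit implements: the o3-mini scorer is prompted to answer inside `<score>`, `<verdict>`, and the new `<helpful>` tags, and each panel iteration is reduced to a `(score, verdict, helpful)` tuple. The `extract_tag` helper below is an illustrative stand-in for seer's `extract_text_inside_tags`, not its actual implementation.

```python
import re


def extract_tag(text: str, tag: str) -> str | None:
    # Illustrative stand-in for seer's extract_text_inside_tags helper.
    match = re.search(rf"<{tag}>(.*?)</{tag}>", text, re.DOTALL)
    return match.group(1).strip() if match else None


def parse_scorer_response(content: str) -> tuple[float, bool, bool]:
    # Mirrors the tuple shape score_root_cause_single_it returns after this commit:
    # a float score plus two booleans (correctness verdict, helpfulness).
    score_str = extract_tag(content, "score")
    score = float(score_str) if score_str else 0.0
    verdict = (extract_tag(content, "verdict") or "False").lower() == "true"
    helpful = (extract_tag(content, "helpful") or "False").lower() == "true"
    return score, verdict, helpful


# Example response in the format the new prompt asks the scorer to produce.
example = (
    "<reasoning>The predicted cause matches the true cause.</reasoning>"
    "<score>0.8</score><verdict>True</verdict><helpful>True</helpful>"
)
print(parse_scorer_response(example))  # (0.8, True, True)
```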
4 changes: 4 additions & 0 deletions src/seer/automation/agent/client.py
@@ -67,6 +67,10 @@ class OpenAiProvider:
match=r"^o1-preview.*",
defaults=LlmProviderDefaults(temperature=1.0),
),
LlmModelDefaultConfig(
match=r"^o3-mini.*",
defaults=LlmProviderDefaults(temperature=1.0),
),
LlmModelDefaultConfig(
match=r".*",
defaults=LlmProviderDefaults(temperature=0.0),
38 changes: 26 additions & 12 deletions src/seer/automation/autofix/evaluations.py
@@ -306,7 +306,7 @@ def score_fix_single_it(
@observe(name="Score root cause iteration")
def score_root_cause_single_it(
dataset_item: DatasetItemClient, causes: list[RootCauseAnalysisItem], model: str
) -> tuple[float, bool]:
) -> tuple[float, bool, bool]:
if not dataset_item.expected_output:
raise ValueError("Expected output is missing from dataset item")

@@ -325,23 +325,32 @@ def score_root_cause_single_it(
Given the above issue, we know the correct root cause of the issue is:
<true_root_cause>
{expected_output}
The model outputted the following possible root causes and solutions:
The solution to this root cause is:
{expected_solution_description}
{expected_solution_diff}
</true_root_cause>
We have an AI model say that the root cause of the issue is:
<predicted_solutions>
<predicted_root_cause>
{predicted_solution}
</predicted_solutions>
</predicted_root_cause>
Score how well the predicted root cause and solution matches the expected root cause and solution with a float score from 0 to 1, where 1 means the solution fully fixes the issue and 0 means the solution does not fix the issue at all.
- The model will return multiple predicted root causes and solutions, ordered from most likely to least likely.
Score how well the AI model's predicted root cause aligns with the true known root cause with a float score from 0 to 1, where 1 means the predicted root cause is the correct root cause and 0 means the predicted root cause is completely incorrect.
Think step-by-step inside a <thoughts> tag before giving scores.
Score each solution inside a <score> tag, such as <score>0.5</score>.
Also, return your verdict of whether the predicted solution is the correct root cause of the issue with a boolean value inside a <verdict> tag, such as <verdict>True</verdict> or <verdict>False</verdict>."""
Provide your reasoning of why you gave this score inside a <reasoning> tag.
Place the score inside a <score> tag, such as <score>0.5</score>.
Also, return your verdict of whether the predicted root cause accurately represents the true root cause of the issue with a boolean value inside a <verdict> tag, such as <verdict>True</verdict> or <verdict>False</verdict>.
You should also grade whether the model's predicted root cause would be helpful to the developer in fixing the issue with a boolean value inside a <helpful> tag, such as <helpful>True</helpful> or <helpful>False</helpful>."""
).format(
event_details=event_details.format_event(),
expected_output=root_cause_expected_str,
expected_solution_description=expected_output["solution_diff"]["description"],
expected_solution_diff=expected_output["solution_diff"]["unified_diff"],
predicted_solution=cause_xml.to_prompt_str(),
)
response = LlmClient().generate_text(
@@ -357,7 +366,10 @@ def score_root_cause_single_it(
verdict_str = extract_text_inside_tags(response.message.content, "verdict")
verdict_bool = (verdict_str or "False").lower() == "true"

return score, verdict_bool
helpful_str = extract_text_inside_tags(response.message.content, "helpful")
helpful_bool = (helpful_str or "False").lower() == "true"

return score, verdict_bool, helpful_bool


@observe(name="Score one")
@@ -377,15 +389,17 @@ def score_one(
@observe(name="Score root cause")
def score_root_causes(
dataset_item: DatasetItemClient, causes: list[RootCauseAnalysisItem], n_panel: int, model: str
) -> tuple[float, bool]:
) -> tuple[float, bool, bool]:
results = [score_root_cause_single_it(dataset_item, causes, model) for _ in range(n_panel)]

mean_score = round(sum([result[0] for result in results]) / len(results), 2)

# If at least half of the panel says the fix is correct, then the fix is correct.
verdict = sum(1 for result in results if result[1]) >= len(results) / 2

return mean_score, verdict
helpful = sum(1 for result in results if result[2]) >= len(results) / 2

return mean_score, verdict, helpful


def make_score_name(model: str, n_panel: int, name: str) -> str:
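
For reference, a small worked example of the panel aggregation that `score_root_causes` performs with the new helpful flag, assuming a hypothetical three-member panel; because the threshold is at least half the panel, a tie counts as True.

```python
# Hypothetical (score, verdict, helpful) tuples from three panel iterations.
results = [(0.8, True, True), (0.7, True, False), (0.9, False, True)]

mean_score = round(sum(r[0] for r in results) / len(results), 2)  # 0.8
verdict = sum(1 for r in results if r[1]) >= len(results) / 2     # 2 of 3 -> True
helpful = sum(1 for r in results if r[2]) >= len(results) / 2     # 2 of 3 -> True

print((mean_score, verdict, helpful))  # (0.8, True, True)
```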
11 changes: 9 additions & 2 deletions src/seer/automation/autofix/tasks.py
@@ -654,7 +654,7 @@ def run_autofix_evaluation_on_item(
)

scoring_n_panel = 5
scoring_model = "o1-mini-2024-09-12"
scoring_model = "o3-mini-2025-01-31"

diff: str | None = None
causes: list[RootCauseAnalysisItem] | None = None
@@ -802,7 +802,7 @@ def run_autofix_evaluation_on_item(
)

if causes:
root_cause_score, root_cause_verdict = score_root_causes(
root_cause_score, root_cause_verdict, root_cause_helpful = score_root_causes(
dataset_item,
causes,
n_panel=scoring_n_panel,
@@ -821,6 +821,13 @@ def run_autofix_evaluation_on_item(
),
value=1 if root_cause_verdict else 0,
)
langfuse.score(
trace_id=trace_id,
name=make_score_name(
model=scoring_model, n_panel=scoring_n_panel, name="rc_is_helpful"
),
value=1 if root_cause_helpful else 0,
)
langfuse.score(
trace_id=trace_id,
name=make_score_name(
28 changes: 21 additions & 7 deletions tests/automation/autofix/test_autofix_evaluations.py
@@ -343,6 +343,10 @@ def mock_dataset_item(self):
"root_cause": "Expected root cause",
"solution_summary": "Expected solution summary",
"diff": {"file_path": "test.py", "code_diff": "expected diff"},
"solution_diff": {
"description": "expected solution diff",
"unified_diff": "expected solution diff",
},
}
mock_item.input = {
"request": AutofixRequest(
@@ -368,7 +372,10 @@ def test_score_root_cause_single_it(self, mock_llm_client, mock_dataset_item):
mock_llm_instance = Mock()
mock_llm_client.return_value = mock_llm_instance
mock_llm_instance.generate_text.return_value = LlmGenerateTextResponse(
message=Message(role="assistant", content="<score>0.8</score><verdict>True</verdict>"),
message=Message(
role="assistant",
content="<score>0.8</score><verdict>True</verdict><helpful>True</helpful>",
),
metadata=LlmResponseMetadata(
model="test_model",
provider_name=LlmProviderType.OPENAI,
@@ -396,7 +403,7 @@ def test_score_root_cause_single_it(self, mock_llm_client, mock_dataset_item):

result = score_root_cause_single_it(mock_dataset_item, causes, model="test_model")

assert result == (0.8, True)
assert result == (0.8, True, True)
mock_llm_instance.generate_text.assert_called_once()

@patch("seer.automation.autofix.evaluations.LlmClient")
@@ -432,7 +439,7 @@ def test_score_root_cause_single_it_no_score(self, mock_llm_client, mock_dataset

result = score_root_cause_single_it(mock_dataset_item, causes, model="test_model")

assert result == (0, False)
assert result == (0, False, False)

def test_score_root_cause_single_it_missing_expected_output(self, mock_dataset_item):
mock_dataset_item.expected_output = None
@@ -466,7 +473,11 @@ def mock_dataset_item(self):

@patch("seer.automation.autofix.evaluations.score_root_cause_single_it")
def test_score_root_causes(self, mock_score_root_cause_single_it, mock_dataset_item):
mock_score_root_cause_single_it.side_effect = [(0.8, True), (0.7, True), (0.9, True)]
mock_score_root_cause_single_it.side_effect = [
(0.8, True, True),
(0.7, True, True),
(0.9, True, True),
]

causes = [
RootCauseAnalysisItem(
@@ -488,15 +499,18 @@ def test_score_root_causes(self, mock_score_root_cause_single_it, mock_dataset_i

result = score_root_causes(mock_dataset_item, causes, n_panel=3, model="test_model")

assert result == (0.8, True) # Average score and majority verdict
assert result == (0.8, True, True) # Average score and majority verdict
assert mock_score_root_cause_single_it.call_count == 3

@patch("seer.automation.autofix.evaluations.score_root_cause_single_it")
def test_score_root_causes_custom_n_panel(
self, mock_score_root_cause_single_it, mock_dataset_item
):
# Changed side effect to have majority True (2 True, 0 False)
mock_score_root_cause_single_it.side_effect = [(0.6, True), (0.8, True)]
mock_score_root_cause_single_it.side_effect = [
(0.6, True, True),
(0.8, True, True),
]

causes = [
RootCauseAnalysisItem(
Expand All @@ -518,7 +532,7 @@ def test_score_root_causes_custom_n_panel(

result = score_root_causes(mock_dataset_item, causes, n_panel=2, model="test_model")

assert result == (0.7, True) # Average score and majority verdict (2 True > 0 False)
assert result == (0.7, True, True) # Average score and majority verdict (2 True > 0 False)
assert mock_score_root_cause_single_it.call_count == 2

