feat(autofix-evals): o3-mini for evals + rca eval scorer prompt (#1869)
o3-mini for scoring, plus a new prompt for root cause scoring that should fix issues
we were seeing, such as:
- Mistaking the expected root cause for the predicted one
- Not being critical enough of the root cause
- Being _too strict_

Also adds a new metric for how helpful the predicted root cause is (a minimal sketch of the new scoring flow follows the changed-files summary below).

![CleanShot 2025-02-04 at 15 40 06](https://github.com/user-attachments/assets/5e138b3e-eb04-469d-bdd6-9380a33caabe)
jennmueng authored Feb 4, 2025
1 parent 41b98bf commit d31fb5a
Showing 4 changed files with 60 additions and 21 deletions.
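
Before the diffs, here is a minimal, hypothetical sketch of the scoring flow this commit implements: the o3-mini scorer is prompted to answer inside `<score>`, `<verdict>`, and the new `<helpful>` tags, and each panel iteration is reduced to a `(score, verdict, helpful)` tuple. The `extract_tag` helper below is an illustrative stand-in for seer's `extract_text_inside_tags`, not its actual implementation.

```python
import re


def extract_tag(text: str, tag: str) -> str | None:
    # Illustrative stand-in for seer's extract_text_inside_tags helper.
    match = re.search(rf"<{tag}>(.*?)</{tag}>", text, re.DOTALL)
    return match.group(1).strip() if match else None


def parse_scorer_response(content: str) -> tuple[float, bool, bool]:
    # Mirrors the tuple shape score_root_cause_single_it returns after this commit:
    # a float score plus two booleans (correctness verdict, helpfulness).
    score_str = extract_tag(content, "score")
    score = float(score_str) if score_str else 0.0
    verdict = (extract_tag(content, "verdict") or "False").lower() == "true"
    helpful = (extract_tag(content, "helpful") or "False").lower() == "true"
    return score, verdict, helpful


# Example response in the format the new prompt asks the scorer to produce.
example = (
    "<reasoning>The predicted cause matches the true cause.</reasoning>"
    "<score>0.8</score><verdict>True</verdict><helpful>True</helpful>"
)
print(parse_scorer_response(example))  # (0.8, True, True)
```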
4 changes: 4 additions & 0 deletions src/seer/automation/agent/client.py
@@ -67,6 +67,10 @@ class OpenAiProvider:
match=r"^o1-preview.*",
defaults=LlmProviderDefaults(temperature=1.0),
),
LlmModelDefaultConfig(
match=r"^o3-mini.*",
defaults=LlmProviderDefaults(temperature=1.0),
),
LlmModelDefaultConfig(
match=r".*",
defaults=LlmProviderDefaults(temperature=0.0),
38 changes: 26 additions & 12 deletions src/seer/automation/autofix/evaluations.py
@@ -306,7 +306,7 @@ def score_fix_single_it(
@observe(name="Score root cause iteration")
def score_root_cause_single_it(
dataset_item: DatasetItemClient, causes: list[RootCauseAnalysisItem], model: str
) -> tuple[float, bool]:
) -> tuple[float, bool, bool]:
if not dataset_item.expected_output:
raise ValueError("Expected output is missing from dataset item")

@@ -325,23 +325,32 @@ def score_root_cause_single_it(
Given the above issue, we know the correct root cause of the issue is:
<true_root_cause>
{expected_output}
The model outputted the following possible root causes and solutions:
The solution to this root cause is:
{expected_solution_description}
{expected_solution_diff}
</true_root_cause>
We have an AI model say that the root cause of the issue is:
<predicted_solutions>
<predicted_root_cause>
{predicted_solution}
</predicted_solutions>
</predicted_root_cause>
Score how well the predicted root cause and solution matches the expected root cause and solution with a float score from 0 to 1, where 1 means the solution fully fixes the issue and 0 means the solution does not fix the issue at all.
- The model will return multiple predicted root causes and solutions, ordered from most likely to least likely.
Score how well the AI model's predicted root cause aligns with the true known root cause with a float score from 0 to 1, where 1 means the predicted root cause is the correct root cause and 0 means the predicted root cause is completely incorrect.
Think step-by-step inside a <thoughts> tag before giving scores.
Score each solution inside a <score> tag, such as <score>0.5</score>.
Also, return your verdict of whether the predicted solution is the correct root cause of the issue with a boolean value inside a <verdict> tag, such as <verdict>True</verdict> or <verdict>False</verdict>."""
Provide your reasoning of why you gave this score inside a <reasoning> tag.
Place the score inside a <score> tag, such as <score>0.5</score>.
Also, return your verdict of whether the predicted root cause accurately represents the true root cause of the issue with a boolean value inside a <verdict> tag, such as <verdict>True</verdict> or <verdict>False</verdict>.
You should also grade whether the model's predicted root cause would be helpful to the developer in fixing the issue with a boolean value inside a <helpful> tag, such as <helpful>True</helpful> or <helpful>False</helpful>."""
).format(
event_details=event_details.format_event(),
expected_output=root_cause_expected_str,
expected_solution_description=expected_output["solution_diff"]["description"],
expected_solution_diff=expected_output["solution_diff"]["unified_diff"],
predicted_solution=cause_xml.to_prompt_str(),
)
response = LlmClient().generate_text(
@@ -357,7 +366,10 @@ def score_root_cause_single_it(
verdict_str = extract_text_inside_tags(response.message.content, "verdict")
verdict_bool = (verdict_str or "False").lower() == "true"

return score, verdict_bool
helpful_str = extract_text_inside_tags(response.message.content, "helpful")
helpful_bool = (helpful_str or "False").lower() == "true"

return score, verdict_bool, helpful_bool


@observe(name="Score one")
@@ -377,15 +389,17 @@ def score_one(
@observe(name="Score root cause")
def score_root_causes(
dataset_item: DatasetItemClient, causes: list[RootCauseAnalysisItem], n_panel: int, model: str
) -> tuple[float, bool]:
) -> tuple[float, bool, bool]:
results = [score_root_cause_single_it(dataset_item, causes, model) for _ in range(n_panel)]

mean_score = round(sum([result[0] for result in results]) / len(results), 2)

# If at least half of the panel says the fix is correct, then the fix is correct.
verdict = sum(1 for result in results if result[1]) >= len(results) / 2

return mean_score, verdict
helpful = sum(1 for result in results if result[2]) >= len(results) / 2

return mean_score, verdict, helpful


def make_score_name(model: str, n_panel: int, name: str) -> str:
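
For reference, a small worked example of the panel aggregation that `score_root_causes` performs with the new helpful flag, assuming a hypothetical three-member panel; because the threshold is at least half the panel, a tie counts as True.

```python
# Hypothetical (score, verdict, helpful) tuples from three panel iterations.
results = [(0.8, True, True), (0.7, True, False), (0.9, False, True)]

mean_score = round(sum(r[0] for r in results) / len(results), 2)  # 0.8
verdict = sum(1 for r in results if r[1]) >= len(results) / 2     # 2 of 3 -> True
helpful = sum(1 for r in results if r[2]) >= len(results) / 2     # 2 of 3 -> True

print((mean_score, verdict, helpful))  # (0.8, True, True)
```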
11 changes: 9 additions & 2 deletions src/seer/automation/autofix/tasks.py
@@ -654,7 +654,7 @@ def run_autofix_evaluation_on_item(
)

scoring_n_panel = 5
scoring_model = "o1-mini-2024-09-12"
scoring_model = "o3-mini-2025-01-31"

diff: str | None = None
causes: list[RootCauseAnalysisItem] | None = None
@@ -802,7 +802,7 @@ def run_autofix_evaluation_on_item(
)

if causes:
root_cause_score, root_cause_verdict = score_root_causes(
root_cause_score, root_cause_verdict, root_cause_helpful = score_root_causes(
dataset_item,
causes,
n_panel=scoring_n_panel,
@@ -821,6 +821,13 @@ def run_autofix_evaluation_on_item(
),
value=1 if root_cause_verdict else 0,
)
langfuse.score(
trace_id=trace_id,
name=make_score_name(
model=scoring_model, n_panel=scoring_n_panel, name="rc_is_helpful"
),
value=1 if root_cause_helpful else 0,
)
langfuse.score(
trace_id=trace_id,
name=make_score_name(
28 changes: 21 additions & 7 deletions tests/automation/autofix/test_autofix_evaluations.py
@@ -343,6 +343,10 @@ def mock_dataset_item(self):
"root_cause": "Expected root cause",
"solution_summary": "Expected solution summary",
"diff": {"file_path": "test.py", "code_diff": "expected diff"},
"solution_diff": {
"description": "expected solution diff",
"unified_diff": "expected solution diff",
},
}
mock_item.input = {
"request": AutofixRequest(
@@ -368,7 +372,10 @@ def test_score_root_cause_single_it(self, mock_llm_client, mock_dataset_item):
mock_llm_instance = Mock()
mock_llm_client.return_value = mock_llm_instance
mock_llm_instance.generate_text.return_value = LlmGenerateTextResponse(
message=Message(role="assistant", content="<score>0.8</score><verdict>True</verdict>"),
message=Message(
role="assistant",
content="<score>0.8</score><verdict>True</verdict><helpful>True</helpful>",
),
metadata=LlmResponseMetadata(
model="test_model",
provider_name=LlmProviderType.OPENAI,
@@ -396,7 +403,7 @@ def test_score_root_cause_single_it(self, mock_llm_client, mock_dataset_item):

result = score_root_cause_single_it(mock_dataset_item, causes, model="test_model")

assert result == (0.8, True)
assert result == (0.8, True, True)
mock_llm_instance.generate_text.assert_called_once()

@patch("seer.automation.autofix.evaluations.LlmClient")
@@ -432,7 +439,7 @@ def test_score_root_cause_single_it_no_score(self, mock_llm_client, mock_dataset

result = score_root_cause_single_it(mock_dataset_item, causes, model="test_model")

assert result == (0, False)
assert result == (0, False, False)

def test_score_root_cause_single_it_missing_expected_output(self, mock_dataset_item):
mock_dataset_item.expected_output = None
@@ -466,7 +473,11 @@ def mock_dataset_item(self):

@patch("seer.automation.autofix.evaluations.score_root_cause_single_it")
def test_score_root_causes(self, mock_score_root_cause_single_it, mock_dataset_item):
mock_score_root_cause_single_it.side_effect = [(0.8, True), (0.7, True), (0.9, True)]
mock_score_root_cause_single_it.side_effect = [
(0.8, True, True),
(0.7, True, True),
(0.9, True, True),
]

causes = [
RootCauseAnalysisItem(
@@ -488,15 +499,18 @@ def test_score_root_causes(self, mock_score_root_cause_single_it, mock_dataset_i

result = score_root_causes(mock_dataset_item, causes, n_panel=3, model="test_model")

assert result == (0.8, True) # Average score and majority verdict
assert result == (0.8, True, True) # Average score and majority verdict
assert mock_score_root_cause_single_it.call_count == 3

@patch("seer.automation.autofix.evaluations.score_root_cause_single_it")
def test_score_root_causes_custom_n_panel(
self, mock_score_root_cause_single_it, mock_dataset_item
):
# Changed side effect to have majority True (2 True, 0 False)
mock_score_root_cause_single_it.side_effect = [(0.6, True), (0.8, True)]
mock_score_root_cause_single_it.side_effect = [
(0.6, True, True),
(0.8, True, True),
]

causes = [
RootCauseAnalysisItem(
Expand All @@ -518,7 +532,7 @@ def test_score_root_causes_custom_n_panel(

result = score_root_causes(mock_dataset_item, causes, n_panel=2, model="test_model")

assert result == (0.7, True) # Average score and majority verdict (2 True > 0 False)
assert result == (0.7, True, True) # Average score and majority verdict (2 True > 0 False)
assert mock_score_root_cause_single_it.call_count == 2

