Commit c35f03e

fix(autofix): Some fixes (#1904)
Not going ahead with DeepClaude, but discovered some bugs in converting tool calls between OpenAI, Anthropic, and Gemini, so fixing those. Also adding support for reasoning-related parameters for OpenAI, improving the get_valid_file_paths check that the AI suggested, and fixing the exception logging in get_file_content so we don't get spammed by Sentry. Also changing the index at which we insert the initial prompt in the solution/coding step to prevent some hallucinations we were seeing.
1 parent 43b0c2e commit c35f03e
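
For orientation, a rough sketch of how the new reasoning knob is meant to flow; values are illustrative, RunConfig has more fields than shown, and the agent/provider wiring is simplified (see the agent.py and client.py diffs below):

# Sketch under assumptions: only the RunConfig fields visible in this diff are
# shown, and the agent setup is elided. OpenAI accepts reasoning_effort values
# such as "low" / "medium" / "high"; when it is unset the client falls back to
# openai.NotGiven(), and when it is set the system prompt is sent under the
# "developer" role instead of "system".
config = RunConfig(
    system_prompt="You are a code-fixing agent.",  # illustrative prompt
    temperature=0.0,
    run_name="autofix-coding",                     # illustrative run name
    reasoning_effort="high",                       # new field added in this commit
)
completion = agent.get_completion(config)  # agent: an LlmAgent instance set up elsewhere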

File tree

7 files changed: +52 -38 lines changed

src/seer/automation/agent/agent.py

+2
@@ -35,6 +35,7 @@ class RunConfig(BaseModel):
     memory_storage_key: str | None = None
     temperature: float | None = 0.0
     run_name: str | None = None
+    reasoning_effort: str | None = None


 class LlmAgent:
@@ -62,6 +63,7 @@ def get_completion(self, run_config: RunConfig):
             system_prompt=run_config.system_prompt if run_config.system_prompt else None,
             tools=(self.tools if len(self.tools) > 0 else None),
             temperature=run_config.temperature or 0.0,
+            reasoning_effort=run_config.reasoning_effort,
         )

     def run_iteration(self, run_config: RunConfig):

src/seer/automation/agent/client.py

+26 -11
@@ -109,12 +109,14 @@ def generate_text(
         max_tokens: int | None = None,
         timeout: float | None = None,
         predicted_output: str | None = None,
+        reasoning_effort: str | None = None,
     ):
         message_dicts, tool_dicts = self._prep_message_and_tools(
             messages=messages,
             prompt=prompt,
             system_prompt=system_prompt,
             tools=tools,
+            reasoning_effort=reasoning_effort,
         )

         openai_client = self.get_client()
@@ -138,6 +140,7 @@ def generate_text(
                 if predicted_output
                 else openai.NotGiven()
             ),
+            reasoning_effort=reasoning_effort if reasoning_effort else openai.NotGiven(),
         )

         openai_message = completion.choices[0].message
@@ -183,12 +186,14 @@ def generate_structured(
         response_format: Type[StructuredOutputType],
         max_tokens: int | None = None,
         timeout: float | None = None,
+        reasoning_effort: str | None = None,
     ) -> LlmGenerateStructuredResponse[StructuredOutputType]:
         message_dicts, tool_dicts = self._prep_message_and_tools(
             messages=messages,
             prompt=prompt,
             system_prompt=system_prompt,
             tools=tools,
+            reasoning_effort=reasoning_effort,
         )

         openai_client = self.get_client()
@@ -205,6 +210,7 @@ def generate_structured(
             response_format=response_format,
             max_tokens=max_tokens or openai.NotGiven(),
             timeout=timeout or openai.NotGiven(),
+            reasoning_effort=reasoning_effort if reasoning_effort else openai.NotGiven(),
         )

         openai_message = completion.choices[0].message
@@ -244,6 +250,7 @@ def to_message_dict(message: Message) -> ChatCompletionMessageParam:
                 new_item["type"] = "function"
                 parsed_tool_calls.append(new_item)
             message_dict["tool_calls"] = parsed_tool_calls
+            message_dict["role"] = "assistant"

         if message.tool_call_id:
             message_dict["tool_call_id"] = message.tool_call_id
@@ -284,11 +291,18 @@ def _prep_message_and_tools(
         prompt: str | None = None,
         system_prompt: str | None = None,
         tools: list[FunctionTool] | None = None,
+        reasoning_effort: str | None = None,
     ):
         message_dicts = [cls.to_message_dict(message) for message in messages] if messages else []
         if system_prompt:
             message_dicts.insert(
-                0, cls.to_message_dict(Message(role="system", content=system_prompt))
+                0,
+                cls.to_message_dict(
+                    Message(
+                        role="system" if not reasoning_effort else "developer",
+                        content=system_prompt,
+                    )
+                ),
             )
         if prompt:
             message_dicts.append(cls.to_message_dict(Message(role="user", content=prompt)))
@@ -310,12 +324,14 @@ def generate_text_stream(
         temperature: float | None = None,
         max_tokens: int | None = None,
         timeout: float | None = None,
+        reasoning_effort: str | None = None,
     ) -> Iterator[str | ToolCall | Usage]:
         message_dicts, tool_dicts = self._prep_message_and_tools(
             messages=messages,
             prompt=prompt,
             system_prompt=system_prompt,
             tools=tools,
+            reasoning_effort=reasoning_effort,
         )

         openai_client = self.get_client()
@@ -333,6 +349,7 @@ def generate_text_stream(
             timeout=timeout or openai.NotGiven(),
             stream=True,
             stream_options={"include_usage": True},
+            reasoning_effort=reasoning_effort if reasoning_effort else openai.NotGiven(),
         )

         try:
@@ -515,7 +532,7 @@ def to_message_param(message: Message) -> MessageParam:
                     )
                 ],
             )
-        elif message.role == "tool_use":
+        elif message.role == "tool_use" or (message.role == "assistant" and message.tool_calls):
             if not message.tool_calls:
                 return MessageParam(role="assistant", content=[])
             tool_call = message.tool_calls[0]  # Assuming only one tool call per message
@@ -679,14 +696,6 @@ def construct_message_from_stream(

 @dataclass
 class GeminiProvider:
-    # !!! NOTE THE FOLLOWING LIMITATIONS FOR GEMINI:
-    # - super strict rate limits making it unusable for evals or prod
-    # - no multi-turn tool use
-    # - no nested Pydantic models for structured outputs
-    # - no nullable fields for structured outputs
-    # - no dynamic retrieval for google search
-    # These will likely be changed as the SDK matures. Make sure to keep an eye on updates and update these notes/our implementation as needed.
-
     model_name: str
     provider_name = LlmProviderType.GEMINI
     defaults: LlmProviderDefaults | None = None
@@ -985,7 +994,7 @@ def _prep_message_and_tools(

     @staticmethod
     def to_content(message: Message) -> Content:
-        if message.role == "tool_use":
+        if message.role == "tool_use" or (message.role == "assistant" and message.tool_calls):
             if not message.tool_calls:
                 return Content(
                     role="model",
@@ -1117,6 +1126,7 @@ def generate_text(
         run_name: str | None = None,
         timeout: float | None = None,
         predicted_output: str | None = None,
+        reasoning_effort: str | None = None,
     ) -> LlmGenerateTextResponse:
         try:
             if run_name:
@@ -1140,6 +1150,7 @@ def generate_text(
                     tools=tools,
                     timeout=timeout,
                     predicted_output=predicted_output,
+                    reasoning_effort=reasoning_effort,
                 )
             elif model.provider_name == LlmProviderType.ANTHROPIC:
                 model = cast(AnthropicProvider, model)
@@ -1182,6 +1193,7 @@ def generate_structured(
         max_tokens: int | None = None,
         run_name: str | None = None,
         timeout: float | None = None,
+        reasoning_effort: str | None = None,
     ) -> LlmGenerateStructuredResponse[StructuredOutputType]:
         try:
             if run_name:
@@ -1203,6 +1215,7 @@ def generate_structured(
                     temperature=temperature,
                     tools=tools,
                     timeout=timeout,
+                    reasoning_effort=reasoning_effort,
                 )
             elif model.provider_name == LlmProviderType.ANTHROPIC:
                 raise NotImplementedError("Anthropic structured outputs are not yet supported")
@@ -1236,6 +1249,7 @@ def generate_text_stream(
         max_tokens: int | None = None,
         run_name: str | None = None,
         timeout: float | None = None,
+        reasoning_effort: str | None = None,
     ) -> Iterator[str | ToolCall | Usage]:
         try:
             if run_name:
@@ -1260,6 +1274,7 @@ def generate_text_stream(
                     temperature=temperature or default_temperature,
                     tools=tools,
                     timeout=timeout,
+                    reasoning_effort=reasoning_effort,
                 )
             elif model.provider_name == LlmProviderType.ANTHROPIC:
                 model = cast(AnthropicProvider, model)
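
The to_message_param and to_content changes above are the tool-call conversion fix called out in the commit message: an assistant message that carries tool_calls previously fell through to the plain-content branch when converted for Anthropic or Gemini. Below is a minimal standalone sketch of the idea using simplified stand-in types, not the repo's actual Message and MessageParam models:

from dataclasses import dataclass, field

# Simplified stand-ins for the repo's models (assumption: the real ones are
# richer Pydantic classes with additional fields).
@dataclass
class ToolCall:
    id: str
    function: str
    args: dict

@dataclass
class Message:
    role: str
    content: str | None = None
    tool_calls: list[ToolCall] = field(default_factory=list)

def to_anthropic_param(message: Message) -> dict:
    # Treat an assistant message that carries tool calls like an explicit
    # "tool_use" message, so the call is converted instead of being dropped.
    if message.role == "tool_use" or (message.role == "assistant" and message.tool_calls):
        tool_call = message.tool_calls[0]  # assuming one tool call per message
        return {
            "role": "assistant",
            "content": [
                {
                    "type": "tool_use",
                    "id": tool_call.id,
                    "name": tool_call.function,
                    "input": tool_call.args,
                }
            ],
        }
    return {"role": message.role, "content": message.content or ""}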

src/seer/automation/autofix/autofix_agent.py

+1
@@ -84,6 +84,7 @@ def _get_completion(self, run_config: RunConfig):
             system_prompt=run_config.system_prompt if run_config.system_prompt else None,
             tools=(self.tools if len(self.tools) > 0 else None),
             temperature=run_config.temperature or 0.0,
+            reasoning_effort=run_config.reasoning_effort,
         )

         cleared = False

src/seer/automation/autofix/components/coding/component.py

+5 -9
@@ -223,15 +223,11 @@ def invoke(self, request: CodingRequest) -> CodingOutput | None:
         custom_solution = request.solution if isinstance(request.solution, str) else None

         if not request.initial_memory:
-            agent.memory.insert(
-                0,
-                Message(
-                    role="user",
-                    content=CodingPrompts.format_fix_msg(
-                        has_tools=not is_obvious,
-                        custom_solution=custom_solution,
-                        mode=request.mode,
-                    ),
+            agent.add_user_message(
+                CodingPrompts.format_fix_msg(
+                    has_tools=not is_obvious,
+                    custom_solution=custom_solution,
+                    mode=request.mode,
                 ),
             )

src/seer/automation/autofix/components/solution/component.py

+9 -13
@@ -142,19 +142,15 @@ def invoke(

         state = self.context.state.get()
         if not request.initial_memory:
-            agent.memory.insert(
-                0,
-                Message(
-                    role="user",
-                    content=SolutionPrompts.format_default_msg(
-                        event=request.event_details.format_event(),
-                        root_cause=request.root_cause_and_fix,
-                        summary=request.summary,
-                        repo_names=[repo.full_name for repo in state.request.repos],
-                        original_instruction=request.original_instruction,
-                        code_map=request.profile,
-                        has_tools=not is_obvious,
-                    ),
+            agent.add_user_message(
+                SolutionPrompts.format_default_msg(
+                    event=request.event_details.format_event(),
+                    root_cause=request.root_cause_and_fix,
+                    summary=request.summary,
+                    repo_names=[repo.full_name for repo in state.request.repos],
+                    original_instruction=request.original_instruction,
+                    code_map=request.profile,
+                    has_tools=not is_obvious,
                 ),
             )
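
Both this component and the coding component above replace agent.memory.insert(0, ...) with agent.add_user_message(...). The helper's body is not part of this diff; the sketch below only illustrates its assumed behavior (appending the instruction at the end of memory rather than placing it before existing context), which is the index change the commit message credits with reducing hallucinations:

# Hypothetical sketch of what add_user_message is assumed to do; the actual
# helper lives on the agent class and is not shown in this commit.
def add_user_message(self, content: str) -> None:
    # Append at the end of memory so the instruction follows existing context
    # instead of preceding it.
    self.memory.append(Message(role="user", content=content))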

src/seer/automation/codebase/repo_client.py

+5 -4
@@ -333,7 +333,9 @@ def get_file_content(
                 )
                 path = closest_match
             else:
-                logger.error(f"No matching file found for path: {path}")
+                logger.exception(
+                    "No matching file found for provided file path", extra={"path": path}
+                )
                 return None, "utf-8"

         try:
@@ -366,9 +368,8 @@ def get_valid_file_paths(self, sha: str | None = None, files_only=False) -> set[
         valid_file_paths: set[str] = set()

         for file in tree.tree:
-            if files_only and "." not in file.path:
-                continue
-            valid_file_paths.add(file.path)
+            if file.type == "blob":
+                valid_file_paths.add(file.path)

         return valid_file_paths
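
The old get_valid_file_paths heuristic treated any path containing a dot as a file; the new check keys off the git tree entry type instead. A rough standalone illustration of the same idea with PyGithub follows; the token/repo wiring here is a placeholder, not the RepoClient's actual setup:

from github import Github  # PyGithub

def list_file_paths(token: str, full_name: str, sha: str) -> set[str]:
    # Placeholder wiring; the real RepoClient handles auth and caching elsewhere.
    repo = Github(token).get_repo(full_name)
    tree = repo.get_git_tree(sha, recursive=True)
    # "blob" entries are files and "tree" entries are directories, so filtering
    # on the entry type replaces the old "path contains a dot" heuristic.
    return {entry.path for entry in tree.tree if entry.type == "blob"}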

tests/automation/codebase/test_repo_client.py

+4 -1
@@ -137,7 +137,10 @@ def test_fail_get_file_content(self, mock_requests, repo_client, mock_github):

     def test_get_valid_file_paths(self, repo_client, mock_github):
         mock_tree = MagicMock()
-        mock_tree.tree = [MagicMock(path="file1.py"), MagicMock(path="file2.py")]
+        mock_tree.tree = [
+            MagicMock(path="file1.py", type="blob"),
+            MagicMock(path="file2.py", type="blob"),
+        ]
         mock_tree.raw_data = {"truncated": False}
         mock_github.get_repo.return_value.get_git_tree.return_value = mock_tree
