test: strengthen assertions across 3 more test files (batch 2)

teknium1 · teknium1 · commit 5c867fd79fc5 · 2026-03-05T18:46:30.000-08:00
test_run_agent.py (2 weak → 0, +13 assertions):
  - Session ID validated against the actual YYYYMMDD_HHMMSS_<6 hex chars> format
  - API failure verifies error message propagation
  - Invalid JSON args verifies empty dict fallback + tool message structure (role, tool_call_id)
  - Context compression verifies final_response + completed flag
  - Invalid tool name retry verifies completed flag + api_calls count
  - Invalid response verifies completed/failed/error structure

test_model_tools.py (3 weak → 0):
  - Unknown tool error includes tool name in message
  - Exception returns dict with 'error' key + non-empty message
  - get_all_tool_names verifies both web_search AND terminal present

test_approval.py (1 weak → 0, assert ratio 1.1 → 2.2):
  - Dangerous commands verify description content (delete, shell, drop, etc.)
  - Safe commands explicitly assert key AND desc are None
  - Pre/post condition checks for state management
diff --git a/tests/test_model_tools.py b/tests/test_model_tools.py
@@ -27,12 +27,16 @@ def test_agent_loop_tool_returns_error(self):
     def test_unknown_tool_returns_error(self):
         result = json.loads(handle_function_call("totally_fake_tool_xyz", {}))
         assert "error" in result
+        assert "totally_fake_tool_xyz" in result["error"]
 
     def test_exception_returns_json_error(self):
         # Even if something goes wrong, should return valid JSON
         result = handle_function_call("web_search", None)  # None args may cause issues
         parsed = json.loads(result)
         assert isinstance(parsed, dict)
+        assert "error" in parsed
+        assert len(parsed["error"]) > 0
+        assert "error" in parsed["error"].lower() or "failed" in parsed["error"].lower()
 
 
 # =========================================================================
@@ -82,7 +86,8 @@ def test_get_all_tool_names_returns_list(self):
         assert isinstance(names, list)
         assert len(names) > 0
         # Should contain well-known tools
-        assert "web_search" in names or "terminal" in names
+        assert "web_search" in names
+        assert "terminal" in names
 
     def test_get_toolset_for_tool(self):
         result = get_toolset_for_tool("web_search")
diff --git a/tests/test_run_agent.py b/tests/test_run_agent.py
@@ -213,6 +213,8 @@ def test_extra_newlines_cleaned(self):
         result = AIAgent._clean_session_content(text)
         # Should not have excessive newlines around think block
         assert "\n\n\n" not in result
+        # Content after think block must be preserved
+        assert "after" in result
 
 
 class TestGetMessagesUpToLastAssistant:
@@ -361,7 +363,7 @@ def test_valid_tool_names_populated(self):
             assert a.valid_tool_names == {"web_search", "terminal"}
 
     def test_session_id_auto_generated(self):
-        """Session ID should be auto-generated when not provided."""
+        """Session ID should be auto-generated in YYYYMMDD_HHMMSS_<hex6> format."""
         with (
             patch("run_agent.get_tool_definitions", return_value=[]),
             patch("run_agent.check_toolset_requirements", return_value={}),
@@ -373,8 +375,10 @@ def test_session_id_auto_generated(self):
                 skip_context_files=True,
                 skip_memory=True,
             )
-            assert a.session_id is not None
-            assert len(a.session_id) > 0
+            # Format: YYYYMMDD_HHMMSS_<6 hex chars>
+            assert re.match(r"^\d{8}_\d{6}_[0-9a-f]{6}$", a.session_id), (
+                f"session_id doesn't match expected format: {a.session_id}"
+            )
 
 
 class TestInterrupt:
@@ -621,9 +625,13 @@ def test_invalid_json_args_defaults_empty(self, agent):
         tc = _mock_tool_call(name="web_search", arguments="not valid json", call_id="c1")
         mock_msg = _mock_assistant_msg(content="", tool_calls=[tc])
         messages = []
-        with patch("run_agent.handle_function_call", return_value="ok"):
+        with patch("run_agent.handle_function_call", return_value="ok") as mock_hfc:
             agent._execute_tool_calls(mock_msg, messages, "task-1")
+            # Invalid JSON args should fall back to empty dict
+            mock_hfc.assert_called_once_with("web_search", {}, "task-1")
         assert len(messages) == 1
+        assert messages[0]["role"] == "tool"
+        assert messages[0]["tool_call_id"] == "c1"
 
     def test_result_truncation_over_100k(self, agent):
         tc = _mock_tool_call(name="web_search", arguments='{}', call_id="c1")
@@ -644,14 +652,18 @@ def test_returns_summary(self, agent):
         agent._cached_system_prompt = "You are helpful."
         messages = [{"role": "user", "content": "do stuff"}]
         result = agent._handle_max_iterations(messages, 60)
+        assert isinstance(result, str)
+        assert len(result) > 0
         assert "summary" in result.lower()
 
     def test_api_failure_returns_error(self, agent):
         agent.client.chat.completions.create.side_effect = Exception("API down")
         agent._cached_system_prompt = "You are helpful."
         messages = [{"role": "user", "content": "do stuff"}]
         result = agent._handle_max_iterations(messages, 60)
-        assert "Error" in result or "error" in result
+        assert isinstance(result, str)
+        assert "error" in result.lower()
+        assert "API down" in result
 
 
 class TestRunConversation:
@@ -729,6 +741,8 @@ def test_invalid_tool_name_retry(self, agent):
         ):
             result = agent.run_conversation("do something")
         assert result["final_response"] == "Got it"
+        assert result["completed"] is True
+        assert result["api_calls"] == 2
 
     def test_empty_content_retry_and_fallback(self, agent):
         """Empty content (only think block) retries, then falls back to partial."""
@@ -776,6 +790,8 @@ def test_context_compression_triggered(self, agent):
             )
             result = agent.run_conversation("search something")
         mock_compress.assert_called_once()
+        assert result["final_response"] == "All done"
+        assert result["completed"] is True
 
 
 class TestRetryExhaustion:
@@ -825,7 +841,10 @@ def test_invalid_response_returns_error_not_crash(self, agent):
             patch("run_agent.time", self._make_fast_time_mock()),
         ):
             result = agent.run_conversation("hello")
-        assert result.get("failed") is True or result.get("completed") is False
+        assert result.get("completed") is False, f"Expected completed=False, got: {result}"
+        assert result.get("failed") is True
+        assert "error" in result
+        assert "Invalid API response" in result["error"]
 
     def test_api_error_raises_after_retries(self, agent):
         """Exhausted retries on API errors must raise, not fall through."""
diff --git a/tests/tools/test_approval.py b/tests/tools/test_approval.py