# Source: AI-BOB/new_tests.py at master · gcp64/AI-BOB · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
import os
import sys
import unittest
import time
import json
import re

# Add root folder to path
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

from base_tool import BaseTool, register_tool, TOOL_REGISTRY
from nlp_engine import ArabicNLPEngine
from system_manager import SystemManager
from llm_agent import LLMAgent
from thinking_engine import ThinkingEngine


# Test fixture: a minimal BaseTool subclass whose `execute` signature carries
# one required parameter and two defaulted ones, covering the str / bool / int
# annotations that test_type_annotated_schema_generation checks in
# `tool.parameters`.
class CustomTestTool(BaseTool):
    """A mock tool used to verify dynamic type-annotated parameters schema auto-generation."""
    # Identifier and human-readable (Arabic) description of the tool.
    # NOTE(review): presumably consumed by BaseTool / the tool registry — confirm.
    name = "CUSTOM_TEST_TOOL"
    description = "أداة تجريبية لفحص الإنشاء التلقائي للمخطط المعلمي."

    # The signature and the docstring below are test data, not just documentation:
    # the schema test asserts the generated types ("string", "boolean", "integer"),
    # the `required` list (only `folder_name`, the sole parameter without a default)
    # and Arabic substrings of the per-parameter descriptions. The docstring is
    # therefore intentionally left in Arabic and must stay in the
    # "<param_name>: <description>" line format.
    #   folder_name    -> name of the requested folder
    #   delete_on_exit -> whether files should be deleted on exit
    #   max_files      -> maximum number of files allowed
    def execute(self, folder_name: str, delete_on_exit: bool = False, max_files: int = 100):
        """
        أداة تجريبية.
        folder_name: اسم المجلد المطلوب
        delete_on_exit: تحديد ما إذا كان يجب مسح الملفات عند الخروج
        max_files: الحد الأقصى للملفات المسموح بها
        """
        # Intentionally a no-op: only the signature and docstring matter here.
        pass

    # Stub entry point: ignores its arguments and always reports success.
    def call(self, params, **kwargs):
        return "success"


class TestAIBobAdvancedSuite(unittest.TestCase):
    """Checks for the AI-BOB tool schema, agent, parsing, NLP and system layers.

    Each test builds its own collaborator (``LLMAgent``, ``ThinkingEngine``,
    ``ArabicNLPEngine`` or ``SystemManager``) and asserts on its observable
    output. The Arabic string literals are test inputs / expected values and
    must not be altered.
    """

    def test_type_annotated_schema_generation(self):
        """Verify dynamic type-annotated parameters schema auto-generation using inspection."""
        tool = CustomTestTool()
        schema = tool.parameters

        # Verify schema structure
        self.assertEqual(schema.get("type"), "object")
        properties = schema.get("properties", {})
        self.assertIn("folder_name", properties)
        self.assertIn("delete_on_exit", properties)
        self.assertIn("max_files", properties)

        # Verify type mappings (str -> string, bool -> boolean, int -> integer)
        self.assertEqual(properties["folder_name"]["type"], "string")
        self.assertEqual(properties["delete_on_exit"]["type"], "boolean")
        self.assertEqual(properties["max_files"]["type"], "integer")

        # Verify descriptions extracted from docstring
        self.assertIn("اسم المجلد", properties["folder_name"]["description"])
        self.assertIn("مسح الملفات", properties["delete_on_exit"]["description"])

        # Verify required arguments list: only parameters without a default
        required = schema.get("required", [])
        self.assertIn("folder_name", required)
        self.assertNotIn("delete_on_exit", required)  # has default value
        self.assertNotIn("max_files", required)  # has default value

    def test_semantic_cache(self):
        """Verify that identical/similar queries are retrieved instantly from semantic cache."""
        agent = LLMAgent(load_now=False)  # Disable real model load
        agent.is_loaded = False

        query_1 = "شيك الرامات عيني فدوة"
        query_2 = "شيك الرامات عيني فدوة!"

        # Prime the cache manually
        normalized_q1 = agent._normalize_key(query_1)
        agent.semantic_cache[normalized_q1] = "تم فحص الرامات وطلعت 8 جيجا [RUN_COMMAND: CHECK_RAM]"

        # Verify normalization key matches (trailing punctuation must not matter)
        normalized_q2 = agent._normalize_key(query_2)
        self.assertEqual(normalized_q1, normalized_q2)

        # Check cache hit in generate_stream.
        # perf_counter() is monotonic and high-resolution; time.time() can jump
        # with wall-clock adjustments and is too coarse for a 200 ms budget.
        started = time.perf_counter()
        result = "".join(agent.generate_stream(query_2, []))
        duration_ms = (time.perf_counter() - started) * 1000

        self.assertIn("CHECK_RAM", result)
        # Cache hit should be near-instant (under 200ms)
        self.assertLess(duration_ms, 200, f"Cache latency is too high: {duration_ms:.2f}ms")

    def test_context_window_pruner(self):
        """Verify that chat history is pruned and compressed dynamically to avoid token overflows."""
        agent = LLMAgent(load_now=False)

        # Create a mock long conversation history (exceeds token budget)
        long_history = []
        for i in range(15):
            long_history.append({"sender": "user", "text": f"رسالة تجريبية طويلة جداً من المستخدم رقم {i}"})
            long_history.append({"sender": "assistant", "text": f"رد البوت التجريبي الطويل جداً على الرسالة رقم {i}"})

        # Verify history starts very long (15 user + 15 assistant messages)
        self.assertEqual(len(long_history), 30)

        # Run pruner
        pruned = agent._prune_history(long_history, max_tokens=200)

        # Assert size is reduced
        self.assertLess(len(pruned), len(long_history))
        # Keep last 4 messages (for continuity)
        self.assertEqual(len(pruned), 4)

        # Verify the older logs are compacted into active pruned summary
        self.assertTrue(hasattr(agent, "_active_pruned_summary"))
        self.assertTrue(agent._active_pruned_summary.startswith("سياق الحوار المتقادم:"))
        self.assertIn("المستخدم", agent._active_pruned_summary)

        # Check formatting injection: the summary must appear in the built prompt
        prompt = agent.format_chat_prompt("شلونك", long_history)
        self.assertIn(agent._active_pruned_summary, prompt)

    def test_react_parsing_robustness(self):
        """Verify Qwen ReAct action parsing and traditional command tag extraction."""
        engine = ThinkingEngine()

        # 1. Test Qwen ReAct Format
        qwen_react_output = (
            "<think>\nيحتاج فحص المعالج.\n</think>\n"
            "Action: CHECK_CPU\n"
            "Action Input: {}\n"
        )
        # The cleaned text is not asserted on for the ReAct format.
        has_act, act_name, act_input, _ = engine.parse_action(qwen_react_output)
        self.assertTrue(has_act)
        self.assertEqual(act_name, "CHECK_CPU")
        self.assertEqual(act_input, "{}")

        # 2. Test Traditional Tag Format
        traditional_output = "صار عيني هسة أشيكلك المعالج [RUN_COMMAND: CHECK_CPU]"
        has_act, act_name, act_input, cleaned = engine.parse_action(traditional_output)
        self.assertTrue(has_act)
        self.assertEqual(act_name, "CHECK_CPU")
        self.assertEqual(act_input, "")
        # The command tag must be stripped from the user-visible text.
        self.assertNotIn("[RUN_COMMAND:", cleaned)

    def test_arabic_nlp_engine(self):
        """Verify Iraqi and standard Arabic intent parsing in ArabicNLPEngine."""
        nlp = ArabicNLPEngine()

        # RAM Check
        res = nlp.parse("شيكلي رامات الجهاز فدوة")
        self.assertEqual(res["intent"], "SYSTEM_INFO")
        self.assertEqual(res["sub_intent"], "ram")

        # File Create
        res = nlp.parse("سويلي فولدر جديد باسم مشاريع في D:\\")
        self.assertEqual(res["intent"], "FILE_CREATE")
        self.assertEqual(res["parameters"].get("name"), "مشاريع")

        # App Opening
        res = nlp.parse("افتح الحاسبة")
        self.assertEqual(res["intent"], "EXECUTE_CMD")
        self.assertEqual(res["sub_intent"], "open_calc")

    def test_system_manager_metrics(self):
        """Verify SystemManager returns active and valid Windows metrics."""
        mgr = SystemManager()

        # Dedicated range assertions report the offending value on failure,
        # unlike assertTrue(0 <= x <= 100) which only says "False is not true".
        cpu = mgr.get_cpu_usage()
        self.assertIsInstance(cpu, (int, float))
        self.assertGreaterEqual(cpu, 0)
        self.assertLessEqual(cpu, 100)

        ram = mgr.get_ram_usage()
        self.assertIn("percent", ram)
        self.assertGreaterEqual(ram["percent"], 0)
        self.assertLessEqual(ram["percent"], 100)

        temp = mgr.get_cpu_temperature()
        self.assertIsInstance(temp, (int, float))

    def test_bob_identity_safety_and_no_emojis(self):
        """Verify that Bob's developer identity is safe and polisher removes emojis."""
        engine = ThinkingEngine()

        # 1. Test identity safety (non-identity queries should not leak identity details)
        leaked_response = "هذا البرنامج كتبه مستر بوب وهو بالمرتبة 7 على غيت هاب."
        safe_response = engine.identity_leak_filter(leaked_response, "كم الرام؟")
        self.assertNotIn("مستر بوب", safe_response)

        # 2. Test emoji cleanup (strict user requirement: absolutely no emojis in response)
        emoji_response = "صار عيني! هسة راح أنظفلك الكاش 😊🚀💻🔥"
        polished_response = engine.format_response(emoji_response)

        # Check no emojis exist in final text.
        # The pattern covers three Unicode blocks: Emoticons (U+1F600-1F64F),
        # Miscellaneous Symbols and Pictographs (U+1F300-1F5FF) and
        # Transport and Map Symbols (U+1F680-1F6FF) — enough for the four
        # emojis injected above.
        emojis_found = re.findall(r'[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF]', polished_response)
        self.assertEqual(len(emojis_found), 0, f"Leaked emojis found: {emojis_found}")

    def test_math_interceptor(self):
        """Verify that mathematical queries are parsed and evaluated with 100% accuracy."""
        nlp = ArabicNLPEngine()

        # (query, expected result) pairs; each runs as its own subTest so a
        # single failing phrase does not hide the results of the others.
        cases = (
            # Basic addition
            ("واحد زائد واحد", 2),
            ("زين واحد زاد واحد كم", 2),
            # Multiplication
            ("شكد 5 في 4", 20),
            # Division and dialect spelling swaps (th -> t)
            ("تلاتة تقسيم تلاثة يطلع شكد؟", 1),
            # Subtraction with composite tens
            ("كم 100 ناقص خمسة وعشرين", 75),
            # Addition with composite tens
            ("خمسة وعشرين زائد خمسة وعشرين", 50),
            # Normal numbers math
            ("10 + 20", 30),
        )
        for query, expected in cases:
            with self.subTest(query=query):
                self.assertEqual(nlp.parse_math(query), expected)

        # Non-math query should return None
        self.assertIsNone(nlp.parse_math("كم الرام عيني؟"))


# Run the suite with unittest's command-line runner when this file is executed
# directly (e.g. `python new_tests.py`); importing the module does not run it.
if __name__ == "__main__":
    unittest.main()