wip

tastelikefeet · tastelikefeet · commit 18e662daec01 · 2025-06-14T15:52:54.000+08:00
diff --git a/modelscope_agent/callbacks/__init__.py b/modelscope_agent/callbacks/__init__.py
@@ -1,3 +1,3 @@
 from .base import Callback
-from .run_status import RunStatus
+from .runtime import Runtime
 from .utils import callbacks_mapping
diff --git a/modelscope_agent/callbacks/base.py b/modelscope_agent/callbacks/base.py
@@ -2,7 +2,7 @@
 
 from omegaconf import DictConfig
 
-from .run_status import RunStatus
+from .runtime import Runtime
 from ..llm.utils import Message
 
 
@@ -11,20 +11,20 @@ class Callback:
     def __init__(self, config: DictConfig):
         self.config = config
 
-    async def on_task_begin(self, run_status: RunStatus, messages: List[Message]):
+    async def on_task_begin(self, runtime: Runtime, messages: List[Message]):
         pass
 
-    async def on_generate_response(self, run_status: RunStatus, messages: List[Message]):
+    async def on_generate_response(self, runtime: Runtime, messages: List[Message]):
         pass
 
-    async def after_generate_response(self, run_status: RunStatus, messages: List[Message]):
+    async def after_generate_response(self, runtime: Runtime, messages: List[Message]):
         pass
 
-    async def on_tool_call(self, run_status: RunStatus, messages: List[Message]):
+    async def on_tool_call(self, runtime: Runtime, messages: List[Message]):
         pass
 
-    async def after_tool_call(self, run_status: RunStatus, messages: List[Message]):
+    async def after_tool_call(self, runtime: Runtime, messages: List[Message]):
         pass
 
-    async def on_task_end(self, run_status: RunStatus, messages: List[Message]):
+    async def on_task_end(self, runtime: Runtime, messages: List[Message]):
         pass
diff --git a/modelscope_agent/callbacks/runtime.py b/modelscope_agent/callbacks/runtime.py
@@ -1,11 +1,14 @@
 from dataclasses import dataclass
+from typing import Optional
 
 from modelscope_agent.llm.llm import LLM
 
 
 @dataclass
-class RunStatus:
+class Runtime:
 
     should_stop: bool = False
 
-    llm: LLM = None
+    llm: Optional[LLM] = None  # model handle passed to callbacks; None until supplied
+
+    tag: Optional[str] = None  # label of the current run; None until assigned
diff --git a/modelscope_agent/cli/code/artifact_callback.py b/modelscope_agent/cli/code/artifact_callback.py
@@ -1,10 +1,9 @@
 import os.path
-import re
 from typing import List
 
 from omegaconf import DictConfig
 
-from modelscope_agent.callbacks import Callback, RunStatus
+from modelscope_agent.callbacks import Callback, Runtime
 from modelscope_agent.llm.llm import LLM
 from modelscope_agent.llm.utils import Message
 from modelscope_agent.tools.filesystem_tool import FileSystemTool
@@ -15,8 +14,9 @@ class ArtifactCallback(Callback):
     def __init__(self, config: DictConfig):
         super().__init__(config)
         self.file_system = FileSystemTool(config)
+        self.code = False
 
-    async def on_task_begin(self, run_status: RunStatus, messages: List[Message]):
+    async def on_task_begin(self, runtime: Runtime, messages: List[Message]):
         await self.file_system.connect()
 
     @staticmethod
@@ -45,7 +45,7 @@ def extract_metadata(config: DictConfig, llm: LLM, messages: List[Message]):
             _response_message = llm.generate(_messages)
         return _response_message.content
 
-    async def after_generate_response(self, run_status: RunStatus, messages: List[Message]):
+    async def after_generate_response(self, runtime: Runtime, messages: List[Message]):
         last_message_content = messages[-1].content
         if '</code>' in last_message_content:
             code = ''
@@ -64,21 +64,27 @@ async def after_generate_response(self, run_status: RunStatus, messages: List[Me
                     elif recording:
                         code += message.content
             if code:
+                self.code = True  # remembered so on_task_end can verify that code was produced
                 try:
-                    code_file = self.extract_metadata(self.config, run_status.llm, messages)
+                    code_file = self.extract_metadata(self.config, runtime.llm, messages)
                     await self.file_system.create_directory('output')
                     await self.file_system.write_file(os.path.join('output', code_file), code)
                     messages.append(Message(role='assistant', content=f'Original query: {messages[1].content}'
                                                                       f'Task sunning successfully, '
                                                                       f'the code has been saved in the {code_file} file.'))
                 except Exception as e:
-                    print(f'Original query: {messages[1].content}. Task sunning failed with error {e} please consider retry generation.', flush=True)
-                    messages.append(Message(role='assistant', content=f'Original query: {messages[1].content}'
+                    raise RuntimeError(f'Original query: {messages[1].content}. Task sunning failed with error {e} please consider retry generation.') from e
+                    messages.append(Message(role='user', content=f'Original query: {messages[1].content}'
                                                                       f'Task sunning failed with error {e} please consider retry generation.'))
             else:
-                print(
+                raise RuntimeError(
                     f'Original query: {messages[1].content}. Task sunning failed, code format error, please consider retry generation.',
-                    flush=True)
+                )
-                messages.append(Message(role='assistant', content=f'Original query: {messages[1].content}'
+                messages.append(Message(role='user', content=f'Original query: {messages[1].content}'
                                                                   f'Task sunning failed, code format error, please consider retry generation.'))
-            run_status.should_stop = True
+            runtime.should_stop = True
+
+    async def on_task_end(self, runtime: Runtime, messages: List[Message]):
+        # A run with any tag other than 'Default workflow' must have extracted code.
+        if runtime.tag != 'Default workflow' and not self.code:
+            raise RuntimeError(f'Task {runtime.tag!r} ended without producing any <code> block.')
diff --git a/modelscope_agent/cli/code/coding.yaml b/modelscope_agent/cli/code/coding.yaml
@@ -9,19 +9,21 @@ generation_config:
   temperature: 0.5
   top_k: 50
   stream: false
+  max_completion_tokens: 65536
+  max_tokens: 131072
   extra_body:
     enable_thinking: false
 
 prompt:
   system: |
     You are a senior software architect. Your responsibility is to break down original requirements into implementable modules and assign tasks for each module into subtasks. The initiation of subtasks requires calling the `split_to_sub_task` tool, which can start all sub tasks as you need at one time. In this process, you need to answer the following questions:
 
-    1. What is the original requirement? Does it involve frontend or backend code? What programming language is needed?
+    1. What is the original requirement? Does it involve frontend or backend code? What programming language is needed? 
     2. How many modules should it be split into? What functions are needed for each module? How to combine each file?
     3. Due to code complexity, you need to inform your subtasks that code blocks need to be wrapped with <code></code> tags, as this code will subsequently be stored in local files.
     4. You should **MENTION CLEARLY** the detailed functions and interfaces your subtasks should follow, and relations between each module/file, in case they do duplicate works.
-    5. Beauty and Functionality is the most important thing
-    6. One task only writes one code file, all files should be put flatten in one folder, so no parent folder name should be given
+    5. Beauty and functionality are the most important things. Never use invalid image links; use images from Unsplash-like websites
+    6. One task only writes one code file, all files should be put in one folder, so no parent folder name should be given
     
     An example:
     query: Please help me write an e-commerce website with Christmas atmosphere
@@ -31,8 +33,10 @@ prompt:
     1. An e-commerce website requires these modules: categories, goods, detail good, purchase, history orders, shopping cart, favourites, carousel images and so on.
     2. The website needs Christmas features, so the CSS style should be mainly red and white, decorated with Christmas images
     3. Due to the complexity of the code engineering, I cannot complete this complex goal in a single file, so I need to split the tasks
-    4. One task should only writes one code file, all files should be put flatten in one folder, so I should not give parent folder prefix
+    4. One task should only write one code file, all files should be put in one folder, so I should not give parent folder prefix
     5. I should give very detail designs of the pages functions(especially the functions interact with other code files), file import relations(This is the most important thing!!), in case the sub tasks work abnormally
+    6. I should specify the programming language of all the subtasks
+    7. I should tell the subtasks to use images from unsplash-like websites
     ...
     
     How many modules(subtasks) I should split?
@@ -71,19 +75,22 @@ prompt:
       {
         "system": "You are a software engineer which helps me to finish a part of my job. IMPORTANT: You should invoke other tasks' code files to finish the whole job. The code file you need to invoke will be given in the query.",
         "query": "The original query is to write an e-commerce website with Christmas atmosphere, your part of job is only one code file: the index.html page, you should follow instructions: 
-              1. Do not give fake image addresses, use links of Unsplash like website  
+              1. Do not give fake image addresses! use links of Unsplash like website  
               2. You should make the page as beautiful as you can 
               3. DO not add ``` around the code, wrap the code with <code></code> 
-              4. All files are flatten, always import other tasks' code file from the same directory
-              5. Link to purchase.js to fulfill the purchase section
-              6. Link to goods.js to fulfill the goods section
-              7. Link to invoke cart.js to ..."
+              4. All files are in one folder, always import other tasks' code file from the same directory!
+              5. Link to purchase.js to fulfill the purchase section
+              6. Link to goods.js to fulfill the goods section
+              7. Link to invoke cart.js to ...
+              8. Use javascript instead of node.js, do not use ES6 modules
+              9. Never give dummy information!"
       },
     ...
     ]
 
 callbacks:
   - artifact_callback
+  - evaluator_callback
 
 memory:
 
diff --git a/modelscope_agent/cli/code/evaluator_callback.py b/modelscope_agent/cli/code/evaluator_callback.py
@@ -0,0 +1,112 @@
+from typing import List
+
+from omegaconf import DictConfig
+
+from modelscope_agent.callbacks import Callback, Runtime
+from modelscope_agent.llm.utils import Message
+from modelscope_agent.utils import get_logger
+
+logger = get_logger()
+
+
+class EvaluatorCallback(Callback):
+    """Have the LLM review the plan produced in the 'Default workflow' run.
+
+    After a response is generated, the original requirement, the plan and its
+    first tool call are sent to the LLM with the evaluator system prompt.
+    Unless the reply contains `<OK>`, the reply is appended as a user message
+    and the tool calls of the reviewed message are dropped. Only one review
+    round is requested.
+    """
+
+    _system = """You are a software architecture evaluator whose job is to assess whether software architectures created by other architects are reasonable. The actual workflow is:
+
+1. An original requirement is given
+2. A software architect provides the modules that need to be designed and breaks these modules down into different subtasks for completion, with each subtask responsible for writing one specific file
+3. After the subtasks are completed, they are automatically saved to disk, and these modules will work together collaboratively
+
+However, software architects have a high probability of making mistakes, including but not limited to:
+
+1. Modules that don't meet user requirements, such as insufficient content richness. In this case, you can try prompting the software architect about whether there are other features that can be added, and you can also provide examples
+2. Dependencies between subtasks must be clear. For example, if file1 in subtask1 needs to import and use file2 from subtask2 and file3 from subtask3, you need to carefully review whether the dependency plan is reasonable
+3. Since files between subtasks work collaboratively, the interfaces between them must be reliable and clear. You need to check whether the interface design provided by the architect is sufficient to support collaborative work requirements
+4. Subtasks may use different programming languages or different technology(we don't want to use es6 modules or node.js) or encounter other scenarios where they cannot work together collaboratively. You need to carefully point these out
+5. The architect will call `split_task`to start all subtasks at one time, which needs a list of systems and queries. You need to check each subtask's arguments(system and query), whether the information is sufficient for collaborative work requirements.
+6. Check whether the architect has mentioned all subtasks the generated files are in one folder, so when importing other files, no dir prefix should be given, and the resources(links, images) should be valid or from the unsplash-like websites, do not use local invalid images.
+7. Your reply should be like `You should ...`, `Does you consider...`, or `Here is a problem which...`, at last you should say: `Now correct these problems and keep the good part and generate a new plan and call `split_task` again`
+8. Some designs from the architect may be good, point out the good parts to encourage the architect to keep them!
+9. **Do not be too strict!**, ignore trivial warnings. in case you and the architect cause a dead loop
+
+Your specific job is:
+Carefully analyze the errors within, prompt the software architect to make corrections, and when you feel the plan already meets the requirements, output the <OK> character, at which point the conversation will terminate.
+Remember: You are not a software architect, you are an evaluator. You don't need to design architecture, you only need to point out or inspire awareness of the errors. 
+Now Begin:
+
+"""
+
+    def __init__(self, config: DictConfig):
+        super().__init__(config)
+        # Set once the review is over; until then after_tool_call blocks stopping.
+        self.argue_ended = False
+        # Number of review replies already requested from the LLM.
+        self.argue_round = 0
+
+    async def after_generate_response(self, runtime: Runtime, messages: List[Message]):
+        """Review the newest response and feed a rejected plan's critique back.
+
+        Args:
+            runtime: Shared run state; supplies the LLM and the run tag.
+            messages: Conversation so far; trimmed and extended in place.
+        """
+        if runtime.tag != 'Default workflow':
+            self.argue_ended = True
+            return
+
+        if len(messages) > 3:
+            # Keep only the first two messages and the newest one.
+            temp = messages[:2] + messages[-1:]
+            messages.clear()
+            messages.extend(temp)
+
+        if self.argue_round >= 1:
+            self.argue_ended = True
+            return
+
+        if len(messages) < 3 or not messages[2].tool_calls:
+            # Nothing to review: without a tool call, tool_calls[0] below would
+            # raise. End the review so the engine's own stop logic applies.
+            self.argue_ended = True
+            return
+
+        query = (f'The original requirement is: \n```text\n{messages[1].content}\n```\n\n '
+                 f'The plan given by the architect is: \n```text\n{messages[2].content}\n```\n\n '
+                 f'The task arguments is : \n```json\n{messages[2].tool_calls[0]}\n```\n\n')
+
+        _messages = [
+            Message(role='system', content=self._system),
+            Message(role='user', content=query),
+        ]
+        if getattr(self.config.generation_config, 'stream', False):
+            # In stream mode generate() is iterated and the chunks merged into one message.
+            message = None
+            for msg in runtime.llm.generate(_messages):
+                message = runtime.llm.merge_stream_message(message, msg)
+
+            _response_message = message
+        else:
+            _response_message = runtime.llm.generate(_messages)
+        self.argue_round += 1
+        for line in _response_message.content.split('\n'):
+            for _line in line.split('\\n'):
+                logger.info(f'[Evaluator] {_line}')
+
+        if '<OK>' in _response_message.content or self.argue_ended:
+            self.argue_ended = True
+        else:
+            # Drop the pending tool calls and hand the critique back as a user turn.
+            messages[-1].tool_calls = None
+            messages.append(Message(role='user', content=_response_message.content))
+
+    async def after_tool_call(self, runtime: Runtime, messages: List[Message]):
+        """Allow a requested stop only after the review has ended."""
+        runtime.should_stop = runtime.should_stop and self.argue_ended
diff --git a/modelscope_agent/engine/plan/base.py b/modelscope_agent/engine/plan/base.py
@@ -3,7 +3,7 @@
 
 from pydantic import ConfigDict
 
-from modelscope_agent.callbacks import RunStatus
+from modelscope_agent.callbacks import Runtime
 from modelscope_agent.llm.utils import Message
 
 
@@ -13,10 +13,10 @@ def __init__(self, config: ConfigDict):
         self.config = config
 
     @abstractmethod
-    def generate_plan(self, messages: List[Message], run_status: RunStatus):
+    def generate_plan(self, messages: List[Message], runtime: Runtime):
         pass
 
     @abstractmethod
-    def update_plan(self, messages: List[Message], run_status: RunStatus):
+    def update_plan(self, messages: List[Message], runtime: Runtime):
         pass
 
diff --git a/modelscope_agent/engine/plan/observer_planer.py b/modelscope_agent/engine/plan/observer_planer.py
@@ -2,7 +2,7 @@
 
 from pydantic import ConfigDict
 
-from modelscope_agent.callbacks import RunStatus
+from modelscope_agent.callbacks import Runtime
 from modelscope_agent.engine.plan.base import Planer
 from modelscope_agent.llm.llm import LLM
 from modelscope_agent.llm.utils import Message
@@ -15,8 +15,8 @@ def __init__(self, config: ConfigDict):
         observer_config = self.config.planer.observer
         self.observer = LLM.from_config(observer_config)
 
-    def generate_plan(self, messages: List[Message], run_status: RunStatus):
+    def generate_plan(self, messages: List[Message], runtime: Runtime):
         pass
 
-    def update_plan(self, messages: List[Message], run_status: RunStatus):
+    def update_plan(self, messages: List[Message], runtime: Runtime):
         pass
diff --git a/modelscope_agent/engine/simple_engine.py b/modelscope_agent/engine/simple_engine.py
@@ -7,7 +7,7 @@
 from omegaconf import DictConfig
 
 from modelscope_agent.callbacks import Callback
-from modelscope_agent.callbacks import RunStatus
+from modelscope_agent.callbacks import Runtime
 from modelscope_agent.callbacks import callbacks_mapping
 from modelscope_agent.config import Config
 from modelscope_agent.engine.memory import memory_mapping
@@ -61,7 +61,7 @@ def __init__(self,
             self.config = Config.from_task(task_dir_or_id, env)
         self.llm = LLM.from_config(self.config)
         self.callbacks = []
-        self.run_status = RunStatus(llm=self.llm)
+        self.runtime = Runtime(llm=self.llm)
         self.trust_remote_code = kwargs.get('trust_remote_code', False)
         self.config.trust_remote_code = self.trust_remote_code
         self._register_callback_from_config()
@@ -97,7 +97,7 @@ def _register_callback_from_config(self):
 
     async def _loop_callback(self, point, messages: List[Message]):
         for callback in self.callbacks:
-            await getattr(callback, point)(self.run_status, messages)
+            await getattr(callback, point)(self.runtime, messages)
 
     async def _parallel_tool_call(self, messages: List[Message]):
         tool_call_result = await self.tool_manager.parallel_call_tool(messages[-1].tool_calls)
@@ -160,7 +160,7 @@ def _refine_memory(self, messages: List[Message]):
 
     def _update_plan(self, messages: List[Message]):
         if self.planer:
-            self.planer.update_plan(self.llm, messages, self.run_status)
+            self.planer.update_plan(messages, self.runtime)  # matches Planer.update_plan(messages, runtime)
         return messages
 
     def handle_stream_message(self):
@@ -183,11 +183,12 @@ async def run(self, prompt, **kwargs):
             self._prepare_planer()
             self._prepare_rag()
             tag = kwargs.get('tag', 'Default workflow')
+            self.runtime.tag = tag
             messages = self._prepare_messages(prompt)
             await self._loop_callback('on_task_begin', messages)
             if self.planer:
-                self.planer.generate_plan(messages, self.run_status)
-            while not self.run_status.should_stop:
+                self.planer.generate_plan(messages, self.runtime)
+            while not self.runtime.should_stop:
                 await self._loop_callback('on_generate_response', messages)
                 messages = self._refine_memory(messages)
                 messages = self._update_plan(messages)
@@ -208,7 +209,7 @@ async def run(self, prompt, **kwargs):
                 if _response_message.tool_calls:
                     await self._parallel_tool_call(messages)
                 else:
-                    self.run_status.should_stop = True
+                    self.runtime.should_stop = True
                 await self._loop_callback('after_tool_call', messages)
             await self._loop_callback('on_task_end', messages)
             await self._cleanup_tools()