Merge remote-tracking branch 'origin/main' into main_ds_eval

SSSuperDan · SSSuperDan · commit db903c7ff3cb · 2026-01-26T17:45:10.000+08:00
diff --git a/alias/README.md b/alias/README.md
@@ -207,10 +207,10 @@ alias_agent run --mode finance --task "Analyze Tesla's Q4 2024 financial perform
 # Data Science mode
 alias_agent run --mode ds \
   --task "Analyze the distribution of incidents across categories in 'incident_records.csv' to identify imbalances, inconsistencies, or anomalies, and determine their root cause." \
-  --files ./docs/data/incident_records.csv
+  --datasource ./docs/data/incident_records.csv
 ```
 
-**Note**: Files uploaded with `--files` are automatically copied to `/workspace` in the sandbox. Generated files are available in `sessions_mount_dir` subdirectories.
+**Note**: Files uploaded with `--datasource` are automatically copied to `/workspace` in the sandbox. Generated files are available in `sessions_mount_dir` subdirectories.
 
 #### Enable Long-Term Memory Service (General Mode Only)
 To enable the long-term memory service in General mode, you need to:
diff --git a/alias/README_ZH.md b/alias/README_ZH.md
@@ -208,10 +208,10 @@ alias_agent run --mode finance --task "Analyze Tesla's Q4 2024 financial perform
 # 数据科学（Data Science）模式
 alias_agent run --mode ds \
   --task "Analyze the distribution of incidents across categories in 'incident_records.csv' to identify imbalances, inconsistencies, or anomalies, and determine their root cause." \
-  --files ./docs/data/incident_records.csv
+  --datasource ./docs/data/incident_records.csv
 ```
 
-**注意**：使用 `--files` 上传的文件会自动复制到沙盒中的 `/workspace`。生成的文件可在 `sessions_mount_dir` 的子目录中找到。
+**注意**：使用 `--datasource` 上传的文件会自动复制到沙盒中的 `/workspace`。生成的文件可在 `sessions_mount_dir` 的子目录中找到。
 
 #### 启用长期记忆服务（仅限通用模式）
 要在通用模式下启用长期记忆服务，您需要：
diff --git a/alias/src/alias/agent/agents/_data_science_agent.py b/alias/src/alias/agent/agents/_data_science_agent.py
@@ -411,16 +411,13 @@ async def generate_response(
             report_md,
             report_html,
         ) = await report_generator.generate_report()
-        md_report_path = os.path.join(
-            self.tmp_file_storage_dir,
-            "detailed_report.md",
-        )
-        html_report_path = os.path.join(
-            self.tmp_file_storage_dir,
-            "detailed_report.html",
-        )
 
-        if report_html:
+        if report_md:
+            md_report_path = os.path.join(
+                self.tmp_file_storage_dir,
+                "detailed_report.md",
+            )
+
             await self.toolkit.call_tool_function(
                 ToolUseBlock(
                     type="tool_use",
@@ -432,25 +429,35 @@ async def generate_response(
                     },
                 ),
             )
-            await self.toolkit.call_tool_function(
-                ToolUseBlock(
-                    type="tool_use",
-                    id=str(uuid.uuid4()),
-                    name="write_file",
-                    input={
-                        "path": html_report_path,
-                        "content": report_html,
-                    },
-                ),
-            )
             response = (
                 f"{response}\n\n"
                 "The detailed report (markdown version) has been saved to "
-                f"{md_report_path}.\n"
-                "The detailed report (html version) has been saved to "
-                f"{html_report_path}."
+                f"{md_report_path}."
             )
 
+            if report_html:
+                html_report_path = os.path.join(
+                    self.tmp_file_storage_dir,
+                    "detailed_report.html",
+                )
+
+                await self.toolkit.call_tool_function(
+                    ToolUseBlock(
+                        type="tool_use",
+                        id=str(uuid.uuid4()),
+                        name="write_file",
+                        input={
+                            "path": html_report_path,
+                            "content": report_html,
+                        },
+                    ),
+                )
+                response = (
+                    f"{response}\n\n"
+                    "The detailed report (html version) has been saved to "
+                    f"{html_report_path}."
+                )
+
         kwargs["response"] = response
         structured_output = {}
 
diff --git a/alias/src/alias/agent/agents/ds_agent_utils/built_in_prompt/_log_to_markdown_prompt.md b/alias/src/alias/agent/agents/ds_agent_utils/built_in_prompt/_log_to_markdown_prompt.md
@@ -44,8 +44,13 @@ Each task in the roadmap contains:
   - Brief Response
   - Detailed Report
 - You should choose the template that is most appropriate for the user task.
-   - **Brief Respoonse Template** should ONLY be used when the user asks for a simple data query task, where ONLY numeric or concise string values are returned, and complex analysis or research are not required.
-   - **Detailed Report Template** should be used when the user asks for a detailed analysis of the data, where the analysis and research are required.
+   - **Brief Response Template** should ONLY be used when the user asks for a
+   simple, static data point (e.g., a total count or a specific value), where
+   the answer is returned as a single numeric or concise string value with no
+   analysis, transformation, comparison, or interpretation required.
+   - **Detailed Report Template** should be used whenever the task involves
+   distribution, discrepancy, imbalance, comparison, trend, root cause, or
+   any form of analysis, interpretation, or evidence generation.
 
 2. Data Source Constraints
 - **ONLY use information explicitly present in the log file**
@@ -103,7 +108,7 @@ You MUST ensure all captions, subtitles, and other contents in the report are wr
   - "brief_response": The brief response content.
     - When 'is_brief_response' is True, this field should be fulfilled with the brief response content following the **Brief Response Template**.
     - When 'is_brief_response' is False, this field should be a concise summary of the detailed report in in markdown format illustrating the key findings and insights.
-  - "detailed_report_content": The detailed markdown report content following the **Detailed Report Template**. This field is ONLY generated when 'is_brief_response' is False, otherwise fulfill an empty string.
+  - "report_content": The detailed markdown report content following the **Detailed Report Template**. This field is ONLY generated when 'is_brief_response' is False, otherwise fill it with an empty string.
 - You MUST ensure the JSON object is a valid JSON string and can be parsed by json.loads().
 - Double check all escapes are valid.
 
diff --git a/alias/src/alias/agent/agents/ds_agent_utils/report_generation.py b/alias/src/alias/agent/agents/ds_agent_utils/report_generation.py
@@ -1,20 +1,62 @@
 # -*- coding: utf-8 -*-
 import os
-import json
 import time
 from typing import Tuple
 
 import dotenv
+from pydantic import BaseModel, Field
+
 from agentscope.message import Msg
 
 from .utils import model_call_with_retry, get_prompt_from_file
-
-
 from .ds_config import PROMPT_DS_BASE_PATH
 
 dotenv.load_dotenv()
 
 
+class ReportResponse(BaseModel):
+    is_brief_response: bool = Field(
+        ...,
+        description=(
+            "True if the response is a brief response; "
+            "False if it includes a detailed report."
+        ),
+    )
+
+    brief_response: str = Field(
+        ...,
+        description=(
+            "The brief response content. "
+            "When 'is_brief_response' is True, this field contains the full "
+            "brief response following the Brief Response Template. "
+            "When 'is_brief_response' is False, this field contains a concise "
+            "markdown summary of the detailed report, highlighting key "
+            "findings and insights."
+        ),
+        json_schema_extra={
+            "example": (
+                "The analysis shows a 15% increase in user engagement "
+                "after the feature update."
+            ),
+        },
+    )
+
+    report_content: str = Field(
+        ...,
+        description=(
+            "The detailed markdown report content following the "
+            "Detailed Report Template. This field MUST be an empty "
+            "string ('') when 'is_brief_response' is True. It MUST contain "
+            "the full detailed report when 'is_brief_response' is False."
+        ),
+        json_schema_extra={
+            "example": "### User Task Description...\n"
+            "### Associated Data Sources...\n"
+            "### Research Conclusion...\n### Task1...### Task2...",
+        },
+    )
+
+
 class ReportGenerator:
     def __init__(self, model, formatter, memory_log: str):
         self.model = model
@@ -62,22 +104,13 @@ async def _log_to_markdown(self) -> str:
             self.formatter,
             msgs=msgs,
             msg_name="Report Generation",
+            structured_model=ReportResponse,
         )
 
-        raw_response = res.content[0]["text"]
-
-        # TODO: More robust response cleaning
-        if raw_response.strip().startswith("```json"):
-            cleaned = raw_response.strip()[len("```json") :].lstrip("\n")
-            if cleaned.endswith("```"):
-                cleaned = cleaned[:-3].rstrip()
-            response = cleaned
-        else:
-            response = raw_response.strip()
         end_time = time.time()
-        # print(response)
         print(f"Log to markdown took {end_time - start_time} seconds")
-        return response
+
+        return res.content[-1]["input"]
 
     async def _convert_to_html(self, markdown_content: str) -> str:
         start_time = time.time()
@@ -103,37 +136,34 @@ async def _convert_to_html(self, markdown_content: str) -> str:
         return response.content[0]["text"]
 
     async def generate_report(self) -> Tuple[str, str, str]:
-        markdown_response = await self._log_to_markdown()
-
-        #  responseFormat: {
-        #     "is_brief_response": True,
-        #     "brief_response": brief_response_content,
-        #     "report_content": detailed_report_content
-        #  }
-
-        try:
-            markdown_content = json.loads(markdown_response)
-        except json.JSONDecodeError as e:
-            print(f"Error parsing JSON response: {e}")
-            print(f"Response content: {markdown_response}")
-            raise
+        """
+        responseFormat: {
+           "is_brief_response": True,
+           "brief_response": brief_response_content,
+           "report_content": detailed_report_content
+        }
+        """
+        markdown_content = await self._log_to_markdown()
 
         if (
             str(markdown_content.get("is_brief_response", False)).lower()
             == "true"
         ):
             # During brief response mode,
             # directly return the brief response to the user.
-            return markdown_content["brief_response"], "", ""
+            return markdown_content.get("brief_response", ""), "", ""
         else:
             # In detailed report mode,
             # convert the detailed report to HTML and return it to the user;
             # if a brief summary of the report is needed,
             # it can be obtained through markdown_content["brief_response"].
+            html_content = ""
+            if os.getenv("ENABLE_HTML_REPORT", "ON").lower() != "off":
+                html_content = await self._convert_to_html(
+                    markdown_content.get("report_content", ""),
+                )
             return (
-                markdown_content["brief_response"],
-                markdown_content["report_content"],
-                await self._convert_to_html(
-                    markdown_content["report_content"],
-                ),
+                markdown_content.get("brief_response", ""),
+                markdown_content.get("report_content", ""),
+                html_content,
             )