Skip to content

Commit 9f0d0bf

Browse files
committed
fix(data profile): refine the image content; remove assert; remove api_key
1 parent 6f825fb commit 9f0d0bf

File tree

2 files changed

+32
-21
lines changed

2 files changed

+32
-21
lines changed

alias/src/alias/agent/agents/data_source/_data_profiler_factory.py

Lines changed: 23 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -102,9 +102,6 @@ def _load_prompt_and_model(source_type: Any = None, api_key: str = None):
102102
"IRREGULAR": MODEL_CONFIG_NAME,
103103
}
104104

105-
if not api_key:
106-
api_key = os.environ.get("DASHSCOPE_API_KEY")
107-
108105
models_2_model_and_formatter = {
109106
MODEL_CONFIG_NAME: [
110107
DashScopeChatModel(
@@ -360,16 +357,19 @@ def _wrap_data_response(self, response: Dict[str, Any]) -> Dict[str, Any]:
360357
# they contain tables. Each table contains columns and description
361358
if "tables" in self.data and "tables" in response:
362359
new_schema["tables"] = []
363-
for i, table in enumerate(self.data["tables"]):
364-
# Ensure alignment between schema tables and resp tables
365-
# TODO: It matches by order, by name would be more robust.
366-
if i >= len(response["tables"]):
367-
# LLM returns less tables than the original schema
368-
break
369-
assert response["tables"][i]["name"] == table["name"]
360+
# Build a map for response tables and descriptions
361+
res_des_map = {
362+
table["name"]: table["description"]
363+
for table in response["tables"]
364+
}
365+
for table in self.data["tables"]:
366+
table_name = table["name"]
367+
if table_name not in res_des_map:
368+
continue
370369
new_table = {}
371-
new_table["name"] = table["name"]
372-
new_table["description"] = response["tables"][i]["description"]
370+
new_table["name"] = table_name
371+
# Retain the description from the LLM response
372+
new_table["description"] = res_des_map[table_name]
373373
if "columns" in table:
374374
new_table["columns"] = table["columns"]
375375
if "irregular_judgment" in table:
@@ -687,15 +687,20 @@ def _generate_content(self, prompt, data):
687687
Returns:
688688
List containing image and text components for the LLM call
689689
"""
690-
contents = []
691690
# Convert image paths according to the model requirements
692-
contents.append(
691+
contents = [
693692
{
694-
"image": data,
693+
"text": prompt,
694+
"type": "text",
695695
},
696-
)
697-
# append text
698-
contents.append({"text": prompt})
696+
{
697+
"source": {
698+
"url": data,
699+
"type": "url",
700+
},
701+
"type": "image",
702+
},
703+
]
699704
return contents
700705

701706
def _wrap_data_response(self, response: Dict[str, Any]) -> Dict[str, Any]:

alias/src/alias/agent/agents/data_source/data_profile.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,10 @@ def _get_binary_buffer(
3232
return buffer
3333

3434

35-
def _copy_file_from_sandbox(sandbox: AliasSandbox, file_path: str) -> str:
35+
def _copy_file_from_sandbox_with_original_name(
36+
sandbox: AliasSandbox,
37+
file_path: str,
38+
) -> str:
3639
"""
3740
Copies a file from the sandbox environment
3841
or a URL to a local temporary file.
@@ -61,7 +64,7 @@ def _copy_file_from_sandbox(sandbox: AliasSandbox, file_path: str) -> str:
6164
with open(full_path, "wb") as f:
6265
f.write(file_buffer.getvalue())
6366
file_source = full_path
64-
return file_source
67+
return str(file_source)
6568

6669

6770
async def data_profile(
@@ -87,7 +90,10 @@ async def data_profile(
8790
"""
8891

8992
if source_type in [SourceType.CSV, SourceType.EXCEL, SourceType.IMAGE]:
90-
local_path = _copy_file_from_sandbox(sandbox, sandbox_path)
93+
local_path = _copy_file_from_sandbox_with_original_name(
94+
sandbox,
95+
sandbox_path,
96+
)
9197
elif source_type == SourceType.RELATIONAL_DB:
9298
local_path = sandbox_path
9399
else:

0 commit comments

Comments
 (0)