|
4 | 4 | import posixpath |
5 | 5 | import sys |
6 | 6 | import time |
| 7 | +from collections.abc import Iterator |
7 | 8 |
|
8 | 9 | import multiprocess as mp |
9 | 10 | import pytest |
@@ -1028,3 +1029,80 @@ def summarize(left_path, right_value): |
1028 | 1029 | for g in range(groups) |
1029 | 1030 | } |
1030 | 1031 | assert {row["partition"]: row["total"] for row in records} == expected_totals |
| 1032 | + |
| 1033 | + |
def test_agg_list_file_and_map_count(tmp_dir, test_session):
    """agg() collects File objects into a list column; map() then counts them.

    Verifies that a `list[File]` produced by an aggregate UDF can be fed
    straight into a mapper UDF that receives the materialized list.
    """
    file_names = [
        "hotdogs.txt",
        "dogs.txt",
        "dog.txt",
        "1dog.txt",
        "dogatxt.txt",
        "dog.txtx",
    ]
    for file_name in file_names:
        (tmp_dir / file_name).write_text(file_name, encoding="utf-8")

    chain = dc.read_storage(tmp_dir.as_uri(), session=test_session).order_by(
        "file.path"
    )

    # Materialize the expected File objects in path order.
    expected: list[File] = []
    for (entry,) in chain.select("file").to_iter():
        assert isinstance(entry, File)
        expected.append(entry)

    def gather_files(file: list[File]) -> Iterator[list[File]]:
        # Emit the whole partition's files as one list value.
        yield file

    def tally(files: list[File]) -> int:
        return len(files)

    chain.agg(files=gather_files).map(num_files=tally).save("temp_udf_types")

    # Validate result
    result = dc.read_dataset("temp_udf_types", session=test_session)
    assert result.select("num_files").to_list() == [(len(expected),)]
| 1073 | + |
| 1074 | + |
def test_agg_list_file_persist_and_read(tmp_dir, test_session):
    """A `list[File]` column written by agg() round-trips through a dataset.

    Saves the aggregated list, reads the dataset back, and checks the
    column deserializes to the same File objects.
    """
    for file_name in ("a.txt", "b.txt", "c.txt"):
        (tmp_dir / file_name).write_text(file_name, encoding="utf-8")

    chain = dc.read_storage(tmp_dir.as_uri(), session=test_session).order_by(
        "file.path"
    )

    # Collect the File objects we expect to see after the round trip.
    expected: list[File] = []
    for (entry,) in chain.select("file").to_iter():
        assert isinstance(entry, File)
        expected.append(entry)

    def gather_files(file: list[File]) -> Iterator[list[File]]:
        yield file

    chain.agg(files=gather_files).save("temp_files_only")

    # When reading back, we should get a list of File objects
    result = dc.read_dataset("temp_files_only", session=test_session)
    rows = result.select("files").to_list()
    assert len(rows) == 1
    round_tripped = rows[0][0]
    assert isinstance(round_tripped, list)
    assert all(isinstance(item, File) for item in round_tripped)

    # Compare order-independently via model dumps.
    by_path = lambda f: f.path  # noqa: E731
    assert [f.model_dump() for f in sorted(round_tripped, key=by_path)] == [
        f.model_dump() for f in sorted(expected, key=by_path)
    ]
0 commit comments