ray-project · kyuds · Nov 22, 2025 · Nov 22, 2025 · Nov 22, 2025 · Nov 22, 2025
@@ -16,6 +16,7 @@
 )
 
 import numpy as np
+import pyarrow as pa
 import pyarrow.compute as pc
 
 from ray.data._internal.util import is_null
@@ -916,29 +917,41 @@ class Unique(AggregateFnV2[Set[Any], List[Any]]):
         ignore_nulls: Whether to ignore null values when collecting unique items.
                       Default is True (nulls are excluded).
         alias_name: Optional name for the resulting column.
+        encode_lists: If `True`, flatten list-typed columns by one level and
+            collect the unique list elements. If `False`, treat each whole list
+            as a single value. Defaults to `False`. Flattening is top-level
+            only (not recursive).
     """
 
     def __init__(
         self,
         on: Optional[str] = None,
         ignore_nulls: bool = True,
         alias_name: Optional[str] = None,
+        encode_lists: bool = False,
     ):
+        # The output column is named "unique(<on>)" unless an alias is given.
+        # `zero_factory=set` is handed to the base class, presumably so each
+        # accumulator starts as an empty set (confirm in AggregateFnV2).
         super().__init__(
             alias_name if alias_name else f"unique({str(on)})",
             on=on,
             ignore_nulls=ignore_nulls,
             zero_factory=set,
         )
+        # Read by `aggregate_block` to decide whether list columns are flattened.
+        self._encode_lists = encode_lists
 
     def combine(self, current_accumulator: Set[Any], new: Set[Any]) -> Set[Any]:
+        # Union the two partial results. Both sides go through `_to_set` first,
+        # presumably because a partial result may arrive as a list (confirm).
         return self._to_set(current_accumulator) | self._to_set(new)
 
     def aggregate_block(self, block: Block) -> List[Any]:
-        import pyarrow.compute as pac
-
+        """Return the distinct values of the target column in ``block``.
+
+        Each value is pickled and hex-encoded so that unhashable values (for
+        example lists) can be de-duplicated and later stored in a set;
+        ``finalize`` reverses the encoding.
+
+        Args:
+            block: The block to aggregate.
+
+        Returns:
+            Hex-encoded pickles of the distinct values, in first-seen order.
+        """
         col = BlockAccessor.for_block(block).to_arrow().column(self._target_col_name)
-        return pac.unique(col).to_pylist()
+        col_type = col.type
+        # `large_list` columns are list-typed too; checking only `is_list`
+        # would silently skip flattening for them.
+        is_list_like = pa.types.is_list(col_type) or pa.types.is_large_list(
+            col_type
+        )
+        if self._encode_lists and is_list_like:
+            # Top-level flatten only: nested lists remain intact as elements.
+            col = pc.list_flatten(col)
+        if self._ignore_nulls:
+            col = pc.drop_null(col)
+        # Convert to Python objects in one call rather than scalar by scalar.
+        pickled = [pickle.dumps(v).hex() for v in col.to_pylist()]
+        # The encoded values are plain strings, so de-duplicate them directly
+        # instead of round-tripping through an Arrow array.
+        return list(dict.fromkeys(pickled))
+
+    def finalize(self, accumulator: Set[Any]) -> List[Any]:
+        # Accumulated entries are hex strings of pickled values (produced in
+        # `aggregate_block`); decode each one back into the original object.
+        return [pickle.loads(bytes.fromhex(v)) for v in accumulator]
 
     @staticmethod
     def _to_set(x):

@@ -2953,7 +2953,7 @@ def unique(self, column: str) -> List[Any]:
 
             >>> import ray
             >>> ds = ray.data.read_csv("s3://anonymous@ray-example-data/iris.csv")
-            >>> ds.unique("target")
+            >>> sorted(ds.unique("target"))
             [0, 1, 2]
 
             One common use case is to convert the class labels
@@ -2976,7 +2976,7 @@ def unique(self, column: str) -> List[Any]:
         Returns:
             A list with unique elements in the given column.
         """  # noqa: E501
-        ret = self._aggregate_on(Unique, column)
+        ret = self._aggregate_on(Unique, column, ignore_nulls=False)
         return self._aggregate_result(ret)
 
     @AllToAllAPI

@@ -1,3 +1,5 @@
+from collections import Counter
+
 import numpy as np
 import pytest
 
@@ -6,6 +8,7 @@
     ApproximateQuantile,
     ApproximateTopK,
     MissingValuePercentage,
+    Unique,
     ZeroPercentage,
 )
 from ray.data.tests.conftest import *  # noqa
@@ -496,6 +499,68 @@ def test_approximate_topk_encode_lists(self, ray_start_regular_shared_2_cpus):
         assert result["approx_topk(id)"][2] == {"id": 3, "count": 1}
 
 
+class TestUnique:
+    """Test cases for Unique aggregation."""
+
+    def test_unique_basic(self, ray_start_regular_shared_2_cpus):
+        """Test basic Unique aggregation."""
+        data = [{"id": "a"}, {"id": "b"}, {"id": "b"}, {"id": None}]
+        ds = ray.data.from_items(data)
+        result = ds.aggregate(Unique(on="id", ignore_nulls=False))
+
+        answer = ["a", "b", None]
+
+        # Counter comparison is order-insensitive and tolerates None, which
+        # `sorted` cannot compare against strings.
+        assert Counter(result["unique(id)"]) == Counter(answer)
+
+    def test_unique_ignores_nulls(self, ray_start_regular_shared_2_cpus):
+        """Test Unique properly ignores nulls."""
+        data = [{"id": "a"}, {"id": None}, {"id": "b"}, {"id": "b"}, {"id": None}]
+        ds = ray.data.from_items(data)
+        result = ds.aggregate(Unique(on="id"))
+
+        assert sorted(result["unique(id)"]) == ["a", "b"]
+
+    def test_unique_custom_alias(self, ray_start_regular_shared_2_cpus):
+        """Test Unique with custom alias."""
+        data = [{"id": "a"}, {"id": "b"}, {"id": "b"}]
+        ds = ray.data.from_items(data)
+        result = ds.aggregate(Unique(on="id", alias_name="custom"))
+
+        assert sorted(result["custom"]) == ["a", "b"]
+
+    def test_unique_list_datatype(self, ray_start_regular_shared_2_cpus):
+        """Test Unique works with non-hashable types like list."""
+        data = [
+            {"id": ["a", "b", "c"]},
+            {"id": ["a", "b", "c"]},
+            {"id": ["a", "b", "c"]},
+        ]
+        ds = ray.data.from_items(data)
+        result = ds.aggregate(Unique(on="id"))
+
+        # Compare the whole result, not just element 0, so the test fails if
+        # the three identical lists are not collapsed into one.
+        assert result["unique(id)"] == [["a", "b", "c"]]
+
+    def test_unique_encode_lists(self, ray_start_regular_shared_2_cpus):
+        """Test Unique works when encode_lists is True."""
+        data = [{"id": ["a", "b", "c"]}, {"id": ["a", "a", "a", "b", None]}]
+        ds = ray.data.from_items(data)
+        result = ds.aggregate(Unique(on="id", encode_lists=True, ignore_nulls=False))
+
+        answer = ["a", "b", "c", None]
+
+        assert Counter(result["unique(id)"]) == Counter(answer)
+
+    def test_unique_encode_lists_ignores_nulls(self, ray_start_regular_shared_2_cpus):
+        """Test Unique will drop null values when encode_lists is True."""
+        data = [{"id": ["a", "b", "c"]}, {"id": ["a", "a", "a", "b", None]}]
+        ds = ray.data.from_items(data)
+        result = ds.aggregate(Unique(on="id", encode_lists=True))
+
+        answer = ["a", "b", "c"]
+
+        assert Counter(result["unique(id)"]) == Counter(answer)
+
+
 if __name__ == "__main__":
     import sys
 

@@ -751,7 +751,7 @@ def test_groupby_aggregations_are_associative(
         Mean("B", alias_name="mean_b", ignore_nulls=ignore_nulls),
         Std("B", alias_name="std_b", ignore_nulls=ignore_nulls),
         Quantile("B", alias_name="quantile_b", ignore_nulls=ignore_nulls),
-        Unique("B", alias_name="unique_b"),
+        Unique("B", alias_name="unique_b", ignore_nulls=False),
     ]
 
     # Step 0: Prepare expected output (using Pandas)