
Commit c057f1e

Authored by omatthew98, iamjustinhsu, and alexeykudinkin
[data] Cherry pick data fixes for 2.49.1 (#56058)
Cherry pick two fixes for ray data (from #55854 and #55926).

---------

Signed-off-by: iamjustinhsu <[email protected]>
Signed-off-by: Alexey Kudinkin <[email protected]>
Signed-off-by: Matthew Owen <[email protected]>
Co-authored-by: iamjustinhsu <[email protected]>
Co-authored-by: Alexey Kudinkin <[email protected]>
1 parent: 7cc0031

File tree

21 files changed: +269 -63 lines changed


python/ray/air/util/object_extensions/arrow.py

Lines changed: 3 additions & 0 deletions
@@ -71,6 +71,9 @@ def __reduce__(self):
             self.__arrow_ext_serialize__(),
         )
 
+    def __hash__(self) -> int:
+        return hash((type(self), self.storage_type.id, self.extension_name))
+
 
 @PublicAPI(stability="alpha")
 class ArrowPythonObjectScalar(pa.ExtensionScalar):
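
For context on why this `__hash__` definition is needed: in Python, a class that defines `__eq__` without also defining `__hash__` has its instances become unhashable, which would block the set-based schema deduplication introduced elsewhere in this commit. A minimal sketch of the rule in plain Python (class names are illustrative, not Ray API):

    class WithoutHash:
        def __init__(self, name: str):
            self.name = name

        def __eq__(self, other) -> bool:
            return isinstance(other, WithoutHash) and self.name == other.name


    class WithHash(WithoutHash):
        def __hash__(self) -> int:
            # Same pattern as the diff: hash the fields that identify the type.
            return hash((type(self), self.name))


    try:
        {WithoutHash("obj")}
    except TypeError as err:
        print(err)  # unhashable type: 'WithoutHash'

    assert len({WithHash("obj"), WithHash("obj")}) == 1  # dedupes as intended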

python/ray/air/util/tensor_extensions/arrow.py

Lines changed: 8 additions & 0 deletions
@@ -574,6 +574,9 @@ def _need_variable_shaped_tensor_array(
                 shape = arr_type.shape
         return False
 
+    def __hash__(self) -> int:
+        return hash((type(self), self.extension_name, self.storage_type, self._shape))
+
 
 @PublicAPI(stability="beta")
 class ArrowTensorType(_BaseFixedShapeArrowTensorType):
@@ -584,6 +587,7 @@ class ArrowTensorType(_BaseFixedShapeArrowTensorType):
     """
 
     OFFSET_DTYPE = np.int32
+    __hash__ = _BaseFixedShapeArrowTensorType.__hash__
 
     def __init__(self, shape: Tuple[int, ...], dtype: pa.DataType):
         """
@@ -614,6 +618,7 @@ class ArrowTensorTypeV2(_BaseFixedShapeArrowTensorType):
     """Arrow ExtensionType (v2) for tensors (supporting tensors > 4Gb)."""
 
     OFFSET_DTYPE = np.int64
+    __hash__ = _BaseFixedShapeArrowTensorType.__hash__
 
     def __init__(self, shape: Tuple[int, ...], dtype: pa.DataType):
         """
@@ -1125,6 +1130,9 @@ def _extension_scalar_to_ndarray(self, scalar: "pa.ExtensionScalar") -> np.ndarray:
         data_buffer = raw_values.buffers()[1]
         return _to_ndarray_helper(shape, value_type, offset, data_buffer)
 
+    def __hash__(self) -> int:
+        return hash((type(self), self.extension_name, self.storage_type, self._ndim))
+
 
 # NOTE: We need to inherit from the mixin before pa.ExtensionArray to ensure that the
 # mixin's overriding methods appear first in the MRO.
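
The `__hash__ = _BaseFixedShapeArrowTensorType.__hash__` assignments look redundant but are deliberate: per the Python data model, a class that overrides `__eq__` has its `__hash__` implicitly set to None, even when a parent class provides one, so the parent implementation must be restored by hand. A standalone sketch of that behavior (names illustrative):

    class Base:
        def __hash__(self) -> int:
            return hash(type(self))


    class Child(Base):
        # Overriding __eq__ silently sets Child.__hash__ to None...
        def __eq__(self, other) -> bool:
            return type(self) is type(other)

        # ...so the parent's hash must be restored explicitly, as the diff
        # does for ArrowTensorType and ArrowTensorTypeV2.
        __hash__ = Base.__hash__


    assert hash(Child()) == hash(Child())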

python/ray/data/BUILD

Lines changed: 14 additions & 0 deletions
@@ -1239,6 +1239,20 @@ py_test(
     ],
 )
 
+py_test(
+    name = "test_unify_schemas_performance",
+    size = "small",
+    srcs = ["tests/test_unify_schemas_performance.py"],
+    tags = [
+        "exclusive",
+        "team:data",
+    ],
+    deps = [
+        ":conftest",
+        "//:ray_lib",
+    ],
+)
+
 py_test(
     name = "test_util",
     size = "small",
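
With this BUILD entry in place, the new test should run like any other Bazel py_test target in the repo, presumably along the lines of `bazel test //python/ray/data:test_unify_schemas_performance` from the repository root.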

python/ray/data/_internal/arrow_ops/transform_pyarrow.py

Lines changed: 8 additions & 0 deletions
@@ -172,6 +172,14 @@ def unify_schemas(
         ArrowVariableShapedTensorType,
     )
 
+    try:
+        if len(set(schemas)) == 1:
+            # Early exit because unifying can be expensive
+            return schemas.pop()
+    except Exception as e:
+        # Unsure if there are cases where schemas are NOT hashable
+        logger.warning(f"Failed to hash the schemas (for deduplication): {e}")
+
     schemas_to_unify = []
     schema_field_overrides = {}
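
This is the heart of the performance fix: when every incoming schema is identical, hashing them into a set short-circuits the expensive unification path, with a logged fallback if hashing fails. A standalone sketch of the same fast path, assuming hashable pyarrow schemas (`dedupe_or_none` is a hypothetical helper, not Ray API):

    import pyarrow as pa


    def dedupe_or_none(schemas):
        try:
            distinct = set(schemas)  # cheap when schemas hash and compare equal
        except TypeError:
            return None  # unhashable schema; caller falls back to full unification
        return distinct.pop() if len(distinct) == 1 else None


    s = pa.schema([("a", pa.int64()), ("b", pa.string())])
    assert dedupe_or_none([s, s, s]) is s
    assert dedupe_or_none([s, pa.schema([("c", pa.float64())])]) is None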

python/ray/data/_internal/equalize.py

Lines changed: 7 additions & 3 deletions
@@ -2,8 +2,12 @@
 
 from ray.data._internal.execution.interfaces import RefBundle
 from ray.data._internal.split import _calculate_blocks_rows, _split_at_indices
-from ray.data._internal.util import unify_ref_bundles_schema
-from ray.data.block import Block, BlockMetadata, BlockPartition
+from ray.data.block import (
+    Block,
+    BlockMetadata,
+    BlockPartition,
+    _take_first_non_empty_schema,
+)
 from ray.types import ObjectRef
 
 
@@ -41,7 +45,7 @@ def _equalize(
 
     # phase 2: based on the num rows needed for each shaved split, split the leftovers
     # in the shape that exactly matches the rows needed.
-    schema = unify_ref_bundles_schema(per_split_bundles)
+    schema = _take_first_non_empty_schema(bundle.schema for bundle in per_split_bundles)
     leftover_bundle = RefBundle(leftovers, owns_blocks=owned_by_consumer, schema=schema)
     leftover_splits = _split_leftovers(leftover_bundle, per_split_needed_rows)
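
`_take_first_non_empty_schema` is added to `ray.data.block` in this same commit, but its body isn't shown in these hunks. Based on the name and the call sites, a plausible sketch of its contract (a reconstruction, not the actual implementation):

    from typing import Any, Iterable, Optional


    def take_first_non_empty_schema(schemas: Iterable[Any]) -> Optional[Any]:
        # Lazily scan the per-bundle schemas and return the first one that
        # is present; return None if every bundle is missing a schema.
        for schema in schemas:
            if schema is not None:
                return schema
        return None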

python/ray/data/_internal/execution/interfaces/ref_bundle.py

Lines changed: 7 additions & 0 deletions
@@ -63,6 +63,13 @@ def __post_init__(self):
                 "The size in bytes of the block must be known: {}".format(b)
             )
 
+        import pyarrow as pa
+
+        # The schema metadata might be unhashable.
+        # We need schemas to be hashable for unification
+        if isinstance(self.schema, pa.lib.Schema):
+            self.schema = self.schema.remove_metadata()
+
     def __setattr__(self, key, value):
         if hasattr(self, key) and key in ["blocks", "owns_blocks"]:
             raise ValueError(f"The `{key}` field of RefBundle cannot be updated.")
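
`with_metadata()` and `remove_metadata()` are standard `pyarrow.Schema` methods; a small illustration of the normalization `__post_init__` now performs:

    import pyarrow as pa

    base = pa.schema([("a", pa.int64())])
    tagged = base.with_metadata({"origin": b"parquet"})  # attach key/value metadata

    assert tagged.metadata is not None
    stripped = tagged.remove_metadata()
    assert stripped.metadata is None  # metadata is gone...
    assert stripped.equals(base)      # ...while the fields are unchanged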

python/ray/data/_internal/execution/legacy_compat.py

Lines changed: 11 additions & 10 deletions
@@ -16,10 +16,11 @@
 from ray.data._internal.logical.util import record_operators_usage
 from ray.data._internal.plan import ExecutionPlan
 from ray.data._internal.stats import DatasetStats
-from ray.data._internal.util import (
-    unify_schemas_with_validation,
+from ray.data.block import (
+    BlockMetadata,
+    BlockMetadataWithSchema,
+    _take_first_non_empty_schema,
 )
-from ray.data.block import BlockMetadata, BlockMetadataWithSchema
 
 # Warn about tasks larger than this.
 TASK_SIZE_WARN_THRESHOLD_BYTES = 100000
@@ -171,18 +172,18 @@ def _get_initial_stats_from_plan(plan: ExecutionPlan) -> DatasetStats:
 def _bundles_to_block_list(bundles: Iterator[RefBundle]) -> BlockList:
     blocks, metadata = [], []
     owns_blocks = True
-    schemas = []
+    bundle_list = list(bundles)
+    schema = _take_first_non_empty_schema(
+        ref_bundle.schema for ref_bundle in bundle_list
+    )
 
-    for ref_bundle in bundles:
+    for ref_bundle in bundle_list:
         if not ref_bundle.owns_blocks:
             owns_blocks = False
         blocks.extend(ref_bundle.block_refs)
         metadata.extend(ref_bundle.metadata)
-        schemas.append(ref_bundle.schema)
-    unified_schema = unify_schemas_with_validation(schemas)
-    return BlockList(
-        blocks, metadata, owned_by_consumer=owns_blocks, schema=unified_schema
-    )
+
+    return BlockList(blocks, metadata, owned_by_consumer=owns_blocks, schema=schema)
 
 
 def _set_stats_uuid_recursive(stats: DatasetStats, dataset_uuid: str) -> None:
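
The switch to `bundle_list = list(bundles)` is not incidental: `bundles` is an `Iterator`, and scanning it once to pick out a schema would exhaust it before the block-collection loop ran. A generic illustration of the pitfall being avoided:

    it = iter([1, 2, 3])
    first = next(it, None)     # peeking at a bare iterator consumes it...
    assert list(it) == [2, 3]  # ...so later consumers see fewer items

    items = list(iter([1, 2, 3]))  # materialize once, then scan freely
    assert items[0] == 1
    assert items == [1, 2, 3]      # nothing is lost for the second pass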

python/ray/data/_internal/execution/operators/map_operator.py

Lines changed: 7 additions & 9 deletions
@@ -48,13 +48,14 @@
 )
 from ray.data._internal.execution.util import memory_string
 from ray.data._internal.stats import StatsDict
-from ray.data._internal.util import MemoryProfiler, unify_ref_bundles_schema
+from ray.data._internal.util import MemoryProfiler
 from ray.data.block import (
     Block,
     BlockAccessor,
     BlockExecStats,
     BlockMetadataWithSchema,
     BlockStats,
+    _take_first_non_empty_schema,
     to_stats,
 )
 from ray.data.context import DataContext
@@ -541,8 +542,6 @@ def _map_task(
     A generator of blocks, followed by the list of BlockMetadata for the blocks
     as the last generator return.
     """
-    from ray.data.block import BlockMetadataWithSchema
-
     logger.debug(
         "Executing map task of operator %s with task index %d",
         ctx.op_name,
@@ -662,14 +661,13 @@ def _get_bundle_size(bundle: RefBundle):
 def _merge_ref_bundles(*bundles: RefBundle) -> RefBundle:
     """Merge N ref bundles into a single bundle of multiple blocks."""
     # Check that at least one bundle is non-null.
-    assert any(bundle is not None for bundle in bundles)
+    bundles = [bundle for bundle in bundles if bundle is not None]
+    assert len(bundles) > 0
     blocks = list(
-        itertools.chain(
-            block for bundle in bundles if bundle is not None for block in bundle.blocks
-        )
+        itertools.chain(block for bundle in bundles for block in bundle.blocks)
     )
-    owns_blocks = all(bundle.owns_blocks for bundle in bundles if bundle is not None)
-    schema = unify_ref_bundles_schema(bundles)
+    owns_blocks = all(bundle.owns_blocks for bundle in bundles)
+    schema = _take_first_non_empty_schema(bundle.schema for bundle in bundles)
     return RefBundle(blocks, owns_blocks=owns_blocks, schema=schema)
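
The `_merge_ref_bundles` change is a readability refactor: filter out `None` inputs once up front, then let every later expression assume a clean list instead of repeating the `is not None` guard. A generic sketch of the same pattern:

    import itertools


    def merge(*maybe_seqs):
        # Filter once; everything below can assume non-None inputs.
        seqs = [s for s in maybe_seqs if s is not None]
        assert len(seqs) > 0
        return list(itertools.chain.from_iterable(seqs))


    assert merge([1, 2], None, [3]) == [1, 2, 3]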

python/ray/data/_internal/execution/streaming_executor_state.py

Lines changed: 7 additions & 4 deletions
@@ -284,7 +284,10 @@ def add_output(self, ref: RefBundle) -> None:
         """Move a bundle produced by the operator to its outqueue."""
 
         ref, diverged = dedupe_schemas_with_validation(
-            self._schema, ref, warn=not self._warned_on_schema_divergence
+            self._schema,
+            ref,
+            warn=not self._warned_on_schema_divergence,
+            enforce_schemas=self.op.data_context.enforce_schemas,
         )
         self._schema = ref.schema
         self._warned_on_schema_divergence |= diverged
@@ -756,7 +759,7 @@ def dedupe_schemas_with_validation(
     old_schema: Optional["Schema"],
     bundle: "RefBundle",
     warn: bool = True,
-    allow_divergent: bool = False,
+    enforce_schemas: bool = False,
 ) -> Tuple["RefBundle", bool]:
     """Unify/Dedupe two schemas, warning if warn=True
 
@@ -765,7 +768,7 @@
         the new schema will be used as the old schema.
         bundle: The new `RefBundle` to unify with the old schema.
         warn: Raise a warning if the schemas diverge.
-        allow_divergent: If `True`, allow the schemas to diverge and return unified schema.
+        enforce_schemas: If `True`, allow the schemas to diverge and return unified schema.
             If `False`, but keep the old schema.
 
     Returns:
@@ -792,7 +795,7 @@
             f"than the previous one. Previous schema: {old_schema}, "
             f"new schema: {bundle.schema}. This may lead to unexpected behavior."
         )
-        if allow_divergent:
+        if enforce_schemas:
            old_schema = unify_schemas_with_validation([old_schema, bundle.schema])
 
     return (
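
Besides the rename, the call site now threads `self.op.data_context.enforce_schemas` through instead of always relying on the default. A simplified sketch of the behavior the flag controls (not the real Ray implementation; `pa.unify_schemas` stands in for `unify_schemas_with_validation`):

    from typing import Optional, Tuple

    import pyarrow as pa


    def dedupe_schemas(
        old: Optional[pa.Schema], new: pa.Schema, enforce_schemas: bool = False
    ) -> Tuple[pa.Schema, bool]:
        # Returns (schema_to_keep, diverged).
        if old is None or old.equals(new):
            return new, False
        if enforce_schemas:
            return pa.unify_schemas([old, new]), True  # unify on divergence
        return old, True  # diverged, but keep the previous schema


    a = pa.schema([("x", pa.int64())])
    b = pa.schema([("x", pa.int64()), ("y", pa.string())])
    assert dedupe_schemas(a, a) == (a, False)
    assert dedupe_schemas(a, b, enforce_schemas=True)[0].equals(b)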

python/ray/data/_internal/logical/operators/from_operators.py

Lines changed: 8 additions & 5 deletions
@@ -4,8 +4,12 @@
 
 from ray.data._internal.execution.interfaces import RefBundle
 from ray.data._internal.logical.interfaces import LogicalOperator, SourceOperator
-from ray.data._internal.util import unify_block_metadata_schema
-from ray.data.block import Block, BlockMetadata, BlockMetadataWithSchema
+from ray.data._internal.util import unify_ref_bundles_schema
+from ray.data.block import (
+    Block,
+    BlockMetadata,
+    BlockMetadataWithSchema,
+)
 from ray.types import ObjectRef
 
 if TYPE_CHECKING:
@@ -28,12 +32,11 @@ def __init__(
             len(input_metadata),
         )
         # `owns_blocks` is False because this op may be shared by multiple Datasets.
-        self._schema = unify_block_metadata_schema(input_metadata)
         self._input_data = [
             RefBundle(
                 [(input_blocks[i], input_metadata[i])],
                 owns_blocks=False,
-                schema=self._schema,
+                schema=input_metadata[i].schema,
             )
             for i in range(len(input_blocks))
         ]
@@ -71,7 +74,7 @@ def infer_metadata(self) -> BlockMetadata:
         return self._cached_output_metadata
 
     def infer_schema(self):
-        return self._schema
+        return unify_ref_bundles_schema(self._input_data)
 
     def is_lineage_serializable(self) -> bool:
         # This operator isn't serializable because it contains ObjectRefs.
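
The net effect in this operator is an eager-to-lazy shift: instead of unifying all input schemas in `__init__` (paying the cost even when no caller asks for it), each `RefBundle` keeps its own schema and unification is deferred to `infer_schema()`. A generic sketch of the pattern (not Ray's actual operator):

    import pyarrow as pa


    class LazySchemaSource:
        def __init__(self, schemas):
            self._schemas = list(schemas)  # cheap: no unification up front

        def infer_schema(self) -> pa.Schema:
            # Pay the unification cost only when a schema is actually needed.
            return pa.unify_schemas(self._schemas)


    src = LazySchemaSource([pa.schema([("a", pa.int64())])] * 3)
    assert src.infer_schema().equals(pa.schema([("a", pa.int64())]))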
