
Commit 1228933

[data] Fix errors with concatenation with mixed pyarrow native and extension types (#57566)
## Why are these changes needed?

Cherry-pick #56811. Original description:

If an execution needed to concatenate native pyarrow types and pyarrow extension types, it would fail with errors like the following:

```
⚠️ Dataset dataset_5_0 execution failed: : 0.00 row [00:00, ? row/s]
- Repartition 1: 0.00 row [00:00, ? row/s]
*- Split Repartition: : 0.00 row [00:00, ? row/s]
2025-09-22 17:21:34,068 ERROR exceptions.py:73 -- Exception occurred in Ray Data or Ray Core internal code. If you continue to see this error, please open an issue on the Ray project GitHub page with the full stack trace below: https://github.com/ray-project/ray/issues/new/choose
2025-09-22 17:21:34,068 ERROR exceptions.py:81 -- Full stack trace:
Traceback (most recent call last):
  File "/Users/mowen/code/ray/python/ray/data/exceptions.py", line 49, in handle_trace
    return fn(*args, **kwargs)
  File "/Users/mowen/code/ray/python/ray/data/_internal/plan.py", line 533, in execute
    blocks = execute_to_legacy_block_list(
  File "/Users/mowen/code/ray/python/ray/data/_internal/execution/legacy_compat.py", line 127, in execute_to_legacy_block_list
    block_list = _bundles_to_block_list(bundles)
  File "/Users/mowen/code/ray/python/ray/data/_internal/execution/legacy_compat.py", line 175, in _bundles_to_block_list
    bundle_list = list(bundles)
  File "/Users/mowen/code/ray/python/ray/data/_internal/execution/interfaces/executor.py", line 34, in __next__
    return self.get_next()
  File "/Users/mowen/code/ray/python/ray/data/_internal/execution/streaming_executor.py", line 680, in get_next
    bundle = state.get_output_blocking(output_split_idx)
  File "/Users/mowen/code/ray/python/ray/data/_internal/execution/streaming_executor_state.py", line 373, in get_output_blocking
    raise self._exception
  File "/Users/mowen/code/ray/python/ray/data/_internal/execution/streaming_executor.py", line 331, in run
    continue_sched = self._scheduling_loop_step(self._topology)
  File "/Users/mowen/code/ray/python/ray/data/_internal/execution/streaming_executor.py", line 475, in _scheduling_loop_step
    update_operator_states(topology)
  File "/Users/mowen/code/ray/python/ray/data/_internal/execution/streaming_executor_state.py", line 586, in update_operator_states
    op.all_inputs_done()
  File "/Users/mowen/code/ray/python/ray/data/_internal/execution/operators/base_physical_operator.py", line 122, in all_inputs_done
    self._output_buffer, self._stats = self._bulk_fn(self._input_buffer, ctx)
  File "/Users/mowen/code/ray/python/ray/data/_internal/planner/repartition.py", line 84, in split_repartition_fn
    return scheduler.execute(refs, num_outputs, ctx)
  File "/Users/mowen/code/ray/python/ray/data/_internal/planner/exchange/split_repartition_task_scheduler.py", line 106, in execute
    ] = reduce_bar.fetch_until_complete(list(reduce_metadata_schema))
  File "/Users/mowen/code/ray/python/ray/data/_internal/progress_bar.py", line 166, in fetch_until_complete
    for ref, result in zip(done, ray.get(done)):
  File "/Users/mowen/code/ray/python/ray/_private/auto_init_hook.py", line 22, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/Users/mowen/code/ray/python/ray/_private/client_mode_hook.py", line 104, in wrapper
    return func(*args, **kwargs)
  File "/Users/mowen/code/ray/python/ray/_private/worker.py", line 2952, in get
    values, debugger_breakpoint = worker.get_objects(
  File "/Users/mowen/code/ray/python/ray/_private/worker.py", line 1025, in get_objects
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(RuntimeError): ray::reduce() (pid=7442, ip=127.0.0.1)
  File "/Users/mowen/code/ray/python/ray/data/_internal/planner/exchange/shuffle_task_spec.py", line 128, in reduce
    new_block = builder.build()
  File "/Users/mowen/code/ray/python/ray/data/_internal/delegating_block_builder.py", line 68, in build
    return self._builder.build()
  File "/Users/mowen/code/ray/python/ray/data/_internal/table_block.py", line 144, in build
    return self._concat_tables(tables)
  File "/Users/mowen/code/ray/python/ray/data/_internal/arrow_block.py", line 161, in _concat_tables
    return transform_pyarrow.concat(tables, promote_types=True)
  File "/Users/mowen/code/ray/python/ray/data/_internal/arrow_ops/transform_pyarrow.py", line 706, in concat
    col = _concatenate_chunked_arrays(col_chunked_arrays)
  File "/Users/mowen/code/ray/python/ray/data/_internal/arrow_ops/transform_pyarrow.py", line 397, in _concatenate_chunked_arrays
    raise RuntimeError(f"Types mismatch: {type_} != {arr.type}")
RuntimeError: Types mismatch: uint64 != double
2025-09-22 17:21:34,069 ERROR worker.py:429 -- Unhandled error (suppress with 'RAY_IGNORE_UNHANDLED_ERRORS=1'): ray::reduce() (pid=7442, ip=127.0.0.1)
  File "/Users/mowen/code/ray/python/ray/data/_internal/planner/exchange/shuffle_task_spec.py", line 128, in reduce
    new_block = builder.build()
  File "/Users/mowen/code/ray/python/ray/data/_internal/delegating_block_builder.py", line 68, in build
    return self._builder.build()
  File "/Users/mowen/code/ray/python/ray/data/_internal/table_block.py", line 144, in build
    return self._concat_tables(tables)
  File "/Users/mowen/code/ray/python/ray/data/_internal/arrow_block.py", line 161, in _concat_tables
    return transform_pyarrow.concat(tables, promote_types=True)
  File "/Users/mowen/code/ray/python/ray/data/_internal/arrow_ops/transform_pyarrow.py", line 706, in concat
    col = _concatenate_chunked_arrays(col_chunked_arrays)
  File "/Users/mowen/code/ray/python/ray/data/_internal/arrow_ops/transform_pyarrow.py", line 397, in _concatenate_chunked_arrays
    raise RuntimeError(f"Types mismatch: {type_} != {arr.type}")
RuntimeError: Types mismatch: uint64 != double
```

This PR adds a test that replicates the failure and fixes the underlying issue by concatenating extension-typed and native-typed columns separately before rejoining them.

## Related issue number

<!-- For example: "Closes #1234" -->

## Checks

- [ ] I've signed off every commit (by using the -s flag, i.e., `git commit -s`) in this PR.
- [ ] I've run pre-commit jobs to lint the changes in this PR. ([pre-commit setup](https://docs.ray.io/en/latest/ray-contribute/getting-involved.html#lint-and-formatting))
- [ ] I've included any doc changes needed for https://docs.ray.io/en/master/.
- [ ] I've added any new APIs to the API Reference. For example, if I added a method in Tune, I've added it in `doc/source/tune/api/` under the corresponding `.rst` file.
- [ ] I've made sure the tests are passing. Note that there might be a few flaky tests; see the recent failures at https://flakey-tests.ray.io/
- Testing Strategy
  - [ ] Unit tests
  - [ ] Release tests
  - [ ] This PR is not tested :(
1 parent 2ece9ff · commit 1228933

File tree

2 files changed: +192 −88 lines changed


python/ray/data/_internal/arrow_ops/transform_pyarrow.py

Lines changed: 136 additions & 88 deletions
```diff
@@ -584,6 +584,106 @@ def shuffle(block: "pyarrow.Table", seed: Optional[int] = None) -> "pyarrow.Table":
     return take_table(block, indices)
 
 
+def _concat_cols_with_null_list(
+    col_chunked_arrays: List["pyarrow.ChunkedArray"],
+) -> "pyarrow.ChunkedArray":
+    import pyarrow as pa
+
+    # For each opaque list column, iterate through all schemas until
+    # we find a valid value_type that can be used to override the
+    # column types in the following for-loop.
+    scalar_type = None
+    for arr in col_chunked_arrays:
+        if not pa.types.is_list(arr.type) or not pa.types.is_null(arr.type.value_type):
+            scalar_type = arr.type
+            break
+
+    if scalar_type is not None:
+        for c_idx in range(len(col_chunked_arrays)):
+            c = col_chunked_arrays[c_idx]
+            if pa.types.is_list(c.type) and pa.types.is_null(c.type.value_type):
+                if pa.types.is_list(scalar_type):
+                    # If we are dealing with a list input,
+                    # cast the array to the scalar_type found above.
+                    col_chunked_arrays[c_idx] = c.cast(scalar_type)
+                else:
+                    # If we are dealing with a single value, construct
+                    # a new array with null values filled.
+                    col_chunked_arrays[c_idx] = pa.chunked_array(
+                        [pa.nulls(c.length(), type=scalar_type)]
+                    )
+
+    return _concatenate_chunked_arrays(col_chunked_arrays)
+
+
+def _concat_cols_with_extension_tensor_types(
+    col_chunked_arrays: List["pyarrow.ChunkedArray"],
+) -> "pyarrow.ChunkedArray":
+    import pyarrow as pa
+
+    # For our tensor extension types, manually construct a chunked array
+    # containing chunks from all blocks. This is to handle
+    # homogeneous-shaped block columns having different shapes across
+    # blocks: if tensor element shapes differ across blocks, a
+    # variable-shaped tensor array will be returned.
+    combined_chunks = list(
+        itertools.chain(*[chunked.iterchunks() for chunked in col_chunked_arrays])
+    )
+
+    return pa.chunked_array(unify_tensor_arrays(combined_chunks))
+
+
+def _concat_cols_with_extension_object_types(
+    col_chunked_arrays: List["pyarrow.ChunkedArray"],
+) -> "pyarrow.ChunkedArray":
+    import pyarrow as pa
+
+    from ray.data.extensions import ArrowPythonObjectArray, ArrowPythonObjectType
+
+    chunks_to_concat = []
+    # Cast everything to objects if concatenated with an object column.
+    for ca in col_chunked_arrays:
+        for chunk in ca.chunks:
+            if isinstance(ca.type, ArrowPythonObjectType):
+                chunks_to_concat.append(chunk)
+            else:
+                chunks_to_concat.append(
+                    ArrowPythonObjectArray.from_objects(chunk.to_pylist())
+                )
+    return pa.chunked_array(chunks_to_concat)
+
+
+def _concat_cols_with_native_pyarrow_types(
+    col_names: List[str], blocks: List["pyarrow.Table"], promote_types: bool = False
+) -> Dict[str, "pyarrow.ChunkedArray"]:
+    if not col_names:
+        return {}
+
+    # For columns with native pyarrow types, use the built-in pyarrow.concat_tables.
+    import pyarrow as pa
+
+    # When concatenating tables we allow type promotions to occur, since
+    # no schema enforcement is currently performed, therefore allowing schemas
+    # to vary between blocks.
+    #
+    # NOTE: Type promotions aren't available in Arrow < 14.0.
+    subset_blocks = []
+    for block in blocks:
+        cols_to_select = [
+            col_name for col_name in col_names if col_name in block.schema.names
+        ]
+        subset_blocks.append(block.select(cols_to_select))
+    if get_pyarrow_version() < parse_version("14.0.0"):
+        table = pa.concat_tables(subset_blocks, promote=True)
+    else:
+        arrow_promote_types_mode = "permissive" if promote_types else "default"
+        table = pa.concat_tables(
+            subset_blocks, promote_options=arrow_promote_types_mode
+        )
+    return {col_name: table.column(col_name) for col_name in table.schema.names}
+
+
 def concat(
     blocks: List["pyarrow.Table"], *, promote_types: bool = False
 ) -> "pyarrow.Table":
@@ -594,7 +694,6 @@ def concat(
 
     from ray.air.util.tensor_extensions.arrow import ArrowConversionError
     from ray.data.extensions import (
-        ArrowPythonObjectArray,
         ArrowPythonObjectType,
         get_arrow_extension_tensor_types,
     )
@@ -624,104 +723,53 @@
     # Handle alignment of struct type columns.
     blocks = _align_struct_fields(blocks, schema)
 
-    # Rollup columns with opaque (null-typed) lists, to process in following for-loop.
+    # Identify columns with null lists
     cols_with_null_list = set()
     for b in blocks:
         for col_name in b.schema.names:
             col_type = b.schema.field(col_name).type
             if pa.types.is_list(col_type) and pa.types.is_null(col_type.value_type):
                 cols_with_null_list.add(col_name)
 
-    if (
-        any(isinstance(type_, pa.ExtensionType) for type_ in schema.types)
-        or cols_with_null_list
-    ):
-        # Custom handling for extension array columns.
-        cols = []
-        for col_name in schema.names:
-            col_chunked_arrays = []
-            for block in blocks:
-                col_chunked_arrays.append(block.column(col_name))
-
-            if isinstance(schema.field(col_name).type, tensor_types):
-                # For our tensor extension types, manually construct a chunked array
-                # containing chunks from all blocks. This is to handle
-                # homogeneous-shaped block columns having different shapes across
-                # blocks: if tensor element shapes differ across blocks, a
-                # variable-shaped tensor array will be returned.
-                combined_chunks = list(
-                    itertools.chain(
-                        *[chunked.iterchunks() for chunked in col_chunked_arrays]
-                    )
-                )
+    # Concatenate the columns according to their type
+    concatenated_cols = {}
+    native_pyarrow_cols = []
+    for col_name in schema.names:
+        col_type = schema.field(col_name).type
 
-                col = pa.chunked_array(unify_tensor_arrays(combined_chunks))
-            elif isinstance(schema.field(col_name).type, ArrowPythonObjectType):
-                chunks_to_concat = []
-                # Cast everything to objects if concatenated with an object column
-                for ca in col_chunked_arrays:
-                    for chunk in ca.chunks:
-                        if isinstance(ca.type, ArrowPythonObjectType):
-                            chunks_to_concat.append(chunk)
-                        else:
-                            chunks_to_concat.append(
-                                ArrowPythonObjectArray.from_objects(chunk.to_pylist())
-                            )
-                col = pa.chunked_array(chunks_to_concat)
+        col_chunked_arrays = []
+        for block in blocks:
+            if col_name in block.schema.names:
+                col_chunked_arrays.append(block.column(col_name))
             else:
-                if col_name in cols_with_null_list:
-                    # For each opaque list column, iterate through all schemas until
-                    # we find a valid value_type that can be used to override the
-                    # column types in the following for-loop.
-                    scalar_type = None
-                    for arr in col_chunked_arrays:
-                        if not pa.types.is_list(arr.type) or not pa.types.is_null(
-                            arr.type.value_type
-                        ):
-                            scalar_type = arr.type
-                            break
-
-                    if scalar_type is not None:
-                        for c_idx in range(len(col_chunked_arrays)):
-                            c = col_chunked_arrays[c_idx]
-                            if pa.types.is_list(c.type) and pa.types.is_null(
-                                c.type.value_type
-                            ):
-                                if pa.types.is_list(scalar_type):
-                                    # If we are dealing with a list input,
-                                    # cast the array to the scalar_type found above.
-                                    col_chunked_arrays[c_idx] = c.cast(scalar_type)
-                                else:
-                                    # If we are dealing with a single value, construct
-                                    # a new array with null values filled.
-                                    col_chunked_arrays[c_idx] = pa.chunked_array(
-                                        [pa.nulls(c.length(), type=scalar_type)]
-                                    )
-
-                    col = _concatenate_chunked_arrays(col_chunked_arrays)
-            cols.append(col)
-
-        # Build the concatenated table.
-        table = pyarrow.Table.from_arrays(cols, schema=schema)
-        # Validate table schema (this is a cheap check by default).
-        table.validate()
-    else:
-        # No extension array columns, so use built-in pyarrow.concat_tables.
-
-        # When concatenating tables we allow type promotions to occur, since
-        # no schema enforcement is currently performed, therefore allowing schemas
-        # to vary b/w blocks
-        #
-        # NOTE: Type promotions aren't available in Arrow < 14.0
-        if get_pyarrow_version() < parse_version("14.0.0"):
-            table = pyarrow.concat_tables(blocks, promote=True)
-        else:
-            arrow_promote_types_mode = "permissive" if promote_types else "default"
-            table = pyarrow.concat_tables(
-                blocks, promote_options=arrow_promote_types_mode
+                col_chunked_arrays.append(pa.nulls(block.num_rows, type=col_type))
+
+        if col_name in cols_with_null_list:
+            concatenated_cols[col_name] = _concat_cols_with_null_list(
+                col_chunked_arrays
+            )
+        elif isinstance(col_type, tensor_types):
+            concatenated_cols[col_name] = _concat_cols_with_extension_tensor_types(
+                col_chunked_arrays
+            )
+        elif isinstance(col_type, ArrowPythonObjectType):
+            concatenated_cols[col_name] = _concat_cols_with_extension_object_types(
+                col_chunked_arrays
             )
+        else:
+            # Add to the list of native pyarrow columns; these will be
+            # concatenated after the loop using pyarrow.concat_tables.
+            native_pyarrow_cols.append(col_name)
 
-    return table
+    concatenated_cols.update(
+        _concat_cols_with_native_pyarrow_types(
+            native_pyarrow_cols, blocks, promote_types
        )
+    )
+
+    # Ensure that the columns are in the same order as the schema, then
+    # reconstruct the table.
+    return pyarrow.Table.from_arrays(
+        [concatenated_cols[col_name] for col_name in schema.names], schema=schema
+    )
 
 
 def concat_and_sort(
```
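To make the new per-column flow concrete, here is a simplified sketch with a toy schema (an illustration, not the committed code): blocks that lack a column contribute typed nulls, mirroring `pa.nulls(block.num_rows, type=col_type)` in the diff, and the table is rebuilt in schema order at the end. The committed version additionally routes native-typed columns through `pyarrow.concat_tables` (via `_concat_cols_with_native_pyarrow_types`) so type promotion still applies.

```python
# Simplified illustration of the per-column pass in the rewritten concat()
# (toy data and schema; not the committed implementation).
import pyarrow as pa

schema = pa.schema([("a", pa.int64()), ("b", pa.float64())])
blocks = [pa.table({"a": [1, 2]}), pa.table({"b": [3.0, 4.0]})]

columns = {}
for name in schema.names:
    col_type = schema.field(name).type
    chunks = []
    for block in blocks:
        if name in block.schema.names:
            # Reuse the block's chunks for this column.
            chunks.extend(block.column(name).chunks)
        else:
            # Blocks missing the column contribute typed nulls.
            chunks.append(pa.nulls(block.num_rows, type=col_type))
    columns[name] = pa.chunked_array(chunks)

# Rebuild in schema order, as the diff's final pyarrow.Table.from_arrays does.
table = pa.Table.from_arrays([columns[n] for n in schema.names], schema=schema)
assert table.num_rows == 4
```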

python/ray/data/tests/test_transform_pyarrow.py

Lines changed: 56 additions & 0 deletions
```diff
@@ -2847,6 +2847,62 @@ def unify_schemas_nested_struct_tensors_schemas():
     return {"with_tensor": schema1, "without_tensor": schema2, "expected": expected}
 
 
+@pytest.mark.parametrize("use_arrow_tensor_v2", [True, False])
+@pytest.mark.skipif(
+    get_pyarrow_version() < MIN_PYARROW_VERSION_TYPE_PROMOTION,
+    reason="Requires Arrow version of at least 14.0.0",
+)
+def test_concat_with_mixed_tensor_types_and_native_pyarrow_types(
+    use_arrow_tensor_v2, restore_data_context
+):
+    DataContext.get_current().use_arrow_tensor_v2 = use_arrow_tensor_v2
+
+    num_rows = 1024
+
+    # Block A: "int" is uint64; "tensor" is the Ray tensor extension type.
+    t_uint = pa.table(
+        {
+            "int": pa.array(np.zeros(num_rows // 2, dtype=np.uint64), type=pa.uint64()),
+            "tensor": ArrowTensorArray.from_numpy(
+                np.zeros((num_rows // 2, 3, 3), dtype=np.float32)
+            ),
+        }
+    )
+
+    # Block B: "int" is float64 with NaNs; "tensor" is the same extension type.
+    f = np.ones(num_rows // 2, dtype=np.float64)
+    f[::8] = np.nan
+    t_float = pa.table(
+        {
+            "int": pa.array(f, type=pa.float64()),
+            "tensor": ArrowTensorArray.from_numpy(
+                np.zeros((num_rows // 2, 3, 3), dtype=np.float32)
+            ),
+        }
+    )
+
+    # Two input blocks with different Arrow dtypes for "int".
+    ds = ray.data.from_arrow([t_uint, t_float])
+
+    # Force a concat across blocks.
+    ds = ds.repartition(1)
+
+    # This should not raise: RuntimeError: Types mismatch: double != uint64.
+    ds.materialize()
+
+    # Ensure that the result is correct: determine the expected tensor type
+    # based on the current DataContext setting.
+    if use_arrow_tensor_v2:
+        expected_tensor_type = ArrowTensorTypeV2((3, 3), pa.float32())
+    else:
+        expected_tensor_type = ArrowTensorType((3, 3), pa.float32())
+
+    assert ds.schema().base_schema == pa.schema(
+        [("int", pa.float64()), ("tensor", expected_tensor_type)]
+    )
+    assert ds.count() == num_rows
+
+
 @pytest.fixture
 def object_with_tensor_fails_blocks():
     """Blocks that should fail when concatenating objects with tensors."""
```