Skip to content

Commit 2817166

Browse files
committed
stablize arrow table read
1 parent 5a98b1e commit 2817166

File tree

1 file changed

+6
-1
lines changed

1 file changed

+6
-1
lines changed

python/raydp/spark/dataset.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,12 @@ def _fetch_arrow_table_from_executor(executor_actor_name: str,
7474
executor_actor.getRDDPartition.remote(
7575
rdd_id, partition_id, schema_json, driver_agent_url))
7676
reader = pa.ipc.open_stream(pa.BufferReader(ipc_bytes))
77-
return reader.read_all()
77+
table = reader.read_all()
78+
# Spark's Arrow conversion may attach schema metadata. Ray Data metadata extraction
79+
# can be sensitive to unexpected schema metadata in some Ray/PyArrow combinations.
80+
# Strip schema metadata to make blocks more portable/deterministic.
81+
table = table.replace_schema_metadata()
82+
return table
7883

7984

8085
class RecordPiece:

0 commit comments

Comments
 (0)