Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import pyarrow.compute as pc
import pyarrow.dataset as ds

from ray.data._internal.arrow_block import _BATCH_SIZE_PRESERVING_STUB_COL_NAME
from ray.data._internal.logical.rules.projection_pushdown import (
_extract_input_columns_renaming_mapping,
)
Expand Down Expand Up @@ -747,15 +748,45 @@ def eval_projection(projection_exprs: List[Expr], block: Block) -> Block:

names, output_cols = zip(*[(e.name, eval_expr(e, block)) for e in projection_exprs])

# This clumsy workaround is necessary to be able to fill in Pyarrow tables
# that have to be "seeded" from an existing table with N rows, and couldn't be
# started from a truly empty table.
#
# TODO fix
new_block = BlockAccessor.for_block(block).fill_column("__stub__", None)
new_block = BlockAccessor.for_block(new_block).drop(input_column_names)
block_type = block_accessor.block_type()
if block_type == BlockType.ARROW:
num_rows = block_accessor.num_rows()
arrays = []
for output_col in output_cols:
if isinstance(output_col, (pa.Array, pa.ChunkedArray)):
arrays.append(output_col)
else:
if isinstance(output_col, pa.Scalar):
column_type = output_col.type
else:
column_type = pa.infer_type([output_col])
array = pa.nulls(num_rows, type=column_type)
arrays.append(pc.fill_null(array, output_col))
return pa.Table.from_arrays(arrays, names=list(names))
elif block_type == BlockType.PANDAS:
num_rows = block_accessor.num_rows()
index = block.index if isinstance(block, pd.DataFrame) else range(num_rows)
data = {}
for name, output_col in zip(names, output_cols):
if isinstance(output_col, (pa.Array, pa.ChunkedArray)):
data[name] = output_col.to_pandas()
else:
data[name] = output_col
return pd.DataFrame(data, index=index)

# Build an empty block that preserves row count across block types. Arrow tables
# cannot be created truly empty (0 columns) while retaining row count, so
# BlockAccessor.select([]) injects a stub column internally for Arrow blocks.
new_block = block_accessor.select([])

for name, output_col in zip(names, output_cols):
new_block = BlockAccessor.for_block(new_block).fill_column(name, output_col)

return BlockAccessor.for_block(new_block).drop(["__stub__"])
final_accessor = BlockAccessor.for_block(new_block)
if (
_BATCH_SIZE_PRESERVING_STUB_COL_NAME not in names
and _BATCH_SIZE_PRESERVING_STUB_COL_NAME in final_accessor.column_names()
):
new_block = final_accessor.drop([_BATCH_SIZE_PRESERVING_STUB_COL_NAME])

return new_block
Comment on lines +777 to +792
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

While this refactoring is a great improvement, creating a new BlockAccessor and a new block in each iteration of the loop can be inefficient, especially for projections with many columns. A more performant approach would be to construct the final block at once from all the computed columns, leveraging type-specific builders like pa.Table.from_arrays for Arrow and pd.DataFrame for Pandas. This avoids creating intermediate blocks and the need to handle the stub column for these known types.

Suggested change
# Build an empty block that preserves row count across block types. Arrow tables
# cannot be created truly empty (0 columns) while retaining row count, so
# BlockAccessor.select([]) injects a stub column internally for Arrow blocks.
new_block = block_accessor.select([])
for name, output_col in zip(names, output_cols):
new_block = BlockAccessor.for_block(new_block).fill_column(name, output_col)
return BlockAccessor.for_block(new_block).drop(["__stub__"])
final_accessor = BlockAccessor.for_block(new_block)
if (
_BATCH_SIZE_PRESERVING_STUB_COL_NAME not in names
and _BATCH_SIZE_PRESERVING_STUB_COL_NAME in final_accessor.column_names()
):
new_block = final_accessor.drop([_BATCH_SIZE_PRESERVING_STUB_COL_NAME])
return new_block
# Efficiently construct the new block based on its type.
block_type = block_accessor.block_type()
if block_type == BlockType.ARROW:
return pa.Table.from_arrays(list(output_cols), names=list(names))
elif block_type == BlockType.PANDAS:
return pd.DataFrame(dict(zip(names, output_cols)))
# Fallback to generic, iterative construction for other block types.
new_block = block_accessor.select([])
for name, output_col in zip(names, output_cols):
new_block = BlockAccessor.for_block(new_block).fill_column(name, output_col)
final_accessor = BlockAccessor.for_block(new_block)
if (
_BATCH_SIZE_PRESERVING_STUB_COL_NAME not in names
and _BATCH_SIZE_PRESERVING_STUB_COL_NAME in final_accessor.column_names()
):
new_block = final_accessor.drop([_BATCH_SIZE_PRESERVING_STUB_COL_NAME])
return new_block

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fallback path is unreachable dead code

Low Severity

The entire fallback code path (from the comment through the return new_block) is unreachable dead code. BlockType is an enum with only ARROW and PANDAS variants, and both are handled by early returns on lines 765 and 775. Additionally, NativeExpressionEvaluator.__init__ raises TypeError for any block type other than Arrow or Pandas, so expression evaluation on line 749 would fail before this code is reached even if a new BlockType were added. This means the block_accessor.select([]) call, the fill_column loop, and the conditional stub cleanup logic are never executed, adding maintenance burden and misleading readers into thinking this path handles real cases.

Fix in Cursor Fix in Web

60 changes: 60 additions & 0 deletions python/ray/data/tests/unit/test_expression_evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,13 @@
import pytest
from pkg_resources import parse_version

from ray.data._internal.arrow_block import _BATCH_SIZE_PRESERVING_STUB_COL_NAME
from ray.data._internal.planner.plan_expression.expression_evaluator import (
ExpressionEvaluator,
eval_projection,
)
from ray.data.block import BlockAccessor
from ray.data.expressions import col, lit
from ray.data.tests.conftest import get_pyarrow_version


Expand Down Expand Up @@ -350,6 +354,62 @@ def test_filter_bad_expression(sample_data):
pq.read_table(sample_data_path, filters=filters)


@pytest.mark.parametrize(
    "block",
    [
        pa.table({"a": [1, 2], "b": [3, 4]}),
        pd.DataFrame({"a": [1, 2], "b": [3, 4]}),
    ],
)
def test_eval_projection_builds_from_empty_block(block):
    """Projection over Arrow and Pandas blocks yields exactly the projected columns."""
    projection = [lit(5).alias("five"), (col("a") + lit(1)).alias("a_plus")]

    result = eval_projection(projection, block)
    result_accessor = BlockAccessor.for_block(result)

    # Row count is preserved and only the projected columns remain, in order.
    assert result_accessor.num_rows() == 2
    assert result_accessor.column_names() == ["five", "a_plus"]

    as_pandas = result_accessor.to_pandas()
    assert list(as_pandas["five"]) == [5, 5]
    assert list(as_pandas["a_plus"]) == [2, 3]

    # Arrow outputs must not leak the internal batch-size-preserving stub column.
    if isinstance(result, pa.Table):
        assert (
            _BATCH_SIZE_PRESERVING_STUB_COL_NAME
            not in result_accessor.column_names()
        )


def test_eval_projection_empty_block():
    """A zero-row input block must produce a zero-row output block."""
    empty_block = pa.table({"a": pa.array([], type=pa.int64())})

    result = eval_projection([lit(5).alias("five")], empty_block)

    assert BlockAccessor.for_block(result).num_rows() == 0


def test_eval_projection_with_stub_col_name_in_projection():
    """A user column that happens to share the stub column's name is preserved."""
    block = pa.table({"a": [1, 2]})
    projection = [lit(999).alias(_BATCH_SIZE_PRESERVING_STUB_COL_NAME)]

    result = eval_projection(projection, block)

    accessor = BlockAccessor.for_block(result)
    assert _BATCH_SIZE_PRESERVING_STUB_COL_NAME in accessor.column_names()
    stub_values = accessor.to_pandas()[_BATCH_SIZE_PRESERVING_STUB_COL_NAME]
    assert list(stub_values) == [999, 999]


def test_eval_projection_single_column():
    """Projecting a single existing column keeps only that column."""
    block = pa.table({"a": [1, 2], "b": [3, 4]})

    result = eval_projection([col("a")], block)

    assert BlockAccessor.for_block(result).column_names() == ["a"]


if __name__ == "__main__":
import sys

Expand Down