[Data] Refactor eval_projection stub column handling

slfan1989 · slfan1989 · commit 1ab2d071d170 · 2026-02-07T07:37:57.000+08:00
Improve the stub column cleanup logic in eval_projection():
- Replace hardcoded "__stub__" with _BATCH_SIZE_PRESERVING_STUB_COL_NAME constant
- Use block_accessor.select([]) instead of the fill_column + drop pattern for cleaner logic
- Add a conditional check so a user column whose name matches the stub name is not dropped
- Reuse BlockAccessor instance to avoid redundant object creation
- Enhance comment to explain Arrow's empty table limitation

Testing:
- Add test for empty block (0 rows) handling
- Add test for stub column name collision with user projection
- Add test for single column projection
- Verify stub column is properly cleaned up in Arrow tables
- Add value assertions in stub column name collision test

Signed-off-by: slfan1989 <slfan1989@apache.org>
diff --git a/python/ray/data/_internal/planner/plan_expression/expression_evaluator.py b/python/ray/data/_internal/planner/plan_expression/expression_evaluator.py
@@ -11,6 +11,7 @@
 import pyarrow.compute as pc
 import pyarrow.dataset as ds
 
+from ray.data._internal.arrow_block import _BATCH_SIZE_PRESERVING_STUB_COL_NAME
 from ray.data._internal.logical.rules.projection_pushdown import (
     _extract_input_columns_renaming_mapping,
 )
@@ -747,15 +748,19 @@ def eval_projection(projection_exprs: List[Expr], block: Block) -> Block:
 
     names, output_cols = zip(*[(e.name, eval_expr(e, block)) for e in projection_exprs])
 
-    # This clumsy workaround is necessary to be able to fill in Pyarrow tables
-    # that has to be "seeded" from existing table with N rows, and couldn't be
-    # started from a truly empty table.
-    #
-    # TODO fix
-    new_block = BlockAccessor.for_block(block).fill_column("__stub__", None)
-    new_block = BlockAccessor.for_block(new_block).drop(input_column_names)
+    # Build an empty block that preserves row count across block types. Arrow tables
+    # cannot be created truly empty (0 columns) while retaining row count, so
+    # BlockAccessor.select([]) injects a stub column internally for Arrow blocks.
+    new_block = block_accessor.select([])
 
     for name, output_col in zip(names, output_cols):
         new_block = BlockAccessor.for_block(new_block).fill_column(name, output_col)
 
-    return BlockAccessor.for_block(new_block).drop(["__stub__"])
+    final_accessor = BlockAccessor.for_block(new_block)
+    if (
+        _BATCH_SIZE_PRESERVING_STUB_COL_NAME not in names
+        and _BATCH_SIZE_PRESERVING_STUB_COL_NAME in final_accessor.column_names()
+    ):
+        new_block = final_accessor.drop([_BATCH_SIZE_PRESERVING_STUB_COL_NAME])
+
+    return new_block
diff --git a/python/ray/data/tests/unit/test_expression_evaluator.py b/python/ray/data/tests/unit/test_expression_evaluator.py
@@ -5,9 +5,13 @@
 import pytest
 from pkg_resources import parse_version
 
+from ray.data._internal.arrow_block import _BATCH_SIZE_PRESERVING_STUB_COL_NAME
 from ray.data._internal.planner.plan_expression.expression_evaluator import (
     ExpressionEvaluator,
+    eval_projection,
 )
+from ray.data.block import BlockAccessor
+from ray.data.expressions import col, lit
 from ray.data.tests.conftest import get_pyarrow_version
 
 
@@ -350,6 +354,62 @@ def test_filter_bad_expression(sample_data):
         pq.read_table(sample_data_path, filters=filters)
 
 
+@pytest.mark.parametrize(
+    "block",
+    [
+        pa.table({"a": [1, 2], "b": [3, 4]}),
+        pd.DataFrame({"a": [1, 2], "b": [3, 4]}),
+    ],
+)
+def test_eval_projection_builds_from_empty_block(block):
+    exprs = [lit(5).alias("five"), (col("a") + lit(1)).alias("a_plus")]
+
+    out = eval_projection(exprs, block)
+    acc = BlockAccessor.for_block(out)
+
+    assert acc.num_rows() == 2
+    assert acc.column_names() == ["five", "a_plus"]
+
+    out_df = acc.to_pandas()
+    assert out_df["five"].tolist() == [5, 5]
+    assert out_df["a_plus"].tolist() == [2, 3]
+
+    if isinstance(out, pa.Table):
+        assert _BATCH_SIZE_PRESERVING_STUB_COL_NAME not in acc.column_names()
+
+
+def test_eval_projection_empty_block():
+    block = pa.table({"a": pa.array([], type=pa.int64())})
+    exprs = [lit(5).alias("five")]
+
+    out = eval_projection(exprs, block)
+
+    assert BlockAccessor.for_block(out).num_rows() == 0
+
+
+def test_eval_projection_with_stub_col_name_in_projection():
+    block = pa.table({"a": [1, 2]})
+    exprs = [lit(999).alias(_BATCH_SIZE_PRESERVING_STUB_COL_NAME)]
+
+    out = eval_projection(exprs, block)
+
+    acc = BlockAccessor.for_block(out)
+    assert _BATCH_SIZE_PRESERVING_STUB_COL_NAME in acc.column_names()
+    assert acc.to_pandas()[_BATCH_SIZE_PRESERVING_STUB_COL_NAME].tolist() == [
+        999,
+        999,
+    ]
+
+
+def test_eval_projection_single_column():
+    block = pa.table({"a": [1, 2], "b": [3, 4]})
+    exprs = [col("a")]
+
+    out = eval_projection(exprs, block)
+
+    assert BlockAccessor.for_block(out).column_names() == ["a"]
+
+
 if __name__ == "__main__":
     import sys