Commit fb2933a

Revert "Revert "Revert "[Datasets] [Tensor Story - 1/2] Automatically provide tensor views to UDFs and infer tensor blocks for pure-tensor datasets."" (ray-project#25031)" (ray-project#25057)
Reverts ray-project#25031. It looks to still be somewhat flaky.
1 parent: b2d41fc · commit: fb2933a

17 files changed: +167 −517 lines

doc/source/data/dataset-tensor-support.rst

Lines changed: 11 additions & 46 deletions
@@ -15,57 +15,22 @@ Automatic conversion between the Pandas and Arrow extension types/arrays keeps t
 Single-column tensor datasets
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-The most basic case is when a dataset only has a single column, which is of tensor
-type. This kind of dataset can be:
+The most basic case is when a dataset only has a single column, which is of tensor type. This kind of dataset can be created with ``.range_tensor()``, and can be read from and written to ``.npy`` files. Here are some examples:
 
-* created with :func:`range_tensor() <ray.data.range_tensor>`
-  or :func:`from_numpy() <ray.data.from_numpy>`,
-* transformed with NumPy UDFs via
-  :meth:`ds.map_batches() <ray.data.Dataset.map_batches>`,
-* consumed with :meth:`ds.iter_rows() <ray.data.Dataset.iter_rows>` and
-  :meth:`ds.iter_batches() <ray.data.Dataset.iter_batches>`, and
-* can be read from and written to ``.npy`` files.
+.. code-block:: python
 
-Here is an end-to-end example:
+    # Create a Dataset of tensor-typed values.
+    ds = ray.data.range_tensor(10000, shape=(3, 5))
+    # -> Dataset(num_blocks=200, num_rows=10000,
+    #            schema={value: <ArrowTensorType: shape=(3, 5), dtype=int64>})
 
-.. code-block:: python
+    # Save to storage.
+    ds.write_numpy("/tmp/tensor_out", column="value")
 
-    # Create a synthetic pure-tensor Dataset.
-    ds = ray.data.range_tensor(10, shape=(3, 5))
-    # -> Dataset(num_blocks=10, num_rows=10,
-    #            schema={__value__: <ArrowTensorType: shape=(3, 5), dtype=int64>})
-
-    # Create a pure-tensor Dataset from an existing NumPy ndarray.
-    arr = np.arange(10 * 3 * 5).reshape((10, 3, 5))
-    ds = ray.data.from_numpy(arr)
-    # -> Dataset(num_blocks=1, num_rows=10,
-    #            schema={__value__: <ArrowTensorType: shape=(3, 5), dtype=int64>})
-
-    # Transform the tensors. Datasets will automatically unpack the single-column Arrow
-    # table into a NumPy ndarray, provide that ndarray to your UDF, and then repack it
-    # into a single-column Arrow table; this will be a zero-copy conversion in both
-    # cases.
-    ds = ds.map_batches(lambda arr: arr / arr.max())
-    # -> Dataset(num_blocks=1, num_rows=10,
-    #            schema={__value__: <ArrowTensorType: shape=(3, 5), dtype=double>})
-
-    # Consume the tensor. This will yield the underlying (3, 5) ndarrays.
-    for arr in ds.iter_rows():
-        assert isinstance(arr, np.ndarray)
-        assert arr.shape == (3, 5)
-
-    # Consume the tensor in batches.
-    for arr in ds.iter_batches(batch_size=2):
-        assert isinstance(arr, np.ndarray)
-        assert arr.shape == (2, 3, 5)
-
-    # Save to storage. This will write out the blocks of the tensor column as NPY files.
-    ds.write_numpy("/tmp/tensor_out")
-
-    # Read back from storage.
+    # Read from storage.
     ray.data.read_numpy("/tmp/tensor_out")
-    # -> Dataset(num_blocks=1, num_rows=?,
-    #            schema={__value__: <ArrowTensorType: shape=(3, 5), dtype=double>})
+    # -> Dataset(num_blocks=200, num_rows=?,
+    #            schema={value: <ArrowTensorType: shape=(3, 5), dtype=int64>})
 
 Reading existing serialized tensor columns
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
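For reference, the restored (right-hand side) doc example assembles into a runnable script roughly as follows. This is a sketch, assuming a Ray installation of the same era as this commit; the expected-output comments are copied from the diff above:

    import ray

    ray.init()

    # Create a Dataset of tensor-typed values. After this revert, the tensor
    # lives under an explicit "value" column rather than the implicit
    # __value__ column introduced by the reverted change.
    ds = ray.data.range_tensor(10000, shape=(3, 5))
    # -> Dataset(num_blocks=200, num_rows=10000,
    #            schema={value: <ArrowTensorType: shape=(3, 5), dtype=int64>})

    # Save to storage as .npy files; the column must now be named explicitly.
    ds.write_numpy("/tmp/tensor_out", column="value")

    # Read back from storage; row count is unknown until the files are read.
    ds2 = ray.data.read_numpy("/tmp/tensor_out")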

python/ray/data/block.py

Lines changed: 3 additions & 23 deletions
@@ -3,7 +3,6 @@
 from typing import (
     TypeVar,
     List,
-    Dict,
     Generic,
     Iterator,
     Tuple,
@@ -83,10 +82,6 @@ def _validate_key_fn(ds: "Dataset", key: KeyFn) -> None:
 # ``SimpleBlockAccessor`` and ``ArrowBlockAccessor``.
 Block = Union[List[T], "pyarrow.Table", "pandas.DataFrame", bytes]
 
-# User-facing data batch type. This is the data type for data that is supplied to and
-# returned from batch UDFs.
-DataBatch = Union[Block, np.ndarray]
-
 # A list of block references pending computation by a single task. For example,
 # this may be the output of a task reading a file.
 BlockPartition = List[Tuple[ObjectRef[Block], "BlockMetadata"]]
@@ -215,13 +210,11 @@ def to_pandas(self) -> "pandas.DataFrame":
         """Convert this block into a Pandas dataframe."""
         raise NotImplementedError
 
-    def to_numpy(
-        self, columns: Optional[Union[str, List[str]]] = None
-    ) -> Union[np.ndarray, Dict[str, np.ndarray]]:
-        """Convert this block (or columns of block) into a NumPy ndarray.
+    def to_numpy(self, column: str = None) -> np.ndarray:
+        """Convert this block (or column of block) into a NumPy ndarray.
 
         Args:
-            columns: Name of columns to convert, or None if converting all columns.
+            column: Name of column to convert, or None.
         """
         raise NotImplementedError
 
@@ -233,10 +226,6 @@ def to_block(self) -> Block:
         """Return the base block that this accessor wraps."""
         raise NotImplementedError
 
-    def to_native(self) -> Block:
-        """Return the native data format for this accessor."""
-        return self.to_block()
-
     def size_bytes(self) -> int:
         """Return the approximate size in bytes of this block."""
         raise NotImplementedError
@@ -266,15 +255,6 @@ def builder() -> "BlockBuilder[T]":
         """Create a builder for this block type."""
         raise NotImplementedError
 
-    @staticmethod
-    def batch_to_block(batch: DataBatch) -> Block:
-        """Create a block from user-facing data formats."""
-        if isinstance(batch, np.ndarray):
-            from ray.data.impl.arrow_block import ArrowBlockAccessor
-
-            return ArrowBlockAccessor.numpy_to_block(batch)
-        return batch
-
     @staticmethod
     def for_block(block: Block) -> "BlockAccessor[T]":
         """Create a block accessor for the given block."""

python/ray/data/dataset.py

Lines changed: 17 additions & 23 deletions
@@ -68,7 +68,6 @@
 from ray.data.row import TableRow
 from ray.data.aggregate import AggregateFn, Sum, Max, Min, Mean, Std
 from ray.data.random_access_dataset import RandomAccessDataset
-from ray.data.impl.table_block import VALUE_COL_NAME
 from ray.data.impl.remote_fn import cached_remote_fn
 from ray.data.impl.block_batching import batch_blocks, BatchType
 from ray.data.impl.plan import ExecutionPlan, OneToOneStage, AllToAllStage
@@ -235,8 +234,8 @@ def map(
 
         def transform(block: Block) -> Iterable[Block]:
             DatasetContext._set_current(context)
-            output_buffer = BlockOutputBuffer(None, context.target_max_block_size)
             block = BlockAccessor.for_block(block)
+            output_buffer = BlockOutputBuffer(None, context.target_max_block_size)
             for row in block.iter_rows():
                 output_buffer.add(fn(row))
                 if output_buffer.has_next():
@@ -261,9 +260,6 @@ def map_batches(
     ) -> "Dataset[Any]":
         """Apply the given function to batches of records of this dataset.
 
-        The format of the data batch provided to ``fn`` can be controlled via the
-        ``batch_format`` argument, and the output of the UDF can be any batch type.
-
        This is a blocking operation.
 
         Examples:
@@ -310,9 +306,10 @@ def map_batches(
                 blocks as batches. Defaults to a system-chosen batch size.
             compute: The compute strategy, either "tasks" (default) to use Ray
                 tasks, or ActorPoolStrategy(min, max) to use an autoscaling actor pool.
-            batch_format: Specify "native" to use the native block format (promotes
-                tables to Pandas and tensors to NumPy), "pandas" to select
-                ``pandas.DataFrame``, or "pyarrow" to select `pyarrow.Table``.
+            batch_format: Specify "native" to use the native block format
+                (promotes Arrow to pandas), "pandas" to select
+                ``pandas.DataFrame`` as the batch format,
+                or "pyarrow" to select ``pyarrow.Table``.
             ray_remote_args: Additional resource requirements to request from
                 ray (e.g., num_gpus=1 to request GPUs for the map tasks).
         """
@@ -341,7 +338,9 @@ def transform(block: Block) -> Iterable[Block]:
                 # bug where we include the entire base view on serialization.
                 view = block.slice(start, end, copy=batch_size is not None)
                 if batch_format == "native":
-                    view = BlockAccessor.for_block(view).to_native()
+                    # Always promote Arrow blocks to pandas for consistency.
+                    if isinstance(view, pa.Table) or isinstance(view, bytes):
+                        view = BlockAccessor.for_block(view).to_pandas()
                 elif batch_format == "pandas":
                     view = BlockAccessor.for_block(view).to_pandas()
                 elif batch_format == "pyarrow":
@@ -356,7 +355,6 @@ def transform(block: Block) -> Iterable[Block]:
                 if not (
                     isinstance(applied, list)
                     or isinstance(applied, pa.Table)
-                    or isinstance(applied, np.ndarray)
                     or isinstance(applied, pd.core.frame.DataFrame)
                 ):
                     raise ValueError(
@@ -366,7 +364,7 @@ def transform(block: Block) -> Iterable[Block]:
                         "The return type must be either list, "
                         "pandas.DataFrame, or pyarrow.Table"
                     )
-                output_buffer.add_batch(applied)
+                output_buffer.add_block(applied)
                 if output_buffer.has_next():
                     yield output_buffer.next()
 
@@ -703,8 +701,6 @@ def process_batch(batch):
             )
             if isinstance(batch, pd.DataFrame):
                 return batch.sample(frac=fraction)
-            if isinstance(batch, np.ndarray):
-                return np.array([row for row in batch if random.random() <= fraction])
             raise ValueError(f"Unsupported batch type: {type(batch)}")
 
         return self.map_batches(process_batch)
@@ -2075,7 +2071,7 @@ def write_numpy(
         self,
         path: str,
         *,
-        column: str = VALUE_COL_NAME,
+        column: str = "value",
         filesystem: Optional["pyarrow.fs.FileSystem"] = None,
         try_create_dir: bool = True,
         arrow_open_stream_args: Optional[Dict[str, Any]] = None,
@@ -2103,8 +2099,7 @@ def write_numpy(
             path: The path to the destination root directory, where npy
                 files will be written to.
             column: The name of the table column that contains the tensor to
-                be written. The default is ``"__value__"``, the column name that
-                Datasets uses for storing tensors in single-column tables.
+                be written. This defaults to "value".
             filesystem: The filesystem implementation to write to.
             try_create_dir: Try to create all directories in destination path
                 if True. Does nothing if all directories already exist.
@@ -2251,10 +2246,10 @@ def iter_batches(
                 current block during the scan.
             batch_size: Record batch size, or None to let the system pick.
             batch_format: The format in which to return each batch.
-                Specify "native" to use the native block format (promoting
-                tables to Pandas and tensors to NumPy), "pandas" to select
-                ``pandas.DataFrame``, or "pyarrow" to select ``pyarrow.Table``. Default
-                is "native".
+                Specify "native" to use the current block format (promoting
+                Arrow to pandas automatically), "pandas" to
+                select ``pandas.DataFrame`` or "pyarrow" to select
+                ``pyarrow.Table``. Default is "native".
             drop_last: Whether to drop the last batch if it's incomplete.
 
         Returns:
@@ -2776,9 +2771,8 @@ def to_numpy_refs(
         Time complexity: O(dataset size / parallelism)
 
         Args:
-            column: The name of the column to convert to numpy, or None to specify the
-                entire row. If not specified for Arrow or Pandas blocks, each returned
-                future will represent a dict of column ndarrays.
+            column: The name of the column to convert to numpy, or None to
+                specify the entire row. Required for Arrow tables.
 
         Returns:
             A list of remote NumPy ndarrays created from this dataset.
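To illustrate the restored "native" batch semantics (a sketch under the same version assumptions as above, not part of the commit): tabular blocks are promoted to pandas, and a UDF must return a list, ``pandas.DataFrame``, or ``pyarrow.Table``, no longer a bare ndarray:

    import pandas as pd
    import ray

    ds = ray.data.from_pandas(pd.DataFrame({"x": list(range(8))}))

    # With batch_format="native", Arrow blocks arrive in the UDF as pandas
    # DataFrames; returning a DataFrame keeps the output a valid block type.
    out = ds.map_batches(
        lambda df: df.assign(x2=df["x"] ** 2), batch_format="native"
    )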

python/ray/data/datasource/datasource.py

Lines changed: 14 additions & 7 deletions
@@ -193,11 +193,14 @@ def make_block(start: int, count: int) -> Block:
         elif block_format == "tensor":
             import pyarrow as pa
 
-            tensor = np.ones(tensor_shape, dtype=np.int64) * np.expand_dims(
-                np.arange(start, start + count),
-                tuple(range(1, 1 + len(tensor_shape))),
+            tensor = TensorArray(
+                np.ones(tensor_shape, dtype=np.int64)
+                * np.expand_dims(
+                    np.arange(start, start + count),
+                    tuple(range(1, 1 + len(tensor_shape))),
+                )
             )
-            return BlockAccessor.batch_to_block(tensor)
+            return pa.Table.from_pydict({"value": tensor})
         else:
             return list(builtins.range(start, start + count))
 
@@ -211,12 +214,16 @@ def make_block(start: int, count: int) -> Block:
             schema = pa.Table.from_pydict({"value": [0]}).schema
         elif block_format == "tensor":
             _check_pyarrow_version()
+            from ray.data.extensions import TensorArray
             import pyarrow as pa
 
-            tensor = np.ones(tensor_shape, dtype=np.int64) * np.expand_dims(
-                np.arange(0, 10), tuple(range(1, 1 + len(tensor_shape)))
+            tensor = TensorArray(
+                np.ones(tensor_shape, dtype=np.int64)
+                * np.expand_dims(
+                    np.arange(0, 10), tuple(range(1, 1 + len(tensor_shape)))
+                )
             )
-            schema = BlockAccessor.batch_to_block(tensor).schema
+            schema = pa.Table.from_pydict({"value": tensor}).schema
         elif block_format == "list":
             schema = int
         else:
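The pattern restored here — wrapping an ndarray in a ``TensorArray`` and storing it under an explicit "value" column — can be reproduced standalone. A sketch, assuming ``ray`` and ``pyarrow`` of the same era are installed; the shape and fill values are illustrative:

    import numpy as np
    import pyarrow as pa
    from ray.data.extensions import TensorArray

    # Build a (count, 3, 5) tensor whose i-th row element is filled with i,
    # mirroring the np.ones * np.expand_dims(np.arange(...)) trick above.
    count = 4
    tensor = TensorArray(
        np.ones((3, 5), dtype=np.int64)
        * np.expand_dims(np.arange(0, count), (1, 2))
    )

    # TensorArray implements __arrow_array__, so pyarrow can ingest it
    # directly into a single-column table with an ArrowTensorType column.
    block = pa.Table.from_pydict({"value": tensor})
    print(block.schema)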

python/ray/data/datasource/numpy_datasource.py

Lines changed: 6 additions & 1 deletion
@@ -26,13 +26,18 @@ class NumpyDatasource(FileBasedDatasource):
     """
 
     def _read_file(self, f: "pyarrow.NativeFile", path: str, **reader_args):
+        from ray.data.extensions import TensorArray
+        import pyarrow as pa
+
         # TODO(ekl) Ideally numpy can read directly from the file, but it
         # seems like it requires the file to be seekable.
         buf = BytesIO()
         data = f.readall()
         buf.write(data)
         buf.seek(0)
-        return BlockAccessor.batch_to_block(np.load(buf, allow_pickle=True))
+        return pa.Table.from_pydict(
+            {"value": TensorArray(np.load(buf, allow_pickle=True))}
+        )
 
     def _write_block(
         self,
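The reverted ``_read_file`` body is easy to exercise standalone. A sketch: a ``BytesIO`` buffer stands in for the ``pyarrow.NativeFile`` that Datasets would pass, and the schema print is only an approximation of the real repr:

    from io import BytesIO

    import numpy as np
    import pyarrow as pa
    from ray.data.extensions import TensorArray

    # Simulate the buffered read: .npy bytes land in a seekable buffer, are
    # loaded with NumPy, then wrapped into a single-column Arrow table.
    buf = BytesIO()
    np.save(buf, np.arange(12).reshape(3, 4))
    buf.seek(0)
    block = pa.Table.from_pydict(
        {"value": TensorArray(np.load(buf, allow_pickle=True))}
    )
    print(block.schema)  # roughly: value: extension<...ArrowTensorType...>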
