From 8657cdc99d22c1c6feeb4834a876ea1ffd342d4c Mon Sep 17 00:00:00 2001
From: jhsu
Date: Fri, 2 May 2025 11:27:33 -0700
Subject: [PATCH 1/4] fix working code snippets

Signed-off-by: jhsu
---
 doc/source/ray-contribute/writing-code-snippets.rst | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/doc/source/ray-contribute/writing-code-snippets.rst b/doc/source/ray-contribute/writing-code-snippets.rst
index 1ef1c69f0170..f1e0db7b59de 100644
--- a/doc/source/ray-contribute/writing-code-snippets.rst
+++ b/doc/source/ray-contribute/writing-code-snippets.rst
@@ -228,10 +228,7 @@ To ignore parts of a *doctest-style* output, replace problematic sections with e
 
     >>> import ray
     >>> ray.data.read_images("s3://anonymous@ray-example-data/image-datasets/simple")
-    Dataset(
-        num_rows=...,
-        schema={image: numpy.ndarray(shape=(32, 32, 3), dtype=uint8)}
-    )
+    Dataset(num_rows=..., schema=...)
 
 To ignore an output altogether, write a *code-output-style* snippet. Don't use
 `# doctest: +SKIP`.
@@ -249,10 +246,7 @@ with ellipses. ::
 
     .. testoutput::
 
-        Dataset(
-            num_rows=...,
-            schema={image: numpy.ndarray(shape=(32, 32, 3), dtype=uint8)}
-        )
+        Dataset(num_rows=3, schema=...)
 
 If your output is nondeterministic and you want to display a sample output, add
 `:options: +MOCK`. ::

From d85651ef1957e7dc0e0ed5abffff02615afa712c Mon Sep 17 00:00:00 2001
From: jhsu
Date: Fri, 2 May 2025 11:29:05 -0700
Subject: [PATCH 2/4] use ...

Signed-off-by: jhsu
---
 doc/source/ray-contribute/writing-code-snippets.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/ray-contribute/writing-code-snippets.rst b/doc/source/ray-contribute/writing-code-snippets.rst
index f1e0db7b59de..7a8c84dc5c5b 100644
--- a/doc/source/ray-contribute/writing-code-snippets.rst
+++ b/doc/source/ray-contribute/writing-code-snippets.rst
@@ -246,7 +246,7 @@ with ellipses. ::
 
     .. testoutput::
 
-        Dataset(num_rows=3, schema=...)
+        Dataset(num_rows=..., schema=...)
 
 If your output is nondeterministic and you want to display a sample output, add
 `:options: +MOCK`. ::

From 4339948ccc8b11696d7911c0628f51c40a96fa48 Mon Sep 17 00:00:00 2001
From: jhsu
Date: Fri, 2 May 2025 11:57:52 -0700
Subject: [PATCH 3/4] fix inspecting data rst

Signed-off-by: jhsu
---
 doc/source/data/inspecting-data.rst | 12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

diff --git a/doc/source/data/inspecting-data.rst b/doc/source/data/inspecting-data.rst
index 406b4165d5de..986b0d82b6e1 100644
--- a/doc/source/data/inspecting-data.rst
+++ b/doc/source/data/inspecting-data.rst
@@ -51,16 +51,7 @@ For more information like the number of rows, print the Dataset.
 
     .. testoutput::
 
-        Dataset(
-            num_rows=150,
-            schema={
-                sepal length (cm): double,
-                sepal width (cm): double,
-                petal length (cm): double,
-                petal width (cm): double,
-                target: int64
-            }
-        )
+        Dataset(num_rows=..., schema=...)
 
 .. _inspecting-rows:
 
@@ -138,7 +129,6 @@ of the returned batch, set ``batch_format``.
        0                5.1               3.5  ...               0.2       0
        1                4.9               3.0  ...               0.2       0
 
-       [2 rows x 5 columns]
 
 For more information on working with batches, see
 :ref:`Transforming batches ` and

From 87181dab5f72a03475d80e29cf2f35e0ba7aad71 Mon Sep 17 00:00:00 2001
From: iamjustinhsu <140442892+iamjustinhsu@users.noreply.github.com>
Date: Thu, 1 May 2025 18:47:22 -0700
Subject: [PATCH 4/4] [Data] Speed up printing the schema (#52612)

## Why are these changes needed?

If a dataset only has read operations applied, then because of the lazy execution model, resolving its schema requires actually performing the read. This PR removes the special case that fetched the schema for read-only plans even when `fetch_if_missing` is `False`, so printing such a dataset no longer triggers computation just to display the schema.
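To make the intended behavior concrete, here is a minimal sketch (not part of this PR's diff; the example path comes from the existing docstrings, and the exact repr text may differ):

```python
import ray

# Reading is lazy: no read tasks run until the data is actually needed.
ds = ray.data.read_csv("s3://anonymous@air-example-data/iris.csv")

# Previously, repr() on a freshly read dataset could kick off a read just to
# resolve the schema. With this change, printing stays lazy and shows a
# placeholder, along the lines of: Dataset(num_rows=?, schema=...)
print(ds)

# Explicitly asking for the schema still computes it when needed.
print(ds.schema())
```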
## Related issue number

https://github.com/ray-project/ray/issues/50361

## Checks

- [ ] I've signed off every commit (by using the -s flag, i.e., `git commit -s`) in this PR.
- [x] I've run `scripts/format.sh` to lint the changes in this PR.
- [ ] I've included any doc changes needed for https://docs.ray.io/en/master/.
- [ ] I've added any new APIs to the API Reference. For example, if I added a method in Tune, I've added it in `doc/source/tune/api/` under the corresponding `.rst` file.
- [ ] I've made sure the tests are passing. Note that there might be a few flaky tests, see the recent failures at https://flakey-tests.ray.io/
- Testing Strategy
   - [ ] Unit tests
   - [ ] Release tests
   - [ ] This PR is not tested :(

---------

Signed-off-by: jhsu
---
 doc/source/data/working-with-tensors.rst |  5 +--
 python/ray/data/_internal/plan.py        | 11 +-----
 python/ray/data/dataset.py               | 44 +++---------------------
 python/ray/data/iterator.py              | 22 ++----------
 python/ray/data/read_api.py              | 12 ++-----
 python/ray/data/tests/test_mongo.py      |  8 +----
 python/ray/data/tests/test_parquet.py    |  5 ++-
 7 files changed, 14 insertions(+), 93 deletions(-)

diff --git a/doc/source/data/working-with-tensors.rst b/doc/source/data/working-with-tensors.rst
index b0f40f959e18..637b94b749dc 100644
--- a/doc/source/data/working-with-tensors.rst
+++ b/doc/source/data/working-with-tensors.rst
@@ -21,10 +21,7 @@ Ray Data represents tensors as
 
 .. testoutput::
 
-    Dataset(
-        num_rows=100,
-        schema={image: numpy.ndarray(shape=(28, 28), dtype=uint8)}
-    )
+    Dataset(num_rows=100, schema=...)
 
 Batches of fixed-shape tensors
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

diff --git a/python/ray/data/_internal/plan.py b/python/ray/data/_internal/plan.py
index 1a8ab59c8c50..ce0d887f91ec 100644
--- a/python/ray/data/_internal/plan.py
+++ b/python/ray/data/_internal/plan.py
@@ -373,7 +373,7 @@ def schema(
 
         elif self._logical_plan.dag.aggregate_output_metadata().schema is not None:
             schema = self._logical_plan.dag.aggregate_output_metadata().schema
-        elif fetch_if_missing or self.is_read_only():
+        elif fetch_if_missing:
             # For consistency with the previous implementation, we fetch the schema if
             # the plan is read-only even if `fetch_if_missing` is False.
@@ -587,15 +587,6 @@ def has_lazy_input(self) -> bool:
         """Return whether this plan has lazy input blocks."""
         return all(isinstance(op, Read) for op in self._logical_plan.sources())
 
-    def is_read_only(self, root_op: Optional[LogicalOperator] = None) -> bool:
-        """Return whether the LogicalPlan corresponding to `root_op`
-        contains only a Read op. By default, the last operator of
-        the LogicalPlan is used."""
-        if root_op is None:
-            root_op = self._logical_plan.dag
-
-        return root_op.is_read_op()
-
     def has_computed_output(self) -> bool:
         """Whether this plan has a computed snapshot for the final operator,
         i.e. for the output of this plan.

diff --git a/python/ray/data/dataset.py b/python/ray/data/dataset.py
index 519992e6d0ce..60299d3742e9 100644
--- a/python/ray/data/dataset.py
+++ b/python/ray/data/dataset.py
@@ -4882,16 +4882,7 @@ def to_tf(
 
             >>> import ray
             >>> ds = ray.data.read_csv("s3://anonymous@air-example-data/iris.csv")
             >>> ds
-            Dataset(
-                num_rows=?,
-                schema={
-                    sepal length (cm): double,
-                    sepal width (cm): double,
-                    petal length (cm): double,
-                    petal width (cm): double,
-                    target: int64
-                }
-            )
+            Dataset(num_rows=?, schema=...)
 
             If your model accepts a single tensor as input, specify a single feature column.
@@ -4913,16 +4904,7 @@ def to_tf(
             >>> ds = preprocessor.transform(ds)
             >>> ds
             Concatenator
-            +- Dataset(
-                   num_rows=?,
-                   schema={
-                      sepal length (cm): double,
-                      sepal width (cm): double,
-                      petal length (cm): double,
-                      petal width (cm): double,
-                      target: int64
-                   }
-               )
+            +- Dataset(num_rows=?, schema=...)
             >>> ds.to_tf("features", "target")
             <_OptionsDataset element_spec=(TensorSpec(shape=(None, 4), dtype=tf.float64, name='features'), TensorSpec(shape=(None,), dtype=tf.int64, name='target'))>
@@ -5627,16 +5609,7 @@ def serialize_lineage(self) -> bytes:
 
         .. testoutput::
 
-            Dataset(
-                num_rows=?,
-                schema={
-                    sepal length (cm): double,
-                    sepal width (cm): double,
-                    petal length (cm): double,
-                    petal width (cm): double,
-                    target: int64
-                }
-            )
+            Dataset(num_rows=?, schema=...)
 
 
         Returns:
@@ -5709,16 +5682,7 @@ def deserialize_lineage(serialized_ds: bytes) -> "Dataset":
 
         .. testoutput::
 
-            Dataset(
-                num_rows=?,
-                schema={
-                    sepal length (cm): double,
-                    sepal width (cm): double,
-                    petal length (cm): double,
-                    petal width (cm): double,
-                    target: int64
-                }
-            )
+            Dataset(num_rows=?, schema=...)
 
         Args:
             serialized_ds: The serialized Dataset that we wish to deserialize.

diff --git a/python/ray/data/iterator.py b/python/ray/data/iterator.py
index 7bf8aab67e3a..d410c0566e2d 100644
--- a/python/ray/data/iterator.py
+++ b/python/ray/data/iterator.py
@@ -681,16 +681,7 @@ def to_tf(
 
             ...     "s3://anonymous@air-example-data/iris.csv"
             ... )
             >>> it = ds.iterator(); it
-            DataIterator(Dataset(
-                num_rows=?,
-                schema={
-                    sepal length (cm): double,
-                    sepal width (cm): double,
-                    petal length (cm): double,
-                    petal width (cm): double,
-                    target: int64
-                }
-            ))
+            DataIterator(Dataset(num_rows=?, schema=...))
 
             If your model accepts a single tensor as input, specify a single feature column.
@@ -712,16 +703,7 @@
             >>> it = preprocessor.transform(ds).iterator()
             >>> it
             DataIterator(Concatenator
-            +- Dataset(
-                   num_rows=?,
-                   schema={
-                      sepal length (cm): double,
-                      sepal width (cm): double,
-                      petal length (cm): double,
-                      petal width (cm): double,
-                      target: int64
-                   }
-               ))
+            +- Dataset(num_rows=?, schema=...))
             >>> it.to_tf("features", "target")
             <_OptionsDataset element_spec=(TensorSpec(shape=(None, 4), dtype=tf.float64, name='features'), TensorSpec(shape=(None,), dtype=tf.int64, name='target'))>

diff --git a/python/ray/data/read_api.py b/python/ray/data/read_api.py
index 786aa30e650c..66be38881a3c 100644
--- a/python/ray/data/read_api.py
+++ b/python/ray/data/read_api.py
@@ -1499,7 +1499,7 @@ def read_csv(
 
         >>> ray.data.read_csv("s3://anonymous@ray-example-data/different-extensions/",
         ...     file_extensions=["csv"])
-        Dataset(num_rows=?, schema={a: int64, b: int64})
+        Dataset(num_rows=?, schema=...)
 
     Args:
         paths: A single file or directory, or a list of file or directory paths.
@@ -1944,10 +1944,7 @@ def read_tfrecords(
     Examples:
         >>> import ray
         >>> ray.data.read_tfrecords("s3://anonymous@ray-example-data/iris.tfrecords")
-        Dataset(
-            num_rows=?,
-            schema={...}
-        )
+        Dataset(num_rows=?, schema=...)
 
         We can also read compressed TFRecord files, which use one of the
        `compression types supported by Arrow 0
     assert ds.schema() == Schema(pa.schema({"one": pa.int64(), "two": pa.string()}))
-    assert ds._plan.has_started_execution
+    assert not ds._plan.has_started_execution
 
     # Forces a data read.
     values = [[s["one"], s["two"]] for s in ds.take()]
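A rough way to sanity-check the new laziness locally, modeled on the surviving fragment of the updated parquet test above. This sketch is illustrative only: the file path is made up, and `_plan.has_started_execution` is an internal attribute rather than public API.

```python
import pyarrow as pa
import pyarrow.parquet as pq
import ray

# Write a small Parquet file to read back (hypothetical path).
pq.write_table(
    pa.table({"one": [1, 2, 3], "two": ["a", "b", "c"]}),
    "/tmp/example.parquet",
)

ds = ray.data.read_parquet("/tmp/example.parquet")

# Parquet metadata supplies the schema, so fetching it shouldn't start execution.
print(ds.schema())
assert not ds._plan.has_started_execution

# Taking rows forces a data read, which does start execution.
values = [[s["one"], s["two"]] for s in ds.take()]
assert ds._plan.has_started_execution
```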