
Commit f23510c

Revert "[Data] Speed up printing the schema (#52612)" (#52753)
This reverts commit 2c9e17d; it is breaking doc tests.
1 parent: 3522305

File tree: 7 files changed (+93, -14 lines)


doc/source/data/working-with-tensors.rst

Lines changed: 4 additions & 1 deletion

@@ -21,7 +21,10 @@ Ray Data represents tensors as

     .. testoutput::

-        Dataset(num_rows=100, schema=...)
+        Dataset(
+           num_rows=100,
+           schema={image: numpy.ndarray(shape=(28, 28), dtype=uint8)}
+        )

 Batches of fixed-shape tensors
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
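With the revert in place, a tensor dataset's repr again spells out the full tensor spec instead of the elided `schema=...`. A minimal sketch using `ray.data.range_tensor` as a synthetic stand-in for the doc page's image dataset (the exact dtype in the output is whatever `range_tensor` produces, not the doc's uint8 images):

    import ray

    # range_tensor creates a single fixed-shape tensor column named "data".
    ds = ray.data.range_tensor(100, shape=(28, 28))

    # The repr now includes the tensor schema, e.g.:
    # Dataset(
    #    num_rows=100,
    #    schema={data: numpy.ndarray(shape=(28, 28), dtype=int64)}
    # )
    print(ds)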

python/ray/data/_internal/plan.py

Lines changed: 10 additions & 1 deletion

@@ -373,7 +373,7 @@ def schema(
         elif self._logical_plan.dag.aggregate_output_metadata().schema is not None:
             schema = self._logical_plan.dag.aggregate_output_metadata().schema
-        elif fetch_if_missing:
+        elif fetch_if_missing or self.is_read_only():
             # For consistency with the previous implementation, we fetch the schema if
             # the plan is read-only even if `fetch_if_missing` is False.

@@ -587,6 +587,15 @@ def has_lazy_input(self) -> bool:
         """Return whether this plan has lazy input blocks."""
         return all(isinstance(op, Read) for op in self._logical_plan.sources())

+    def is_read_only(self, root_op: Optional[LogicalOperator] = None) -> bool:
+        """Return whether the LogicalPlan corresponding to `root_op`
+        contains only a Read op. By default, the last operator of
+        the LogicalPlan is used."""
+        if root_op is None:
+            root_op = self._logical_plan.dag
+
+        return root_op.is_read_op()
+
     def has_computed_output(self) -> bool:
         """Whether this plan has a computed snapshot for the final operator, i.e. for
         the output of this plan.
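The restored `is_read_only` branch is the behavioral core of the revert: for a plan that is nothing but a `Read` op, `Dataset.schema()` fetches the schema even when called with `fetch_if_missing=False`. A hedged sketch of the observable difference, using the iris CSV from the doctests:

    import ray

    # A freshly read dataset is a read-only plan (a lone Read op).
    ds = ray.data.read_csv("s3://anonymous@air-example-data/iris.csv")

    # Even with fetch_if_missing=False, the read-only branch fetches the
    # schema, so this returns a concrete Schema rather than None.
    print(ds.schema(fetch_if_missing=False))

    # Appending any transform makes the plan non-read-only; the same call
    # may now return None until execution materializes a schema.
    mapped = ds.map_batches(lambda batch: batch)
    print(mapped.schema(fetch_if_missing=False))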

python/ray/data/dataset.py

Lines changed: 40 additions & 4 deletions

@@ -4882,7 +4882,16 @@ def to_tf(
             >>> import ray
             >>> ds = ray.data.read_csv("s3://anonymous@air-example-data/iris.csv")
             >>> ds
-            Dataset(num_rows=?, schema=...)
+            Dataset(
+               num_rows=?,
+               schema={
+                  sepal length (cm): double,
+                  sepal width (cm): double,
+                  petal length (cm): double,
+                  petal width (cm): double,
+                  target: int64
+               }
+            )

         If your model accepts a single tensor as input, specify a single feature column.

@@ -4904,7 +4913,16 @@ def to_tf(
             >>> ds = preprocessor.transform(ds)
             >>> ds
             Concatenator
-            +- Dataset(num_rows=?, schema=...)
+            +- Dataset(
+               num_rows=?,
+               schema={
+                  sepal length (cm): double,
+                  sepal width (cm): double,
+                  petal length (cm): double,
+                  petal width (cm): double,
+                  target: int64
+               }
+            )
             >>> ds.to_tf("features", "target")
             <_OptionsDataset element_spec=(TensorSpec(shape=(None, 4), dtype=tf.float64, name='features'), TensorSpec(shape=(None,), dtype=tf.int64, name='target'))>

@@ -5609,7 +5627,16 @@ def serialize_lineage(self) -> bytes:

         .. testoutput::

-            Dataset(num_rows=?, schema=...)
+            Dataset(
+               num_rows=?,
+               schema={
+                  sepal length (cm): double,
+                  sepal width (cm): double,
+                  petal length (cm): double,
+                  petal width (cm): double,
+                  target: int64
+               }
+            )

         Returns:

@@ -5682,7 +5709,16 @@ def deserialize_lineage(serialized_ds: bytes) -> "Dataset":

         .. testoutput::

-            Dataset(num_rows=?, schema=...)
+            Dataset(
+               num_rows=?,
+               schema={
+                  sepal length (cm): double,
+                  sepal width (cm): double,
+                  petal length (cm): double,
+                  petal width (cm): double,
+                  target: int64
+               }
+            )

         Args:
             serialized_ds: The serialized Dataset that we wish to deserialize.
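These doctest outputs all flow from the same change: `repr(Dataset)` on a read-only plan now fetches and prints the full schema. As a quick check of the lineage hunks, a hedged round trip using only the public `Dataset` APIs named in the diff:

    import ray

    ds = ray.data.read_csv("s3://anonymous@air-example-data/iris.csv")

    # serialize_lineage captures the lazy read plan, not the row data.
    blob = ds.serialize_lineage()

    # Deserializing rebuilds an equivalent lazy Dataset; printing it shows
    # the expanded iris schema from the restored doctests above.
    restored = ray.data.Dataset.deserialize_lineage(blob)
    print(restored)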

python/ray/data/iterator.py

Lines changed: 20 additions & 2 deletions

@@ -681,7 +681,16 @@ def to_tf(
             ...     "s3://anonymous@air-example-data/iris.csv"
             ... )
             >>> it = ds.iterator(); it
-            DataIterator(Dataset(num_rows=?, schema=...))
+            DataIterator(Dataset(
+               num_rows=?,
+               schema={
+                  sepal length (cm): double,
+                  sepal width (cm): double,
+                  petal length (cm): double,
+                  petal width (cm): double,
+                  target: int64
+               }
+            ))

         If your model accepts a single tensor as input, specify a single feature column.

@@ -703,7 +712,16 @@ def to_tf(
             >>> it = preprocessor.transform(ds).iterator()
             >>> it
             DataIterator(Concatenator
-            +- Dataset(num_rows=?, schema=...))
+            +- Dataset(
+               num_rows=?,
+               schema={
+                  sepal length (cm): double,
+                  sepal width (cm): double,
+                  petal length (cm): double,
+                  petal width (cm): double,
+                  target: int64
+               }
+            ))
             >>> it.to_tf("features", "target")
             <_OptionsDataset element_spec=(TensorSpec(shape=(None, 4), dtype=tf.float64, name='features'), TensorSpec(shape=(None,), dtype=tf.int64, name='target'))>
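For completeness, a hedged sketch of the `DataIterator.to_tf` path these doctests document (TensorFlow must be installed; the `Concatenator` parameters mirror the doctest setup and assume the preprocessor API of the Ray version at this commit):

    import ray
    from ray.data.preprocessors import Concatenator

    ds = ray.data.read_csv("s3://anonymous@air-example-data/iris.csv")

    # Collapse the four feature columns into one "features" tensor column,
    # matching the doctest's Concatenator setup.
    preprocessor = Concatenator(
        columns=[
            "sepal length (cm)",
            "sepal width (cm)",
            "petal length (cm)",
            "petal width (cm)",
        ],
        output_column_name="features",
    )
    it = preprocessor.transform(ds).iterator()

    # Yields (features, labels) batches as tf.Tensors.
    tf_ds = it.to_tf("features", "target")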

python/ray/data/read_api.py

Lines changed: 9 additions & 3 deletions

@@ -1499,7 +1499,7 @@ def read_csv(

         >>> ray.data.read_csv("s3://anonymous@ray-example-data/different-extensions/",
         ...     file_extensions=["csv"])
-        Dataset(num_rows=?, schema=...)
+        Dataset(num_rows=?, schema={a: int64, b: int64})

     Args:
         paths: A single file or directory, or a list of file or directory paths.

@@ -1944,7 +1944,10 @@ def read_tfrecords(
     Examples:
         >>> import ray
         >>> ray.data.read_tfrecords("s3://anonymous@ray-example-data/iris.tfrecords")
-        Dataset(num_rows=?, schema=...)
+        Dataset(
+           num_rows=?,
+           schema={...}
+        )

     We can also read compressed TFRecord files, which use one of the
     `compression types supported by Arrow <https://arrow.apache.org/docs/python/\

@@ -1954,7 +1957,10 @@ def read_tfrecords(
         ...     "s3://anonymous@ray-example-data/iris.tfrecords.gz",
         ...     arrow_open_stream_args={"compression": "gzip"},
         ... )
-        Dataset(num_rows=?, schema=...)
+        Dataset(
+           num_rows=?,
+           schema={...}
+        )

     Args:
         paths: A single file or directory, or a list of file or directory paths.
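The `read_csv` doctest now pins the concrete `{a: int64, b: int64}` schema because repr on a read-only plan performs the metadata fetch, while the `read_tfrecords` outputs keep an elided `schema={...}` so doctest's ellipsis matching accepts whatever field list the fetch returns. A hedged repro of the CSV case:

    import ray

    ds = ray.data.read_csv(
        "s3://anonymous@ray-example-data/different-extensions/",
        file_extensions=["csv"],
    )

    # With the revert, repr fetches metadata for the read-only plan:
    # Dataset(num_rows=?, schema={a: int64, b: int64})
    print(ds)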

python/ray/data/tests/test_mongo.py

Lines changed: 7 additions & 1 deletion

@@ -251,7 +251,13 @@ def test_mongo_datasource(ray_start_regular_shared, start_mongo):
         collection=foo_collection,
         override_num_blocks=1000,
     )
-    assert str(ds) == ("Dataset(num_rows=5, schema=Unknown schema)")
+    assert str(ds) == (
+        "Dataset(\n"
+        "   num_rows=5,\n"
+        "   schema={_id: fixed_size_binary[12], float_field: double, "
+        "int_field: int32}\n"
+        ")"
+    )
     assert df.equals(ds.drop_columns(["_id"]).to_pandas())

     # Read a subset of the collection.
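The updated assertion reflects the same repr change for the Mongo datasource: the schema is fetched instead of reported as `Unknown schema`. A hedged sketch of the call under test (URI, database, and collection names are hypothetical, and a running MongoDB is assumed):

    import ray

    # read_mongo builds a read-only plan, so str(ds) now includes the
    # schema fetched from the collection, as asserted above.
    ds = ray.data.read_mongo(
        uri="mongodb://localhost:27017",  # hypothetical local server
        database="my_db",                 # hypothetical database
        collection="foo",                 # hypothetical collection
        override_num_blocks=1000,
    )
    print(ds)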

python/ray/data/tests/test_parquet.py

Lines changed: 3 additions & 2 deletions

@@ -394,9 +394,10 @@ def test_parquet_read_bulk(ray_start_regular_shared, fs, data_path):
     assert "test1.parquet" in str(input_files)
     assert "test2.parquet" in str(input_files)
     assert not ds._plan.has_started_execution
-    assert ds.schema() == Schema(pa.schema({"one": pa.int64(), "two": pa.string()}))

     # Schema isn't available, so we do a partial read.
+    assert ds.schema() == Schema(pa.schema({"one": pa.int64(), "two": pa.string()}))
+    assert ds._plan.has_started_execution
     assert not ds._plan.has_computed_output()

     # Forces a data read.

@@ -476,7 +477,7 @@ def test_parquet_read_bulk_meta_provider(ray_start_regular_shared, fs, data_path
     assert ds.count() == 6
     assert ds.size_bytes() > 0
     assert ds.schema() == Schema(pa.schema({"one": pa.int64(), "two": pa.string()}))
-    assert not ds._plan.has_started_execution
+    assert ds._plan.has_started_execution

     # Forces a data read.
     values = [[s["one"], s["two"]] for s in ds.take()]
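The flipped `has_started_execution` assertions capture the cost side of the revert: `schema()` on a bulk-read Parquet dataset now triggers a partial, metadata-only read instead of staying fully lazy. A hedged sketch (file paths hypothetical):

    import ray

    ds = ray.data.read_parquet_bulk(
        ["test1.parquet", "test2.parquet"]  # hypothetical local files
    )
    assert not ds._plan.has_started_execution

    # With the revert, fetching the schema performs a partial read...
    ds.schema()

    # ...so the plan has started executing, though no full output exists yet.
    assert ds._plan.has_started_execution
    assert not ds._plan.has_computed_output()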
