
Commit f23510c

Revert "[Data] Speed up printing the schema (#52612)" (#52753)
This reverts commit 2c9e17d; it is breaking doc tests.
1 parent: 3522305

File tree: 7 files changed (+93, -14 lines)


doc/source/data/working-with-tensors.rst

Lines changed: 4 additions & 1 deletion

@@ -21,7 +21,10 @@ Ray Data represents tensors as

     .. testoutput::

-        Dataset(num_rows=100, schema=...)
+        Dataset(
+           num_rows=100,
+           schema={image: numpy.ndarray(shape=(28, 28), dtype=uint8)}
+        )

 Batches of fixed-shape tensors
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
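With the revert in place, a tensor dataset's repr again spells out the full tensor spec instead of the elided `schema=...`. A minimal sketch using `ray.data.range_tensor` as a synthetic stand-in for the doc page's image dataset (the exact dtype in the output is whatever `range_tensor` produces, not the doc's uint8 images):

    import ray

    # range_tensor creates a single fixed-shape tensor column named "data".
    ds = ray.data.range_tensor(100, shape=(28, 28))

    # The repr now includes the tensor schema, e.g.:
    # Dataset(
    #    num_rows=100,
    #    schema={data: numpy.ndarray(shape=(28, 28), dtype=int64)}
    # )
    print(ds)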

python/ray/data/_internal/plan.py

Lines changed: 10 additions & 1 deletion

@@ -373,7 +373,7 @@ def schema(
         elif self._logical_plan.dag.aggregate_output_metadata().schema is not None:
             schema = self._logical_plan.dag.aggregate_output_metadata().schema
-        elif fetch_if_missing:
+        elif fetch_if_missing or self.is_read_only():
             # For consistency with the previous implementation, we fetch the schema if
             # the plan is read-only even if `fetch_if_missing` is False.

@@ -587,6 +587,15 @@ def has_lazy_input(self) -> bool:
         """Return whether this plan has lazy input blocks."""
         return all(isinstance(op, Read) for op in self._logical_plan.sources())

+    def is_read_only(self, root_op: Optional[LogicalOperator] = None) -> bool:
+        """Return whether the LogicalPlan corresponding to `root_op`
+        contains only a Read op. By default, the last operator of
+        the LogicalPlan is used."""
+        if root_op is None:
+            root_op = self._logical_plan.dag
+
+        return root_op.is_read_op()
+
     def has_computed_output(self) -> bool:
         """Whether this plan has a computed snapshot for the final operator, i.e. for
         the output of this plan.
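The restored `is_read_only` branch is the behavioral core of the revert: for a plan that is nothing but a `Read` op, `Dataset.schema()` fetches the schema even when called with `fetch_if_missing=False`. A hedged sketch of the observable difference, using the iris CSV from the doctests:

    import ray

    # A freshly read dataset is a read-only plan (a lone Read op).
    ds = ray.data.read_csv("s3://anonymous@air-example-data/iris.csv")

    # Even with fetch_if_missing=False, the read-only branch fetches the
    # schema, so this returns a concrete Schema rather than None.
    print(ds.schema(fetch_if_missing=False))

    # Appending any transform makes the plan non-read-only; the same call
    # may now return None until execution materializes a schema.
    mapped = ds.map_batches(lambda batch: batch)
    print(mapped.schema(fetch_if_missing=False))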

python/ray/data/dataset.py

Lines changed: 40 additions & 4 deletions

@@ -4882,7 +4882,16 @@ def to_tf(
             >>> import ray
             >>> ds = ray.data.read_csv("s3://anonymous@air-example-data/iris.csv")
             >>> ds
-            Dataset(num_rows=?, schema=...)
+            Dataset(
+               num_rows=?,
+               schema={
+                  sepal length (cm): double,
+                  sepal width (cm): double,
+                  petal length (cm): double,
+                  petal width (cm): double,
+                  target: int64
+               }
+            )

         If your model accepts a single tensor as input, specify a single feature column.

@@ -4904,7 +4913,16 @@ def to_tf(
             >>> ds = preprocessor.transform(ds)
             >>> ds
             Concatenator
-            +- Dataset(num_rows=?, schema=...)
+            +- Dataset(
+               num_rows=?,
+               schema={
+                  sepal length (cm): double,
+                  sepal width (cm): double,
+                  petal length (cm): double,
+                  petal width (cm): double,
+                  target: int64
+               }
+            )
             >>> ds.to_tf("features", "target")
             <_OptionsDataset element_spec=(TensorSpec(shape=(None, 4), dtype=tf.float64, name='features'), TensorSpec(shape=(None,), dtype=tf.int64, name='target'))>

@@ -5609,7 +5627,16 @@ def serialize_lineage(self) -> bytes:

         .. testoutput::

-            Dataset(num_rows=?, schema=...)
+            Dataset(
+               num_rows=?,
+               schema={
+                  sepal length (cm): double,
+                  sepal width (cm): double,
+                  petal length (cm): double,
+                  petal width (cm): double,
+                  target: int64
+               }
+            )

         Returns:

@@ -5682,7 +5709,16 @@ def deserialize_lineage(serialized_ds: bytes) -> "Dataset":

         .. testoutput::

-            Dataset(num_rows=?, schema=...)
+            Dataset(
+               num_rows=?,
+               schema={
+                  sepal length (cm): double,
+                  sepal width (cm): double,
+                  petal length (cm): double,
+                  petal width (cm): double,
+                  target: int64
+               }
+            )

         Args:
             serialized_ds: The serialized Dataset that we wish to deserialize.
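These doctest outputs all flow from the same change: `repr(Dataset)` on a read-only plan now fetches and prints the full schema. As a quick check of the lineage hunks, a hedged round trip using only the public `Dataset` APIs named in the diff:

    import ray

    ds = ray.data.read_csv("s3://anonymous@air-example-data/iris.csv")

    # serialize_lineage captures the lazy read plan, not the row data.
    blob = ds.serialize_lineage()

    # Deserializing rebuilds an equivalent lazy Dataset; printing it shows
    # the expanded iris schema from the restored doctests above.
    restored = ray.data.Dataset.deserialize_lineage(blob)
    print(restored)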

python/ray/data/iterator.py

Lines changed: 20 additions & 2 deletions

@@ -681,7 +681,16 @@ def to_tf(
             ...     "s3://anonymous@air-example-data/iris.csv"
             ... )
             >>> it = ds.iterator(); it
-            DataIterator(Dataset(num_rows=?, schema=...))
+            DataIterator(Dataset(
+               num_rows=?,
+               schema={
+                  sepal length (cm): double,
+                  sepal width (cm): double,
+                  petal length (cm): double,
+                  petal width (cm): double,
+                  target: int64
+               }
+            ))

         If your model accepts a single tensor as input, specify a single feature column.

@@ -703,7 +712,16 @@ def to_tf(
             >>> it = preprocessor.transform(ds).iterator()
             >>> it
             DataIterator(Concatenator
-            +- Dataset(num_rows=?, schema=...))
+            +- Dataset(
+               num_rows=?,
+               schema={
+                  sepal length (cm): double,
+                  sepal width (cm): double,
+                  petal length (cm): double,
+                  petal width (cm): double,
+                  target: int64
+               }
+            ))
             >>> it.to_tf("features", "target")
             <_OptionsDataset element_spec=(TensorSpec(shape=(None, 4), dtype=tf.float64, name='features'), TensorSpec(shape=(None,), dtype=tf.int64, name='target'))>
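For completeness, a hedged sketch of the `DataIterator.to_tf` path these doctests document (TensorFlow must be installed; the `Concatenator` parameters mirror the doctest setup and assume the preprocessor API of the Ray version at this commit):

    import ray
    from ray.data.preprocessors import Concatenator

    ds = ray.data.read_csv("s3://anonymous@air-example-data/iris.csv")

    # Collapse the four feature columns into one "features" tensor column,
    # matching the doctest's Concatenator setup.
    preprocessor = Concatenator(
        columns=[
            "sepal length (cm)",
            "sepal width (cm)",
            "petal length (cm)",
            "petal width (cm)",
        ],
        output_column_name="features",
    )
    it = preprocessor.transform(ds).iterator()

    # Yields (features, labels) batches as tf.Tensors.
    tf_ds = it.to_tf("features", "target")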

python/ray/data/read_api.py

Lines changed: 9 additions & 3 deletions

@@ -1499,7 +1499,7 @@ def read_csv(

         >>> ray.data.read_csv("s3://anonymous@ray-example-data/different-extensions/",
         ...     file_extensions=["csv"])
-        Dataset(num_rows=?, schema=...)
+        Dataset(num_rows=?, schema={a: int64, b: int64})

     Args:
         paths: A single file or directory, or a list of file or directory paths.

@@ -1944,7 +1944,10 @@ def read_tfrecords(
     Examples:
         >>> import ray
         >>> ray.data.read_tfrecords("s3://anonymous@ray-example-data/iris.tfrecords")
-        Dataset(num_rows=?, schema=...)
+        Dataset(
+           num_rows=?,
+           schema={...}
+        )

     We can also read compressed TFRecord files, which use one of the
     `compression types supported by Arrow <https://arrow.apache.org/docs/python/\

@@ -1954,7 +1957,10 @@ def read_tfrecords(
         ...     "s3://anonymous@ray-example-data/iris.tfrecords.gz",
         ...     arrow_open_stream_args={"compression": "gzip"},
         ... )
-        Dataset(num_rows=?, schema=...)
+        Dataset(
+           num_rows=?,
+           schema={...}
+        )

     Args:
         paths: A single file or directory, or a list of file or directory paths.
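The `read_csv` doctest now pins the concrete `{a: int64, b: int64}` schema because repr on a read-only plan performs the metadata fetch, while the `read_tfrecords` outputs keep an elided `schema={...}` so doctest's ellipsis matching accepts whatever field list the fetch returns. A hedged repro of the CSV case:

    import ray

    ds = ray.data.read_csv(
        "s3://anonymous@ray-example-data/different-extensions/",
        file_extensions=["csv"],
    )

    # With the revert, repr fetches metadata for the read-only plan:
    # Dataset(num_rows=?, schema={a: int64, b: int64})
    print(ds)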

python/ray/data/tests/test_mongo.py

Lines changed: 7 additions & 1 deletion

@@ -251,7 +251,13 @@ def test_mongo_datasource(ray_start_regular_shared, start_mongo):
         collection=foo_collection,
         override_num_blocks=1000,
     )
-    assert str(ds) == ("Dataset(num_rows=5, schema=Unknown schema)")
+    assert str(ds) == (
+        "Dataset(\n"
+        "   num_rows=5,\n"
+        "   schema={_id: fixed_size_binary[12], float_field: double, "
+        "int_field: int32}\n"
+        ")"
+    )
     assert df.equals(ds.drop_columns(["_id"]).to_pandas())

     # Read a subset of the collection.
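The updated assertion reflects the same repr change for the Mongo datasource: the schema is fetched instead of reported as `Unknown schema`. A hedged sketch of the call under test (URI, database, and collection names are hypothetical, and a running MongoDB is assumed):

    import ray

    # read_mongo builds a read-only plan, so str(ds) now includes the
    # schema fetched from the collection, as asserted above.
    ds = ray.data.read_mongo(
        uri="mongodb://localhost:27017",  # hypothetical local server
        database="my_db",                 # hypothetical database
        collection="foo",                 # hypothetical collection
        override_num_blocks=1000,
    )
    print(ds)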

python/ray/data/tests/test_parquet.py

Lines changed: 3 additions & 2 deletions

@@ -394,9 +394,10 @@ def test_parquet_read_bulk(ray_start_regular_shared, fs, data_path):
     assert "test1.parquet" in str(input_files)
     assert "test2.parquet" in str(input_files)
     assert not ds._plan.has_started_execution
-    assert ds.schema() == Schema(pa.schema({"one": pa.int64(), "two": pa.string()}))

     # Schema isn't available, so we do a partial read.
+    assert ds.schema() == Schema(pa.schema({"one": pa.int64(), "two": pa.string()}))
+    assert ds._plan.has_started_execution
     assert not ds._plan.has_computed_output()

     # Forces a data read.

@@ -476,7 +477,7 @@ def test_parquet_read_bulk_meta_provider(ray_start_regular_shared, fs, data_path
     assert ds.count() == 6
     assert ds.size_bytes() > 0
     assert ds.schema() == Schema(pa.schema({"one": pa.int64(), "two": pa.string()}))
-    assert not ds._plan.has_started_execution
+    assert ds._plan.has_started_execution

     # Forces a data read.
     values = [[s["one"], s["two"]] for s in ds.take()]
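The flipped `has_started_execution` assertions capture the cost side of the revert: `schema()` on a bulk-read Parquet dataset now triggers a partial, metadata-only read instead of staying fully lazy. A hedged sketch (file paths hypothetical):

    import ray

    ds = ray.data.read_parquet_bulk(
        ["test1.parquet", "test2.parquet"]  # hypothetical local files
    )
    assert not ds._plan.has_started_execution

    # With the revert, fetching the schema performs a partial read...
    ds.schema()

    # ...so the plan has started executing, though no full output exists yet.
    assert ds._plan.has_started_execution
    assert not ds._plan.has_computed_output()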
