From 8657cdc99d22c1c6feeb4834a876ea1ffd342d4c Mon Sep 17 00:00:00 2001
From: jhsu
Date: Fri, 2 May 2025 11:27:33 -0700
Subject: [PATCH 1/4] fix working code snippets

Signed-off-by: jhsu
---
 doc/source/ray-contribute/writing-code-snippets.rst | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/doc/source/ray-contribute/writing-code-snippets.rst b/doc/source/ray-contribute/writing-code-snippets.rst
index 1ef1c69f0170..f1e0db7b59de 100644
--- a/doc/source/ray-contribute/writing-code-snippets.rst
+++ b/doc/source/ray-contribute/writing-code-snippets.rst
@@ -228,10 +228,7 @@ To ignore parts of a *doctest-style* output, replace problematic sections with e
 
     >>> import ray
     >>> ray.data.read_images("s3://anonymous@ray-example-data/image-datasets/simple")
-    Dataset(
-        num_rows=...,
-        schema={image: numpy.ndarray(shape=(32, 32, 3), dtype=uint8)}
-    )
+    Dataset(num_rows=..., schema=...)
 
 To ignore an output altogether, write a *code-output-style* snippet. Don't use
 `# doctest: +SKIP`.
@@ -249,10 +246,7 @@ with ellipses. ::
 
     .. testoutput::
 
-        Dataset(
-            num_rows=...,
-            schema={image: numpy.ndarray(shape=(32, 32, 3), dtype=uint8)}
-        )
+        Dataset(num_rows=3, schema=...)
 
 If your output is nondeterministic and you want to display a sample output, add
 `:options: +MOCK`. ::

From d85651ef1957e7dc0e0ed5abffff02615afa712c Mon Sep 17 00:00:00 2001
From: jhsu
Date: Fri, 2 May 2025 11:29:05 -0700
Subject: [PATCH 2/4] use ...

Signed-off-by: jhsu
---
 doc/source/ray-contribute/writing-code-snippets.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/ray-contribute/writing-code-snippets.rst b/doc/source/ray-contribute/writing-code-snippets.rst
index f1e0db7b59de..7a8c84dc5c5b 100644
--- a/doc/source/ray-contribute/writing-code-snippets.rst
+++ b/doc/source/ray-contribute/writing-code-snippets.rst
@@ -246,7 +246,7 @@ with ellipses. ::
 
     .. testoutput::
 
-        Dataset(num_rows=3, schema=...)
+        Dataset(num_rows=..., schema=...)
 
 If your output is nondeterministic and you want to display a sample output, add
 `:options: +MOCK`. ::

From 4339948ccc8b11696d7911c0628f51c40a96fa48 Mon Sep 17 00:00:00 2001
From: jhsu
Date: Fri, 2 May 2025 11:57:52 -0700
Subject: [PATCH 3/4] fix inspecting data rst

Signed-off-by: jhsu
---
 doc/source/data/inspecting-data.rst | 12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

diff --git a/doc/source/data/inspecting-data.rst b/doc/source/data/inspecting-data.rst
index 406b4165d5de..986b0d82b6e1 100644
--- a/doc/source/data/inspecting-data.rst
+++ b/doc/source/data/inspecting-data.rst
@@ -51,16 +51,7 @@ For more information like the number of rows, print the Dataset.
 
     .. testoutput::
 
-        Dataset(
-            num_rows=150,
-            schema={
-                sepal length (cm): double,
-                sepal width (cm): double,
-                petal length (cm): double,
-                petal width (cm): double,
-                target: int64
-            }
-        )
+        Dataset(num_rows=..., schema=...)
 
 .. _inspecting-rows:
 
@@ -138,7 +129,6 @@ of the returned batch, set ``batch_format``.
        0                5.1               3.5  ...               0.2       0
        1                4.9               3.0  ...               0.2       0
 
-       [2 rows x 5 columns]
 
 For more information on working with batches, see
 :ref:`Transforming batches ` and

From 87181dab5f72a03475d80e29cf2f35e0ba7aad71 Mon Sep 17 00:00:00 2001
From: iamjustinhsu <140442892+iamjustinhsu@users.noreply.github.com>
Date: Thu, 1 May 2025 18:47:22 -0700
Subject: [PATCH 4/4] [Data] Speed up printing the schema (#52612)

## Why are these changes needed?

If a dataset only has read operations applied, then because of the lazy execution model, resolving its schema requires actually performing the read. This PR removes the special case that fetched the schema for read-only plans even when `fetch_if_missing` is `False`, so printing such a dataset no longer triggers computation just to display the schema.
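To make the intended behavior concrete, here is a minimal sketch (not part of this PR's diff; the example path comes from the existing docstrings, and the exact repr text may differ):

```python
import ray

# Reading is lazy: no read tasks run until the data is actually needed.
ds = ray.data.read_csv("s3://anonymous@air-example-data/iris.csv")

# Previously, repr() on a freshly read dataset could kick off a read just to
# resolve the schema. With this change, printing stays lazy and shows a
# placeholder, along the lines of: Dataset(num_rows=?, schema=...)
print(ds)

# Explicitly asking for the schema still computes it when needed.
print(ds.schema())
```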
## Related issue number

https://github.com/ray-project/ray/issues/50361

## Checks

- [ ] I've signed off every commit (by using the -s flag, i.e., `git commit -s`) in this PR.
- [x] I've run `scripts/format.sh` to lint the changes in this PR.
- [ ] I've included any doc changes needed for https://docs.ray.io/en/master/.
- [ ] I've added any new APIs to the API Reference. For example, if I added a method in Tune, I've added it in `doc/source/tune/api/` under the corresponding `.rst` file.
- [ ] I've made sure the tests are passing. Note that there might be a few flaky tests, see the recent failures at https://flakey-tests.ray.io/
- Testing Strategy
   - [ ] Unit tests
   - [ ] Release tests
   - [ ] This PR is not tested :(

---------

Signed-off-by: jhsu
---
 doc/source/data/working-with-tensors.rst |  5 +--
 python/ray/data/_internal/plan.py        | 11 +-----
 python/ray/data/dataset.py               | 44 +++---------------------
 python/ray/data/iterator.py              | 22 ++----------
 python/ray/data/read_api.py              | 12 ++-----
 python/ray/data/tests/test_mongo.py      |  8 +----
 python/ray/data/tests/test_parquet.py    |  5 ++-
 7 files changed, 14 insertions(+), 93 deletions(-)

diff --git a/doc/source/data/working-with-tensors.rst b/doc/source/data/working-with-tensors.rst
index b0f40f959e18..637b94b749dc 100644
--- a/doc/source/data/working-with-tensors.rst
+++ b/doc/source/data/working-with-tensors.rst
@@ -21,10 +21,7 @@ Ray Data represents tensors as
 
 .. testoutput::
 
-    Dataset(
-        num_rows=100,
-        schema={image: numpy.ndarray(shape=(28, 28), dtype=uint8)}
-    )
+    Dataset(num_rows=100, schema=...)
 
 Batches of fixed-shape tensors
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

diff --git a/python/ray/data/_internal/plan.py b/python/ray/data/_internal/plan.py
index 1a8ab59c8c50..ce0d887f91ec 100644
--- a/python/ray/data/_internal/plan.py
+++ b/python/ray/data/_internal/plan.py
@@ -373,7 +373,7 @@ def schema(
 
         elif self._logical_plan.dag.aggregate_output_metadata().schema is not None:
             schema = self._logical_plan.dag.aggregate_output_metadata().schema
-        elif fetch_if_missing or self.is_read_only():
+        elif fetch_if_missing:
             # For consistency with the previous implementation, we fetch the schema if
             # the plan is read-only even if `fetch_if_missing` is False.
@@ -587,15 +587,6 @@ def has_lazy_input(self) -> bool:
         """Return whether this plan has lazy input blocks."""
         return all(isinstance(op, Read) for op in self._logical_plan.sources())
 
-    def is_read_only(self, root_op: Optional[LogicalOperator] = None) -> bool:
-        """Return whether the LogicalPlan corresponding to `root_op`
-        contains only a Read op. By default, the last operator of
-        the LogicalPlan is used."""
-        if root_op is None:
-            root_op = self._logical_plan.dag
-
-        return root_op.is_read_op()
-
     def has_computed_output(self) -> bool:
         """Whether this plan has a computed snapshot for the final operator,
         i.e. for the output of this plan.

diff --git a/python/ray/data/dataset.py b/python/ray/data/dataset.py
index 519992e6d0ce..60299d3742e9 100644
--- a/python/ray/data/dataset.py
+++ b/python/ray/data/dataset.py
@@ -4882,16 +4882,7 @@ def to_tf(
 
             >>> import ray
             >>> ds = ray.data.read_csv("s3://anonymous@air-example-data/iris.csv")
             >>> ds
-            Dataset(
-                num_rows=?,
-                schema={
-                    sepal length (cm): double,
-                    sepal width (cm): double,
-                    petal length (cm): double,
-                    petal width (cm): double,
-                    target: int64
-                }
-            )
+            Dataset(num_rows=?, schema=...)
 
             If your model accepts a single tensor as input, specify a single feature column.
@@ -4913,16 +4904,7 @@ def to_tf(
             >>> ds = preprocessor.transform(ds)
             >>> ds
             Concatenator
-            +- Dataset(
-                   num_rows=?,
-                   schema={
-                      sepal length (cm): double,
-                      sepal width (cm): double,
-                      petal length (cm): double,
-                      petal width (cm): double,
-                      target: int64
-                   }
-               )
+            +- Dataset(num_rows=?, schema=...)
             >>> ds.to_tf("features", "target")
             <_OptionsDataset element_spec=(TensorSpec(shape=(None, 4), dtype=tf.float64, name='features'), TensorSpec(shape=(None,), dtype=tf.int64, name='target'))>
@@ -5627,16 +5609,7 @@ def serialize_lineage(self) -> bytes:
 
         .. testoutput::
 
-            Dataset(
-                num_rows=?,
-                schema={
-                    sepal length (cm): double,
-                    sepal width (cm): double,
-                    petal length (cm): double,
-                    petal width (cm): double,
-                    target: int64
-                }
-            )
+            Dataset(num_rows=?, schema=...)
 
 
         Returns:
@@ -5709,16 +5682,7 @@ def deserialize_lineage(serialized_ds: bytes) -> "Dataset":
 
         .. testoutput::
 
-            Dataset(
-                num_rows=?,
-                schema={
-                    sepal length (cm): double,
-                    sepal width (cm): double,
-                    petal length (cm): double,
-                    petal width (cm): double,
-                    target: int64
-                }
-            )
+            Dataset(num_rows=?, schema=...)
 
         Args:
             serialized_ds: The serialized Dataset that we wish to deserialize.

diff --git a/python/ray/data/iterator.py b/python/ray/data/iterator.py
index 7bf8aab67e3a..d410c0566e2d 100644
--- a/python/ray/data/iterator.py
+++ b/python/ray/data/iterator.py
@@ -681,16 +681,7 @@ def to_tf(
 
             ...     "s3://anonymous@air-example-data/iris.csv"
             ... )
             >>> it = ds.iterator(); it
-            DataIterator(Dataset(
-                num_rows=?,
-                schema={
-                    sepal length (cm): double,
-                    sepal width (cm): double,
-                    petal length (cm): double,
-                    petal width (cm): double,
-                    target: int64
-                }
-            ))
+            DataIterator(Dataset(num_rows=?, schema=...))
 
             If your model accepts a single tensor as input, specify a single feature column.
@@ -712,16 +703,7 @@
             >>> it = preprocessor.transform(ds).iterator()
             >>> it
             DataIterator(Concatenator
-            +- Dataset(
-                   num_rows=?,
-                   schema={
-                      sepal length (cm): double,
-                      sepal width (cm): double,
-                      petal length (cm): double,
-                      petal width (cm): double,
-                      target: int64
-                   }
-               ))
+            +- Dataset(num_rows=?, schema=...))
             >>> it.to_tf("features", "target")
             <_OptionsDataset element_spec=(TensorSpec(shape=(None, 4), dtype=tf.float64, name='features'), TensorSpec(shape=(None,), dtype=tf.int64, name='target'))>

diff --git a/python/ray/data/read_api.py b/python/ray/data/read_api.py
index 786aa30e650c..66be38881a3c 100644
--- a/python/ray/data/read_api.py
+++ b/python/ray/data/read_api.py
@@ -1499,7 +1499,7 @@ def read_csv(
 
         >>> ray.data.read_csv("s3://anonymous@ray-example-data/different-extensions/",
         ...     file_extensions=["csv"])
-        Dataset(num_rows=?, schema={a: int64, b: int64})
+        Dataset(num_rows=?, schema=...)
 
     Args:
         paths: A single file or directory, or a list of file or directory paths.
@@ -1944,10 +1944,7 @@ def read_tfrecords(
     Examples:
         >>> import ray
         >>> ray.data.read_tfrecords("s3://anonymous@ray-example-data/iris.tfrecords")
-        Dataset(
-            num_rows=?,
-            schema={...}
-        )
+        Dataset(num_rows=?, schema=...)
 
         We can also read compressed TFRecord files, which use one of the
        `compression types supported by Arrow 0
     assert ds.schema() == Schema(pa.schema({"one": pa.int64(), "two": pa.string()}))
-    assert ds._plan.has_started_execution
+    assert not ds._plan.has_started_execution
 
     # Forces a data read.
     values = [[s["one"], s["two"]] for s in ds.take()]
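A rough way to sanity-check the new laziness locally, modeled on the surviving fragment of the updated parquet test above. This sketch is illustrative only: the file path is made up, and `_plan.has_started_execution` is an internal attribute rather than public API.

```python
import pyarrow as pa
import pyarrow.parquet as pq
import ray

# Write a small Parquet file to read back (hypothetical path).
pq.write_table(
    pa.table({"one": [1, 2, 3], "two": ["a", "b", "c"]}),
    "/tmp/example.parquet",
)

ds = ray.data.read_parquet("/tmp/example.parquet")

# Parquet metadata supplies the schema, so fetching it shouldn't start execution.
print(ds.schema())
assert not ds._plan.has_started_execution

# Taking rows forces a data read, which does start execution.
values = [[s["one"], s["two"]] for s in ds.take()]
assert ds._plan.has_started_execution
```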