diff --git a/doc/source/data/inspecting-data.rst b/doc/source/data/inspecting-data.rst
index 406b4165d5def..986b0d82b6e1c 100644
--- a/doc/source/data/inspecting-data.rst
+++ b/doc/source/data/inspecting-data.rst
@@ -51,16 +51,7 @@ For more information like the number of rows, print the Dataset.
 
     .. testoutput::
 
-        Dataset(
-           num_rows=150,
-           schema={
-              sepal length (cm): double,
-              sepal width (cm): double,
-              petal length (cm): double,
-              petal width (cm): double,
-              target: int64
-           }
-        )
+        Dataset(num_rows=..., schema=...)
 
 .. _inspecting-rows:
 
@@ -138,7 +129,6 @@ of the returned batch, set ``batch_format``.
        0                5.1               3.5  ...               0.2       0
        1                4.9               3.0  ...               0.2       0
 
-       [2 rows x 5 columns]
 
 For more information on working with batches, see
 :ref:`Transforming batches <transforming_batches>` and
diff --git a/doc/source/data/working-with-tensors.rst b/doc/source/data/working-with-tensors.rst
index b0f40f959e18b..637b94b749dc7 100644
--- a/doc/source/data/working-with-tensors.rst
+++ b/doc/source/data/working-with-tensors.rst
@@ -21,10 +21,7 @@ Ray Data represents tensors as
 
     .. testoutput::
 
-        Dataset(
-           num_rows=100,
-           schema={image: numpy.ndarray(shape=(28, 28), dtype=uint8)}
-        )
+        Dataset(num_rows=100, schema=...)
 
 Batches of fixed-shape tensors
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/doc/source/ray-contribute/writing-code-snippets.rst b/doc/source/ray-contribute/writing-code-snippets.rst
index 1ef1c69f0170a..7a8c84dc5c5b0 100644
--- a/doc/source/ray-contribute/writing-code-snippets.rst
+++ b/doc/source/ray-contribute/writing-code-snippets.rst
@@ -228,10 +228,7 @@ To ignore parts of a *doctest-style* output, replace problematic sections with ellipses. ::
 
     >>> import ray
     >>> ray.data.read_images("s3://anonymous@ray-example-data/image-datasets/simple")
-    Dataset(
-       num_rows=...,
-       schema={image: numpy.ndarray(shape=(32, 32, 3), dtype=uint8)}
-    )
+    Dataset(num_rows=..., schema=...)
 
 To ignore an output altogether, write a *code-output-style* snippet. Don't use
 `# doctest: +SKIP`.
@@ -249,10 +246,7 @@ with ellipses. ::
 
     .. testoutput::
 
-        Dataset(
-           num_rows=...,
-           schema={image: numpy.ndarray(shape=(32, 32, 3), dtype=uint8)}
-        )
+        Dataset(num_rows=..., schema=...)
 
 If your output is nondeterministic and you want to display a sample output, add
 `:options: +MOCK`. ::
diff --git a/python/ray/data/_internal/plan.py b/python/ray/data/_internal/plan.py
index 1a8ab59c8c501..ce0d887f91ec3 100644
--- a/python/ray/data/_internal/plan.py
+++ b/python/ray/data/_internal/plan.py
@@ -373,7 +373,5 @@ def schema(
 
         elif self._logical_plan.dag.aggregate_output_metadata().schema is not None:
             schema = self._logical_plan.dag.aggregate_output_metadata().schema
 
-        elif fetch_if_missing or self.is_read_only():
-            # For consistency with the previous implementation, we fetch the schema if
-            # the plan is read-only even if `fetch_if_missing` is False.
+        elif fetch_if_missing:
@@ -587,15 +587,6 @@ def has_lazy_input(self) -> bool:
         """Return whether this plan has lazy input blocks."""
         return all(isinstance(op, Read) for op in self._logical_plan.sources())
 
-    def is_read_only(self, root_op: Optional[LogicalOperator] = None) -> bool:
-        """Return whether the LogicalPlan corresponding to `root_op`
-        contains only a Read op. By default, the last operator of
-        the LogicalPlan is used."""
-        if root_op is None:
-            root_op = self._logical_plan.dag
-
-        return root_op.is_read_op()
-
     def has_computed_output(self) -> bool:
         """Whether this plan has a computed snapshot for the final operator, i.e.
         for the output of this plan.
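Reviewer note: the `plan.py` hunks above change when `ExecutionPlan.schema` fetches
metadata, so `fetch_if_missing=False` no longer special-cases read-only plans. A
minimal sketch of the expected behavior, assuming a local Ray installation;
`_plan.has_started_execution` is the internal flag this PR's tests rely on, not
public API:

```python
import ray

# Reads are planned lazily; nothing executes at this point.
ds = ray.data.read_csv("s3://anonymous@air-example-data/iris.csv")

# With `is_read_only()` gone from the condition, a metadata-only call
# returns only what the plan already knows (possibly None) rather than
# fetching the schema just because the plan is a bare read.
print(ds.schema(fetch_if_missing=False))

# The default `fetch_if_missing=True` still resolves the schema, and per
# the test change below it should not count as starting execution.
print(ds.schema())
assert not ds._plan.has_started_execution
```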
diff --git a/python/ray/data/dataset.py b/python/ray/data/dataset.py
index e8a821b102230..9f17fedd3c167 100644
--- a/python/ray/data/dataset.py
+++ b/python/ray/data/dataset.py
@@ -5008,16 +5008,7 @@ def to_tf(
             >>> import ray
             >>> ds = ray.data.read_csv("s3://anonymous@air-example-data/iris.csv")
             >>> ds
-            Dataset(
-               num_rows=?,
-               schema={
-                  sepal length (cm): double,
-                  sepal width (cm): double,
-                  petal length (cm): double,
-                  petal width (cm): double,
-                  target: int64
-               }
-            )
+            Dataset(num_rows=?, schema=...)
 
             If your model accepts a single tensor as input, specify a single feature column.
 
@@ -5039,16 +5030,7 @@ def to_tf(
             >>> ds = preprocessor.transform(ds)
             >>> ds
             Concatenator
-            +- Dataset(
-                  num_rows=?,
-                  schema={
-                     sepal length (cm): double,
-                     sepal width (cm): double,
-                     petal length (cm): double,
-                     petal width (cm): double,
-                     target: int64
-                  }
-               )
+            +- Dataset(num_rows=?, schema=...)
 
             >>> ds.to_tf("features", "target")
             <_OptionsDataset element_spec=(TensorSpec(shape=(None, 4), dtype=tf.float64, name='features'), TensorSpec(shape=(None,), dtype=tf.int64, name='target'))>
@@ -5753,16 +5735,7 @@ def serialize_lineage(self) -> bytes:
 
             .. testoutput::
 
-                Dataset(
-                   num_rows=?,
-                   schema={
-                      sepal length (cm): double,
-                      sepal width (cm): double,
-                      petal length (cm): double,
-                      petal width (cm): double,
-                      target: int64
-                   }
-                )
+                Dataset(num_rows=?, schema=...)
 
         Returns:
 
@@ -5835,16 +5808,7 @@ def deserialize_lineage(serialized_ds: bytes) -> "Dataset":
 
             .. testoutput::
 
-                Dataset(
-                   num_rows=?,
-                   schema={
-                      sepal length (cm): double,
-                      sepal width (cm): double,
-                      petal length (cm): double,
-                      petal width (cm): double,
-                      target: int64
-                   }
-                )
+                Dataset(num_rows=?, schema=...)
 
         Args:
             serialized_ds: The serialized Dataset that we wish to deserialize.
diff --git a/python/ray/data/iterator.py b/python/ray/data/iterator.py
index 7bf8aab67e3a2..d410c0566e2d3 100644
--- a/python/ray/data/iterator.py
+++ b/python/ray/data/iterator.py
@@ -681,16 +681,7 @@ def to_tf(
             ...     "s3://anonymous@air-example-data/iris.csv"
             ... )
             >>> it = ds.iterator(); it
-            DataIterator(Dataset(
-               num_rows=?,
-               schema={
-                  sepal length (cm): double,
-                  sepal width (cm): double,
-                  petal length (cm): double,
-                  petal width (cm): double,
-                  target: int64
-               }
-            ))
+            DataIterator(Dataset(num_rows=?, schema=...))
 
             If your model accepts a single tensor as input, specify a single feature column.
 
@@ -712,16 +703,7 @@ def to_tf(
             >>> it = preprocessor.transform(ds).iterator()
             >>> it
             DataIterator(Concatenator
-            +- Dataset(
-                  num_rows=?,
-                  schema={
-                     sepal length (cm): double,
-                     sepal width (cm): double,
-                     petal length (cm): double,
-                     petal width (cm): double,
-                     target: int64
-                  }
-               ))
+            +- Dataset(num_rows=?, schema=...))
 
             >>> it.to_tf("features", "target")
             <_OptionsDataset element_spec=(TensorSpec(shape=(None, 4), dtype=tf.float64, name='features'), TensorSpec(shape=(None,), dtype=tf.int64, name='target'))>
diff --git a/python/ray/data/read_api.py b/python/ray/data/read_api.py
index 786aa30e650c4..66be38881a3c5 100644
--- a/python/ray/data/read_api.py
+++ b/python/ray/data/read_api.py
@@ -1499,7 +1499,7 @@ def read_csv(
 
         >>> ray.data.read_csv("s3://anonymous@ray-example-data/different-extensions/",
         ...     file_extensions=["csv"])
-        Dataset(num_rows=?, schema={a: int64, b: int64})
+        Dataset(num_rows=?, schema=...)
 
     Args:
         paths: A single file or directory, or a list of file or directory paths.
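All of the docstring updates above lean on doctest's ELLIPSIS matching: `...` in an
expected output matches any substring of the actual repr, so row counts and schema
details can drift without breaking the docs (this is the ellipsis style that
`writing-code-snippets.rst` above tells contributors to use). A standalone sketch
using plain `doctest` with the flag enabled explicitly; the printed repr is a
made-up stand-in for a real `Dataset`:

```python
import doctest

def ellipsized_example():
    """
    >>> print("Dataset(num_rows=150, schema={target: int64})")
    Dataset(num_rows=..., schema=...)
    """

# ELLIPSIS makes each `...` in the expected output match any substring,
# so the exact row count and schema don't have to be pinned down.
results = doctest.testmod(optionflags=doctest.ELLIPSIS)
assert results.failed == 0
```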
@@ -1944,10 +1944,7 @@ def read_tfrecords(
     Examples:
         >>> import ray
         >>> ray.data.read_tfrecords("s3://anonymous@ray-example-data/iris.tfrecords")
-        Dataset(
-           num_rows=?,
-           schema={...}
-        )
+        Dataset(num_rows=?, schema=...)
 
         We can also read compressed TFRecord files, which use one of the
         `compression types supported by Arrow
 
     assert ds.schema() == Schema(pa.schema({"one": pa.int64(), "two": pa.string()}))
-    assert ds._plan.has_started_execution
+    assert not ds._plan.has_started_execution
 
     # Forces a data read.
     values = [[s["one"], s["two"]] for s in ds.take()]
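The flipped assertion at the end is the behavioral core of the change: inspecting
the schema should stay a metadata operation, and only consuming calls such as
`take()` start execution. A hedged repro sketch, assuming a local Ray installation
(again, `_plan.has_started_execution` is internal and may change):

```python
import ray

ds = ray.data.range(100)  # a read-only plan whose metadata is known up front

ds.schema()  # after this patch: resolved from metadata, no execution
assert not ds._plan.has_started_execution

ds.take(5)   # consuming rows forces the read
assert ds._plan.has_started_execution
```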