[Doc] Fix examples and migrate to sphinx doctest (iter/util) #1069

Open · wants to merge 6 commits into main
17 changes: 11 additions & 6 deletions CONTRIBUTING.md
@@ -92,17 +92,22 @@ When adding a new DataPipe, there are a few things that need to be done to ensure
   [test requirements](https://github.com/pytorch/data/issues/106) that we have.
 - One test that is commonly missed is the serialization test. Please add the new DataPipe to
   [`test_serialization.py`](https://github.com/pytorch/data/blob/main/test/test_serialization.py).
-- If your test requires interacting with files in the file system (e.g. opening a `csv` or `tar` file, we prefer
-  those files to be generated during the test (see `test_local_io.py`). If the file is on a remote server, see
-  `test_remote_io.py`.
+- If your test requires interacting with files in the file system (e.g. opening a `csv` or `tar` file), we prefer
+  those files to be generated during the test (see
+  [`test_local_io.py`](https://github.com/pytorch/data/blob/main/test/test_local_io.py)). If the file is on a remote
+  server, see [`test_remote_io.py`](https://github.com/pytorch/data/blob/main/test/test_remote_io.py).
 3. Documentation - ensure that the DataPipe has a docstring, usage example, and that it is added to the right category of
-   the right RST file to be rendered.
-   - If your DataPipe has a functional form (i.e. `@functional_datapipe(...)`), include at the
+   the right RST file (in [`docs/source`](https://github.com/pytorch/data/tree/main/docs/source)) to be rendered.
+   - If your DataPipe has a functional form (i.e. `@functional_datapipe(...)`), include it at the
     [end of the first sentence](https://github.com/pytorch/data/blob/main/torchdata/datapipes/iter/util/combining.py#L25)
     of your docstring. This will make sure it correctly shows up in the
     [summary table](https://pytorch.org/data/main/torchdata.datapipes.iter.html#archive-datapipes) of our
     documentation.
-4. Import - import the DataPipe in the correct `__init__.py` file.
+   - For usage examples we support both standard doctest-style interactive Python sessions and code-output-style blocks.
+     See [sphinx doctest](https://www.sphinx-doc.org/en/master/usage/extensions/doctest.html) for more information. To
+     build the documentation and validate that your example works, please refer to
+     [`docs`](https://github.com/pytorch/data/tree/main/docs).
+4. Import - import the DataPipe in the correct `__init__.py` file and add it to the `__all__` list.
 5. Interface - if the DataPipe has a functional form, make sure that it is generated properly by `gen_pyi.py` into the
    relevant interface file.
    - You can re-generate the pyi files by re-running `pip install -e .`, then you can examine the new outputs.
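Putting the documentation conventions above together, a new DataPipe's docstring might look like the following minimal sketch. The pipe `MyNoOpIterDataPipe` and its functional name `my_noop` are hypothetical, used only to illustrate the functional-form mention at the end of the first sentence and a code-output-style example:

```python
from torch.utils.data import functional_datapipe
from torchdata.datapipes.iter import IterDataPipe


@functional_datapipe("my_noop")  # hypothetical functional name, for illustration only
class MyNoOpIterDataPipe(IterDataPipe):
    r"""
    Yields every element of the source DataPipe unchanged (functional name: ``my_noop``).

    Args:
        source_datapipe: source DataPipe being iterated over

    Example:

        .. testcode::

            dp = IterableWrapper(range(3)).my_noop()
            print(list(dp))

        .. testoutput::

            [0, 1, 2]
    """

    def __init__(self, source_datapipe):
        self.source_datapipe = source_datapipe

    def __iter__(self):
        yield from self.source_datapipe
```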
13 changes: 12 additions & 1 deletion docs/README.md
@@ -8,12 +8,23 @@ pip install -r requirements.txt
```

You can then build the documentation by running `make <format>` from the `docs/` folder. Run `make` to get a list of all
-available output formats.
+available output formats. Run
+
+```bash
+make html
+```
+
+to build the documentation. The HTML files can then be found in `build/html`. To validate the code examples, use:
+
+```bash
+make doctest
+```
+
+Note that currently only code-output-style blocks are tested, as many standard reST doctest examples do not work at the
+moment. The results can then be found in `build/html/output.txt`. To also test interactive Python sessions, you can
+temporarily replace `doctest_test_doctest_blocks` in
+[`source/conf.py`](https://github.com/pytorch/data/blob/main/docs/source/conf.py) with a non-empty string.

## Improving the Documentation

Feel free to open an issue or pull request to inform us of any inaccuracy or potential improvement that we can make to
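For reference, the temporary change mentioned above might look like this in `source/conf.py` (Sphinx's own default group name is `"default"`, but any non-empty string works):

```python
# docs/source/conf.py

# Empty string: standard ">>>" doctest blocks are skipped by `make doctest`,
# and only explicit testcode/testoutput blocks run.
doctest_test_doctest_blocks = ""

# Temporary local change to also collect interactive ">>>" examples:
# doctest_test_doctest_blocks = "default"
```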
18 changes: 18 additions & 0 deletions docs/source/conf.py
@@ -61,6 +61,24 @@
# be successively migrated to sphinx's doctest directive.
doctest_test_doctest_blocks = ""

doctest_global_setup = """
import torch
from torchdata.datapipes.iter import IterableWrapper, FileLister, FileOpener

io_doctest = True

try:
import torcharrow.dtypes as dt
except ImportError:
dt = None

try:
import rarfile
rarfile.tool_setup()
except Exception:
rarfile = None
"""

# Add any paths that contain templates here, relative to this directory.
templates_path = ["_templates"]

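Names defined in `doctest_global_setup` are visible inside every `testcode`/`testoutput` block, so docstrings can guard examples that need optional dependencies or on-disk fixtures. A hypothetical guarded example (the `*.rar` pipeline below is illustrative only) might look like:

```rst
.. testcode::
    :skipif: rarfile is None

    # Runs only when the rarfile package and its extraction tool are available
    datapipe = FileLister(".", "*.rar").open_files(mode="b").load_from_rar()
    for _, stream in datapipe:
        print(stream.read())

.. testoutput::
    :skipif: io_doctest

    b'0123456789abcdef'
```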
19 changes: 13 additions & 6 deletions torchdata/datapipes/iter/util/bz2fileloader.py
@@ -33,13 +33,20 @@ class Bz2FileLoaderIterDataPipe(IterDataPipe[Tuple[str, BufferedIOBase]]):
or let Python's GC close them periodically.

Example:
>>> from torchdata.datapipes.iter import FileLister, FileOpener
>>> datapipe1 = FileLister(".", "*.bz2")
>>> datapipe2 = FileOpener(datapipe1, mode="b")
>>> bz2_loader_dp = datapipe2.load_from_bz2()
>>> for _, stream in bz2_loader_dp:
>>> print(stream.read())

.. testcode::

filenames_dp = FileLister(".", "*.bz2")
files_dp = filenames_dp.open_files(mode="b")
bz2_loader_dp = files_dp.load_from_bz2()
for _, stream in bz2_loader_dp:
print(stream.read())

.. testoutput::
:skipif: io_doctest

b'0123456789abcdef'

"""

def __init__(self, datapipe: Iterable[Tuple[str, BufferedIOBase]], length: int = -1) -> None:
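The expected output above presumes a matching `.bz2` fixture in the working directory; a minimal sketch for generating one (the filename `example.bz2` is arbitrary):

```python
import bz2

# Compress the exact payload the testoutput block above expects.
with bz2.open("example.bz2", "wb") as f:
    f.write(b"0123456789abcdef")
```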
26 changes: 17 additions & 9 deletions torchdata/datapipes/iter/util/combining.py
@@ -43,16 +43,24 @@ class IterKeyZipperIterDataPipe(IterDataPipe[T_co]):
by default a tuple is created

Example:
>>> from torchdata.datapipes.iter import IterableWrapper
>>> from operator import itemgetter
>>> def merge_fn(t1, t2):
>>> return t1[1] + t2[1]
>>> dp1 = IterableWrapper([('a', 100), ('b', 200), ('c', 300)])
>>> dp2 = IterableWrapper([('a', 1), ('b', 2), ('c', 3), ('d', 4)])
>>> res_dp = dp1.zip_with_iter(dp2, key_fn=itemgetter(0),
>>> ref_key_fn=itemgetter(0), keep_key=True, merge_fn=merge_fn)
>>> list(res_dp)

.. testcode::

from operator import itemgetter

def merge_fn(t1, t2):
return t1[1] + t2[1]

dp1 = IterableWrapper([('a', 100), ('b', 200), ('c', 300)])
dp2 = IterableWrapper([('a', 1), ('b', 2), ('c', 3), ('d', 4)])
res_dp = dp1.zip_with_iter(dp2, key_fn=itemgetter(0),
ref_key_fn=itemgetter(0), keep_key=True, merge_fn=merge_fn)
print(list(res_dp))

.. testoutput::

[('a', 101), ('b', 202), ('c', 303)]

"""

def __init__(
42 changes: 29 additions & 13 deletions torchdata/datapipes/iter/util/dataframemaker.py
@@ -54,19 +54,27 @@ class DataFrameMakerIterDataPipe(IterDataPipe): # IterDataPipe[torcharrow.IData
device: specify the device on which the DataFrame will be stored

Example:
>>> from torchdata.datapipes.iter import IterableWrapper
>>> import torcharrow.dtypes as dt
>>> source_data = [(i,) for i in range(3)]
>>> source_dp = IterableWrapper(source_data)
>>> DTYPE = dt.Struct([dt.Field("Values", dt.int32)])
>>> df_dp = source_dp.dataframe(dtype=DTYPE)
>>> list(df_dp)[0]

.. testcode::
:skipif: dt is None

import torcharrow.dtypes as dt
source_data = [(i,) for i in range(3)]
source_dp = IterableWrapper(source_data)
DTYPE = dt.Struct([dt.Field("Values", dt.int32)])
df_dp = source_dp.dataframe(dtype=DTYPE)
print(list(df_dp)[0])

.. testoutput::
:skipif: io_doctest

index Values
------- --------
0 0
1 1
2 2
dtype: Struct([Field('Values', int32)]), count: 3, null_count: 0

"""

def __new__(
@@ -105,18 +113,26 @@ class ParquetDFLoaderIterDataPipe(IterDataPipe): # IterDataPipe[torcharrow.IDat
device: specify the device on which the DataFrame will be stored

Example:
>>> from torchdata.datapipes.iter import FileLister
>>> import torcharrow.dtypes as dt
>>> DTYPE = dt.Struct([dt.Field("Values", dt.int32)])
>>> source_dp = FileLister(".", masks="df*.parquet")
>>> parquet_df_dp = source_dp.load_parquet_as_df(dtype=DTYPE)
>>> list(parquet_df_dp)[0]

.. testcode::
:skipif: dt is None

import torcharrow.dtypes as dt
DTYPE = dt.Struct([dt.Field("Values", dt.int32)])
source_dp = FileLister(".", masks="df*.parquet")
parquet_df_dp = source_dp.load_parquet_as_df(dtype=DTYPE)
print(list(parquet_df_dp)[0])

.. testoutput::
:skipif: io_doctest

index Values
------- --------
0 0
1 1
2 2
dtype: Struct([Field('Values', int32)]), count: 3, null_count: 0

"""

def __init__(
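The `df*.parquet` example above assumes a suitable file already exists; a minimal sketch for generating one with pyarrow (assuming pyarrow is installed; the filename `df_example.parquet` is arbitrary):

```python
import pyarrow as pa
import pyarrow.parquet as pq

# One int32 column named "Values" with rows 0, 1, 2, matching DTYPE above.
table = pa.table({"Values": pa.array([0, 1, 2], type=pa.int32())})
pq.write_table(table, "df_example.parquet")
```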
19 changes: 13 additions & 6 deletions torchdata/datapipes/iter/util/decompressor.py
@@ -41,13 +41,20 @@ class DecompressorIterDataPipe(IterDataPipe[Tuple[str, StreamWrapper]]):
file_type: Optional `string` or ``CompressionType`` that represents the compression format of the inputs

Example:
>>> from torchdata.datapipes.iter import FileLister, FileOpener
>>> tar_file_dp = FileLister(self.temp_dir.name, "*.tar")
>>> tar_load_dp = FileOpener(tar_file_dp, mode="b")
>>> tar_decompress_dp = Decompressor(tar_load_dp, file_type="tar")
>>> for _, stream in tar_decompress_dp:
>>> print(stream.read())

.. testcode::

tar_file_dp = FileLister(".", "*.tar")
tar_load_dp = tar_file_dp.open_files(mode="b")
tar_decompress_dp = tar_load_dp.decompress(file_type="tar")
for _, stream in tar_decompress_dp:
print(stream.read())

.. testoutput::
:skipif: io_doctest

b'0123456789abcdef'

"""

types = CompressionType
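As with the bz2 example, the output assumes a tar fixture exists; a minimal sketch for creating one (member and archive names are arbitrary):

```python
import io
import tarfile

# Archive a single member whose bytes match the expected testoutput.
payload = b"0123456789abcdef"
info = tarfile.TarInfo(name="data.txt")
info.size = len(payload)
with tarfile.open("example.tar", "w") as tar:
    tar.addfile(info, io.BytesIO(payload))
```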
29 changes: 16 additions & 13 deletions torchdata/datapipes/iter/util/hashchecker.py
@@ -33,19 +33,22 @@ class HashCheckerIterDataPipe(IterDataPipe[Tuple[str, U]]):
does not work with non-seekable stream, e.g. HTTP)

Example:
>>> from torchdata.datapipes.iter import IterableWrapper, FileOpener
>>> expected_MD5_hash = "bb9675028dd39d2dd2bf71002b93e66c"
File is from "https://raw.githubusercontent.com/pytorch/data/main/LICENSE"
>>> file_dp = FileOpener(IterableWrapper(["LICENSE.txt"]), mode='rb')
>>> # An exception is only raised when the hash doesn't match, otherwise (path, stream) is returned
>>> check_hash_dp = file_dp.check_hash({"LICENSE.txt": expected_MD5_hash}, "md5", rewind=True)
>>> reader_dp = check_hash_dp.readlines()
>>> it = iter(reader_dp)
>>> path, line = next(it)
>>> path
LICENSE.txt
>>> line
b'BSD 3-Clause License'

.. testcode::
:skipif: io_doctest

expected_MD5_hash = "bb9675028dd39d2dd2bf71002b93e66c"
# File is from "https://raw.githubusercontent.com/pytorch/data/main/LICENSE"
file_dp = FileOpener(IterableWrapper(["LICENSE.txt"]), mode='rb')
# An exception is only raised when the hash doesn't match, otherwise (path, stream) is returned
check_hash_dp = file_dp.check_hash({"LICENSE.txt": expected_MD5_hash}, "md5", rewind=True)
reader_dp = check_hash_dp.readlines()

it = iter(reader_dp)
path, line = next(it)
assert path == LICENSE.txt
assert line == b'BSD 3-Clause License'

"""

def __init__(
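If the pinned digest ever needs to be recomputed (e.g. after the LICENSE file changes), a quick sketch using the standard library:

```python
import hashlib

# Recompute the MD5 digest expected by check_hash() above.
with open("LICENSE.txt", "rb") as f:
    print(hashlib.md5(f.read()).hexdigest())
```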
26 changes: 15 additions & 11 deletions torchdata/datapipes/iter/util/header.py
@@ -78,17 +78,21 @@ class LengthSetterIterDataPipe(IterDataPipe[T_co]):
length: the integer value that will be set as the length

Example:
>>> from torchdata.datapipes.iter import IterableWrapper
>>> dp = IterableWrapper(range(10)).filter(lambda x: x < 5).set_length(3)
>>> list(dp) # Notice that the number of elements yielded is unchanged
[0, 1, 2, 3, 4]
>>> len(dp)
3
>>> header_dp = IterableWrapper(range(10)).filter(lambda x: x < 5).header(3)
>>> list(header_dp) # Use `.header()` if you want to limit the number of elements yielded
[0, 1, 2]
>>> len(header_dp)
3

.. testcode::

dp = IterableWrapper(range(10)).filter(lambda x: x < 5).set_length(3)
# Notice that the number of elements yielded is unchanged
assert list(dp) == [0, 1, 2, 3, 4]
assert len(dp) == 3

.. testcode::

header_dp = IterableWrapper(range(10)).filter(lambda x: x < 5).header(3)
# Use `.header()` if you want to limit the number of elements yielded
assert list(header_dp) == [0, 1, 2]
assert len(header_dp) == 3

"""

def __init__(self, source_datapipe: IterDataPipe[T_co], length: int) -> None:
31 changes: 22 additions & 9 deletions torchdata/datapipes/iter/util/jsonparser.py
@@ -21,16 +21,29 @@ class JsonParserIterDataPipe(IterDataPipe[Tuple[str, Dict]]):
kwargs: keyword arguments that will be passed through to ``json.loads``

Example:
>>> from torchdata.datapipes.iter import IterableWrapper, FileOpener
>>> import os
>>> def get_name(path_and_stream):
>>> return os.path.basename(path_and_stream[0]), path_and_stream[1]
>>> datapipe1 = IterableWrapper(["empty.json", "1.json", "2.json"])
>>> datapipe2 = FileOpener(datapipe1, mode="b")
>>> datapipe3 = datapipe2.map(get_name)
>>> json_dp = datapipe3.parse_json_files()
>>> list(json_dp)

.. testcode::
:skipif: io_doctest

# assume the files look like this:
# 1.json: '["foo", {"bar":["baz", null, 1.0, 2]}]'
# 2.json: '{"__complex__": true, "real": 1, "imag": 2}'

import os

def get_name(path_and_stream):
return os.path.basename(path_and_stream[0]), path_and_stream[1]

source_dp = IterableWrapper(["1.json", "2.json"])
datapipe2 = source_dp.open_files(mode="b")
datapipe3 = datapipe2.map(get_name)
json_dp = datapipe3.parse_json_files()
print(list(json_dp))

.. testoutput::

[('1.json', ['foo', {'bar': ['baz', None, 1.0, 2]}]), ('2.json', {'__complex__': True, 'real': 1, 'imag': 2})]

"""

def __init__(self, source_datapipe: IterDataPipe[Tuple[str, IO]], **kwargs) -> None:
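The commented fixture contents in the example can be materialized with a short sketch like the following, so the pipeline runs end to end:

```python
# Write the two JSON fixtures exactly as described in the docstring comments.
with open("1.json", "w") as f:
    f.write('["foo", {"bar":["baz", null, 1.0, 2]}]')
with open("2.json", "w") as f:
    f.write('{"__complex__": true, "real": 1, "imag": 2}')
```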
14 changes: 10 additions & 4 deletions torchdata/datapipes/iter/util/mux_longest.py
@@ -21,10 +21,16 @@ class MultiplexerLongestIterDataPipe(IterDataPipe):
datapipes: Iterable DataPipes that will take turns to yield their elements, until they are all exhausted

Example:
>>> from torchdata.datapipes.iter import IterableWrapper
>>> dp1, dp2, dp3 = IterableWrapper(range(5)), IterableWrapper(range(10, 15)), IterableWrapper(range(20, 25))
>>> list(dp1.mux_longest(dp2, dp3))
[0, 10, 20, 1, 11, 21, 2, 12, 22, 3, 13, 23, 4, 14, 24]

.. testcode::

dp1, dp2, dp3 = IterableWrapper(range(5)), IterableWrapper(range(10, 12)), IterableWrapper(range(20, 25))
print(list(dp1.mux_longest(dp2, dp3)))

.. testoutput::

[0, 10, 20, 1, 11, 21, 2, 22, 3, 23, 4, 24]

"""

def __init__(self, *datapipes):