Commit aa0f087

shoyer authored and Xarray-Beam authors committed
Add Dataset.from_ptransform
This is a variant of the Dataset constructor with extensive validation. Also
add documentation explaining how it works.

PiperOrigin-RevId: 814972825
1 parent e60b6ad · commit aa0f087

File tree

6 files changed: +541 -38 lines


docs/api.md

Lines changed: 2 additions & 1 deletion
@@ -92,9 +92,10 @@ guarantees.
    :toctree: _autosummary

    Dataset
-   Dataset.from_xarray
    Dataset.from_zarr
    Dataset.to_zarr
+   Dataset.from_xarray
+   Dataset.from_ptransform
    Dataset.collect_with_direct_runner
    Dataset.map_blocks
    Dataset.rechunk

docs/high-level.ipynb

Lines changed: 63 additions & 16 deletions
@@ -63,7 +63,7 @@
     "xbeam_ds"
    ],
    "outputs": [],
-   "execution_count": 2
+   "execution_count": 1
   },
   {
    "metadata": {
@@ -83,7 +83,7 @@
     "xarray_ds.chunk(chunks).to_zarr('example_data.zarr', mode='w')"
    ],
    "outputs": [],
-   "execution_count": 3
+   "execution_count": 2
   },
   {
    "metadata": {
@@ -186,7 +186,7 @@
     "xarray.open_zarr('example_climatology.zarr')"
    ],
    "outputs": [],
-   "execution_count": 6
+   "execution_count": 3
   },
   {
    "metadata": {
@@ -215,7 +215,7 @@
     "xarray.open_zarr('example_regrid.zarr')"
    ],
    "outputs": [],
-   "execution_count": 7
+   "execution_count": 4
   },
   {
    "metadata": {
@@ -245,15 +245,15 @@
     "  print(f'{type(e).__name__}: {e}')"
    ],
    "outputs": [],
-   "execution_count": 8
+   "execution_count": 5
   },
   {
    "metadata": {
     "id": "vCjZK9fmEeEq"
    },
    "cell_type": "markdown",
    "source": [
-    "You can avoid these errors by explicitly [creating a template](creating_templates):"
+    "You can avoid these errors by explicitly supplying a template, either from {py:attr}`Dataset.template \u003cxarray_beam.Dataset.template\u003e` or produced by {py:func}`~xarray_beam.make_template`:"
    ]
   },
   {
@@ -262,19 +262,70 @@
    },
    "cell_type": "code",
    "source": [
-    "ds_beam = xbeam.Dataset.from_zarr('example_data.zarr')\n",
-    "ds_beam.map_blocks(lambda ds: ds.compute(), template=ds_beam.template)"
+    "template = xbeam.make_template(xarray_ds)\n",
+    "(\n",
+    "    xbeam.Dataset.from_zarr('example_data.zarr')\n",
+    "    .map_blocks(lambda ds: ds.compute(), template=template)\n",
+    ")"
    ],
    "outputs": [],
-   "execution_count": 9
+   "execution_count": 6
+  },
+  {
+   "metadata": {
+    "id": "-U4t0kKIkDvb"
+   },
+   "cell_type": "markdown",
+   "source": [
+    "## Interfacing with low-level transforms"
+   ]
   },
   {
    "metadata": {
     "id": "75IG-22cKcuE"
    },
    "cell_type": "markdown",
    "source": [
-    "Sometimes, your computation doesn't fit into the ``map_blocks`` paradigm because you don't want to create `xarray.Dataset` objects. For these cases, you can switch to the lower-level Xarray-Beam [data model](data-model), and use raw Beam operations:"
+    "`Dataset` is a thin wrapper around Xarray-Beam transformations, so you can always drop into the lower-level Xarray-Beam [data model](data-model) and use raw Beam operations. This is especially useful for reading or writing data.\n",
+    "\n",
+    "To manually create a `Dataset` from a Beam PTransform, use {py:meth}`~xarray_beam.Dataset.from_ptransform`. Here's an example, showing the common pattern of evaluating a single example in-memory to produce the `xarray.Dataset` required for building a template:"
+   ]
+  },
+  {
+   "metadata": {
+    "id": "l9pHS1QDlMd-"
+   },
+   "cell_type": "code",
+   "source": [
+    "all_times = pd.date_range('2025-01-01', freq='1D', periods=365)\n",
+    "source_dataset = xarray.open_zarr('example_data.zarr', chunks=None)\n",
+    "\n",
+    "def load_chunk(time: pd.Timestamp) -\u003e tuple[xbeam.Key, xarray.Dataset]:\n",
+    "  key = xbeam.Key({'time': (time - all_times[0]).days})\n",
+    "  dataset = source_dataset.sel(time=[time])\n",
+    "  return key, dataset\n",
+    "\n",
+    "ptransform = beam.Create(all_times) | beam.Map(load_chunk)\n",
+    "\n",
+    "_, example = load_chunk(all_times[0])\n",
+    "template = xbeam.make_template(example)\n",
+    "template = xbeam.replace_template_dims(template, time=all_times)\n",
+    "\n",
+    "ds_beam = xbeam.Dataset.from_ptransform(\n",
+    "    ptransform, template=template, chunks={'time': 1}, split_vars=False\n",
+    ")\n",
+    "ds_beam"
+   ],
+   "outputs": [],
+   "execution_count": 12
+  },
+  {
+   "metadata": {
+    "id": "1qjeY5mwlLGJ"
+   },
+   "cell_type": "markdown",
+   "source": [
+    "You can also pull out the underlying Beam `ptransform` from a dataset to append new transformations, e.g., to write each element of the pipeline to disk as a separate file:"
    ]
   },
   {
@@ -288,16 +339,12 @@
     "  chunk.to_netcdf(path)\n",
     "\n",
     "with beam.Pipeline() as p:\n",
-    "  p | (\n",
-    "      xbeam.Dataset.from_zarr('example_data.zarr')\n",
-    "      .rechunk({'latitude': -1, 'longitude': -1})\n",
-    "      .ptransform\n",
-    "  ) | beam.MapTuple(to_netcdf)\n",
+    "  p | ds_beam.rechunk('50MB').ptransform | beam.MapTuple(to_netcdf)\n",
     "\n",
     "%ls *.nc"
    ],
    "outputs": [],
-   "execution_count": 10
+   "execution_count": 13
   }
  ],
  "metadata": {

xarray_beam/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -55,4 +55,4 @@
     DatasetToZarr as DatasetToZarr,
 )

-__version__ = '0.10.3'  # automatically synchronized to pyproject.toml
+__version__ = '0.10.4'  # automatically synchronized to pyproject.toml

xarray_beam/_src/core.py

Lines changed: 16 additions & 11 deletions
@@ -505,24 +505,29 @@ def expand(self, pcoll):
     )


+def _ensure_chunk_is_computed(key: Key, dataset: xarray.Dataset) -> None:
+  """Ensure that a dataset contains no chunked variables."""
+  for var_name, variable in dataset.variables.items():
+    if variable.chunks is not None:
+      raise ValueError(
+          f"Dataset variable {var_name!r} corresponding to key {key} is"
+          " chunked with Dask. Datasets passed to validate_chunk must be"
+          f" fully computed (not chunked): {dataset}\nThis typically arises"
+          " with datasets originating with `xarray.open_zarr()`, which by"
+          " default use Dask. If this is the case, you can fix it by passing"
+          " `chunks=None` or xarray_beam.open_zarr(). Alternatively, you"
+          " can load datasets explicitly into memory with `.compute()`."
+      )
+
+
 def validate_chunk(key: Key, datasets: DatasetOrDatasets) -> None:
   """Verify that a key and dataset(s) are valid for xarray-beam transforms."""
   if isinstance(datasets, xarray.Dataset):
     datasets: list[xarray.Dataset] = [datasets]

   for dataset in datasets:
     # Verify that no variables are chunked with Dask
-    for var_name, variable in dataset.variables.items():
-      if variable.chunks is not None:
-        raise ValueError(
-            f"Dataset variable {var_name!r} corresponding to key {key} is"
-            " chunked with Dask. Datasets passed to validate_chunk must be"
-            f" fully computed (not chunked): {dataset}\nThis typically arises"
-            " with datasets originating with `xarray.open_zarr()`, which by"
-            " default use Dask. If this is the case, you can fix it by passing"
-            " `chunks=None` or xarray_beam.open_zarr(). Alternatively, you"
-            " can load datasets explicitly into memory with `.compute()`."
-        )
+    _ensure_chunk_is_computed(key, dataset)

   # Validate key offsets
   missing_keys = [
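
For context, a minimal sketch of what this refactored check enforces, assuming Dask is installed and using the private import path `xarray_beam._src.core` shown in this diff (not a public API):

# Sketch: validate_chunk rejects Dask-backed variables. The import path
# below is internal to xarray-beam; it is used here for illustration only.
import numpy as np
import xarray
from xarray_beam import Key
from xarray_beam._src.core import validate_chunk

chunked = xarray.Dataset({'foo': ('x', np.arange(10))}).chunk({'x': 5})
try:
  validate_chunk(Key({'x': 0}), chunked)  # 'foo' is chunked with Dask
except ValueError as e:
  print(type(e).__name__)  # ValueError

validate_chunk(Key({'x': 0}), chunked.compute())  # fully computed: passes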

xarray_beam/_src/dataset.py

Lines changed: 151 additions & 6 deletions
@@ -227,6 +227,98 @@ def _infer_new_chunks(
   return new_chunks


+def _normalize_and_validate_chunk(
+    template: xarray.Dataset,
+    chunks: Mapping[str, int],
+    split_vars: bool,
+    key: core.Key,
+    dataset: xarray.Dataset,
+) -> tuple[core.Key, xarray.Dataset]:
+  """Validate and normalize (key, dataset) pairs for a Dataset."""
+
+  if split_vars:
+    if key.vars is None:
+      key = key.replace(vars=set(dataset.keys()))
+    elif key.vars != set(dataset.keys()):
+      raise ValueError(
+          f'dataset keys {sorted(dataset.keys())} do not match'
+          f' key.vars={sorted(key.vars)}'
+      )
+  elif key.vars is not None:
+    raise ValueError(f'must not set vars on key if split_vars=False: {key}')
+
+  new_offsets = dict(key.offsets)
+  for dim in dataset.dims:
+    if dim not in new_offsets:
+      new_offsets[dim] = 0
+  if len(new_offsets) != len(key.offsets):
+    key = key.replace(offsets=new_offsets)
+
+  core._ensure_chunk_is_computed(key, dataset)
+
+  def _with_dataset(msg: str):
+    dataset_repr = textwrap.indent(repr(dataset), prefix=' ')
+    return f'{msg}\nKey: {key}\nDataset chunk:\n{dataset_repr}'
+
+  def _bad_template_error(msg: str):
+    template_repr = textwrap.indent(repr(template), prefix=' ')
+    raise ValueError(_with_dataset(msg) + f'Template:\n{template_repr}')
+
+  for k, v in dataset.items():
+    if k not in template:
+      _bad_template_error(
+          f'Chunk variable {k!r} not found in template variables '
+          f' {list(template.data_vars)}:'
+      )
+    if v.dtype != template[k].dtype:
+      _bad_template_error(
+          f'Chunk variable {k!r} has dtype {v.dtype} which does not match'
+          f' template variable dtype {template[k].dtype}:'
+      )
+    if v.dims != template[k].dims:
+      _bad_template_error(
+          f'Chunk variable {k!r} has dims {v.dims} which does not match'
+          f' template variable dims {template[k].dims}:'
+      )
+
+  for dim, size in dataset.sizes.items():
+    if dim not in chunks:
+      raise ValueError(
+          _with_dataset(
+              f'Dataset dimension {dim!r} not found in chunks {chunks}:'
+          )
+      )
+    offset = key.offsets[dim]
+    if offset % chunks[dim] != 0:
+      raise ValueError(
+          _with_dataset(
+              f'Chunk offset {offset} is not aligned with chunk '
+              f'size {chunks[dim]} for dimension {dim!r}:'
+          )
+      )
+    if offset + size > template.sizes[dim]:
+      _bad_template_error(
+          f'Chunk dimension {dim!r} has size {size} which is larger than the '
+          f'remaining size {template.sizes[dim] - offset} in the '
+          'template:'
+      )
+    is_last_chunk = offset + chunks[dim] > template.sizes[dim]
+    if is_last_chunk:
+      expected_size = template.sizes[dim] - offset
+      if size != expected_size:
+        _bad_template_error(
+            f'Chunk dimension {dim!r} is the last chunk, but has size {size} '
+            f'which does not match expected size {expected_size}:'
+        )
+    elif size != chunks[dim]:
+      _bad_template_error(
+          f'Chunk dimension {dim!r} has size {size} which does not match'
+          f' chunk size {chunks[dim]}:'
+      )
+
+  return key, dataset
+
+
 def _apply_to_each_chunk(
     func: Callable[[xarray.Dataset], xarray.Dataset],
     old_chunks: Mapping[str, int],
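
The offset and size rules enforced above can be made concrete with a small arithmetic sketch; the dimension and chunk sizes below are made-up values, not taken from this diff:

# Sketch of the chunk-alignment rules: every offset must be an exact
# multiple of the chunk size, and only the final chunk may be smaller.
dim_size, chunk_size = 10, 4  # hypothetical: valid offsets are 0, 4, 8
for offset in range(0, dim_size, chunk_size):
  assert offset % chunk_size == 0
  is_last_chunk = offset + chunk_size > dim_size
  expected_size = dim_size - offset if is_last_chunk else chunk_size
  print(f'offset={offset}: expected chunk size {expected_size}')
# offset=0 and offset=4 expect size 4; offset=8 expects the remainder, 2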
@@ -302,9 +394,8 @@ def __init__(
   ):
     """Low level interface for creating a new Dataset, without validation.

-    Most users should use the higher level
-    :py:class:`xarray_beam.Dataset.from_xarray` or
-    :py:class:`xarray_beam.Dataset.from_zarr` instead.
+    Unless you're really sure you don't need validation, prefer using
+    :py:class:`xarray_beam.Dataset.from_ptransform`.

     Args:
       template: xarray.Dataset describing the structure of this dataset,
@@ -317,9 +408,7 @@ def __init__(
       this dataset's data.
     """
     self._template = template
-    self._chunks = {
-        k: min(template.sizes[k], v) for k, v in chunks.items()
-    }
+    self._chunks = {k: min(template.sizes[k], v) for k, v in chunks.items()}
     self._split_vars = split_vars
     self._ptransform = ptransform

@@ -390,6 +479,62 @@ def __repr__(self):
         + textwrap.indent('\n'.join(base.split('\n')[1:]), ' ' * 4)
     )

+  @classmethod
+  def from_ptransform(
+      cls,
+      ptransform: beam.PTransform,
+      *,
+      template: xarray.Dataset,
+      chunks: Mapping[str | types.EllipsisType, int],
+      split_vars: bool = False,
+  ) -> Dataset:
+    """Create an xarray_beam.Dataset from a Beam PTransform.
+
+    This is an advanced constructor that allows you to create an
+    ``xarray_beam.Dataset`` from an existing Beam PTransform that produces
+    ``(Key, xarray.Dataset)`` pairs.
+
+    The PTransform should produce chunks that conform to the given
+    ``template``, ``chunks``, and ``split_vars`` arguments. This constructor
+    will add a validation step to the PTransform to normalize keys into the
+    strictest possible form based on the other arguments, and ensure that
+    transform outputs are valid.
+
+    Args:
+      ptransform: A Beam PTransform that yields ``(Key, xarray.Dataset)``
+        pairs. You only need to set ``offsets`` on these keys; ``vars`` will
+        be set automatically based on the dataset if ``split_vars`` is True.
+      template: An ``xarray.Dataset`` object representing the schema
+        (coordinates, dimensions, data variables, and attributes) of the full
+        dataset, as produced by :py:func:`xarray_beam.make_template`, with
+        data variables backed by Dask arrays.
+      chunks: A dictionary mapping dimension names to integer chunk sizes.
+        Every chunk produced by ``ptransform`` must have dimensions of these
+        sizes, except for the last chunk in each dimension, which may be
+        smaller.
+      split_vars: A boolean indicating whether the chunks in ``ptransform``
+        are split across variables, or if each chunk contains all variables.
+
+    Returns:
+      An ``xarray_beam.Dataset`` instance wrapping the PTransform.
+    """
+    if not isinstance(chunks, Mapping):
+      raise TypeError(
+          f'chunks must be a mapping for from_ptransform, got {chunks}'
+      )
+    for v in chunks.values():
+      if not isinstance(v, int):
+        raise TypeError(
+            'chunks must be a mapping with integer values for from_ptransform,'
+            f' got {chunks}'
+        )
+    chunks = normalize_chunks(chunks, template)
+    ptransform = ptransform | _get_label("validate") >> beam.MapTuple(
+        functools.partial(
+            _normalize_and_validate_chunk, template, chunks, split_vars
+        )
+    )
+    return cls(template, chunks, split_vars, ptransform)
+
   @classmethod
   def from_xarray(
       cls,
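
For reference, a minimal end-to-end sketch of the new constructor, mirroring the docstring above; the variable name, dimension name, and sizes are illustrative, not taken from this commit:

# Sketch: building an xarray_beam.Dataset from a hand-rolled PTransform.
# Names ('foo', 'x') and sizes are hypothetical.
import apache_beam as beam
import numpy as np
import xarray
import xarray_beam as xbeam

size, chunk = 6, 2
template = xbeam.make_template(xarray.Dataset({'foo': ('x', np.zeros(size))}))

def make_chunk(offset: int) -> tuple[xbeam.Key, xarray.Dataset]:
  # Each chunk covers [offset, offset + chunk) along dimension 'x'.
  data = np.arange(offset, offset + chunk, dtype=float)
  return xbeam.Key({'x': offset}), xarray.Dataset({'foo': ('x', data)})

ptransform = beam.Create(range(0, size, chunk)) | beam.Map(make_chunk)
ds = xbeam.Dataset.from_ptransform(
    ptransform, template=template, chunks={'x': chunk}, split_vars=False
)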
