22 commits
4428058
Add support for distributed cholla datasets.
mabruzzo Oct 10, 2023
5b51057
Update yt/frontends/cholla/data_structures.py
mabruzzo Apr 22, 2025
ae37959
Merge branch 'main' into cholla-frontend-improvements
mabruzzo Apr 22, 2025
448b483
improve the docstring for _split_fname_procid_suffix
mabruzzo Apr 22, 2025
a51fe81
Merge branch 'main' into cholla-frontend-improvements
mabruzzo Apr 23, 2025
d5fa078
add support for new cholla concatenation format
mabruzzo Apr 29, 2025
b21a792
Merge branch 'main' into cholla-frontend-newformat
mabruzzo May 21, 2025
250dbd9
try to satisfy mypy
mabruzzo May 22, 2025
f5e92f0
fix the cholla frontend
mabruzzo Jun 26, 2025
a26cbff
finally addressing the mypy type-checking issues
mabruzzo Jun 26, 2025
be25241
Update yt/frontends/cholla/data_structures.py
mabruzzo Jun 26, 2025
3b593d1
address PR comments
mabruzzo Jun 27, 2025
4d29f05
Update yt/frontends/cholla/io.py
mabruzzo Jun 27, 2025
e87e136
fix a minor bug
mabruzzo Jun 27, 2025
640f729
Merge branch 'main' into cholla-frontend-newformat
mabruzzo Aug 26, 2025
bbf529f
Light refactoring
mabruzzo Aug 26, 2025
b9077fb
Merge branch 'main' into cholla-frontend-newformat
mabruzzo Oct 31, 2025
b9351cd
Introduced new tests for the Cholla frontend
mabruzzo Oct 31, 2025
f5f6160
Slightly reorganize Cholla-frontend
mabruzzo Oct 31, 2025
e92b626
satisfy type-checker
mabruzzo Oct 31, 2025
5f1d20d
satisfy type-check v2
mabruzzo Oct 31, 2025
3246d78
Rename test_load.py and exclude it from nose
mabruzzo Oct 31, 2025
1 change: 1 addition & 0 deletions nose_ignores.txt
@@ -52,3 +52,4 @@
--ignore-file=test_cf_radial_pytest\.py
--ignore-file=test_data_containers\.py
--ignore-file=test_fields_pytest\.py
--ignore-file=test_cholla_load\.py
62 changes: 45 additions & 17 deletions yt/frontends/cholla/data_structures.py
@@ -9,16 +9,17 @@
from yt.geometry.api import Geometry
from yt.geometry.grid_geometry_handler import GridIndex
from yt.utilities.logger import ytLogger as mylog
from yt.utilities.on_demand_imports import _h5py as h5py
from yt.utilities.on_demand_imports import _h5py

from .fields import ChollaFieldInfo
from .misc import _determine_data_layout


class ChollaGrid(AMRGridPatch):
_id_offset = 0

def __init__(self, id, index, level, dims):
super().__init__(id, filename=index.index_filename, index=index)
def __init__(self, id, index, level, dims, filename):
super().__init__(id, filename=filename, index=index)
self.Parent = None
self.Children = []
self.Level = level
@@ -27,6 +28,7 @@ def __init__(self, id, index, level, dims):

class ChollaHierarchy(GridIndex):
grid = ChollaGrid
_grid_chunksize = 1

def __init__(self, ds, dataset_type="cholla"):
self.dataset_type = dataset_type
@@ -39,27 +41,53 @@ def __init__(self, ds, dataset_type="cholla"):
super().__init__(ds, dataset_type)

def _detect_output_fields(self):
with h5py.File(self.index_filename, mode="r") as h5f:
self.field_list = [("cholla", k) for k in h5f.keys()]
with _h5py.File(self.index_filename, mode="r") as h5f:
grp = h5f.get("field", h5f)
self.field_list = [("cholla", k) for k in grp.keys()]

def _count_grids(self):
self.num_grids = 1
with _h5py.File(self.index_filename, "r") as f:
self._blockid_location_arr, self._block_mapping = _determine_data_layout(f)
self.num_grids = self._blockid_location_arr.size

def _parse_index(self):
self.grid_left_edge[0][:] = self.ds.domain_left_edge[:]
self.grid_right_edge[0][:] = self.ds.domain_right_edge[:]
self.grid_dimensions[0][:] = self.ds.domain_dimensions[:]
self.grid_particle_count[0][0] = 0
self.grid_levels[0][0] = 0
self.grids = np.empty(self.num_grids, dtype="object")

shape_arr = np.array(self._blockid_location_arr.shape)
dims_local = (self.ds.domain_dimensions[:] / shape_arr).astype("=i8")

for idx3D, blockid in np.ndenumerate(self._blockid_location_arr):
idx3D_arr = np.array(idx3D)
left_frac = idx3D_arr / shape_arr
right_frac = (1 + idx3D_arr) / shape_arr

level = 0

self.grids[blockid] = self.grid(
blockid,
index=self,
level=level,
dims=dims_local,
filename=self._block_mapping.fname_template.format(blockid=blockid),
)

self.grid_left_edge[blockid, :] = left_frac
self.grid_right_edge[blockid, :] = right_frac
self.grid_dimensions[blockid, :] = dims_local
self.grid_levels[blockid, 0] = level
self.grid_particle_count[blockid, 0] = 0

slope = self.ds.domain_width / self.ds.arr(np.ones(3), "code_length")
self.grid_left_edge = self.grid_left_edge * slope + self.ds.domain_left_edge
self.grid_right_edge = self.grid_right_edge * slope + self.ds.domain_left_edge

self.max_level = 0

def _populate_grid_objects(self):
self.grids = np.empty(self.num_grids, dtype="object")
for i in range(self.num_grids):
g = self.grid(i, self, self.grid_levels.flat[i], self.grid_dimensions[i])
g = self.grids[i]
g._prepare_grid()
g._setup_dx()
self.grids[i] = g


class ChollaDataset(Dataset):
@@ -100,15 +128,15 @@ def _set_code_unit_attributes(self):
setdefaultattr(self, key, self.quan(1, unit))

def _parse_parameter_file(self):
with h5py.File(self.parameter_filename, mode="r") as h5f:
with _h5py.File(self.parameter_filename, mode="r") as h5f:
attrs = h5f.attrs
self.parameters = dict(attrs.items())
self.domain_left_edge = attrs["bounds"][:].astype("=f8")
self.domain_right_edge = self.domain_left_edge + attrs["domain"][:].astype(
"=f8"
)
self.dimensionality = len(attrs["dims"][:])
self.domain_dimensions = attrs["dims"][:].astype("=f8")
self.domain_dimensions = attrs["dims"][:].astype("=i8")
self.current_time = attrs["t"][:]
self._periodicity = tuple(attrs.get("periodicity", (False, False, False)))
self.gamma = attrs.get("gamma", 5.0 / 3.0)
Expand Down Expand Up @@ -167,7 +195,7 @@ def _is_valid(cls, filename: str, *args, **kwargs) -> bool:
return False

try:
fileh = h5py.File(filename, mode="r")
fileh = _h5py.File(filename, mode="r")
except OSError:
return False

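
To make the edge arithmetic in `_parse_index` concrete, here is a minimal standalone sketch using hypothetical values for the domain and block layout (in the frontend, the real values come from the dataset attributes and from `_determine_data_layout`):

import numpy as np

# hypothetical layout: 4 blocks arranged as (2, 2, 1) covering a 64^3 domain
blockid_location_arr = np.array([[[0], [1]], [[2], [3]]])
domain_dimensions = np.array([64, 64, 64])
shape_arr = np.array(blockid_location_arr.shape)

# every block covers the same number of cells along each axis
dims_local = (domain_dimensions / shape_arr).astype("=i8")  # -> [32, 32, 64]

for idx3D, blockid in np.ndenumerate(blockid_location_arr):
    idx3D_arr = np.array(idx3D)
    left_frac = idx3D_arr / shape_arr         # fractional left edge, in [0, 1)
    right_frac = (1 + idx3D_arr) / shape_arr  # fractional right edge, in (0, 1]
    # _parse_index then rescales these fractions by the domain width and
    # shifts by the domain's left edge to get code-length coordinates
    print(blockid, left_frac, right_frac, dims_local)
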
37 changes: 21 additions & 16 deletions yt/frontends/cholla/io.py
@@ -1,7 +1,6 @@
import numpy as np

from yt.utilities.io_handler import BaseIOHandler
from yt.utilities.on_demand_imports import _h5py as h5py

from .misc import _CachedH5Openner


class ChollaIOHandler(BaseIOHandler):
@@ -14,22 +13,28 @@ def _read_particle_coords(self, chunks, ptf):
def _read_particle_fields(self, chunks, ptf, selector):
raise NotImplementedError

def _read_fluid_selection(self, chunks, selector, fields, size):
data = {}
for field in fields:
data[field] = np.empty(size, dtype="float64")

with h5py.File(self.ds.parameter_filename, "r") as fh:
ind = 0
def io_iter(self, chunks, fields):
# this is loosely inspired by the implementations used for Enzo/Enzo-E.
# Those implementations use the lower-level hdf5 interface; it's unclear
# whether that affords any advantages...
mapper = self.ds.index._block_mapping
with _CachedH5Openner(mode="r") as h5_context_manager:
for chunk in chunks:
for grid in chunk.objs:
nd = 0
for obj in chunk.objs:
if obj.filename is None: # unclear when this case arises...
continue

# ensure the file containing data for obj is open
fh = h5_context_manager.open_fh(obj.filename)

# access the HDF5 group containing the datasets of field values
grp = fh[mapper.h5_group]
# get the indices in a generic dataset that correspond to obj.id
idx = mapper.idx_map[obj.id]

for field in fields:
ftype, fname = field
values = fh[fname][:].astype("=f8")
nd = grid.select(selector, values, data[field], ind)
ind += nd
return data
yield field, obj, grp[fname][idx].astype("=f8")

def _read_chunk_data(self, chunk, fields):
raise NotImplementedError
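
For context, the switch from `_read_fluid_selection` to `io_iter` moves the selection loop into the base class: `io_iter` just yields `(field, obj, values)` triples. Roughly, the consumer does something like the following (a simplified sketch of the pattern, not the exact yt internals):

import numpy as np

def _read_fluid_selection_sketch(io_handler, chunks, selector, fields, size):
    # allocate one flat output buffer per field
    rv = {field: np.empty(size, dtype="float64") for field in fields}
    ind = {field: 0 for field in fields}
    for field, obj, values in io_handler.io_iter(chunks, fields):
        # obj.select copies the cells chosen by selector into rv[field],
        # starting at offset ind[field], and returns the number copied
        ind[field] += obj.select(selector, values, rv[field], ind[field])
    return rv
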
214 changes: 214 additions & 0 deletions yt/frontends/cholla/misc.py
@@ -0,0 +1,214 @@
import os
import typing
from collections import defaultdict
from collections.abc import Mapping
from dataclasses import dataclass

import numpy as np

# this is a hacky workaround to get _h5py.File to work in annotations. We can probably
# address this issue more robustly by directly modifying yt.utilities.on_demand_imports
if typing.TYPE_CHECKING:
import h5py as _h5py
else:
from yt.utilities.on_demand_imports import _h5py


class _CachedH5Openner:
"""
A simple context manager that helps implement the idiom where data is read
from (or written to) one or more HDF5 files and we want to wait to close the
previous HDF5 file until it is time to open a new one. This lets us avoid
overhead in cases where we would close and then immediately reopen the
same file.

By using a context manager, we're able to properly clean up in the event
that an exception occurs.
"""

def __init__(self, mode="r"):
self._filename = None
self._fh = None
self._mode = mode

def open_fh(self, filename):
if self._filename == filename:
return self._fh
if self._fh is not None:
self._fh.close()
self._fh = _h5py.File(filename, self._mode)
self._filename = filename
return self._fh

def __enter__(self):
return self

def __exit__(self, exc, value, tb):
if self._fh is not None:
self._fh.close()
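
A minimal usage sketch (with hypothetical file names); consecutive requests for the same path reuse one open handle:

filenames = ["snap.h5.0", "snap.h5.0", "snap.h5.1"]  # hypothetical files
with _CachedH5Openner(mode="r") as opener:
    for fname in filenames:
        fh = opener.open_fh(fname)  # only reopens when fname changes
        print(fname, list(fh.keys()))
# the last open handle is closed on exit, even if an exception was raised
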


@dataclass(kw_only=True, slots=True, frozen=True)
class _BlockDiskMapping:
"""Contains info for mapping blockids to locations in hdf5 files

Notes
-----
At the time of writing, this is primarily meant to provide a mapping for
field data. In the future, we may initialize a separate instance to
provide a mapping for particle data
"""

# ``fname_template.format(blockid=...)`` gives the name of the file holding
# blockid (this properly handles cases where all blocks are stored in one file)
fname_template: str
# hdf5 group containing the field data
h5_group: str
# maps blockid to an index that selects all associated data from a field-dataset
idx_map: Mapping[int, tuple[int | slice, ...]]
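
Taken together, the three attributes are enough to locate any block's data. A sketch with hypothetical values, for a concatenated file that stacks 2 blocks along the leading axis of each dataset in a "field" group:

mapping = _BlockDiskMapping(
    fname_template="snap.h5",  # no {blockid}: every block lives in one file
    h5_group="field",
    idx_map={0: (0, slice(None), slice(None), slice(None)),
             1: (1, slice(None), slice(None), slice(None))},
)

blockid = 1
with _h5py.File(mapping.fname_template.format(blockid=blockid), "r") as f:
    # "density" is just an illustrative field name
    values = f[mapping.h5_group]["density"][mapping.idx_map[blockid]]
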


def _infer_blockid_location_arr(fname_template, global_dims, arr_shape):
# used when hdf5 files don't have an explicit "domain" group
blockid_location_arr = np.empty(shape=tuple(int(e) for e in arr_shape), dtype="i8")
if blockid_location_arr.size == 1:
# primarily intended to handle the result of older concatenation scripts (it
# also handles the case when only a single block is used, which is okay)
blockid_location_arr[0, 0, 0] = 0
else: # handle distributed cholla datasets
local_dims, rem = np.divmod(global_dims, blockid_location_arr.shape)
assert np.all(rem == 0) and np.all(local_dims > 0)
for blockid in range(0, blockid_location_arr.size):
with _h5py.File(fname_template.format(blockid=blockid), "r") as f:
tmp, rem = np.divmod(f.attrs["offset"][:], local_dims)
assert np.all(rem == 0) # sanity check
idx3D = tuple(int(e) for e in tmp)
blockid_location_arr[idx3D] = blockid
return blockid_location_arr
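
The offset arithmetic above is plain integer division; with hypothetical values:

# hypothetical: a 64^3 domain split across a (2, 1, 1) block layout
global_dims = np.array([64, 64, 64])
local_dims, rem = np.divmod(global_dims, (2, 1, 1))  # -> [32, 64, 64]

# a file whose "offset" attribute is (32, 0, 0) holds the block at (1, 0, 0)
idx3D, rem = np.divmod(np.array([32, 0, 0]), local_dims)
assert tuple(idx3D) == (1, 0, 0)
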


def _determine_data_layout(f: _h5py.File) -> tuple[np.ndarray, _BlockDiskMapping]:
"""Determine the data layout of the snapshot

The premise is that differences between the underlying data formats
shouldn't matter outside of this function."""
filename = f.filename

# STEP 1: infer the template for all Cholla data-files by inspecting filename
# ===========================================================================
# There are 2 conventions for the names of Cholla's data-files:
# 1. "root.h5.{blockid}" is the standard format Cholla uses when writing files
# storing a single snapshot. Each MPI rank writes a separate file and replaces
# ``{blockid}`` with its MPI rank (modern Cholla versions without MPI replace
# ``{blockid}`` with ``0``)
# 2. "root.h5" is the standard format used by Cholla's concatenation scripts
# (older versions of Cholla without MPI also used this format to name outputs)
inferred_fname_template, cur_filename_suffix = _infer_fname_template(filename)

# STEP 2: Check whether the hdf5 file has a flat structure
# ========================================================
# Historically, datasets were always stored directly in the root group of the
# data file. More recent concatenation scripts instead store datasets inside
# HDF5 groups (e.g. a "field" group).
flat_structure = any(not isinstance(elem, _h5py.Group) for elem in f.values())

# STEP 3: Extract basic domain information from the file(s)
# ==============================================================
has_explicit_domain_info = "domain" in f
if has_explicit_domain_info:
# this branch primarily handles concatenated files made with newer logic
blockid_location_arr = f["domain/blockid_location_arr"][...]
field_idx_map = {
int(blockid): (i, slice(None), slice(None), slice(None))
for i, blockid in enumerate(f["domain/stored_blockid_list"][...])
}
consolidated_data = len(field_idx_map) == blockid_location_arr.size
if not consolidated_data:
# in the near future, we may support one of the following 2 cases:
# > if (flat_structure):
# > _common_idx = (slice(None), slice(None), slice(None))
# > else:
# > _common_idx = (0, slice(None), slice(None), slice(None))
# > field_idx_map = defaultdict(lambda arg=_common_idx: arg)
raise ValueError(
"no support for reading Cholla datasets where data is distributed "
"among files that explicitly encode domain info."
)
else: # (not has_explicit_domain_info)
# this branch covers distributed datasets (directly written by Cholla) and
# older concatenated files.
#
# historically, when the dataset is concatenated (in post-processing),
# the "nprocs" hdf5 attribute has been dropped
blockid_location_arr = _infer_blockid_location_arr(
fname_template=inferred_fname_template,
global_dims=f.attrs["dims"].astype("=i8"),
arr_shape=f.attrs.get("nprocs", np.array([1, 1, 1])).astype("=i8"),
)
consolidated_data = blockid_location_arr.size == 1

def _get_common_idx():
return (slice(None), slice(None), slice(None))

field_idx_map = defaultdict(_get_common_idx)

# STEP 4: Finalize the fname template
# ===================================
if consolidated_data:
fname_template = filename
elif cur_filename_suffix != 0:
raise ValueError( # mostly just a sanity check!
"filename passed to yt.load for a distributed cholla dataset must "
"end in '.0'"
)
else:
fname_template = inferred_fname_template

mapping = _BlockDiskMapping(
fname_template=fname_template,
h5_group="./" if flat_structure else "field",
idx_map=field_idx_map,
)
return blockid_location_arr, mapping
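
A sketch of the intended call pattern (hypothetical filename); this mirrors how `_count_grids` uses the function:

with _h5py.File("snap.h5.0", "r") as f:  # hypothetical file
    blockid_location_arr, mapping = _determine_data_layout(f)
# blockid_location_arr.size is the grid count; mapping drives all later IO
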


def _infer_fname_template(filename: str) -> tuple[str, int | None]:
"""Infers the template for all Cholla data-files based on the filename
passed to ``yt.load``.

It effectively separates the template string from the process-id suffix,
and returns both parts in a 2-tuple.

There are 2 conventions for the names of Cholla's data-files:
1. "root.h5.{blockid}" is the standard format Cholla uses when writing
files storing a single snapshot. Each MPI rank writes a separate
file and replaces ``{blockid}`` with its MPI rank (modern Cholla
versions without MPI replace ``{blockid}`` with ``0``)
2. "root.h5" is the standard format used by Cholla's concatenation
scripts (older versions of Cholla without MPI also used this format
to name outputs)

Returns
-------
template: str
The path to the file containing a blockid is given by calling
``template.format(blockid=<blockid>)``. (This will work whether
all blocks are stored in 1 file or blocks are distributed across
files)
cur_blockid_suffix: int or None
The blockid specified in the suffix of ``filename``. If there isn't a
suffix, then this will be None.
"""

# at this time, we expect the suffix to be the minimum number of characters
# that are necessary to represent the process id. For flexibility, we will
# allow extra zero-padding

_dir, _base = os.path.split(filename)
match _base.rpartition("."):
case ("", ".", _): # Cholla never writes a file like this
raise ValueError(
f"1st character in {filename!r} is the only '.' in the file's name"
)
case (prefix, ".", suffix) if suffix.isdecimal():
return os.path.join(_dir, f"{prefix}.{{blockid}}"), int(suffix)
case _:
return (filename, None)
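
For illustration (hypothetical POSIX paths), the two conventions map through the function like this:

# distributed output: the decimal suffix is a blockid
assert _infer_fname_template("out/10.h5.0") == ("out/10.h5.{blockid}", 0)
# extra zero-padding is tolerated
assert _infer_fname_template("out/10.h5.007") == ("out/10.h5.{blockid}", 7)
# concatenated output: no decimal suffix, the filename is its own template
assert _infer_fname_template("out/10.h5") == ("out/10.h5", None)
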