Skip to content

Commit 4428058

Browse files
committed
Add support for distributed cholla datasets.
1 parent 14c49c4 commit 4428058

2 files changed

Lines changed: 134 additions & 31 deletions

File tree

yt/frontends/cholla/data_structures.py

Lines changed: 116 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -5,19 +5,50 @@
55

66
from yt.data_objects.index_subobjects.grid_patch import AMRGridPatch
77
from yt.data_objects.static_output import Dataset
8-
from yt.funcs import setdefaultattr
8+
from yt.funcs import get_pbar, setdefaultattr
99
from yt.geometry.api import Geometry
1010
from yt.geometry.grid_geometry_handler import GridIndex
1111
from yt.utilities.on_demand_imports import _h5py as h5py
1212

1313
from .fields import ChollaFieldInfo
1414

1515

16+
def _split_fname_proc_suffix(filename: str):
17+
"""Splits ``filename`` at the '.' separating the beginning part of the
18+
string from the process-id suffix, and returns both parts in a 2-tuple.
19+
20+
When cholla is compiled with MPI and it directly writes data-files, each
21+
process appends a suffix to each filename that denotes the process-id. For
22+
example, the MPI-compiled version might write '0.h5.0'. If that function is
23+
passed such a string, then it returns ``('0.h5', '0')``.
24+
25+
In cases where there is no suffix, the output is ``(filename, '')``. This
26+
might come up if the user concatenated the output files, which is common
27+
practice.
28+
"""
29+
30+
# at this time, we expect the suffix to be the minimum number of characters
31+
# that are necessary to represent the process id. For flexibility, we will
32+
# allow extra zero-padding
33+
34+
sep_i = filename.rfind(".")
35+
suf_len = len(filename) - (sep_i + 1)
36+
if (sep_i == -1) or (suf_len == 0) or not filename[sep_i + 1 :].isdecimal():
37+
return (filename, "")
38+
elif (sep_i == 0) or ((sep_i - 1) == filename.rfind("/")):
39+
raise ValueError(
40+
f"can't split a process-suffix off of {filename!r} "
41+
"since the remaining filename would be empty"
42+
)
43+
else:
44+
return (filename[:sep_i], filename[sep_i + 1 :])
45+
46+
1647
class ChollaGrid(AMRGridPatch):
1748
_id_offset = 0
1849

19-
def __init__(self, id, index, level, dims):
20-
super().__init__(id, filename=index.index_filename, index=index)
50+
def __init__(self, id, index, level, dims, filename):
51+
super().__init__(id, filename=filename, index=index)
2152
self.Parent = None
2253
self.Children = []
2354
self.Level = level
@@ -42,23 +73,92 @@ def _detect_output_fields(self):
4273
self.field_list = [("cholla", k) for k in h5f.keys()]
4374

4475
def _count_grids(self):
    """Set ``self.num_grids`` to the number of per-process output files.

    The number of grids equals the number of MPI processes that wrote the
    dataset. When the output has been concatenated (a common
    post-processing step) the "nprocs" hdf5 attribute is usually dropped,
    so we fall back to a single grid.
    """
    with h5py.File(self.index_filename, mode="r") as h5f:
        nprocs = h5f.attrs.get("nprocs", np.array([1, 1, 1]))[:].astype("=i8")
    self.num_grids = np.prod(nprocs)

    if self.num_grids > 1:
        # When there's more than 1 grid, we expect the user to
        # - have not changed the names of the output files
        # - have passed the file written by process 0 to ``yt.load``
        # Sanity-check that self.index_filename carries the suffix of a
        # file written by mpi-process 0.
        suffix = _split_fname_proc_suffix(self.index_filename)[1]
        # an empty suffix means there is no numeric process-id at all;
        # checking it explicitly avoids raising a confusing
        # ``int('')`` ValueError instead of the intended message
        if suffix == "" or int(suffix) != 0:
            raise ValueError(
                "the primary file associated with a "
                "distributed cholla dataset must end in '.0'"
            )
4696

4797
def _parse_index(self):
    """Construct every grid patch and record its edges, dims, and level.

    For a distributed dataset (``num_grids > 1``) each per-process file is
    opened to read its ``offset`` and ``dims_local`` attributes; for a
    single (possibly concatenated) file the lone grid covers the full
    domain.
    """
    self.grids = np.empty(self.num_grids, dtype="object")

    # construct an iterable over the pairs of grid-index and corresponding
    # filename
    if self.num_grids == 1:
        ind_fname_pairs = [(0, self.index_filename)]
    else:
        # index_filename should have the form f'{self.directory}/<prefix>.0'
        # strip off the '.0' and determine the contents of <prefix>
        pref, suf = _split_fname_proc_suffix(self.index_filename)
        assert int(suf) == 0  # sanity check!

        ind_fname_pairs = ((i, f"{pref}.{i}") for i in range(self.num_grids))

    dims_global = self.ds.domain_dimensions[:]
    pbar = get_pbar("Parsing Hierarchy", self.num_grids)

    # It would be nice if we could avoid reading in every hdf5 file during
    # this step... (to do this, Cholla could probably encode how the blocks
    # are sorted in an hdf5 attribute)
    for i, fname in ind_fname_pairs:
        if self.num_grids == 1:
            # if the file was concatenated, we might be missing attributes
            # that are accessed in the other branch. To avoid issues, we use
            # hardcoded values
            left_frac, right_frac, dims_local = 0.0, 1.0, dims_global
        else:
            with h5py.File(fname, "r") as f:
                offset = f.attrs["offset"][:].astype("=i8")
                dims_local = f.attrs["dims_local"][:].astype("=i8")
            # edges as fractions of the full domain; rescaled to
            # code-length coordinates after the loop
            left_frac = offset / dims_global
            right_frac = (offset + dims_local) / dims_global

        # every grid sits on the root level (no AMR in these outputs)
        level = 0

        self.grids[i] = self.grid(
            i,
            index=self,
            level=level,
            dims=dims_local,
            filename=fname,
        )

        self.grid_left_edge[i] = left_frac
        self.grid_right_edge[i] = right_frac
        self.grid_dimensions[i] = dims_local
        self.grid_levels[i, 0] = level
        self.grid_particle_count[i, 0] = 0

        pbar.update(i + 1)
    pbar.finish()

    # convert the fractional edges stored above into code_length coordinates
    slope = self.ds.domain_width / self.ds.arr(np.ones(3), "code_length")
    self.grid_left_edge = self.grid_left_edge * slope + self.ds.domain_left_edge
    self.grid_right_edge = self.grid_right_edge * slope + self.ds.domain_left_edge

    self.max_level = 0

55157
def _populate_grid_objects(self):
56-
self.grids = np.empty(self.num_grids, dtype="object")
57158
for i in range(self.num_grids):
58-
g = self.grid(i, self, self.grid_levels.flat[i], self.grid_dimensions[i])
159+
g = self.grids[i]
59160
g._prepare_grid()
60161
g._setup_dx()
61-
self.grids[i] = g
62162

63163

64164
class ChollaDataset(Dataset):
@@ -103,9 +203,11 @@ def _parse_parameter_file(self):
103203
attrs = h5f.attrs
104204
self.parameters = dict(attrs.items())
105205
self.domain_left_edge = attrs["bounds"][:].astype("=f8")
106-
self.domain_right_edge = attrs["domain"][:].astype("=f8")
206+
self.domain_right_edge = self.domain_left_edge + attrs["domain"][:].astype(
207+
"=f8"
208+
)
107209
self.dimensionality = len(attrs["dims"][:])
108-
self.domain_dimensions = attrs["dims"][:].astype("=f8")
210+
self.domain_dimensions = attrs["dims"][:].astype("=i8")
109211
self.current_time = attrs["t"][:]
110212
self._periodicity = tuple(attrs.get("periodicity", (False, False, False)))
111213
self.gamma = attrs.get("gamma", 5.0 / 3.0)

yt/frontends/cholla/io.py

Lines changed: 18 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
import numpy as np
21

32
from yt.utilities.io_handler import BaseIOHandler
43
from yt.utilities.on_demand_imports import _h5py as h5py
@@ -14,22 +13,24 @@ def _read_particle_coords(self, chunks, ptf):
1413
def _read_particle_fields(self, chunks, ptf, selector):
1514
raise NotImplementedError
1615

17-
def _read_fluid_selection(self, chunks, selector, fields, size):
18-
data = {}
19-
for field in fields:
20-
data[field] = np.empty(size, dtype="float64")
21-
22-
with h5py.File(self.ds.parameter_filename, "r") as fh:
23-
ind = 0
24-
for chunk in chunks:
25-
for grid in chunk.objs:
26-
nd = 0
27-
for field in fields:
28-
ftype, fname = field
29-
values = fh[fname][:].astype("=f8")
30-
nd = grid.select(selector, values, data[field], ind)
31-
ind += nd
32-
return data
16+
def io_iter(self, chunks, fields):
    """Yield ``(field, grid, data)`` triples for every grid in ``chunks``.

    Loosely inspired by the Enzo/Enzo-E implementations (those use the
    lower-level hdf5 interface; unclear whether that affords any
    advantages). The open file handle is reused across consecutive grids
    that live in the same file.
    """
    fh, filename = None, None
    # try/finally guarantees the handle is closed even if a consumer
    # abandons the generator early or an exception propagates mid-read
    try:
        for chunk in chunks:
            for obj in chunk.objs:
                if obj.filename is None:  # unclear when this case arises...
                    continue
                if obj.filename != filename:
                    if fh is not None:
                        fh.close()
                    fh = h5py.File(obj.filename, "r")
                    filename = obj.filename
                for field in fields:
                    ftype, fname = field
                    yield field, obj, fh[fname][:].astype("=f8")
    finally:
        if fh is not None:
            fh.close()
3334

3435
def _read_chunk_data(self, chunk, fields):
3536
raise NotImplementedError

0 commit comments

Comments
 (0)