Skip to content

Commit 92ade51

Browse files
committed
Try to handle chunking better. More more vtk writer stuff out of triangulation
1 parent b396fb7 commit 92ade51

File tree

10 files changed

+325
-36
lines changed

10 files changed

+325
-36
lines changed

docs/checkpointing.rst

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,5 +88,9 @@ that specified in the checkpoint json file.
8888
The savestate occurs at the end of the timestep, so the resume time will be one timestep into the future, i.e., the
8989
next timestep.
9090

91+
UGRID rotation state (active output file and rotation cadence) is stored in the checkpoint metadata. On resume, CHM
92+
continues writing to the active ugrid file and keeps the original rotation cadence even though internal timestep
93+
counters restart at zero.
94+
9195
The make use of checkpointing, a module must implement the ``checkpoint(mesh& domain, netcdf& chkpt)`` and
92-
``load_checkpoint(mesh& domain, netcdf& chkpt)`` methods.
96+
``load_checkpoint(mesh& domain, netcdf& chkpt)`` methods.

docs/configuration.rst

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -526,6 +526,23 @@ please see the :ref:`output` section.
526526
:default: 0
527527

528528
Only applies to ugrid outputs. The timestep frequency to create a new ugrid file at.
529+
Rotated files are named ``<base_name>_YYYYMMDDTHHMMSS.nc`` and the cadence is preserved across checkpoint resume.
530+
531+
.. confval:: chunk_time_len
532+
533+
:type: int
534+
:default: unset
535+
536+
Only applies to ugrid outputs. Sets an explicit time chunk length (in timesteps). Must not be set alongside
537+
``chunk_target_mb``.
538+
539+
.. confval:: chunk_target_mb
540+
541+
:type: float
542+
:default: unset
543+
544+
Only applies to ugrid outputs. Sets the target chunk size per variable (in MB). Must not be set alongside
545+
``chunk_time_len``.
529546

530547
.. confval:: write_all_parameters
531548

docs/output.rst

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,13 @@ and improves the output speed. For a mesh of #cells=2200 for 24 hours with 3 ran
5858
UGRID output can also be stored as a Zarr directory via NCZarr by setting ``output.ugrid.format`` to ``zarr``. This
5959
creates a ``.zarr`` store with the same schema and parallel write behavior.
6060

61+
Rotation
62+
--------
63+
64+
UGRID rotation (``rotate_frequency``) starts a new file every N timesteps. Rotated files are named
65+
``<base_name>_YYYYMMDDTHHMMSS.nc`` based on the model time. When resuming from a checkpoint, CHM continues writing to
66+
the file that was active at checkpoint time and preserves the rotation cadence.
67+
6168
Chunking optimizations
6269
----------------------
6370

@@ -68,6 +75,9 @@ UGRID time chunking is sized to balance Dask recommendations and output cadence:
6875
- Chunk lengths align to the mesh output cadence (``frequency`` or ``only_last_n``). If neither is set,
6976
the chunker assumes only a small number of outputs and keeps chunk sizes small.
7077

78+
Users can override chunk sizing with either ``chunk_time_len`` (explicit timesteps) or ``chunk_target_mb`` (target MB
79+
per variable). Only one override can be set at a time.
80+
7181
These settings reduce metadata overhead and avoid overly large task graphs while respecting model output frequency.
7282

7383
+---------------------+-----------+----------------+

src/chkpt_op.hpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,9 @@
2525

2626
#include <cstddef>
2727

28+
#include <string>
29+
#include <vector>
30+
2831
#include <boost/filesystem/path.hpp>
2932
#include <boost/mpi.hpp>
3033
#include <boost/optional.hpp>
@@ -66,6 +69,16 @@ class chkptOp
6669
// used to stop the simulation when we checkpoint when we are outta time
6770
bool checkpoint_request_terminate;
6871

72+
struct ugrid_output_state
73+
{
74+
std::string base_name;
75+
std::string path;
76+
boost::optional<size_t> rotate_offset;
77+
};
78+
79+
// Cached output rotation state from checkpoint metadata (used during resume).
80+
std::vector<ugrid_output_state> ugrid_outputs;
81+
6982
/**
7083
* Should checkpointing occur
7184
* @param current_ts

src/core.cpp

Lines changed: 139 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -375,7 +375,7 @@ void core::config_checkpoint( pt::ptree& value)
375375
CHM_THROW_EXCEPTION(config_error, "Error in checkpoint config");
376376
}
377377

378-
if(*auto_resume)
378+
if(auto_resume && *auto_resume)
379379
{
380380

381381
// match the filename of checkpoint_20170901T070000.np1.json
@@ -386,6 +386,13 @@ void core::config_checkpoint( pt::ptree& value)
386386
boost::filesystem::path newest_file; // will end up with the most recent chkpt to resume from
387387
boost::filesystem::path const dir(output_folder_path / "checkpoint" );
388388

389+
if (!boost::filesystem::exists(dir))
390+
{
391+
SPDLOG_ERROR("auto_resume is enabled but checkpoint directory is missing: {}", dir.string());
392+
CHM_THROW_EXCEPTION(config_error, "Checkpoint auto_resume failed: missing checkpoint directory");
393+
}
394+
else
395+
{
389396
for(const auto& entry : boost::filesystem::directory_iterator(dir))
390397
{
391398
if (boost::filesystem::is_regular_file(entry.status()))
@@ -407,6 +414,7 @@ void core::config_checkpoint( pt::ptree& value)
407414
}
408415
}
409416
}
417+
}
410418

411419
if(newest_file.empty())
412420
{
@@ -421,14 +429,14 @@ void core::config_checkpoint( pt::ptree& value)
421429

422430

423431

424-
if (file)
425-
{
426-
_checkpoint_opts.load_from_checkpoint = true;
432+
if (file)
433+
{
434+
_checkpoint_opts.load_from_checkpoint = true;
427435

428-
boost::filesystem::path ckpt_path = *file;
436+
boost::filesystem::path ckpt_path = *file;
429437
// ckpt_path = boost::filesystem::canonical(ckpt_path);
430438

431-
auto chkp = read_json(ckpt_path.string());
439+
auto chkp = read_json(ckpt_path.string());
432440

433441
size_t csz = 1;
434442
size_t rank = 0;
@@ -472,10 +480,31 @@ void core::config_checkpoint( pt::ptree& value)
472480
CHM_THROW_EXCEPTION(config_error, "Error reading list of checkpoint files");
473481
}
474482

475-
ckpt_nc_path = ckpt_path.parent_path() / ckpt_nc_path;
476-
SPDLOG_DEBUG("Rank {} using checkpoint restore file {}", rank, ckpt_nc_path.string());
477-
_checkpoint_opts.in_savestate.open(ckpt_nc_path.string());
478-
}
483+
ckpt_nc_path = ckpt_path.parent_path() / ckpt_nc_path;
484+
SPDLOG_DEBUG("Rank {} using checkpoint restore file {}", rank, ckpt_nc_path.string());
485+
_checkpoint_opts.in_savestate.open(ckpt_nc_path.string());
486+
487+
_checkpoint_opts.ugrid_outputs.clear();
488+
if (auto ugrid_child = chkp.get_child_optional("ugrid_outputs"))
489+
{
490+
// Preserve ugrid rotation state for resume; applied later when outputs are configured.
491+
for (auto &itr : *ugrid_child)
492+
{
493+
chkptOp::ugrid_output_state state;
494+
state.base_name = itr.second.get<std::string>("base_name", "");
495+
state.path = itr.second.get<std::string>("path", "");
496+
auto offset = itr.second.get_optional<size_t>("rotate_offset");
497+
if (offset)
498+
{
499+
state.rotate_offset = *offset;
500+
}
501+
if (!state.base_name.empty() || !state.path.empty())
502+
{
503+
_checkpoint_opts.ugrid_outputs.push_back(state);
504+
}
505+
}
506+
}
507+
}
479508
}
480509
void core::config_forcing(pt::ptree &value)
481510
{
@@ -1232,7 +1261,58 @@ void core::config_output(pt::ptree &value)
12321261
if (out.mesh_output_formats == output_info::mesh_outputs::ugrid)
12331262
{
12341263
boost::get<boost::shared_ptr<ugrid_writer>>(out.writer)
1235-
->set_output_cadence(out.frequency, out.only_last_n);
1264+
->set_output_cadence(out.frequency, out.only_last_n, out.rotate_frequency);
1265+
}
1266+
1267+
if (out.mesh_output_formats == output_info::mesh_outputs::ugrid)
1268+
{
1269+
auto chunk_len_steps = itr.second.get_optional<size_t>("chunk_time_len");
1270+
auto chunk_target_mb = itr.second.get_optional<double>("chunk_target_mb");
1271+
if (chunk_len_steps && chunk_target_mb)
1272+
{
1273+
CHM_THROW_EXCEPTION(config_error, "Set only one of chunk_time_len or chunk_target_mb for ugrid output");
1274+
}
1275+
if (chunk_len_steps && *chunk_len_steps == 0)
1276+
{
1277+
CHM_THROW_EXCEPTION(config_error, "chunk_time_len must be > 0 for ugrid output");
1278+
}
1279+
if (chunk_target_mb && *chunk_target_mb <= 0.0)
1280+
{
1281+
CHM_THROW_EXCEPTION(config_error, "chunk_target_mb must be > 0 for ugrid output");
1282+
}
1283+
boost::get<boost::shared_ptr<ugrid_writer>>(out.writer)
1284+
->set_chunking_override(chunk_len_steps, chunk_target_mb);
1285+
}
1286+
1287+
if (_checkpoint_opts.load_from_checkpoint && out.mesh_output_formats == output_info::mesh_outputs::ugrid)
1288+
{
1289+
// Restore the output path and rotation offset captured in the checkpoint metadata.
1290+
const chkptOp::ugrid_output_state* match = nullptr;
1291+
for (const auto &state : _checkpoint_opts.ugrid_outputs)
1292+
{
1293+
if (state.base_name == out.base_name)
1294+
{
1295+
match = &state;
1296+
break;
1297+
}
1298+
}
1299+
if (!match && _checkpoint_opts.ugrid_outputs.size() == 1)
1300+
{
1301+
match = &_checkpoint_opts.ugrid_outputs.front();
1302+
}
1303+
if (match)
1304+
{
1305+
if (!match->path.empty())
1306+
{
1307+
auto& writer = boost::get<boost::shared_ptr<ugrid_writer>>(out.writer);
1308+
writer->set_store_path(match->path);
1309+
SPDLOG_DEBUG("Resuming ugrid output from checkpoint file {}", match->path);
1310+
}
1311+
if (match->rotate_offset)
1312+
{
1313+
out.rotate_offset = match->rotate_offset;
1314+
}
1315+
}
12361316
}
12371317

12381318
auto specific_datetime = itr.second.get_optional<std::string>("specific_datetime");
@@ -2288,8 +2368,7 @@ void core::run()
22882368

22892369
//setup a XML writer for the PVD paraview format
22902370
pt::ptree pvd;
2291-
pvd.add("VTKFile.<xmlattr>.type", "Collection");
2292-
pvd.add("VTKFile.<xmlattr>.version", "0.1");
2371+
vtk_writer::init_pvd(pvd);
22932372

22942373

22952374
SPDLOG_DEBUG("Loading first timestep's met data");
@@ -2476,6 +2555,35 @@ void core::run()
24762555
}
24772556
tree.add_child("files", tmp_files);
24782557

2558+
pt::ptree ugrid_outputs;
2559+
for (auto &out : _outputs)
2560+
{
2561+
if (out.mesh_output_formats != output_info::mesh_outputs::ugrid)
2562+
{
2563+
continue;
2564+
}
2565+
2566+
// Cache the active file and rotation offset so resume can align rotation cadence.
2567+
pt::ptree entry;
2568+
entry.put("base_name", out.base_name);
2569+
2570+
auto& writer = boost::get<boost::shared_ptr<ugrid_writer>>(out.writer);
2571+
entry.put("path", writer->store_path());
2572+
2573+
if (out.rotate_frequency)
2574+
{
2575+
// Store the offset for the next timestep after this checkpoint.
2576+
size_t offset = (current_ts + 1) % *out.rotate_frequency;
2577+
entry.put("rotate_offset", offset);
2578+
}
2579+
2580+
ugrid_outputs.push_back(std::make_pair("", entry));
2581+
}
2582+
if (!ugrid_outputs.empty())
2583+
{
2584+
tree.add_child("ugrid_outputs", ugrid_outputs);
2585+
}
2586+
24792587

24802588
if(_comm_world.rank() == 0)
24812589
{
@@ -2529,13 +2637,11 @@ void core::run()
25292637
for(int rank = 0; rank < _comm_world.size(); rank++)
25302638
{
25312639

2532-
// write paths that are relative to the pvd file
2533-
boost::filesystem::path vtu_path(output_folder_path.string() + "/vtu/" + p.filename().string()+"_"+std::to_string(rank) + ".vtu");
2534-
pt::ptree &dataset = pvd.add("VTKFile.Collection.DataSet", "");
2535-
dataset.add("<xmlattr>.timestep", _global->posix_time_int());
2536-
dataset.add("<xmlattr>.group", "");
2537-
dataset.add("<xmlattr>.part", rank);
2538-
dataset.add("<xmlattr>.file", boost::filesystem::relative(vtu_path, output_folder_path).string());
2640+
vtk_writer::append_pvd_entry(pvd,
2641+
output_folder_path,
2642+
p.string(),
2643+
rank,
2644+
_global->posix_time_int());
25392645

25402646
}
25412647
}
@@ -2553,7 +2659,19 @@ void core::run()
25532659
auto& writer = boost::get<boost::shared_ptr<ugrid_writer>>(itr.writer);
25542660
if (new_ugrid)
25552661
{
2662+
auto rotated_path = [&]() {
2663+
boost::filesystem::path base_path(itr.fname);
2664+
std::string stem = base_path.stem().string();
2665+
std::string ext = base_path.extension().string();
2666+
std::string ts = boost::posix_time::to_iso_string(_global->posix_time());
2667+
boost::filesystem::path rotated =
2668+
base_path.parent_path() /
2669+
(stem + "_" + ts + ext);
2670+
return rotated.string();
2671+
};
25562672
writer->close_ugrid();
2673+
SPDLOG_DEBUG("Rotating ugrid output to {}", rotated_path());
2674+
writer->set_store_path(rotated_path());
25572675
}
25582676

25592677
writer->write_ugrid({itr.variables.begin(), itr.variables.end()} );
@@ -2637,10 +2755,7 @@ void core::run()
26372755
{
26382756
#endif
26392757

2640-
// output the pvd one level higher in the main outdir than we have previously
2641-
boost::filesystem::path path(itr.fname + ".pvd");
2642-
pt::write_xml( (output_folder_path.string() / path.filename()).string(),
2643-
pvd, std::locale(), pt::xml_writer_settings<std::string>(' ', 4));
2758+
vtk_writer::write_pvd(pvd, output_folder_path, itr.fname);
26442759
break;
26452760

26462761
#ifdef USE_MPI

0 commit comments

Comments
 (0)