Skip to content

Commit 40e6439

Browse files
committed
use existing timesteps written to a file to figure out the remaining file outputs before rotation
1 parent ef8d454 commit 40e6439

File tree

4 files changed

+64
-40
lines changed

4 files changed

+64
-40
lines changed

src/chkpt_op.hpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,6 @@ class chkptOp
7373
{
7474
std::string base_name;
7575
std::string path;
76-
boost::optional<size_t> rotate_offset;
7776
};
7877

7978
// Cached output rotation state from checkpoint metadata (used during resume).

src/core.cpp

Lines changed: 7 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -490,11 +490,6 @@ void core::config_checkpoint( pt::ptree& value)
490490
chkptOp::ugrid_output_state state;
491491
state.base_name = itr.second.get<std::string>("base_name", "");
492492
state.path = itr.second.get<std::string>("path", "");
493-
auto offset = itr.second.get_optional<size_t>("rotate_offset");
494-
if (offset)
495-
{
496-
state.rotate_offset = *offset;
497-
}
498493
if (!state.base_name.empty() || !state.path.empty())
499494
{
500495
_checkpoint_opts.ugrid_outputs.push_back(state);
@@ -539,10 +534,14 @@ void core::config_checkpoint( pt::ptree& value)
539534
writer->set_store_path(match->path);
540535
SPDLOG_DEBUG("Resuming ugrid output from checkpoint file {}", match->path);
541536
}
542-
if (match->rotate_offset)
537+
if (out.rotate_frequency && !match->path.empty())
543538
{
544-
out.rotate_offset = match->rotate_offset;
545-
SPDLOG_DEBUG("Resuming ugrid rotation with offset {}", *match->rotate_offset);
539+
auto& writer = boost::get<std::shared_ptr<ugrid_writer>>(out.writer);
540+
// Derive rotation offset from the existing ugrid file instead of checkpoint JSON.
541+
size_t time_len = writer->probe_time_index();
542+
out.rotate_offset = time_len % *out.rotate_frequency;
543+
SPDLOG_DEBUG("Resuming ugrid rotation with offset {} ({} timesteps already in file)",
544+
*out.rotate_offset, time_len);
546545
}
547546
}
548547
}
@@ -2587,13 +2586,6 @@ void core::run()
25872586
auto& writer = boost::get<std::shared_ptr<ugrid_writer>>(out.writer);
25882587
entry.put("path", writer->store_path());
25892588

2590-
if (out.rotate_frequency)
2591-
{
2592-
// Store the offset for the next timestep after this checkpoint.
2593-
size_t offset = (current_ts + 1) % *out.rotate_frequency;
2594-
entry.put("rotate_offset", offset);
2595-
}
2596-
25972589
ugrid_outputs.push_back(std::make_pair("", entry));
25982590
}
25992591
if (!ugrid_outputs.empty())

src/mesh/ugrid_writer.cpp

Lines changed: 54 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,59 @@ const std::string& ugrid_writer::store_path() const
9898
return _store_path;
9999
}
100100

101+
size_t ugrid_writer::probe_time_index()
102+
{
103+
if (!store_exists())
104+
{
105+
return 0;
106+
}
107+
108+
// Open the existing ugrid file read-only to determine how many timesteps are present.
109+
int fid = -1;
110+
MPI_Info info_used;
111+
MPI_Comm_get_info(_comm_world, &info_used);
112+
113+
nc_chk_ret(nc_open_par(_fname.c_str(), NC_NOWRITE, _comm_world, info_used, &fid));
114+
size_t time_len = read_time_index(fid);
115+
116+
nc_chk_ret(nc_close(fid));
117+
MPI_Info_free(&info_used);
118+
119+
return time_len;
120+
}
121+
122+
size_t ugrid_writer::read_time_index(int fid) const
123+
{
124+
int unlimdimidp;
125+
nc_chk_ret(nc_inq_unlimdim(fid, &unlimdimidp));
126+
127+
size_t time_len = 0;
128+
nc_chk_ret(nc_inq_dimlen(fid, unlimdimidp, &time_len));
129+
130+
if (_global->from_checkpoint() && time_len > 0)
131+
{
132+
// Trim to the restart time so we overwrite any outputs past the checkpoint.
133+
int time_varid = -1;
134+
int status = nc_inq_varid(fid, "time", &time_varid);
135+
if (status == NC_NOERR)
136+
{
137+
std::vector<double> time_vals(time_len, 0.0);
138+
nc_chk_ret(nc_get_var_double(fid, time_varid, time_vals.data()));
139+
140+
const double restart_time_minutes = _global->posix_time_double() / 60.0;
141+
auto it = std::lower_bound(time_vals.begin(), time_vals.end(), restart_time_minutes);
142+
auto idx = static_cast<size_t>(std::distance(time_vals.begin(), it));
143+
144+
if (idx < time_len)
145+
{
146+
time_len = idx;
147+
}
148+
}
149+
}
150+
151+
return time_len;
152+
}
153+
101154
std::string ugrid_writer::build_store_uri(const std::string& store_path) const
102155
{
103156
if (!_use_zarr)
@@ -394,30 +447,7 @@ void ugrid_writer::open_ugrid(const std::vector<std::string>& output_variables)
394447
nc_chk_ret(nc_var_par_access(_ugrid_fid, p.second, NC_COLLECTIVE));
395448
}
396449

397-
int unlimdimidp;
398-
399-
nc_chk_ret(nc_inq_unlimdim(_ugrid_fid, &unlimdimidp)); // get the time /dimension/. it's the only unlimited
400-
nc_chk_ret(nc_inq_dimlen(_ugrid_fid, unlimdimidp, &_time_index));
401-
402-
// If we are resuming from a checkpoint, make sure we overwrite any timesteps
403-
// beyond the checkpoint instead of blindly appending to the existing file.
404-
if (_global->from_checkpoint() && _time_index > 0)
405-
{
406-
// chm outputs minutes since as time unit
407-
const double restart_time_minutes = _global->posix_time_double() / 60.0;
408-
std::vector<double> time_vals(_time_index, 0.0);
409-
410-
nc_chk_ret(nc_get_var_double(_ugrid_fid, _ugrid_id_var["time"], time_vals.data()));
411-
412-
auto it = std::lower_bound(time_vals.begin(), time_vals.end(), restart_time_minutes);
413-
auto idx = static_cast<size_t>(std::distance(time_vals.begin(), it));
414-
415-
if (idx < _time_index)
416-
{
417-
SPDLOG_DEBUG("Resuming ugrid at time index {} (was {}).", idx, _time_index);
418-
_time_index = idx;
419-
}
420-
}
450+
_time_index = read_time_index(_ugrid_fid);
421451

422452
SPDLOG_DEBUG("Existing ugrid output has {} timesteps already", _time_index);
423453

src/mesh/ugrid_writer.hpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,13 +57,16 @@ class ugrid_writer
5757
const boost::optional<double>& chunk_target_mb);
5858
void set_store_path(std::string store_path);
5959
const std::string& store_path() const;
60+
// Probe the existing ugrid file to determine how many timesteps are already written.
61+
size_t probe_time_index();
6062

6163
bool bitgroom;
6264
bool compress;
6365
private:
6466
std::string build_store_uri(const std::string& store_path) const;
6567
bool store_exists() const;
6668
size_t compute_time_chunk_len(size_t max_faces_per_rank, size_t num_output_vars) const;
69+
size_t read_time_index(int fid) const;
6770

6871
//holds the file id for the ugrid output netcdf
6972
int _ugrid_fid;

0 commit comments

Comments
 (0)