Draft: route SW4 checkpoint writes and restart reads through SCR when
SW4_USE_SCR is defined (SCR_Start_output / SCR_Route_file / SCR_Complete_output
on write; SCR_Have_restart / SCR_Start_restart / SCR_Complete_restart on read).
Known gap: read_checkpoint() and read_checkpoint_hdf5() still reference
cycle_num, which these hunks only declare inside getDt().

diff --git a/src/CheckPoint.C b/src/CheckPoint.C
index 75baed0c..c06d6233 100644
--- a/src/CheckPoint.C
+++ b/src/CheckPoint.C
@@ -299,6 +299,10 @@ void CheckPoint::compute_file_suffix(int cycle, std::stringstream& fileSuffix) {
   fileSuffix << ".sw4checkpoint";
 }
 
+void CheckPoint::compute_file_suffix(const char* cycle, std::stringstream& fileSuffix) {
+  fileSuffix << mCheckPointFile << "." << cycle << ".sw4checkpoint";
+}
+
 //-----------------------------------------------------------------------
 void CheckPoint::write_checkpoint(float_sw4 a_time, int a_cycle,
                                   vector& a_Um, vector& a_U,
@@ -338,9 +342,25 @@ void CheckPoint::write_checkpoint(float_sw4 a_time, int a_cycle,
     s << fileSuffix.str();
   }
 
+#ifndef SW4_USE_SCR
   // Keep track of the number of files, save previous file name, and delete the
   // second last.
   cycle_checkpoints(s.str());
+#else
+  // SCR can delete older checkpoints,
+  // e.g., SCR_PREFIX_SIZE=3 to keep a sliding window of the last 3
+
+  // Inform SCR that a new checkpoint is starting
+  std::stringstream cycle_num;
+  cycle_num << "cycle=" << a_cycle;
+  SCR_Start_output(cycle_num.str().c_str(), SCR_FLAG_CHECKPOINT);
+
+  // Ask SCR for the path to write our checkpoint file
+  // This could be moved just before each H5Fcreate call if we need to preserve the original file name
+  char scr_file[SCR_MAX_FILENAME];
+  SCR_Route_file(s.str().c_str(), scr_file);
+  s.str(scr_file);
+#endif
 
   // Open file from processor zero and write header.
   int hsize;
@@ -441,6 +461,11 @@ void CheckPoint::write_checkpoint(float_sw4 a_time, int a_cycle,
     delete[] doubleField;
   }
   if (iwrite) close(fid);
+
+#ifdef SW4_USE_SCR
+  int valid = 1;
+  SCR_Complete_output(valid);
+#endif
 }  // end write_checkpoint()
 
 //-----------------------------------------------------------------------
@@ -464,7 +489,22 @@ void CheckPoint::read_checkpoint(float_sw4& a_time, int& a_cycle,
       s << mRestartPath << "/";
     else if (mEW->getPath() != "./")
       s << mEW->getPath();
+
+#ifndef SW4_USE_SCR
     s << mRestartFile;
+#else
+    // TODO: need to get cycle_num from the earlier SCR_Have_restart call in getDt()
+
+    // TODO: this is not right (cycle_num is not available here yet), but you get the idea...
+    std::stringstream fileSuffix;
+    compute_file_suffix(cycle_num, fileSuffix);
+    s << fileSuffix.str();
+
+    // Ask SCR for the path to open our checkpoint file
+    char scr_file[SCR_MAX_FILENAME];
+    SCR_Route_file(s.str().c_str(), scr_file);
+    s.str(scr_file);
+#endif
   }
 
   // Open file from processor zero and read header.
@@ -574,19 +614,50 @@ void CheckPoint::read_checkpoint(float_sw4& a_time, int& a_cycle,
     delete[] doubleField;
   }
   if (iread) close(fid);
+
+#ifdef SW4_USE_SCR
+  int valid = 1;
+  SCR_Complete_restart(valid);
+#endif
 }
 
 //-----------------------------------------------------------------------
 float_sw4 CheckPoint::getDt() {
-#ifndef SW4_USE_SCR
   float_sw4 dt;
+
+#ifdef SW4_USE_SCR
+  // Get checkpoint name (cycle number) from SCR
+  int have_restart = 0;
+  char cycle_num[SCR_MAX_FILENAME];
+  SCR_Have_restart(&have_restart, cycle_num);
+  if (!have_restart) {
+    // We expected SCR to have something to get to this point
+    abort();
+  }
+
+  SCR_Start_restart(cycle_num);
+#endif
+
   if (mEW->getRank() == 0) {
     std::stringstream s;
     if (mRestartPathSet)
       s << mRestartPath << "/";
     else if (mEW->getPath() != "./")
       s << mEW->getPath();
+
+#ifndef SW4_USE_SCR
     s << mRestartFile;  // string 's' is the file name including path
+#else
+    // TODO: this is not right, but you get the idea...
+    std::stringstream fileSuffix;
+    compute_file_suffix(cycle_num, fileSuffix);
+    s << fileSuffix.str();
+
+    // Ask SCR for the path to open our checkpoint file
+    char scr_file[SCR_MAX_FILENAME];
+    SCR_Route_file(s.str().c_str(), scr_file);
+    s.str(scr_file);
+#endif
 
     if (mUseHDF5) {
 #ifdef USE_HDF5
@@ -620,8 +691,8 @@ float_sw4 CheckPoint::getDt() {
   }
   MPI_Bcast(&dt, 1, mEW->m_mpifloat, 0, mEW->m_cartesian_communicator);
   return dt;
-#else
+#if 0
 
   int have_restart = 0;
   char checkpoint_dir[SCR_MAX_FILENAME];
   SCR_Have_restart(&have_restart, checkpoint_dir);
@@ -1119,9 +1190,24 @@ void CheckPoint::write_checkpoint_hdf5(float_sw4 a_time, int a_cycle,
     s << mEW->getPath() << "/";
   s << fileSuffix.str();
 
+#ifndef SW4_USE_SCR
   // Keep track of the number of files, save previous file name, and delete the
   // second last.
   cycle_checkpoints(s.str());
+#else
+  // SCR can delete older checkpoints, e.g., SCR_PREFIX_SIZE=3
+
+  // Inform SCR that a new checkpoint is starting
+  std::stringstream cycle_num;
+  cycle_num << "cycle=" << a_cycle;
+  SCR_Start_output(cycle_num.str().c_str(), SCR_FLAG_CHECKPOINT);
+
+  // Ask SCR for the path to write our checkpoint file
+  // This could be moved just before each H5Fcreate call if we need to preserve the original file name
+  char scr_file[SCR_MAX_FILENAME];
+  SCR_Route_file(s.str().c_str(), scr_file);
+  s.str(scr_file);
+#endif
 
   hid_t fid, fapl, dxpl, dspace, mspace, mydspace, dtype, dcpl;
   int myrank, nrank;
@@ -1343,6 +1429,12 @@ void CheckPoint::write_checkpoint_hdf5(float_sw4 a_time, int a_cycle,
 #else
   H5Fclose(fid);
 #endif
+
+#ifdef SW4_USE_SCR
+  int valid = 1;
+  SCR_Complete_output(valid);
+#endif
+
   etime = MPI_Wtime();
 
   if (myrank == 0)
@@ -1365,7 +1457,22 @@ void CheckPoint::read_checkpoint_hdf5(float_sw4& a_time, int& a_cycle,
     s << mRestartPath << "/";
   else if (mEW->getPath() != "./")
     s << mEW->getPath();
+
+#ifndef SW4_USE_SCR
   s << mRestartFile;
+#else
+  // TODO: need to get cycle_num from the earlier SCR_Have_restart call in getDt()
+
+  // TODO: this is not right (cycle_num is not available here yet), but you get the idea...
+  std::stringstream fileSuffix;
+  compute_file_suffix(cycle_num, fileSuffix);
+  s << fileSuffix.str();
+
+  // Ask SCR for the path to open our checkpoint file
+  char scr_file[SCR_MAX_FILENAME];
+  SCR_Route_file(s.str().c_str(), scr_file);
+  s.str(scr_file);
+#endif
 
   if (myrank == 0)
     std::cout << "Reading checkpoint from file " << s.str() << std::endl;
@@ -1474,6 +1581,11 @@ void CheckPoint::read_checkpoint_hdf5(float_sw4& a_time, int& a_cycle,
   H5Pclose(fapl);
   H5Pclose(dxpl);
   H5Fclose(fid);
+
+#ifdef SW4_USE_SCR
+  int valid = 1;
+  SCR_Complete_restart(valid);
+#endif
 }
 #endif  // End USE_HDF5
 //-----------------------------------------------------------------------
diff --git a/src/CheckPoint.h b/src/CheckPoint.h
index 5068c97b..36b2df09 100644
--- a/src/CheckPoint.h
+++ b/src/CheckPoint.h
@@ -75,6 +75,7 @@ void write_checkpoint_scr(float_sw4 a_time, int a_cycle,
   void define_pio();
   void setSteps(int a_steps);
   void compute_file_suffix(int cycle, std::stringstream& fileSuffix);
+  void compute_file_suffix(const char* cycle, std::stringstream& fileSuffix);
   void cycle_checkpoints(string fname);
   void write_header(int& fid, float_sw4 a_time, int a_cycle, int& hsize);
   void read_header(int& fid, float_sw4& a_time, int& a_cycle, int& hsize);