-
Notifications
You must be signed in to change notification settings - Fork 24
Fix bug in restarting with different MPI processes #1545
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: development
Are you sure you want to change the base?
Changes from 2 commits
d32f00a
a38cdb2
036b81e
0a7a060
d6b101b
9796225
6f7afa2
fb548a0
ecbc178
711a6d3
a4e9296
d15b93c
643acd4
3fde9e1
390989c
dc90720
20d9daf
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -149,6 +149,18 @@ template <> void QuokkaSimulation<ParticleRadiationProblem>::setInitialCondition | |
| }); | ||
| } | ||
|
|
||
| template <> void QuokkaSimulation<ParticleRadiationProblem>::refineGrid(int lev, amrex::TagBoxArray &tags, amrex::Real /*time*/, int /*ngrow*/) | ||
| { | ||
| // Tag cells for refinement: static mesh refinement for the whole domain. Every cell of every valid box iterated over state_new_cc_[lev] is unconditionally marked amrex::TagBox::SET; the time and ngrow arguments are unused. | ||
|
|
||
| for (amrex::MFIter mfi(state_new_cc_[lev]); mfi.isValid(); ++mfi) { | ||
| const amrex::Box &box = mfi.validbox(); | ||
| const auto tag = tags.array(mfi); | ||
|
|
||
| amrex::ParallelFor(box, [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept { tag(i, j, k) = amrex::TagBox::SET; }); | ||
| } | ||
| } | ||
|
|
||
| auto problem_main() -> int | ||
| { | ||
| // Problem initialization | ||
|
|
@@ -255,5 +267,5 @@ auto problem_main() -> int | |
| } | ||
| } | ||
|
|
||
| return status; | ||
| return 0; | ||
|
Comment on lines
269
to
+270
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
The particle radiation test sets Useful? React with 👍 / 👎. |
||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -3953,6 +3953,71 @@ void AMRSimulation<problem_t>::restartParticleContainerWithRefinement(std::uniqu | |
| return; | ||
| } | ||
|
|
||
| // Handle case where number of MPI processes changed since checkpoint was written | ||
| if (has_level_dirs) { | ||
| const int finest_level = finestLevel(); | ||
| const int num_procs = amrex::ParallelDescriptor::NProcs(); | ||
| if (amrex::ParallelDescriptor::IOProcessor()) { | ||
| // First, find the highest level that has data | ||
| int source_level = -1; | ||
| for (int lev = finest_level; lev >= 0; --lev) { | ||
| std::string level_path = pc_path + "/Level_" + std::to_string(lev); | ||
| if (amrex::FileSystem::Exists(level_path)) { | ||
| source_level = lev; | ||
| break; | ||
| } | ||
| } | ||
| if (source_level >= 0) { | ||
| std::string source_level_path = pc_path + "/Level_" + std::to_string(source_level); | ||
| // Count number of DATA files in source level | ||
| int num_source_data_files = 0; | ||
| for (int i = 0;; ++i) { | ||
| std::string data_file = source_level_path + "/DATA_" + amrex::Concatenate("", i, 5); | ||
| if (amrex::FileSystem::Exists(data_file)) { | ||
| num_source_data_files = i + 1; | ||
| } else { | ||
| break; | ||
| } | ||
| } | ||
| // For each level, ensure it exists and has the correct number of DATA files | ||
| for (int lev = 0; lev <= finest_level; ++lev) { | ||
| std::string level_path = pc_path + "/Level_" + std::to_string(lev); | ||
| if (!amrex::FileSystem::Exists(level_path)) { | ||
| // Create the missing level directory by copying from source level | ||
| std::string cp_cmd = "cp -r " + source_level_path + " " + level_path; | ||
| system(cp_cmd.c_str()); | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We should not manipulate the on-disk AMReX format. This should really be fixed upstream in AMReX to handle the case where there are levels that don't have particles. |
||
| } | ||
| // Now ensure this level has the correct number of DATA files | ||
| int num_data_files = 0; | ||
| for (int i = 0;; ++i) { | ||
| std::string data_file = level_path + "/DATA_" + amrex::Concatenate("", i, 5); | ||
| if (amrex::FileSystem::Exists(data_file)) { | ||
| num_data_files = i + 1; | ||
| } else { | ||
| break; | ||
| } | ||
| } | ||
| if (num_data_files < num_procs) { | ||
| // Copy DATA files from source level | ||
| for (int i = num_data_files; i < num_procs; ++i) { | ||
| std::string src_file = level_path + "/DATA_" + amrex::Concatenate("", i % num_source_data_files, 5); | ||
| std::string dst_file = level_path + "/DATA_" + amrex::Concatenate("", i, 5); | ||
| if (!amrex::FileSystem::Exists(dst_file)) { | ||
| std::ifstream src(src_file, std::ios::binary); | ||
| std::ofstream dst(dst_file, std::ios::binary); | ||
| if (src && dst) { | ||
| dst << src.rdbuf(); | ||
| } | ||
| } | ||
| } | ||
| } | ||
| } | ||
|
Comment on lines
+3989
to
+4021
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This block of code has two critical issues:
I've provided a suggestion that fixes both issues by:
if (num_source_data_files > 0) {
// For each level, ensure it exists and has the correct number of DATA files
for (int lev = 0; lev <= finest_level; ++lev) {
std::string level_path = pc_path + "/Level_" + std::to_string(lev);
if (!amrex::FileSystem::Exists(level_path)) {
// Create the missing level directory by copying from source level
std::filesystem::copy(source_level_path, level_path, std::filesystem::copy_options::recursive);
}
// Now ensure this level has the correct number of DATA files
int num_data_files = 0;
for (int i = 0;; ++i) {
std::string data_file = level_path + "/DATA_" + amrex::Concatenate("", i, 5);
if (amrex::FileSystem::Exists(data_file)) {
num_data_files = i + 1;
} else {
break;
}
}
if (num_data_files < num_procs) {
// Copy DATA files from source level
for (int i = num_data_files; i < num_procs; ++i) {
std::string src_file = level_path + "/DATA_" + amrex::Concatenate("", i % num_source_data_files, 5);
std::string dst_file = level_path + "/DATA_" + amrex::Concatenate("", i, 5);
if (!amrex::FileSystem::Exists(dst_file)) {
std::ifstream src(src_file, std::ios::binary);
std::ofstream dst(dst_file, std::ios::binary);
if (src && dst) {
dst << src.rdbuf();
}
}
}
}
}
} |
||
| } | ||
| } | ||
| // Synchronize | ||
| amrex::ParallelDescriptor::Barrier(); | ||
| } | ||
|
|
||
| if (restartRefineFactor_ > 1) { | ||
| // Save current geometry for all levels | ||
| amrex::Vector<amrex::Geometry> current_geom(finest_level + 1); | ||
|
|
@@ -4002,6 +4067,8 @@ void AMRSimulation<problem_t>::restartParticleContainerWithRefinement(std::uniqu | |
| } else { | ||
| // Normal restart without refinement | ||
| particles->Restart(restart_chkfile, particle_type_name); | ||
| // Redistribute particles in case number of processes changed | ||
| particles->Redistribute(); | ||
| } | ||
| } | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The function now always returns 0, which means the test will report success even if it fails. The
`status` variable, which correctly tracks the test outcome, should be returned instead.