Skip to content

Commit fe6f3de

Browse files
atmyers, ax3l, asalmgren
authored
Use less device memory when checkpointing particles (#3238)
This chunks the operation per box rather than per rank so as to use less device memory.

The proposed changes:
- [ ] fix a bug or incorrect behavior in AMReX
- [ ] add new capabilities to AMReX
- [ ] changes answers in the test suite to more than roundoff level
- [ ] are likely to significantly affect the results of downstream AMReX users
- [ ] include documentation in the code and/or rst files, if appropriate

---------

Co-authored-by: Axel Huebl <[email protected]>
Co-authored-by: Ann Almgren <[email protected]>
1 parent f264290 commit fe6f3de

File tree

1 file changed

+11
-10
lines changed

1 file changed

+11
-10
lines changed

Src/Particle/AMReX_WriteBinaryParticleData.H

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -189,16 +189,12 @@ packIOData (Vector<int>& idata, Vector<ParticleReal>& rdata, const PC& pc, int l
189189
const Long rChunkSize = AMREX_SPACEDIM + num_output_real;
190190
rdata.resize(np*rChunkSize);
191191

192-
typename PC::IntVector idata_d(idata.size());
193-
typename PC::RealVector rdata_d(rdata.size());
194-
195192
typename PC::IntVector write_int_comp_d(write_int_comp.size());
196193
typename PC::IntVector write_real_comp_d(write_real_comp.size());
197194
Gpu::copyAsync(Gpu::hostToDevice, write_int_comp.begin(), write_int_comp.end(),
198195
write_int_comp_d.begin());
199196
Gpu::copyAsync(Gpu::hostToDevice, write_real_comp.begin(), write_real_comp.end(),
200197
write_real_comp_d.begin());
201-
Gpu::Device::streamSynchronize();
202198

203199
const auto write_int_comp_d_ptr = write_int_comp_d.data();
204200
const auto write_real_comp_d_ptr = write_real_comp_d.data();
@@ -211,6 +207,9 @@ packIOData (Vector<int>& idata, Vector<ParticleReal>& rdata, const PC& pc, int l
211207
typename PC::IntVector offsets(np_tile);
212208
int num_copies = Scan::ExclusiveSum(np_tile, pflags.begin(), offsets.begin(), Scan::retSum);
213209

210+
typename PC::IntVector idata_d(num_copies*iChunkSize);
211+
typename PC::RealVector rdata_d(num_copies*rChunkSize);
212+
214213
const auto flag_ptr = pflags.data();
215214

216215
auto idata_d_ptr = idata_d.data();
@@ -224,11 +223,11 @@ packIOData (Vector<int>& idata, Vector<ParticleReal>& rdata, const PC& pc, int l
224223
const auto p = ptd.getSuperParticle(pindex);
225224

226225
if (flag_ptr[pindex]) {
227-
std::size_t iout_index = (pindex+poffset)*iChunkSize;
226+
std::size_t iout_index = pindex*iChunkSize;
228227
packParticleIDs(&idata_d_ptr[iout_index], p, is_checkpoint);
229228
iout_index += 2;
230229

231-
std::size_t rout_index = (pindex+poffset)*rChunkSize;
230+
std::size_t rout_index = pindex*rChunkSize;
232231
for (int j = 0; j < AMREX_SPACEDIM; j++) {
233232
rdata_d_ptr[rout_index] = p.pos(j);
234233
rout_index++;
@@ -267,12 +266,14 @@ packIOData (Vector<int>& idata, Vector<ParticleReal>& rdata, const PC& pc, int l
267266
}
268267
});
269268

269+
Gpu::copyAsync(Gpu::deviceToHost, idata_d.begin(), idata_d.end(),
270+
idata.begin() + typename PC::IntVector::difference_type(poffset));
271+
Gpu::copyAsync(Gpu::deviceToHost, rdata_d.begin(), rdata_d.end(),
272+
rdata.begin() + typename PC::RealVector::difference_type(poffset));
273+
Gpu::Device::streamSynchronize();
274+
270275
poffset += num_copies;
271276
}
272-
273-
Gpu::copyAsync(Gpu::deviceToHost, idata_d.begin(), idata_d.end(), idata.begin());
274-
Gpu::copyAsync(Gpu::deviceToHost, rdata_d.begin(), rdata_d.end(), rdata.begin());
275-
Gpu::Device::streamSynchronize();
276277
}
277278

278279
template <class PC>

0 commit comments

Comments
 (0)