Skip to content

Commit f0ba515

Browse files
Merge pull request ComputationalRadiationPhysics#897 from ax3l/topic-cudaSyncCheckpoints
Checkpoints: Check for CUDA Errors
2 parents a16bea8 + 3386050 commit f0ba515

File tree

2 files changed

+26
-1
lines changed

2 files changed

+26
-1
lines changed

Diff for: src/libPMacc/include/simulationControl/SimulationHelper.hpp

+18-1
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,16 @@ class SimulationHelper : public IPlugin
126126
/* trigger checkpoint notification */
127127
if (checkpointPeriod && (currentStep % checkpointPeriod == 0))
128128
{
129+
/* first synchronize: if something failed, we can spare the time
130+
* for the checkpoint writing */
131+
CUDA_CHECK(cudaDeviceSynchronize());
132+
CUDA_CHECK(cudaGetLastError());
133+
134+
GridController<DIM> &gc = Environment<DIM>::get().GridController();
135+
/* can be omitted for better scaling, but allows us to save the
136+
* time for checkpointing if some ranks died */
137+
MPI_CHECK(MPI_Barrier(gc.getCommunicator().getMPIComm()));
138+
129139
/* create directory containing checkpoints */
130140
if (numCheckpoints == 0)
131141
{
@@ -135,7 +145,14 @@ class SimulationHelper : public IPlugin
135145
Environment<DIM>::get().PluginConnector().checkpointPlugins(currentStep,
136146
checkpointDirectory);
137147

138-
GridController<DIM> &gc = Environment<DIM>::get().GridController();
148+
/* important synchronize: only if no errors occurred until this
149+
* point guarantees that a checkpoint is usable */
150+
CUDA_CHECK(cudaDeviceSynchronize());
151+
CUDA_CHECK(cudaGetLastError());
152+
153+
/* \todo in an ideal world with MPI-3, this would be an
154+
* MPI_Ibarrier call and this function would return an MPI_Request
155+
* that could be checked */
139156
MPI_CHECK(MPI_Barrier(gc.getCommunicator().getMPIComm()));
140157

141158
if (gc.getGlobalRank() == 0)

Diff for: src/picongpu/include/initialization/InitialiserController.hpp

+8
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,14 @@ class InitialiserController : public IInitPlugin
8484
Environment<>::get().PluginConnector().restartPlugins(restartStep, restartDirectory);
8585
__getTransactionEvent().waitForFinished();
8686

87+
CUDA_CHECK(cudaDeviceSynchronize());
88+
CUDA_CHECK(cudaGetLastError());
89+
90+
GridController<simDim> &gc = Environment<simDim>::get().GridController();
91+
/* can be omitted for better scaling, but guarantees the user
92+
* that the restart was successful */
93+
MPI_CHECK(MPI_Barrier(gc.getCommunicator().getMPIComm()));
94+
8795
log<picLog::SIMULATION_STATE > ("Loading from persistent data finished");
8896
}
8997

0 commit comments

Comments
 (0)