@@ -126,6 +126,16 @@ class SimulationHelper : public IPlugin
126
126
/* trigger checkpoint notification */
127
127
if (checkpointPeriod && (currentStep % checkpointPeriod == 0 ))
128
128
{
129
+ /* first synchronize: if something failed, we can spare the time
130
+ * for the checkpoint writing */
131
+ CUDA_CHECK (cudaDeviceSynchronize ());
132
+ CUDA_CHECK (cudaGetLastError ());
133
+
134
+ GridController<DIM> &gc = Environment<DIM>::get ().GridController ();
135
+ /* can be omitted for better scaling, but allows us to spare the
136
+ * time for checkpointing if some ranks died */
137
+ MPI_CHECK (MPI_Barrier (gc.getCommunicator ().getMPIComm ()));
138
+
129
139
/* create directory containing checkpoints */
130
140
if (numCheckpoints == 0 )
131
141
{
@@ -135,7 +145,14 @@ class SimulationHelper : public IPlugin
135
145
Environment<DIM>::get ().PluginConnector ().checkpointPlugins (currentStep,
136
146
checkpointDirectory);
137
147
138
- GridController<DIM> &gc = Environment<DIM>::get ().GridController ();
148
+ /* important synchronize: only if no errors occurred until this
149
+ * point guarantees that a checkpoint is usable */
150
+ CUDA_CHECK (cudaDeviceSynchronize ());
151
+ CUDA_CHECK (cudaGetLastError ());
152
+
153
+ /* \todo in an ideal world with MPI-3, this would be an
154
+ * MPI_Ibarrier call and this function would return an MPI_Request
155
+ * that could be checked */
139
156
MPI_CHECK (MPI_Barrier (gc.getCommunicator ().getMPIComm ()));
140
157
141
158
if (gc.getGlobalRank () == 0 )
0 commit comments