@@ -87,20 +87,22 @@ int main(int argc, char *argv[])
 {
 #if USE_MPI
   int provided;
+  int localRank;
+
   MPI_Init_thread(&argc, &argv, MPI_THREAD_FUNNELED, &provided);
-  if (provided < MPI_THREAD_FUNNELED) {
+
+  if (provided < MPI_THREAD_FUNNELED)
     MPI_Abort(MPI_COMM_WORLD, provided);
-  }
 
   MPI_Comm_rank(MPI_COMM_WORLD, &rank);
   MPI_Comm_size(MPI_COMM_WORLD, &procs);
 
-  // Each local rank on a given node will own a single device/GCD
-  MPI_Comm shmcomm;
+  // Each rank will run the benchmark on a single device
+  MPI_Comm shared_comm;
   MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0,
-                      MPI_INFO_NULL, &shmcomm);
-  int localRank;
-  MPI_Comm_rank(shmcomm, &localRank);
+                      MPI_INFO_NULL, &shared_comm);
+  MPI_Comm_rank(shared_comm, &localRank);
+
   // Set device index to be the local MPI rank
   deviceIndex = localRank;
 #endif
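For context: MPI_Comm_split_type with MPI_COMM_TYPE_SHARED groups the ranks that share a node, so the rank inside that sub-communicator is a node-local index that maps one-to-one onto the node's devices. A minimal sketch of the idiom (the helper name node_local_rank and the MPI_Comm_free cleanup are illustrative, not part of the patch):

    #include <mpi.h>

    // Return this process's rank among the ranks on the same node,
    // suitable for use as a device index (0..ranks_per_node-1).
    int node_local_rank()
    {
      MPI_Comm shared_comm;
      MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0,
                          MPI_INFO_NULL, &shared_comm);
      int local_rank;
      MPI_Comm_rank(shared_comm, &local_rank);
      MPI_Comm_free(&shared_comm);
      return local_rank;
    }

The patch itself keeps shared_comm alive for the life of main(), which is also fine; freeing it is optional once localRank has been extracted.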
@@ -110,16 +112,17 @@ int main(int argc, char *argv[])
   if (!output_as_csv)
   {
 #if USE_MPI
-    if (rank == 0) {
+    if (rank == 0)
 #endif
+    {
       std::cout
         << "BabelStream" << std::endl
         << "Version: " << VERSION_STRING << std::endl
         << "Implementation: " << IMPLEMENTATION_STRING << std::endl;
 #if USE_MPI
       std::cout << "Number of MPI ranks: " << procs << std::endl;
-    }
 #endif
+    }
   }
 
   if (use_float)
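The brace shuffle here keeps the block balanced in both build modes: with USE_MPI the braces belong to `if (rank == 0)`, and without it the same braces form an unconditional scope, so the banner prints exactly once either way. Roughly, the pattern reduces to this sketch (function name and abbreviated banner text are illustrative):

    #include <iostream>

    void print_banner(int rank, int procs)
    {
    #if USE_MPI
      if (rank == 0)      // only the leader prints under MPI
    #endif
      {
        std::cout << "BabelStream" << std::endl;
    #if USE_MPI
        std::cout << "Number of MPI ranks: " << procs << std::endl;
    #endif
      }
    }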
@@ -145,54 +148,48 @@ std::vector<std::vector<double>> run_all(Stream<T> *stream, T& sum)
   // Declare timers
   std::chrono::high_resolution_clock::time_point t1, t2;
 
+#if USE_MPI
+  // Set MPI data type for the dot-product reduction
+  MPI_Datatype MPI_DTYPE = use_float ? MPI_FLOAT : MPI_DOUBLE;
+#endif
+
   // Main loop
   for (unsigned int k = 0; k < num_times; k++)
   {
-#if USE_MPI
-    MPI_Barrier(MPI_COMM_WORLD);
-#endif
 
     // Execute Copy
     t1 = std::chrono::high_resolution_clock::now();
     stream->copy();
-#if USE_MPI
-    MPI_Barrier(MPI_COMM_WORLD);
-#endif
     t2 = std::chrono::high_resolution_clock::now();
     timings[0].push_back(std::chrono::duration_cast<std::chrono::duration<double>>(t2 - t1).count());
 
     // Execute Mul
     t1 = std::chrono::high_resolution_clock::now();
     stream->mul();
-#if USE_MPI
-    MPI_Barrier(MPI_COMM_WORLD);
-#endif
     t2 = std::chrono::high_resolution_clock::now();
     timings[1].push_back(std::chrono::duration_cast<std::chrono::duration<double>>(t2 - t1).count());
 
     // Execute Add
     t1 = std::chrono::high_resolution_clock::now();
     stream->add();
-#if USE_MPI
-    MPI_Barrier(MPI_COMM_WORLD);
-#endif
     t2 = std::chrono::high_resolution_clock::now();
     timings[2].push_back(std::chrono::duration_cast<std::chrono::duration<double>>(t2 - t1).count());
 
     // Execute Triad
     t1 = std::chrono::high_resolution_clock::now();
     stream->triad();
-#if USE_MPI
-    MPI_Barrier(MPI_COMM_WORLD);
-#endif
     t2 = std::chrono::high_resolution_clock::now();
     timings[3].push_back(std::chrono::duration_cast<std::chrono::duration<double>>(t2 - t1).count());
 
     // Execute Dot
+#if USE_MPI
+    // Synchronize ranks before computing dot-product
+    MPI_Barrier(MPI_COMM_WORLD);
+#endif
     t1 = std::chrono::high_resolution_clock::now();
     sum = stream->dot();
 #if USE_MPI
-    MPI_Allreduce(MPI_IN_PLACE, &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+    MPI_Allreduce(MPI_IN_PLACE, &sum, 1, MPI_DTYPE, MPI_SUM, MPI_COMM_WORLD);
 #endif
     t2 = std::chrono::high_resolution_clock::now();
     timings[4].push_back(std::chrono::duration_cast<std::chrono::duration<double>>(t2 - t1).count());
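Two fixes land in this hunk. First, the per-kernel barriers go away, so each rank times its kernels independently; only the dot product keeps a barrier, since the Allreduce that follows couples all ranks anyway. Second, the dot reduction previously passed MPI_DOUBLE even when T is float, making MPI read and write 8 bytes through a 4-byte `sum`; the new MPI_DTYPE matches the element type. A self-contained sketch of the corrected reduction (the helper global_dot is illustrative):

    #include <mpi.h>
    #include <type_traits>

    // Reduce a per-rank partial dot product to a global sum, with the
    // MPI datatype matched to the element type T (float or double).
    template <typename T>
    T global_dot(T partial)
    {
      static_assert(std::is_same<T, float>::value || std::is_same<T, double>::value,
                    "dot product is only reduced for float or double");
      MPI_Datatype dtype = std::is_same<T, float>::value ? MPI_FLOAT : MPI_DOUBLE;
      // MPI_IN_PLACE: 'partial' is both send and receive buffer on every rank
      MPI_Allreduce(MPI_IN_PLACE, &partial, 1, dtype, MPI_SUM, MPI_COMM_WORLD);
      return partial;
    }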
@@ -217,9 +214,6 @@ std::vector<std::vector<double>> run_triad(Stream<T> *stream)
   t1 = std::chrono::high_resolution_clock::now();
   for (unsigned int k = 0; k < num_times; k++)
   {
-#if USE_MPI
-    MPI_Barrier(MPI_COMM_WORLD);
-#endif
     stream->triad();
   }
   t2 = std::chrono::high_resolution_clock::now();
@@ -241,14 +235,8 @@ std::vector<std::vector<double>> run_nstream(Stream<T> *stream)
 
   // Run nstream in loop
   for (int k = 0; k < num_times; k++) {
-#if USE_MPI
-    MPI_Barrier(MPI_COMM_WORLD);
-#endif
     t1 = std::chrono::high_resolution_clock::now();
     stream->nstream();
-#if USE_MPI
-    MPI_Barrier(MPI_COMM_WORLD);
-#endif
     t2 = std::chrono::high_resolution_clock::now();
     timings[0].push_back(std::chrono::duration_cast<std::chrono::duration<double>>(t2 - t1).count());
   }
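run_triad and run_nstream get the same treatment: with no barrier inside the timed region, each rank measures only its own kernel, and cross-rank variation is handled afterwards by the min/max collation in run(). What remains is the plain timing idiom, sketched here with time_once as an illustrative helper:

    #include <chrono>

    // Time a single callable invocation; no MPI synchronization inside
    // the timed region, matching the patched benchmark loops.
    template <typename F>
    double time_once(F&& f)
    {
      auto t1 = std::chrono::high_resolution_clock::now();
      f();
      auto t2 = std::chrono::high_resolution_clock::now();
      return std::chrono::duration<double>(t2 - t1).count();
    }

Usage would be along the lines of `timings[0].push_back(time_once([&]{ stream->nstream(); }));`.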
@@ -416,10 +404,6 @@ void run()
 
 
   stream->read_arrays(a, b, c);
-#if USE_MPI
-  // Only check solutions on rank 0 in case verificaiton fails
-  if (rank == 0)
-#endif
   check_solution<T>(num_times, a, b, c, sum);
 
   // Display timing results
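With the rank-0 guard removed, every rank now verifies the arrays it actually computed, so a corruption confined to one device can no longer hide behind a passing rank 0. If per-rank checking should also surface on the leader, one hedged follow-up (not part of this patch) would be a logical-AND reduction over pass/fail flags:

    #include <mpi.h>

    // Hypothetical sketch: combine per-rank pass/fail flags so rank 0
    // can report a verification failure occurring on any rank.
    bool all_ranks_passed(bool local_pass)
    {
      int ok = local_pass ? 1 : 0;
      MPI_Allreduce(MPI_IN_PLACE, &ok, 1, MPI_INT, MPI_LAND, MPI_COMM_WORLD);
      return ok != 0;
    }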
@@ -485,17 +469,9 @@ void run()
   double max = *minmax.second;
 
 #if USE_MPI
-  // Collate timings
-  if (rank == 0)
-  {
-    MPI_Reduce(MPI_IN_PLACE, &min, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);
-    MPI_Reduce(MPI_IN_PLACE, &max, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
-  }
-  else
-  {
-    MPI_Reduce(&min, NULL, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);
-    MPI_Reduce(&max, NULL, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
-  }
+  // Collect global min/max timings; these are always double, regardless of use_float
+  MPI_Allreduce(MPI_IN_PLACE, &min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
+  MPI_Allreduce(MPI_IN_PLACE, &max, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
 
   sizes[i] *= procs;
 #endif
 
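The rank-0/else asymmetry that MPI_Reduce forces (MPI_IN_PLACE only on the root, a dummy receive buffer elsewhere) is replaced by a symmetric MPI_Allreduce, so every rank ends up holding the global extrema. The datatype stays MPI_DOUBLE here: min and max come from the timings vectors, which are double even when the benchmark data is float. Scaling sizes[i] by procs then makes the reported bandwidth an aggregate over all ranks. A sketch of the collation (collate_timings is an illustrative name):

    #include <mpi.h>

    // Global min/max of a per-rank timing; symmetric across ranks,
    // so no rank-0 special case is needed.
    void collate_timings(double& min_time, double& max_time)
    {
      MPI_Allreduce(MPI_IN_PLACE, &min_time, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
      MPI_Allreduce(MPI_IN_PLACE, &max_time, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
    }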