using namespace pfc;

// Benchmark configuration
constexpr int GRID_SIZE = 128;     // 128³ = 2,097,152 points
constexpr int NUM_ITERATIONS = 10; // Number of iterations for averaging

/**
 * @brief Benchmark FFT performance for a given backend
 * ...
 */
double benchmark_fft(fft::Backend backend, const World &world,
                     const decomposition::Decomposition &decomp, int rank_id) {

  std::string backend_name =
      (backend == fft::Backend::FFTW) ? "FFTW (CPU)" : "CUDA (GPU)";
  std::cout << "\n========================================\n";
  std::cout << "Benchmarking: " << backend_name << "\n";
  std::cout << "========================================\n";

  // Create FFT with selected backend
  auto fft = fft::create_with_backend(decomp, rank_id, backend);

  std::cout << "Grid size: " << GRID_SIZE
            << "³ = " << (GRID_SIZE * GRID_SIZE * GRID_SIZE) << " points\n";
  std::cout << "Real data size: " << fft->size_inbox() << " (local)\n";
  std::cout << "Complex data size: " << fft->size_outbox() << " (local)\n";
  std::cout << "Iterations: " << NUM_ITERATIONS << "\n\n";

  // ...
    // CPU backend: use std::vector
    std::vector<double> real_data(fft->size_inbox());
    std::vector<std::complex<double>> complex_data(fft->size_outbox());

    // Initialize with some test data
    for (size_t i = 0; i < real_data.size(); ++i) {
      real_data[i] = std::sin(2.0 * M_PI * i / real_data.size());
    }

    // Warmup
    std::cout << "Warmup...";
    fft->forward(real_data, complex_data);
    fft->backward(complex_data, real_data);
    std::cout << " done.\n";

    // Benchmark
    std::cout << "Running benchmark...\n";
    auto start = std::chrono::high_resolution_clock::now();

    for (int iter = 0; iter < NUM_ITERATIONS; ++iter) {
      fft->forward(real_data, complex_data);
      fft->backward(complex_data, real_data);
    }

    auto end = std::chrono::high_resolution_clock::now();
    auto duration =
        std::chrono::duration_cast<std::chrono::microseconds>(end - start);

    double avg_time_ms = duration.count() / (1000.0 * NUM_ITERATIONS);

    std::cout << "Total time: " << duration.count() / 1000.0 << " ms\n";
    std::cout << "Average time per forward+backward: " << std::fixed
              << std::setprecision(3) << avg_time_ms << " ms\n";

    return avg_time_ms;

  } else {
#if defined(OpenPFC_ENABLE_CUDA)
    // GPU backend: use DataBuffer
    using RealBufferGPU = core::DataBuffer<backend::CudaTag, double>;
    using ComplexBufferGPU =
        core::DataBuffer<backend::CudaTag, std::complex<double>>;

    RealBufferGPU real_data(fft->size_inbox());
    ComplexBufferGPU complex_data(fft->size_outbox());

    // Initialize on host, copy to device
    std::vector<double> host_data(fft->size_inbox());
    for (size_t i = 0; i < host_data.size(); ++i) {
      host_data[i] = std::sin(2.0 * M_PI * i / host_data.size());
    }
    real_data.copy_from_host(host_data);

    // Get the FFT_Impl with CUDA backend
    auto *fft_cuda =
        dynamic_cast<fft::FFT_Impl<heffte::backend::cufft> *>(fft.get());
    if (!fft_cuda) {
      throw std::runtime_error("Failed to cast to CUDA FFT implementation");
    }

    // Warmup
    std::cout << "Warmup...";
    fft_cuda->forward(real_data, complex_data);
    fft_cuda->backward(complex_data, real_data);
    cudaDeviceSynchronize(); // Ensure GPU work is complete
    std::cout << " done.\n";

    // Benchmark
    std::cout << "Running benchmark...\n";
    auto start = std::chrono::high_resolution_clock::now();

    for (int iter = 0; iter < NUM_ITERATIONS; ++iter) {
      fft_cuda->forward(real_data, complex_data);
      fft_cuda->backward(complex_data, real_data);
    }

    cudaDeviceSynchronize(); // Ensure all GPU work is complete
    auto end = std::chrono::high_resolution_clock::now();
    auto duration =
        std::chrono::duration_cast<std::chrono::microseconds>(end - start);

    double avg_time_ms = duration.count() / (1000.0 * NUM_ITERATIONS);

    std::cout << "Total time: " << duration.count() / 1000.0 << " ms\n";
    std::cout << "Average time per forward+backward: " << std::fixed
              << std::setprecision(3) << avg_time_ms << " ms\n";

    return avg_time_ms;
#else
    throw std::runtime_error("CUDA support not compiled in");
#endif
  }
}

int main(int argc, char *argv[]) {
  // Initialize MPI
  MPI_Init(&argc, &argv);

  int rank, size;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  if (rank == 0) {
    std::cout << "\n╔════════════════════════════════════════╗\n";
    std::cout << "║   FFT Backend Performance Benchmark    ║\n";
    std::cout << "╚════════════════════════════════════════╝\n";
    std::cout << "\nMPI ranks: " << size << "\n";
  }

  try {
    // Create computational domain (128³ grid)
    auto world = world::create({GRID_SIZE, GRID_SIZE, GRID_SIZE}, {1.0, 1.0, 1.0},
                               {1.0, 1.0, 1.0});

    // Create domain decomposition
    auto decomp = decomposition::create(world, size);

    if (rank == 0) {
      std::cout << "Domain: " << GRID_SIZE << " × " << GRID_SIZE << " × "
                << GRID_SIZE << " = " << (GRID_SIZE * GRID_SIZE * GRID_SIZE)
                << " grid points\n";
    }

    // Benchmark CPU (FFTW)
    double cpu_time_ms = 0.0;
    if (rank == 0) {
      cpu_time_ms = benchmark_fft(fft::Backend::FFTW, world, decomp, rank);
    }

#if defined(OpenPFC_ENABLE_CUDA)
    // Benchmark GPU (CUDA)
    double gpu_time_ms = 0.0;
    if (rank == 0) {
      gpu_time_ms = benchmark_fft(fft::Backend::CUDA, world, decomp, rank);
    }

    // Report results
    if (rank == 0) {
      std::cout << "\n========================================\n";
      // ...
      std::cout << std::fixed << std::setprecision(3);
      std::cout << "CPU (FFTW) time: " << cpu_time_ms << " ms\n";
      std::cout << "GPU (CUDA) time: " << gpu_time_ms << " ms\n";

      double speedup = cpu_time_ms / gpu_time_ms;
      std::cout << "\nSpeedup: " << std::setprecision(2) << speedup << "x\n";

      if (speedup > 1.0) {
        std::cout << "✓ GPU is " << speedup << "x faster than CPU\n";
      } else {
        std::cout << "✗ CPU is " << (1.0 / speedup) << "x faster than GPU\n";
        std::cout
            << "(Note: GPU may be slower for small problems due to overhead)\n";
      }

      // Performance metrics
      size_t total_points = GRID_SIZE * GRID_SIZE * GRID_SIZE;
      double cpu_throughput = total_points / (cpu_time_ms * 1e-3) / 1e6; // Mpoints/s
      double gpu_throughput = total_points / (gpu_time_ms * 1e-3) / 1e6; // Mpoints/s

      std::cout << "\nThroughput:\n";
      std::cout << "CPU: " << std::setprecision(1) << cpu_throughput
                << " Mpoints/s\n";
      std::cout << "GPU: " << std::setprecision(1) << gpu_throughput
                << " Mpoints/s\n";

      std::cout << "\n========================================\n";
      std::cout << "Recommendation:\n";
      std::cout << "========================================\n";
    // ...
      std::cout << "\nCPU (FFTW) time: " << cpu_time_ms << " ms\n";
    }
#endif

  } catch (const std::exception &e) {
    if (rank == 0) {
      std::cerr << "\nError: " << e.what() << std::endl;
    }
    MPI_Finalize();
    return 1;
  }

  MPI_Finalize();
  return 0;
}
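
// ---------------------------------------------------------------------------
// Usage sketch (illustrative addition, not part of the benchmark above): the
// backend-selection calls in isolation, so the API flow is visible without the
// timing boilerplate. Assumptions are marked explicitly: the helper name and
// the 64³ grid are hypothetical, the surrounding headers/namespaces are taken
// to be the same as in this file, and only calls that already appear above
// (world::create, decomposition::create, fft::create_with_backend, size_inbox,
// size_outbox, forward, backward) are used. FFTW is chosen so the sketch does
// not require a CUDA build.
// ---------------------------------------------------------------------------
void fft_roundtrip_example(int mpi_size, int rank) {
  // Small cubic domain; the two {1.0, 1.0, 1.0} arguments mirror the
  // world::create call in main() above.
  auto world = world::create({64, 64, 64}, {1.0, 1.0, 1.0}, {1.0, 1.0, 1.0});
  auto decomp = decomposition::create(world, mpi_size);

  // CPU (FFTW) backend; swap in fft::Backend::CUDA when built with CUDA.
  auto fft = fft::create_with_backend(decomp, rank, fft::Backend::FFTW);

  // Local (per-rank) buffers sized by the FFT object.
  std::vector<double> real_data(fft->size_inbox(), 1.0);
  std::vector<std::complex<double>> complex_data(fft->size_outbox());

  // One forward + backward transform; normalization and timing are ignored here.
  fft->forward(real_data, complex_data);
  fft->backward(complex_data, real_data);
}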