Skip to content

Commit 055e2d5

Browse files
committed
style: Format fft_backend_benchmark.cpp with clang-format
Fix clang-format violations to resolve CI failures. All formatting now complies with project style guidelines.
1 parent e712398 commit 055e2d5

File tree

1 file changed

+65
-57
lines changed

1 file changed

+65
-57
lines changed

examples/fft_backend_benchmark.cpp

Lines changed: 65 additions & 57 deletions
Original file line number | Diff line number | Diff line change
@@ -33,8 +33,8 @@
3333
using namespace pfc;
3434

3535
// Benchmark configuration
36-
constexpr int GRID_SIZE = 128; // 128³ = 2,097,152 points
37-
constexpr int NUM_ITERATIONS = 10; // Number of iterations for averaging
36+
constexpr int GRID_SIZE = 128; // 128³ = 2,097,152 points
37+
constexpr int NUM_ITERATIONS = 10; // Number of iterations for averaging
3838

3939
/**
4040
* @brief Benchmark FFT performance for a given backend
@@ -47,17 +47,18 @@ constexpr int NUM_ITERATIONS = 10; // Number of iterations for averaging
4747
*/
4848
double benchmark_fft(fft::Backend backend, const World &world,
4949
const decomposition::Decomposition &decomp, int rank_id) {
50-
51-
std::string backend_name = (backend == fft::Backend::FFTW) ? "FFTW (CPU)" : "CUDA (GPU)";
50+
51+
std::string backend_name =
52+
(backend == fft::Backend::FFTW) ? "FFTW (CPU)" : "CUDA (GPU)";
5253
std::cout << "\n========================================\n";
5354
std::cout << "Benchmarking: " << backend_name << "\n";
5455
std::cout << "========================================\n";
55-
56+
5657
// Create FFT with selected backend
5758
auto fft = fft::create_with_backend(decomp, rank_id, backend);
58-
59-
std::cout << "Grid size: " << GRID_SIZE << "³ = "
60-
<< (GRID_SIZE * GRID_SIZE * GRID_SIZE) << " points\n";
59+
60+
std::cout << "Grid size: " << GRID_SIZE
61+
<< "³ = " << (GRID_SIZE * GRID_SIZE * GRID_SIZE) << " points\n";
6162
std::cout << "Real data size: " << fft->size_inbox() << " (local)\n";
6263
std::cout << "Complex data size: " << fft->size_outbox() << " (local)\n";
6364
std::cout << "Iterations: " << NUM_ITERATIONS << "\n\n";
@@ -66,86 +67,90 @@ double benchmark_fft(fft::Backend backend, const World &world,
6667
// CPU backend: use std::vector
6768
std::vector<double> real_data(fft->size_inbox());
6869
std::vector<std::complex<double>> complex_data(fft->size_outbox());
69-
70+
7071
// Initialize with some test data
7172
for (size_t i = 0; i < real_data.size(); ++i) {
7273
real_data[i] = std::sin(2.0 * M_PI * i / real_data.size());
7374
}
74-
75+
7576
// Warmup
7677
std::cout << "Warmup...";
7778
fft->forward(real_data, complex_data);
7879
fft->backward(complex_data, real_data);
7980
std::cout << " done.\n";
80-
81+
8182
// Benchmark
8283
std::cout << "Running benchmark...\n";
8384
auto start = std::chrono::high_resolution_clock::now();
84-
85+
8586
for (int iter = 0; iter < NUM_ITERATIONS; ++iter) {
8687
fft->forward(real_data, complex_data);
8788
fft->backward(complex_data, real_data);
8889
}
89-
90+
9091
auto end = std::chrono::high_resolution_clock::now();
91-
auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
92-
92+
auto duration =
93+
std::chrono::duration_cast<std::chrono::microseconds>(end - start);
94+
9395
double avg_time_ms = duration.count() / (1000.0 * NUM_ITERATIONS);
94-
96+
9597
std::cout << "Total time: " << duration.count() / 1000.0 << " ms\n";
9698
std::cout << "Average time per forward+backward: " << std::fixed
9799
<< std::setprecision(3) << avg_time_ms << " ms\n";
98-
100+
99101
return avg_time_ms;
100-
102+
101103
} else {
102104
#if defined(OpenPFC_ENABLE_CUDA)
103105
// GPU backend: use DataBuffer
104106
using RealBufferGPU = core::DataBuffer<backend::CudaTag, double>;
105-
using ComplexBufferGPU = core::DataBuffer<backend::CudaTag, std::complex<double>>;
106-
107+
using ComplexBufferGPU =
108+
core::DataBuffer<backend::CudaTag, std::complex<double>>;
109+
107110
RealBufferGPU real_data(fft->size_inbox());
108111
ComplexBufferGPU complex_data(fft->size_outbox());
109-
112+
110113
// Initialize on host, copy to device
111114
std::vector<double> host_data(fft->size_inbox());
112115
for (size_t i = 0; i < host_data.size(); ++i) {
113116
host_data[i] = std::sin(2.0 * M_PI * i / host_data.size());
114117
}
115118
real_data.copy_from_host(host_data);
116-
119+
117120
// Get the FFT_Impl with CUDA backend
118-
auto* fft_cuda = dynamic_cast<fft::FFT_Impl<heffte::backend::cufft>*>(fft.get());
121+
auto *fft_cuda =
122+
dynamic_cast<fft::FFT_Impl<heffte::backend::cufft> *>(fft.get());
119123
if (!fft_cuda) {
120124
throw std::runtime_error("Failed to cast to CUDA FFT implementation");
121125
}
122-
126+
123127
// Warmup
124128
std::cout << "Warmup...";
125129
fft_cuda->forward(real_data, complex_data);
126130
fft_cuda->backward(complex_data, real_data);
127-
cudaDeviceSynchronize(); // Ensure GPU work is complete
131+
cudaDeviceSynchronize(); // Ensure GPU work is complete
128132
std::cout << " done.\n";
129-
133+
130134
// Benchmark
131135
std::cout << "Running benchmark...\n";
132136
auto start = std::chrono::high_resolution_clock::now();
133-
137+
134138
for (int iter = 0; iter < NUM_ITERATIONS; ++iter) {
135139
fft_cuda->forward(real_data, complex_data);
136140
fft_cuda->backward(complex_data, real_data);
137141
}
138-
139-
cudaDeviceSynchronize(); // Ensure all GPU work is complete
142+
143+
cudaDeviceSynchronize(); // Ensure all GPU work is complete
140144
auto end = std::chrono::high_resolution_clock::now();
141-
auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
142-
145+
auto duration =
146+
std::chrono::duration_cast<std::chrono::microseconds>(end - start);
147+
143148
double avg_time_ms = duration.count() / (1000.0 * NUM_ITERATIONS);
144-
149+
145150
std::cout << "Total time: " << duration.count() / 1000.0 << " ms\n";
146151
std::cout << "Average time per forward+backward: " << std::fixed
147152
<< std::setprecision(3) << avg_time_ms << " ms\n";
148-
153+
149154
return avg_time_ms;
150155
#else
151156
throw std::runtime_error("CUDA support not compiled in");
@@ -156,45 +161,45 @@ double benchmark_fft(fft::Backend backend, const World &world,
156161
int main(int argc, char *argv[]) {
157162
// Initialize MPI
158163
MPI_Init(&argc, &argv);
159-
164+
160165
int rank, size;
161166
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
162167
MPI_Comm_size(MPI_COMM_WORLD, &size);
163-
168+
164169
if (rank == 0) {
165170
std::cout << "\n╔════════════════════════════════════════╗\n";
166171
std::cout << "║ FFT Backend Performance Benchmark ║\n";
167172
std::cout << "╚════════════════════════════════════════╝\n";
168173
std::cout << "\nMPI ranks: " << size << "\n";
169174
}
170-
175+
171176
try {
172177
// Create computational domain (128³ grid)
173-
auto world = world::create({GRID_SIZE, GRID_SIZE, GRID_SIZE},
174-
{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0});
175-
178+
auto world = world::create({GRID_SIZE, GRID_SIZE, GRID_SIZE}, {1.0, 1.0, 1.0},
179+
{1.0, 1.0, 1.0});
180+
176181
// Create domain decomposition
177182
auto decomp = decomposition::create(world, size);
178-
183+
179184
if (rank == 0) {
180185
std::cout << "Domain: " << GRID_SIZE << " × " << GRID_SIZE << " × "
181-
<< GRID_SIZE << " = "
182-
<< (GRID_SIZE * GRID_SIZE * GRID_SIZE) << " grid points\n";
186+
<< GRID_SIZE << " = " << (GRID_SIZE * GRID_SIZE * GRID_SIZE)
187+
<< " grid points\n";
183188
}
184-
189+
185190
// Benchmark CPU (FFTW)
186191
double cpu_time_ms = 0.0;
187192
if (rank == 0) {
188193
cpu_time_ms = benchmark_fft(fft::Backend::FFTW, world, decomp, rank);
189194
}
190-
195+
191196
#if defined(OpenPFC_ENABLE_CUDA)
192197
// Benchmark GPU (CUDA)
193198
double gpu_time_ms = 0.0;
194199
if (rank == 0) {
195200
gpu_time_ms = benchmark_fft(fft::Backend::CUDA, world, decomp, rank);
196201
}
197-
202+
198203
// Report results
199204
if (rank == 0) {
200205
std::cout << "\n========================================\n";
@@ -203,26 +208,29 @@ int main(int argc, char *argv[]) {
203208
std::cout << std::fixed << std::setprecision(3);
204209
std::cout << "CPU (FFTW) time: " << cpu_time_ms << " ms\n";
205210
std::cout << "GPU (CUDA) time: " << gpu_time_ms << " ms\n";
206-
211+
207212
double speedup = cpu_time_ms / gpu_time_ms;
208213
std::cout << "\nSpeedup: " << std::setprecision(2) << speedup << "x\n";
209-
214+
210215
if (speedup > 1.0) {
211216
std::cout << "✓ GPU is " << speedup << "x faster than CPU\n";
212217
} else {
213218
std::cout << "✗ CPU is " << (1.0 / speedup) << "x faster than GPU\n";
214-
std::cout << " (Note: GPU may be slower for small problems due to overhead)\n";
219+
std::cout
220+
<< " (Note: GPU may be slower for small problems due to overhead)\n";
215221
}
216-
222+
217223
// Performance metrics
218224
size_t total_points = GRID_SIZE * GRID_SIZE * GRID_SIZE;
219-
double cpu_throughput = total_points / (cpu_time_ms * 1e-3) / 1e6; // Mpoints/s
220-
double gpu_throughput = total_points / (gpu_time_ms * 1e-3) / 1e6; // Mpoints/s
221-
225+
double cpu_throughput = total_points / (cpu_time_ms * 1e-3) / 1e6; // Mpoints/s
226+
double gpu_throughput = total_points / (gpu_time_ms * 1e-3) / 1e6; // Mpoints/s
227+
222228
std::cout << "\nThroughput:\n";
223-
std::cout << " CPU: " << std::setprecision(1) << cpu_throughput << " Mpoints/s\n";
224-
std::cout << " GPU: " << std::setprecision(1) << gpu_throughput << " Mpoints/s\n";
225-
229+
std::cout << " CPU: " << std::setprecision(1) << cpu_throughput
230+
<< " Mpoints/s\n";
231+
std::cout << " GPU: " << std::setprecision(1) << gpu_throughput
232+
<< " Mpoints/s\n";
233+
226234
std::cout << "\n========================================\n";
227235
std::cout << "Recommendation:\n";
228236
std::cout << "========================================\n";
@@ -247,15 +255,15 @@ int main(int argc, char *argv[]) {
247255
std::cout << "\nCPU (FFTW) time: " << cpu_time_ms << " ms\n";
248256
}
249257
#endif
250-
258+
251259
} catch (const std::exception &e) {
252260
if (rank == 0) {
253261
std::cerr << "\nError: " << e.what() << std::endl;
254262
}
255263
MPI_Finalize();
256264
return 1;
257265
}
258-
266+
259267
MPI_Finalize();
260268
return 0;
261269
}

0 commit comments

Comments
 (0)