Skip to content

Commit f315afb

Browse files
jngradRudolfWeeber
andcommitted
p3m: Use OpenMP kernels
Co-authored-by: Rudolf Weeber <[email protected]>
1 parent d287680 commit f315afb

File tree

14 files changed

+243
-74
lines changed

14 files changed

+243
-74
lines changed

maintainer/benchmarks/benchmarks.py

+6-4
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
# You should have received a copy of the GNU General Public License
1717
# along with this program. If not, see <http://www.gnu.org/licenses/>.
1818
#
19+
import os
1920
import sys
2021
import time
2122
import pathlib
@@ -108,7 +109,7 @@ def get_average_time(timings):
108109
return (avg, ci)
109110

110111

111-
def write_report(filepath, n_proc, timings, n_steps, label=''):
112+
def write_report(filepath, n_ranks, timings, n_steps, label=''):
112113
'''
113114
Append timing data to a CSV file. If it doesn't exist, it is created
114115
with a header.
@@ -117,7 +118,7 @@ def write_report(filepath, n_proc, timings, n_steps, label=''):
117118
----------
118119
filepath: :obj:`str`
119120
Path to the CSV file.
120-
n_proc: :obj:`int`
121+
n_ranks: :obj:`int`
121122
Number of MPI ranks.
122123
timings: :obj:`ndarray` of :obj:`float`
123124
Timings.
@@ -127,11 +128,12 @@ def write_report(filepath, n_proc, timings, n_steps, label=''):
127128
Label to distinguish e.g. MD from MC or LB steps.
128129
129130
'''
131+
n_threads = int(os.environ.get("OMP_NUM_THREADS", 1))
130132
script = pathlib.Path(sys.argv[0]).name
131133
cmd = " ".join(x for x in sys.argv[1:] if not x.startswith("--output"))
132134
avg, ci = get_average_time(timings)
133-
header = '"script","arguments","cores","mean","ci","nsteps","duration","label"\n'
134-
report = f'"{script}","{cmd}",{n_proc},{avg:.3e},{ci:.3e},{n_steps},{np.sum(timings):.1f},"{label}"\n' # nopep8
135+
header = '"script","arguments","ranks","threads","mean","ci","nsteps","duration","label"\n'
136+
report = f'"{script}","{cmd}",{n_ranks},{n_threads},{avg:.3e},{ci:.3e},{n_steps},{np.sum(timings):.1f},"{label}"\n' # nopep8
135137
if pathlib.Path(filepath).is_file():
136138
header = ''
137139
with open(filepath, "a") as f:

maintainer/benchmarks/p3m.py

+3
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,9 @@
156156
print("Re-tune p3m")
157157
p3m = p3m_class(**p3m_params)
158158
system.electrostatics.solver = p3m
159+
print("Tune skin: {:.3f}".format(system.cell_system.tune_skin(
160+
min_skin=min_skin, max_skin=max_skin, tol=0.05, int_steps=100,
161+
adjust_max_skin=True)))
159162

160163

161164
if args.visualizer:

src/core/communication.cpp

+5
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525

2626
#include "cuda/init.hpp"
2727
#include "errorhandling.hpp"
28+
#include "fft/init.hpp"
2829

2930
#ifdef WALBERLA
3031
#include <walberla_bridge/walberla_init.hpp>
@@ -90,6 +91,10 @@ void init(std::shared_ptr<boost::mpi::environment> mpi_env) {
9091
cuda_on_program_start();
9192
#endif
9293

94+
#ifdef FFTW
95+
fft_on_program_start();
96+
#endif
97+
9398
#ifdef SHARED_MEMORY_PARALLELISM
9499
Kokkos::initialize();
95100
#endif

src/core/electrostatics/p3m.cpp

+40-31
Original file line numberDiff line numberDiff line change
@@ -117,11 +117,14 @@ FloatType complex_norm2(std::complex<FloatType> const &z) {
117117
template <Utils::MemoryOrder order_in, Utils::MemoryOrder order_out, typename T>
118118
auto transpose(std::span<T> const &flat_array, Utils::Vector3i const &shape) {
119119
auto constexpr mesh_start = Utils::Vector3i::broadcast(0);
120+
std::size_t index_out{};
120121
Utils::Vector3i indices{};
121122
std::vector<T> flat_array_t(flat_array.size());
122-
for_each_3d(mesh_start, shape, indices, [&]() {
123-
auto const index_in = Utils::get_linear_index(indices, shape, order_in);
124-
auto const index_out = Utils::get_linear_index(indices, shape, order_out);
123+
for_each_3d_lin<order_out>(mesh_start, shape, indices, index_out, [&]() {
124+
auto const index_in = Utils::get_linear_index<order_in>(indices, shape);
125+
#ifdef ADDITIONAL_CHECKS
126+
assert(index_out == Utils::get_linear_index<order_out>(indices, shape));
127+
#endif
125128
flat_array_t[index_out] = flat_array[index_in];
126129
});
127130
return flat_array_t;
@@ -451,10 +454,11 @@ void CoulombP3MImpl<FloatType, Architecture>::kernel_ks_charge_density() {
451454
p3m.local_mesh.dim);
452455

453456
// get real space charge density without ghost layers
454-
auto charge_density_no_halos = extract_block(
455-
p3m.rs_charge_density, p3m.local_mesh.dim, p3m.local_mesh.n_halo_ld,
456-
p3m.local_mesh.dim - p3m.local_mesh.n_halo_ur,
457-
Utils::MemoryOrder::ROW_MAJOR, Utils::MemoryOrder::COLUMN_MAJOR);
457+
auto charge_density_no_halos =
458+
extract_block<Utils::MemoryOrder::ROW_MAJOR,
459+
Utils::MemoryOrder::COLUMN_MAJOR>(
460+
p3m.rs_charge_density, p3m.local_mesh.dim, p3m.local_mesh.n_halo_ld,
461+
p3m.local_mesh.dim - p3m.local_mesh.n_halo_ur);
458462

459463
// Set up the FFT using the Heffte library.
460464
// This is in global mesh coordinates without any ghost layers
@@ -466,14 +470,14 @@ void CoulombP3MImpl<FloatType, Architecture>::kernel_ks_charge_density() {
466470

467471
template <typename FloatType, Arch Architecture>
468472
void CoulombP3MImpl<FloatType, Architecture>::kernel_rs_electric_field() {
469-
auto constexpr mesh_start = Utils::Vector3i::broadcast(0);
470-
auto const &mesh_stop = p3m.fft->ks_local_size();
473+
auto const mesh_start = p3m.fft->ks_local_ld_index();
474+
auto const mesh_stop = mesh_start + p3m.fft->ks_local_size();
471475
auto const &box_geo = *get_system().box_geo;
472-
auto indices = Utils::Vector3i{};
476+
Utils::Vector3i indices{};
473477

474478
// hold electric field in k-space
475479
std::array<std::span<std::complex<FloatType>>, 3> ks_E_fields;
476-
auto const fft_mesh_length = get_size_from_shape(mesh_stop);
480+
auto const fft_mesh_length = get_size_from_shape(mesh_stop - mesh_start);
477481
for (auto d : {0u, 1u, 2u}) {
478482
auto const offset = d * fft_mesh_length;
479483
auto const begin = p3m.ks_E_fields_storage.begin() + offset;
@@ -485,30 +489,35 @@ void CoulombP3MImpl<FloatType, Architecture>::kernel_rs_electric_field() {
485489
Utils::Vector3<FloatType>((2. * std::numbers::pi) * box_geo.length_inv());
486490

487491
// compute electric field, Eq. (3.49) @cite deserno00b
488-
for_each_3d(mesh_start, mesh_stop, indices, [&]() {
489-
auto const global_index = indices + p3m.fft->ks_local_ld_index();
490-
auto const local_index = Utils::get_linear_index(
491-
indices, mesh_stop, Utils::MemoryOrder::COLUMN_MAJOR);
492-
auto const phi_hat = multiply_complex_by_real(
493-
p3m.ks_charge_density[local_index], p3m.g_force[local_index]);
494-
495-
for (auto d : {0u, 1u, 2u}) {
496-
// wave vector of the current mesh point
497-
auto const k = FloatType(p3m.d_op[d][global_index[d]]) * wavevector[d];
498-
// electric field in k-space
499-
ks_E_fields[d][local_index] = multiply_complex_by_imaginary(phi_hat, k);
500-
}
501-
});
492+
std::size_t local_index{};
493+
for_each_3d_lin<Utils::MemoryOrder::COLUMN_MAJOR>(
494+
mesh_start, mesh_stop, indices, local_index, [&]() {
495+
#ifdef ADDITIONAL_CHECKS
496+
assert(local_index ==
497+
Utils::get_linear_index<Utils::MemoryOrder::COLUMN_MAJOR>(
498+
indices - mesh_start, mesh_stop - mesh_start));
499+
#endif
500+
auto const phi_hat = multiply_complex_by_real(
501+
p3m.ks_charge_density[local_index], p3m.g_force[local_index]);
502+
503+
for (auto d : {0u, 1u, 2u}) {
504+
// wave vector of the current mesh point
505+
auto const k = FloatType(p3m.d_op[d][indices[d]]) * wavevector[d];
506+
// electric field in k-space
507+
ks_E_fields[d][local_index] =
508+
multiply_complex_by_imaginary(phi_hat, k);
509+
}
510+
});
502511

503512
// back-transform the k-space electric field to real space
504-
auto const rs_mesh_size_no_halo =
505-
get_size_from_shape(p3m.local_mesh.dim_no_halo);
506-
p3m.fft->backward_batch(3, p3m.ks_E_fields_storage.data(),
507-
p3m.rs_E_fields_no_halo.data());
508-
509-
// add zeros around the E-field in real space to make room for ghost layers
510513
auto const size = p3m.local_mesh.ur_no_halo - p3m.local_mesh.ld_no_halo;
514+
auto const rs_mesh_size_no_halo = Utils::product(size);
511515
for (auto d : {0u, 1u, 2u}) {
516+
auto k_space = ks_E_fields[d].data();
517+
auto real_space = p3m.rs_E_fields_no_halo.data() + d * rs_mesh_size_no_halo;
518+
p3m.fft->backward(k_space, real_space);
519+
520+
// add zeros around the E-field in real space to make room for ghost layers
512521
auto const offset = d * rs_mesh_size_no_halo;
513522
auto const begin = p3m.rs_E_fields_no_halo.begin() + offset;
514523
auto f = std::span<std::complex<FloatType>>(begin, rs_mesh_size_no_halo);

src/core/fft/CMakeLists.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -21,5 +21,5 @@ if(ESPRESSO_BUILD_WITH_FFTW)
2121
target_link_libraries(
2222
espresso_p3m PRIVATE $<$<BOOL:${fftw3_FOUND}>:espresso::fftw3>
2323
$<$<BOOL:${fftw3_omp_FOUND}>:espresso::fftw3_omp>)
24-
target_sources(espresso_p3m PRIVATE fft.cpp)
24+
target_sources(espresso_p3m PRIVATE fft.cpp init.cpp)
2525
endif()

src/core/fft/fft.cpp

+16
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,10 @@
4040

4141
#include <fftw3.h>
4242

43+
#ifdef _OPENMP
44+
#include <omp.h>
45+
#endif
46+
4347
#include <algorithm>
4448
#include <cmath>
4549
#include <cstddef>
@@ -677,27 +681,39 @@ int fft_data_struct<FloatType>::initialize_fft(
677681
/* === FFT Routines (Using FFTW / RFFTW package)=== */
678682
for (int i = 1; i < 4; i++) {
679683
if (init_tag) {
684+
#ifdef _OPENMP
685+
#pragma omp critical(fftw_destroy_plan_forward)
680686
forw[i].destroy_plan();
687+
#endif
681688
}
682689
forw[i].dir = FFTW_FORWARD;
690+
#ifdef _OPENMP
691+
#pragma omp critical(fftw_create_plan_forward)
683692
forw[i].plan_handle = fftw<FloatType>::plan_many_dft(
684693
1, &forw[i].new_mesh[2], forw[i].n_ffts, c_data, nullptr, 1,
685694
forw[i].new_mesh[2], c_data, nullptr, 1, forw[i].new_mesh[2],
686695
forw[i].dir, FFTW_PATIENT);
696+
#endif
687697
assert(forw[i].plan_handle);
688698
}
689699

690700
/* === The BACK Direction === */
691701
/* this is needed because slightly different functions are used */
692702
for (int i = 1; i < 4; i++) {
693703
if (init_tag) {
704+
#ifdef _OPENMP
705+
#pragma omp critical(fftw_destroy_plan_backward)
694706
back[i].destroy_plan();
707+
#endif
695708
}
696709
back[i].dir = FFTW_BACKWARD;
710+
#ifdef _OPENMP
711+
#pragma omp critical(fftw_create_plan_backward)
697712
back[i].plan_handle = fftw<FloatType>::plan_many_dft(
698713
1, &forw[i].new_mesh[2], forw[i].n_ffts, c_data, nullptr, 1,
699714
forw[i].new_mesh[2], c_data, nullptr, 1, forw[i].new_mesh[2],
700715
back[i].dir, FFTW_PATIENT);
716+
#endif
701717
back[i].pack_function = pack_block_permute1;
702718
assert(back[i].plan_handle);
703719
}

src/core/fft/init.cpp

+46
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
/*
2+
* Copyright (C) 2025 The ESPResSo project
3+
*
4+
* This file is part of ESPResSo.
5+
*
6+
* ESPResSo is free software: you can redistribute it and/or modify
7+
* it under the terms of the GNU General Public License as published by
8+
* the Free Software Foundation, either version 3 of the License, or
9+
* (at your option) any later version.
10+
*
11+
* ESPResSo is distributed in the hope that it will be useful,
12+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
13+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14+
* GNU General Public License for more details.
15+
*
16+
* You should have received a copy of the GNU General Public License
17+
* along with this program. If not, see <http://www.gnu.org/licenses/>.
18+
*/
19+
20+
#include "init.hpp"
21+
22+
#include "config/config.hpp"
23+
24+
#include <cassert>
25+
26+
#include <fftw3.h>
27+
28+
#ifdef _OPENMP
29+
#include <omp.h>
30+
#endif
31+
32+
#ifdef FFTW
33+
void fft_on_program_start() {
34+
#ifdef _OPENMP
35+
int omp_num_threads = 1;
36+
#pragma omp parallel
37+
{
38+
#pragma omp single
39+
omp_num_threads = omp_get_num_threads();
40+
}
41+
[[maybe_unused]] auto const init_status_success = fftw_init_threads();
42+
assert(init_status_success);
43+
fftw_plan_with_nthreads(omp_num_threads);
44+
#endif // _OPENMP
45+
}
46+
#endif // FFTW

src/core/fft/init.hpp

+26
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
/*
2+
* Copyright (C) 2025 The ESPResSo project
3+
*
4+
* This file is part of ESPResSo.
5+
*
6+
* ESPResSo is free software: you can redistribute it and/or modify
7+
* it under the terms of the GNU General Public License as published by
8+
* the Free Software Foundation, either version 3 of the License, or
9+
* (at your option) any later version.
10+
*
11+
* ESPResSo is distributed in the hope that it will be useful,
12+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
13+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14+
* GNU General Public License for more details.
15+
*
16+
* You should have received a copy of the GNU General Public License
17+
* along with this program. If not, see <http://www.gnu.org/licenses/>.
18+
*/
19+
20+
#pragma once
21+
22+
#include "config/config.hpp"
23+
24+
#ifdef FFTW
25+
void fft_on_program_start();
26+
#endif

src/core/p3m/P3MFFT.hpp

+17-8
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,11 @@
2424
#include <boost/mpi/communicator.hpp>
2525

2626
#include <heffte.h>
27+
#include <heffte_backends.h>
2728

2829
#include <algorithm>
2930
#include <array>
31+
#include <initializer_list>
3032
#include <memory>
3133

3234
template <typename T, std::size_t N>
@@ -45,6 +47,7 @@ template <typename FloatType> class P3MFFT {
4547
Utils::Vector3i m_global_mesh;
4648
std::shared_ptr<Box> in_box;
4749
std::shared_ptr<Box> out_box;
50+
std::vector<std::complex<FloatType>> m_buffer;
4851
heffte::fft3d<backend_tag> fft3d;
4952

5053
public:
@@ -69,8 +72,16 @@ template <typename FloatType> class P3MFFT {
6972
auto const global_box = heffte::box3d<>(
7073
{0, 0, 0}, to_array(m_global_mesh - Utils::Vector3i::broadcast(1)),
7174
to_array(m_memory_layout));
75+
auto const n_procs = Utils::product(node_grid);
76+
auto best_grid = node_grid;
77+
for (auto i : {0u, 1u, 2u}) {
78+
if (m_global_mesh[i] % 2 * n_procs == 0) {
79+
best_grid = {n_procs, 1, 1};
80+
break;
81+
}
82+
}
7283
auto all_boxes = heffte::split_world(
73-
global_box, {node_grid[2], node_grid[1], node_grid[0]});
84+
global_box, {best_grid[0], best_grid[1], best_grid[2]});
7485
out_box = std::make_shared<Box>(all_boxes[comm.rank()]);
7586
init_fft();
7687
}
@@ -95,8 +106,9 @@ template <typename FloatType> class P3MFFT {
95106
// 1-D pencils for sufficiently large problem, it is expected that the
96107
// pencil decomposition is better but for smaller problems, the slabs may
97108
// perform better (depending on hardware and backend)
98-
options.use_pencils = false;
109+
options.use_pencils = true;
99110
fft3d = heffte::fft3d<backend_tag>(*in_box, *out_box, comm, options);
111+
m_buffer.resize(fft3d.size_workspace());
100112
}
101113

102114
Utils::Vector3i ks_local_ld_index() const {
@@ -108,14 +120,11 @@ template <typename FloatType> class P3MFFT {
108120
Utils::Vector3i ks_local_size() const {
109121
return ks_local_ur_index() - ks_local_ld_index();
110122
}
111-
template <typename T> auto forward(T &in) { return fft3d.forward(in); }
112123
template <typename In, typename Out> void forward(In in, Out out) {
113-
fft3d.forward(in, out);
124+
fft3d.forward(in, out, m_buffer.data());
114125
}
115-
template <typename T> auto backward(T &in) { return fft3d.backward(in); }
116-
template <typename T1, typename T2>
117-
auto backward_batch(int n, T1 in, T2 out) {
118-
return fft3d.backward(n, in, out);
126+
template <typename T1, typename T2> auto backward(T1 &in, T2 &out) {
127+
return fft3d.backward(in, out, m_buffer.data());
119128
}
120129
auto const &get_memory_layout() const { return m_memory_layout; }
121130
};

0 commit comments

Comments
 (0)