Skip to content

Commit 8c56742

Browse files
jngradRudolfWeeber
andcommitted
p3m: Use OpenMP kernels
Co-authored-by: Rudolf Weeber <[email protected]>
1 parent d74f4fa commit 8c56742

File tree

19 files changed

+276
-86
lines changed

19 files changed

+276
-86
lines changed

cmake/FindFFTW3.cmake

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -33,25 +33,28 @@ endif(FFTW3_INCLUDE_DIR)
3333
find_path(FFTW3_INCLUDE_DIR fftw3.h)
3434
find_library(FFTW3_LIBRARIES fftw3)
3535
find_library(FFTW3F_LIBRARIES fftw3f)
36-
find_path(FFTW3_MPI_INCLUDE_DIR fftw3-mpi.h)
37-
find_library(FFTW3_MPI_LIBRARIES fftw3_mpi)
38-
find_library(FFTW3F_MPI_LIBRARIES fftw3f_mpi)
36+
find_library(FFTW3_OMP_LIBRARIES fftw3_omp)
37+
find_library(FFTW3F_OMP_LIBRARIES fftw3f_omp)
3938

4039
# handle the QUIETLY and REQUIRED arguments and set FFTW3_FOUND to TRUE if all
4140
# listed variables are TRUE
4241
include(FindPackageHandleStandardArgs)
4342
find_package_handle_standard_args(FFTW3 DEFAULT_MSG FFTW3_LIBRARIES FFTW3F_LIBRARIES
4443
FFTW3_INCLUDE_DIR)
4544
set(FPHSA_NAME_MISMATCHED 1)
46-
find_package_handle_standard_args(FFTW3_MPI DEFAULT_MSG FFTW3_MPI_LIBRARIES FFTW3F_MPI_LIBRARIES
47-
FFTW3_MPI_INCLUDE_DIR)
45+
find_package_handle_standard_args(FFTW3_OMP DEFAULT_MSG FFTW3_OMP_LIBRARIES FFTW3F_OMP_LIBRARIES)
4846
unset(FPHSA_NAME_MISMATCHED)
4947

50-
mark_as_advanced(FFTW3_LIBRARIES FFTW3F_LIBRARIES FFTW3_INCLUDE_DIR FFTW3_MPI_LIBRARIES FFTW3F_MPI_LIBRARIES FFTW3_MPI_INCLUDE_DIR)
48+
mark_as_advanced(FFTW3_LIBRARIES FFTW3F_LIBRARIES FFTW3_INCLUDE_DIR FFTW3_OMP_LIBRARIES FFTW3F_OMP_LIBRARIES)
5149

5250

5351
if(FFTW3_FOUND AND NOT TARGET FFTW3::FFTW3)
5452
add_library(FFTW3::FFTW3 INTERFACE IMPORTED)
5553
target_include_directories(FFTW3::FFTW3 INTERFACE "${FFTW3_INCLUDE_DIR}")
5654
target_link_libraries(FFTW3::FFTW3 INTERFACE "${FFTW3_LIBRARIES}" "${FFTW3F_LIBRARIES}")
5755
endif()
56+
if(FFTW3_FOUND AND NOT TARGET FFTW3::FFTW3_OMP)
57+
add_library(FFTW3::FFTW3_OMP INTERFACE IMPORTED)
58+
target_include_directories(FFTW3::FFTW3_OMP INTERFACE "${FFTW3_INCLUDE_DIR}")
59+
target_link_libraries(FFTW3::FFTW3_OMP INTERFACE "${FFTW3_OMP_LIBRARIES}" "${FFTW3F_OMP_LIBRARIES}")
60+
endif()

maintainer/benchmarks/p3m.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
import benchmarks
2323
import numpy as np
2424
import argparse
25+
import os
2526

2627
parser = argparse.ArgumentParser(description="Benchmark P3M simulations. "
2728
"Save the results to a CSV file.")
@@ -83,7 +84,8 @@
8384
# System parameters
8485
#############################################################
8586
n_proc = system.cell_system.get_state()["n_nodes"]
86-
n_part = n_proc * args.particles_per_core
87+
n_threads = int(os.environ.get("OMP_NUM_THREADS", 1))
88+
n_part = n_proc * n_threads * args.particles_per_core
8789
# volume of N spheres with radius r: N * (4/3*pi*r^3)
8890
lj_sig = (lj_sigmas["cation"] + lj_sigmas["anion"]) / 2
8991
box_l = (n_part * 4. / 3. * np.pi * (lj_sig / 2.)**3
@@ -137,7 +139,7 @@
137139
# tuning and equilibration
138140
min_skin = 0.2
139141
max_skin = 1.6
140-
p3m_params = {"prefactor": args.prefactor, "accuracy": 1e-3}
142+
p3m_params = {"prefactor": args.prefactor, "accuracy": 1e-3, "timings": 100}
141143
p3m = p3m_class(**p3m_params)
142144
print("Quick equilibration")
143145
system.integrator.run(min(3 * measurement_steps, 1000))
@@ -156,6 +158,9 @@
156158
print("Re-tune p3m")
157159
p3m = p3m_class(**p3m_params)
158160
system.electrostatics.solver = p3m
161+
print("Tune skin: {:.3f}".format(system.cell_system.tune_skin(
162+
min_skin=min_skin, max_skin=max_skin, tol=0.05, int_steps=100,
163+
adjust_max_skin=True)))
159164

160165

161166
if args.visualizer:

src/core/communication.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525

2626
#include "cuda/init.hpp"
2727
#include "errorhandling.hpp"
28+
#include "fft/init.hpp"
2829

2930
#ifdef WALBERLA
3031
#include <walberla_bridge/walberla_init.hpp>
@@ -90,6 +91,10 @@ void init(std::shared_ptr<boost::mpi::environment> mpi_env) {
9091
cuda_on_program_start();
9192
#endif
9293

94+
#ifdef FFTW
95+
fft_on_program_start();
96+
#endif
97+
9398
#ifdef SHARED_MEMORY_PARALLELISM
9499
Kokkos::initialize();
95100
#endif

src/core/electrostatics/p3m.cpp

Lines changed: 39 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -117,12 +117,14 @@ FloatType complex_norm2(std::complex<FloatType> const &z) {
117117
template <Utils::MemoryOrder order_in, Utils::MemoryOrder order_out, typename T>
118118
auto transpose(std::span<T> const &flat_array, Utils::Vector3i const &shape) {
119119
auto constexpr mesh_start = Utils::Vector3i::broadcast(0);
120+
std::size_t index_out{};
120121
Utils::Vector3i indices{};
121-
std::vector<T> flat_array_t(flat_array.size());
122-
for_each_3d(mesh_start, shape, indices, [&]() {
123-
auto const index_in = Utils::get_linear_index(indices, shape, order_in);
124-
auto const index_out = Utils::get_linear_index(indices, shape, order_out);
125-
flat_array_t[index_out] = flat_array[index_in];
122+
std::vector<T> flat_array_t;
123+
flat_array_t.reserve(flat_array.size());
124+
for_each_3d_lin<order_out>(mesh_start, shape, indices, index_out, [&]() {
125+
auto const index_in = Utils::get_linear_index<order_in>(indices, shape);
126+
assert(index_out == Utils::get_linear_index<order_out>(indices, shape));
127+
flat_array_t.push_back(flat_array[index_in]);
126128
});
127129
return flat_array_t;
128130
}
@@ -451,10 +453,11 @@ void CoulombP3MImpl<FloatType, Architecture>::kernel_ks_charge_density() {
451453
p3m.local_mesh.dim);
452454

453455
// get real space charge density without ghost layers
454-
auto charge_density_no_halos = extract_block(
455-
p3m.rs_charge_density, p3m.local_mesh.dim, p3m.local_mesh.n_halo_ld,
456-
p3m.local_mesh.dim - p3m.local_mesh.n_halo_ur,
457-
Utils::MemoryOrder::ROW_MAJOR, Utils::MemoryOrder::COLUMN_MAJOR);
456+
auto charge_density_no_halos =
457+
extract_block<Utils::MemoryOrder::ROW_MAJOR,
458+
Utils::MemoryOrder::COLUMN_MAJOR>(
459+
p3m.rs_charge_density, p3m.local_mesh.dim, p3m.local_mesh.n_halo_ld,
460+
p3m.local_mesh.dim - p3m.local_mesh.n_halo_ur);
458461

459462
// Set up the FFT using the Heffte library.
460463
// This is in global mesh coordinates without any ghost layers
@@ -466,14 +469,14 @@ void CoulombP3MImpl<FloatType, Architecture>::kernel_ks_charge_density() {
466469

467470
template <typename FloatType, Arch Architecture>
468471
void CoulombP3MImpl<FloatType, Architecture>::kernel_rs_electric_field() {
469-
auto constexpr mesh_start = Utils::Vector3i::broadcast(0);
470-
auto const &mesh_stop = p3m.fft->ks_local_size();
472+
auto const mesh_start = p3m.fft->ks_local_ld_index();
473+
auto const mesh_stop = mesh_start + p3m.fft->ks_local_size();
471474
auto const &box_geo = *get_system().box_geo;
472-
auto indices = Utils::Vector3i{};
475+
Utils::Vector3i indices{};
473476

474477
// hold electric field in k-space
475478
std::array<std::span<std::complex<FloatType>>, 3> ks_E_fields;
476-
auto const fft_mesh_length = get_size_from_shape(mesh_stop);
479+
auto const fft_mesh_length = get_size_from_shape(mesh_stop - mesh_start);
477480
for (auto d : {0u, 1u, 2u}) {
478481
auto const offset = d * fft_mesh_length;
479482
auto const begin = p3m.ks_E_fields_storage.begin() + offset;
@@ -485,30 +488,33 @@ void CoulombP3MImpl<FloatType, Architecture>::kernel_rs_electric_field() {
485488
Utils::Vector3<FloatType>((2. * std::numbers::pi) * box_geo.length_inv());
486489

487490
// compute electric field, Eq. (3.49) @cite deserno00b
488-
for_each_3d(mesh_start, mesh_stop, indices, [&]() {
489-
auto const global_index = indices + p3m.fft->ks_local_ld_index();
490-
auto const local_index = Utils::get_linear_index(
491-
indices, mesh_stop, Utils::MemoryOrder::COLUMN_MAJOR);
492-
auto const phi_hat = multiply_complex_by_real(
493-
p3m.ks_charge_density[local_index], p3m.g_force[local_index]);
494-
495-
for (auto d : {0u, 1u, 2u}) {
496-
// wave vector of the current mesh point
497-
auto const k = FloatType(p3m.d_op[d][global_index[d]]) * wavevector[d];
498-
// electric field in k-space
499-
ks_E_fields[d][local_index] = multiply_complex_by_imaginary(phi_hat, k);
500-
}
501-
});
491+
std::size_t local_index = 0;
492+
for_each_3d_lin<Utils::MemoryOrder::COLUMN_MAJOR>(
493+
mesh_start, mesh_stop, indices, local_index, [&]() {
494+
assert(local_index ==
495+
Utils::get_linear_index<Utils::MemoryOrder::COLUMN_MAJOR>(
496+
indices - mesh_start, mesh_stop));
497+
auto const phi_hat = multiply_complex_by_real(
498+
p3m.ks_charge_density[local_index], p3m.g_force[local_index]);
499+
500+
for (auto d : {0u, 1u, 2u}) {
501+
// wave vector of the current mesh point
502+
auto const k = FloatType(p3m.d_op[d][indices[d]]) * wavevector[d];
503+
// electric field in k-space
504+
ks_E_fields[d][local_index] =
505+
multiply_complex_by_imaginary(phi_hat, k);
506+
}
507+
});
502508

503509
// back-transform the k-space electric field to real space
504-
auto const rs_mesh_size_no_halo =
505-
get_size_from_shape(p3m.local_mesh.dim_no_halo);
506-
p3m.fft->backward_batch(3, p3m.ks_E_fields_storage.data(),
507-
p3m.rs_E_fields_no_halo.data());
508-
509-
// add zeros around the E-field in real space to make room for ghost layers
510510
auto const size = p3m.local_mesh.ur_no_halo - p3m.local_mesh.ld_no_halo;
511+
auto const rs_mesh_size_no_halo = Utils::product(size);
511512
for (auto d : {0u, 1u, 2u}) {
513+
auto k_space = ks_E_fields[d].data();
514+
auto real_space = p3m.rs_E_fields_no_halo.data() + d * rs_mesh_size_no_halo;
515+
p3m.fft->backward(k_space, real_space);
516+
517+
// add zeros around the E-field in real space to make room for ghost layers
512518
auto const offset = d * rs_mesh_size_no_halo;
513519
auto const begin = p3m.rs_E_fields_no_halo.begin() + offset;
514520
auto f = std::span<std::complex<FloatType>>(begin, rs_mesh_size_no_halo);

src/core/fft/CMakeLists.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,6 @@
1818
#
1919

2020
if(ESPRESSO_BUILD_WITH_FFTW)
21-
target_link_libraries(espresso_p3m PRIVATE FFTW3::FFTW3)
22-
target_sources(espresso_p3m PRIVATE fft.cpp)
21+
target_link_libraries(espresso_p3m PRIVATE FFTW3::FFTW3 FFTW3::FFTW3_OMP)
22+
target_sources(espresso_p3m PRIVATE fft.cpp init.cpp)
2323
endif()

src/core/fft/fft.cpp

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,10 @@
4040

4141
#include <fftw3.h>
4242

43+
#ifdef _OPENMP
44+
#include <omp.h>
45+
#endif
46+
4347
#include <algorithm>
4448
#include <cmath>
4549
#include <cstddef>
@@ -677,27 +681,39 @@ int fft_data_struct<FloatType>::initialize_fft(
677681
/* === FFT Routines (Using FFTW / RFFTW package)=== */
678682
for (int i = 1; i < 4; i++) {
679683
if (init_tag) {
684+
#ifdef _OPENMP
685+
#pragma omp critical(fftw_destroy_plan_forward)
680686
forw[i].destroy_plan();
687+
#endif
681688
}
682689
forw[i].dir = FFTW_FORWARD;
690+
#ifdef _OPENMP
691+
#pragma omp critical(fftw_create_plan_forward)
683692
forw[i].plan_handle = fftw<FloatType>::plan_many_dft(
684693
1, &forw[i].new_mesh[2], forw[i].n_ffts, c_data, nullptr, 1,
685694
forw[i].new_mesh[2], c_data, nullptr, 1, forw[i].new_mesh[2],
686695
forw[i].dir, FFTW_PATIENT);
696+
#endif
687697
assert(forw[i].plan_handle);
688698
}
689699

690700
/* === The BACK Direction === */
691701
/* this is needed because slightly different functions are used */
692702
for (int i = 1; i < 4; i++) {
693703
if (init_tag) {
704+
#ifdef _OPENMP
705+
#pragma omp critical(fftw_destroy_plan_backward)
694706
back[i].destroy_plan();
707+
#endif
695708
}
696709
back[i].dir = FFTW_BACKWARD;
710+
#ifdef _OPENMP
711+
#pragma omp critical(fftw_create_plan_backward)
697712
back[i].plan_handle = fftw<FloatType>::plan_many_dft(
698713
1, &forw[i].new_mesh[2], forw[i].n_ffts, c_data, nullptr, 1,
699714
forw[i].new_mesh[2], c_data, nullptr, 1, forw[i].new_mesh[2],
700715
back[i].dir, FFTW_PATIENT);
716+
#endif
701717
back[i].pack_function = pack_block_permute1;
702718
assert(back[i].plan_handle);
703719
}

src/core/fft/init.cpp

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
/*
2+
* Copyright (C) 2025 The ESPResSo project
3+
*
4+
* This file is part of ESPResSo.
5+
*
6+
* ESPResSo is free software: you can redistribute it and/or modify
7+
* it under the terms of the GNU General Public License as published by
8+
* the Free Software Foundation, either version 3 of the License, or
9+
* (at your option) any later version.
10+
*
11+
* ESPResSo is distributed in the hope that it will be useful,
12+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
13+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14+
* GNU General Public License for more details.
15+
*
16+
* You should have received a copy of the GNU General Public License
17+
* along with this program. If not, see <http://www.gnu.org/licenses/>.
18+
*/
19+
20+
#include "init.hpp"
21+
22+
#include "config/config.hpp"
23+
24+
#include <cassert>
25+
26+
#include <fftw3.h>
27+
28+
#ifdef _OPENMP
29+
#include <omp.h>
30+
#endif
31+
32+
#ifdef FFTW
33+
void fft_on_program_start() {
34+
#ifdef _OPENMP
35+
int omp_num_threads = 1;
36+
#pragma omp parallel
37+
{
38+
#pragma omp single
39+
omp_num_threads = omp_get_num_threads();
40+
}
41+
[[maybe_unused]] auto const init_status_success = fftw_init_threads();
42+
assert(init_status_success);
43+
fftw_plan_with_nthreads(omp_num_threads);
44+
#endif // _OPENMP
45+
}
46+
#endif // FFTW

src/core/fft/init.hpp

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
/*
2+
* Copyright (C) 2025 The ESPResSo project
3+
*
4+
* This file is part of ESPResSo.
5+
*
6+
* ESPResSo is free software: you can redistribute it and/or modify
7+
* it under the terms of the GNU General Public License as published by
8+
* the Free Software Foundation, either version 3 of the License, or
9+
* (at your option) any later version.
10+
*
11+
* ESPResSo is distributed in the hope that it will be useful,
12+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
13+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14+
* GNU General Public License for more details.
15+
*
16+
* You should have received a copy of the GNU General Public License
17+
* along with this program. If not, see <http://www.gnu.org/licenses/>.
18+
*/
19+
20+
#pragma once
21+
22+
#include "config/config.hpp"
23+
24+
#ifdef FFTW
25+
void fft_on_program_start();
26+
#endif

src/core/magnetostatics/CMakeLists.txt

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,5 +22,11 @@ target_sources(
2222
${CMAKE_CURRENT_SOURCE_DIR}/dipolar_direct_sum.cpp
2323
${CMAKE_CURRENT_SOURCE_DIR}/dipolar_direct_sum_gpu.cpp
2424
${CMAKE_CURRENT_SOURCE_DIR}/dlc.cpp
25-
${CMAKE_CURRENT_SOURCE_DIR}/dp3m.cpp
2625
${CMAKE_CURRENT_SOURCE_DIR}/scafacos_impl.cpp)
26+
27+
if(ESPRESSO_BUILD_WITH_FFTW)
28+
target_sources(espresso_p3m PRIVATE dp3m.cpp)
29+
set_source_files_properties(
30+
dp3m.cpp TARGET_DIRECTORY espresso_p3m
31+
PROPERTIES INCLUDE_DIRECTORIES ${CMAKE_SOURCE_DIR}/src/core)
32+
endif()

src/core/npt.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,9 @@
2020

2121
#ifdef NPT
2222

23+
#include "BoxGeometry.hpp"
2324
#include "PropagationMode.hpp"
25+
#include "cell_system/CellStructure.hpp"
2426
#include "communication.hpp"
2527
#include "config/config.hpp"
2628
#include "electrostatics/coulomb.hpp"

src/core/p3m/CMakeLists.txt

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,8 +37,15 @@ if(ESPRESSO_BUILD_WITH_FFTW)
3737
target_link_libraries(
3838
espresso_p3m
3939
PRIVATE
40-
espresso::cpp_flags espresso::p3m::cpp_flags espresso::instrumentation
41-
espresso::utils espresso::config Boost::mpi MPI::MPI_CXX Heffte::Heffte
40+
espresso::cpp_flags
41+
espresso::p3m::cpp_flags
42+
espresso::instrumentation
43+
espresso::utils
44+
espresso::config
45+
Boost::mpi
46+
MPI::MPI_CXX
47+
Heffte::Heffte
48+
$<$<BOOL:${ESPRESSO_BUILD_WITH_SHARED_MEMORY_PARALLELISM}>:OpenMP::OpenMP_CXX>
4249
$<$<BOOL:${ESPRESSO_BUILD_WITH_SHARED_MEMORY_PARALLELISM}>:Kokkos::kokkos>
4350
$<$<BOOL:${ESPRESSO_BUILD_WITH_SHARED_MEMORY_PARALLELISM}>:Cabana::Core>)
4451
set_target_properties(espresso_p3m PROPERTIES POSITION_INDEPENDENT_CODE ON)

0 commit comments

Comments
 (0)