diff --git a/benchmark/cajita/CMakeLists.txt b/benchmark/cajita/CMakeLists.txt index 236eb1bfb..404333fdd 100644 --- a/benchmark/cajita/CMakeLists.txt +++ b/benchmark/cajita/CMakeLists.txt @@ -12,8 +12,11 @@ add_executable(SparseMapPerformance Cajita_SparseMapPerformance.cpp) target_link_libraries(SparseMapPerformance Cajita) -add_executable(SparsePartitionerPerformance Cajita_SparsePartitionerPerformance.cpp) -target_link_libraries(SparsePartitionerPerformance Cajita) +add_executable(ParticleDynamicPartitionerPerformance Cajita_ParticleDynamicPartitionerPerformance.cpp) +target_link_libraries(ParticleDynamicPartitionerPerformance Cajita) + +add_executable(SparseMapDynamicPartitionerPerformance Cajita_SparseMapDynamicPartitionerPerformance.cpp) +target_link_libraries(SparseMapDynamicPartitionerPerformance Cajita) add_executable(HaloPerformance Cajita_HaloPerformance.cpp) target_link_libraries(HaloPerformance Cajita) @@ -29,7 +32,9 @@ endif() if(Cabana_ENABLE_TESTING) add_test(NAME Cajita_SparseMapPerformance COMMAND ${NONMPI_PRECOMMAND} SparseMapPerformance sparsemap_output.txt) - add_test(NAME Cajita_SparsePartitionerPerformance COMMAND ${NONMPI_PRECOMMAND} SparsePartitionerPerformance sparsepartitioner_output.txt) + add_test(NAME Cajita_ParticleDynamicPartitionerPerformance COMMAND ${NONMPI_PRECOMMAND} ParticleDynamicPartitionerPerformance particledynamicpartitioner_output.txt) + + add_test(NAME Cajita_SparseMapDynamicPartitionerPerformance COMMAND ${NONMPI_PRECOMMAND} SparseMapDynamicPartitionerPerformance sparsemapdynamicpartitioner_output.txt) add_test(NAME Cajita_HaloPerformance COMMAND ${NONMPI_PRECOMMAND} HaloPerformance halo_output.txt) diff --git a/benchmark/cajita/Cajita_ParticleDynamicPartitionerPerformance.cpp b/benchmark/cajita/Cajita_ParticleDynamicPartitionerPerformance.cpp new file mode 100644 index 000000000..59b3e6115 --- /dev/null +++ b/benchmark/cajita/Cajita_ParticleDynamicPartitionerPerformance.cpp @@ -0,0 +1,290 @@ +/**************************************************************************** + * Copyright (c) 2018-2022 by the Cabana authors * + * All rights reserved. * + * * + * This file is part of the Cabana library. Cabana is distributed under a * + * BSD 3-clause license. For the licensing terms see the LICENSE file in * + * the top-level directory. * + * * + * SPDX-License-Identifier: BSD-3-Clause * + ****************************************************************************/ + +#include "../Cabana_BenchmarkUtils.hpp" +#include "Cabana_ParticleInit.hpp" + +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +// generate average partitioner +std::array, 3> +computeAveragePartition( const int tile_per_dim, + const std::array& ranks_per_dim ) +{ + std::array, 3> rec_partitions; + for ( int d = 0; d < 3; ++d ) + { + int ele = tile_per_dim / ranks_per_dim[d]; + int part = 0; + for ( int i = 0; i < ranks_per_dim[d]; ++i ) + { + rec_partitions[d].push_back( part ); + part += ele; + } + rec_partitions[d].push_back( tile_per_dim ); + } + return rec_partitions; +} + +//---------------------------------------------------------------------------// +// Performance test. 
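The computeAveragePartition helper above builds an even tile partition per dimension; a short worked illustration (values are illustrative only, not taken from the benchmark):

// Illustration: 16 tiles per dimension over ranks_per_dim = { 2, 2, 4 } gives
//   computeAveragePartition( 16, { 2, 2, 4 } ) ==
//     { { 0, 8, 16 }, { 0, 8, 16 }, { 0, 4, 8, 12, 16 } }
// so rank r in dimension d owns tiles [ partition[d][r], partition[d][r+1] ).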
+template +void performanceTest( std::ostream& stream, MPI_Comm comm, + const std::string& test_prefix, + std::vector problem_sizes, + std::vector num_cells_per_dim ) +{ + using memory_space = typename Device::memory_space; + + // Get comm rank; + int comm_rank; + MPI_Comm_rank( comm, &comm_rank ); + + // Get comm size; + int comm_size; + MPI_Comm_size( comm, &comm_size ); + + // Domain size setup + std::array global_low_corner = { 0.0, 0.0, 0.0 }; + std::array global_high_corner = { 1.0, 1.0, 1.0 }; + constexpr int cell_num_per_tile_dim = 4; + constexpr int cell_bits_per_tile_dim = 2; + + // Declare the total number of particles + int num_problem_size = problem_sizes.size(); + + // Declare the size (cell nums) of the domain + int num_cells_per_dim_size = num_cells_per_dim.size(); + + // Number of runs in the test loops. + int num_run = 10; + + // Basic settings for partitioenr + int max_optimize_iteration = 10; + + // Create random sets of particle positions. + using position_type = Kokkos::View; + std::vector positions( num_problem_size ); + for ( int p = 0; p < num_problem_size; ++p ) + { + positions[p] = position_type( + Kokkos::ViewAllocateWithoutInitializing( "positions" ), + problem_sizes[p] ); + Cabana::createRandomParticles( positions[p], problem_sizes[p], + global_low_corner[0], + global_high_corner[0] ); + } + + for ( int c = 0; c < num_cells_per_dim_size; ++c ) + { + // init the sparse grid domain + std::array global_num_cell = { + num_cells_per_dim[c], num_cells_per_dim[c], num_cells_per_dim[c] }; + int num_tiles_per_dim = num_cells_per_dim[c] >> cell_bits_per_tile_dim; + + // set up partitioner + Cajita::DynamicPartitioner partitioner( + comm, global_num_cell, max_optimize_iteration ); + auto ranks_per_dim = + partitioner.ranksPerDimension( comm, global_num_cell ); + auto ave_partition = + computeAveragePartition( num_tiles_per_dim, ranks_per_dim ); + + // Create insertion timers + std::stringstream local_workload_name; + local_workload_name << test_prefix << "compute_local_workload_" + << "domain_size(cell)_" << num_cells_per_dim[c]; + Cabana::Benchmark::Timer local_workload_timer( + local_workload_name.str(), num_problem_size ); + + std::stringstream prefix_sum_name; + prefix_sum_name << test_prefix << "compute_prefix_sum_" + << "domain_size(cell)_" << num_cells_per_dim[c]; + Cabana::Benchmark::Timer prefix_sum_timer( prefix_sum_name.str(), + num_problem_size ); + + std::stringstream total_optimize_name; + total_optimize_name << test_prefix << "total_optimize_" + << "domain_size(cell)_" << num_cells_per_dim[c]; + Cabana::Benchmark::Timer total_optimize_timer( + total_optimize_name.str(), num_problem_size ); + + // loop over all the particle numbers + for ( int p = 0; p < num_problem_size; ++p ) + { + // compute the number of particles handled by the current MPI rank + int par_num = problem_sizes[p] / comm_size + + ( problem_sizes[p] % comm_size < comm_rank ? 
1 : 0 ); + + auto pos_view = Kokkos::subview( + positions[p], Kokkos::pair( 0, par_num ), + Kokkos::pair( 0, 3 ) ); + + // try for num_run times + for ( int t = 0; t < num_run; ++t ) + { + // ensure every optimization process starts from the same status + partitioner.initializePartitionByAverage( comm, + global_num_cell ); + + // compute local workload + local_workload_timer.start( p ); + constexpr int cell_num_per_tile_dim = 4; + constexpr int num_space_dim = 3; + auto pws = + Cajita::createParticleDynamicPartitionerWorkloadMeasurer< + cell_num_per_tile_dim, num_space_dim, Device>( + pos_view, par_num, global_low_corner, + 1.0f / num_cells_per_dim[c], comm ); + partitioner.setLocalWorkload( &pws ); + local_workload_timer.stop( p ); + + // compute prefix sum matrix + prefix_sum_timer.start( p ); + partitioner.computeFullPrefixSum( comm ); + prefix_sum_timer.stop( p ); + + // optimization + bool is_changed = false; + // full timer + total_optimize_timer.start( p ); + for ( int i = 0; i < max_optimize_iteration; ++i ) + { + partitioner.updatePartition( std::rand() % 3, is_changed ); + if ( !is_changed ) + break; + } + total_optimize_timer.stop( p ); + } + } + // Output results + outputResults( stream, "insert_tile_num", problem_sizes, + local_workload_timer, comm ); + outputResults( stream, "insert_tile_num", problem_sizes, + prefix_sum_timer, comm ); + outputResults( stream, "insert_tile_num", problem_sizes, + total_optimize_timer, comm ); + stream << std::flush; + } +} + +//---------------------------------------------------------------------------// +// main +int main( int argc, char* argv[] ) +{ + // Initialize environment + MPI_Init( &argc, &argv ); + Kokkos::initialize( argc, argv ); + + // Check arguments. + if ( argc < 2 ) + throw std::runtime_error( "Incorrect number of arguments. \n \ + First argument - file name for output \n \ + Optional second argument - run size (small or large) \n \ + \n \ + Example: \n \ + $/: ./SparseMapPerformance test_results.txt\n" ); + + // Define run sizes. + std::string run_type = ""; + if ( argc > 2 ) + run_type = argv[2]; + std::vector problem_sizes = { 1000, 10000 }; + std::vector num_cells_per_dim = { 32, 64 }; + if ( run_type == "large" ) + { + problem_sizes = { 1000, 10000, 100000, 1000000 }; + num_cells_per_dim = { 32, 64, 128, 256 }; + } + std::vector occupy_fraction = { 0.01, 0.1, 0.5, 0.75, 1.0 }; + + // Get the name of the output file. + std::string filename = argv[1]; + + // Barier before continuing. + MPI_Barrier( MPI_COMM_WORLD ); + + // Get comm rank; + int comm_rank; + MPI_Comm_rank( MPI_COMM_WORLD, &comm_rank ); + + // Get comm size; + int comm_size; + MPI_Comm_size( MPI_COMM_WORLD, &comm_size ); + + // Get Cartesian comm + std::array ranks_per_dim; + for ( std::size_t d = 0; d < 3; ++d ) + ranks_per_dim[d] = 0; + MPI_Dims_create( comm_size, 3, ranks_per_dim.data() ); + + // Open the output file on rank 0. + std::fstream file; + if ( 0 == comm_rank ) + file.open( filename, std::fstream::out ); + + // Output problem details. + if ( 0 == comm_rank ) + { + file << "\n"; + file << "Cajita Sparse Partitioner Performance Benchmark" + << "\n"; + file << "----------------------------------------------" + << "\n"; + file << "MPI Ranks: " << comm_size << "\n"; + file << "MPI Cartesian Dim Ranks: (" << ranks_per_dim[0] << ", " + << ranks_per_dim[1] << ", " << ranks_per_dim[2] << ")\n"; + file << "----------------------------------------------" + << "\n"; + file << "\n"; + file << std::flush; + } + + // Do everything on the default CPU. 
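Pieced together, the particle-driven partitioning flow that the benchmark above times looks roughly like the sketch below. The DynamicPartitioner template arguments and the exact headers are assumptions (angle-bracketed text was lost when this diff was extracted); the factory and member calls follow the benchmark code. The helper name is hypothetical.

#include <Cajita_DynamicPartitioner.hpp>
#include <Cajita_ParticleDynamicPartitioner.hpp>
#include <Kokkos_Core.hpp>
#include <mpi.h>
#include <array>

// Hypothetical helper, not part of this patch. `positions` is any Kokkos::View
// of particle coordinates with shape (num_particles, 3).
template <class PositionView>
void repartitionByParticles( const PositionView& positions, int num_particles,
                             MPI_Comm comm )
{
    using Device = Kokkos::DefaultExecutionSpace::device_type;
    constexpr int cell_per_tile_dim = 4;
    constexpr int num_space_dim = 3;

    std::array<int, 3> global_num_cell = { 128, 128, 128 };
    std::array<double, 3> global_low_corner = { 0.0, 0.0, 0.0 };
    double dx = 1.0 / global_num_cell[0];

    // Partitioner over the global grid; reset to an even split (the
    // constructor also does this once).
    Cajita::DynamicPartitioner<Device, cell_per_tile_dim> partitioner(
        comm, global_num_cell, /* max_optimize_iteration = */ 10 );
    partitioner.initializePartitionByAverage( comm, global_num_cell );

    // Workload = number of particles per tile on this rank.
    auto measurer = Cajita::createParticleDynamicPartitionerWorkloadMeasurer<
        cell_per_tile_dim, num_space_dim, Device>(
        positions, num_particles, global_low_corner, dx, comm );
    partitioner.setLocalWorkload( &measurer );

    // Reduce workloads across ranks and iterate until no dimension changes.
    partitioner.optimizePartition( comm );
}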
+ using host_exec_space = Kokkos::DefaultHostExecutionSpace; + using host_device_type = host_exec_space::device_type; + // Do everything on the default device with default memory. + using exec_space = Kokkos::DefaultExecutionSpace; + using device_type = exec_space::device_type; + + // Don't run twice on the CPU if only host enabled. + // Don't rerun on the CPU if already done or if turned off. + if ( !std::is_same{} ) + { + performanceTest( file, MPI_COMM_WORLD, + "device_particleWL_", problem_sizes, + num_cells_per_dim ); + } + performanceTest( file, MPI_COMM_WORLD, "host_particleWL_", + problem_sizes, num_cells_per_dim ); + + // Close the output file on rank 0. + file.close(); + + // Finalize + Kokkos::finalize(); + MPI_Finalize(); + return 0; +} diff --git a/benchmark/cajita/Cajita_SparsePartitionerPerformance.cpp b/benchmark/cajita/Cajita_SparseMapDynamicPartitionerPerformance.cpp similarity index 59% rename from benchmark/cajita/Cajita_SparsePartitionerPerformance.cpp rename to benchmark/cajita/Cajita_SparseMapDynamicPartitionerPerformance.cpp index 4706bdf4e..19b5f0533 100644 --- a/benchmark/cajita/Cajita_SparsePartitionerPerformance.cpp +++ b/benchmark/cajita/Cajita_SparseMapDynamicPartitionerPerformance.cpp @@ -12,8 +12,8 @@ #include "../Cabana_BenchmarkUtils.hpp" #include "Cabana_ParticleInit.hpp" -#include #include +#include #include @@ -28,15 +28,6 @@ #include -//---------------------------------------------------------------------------// -// Helper functions. -struct ParticleWorkloadTag -{ -}; -struct SparseMapTag -{ -}; - // generate a random tile sequence int current = 0; int uniqueNumber() { return current++; } @@ -82,150 +73,7 @@ std::array, 3> computeAveragePartition( //---------------------------------------------------------------------------// // Performance test. template -void performanceTest( ParticleWorkloadTag, std::ostream& stream, MPI_Comm comm, - const std::string& test_prefix, - std::vector problem_sizes, - std::vector num_cells_per_dim ) -{ - using memory_space = typename Device::memory_space; - - // Get comm rank; - int comm_rank; - MPI_Comm_rank( comm, &comm_rank ); - - // Get comm size; - int comm_size; - MPI_Comm_size( comm, &comm_size ); - - // Domain size setup - std::array global_low_corner = { 0.0, 0.0, 0.0 }; - std::array global_high_corner = { 1.0, 1.0, 1.0 }; - constexpr int cell_num_per_tile_dim = 4; - constexpr int cell_bits_per_tile_dim = 2; - - // Declare the total number of particles - int num_problem_size = problem_sizes.size(); - - // Declare the size (cell nums) of the domain - int num_cells_per_dim_size = num_cells_per_dim.size(); - - // Number of runs in the test loops. - int num_run = 10; - - // Basic settings for partitioenr - float max_workload_coeff = 1.5; - int max_optimize_iteration = 10; - int num_step_rebalance = 100; - - // compute the max number of particles handled by the current MPI rank - int max_par_num = problem_sizes.back() / comm_size + - ( problem_sizes.back() % comm_size < comm_rank ? 1 : 0 ); - - // Create random sets of particle positions. 
- using position_type = Kokkos::View; - std::vector positions( num_problem_size ); - for ( int p = 0; p < num_problem_size; ++p ) - { - positions[p] = position_type( - Kokkos::ViewAllocateWithoutInitializing( "positions" ), - problem_sizes[p] ); - Cabana::createRandomParticles( positions[p], problem_sizes[p], - global_low_corner[0], - global_high_corner[0] ); - } - - for ( int c = 0; c < num_cells_per_dim_size; ++c ) - { - // init the sparse grid domain - std::array global_num_cell = { - num_cells_per_dim[c], num_cells_per_dim[c], num_cells_per_dim[c] }; - int num_tiles_per_dim = num_cells_per_dim[c] >> cell_bits_per_tile_dim; - - // set up partitioner - Cajita::SparseDimPartitioner partitioner( - comm, max_workload_coeff, max_par_num, num_step_rebalance, - global_num_cell, max_optimize_iteration ); - auto ranks_per_dim = - partitioner.ranksPerDimension( comm, global_num_cell ); - auto ave_partition = - computeAveragePartition( num_tiles_per_dim, ranks_per_dim ); - - // Create insertion timers - std::stringstream local_workload_name; - local_workload_name << test_prefix << "compute_local_workload_" - << "domain_size(cell)_" << num_cells_per_dim[c]; - Cabana::Benchmark::Timer local_workload_timer( - local_workload_name.str(), num_problem_size ); - - std::stringstream prefix_sum_name; - prefix_sum_name << test_prefix << "compute_prefix_sum_" - << "domain_size(cell)_" << num_cells_per_dim[c]; - Cabana::Benchmark::Timer prefix_sum_timer( prefix_sum_name.str(), - num_problem_size ); - - std::stringstream total_optimize_name; - total_optimize_name << test_prefix << "total_optimize_" - << "domain_size(cell)_" << num_cells_per_dim[c]; - Cabana::Benchmark::Timer total_optimize_timer( - total_optimize_name.str(), num_problem_size ); - - // loop over all the particle numbers - for ( int p = 0; p < num_problem_size; ++p ) - { - // compute the number of particles handled by the current MPI rank - int par_num = problem_sizes[p] / comm_size + - ( problem_sizes[p] % comm_size < comm_rank ? 
1 : 0 ); - - auto pos_view = Kokkos::subview( - positions[p], Kokkos::pair( 0, par_num ), - Kokkos::pair( 0, 3 ) ); - - // try for num_run times - for ( int t = 0; t < num_run; ++t ) - { - // ensure every optimization process starts from the same status - partitioner.initializeRecPartition( - ave_partition[0], ave_partition[1], ave_partition[2] ); - - // compute local workload - local_workload_timer.start( p ); - partitioner.computeLocalWorkLoad( pos_view, par_num, - global_low_corner, - 1.0f / num_cells_per_dim[c] ); - local_workload_timer.stop( p ); - - // compute prefix sum matrix - prefix_sum_timer.start( p ); - partitioner.computeFullPrefixSum( comm ); - prefix_sum_timer.stop( p ); - - // optimization - bool is_changed = false; - // full timer - total_optimize_timer.start( p ); - for ( int i = 0; i < max_optimize_iteration; ++i ) - { - partitioner.optimizePartition( is_changed, - std::rand() % 3 ); - if ( !is_changed ) - break; - } - total_optimize_timer.stop( p ); - } - } - // Output results - outputResults( stream, "insert_tile_num", problem_sizes, - local_workload_timer, comm ); - outputResults( stream, "insert_tile_num", problem_sizes, - prefix_sum_timer, comm ); - outputResults( stream, "insert_tile_num", problem_sizes, - total_optimize_timer, comm ); - stream << std::flush; - } -} - -template -void performanceTest( SparseMapTag, std::ostream& stream, MPI_Comm comm, +void performanceTest( std::ostream& stream, MPI_Comm comm, const std::string& test_prefix, std::vector occupy_fraction, std::vector num_cells_per_dim ) @@ -247,9 +95,7 @@ void performanceTest( SparseMapTag, std::ostream& stream, MPI_Comm comm, int num_run = 10; // Basic settings for partitioenr - float max_workload_coeff = 1.5; int max_optimize_iteration = 10; - int num_step_rebalance = 100; for ( int c = 0; c < num_cells_per_dim_size; ++c ) { @@ -271,11 +117,8 @@ void performanceTest( SparseMapTag, std::ostream& stream, MPI_Comm comm, typename Device::memory_space(), tiles_host ); // set up partitioner - auto total_num = - num_tiles_per_dim * num_tiles_per_dim * num_tiles_per_dim; - Cajita::SparseDimPartitioner partitioner( - comm, max_workload_coeff, total_num, num_step_rebalance, - global_num_cell, max_optimize_iteration ); + Cajita::DynamicPartitioner partitioner( + comm, global_num_cell, max_optimize_iteration ); auto ranks_per_dim = partitioner.ranksPerDimension( comm, global_num_cell ); auto ave_partition = @@ -321,12 +164,15 @@ void performanceTest( SparseMapTag, std::ostream& stream, MPI_Comm comm, for ( int t = 0; t < num_run; ++t ) { // ensure every optimization process starts from the same status - partitioner.initializeRecPartition( - ave_partition[0], ave_partition[1], ave_partition[2] ); + partitioner.initializePartitionByAverage( comm, + global_num_cell ); // compute local workload local_workload_timer.start( frac ); - partitioner.computeLocalWorkLoad( sis ); + auto smws = + Cajita::createSparseMapDynamicPartitionerWorkloadMeasurer< + Device>( sis, comm ); + partitioner.setLocalWorkload( &smws ); local_workload_timer.stop( frac ); // compute prefix sum matrix @@ -340,8 +186,7 @@ void performanceTest( SparseMapTag, std::ostream& stream, MPI_Comm comm, total_optimize_timer.start( frac ); for ( int i = 0; i < max_optimize_iteration; ++i ) { - partitioner.optimizePartition( is_changed, - std::rand() % 3 ); + partitioner.updatePartition( std::rand() % 3, is_changed ); if ( !is_changed ) break; } @@ -443,17 +288,11 @@ int main( int argc, char* argv[] ) // Don't rerun on the CPU if already done or if turned 
off. if ( !std::is_same{} ) { - performanceTest( ParticleWorkloadTag(), file, - MPI_COMM_WORLD, "device_particleWL_", - problem_sizes, num_cells_per_dim ); - performanceTest( SparseMapTag(), file, MPI_COMM_WORLD, + performanceTest( file, MPI_COMM_WORLD, "device_sparsemapWL_", occupy_fraction, num_cells_per_dim ); } - performanceTest( ParticleWorkloadTag(), file, - MPI_COMM_WORLD, "host_particleWL_", - problem_sizes, num_cells_per_dim ); - performanceTest( SparseMapTag(), file, MPI_COMM_WORLD, + performanceTest( file, MPI_COMM_WORLD, "host_sparsemapWL_", occupy_fraction, num_cells_per_dim ); diff --git a/cajita/src/CMakeLists.txt b/cajita/src/CMakeLists.txt index 6c9b5c6d1..9addb651d 100644 --- a/cajita/src/CMakeLists.txt +++ b/cajita/src/CMakeLists.txt @@ -35,7 +35,7 @@ set(HEADERS_PUBLIC Cajita_Splines.hpp Cajita_Types.hpp Cajita_UniformDimPartitioner.hpp - Cajita_SparseDimPartitioner.hpp + Cajita_DynamicPartitioner.hpp Cajita_SparseArray.hpp ) diff --git a/cajita/src/Cajita.hpp b/cajita/src/Cajita.hpp index e0424120c..b825eb236 100644 --- a/cajita/src/Cajita.hpp +++ b/cajita/src/Cajita.hpp @@ -20,6 +20,7 @@ #include #include +#include #include #include #include @@ -34,7 +35,6 @@ #include #include #include -#include #include #include #include diff --git a/cajita/src/Cajita_SparseDimPartitioner.hpp b/cajita/src/Cajita_DynamicPartitioner.hpp similarity index 76% rename from cajita/src/Cajita_SparseDimPartitioner.hpp rename to cajita/src/Cajita_DynamicPartitioner.hpp index 1e6e8360d..3ee3ae2d6 100644 --- a/cajita/src/Cajita_SparseDimPartitioner.hpp +++ b/cajita/src/Cajita_DynamicPartitioner.hpp @@ -10,11 +10,11 @@ ****************************************************************************/ /*! - \file Cajita_SparseDimPartitioner.hpp - \brief Multi-node sparse grid partitioner + \file Cajita_DynamicPartitioner.hpp + \brief Multi-node dynamic grid partitioner */ -#ifndef CAJITA_SPARSEDIMPARTITIONER_HPP -#define CAJITA_SPARSEDIMPARTITIONER_HPP +#ifndef CAJITA_DYNAMICPARTITIONER_HPP +#define CAJITA_DYNAMICPARTITIONER_HPP #include #include @@ -27,16 +27,42 @@ namespace Cajita { + +//---------------------------------------------------------------------------// +/*! + Workload measurer for DynamicPartitioner. It can be customized by the user + to compute workload on each rank. + + \tparam Device Kokkos device type. +*/ +template +class DynamicPartitionerWorkloadMeasurer +{ + using memory_space = typename Device::memory_space; + + public: + /*! + \brief this function need to be overwrited to compute workload + \param workload workload computed on each rank + */ + virtual void compute( Kokkos::View& workload ) = 0; +}; + //---------------------------------------------------------------------------// /*! - Sparse mesh block partitioner. (Current Version: Support 3D only) + Dynamic mesh block partitioner. (Current Version: Support 3D only) There + should be no instantiation for this class without implementing any workload + computation. + \tparam Device Kokkos device type. - \tparam CellPerTileDim Cells per tile per dimension. - \tparam NumSpaceDim Dimemsion (The current version support 3D only) + \tparam CellPerTileDim Cells + per tile per dimension. + \tparam NumSpaceDim Dimemsion (The current version + support 3D only) */ template -class SparseDimPartitioner : public BlockPartitioner +class DynamicPartitioner : public BlockPartitioner { public: //! 
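The DynamicPartitionerWorkloadMeasurer base class introduced above is the new extension point: a custom workload source only has to override compute() and fill the per-tile counters. A minimal sketch, assuming the workload view is the Kokkos::View<int***, memory_space> the built-in measurers write into (its exact type was lost from the extracted diff); UniformWorkloadMeasurer is a hypothetical name.

#include <Cajita_DynamicPartitioner.hpp>
#include <Kokkos_Core.hpp>

// Hypothetical measurer: assigns one unit of work to every tile on this rank.
// Tile (ti, tj, tk) maps to workload( ti + 1, tj + 1, tk + 1 ), matching the
// built-in measurers; index 0 is padding left at zero by resetWorkload().
template <class Device>
class UniformWorkloadMeasurer
    : public Cajita::DynamicPartitionerWorkloadMeasurer<Device>
{
    using memory_space = typename Device::memory_space;
    using execution_space = typename Device::execution_space;

  public:
    void compute( Kokkos::View<int***, memory_space>& workload ) override
    {
        auto wl = workload; // local copy for device capture
        Kokkos::parallel_for(
            "uniform_workload",
            Kokkos::MDRangePolicy<execution_space, Kokkos::Rank<3>>(
                { 1, 1, 1 },
                { (int)wl.extent( 0 ), (int)wl.extent( 1 ),
                  (int)wl.extent( 2 ) } ),
            KOKKOS_LAMBDA( const int i, const int j, const int k ) {
                wl( i, j, k ) = 1;
            } );
        Kokkos::fence();
    }
};

// Usage: UniformWorkloadMeasurer<Device> m;
//        partitioner.setLocalWorkload( &m );
//        partitioner.optimizePartition( comm );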
dimension @@ -74,53 +100,35 @@ class SparseDimPartitioner : public BlockPartitioner \brief Constructor - automatically compute ranks_per_dim from MPI communicator \param comm MPI communicator to decide the rank nums in each dimension - \param max_workload_coeff threshold factor for re-partition - \param workload_num total workload(particle/tile) number, used to compute - workload_threshold - \param num_step_rebalance the simulation step number after which one - should check if repartition is needed \param global_cells_per_dim 3D array, global cells in each dimension \param max_optimize_iteration max iteration number to run the optimization */ - SparseDimPartitioner( - MPI_Comm comm, float max_workload_coeff, int workload_num, - int num_step_rebalance, + DynamicPartitioner( + MPI_Comm comm, const std::array& global_cells_per_dim, int max_optimize_iteration = 10 ) - : _workload_threshold( - static_cast( max_workload_coeff * workload_num ) ) - , _num_step_rebalance( num_step_rebalance ) - , _max_optimize_iteration( max_optimize_iteration ) + : _max_optimize_iteration( max_optimize_iteration ) { // compute the ranks_per_dim from MPI communicator allocate( global_cells_per_dim ); ranksPerDimension( comm ); + initializePartitionByAverage( comm, global_cells_per_dim ); } /*! \brief Constructor - user-defined ranks_per_dim communicator \param comm MPI communicator to decide the rank nums in each dimension - \param max_workload_coeff threshold factor for re-partition - \param workload_num total workload(particle/tile) number, used to compute - workload_threshold - \param num_step_rebalance the simulation step number after which one - should check if repartition is needed \param ranks_per_dim 3D array, user-defined MPI rank constrains in per dimension \param global_cells_per_dim 3D array, global cells in each dimension \param max_optimize_iteration max iteration number to run the optimization */ - SparseDimPartitioner( - MPI_Comm comm, float max_workload_coeff, int workload_num, - int num_step_rebalance, - const std::array& ranks_per_dim, + DynamicPartitioner( + MPI_Comm comm, const std::array& ranks_per_dim, const std::array& global_cells_per_dim, int max_optimize_iteration = 10 ) - : _workload_threshold( - static_cast( max_workload_coeff * workload_num ) ) - , _num_step_rebalance( num_step_rebalance ) - , _max_optimize_iteration( max_optimize_iteration ) + : _max_optimize_iteration( max_optimize_iteration ) { allocate( global_cells_per_dim ); std::copy( ranks_per_dim.begin(), ranks_per_dim.end(), @@ -130,6 +138,7 @@ class SparseDimPartitioner : public BlockPartitioner int comm_size; MPI_Comm_size( comm, &comm_size ); MPI_Dims_create( comm_size, num_space_dim, _ranks_per_dim.data() ); + initializePartitionByAverage( comm, global_cells_per_dim ); } /*! @@ -171,7 +180,7 @@ class SparseDimPartitioner : public BlockPartitioner nrank *= _ranks_per_dim[d]; if ( comm_size != nrank ) throw std::runtime_error( - "SparsePartitioner ranks do not match comm size" ); + "DynamicPartitioner ranks do not match comm size" ); return ranks_per_dim; } @@ -271,18 +280,49 @@ class SparseDimPartitioner : public BlockPartitioner } /*! 
- \brief Initialize the tile partition; partition in each dimension + \brief Initialize the tile partition by average size + \param comm The communicator to use for initializing partitioning + \param global_cells_per_dim 3D array, global cells in each dimension + */ + void initializePartitionByAverage( + MPI_Comm comm, + const std::array& global_cells_per_dim ) + { + std::array global_num_tile = { + global_cells_per_dim[0] / (int)cell_num_per_tile_dim, + global_cells_per_dim[1] / (int)cell_num_per_tile_dim, + global_cells_per_dim[2] / (int)cell_num_per_tile_dim }; + + auto ranks_per_dim = ranksPerDimension( comm, global_cells_per_dim ); + std::array, 3> rec_partitions; + for ( int d = 0; d < 3; ++d ) + { + int ele = global_num_tile[d] / ranks_per_dim[d]; + int part = 0; + for ( int i = 0; i < ranks_per_dim[d]; ++i ) + { + rec_partitions[d].push_back( part ); + part += ele; + } + rec_partitions[d].push_back( global_num_tile[d] ); + } + + setRecPartition( rec_partitions[0], rec_partitions[1], + rec_partitions[2] ); + } + + /*! + \brief Set the tile partition; partition in each dimension has the form [0, p_1, ..., p_n, total_tile_num], so the partition would be [0, p_1), [p_1, p_2) ... [p_n, total_tile_num] \param rec_partition_i partition array in dimension i \param rec_partition_j partition array in dimension j \param rec_partition_k partition array in dimension k */ - void initializeRecPartition( std::vector& rec_partition_i, - std::vector& rec_partition_j, - std::vector& rec_partition_k ) + void setRecPartition( std::vector& rec_partition_i, + std::vector& rec_partition_j, + std::vector& rec_partition_k ) { - int max_size = 0; for ( std::size_t d = 0; d < num_space_dim; ++d ) max_size = @@ -335,75 +375,6 @@ class SparseDimPartitioner : public BlockPartitioner Kokkos::deep_copy( _workload_prefix_sum, 0 ); } - /*! - \brief compute the workload in the current MPI rank from particle - positions (each particle count for 1 workload value) - \param view particle positions view - \param particle_num total particle number - \param global_lower_corner the coordinate of the domain global lower - corner - \param dx cell dx size - */ - template - void computeLocalWorkLoad( const ParticlePosViewType& view, - int particle_num, - const ArrayType& global_lower_corner, - const CellUnit dx ) - { - resetWorkload(); - // make a local copy - auto workload = _workload_per_tile; - Kokkos::Array lower_corner; - for ( std::size_t d = 0; d < num_space_dim; ++d ) - { - lower_corner[d] = global_lower_corner[d]; - } - - Kokkos::parallel_for( - "compute_local_workload_parpos", - Kokkos::RangePolicy( 0, particle_num ), - KOKKOS_LAMBDA( const int i ) { - int ti = static_cast( - ( view( i, 0 ) - lower_corner[0] ) / dx - 0.5 ) >> - cell_bits_per_tile_dim; - int tj = static_cast( - ( view( i, 1 ) - lower_corner[1] ) / dx - 0.5 ) >> - cell_bits_per_tile_dim; - int tz = static_cast( - ( view( i, 2 ) - lower_corner[2] ) / dx - 0.5 ) >> - cell_bits_per_tile_dim; - Kokkos::atomic_increment( &workload( ti + 1, tj + 1, tz + 1 ) ); - } ); - Kokkos::fence(); - } - - /*! 
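For reference, the partition layout that setRecPartition (and getCurrentPartition) uses, with illustrative values; the std::vector<int> element type is inferred from the surrounding code, since template arguments were stripped from the extracted diff:

// Given a DynamicPartitioner `partitioner` over a 16^3-tile domain with
// ranks_per_dim = { 3, 2, 2 }: each vector is [0, p_1, ..., p_n, tile_num]
// in tile units, and rank r in dimension d owns tiles [ p[d][r], p[d][r+1] ).
std::vector<int> part_i = { 0, 5, 10, 16 };
std::vector<int> part_j = { 0, 8, 16 };
std::vector<int> part_k = { 0, 8, 16 };
partitioner.setRecPartition( part_i, part_j, part_k );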
- \brief compute the workload in the current MPI rank from sparseMap - (the workload of a tile is 1 if the tile is occupied, 0 otherwise) - \param sparseMap sparseMap in the current rank - */ - template - void computeLocalWorkLoad( const SparseMapType& sparseMap ) - { - resetWorkload(); - // make a local copy - auto workload = _workload_per_tile; - Kokkos::parallel_for( - "compute_local_workload_sparsmap", - Kokkos::RangePolicy( 0, sparseMap.capacity() ), - KOKKOS_LAMBDA( uint32_t i ) { - if ( sparseMap.valid_at( i ) ) - { - auto key = sparseMap.key_at( i ); - int ti, tj, tk; - sparseMap.key2ijk( key, ti, tj, tk ); - Kokkos::atomic_increment( - &workload( ti + 1, tj + 1, tk + 1 ) ); - } - } ); - Kokkos::fence(); - } - /*! \brief 1. reduce the total workload in all MPI ranks; 2. compute the workload prefix sum matrix for all MPI ranks @@ -488,67 +459,30 @@ class SparseDimPartitioner : public BlockPartitioner } /*! - \brief iteratively optimize the partition - \param view particle positions view - \param particle_num total particle number - \param global_lower_corner the coordinate of the domain global lower - corner - \param dx cell dx size - \param comm MPI communicator used for workload reduction - \return iteration number + \brief compute workload in each MPI rank + \param measurer measurer defined by user to compute workload. + DynamicPartitionerWorkloadMeasurer is the base class and the user + should define a derived measurer class with compute() implemented. */ - template - int optimizePartition( const ParticlePosViewType& view, int particle_num, - const ArrayType& global_lower_corner, - const CellUnit dx, MPI_Comm comm ) + void + setLocalWorkload( DynamicPartitionerWorkloadMeasurer* measurer ) { - computeLocalWorkLoad( view, particle_num, global_lower_corner, dx ); - MPI_Barrier( comm ); - - computeFullPrefixSum( comm ); - MPI_Barrier( comm ); - - // each iteration covers partitioner optization in all three dimensions - // (with a random dim sequence) - for ( int i = 0; i < _max_optimize_iteration; ++i ) - { - bool is_changed = false; // record changes in current iteration - bool dim_covered[3] = { false, false, false }; - for ( int d = 0; d < 3; ++d ) - { - int random_dim_id = std::rand() % num_space_dim; - while ( dim_covered[random_dim_id] ) - random_dim_id = std::rand() % num_space_dim; - - bool is_dim_changed = false; // record changes in current dim - optimizePartition( is_dim_changed, random_dim_id ); - - // update control info - is_changed = is_changed || is_dim_changed; - dim_covered[random_dim_id] = true; - } - // return if the current partition is optimal - if ( !is_changed ) - return i; - } - return _max_optimize_iteration; + resetWorkload(); + measurer->compute( _workload_per_tile ); } /*! 
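The removed computeLocalWorkLoad overloads above map onto the measurer-based API as follows; this is a sketch of the call-site migration, and the variable names are placeholders:

// Old (SparseDimPartitioner):
//   partitioner.computeLocalWorkLoad( positions, n, global_low_corner, dx );
//   partitioner.computeFullPrefixSum( comm );
//   partitioner.optimizePartition( is_changed, std::rand() % 3 );
//
// New (DynamicPartitioner):
//   auto m = Cajita::createParticleDynamicPartitionerWorkloadMeasurer<
//       CellPerTileDim, 3, Device>( positions, n, global_low_corner, dx, comm );
//   partitioner.setLocalWorkload( &m );
//   partitioner.computeFullPrefixSum( comm );
//   partitioner.updatePartition( std::rand() % 3, is_changed );
//
// The sparse-map overload migrates the same way via
// createSparseMapDynamicPartitionerWorkloadMeasurer<Device>( sparse_map, comm ).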
\brief iteratively optimize the partition - \param sparseMap sparseMap in the current rank \param comm MPI communicator used for workload reduction \return iteration number */ - template - int optimizePartition( const SparseMapType& sparseMap, MPI_Comm comm ) + int optimizePartition( MPI_Comm comm ) { - computeLocalWorkLoad( sparseMap ); - MPI_Barrier( comm ); - computeFullPrefixSum( comm ); MPI_Barrier( comm ); + // each iteration covers partitioner optization in all three dimensions + // (with a random dim sequence) for ( int i = 0; i < _max_optimize_iteration; ++i ) { bool is_changed = false; // record changes in current iteration @@ -560,7 +494,7 @@ class SparseDimPartitioner : public BlockPartitioner random_dim_id = std::rand() % num_space_dim; bool is_dim_changed = false; // record changes in current dim - optimizePartition( is_dim_changed, random_dim_id ); + updatePartition( random_dim_id, is_dim_changed ); // update control info is_changed = is_changed || is_dim_changed; @@ -575,11 +509,11 @@ class SparseDimPartitioner : public BlockPartitioner /*! \brief optimize the partition in three dimensions seperately - \param is_changed label if the partition is changed after the optimization \param iter_seed seed number to choose the starting dimension of the optimization + \param is_changed label if the partition is changed after the optimization */ - void optimizePartition( bool& is_changed, int iter_seed ) + void updatePartition( int iter_seed, bool& is_changed ) { is_changed = false; // loop over three dimensions, optimize the partition in dimension di @@ -630,51 +564,27 @@ class SparseDimPartitioner : public BlockPartitioner // last_point: the opimized position for the lask partition int last_point = 0; // current_workload: the workload between [last_point, point_i) - Kokkos::View current_workload( - "current_workload", _ranks_per_dim[dj] * _ranks_per_dim[dk] ); for ( int current_rank = 1; current_rank < rank; current_rank++ ) { int last_diff = __INT_MAX__; while ( true ) { - // compute current workload between [last_point, point_i) - Kokkos::parallel_for( - "compute_current_workload", + int diff; + Kokkos::parallel_reduce( + "diff_reduce", Kokkos::RangePolicy( 0, _ranks_per_dim[dj] * _ranks_per_dim[dk] ), - KOKKOS_LAMBDA( uint32_t jnk ) { + KOKKOS_LAMBDA( const int jnk, int& update ) { int j = static_cast( jnk / rank_k ); int k = static_cast( jnk % rank_k ); - current_workload( jnk ) = compute_sub_workload( + int current_workload = compute_sub_workload( di, last_point, point_i, dj, j, dk, k ); - } ); - Kokkos::fence(); - - // compute the (w_jk^ave - w_jk^{last_point:point_i}) - Kokkos::parallel_for( - "compute_diff", - Kokkos::RangePolicy( - 0, _ranks_per_dim[dj] * _ranks_per_dim[dk] ), - KOKKOS_LAMBDA( uint32_t jnk ) { - auto wl = - current_workload( jnk ) - ave_workload( jnk ); + auto wl = current_workload - ave_workload( jnk ); // compute absolute diff (rather than squares to // avoid potential overflow) // TODO: update when Kokkos::abs() available wl = wl > 0 ? 
wl : -wl; - current_workload( jnk ) = wl; - } ); - Kokkos::fence(); - - // compute the sum of the difference in all rank_j*rank_k - // regions - int diff; - Kokkos::parallel_reduce( - "diff_reduce", - Kokkos::RangePolicy( - 0, _ranks_per_dim[dj] * _ranks_per_dim[dk] ), - KOKKOS_LAMBDA( const int idx, int& update ) { - update += current_workload( idx ); + update += wl; }, diff ); Kokkos::fence(); @@ -866,25 +776,26 @@ class SparseDimPartitioner : public BlockPartitioner }; private: - // workload_threshold - int _workload_threshold; - // default check point for re-balance - int _num_step_rebalance; // max_optimize iterations int _max_optimize_iteration; - // represent the rectangle partition in each dimension - // with form [0, p_1, ..., p_n, cell_num], n = rank num in current - // dimension, partition in this dimension would be [0, p_1), [p_1, p_2) ... - // [p_n, total_tile_num] (unit: tile) + protected: + //! represent the rectangle partition in each dimension + //! with form [0, p_1, ..., p_n, cell_num], n = rank num in current + //! dimension, partition in this dimension would be [0, p_1), [p_1, p_2) ... + //! [p_n, total_tile_num] (unit: tile) partition_view _rectangle_partition_dev; - // the workload of each tile on current + //! the workload of each tile on current workload_view _workload_per_tile; - // 3d prefix sum of the workload of each tile on current + //! 3d prefix sum of the workload of each tile on current workload_view _workload_prefix_sum; - // ranks per dimension + //! ranks per dimension Kokkos::Array _ranks_per_dim; + /*! + \brief allocate internal data structure for the partition algorithm + \param global_cells_per_dim grid size along each dimension + */ void allocate( const std::array& global_cells_per_dim ) { @@ -903,6 +814,7 @@ class SparseDimPartitioner : public BlockPartitioner ( global_cells_per_dim[2] >> cell_bits_per_tile_dim ) + 1 ); } }; + } // end namespace Cajita -#endif // end CAJITA_SPARSEDIMPARTITIONER_HPP +#endif // end CAJITA_DYNAMICPARTITIONER_HPP diff --git a/cajita/src/Cajita_ParticleDynamicPartitioner.hpp b/cajita/src/Cajita_ParticleDynamicPartitioner.hpp new file mode 100644 index 000000000..2719f428f --- /dev/null +++ b/cajita/src/Cajita_ParticleDynamicPartitioner.hpp @@ -0,0 +1,133 @@ +/**************************************************************************** + * Copyright (c) 2018-2022 by the Cabana authors * + * All rights reserved. * + * * + * This file is part of the Cabana library. Cabana is distributed under a * + * BSD 3-clause license. For the licensing terms see the LICENSE file in * + * the top-level directory. * + * * + * SPDX-License-Identifier: BSD-3-Clause * + ****************************************************************************/ + +/*! + \file Cajita_ParticleDynamicPartitioner.hpp + \brief Multi-node particle based dynamic grid partitioner +*/ +#ifndef CAJITA_PARTICLEDYNAMICPARTITIONER_HPP +#define CAJITA_PARTICLEDYNAMICPARTITIONER_HPP + +#include +#include +#include + +#include +#include + +#include + +namespace Cajita +{ + +//---------------------------------------------------------------------------// +/*! + \brief Helper class to set workload for DynamicPartitioner with particles. 
+ \tparam Particles' position view type (Kokkos::View) + \tparam Global grid bottom left corner type + \tparam Global grid unit cell size type + \tparam Partitioner's cell number per tile dim + \tparam Partitioner's space dim number + \tparam Partitioner's device type +*/ +template +class ParticleDynamicPartitionerWorkloadMeasurer + : public DynamicPartitionerWorkloadMeasurer +{ + using memory_space = typename Device::memory_space; + using execution_space = typename Device::execution_space; + + static constexpr unsigned long long cell_bits_per_tile_dim = + bitCount( CellPerTileDim ); + + const ParticlePosViewType& view; + int particle_num; + const ArrayType& global_lower_corner; + const CellUnit dx; + MPI_Comm comm; + + public: + /*! + \brief Constructor. + \param view Position of particles used in workload computation. + \param particle_num The number of particles used in workload computation. + \param global_lower_corner The bottom-left corner of global grid. + \param dx The global grid resolution. + \param comm MPI communicator to use for computing workload. + */ + ParticleDynamicPartitionerWorkloadMeasurer( + const ParticlePosViewType& view, int particle_num, + const ArrayType& global_lower_corner, const CellUnit dx, MPI_Comm comm ) + : view( view ) + , particle_num( particle_num ) + , global_lower_corner( global_lower_corner ) + , dx( dx ) + , comm( comm ) + { + } + + //! \brief Called by DynamicPartitioner to compute workload + void compute( Kokkos::View& workload ) override + { + Kokkos::Array lower_corner; + for ( std::size_t d = 0; d < num_space_dim; ++d ) + { + lower_corner[d] = global_lower_corner[d]; + } + + auto dx_copy = dx; + auto cell_bits_per_tile_dim_copy = cell_bits_per_tile_dim; + auto view_copy = view; + Kokkos::parallel_for( + "compute_local_workload_parpos", + Kokkos::RangePolicy( 0, particle_num ), + KOKKOS_LAMBDA( const int i ) { + int ti = static_cast( + ( view_copy( i, 0 ) - lower_corner[0] ) / dx_copy - + 0.5 ) >> + cell_bits_per_tile_dim_copy; + int tj = static_cast( + ( view_copy( i, 1 ) - lower_corner[1] ) / dx_copy - + 0.5 ) >> + cell_bits_per_tile_dim_copy; + int tz = static_cast( + ( view_copy( i, 2 ) - lower_corner[2] ) / dx_copy - + 0.5 ) >> + cell_bits_per_tile_dim_copy; + Kokkos::atomic_increment( &workload( ti + 1, tj + 1, tz + 1 ) ); + } ); + Kokkos::fence(); + // Wait for other ranks' workload to be ready + MPI_Barrier( comm ); + } +}; + +//---------------------------------------------------------------------------// +//! Creation function for ParticleDynamicPartitionerWorkloadMeasurer from +//! Kokkos::View +template +ParticleDynamicPartitionerWorkloadMeasurer +createParticleDynamicPartitionerWorkloadMeasurer( + const ParticlePosViewType& view, int particle_num, + const ArrayType& global_lower_corner, const CellUnit dx, MPI_Comm comm ) +{ + return ParticleDynamicPartitionerWorkloadMeasurer< + ParticlePosViewType, ArrayType, CellUnit, CellPerTileDim, num_space_dim, + Device>( view, particle_num, global_lower_corner, dx, comm ); +} + +} // end namespace Cajita + +#endif // end CAJITA_PARTICLEDYNAMICPARTITIONER_HPP diff --git a/cajita/src/Cajita_SparseMapDynamicPartitioner.hpp b/cajita/src/Cajita_SparseMapDynamicPartitioner.hpp new file mode 100644 index 000000000..009be1e53 --- /dev/null +++ b/cajita/src/Cajita_SparseMapDynamicPartitioner.hpp @@ -0,0 +1,97 @@ +/**************************************************************************** + * Copyright (c) 2018-2022 by the Cabana authors * + * All rights reserved. 
* + * * + * This file is part of the Cabana library. Cabana is distributed under a * + * BSD 3-clause license. For the licensing terms see the LICENSE file in * + * the top-level directory. * + * * + * SPDX-License-Identifier: BSD-3-Clause * + ****************************************************************************/ + +/*! + \file Cajita_SparseMapDynamicPartitioner.hpp + \brief Multi-node sparse map based dynamic grid partitioner +*/ +#ifndef CAJITA_SPARSEMAPDYNAMICPARTITIONER_HPP +#define CAJITA_SPARSEMAPDYNAMICPARTITIONER_HPP + +#include +#include +#include + +#include +#include + +#include + +namespace Cajita +{ + +//---------------------------------------------------------------------------// +/*! + \brief Helper class to set workload for DynamicPartitioner with sparse map. + \tparam Sparse map type + \tparam Partitioner's device type +*/ +template +class SparseMapDynamicPartitionerWorkloadMeasurer + : public DynamicPartitionerWorkloadMeasurer +{ + using memory_space = typename Device::memory_space; + using execution_space = typename Device::execution_space; + + const SparseMapType& sparseMap; + MPI_Comm comm; + + public: + /*! + \brief Constructor. + \param sparseMap Sparse map used in workload computation. + \param comm MPI communicator to use for computing workload. + */ + SparseMapDynamicPartitionerWorkloadMeasurer( const SparseMapType& sparseMap, + MPI_Comm comm ) + : sparseMap( sparseMap ) + , comm( comm ) + { + } + + //! \brief Called by DynamicPartitioner to compute workload + void compute( Kokkos::View& workload ) override + { + const SparseMapType& sparse_map_copy = sparseMap; + Kokkos::parallel_for( + "compute_local_workload_sparsmap", + Kokkos::RangePolicy( 0, sparseMap.capacity() ), + KOKKOS_LAMBDA( uint32_t i ) { + if ( sparse_map_copy.valid_at( i ) ) + { + auto key = sparse_map_copy.key_at( i ); + int ti, tj, tk; + sparse_map_copy.key2ijk( key, ti, tj, tk ); + Kokkos::atomic_increment( + &workload( ti + 1, tj + 1, tk + 1 ) ); + } + } ); + Kokkos::fence(); + // Wait for other ranks' workload to be ready + MPI_Barrier( comm ); + } +}; + +//---------------------------------------------------------------------------// +//! Creation function for SparseMapDynamicPartitionerWorkloadMeasurer from +//! 
SparseMap +template +SparseMapDynamicPartitionerWorkloadMeasurer +createSparseMapDynamicPartitionerWorkloadMeasurer( + const SparseMapType& sparseMap, MPI_Comm comm ) +{ + return SparseMapDynamicPartitionerWorkloadMeasurer( + sparseMap, comm ); +} + +} // end namespace Cajita + +#endif // end CAJITA_SPARSEMAPDYNAMICPARTITIONER_HPP diff --git a/cajita/unit_test/CMakeLists.txt b/cajita/unit_test/CMakeLists.txt index 6693944dc..a8baab468 100644 --- a/cajita/unit_test/CMakeLists.txt +++ b/cajita/unit_test/CMakeLists.txt @@ -37,7 +37,8 @@ set(MPI_TESTS Interpolation2d BovWriter Parallel - SparseDimPartitioner + ParticleDynamicPartitioner + SparseMapDynamicPartitioner Partitioner SparseArray ) diff --git a/cajita/unit_test/tstGlobalGrid.hpp b/cajita/unit_test/tstGlobalGrid.hpp index bdf0966e2..1e0e243ff 100644 --- a/cajita/unit_test/tstGlobalGrid.hpp +++ b/cajita/unit_test/tstGlobalGrid.hpp @@ -9,10 +9,10 @@ * SPDX-License-Identifier: BSD-3-Clause * ****************************************************************************/ +#include #include #include #include -#include #include #include @@ -425,15 +425,10 @@ void sparseGridTest3d() global_low_corner, global_high_corner, global_num_cell ); // Sparse paritioner - float max_workload_coeff = 1.5; - int workload_num = - global_num_cell[0] * global_num_cell[1] * global_num_cell[2]; - int num_step_rebalance = 100; int max_optimize_iteration = 10; - SparseDimPartitioner partitioner( - MPI_COMM_WORLD, max_workload_coeff, workload_num, num_step_rebalance, - global_num_cell, max_optimize_iteration ); + DynamicPartitioner partitioner( + MPI_COMM_WORLD, global_num_cell, max_optimize_iteration ); // test ranks per dim auto ranks_per_dim = @@ -451,8 +446,6 @@ void sparseGridTest3d() } rec_partitions[d].push_back( global_num_tile[d] ); } - partitioner.initializeRecPartition( rec_partitions[0], rec_partitions[1], - rec_partitions[2] ); // Create spares global grid auto global_grid = createGlobalGrid( MPI_COMM_WORLD, global_mesh, @@ -567,7 +560,7 @@ void sparseGridTest3d() for ( int id = 1; id < ranks_per_dim[d]; id++ ) part[d][id] += 1; - partitioner.initializeRecPartition( part[0], part[1], part[2] ); + partitioner.setRecPartition( part[0], part[1], part[2] ); std::array new_owned_num_cell; std::array new_global_cell_offset; diff --git a/cajita/unit_test/tstParticleDynamicPartitioner.hpp b/cajita/unit_test/tstParticleDynamicPartitioner.hpp new file mode 100644 index 000000000..e2322b553 --- /dev/null +++ b/cajita/unit_test/tstParticleDynamicPartitioner.hpp @@ -0,0 +1,218 @@ +/**************************************************************************** + * Copyright (c) 2018-2022 by the Cabana authors * + * All rights reserved. * + * * + * This file is part of the Cabana library. Cabana is distributed under a * + * BSD 3-clause license. For the licensing terms see the LICENSE file in * + * the top-level directory. 
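The sparse-map path mirrors the particle path: a SparseMap already filled with this rank's occupied tiles yields one unit of workload per occupied tile. A minimal sketch, assuming the DynamicPartitioner template arguments used earlier (the helper name is hypothetical):

#include <Cajita_DynamicPartitioner.hpp>
#include <Cajita_SparseMapDynamicPartitioner.hpp>
#include <mpi.h>
#include <array>

// Hypothetical helper, not part of this patch. `sparse_map` is a Cajita
// SparseMap whose occupied tiles define the workload on this rank.
template <class Device, class SparseMapType>
void repartitionBySparseMap( const SparseMapType& sparse_map,
                             const std::array<int, 3>& global_num_cell,
                             MPI_Comm comm )
{
    Cajita::DynamicPartitioner<Device, 4> partitioner( comm, global_num_cell );
    auto measurer =
        Cajita::createSparseMapDynamicPartitionerWorkloadMeasurer<Device>(
            sparse_map, comm );
    partitioner.setLocalWorkload( &measurer );
    partitioner.optimizePartition( comm );
}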
* + * * + * SPDX-License-Identifier: BSD-3-Clause * + ****************************************************************************/ + +#include +#include +#include + +#include +#include +#include + +#include + +using namespace Cajita; + +namespace Test +{ + +auto generate_random_particles( + const std::array, 3>& gt_partition, + const Kokkos::Array& cart_rank, int occupy_par_num_per_rank, + const std::array global_low_corner, double dx, + int cell_num_per_tile_dim ) -> Kokkos::View +{ + std::set> par_set; + + double start[3], size[3]; + for ( int d = 0; d < 3; ++d ) + { + start[d] = + ( gt_partition[d][cart_rank[d]] * cell_num_per_tile_dim + 0.5 ) * + dx + + global_low_corner[d]; + + size[d] = + ( ( gt_partition[d][cart_rank[d] + 1] * cell_num_per_tile_dim ) - + ( gt_partition[d][cart_rank[d]] * cell_num_per_tile_dim ) ) * + dx; + } + // insert the corner tiles to the set, to ensure the uniqueness of the + // ground truth partition + par_set.insert( + { start[0] + 0.01 * dx, start[1] + 0.01 * dx, start[2] + 0.01 * dx } ); + par_set.insert( { + start[0] + size[0] - dx - 0.01 * dx, + start[1] + size[1] - dx - 0.01 * dx, + start[2] + size[2] - dx - 0.01 * dx, + } ); + + // insert random tiles to the set + while ( static_cast( par_set.size() ) < occupy_par_num_per_rank ) + { + double rand_offset[3]; + for ( int d = 0; d < 3; ++d ) + rand_offset[d] = (double)std::rand() / RAND_MAX; + par_set.insert( { start[0] + rand_offset[0] * ( size[0] - dx ), + start[1] + rand_offset[1] * ( size[1] - dx ), + start[2] + rand_offset[2] * ( size[2] - dx ) } ); + } + + // particle_set => particle view (host) + typedef typename TEST_EXECSPACE::array_layout layout; + Kokkos::View par_view_host( + "particle_view_host", par_set.size() ); + int i = 0; + for ( auto it = par_set.begin(); it != par_set.end(); ++it ) + { + for ( int d = 0; d < 3; ++d ) + par_view_host( i, d ) = ( *it )[d]; + i++; + } + + // create tiles view on device + Kokkos::View par_view = + Kokkos::create_mirror_view_and_copy( TEST_MEMSPACE(), par_view_host ); + return par_view; +} + +/*! 
+ \brief In this test, the ground truth partition is first randomly chosen, then + a given number of tiles are regiestered on each rank (the most bottom-left and + top-right tiles are always registered to ensure the uniqueness of the ground + truth partition ) + \param occupy_num_per_rank the tile number that will be registered on each MPI + rank +*/ +void random_distribution_automatic_rank( int occupy_num_per_rank ) +{ + // define the domain size + constexpr int size_tile_per_dim = 32; + constexpr int cell_per_tile_dim = 4; + constexpr int size_per_dim = size_tile_per_dim * cell_per_tile_dim; + srand( time( 0 ) ); + + // some settings for partitioner + int max_optimize_iteration = 10; + + std::array global_cells_per_dim = { size_per_dim, size_per_dim, + size_per_dim }; + + // partitioner + DynamicPartitioner partitioner( + MPI_COMM_WORLD, global_cells_per_dim, max_optimize_iteration ); + + // check the value of some pre-computed constants + auto cbptd = partitioner.cell_bits_per_tile_dim; + EXPECT_EQ( cbptd, 2 ); + + auto cnptd = partitioner.cell_num_per_tile_dim; + EXPECT_EQ( cnptd, 4 ); + + // ranks per dim test + auto ranks_per_dim = + partitioner.ranksPerDimension( MPI_COMM_WORLD, global_cells_per_dim ); + + EXPECT_EQ( ranks_per_dim[0] >= 1, true ); + EXPECT_EQ( ranks_per_dim[1] >= 1, true ); + EXPECT_EQ( ranks_per_dim[2] >= 1, true ); + + // compute the rank ID + Kokkos::Array cart_rank; + std::array periodic_dims = { 0, 0, 0 }; + int reordered_cart_ranks = 1; + MPI_Comm cart_comm; + MPI_Cart_create( MPI_COMM_WORLD, 3, ranks_per_dim.data(), + periodic_dims.data(), reordered_cart_ranks, &cart_comm ); + int linear_rank; + MPI_Comm_rank( cart_comm, &linear_rank ); + MPI_Cart_coords( cart_comm, linear_rank, 3, cart_rank.data() ); + + // generate random ground truth partition on the root rank + std::array, 3> gt_partition_set; + std::array, 3> gt_partition; + int world_rank, world_size; + MPI_Comm_rank( MPI_COMM_WORLD, &world_rank ); + MPI_Comm_size( MPI_COMM_WORLD, &world_size ); + for ( int d = 0; d < 3; ++d ) + { + gt_partition[d].resize( ranks_per_dim[d] + 1 ); + } + + if ( world_rank == 0 ) + { + for ( int d = 0; d < 3; ++d ) + { + gt_partition_set[d].insert( 0 ); + while ( static_cast( gt_partition_set[d].size() ) < + ranks_per_dim[d] ) + { + int rand_num = std::rand() % size_tile_per_dim; + gt_partition_set[d].insert( rand_num ); + } + gt_partition_set[d].insert( size_tile_per_dim ); + int i = 0; + for ( auto it = gt_partition_set[d].begin(); + it != gt_partition_set[d].end(); ++it ) + { + gt_partition[d][i++] = *it; + } + } + } + + // broadcast the ground truth partition to all ranks + for ( int d = 0; d < 3; ++d ) + { + MPI_Barrier( MPI_COMM_WORLD ); + MPI_Bcast( gt_partition[d].data(), gt_partition[d].size(), MPI_INT, 0, + MPI_COMM_WORLD ); + MPI_Barrier( MPI_COMM_WORLD ); + } + + // basic settings for domain size and position + double cell_size = 0.1; + std::array global_low_corner = { 1.2, 3.3, -2.8 }; + + // randomly generate a fixed number of particles on each MPI rank + auto particle_view = generate_random_particles( + gt_partition, cart_rank, occupy_num_per_rank, global_low_corner, + cell_size, cell_per_tile_dim ); + // compute workload from a particle view and do partition optimization + constexpr int cell_num_per_tile_dim = 4; + constexpr int num_space_dim = 3; + auto pws = createParticleDynamicPartitionerWorkloadMeasurer< + cell_num_per_tile_dim, num_space_dim, TEST_DEVICE>( + particle_view, occupy_num_per_rank, global_low_corner, cell_size, + MPI_COMM_WORLD ); + 
partitioner.setLocalWorkload( &pws ); + partitioner.optimizePartition( MPI_COMM_WORLD ); + + // check results (should be the same as the gt_partition) + auto part = partitioner.getCurrentPartition(); + for ( int d = 0; d < 3; ++d ) + { + for ( int id = 0; id < ranks_per_dim[d] + 1; id++ ) + EXPECT_EQ( part[d][id], gt_partition[d][id] ); + } + + auto imbalance_factor = partitioner.computeImbalanceFactor( cart_comm ); + EXPECT_FLOAT_EQ( imbalance_factor, 1.0f ); +} + +//---------------------------------------------------------------------------// +// RUN TESTS +//---------------------------------------------------------------------------// +TEST( sparse_dim_partitioner, sparse_dim_partitioner_random_par_test ) +{ + random_distribution_automatic_rank( 50 ); +} +//---------------------------------------------------------------------------// +} // end namespace Test diff --git a/cajita/unit_test/tstPartitioner.hpp b/cajita/unit_test/tstPartitioner.hpp index c001da746..0a34fd322 100644 --- a/cajita/unit_test/tstPartitioner.hpp +++ b/cajita/unit_test/tstPartitioner.hpp @@ -9,10 +9,10 @@ * SPDX-License-Identifier: BSD-3-Clause * ****************************************************************************/ +#include #include #include #include -#include #include #include diff --git a/cajita/unit_test/tstSparseArray.hpp b/cajita/unit_test/tstSparseArray.hpp index 6234bcd76..a84c20414 100644 --- a/cajita/unit_test/tstSparseArray.hpp +++ b/cajita/unit_test/tstSparseArray.hpp @@ -9,8 +9,8 @@ * SPDX-License-Identifier: BSD-3-Clause * ****************************************************************************/ +#include #include -#include #include #include @@ -181,13 +181,9 @@ void sparse_array_test( int par_num, EntityType entity ) std::array is_dim_periodic = { false, false, false }; // sparse partitioner - T max_workload_coeff = 1.5; - int workload_num = size_per_dim * size_per_dim * size_per_dim; - int num_step_rebalance = 200; int max_optimize_iteration = 10; - SparseDimPartitioner partitioner( - MPI_COMM_WORLD, max_workload_coeff, workload_num, num_step_rebalance, - global_num_cell, max_optimize_iteration ); + DynamicPartitioner partitioner( + MPI_COMM_WORLD, global_num_cell, max_optimize_iteration ); // rank-related information Kokkos::Array cart_rank; @@ -207,8 +203,8 @@ void sparse_array_test( int par_num, EntityType entity ) // scene initialization auto gt_partitions = generate_random_partition( ranks_per_dim, size_tile_per_dim ); - partitioner.initializeRecPartition( gt_partitions[0], gt_partitions[1], - gt_partitions[2] ); + partitioner.setRecPartition( gt_partitions[0], gt_partitions[1], + gt_partitions[2] ); std::set> tile_set; std::set> par_pos_set; @@ -427,13 +423,9 @@ void full_occupy_test( EntityType entity ) std::array is_dim_periodic = { false, false, false }; // sparse partitioner - T max_workload_coeff = 1.5; - int workload_num = size_per_dim * size_per_dim * size_per_dim; - int num_step_rebalance = 200; int max_optimize_iteration = 10; - SparseDimPartitioner partitioner( - MPI_COMM_WORLD, max_workload_coeff, workload_num, num_step_rebalance, - global_num_cell, max_optimize_iteration ); + DynamicPartitioner partitioner( + MPI_COMM_WORLD, global_num_cell, max_optimize_iteration ); // rank-related information Kokkos::Array cart_rank; @@ -453,8 +445,8 @@ void full_occupy_test( EntityType entity ) // scene initialization auto gt_partitions = generate_random_partition( ranks_per_dim, size_tile_per_dim ); - partitioner.initializeRecPartition( gt_partitions[0], gt_partitions[1], - 
gt_partitions[2] ); + partitioner.setRecPartition( gt_partitions[0], gt_partitions[1], + gt_partitions[2] ); // mesh/grid related initialization auto global_mesh = createSparseGlobalMesh( diff --git a/cajita/unit_test/tstSparseLocalGrid.hpp b/cajita/unit_test/tstSparseLocalGrid.hpp index 97a1c393b..dfb4b3209 100644 --- a/cajita/unit_test/tstSparseLocalGrid.hpp +++ b/cajita/unit_test/tstSparseLocalGrid.hpp @@ -8,9 +8,9 @@ * * * SPDX-License-Identifier: BSD-3-Clause * ****************************************************************************/ +#include #include #include -#include #include #include @@ -33,9 +33,6 @@ void sparseLocalGridTest( EntityType t2 ) double cell_size = 0.23; std::array global_num_cell = { 16, 32, 64 }; int cell_num_per_tile_dim = 4; - std::array global_num_tile = { 16 / cell_num_per_tile_dim, - 32 / cell_num_per_tile_dim, - 64 / cell_num_per_tile_dim }; std::array global_low_corner = { 1.2, 3.3, -2.8 }; std::array global_high_corner = { global_low_corner[0] + cell_size * global_num_cell[0], @@ -46,24 +43,8 @@ void sparseLocalGridTest( EntityType t2 ) // Create and initialize sparse partitioner std::array periodic = { false, false, false }; - SparseDimPartitioner partitioner( - MPI_COMM_WORLD, 1.5, 16 * 32, 100, global_num_cell, 10 ); - auto ranks_per_dim = - partitioner.ranksPerDimension( MPI_COMM_WORLD, global_num_cell ); - std::array, 3> rec_partitions; - for ( int d = 0; d < 3; ++d ) - { - int ele = global_num_tile[d] / ranks_per_dim[d]; - int part = 0; - for ( int i = 0; i < ranks_per_dim[d]; ++i ) - { - rec_partitions[d].push_back( part ); - part += ele; - } - rec_partitions[d].push_back( global_num_tile[d] ); - } - partitioner.initializeRecPartition( rec_partitions[0], rec_partitions[1], - rec_partitions[2] ); + DynamicPartitioner partitioner( MPI_COMM_WORLD, + global_num_cell, 10 ); // Create global grid auto global_grid_ptr = Cajita::createGlobalGrid( diff --git a/cajita/unit_test/tstSparseDimPartitioner.hpp b/cajita/unit_test/tstSparseMapDynamicPartitioner.hpp similarity index 70% rename from cajita/unit_test/tstSparseDimPartitioner.hpp rename to cajita/unit_test/tstSparseMapDynamicPartitioner.hpp index 9eb6c185b..fb30ddb09 100644 --- a/cajita/unit_test/tstSparseDimPartitioner.hpp +++ b/cajita/unit_test/tstSparseMapDynamicPartitioner.hpp @@ -9,8 +9,8 @@ * SPDX-License-Identifier: BSD-3-Clause * ****************************************************************************/ -#include #include +#include #include #include @@ -34,12 +34,8 @@ void uniform_distribution_automatic_rank() constexpr int size_tile_per_dim = 16; constexpr int cell_per_tile_dim = 4; constexpr int size_per_dim = size_tile_per_dim * cell_per_tile_dim; - constexpr int total_size = size_per_dim * size_per_dim * size_per_dim; // some settings for partitioner - float max_workload_coeff = 1.5; - int workload_num = total_size; - int num_step_rebalance = 100; int max_optimize_iteration = 10; std::array global_cells_per_dim = { size_tile_per_dim * cell_per_tile_dim, @@ -47,9 +43,8 @@ void uniform_distribution_automatic_rank() size_tile_per_dim * cell_per_tile_dim }; // partitioner - SparseDimPartitioner partitioner( - MPI_COMM_WORLD, max_workload_coeff, workload_num, num_step_rebalance, - global_cells_per_dim, max_optimize_iteration ); + DynamicPartitioner partitioner( + MPI_COMM_WORLD, global_cells_per_dim, max_optimize_iteration ); // check the value of some pre-computed constants auto cbptd = partitioner.cell_bits_per_tile_dim; @@ -79,8 +74,6 @@ void uniform_distribution_automatic_rank() } 
rec_partitions[d].push_back( size_tile_per_dim ); } - partitioner.initializeRecPartition( rec_partitions[0], rec_partitions[1], - rec_partitions[2] ); // test getCurrentPartition function { @@ -147,7 +140,10 @@ void uniform_distribution_automatic_rank() Kokkos::fence(); // compute workload and do partition optimization - partitioner.optimizePartition( sis, MPI_COMM_WORLD ); + auto smws = createSparseMapDynamicPartitionerWorkloadMeasurer( + sis, MPI_COMM_WORLD ); + partitioner.setLocalWorkload( &smws ); + partitioner.optimizePartition( MPI_COMM_WORLD ); // check results (should be the same as the average partition) owned_cells_per_dim = partitioner.ownedCellsPerDimension( cart_comm ); @@ -219,66 +215,6 @@ auto generate_random_tiles( const std::array, 3>& gt_partition, return tiles_view; } -auto generate_random_particles( - const std::array, 3>& gt_partition, - const Kokkos::Array& cart_rank, int occupy_par_num_per_rank, - const std::array global_low_corner, double dx, - int cell_num_per_tile_dim ) -> Kokkos::View -{ - std::set> par_set; - - double start[3], size[3]; - for ( int d = 0; d < 3; ++d ) - { - start[d] = - ( gt_partition[d][cart_rank[d]] * cell_num_per_tile_dim + 0.5 ) * - dx + - global_low_corner[d]; - - size[d] = - ( ( gt_partition[d][cart_rank[d] + 1] * cell_num_per_tile_dim ) - - ( gt_partition[d][cart_rank[d]] * cell_num_per_tile_dim ) ) * - dx; - } - // insert the corner tiles to the set, to ensure the uniqueness of the - // ground truth partition - par_set.insert( - { start[0] + 0.01 * dx, start[1] + 0.01 * dx, start[2] + 0.01 * dx } ); - par_set.insert( { - start[0] + size[0] - dx - 0.01 * dx, - start[1] + size[1] - dx - 0.01 * dx, - start[2] + size[2] - dx - 0.01 * dx, - } ); - - // insert random tiles to the set - while ( static_cast( par_set.size() ) < occupy_par_num_per_rank ) - { - double rand_offset[3]; - for ( int d = 0; d < 3; ++d ) - rand_offset[d] = (double)std::rand() / RAND_MAX; - par_set.insert( { start[0] + rand_offset[0] * ( size[0] - dx ), - start[1] + rand_offset[1] * ( size[1] - dx ), - start[2] + rand_offset[2] * ( size[2] - dx ) } ); - } - - // particle_set => particle view (host) - typedef typename TEST_EXECSPACE::array_layout layout; - Kokkos::View par_view_host( - "particle_view_host", par_set.size() ); - int i = 0; - for ( auto it = par_set.begin(); it != par_set.end(); ++it ) - { - for ( int d = 0; d < 3; ++d ) - par_view_host( i, d ) = ( *it )[d]; - i++; - } - - // create tiles view on device - Kokkos::View par_view = - Kokkos::create_mirror_view_and_copy( TEST_MEMSPACE(), par_view_host ); - return par_view; -} - /*! 
\brief In this test, the ground truth partition is first randomly chosen, then a given number of tiles are registered on each rank (the most bottom-left and @@ -286,32 +222,24 @@ auto generate_random_particles( truth partition ) \param occupy_num_per_rank the tile number that will be registered on each MPI rank - \param use_tile2workload indicate the source to compute the workload on MPI - ranks, true if using tile occupation while false if using particle positions */ -void random_distribution_automatic_rank( int occupy_num_per_rank, - bool use_tile2workload = true ) +void random_distribution_automatic_rank( int occupy_num_per_rank ) { // define the domain size constexpr int size_tile_per_dim = 32; constexpr int cell_per_tile_dim = 4; constexpr int size_per_dim = size_tile_per_dim * cell_per_tile_dim; - constexpr int total_size = size_per_dim * size_per_dim * size_per_dim; srand( time( 0 ) ); // some settings for partitioner - float max_workload_coeff = 1.5; - int particle_num = total_size; - int num_step_rebalance = 100; int max_optimize_iteration = 10; std::array global_cells_per_dim = { size_per_dim, size_per_dim, size_per_dim }; // partitioner - SparseDimPartitioner partitioner( - MPI_COMM_WORLD, max_workload_coeff, particle_num, num_step_rebalance, - global_cells_per_dim, max_optimize_iteration ); + DynamicPartitioner partitioner( + MPI_COMM_WORLD, global_cells_per_dim, max_optimize_iteration ); // check the value of some pre-computed constants auto cbptd = partitioner.cell_bits_per_tile_dim; @@ -394,9 +322,6 @@ void random_distribution_automatic_rank( int occupy_num_per_rank, rec_partitions[d].push_back( size_tile_per_dim ); } - partitioner.initializeRecPartition( rec_partitions[0], rec_partitions[1], - rec_partitions[2] ); - // basic settings for domain size and position double cell_size = 0.1; int pre_alloc_size = size_per_dim * size_per_dim; @@ -406,42 +331,28 @@ void random_distribution_automatic_rank( int occupy_num_per_rank, global_low_corner[1] + cell_size * global_cells_per_dim[1], global_low_corner[2] + cell_size * global_cells_per_dim[2] }; - // use tile occupization info to compute the workload on MPI ranks - if ( use_tile2workload ) - { - // randomly generate a fixed number of tiles on every MPI rank - auto tiles_view = generate_random_tiles( - gt_partition, cart_rank, size_tile_per_dim, occupy_num_per_rank ); - // create a new sparseMap - auto global_mesh = createSparseGlobalMesh( - global_low_corner, global_high_corner, global_cells_per_dim ); - auto sis = - createSparseMap( global_mesh, pre_alloc_size ); - // register selected tiles to the sparseMap - Kokkos::parallel_for( - "insert_tile_to_sparse_map", - Kokkos::RangePolicy( 0, tiles_view.extent( 0 ) ), - KOKKOS_LAMBDA( int id ) { - sis.insertTile( tiles_view( id, 0 ), tiles_view( id, 1 ), - tiles_view( id, 2 ) ); - } ); - Kokkos::fence(); - - // compute workload from a sparseMap and do partition optimization - partitioner.optimizePartition( sis, MPI_COMM_WORLD ); - } - // use particle positions to compute teh workload on MPI ranks - else - { - // randomly generate a fixed number of particles on each MPI rank - auto particle_view = generate_random_particles( - gt_partition, cart_rank, occupy_num_per_rank, global_low_corner, - cell_size, cell_per_tile_dim ); - // compute workload from a particle view and do partition optimization - partitioner.optimizePartition( particle_view, occupy_num_per_rank, - global_low_corner, cell_size, - MPI_COMM_WORLD ); - } + // randomly generate a fixed number of tiles on every MPI rank +
auto tiles_view = generate_random_tiles( + gt_partition, cart_rank, size_tile_per_dim, occupy_num_per_rank ); + // create a new sparseMap + auto global_mesh = createSparseGlobalMesh( + global_low_corner, global_high_corner, global_cells_per_dim ); + auto sis = createSparseMap( global_mesh, pre_alloc_size ); + // register selected tiles to the sparseMap + Kokkos::parallel_for( + "insert_tile_to_sparse_map", + Kokkos::RangePolicy( 0, tiles_view.extent( 0 ) ), + KOKKOS_LAMBDA( int id ) { + sis.insertTile( tiles_view( id, 0 ), tiles_view( id, 1 ), + tiles_view( id, 2 ) ); + } ); + Kokkos::fence(); + + // compute workload from a sparseMap and do partition optimization + auto smws = createSparseMapDynamicPartitionerWorkloadMeasurer( + sis, MPI_COMM_WORLD ); + partitioner.setLocalWorkload( &smws ); + partitioner.optimizePartition( MPI_COMM_WORLD ); // check results (should be the same as the gt_partition) auto part = partitioner.getCurrentPartition(); @@ -464,11 +375,7 @@ TEST( sparse_dim_partitioner, sparse_dim_partitioner_uniform_test ) } TEST( sparse_dim_partitioner, sparse_dim_partitioner_random_tile_test ) { - random_distribution_automatic_rank( 32, true ); -} -TEST( sparse_dim_partitioner, sparse_dim_partitioner_random_par_test ) -{ - random_distribution_automatic_rank( 50, false ); + random_distribution_automatic_rank( 32 ); } //---------------------------------------------------------------------------// } // end namespace Test
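Taken together, the changes above replace the SparseDimPartitioner construction and its optimizePartition( sis, MPI_COMM_WORLD ) overload with a DynamicPartitioner whose local workload is supplied by a separate measurer object. The sketch below strings the calls that appear in this diff into one sequence so the new flow is visible in one place. It is a minimal illustration, not code from the PR: the include names, the exec_space/device_type aliases, the template arguments on DynamicPartitioner and createSparseMap, and the placeholder domain sizes are assumptions.

// Minimal usage sketch (not part of the PR) assembled from the calls above.
#include <Cajita.hpp>
#include <Kokkos_Core.hpp>
#include <mpi.h>

#include <array>
#include <vector>

void dynamic_partitioner_sketch( MPI_Comm comm )
{
    // Assumed aliases; the diff does not show the concrete template arguments.
    using exec_space = Kokkos::DefaultExecutionSpace;
    using device_type = exec_space::device_type;

    constexpr int cell_per_tile_dim = 4;
    const int max_optimize_iteration = 10;
    std::array<int, 3> global_num_cell = { 128, 128, 128 };

    // 1. Construct the partitioner. The former workload-coefficient,
    //    workload-count and rebalance-step arguments are no longer passed.
    Cajita::DynamicPartitioner<device_type, cell_per_tile_dim> partitioner(
        comm, global_num_cell, max_optimize_iteration );

    // 2. Seed a uniform rectilinear partition; setRecPartition replaces the
    //    former initializeRecPartition.
    auto ranks_per_dim =
        partitioner.ranksPerDimension( comm, global_num_cell );
    std::array<std::vector<int>, 3> rec;
    for ( int d = 0; d < 3; ++d )
    {
        int tiles_per_dim = global_num_cell[d] / cell_per_tile_dim;
        for ( int r = 0; r <= ranks_per_dim[d]; ++r )
            rec[d].push_back( r * tiles_per_dim / ranks_per_dim[d] );
    }
    partitioner.setRecPartition( rec[0], rec[1], rec[2] );

    // 3. Register locally occupied tiles in a sparse map (one placeholder
    //    tile here; the tests insert randomly generated tiles instead).
    std::array<double, 3> low = { 0.0, 0.0, 0.0 };
    std::array<double, 3> high = { 1.0, 1.0, 1.0 };
    auto global_mesh =
        Cajita::createSparseGlobalMesh( low, high, global_num_cell );
    int pre_alloc_size = 128 * 128;
    auto sis =
        Cajita::createSparseMap<exec_space>( global_mesh, pre_alloc_size );
    Kokkos::parallel_for(
        "insert_one_tile", Kokkos::RangePolicy<exec_space>( 0, 1 ),
        KOKKOS_LAMBDA( int ) { sis.insertTile( 0, 0, 0 ); } );
    Kokkos::fence();

    // 4. Workload measurement is now a separate object handed to the
    //    partitioner instead of an optimizePartition( sis, comm ) overload.
    auto measurer =
        Cajita::createSparseMapDynamicPartitionerWorkloadMeasurer( sis, comm );
    partitioner.setLocalWorkload( &measurer );
    partitioner.optimizePartition( comm );

    // 5. Inspect the result, as the updated tests do. (The tests additionally
    //    build an MPI Cartesian communicator over ranks_per_dim and call
    //    ownedCellsPerDimension / computeImbalanceFactor on it.)
    auto part = partitioner.getCurrentPartition();
    (void)part;
}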
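The particle-driven path exercised by the new ParticleDynamicPartitionerPerformance benchmark and by the sparse_dim_partitioner_random_par_test that closes above follows the same shape: the sparse-map measurer is swapped for the measurer that test builds as pws (presumably constructed from the particle position view earlier in that file, outside this hunk) before setLocalWorkload and optimizePartition are called.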