diff --git a/packages/tpetra/core/example/advanced/Benchmarks/CrsMatrixDenseRowUnpack.cpp b/packages/tpetra/core/example/advanced/Benchmarks/CrsMatrixDenseRowUnpack.cpp index 618795fef82f..44e7fd875e98 100644 --- a/packages/tpetra/core/example/advanced/Benchmarks/CrsMatrixDenseRowUnpack.cpp +++ b/packages/tpetra/core/example/advanced/Benchmarks/CrsMatrixDenseRowUnpack.cpp @@ -96,7 +96,11 @@ getTargetRowMapIndices(const LO lclNumRows, TEUCHOS_ASSERT(gblRow < indexBase + gblNumRows); tgtGids[lid] = gblRow; } - return std::move(tgtGids); + // The original return using std::move (commented out below) returns the + // following warning with gcc 9.2.0: + // warning: moving a local object in a return statement prevents copy elision [-Wpessimizing-move] + //return std::move(tgtGids); + return tgtGids; } RCP diff --git a/packages/tpetra/core/example/advanced/Benchmarks/CrsMatrixMatVec.cpp b/packages/tpetra/core/example/advanced/Benchmarks/CrsMatrixMatVec.cpp index 113af682dccf..6721fbdcc664 100644 --- a/packages/tpetra/core/example/advanced/Benchmarks/CrsMatrixMatVec.cpp +++ b/packages/tpetra/core/example/advanced/Benchmarks/CrsMatrixMatVec.cpp @@ -252,15 +252,15 @@ getTpetraCrsMatrix (Teuchos::FancyOStream& out, using Teuchos::rcp; using std::endl; using matrix_type = Tpetra::CrsMatrix<>; - using device_type = matrix_type::device_type; + //using device_type = matrix_type::device_type; using SC = matrix_type::impl_scalar_type; - using KAT = Kokkos::ArithTraits; + //using KAT = Kokkos::ArithTraits; using LO = Tpetra::Map<>::local_ordinal_type; - using host_device_type = Kokkos::View::host_mirror_space; - using host_execution_space = host_device_type::execution_space; + //using host_device_type = Kokkos::View::host_mirror_space; + //using host_execution_space = host_device_type::execution_space; // We're filling on the host, so generate random numbers on the host. 
- using pool_type = Kokkos::Random_XorShift64_Pool; + //using pool_type = Kokkos::Random_XorShift64_Pool; Teuchos::OSTab tab0 (out); out << "Create CrsMatrix for benchmark" << endl; diff --git a/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp b/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp index 80d92a878855..12b261b0552e 100644 --- a/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp +++ b/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp @@ -4733,6 +4733,9 @@ namespace Tpetra { os << *prefix << endl; std::cerr << os.str (); } + Details::ProfilingRegion region( + "Tpetra::CrsMatrix::fillCompete", + "fillCompete"); TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC (! this->isFillActive () || this->isFillComplete (), std::runtime_error, @@ -4743,54 +4746,57 @@ namespace Tpetra { // // Read parameters from the input ParameterList. // - - // If true, the caller promises that no process did nonlocal - // changes since the last call to fillComplete. - bool assertNoNonlocalInserts = false; - // If true, makeColMap sorts remote GIDs (within each remote - // process' group). - bool sortGhosts = true; - - if (! params.is_null ()) { - assertNoNonlocalInserts = params->get ("No Nonlocal Changes", - assertNoNonlocalInserts); - if (params->isParameter ("sort column map ghost gids")) { - sortGhosts = params->get ("sort column map ghost gids", sortGhosts); - } - else if (params->isParameter ("Sort column Map ghost GIDs")) { - sortGhosts = params->get ("Sort column Map ghost GIDs", sortGhosts); - } - } - // We also don't need to do global assembly if there is only one - // process in the communicator. - const bool needGlobalAssemble = ! assertNoNonlocalInserts && numProcs > 1; - // This parameter only matters if this matrix owns its graph. - if (! this->myGraph_.is_null ()) { - this->myGraph_->sortGhostsAssociatedWithEachProcessor_ = sortGhosts; - } - - if (! 
this->getCrsGraphRef ().indicesAreAllocated ()) { - if (this->hasColMap ()) { // use local indices - allocateValues(LocalIndices, GraphNotYetAllocated, verbose); + { + Details::ProfilingRegion region_fc("Tpetra::CrsMatrix::fillCompete", "ParameterList"); + + // If true, the caller promises that no process did nonlocal + // changes since the last call to fillComplete. + bool assertNoNonlocalInserts = false; + // If true, makeColMap sorts remote GIDs (within each remote + // process' group). + bool sortGhosts = true; + + if (! params.is_null ()) { + assertNoNonlocalInserts = params->get ("No Nonlocal Changes", + assertNoNonlocalInserts); + if (params->isParameter ("sort column map ghost gids")) { + sortGhosts = params->get ("sort column map ghost gids", sortGhosts); + } + else if (params->isParameter ("Sort column Map ghost GIDs")) { + sortGhosts = params->get ("Sort column Map ghost GIDs", sortGhosts); + } + } + // We also don't need to do global assembly if there is only one + // process in the communicator. + const bool needGlobalAssemble = ! assertNoNonlocalInserts && numProcs > 1; + // This parameter only matters if this matrix owns its graph. + if (! this->myGraph_.is_null ()) { + this->myGraph_->sortGhostsAssociatedWithEachProcessor_ = sortGhosts; + } + + if (! this->getCrsGraphRef ().indicesAreAllocated ()) { + if (this->hasColMap ()) { // use local indices + allocateValues(LocalIndices, GraphNotYetAllocated, verbose); + } + else { // no column Map, so use global indices + allocateValues(GlobalIndices, GraphNotYetAllocated, verbose); + } + } + // Global assemble, if we need to. This call only costs a single + // all-reduce if we didn't need global assembly after all. 
+ if (needGlobalAssemble) { + this->globalAssemble (); } - else { // no column Map, so use global indices - allocateValues(GlobalIndices, GraphNotYetAllocated, verbose); + else { + TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC + (numProcs == 1 && nonlocals_.size() > 0, + std::runtime_error, "Cannot have nonlocal entries on a serial run. " + "An invalid entry (i.e., with row index not in the row Map) must have " + "been submitted to the CrsMatrix."); } } - // Global assemble, if we need to. This call only costs a single - // all-reduce if we didn't need global assembly after all. - if (needGlobalAssemble) { - this->globalAssemble (); - } - else { - TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC - (numProcs == 1 && nonlocals_.size() > 0, - std::runtime_error, "Cannot have nonlocal entries on a serial run. " - "An invalid entry (i.e., with row index not in the row Map) must have " - "been submitted to the CrsMatrix."); - } - if (this->isStaticGraph ()) { + Details::ProfilingRegion region_isg("Tpetra::CrsMatrix::fillCompete", "isStaticGraph"); // FIXME (mfh 14 Nov 2016) In order to fix #843, I enable the // checks below only in debug mode. It would be nicer to do a // local check, then propagate the error state in a deferred @@ -4840,6 +4846,7 @@ namespace Tpetra { this->fillLocalMatrix (params); } else { + Details::ProfilingRegion region_insg("Tpetra::CrsMatrix::fillCompete", "isNotStaticGraph"); // Set the graph's domain and range Maps. 
This will clear the // Import if the domain Map has changed (is a different // pointer), and the Export if the range Map has changed (is a @@ -4892,16 +4899,26 @@ namespace Tpetra { this->myGraph_->checkInternalState (); } - const bool callComputeGlobalConstants = params.get () == nullptr || - params->get ("compute global constants", true); - if (callComputeGlobalConstants) { - this->computeGlobalConstants (); + { + Details::ProfilingRegion region_ccgc( + "Tpetra::CrsMatrix::fillCompete", "callComputeGlobalConstamnts" + ); + const bool callComputeGlobalConstants = params.get () == nullptr || + params->get ("compute global constants", true); + if (callComputeGlobalConstants) { + this->computeGlobalConstants (); + } } // FIXME (mfh 28 Aug 2014) "Preserve Local Graph" bool parameter no longer used. this->fillComplete_ = true; // Now we're fill complete! - this->checkInternalState (); + { + Details::ProfilingRegion region_cis( + "Tpetra::CrsMatrix::fillCompete", "checkInternalState" + ); + this->checkInternalState (); + } } template @@ -7284,6 +7301,11 @@ namespace Tpetra { typedef GlobalOrdinal GO; typedef impl_scalar_type ST; + Details::ProfilingRegion region_upack_row( + "Tpetra::CrsMatrix::unpackRow", + "Import/Export" + ); + if (numBytes == 0) { // Rows with zero bytes should always have zero entries. if (numEnt != 0) { @@ -7475,6 +7497,7 @@ namespace Tpetra { Distributor& dist) const { // The call to packNew in packAndPrepare catches and handles any exceptions. 
+ Details::ProfilingRegion region_pack_new("Tpetra::CrsMatrix::packNew", "Import/Export"); if (this->isStaticGraph ()) { using ::Tpetra::Details::packCrsMatrixNew; packCrsMatrixNew (*this, exports, numPacketsPerLID, exportLIDs, @@ -7902,6 +7925,10 @@ namespace Tpetra { const CombineMode combineMode, const bool verbose) { + Details::ProfilingRegion region_unpack_and_combine_impl( + "Tpetra::CrsMatrix::unpackAndCombineImpl", + "Import/Export" + ); using std::endl; const char tfecfFuncName[] = "unpackAndCombineImpl"; std::unique_ptr prefix; @@ -8019,6 +8046,11 @@ namespace Tpetra { return; // nothing to do; no need to combine entries } + Details::ProfilingRegion region_unpack_and_combine_impl_non_static( + "Tpetra::CrsMatrix::unpackAndCombineImplNonStatic", + "Import/Export" + ); + // We're unpacking on host. This is read-only host access. if (imports.need_sync_host()) { imports.sync_host (); diff --git a/packages/tpetra/core/src/Tpetra_Details_Behavior.cpp b/packages/tpetra/core/src/Tpetra_Details_Behavior.cpp index a769c7aef494..47817957d4fe 100644 --- a/packages/tpetra/core/src/Tpetra_Details_Behavior.cpp +++ b/packages/tpetra/core/src/Tpetra_Details_Behavior.cpp @@ -48,6 +48,7 @@ #include #include #include "Teuchos_TestForException.hpp" +#include "Teuchos_OrdinalTraits.hpp" #include namespace Tpetra { @@ -56,6 +57,7 @@ namespace Details { namespace BehaviorDetails { std::map > namedVariableMap_; bool verboseDisabled_ = false; +bool timingDisabled_ = false; } namespace { // (anonymous) @@ -188,12 +190,16 @@ namespace { // (anonymous) else { // This could throw invalid_argument or out_of_range. // Go ahead and let it do so. 
- const long long val = std::stoll(stringToUpper(varVal)); - TEUCHOS_TEST_FOR_EXCEPTION - (val < static_cast(0), std::out_of_range, - prefix << "Environment variable \"" - << environmentVariableName << "\" is supposed to be a size, " - "but it has a negative integer value " << val << "."); + long long val = std::stoll(stringToUpper(varVal)); + if (val < static_cast(0)) { + // If negative - user has requested threshold be lifted + return std::numeric_limits::max(); + } +// TEUCHOS_TEST_FOR_EXCEPTION +// (val < static_cast(0), std::out_of_range, +// prefix << "Environment variable \"" +// << environmentVariableName << "\" is supposed to be a size, " +// "but it has a negative integer value " << val << "."); if (sizeof(long long) > sizeof(size_t)) { // It's hard to test this code, but I want to try writing it // at least, in case we ever have to run on 32-bit machines or @@ -271,6 +277,10 @@ namespace { // (anonymous) return false; } + constexpr bool timingDefault () { + return false; + } + constexpr bool assumeMpiIsCudaAwareDefault () { #ifdef TPETRA_ASSUME_CUDA_AWARE_MPI return true; @@ -279,6 +289,10 @@ namespace { // (anonymous) #endif // TPETRA_ASSUME_CUDA_AWARE_MPI } + constexpr bool hierarchicalUnpackDefault () { + return true; + } + } // namespace (anonymous) bool Behavior::debug () @@ -309,6 +323,21 @@ bool Behavior::verbose () defaultValue); } +bool Behavior::timing () +{ + if (BehaviorDetails::timingDisabled_) return false; + + constexpr char envVarName[] = "TPETRA_TIMING"; + constexpr bool defaultValue = timingDefault (); + + static bool value_ = defaultValue; + static bool initialized_ = false; + return idempotentlyGetEnvironmentVariableAsBool (value_, + initialized_, + envVarName, + defaultValue); +} + bool Behavior::assumeMpiIsCudaAware () { constexpr char envVarName[] = "TPETRA_ASSUME_CUDA_AWARE_MPI"; @@ -358,7 +387,7 @@ size_t Behavior::longRowMinNumEntries () (value_, initialized_, envVarName, defaultValue); } -size_t 
Behavior::multivectorKernelLocationThreshold () + size_t Behavior::multivectorKernelLocationThreshold () { constexpr char envVarName[] = "TPETRA_VECTOR_DEVICE_THRESHOLD"; constexpr size_t defaultValue (10000); @@ -369,7 +398,38 @@ size_t Behavior::multivectorKernelLocationThreshold () (value_, initialized_, envVarName, defaultValue); } -bool Behavior::profilingRegionUseTeuchosTimers () +size_t Behavior::hierarchicalUnpackBatchSize () +{ + constexpr char envVarName[] = "TPETRA_HIERARCHICAL_UNPACK_BATCH_SIZE"; + +#ifdef HAVE_TPETRA_INST_CUDA + constexpr size_t defaultValue (16); +#else + constexpr size_t defaultValue (256); +#endif + + static size_t value_ = defaultValue; + static bool initialized_ = false; + return idempotentlyGetEnvironmentVariableAsSize + (value_, initialized_, envVarName, defaultValue); +} + +size_t Behavior::hierarchicalUnpackTeamSize () +{ + constexpr char envVarName[] = "TPETRA_HIERARCHICAL_UNPACK_TEAM_SIZE"; +#ifdef HAVE_TPETRA_INST_CUDA + const size_t defaultValue (16); +#else + const size_t defaultValue (Teuchos::OrdinalTraits::invalid ()); +#endif + + static size_t value_ = defaultValue; + static bool initialized_ = false; + return idempotentlyGetEnvironmentVariableAsSize + (value_, initialized_, envVarName, defaultValue); +} + +bool Behavior::profilingRegionUseTeuchosTimers () { constexpr char envVarName[] = "TPETRA_USE_TEUCHOS_TIMERS"; constexpr bool defaultValue(false); @@ -380,7 +440,7 @@ bool Behavior::profilingRegionUseTeuchosTimers () (value_, initialized_, envVarName, defaultValue); } -bool Behavior::profilingRegionUseKokkosProfiling () +bool Behavior::profilingRegionUseKokkosProfiling () { constexpr char envVarName[] = "TPETRA_USE_KOKKOS_PROFILING"; constexpr bool defaultValue(false); @@ -426,6 +486,41 @@ void Behavior::disable_verbose_behavior () { BehaviorDetails::verboseDisabled_ = true; } +bool Behavior::timing (const char name[]) +{ + if (BehaviorDetails::timingDisabled_) return false; + + constexpr char envVarName[] = 
"TPETRA_TIMING"; + constexpr bool defaultValue = false; + + static bool initialized_ = false; + return idempotentlyGetNamedEnvironmentVariableAsBool (name, + initialized_, + envVarName, + defaultValue); +} + +void Behavior::enable_timing() { + BehaviorDetails::timingDisabled_ = false; +} + +void Behavior::disable_timing() { + BehaviorDetails::timingDisabled_ = true; +} + +bool Behavior::hierarchicalUnpack () +{ + constexpr char envVarName[] = "TPETRA_HIERARCHICAL_UNPACK"; + constexpr bool defaultValue = hierarchicalUnpackDefault(); + + static bool value_ = defaultValue; + static bool initialized_ = false; + return idempotentlyGetEnvironmentVariableAsBool (value_, + initialized_, + envVarName, + defaultValue); +} + } // namespace Details } // namespace Tpetra diff --git a/packages/tpetra/core/src/Tpetra_Details_Behavior.hpp b/packages/tpetra/core/src/Tpetra_Details_Behavior.hpp index 1bc24215d64e..fe63aac31b71 100644 --- a/packages/tpetra/core/src/Tpetra_Details_Behavior.hpp +++ b/packages/tpetra/core/src/Tpetra_Details_Behavior.hpp @@ -97,20 +97,22 @@ namespace Details { /// /// TPETRA_DEBUG: flags Tpetra to turn on debug checking. /// TPETRA_VERBOSE: flags Tpetra to turn on debug _output_. +/// TPETRA_TIMING: flags Tpetra to turn on timing code. /// /// These are two different things. For example, TPETRA_DEBUG may do extra MPI /// communication in order to ensure correct error state propagation, but /// TPETRA_DEBUG should never print copious debug output if no errors occurred. /// The idea is that if users get a mysterious error or hang, they can rerun /// with TPETRA_DEBUG set. TPETRA_VERBOSE is for Tpetra developers to use for -/// debugging Tpetra. +/// debugging Tpetra. TPETRA_TIMING is for Tpetra developers to use for timing +/// Tpetra. /// /// The environment variables are understood to be "on" or "off" and recognized -/// if specified in one of two ways. The first is to specify the variable -/// unconditionally ON or OFF. 
e.g., TPETRA_VERBOSE=ON or TPETRA_VERBOSE=OFF. -/// The default value of TPETRA_VERBOSE is always OFF. The default value for -/// TPETRA_DEBUG is ON if Tpetra is configured with Tpetra_ENABLE_DEBUG, -/// otherwise it is OFF +/// if specified in one of two ways. The first is to specify the variable +/// unconditionally ON or OFF. e.g., TPETRA_[VERBOSE,DEBUG,TIMING]=ON or +/// TPETRA_[VERBOSE,DEBUG,TIMING]=OFF. The default value of TPETRA_VERBOSE and +/// TPETRA_TIMING is always OFF. The default value for TPETRA_DEBUG is ON if +/// Tpetra is configured with Tpetra_ENABLE_DEBUG, otherwise it is OFF. /// /// The second is to specify the variable on a per class/object basis, e.g., /// TPETRA_VERBOSE=CrsGraph,CrsMatrix,Distributor means that verbose output @@ -154,6 +156,24 @@ class Behavior { /// \brief Enable verbose mode, programatically static void enable_verbose_behavior (); + /// \brief Whether Tpetra is in timing mode. + /// + /// "Timing mode" means that Tpetra enables code that instruments internal timing. + static bool timing (); + + /// \brief Whether the given Tpetra object is in timing mode. + /// + /// \param name [in] Name of the Tpetra object. Typically, the object would + /// be a class name, e.g., "CrsGraph" or method, e.g., + /// "CrsGraph::insertLocalIndices". + static bool timing (const char name[]); + + /// \brief Disable timing, programmatically + static void disable_timing(); + + /// \brief Enable timing, programmatically + static void enable_timing(); + /// \brief Whether to assume that MPI is CUDA aware. /// /// An MPI implementation is "CUDA aware" if it can accept CUDA @@ -200,6 +220,15 @@ class Behavior { /// separate question. 
static size_t longRowMinNumEntries (); + /// \brief Unpack rows of a matrix using hierarchical unpacking + static bool hierarchicalUnpack (); + + /// \brief Size of batch for hierarchical unpacking + static size_t hierarchicalUnpackBatchSize (); + + /// \brief Size of team for hierarchical unpacking + static size_t hierarchicalUnpackTeamSize (); + /// \brief the threshold for transitioning from device to host /// /// If the number of elements in the multivector does not exceed this diff --git a/packages/tpetra/core/src/Tpetra_Details_Profiling.cpp b/packages/tpetra/core/src/Tpetra_Details_Profiling.cpp index 46e9c707dc49..8a3656648bcb 100644 --- a/packages/tpetra/core/src/Tpetra_Details_Profiling.cpp +++ b/packages/tpetra/core/src/Tpetra_Details_Profiling.cpp @@ -51,20 +51,33 @@ namespace Tpetra { namespace Details { ProfilingRegion::ProfilingRegion (const char name[]) { -#if defined(KOKKOS_ENABLE_PROFILING) - if(Behavior::profilingRegionUseKokkosProfiling()) +#if defined(KOKKOS_ENABLE_PROFILING) + if(Behavior::profilingRegionUseKokkosProfiling()) ::Kokkos::Profiling::pushRegion(name); #endif if(Behavior::profilingRegionUseTeuchosTimers()) tm = Teuchos::rcp(new Teuchos::TimeMonitor(*Teuchos::TimeMonitor::getNewTimer(name))); - + +} + +ProfilingRegion::ProfilingRegion (const char name[], const char group[]) { + const bool timeit = Behavior::timing(group); + if (timeit) + { +#if defined(KOKKOS_ENABLE_PROFILING) + if(Behavior::profilingRegionUseKokkosProfiling()) + ::Kokkos::Profiling::pushRegion(name); +#endif + if(Behavior::profilingRegionUseTeuchosTimers()) + tm = Teuchos::rcp(new Teuchos::TimeMonitor(*Teuchos::TimeMonitor::getNewTimer(name))); + } } ProfilingRegion::~ProfilingRegion () { #if defined(KOKKOS_ENABLE_PROFILING) - if(Behavior::profilingRegionUseKokkosProfiling()) + if(Behavior::profilingRegionUseKokkosProfiling()) ::Kokkos::Profiling::popRegion(); -#endif +#endif } } // namespace Details diff --git 
a/packages/tpetra/core/src/Tpetra_Details_Profiling.hpp b/packages/tpetra/core/src/Tpetra_Details_Profiling.hpp index cef7251d8a08..a2512dd8904e 100644 --- a/packages/tpetra/core/src/Tpetra_Details_Profiling.hpp +++ b/packages/tpetra/core/src/Tpetra_Details_Profiling.hpp @@ -104,6 +104,9 @@ class ProfilingRegion { public: //! Open region to profile; name the region \c name. ProfilingRegion (const char name[]); + //! Open region to profile, if the group name \c group is enabled by the + //! TPETRA_TIMING variable; name the region \c name. + ProfilingRegion (const char name[], const char group[]); //! Close region to profile. ~ProfilingRegion (); diff --git a/packages/tpetra/core/src/Tpetra_Details_packCrsMatrix_def.hpp b/packages/tpetra/core/src/Tpetra_Details_packCrsMatrix_def.hpp index b5bbb73e182b..71477576e13c 100644 --- a/packages/tpetra/core/src/Tpetra_Details_packCrsMatrix_def.hpp +++ b/packages/tpetra/core/src/Tpetra_Details_packCrsMatrix_def.hpp @@ -49,6 +49,7 @@ #include "Tpetra_Details_getEntryOnHost.hpp" #include "Tpetra_Details_OrdinalTraits.hpp" #include "Tpetra_Details_PackTraits.hpp" +#include "Tpetra_Details_Profiling.hpp" #include "Tpetra_CrsMatrix_decl.hpp" #include #include @@ -733,6 +734,10 @@ packCrsMatrix (const CrsMatrix& sourceMatrix, const bool pack_pids, Distributor& /* dist */) { + ::Tpetra::Details::ProfilingRegion region_pack_crs_matrix( + "Tpetra::Details::PackCrsMatrixImpl::packCrsMatrix", + "Import/Export" + ); using Kokkos::View; typedef BufferDeviceType DT; typedef typename DT::execution_space execution_space; @@ -930,15 +935,14 @@ packCrsMatrix (const CrsMatrix& sourceMatrix, template void -packCrsMatrixNew (const CrsMatrix& sourceMatrix, - Kokkos::DualView::buffer_device_type>& exports, - const Kokkos::DualView::buffer_device_type>& numPacketsPerLID, - const Kokkos::DualView::buffer_device_type>& exportLIDs, - size_t& constantNumPackets, - Distributor& distor) +packCrsMatrixNew( + const CrsMatrix& sourceMatrix, + 
Kokkos::DualView::buffer_device_type>& exports, + const Kokkos::DualView::buffer_device_type>& numPacketsPerLID, + const Kokkos::DualView::buffer_device_type>& exportLIDs, + size_t& constantNumPackets, + Distributor& distor +) { using device_type = typename CrsMatrix::device_type; using buffer_device_type = typename DistObject::buffer_device_type; @@ -957,6 +961,10 @@ packCrsMatrixNew (const CrsMatrix& sourceMatrix, TEUCHOS_ASSERT( ! exportLIDs.need_sync_device () ); auto exportLIDs_d = exportLIDs.view_device (); + ::Tpetra::Details::ProfilingRegion region_pack_crs_matrix_new( + "Tpetra::Details::packCrsMatrixNew", + "Import/Export" + ); PackCrsMatrixImpl::packCrsMatrix ( sourceMatrix, exports, numPacketsPerLID_d, exportLIDs_d, exportPIDs_d, constantNumPackets, pack_pids, distor); diff --git a/packages/tpetra/core/src/Tpetra_Details_unpackCrsMatrixAndCombine_def.hpp b/packages/tpetra/core/src/Tpetra_Details_unpackCrsMatrixAndCombine_def.hpp index ac3e6ed9bcd9..908ca4a10000 100644 --- a/packages/tpetra/core/src/Tpetra_Details_unpackCrsMatrixAndCombine_def.hpp +++ b/packages/tpetra/core/src/Tpetra_Details_unpackCrsMatrixAndCombine_def.hpp @@ -43,6 +43,7 @@ #include "TpetraCore_config.h" #include "Teuchos_Array.hpp" #include "Teuchos_ArrayView.hpp" +#include "Teuchos_OrdinalTraits.hpp" #include "Tpetra_Details_castAwayConstDualView.hpp" #include "Tpetra_Details_computeOffsets.hpp" #include "Tpetra_Details_createMirrorView.hpp" @@ -107,7 +108,7 @@ unpackRow(const typename PackTraits::output_array_type& gids_out, const size_t offset, const size_t /* num_bytes */, const size_t num_ent, - const size_t num_bytes_per_value) + const size_t bytes_per_value) { if (num_ent == 0) { // Empty rows always take zero bytes, to ensure sparsity. 
@@ -128,7 +129,7 @@ unpackRow(const typename PackTraits::output_array_type& gids_out, size_t (0); const size_t vals_beg = gids_beg + gids_len + pids_len; - const size_t vals_len = num_ent * num_bytes_per_value; + const size_t vals_len = num_ent * bytes_per_value; const char* const num_ent_in = imports + num_ent_beg; const char* const gids_in = imports + gids_beg; @@ -199,12 +200,8 @@ struct UnpackCrsMatrixAndCombineFunctor { typedef Kokkos::View input_buffer_type; typedef Kokkos::View import_lids_type; - typedef Kokkos::View lids_scratch_type; - typedef Kokkos::View gids_scratch_type; - typedef Kokkos::View pids_scratch_type; - typedef Kokkos::View vals_scratch_type; - - typedef Kokkos::pair value_type; + typedef Kokkos::View error_type; + using member_type = typename Kokkos::TeamPolicy::member_type; static_assert (std::is_same::value, "LocalMap::local_ordinal_type and " @@ -215,17 +212,13 @@ struct UnpackCrsMatrixAndCombineFunctor { input_buffer_type imports; num_packets_per_lid_type num_packets_per_lid; import_lids_type import_lids; + Kokkos::View batch_info; offsets_type offsets; Tpetra::CombineMode combine_mode; - size_t max_num_ent; - bool unpack_pids; - size_t num_bytes_per_value; + size_t batch_size; + size_t bytes_per_value; bool atomic; - Kokkos::Experimental::UniqueToken tokens; - lids_scratch_type lids_scratch; - gids_scratch_type gids_scratch; - pids_scratch_type pids_scratch; - vals_scratch_type vals_scratch; + error_type error_code; UnpackCrsMatrixAndCombineFunctor( const local_matrix_type& local_matrix_in, @@ -233,172 +226,190 @@ struct UnpackCrsMatrixAndCombineFunctor { const input_buffer_type& imports_in, const num_packets_per_lid_type& num_packets_per_lid_in, const import_lids_type& import_lids_in, + const Kokkos::View& batch_info_in, const offsets_type& offsets_in, const Tpetra::CombineMode combine_mode_in, - const size_t max_num_ent_in, - const bool unpack_pids_in, - const size_t num_bytes_per_value_in, + const size_t batch_size_in, + const 
size_t bytes_per_value_in, const bool atomic_in) : local_matrix (local_matrix_in), local_col_map (local_col_map_in), imports (imports_in), num_packets_per_lid (num_packets_per_lid_in), import_lids (import_lids_in), + batch_info (batch_info_in), offsets (offsets_in), combine_mode (combine_mode_in), - max_num_ent (max_num_ent_in), - unpack_pids (unpack_pids_in), - num_bytes_per_value (num_bytes_per_value_in), + batch_size (batch_size_in), + bytes_per_value (bytes_per_value_in), atomic (atomic_in), - tokens (XS()), - lids_scratch (Kokkos::view_alloc("lids_scratch", Kokkos::WithoutInitializing), tokens.size() * max_num_ent), - gids_scratch (Kokkos::view_alloc("gids_scratch", Kokkos::WithoutInitializing), tokens.size() * max_num_ent), - pids_scratch (Kokkos::view_alloc("pids_scratch", Kokkos::WithoutInitializing), tokens.size() * max_num_ent), - vals_scratch (Kokkos::view_alloc("vals_scratch", Kokkos::WithoutInitializing), tokens.size() * max_num_ent) + error_code("error") {} - KOKKOS_INLINE_FUNCTION void init(value_type& dst) const - { - using Tpetra::Details::OrdinalTraits; - dst = Kokkos::make_pair (0, OrdinalTraits::invalid ()); - } - - KOKKOS_INLINE_FUNCTION void - join (volatile value_type& dst, const volatile value_type& src) const - { - // `dst` should reflect the first (least) bad index and - // all other associated error codes and data. Thus, we need only - // check if the `src` object shows an error and if its associated - // bad index is less than `dst`'s bad index. - using Tpetra::Details::OrdinalTraits; - if (src.second != OrdinalTraits::invalid ()) { - // An error in the src; check if - // 1. `dst` shows errors - // 2. 
If `dst` does show errors, if src's bad index is less than - // *this' bad index - if (dst.second == OrdinalTraits::invalid () || - src.second < dst.second) { - dst = src; - } - } - } - KOKKOS_INLINE_FUNCTION - void operator()(const LO i, value_type& dst) const + void operator()(member_type team_member) const { using Kokkos::View; using Kokkos::subview; using Kokkos::MemoryUnmanaged; - using size_type = typename XS::size_type; - using slice = Kokkos::pair; - typedef View lids_out_type; - typedef View pids_out_type; - typedef View gids_out_type; - typedef View vals_out_type; + const LO batch = team_member.league_rank(); + const LO lid_no = batch_info(batch, 0); + const LO batch_no = batch_info(batch, 1); - const size_t num_bytes = num_packets_per_lid(i); + const size_t num_bytes = num_packets_per_lid(lid_no); // Only unpack data if there is a nonzero number of bytes. - if (num_bytes == 0) { + if (num_bytes == 0) return; - } // there is actually something in the row - const LO import_lid = import_lids[i]; + const LO import_lid = import_lids(lid_no); const size_t buf_size = imports.size(); - const size_t offset = offsets(i); + const size_t offset = offsets(lid_no); // Get the number of entries to expect in the received data for this row. 
LO num_ent_LO = 0; - const char* const in_buf = imports.data () + offset; - (void) PackTraits::unpackValue (num_ent_LO, in_buf); - const size_t num_ent = static_cast (num_ent_LO); + const char* const in_buf = imports.data() + offset; + (void) PackTraits::unpackValue(num_ent_LO, in_buf); + const size_t num_entries_in_row = static_cast(num_ent_LO); // Count the number of bytes expected to unpack size_t expected_num_bytes = 0; { - expected_num_bytes += PackTraits::packValueCount (LO (0)); - expected_num_bytes += num_ent * PackTraits::packValueCount (GO (0)); - if (unpack_pids) { - expected_num_bytes += num_ent * PackTraits::packValueCount (int (0)); - } - expected_num_bytes += num_ent * PackTraits::packValueCount (ST ()); + expected_num_bytes += PackTraits::packValueCount(LO(0)); + expected_num_bytes += num_entries_in_row * PackTraits::packValueCount(GO(0)); + expected_num_bytes += num_entries_in_row * PackTraits::packValueCount(ST()); } - if (expected_num_bytes > num_bytes) { - dst = Kokkos::make_pair (1, i); // wrong number of bytes + if (expected_num_bytes > num_bytes) + { + printf( + "*** Error: UnpackCrsMatrixAndCombineFunctor: " + "At row %d, the expected number of bytes (%d) != number of unpacked bytes (%d)\n", + (int) lid_no, (int) expected_num_bytes, (int) num_bytes + ); + Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 21); return; } - if (offset > buf_size || offset + num_bytes > buf_size) { - dst = Kokkos::make_pair (2, i); // out of bounds + if (offset > buf_size || offset + num_bytes > buf_size) + { + printf( + "*** Error: UnpackCrsMatrixAndCombineFunctor: " + "At row %d, the offset (%d) > buffer size (%d)\n", + (int) lid_no, (int) offset, (int) buf_size + ); + Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 22); return; } - // Get subviews in to the scratch arrays. The token returned from acquire - // is an integer in [0, tokens.size()). It is used to grab a unique (to - // this thread) subview of the scratch arrays. 
- const size_type token = tokens.acquire(); - const size_t a = static_cast(token) * max_num_ent; - const size_t b = a + num_ent; - lids_out_type lids_out = subview(lids_scratch, slice(a, b)); - gids_out_type gids_out = subview(gids_scratch, slice(a, b)); - pids_out_type pids_out = subview(pids_scratch, slice(a, (unpack_pids ? b : a))); - vals_out_type vals_out = subview(vals_scratch, slice(a, b)); - - // Unpack this row! - int unpack_err = - unpackRow(gids_out, pids_out, vals_out, - imports.data(), offset, num_bytes, - num_ent, num_bytes_per_value); - if (unpack_err != 0) { - dst = Kokkos::make_pair (unpack_err, i); // unpack error - tokens.release (token); - return; + // Determine the number of entries to unpack in this batch + size_t num_entries_in_batch = 0; + if (num_entries_in_row <= batch_size) + num_entries_in_batch = num_entries_in_row; + else if (num_entries_in_row >= (batch_no + 1) * batch_size) + num_entries_in_batch = batch_size; + else + num_entries_in_batch = num_entries_in_row - batch_no * batch_size; + + const size_t bytes_per_lid = PackTraits::packValueCount(LO(0)); + const size_t num_ent_start = offset; + const size_t num_ent_end = num_ent_start + bytes_per_lid; + + const size_t bytes_per_gid = PackTraits::packValueCount(GO(0)); + const size_t gids_start = num_ent_end; + const size_t gids_end = gids_start + num_entries_in_row * bytes_per_gid; + + const size_t vals_start = gids_end; + + const size_t shift = batch_no * batch_size; + const char* const num_ent_in = imports.data() + num_ent_start; + const char* const gids_in = imports.data() + gids_start + shift * bytes_per_gid; + const char* const vals_in = imports.data() + vals_start + shift * bytes_per_value; + + LO num_ent_out; + (void)PackTraits::unpackValue(num_ent_out, num_ent_in); + if (static_cast(num_ent_out) != num_entries_in_row) + { + printf( + "*** Error: UnpackCrsMatrixAndCombineFunctor: " + "At row %d, number of entries (%d) != number of entries unpacked (%d)\n", + (int) lid_no, (int) 
num_entries_in_row, (int) num_ent_out + ); + Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 23); } - // Column indices come in as global indices, in case the - // source object's column Map differs from the target object's - // (this's) column Map, and must be converted local index values - for (size_t k = 0; k < num_ent; ++k) { - lids_out(k) = local_col_map.getLocalElement (gids_out(k)); - } + constexpr bool matrix_has_sorted_rows = true; // see #6282 + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team_member, num_entries_in_batch), + KOKKOS_LAMBDA(const LO& j) + { + size_t distance = 0; + + GO gid_out; + distance = j * bytes_per_gid; + (void) PackTraits::unpackValue(gid_out, gids_in + distance); + auto lid_out = local_col_map.getLocalElement(gid_out); + + // Column indices come in as global indices, in case the + // source object's column Map differs from the target object's + // (this's) column Map, and must be converted local index values + + // assume that ST is default constructible + ST val_out; + distance = j * bytes_per_value; + (void) PackTraits::unpackValue(val_out, vals_in + distance); + + if (combine_mode == ADD) { + // NOTE (mfh 20 Nov 2019) Must assume atomic is required, unless + // different threads don't touch the same row (i.e., no + // duplicates in incoming LIDs list). + const bool use_atomic_updates = atomic; + (void)local_matrix.sumIntoValues( + import_lid, + &lid_out, + 1, + &val_out, + matrix_has_sorted_rows, + use_atomic_updates + ); + } else if (combine_mode == REPLACE) { + // NOTE (mfh 20 Nov 2019): It's never correct to use REPLACE + // combine mode with multiple incoming rows that touch the same + // target matrix entries, so we never need atomic updates. 
+ const bool use_atomic_updates = false; + (void)local_matrix.replaceValues( + import_lid, + &lid_out, + 1, + &val_out, + matrix_has_sorted_rows, + use_atomic_updates + ); + } else { + // should never get here + printf( + "*** Error: UnpackCrsMatrixAndCombineFunctor: " + "At row %d, an unknown error occurred during unpack\n", (int) lid_no + ); + Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 31); + } + } + ); - // Combine the values according to the combine_mode - const LO* const lids_raw = const_cast (lids_out.data ()); - const ST* const vals_raw = const_cast (vals_out.data ()); - LO num_modified = 0; + team_member.team_barrier(); - constexpr bool matrix_has_sorted_rows = true; // see #6282 - if (combine_mode == ADD) { - // NOTE (mfh 20 Nov 2019) Must assume atomic is required, unless - // different threads don't touch the same row (i.e., no - // duplicates in incoming LIDs list). - const bool use_atomic_updates = atomic; - num_modified += - local_matrix.sumIntoValues (import_lid, lids_raw, num_ent, - vals_raw, matrix_has_sorted_rows, - use_atomic_updates); - } - else if (combine_mode == REPLACE) { - // NOTE (mfh 20 Nov 2019): It's never correct to use REPLACE - // combine mode with multiple incoming rows that touch the same - // target matrix entries, so we never need atomic updates. - const bool use_atomic_updates = false; - num_modified += - local_matrix.replaceValues (import_lid, lids_raw, num_ent, - vals_raw, matrix_has_sorted_rows, - use_atomic_updates); - } - else { - dst = Kokkos::make_pair (4, i); // invalid combine mode - tokens.release (token); - return; - } + } - tokens.release (token); + //! Host function for getting the error. 
+ int error() const { + auto error_code_h = Kokkos::create_mirror_view_and_copy( + Kokkos::HostSpace(), error_code + ); + return error_code_h(); } + }; struct MaxNumEntTag {}; @@ -423,8 +434,6 @@ class NumEntriesFunctor { typedef size_t value_type; private: - typedef Kokkos::pair slice; - num_packets_per_lid_type num_packets_per_lid; offsets_type offsets; input_buffer_type imports; @@ -529,6 +538,53 @@ compute_total_num_entries ( return tot_num_ent; } +template +KOKKOS_INLINE_FUNCTION +size_t +unpackRowCount(const char imports[], + const size_t offset, + const size_t num_bytes) +{ + using PT = PackTraits; + + LO num_ent_LO = 0; + if (num_bytes > 0) { + const size_t p_num_bytes = PT::packValueCount(num_ent_LO); + if (p_num_bytes > num_bytes) { + return OrdinalTraits::invalid(); + } + const char* const in_buf = imports + offset; + (void) PT::unpackValue(num_ent_LO, in_buf); + } + return static_cast(num_ent_LO); +} + +/// \brief Compute the index and batch number associated with each batch +/// +/// batch_info(i, 0) is the local index of the ith batch +/// batch_info(i, 1) is the local batch number of the ith batch +template +inline +bool +compute_batch_info( + const View1& batches_per_lid, + View2& batch_info +) +{ + using LO = typename View2::value_type; + size_t batch = 0; + for (size_t i=0; i(i); + batch_info(batch, 1) = batch_no; + batch++; + } + } + return batch == batch_info.extent(0); +} + /// \brief Perform the unpack operation for the matrix /// /// \tparam LocalMatrix the specialization of the KokkosSparse::CrsMatrix @@ -544,18 +600,12 @@ unpackAndCombineIntoCrsMatrix( const Kokkos::View& imports, const Kokkos::View& num_packets_per_lid, const typename PackTraits::input_array_type import_lids, - const Tpetra::CombineMode combine_mode, - const bool unpack_pids) + const Tpetra::CombineMode combine_mode) { using ST = typename LocalMatrix::value_type; using LO = typename LocalMap::local_ordinal_type; using DT = typename LocalMap::device_type; using XS = 
typename DT::execution_space; - using range_policy = - Kokkos::RangePolicy >; - using unpack_functor_type = - UnpackCrsMatrixAndCombineFunctor; const char prefix[] = "Tpetra::Details::UnpackAndCombineCrsMatrixImpl::" "unpackAndCombineIntoCrsMatrix: "; @@ -599,28 +649,85 @@ unpackAndCombineIntoCrsMatrix( Kokkos::View offsets("offsets", num_import_lids+1); computeOffsetsFromCounts(offsets, num_packets_per_lid); - // Determine the maximum number of entries in any row in the matrix. The - // maximum number of entries is needed to allocate unpack buffers on the - // device. - size_t max_num_ent = compute_maximum_num_entries( - num_packets_per_lid, offsets, imports); + // Determine the sizes of the unpack batches + size_t max_num_ent = compute_maximum_num_entries(num_packets_per_lid, offsets, imports); + const size_t default_batch_size = Tpetra::Details::Behavior::hierarchicalUnpackBatchSize(); + const size_t batch_size = std::min(default_batch_size, max_num_ent); + + // To achieve some balance amongst threads, unpack each row in equal size batches + size_t num_batches = 0; + Kokkos::View batch_info("", num_batches); + Kokkos::View batches_per_lid("", num_import_lids); + // Compute meta data that allows batch unpacking + Kokkos::parallel_reduce( + Kokkos::RangePolicy>(0, num_import_lids), + KOKKOS_LAMBDA(const size_t i, size_t& batches) + { + const size_t num_entries_in_row = unpackRowCount( + imports.data(), offsets(i), num_packets_per_lid(i) + ); + batches_per_lid(i) = + (num_entries_in_row <= batch_size) ? 
+ 1 : + num_entries_in_row / batch_size + (num_entries_in_row % batch_size != 0); + batches += batches_per_lid(i); + }, + num_batches + ); + Kokkos::resize(batch_info, num_batches); + + Kokkos::HostSpace host_space; + auto batches_per_lid_h = Kokkos::create_mirror_view(host_space, batches_per_lid); + Kokkos::deep_copy(batches_per_lid_h, batches_per_lid); + + auto batch_info_h = Kokkos::create_mirror_view(host_space, batch_info); + + (void) compute_batch_info(batches_per_lid_h, batch_info_h); + Kokkos::deep_copy(batch_info, batch_info_h); // FIXME (TJF SEP 2017) // The scalar type is not necessarily default constructible - size_t num_bytes_per_value = PackTraits::packValueCount(ST()); + size_t bytes_per_value = PackTraits::packValueCount(ST()); // Now do the actual unpack! const bool atomic = XS::concurrency() != 1; - unpack_functor_type f(local_matrix, local_map, - imports, num_packets_per_lid, import_lids, offsets, combine_mode, - max_num_ent, unpack_pids, num_bytes_per_value, atomic); - - typename unpack_functor_type::value_type x; - Kokkos::parallel_reduce(range_policy(0, static_cast(num_import_lids)), f, x); - auto x_h = x.to_std_pair(); - TEUCHOS_TEST_FOR_EXCEPTION(x_h.first != 0, std::runtime_error, - prefix << "UnpackCrsMatrixAndCombineFunctor reported error code " - << x_h.first << " for the first bad row " << x_h.second); + using functor = UnpackCrsMatrixAndCombineFunctor; + functor f( + local_matrix, + local_map, + imports, + num_packets_per_lid, + import_lids, + batch_info, + offsets, + combine_mode, + batch_size, + bytes_per_value, + atomic + ); + + using policy = Kokkos::TeamPolicy>; + const size_t team_size = Tpetra::Details::Behavior::hierarchicalUnpackTeamSize(); +#if defined(KOKKOS_ENABLE_CUDA) + constexpr bool is_cuda = std::is_same::value; +#else + constexpr bool is_cuda = false; +#endif + if (!is_cuda || team_size == Teuchos::OrdinalTraits::invalid()) + { + Kokkos::parallel_for(policy(static_cast(num_batches), Kokkos::AUTO), f); + } + else + { + 
Kokkos::parallel_for(policy(static_cast(num_batches), static_cast(team_size)), f); + } + + auto error_code = f.error(); + TEUCHOS_TEST_FOR_EXCEPTION( + error_code != 0, + std::runtime_error, + prefix << "UnpackCrsMatrixAndCombineFunctor reported error code " << error_code + ); } template @@ -681,27 +788,6 @@ unpackAndCombineWithOwningPIDsCount( return count; } -template -KOKKOS_INLINE_FUNCTION -size_t -unpackRowCount(const char imports[], - const size_t offset, - const size_t num_bytes) -{ - using PT = PackTraits; - - LO num_ent_LO = 0; - if (num_bytes > 0) { - const size_t p_num_bytes = PT::packValueCount(num_ent_LO); - if (p_num_bytes > num_bytes) { - return OrdinalTraits::invalid(); - } - const char* const in_buf = imports + offset; - (void) PT::unpackValue(num_ent_LO, in_buf); - } - return static_cast(num_ent_LO); -} - /// \brief Setup row pointers for remotes template int @@ -868,7 +954,7 @@ unpackAndCombineIntoCrsArrays2( const LocalMatrix& /* local_matrix */, const LocalMap /*& local_col_map*/, const int my_pid, - const size_t num_bytes_per_value) + const size_t bytes_per_value) { using Kokkos::View; using Kokkos::subview; @@ -920,7 +1006,7 @@ unpackAndCombineIntoCrsArrays2( k_error += unpackRow(gids_out, pids_out, vals_out, imports.data(), offset, num_bytes, - num_ent, num_bytes_per_value); + num_ent, bytes_per_value); // Correct target PIDs. 
for (size_t j = 0; j < static_cast(num_ent); ++j) { @@ -951,7 +1037,7 @@ unpackAndCombineIntoCrsArrays( const size_t tgt_num_rows, const size_t tgt_num_nonzeros, const int my_tgt_pid, - const size_t num_bytes_per_value) + const size_t bytes_per_value) { using Kokkos::View; using Kokkos::subview; @@ -1048,7 +1134,7 @@ unpackAndCombineIntoCrsArrays( int unpack_err = unpackAndCombineIntoCrsArrays2(tgt_colind, tgt_pids, tgt_vals, new_start_row, offsets, import_lids, imports, num_packets_per_lid, - local_matrix, local_col_map, my_pid, num_bytes_per_value); + local_matrix, local_col_map, my_pid, bytes_per_value); TEUCHOS_TEST_FOR_EXCEPTION( unpack_err != 0, std::logic_error, prefix << "unpack loop failed. This " "should never happen. Please report this bug to the Tpetra developers."); @@ -1138,10 +1224,18 @@ unpackCrsMatrixAndCombine( auto local_matrix = sourceMatrix.getLocalMatrix(); auto local_col_map = sourceMatrix.getColMap()->getLocalMap(); + for (int i=0; i A_indices; + Teuchos::ArrayView A_values; + sourceMatrix.getLocalRowView(lclRow, A_indices, A_values); + } // Now do the actual unpack! 
UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix( local_matrix, local_col_map, imports_d, num_packets_per_lid_d, - import_lids_d, combineMode, false); + import_lids_d, combineMode); + } template @@ -1187,13 +1281,12 @@ unpackCrsMatrixAndCombineNew( auto local_col_map = sourceMatrix.getColMap ()->getLocalMap (); typedef decltype (local_col_map) local_map_type; - const bool unpack_pids = false; UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix< local_matrix_type, local_map_type, buffer_device_type > (local_matrix, local_col_map, imports_d, num_packets_per_lid_d, - import_lids_d, combineMode, unpack_pids); + import_lids_d, combineMode); } /// \brief Special version of Tpetra::Details::unpackCrsMatrixAndCombine @@ -1453,10 +1546,10 @@ unpackAndCombineIntoCrsArrays ( create_mirror_view_from_raw_host_array(outputDevice, TargetPids.getRawPtr(), TargetPids.size(), true, "tgt_pids"); - size_t num_bytes_per_value = 0; + size_t bytes_per_value = 0; if (PackTraits::compileTimeSize) { // assume that ST is default constructible - num_bytes_per_value = PackTraits::packValueCount(ST()); + bytes_per_value = PackTraits::packValueCount(ST()); } else { // Since the packed data come from the source matrix, we can use the source @@ -1467,18 +1560,18 @@ unpackAndCombineIntoCrsArrays ( // a Scalar value is. Of course, if no processes have any entries, then no // values should be packed (though this does assume that in our packing // scheme, rows with zero entries take zero bytes). 
- size_t num_bytes_per_value_l = 0; + size_t bytes_per_value_l = 0; if (local_matrix.values.extent(0) > 0) { const ST& val = local_matrix.values(0); - num_bytes_per_value_l = PackTraits::packValueCount(val); + bytes_per_value_l = PackTraits::packValueCount(val); } else { const ST& val = crs_vals_d(0); - num_bytes_per_value_l = PackTraits::packValueCount(val); + bytes_per_value_l = PackTraits::packValueCount(val); } Teuchos::reduceAll(*(sourceMatrix.getComm()), Teuchos::REDUCE_MAX, - num_bytes_per_value_l, - outArg(num_bytes_per_value)); + bytes_per_value_l, + outArg(bytes_per_value)); } #ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE @@ -1494,7 +1587,7 @@ unpackAndCombineIntoCrsArrays ( num_packets_per_lid_d, permute_to_lids_d, permute_from_lids_d, crs_rowptr_d, crs_colind_d, crs_vals_d, src_pids_d, tgt_pids_d, numSameIDs, TargetNumRows, TargetNumNonzeros, MyTargetPID, - num_bytes_per_value); + bytes_per_value); // Copy outputs back to host typename decltype(crs_rowptr_d)::HostMirror crs_rowptr_h( diff --git a/packages/tpetra/core/test/CrsGraph/CrsGraph_PackUnpack.cpp b/packages/tpetra/core/test/CrsGraph/CrsGraph_PackUnpack.cpp index 52d331374e21..09f14dc46622 100644 --- a/packages/tpetra/core/test/CrsGraph/CrsGraph_PackUnpack.cpp +++ b/packages/tpetra/core/test/CrsGraph/CrsGraph_PackUnpack.cpp @@ -139,8 +139,6 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(CrsGraph, PackThenUnpackAndCombine, LO, GO, NT { typedef Tpetra::CrsGraph crs_graph_type; typedef typename crs_graph_type::packet_type packet_type; - typedef typename NT::device_type device_type; - typedef typename device_type::execution_space execution_space; int lclSuccess = 1; // to be revised below int gblSuccess = 0; // output argument @@ -230,6 +228,8 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(CrsGraph, PackThenUnpackAndCombine, LO, GO, NT // compare graph values. Thus, we need to do a fence before // comparing graph values, in order to ensure that changes made on // device are visible on host. 
+ using device_type = typename NT::device_type; + using execution_space = typename device_type::execution_space; execution_space().fence (); int lclNumErrors = 0; diff --git a/packages/tpetra/core/test/CrsGraph/CrsGraph_UnpackIntoStaticGraph.cpp b/packages/tpetra/core/test/CrsGraph/CrsGraph_UnpackIntoStaticGraph.cpp index e72d7ad42c72..ed27dfdc6b54 100644 --- a/packages/tpetra/core/test/CrsGraph/CrsGraph_UnpackIntoStaticGraph.cpp +++ b/packages/tpetra/core/test/CrsGraph/CrsGraph_UnpackIntoStaticGraph.cpp @@ -89,8 +89,6 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(CrsGraph, PackThenUnpackAndCombine, LO, GO, NT using map_type = Tpetra::Map; using graph_type = Tpetra::CrsGraph; using packet_type = typename graph_type::packet_type; - using device_type = typename NT::device_type; - using execution_space = typename device_type::execution_space; // using import_type = Tpetra::Import; auto comm = getDefaultComm(); @@ -201,6 +199,8 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(CrsGraph, PackThenUnpackAndCombine, LO, GO, NT // comparing graph values, in order to ensure that changes made on // device are visible on host. 
A->fillComplete(); + using device_type = typename NT::device_type; + using execution_space = typename device_type::execution_space; execution_space().fence (); auto loc_num_errs = 0; diff --git a/packages/tpetra/core/test/CrsMatrix/CrsMatrix_PackUnpack.cpp b/packages/tpetra/core/test/CrsMatrix/CrsMatrix_PackUnpack.cpp index a9fde1aa4f66..6121f66f01fe 100644 --- a/packages/tpetra/core/test/CrsMatrix/CrsMatrix_PackUnpack.cpp +++ b/packages/tpetra/core/test/CrsMatrix/CrsMatrix_PackUnpack.cpp @@ -208,7 +208,7 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CrsMatrix, PackThenUnpackAndCombine, SC, LO, G out << "Building second matrix" << endl; auto graph = A->getCrsGraph(); RCP B (new crs_matrix_type (graph)); - B->setAllToScalar(SC {}); + B->setAllToScalar(SC{-1.}); B->fillComplete(); out << "Calling unpackCrsMatrixAndCombine with " @@ -273,6 +273,12 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CrsMatrix, PackThenUnpackAndCombine, SC, LO, G << "B[" << i << "]=" << B_values[i] << "!\n"; ++curNumErrors; } + else + { + errStrm << "INFO: Proc " << world_rank << ", row " << lclRow + << ", A[" << i << "]=" << A_values[i] << ", and " + << "B[" << i << "]=" << B_values[i] << "!\n"; + } } lclNumErrors += curNumErrors; } @@ -349,6 +355,9 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CrsMatrix, PackThenUnpackAndCombine, SC, LO, G ArrayView B_indices; ArrayView B_values; B->getLocalRowView(loc_row, B_indices, B_values); +// std::cout << "A_values: " << A_values << "\n"; +// std::cout << "B_values: " << B_values << "\n"; +// std::cout << std::flush; TEST_EQUALITY( A_indices.size (), B_indices.size () ); diff --git a/packages/tpetra/core/test/CrsMatrix/UnpackMerge.cpp b/packages/tpetra/core/test/CrsMatrix/UnpackMerge.cpp index d8c8e160c571..7a6ad471bad3 100644 --- a/packages/tpetra/core/test/CrsMatrix/UnpackMerge.cpp +++ b/packages/tpetra/core/test/CrsMatrix/UnpackMerge.cpp @@ -80,7 +80,6 @@ namespace { // (anonymous) using crs_matrix_type = Tpetra::CrsMatrix; using import_type = Tpetra::Import; using map_type 
= Tpetra::Map; - using STS = Teuchos::ScalarTraits; RCP > comm = getDefaultComm(); const int myRank = comm->getRank(); @@ -225,7 +224,6 @@ namespace { // (anonymous) using crs_matrix_type = Tpetra::CrsMatrix; using import_type = Tpetra::Import; using map_type = Tpetra::Map; - using STS = Teuchos::ScalarTraits; int lclSuccess = 1; int gblSuccess = 0; diff --git a/packages/tpetra/core/test/ImportExport/CMakeLists.txt b/packages/tpetra/core/test/ImportExport/CMakeLists.txt index 7294a475846e..875c601f1dce 100644 --- a/packages/tpetra/core/test/ImportExport/CMakeLists.txt +++ b/packages/tpetra/core/test/ImportExport/CMakeLists.txt @@ -49,6 +49,15 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST( STANDARD_PASS_OUTPUT ) +TRIBITS_ADD_EXECUTABLE_AND_TEST( + UnpackLongRows + SOURCES + UnpackLongRows.cpp + COMM mpi + NUM_MPI_PROCS 4 + STANDARD_PASS_OUTPUT +) + IF (${PROJECT_NAME}_ENABLE_Epetra) IF(NOT Trilinos_NO_32BIT_GLOBAL_INDICES AND Tpetra_INST_INT_INT) # Tpetra bug 5430: diff --git a/packages/tpetra/core/test/ImportExport/UnpackLongRows.cpp b/packages/tpetra/core/test/ImportExport/UnpackLongRows.cpp new file mode 100644 index 000000000000..423184a872f2 --- /dev/null +++ b/packages/tpetra/core/test/ImportExport/UnpackLongRows.cpp @@ -0,0 +1,727 @@ +/* +// @HEADER +// *********************************************************************** +// +// Tpetra: Templated Linear Algebra Services Package +// Copyright (2008) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// ************************************************************************ +// @HEADER +*/ +#include + +#include "Kokkos_Core.hpp" + +#include "Tpetra_TestingUtilities.hpp" +#include "TpetraCore_ETIHelperMacros.h" + +#include "Tpetra_Map.hpp" +#include "Tpetra_Core.hpp" +#include "Tpetra_CrsMatrix.hpp" +#include "Tpetra_Distributor.hpp" + +#include "Teuchos_CommHelpers.hpp" +#include +#include +#include +#include + +#define INFO(X) std::cout << "==> info [" << comm->getRank() << "/" << comm->getSize() << "]: " << X << std::flush + +namespace { // anonymous + +// Create a crs matrix with a Poisson-like structure and extra dense rows at its +// end. Add column entries to each row so that the matrix remains symmetric. 
+// +// The final matrix will have the following form +// +// 4 -1 0 -1 0 ... 0 1 ... 1 +// -1 4 -1 0 -1 0 ... 0 1 ... 1 +// 0 -1 4 -1 0 -1 0 ... 0 1 ... 1 +// -1 0 -1 4 -1 0 -1 0 ... 0 1 ... 1 +// . . . . +// . . . . +// . . . . +// 0 ... -1 0 -1 4 -1 0 -1 1 ... 1 +// 0 ... -1 0 -1 4 -1 0 1 ... 1 +// 0 ... -1 0 -1 4 -1 1 ... 1 +// 0 ... -1 0 -1 4 1 ... 1 +// 1 1 ... 1 +// . . . +// . . . +// 1 1 ... 1 +// +// The final matrix is distributed evenly over processors in the communicator, +// but constructed in an arbitrary way such that each rank contributes to an +// overlap region on the rank's boundary and to the dense region. +// +// This test is constructed as such to isolate slow downs in export operations +// experienced by Aria when constructing a linear system that has a Poisson +// structure with additional dense rows. +template +void +generate_graphs( + Teuchos::RCP> const &comm, + Teuchos::RCP& owned, + Teuchos::RCP& shared, + int const rows_per_rank, + int const overlap, + int const dense_rows +) +{ + + using Teuchos::rcp; + using Teuchos::RCP; + using Teuchos::Array; + using map_type = typename graph_type::map_type; + using GO = typename graph_type::global_ordinal_type; + + auto const rank = comm->getRank(); + auto const procs = comm->getSize(); + + using gsize_t = Tpetra::global_size_t; + const gsize_t global_rows = rows_per_rank * procs + dense_rows; + + INFO("GENERATING GRAPHS\n"); + // one-to-one map for entries on my rank + RCP owned_map; + { + Array indices((rank != procs - 1) ? 
rows_per_rank : rows_per_rank + dense_rows); + GO start = rank * rows_per_rank; + std::iota(indices.begin(), indices.end(), start); + owned_map = rcp(new map_type(global_rows, indices(), 0, comm)); + } + + // overlapping map for shared entries + RCP shared_map; + { + Array indices; + if (rank == 0) + { + indices.resize(overlap + dense_rows); + // overlap in to rank 1 + std::iota(indices.begin(), indices.begin()+overlap, GO(rows_per_rank)); + // dense rows at end of matrix + std::iota(indices.begin()+overlap, indices.end(), GO(procs * rows_per_rank)); + } + else if (rank == procs - 1) + { + indices.resize(overlap); + // overlap in to rank (procs - 1) + std::iota(indices.begin(), indices.end(), GO(rank * rows_per_rank - overlap)); + } + else + { + indices.resize(2 * overlap + dense_rows); + // overlap in to rank (rank - 1) + std::iota(indices.begin(), indices.begin()+overlap, GO(rank * rows_per_rank - overlap)); + // overlap in to rank (rank + 1) + std::iota(indices.begin()+overlap, indices.begin()+2*overlap, GO((rank+1) * rows_per_rank)); + // dense rows at end of matrix + std::iota(indices.begin()+2*overlap, indices.end(), GO(procs * rows_per_rank)); + } + auto invalid = Teuchos::OrdinalTraits::invalid(); + shared_map = rcp(new map_type(invalid, indices(), 0, comm)); + } + + owned = rcp(new graph_type(owned_map, rows_per_rank + dense_rows, Tpetra::StaticProfile)); + shared = rcp(new graph_type(shared_map, rows_per_rank + dense_rows, Tpetra::StaticProfile)); + + { + using Teuchos::tuple; + auto rows_to_fill = GO(rows_per_rank + overlap); + if (rank > 0 && rank < procs - 1) rows_to_fill += overlap; + auto start = (rank == 0) ? 
GO(0) : GO(rank * rows_per_rank - overlap); + for (GO row=start; row columns; + + if (row == 0) + { + // [4, -1, 0, -1] + auto my_cols = tuple(row, row + 1, row + 3); + columns.assign(my_cols.begin(), my_cols.end()); + } + else if (row == 1 || row == 2) + { + // 1: [-1, 4, -1, 0, -1] + // 2: [0, -1, 4, -1, 0, -1] + auto my_cols = tuple(row - 1, row, row + 1, row + 3); + columns.assign(my_cols.begin(), my_cols.end()); + } + else if ( + gsize_t(row) == global_rows - 3 - dense_rows || + gsize_t(row) == global_rows - 2 - dense_rows + ) + { + // -3: [-1, 0, -1, 4, -1, 0] + // -2: [-1, 0, -1, 4, -1] + auto my_cols = tuple(row - 3, row - 1, row, row + 1); + columns.assign(my_cols.begin(), my_cols.end()); + } + else if (gsize_t(row) == global_rows - 1 - dense_rows) + { + // [-1, 0, -1, 4] + auto my_cols = tuple(row - 3, row - 1, row); + columns.assign(my_cols.begin(), my_cols.end()); + } + else + { + // [-1, 0, -1, 4, -1, 0, -1] + auto my_cols = tuple(row - 3, row - 1, row, row + 1, row + 3); + columns.assign(my_cols.begin(), my_cols.end()); + } + + // Fill in columns at end of row associated with the extra dense rows to + // assure symmetry of the final matrix + for (int i=0; iisNodeGlobalElement(row)) + { + owned->insertGlobalIndices(row, columns()); + } + else if (shared_map->isNodeGlobalElement(row)) + { + shared->insertGlobalIndices(row, columns()); + } + else + { + TEUCHOS_TEST_FOR_EXCEPTION( + true, + std::logic_error, + "Row " << row << " is not owned by anyone!" 
+ ); + } + } + } + + { + for (int i=0; i columns(n); + std::iota(columns.begin(), columns.end(), GO(rank * rows_per_rank)); + if (rank == procs - 1) + { + TEUCHOS_TEST_FOR_EXCEPTION( + !owned_map->isNodeGlobalElement(row), + std::logic_error, + "==> Error [" << rank << "/" << procs << "]: " << + "the global row " << row << " is not in this owned map\n" << + owned_map->getNodeElementList() + ); + owned->insertGlobalIndices(row, columns()); + } + else + { + TEUCHOS_TEST_FOR_EXCEPTION( + !shared_map->isNodeGlobalElement(row), + std::logic_error, + "==> Error [" << rank << "/" << procs << "]: " << + "the global row " << row << " is not in this shared map\n" << + shared_map->getNodeElementList() + ); + shared->insertGlobalIndices(row, columns()); + } + } + } + shared->fillComplete(); + comm->barrier(); + + // We've created a sparse matrix containing owned indices and another + // containing shared. The owned matrix has one-to-one map, while the shared + // matrix is overlapping. Here, export the entries from the shared matrix in + // to the owned matrix. + // + // Since only the target Map is one-to-one, we have to use an Export. 
+ { + using export_type = typename graph_type::export_type; + export_type exporter(shared_map, owned_map); + comm->barrier(); + + owned->doExport(*shared, exporter, Tpetra::ADD); + } + owned->fillComplete(); + + INFO("GENERATING GRAPHS DONE\n"); +} + + +template +Teuchos::RCP +generate_matrix( + Teuchos::RCP> const &comm, + Teuchos::RCP const &g_owned, + Teuchos::RCP const &g_shared, + int const rows_per_rank, + int const overlap, + int const dense_rows +) +{ + + using Teuchos::rcp; + using Teuchos::RCP; + using Teuchos::Time; + using Teuchos::Array; + using Teuchos::TimeMonitor; + using real = typename matrix_type::scalar_type; + using GO = typename matrix_type::global_ordinal_type; + using LO = typename matrix_type::local_ordinal_type; + + auto const rank = comm->getRank(); + auto const procs = comm->getSize(); + + INFO("GENERATING MATRIX\n"); + using gsize_t = Tpetra::global_size_t; + const gsize_t global_rows = rows_per_rank * procs + dense_rows; + + const real mone = static_cast(-1.0); + const real one = static_cast(1.0); + const real four = static_cast(4.0); + + // one-to-one map for entries on my rank + auto owned_map = g_owned->getRowMap(); + auto mtx_owned = rcp(new matrix_type(g_owned)); + + auto shared_map = g_shared->getRowMap(); + auto mtx_shared = rcp(new matrix_type(g_shared)); + INFO("MATRIX CREATED\n"); + + { + INFO("FILLING MATRIX\n"); + RCP