Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
16a8105
Add TPETRA_TIMING behavior variable and tie in with Tpetra::Profiling…
tjfulle Apr 15, 2020
d28d665
timing for crsmatrix fill complete
GeoffDanielson Apr 17, 2020
d5b6498
Add TPETRA_TIMING behavior variable and tie in with Tpetra::Profiling…
tjfulle Apr 15, 2020
a95cd0b
Merge branch 'tjfulle/tpetra-timing' of github.com:tjfulle/Trilinos i…
tjfulle Apr 27, 2020
c6e2fa0
use static graph
tjfulle Apr 27, 2020
20275cb
unpack dense rows separately - but still using RangePolicy
tjfulle Apr 29, 2020
3c1b449
long_rows -> dense_rows
tjfulle Apr 29, 2020
800c6ae
Implement hierarchical unpacking
tjfulle May 5, 2020
ba64b53
properly count number of dense rows
tjfulle May 6, 2020
28a6777
another update
tjfulle May 12, 2020
b860691
unpack hierarchically or not
tjfulle May 12, 2020
64b2608
unpack in chunks
tjfulle May 15, 2020
0a2a77b
fix typo in computing number of entries in batch for *last* batch in row
tjfulle May 20, 2020
4bda0c5
correct construction of batch_info
tjfulle May 21, 2020
b307f8b
debug commit
tjfulle May 26, 2020
918eb92
fixing hierarchical unpack... take 45
tjfulle May 26, 2020
0ed1771
hierarchical unpack, take 97. combined 3 kernel launches into 1, un…
tjfulle May 27, 2020
f3dacd0
add batch and team size hierarchical unpack settings and environment …
tjfulle May 28, 2020
a620807
Merge branch 'develop' into tjfulle/tpetra-timing
tjfulle Jun 16, 2020
65b7a0b
remove flat unpack, address other comments. still need to get unpack…
tjfulle Jun 17, 2020
2b37357
get error code from unpack kernel. set default unpack sizes for cuda
tjfulle Jun 17, 2020
3e49bcf
use Kokkos::AUTO team size on any non-cuda execution space
tjfulle Jun 18, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,11 @@ getTargetRowMapIndices(const LO lclNumRows,
TEUCHOS_ASSERT(gblRow < indexBase + gblNumRows);
tgtGids[lid] = gblRow;
}
return std::move(tgtGids);
// The original return using std::move (commented out below) triggers the
// following warning with gcc 9.2.0:
// warning: moving a local object in a return statement prevents copy elision [-Wpessimizing-move]
//return std::move(tgtGids);
return tgtGids;
}

RCP<const map_type>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -252,15 +252,15 @@ getTpetraCrsMatrix (Teuchos::FancyOStream& out,
using Teuchos::rcp;
using std::endl;
using matrix_type = Tpetra::CrsMatrix<>;
using device_type = matrix_type::device_type;
//using device_type = matrix_type::device_type;
using SC = matrix_type::impl_scalar_type;
using KAT = Kokkos::ArithTraits<SC>;
//using KAT = Kokkos::ArithTraits<SC>;
using LO = Tpetra::Map<>::local_ordinal_type;
using host_device_type = Kokkos::View<SC*, Kokkos::LayoutRight, device_type>::host_mirror_space;
using host_execution_space = host_device_type::execution_space;
//using host_device_type = Kokkos::View<SC*, Kokkos::LayoutRight, device_type>::host_mirror_space;
//using host_execution_space = host_device_type::execution_space;

// We're filling on the host, so generate random numbers on the host.
using pool_type = Kokkos::Random_XorShift64_Pool<host_execution_space>;
//using pool_type = Kokkos::Random_XorShift64_Pool<host_execution_space>;

Teuchos::OSTab tab0 (out);
out << "Create CrsMatrix for benchmark" << endl;
Expand Down
130 changes: 81 additions & 49 deletions packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4733,6 +4733,9 @@ namespace Tpetra {
os << *prefix << endl;
std::cerr << os.str ();
}
Details::ProfilingRegion region(
"Tpetra::CrsMatrix::fillCompete",
"fillCompete");

TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
(! this->isFillActive () || this->isFillComplete (), std::runtime_error,
Expand All @@ -4743,54 +4746,57 @@ namespace Tpetra {
//
// Read parameters from the input ParameterList.
//

// If true, the caller promises that no process did nonlocal
// changes since the last call to fillComplete.
bool assertNoNonlocalInserts = false;
// If true, makeColMap sorts remote GIDs (within each remote
// process' group).
bool sortGhosts = true;

if (! params.is_null ()) {
assertNoNonlocalInserts = params->get ("No Nonlocal Changes",
assertNoNonlocalInserts);
if (params->isParameter ("sort column map ghost gids")) {
sortGhosts = params->get ("sort column map ghost gids", sortGhosts);
}
else if (params->isParameter ("Sort column Map ghost GIDs")) {
sortGhosts = params->get ("Sort column Map ghost GIDs", sortGhosts);
}
}
// We also don't need to do global assembly if there is only one
// process in the communicator.
const bool needGlobalAssemble = ! assertNoNonlocalInserts && numProcs > 1;
// This parameter only matters if this matrix owns its graph.
if (! this->myGraph_.is_null ()) {
this->myGraph_->sortGhostsAssociatedWithEachProcessor_ = sortGhosts;
}

if (! this->getCrsGraphRef ().indicesAreAllocated ()) {
if (this->hasColMap ()) { // use local indices
allocateValues(LocalIndices, GraphNotYetAllocated, verbose);
{
Details::ProfilingRegion region_fc("Tpetra::CrsMatrix::fillCompete", "ParameterList");

// If true, the caller promises that no process did nonlocal
// changes since the last call to fillComplete.
bool assertNoNonlocalInserts = false;
// If true, makeColMap sorts remote GIDs (within each remote
// process' group).
bool sortGhosts = true;

if (! params.is_null ()) {
assertNoNonlocalInserts = params->get ("No Nonlocal Changes",
assertNoNonlocalInserts);
if (params->isParameter ("sort column map ghost gids")) {
sortGhosts = params->get ("sort column map ghost gids", sortGhosts);
}
else if (params->isParameter ("Sort column Map ghost GIDs")) {
sortGhosts = params->get ("Sort column Map ghost GIDs", sortGhosts);
}
}
// We also don't need to do global assembly if there is only one
// process in the communicator.
const bool needGlobalAssemble = ! assertNoNonlocalInserts && numProcs > 1;
// This parameter only matters if this matrix owns its graph.
if (! this->myGraph_.is_null ()) {
this->myGraph_->sortGhostsAssociatedWithEachProcessor_ = sortGhosts;
}

if (! this->getCrsGraphRef ().indicesAreAllocated ()) {
if (this->hasColMap ()) { // use local indices
allocateValues(LocalIndices, GraphNotYetAllocated, verbose);
}
else { // no column Map, so use global indices
allocateValues(GlobalIndices, GraphNotYetAllocated, verbose);
}
}
// Global assemble, if we need to. This call only costs a single
// all-reduce if we didn't need global assembly after all.
if (needGlobalAssemble) {
this->globalAssemble ();
}
else { // no column Map, so use global indices
allocateValues(GlobalIndices, GraphNotYetAllocated, verbose);
else {
TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
(numProcs == 1 && nonlocals_.size() > 0,
std::runtime_error, "Cannot have nonlocal entries on a serial run. "
"An invalid entry (i.e., with row index not in the row Map) must have "
"been submitted to the CrsMatrix.");
}
}
// Global assemble, if we need to. This call only costs a single
// all-reduce if we didn't need global assembly after all.
if (needGlobalAssemble) {
this->globalAssemble ();
}
else {
TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
(numProcs == 1 && nonlocals_.size() > 0,
std::runtime_error, "Cannot have nonlocal entries on a serial run. "
"An invalid entry (i.e., with row index not in the row Map) must have "
"been submitted to the CrsMatrix.");
}

if (this->isStaticGraph ()) {
Details::ProfilingRegion region_isg("Tpetra::CrsMatrix::fillCompete", "isStaticGraph");
// FIXME (mfh 14 Nov 2016) In order to fix #843, I enable the
// checks below only in debug mode. It would be nicer to do a
// local check, then propagate the error state in a deferred
Expand Down Expand Up @@ -4840,6 +4846,7 @@ namespace Tpetra {
this->fillLocalMatrix (params);
}
else {
Details::ProfilingRegion region_insg("Tpetra::CrsMatrix::fillCompete", "isNotStaticGraph");
// Set the graph's domain and range Maps. This will clear the
// Import if the domain Map has changed (is a different
// pointer), and the Export if the range Map has changed (is a
Expand Down Expand Up @@ -4892,16 +4899,26 @@ namespace Tpetra {
this->myGraph_->checkInternalState ();
}

const bool callComputeGlobalConstants = params.get () == nullptr ||
params->get ("compute global constants", true);
if (callComputeGlobalConstants) {
this->computeGlobalConstants ();
{
Details::ProfilingRegion region_ccgc(
"Tpetra::CrsMatrix::fillCompete", "callComputeGlobalConstamnts"
);
const bool callComputeGlobalConstants = params.get () == nullptr ||
params->get ("compute global constants", true);
if (callComputeGlobalConstants) {
this->computeGlobalConstants ();
}
}

// FIXME (mfh 28 Aug 2014) "Preserve Local Graph" bool parameter no longer used.

this->fillComplete_ = true; // Now we're fill complete!
this->checkInternalState ();
{
Details::ProfilingRegion region_cis(
"Tpetra::CrsMatrix::fillCompete", "checkInternalState"
);
this->checkInternalState ();
}
}

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
Expand Down Expand Up @@ -7284,6 +7301,11 @@ namespace Tpetra {
typedef GlobalOrdinal GO;
typedef impl_scalar_type ST;

Details::ProfilingRegion region_upack_row(
"Tpetra::CrsMatrix::unpackRow",
"Import/Export"
);

if (numBytes == 0) {
// Rows with zero bytes should always have zero entries.
if (numEnt != 0) {
Expand Down Expand Up @@ -7475,6 +7497,7 @@ namespace Tpetra {
Distributor& dist) const
{
// The call to packNew in packAndPrepare catches and handles any exceptions.
Details::ProfilingRegion region_pack_new("Tpetra::CrsMatrix::packNew", "Import/Export");
if (this->isStaticGraph ()) {
using ::Tpetra::Details::packCrsMatrixNew;
packCrsMatrixNew (*this, exports, numPacketsPerLID, exportLIDs,
Expand Down Expand Up @@ -7902,6 +7925,10 @@ namespace Tpetra {
const CombineMode combineMode,
const bool verbose)
{
Details::ProfilingRegion region_unpack_and_combine_impl(
"Tpetra::CrsMatrix::unpackAndCombineImpl",
"Import/Export"
);
using std::endl;
const char tfecfFuncName[] = "unpackAndCombineImpl";
std::unique_ptr<std::string> prefix;
Expand Down Expand Up @@ -8019,6 +8046,11 @@ namespace Tpetra {
return; // nothing to do; no need to combine entries
}

Details::ProfilingRegion region_unpack_and_combine_impl_non_static(
"Tpetra::CrsMatrix::unpackAndCombineImplNonStatic",
"Import/Export"
);

// We're unpacking on host. This is read-only host access.
if (imports.need_sync_host()) {
imports.sync_host ();
Expand Down
Loading