tpetra: Hang in getLocalDiagCopy post Kokkos-4.4 #13498
Open
Description
Bug Report
@trilinos/tpetra
Description
Calling `CrsMatrix::getLocalDiagCopy(Vector)` on a matrix that has not been fill-completed hangs in a CUDA build after the Kokkos 4.4 thread-safety changes. The stack trace is:
#0 0x000015551288e85d in __lll_lock_wait () from /lib64/libpthread.so.0
#1 0x0000155512887ad9 in pthread_mutex_lock () from /lib64/libpthread.so.0
#2 0x000000000f21e40b in void Kokkos::Tools::Experimental::Impl::profile_fence_event<Kokkos::Serial, Kokkos::Serial::impl_static_fence(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&)::{lambda()#1}>(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, Kokkos::Tools::Experimental::SpecialSynchronizationCases, Kokkos::Serial::impl_static_fence(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&)::{lambda()#1} const&) ()
#3 0x000000000f21e536 in Kokkos::Impl::ExecSpaceDerived<Kokkos::Serial>::static_fence(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) ()
#4 0x000000000f1f18c5 in Kokkos::Impl::ExecSpaceManager::static_fence(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) ()
#5 0x000000000e14302c in void Kokkos::deep_copy<unsigned long*, Kokkos::LayoutLeft, Kokkos::Device<Kokkos::Serial, Kokkos::HostSpace>, Kokkos::Experimental::EmptyViewHooks, Kokkos::MemoryTraits<0u>, unsigned long const*, Kokkos::LayoutLeft, Kokkos::Device<Kokkos::Cuda, Kokkos::CudaSpace>, Kokkos::Experimental::EmptyViewHooks, Kokkos::MemoryTraits<0u> >(Kokkos::View<unsigned long*, Kokkos::LayoutLeft, Kokkos::Device<Kokkos::Serial, Kokkos::HostSpace>, Kokkos::Experimental::EmptyViewHooks, Kokkos::MemoryTraits<0u> > const&, Kokkos::View<unsigned long const*, Kokkos::LayoutLeft, Kokkos::Device<Kokkos::Cuda, Kokkos::CudaSpace>, Kokkos::Experimental::EmptyViewHooks, Kokkos::MemoryTraits<0u> > const&, std::enable_if<(std::is_void<Kokkos::ViewTraits<unsigned long*, Kokkos::LayoutLeft, Kokkos::Device<Kokkos::Serial, Kokkos::HostSpace>, Kokkos::Experimental::EmptyViewHooks, Kokkos::MemoryTraits<0u> >::specialize>::value&&std::is_void<Kokkos::ViewTraits<unsigned long const*, Kokkos::LayoutLeft, Kokkos::Device<Kokkos::Cuda, Kokkos::CudaSpace>, Kokkos::Experimental::EmptyViewHooks, Kokkos::MemoryTraits<0u> >::specialize>::value)&&((((unsigned int)Kokkos::ViewTraits<unsigned long*, Kokkos::LayoutLeft, Kokkos::Device<Kokkos::Serial, Kokkos::HostSpace>, Kokkos::Experimental::EmptyViewHooks, Kokkos::MemoryTraits<0u> >::rank)!=(0))||(((unsigned int)Kokkos::ViewTraits<unsigned long const*, Kokkos::LayoutLeft, Kokkos::Device<Kokkos::Cuda, Kokkos::CudaSpace>, Kokkos::Experimental::EmptyViewHooks, Kokkos::MemoryTraits<0u> >::rank)!=(0))), void>::type*) [clone .isra.0] ()
#6 0x000000000e18c47e in Tpetra::CrsGraph<int, long long, Tpetra::KokkosCompat::KokkosDeviceWrapperNode<Kokkos::Cuda, Kokkos::CudaSpace> >::getRowPtrsUnpackedHost() const ()
#7 0x000000000e7abf98 in Tpetra::CrsGraph<int, long long, Tpetra::KokkosCompat::KokkosDeviceWrapperNode<Kokkos::Cuda, Kokkos::CudaSpace> >::getRowInfo(int) const ()
#8 0x000000000e2afb34 in Tpetra::CrsMatrix<double, int, long long, Tpetra::KokkosCompat::KokkosDeviceWrapperNode<Kokkos::Cuda, Kokkos::CudaSpace> >::getLocalRowView(int, Kokkos::View<int const*, Kokkos::LayoutLeft, Kokkos::Device<Kokkos::Serial, Kokkos::HostSpace>, Kokkos::Experimental::EmptyViewHooks, Kokkos::MemoryTraits<0u> >&, Kokkos::View<double const*, Kokkos::LayoutLeft, Kokkos::Device<Kokkos::Serial, Kokkos::HostSpace>, Kokkos::Experimental::EmptyViewHooks, Kokkos::MemoryTraits<0u> >&) const ()
#9 0x000000000e5f66e0 in Kokkos::Impl::ParallelReduceAdaptor<Kokkos::RangePolicy<Kokkos::Serial, int>, Tpetra::Details::GetLocalDiagCopyWithoutOffsetsNotFillCompleteFunctor<double, int, long long, Tpetra::KokkosCompat::KokkosDeviceWrapperNode<Kokkos::Cuda, Kokkos::CudaSpace> >, int>::execute_impl(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, Kokkos::RangePolicy<Kokkos::Serial, int> const&, Tpetra::Details::GetLocalDiagCopyWithoutOffsetsNotFillCompleteFunctor<double, int, long long, Tpetra::KokkosCompat::KokkosDeviceWrapperNode<Kokkos::Cuda, Kokkos::CudaSpace> > const&, int&) ()
#10 0x000000000e5f8bf1 in Tpetra::Details::GetLocalDiagCopyWithoutOffsetsNotFillCompleteFunctor<double, int, long long, Tpetra::KokkosCompat::KokkosDeviceWrapperNode<Kokkos::Cuda, Kokkos::CudaSpace> >::GetLocalDiagCopyWithoutOffsetsNotFillCompleteFunctor(int&, Tpetra::Vector<double, int, long long, Tpetra::KokkosCompat::KokkosDeviceWrapperNode<Kokkos::Cuda, Kokkos::CudaSpace> >&, Tpetra::RowMatrix<double, int, long long, Tpetra::KokkosCompat::KokkosDeviceWrapperNode<Kokkos::Cuda, Kokkos::CudaSpace> > const&) ()
#11 0x000000000e5f99f0 in int Tpetra::Details::getLocalDiagCopyWithoutOffsetsNotFillComplete<double, int, long long, Tpetra::KokkosCompat::KokkosDeviceWrapperNode<Kokkos::Cuda, Kokkos::CudaSpace> >(Tpetra::Vector<double, int, long long, Tpetra::KokkosCompat::KokkosDeviceWrapperNode<Kokkos::Cuda, Kokkos::CudaSpace> >&, Tpetra::RowMatrix<double, int, long long, Tpetra::KokkosCompat::KokkosDeviceWrapperNode<Kokkos::Cuda, Kokkos::CudaSpace> > const&, bool) ()
#12 0x000000000e2f462d in Tpetra::CrsMatrix<double, int, long long, Tpetra::KokkosCompat::KokkosDeviceWrapperNode<Kokkos::Cuda, Kokkos::CudaSpace> >::getLocalDiagCopy(Tpetra::Vector<double, int, long long, Tpetra::KokkosCompat::KokkosDeviceWrapperNode<Kokkos::Cuda, Kokkos::CudaSpace> >&) const ()
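It looks like `GetLocalDiagCopyWithoutOffsetsNotFillCompleteFunctor` ends up calling `CrsGraph::getRowPtrsUnpackedHost` inside a `parallel_reduce`, but in some cases that function allocates a View, which is not allowed in a parallel region. In the trace above, the call reaches a `Kokkos::deep_copy` (frame 5) whose fence on `Kokkos::Serial` blocks in `pthread_mutex_lock` while the Serial `parallel_reduce` (frame 9) is still executing.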