diff --git a/blas/impl/KokkosBlas1_scal_impl.hpp b/blas/impl/KokkosBlas1_scal_impl.hpp index 541d9a4934..72606b6bb2 100644 --- a/blas/impl/KokkosBlas1_scal_impl.hpp +++ b/blas/impl/KokkosBlas1_scal_impl.hpp @@ -20,6 +20,9 @@ #include #include #include +#include +#include +#include #ifndef KOKKOSBLAS_OPTIMIZATION_LEVEL_SCAL #define KOKKOSBLAS_OPTIMIZATION_LEVEL_SCAL 2 @@ -28,7 +31,10 @@ namespace KokkosBlas { namespace Impl { -// Single-vector version of MV_Scal_Functor. By default, a is still a + +// Single-vector version of MV_Scal_Functor. +// a has been unified into either a scalar or a 0D view + // 1-D View. Below is a partial specialization that lets a be a // scalar. This functor computes any of the following: // @@ -42,7 +48,7 @@ namespace Impl { // Any literal coefficient of zero has BLAS semantics of ignoring the // corresponding (multi)vector entry. This does not apply to // coefficients in the a vector, if used. -template +template struct V_Scal_Functor { typedef SizeType size_type; typedef Kokkos::ArithTraits ATS; @@ -51,90 +57,70 @@ struct V_Scal_Functor { XV m_x; AV m_a; - V_Scal_Functor(const RV& r, const XV& x, const AV& a, - const SizeType startingColumn) + V_Scal_Functor(const RV& r, const XV& x, const AV& a) : m_r(r), m_x(x), m_a(a) { static_assert(Kokkos::is_view::value, "V_Scal_Functor: RV is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "V_Scal_Functor: AV is not a Kokkos::View."); + + // TODO: static assert truths about AV + static_assert(Kokkos::is_view::value, "V_Scal_Functor: XV is not a Kokkos::View."); static_assert(RV::rank == 1, "V_Scal_Functor: RV is not rank 1."); - static_assert(AV::rank == 1, "V_Scal_Functor: AV is not rank 1."); static_assert(XV::rank == 1, "V_Scal_Functor: XV is not rank 1."); - - if (startingColumn != 0) { - m_a = Kokkos::subview( - a, - std::make_pair(startingColumn, static_cast(a.extent(0)))); - } } KOKKOS_INLINE_FUNCTION void operator()(const size_type& i) const { - // scalar_x is a compile-time constant (since it is a template + + using ScalarHint = KokkosKernels::Impl::ScalarHint; + + // scalar_a is a compile-time constant (since it is a template // parameter), so the compiler should evaluate these branches at // compile time. - if (scalar_x == 0) { + if constexpr (ALPHA_HINT == ScalarHint::zero) { m_r(i) = ATS::zero(); } - if (scalar_x == -1) { + else if constexpr (ALPHA_HINT == ScalarHint::neg_one) { m_r(i) = -m_x(i); } - if (scalar_x == 1) { + else if constexpr (ALPHA_HINT == ScalarHint::pos_one) { m_r(i) = m_x(i); } - if (scalar_x == 2) { - m_r(i) = m_a(0) * m_x(i); + else if constexpr (ALPHA_HINT == ScalarHint::none) { + m_r(i) = KokkosBlas::Impl::as_scalar(m_a) * m_x(i); + } + else { + static_assert(KokkosKernels::Impl::always_false_v, "Unexpected value for ALPHA_HINT"); } } }; -// Partial specialization of V_Scal_Functor that lets a be a scalar -// (rather than a 1-D View, as in the most general version above). -// This functor computes any of the following: -// -// 1. Y(i) = alpha*X(i) for alpha in -1,0,1 -// 2. Y(i) = a*X(i) -template -struct V_Scal_Functor { - typedef SizeType size_type; - typedef Kokkos::ArithTraits ATS; +/*! \brief - RV m_r; - XV m_x; - const typename XV::non_const_value_type m_a; + r(i) = av * x(i) + r(i) = av() * x(i) - V_Scal_Functor(const RV& r, const XV& x, - const typename XV::non_const_value_type& a, - const SizeType /* startingColumn */) - : m_r(r), m_x(x), m_a(a) {} + \param space + \param r + \param av + \param x + \param alphaHint A KokkosKernels::Impl::ScalarHint corresponding to the value of av. If not KokkosKernels::Impl:ß:ScalarHint::none, may be used to optimize the implementation - KOKKOS_INLINE_FUNCTION - void operator()(const size_type& i) const { - if (scalar_x == 0) { - m_r(i) = ATS::zero(); - } - if (scalar_x == -1) { - m_r(i) = -m_x(i); - } - if (scalar_x == 1) { - m_r(i) = m_x(i); - } - if (scalar_x == 2) { - m_r(i) = m_a * m_x(i); - } - } -}; + \tparam SizeType + \tparam ExecutionSpace + \tparam RV + \tparam AV + \tparam XV + +*/ +template +void V_Scal_Generic(const ExecutionSpace& space, const RV& r, const AV& av, + const XV& x, + const KokkosKernels::Impl::ScalarHint &alphaHint = KokkosKernels::Impl::ScalarHint::none) { + + // TODO: assert some things about AV -// Variant of MV_Scal_Generic for single vectors (1-D Views) r and x. -// As above, av is either a 1-D View (and only its first entry will be -// read), or a scalar. -template -void V_Scal_Generic(const execution_space& space, const RV& r, const AV& av, - const XV& x, const SizeType startingColumn, int a = 2) { static_assert(Kokkos::is_view::value, "V_Scal_Generic: RV is not a Kokkos::View."); static_assert(Kokkos::is_view::value, @@ -143,27 +129,26 @@ void V_Scal_Generic(const execution_space& space, const RV& r, const AV& av, static_assert(XV::rank == 1, "V_Scal_Generic: XV is not rank 1."); const SizeType numRows = x.extent(0); - Kokkos::RangePolicy policy(space, 0, numRows); + Kokkos::RangePolicy policy(space, 0, numRows); - if (a == 0) { - V_Scal_Functor op(r, x, av, startingColumn); - Kokkos::parallel_for("KokkosBlas::Scal::S0", policy, op); + if (alphaHint == KokkosKernels::Impl::ScalarHint::zero) { + V_Scal_Functor op(r, x, av); + Kokkos::parallel_for("KokkosBlas::Scal::0", policy, op); return; } - if (a == -1) { - V_Scal_Functor op(r, x, av, startingColumn); - Kokkos::parallel_for("KokkosBlas::Scal::S1", policy, op); + else if (alphaHint == KokkosKernels::Impl::ScalarHint::neg_one) { + V_Scal_Functor op(r, x, av); + Kokkos::parallel_for("KokkosBlas::Scal::-1", policy, op); return; } - if (a == 1) { - V_Scal_Functor op(r, x, av, startingColumn); - Kokkos::parallel_for("KokkosBlas::Scal::S2", policy, op); + else if (alphaHint == KokkosKernels::Impl::ScalarHint::pos_one) { + V_Scal_Functor op(r, x, av); + Kokkos::parallel_for("KokkosBlas::Scal::1", policy, op); return; } - // a arbitrary (not -1, 0, or 1) - V_Scal_Functor op(r, x, av, startingColumn); - Kokkos::parallel_for("KokkosBlas::Scal::S3", policy, op); + V_Scal_Functor op(r, x, av); + Kokkos::parallel_for("KokkosBlas::Scal::none", policy, op); } } // namespace Impl diff --git a/blas/impl/KokkosBlas1_scal_mv_impl.hpp b/blas/impl/KokkosBlas1_scal_mv_impl.hpp index da4d7a5149..242c077cc9 100644 --- a/blas/impl/KokkosBlas1_scal_mv_impl.hpp +++ b/blas/impl/KokkosBlas1_scal_mv_impl.hpp @@ -21,6 +21,7 @@ #include #include #include +#include #ifndef KOKKOSBLAS_OPTIMIZATION_LEVEL_SCAL #define KOKKOSBLAS_OPTIMIZATION_LEVEL_SCAL 2 @@ -323,8 +324,9 @@ template void MV_Scal_Unrolled(const execution_space& space, const RMV& r, const aVector& av, const XMV& x, - const SizeType startingColumn, int a = 2) { - if (a == 0) { + const SizeType startingColumn, + const KokkosKernels::Impl::ScalarHint &a = KokkosKernels::Impl::ScalarHint::none) { + if (a == KokkosKernels::Impl::ScalarHint::zero) { MV_Scal_Unroll_Functor op( r, x, av, startingColumn); const SizeType numRows = x.extent(0); @@ -332,7 +334,7 @@ void MV_Scal_Unrolled(const execution_space& space, const RMV& r, Kokkos::parallel_for("KokkosBlas::Scal::MV::S0", policy, op); return; } - if (a == -1) { + if (a == KokkosKernels::Impl::ScalarHint::neg_one) { MV_Scal_Unroll_Functor op( r, x, av, startingColumn); const SizeType numRows = x.extent(0); @@ -340,7 +342,7 @@ void MV_Scal_Unrolled(const execution_space& space, const RMV& r, Kokkos::parallel_for("KokkosBlas::Scal::MV::S1", policy, op); return; } - if (a == 1) { + if (a == KokkosKernels::Impl::ScalarHint::pos_one) { MV_Scal_Unroll_Functor op( r, x, av, startingColumn); const SizeType numRows = x.extent(0); @@ -349,7 +351,6 @@ void MV_Scal_Unrolled(const execution_space& space, const RMV& r, return; } - // a arbitrary (not -1, 0, or 1) MV_Scal_Unroll_Functor op( r, x, av, startingColumn); const SizeType numRows = x.extent(0); @@ -420,7 +421,8 @@ void MV_Scal_Generic(const execution_space& space, const RVector& r, // coefficient(s) in av, if used. template void MV_Scal_Invoke_Left(const execution_space& space, const RMV& r, - const AV& av, const XMV& x, int a = 2) { + const AV& av, const XMV& x, + const KokkosKernels::Impl::ScalarHint &aHint = KokkosKernels::Impl::ScalarHint::none) { const SizeType numCols = x.extent(1); #if KOKKOSBLAS_OPTIMIZATION_LEVEL_SCAL <= 2 @@ -438,7 +440,7 @@ void MV_Scal_Invoke_Left(const execution_space& space, const RMV& r, typedef decltype(R_cur) RMV2D; MV_Scal_Unrolled( - space, R_cur, av, X_cur, j, a); + space, R_cur, av, X_cur, j, aHint); } for (; j + 4 <= numCols; j += 4) { const std::pair rng(j, j + 4); @@ -448,7 +450,7 @@ void MV_Scal_Invoke_Left(const execution_space& space, const RMV& r, typedef decltype(R_cur) RMV2D; MV_Scal_Unrolled( - space, R_cur, av, X_cur, j, a); + space, R_cur, av, X_cur, j, aHint); } for (; j < numCols; ++j) { // RMV and XMV need to turn 1-D. @@ -457,8 +459,21 @@ void MV_Scal_Invoke_Left(const execution_space& space, const RMV& r, typedef decltype(r_cur) RV; typedef decltype(x_cur) XV; - V_Scal_Generic(space, r_cur, av, - x_cur, j, a); + // If AV is a rank-one vector, get a rank-0 subview + // Otherwise, just pass along AV as-is + // can't short-circuit if constexpr :( + if constexpr (Kokkos::is_view_v) { + if constexpr (AV::rank == 1) { + auto a_cur = Kokkos::subview(av, j); + V_Scal_Generic(space, r_cur, a_cur, x_cur, aHint); + } else { + V_Scal_Generic(space, r_cur, av, x_cur, aHint); + } + } else { + V_Scal_Generic(space, r_cur, av, x_cur, aHint); + } + + } #else // KOKKOSBLAS_OPTIMIZATION_LEVEL_SCAL > 2 @@ -470,7 +485,7 @@ void MV_Scal_Invoke_Left(const execution_space& space, const RMV& r, typedef decltype(r_0) RV; typedef decltype(x_0) XV; - V_Scal_Generic(space, r_0, av, x_0, + V_Scal_Generic(space, r_0, av, x_0, 0, a); break; } @@ -535,7 +550,7 @@ void MV_Scal_Invoke_Left(const execution_space& space, const RMV& r, space, r, av, x, 0, a); break; default: - MV_Scal_Generic(space, r, av, x, + MV_Scal_Generic(space, r, av, x, 0, a); } @@ -572,11 +587,9 @@ void MV_Scal_Invoke_Right(const execution_space& space, const RMV& r, RV r_0 = Kokkos::subview(r, Kokkos::ALL(), 0); XV x_0 = Kokkos::subview(x, Kokkos::ALL(), 0); - V_Scal_Generic(space, r_0, - av, x_0, a); + V_Scal_Generic(space, r_0, av, x_0, a); } else { - MV_Scal_Generic(space, r, av, - x, a); + MV_Scal_Generic(space, r, av, x, a); } } diff --git a/blas/impl/KokkosBlas1_scal_spec.hpp b/blas/impl/KokkosBlas1_scal_spec.hpp index 38972b2223..57f120941c 100644 --- a/blas/impl/KokkosBlas1_scal_spec.hpp +++ b/blas/impl/KokkosBlas1_scal_spec.hpp @@ -39,11 +39,12 @@ struct scal_eti_spec_avail { // // Macro for declaration of full specialization availability -// KokkosBlas::Impl::Scal for rank == 1. This is NOT for users!!! All +// KokkosBlas::Impl::Scal for rank == 1 R and X. This is NOT for users!!! All // the declarations of full specializations go in this header file. // We may spread out definitions (see _INST macro below) across one or // more .cpp files. // +// Alpha can either be scalar or rank 0 #define KOKKOSBLAS1_SCAL_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ template <> \ struct scal_eti_spec_avail< \ @@ -56,15 +57,28 @@ struct scal_eti_spec_avail { Kokkos::MemoryTraits >, \ 1> { \ enum : bool { value = true }; \ + }; \ + template <> \ + struct scal_eti_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1> { \ + enum : bool { value = true }; \ }; - // // Macro for declaration of full specialization availability -// KokkosBlas::Impl::Scal for rank == 2. This is NOT for users!!! All +// KokkosBlas::Impl::Scal for rank == 2 R and X. This is NOT for users!!! All // the declarations of full specializations go in this header file. // We may spread out definitions (see _DEF macro below) across one or // more .cpp files. // +// Alpha can either be rank 1, rank 0, or scalar #define KOKKOSBLAS1_SCAL_MV_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ MEM_SPACE) \ template <> \ @@ -82,6 +96,20 @@ struct scal_eti_spec_avail { enum : bool { value = true }; \ }; \ template <> \ + struct scal_eti_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 2> { \ + enum : bool { value = true }; \ + }; \ + template <> \ struct scal_eti_spec_avail< \ EXEC_SPACE, \ Kokkos::View, \ @@ -151,43 +179,44 @@ struct Scal(INT_MAX)) { - typedef int index_type; - V_Scal_Generic(space, R, alpha, - X, a); + V_Scal_Generic(space, R, alpha, X, alphaHint); } else { - typedef typename XV::size_type index_type; - V_Scal_Generic(space, R, alpha, - X, a); + V_Scal_Generic(space, R, alpha, X, alphaHint); } Kokkos::Profiling::popRegion(); } }; -/// \brief Partial specialization of Scal for 2-D Views and 1-D View AV. +/// \brief Partial specialization of Scal for 2-D Views and 1-D, 0-D, or scalar AV. /// /// Compute any of the following: -/// -/// 1. R(i,j) = a*X(i,j) for a in -1,0,1 -/// 2. R(i,j) = alpha(j)*X(i,j) +/// 1. R(i,j) = av * X(i,j) +/// 2. R(i,j) = av() * X(i,j) +/// 3. R(i,j) = av(j) * X(i,j) template struct Scal { + using ScalarHint = KokkosKernels::Impl::ScalarHint; typedef typename XMV::size_type size_type; typedef Kokkos::ArithTraits ATA; static void scal(const execution_space& space, const RMV& R, const AV& av, const XMV& X) { + + // TODO: assert some things about AV + static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Scal<2-D>: RMV is not a Kokkos::View."); @@ -200,9 +229,6 @@ struct Scal: " "RMV is not rank 2."); - static_assert(AV::rank == 1, - "KokkosBlas::Impl::Scal<2-D>: " - "AV is not rank 1."); static_assert(XMV::rank == 2, "KokkosBlas::Impl::Scal<2-D>: " "XMV is not rank 2."); @@ -221,16 +247,16 @@ struct Scal(INT_MAX) && numRows * numCols < static_cast(INT_MAX)) { typedef int index_type; MV_Scal_Invoke_Left(space, R, - av, X, a); + av, X, alphaHint); } else { typedef typename XMV::size_type index_type; MV_Scal_Invoke_Left(space, R, - av, X, a); + av, X, alphaHint); } Kokkos::Profiling::popRegion(); } @@ -245,6 +271,7 @@ struct Scal struct Scal { + using ScalarHint = KokkosKernels::Impl::ScalarHint; typedef typename XMV::non_const_value_type AV; typedef typename XMV::size_type size_type; typedef Kokkos::ArithTraits ATA; @@ -279,13 +306,13 @@ struct Scal(INT_MAX) && @@ -293,12 +320,12 @@ struct Scal( - space, R, alpha, X, a); + space, R, alpha, X, alphaHint); } else { typedef typename XMV::size_type index_type; MV_Scal_Invoke_Left( - space, R, alpha, X, a); + space, R, alpha, X, alphaHint); } Kokkos::Profiling::popRegion(); } @@ -310,17 +337,29 @@ struct Scal, \ Kokkos::MemoryTraits >, \ SCALAR, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, false, true>; \ + extern template struct Scal< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -332,6 +371,17 @@ struct Scal, \ Kokkos::MemoryTraits >, \ SCALAR, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, false, true>; \ + template struct Scal< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ diff --git a/blas/impl/KokkosBlas1_scal_unified_scalar_view_impl.hpp b/blas/impl/KokkosBlas1_scal_unified_scalar_view_impl.hpp new file mode 100644 index 0000000000..512919285a --- /dev/null +++ b/blas/impl/KokkosBlas1_scal_unified_scalar_view_impl.hpp @@ -0,0 +1,307 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef KOKKOSBLAS1_SCAL_UNIFIED_SCALAR_VIEW_IMPL +#define KOKKOSBLAS1_SCAL_UNIFIED_SCALAR_VIEW_IMPL + +#include +#include + +#include + +/*! \brief + +Canonicalizes a variety of different "scalar" values of AV to the most +restrictive version still consistent with the interface. + +This may reduce the number of instantiations. + +Transformations made: +* rank-1 AV, RMV, and XMV means AV is actually a scalar. On the host, we can go +further and convert to a true scalar since we can access the value. On device, +take a subview to convert to rank-0 scalar. +* S[1] -> S +* apply const to view data types +* TODO: simplify this by just applying these in turn when possible + +Row | RMV / XMV | AV | alpha_type + 1 | Rank-1 | S | S + 2 | Rank-2 | S | S + 3 | Rank-1 | View | S + 4 | Rank-2 | View | S + 5 | Rank-1 | View | View + 6 | Rank-2 | View | View + 7 | Rank-1 | View | S + 8 | Rank-2 | View | S + 9 | Rank-1 | View | S +10 | Rank-2 | View | View +11 | Rank-1 | View | View +12 | Rank-1 | View | View +13 | Rank-2 | View | View +14 | Rank-2 | View | View + +See comments on the implementation below for each rows + +This canonicalization strategy avoids: +* Calling Kokkos::deep_copy to convert S to View +* Interacting with device scalars in the host code +*/ + + + +namespace KokkosBlas::Impl { + +template +struct is_host : std::false_type {}; +template +struct is_host && + Kokkos::SpaceAccessibility< + Kokkos::HostSpace, + typename T::memory_space>::accessible>> + : std::true_type {}; +template +constexpr inline bool is_host_v = is_host::value; + +template +struct is_dev : std::false_type {}; +template +struct is_dev && + !Kokkos::SpaceAccessibility< + Kokkos::HostSpace, + typename T::memory_space>::accessible>> + : std::true_type {}; +template +constexpr inline bool is_dev_v = is_dev::value; + +template +struct is_rank_0_host : std::false_type {}; +template +struct is_rank_0_host && T::rank == 0>> + : std::true_type {}; +template +constexpr inline bool is_rank_0_host_v = is_rank_0_host::value; + +template +struct is_rank_1_host : std::false_type {}; +template +struct is_rank_1_host && T::rank == 1>> + : std::true_type {}; +template +constexpr inline bool is_rank_1_host_v = is_rank_1_host::value; + +template +struct is_rank_1_host_static : std::false_type {}; +template +struct is_rank_1_host_static< + T, std::enable_if_t && T::static_extent(0) == 1>> + : std::true_type {}; +template +constexpr inline bool is_rank_1_host_static_v = is_rank_1_host_static::value; + +template +struct is_rank_1_host_dynamic : std::false_type {}; +template +struct is_rank_1_host_dynamic< + T, std::enable_if_t && T::rank_dynamic == 1>> + : std::true_type {}; +template +constexpr inline bool is_rank_1_host_dynamic_v = + is_rank_1_host_dynamic::value; + +template +struct is_rank_0_dev : std::false_type {}; +template +struct is_rank_0_dev && T::rank == 0>> + : std::true_type {}; +template +constexpr inline bool is_rank_0_dev_v = is_rank_0_dev::value; + +template +struct is_rank_1_dev : std::false_type {}; +template +struct is_rank_1_dev && T::rank == 1>> + : std::true_type {}; +template +constexpr inline bool is_rank_1_dev_v = is_rank_1_dev::value; + +template +struct is_rank_1_dev_static : std::false_type {}; +template +struct is_rank_1_dev_static< + T, std::enable_if_t && T::static_extent(0) == 1>> + : std::true_type {}; +template +constexpr inline bool is_rank_1_dev_static_v = is_rank_1_dev_static::value; + +template +struct is_rank_1_dev_dynamic : std::false_type {}; +template +struct is_rank_1_dev_dynamic< + T, std::enable_if_t && T::rank_dynamic == 1>> + : std::true_type {}; +template +constexpr inline bool is_rank_1_dev_dynamic_v = is_rank_1_dev_dynamic::value; + +template +struct scal_unified_scalar_view; + +// Rows 1,2: S -> S +template +struct scal_unified_scalar_view>> { + using alpha_type = AV; + + static alpha_type from(const AV &av) { return av; } +}; + +// Rows 3,4: AV is a rank 0 host view +template +struct scal_unified_scalar_view>> { + using alpha_type = typename AV::non_const_data_type; + + static alpha_type from(const AV &av) { return av(); } +}; + +// Rows 5,6: AV is a rank 0 device view +template +struct scal_unified_scalar_view>> { + using alpha_type = + Kokkos::View; + + static alpha_type from(const AV &av) { return alpha_type(av); } +}; + +// Rows 7,8: AV is a rank 1 host view with known extent +template +struct scal_unified_scalar_view>> { + using alpha_type = typename AV::non_const_value_type; + + static alpha_type from(const AV &av) { return av(0); } +}; + +// Row 9: AV is a rank 1 host view of unknown size, but we assume it's +// a single scalar since XMV and YMV are rank 1 +template +struct scal_unified_scalar_view< + RMV, AV, XMV, + std::enable_if_t && RMV::rank == 1 && + XMV::rank == 1>> { + using alpha_type = typename AV::non_const_value_type; + + static alpha_type from(const AV &av) { return av(0); } +}; + +// Row 10: AV is a rank 1 host view of unknown size, and we assume +// each element is to scale a vector in RMV and XMV +template +struct scal_unified_scalar_view< + RMV, AV, XMV, + std::enable_if_t && XMV::rank == 2 && + RMV::rank == 2>> { + using alpha_type = + Kokkos::View; + + static alpha_type from(const AV &av) { return av; } +}; + +// Row 11, 12: AV is a rank 1 dev view, but we assume its +// a single scalar since XMV and YMV are rank 1 +template +struct scal_unified_scalar_view>> { + using alpha_type = + Kokkos::View; + + static alpha_type from(const AV &av) { return Kokkos::subview(av, 0); } +}; + +// Row 13: AV is a rank 1 dev view of static size, +// so its a single scalar +template +struct scal_unified_scalar_view>> { + using alpha_type = + Kokkos::View; + + static alpha_type from(const AV &av) { return Kokkos::subview(av, 0); } +}; + +// Row 14: AV is a rank 1 dev view of unknown size, +// and XMV and YMV are rank 2, so assume each entry is +// used to scale each vector +template +struct scal_unified_scalar_view< + RMV, AV, XMV, + std::enable_if_t && XMV::rank == 2 && RMV::rank == 2>> { + using alpha_type = + Kokkos::View; + + static alpha_type from(const AV &av) { return av; } +}; + +/*! \brief return av + +Get a POD, Kokkos::complex, or 0D view as a scalar +*/ +template , bool> = true> +KOKKOS_INLINE_FUNCTION auto as_scalar(const AV &av) { + return av; +} + +/*! \brief return av() + +Get a POD, Kokkos::complex, or 0D view as a scalar +*/ +template , bool> = true> +KOKKOS_INLINE_FUNCTION auto as_scalar(const AV &av) { + return av(); +} + +/*! \brief return av + */ +template , bool> = true> +KOKKOS_INLINE_FUNCTION auto as_scalar(const AV &av, const IndexType & /*i*/) { + return av; +} + +/*! \brief return av() + */ +template , bool> = true> +KOKKOS_INLINE_FUNCTION auto as_scalar(const AV &av, const IndexType &i) { + return av(); +} + +/*! \brief return av(i) + */ +template , bool> = true> +KOKKOS_INLINE_FUNCTION auto as_scalar(const AV &av, const IndexType &i) { + return av(i); +} + +} // namespace KokkosBlas::Impl + +#endif // KOKKOSBLAS1_SCAL_UNIFIED_SCALAR_VIEW_IMPL \ No newline at end of file diff --git a/blas/src/KokkosBlas1_scal.hpp b/blas/src/KokkosBlas1_scal.hpp index 39c197f352..51db7127dc 100644 --- a/blas/src/KokkosBlas1_scal.hpp +++ b/blas/src/KokkosBlas1_scal.hpp @@ -23,6 +23,8 @@ #include #include +#include + /// /// General/Host Scale /// @@ -37,7 +39,7 @@ namespace KokkosBlas { /// \tparam RMV 1-D or 2-D Kokkos::View specialization. /// \tparam XMV 1-D or 2-D Kokkos::View specialization. It must have /// the same rank as RMV. -/// \tparam AV 1-D or 2-D Kokkos::View specialization. +/// \tparam AV a scalar, 0-D, or 1-D Kokkos::View specialization. /// /// \param space [in] the execution space instance on which the kernel will run. /// \param R [in/out] view of type RMV in which the results will be stored. @@ -103,13 +105,23 @@ void scal(const execution_space& space, const RMV& R, const AV& a, using XMV_Internal = Kokkos::View >; - using AV_Internal = + + // this promotes AV to be compatible with XMV, e.g. if XMV is complex + // and AV is double, result will be complex + using AV_PromotedToXMV = typename KokkosKernels::Impl::GetUnifiedScalarViewType::type; + // this canonicalizes the type of Alpha to be a particular flavor of scalar, + // 0D, or 1D views, depending on whether alpha lives on the host or device + using AlphaUnifier = + KokkosBlas::Impl::scal_unified_scalar_view; + using AV_Internal = typename AlphaUnifier::alpha_type; + RMV_Internal R_internal = R; - AV_Internal a_internal = a; XMV_Internal X_internal = X; + AV_Internal a_internal = AlphaUnifier::from(AV_PromotedToXMV(a)); Impl::Scal::scal( space, R_internal, a_internal, X_internal); diff --git a/blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp index da11555f7b..653857ada5 100644 --- a/blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp @@ -256,6 +256,15 @@ KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, namespace KokkosBlas { namespace Impl { +/* rocBLAS documentation: + "a rocBLAS handle always has one stream."" + "If the handle is switching from one non-default stream to another, the + old stream needs to be synchronized...next...rocblas_set_stream" Basically + this means if we're switching streams, we have to fence the old one first. We + also set the handle's pointer mode appropriately before invoking BLAS. + + // push_pointer_mode +*/ #define KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_ROCBLAS( \ SCALAR_TYPE, ROCBLAS_SCALAR_TYPE, ROCBLAS_FN, LAYOUT, EXECSPACE, MEMSPACE, \ ETI_SPEC_AVAIL) \ @@ -293,6 +302,11 @@ namespace Impl { constexpr int one = 1; \ KokkosBlas::Impl::RocBlasSingleton& s = \ KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + hipStream_t cur; \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_get_stream(s.handle, &cur)); \ + if (cur != space.hip_stream()) { \ + execution_space(cur).fence(); \ + } \ KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ rocblas_set_stream(s.handle, space.hip_stream())); \ rocblas_pointer_mode pointer_mode; \ diff --git a/blas/unit_test/Test_Blas1_scal.hpp b/blas/unit_test/Test_Blas1_scal.hpp index 6c4f7b7f2a..4091d48ccc 100644 --- a/blas/unit_test/Test_Blas1_scal.hpp +++ b/blas/unit_test/Test_Blas1_scal.hpp @@ -21,6 +21,149 @@ #include namespace Test { + +/*! \brief Test scal will AV being a view or a scalar + +*/ +template +void impl_test_scal_a_is_3(int N, const AV &a) { + + typedef typename XView::value_type ScalarX; + typedef typename YView::value_type ScalarY; + typedef Kokkos::ArithTraits AT; + + typename AT::mag_type eps = AT::epsilon() * 1000; // FIXME + + view_stride_adapter x("X", N); + view_stride_adapter y("Y", N); + + Kokkos::Random_XorShift64_Pool rand_pool( + 13718); + + { + ScalarX randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(x.d_view, rand_pool, randStart, randEnd); + } + + Kokkos::deep_copy(x.h_base, x.d_base); + + KokkosBlas::scal(y.d_view, a, x.d_view); + Kokkos::deep_copy(y.h_base, y.d_base); + for (int i = 0; i < N; i++) { + // scaling factor is 3 + EXPECT_NEAR_KK(static_cast(3 * x.h_view(i)), y.h_view(i), eps); + } + + // Zero out y again and run with const input + Kokkos::deep_copy(y.d_view, Kokkos::ArithTraits::zero()); + KokkosBlas::scal(y.d_view, a, x.d_view_const); + Kokkos::deep_copy(y.h_base, y.d_base); + for (int i = 0; i < N; i++) { + // scaling factor is three + EXPECT_NEAR_KK(static_cast(3 * x.h_view(i)), y.h_view(i), eps); + } +} + +/*! \brief test scal with alpha as a device view of rank 0 or rank 1 + + \tparam RANK if VIEW what rank is alpha? + \tparam STATIC if VIEW and RANK=1, is the extent static? +*/ +template +void impl_test_scal_device_alpha(int N) { + + using XView = Kokkos::View; + using YView = Kokkos::View; + + if constexpr (1 == RANK && STATIC) { + Kokkos::View a("View"); + Kokkos::deep_copy(a, 3); + impl_test_scal_a_is_3(N, a); + } else if constexpr(1 == RANK) { + Kokkos::View a("View", 1); + Kokkos::deep_copy(a, 3); + impl_test_scal_a_is_3(N, a); + } else if constexpr(0 == RANK) { + Kokkos::View a("View"); + Kokkos::deep_copy(a, 3); + impl_test_scal_a_is_3(N, a); + } +} + +/*! \brief Test scal will AV being a view or a scalar + +*/ +template +void impl_test_scal_mv_a_is_3(const int N, const int K, const AV &a) { + + using ScalarX = typename XView::value_type; + using ScalarY = typename YView::value_type; + typedef Kokkos::ArithTraits AT; + + typename AT::mag_type eps = AT::epsilon() * 1000; // FIXME + + view_stride_adapter x("X", N, K); + view_stride_adapter y("Y", N, K); + + Kokkos::Random_XorShift64_Pool rand_pool( + 13718); + + { + ScalarX randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(x.d_view, rand_pool, randStart, randEnd); + } + + Kokkos::deep_copy(x.h_base, x.d_base); + + KokkosBlas::scal(y.d_view, a, x.d_view); + Kokkos::deep_copy(y.h_base, y.d_base); + for (int k = 0; k < K; ++k) { + for (int n = 0; n < N; ++n) { + // scaling factor is 3 + EXPECT_NEAR_KK(static_cast(3 * x.h_view(n,k)), y.h_view(n,k), eps); + } + } + + // Zero out y again and run with const input + Kokkos::deep_copy(y.d_view, Kokkos::ArithTraits::zero()); + KokkosBlas::scal(y.d_view, a, x.d_view_const); + Kokkos::deep_copy(y.h_base, y.d_base); + for (int k = 0; k < K; ++k) { + for (int n = 0; n < N; ++n) { + // scaling factor is three + EXPECT_NEAR_KK(static_cast(3 * x.h_view(n,k)), y.h_view(n,k), eps); + } + } +} + +/*! \brief test scal with alpha as a device view of rank 0 or rank 1 + + \tparam RANK if VIEW what rank is alpha? + \tparam STATIC if VIEW and RANK=1, is the extent static? +*/ +template +void impl_test_scal_mv_device_alpha(const int N, const int K) { + + using XView = Kokkos::View; + using YView = Kokkos::View; + + if constexpr (1 == RANK && STATIC) { + Kokkos::View a("View"); + Kokkos::deep_copy(a, 3); + impl_test_scal_mv_a_is_3(N, K, a); + } else if constexpr(1 == RANK) { + Kokkos::View a("View", K); + Kokkos::deep_copy(a, 3); + impl_test_scal_mv_a_is_3(N, K, a); + } else if constexpr(0 == RANK) { + Kokkos::View a("View"); + Kokkos::deep_copy(a, 3); + impl_test_scal_mv_a_is_3(N, K, a); + } +} + template void impl_test_scal(int N) { typedef typename ViewTypeA::value_type ScalarA; @@ -59,6 +202,8 @@ void impl_test_scal(int N) { } } + + template void impl_test_scal_mv(int N, int K) { typedef typename ViewTypeA::value_type ScalarA; @@ -143,12 +288,30 @@ int test_scal() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View view_type_a_ll; - typedef Kokkos::View view_type_b_ll; + using view_type_a_ll = Kokkos::View; + using view_type_b_ll = Kokkos::View; + Test::impl_test_scal(0); Test::impl_test_scal(13); Test::impl_test_scal(1024); - // Test::impl_test_scal(132231); + Test::impl_test_scal(132231); + + /* Test that scal works for 0-rank and 1-rank views from the device. Host alphas are tested elsewhere */ + // clang-format off + // rank, static + Test::impl_test_scal_device_alpha(0); + Test::impl_test_scal_device_alpha(0); + Test::impl_test_scal_device_alpha(0); + + Test::impl_test_scal_device_alpha(13); + Test::impl_test_scal_device_alpha(13); + Test::impl_test_scal_device_alpha(13); + + Test::impl_test_scal_device_alpha(1024); + Test::impl_test_scal_device_alpha(1024); + Test::impl_test_scal_device_alpha(1024); + // clang-format on + #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ @@ -159,7 +322,7 @@ int test_scal() { Test::impl_test_scal(0); Test::impl_test_scal(13); Test::impl_test_scal(1024); - // Test::impl_test_scal(132231); + Test::impl_test_scal(132231); #endif #if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ @@ -169,7 +332,7 @@ int test_scal() { Test::impl_test_scal(0); Test::impl_test_scal(13); Test::impl_test_scal(1024); - // Test::impl_test_scal(132231); + Test::impl_test_scal(132231); #endif #if !defined(KOKKOSKERNELS_ETI_ONLY) && \ @@ -191,7 +354,24 @@ int test_scal_mv() { Test::impl_test_scal_mv(0, 5); Test::impl_test_scal_mv(13, 5); Test::impl_test_scal_mv(1024, 5); - // Test::impl_test_scal_mv(132231,5); + Test::impl_test_scal_mv(132231,5); + + /* Test that scal works for 0-rank and 1-rank views from the device. Host alphas are tested elsewhere */ + // clang-format off + // rank, static + Test::impl_test_scal_mv_device_alpha(0, 5); + Test::impl_test_scal_mv_device_alpha(0, 5); + Test::impl_test_scal_mv_device_alpha(0, 5); + + Test::impl_test_scal_mv_device_alpha(13, 5); + Test::impl_test_scal_mv_device_alpha(13, 5); + Test::impl_test_scal_mv_device_alpha(13, 5); + + Test::impl_test_scal_mv_device_alpha(1024, 5); + Test::impl_test_scal_mv_device_alpha(1024, 5); + Test::impl_test_scal_mv_device_alpha(1024, 5); + // clang-format on + #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ @@ -202,7 +382,7 @@ int test_scal_mv() { Test::impl_test_scal_mv(0, 5); Test::impl_test_scal_mv(13, 5); Test::impl_test_scal_mv(1024, 5); - // Test::impl_test_scal_mv(132231,5); + Test::impl_test_scal_mv(132231,5); #endif #if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ @@ -212,7 +392,7 @@ int test_scal_mv() { Test::impl_test_scal_mv(0, 5); Test::impl_test_scal_mv(13, 5); Test::impl_test_scal_mv(1024, 5); - // Test::impl_test_scal_mv(132231,5); + Test::impl_test_scal_mv(132231,5); #endif #if !defined(KOKKOSKERNELS_ETI_ONLY) && \ diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 88bf237274..b065869296 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -1,3 +1,4 @@ # Adding source directory to the build LIST(APPEND KK_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/common/src) +LIST(APPEND KK_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/common/impl) LIST(APPEND KK_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/common/unit_test) diff --git a/common/impl/KokkosKernels_IsKokkosComplex.hpp b/common/impl/KokkosKernels_IsKokkosComplex.hpp new file mode 100644 index 0000000000..4a619ae718 --- /dev/null +++ b/common/impl/KokkosKernels_IsKokkosComplex.hpp @@ -0,0 +1,40 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSKERNELS_ISKOKKOSCOMPLEX_HPP +#define KOKKOSKERNELS_ISKOKKOSCOMPLEX_HPP + +#include + +namespace KokkosKernels::Impl { + +/// \class is_kokkos_complex +/// \brief is_kokkos_complex::value is true if T is a Kokkos::complex<...>, false +/// otherwise +template +struct is_kokkos_complex : public std::false_type {}; +template +struct is_kokkos_complex> : public std::true_type {}; +template +struct is_kokkos_complex> : public std::true_type {}; + +template +inline constexpr bool is_kokkos_complex_v = is_kokkos_complex::value; + +} // namespace KokkosKernels::Impl + + +#endif // KOKKOSKERNELS_ISKOKKOSCOMPLEX_HPP \ No newline at end of file diff --git a/common/impl/KokkosKernels_ScalarHint.hpp b/common/impl/KokkosKernels_ScalarHint.hpp new file mode 100644 index 0000000000..ae5eb9be82 --- /dev/null +++ b/common/impl/KokkosKernels_ScalarHint.hpp @@ -0,0 +1,33 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSKERNELS_SCALARHINT_HPP +#define KOKKOSKERNELS_SCALARHINT_HPP + +namespace KokkosKernels::Impl { + +/*! An enum that can be used as a template param to optimize an implementation +*/ +enum class ScalarHint : int { + none, + zero, + pos_one, + neg_one +}; + +} // namespace KokkosKernels::Impl + +#endif // KOKKOSKERNELS_SCALARHINT_HPP \ No newline at end of file diff --git a/common/impl/KokkosKernels_ViewUtils.hpp b/common/impl/KokkosKernels_ViewUtils.hpp new file mode 100644 index 0000000000..bae4b6fc1d --- /dev/null +++ b/common/impl/KokkosKernels_ViewUtils.hpp @@ -0,0 +1,47 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#include + +#include "Kokkos_Core.hpp" + +#ifndef KOKKOSKERNELS_VIEWUTILS_HPP +#define KOKKOSKERNELS_VIEWUTILS_HPP + +namespace KokkosKernels::Impl { + +template +struct is_rank_0 : std::false_type {}; +template +struct is_rank_0 && T::rank == 0>> + : std::true_type {}; +template +constexpr inline bool is_rank_0_v = is_rank_0::value; + +template +struct is_rank_1 : std::false_type {}; +template +struct is_rank_1 && T::rank == 1>> + : std::true_type {}; +template +constexpr inline bool is_rank_1_v = is_rank_1::value; + + +} // namespace KokkosKernels::Impl + +#endif // KOKKOSKERNELS_VIEWUTILS_HPP + diff --git a/common/src/KokkosKernels_helpers.hpp b/common/src/KokkosKernels_helpers.hpp index b36360b991..2e8c29ce20 100644 --- a/common/src/KokkosKernels_helpers.hpp +++ b/common/src/KokkosKernels_helpers.hpp @@ -43,15 +43,19 @@ struct GetUnifiedLayout { default_layout>::array_layout; }; +/* If T is not a view, type is TX::non_const_value_type +*/ template ::value> struct GetUnifiedScalarViewType { typedef typename TX::non_const_value_type type; }; +/* If T is a view, type is T with unified layout & unmanaged +*/ template struct GetUnifiedScalarViewType { - typedef Kokkos::View::array_layout, typename T::device_type, @@ -61,7 +65,7 @@ struct GetUnifiedScalarViewType { template struct GetUnifiedScalarViewType { - typedef Kokkos::View::array_layout, typename T::device_type, @@ -69,6 +73,7 @@ struct GetUnifiedScalarViewType { type; }; + template struct are_integral : std::bool_constant<((std::is_integral_v || std::is_enum_v)&&...)> {}; diff --git a/common/unit_test/Test_Common.hpp b/common/unit_test/Test_Common.hpp index 2ccf9c2103..38d4d9c2c8 100644 --- a/common/unit_test/Test_Common.hpp +++ b/common/unit_test/Test_Common.hpp @@ -26,6 +26,7 @@ #include #include #include +// #include #include #endif // TEST_COMMON_HPP diff --git a/common/unit_test/Test_Common_UnifiedScalarView.hpp b/common/unit_test/Test_Common_UnifiedScalarView.hpp new file mode 100644 index 0000000000..049824af0e --- /dev/null +++ b/common/unit_test/Test_Common_UnifiedScalarView.hpp @@ -0,0 +1,250 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + + +#ifndef TEST_KOKKOSKERNELS_UNIFIEDSCALARVIEW_HPP +#define TEST_KOKKOSKERNELS_UNIFIEDSCALARVIEW_HPP + +#include + +#include + +template +void test_is_unified_scalar() { + + static_assert( + KokkosKernels::Impl::is_scalar_or_scalar_view, + "" + ); + static_assert( + KokkosKernels::Impl::is_scalar_or_scalar_view, + "" + ); + static_assert( + KokkosKernels::Impl::is_scalar_or_scalar_view>, + "" + ); + static_assert( + KokkosKernels::Impl::is_scalar_or_scalar_view>, + "" + ); + static_assert( + KokkosKernels::Impl::is_scalar_or_scalar_view>, + "" + ); + static_assert( + KokkosKernels::Impl::is_scalar_or_scalar_view>, + "" + ); + // false cases + + static_assert( + !KokkosKernels::Impl::is_scalar_or_scalar_view>, + "" + ); + static_assert( + !KokkosKernels::Impl::is_scalar_or_scalar_view>, + "" + ); + + static_assert( + !KokkosKernels::Impl::is_scalar_or_scalar_view>, + "" + ); + static_assert( + !KokkosKernels::Impl::is_scalar_or_scalar_view>, + "" + ); + // could support this one, but seems unlikely to come up in practice + static_assert( + !KokkosKernels::Impl::is_scalar_or_scalar_view>, + "" + ); + static_assert( + !KokkosKernels::Impl::is_scalar_or_scalar_view>, + "" + ); + static_assert( + !KokkosKernels::Impl::is_scalar_or_scalar_view>, + "" + ); + static_assert( + !KokkosKernels::Impl::is_scalar_or_scalar_view>, + "" + ); + + static_assert( + !KokkosKernels::Impl::is_scalar_or_scalar_view, + "" + ); + + static_assert( + !KokkosKernels::Impl::is_scalar_or_scalar_view, + "" + ); +} + +template +void test_unified_scalar() { + + // test scalars + static_assert( + std::is_same_v>, + "" + ); + static_assert( + std::is_same_v>, + "" + ); + static_assert( + std::is_same_v>, + "" + ); + static_assert( + std::is_same_v>, + "" + ); + + // test 0D views + static_assert( + std::is_same_v>>, + "" + ); + static_assert( + std::is_same_v>>, + "" + ); + static_assert( + std::is_same_v>>, + "" + ); + static_assert( + std::is_same_v>>, + "" + ); + + // test 1D views + static_assert( + std::is_same_v>>, + "" + ); + static_assert( + std::is_same_v>>, + "" + ); + static_assert( + std::is_same_v>>, + "" + ); + static_assert( + std::is_same_v>>, + "" + ); +} + +template +struct Expect0DEqual { + using exp_type = typename View::value_type; + Expect0DEqual(const View &v, const exp_type &exp) : v_(v), exp_(exp) {} + void operator()(size_t i, int &lsum) const { + if (v_() != exp_) { + ++lsum; + } + } + View v_; + exp_type exp_; +}; + +template +struct Expect1DEqual { + using exp_type = typename View::value_type; + Expect1DEqual(const View &v, const exp_type &exp) : v_(v), exp_(exp) {} + void operator()(size_t i, int &lsum) const { + if (v_(0) != exp_) { + ++lsum; + } + } + View v_; + exp_type exp_; +}; + + +template +void test_get_scalar() { + + // constexpr context + static_assert(3 == KokkosKernels::Impl::get_scalar(3), ""); + + // 0D + { + Kokkos::View v; + Kokkos::deep_copy(v, ValueType(4)); + Kokkos::View cv = v; + + const Kokkos::RangePolicy policy(0, 1); // one thread + + { + Expect0DEqual op(v, ValueType(4)); + int err; + Kokkos::parallel_reduce("", policy, op, Kokkos::Sum(err)); + EXPECT_EQ(err, 0); + } + + { + Expect0DEqual op(cv, ValueType(4)); + int err; + Kokkos::parallel_reduce("", policy, op, Kokkos::Sum(err)); + EXPECT_EQ(err, 0); + } + } + + // 1D + { + Kokkos::View v; + Kokkos::deep_copy(v, 4); + Kokkos::View cv = v; + + const Kokkos::RangePolicy policy(0, 1); // one thread + + { + Expect1DEqual op(v, ValueType(4)); + int err; + Kokkos::parallel_reduce("", policy, op, Kokkos::Sum(err)); + EXPECT_EQ(err, 0); + } + + { + Expect1DEqual op(cv, ValueType(4)); + int err; + Kokkos::parallel_reduce("", policy, op, Kokkos::Sum(err)); + EXPECT_EQ(err, 0); + } + } + +} + +TEST_F(TestCategory, common_device_unifiedscalarview) { + // Test device-level bitonic with some larger arrays + + test_is_unified_scalar(); + test_is_unified_scalar>(); + test_unified_scalar(); + test_unified_scalar>(); + test_get_scalar(); + test_get_scalar>(); +} + +#endif // TEST_KOKKOSKERNELS_UNIFIEDSCALARVIEW_HPP diff --git a/sparse/src/KokkosSparse_spmv.hpp b/sparse/src/KokkosSparse_spmv.hpp index f43ec0bd54..f314d0efcd 100644 --- a/sparse/src/KokkosSparse_spmv.hpp +++ b/sparse/src/KokkosSparse_spmv.hpp @@ -69,6 +69,12 @@ void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], const BetaType& beta, const YVector& y, [[maybe_unused]] const RANK_ONE& tag) { + std::cerr << __FILE__ << ":" << __LINE__ << " KokkosSparse::spmv(" + << "A[" << A.numRows() << "," << A.numCols() << "]" + << ", x[" << x.size() << "]" + << ", y[" << y.size() << "]" + << ")\n"; + // Make sure that x and y have the same rank. static_assert( static_cast(XVector::rank) == static_cast(YVector::rank), @@ -135,10 +141,13 @@ void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], // This is required to maintain semantics of KokkosKernels native SpMV: // if y contains NaN but beta = 0, the result y should be filled with 0. // For example, this is useful for passing in uninitialized y and beta=0. - if (beta == Kokkos::ArithTraits::zero()) + if (beta == Kokkos::ArithTraits::zero()) { + std::cerr << __FILE__ << ":" << __LINE__ << " deep_copy y <- 0\n"; Kokkos::deep_copy(y_i, Kokkos::ArithTraits::zero()); - else + } else { + std::cerr << __FILE__ << ":" << __LINE__ << " scal y <- beta * y\n"; KokkosBlas::scal(y_i, beta, y_i); + } return; } @@ -184,6 +193,7 @@ void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], #endif if (useFallback) { + std::cerr << __FILE__ << ":" << __LINE__ << " fallback!\n"; // Explicitly call the non-TPL SPMV implementation std::string label = "KokkosSparse::spmv[NATIVE," + @@ -211,6 +221,7 @@ void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], y_i); Kokkos::Profiling::popRegion(); } else { + std::cerr << __FILE__ << ":" << __LINE__ << " no fallback!\n"; // note: the cuSPARSE spmv wrapper defines a profiling region, so one is not // needed here. Impl::SPMV(XVector::rank) == static_cast(YVector::rank), @@ -534,6 +552,12 @@ void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], const BetaType& beta, const YVector& y, [[maybe_unused]] const RANK_TWO& tag) { + std::cerr << __FILE__ << ":" << __LINE__ << " KokkosSparse::spmv(" + << "A[" << A.numRows() << "," << A.numCols() << "]" + << ", x[" << x.size() << "]" + << ", y[" << y.size() << "]" + << ")\n"; + // Make sure that x and y have the same rank. static_assert( static_cast(XVector::rank) == static_cast(YVector::rank), @@ -680,6 +704,13 @@ template (XVector::rank) == static_cast(YVector::rank), @@ -904,6 +935,13 @@ template (XVector::rank) == static_cast(YVector::rank), @@ -950,10 +988,16 @@ void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], // This is required to maintain semantics of KokkosKernels native SpMV: // if y contains NaN but beta = 0, the result y should be filled with 0. // For example, this is useful for passing in uninitialized y and beta=0. - if (beta == Kokkos::ArithTraits::zero()) + if (beta == Kokkos::ArithTraits::zero()) { + std::cerr << __FILE__ << ":" << __LINE__ << " deep_copy\n"; Kokkos::deep_copy(y, Kokkos::ArithTraits::zero()); - else + } else { + std::cerr << __FILE__ << ":" << __LINE__ << " scal\n"; KokkosBlas::scal(y, beta, y); + std::cerr << __FILE__ << ":" << __LINE__ << "...done scal!\n"; + } + + std::cerr << __FILE__ << ":" << __LINE__ << "return...\n"; return; } // diff --git a/sparse/unit_test/Test_Sparse_spmv.hpp b/sparse/unit_test/Test_Sparse_spmv.hpp index 9da0733581..ae23ed5761 100644 --- a/sparse/unit_test/Test_Sparse_spmv.hpp +++ b/sparse/unit_test/Test_Sparse_spmv.hpp @@ -164,6 +164,14 @@ void check_spmv( typename y_vector_type::non_const_value_type beta, char mode, typename Kokkos::ArithTraits::mag_type max_val) { + + std::cerr << __FILE__ << ":" << __LINE__ << " check_spmv(" + << "alpha=" << alpha + << ", beta=" << beta + << ", mode=" << mode + << ", max_val=" << max_val + << ")\n"; + // typedef typename crsMat_t::StaticCrsGraphType graph_t; using ExecSpace = typename crsMat_t::execution_space; using my_exec_space = Kokkos::RangePolicy; @@ -179,12 +187,16 @@ void check_spmv( Kokkos::deep_copy(expected_y, y); Kokkos::fence(); + std::cerr << __FILE__ << ":" << __LINE__ << " sequential_spmv...\n"; sequential_spmv(input_mat, x, expected_y, alpha, beta, mode); bool threw = false; std::string msg; try { + std::cerr << __FILE__ << ":" << __LINE__ << " KokkosSparse::spmv...\n"; KokkosSparse::spmv(controls, &mode, alpha, input_mat, x, beta, y); + std::cerr << __FILE__ << ":" << __LINE__ << " fence...\n"; Kokkos::fence(); + std::cerr << __FILE__ << ":" << __LINE__ << " ...done fence!\n"; } catch (std::exception &e) { threw = true; msg = e.what(); @@ -193,6 +205,7 @@ void check_spmv( << ": threw exception:\n" << msg << '\n'; int num_errors = 0; + std::cerr << __FILE__ << ":" << __LINE__ << " check result...\n"; Kokkos::parallel_reduce( "KokkosSparse::Test::spmv", my_exec_space(0, y.extent(0)), fSPMV(expected_y, y, eps, max_val), @@ -447,14 +460,17 @@ void test_spmv(const Controls &controls, lno_t numRows, size_type nnz, const lno_t max_nnz_per_row = numRows ? (nnz / numRows + row_size_variance) : 0; + std::cerr << __FILE__ << ":" << __LINE__ << "\n"; x_vector_type input_x("x", nc); y_vector_type output_y("y", nr); x_vector_type input_xt("x", nr); y_vector_type output_yt("y", nc); + std::cerr << __FILE__ << ":" << __LINE__ << "\n"; Kokkos::Random_XorShift64_Pool rand_pool( 13718); + std::cerr << __FILE__ << ":" << __LINE__ << "\n"; Kokkos::fill_random(input_x, rand_pool, randomUpperBound(max_x)); Kokkos::fill_random(output_y, rand_pool, randomUpperBound(max_y)); Kokkos::fill_random(input_xt, rand_pool, randomUpperBound(max_x)); @@ -463,6 +479,7 @@ void test_spmv(const Controls &controls, lno_t numRows, size_type nnz, // We also need to bound the values // in the matrix to bound the cancellations // coming from arithmetic operations. + std::cerr << __FILE__ << ":" << __LINE__ << "\n"; Kokkos::fill_random(input_mat.values, rand_pool, randomUpperBound(max_val)); @@ -480,6 +497,7 @@ void test_spmv(const Controls &controls, lno_t numRows, size_type nnz, for (double beta : testAlphaBeta) { mag_t max_error = beta * max_y + alpha * max_nnz_per_row * max_val * max_x; + std::cerr << __FILE__ << ":" << __LINE__ << "\n"; Test::check_spmv(controls, input_mat, input_x, output_y, alpha, beta, mode, max_error); } @@ -491,6 +509,7 @@ void test_spmv(const Controls &controls, lno_t numRows, size_type nnz, // hoping the transpose won't have a long column... mag_t max_error = beta * max_y + alpha * max_nnz_per_row * max_val * max_x; + std::cerr << __FILE__ << ":" << __LINE__ << "\n"; Test::check_spmv(controls, input_mat, input_xt, output_yt, alpha, beta, mode, max_error); } @@ -503,12 +522,26 @@ template ( controls, numRows, nnz, bandwidth, row_size_variance, heavy); } { + std::cerr << __FILE__ << ":" << __LINE__ + << " test_spmv_algorithms[algorithm=native](" + << numRows << ", " + << nnz << ", " + << bandwidth << ", " + << row_size_variance << ", " + << heavy << ")\n"; Controls controls; controls.setParameter("algorithm", "native"); test_spmv( @@ -1597,24 +1630,32 @@ void test_spmv_bsrmatrix(lno_t blockSize, lno_t k, y_scalar_t alpha, #define EXECUTE_TEST_ISSUE_101(DEVICE) \ TEST_F(TestCategory, sparse##_##spmv_issue_101##_##OFFSET##_##DEVICE) { \ + std::cerr << __FILE__ << ":" << __LINE__ << "(1)\n"; \ test_github_issue_101(); \ } #define EXECUTE_TEST_FN(SCALAR, ORDINAL, OFFSET, DEVICE) \ TEST_F(TestCategory, \ sparse##_##spmv##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + std::cerr << __FILE__ << ":" << __LINE__ << "(2)\n"; \ test_spmv_algorithms(1000, 1000 * 3, 200, \ 10, true); \ + std::cerr << __FILE__ << ":" << __LINE__ << "(3)\n"; \ test_spmv_algorithms(1000, 1000 * 3, 100, \ 10, true); \ + std::cerr << __FILE__ << ":" << __LINE__ << "(4)\n"; \ test_spmv_algorithms(1000, 1000 * 20, \ 100, 5, true); \ + std::cerr << __FILE__ << ":" << __LINE__ << "(5)\n"; \ test_spmv_algorithms(50000, 50000 * 3, \ 20, 10, false); \ + std::cerr << __FILE__ << ":" << __LINE__ << "(6)\n"; \ test_spmv_algorithms(50000, 50000 * 3, \ 100, 10, false); \ + std::cerr << __FILE__ << ":" << __LINE__ << "(7)\n"; \ test_spmv_algorithms(10000, 10000 * 2, \ 100, 5, false); \ + std::cerr << __FILE__ << ":" << __LINE__ << "(8)\n"; \ test_spmv_controls(10000, 10000 * 20, \ 100, 5); \ } @@ -1623,18 +1664,25 @@ void test_spmv_bsrmatrix(lno_t blockSize, lno_t k, y_scalar_t alpha, TEST_F( \ TestCategory, \ sparse##_##spmv_mv##_##SCALAR##_##ORDINAL##_##OFFSET##_##LAYOUT##_##DEVICE) { \ + std::cerr << __FILE__ << ":" << __LINE__ << "(9)\n"; \ test_spmv_mv( \ 1000, 1000 * 3, 200, 10, true, 1); \ + std::cerr << __FILE__ << ":" << __LINE__ << "(10)\n"; \ test_spmv_mv( \ 1000, 1000 * 3, 100, 10, true, 5); \ + std::cerr << __FILE__ << ":" << __LINE__ << "(11)\n"; \ test_spmv_mv( \ 1000, 1000 * 2, 100, 5, true, 10); \ + std::cerr << __FILE__ << ":" << __LINE__ << "(12)\n"; \ test_spmv_mv( \ 50000, 50000 * 3, 20, 10, false, 1); \ + std::cerr << __FILE__ << ":" << __LINE__ << "(13)\n"; \ test_spmv_mv( \ 50000, 50000 * 3, 100, 10, false, 1); \ + std::cerr << __FILE__ << ":" << __LINE__ << "(14)\n"; \ test_spmv_mv( \ 10000, 10000 * 2, 100, 5, false, 5); \ + std::cerr << __FILE__ << ":" << __LINE__ << "(15)\n"; \ test_spmv_mv_heavy( \ 200, 200 * 10, 60, 4, 30); \ } @@ -1643,14 +1691,23 @@ void test_spmv_bsrmatrix(lno_t blockSize, lno_t k, y_scalar_t alpha, TEST_F( \ TestCategory, \ sparse##_##spmv_struct##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + std::cerr << __FILE__ << ":" << __LINE__ << "()\n"; \ test_spmv_struct_1D(10, 1, 1); \ + std::cerr << __FILE__ << ":" << __LINE__ << "()\n"; \ test_spmv_struct_2D(25, 21, 3, 3); \ + std::cerr << __FILE__ << ":" << __LINE__ << "()\n"; \ test_spmv_struct_2D(20, 25, 3, 3); \ + std::cerr << __FILE__ << ":" << __LINE__ << "()\n"; \ test_spmv_struct_2D(22, 22, 3, 3); \ + std::cerr << __FILE__ << ":" << __LINE__ << "()\n"; \ test_spmv_struct_3D(20, 20, 20, 3, 3, 3); \ + std::cerr << __FILE__ << ":" << __LINE__ << "()\n"; \ test_spmv_struct_3D(22, 22, 22, 3, 3, 3); \ + std::cerr << __FILE__ << ":" << __LINE__ << "()\n"; \ test_spmv_struct_3D(25, 10, 20, 3, 3, 3); \ + std::cerr << __FILE__ << ":" << __LINE__ << "()\n"; \ test_spmv_struct_3D(10, 20, 25, 3, 3, 3); \ + std::cerr << __FILE__ << ":" << __LINE__ << "()\n"; \ test_spmv_struct_3D(10, 24, 20, 3, 3, 3); \ } @@ -1658,8 +1715,10 @@ void test_spmv_bsrmatrix(lno_t blockSize, lno_t k, y_scalar_t alpha, TEST_F( \ TestCategory, \ sparse##_##spmv_mv_struct##_##SCALAR##_##ORDINAL##_##OFFSET##_##LAYOUT##_##DEVICE) { \ + std::cerr << __FILE__ << ":" << __LINE__ << "()\n"; \ test_spmv_mv_struct_1D( \ 10, 1); \ + std::cerr << __FILE__ << ":" << __LINE__ << "()\n"; \ test_spmv_mv_struct_1D( \ 10, 2); \ } @@ -1673,50 +1732,70 @@ void test_spmv_bsrmatrix(lno_t blockSize, lno_t k, y_scalar_t alpha, TestCategory, \ sparse##_##spmv_tensor_core##_##ASCALAR##_##XSCALAR##_##YSCALAR##_##ORDINAL##_##OFFSET##_##LAYOUT##_##DEVICE) { \ /* easy case with different alphas and betas*/ \ + std::cerr << __FILE__ << ":" << __LINE__ << "()\n"; \ test_spmv_bsrmatrix(16, 16, 0, 0); \ + std::cerr << __FILE__ << ":" << __LINE__ << "()\n"; \ test_spmv_bsrmatrix(16, 16, 1, 0); \ + std::cerr << __FILE__ << ":" << __LINE__ << "()\n"; \ test_spmv_bsrmatrix(16, 16, 0, 1); \ + std::cerr << __FILE__ << ":" << __LINE__ << "()\n"; \ test_spmv_bsrmatrix(16, 16, 1, 1); \ /* easy case with a real alpha/beta */ \ + std::cerr << __FILE__ << ":" << __LINE__ << "()\n"; \ test_spmv_bsrmatrix(16, 16, 1.25, -2.73); \ /* smaller block size with k < and > block size*/ \ + std::cerr << __FILE__ << ":" << __LINE__ << "()\n"; \ test_spmv_bsrmatrix(7, 6, 1.25, -2.73); \ + std::cerr << __FILE__ << ":" << __LINE__ << "()\n"; \ test_spmv_bsrmatrix(7, 7, 1.25, -2.73); \ + std::cerr << __FILE__ << ":" << __LINE__ << "()\n"; \ test_spmv_bsrmatrix(7, 8, 1.25, -2.73); \ /* smaller block size with k < and > block size*/ \ + std::cerr << __FILE__ << ":" << __LINE__ << "()\n"; \ test_spmv_bsrmatrix(15, 14, 1.25, -2.73); \ + std::cerr << __FILE__ << ":" << __LINE__ << "()\n"; \ test_spmv_bsrmatrix(15, 15, 1.25, -2.73); \ + std::cerr << __FILE__ << ":" << __LINE__ << "()\n"; \ test_spmv_bsrmatrix(15, 16, 1.25, -2.73); \ /* larger block size with k < and > block size*/ \ + std::cerr << __FILE__ << ":" << __LINE__ << "()\n"; \ test_spmv_bsrmatrix(17, 16, 1.25, -2.73); \ + std::cerr << __FILE__ << ":" << __LINE__ << "()\n"; \ test_spmv_bsrmatrix(17, 17, 1.25, -2.73); \ + std::cerr << __FILE__ << ":" << __LINE__ << "()\n"; \ test_spmv_bsrmatrix(17, 18, 1.25, -2.73); \ /* larger block size with k < and > block size*/ \ + std::cerr << __FILE__ << ":" << __LINE__ << "()\n"; \ test_spmv_bsrmatrix(32, 31, 1.25, -2.73); \ + std::cerr << __FILE__ << ":" << __LINE__ << "()\n"; \ test_spmv_bsrmatrix(32, 32, 1.25, -2.73); \ + std::cerr << __FILE__ << ":" << __LINE__ << "()\n"; \ test_spmv_bsrmatrix(32, 33, 1.25, -2.73); \ /* more than one team per block*/ \ + std::cerr << __FILE__ << ":" << __LINE__ << "()\n"; \ test_spmv_bsrmatrix(33, 13, 1.25, -2.73); \ + std::cerr << __FILE__ << ":" << __LINE__ << "()\n"; \ test_spmv_bsrmatrix(33, 27, 1.25, -2.73); \ + std::cerr << __FILE__ << ":" << __LINE__ << "()\n"; \ test_spmv_bsrmatrix(33, 41, 1.25, -2.73); \ }