Skip to content
This repository was archived by the owner on May 23, 2024. It is now read-only.

Proposition & discussion for a ThreadSimdRange execution policy #24

@tpadioleau

Description

@tpadioleau

From what I can see in the Kokkos tutorials, a way to use a Kokkos::View with the simd-math library is to use a view based on a simd type. It does not seem easy to express stencil computation with this kind of view as we need direct neighbours. Hence I would like to propose an alternative using a new execution policy, ThreadSimdRange, to handle this type of computation that is close to the existing Kokkos::ThreadVectorRange.

// 2-D scalar view; with LayoutLeft, elements view(i, j) for consecutive i
// are contiguous in memory, which is what the simd loads/stores below rely on.
using view_type = Kokkos::View< double**, Kokkos::LayoutLeft >;

// Pick the simd ABI for the active backend: a 32-thread warp lane group on
// CUDA, the host's native vector width otherwise.
#ifdef KOKKOS_ENABLE_CUDA
    using simd_t = simd::simd< double, simd::simd_abi::cuda_warp< 32 > >;
#else
    using simd_t = simd::simd< double, simd::simd_abi::native >;
#endif

class SimdRange
{
public:
    using simd_type = simd_t;

    KOKKOS_INLINE_FUNCTION
    SimdRange( int begin ) noexcept
        : m_begin( begin )
    {
    }

    KOKKOS_INLINE_FUNCTION
    static constexpr auto size() noexcept
    {
        return simd_type::size();
    }

    KOKKOS_INLINE_FUNCTION
    int begin() const noexcept
    {
        return m_begin;
    }

    KOKKOS_INLINE_FUNCTION
    int end() const noexcept
    {
        return m_begin + this->size();
    }

private:
    int m_begin;
};

class ScalarRange
{
public:
    KOKKOS_INLINE_FUNCTION
    ScalarRange( int begin ) noexcept
        : m_begin( begin )
    {
    }

    KOKKOS_INLINE_FUNCTION
    static constexpr auto size() noexcept
    {
        return 1;
    }

    KOKKOS_INLINE_FUNCTION
    int begin() const noexcept
    {
        return m_begin;
    }

    KOKKOS_INLINE_FUNCTION
    int end() const noexcept
    {
        return m_begin + this->size();
    }

private:
    int m_begin;
};

// Returns a copy of `x` translated by `s` indices (s may be negative).
KOKKOS_INLINE_FUNCTION
SimdRange shift( const SimdRange& x, int s )
{
    const int first = x.begin() + s;
    return SimdRange( first );
}

// Returns a copy of `x` translated by `s` indices (s may be negative).
KOKKOS_INLINE_FUNCTION
ScalarRange shift( const ScalarRange& x, int s )
{
    const int first = x.begin() + s;
    return ScalarRange( first );
}

// Loads simd_type::size() contiguous doubles starting at ptr + range.begin().
// element_aligned_tag: only alignof(double) is assumed, so unaligned stencil
// offsets are fine.
KOKKOS_INLINE_FUNCTION
SimdRange::simd_type
load( const double* ptr, const SimdRange& range )
{
    const double* first = ptr + range.begin();
    return SimdRange::simd_type( first, simd::element_aligned_tag() );
}

// Scalar counterpart of the simd load: reads the single element the range covers.
KOKKOS_INLINE_FUNCTION
double
load( const double* ptr, const ScalarRange& range )
{
    return ptr[ range.begin() ];
}

// Writes the simd value to simd_type::size() contiguous doubles starting at
// ptr + range.begin(); element-aligned, so unaligned offsets are fine.
KOKKOS_INLINE_FUNCTION
void
store( double* ptr, const SimdRange& range, const SimdRange::simd_type& simd )
{
    double* first = ptr + range.begin();
    simd.copy_to( first, simd::element_aligned_tag() );
}

// Scalar counterpart of the simd store: writes the single element the range covers.
KOKKOS_INLINE_FUNCTION
void
store( double* ptr, const ScalarRange& range, const double& scalar )
{
    ptr[ range.begin() ] = scalar;
}

class ThreadSimdRange
{
public:
    using scalar_range = ScalarRange;
    using simd_range = SimdRange;

    KOKKOS_INLINE_FUNCTION
    ThreadSimdRange( int begin, int end ) noexcept
        : m_begin( begin )
        , m_end( end )
    {
    }

    KOKKOS_INLINE_FUNCTION
    ThreadSimdRange( int count ) noexcept
        : ThreadSimdRange( 0, count )
    {
    }

    KOKKOS_INLINE_FUNCTION
    int begin() const noexcept
    {
        return m_begin;
    }

    KOKKOS_INLINE_FUNCTION
    int end() const noexcept
    {
        return m_end;
    }

private:
    int m_begin;
    int m_end;
};

// Splits [range.begin(), range.end()) into a vectorized part processed
// simd_range::size() indices at a time and a scalar remainder — mimicking
// what a vectorizing compiler does for a scalar loop.
//
// The functor must be callable with both ThreadSimdRange::simd_range and
// ThreadSimdRange::scalar_range (e.g. a generic lambda).
// `f` is invoked repeatedly, so it is deliberately not std::forward-ed.
template < class Functor >
KOKKOS_INLINE_FUNCTION
void
parallel_for( const ThreadSimdRange& range, Functor&& f )
{
    // Consistently use the policy's own aliases (the original mixed the
    // alias with the concrete SimdRange/ScalarRange types), so swapping the
    // underlying range types only requires touching ThreadSimdRange.
    using simd_range = ThreadSimdRange::simd_range;
    using scalar_range = ThreadSimdRange::scalar_range;

    const auto count = range.end() - range.begin();
    // Last index reachable by whole simd-width chunks; the tail
    // [end_vector, end) holds fewer than simd_range::size() indices.
    const auto end_vector = range.end() - count % simd_range::size();

    // Vectorized loop: one full simd-width window per iteration.
    for ( auto i = range.begin(); i < end_vector; i += simd_range::size() )
    {
        f( simd_range( i ) );
    }

    // Remainder loop.
#if defined(__CUDACC__) && defined(KOKKOS_ENABLE_CUDA)
    // On CUDA every thread of the lane group executes this function, so the
    // scalar remainder is strided over the threads of the block.
    for ( auto i = end_vector + threadIdx.x; i < range.end(); i += blockDim.x )
    {
        f( scalar_range( i ) );
    }
#else
    for ( auto i = end_vector; i < range.end(); ++i )
    {
        f( scalar_range( i ) );
    }
#endif
}

// ...

// Example usage: one team per column j (league size = extent(1)), team size 1,
// with simd_t::size() vector lanes; the nested parallel_for sweeps i.
// NOTE(review): `view2` is presumably an output view with the same shape as
// `view` — it is not declared in this snippet; confirm against the full code.
Kokkos::parallel_for( "Stencil computation", Kokkos::TeamPolicy<>( view.extent( 1 ), 1, simd_t::size() ),
    KOKKOS_LAMBDA(const Kokkos::TeamPolicy<>::member_type& team)
    {
        const int j = team.league_rank() + 1;
        parallel_for( ThreadSimdRange( 1, view.extent( 0 ) - 1 ),
            [&]( const auto& irange ) // represents [i, i + 1, ...]
            {
                const auto T_ij  = load( &view( 0, j     ), irange );              // loads [view( i    , j     ), view( i + 1, j     ), ...]
                const auto T_imj = load( &view( 0, j     ), shift( irange, -1 ) ); // loads [view( i - 1, j     ), view( i    , j     ), ...]
                const auto T_ipj = load( &view( 0, j     ), shift( irange, +1 ) ); // loads [view( i + 1, j     ), view( i + 2, j     ), ...]
                const auto T_ijm = load( &view( 0, j - 1 ), irange );              // loads [view( i    , j - 1 ), view( i + 1, j - 1 ), ...]
                const auto T_ijp = load( &view( 0, j + 1 ), irange );              // loads [view( i    , j + 1 ), view( i + 1, j + 1 ), ...]
                const auto T2_ij = T_ipj + T_ijp + T_ij - T_imj - T_ijm;
                store( &view2( 0, j ), irange, T2_ij );
            } );
    } );

The SimdRange represents a contiguous set of indices that will be used to load/store (unaligned) data from a scalar Kokkos::View. The associated parallel_for should mimic the work of the compiler by splitting the loop into a vectorized loop and a scalar loop for the remainder.

A drawback is the need for C++14 because of the generic lambda, but the code compiles and runs with recent compilers.

Do you see any problem with this execution policy?

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions