
RVV SIMD acceleration #160


Open · wants to merge 1 commit into base: main
10 changes: 10 additions & 0 deletions CMakeLists.txt
@@ -187,6 +187,16 @@ int main() {
return 0;
}" SNAPPY_HAVE_NEON)

check_cxx_source_compiles("
#include <riscv_vector.h>
int main() {
uint8_t val = 3, dup[8];
vuint8m1_t v = vmv_v_x_u8m1(val, 128);
vuint64m1_t vv = vreinterpret_v_u8m1_u64m1(v);
vse64_v_u64m1(reinterpret_cast<uint64_t*>(dup), vv, 128);
return 0;
}" SNAPPY_HAVE_RVV)

include(CheckSymbolExists)
check_symbol_exists("mmap" "sys/mman.h" HAVE_FUNC_MMAP)
check_symbol_exists("sysconf" "unistd.h" HAVE_FUNC_SYSCONF)
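Note that the probe above uses the pre-v1.0 RVV intrinsic spellings (vmv_v_x_u8m1 and friends, with no __riscv_ prefix), so it only succeeds on toolchains that still ship that older naming. As a hedged sketch only, not part of this change: an equivalent probe for toolchains implementing the ratified v1.0 intrinsics API would use the prefixed names. The buffer size and vl values here are chosen to be run-safe, even though check_cxx_source_compiles only compiles the snippet.

#include <riscv_vector.h>
#include <stdint.h>

int main() {
  uint8_t val = 3;
  uint64_t dup[16];  // generous scratch; the probe is compile-only anyway
  vuint8m1_t v = __riscv_vmv_v_x_u8m1(val, 16);           // 16 x u8 = 128 bits
  vuint64m1_t vv = __riscv_vreinterpret_v_u8m1_u64m1(v);  // same bits, viewed as u64 lanes
  __riscv_vse64_v_u64m1(dup, vv, 2);                      // 2 x u64 = 128 bits
  return 0;
}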
3 changes: 3 additions & 0 deletions cmake/config.h.in
@@ -56,4 +56,7 @@
first (like Motorola and SPARC, unlike Intel and VAX). */
#cmakedefine01 SNAPPY_IS_BIG_ENDIAN

/* Define to 1 if you target processors with RVV and have <riscv_vector.h>. */
#cmakedefine01 SNAPPY_HAVE_RVV

#endif // THIRD_PARTY_SNAPPY_OPENSOURCE_CMAKE_CONFIG_H_
34 changes: 31 additions & 3 deletions snappy-internal.h
@@ -44,7 +44,11 @@
#include <arm_neon.h>
#endif

#if SNAPPY_HAVE_SSSE3 || SNAPPY_HAVE_NEON
#if SNAPPY_HAVE_RVV
#include <riscv_vector.h>
#endif

#if SNAPPY_HAVE_SSSE3 || SNAPPY_HAVE_NEON || SNAPPY_HAVE_RVV
#define SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE 1
#else
#define SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE 0
@@ -58,6 +62,8 @@ namespace internal {
using V128 = __m128i;
#elif SNAPPY_HAVE_NEON
using V128 = uint8x16_t;
#elif SNAPPY_HAVE_RVV
using V128 = vuint8m1_t;
#endif

// Load 128 bits of integer data. `src` must be 16-byte aligned.
@@ -108,7 +114,29 @@ inline V128 V128_Shuffle(V128 input, V128 shuffle_mask) {
}

inline V128 V128_DupChar(char c) { return vdupq_n_u8(c); }
#endif

#elif SNAPPY_HAVE_RVV
inline V128 V128_Load(const V128* src) {
return vle8_v_u8m1(reinterpret_cast<const uint8_t*>(src), 128);
}

inline V128 V128_LoadU(const V128* src) {
return vle8_v_u8m1(reinterpret_cast<const uint8_t*>(src), 128);
}

inline void V128_StoreU(V128* dst, V128 val) {
vse8_v_u8m1(reinterpret_cast<uint8_t*>(dst), val, 128);
}

inline V128 V128_Shuffle(V128 input, V128 shuffle_mask) {
return vrgather_vv_u8m1(input, shuffle_mask, 128);
}

inline V128 V128_DupChar(char c) {
return vmv_v_x_u8m1(c, 128);
}

#endif // SNAPPY_HAVE_RVV
#endif // SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE

// Working memory performs a single allocation to hold all scratch space
@@ -172,7 +200,7 @@ char* CompressFragment(const char* input,
// Separate implementation for 64-bit, little-endian cpus.
#if !SNAPPY_IS_BIG_ENDIAN && \
(defined(__x86_64__) || defined(_M_X64) || defined(ARCH_PPC) || \
defined(ARCH_ARM))
defined(ARCH_ARM) || defined(__riscv))
static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
const char* s2,
const char* s2_limit,
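For reference, a minimal standalone sketch of how the RVV wrappers added above map onto the underlying intrinsics. It is not part of this change and makes two assumptions: the toolchain provides the unprefixed (pre-v1.0) intrinsic names used in the patch, and vl is set to 16 so each operation covers exactly one 128-bit block of uint8 lanes (the patch itself passes 128 as the vl argument).

#include <riscv_vector.h>
#include <stdint.h>
#include <stdio.h>

int main() {
  uint8_t src[16], shuffled[16], mask[16];
  for (int i = 0; i < 16; ++i) {
    src[i] = static_cast<uint8_t>(i);          // 0, 1, ..., 15
    mask[i] = static_cast<uint8_t>(15 - i);    // byte-reversal permutation
  }

  const size_t vl = 16;                        // one 128-bit block of u8 lanes
  vuint8m1_t v = vle8_v_u8m1(src, vl);         // V128_LoadU
  vuint8m1_t m = vle8_v_u8m1(mask, vl);
  vuint8m1_t r = vrgather_vv_u8m1(v, m, vl);   // V128_Shuffle
  vse8_v_u8m1(shuffled, r, vl);                // V128_StoreU

  vuint8m1_t d = vmv_v_x_u8m1(0x2A, vl);       // V128_DupChar
  vse8_v_u8m1(src, d, vl);

  printf("%d %d %02x\n", shuffled[0], shuffled[15], src[0]);  // prints "15 0 2a"
  return 0;
}

Here vrgather.vv plays the role that PSHUFB and TBL play on the SSSE3 and NEON paths, which is why V128_Shuffle maps directly onto it.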
44 changes: 38 additions & 6 deletions snappy.cc
@@ -232,22 +232,32 @@ inline constexpr std::array<char, sizeof...(indexes)> MakePatternMaskBytes(
return {static_cast<char>((index_offset + indexes) % pattern_size)...};
}

template <typename V>
struct SizeOfV128 {
static constexpr uint64_t size = sizeof(V);
};

template <>
struct SizeOfV128<vuint8m1_t> {
static constexpr uint64_t size = 128;
};

// Computes the shuffle control mask bytes array for given pattern-sizes and
// returns an array.
template <size_t... pattern_sizes_minus_one>
inline constexpr std::array<std::array<char, sizeof(V128)>,
inline constexpr std::array<std::array<char, SizeOfV128<V128>::size>,
sizeof...(pattern_sizes_minus_one)>
MakePatternMaskBytesTable(int index_offset,
index_sequence<pattern_sizes_minus_one...>) {
return {
MakePatternMaskBytes(index_offset, pattern_sizes_minus_one + 1,
make_index_sequence</*indexes=*/sizeof(V128)>())...};
make_index_sequence</*indexes=*/SizeOfV128<V128>::size>())...};
}

// This is an array of shuffle control masks that can be used as the source
// operand for PSHUFB to permute the contents of the destination XMM register
// into a repeating byte pattern.
alignas(16) constexpr std::array<std::array<char, sizeof(V128)>,
alignas(16) constexpr std::array<std::array<char, SizeOfV128<V128>::size>,
16> pattern_generation_masks =
MakePatternMaskBytesTable(
/*index_offset=*/0,
@@ -258,7 +268,7 @@ alignas(16) constexpr std::array<std::array<char, sizeof(V128)>,
// Basically, pattern_reshuffle_masks is a continuation of
// pattern_generation_masks. It follows that, pattern_reshuffle_masks is same as
// pattern_generation_masks for offsets 1, 2, 4, 8 and 16.
alignas(16) constexpr std::array<std::array<char, sizeof(V128)>,
alignas(16) constexpr std::array<std::array<char, SizeOfV128<V128>::size>,
16> pattern_reshuffle_masks =
MakePatternMaskBytesTable(
/*index_offset=*/16,
Expand All @@ -275,6 +285,15 @@ static inline V128 LoadPattern(const char* src, const size_t pattern_size) {
generation_mask);
}

// RVV vector types are sizeless, so they cannot be carried in a std::pair;
// a macro defines the pattern/reshuffle_mask locals at the call site instead.
#if SNAPPY_HAVE_RVV
#define LoadPatternAndReshuffleMask(src, pattern_size) \
V128 pattern = LoadPattern(src, pattern_size);\
V128 reshuffle_mask = V128_Load(reinterpret_cast<const V128*>(\
pattern_reshuffle_masks[pattern_size - 1].data()));

#else

SNAPPY_ATTRIBUTE_ALWAYS_INLINE
static inline std::pair<V128 /* pattern */, V128 /* reshuffle_mask */>
LoadPatternAndReshuffleMask(const char* src, const size_t pattern_size) {
@@ -290,6 +309,7 @@ LoadPatternAndReshuffleMask(const char* src, const size_t pattern_size) {
pattern_reshuffle_masks[pattern_size - 1].data()));
return {pattern, reshuffle_mask};
}
#endif

#endif // SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE
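
The SizeOfV128 specialization above exists because vuint8m1_t is a sizeless type: sizeof(V128) is ill-formed for it, and it cannot be an element of std::array or a member of std::pair. A hedged standalone sketch of that constraint and the trait workaround, mirroring the patch and the 128-byte mask width it assumes:

#include <riscv_vector.h>
#include <array>
#include <stdint.h>
#include <stdio.h>

// Primary template: ordinary vector types report their real size.
template <typename V>
struct SizeOfV128 { static constexpr uint64_t size = sizeof(V); };

// vuint8m1_t is sizeless, so its width must be supplied by hand
// (128 bytes here, matching the vl the patch passes to the intrinsics).
template <>
struct SizeOfV128<vuint8m1_t> { static constexpr uint64_t size = 128; };

int main() {
  // std::array<char, sizeof(vuint8m1_t)> would not compile; the trait value does.
  std::array<char, SizeOfV128<vuint8m1_t>::size> mask_row{};
  printf("%zu\n", mask_row.size());  // prints "128"
  return 0;
}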

@@ -324,10 +344,14 @@ static inline bool Copy64BytesWithPatternExtension(char* dst, size_t offset) {
return true;
}
default: {
#if SNAPPY_HAVE_RVV
LoadPatternAndReshuffleMask(dst - offset, offset)
#else
auto pattern_and_reshuffle_mask =
LoadPatternAndReshuffleMask(dst - offset, offset);
V128 pattern = pattern_and_reshuffle_mask.first;
V128 reshuffle_mask = pattern_and_reshuffle_mask.second;
#endif
for (int i = 0; i < 4; i++) {
V128_StoreU(reinterpret_cast<V128*>(dst + 16 * i), pattern);
pattern = V128_Shuffle(pattern, reshuffle_mask);
@@ -435,10 +459,14 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
// Typically, the op_limit is the gating factor so try to simplify the loop
// based on that.
if (SNAPPY_PREDICT_TRUE(op_limit <= buf_limit - 15)) {
#if SNAPPY_HAVE_RVV
LoadPatternAndReshuffleMask(src, pattern_size)
#else
auto pattern_and_reshuffle_mask =
LoadPatternAndReshuffleMask(src, pattern_size);
V128 pattern = pattern_and_reshuffle_mask.first;
V128 reshuffle_mask = pattern_and_reshuffle_mask.second;
#endif

// There is at least one, and at most four 16-byte blocks. Writing four
// conditionals instead of a loop allows FDO to layout the code with
@@ -462,10 +490,14 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
}
char* const op_end = buf_limit - 15;
if (SNAPPY_PREDICT_TRUE(op < op_end)) {
#if SNAPPY_HAVE_RVV
LoadPatternAndReshuffleMask(src, pattern_size)
#else
auto pattern_and_reshuffle_mask =
LoadPatternAndReshuffleMask(src, pattern_size);
V128 pattern = pattern_and_reshuffle_mask.first;
V128 reshuffle_mask = pattern_and_reshuffle_mask.second;
#endif

// This code path is relatively cold however so we save code size
// by avoiding unrolling and vectorizing.
@@ -1099,7 +1131,7 @@ inline uint32_t ExtractOffset(uint32_t val, size_t tag_type) {
reinterpret_cast<const char*>(&kExtractMasksCombined) + 2 * tag_type,
sizeof(result));
return val & result;
#elif defined(__aarch64__)
#elif defined(__aarch64__) || defined(__riscv)
constexpr uint64_t kExtractMasksCombined = 0x0000FFFF00FF0000ull;
return val & static_cast<uint32_t>(
(kExtractMasksCombined >> (tag_type * 16)) & 0xFFFF);
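
The __riscv branch simply reuses the AArch64 scalar formulation: the 64-bit constant packs the four 16-bit extract masks, one per tag type. A small self-contained check of that arithmetic, plain scalar C++ with no RVV required:

#include <stdint.h>
#include <stdio.h>

int main() {
  // 16-bit lanes, low to high: tag_type 0 -> 0x0000, 1 -> 0x00FF,
  // 2 -> 0xFFFF, 3 -> 0x0000.
  constexpr uint64_t kExtractMasksCombined = 0x0000FFFF00FF0000ull;
  for (unsigned tag_type = 0; tag_type < 4; ++tag_type) {
    unsigned mask = static_cast<unsigned>(
        (kExtractMasksCombined >> (tag_type * 16)) & 0xFFFF);
    printf("tag_type %u -> mask 0x%04X\n", tag_type, mask);
  }
  return 0;
}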
@@ -1149,7 +1181,7 @@ std::pair<const uint8_t*, ptrdiff_t> DecompressBranchless(
// For literals tag_type = 0, hence we will always obtain 0 from
// ExtractLowBytes. For literals offset will thus be kLiteralOffset.
ptrdiff_t len_min_offset = kLengthMinusOffset[tag];
#if defined(__aarch64__)
#if defined(__aarch64__) || defined(__riscv)
size_t tag_type = AdvanceToNextTagARMOptimized(&ip, &tag);
#else
size_t tag_type = AdvanceToNextTagX86Optimized(&ip, &tag);