Mathias's Patch: Perf tuning for gcc + aarch64 #2


Merged
merged 1 commit into from Feb 7, 2024
24 changes: 18 additions & 6 deletions snappy-internal.h
@@ -231,9 +231,7 @@ static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
     int shift = Bits::FindLSBSetNonZero64(xorval);
     size_t matched_bytes = shift >> 3;
     uint64_t a3 = UNALIGNED_LOAD64(s2 + 4);
-#ifndef __x86_64__
-    a2 = static_cast<uint32_t>(xorval) == 0 ? a3 : a2;
-#else
+#ifdef __x86_64__
     // Ideally this would just be
     //
     // a2 = static_cast<uint32_t>(xorval) == 0 ? a3 : a2;
@@ -248,6 +246,14 @@ static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
         : "+r"(a2)
         : "r"(a3), "r"(xorval)
         : "cc");
+#elif defined(__aarch64__)
+    asm("cmp %w[xorval], 0\n\t"
+        "csel %x[a2], %[a3], %[a2], eq\n\t"
+        : [a2] "+r" (a2)
+        : [a3] "r" (a3) , [xorval] "r" (xorval)
+        : "cc");
+#else
+    a2 = static_cast<uint32_t>(xorval) == 0 ? a3 : a2;
 #endif
     *data = a2 >> (shift & (3 * 8));
     return std::pair<size_t, bool>(matched_bytes, true);
@@ -272,14 +278,20 @@ static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
     int shift = Bits::FindLSBSetNonZero64(xorval);
     size_t matched_bytes = shift >> 3;
     uint64_t a3 = UNALIGNED_LOAD64(s2 + 4);
-#ifndef __x86_64__
-    a2 = static_cast<uint32_t>(xorval) == 0 ? a3 : a2;
-#else
+#ifdef __x86_64__
     asm("testl %k2, %k2\n\t"
         "cmovzq %1, %0\n\t"
         : "+r"(a2)
         : "r"(a3), "r"(xorval)
         : "cc");
+#elif defined(__aarch64__)
+    asm("cmp %w[xorval], 0\n\t"
+        "csel %x[a2], %[a3], %[a2], eq\n\t"
+        : [a2] "+r" (a2)
+        : [a3] "r" (a3) , [xorval] "r" (xorval)
+        : "cc");
+#else
+    a2 = static_cast<uint32_t>(xorval) == 0 ? a3 : a2;
 #endif
     *data = a2 >> (shift & (3 * 8));
     matched += matched_bytes;
79 changes: 65 additions & 14 deletions snappy.cc
@@ -106,6 +106,57 @@ using internal::V128_StoreU;
 using internal::V128_DupChar;
 #endif
 
+// GCC dispatches to libc for memmoves > 16 bytes, so we need to
+// do some work to get good code from that compiler. Clang handles
+// powers-of-2 at least up to 64 well.
+#if !defined(__GNUC__) || defined(__clang__)
+template <size_t SIZE>
+SNAPPY_ATTRIBUTE_ALWAYS_INLINE
+inline void FixedSizeMemMove(void* dest, const void* src) {
+  memmove(dest, src, SIZE);
+}
+#else
+
+template <size_t SIZE>
+SNAPPY_ATTRIBUTE_ALWAYS_INLINE
+inline void FixedSizeMemMove(void* dest, const void* src) {
+  if (SIZE <= 16) {
+    // gcc has patterns for memmove up to 16 bytes
+    memmove(dest, src, SIZE);
+  } else {
+    // This generates reasonable code on x86_64, but on aarch64 this produces a
+    // dead store to tmp, plus takes up stack space.
+    char tmp[SIZE];
+    memcpy(tmp, src, SIZE);
+    memcpy(dest, tmp, SIZE);
+  }
+}
+
+#ifdef __aarch64__ // Implies neon support
+template <>
+SNAPPY_ATTRIBUTE_ALWAYS_INLINE
+inline void FixedSizeMemMove<32>(void* dest, const void* src) {
+  V128 a = V128_LoadU(reinterpret_cast<const V128*>(src));
+  V128 b = V128_LoadU(reinterpret_cast<const V128*>(src) + 1);
+  V128_StoreU(reinterpret_cast<V128*>(dest), a);
+  V128_StoreU(reinterpret_cast<V128*>(dest) + 1, b);
+}
+
+template <>
+SNAPPY_ATTRIBUTE_ALWAYS_INLINE
+inline void FixedSizeMemMove<64>(void* dest, const void* src) {
+  V128 a = V128_LoadU(reinterpret_cast<const V128*>(src));
+  V128 b = V128_LoadU(reinterpret_cast<const V128*>(src) + 1);
+  V128 c = V128_LoadU(reinterpret_cast<const V128*>(src) + 2);
+  V128 d = V128_LoadU(reinterpret_cast<const V128*>(src) + 3);
+  V128_StoreU(reinterpret_cast<V128*>(dest), a);
+  V128_StoreU(reinterpret_cast<V128*>(dest) + 1, b);
+  V128_StoreU(reinterpret_cast<V128*>(dest) + 2, c);
+  V128_StoreU(reinterpret_cast<V128*>(dest) + 3, d);
+}
+#endif
+#endif
+
 // We translate the information encoded in a tag through a lookup table to a
 // format that requires fewer instructions to decode. Effectively we store
 // the length minus the tag part of the offset. The lowest significant byte
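As an aside to the comment that opens this block: besides codegen, the reason a plain fixed-size memcpy would not do is that these helpers replace std::memmove calls further down, so they must tolerate overlapping source and destination. A minimal standalone sketch of that property, using a hypothetical FixedSizeMemMoveSketch name and the same staging-buffer trick as the gcc fallback in the patch (not the patch's code):

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>

// Staging through a local buffer reads everything before writing anything,
// which gives memmove semantics even though only memcpy is used.
template <std::size_t SIZE>
inline void FixedSizeMemMoveSketch(void* dest, const void* src) {
  char tmp[SIZE];
  std::memcpy(tmp, src, SIZE);
  std::memcpy(dest, tmp, SIZE);
}

int main() {
  // Overlapping ranges: copy 32 bytes from buf to buf + 8.
  uint8_t buf[64];
  for (int i = 0; i < 64; ++i) buf[i] = static_cast<uint8_t>(i);
  FixedSizeMemMoveSketch<32>(buf + 8, buf);
  assert(buf[8] == 0 && buf[39] == 31);  // matches what std::memmove would produce
  return 0;
}

The V128 specializations above get the same guarantee a different way: all vector loads are issued before the first store.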
@@ -1066,13 +1117,18 @@ void MemCopy64(char* dst, const void* src, size_t size) {
     data = _mm256_lddqu_si256(static_cast<const __m256i *>(src) + 1);
     _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst) + 1, data);
   }
+#elif defined(__aarch64__)
+  // Empirically it is faster to just copy all 64 rather than branching.
+  (void)kShortMemCopy;
+  (void)size;
+  FixedSizeMemMove<64>(dst, src);
 #else
-  std::memmove(dst, src, kShortMemCopy);
+  FixedSizeMemMove<kShortMemCopy>(dst, src);
   // Profiling shows that nearly all copies are short.
   if (SNAPPY_PREDICT_FALSE(size > kShortMemCopy)) {
-    std::memmove(dst + kShortMemCopy,
-                 static_cast<const uint8_t*>(src) + kShortMemCopy,
-                 64 - kShortMemCopy);
+    FixedSizeMemMove<kShortMemCopy>(
+        dst + kShortMemCopy,
+        static_cast<const uint8_t*>(src) + kShortMemCopy);
   }
 #endif
 }

Review thread on the FixedSizeMemMove<kShortMemCopy> call above:

[Q] We've already copied kShortMemCopy bytes on line 1126. This code used to copy the remaining bytes if size > kShortMemCopy by copying 64 - kShortMemCopy bytes. The new code, however, always copies kShortMemCopy, assuming 2 * kShortMemCopy == 64, right? If so, do we need to have a static assertion verifying this?

Reply: It is defined as 32 on line 1100, 30 lines up. Adding a static assert that 32*2 == 64 seems silly. Also the original AVX impl was relying on this as well.
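For readers following the thread above, the assertion being discussed would be a one-liner. A minimal sketch, assuming kShortMemCopy keeps the value of 32 cited in the reply; the placement shown is hypothetical and not part of the patch:

// Mirrors the constant defined earlier in MemCopy64 (value cited in the reply above).
constexpr size_t kShortMemCopy = 32;
// Documents the assumption that two kShortMemCopy-sized copies cover all 64 bytes.
static_assert(2 * kShortMemCopy == 64,
              "MemCopy64 covers 64 bytes with two kShortMemCopy-sized copies");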
@@ -1108,14 +1164,9 @@ inline size_t AdvanceToNextTagARMOptimized(const uint8_t** ip_p, size_t* tag) {
   // instruction (csinc) and it removes several register moves.
   const size_t tag_type = *tag & 3;
   const bool is_literal = (tag_type == 0);
-  if (is_literal) {
-    size_t next_literal_tag = (*tag >> 2) + 1;
-    *tag = ip[next_literal_tag];
-    ip += next_literal_tag + 1;
-  } else {
-    *tag = ip[tag_type];
-    ip += tag_type + 1;
-  }
+  const size_t next_tag = is_literal ? (*tag >> 2) + 1 : tag_type;
+  *tag = ip[next_tag];
+  ip += (next_tag) + 1;
   return tag_type;
 }

@@ -2014,7 +2065,7 @@ class SnappyArrayWriter {
       *op_p = IncrementalCopy(op - offset, op, op_end, op_limit_);
       return true;
     }
-    std::memmove(op, op - offset, kSlopBytes);
+    FixedSizeMemMove<kSlopBytes>(op, op - offset);
     *op_p = op_end;
     return true;
   }
@@ -2266,7 +2317,7 @@ class SnappyScatteredWriter {
     }
     // Fast path
     char* const op_end = op + len;
-    std::memmove(op, op - offset, kSlopBytes);
+    FixedSizeMemMove<kSlopBytes>(op, op - offset);
     *op_p = op_end;
     return true;
   }