Commit 5ec5d16

Perf tuning for gcc + aarch64
1 parent 27f34a5 commit 5ec5d16

2 files changed: +83 -20 lines changed
snappy-internal.h

Lines changed: 18 additions & 6 deletions
@@ -233,9 +233,7 @@ static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
     int shift = Bits::FindLSBSetNonZero64(xorval);
     size_t matched_bytes = shift >> 3;
     uint64_t a3 = UNALIGNED_LOAD64(s2 + 4);
-#ifndef __x86_64__
-    a2 = static_cast<uint32_t>(xorval) == 0 ? a3 : a2;
-#else
+#ifdef __x86_64__
     // Ideally this would just be
     //
     // a2 = static_cast<uint32_t>(xorval) == 0 ? a3 : a2;
@@ -250,6 +248,14 @@ static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
         : "+r"(a2)
         : "r"(a3), "r"(xorval)
         : "cc");
+#elif defined(__aarch64__)
+    asm("cmp %w[xorval], 0\n\t"
+        "csel %x[a2], %[a3], %[a2], eq\n\t"
+        : [a2] "+r"(a2)
+        : [a3] "r"(a3), [xorval] "r"(xorval)
+        : "cc");
+#else
+    a2 = static_cast<uint32_t>(xorval) == 0 ? a3 : a2;
 #endif
     *data = a2 >> (shift & (3 * 8));
     return std::pair<size_t, bool>(matched_bytes, true);
@@ -276,14 +282,20 @@ static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
     int shift = Bits::FindLSBSetNonZero64(xorval);
     size_t matched_bytes = shift >> 3;
     uint64_t a3 = UNALIGNED_LOAD64(s2 + 4);
-#ifndef __x86_64__
-    a2 = static_cast<uint32_t>(xorval) == 0 ? a3 : a2;
-#else
+#ifdef __x86_64__
     asm("testl %k2, %k2\n\t"
         "cmovzq %1, %0\n\t"
         : "+r"(a2)
         : "r"(a3), "r"(xorval)
         : "cc");
+#elif defined(__aarch64__)
+    asm("cmp %w[xorval], 0\n\t"
+        "csel %x[a2], %[a3], %[a2], eq\n\t"
+        : [a2] "+r"(a2)
+        : [a3] "r"(a3), [xorval] "r"(xorval)
+        : "cc");
+#else
+    a2 = static_cast<uint32_t>(xorval) == 0 ? a3 : a2;
 #endif
     *data = a2 >> (shift & (3 * 8));
     matched += matched_bytes;
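
Note: both hunks above make the same change. The portable ternary
"a2 = static_cast<uint32_t>(xorval) == 0 ? a3 : a2;" is lowered to inline
asm on aarch64 so the compiler reliably emits a compare plus csel instead
of a branch, mirroring the existing cmov path on x86_64. A standalone
sketch of the pattern (the function name and the main() harness are ours,
not part of the patch):

#include <cstdint>
#include <cstdio>

uint64_t SelectOnLow32Zero(uint64_t a2, uint64_t a3, uint64_t xorval) {
#if defined(__aarch64__)
  // cmp tests the 32-bit view of xorval; csel picks a3 when it was zero.
  asm("cmp %w[xorval], 0\n\t"
      "csel %x[a2], %[a3], %[a2], eq\n\t"
      : [a2] "+r"(a2)
      : [a3] "r"(a3), [xorval] "r"(xorval)
      : "cc");
#else
  // Portable form; the compiler may or may not turn this into a select.
  a2 = static_cast<uint32_t>(xorval) == 0 ? a3 : a2;
#endif
  return a2;
}

int main() {
  // Low 32 bits of xorval are zero, so a3 (7) is selected.
  uint64_t r = SelectOnLow32Zero(3, 7, uint64_t{1} << 32);
  std::printf("%llu\n", static_cast<unsigned long long>(r));  // prints 7
}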

snappy.cc

Lines changed: 65 additions & 14 deletions
@@ -100,6 +100,57 @@ using internal::V128_StoreU;
 using internal::V128_DupChar;
 #endif
 
+// GCC dispatches to libc for memmoves > 16 bytes, so we need to
+// do some work to get good code from that compiler. Clang handles
+// powers-of-2 at least up to 64 well.
+#if !defined(__GNUC__) || defined(__clang__)
+template <size_t SIZE>
+SNAPPY_ATTRIBUTE_ALWAYS_INLINE
+inline void FixedSizeMemMove(void* dest, const void* src) {
+  memmove(dest, src, SIZE);
+}
+#else
+
+template <size_t SIZE>
+SNAPPY_ATTRIBUTE_ALWAYS_INLINE
+inline void FixedSizeMemMove(void* dest, const void* src) {
+  if (SIZE <= 16) {
+    // gcc has patterns for memmove up to 16 bytes
+    memmove(dest, src, SIZE);
+  } else {
+    // This generates reasonable code on x86_64, but on aarch64 this produces a
+    // dead store to tmp, plus takes up stack space.
+    char tmp[SIZE];
+    memcpy(tmp, src, SIZE);
+    memcpy(dest, tmp, SIZE);
+  }
+}
+
+#ifdef __aarch64__  // Implies NEON support
+template <>
+SNAPPY_ATTRIBUTE_ALWAYS_INLINE
+inline void FixedSizeMemMove<32>(void* dest, const void* src) {
+  V128 a = V128_LoadU(reinterpret_cast<const V128*>(src));
+  V128 b = V128_LoadU(reinterpret_cast<const V128*>(src) + 1);
+  V128_StoreU(reinterpret_cast<V128*>(dest), a);
+  V128_StoreU(reinterpret_cast<V128*>(dest) + 1, b);
+}
+
+template <>
+SNAPPY_ATTRIBUTE_ALWAYS_INLINE
+inline void FixedSizeMemMove<64>(void* dest, const void* src) {
+  V128 a = V128_LoadU(reinterpret_cast<const V128*>(src));
+  V128 b = V128_LoadU(reinterpret_cast<const V128*>(src) + 1);
+  V128 c = V128_LoadU(reinterpret_cast<const V128*>(src) + 2);
+  V128 d = V128_LoadU(reinterpret_cast<const V128*>(src) + 3);
+  V128_StoreU(reinterpret_cast<V128*>(dest), a);
+  V128_StoreU(reinterpret_cast<V128*>(dest) + 1, b);
+  V128_StoreU(reinterpret_cast<V128*>(dest) + 2, c);
+  V128_StoreU(reinterpret_cast<V128*>(dest) + 3, d);
+}
+#endif
+#endif
+
 // We translate the information encoded in a tag through a lookup table to a
 // format that requires fewer instructions to decode. Effectively we store
 // the length minus the tag part of the offset. The lowest significant byte
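
Note: the heart of the gcc workaround above is the copy-through-a-temporary
idiom in the SIZE > 16 branch. memcpy's operands may not overlap, so gcc can
expand both fixed-size copies inline, whereas a memmove of the same size
becomes a libc call; routing the bytes through tmp preserves memmove's
overlap semantics. A minimal standalone sketch of the idiom (names are ours,
not snappy's):

#include <cstddef>
#include <cstring>

// Move SIZE bytes between possibly-overlapping buffers without branching.
template <std::size_t SIZE>
inline void FixedMove(void* dest, const void* src) {
  char tmp[SIZE];
  std::memcpy(tmp, src, SIZE);   // src -> tmp: operands are disjoint
  std::memcpy(dest, tmp, SIZE);  // tmp -> dest: operands are disjoint
}

// Usage: shift 32 overlapping bytes left by one, which is legal for
// memmove but undefined for a direct memcpy.
void ShiftLeftOne(char* buf) { FixedMove<32>(buf, buf + 1); }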
@@ -1060,13 +1111,18 @@ void MemCopy64(char* dst, const void* src, size_t size) {
     data = _mm256_lddqu_si256(static_cast<const __m256i *>(src) + 1);
     _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst) + 1, data);
   }
+#elif defined(__aarch64__)
+  // Empirically it is faster to just copy all 64 rather than branching.
+  (void)kShortMemCopy;
+  (void)size;
+  FixedSizeMemMove<64>(dst, src);
 #else
-  std::memmove(dst, src, kShortMemCopy);
+  FixedSizeMemMove<kShortMemCopy>(dst, src);
   // Profiling shows that nearly all copies are short.
   if (SNAPPY_PREDICT_FALSE(size > kShortMemCopy)) {
-    std::memmove(dst + kShortMemCopy,
-                 static_cast<const uint8_t*>(src) + kShortMemCopy,
-                 64 - kShortMemCopy);
+    FixedSizeMemMove<kShortMemCopy>(
+        dst + kShortMemCopy,
+        static_cast<const uint8_t*>(src) + kShortMemCopy);
   }
 #endif
 }
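
Note: for contrast, the #else branch that aarch64 now bypasses copies
kShortMemCopy bytes and handles the rest behind a predicted-false branch;
the new path trades that branch for one unconditional 64-byte move. A rough
standalone sketch of the two shapes (names are ours; like MemCopy64, this
assumes 64 readable source bytes and 64 writable destination bytes):

#include <cstddef>
#include <cstring>

constexpr std::size_t kShort = 32;  // stand-in for kShortMemCopy

// Generic shape: short copy plus a rarely-taken branch for the tail.
void CopyUpTo64Branchy(char* dst, const char* src, std::size_t size) {
  std::memmove(dst, src, kShort);
  if (size > kShort) {  // profiling shows most copies are short
    std::memmove(dst + kShort, src + kShort, 64 - kShort);
  }
}

// New aarch64 shape: ignore size and always move all 64 bytes.
void CopyUpTo64Branchless(char* dst, const char* src, std::size_t /*size*/) {
  std::memmove(dst, src, 64);
}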
@@ -1102,14 +1158,9 @@ inline size_t AdvanceToNextTagARMOptimized(const uint8_t** ip_p, size_t* tag) {
   // instruction (csinc) and it removes several register moves.
   const size_t tag_type = *tag & 3;
   const bool is_literal = (tag_type == 0);
-  if (is_literal) {
-    size_t next_literal_tag = (*tag >> 2) + 1;
-    *tag = ip[next_literal_tag];
-    ip += next_literal_tag + 1;
-  } else {
-    *tag = ip[tag_type];
-    ip += tag_type + 1;
-  }
+  const size_t next_tag = is_literal ? (*tag >> 2) + 1 : tag_type;
+  *tag = ip[next_tag];
+  ip += next_tag + 1;
   return tag_type;
 }
 
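Note: the rewrite above folds the literal and copy cases into one ternary;
both arms end in the same load and pointer bump, so only the skip distance
differs, and the compiler can emit a conditional select (csel/csinc on
aarch64) instead of a branch. A self-contained restatement of the branchless
shape (the function name is ours):

#include <cstddef>
#include <cstdint>

inline std::size_t AdvanceToNextTagBranchless(const uint8_t** ip_p,
                                              std::size_t* tag) {
  const uint8_t* ip = *ip_p;
  const std::size_t tag_type = *tag & 3;  // 0 = literal, 1..3 = copy
  const bool is_literal = (tag_type == 0);
  // One select computes how far ahead the next tag byte sits.
  const std::size_t next_tag = is_literal ? (*tag >> 2) + 1 : tag_type;
  *tag = ip[next_tag];        // load the next tag byte
  *ip_p = ip + next_tag + 1;  // same advance as the old if/else branches
  return tag_type;
}
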
@@ -2013,7 +2064,7 @@ class SnappyArrayWriter {
       *op_p = IncrementalCopy(op - offset, op, op_end, op_limit_);
       return true;
     }
-    std::memmove(op, op - offset, kSlopBytes);
+    FixedSizeMemMove<kSlopBytes>(op, op - offset);
     *op_p = op_end;
     return true;
   }
@@ -2265,7 +2316,7 @@ class SnappyScatteredWriter {
     }
     // Fast path
    char* const op_end = op + len;
-    std::memmove(op, op - offset, kSlopBytes);
+    FixedSizeMemMove<kSlopBytes>(op, op - offset);
    *op_p = op_end;
    return true;
  }
