
Commit 80c9300

Merge pull request #2 from mongodb-forks/mathias-perf-tuning
Mathias's Patch: Perf tuning for gcc + aarch64
2 parents 80f6085 + 90b7d1b commit 80c9300

File tree

2 files changed: +83 -20 lines


snappy-internal.h

+18 -6
@@ -231,9 +231,7 @@ static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
     int shift = Bits::FindLSBSetNonZero64(xorval);
     size_t matched_bytes = shift >> 3;
     uint64_t a3 = UNALIGNED_LOAD64(s2 + 4);
-#ifndef __x86_64__
-    a2 = static_cast<uint32_t>(xorval) == 0 ? a3 : a2;
-#else
+#ifdef __x86_64__
     // Ideally this would just be
     //
     // a2 = static_cast<uint32_t>(xorval) == 0 ? a3 : a2;
@@ -248,6 +246,14 @@ static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
         : "+r"(a2)
         : "r"(a3), "r"(xorval)
         : "cc");
+#elif defined(__aarch64__)
+    asm("cmp %w[xorval], 0\n\t"
+        "csel %x[a2], %[a3], %[a2], eq\n\t"
+        : [a2] "+r"(a2)
+        : [a3] "r"(a3), [xorval] "r"(xorval)
+        : "cc");
+#else
+    a2 = static_cast<uint32_t>(xorval) == 0 ? a3 : a2;
 #endif
     *data = a2 >> (shift & (3 * 8));
     return std::pair<size_t, bool>(matched_bytes, true);
@@ -272,14 +278,20 @@ static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
     int shift = Bits::FindLSBSetNonZero64(xorval);
     size_t matched_bytes = shift >> 3;
     uint64_t a3 = UNALIGNED_LOAD64(s2 + 4);
-#ifndef __x86_64__
-    a2 = static_cast<uint32_t>(xorval) == 0 ? a3 : a2;
-#else
+#ifdef __x86_64__
     asm("testl %k2, %k2\n\t"
         "cmovzq %1, %0\n\t"
         : "+r"(a2)
        : "r"(a3), "r"(xorval)
        : "cc");
+#elif defined(__aarch64__)
+    asm("cmp %w[xorval], 0\n\t"
+        "csel %x[a2], %[a3], %[a2], eq\n\t"
+        : [a2] "+r"(a2)
+        : [a3] "r"(a3), [xorval] "r"(xorval)
+        : "cc");
+#else
+    a2 = static_cast<uint32_t>(xorval) == 0 ? a3 : a2;
 #endif
     *data = a2 >> (shift & (3 * 8));
     matched += matched_bytes;
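Why the asm instead of the ternary it replaces: compilers will sometimes lower the ternary to a branch or to extra register moves, while AArch64's csel performs the select in a single predicated instruction. Below is a minimal standalone sketch of the same conditional-select idiom, with a portable fallback; the function name and test values are invented for illustration and are not part of the patch.

#include <cinttypes>
#include <cstdint>
#include <cstdio>

// Branch-free select: returns a3 when the low 32 bits of xorval are zero,
// otherwise a2. Mirrors the asm added in the diff above.
uint64_t SelectIfLow32Zero(uint64_t a2, uint64_t a3, uint64_t xorval) {
#if defined(__aarch64__)
  asm("cmp %w[xorval], 0\n\t"              // compare only the low 32 bits
      "csel %x[a2], %[a3], %[a2], eq\n\t"  // a2 = (low 32 bits == 0) ? a3 : a2
      : [a2] "+r"(a2)
      : [a3] "r"(a3), [xorval] "r"(xorval)
      : "cc");
  return a2;
#else
  // Portable equivalent; the asm pins the compiler to a single csel.
  return static_cast<uint32_t>(xorval) == 0 ? a3 : a2;
#endif
}

int main() {
  // The low 32 bits of 0xF00000000 are zero, so this selects a3 and prints 2.
  std::printf("%" PRIu64 "\n", SelectIfLow32Zero(1, 2, 0xF00000000ULL));
}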

snappy.cc

+65 -14
@@ -119,6 +119,57 @@ using internal::V128_StoreU;
 using internal::V128_DupChar;
 #endif
 
+// GCC dispatches to libc for memmoves > 16 bytes, so we need to
+// do some work to get good code from that compiler. Clang handles
+// powers-of-2 at least up to 64 well.
+#if !defined(__GNUC__) || defined(__clang__)
+template <size_t SIZE>
+SNAPPY_ATTRIBUTE_ALWAYS_INLINE
+inline void FixedSizeMemMove(void* dest, const void* src) {
+  memmove(dest, src, SIZE);
+}
+#else
+
+template <size_t SIZE>
+SNAPPY_ATTRIBUTE_ALWAYS_INLINE
+inline void FixedSizeMemMove(void* dest, const void* src) {
+  if (SIZE <= 16) {
+    // gcc has patterns for memmove up to 16 bytes
+    memmove(dest, src, SIZE);
+  } else {
+    // This generates reasonable code on x86_64, but on aarch64 this produces a
+    // dead store to tmp, plus takes up stack space.
+    char tmp[SIZE];
+    memcpy(tmp, src, SIZE);
+    memcpy(dest, tmp, SIZE);
+  }
+}
+
+#ifdef __aarch64__  // Implies neon support
+template <>
+SNAPPY_ATTRIBUTE_ALWAYS_INLINE
+inline void FixedSizeMemMove<32>(void* dest, const void* src) {
+  V128 a = V128_LoadU(reinterpret_cast<const V128*>(src));
+  V128 b = V128_LoadU(reinterpret_cast<const V128*>(src) + 1);
+  V128_StoreU(reinterpret_cast<V128*>(dest), a);
+  V128_StoreU(reinterpret_cast<V128*>(dest) + 1, b);
+}
+
+template <>
+SNAPPY_ATTRIBUTE_ALWAYS_INLINE
+inline void FixedSizeMemMove<64>(void* dest, const void* src) {
+  V128 a = V128_LoadU(reinterpret_cast<const V128*>(src));
+  V128 b = V128_LoadU(reinterpret_cast<const V128*>(src) + 1);
+  V128 c = V128_LoadU(reinterpret_cast<const V128*>(src) + 2);
+  V128 d = V128_LoadU(reinterpret_cast<const V128*>(src) + 3);
+  V128_StoreU(reinterpret_cast<V128*>(dest), a);
+  V128_StoreU(reinterpret_cast<V128*>(dest) + 1, b);
+  V128_StoreU(reinterpret_cast<V128*>(dest) + 2, c);
+  V128_StoreU(reinterpret_cast<V128*>(dest) + 3, d);
+}
+#endif
+#endif
+
 // We translate the information encoded in a tag through a lookup table to a
 // format that requires fewer instructions to decode. Effectively we store
 // the length minus the tag part of the offset. The lowest significant byte
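The interesting case in FixedSizeMemMove is the else branch: bouncing the bytes through a local buffer turns one memmove into two fixed-size memcpys, which GCC expands inline rather than calling into libc, and the read-all-then-write-all order keeps overlapping ranges correct. A minimal sketch of that trick, with names invented for illustration:

#include <cstddef>
#include <cstdio>
#include <cstring>

// Overlap-safe fixed-size move built from two memcpys: everything is read
// into tmp before anything is written to dest.
template <size_t SIZE>
inline void FixedSizeMemMoveSketch(void* dest, const void* src) {
  char tmp[SIZE];
  std::memcpy(tmp, src, SIZE);   // read the whole source first...
  std::memcpy(dest, tmp, SIZE);  // ...then write, so overlap cannot corrupt it
}

int main() {
  char buf[] = "abcdefgh";
  FixedSizeMemMoveSketch<4>(buf + 2, buf);  // overlapping ranges
  std::printf("%s\n", buf);                 // prints "ababcdgh"
}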
@@ -1079,13 +1130,18 @@ void MemCopy64(char* dst, const void* src, size_t size) {
     data = _mm256_lddqu_si256(static_cast<const __m256i *>(src) + 1);
     _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst) + 1, data);
   }
+#elif defined(__aarch64__)
+  // Empirically it is faster to just copy all 64 rather than branching.
+  (void)kShortMemCopy;
+  (void)size;
+  FixedSizeMemMove<64>(dst, src);
 #else
-  std::memmove(dst, src, kShortMemCopy);
+  FixedSizeMemMove<kShortMemCopy>(dst, src);
   // Profiling shows that nearly all copies are short.
   if (SNAPPY_PREDICT_FALSE(size > kShortMemCopy)) {
-    std::memmove(dst + kShortMemCopy,
-                 static_cast<const uint8_t*>(src) + kShortMemCopy,
-                 64 - kShortMemCopy);
+    FixedSizeMemMove<kShortMemCopy>(
+        dst + kShortMemCopy,
+        static_cast<const uint8_t*>(src) + kShortMemCopy);
   }
 #endif
 }
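The new aarch64 branch of MemCopy64 relies on the function's contract: size never exceeds 64 and the destination has slop, i.e. at least 64 writable bytes, so copying a fixed 64 bytes unconditionally is legal and removes a data-dependent branch at the cost of a few possibly redundant stores. A simplified sketch assuming that same contract (this is not snappy's actual code):

#include <cstddef>
#include <cstring>

// Assumed precondition, as in snappy's MemCopy64: size <= 64, and at least
// 64 bytes are readable at src and writable at dst. The bytes written past
// `size` land in reserved slop, so the size check can be skipped entirely.
inline void MemCopy64Sketch(char* dst, const void* src, size_t size) {
  (void)size;  // unused on this path, mirroring the (void) casts above
  std::memmove(dst, src, 64);
}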
@@ -1121,14 +1177,9 @@ inline size_t AdvanceToNextTagARMOptimized(const uint8_t** ip_p, size_t* tag) {
   // instruction (csinc) and it removes several register moves.
   const size_t tag_type = *tag & 3;
   const bool is_literal = (tag_type == 0);
-  if (is_literal) {
-    size_t next_literal_tag = (*tag >> 2) + 1;
-    *tag = ip[next_literal_tag];
-    ip += next_literal_tag + 1;
-  } else {
-    *tag = ip[tag_type];
-    ip += tag_type + 1;
-  }
+  const size_t next_tag = is_literal ? (*tag >> 2) + 1 : tag_type;
+  *tag = ip[next_tag];
+  ip += next_tag + 1;
   return tag_type;
 }
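The AdvanceToNextTagARMOptimized rewrite works because the two branches only differed in how far to skip: literals skip their payload of (*tag >> 2) + 1 bytes, copies skip tag_type extra offset bytes. Collapsing that distance into one ternary lets the compiler emit a conditional increment (csinc) instead of a branch plus register moves, as the comment above it notes. A self-contained sketch of the merged logic; the stream below is a hypothetical example using snappy's tag encoding (the low two bits are the element type, and a literal's upper six bits hold length - 1 for short literals):

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Branchless tag advance, mirroring the diff: ip points just past the
// current tag; next_tag is the distance from ip to the next tag byte.
size_t AdvanceToNextTagSketch(const uint8_t** ip_p, size_t* tag) {
  const uint8_t* ip = *ip_p;
  const size_t tag_type = *tag & 3;
  const bool is_literal = (tag_type == 0);
  const size_t next_tag = is_literal ? (*tag >> 2) + 1 : tag_type;
  *tag = ip[next_tag];  // prefetch the next tag byte
  ip += next_tag + 1;
  *ip_p = ip;
  return tag_type;
}

int main() {
  // A 4-byte literal (length - 1 = 3, so tag = 3 << 2 = 0x0C), its four
  // payload bytes, then the next tag byte (0x01, a copy tag).
  const uint8_t stream[] = {0x0C, 'a', 'b', 'c', 'd', 0x01};
  const uint8_t* ip = stream + 1;  // just past the current tag
  size_t tag = stream[0];
  size_t type = AdvanceToNextTagSketch(&ip, &tag);
  std::printf("type=%zu next=0x%02zx\n", type, tag);  // type=0 next=0x01
}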

@@ -2027,7 +2078,7 @@ class SnappyArrayWriter {
       *op_p = IncrementalCopy(op - offset, op, op_end, op_limit_);
       return true;
     }
-    std::memmove(op, op - offset, kSlopBytes);
+    FixedSizeMemMove<kSlopBytes>(op, op - offset);
     *op_p = op_end;
     return true;
   }
@@ -2279,7 +2330,7 @@ class SnappyScatteredWriter {
     }
     // Fast path
     char* const op_end = op + len;
-    std::memmove(op, op - offset, kSlopBytes);
+    FixedSizeMemMove<kSlopBytes>(op, op - offset);
     *op_p = op_end;
     return true;
   }
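Both writer changes are the same substitution: the slop copy's length (kSlopBytes, which is 64 in snappy) is a compile-time constant, so the std::memmove that GCC would emit as a libc call becomes a FixedSizeMemMove that can expand to inline loads and stores. A before/after sketch under that assumption, with the constant and helper redefined locally for illustration:

#include <cstddef>
#include <cstring>

constexpr size_t kSlopBytes = 64;  // matches snappy's slop size

// Local stand-in for the helper added earlier in this commit.
template <size_t SIZE>
inline void FixedSizeMemMove(void* dest, const void* src) {
  char tmp[SIZE];
  std::memcpy(tmp, src, SIZE);
  std::memcpy(dest, tmp, SIZE);
}

// Before: GCC emits a call to libc memmove for the 64-byte copy.
void SlopCopyBefore(char* op, size_t offset) {
  std::memmove(op, op - offset, kSlopBytes);
}

// After: the fixed-size helper inlines to straight loads and stores.
void SlopCopyAfter(char* op, size_t offset) {
  FixedSizeMemMove<kSlopBytes>(op, op - offset);
}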
