@@ -100,6 +100,57 @@ using internal::V128_StoreU;
 using internal::V128_DupChar;
 #endif
 
+// GCC dispatches to libc for memmoves > 16 bytes, so we need to
+// do some work to get good code from that compiler. Clang handles
+// powers-of-2 at least up to 64 well.
+#if !defined(__GNUC__) || defined(__clang__)
+template <size_t SIZE>
+SNAPPY_ATTRIBUTE_ALWAYS_INLINE
+inline void FixedSizeMemMove(void* dest, const void* src) {
+  memmove(dest, src, SIZE);
+}
+#else
+
+template <size_t SIZE>
+SNAPPY_ATTRIBUTE_ALWAYS_INLINE
+inline void FixedSizeMemMove(void* dest, const void* src) {
+  if (SIZE <= 16) {
+    // gcc has patterns for memmove up to 16 bytes
+    memmove(dest, src, SIZE);
+  } else {
+    // This generates reasonable code on x86_64, but on aarch64 this produces a
+    // dead store to tmp, plus takes up stack space.
+    char tmp[SIZE];
+    memcpy(tmp, src, SIZE);
+    memcpy(dest, tmp, SIZE);
+  }
+}
+
+#ifdef __aarch64__  // Implies neon support
+template <>
+SNAPPY_ATTRIBUTE_ALWAYS_INLINE
+inline void FixedSizeMemMove<32>(void* dest, const void* src) {
+  V128 a = V128_LoadU(reinterpret_cast<const V128*>(src));
+  V128 b = V128_LoadU(reinterpret_cast<const V128*>(src) + 1);
+  V128_StoreU(reinterpret_cast<V128*>(dest), a);
+  V128_StoreU(reinterpret_cast<V128*>(dest) + 1, b);
+}
+
+template <>
+SNAPPY_ATTRIBUTE_ALWAYS_INLINE
+inline void FixedSizeMemMove<64>(void* dest, const void* src) {
+  V128 a = V128_LoadU(reinterpret_cast<const V128*>(src));
+  V128 b = V128_LoadU(reinterpret_cast<const V128*>(src) + 1);
+  V128 c = V128_LoadU(reinterpret_cast<const V128*>(src) + 2);
+  V128 d = V128_LoadU(reinterpret_cast<const V128*>(src) + 3);
+  V128_StoreU(reinterpret_cast<V128*>(dest), a);
+  V128_StoreU(reinterpret_cast<V128*>(dest) + 1, b);
+  V128_StoreU(reinterpret_cast<V128*>(dest) + 2, c);
+  V128_StoreU(reinterpret_cast<V128*>(dest) + 3, d);
+}
+#endif
+#endif
+
 // We translate the information encoded in a tag through a lookup table to a
 // format that requires fewer instructions to decode. Effectively we store
 // the length minus the tag part of the offset. The lowest significant byte
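
Note on the block above: the template turns the copy size into a compile-time constant so the compiler can expand the whole move inline, and the GCC-only branch routes sizes above 16 bytes through a stack temporary because GCC would otherwise call the libc memmove. A rough standalone sketch of that temporary-buffer trick, with snappy's SNAPPY_ATTRIBUTE_ALWAYS_INLINE macro and V128 helpers dropped and purely illustrative names, not part of the patch:

    #include <cstddef>
    #include <cstring>

    // Compile-time size: the compiler can fully inline and unroll the copy.
    template <std::size_t SIZE>
    inline void FixedSizeMemMoveSketch(void* dest, const void* src) {
      char tmp[SIZE];
      std::memcpy(tmp, src, SIZE);   // read everything first...
      std::memcpy(dest, tmp, SIZE);  // ...then write, so overlap stays well defined
    }

    int main() {
      char buf[80] = "0123456789abcdef0123456789abcdef0123456789abcdef";
      // An overlapping 32-byte move, the case std::memmove would normally cover.
      FixedSizeMemMoveSketch<32>(buf + 8, buf);
      return 0;
    }
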
@@ -1060,13 +1111,18 @@ void MemCopy64(char* dst, const void* src, size_t size) {
     data = _mm256_lddqu_si256(static_cast<const __m256i *>(src) + 1);
     _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst) + 1, data);
   }
+#elif defined(__aarch64__)
+  // Empirically it is faster to just copy all 64 rather than branching.
+  (void)kShortMemCopy;
+  (void)size;
+  FixedSizeMemMove<64>(dst, src);
 #else
-  std::memmove(dst, src, kShortMemCopy);
+  FixedSizeMemMove<kShortMemCopy>(dst, src);
   // Profiling shows that nearly all copies are short.
   if (SNAPPY_PREDICT_FALSE(size > kShortMemCopy)) {
-    std::memmove(dst + kShortMemCopy,
-                 static_cast<const uint8_t*>(src) + kShortMemCopy,
-                 64 - kShortMemCopy);
+    FixedSizeMemMove<kShortMemCopy>(
+        dst + kShortMemCopy,
+        static_cast<const uint8_t*>(src) + kShortMemCopy);
   }
 #endif
 }
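
Note on MemCopy64: on the new aarch64 path the size check is dropped and the full 64 bytes are always moved, while the other paths keep branching on the short-copy threshold; the (void) casts only keep the otherwise unused names from triggering warnings on that path. A hedged sketch of the two strategies, assuming a 32-byte threshold like kShortMemCopy in the surrounding code and that both pointers are valid for a full 64-byte access (illustrative helpers, not the snappy API):

    #include <cstddef>
    #include <cstring>

    // Branching version: cheap in the profile-typical short case,
    // mispredict cost on the rare long copies.
    inline void CopyAtMost64Branchy(char* dst, const char* src, std::size_t size) {
      std::memmove(dst, src, 32);                           // typical short copy
      if (size > 32) std::memmove(dst + 32, src + 32, 32);  // rare long tail
    }

    // Branchless version, as on aarch64: always move all 64 bytes.
    inline void CopyAtMost64Branchless(char* dst, const char* src, std::size_t size) {
      (void)size;  // size is ignored on purpose
      std::memmove(dst, src, 64);
    }
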
@@ -1102,14 +1158,9 @@ inline size_t AdvanceToNextTagARMOptimized(const uint8_t** ip_p, size_t* tag) {
   // instruction (csinc) and it removes several register moves.
   const size_t tag_type = *tag & 3;
   const bool is_literal = (tag_type == 0);
-  if (is_literal) {
-    size_t next_literal_tag = (*tag >> 2) + 1;
-    *tag = ip[next_literal_tag];
-    ip += next_literal_tag + 1;
-  } else {
-    *tag = ip[tag_type];
-    ip += tag_type + 1;
-  }
+  const size_t next_tag = is_literal ? (*tag >> 2) + 1 : tag_type;
+  *tag = ip[next_tag];
+  ip += next_tag + 1;
   return tag_type;
 }
 
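
Note on AdvanceToNextTagARMOptimized: replacing the if/else with a ternary lets the compiler emit a conditional select (csel/csinc on AArch64) instead of a branch, since both operands are cheap to compute. A minimal illustration of the pattern in isolation, with stand-in names rather than the real decompressor state:

    #include <cstddef>

    std::size_t NextTagAdvance(std::size_t tag) {
      const std::size_t tag_type = tag & 3;
      const bool is_literal = (tag_type == 0);
      // Both arms are side-effect free and cheap, so a conditional select is
      // profitable compared to a branch the predictor may miss.
      const std::size_t next_tag = is_literal ? (tag >> 2) + 1 : tag_type;
      return next_tag + 1;  // how far ip advances in the real code
    }
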
@@ -2013,7 +2064,7 @@ class SnappyArrayWriter {
       *op_p = IncrementalCopy(op - offset, op, op_end, op_limit_);
       return true;
     }
-    std::memmove(op, op - offset, kSlopBytes);
+    FixedSizeMemMove<kSlopBytes>(op, op - offset);
     *op_p = op_end;
     return true;
   }
@@ -2265,7 +2316,7 @@ class SnappyScatteredWriter {
     }
     // Fast path
     char* const op_end = op + len;
-    std::memmove(op, op - offset, kSlopBytes);
+    FixedSizeMemMove<kSlopBytes>(op, op - offset);
     *op_p = op_end;
     return true;
   }
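
Note on the two kSlopBytes call sites above: with a small copy offset the fixed-size slop copy can overlap its own source, so the replacement must stay well defined on overlap just as std::memmove is; the NEON specializations satisfy this by loading every vector before the first store. A standalone sketch of that ordering, assuming kSlopBytes is the 64-byte slop used elsewhere in the file (illustrative name, not part of the patch):

    #include <cstddef>
    #include <cstring>

    // Effectively FixedSizeMemMove<64>(op, op - offset), written with an
    // explicit temporary to make the read-all-then-write-all order visible.
    void OverlappingSlopCopySketch(char* op, std::size_t offset) {
      char tmp[64];
      std::memcpy(tmp, op - offset, 64);  // read the whole 64-byte window first
      std::memcpy(op, tmp, 64);           // then write it, as std::memmove would
    }
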