@@ -119,6 +119,57 @@ using internal::V128_StoreU;
 using internal::V128_DupChar;
 #endif
 
+// GCC dispatches to libc for memmoves > 16 bytes, so we need to
+// do some work to get good code from that compiler. Clang handles
+// powers-of-2 at least up to 64 well.
+#if !defined(__GNUC__) || defined(__clang__)
+template <size_t SIZE>
+SNAPPY_ATTRIBUTE_ALWAYS_INLINE
+inline void FixedSizeMemMove(void* dest, const void* src) {
+  memmove(dest, src, SIZE);
+}
+#else
+
+template <size_t SIZE>
+SNAPPY_ATTRIBUTE_ALWAYS_INLINE
+inline void FixedSizeMemMove(void* dest, const void* src) {
+  if (SIZE <= 16) {
+    // gcc has patterns for memmove up to 16 bytes
+    memmove(dest, src, SIZE);
+  } else {
+    // This generates reasonable code on x86_64, but on aarch64 this produces a
+    // dead store to tmp, plus takes up stack space.
+    char tmp[SIZE];
+    memcpy(tmp, src, SIZE);
+    memcpy(dest, tmp, SIZE);
+  }
+}
+
+#ifdef __aarch64__  // Implies neon support
+template <>
+SNAPPY_ATTRIBUTE_ALWAYS_INLINE
+inline void FixedSizeMemMove<32>(void* dest, const void* src) {
+  V128 a = V128_LoadU(reinterpret_cast<const V128*>(src));
+  V128 b = V128_LoadU(reinterpret_cast<const V128*>(src) + 1);
+  V128_StoreU(reinterpret_cast<V128*>(dest), a);
+  V128_StoreU(reinterpret_cast<V128*>(dest) + 1, b);
+}
+
+template <>
+SNAPPY_ATTRIBUTE_ALWAYS_INLINE
+inline void FixedSizeMemMove<64>(void* dest, const void* src) {
+  V128 a = V128_LoadU(reinterpret_cast<const V128*>(src));
+  V128 b = V128_LoadU(reinterpret_cast<const V128*>(src) + 1);
+  V128 c = V128_LoadU(reinterpret_cast<const V128*>(src) + 2);
+  V128 d = V128_LoadU(reinterpret_cast<const V128*>(src) + 3);
+  V128_StoreU(reinterpret_cast<V128*>(dest), a);
+  V128_StoreU(reinterpret_cast<V128*>(dest) + 1, b);
+  V128_StoreU(reinterpret_cast<V128*>(dest) + 2, c);
+  V128_StoreU(reinterpret_cast<V128*>(dest) + 3, d);
+}
+#endif
+#endif
+
 // We translate the information encoded in a tag through a lookup table to a
 // format that requires fewer instructions to decode. Effectively we store
 // the length minus the tag part of the offset. The lowest significant byte
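For reference, the overlap behaviour that the later kSlopBytes call sites depend on can be sanity-checked outside the tree. The sketch below is illustrative only, not part of the patch: FixedSizeMemMoveSketch restates just the generic GCC fallback (SNAPPY_ATTRIBUTE_ALWAYS_INLINE and the NEON specializations omitted, calls std::-qualified for self-containment) and is compared against std::memmove on a forward-overlapping 64-byte copy.

// Standalone sanity check (illustrative only, not from the patch).
#include <cassert>
#include <cstddef>
#include <cstring>

template <std::size_t SIZE>
inline void FixedSizeMemMoveSketch(void* dest, const void* src) {
  if (SIZE <= 16) {
    std::memmove(dest, src, SIZE);
  } else {
    char tmp[SIZE];  // bounce buffer keeps the copy correct when dest and src overlap
    std::memcpy(tmp, src, SIZE);
    std::memcpy(dest, tmp, SIZE);
  }
}

int main() {
  char expected[128];
  char actual[128];
  for (int i = 0; i < 128; ++i) expected[i] = actual[i] = static_cast<char>(i);
  // Forward-overlapping 64-byte move, dest 8 bytes ahead of src, as a small
  // offset in the decompressor's slop-byte copies would produce.
  std::memmove(expected + 8, expected, 64);
  FixedSizeMemMoveSketch<64>(actual + 8, actual);
  assert(std::memcmp(expected, actual, sizeof(expected)) == 0);
  return 0;
}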
@@ -1079,13 +1130,18 @@ void MemCopy64(char* dst, const void* src, size_t size) {
     data = _mm256_lddqu_si256(static_cast<const __m256i *>(src) + 1);
     _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst) + 1, data);
   }
+#elif defined(__aarch64__)
+  // Empirically it is faster to just copy all 64 rather than branching.
+  (void)kShortMemCopy;
+  (void)size;
+  FixedSizeMemMove<64>(dst, src);
 #else
-  std::memmove(dst, src, kShortMemCopy);
+  FixedSizeMemMove<kShortMemCopy>(dst, src);
   // Profiling shows that nearly all copies are short.
   if (SNAPPY_PREDICT_FALSE(size > kShortMemCopy)) {
-    std::memmove(dst + kShortMemCopy,
-                 static_cast<const uint8_t*>(src) + kShortMemCopy,
-                 64 - kShortMemCopy);
+    FixedSizeMemMove<kShortMemCopy>(
+        dst + kShortMemCopy,
+        static_cast<const uint8_t*>(src) + kShortMemCopy);
   }
 #endif
 }
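The unconditional 64-byte copy in the new aarch64 branch leans on MemCopy64's existing calling contract rather than anything introduced here. The toy sketch below is an assumption-laden illustration (CopyUpTo64, kPadding, and the buffer sizes are invented names, not snappy's): size never exceeds 64, and both buffers carry enough headroom that a full 64-byte copy stays in bounds even when size is smaller.

// Toy illustration (not snappy code): why copying a fixed 64 bytes is safe.
#include <cassert>
#include <cstddef>
#include <cstring>

constexpr std::size_t kPadding = 64;  // assumed headroom, in the spirit of snappy's slop bytes

// Stand-in for the aarch64 path: always moves 64 bytes, ignores `size`.
void CopyUpTo64(char* dst, const char* src, std::size_t size) {
  assert(size <= 64);          // caller's contract
  std::memmove(dst, src, 64);  // copies `size` useful bytes plus padding
  (void)size;
}

int main() {
  char src[16 + kPadding] = "payload";  // 16 meaningful bytes + headroom
  char dst[16 + kPadding] = {};
  CopyUpTo64(dst, src, 8);              // only 8 bytes matter; 64 are copied
  assert(std::memcmp(dst, src, 8) == 0);
  return 0;
}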
@@ -1121,14 +1177,9 @@ inline size_t AdvanceToNextTagARMOptimized(const uint8_t** ip_p, size_t* tag) {
   // instruction (csinc) and it removes several register moves.
   const size_t tag_type = *tag & 3;
   const bool is_literal = (tag_type == 0);
-  if (is_literal) {
-    size_t next_literal_tag = (*tag >> 2) + 1;
-    *tag = ip[next_literal_tag];
-    ip += next_literal_tag + 1;
-  } else {
-    *tag = ip[tag_type];
-    ip += tag_type + 1;
-  }
+  const size_t next_tag = is_literal ? (*tag >> 2) + 1 : tag_type;
+  *tag = ip[next_tag];
+  ip += next_tag + 1;
   return tag_type;
 }
 
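The ternary rewrite is intended to be behaviour-preserving; one way to check is to run both forms over every possible tag byte. The sketch below is standalone and not from the tree: only the branch logic is taken from the removed and added lines, while the harness plumbing (explicit pointer write-back, buffer setup) is paraphrased.

// Exhaustive equivalence check for the branchy vs. branchless tag advance.
#include <cassert>
#include <cstddef>
#include <cstdint>

// Branchy variant (logic from the removed lines).
std::size_t AdvanceOld(const std::uint8_t** ip_p, std::size_t* tag) {
  const std::uint8_t* ip = *ip_p;
  const std::size_t tag_type = *tag & 3;
  const bool is_literal = (tag_type == 0);
  if (is_literal) {
    std::size_t next_literal_tag = (*tag >> 2) + 1;
    *tag = ip[next_literal_tag];
    ip += next_literal_tag + 1;
  } else {
    *tag = ip[tag_type];
    ip += tag_type + 1;
  }
  *ip_p = ip;
  return tag_type;
}

// Branchless variant (logic from the added lines).
std::size_t AdvanceNew(const std::uint8_t** ip_p, std::size_t* tag) {
  const std::uint8_t* ip = *ip_p;
  const std::size_t tag_type = *tag & 3;
  const bool is_literal = (tag_type == 0);
  const std::size_t next_tag = is_literal ? (*tag >> 2) + 1 : tag_type;
  *tag = ip[next_tag];
  ip += next_tag + 1;
  *ip_p = ip;
  return tag_type;
}

int main() {
  // 80 bytes is enough: a literal tag can look ahead by at most (255 >> 2) + 1 = 64.
  std::uint8_t buf[80];
  for (std::size_t i = 0; i < sizeof(buf); ++i) buf[i] = static_cast<std::uint8_t>(i * 7 + 3);
  for (std::size_t t = 0; t < 256; ++t) {
    const std::uint8_t* ip_old = buf;
    const std::uint8_t* ip_new = buf;
    std::size_t tag_old = t;
    std::size_t tag_new = t;
    const std::size_t r_old = AdvanceOld(&ip_old, &tag_old);
    const std::size_t r_new = AdvanceNew(&ip_new, &tag_new);
    assert(r_old == r_new && tag_old == tag_new && ip_old == ip_new);
  }
  return 0;
}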
@@ -2027,7 +2078,7 @@ class SnappyArrayWriter {
       *op_p = IncrementalCopy(op - offset, op, op_end, op_limit_);
       return true;
     }
-    std::memmove(op, op - offset, kSlopBytes);
+    FixedSizeMemMove<kSlopBytes>(op, op - offset);
     *op_p = op_end;
     return true;
   }
@@ -2279,7 +2330,7 @@ class SnappyScatteredWriter {
     }
     // Fast path
     char* const op_end = op + len;
-    std::memmove(op, op - offset, kSlopBytes);
+    FixedSizeMemMove<kSlopBytes>(op, op - offset);
     *op_p = op_end;
     return true;
   }