|
1 | 1 | #include "config.h" |
2 | 2 | #include "types.h" |
3 | 3 |
|
| 4 | +#if (defined(__AVX512F__) && defined(__AVX512DQ__)) || defined(__AVX2__) |
| 5 | +# include <immintrin.h> |
| 6 | +#endif |
4 | 7 |
|
5 | 8 | static inline u64 classify_word(u64 word) { |
6 | 9 |
|
@@ -94,3 +97,112 @@ static inline void discover_word(u8* ret, u64* current, u64* virgin) { |
94 | 97 | } |
95 | 98 |
|
96 | 99 | } |
| 100 | + |
| 101 | + |
| 102 | +#if defined(__AVX512F__) && defined(__AVX512DQ__) |
| 103 | +#define PACK_SIZE 64 |
| 104 | +static inline const u64* skim(const u64* virgin, const u64* current, const u64* current_end) { |
| 105 | + |
| 106 | + for (; current != current_end; virgin += 8, current += 8) { |
| 107 | + |
| 108 | + __m512i value = *(__m512i*)current; |
| 109 | + __mmask8 mask = _mm512_testn_epi64_mask(value, value); |
| 110 | + |
| 111 | + /* All bytes are zero. */ |
| 112 | + if (mask == 0xff) continue; |
| 113 | + |
| 114 | + /* Look for nonzero bytes and check for new bits. */ |
| 115 | +#define UNROLL(x) \ |
| 116 | + if (!(mask & (1 << x)) && classify_word(current[x]) & virgin[x]) return ¤t[x] |
| 117 | + UNROLL(0); UNROLL(1); UNROLL(2); UNROLL(3); |
| 118 | + UNROLL(4); UNROLL(5); UNROLL(6); UNROLL(7); |
| 119 | +#undef UNROLL |
| 120 | + |
| 121 | + } |
| 122 | + |
| 123 | + return current_end; |
| 124 | + |
| 125 | +} |
| 126 | +#endif |
| 127 | + |
| 128 | + |
| 129 | +#if !defined(PACK_SIZE) && defined(__AVX2__) |
| 130 | +#define PACK_SIZE 32 |
| 131 | +static inline const u64* skim(const u64* virgin, const u64* current, const u64* current_end) { |
| 132 | + |
| 133 | + __m256i zeroes = _mm256_setzero_si256(); |
| 134 | + |
| 135 | + for (; current != current_end; virgin += 4, current += 4) { |
| 136 | + |
| 137 | + __m256i value = *(__m256i*)current; |
| 138 | + __m256i cmp = _mm256_cmpeq_epi64(value, zeroes); |
| 139 | + u32 mask = _mm256_movemask_epi8(cmp); |
| 140 | + |
| 141 | + /* All bytes are zero. */ |
| 142 | + if (mask == -1) continue; |
| 143 | + |
| 144 | + /* Look for nonzero bytes and check for new bits. */ |
| 145 | + if (!(mask & 0xff) && classify_word(current[0]) & virgin[0]) return ¤t[0]; |
| 146 | + if (!(mask & 0xff00) && classify_word(current[1]) & virgin[1]) return ¤t[1]; |
| 147 | + if (!(mask & 0xff0000) && classify_word(current[2]) & virgin[2]) return ¤t[2]; |
| 148 | + if (!(mask & 0xff000000) && classify_word(current[3]) & virgin[3]) return ¤t[3]; |
| 149 | + |
| 150 | + } |
| 151 | + |
| 152 | + return current_end; |
| 153 | + |
| 154 | +} |
| 155 | +#endif |
| 156 | + |
| 157 | + |
| 158 | +#if !defined(PACK_SIZE) |
| 159 | +#define PACK_SIZE 32 |
| 160 | +static inline const u64* skim(const u64* virgin, const u64* current, const u64* current_end) { |
| 161 | + |
| 162 | + for (; current != current_end; virgin += 4, current += 4) { |
| 163 | + |
| 164 | + if (current[0] && classify_word(current[0]) & virgin[0]) return ¤t[0]; |
| 165 | + if (current[1] && classify_word(current[1]) & virgin[1]) return ¤t[1]; |
| 166 | + if (current[2] && classify_word(current[2]) & virgin[2]) return ¤t[2]; |
| 167 | + if (current[3] && classify_word(current[3]) & virgin[3]) return ¤t[3]; |
| 168 | + |
| 169 | + } |
| 170 | + |
| 171 | + return current_end; |
| 172 | + |
| 173 | +} |
| 174 | +#endif |
| 175 | + |
| 176 | + |
| 177 | +static inline u8 has_new_bits_unclassified(u8* virgin_map) { |
| 178 | + u64* virgin = (u64*)virgin_map; |
| 179 | + u64* current = (u64*)trace_bits; |
| 180 | + u64* current_end = (u64*)(trace_bits + MAP_SIZE); |
| 181 | + |
| 182 | + u8 ret = 0; |
| 183 | + while ((current = (u64*)skim(virgin, current, current_end)) != current_end) { |
| 184 | + /* Compute the word offset inside current pack. */ |
| 185 | + u64 offset = ((uintptr_t)current & (PACK_SIZE - 1)) / 8; |
| 186 | + virgin = (u64*)((u8*)current - trace_bits + virgin_map); |
| 187 | + |
| 188 | +#define UNROLL(x) \ |
| 189 | + case x: \ |
| 190 | + if (*current) { \ |
| 191 | + *current = classify_word(*current); \ |
| 192 | + discover_word(&ret, current, virgin); \ |
| 193 | + } \ |
| 194 | + ++current, ++virgin; |
| 195 | + |
| 196 | + /* Ensure the alignment of the next iteration. */ |
| 197 | + switch (offset) { |
| 198 | + UNROLL(0) UNROLL(1) UNROLL(2) UNROLL(3) |
| 199 | +#if PACK_SIZE == 64 |
| 200 | + UNROLL(4) UNROLL(5) UNROLL(6) UNROLL(7) |
| 201 | +#endif |
| 202 | + } |
| 203 | + |
| 204 | +#undef UNROLL |
| 205 | + } |
| 206 | + |
| 207 | + return ret; |
| 208 | +} |
0 commit comments