|
1 | 1 | #include "config.h" |
2 | 2 | #include "types.h" |
3 | 3 |
|
| 4 | +#if (defined(__AVX512F__) && defined(__AVX512DQ__)) || defined(__AVX2__) |
| 5 | +# include <immintrin.h> |
| 6 | +#endif |
4 | 7 |
|
5 | 8 | static inline u64 classify_word(u64 word) { |
6 | 9 |
|
@@ -94,3 +97,78 @@ static inline void discover_word(u8* ret, u64* current, u64* virgin) { |
94 | 97 | } |
95 | 98 |
|
96 | 99 | } |
| 100 | + |
| 101 | + |
| 102 | +#if defined(__AVX512F__) && defined(__AVX512DQ__) |
| 103 | +#define PACK_SIZE 64 |
| 104 | +static inline u32 skim(const u64* virgin, const u64* current, const u64* current_end) { |
| 105 | + |
| 106 | + for (; current != current_end; virgin += 8, current += 8) { |
| 107 | + |
| 108 | + __m512i value = *(__m512i*)current; |
| 109 | + __mmask8 mask = _mm512_testn_epi64_mask(value, value); |
| 110 | + |
| 111 | + /* All bytes are zero. */ |
| 112 | + if (mask == 0xff) continue; |
| 113 | + |
| 114 | + /* Look for nonzero bytes and check for new bits. */ |
| 115 | +#define UNROLL(x) \ |
| 116 | + if (!(mask & (1 << x)) && classify_word(current[x]) & virgin[x]) return 1 |
| 117 | + UNROLL(0); UNROLL(1); UNROLL(2); UNROLL(3); |
| 118 | + UNROLL(4); UNROLL(5); UNROLL(6); UNROLL(7); |
| 119 | +#undef UNROLL |
| 120 | + |
| 121 | + } |
| 122 | + |
| 123 | + return 0; |
| 124 | + |
| 125 | +} |
| 126 | +#endif |
| 127 | + |
| 128 | + |
| 129 | +#if !defined(PACK_SIZE) && defined(__AVX2__) |
| 130 | +#define PACK_SIZE 32 |
| 131 | +static inline u32 skim(const u64* virgin, const u64* current, const u64* current_end) { |
| 132 | + |
| 133 | + __m256i zeroes = _mm256_setzero_si256(); |
| 134 | + |
| 135 | + for (; current != current_end; virgin += 4, current += 4) { |
| 136 | + |
| 137 | + __m256i value = *(__m256i*)current; |
| 138 | + __m256i cmp = _mm256_cmpeq_epi64(value, zeroes); |
| 139 | + u32 mask = _mm256_movemask_epi8(cmp); |
| 140 | + |
| 141 | + /* All bytes are zero. */ |
| 142 | + if (mask == -1) continue; |
| 143 | + |
| 144 | + /* Look for nonzero bytes and check for new bits. */ |
| 145 | + if (!(mask & 0xff) && classify_word(current[0]) & virgin[0]) return 1; |
| 146 | + if (!(mask & 0xff00) && classify_word(current[1]) & virgin[1]) return 1; |
| 147 | + if (!(mask & 0xff0000) && classify_word(current[2]) & virgin[2]) return 1; |
| 148 | + if (!(mask & 0xff000000) && classify_word(current[3]) & virgin[3]) return 1; |
| 149 | + |
| 150 | + } |
| 151 | + |
| 152 | + return 0; |
| 153 | + |
| 154 | +} |
| 155 | +#endif |
| 156 | + |
| 157 | + |
| 158 | +#if !defined(PACK_SIZE) |
| 159 | +#define PACK_SIZE 32 |
| 160 | +static inline u32 skim(const u64* virgin, const u64* current, const u64* current_end) { |
| 161 | + |
| 162 | + for (; current != current_end; virgin += 4, current += 4) { |
| 163 | + |
| 164 | + if (current[0] && classify_word(current[0]) & virgin[0]) return 1; |
| 165 | + if (current[1] && classify_word(current[1]) & virgin[1]) return 1; |
| 166 | + if (current[2] && classify_word(current[2]) & virgin[2]) return 1; |
| 167 | + if (current[3] && classify_word(current[3]) & virgin[3]) return 1; |
| 168 | + |
| 169 | + } |
| 170 | + |
| 171 | + return 0; |
| 172 | + |
| 173 | +} |
| 174 | +#endif |
0 commit comments