|
| 1 | +#include "process_csv.h" |
| 2 | +#include <stdlib.h> |
| 3 | +#include <stdio.h> |
| 4 | + |
| 5 | +/* |
| 6 | + * compile with clang -O3 |
| 7 | + * in cabal use |
| 8 | + * cc-options: -O3 |
| 9 | + * |
| 10 | + * Produce an array of field delimiter indices |
| 11 | + * Fields can be delimited by commas and newlines |
| 12 | + * TODO: allow the user to provide a custom delimiter |
| 13 | + * character to replace commas. |
| 14 | + * This should work with UTF-8, so long as the delimiter |
| 15 | + * character is a single byte. |
| 16 | + * |
| 17 | + * Delimiters can be escaped inside of quotes. Quotes |
| 18 | + * can also be placed inside quotes by double quoting. |
| 19 | + * For the purposes of this parser we can ignore double |
| 20 | + * quotes inside quotes, thereby treating the first quote |
| 21 | + * as the closing of the string and the next one the |
| 22 | + * immediate opening of a new one |
| 23 | + * |
| 24 | + * We can find the quoted regions by first finding |
| 25 | + * the positions of the quotes (cmpeq and then movemask) |
| 26 | + * and then using the carryless multiplication operation |
| 27 | + * to know the regions that are quoted. We can then simply |
| 28 | + * and the inverse of the quotemask to exclude commas and |
| 29 | + * newlines inside quotes |
| 30 | + * |
| 31 | + */ |
| 32 | + |
| 33 | +// if the character is found at a particular |
| 34 | +// position in the array of bytes, the |
| 35 | +// corresponding bit in the returned uint64_t should |
| 36 | +// be turned on. |
| 37 | +// Example: searching for commas in |
| 38 | +// input: one, two, three |
| 39 | +// result: 000100001000000 |
| 40 | +#ifdef HAS_SIMD_CSV |
static uint64_t find_character_in_chunk(uint8_t *in, uint8_t c) {
#ifdef USE_AVX2
    // AVX2 implementation: load two 32-byte chunks.
    // `in` must point at 64 readable bytes; the caller guarantees this
    // via the 64 bytes of zero padding at the end of the buffer.
    __m256i v0 = _mm256_loadu_si256((const __m256i *)(in));
    __m256i v1 = _mm256_loadu_si256((const __m256i *)(in + 32));
    // Broadcast the needle byte to all 32 lanes.
    __m256i b = _mm256_set1_epi8((char)c);
    // Per-byte compare: a lane becomes 0xFF where it equals c, 0x00 elsewhere.
    __m256i m0 = _mm256_cmpeq_epi8(v0, b);
    __m256i m1 = _mm256_cmpeq_epi8(v1, b);
    // movemask compresses each 0xFF/0x00 lane down to one bit.
    uint32_t lo = (uint32_t)_mm256_movemask_epi8(m0);
    uint32_t hi = (uint32_t)_mm256_movemask_epi8(m1);
    return ((uint64_t)hi << 32) | (uint64_t)lo;
#else // USE_NEON
    // ARM NEON implementation: load 64 bytes deinterleaved, so
    // src.val[0] holds bytes 0,4,8,..., src.val[1] holds 1,5,9,..., etc.
    uint8x16x4_t src = vld4q_u8(in);
    uint8x16_t mask = vmovq_n_u8(c);
    // Per-byte compare: each lane becomes 0xFF where equal to c, 0x00 elsewhere.
    uint8x16_t cmp0 = vceqq_u8(src.val[0], mask);
    uint8x16_t cmp1 = vceqq_u8(src.val[1], mask);
    uint8x16_t cmp2 = vceqq_u8(src.val[2], mask);
    uint8x16_t cmp3 = vceqq_u8(src.val[3], mask);

    // For an explanation of how to do movemask in
    // NEON, see: https://branchfree.org/2019/04/01/fitting-my-head-through-the-arm-holes-or-two-sequences-to-substitute-for-the-missing-pmovmskb-instruction-on-arm-neon/
    // The specific implementation below is owed to the
    // user 'aqrit' in a comment on the blog above.
    // There's also https://community.arm.com/arm-community-blogs/b/servers-and-cloud-computing-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon
    //
    // The input to the move mask must be de-interleaved.
    // That is, for a string after vceqq_u8 we have,
    // cmp0 = aaaaaaaa / eeeeeeee / ...
    // cmp1 = bbbbbbbb / ffffffff / ...
    // cmp2 = cccccccc / gggggggg / ...
    // cmp3 = dddddddd / hhhhhhhh / ...
    // Luckily vld4q_u8 does this for us. Now we want
    // to interleave this into a 64-bit integer with bits
    // abcdefgh...
    // cmp0 holds bits for positions
    // 0,4,8,..., cmp1 holds bits for 1,5,9,..., and so on.
    // So to bring together the bits for different positions
    // we right shift and combine with vsriq_n_u8.
    //
    // vsriq_n_u8(a, b, n) shifts each byte of b right by n bits
    // and inserts the result into a, keeping only the top n bits
    // of each byte of a:
    // example (n = 1):
    // uint8_t mask = 0xFF >> n;        // 0111 1111
    // uint8_t shifted = b >> n;        // 0bbb bbbb
    // result = (a & ~mask) | shifted;  // abbb bbbb
    // Note that after vceqq_u8 every byte is either 0x00 or 0xFF,
    // so all 8 bits inside a byte agree; the letter diagrams below
    // track which *position's* bit lands where, not individual bits.
    //
    // So we first bring together the bits of the first two
    // rows and the next two rows (so to speak)
    // t0 = abbbbbbb/efffffff/...
    uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1);
    // t1 = cddddddd/ghhhhhhh/...
    uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1);
    // Now we must combine each of our combined rows
    // so that we get back our column interleaved
    // t2 = abcddddd/efghhhhh/...
    uint8x16_t t2 = vsriq_n_u8(t1, t0, 2);
    // Then to get rid of the repeated bits in the upper half
    // t3 = abcdabcd/efghefgh/...
    uint8x16_t t3 = vsriq_n_u8(t2, t2, 4);
    // and now it's the relatively simple matter of getting
    // rid of half the bits. We combine our 8-bit words into 16-
    // bit words for this step, and then we shift right by 4
    // and turn the result into an 8-bit word.
    // after reinterpret:   abcdabcdefghefgh/...
    // after right shift:   0000abcdabcdefgh/...
    // take the lower bits: abcdefgh/...
    uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4);
    // Finally we recombine them into a 64-bit integer
    // (vreinterpret_u64_u8 here does uint8x8 -> uint64x1
    // and vget_lane_u64 does uint64x1 -> uint64)
    return vget_lane_u64(vreinterpret_u64_u8(t4), 0);
#endif
}
| 117 | +#endif |
| 118 | + |
| 119 | +// I owe a debt to https://github.com/geofflangdale/simdcsv |
| 120 | +// Let's go ahead and assume `in` will only ever get 64 bytes |
| 121 | +// initial_quoted will be either all_ones ~0ULL or all_zeros 0ULL |
| 122 | +#ifdef HAS_SIMD_CSV |
static uint64_t parse_chunk(uint8_t *in, uint8_t separator, uint64_t *initial_quoted) {
    // Bitmask of quote-character positions in this 64-byte chunk.
    uint64_t quotebits = find_character_in_chunk(in, QUOTE_CHAR);
    // Carryless multiplication by an all-ones operand computes a prefix
    // XOR over quotebits: bit i of the low 64 bits of the product is the
    // XOR of quote bits 0..i. So bits from an opening quote up to (but
    // excluding) the matching closing quote come out set — the quoted
    // regions.
    // See https://wunkolo.github.io/post/2020/05/pclmulqdq-tricks/
    // Also, section 3.1.1 of Parsing Gigabytes of JSON per Second,
    // Geoff Langdale, Daniel Lemire, https://arxiv.org/pdf/1902.08318
#ifdef USE_AVX2
    // Use PCLMUL for carryless multiplication on x86
    __m128i a = _mm_set_epi64x(0, (int64_t)ALL_ONES_MASK);
    __m128i b = _mm_set_epi64x(0, (int64_t)quotebits);
    __m128i result = _mm_clmulepi64_si128(a, b, 0);
    // Keep only the low 64 bits of the 128-bit product.
    uint64_t quotemask = (uint64_t)_mm_cvtsi128_si64(result);
#else // USE_NEON
    // Use vmull_p64 (PMULL) for carryless multiplication on ARM
    // Requires ARM crypto extensions (compile: __ARM_FEATURE_AES, runtime: pmull flag)
    uint64_t quotemask = vmull_p64(ALL_ONES_MASK, quotebits);
#endif
    // If the previous chunk ended inside a quoted region, *initial_quoted
    // is all ones and flips the whole mask, continuing that region.
    quotemask ^= (*initial_quoted);
    // Find out if the chunk ends in a quoted region by looking
    // at the last bit. The arithmetic right shift broadcasts the top bit,
    // yielding ~0ULL (still inside quotes) or 0ULL for the next chunk.
    // (Right-shifting a negative int64_t is implementation-defined but is
    // an arithmetic shift on gcc/clang — TODO confirm for other compilers.)
    (*initial_quoted) = (uint64_t)((int64_t)quotemask >> 63);

    // Positions of the field separator and of newlines in this chunk.
    uint64_t commabits = find_character_in_chunk(in, separator);
    uint64_t newlinebits = find_character_in_chunk(in, NEWLINE_CHAR);

    // Keep only delimiters that fall outside quoted regions.
    uint64_t delimiter_bits = (commabits | newlinebits) & ~quotemask;
    return delimiter_bits;
}
| 150 | +#endif |
| 151 | + |
| 152 | +#ifdef HAS_SIMD_CSV |
// Record the position of every set bit in `bits`, offset by
// `start_index`, into `indices` starting at slot *base.
// Advances *base past the slots written and returns how many
// indices were emitted.
static size_t find_one_indices(size_t start_index, uint64_t bits, size_t *indices, size_t *base) {
    size_t count = 0;
    // Classic set-bit walk: __builtin_ctzll gives the offset of the
    // lowest set bit, and `remaining &= remaining - 1` clears exactly
    // that bit, so each iteration consumes one delimiter.
    for (uint64_t remaining = bits; remaining != 0; remaining &= remaining - 1) {
        indices[*base + count] = start_index + (size_t)__builtin_ctzll(remaining);
        count++;
    }
    *base += count;
    return count;
}
| 171 | +#endif |
| 172 | + |
// Scan `buf` (logical length `len`, which the caller has padded with 64
// trailing zero bytes) and write the byte offset of every unquoted
// delimiter (`separator` or newline) into `indices`.
// Returns the number of offsets written, or (size_t)-1 when no SIMD
// implementation is available, signalling the caller to fall back to
// the Haskell implementation.
size_t get_delimiter_indices(uint8_t *buf, size_t len, uint8_t separator, size_t *indices) {
    // Recall we padded our file with 64 empty bytes.
    // So if, for example, we had a file of 187 bytes
    // We pad it with zeros and so we have 251 bytes
    // The chunks we have are ptr + 0, ptr + 64, and ptr + 128
    // (we don't do ptr + 192 since we do len - 64
    // below. This way, in ptr + 128 we have 59 bytes of
    // actual data and 5 bytes of zeros. If we didn't do this
    // we'd be reading past the end of file on the last row
#ifdef HAS_SIMD_CSV
    size_t unpaddedLen = len < 64 ? 0 : len - 64;
    // All ones while a chunk boundary falls inside a quoted region,
    // all zeros otherwise; threaded through every parse_chunk call.
    uint64_t initial_quoted = 0ULL;
    // Running count of indices written so far.
    size_t base = 0;
    for (size_t i = 0; i < unpaddedLen; i += 64) {
        uint8_t *in = buf + i;
        uint64_t delimiter_bits = parse_chunk(in, separator, &initial_quoted);
        find_one_indices(i, delimiter_bits, indices, &base);
    }
    return base;
#else
    // SIMD not available or carryless multiplication not supported.
    // Signal fallback to Haskell implementation.
    // (void)-cast ALL parameters — the original omitted `separator`,
    // causing an unused-parameter warning under -Wextra.
    (void)buf; (void)len; (void)separator; (void)indices;
    return (size_t)-1;
#endif
}
0 commit comments