Skip to content

Commit 6786f09

Browse files
committed
feat: split SIMD CSV into a separate package.
1 parent 3d0b4de commit 6786f09

37 files changed

+744
-331
lines changed

.github/workflows/ci.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,9 +30,11 @@ jobs:
3030
run: |
3131
cat > cabal.project <<EOF
3232
packages:
33+
.
3334
examples/
3435
dataframe-persistent/
3536
dataframe-hasktorch/
37+
dataframe-fastcsv/
3638
EOF
3739
3840
- name: Freeze

.github/workflows/haskell-ci.yml

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ jobs:
3636
- name: Run HLint
3737
uses: haskell-actions/hlint-run@v2
3838
with:
39-
path: '["src/", "app/", "benchmark/", "examples/", "dataframe-hasktorch/", "dataframe-persistent"]'
39+
path: '["src/", "app/", "benchmark/", "examples/", "dataframe-hasktorch/", "dataframe-persistent/", "dataframe-fastcsv/"]'
4040
fail-on: warning
4141
linux:
4242
name: Haskell-CI - Linux - ${{ matrix.compiler }}
@@ -171,6 +171,7 @@ jobs:
171171
run: |
172172
touch cabal.project
173173
echo "packages: $GITHUB_WORKSPACE/source/." >> cabal.project
174+
echo "packages: $GITHUB_WORKSPACE/source/dataframe-fastcsv" >> cabal.project
174175
cat cabal.project
175176
- name: sdist
176177
run: |
@@ -184,10 +185,13 @@ jobs:
184185
run: |
185186
PKGDIR_dataframe="$(find "$GITHUB_WORKSPACE/unpacked" -maxdepth 1 -type d -regex '.*/dataframe-[0-9.]*')"
186187
echo "PKGDIR_dataframe=${PKGDIR_dataframe}" >> "$GITHUB_ENV"
188+
PKGDIR_fastcsv="$(find "$GITHUB_WORKSPACE/unpacked" -maxdepth 1 -type d -regex '.*/dataframe-fastcsv-[0-9.]*')"
189+
echo "PKGDIR_fastcsv=${PKGDIR_fastcsv}" >> "$GITHUB_ENV"
187190
rm -f cabal.project cabal.project.local
188191
touch cabal.project
189192
touch cabal.project.local
190193
echo "packages: ${PKGDIR_dataframe}" >> cabal.project
194+
echo "packages: ${PKGDIR_fastcsv}" >> cabal.project
191195
echo "package dataframe" >> cabal.project
192196
echo " ghc-options: -Werror=missing-methods -Werror=missing-fields" >> cabal.project
193197
echo "package dataframe" >> cabal.project
@@ -196,7 +200,7 @@ jobs:
196200
echo " ghc-options: -Werror=incomplete-patterns -Werror=incomplete-uni-patterns" >> cabal.project
197201
cat >> cabal.project <<EOF
198202
EOF
199-
$HCPKG list --simple-output --names-only | perl -ne 'for (split /\s+/) { print "constraints: any.$_ installed\n" unless /^(dataframe)$/; }' >> cabal.project.local
203+
$HCPKG list --simple-output --names-only | perl -ne 'for (split /\s+/) { print "constraints: any.$_ installed\n" unless /^(dataframe|dataframe-fastcsv)$/; }' >> cabal.project.local
200204
cat cabal.project
201205
cat cabal.project.local
202206
- name: dump install plan
@@ -226,6 +230,10 @@ jobs:
226230
run: |
227231
cd ${PKGDIR_dataframe} || false
228232
${CABAL} -vnormal check
233+
- name: cabal check (dataframe-fastcsv)
234+
run: |
235+
cd ${PKGDIR_fastcsv} || false
236+
${CABAL} -vnormal check
229237
- name: haddock
230238
run: |
231239
$CABAL v2-haddock --disable-documentation --haddock-all $ARG_COMPILER --with-haddock $HADDOCK $ARG_TESTS $ARG_BENCH all

benchmark/Main.hs

Lines changed: 1 addition & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ explorer = do
4343

4444
groupByHaskell :: IO ()
4545
groupByHaskell = do
46-
df <- D.fastReadCsvUnstable "./data/housing.csv"
46+
df <- D.readCsv "./data/housing.csv"
4747
print $
4848
df
4949
|> D.groupBy ["ocean_proximity"]
@@ -84,12 +84,6 @@ groupByExplorer = do
8484
parseFile :: String -> IO ()
8585
parseFile = void . D.readCsv
8686

87-
parseFileUnstable :: String -> IO ()
88-
parseFileUnstable = void . D.readCsvUnstable
89-
90-
parseFileUnstableSIMD :: String -> IO ()
91-
parseFileUnstableSIMD = void . D.fastReadCsvUnstable
92-
9387
parseHousingCSV :: IO ()
9488
parseHousingCSV = parseFile "./data/housing.csv"
9589

@@ -215,23 +209,13 @@ main = do
215209
, bgroup
216210
"housing.csv (1.4 MB)"
217211
[ bench "Attoparsec" $ nfIO $ parseFile "./data/housing.csv"
218-
, bench "Native Haskell" $ nfIO $ parseFileUnstable "./data/housing.csv"
219-
, bench "SIMD" $ nfIO $ parseFileUnstableSIMD "./data/housing.csv"
220212
]
221213
, bgroup
222214
"effects-of-covid-19-on-trade-at-15-december-2021-provisional.csv (9.1 MB)"
223215
[ bench "Attoparsec" $
224216
nfIO $
225217
parseFile
226218
"./data/effects-of-covid-19-on-trade-at-15-december-2021-provisional.csv"
227-
, bench "Native Haskell" $
228-
nfIO $
229-
parseFileUnstable
230-
"./data/effects-of-covid-19-on-trade-at-15-december-2021-provisional.csv"
231-
, bench "SIMD" $
232-
nfIO $
233-
parseFileUnstableSIMD
234-
"./data/effects-of-covid-19-on-trade-at-15-december-2021-provisional.csv"
235219
]
236220
, bgroup
237221
"join/inner/1:1"

cabal.project

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
packages:
2+
.
3+
dataframe-fastcsv

dataframe-fastcsv/LICENSE

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
Copyright (c) 2026 Michael Chavinda
2+
3+
Permission is hereby granted, free of charge, to any person obtaining
4+
a copy of this software and associated documentation files (the
5+
"Software"), to deal in the Software without restriction, including
6+
without limitation the rights to use, copy, modify, merge, publish,
7+
distribute, sublicense, and/or sell copies of the Software, and to
8+
permit persons to whom the Software is furnished to do so, subject to
9+
the following conditions:
10+
11+
The above copyright notice and this permission notice shall be included
12+
in all copies or substantial portions of the Software.
13+
14+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
17+
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
18+
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
19+
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
20+
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
Lines changed: 198 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,198 @@
1+
#include "process_csv.h"
2+
#include <stdlib.h>
3+
#include <stdio.h>
4+
5+
/*
6+
* compile with clang -O3
7+
* in cabal use
8+
* cc-options: -O3
9+
*
10+
* Produce an array of field delimiter indices
11+
* Fields can be delimited by commas and newlines
12+
* TODO: allow the user to provide a custom delimiter
13+
* character to replace commas.
14+
* This should work with UTF-8, so long as the delimiter
15+
* character is a single byte.
16+
*
17+
* Delimiters can be escaped inside of quotes. Quotes
18+
* can also be placed inside quotes by double quoting.
19+
* For the purposes of this parser we can ignore double
20+
* quotes inside quotes, thereby treating the first quote
21+
* as the closing of the string and the next one the
22+
* immediate opening of a new one
23+
*
24+
* We can find the quoted regions by first finding
25+
* the positions of the quotes (cmpeq and then movemask)
26+
* and then using the carryless multiplication operation
27+
* to know the regions that are quoted. We can then simply
28+
* and the inverse of the quotemask to exclude commas and
29+
* newlines inside quotes
30+
*
31+
*/
32+
33+
// if the character is found at a particular
34+
// position in the array of bytes, the
35+
// corresponding bit in the returned uint64_t should
36+
// be turned on.
37+
// Example: searching for commas in
38+
// input: one, two, three
39+
// result: 000100001000000
40+
#ifdef HAS_SIMD_CSV
41+
static uint64_t find_character_in_chunk(uint8_t *in, uint8_t c) {
42+
#ifdef USE_AVX2
43+
// AVX2 implementation: load two 32-byte chunks
44+
__m256i v0 = _mm256_loadu_si256((const __m256i *)(in));
45+
__m256i v1 = _mm256_loadu_si256((const __m256i *)(in + 32));
46+
__m256i b = _mm256_set1_epi8((char)c);
47+
__m256i m0 = _mm256_cmpeq_epi8(v0, b);
48+
__m256i m1 = _mm256_cmpeq_epi8(v1, b);
49+
uint32_t lo = (uint32_t)_mm256_movemask_epi8(m0);
50+
uint32_t hi = (uint32_t)_mm256_movemask_epi8(m1);
51+
return ((uint64_t)hi << 32) | (uint64_t)lo;
52+
#else // USE_NEON
53+
// ARM NEON implementation: load 64 bytes deinterleaved
54+
uint8x16x4_t src = vld4q_u8(in);
55+
uint8x16_t mask = vmovq_n_u8(c);
56+
uint8x16_t cmp0 = vceqq_u8(src.val[0], mask);
57+
uint8x16_t cmp1 = vceqq_u8(src.val[1], mask);
58+
uint8x16_t cmp2 = vceqq_u8(src.val[2], mask);
59+
uint8x16_t cmp3 = vceqq_u8(src.val[3], mask);
60+
61+
// For an explanation of how to do movemask in
62+
// NEON, see: https://branchfree.org/2019/04/01/fitting-my-head-through-the-arm-holes-or-two-sequences-to-substitute-for-the-missing-pmovmskb-instruction-on-arm-neon/
63+
// The specific implementation below is owed to the
64+
// user 'aqrit' in a comment on the blog above
65+
// There's also https://community.arm.com/arm-community-blogs/b/servers-and-cloud-computing-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon
66+
//
67+
// The input to the move mask must be de-interleaved.
68+
// That is, for a string after vceqq_u8 we have,
69+
// cmp0 = aaaaaaaa / eeeeeeee / ...
70+
// cmp1 = bbbbbbbb / ffffffff / ...
71+
// cmp2 = cccccccc / gggggggg / ...
72+
// cmp3 = dddddddd / hhhhhhhh / ...
73+
// Luckily vld4q_u8 does this for us. Now we want
74+
// to interleave this into a 64bit integer with bits
75+
// abcdefgh...
76+
// cmp0 holds bits for positions
77+
// 0,4,8,..., cmp1 holds bits for 1,5,9,..., and so on.
78+
// So to bring together the bits for different positions
79+
// we right shift and combine with vsriq_n_u8
80+
//
81+
// vsriq_n_u8 shifts each byte of the first operand
82+
// right by n bits, and combines it with the bits of
83+
// the second operand.
84+
// example:
85+
// uint8_t mask = 0xFF >> n; // if n = 1, 0111 1111
86+
// uint8_t shifted = operand1 >> n // 0bbb, bbbb
87+
// operand1 = (operand2 & (!mask)) | shifted
88+
// (aaaa aaaa & 1000 0000) | 0bbb bbbb = abbb bbbb
89+
//
90+
// So we first bring together the bits the first two
91+
// rows and the next two rows (so to speak)
92+
// t0 = abbbbbbb/efffffff/...
93+
uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1);
94+
// t1 = cddddddd/ghhhhhhh/...
95+
uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1);
96+
// Now we must combine each of our combined rows
97+
// so that we get back our column interleaved
98+
// t2 = abcddddd/efghhhhh/...
99+
uint8x16_t t2 = vsriq_n_u8(t1, t0, 2);
100+
// Then to get rid of the repeated bits in the upper half
101+
// t3 = abcdabcd/efghefgh/...
102+
uint8x16_t t3 = vsriq_n_u8(t2, t2, 4);
103+
// and now it's the relatively simple matter of getting
104+
// rid of half the bits. We combine our 8bit words into 16
105+
// bit words for this step, and then we shift right by 4
106+
// and turn the result into an 8 bit word
107+
// afterreinterpert: abcdabcdefghefgh/...
108+
// afterrightshift: 0000abcdabcdefgh/...
109+
// take the lower bits: abcdefgh/...
110+
uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4);
111+
// Finally we recombine them into a 64 bit integer
112+
// (vreinterpret_u64_u8 here does uint8x8 -> uint64x1
113+
// and vget_lane_u64 does uint64x1 -> uint64)
114+
return vget_lane_u64(vreinterpret_u64_u8(t4), 0);
115+
#endif
116+
}
117+
#endif
118+
119+
// I owe a debt to https://github.com/geofflangdale/simdcsv
120+
// Let's go ahead and assume `in` will only ever get 64 bytes
121+
// initial_quoted will be either all_ones ~0ULL or all_zeros 0ULL
122+
#ifdef HAS_SIMD_CSV
123+
static uint64_t parse_chunk(uint8_t *in, uint8_t separator, uint64_t *initial_quoted) {
124+
uint64_t quotebits = find_character_in_chunk(in, QUOTE_CHAR);
125+
// See https://wunkolo.github.io/post/2020/05/pclmulqdq-tricks/
126+
// Also, section 3.1.1 of Parsing Gigabytes of JSON per Second,
127+
// Geoff Langdale, Daniel Lemire, https://arxiv.org/pdf/1902.08318
128+
#ifdef USE_AVX2
129+
// Use PCLMUL for carryless multiplication on x86
130+
__m128i a = _mm_set_epi64x(0, (int64_t)ALL_ONES_MASK);
131+
__m128i b = _mm_set_epi64x(0, (int64_t)quotebits);
132+
__m128i result = _mm_clmulepi64_si128(a, b, 0);
133+
uint64_t quotemask = (uint64_t)_mm_cvtsi128_si64(result);
134+
#else // USE_NEON
135+
// Use vmull_p64 (PMULL) for carryless multiplication on ARM
136+
// Requires ARM crypto extensions (compile: __ARM_FEATURE_AES, runtime: pmull flag)
137+
uint64_t quotemask = vmull_p64(ALL_ONES_MASK, quotebits);
138+
#endif
139+
quotemask ^= (*initial_quoted);
140+
// Find out if the chunk ends in a quoted region by looking
141+
// at the last bit
142+
(*initial_quoted) = (uint64_t)((int64_t)quotemask >> 63);
143+
144+
uint64_t commabits = find_character_in_chunk(in, separator);
145+
uint64_t newlinebits = find_character_in_chunk(in, NEWLINE_CHAR);
146+
147+
uint64_t delimiter_bits = (commabits | newlinebits) & ~quotemask;
148+
return delimiter_bits;
149+
}
150+
#endif
151+
152+
#ifdef HAS_SIMD_CSV
153+
static size_t find_one_indices(size_t start_index, uint64_t bits, size_t *indices, size_t *base) {
154+
size_t position = 0;
155+
uint64_t bitset = bits;
156+
while (bitset != 0) {
157+
// temp only has the least significant bit of
158+
// bitset turned on.
159+
// In twos complement: 0 - x = ~ x + 1
160+
uint64_t temp = bitset & -bitset;
161+
// count trailing zeros
162+
size_t r = __builtin_ctzll(bitset);
163+
indices[(*base) + position] = start_index + r;
164+
position++;
165+
166+
bitset ^= temp;
167+
}
168+
*base += position;
169+
return position;
170+
}
171+
#endif
172+
173+
size_t get_delimiter_indices(uint8_t *buf, size_t len, uint8_t separator, size_t *indices) {
174+
// Recall we padded our file with 64 empty bytes.
175+
// So if, for example, we had a file of 187 bytes
176+
// We pad it with zeros and so we have 251 bytes
177+
// The chunks we have are ptr + 0, ptr + 64, and pt
178+
// (we don't do ptr + 192 since we do len - 64
179+
// below. This way, in ptr + 128 we have 59 bytes of
180+
// actual data and 5 bytes of zeros. If we didn't do this
181+
// we'd be reading past the end of file on the last row
182+
#ifdef HAS_SIMD_CSV
183+
size_t unpaddedLen = len < 64 ? 0 : len - 64;
184+
uint64_t initial_quoted = 0ULL;
185+
size_t base = 0;
186+
for (size_t i = 0; i < unpaddedLen; i += 64) {
187+
uint8_t *in = buf + i;
188+
uint64_t delimiter_bits = parse_chunk(in, separator, &initial_quoted);
189+
find_one_indices(i, delimiter_bits, indices, &base);
190+
}
191+
return base;
192+
#else
193+
// SIMD not available or carryless multiplication not supported.
194+
// Signal fallback to Haskell implementation.
195+
(void)buf; (void)len; (void)indices;
196+
return (size_t)-1;
197+
#endif
198+
}
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
#ifndef PROCESS_CSV
2+
#define PROCESS_CSV
3+
4+
#include <stddef.h>
5+
#include <stdint.h>
6+
7+
// Define feature macros for SIMD support with carryless multiplication
8+
// We need both SIMD instructions AND carryless multiplication for the CSV parser
9+
#if defined(__AVX2__) && defined(__PCLMUL__)
10+
#define HAS_SIMD_CSV 1
11+
#define USE_AVX2 1
12+
#include <immintrin.h>
13+
#include <wmmintrin.h>
14+
#elif defined(__ARM_NEON) && (defined(__ARM_FEATURE_AES) || defined(__ARM_FEATURE_CRYPTO))
15+
// Note: __ARM_FEATURE_CRYPTO is deprecated; prefer __ARM_FEATURE_AES
16+
// We need polynomial multiply (vmull_p64/PMULL) for carryless multiplication
17+
// Runtime check: 'pmull' flag in /proc/cpuinfo on Linux
18+
// We support both macros for compatibility with older compilers
19+
#define HAS_SIMD_CSV 1
20+
#define USE_NEON 1
21+
#include <arm_neon.h>
22+
#endif
23+
24+
// CSV parsing constants
25+
#define COMMA_CHAR 0x2C
26+
#define NEWLINE_CHAR 0x0A
27+
#define QUOTE_CHAR 0x22
28+
#define ALL_ONES_MASK ~0ULL
29+
#define ALL_ZEROS_MASK 0ULL
30+
31+
size_t get_delimiter_indices(uint8_t *buf, size_t len, uint8_t separator, size_t* indices);
32+
33+
#endif

0 commit comments

Comments
 (0)