Skip to content

Commit 00fa81e

Browse files
rewrite impl after blog post
1 parent 8c7f61c commit 00fa81e

7 files changed

Lines changed: 1762 additions & 1931 deletions

File tree

README.md

Lines changed: 0 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -3,67 +3,6 @@
33
simdcsv is a CSV parser that evaluates 64 bytes at a time. There are many kinds of CSV files; this project adheres to the format described
44
in [RFC 4180](https://www.rfc-editor.org/rfc/rfc4180.html).
55

6-
**Introduction**
7-
8-
We can classify every character in a CSV file into one of four classes: COMMA, QUOTATION, NEW_LINE, or OTHER. We can build a perfect lookup table and use `vqtbl1q_u8` to classify 16 characters at once. Daniel Lemire calls this "vectorized classification" in the simdjson paper. [[code pointer]](https://github.com/friendlymatthew/simdcsv/blob/main/src/classifier.rs)
9-
10-
Once we classify every character, we can build a bitset for each class. We chunk through 64 characters at a time, building a `u64` for every chunk. Here is a naive case:
11-
12-
```
13-
[//]: # COMMA = 0, QUOTATION = 1, NEW_LINE = 2, OTHER = 3
14-
15-
aaa,bbb,ccc
16-
33303330333
17-
```
18-
19-
Then the bitsets look like:
20-
21-
```rs
22-
comma_bitset = 0b00010001000
23-
other_bitset = 0b11101110111
24-
```
25-
26-
Now, we can just [count the number of leading zeros](https://doc.rust-lang.org/std/primitive.u64.html#method.leading_zeros) in the comma bitset to pull the csv entries.
27-
28-
Using a bitset is pretty powerful in cases where one wants to check if there exists a symbol, count the # of symbols, or remove escaped symbols.
29-
30-
**Detecting Escaped Quotations and Commas**
31-
32-
Consider the csv row: `"aaa,norm","b""bb","ccc"`
33-
34-
In CSV, quotes are escaped by doubling them (`""`). The `""` in `b""bb` is field content, not a structural delimiter. We detect escaped pairs by finding adjacent quotes:
35-
36-
```rs
37-
let escaped = q & (q << 1); // Find adjacent quote pairs
38-
let escaped = escaped | (escaped >> 1); // Mark both quotes in each pair
39-
let valid_quotes = q & !escaped; // Remove escaped quotes
40-
```
41-
42-
```rs
43-
quote_bitset = 0b100010100110101000000001 // Bit i (counting from the LSB) = byte i of the row
44-
q << 1 = 0b000101001101010000000010
45-
escaped = 0b000000000100000000000000 // Found the "" pair
46-
escaped | >> 1 = 0b000000000110000000000000 // Both quotes marked
47-
valid_quotes = 0b100010100000101000000001 // Only structural quotes remain
48-
```
49-
50-
**Marking Inside Quotations**
51-
52-
With only structural quotes, we use parallel prefix XOR to mark all bits between quote pairs:
53-
54-
```rs
55-
valid_quotes = 0b100010100000101000000001 // Structural quotes only
56-
inside_quotes = 0b011100011111000111111110 // All bits between quote pairs marked as 1
57-
```
58-
59-
Masking out commas inside quotes:
60-
61-
```rs
62-
comma_bitset = 0b000001000000010000010000 // Commas at positions 4, 10, 18
63-
valid_commas = comma_bitset & !inside_quotes
64-
= 0b000001000000010000000000 // Comma at 4 masked out
65-
```
66-
676
## Reading
687

698
https://www.rfc-editor.org/rfc/rfc4180.html<br>

src/classify.rs

Lines changed: 35 additions & 113 deletions
Original file line numberDiff line numberDiff line change
@@ -1,110 +1,45 @@
1-
pub const COMMA_CLASS: u8 = 1;
2-
pub const NEW_LINE_CLASS: u8 = 2;
3-
pub const QUOTATION_CLASS: u8 = 4;
4-
5-
pub const LO_LOOKUP: [u8; 16] = [
6-
0,
7-
0,
8-
QUOTATION_CLASS,
9-
0,
10-
0,
11-
0,
12-
0,
13-
0,
14-
0,
15-
0,
16-
NEW_LINE_CLASS,
17-
0,
18-
COMMA_CLASS,
19-
NEW_LINE_CLASS,
20-
0,
21-
0,
22-
];
23-
24-
pub const HI_LOOKUP: [u8; 16] = [
25-
NEW_LINE_CLASS,
26-
0,
27-
COMMA_CLASS | QUOTATION_CLASS,
28-
0,
29-
0,
30-
0,
31-
0,
32-
0,
33-
0,
34-
0,
35-
0,
36-
0,
37-
0,
38-
0,
39-
0,
40-
0,
41-
];
42-
43-
use crate::u8x16::u8x16;
44-
45-
#[derive(Debug)]
46-
pub struct CsvClassifier<'a> {
47-
cursor: usize,
48-
data: &'a [u8],
49-
}
1+
use crate::u8x16;
502

51-
impl<'a> CsvClassifier<'a> {
52-
pub const fn new(data: &'a [u8]) -> Self {
53-
Self { cursor: 0, data }
54-
}
3+
pub const COMMA: u8 = 1;
4+
pub const NEWLINE: u8 = 2;
5+
pub const QUOTES: u8 = 4;
556

56-
pub fn classify(&mut self) -> Vec<u8x16> {
57-
let mut bitsets = Vec::new();
7+
pub const LOW_NIBBLES: [u8; 16] = {
8+
let mut out = [0u8; 16];
9+
out[0x2] = QUOTES;
10+
out[0xA] = NEWLINE;
11+
out[0xC] = COMMA;
12+
out[0xD] = NEWLINE;
5813

59-
let high_nibble_lookup = u8x16::from_slice_unchecked(&HI_LOOKUP);
60-
let low_nibble_lookup = u8x16::from_slice_unchecked(&LO_LOOKUP);
14+
out
15+
};
6116

62-
while self.cursor < self.data.len() {
63-
let (lanes, _aligned) = self.load_u8x16();
17+
pub const HIGH_NIBBLES: [u8; 16] = {
18+
let mut out = [0u8; 16];
19+
out[0x0] = NEWLINE;
20+
out[0x2] = COMMA | QUOTES;
6421

65-
let (hi_lanes, lo_lanes) = lanes.nibbles();
66-
let res = high_nibble_lookup.classify(hi_lanes) & low_nibble_lookup.classify(lo_lanes);
22+
out
23+
};
6724

68-
bitsets.push(res);
69-
}
25+
// note: data.len() must be a multiple of 16; a trailing partial chunk is skipped by `chunks_exact` and never classified
26+
pub fn classify(data: &[u8]) -> Vec<u8x16> {
27+
let low_nibbles = u8x16::from_slice_unchecked(&LOW_NIBBLES);
28+
let high_nibbles = u8x16::from_slice_unchecked(&HIGH_NIBBLES);
7029

71-
// if data length is exactly aligned to 16 bytes and doesn't end with a \n
72-
// the last load_u8x16 took the fast path and no new line was appended
73-
let last = self.data[self.data.len() - 1];
74-
if self.data.len().is_multiple_of(16) && last != 0x0A && last != 0x0D {
75-
let mut temp = [0u8; 16];
76-
temp[0] = 0x0A;
30+
let chunks = data.chunks_exact(u8x16::LANE_COUNT);
7731

78-
let (hi, lo) = u8x16::from_slice_unchecked(&temp).nibbles();
32+
let mut out = Vec::with_capacity(data.len() / 16);
7933

80-
bitsets.push(high_nibble_lookup.classify(hi) & low_nibble_lookup.classify(lo));
81-
}
34+
for chunk in chunks {
35+
let v = u8x16::from_slice_unchecked(chunk);
36+
let (high, low) = v.nibbles();
8237

83-
bitsets
38+
let res = high_nibbles.classify(high) & low_nibbles.classify(low);
39+
out.push(res);
8440
}
8541

86-
fn load_u8x16(&mut self) -> (u8x16, bool) {
87-
if self.cursor + u8x16::LANE_COUNT < self.data.len() {
88-
let slice = &self.data[self.cursor..self.cursor + u8x16::LANE_COUNT];
89-
self.cursor += u8x16::LANE_COUNT;
90-
91-
return (u8x16::from_slice_unchecked(slice), true);
92-
}
93-
94-
let slice = &self.data[self.cursor..];
95-
self.cursor = self.data.len();
96-
97-
let mut temp = [0u8; 16];
98-
let copy_len = slice.len().min(16);
99-
temp[..copy_len].copy_from_slice(&slice[..copy_len]);
100-
101-
let last = self.data[self.data.len() - 1];
102-
if last != 0x0A && last != 0x0D && copy_len < 16 {
103-
temp[copy_len] = 0x0A;
104-
}
105-
106-
(u8x16::from_slice_unchecked(&temp), false)
107-
}
42+
out
10843
}
10944

11045
#[cfg(test)]
@@ -113,31 +48,18 @@ mod tests {
11348

11449
#[test]
11550
fn test_basic_classify() {
116-
let mut classifier = CsvClassifier::new(b"a,b,c\nf,\"g\"");
117-
let bitsets = classifier.classify();
51+
let mut data = b"a,b,c\nf,\"g\"".to_vec();
52+
data.resize(data.len().next_multiple_of(16), 0);
53+
54+
let bitsets = classify(&data);
11855

11956
assert_eq!(bitsets.len(), 1);
12057
let res: [u8; 16] = bitsets[0].into();
12158

12259
assert_eq!(
12360
res,
12461
[
125-
0,
126-
COMMA_CLASS,
127-
0,
128-
COMMA_CLASS,
129-
0,
130-
NEW_LINE_CLASS,
131-
0,
132-
COMMA_CLASS,
133-
QUOTATION_CLASS,
134-
0,
135-
QUOTATION_CLASS,
136-
NEW_LINE_CLASS, // we always add a \n at the end
137-
0,
138-
0,
139-
0,
140-
0,
62+
0, COMMA, 0, COMMA, 0, NEWLINE, 0, COMMA, QUOTES, 0, QUOTES, 0, 0, 0, 0, 0,
14163
]
14264
);
14365
}

src/lib.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
mod classify;
44
mod read;
5-
mod u8x16;
5+
mod simd;
66

77
pub use read::*;
8+
pub(crate) use simd::*;

src/main.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@ use simdcsv::read;
22

33
fn main() -> Result<(), Box<dyn std::error::Error>> {
44
let path = std::env::args().nth(1).expect("expect .csv file path");
5-
let data = std::fs::read(path)?;
6-
let rows = read(&data);
5+
let mut data = std::fs::read(path)?;
6+
let rows = read(&mut data);
77

88
println!("{:?}", rows);
99

0 commit comments

Comments
 (0)