Skip to content

Commit bab62d6

Browse files
simplify decoder
1 parent 8cb94a5 commit bab62d6

5 files changed

Lines changed: 104 additions & 424 deletions

File tree

benches/parse.rs

Lines changed: 1 addition & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ fn clickbench_schema() -> Arc<Schema> {
99
let nullable = matches!(dt, Utf8);
1010
Field::new(name, dt, nullable)
1111
};
12+
1213
Arc::new(Schema::new(vec![
1314
f("WatchID", Int64),
1415
f("JavaEnable", Int16),
@@ -124,30 +125,6 @@ fn bench_clickbench(c: &mut Criterion) {
124125

125126
let schema = clickbench_schema();
126127

127-
c.bench_function("arrow-csv2::Decoder (clickbench 100MB)", |b| {
128-
b.iter(|| {
129-
let mut decoder = arrow_csv2::ReaderBuilder::new(schema.clone())
130-
.with_batch_size(8192)
131-
.build_decoder();
132-
133-
let mut offset = 0;
134-
let mut batches = Vec::new();
135-
loop {
136-
let consumed = decoder.decode(&raw[offset..]).unwrap();
137-
offset += consumed;
138-
if consumed == 0 || decoder.capacity() == 0 {
139-
if let Some(batch) = decoder.flush().unwrap() {
140-
batches.push(batch);
141-
}
142-
if consumed == 0 && decoder.capacity() > 0 {
143-
break;
144-
}
145-
}
146-
}
147-
batches
148-
});
149-
});
150-
151128
c.bench_function("arrow-csv2::Reader (clickbench 100MB)", |b| {
152129
b.iter(|| {
153130
let reader = arrow_csv2::ReaderBuilder::new(schema.clone())

src/classify.rs

Lines changed: 61 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -22,45 +22,75 @@ pub const HIGH_NIBBLES: [u8; 16] = {
2222
out
2323
};
2424

25-
// note: data must be a multiple of 16
26-
pub fn classify(data: &[u8]) -> Vec<u8x16> {
27-
let low_nibbles = u8x16::from_slice_unchecked(&LOW_NIBBLES);
28-
let high_nibbles = u8x16::from_slice_unchecked(&HIGH_NIBBLES);
25+
pub struct ClassifyResult {
26+
pub comma_bitsets: Vec<u64>,
27+
pub newline_bitsets: Vec<u64>,
28+
}
2929

30-
let chunks = data.chunks_exact(u8x16::LANE_COUNT);
30+
// classify structural characters and apply quote masking
31+
// note: data must be a multiple of 64 bytes
32+
pub fn classify_and_mask(data: &[u8]) -> ClassifyResult {
33+
let low_nib = u8x16::from_slice_unchecked(&LOW_NIBBLES);
34+
let high_nib = u8x16::from_slice_unchecked(&HIGH_NIBBLES);
35+
let comma_bc = u8x16::broadcast(COMMA);
36+
let newline_bc = u8x16::broadcast(NEWLINE);
37+
let quote_bc = u8x16::broadcast(QUOTES);
3138

32-
let mut out = Vec::with_capacity(data.len() / 16);
39+
let num_words = data.len() / 64;
40+
let mut comma_bitsets = Vec::with_capacity(num_words);
41+
let mut newline_bitsets = Vec::with_capacity(num_words);
3342

34-
for chunk in chunks {
35-
let v = u8x16::from_slice_unchecked(chunk);
36-
let (high, low) = v.nibbles();
43+
let mut carry = false;
44+
for chunk in data.chunks_exact(64) {
45+
let (c0, c1, c2, c3) = classify_four_lanes(chunk, high_nib, low_nib);
3746

38-
let res = high_nibbles.classify(high) & low_nibbles.classify(low);
39-
out.push(res);
40-
}
47+
let mut comma = build_eq_bitset(c0, c1, c2, c3, comma_bc);
48+
let mut newline = build_eq_bitset(c0, c1, c2, c3, newline_bc);
49+
let quote = build_eq_bitset(c0, c1, c2, c3, quote_bc);
4150

42-
out
43-
}
51+
let inside = unsafe { std::arch::aarch64::vmull_p64(quote, 0xFFFFFFFFFFFFFFFF_u64) } as u64;
52+
let outside = if carry { inside } else { !inside };
53+
carry ^= (quote.count_ones() & 1) != 0;
54+
55+
comma &= outside;
56+
newline &= outside;
4457

45-
#[cfg(test)]
46-
mod tests {
47-
use super::*;
58+
comma_bitsets.push(comma);
59+
newline_bitsets.push(newline);
60+
}
4861

49-
#[test]
50-
fn test_basic_classify() {
51-
let mut data = b"a,b,c\nf,\"g\"".to_vec();
52-
data.resize(data.len().next_multiple_of(16), 0);
62+
ClassifyResult {
63+
comma_bitsets,
64+
newline_bitsets,
65+
}
66+
}
5367

54-
let bitsets = classify(&data);
68+
#[inline(always)]
69+
fn classify_four_lanes(
70+
chunk: &[u8],
71+
high_nib: u8x16,
72+
low_nib: u8x16,
73+
) -> (u8x16, u8x16, u8x16, u8x16) {
74+
let classify_one = |slice: &[u8]| {
75+
let v = u8x16::from_slice_unchecked(slice);
76+
let (h, l) = v.nibbles();
77+
high_nib.classify(h) & low_nib.classify(l)
78+
};
79+
80+
(
81+
classify_one(&chunk[0..16]),
82+
classify_one(&chunk[16..32]),
83+
classify_one(&chunk[32..48]),
84+
classify_one(&chunk[48..64]),
85+
)
86+
}
5587

56-
assert_eq!(bitsets.len(), 1);
57-
let res: [u8; 16] = bitsets[0].into();
88+
#[inline(always)]
89+
fn build_eq_bitset(v0: u8x16, v1: u8x16, v2: u8x16, v3: u8x16, bc: u8x16) -> u64 {
90+
let a = v0.eq(bc).bitset() as u64;
91+
let b = v1.eq(bc).bitset() as u64;
92+
let c = v2.eq(bc).bitset() as u64;
93+
let d = v3.eq(bc).bitset() as u64;
5894

59-
assert_eq!(
60-
res,
61-
[
62-
0, COMMA, 0, COMMA, 0, NEWLINE, 0, COMMA, QUOTES, 0, QUOTES, 0, 0, 0, 0, 0,
63-
]
64-
);
65-
}
95+
a | (b << 16) | (c << 32) | (d << 48)
6696
}

0 commit comments

Comments
 (0)