Skip to content

Commit 39f4728

Browse files
add benchmarks, rename project
1 parent fab6a1d commit 39f4728

8 files changed

Lines changed: 257 additions & 78 deletions

File tree

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
11
/target
22
hits.csv
3+
hits_100mb.csv

Cargo.lock

Lines changed: 48 additions & 11 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
[package]
2-
name = "simdcsv"
2+
name = "arrow-csv2"
33
authors = ["Matthew Kim"]
44
version = "0.1.0"
55
edition = "2024"
@@ -14,6 +14,11 @@ arrow-cast = "55"
1414
[dev-dependencies]
1515
insta = "1.46.3"
1616
criterion = "0.8.2"
17+
arrow-csv = "55"
18+
19+
[[bin]]
20+
name = "slice_clickbench"
21+
path = "bin/slice_clickbench.rs"
1722

1823
[[bench]]
1924
name = "parse"

benches/parse.rs

Lines changed: 67 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,78 @@
1+
use std::sync::Arc;
2+
3+
use arrow_csv2::{ReaderBuilder, read};
4+
use arrow_schema::{DataType, Field, Schema};
15
use criterion::{Criterion, criterion_group, criterion_main};
2-
use simdcsv::read;
36

4-
fn parse_majestic_million(c: &mut Criterion) {
5-
let raw = std::fs::read("majestic_million.csv").expect("csv not found");
7+
/// Number of fields generated by `clickbench_schema`.
// NOTE(review): presumably the column count of the ClickBench `hits` table — confirm against the data.
const NUM_COLUMNS: usize = 105;
8+
/// Builds the schema handed to both decoder benchmarks.
///
/// Returns an `Arc<Schema>` with `NUM_COLUMNS` fields named `c0`, `c1`, ... in index
/// order. Every field is `DataType::Utf8` and is created with the nullable flag set to `true`.
fn clickbench_schema() -> Arc<Schema> {
9+
Arc::new(Schema::new(
10+
(0..NUM_COLUMNS)
11+
.map(|i| Field::new(format!("c{i}"), DataType::Utf8, true))
12+
.collect::<Vec<_>>(),
13+
))
14+
}
15+
16+
/// Registers three Criterion benchmarks that all parse the same in-memory copy of
/// `hits_100mb.csv`: this crate's `read`, this crate's decoder built through
/// `ReaderBuilder`, and the `arrow_csv` decoder.
///
/// # Panics
///
/// Panics if `hits_100mb.csv` cannot be read, and inside the benchmarks if any
/// `decode` or `flush` call returns an error (both are unwrapped).
fn bench_clickbench(c: &mut Criterion) {
17+
// The file is loaded once, outside every benchmark closure.
let raw = std::fs::read("hits_100mb.csv")
18+
.expect("hits_100mb.csv not found — run: cargo run --release --bin slice_clickbench");
19+
let schema = clickbench_schema();
620

7-
// The line below carries the `7-` gutter marker: it is the pre-commit call that the
// `21+` line after it replaces.
c.bench_function("parse majestic_million.csv", |b| {
21+
c.bench_function("arrow-csv2::read (clickbench 100MB)", |b| {
822
b.iter(|| {
923
// `read` is given `&mut`, so each iteration works on a fresh copy of the input.
// The copy is made inside the `iter` closure, so it is part of what gets measured.
let mut data = raw.clone();
1024
read(&mut data)
1125
});
1226
});
27+
28+
c.bench_function("arrow-csv2::Decoder (clickbench 100MB)", |b| {
29+
b.iter(|| {
30+
// A new decoder is built on every iteration; only the `Arc` schema handle is cloned.
let mut decoder = ReaderBuilder::new(schema.clone())
31+
.with_batch_size(8192)
32+
.build_decoder();
33+
34+
let mut offset = 0;
35+
let mut batches = Vec::new();
36+
loop {
37+
// Offer the unread tail of the input and advance by the byte count `decode` returns.
let consumed = decoder.decode(&raw[offset..]).unwrap();
38+
offset += consumed;
39+
// Flush when nothing was consumed or when the decoder reports zero capacity.
// NOTE(review): presumably `capacity() == 0` means the current batch is full — confirm
// against the decoder API.
if consumed == 0 || decoder.capacity() == 0 {
40+
if let Some(batch) = decoder.flush().unwrap() {
41+
batches.push(batch);
42+
}
43+
// Exit only when the decoder consumed nothing although capacity remains.
// NOTE(review): if `decode` keeps returning 0 while `capacity()` stays 0 after the
// flush, this loop never exits — confirm that `flush` restores capacity.
if consumed == 0 && decoder.capacity() > 0 {
44+
break;
45+
}
46+
}
47+
}
48+
// Returning the batches hands them back to `iter` as the closure's result.
batches
49+
});
50+
});
51+
52+
c.bench_function("arrow-csv::Decoder (clickbench 100MB)", |b| {
53+
b.iter(|| {
54+
// Same schema and batch size as the benchmark above, using the `arrow_csv` builder.
let mut decoder = arrow_csv::ReaderBuilder::new(schema.clone())
55+
.with_batch_size(8192)
56+
.build_decoder();
57+
58+
let mut offset = 0;
59+
let mut batches = Vec::new();
60+
loop {
61+
let consumed = decoder.decode(&raw[offset..]).unwrap();
62+
offset += consumed;
63+
if consumed == 0 || decoder.capacity() == 0 {
64+
if let Some(batch) = decoder.flush().unwrap() {
65+
batches.push(batch);
66+
}
67+
// Unlike the arrow-csv2 loop earlier in this function, the exit test here does not
// look at `capacity()`: any zero-byte `decode` ends the loop after the flush.
if consumed == 0 {
68+
break;
69+
}
70+
}
71+
}
72+
batches
73+
});
74+
});
1375
}
1476

15-
criterion_group!(benches, parse_majestic_million);
77+
// Declares the `benches` group with `bench_clickbench` as its only target, then hands
// that group to `criterion_main!`.
criterion_group!(benches, bench_clickbench);
1678
criterion_main!(benches);

bin/slice_clickbench.rs

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
use std::io::Read;
2+
3+
/// Number of bytes read from the start of `hits.csv` (100 MiB) before trimming to a newline.
const BENCH_BYTES: usize = 100 * 1024 * 1024;
4+
5+
/// Writes `hits_100mb.csv`: the first `BENCH_BYTES` bytes of `hits.csv`, cut back so the
/// output ends on the last newline found in that prefix.
///
/// # Panics
///
/// Panics if `hits.csv` cannot be opened, if it holds fewer than `BENCH_BYTES` bytes,
/// or if the output file cannot be written.
fn main() {
6+
let mut f =
7+
std::fs::File::open("hits.csv").expect("hits.csv not found. run ./download_clickbench.sh");
8+
9+
let mut buf = vec![0u8; BENCH_BYTES];
10+
// `read_exact` errors unless the file can fill the whole buffer, hence "too small".
f.read_exact(&mut buf).expect("hits.csv too small");
11+
12+
// Index of the last b'\n' in the prefix. If there is none, `end` is `buf.len()` and the
// `truncate(end + 1)` below asks for more than the current length, which leaves `buf` unchanged.
// NOTE(review): a newline inside a quoted CSV field would also be accepted as the cut
// point — verify the data has no such field near the boundary.
let end = buf.iter().rposition(|&b| b == b'\n').unwrap_or(buf.len());
13+
// `end + 1` keeps the newline itself, so the output ends with a complete line.
buf.truncate(end + 1);
14+
15+
std::fs::write("hits_100mb.csv", &buf).expect("failed to write");
16+
}

0 commit comments

Comments
 (0)