add readme and correctness test

friendlymatthew · friendlymatthew · commit 88a9a620cb5f · 2026-03-25T22:41:08.000-04:00
diff --git a/README.md b/README.md
@@ -1,10 +1,26 @@
-# simdcsv
+# arrow-csv2
 
-simdcsv is a CSV parser that evaluates 64 bytes at a time. There are many kinds of CSV files; this project adheres to the format described
-in [RFC 4180](https://www.rfc-editor.org/rfc/rfc4180.html).
+Vectorized CSV parsing for Apache Arrow.
 
-## Reading
+This project aims to be a faster drop-in replacement for `arrow-csv`, the CSV-to-Arrow decoder in the `arrow-rs` ecosystem. The parser employs techniques highlighted in the `simdjson` paper, such as vectorized classification and prefix XOR.
 
+A secondary goal is to demonstrate a performant, parallel object-store reader that uses speculative quote-state reconciliation to enable byte-range splitting for files with quoted newlines, something DataFusion currently disables.
+
+# Status
+
+Currently, `arrow-csv2` decodes **2.4x faster** than `arrow-csv` (69ms vs 168ms). This is measured on a 100MB slice (~130K rows) of the [ClickBench](https://github.com/ClickHouse/ClickBench) `hits.csv` dataset.
+
+```sh
+# run benchmarks
+./download_clickbench.sh
+cargo r --bin slice_clickbench
+cargo bench
+```
+
+The goal is not full feature parity with `arrow-csv`, but a proof of concept that explores how far we can push CSV-to-Arrow performance, from single-threaded decoding to parallel ingestion from an object store.
+
+# Reading
+
+https://branchfree.org/2019/03/06/code-fragment-finding-quote-pairs-with-carry-less-multiply-pclmulqdq/<br>
 https://www.rfc-editor.org/rfc/rfc4180.html<br>
 https://arxiv.org/pdf/1902.08318<br>
-https://branchfree.org/2019/03/06/code-fragment-finding-quote-pairs-with-carry-less-multiply-pclmulqdq/<br>
diff --git a/tests/correctness.rs b/tests/correctness.rs
@@ -0,0 +1,110 @@
+use std::sync::Arc;
+
+use arrow_csv2::ReaderBuilder;
+use arrow_schema::{DataType, Field, Schema};
+
+/// Schema handed to both decoders in this test: four columns, all typed `Utf8`.
+fn taxi_zone_schema() -> Arc<Schema> {
+    // Every column is built the same way; only the name varies.
+    let utf8 = |name: &str| Field::new(name, DataType::Utf8, true);
+    let columns = ["LocationID", "Borough", "Zone", "service_zone"];
+    let fields: Vec<Field> = columns.iter().copied().map(utf8).collect();
+    Arc::new(Schema::new(fields))
+}
+
+/// Feeds `raw` through the decoder returned by `build` and collects every flushed batch.
+fn decode_all(
+    raw: &[u8],
+    schema: Arc<Schema>,
+    build: impl Fn(Arc<Schema>) -> arrow_csv2::Decoder,
+) -> Vec<arrow_array::RecordBatch> {
+    let mut decoder = build(schema);
+    let mut pos = 0;
+    let mut collected = Vec::new();
+    loop {
+        let read = decoder.decode(&raw[pos..]).unwrap();
+        pos += read;
+        let stalled = read == 0;
+        if !stalled && decoder.capacity() != 0 {
+            continue; // bytes were consumed and capacity is non-zero: decode more
+        }
+        collected.extend(decoder.flush().unwrap());
+        if stalled && decoder.capacity() > 0 {
+            return collected; // nothing was read and capacity is non-zero: stop
+        }
+    }
+}
+
+/// Decodes `raw` with an `arrow_csv` decoder (`with_header(true)`, batch size 8192).
+fn decode_all_arrow_csv(
+    raw: &[u8],
+    schema: Arc<Schema>,
+) -> Vec<arrow_array::RecordBatch> {
+    let builder = arrow_csv::ReaderBuilder::new(schema)
+        .with_header(true)
+        .with_batch_size(8192);
+    let mut decoder = builder.build_decoder();
+
+    let mut pos = 0;
+    let mut collected = Vec::new();
+    loop {
+        let read = decoder.decode(&raw[pos..]).unwrap();
+        pos += read;
+        if read != 0 && decoder.capacity() != 0 {
+            continue;
+        }
+        // Flush when no bytes were consumed or when the reported capacity is zero.
+        collected.extend(decoder.flush().unwrap());
+        if read == 0 {
+            return collected;
+        }
+    }
+}
+
+/// Decodes the same CSV with `arrow_csv2` and `arrow_csv` and requires identical string values.
+#[test]
+fn arrow_csv2_matches_arrow_csv() {
+    /// Views column `col` of `batch` as a `StringArray`, panicking on any other array type.
+    fn strings(batch: &arrow_array::RecordBatch, col: usize) -> &arrow_array::StringArray {
+        batch
+            .column(col)
+            .as_any()
+            .downcast_ref::<arrow_array::StringArray>()
+            .unwrap()
+    }
+
+    // The fixture path is relative to the working directory the test runs in.
+    let raw = std::fs::read("taxi_zone_lookup.csv").expect("missing csv");
+    let schema = taxi_zone_schema();
+
+    // Same builder settings as `decode_all_arrow_csv`: header enabled, batch size 8192.
+    let build_ours = |s: Arc<Schema>| {
+        ReaderBuilder::new(s)
+            .with_header(true)
+            .with_batch_size(8192)
+            .build_decoder()
+    };
+    let ours = decode_all(&raw, schema.clone(), build_ours);
+    let theirs = decode_all_arrow_csv(&raw, schema);
+
+    // Checked first so the `zip` below cannot silently drop trailing batches.
+    assert_eq!(ours.len(), theirs.len(), "batch count mismatch");
+
+    for (i, (a, b)) in ours.iter().zip(theirs.iter()).enumerate() {
+        let (rows, cols) = (a.num_rows(), a.num_columns());
+        assert_eq!(rows, b.num_rows(), "row count mismatch in batch {i}");
+        assert_eq!(cols, b.num_columns(), "column count mismatch in batch {i}");
+
+        for col in 0..cols {
+            let (col_a, col_b) = (strings(a, col), strings(b, col));
+            for row in 0..rows {
+                // Only `value(row)` is compared; null flags are not checked here.
+                assert_eq!(
+                    col_a.value(row),
+                    col_b.value(row),
+                    "mismatch at batch {i}, col {col}, row {row}"
+                );
+            }
+        }
+    }
+}
diff --git a/tests/taxi_zone.rs b/tests/taxi_zone.rs