Skip to content

Commit 13e9fff

Browse files
pull in simdutf8 and atoi for faster parsing
1 parent 3875f01 commit 13e9fff

3 files changed

Lines changed: 48 additions & 15 deletions

File tree

Cargo.lock

Lines changed: 8 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@ license = "Apache 2.0"
1010
arrow-schema = "55"
1111
arrow-array = "55"
1212
arrow-cast = "55"
13+
simdutf8 = "0.1.5"
14+
atoi = "2.0.0"
1315

1416
[dev-dependencies]
1517
insta = "1.46.3"

src/decoder.rs

Lines changed: 38 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -300,7 +300,7 @@ impl Decoder {
300300
Ok(Arc::new(b.finish()))
301301
}
302302
DataType::Utf8 => {
303-
std::str::from_utf8(data).map_err(|e| {
303+
simdutf8::basic::from_utf8(data).map_err(|e| {
304304
ArrowError::ParseError(format!("invalid utf-8 in col {col}: {e}"))
305305
})?;
306306

@@ -318,34 +318,34 @@ impl Decoder {
318318
Ok(Arc::new(b.finish()))
319319
}
320320
DataType::Int8 => {
321-
build_primitive_col!(data, offsets, col, num_rows, nullable, Int8Builder, i8)
321+
build_int_col!(data, offsets, col, num_rows, nullable, Int8Builder, i8)
322322
}
323323
DataType::Int16 => {
324-
build_primitive_col!(data, offsets, col, num_rows, nullable, Int16Builder, i16)
324+
build_int_col!(data, offsets, col, num_rows, nullable, Int16Builder, i16)
325325
}
326326
DataType::Int32 => {
327-
build_primitive_col!(data, offsets, col, num_rows, nullable, Int32Builder, i32)
327+
build_int_col!(data, offsets, col, num_rows, nullable, Int32Builder, i32)
328328
}
329329
DataType::Int64 => {
330-
build_primitive_col!(data, offsets, col, num_rows, nullable, Int64Builder, i64)
330+
build_int_col!(data, offsets, col, num_rows, nullable, Int64Builder, i64)
331331
}
332332
DataType::UInt8 => {
333-
build_primitive_col!(data, offsets, col, num_rows, nullable, UInt8Builder, u8)
333+
build_int_col!(data, offsets, col, num_rows, nullable, UInt8Builder, u8)
334334
}
335335
DataType::UInt16 => {
336-
build_primitive_col!(data, offsets, col, num_rows, nullable, UInt16Builder, u16)
336+
build_int_col!(data, offsets, col, num_rows, nullable, UInt16Builder, u16)
337337
}
338338
DataType::UInt32 => {
339-
build_primitive_col!(data, offsets, col, num_rows, nullable, UInt32Builder, u32)
339+
build_int_col!(data, offsets, col, num_rows, nullable, UInt32Builder, u32)
340340
}
341341
DataType::UInt64 => {
342-
build_primitive_col!(data, offsets, col, num_rows, nullable, UInt64Builder, u64)
342+
build_int_col!(data, offsets, col, num_rows, nullable, UInt64Builder, u64)
343343
}
344344
DataType::Float32 => {
345-
build_primitive_col!(data, offsets, col, num_rows, nullable, Float32Builder, f32)
345+
build_float_col!(data, offsets, col, num_rows, nullable, Float32Builder, f32)
346346
}
347347
DataType::Float64 => {
348-
build_primitive_col!(data, offsets, col, num_rows, nullable, Float64Builder, f64)
348+
build_float_col!(data, offsets, col, num_rows, nullable, Float64Builder, f64)
349349
}
350350
other => Err(ArrowError::NotYetImplemented(format!(
351351
"data type {other} not yet supported"
@@ -375,21 +375,44 @@ impl Decoder {
375375
}
376376
}
377377

378-
macro_rules! build_primitive_col {
378+
/// Builds an integer Arrow column by parsing each row's raw bytes as `$native`.
///
/// Arguments, in order:
/// * `$data`     - byte buffer holding every field of the column back to back.
/// * `$offsets`  - indexable offsets; row `r` occupies `$data[$offsets[r]..$offsets[r + 1]]`.
/// * `$col`      - column identifier, used only in error messages.
/// * `$num_rows` - number of rows to decode (also the builder's initial capacity).
/// * `$nullable` - when true, an empty field is appended as null.
/// * `$builder`  - builder type providing `with_capacity`, `append_null`,
///   `append_value` and `finish`.
/// * `$native`   - integer type to parse into.
///
/// Evaluates to `Ok(ArrayRef)`. Because it uses `?`, it must be expanded inside a
/// function whose error type accepts `ArrowError`.
///
/// # Errors
///
/// Yields `ArrowError::ParseError` when a field is not entirely a valid
/// `$native`: empty in a non-nullable column, no digits, overflow, or any
/// trailing bytes after the number.
macro_rules! build_int_col {
    ($data:expr, $offsets:expr, $col:expr, $num_rows:expr, $nullable:expr, $builder:ty, $native:ty) => {{
        let mut b = <$builder>::with_capacity($num_rows);
        for row in 0..$num_rows {
            let raw = &$data[$offsets[row]..$offsets[row + 1]];
            if raw.is_empty() && $nullable {
                b.append_null();
            } else {
                // `atoi::atoi` stops at the first non-digit and still reports
                // success, so "12abc" would silently decode as 12. Parse through
                // the underlying trait to learn how many bytes were consumed and
                // insist that the whole field was used.
                let (value, used) =
                    <$native as atoi::FromRadix10SignedChecked>::from_radix_10_signed_checked(raw);
                // Requiring the final byte to be a digit also rejects empty
                // fields and a sign with no digits after it.
                let ends_on_digit = matches!(raw.last(), Some(byte) if byte.is_ascii_digit());
                let v: $native = value
                    .filter(|_| used == raw.len() && ends_on_digit)
                    .ok_or_else(|| {
                        ArrowError::ParseError(format!(
                            "cannot parse '{}' as {} at row {}, col {}",
                            String::from_utf8_lossy(raw),
                            stringify!($native),
                            row,
                            $col
                        ))
                    })?;
                b.append_value(v);
            }
        }
        Ok(Arc::new(b.finish()) as ArrayRef)
    }};
}
400+
use build_int_col;
401+
402+
macro_rules! build_float_col {
403+
($data:expr, $offsets:expr, $col:expr, $num_rows:expr, $nullable:expr, $builder:ty, $native:ty) => {{
404+
let mut b = <$builder>::with_capacity($num_rows);
405+
for row in 0..$num_rows {
406+
let raw = &$data[$offsets[row]..$offsets[row + 1]];
407+
if raw.is_empty() && $nullable {
408+
b.append_null();
409+
} else {
410+
let s = simdutf8::basic::from_utf8(raw).map_err(|e| {
387411
ArrowError::ParseError(format!(
388412
"invalid utf-8 at row {}, col {}: {}",
389413
row, $col, e
390414
))
391415
})?;
392-
393416
let v: $native = s.parse().map_err(|_| {
394417
ArrowError::ParseError(format!(
395418
"cannot parse '{}' as {} at row {}, col {}",
@@ -405,7 +428,7 @@ macro_rules! build_primitive_col {
405428
Ok(Arc::new(b.finish()) as ArrayRef)
406429
}};
407430
}
408-
use build_primitive_col;
431+
use build_float_col;
409432

410433
#[inline(always)]
411434
fn classify_one(chunk: &[u8], high_nibbles: u8x16, low_nibbles: u8x16) -> u8x16 {

0 commit comments

Comments
 (0)