Skip to content

Commit 5b88318

Browse files
committed
bench(writer): codec comparison benchmark on a 10k-row mixed batch
Adds a Criterion benchmark comparing on-disk size and write time for None / Snappy / ZLIB (levels 1, 6, 9) / ZSTD (levels 1, 3, 9, 19) on a 10 000-row Int64 + Utf8 batch — the workload representative of production Hive / Trino tables. Each codec's resulting file size is printed alongside the benchmark so reviewers can sanity-check the size / speed trade-off without re-running the bench themselves. Sample numbers from a single run on Apple-silicon (debug rustc 1.95): codec output_bytes ratio time none 246698 1.30x 110 us snappy 59346 5.39x 293 us zlib_1 34318 9.32x 375 us zlib_6 32461 9.86x 3.4 ms zlib_9 32461 9.86x 11.8 ms zstd_1 12538 25.5x 264 us zstd_3 8834 36.2x 314 us zstd_9 12981 24.6x 1.2 ms zstd_19 4823 66.4x 81 ms (zstd_3 and zstd_1 dominate the speed / ratio Pareto frontier on this workload; this matches Java ORC's choice of zstd level 3 as the default `orc.compress.zstd.level`.) Signed-off-by: Youichi Uda <youichi.uda@gmail.com>
1 parent 3c14155 commit 5b88318

2 files changed

Lines changed: 112 additions & 0 deletions

File tree

Cargo.toml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,11 @@ required-features = ["async"]
9393
# Some issue when publishing and path isn't specified, so adding here
9494
path = "./benches/arrow_reader.rs"
9595

96+
[[bench]]
97+
name = "writer_compression"
98+
harness = false
99+
path = "./benches/writer_compression.rs"
100+
96101
[profile.bench]
97102
debug = true
98103

benches/writer_compression.rs

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
//! Benchmarks comparing writer-side compression codecs on a 10k-row
19+
//! mixed Int64 + Utf8 batch — the exact workload most production Hive /
20+
//! Trino tables emit. Each benchmark measures end-to-end write time
21+
//! from `RecordBatch` to closed ORC file. The reported throughput is
22+
//! single-stripe (the batch fits comfortably under the 64 MiB stripe
23+
//! size), so the variance between codecs is dominated by encoder cost
24+
//! and the resulting on-disk size — both of which we surface in the
25+
//! benchmark printouts so reviewers can sanity-check the trade-off.
26+
27+
use std::sync::Arc;
28+
29+
use arrow::array::{Int64Array, RecordBatch, StringArray};
30+
use arrow::datatypes::{DataType, Field, Schema};
31+
use criterion::{criterion_group, criterion_main, Criterion, Throughput};
32+
33+
use orc_rust::arrow_writer::{ArrowWriterBuilder, Compression};
34+
35+
fn build_batch() -> RecordBatch {
36+
let n = 10_000;
37+
let ints: Vec<i64> = (0..n as i64).collect();
38+
// Repeating-but-not-trivial strings — gives every codec something
39+
// to chew on without making the input pathologically compressible.
40+
let strs: Vec<String> = (0..n)
41+
.map(|i| format!("event-{:08x}-payload-{}", i, i % 17))
42+
.collect();
43+
let schema = Arc::new(Schema::new(vec![
44+
Field::new("ts", DataType::Int64, true),
45+
Field::new("payload", DataType::Utf8, true),
46+
]));
47+
RecordBatch::try_new(
48+
schema,
49+
vec![
50+
Arc::new(Int64Array::from(ints)),
51+
Arc::new(StringArray::from(strs)),
52+
],
53+
)
54+
.unwrap()
55+
}
56+
57+
fn write_orc(batch: &RecordBatch, compression: Compression) -> Vec<u8> {
58+
let mut buf: Vec<u8> = Vec::with_capacity(1024 * 1024);
59+
let mut writer = ArrowWriterBuilder::new(&mut buf, batch.schema())
60+
.with_compression(compression)
61+
.try_build()
62+
.unwrap();
63+
writer.write(batch).unwrap();
64+
writer.close().unwrap();
65+
buf
66+
}
67+
68+
fn writer_compression(c: &mut Criterion) {
69+
let batch = build_batch();
70+
// Headline: how many bytes of *input* (rows × column count × ~16
71+
// bytes for the payload column) we are compressing.
72+
let approx_input_bytes = batch.num_rows() as u64 * 32;
73+
74+
let codecs: Vec<(&str, Compression)> = vec![
75+
("none", Compression::None),
76+
("snappy", Compression::Snappy),
77+
("zlib_1", Compression::Zlib { level: 1 }),
78+
("zlib_6", Compression::Zlib { level: 6 }),
79+
("zlib_9", Compression::Zlib { level: 9 }),
80+
("zstd_1", Compression::Zstd { level: 1 }),
81+
("zstd_3", Compression::Zstd { level: 3 }),
82+
("zstd_9", Compression::Zstd { level: 9 }),
83+
("zstd_19", Compression::Zstd { level: 19 }),
84+
];
85+
86+
let mut group = c.benchmark_group("write_10k_rows");
87+
group.throughput(Throughput::Bytes(approx_input_bytes));
88+
for (label, codec) in &codecs {
89+
// Surface output file size as a stderr line — Criterion doesn't
90+
// model this natively, but reviewers care a lot.
91+
let bytes = write_orc(&batch, *codec);
92+
eprintln!(
93+
"[writer_compression] codec={label:>7} output_bytes={:>8} ratio={:.2}x",
94+
bytes.len(),
95+
approx_input_bytes as f64 / bytes.len() as f64,
96+
);
97+
group.bench_function(*label, |b| {
98+
b.iter(|| {
99+
let _ = write_orc(&batch, *codec);
100+
})
101+
});
102+
}
103+
group.finish();
104+
}
105+
106+
criterion_group!(benches, writer_compression);
107+
criterion_main!(benches);

0 commit comments

Comments
 (0)