Skip to content

Commit 61099b1

Browse files
fix profile script
1 parent a323e08 commit 61099b1

4 files changed

Lines changed: 81 additions & 1 deletion

File tree

Cargo.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,10 @@ name = "check_github_issues"
3636
path = "bin/check_github_issues.rs"
3737

3838

39+
[[bin]]
40+
name = "parse_hits"
41+
path = "bin/parse_hits.rs"
42+
3943
[[bench]]
4044
name = "parallel"
4145
harness = false

bin/parse_hits.rs

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
use std::sync::Arc;
2+
3+
use arrow_csv2::clickbench;
4+
use arrow_csv2::{ArrowCsv2Decoder, ParallelCsvSource};
5+
use datafusion::datasource::physical_plan::{FileGroup, FileScanConfigBuilder};
6+
use datafusion::execution::object_store::ObjectStoreUrl;
7+
use datafusion::physical_plan::collect;
8+
use datafusion::prelude::SessionContext;
9+
use datafusion_datasource::PartitionedFile;
10+
use datafusion_datasource::source::DataSourceExec;
11+
use object_store::path::Path as ObjectPath;
12+
13+
/// Path of the CSV input. It is passed unchanged to `std::fs` calls, so a
/// relative value like this one is resolved against the current working directory.
const FILE: &str = "hits_100mb.csv";
14+
/// Batch size handed to both the sequential reader builder and the parallel
/// source (presumably rows per record batch — confirm against `arrow_csv2`).
const BATCH_SIZE: usize = 8192;
15+
16+
fn main() {
17+
let schema = clickbench::schema();
18+
let file_len = std::fs::metadata(FILE)
19+
.expect("hits_100mb.csv not found")
20+
.len() as usize;
21+
22+
let num_partitions = std::env::args()
23+
.nth(1)
24+
.and_then(|s| s.parse().ok())
25+
.unwrap_or(0);
26+
27+
if num_partitions == 0 {
28+
let file = std::fs::File::open(FILE).unwrap();
29+
let reader = arrow_csv2::ReaderBuilder::new(schema)
30+
.with_batch_size(BATCH_SIZE)
31+
.build(file);
32+
33+
let _ = reader.collect::<Vec<_>>();
34+
} else {
35+
let rt = tokio::runtime::Runtime::new().unwrap();
36+
rt.block_on(async {
37+
let plan = build_parallel_csv(&schema, file_len, num_partitions);
38+
let ctx = SessionContext::new();
39+
let _ = collect(plan, ctx.task_ctx()).await.unwrap();
40+
})
41+
}
42+
}
43+
44+
fn build_parallel_csv(
45+
schema: &arrow_schema::SchemaRef,
46+
file_len: usize,
47+
num_partitions: usize,
48+
) -> Arc<dyn datafusion::physical_plan::ExecutionPlan> {
49+
let chunk_size = file_len / num_partitions;
50+
let boundaries: Arc<[usize]> = (0..num_partitions)
51+
.map(|i| i * chunk_size)
52+
.chain(std::iter::once(file_len))
53+
.collect::<Vec<_>>()
54+
.into();
55+
56+
let abs_path = std::fs::canonicalize(FILE).unwrap();
57+
let object_path = ObjectPath::from_absolute_path(&abs_path).unwrap();
58+
59+
let source = Arc::new(ParallelCsvSource::new(
60+
schema.clone(),
61+
object_path,
62+
boundaries.clone(),
63+
BATCH_SIZE,
64+
ArrowCsv2Decoder,
65+
));
66+
67+
let url = ObjectStoreUrl::parse("file://").unwrap();
68+
let mut builder = FileScanConfigBuilder::new(url, source);
69+
for i in 0..num_partitions {
70+
let file = PartitionedFile::new(FILE.to_string(), file_len as u64)
71+
.with_range(boundaries[i] as i64, boundaries[i + 1] as i64);
72+
builder = builder.with_file_group(FileGroup::new(vec![file]));
73+
}
74+
75+
DataSourceExec::from_data_source(builder.build())
76+
}

profile.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,4 @@
22

33
set -euo pipefail
44

5-
cargo b --release --bin parse_hits_100 && samply record ./target/release/parse_hits_100
5+
cargo b --release --bin parse_hits && samply record ./target/release/parse_hits "$@"

0 commit comments

Comments
 (0)