@@ -86,6 +86,10 @@ struct Cli {
8686 /// Directory to write checkpoint parquet files into
8787 #[ arg( long, default_value = "./checkpoints" ) ]
8888 checkpoint_dir : PathBuf ,
89+
90+ /// Local directory to check for pre-extracted data before downloading from S3
91+ #[ arg( long, default_value = "./spicebench-data" ) ]
92+ data_dir : PathBuf ,
8993}
9094
9195#[ cfg( feature = "duckdb" ) ]
@@ -105,6 +109,32 @@ impl Cli {
105109 }
106110}
107111
112+ /// Log the row count for each table in the DuckDB sink.
113+ #[ cfg( feature = "duckdb" ) ]
114+ async fn log_table_row_counts (
115+ sink : & DuckDBSink ,
116+ table_names : & [ String ] ,
117+ checkpoint_idx : usize ,
118+ ) -> anyhow:: Result < ( ) > {
119+ for table in table_names {
120+ let sql = format ! ( "SELECT COUNT(*) AS cnt FROM {table}" ) ;
121+ let batches = sink. query ( & sql) . await ?;
122+ let count: i64 = batches
123+ . first ( )
124+ . and_then ( |b| {
125+ b. column ( 0 )
126+ . as_any ( )
127+ . downcast_ref :: < arrow:: array:: Int64Array > ( )
128+ . map ( |a| a. value ( 0 ) )
129+ } )
130+ . unwrap_or ( 0 ) ;
131+ if table == "lineitem" {
132+ eprintln ! ( "[checkpoint] Checkpoint {checkpoint_idx} | {table}: {count} rows" ) ;
133+ }
134+ }
135+ Ok ( ( ) )
136+ }
137+
108138/// Run all checkpoint queries against the DuckDB sink and write each result
109139/// set to a parquet file at `<checkpoint_dir>/<checkpoint_idx>/<query_idx>.parquet`.
110140#[ cfg( feature = "duckdb" ) ]
@@ -198,20 +228,34 @@ async fn main() -> anyhow::Result<()> {
198228
199229 let source_config = cli. source_config ( ) ;
200230 let version_prefix = source_config. prefix . clone ( ) ;
201- let archive_storage: Arc < dyn DataStorage > = Arc :: new ( S3Storage :: new ( & source_config) ?) ;
202231
203- // Download and extract the archive to a temporary local directory.
204- let extract_dir = tempfile:: tempdir ( ) ?;
205- ETLPipeline :: download ( archive_storage, extract_dir. path ( ) ) . await ?;
206- let source: Arc < dyn DataStorage > = Arc :: new ( FileStorage :: new ( extract_dir. path ( ) ) ) ;
232+ // Check local directory first; download from S3 only if version.json is missing.
233+ let ( _extract_dir_handle, extract_path) ;
234+ let version_json_path = cli. data_dir . join ( "version.json" ) ;
235+ if version_json_path. exists ( ) {
236+ tracing:: info!(
237+ data_dir = %cli. data_dir. display( ) ,
238+ "Using pre-existing local data, skipping S3 download"
239+ ) ;
240+ _extract_dir_handle = None ;
241+ extract_path = cli. data_dir . clone ( ) ;
242+ } else {
243+ tracing:: info!( "Local data not found at {}, downloading from S3" , cli. data_dir. display( ) ) ;
244+ let archive_storage: Arc < dyn DataStorage > = Arc :: new ( S3Storage :: new ( & source_config) ?) ;
245+ let tmp = tempfile:: tempdir ( ) ?;
246+ ETLPipeline :: download ( archive_storage, tmp. path ( ) ) . await ?;
247+ extract_path = tmp. path ( ) . to_path_buf ( ) ;
248+ _extract_dir_handle = Some ( tmp) ;
249+ }
250+ let source: Arc < dyn DataStorage > = Arc :: new ( FileStorage :: new ( & extract_path) ) ;
207251
208252 // Read version metadata to derive dataset config and mutations.
209253 let version_metadata = source. read_version_metadata ( ) . await ?. ok_or_else ( || {
210- anyhow:: anyhow!(
211- "No version.json found in extracted data at {}. Was data generation run for this version?" ,
212- extract_dir . path ( ) . display( )
213- )
214- } ) ?;
254+ anyhow:: anyhow!(
255+ "No version.json found in extracted data at {}. Was data generation run for this version?" ,
256+ extract_path . display( )
257+ )
258+ } ) ?;
215259
216260 let dataset_source = DatasetSource :: from_dataset_type ( & version_metadata. dataset_type ) ?;
217261 let dataset_config = version_metadata. dataset_config ( ) ;
@@ -234,7 +278,7 @@ async fn main() -> anyhow::Result<()> {
234278 bucket = %cli. bucket,
235279 prefix = %cli. prefix,
236280 version_prefix = %version_prefix,
237- extract_dir = %extract_dir . path ( ) . display( ) ,
281+ extract_dir = %extract_path . display( ) ,
238282 duckdb_path = %cli. duckdb_path. display( ) ,
239283 scale_factor = version_metadata. scale_factor,
240284 num_steps = version_metadata. num_steps,
@@ -243,6 +287,9 @@ async fn main() -> anyhow::Result<()> {
243287 "Starting Checkpointer"
244288 ) ;
245289
290+ let mut table_names: Vec < String > = version_metadata. tables . keys ( ) . cloned ( ) . collect ( ) ;
291+ table_names. sort ( ) ;
292+
246293 pipeline. initialize ( ) . await ?;
247294 pipeline. run ( cli. checkpoint_interval_steps as usize ) . await ?;
248295
@@ -258,6 +305,7 @@ async fn main() -> anyhow::Result<()> {
258305 checkpoint = checkpoint_idx,
259306 "Pipeline paused, running checkpoint queries"
260307 ) ;
308+ log_table_row_counts ( & target, & table_names, checkpoint_idx) . await ?;
261309 run_checkpoint_queries (
262310 & target,
263311 & checkpoint_queries,
@@ -276,6 +324,7 @@ async fn main() -> anyhow::Result<()> {
276324 checkpoint = checkpoint_idx,
277325 "Pipeline completed, running final checkpoint queries"
278326 ) ;
327+ log_table_row_counts ( & target, & table_names, checkpoint_idx) . await ?;
279328 run_checkpoint_queries (
280329 & target,
281330 & checkpoint_queries,
0 commit comments