@@ -15,7 +15,6 @@ limitations under the License.
1515*/
1616
1717use std:: collections:: HashMap ;
18- use std:: collections:: VecDeque ;
1918use std:: sync:: atomic:: { AtomicI64 , AtomicU16 , Ordering } ;
2019use std:: sync:: { Arc , Mutex } ;
2120
@@ -73,34 +72,6 @@ const SF1_ROW_COUNTS: &[(&str, u64)] = &[
7372 ( "lineitem" , 6_001_215 ) ,
7473] ;
7574
/// Lower bound applied by `tpch_max_rows_per_file` when clamping a
/// user-supplied rows-per-file value.
76- const MIN_TPCH_ROWS_PER_FILE : usize = 32_000 ;
/// Upper bound applied by `tpch_max_rows_per_file` when clamping a
/// user-supplied rows-per-file value.
77- const MAX_TPCH_ROWS_PER_FILE : usize = 64_000 ;
/// Rows-per-file value used when `SPICEBENCH_TPCH_MAX_ROWS_PER_FILE` is
/// unset, does not parse as a `usize`, or is zero.
78- const DEFAULT_TPCH_MAX_ROWS_PER_FILE : usize = 48_000 ;
79-
80- fn tpch_max_rows_per_file ( ) -> usize {
81- std:: env:: var ( "SPICEBENCH_TPCH_MAX_ROWS_PER_FILE" )
82- . ok ( )
83- . and_then ( |v| v. parse :: < usize > ( ) . ok ( ) )
84- . filter ( |v| * v > 0 )
85- . map ( |v| v. clamp ( MIN_TPCH_ROWS_PER_FILE , MAX_TPCH_ROWS_PER_FILE ) )
86- . unwrap_or ( DEFAULT_TPCH_MAX_ROWS_PER_FILE )
87- }
88-
89- fn split_record_batch ( batch : RecordBatch , max_rows : usize ) -> VecDeque < RecordBatch > {
90- if batch. num_rows ( ) <= max_rows {
91- return VecDeque :: from ( [ batch] ) ;
92- }
93-
94- let mut out = VecDeque :: new ( ) ;
95- let mut offset = 0usize ;
96- while offset < batch. num_rows ( ) {
97- let len = std:: cmp:: min ( max_rows, batch. num_rows ( ) - offset) ;
98- out. push_back ( batch. slice ( offset, len) ) ;
99- offset += len;
100- }
101- out
102- }
103-
10475/// Returns the expected total number of rows for a given table at the
10576/// specified scale factor.
10677fn total_rows_for_table ( table : & str , scale_factor : f64 ) -> u64 {
@@ -393,16 +364,12 @@ pub struct TpchDataset {
393364 mutations : MutationConfig ,
394365 /// Per-table step counter tracking which part to generate next (0-indexed).
395366 table_steps : HashMap < String , AtomicU16 > ,
396- /// Per-table queue of already-generated chunks waiting to be emitted.
397- pending_batches : HashMap < String , Mutex < VecDeque < RecordBatch > > > ,
398367 /// Per-table primary key tracking for update/delete targeting.
399368 key_sets : HashMap < String , Mutex < IndexedKeySet < PrimaryKeyValue > > > ,
400369 /// Global monotonically increasing operation counter for replay ordering.
401370 op_counter : AtomicI64 ,
402371 /// The storage backend for reading/writing table metadata.
403372 storage : Arc < dyn DataStorage > ,
404- /// Maximum number of rows per emitted batch/file.
405- max_rows_per_file : usize ,
406373}
407374
408375impl TpchDataset {
@@ -427,25 +394,14 @@ impl TpchDataset {
427394 . map ( |( name, _) | ( name. to_string ( ) , AtomicU16 :: new ( 0 ) ) )
428395 . collect ( ) ;
429396
430- let pending_batches: HashMap < String , Mutex < VecDeque < RecordBatch > > > = TPCH_TABLES
431- . iter ( )
432- . map ( |( name, _) | ( name. to_string ( ) , Mutex :: new ( VecDeque :: new ( ) ) ) )
433- . collect ( ) ;
434-
435- let max_rows_per_file = tpch_max_rows_per_file ( ) ;
436-
437- info ! ( max_rows_per_file, "Configured TPCH maximum rows per file" ) ;
438-
439397 Ok ( Self {
440398 scale_factor : config. scale_factor ,
441399 num_steps : config. num_steps ,
442400 mutations : mutations. clone ( ) ,
443401 table_steps,
444- pending_batches,
445402 key_sets,
446403 op_counter : AtomicI64 :: new ( 0 ) ,
447404 storage,
448- max_rows_per_file,
449405 } )
450406 }
451407}
@@ -507,15 +463,6 @@ impl Dataset for TpchDataset {
507463 }
508464
509465 async fn raw_next_batch ( & self , table : & str ) -> anyhow:: Result < Option < RecordBatch > > {
510- if let Some ( queued) = self . pending_batches . get ( table) {
511- let mut queued = queued
512- . lock ( )
513- . map_err ( |e| anyhow:: anyhow!( "lock poisoned: {e}" ) ) ?;
514- if let Some ( batch) = queued. pop_front ( ) {
515- return Ok ( Some ( batch) ) ;
516- }
517- }
518-
519466 // Each table independently tracks which step (part) it is on.
520467 let step_counter = self
521468 . table_steps
@@ -655,23 +602,7 @@ impl Dataset for TpchDataset {
655602 let op_indices: Vec < i64 > = ( op_base..op_base + total_rows as i64 ) . collect ( ) ;
656603 columns. push ( Arc :: new ( Int64Array :: from ( op_indices) ) ) ;
657604
658- let combined_batch = RecordBatch :: try_new ( schema, columns) ?;
659- let mut chunks = split_record_batch ( combined_batch, self . max_rows_per_file ) ;
660-
661- let first = chunks
662- . pop_front ( )
663- . ok_or_else ( || anyhow:: anyhow!( "internal error: no chunks produced" ) ) ?;
664-
665- if !chunks. is_empty ( )
666- && let Some ( queued) = self . pending_batches . get ( table)
667- {
668- let mut queued = queued
669- . lock ( )
670- . map_err ( |e| anyhow:: anyhow!( "lock poisoned: {e}" ) ) ?;
671- queued. extend ( chunks) ;
672- }
673-
674- Ok ( Some ( first) )
605+ Ok ( Some ( RecordBatch :: try_new ( schema, columns) ?) )
675606 }
676607
677608 fn tables ( & self ) -> HashMap < String , DatasetTable > {
@@ -711,6 +642,7 @@ mod tests {
711642 & self ,
712643 _table_name : & str ,
713644 _batch_id : u64 ,
645+ _part_id : Option < usize > ,
714646 ) -> anyhow:: Result < Option < ReadResult > > {
715647 Ok ( None )
716648 }
@@ -724,6 +656,7 @@ mod tests {
724656 Ok ( WriteResult {
725657 rows_written : 0 ,
726658 bytes_written : 0 ,
659+ part_ids : Vec :: new ( ) ,
727660 } )
728661 }
729662
@@ -785,7 +718,7 @@ mod tests {
785718 }
786719
787720 #[ tokio:: test]
788- async fn tpch_num_batches_is_a_lower_bound_for_emitted_batches_per_table ( ) {
721+ async fn tpch_emits_exactly_one_batch_per_step_for_non_static_tables ( ) {
789722 let dataset = build_dataset ( 1.0 , 7 ) ;
790723
791724 for ( table, _) in TPCH_TABLES {
@@ -800,33 +733,12 @@ mod tests {
800733 emitted_batches += 1 ;
801734 }
802735
803- assert ! (
804- emitted_batches >= dataset. num_batches( table) ,
805- "emitted batches should be >= planned batches for table '{table}'"
806- ) ;
807- }
808- }
809-
810- #[ tokio:: test]
811- async fn tpch_batches_are_capped_to_max_rows_per_file ( ) {
812- let dataset = build_dataset ( 1.0 , 7 ) ;
813-
814- let mut saw_split = false ;
815- while let Some ( batch) = dataset
816- . raw_next_batch ( "lineitem" )
817- . await
818- . expect ( "raw_next_batch should not fail" )
819- {
820- assert ! (
821- batch. num_rows( ) <= DEFAULT_TPCH_MAX_ROWS_PER_FILE ,
822- "lineitem chunk exceeded max rows per file"
736+ assert_eq ! (
737+ emitted_batches,
738+ dataset. num_batches( table) ,
739+ "emitted batches should match planned logical batches for table '{table}'"
823740 ) ;
824- if batch. num_rows ( ) == DEFAULT_TPCH_MAX_ROWS_PER_FILE {
825- saw_split = true ;
826- }
827741 }
828-
829- assert ! ( saw_split, "expected at least one full-size split chunk" ) ;
830742 }
831743
832744 #[ tokio:: test]
0 commit comments