spiceai
diff --git a/crates/data-generation/src/dataset/tpch.rs b/crates/data-generation/src/dataset/tpch.rs
Lines changed: 8 additions & 96 deletions
diff --git a/crates/data-generation/src/generator.rs b/crates/data-generation/src/generator.rs
Lines changed: 23 additions & 8 deletions
diff --git a/crates/data-generation/src/storage/mod.rs b/crates/data-generation/src/storage/mod.rs
Lines changed: 34 additions & 5 deletions
@@ -15,7 +15,6 @@ limitations under the License.
 */
 
 use std::collections::HashMap;
-use std::collections::VecDeque;
 use std::sync::atomic::{AtomicI64, AtomicU16, Ordering};
 use std::sync::{Arc, Mutex};
 
@@ -73,34 +72,6 @@ const SF1_ROW_COUNTS: &[(&str, u64)] = &[
     ("lineitem", 6_001_215),
 ];
 
-const MIN_TPCH_ROWS_PER_FILE: usize = 32_000;
-const MAX_TPCH_ROWS_PER_FILE: usize = 64_000;
-const DEFAULT_TPCH_MAX_ROWS_PER_FILE: usize = 48_000;
-
-fn tpch_max_rows_per_file() -> usize {
-    std::env::var("SPICEBENCH_TPCH_MAX_ROWS_PER_FILE")
-        .ok()
-        .and_then(|v| v.parse::<usize>().ok())
-        .filter(|v| *v > 0)
-        .map(|v| v.clamp(MIN_TPCH_ROWS_PER_FILE, MAX_TPCH_ROWS_PER_FILE))
-        .unwrap_or(DEFAULT_TPCH_MAX_ROWS_PER_FILE)
-}
-
-fn split_record_batch(batch: RecordBatch, max_rows: usize) -> VecDeque<RecordBatch> {
-    if batch.num_rows() <= max_rows {
-        return VecDeque::from([batch]);
-    }
-
-    let mut out = VecDeque::new();
-    let mut offset = 0usize;
-    while offset < batch.num_rows() {
-        let len = std::cmp::min(max_rows, batch.num_rows() - offset);
-        out.push_back(batch.slice(offset, len));
-        offset += len;
-    }
-    out
-}
-
 /// Returns the expected total number of rows for a given table at the
 /// specified scale factor.
 fn total_rows_for_table(table: &str, scale_factor: f64) -> u64 {
@@ -393,16 +364,12 @@ pub struct TpchDataset {
     mutations: MutationConfig,
     /// Per-table step counter tracking which part to generate next (0-indexed).
     table_steps: HashMap<String, AtomicU16>,
-    /// Per-table queue of already-generated chunks waiting to be emitted.
-    pending_batches: HashMap<String, Mutex<VecDeque<RecordBatch>>>,
     /// Per-table primary key tracking for update/delete targeting.
     key_sets: HashMap<String, Mutex<IndexedKeySet<PrimaryKeyValue>>>,
     /// Global monotonically increasing operation counter for replay ordering.
     op_counter: AtomicI64,
     /// The storage backend for reading/writing table metadata.
     storage: Arc<dyn DataStorage>,
-    /// Maximum number of rows per emitted batch/file.
-    max_rows_per_file: usize,
 }
 
 impl TpchDataset {
@@ -427,25 +394,14 @@ impl TpchDataset {
             .map(|(name, _)| (name.to_string(), AtomicU16::new(0)))
             .collect();
 
-        let pending_batches: HashMap<String, Mutex<VecDeque<RecordBatch>>> = TPCH_TABLES
-            .iter()
-            .map(|(name, _)| (name.to_string(), Mutex::new(VecDeque::new())))
-            .collect();
-
-        let max_rows_per_file = tpch_max_rows_per_file();
-
-        info!(max_rows_per_file, "Configured TPCH maximum rows per file");
-
         Ok(Self {
             scale_factor: config.scale_factor,
             num_steps: config.num_steps,
             mutations: mutations.clone(),
             table_steps,
-            pending_batches,
             key_sets,
             op_counter: AtomicI64::new(0),
             storage,
-            max_rows_per_file,
         })
     }
 }
@@ -507,15 +463,6 @@ impl Dataset for TpchDataset {
     }
 
     async fn raw_next_batch(&self, table: &str) -> anyhow::Result<Option<RecordBatch>> {
-        if let Some(queued) = self.pending_batches.get(table) {
-            let mut queued = queued
-                .lock()
-                .map_err(|e| anyhow::anyhow!("lock poisoned: {e}"))?;
-            if let Some(batch) = queued.pop_front() {
-                return Ok(Some(batch));
-            }
-        }
-
         // Each table independently tracks which step (part) it is on.
         let step_counter = self
             .table_steps
@@ -655,23 +602,7 @@ impl Dataset for TpchDataset {
         let op_indices: Vec<i64> = (op_base..op_base + total_rows as i64).collect();
         columns.push(Arc::new(Int64Array::from(op_indices)));
 
-        let combined_batch = RecordBatch::try_new(schema, columns)?;
-        let mut chunks = split_record_batch(combined_batch, self.max_rows_per_file);
-
-        let first = chunks
-            .pop_front()
-            .ok_or_else(|| anyhow::anyhow!("internal error: no chunks produced"))?;
-
-        if !chunks.is_empty()
-            && let Some(queued) = self.pending_batches.get(table)
-        {
-            let mut queued = queued
-                .lock()
-                .map_err(|e| anyhow::anyhow!("lock poisoned: {e}"))?;
-            queued.extend(chunks);
-        }
-
-        Ok(Some(first))
+        Ok(Some(RecordBatch::try_new(schema, columns)?))
     }
 
     fn tables(&self) -> HashMap<String, DatasetTable> {
@@ -711,6 +642,7 @@ mod tests {
             &self,
             _table_name: &str,
             _batch_id: u64,
+            _part_id: Option<usize>,
         ) -> anyhow::Result<Option<ReadResult>> {
             Ok(None)
         }
@@ -724,6 +656,7 @@ mod tests {
             Ok(WriteResult {
                 rows_written: 0,
                 bytes_written: 0,
+                part_ids: Vec::new(),
             })
         }
 
@@ -785,7 +718,7 @@ mod tests {
     }
 
     #[tokio::test]
-    async fn tpch_num_batches_is_a_lower_bound_for_emitted_batches_per_table() {
+    async fn tpch_emits_exactly_one_batch_per_step_for_non_static_tables() {
         let dataset = build_dataset(1.0, 7);
 
         for (table, _) in TPCH_TABLES {
@@ -800,33 +733,12 @@ mod tests {
                 emitted_batches += 1;
             }
 
-            assert!(
-                emitted_batches >= dataset.num_batches(table),
-                "emitted batches should be >= planned batches for table '{table}'"
-            );
-        }
-    }
-
-    #[tokio::test]
-    async fn tpch_batches_are_capped_to_max_rows_per_file() {
-        let dataset = build_dataset(1.0, 7);
-
-        let mut saw_split = false;
-        while let Some(batch) = dataset
-            .raw_next_batch("lineitem")
-            .await
-            .expect("raw_next_batch should not fail")
-        {
-            assert!(
-                batch.num_rows() <= DEFAULT_TPCH_MAX_ROWS_PER_FILE,
-                "lineitem chunk exceeded max rows per file"
+            assert_eq!(
+                emitted_batches,
+                dataset.num_batches(table),
+                "emitted batches should match planned logical batches for table '{table}'"
             );
-            if batch.num_rows() == DEFAULT_TPCH_MAX_ROWS_PER_FILE {
-                saw_split = true;
-            }
         }
-
-        assert!(saw_split, "expected at least one full-size split chunk");
     }
 
     #[tokio::test]
 
@@ -80,9 +80,10 @@ impl DataGenerator {
             }
         });
 
-        // Track which batch IDs were successfully written per table so we can
-        // persist them in the table metadata at the end of the run.
-        let written_batch_ids: Arc<std::sync::Mutex<HashMap<String, Vec<u64>>>> =
+        // Track which logical batch IDs were successfully written per table,
+        // plus any split part IDs for each logical batch, so we can persist
+        // both in table metadata at the end of the run.
+        let written_batches: Arc<std::sync::Mutex<HashMap<String, HashMap<u64, Vec<usize>>>>> =
             Arc::new(std::sync::Mutex::new(HashMap::new()));
 
         // For each table, spawn a generator task and an uploader task connected
@@ -124,7 +125,7 @@ impl DataGenerator {
             // --- Uploader task ---
             let target = self.target.clone();
             let metrics_up = self.metrics.clone();
-            let written_ids = Arc::clone(&written_batch_ids);
+            let written_ids = Arc::clone(&written_batches);
             join_set.spawn(async move {
                 while let Some((batch_id, batch)) = rx.recv().await {
                     let start = Instant::now();
@@ -133,10 +134,10 @@ impl DataGenerator {
                             metrics_up.record_write(&result, start.elapsed());
                             written_ids
                                 .lock()
-                                .expect("written_batch_ids lock poisoned")
+                                .expect("written_batches lock poisoned")
                                 .entry(table_name.clone())
                                 .or_default()
-                                .push(batch_id);
+                                .insert(batch_id, result.part_ids.clone());
                         }
                         Err(e) => {
                             metrics_up.record_error();
@@ -166,15 +167,26 @@ impl DataGenerator {
         logger_handle.abort();
 
         // Build and persist the consolidated version metadata (version.json).
-        let written = Arc::try_unwrap(written_batch_ids)
+        let written = Arc::try_unwrap(written_batches)
             .expect("all tasks should be finished")
             .into_inner()
             .expect("mutex should not be poisoned");
 
         let dataset_tables = self.dataset.tables();
         let mut tables_metadata = HashMap::new();
-        for (table_name, mut ids) in written {
+        for (table_name, batch_parts) in written {
+            let mut ids: Vec<u64> = batch_parts.keys().copied().collect();
             ids.sort_unstable();
+
+            let mut normalized_batch_parts: HashMap<u64, Vec<usize>> = HashMap::new();
+            for (batch_id, mut part_ids) in batch_parts {
+                if part_ids.is_empty() {
+                    continue;
+                }
+                part_ids.sort_unstable();
+                normalized_batch_parts.insert(batch_id, part_ids);
+            }
+
             let key_columns = self.dataset.primary_key(&table_name);
             let dataset_table = dataset_tables.get(&table_name);
             let schema_json = dataset_table
@@ -192,6 +204,7 @@ impl DataGenerator {
                     time_column,
                     key_columns,
                     batch_ids: ids.clone(),
+                    batch_parts: normalized_batch_parts,
                 },
             );
 
@@ -400,6 +413,7 @@ mod tests {
             &self,
             _table_name: &str,
             _batch_id: u64,
+            _part_id: Option<usize>,
         ) -> anyhow::Result<Option<ReadResult>> {
             Ok(None)
         }
@@ -430,6 +444,7 @@ mod tests {
             Ok(WriteResult {
                 rows_written: batch.num_rows() as u64,
                 bytes_written: 0,
+                part_ids: Vec::new(),
             })
         }
 
 
@@ -39,31 +39,39 @@ impl ReadResult {
 pub struct WriteResult {
     pub rows_written: u64,
     pub bytes_written: u64,
+    pub part_ids: Vec<usize>,
 }
 
 #[async_trait]
 pub trait DataStorage: Send + Sync + 'static {
     /// List available batch object paths for a given table.
     ///
-    /// Batches are stored under `tables/{table_name}/batch-NNNNNN.parquet`.
+    /// A logical batch may be stored as either:
+    /// - `tables/{table_name}/batch-NNNNNN.parquet`, or
+    /// - one or more split parts like
+    ///   `tables/{table_name}/batch-NNNNNN-part-PPP.parquet`.
     async fn list_batches(&self, table_name: &str) -> anyhow::Result<Vec<String>>;
 
-    /// Read a single batch from the source by its batch ID and table name.
+    /// Read a single batch object from the source.
     ///
     /// Returns `Ok(None)` when the batch does not exist in the underlying
     /// storage (e.g. the table has fewer batches than others). The caller
     /// should treat this as the table having no more data.
     ///
-    /// Batches are stored at `tables/{table_name}/batch-{batch_id:06}.parquet`.
+    /// If `part_id` is `Some(p)`, this reads the split-part object for that
+    /// logical batch. If `part_id` is `None`, this reads the unsuffixed
+    /// logical batch object.
     async fn read_batch(
         &self,
         table_name: &str,
         batch_id: u64,
+        part_id: Option<usize>,
     ) -> anyhow::Result<Option<ReadResult>>;
 
-    /// Write a single batch to storage for the given table and batch ID.
+    /// Write a single logical batch to storage for the given table and batch ID.
     ///
-    /// Batches are written to `tables/{table_name}/batch-{batch_id:06}.parquet`.
+    /// Implementations may split large batches across multiple physical files
+    /// while preserving the same logical `batch_id`.
     async fn write(
         &self,
         table_name: &str,
@@ -113,6 +121,27 @@ pub trait DataStorage: Send + Sync + 'static {
         Ok(VecDeque::new())
     }
 
+    /// Reads the split part IDs recorded for a logical batch in version metadata.
+    ///
+    /// Returns the IDs sorted ascending, or an empty vector when the metadata, table, or batch
+    /// entry is absent (read the unsuffixed object path then). Metadata read errors propagate.
+    async fn read_batch_parts(
+        &self,
+        table_name: &str,
+        batch_id: u64,
+    ) -> anyhow::Result<Vec<usize>> {
+        if let Some(metadata) = self.read_version_metadata().await?
+            && let Some(table_meta) = metadata.tables.get(table_name)
+            && let Some(part_ids) = table_meta.batch_parts.get(&batch_id)
+        {
+            let mut sorted = part_ids.clone();
+            sorted.sort_unstable();
+            return Ok(sorted);
+        }
+
+        Ok(Vec::new())
+    }
+
     fn table_params(&self, table_name: &str) -> HashMap<String, serde_json::Value>;
 
     /// Returns the list of file paths/URIs that would exist after a successful