Various improvements (#202)

krinart · web-flow · commit 4f8893fd6d09 · 2026-03-14T09:36:00.000-07:00
* Reapply "Fix checkpoint determinism: bound coalescing to checkpoint interval (#187)" This reverts commit 5ea1ddb. * Databricks improvements * Logging * Add logging * Lint * Fix * Fix * Fix * Revert change * Logging
diff --git a/.github/workflows/run_spicebench.yml b/.github/workflows/run_spicebench.yml
@@ -308,8 +308,10 @@ jobs:
           fi
 
           if [ "${SYSTEM_UNDER_TEST_PREFIX}" = "databricks" ]; then
-            export SPICEBENCH_ADBC_DELETE_BATCH_SIZE=5000
+            export SPICEBENCH_ADBC_DELETE_BATCH_SIZE=50000
             export SPICEBENCH_ADBC_UPDATE_STRATEGY=staging_table
+            export SPICEBENCH_TARGET_BATCH_ROWS=500000
+            export SPICEBENCH_ADBC_MAX_INGEST_BATCH_BYTES=1268435456
             ADAPTER_CMD="${HOME}/.spice/bin/databricks-system-adapter"
             ADAPTER_ARGS="stdio"
             ADAPTER_ENVS="--system-adapter-env DATABRICKS_ENDPOINT=${DATABRICKS_ENDPOINT} --system-adapter-env DATABRICKS_TOKEN=${DATABRICKS_TOKEN} --system-adapter-env DATABRICKS_HTTP_PATH=${DATABRICKS_HTTP_PATH} --system-adapter-env DATABRICKS_SQL_WAREHOUSE_ID=${DATABRICKS_SQL_WAREHOUSE_ID} --system-adapter-env DATABRICKS_TABLE_FORMAT=${DATABRICKS_TABLE_FORMAT}"
@@ -326,6 +328,8 @@ jobs:
               ADAPTER_ENVS="${ADAPTER_ENVS} --system-adapter-env DATABRICKS_STAGING_VOLUME_PATH=${DATABRICKS_STAGING_VOLUME_PATH}"
             fi
           else
+            export SPICEBENCH_ADBC_UPDATE_STRATEGY=bulk_ingest_upsert
+            export SPICEBENCH_ADBC_DELETE_BATCH_SIZE=50000
             ADAPTER_CMD="docker"
             ADAPTER_ARGS="run -i -e SPICEAI_API_KEY -e SPICE_CLOUD_API_URL -e AWS_ACCESS_KEY_ID=${S3_AWS_ACCESS_KEY_ID} -e AWS_SECRET_ACCESS_KEY=${S3_AWS_SECRET_ACCESS_KEY} -e SPIDAPTER_ICEBERG_REGION -e SPIDAPTER_ICEBERG_CATALOG_FROM ghcr.io/spiceai/spidapter:latest stdio --verbose --channel nightly"
             ADAPTER_ENVS=""
diff --git a/crates/checkpointer/src/main.rs b/crates/checkpointer/src/main.rs
@@ -105,6 +105,30 @@ impl Cli {
     }
 }
 
+/// Log each table's row count in the DuckDB sink for checkpoint `checkpoint_idx`.
+/// Query errors propagate; a missing or non-Int64 count logs a warning, not 0.
+#[cfg(feature = "duckdb")]
+async fn log_table_row_counts(
+    sink: &DuckDBSink,
+    table_names: &[String],
+    checkpoint_idx: usize,
+) -> anyhow::Result<()> {
+    for table in table_names {
+        let batches = sink.query(&format!("SELECT COUNT(*) AS cnt FROM {table}")).await?;
+        let Some(count) = batches
+            .iter()
+            .find(|b| b.num_rows() > 0) // `value(0)` would panic on an empty batch
+            .and_then(|b| b.column(0).as_any().downcast_ref::<arrow::array::Int64Array>())
+            .map(|a| a.value(0))
+        else {
+            tracing::warn!("[checkpoint] Checkpoint {checkpoint_idx} | {table}: no row count");
+            continue;
+        };
+        tracing::info!("[checkpoint] Checkpoint {checkpoint_idx} | {table}: {count} rows");
+    }
+    Ok(())
+}
+
 /// Run all checkpoint queries against the DuckDB sink and write each result
 /// set to a parquet file at `<checkpoint_dir>/<checkpoint_idx>/<query_idx>.parquet`.
 #[cfg(feature = "duckdb")]
@@ -207,11 +231,11 @@ async fn main() -> anyhow::Result<()> {
 
         // Read version metadata to derive dataset config and mutations.
         let version_metadata = source.read_version_metadata().await?.ok_or_else(|| {
-        anyhow::anyhow!(
+            anyhow::anyhow!(
             "No version.json found in extracted data at {}. Was data generation run for this version?",
             extract_dir.path().display()
         )
-    })?;
+        })?;
 
         let dataset_source = DatasetSource::from_dataset_type(&version_metadata.dataset_type)?;
         let dataset_config = version_metadata.dataset_config();
@@ -243,6 +267,9 @@ async fn main() -> anyhow::Result<()> {
             "Starting Checkpointer"
         );
 
+        let mut table_names: Vec<String> = version_metadata.tables.keys().cloned().collect();
+        table_names.sort();
+
         pipeline.initialize().await?;
         pipeline.run(cli.checkpoint_interval_steps as usize).await?;
 
@@ -258,6 +285,7 @@ async fn main() -> anyhow::Result<()> {
                         checkpoint = checkpoint_idx,
                         "Pipeline paused, running checkpoint queries"
                     );
+                    log_table_row_counts(&target, &table_names, checkpoint_idx).await?;
                     run_checkpoint_queries(
                         &target,
                         &checkpoint_queries,
@@ -276,6 +304,7 @@ async fn main() -> anyhow::Result<()> {
                         checkpoint = checkpoint_idx,
                         "Pipeline completed, running final checkpoint queries"
                     );
+                    log_table_row_counts(&target, &table_names, checkpoint_idx).await?;
                     run_checkpoint_queries(
                         &target,
                         &checkpoint_queries,
diff --git a/crates/data-generation/src/storage/file.rs b/crates/data-generation/src/storage/file.rs
@@ -157,6 +157,15 @@ impl DataStorage for FileStorage {
         .await
         .map_err(|e| anyhow::anyhow!("spawn_blocking panicked reading parquet: {e}"))??;
 
+        if table_name == "lineitem"
+            && let Some(result) = result.as_ref()
+        {
+            eprintln!(
+                "[etl-read] table={table_name} batch_id={batch_id} rows={}",
+                result.rows_read,
+            );
+        }
+
         Ok(result)
     }
 
diff --git a/crates/etl/src/lib.rs b/crates/etl/src/lib.rs
@@ -244,18 +244,26 @@ fn coalesce_batches(batches: &[RecordBatch]) -> anyhow::Result<Vec<RecordBatch>>
 /// Removes and returns the next batch ID (greater than `after_batch_id`) that
 /// still has pending work for `table_name`.
 ///
+/// When `max_batch_id` is `Some(max)`, candidates with a batch ID greater than
+/// `max` are not considered. This is used to prevent coalescing across
+/// checkpoint interval boundaries.
+///
 /// This is used by the ETL runner to coalesce very small reads across multiple
 /// source batch IDs for the same table while ensuring consumed IDs are not
 /// replayed in later steps.
 fn reserve_next_batch_id_for_table(
     work_state: &mut PipelineWorkState,
     table_name: &str,
     after_batch_id: u64,
+    max_batch_id: Option<u64>,
 ) -> Option<(u64, bool)> {
     let mut found: Option<(u64, bool)> = None;
     let start = after_batch_id.saturating_add(1);
 
     for (candidate_batch_id, tables) in work_state.steps.range_mut(start..) {
+        if max_batch_id.is_some_and(|max| *candidate_batch_id > max) {
+            break;
+        }
         if let Some(pos) = tables.iter().position(|t| t == table_name) {
             tables.remove(pos);
             found = Some((*candidate_batch_id, tables.is_empty()));
@@ -371,6 +379,11 @@ async fn read_logical_batch(
 /// reserving and reading subsequent batch IDs for that table until at least
 /// [`target_batch_rows()`] rows have been accumulated (or no further work exists).
 ///
+/// When `max_batch_id` is `Some(max)`, coalescing will not reserve batch IDs
+/// beyond `max`. This prevents reads from crossing a checkpoint interval
+/// boundary, ensuring checkpoint results are deterministic regardless of
+/// the configured [`target_batch_rows()`] value.
+///
 /// Returns `(raw_batches, key_columns, table_finished, consumed_work_units, rows_read)` where
 /// `table_finished=true` means a read returned `None` and the table should be
 /// marked as fully consumed. `consumed_work_units` counts how many table+batch
@@ -382,6 +395,7 @@ async fn read_batches_until_min_rows(
     logical_steps_consumed: &StdArc<AtomicU64>,
     table_name: &str,
     start_batch_id: u64,
+    max_batch_id: Option<u64>,
 ) -> Result<(Vec<RecordBatch>, Vec<String>, bool, u64, u64), String> {
     let mut all_batches: Vec<RecordBatch> = Vec::new();
     let mut total_rows: usize = 0;
@@ -450,7 +464,12 @@ async fn read_batches_until_min_rows(
         {
             let reservation = {
                 let mut state = work_state.lock().expect("work_state lock poisoned");
-                reserve_next_batch_id_for_table(&mut state, table_name, reserve_cursor)
+                reserve_next_batch_id_for_table(
+                    &mut state,
+                    table_name,
+                    reserve_cursor,
+                    max_batch_id,
+                )
             };
 
             let Some((next_batch_id, removed_step_entry)) = reservation else {
@@ -1614,6 +1633,16 @@ async fn run_pipeline(
         })
     };
 
+    // When running with a step budget (checkpoint mode), compute the highest
+    // batch ID that belongs to this checkpoint interval.  Coalescing in
+    // read_batches_until_min_rows is allowed within the interval but will not
+    // reserve batch IDs beyond this boundary, ensuring checkpoint results are
+    // deterministic regardless of the configured target_batch_rows() value.
+    let checkpoint_max_batch_id: Option<u64> = step_limit.and_then(|limit| {
+        let state = work_state.lock().expect("work_state lock poisoned");
+        state.steps.keys().nth(limit.saturating_sub(1)).copied()
+    });
+
     let mut outer_steps_processed: usize = 0;
 
     loop {
@@ -1652,20 +1681,26 @@ async fn run_pipeline(
             return PipelineState::Stopped(StopReason::Cancelled);
         }
 
-        // Pop the next step from the shared work state.
+        // Pop the next step from the shared work state. If the next batch ID
+        // is beyond the checkpoint interval boundary, do not pop it — pause
+        // instead so the checkpoint can be taken with exact data.
         let next_step = {
             let mut state = work_state.lock().expect("work_state lock poisoned");
             if let Some(entry) = state.steps.first_entry() {
                 let batch_id = *entry.key();
-                let tables = entry.remove();
-                // Filter out already-finished tables.
-                let total_tables = tables.len();
-                let active: Vec<String> = tables
-                    .into_iter()
-                    .filter(|t| !state.finished_tables.contains(t))
-                    .collect();
-                let skipped = (total_tables - active.len()) as u64;
-                Some((batch_id, active, skipped))
+                if checkpoint_max_batch_id.is_some_and(|max| batch_id > max) {
+                    None
+                } else {
+                    let tables = entry.remove();
+                    // Filter out already-finished tables.
+                    let total_tables = tables.len();
+                    let active: Vec<String> = tables
+                        .into_iter()
+                        .filter(|t| !state.finished_tables.contains(t))
+                        .collect();
+                    let skipped = (total_tables - active.len()) as u64;
+                    Some((batch_id, active, skipped))
+                }
             } else {
                 None
             }
@@ -1685,6 +1720,23 @@ async fn run_pipeline(
                 }
                 (bid, tables)
             }
+            None if checkpoint_max_batch_id.is_some() => {
+                // Reached the checkpoint interval boundary (or all work
+                // within the interval was consumed by coalescing). Pause so
+                // the checkpoint validation can run against exact data.
+                info!(
+                    outer_steps_processed,
+                    logical_steps_consumed = logical_steps_consumed.load(Ordering::Relaxed),
+                    "Checkpoint interval boundary reached, pausing pipeline"
+                );
+                progress_logger.abort();
+                if let Err(e) = data_sink.flush().await {
+                    return PipelineState::Stopped(StopReason::Error(format!(
+                        "Failed to flush sink at checkpoint boundary: {e}"
+                    )));
+                }
+                return PipelineState::Paused;
+            }
             None => {
                 // No more work — pipeline is done.
                 break;
@@ -1723,6 +1775,7 @@ async fn run_pipeline(
                         &logical_steps_consumed,
                         &table_name,
                         batch_id,
+                        checkpoint_max_batch_id,
                     )
                     .await
                     {
diff --git a/crates/etl/src/sink/adbc.rs b/crates/etl/src/sink/adbc.rs
@@ -15,6 +15,7 @@ limitations under the License.
 */
 
 use std::collections::HashMap;
+use std::sync::Mutex;
 use std::time::Instant;
 
 use adbc_client::{
@@ -88,6 +89,7 @@ pub struct AdbcSink {
     pool: AdbcConnectionPool,
     target_db_catalog: Option<String>,
     target_db_schema: Option<String>,
+    row_counts: Mutex<HashMap<String, u64>>,
     /// Character used to quote SQL identifiers (e.g. '"' for ANSI, '`' for Databricks).
     identifier_quote_char: char,
     /// Whether Int64/UInt64 literals need an `L` suffix (Databricks).
@@ -130,6 +132,7 @@ impl AdbcSink {
             pool,
             target_db_catalog,
             target_db_schema,
+            row_counts: Mutex::new(HashMap::new()),
             identifier_quote_char,
             bigint_suffix,
         })
@@ -809,6 +812,16 @@ impl Sink for AdbcSink {
             .get()
             .map_err(|e| anyhow::anyhow!("Failed to get ADBC connection from pool: {e}"))?;
 
+        let rows_current = batch.num_rows() as u64;
+        let op_label = match &op {
+            InsertOp::Insert => "insert",
+            InsertOp::Update { .. } => "update",
+            InsertOp::Delete { .. } => "delete",
+        };
+
+        let now = chrono::Utc::now().format("%Y-%m-%d %H:%M:%S%.3f UTC");
+        tracing::info!("[adbc] {now} | {table_name} | {op_label} | rows: {rows_current}");
+
         match op {
             InsertOp::Insert => {
                 self.ingest_insert_batch(&mut conn, table_name, batch)?;
@@ -882,6 +895,22 @@ impl Sink for AdbcSink {
             }
         }
 
+        let rows_total = {
+            let mut counts = self.row_counts.lock().unwrap();
+            let total = counts.entry(table_name.to_string()).or_insert(0);
+            match op_label {
+                "insert" => *total += rows_current,
+                "delete" => *total = total.saturating_sub(rows_current),
+                _ => {} // updates don't change row count
+            }
+            *total
+        };
+
+        let now = chrono::Utc::now().format("%Y-%m-%d %H:%M:%S%.3f UTC");
+        tracing::info!(
+            "[adbc] WRITTEN {now} | {table_name} | {op_label} | rows: {rows_current} | total: {rows_total}"
+        );
+
         Ok(())
     }
 }
diff --git a/crates/etl/src/sink/duckdb.rs b/crates/etl/src/sink/duckdb.rs
@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 */
 
-use std::collections::HashSet;
+use std::collections::{HashMap, HashSet};
 use std::path::Path;
 use std::sync::{Arc, Mutex};
 
@@ -35,6 +35,7 @@ use super::{InsertOp, Sink};
 pub struct DuckDBSink {
     conn: Arc<Mutex<duckdb::Connection>>,
     created_tables: TokioMutex<HashSet<String>>,
+    row_counts: Mutex<HashMap<String, u64>>,
 }
 
 impl DuckDBSink {
@@ -45,6 +46,7 @@ impl DuckDBSink {
         Ok(Self {
             conn: Arc::new(Mutex::new(conn)),
             created_tables: TokioMutex::new(HashSet::new()),
+            row_counts: Mutex::new(HashMap::new()),
         })
     }
 
@@ -390,6 +392,23 @@ impl Sink for DuckDBSink {
             created.insert(table_name.to_string());
         }
 
+        let rows_current = num_rows as u64;
+        let op_label = match &op {
+            InsertOp::Insert => "insert",
+            InsertOp::Update { .. } => "update",
+            InsertOp::Delete { .. } => "delete",
+        };
+        let rows_total = {
+            let mut counts = self.row_counts.lock().unwrap();
+            let total = counts.entry(table_name.to_string()).or_insert(0);
+            *total += rows_current;
+            *total
+        };
+        let now = chrono::Utc::now().format("%Y-%m-%d %H:%M:%S%.3f UTC");
+        tracing::info!(
+            "[duckdb] WRITTEN {now} | {table_name} | {op_label} | rows: {rows_current} | total: {rows_total}"
+        );
+
         Ok(())
     }
 }
diff --git a/system-adapters/databricks/src/main.rs b/system-adapters/databricks/src/main.rs