Skip to content

Commit 4f8893f

Browse files
authored
Various improvements (#202)
* Reapply "Fix checkpoint determinism: bound coalescing to checkpoint interval (#187)" This reverts commit 5ea1ddb. * Databricks improvements * Logging * Add logging * Lint * Fix * Fix * Fix * Revert change * Logging
1 parent 3ce6170 commit 4f8893f

7 files changed

Lines changed: 221 additions & 21 deletions

File tree

.github/workflows/run_spicebench.yml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -308,8 +308,10 @@ jobs:
308308
fi
309309
310310
if [ "${SYSTEM_UNDER_TEST_PREFIX}" = "databricks" ]; then
311-
export SPICEBENCH_ADBC_DELETE_BATCH_SIZE=5000
311+
export SPICEBENCH_ADBC_DELETE_BATCH_SIZE=50000
312312
export SPICEBENCH_ADBC_UPDATE_STRATEGY=staging_table
313+
export SPICEBENCH_TARGET_BATCH_ROWS=500000
314+
export SPICEBENCH_ADBC_MAX_INGEST_BATCH_BYTES=1268435456
313315
ADAPTER_CMD="${HOME}/.spice/bin/databricks-system-adapter"
314316
ADAPTER_ARGS="stdio"
315317
ADAPTER_ENVS="--system-adapter-env DATABRICKS_ENDPOINT=${DATABRICKS_ENDPOINT} --system-adapter-env DATABRICKS_TOKEN=${DATABRICKS_TOKEN} --system-adapter-env DATABRICKS_HTTP_PATH=${DATABRICKS_HTTP_PATH} --system-adapter-env DATABRICKS_SQL_WAREHOUSE_ID=${DATABRICKS_SQL_WAREHOUSE_ID} --system-adapter-env DATABRICKS_TABLE_FORMAT=${DATABRICKS_TABLE_FORMAT}"
@@ -326,6 +328,8 @@ jobs:
326328
ADAPTER_ENVS="${ADAPTER_ENVS} --system-adapter-env DATABRICKS_STAGING_VOLUME_PATH=${DATABRICKS_STAGING_VOLUME_PATH}"
327329
fi
328330
else
331+
export SPICEBENCH_ADBC_UPDATE_STRATEGY=bulk_ingest_upsert
332+
export SPICEBENCH_ADBC_DELETE_BATCH_SIZE=50000
329333
ADAPTER_CMD="docker"
330334
ADAPTER_ARGS="run -i -e SPICEAI_API_KEY -e SPICE_CLOUD_API_URL -e AWS_ACCESS_KEY_ID=${S3_AWS_ACCESS_KEY_ID} -e AWS_SECRET_ACCESS_KEY=${S3_AWS_SECRET_ACCESS_KEY} -e SPIDAPTER_ICEBERG_REGION -e SPIDAPTER_ICEBERG_CATALOG_FROM ghcr.io/spiceai/spidapter:latest stdio --verbose --channel nightly"
331335
ADAPTER_ENVS=""

crates/checkpointer/src/main.rs

Lines changed: 31 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,30 @@ impl Cli {
105105
}
106106
}
107107

108+
/// Log the row count for each table in the DuckDB sink.
109+
#[cfg(feature = "duckdb")]
110+
async fn log_table_row_counts(
111+
sink: &DuckDBSink,
112+
table_names: &[String],
113+
checkpoint_idx: usize,
114+
) -> anyhow::Result<()> {
115+
for table in table_names {
116+
let sql = format!("SELECT COUNT(*) AS cnt FROM {table}");
117+
let batches = sink.query(&sql).await?;
118+
let count: i64 = batches
119+
.first()
120+
.and_then(|b| {
121+
b.column(0)
122+
.as_any()
123+
.downcast_ref::<arrow::array::Int64Array>()
124+
.map(|a| a.value(0))
125+
})
126+
.unwrap_or(0);
127+
tracing::info!("[checkpoint] Checkpoint {checkpoint_idx} | {table}: {count} rows");
128+
}
129+
Ok(())
130+
}
131+
108132
/// Run all checkpoint queries against the DuckDB sink and write each result
109133
/// set to a parquet file at `<checkpoint_dir>/<checkpoint_idx>/<query_idx>.parquet`.
110134
#[cfg(feature = "duckdb")]
@@ -207,11 +231,11 @@ async fn main() -> anyhow::Result<()> {
207231

208232
// Read version metadata to derive dataset config and mutations.
209233
let version_metadata = source.read_version_metadata().await?.ok_or_else(|| {
210-
anyhow::anyhow!(
234+
anyhow::anyhow!(
211235
"No version.json found in extracted data at {}. Was data generation run for this version?",
212236
extract_dir.path().display()
213237
)
214-
})?;
238+
})?;
215239

216240
let dataset_source = DatasetSource::from_dataset_type(&version_metadata.dataset_type)?;
217241
let dataset_config = version_metadata.dataset_config();
@@ -243,6 +267,9 @@ async fn main() -> anyhow::Result<()> {
243267
"Starting Checkpointer"
244268
);
245269

270+
let mut table_names: Vec<String> = version_metadata.tables.keys().cloned().collect();
271+
table_names.sort();
272+
246273
pipeline.initialize().await?;
247274
pipeline.run(cli.checkpoint_interval_steps as usize).await?;
248275

@@ -258,6 +285,7 @@ async fn main() -> anyhow::Result<()> {
258285
checkpoint = checkpoint_idx,
259286
"Pipeline paused, running checkpoint queries"
260287
);
288+
log_table_row_counts(&target, &table_names, checkpoint_idx).await?;
261289
run_checkpoint_queries(
262290
&target,
263291
&checkpoint_queries,
@@ -276,6 +304,7 @@ async fn main() -> anyhow::Result<()> {
276304
checkpoint = checkpoint_idx,
277305
"Pipeline completed, running final checkpoint queries"
278306
);
307+
log_table_row_counts(&target, &table_names, checkpoint_idx).await?;
279308
run_checkpoint_queries(
280309
&target,
281310
&checkpoint_queries,

crates/data-generation/src/storage/file.rs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,15 @@ impl DataStorage for FileStorage {
157157
.await
158158
.map_err(|e| anyhow::anyhow!("spawn_blocking panicked reading parquet: {e}"))??;
159159

160+
if table_name == "lineitem"
161+
&& let Some(result) = result.as_ref()
162+
{
163+
eprintln!(
164+
"[etl-read] table={table_name} batch_id={batch_id} rows={}",
165+
result.rows_read,
166+
);
167+
}
168+
160169
Ok(result)
161170
}
162171

crates/etl/src/lib.rs

Lines changed: 64 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -244,18 +244,26 @@ fn coalesce_batches(batches: &[RecordBatch]) -> anyhow::Result<Vec<RecordBatch>>
244244
/// Removes and returns the next batch ID (greater than `after_batch_id`) that
245245
/// still has pending work for `table_name`.
246246
///
247+
/// When `max_batch_id` is `Some(max)`, candidates with a batch ID greater than
248+
/// `max` are not considered. This is used to prevent coalescing across
249+
/// checkpoint interval boundaries.
250+
///
247251
/// This is used by the ETL runner to coalesce very small reads across multiple
248252
/// source batch IDs for the same table while ensuring consumed IDs are not
249253
/// replayed in later steps.
250254
fn reserve_next_batch_id_for_table(
251255
work_state: &mut PipelineWorkState,
252256
table_name: &str,
253257
after_batch_id: u64,
258+
max_batch_id: Option<u64>,
254259
) -> Option<(u64, bool)> {
255260
let mut found: Option<(u64, bool)> = None;
256261
let start = after_batch_id.saturating_add(1);
257262

258263
for (candidate_batch_id, tables) in work_state.steps.range_mut(start..) {
264+
if max_batch_id.is_some_and(|max| *candidate_batch_id > max) {
265+
break;
266+
}
259267
if let Some(pos) = tables.iter().position(|t| t == table_name) {
260268
tables.remove(pos);
261269
found = Some((*candidate_batch_id, tables.is_empty()));
@@ -371,6 +379,11 @@ async fn read_logical_batch(
371379
/// reserving and reading subsequent batch IDs for that table until at least
372380
/// [`target_batch_rows()`] rows have been accumulated (or no further work exists).
373381
///
382+
/// When `max_batch_id` is `Some(max)`, coalescing will not reserve batch IDs
383+
/// beyond `max`. This prevents reads from crossing a checkpoint interval
384+
/// boundary, ensuring checkpoint results are deterministic regardless of
385+
/// the configured [`target_batch_rows()`] value.
386+
///
374387
/// Returns `(raw_batches, key_columns, table_finished, consumed_work_units, rows_read)` where
375388
/// `table_finished=true` means a read returned `None` and the table should be
376389
/// marked as fully consumed. `consumed_work_units` counts how many table+batch
@@ -382,6 +395,7 @@ async fn read_batches_until_min_rows(
382395
logical_steps_consumed: &StdArc<AtomicU64>,
383396
table_name: &str,
384397
start_batch_id: u64,
398+
max_batch_id: Option<u64>,
385399
) -> Result<(Vec<RecordBatch>, Vec<String>, bool, u64, u64), String> {
386400
let mut all_batches: Vec<RecordBatch> = Vec::new();
387401
let mut total_rows: usize = 0;
@@ -450,7 +464,12 @@ async fn read_batches_until_min_rows(
450464
{
451465
let reservation = {
452466
let mut state = work_state.lock().expect("work_state lock poisoned");
453-
reserve_next_batch_id_for_table(&mut state, table_name, reserve_cursor)
467+
reserve_next_batch_id_for_table(
468+
&mut state,
469+
table_name,
470+
reserve_cursor,
471+
max_batch_id,
472+
)
454473
};
455474

456475
let Some((next_batch_id, removed_step_entry)) = reservation else {
@@ -1614,6 +1633,16 @@ async fn run_pipeline(
16141633
})
16151634
};
16161635

1636+
// When running with a step budget (checkpoint mode), compute the highest
1637+
// batch ID that belongs to this checkpoint interval. Coalescing in
1638+
// read_batches_until_min_rows is allowed within the interval but will not
1639+
// reserve batch IDs beyond this boundary, ensuring checkpoint results are
1640+
// deterministic regardless of the configured target_batch_rows() value.
1641+
let checkpoint_max_batch_id: Option<u64> = step_limit.and_then(|limit| {
1642+
let state = work_state.lock().expect("work_state lock poisoned");
1643+
state.steps.keys().nth(limit.saturating_sub(1)).copied()
1644+
});
1645+
16171646
let mut outer_steps_processed: usize = 0;
16181647

16191648
loop {
@@ -1652,20 +1681,26 @@ async fn run_pipeline(
16521681
return PipelineState::Stopped(StopReason::Cancelled);
16531682
}
16541683

1655-
// Pop the next step from the shared work state.
1684+
// Pop the next step from the shared work state. If the next batch ID
1685+
// is beyond the checkpoint interval boundary, do not pop it — pause
1686+
// instead so the checkpoint can be taken with exact data.
16561687
let next_step = {
16571688
let mut state = work_state.lock().expect("work_state lock poisoned");
16581689
if let Some(entry) = state.steps.first_entry() {
16591690
let batch_id = *entry.key();
1660-
let tables = entry.remove();
1661-
// Filter out already-finished tables.
1662-
let total_tables = tables.len();
1663-
let active: Vec<String> = tables
1664-
.into_iter()
1665-
.filter(|t| !state.finished_tables.contains(t))
1666-
.collect();
1667-
let skipped = (total_tables - active.len()) as u64;
1668-
Some((batch_id, active, skipped))
1691+
if checkpoint_max_batch_id.is_some_and(|max| batch_id > max) {
1692+
None
1693+
} else {
1694+
let tables = entry.remove();
1695+
// Filter out already-finished tables.
1696+
let total_tables = tables.len();
1697+
let active: Vec<String> = tables
1698+
.into_iter()
1699+
.filter(|t| !state.finished_tables.contains(t))
1700+
.collect();
1701+
let skipped = (total_tables - active.len()) as u64;
1702+
Some((batch_id, active, skipped))
1703+
}
16691704
} else {
16701705
None
16711706
}
@@ -1685,6 +1720,23 @@ async fn run_pipeline(
16851720
}
16861721
(bid, tables)
16871722
}
1723+
None if checkpoint_max_batch_id.is_some() => {
1724+
// Reached the checkpoint interval boundary (or all work
1725+
// within the interval was consumed by coalescing). Pause so
1726+
// the checkpoint validation can run against exact data.
1727+
info!(
1728+
outer_steps_processed,
1729+
logical_steps_consumed = logical_steps_consumed.load(Ordering::Relaxed),
1730+
"Checkpoint interval boundary reached, pausing pipeline"
1731+
);
1732+
progress_logger.abort();
1733+
if let Err(e) = data_sink.flush().await {
1734+
return PipelineState::Stopped(StopReason::Error(format!(
1735+
"Failed to flush sink at checkpoint boundary: {e}"
1736+
)));
1737+
}
1738+
return PipelineState::Paused;
1739+
}
16881740
None => {
16891741
// No more work — pipeline is done.
16901742
break;
@@ -1723,6 +1775,7 @@ async fn run_pipeline(
17231775
&logical_steps_consumed,
17241776
&table_name,
17251777
batch_id,
1778+
checkpoint_max_batch_id,
17261779
)
17271780
.await
17281781
{

crates/etl/src/sink/adbc.rs

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ limitations under the License.
1515
*/
1616

1717
use std::collections::HashMap;
18+
use std::sync::Mutex;
1819
use std::time::Instant;
1920

2021
use adbc_client::{
@@ -88,6 +89,7 @@ pub struct AdbcSink {
8889
pool: AdbcConnectionPool,
8990
target_db_catalog: Option<String>,
9091
target_db_schema: Option<String>,
92+
row_counts: Mutex<HashMap<String, u64>>,
9193
/// Character used to quote SQL identifiers (e.g. '"' for ANSI, '`' for Databricks).
9294
identifier_quote_char: char,
9395
/// Whether Int64/UInt64 literals need an `L` suffix (Databricks).
@@ -130,6 +132,7 @@ impl AdbcSink {
130132
pool,
131133
target_db_catalog,
132134
target_db_schema,
135+
row_counts: Mutex::new(HashMap::new()),
133136
identifier_quote_char,
134137
bigint_suffix,
135138
})
@@ -809,6 +812,16 @@ impl Sink for AdbcSink {
809812
.get()
810813
.map_err(|e| anyhow::anyhow!("Failed to get ADBC connection from pool: {e}"))?;
811814

815+
let rows_current = batch.num_rows() as u64;
816+
let op_label = match &op {
817+
InsertOp::Insert => "insert",
818+
InsertOp::Update { .. } => "update",
819+
InsertOp::Delete { .. } => "delete",
820+
};
821+
822+
let now = chrono::Utc::now().format("%Y-%m-%d %H:%M:%S%.3f UTC");
823+
tracing::info!("[adbc] {now} | {table_name} | {op_label} | rows: {rows_current}");
824+
812825
match op {
813826
InsertOp::Insert => {
814827
self.ingest_insert_batch(&mut conn, table_name, batch)?;
@@ -882,6 +895,22 @@ impl Sink for AdbcSink {
882895
}
883896
}
884897

898+
let rows_total = {
899+
let mut counts = self.row_counts.lock().unwrap();
900+
let total = counts.entry(table_name.to_string()).or_insert(0);
901+
match op_label {
902+
"insert" => *total += rows_current,
903+
"delete" => *total = total.saturating_sub(rows_current),
904+
_ => {} // updates don't change row count
905+
}
906+
*total
907+
};
908+
909+
let now = chrono::Utc::now().format("%Y-%m-%d %H:%M:%S%.3f UTC");
910+
tracing::info!(
911+
"[adbc] WRITTEN {now} | {table_name} | {op_label} | rows: {rows_current} | total: {rows_total}"
912+
);
913+
885914
Ok(())
886915
}
887916
}

crates/etl/src/sink/duckdb.rs

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
1414
limitations under the License.
1515
*/
1616

17-
use std::collections::HashSet;
17+
use std::collections::{HashMap, HashSet};
1818
use std::path::Path;
1919
use std::sync::{Arc, Mutex};
2020

@@ -35,6 +35,7 @@ use super::{InsertOp, Sink};
3535
pub struct DuckDBSink {
3636
conn: Arc<Mutex<duckdb::Connection>>,
3737
created_tables: TokioMutex<HashSet<String>>,
38+
row_counts: Mutex<HashMap<String, u64>>,
3839
}
3940

4041
impl DuckDBSink {
@@ -45,6 +46,7 @@ impl DuckDBSink {
4546
Ok(Self {
4647
conn: Arc::new(Mutex::new(conn)),
4748
created_tables: TokioMutex::new(HashSet::new()),
49+
row_counts: Mutex::new(HashMap::new()),
4850
})
4951
}
5052

@@ -390,6 +392,23 @@ impl Sink for DuckDBSink {
390392
created.insert(table_name.to_string());
391393
}
392394

395+
let rows_current = num_rows as u64;
396+
let op_label = match &op {
397+
InsertOp::Insert => "insert",
398+
InsertOp::Update { .. } => "update",
399+
InsertOp::Delete { .. } => "delete",
400+
};
401+
let rows_total = {
402+
let mut counts = self.row_counts.lock().unwrap();
403+
let total = counts.entry(table_name.to_string()).or_insert(0);
404+
*total += rows_current;
405+
*total
406+
};
407+
let now = chrono::Utc::now().format("%Y-%m-%d %H:%M:%S%.3f UTC");
408+
tracing::info!(
409+
"[duckdb] WRITTEN {now} | {table_name} | {op_label} | rows: {rows_current} | total: {rows_total}"
410+
);
411+
393412
Ok(())
394413
}
395414
}

0 commit comments

Comments
 (0)