fix: Add ADBC sink options back to spicebench execution (#135)

peasee · github-actions[bot] · web-flow · commit e70c954e0d32 · 2026-02-23T17:39:54.000-08:00
* fix: Add ADBC sink options back to spicebench execution

* chore: auto-fix cargo fmt + clippy

* Update run_spicebench.yml

---------

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
diff --git a/.github/workflows/run_spicebench.yml b/.github/workflows/run_spicebench.yml
@@ -37,20 +37,14 @@ on:
         required: false
         default: 'us-east-1'
         type: string
-      etl_endpoint:
-        description: 'S3 endpoint URL for ETL bucket (for MinIO/LocalStack)'
+      etl_sink:
+        description: 'ETL sink (hive or adbc)'
         required: false
-        type: string
-      table_format:
-        description: 'Table format across generation and adapter setup (iceberg, parquet, delta)'
-        required: false
-        default: 'parquet'
-        type: string
-      executor_instance_type:
-        description: 'Executor instance type label for benchmark comparison and dashboarding'
-        required: false
-        default: 'github-hosted-ubuntu-latest'
-        type: string
+        default: 'hive'
+        type: choice
+        options:
+          - hive
+          - adbc
       num_query_clients:
         description: 'Number of query clients to run (maps to spicebench --concurrency)'
         required: false
@@ -126,7 +120,6 @@ jobs:
           DATABRICKS_SQL_WAREHOUSE_ID: ${{ secrets.DATABRICKS_SQL_WAREHOUSE_ID }}
           DATABRICKS_CATALOG: ${{ secrets.DATABRICKS_CATALOG }}
           DATABRICKS_SCHEMA: ${{ secrets.DATABRICKS_SCHEMA }}
-          DATABRICKS_TABLE_FORMAT: ${{ github.event.inputs.table_format || 'parquet' }}
         run: |
           set -euo pipefail
           SYSTEM_UNDER_TEST_PREFIX="${SYSTEM_UNDER_TEST%%-*}"
@@ -187,12 +180,10 @@ jobs:
           esac
 
       - name: Install ADBC driver
-        env:
-          SYSTEM_UNDER_TEST: ${{ github.event.inputs.system_under_test || 'spice_cloud' }}
-          EXECUTOR_INSTANCE_TYPE: ${{ github.event.inputs.executor_instance_type || 'github-hosted-ubuntu-latest' }}
         run: |
           set -euo pipefail
           curl -LsSf https://dbc.columnar.tech/install.sh | sh
+          SYSTEM_UNDER_TEST="${{ github.event.inputs.system_under_test || 'spice_cloud' }}"
           SYSTEM_UNDER_TEST_PREFIX="${SYSTEM_UNDER_TEST%%-*}"
 
           if [ "${SYSTEM_UNDER_TEST_PREFIX}" = "databricks" ]; then
@@ -211,23 +202,26 @@ jobs:
           DATABRICKS_SQL_WAREHOUSE_ID: ${{ secrets.DATABRICKS_SQL_WAREHOUSE_ID }}
           DATABRICKS_CATALOG: ${{ secrets.DATABRICKS_CATALOG }}
           DATABRICKS_SCHEMA: ${{ secrets.DATABRICKS_SCHEMA }}
-          DATABRICKS_TABLE_FORMAT: ${{ github.event.inputs.table_format || 'parquet' }}
-          SPICEAI_BENCHMARK_METRICS_KEY: ${{ secrets.SPICEAI_BENCHMARK_METRICS_KEY }}
           SCENARIO: ${{ github.event.inputs.scenario || 'tpch' }}
           SYSTEM_UNDER_TEST: ${{ github.event.inputs.system_under_test || 'spice_cloud' }}
-          EXECUTOR_INSTANCE_TYPE: ${{ github.event.inputs.executor_instance_type || 'github-hosted-ubuntu-latest' }}
           NUM_QUERY_CLIENTS: ${{ github.event.inputs.num_query_clients || '8' }}
           ETL_BUCKET: ${{ github.event.inputs.etl_bucket }}
           ETL_PREFIX: ${{ github.event.inputs.etl_prefix || 'data-gen' }}
           ETL_VERSION: ${{ github.event.inputs.etl_version }}
           ETL_REGION: ${{ github.event.inputs.etl_region || 'us-east-1' }}
-          ETL_ENDPOINT: ${{ github.event.inputs.etl_endpoint }}
+          ETL_SINK: ${{ github.event.inputs.etl_sink || 'hive' }}
+          SPICEAI_BENCHMARK_METRICS_KEY: ${{ secrets.SPICEAI_BENCHMARK_METRICS_KEY }}
           AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
           AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
           SPIDAPTER_ICEBERG_REGION: us-west-1
           SPIDAPTER_ICEBERG_CATALOG_FROM: iceberg:https://glue.us-west-1.amazonaws.com/iceberg/v1/catalogs/211125479522/namespaces
           RUST_LOG: 'info'
         run: |
+          set -euo pipefail
+          TABLE_FORMAT="parquet"
+          EXECUTOR_INSTANCE_TYPE="github-hosted-ubuntu-latest"
+          ETL_ENDPOINT=""
+          DATABRICKS_TABLE_FORMAT="${TABLE_FORMAT}"
           SYSTEM_UNDER_TEST_PREFIX="${SYSTEM_UNDER_TEST%%-*}"
           ETL_ARGS="--etl-bucket ${ETL_BUCKET} --etl-version ${ETL_VERSION}"
           if [ -n "${ETL_PREFIX}" ]; then
@@ -236,20 +230,25 @@ jobs:
           if [ -n "${ETL_REGION}" ]; then
             ETL_ARGS="${ETL_ARGS} --etl-region ${ETL_REGION}"
           fi
-          if [ -n "${ETL_ENDPOINT}" ]; then
+          if [ -n "${ETL_ENDPOINT:-}" ]; then
             ETL_ARGS="${ETL_ARGS} --etl-endpoint ${ETL_ENDPOINT}"
           fi
 
+          ETL_SINK_ARGS="--etl-sink ${ETL_SINK} --table-format ${TABLE_FORMAT}"
+          if [ "${ETL_SINK}" = "adbc" ]; then
+            :
+          fi
+
           if [ "${SYSTEM_UNDER_TEST_PREFIX}" = "databricks" ]; then
             ADAPTER_CMD="${HOME}/.spice/bin/databricks-system-adapter"
             ADAPTER_ARGS="stdio"
             ADAPTER_ENVS="--system-adapter-env DATABRICKS_ENDPOINT=${DATABRICKS_ENDPOINT} --system-adapter-env DATABRICKS_TOKEN=${DATABRICKS_TOKEN} --system-adapter-env DATABRICKS_HTTP_PATH=${DATABRICKS_HTTP_PATH} --system-adapter-env DATABRICKS_SQL_WAREHOUSE_ID=${DATABRICKS_SQL_WAREHOUSE_ID} --system-adapter-env DATABRICKS_TABLE_FORMAT=${DATABRICKS_TABLE_FORMAT}"
 
-            if [ -n "${DATABRICKS_CATALOG}" ]; then
+            if [ -n "${DATABRICKS_CATALOG:-}" ]; then
               ADAPTER_ENVS="${ADAPTER_ENVS} --system-adapter-env DATABRICKS_CATALOG=${DATABRICKS_CATALOG}"
             fi
 
-            if [ -n "${DATABRICKS_SCHEMA}" ]; then
+            if [ -n "${DATABRICKS_SCHEMA:-}" ]; then
               ADAPTER_ENVS="${ADAPTER_ENVS} --system-adapter-env DATABRICKS_SCHEMA=${DATABRICKS_SCHEMA}"
             fi
           else
@@ -263,6 +262,7 @@ jobs:
             --scenario "${SCENARIO}" \
             --executor-instance-type "${EXECUTOR_INSTANCE_TYPE}" \
             ${ETL_ARGS} \
+            ${ETL_SINK_ARGS} \
             --system-adapter-stdio-cmd "${ADAPTER_CMD}" \
             --system-adapter-stdio-args "${ADAPTER_ARGS}" \
             ${ADAPTER_ENVS}
diff --git a/crates/data-generation/src/dataset/tpch.rs b/crates/data-generation/src/dataset/tpch.rs
@@ -15,6 +15,7 @@ limitations under the License.
 */
 
 use std::collections::HashMap;
+use std::collections::VecDeque;
 use std::sync::atomic::{AtomicI64, AtomicU16, Ordering};
 use std::sync::{Arc, Mutex};
 
@@ -72,6 +73,34 @@ const SF1_ROW_COUNTS: &[(&str, u64)] = &[
     ("lineitem", 6_001_215),
 ];
 
+const MIN_TPCH_ROWS_PER_FILE: usize = 32_000;
+const MAX_TPCH_ROWS_PER_FILE: usize = 64_000;
+const DEFAULT_TPCH_MAX_ROWS_PER_FILE: usize = 48_000;
+
+fn tpch_max_rows_per_file() -> usize {
+    std::env::var("SPICEBENCH_TPCH_MAX_ROWS_PER_FILE")
+        .ok()
+        .and_then(|v| v.parse::<usize>().ok())
+        .filter(|v| *v > 0)
+        .map(|v| v.clamp(MIN_TPCH_ROWS_PER_FILE, MAX_TPCH_ROWS_PER_FILE))
+        .unwrap_or(DEFAULT_TPCH_MAX_ROWS_PER_FILE)
+}
+
+fn split_record_batch(batch: RecordBatch, max_rows: usize) -> VecDeque<RecordBatch> {
+    if batch.num_rows() <= max_rows {
+        return VecDeque::from([batch]);
+    }
+
+    let mut out = VecDeque::new();
+    let mut offset = 0usize;
+    while offset < batch.num_rows() {
+        let len = std::cmp::min(max_rows, batch.num_rows() - offset);
+        out.push_back(batch.slice(offset, len));
+        offset += len;
+    }
+    out
+}
+
 /// Returns the expected total number of rows for a given table at the
 /// specified scale factor.
 fn total_rows_for_table(table: &str, scale_factor: f64) -> u64 {
@@ -364,12 +393,16 @@ pub struct TpchDataset {
     mutations: MutationConfig,
     /// Per-table step counter tracking which part to generate next (0-indexed).
     table_steps: HashMap<String, AtomicU16>,
+    /// Per-table queue of already-generated chunks waiting to be emitted.
+    pending_batches: HashMap<String, Mutex<VecDeque<RecordBatch>>>,
     /// Per-table primary key tracking for update/delete targeting.
     key_sets: HashMap<String, Mutex<IndexedKeySet<PrimaryKeyValue>>>,
     /// Global monotonically increasing operation counter for replay ordering.
     op_counter: AtomicI64,
     /// The storage backend for reading/writing table metadata.
     storage: Arc<dyn DataStorage>,
+    /// Maximum number of rows per emitted batch/file.
+    max_rows_per_file: usize,
 }
 
 impl TpchDataset {
@@ -394,14 +427,25 @@ impl TpchDataset {
             .map(|(name, _)| (name.to_string(), AtomicU16::new(0)))
             .collect();
 
+        let pending_batches: HashMap<String, Mutex<VecDeque<RecordBatch>>> = TPCH_TABLES
+            .iter()
+            .map(|(name, _)| (name.to_string(), Mutex::new(VecDeque::new())))
+            .collect();
+
+        let max_rows_per_file = tpch_max_rows_per_file();
+
+        info!(max_rows_per_file, "Configured TPCH maximum rows per file");
+
         Ok(Self {
             scale_factor: config.scale_factor,
             num_steps: config.num_steps,
             mutations: mutations.clone(),
             table_steps,
+            pending_batches,
             key_sets,
             op_counter: AtomicI64::new(0),
             storage,
+            max_rows_per_file,
         })
     }
 }
@@ -463,6 +507,15 @@ impl Dataset for TpchDataset {
     }
 
     async fn raw_next_batch(&self, table: &str) -> anyhow::Result<Option<RecordBatch>> {
+        if let Some(queued) = self.pending_batches.get(table) {
+            let mut queued = queued
+                .lock()
+                .map_err(|e| anyhow::anyhow!("lock poisoned: {e}"))?;
+            if let Some(batch) = queued.pop_front() {
+                return Ok(Some(batch));
+            }
+        }
+
         // Each table independently tracks which step (part) it is on.
         let step_counter = self
             .table_steps
@@ -602,7 +655,23 @@ impl Dataset for TpchDataset {
         let op_indices: Vec<i64> = (op_base..op_base + total_rows as i64).collect();
         columns.push(Arc::new(Int64Array::from(op_indices)));
 
-        Ok(Some(RecordBatch::try_new(schema, columns)?))
+        let combined_batch = RecordBatch::try_new(schema, columns)?;
+        let mut chunks = split_record_batch(combined_batch, self.max_rows_per_file);
+
+        let first = chunks
+            .pop_front()
+            .ok_or_else(|| anyhow::anyhow!("internal error: no chunks produced"))?;
+
+        if !chunks.is_empty()
+            && let Some(queued) = self.pending_batches.get(table)
+        {
+            let mut queued = queued
+                .lock()
+                .map_err(|e| anyhow::anyhow!("lock poisoned: {e}"))?;
+            queued.extend(chunks);
+        }
+
+        Ok(Some(first))
     }
 
     fn tables(&self) -> HashMap<String, DatasetTable> {
@@ -716,7 +785,7 @@ mod tests {
     }
 
     #[tokio::test]
-    async fn tpch_num_batches_matches_emitted_batches_per_table() {
+    async fn tpch_num_batches_is_a_lower_bound_for_emitted_batches_per_table() {
         let dataset = build_dataset(1.0, 7);
 
         for (table, _) in TPCH_TABLES {
@@ -731,14 +800,35 @@ mod tests {
                 emitted_batches += 1;
             }
 
-            assert_eq!(
-                emitted_batches,
-                dataset.num_batches(table),
-                "unexpected batch count for table '{table}'"
+            assert!(
+                emitted_batches >= dataset.num_batches(table),
+                "emitted batches should be >= planned batches for table '{table}'"
             );
         }
     }
 
+    #[tokio::test]
+    async fn tpch_batches_are_capped_to_max_rows_per_file() {
+        let dataset = build_dataset(1.0, 7);
+
+        let mut saw_split = false;
+        while let Some(batch) = dataset
+            .raw_next_batch("lineitem")
+            .await
+            .expect("raw_next_batch should not fail")
+        {
+            assert!(
+                batch.num_rows() <= DEFAULT_TPCH_MAX_ROWS_PER_FILE,
+                "lineitem chunk exceeded max rows per file"
+            );
+            if batch.num_rows() == DEFAULT_TPCH_MAX_ROWS_PER_FILE {
+                saw_split = true;
+            }
+        }
+
+        assert!(saw_split, "expected at least one full-size split chunk");
+    }
+
     #[tokio::test]
     async fn tpch_zero_mutation_batches_have_create_only_ops() {
         let dataset = build_dataset(1.0, 10);
diff --git a/crates/data-generation/src/storage/s3.rs b/crates/data-generation/src/storage/s3.rs
@@ -234,7 +234,7 @@ impl DataStorage for S3Storage {
 
         // Serialize RecordBatch to Parquet bytes in memory
         let props = WriterProperties::builder()
-            .set_compression(Compression::SNAPPY)
+            .set_compression(Compression::LZ4)
             .build();
 
         let mut buf = Vec::new();
diff --git a/src/args/mod.rs b/src/args/mod.rs
@@ -27,6 +27,13 @@ pub enum TableFormat {
     Delta,
 }
 
+#[derive(Clone, Debug, ValueEnum)]
+#[value(rename_all = "lower")]
+pub enum EtlSink {
+    Hive,
+    Adbc,
+}
+
 impl std::fmt::Display for TableFormat {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         let value = match self {
@@ -120,6 +127,10 @@ pub struct CommonArgs {
     #[arg(long, default_value = "")]
     pub(crate) etl_target_base_prefix: String,
 
+    /// ETL sink implementation used for loading generated data.
+    #[arg(long, value_enum, default_value = "hive")]
+    pub(crate) etl_sink: EtlSink,
+
     /// AWS region for the ETL S3 bucket
     #[arg(long, default_value = "us-east-1")]
     pub(crate) etl_region: Option<String>,
diff --git a/src/main.rs b/src/main.rs