feat: Connect ETL to spicebench (#49)

peasee · Jeadie · web-flow · commit 026cb00224c8 · 2026-02-18T17:17:35.000Z
* feat: Connect ETL pipeline in spicebench

* fix: Update run spicebench args

* fix

* fix

* chore: fmt, move pipeline start to load run

* fix log

* formatting

---------

Co-authored-by: jeadie <jack@spice.ai>
diff --git a/.github/workflows/data_generation_run.yml b/.github/workflows/data_generation_run.yml
@@ -50,7 +50,7 @@ on:
       runner_type:
         description: 'GitHub runner label to execute on'
         required: false
-        default: 'ubuntu-latest'
+        default: 'spiceai-macos'
         type: string
       scale_factor:
         description: 'TPC-H scale factor'
diff --git a/.github/workflows/run_spicebench.yml b/.github/workflows/run_spicebench.yml
@@ -17,66 +17,39 @@ on:
         options:
           - spidapter
           - databricks
-      run_data_generation:
-        description: 'Run data-generation before spicebench'
-        required: false
-        default: false
-        type: boolean
-      data_generation_runner_type:
-        description: 'Runner label for data-generation job'
-        required: false
-        default: 'ubuntu-latest'
+      etl_bucket:
+        description: 'S3 bucket for ETL source and target data'
+        required: true
+        default: "spiceai-public-datasets"
         type: string
-      data_generation_scale_factor:
-        description: 'Data-generation scale factor'
+      etl_source_prefix:
+        description: 'S3 key prefix for ETL source data'
         required: false
-        default: '1.0'
+        default: 'data-gen/tpch_sf1'
         type: string
-      data_generation_bucket:
-        description: 'S3 bucket for generated data (required if run_data_generation=true)'
+      etl_target_prefix:
+        description: 'S3 key prefix for ETL target (rehydrated) data'
         required: false
+        default: 'rehydrated/tpch_sf1'
         type: string
-      data_generation_prefix:
-        description: 'Base S3 prefix for generated data (scenario appended automatically)'
+      etl_region:
+        description: 'AWS region for the ETL S3 bucket'
         required: false
-        default: 'data-gen'
+        default: "us-east-1"
         type: string
-      data_generation_max_concurrency:
-        description: 'Data-generation max concurrent S3 writes'
+      etl_endpoint:
+        description: 'S3 endpoint URL for ETL bucket (for MinIO/LocalStack)'
         required: false
-        default: '8'
         type: string
-      data_generation_region:
-        description: 'AWS region for data-generation'
+      etl_num_steps:
+        description: 'Number of ETL data generation steps (partitions)'
         required: false
-        default: 'us-east-1'
+        default: '25'
         type: string
-      data_generation_skip_initial:
-        description: 'Skip data-generation initial ingest'
-        required: false
-        default: false
-        type: boolean
 
 jobs:
-  data-generation:
-    name: Run data generation
-    if: ${{ github.event.inputs.run_data_generation == 'true' }}
-    uses: ./.github/workflows/data_generation_run.yml
-    with:
-      scenario: ${{ github.event.inputs.scenario }}
-      runner_type: ${{ github.event.inputs.data_generation_runner_type }}
-      scale_factor: ${{ github.event.inputs.data_generation_scale_factor }}
-      bucket: ${{ github.event.inputs.data_generation_bucket }}
-      prefix: ${{ github.event.inputs.data_generation_prefix }}
-      max_concurrency: ${{ github.event.inputs.data_generation_max_concurrency }}
-      region: ${{ github.event.inputs.data_generation_region }}
-      skip_initial: ${{ github.event.inputs.data_generation_skip_initial == 'true' }}
-    secrets: inherit
-
   run-spicebench:
     name: Run spicebench
-    needs: [data-generation]
-    if: ${{ always() && (needs.data-generation.result == 'success' || needs.data-generation.result == 'skipped') }}
     runs-on: ubuntu-latest
     timeout-minutes: 600
     steps:
@@ -222,7 +195,30 @@ jobs:
           SPICEAI_BENCHMARK_METRICS_KEY: ${{ secrets.SPICEAI_BENCHMARK_METRICS_KEY }}
           SCENARIO: ${{ github.event.inputs.scenario || 'tpch' }}
           SYSTEM_ADAPTER: ${{ github.event.inputs.system_adapter || 'spidapter' }}
+          ETL_BUCKET: ${{ github.event.inputs.etl_bucket }}
+          ETL_SOURCE_PREFIX: ${{ github.event.inputs.etl_source_prefix }}
+          ETL_TARGET_PREFIX: ${{ github.event.inputs.etl_target_prefix }}
+          ETL_REGION: ${{ github.event.inputs.etl_region || 'us-east-1' }}
+          ETL_ENDPOINT: ${{ github.event.inputs.etl_endpoint }}
+          ETL_NUM_STEPS: ${{ github.event.inputs.etl_num_steps || '25' }}
+          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          RUST_LOG: "info"
         run: |
+          ETL_ARGS="--etl-bucket ${ETL_BUCKET} --etl-num-steps ${ETL_NUM_STEPS}"
+          if [ -n "${ETL_SOURCE_PREFIX}" ]; then
+            ETL_ARGS="${ETL_ARGS} --etl-source-prefix ${ETL_SOURCE_PREFIX}"
+          fi
+          if [ -n "${ETL_TARGET_PREFIX}" ]; then
+            ETL_ARGS="${ETL_ARGS} --etl-target-prefix ${ETL_TARGET_PREFIX}"
+          fi
+          if [ -n "${ETL_REGION}" ]; then
+            ETL_ARGS="${ETL_ARGS} --etl-region ${ETL_REGION}"
+          fi
+          if [ -n "${ETL_ENDPOINT}" ]; then
+            ETL_ARGS="${ETL_ARGS} --etl-endpoint ${ETL_ENDPOINT}"
+          fi
+          
           if [ "${SYSTEM_ADAPTER}" = "databricks" ]; then
             ADAPTER_CMD="${HOME}/.spice/bin/databricks-system-adapter"
             ADAPTER_ARGS="stdio"
@@ -244,6 +240,7 @@ jobs:
           ~/.spice/bin/spicebench \
             --concurrency 2  \
             --scenario "${SCENARIO}" \
+            ${ETL_ARGS} \
             --system-adapter-stdio-cmd "${ADAPTER_CMD}" \
             --system-adapter-stdio-args "${ADAPTER_ARGS}" \
             ${ADAPTER_ENVS}
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -136,14 +136,17 @@ adbc_client = { path = "crates/adbc_client" }
 arrow.workspace = true
 async-trait.workspace = true
 clap.workspace = true
+data-generation = { path = "crates/data-generation" }
+etl = { path = "crates/etl" }
 reqwest.workspace = true
 serde.workspace = true
-spicepod = { path = "crates/spicepod" }
 system-adapter-protocol = { path = "crates/system-adapter-protocol" }
 test-framework = { path = "crates/test-framework" }
 tokio.workspace = true
 tokio-util.workspace = true
+tracing.workspace = true
 uuid.workspace = true
+tracing-subscriber = { workspace = true, features = ["env-filter"] }
 
 [features]
 default = []
diff --git a/crates/data-generation/src/dataset/mod.rs b/crates/data-generation/src/dataset/mod.rs
@@ -266,7 +266,7 @@ pub trait Dataset: Send + Sync {
     async fn next_batches(&self) -> anyhow::Result<Option<HashMap<String, RecordBatch>>> {
         let tables = self.tables();
         let mut batches = HashMap::new();
-        for (name, _) in &tables {
+        for name in tables.keys() {
             if let Some(batch) = self.next_batch(name).await? {
                 batches.insert(name.clone(), batch);
             }
diff --git a/crates/data-generation/src/target/s3.rs b/crates/data-generation/src/target/s3.rs
@@ -35,6 +35,7 @@ pub struct S3Target {
     store: Arc<dyn ObjectStore>,
     bucket: String,
     prefix: String,
+    region: Option<String>,
 }
 
 impl S3Target {
@@ -59,6 +60,7 @@ impl S3Target {
             store,
             bucket: config.bucket.clone(),
             prefix: config.prefix.clone(),
+            region: config.region.clone(),
         })
     }
 
@@ -97,9 +99,21 @@ impl Target for S3Target {
             serde_json::Value::String("s3".to_string()),
         );
         params.insert(
-            "location".to_string(),
+            "from".to_string(),
             serde_json::Value::String(self.table_s3_path(table_name)),
         );
+        params.insert(
+            "file_format".to_string(),
+            serde_json::Value::String("parquet".to_string()),
+        );
+
+        if let Some(region) = &self.region {
+            params.insert(
+                "s3_region".to_string(),
+                serde_json::Value::String(region.clone()),
+            );
+        }
+
         params
     }
 
diff --git a/crates/etl/Cargo.toml b/crates/etl/Cargo.toml
@@ -20,7 +20,6 @@ path = "src/main.rs"
 anyhow.workspace = true
 clap = { workspace = true, features = ["derive"] }
 data-generation = { path = "../data-generation" }
-serde_json.workspace = true
 system-adapter-protocol = { path = "../system-adapter-protocol" }
 tokio.workspace = true
 tokio-util.workspace = true
diff --git a/crates/etl/src/lib.rs b/crates/etl/src/lib.rs
@@ -61,6 +61,10 @@ pub enum PipelineState {
     /// The pipeline has been created with a dataset, source, and target but has
     /// not yet started processing.
     NotStarted,
+    /// The pipeline has been initialized: the first batch for every table has
+    /// been ETL'd into the target so the system adapter can discover initial
+    /// data.
+    Initialized,
     /// The pipeline is actively rehydrating batches (in order of batch ID) from
     /// the configured [`Source`] into the configured [`Target`].
     Running,
@@ -88,9 +92,12 @@ pub enum StopReason {
 /// 1. **[`NotStarted`](PipelineState::NotStarted)** — created via [`ETLPipeline::new`]
 ///    with a dataset, source, and target. Call [`setup_request_datasets`](ETLPipeline::setup_request_datasets)
 ///    to obtain the dataset configurations that a system adapter needs.
-/// 2. **[`Running`](PipelineState::Running)** — the pipeline is actively processing
-///    batches.
-/// 3. **[`Stopped`](PipelineState::Stopped)** — the pipeline finished, was cancelled,
+/// 2. **[`Initialized`](PipelineState::Initialized)** — the first batch (batch 0)
+///    has been ETL'd into the target via [`initialize`](ETLPipeline::initialize).
+///    The system adapter can now discover initial data.
+/// 3. **[`Running`](PipelineState::Running)** — the pipeline is actively processing
+///    remaining batches (batch 1+).
+/// 4. **[`Stopped`](PipelineState::Stopped)** — the pipeline finished, was cancelled,
 ///    or hit an error.
 pub struct ETLPipeline {
     dataset_source: DatasetSource,
@@ -185,11 +192,93 @@ impl ETLPipeline {
             .collect()
     }
 
-    /// Starts the ETL pipeline, transitioning from [`PipelineState::NotStarted`]
+    /// Initializes the ETL pipeline by processing only the first batch (batch
+    /// ID 0) for every table.
+    ///
+    /// This ensures the target has some initial data before calling
+    /// `setup()` on the system adapter. After successful initialization the
+    /// pipeline transitions to [`PipelineState::Initialized`].
+    ///
+    /// Returns an error if the pipeline is not in the [`NotStarted`] state or
+    /// if any batch fails to process.
+    pub async fn initialize(&mut self) -> anyhow::Result<()> {
+        if *self.state_rx.borrow() != PipelineState::NotStarted {
+            anyhow::bail!(
+                "Cannot initialize pipeline: current state is {:?}",
+                *self.state_rx.borrow()
+            );
+        }
+
+        let tables = self.dataset.tables();
+        let first_batch_id = 0u64;
+
+        let mut join_set: JoinSet<Result<String, String>> = JoinSet::new();
+        for table_name in tables.keys() {
+            let dataset = Arc::clone(&self.dataset);
+            let source = Arc::clone(&self.source);
+            let target = Arc::clone(&self.target);
+            let table_name = table_name.clone();
+
+            join_set.spawn(async move {
+                let read_result = source
+                    .read_batch(&table_name, first_batch_id)
+                    .await
+                    .map_err(|e| format!("read {table_name} batch {first_batch_id}: {e}"))?
+                    .ok_or_else(|| {
+                        format!("No data for table {table_name} at batch {first_batch_id}")
+                    })?;
+
+                for batch in read_result.batches {
+                    let rehydrated = dataset.rehydrate(&table_name, &batch).map_err(|e| {
+                        format!("rehydrate {table_name} batch {first_batch_id}: {e}")
+                    })?;
+
+                    target
+                        .write(&table_name, first_batch_id, rehydrated)
+                        .await
+                        .map_err(|e| format!("write {table_name} batch {first_batch_id}: {e}"))?;
+                }
+
+                info!(
+                    table = %table_name,
+                    batch_id = first_batch_id,
+                    "Initial batch processed"
+                );
+                Ok(table_name)
+            });
+        }
+
+        while let Some(result) = join_set.join_next().await {
+            match result {
+                Ok(Ok(_table_name)) => {}
+                Ok(Err(err_msg)) => {
+                    let _ = self
+                        .state_tx
+                        .send(PipelineState::Stopped(StopReason::Error(err_msg.clone())));
+                    anyhow::bail!("ETL initialization failed: {err_msg}");
+                }
+                Err(e) => {
+                    let msg = format!("Task panicked during initialization: {e}");
+                    let _ = self
+                        .state_tx
+                        .send(PipelineState::Stopped(StopReason::Error(msg.clone())));
+                    anyhow::bail!("{msg}");
+                }
+            }
+        }
+
+        info!("ETL pipeline initialized with first batch for all tables");
+        let _ = self.state_tx.send(PipelineState::Initialized);
+        Ok(())
+    }
+
+    /// Starts the ETL pipeline, transitioning from [`PipelineState::Initialized`]
     /// to [`PipelineState::Running`].
     ///
     /// Spawns a background tokio task that iterates over every table and
-    /// processes batch IDs in ascending order. For each batch the task:
+    /// processes batch IDs in ascending order, skipping batch 0 which was
+    /// already processed during [`initialize`](ETLPipeline::initialize). For
+    /// each batch the task:
     ///
     /// 1. Reads the batch from the [`Source`].
     /// 2. Rehydrates it through the [`Dataset`] (appending time columns, etc.).
@@ -198,12 +287,13 @@ impl ETLPipeline {
     /// The task transitions to [`PipelineState::Stopped`] when all batches are
     /// processed, the [`CancellationToken`] is triggered, or an error occurs.
     ///
-    /// Returns an error if the pipeline is not in the [`NotStarted`] state.
+    /// Returns an error if the pipeline is not in the [`Initialized`] state.
     pub fn start(&mut self) -> anyhow::Result<()> {
-        if *self.state_rx.borrow() != PipelineState::NotStarted {
+        let current_state = self.state_rx.borrow().clone();
+        if current_state != PipelineState::Initialized {
             anyhow::bail!(
-                "Cannot start pipeline: current state is {:?}",
-                *self.state_rx.borrow()
+                "Cannot start pipeline: current state is {:?} (must be Initialized)",
+                current_state
             );
         }
 
@@ -219,8 +309,12 @@ impl ETLPipeline {
         // batch_id so all tables advance together.
         let tables = dataset.tables();
         let mut work: Vec<(String, u64)> = Vec::new();
-        for (name, _) in &tables {
+        for name in tables.keys() {
             for id in dataset.batch_ids(name) {
+                // Skip batch 0 — it was already processed during initialize().
+                if id == 0 {
+                    continue;
+                }
                 work.push((name.clone(), id));
             }
         }
diff --git a/crates/etl/src/main.rs b/crates/etl/src/main.rs
@@ -129,6 +129,7 @@ async fn main() -> anyhow::Result<()> {
         tracing::info!(table = %name, schema = ?config.schema, "Dataset table registered");
     }
 
+    pipeline.initialize().await?;
     pipeline.start()?;
 
     let final_state = pipeline.wait().await;
diff --git a/src/args/mod.rs b/src/args/mod.rs
diff --git a/src/commands/load/mod.rs b/src/commands/load/mod.rs
diff --git a/src/main.rs b/src/main.rs

Original file line number	Diff line number	Diff line change
`@@ -266,7 +266,7 @@ pub trait Dataset: Send + Sync {`
`266`	`266`	`async fn next_batches(&self) -> anyhow::Result<Option<HashMap<String, RecordBatch>>> {`
`267`	`267`	`let tables = self.tables();`
`268`	`268`	`let mut batches = HashMap::new();`
`269`		`- for (name, _) in &tables {`
	`269`	`+ for name in tables.keys() {`
`270`	`270`	`if let Some(batch) = self.next_batch(name).await? {`
`271`	`271`	`batches.insert(name.clone(), batch);`
`272`	`272`	`}`
Original file line number	Diff line number	Diff line change
`@@ -129,6 +129,7 @@ async fn main() -> anyhow::Result<()> {`
`129`	`129`	`tracing::info!(table = %name, schema = ?config.schema, "Dataset table registered");`
`130`	`130`	`}`
`131`	`131`
	`132`	`+ pipeline.initialize().await?;`
`132`	`133`	`pipeline.start()?;`
`133`	`134`
`134`	`135`	`let final_state = pipeline.wait().await;`