Skip to content

Commit 026cb00

Browse files
peaseeJeadie
andauthored
feat: Connect ETL to spicebench (#49)
* feat: Connect ETL pipeline in spicebench * fix: Update run spicebench args * fix * fix * chore: fmt, move pipeline start to load run * fix log * formatting --------- Co-authored-by: jeadie <jack@spice.ai>
1 parent 4c22c41 commit 026cb00

12 files changed

Lines changed: 283 additions & 65 deletions

File tree

.github/workflows/data_generation_run.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ on:
5050
runner_type:
5151
description: 'GitHub runner label to execute on'
5252
required: false
53-
default: 'ubuntu-latest'
53+
default: 'spiceai-macos'
5454
type: string
5555
scale_factor:
5656
description: 'TPC-H scale factor'

.github/workflows/run_spicebench.yml

Lines changed: 42 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -17,66 +17,39 @@ on:
1717
options:
1818
- spidapter
1919
- databricks
20-
run_data_generation:
21-
description: 'Run data-generation before spicebench'
22-
required: false
23-
default: false
24-
type: boolean
25-
data_generation_runner_type:
26-
description: 'Runner label for data-generation job'
27-
required: false
28-
default: 'ubuntu-latest'
20+
etl_bucket:
21+
description: 'S3 bucket for ETL source and target data'
22+
required: true
23+
default: "spiceai-public-datasets"
2924
type: string
30-
data_generation_scale_factor:
31-
description: 'Data-generation scale factor'
25+
etl_source_prefix:
26+
description: 'S3 key prefix for ETL source data'
3227
required: false
33-
default: '1.0'
28+
default: 'data-gen/tpch_sf1'
3429
type: string
35-
data_generation_bucket:
36-
description: 'S3 bucket for generated data (required if run_data_generation=true)'
30+
etl_target_prefix:
31+
description: 'S3 key prefix for ETL target (rehydrated) data'
3732
required: false
33+
default: 'rehydrated/tpch_sf1'
3834
type: string
39-
data_generation_prefix:
40-
description: 'Base S3 prefix for generated data (scenario appended automatically)'
35+
etl_region:
36+
description: 'AWS region for the ETL S3 bucket'
4137
required: false
42-
default: 'data-gen'
38+
default: "us-east-1"
4339
type: string
44-
data_generation_max_concurrency:
45-
description: 'Data-generation max concurrent S3 writes'
40+
etl_endpoint:
41+
description: 'S3 endpoint URL for ETL bucket (for MinIO/LocalStack)'
4642
required: false
47-
default: '8'
4843
type: string
49-
data_generation_region:
50-
description: 'AWS region for data-generation'
44+
etl_num_steps:
45+
description: 'Number of ETL data generation steps (partitions)'
5146
required: false
52-
default: 'us-east-1'
47+
default: '25'
5348
type: string
54-
data_generation_skip_initial:
55-
description: 'Skip data-generation initial ingest'
56-
required: false
57-
default: false
58-
type: boolean
5949

6050
jobs:
61-
data-generation:
62-
name: Run data generation
63-
if: ${{ github.event.inputs.run_data_generation == 'true' }}
64-
uses: ./.github/workflows/data_generation_run.yml
65-
with:
66-
scenario: ${{ github.event.inputs.scenario }}
67-
runner_type: ${{ github.event.inputs.data_generation_runner_type }}
68-
scale_factor: ${{ github.event.inputs.data_generation_scale_factor }}
69-
bucket: ${{ github.event.inputs.data_generation_bucket }}
70-
prefix: ${{ github.event.inputs.data_generation_prefix }}
71-
max_concurrency: ${{ github.event.inputs.data_generation_max_concurrency }}
72-
region: ${{ github.event.inputs.data_generation_region }}
73-
skip_initial: ${{ github.event.inputs.data_generation_skip_initial == 'true' }}
74-
secrets: inherit
75-
7651
run-spicebench:
7752
name: Run spicebench
78-
needs: [data-generation]
79-
if: ${{ always() && (needs.data-generation.result == 'success' || needs.data-generation.result == 'skipped') }}
8053
runs-on: ubuntu-latest
8154
timeout-minutes: 600
8255
steps:
@@ -222,7 +195,30 @@ jobs:
222195
SPICEAI_BENCHMARK_METRICS_KEY: ${{ secrets.SPICEAI_BENCHMARK_METRICS_KEY }}
223196
SCENARIO: ${{ github.event.inputs.scenario || 'tpch' }}
224197
SYSTEM_ADAPTER: ${{ github.event.inputs.system_adapter || 'spidapter' }}
198+
ETL_BUCKET: ${{ github.event.inputs.etl_bucket }}
199+
ETL_SOURCE_PREFIX: ${{ github.event.inputs.etl_source_prefix }}
200+
ETL_TARGET_PREFIX: ${{ github.event.inputs.etl_target_prefix }}
201+
ETL_REGION: ${{ github.event.inputs.etl_region || 'us-east-1' }}
202+
ETL_ENDPOINT: ${{ github.event.inputs.etl_endpoint }}
203+
ETL_NUM_STEPS: ${{ github.event.inputs.etl_num_steps || '25' }}
204+
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
205+
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
206+
RUST_LOG: "info"
225207
run: |
208+
ETL_ARGS="--etl-bucket ${ETL_BUCKET} --etl-num-steps ${ETL_NUM_STEPS}"
209+
if [ -n "${ETL_SOURCE_PREFIX}" ]; then
210+
ETL_ARGS="${ETL_ARGS} --etl-source-prefix ${ETL_SOURCE_PREFIX}"
211+
fi
212+
if [ -n "${ETL_TARGET_PREFIX}" ]; then
213+
ETL_ARGS="${ETL_ARGS} --etl-target-prefix ${ETL_TARGET_PREFIX}"
214+
fi
215+
if [ -n "${ETL_REGION}" ]; then
216+
ETL_ARGS="${ETL_ARGS} --etl-region ${ETL_REGION}"
217+
fi
218+
if [ -n "${ETL_ENDPOINT}" ]; then
219+
ETL_ARGS="${ETL_ARGS} --etl-endpoint ${ETL_ENDPOINT}"
220+
fi
221+
226222
if [ "${SYSTEM_ADAPTER}" = "databricks" ]; then
227223
ADAPTER_CMD="${HOME}/.spice/bin/databricks-system-adapter"
228224
ADAPTER_ARGS="stdio"
@@ -244,6 +240,7 @@ jobs:
244240
~/.spice/bin/spicebench \
245241
--concurrency 2 \
246242
--scenario "${SCENARIO}" \
243+
${ETL_ARGS} \
247244
--system-adapter-stdio-cmd "${ADAPTER_CMD}" \
248245
--system-adapter-stdio-args "${ADAPTER_ARGS}" \
249246
${ADAPTER_ENVS}

Cargo.lock

Lines changed: 4 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -136,14 +136,17 @@ adbc_client = { path = "crates/adbc_client" }
136136
arrow.workspace = true
137137
async-trait.workspace = true
138138
clap.workspace = true
139+
data-generation = { path = "crates/data-generation" }
140+
etl = { path = "crates/etl" }
139141
reqwest.workspace = true
140142
serde.workspace = true
141-
spicepod = { path = "crates/spicepod" }
142143
system-adapter-protocol = { path = "crates/system-adapter-protocol" }
143144
test-framework = { path = "crates/test-framework" }
144145
tokio.workspace = true
145146
tokio-util.workspace = true
147+
tracing.workspace = true
146148
uuid.workspace = true
149+
tracing-subscriber = { workspace = true, features = ["env-filter"] }
147150

148151
[features]
149152
default = []

crates/data-generation/src/dataset/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -266,7 +266,7 @@ pub trait Dataset: Send + Sync {
266266
async fn next_batches(&self) -> anyhow::Result<Option<HashMap<String, RecordBatch>>> {
267267
let tables = self.tables();
268268
let mut batches = HashMap::new();
269-
for (name, _) in &tables {
269+
for name in tables.keys() {
270270
if let Some(batch) = self.next_batch(name).await? {
271271
batches.insert(name.clone(), batch);
272272
}

crates/data-generation/src/target/s3.rs

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ pub struct S3Target {
3535
store: Arc<dyn ObjectStore>,
3636
bucket: String,
3737
prefix: String,
38+
region: Option<String>,
3839
}
3940

4041
impl S3Target {
@@ -59,6 +60,7 @@ impl S3Target {
5960
store,
6061
bucket: config.bucket.clone(),
6162
prefix: config.prefix.clone(),
63+
region: config.region.clone(),
6264
})
6365
}
6466

@@ -97,9 +99,21 @@ impl Target for S3Target {
9799
serde_json::Value::String("s3".to_string()),
98100
);
99101
params.insert(
100-
"location".to_string(),
102+
"from".to_string(),
101103
serde_json::Value::String(self.table_s3_path(table_name)),
102104
);
105+
params.insert(
106+
"file_format".to_string(),
107+
serde_json::Value::String("parquet".to_string()),
108+
);
109+
110+
if let Some(region) = &self.region {
111+
params.insert(
112+
"s3_region".to_string(),
113+
serde_json::Value::String(region.clone()),
114+
);
115+
}
116+
103117
params
104118
}
105119

crates/etl/Cargo.toml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@ path = "src/main.rs"
2020
anyhow.workspace = true
2121
clap = { workspace = true, features = ["derive"] }
2222
data-generation = { path = "../data-generation" }
23-
serde_json.workspace = true
2423
system-adapter-protocol = { path = "../system-adapter-protocol" }
2524
tokio.workspace = true
2625
tokio-util.workspace = true

crates/etl/src/lib.rs

Lines changed: 104 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,10 @@ pub enum PipelineState {
6161
/// The pipeline has been created with a dataset, source, and target but has
6262
/// not yet started processing.
6363
NotStarted,
64+
/// The pipeline has been initialized: the first batch for every table has
65+
/// been ETL'd into the target so the system adapter can discover initial
66+
/// data.
67+
Initialized,
6468
/// The pipeline is actively rehydrating batches (in order of batch ID) from
6569
/// the configured [`Source`] into the configured [`Target`].
6670
Running,
@@ -88,9 +92,12 @@ pub enum StopReason {
8892
/// 1. **[`NotStarted`](PipelineState::NotStarted)** — created via [`ETLPipeline::new`]
8993
/// with a dataset, source, and target. Call [`setup_request_datasets`](ETLPipeline::setup_request_datasets)
9094
/// to obtain the dataset configurations that a system adapter needs.
91-
/// 2. **[`Running`](PipelineState::Running)** — the pipeline is actively processing
92-
/// batches.
93-
/// 3. **[`Stopped`](PipelineState::Stopped)** — the pipeline finished, was cancelled,
95+
/// 2. **[`Initialized`](PipelineState::Initialized)** — the first batch (batch 0)
96+
/// has been ETL'd into the target via [`initialize`](ETLPipeline::initialize).
97+
/// The system adapter can now discover initial data.
98+
/// 3. **[`Running`](PipelineState::Running)** — the pipeline is actively processing
99+
/// remaining batches (batch 1+).
100+
/// 4. **[`Stopped`](PipelineState::Stopped)** — the pipeline finished, was cancelled,
94101
/// or hit an error.
95102
pub struct ETLPipeline {
96103
dataset_source: DatasetSource,
@@ -185,11 +192,93 @@ impl ETLPipeline {
185192
.collect()
186193
}
187194

188-
/// Starts the ETL pipeline, transitioning from [`PipelineState::NotStarted`]
195+
/// Initializes the ETL pipeline by processing only the first batch (batch
196+
/// ID 0) for every table.
197+
///
198+
/// This ensures the target has some initial data before calling
199+
/// `setup()` on the system adapter. After successful initialization the
200+
/// pipeline transitions to [`PipelineState::Initialized`].
201+
///
202+
/// Returns an error if the pipeline is not in the [`NotStarted`] state or
203+
/// if any batch fails to process.
204+
pub async fn initialize(&mut self) -> anyhow::Result<()> {
205+
if *self.state_rx.borrow() != PipelineState::NotStarted {
206+
anyhow::bail!(
207+
"Cannot initialize pipeline: current state is {:?}",
208+
*self.state_rx.borrow()
209+
);
210+
}
211+
212+
let tables = self.dataset.tables();
213+
let first_batch_id = 0u64;
214+
215+
let mut join_set: JoinSet<Result<String, String>> = JoinSet::new();
216+
for table_name in tables.keys() {
217+
let dataset = Arc::clone(&self.dataset);
218+
let source = Arc::clone(&self.source);
219+
let target = Arc::clone(&self.target);
220+
let table_name = table_name.clone();
221+
222+
join_set.spawn(async move {
223+
let read_result = source
224+
.read_batch(&table_name, first_batch_id)
225+
.await
226+
.map_err(|e| format!("read {table_name} batch {first_batch_id}: {e}"))?
227+
.ok_or_else(|| {
228+
format!("No data for table {table_name} at batch {first_batch_id}")
229+
})?;
230+
231+
for batch in read_result.batches {
232+
let rehydrated = dataset.rehydrate(&table_name, &batch).map_err(|e| {
233+
format!("rehydrate {table_name} batch {first_batch_id}: {e}")
234+
})?;
235+
236+
target
237+
.write(&table_name, first_batch_id, rehydrated)
238+
.await
239+
.map_err(|e| format!("write {table_name} batch {first_batch_id}: {e}"))?;
240+
}
241+
242+
info!(
243+
table = %table_name,
244+
batch_id = first_batch_id,
245+
"Initial batch processed"
246+
);
247+
Ok(table_name)
248+
});
249+
}
250+
251+
while let Some(result) = join_set.join_next().await {
252+
match result {
253+
Ok(Ok(_table_name)) => {}
254+
Ok(Err(err_msg)) => {
255+
let _ = self
256+
.state_tx
257+
.send(PipelineState::Stopped(StopReason::Error(err_msg.clone())));
258+
anyhow::bail!("ETL initialization failed: {err_msg}");
259+
}
260+
Err(e) => {
261+
let msg = format!("Task panicked during initialization: {e}");
262+
let _ = self
263+
.state_tx
264+
.send(PipelineState::Stopped(StopReason::Error(msg.clone())));
265+
anyhow::bail!("{msg}");
266+
}
267+
}
268+
}
269+
270+
info!("ETL pipeline initialized with first batch for all tables");
271+
let _ = self.state_tx.send(PipelineState::Initialized);
272+
Ok(())
273+
}
274+
275+
/// Starts the ETL pipeline, transitioning from [`PipelineState::Initialized`]
189276
/// to [`PipelineState::Running`].
190277
///
191278
/// Spawns a background tokio task that iterates over every table and
192-
/// processes batch IDs in ascending order. For each batch the task:
279+
/// processes batch IDs in ascending order, skipping batch 0 which was
280+
/// already processed during [`initialize`](ETLPipeline::initialize). For
281+
/// each batch the task:
193282
///
194283
/// 1. Reads the batch from the [`Source`].
195284
/// 2. Rehydrates it through the [`Dataset`] (appending time columns, etc.).
@@ -198,12 +287,13 @@ impl ETLPipeline {
198287
/// The task transitions to [`PipelineState::Stopped`] when all batches are
199288
/// processed, the [`CancellationToken`] is triggered, or an error occurs.
200289
///
201-
/// Returns an error if the pipeline is not in the [`NotStarted`] state.
290+
/// Returns an error if the pipeline is not in the [`Initialized`] state.
202291
pub fn start(&mut self) -> anyhow::Result<()> {
203-
if *self.state_rx.borrow() != PipelineState::NotStarted {
292+
let current_state = self.state_rx.borrow().clone();
293+
if current_state != PipelineState::Initialized {
204294
anyhow::bail!(
205-
"Cannot start pipeline: current state is {:?}",
206-
*self.state_rx.borrow()
295+
"Cannot start pipeline: current state is {:?} (must be Initialized)",
296+
current_state
207297
);
208298
}
209299

@@ -219,8 +309,12 @@ impl ETLPipeline {
219309
// batch_id so all tables advance together.
220310
let tables = dataset.tables();
221311
let mut work: Vec<(String, u64)> = Vec::new();
222-
for (name, _) in &tables {
312+
for name in tables.keys() {
223313
for id in dataset.batch_ids(name) {
314+
// Skip batch 0 — it was already processed during initialize().
315+
if id == 0 {
316+
continue;
317+
}
224318
work.push((name.clone(), id));
225319
}
226320
}

crates/etl/src/main.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,7 @@ async fn main() -> anyhow::Result<()> {
129129
tracing::info!(table = %name, schema = ?config.schema, "Dataset table registered");
130130
}
131131

132+
pipeline.initialize().await?;
132133
pipeline.start()?;
133134

134135
let final_state = pipeline.wait().await;

0 commit comments

Comments
 (0)