Skip to content

Commit 93d1c2c

Browse files
authored
Merge branch 'trunk' into peasee/260218-tpch-mutation-generation
2 parents 521c1df + 12ecd1c commit 93d1c2c

20 files changed

Lines changed: 801 additions & 363 deletions

File tree

.github/workflows/data_generation_run.yml

Lines changed: 0 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -23,14 +23,6 @@ on:
2323
required: false
2424
default: '8'
2525
type: string
26-
table_format:
27-
required: false
28-
default: 'parquet'
29-
type: string
30-
executor_instance_type:
31-
required: false
32-
default: 'github-hosted-ubuntu-latest'
33-
type: string
3426
region:
3527
required: false
3628
default: 'us-east-1'
@@ -79,16 +71,6 @@ on:
7971
required: true
8072
default: '8'
8173
type: string
82-
table_format:
83-
description: 'Table format for generated datasets (iceberg, parquet, delta)'
84-
required: false
85-
default: 'parquet'
86-
type: string
87-
executor_instance_type:
88-
description: 'Executor instance type label for benchmark comparison and dashboarding'
89-
required: false
90-
default: 'github-hosted-ubuntu-latest'
91-
type: string
9274
region:
9375
description: 'AWS region'
9476
required: true
@@ -138,8 +120,6 @@ jobs:
138120
BUCKET: ${{ inputs.bucket || github.event.inputs.bucket }}
139121
PREFIX_BASE: ${{ inputs.prefix || github.event.inputs.prefix || 'data-gen' }}
140122
MAX_CONCURRENCY: ${{ inputs.max_concurrency || github.event.inputs.max_concurrency || '8' }}
141-
TABLE_FORMAT: ${{ inputs.table_format || github.event.inputs.table_format || 'parquet' }}
142-
EXECUTOR_INSTANCE_TYPE: ${{ inputs.executor_instance_type || github.event.inputs.executor_instance_type || 'github-hosted-ubuntu-latest' }}
143123
REGION: ${{ inputs.region || github.event.inputs.region || 'us-east-1' }}
144124
SKIP_INITIAL: ${{ inputs.skip_initial || github.event.inputs.skip_initial || false }}
145125
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
@@ -154,8 +134,6 @@ jobs:
154134
ARGS="${ARGS} --bucket ${BUCKET}"
155135
ARGS="${ARGS} --prefix ${PREFIX}"
156136
ARGS="${ARGS} --max-concurrency ${MAX_CONCURRENCY}"
157-
ARGS="${ARGS} --table-format ${TABLE_FORMAT}"
158-
ARGS="${ARGS} --executor-instance-type ${EXECUTOR_INSTANCE_TYPE}"
159137
ARGS="${ARGS} --region ${REGION}"
160138
161139
if [ "${SKIP_INITIAL}" = "true" ]; then

.github/workflows/pr.yml

Lines changed: 2 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -44,10 +44,9 @@ jobs:
4444
- uses: actions/checkout@v6
4545

4646
- name: Setup Rust toolchain
47-
uses: actions-rust-lang/setup-rust-toolchain@v1
47+
uses: dtolnay/rust-toolchain@master
4848
with:
4949
toolchain: 1.91
50-
cache: false # Using GHA cache is slower than re-installing
5150

5251
- name: cargo check
5352
run: make check
@@ -70,10 +69,9 @@ jobs:
7069
token: ${{ secrets.GITHUB_TOKEN }}
7170

7271
- name: Setup Rust toolchain
73-
uses: actions-rust-lang/setup-rust-toolchain@v1
72+
uses: dtolnay/rust-toolchain@master
7473
with:
7574
toolchain: 1.91
76-
cache: false # Using GHA cache is slower than re-installing
7775

7876
- name: cargo fmt + clippy fix
7977
run: make fix

.github/workflows/run_spicebench.yml

Lines changed: 0 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -27,11 +27,6 @@ on:
2727
required: false
2828
default: 'data-gen/tpch_sf1'
2929
type: string
30-
etl_target_base_prefix:
31-
description: 'Base S3 key prefix for ETL target (rehydrated) data. A random suffix is appended per run.'
32-
required: false
33-
default: 'rehydrated'
34-
type: string
3530
etl_region:
3631
description: 'AWS region for the ETL S3 bucket'
3732
required: false
@@ -212,7 +207,6 @@ jobs:
212207
EXECUTOR_INSTANCE_TYPE: ${{ github.event.inputs.executor_instance_type || 'github-hosted-ubuntu-latest' }}
213208
ETL_BUCKET: ${{ github.event.inputs.etl_bucket }}
214209
ETL_SOURCE_PREFIX: ${{ github.event.inputs.etl_source_prefix }}
215-
ETL_TARGET_BASE_PREFIX: ${{ github.event.inputs.etl_target_base_prefix }}
216210
ETL_REGION: ${{ github.event.inputs.etl_region || 'us-east-1' }}
217211
ETL_ENDPOINT: ${{ github.event.inputs.etl_endpoint }}
218212
ETL_NUM_STEPS: ${{ github.event.inputs.etl_num_steps || '25' }}
@@ -224,9 +218,6 @@ jobs:
224218
if [ -n "${ETL_SOURCE_PREFIX}" ]; then
225219
ETL_ARGS="${ETL_ARGS} --etl-source-prefix ${ETL_SOURCE_PREFIX}"
226220
fi
227-
if [ -n "${ETL_TARGET_BASE_PREFIX}" ]; then
228-
ETL_ARGS="${ETL_ARGS} --etl-target-base-prefix ${ETL_TARGET_BASE_PREFIX}"
229-
fi
230221
if [ -n "${ETL_REGION}" ]; then
231222
ETL_ARGS="${ETL_ARGS} --etl-region ${ETL_REGION}"
232223
fi

Cargo.lock

Lines changed: 2 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

crates/data-generation/src/dataset/simple_sequence.rs

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -78,6 +78,9 @@ impl SimpleSequenceDataset {
7878
/// Returns the static Arrow schema for the `integer_sequence` table.
7979
///
8080
/// Includes change-tracking columns (`_op`, `_op_index`).
81+
///
82+
/// The time column (`__created_at`) is not included; it will be added during
83+
/// ETL rehydration.
8184
pub fn schema() -> SchemaRef {
8285
Arc::new(Schema::new(vec![
8386
Field::new("id", DataType::Int64, false),

crates/data-generation/src/dataset/tpch.rs

Lines changed: 4 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -42,8 +42,8 @@ use crate::dataset::key_set::{IndexedKeySet, PrimaryKeyValue};
4242

4343
use super::{Dataset, DatasetTable};
4444

45-
/// TPC-H table definitions: (table_name, time_column).
46-
const TPCH_TABLE_TIME_COLUMNS: &[(&str, &str)] = &[
45+
/// TPC-H table definitions: `(table_name, time_column)`.
46+
const TPCH_TABLES: &[(&str, &str)] = &[
4747
("region", "r_created_at"),
4848
("nation", "n_created_at"),
4949
("supplier", "s_created_at"),
@@ -416,10 +416,7 @@ impl Dataset for TpchDataset {
416416
}
417417

418418
fn num_batches(&self, table: &str) -> u64 {
419-
if !TPCH_TABLE_TIME_COLUMNS
420-
.iter()
421-
.any(|(name, _)| *name == table)
422-
{
419+
if !TPCH_TABLES.iter().any(|(name, _)| *name == table) {
423420
return 0;
424421
}
425422

@@ -562,7 +559,7 @@ impl Dataset for TpchDataset {
562559
}
563560

564561
fn tables(&self) -> HashMap<String, DatasetTable> {
565-
TPCH_TABLE_TIME_COLUMNS
562+
TPCH_TABLES
566563
.iter()
567564
.map(|(name, time_col)| {
568565
(

crates/data-generation/src/storage/mod.rs

Lines changed: 17 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -20,10 +20,18 @@ use arrow::array::RecordBatch;
2020
use async_trait::async_trait;
2121
use std::collections::HashMap;
2222

23+
#[derive(Debug, Clone, PartialEq, Eq)]
24+
pub enum BatchOperation {
25+
Insert,
26+
Update { key_columns: Vec<String> },
27+
Delete { key_columns: Vec<String> },
28+
}
29+
2330
pub struct ReadResult {
2431
pub batches: Vec<RecordBatch>,
2532
pub rows_read: u64,
2633
pub bytes_read: u64,
34+
pub operation: BatchOperation,
2735
}
2836

2937
pub struct WriteResult {
@@ -57,6 +65,15 @@ pub trait DataStorage: Send + Sync + 'static {
5765
batch: RecordBatch,
5866
) -> anyhow::Result<WriteResult>;
5967

68+
async fn write_batch_operation(
69+
&self,
70+
_table_name: &str,
71+
_batch_id: u64,
72+
_operation: &BatchOperation,
73+
) -> anyhow::Result<()> {
74+
Ok(())
75+
}
76+
6077
fn table_params(&self, table_name: &str) -> HashMap<String, serde_json::Value>;
6178

6279
/// Returns the list of file paths/URIs that would exist after a successful

crates/data-generation/src/storage/s3.rs

Lines changed: 115 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -33,7 +33,7 @@ use parquet::basic::Compression;
3333
use parquet::file::properties::WriterProperties;
3434
use std::collections::HashMap;
3535

36-
use super::{ReadResult, WriteResult};
36+
use super::{BatchOperation, ReadResult, WriteResult};
3737

3838
/// Unified S3 storage backend that implements both [`Source`] and [`Target`].
3939
///
@@ -102,6 +102,90 @@ impl S3Storage {
102102
ObjectPath::from(format!("{}/{table_name}/", self.prefix))
103103
}
104104
}
105+
106+
pub(crate) fn batch_metadata_object_path(&self, table_name: &str, batch_id: u64) -> ObjectPath {
107+
if self.prefix.is_empty() {
108+
ObjectPath::from(format!("{table_name}/batch-{batch_id:06}.metadata.json"))
109+
} else {
110+
ObjectPath::from(format!(
111+
"{}/{table_name}/batch-{batch_id:06}.metadata.json",
112+
self.prefix
113+
))
114+
}
115+
}
116+
117+
async fn read_batch_operation(
118+
&self,
119+
table_name: &str,
120+
batch_id: u64,
121+
) -> anyhow::Result<BatchOperation> {
122+
let metadata_path = self.batch_metadata_object_path(table_name, batch_id);
123+
let get_result = match self.store.get(&metadata_path).await {
124+
Ok(r) => r,
125+
Err(object_store::Error::NotFound { .. }) => return Ok(BatchOperation::Insert),
126+
Err(e) => return Err(e.into()),
127+
};
128+
129+
let bytes = get_result.bytes().await?;
130+
let json: serde_json::Value = serde_json::from_slice(&bytes)?;
131+
132+
let op = json
133+
.get("operation")
134+
.and_then(serde_json::Value::as_str)
135+
.or_else(|| json.get("op").and_then(serde_json::Value::as_str))
136+
.unwrap_or("insert")
137+
.to_ascii_lowercase();
138+
139+
let parse_key_columns = || -> anyhow::Result<Vec<String>> {
140+
let Some(keys_value) = json.get("key_columns") else {
141+
anyhow::bail!(
142+
"Missing 'key_columns' in metadata sidecar for {}",
143+
metadata_path
144+
);
145+
};
146+
let Some(keys) = keys_value.as_array() else {
147+
anyhow::bail!(
148+
"Invalid 'key_columns' (expected string array) in metadata sidecar for {}",
149+
metadata_path
150+
);
151+
};
152+
153+
let parsed = keys
154+
.iter()
155+
.map(|v| {
156+
v.as_str().map(ToOwned::to_owned).ok_or_else(|| {
157+
anyhow::anyhow!(
158+
"Invalid key column entry (expected string) in metadata sidecar for {}",
159+
metadata_path
160+
)
161+
})
162+
})
163+
.collect::<anyhow::Result<Vec<String>>>()?;
164+
165+
if parsed.is_empty() {
166+
anyhow::bail!(
167+
"'key_columns' cannot be empty in metadata sidecar for {}",
168+
metadata_path
169+
);
170+
}
171+
172+
Ok(parsed)
173+
};
174+
175+
match op.as_str() {
176+
"insert" => Ok(BatchOperation::Insert),
177+
"update" => Ok(BatchOperation::Update {
178+
key_columns: parse_key_columns()?,
179+
}),
180+
"delete" => Ok(BatchOperation::Delete {
181+
key_columns: parse_key_columns()?,
182+
}),
183+
other => anyhow::bail!(
184+
"Unsupported operation '{other}' in metadata sidecar for {}",
185+
metadata_path
186+
),
187+
}
188+
}
105189
}
106190

107191
#[async_trait]
@@ -179,6 +263,33 @@ impl DataStorage for S3Storage {
179263
})
180264
}
181265

266+
async fn write_batch_operation(
267+
&self,
268+
table_name: &str,
269+
batch_id: u64,
270+
operation: &BatchOperation,
271+
) -> anyhow::Result<()> {
272+
let path = self.batch_metadata_object_path(table_name, batch_id);
273+
274+
let value = match operation {
275+
BatchOperation::Insert => serde_json::json!({
276+
"operation": "insert"
277+
}),
278+
BatchOperation::Update { key_columns } => serde_json::json!({
279+
"operation": "update",
280+
"key_columns": key_columns,
281+
}),
282+
BatchOperation::Delete { key_columns } => serde_json::json!({
283+
"operation": "delete",
284+
"key_columns": key_columns,
285+
}),
286+
};
287+
288+
let bytes = serde_json::to_vec(&value)?;
289+
self.store.put(&path, PutPayload::from(bytes)).await?;
290+
Ok(())
291+
}
292+
182293
async fn list_batches(&self, table_name: &str) -> anyhow::Result<Vec<String>> {
183294
let prefix = self.table_object_prefix(table_name);
184295

@@ -218,10 +329,13 @@ impl DataStorage for S3Storage {
218329
batches.push(batch);
219330
}
220331

332+
let operation = self.read_batch_operation(table_name, batch_id).await?;
333+
221334
Ok(Some(ReadResult {
222335
batches,
223336
rows_read,
224337
bytes_read,
338+
operation,
225339
}))
226340
}
227341
}

crates/etl/Cargo.toml

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -17,9 +17,11 @@ name = "etl"
1717
path = "src/main.rs"
1818

1919
[dependencies]
20+
adbc_client = { path = "../adbc_client" }
2021
anyhow.workspace = true
2122
arrow.workspace = true
2223
async-trait.workspace = true
24+
chrono.workspace = true
2325
clap = { workspace = true, features = ["derive"] }
2426
data-generation = { path = "../data-generation" }
2527
serde_json.workspace = true
@@ -28,4 +30,3 @@ tokio.workspace = true
2830
tokio-util.workspace = true
2931
tracing.workspace = true
3032
tracing-subscriber = { workspace = true, features = ["env-filter"] }
31-
uuid = { workspace = true, features = ["v4"] }

0 commit comments

Comments
 (0)