Skip to content

Commit 6327983

Browse files
feat: Staging table support for MERGE-based updates (#243)
- Add create_staging_table JSON-RPC method to system-adapter-protocol - AdbcSink accepts optional system adapter client for remote staging table creation - Strip primary_key_columns when SPICEBENCH_ADBC_UPDATE_STRATEGY=staging_table - Gracefully fall back if adapter does not support create_staging_table (METHOD_NOT_FOUND)
1 parent b5fc2ea commit 6327983

10 files changed

Lines changed: 157 additions & 18 deletions

File tree

Cargo.lock

Lines changed: 2 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

crates/etl/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ tempfile.workspace = true
3030
tokio.workspace = true
3131
tokio-util.workspace = true
3232
tracing.workspace = true
33+
uuid.workspace = true
3334

3435
[features]
3536
default = []

crates/etl/src/lib.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -874,7 +874,7 @@ async fn write_segments_for_batch(
874874
partition_columns.to_vec(),
875875
)
876876
.await
877-
.map_err(|e| format!("write {table_name_owned} batch {batch_id}: {e}"))?;
877+
.map_err(|e| format!("write {table_name_owned} batch {batch_id}: {e:#}"))?;
878878
}
879879

880880
return Ok(());

crates/etl/src/sink/adbc.rs

Lines changed: 41 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,10 @@ limitations under the License.
1515
*/
1616

1717
use std::collections::{HashMap, HashSet};
18+
use std::sync::Arc;
1819
use std::sync::atomic::{AtomicU64, Ordering};
1920
use std::time::Instant;
20-
use tokio::sync::{RwLock, mpsc};
21+
use tokio::sync::{Mutex, RwLock, mpsc};
2122
use tokio::task::JoinHandle;
2223

2324
use adbc_client::{
@@ -28,7 +29,8 @@ use arrow::datatypes::{DataType, Schema};
2829
use arrow::record_batch::RecordBatchIterator;
2930
use arrow_cast::display::array_value_to_string;
3031
use async_trait::async_trait;
31-
use system_adapter_protocol::DatasetConfig;
32+
use system_adapter_protocol::{Client as SystemAdapterClient, DatasetConfig};
33+
use uuid::Uuid;
3234

3335
use super::{InsertOp, Sink};
3436

@@ -174,6 +176,8 @@ pub struct AdbcSink {
174176
reuse_bulk_ingest_streams: bool,
175177
flush_stream_before_upsert: bool,
176178
bulk_ingest_stream_buffer: usize,
179+
/// Optional system adapter client for staging table creation.
180+
staging_adapter: Option<(Arc<Mutex<SystemAdapterClient>>, Uuid)>,
177181
}
178182

179183
impl AdbcSink {
@@ -230,11 +234,15 @@ impl AdbcSink {
230234
}
231235

232236
/// Creates a new [`AdbcSink`] backed by a connection pool.
237+
///
238+
/// When a system adapter client and run ID are provided, the `StagingTable`
239+
/// update strategy will use the adapter to create staging tables remotely.
233240
pub fn new(
234241
driver_name: &str,
235242
db_kwargs: HashMap<String, serde_json::Value>,
236243
target_db_catalog: Option<String>,
237244
target_db_schema: Option<String>,
245+
staging_adapter: Option<(Arc<Mutex<SystemAdapterClient>>, Uuid)>,
238246
) -> anyhow::Result<Self> {
239247
let update_strategy = UpdateStrategy::from_env()?;
240248
let pool_size = Self::pool_size();
@@ -269,6 +277,7 @@ impl AdbcSink {
269277
reuse_bulk_ingest_streams,
270278
flush_stream_before_upsert,
271279
bulk_ingest_stream_buffer,
280+
staging_adapter,
272281
})
273282
}
274283

@@ -1148,10 +1157,11 @@ impl AdbcSink {
11481157

11491158
/// Perform an UPDATE via a temporary staging table:
11501159
///
1151-
/// 1. Bulk-ingest the update batch into a staging table.
1152-
/// 2. `MERGE INTO target USING staging ON … WHEN MATCHED THEN UPDATE SET …`
1153-
/// 3. `DROP TABLE staging`.
1154-
fn staging_merge_update(
1160+
/// 1. Create the staging table via the hook (same schema/partitioning as target).
1161+
/// 2. Bulk-ingest the update batch into the staging table.
1162+
/// 3. `MERGE INTO target USING staging ON … WHEN MATCHED THEN UPDATE SET …`
1163+
/// 4. `DROP TABLE staging` (via hook, then SQL fallback).
1164+
async fn staging_merge_update(
11551165
&self,
11561166
conn: &mut AdbcConnection,
11571167
table_name: &str,
@@ -1182,15 +1192,30 @@ impl AdbcSink {
11821192
.as_millis();
11831193
let staging_table = format!("_spicebench_stg_{table_name}_{ts}");
11841194

1185-
// 1. Bulk-ingest batch into the staging table.
1195+
// 1. Create the staging table via the system adapter (if available).
1196+
if let Some((client, run_id)) = &self.staging_adapter {
1197+
client
1198+
.lock()
1199+
.await
1200+
.create_staging_table(*run_id, table_name, &staging_table)
1201+
.await
1202+
.map_err(|e| {
1203+
anyhow::anyhow!(
1204+
"Failed to create staging table '{staging_table}' for '{table_name}': {e}"
1205+
)
1206+
})?;
1207+
}
1208+
1209+
// 2. Bulk-ingest batch into the staging table.
11861210
if let Err(e) = self.ingest_insert_batch(conn, &staging_table, batch) {
1187-
self.drop_staging_table(conn, &staging_table);
1211+
self.drop_staging_table_best_effort(conn, &staging_table)
1212+
.await;
11881213
return Err(e.context(format!(
11891214
"Failed to ingest update data into staging table '{staging_table}'"
11901215
)));
11911216
}
11921217

1193-
// 2. MERGE INTO target from staging.
1218+
// 3. MERGE INTO target from staging.
11941219
let merge_sql = Self::build_staging_merge_sql(
11951220
&self.target_table_identifier(table_name),
11961221
&self.target_table_identifier(&staging_table),
@@ -1202,8 +1227,9 @@ impl AdbcSink {
12021227
.execute_update(&merge_sql)
12031228
.map_err(|e| anyhow::anyhow!("MERGE INTO update failed for '{table_name}': {e}"));
12041229

1205-
// 3. Drop staging table (always, even on merge failure).
1206-
self.drop_staging_table(conn, &staging_table);
1230+
// 4. Drop staging table (always, even on merge failure).
1231+
self.drop_staging_table_best_effort(conn, &staging_table)
1232+
.await;
12071233

12081234
merge_result?;
12091235
tracing::debug!(
@@ -1215,8 +1241,8 @@ impl AdbcSink {
12151241
Ok(())
12161242
}
12171243

1218-
/// Best-effort drop of a staging table.
1219-
fn drop_staging_table(&self, conn: &mut AdbcConnection, staging_table: &str) {
1244+
/// Best-effort drop of a staging table via SQL DROP TABLE.
1245+
async fn drop_staging_table_best_effort(&self, conn: &mut AdbcConnection, staging_table: &str) {
12201246
let drop_sql = format!(
12211247
"DROP TABLE IF EXISTS {}",
12221248
self.target_table_identifier(staging_table)
@@ -1372,7 +1398,8 @@ impl Sink for AdbcSink {
13721398
let mut conn = self.pool.get().map_err(|e| {
13731399
anyhow::anyhow!("Failed to get ADBC connection from pool: {e}")
13741400
})?;
1375-
self.staging_merge_update(&mut conn, table_name, batch, &key_columns)?;
1401+
self.staging_merge_update(&mut conn, table_name, batch, &key_columns)
1402+
.await?;
13761403
}
13771404
UpdateStrategy::BulkIngestUpsert => {
13781405
if self.reuse_bulk_ingest_streams && !self.flush_stream_before_upsert {

crates/system-adapter-protocol/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ serde = { workspace = true }
1212
serde_json = { workspace = true }
1313
uuid = { workspace = true, features = ["serde", "v4"] }
1414
tokio = { workspace = true, features = ["process", "io-util", "io-std"] }
15+
tracing = { workspace = true }
1516
reqwest = { workspace = true, optional = true }
1617
async-trait = { workspace = true, optional = true }
1718

crates/system-adapter-protocol/src/client.rs

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -228,6 +228,40 @@ impl Client {
228228
.ok_or_else(|| ClientError::InvalidResponse("Missing result".to_string()))
229229
}
230230

231+
/// Create a staging table for MERGE-based updates.
232+
///
233+
/// If the remote adapter does not support this method (returns
234+
/// `METHOD_NOT_FOUND`), the call is treated as a successful no-op so that
235+
/// newer spicebench versions work against older adapters.
236+
pub async fn create_staging_table(
237+
&mut self,
238+
run_id: uuid::Uuid,
239+
source_dataset: &str,
240+
staging_table_name: &str,
241+
) -> Result<crate::CreateStagingTableResponse> {
242+
let request = crate::CreateStagingTableRequest {
243+
run_id,
244+
source_dataset: source_dataset.to_string(),
245+
staging_table_name: staging_table_name.to_string(),
246+
};
247+
let rpc_request = JsonRpcRequest::new(1, crate::methods::CREATE_STAGING_TABLE, request);
248+
match self.call_typed(rpc_request).await {
249+
Ok(response) => response
250+
.result
251+
.ok_or_else(|| ClientError::InvalidResponse("Missing result".to_string())),
252+
Err(ClientError::JsonRpc(ref e)) if e.code == crate::error_codes::METHOD_NOT_FOUND => {
253+
tracing::warn!(
254+
source_dataset,
255+
staging_table_name,
256+
"System adapter does not support create_staging_table; \
257+
falling back to implicit table creation via bulk ingest"
258+
);
259+
Ok(crate::CreateStagingTableResponse { ok: true })
260+
}
261+
Err(e) => Err(e),
262+
}
263+
}
264+
231265
/// Make a typed JSON-RPC call with request and response types
232266
async fn call_typed<Req: Serialize, Resp: DeserializeOwned>(
233267
&mut self,

crates/system-adapter-protocol/src/lib.rs

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -371,10 +371,31 @@ pub mod error_codes {
371371
pub const INTERNAL_ERROR: i32 = -32603;
372372
}
373373

374+
/// Request to create a staging table for MERGE-based updates.
375+
///
376+
/// JSON-RPC method: `create_staging_table`
377+
#[derive(Debug, Clone, Serialize, Deserialize)]
378+
pub struct CreateStagingTableRequest {
379+
/// Unique identifier for the benchmark run
380+
pub run_id: Uuid,
381+
/// Name of the source dataset this staging table is based on
382+
pub source_dataset: String,
383+
/// Name to use for the staging table
384+
pub staging_table_name: String,
385+
}
386+
387+
/// Response from create staging table request
388+
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
389+
pub struct CreateStagingTableResponse {
390+
/// Indicates if the staging table was created successfully
391+
pub ok: bool,
392+
}
393+
374394
/// Method names for the system adapter protocol
375395
pub mod methods {
376396
pub const SETUP: &str = "setup";
377397
pub const TEARDOWN: &str = "teardown";
378398
pub const METRICS: &str = "metrics";
399+
pub const CREATE_STAGING_TABLE: &str = "create_staging_table";
379400
pub const RPC_METHODS: &str = "rpc.methods";
380401
}

crates/system-adapter-protocol/src/server.rs

Lines changed: 39 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,9 @@ limitations under the License.
1717
//! Server implementations for system adapter JSON-RPC protocol.
1818
1919
use crate::{
20-
DatasetConfig, EtlSinkType, JsonRpcError, JsonRpcResponse, MetricsRequest, MetricsResponse,
21-
SetupRequest, SetupResponse, TeardownRequest, TeardownResponse, error_codes, methods,
20+
CreateStagingTableRequest, CreateStagingTableResponse, DatasetConfig, EtlSinkType,
21+
JsonRpcError, JsonRpcResponse, MetricsRequest, MetricsResponse, SetupRequest, SetupResponse,
22+
TeardownRequest, TeardownResponse, error_codes, methods,
2223
};
2324
use async_trait::async_trait;
2425
use serde::de::DeserializeOwned;
@@ -106,9 +107,25 @@ pub trait Handler: Send + Sync {
106107
methods::SETUP.to_string(),
107108
methods::TEARDOWN.to_string(),
108109
methods::METRICS.to_string(),
110+
methods::CREATE_STAGING_TABLE.to_string(),
109111
methods::RPC_METHODS.to_string(),
110112
]
111113
}
114+
115+
/// Create a staging table for MERGE-based updates.
116+
///
117+
/// The staging table should have the same schema and partitioning as the
118+
/// source dataset but with the given staging table name. Adapters that
119+
/// handle staging table creation implicitly (e.g. via bulk ingest) can
120+
/// leave the default no-op implementation.
121+
async fn create_staging_table(
122+
&mut self,
123+
_run_id: Uuid,
124+
_source_dataset: &str,
125+
_staging_table_name: &str,
126+
) -> std::result::Result<CreateStagingTableResponse, String> {
127+
Ok(CreateStagingTableResponse { ok: true })
128+
}
112129
}
113130

114131
/// System adapter server
@@ -182,6 +199,9 @@ impl<H: Handler> Server<H> {
182199
methods::SETUP => self.handle_setup(&request, id.clone()).await,
183200
methods::TEARDOWN => self.handle_teardown(&request, id.clone()).await,
184201
methods::METRICS => self.handle_metrics(&request, id.clone()).await,
202+
methods::CREATE_STAGING_TABLE => {
203+
self.handle_create_staging_table(&request, id.clone()).await
204+
}
185205
methods::RPC_METHODS => self.handle_rpc_methods(id.clone()).await,
186206
_ => serde_json::to_value(JsonRpcResponse::<()>::error(
187207
id,
@@ -272,6 +292,23 @@ impl<H: Handler> Server<H> {
272292
Self::handler_response(self.handler.metrics(req.run_id, req.final_scrape).await, id)
273293
}
274294

295+
async fn handle_create_staging_table(
296+
&mut self,
297+
request: &serde_json::Value,
298+
id: serde_json::Value,
299+
) -> serde_json::Value {
300+
let req: CreateStagingTableRequest = match Self::parse_params(request, &id) {
301+
Ok(r) => r,
302+
Err(e) => return e,
303+
};
304+
Self::handler_response(
305+
self.handler
306+
.create_staging_table(req.run_id, &req.source_dataset, &req.staging_table_name)
307+
.await,
308+
id,
309+
)
310+
}
311+
275312
async fn handle_rpc_methods(&mut self, id: serde_json::Value) -> serde_json::Value {
276313
let methods = self.handler.rpc_methods();
277314
let result = serde_json::json!({ "methods": methods });

src/commands/etl_cmd.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,7 @@ pub async fn execute(args: &EtlArgs) -> anyhow::Result<()> {
138138
db_kwargs,
139139
args.adbc_catalog.clone(),
140140
args.adbc_schema.clone(),
141+
None,
141142
)?);
142143

143144
(

src/commands/run.rs

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -118,13 +118,27 @@ async fn run_benchmark(
118118
let etl_sink_type = system_adapter_protocol::EtlSinkType::Adbc;
119119
let target_config = None;
120120

121-
let datasets = ETLPipeline::create_tables_request_datasets(
121+
let mut datasets = ETLPipeline::create_tables_request_datasets(
122122
dataset_source.clone(),
123123
&generation_config,
124124
Arc::clone(&data_source),
125125
&mutations,
126126
target_config.clone(),
127127
)?;
128+
129+
// When using the staging_table update strategy, tables must NOT have
130+
// primary keys. MERGE INTO handles matching via the ON clause and uses
131+
// delete+insert execution, which conflicts with Cayenne's automatic
132+
// on_conflict: Upsert behavior on primary-key tables.
133+
let uses_staging_table = std::env::var("SPICEBENCH_ADBC_UPDATE_STRATEGY")
134+
.ok()
135+
.map(|v| v.eq_ignore_ascii_case("staging_table"))
136+
.unwrap_or(false);
137+
if uses_staging_table {
138+
for config in datasets.values_mut() {
139+
config.primary_key_columns.clear();
140+
}
141+
}
128142
let (setup_response, mut pipeline) = {
129143
let setup_response = system_adapter_client
130144
.lock()
@@ -156,6 +170,7 @@ async fn run_benchmark(
156170
db_kwargs,
157171
target_db_catalog,
158172
target_db_schema,
173+
Some((Arc::clone(&system_adapter_client), run_id)),
159174
)?);
160175

161176
let mut pipeline = ETLPipeline::new(

0 commit comments

Comments
 (0)