Skip to content

Commit e70c954

Browse files
fix: Add ADBC sink options back to spicebench execution (#135)
* fix: Add ADBC sink options back to spicebench execution
* chore: auto-fix cargo fmt + clippy
* Update run_spicebench.yml

---------

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
1 parent 291f4d4 commit e70c954

5 files changed

Lines changed: 267 additions & 62 deletions

File tree

.github/workflows/run_spicebench.yml

Lines changed: 24 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -37,20 +37,14 @@ on:
3737
required: false
3838
default: 'us-east-1'
3939
type: string
40-
etl_endpoint:
41-
description: 'S3 endpoint URL for ETL bucket (for MinIO/LocalStack)'
40+
etl_sink:
41+
description: 'ETL sink (hive or adbc)'
4242
required: false
43-
type: string
44-
table_format:
45-
description: 'Table format across generation and adapter setup (iceberg, parquet, delta)'
46-
required: false
47-
default: 'parquet'
48-
type: string
49-
executor_instance_type:
50-
description: 'Executor instance type label for benchmark comparison and dashboarding'
51-
required: false
52-
default: 'github-hosted-ubuntu-latest'
53-
type: string
43+
default: 'hive'
44+
type: choice
45+
options:
46+
- hive
47+
- adbc
5448
num_query_clients:
5549
description: 'Number of query clients to run (maps to spicebench --concurrency)'
5650
required: false
@@ -126,7 +120,6 @@ jobs:
126120
DATABRICKS_SQL_WAREHOUSE_ID: ${{ secrets.DATABRICKS_SQL_WAREHOUSE_ID }}
127121
DATABRICKS_CATALOG: ${{ secrets.DATABRICKS_CATALOG }}
128122
DATABRICKS_SCHEMA: ${{ secrets.DATABRICKS_SCHEMA }}
129-
DATABRICKS_TABLE_FORMAT: ${{ github.event.inputs.table_format || 'parquet' }}
130123
run: |
131124
set -euo pipefail
132125
SYSTEM_UNDER_TEST_PREFIX="${SYSTEM_UNDER_TEST%%-*}"
@@ -187,12 +180,10 @@ jobs:
187180
esac
188181
189182
- name: Install ADBC driver
190-
env:
191-
SYSTEM_UNDER_TEST: ${{ github.event.inputs.system_under_test || 'spice_cloud' }}
192-
EXECUTOR_INSTANCE_TYPE: ${{ github.event.inputs.executor_instance_type || 'github-hosted-ubuntu-latest' }}
193183
run: |
194184
set -euo pipefail
195185
curl -LsSf https://dbc.columnar.tech/install.sh | sh
186+
SYSTEM_UNDER_TEST="${{ github.event.inputs.system_under_test || 'spice_cloud' }}"
196187
SYSTEM_UNDER_TEST_PREFIX="${SYSTEM_UNDER_TEST%%-*}"
197188
198189
if [ "${SYSTEM_UNDER_TEST_PREFIX}" = "databricks" ]; then
@@ -211,23 +202,26 @@ jobs:
211202
DATABRICKS_SQL_WAREHOUSE_ID: ${{ secrets.DATABRICKS_SQL_WAREHOUSE_ID }}
212203
DATABRICKS_CATALOG: ${{ secrets.DATABRICKS_CATALOG }}
213204
DATABRICKS_SCHEMA: ${{ secrets.DATABRICKS_SCHEMA }}
214-
DATABRICKS_TABLE_FORMAT: ${{ github.event.inputs.table_format || 'parquet' }}
215-
SPICEAI_BENCHMARK_METRICS_KEY: ${{ secrets.SPICEAI_BENCHMARK_METRICS_KEY }}
216205
SCENARIO: ${{ github.event.inputs.scenario || 'tpch' }}
217206
SYSTEM_UNDER_TEST: ${{ github.event.inputs.system_under_test || 'spice_cloud' }}
218-
EXECUTOR_INSTANCE_TYPE: ${{ github.event.inputs.executor_instance_type || 'github-hosted-ubuntu-latest' }}
219207
NUM_QUERY_CLIENTS: ${{ github.event.inputs.num_query_clients || '8' }}
220208
ETL_BUCKET: ${{ github.event.inputs.etl_bucket }}
221209
ETL_PREFIX: ${{ github.event.inputs.etl_prefix || 'data-gen' }}
222210
ETL_VERSION: ${{ github.event.inputs.etl_version }}
223211
ETL_REGION: ${{ github.event.inputs.etl_region || 'us-east-1' }}
224-
ETL_ENDPOINT: ${{ github.event.inputs.etl_endpoint }}
212+
ETL_SINK: ${{ github.event.inputs.etl_sink || 'hive' }}
213+
SPICEAI_BENCHMARK_METRICS_KEY: ${{ secrets.SPICEAI_BENCHMARK_METRICS_KEY }}
225214
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
226215
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
227216
SPIDAPTER_ICEBERG_REGION: us-west-1
228217
SPIDAPTER_ICEBERG_CATALOG_FROM: iceberg:https://glue.us-west-1.amazonaws.com/iceberg/v1/catalogs/211125479522/namespaces
229218
RUST_LOG: 'info'
230219
run: |
220+
set -euo pipefail
221+
TABLE_FORMAT="parquet"
222+
EXECUTOR_INSTANCE_TYPE="github-hosted-ubuntu-latest"
223+
ETL_ENDPOINT=""
224+
DATABRICKS_TABLE_FORMAT="${TABLE_FORMAT}"
231225
SYSTEM_UNDER_TEST_PREFIX="${SYSTEM_UNDER_TEST%%-*}"
232226
ETL_ARGS="--etl-bucket ${ETL_BUCKET} --etl-version ${ETL_VERSION}"
233227
if [ -n "${ETL_PREFIX}" ]; then
@@ -236,20 +230,25 @@ jobs:
236230
if [ -n "${ETL_REGION}" ]; then
237231
ETL_ARGS="${ETL_ARGS} --etl-region ${ETL_REGION}"
238232
fi
239-
if [ -n "${ETL_ENDPOINT}" ]; then
233+
if [ -n "${ETL_ENDPOINT:-}" ]; then
240234
ETL_ARGS="${ETL_ARGS} --etl-endpoint ${ETL_ENDPOINT}"
241235
fi
242236
237+
ETL_SINK_ARGS="--etl-sink ${ETL_SINK} --table-format ${TABLE_FORMAT}"
238+
if [ "${ETL_SINK}" = "adbc" ]; then
239+
:
240+
fi
241+
243242
if [ "${SYSTEM_UNDER_TEST_PREFIX}" = "databricks" ]; then
244243
ADAPTER_CMD="${HOME}/.spice/bin/databricks-system-adapter"
245244
ADAPTER_ARGS="stdio"
246245
ADAPTER_ENVS="--system-adapter-env DATABRICKS_ENDPOINT=${DATABRICKS_ENDPOINT} --system-adapter-env DATABRICKS_TOKEN=${DATABRICKS_TOKEN} --system-adapter-env DATABRICKS_HTTP_PATH=${DATABRICKS_HTTP_PATH} --system-adapter-env DATABRICKS_SQL_WAREHOUSE_ID=${DATABRICKS_SQL_WAREHOUSE_ID} --system-adapter-env DATABRICKS_TABLE_FORMAT=${DATABRICKS_TABLE_FORMAT}"
247246
248-
if [ -n "${DATABRICKS_CATALOG}" ]; then
247+
if [ -n "${DATABRICKS_CATALOG:-}" ]; then
249248
ADAPTER_ENVS="${ADAPTER_ENVS} --system-adapter-env DATABRICKS_CATALOG=${DATABRICKS_CATALOG}"
250249
fi
251250
252-
if [ -n "${DATABRICKS_SCHEMA}" ]; then
251+
if [ -n "${DATABRICKS_SCHEMA:-}" ]; then
253252
ADAPTER_ENVS="${ADAPTER_ENVS} --system-adapter-env DATABRICKS_SCHEMA=${DATABRICKS_SCHEMA}"
254253
fi
255254
else
@@ -263,6 +262,7 @@ jobs:
263262
--scenario "${SCENARIO}" \
264263
--executor-instance-type "${EXECUTOR_INSTANCE_TYPE}" \
265264
${ETL_ARGS} \
265+
${ETL_SINK_ARGS} \
266266
--system-adapter-stdio-cmd "${ADAPTER_CMD}" \
267267
--system-adapter-stdio-args "${ADAPTER_ARGS}" \
268268
${ADAPTER_ENVS}

crates/data-generation/src/dataset/tpch.rs

Lines changed: 96 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ limitations under the License.
1515
*/
1616

1717
use std::collections::HashMap;
18+
use std::collections::VecDeque;
1819
use std::sync::atomic::{AtomicI64, AtomicU16, Ordering};
1920
use std::sync::{Arc, Mutex};
2021

@@ -72,6 +73,34 @@ const SF1_ROW_COUNTS: &[(&str, u64)] = &[
7273
("lineitem", 6_001_215),
7374
];
7475

76+
/// Smallest per-file row cap that an environment override can select.
const MIN_TPCH_ROWS_PER_FILE: usize = 32_000;
/// Largest per-file row cap that an environment override can select.
const MAX_TPCH_ROWS_PER_FILE: usize = 64_000;
/// Per-file row cap used when no usable override is present.
const DEFAULT_TPCH_MAX_ROWS_PER_FILE: usize = 48_000;

/// Resolves the maximum number of rows per emitted TPCH batch/file.
///
/// Reads the `SPICEBENCH_TPCH_MAX_ROWS_PER_FILE` environment variable:
///
/// * unset, not valid Unicode, not a non-negative integer, or `0` yields
///   [`DEFAULT_TPCH_MAX_ROWS_PER_FILE`];
/// * any other value is clamped into
///   [`MIN_TPCH_ROWS_PER_FILE`]`..=`[`MAX_TPCH_ROWS_PER_FILE`].
///
/// Surrounding whitespace in the variable is ignored.
fn tpch_max_rows_per_file() -> usize {
    std::env::var("SPICEBENCH_TPCH_MAX_ROWS_PER_FILE")
        .ok()
        // Integer parsing rejects surrounding whitespace, so an override that
        // picked up a trailing newline or padding would otherwise be silently
        // discarded in favour of the default.
        .and_then(|raw| raw.trim().parse::<usize>().ok())
        // Zero is treated as "invalid" (default) rather than clamped up to the minimum.
        .filter(|&rows| rows > 0)
        .map(|rows| rows.clamp(MIN_TPCH_ROWS_PER_FILE, MAX_TPCH_ROWS_PER_FILE))
        .unwrap_or(DEFAULT_TPCH_MAX_ROWS_PER_FILE)
}
88+
89+
/// Splits `batch` into consecutive chunks of at most `max_rows` rows each.
///
/// Chunks are returned in row order; every chunk except possibly the last
/// holds exactly `max_rows` rows. A batch that already fits within the cap is
/// returned as the only element.
///
/// A cap of `0` cannot be satisfied by any non-empty chunk, so in that case
/// the batch is returned unsplit instead of looping without making progress.
fn split_record_batch(batch: RecordBatch, max_rows: usize) -> VecDeque<RecordBatch> {
    let total_rows = batch.num_rows();
    if max_rows == 0 || total_rows <= max_rows {
        return VecDeque::from([batch]);
    }

    // `max_rows` is non-zero here, which `step_by` requires.
    (0..total_rows)
        .step_by(max_rows)
        .map(|offset| batch.slice(offset, max_rows.min(total_rows - offset)))
        .collect()
}
103+
75104
/// Returns the expected total number of rows for a given table at the
76105
/// specified scale factor.
77106
fn total_rows_for_table(table: &str, scale_factor: f64) -> u64 {
@@ -364,12 +393,16 @@ pub struct TpchDataset {
364393
mutations: MutationConfig,
365394
/// Per-table step counter tracking which part to generate next (0-indexed).
366395
table_steps: HashMap<String, AtomicU16>,
396+
/// Per-table queue of already-generated chunks waiting to be emitted.
397+
pending_batches: HashMap<String, Mutex<VecDeque<RecordBatch>>>,
367398
/// Per-table primary key tracking for update/delete targeting.
368399
key_sets: HashMap<String, Mutex<IndexedKeySet<PrimaryKeyValue>>>,
369400
/// Global monotonically increasing operation counter for replay ordering.
370401
op_counter: AtomicI64,
371402
/// The storage backend for reading/writing table metadata.
372403
storage: Arc<dyn DataStorage>,
404+
/// Maximum number of rows per emitted batch/file.
405+
max_rows_per_file: usize,
373406
}
374407

375408
impl TpchDataset {
@@ -394,14 +427,25 @@ impl TpchDataset {
394427
.map(|(name, _)| (name.to_string(), AtomicU16::new(0)))
395428
.collect();
396429

430+
let pending_batches: HashMap<String, Mutex<VecDeque<RecordBatch>>> = TPCH_TABLES
431+
.iter()
432+
.map(|(name, _)| (name.to_string(), Mutex::new(VecDeque::new())))
433+
.collect();
434+
435+
let max_rows_per_file = tpch_max_rows_per_file();
436+
437+
info!(max_rows_per_file, "Configured TPCH maximum rows per file");
438+
397439
Ok(Self {
398440
scale_factor: config.scale_factor,
399441
num_steps: config.num_steps,
400442
mutations: mutations.clone(),
401443
table_steps,
444+
pending_batches,
402445
key_sets,
403446
op_counter: AtomicI64::new(0),
404447
storage,
448+
max_rows_per_file,
405449
})
406450
}
407451
}
@@ -463,6 +507,15 @@ impl Dataset for TpchDataset {
463507
}
464508

465509
async fn raw_next_batch(&self, table: &str) -> anyhow::Result<Option<RecordBatch>> {
510+
if let Some(queued) = self.pending_batches.get(table) {
511+
let mut queued = queued
512+
.lock()
513+
.map_err(|e| anyhow::anyhow!("lock poisoned: {e}"))?;
514+
if let Some(batch) = queued.pop_front() {
515+
return Ok(Some(batch));
516+
}
517+
}
518+
466519
// Each table independently tracks which step (part) it is on.
467520
let step_counter = self
468521
.table_steps
@@ -602,7 +655,23 @@ impl Dataset for TpchDataset {
602655
let op_indices: Vec<i64> = (op_base..op_base + total_rows as i64).collect();
603656
columns.push(Arc::new(Int64Array::from(op_indices)));
604657

605-
Ok(Some(RecordBatch::try_new(schema, columns)?))
658+
let combined_batch = RecordBatch::try_new(schema, columns)?;
659+
let mut chunks = split_record_batch(combined_batch, self.max_rows_per_file);
660+
661+
let first = chunks
662+
.pop_front()
663+
.ok_or_else(|| anyhow::anyhow!("internal error: no chunks produced"))?;
664+
665+
if !chunks.is_empty()
666+
&& let Some(queued) = self.pending_batches.get(table)
667+
{
668+
let mut queued = queued
669+
.lock()
670+
.map_err(|e| anyhow::anyhow!("lock poisoned: {e}"))?;
671+
queued.extend(chunks);
672+
}
673+
674+
Ok(Some(first))
606675
}
607676

608677
fn tables(&self) -> HashMap<String, DatasetTable> {
@@ -716,7 +785,7 @@ mod tests {
716785
}
717786

718787
#[tokio::test]
719-
async fn tpch_num_batches_matches_emitted_batches_per_table() {
788+
async fn tpch_num_batches_is_a_lower_bound_for_emitted_batches_per_table() {
720789
let dataset = build_dataset(1.0, 7);
721790

722791
for (table, _) in TPCH_TABLES {
@@ -731,14 +800,35 @@ mod tests {
731800
emitted_batches += 1;
732801
}
733802

734-
assert_eq!(
735-
emitted_batches,
736-
dataset.num_batches(table),
737-
"unexpected batch count for table '{table}'"
803+
assert!(
804+
emitted_batches >= dataset.num_batches(table),
805+
"emitted batches should be >= planned batches for table '{table}'"
738806
);
739807
}
740808
}
741809

810+
#[tokio::test]
811+
async fn tpch_batches_are_capped_to_max_rows_per_file() {
812+
let dataset = build_dataset(1.0, 7);
813+
814+
let mut saw_split = false;
815+
while let Some(batch) = dataset
816+
.raw_next_batch("lineitem")
817+
.await
818+
.expect("raw_next_batch should not fail")
819+
{
820+
assert!(
821+
batch.num_rows() <= DEFAULT_TPCH_MAX_ROWS_PER_FILE,
822+
"lineitem chunk exceeded max rows per file"
823+
);
824+
if batch.num_rows() == DEFAULT_TPCH_MAX_ROWS_PER_FILE {
825+
saw_split = true;
826+
}
827+
}
828+
829+
assert!(saw_split, "expected at least one full-size split chunk");
830+
}
831+
742832
#[tokio::test]
743833
async fn tpch_zero_mutation_batches_have_create_only_ops() {
744834
let dataset = build_dataset(1.0, 10);

crates/data-generation/src/storage/s3.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -234,7 +234,7 @@ impl DataStorage for S3Storage {
234234

235235
// Serialize RecordBatch to Parquet bytes in memory
236236
let props = WriterProperties::builder()
237-
.set_compression(Compression::SNAPPY)
237+
.set_compression(Compression::LZ4)
238238
.build();
239239

240240
let mut buf = Vec::new();

src/args/mod.rs

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,13 @@ pub enum TableFormat {
2727
Delta,
2828
}
2929

30+
#[derive(Clone, Debug, ValueEnum)]
31+
#[value(rename_all = "lower")]
32+
pub enum EtlSink {
33+
Hive,
34+
Adbc,
35+
}
36+
3037
impl std::fmt::Display for TableFormat {
3138
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
3239
let value = match self {
@@ -120,6 +127,10 @@ pub struct CommonArgs {
120127
#[arg(long, default_value = "")]
121128
pub(crate) etl_target_base_prefix: String,
122129

130+
/// ETL sink implementation used for loading generated data.
131+
#[arg(long, value_enum, default_value = "hive")]
132+
pub(crate) etl_sink: EtlSink,
133+
123134
/// AWS region for the ETL S3 bucket
124135
#[arg(long, default_value = "us-east-1")]
125136
pub(crate) etl_region: Option<String>,

0 commit comments

Comments
 (0)