Skip to content

Commit edb6898

Browse files
authored
Merge branch 'trunk' into peasee/260218-data-gen-mutations
2 parents 4533690 + 53f58f3 commit edb6898

26 files changed

Lines changed: 891 additions & 372 deletions

File tree

.github/workflows/data_generation_run.yml

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,14 @@ on:
2323
required: false
2424
default: '8'
2525
type: string
26+
table_format:
27+
required: false
28+
default: 'parquet'
29+
type: string
30+
executor_instance_type:
31+
required: false
32+
default: 'github-hosted-ubuntu-latest'
33+
type: string
2634
region:
2735
required: false
2836
default: 'us-east-1'
@@ -71,6 +79,16 @@ on:
7179
required: true
7280
default: '8'
7381
type: string
82+
table_format:
83+
description: 'Table format for generated datasets (iceberg, parquet, delta)'
84+
required: false
85+
default: 'parquet'
86+
type: string
87+
executor_instance_type:
88+
description: 'Executor instance type label for benchmark comparison and dashboarding'
89+
required: false
90+
default: 'github-hosted-ubuntu-latest'
91+
type: string
7492
region:
7593
description: 'AWS region'
7694
required: true
@@ -120,6 +138,8 @@ jobs:
120138
BUCKET: ${{ inputs.bucket || github.event.inputs.bucket }}
121139
PREFIX_BASE: ${{ inputs.prefix || github.event.inputs.prefix || 'data-gen' }}
122140
MAX_CONCURRENCY: ${{ inputs.max_concurrency || github.event.inputs.max_concurrency || '8' }}
141+
TABLE_FORMAT: ${{ inputs.table_format || github.event.inputs.table_format || 'parquet' }}
142+
EXECUTOR_INSTANCE_TYPE: ${{ inputs.executor_instance_type || github.event.inputs.executor_instance_type || 'github-hosted-ubuntu-latest' }}
123143
REGION: ${{ inputs.region || github.event.inputs.region || 'us-east-1' }}
124144
SKIP_INITIAL: ${{ inputs.skip_initial || github.event.inputs.skip_initial || false }}
125145
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
@@ -134,6 +154,8 @@ jobs:
134154
ARGS="${ARGS} --bucket ${BUCKET}"
135155
ARGS="${ARGS} --prefix ${PREFIX}"
136156
ARGS="${ARGS} --max-concurrency ${MAX_CONCURRENCY}"
157+
ARGS="${ARGS} --table-format ${TABLE_FORMAT}"
158+
ARGS="${ARGS} --executor-instance-type ${EXECUTOR_INSTANCE_TYPE}"
137159
ARGS="${ARGS} --region ${REGION}"
138160
139161
if [ "${SKIP_INITIAL}" = "true" ]; then

.github/workflows/run_spicebench.yml

Lines changed: 21 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ on:
2020
etl_bucket:
2121
description: 'S3 bucket for ETL source and target data'
2222
required: true
23-
default: "spiceai-public-datasets"
23+
default: 'spiceai-public-datasets'
2424
type: string
2525
etl_source_prefix:
2626
description: 'S3 key prefix for ETL source data'
@@ -35,7 +35,7 @@ on:
3535
etl_region:
3636
description: 'AWS region for the ETL S3 bucket'
3737
required: false
38-
default: "us-east-1"
38+
default: 'us-east-1'
3939
type: string
4040
etl_endpoint:
4141
description: 'S3 endpoint URL for ETL bucket (for MinIO/LocalStack)'
@@ -46,6 +46,16 @@ on:
4646
required: false
4747
default: '25'
4848
type: string
49+
table_format:
50+
description: 'Table format used for data generation and adapter setup (iceberg, parquet, delta)'
51+
required: false
52+
default: 'parquet'
53+
type: string
54+
executor_instance_type:
55+
description: 'Executor instance type label for benchmark comparison and dashboarding'
56+
required: false
57+
default: 'github-hosted-ubuntu-latest'
58+
type: string
4959

5060
jobs:
5161
run-spicebench:
@@ -112,6 +122,7 @@ jobs:
112122
DATABRICKS_SQL_WAREHOUSE_ID: ${{ secrets.DATABRICKS_SQL_WAREHOUSE_ID }}
113123
DATABRICKS_CATALOG: ${{ secrets.DATABRICKS_CATALOG }}
114124
DATABRICKS_SCHEMA: ${{ secrets.DATABRICKS_SCHEMA }}
125+
DATABRICKS_TABLE_FORMAT: ${{ github.event.inputs.table_format || 'parquet' }}
115126
run: |
116127
set -euo pipefail
117128
@@ -169,9 +180,11 @@ jobs:
169180
exit 1
170181
;;
171182
esac
183+
172184
- name: Install ADBC driver
173185
env:
174186
SYSTEM_ADAPTER: ${{ github.event.inputs.system_adapter || 'spidapter' }}
187+
EXECUTOR_INSTANCE_TYPE: ${{ github.event.inputs.executor_instance_type || 'github-hosted-ubuntu-latest' }}
175188
run: |
176189
set -euo pipefail
177190
curl -LsSf https://dbc.columnar.tech/install.sh | sh
@@ -192,9 +205,11 @@ jobs:
192205
DATABRICKS_SQL_WAREHOUSE_ID: ${{ secrets.DATABRICKS_SQL_WAREHOUSE_ID }}
193206
DATABRICKS_CATALOG: ${{ secrets.DATABRICKS_CATALOG }}
194207
DATABRICKS_SCHEMA: ${{ secrets.DATABRICKS_SCHEMA }}
208+
DATABRICKS_TABLE_FORMAT: ${{ github.event.inputs.table_format || 'parquet' }}
195209
SPICEAI_BENCHMARK_METRICS_KEY: ${{ secrets.SPICEAI_BENCHMARK_METRICS_KEY }}
196210
SCENARIO: ${{ github.event.inputs.scenario || 'tpch' }}
197211
SYSTEM_ADAPTER: ${{ github.event.inputs.system_adapter || 'spidapter' }}
212+
EXECUTOR_INSTANCE_TYPE: ${{ github.event.inputs.executor_instance_type || 'github-hosted-ubuntu-latest' }}
198213
ETL_BUCKET: ${{ github.event.inputs.etl_bucket }}
199214
ETL_SOURCE_PREFIX: ${{ github.event.inputs.etl_source_prefix }}
200215
ETL_TARGET_BASE_PREFIX: ${{ github.event.inputs.etl_target_base_prefix }}
@@ -203,7 +218,7 @@ jobs:
203218
ETL_NUM_STEPS: ${{ github.event.inputs.etl_num_steps || '25' }}
204219
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
205220
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
206-
RUST_LOG: "info"
221+
RUST_LOG: 'info'
207222
run: |
208223
ETL_ARGS="--etl-bucket ${ETL_BUCKET} --etl-num-steps ${ETL_NUM_STEPS}"
209224
if [ -n "${ETL_SOURCE_PREFIX}" ]; then
@@ -218,11 +233,11 @@ jobs:
218233
if [ -n "${ETL_ENDPOINT}" ]; then
219234
ETL_ARGS="${ETL_ARGS} --etl-endpoint ${ETL_ENDPOINT}"
220235
fi
221-
236+
222237
if [ "${SYSTEM_ADAPTER}" = "databricks" ]; then
223238
ADAPTER_CMD="${HOME}/.spice/bin/databricks-system-adapter"
224239
ADAPTER_ARGS="stdio"
225-
ADAPTER_ENVS="--system-adapter-env DATABRICKS_ENDPOINT=${DATABRICKS_ENDPOINT} --system-adapter-env DATABRICKS_TOKEN=${DATABRICKS_TOKEN} --system-adapter-env DATABRICKS_HTTP_PATH=${DATABRICKS_HTTP_PATH} --system-adapter-env DATABRICKS_SQL_WAREHOUSE_ID=${DATABRICKS_SQL_WAREHOUSE_ID}"
240+
ADAPTER_ENVS="--system-adapter-env DATABRICKS_ENDPOINT=${DATABRICKS_ENDPOINT} --system-adapter-env DATABRICKS_TOKEN=${DATABRICKS_TOKEN} --system-adapter-env DATABRICKS_HTTP_PATH=${DATABRICKS_HTTP_PATH} --system-adapter-env DATABRICKS_SQL_WAREHOUSE_ID=${DATABRICKS_SQL_WAREHOUSE_ID} --system-adapter-env DATABRICKS_TABLE_FORMAT=${DATABRICKS_TABLE_FORMAT}"
226241
227242
if [ -n "${DATABRICKS_CATALOG}" ]; then
228243
ADAPTER_ENVS="${ADAPTER_ENVS} --system-adapter-env DATABRICKS_CATALOG=${DATABRICKS_CATALOG}"
@@ -240,6 +255,7 @@ jobs:
240255
~/.spice/bin/spicebench \
241256
--concurrency 2 \
242257
--scenario "${SCENARIO}" \
258+
--executor-instance-type "${EXECUTOR_INSTANCE_TYPE}" \
243259
${ETL_ARGS} \
244260
--system-adapter-stdio-cmd "${ADAPTER_CMD}" \
245261
--system-adapter-stdio-args "${ADAPTER_ARGS}" \

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,7 @@ data-generation = { path = "crates/data-generation" }
140140
etl = { path = "crates/etl" }
141141
reqwest.workspace = true
142142
serde.workspace = true
143+
serde_json.workspace = true
143144
system-adapter-protocol = { path = "crates/system-adapter-protocol" }
144145
test-framework = { path = "crates/test-framework" }
145146
tokio.workspace = true

README.md

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,20 @@ A **Run** is a single end-to-end execution of the benchmark for one system. Each
128128

129129
The **E2E benchmark duration** (phase 2, load test stage) is the primary ranking metric. After the load test, each query's p99 latency is compared against the baseline: >20% increase = FAIL, 10–20% = WARN, ≥3 WARNs = FAIL.
130130

131+
### Run Metadata
132+
133+
SpiceBench supports two run-level metadata knobs to keep cross-system comparisons consistent:
134+
135+
| Field | Default | Purpose | Propagation |
136+
| ------------------------ | ---------------------------------------------------------- | ---------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------- |
137+
| `table_format` | `parquet` | Declares the dataset table format used for creation/registration. | Passed through data-generation/ETL dataset params and consumed by adapters (for example, Databricks UC table creation). |
138+
| `executor_instance_type` | `unknown` (CLI) / `github-hosted-ubuntu-latest` (workflow) | Identifies the benchmark executor hardware class for apples-to-apples comparisons. | Sent in adapter `setup` metadata and attached as an OpenTelemetry metric attribute for dashboard filtering. |
139+
140+
Common CLI/workflow usage:
141+
142+
- `spicebench --executor-instance-type "c6i.4xlarge" ...`
143+
- `data-generation run --table-format parquet --executor-instance-type "c6i.4xlarge" ...`
144+
131145
### Component Overview
132146

133147
| Component | Responsibility |

crates/data-generation/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# Data Generator
22

33
```bash
4-
cargo run -p data-generation -- run --scale-factor 1 --bucket peasee-indexes --region us-west-2 --prefix raw --num-steps 10
4+
cargo run -p data-generation -- run --scale-factor 1 --bucket peasee-indexes --region us-west-2 --prefix raw --num-steps 10 --table-format parquet
55
```

crates/data-generation/src/config.rs

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
1414
limitations under the License.
1515
*/
1616

17-
use clap::{Parser, Subcommand};
17+
use clap::{Parser, Subcommand, ValueEnum};
1818

1919
#[derive(Parser)]
2020
#[command(about = "Spice.ai data generation tool - generates Arrow data and writes to S3")]
@@ -63,6 +63,14 @@ pub struct CommonArgs {
6363
#[arg(long, default_value = "")]
6464
pub prefix: String,
6565

66+
/// Logical table format propagated to system adapters
67+
#[arg(long, value_enum, default_value = "parquet")]
68+
pub table_format: TableFormat,
69+
70+
/// Executor instance type label propagated to adapters for dashboarding
71+
#[arg(long, default_value = "unknown")]
72+
pub executor_instance_type: String,
73+
6674
/// AWS region
6775
#[arg(long)]
6876
pub region: Option<String>,
@@ -85,10 +93,31 @@ pub struct DatasetConfig {
8593
pub struct TargetConfig {
8694
pub bucket: String,
8795
pub prefix: String,
96+
pub table_format: TableFormat,
97+
pub executor_instance_type: String,
8898
pub region: Option<String>,
8999
pub endpoint: Option<String>,
90100
}
91101

102+
#[derive(Clone, Debug, ValueEnum)]
103+
#[value(rename_all = "lower")]
104+
pub enum TableFormat {
105+
Iceberg,
106+
Parquet,
107+
Delta,
108+
}
109+
110+
impl std::fmt::Display for TableFormat {
111+
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
112+
let value = match self {
113+
Self::Iceberg => "iceberg",
114+
Self::Parquet => "parquet",
115+
Self::Delta => "delta",
116+
};
117+
write!(f, "{value}")
118+
}
119+
}
120+
92121
pub struct IngestorConfig {
93122
pub max_concurrency: usize,
94123
}
@@ -106,6 +135,8 @@ impl CommonArgs {
106135
TargetConfig {
107136
bucket: self.bucket.clone(),
108137
prefix: self.prefix.clone(),
138+
table_format: self.table_format.clone(),
139+
executor_instance_type: self.executor_instance_type.clone(),
109140
region: self.region.clone(),
110141
endpoint: self.endpoint.clone(),
111142
}

crates/data-generation/src/dataset/mod.rs

Lines changed: 6 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -37,40 +37,26 @@ pub struct DatasetTable {
3737
pub name: String,
3838
/// The Arrow schema for the table (without the time column).
3939
pub schema: SchemaRef,
40-
/// The time column for the table, if any.
40+
/// The time column for the table.
4141
///
42-
/// When set, this column is *not* included in [`schema`] — it is appended
42+
/// This column is *not* included in [`schema`] — it is appended
4343
/// during rehydration via [`DatasetTable::rehydrate`].
44-
pub time_column: Option<String>,
44+
pub time_column: String,
4545
}
4646

4747
impl DatasetTable {
48-
/// Returns the full schema including the time column, if one is configured.
49-
///
50-
/// If `time_column` is `None`, this returns the same schema as [`schema`].
48+
/// Returns the full schema including the time column.
5149
pub fn rehydrated_schema(&self) -> SchemaRef {
52-
let Some(ref time_col) = self.time_column else {
53-
return Arc::clone(&self.schema);
54-
};
55-
5650
let ts_type = DataType::Timestamp(TimeUnit::Microsecond, Some("UTC".into()));
5751
let mut fields: Vec<_> = self.schema.fields().iter().cloned().collect();
58-
fields.push(Arc::new(Field::new(time_col, ts_type, true)));
52+
fields.push(Arc::new(Field::new(&self.time_column, ts_type, true)));
5953
Arc::new(arrow::datatypes::Schema::new(fields))
6054
}
6155

6256
/// Rehydrate a batch by appending the time column with the current timestamp.
6357
///
64-
/// If this table has no `time_column`, the batch is returned unchanged.
6558
/// The batch schema must match [`schema`] (i.e. without the time column).
6659
pub fn rehydrate(&self, batch: &RecordBatch) -> anyhow::Result<RecordBatch> {
67-
if self.time_column.is_none() {
68-
anyhow::bail!(
69-
"Cannot rehydrate table '{}' without a time column",
70-
self.name
71-
);
72-
}
73-
7460
if batch.schema() != self.schema {
7561
let mut diffs = Vec::new();
7662
let expected_fields = self.schema.fields();
@@ -309,8 +295,7 @@ pub trait Dataset: Send + Sync {
309295
/// Rehydrate a batch for the given table by appending the time column.
310296
///
311297
/// Uses the table metadata from [`tables()`] to look up the time column name
312-
/// and delegates to [`DatasetTable::rehydrate`]. If the table has no time column,
313-
/// a rehydration error is returned.
298+
/// and delegates to [`DatasetTable::rehydrate`].
314299
fn rehydrate(&self, table: &str, batch: &RecordBatch) -> anyhow::Result<RecordBatch> {
315300
let tables = self.tables();
316301
let dataset_table = tables

crates/data-generation/src/dataset/simple_sequence.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -227,7 +227,7 @@ impl Dataset for SimpleSequenceDataset {
227227
DatasetTable {
228228
name: "integer_sequence".to_string(),
229229
schema: Self::schema(),
230-
time_column: Some("inserted_at".to_string()),
230+
time_column: "inserted_at".to_string(),
231231
},
232232
)])
233233
}

crates/data-generation/src/dataset/tpch.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -365,7 +365,7 @@ impl Dataset for TpchDataset {
365365
DatasetTable {
366366
name: (*name).to_string(),
367367
schema: tpch_schema(name),
368-
time_column: Some((*time_col).to_string()),
368+
time_column: (*time_col).to_string(),
369369
},
370370
)
371371
})

0 commit comments

Comments
 (0)