Skip to content

Commit b080562

Browse files
authored
Merge branch 'trunk' into peasee/260218-refactor-duplicated-s3-sources
2 parents db65dd0 + 0b20750 commit b080562

18 files changed

Lines changed: 755 additions & 232 deletions

File tree

.github/workflows/data_generation_run.yml

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,14 @@ on:
2323
required: false
2424
default: '8'
2525
type: string
26+
table_format:
27+
required: false
28+
default: 'parquet'
29+
type: string
30+
executor_instance_type:
31+
required: false
32+
default: 'github-hosted-ubuntu-latest'
33+
type: string
2634
region:
2735
required: false
2836
default: 'us-east-1'
@@ -71,6 +79,16 @@ on:
7179
required: true
7280
default: '8'
7381
type: string
82+
table_format:
83+
description: 'Table format for generated datasets (iceberg, parquet, delta)'
84+
required: false
85+
default: 'parquet'
86+
type: string
87+
executor_instance_type:
88+
description: 'Executor instance type label for benchmark comparison and dashboarding'
89+
required: false
90+
default: 'github-hosted-ubuntu-latest'
91+
type: string
7492
region:
7593
description: 'AWS region'
7694
required: true
@@ -120,6 +138,8 @@ jobs:
120138
BUCKET: ${{ inputs.bucket || github.event.inputs.bucket }}
121139
PREFIX_BASE: ${{ inputs.prefix || github.event.inputs.prefix || 'data-gen' }}
122140
MAX_CONCURRENCY: ${{ inputs.max_concurrency || github.event.inputs.max_concurrency || '8' }}
141+
TABLE_FORMAT: ${{ inputs.table_format || github.event.inputs.table_format || 'parquet' }}
142+
EXECUTOR_INSTANCE_TYPE: ${{ inputs.executor_instance_type || github.event.inputs.executor_instance_type || 'github-hosted-ubuntu-latest' }}
123143
REGION: ${{ inputs.region || github.event.inputs.region || 'us-east-1' }}
124144
SKIP_INITIAL: ${{ inputs.skip_initial || github.event.inputs.skip_initial || false }}
125145
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
@@ -134,6 +154,8 @@ jobs:
134154
ARGS="${ARGS} --bucket ${BUCKET}"
135155
ARGS="${ARGS} --prefix ${PREFIX}"
136156
ARGS="${ARGS} --max-concurrency ${MAX_CONCURRENCY}"
157+
ARGS="${ARGS} --table-format ${TABLE_FORMAT}"
158+
ARGS="${ARGS} --executor-instance-type ${EXECUTOR_INSTANCE_TYPE}"
137159
ARGS="${ARGS} --region ${REGION}"
138160
139161
if [ "${SKIP_INITIAL}" = "true" ]; then

.github/workflows/run_spicebench.yml

Lines changed: 21 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ on:
2020
etl_bucket:
2121
description: 'S3 bucket for ETL source and target data'
2222
required: true
23-
default: "spiceai-public-datasets"
23+
default: 'spiceai-public-datasets'
2424
type: string
2525
etl_source_prefix:
2626
description: 'S3 key prefix for ETL source data'
@@ -35,7 +35,7 @@ on:
3535
etl_region:
3636
description: 'AWS region for the ETL S3 bucket'
3737
required: false
38-
default: "us-east-1"
38+
default: 'us-east-1'
3939
type: string
4040
etl_endpoint:
4141
description: 'S3 endpoint URL for ETL bucket (for MinIO/LocalStack)'
@@ -46,6 +46,16 @@ on:
4646
required: false
4747
default: '25'
4848
type: string
49+
table_format:
50+
description: 'Table format across generation and adapter setup (iceberg, parquet, delta)'
51+
required: false
52+
default: 'parquet'
53+
type: string
54+
executor_instance_type:
55+
description: 'Executor instance type label for benchmark comparison and dashboarding'
56+
required: false
57+
default: 'github-hosted-ubuntu-latest'
58+
type: string
4959

5060
jobs:
5161
run-spicebench:
@@ -112,6 +122,7 @@ jobs:
112122
DATABRICKS_SQL_WAREHOUSE_ID: ${{ secrets.DATABRICKS_SQL_WAREHOUSE_ID }}
113123
DATABRICKS_CATALOG: ${{ secrets.DATABRICKS_CATALOG }}
114124
DATABRICKS_SCHEMA: ${{ secrets.DATABRICKS_SCHEMA }}
125+
DATABRICKS_TABLE_FORMAT: ${{ github.event.inputs.table_format || 'parquet' }}
115126
run: |
116127
set -euo pipefail
117128
@@ -169,9 +180,11 @@ jobs:
169180
exit 1
170181
;;
171182
esac
183+
172184
- name: Install ADBC driver
173185
env:
174186
SYSTEM_ADAPTER: ${{ github.event.inputs.system_adapter || 'spidapter' }}
187+
EXECUTOR_INSTANCE_TYPE: ${{ github.event.inputs.executor_instance_type || 'github-hosted-ubuntu-latest' }}
175188
run: |
176189
set -euo pipefail
177190
curl -LsSf https://dbc.columnar.tech/install.sh | sh
@@ -192,9 +205,11 @@ jobs:
192205
DATABRICKS_SQL_WAREHOUSE_ID: ${{ secrets.DATABRICKS_SQL_WAREHOUSE_ID }}
193206
DATABRICKS_CATALOG: ${{ secrets.DATABRICKS_CATALOG }}
194207
DATABRICKS_SCHEMA: ${{ secrets.DATABRICKS_SCHEMA }}
208+
DATABRICKS_TABLE_FORMAT: ${{ github.event.inputs.table_format || 'parquet' }}
195209
SPICEAI_BENCHMARK_METRICS_KEY: ${{ secrets.SPICEAI_BENCHMARK_METRICS_KEY }}
196210
SCENARIO: ${{ github.event.inputs.scenario || 'tpch' }}
197211
SYSTEM_ADAPTER: ${{ github.event.inputs.system_adapter || 'spidapter' }}
212+
EXECUTOR_INSTANCE_TYPE: ${{ github.event.inputs.executor_instance_type || 'github-hosted-ubuntu-latest' }}
198213
ETL_BUCKET: ${{ github.event.inputs.etl_bucket }}
199214
ETL_SOURCE_PREFIX: ${{ github.event.inputs.etl_source_prefix }}
200215
ETL_TARGET_BASE_PREFIX: ${{ github.event.inputs.etl_target_base_prefix }}
@@ -203,7 +218,7 @@ jobs:
203218
ETL_NUM_STEPS: ${{ github.event.inputs.etl_num_steps || '25' }}
204219
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
205220
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
206-
RUST_LOG: "info"
221+
RUST_LOG: 'info'
207222
run: |
208223
ETL_ARGS="--etl-bucket ${ETL_BUCKET} --etl-num-steps ${ETL_NUM_STEPS}"
209224
if [ -n "${ETL_SOURCE_PREFIX}" ]; then
@@ -218,11 +233,11 @@ jobs:
218233
if [ -n "${ETL_ENDPOINT}" ]; then
219234
ETL_ARGS="${ETL_ARGS} --etl-endpoint ${ETL_ENDPOINT}"
220235
fi
221-
236+
222237
if [ "${SYSTEM_ADAPTER}" = "databricks" ]; then
223238
ADAPTER_CMD="${HOME}/.spice/bin/databricks-system-adapter"
224239
ADAPTER_ARGS="stdio"
225-
ADAPTER_ENVS="--system-adapter-env DATABRICKS_ENDPOINT=${DATABRICKS_ENDPOINT} --system-adapter-env DATABRICKS_TOKEN=${DATABRICKS_TOKEN} --system-adapter-env DATABRICKS_HTTP_PATH=${DATABRICKS_HTTP_PATH} --system-adapter-env DATABRICKS_SQL_WAREHOUSE_ID=${DATABRICKS_SQL_WAREHOUSE_ID}"
240+
ADAPTER_ENVS="--system-adapter-env DATABRICKS_ENDPOINT=${DATABRICKS_ENDPOINT} --system-adapter-env DATABRICKS_TOKEN=${DATABRICKS_TOKEN} --system-adapter-env DATABRICKS_HTTP_PATH=${DATABRICKS_HTTP_PATH} --system-adapter-env DATABRICKS_SQL_WAREHOUSE_ID=${DATABRICKS_SQL_WAREHOUSE_ID} --system-adapter-env DATABRICKS_TABLE_FORMAT=${DATABRICKS_TABLE_FORMAT}"
226241
227242
if [ -n "${DATABRICKS_CATALOG}" ]; then
228243
ADAPTER_ENVS="${ADAPTER_ENVS} --system-adapter-env DATABRICKS_CATALOG=${DATABRICKS_CATALOG}"
@@ -240,6 +255,7 @@ jobs:
240255
~/.spice/bin/spicebench \
241256
--concurrency 2 \
242257
--scenario "${SCENARIO}" \
258+
--executor-instance-type "${EXECUTOR_INSTANCE_TYPE}" \
243259
${ETL_ARGS} \
244260
--system-adapter-stdio-cmd "${ADAPTER_CMD}" \
245261
--system-adapter-stdio-args "${ADAPTER_ARGS}" \

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,7 @@ data-generation = { path = "crates/data-generation" }
140140
etl = { path = "crates/etl" }
141141
reqwest.workspace = true
142142
serde.workspace = true
143+
serde_json.workspace = true
143144
system-adapter-protocol = { path = "crates/system-adapter-protocol" }
144145
test-framework = { path = "crates/test-framework" }
145146
tokio.workspace = true

README.md

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,20 @@ A **Run** is a single end-to-end execution of the benchmark for one system. Each
128128

129129
The **E2E benchmark duration** (phase 2, load test stage) is the primary ranking metric. After the load test, each query's p99 latency is compared against the baseline: >20% increase = FAIL, 10–20% = WARN, ≥3 WARNs = FAIL.
130130

131+
### Run Metadata
132+
133+
SpiceBench supports two run-level metadata knobs to keep cross-system comparisons consistent:
134+
135+
| Field | Default | Purpose | Propagation |
136+
| ------------------------ | ---------------------------------------------------------- | ---------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------- |
137+
| `table_format` | `parquet` | Declares the dataset table format used for creation/registration. | Passed through data-generation/ETL dataset params and consumed by adapters (for example, Databricks UC table creation). |
138+
| `executor_instance_type` | `unknown` (CLI) / `github-hosted-ubuntu-latest` (workflow) | Identifies the benchmark executor hardware class for apples-to-apples comparisons. | Sent in adapter `setup` metadata and attached as an OpenTelemetry metric attribute for dashboard filtering. |
139+
140+
Common CLI/workflow usage:
141+
142+
- `spicebench --executor-instance-type "c6i.4xlarge" ...`
143+
- `data-generation run --table-format parquet --executor-instance-type "c6i.4xlarge" ...`
144+
131145
### Component Overview
132146

133147
| Component | Responsibility |

crates/data-generation/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# Data Generator
22

33
```bash
4-
cargo run -p data-generation -- run --scale-factor 1 --bucket peasee-indexes --region us-west-2 --prefix raw --num-steps 10
4+
cargo run -p data-generation -- run --scale-factor 1 --bucket peasee-indexes --region us-west-2 --prefix raw --num-steps 10 --table-format parquet
55
```

crates/data-generation/src/config.rs

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
1414
limitations under the License.
1515
*/
1616

17-
use clap::{Parser, Subcommand};
17+
use clap::{Parser, Subcommand, ValueEnum};
1818

1919
#[derive(Parser)]
2020
#[command(about = "Spice.ai data generation tool - generates Arrow data and writes to S3")]
@@ -63,6 +63,14 @@ pub struct CommonArgs {
6363
#[arg(long, default_value = "")]
6464
pub prefix: String,
6565

66+
/// Logical table format propagated to system adapters
67+
#[arg(long, value_enum, default_value = "parquet")]
68+
pub table_format: TableFormat,
69+
70+
/// Executor instance type label propagated to adapters for dashboarding
71+
#[arg(long, default_value = "unknown")]
72+
pub executor_instance_type: String,
73+
6674
/// AWS region
6775
#[arg(long)]
6876
pub region: Option<String>,
@@ -85,10 +93,31 @@ pub struct DatasetConfig {
8593
pub struct TargetConfig {
8694
pub bucket: String,
8795
pub prefix: String,
96+
pub table_format: TableFormat,
97+
pub executor_instance_type: String,
8898
pub region: Option<String>,
8999
pub endpoint: Option<String>,
90100
}
91101

102+
#[derive(Clone, Debug, ValueEnum)]
103+
#[value(rename_all = "lower")]
104+
pub enum TableFormat {
105+
Iceberg,
106+
Parquet,
107+
Delta,
108+
}
109+
110+
impl std::fmt::Display for TableFormat {
111+
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
112+
let value = match self {
113+
Self::Iceberg => "iceberg",
114+
Self::Parquet => "parquet",
115+
Self::Delta => "delta",
116+
};
117+
write!(f, "{value}")
118+
}
119+
}
120+
92121
pub struct IngestorConfig {
93122
pub max_concurrency: usize,
94123
}
@@ -106,6 +135,8 @@ impl CommonArgs {
106135
TargetConfig {
107136
bucket: self.bucket.clone(),
108137
prefix: self.prefix.clone(),
138+
table_format: self.table_format.clone(),
139+
executor_instance_type: self.executor_instance_type.clone(),
109140
region: self.region.clone(),
110141
endpoint: self.endpoint.clone(),
111142
}
Lines changed: 3 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -26,14 +26,14 @@ use super::dataset::Dataset;
2626
use super::metrics::{IngestResult, Metrics};
2727
use super::target::Target;
2828

29-
pub struct Ingestor {
29+
pub struct DataGenerator {
3030
dataset: Arc<dyn Dataset>,
3131
target: Arc<dyn Target>,
3232
metrics: Metrics,
3333
semaphore: Arc<Semaphore>,
3434
}
3535

36-
impl Ingestor {
36+
impl DataGenerator {
3737
pub fn new(
3838
dataset: Arc<dyn Dataset>,
3939
target: Arc<dyn Target>,
@@ -52,38 +52,7 @@ impl Ingestor {
5252
///
5353
/// Pulls one batch per table from the dataset using `next_batches()`, then writes
5454
/// them sequentially so the data is guaranteed to be present when this returns.
55-
///
56-
/// If `table_location_fn` is provided, prints a JSON object mapping each table to
57-
/// its connector and location, e.g.:
58-
/// `{"customer": {"connector": "s3", "location": "s3://bucket/prefix/customer/"}, ...}`
59-
pub async fn initialize(
60-
&self,
61-
table_location_fn: Option<&dyn Fn(&str) -> String>,
62-
) -> anyhow::Result<IngestResult> {
63-
// Print table locations as JSON
64-
if let Some(loc_fn) = table_location_fn {
65-
let mut map = serde_json::Map::new();
66-
for (name, table) in self.dataset.tables() {
67-
let mut entry = serde_json::Map::new();
68-
entry.insert(
69-
"connector".to_string(),
70-
serde_json::Value::String("s3".to_string()),
71-
);
72-
entry.insert(
73-
"location".to_string(),
74-
serde_json::Value::String(loc_fn(&name)),
75-
);
76-
if let Some(ref time_col) = table.time_column {
77-
entry.insert(
78-
"time_column".to_string(),
79-
serde_json::Value::String(time_col.clone()),
80-
);
81-
}
82-
map.insert(name, serde_json::Value::Object(entry));
83-
}
84-
println!("{}", serde_json::Value::Object(map));
85-
}
86-
55+
pub async fn initialize(&self) -> anyhow::Result<IngestResult> {
8756
let table_count = self.dataset.tables().len();
8857

8958
tracing::info!(

crates/data-generation/src/lib.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ limitations under the License.
1616

1717
pub mod config;
1818
pub mod dataset;
19-
pub mod ingestor;
19+
pub mod generator;
2020
pub mod metrics;
2121
pub mod source;
2222
pub mod storage;

crates/data-generation/src/main.rs

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ use std::sync::Arc;
2323
use data_generation::config::{Cli, Command, CommonArgs};
2424
use data_generation::dataset;
2525
use data_generation::dataset::tpch::TpchDataset;
26-
use data_generation::ingestor::Ingestor;
26+
use data_generation::generator::DataGenerator;
2727
use data_generation::metrics::{IngestResult, Metrics};
2828
use data_generation::storage::s3::S3Storage;
2929

@@ -50,7 +50,7 @@ fn print_summary(result: &IngestResult) {
5050
println!(" Avg write latency: {:?}", result.avg_write_latency);
5151
}
5252

53-
fn build(args: &CommonArgs) -> anyhow::Result<(Ingestor, Arc<S3Storage>)> {
53+
fn build(args: &CommonArgs) -> anyhow::Result<DataGenerator> {
5454
let dataset_config = args.dataset_config();
5555
let target_config = args.target_config();
5656
let ingestor_config = args.ingestor_config();
@@ -72,13 +72,13 @@ fn build(args: &CommonArgs) -> anyhow::Result<(Ingestor, Arc<S3Storage>)> {
7272
let target = Arc::new(S3Storage::new(&target_config)?);
7373
let metrics = Metrics::new();
7474

75-
let ingestor = Ingestor::new(
75+
let ingestor = DataGenerator::new(
7676
dataset,
77-
Arc::clone(&target) as Arc<dyn Target>,
77+
target as Arc<dyn Target>,
7878
&ingestor_config,
7979
metrics,
8080
);
81-
Ok((ingestor, target))
81+
Ok(ingestor)
8282
}
8383

8484
#[tokio::main]
@@ -91,16 +91,15 @@ async fn main() -> anyhow::Result<()> {
9191

9292
match cli.command {
9393
Command::Initialize(args) => {
94-
let (ingestor, target) = build(&args)?;
95-
let loc_fn = |table: &str| target.table_s3_path(table);
96-
let result = ingestor.initialize(Some(&loc_fn)).await?;
94+
let ingestor = build(&args)?;
95+
let result = ingestor.initialize().await?;
9796

9897
if result.write_errors > 0 {
9998
anyhow::bail!("Initialization failed with {} errors", result.write_errors);
10099
}
101100
}
102101
Command::Run(run_args) => {
103-
let (ingestor, _target) = build(&run_args.common)?;
102+
let ingestor = build(&run_args.common)?;
104103
if run_args.skip_initial {
105104
ingestor.skip_initial_batches().await?;
106105
}

0 commit comments

Comments
 (0)