spiceai
diff --git a/.github/workflows/data_generation_run.yml b/.github/workflows/data_generation_run.yml
Lines changed: 22 additions & 0 deletions
diff --git a/.github/workflows/run_spicebench.yml b/.github/workflows/run_spicebench.yml
Lines changed: 21 additions & 5 deletions
diff --git a/Cargo.lock b/Cargo.lock
Lines changed: 1 addition & 0 deletions
diff --git a/Cargo.toml b/Cargo.toml
Lines changed: 1 addition & 0 deletions
diff --git a/README.md b/README.md
Lines changed: 14 additions & 0 deletions
diff --git a/crates/data-generation/README.md b/crates/data-generation/README.md
Lines changed: 1 addition & 1 deletion
diff --git a/crates/data-generation/src/config.rs b/crates/data-generation/src/config.rs
Lines changed: 32 additions & 1 deletion
diff --git a/crates/data-generation/src/ingestor.rs b/crates/data-generation/src/generator.rs
crates/data-generation/src/ingestor.rs renamed to crates/data-generation/src/generator.rs
Lines changed: 3 additions & 34 deletions
diff --git a/crates/data-generation/src/lib.rs b/crates/data-generation/src/lib.rs
Lines changed: 1 addition & 1 deletion
diff --git a/crates/data-generation/src/main.rs b/crates/data-generation/src/main.rs
Lines changed: 8 additions & 9 deletions
@@ -23,6 +23,14 @@ on:
         required: false
         default: '8'
         type: string
+      table_format:
+        required: false
+        default: 'parquet'
+        type: string
+      executor_instance_type:
+        required: false
+        default: 'github-hosted-ubuntu-latest'
+        type: string
       region:
         required: false
         default: 'us-east-1'
@@ -71,6 +79,16 @@ on:
         required: true
         default: '8'
         type: string
+      table_format:
+        description: 'Table format for generated datasets (iceberg, parquet, delta)'
+        required: false
+        default: 'parquet'
+        type: string
+      executor_instance_type:
+        description: 'Executor instance type label for benchmark comparison and dashboarding'
+        required: false
+        default: 'github-hosted-ubuntu-latest'
+        type: string
       region:
         description: 'AWS region'
         required: true
@@ -120,6 +138,8 @@ jobs:
           BUCKET: ${{ inputs.bucket || github.event.inputs.bucket }}
           PREFIX_BASE: ${{ inputs.prefix || github.event.inputs.prefix || 'data-gen' }}
           MAX_CONCURRENCY: ${{ inputs.max_concurrency || github.event.inputs.max_concurrency || '8' }}
+          TABLE_FORMAT: ${{ inputs.table_format || github.event.inputs.table_format || 'parquet' }}
+          EXECUTOR_INSTANCE_TYPE: ${{ inputs.executor_instance_type || github.event.inputs.executor_instance_type || 'github-hosted-ubuntu-latest' }}
           REGION: ${{ inputs.region || github.event.inputs.region || 'us-east-1' }}
           SKIP_INITIAL: ${{ inputs.skip_initial || github.event.inputs.skip_initial || false }}
           AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
@@ -134,6 +154,8 @@ jobs:
           ARGS="${ARGS} --bucket ${BUCKET}"
           ARGS="${ARGS} --prefix ${PREFIX}"
           ARGS="${ARGS} --max-concurrency ${MAX_CONCURRENCY}"
+          ARGS="${ARGS} --table-format ${TABLE_FORMAT}"
+          ARGS="${ARGS} --executor-instance-type ${EXECUTOR_INSTANCE_TYPE}"
           ARGS="${ARGS} --region ${REGION}"
 
           if [ "${SKIP_INITIAL}" = "true" ]; then
 
@@ -20,7 +20,7 @@ on:
       etl_bucket:
         description: 'S3 bucket for ETL source and target data'
         required: true
-        default: "spiceai-public-datasets"
+        default: 'spiceai-public-datasets'
         type: string
       etl_source_prefix:
         description: 'S3 key prefix for ETL source data'
@@ -35,7 +35,7 @@ on:
       etl_region:
         description: 'AWS region for the ETL S3 bucket'
         required: false
-        default: "us-east-1"
+        default: 'us-east-1'
         type: string
       etl_endpoint:
         description: 'S3 endpoint URL for ETL bucket (for MinIO/LocalStack)'
@@ -46,6 +46,16 @@ on:
         required: false
         default: '25'
         type: string
+      table_format:
+        description: 'Table format used for data generation and adapter setup (iceberg, parquet, delta)'
+        required: false
+        default: 'parquet'
+        type: string
+      executor_instance_type:
+        description: 'Executor instance type label for benchmark comparison and dashboarding'
+        required: false
+        default: 'github-hosted-ubuntu-latest'
+        type: string
 
 jobs:
   run-spicebench:
@@ -112,6 +122,7 @@ jobs:
           DATABRICKS_SQL_WAREHOUSE_ID: ${{ secrets.DATABRICKS_SQL_WAREHOUSE_ID }}
           DATABRICKS_CATALOG: ${{ secrets.DATABRICKS_CATALOG }}
           DATABRICKS_SCHEMA: ${{ secrets.DATABRICKS_SCHEMA }}
+          DATABRICKS_TABLE_FORMAT: ${{ github.event.inputs.table_format || 'parquet' }}
         run: |
           set -euo pipefail
 
@@ -169,9 +180,11 @@ jobs:
               exit 1
               ;;
           esac
+
       - name: Install ADBC driver
         env:
           SYSTEM_ADAPTER: ${{ github.event.inputs.system_adapter || 'spidapter' }}
+          EXECUTOR_INSTANCE_TYPE: ${{ github.event.inputs.executor_instance_type || 'github-hosted-ubuntu-latest' }}
         run: |
           set -euo pipefail
           curl -LsSf https://dbc.columnar.tech/install.sh | sh
@@ -192,9 +205,11 @@ jobs:
           DATABRICKS_SQL_WAREHOUSE_ID: ${{ secrets.DATABRICKS_SQL_WAREHOUSE_ID }}
           DATABRICKS_CATALOG: ${{ secrets.DATABRICKS_CATALOG }}
           DATABRICKS_SCHEMA: ${{ secrets.DATABRICKS_SCHEMA }}
+          DATABRICKS_TABLE_FORMAT: ${{ github.event.inputs.table_format || 'parquet' }}
           SPICEAI_BENCHMARK_METRICS_KEY: ${{ secrets.SPICEAI_BENCHMARK_METRICS_KEY }}
           SCENARIO: ${{ github.event.inputs.scenario || 'tpch' }}
           SYSTEM_ADAPTER: ${{ github.event.inputs.system_adapter || 'spidapter' }}
+          EXECUTOR_INSTANCE_TYPE: ${{ github.event.inputs.executor_instance_type || 'github-hosted-ubuntu-latest' }}
           ETL_BUCKET: ${{ github.event.inputs.etl_bucket }}
           ETL_SOURCE_PREFIX: ${{ github.event.inputs.etl_source_prefix }}
           ETL_TARGET_BASE_PREFIX: ${{ github.event.inputs.etl_target_base_prefix }}
@@ -203,7 +218,7 @@ jobs:
           ETL_NUM_STEPS: ${{ github.event.inputs.etl_num_steps || '25' }}
           AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
           AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          RUST_LOG: "info"
+          RUST_LOG: 'info'
         run: |
           ETL_ARGS="--etl-bucket ${ETL_BUCKET} --etl-num-steps ${ETL_NUM_STEPS}"
           if [ -n "${ETL_SOURCE_PREFIX}" ]; then
@@ -218,11 +233,11 @@ jobs:
           if [ -n "${ETL_ENDPOINT}" ]; then
             ETL_ARGS="${ETL_ARGS} --etl-endpoint ${ETL_ENDPOINT}"
           fi
-          
+
           if [ "${SYSTEM_ADAPTER}" = "databricks" ]; then
             ADAPTER_CMD="${HOME}/.spice/bin/databricks-system-adapter"
             ADAPTER_ARGS="stdio"
-            ADAPTER_ENVS="--system-adapter-env DATABRICKS_ENDPOINT=${DATABRICKS_ENDPOINT} --system-adapter-env DATABRICKS_TOKEN=${DATABRICKS_TOKEN} --system-adapter-env DATABRICKS_HTTP_PATH=${DATABRICKS_HTTP_PATH} --system-adapter-env DATABRICKS_SQL_WAREHOUSE_ID=${DATABRICKS_SQL_WAREHOUSE_ID}"
+            ADAPTER_ENVS="--system-adapter-env DATABRICKS_ENDPOINT=${DATABRICKS_ENDPOINT} --system-adapter-env DATABRICKS_TOKEN=${DATABRICKS_TOKEN} --system-adapter-env DATABRICKS_HTTP_PATH=${DATABRICKS_HTTP_PATH} --system-adapter-env DATABRICKS_SQL_WAREHOUSE_ID=${DATABRICKS_SQL_WAREHOUSE_ID} --system-adapter-env DATABRICKS_TABLE_FORMAT=${DATABRICKS_TABLE_FORMAT}"
 
             if [ -n "${DATABRICKS_CATALOG}" ]; then
               ADAPTER_ENVS="${ADAPTER_ENVS} --system-adapter-env DATABRICKS_CATALOG=${DATABRICKS_CATALOG}"
@@ -240,6 +255,7 @@ jobs:
           ~/.spice/bin/spicebench \
             --concurrency 2  \
             --scenario "${SCENARIO}" \
+            --executor-instance-type "${EXECUTOR_INSTANCE_TYPE}" \
             ${ETL_ARGS} \
             --system-adapter-stdio-cmd "${ADAPTER_CMD}" \
             --system-adapter-stdio-args "${ADAPTER_ARGS}" \
 
@@ -140,6 +140,7 @@ data-generation = { path = "crates/data-generation" }
 etl = { path = "crates/etl" }
 reqwest.workspace = true
 serde.workspace = true
+serde_json.workspace = true
 system-adapter-protocol = { path = "crates/system-adapter-protocol" }
 test-framework = { path = "crates/test-framework" }
 tokio.workspace = true
 
@@ -128,6 +128,20 @@ A **Run** is a single end-to-end execution of the benchmark for one system. Each
 
 The **E2E benchmark duration** (phase 2, load test stage) is the primary ranking metric. After the load test, each query's p99 latency is compared against the baseline: >20% increase = FAIL, 10–20% = WARN, ≥3 WARNs = FAIL.
 
+### Run Metadata
+
+SpiceBench supports two run-level metadata knobs to keep cross-system comparisons consistent:
+
+| Field                    | Default                                                    | Purpose                                                                            | Propagation                                                                                                             |
+| ------------------------ | ---------------------------------------------------------- | ---------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------- |
+| `table_format`           | `parquet`                                                  | Declares the dataset table format used for creation/registration.                  | Passed through data-generation/ETL dataset params and consumed by adapters (for example, Databricks UC table creation). |
+| `executor_instance_type` | `unknown` (CLI) / `github-hosted-ubuntu-latest` (workflow) | Identifies the benchmark executor hardware class for apples-to-apples comparisons. | Sent in adapter `setup` metadata and attached as an OpenTelemetry metric attribute for dashboard filtering.             |
+
+Common CLI/workflow usage:
+
+- `spicebench --executor-instance-type "c6i.4xlarge" ...`
+- `data-generation run --table-format parquet --executor-instance-type "c6i.4xlarge" ...`
+
 ### Component Overview
 
 | Component                   | Responsibility                                                                                                                                                |
 
@@ -1,5 +1,5 @@
 # Data Generator
 
 ```bash
-cargo run -p data-generation -- run --scale-factor 1 --bucket peasee-indexes --region us-west-2 --prefix raw --num-steps 10
+cargo run -p data-generation -- run --scale-factor 1 --bucket peasee-indexes --region us-west-2 --prefix raw --num-steps 10 --table-format parquet
 ```
@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 */
 
-use clap::{Parser, Subcommand};
+use clap::{Parser, Subcommand, ValueEnum};
 
 #[derive(Parser)]
 #[command(about = "Spice.ai data generation tool - generates Arrow data and writes to S3")]
@@ -63,6 +63,14 @@ pub struct CommonArgs {
     #[arg(long, default_value = "")]
     pub prefix: String,
 
+    /// Logical table format propagated to system adapters
+    #[arg(long, value_enum, default_value = "parquet")]
+    pub table_format: TableFormat,
+
+    /// Executor instance type label propagated to adapters for dashboarding
+    #[arg(long, default_value = "unknown")]
+    pub executor_instance_type: String,
+
     /// AWS region
     #[arg(long)]
     pub region: Option<String>,
@@ -85,10 +93,31 @@ pub struct DatasetConfig {
 pub struct TargetConfig { // write-target settings; built from `CommonArgs` by `target_config()`
     pub bucket: String,
     pub prefix: String,
+    pub table_format: TableFormat, // logical table format propagated to system adapters
+    pub executor_instance_type: String, // executor instance type label, propagated to adapters for dashboarding
     pub region: Option<String>, // AWS region; `None` when `--region` is not given
     pub endpoint: Option<String>,
 }
 
+/// Logical table format of the generated datasets, propagated to system adapters.
+///
+/// On the command line the variants are spelled in lowercase (`iceberg`,
+/// `parquet`, `delta`); the [`std::fmt::Display`] impl emits the same spelling.
+///
+/// `PartialEq`/`Eq` are derived so callers and tests can compare formats directly.
+/// `Copy` is intentionally not derived: existing call sites use `.clone()`.
+#[derive(Clone, Debug, PartialEq, Eq, ValueEnum)]
+#[value(rename_all = "lower")]
+pub enum TableFormat {
+    Iceberg,
+    Parquet,
+    Delta,
+}
+
+impl std::fmt::Display for TableFormat {
+    /// Writes the lowercase name of the format: `iceberg`, `parquet`, or `delta`.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        // Emit the literal directly instead of routing it through `write!`.
+        f.write_str(match self {
+            Self::Iceberg => "iceberg",
+            Self::Parquet => "parquet",
+            Self::Delta => "delta",
+        })
+    }
+}
+
 pub struct IngestorConfig {
     pub max_concurrency: usize,
 }
@@ -106,6 +135,8 @@ impl CommonArgs {
         TargetConfig {
             bucket: self.bucket.clone(),
             prefix: self.prefix.clone(),
+            table_format: self.table_format.clone(),
+            executor_instance_type: self.executor_instance_type.clone(),
             region: self.region.clone(),
             endpoint: self.endpoint.clone(),
         }
 
@@ -26,14 +26,14 @@ use super::dataset::Dataset;
 use super::metrics::{IngestResult, Metrics};
 use super::target::Target;
 
-pub struct Ingestor {
+pub struct DataGenerator {
     dataset: Arc<dyn Dataset>,
     target: Arc<dyn Target>,
     metrics: Metrics,
     semaphore: Arc<Semaphore>,
 }
 
-impl Ingestor {
+impl DataGenerator {
     pub fn new(
         dataset: Arc<dyn Dataset>,
         target: Arc<dyn Target>,
@@ -52,38 +52,7 @@ impl Ingestor {
     ///
     /// Pulls one batch per table from the dataset using `next_batches()`, then writes
     /// them sequentially so the data is guaranteed to be present when this returns.
-    ///
-    /// If `table_location_fn` is provided, prints a JSON object mapping each table to
-    /// its connector and location, e.g.:
-    /// `{"customer": {"connector": "s3", "location": "s3://bucket/prefix/customer/"}, ...}`
-    pub async fn initialize(
-        &self,
-        table_location_fn: Option<&dyn Fn(&str) -> String>,
-    ) -> anyhow::Result<IngestResult> {
-        // Print table locations as JSON
-        if let Some(loc_fn) = table_location_fn {
-            let mut map = serde_json::Map::new();
-            for (name, table) in self.dataset.tables() {
-                let mut entry = serde_json::Map::new();
-                entry.insert(
-                    "connector".to_string(),
-                    serde_json::Value::String("s3".to_string()),
-                );
-                entry.insert(
-                    "location".to_string(),
-                    serde_json::Value::String(loc_fn(&name)),
-                );
-                if let Some(ref time_col) = table.time_column {
-                    entry.insert(
-                        "time_column".to_string(),
-                        serde_json::Value::String(time_col.clone()),
-                    );
-                }
-                map.insert(name, serde_json::Value::Object(entry));
-            }
-            println!("{}", serde_json::Value::Object(map));
-        }
-
+    pub async fn initialize(&self) -> anyhow::Result<IngestResult> {
         let table_count = self.dataset.tables().len();
 
         tracing::info!(
 
@@ -16,7 +16,7 @@ limitations under the License.
 
 pub mod config;
 pub mod dataset;
-pub mod ingestor;
+pub mod generator;
 pub mod metrics;
 pub mod source;
 pub mod storage;
 
@@ -23,7 +23,7 @@ use std::sync::Arc;
 use data_generation::config::{Cli, Command, CommonArgs};
 use data_generation::dataset;
 use data_generation::dataset::tpch::TpchDataset;
-use data_generation::ingestor::Ingestor;
+use data_generation::generator::DataGenerator;
 use data_generation::metrics::{IngestResult, Metrics};
 use data_generation::storage::s3::S3Storage;
 
@@ -50,7 +50,7 @@ fn print_summary(result: &IngestResult) {
     println!("  Avg write latency: {:?}", result.avg_write_latency);
 }
 
-fn build(args: &CommonArgs) -> anyhow::Result<(Ingestor, Arc<S3Storage>)> {
+fn build(args: &CommonArgs) -> anyhow::Result<DataGenerator> {
     let dataset_config = args.dataset_config();
     let target_config = args.target_config();
     let ingestor_config = args.ingestor_config();
@@ -72,13 +72,13 @@ fn build(args: &CommonArgs) -> anyhow::Result<(Ingestor, Arc<S3Storage>)> {
     let target = Arc::new(S3Storage::new(&target_config)?);
     let metrics = Metrics::new();
 
-    let ingestor = Ingestor::new(
+    let ingestor = DataGenerator::new(
         dataset,
-        Arc::clone(&target) as Arc<dyn Target>,
+        target as Arc<dyn Target>,
         &ingestor_config,
         metrics,
     );
-    Ok((ingestor, target))
+    Ok(ingestor)
 }
 
 #[tokio::main]
@@ -91,16 +91,15 @@ async fn main() -> anyhow::Result<()> {
 
     match cli.command {
         Command::Initialize(args) => {
-            let (ingestor, target) = build(&args)?;
-            let loc_fn = |table: &str| target.table_s3_path(table);
-            let result = ingestor.initialize(Some(&loc_fn)).await?;
+            let ingestor = build(&args)?;
+            let result = ingestor.initialize().await?;
 
             if result.write_errors > 0 {
                 anyhow::bail!("Initialization failed with {} errors", result.write_errors);
             }
         }
         Command::Run(run_args) => {
-            let (ingestor, _target) = build(&run_args.common)?;
+            let ingestor = build(&run_args.common)?;
             if run_args.skip_initial {
                 ingestor.skip_initial_batches().await?;
             }