spiceai
diff --git a/‎.github/workflows/data_generation_run.yml‎
Lines changed: 22 additions & 0 deletions b/‎.github/workflows/data_generation_run.yml‎
Lines changed: 22 additions & 0 deletions
diff --git a/‎.github/workflows/run_spicebench.yml‎
Lines changed: 21 additions & 5 deletions b/‎.github/workflows/run_spicebench.yml‎
Lines changed: 21 additions & 5 deletions
diff --git a/‎Cargo.lock‎
Lines changed: 1 addition & 0 deletions b/‎Cargo.lock‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎Cargo.toml‎
Lines changed: 1 addition & 0 deletions b/‎Cargo.toml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎README.md‎
Lines changed: 14 additions & 0 deletions b/‎README.md‎
Lines changed: 14 additions & 0 deletions
diff --git a/‎crates/data-generation/README.md‎
Lines changed: 1 addition & 1 deletion b/‎crates/data-generation/README.md‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎crates/data-generation/src/config.rs‎
Lines changed: 32 additions & 1 deletion b/‎crates/data-generation/src/config.rs‎
Lines changed: 32 additions & 1 deletion
diff --git a/‎crates/data-generation/src/dataset/mod.rs‎
Lines changed: 6 additions & 21 deletions b/‎crates/data-generation/src/dataset/mod.rs‎
Lines changed: 6 additions & 21 deletions
diff --git a/‎crates/data-generation/src/dataset/simple_sequence.rs‎
Lines changed: 1 addition & 1 deletion b/‎crates/data-generation/src/dataset/simple_sequence.rs‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎crates/data-generation/src/dataset/tpch.rs‎
Lines changed: 1 addition & 1 deletion b/‎crates/data-generation/src/dataset/tpch.rs‎
Lines changed: 1 addition & 1 deletion
@@ -23,6 +23,14 @@ on:
         required: false
         default: '8'
         type: string
+      table_format:
+        required: false
+        default: 'parquet'
+        type: string
+      executor_instance_type:
+        required: false
+        default: 'github-hosted-ubuntu-latest'
+        type: string
       region:
         required: false
         default: 'us-east-1'
@@ -71,6 +79,16 @@ on:
         required: true
         default: '8'
         type: string
+      table_format:
+        description: 'Table format for generated datasets (iceberg, parquet, delta)'
+        required: false
+        default: 'parquet'
+        type: string
+      executor_instance_type:
+        description: 'Executor instance type label for benchmark comparison and dashboarding'
+        required: false
+        default: 'github-hosted-ubuntu-latest'
+        type: string
       region:
         description: 'AWS region'
         required: true
@@ -120,6 +138,8 @@ jobs:
           BUCKET: ${{ inputs.bucket || github.event.inputs.bucket }}
           PREFIX_BASE: ${{ inputs.prefix || github.event.inputs.prefix || 'data-gen' }}
           MAX_CONCURRENCY: ${{ inputs.max_concurrency || github.event.inputs.max_concurrency || '8' }}
+          TABLE_FORMAT: ${{ inputs.table_format || github.event.inputs.table_format || 'parquet' }}
+          EXECUTOR_INSTANCE_TYPE: ${{ inputs.executor_instance_type || github.event.inputs.executor_instance_type || 'github-hosted-ubuntu-latest' }}
           REGION: ${{ inputs.region || github.event.inputs.region || 'us-east-1' }}
           SKIP_INITIAL: ${{ inputs.skip_initial || github.event.inputs.skip_initial || false }}
           AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
@@ -134,6 +154,8 @@ jobs:
           ARGS="${ARGS} --bucket ${BUCKET}"
           ARGS="${ARGS} --prefix ${PREFIX}"
           ARGS="${ARGS} --max-concurrency ${MAX_CONCURRENCY}"
+          ARGS="${ARGS} --table-format ${TABLE_FORMAT}"
+          ARGS="${ARGS} --executor-instance-type ${EXECUTOR_INSTANCE_TYPE}"
           ARGS="${ARGS} --region ${REGION}"
 
           if [ "${SKIP_INITIAL}" = "true" ]; then
 
@@ -20,7 +20,7 @@ on:
       etl_bucket:
         description: 'S3 bucket for ETL source and target data'
         required: true
-        default: "spiceai-public-datasets"
+        default: 'spiceai-public-datasets'
         type: string
       etl_source_prefix:
         description: 'S3 key prefix for ETL source data'
@@ -35,7 +35,7 @@ on:
       etl_region:
         description: 'AWS region for the ETL S3 bucket'
         required: false
-        default: "us-east-1"
+        default: 'us-east-1'
         type: string
       etl_endpoint:
         description: 'S3 endpoint URL for ETL bucket (for MinIO/LocalStack)'
@@ -46,6 +46,16 @@ on:
         required: false
         default: '25'
         type: string
+      table_format:
+        description: 'Table format across generation and adapter setup (iceberg, parquet, delta)'
+        required: false
+        default: 'parquet'
+        type: string
+      executor_instance_type:
+        description: 'Executor instance type label for benchmark comparison and dashboarding'
+        required: false
+        default: 'github-hosted-ubuntu-latest'
+        type: string
 
 jobs:
   run-spicebench:
@@ -112,6 +122,7 @@ jobs:
           DATABRICKS_SQL_WAREHOUSE_ID: ${{ secrets.DATABRICKS_SQL_WAREHOUSE_ID }}
           DATABRICKS_CATALOG: ${{ secrets.DATABRICKS_CATALOG }}
           DATABRICKS_SCHEMA: ${{ secrets.DATABRICKS_SCHEMA }}
+          DATABRICKS_TABLE_FORMAT: ${{ github.event.inputs.table_format || 'parquet' }}
         run: |
           set -euo pipefail
 
@@ -169,9 +180,11 @@ jobs:
               exit 1
               ;;
           esac
+
       - name: Install ADBC driver
         env:
           SYSTEM_ADAPTER: ${{ github.event.inputs.system_adapter || 'spidapter' }}
+          EXECUTOR_INSTANCE_TYPE: ${{ github.event.inputs.executor_instance_type || 'github-hosted-ubuntu-latest' }}
         run: |
           set -euo pipefail
           curl -LsSf https://dbc.columnar.tech/install.sh | sh
@@ -192,9 +205,11 @@ jobs:
           DATABRICKS_SQL_WAREHOUSE_ID: ${{ secrets.DATABRICKS_SQL_WAREHOUSE_ID }}
           DATABRICKS_CATALOG: ${{ secrets.DATABRICKS_CATALOG }}
           DATABRICKS_SCHEMA: ${{ secrets.DATABRICKS_SCHEMA }}
+          DATABRICKS_TABLE_FORMAT: ${{ github.event.inputs.table_format || 'parquet' }}
           SPICEAI_BENCHMARK_METRICS_KEY: ${{ secrets.SPICEAI_BENCHMARK_METRICS_KEY }}
           SCENARIO: ${{ github.event.inputs.scenario || 'tpch' }}
           SYSTEM_ADAPTER: ${{ github.event.inputs.system_adapter || 'spidapter' }}
+          EXECUTOR_INSTANCE_TYPE: ${{ github.event.inputs.executor_instance_type || 'github-hosted-ubuntu-latest' }}
           ETL_BUCKET: ${{ github.event.inputs.etl_bucket }}
           ETL_SOURCE_PREFIX: ${{ github.event.inputs.etl_source_prefix }}
           ETL_TARGET_BASE_PREFIX: ${{ github.event.inputs.etl_target_base_prefix }}
@@ -203,7 +218,7 @@ jobs:
           ETL_NUM_STEPS: ${{ github.event.inputs.etl_num_steps || '25' }}
           AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
           AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          RUST_LOG: "info"
+          RUST_LOG: 'info'
         run: |
           ETL_ARGS="--etl-bucket ${ETL_BUCKET} --etl-num-steps ${ETL_NUM_STEPS}"
           if [ -n "${ETL_SOURCE_PREFIX}" ]; then
@@ -218,11 +233,11 @@ jobs:
           if [ -n "${ETL_ENDPOINT}" ]; then
             ETL_ARGS="${ETL_ARGS} --etl-endpoint ${ETL_ENDPOINT}"
           fi
-          
+
           if [ "${SYSTEM_ADAPTER}" = "databricks" ]; then
             ADAPTER_CMD="${HOME}/.spice/bin/databricks-system-adapter"
             ADAPTER_ARGS="stdio"
-            ADAPTER_ENVS="--system-adapter-env DATABRICKS_ENDPOINT=${DATABRICKS_ENDPOINT} --system-adapter-env DATABRICKS_TOKEN=${DATABRICKS_TOKEN} --system-adapter-env DATABRICKS_HTTP_PATH=${DATABRICKS_HTTP_PATH} --system-adapter-env DATABRICKS_SQL_WAREHOUSE_ID=${DATABRICKS_SQL_WAREHOUSE_ID}"
+            ADAPTER_ENVS="--system-adapter-env DATABRICKS_ENDPOINT=${DATABRICKS_ENDPOINT} --system-adapter-env DATABRICKS_TOKEN=${DATABRICKS_TOKEN} --system-adapter-env DATABRICKS_HTTP_PATH=${DATABRICKS_HTTP_PATH} --system-adapter-env DATABRICKS_SQL_WAREHOUSE_ID=${DATABRICKS_SQL_WAREHOUSE_ID} --system-adapter-env DATABRICKS_TABLE_FORMAT=${DATABRICKS_TABLE_FORMAT}"
 
             if [ -n "${DATABRICKS_CATALOG}" ]; then
               ADAPTER_ENVS="${ADAPTER_ENVS} --system-adapter-env DATABRICKS_CATALOG=${DATABRICKS_CATALOG}"
@@ -240,6 +255,7 @@ jobs:
           ~/.spice/bin/spicebench \
             --concurrency 2  \
             --scenario "${SCENARIO}" \
+            --executor-instance-type "${EXECUTOR_INSTANCE_TYPE}" \
             ${ETL_ARGS} \
             --system-adapter-stdio-cmd "${ADAPTER_CMD}" \
             --system-adapter-stdio-args "${ADAPTER_ARGS}" \
 
@@ -140,6 +140,7 @@ data-generation = { path = "crates/data-generation" }
 etl = { path = "crates/etl" }
 reqwest.workspace = true
 serde.workspace = true
+serde_json.workspace = true
 system-adapter-protocol = { path = "crates/system-adapter-protocol" }
 test-framework = { path = "crates/test-framework" }
 tokio.workspace = true
 
@@ -128,6 +128,20 @@ A **Run** is a single end-to-end execution of the benchmark for one system. Each
 
 The **E2E benchmark duration** (phase 2, load test stage) is the primary ranking metric. After the load test, each query's p99 latency is compared against the baseline: >20% increase = FAIL, 10–20% = WARN, ≥3 WARNs = FAIL.
 
+### Run Metadata
+
+SpiceBench supports two run-level metadata knobs to keep cross-system comparisons consistent:
+
+| Field                    | Default                                                    | Purpose                                                                            | Propagation                                                                                                             |
+| ------------------------ | ---------------------------------------------------------- | ---------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------- |
+| `table_format`           | `parquet`                                                  | Declares the dataset table format used for creation/registration.                  | Passed through data-generation/ETL dataset params and consumed by adapters (for example, Databricks UC table creation). |
+| `executor_instance_type` | `unknown` (CLI) / `github-hosted-ubuntu-latest` (workflow) | Identifies the benchmark executor hardware class for apples-to-apples comparisons. | Sent in adapter `setup` metadata and attached as an OpenTelemetry metric attribute for dashboard filtering.             |
+
+Common CLI/workflow usage:
+
+- `spicebench --executor-instance-type "c6i.4xlarge" ...`
+- `data-generation run --table-format parquet --executor-instance-type "c6i.4xlarge" ...`
+
 ### Component Overview
 
 | Component                   | Responsibility                                                                                                                                                |
 
@@ -1,5 +1,5 @@
 # Data Generator
 
 ```bash
-cargo run -p data-generation -- run --scale-factor 1 --bucket peasee-indexes --region us-west-2 --prefix raw --num-steps 10
+cargo run -p data-generation -- run --scale-factor 1 --bucket peasee-indexes --region us-west-2 --prefix raw --num-steps 10 --table-format parquet
 ```
@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 */
 
-use clap::{Parser, Subcommand};
+use clap::{Parser, Subcommand, ValueEnum};
 
 #[derive(Parser)]
 #[command(about = "Spice.ai data generation tool - generates Arrow data and writes to S3")]
@@ -63,6 +63,14 @@ pub struct CommonArgs {
     #[arg(long, default_value = "")]
     pub prefix: String,
 
+    /// Logical table format propagated to system adapters
+    #[arg(long, value_enum, default_value = "parquet")]
+    pub table_format: TableFormat,
+
+    /// Executor instance type label propagated to adapters for dashboarding
+    #[arg(long, default_value = "unknown")]
+    pub executor_instance_type: String,
+
     /// AWS region
     #[arg(long)]
     pub region: Option<String>,
@@ -85,10 +93,31 @@ pub struct DatasetConfig {
 pub struct TargetConfig {
     pub bucket: String,
     pub prefix: String,
+    pub table_format: TableFormat,
+    pub executor_instance_type: String,
     pub region: Option<String>,
     pub endpoint: Option<String>,
 }
 
+/// Logical table format for generated datasets.
+///
+/// Accepted on the command line via `--table-format` as the lowercase variant
+/// name (`iceberg`, `parquet`, `delta`).
+#[derive(Clone, Debug, PartialEq, Eq, ValueEnum)]
+#[value(rename_all = "lower")]
+pub enum TableFormat {
+    Iceberg,
+    Parquet,
+    Delta,
+}
+
+impl std::fmt::Display for TableFormat {
+    /// Writes the lowercase format name (`iceberg`, `parquet`, or `delta`).
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.write_str(match self {
+            Self::Iceberg => "iceberg",
+            Self::Parquet => "parquet",
+            Self::Delta => "delta",
+        })
+    }
+}
+
 pub struct IngestorConfig {
     pub max_concurrency: usize,
 }
@@ -106,6 +135,8 @@ impl CommonArgs {
         TargetConfig {
             bucket: self.bucket.clone(),
             prefix: self.prefix.clone(),
+            table_format: self.table_format.clone(),
+            executor_instance_type: self.executor_instance_type.clone(),
             region: self.region.clone(),
             endpoint: self.endpoint.clone(),
         }
 
@@ -37,40 +37,26 @@ pub struct DatasetTable {
     pub name: String,
     /// The Arrow schema for the table (without the time column).
     pub schema: SchemaRef,
-    /// The time column for the table, if any.
+    /// The time column for the table.
     ///
-    /// When set, this column is *not* included in [`schema`] — it is appended
+    /// This column is *not* included in [`DatasetTable::schema`] — it is appended
     /// during rehydration via [`DatasetTable::rehydrate`].
-    pub time_column: Option<String>,
+    pub time_column: String,
 }
 
 impl DatasetTable {
-    /// Returns the full schema including the time column, if one is configured.
-    ///
-    /// If `time_column` is `None`, this returns the same schema as [`schema`].
+    /// Returns the full schema including the time column.
     pub fn rehydrated_schema(&self) -> SchemaRef {
-        let Some(ref time_col) = self.time_column else {
-            return Arc::clone(&self.schema);
-        };
-
         let ts_type = DataType::Timestamp(TimeUnit::Microsecond, Some("UTC".into()));
         let mut fields: Vec<_> = self.schema.fields().iter().cloned().collect();
-        fields.push(Arc::new(Field::new(time_col, ts_type, true)));
+        fields.push(Arc::new(Field::new(&self.time_column, ts_type, true)));
         Arc::new(arrow::datatypes::Schema::new(fields))
     }
 
     /// Rehydrate a batch by appending the time column with the current timestamp.
     ///
-    /// If this table has no `time_column`, the batch is returned unchanged.
     /// The batch schema must match [`schema`] (i.e. without the time column).
     pub fn rehydrate(&self, batch: &RecordBatch) -> anyhow::Result<RecordBatch> {
-        if self.time_column.is_none() {
-            anyhow::bail!(
-                "Cannot rehydrate table '{}' without a time column",
-                self.name
-            );
-        }
-
         if batch.schema() != self.schema {
             let mut diffs = Vec::new();
             let expected_fields = self.schema.fields();
@@ -309,8 +295,7 @@ pub trait Dataset: Send + Sync {
     /// Rehydrate a batch for the given table by appending the time column.
     ///
     /// Uses the table metadata from [`tables()`] to look up the time column name
-    /// and delegates to [`DatasetTable::rehydrate`]. If the table has no time column,
-    /// a rehydration error is returned.
+    /// and delegates to [`DatasetTable::rehydrate`].
     fn rehydrate(&self, table: &str, batch: &RecordBatch) -> anyhow::Result<RecordBatch> {
         let tables = self.tables();
         let dataset_table = tables
 
@@ -227,7 +227,7 @@ impl Dataset for SimpleSequenceDataset {
             DatasetTable {
                 name: "integer_sequence".to_string(),
                 schema: Self::schema(),
-                time_column: Some("inserted_at".to_string()),
+                time_column: "inserted_at".to_string(),
             },
         )])
     }
 
@@ -365,7 +365,7 @@ impl Dataset for TpchDataset {
                     DatasetTable {
                         name: (*name).to_string(),
                         schema: tpch_schema(name),
-                        time_column: Some((*time_col).to_string()),
+                        time_column: (*time_col).to_string(),
                     },
                 )
             })
Original file line number	Diff line number	Diff line change
`@@ -227,7 +227,7 @@ impl Dataset for SimpleSequenceDataset {`
`227`	`227`	`DatasetTable {`
`228`	`228`	`name: "integer_sequence".to_string(),`
`229`	`229`	`schema: Self::schema(),`
`230`		`- time_column: Some("inserted_at".to_string()),`
	`230`	`+ time_column: "inserted_at".to_string(),`
`231`	`231`	`},`
`232`	`232`	`)])`
`233`	`233`	`}`
Original file line number	Diff line number	Diff line change
`@@ -365,7 +365,7 @@ impl Dataset for TpchDataset {`
`365`	`365`	`DatasetTable {`
`366`	`366`	`name: (*name).to_string(),`
`367`	`367`	`schema: tpch_schema(name),`
`368`		`- time_column: Some((*time_col).to_string()),`
	`368`	`+ time_column: (*time_col).to_string(),`
`369`	`369`	`},`
`370`	`370`	`)`
`371`	`371`	`})`