spiceai
diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml
Lines changed: 0 additions & 1 deletion
diff --git a/crates/adbc_client/src/lib.rs b/crates/adbc_client/src/lib.rs
Lines changed: 13 additions & 16 deletions
diff --git a/crates/checkpointer/src/lib.rs b/crates/checkpointer/src/lib.rs
Lines changed: 0 additions & 3 deletions
diff --git a/crates/data-generation/src/config.rs b/crates/data-generation/src/config.rs
Lines changed: 0 additions & 14 deletions
diff --git a/crates/data-generation/src/generator.rs b/crates/data-generation/src/generator.rs
Lines changed: 2 additions & 4 deletions
diff --git a/crates/data-generation/src/main.rs b/crates/data-generation/src/main.rs
Lines changed: 0 additions & 3 deletions
diff --git a/crates/data-generation/src/storage/file.rs b/crates/data-generation/src/storage/file.rs
Lines changed: 1 addition & 1 deletion
diff --git a/crates/etl/README.md b/crates/etl/README.md
Lines changed: 1 addition & 21 deletions
diff --git a/crates/etl/src/lib.rs b/crates/etl/src/lib.rs
Lines changed: 1 addition & 0 deletions
diff --git a/crates/etl/src/main.rs b/crates/etl/src/main.rs
Lines changed: 2 additions & 68 deletions
@@ -139,7 +139,6 @@ jobs:
             --scale-factor 0.001 \
             --bucket spiceai-public-datasets \
             --prefix pr-validation \
-            --max-concurrency 4 \
             --region us-east-1
 
       - name: Install ADBC driver
 
@@ -306,25 +306,22 @@ fn is_opaque_numeric(field: &Field) -> bool {
     let metadata = field.metadata();
 
     // PostgreSQL: arrow.opaque extension type for numeric
-    if let Some(ext_name) = metadata.get("ARROW:extension:name") {
-        if ext_name == "arrow.opaque" {
-            if let Some(ext_meta) = metadata.get("ARROW:extension:metadata") {
-                if serde_json::from_str::<serde_json::Value>(ext_meta)
-                    .ok()
-                    .and_then(|v| v.get("type_name")?.as_str().map(|s| s == "numeric"))
-                    .unwrap_or(false)
-                {
-                    return true;
-                }
-            }
-        }
+    if let Some(ext_name) = metadata.get("ARROW:extension:name")
+        && ext_name == "arrow.opaque"
+        && let Some(ext_meta) = metadata.get("ARROW:extension:metadata")
+        && serde_json::from_str::<serde_json::Value>(ext_meta)
+            .ok()
+            .and_then(|v| v.get("type_name")?.as_str().map(|s| s == "numeric"))
+            .unwrap_or(false)
+    {
+        return true;
     }
 
     // Databricks Spark: decimal type serialised as Utf8 with Spark metadata
-    if let Some(sql_name) = metadata.get("Spark:DataType:SqlName") {
-        if sql_name.starts_with("DECIMAL(") {
-            return true;
-        }
+    if let Some(sql_name) = metadata.get("Spark:DataType:SqlName")
+        && sql_name.starts_with("DECIMAL(")
+    {
+        return true;
     }
 
     false
 
@@ -72,8 +72,6 @@ pub struct ScenarioCheckpoint {
 /// S3‑backed store for uploading and downloading checkpoint artefacts.
 pub struct CheckpointStore {
     store: Arc<dyn ObjectStore>,
-    #[allow(dead_code)]
-    bucket: String,
     prefix: String,
 }
 
@@ -105,7 +103,6 @@ impl CheckpointStore {
         let store = Arc::new(builder.build()?);
         Ok(Self {
             store,
-            bucket: bucket.to_owned(),
             prefix: prefix.to_owned(),
         })
     }
 
@@ -67,10 +67,6 @@ pub struct CommonArgs {
     /// S3 endpoint URL (for MinIO/LocalStack)
     #[arg(long)]
     pub endpoint: Option<String>,
-
-    /// Maximum number of concurrent S3 writes (legacy, unused with file storage)
-    #[arg(long, default_value_t = 16)]
-    pub max_concurrency: usize,
 }
 
 pub struct DatasetConfig {
@@ -92,10 +88,6 @@ pub struct TargetConfig {
     pub partition_columns: Vec<String>,
 }
 
-pub struct IngestorConfig {
-    pub max_concurrency: usize,
-}
-
 impl CommonArgs {
     pub fn dataset_config(&self) -> DatasetConfig {
         DatasetConfig {
@@ -133,12 +125,6 @@ impl CommonArgs {
             partition_columns: vec![],
         })
     }
-
-    pub fn ingestor_config(&self) -> IngestorConfig {
-        IngestorConfig {
-            max_concurrency: self.max_concurrency,
-        }
-    }
 }
 
 /// Builds the versioned storage prefix: `{prefix}/{scenario}/{version}`.
 
@@ -20,7 +20,6 @@ use std::time::Instant;
 
 use tokio::task::JoinSet;
 
-use super::config::IngestorConfig;
 use super::dataset::Dataset;
 use super::metrics::{IngestResult, Metrics};
 use super::storage::DataStorage;
@@ -51,7 +50,6 @@ impl DataGenerator {
     pub fn new(
         dataset: Arc<dyn Dataset>,
         target: Arc<dyn DataStorage>,
-        _config: &IngestorConfig,
         metrics: Metrics,
         version_config: VersionConfig,
     ) -> Self {
@@ -88,12 +86,13 @@ impl DataGenerator {
 
         // For each table, spawn a single task that generates and writes inline.
         let mut join_set = JoinSet::new();
-        for table_name in self.dataset.tables().keys().cloned() {
+        for table_name in self.dataset.tables().keys() {
             let dataset = Arc::clone(&self.dataset);
             let target = self.target.clone();
             let metrics = self.metrics.clone();
             let written_ids = Arc::clone(&written_batches);
 
+            let table_name = table_name.clone();
             join_set.spawn(async move {
                 let mut batch_id: u64 = 0;
                 loop {
@@ -453,7 +452,6 @@ mod tests {
         let generator = DataGenerator::new(
             dataset,
             target,
-            &IngestorConfig { max_concurrency: 4 },
             Metrics::new(),
             VersionConfig {
                 scenario: "test".to_string(),
 
@@ -55,13 +55,11 @@ fn print_summary(result: &IngestResult) {
 
 fn build(args: &CommonArgs, file_storage: Arc<FileStorage>) -> anyhow::Result<DataGenerator> {
     let dataset_config = args.dataset_config();
-    let ingestor_config = args.ingestor_config();
     let version = format_scale_factor(args.scale_factor);
 
     tracing::info!(
         dataset_type = dataset_config.dataset_type,
         num_steps = dataset_config.num_steps,
-        max_concurrency = ingestor_config.max_concurrency,
         version = %version,
         scenario = %args.scenario,
         scale_factor = args.scale_factor,
@@ -88,7 +86,6 @@ fn build(args: &CommonArgs, file_storage: Arc<FileStorage>) -> anyhow::Result<Da
     let ingestor = DataGenerator::new(
         dataset,
         file_storage as Arc<dyn DataStorage>,
-        &ingestor_config,
         metrics,
         version_config,
     );
 
@@ -103,7 +103,7 @@ impl DataStorage for FileStorage {
         for entry in std::fs::read_dir(&dir)? {
             let entry = entry?;
             let path = entry.path();
-            if path.extension().map_or(false, |ext| ext == "parquet") {
+            if path.extension().is_some_and(|ext| ext == "parquet") {
                 paths.push(path.display().to_string());
             }
         }
 
@@ -2,8 +2,7 @@
 
 `etl` reads a generated archive, rehydrates records, and writes to either:
 
-- S3 as hive-partitioned Parquet (default)
-- an ADBC target via bulk ingest
+- an ADBC target via bulk ingest (default)
 - a null sink that discards writes for throughput benchmarking
 
 Dataset configuration is read from the extracted `version.json` metadata written by `data-generation`.
@@ -17,25 +16,6 @@ Provide one of these source modes:
 
 The version path is derived automatically from `--scale-factor`, so `--scale-factor 1` reads from the `1.0` version path.
 
-## S3 Hive Sink (default)
-
-Use `--sink s3-hive` to write hive-partitioned Parquet to S3.
-
-- `--target-prefix`: Base S3 key prefix for ETL output. Defaults to the source prefix when empty.
-- `--partition-by`: Comma-separated partition columns. Defaults to `__created_at`.
-
-### Example
-
-```bash
-cargo run -p etl -- \
-    --scenario tpch \
-    --scale-factor 1 \
-    --bucket peasee-indexes \
-    --prefix raw \
-    --target-prefix rehydrated \
-    --partition-by __created_at
-```
-
 ## ADBC Sink
 
 Use `--sink adbc` to write via ADBC bulk ingest.
 
@@ -1518,6 +1518,7 @@ impl ETLPipeline {
 /// it is consumed. If `step_limit` is `Some(n)`, at most `n` logical steps are
 /// consumed before the function returns [`PipelineState::Paused`]. Unconsumed
 /// steps remain in the shared work state for a subsequent call.
+#[allow(clippy::too_many_arguments)]
 async fn run_pipeline(
     data_storage: Arc<dyn DataStorage>,
     data_sink: Arc<dyn Sink>,
 
@@ -25,7 +25,6 @@ use data_generation::storage::s3::S3Storage;
 use etl::sink::Sink;
 use etl::sink::adbc::AdbcSink;
 use etl::sink::null::NullSink;
-use etl::sink::s3_hive::S3HiveSink;
 use etl::{DatasetSource, ETLPipeline, PipelineState, StopReason};
 use tracing_subscriber::EnvFilter;
 
@@ -35,8 +34,6 @@ const DEFAULT_FLIGHTSQL_MAX_MSG_SIZE_BYTES: &str = "78643200";
 #[derive(Clone, Debug, Default, ValueEnum)]
 enum SinkType {
     #[default]
-    #[value(name = "s3-hive")]
-    S3Hive,
     #[value(name = "adbc")]
     Adbc,
     #[value(name = "null")]
@@ -45,7 +42,7 @@ enum SinkType {
 
 #[derive(Parser)]
 #[command(
-    about = "Run an ETL pipeline that reads from a data archive, rehydrates data, and writes to S3 Hive, ADBC, or a null sink"
+    about = "Run an ETL pipeline that reads from a data archive, rehydrates data, and writes to ADBC or a null sink"
 )]
 struct Cli {
     /// Scenario name (e.g. "tpch") — used in the storage path `{prefix}/{scenario}/{version}/`
@@ -82,23 +79,11 @@ struct Cli {
     #[arg(long)]
     endpoint: Option<String>,
 
-    /// Base S3 key prefix for the ETL target (hive-partitioned output).
-    /// Defaults to the source prefix if not specified.
-    #[arg(long, default_value = "")]
-    target_prefix: String,
-
-    /// Ordered list of columns used for hive-style partitioning.
-    ///
-    /// Example: `--partition-by __created_at,product_type`
-    #[arg(long, value_delimiter = ',', default_value = "__created_at")]
-    partition_by: Vec<String>,
-
     /// ETL sink target.
     ///
-    /// - s3-hive: write hive-partitioned parquet to S3
     /// - adbc: write via ADBC bulk ingest
     /// - null: discard all writes (throughput benchmark mode)
-    #[arg(long, value_enum, default_value_t = SinkType::S3Hive)]
+    #[arg(long, value_enum, default_value_t = SinkType::Adbc)]
     sink: SinkType,
 
     /// ADBC driver name (for example: "databricks" or "flightsql").
@@ -271,55 +256,6 @@ async fn main() -> anyhow::Result<()> {
                 Some(adbc_sink),
             )
         }
-        SinkType::S3Hive => {
-            if cli.adbc_driver.is_some()
-                || cli.adbc_uri.is_some()
-                || !cli.adbc_options.is_empty()
-                || cli.adbc_catalog.is_some()
-                || cli.adbc_schema.is_some()
-                || cli.adbc_create_tables
-            {
-                anyhow::bail!(
-                    "ADBC options are only valid with --sink adbc. Remove ADBC flags or set --sink adbc."
-                );
-            }
-
-            let hive_prefix = if cli.target_prefix.is_empty() {
-                format!(
-                    "{}/{}/{}",
-                    cli.prefix.trim_matches('/'),
-                    cli.scenario,
-                    version
-                )
-            } else {
-                format!(
-                    "{}/{}/{}",
-                    cli.target_prefix.trim_matches('/'),
-                    cli.scenario,
-                    version
-                )
-            };
-
-            let bucket = cli
-                .bucket
-                .as_ref()
-                .ok_or_else(|| anyhow::anyhow!("--bucket is required for --sink s3-hive"))?;
-
-            let hive_config = TargetConfig {
-                bucket: bucket.clone(),
-                prefix: hive_prefix,
-                region: cli.region.clone(),
-                endpoint: cli.endpoint.clone(),
-                partition_columns: cli.partition_by.clone(),
-            };
-
-            (
-                Arc::new(S3HiveSink::new(&hive_config)?),
-                Some(hive_config),
-                "s3-hive".to_string(),
-                None,
-            )
-        }
         SinkType::Null => {
             if cli.adbc_driver.is_some()
                 || cli.adbc_uri.is_some()
@@ -373,8 +309,6 @@ async fn main() -> anyhow::Result<()> {
         adbc_catalog = ?cli.adbc_catalog,
         adbc_schema = ?cli.adbc_schema,
         adbc_create_tables = cli.adbc_create_tables,
-        target_prefix = %cli.target_prefix,
-        partition_by = ?cli.partition_by,
         scale_factor = version_metadata.scale_factor,
         num_steps = version_metadata.num_steps,
         "Starting ETL pipeline"
Original file line number	Diff line number	Diff line change
`@@ -103,7 +103,7 @@ impl DataStorage for FileStorage {`
`103`	`103`	`for entry in std::fs::read_dir(&dir)? {`
`104`	`104`	`let entry = entry?;`
`105`	`105`	`let path = entry.path();`
`106`		`- if path.extension().map_or(false, \|ext\| ext == "parquet") {`
	`106`	`+ if path.extension().is_some_and(\|ext\| ext == "parquet") {`
`107`	`107`	`paths.push(path.display().to_string());`
`108`	`108`	`}`
`109`	`109`	`}`