fix: Remove internal columns from ETL presented schema (#103)

peasee · github-actions[bot] · web-flow · commit 37e366e0ff0b · 2026-02-19T21:58:46.000Z
* fix: Remove internal columns from ETL presented schema

* fix

* chore: auto-fix cargo fmt + clippy

---------

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
diff --git a/crates/etl/src/lib.rs b/crates/etl/src/lib.rs
@@ -444,10 +444,20 @@ impl ETLPipeline {
             .tables()
             .into_iter()
             .map(|(name, table)| {
+                // Strip internal columns (_op, _op_index) from the schema, as
+                // these are removed before data is written to the sink.
+                let fields: Vec<_> = table
+                    .schema
+                    .fields()
+                    .iter()
+                    .filter(|f| !INTERNAL_COLUMNS.contains(&f.name().as_str()))
+                    .cloned()
+                    .collect();
+                let schema: SchemaRef = Arc::new(Schema::new(fields));
                 let schema = if with_created_at {
-                    schema_with_created_at(&table.schema)
+                    schema_with_created_at(&schema)
                 } else {
-                    table.schema.clone()
+                    schema
                 };
                 let config = ProtocolDatasetConfig { schema };
                 (name, config)
@@ -966,3 +976,252 @@ async fn run_pipeline(
     );
     PipelineState::Stopped(StopReason::Completed)
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use arrow::array::RecordBatch;
+    use async_trait::async_trait;
+    use data_generation::config::DatasetConfig as GenerationDatasetConfig;
+    use data_generation::dataset::MutationConfig;
+    use data_generation::storage::{DataStorage, ReadResult, WriteResult};
+    use data_generation::version::VersionMetadata;
+    use std::collections::{HashMap, VecDeque};
+
+    /// A [`DataStorage`] stub for tests: every method succeeds with an empty result.
+    ///
+    /// The TPCH dataset only needs storage during batch generation/reading,
+    /// which is not exercised by `create_tables_request_datasets`.
+    struct MockStorage;
+
+    #[async_trait]
+    impl DataStorage for MockStorage {
+        async fn list_batches(&self, _table_name: &str) -> anyhow::Result<Vec<String>> {
+            Ok(Vec::new())
+        }
+
+        async fn read_batch(
+            &self,
+            _table_name: &str,
+            _batch_id: u64,
+        ) -> anyhow::Result<Option<ReadResult>> {
+            Ok(None) // no batch is ever found, whatever the table or id
+        }
+
+        async fn write(
+            &self,
+            _table_name: &str,
+            _batch_id: u64,
+            _batch: RecordBatch,
+        ) -> anyhow::Result<WriteResult> {
+            Ok(WriteResult {
+                rows_written: 0, // the batch is dropped unread, so nothing is counted
+                bytes_written: 0,
+            })
+        }
+
+        async fn write_version_metadata(&self, _metadata: &VersionMetadata) -> anyhow::Result<()> {
+            Ok(())
+        }
+
+        async fn read_version_metadata(&self) -> anyhow::Result<Option<VersionMetadata>> {
+            Ok(None) // metadata passed to `write_version_metadata` is never retained
+        }
+
+        async fn read_batch_ids(&self, _table_name: &str) -> anyhow::Result<VecDeque<u64>> {
+            Ok(VecDeque::new())
+        }
+
+        fn table_params(&self, _table_name: &str) -> HashMap<String, serde_json::Value> {
+            HashMap::new()
+        }
+
+        fn expected_files(&self, _table_name: &str, _batch_ids: &[u64]) -> Vec<String> {
+            Vec::new()
+        }
+    }
+
+    /// A [`Sink`] stub for tests: every write succeeds and the batch is dropped.
+    struct MockSink;
+
+    #[async_trait]
+    impl Sink for MockSink {
+        async fn write(
+            &self,
+            _table_name: &str,
+            _batch_id: u64,
+            _batch: RecordBatch,
+            _op: InsertOp,
+        ) -> anyhow::Result<()> {
+            Ok(()) // all arguments are ignored
+        }
+    }
+
+    /// Builds a TPCH [`ETLPipeline`] backed by [`MockStorage`] and [`MockSink`].
+    fn make_tpch_pipeline(with_created_at: bool) -> ETLPipeline {
+        let config = GenerationDatasetConfig {
+            dataset_type: "tpch".to_string(),
+            scale_factor: 0.01,
+            num_steps: 1,
+        };
+        let mutations = MutationConfig::new(0.0, 0.0); // presumably no mutations; TODO confirm
+        let storage: Arc<dyn DataStorage> = Arc::new(MockStorage);
+        let sink: Arc<dyn Sink> = Arc::new(MockSink);
+
+        let pipeline = ETLPipeline::new(DatasetSource::Tpch, &config, storage, sink, &mutations)
+            .expect("failed to create pipeline");
+
+        if with_created_at {
+            pipeline.with_created_at(true) // opt in via the builder method only when requested
+        } else {
+            pipeline // left exactly as `ETLPipeline::new` returned it
+        }
+    }
+
+    #[test]
+    fn create_tables_request_datasets_strips_internal_columns() {
+        let pipeline = make_tpch_pipeline(false);
+        let datasets = pipeline.create_tables_request_datasets();
+
+        // Guard against a vacuous pass: the loop below asserts nothing if the map is empty.
+        assert!(!datasets.is_empty(), "Expected non-empty dataset map");
+
+        for (table_name, dataset_config) in &datasets {
+            let field_names: Vec<&str> = dataset_config
+                .schema
+                .fields()
+                .iter()
+                .map(|f| f.name().as_str())
+                .collect();
+
+            assert!(
+                !field_names.contains(&"_op"),
+                "Table '{table_name}' schema should not contain '_op' column, but found fields: {field_names:?}"
+            );
+            assert!(
+                !field_names.contains(&"_op_index"),
+                "Table '{table_name}' schema should not contain '_op_index' column, but found fields: {field_names:?}"
+            );
+        }
+    }
+
+    #[test]
+    fn create_tables_request_datasets_does_not_include_created_at_by_default() {
+        let pipeline = make_tpch_pipeline(false); // `false`: `with_created_at` is never called
+        let datasets = pipeline.create_tables_request_datasets();
+
+        for (table_name, dataset_config) in &datasets { // NOTE(review): passes vacuously if empty
+            let field_names: Vec<&str> = dataset_config
+                .schema
+                .fields()
+                .iter()
+                .map(|f| f.name().as_str())
+                .collect();
+
+            assert!(
+                !field_names.contains(&CREATED_AT_COLUMN),
+                "Table '{table_name}' should not have '{CREATED_AT_COLUMN}' when with_created_at is false, but found fields: {field_names:?}"
+            );
+        }
+    }
+
+    #[test]
+    fn create_tables_request_datasets_includes_created_at_when_enabled() {
+        let pipeline = make_tpch_pipeline(true); // `true`: built via `.with_created_at(true)`
+        let datasets = pipeline.create_tables_request_datasets();
+
+        for (table_name, dataset_config) in &datasets { // NOTE(review): passes vacuously if empty
+            let field_names: Vec<&str> = dataset_config
+                .schema
+                .fields()
+                .iter()
+                .map(|f| f.name().as_str())
+                .collect();
+
+            assert!(
+                field_names.contains(&CREATED_AT_COLUMN),
+                "Table '{table_name}' should have '{CREATED_AT_COLUMN}' when with_created_at is true, but found fields: {field_names:?}"
+            );
+
+            // Adding the created-at column must not bring the internal columns back.
+            assert!(
+                !field_names.contains(&"_op"),
+                "Table '{table_name}' schema should not contain '_op' even with created_at enabled, but found fields: {field_names:?}"
+            );
+            assert!(
+                !field_names.contains(&"_op_index"),
+                "Table '{table_name}' schema should not contain '_op_index' even with created_at enabled, but found fields: {field_names:?}"
+            );
+        }
+    }
+
+    #[test]
+    fn create_tables_request_datasets_preserves_data_columns() {
+        let pipeline = make_tpch_pipeline(false);
+        let datasets = pipeline.create_tables_request_datasets();
+
+        // Sanity check: the raw dataset's tables DO carry the internal columns,
+        // otherwise the stripping assertions below would prove nothing.
+        let raw_tables = pipeline.dataset().tables();
+        for (table_name, raw_table) in &raw_tables {
+            let raw_field_names: Vec<&str> = raw_table
+                .schema
+                .fields()
+                .iter()
+                .map(|f| f.name().as_str())
+                .collect();
+            assert!(
+                raw_field_names.contains(&"_op"),
+                "Sanity check: raw TPCH table '{table_name}' should contain '_op'"
+            );
+            assert!(
+                raw_field_names.contains(&"_op_index"),
+                "Sanity check: raw TPCH table '{table_name}' should contain '_op_index'"
+            );
+        }
+
+        // The presented schema must equal the raw one minus internal columns, order included.
+        for (table_name, raw_table) in &raw_tables {
+            let dataset_config = datasets
+                .get(table_name)
+                .unwrap_or_else(|| panic!("Table '{table_name}' missing from datasets"));
+
+            let expected_fields: Vec<&str> = raw_table
+                .schema
+                .fields()
+                .iter()
+                .map(|f| f.name().as_str())
+                .filter(|name| !INTERNAL_COLUMNS.contains(name))
+                .collect();
+            let actual_fields: Vec<&str> = dataset_config
+                .schema
+                .fields()
+                .iter()
+                .map(|f| f.name().as_str())
+                .collect();
+
+            assert_eq!(
+                expected_fields, actual_fields,
+                "Table '{table_name}' should retain all non-internal fields"
+            );
+        }
+    }
+
+    #[test]
+    fn create_tables_request_datasets_returns_all_tpch_tables() {
+        let pipeline = make_tpch_pipeline(false);
+        let datasets = pipeline.create_tables_request_datasets();
+
+        let expected_tables = [
+            "region", "nation", "supplier", "customer", "part", "partsupp", "orders", "lineitem",
+        ];
+
+        for table in expected_tables {
+            assert!(
+                datasets.contains_key(table),
+                "Expected table '{table}' in create_tables_request_datasets output"
+            );
+        }
+        assert_eq!(datasets.len(), expected_tables.len()); // rules out any extra tables
+    }
+}
diff --git a/src/main.rs b/src/main.rs
@@ -14,14 +14,12 @@ See the License for the specific language governing permissions and
 limitations under the License.
 */
 
-use std::{collections::HashMap, sync::Arc};
+use std::sync::Arc;
 
 use adbc_client::AdbcConnection;
-use arrow_schema::{DataType, Field, Schema, TimeUnit};
 use checkpointer::CheckpointStore;
 use clap::Parser;
 use data_generation::config::{TargetConfig, build_version_prefix};
-use data_generation::dataset::Dataset;
 use data_generation::storage::DataStorage;
 use data_generation::storage::s3::S3Storage;
 use data_generation::version::VersionMetadata;
@@ -39,31 +37,6 @@ mod scenario;
 use crate::args::CommonArgs;
 use crate::commands::connect_system_adapter;
 
-fn create_tables_request_datasets(
-    dataset: &Arc<dyn Dataset>,
-    with_created_at: bool,
-) -> HashMap<String, system_adapter_protocol::DatasetConfig> {
-    dataset
-        .tables()
-        .into_iter()
-        .map(|(name, table)| {
-            let schema = if with_created_at {
-                let mut fields: Vec<_> = table.schema.fields().iter().cloned().collect();
-                fields.push(Arc::new(Field::new(
-                    "__created_at",
-                    DataType::Timestamp(TimeUnit::Microsecond, Some("UTC".into())),
-                    true,
-                )));
-                Arc::new(Schema::new(fields))
-            } else {
-                table.schema.clone()
-            };
-
-            (name, system_adapter_protocol::DatasetConfig { schema })
-        })
-        .collect()
-}
-
 #[derive(Parser)]
 #[command(author, version, about, long_about = None)]
 struct Cli {
@@ -84,7 +57,6 @@ async fn run_benchmark(
     adbc_driver: system_adapter_protocol::SetupResponse,
     version_metadata: &VersionMetadata,
     source: Arc<S3Storage>,
-    datasets: HashMap<String, system_adapter_protocol::DatasetConfig>,
 ) -> anyhow::Result<()> {
     // --- Download checkpoints from S3 ---
     let scenario_name = common.scenario.to_string();
@@ -161,6 +133,8 @@ async fn run_benchmark(
     )?
     .with_created_at(common.with_created_at);
 
+    let datasets = pipeline.create_tables_request_datasets();
+
     if let Err(e) = system_adapter_client.create_tables(run_id, datasets).await {
         pipeline.cancel();
         return Err(anyhow::anyhow!(
@@ -252,10 +226,6 @@ async fn main() -> anyhow::Result<()> {
         )
     })?;
 
-    let dataset_source = DatasetSource::from_dataset_type(&version_metadata.dataset_type)?;
-    let generation_config = version_metadata.dataset_config();
-    let mutations = version_metadata.mutation_config();
-
     // --- Connect to the system adapter ---
     let mut system_adapter_client = match connect_system_adapter(&cli.common).await {
         Ok(system_adapter_client) => system_adapter_client,
@@ -266,13 +236,6 @@ async fn main() -> anyhow::Result<()> {
 
     let run_id = uuid::Uuid::new_v4();
 
-    let setup_dataset = dataset_source.create(
-        &generation_config,
-        &mutations,
-        Arc::clone(&source) as Arc<dyn DataStorage>,
-    )?;
-    let datasets = create_tables_request_datasets(&setup_dataset, cli.common.with_created_at);
-
     let setup_metadata = std::collections::HashMap::from([
         (
             "executor_instance_type".to_string(),
@@ -298,7 +261,6 @@ async fn main() -> anyhow::Result<()> {
         adbc_driver,
         &version_metadata,
         source,
-        datasets,
     )
     .await;