risingwavelabs
diff --git a/‎Cargo.lock
Lines changed: 1 addition & 3 deletions b/‎Cargo.lock
Lines changed: 1 addition & 3 deletions
diff --git a/‎Cargo.toml
Lines changed: 3 additions & 3 deletions b/‎Cargo.toml
Lines changed: 3 additions & 3 deletions
diff --git a/‎src/common/src/catalog/column.rs
Lines changed: 12 additions & 0 deletions b/‎src/common/src/catalog/column.rs
Lines changed: 12 additions & 0 deletions
diff --git a/‎src/connector/src/source/iceberg/mod.rs
Lines changed: 32 additions & 6 deletions b/‎src/connector/src/source/iceberg/mod.rs
Lines changed: 32 additions & 6 deletions
diff --git a/‎src/connector/src/source/reader/desc.rs
Lines changed: 11 additions & 3 deletions b/‎src/connector/src/source/reader/desc.rs
Lines changed: 11 additions & 3 deletions
diff --git a/‎src/connector/src/with_options.rs
Lines changed: 6 additions & 1 deletion b/‎src/connector/src/with_options.rs
Lines changed: 6 additions & 1 deletion
diff --git a/‎src/frontend/src/optimizer/plan_node/generic/source.rs
Lines changed: 104 additions & 1 deletion b/‎src/frontend/src/optimizer/plan_node/generic/source.rs
Lines changed: 104 additions & 1 deletion
diff --git a/‎src/frontend/src/optimizer/plan_node/logical_iceberg_scan.rs
Lines changed: 10 additions & 1 deletion b/‎src/frontend/src/optimizer/plan_node/logical_iceberg_scan.rs
Lines changed: 10 additions & 1 deletion
@@ -155,12 +155,12 @@ icelake = { git = "https://github.com/risingwavelabs/icelake.git", rev = "0ec44f
     "prometheus",
 ] }
 # branch dev_rebase_main_20241230
-iceberg = { git = "https://github.com/risingwavelabs/iceberg-rust.git", rev = "683fb89edeaf8d1baae69e1f376d68b92be1d496", features = [
+iceberg = { path = "../iceberg-rust/crates/iceberg", features = [
     "storage-s3",
     "storage-gcs",
 ] }
-iceberg-catalog-rest = { git = "https://github.com/risingwavelabs/iceberg-rust.git", rev = "683fb89edeaf8d1baae69e1f376d68b92be1d496" }
-iceberg-catalog-glue = { git = "https://github.com/risingwavelabs/iceberg-rust.git", rev = "683fb89edeaf8d1baae69e1f376d68b92be1d496" }
+iceberg-catalog-rest = { path = "../iceberg-rust/crates/catalog/rest" }
+iceberg-catalog-glue = { path = "../iceberg-rust/crates/catalog/glue" }
 opendal = "0.49"
 # used only by arrow-udf-flight
 arrow-flight = "53"
 
@@ -424,6 +424,18 @@ impl ColumnCatalog {
             )),
         ]
     }
+
+    /// Returns `true` when this column's id equals `ROW_ID_COLUMN_ID`.
+    pub fn is_row_id_column(&self) -> bool {
+        let desc = &self.column_desc;
+        desc.column_id == ROW_ID_COLUMN_ID
+    }
+
+    // Partition
+    // pub fn is_source_partition_or_offset_column(&self) -> bool {
+    //     self.column_desc
+    //         .additional_column
+    //         .column_type
+    //         .is_some_and(|col| matches!(col, ColumnType::Offset(_) | ColumnType::Partition(_)))
+    // }
 }
 
 impl From<PbColumnCatalog> for ColumnCatalog {
 
@@ -165,6 +165,18 @@ impl IcebergFileScanTask {
             IcebergFileScanTask::CountStar(_) => false,
         }
     }
+
+    /// Returns the `data_file_path` of every file scan task carried by this value.
+    ///
+    /// The `Data`, `EqualityDelete` and `PositionDelete` variants yield one path
+    /// per task, in task order; `CountStar` yields an empty list.
+    pub fn files(&self) -> Vec<String> {
+        let tasks = match self {
+            IcebergFileScanTask::CountStar(_) => return Vec::new(),
+            IcebergFileScanTask::Data(tasks)
+            | IcebergFileScanTask::EqualityDelete(tasks)
+            | IcebergFileScanTask::PositionDelete(tasks) => tasks,
+        };
+
+        let mut paths = Vec::with_capacity(tasks.len());
+        for task in tasks {
+            paths.push(task.data_file_path.clone());
+        }
+        paths
+    }
 }
 
 #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
@@ -234,7 +246,9 @@ impl SplitEnumerator for IcebergSplitEnumerator {
     }
 
     async fn list_splits(&mut self) -> ConnectorResult<Vec<Self::Split>> {
-        // Iceberg source does not support streaming queries
+        // Like file source, iceberg streaming source has a List Executor and a Fetch Executor,
+        // instead of relying on SplitEnumerator on meta.
+        // TODO: add some validation logic here.
         Ok(vec![])
     }
 }
@@ -349,18 +363,30 @@ impl IcebergSplitEnumerator {
         let table_schema = table.metadata().current_schema();
         tracing::debug!("iceberg_table_schema: {:?}", table_schema);
 
-        let mut position_delete_files = vec![];
-        let mut data_files = vec![];
-        let mut equality_delete_files = vec![];
         let scan = table
             .scan()
             .with_filter(predicate)
             .snapshot_id(snapshot_id)
             .select(require_names)
             .build()
-            .map_err(|e| anyhow!(e))?;
+            .context("failed to build iceberg scan")?;
+        Self::scan_to_splits(snapshot_id, scan, iceberg_scan_type, batch_parallelism).await
+    }
 
-        let file_scan_stream = scan.plan_files().await.map_err(|e| anyhow!(e))?;
+    pub async fn scan_to_splits(
+        snapshot_id: i64,
+        scan: TableScan,
+        iceberg_scan_type: IcebergScanType,
+        batch_parallelism: usize,
+    ) -> ConnectorResult<Vec<IcebergSplit>> {
+        let mut position_delete_files = vec![];
+        let mut data_files = vec![];
+        let mut equality_delete_files = vec![];
+
+        let file_scan_stream = scan
+            .plan_files()
+            .await
+            .context("failed to plan iceberg FileScanTask")?;
 
         #[for_await]
         for task in file_scan_stream {
 
@@ -99,9 +99,13 @@ impl SourceDescBuilder {
             .map(|c| SourceColumnDesc::from(&c.column_desc))
             .collect();
 
-        for (existed, c) in columns_exist.iter().zip_eq_fast(&additional_columns) {
-            if !existed {
-                columns.push(SourceColumnDesc::hidden_addition_col_from_column_desc(c));
+        // currently iceberg uses other columns. See `extract_iceberg_columns`
+        // TODO: unify logic.
+        if connector_name != "iceberg" {
+            for (existed, c) in columns_exist.iter().zip_eq_fast(&additional_columns) {
+                if !existed {
+                    columns.push(SourceColumnDesc::hidden_addition_col_from_column_desc(c));
+                }
             }
         }
 
@@ -173,6 +177,10 @@ impl SourceDescBuilder {
             metrics: self.metrics.clone(),
         })
     }
+
+    /// Returns a clone of this builder's `with_properties`.
+    pub fn with_properties(&self) -> WithOptionsSecResolved {
+        let options = &self.with_properties;
+        options.clone()
+    }
 }
 
 pub mod test_utils {
 
@@ -150,7 +150,12 @@ pub trait WithPropertiesExt: Get + Sized {
 
     fn connector_need_pk(&self) -> bool {
         // Currently only iceberg connector doesn't need primary key
-        !self.is_iceberg_connector()
+        // introduced in https://github.com/risingwavelabs/risingwave/pull/14971
+        // XXX: This does not seem to be the correct way. Iceberg doesn't necessarily have a PK.
+        // "batch source" doesn't need a PK?
+        // For streaming, if it has a PK, do we want to use it? It seems not safe.
+        // !self.is_iceberg_connector()
+        true
     }
 
     fn is_legacy_fs_connector(&self) -> bool {
 
@@ -15,7 +15,7 @@
 use std::rc::Rc;
 
 use educe::Educe;
-use risingwave_common::catalog::{ColumnCatalog, Field, Schema};
+use risingwave_common::catalog::{ColumnCatalog, ColumnDesc, Field, Schema};
 use risingwave_common::types::DataType;
 use risingwave_common::util::sort_util::OrderType;
 use risingwave_connector::WithPropertiesExt;
@@ -77,6 +77,8 @@ impl GenericPlanNode for Source {
     }
 
     fn stream_key(&self) -> Option<Vec<usize>> {
+        // FIXME: output col idx is not set. But iceberg source can prune cols.
+        // XXX: there's a RISINGWAVE_ICEBERG_ROW_ID. Should we use it?
         self.row_id_index.map(|idx| vec![idx])
     }
 
@@ -96,6 +98,79 @@ impl GenericPlanNode for Source {
 }
 
 impl Source {
+    /// Builds the source core used by a file-listing node.
+    ///
+    /// The output is [`risingwave_connector::source::filesystem::FsPageItem`] / [`risingwave_connector::source::iceberg::IcebergSplit`]
+    ///
+    /// The returned core keeps every field of `core` except that `column_catalog`
+    /// is replaced by the listing schema and `row_id_index` is cleared:
+    /// - iceberg connector: `partition_id` (varchar), `split` (jsonb)
+    /// - new fs connector: `filename` (varchar), `last_edit_time` (timestamptz),
+    ///   `file_size` (int64)
+    ///
+    /// Column ids are assigned sequentially so that no two columns of the same
+    /// catalog share an id.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `core` is neither an iceberg nor a new fs connector source.
+    // NOTE(review): the `Field` literals below are initialized with different field
+    // sets (some set `sub_fields`/`type_name`, some do not). Both forms cannot
+    // compile against a single struct definition -- confirm against the current
+    // `Field` definition and unify.
+    pub fn file_list_node(core: Self) -> Self {
+        let column_catalog = if core.is_iceberg_connector() {
+            vec![
+                ColumnCatalog {
+                    column_desc: ColumnDesc::from_field_with_column_id(
+                        &Field {
+                            name: "partition_id".to_owned(),
+                            data_type: DataType::Varchar,
+                        },
+                        0,
+                    ),
+                    is_hidden: false,
+                },
+                ColumnCatalog {
+                    column_desc: ColumnDesc::from_field_with_column_id(
+                        &Field {
+                            name: "split".to_owned(),
+                            data_type: DataType::Jsonb,
+                        },
+                        // Distinct from `partition_id`, which already uses id 0.
+                        1,
+                    ),
+                    is_hidden: false,
+                },
+            ]
+        } else if core.is_new_fs_connector() {
+            vec![
+                ColumnCatalog {
+                    column_desc: ColumnDesc::from_field_with_column_id(
+                        &Field {
+                            name: "filename".to_owned(),
+                            data_type: DataType::Varchar,
+                        },
+                        0,
+                    ),
+                    is_hidden: false,
+                },
+                // This column seems unused.
+                ColumnCatalog {
+                    column_desc: ColumnDesc::from_field_with_column_id(
+                        &Field {
+                            name: "last_edit_time".to_owned(),
+                            data_type: DataType::Timestamptz,
+                            sub_fields: vec![],
+                            type_name: "".to_owned(),
+                        },
+                        1,
+                    ),
+                    is_hidden: false,
+                },
+                ColumnCatalog {
+                    column_desc: ColumnDesc::from_field_with_column_id(
+                        &Field {
+                            name: "file_size".to_owned(),
+                            data_type: DataType::Int64,
+                            sub_fields: vec![],
+                            type_name: "".to_owned(),
+                        },
+                        // Distinct from `filename` (0) and `last_edit_time` (1).
+                        2,
+                    ),
+                    is_hidden: false,
+                },
+            ]
+        } else {
+            unreachable!("file_list_node requires an iceberg or new fs connector source")
+        };
+        Self {
+            column_catalog,
+            row_id_index: None,
+            ..core
+        }
+    }
+
     pub fn is_new_fs_connector(&self) -> bool {
         self.catalog
             .as_ref()
@@ -119,6 +194,34 @@ impl Source {
         self.is_iceberg_connector()
     }
 
+    /// Drops hidden columns (other than the row-id column) from `column_catalog`
+    /// and shifts `row_id_index` left by the number of dropped columns located at
+    /// or before it.
+    ///
+    /// The source is returned unchanged when it has no catalog, when the catalog
+    /// is a shared source, or when `kind` is not `SourceNodeKind::CreateMViewOrBatch`.
+    pub fn exclude_iceberg_hidden_columns(mut self) -> Self {
+        // Without a catalog there is nothing to prune.
+        let is_shared = match &self.catalog {
+            Some(catalog) => catalog.info.is_shared(),
+            None => return self,
+        };
+        // A shared source must produce all columns; other node kinds are left alone.
+        if is_shared || self.kind != SourceNodeKind::CreateMViewOrBatch {
+            return self;
+        }
+
+        let should_drop = |col: &ColumnCatalog| col.is_hidden() && !col.is_row_id_column();
+
+        // Each dropped column up to the row-id position moves the row id one slot left.
+        if let Some(old_idx) = self.row_id_index {
+            let dropped = self
+                .column_catalog
+                .iter()
+                .take(old_idx + 1)
+                .filter(|&col| should_drop(col))
+                .count();
+            self.row_id_index = Some(old_idx - dropped);
+        }
+        self.column_catalog.retain(|col| !should_drop(col));
+        self
+    }
+
     /// The columns in stream/batch source node indicate the actual columns it will produce,
     /// instead of the columns defined in source catalog. The difference is generated columns.
     pub fn exclude_generated_columns(mut self) -> (Self, Option<usize>) {
 
@@ -64,10 +64,19 @@ impl LogicalIcebergScan {
     pub fn clone_with_required_cols(&self, required_cols: &[usize]) -> Self {
         assert!(!required_cols.is_empty());
         let mut core = self.core.clone();
+        let mut has_row_id = false;
         core.column_catalog = required_cols
             .iter()
-            .map(|idx| core.column_catalog[*idx].clone())
+            .map(|idx| {
+                if Some(*idx) == core.row_id_index {
+                    has_row_id = true;
+                }
+                core.column_catalog[*idx].clone()
+            })
             .collect();
+        if !has_row_id {
+            core.row_id_index = None;
+        }
         let base = PlanBase::new_logical_with_core(&core);
 
         LogicalIcebergScan {
Original file line number	Diff line number	Diff line change
`@@ -424,6 +424,18 @@ impl ColumnCatalog {`
`424`	`424`	`)),`
`425`	`425`	`]`
`426`	`426`	`}`
	`427`	`+`
	`428`	`+ pub fn is_row_id_column(&self) -> bool {`
	`429`	`+ self.column_desc.column_id == ROW_ID_COLUMN_ID`
	`430`	`+ }`
	`431`	`+`
	`432`	`+ // Partition`
	`433`	`+ // pub fn is_source_partition_or_offset_column(&self) -> bool {`
	`434`	`+ // self.column_desc`
	`435`	`+ // .additional_column`
	`436`	`+ // .column_type`
	`437`	`+ // .is_some_and(\|col\| matches!(col, ColumnType::Offset(_) \| ColumnType::Partition(_)))`
	`438`	`+ // }`
`427`	`439`	`}`
`428`	`440`
`429`	`441`	`impl From<PbColumnCatalog> for ColumnCatalog {`
Original file line number	Diff line number	Diff line change
`@@ -99,9 +99,13 @@ impl SourceDescBuilder {`
`99`	`99`	`.map(\|c\| SourceColumnDesc::from(&c.column_desc))`
`100`	`100`	`.collect();`
`101`	`101`
`102`		`- for (existed, c) in columns_exist.iter().zip_eq_fast(&additional_columns) {`
`103`		`- if !existed {`
`104`		`- columns.push(SourceColumnDesc::hidden_addition_col_from_column_desc(c));`
	`102`	+ // currently iceberg uses other columns. See `extract_iceberg_columns`
	`103`	`+ // TODO: unify logic.`
	`104`	`+ if connector_name != "iceberg" {`
	`105`	`+ for (existed, c) in columns_exist.iter().zip_eq_fast(&additional_columns) {`
	`106`	`+ if !existed {`
	`107`	`+ columns.push(SourceColumnDesc::hidden_addition_col_from_column_desc(c));`
	`108`	`+ }`
`105`	`109`	`}`
`106`	`110`	`}`
`107`	`111`
`@@ -173,6 +177,10 @@ impl SourceDescBuilder {`
`173`	`177`	`metrics: self.metrics.clone(),`
`174`	`178`	`})`
`175`	`179`	`}`
	`180`	`+`
	`181`	`+ pub fn with_properties(&self) -> WithOptionsSecResolved {`
	`182`	`+ self.with_properties.clone()`
	`183`	`+ }`
`176`	`184`	`}`
`177`	`185`
`178`	`186`	`pub mod test_utils {`