Eventual-Inc
diff --git a/‎src/daft-micropartition/src/micropartition.rs‎
Lines changed: 4 additions & 1 deletion b/‎src/daft-micropartition/src/micropartition.rs‎
Lines changed: 4 additions & 1 deletion
diff --git a/‎src/daft-micropartition/src/ops/concat.rs‎
Lines changed: 4 additions & 1 deletion b/‎src/daft-micropartition/src/ops/concat.rs‎
Lines changed: 4 additions & 1 deletion
diff --git a/‎src/daft-parquet/src/metadata_adapter.rs‎
Lines changed: 25 additions & 0 deletions b/‎src/daft-parquet/src/metadata_adapter.rs‎
Lines changed: 25 additions & 0 deletions
diff --git a/‎src/daft-scan/src/glob.rs‎
Lines changed: 11 additions & 1 deletion b/‎src/daft-scan/src/glob.rs‎
Lines changed: 11 additions & 1 deletion
diff --git a/‎src/daft-scan/src/lib.rs‎
Lines changed: 194 additions & 6 deletions b/‎src/daft-scan/src/lib.rs‎
Lines changed: 194 additions & 6 deletions
@@ -93,7 +93,10 @@ impl MicroPartition {
         Self {
             schema,
             chunks: record_batches,
-            metadata: TableMetadata { length },
+            metadata: TableMetadata {
+                length,
+                column_sizes: None,
+            },
             statistics,
         }
     }
 
@@ -32,7 +32,10 @@ impl MicroPartition {
         Ok(Self {
             schema,
             chunks: Arc::new(all_tables),
-            metadata: TableMetadata { length: new_len },
+            metadata: TableMetadata {
+                length: new_len,
+                column_sizes: None,
+            },
             statistics: all_stats,
         })
     }
 
@@ -201,6 +201,31 @@ impl DaftRowGroupMetaData {
     pub fn compressed_size(&self) -> usize {
         self.inner.compressed_size() as usize
     }
+
+    /// One `(root column name, size)` entry per column chunk in this row group, where `size`
+    /// is the chunk's uncompressed (encoded) size in bytes, clamped to be non-negative.
+    ///
+    /// Nested columns are reported under their root: every leaf chunk (e.g.
+    /// `position_ids.list.element`) is attributed to its top-level field (`position_ids`), so
+    /// a root can repeat. The caller sums entries per name, within and across row groups.
+    pub fn column_uncompressed_sizes(&self) -> Vec<(String, u64)> {
+        self.inner
+            .columns()
+            .iter()
+            .map(|col| {
+                let root = col
+                    .column_descr()
+                    .path()
+                    .parts()
+                    .first()
+                    .cloned()
+                    .unwrap_or_default(); // an empty path yields ""
+                // `uncompressed_size()` is an i64 and is always non-negative in practice;
+                // clamp defensively so a malformed (negative) value can't wrap to a huge u64.
+                (root, col.uncompressed_size().max(0) as u64)
+            })
+            .collect()
+    }
 }
 
 #[cfg(test)]
 
@@ -1,4 +1,4 @@
-use std::{sync::Arc, vec};
+use std::{collections::BTreeMap, sync::Arc, vec};
 
 use common_error::{DaftError, DaftResult};
 use common_file_formats::FileFormat;
@@ -288,10 +288,20 @@ impl GlobScanOperator {
                         Err(e) => return Err(e),
                     };
 
+                    // Sum the per-column uncompressed sizes across all row groups, keyed by
+                    // top-level column name. Storing per-column (rather than a single total)
+                    // lets size estimates respect column-projection pushdown.
+                    let mut column_sizes: BTreeMap<String, u64> = BTreeMap::new();
+                    for (_, rg) in metadata.row_groups() {
+                        for (name, bytes) in rg.column_uncompressed_sizes() {
+                            *column_sizes.entry(name).or_insert(0) += bytes;
+                        }
+                    }
                     let first_metadata = Some((
                         filepath.clone(),
                         TableMetadata {
                             length: metadata.num_rows(),
+                            column_sizes: (!column_sizes.is_empty()).then_some(column_sizes),
                         },
                     ));
                     (schema, first_metadata, filepath)
 
@@ -411,19 +411,31 @@ impl ScanTask {
                 .all(|s| s.partition_spec == sources.first().unwrap().partition_spec),
             "ScanTask sources must all have the same PartitionSpec at construction",
         );
-        let (length, size_bytes_on_disk, statistics) = sources
+        let (length, column_sizes, size_bytes_on_disk, statistics) = sources
             .iter()
             .map(|s| {
                 (
                     s.metadata.as_ref().map(|m| m.length),
+                    s.metadata.as_ref().and_then(|m| m.column_sizes.clone()),
                     s.size_bytes,
                     s.statistics.clone(),
                 )
             })
             .reduce(
-                |(acc_len, acc_size, acc_stats), (curr_len, curr_size, curr_stats)| {
+                |(acc_len, acc_col_sizes, acc_size, acc_stats),
+                 (curr_len, curr_col_sizes, curr_size, curr_stats)| {
                     (
                         acc_len.and_then(|acc_len| curr_len.map(|curr_len| acc_len + curr_len)),
+                        // All-or-nothing: only retain per-column sizes if every source has them,
+                        // summing each column across sources.
+                        acc_col_sizes.and_then(|mut acc| {
+                            curr_col_sizes.map(|curr| {
+                                for (name, bytes) in curr {
+                                    *acc.entry(name).or_insert(0) += bytes;
+                                }
+                                acc
+                            })
+                        }),
                         acc_size
                             .and_then(|acc_size| curr_size.map(|curr_size| acc_size + curr_size)),
                         acc_stats.and_then(|acc_stats| {
@@ -444,7 +456,10 @@ impl ScanTask {
                 },
             )
             .unwrap();
-        let metadata = length.map(|l| TableMetadata { length: l });
+        let metadata = length.map(|l| TableMetadata {
+            length: l,
+            column_sizes,
+        });
         Self {
             sources,
             schema,
@@ -756,7 +771,45 @@ impl ScanTask {
                     })
                 })
                 .or_else(|| {
-                    // use approximate number of rows multiplied by an approximate bytes-per-row
+                    // Use per-column uncompressed sizes from file metadata (e.g. Parquet
+                    // column-chunk totals) when available. This is more accurate than the
+                    // schema-based estimate for data with dictionary encoding, low-cardinality
+                    // columns, or variable-length nested types (e.g. List) where the schema
+                    // heuristic assumes a fixed element count.
+                    //
+                    // We restrict the sum to the materialized (projected) columns so the
+                    // estimate respects column-projection pushdown, and convert to a per-row
+                    // size scaled by `approx_num_rows`, so limit/filter pushdowns are honored
+                    // the same way as the schema-based fallback below.
+                    let metadata = self.metadata.as_ref()?;
+                    let column_sizes = metadata.column_sizes.as_ref()?;
+                    if metadata.length == 0 {
+                        return None;
+                    }
+                    let projected_bytes: u64 = mat_schema
+                        .field_names()
+                        .filter_map(|name| column_sizes.get(name).copied())
+                        .sum();
+                    // No projected column is in the metadata (e.g. only generated/partition
+                    // columns are read), or all report 0 bytes; defer to the schema-based estimate.
+                    if projected_bytes == 0 {
+                        return None;
+                    }
+                    let row_size = (projected_bytes as f64) / (metadata.length as f64);
+                    self.approx_num_rows(config).map(|approx_num_rows| {
+                        let estimate_f64 = approx_num_rows * row_size;
+                        if estimate_f64.is_nan()
+                            || estimate_f64.is_infinite()
+                            || estimate_f64 > REASONABLE_SIZE_BYTES as f64
+                        {
+                            REASONABLE_SIZE_BYTES
+                        } else {
+                            estimate_f64 as usize
+                        }
+                    })
+                })
+                .or_else(|| {
+                    // Fall back to approximate number of rows multiplied by an approximate bytes-per-row
                     self.approx_num_rows(config).map(|approx_num_rows| {
                         let row_size = mat_schema.estimate_row_size_bytes();
 
@@ -874,7 +927,7 @@ Pushdowns = {pushdowns}
 
 #[cfg(test)]
 mod test {
-    use std::sync::Arc;
+    use std::{collections::BTreeMap, sync::Arc};
 
     use common_display::{DisplayAs, DisplayLevel};
     use common_error::DaftResult;
@@ -1045,6 +1098,7 @@ mod test {
             size_bytes: Some(1_000_000),
             metadata: Some(TableMetadata {
                 length: usize::MAX, // Extremely large row count
+                column_sizes: None,
             }),
             statistics: None,
             partition_spec: None,
@@ -1132,6 +1186,7 @@ mod test {
             size_bytes: Some(10_000_000), // 10MB
             metadata: Some(TableMetadata {
                 length: 1000, // 1000 rows
+                column_sizes: None,
             }),
             statistics: None,
             partition_spec: None,
@@ -1175,6 +1230,7 @@ mod test {
             size_bytes: Some(1_000_000),
             metadata: Some(TableMetadata {
                 length: usize::MAX, // Extremely large row count
+                column_sizes: None,
             }),
             statistics: None,
             partition_spec: None,
@@ -1321,7 +1377,10 @@ mod test {
     fn test_schema_row_size_estimation_valid_case() {
         let sources = vec![ScanSource {
             size_bytes: Some(1_000_000),
-            metadata: Some(TableMetadata { length: 10_000 }),
+            metadata: Some(TableMetadata {
+                length: 10_000,
+                column_sizes: None,
+            }),
             statistics: None,
             partition_spec: None,
             kind: ScanSourceKind::File {
@@ -1364,6 +1423,135 @@ mod test {
         assert!(estimate_val < 1_000_000_000); // Less than 1GB is reasonable
     }
 
+    /// Builds a single-source parquet scan task modeled on the customer's tokenized-sequence
+    /// dataset (four `List(Int64)` columns, `position_ids` dominating the uncompressed bytes),
+    /// with sizes from row group 0 of `rank_0_train.parquet` and the given `pushdowns` applied.
+    fn make_list_column_scan_task(pushdowns: Pushdowns) -> ScanTask {
+        // Uncompressed byte sizes per column, from the customer's parquet metadata.
+        let column_sizes = BTreeMap::from([
+            ("input_ids".to_string(), 10_760_083u64),
+            ("attention_mask".to_string(), 3_339u64),
+            ("labels".to_string(), 10_760_083u64),
+            ("position_ids".to_string(), 122_969_239u64),
+        ]);
+        let num_rows = 200; // row group 0 num_rows
+
+        let sources = vec![ScanSource {
+            size_bytes: Some(32_418_149), // compressed row group 0 bytes
+            metadata: Some(TableMetadata {
+                length: num_rows,
+                column_sizes: Some(column_sizes),
+            }),
+            statistics: None,
+            partition_spec: None,
+            kind: ScanSourceKind::File {
+                path: "rank_0_train.parquet".to_string(),
+                chunk_spec: None,
+                iceberg_delete_files: None,
+                parquet_metadata: None,
+            },
+        }];
+
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("input_ids", DataType::List(Box::new(DataType::Int64))),
+            Field::new("attention_mask", DataType::List(Box::new(DataType::Int64))),
+            Field::new("labels", DataType::List(Box::new(DataType::Int64))),
+            Field::new("position_ids", DataType::List(Box::new(DataType::Int64))),
+        ]));
+
+        ScanTask::new(
+            sources,
+            Arc::new(SourceConfig::File(FileFormatConfig::Parquet(
+                ParquetSourceConfig {
+                    coerce_int96_timestamp_unit: TimeUnit::Seconds,
+                    field_id_mapping: None,
+                    row_groups: None,
+                    chunk_size: None,
+                    ignore_corrupt_files: false,
+                },
+            ))),
+            schema,
+            Arc::new(StorageConfig::new_internal(false, None)),
+            pushdowns,
+            None,
+        )
+    }
+
+    /// Regression test for OOMs caused by under-estimating `List`-typed columns.
+    ///
+    /// The schema-based fallback assumes a fixed list length (`DEFAULT_LIST_LEN = 4`),
+    /// estimating ~130 bytes/row (~26 KB for 200 rows) when the real uncompressed size
+    /// is ~144 MB. Per-column metadata sizes must be used instead.
+    #[test]
+    fn test_list_column_estimation_uses_metadata_not_schema() {
+        let scan_task = make_list_column_scan_task(Pushdowns::default());
+        let estimate = scan_task.estimate_in_memory_size_bytes(None).unwrap();
+
+        // With default pushdowns, expect the sum of all four columns' uncompressed sizes (~144 MB).
+        // (Allow a few bytes of slack for the per-row f64 roundtrip.)
+        let expected: i64 = 10_760_083 + 3_339 + 10_760_083 + 122_969_239;
+        assert!((estimate as i64 - expected).abs() <= 4);
+
+        // Sanity check: the estimate must dwarf the schema-based guess (200 rows * per-row size),
+        // which is what caused the OOM. DEFAULT_LIST_LEN=4 yields ~130 bytes/row.
+        let schema_based = 200.0 * scan_task.materialized_schema().estimate_row_size_bytes();
+        assert!(schema_based < 30_000.0);
+        assert!((estimate as f64) > 1000.0 * schema_based);
+    }
+
+    /// The metadata-based estimate must respect column-projection pushdown: selecting only
+    /// the small `attention_mask` column should not estimate the whole (position_ids-heavy)
+    /// row group.
+    #[test]
+    fn test_list_column_estimation_respects_projection() {
+        // Project only the small column: expect its 3_339 bytes, within f64 rounding slack.
+        let small_pushdowns = Pushdowns::new(
+            None,
+            None,
+            Some(Arc::new(vec!["attention_mask".to_string()])),
+            None,
+            None,
+            None,
+        );
+        let small = make_list_column_scan_task(small_pushdowns)
+            .estimate_in_memory_size_bytes(None)
+            .unwrap();
+        assert!((small as i64 - 3_339).abs() <= 4);
+
+        // Project only the dominant column: expect its 122_969_239 bytes, within the same slack.
+        let large_pushdowns = Pushdowns::new(
+            None,
+            None,
+            Some(Arc::new(vec!["position_ids".to_string()])),
+            None,
+            None,
+            None,
+        );
+        let large = make_list_column_scan_task(large_pushdowns)
+            .estimate_in_memory_size_bytes(None)
+            .unwrap();
+        assert!((large as i64 - 122_969_239).abs() <= 4);
+
+        // The projected small column must be orders of magnitude smaller than the large one.
+        assert!(large > 1000 * small);
+    }
+
+    /// A limit pushdown should scale the metadata-based estimate down proportionally,
+    /// rather than returning the full-file size.
+    #[test]
+    fn test_list_column_estimation_respects_limit() {
+        let limit_pushdowns = Pushdowns::new(None, None, None, Some(50), None, None);
+        let estimate = make_list_column_scan_task(limit_pushdowns)
+            .estimate_in_memory_size_bytes(None)
+            .unwrap();
+
+        // 50 of the fixture's 200 rows => a quarter of the full ~144 MB (all four columns).
+        let full = 10_760_083 + 3_339 + 10_760_083 + 122_969_239;
+        let expected = full / 4;
+        // Allow a few bytes of slack for f64 rounding of the per-row size.
+        assert!((estimate as i64 - expected as i64).abs() <= 4);
+    }
+
     #[test]
     fn test_overflow_protection_with_infinity() {
         let sources = vec![ScanSource {
Original file line number	Diff line number	Diff line change
`@@ -93,7 +93,10 @@ impl MicroPartition {`
`93`	`93`	`Self {`
`94`	`94`	`schema,`
`95`	`95`	`chunks: record_batches,`
`96`		`- metadata: TableMetadata { length },`
	`96`	`+ metadata: TableMetadata {`
	`97`	`+ length,`
	`98`	`+ column_sizes: None,`
	`99`	`+ },`
`97`	`100`	`statistics,`
`98`	`101`	`}`
`99`	`102`	`}`