Fix column index mapping in schema adapter fallback for native_datafusion

mbutrovich · mbutrovich · commit 2be791c8de7a · 2026-04-03T12:39:08.000-04:00
diff --git a/native/core/src/parquet/schema_adapter.rs b/native/core/src/parquet/schema_adapter.rs
@@ -296,12 +296,26 @@ impl SparkPhysicalExprAdapter {
     ) -> DataFusionResult<Arc<dyn PhysicalExpr>> {
         expr.transform(|e| {
             if let Some(column) = e.as_any().downcast_ref::<Column>() {
-                let col_idx = column.index();
                 let col_name = column.name();
 
-                let logical_field = self.logical_file_schema.fields().get(col_idx);
-                // Look up physical field by name instead of index for correctness
-                // when logical and physical schemas have different column orderings
+                // Resolve fields by name because this is the fallback path
+                // that runs on the original expression when the default
+                // adapter fails. The original expression was built against
+                // the required (pruned) schema, so column indices refer to
+                // that schema — not the logical or physical file schemas.
+                // DataFusion's DefaultPhysicalExprAdapter::resolve_physical_column
+                // also resolves by name for the same reason.
+                let logical_field = if self.parquet_options.case_sensitive {
+                    self.logical_file_schema
+                        .fields()
+                        .iter()
+                        .find(|f| f.name() == col_name)
+                } else {
+                    self.logical_file_schema
+                        .fields()
+                        .iter()
+                        .find(|f| f.name().eq_ignore_ascii_case(col_name))
+                };
                 let physical_field = if self.parquet_options.case_sensitive {
                     self.physical_file_schema
                         .fields()
@@ -314,19 +328,40 @@ impl SparkPhysicalExprAdapter {
                         .find(|f| f.name().eq_ignore_ascii_case(col_name))
                 };
 
-                if let (Some(logical_field), Some(physical_field)) = (logical_field, physical_field)
+                // Remap the column index to the physical file schema so
+                // downstream evaluation reads the correct column from the
+                // parquet batch.
+                let physical_index = if self.parquet_options.case_sensitive {
+                    self.physical_file_schema.index_of(col_name).ok()
+                } else {
+                    self.physical_file_schema
+                        .fields()
+                        .iter()
+                        .position(|f| f.name().eq_ignore_ascii_case(col_name))
+                };
+
+                if let (Some(logical_field), Some(physical_field), Some(phys_idx)) =
+                    (logical_field, physical_field, physical_index)
                 {
+                    let remapped: Arc<dyn PhysicalExpr> = if column.index() != phys_idx {
+                        Arc::new(Column::new(col_name, phys_idx))
+                    } else {
+                        Arc::clone(&e)
+                    };
+
                     if logical_field.data_type() != physical_field.data_type() {
                         let cast_expr: Arc<dyn PhysicalExpr> = Arc::new(
                             CometCastColumnExpr::new(
-                                Arc::clone(&e),
+                                remapped,
                                 Arc::clone(physical_field),
                                 Arc::clone(logical_field),
                                 None,
                             )
                             .with_parquet_options(self.parquet_options.clone()),
                         );
                         return Ok(Transformed::yes(cast_expr));
+                    } else if column.index() != phys_idx {
+                        return Ok(Transformed::yes(remapped));
                     }
                 }
             }
diff --git a/spark/src/test/scala/org/apache/comet/exec/CometNativeReaderSuite.scala b/spark/src/test/scala/org/apache/comet/exec/CometNativeReaderSuite.scala
@@ -19,6 +19,7 @@
 
 package org.apache.comet.exec
 
+import org.apache.hadoop.fs.Path
 import org.scalactic.source.Position
 import org.scalatest.Tag
 
@@ -602,4 +603,57 @@ class CometNativeReaderSuite extends CometTestBase with AdaptiveSparkPlanHelper
           |""".stripMargin,
       "select array(array(1, 2, null), array(), array(10), null, array(null)) from tbl")
   }
+
+  test("native reader - nested schema pruning with array of struct and filter") {
+    // Regression test for a bug found during the DataFusion 53 upgrade (PR #3629).
+    // Spark's SchemaPruningSuite tests (e.g. "select a single complex field array
+    // and in clause", "select explode of nested field of array of struct",
+    // "SPARK-34638: nested column prune on generator output") were failing with:
+    //   native panic: called `Result::unwrap()` on an `Err` value:
+    //   Internal("Unexpected data type in GetArrayStructFields: Int32")
+    // The root cause was wrap_all_type_mismatches in Comet's schema adapter
+    // looking up the logical field by column index instead of by name. When
+    // filter expressions are created against the pruned required_schema (where
+    // "friends" is at index 0), the fallback would index into the full
+    // logical_file_schema and get "id: Int32" instead of "friends: List<...>".
+    withTempDir { dir =>
+      val path = new Path(dir.toURI.toString, "test").toUri.toString
+
+      // Create a table with multiple columns so that nested schema pruning
+      // can prune away unneeded columns. The friends column is an array of
+      // structs with first/middle/last, but the query only needs first and middle.
+      withSQLConf(CometConf.COMET_ENABLED.key -> "false") {
+        spark.sql(
+          """
+            |select
+            |  0 as id,
+            |  named_struct('first', 'Jane', 'middle', 'X.', 'last', 'Doe') as name,
+            |  '123 Main Street' as address,
+            |  1 as pets,
+            |  array(
+            |    named_struct('first', 'Susan', 'middle', 'Z.', 'last', 'Smith')
+            |  ) as friends
+            |union all
+            |select
+            |  1 as id,
+            |  named_struct('first', 'John', 'middle', 'Y.', 'last', 'Doe') as name,
+            |  '321 Wall Street' as address,
+            |  3 as pets,
+            |  array(
+            |    named_struct('first', 'Alice', 'middle', 'A.', 'last', 'Jones')
+            |  ) as friends
+            |""".stripMargin).repartition(1).write.parquet(path)
+      }
+
+      val schema = spark.read.parquet(path).schema
+
+      readParquetFile(path, Some(schema)) { df =>
+        df.createOrReplaceTempView("tbl")
+      }
+
+      val query = "select friends.middle from tbl where friends.first[0] = 'Susan'"
+      val df = sql(query)
+      checkSparkAnswer(df)
+    }
+  }
 }