Skip to content

Commit 6bd58c8

Browse files
committed
Merge remote-tracking branch 'apache/main' into feat/count-mixed-partial-final
# Conflicts: # spark/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q10/extended.txt
2 parents f0437e0 + e11fdbe commit 6bd58c8

72 files changed

Lines changed: 7756 additions & 132 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.github/workflows/pr_build_linux.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -302,6 +302,7 @@ jobs:
302302
org.apache.comet.CometFuzzAggregateSuite
303303
org.apache.comet.CometFuzzIcebergSuite
304304
org.apache.comet.CometFuzzMathSuite
305+
org.apache.comet.CometCodegenFuzzSuite
305306
org.apache.comet.DataGeneratorSuite
306307
- name: "shuffle"
307308
value: |
@@ -380,6 +381,9 @@ jobs:
380381
org.apache.comet.expressions.conditional.CometIfSuite
381382
org.apache.comet.expressions.conditional.CometCoalesceSuite
382383
org.apache.comet.expressions.conditional.CometCaseWhenSuite
384+
org.apache.comet.CometCodegenSuite
385+
org.apache.comet.CometCodegenSourceSuite
386+
org.apache.comet.CometCodegenHOFSuite
383387
- name: "sql"
384388
value: |
385389
org.apache.spark.sql.CometToPrettyStringSuite

.github/workflows/pr_build_macos.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,7 @@ jobs:
155155
org.apache.comet.CometFuzzAggregateSuite
156156
org.apache.comet.CometFuzzIcebergSuite
157157
org.apache.comet.CometFuzzMathSuite
158+
org.apache.comet.CometCodegenFuzzSuite
158159
org.apache.comet.DataGeneratorSuite
159160
- name: "shuffle"
160161
value: |
@@ -232,6 +233,9 @@ jobs:
232233
org.apache.comet.expressions.conditional.CometIfSuite
233234
org.apache.comet.expressions.conditional.CometCoalesceSuite
234235
org.apache.comet.expressions.conditional.CometCaseWhenSuite
236+
org.apache.comet.CometCodegenSuite
237+
org.apache.comet.CometCodegenSourceSuite
238+
org.apache.comet.CometCodegenHOFSuite
235239
- name: "sql"
236240
value: |
237241
org.apache.spark.sql.CometToPrettyStringSuite

dev/diffs/4.1.1.diff

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -695,10 +695,20 @@ index e1a2fd33c7c..632f4b695df 100644
695695
}
696696
assert(scanOption.isDefined)
697697
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala
698-
index b27122a8de2..a4c5aac8212 100644
698+
index b27122a8de2..3c690dbe788 100644
699699
--- a/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala
700700
+++ b/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala
701-
@@ -470,7 +470,8 @@ class ExplainSuite extends ExplainSuiteHelper with DisableAdaptiveExecutionSuite
701+
@@ -267,7 +267,8 @@ class ExplainSuite extends ExplainSuiteHelper with DisableAdaptiveExecutionSuite
702+
}
703+
}
704+
705+
- test("SPARK-33853: explain codegen - check presence of subquery") {
706+
+ test("SPARK-33853: explain codegen - check presence of subquery",
707+
+ IgnoreComet("Comet plan has a different WholeStageCodegen subtree count")) {
708+
withSQLConf(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "true") {
709+
withTempView("df") {
710+
val df1 = spark.range(1, 100)
711+
@@ -470,7 +471,8 @@ class ExplainSuite extends ExplainSuiteHelper with DisableAdaptiveExecutionSuite
702712
}
703713
}
704714

@@ -708,7 +718,7 @@ index b27122a8de2..a4c5aac8212 100644
708718
withTempDir { dir =>
709719
Seq("parquet", "orc", "csv", "json").foreach { fmt =>
710720
val basePath = dir.getCanonicalPath + "/" + fmt
711-
@@ -548,7 +549,9 @@ class ExplainSuite extends ExplainSuiteHelper with DisableAdaptiveExecutionSuite
721+
@@ -548,7 +550,9 @@ class ExplainSuite extends ExplainSuiteHelper with DisableAdaptiveExecutionSuite
712722
}
713723
}
714724

docs/source/contributor-guide/spark_expressions_support.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -215,7 +215,7 @@
215215
### datetime_funcs
216216

217217
- [ ] add_months
218-
- [ ] convert_timezone
218+
- [x] convert_timezone
219219
- [ ] curdate
220220
- [ ] current_date
221221
- [ ] current_time
@@ -413,7 +413,7 @@
413413
- [ ] randstr
414414
- [ ] rint
415415
- [x] round
416-
- [ ] sec
416+
- [x] sec
417417
- [x] shiftleft
418418
- [x] sign
419419
- [x] signum
@@ -596,7 +596,7 @@
596596

597597
### url_funcs
598598

599-
- [ ] parse_url
599+
- [x] parse_url (Incompatible: native diverges from Spark on edge cases)
600600
- [x] try_url_decode
601601
- 4.0.1, 2026-05-05
602602
- [x] url_decode

docs/source/user-guide/latest/compatibility/expressions/index.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,5 +36,6 @@ math
3636
misc
3737
string
3838
struct
39+
url
3940
cast
4041
```
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
<!---
2+
Licensed to the Apache Software Foundation (ASF) under one
3+
or more contributor license agreements. See the NOTICE file
4+
distributed with this work for additional information
5+
regarding copyright ownership. The ASF licenses this file
6+
to you under the Apache License, Version 2.0 (the
7+
"License"); you may not use this file except in compliance
8+
with the License. You may obtain a copy of the License at
9+
10+
http://www.apache.org/licenses/LICENSE-2.0
11+
12+
Unless required by applicable law or agreed to in writing,
13+
software distributed under the License is distributed on an
14+
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
KIND, either express or implied. See the License for the
16+
specific language governing permissions and limitations
17+
under the License.
18+
-->
19+
20+
# URL Expressions
21+
22+
<!--BEGIN:EXPR_COMPAT[url]-->
23+
<!--END:EXPR_COMPAT-->

docs/source/user-guide/latest/expressions.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,7 @@ of expressions that be disabled.
101101

102102
| Expression | SQL |
103103
| ---------------- | ---------------------------- |
104+
| ConvertTimezone | `convert_timezone` |
104105
| CurrentTimeZone | `current_timezone` |
105106
| DateAdd | `date_add` |
106107
| DateDiff | `datediff` |
@@ -115,6 +116,7 @@ of expressions that be disabled.
115116
| LastDay | `last_day` |
116117
| LocalTimestamp | `localtimestamp` |
117118
| MakeDate | `make_date` |
119+
| MakeTime | `make_time` |
118120
| Minute | `minute` |
119121
| NextDay | `next_day` |
120122
| Second | `second` |
@@ -131,6 +133,8 @@ of expressions that be disabled.
131133
| DayOfYear | `dayofyear` |
132134
| WeekOfYear | `weekofyear` |
133135
| Quarter | `quarter` |
136+
| ToTime | `to_time` |
137+
| TryToTime | `try_to_time` |
134138

135139
## Math Expressions
136140

@@ -171,6 +175,7 @@ of expressions that be disabled.
171175
| Randn | `randn` |
172176
| Remainder | `%` |
173177
| Round | `round` |
178+
| Sec | `sec` |
174179
| Signum | `signum` |
175180
| Sin | `sin` |
176181
| Sinh | `sinh` |

docs/source/user-guide/latest/iceberg.md

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,24 @@ The following scenarios will fall back to Spark's native Iceberg reader:
146146
- Dynamic Partition Pruning under Adaptive Query Execution (non-AQE DPP is supported);
147147
see [#3510](https://github.com/apache/datafusion-comet/issues/3510)
148148

149+
### Iceberg UDFs
150+
151+
Iceberg ships several `ScalaUDF`s that surface in user queries and maintenance actions:
152+
153+
- `IcebergSpark.registerBucketUDF` and `registerTruncateUDF` register `bucket(N, col)` and
154+
`truncate(W, col)` for use in `SELECT` / `JOIN` / `WHERE` predicates that align with hidden
155+
partitioning.
156+
- `RewriteDataFiles` with `sort-strategy=zorder` builds a tree of per-type ordered-bytes UDFs
157+
(`INT_ORDERED_BYTES`, `LONG_ORDERED_BYTES`, ..., `INTERLEAVE_BYTES`) over the sort key columns
158+
during compaction.
159+
160+
By default these UDFs cause the enclosing operator to fall back to Spark, which forces a
161+
columnar-to-row roundtrip and demotes the surrounding shuffle from `CometExchange` to
162+
`CometColumnarExchange`. Enabling the experimental
163+
[Scala UDF and Java UDF Support](scala_java_udfs.md) feature
164+
(`spark.comet.exec.scalaUDF.codegen.enabled=true`) routes these UDFs through native execution so
165+
the project, exchange, and sort operators around them stay on the Comet path end-to-end.
166+
149167
### Task input metrics
150168

151169
The native Iceberg reader populates Spark's task-level `inputMetrics.bytesRead` (visible in the Spark UI Stages tab) using the `bytes_read` counter from iceberg-rust's `ScanMetrics`. This counter includes bytes read from both data files and delete files.

docs/source/user-guide/latest/index.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ to read more.
4343
Supported Data Types <datatypes>
4444
Supported Operators <operators>
4545
Supported Expressions <expressions>
46+
Scala UDF and Java UDF Support <scala_java_udfs>
4647
Configuration Settings <configs>
4748
Compatibility Guide <compatibility/index>
4849
Understanding Comet Plans <understanding-comet-plans>
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
<!---
2+
Licensed to the Apache Software Foundation (ASF) under one
3+
or more contributor license agreements. See the NOTICE file
4+
distributed with this work for additional information
5+
regarding copyright ownership. The ASF licenses this file
6+
to you under the Apache License, Version 2.0 (the
7+
"License"); you may not use this file except in compliance
8+
with the License. You may obtain a copy of the License at
9+
10+
http://www.apache.org/licenses/LICENSE-2.0
11+
12+
Unless required by applicable law or agreed to in writing,
13+
software distributed under the License is distributed on an
14+
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
KIND, either express or implied. See the License for the
16+
specific language governing permissions and limitations
17+
under the License.
18+
-->
19+
20+
# Scala UDF and Java UDF Support
21+
22+
When this feature is enabled, Comet executes Spark's Scala and Java [scalar user-defined functions (UDFs)](https://spark.apache.org/docs/latest/sql-ref-functions-udf-scalar.html) on the native Comet path. In that case, the presence of an eligible UDF does not force the enclosing operator off the native path; surrounding native operators stay native.
23+
24+
This page covers Spark's `ScalaUDF` (Scala `udf(...)`, `spark.udf.register(...)` over Scala or Java functional interfaces, and SQL `CREATE FUNCTION ... AS 'com.example.MyUDF'`). Other UDF kinds (Python / Pandas, Hive, aggregate) are out of scope and continue to fall back to Spark.
25+
26+
This feature is experimental and disabled by default.
27+
28+
## Configuration
29+
30+
| Key | Default | Description |
31+
| ------------------------------------------- | ------- | ------------------------------------------------------------------------------------------------------------------ |
32+
| `spark.comet.exec.scalaUDF.codegen.enabled` | `false` | When `true`, eligible `ScalaUDF`s run on the Comet path. When `false`, the enclosing operator falls back to Spark. |
33+
34+
## Supported
35+
36+
- User functions registered via `udf(...)`, `spark.udf.register(...)` (Scala or Java functional interfaces), or SQL `CREATE FUNCTION ... AS 'com.example.MyUDF'`.
37+
- Scalar input/output types: `Boolean`, `Byte`, `Short`, `Int`, `Long`, `Float`, `Double`, `Decimal`, `String`, `Binary`, `Date`, `Timestamp`, `TimestampNTZ`.
38+
- Complex input/output types with arbitrary nesting: `ArrayType`, `StructType`, `MapType`.
39+
- Composition with other Catalyst expressions inside the argument tree (e.g. `myUdf(upper(s))` runs as one native unit).
40+
- Higher-order functions (`transform`, `filter`, `exists`, `aggregate`, `zip_with`, `map_filter`, `map_zip_with`, etc.) inside the argument tree.
41+
42+
## Not supported
43+
44+
- Aggregate UDFs (`ScalaAggregator`, `TypedImperativeAggregate`, the legacy `UserDefinedAggregateFunction`).
45+
- Table UDFs and generators.
46+
- Python `@udf` and Pandas `@pandas_udf`.
47+
- Hive `GenericUDF` and `SimpleUDF`.
48+
- `CalendarIntervalType`, `NullType`, and `UserDefinedType` arguments and return types. UDT-typed columns fall back to Spark; for native execution, store and read the underlying representation directly (e.g. write MLlib `Vector` outputs as `Struct<type: Byte, size: Int, indices: Array<Int>, values: Array<Double>>` rather than `VectorUDT`).
49+
- Trees whose total nested-field count (output plus all input columns the UDF tree references) exceeds `spark.sql.codegen.maxFields` (default 100). Comet refuses these at plan time and the operator falls back to Spark.
50+
51+
When a UDF is rejected, the reason surfaces through Comet's standard fallback diagnostics; the query still runs on Spark.
52+
53+
## Behavior
54+
55+
- Non-deterministic expressions referenced from the argument tree (`rand`, `uuid`, `monotonically_increasing_id`) produce per-partition sequences consistent with Spark.
56+
- `TaskContext.get()` inside the user function returns the driving Spark task's context.
57+
- The user function must be closure-serializable; any function that Spark can already serialize and run on its executors works here.
58+
59+
## Known limitations
60+
61+
- Each query containing a `ScalaUDF` pays a one-time codegen cost on its first batch and reuses the compiled kernel for subsequent batches, matching Spark's whole-stage codegen behavior. Bytecode is deduplicated JVM-wide via the same `CodeGenerator` cache, so structurally identical queries across a session share the compiled class.

0 commit comments

Comments
 (0)