Skip to content

Commit 309e0f2

Browse files
committed
fix: fall back to Spark for str_to_map when legacy regex split truncation is enabled
Spark 4.1.1 added spark.sql.legacy.truncateForEmptyRegexSplit, which makes StringToMap truncate trailing empty entries from the split result. Comet's native str_to_map always behaves as if the flag were false, producing incorrect results in legacy mode. Downgrade CometStrToMap to Incompatible when the flag is enabled so it falls back to Spark unless the user explicitly opts in via allowIncompatible. The config is read by string key so it resolves on Spark versions where it is not registered. Closes #4477
1 parent 9e86dd9 commit 309e0f2

2 files changed

Lines changed: 58 additions & 1 deletion

File tree

spark/src/main/scala/org/apache/comet/serde/maps.scala

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
package org.apache.comet.serde
2121

2222
import org.apache.spark.sql.catalyst.expressions._
23+
import org.apache.spark.sql.internal.SQLConf
2324
import org.apache.spark.sql.types._
2425

2526
import org.apache.comet.serde.QueryPlanSerde.{createBinaryExpr, exprToProtoInternal, optExprWithFallbackReason, scalarFunctionExprToProto}
@@ -164,6 +165,21 @@ object CometMapFromEntries
164165
}
165166
}
166167

167-
object CometStrToMap extends CometScalarFunction[StringToMap]("str_to_map")
168+
object CometStrToMap extends CometScalarFunction[StringToMap]("str_to_map") {
169+
170+
// Spark 4.1.1+ honours spark.sql.legacy.truncateForEmptyRegexSplit by truncating trailing
171+
// empty entries from the split result. Comet's native str_to_map always behaves as if the flag
172+
// were false, so it is incompatible when legacy truncation is enabled. Read by string key so it
173+
// resolves on older Spark versions where the config is not registered.
174+
private val legacyTruncateConfig = "spark.sql.legacy.truncateForEmptyRegexSplit"
175+
176+
override def getSupportLevel(expr: StringToMap): SupportLevel = {
177+
if (SQLConf.get.getConfString(legacyTruncateConfig, "false").toBoolean) {
178+
Incompatible(Some(s"$legacyTruncateConfig is enabled"))
179+
} else {
180+
Compatible(None)
181+
}
182+
}
183+
}
168184

169185
object CometMapConcat extends CometCodegenDispatch[MapConcat]
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
-- Licensed to the Apache Software Foundation (ASF) under one
2+
-- or more contributor license agreements. See the NOTICE file
3+
-- distributed with this work for additional information
4+
-- regarding copyright ownership. The ASF licenses this file
5+
-- to you under the Apache License, Version 2.0 (the
6+
-- "License"); you may not use this file except in compliance
7+
-- with the License. You may obtain a copy of the License at
8+
--
9+
-- http://www.apache.org/licenses/LICENSE-2.0
10+
--
11+
-- Unless required by applicable law or agreed to in writing,
12+
-- software distributed under the License is distributed on an
13+
-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
-- KIND, either express or implied. See the License for the
15+
-- specific language governing permissions and limitations
16+
-- under the License.
17+
18+
-- Tests that str_to_map falls back to Spark when
19+
-- spark.sql.legacy.truncateForEmptyRegexSplit is enabled. In legacy mode Spark truncates trailing
20+
-- empty entries from the split result, which Comet's native str_to_map does not honour.
21+
-- See https://github.com/apache/datafusion-comet/issues/4477
22+
23+
-- Config: spark.sql.legacy.truncateForEmptyRegexSplit=true
24+
25+
-- trailing pair delimiter: legacy mode truncates the trailing empty entry, so Comet must fall
26+
-- back to Spark
27+
query expect_fallback(truncateForEmptyRegexSplit)
28+
SELECT str_to_map('a:1,b:2,', ',', ':')
29+
30+
-- column input also falls back
31+
statement
32+
CREATE TABLE test_str_to_map_legacy(s STRING, pair_delim STRING, key_value_delim STRING) USING parquet
33+
34+
statement
35+
INSERT INTO test_str_to_map_legacy VALUES
36+
('a:1,b:2,', ',', ':'),
37+
('x:1;y:2;', ';', ':'),
38+
(NULL, ',', ':')
39+
40+
query expect_fallback(truncateForEmptyRegexSplit)
41+
SELECT str_to_map(s, pair_delim, key_value_delim) FROM test_str_to_map_legacy

0 commit comments

Comments
 (0)