Skip to content

Commit ceecae7

Browse files
authored
feat: Add 100% Spark-compatible regex support via codegen dispatcher (#4239)
1 parent 44d4ea6 commit ceecae7

27 files changed

Lines changed: 1125 additions & 146 deletions

.github/workflows/pr_build_linux.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -382,6 +382,7 @@ jobs:
382382
org.apache.comet.expressions.conditional.CometIfSuite
383383
org.apache.comet.expressions.conditional.CometCoalesceSuite
384384
org.apache.comet.expressions.conditional.CometCaseWhenSuite
385+
org.apache.comet.CometRegExpJvmSuite
385386
org.apache.comet.CometCodegenSuite
386387
org.apache.comet.CometCodegenSourceSuite
387388
org.apache.comet.CometCodegenHOFSuite

.github/workflows/pr_build_macos.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,7 @@ jobs:
198198
org.apache.comet.expressions.conditional.CometIfSuite
199199
org.apache.comet.expressions.conditional.CometCoalesceSuite
200200
org.apache.comet.expressions.conditional.CometCaseWhenSuite
201+
org.apache.comet.CometRegExpJvmSuite
201202
org.apache.comet.CometCodegenSuite
202203
org.apache.comet.CometCodegenSourceSuite
203204
org.apache.comet.CometCodegenHOFSuite

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,3 +27,4 @@ output
2727
docs/comet-*/
2828
docs/build/
2929
docs/temp/
30+
docs/superpowers/

dev/diffs/4.0.2.diff

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1187,7 +1187,7 @@ index 5ba69c8f9d9..ac1256afe88 100644
11871187
val session = classic.SparkSession.builder().sparkContext(sc).getOrCreate()
11881188
import session.implicits._
11891189
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
1190-
index 0df7f806272..92390bd819f 100644
1190+
index 0df7f806272..9cdfe8b8f46 100644
11911191
--- a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
11921192
+++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
11931193
@@ -17,6 +17,8 @@
@@ -1253,6 +1253,20 @@ index 0df7f806272..92390bd819f 100644
12531253
}
12541254

12551255
test("non-matching optional group") {
1256+
@@ -1405,7 +1409,12 @@ class StringFunctionsSuite extends QueryTest with SharedSparkSession {
1257+
s"'$$3 $$1') FROM $tableName"
1258+
val df = sql(query)
1259+
val plan = df.queryExecution.executedPlan
1260+
- assert(plan.isInstanceOf[WholeStageCodegenExec] == (codegenMode == "CODEGEN_ONLY"))
1261+
+ // Comet routes regexp_replace through the codegen dispatcher, so the executed plan is a
1262+
+ // Comet operator rather than WholeStageCodegenExec. The exception assertions below still
1263+
+ // hold; only this Spark-internal plan-shape check is skipped under Comet.
1264+
+ if (!isCometEnabled) {
1265+
+ assert(plan.isInstanceOf[WholeStageCodegenExec] == (codegenMode == "CODEGEN_ONLY"))
1266+
+ }
1267+
val exception = intercept[SparkRuntimeException] {
1268+
df.collect()
1269+
}
12561270
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala
12571271
index 2e33f6505ab..fc1a2c8f964 100644
12581272
--- a/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala

dev/diffs/4.1.2.diff

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1276,7 +1276,7 @@ index d7b2511eac2..d5f5b940b94 100644
12761276
val session = classic.SparkSession.builder().sparkContext(sc).getOrCreate()
12771277
import session.implicits._
12781278
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
1279-
index 7bfc8cf4fa6..7a425b74184 100644
1279+
index 7bfc8cf4fa6..4bd387801db 100644
12801280
--- a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
12811281
+++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
12821282
@@ -17,6 +17,8 @@
@@ -1342,6 +1342,20 @@ index 7bfc8cf4fa6..7a425b74184 100644
13421342
}
13431343

13441344
test("non-matching optional group") {
1345+
@@ -1425,7 +1429,12 @@ class StringFunctionsSuite extends QueryTest with SharedSparkSession {
1346+
s"'$$3 $$1') FROM $tableName"
1347+
val df = sql(query)
1348+
val plan = df.queryExecution.executedPlan
1349+
- assert(plan.isInstanceOf[WholeStageCodegenExec] == (codegenMode == "CODEGEN_ONLY"))
1350+
+ // Comet routes regexp_replace through the codegen dispatcher, so the executed plan is a
1351+
+ // Comet operator rather than WholeStageCodegenExec. The exception assertions below still
1352+
+ // hold; only this Spark-internal plan-shape check is skipped under Comet.
1353+
+ if (!isCometEnabled) {
1354+
+ assert(plan.isInstanceOf[WholeStageCodegenExec] == (codegenMode == "CODEGEN_ONLY"))
1355+
+ }
1356+
val exception = intercept[SparkRuntimeException] {
1357+
df.collect()
1358+
}
13451359
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala
13461360
index 3ba48da0e32..a33e65d4420 100644
13471361
--- a/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala

docs/source/user-guide/latest/compatibility/regex.md

Lines changed: 121 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,124 @@ under the License.
1919

2020
# Regular Expressions
2121

22-
Comet uses the Rust regexp crate for evaluating regular expressions, and this has different behavior from Java's
23-
regular expression engine. Comet will fall back to Spark for patterns that are known to produce different results, but
24-
this can be overridden by setting `spark.comet.expression.regexp.allowIncompatible=true`.
22+
Comet evaluates Spark regular-expression expressions (`rlike`, `regexp_replace`, `split`,
23+
`regexp_extract`, `regexp_extract_all`, `regexp_instr`) two ways:
24+
25+
- **Codegen dispatcher (default)** — Spark's own `doGenCode` for the expression runs inside Comet's
26+
Arrow-direct codegen dispatcher (the same dispatcher used by Comet's `ScalaUDF` codegen path).
27+
This is 100% compatible with Spark, at the cost of one JNI round-trip per batch. It is enabled by
28+
default (`spark.comet.exec.scalaUDF.codegen.enabled=true`); if the dispatcher is disabled, regex
29+
expressions fall back to Spark.
30+
- **Native (Rust) engine** — the Rust [`regex`] crate, run natively with no JNI overhead. It is
31+
faster but has different semantics from Java regex (see below), so it is **opt-in per expression**
32+
via that expression's `allowIncompatible` flag. `rlike`, `regexp_replace`, and `split` have a
33+
native implementation; `regexp_extract`, `regexp_extract_all`, and `regexp_instr` do not and
34+
always run through the codegen dispatcher.
35+
36+
| SQL | Native (Rust) opt-in config |
37+
| ---------------- | -------------------------------------------------------- |
38+
| `rlike` | `spark.comet.expression.RLike.allowIncompatible` |
39+
| `regexp_replace` | `spark.comet.expression.RegExpReplace.allowIncompatible` |
40+
| `split` | `spark.comet.expression.StringSplit.allowIncompatible` |
41+
42+
When the native path is opted in but a case has no native implementation (for example a non-scalar
43+
`rlike` pattern, or `regexp_replace` with a non-1 offset), Comet routes that case through the
44+
codegen dispatcher.
45+
46+
## Disabling Comet for individual regex expressions
47+
48+
Each regex expression has a per-class `spark.comet.expression.<ClassName>.enabled` flag (default
49+
`true`) that disables Comet's serde for that expression and forces a Spark fallback. This is
50+
useful for narrowing a regression or comparing performance on a single operator without changing
51+
the per-expression `allowIncompatible` opt-ins:
52+
53+
| Expression | Config |
54+
| -------------------- | ------------------------------------------------------- |
55+
| `rlike` | `spark.comet.expression.RLike.enabled=false` |
56+
| `regexp_extract` | `spark.comet.expression.RegExpExtract.enabled=false` |
57+
| `regexp_extract_all` | `spark.comet.expression.RegExpExtractAll.enabled=false` |
58+
| `regexp_instr` | `spark.comet.expression.RegExpInStr.enabled=false` |
59+
| `regexp_replace` | `spark.comet.expression.RegExpReplace.enabled=false` |
60+
| `split` | `spark.comet.expression.StringSplit.enabled=false` |
61+
62+
## Choosing an engine
63+
64+
| | Rust engine | Codegen dispatcher (default) |
65+
| -------------------- | ------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------- |
66+
| **Compatibility** | Differs from Java regex (see below) | 100% compatible with Spark |
67+
| **Feature coverage** | `rlike`, `regexp_replace`, `split` natively; `regexp_extract`, `regexp_extract_all`, `regexp_instr` via fallthrough | All regexp expressions (`rlike`, `regexp_extract`, `regexp_extract_all`, `regexp_instr`, `regexp_replace`, `split`) |
68+
| **Performance** | Fully native, no JNI overhead | One JNI round-trip per batch (Arrow vectors stay columnar) |
69+
| **Pattern support** | Linear-time subset only | All Java regex features (backreferences, lookaround, etc.) |
70+
71+
The **Rust engine** is faster but cannot match Java regex semantics for every pattern. Opting in per
72+
expression (for example `spark.comet.expression.RLike.allowIncompatible=true`) declares acceptance
73+
of those differences.
74+
75+
The **codegen dispatcher** is the default and is enabled by `spark.comet.exec.scalaUDF.codegen.enabled`,
76+
so it can be disabled globally to fall back to Spark for the regex family.
77+
78+
## Why the engines differ
79+
80+
Java's [`java.util.regex`] is a backtracking engine in the Perl/PCRE family. It supports the full range of
81+
features that style of engine provides, including some whose worst-case running time grows exponentially with
82+
the input.
83+
84+
Rust's [`regex`] crate is a finite-automaton engine in the [RE2] family. It deliberately omits features that
85+
cannot be implemented with a guarantee of linear-time matching. In exchange, every pattern it does accept runs
86+
in time linear in the size of the input. This is the same trade-off RE2, Go's `regexp`, and several other
87+
engines make.
88+
89+
The practical consequence is that Java accepts a strictly larger set of patterns than the Rust engine, and
90+
several constructs that look the same in source have different semantics on the two sides.
91+
92+
## Features supported by Java but not by the Rust engine
93+
94+
Patterns that use any of the following will not compile in Comet's Rust engine and must run on Spark (or use
95+
the codegen dispatcher):
96+
97+
- **Backreferences** such as `\1`, `\2`, or `\k<name>`. The Rust engine has no backtracking and cannot match
98+
a previously captured group.
99+
- **Lookaround**, including lookahead (`(?=...)`, `(?!...)`) and lookbehind (`(?<=...)`, `(?<!...)`).
100+
- **Atomic groups** (`(?>...)`).
101+
- **Possessive quantifiers** (`*+`, `++`, `?+`, `{n,m}+`). Rust supports greedy and lazy quantifiers but not
102+
possessive.
103+
- **Embedded code, conditionals, and recursion** such as `(?(cond)yes|no)` or `(?R)`. Rust accepts none of
104+
these.
105+
106+
## Features that exist on both sides but behave differently
107+
108+
Even where both engines accept a construct, the matching behavior is not always the same.
109+
110+
- **Unicode-aware character classes.** In the Rust engine, `\d`, `\w`, `\s`, and `.` are Unicode-aware by
111+
default, so `\d` matches every digit codepoint defined by Unicode rather than only `0`-`9`. Java's defaults
112+
match ASCII only for `\d`, `\w`, and `\s` and require the `UNICODE_CHARACTER_CLASS` flag (or `(?U)` inline) to switch to Unicode
113+
semantics. The same pattern can therefore match a different set of characters on each side.
114+
- **Line terminators.** In multiline mode, Java treats `\r`, `\n`, `\r\n`, and a few additional Unicode line
115+
separators as line boundaries by default. The Rust engine treats only `\n` as a line boundary unless CRLF
116+
mode is enabled. `^`, `$`, and `.` (with `(?s)` off) all depend on this definition.
117+
- **Case-insensitive matching.** Both engines support `(?i)`, but Java's default is ASCII case folding while
118+
the Rust engine uses full Unicode simple case folding when Unicode mode is on. Patterns that match characters
119+
outside ASCII can produce different results.
120+
- **POSIX character classes.** The Rust engine supports `[[:alpha:]]` style POSIX classes inside bracket
121+
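expressions but not Java's `\p{Alpha}` shorthand. Java supports only the `\p{Alpha}` form and parses `[[:alpha:]]` as an ordinary nested character class of literal characters. Unicode property escapes (`\p{L}`,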
122+
`\p{Greek}`, etc.) are supported by both engines but cover slightly different sets of properties.
123+
- **Octal and Unicode escapes.** Java accepts `\0nnn` for octal and `\uXXXX` for a BMP codepoint. Rust uses
124+
`\x{...}` for arbitrary codepoints and does not accept Java's bare `\uXXXX` form.
125+
- **Empty matches in `split`.** Spark's `StringSplit`, which is built on Java's regex, includes leading empty
126+
strings produced by zero-width matches at the start of the input. The Rust engine's `split` follows different
127+
rules, so split results can differ in edge cases involving empty matches even when the pattern itself is
128+
identical on both sides.
129+
130+
## When the Rust engine is safe
131+
132+
For most ASCII-only, non-anchored patterns that use only literal characters, simple character classes, and
133+
ordinary quantifiers, the two engines produce the same results. If you are confident your patterns fit this
134+
shape and want to avoid the JNI overhead of the codegen dispatcher, opting in to the Rust engine with
135+
`allowIncompatible=true` is generally safe.
136+
137+
For anything that uses backreferences, lookaround, or relies on Java's specific Unicode or line-handling
138+
defaults, use the codegen dispatcher (the default).
139+
140+
[`java.util.regex`]: https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html
141+
[`regex`]: https://docs.rs/regex/latest/regex/
142+
[RE2]: https://github.com/google/re2/wiki/Syntax

native/jni-bridge/src/errors.rs

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -210,6 +210,11 @@ impl From<CometError> for DataFusionError {
210210
fn from(value: CometError) -> Self {
211211
match value {
212212
CometError::DataFusion { msg: _, source } => source,
213+
// Preserve the original Java throwable (e.g. a SparkRuntimeException raised by Spark's
214+
// own codegen inside the JVM UDF kernel) as an `External` error so it survives the trip
215+
// back through DataFusion and can be re-thrown with its exact type at the JNI boundary.
216+
// Flattening it to a string here would surface it as a generic CometNativeException.
217+
value @ CometError::JavaException { .. } => DataFusionError::External(Box::new(value)),
213218
_ => DataFusionError::Execution(value.to_string()),
214219
}
215220
}
@@ -492,7 +497,15 @@ fn throw_exception(env: &mut Env, error: &CometError, backtrace: Option<String>)
492497
msg: _,
493498
source: DataFusionError::External(e),
494499
} => {
495-
if let Some(spark_error_with_ctx) = e.downcast_ref::<SparkErrorWithContext>() {
500+
if let Some(CometError::JavaException { throwable, .. }) =
501+
e.downcast_ref::<CometError>()
502+
{
503+
// A Java exception captured inside a JVM UDF kernel (e.g. Spark codegen
504+
// raising INVALID_REGEXP_REPLACE). Re-throw the original throwable so callers
505+
// see the exact Spark exception type rather than a wrapped CometNativeException.
506+
env.throw(throwable)
507+
} else if let Some(spark_error_with_ctx) = e.downcast_ref::<SparkErrorWithContext>()
508+
{
496509
let json_message = spark_error_with_ctx.to_json();
497510
env.throw_new(
498511
jni::jni_str!("org/apache/comet/exceptions/CometQueryExecutionException"),

pom.xml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1170,6 +1170,7 @@ under the License.
11701170
<exclude>native/proto/src/generated/**</exclude>
11711171
<exclude>benchmarks/tpc/queries/**</exclude>
11721172
<exclude>.claude/**</exclude>
1173+
<exclude>docs/superpowers/**</exclude>
11731174
</excludes>
11741175
</configuration>
11751176
</plugin>

spark/src/main/scala/org/apache/comet/CometConf.scala

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -369,7 +369,9 @@ object CometConf extends ShimCometConf {
369369
"Arrow-direct codegen dispatcher. When enabled, a supported ScalaUDF is compiled into " +
370370
"a per-batch kernel that reads and writes Arrow vectors directly from native " +
371371
"execution. When disabled, plans containing a ScalaUDF fall back to Spark for the " +
372-
"enclosing operator.")
372+
"enclosing operator. The same dispatcher backs the regex family (`rlike`, " +
373+
"`regexp_replace`, `split`, `regexp_extract`, `regexp_extract_all`, `regexp_instr`) so " +
374+
"those route through it by default as well.")
373375
.booleanConf
374376
.createWithDefault(true)
375377

spark/src/main/scala/org/apache/comet/expressions/RegExp.scala

Lines changed: 0 additions & 32 deletions
This file was deleted.

0 commit comments

Comments
 (0)