apache
diff --git a/‎.github/workflows/pr_build_linux.yml‎
Lines changed: 1 addition & 0 deletions b/‎.github/workflows/pr_build_linux.yml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎.github/workflows/pr_build_macos.yml‎
Lines changed: 1 addition & 0 deletions b/‎.github/workflows/pr_build_macos.yml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎.gitignore‎
Lines changed: 1 addition & 0 deletions b/‎.gitignore‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎dev/diffs/4.0.2.diff‎
Lines changed: 15 additions & 1 deletion b/‎dev/diffs/4.0.2.diff‎
Lines changed: 15 additions & 1 deletion
diff --git a/‎dev/diffs/4.1.2.diff‎
Lines changed: 15 additions & 1 deletion b/‎dev/diffs/4.1.2.diff‎
Lines changed: 15 additions & 1 deletion
diff --git a/‎docs/source/user-guide/latest/compatibility/regex.md‎
Lines changed: 121 additions & 3 deletions b/‎docs/source/user-guide/latest/compatibility/regex.md‎
Lines changed: 121 additions & 3 deletions
diff --git a/‎native/jni-bridge/src/errors.rs‎
Lines changed: 14 additions & 1 deletion b/‎native/jni-bridge/src/errors.rs‎
Lines changed: 14 additions & 1 deletion
diff --git a/‎pom.xml‎
Lines changed: 1 addition & 0 deletions b/‎pom.xml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎spark/src/main/scala/org/apache/comet/CometConf.scala‎
Lines changed: 3 additions & 1 deletion b/‎spark/src/main/scala/org/apache/comet/CometConf.scala‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎spark/src/main/scala/org/apache/comet/expressions/RegExp.scala‎
Lines changed: 0 additions & 32 deletions b/‎spark/src/main/scala/org/apache/comet/expressions/RegExp.scala‎
Lines changed: 0 additions & 32 deletions
@@ -382,6 +382,7 @@ jobs:
               org.apache.comet.expressions.conditional.CometIfSuite
               org.apache.comet.expressions.conditional.CometCoalesceSuite
               org.apache.comet.expressions.conditional.CometCaseWhenSuite
+              org.apache.comet.CometRegExpJvmSuite
               org.apache.comet.CometCodegenSuite
               org.apache.comet.CometCodegenSourceSuite
               org.apache.comet.CometCodegenHOFSuite
 
@@ -198,6 +198,7 @@ jobs:
               org.apache.comet.expressions.conditional.CometIfSuite
               org.apache.comet.expressions.conditional.CometCoalesceSuite
               org.apache.comet.expressions.conditional.CometCaseWhenSuite
+              org.apache.comet.CometRegExpJvmSuite
               org.apache.comet.CometCodegenSuite
               org.apache.comet.CometCodegenSourceSuite
               org.apache.comet.CometCodegenHOFSuite
 
@@ -27,3 +27,4 @@ output
 docs/comet-*/
 docs/build/
 docs/temp/
+docs/superpowers/
@@ -1187,7 +1187,7 @@ index 5ba69c8f9d9..ac1256afe88 100644
      val session = classic.SparkSession.builder().sparkContext(sc).getOrCreate()
      import session.implicits._
 diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
-index 0df7f806272..92390bd819f 100644
+index 0df7f806272..9cdfe8b8f46 100644
 --- a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
 +++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
@@ -17,6 +17,8 @@
@@ -1253,6 +1253,20 @@ index 0df7f806272..92390bd819f 100644
    }
 
    test("non-matching optional group") {
+@@ -1405,7 +1409,12 @@ class StringFunctionsSuite extends QueryTest with SharedSparkSession {
+             s"'$$3 $$1') FROM $tableName"
+           val df = sql(query)
+           val plan = df.queryExecution.executedPlan
+-          assert(plan.isInstanceOf[WholeStageCodegenExec] == (codegenMode == "CODEGEN_ONLY"))
++          // Comet routes regexp_replace through the codegen dispatcher, so the executed plan is a
++          // Comet operator rather than WholeStageCodegenExec. The exception assertions below still
++          // hold; only this Spark-internal plan-shape check is skipped under Comet.
++          if (!isCometEnabled) {
++            assert(plan.isInstanceOf[WholeStageCodegenExec] == (codegenMode == "CODEGEN_ONLY"))
++          }
+           val exception = intercept[SparkRuntimeException] {
+             df.collect()
+           }
 diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala
 index 2e33f6505ab..fc1a2c8f964 100644
 --- a/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala
 
@@ -1276,7 +1276,7 @@ index d7b2511eac2..d5f5b940b94 100644
      val session = classic.SparkSession.builder().sparkContext(sc).getOrCreate()
      import session.implicits._
 diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
-index 7bfc8cf4fa6..7a425b74184 100644
+index 7bfc8cf4fa6..4bd387801db 100644
 --- a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
 +++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
@@ -17,6 +17,8 @@
@@ -1342,6 +1342,20 @@ index 7bfc8cf4fa6..7a425b74184 100644
    }
 
    test("non-matching optional group") {
+@@ -1425,7 +1429,12 @@ class StringFunctionsSuite extends QueryTest with SharedSparkSession {
+             s"'$$3 $$1') FROM $tableName"
+           val df = sql(query)
+           val plan = df.queryExecution.executedPlan
+-          assert(plan.isInstanceOf[WholeStageCodegenExec] == (codegenMode == "CODEGEN_ONLY"))
++          // Comet routes regexp_replace through the codegen dispatcher, so the executed plan is a
++          // Comet operator rather than WholeStageCodegenExec. The exception assertions below still
++          // hold; only this Spark-internal plan-shape check is skipped under Comet.
++          if (!isCometEnabled) {
++            assert(plan.isInstanceOf[WholeStageCodegenExec] == (codegenMode == "CODEGEN_ONLY"))
++          }
+           val exception = intercept[SparkRuntimeException] {
+             df.collect()
+           }
 diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala
 index 3ba48da0e32..a33e65d4420 100644
 --- a/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala
 
@@ -19,6 +19,124 @@ under the License.
 
 # Regular Expressions
 
-Comet uses the Rust regexp crate for evaluating regular expressions, and this has different behavior from Java's
-regular expression engine. Comet will fall back to Spark for patterns that are known to produce different results, but
-this can be overridden by setting `spark.comet.expression.regexp.allowIncompatible=true`.
+Comet evaluates Spark regular-expression expressions (`rlike`, `regexp_replace`, `split`,
+`regexp_extract`, `regexp_extract_all`, `regexp_instr`) two ways:
+
+- **Codegen dispatcher (default)** — Spark's own `doGenCode` for the expression runs inside Comet's
+  Arrow-direct codegen dispatcher (the same dispatcher used by Comet's `ScalaUDF` codegen path).
+  This is 100% compatible with Spark, at the cost of one JNI round-trip per batch. It is enabled by
+  default (`spark.comet.exec.scalaUDF.codegen.enabled=true`); if the dispatcher is disabled, regex
+  expressions fall back to Spark.
+- **Native (Rust) engine** — the Rust [`regex`] crate, run natively with no JNI overhead. It is
+  faster but has different semantics from Java regex (see below), so it is **opt-in per expression**
+  via that expression's `allowIncompatible` flag. `rlike`, `regexp_replace`, and `split` have a
+  native implementation; `regexp_extract`, `regexp_extract_all`, and `regexp_instr` do not and
+  always run through the codegen dispatcher.
+
+| SQL              | Native (Rust) opt-in config                              |
+| ---------------- | -------------------------------------------------------- |
+| `rlike`          | `spark.comet.expression.RLike.allowIncompatible`         |
+| `regexp_replace` | `spark.comet.expression.RegExpReplace.allowIncompatible` |
+| `split`          | `spark.comet.expression.StringSplit.allowIncompatible`   |
+
+When the native path is opted in but a case has no native implementation (for example a non-scalar
+`rlike` pattern, or `regexp_replace` with a non-1 offset), Comet routes that case through the
+codegen dispatcher.
+
+## Disabling Comet for individual regex expressions
+
+Each regex expression has a per-class `spark.comet.expression.<ClassName>.enabled` flag (default
+`true`) that disables Comet's serde for that expression and forces a Spark fallback. This is
+useful for narrowing a regression or comparing performance on a single operator without changing
+the engine selector:
+
+| Expression           | Config                                                  |
+| -------------------- | ------------------------------------------------------- |
+| `rlike`              | `spark.comet.expression.RLike.enabled=false`            |
+| `regexp_extract`     | `spark.comet.expression.RegExpExtract.enabled=false`    |
+| `regexp_extract_all` | `spark.comet.expression.RegExpExtractAll.enabled=false` |
+| `regexp_instr`       | `spark.comet.expression.RegExpInStr.enabled=false`      |
+| `regexp_replace`     | `spark.comet.expression.RegExpReplace.enabled=false`    |
+| `split`              | `spark.comet.expression.StringSplit.enabled=false`      |
+
+## Choosing an engine
+
+|                      | Rust engine                                                                                                         | Codegen dispatcher (default)                                                                                        |
+| -------------------- | ------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------- |
+| **Compatibility**    | Differs from Java regex (see below)                                                                                 | 100% compatible with Spark                                                                                          |
+| **Feature coverage** | `rlike`, `regexp_replace`, `split` natively; `regexp_extract`, `regexp_extract_all`, `regexp_instr` via fallthrough | All regexp expressions (`rlike`, `regexp_extract`, `regexp_extract_all`, `regexp_instr`, `regexp_replace`, `split`) |
+| **Performance**      | Fully native, no JNI overhead                                                                                       | One JNI round-trip per batch (Arrow vectors stay columnar)                                                          |
+| **Pattern support**  | Linear-time subset only                                                                                             | All Java regex features (backreferences, lookaround, etc.)                                                          |
+
+The **Rust engine** is faster but cannot match Java regex semantics for every pattern. Opting in per
+expression (for example `spark.comet.expression.RLike.allowIncompatible=true`) declares acceptance
+of those differences.
+
+The **codegen dispatcher** is the default and is controlled by `spark.comet.exec.scalaUDF.codegen.enabled`
+(default `true`); setting it to `false` makes the whole regex family fall back to Spark.
+
+## Why the engines differ
+
+Java's [`java.util.regex`] is a backtracking engine in the Perl/PCRE family. It supports the full range of
+features that style of engine provides, including some whose worst-case running time grows exponentially with
+the input.
+
+Rust's [`regex`] crate is a finite-automaton engine in the [RE2] family. It deliberately omits features that
+cannot be implemented with a guarantee of linear-time matching. In exchange, every pattern it does accept runs
+in time linear in the size of the input. This is the same trade-off RE2, Go's `regexp`, and several other
+engines make.
+
+The practical consequence is that Java accepts a strictly larger set of patterns than the Rust engine, and
+several constructs that look the same in source have different semantics on the two sides.
+
+## Features supported by Java but not by the Rust engine
+
+Patterns that use any of the following will not compile in Comet's Rust engine and must run on Spark or
+through the codegen dispatcher, which evaluates them with Java's regex engine:
+
+- **Backreferences** such as `\1`, `\2`, or `\k<name>`. The Rust engine has no backtracking and cannot match
+  a previously captured group.
+- **Lookaround**, including lookahead (`(?=...)`, `(?!...)`) and lookbehind (`(?<=...)`, `(?<!...)`).
+- **Atomic groups** (`(?>...)`).
+- **Possessive quantifiers** (`*+`, `++`, `?+`, `{n,m}+`). Rust supports greedy and lazy quantifiers but not
+  possessive.
+- **Embedded code, conditionals, and recursion** such as `(?(cond)yes|no)` or `(?R)`. Rust accepts none of
+  these.
+
+## Features that exist on both sides but behave differently
+
+Even where both engines accept a construct, the matching behavior is not always the same.
+
+- **Unicode-aware character classes.** In the Rust engine, `\d`, `\w`, `\s`, and `.` are Unicode-aware by
+  default, so `\d` matches every digit codepoint defined by Unicode rather than only `0`-`9`. Java's defaults
+  match ASCII only and require the `UNICODE_CHARACTER_CLASS` flag (or `(?U)` inline) to switch to Unicode
+  semantics. The same pattern can therefore match a different set of characters on each side.
+- **Line terminators.** In multiline mode, Java treats `\r`, `\n`, `\r\n`, and a few additional Unicode line
+  separators as line boundaries by default. The Rust engine treats only `\n` as a line boundary unless CRLF
+  mode is enabled. `^`, `$`, and `.` (with `(?s)` off) all depend on this definition.
+- **Case-insensitive matching.** Both engines support `(?i)`, but Java's default is ASCII case folding while
+  the Rust engine uses full Unicode simple case folding when Unicode mode is on. Patterns that match characters
+  outside ASCII can produce different results.
+- **POSIX character classes.** The Rust engine supports `[[:alpha:]]` style POSIX classes inside bracket
+  expressions but not Java's `\p{Alpha}` shorthand. Java accepts `\p{Alpha}` but does not treat `[[:alpha:]]` as a POSIX class: it parses it as a nested set of the literal characters `:`, `a`, `l`, `p`, and `h`. Unicode property escapes (`\p{L}`,
+  `\p{Greek}`, etc.) are supported by both engines but cover slightly different sets of properties.
+- **Octal and Unicode escapes.** Java accepts `\0nnn` for octal and `\uXXXX` for a BMP codepoint. Rust uses
+  `\x{...}` for arbitrary codepoints and does not accept Java's bare `\uXXXX` form.
+- **Empty matches in `split`.** Spark's `StringSplit`, which is built on Java's regex, includes leading empty
+  strings produced by zero-width matches at the start of the input. The Rust engine's `split` follows different
+  rules, so split results can differ in edge cases involving empty matches even when the pattern itself is
+  identical on both sides.
+
+## When the Rust engine is safe
+
+For most ASCII-only, non-anchored patterns that use only literal characters, simple character classes, and
+ordinary quantifiers, the two engines produce the same results. If you are confident your patterns fit this
+shape and want to avoid the JNI overhead of the codegen dispatcher, opting in to the Rust engine with the
+expression's `allowIncompatible=true` flag is generally safe.
+
+For anything that uses backreferences, lookaround, or relies on Java's specific Unicode or line-handling
+defaults, use the default codegen dispatcher.
+
+[`java.util.regex`]: https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html
+[`regex`]: https://docs.rs/regex/latest/regex/
+[RE2]: https://github.com/google/re2/wiki/Syntax
@@ -210,6 +210,11 @@ impl From<CometError> for DataFusionError {
     fn from(value: CometError) -> Self {
         match value {
             CometError::DataFusion { msg: _, source } => source,
+            // Preserve the original Java throwable (e.g. a SparkRuntimeException raised by Spark's
+            // own codegen inside the JVM UDF kernel) as an `External` error so it survives the trip
+            // back through DataFusion and can be re-thrown with its exact type at the JNI boundary.
+            // Flattening it to a string here would surface it as a generic CometNativeException.
+            value @ CometError::JavaException { .. } => DataFusionError::External(Box::new(value)),
             _ => DataFusionError::Execution(value.to_string()),
         }
     }
@@ -492,7 +497,15 @@ fn throw_exception(env: &mut Env, error: &CometError, backtrace: Option<String>)
                 msg: _,
                 source: DataFusionError::External(e),
             } => {
-                if let Some(spark_error_with_ctx) = e.downcast_ref::<SparkErrorWithContext>() {
+                if let Some(CometError::JavaException { throwable, .. }) =
+                    e.downcast_ref::<CometError>()
+                {
+                    // A Java exception captured inside a JVM UDF kernel (e.g. Spark codegen
+                    // raising INVALID_REGEXP_REPLACE). Re-throw the original throwable so callers
+                    // see the exact Spark exception type rather than a wrapped CometNativeException.
+                    env.throw(throwable)
+                } else if let Some(spark_error_with_ctx) = e.downcast_ref::<SparkErrorWithContext>()
+                {
                     let json_message = spark_error_with_ctx.to_json();
                     env.throw_new(
                         jni::jni_str!("org/apache/comet/exceptions/CometQueryExecutionException"),
 
@@ -1170,6 +1170,7 @@ under the License.
             <exclude>native/proto/src/generated/**</exclude>
             <exclude>benchmarks/tpc/queries/**</exclude>
             <exclude>.claude/**</exclude>
+            <exclude>docs/superpowers/**</exclude>
           </excludes>
         </configuration>
       </plugin>
 
@@ -369,7 +369,9 @@ object CometConf extends ShimCometConf {
         "Arrow-direct codegen dispatcher. When enabled, a supported ScalaUDF is compiled into " +
         "a per-batch kernel that reads and writes Arrow vectors directly from native " +
         "execution. When disabled, plans containing a ScalaUDF fall back to Spark for the " +
-        "enclosing operator.")
+        "enclosing operator. The same dispatcher backs the regex family (`rlike`, " +
+        "`regexp_replace`, `split`, `regexp_extract`, `regexp_extract_all`, `regexp_instr`) so " +
+        "those route through it by default as well.")
       .booleanConf
       .createWithDefault(true)