Skip to content

Commit 7203fb5

Browse files
authored
chore: Remove config option for native_iceberg_compat (#4019)
1 parent 0ca37e1 commit 7203fb5

339 files changed

Lines changed: 208 additions & 19626 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.github/actions/java-test/action.yaml

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -29,10 +29,6 @@ inputs:
2929
description: 'Maven options passed to the mvn command'
3030
required: false
3131
default: ''
32-
scan_impl:
33-
description: 'The default Parquet scan implementation'
34-
required: false
35-
default: 'auto'
3632
upload-test-reports:
3733
description: 'Whether to upload test results including coverage to GitHub'
3834
required: false
@@ -72,7 +68,6 @@ runs:
7268
shell: bash
7369
if: ${{ inputs.suites == '' }}
7470
env:
75-
COMET_PARQUET_SCAN_IMPL: ${{ inputs.scan_impl }}
7671
SPARK_LOCAL_HOSTNAME: "localhost"
7772
SPARK_LOCAL_IP: "127.0.0.1"
7873
run: |
@@ -81,7 +76,6 @@ runs:
8176
shell: bash
8277
if: ${{ inputs.suites != '' }}
8378
env:
84-
COMET_PARQUET_SCAN_IMPL: ${{ inputs.scan_impl }}
8579
SPARK_LOCAL_HOSTNAME: "localhost"
8680
SPARK_LOCAL_IP: "127.0.0.1"
8781
run: |

.github/workflows/pr_build_linux.yml

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -279,27 +279,22 @@ jobs:
279279
- name: "Spark 3.4, JDK 11, Scala 2.12"
280280
java_version: "11"
281281
maven_opts: "-Pspark-3.4 -Pscala-2.12"
282-
scan_impl: "auto"
283282

284283
- name: "Spark 3.5, JDK 17, Scala 2.13"
285284
java_version: "17"
286285
maven_opts: "-Pspark-3.5 -Pscala-2.13"
287-
scan_impl: "native_iceberg_compat"
288286

289287
- name: "Spark 4.0, JDK 21"
290288
java_version: "21"
291289
maven_opts: "-Pspark-4.0"
292-
scan_impl: "auto"
293290

294291
- name: "Spark 4.1, JDK 17"
295292
java_version: "17"
296293
maven_opts: "-Pspark-4.1"
297-
scan_impl: "auto"
298294

299295
- name: "Spark 4.2, JDK 17"
300296
java_version: "17"
301297
maven_opts: "-Pspark-4.2"
302-
scan_impl: "auto"
303298
suite:
304299
- name: "fuzz"
305300
value: |
@@ -390,7 +385,7 @@ jobs:
390385
org.apache.spark.sql.CometToPrettyStringSuite
391386
org.apache.spark.sql.CometCollationSuite
392387
fail-fast: false
393-
name: ${{ matrix.profile.name }}/${{ matrix.profile.scan_impl }} [${{ matrix.suite.name }}]
388+
name: ${{ matrix.profile.name }} [${{ matrix.suite.name }}]
394389
runs-on: ubuntu-24.04
395390
container:
396391
image: amd64/rust
@@ -427,10 +422,9 @@ jobs:
427422
- name: Java test steps
428423
uses: ./.github/actions/java-test
429424
with:
430-
artifact_name: ${{ matrix.profile.name }}-${{ matrix.suite.name }}-${{ github.run_id }}-${{ github.run_number }}-${{ github.run_attempt }}-${{ matrix.profile.scan_impl }}
425+
artifact_name: ${{ matrix.profile.name }}-${{ matrix.suite.name }}-${{ github.run_id }}-${{ github.run_number }}-${{ github.run_attempt }}
431426
suites: ${{ matrix.suite.name == 'sql' && matrix.profile.name == 'Spark 3.4, JDK 11, Scala 2.12' && '' || matrix.suite.value }}
432427
maven_opts: ${{ matrix.profile.maven_opts }}
433-
scan_impl: ${{ matrix.profile.scan_impl }}
434428
upload-test-reports: true
435429
skip-native-build: true
436430

.github/workflows/spark_sql_test.yml

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -142,15 +142,13 @@ jobs:
142142
- {name: "sql_hive-1", args1: "", args2: "hive/testOnly * -- -l org.apache.spark.tags.ExtendedHiveTest -l org.apache.spark.tags.SlowHiveTest"}
143143
- {name: "sql_hive-2", args1: "", args2: "hive/testOnly * -- -n org.apache.spark.tags.ExtendedHiveTest"}
144144
- {name: "sql_hive-3", args1: "", args2: "hive/testOnly * -- -n org.apache.spark.tags.SlowHiveTest"}
145-
# Since 4f5eaf0, auto mode uses native_datafusion for V1 scans,
146-
# so we only need to test with auto.
147145
config:
148-
- {spark-short: '3.4', spark-full: '3.4.3', java: 11, scan-impl: 'auto'}
149-
- {spark-short: '3.5', spark-full: '3.5.8', java: 11, scan-impl: 'auto'}
150-
- {spark-short: '4.0', spark-full: '4.0.2', java: 21, scan-impl: 'auto'}
151-
- {spark-short: '4.1', spark-full: '4.1.1', java: 17, scan-impl: 'auto'}
146+
- {spark-short: '3.4', spark-full: '3.4.3', java: 11}
147+
- {spark-short: '3.5', spark-full: '3.5.8', java: 11}
148+
- {spark-short: '4.0', spark-full: '4.0.2', java: 21}
149+
- {spark-short: '4.1', spark-full: '4.1.1', java: 17}
152150
fail-fast: false
153-
name: spark-sql-${{ matrix.config.scan-impl }}-${{ matrix.module.name }}/spark-${{ matrix.config.spark-full }}-jdk${{ matrix.config.java }}
151+
name: spark-sql-${{ matrix.module.name }}/spark-${{ matrix.config.spark-full }}-jdk${{ matrix.config.java }}
154152
runs-on: ubuntu-24.04
155153
container:
156154
image: amd64/rust
@@ -190,7 +188,7 @@ jobs:
190188
# Cap parallel forked test JVMs at 1 so that even when
191189
# SparkParallelTestGrouping is enabled we don't blow the
192190
# 7 GB runner budget (each forked test JVM has -Xmx2g).
193-
NOLINT_ON_COMPILE=true ENABLE_COMET=true ENABLE_COMET_ONHEAP=true COMET_PARQUET_SCAN_IMPL=${{ matrix.config.scan-impl }} ENABLE_COMET_LOG_FALLBACK_REASONS=${{ github.event.inputs.collect-fallback-logs || 'false' }} \
191+
NOLINT_ON_COMPILE=true ENABLE_COMET=true ENABLE_COMET_ONHEAP=true ENABLE_COMET_LOG_FALLBACK_REASONS=${{ github.event.inputs.collect-fallback-logs || 'false' }} \
194192
build/sbt -Dsbt.log.noformat=true -mem $SBT_MEM \
195193
'set Global / concurrentRestrictions := Seq(Tags.limit(Tags.ForkedTestGroup, 1))' \
196194
${{ matrix.module.args1 }} "${{ matrix.module.args2 }}"
@@ -214,7 +212,7 @@ jobs:
214212
if: ${{ github.event.inputs.collect-fallback-logs == 'true' }}
215213
uses: actions/upload-artifact@v7
216214
with:
217-
name: fallback-log-spark-sql-${{ matrix.config.scan-impl }}-${{ matrix.module.name }}-spark-${{ matrix.config.spark-full }}-jdk${{ matrix.config.java }}
215+
name: fallback-log-spark-sql-${{ matrix.module.name }}-spark-${{ matrix.config.spark-full }}-jdk${{ matrix.config.java }}
218216
path: "**/fallback.log"
219217

220218
merge-fallback-logs:

.github/workflows/spark_sql_test_native_iceberg_compat.yml

Lines changed: 0 additions & 72 deletions
This file was deleted.

docs/source/contributor-guide/bug_triage.md

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -73,8 +73,7 @@ help contributors find bugs in their area of expertise.
7373
| `area:ffi` | Arrow FFI / JNI boundary |
7474
| `area:ci` | CI/CD, GitHub Actions, build tooling |
7575

76-
The following pre-existing labels also serve as area indicators: `native_datafusion`,
77-
`native_iceberg_compat`, `spark 4`, `spark sql tests`.
76+
The following pre-existing labels also serve as area indicators: `spark 4`, `spark sql tests`.
7877

7978
## Triage Process
8079

@@ -109,9 +108,8 @@ Periodically review open bugs to ensure priorities are still accurate:
109108
crashes, because crashes are at least visible.
110109
2. **User-reported over test-only.** A bug hit by a real user on a real workload takes priority
111110
over one found only in test suites.
112-
3. **Core path over experimental.** Bugs in the default scan mode (`native_comet`) or widely-used
113-
expressions take priority over bugs in experimental features like `native_datafusion` or
114-
`native_iceberg_compat`.
111+
3. **Core path over experimental.** Bugs in widely-used expressions and operators take priority over
112+
bugs in experimental features.
115113
4. **Production safety over feature completeness.** Fixing a data corruption bug is more important
116114
than adding support for a new expression.
117115

docs/source/user-guide/latest/datasources.md

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -61,11 +61,9 @@ Comet supports most standard storage systems, such as local file system and obje
6161

6262
Apache DataFusion Comet native reader seamlessly scans files from remote HDFS for [supported formats](#supported-spark-data-sources)
6363

64-
### Using experimental native DataFusion reader
64+
### Building Comet with HDFS support
6565

66-
Unlike to native Comet reader the Datafusion reader fully supports nested types processing. This reader is currently experimental only
67-
68-
To build Comet with native DataFusion reader and remote HDFS support it is required to have a JDK installed
66+
To build Comet with remote HDFS support it is required to have a JDK installed.
6967

7068
Example:
7169
Build a Comet for `spark-4.1` provide a JDK path in `JAVA_HOME`
@@ -76,11 +74,10 @@ export JAVA_HOME="/opt/homebrew/opt/openjdk@17"
7674
make release PROFILES="-Pspark-4.1" COMET_FEATURES=hdfs RUSTFLAGS="-L $JAVA_HOME/libexec/openjdk.jdk/Contents/Home/lib/server"
7775
```
7876

79-
Start Comet with experimental reader and HDFS support as [described](installation.md/#run-spark-shell-with-comet-enabled)
77+
Start Comet with HDFS support as [described](installation.md/#run-spark-shell-with-comet-enabled)
8078
and add additional parameters
8179

8280
```shell
83-
--conf spark.comet.scan.impl=native_datafusion \
8481
--conf spark.hadoop.fs.defaultFS="hdfs://namenode:9000" \
8582
--conf spark.hadoop.dfs.client.use.datanode.hostname = true \
8683
--conf dfs.client.use.datanode.hostname = true
@@ -158,7 +155,6 @@ JAVA_HOME="/opt/homebrew/opt/openjdk@17" make release PROFILES="-Pspark-4.1" COM
158155
withSQLConf(
159156
CometConf.COMET_ENABLED.key -> "true",
160157
CometConf.COMET_EXEC_ENABLED.key -> "true",
161-
CometConf.COMET_NATIVE_SCAN_IMPL.key -> CometConf.SCAN_NATIVE_DATAFUSION,
162158
SQLConf.USE_V1_SOURCE_LIST.key -> "parquet",
163159
"fs.defaultFS" -> "hdfs://namenode:9000",
164160
"dfs.client.use.datanode.hostname" -> "true") {
@@ -169,7 +165,7 @@ JAVA_HOME="/opt/homebrew/opt/openjdk@17" make release PROFILES="-Pspark-4.1" COM
169165
}
170166
```
171167

172-
Or use `spark-shell` with HDFS support as described [above](#using-experimental-native-datafusion-reader)
168+
Or use `spark-shell` with HDFS support as described [above](#building-comet-with-hdfs-support)
173169

174170
## S3
175171

spark/src/main/scala/org/apache/comet/CometConf.scala

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -111,22 +111,23 @@ object CometConf extends ShimCometConf {
111111
.booleanConf
112112
.createWithEnvVarOrDefault("ENABLE_COMET_WRITE", false)
113113

114+
@deprecated
114115
val SCAN_NATIVE_DATAFUSION = "native_datafusion"
116+
117+
@deprecated
115118
val SCAN_NATIVE_ICEBERG_COMPAT = "native_iceberg_compat"
119+
120+
@deprecated
116121
val SCAN_AUTO = "auto"
117122

123+
@deprecated
118124
val COMET_NATIVE_SCAN_IMPL: ConfigEntry[String] = conf("spark.comet.scan.impl")
119-
.category(CATEGORY_PARQUET)
120-
.doc(
121-
"The implementation of Comet's Parquet scan to use. Available scans are " +
122-
s"`$SCAN_NATIVE_DATAFUSION`, and `$SCAN_NATIVE_ICEBERG_COMPAT`. " +
123-
s"`$SCAN_NATIVE_DATAFUSION` is a fully native implementation, and " +
124-
s"`$SCAN_NATIVE_ICEBERG_COMPAT` is a hybrid implementation that supports some " +
125-
"additional features, such as row indexes and field ids. " +
126-
s"`$SCAN_AUTO` (default) chooses the best available scan based on the scan schema.")
125+
.category(CATEGORY_TESTING)
126+
.internal()
127+
.doc("This configuration option is deprecated and has no effect on Comet behavior.")
127128
.stringConf
128129
.transform(_.toLowerCase(Locale.ROOT))
129-
.checkValues(Set(SCAN_NATIVE_DATAFUSION, SCAN_NATIVE_ICEBERG_COMPAT, SCAN_AUTO))
130+
.checkValues(Set(SCAN_NATIVE_DATAFUSION, SCAN_AUTO))
130131
.createWithEnvVarOrDefault("COMET_PARQUET_SCAN_IMPL", SCAN_AUTO)
131132

132133
val COMET_ICEBERG_NATIVE_ENABLED: ConfigEntry[Boolean] =

spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala

Lines changed: 2 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -183,12 +183,7 @@ case class CometScanRule(session: SparkSession)
183183
return scanExec
184184
}
185185

186-
COMET_NATIVE_SCAN_IMPL.get() match {
187-
case SCAN_AUTO | SCAN_NATIVE_DATAFUSION =>
188-
nativeDataFusionScan(plan, session, scanExec, r, hadoopConf).getOrElse(scanExec)
189-
case SCAN_NATIVE_ICEBERG_COMPAT =>
190-
nativeIcebergCompatScan(session, scanExec, r, hadoopConf).getOrElse(scanExec)
191-
}
186+
nativeDataFusionScan(plan, session, scanExec, r, hadoopConf).getOrElse(scanExec)
192187

193188
case _ =>
194189
withInfo(scanExec, s"Unsupported relation ${scanExec.relation}")
@@ -255,21 +250,6 @@ case class CometScanRule(session: SparkSession)
255250
Some(CometScanExec(scanExec, session, SCAN_NATIVE_DATAFUSION))
256251
}
257252

258-
private def nativeIcebergCompatScan(
259-
session: SparkSession,
260-
scanExec: FileSourceScanExec,
261-
r: HadoopFsRelation,
262-
hadoopConf: Configuration): Option[SparkPlan] = {
263-
if (encryptionEnabled(hadoopConf) && !isEncryptionConfigSupported(hadoopConf)) {
264-
withInfo(scanExec, s"$SCAN_NATIVE_ICEBERG_COMPAT does not support encryption")
265-
return None
266-
}
267-
if (!isSchemaSupported(scanExec, SCAN_NATIVE_ICEBERG_COMPAT, r)) {
268-
return None
269-
}
270-
Some(CometScanExec(scanExec, session, SCAN_NATIVE_ICEBERG_COMPAT))
271-
}
272-
273253
private def transformV2Scan(scanExec: BatchScanExec): SparkPlan = {
274254

275255
scanExec.scan match {
@@ -812,8 +792,7 @@ object CometScanRule extends Logging {
812792
Native.validateObjectStoreConfig(filePath, objectStoreOptions)
813793
} catch {
814794
case e: CometNativeException =>
815-
val reason = "Object store config not supported by " +
816-
s"$SCAN_NATIVE_ICEBERG_COMPAT: ${e.getMessage}"
795+
val reason = s"Object store config not supported: ${e.getMessage}"
817796
fallbackReasons += reason
818797
configValidityMap.put(cacheKey, Some(reason))
819798
}

spark/src/test/resources/tpcds-plan-stability/approved-plans-v1_4-spark3_5/q44.native_iceberg_compat/extended.txt

Lines changed: 0 additions & 64 deletions
This file was deleted.

spark/src/test/resources/tpcds-plan-stability/approved-plans-v1_4-spark3_5/q44.native_datafusion/extended.txt renamed to spark/src/test/resources/tpcds-plan-stability/approved-plans-v1_4-spark3_5/q44/extended.txt

File renamed without changes.

0 commit comments

Comments
 (0)