fix: complete native_datafusion Parquet schema-mismatch rejections #13595

Workflow file for this run

.github/workflows/spark_sql_test.yml at b346d0c

	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.

	name: Spark SQL Tests

	concurrency:
	group: ${{ github.repository }}-${{ github.head_ref \|\| github.sha }}-${{ github.workflow }}
	cancel-in-progress: true

	on:
	push:
	branches:
	- main
	paths:
	- "native//src/"
	- "native/**/Cargo.toml"
	- "native/Cargo.lock"
	- "!native/hdfs/**"
	- "!native/fs-hdfs/**"
	- "common/src/main/**"
	- "common/pom.xml"
	- "spark/src/main/**"
	- "!spark/src/main/scala/org/apache/comet/GenerateDocs.scala"
	- "spark/pom.xml"
	- "dev/diffs/**"
	- "pom.xml"
	- "rust-toolchain.toml"
	- ".github/workflows/spark_sql_test.yml"
	- ".github/actions/setup-builder/**"
	- ".github/actions/setup-spark-builder/**"
	pull_request:
	paths:
	- "native//src/"
	- "native/**/Cargo.toml"
	- "native/Cargo.lock"
	- "!native/hdfs/**"
	- "!native/fs-hdfs/**"
	- "common/src/main/**"
	- "common/pom.xml"
	- "spark/src/main/**"
	- "!spark/src/main/scala/org/apache/comet/GenerateDocs.scala"
	- "spark/pom.xml"
	- "dev/diffs/**"
	- "pom.xml"
	- "rust-toolchain.toml"
	- ".github/workflows/spark_sql_test.yml"
	- ".github/actions/setup-builder/**"
	- ".github/actions/setup-spark-builder/**"
	# manual trigger
	# https://docs.github.com/en/actions/managing-workflow-runs/manually-running-a-workflow
	workflow_dispatch:
	inputs:
	collect-fallback-logs:
	description: 'Whether to collect Comet fallback reasons from spark sql unit test logs'
	required: false
	default: 'false'
	type: boolean

	env:
	RUST_VERSION: stable
	RUST_BACKTRACE: 1
	# Force GNU ld on Linux: recent Rust stable defaults to rust-lld on
	# x86_64-unknown-linux-gnu, and rust-lld cannot resolve -ljvm against the
	# Zulu JDK layout installed by setup-java. Keep bfd for all cargo invocations.
	RUSTFLAGS: "-Clink-arg=-fuse-ld=bfd"

	jobs:

	# Build native library once and share with all test jobs
	build-native:
	name: Build Native Library
	runs-on: ubuntu-24.04
	container:
	image: amd64/rust
	steps:
	- uses: actions/checkout@v6

	- name: Setup Rust toolchain
	uses: ./.github/actions/setup-builder
	with:
	rust-version: ${{ env.RUST_VERSION }}
	jdk-version: 17

	- name: Restore Cargo cache
	uses: actions/cache/restore@v5
	with:
	path: \|
	~/.cargo/registry
	~/.cargo/git
	native/target
	key: ${{ runner.os }}-cargo-ci-${{ hashFiles('native//Cargo.lock', 'native//Cargo.toml') }}-${{ hashFiles('native/*/.rs') }}
	restore-keys: \|
	${{ runner.os }}-cargo-ci-${{ hashFiles('native//Cargo.lock', 'native//Cargo.toml') }}-

	- name: Build native library (CI profile)
	run: \|
	cd native
	cargo build --profile ci
	env:
	RUSTFLAGS: "-Ctarget-cpu=x86-64-v3 -Clink-arg=-fuse-ld=bfd"

	- name: Upload native library
	uses: actions/upload-artifact@v7
	with:
	name: native-lib-linux
	path: native/target/ci/libcomet.so
	retention-days: 1

	- name: Save Cargo cache
	uses: actions/cache/save@v5
	if: github.ref == 'refs/heads/main'
	with:
	path: \|
	~/.cargo/registry
	~/.cargo/git
	native/target
	key: ${{ runner.os }}-cargo-ci-${{ hashFiles('native//Cargo.lock', 'native//Cargo.toml') }}-${{ hashFiles('native/*/.rs') }}

	spark-sql-test:
	needs: build-native
	strategy:
	matrix:
	module:
	- {name: "catalyst", args1: "catalyst/test", args2: ""}
	- {name: "sql_core-1", args1: "", args2: "sql/testOnly * -- -l org.apache.spark.tags.ExtendedSQLTest -l org.apache.spark.tags.SlowSQLTest"}
	- {name: "sql_core-2", args1: "", args2: "sql/testOnly * -- -n org.apache.spark.tags.ExtendedSQLTest"}
	- {name: "sql_core-3", args1: "", args2: "sql/testOnly * -- -n org.apache.spark.tags.SlowSQLTest"}
	- {name: "sql_hive-1", args1: "", args2: "hive/testOnly * -- -l org.apache.spark.tags.ExtendedHiveTest -l org.apache.spark.tags.SlowHiveTest"}
	- {name: "sql_hive-2", args1: "", args2: "hive/testOnly * -- -n org.apache.spark.tags.ExtendedHiveTest"}
	- {name: "sql_hive-3", args1: "", args2: "hive/testOnly * -- -n org.apache.spark.tags.SlowHiveTest"}
	# Since 4f5eaf0, auto mode uses native_datafusion for V1 scans,
	# so we only need to test with auto.
	config:
	- {spark-short: '3.4', spark-full: '3.4.3', java: 11, scan-impl: 'auto'}
	- {spark-short: '3.5', spark-full: '3.5.8', java: 11, scan-impl: 'auto'}
	- {spark-short: '4.0', spark-full: '4.0.2', java: 21, scan-impl: 'auto'}
	- {spark-short: '4.1', spark-full: '4.1.1', java: 17, scan-impl: 'auto'}
	fail-fast: false
	name: spark-sql-${{ matrix.config.scan-impl }}-${{ matrix.module.name }}/spark-${{ matrix.config.spark-full }}-jdk${{ matrix.config.java }}
	runs-on: ubuntu-24.04
	container:
	image: amd64/rust
	steps:
	- uses: actions/checkout@v6
	- name: Setup Rust & Java toolchain
	uses: ./.github/actions/setup-builder
	with:
	rust-version: ${{env.RUST_VERSION}}
	jdk-version: ${{ matrix.config.java }}
	- name: Download native library
	uses: actions/download-artifact@v8
	with:
	name: native-lib-linux
	path: native/target/release/
	- name: Setup Spark
	uses: ./.github/actions/setup-spark-builder
	with:
	spark-version: ${{ matrix.config.spark-full }}
	spark-short-version: ${{ matrix.config.spark-short }}
	skip-native-build: true
	- name: Run Spark tests
	run: \|
	cd apache-spark
	rm -rf /root/.m2/repository/org/apache/parquet # somehow parquet cache requires cleanups
	NOLINT_ON_COMPILE=true ENABLE_COMET=true ENABLE_COMET_ONHEAP=true COMET_PARQUET_SCAN_IMPL=${{ matrix.config.scan-impl }} ENABLE_COMET_LOG_FALLBACK_REASONS=${{ github.event.inputs.collect-fallback-logs \|\| 'false' }} \
	build/sbt -Dsbt.log.noformat=true -mem $SBT_MEM ${{ matrix.module.args1 }} "${{ matrix.module.args2 }}"
	if [ "${{ github.event.inputs.collect-fallback-logs }}" = "true" ]; then
	find . -type f -name "unit-tests.log" -print0 \| xargs -0 grep -h "Comet cannot accelerate" \| sed 's/.*Comet cannot accelerate/Comet cannot accelerate/' \| sort -u > fallback.log
	fi
	env:
	LC_ALL: "C.UTF-8"
	# Standard GitHub runners have 7 GB RAM; cap SBT heap so forked test
	# JVMs fit alongside it.
	SBT_MEM: "3072"
	# Disable parallel test execution to reduce peak memory usage —
	# mirrors what apache/spark does on GitHub Actions.
	SERIAL_SBT_TESTS: "1"
	# Mirror Spark's own JDK 21 / 25 CI workaround. apache/spark's
	# build_java21.yml and build_java25.yml set this same env var to
	# process-isolate the V1/V2 Parquet and Orc source suites because
	# they exhibit cross-suite resource interactions (file-stream and
	# thread leaks) under the newer JDKs. project/SparkBuild.scala
	# reads DEDICATED_JVM_SBT_TESTS and forks a separate JVM per
	# listed suite. Empty value is a safe no-op.
	DEDICATED_JVM_SBT_TESTS: ${{ matrix.config.spark-short == '4.0' && 'org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormatV1Suite,org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormatV2Suite,org.apache.spark.sql.execution.datasources.orc.OrcSourceV1Suite,org.apache.spark.sql.execution.datasources.orc.OrcSourceV2Suite' \|\| '' }}
	- name: Upload fallback log
	if: ${{ github.event.inputs.collect-fallback-logs == 'true' }}
	uses: actions/upload-artifact@v7
	with:
	name: fallback-log-spark-sql-${{ matrix.config.scan-impl }}-${{ matrix.module.name }}-spark-${{ matrix.config.spark-full }}-jdk${{ matrix.config.java }}
	path: "**/fallback.log"

	merge-fallback-logs:
	if: ${{ github.event.inputs.collect-fallback-logs == 'true' }}
	name: merge-fallback-logs
	needs: [spark-sql-test]
	runs-on: ubuntu-24.04
	steps:
	- name: Download fallback log artifacts
	uses: actions/download-artifact@v8
	with:
	path: fallback-logs/
	- name: Merge fallback logs
	run: \|
	find ./fallback-logs/ -type f -name "fallback.log" -print0 \| xargs -0 cat \| sort -u > all_fallback.log
	- name: Upload merged fallback log
	uses: actions/upload-artifact@v7
	with:
	name: all-fallback-log
	path: all_fallback.log

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

fix: complete native_datafusion Parquet schema-mismatch rejections #13595

Workflow file

fix: complete native_datafusion Parquet schema-mismatch rejections #13595

Uh oh!

Workflow file for this run