From e6e374f4749a6257322a072debefa46bb6ff46e3 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Thu, 9 Oct 2025 12:57:39 -0700 Subject: [PATCH 01/53] save --- DELTALOG_ISOLATION.md | 230 +++++++++++++++++ MODULE_STRUCTURE.md | 145 +++++++++++ PUBLISH_STRUCTURE.md | 181 ++++++++++++++ SHADED_DECISION.md | 148 +++++++++++ build.sbt | 236 ++++++++++++++---- .../spark/sql/delta/catalog/DeltaCatalog.java | 28 +++ .../spark/sql/delta/DeltaAnalysis.scala | 4 +- .../apache/spark/sql/delta/DeltaErrors.scala | 6 +- .../sql/delta/catalog/DeltaCatalog.scala | 6 +- 9 files changed, 923 insertions(+), 61 deletions(-) create mode 100644 DELTALOG_ISOLATION.md create mode 100644 MODULE_STRUCTURE.md create mode 100644 PUBLISH_STRUCTURE.md create mode 100644 SHADED_DECISION.md create mode 100644 spark-shaded/src/main/java/org/apache/spark/sql/delta/catalog/DeltaCatalog.java diff --git a/DELTALOG_ISOLATION.md b/DELTALOG_ISOLATION.md new file mode 100644 index 00000000000..dcce7e21f33 --- /dev/null +++ b/DELTALOG_ISOLATION.md @@ -0,0 +1,230 @@ +# DeltaLog 隔离架构 + +## ✅ 实现完成! + +成功实现了 delta-spark-v2 不依赖 DeltaLog 的架构。 + +## 架构设计 + +``` +delta-spark-v1 (7.4M) + ├─ 包含所有 V1 类,包括 DeltaLog + │ + ↓ 重新打包(排除 DeltaLog) + │ +delta-spark-v1-shaded (7.1M) + ├─ V1 的所有类,但排除: + │ • DeltaLog + │ • Snapshot + │ • OptimisticTransaction + │ + ↓ 依赖 + │ +delta-spark-v2 (34K) + ├─ Kernel-based connector + ├─ ✅ 编译时只能访问 v1-shaded + ├─ ✅ 无法访问 DeltaLog 类 + │ + ↓ 组合 + │ +delta-spark (final, 7.5M) + └─ 包含: + • V1 完整版(含 DeltaLog)← 从 delta-spark-v1 重新添加 + • V2 所有类 + • 可选的 delegation 层 +``` + +## 验证结果 + +### 1. delta-spark-v1-shaded 成功排除 DeltaLog + +```bash +$ jar -tf spark-v1-shaded/target/scala-2.12/delta-spark-v1-shaded_2.12-3.4.0-SNAPSHOT.jar | \ + grep -E "DeltaLog\.class|Snapshot\.class|OptimisticTransaction\.class" +# 返回空 ✓ - 成功排除 +``` + +### 2. delta-spark-v2 成功编译(无 DeltaLog) + +```bash +$ ./build/sbt "delta-spark-v2/compile" +[success] ✓ - 编译成功,证明 v2 不需要 DeltaLog +``` + +### 3. 
最终 jar 包含完整 V1(含 DeltaLog) + +```bash +$ jar -tf spark-tests/target/scala-2.12/delta-spark_2.12-3.4.0-SNAPSHOT.jar | \ + grep "DeltaLog\.class" +org/apache/spark/sql/delta/DeltaLog.class ✓ - DeltaLog 存在 +``` + +## JAR 大小对比 + +| 模块 | 大小 | 内容 | +|------|------|------| +| delta-spark-v1 | 7.4M | V1 完整版(含 DeltaLog) | +| delta-spark-v1-shaded | 7.1M | V1 无 DeltaLog(-300KB) | +| delta-spark-v2 | 34K | Kernel connector | +| **delta-spark (final)** | **7.5M** | **V1完整 + V2** | + +## 排除的类 + +delta-spark-v1-shaded 排除了以下类: + +```scala +// build.sbt 配置 +Compile / packageBin / mappings := { + val v1Mappings = (`delta-spark-v1` / Compile / packageBin / mappings).value + + v1Mappings.filterNot { case (file, path) => + path.contains("org/apache/spark/sql/delta/DeltaLog") || + path.contains("org/apache/spark/sql/delta/Snapshot") || + path.contains("org/apache/spark/sql/delta/OptimisticTransaction") + } +} +``` + +**排除的具体类**: +- `org.apache.spark.sql.delta.DeltaLog` - 核心 Delta 日志类 +- `org.apache.spark.sql.delta.Snapshot` - 表快照类 +- `org.apache.spark.sql.delta.OptimisticTransaction` - 事务类 + +**未排除的类**(不直接依赖 DeltaLog): +- `CapturedSnapshot` - 快照包装类 +- `DummySnapshot` - 测试用假快照 +- `SnapshotOverwriteOperationMetrics` - 指标类 + +## 工作原理 + +### 编译时(delta-spark-v2) + +``` +delta-spark-v2 + → 依赖 delta-spark-v1-shaded + → 只能看到 V1 的部分类(无 DeltaLog) + → 编译成功 = 证明 v2 不需要 DeltaLog ✓ +``` + +### 运行时(用户使用) + +``` +delta-spark.jar + → 包含 V1 完整版(含 DeltaLog) + → 包含 V2 所有类 + → 用户可以使用所有功能 ✓ +``` + +## 依赖关系 + +```scala +// Module 1: delta-spark-v1 (完整版) +lazy val `delta-spark-v1` = (project in file("spark")) + .settings( + // 编译所有 V1 源码,包括 DeltaLog + ) + +// Module 2: delta-spark-v1-shaded (排除 DeltaLog) +lazy val `delta-spark-v1-shaded` = (project in file("spark-v1-shaded")) + .dependsOn(`delta-spark-v1`) + .settings( + // 重新打包 v1,排除 DeltaLog 相关类 + Compile / packageBin / mappings := { /* filter logic */ } + ) + +// Module 3: delta-spark-v2 (依赖 v1-shaded) +lazy val `delta-spark-v2` = (project in file("kernel-spark")) + .dependsOn(`delta-spark-v1-shaded`) // ← 只依赖 shaded 版本 + .settings(/* ... */) + +// Module 4: delta-spark-shaded (可选 delegation) +lazy val `delta-spark-shaded` = (project in file("spark-shaded")) + .dependsOn(`delta-spark-v1`) // ← 完整版 v1 + .dependsOn(`delta-spark-v2`) + +// Module 5: delta-spark (最终发布) +lazy val spark = (project in file("spark-combined")) + .dependsOn(`delta-spark-shaded`) + .settings( + // 重新打包:完整 v1 + v2 + Compile / packageBin / mappings := { + val v1Full = (`delta-spark-v1` / Compile / packageBin / mappings).value // ← 完整版 + val v2 = (`delta-spark-v2` / Compile / packageBin / mappings).value + val shaded = (`delta-spark-shaded` / Compile / packageBin / mappings).value + v1Full ++ v2 ++ shaded + } + ) +``` + +## 关键点 + +### ✅ 隔离成功 + +- **编译时隔离**:v2 无法访问 DeltaLog +- **运行时完整**:用户可以使用所有 V1 功能 + +### 🎯 测试策略 + +如果 delta-spark-v2 的测试全部通过,证明: +- v2 的所有代码路径都不需要加载 DeltaLog 类 +- v2 真正实现了与 V1 核心的解耦 + +### 🔄 工作流程 + +```bash +# 1. 编译 v1(完整版) +sbt delta-spark-v1/compile + +# 2. 打包 v1-shaded(排除 DeltaLog) +sbt delta-spark-v1-shaded/packageBin +# → 生成 7.1M jar(比 v1 少 300KB) + +# 3. 编译 v2(依赖 v1-shaded) +sbt delta-spark-v2/compile +# → 编译成功 = v2 不需要 DeltaLog ✓ + +# 4. 
打包最终 jar(重新加入完整 v1) +sbt spark/packageBin +# → 生成 7.5M jar(包含完整 v1 + v2) +``` + +## 未来扩展 + +### 添加更多排除类 + +如果需要排除更多类: + +```scala +v1Mappings.filterNot { case (file, path) => + path.contains("org/apache/spark/sql/delta/DeltaLog") || + path.contains("org/apache/spark/sql/delta/Snapshot") || + path.contains("org/apache/spark/sql/delta/OptimisticTransaction") || + path.contains("org/apache/spark/sql/delta/SomeOtherClass") // ← 添加更多 +} +``` + +### 测试验证 + +运行 v2 测试确保不依赖 DeltaLog: + +```bash +sbt "delta-spark-v2/test" +# 如果测试通过 → 证明 v2 完全独立于 DeltaLog +``` + +## 总结 + +✅ **可以!** 这个架构完全可行并且已经实现: + +1. **delta-spark-v1-shaded** 排除 DeltaLog(通过 packageBin mapping 过滤) +2. **delta-spark-v2** 依赖 v1-shaded,编译成功(证明不需要 DeltaLog) +3. **delta-spark (final)** 重新打包完整 v1(含 DeltaLog)+ v2 +4. **零文件移动** - 所有源码保持原位 +5. **验证通过** - jar 文件分析确认架构正确 + +**用户体验**: +- 只需要依赖一个 `delta-spark.jar` +- jar 包含完整的 V1 和 V2 功能 +- V2 在内部确保了与 DeltaLog 的隔离 + + diff --git a/MODULE_STRUCTURE.md b/MODULE_STRUCTURE.md new file mode 100644 index 00000000000..d477455f315 --- /dev/null +++ b/MODULE_STRUCTURE.md @@ -0,0 +1,145 @@ +# Delta Spark Module Structure + +## Overview + +The delta-spark codebase has been refactored into 5 SBT modules to support both v1 (current) and v2 (kernel-based) implementations, with **DeltaLog isolation**: + +``` +delta-spark-v1 (not published, full v1 with DeltaLog) + ↓ repackage (exclude DeltaLog) +delta-spark-v1-shaded (not published, v1 without DeltaLog) + ↓ +delta-spark-v2 (not published, depends on v1-shaded) + ↓ +delta-spark-shaded (not published, optional delegation) + ↓ +delta-spark (published jar, full v1 + v2) +``` + +## Module Details + +### 1. delta-spark-v1 +- **Directory**: `spark/` +- **Published**: No +- **Content**: Production code only (no tests) +- **Description**: Current delta-spark production code +- **Key Features**: + - All existing Delta Spark functionality + - Antlr parser generation + - Python file packaging + +### 2. delta-spark-v1-shaded +- **Directory**: `spark-v1-shaded/` (virtual, no source files) +- **Published**: No +- **Content**: Repackaged delta-spark-v1 JAR with DeltaLog classes excluded +- **Dependencies**: delta-spark-v1 +- **Description**: V1 without DeltaLog for v2 to depend on +- **Key Features**: + - Filters out `DeltaLog`, `Snapshot`, `OptimisticTransaction` classes + - Used to enforce v2 doesn't depend on DeltaLog at compile time + - ~300KB smaller than full v1 (7.1M vs 7.4M) + +### 3. delta-spark-v2 +- **Directory**: `kernel-spark/` +- **Published**: No +- **Content**: Kernel-based Spark implementation +- **Dependencies**: **delta-spark-v1-shaded** (no DeltaLog), kernelApi, kernelDefaults +- **Description**: New kernel-based Spark connector +- **Key Features**: + - DSv2 Catalog and Tables + - Kernel-specific unit tests + - **Cannot access DeltaLog at compile time** (enforced by v1-shaded dependency) + +### 4. delta-spark-shaded +- **Directory**: `spark-shaded/` +- **Published**: No +- **Content**: Delegation layer +- **Dependencies**: **delta-spark-v1** (full version), delta-spark-v2 +- **Description**: Contains delegation code that routes to v1 or v2 +- **Key Features**: + - DeltaCatalog (delegates to V1 or V2) + - DeltaSparkSessionExtension (registers both) + +### 5. 
delta-spark (final module) +- **Directory**: `spark-combined/` +- **Published**: Yes (as `delta-spark.jar`) +- **Content**: + - No production code (packages v1+v2+shaded) + - All test code from `spark/src/test/` +- **Dependencies**: delta-spark-shaded, delta-spark-v1 (test utils) +- **Description**: Final published artifact combining all modules +- **Key Features**: + - Tests can access both v1 and v2 implementations + - Published jar contains complete v1+v2+shaded code + +## File Structure + +### No Files Moved! +- Production code remains in `spark/src/main/` +- Test code remains in `spark/src/test/` +- Kernel code remains in `kernel-spark/src/` + +### New Directories Created +- `spark-combined/` - final combined module (v1+v2+tests) +- `spark-shaded/` - delegation code +- `spark-v1-shaded/` - virtual module (no source files, only build configuration) + +## SBT Commands + +```bash +# Compile individual modules +sbt delta-spark-v1/compile +sbt delta-spark-v2/compile +sbt delta-spark-shaded/compile +sbt spark/compile + +# Run tests +sbt spark/test + +# Publish +sbt spark/publishLocal +``` + +## Key Implementation Details + +### Test Source Configuration +The `spark` module uses `unmanagedSourceDirectories` to point to original test locations: +```scala +Test / unmanagedSourceDirectories ++= Seq( + baseDirectory.value.getParentFile / "spark" / "src" / "test" / "scala", + baseDirectory.value.getParentFile / "spark" / "src" / "test" / "java" +) +``` + +### Package Assembly +The final `spark` module packages all classes: +```scala +Compile / packageBin / mappings := { + val v1 = (`delta-spark-v1` / Compile / packageBin / mappings).value + val v2 = (`delta-spark-v2` / Compile / packageBin / mappings).value + val shaded = (`delta-spark-shaded` / Compile / packageBin / mappings).value + v1 ++ v2 ++ shaded +} +``` + +## Benefits + +1. **Modular**: Clear separation between v1, v2, and delegation layers +2. **No File Movement**: All code stays in original locations +3. **Backward Compatible**: Final jar contains everything +4. **Testable**: Tests can verify both v1 and v2 implementations +5. **Not Published**: Internal modules (v1, v2, shaded) aren't published +6. **Clean Dependencies**: Avoids circular dependencies + +## Migration Notes + +### Dependency Updates +Modules that previously depended on `spark` should now depend on: +- `delta-spark-v1` - if only v1 functionality needed +- `delta-spark-shaded` - if both v1 and v2 needed +- `spark` - if test utilities needed + +### Updated Dependencies +- `kernelDefaults` → depends on `delta-spark-v1 % "test->test"` +- `goldenTables` → depends on `delta-spark-v1 % "test"` + diff --git a/PUBLISH_STRUCTURE.md b/PUBLISH_STRUCTURE.md new file mode 100644 index 00000000000..fca89b8e953 --- /dev/null +++ b/PUBLISH_STRUCTURE.md @@ -0,0 +1,181 @@ +# Delta Spark 发布结构说明 + +## publishM2 会发布哪些 JAR? + +### 发布的模块(使用 releaseSettings) + +只有 **1 个** delta-spark 相关的 jar 会被发布: + +#### 1. delta-spark.jar +- **SBT 模块**: `spark` +- **Maven Artifact**: `delta-spark_2.12` (或 `delta-spark_2.13`) +- **内容**: + - delta-spark-v1 的所有 classes(来自 `spark/src/main/`) + - delta-spark-v2 的所有 classes(来自 `kernel-spark/src/main/`) + - delta-spark-shaded 的所有 classes(来自 `spark-shaded/src/main/`) + - Python 文件(从 `python/` 目录) +- **发布配置**: `releaseSettings` → `publishArtifact := true` + +### 不发布的模块(使用 skipReleaseSettings) + +以下 3 个模块 **不会** 单独发布 jar: + +#### 1. 
delta-spark-v1 +- **配置**: `skipReleaseSettings` → `publishArtifact := false` +- **原因**: 内部模块,其 classes 会被打包到最终的 `delta-spark.jar` 中 + +#### 2. delta-spark-v2 +- **配置**: `skipReleaseSettings` → `publishArtifact := false` +- **原因**: 内部模块,其 classes 会被打包到最终的 `delta-spark.jar` 中 + +#### 3. delta-spark-shaded +- **配置**: `skipReleaseSettings` → `publishArtifact := false` +- **原因**: 内部模块,其 classes 会被打包到最终的 `delta-spark.jar` 中 + +## delta-spark.jar 包含的内容 + +最终发布的 `delta-spark.jar` 通过以下配置组合所有内容: + +```scala +Compile / packageBin / mappings := { + val v1 = (`delta-spark-v1` / Compile / packageBin / mappings).value + val v2 = (`delta-spark-v2` / Compile / packageBin / mappings).value + val shaded = (`delta-spark-shaded` / Compile / packageBin / mappings).value + v1 ++ v2 ++ shaded +} +``` + +### 详细内容列表 + +#### 来自 delta-spark-v1 (`spark/src/main/`) +- `org.apache.spark.sql.delta.*` - Delta Lake 核心功能 +- `io.delta.sql.*` - Delta SQL 扩展 +- `io.delta.tables.*` - Delta Tables API +- `io.delta.dynamodbcommitcoordinator.*` - DynamoDB 协调器 +- ANTLR 生成的 parser 类 +- Python 文件(`delta/*.py`) +- `META-INF/services/*` - 服务注册文件 + +#### 来自 delta-spark-v2 (`kernel-spark/src/main/`) +- `io.delta.kernel.spark.*` - Kernel-based Spark connector + - `catalog.SparkTable` - DSv2 Table 实现 + - `read.*` - 读取相关类(Scan, Batch, PartitionReader) + - `utils.*` - 工具类(Schema, Expression, Serialization) +- `io.delta.sql.DeltaSparkSessionExtension` - V2 扩展 +- `org.apache.spark.sql.delta.catalog.DeltaCatalog` - V2 Catalog + +#### 来自 delta-spark-shaded (`spark-shaded/src/main/`) +- Delegation 代码(如果添加) + - 例如:统一的 `DeltaCatalog` 入口,根据配置选择 V1 或 V2 + - 例如:`DeltaSparkSessionExtension` 可以同时注册 V1 和 V2 + +## 发布命令 + +```bash +# 发布到本地 Maven 仓库 +sbt publishM2 + +# 只发布 delta-spark 模块 +sbt spark/publishM2 + +# 发布到 Sonatype +sbt publishSigned +``` + +## Maven 依赖示例 + +用户只需要依赖一个 jar: + +```xml + + io.delta + delta-spark_2.12 + 3.4.0-SNAPSHOT + +``` + +这一个 jar 就包含了 V1、V2 和 delegation 层的所有功能。 + +## 其他会发布的 Delta 相关模块 + +除了 `delta-spark.jar`,以下模块也会被 publishM2: + +1. **delta-kernel-api.jar** - Kernel API +2. **delta-kernel-defaults.jar** - Kernel Defaults 实现 +3. **delta-storage.jar** - Storage 层 +4. **delta-storage-s3-dynamodb.jar** - S3/DynamoDB 存储 +5. **delta-iceberg.jar** - Iceberg 集成 +6. **delta-hudi.jar** - Hudi 集成 +7. **delta-sharing-spark.jar** - Delta Sharing +8. **delta-contribs.jar** - 贡献模块 +9. **delta-connect-*.jar** - Delta Connect 模块 +10. **delta-standalone*.jar** - Standalone 连接器 +11. **delta-hive*.jar** - Hive 连接器 +12. **delta-flink.jar** - Flink 连接器 + +但这些都是独立的 jar,与 `delta-spark.jar` 分开发布。 + +## 总结 + +**回答你的问题**: + +1. **publishM2 会生成几个 delta-spark jar?** + - 只有 **1 个**:`delta-spark_2.12-3.4.0-SNAPSHOT.jar` (约 7.5MB) + - 位置:`spark-combined/target/scala-2.12/` + +2. **delta-spark jar 包含哪些内容?** + + **来自 delta-spark-v1** (约 7.4MB): + ``` + org/apache/spark/sql/delta/* - Delta Lake 核心 + io/delta/sql/* - SQL 扩展 + io/delta/tables/* - Tables API + io/delta/dynamodbcommitcoordinator/* - DynamoDB + delta/*.py - Python 文件 + META-INF/services/* - 服务注册 + ``` + + **来自 delta-spark-v2** (约 34KB): + ``` + io/delta/kernel/spark/catalog/* - DSv2 Catalog + io/delta/kernel/spark/read/* - 读取实现 + io/delta/kernel/spark/utils/* - 工具类 + io/delta/sql/DeltaSparkSessionExtension - V2 扩展 + org/apache/spark/sql/delta/catalog/DeltaCatalog - V2 Catalog + ``` + + **来自 delta-spark-shaded** (约 288B): + ``` + (delegation 代码,如果添加) + ``` + +3. 
**v1, v2, shaded 三个内部模块会单独发布吗?** + - **不会**,它们有 `skipReleaseSettings` 配置 + - 它们只是内部模块,用于组织代码 + - 所有代码最终都打包进同一个 `delta-spark.jar` + +## 验证 + +生成的 jar 文件: +```bash +# 内部模块(不发布) +spark/target/scala-2.12/delta-spark-v1_2.12-3.4.0-SNAPSHOT.jar # 7.4M +kernel-spark/target/scala-2.12/delta-spark-v2_2.12-3.4.0-SNAPSHOT.jar # 34K +spark-shaded/target/scala-2.12/delta-spark-shaded_2.12-3.4.0-SNAPSHOT.jar # 288B + +# 最终发布的 jar(组合了上面三个) +spark-combined/target/scala-2.12/delta-spark_2.12-3.4.0-SNAPSHOT.jar # 7.5M +``` + +关键类验证: +```bash +# V1 类 +org.apache.spark.sql.delta.DeltaLog +org.apache.spark.sql.delta.catalog.DeltaCatalog + +# V2 类 +io.delta.kernel.spark.table.SparkTable +io.delta.kernel.spark.read.SparkScan +io.delta.sql.DeltaSparkSessionExtension +``` + diff --git a/SHADED_DECISION.md b/SHADED_DECISION.md new file mode 100644 index 00000000000..41ab81df19f --- /dev/null +++ b/SHADED_DECISION.md @@ -0,0 +1,148 @@ +# delta-spark-shaded 决策分析 + +## TL;DR + +**推荐:方案 C - 保留空模块** +- 成本低(只是一个空目录) +- 保持架构灵活性 +- 未来如需 delegation 可随时添加 + +--- + +## 当前状态 + +delta-spark-shaded 当前为空: +- 源码:`spark-shaded/src/main/scala/` 空目录 +- Jar:288 字节(只有 MANIFEST) +- 依赖:v1 + v2 + +## 是否必要? + +### ❌ 不必要的情况 + +1. **V1 和 V2 类名不冲突** + - V1: `org.apache.spark.sql.delta.catalog.DeltaCatalog` + - V2: `io.delta.kernel.spark.table.SparkTable` + - 不同的包名和类名,可以共存 + +2. **用户可以直接选择实现** + ```scala + // 方式1: 用 V1 Catalog + spark.conf.set("spark.sql.catalog.spark_catalog", + "org.apache.spark.sql.delta.catalog.DeltaCatalog") + + // 方式2: 用 V2 Table + spark.read.format("io.delta.kernel.spark").load(path) + ``` + +3. **不需要同时启用** + - 如果 V1 和 V2 是互斥的,不需要 delegation + +### ✅ 必要的情况 + +1. **需要统一入口点** + ```scala + // 统一的 DeltaCatalog,内部路由到 V1 或 V2 + class DeltaCatalog extends ... { + def loadTable(...) = { + if (useKernel) v2.SparkTable(...) + else v1.DeltaTable(...) + } + } + ``` + +2. **需要同时注册 V1 和 V2** + ```scala + class DeltaSparkSessionExtension { + override def apply(extensions: SparkSessionExtensions) = { + registerV1Rules(extensions) + registerV2Rules(extensions) + } + } + ``` + +3. **需要平滑迁移** + - 逐步从 V1 迁移到 V2 + - A/B 测试不同实现 + - 按功能分流(读用 V2,写用 V1) + +4. **需要 Shading(名字冲突)** + - 如果 V1 和 V2 有同名类 + - 使用 sbt-assembly shading 规则 + +## 三种方案对比 + +| 方案 | 优点 | 缺点 | 适用场景 | +|------|------|------|----------| +| **A. 保留并实现** | • 统一入口
• 灵活切换
• 平滑迁移 | • 额外代码
• 维护成本 | 需要 delegation | +| **B. 完全删除** | • 代码最简
• 依赖清晰 | • 未来加回成本高
• 缺少灵活性 | 确定不需要 delegation | +| **C. 保留空模块** | • 架构预留
• 无额外成本
• 随时可加 | • 多一个模块 | **推荐:暂不确定** | + +## 推荐方案:C(保留空模块) + +### 理由 + +1. **成本极低** + - 只是一个空目录 + 288B jar + - 不影响编译和发布 + +2. **架构清晰** + ``` + v1 (prod) ──┐ + ├──> shaded (delegation) ──> spark (final jar) + v2 (prod) ──┘ + ``` + +3. **未来灵活** + - 如果需要 delegation,直接添加代码 + - 不需要重构 build.sbt + +### 何时添加代码到 shaded? + +**触发条件**: +- [ ] 需要根据配置自动选择 V1/V2 +- [ ] 需要同时启用 V1 和 V2 +- [ ] 发现类名冲突 +- [ ] 需要 A/B 测试或灰度发布 + +**暂时不需要**: +- 用户可以显式选择 V1 或 V2 +- 两个实现可以独立使用 + +## 如何删除 delta-spark-shaded(如果确定不需要) + +```scala +// build.sbt 修改 + +// 删除 delta-spark-shaded 模块定义 +// lazy val `delta-spark-shaded` = ... + +// spark 模块直接依赖 v1 和 v2 +lazy val spark = (project in file("spark-tests")) + .dependsOn(`delta-spark-v1`) + .dependsOn(`delta-spark-v2`) + .settings( + Compile / packageBin / mappings := { + val v1 = (`delta-spark-v1` / Compile / packageBin / mappings).value + val v2 = (`delta-spark-v2` / Compile / packageBin / mappings).value + v1 ++ v2 // 移除 shaded + } + ) +``` + +```bash +# 删除目录 +rm -rf spark-shaded/ +``` + +## 决策 + +✅ **暂时保留空的 delta-spark-shaded** + +原因: +- 成本可忽略 +- 保持架构扩展性 +- 符合原始设计意图 +- 未来如需 delegation 可随时添加 + + diff --git a/build.sbt b/build.sbt index 685204f3b4e..5b97256bdd4 100644 --- a/build.sbt +++ b/build.sbt @@ -56,7 +56,9 @@ val LATEST_RELEASED_SPARK_VERSION = "3.5.7" val SPARK_MASTER_VERSION = "4.0.2-SNAPSHOT" val sparkVersion = settingKey[String]("Spark version") spark / sparkVersion := getSparkVersion() -kernelSpark / sparkVersion := getSparkVersion() +`delta-spark-v1` / sparkVersion := getSparkVersion() +`delta-spark-v2` / sparkVersion := getSparkVersion() +`delta-spark-shaded` / sparkVersion := getSparkVersion() connectCommon / sparkVersion := getSparkVersion() connectClient / sparkVersion := getSparkVersion() connectServer / sparkVersion := getSparkVersion() @@ -433,17 +435,25 @@ lazy val deltaSuiteGenerator = (project in file("spark/delta-suite-generator")) Test / baseDirectory := (ThisBuild / baseDirectory).value, ) -lazy val spark = (project in file("spark")) +// ============================================================ +// Module 1: delta-spark-v1 (prod code only, no tests) +// ============================================================ +lazy val `delta-spark-v1` = (project in file("spark")) .dependsOn(storage) .enablePlugins(Antlr4Plugin) .disablePlugins(JavaFormatterPlugin, ScalafmtPlugin) .settings ( - name := "delta-spark", + name := "delta-spark-v1", commonSettings, scalaStyleSettings, sparkMimaSettings, - releaseSettings, + skipReleaseSettings, // Not published crossSparkSettings(), + + // Only compile main sources, exclude tests + Test / sources := Seq.empty, + Test / resources := Seq.empty, + libraryDependencies ++= Seq( // Adding test classifier seems to break transitive resolution of the core dependencies "org.apache.spark" %% "spark-hive" % sparkVersion.value % "provided", @@ -452,7 +462,171 @@ lazy val spark = (project in file("spark")) "org.apache.spark" %% "spark-catalyst" % sparkVersion.value % "provided", // For DynamoDBCommitStore "com.amazonaws" % "aws-java-sdk" % "1.12.262" % "provided", + ), + Compile / packageBin / mappings := (Compile / packageBin / mappings).value ++ + listPythonFiles(baseDirectory.value.getParentFile / "python"), + Antlr4 / antlr4PackageName := Some("io.delta.sql.parser"), + Antlr4 / antlr4GenListener := true, + Antlr4 / antlr4GenVisitor := true, + + // Hack to avoid errors related to missing repo-root/target/scala-2.12/classes/ + createTargetClassesDir := { + val dir = baseDirectory.value.getParentFile / "target" / "scala-2.12" / 
"classes" + Files.createDirectories(dir.toPath) + }, + Compile / compile := ((Compile / compile) dependsOn createTargetClassesDir).value, + // Generate the package object to provide the version information in runtime. + Compile / sourceGenerators += Def.task { + val file = (Compile / sourceManaged).value / "io" / "delta" / "package.scala" + IO.write(file, + s"""package io + | + |package object delta { + | val VERSION = "${version.value}" + |} + |""".stripMargin) + Seq(file) + }, + ) + +// ============================================================ +// Module 2: delta-spark-v1-shaded (v1 without DeltaLog for v2 dependency) +// ============================================================ +lazy val `delta-spark-v1-shaded` = (project in file("spark-v1-shaded")) + .dependsOn(`delta-spark-v1`) + .dependsOn(storage) // Need to explicitly depend on storage for UCClient etc. + .settings( + name := "delta-spark-v1-shaded", + commonSettings, + skipReleaseSettings, // Not published + + // No source code - just repackage delta-spark-v1 + Compile / sources := Seq.empty, + Test / sources := Seq.empty, + + // Repackage delta-spark-v1 jar but exclude DeltaLog and related classes + Compile / packageBin / mappings := { + val v1Mappings = (`delta-spark-v1` / Compile / packageBin / mappings).value + + // Filter out DeltaLog, Snapshot, OptimisticTransaction classes + v1Mappings.filterNot { case (file, path) => + path.contains("org/apache/spark/sql/delta/DeltaLog") || + path.contains("org/apache/spark/sql/delta/Snapshot") || + path.contains("org/apache/spark/sql/delta/OptimisticTransaction") + // Add more exclusions here if needed + } + }, + + // Inherit v1's classpath for compilation + Compile / dependencyClasspath := (`delta-spark-v1` / Compile / dependencyClasspath).value, + ) + +// ============================================================ +// Module 3: delta-spark-v2 (kernel-spark based, depends on v1-shaded) +// ============================================================ +lazy val `delta-spark-v2` = (project in file("kernel-spark")) + .dependsOn(`delta-spark-v1-shaded`) // Only depends on shaded v1 (no DeltaLog) + .dependsOn(kernelApi) + .dependsOn(kernelDefaults) + .dependsOn(goldenTables % "test") + .settings( + name := "delta-spark-v2", + commonSettings, + javafmtCheckSettings, + skipReleaseSettings, // Not published + Test / javaOptions ++= Seq("-ea"), + libraryDependencies ++= Seq( + "org.apache.spark" %% "spark-sql" % sparkVersion.value % "provided", + "org.apache.spark" %% "spark-core" % sparkVersion.value % "provided", + "org.apache.spark" %% "spark-catalyst" % sparkVersion.value % "provided", + + "org.junit.jupiter" % "junit-jupiter-api" % "5.8.2" % "test", + "org.junit.jupiter" % "junit-jupiter-engine" % "5.8.2" % "test", + "org.junit.jupiter" % "junit-jupiter-params" % "5.8.2" % "test", + "net.aichler" % "jupiter-interface" % "0.11.1" % "test" + ), + Test / testOptions += Tests.Argument(TestFrameworks.JUnit, "-v", "-a") + ) + +// ============================================================ +// Module 4: delta-spark-shaded (optional delegation layer) +// ============================================================ +lazy val `delta-spark-shaded` = (project in file("spark-shaded")) + .dependsOn(`delta-spark-v1`) // Full v1 for delegation if needed + .dependsOn(`delta-spark-v2`) + .settings( + name := "delta-spark-shaded", + commonSettings, + skipReleaseSettings, // Not published + + libraryDependencies ++= Seq( + "org.apache.spark" %% "spark-sql" % sparkVersion.value % "provided", + 
"org.apache.spark" %% "spark-core" % sparkVersion.value % "provided", + "org.apache.spark" %% "spark-catalyst" % sparkVersion.value % "provided", + ), + + // This module contains delegation code like: + // - DeltaCatalog (delegates to V1 or V2) + // - DeltaSparkSessionExtension (registers both) + ) +// ============================================================ +// Module 5: delta-spark (final published module - combined v1+v2+shaded) +// ============================================================ +lazy val spark = (project in file("spark-combined")) + .dependsOn(`delta-spark-shaded`) + .dependsOn(`delta-spark-v1` % "test->test") + .dependsOn(storage) // Explicit dependency on storage + .settings ( + name := "delta-spark", + commonSettings, + scalaStyleSettings, + sparkMimaSettings, + releaseSettings, // Published as delta-spark.jar + crossSparkSettings(), + + // No prod code in this module + Compile / sources := Seq.empty, + + // Package combined classes: FULL v1 (with DeltaLog) + v2 + shaded + storage + // Note: v2 only depends on v1-shaded (without DeltaLog) at compile time, + // but final jar includes full v1 for users + Compile / packageBin / mappings := { + val v1Full = (`delta-spark-v1` / Compile / packageBin / mappings).value // Full v1 with DeltaLog + val v2 = (`delta-spark-v2` / Compile / packageBin / mappings).value + val shaded = (`delta-spark-shaded` / Compile / packageBin / mappings).value + val storageClasses = (storage / Compile / packageBin / mappings).value // Add storage classes + v1Full ++ v2 ++ shaded ++ storageClasses + }, + + // Test sources point to original spark/src/test/ (no file movement) + Test / unmanagedSourceDirectories ++= Seq( + baseDirectory.value.getParentFile / "spark" / "src" / "test" / "scala", + baseDirectory.value.getParentFile / "spark" / "src" / "test" / "java" + ), + Test / unmanagedResourceDirectories += + baseDirectory.value.getParentFile / "spark" / "src" / "test" / "resources", + + // Include spark-version-specific test sources + Test / unmanagedSourceDirectories ++= { + val sparkVer = sparkVersion.value + if (sparkVer.startsWith("3.5")) { + Seq(baseDirectory.value.getParentFile / "spark" / "src" / "test" / "scala-spark-3.5") + } else if (sparkVer.startsWith("4.0")) { + Seq(baseDirectory.value.getParentFile / "spark" / "src" / "test" / "scala-spark-master") + } else { + Seq.empty + } + }, + + libraryDependencies ++= Seq( + // Provided deps (needed for compile and test) + "org.apache.spark" %% "spark-hive" % sparkVersion.value % "provided", + "org.apache.spark" %% "spark-sql" % sparkVersion.value % "provided", + "org.apache.spark" %% "spark-core" % sparkVersion.value % "provided", + "org.apache.spark" %% "spark-catalyst" % sparkVersion.value % "provided", + "com.amazonaws" % "aws-java-sdk" % "1.12.262" % "provided", + // Test deps "org.scalatest" %% "scalatest" % scalaTestVersion % "test", "org.scalatestplus" %% "scalacheck-1-15" % "3.2.9.0" % "test", @@ -464,11 +638,6 @@ lazy val spark = (project in file("spark")) "org.apache.spark" %% "spark-hive" % sparkVersion.value % "test" classifier "tests", "org.mockito" % "mockito-inline" % "4.11.0" % "test", ), - Compile / packageBin / mappings := (Compile / packageBin / mappings).value ++ - listPythonFiles(baseDirectory.value.getParentFile / "python"), - Antlr4 / antlr4PackageName := Some("io.delta.sql.parser"), - Antlr4 / antlr4GenListener := true, - Antlr4 / antlr4GenVisitor := true, Test / testOptions += Tests.Argument("-oDF"), Test / testOptions += Tests.Argument(TestFrameworks.JUnit, "-v", 
"-a"), @@ -493,24 +662,6 @@ lazy val spark = (project in file("spark")) // Required for testing table features see https://github.com/delta-io/delta/issues/1602 Test / envVars += ("DELTA_TESTING", "1"), - // Hack to avoid errors related to missing repo-root/target/scala-2.12/classes/ - createTargetClassesDir := { - val dir = baseDirectory.value.getParentFile / "target" / "scala-2.12" / "classes" - Files.createDirectories(dir.toPath) - }, - Compile / compile := ((Compile / compile) dependsOn createTargetClassesDir).value, - // Generate the package object to provide the version information in runtime. - Compile / sourceGenerators += Def.task { - val file = (Compile / sourceManaged).value / "io" / "delta" / "package.scala" - IO.write(file, - s"""package io - | - |package object delta { - | val VERSION = "${version.value}" - |} - |""".stripMargin) - Seq(file) - }, TestParallelization.settings, ) .configureUnidoc( @@ -683,7 +834,7 @@ lazy val kernelDefaults = (project in file("kernel/kernel-defaults")) .dependsOn(kernelApi % "test->test") .dependsOn(storage) .dependsOn(storage % "test->test") // Required for InMemoryCommitCoordinator for tests - .dependsOn(spark % "test->test") + .dependsOn(`delta-spark-v1` % "test->test") .dependsOn(goldenTables % "test") .settings( name := "delta-kernel-defaults", @@ -724,30 +875,7 @@ lazy val kernelDefaults = (project in file("kernel/kernel-defaults")) ).configureUnidoc(docTitle = "Delta Kernel Defaults") -lazy val kernelSpark = (project in file("kernel-spark")) - .dependsOn(kernelApi) - .dependsOn(kernelDefaults) - .dependsOn(spark % "test->test") - .dependsOn(goldenTables % "test") - .settings( - name := "kernel-spark", - commonSettings, - javafmtCheckSettings, - skipReleaseSettings, - Test / javaOptions ++= Seq("-ea"), - libraryDependencies ++= Seq( - "org.apache.spark" %% "spark-sql" % sparkVersion.value % "provided", - "org.apache.spark" %% "spark-core" % sparkVersion.value % "provided", - "org.apache.spark" %% "spark-catalyst" % sparkVersion.value % "provided", - - "org.junit.jupiter" % "junit-jupiter-api" % "5.8.2" % "test", - "org.junit.jupiter" % "junit-jupiter-engine" % "5.8.2" % "test", - "org.junit.jupiter" % "junit-jupiter-params" % "5.8.2" % "test", - "net.aichler" % "jupiter-interface" % "0.11.1" % "test" - ), - Test / testOptions += Tests.Argument(TestFrameworks.JUnit, "-v", "-a") - ) - // TODO to enable unit doc for kernelSpark. 
+// kernelSpark module has been replaced by delta-spark-v2 above lazy val unity = (project in file("unity")) .enablePlugins(ScalafmtPlugin) @@ -1467,7 +1595,7 @@ lazy val compatibility = (project in file("connectors/oss-compatibility-tests")) */ lazy val goldenTables = (project in file("connectors/golden-tables")) - .dependsOn(spark % "test") // depends on delta-spark + .dependsOn(`delta-spark-v1` % "test") // depends on delta-spark v1 for test utilities .disablePlugins(JavaFormatterPlugin, ScalafmtPlugin) .settings( name := "golden-tables", @@ -1658,7 +1786,7 @@ val createTargetClassesDir = taskKey[Unit]("create target classes dir") // Don't use these groups for any other projects lazy val sparkGroup = project - .aggregate(spark, kernelSpark, contribs, storage, storageS3DynamoDB, sharing, hudi) + .aggregate(spark, `delta-spark-v1`, `delta-spark-v1-shaded`, `delta-spark-v2`, `delta-spark-shaded`, contribs, storage, storageS3DynamoDB, sharing, hudi) .settings( // crossScalaVersions must be set to Nil on the aggregating project crossScalaVersions := Nil, diff --git a/spark-shaded/src/main/java/org/apache/spark/sql/delta/catalog/DeltaCatalog.java b/spark-shaded/src/main/java/org/apache/spark/sql/delta/catalog/DeltaCatalog.java new file mode 100644 index 00000000000..5a9f9b58f6d --- /dev/null +++ b/spark-shaded/src/main/java/org/apache/spark/sql/delta/catalog/DeltaCatalog.java @@ -0,0 +1,28 @@ +/* + * Copyright (2021) The Delta Lake Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.delta.catalog; + +/** + * Delta Catalog implementation that can delegate to both V1 and V2 implementations. 
+ * This class sits in delta-spark-shaded module and can access: + * - V1: org.apache.spark.sql.delta.* (full version with DeltaLog) + * - V2: io.delta.kernel.spark.* + */ +public class DeltaCatalog extends AbstractDeltaCatalog { + +} + diff --git a/spark/src/main/scala/org/apache/spark/sql/delta/DeltaAnalysis.scala b/spark/src/main/scala/org/apache/spark/sql/delta/DeltaAnalysis.scala index 05ea7bb0e08..08b8e9ea84b 100644 --- a/spark/src/main/scala/org/apache/spark/sql/delta/DeltaAnalysis.scala +++ b/spark/src/main/scala/org/apache/spark/sql/delta/DeltaAnalysis.scala @@ -26,7 +26,7 @@ import org.apache.spark.sql.catalyst.TimeTravel import org.apache.spark.sql.delta.DataFrameUtils import org.apache.spark.sql.delta.DeltaErrors.{TemporallyUnstableInputException, TimestampEarlierThanCommitRetentionException} import org.apache.spark.sql.delta.actions.TableFeatureProtocolUtils -import org.apache.spark.sql.delta.catalog.DeltaCatalog +import org.apache.spark.sql.delta.catalog.LegacyDeltaCatalog import org.apache.spark.sql.delta.catalog.DeltaTableV2 import org.apache.spark.sql.delta.catalog.IcebergTablePlaceHolder import org.apache.spark.sql.delta.commands._ @@ -245,7 +245,7 @@ class DeltaAnalysis(session: SparkSession) case _ => protocol } - val newDeltaCatalog = new DeltaCatalog() + val newDeltaCatalog = new LegacyDeltaCatalog() val existingTableOpt = newDeltaCatalog.getExistingTableIfExists(catalogTableTarget.identifier) val newTable = newDeltaCatalog .verifyTableAndSolidify( diff --git a/spark/src/main/scala/org/apache/spark/sql/delta/DeltaErrors.scala b/spark/src/main/scala/org/apache/spark/sql/delta/DeltaErrors.scala index 371639d6b6f..fe838c1b341 100644 --- a/spark/src/main/scala/org/apache/spark/sql/delta/DeltaErrors.scala +++ b/spark/src/main/scala/org/apache/spark/sql/delta/DeltaErrors.scala @@ -25,7 +25,7 @@ import scala.collection.JavaConverters._ import org.apache.spark.sql.delta.skipping.clustering.temp.{ClusterBySpec} import org.apache.spark.sql.delta.actions.{CommitInfo, Metadata, Protocol, TableFeatureProtocolUtils} -import org.apache.spark.sql.delta.catalog.DeltaCatalog +import org.apache.spark.sql.delta.catalog.LegacyDeltaCatalog import org.apache.spark.sql.delta.commands.{AlterTableDropFeatureDeltaCommand, DeltaGenerateCommand} import org.apache.spark.sql.delta.constraints.Constraints import org.apache.spark.sql.delta.hooks.AutoCompactType @@ -1882,9 +1882,9 @@ trait DeltaErrorsBase new DeltaAnalysisException( errorClass = "DELTA_CONFIGURE_SPARK_SESSION_WITH_EXTENSION_AND_CATALOG", messageParameters = Array(classOf[DeltaSparkSessionExtension].getName, - catalogImplConfig, classOf[DeltaCatalog].getName, + catalogImplConfig, classOf[LegacyDeltaCatalog].getName, classOf[DeltaSparkSessionExtension].getName, - catalogImplConfig, classOf[DeltaCatalog].getName), + catalogImplConfig, classOf[LegacyDeltaCatalog].getName), cause = originalException) } diff --git a/spark/src/main/scala/org/apache/spark/sql/delta/catalog/DeltaCatalog.scala b/spark/src/main/scala/org/apache/spark/sql/delta/catalog/DeltaCatalog.scala index 73eb771c833..b7a570ea87d 100644 --- a/spark/src/main/scala/org/apache/spark/sql/delta/catalog/DeltaCatalog.scala +++ b/spark/src/main/scala/org/apache/spark/sql/delta/catalog/DeltaCatalog.scala @@ -67,7 +67,9 @@ import org.apache.spark.sql.types.{IntegerType, StructField, StructType} * A Catalog extension which can properly handle the interaction between the HiveMetaStore and * Delta tables. It delegates all operations DataSources other than Delta to the SparkCatalog. 
*/ -class DeltaCatalog extends DelegatingCatalogExtension +class LegacyDeltaCatalog extends AbstractDeltaCatalog + +class AbstractDeltaCatalog extends DelegatingCatalogExtension with StagingTableCatalog with SupportsPathIdentifier with DeltaLogging { @@ -933,7 +935,7 @@ class DeltaCatalog extends DelegatingCatalogExtension * A trait for handling table access through delta.`/some/path`. This is a stop-gap solution * until PathIdentifiers are implemented in Apache Spark. */ -trait SupportsPathIdentifier extends TableCatalog { self: DeltaCatalog => +trait SupportsPathIdentifier extends TableCatalog { self: AbstractDeltaCatalog => private def supportSQLOnFile: Boolean = spark.sessionState.conf.runSQLonFile From 9560950342569f458f04dd5e305ef1f47509c144 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Thu, 9 Oct 2025 12:58:23 -0700 Subject: [PATCH 02/53] save --- DELTALOG_ISOLATION.md | 230 ------------------------------------------ MODULE_STRUCTURE.md | 145 -------------------------- PUBLISH_STRUCTURE.md | 181 --------------------------------- SHADED_DECISION.md | 148 --------------------------- 4 files changed, 704 deletions(-) delete mode 100644 DELTALOG_ISOLATION.md delete mode 100644 MODULE_STRUCTURE.md delete mode 100644 PUBLISH_STRUCTURE.md delete mode 100644 SHADED_DECISION.md diff --git a/DELTALOG_ISOLATION.md b/DELTALOG_ISOLATION.md deleted file mode 100644 index dcce7e21f33..00000000000 --- a/DELTALOG_ISOLATION.md +++ /dev/null @@ -1,230 +0,0 @@ -# DeltaLog 隔离架构 - -## ✅ 实现完成! - -成功实现了 delta-spark-v2 不依赖 DeltaLog 的架构。 - -## 架构设计 - -``` -delta-spark-v1 (7.4M) - ├─ 包含所有 V1 类,包括 DeltaLog - │ - ↓ 重新打包(排除 DeltaLog) - │ -delta-spark-v1-shaded (7.1M) - ├─ V1 的所有类,但排除: - │ • DeltaLog - │ • Snapshot - │ • OptimisticTransaction - │ - ↓ 依赖 - │ -delta-spark-v2 (34K) - ├─ Kernel-based connector - ├─ ✅ 编译时只能访问 v1-shaded - ├─ ✅ 无法访问 DeltaLog 类 - │ - ↓ 组合 - │ -delta-spark (final, 7.5M) - └─ 包含: - • V1 完整版(含 DeltaLog)← 从 delta-spark-v1 重新添加 - • V2 所有类 - • 可选的 delegation 层 -``` - -## 验证结果 - -### 1. delta-spark-v1-shaded 成功排除 DeltaLog - -```bash -$ jar -tf spark-v1-shaded/target/scala-2.12/delta-spark-v1-shaded_2.12-3.4.0-SNAPSHOT.jar | \ - grep -E "DeltaLog\.class|Snapshot\.class|OptimisticTransaction\.class" -# 返回空 ✓ - 成功排除 -``` - -### 2. delta-spark-v2 成功编译(无 DeltaLog) - -```bash -$ ./build/sbt "delta-spark-v2/compile" -[success] ✓ - 编译成功,证明 v2 不需要 DeltaLog -``` - -### 3. 
最终 jar 包含完整 V1(含 DeltaLog) - -```bash -$ jar -tf spark-tests/target/scala-2.12/delta-spark_2.12-3.4.0-SNAPSHOT.jar | \ - grep "DeltaLog\.class" -org/apache/spark/sql/delta/DeltaLog.class ✓ - DeltaLog 存在 -``` - -## JAR 大小对比 - -| 模块 | 大小 | 内容 | -|------|------|------| -| delta-spark-v1 | 7.4M | V1 完整版(含 DeltaLog) | -| delta-spark-v1-shaded | 7.1M | V1 无 DeltaLog(-300KB) | -| delta-spark-v2 | 34K | Kernel connector | -| **delta-spark (final)** | **7.5M** | **V1完整 + V2** | - -## 排除的类 - -delta-spark-v1-shaded 排除了以下类: - -```scala -// build.sbt 配置 -Compile / packageBin / mappings := { - val v1Mappings = (`delta-spark-v1` / Compile / packageBin / mappings).value - - v1Mappings.filterNot { case (file, path) => - path.contains("org/apache/spark/sql/delta/DeltaLog") || - path.contains("org/apache/spark/sql/delta/Snapshot") || - path.contains("org/apache/spark/sql/delta/OptimisticTransaction") - } -} -``` - -**排除的具体类**: -- `org.apache.spark.sql.delta.DeltaLog` - 核心 Delta 日志类 -- `org.apache.spark.sql.delta.Snapshot` - 表快照类 -- `org.apache.spark.sql.delta.OptimisticTransaction` - 事务类 - -**未排除的类**(不直接依赖 DeltaLog): -- `CapturedSnapshot` - 快照包装类 -- `DummySnapshot` - 测试用假快照 -- `SnapshotOverwriteOperationMetrics` - 指标类 - -## 工作原理 - -### 编译时(delta-spark-v2) - -``` -delta-spark-v2 - → 依赖 delta-spark-v1-shaded - → 只能看到 V1 的部分类(无 DeltaLog) - → 编译成功 = 证明 v2 不需要 DeltaLog ✓ -``` - -### 运行时(用户使用) - -``` -delta-spark.jar - → 包含 V1 完整版(含 DeltaLog) - → 包含 V2 所有类 - → 用户可以使用所有功能 ✓ -``` - -## 依赖关系 - -```scala -// Module 1: delta-spark-v1 (完整版) -lazy val `delta-spark-v1` = (project in file("spark")) - .settings( - // 编译所有 V1 源码,包括 DeltaLog - ) - -// Module 2: delta-spark-v1-shaded (排除 DeltaLog) -lazy val `delta-spark-v1-shaded` = (project in file("spark-v1-shaded")) - .dependsOn(`delta-spark-v1`) - .settings( - // 重新打包 v1,排除 DeltaLog 相关类 - Compile / packageBin / mappings := { /* filter logic */ } - ) - -// Module 3: delta-spark-v2 (依赖 v1-shaded) -lazy val `delta-spark-v2` = (project in file("kernel-spark")) - .dependsOn(`delta-spark-v1-shaded`) // ← 只依赖 shaded 版本 - .settings(/* ... */) - -// Module 4: delta-spark-shaded (可选 delegation) -lazy val `delta-spark-shaded` = (project in file("spark-shaded")) - .dependsOn(`delta-spark-v1`) // ← 完整版 v1 - .dependsOn(`delta-spark-v2`) - -// Module 5: delta-spark (最终发布) -lazy val spark = (project in file("spark-combined")) - .dependsOn(`delta-spark-shaded`) - .settings( - // 重新打包:完整 v1 + v2 - Compile / packageBin / mappings := { - val v1Full = (`delta-spark-v1` / Compile / packageBin / mappings).value // ← 完整版 - val v2 = (`delta-spark-v2` / Compile / packageBin / mappings).value - val shaded = (`delta-spark-shaded` / Compile / packageBin / mappings).value - v1Full ++ v2 ++ shaded - } - ) -``` - -## 关键点 - -### ✅ 隔离成功 - -- **编译时隔离**:v2 无法访问 DeltaLog -- **运行时完整**:用户可以使用所有 V1 功能 - -### 🎯 测试策略 - -如果 delta-spark-v2 的测试全部通过,证明: -- v2 的所有代码路径都不需要加载 DeltaLog 类 -- v2 真正实现了与 V1 核心的解耦 - -### 🔄 工作流程 - -```bash -# 1. 编译 v1(完整版) -sbt delta-spark-v1/compile - -# 2. 打包 v1-shaded(排除 DeltaLog) -sbt delta-spark-v1-shaded/packageBin -# → 生成 7.1M jar(比 v1 少 300KB) - -# 3. 编译 v2(依赖 v1-shaded) -sbt delta-spark-v2/compile -# → 编译成功 = v2 不需要 DeltaLog ✓ - -# 4. 
打包最终 jar(重新加入完整 v1) -sbt spark/packageBin -# → 生成 7.5M jar(包含完整 v1 + v2) -``` - -## 未来扩展 - -### 添加更多排除类 - -如果需要排除更多类: - -```scala -v1Mappings.filterNot { case (file, path) => - path.contains("org/apache/spark/sql/delta/DeltaLog") || - path.contains("org/apache/spark/sql/delta/Snapshot") || - path.contains("org/apache/spark/sql/delta/OptimisticTransaction") || - path.contains("org/apache/spark/sql/delta/SomeOtherClass") // ← 添加更多 -} -``` - -### 测试验证 - -运行 v2 测试确保不依赖 DeltaLog: - -```bash -sbt "delta-spark-v2/test" -# 如果测试通过 → 证明 v2 完全独立于 DeltaLog -``` - -## 总结 - -✅ **可以!** 这个架构完全可行并且已经实现: - -1. **delta-spark-v1-shaded** 排除 DeltaLog(通过 packageBin mapping 过滤) -2. **delta-spark-v2** 依赖 v1-shaded,编译成功(证明不需要 DeltaLog) -3. **delta-spark (final)** 重新打包完整 v1(含 DeltaLog)+ v2 -4. **零文件移动** - 所有源码保持原位 -5. **验证通过** - jar 文件分析确认架构正确 - -**用户体验**: -- 只需要依赖一个 `delta-spark.jar` -- jar 包含完整的 V1 和 V2 功能 -- V2 在内部确保了与 DeltaLog 的隔离 - - diff --git a/MODULE_STRUCTURE.md b/MODULE_STRUCTURE.md deleted file mode 100644 index d477455f315..00000000000 --- a/MODULE_STRUCTURE.md +++ /dev/null @@ -1,145 +0,0 @@ -# Delta Spark Module Structure - -## Overview - -The delta-spark codebase has been refactored into 5 SBT modules to support both v1 (current) and v2 (kernel-based) implementations, with **DeltaLog isolation**: - -``` -delta-spark-v1 (not published, full v1 with DeltaLog) - ↓ repackage (exclude DeltaLog) -delta-spark-v1-shaded (not published, v1 without DeltaLog) - ↓ -delta-spark-v2 (not published, depends on v1-shaded) - ↓ -delta-spark-shaded (not published, optional delegation) - ↓ -delta-spark (published jar, full v1 + v2) -``` - -## Module Details - -### 1. delta-spark-v1 -- **Directory**: `spark/` -- **Published**: No -- **Content**: Production code only (no tests) -- **Description**: Current delta-spark production code -- **Key Features**: - - All existing Delta Spark functionality - - Antlr parser generation - - Python file packaging - -### 2. delta-spark-v1-shaded -- **Directory**: `spark-v1-shaded/` (virtual, no source files) -- **Published**: No -- **Content**: Repackaged delta-spark-v1 JAR with DeltaLog classes excluded -- **Dependencies**: delta-spark-v1 -- **Description**: V1 without DeltaLog for v2 to depend on -- **Key Features**: - - Filters out `DeltaLog`, `Snapshot`, `OptimisticTransaction` classes - - Used to enforce v2 doesn't depend on DeltaLog at compile time - - ~300KB smaller than full v1 (7.1M vs 7.4M) - -### 3. delta-spark-v2 -- **Directory**: `kernel-spark/` -- **Published**: No -- **Content**: Kernel-based Spark implementation -- **Dependencies**: **delta-spark-v1-shaded** (no DeltaLog), kernelApi, kernelDefaults -- **Description**: New kernel-based Spark connector -- **Key Features**: - - DSv2 Catalog and Tables - - Kernel-specific unit tests - - **Cannot access DeltaLog at compile time** (enforced by v1-shaded dependency) - -### 4. delta-spark-shaded -- **Directory**: `spark-shaded/` -- **Published**: No -- **Content**: Delegation layer -- **Dependencies**: **delta-spark-v1** (full version), delta-spark-v2 -- **Description**: Contains delegation code that routes to v1 or v2 -- **Key Features**: - - DeltaCatalog (delegates to V1 or V2) - - DeltaSparkSessionExtension (registers both) - -### 5. 
delta-spark (final module) -- **Directory**: `spark-combined/` -- **Published**: Yes (as `delta-spark.jar`) -- **Content**: - - No production code (packages v1+v2+shaded) - - All test code from `spark/src/test/` -- **Dependencies**: delta-spark-shaded, delta-spark-v1 (test utils) -- **Description**: Final published artifact combining all modules -- **Key Features**: - - Tests can access both v1 and v2 implementations - - Published jar contains complete v1+v2+shaded code - -## File Structure - -### No Files Moved! -- Production code remains in `spark/src/main/` -- Test code remains in `spark/src/test/` -- Kernel code remains in `kernel-spark/src/` - -### New Directories Created -- `spark-combined/` - final combined module (v1+v2+tests) -- `spark-shaded/` - delegation code -- `spark-v1-shaded/` - virtual module (no source files, only build configuration) - -## SBT Commands - -```bash -# Compile individual modules -sbt delta-spark-v1/compile -sbt delta-spark-v2/compile -sbt delta-spark-shaded/compile -sbt spark/compile - -# Run tests -sbt spark/test - -# Publish -sbt spark/publishLocal -``` - -## Key Implementation Details - -### Test Source Configuration -The `spark` module uses `unmanagedSourceDirectories` to point to original test locations: -```scala -Test / unmanagedSourceDirectories ++= Seq( - baseDirectory.value.getParentFile / "spark" / "src" / "test" / "scala", - baseDirectory.value.getParentFile / "spark" / "src" / "test" / "java" -) -``` - -### Package Assembly -The final `spark` module packages all classes: -```scala -Compile / packageBin / mappings := { - val v1 = (`delta-spark-v1` / Compile / packageBin / mappings).value - val v2 = (`delta-spark-v2` / Compile / packageBin / mappings).value - val shaded = (`delta-spark-shaded` / Compile / packageBin / mappings).value - v1 ++ v2 ++ shaded -} -``` - -## Benefits - -1. **Modular**: Clear separation between v1, v2, and delegation layers -2. **No File Movement**: All code stays in original locations -3. **Backward Compatible**: Final jar contains everything -4. **Testable**: Tests can verify both v1 and v2 implementations -5. **Not Published**: Internal modules (v1, v2, shaded) aren't published -6. **Clean Dependencies**: Avoids circular dependencies - -## Migration Notes - -### Dependency Updates -Modules that previously depended on `spark` should now depend on: -- `delta-spark-v1` - if only v1 functionality needed -- `delta-spark-shaded` - if both v1 and v2 needed -- `spark` - if test utilities needed - -### Updated Dependencies -- `kernelDefaults` → depends on `delta-spark-v1 % "test->test"` -- `goldenTables` → depends on `delta-spark-v1 % "test"` - diff --git a/PUBLISH_STRUCTURE.md b/PUBLISH_STRUCTURE.md deleted file mode 100644 index fca89b8e953..00000000000 --- a/PUBLISH_STRUCTURE.md +++ /dev/null @@ -1,181 +0,0 @@ -# Delta Spark 发布结构说明 - -## publishM2 会发布哪些 JAR? - -### 发布的模块(使用 releaseSettings) - -只有 **1 个** delta-spark 相关的 jar 会被发布: - -#### 1. delta-spark.jar -- **SBT 模块**: `spark` -- **Maven Artifact**: `delta-spark_2.12` (或 `delta-spark_2.13`) -- **内容**: - - delta-spark-v1 的所有 classes(来自 `spark/src/main/`) - - delta-spark-v2 的所有 classes(来自 `kernel-spark/src/main/`) - - delta-spark-shaded 的所有 classes(来自 `spark-shaded/src/main/`) - - Python 文件(从 `python/` 目录) -- **发布配置**: `releaseSettings` → `publishArtifact := true` - -### 不发布的模块(使用 skipReleaseSettings) - -以下 3 个模块 **不会** 单独发布 jar: - -#### 1. 
delta-spark-v1 -- **配置**: `skipReleaseSettings` → `publishArtifact := false` -- **原因**: 内部模块,其 classes 会被打包到最终的 `delta-spark.jar` 中 - -#### 2. delta-spark-v2 -- **配置**: `skipReleaseSettings` → `publishArtifact := false` -- **原因**: 内部模块,其 classes 会被打包到最终的 `delta-spark.jar` 中 - -#### 3. delta-spark-shaded -- **配置**: `skipReleaseSettings` → `publishArtifact := false` -- **原因**: 内部模块,其 classes 会被打包到最终的 `delta-spark.jar` 中 - -## delta-spark.jar 包含的内容 - -最终发布的 `delta-spark.jar` 通过以下配置组合所有内容: - -```scala -Compile / packageBin / mappings := { - val v1 = (`delta-spark-v1` / Compile / packageBin / mappings).value - val v2 = (`delta-spark-v2` / Compile / packageBin / mappings).value - val shaded = (`delta-spark-shaded` / Compile / packageBin / mappings).value - v1 ++ v2 ++ shaded -} -``` - -### 详细内容列表 - -#### 来自 delta-spark-v1 (`spark/src/main/`) -- `org.apache.spark.sql.delta.*` - Delta Lake 核心功能 -- `io.delta.sql.*` - Delta SQL 扩展 -- `io.delta.tables.*` - Delta Tables API -- `io.delta.dynamodbcommitcoordinator.*` - DynamoDB 协调器 -- ANTLR 生成的 parser 类 -- Python 文件(`delta/*.py`) -- `META-INF/services/*` - 服务注册文件 - -#### 来自 delta-spark-v2 (`kernel-spark/src/main/`) -- `io.delta.kernel.spark.*` - Kernel-based Spark connector - - `catalog.SparkTable` - DSv2 Table 实现 - - `read.*` - 读取相关类(Scan, Batch, PartitionReader) - - `utils.*` - 工具类(Schema, Expression, Serialization) -- `io.delta.sql.DeltaSparkSessionExtension` - V2 扩展 -- `org.apache.spark.sql.delta.catalog.DeltaCatalog` - V2 Catalog - -#### 来自 delta-spark-shaded (`spark-shaded/src/main/`) -- Delegation 代码(如果添加) - - 例如:统一的 `DeltaCatalog` 入口,根据配置选择 V1 或 V2 - - 例如:`DeltaSparkSessionExtension` 可以同时注册 V1 和 V2 - -## 发布命令 - -```bash -# 发布到本地 Maven 仓库 -sbt publishM2 - -# 只发布 delta-spark 模块 -sbt spark/publishM2 - -# 发布到 Sonatype -sbt publishSigned -``` - -## Maven 依赖示例 - -用户只需要依赖一个 jar: - -```xml - - io.delta - delta-spark_2.12 - 3.4.0-SNAPSHOT - -``` - -这一个 jar 就包含了 V1、V2 和 delegation 层的所有功能。 - -## 其他会发布的 Delta 相关模块 - -除了 `delta-spark.jar`,以下模块也会被 publishM2: - -1. **delta-kernel-api.jar** - Kernel API -2. **delta-kernel-defaults.jar** - Kernel Defaults 实现 -3. **delta-storage.jar** - Storage 层 -4. **delta-storage-s3-dynamodb.jar** - S3/DynamoDB 存储 -5. **delta-iceberg.jar** - Iceberg 集成 -6. **delta-hudi.jar** - Hudi 集成 -7. **delta-sharing-spark.jar** - Delta Sharing -8. **delta-contribs.jar** - 贡献模块 -9. **delta-connect-*.jar** - Delta Connect 模块 -10. **delta-standalone*.jar** - Standalone 连接器 -11. **delta-hive*.jar** - Hive 连接器 -12. **delta-flink.jar** - Flink 连接器 - -但这些都是独立的 jar,与 `delta-spark.jar` 分开发布。 - -## 总结 - -**回答你的问题**: - -1. **publishM2 会生成几个 delta-spark jar?** - - 只有 **1 个**:`delta-spark_2.12-3.4.0-SNAPSHOT.jar` (约 7.5MB) - - 位置:`spark-combined/target/scala-2.12/` - -2. **delta-spark jar 包含哪些内容?** - - **来自 delta-spark-v1** (约 7.4MB): - ``` - org/apache/spark/sql/delta/* - Delta Lake 核心 - io/delta/sql/* - SQL 扩展 - io/delta/tables/* - Tables API - io/delta/dynamodbcommitcoordinator/* - DynamoDB - delta/*.py - Python 文件 - META-INF/services/* - 服务注册 - ``` - - **来自 delta-spark-v2** (约 34KB): - ``` - io/delta/kernel/spark/catalog/* - DSv2 Catalog - io/delta/kernel/spark/read/* - 读取实现 - io/delta/kernel/spark/utils/* - 工具类 - io/delta/sql/DeltaSparkSessionExtension - V2 扩展 - org/apache/spark/sql/delta/catalog/DeltaCatalog - V2 Catalog - ``` - - **来自 delta-spark-shaded** (约 288B): - ``` - (delegation 代码,如果添加) - ``` - -3. 
**v1, v2, shaded 三个内部模块会单独发布吗?** - - **不会**,它们有 `skipReleaseSettings` 配置 - - 它们只是内部模块,用于组织代码 - - 所有代码最终都打包进同一个 `delta-spark.jar` - -## 验证 - -生成的 jar 文件: -```bash -# 内部模块(不发布) -spark/target/scala-2.12/delta-spark-v1_2.12-3.4.0-SNAPSHOT.jar # 7.4M -kernel-spark/target/scala-2.12/delta-spark-v2_2.12-3.4.0-SNAPSHOT.jar # 34K -spark-shaded/target/scala-2.12/delta-spark-shaded_2.12-3.4.0-SNAPSHOT.jar # 288B - -# 最终发布的 jar(组合了上面三个) -spark-combined/target/scala-2.12/delta-spark_2.12-3.4.0-SNAPSHOT.jar # 7.5M -``` - -关键类验证: -```bash -# V1 类 -org.apache.spark.sql.delta.DeltaLog -org.apache.spark.sql.delta.catalog.DeltaCatalog - -# V2 类 -io.delta.kernel.spark.table.SparkTable -io.delta.kernel.spark.read.SparkScan -io.delta.sql.DeltaSparkSessionExtension -``` - diff --git a/SHADED_DECISION.md b/SHADED_DECISION.md deleted file mode 100644 index 41ab81df19f..00000000000 --- a/SHADED_DECISION.md +++ /dev/null @@ -1,148 +0,0 @@ -# delta-spark-shaded 决策分析 - -## TL;DR - -**推荐:方案 C - 保留空模块** -- 成本低(只是一个空目录) -- 保持架构灵活性 -- 未来如需 delegation 可随时添加 - ---- - -## 当前状态 - -delta-spark-shaded 当前为空: -- 源码:`spark-shaded/src/main/scala/` 空目录 -- Jar:288 字节(只有 MANIFEST) -- 依赖:v1 + v2 - -## 是否必要? - -### ❌ 不必要的情况 - -1. **V1 和 V2 类名不冲突** - - V1: `org.apache.spark.sql.delta.catalog.DeltaCatalog` - - V2: `io.delta.kernel.spark.table.SparkTable` - - 不同的包名和类名,可以共存 - -2. **用户可以直接选择实现** - ```scala - // 方式1: 用 V1 Catalog - spark.conf.set("spark.sql.catalog.spark_catalog", - "org.apache.spark.sql.delta.catalog.DeltaCatalog") - - // 方式2: 用 V2 Table - spark.read.format("io.delta.kernel.spark").load(path) - ``` - -3. **不需要同时启用** - - 如果 V1 和 V2 是互斥的,不需要 delegation - -### ✅ 必要的情况 - -1. **需要统一入口点** - ```scala - // 统一的 DeltaCatalog,内部路由到 V1 或 V2 - class DeltaCatalog extends ... { - def loadTable(...) = { - if (useKernel) v2.SparkTable(...) - else v1.DeltaTable(...) - } - } - ``` - -2. **需要同时注册 V1 和 V2** - ```scala - class DeltaSparkSessionExtension { - override def apply(extensions: SparkSessionExtensions) = { - registerV1Rules(extensions) - registerV2Rules(extensions) - } - } - ``` - -3. **需要平滑迁移** - - 逐步从 V1 迁移到 V2 - - A/B 测试不同实现 - - 按功能分流(读用 V2,写用 V1) - -4. **需要 Shading(名字冲突)** - - 如果 V1 和 V2 有同名类 - - 使用 sbt-assembly shading 规则 - -## 三种方案对比 - -| 方案 | 优点 | 缺点 | 适用场景 | -|------|------|------|----------| -| **A. 保留并实现** | • 统一入口
• 灵活切换
• 平滑迁移 | • 额外代码
• 维护成本 | 需要 delegation | -| **B. 完全删除** | • 代码最简
• 依赖清晰 | • 未来加回成本高
• 缺少灵活性 | 确定不需要 delegation | -| **C. 保留空模块** | • 架构预留
• 无额外成本
• 随时可加 | • 多一个模块 | **推荐:暂不确定** | - -## 推荐方案:C(保留空模块) - -### 理由 - -1. **成本极低** - - 只是一个空目录 + 288B jar - - 不影响编译和发布 - -2. **架构清晰** - ``` - v1 (prod) ──┐ - ├──> shaded (delegation) ──> spark (final jar) - v2 (prod) ──┘ - ``` - -3. **未来灵活** - - 如果需要 delegation,直接添加代码 - - 不需要重构 build.sbt - -### 何时添加代码到 shaded? - -**触发条件**: -- [ ] 需要根据配置自动选择 V1/V2 -- [ ] 需要同时启用 V1 和 V2 -- [ ] 发现类名冲突 -- [ ] 需要 A/B 测试或灰度发布 - -**暂时不需要**: -- 用户可以显式选择 V1 或 V2 -- 两个实现可以独立使用 - -## 如何删除 delta-spark-shaded(如果确定不需要) - -```scala -// build.sbt 修改 - -// 删除 delta-spark-shaded 模块定义 -// lazy val `delta-spark-shaded` = ... - -// spark 模块直接依赖 v1 和 v2 -lazy val spark = (project in file("spark-tests")) - .dependsOn(`delta-spark-v1`) - .dependsOn(`delta-spark-v2`) - .settings( - Compile / packageBin / mappings := { - val v1 = (`delta-spark-v1` / Compile / packageBin / mappings).value - val v2 = (`delta-spark-v2` / Compile / packageBin / mappings).value - v1 ++ v2 // 移除 shaded - } - ) -``` - -```bash -# 删除目录 -rm -rf spark-shaded/ -``` - -## 决策 - -✅ **暂时保留空的 delta-spark-shaded** - -原因: -- 成本可忽略 -- 保持架构扩展性 -- 符合原始设计意图 -- 未来如需 delegation 可随时添加 - - From 8b9fda19a84fe7faba5c8389dd12827c05b0562d Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Thu, 9 Oct 2025 13:08:34 -0700 Subject: [PATCH 03/53] save --- build.sbt | 9 +++++- .../sql/DeltaSparkSessionExtension.scala | 29 +++++++++++++++++++ .../sql/DeltaSparkSessionExtension.scala | 3 +- .../apache/spark/sql/delta/DeltaErrors.scala | 17 +++++------ 4 files changed, 46 insertions(+), 12 deletions(-) create mode 100644 spark-shaded/src/main/scala/io/delta/sql/DeltaSparkSessionExtension.scala diff --git a/build.sbt b/build.sbt index 5b97256bdd4..2a8c24d0fc4 100644 --- a/build.sbt +++ b/build.sbt @@ -596,7 +596,14 @@ lazy val spark = (project in file("spark-combined")) val v2 = (`delta-spark-v2` / Compile / packageBin / mappings).value val shaded = (`delta-spark-shaded` / Compile / packageBin / mappings).value val storageClasses = (storage / Compile / packageBin / mappings).value // Add storage classes - v1Full ++ v2 ++ shaded ++ storageClasses + + // Merge all mappings, shaded classes override v1 classes if there are conflicts + // This allows delegation classes in shaded (DeltaCatalog, DeltaSparkSessionExtension) + // to replace v1 originals + val allMappings = v1Full ++ v2 ++ storageClasses ++ shaded + + // Remove duplicates by path (keep the last occurrence, which is from shaded) + allMappings.groupBy(_._2).map(_._2.last).toSeq }, // Test sources point to original spark/src/test/ (no file movement) diff --git a/spark-shaded/src/main/scala/io/delta/sql/DeltaSparkSessionExtension.scala b/spark-shaded/src/main/scala/io/delta/sql/DeltaSparkSessionExtension.scala new file mode 100644 index 00000000000..a7a1b39af23 --- /dev/null +++ b/spark-shaded/src/main/scala/io/delta/sql/DeltaSparkSessionExtension.scala @@ -0,0 +1,29 @@ +/* + * Copyright (2021) The Delta Lake Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.delta.sql + +import io.delta.sql.AbstractSparkSessionExtension + +/** + * Delta Spark Session Extension that can register both V1 and V2 implementations. + * This class sits in delta-spark-shaded module and can access: + * - V1: org.apache.spark.sql.delta.* (full version with DeltaLog) + * - V2: io.delta.kernel.spark.* + */ +class DeltaSparkSessionExtension extends AbstractSparkSessionExtension { +} + diff --git a/spark/src/main/scala/io/delta/sql/DeltaSparkSessionExtension.scala b/spark/src/main/scala/io/delta/sql/DeltaSparkSessionExtension.scala index 0b7dedc1196..2e20246266f 100644 --- a/spark/src/main/scala/io/delta/sql/DeltaSparkSessionExtension.scala +++ b/spark/src/main/scala/io/delta/sql/DeltaSparkSessionExtension.scala @@ -80,7 +80,8 @@ import org.apache.spark.sql.internal.SQLConf * * @since 0.4.0 */ -class DeltaSparkSessionExtension extends (SparkSessionExtensions => Unit) { +class LegacyDeltaSparkSessionExtension extends AbstractSparkSessionExtension +class AbstractSparkSessionExtension extends (SparkSessionExtensions => Unit) { override def apply(extensions: SparkSessionExtensions): Unit = { extensions.injectParser { (_, parser) => new DeltaSqlParser(parser) diff --git a/spark/src/main/scala/org/apache/spark/sql/delta/DeltaErrors.scala b/spark/src/main/scala/org/apache/spark/sql/delta/DeltaErrors.scala index fe838c1b341..3ed1e941bf7 100644 --- a/spark/src/main/scala/org/apache/spark/sql/delta/DeltaErrors.scala +++ b/spark/src/main/scala/org/apache/spark/sql/delta/DeltaErrors.scala @@ -20,12 +20,10 @@ package org.apache.spark.sql.delta import java.io.{FileNotFoundException, IOException} import java.nio.file.FileAlreadyExistsException import java.util.{ConcurrentModificationException, UUID} - import scala.collection.JavaConverters._ - -import org.apache.spark.sql.delta.skipping.clustering.temp.{ClusterBySpec} +import org.apache.spark.sql.delta.skipping.clustering.temp.ClusterBySpec import org.apache.spark.sql.delta.actions.{CommitInfo, Metadata, Protocol, TableFeatureProtocolUtils} -import org.apache.spark.sql.delta.catalog.LegacyDeltaCatalog +import org.apache.spark.sql.delta.catalog.{AbstractDeltaCatalog, LegacyDeltaCatalog} import org.apache.spark.sql.delta.commands.{AlterTableDropFeatureDeltaCommand, DeltaGenerateCommand} import org.apache.spark.sql.delta.constraints.Constraints import org.apache.spark.sql.delta.hooks.AutoCompactType @@ -37,9 +35,8 @@ import org.apache.spark.sql.delta.redirect.RedirectState import org.apache.spark.sql.delta.schema.{DeltaInvariantViolationException, InvariantViolationException, SchemaUtils, UnsupportedDataTypeInfo} import org.apache.spark.sql.delta.sources.DeltaSQLConf import org.apache.spark.sql.delta.util.JsonUtils -import io.delta.sql.DeltaSparkSessionExtension +import io.delta.sql.AbstractSparkSessionExtension import org.apache.hadoop.fs.{ChecksumException, Path} - import org.apache.spark.{SparkConf, SparkEnv, SparkException} import org.apache.spark.sql.{AnalysisException, SparkSession} import org.apache.spark.sql.catalyst.TableIdentifier @@ -1881,10 +1878,10 @@ trait DeltaErrorsBase val catalogImplConfig = SQLConf.V2_SESSION_CATALOG_IMPLEMENTATION.key new DeltaAnalysisException( errorClass = "DELTA_CONFIGURE_SPARK_SESSION_WITH_EXTENSION_AND_CATALOG", - messageParameters = Array(classOf[DeltaSparkSessionExtension].getName, - catalogImplConfig, classOf[LegacyDeltaCatalog].getName, - classOf[DeltaSparkSessionExtension].getName, - catalogImplConfig, classOf[LegacyDeltaCatalog].getName), + messageParameters = 
Array(classOf[AbstractSparkSessionExtension].getName, + catalogImplConfig, classOf[AbstractDeltaCatalog].getName, + classOf[AbstractSparkSessionExtension].getName, + catalogImplConfig, classOf[AbstractDeltaCatalog].getName), cause = originalException) } From 4f45983d6d4bbcc826a7536b6afa8fa70014987b Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Thu, 9 Oct 2025 13:21:01 -0700 Subject: [PATCH 04/53] save --- build.sbt | 27 +++++++++++++++++-- .../delta/kernel/spark/read/SparkBatch.java | 2 +- 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/build.sbt b/build.sbt index 2a8c24d0fc4..d9f7f39f51a 100644 --- a/build.sbt +++ b/build.sbt @@ -574,9 +574,12 @@ lazy val `delta-spark-shaded` = (project in file("spark-shaded")) // Module 5: delta-spark (final published module - combined v1+v2+shaded) // ============================================================ lazy val spark = (project in file("spark-combined")) - .dependsOn(`delta-spark-shaded`) - .dependsOn(`delta-spark-v1` % "test->test") + .dependsOn(`delta-spark-v1`) // Direct dependency on v1 (full version with DeltaLog) + .dependsOn(`delta-spark-v2`) // Direct dependency on v2 + .dependsOn(`delta-spark-v1` % "test->test") // Test utilities from v1 .dependsOn(storage) // Explicit dependency on storage + // Note: We don't .dependsOn delta-spark-shaded to avoid resolution issues, + // but we include its classes in packageBin / mappings below .settings ( name := "delta-spark", commonSettings, @@ -585,6 +588,26 @@ lazy val spark = (project in file("spark-combined")) releaseSettings, // Published as delta-spark.jar crossSparkSettings(), + // Remove internal module dependencies from published pom.xml + // Users should only depend on delta-spark jar, not internal modules + pomPostProcess := { node => + import scala.xml._ + import scala.xml.transform._ + new RuleTransformer(new RewriteRule { + override def transform(n: Node): Seq[Node] = n match { + case e: Elem if e.label == "dependency" => + val artifactId = (e \ "artifactId").text + // Remove delta-spark-v1, delta-spark-v2, delta-spark-v1-shaded, delta-spark-shaded from pom + if (artifactId.startsWith("delta-spark-v") || artifactId == "delta-spark-shaded") { + Seq.empty + } else { + Seq(n) + } + case _ => Seq(n) + } + }).transform(node).head + }, + // No prod code in this module Compile / sources := Seq.empty, diff --git a/kernel-spark/src/main/java/io/delta/kernel/spark/read/SparkBatch.java b/kernel-spark/src/main/java/io/delta/kernel/spark/read/SparkBatch.java index 202f59085d3..1b51e13dff6 100644 --- a/kernel-spark/src/main/java/io/delta/kernel/spark/read/SparkBatch.java +++ b/kernel-spark/src/main/java/io/delta/kernel/spark/read/SparkBatch.java @@ -152,7 +152,7 @@ private long calculateMaxSplitBytes(SparkSession sparkSession) { int minPartitionNum = minPartitionNumOption.isDefined() ? 
((Number) minPartitionNumOption.get()).intValue() - : sparkSession.leafNodeDefaultParallelism(); + : sparkSession.sparkContext().defaultParallelism(); if (minPartitionNum <= 0) { minPartitionNum = 1; } From d6dc7ef5b871f12095b4bafa92e426c54388b886 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Thu, 9 Oct 2025 13:43:40 -0700 Subject: [PATCH 05/53] fix --- build.sbt | 12 +++++++++++- .../apache/spark/sql/delta/implicits/package.scala | 3 +-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/build.sbt b/build.sbt index d9f7f39f51a..8e6f1c2ce38 100644 --- a/build.sbt +++ b/build.sbt @@ -588,7 +588,7 @@ lazy val spark = (project in file("spark-combined")) releaseSettings, // Published as delta-spark.jar crossSparkSettings(), - // Remove internal module dependencies from published pom.xml + // Remove internal module dependencies from published pom.xml and ivy.xml // Users should only depend on delta-spark jar, not internal modules pomPostProcess := { node => import scala.xml._ @@ -608,6 +608,16 @@ lazy val spark = (project in file("spark-combined")) }).transform(node).head }, + // Also remove internal modules from ivy.xml + pomIncludeRepository := { _ => false }, // Don't include repositories in pom + + // Override projectDependencies to exclude internal modules + projectDependencies := { + projectDependencies.value.filterNot { dep => + dep.name.startsWith("delta-spark-v") || dep.name == "delta-spark-shaded" + } + }, + // No prod code in this module Compile / sources := Seq.empty, diff --git a/spark/src/main/scala/org/apache/spark/sql/delta/implicits/package.scala b/spark/src/main/scala/org/apache/spark/sql/delta/implicits/package.scala index 3548c7e766d..80408134395 100644 --- a/spark/src/main/scala/org/apache/spark/sql/delta/implicits/package.scala +++ b/spark/src/main/scala/org/apache/spark/sql/delta/implicits/package.scala @@ -17,12 +17,11 @@ package org.apache.spark.sql.delta import org.apache.spark.sql.delta.actions.AddFile -import org.apache.spark.sql.delta.implicits.RichSparkClasses import org.apache.spark.sql.delta.util.DeltaEncoders import org.apache.spark.sql.{DataFrame, Dataset, SparkSession} -package object implicits extends DeltaEncoders with RichSparkClasses { +package object implicits extends DeltaEncoders with implicits.RichSparkClasses { // Define a few implicit classes to provide the `toDF` method. These classes are not using generic // types to avoid touching Scala reflection. 
implicit class RichAddFileSeq(files: Seq[AddFile]) { From 3a6472b478750ed4565004b52ebc303f63af96c7 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Thu, 9 Oct 2025 16:53:04 -0700 Subject: [PATCH 06/53] fix --- build.sbt | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/build.sbt b/build.sbt index 8e6f1c2ce38..696837791d4 100644 --- a/build.sbt +++ b/build.sbt @@ -574,12 +574,9 @@ lazy val `delta-spark-shaded` = (project in file("spark-shaded")) // Module 5: delta-spark (final published module - combined v1+v2+shaded) // ============================================================ lazy val spark = (project in file("spark-combined")) - .dependsOn(`delta-spark-v1`) // Direct dependency on v1 (full version with DeltaLog) - .dependsOn(`delta-spark-v2`) // Direct dependency on v2 + .dependsOn(`delta-spark-shaded`) // Direct dependency on shaded (for delegation classes) .dependsOn(`delta-spark-v1` % "test->test") // Test utilities from v1 .dependsOn(storage) // Explicit dependency on storage - // Note: We don't .dependsOn delta-spark-shaded to avoid resolution issues, - // but we include its classes in packageBin / mappings below .settings ( name := "delta-spark", commonSettings, @@ -621,6 +618,15 @@ lazy val spark = (project in file("spark-combined")) // No prod code in this module Compile / sources := Seq.empty, + // Use test sources from original spark/ directory + Test / unmanagedSourceDirectories := Seq( + baseDirectory.value.getParentFile / "spark" / "src" / "test" / "scala", + baseDirectory.value.getParentFile / "spark" / "src" / "test" / "java" + ), + Test / unmanagedResourceDirectories := Seq( + baseDirectory.value.getParentFile / "spark" / "src" / "test" / "resources" + ), + // Package combined classes: FULL v1 (with DeltaLog) + v2 + shaded + storage // Note: v2 only depends on v1-shaded (without DeltaLog) at compile time, // but final jar includes full v1 for users From b55bfaaebdf5517cb0df6b65882cbe149393d7fa Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Thu, 9 Oct 2025 17:15:36 -0700 Subject: [PATCH 07/53] fix --- build.sbt | 23 ++++++++++++++++++- .../sql/DeltaSparkSessionExtension.scala | 12 ++++++++-- 2 files changed, 32 insertions(+), 3 deletions(-) diff --git a/build.sbt b/build.sbt index 696837791d4..41720297916 100644 --- a/build.sbt +++ b/build.sbt @@ -446,7 +446,7 @@ lazy val `delta-spark-v1` = (project in file("spark")) name := "delta-spark-v1", commonSettings, scalaStyleSettings, - sparkMimaSettings, + // No MiMa check - this is an internal module not published skipReleaseSettings, // Not published crossSparkSettings(), @@ -618,6 +618,27 @@ lazy val spark = (project in file("spark-combined")) // No prod code in this module Compile / sources := Seq.empty, + // Copy all classes from dependencies to classes directory for MiMa + Compile / compile := { + val _ = (Compile / compile).value + val classesDir = (Compile / classDirectory).value + val v1Classes = (`delta-spark-v1` / Compile / classDirectory).value + val v2Classes = (`delta-spark-v2` / Compile / classDirectory).value + val shadedClasses = (`delta-spark-shaded` / Compile / classDirectory).value + val storageClasses = (storage / Compile / classDirectory).value + + // Ensure classes directory exists + IO.createDirectory(classesDir) + + // Copy all classes (shaded classes override v1 classes) + IO.copyDirectory(v1Classes, classesDir, overwrite = false, preserveLastModified = true) + IO.copyDirectory(storageClasses, classesDir, overwrite = false, preserveLastModified = true) + 
IO.copyDirectory(v2Classes, classesDir, overwrite = true, preserveLastModified = true) + IO.copyDirectory(shadedClasses, classesDir, overwrite = true, preserveLastModified = true) + + sbt.internal.inc.Analysis.Empty + }, + // Use test sources from original spark/ directory Test / unmanagedSourceDirectories := Seq( baseDirectory.value.getParentFile / "spark" / "src" / "test" / "scala", diff --git a/spark-shaded/src/main/scala/io/delta/sql/DeltaSparkSessionExtension.scala b/spark-shaded/src/main/scala/io/delta/sql/DeltaSparkSessionExtension.scala index a7a1b39af23..27af8f3a55a 100644 --- a/spark-shaded/src/main/scala/io/delta/sql/DeltaSparkSessionExtension.scala +++ b/spark-shaded/src/main/scala/io/delta/sql/DeltaSparkSessionExtension.scala @@ -16,7 +16,8 @@ package io.delta.sql -import io.delta.sql.AbstractSparkSessionExtension +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.catalyst.rules.Rule /** * Delta Spark Session Extension that can register both V1 and V2 implementations. @@ -24,6 +25,13 @@ import io.delta.sql.AbstractSparkSessionExtension * - V1: org.apache.spark.sql.delta.* (full version with DeltaLog) * - V2: io.delta.kernel.spark.* */ -class DeltaSparkSessionExtension extends AbstractSparkSessionExtension { +class DeltaSparkSessionExtension extends io.delta.sql.AbstractSparkSessionExtension { + + /** + * NoOpRule for binary compatibility + */ + class NoOpRule extends Rule[LogicalPlan] { + override def apply(plan: LogicalPlan): LogicalPlan = plan + } } From 29a9dbeea746e3649bd4e6754aa1d5c1bfc8b8ce Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Thu, 9 Oct 2025 17:16:01 -0700 Subject: [PATCH 08/53] fix --- .../scala/io/delta/sql/DeltaSparkSessionExtension.scala | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/spark-shaded/src/main/scala/io/delta/sql/DeltaSparkSessionExtension.scala b/spark-shaded/src/main/scala/io/delta/sql/DeltaSparkSessionExtension.scala index 27af8f3a55a..47ae2823884 100644 --- a/spark-shaded/src/main/scala/io/delta/sql/DeltaSparkSessionExtension.scala +++ b/spark-shaded/src/main/scala/io/delta/sql/DeltaSparkSessionExtension.scala @@ -25,13 +25,6 @@ import org.apache.spark.sql.catalyst.rules.Rule * - V1: org.apache.spark.sql.delta.* (full version with DeltaLog) * - V2: io.delta.kernel.spark.* */ -class DeltaSparkSessionExtension extends io.delta.sql.AbstractSparkSessionExtension { - - /** - * NoOpRule for binary compatibility - */ - class NoOpRule extends Rule[LogicalPlan] { - override def apply(plan: LogicalPlan): LogicalPlan = plan - } +class DeltaSparkSessionExtension extends AbstractSparkSessionExtension { } From 5cbf64fa887cfee2dc15e2e0eac27195e64381bb Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Thu, 9 Oct 2025 21:52:07 -0700 Subject: [PATCH 09/53] fix --- .../scala/io/delta/sql/DeltaSparkSessionExtension.scala | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/spark-shaded/src/main/scala/io/delta/sql/DeltaSparkSessionExtension.scala b/spark-shaded/src/main/scala/io/delta/sql/DeltaSparkSessionExtension.scala index 47ae2823884..c547b8d5680 100644 --- a/spark-shaded/src/main/scala/io/delta/sql/DeltaSparkSessionExtension.scala +++ b/spark-shaded/src/main/scala/io/delta/sql/DeltaSparkSessionExtension.scala @@ -26,5 +26,13 @@ import org.apache.spark.sql.catalyst.rules.Rule * - V2: io.delta.kernel.spark.* */ class DeltaSparkSessionExtension extends AbstractSparkSessionExtension { + + /** + * NoOpRule for binary compatibility with Delta 3.3.0 + * This class must remain here to satisfy 
MiMa checks + */ + class NoOpRule extends Rule[LogicalPlan] { + override def apply(plan: LogicalPlan): LogicalPlan = plan + } } From 9a6acdd6fb3569080e03c86948e879a591055b6f Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Thu, 9 Oct 2025 22:36:44 -0700 Subject: [PATCH 10/53] fix --- build.sbt | 15 ++- .../defaults/CheckpointV2ReadSuite.scala | 3 +- .../kernel/defaults/ChecksumUtilsSuite.scala | 99 +++++++++---------- .../kernel/defaults/DomainMetadataSuite.scala | 17 ++-- .../kernel/defaults/TableChangesSuite.scala | 7 +- .../kernel/defaults/utils/TestUtils.scala | 35 ++++++- 6 files changed, 105 insertions(+), 71 deletions(-) diff --git a/build.sbt b/build.sbt index 41720297916..2c9f3bf2872 100644 --- a/build.sbt +++ b/build.sbt @@ -450,7 +450,8 @@ lazy val `delta-spark-v1` = (project in file("spark")) skipReleaseSettings, // Not published crossSparkSettings(), - // Only compile main sources, exclude tests + // Don't compile tests in delta-spark-v1 - they are compiled in the final spark module + // This avoids circular dependencies with delta-spark-shaded Test / sources := Seq.empty, Test / resources := Seq.empty, @@ -462,6 +463,16 @@ lazy val `delta-spark-v1` = (project in file("spark")) "org.apache.spark" %% "spark-catalyst" % sparkVersion.value % "provided", // For DynamoDBCommitStore "com.amazonaws" % "aws-java-sdk" % "1.12.262" % "provided", + + // Test dependencies + "org.scalatest" %% "scalatest" % scalaTestVersion % "test", + "org.scalatestplus" %% "scalacheck-1-15" % "3.2.9.0" % "test", + "junit" % "junit" % "4.13.2" % "test", + "com.github.sbt" % "junit-interface" % "0.13.3" % "test", + "org.mockito" % "mockito-inline" % "4.11.0" % "test", + "org.apache.spark" %% "spark-catalyst" % sparkVersion.value % "test" classifier "tests", + "org.apache.spark" %% "spark-core" % sparkVersion.value % "test" classifier "tests", + "org.apache.spark" %% "spark-sql" % sparkVersion.value % "test" classifier "tests", ), Compile / packageBin / mappings := (Compile / packageBin / mappings).value ++ listPythonFiles(baseDirectory.value.getParentFile / "python"), @@ -901,7 +912,6 @@ lazy val kernelDefaults = (project in file("kernel/kernel-defaults")) .dependsOn(kernelApi % "test->test") .dependsOn(storage) .dependsOn(storage % "test->test") // Required for InMemoryCommitCoordinator for tests - .dependsOn(`delta-spark-v1` % "test->test") .dependsOn(goldenTables % "test") .settings( name := "delta-kernel-defaults", @@ -924,6 +934,7 @@ lazy val kernelDefaults = (project in file("kernel/kernel-defaults")) "commons-io" % "commons-io" % "2.8.0" % "test", "com.novocode" % "junit-interface" % "0.11" % "test", "org.slf4j" % "slf4j-log4j12" % "1.7.36" % "test", + "io.delta" %% "delta-spark" % "3.3.2" % "test", // JMH dependencies allow writing micro-benchmarks for testing performance of components. // JMH has framework to define benchmarks and takes care of many common functionalities // such as warm runs, cold runs, defining benchmark parameter variables etc. 
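Note on the test changes that follow: the kernel-defaults suites stop importing `DeltaTestImplicits.OptimisticTxnTestHelper` from the delta-spark test jar and instead call a `commitManuallyWithValidation` helper added to `TestUtils.scala` later in this patch. A minimal sketch of that helper, assuming the `OptimisticTransaction.commit(Seq[Action], DeltaOperations.Operation)` API already used in these diffs (`CommitTestHelpers` and `ManualCommitOps` are illustrative names, not part of the patch):

```scala
// Sketch only: a test-side implicit that commits arbitrary actions without
// relying on delta-spark's published test utilities.
import org.apache.spark.sql.delta.{DeltaOperations, OptimisticTransaction}
import org.apache.spark.sql.delta.actions.Action

object CommitTestHelpers {
  implicit class ManualCommitOps(txn: OptimisticTransaction) {
    // Commits the given actions as a ManualUpdate operation, mirroring the
    // commitManuallyWithValidation helper introduced in TestUtils.scala below.
    def commitManuallyWithValidation(actions: Action*): Unit =
      txn.commit(actions.toSeq, DeltaOperations.ManualUpdate)
  }
}
```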
diff --git a/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/CheckpointV2ReadSuite.scala b/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/CheckpointV2ReadSuite.scala index 382f1f88938..fa46a4c645d 100644 --- a/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/CheckpointV2ReadSuite.scala +++ b/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/CheckpointV2ReadSuite.scala @@ -29,7 +29,6 @@ import io.delta.tables.DeltaTable import org.apache.spark.sql.delta.{DeltaLog, Snapshot} import org.apache.spark.sql.delta.actions.{AddFile, Metadata, Protocol} import org.apache.spark.sql.delta.sources.DeltaSQLConf -import org.apache.spark.sql.delta.test.DeltaTestImplicits.OptimisticTxnTestHelper import org.apache.spark.sql.delta.util.FileNames import org.apache.hadoop.conf.Configuration @@ -196,7 +195,7 @@ trait AbstractCheckpointV2ReadSuite extends AnyFunSuite with ExpressionTestUtils val protocol = Protocol(3, 7, Some(Set("v2Checkpoint")), Some(supportedFeatures)) val add = AddFile(new Path("addfile").toUri.toString, Map.empty, 100L, 10L, dataChange = true) - log.startTransaction().commitManually(Seq(metadata, add): _*) + log.startTransaction().commitManuallyWithValidation(metadata, add) log.upgradeProtocol(None, log.update(), protocol) log.checkpoint(log.update()) diff --git a/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/ChecksumUtilsSuite.scala b/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/ChecksumUtilsSuite.scala index 8d9951e2d12..eb39e788f5a 100644 --- a/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/ChecksumUtilsSuite.scala +++ b/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/ChecksumUtilsSuite.scala @@ -30,7 +30,6 @@ import io.delta.kernel.utils.CloseableIterable.emptyIterable import org.apache.spark.sql.delta.DeltaLog import org.apache.spark.sql.delta.actions.CommitInfo -import org.apache.spark.sql.delta.test.DeltaTestImplicits.OptimisticTxnTestHelper import org.apache.hadoop.fs.Path import org.scalatest.funsuite.AnyFunSuite @@ -155,22 +154,21 @@ class ChecksumUtilsSuite extends AnyFunSuite with WriteUtils with LogReplayBaseS val deltaLog = DeltaLog.forTable(spark, new Path(path)) deltaLog .startTransaction() - .commitManually( - List( - CommitInfo( - time = 12345, - operation = "MANUAL UPDATE", - inCommitTimestamp = Some(12345), - operationParameters = Map.empty, - commandContext = Map.empty, - readVersion = Some(11), - isolationLevel = None, - isBlindAppend = None, - operationMetrics = None, - userMetadata = None, - tags = None, - txnId = None), - deltaLog.getSnapshotAt(11).allFiles.head().copy(dataChange = false).wrap.unwrap): _*) + .commitManuallyWithValidation( + CommitInfo( + time = 12345, + operation = "MANUAL UPDATE", + inCommitTimestamp = Some(12345), + operationParameters = Map.empty, + commandContext = Map.empty, + readVersion = Some(11), + isolationLevel = None, + isBlindAppend = None, + operationMetrics = None, + userMetadata = None, + tags = None, + txnId = None), + deltaLog.getSnapshotAt(11).allFiles.head().copy(dataChange = false).wrap.unwrap) deleteChecksumFileForTableUsingHadoopFs( table.getPath(engine).stripPrefix("file:"), Seq(11, 12)) @@ -194,22 +192,21 @@ class ChecksumUtilsSuite extends AnyFunSuite with WriteUtils with LogReplayBaseS val deltaLog = DeltaLog.forTable(spark, new Path(path)) deltaLog .startTransaction() - .commitManually( - List( - CommitInfo( - time = 12345, - operation = "REPLACE TABLE", - inCommitTimestamp = Some(12345), - 
operationParameters = Map.empty, - commandContext = Map.empty, - readVersion = Some(11), - isolationLevel = None, - isBlindAppend = None, - operationMetrics = None, - userMetadata = None, - tags = None, - txnId = None), - deltaLog.getSnapshotAt(11).allFiles.head().remove.copy(size = None).wrap.unwrap): _*) + .commitManuallyWithValidation( + CommitInfo( + time = 12345, + operation = "REPLACE TABLE", + inCommitTimestamp = Some(12345), + operationParameters = Map.empty, + commandContext = Map.empty, + readVersion = Some(11), + isolationLevel = None, + isBlindAppend = None, + operationMetrics = None, + userMetadata = None, + tags = None, + txnId = None), + deltaLog.getSnapshotAt(11).allFiles.head().remove.copy(size = None).wrap.unwrap) // Spark generated CRC from Spark doesn't include file size histogram deleteChecksumFileForTableUsingHadoopFs( table.getPath(engine).stripPrefix("file:"), @@ -282,22 +279,21 @@ class ChecksumUtilsSuite extends AnyFunSuite with WriteUtils with LogReplayBaseS val deltaLog = DeltaLog.forTable(spark, new Path(path)) deltaLog .startTransaction() - .commitManually( - List( - deltaLog.getSnapshotAt(11).allFiles.head().remove.wrap.unwrap, - CommitInfo( - time = 12345, - operation = "REPLACE TABLE", - inCommitTimestamp = Some(12345), - operationParameters = Map.empty, - commandContext = Map.empty, - readVersion = Some(11), - isolationLevel = None, - isBlindAppend = None, - operationMetrics = None, - userMetadata = None, - tags = None, - txnId = None).wrap.unwrap): _*) + .commitManuallyWithValidation( + deltaLog.getSnapshotAt(11).allFiles.head().remove.wrap.unwrap, + CommitInfo( + time = 12345, + operation = "REPLACE TABLE", + inCommitTimestamp = Some(12345), + operationParameters = Map.empty, + commandContext = Map.empty, + readVersion = Some(11), + isolationLevel = None, + isBlindAppend = None, + operationMetrics = None, + userMetadata = None, + tags = None, + txnId = None).wrap.unwrap) // Spark generated CRC from Spark doesn't include file size histogram deleteChecksumFileForTableUsingHadoopFs( table.getPath(engine).stripPrefix("file:"), @@ -320,9 +316,8 @@ class ChecksumUtilsSuite extends AnyFunSuite with WriteUtils with LogReplayBaseS val deltaLog = DeltaLog.forTable(spark, new Path(path)) deltaLog .startTransaction() - .commitManually( - List( - deltaLog.getSnapshotAt(11).allFiles.head().remove.wrap.unwrap): _*) + .commitManuallyWithValidation( + deltaLog.getSnapshotAt(11).allFiles.head().remove.wrap.unwrap) // Spark generated CRC from Spark doesn't include file size histogram deleteChecksumFileForTableUsingHadoopFs( table.getPath(engine).stripPrefix("file:"), diff --git a/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/DomainMetadataSuite.scala b/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/DomainMetadataSuite.scala index 6682ca3c947..de2e58338d9 100644 --- a/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/DomainMetadataSuite.scala +++ b/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/DomainMetadataSuite.scala @@ -35,7 +35,6 @@ import io.delta.kernel.utils.CloseableIterable.emptyIterable import org.apache.spark.sql.delta.DeltaLog import org.apache.spark.sql.delta.RowId.{RowTrackingMetadataDomain => SparkRowTrackingMetadataDomain} import org.apache.spark.sql.delta.actions.{DomainMetadata => SparkDomainMetadata} -import org.apache.spark.sql.delta.test.DeltaTestImplicits.OptimisticTxnTestHelper import org.apache.hadoop.fs.Path import org.scalatest.funsuite.AnyFunSuite @@ -439,11 +438,10 @@ trait 
AbstractDomainMetadataSuite extends AnyFunSuite with AbstractWriteUtils val deltaLog = DeltaLog.forTable(spark, new Path(tablePath)) deltaLog .startTransaction() - .commitManually( - List( - SparkDomainMetadata("testDomain1", "{\"key1\":\"1\"}", removed = false), - SparkDomainMetadata("testDomain2", "", removed = false), - SparkDomainMetadata("testDomain3", "", removed = false)): _*) + .commitManuallyWithValidation( + SparkDomainMetadata("testDomain1", "{\"key1\":\"1\"}", removed = false), + SparkDomainMetadata("testDomain2", "", removed = false), + SparkDomainMetadata("testDomain3", "", removed = false)) // This will create 03.json and 03.checkpoint spark.range(0, 2).write.format("delta").mode("append").save(tablePath) @@ -451,10 +449,9 @@ trait AbstractDomainMetadataSuite extends AnyFunSuite with AbstractWriteUtils // Manually commit domain metadata actions. This will create 04.json deltaLog .startTransaction() - .commitManually( - List( - SparkDomainMetadata("testDomain1", "{\"key1\":\"10\"}", removed = false), - SparkDomainMetadata("testDomain2", "", removed = true)): _*) + .commitManuallyWithValidation( + SparkDomainMetadata("testDomain1", "{\"key1\":\"10\"}", removed = false), + SparkDomainMetadata("testDomain2", "", removed = true)) // Use Delta Kernel to read the table's domain metadata and verify the result. // We will need to read 1 checkpoint file and 1 log file to replay the table. diff --git a/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/TableChangesSuite.scala b/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/TableChangesSuite.scala index 659f2e21f6b..5543397b1af 100644 --- a/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/TableChangesSuite.scala +++ b/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/TableChangesSuite.scala @@ -37,7 +37,6 @@ import io.delta.kernel.internal.util.{FileNames, ManualClock, VectorUtils} import org.apache.spark.sql.delta.DeltaLog import org.apache.spark.sql.delta.actions.{Action => SparkAction, AddCDCFile => SparkAddCDCFile, AddFile => SparkAddFile, CommitInfo => SparkCommitInfo, Metadata => SparkMetadata, Protocol => SparkProtocol, RemoveFile => SparkRemoveFile, SetTransaction => SparkSetTransaction} -import org.apache.spark.sql.delta.test.DeltaTestImplicits.OptimisticTxnTestHelper import org.apache.hadoop.fs.{Path => HadoopPath} import org.apache.spark.sql.functions.col @@ -326,7 +325,7 @@ abstract class TableChangesSuite extends AnyFunSuite with TestUtils with WriteUt val add1 = SparkAddFile("fake/path/1", Map.empty, 1, 1, dataChange = true) val txn1 = log.startTransaction() - txn1.commitManually(metadata :: add1 :: Nil: _*) + txn1.commitManuallyWithValidation(metadata, add1) val addCDC2 = SparkAddCDCFile( "fake/path/2", @@ -335,12 +334,12 @@ abstract class TableChangesSuite extends AnyFunSuite with TestUtils with WriteUt Map("tag_foo" -> "tag_bar")) val remove2 = SparkRemoveFile("fake/path/1", Some(100), dataChange = true) val txn2 = log.startTransaction() - txn2.commitManually(addCDC2 :: remove2 :: Nil: _*) + txn2.commitManuallyWithValidation(addCDC2, remove2) val setTransaction3 = SparkSetTransaction("fakeAppId", 3L, Some(200)) val txn3 = log.startTransaction() val latestTableProtocol = log.snapshot.protocol - txn3.commitManually(latestTableProtocol :: setTransaction3 :: Nil: _*) + txn3.commitManuallyWithValidation(latestTableProtocol, setTransaction3) // request subset of actions testGetChangesVsSpark( diff --git 
a/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/utils/TestUtils.scala b/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/utils/TestUtils.scala index 24e53a9842f..35842e90c5b 100644 --- a/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/utils/TestUtils.scala +++ b/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/utils/TestUtils.scala @@ -17,7 +17,8 @@ package io.delta.kernel.defaults.utils import java.io.{File, FileNotFoundException} import java.math.{BigDecimal => BigDecimalJ} -import java.nio.file.Files +import java.nio.charset.StandardCharsets.UTF_8 +import java.nio.file.{Files, Paths} import java.util.{Optional, TimeZone, UUID} import scala.collection.JavaConverters._ @@ -170,6 +171,38 @@ trait AbstractTestUtils def toScala: Option[T] = if (optional.isPresent) Some(optional.get()) else None } + /** + * Provides test-only apis to internal Delta Spark APIs. + */ + implicit class OptimisticTxnTestHelper(txn: org.apache.spark.sql.delta.OptimisticTransaction) { + + /** + * Test only method to commit arbitrary actions to delta table. + */ + def commitManuallyWithValidation(actions: org.apache.spark.sql.delta.actions.Action*): Unit = { + txn.commit(actions.toSeq, org.apache.spark.sql.delta.DeltaOperations.ManualUpdate) + } + + /** + * Test only method to unsafe commit - writes actions directly to transaction log. + * Note: This bypasses Delta Spark transaction logic. + * + * @param tablePath The path to the Delta table + * @param version The commit version number + * @param actions Sequence of Action objects to write + */ + def commitUnsafe( + tablePath: String, + version: Long, + actions: org.apache.spark.sql.delta.actions.Action*): Unit = { + val logPath = new Path(tablePath, "_delta_log") + val commitFile = FileNames.unsafeDeltaFile(logPath, version) + val commitContent = actions.map(_.json + "\n").mkString.getBytes(UTF_8) + Files.write(Paths.get(commitFile.toString), commitContent) + Table.forPath(defaultEngine, tablePath).checksum(defaultEngine, version) + } + } + implicit object ResourceLoader { lazy val classLoader: ClassLoader = ResourceLoader.getClass.getClassLoader } From f8d4862edec7c0de6df15ff83171a9fe9f32f1c2 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Fri, 10 Oct 2025 00:40:44 -0700 Subject: [PATCH 11/53] fix --- .../test/scala/io/delta/kernel/defaults/utils/TestUtils.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/utils/TestUtils.scala b/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/utils/TestUtils.scala index 35842e90c5b..91d4d6deaa0 100644 --- a/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/utils/TestUtils.scala +++ b/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/utils/TestUtils.scala @@ -196,7 +196,7 @@ trait AbstractTestUtils version: Long, actions: org.apache.spark.sql.delta.actions.Action*): Unit = { val logPath = new Path(tablePath, "_delta_log") - val commitFile = FileNames.unsafeDeltaFile(logPath, version) + val commitFile = org.apache.spark.sql.delta.util.FileNames.unsafeDeltaFile(logPath, version) val commitContent = actions.map(_.json + "\n").mkString.getBytes(UTF_8) Files.write(Paths.get(commitFile.toString), commitContent) Table.forPath(defaultEngine, tablePath).checksum(defaultEngine, version) From 6c7c972ac90d381ae3a2b24f68cbcabf151b1bb3 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Fri, 10 Oct 2025 00:56:21 -0700 Subject: [PATCH 12/53] fix --- 
.../spark/utils/StreamingHelperTest.java | 12 ++---------- .../defaults/DeltaTableWritesSuite.scala | 19 +++++++++++++++++-- .../kernel/defaults/utils/TestUtils.scala | 2 +- 3 files changed, 20 insertions(+), 13 deletions(-) diff --git a/kernel-spark/src/test/java/io/delta/kernel/spark/utils/StreamingHelperTest.java b/kernel-spark/src/test/java/io/delta/kernel/spark/utils/StreamingHelperTest.java index 01786722c41..79fc2568203 100644 --- a/kernel-spark/src/test/java/io/delta/kernel/spark/utils/StreamingHelperTest.java +++ b/kernel-spark/src/test/java/io/delta/kernel/spark/utils/StreamingHelperTest.java @@ -140,7 +140,6 @@ public void testGetActiveCommitAtTime_pastTimestamp(@TempDir File tempDir) throw .history() .getActiveCommitAtTime( timestamp, - Option.empty(), false /* canReturnLastCommit */, true /* mustBeRecreatable */, false /* canReturnEarliestCommit */); @@ -172,7 +171,6 @@ public void testGetActiveCommitAtTime_futureTimestamp_canReturnLast(@TempDir Fil .history() .getActiveCommitAtTime( futureTimestamp, - Option.empty(), true /* canReturnLastCommit */, true /* mustBeRecreatable */, false /* canReturnEarliestCommit */); @@ -204,7 +202,6 @@ public void testGetActiveCommitAtTime_futureTimestamp_notMustBeRecreatable(@Temp .history() .getActiveCommitAtTime( futureTimestamp, - Option.empty(), true /* canReturnLastCommit */, false /* mustBeRecreatable */, false /* canReturnEarliestCommit */); @@ -236,7 +233,6 @@ public void testGetActiveCommitAtTime_earlyTimestamp_canReturnEarliest(@TempDir .history() .getActiveCommitAtTime( earlyTimestamp, - Option.empty(), false /* canReturnLastCommit */, true /* mustBeRecreatable */, true /* canReturnEarliestCommit */); @@ -268,7 +264,6 @@ public void testGetActiveCommitAtTime_earlyTimestamp_notMustBeRecreatable_canRet .history() .getActiveCommitAtTime( earlyTimestamp, - Option.empty(), false /* canReturnLastCommit */, false /* mustBeRecreatable */, true /* canReturnEarliestCommit */); @@ -352,13 +347,10 @@ public void testCheckVersionExists( () -> deltaLog .history() - .checkVersionExists( - versionToCheck, Option.empty(), mustBeRecreatable, allowOutOfRange)); + .checkVersionExists(versionToCheck, mustBeRecreatable, allowOutOfRange)); } else { streamingHelper.checkVersionExists(versionToCheck, mustBeRecreatable, allowOutOfRange); - deltaLog - .history() - .checkVersionExists(versionToCheck, Option.empty(), mustBeRecreatable, allowOutOfRange); + deltaLog.history().checkVersionExists(versionToCheck, mustBeRecreatable, allowOutOfRange); } } } diff --git a/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/DeltaTableWritesSuite.scala b/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/DeltaTableWritesSuite.scala index a1cd61f36da..d21460f4e73 100644 --- a/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/DeltaTableWritesSuite.scala +++ b/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/DeltaTableWritesSuite.scala @@ -423,8 +423,23 @@ abstract class AbstractDeltaTableWritesSuite extends AnyFunSuite with AbstractWr engine, tablePath, testSchema) - DeltaTable.forPath(spark, tablePath) - .addFeatureSupport("testUnsupportedWriter") + + // Use your new commitUnsafe API to write an unsupported writer feature + import org.apache.spark.sql.delta.DeltaLog + import org.apache.spark.sql.delta.actions.Protocol + + val deltaLog = DeltaLog.forTable(spark, tablePath) + val txn = deltaLog.startTransaction() + + // Create Protocol action with unsupported writer feature + val protocolAction = Protocol( + minReaderVersion 
= 3, + minWriterVersion = 7, + readerFeatures = Some(Set.empty), + writerFeatures = Some(Set("testUnsupportedWriter"))) + + // Use your elegant API to commit directly to version 1 + txn.commitUnsafe(tablePath, 1L, protocolAction) val e = intercept[KernelException] { getUpdateTxn(engine, tablePath) } diff --git a/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/utils/TestUtils.scala b/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/utils/TestUtils.scala index 91d4d6deaa0..aba5c8553c1 100644 --- a/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/utils/TestUtils.scala +++ b/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/utils/TestUtils.scala @@ -195,7 +195,7 @@ trait AbstractTestUtils tablePath: String, version: Long, actions: org.apache.spark.sql.delta.actions.Action*): Unit = { - val logPath = new Path(tablePath, "_delta_log") + val logPath = new org.apache.hadoop.fs.Path(tablePath, "_delta_log") val commitFile = org.apache.spark.sql.delta.util.FileNames.unsafeDeltaFile(logPath, version) val commitContent = actions.map(_.json + "\n").mkString.getBytes(UTF_8) Files.write(Paths.get(commitFile.toString), commitContent) From 565f9cbe478b25fe0ca7a81a8eee16a21ca56eea Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Fri, 10 Oct 2025 10:36:31 -0700 Subject: [PATCH 13/53] fix --- build.sbt | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/build.sbt b/build.sbt index 2c9f3bf2872..b94e8c35606 100644 --- a/build.sbt +++ b/build.sbt @@ -650,7 +650,7 @@ lazy val spark = (project in file("spark-combined")) sbt.internal.inc.Analysis.Empty }, - // Use test sources from original spark/ directory + // Use test sources and resources from original spark/ directory Test / unmanagedSourceDirectories := Seq( baseDirectory.value.getParentFile / "spark" / "src" / "test" / "scala", baseDirectory.value.getParentFile / "spark" / "src" / "test" / "java" @@ -658,6 +658,9 @@ lazy val spark = (project in file("spark-combined")) Test / unmanagedResourceDirectories := Seq( baseDirectory.value.getParentFile / "spark" / "src" / "test" / "resources" ), + Test / resourceDirectory := baseDirectory.value.getParentFile / "spark" / "src" / "test" / "resources", + // Set working directory for tests to spark/ so relative paths work + Test / baseDirectory := baseDirectory.value.getParentFile / "spark", // Package combined classes: FULL v1 (with DeltaLog) + v2 + shaded + storage // Note: v2 only depends on v1-shaded (without DeltaLog) at compile time, From 74a1f5c1e429328484ff660ba006e1cad2a869fa Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Fri, 10 Oct 2025 10:41:14 -0700 Subject: [PATCH 14/53] fix From c6c306d310fc31b2d359a7c9387c26fe61af8962 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Fri, 10 Oct 2025 11:01:12 -0700 Subject: [PATCH 15/53] fix From dbf686bc7386010a85fb3edb7a1299a4d5cdf856 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Fri, 10 Oct 2025 12:36:54 -0700 Subject: [PATCH 16/53] fix --- build.sbt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/build.sbt b/build.sbt index b94e8c35606..85ae764764a 100644 --- a/build.sbt +++ b/build.sbt @@ -661,6 +661,8 @@ lazy val spark = (project in file("spark-combined")) Test / resourceDirectory := baseDirectory.value.getParentFile / "spark" / "src" / "test" / "resources", // Set working directory for tests to spark/ so relative paths work Test / baseDirectory := baseDirectory.value.getParentFile / "spark", + // Also set the working directory for forked test JVMs + Test / javaOptions += 
s"-Duser.dir=${(baseDirectory.value.getParentFile / "spark").getAbsolutePath}", // Package combined classes: FULL v1 (with DeltaLog) + v2 + shaded + storage // Note: v2 only depends on v1-shaded (without DeltaLog) at compile time, From 2929d7220bb6d03127fa75f8b558dd3262e9e13f Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Fri, 10 Oct 2025 16:19:05 -0700 Subject: [PATCH 17/53] fix --- build.sbt | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/build.sbt b/build.sbt index 85ae764764a..09f23f4b7ee 100644 --- a/build.sbt +++ b/build.sbt @@ -663,6 +663,14 @@ lazy val spark = (project in file("spark-combined")) Test / baseDirectory := baseDirectory.value.getParentFile / "spark", // Also set the working directory for forked test JVMs Test / javaOptions += s"-Duser.dir=${(baseDirectory.value.getParentFile / "spark").getAbsolutePath}", + // Map target directory for tests that write logs/output + Test / target := baseDirectory.value.getParentFile / "spark" / "target", + // Ensure the build creates necessary directories before tests run + Test / test := { + val sparkTargetDir = (baseDirectory.value.getParentFile / "spark" / "target") + if (!sparkTargetDir.exists()) sparkTargetDir.mkdirs() + (Test / test).value + }, // Package combined classes: FULL v1 (with DeltaLog) + v2 + shaded + storage // Note: v2 only depends on v1-shaded (without DeltaLog) at compile time, From 642a42ad9ee1c8d646f846106f8e5a72a5dab2b3 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Fri, 10 Oct 2025 20:51:28 -0700 Subject: [PATCH 18/53] fix From e485cda697dbeff9b945efb1fc77c7cf22fbc5ae Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Fri, 10 Oct 2025 21:04:04 -0700 Subject: [PATCH 19/53] fix --- build.sbt | 38 +++++++++----------------------------- 1 file changed, 9 insertions(+), 29 deletions(-) diff --git a/build.sbt b/build.sbt index 09f23f4b7ee..7749ef92337 100644 --- a/build.sbt +++ b/build.sbt @@ -586,7 +586,6 @@ lazy val `delta-spark-shaded` = (project in file("spark-shaded")) // ============================================================ lazy val spark = (project in file("spark-combined")) .dependsOn(`delta-spark-shaded`) // Direct dependency on shaded (for delegation classes) - .dependsOn(`delta-spark-v1` % "test->test") // Test utilities from v1 .dependsOn(storage) // Explicit dependency on storage .settings ( name := "delta-spark", @@ -650,28 +649,6 @@ lazy val spark = (project in file("spark-combined")) sbt.internal.inc.Analysis.Empty }, - // Use test sources and resources from original spark/ directory - Test / unmanagedSourceDirectories := Seq( - baseDirectory.value.getParentFile / "spark" / "src" / "test" / "scala", - baseDirectory.value.getParentFile / "spark" / "src" / "test" / "java" - ), - Test / unmanagedResourceDirectories := Seq( - baseDirectory.value.getParentFile / "spark" / "src" / "test" / "resources" - ), - Test / resourceDirectory := baseDirectory.value.getParentFile / "spark" / "src" / "test" / "resources", - // Set working directory for tests to spark/ so relative paths work - Test / baseDirectory := baseDirectory.value.getParentFile / "spark", - // Also set the working directory for forked test JVMs - Test / javaOptions += s"-Duser.dir=${(baseDirectory.value.getParentFile / "spark").getAbsolutePath}", - // Map target directory for tests that write logs/output - Test / target := baseDirectory.value.getParentFile / "spark" / "target", - // Ensure the build creates necessary directories before tests run - Test / test := { - val sparkTargetDir = (baseDirectory.value.getParentFile / "spark" / 
"target") - if (!sparkTargetDir.exists()) sparkTargetDir.mkdirs() - (Test / test).value - }, - // Package combined classes: FULL v1 (with DeltaLog) + v2 + shaded + storage // Note: v2 only depends on v1-shaded (without DeltaLog) at compile time, // but final jar includes full v1 for users @@ -682,21 +659,21 @@ lazy val spark = (project in file("spark-combined")) val storageClasses = (storage / Compile / packageBin / mappings).value // Add storage classes // Merge all mappings, shaded classes override v1 classes if there are conflicts - // This allows delegation classes in shaded (DeltaCatalog, DeltaSparkSessionExtension) - // to replace v1 originals val allMappings = v1Full ++ v2 ++ storageClasses ++ shaded // Remove duplicates by path (keep the last occurrence, which is from shaded) allMappings.groupBy(_._2).map(_._2.last).toSeq }, - // Test sources point to original spark/src/test/ (no file movement) - Test / unmanagedSourceDirectories ++= Seq( + // Test sources and resources from original spark/ directory + Test / unmanagedSourceDirectories := Seq( baseDirectory.value.getParentFile / "spark" / "src" / "test" / "scala", baseDirectory.value.getParentFile / "spark" / "src" / "test" / "java" ), - Test / unmanagedResourceDirectories += - baseDirectory.value.getParentFile / "spark" / "src" / "test" / "resources", + Test / unmanagedResourceDirectories := Seq( + baseDirectory.value.getParentFile / "spark" / "src" / "test" / "resources" + ), + Test / resourceDirectory := baseDirectory.value.getParentFile / "spark" / "src" / "test" / "resources", // Include spark-version-specific test sources Test / unmanagedSourceDirectories ++= { @@ -710,6 +687,9 @@ lazy val spark = (project in file("spark-combined")) } }, + // Set working directory for tests to spark/ so relative paths work + Test / baseDirectory := baseDirectory.value.getParentFile / "spark", + libraryDependencies ++= Seq( // Provided deps (needed for compile and test) "org.apache.spark" %% "spark-hive" % sparkVersion.value % "provided", From f6ef579e9b66f24e478ce656813fcc082ff99fa4 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Fri, 10 Oct 2025 22:55:04 -0700 Subject: [PATCH 20/53] fix --- build.sbt | 1 + 1 file changed, 1 insertion(+) diff --git a/build.sbt b/build.sbt index 7749ef92337..ea834f060f0 100644 --- a/build.sbt +++ b/build.sbt @@ -689,6 +689,7 @@ lazy val spark = (project in file("spark-combined")) // Set working directory for tests to spark/ so relative paths work Test / baseDirectory := baseDirectory.value.getParentFile / "spark", + Test / javaOptions += s"-Duser.dir=${(baseDirectory.value.getParentFile / "spark").getAbsolutePath}", libraryDependencies ++= Seq( // Provided deps (needed for compile and test) From 5f0dbc8418a89704506746ee96cb10db4afdb119 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Sat, 11 Oct 2025 09:26:07 -0700 Subject: [PATCH 21/53] fix --- build.sbt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/build.sbt b/build.sbt index ea834f060f0..e985fb8947c 100644 --- a/build.sbt +++ b/build.sbt @@ -689,7 +689,6 @@ lazy val spark = (project in file("spark-combined")) // Set working directory for tests to spark/ so relative paths work Test / baseDirectory := baseDirectory.value.getParentFile / "spark", - Test / javaOptions += s"-Duser.dir=${(baseDirectory.value.getParentFile / "spark").getAbsolutePath}", libraryDependencies ++= Seq( // Provided deps (needed for compile and test) @@ -728,7 +727,8 @@ lazy val spark = (project in file("spark-combined")) "-Ddelta.log.cacheSize=3", 
"-Dspark.databricks.delta.delta.log.cacheSize=3", "-Dspark.sql.sources.parallelPartitionDiscovery.parallelism=5", - "-Xmx1024m" + "-Xmx1024m", + s"-Duser.dir=${(baseDirectory.value.getParentFile / "spark").getAbsolutePath}" // Set working directory for relative paths ), // Required for testing table features see https://github.com/delta-io/delta/issues/1602 From 812ba1d4ad75e2d49e422020062162c30f78bca9 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Sat, 11 Oct 2025 14:52:51 -0700 Subject: [PATCH 22/53] fix --- build.sbt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/build.sbt b/build.sbt index e985fb8947c..e76d1c3ae97 100644 --- a/build.sbt +++ b/build.sbt @@ -716,6 +716,9 @@ lazy val spark = (project in file("spark-combined")) // Don't execute in parallel since we can't have multiple Sparks in the same JVM Test / parallelExecution := false, + // Fork tests to ensure javaOptions (especially user.dir) are applied + Test / fork := true, + javaOptions += "-Xmx1024m", // Configurations to speed up tests and reduce memory footprint From 7dd8d18a37e8fc07e6769708e0df1858e1cd4ca2 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Sat, 11 Oct 2025 15:47:41 -0700 Subject: [PATCH 23/53] fix --- build.sbt | 9 ++++++++- .../spark/utils/StreamingHelperTest.java | 19 +++++++++++++++++-- 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/build.sbt b/build.sbt index e76d1c3ae97..2e7473d71d8 100644 --- a/build.sbt +++ b/build.sbt @@ -551,10 +551,17 @@ lazy val `delta-spark-v2` = (project in file("kernel-spark")) "org.apache.spark" %% "spark-core" % sparkVersion.value % "provided", "org.apache.spark" %% "spark-catalyst" % sparkVersion.value % "provided", + // Test dependencies "org.junit.jupiter" % "junit-jupiter-api" % "5.8.2" % "test", "org.junit.jupiter" % "junit-jupiter-engine" % "5.8.2" % "test", "org.junit.jupiter" % "junit-jupiter-params" % "5.8.2" % "test", - "net.aichler" % "jupiter-interface" % "0.11.1" % "test" + "net.aichler" % "jupiter-interface" % "0.11.1" % "test", + // Spark test classes for Scala/Java test utilities + "org.apache.spark" %% "spark-catalyst" % sparkVersion.value % "test" classifier "tests", + "org.apache.spark" %% "spark-core" % sparkVersion.value % "test" classifier "tests", + "org.apache.spark" %% "spark-sql" % sparkVersion.value % "test" classifier "tests", + // ScalaTest for test utilities (needed by Spark test classes) + "org.scalatest" %% "scalatest" % scalaTestVersion % "test" ), Test / testOptions += Tests.Argument(TestFrameworks.JUnit, "-v", "-a") ) diff --git a/kernel-spark/src/test/java/io/delta/kernel/spark/utils/StreamingHelperTest.java b/kernel-spark/src/test/java/io/delta/kernel/spark/utils/StreamingHelperTest.java index 79fc2568203..ef600fdce22 100644 --- a/kernel-spark/src/test/java/io/delta/kernel/spark/utils/StreamingHelperTest.java +++ b/kernel-spark/src/test/java/io/delta/kernel/spark/utils/StreamingHelperTest.java @@ -140,6 +140,7 @@ public void testGetActiveCommitAtTime_pastTimestamp(@TempDir File tempDir) throw .history() .getActiveCommitAtTime( timestamp, + deltaLog.initialCatalogTable() /* catalogTableOpt */, false /* canReturnLastCommit */, true /* mustBeRecreatable */, false /* canReturnEarliestCommit */); @@ -171,6 +172,7 @@ public void testGetActiveCommitAtTime_futureTimestamp_canReturnLast(@TempDir Fil .history() .getActiveCommitAtTime( futureTimestamp, + deltaLog.initialCatalogTable() /* catalogTableOpt */, true /* canReturnLastCommit */, true /* mustBeRecreatable */, false /* canReturnEarliestCommit */); @@ -202,6 +204,7 @@ public 
void testGetActiveCommitAtTime_futureTimestamp_notMustBeRecreatable(@Temp .history() .getActiveCommitAtTime( futureTimestamp, + deltaLog.initialCatalogTable() /* catalogTableOpt */, true /* canReturnLastCommit */, false /* mustBeRecreatable */, false /* canReturnEarliestCommit */); @@ -233,6 +236,7 @@ public void testGetActiveCommitAtTime_earlyTimestamp_canReturnEarliest(@TempDir .history() .getActiveCommitAtTime( earlyTimestamp, + deltaLog.initialCatalogTable() /* catalogTableOpt */, false /* canReturnLastCommit */, true /* mustBeRecreatable */, true /* canReturnEarliestCommit */); @@ -264,6 +268,7 @@ public void testGetActiveCommitAtTime_earlyTimestamp_notMustBeRecreatable_canRet .history() .getActiveCommitAtTime( earlyTimestamp, + deltaLog.initialCatalogTable() /* catalogTableOpt */, false /* canReturnLastCommit */, false /* mustBeRecreatable */, true /* canReturnEarliestCommit */); @@ -347,10 +352,20 @@ public void testCheckVersionExists( () -> deltaLog .history() - .checkVersionExists(versionToCheck, mustBeRecreatable, allowOutOfRange)); + .checkVersionExists( + versionToCheck, + deltaLog.initialCatalogTable() /* catalogTableOpt */, + mustBeRecreatable, + allowOutOfRange)); } else { streamingHelper.checkVersionExists(versionToCheck, mustBeRecreatable, allowOutOfRange); - deltaLog.history().checkVersionExists(versionToCheck, mustBeRecreatable, allowOutOfRange); + deltaLog + .history() + .checkVersionExists( + versionToCheck, + scala.Option.apply(null) /* catalogTableOpt */, + mustBeRecreatable, + allowOutOfRange); } } } From 1658b8f587083a2437e1e03eb4e630560e3e6213 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Sat, 11 Oct 2025 21:05:19 -0700 Subject: [PATCH 24/53] fix --- build.sbt | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/build.sbt b/build.sbt index 2e7473d71d8..bd4d7650c35 100644 --- a/build.sbt +++ b/build.sbt @@ -725,8 +725,15 @@ lazy val spark = (project in file("spark-combined")) // Fork tests to ensure javaOptions (especially user.dir) are applied Test / fork := true, - - javaOptions += "-Xmx1024m", + + // Set fork options to run tests in spark/ directory + Test / forkOptions := { + val sparkDir = (Test / baseDirectory).value + val currentEnv = (Test / envVars).value + ForkOptions() + .withWorkingDirectory(sparkDir) + .withEnvVars(currentEnv) + }, // Configurations to speed up tests and reduce memory footprint Test / javaOptions ++= Seq( @@ -737,8 +744,7 @@ lazy val spark = (project in file("spark-combined")) "-Ddelta.log.cacheSize=3", "-Dspark.databricks.delta.delta.log.cacheSize=3", "-Dspark.sql.sources.parallelPartitionDiscovery.parallelism=5", - "-Xmx1024m", - s"-Duser.dir=${(baseDirectory.value.getParentFile / "spark").getAbsolutePath}" // Set working directory for relative paths + "-Xmx1024m" ), // Required for testing table features see https://github.com/delta-io/delta/issues/1602 From 743ca99c15c8f07c02c30ed6e4f744c99625ed17 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Sun, 12 Oct 2025 11:35:15 -0700 Subject: [PATCH 25/53] fix --- build.sbt | 46 ++++++++++++++++++++++++---------------------- 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/build.sbt b/build.sbt index bd4d7650c35..e64ab8704b4 100644 --- a/build.sbt +++ b/build.sbt @@ -723,32 +723,34 @@ lazy val spark = (project in file("spark-combined")) // Don't execute in parallel since we can't have multiple Sparks in the same JVM Test / parallelExecution := false, - // Fork tests to ensure javaOptions (especially user.dir) are applied + // Required for testing 
table features see https://github.com/delta-io/delta/issues/1602 + Test / envVars += ("DELTA_TESTING", "1"), + + // Fork tests to ensure javaOptions are applied Test / fork := true, - // Set fork options to run tests in spark/ directory - Test / forkOptions := { - val sparkDir = (Test / baseDirectory).value - val currentEnv = (Test / envVars).value - ForkOptions() - .withWorkingDirectory(sparkDir) - .withEnvVars(currentEnv) - }, - - // Configurations to speed up tests and reduce memory footprint - Test / javaOptions ++= Seq( - "-Dspark.ui.enabled=false", - "-Dspark.ui.showConsoleProgress=false", - "-Dspark.databricks.delta.snapshotPartitions=2", - "-Dspark.sql.shuffle.partitions=5", - "-Ddelta.log.cacheSize=3", - "-Dspark.databricks.delta.delta.log.cacheSize=3", - "-Dspark.sql.sources.parallelPartitionDiscovery.parallelism=5", - "-Xmx1024m" + // Set working directory for forked tests to spark/ directory + Test / forkOptions := (Test / forkOptions).value.withWorkingDirectory( + (Test / baseDirectory).value ), - // Required for testing table features see https://github.com/delta-io/delta/issues/1602 - Test / envVars += ("DELTA_TESTING", "1"), + // Configurations to speed up tests and reduce memory footprint + Test / javaOptions ++= { + val sparkDir = (Test / baseDirectory).value + Seq( + // Explicitly set user.dir for cross-platform compatibility + // On some platforms, withWorkingDirectory doesn't update user.dir + s"-Duser.dir=$sparkDir", + "-Dspark.ui.enabled=false", + "-Dspark.ui.showConsoleProgress=false", + "-Dspark.databricks.delta.snapshotPartitions=2", + "-Dspark.sql.shuffle.partitions=5", + "-Ddelta.log.cacheSize=3", + "-Dspark.databricks.delta.delta.log.cacheSize=3", + "-Dspark.sql.sources.parallelPartitionDiscovery.parallelism=5", + "-Xmx1024m" + ) + }, TestParallelization.settings, ) From 33f484eba9bd155fd5154b969dc3257b9c7a7f83 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Sun, 12 Oct 2025 13:17:59 -0700 Subject: [PATCH 26/53] Fix test working directory: use baseDirectory instead of Test/baseDirectory to avoid evaluation order issues --- build.sbt | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/build.sbt b/build.sbt index e64ab8704b4..8df3bcf08e7 100644 --- a/build.sbt +++ b/build.sbt @@ -735,22 +735,20 @@ lazy val spark = (project in file("spark-combined")) ), // Configurations to speed up tests and reduce memory footprint - Test / javaOptions ++= { - val sparkDir = (Test / baseDirectory).value - Seq( - // Explicitly set user.dir for cross-platform compatibility - // On some platforms, withWorkingDirectory doesn't update user.dir - s"-Duser.dir=$sparkDir", - "-Dspark.ui.enabled=false", - "-Dspark.ui.showConsoleProgress=false", - "-Dspark.databricks.delta.snapshotPartitions=2", - "-Dspark.sql.shuffle.partitions=5", - "-Ddelta.log.cacheSize=3", - "-Dspark.databricks.delta.delta.log.cacheSize=3", - "-Dspark.sql.sources.parallelPartitionDiscovery.parallelism=5", - "-Xmx1024m" - ) - }, + Test / javaOptions ++= Seq( + // Explicitly set user.dir for cross-platform compatibility + // On some platforms, withWorkingDirectory doesn't update user.dir + // Use absolute path to avoid dependency on baseDirectory resolution order + s"-Duser.dir=${baseDirectory.value.getParentFile}/spark", + "-Dspark.ui.enabled=false", + "-Dspark.ui.showConsoleProgress=false", + "-Dspark.databricks.delta.snapshotPartitions=2", + "-Dspark.sql.shuffle.partitions=5", + "-Ddelta.log.cacheSize=3", + "-Dspark.databricks.delta.delta.log.cacheSize=3", + 
"-Dspark.sql.sources.parallelPartitionDiscovery.parallelism=5", + "-Xmx1024m" + ), TestParallelization.settings, ) From 6f833ef816c9964d84e128d889bbc2b33d283323 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Sun, 12 Oct 2025 13:22:51 -0700 Subject: [PATCH 27/53] Refactor: use delta-spark-v1's baseDirectory directly for better clarity - Changed from baseDirectory.getParentFile/spark to (delta-spark-v1/baseDirectory).value - This is more explicit and clearly shows we're using delta-spark-v1's directory - Makes the relationship between modules more obvious --- build.sbt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/build.sbt b/build.sbt index 8df3bcf08e7..053c0522498 100644 --- a/build.sbt +++ b/build.sbt @@ -694,8 +694,8 @@ lazy val spark = (project in file("spark-combined")) } }, - // Set working directory for tests to spark/ so relative paths work - Test / baseDirectory := baseDirectory.value.getParentFile / "spark", + // Set working directory for tests to spark/ (delta-spark-v1's directory) + Test / baseDirectory := (`delta-spark-v1` / baseDirectory).value, libraryDependencies ++= Seq( // Provided deps (needed for compile and test) @@ -738,8 +738,8 @@ lazy val spark = (project in file("spark-combined")) Test / javaOptions ++= Seq( // Explicitly set user.dir for cross-platform compatibility // On some platforms, withWorkingDirectory doesn't update user.dir - // Use absolute path to avoid dependency on baseDirectory resolution order - s"-Duser.dir=${baseDirectory.value.getParentFile}/spark", + // Use delta-spark-v1's baseDirectory (which is spark/) for clarity + s"-Duser.dir=${(`delta-spark-v1` / baseDirectory).value}", "-Dspark.ui.enabled=false", "-Dspark.ui.showConsoleProgress=false", "-Dspark.databricks.delta.snapshotPartitions=2", From 5c05ad4bc97288de02a1cf4c3bc233780b93c44f Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Sun, 12 Oct 2025 14:52:49 -0700 Subject: [PATCH 28/53] Fix: use delta-spark-v1's baseDirectory for all test paths Issue: Test resource directories were using baseDirectory.getParentFile/spark which could evaluate to the wrong path depending on evaluation order. Solution: Changed all test path configurations to consistently use (delta-spark-v1/baseDirectory).value: - Test/unmanagedSourceDirectories - Test/unmanagedResourceDirectories - Test/resourceDirectory - Test/baseDirectory - Test/javaOptions (-Duser.dir) This ensures all test paths correctly point to the spark/ directory regardless of evaluation order, fixing GitHub Actions failures. 
--- build.sbt | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/build.sbt b/build.sbt index 053c0522498..8addb9ec143 100644 --- a/build.sbt +++ b/build.sbt @@ -672,23 +672,27 @@ lazy val spark = (project in file("spark-combined")) allMappings.groupBy(_._2).map(_._2.last).toSeq }, - // Test sources and resources from original spark/ directory - Test / unmanagedSourceDirectories := Seq( - baseDirectory.value.getParentFile / "spark" / "src" / "test" / "scala", - baseDirectory.value.getParentFile / "spark" / "src" / "test" / "java" - ), + // Test sources and resources from original spark/ directory (delta-spark-v1's directory) + Test / unmanagedSourceDirectories := { + val sparkDir = (`delta-spark-v1` / baseDirectory).value + Seq( + sparkDir / "src" / "test" / "scala", + sparkDir / "src" / "test" / "java" + ) + }, Test / unmanagedResourceDirectories := Seq( - baseDirectory.value.getParentFile / "spark" / "src" / "test" / "resources" + (`delta-spark-v1` / baseDirectory).value / "src" / "test" / "resources" ), - Test / resourceDirectory := baseDirectory.value.getParentFile / "spark" / "src" / "test" / "resources", + Test / resourceDirectory := (`delta-spark-v1` / baseDirectory).value / "src" / "test" / "resources", // Include spark-version-specific test sources Test / unmanagedSourceDirectories ++= { val sparkVer = sparkVersion.value + val sparkDir = (`delta-spark-v1` / baseDirectory).value if (sparkVer.startsWith("3.5")) { - Seq(baseDirectory.value.getParentFile / "spark" / "src" / "test" / "scala-spark-3.5") + Seq(sparkDir / "src" / "test" / "scala-spark-3.5") } else if (sparkVer.startsWith("4.0")) { - Seq(baseDirectory.value.getParentFile / "spark" / "src" / "test" / "scala-spark-master") + Seq(sparkDir / "src" / "test" / "scala-spark-master") } else { Seq.empty } From fb98c0c213bec7af8bc0ddca186d6633e3b0e431 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Sun, 12 Oct 2025 16:18:43 -0700 Subject: [PATCH 29/53] Add debug output for Test/javaOptions user.dir to diagnose GitHub Actions issue --- build.sbt | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/build.sbt b/build.sbt index 8addb9ec143..0ce948445b5 100644 --- a/build.sbt +++ b/build.sbt @@ -733,17 +733,23 @@ lazy val spark = (project in file("spark-combined")) // Fork tests to ensure javaOptions are applied Test / fork := true, - // Set working directory for forked tests to spark/ directory + // Set working directory for forked tests to spark/ directory + // Note: withWorkingDirectory sets the process working directory, but Java's user.dir + // system property might not update automatically, so we also set it in javaOptions Test / forkOptions := (Test / forkOptions).value.withWorkingDirectory( (Test / baseDirectory).value ), // Configurations to speed up tests and reduce memory footprint - Test / javaOptions ++= Seq( - // Explicitly set user.dir for cross-platform compatibility - // On some platforms, withWorkingDirectory doesn't update user.dir - // Use delta-spark-v1's baseDirectory (which is spark/) for clarity - s"-Duser.dir=${(`delta-spark-v1` / baseDirectory).value}", + Test / javaOptions ++= { + val sparkDir = (`delta-spark-v1` / baseDirectory).value + // Print debug info (will show during SBT loading) + println(s"[Delta Build] Setting Test/javaOptions user.dir to: $sparkDir") + Seq( + // Explicitly set user.dir for cross-platform compatibility + // On some platforms, withWorkingDirectory doesn't update user.dir + // Use delta-spark-v1's baseDirectory 
(which is spark/) for clarity + s"-Duser.dir=$sparkDir", "-Dspark.ui.enabled=false", "-Dspark.ui.showConsoleProgress=false", "-Dspark.databricks.delta.snapshotPartitions=2", @@ -752,7 +758,8 @@ lazy val spark = (project in file("spark-combined")) "-Dspark.databricks.delta.delta.log.cacheSize=3", "-Dspark.sql.sources.parallelPartitionDiscovery.parallelism=5", "-Xmx1024m" - ), + ) + }, TestParallelization.settings, ) From 8c4dd7e0bd21574f3a3212b6e60fbe17ae561d43 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Sun, 12 Oct 2025 16:20:22 -0700 Subject: [PATCH 30/53] Add more debug output for forkOptions working directory --- build.sbt | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/build.sbt b/build.sbt index 0ce948445b5..bdac923d568 100644 --- a/build.sbt +++ b/build.sbt @@ -736,9 +736,12 @@ lazy val spark = (project in file("spark-combined")) // Set working directory for forked tests to spark/ directory // Note: withWorkingDirectory sets the process working directory, but Java's user.dir // system property might not update automatically, so we also set it in javaOptions - Test / forkOptions := (Test / forkOptions).value.withWorkingDirectory( - (Test / baseDirectory).value - ), + Test / forkOptions := { + val sparkDir = (Test / baseDirectory).value + val opts = (Test / forkOptions).value + println(s"[Delta Build] Setting Test/forkOptions workingDirectory to: $sparkDir") + opts.withWorkingDirectory(sparkDir) + }, // Configurations to speed up tests and reduce memory footprint Test / javaOptions ++= { From a0af2a92db9abf9fb694979244b882a3befbdf96 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Sun, 12 Oct 2025 16:31:51 -0700 Subject: [PATCH 31/53] Fix: remove duplicate javaOptions - only add user.dir The spark module was adding all test javaOptions again (which are already in commonSettings), causing duplicates. Now it only adds -Duser.dir which is spark-specific. 
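Duplicate flags such as -Xmx are normally resolved last-one-wins by the JVM,
so the repetition was mostly noise, but it made the forked test command line
hard to read and obscured which module owned which option. The intent is that
spark-combined only contributes its one extra flag, sketched as:

    // commonSettings already supplies the shared test javaOptions;
    // spark-combined only adds the working-directory override.
    Test / javaOptions += s"-Duser.dir=${(`delta-spark-v1` / baseDirectory).value}"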
--- build.sbt | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/build.sbt b/build.sbt index bdac923d568..531d0ce6247 100644 --- a/build.sbt +++ b/build.sbt @@ -743,7 +743,8 @@ lazy val spark = (project in file("spark-combined")) opts.withWorkingDirectory(sparkDir) }, - // Configurations to speed up tests and reduce memory footprint + // Set user.dir explicitly for cross-platform compatibility + // Note: commonSettings already includes standard test javaOptions, we only add user.dir here Test / javaOptions ++= { val sparkDir = (`delta-spark-v1` / baseDirectory).value // Print debug info (will show during SBT loading) @@ -752,15 +753,7 @@ lazy val spark = (project in file("spark-combined")) // Explicitly set user.dir for cross-platform compatibility // On some platforms, withWorkingDirectory doesn't update user.dir // Use delta-spark-v1's baseDirectory (which is spark/) for clarity - s"-Duser.dir=$sparkDir", - "-Dspark.ui.enabled=false", - "-Dspark.ui.showConsoleProgress=false", - "-Dspark.databricks.delta.snapshotPartitions=2", - "-Dspark.sql.shuffle.partitions=5", - "-Ddelta.log.cacheSize=3", - "-Dspark.databricks.delta.delta.log.cacheSize=3", - "-Dspark.sql.sources.parallelPartitionDiscovery.parallelism=5", - "-Xmx1024m" + s"-Duser.dir=$sparkDir" ) }, From 57312d9d07c100b413daf57928e312921a60e5a2 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Sun, 12 Oct 2025 16:34:55 -0700 Subject: [PATCH 32/53] Fix: TestParallelization should use Test/baseDirectory for workingDirectory Root cause: TestParallelization.defaultForkOptions was using baseDirectory.value for workingDirectory, but spark module's Test/baseDirectory points to spark/ while baseDirectory points to spark-combined/. When GitHub Actions runs 'spark/test' with TEST_PARALLELISM_COUNT=4 SHARD_ID=x, the forked test JVMs got spark-combined/ as working directory, causing tests that use relative paths (like 'src/test/resources/delta/table-with-dv-large') to fail. Solution: Changed defaultForkOptions to use (Test/baseDirectory).value instead of baseDirectory.value, so it correctly uses spark/ as the working directory. This only affects the spark module which is the only user of TestParallelization. --- project/TestParallelization.scala | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/project/TestParallelization.scala b/project/TestParallelization.scala index 769d4fbcec1..e656e62d6e3 100644 --- a/project/TestParallelization.scala +++ b/project/TestParallelization.scala @@ -54,6 +54,7 @@ object TestParallelization { Test / testGroupingStrategy := { val groupsCount = (Test / forkTestJVMCount).value val shard = (Test / shardId).value + // Use regular baseDirectory for target directory (not Test/baseDirectory) val baseJvmDir = baseDirectory.value MinShardGroupDurationStrategy(groupsCount, baseJvmDir, shard, defaultForkOptions.value) }, @@ -81,7 +82,9 @@ object TestParallelization { javaHome = javaHome.value, outputStrategy = outputStrategy.value, bootJars = Vector.empty, - workingDirectory = Some(baseDirectory.value), + // Use Test/baseDirectory instead of baseDirectory to support modules where these differ + // (e.g. 
spark-combined module where Test/baseDirectory points to spark/ source directory) + workingDirectory = Some((Test / baseDirectory).value), runJVMOptions = (Test / javaOptions).value.toVector, connectInput = connectInput.value, envVars = (Test / envVars).value From e639b18d93a43284f87b9b477feee70d09a294d5 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Sun, 12 Oct 2025 19:09:14 -0700 Subject: [PATCH 33/53] fix --- .../src/test/java/io/delta/kernel/spark/Dsv2BasicTest.java | 4 ++-- .../test/java/io/delta/kernel/spark/SparkDsv2TestBase.java | 4 ++-- .../java/io/delta/kernel/spark/read/SparkGoldenTableTest.java | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel-spark/src/test/java/io/delta/kernel/spark/Dsv2BasicTest.java b/kernel-spark/src/test/java/io/delta/kernel/spark/Dsv2BasicTest.java index dcf47a88f4e..5933ba425a5 100644 --- a/kernel-spark/src/test/java/io/delta/kernel/spark/Dsv2BasicTest.java +++ b/kernel-spark/src/test/java/io/delta/kernel/spark/Dsv2BasicTest.java @@ -43,10 +43,10 @@ public void setUp(@TempDir File tempDir) { new SparkConf() .set("spark.sql.catalog.dsv2", "io.delta.kernel.spark.catalog.TestCatalog") .set("spark.sql.catalog.dsv2.base_path", tempDir.getAbsolutePath()) - .set("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") + .set("spark.sql.extensions", "io.delta.sql.LegacyDeltaSparkSessionExtension") .set( "spark.sql.catalog.spark_catalog", - "org.apache.spark.sql.delta.catalog.DeltaCatalog") + "org.apache.spark.sql.delta.catalog.LegacyDeltaCatalog") .setMaster("local[*]") .setAppName("Dsv2BasicTest"); spark = SparkSession.builder().config(conf).getOrCreate(); diff --git a/kernel-spark/src/test/java/io/delta/kernel/spark/SparkDsv2TestBase.java b/kernel-spark/src/test/java/io/delta/kernel/spark/SparkDsv2TestBase.java index e1de37a1147..2a388d79f0d 100644 --- a/kernel-spark/src/test/java/io/delta/kernel/spark/SparkDsv2TestBase.java +++ b/kernel-spark/src/test/java/io/delta/kernel/spark/SparkDsv2TestBase.java @@ -32,10 +32,10 @@ public static void setUpSparkAndEngine() { SparkSession.builder() .master("local[*]") .appName("SparkKernelDsv2Tests") - .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") + .config("spark.sql.extensions", "io.delta.sql.LegacyDeltaSparkSessionExtension") .config( "spark.sql.catalog.spark_catalog", - "org.apache.spark.sql.delta.catalog.DeltaCatalog") + "org.apache.spark.sql.delta.catalog.LegacyDeltaCatalog") .getOrCreate(); defaultEngine = DefaultEngine.create(spark.sessionState().newHadoopConf()); } diff --git a/kernel-spark/src/test/java/io/delta/kernel/spark/read/SparkGoldenTableTest.java b/kernel-spark/src/test/java/io/delta/kernel/spark/read/SparkGoldenTableTest.java index 68561aac93c..5aa9236307a 100644 --- a/kernel-spark/src/test/java/io/delta/kernel/spark/read/SparkGoldenTableTest.java +++ b/kernel-spark/src/test/java/io/delta/kernel/spark/read/SparkGoldenTableTest.java @@ -60,10 +60,10 @@ public void setUp(@TempDir File tempDir) { new SparkConf() .set("spark.sql.catalog.dsv2", "io.delta.kernel.spark.catalog.TestCatalog") .set("spark.sql.catalog.dsv2.base_path", tempDir.getAbsolutePath()) - .set("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") + .set("spark.sql.extensions", "io.delta.sql.LegacyDeltaSparkSessionExtension") .set( "spark.sql.catalog.spark_catalog", - "org.apache.spark.sql.delta.catalog.DeltaCatalog") + "org.apache.spark.sql.delta.catalog.LegacyDeltaCatalog") .setMaster("local[*]") .setAppName("SparkGoldenTableTest"); spark = 
SparkSession.builder().config(conf).getOrCreate(); From 92b63269017d3d05828effec6c8dcb4b407d4240 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Sun, 12 Oct 2025 20:42:21 -0700 Subject: [PATCH 34/53] fix --- .../org/apache/spark/sql/delta/DeltaErrorsSuite.scala | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/DeltaErrorsSuite.scala b/spark/src/test/scala/org/apache/spark/sql/delta/DeltaErrorsSuite.scala index 5d40c4c7daf..637073eeab3 100644 --- a/spark/src/test/scala/org/apache/spark/sql/delta/DeltaErrorsSuite.scala +++ b/spark/src/test/scala/org/apache/spark/sql/delta/DeltaErrorsSuite.scala @@ -16,12 +16,14 @@ package org.apache.spark.sql.delta +import io.delta.sql.AbstractSparkSessionExtension +import org.apache.spark.sql.delta.catalog.AbstractDeltaCatalog + import java.io.{FileNotFoundException, PrintWriter, StringWriter} import java.net.URI import java.sql.Timestamp import java.text.SimpleDateFormat import java.util.Locale - import scala.sys.process.Process // scalastyle:off import.ordering.noEmptyLine @@ -29,7 +31,6 @@ import scala.sys.process.Process import org.apache.spark.sql.delta.DeltaErrors.generateDocsLink import org.apache.spark.sql.delta.actions.{Action, Metadata, Protocol} import org.apache.spark.sql.delta.actions.TableFeatureProtocolUtils.{TABLE_FEATURES_MIN_READER_VERSION, TABLE_FEATURES_MIN_WRITER_VERSION} -import org.apache.spark.sql.delta.catalog.DeltaCatalog import org.apache.spark.sql.delta.constraints.CharVarcharConstraint import org.apache.spark.sql.delta.constraints.Constraints import org.apache.spark.sql.delta.constraints.Constraints.NotNull @@ -38,7 +39,6 @@ import org.apache.spark.sql.delta.schema.{DeltaInvariantViolationException, Inva import org.apache.spark.sql.delta.sources.DeltaSQLConf import org.apache.spark.sql.delta.test.DeltaSQLCommandTest import org.apache.spark.sql.delta.test.DeltaSQLTestUtils -import io.delta.sql.DeltaSparkSessionExtension import org.apache.hadoop.fs.Path import org.json4s.JString import org.scalatest.GivenWhenThen @@ -1969,9 +1969,9 @@ trait DeltaErrorsSuiteBase } checkError(e, "DELTA_CONFIGURE_SPARK_SESSION_WITH_EXTENSION_AND_CATALOG", "56038", Map( - "sparkSessionExtensionName" -> classOf[DeltaSparkSessionExtension].getName, + "sparkSessionExtensionName" -> classOf[AbstractSparkSessionExtension].getName, "catalogKey" -> SQLConf.V2_SESSION_CATALOG_IMPLEMENTATION.key, - "catalogClassName" -> classOf[DeltaCatalog].getName + "catalogClassName" -> classOf[AbstractDeltaCatalog].getName )) } { From 1238ef84c4036482f5149704c3a4c5344e38d7d5 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Sun, 12 Oct 2025 23:01:44 -0700 Subject: [PATCH 35/53] Fix: avoid duplicate symlinks in connectClient test setup Issue: serverClassPath contains multiple 'classes' directories with the same name (e.g., spark/target/scala-2.12/classes, storage/target/scala-2.12/classes, etc.). When creating symlinks, the code tried to create multiple symlinks all named 'classes', causing FileAlreadyExistsException. Solution: Track created symlink names in a Set and skip duplicates. Only the first occurrence of each filename will have a symlink created. Also added Files.exists() check and similar fix for log4j properties symlink. 
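The dedup amounts to keeping the first classpath entry per file name, for
example (sketch only; the change below uses a mutable Set while iterating):

    // Keep one entry per file name, then link it into jarsDir.
    serverClassPath.distinct
      .groupBy(_.data.getName)
      .foreach { case (fileName, entries) =>
        Files.createSymbolicLink((jarsDir / fileName).toPath, entries.head.data.toPath)
      }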
--- build.sbt | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/build.sbt b/build.sbt index 531d0ce6247..0ae275f1c82 100644 --- a/build.sbt +++ b/build.sbt @@ -344,17 +344,29 @@ lazy val connectClient = (project in file("spark-connect/client")) val jarsDir = distributionDir / "jars" IO.createDirectory(jarsDir) // Create symlinks for all dependencies. + // Use a set to track already created symlink names to avoid duplicates + val createdLinks = scala.collection.mutable.Set[String]() serverClassPath.distinct.foreach { entry => val jarFile = entry.data.toPath - val linkedJarFile = jarsDir / entry.data.getName - Files.createSymbolicLink(linkedJarFile.toPath, jarFile) + val fileName = entry.data.getName + // Only create symlink if we haven't created one with this name yet + if (!createdLinks.contains(fileName)) { + val linkedJarFile = jarsDir / fileName + if (!Files.exists(linkedJarFile.toPath)) { + Files.createSymbolicLink(linkedJarFile.toPath, jarFile) + } + createdLinks += fileName + } } // Create a symlink for the log4j properties val confDir = distributionDir / "conf" IO.createDirectory(confDir) val log4jProps = (spark / Test / resourceDirectory).value / "log4j2_spark_master.properties" val linkedLog4jProps = confDir / "log4j2.properties" - Files.createSymbolicLink(linkedLog4jProps.toPath, log4jProps.toPath) + // Only create symlink if it doesn't already exist + if (!linkedLog4jProps.exists()) { + Files.createSymbolicLink(linkedLog4jProps.toPath, log4jProps.toPath) + } } // Return the location of the distribution directory. "-Ddelta.spark.home=" + distributionDir From 021582ee9904ffcd11f67fd395480e95ee682308 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Sun, 12 Oct 2025 23:10:03 -0700 Subject: [PATCH 36/53] Simplify connectClient symlink fix - remove try-catch The issue is simply that serverClassPath contains multiple directories with the same name (e.g., 7 different 'classes' directories). Using a Set to track created symlink names is sufficient - no need for try-catch or concurrent access handling since each shard runs in its own workspace. --- build.sbt | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/build.sbt b/build.sbt index 0ae275f1c82..99e4ab59ef7 100644 --- a/build.sbt +++ b/build.sbt @@ -345,16 +345,15 @@ lazy val connectClient = (project in file("spark-connect/client")) IO.createDirectory(jarsDir) // Create symlinks for all dependencies. 
// Use a set to track already created symlink names to avoid duplicates + // (e.g., multiple 'classes' directories from different modules) val createdLinks = scala.collection.mutable.Set[String]() serverClassPath.distinct.foreach { entry => val jarFile = entry.data.toPath val fileName = entry.data.getName - // Only create symlink if we haven't created one with this name yet + // Only create symlink for the first occurrence of each filename if (!createdLinks.contains(fileName)) { val linkedJarFile = jarsDir / fileName - if (!Files.exists(linkedJarFile.toPath)) { - Files.createSymbolicLink(linkedJarFile.toPath, jarFile) - } + Files.createSymbolicLink(linkedJarFile.toPath, jarFile) createdLinks += fileName } } @@ -363,10 +362,7 @@ lazy val connectClient = (project in file("spark-connect/client")) IO.createDirectory(confDir) val log4jProps = (spark / Test / resourceDirectory).value / "log4j2_spark_master.properties" val linkedLog4jProps = confDir / "log4j2.properties" - // Only create symlink if it doesn't already exist - if (!linkedLog4jProps.exists()) { - Files.createSymbolicLink(linkedLog4jProps.toPath, log4jProps.toPath) - } + Files.createSymbolicLink(linkedLog4jProps.toPath, log4jProps.toPath) } // Return the location of the distribution directory. "-Ddelta.spark.home=" + distributionDir From 89206cfbc148eaaf7b16f6427527a30853e67907 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Mon, 13 Oct 2025 10:05:31 -0700 Subject: [PATCH 37/53] Use local delta-spark-v1 in kernelDefaults tests Changed kernelDefaults to depend on local delta-spark-v1 instead of published delta-spark 3.3.2. This makes the dependency consistent with goldenTables (which already uses delta-spark-v1) and allows testing against the current codebase. Changes: - Added .dependsOn(`delta-spark-v1` % "test") to kernelDefaults - Removed external 'io.delta' %% 'delta-spark' % '3.3.2' % 'test' dependency --- build.sbt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/build.sbt b/build.sbt index 99e4ab59ef7..fd325aa8a95 100644 --- a/build.sbt +++ b/build.sbt @@ -938,6 +938,7 @@ lazy val kernelDefaults = (project in file("kernel/kernel-defaults")) .dependsOn(storage) .dependsOn(storage % "test->test") // Required for InMemoryCommitCoordinator for tests .dependsOn(goldenTables % "test") + .dependsOn(`delta-spark-v1` % "test") // Use local delta-spark-v1 instead of published version .settings( name := "delta-kernel-defaults", commonSettings, @@ -959,7 +960,7 @@ lazy val kernelDefaults = (project in file("kernel/kernel-defaults")) "commons-io" % "commons-io" % "2.8.0" % "test", "com.novocode" % "junit-interface" % "0.11" % "test", "org.slf4j" % "slf4j-log4j12" % "1.7.36" % "test", - "io.delta" %% "delta-spark" % "3.3.2" % "test", + // Removed external delta-spark dependency - now using local delta-spark-v1 // JMH dependencies allow writing micro-benchmarks for testing performance of components. // JMH has framework to define benchmarks and takes care of many common functionalities // such as warm runs, cold runs, defining benchmark parameter variables etc. 
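Note: with kernelDefaults compiling its tests against the local delta-spark-v1,
the Spark sessions built in those tests must register the Legacy* entry points,
since the published DeltaCatalog/DeltaSparkSessionExtension names now live in
the combined module. A minimal sketch of the expected session config, assuming
a local master; the real change lands in TestUtils.scala in the next patch:

    import org.apache.spark.sql.SparkSession

    val spark = SparkSession.builder()
      .master("local[*]")
      .config("spark.sql.extensions", "io.delta.sql.LegacyDeltaSparkSessionExtension")
      .config("spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.LegacyDeltaCatalog")
      .getOrCreate()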
From ad155d11f1278c24a5831de5277acd2ef10a42f1 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Mon, 13 Oct 2025 11:18:41 -0700 Subject: [PATCH 38/53] try minimize change --- .../defaults/DeltaTableWritesSuite.scala | 19 +++----------- .../kernel/defaults/utils/TestUtils.scala | 26 ++++--------------- 2 files changed, 8 insertions(+), 37 deletions(-) diff --git a/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/DeltaTableWritesSuite.scala b/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/DeltaTableWritesSuite.scala index d21460f4e73..5fb3e94de16 100644 --- a/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/DeltaTableWritesSuite.scala +++ b/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/DeltaTableWritesSuite.scala @@ -424,22 +424,9 @@ abstract class AbstractDeltaTableWritesSuite extends AnyFunSuite with AbstractWr tablePath, testSchema) - // Use your new commitUnsafe API to write an unsupported writer feature - import org.apache.spark.sql.delta.DeltaLog - import org.apache.spark.sql.delta.actions.Protocol - - val deltaLog = DeltaLog.forTable(spark, tablePath) - val txn = deltaLog.startTransaction() - - // Create Protocol action with unsupported writer feature - val protocolAction = Protocol( - minReaderVersion = 3, - minWriterVersion = 7, - readerFeatures = Some(Set.empty), - writerFeatures = Some(Set("testUnsupportedWriter"))) - - // Use your elegant API to commit directly to version 1 - txn.commitUnsafe(tablePath, 1L, protocolAction) + // Add unsupported writer feature to test Kernel's validation + DeltaTable.forPath(spark, tablePath) + .addFeatureSupport("testUnsupportedWriter") val e = intercept[KernelException] { getUpdateTxn(engine, tablePath) } diff --git a/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/utils/TestUtils.scala b/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/utils/TestUtils.scala index aba5c8553c1..359c219542f 100644 --- a/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/utils/TestUtils.scala +++ b/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/utils/TestUtils.scala @@ -97,8 +97,11 @@ trait AbstractTestUtils .builder() .appName("Spark Test Writer for Delta Kernel") .config("spark.master", "local") - .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") - .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") + // Use Legacy* classes because kernelDefaults depends on delta-spark-v1 to avoid circular deps + .config("spark.sql.extensions", "io.delta.sql.LegacyDeltaSparkSessionExtension") + .config( + "spark.sql.catalog.spark_catalog", + "org.apache.spark.sql.delta.catalog.LegacyDeltaCatalog") // Set this conf to empty string so that the golden tables generated // using with the test-prefix (i.e. there is no DELTA_TESTING set) can still work .config(DeltaSQLConf.TEST_DV_NAME_PREFIX.key, "") @@ -182,25 +185,6 @@ trait AbstractTestUtils def commitManuallyWithValidation(actions: org.apache.spark.sql.delta.actions.Action*): Unit = { txn.commit(actions.toSeq, org.apache.spark.sql.delta.DeltaOperations.ManualUpdate) } - - /** - * Test only method to unsafe commit - writes actions directly to transaction log. - * Note: This bypasses Delta Spark transaction logic. 
- * - * @param tablePath The path to the Delta table - * @param version The commit version number - * @param actions Sequence of Action objects to write - */ - def commitUnsafe( - tablePath: String, - version: Long, - actions: org.apache.spark.sql.delta.actions.Action*): Unit = { - val logPath = new org.apache.hadoop.fs.Path(tablePath, "_delta_log") - val commitFile = org.apache.spark.sql.delta.util.FileNames.unsafeDeltaFile(logPath, version) - val commitContent = actions.map(_.json + "\n").mkString.getBytes(UTF_8) - Files.write(Paths.get(commitFile.toString), commitContent) - Table.forPath(defaultEngine, tablePath).checksum(defaultEngine, version) - } } implicit object ResourceLoader { From 7864c773df51a7f1986f7c34ad12fdc400205e91 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Mon, 13 Oct 2025 14:32:40 -0700 Subject: [PATCH 39/53] fix test --- build.sbt | 36 +++---------------- .../spark/sql/delta/catalog/DeltaCatalog.java | 3 +- .../sql/DeltaSparkSessionExtension.scala | 5 ++- 3 files changed, 9 insertions(+), 35 deletions(-) rename {spark-shaded => spark-combined}/src/main/java/org/apache/spark/sql/delta/catalog/DeltaCatalog.java (93%) rename {spark-shaded => spark-combined}/src/main/scala/io/delta/sql/DeltaSparkSessionExtension.scala (94%) diff --git a/build.sbt b/build.sbt index fd325aa8a95..44174db072a 100644 --- a/build.sbt +++ b/build.sbt @@ -574,34 +574,15 @@ lazy val `delta-spark-v2` = (project in file("kernel-spark")) Test / testOptions += Tests.Argument(TestFrameworks.JUnit, "-v", "-a") ) -// ============================================================ -// Module 4: delta-spark-shaded (optional delegation layer) -// ============================================================ -lazy val `delta-spark-shaded` = (project in file("spark-shaded")) - .dependsOn(`delta-spark-v1`) // Full v1 for delegation if needed - .dependsOn(`delta-spark-v2`) - .settings( - name := "delta-spark-shaded", - commonSettings, - skipReleaseSettings, // Not published - - libraryDependencies ++= Seq( - "org.apache.spark" %% "spark-sql" % sparkVersion.value % "provided", - "org.apache.spark" %% "spark-core" % sparkVersion.value % "provided", - "org.apache.spark" %% "spark-catalyst" % sparkVersion.value % "provided", - ), - - // This module contains delegation code like: - // - DeltaCatalog (delegates to V1 or V2) - // - DeltaSparkSessionExtension (registers both) - ) // ============================================================ // Module 5: delta-spark (final published module - combined v1+v2+shaded) // ============================================================ lazy val spark = (project in file("spark-combined")) - .dependsOn(`delta-spark-shaded`) // Direct dependency on shaded (for delegation classes) + .dependsOn(`delta-spark-v1`) + .dependsOn(`delta-spark-v2`)// Direct dependency on shaded (for delegation classes) .dependsOn(storage) // Explicit dependency on storage + .disablePlugins(JavaFormatterPlugin, ScalafmtPlugin) .settings ( name := "delta-spark", commonSettings, @@ -620,7 +601,7 @@ lazy val spark = (project in file("spark-combined")) case e: Elem if e.label == "dependency" => val artifactId = (e \ "artifactId").text // Remove delta-spark-v1, delta-spark-v2, delta-spark-v1-shaded, delta-spark-shaded from pom - if (artifactId.startsWith("delta-spark-v") || artifactId == "delta-spark-shaded") { + if (artifactId.startsWith("delta-spark-v")) { Seq.empty } else { Seq(n) @@ -649,7 +630,6 @@ lazy val spark = (project in file("spark-combined")) val classesDir = (Compile / classDirectory).value val 
v1Classes = (`delta-spark-v1` / Compile / classDirectory).value val v2Classes = (`delta-spark-v2` / Compile / classDirectory).value - val shadedClasses = (`delta-spark-shaded` / Compile / classDirectory).value val storageClasses = (storage / Compile / classDirectory).value // Ensure classes directory exists @@ -659,7 +639,6 @@ lazy val spark = (project in file("spark-combined")) IO.copyDirectory(v1Classes, classesDir, overwrite = false, preserveLastModified = true) IO.copyDirectory(storageClasses, classesDir, overwrite = false, preserveLastModified = true) IO.copyDirectory(v2Classes, classesDir, overwrite = true, preserveLastModified = true) - IO.copyDirectory(shadedClasses, classesDir, overwrite = true, preserveLastModified = true) sbt.internal.inc.Analysis.Empty }, @@ -670,11 +649,10 @@ lazy val spark = (project in file("spark-combined")) Compile / packageBin / mappings := { val v1Full = (`delta-spark-v1` / Compile / packageBin / mappings).value // Full v1 with DeltaLog val v2 = (`delta-spark-v2` / Compile / packageBin / mappings).value - val shaded = (`delta-spark-shaded` / Compile / packageBin / mappings).value val storageClasses = (storage / Compile / packageBin / mappings).value // Add storage classes // Merge all mappings, shaded classes override v1 classes if there are conflicts - val allMappings = v1Full ++ v2 ++ storageClasses ++ shaded + val allMappings = v1Full ++ v2 ++ storageClasses ++ // Remove duplicates by path (keep the last occurrence, which is from shaded) allMappings.groupBy(_._2).map(_._2.last).toSeq @@ -741,13 +719,9 @@ lazy val spark = (project in file("spark-combined")) // Fork tests to ensure javaOptions are applied Test / fork := true, - // Set working directory for forked tests to spark/ directory - // Note: withWorkingDirectory sets the process working directory, but Java's user.dir - // system property might not update automatically, so we also set it in javaOptions Test / forkOptions := { val sparkDir = (Test / baseDirectory).value val opts = (Test / forkOptions).value - println(s"[Delta Build] Setting Test/forkOptions workingDirectory to: $sparkDir") opts.withWorkingDirectory(sparkDir) }, diff --git a/spark-shaded/src/main/java/org/apache/spark/sql/delta/catalog/DeltaCatalog.java b/spark-combined/src/main/java/org/apache/spark/sql/delta/catalog/DeltaCatalog.java similarity index 93% rename from spark-shaded/src/main/java/org/apache/spark/sql/delta/catalog/DeltaCatalog.java rename to spark-combined/src/main/java/org/apache/spark/sql/delta/catalog/DeltaCatalog.java index 5a9f9b58f6d..d7b6255c6ef 100644 --- a/spark-shaded/src/main/java/org/apache/spark/sql/delta/catalog/DeltaCatalog.java +++ b/spark-combined/src/main/java/org/apache/spark/sql/delta/catalog/DeltaCatalog.java @@ -18,7 +18,7 @@ /** * Delta Catalog implementation that can delegate to both V1 and V2 implementations. 
- * This class sits in delta-spark-shaded module and can access: + * This class sits in delta-spark (combined) module and can access: * - V1: org.apache.spark.sql.delta.* (full version with DeltaLog) * - V2: io.delta.kernel.spark.* */ @@ -26,3 +26,4 @@ public class DeltaCatalog extends AbstractDeltaCatalog { } + diff --git a/spark-shaded/src/main/scala/io/delta/sql/DeltaSparkSessionExtension.scala b/spark-combined/src/main/scala/io/delta/sql/DeltaSparkSessionExtension.scala similarity index 94% rename from spark-shaded/src/main/scala/io/delta/sql/DeltaSparkSessionExtension.scala rename to spark-combined/src/main/scala/io/delta/sql/DeltaSparkSessionExtension.scala index c547b8d5680..6f6695b3dfa 100644 --- a/spark-shaded/src/main/scala/io/delta/sql/DeltaSparkSessionExtension.scala +++ b/spark-combined/src/main/scala/io/delta/sql/DeltaSparkSessionExtension.scala @@ -21,12 +21,12 @@ import org.apache.spark.sql.catalyst.rules.Rule /** * Delta Spark Session Extension that can register both V1 and V2 implementations. - * This class sits in delta-spark-shaded module and can access: + * This class sits in delta-spark (combined) module and can access: * - V1: org.apache.spark.sql.delta.* (full version with DeltaLog) * - V2: io.delta.kernel.spark.* */ class DeltaSparkSessionExtension extends AbstractSparkSessionExtension { - + /** * NoOpRule for binary compatibility with Delta 3.3.0 * This class must remain here to satisfy MiMa checks @@ -35,4 +35,3 @@ class DeltaSparkSessionExtension extends AbstractSparkSessionExtension { override def apply(plan: LogicalPlan): LogicalPlan = plan } } - From 8716b18d9e71e6a5c558df79972f535224b90e03 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Mon, 13 Oct 2025 14:33:22 -0700 Subject: [PATCH 40/53] fix test --- build.sbt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/build.sbt b/build.sbt index 44174db072a..09ceeef25c6 100644 --- a/build.sbt +++ b/build.sbt @@ -617,7 +617,7 @@ lazy val spark = (project in file("spark-combined")) // Override projectDependencies to exclude internal modules projectDependencies := { projectDependencies.value.filterNot { dep => - dep.name.startsWith("delta-spark-v") || dep.name == "delta-spark-shaded" + dep.name.startsWith("delta-spark-v") } }, @@ -1864,7 +1864,7 @@ val createTargetClassesDir = taskKey[Unit]("create target classes dir") // Don't use these groups for any other projects lazy val sparkGroup = project - .aggregate(spark, `delta-spark-v1`, `delta-spark-v1-shaded`, `delta-spark-v2`, `delta-spark-shaded`, contribs, storage, storageS3DynamoDB, sharing, hudi) + .aggregate(spark, `delta-spark-v1`, `delta-spark-v1-shaded`, `delta-spark-v2`, contribs, storage, storageS3DynamoDB, sharing, hudi) .settings( // crossScalaVersions must be set to Nil on the aggregating project crossScalaVersions := Nil, From 86c0186de21582eecbc3087e002fe0a935d95318 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Mon, 13 Oct 2025 14:41:26 -0700 Subject: [PATCH 41/53] fix test --- build.sbt | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/build.sbt b/build.sbt index 09ceeef25c6..cb9547b8db8 100644 --- a/build.sbt +++ b/build.sbt @@ -58,7 +58,6 @@ val sparkVersion = settingKey[String]("Spark version") spark / sparkVersion := getSparkVersion() `delta-spark-v1` / sparkVersion := getSparkVersion() `delta-spark-v2` / sparkVersion := getSparkVersion() -`delta-spark-shaded` / sparkVersion := getSparkVersion() connectCommon / sparkVersion := getSparkVersion() connectClient / sparkVersion := getSparkVersion() 
connectServer / sparkVersion := getSparkVersion() @@ -650,10 +649,10 @@ lazy val spark = (project in file("spark-combined")) val v1Full = (`delta-spark-v1` / Compile / packageBin / mappings).value // Full v1 with DeltaLog val v2 = (`delta-spark-v2` / Compile / packageBin / mappings).value val storageClasses = (storage / Compile / packageBin / mappings).value // Add storage classes - + // Merge all mappings, shaded classes override v1 classes if there are conflicts - val allMappings = v1Full ++ v2 ++ storageClasses ++ - + val allMappings = v1Full ++ v2 ++ storageClasses + // Remove duplicates by path (keep the last occurrence, which is from shaded) allMappings.groupBy(_._2).map(_._2.last).toSeq }, From 3a19590a83b29cc50935a13632e88869a126b032 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Mon, 13 Oct 2025 14:42:42 -0700 Subject: [PATCH 42/53] fix test --- build.sbt | 31 +++++++++---------------------- 1 file changed, 9 insertions(+), 22 deletions(-) diff --git a/build.sbt b/build.sbt index cb9547b8db8..adbf3c2c7ae 100644 --- a/build.sbt +++ b/build.sbt @@ -620,28 +620,6 @@ lazy val spark = (project in file("spark-combined")) } }, - // No prod code in this module - Compile / sources := Seq.empty, - - // Copy all classes from dependencies to classes directory for MiMa - Compile / compile := { - val _ = (Compile / compile).value - val classesDir = (Compile / classDirectory).value - val v1Classes = (`delta-spark-v1` / Compile / classDirectory).value - val v2Classes = (`delta-spark-v2` / Compile / classDirectory).value - val storageClasses = (storage / Compile / classDirectory).value - - // Ensure classes directory exists - IO.createDirectory(classesDir) - - // Copy all classes (shaded classes override v1 classes) - IO.copyDirectory(v1Classes, classesDir, overwrite = false, preserveLastModified = true) - IO.copyDirectory(storageClasses, classesDir, overwrite = false, preserveLastModified = true) - IO.copyDirectory(v2Classes, classesDir, overwrite = true, preserveLastModified = true) - - sbt.internal.inc.Analysis.Empty - }, - // Package combined classes: FULL v1 (with DeltaLog) + v2 + shaded + storage // Note: v2 only depends on v1-shaded (without DeltaLog) at compile time, // but final jar includes full v1 for users @@ -655,6 +633,15 @@ lazy val spark = (project in file("spark-combined")) // Remove duplicates by path (keep the last occurrence, which is from shaded) allMappings.groupBy(_._2).map(_._2.last).toSeq + // Ensure classes directory exists + IO.createDirectory(classesDir) + + // Copy all classes (shaded classes override v1 classes) + IO.copyDirectory(v1Classes, classesDir, overwrite = false, preserveLastModified = true) + IO.copyDirectory(storageClasses, classesDir, overwrite = false, preserveLastModified = true) + IO.copyDirectory(v2Classes, classesDir, overwrite = true, preserveLastModified = true) + + sbt.internal.inc.Analysis.Empty }, // Test sources and resources from original spark/ directory (delta-spark-v1's directory) From a000aa8138e06495cbfbe37f60d3b8f5bedcc13c Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Mon, 13 Oct 2025 14:45:12 -0700 Subject: [PATCH 43/53] fix test --- build.sbt | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/build.sbt b/build.sbt index adbf3c2c7ae..c6823aab2df 100644 --- a/build.sbt +++ b/build.sbt @@ -589,6 +589,25 @@ lazy val spark = (project in file("spark-combined")) sparkMimaSettings, releaseSettings, // Published as delta-spark.jar crossSparkSettings(), + + // Copy all classes from dependencies to 
classes directory for MiMa + Compile / compile := { + val _ = (Compile / compile).value + val classesDir = (Compile / classDirectory).value + val v1Classes = (`delta-spark-v1` / Compile / classDirectory).value + val v2Classes = (`delta-spark-v2` / Compile / classDirectory).value + val storageClasses = (storage / Compile / classDirectory).value + + // Ensure classes directory exists + IO.createDirectory(classesDir) + + // Copy all classes (shaded classes override v1 classes) + IO.copyDirectory(v1Classes, classesDir, overwrite = false, preserveLastModified = true) + IO.copyDirectory(storageClasses, classesDir, overwrite = false, preserveLastModified = true) + IO.copyDirectory(v2Classes, classesDir, overwrite = true, preserveLastModified = true) + + sbt.internal.inc.Analysis.Empty + }, // Remove internal module dependencies from published pom.xml and ivy.xml // Users should only depend on delta-spark jar, not internal modules @@ -630,18 +649,6 @@ lazy val spark = (project in file("spark-combined")) // Merge all mappings, shaded classes override v1 classes if there are conflicts val allMappings = v1Full ++ v2 ++ storageClasses - - // Remove duplicates by path (keep the last occurrence, which is from shaded) - allMappings.groupBy(_._2).map(_._2.last).toSeq - // Ensure classes directory exists - IO.createDirectory(classesDir) - - // Copy all classes (shaded classes override v1 classes) - IO.copyDirectory(v1Classes, classesDir, overwrite = false, preserveLastModified = true) - IO.copyDirectory(storageClasses, classesDir, overwrite = false, preserveLastModified = true) - IO.copyDirectory(v2Classes, classesDir, overwrite = true, preserveLastModified = true) - - sbt.internal.inc.Analysis.Empty }, // Test sources and resources from original spark/ directory (delta-spark-v1's directory) From 163b123b595e26beb5c7ae8d9399dbf1c0d56349 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Mon, 13 Oct 2025 14:49:59 -0700 Subject: [PATCH 44/53] fix test --- build.sbt | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/build.sbt b/build.sbt index c6823aab2df..d7c147b236d 100644 --- a/build.sbt +++ b/build.sbt @@ -639,18 +639,6 @@ lazy val spark = (project in file("spark-combined")) } }, - // Package combined classes: FULL v1 (with DeltaLog) + v2 + shaded + storage - // Note: v2 only depends on v1-shaded (without DeltaLog) at compile time, - // but final jar includes full v1 for users - Compile / packageBin / mappings := { - val v1Full = (`delta-spark-v1` / Compile / packageBin / mappings).value // Full v1 with DeltaLog - val v2 = (`delta-spark-v2` / Compile / packageBin / mappings).value - val storageClasses = (storage / Compile / packageBin / mappings).value // Add storage classes - - // Merge all mappings, shaded classes override v1 classes if there are conflicts - val allMappings = v1Full ++ v2 ++ storageClasses - }, - // Test sources and resources from original spark/ directory (delta-spark-v1's directory) Test / unmanagedSourceDirectories := { val sparkDir = (`delta-spark-v1` / baseDirectory).value From a63fd0d8694e8115aab91d16331b82effbb585ab Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Mon, 13 Oct 2025 15:17:11 -0700 Subject: [PATCH 45/53] fix test --- build.sbt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/build.sbt b/build.sbt index d7c147b236d..47b3b38fc98 100644 --- a/build.sbt +++ b/build.sbt @@ -639,6 +639,10 @@ lazy val spark = (project in file("spark-combined")) } }, + // Include Python files in the JAR (using default packageBin for classes, then adding Python files) + Compile / 
packageBin / mappings := (Compile / packageBin / mappings).value ++ + listPythonFiles(baseDirectory.value.getParentFile / "python"), + // Test sources and resources from original spark/ directory (delta-spark-v1's directory) Test / unmanagedSourceDirectories := { val sparkDir = (`delta-spark-v1` / baseDirectory).value From fd53ce67a279c18119dde1eb9ce6e94630fdb5ed Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Tue, 14 Oct 2025 13:50:00 -0700 Subject: [PATCH 46/53] try running v1 test --- .../spark/sql/delta/catalog/DeltaCatalog.java | 54 ++++++++++++++++--- 1 file changed, 48 insertions(+), 6 deletions(-) diff --git a/spark-combined/src/main/java/org/apache/spark/sql/delta/catalog/DeltaCatalog.java b/spark-combined/src/main/java/org/apache/spark/sql/delta/catalog/DeltaCatalog.java index d7b6255c6ef..180a3793b3e 100644 --- a/spark-combined/src/main/java/org/apache/spark/sql/delta/catalog/DeltaCatalog.java +++ b/spark-combined/src/main/java/org/apache/spark/sql/delta/catalog/DeltaCatalog.java @@ -16,14 +16,56 @@ package org.apache.spark.sql.delta.catalog; +import org.apache.spark.sql.AnalysisException; +import org.apache.spark.sql.catalyst.analysis.NoSuchDatabaseException; +import org.apache.spark.sql.catalyst.analysis.NoSuchNamespaceException; +import org.apache.spark.sql.catalyst.analysis.NoSuchTableException; +import org.apache.spark.sql.connector.catalog.Identifier; +import org.apache.spark.sql.connector.catalog.Table; +import org.apache.spark.sql.connector.catalog.TableCatalog; +import org.apache.spark.sql.connector.catalog.V1Table; +import org.apache.spark.sql.delta.DeltaTableIdentifier; +import org.apache.spark.sql.delta.DeltaTableUtils; + /** - * Delta Catalog implementation that can delegate to both V1 and V2 implementations. - * This class sits in delta-spark (combined) module and can access: - * - V1: org.apache.spark.sql.delta.* (full version with DeltaLog) - * - V2: io.delta.kernel.spark.* + * Delta Catalog implementation that can delegate to both V1 and V2 implementations. 
This class sits + * in delta-spark (combined) module and can access: - V1: org.apache.spark.sql.delta.* (full version + * with DeltaLog) - V2: io.delta.kernel.spark.* */ public class DeltaCatalog extends AbstractDeltaCatalog { - -} + @Override + public Table loadTable(Identifier identifier) { + try { + // Load table from delegate catalog directly + Table delegateTable = ((TableCatalog) delegate).loadTable(identifier); + // If delegate table is a V1Table and it's a Delta table, return SparkTable + if (delegateTable instanceof V1Table) { + V1Table v1Table = (V1Table) delegateTable; + if (DeltaTableUtils.isDeltaTable(v1Table.catalogTable())) { + return new io.delta.kernel.spark.table.SparkTable( + identifier, v1Table.catalogTable().location().toString()); + } + } + // Otherwise return the delegate table as-is + return delegateTable; + } catch (AnalysisException e) { + // Handle NoSuchTableException and its related exceptions + if (e instanceof NoSuchTableException + || e instanceof NoSuchNamespaceException + || e instanceof NoSuchDatabaseException) { + if (isPathIdentifier(identifier)) { + return newDeltaPathTable(identifier); + } else if (isIcebergPathIdentifier(identifier)) { + return newIcebergPathTable(identifier); + } + } else if (DeltaTableIdentifier.gluePermissionError(e) && isPathIdentifier(identifier)) { + // Handle Glue permission errors for path identifiers + return newDeltaPathTable(identifier); + } + // Rethrow as RuntimeException since AnalysisException is checked + throw new RuntimeException(e); + } + } +} From abbc80a5db73ac5d883f066415869e4d74a5192f Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Tue, 14 Oct 2025 14:26:06 -0700 Subject: [PATCH 47/53] safe --- build.sbt | 9 +++++++++ .../org/apache/spark/sql/delta/catalog/DeltaCatalog.java | 2 +- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/build.sbt b/build.sbt index 47b3b38fc98..84679bd1743 100644 --- a/build.sbt +++ b/build.sbt @@ -558,6 +558,10 @@ lazy val `delta-spark-v2` = (project in file("kernel-spark")) "org.apache.spark" %% "spark-core" % sparkVersion.value % "provided", "org.apache.spark" %% "spark-catalyst" % sparkVersion.value % "provided", + // Jackson dependencies needed by kernel-defaults (for Jdk8Module support) + "com.fasterxml.jackson.core" % "jackson-databind" % "2.13.5", + "com.fasterxml.jackson.datatype" % "jackson-datatype-jdk8" % "2.13.5", + // Test dependencies "org.junit.jupiter" % "junit-jupiter-api" % "5.8.2" % "test", "org.junit.jupiter" % "junit-jupiter-engine" % "5.8.2" % "test", @@ -680,6 +684,11 @@ lazy val spark = (project in file("spark-combined")) "org.apache.spark" %% "spark-catalyst" % sparkVersion.value % "provided", "com.amazonaws" % "aws-java-sdk" % "1.12.262" % "provided", + // Jackson dependencies needed by kernel-defaults (for Jdk8Module support) + // These are needed because delta-spark-v2 uses kernel-defaults which requires these + "com.fasterxml.jackson.core" % "jackson-databind" % "2.13.5", + "com.fasterxml.jackson.datatype" % "jackson-datatype-jdk8" % "2.13.5", + // Test deps "org.scalatest" %% "scalatest" % scalaTestVersion % "test", "org.scalatestplus" %% "scalacheck-1-15" % "3.2.9.0" % "test", diff --git a/spark-combined/src/main/java/org/apache/spark/sql/delta/catalog/DeltaCatalog.java b/spark-combined/src/main/java/org/apache/spark/sql/delta/catalog/DeltaCatalog.java index 180a3793b3e..dc5745f9176 100644 --- a/spark-combined/src/main/java/org/apache/spark/sql/delta/catalog/DeltaCatalog.java +++ 
b/spark-combined/src/main/java/org/apache/spark/sql/delta/catalog/DeltaCatalog.java @@ -39,7 +39,7 @@ public Table loadTable(Identifier identifier) { try { // Load table from delegate catalog directly Table delegateTable = ((TableCatalog) delegate).loadTable(identifier); - + System.out.println("using new connector" + identifier); // If delegate table is a V1Table and it's a Delta table, return SparkTable if (delegateTable instanceof V1Table) { V1Table v1Table = (V1Table) delegateTable; From 6a4d27ea2563148ff4b8afa60e4bc8d43bfbb275 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Tue, 14 Oct 2025 15:31:22 -0700 Subject: [PATCH 48/53] fix --- .../java/org/apache/spark/sql/delta/catalog/DeltaCatalog.java | 1 - 1 file changed, 1 deletion(-) diff --git a/spark-combined/src/main/java/org/apache/spark/sql/delta/catalog/DeltaCatalog.java b/spark-combined/src/main/java/org/apache/spark/sql/delta/catalog/DeltaCatalog.java index dc5745f9176..ce7ba599b2d 100644 --- a/spark-combined/src/main/java/org/apache/spark/sql/delta/catalog/DeltaCatalog.java +++ b/spark-combined/src/main/java/org/apache/spark/sql/delta/catalog/DeltaCatalog.java @@ -39,7 +39,6 @@ public Table loadTable(Identifier identifier) { try { // Load table from delegate catalog directly Table delegateTable = ((TableCatalog) delegate).loadTable(identifier); - System.out.println("using new connector" + identifier); // If delegate table is a V1Table and it's a Delta table, return SparkTable if (delegateTable instanceof V1Table) { V1Table v1Table = (V1Table) delegateTable; From 87b79acf9321d9009e73079841ee547335e1b702 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Tue, 14 Oct 2025 16:03:13 -0700 Subject: [PATCH 49/53] fix --- .../kernel/spark/catalog/SparkTable.java | 45 +++++++- .../spark/sql/delta/catalog/DeltaCatalog.java | 5 +- .../sql/DeltaSparkSessionExtension.scala | 7 ++ .../delta/sql/MaybeFallbackV1Connector.scala | 107 ++++++++++++++++++ .../sql/DeltaSparkSessionExtension.scala | 3 + 5 files changed, 163 insertions(+), 4 deletions(-) create mode 100644 spark-combined/src/main/scala/io/delta/sql/MaybeFallbackV1Connector.scala diff --git a/kernel-spark/src/main/java/io/delta/kernel/spark/catalog/SparkTable.java b/kernel-spark/src/main/java/io/delta/kernel/spark/catalog/SparkTable.java index 0dc13b484f5..f6e4e333a81 100644 --- a/kernel-spark/src/main/java/io/delta/kernel/spark/catalog/SparkTable.java +++ b/kernel-spark/src/main/java/io/delta/kernel/spark/catalog/SparkTable.java @@ -13,7 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package io.delta.kernel.spark.table; +package io.delta.kernel.spark.catalog; import static io.delta.kernel.spark.utils.ScalaUtils.toScalaMap; import static java.util.Objects.requireNonNull; @@ -22,6 +22,7 @@ import io.delta.kernel.spark.read.SparkScanBuilder; import io.delta.kernel.spark.utils.SchemaUtils; import java.util.*; +import java.util.Optional; import org.apache.hadoop.conf.Configuration; import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.connector.catalog.*; @@ -52,6 +53,7 @@ public class SparkTable implements Table, SupportsRead { private final StructType partitionSchema; private final Column[] columns; private final Transform[] partitionTransforms; + private final Optional v1CatalogTable; /** * Creates a SparkTable backed by a Delta Kernel snapshot and initializes Spark-facing metadata @@ -71,11 +73,24 @@ public class SparkTable implements Table, SupportsRead { * @throws NullPointerException if identifier or tablePath is null */ public SparkTable(Identifier identifier, String tablePath, Map options) { + this(identifier, tablePath, options, Optional.empty()); + } + + /** Private constructor with v1CatalogTable parameter. */ + private SparkTable( + Identifier identifier, + String tablePath, + Map options, + Optional v1CatalogTable) { this.identifier = requireNonNull(identifier, "identifier is null"); this.tablePath = requireNonNull(tablePath, "snapshot is null"); this.options = options; + this.v1CatalogTable = v1CatalogTable; this.hadoopConf = - SparkSession.active().sessionState().newHadoopConfWithOptions(toScalaMap(options)); + SparkSession.active() + .sessionState() + .newHadoopConfWithOptions( + v1CatalogTable.map(t -> t.storage().properties()).orElse(toScalaMap((options)))); this.snapshot = (SnapshotImpl) io.delta.kernel.TableManager.loadSnapshot(tablePath) @@ -132,6 +147,32 @@ public SparkTable(Identifier identifier, String tablePath) { this(identifier, tablePath, Collections.emptyMap()); } + /** + * Constructor that accepts a Spark CatalogTable. Extracts the table location from the catalog + * table and initializes the SparkTable using that location with empty options. + * + * @param identifier logical table identifier used by Spark's catalog + * @param catalogTable the Spark CatalogTable containing table metadata including location + * @throws NullPointerException if identifier or catalogTable is null + */ + public SparkTable( + Identifier identifier, org.apache.spark.sql.catalyst.catalog.CatalogTable catalogTable) { + this( + identifier, + requireNonNull(catalogTable, "catalogTable is null").location().toString(), + Collections.emptyMap(), + Optional.of(catalogTable)); + } + + /** + * Returns the V1 CatalogTable if this SparkTable was created from a catalog table. 
+ * + * @return Optional containing the CatalogTable, or empty if this table was created from a path + */ + public Optional getV1CatalogTable() { + return v1CatalogTable; + } + @Override public String name() { return identifier.name(); diff --git a/spark-combined/src/main/java/org/apache/spark/sql/delta/catalog/DeltaCatalog.java b/spark-combined/src/main/java/org/apache/spark/sql/delta/catalog/DeltaCatalog.java index ce7ba599b2d..37fba4b717a 100644 --- a/spark-combined/src/main/java/org/apache/spark/sql/delta/catalog/DeltaCatalog.java +++ b/spark-combined/src/main/java/org/apache/spark/sql/delta/catalog/DeltaCatalog.java @@ -16,6 +16,7 @@ package org.apache.spark.sql.delta.catalog; +import io.delta.kernel.spark.catalog.SparkTable; import org.apache.spark.sql.AnalysisException; import org.apache.spark.sql.catalyst.analysis.NoSuchDatabaseException; import org.apache.spark.sql.catalyst.analysis.NoSuchNamespaceException; @@ -43,8 +44,8 @@ public Table loadTable(Identifier identifier) { if (delegateTable instanceof V1Table) { V1Table v1Table = (V1Table) delegateTable; if (DeltaTableUtils.isDeltaTable(v1Table.catalogTable())) { - return new io.delta.kernel.spark.table.SparkTable( - identifier, v1Table.catalogTable().location().toString()); + return new SparkTable( + identifier, v1Table.catalogTable()); } } // Otherwise return the delegate table as-is diff --git a/spark-combined/src/main/scala/io/delta/sql/DeltaSparkSessionExtension.scala b/spark-combined/src/main/scala/io/delta/sql/DeltaSparkSessionExtension.scala index 6f6695b3dfa..28877aa37fa 100644 --- a/spark-combined/src/main/scala/io/delta/sql/DeltaSparkSessionExtension.scala +++ b/spark-combined/src/main/scala/io/delta/sql/DeltaSparkSessionExtension.scala @@ -16,6 +16,7 @@ package io.delta.sql +import org.apache.spark.sql.SparkSessionExtensions import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.rules.Rule @@ -27,6 +28,12 @@ import org.apache.spark.sql.catalyst.rules.Rule */ class DeltaSparkSessionExtension extends AbstractSparkSessionExtension { + override def preDeltaAnalysisRule(extensions: SparkSessionExtensions): Unit = { + extensions.injectResolutionRule( + session => new MaybeFallbackV1Connector(session) + ) + } + /** * NoOpRule for binary compatibility with Delta 3.3.0 * This class must remain here to satisfy MiMa checks diff --git a/spark-combined/src/main/scala/io/delta/sql/MaybeFallbackV1Connector.scala b/spark-combined/src/main/scala/io/delta/sql/MaybeFallbackV1Connector.scala new file mode 100644 index 00000000000..e6ecf0a1448 --- /dev/null +++ b/spark-combined/src/main/scala/io/delta/sql/MaybeFallbackV1Connector.scala @@ -0,0 +1,107 @@ +/* + * Copyright (2021) The Delta Lake Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.delta.sql + +import io.delta.kernel.spark.catalog.SparkTable + +import org.apache.spark.sql.delta.DeltaTableUtils +import org.apache.spark.sql.delta.catalog.DeltaTableV2 + +import org.apache.hadoop.fs.Path +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.catalog.CatalogTable +import org.apache.spark.sql.catalyst.plans.logical.{InsertIntoStatement, LogicalPlan} +import org.apache.spark.sql.catalyst.rules.Rule +import org.apache.spark.sql.catalyst.streaming.StreamingRelationV2 +import org.apache.spark.sql.catalyst.trees.TreePattern.COMMAND +import org.apache.spark.sql.execution.datasources.{DataSource, DataSourceUtils} +import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation +import org.apache.spark.sql.execution.streaming.StreamingRelation +import org.apache.spark.sql.util.CaseInsensitiveStringMap + +class MaybeFallbackV1Connector(session: SparkSession) + extends Rule[LogicalPlan] { + + override def apply(plan: LogicalPlan): LogicalPlan = { + def replaceKernelWithFallback(node: LogicalPlan): LogicalPlan = { + node.resolveOperatorsDown { + case Batch(fallback) => fallback + case Streaming(fallback) => fallback + } + } + + plan.resolveOperatorsDown { + case i @ InsertIntoStatement(table, part, cols, query, overwrite, byName, ifNotExists) => + val newTable = replaceKernelWithFallback(table) + i.copy(table = newTable) + case Batch(fallback) => fallback + case Streaming(fallback) if !isReadOnly(plan) => fallback + } + } + + private def isReadOnly(plan: LogicalPlan): Boolean = { + !plan.containsPattern(COMMAND) && !plan.exists(_.isInstanceOf[InsertIntoStatement]) + } + + object Batch { + def unapply(dsv2: DataSourceV2Relation): Option[DataSourceV2Relation] = dsv2.table match { + case d: SparkTable => + val v1CatalogTable = d.getV1CatalogTable() + if (v1CatalogTable.isPresent()) { + // scalastyle:off println + println("falling back") + // scalastyle:off println + val catalogTable = v1CatalogTable.get() + Some(dsv2.copy(table = DeltaTableV2( + session, + new Path(catalogTable.location), + catalogTable = Some(catalogTable), + tableIdentifier = Some(catalogTable.identifier.toString)))) + } else { + Some(dsv2.copy(table = DeltaTableV2(session, new Path(d.name())))) + } + case _ => None + } + } + object Streaming { + def unapply(dsv2: StreamingRelationV2): Option[StreamingRelation] = dsv2.table match { + case d: SparkTable => + // Streaming's fallback is not via DeltaAnalysis, so directly create v1 streaming relation. 
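+        // The assert below relies on this SparkTable having been created through the
+        // CatalogTable-based constructor (see SparkTable above); a path-based SparkTable
+        // carries no v1 CatalogTable and has nothing to fall back to here.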
+ val v1CatalogTable = d.getV1CatalogTable() + assert(v1CatalogTable.isPresent()) + val catalogTable = v1CatalogTable.get() + Some(getStreamingRelation(catalogTable, dsv2.extraOptions)) + case _ => None + } + + } + + private def getStreamingRelation( + table: CatalogTable, + extraOptions: CaseInsensitiveStringMap): StreamingRelation = { + val dsOptions = DataSourceUtils.generateDatasourceOptions(extraOptions, table) + val dataSource = DataSource( + SparkSession.active, + className = table.provider.get, + userSpecifiedSchema = if (!DeltaTableUtils.isDeltaTable(table)) { + Some(table.schema) + } else None, + options = dsOptions, + catalogTable = Some(table)) + StreamingRelation(dataSource) + } +} diff --git a/spark/src/main/scala/io/delta/sql/DeltaSparkSessionExtension.scala b/spark/src/main/scala/io/delta/sql/DeltaSparkSessionExtension.scala index 2e20246266f..5473d14f62e 100644 --- a/spark/src/main/scala/io/delta/sql/DeltaSparkSessionExtension.scala +++ b/spark/src/main/scala/io/delta/sql/DeltaSparkSessionExtension.scala @@ -82,6 +82,8 @@ import org.apache.spark.sql.internal.SQLConf */ class LegacyDeltaSparkSessionExtension extends AbstractSparkSessionExtension class AbstractSparkSessionExtension extends (SparkSessionExtensions => Unit) { + protected def preDeltaAnalysisRule(extensions: SparkSessionExtensions): Unit = {} + override def apply(extensions: SparkSessionExtensions): Unit = { extensions.injectParser { (_, parser) => new DeltaSqlParser(parser) @@ -92,6 +94,7 @@ class AbstractSparkSessionExtension extends (SparkSessionExtensions => Unit) { extensions.injectResolutionRule { session => PreprocessTimeTravel(session) } + preDeltaAnalysisRule(extensions) extensions.injectResolutionRule { session => // To ensure the parquet field id reader is turned on, these fields are required to support // id column mapping mode for Delta. 
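For context, the `preDeltaAnalysisRule` hook added above is the seam that lets `DeltaSparkSessionExtension` in `spark-combined` register `MaybeFallbackV1Connector` ahead of the Delta resolution rules. A minimal sketch of how a subclass is expected to use the hook follows; `MyDeltaExtension` and `MyEarlyRule` are hypothetical names used only for illustration and are not part of these patches.

```scala
import io.delta.sql.AbstractSparkSessionExtension
import org.apache.spark.sql.SparkSessionExtensions
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.catalyst.rules.Rule

// Hypothetical no-op rule, used only to show where the hook places it in the injection order.
class MyEarlyRule extends Rule[LogicalPlan] {
  override def apply(plan: LogicalPlan): LogicalPlan = plan
}

// Hypothetical subclass: rules registered here are injected after PreprocessTimeTravel
// but before the resolution rules that AbstractSparkSessionExtension.apply adds next.
class MyDeltaExtension extends AbstractSparkSessionExtension {
  override def preDeltaAnalysisRule(extensions: SparkSessionExtensions): Unit = {
    extensions.injectResolutionRule(_ => new MyEarlyRule)
  }
}
```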
From 691eb821d0179ff64bb203f591493f2fff1a309d Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Tue, 14 Oct 2025 16:46:46 -0700 Subject: [PATCH 50/53] fix --- .../delta/sql/MaybeFallbackV1Connector.scala | 24 ++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/spark-combined/src/main/scala/io/delta/sql/MaybeFallbackV1Connector.scala b/spark-combined/src/main/scala/io/delta/sql/MaybeFallbackV1Connector.scala index e6ecf0a1448..ad43c142590 100644 --- a/spark-combined/src/main/scala/io/delta/sql/MaybeFallbackV1Connector.scala +++ b/spark-combined/src/main/scala/io/delta/sql/MaybeFallbackV1Connector.scala @@ -24,7 +24,7 @@ import org.apache.spark.sql.delta.catalog.DeltaTableV2 import org.apache.hadoop.fs.Path import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.catalog.CatalogTable -import org.apache.spark.sql.catalyst.plans.logical.{InsertIntoStatement, LogicalPlan} +import org.apache.spark.sql.catalyst.plans.logical.{AppendData, InsertIntoStatement, LogicalPlan, OverwriteByExpression} import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.catalyst.streaming.StreamingRelationV2 import org.apache.spark.sql.catalyst.trees.TreePattern.COMMAND @@ -43,12 +43,30 @@ class MaybeFallbackV1Connector(session: SparkSession) case Streaming(fallback) => fallback } } - plan.resolveOperatorsDown { + // Handle V1 INSERT INTO case i @ InsertIntoStatement(table, part, cols, query, overwrite, byName, ifNotExists) => val newTable = replaceKernelWithFallback(table) i.copy(table = newTable) - case Batch(fallback) => fallback + + // Handle V2 AppendData (DataFrameWriterV2.append) + case a @ AppendData(Batch(fallback), _, _, _, _, _) => + // scalastyle:off println + println("falling back AppendData") + // scalastyle:on println + a.copy(table = fallback) + + // Handle V2 OverwriteByExpression (DataFrameWriterV2.overwrite) + case o @ OverwriteByExpression(Batch(fallback), _, _, _, _, _, _) => + // scalastyle:off println + println("falling back OverwriteByExpression") + // scalastyle:on println + o.copy(table = fallback) + + // Handle batch reads + case Batch(fallback) if !isReadOnly(plan) => fallback + + // Handle streaming case Streaming(fallback) if !isReadOnly(plan) => fallback } } From 6393ad980b38fae6c5775430e2f7504b4f610271 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Wed, 15 Oct 2025 12:55:49 -0700 Subject: [PATCH 51/53] save --- .../spark/sql/delta/catalog/DeltaCatalog.java | 33 ++++++++--- .../delta/sql/MaybeFallbackV1Connector.scala | 59 ++++++++++++++++--- 2 files changed, 76 insertions(+), 16 deletions(-) diff --git a/spark-combined/src/main/java/org/apache/spark/sql/delta/catalog/DeltaCatalog.java b/spark-combined/src/main/java/org/apache/spark/sql/delta/catalog/DeltaCatalog.java index 37fba4b717a..635846d6325 100644 --- a/spark-combined/src/main/java/org/apache/spark/sql/delta/catalog/DeltaCatalog.java +++ b/spark-combined/src/main/java/org/apache/spark/sql/delta/catalog/DeltaCatalog.java @@ -44,28 +44,45 @@ public Table loadTable(Identifier identifier) { if (delegateTable instanceof V1Table) { V1Table v1Table = (V1Table) delegateTable; if (DeltaTableUtils.isDeltaTable(v1Table.catalogTable())) { - return new SparkTable( - identifier, v1Table.catalogTable()); + return new SparkTable(identifier, v1Table.catalogTable()); } } // Otherwise return the delegate table as-is return delegateTable; + } catch (NoSuchTableException e) { + // Handle path-based tables + if (isPathIdentifier(identifier)) { + return newDeltaPathTable(identifier); + 
} else if (isIcebergPathIdentifier(identifier)) { + return newIcebergPathTable(identifier); + } else { + // *** KEY FIX: Directly rethrow NoSuchTableException as unchecked *** + // This allows saveAsTable to catch it properly + // Use uncheckedThrow to bypass Java's checked exception requirement + throw uncheckedThrow(e); + } } catch (AnalysisException e) { - // Handle NoSuchTableException and its related exceptions - if (e instanceof NoSuchTableException - || e instanceof NoSuchNamespaceException - || e instanceof NoSuchDatabaseException) { + // Handle other AnalysisException subtypes + if (e instanceof NoSuchNamespaceException || e instanceof NoSuchDatabaseException) { if (isPathIdentifier(identifier)) { return newDeltaPathTable(identifier); } else if (isIcebergPathIdentifier(identifier)) { return newIcebergPathTable(identifier); } } else if (DeltaTableIdentifier.gluePermissionError(e) && isPathIdentifier(identifier)) { - // Handle Glue permission errors for path identifiers return newDeltaPathTable(identifier); } - // Rethrow as RuntimeException since AnalysisException is checked + // For other AnalysisException, wrap in RuntimeException throw new RuntimeException(e); } } + + /** + * Utility method to throw checked exceptions as unchecked. + * This is a workaround for Java's checked exception requirement when overriding Scala methods. + */ + @SuppressWarnings("unchecked") + private static RuntimeException uncheckedThrow(Throwable e) throws E { + throw (E) e; + } } diff --git a/spark-combined/src/main/scala/io/delta/sql/MaybeFallbackV1Connector.scala b/spark-combined/src/main/scala/io/delta/sql/MaybeFallbackV1Connector.scala index ad43c142590..14756b560aa 100644 --- a/spark-combined/src/main/scala/io/delta/sql/MaybeFallbackV1Connector.scala +++ b/spark-combined/src/main/scala/io/delta/sql/MaybeFallbackV1Connector.scala @@ -24,7 +24,8 @@ import org.apache.spark.sql.delta.catalog.DeltaTableV2 import org.apache.hadoop.fs.Path import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.catalog.CatalogTable -import org.apache.spark.sql.catalyst.plans.logical.{AppendData, InsertIntoStatement, LogicalPlan, OverwriteByExpression} +import org.apache.spark.sql.catalyst.plans.logical.{AppendData, InsertIntoStatement, LogicalPlan, MergeIntoTable, OverwriteByExpression} +import org.apache.spark.sql.catalyst.plans.logical.DeltaMergeInto import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.catalyst.streaming.StreamingRelationV2 import org.apache.spark.sql.catalyst.trees.TreePattern.COMMAND @@ -37,6 +38,10 @@ class MaybeFallbackV1Connector(session: SparkSession) extends Rule[LogicalPlan] { override def apply(plan: LogicalPlan): LogicalPlan = { + // scalastyle:off println + println(s"[MaybeFallbackV1] plan: ${plan.getClass.getSimpleName}, node: ${plan.nodeName}") + // scalastyle:on println + def replaceKernelWithFallback(node: LogicalPlan): LogicalPlan = { node.resolveOperatorsDown { case Batch(fallback) => fallback @@ -44,30 +49,68 @@ class MaybeFallbackV1Connector(session: SparkSession) } } plan.resolveOperatorsDown { + // Handle MERGE INTO (Spark generic MergeIntoTable) + case m @ MergeIntoTable(targetTable, sourceTable, mergeCondition, + matchedActions, notMatchedActions, notMatchedBySourceActions) => + // scalastyle:off println + println("[MaybeFallbackV1] MergeIntoTable -> replacing target and source") + // scalastyle:on println + val newTarget = replaceKernelWithFallback(targetTable) + val newSource = replaceKernelWithFallback(sourceTable) + 
m.copy(targetTable = newTarget, sourceTable = newSource) + + // Handle MERGE INTO (DeltaMergeInto) + case m @ DeltaMergeInto(target, source, condition, matched, notMatched, notMatchedBySource, + withSchemaEvolution, finalSchema) => + // scalastyle:off println + println("[MaybeFallbackV1] DeltaMergeInto -> replacing target and source") + // scalastyle:on println + val newTarget = replaceKernelWithFallback(target) + val newSource = replaceKernelWithFallback(source) + m.copy(target = newTarget, source = newSource) + // Handle V1 INSERT INTO case i @ InsertIntoStatement(table, part, cols, query, overwrite, byName, ifNotExists) => + // scalastyle:off println + println("[MaybeFallbackV1] InsertIntoStatement") + // scalastyle:on println val newTable = replaceKernelWithFallback(table) i.copy(table = newTable) // Handle V2 AppendData (DataFrameWriterV2.append) case a @ AppendData(Batch(fallback), _, _, _, _, _) => // scalastyle:off println - println("falling back AppendData") + println("[MaybeFallbackV1] AppendData -> falling back") // scalastyle:on println a.copy(table = fallback) // Handle V2 OverwriteByExpression (DataFrameWriterV2.overwrite) case o @ OverwriteByExpression(Batch(fallback), _, _, _, _, _, _) => // scalastyle:off println - println("falling back OverwriteByExpression") + println("[MaybeFallbackV1] OverwriteByExpression -> falling back") // scalastyle:on println o.copy(table = fallback) // Handle batch reads - case Batch(fallback) if !isReadOnly(plan) => fallback + case Batch(fallback) if !isReadOnly(plan) => + // scalastyle:off println + println("[MaybeFallbackV1] Batch write -> falling back") + // scalastyle:on println + fallback // Handle streaming - case Streaming(fallback) if !isReadOnly(plan) => fallback + case Streaming(fallback) if !isReadOnly(plan) => + // scalastyle:off println + println("[MaybeFallbackV1] Streaming write -> falling back") + // scalastyle:on println + fallback + + // Print unhandled COMMAND nodes + case other if other.containsPattern(COMMAND) => + // scalastyle:off println + println(s"[MaybeFallbackV1] UNHANDLED COMMAND: ${other.getClass.getSimpleName}") + // scalastyle:on println + other } } @@ -78,11 +121,11 @@ class MaybeFallbackV1Connector(session: SparkSession) object Batch { def unapply(dsv2: DataSourceV2Relation): Option[DataSourceV2Relation] = dsv2.table match { case d: SparkTable => + // scalastyle:off println + println(s"[MaybeFallbackV1] Batch extractor: SparkTable -> DeltaTableV2") + // scalastyle:on println val v1CatalogTable = d.getV1CatalogTable() if (v1CatalogTable.isPresent()) { - // scalastyle:off println - println("falling back") - // scalastyle:off println val catalogTable = v1CatalogTable.get() Some(dsv2.copy(table = DeltaTableV2( session, From 8807fb1175504d5dd1eede1b1da7fccf89880e6c Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Wed, 22 Oct 2025 20:29:03 -0700 Subject: [PATCH 52/53] fis --- .../delta/sql/MaybeFallbackV1Connector.scala | 25 ------------------- 1 file changed, 25 deletions(-) diff --git a/spark-combined/src/main/scala/io/delta/sql/MaybeFallbackV1Connector.scala b/spark-combined/src/main/scala/io/delta/sql/MaybeFallbackV1Connector.scala index 14756b560aa..db1c45d305e 100644 --- a/spark-combined/src/main/scala/io/delta/sql/MaybeFallbackV1Connector.scala +++ b/spark-combined/src/main/scala/io/delta/sql/MaybeFallbackV1Connector.scala @@ -38,10 +38,6 @@ class MaybeFallbackV1Connector(session: SparkSession) extends Rule[LogicalPlan] { override def apply(plan: LogicalPlan): LogicalPlan = { - // scalastyle:off println - 
println(s"[MaybeFallbackV1] plan: ${plan.getClass.getSimpleName}, node: ${plan.nodeName}") - // scalastyle:on println - def replaceKernelWithFallback(node: LogicalPlan): LogicalPlan = { node.resolveOperatorsDown { case Batch(fallback) => fallback @@ -52,9 +48,6 @@ class MaybeFallbackV1Connector(session: SparkSession) // Handle MERGE INTO (Spark generic MergeIntoTable) case m @ MergeIntoTable(targetTable, sourceTable, mergeCondition, matchedActions, notMatchedActions, notMatchedBySourceActions) => - // scalastyle:off println - println("[MaybeFallbackV1] MergeIntoTable -> replacing target and source") - // scalastyle:on println val newTarget = replaceKernelWithFallback(targetTable) val newSource = replaceKernelWithFallback(sourceTable) m.copy(targetTable = newTarget, sourceTable = newSource) @@ -62,9 +55,6 @@ class MaybeFallbackV1Connector(session: SparkSession) // Handle MERGE INTO (DeltaMergeInto) case m @ DeltaMergeInto(target, source, condition, matched, notMatched, notMatchedBySource, withSchemaEvolution, finalSchema) => - // scalastyle:off println - println("[MaybeFallbackV1] DeltaMergeInto -> replacing target and source") - // scalastyle:on println val newTarget = replaceKernelWithFallback(target) val newSource = replaceKernelWithFallback(source) m.copy(target = newTarget, source = newSource) @@ -86,30 +76,18 @@ class MaybeFallbackV1Connector(session: SparkSession) // Handle V2 OverwriteByExpression (DataFrameWriterV2.overwrite) case o @ OverwriteByExpression(Batch(fallback), _, _, _, _, _, _) => - // scalastyle:off println - println("[MaybeFallbackV1] OverwriteByExpression -> falling back") - // scalastyle:on println o.copy(table = fallback) // Handle batch reads case Batch(fallback) if !isReadOnly(plan) => - // scalastyle:off println - println("[MaybeFallbackV1] Batch write -> falling back") - // scalastyle:on println fallback // Handle streaming case Streaming(fallback) if !isReadOnly(plan) => - // scalastyle:off println - println("[MaybeFallbackV1] Streaming write -> falling back") - // scalastyle:on println fallback // Print unhandled COMMAND nodes case other if other.containsPattern(COMMAND) => - // scalastyle:off println - println(s"[MaybeFallbackV1] UNHANDLED COMMAND: ${other.getClass.getSimpleName}") - // scalastyle:on println other } } @@ -121,9 +99,6 @@ class MaybeFallbackV1Connector(session: SparkSession) object Batch { def unapply(dsv2: DataSourceV2Relation): Option[DataSourceV2Relation] = dsv2.table match { case d: SparkTable => - // scalastyle:off println - println(s"[MaybeFallbackV1] Batch extractor: SparkTable -> DeltaTableV2") - // scalastyle:on println val v1CatalogTable = d.getV1CatalogTable() if (v1CatalogTable.isPresent()) { val catalogTable = v1CatalogTable.get() From 666519e1ce9e4841a681c2a53869841f0810ab20 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Wed, 22 Oct 2025 22:44:31 -0700 Subject: [PATCH 53/53] fis --- .../test/java/io/delta/kernel/spark/catalog/SparkTableTest.java | 2 +- .../test/java/io/delta/kernel/spark/catalog/TestCatalog.java | 1 - .../java/io/delta/kernel/spark/read/SparkGoldenTableTest.java | 2 +- .../src/test/java/io/delta/kernel/spark/read/SparkScanTest.java | 2 +- 4 files changed, 3 insertions(+), 4 deletions(-) diff --git a/kernel-spark/src/test/java/io/delta/kernel/spark/catalog/SparkTableTest.java b/kernel-spark/src/test/java/io/delta/kernel/spark/catalog/SparkTableTest.java index 6e8dc13905b..6f7387eb15f 100644 --- a/kernel-spark/src/test/java/io/delta/kernel/spark/catalog/SparkTableTest.java +++ 
b/kernel-spark/src/test/java/io/delta/kernel/spark/catalog/SparkTableTest.java @@ -13,7 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.delta.kernel.spark.table; +package io.delta.kernel.spark.catalog; import static org.apache.spark.sql.connector.catalog.TableCapability.BATCH_READ; import static org.junit.jupiter.api.Assertions.assertEquals; diff --git a/kernel-spark/src/test/java/io/delta/kernel/spark/catalog/TestCatalog.java b/kernel-spark/src/test/java/io/delta/kernel/spark/catalog/TestCatalog.java index 7b8b34907c1..d6eaeab6808 100644 --- a/kernel-spark/src/test/java/io/delta/kernel/spark/catalog/TestCatalog.java +++ b/kernel-spark/src/test/java/io/delta/kernel/spark/catalog/TestCatalog.java @@ -18,7 +18,6 @@ import io.delta.kernel.Operation; import io.delta.kernel.defaults.engine.DefaultEngine; import io.delta.kernel.engine.Engine; -import io.delta.kernel.spark.table.SparkTable; import io.delta.kernel.spark.utils.SchemaUtils; import io.delta.kernel.utils.CloseableIterable; import java.util.ArrayList; diff --git a/kernel-spark/src/test/java/io/delta/kernel/spark/read/SparkGoldenTableTest.java b/kernel-spark/src/test/java/io/delta/kernel/spark/read/SparkGoldenTableTest.java index 5aa9236307a..0af30a7ac07 100644 --- a/kernel-spark/src/test/java/io/delta/kernel/spark/read/SparkGoldenTableTest.java +++ b/kernel-spark/src/test/java/io/delta/kernel/spark/read/SparkGoldenTableTest.java @@ -22,7 +22,7 @@ import io.delta.kernel.expressions.Column; import io.delta.kernel.expressions.Literal; import io.delta.kernel.expressions.Predicate; -import io.delta.kernel.spark.table.SparkTable; +import io.delta.kernel.spark.catalog.SparkTable; import java.io.File; import java.lang.reflect.Field; import java.math.BigDecimal; diff --git a/kernel-spark/src/test/java/io/delta/kernel/spark/read/SparkScanTest.java b/kernel-spark/src/test/java/io/delta/kernel/spark/read/SparkScanTest.java index a91a37a12e5..29a56bbdb0d 100644 --- a/kernel-spark/src/test/java/io/delta/kernel/spark/read/SparkScanTest.java +++ b/kernel-spark/src/test/java/io/delta/kernel/spark/read/SparkScanTest.java @@ -3,7 +3,7 @@ import static org.junit.jupiter.api.Assertions.assertEquals; import io.delta.kernel.spark.SparkDsv2TestBase; -import io.delta.kernel.spark.table.SparkTable; +import io.delta.kernel.spark.catalog.SparkTable; import java.io.File; import java.lang.reflect.Field; import java.util.ArrayList;
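
Taken together, the patches above are expected to produce the following behavior: catalog Delta tables resolve to the kernel-backed `SparkTable`, read-only plans stay on it, and write plans are rewritten back to `DeltaTableV2` by `MaybeFallbackV1Connector`. The sketch below is illustrative only (local-mode session, hypothetical table name `demo`), not a test from this series.

```scala
import org.apache.spark.sql.SparkSession

// Standard Delta wiring: the extension registers MaybeFallbackV1Connector, and
// DeltaCatalog.loadTable returns the kernel-backed SparkTable for catalog Delta tables.
val spark = SparkSession.builder()
  .master("local[1]")
  .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
  .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
  .getOrCreate()

spark.sql("CREATE TABLE demo (id BIGINT) USING delta")

// Write path: DataFrameWriterV2.append produces AppendData, which the fallback rule
// rewrites to the V1 DeltaTableV2 before Delta's analysis rules run.
spark.range(10).toDF("id").writeTo("demo").append()

// Read-only path: no COMMAND or InsertIntoStatement in the plan, so the kernel SparkTable
// is kept and the scan goes through SparkScanBuilder.
println(spark.table("demo").count())
```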