Skip to content

Commit d6ef81b

Browse files
committed
Fix UC setup cache key collisions and local Spark-version switching
Signed-off-by: Yi Li <yi.li@databricks.com>
1 parent b10907d commit d6ef81b

4 files changed

Lines changed: 81 additions & 25 deletions

File tree

.github/actions/setup-unitycatalog/action.yml

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,11 @@ name: "Set up pinned Unity Catalog build"
22
description: >-
33
Publishes Unity Catalog jars from the commit pinned in project/scripts/setup_unitycatalog_main.sh
44
(the UC_PIN_SHA= line) to the runner's local Ivy / Maven caches, using GitHub Actions cache so the
5-
slow UC build only runs the first time a pin is seen.
5+
slow UC build only runs the first time a pin is seen. Reads SPARK_VERSION (Spark major.minor
6+
short form) from the calling job's env to pick the Spark variant UC builds against; matrix
7+
workflows set this from `matrix.spark_version`, other workflows leave it unset and inherit the
8+
script's default. The cache key reflects SPARK_VERSION literally - workflows that share the
9+
default share one cache entry under an empty `spark-` segment.
610
711
runs:
812
using: "composite"
@@ -15,9 +19,9 @@ runs:
1519
path: |
1620
~/.ivy2/local
1721
~/.m2/repository/io/unitycatalog
18-
# Cache key hashes the setup script, so bumping UC_PIN_SHA (or any other script change)
19-
# invalidates the cache.
20-
key: uc-jars-${{ runner.os }}-${{ hashFiles('project/scripts/setup_unitycatalog_main.sh') }}
22+
# Cache key hashes the setup script (so any script change invalidates) and includes the
23+
# Spark short version (so 4.0 and 4.1 don't fight over a single shared cache entry).
24+
key: uc-jars-${{ runner.os }}-spark${{ env.SPARK_VERSION }}-${{ hashFiles('project/scripts/setup_unitycatalog_main.sh') }}
2125
- name: Build Unity Catalog from pinned SHA
2226
shell: bash
2327
run: bash project/scripts/setup_unitycatalog_main.sh
@@ -33,4 +37,4 @@ runs:
3337
path: |
3438
~/.ivy2/local
3539
~/.m2/repository/io/unitycatalog
36-
key: uc-jars-${{ runner.os }}-${{ hashFiles('project/scripts/setup_unitycatalog_main.sh') }}
40+
key: uc-jars-${{ runner.os }}-spark${{ env.SPARK_VERSION }}-${{ hashFiles('project/scripts/setup_unitycatalog_main.sh') }}

.github/workflows/build.yaml

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -35,13 +35,23 @@ jobs:
3535
~/.sbt
3636
~/.ivy2
3737
~/.cache/coursier
38-
key: delta-sbt-cache-cross-spark
38+
key: delta-sbt-cache-cross-spark-${{ hashFiles('project/scripts/setup_unitycatalog_main.sh') }}
3939

4040
# publishM2 compiles every aggregated project, including storage, which has
41-
# unitycatalog-client as a compile-scope dependency. Publish the pinned UC build locally
42-
# first so Delta compiles against the UC APIs it actually targets.
43-
- name: Set up pinned Unity Catalog
41+
# unitycatalog-client as a compile-scope dependency. test_cross_spark_publish.py also
42+
# iterates over released Spark versions (sbt -DsparkVersion=<X.Y>), so we need UC's
43+
# spark connector published for each variant Delta will resolve. Invoke the setup action
44+
# once per Spark variant; the action's cache key partitions by SPARK_VERSION so each
45+
# variant warms its own cache entry. Keep this list in sync with the released versions
46+
# in project/spark-versions.json (snapshot versions are skipped by the cross-Spark test).
47+
- name: Set up pinned Unity Catalog (Spark 4.0)
4448
uses: ./.github/actions/setup-unitycatalog
49+
env:
50+
SPARK_VERSION: "4.0"
51+
- name: Set up pinned Unity Catalog (Spark 4.1)
52+
uses: ./.github/actions/setup-unitycatalog
53+
env:
54+
SPARK_VERSION: "4.1"
4555

4656
- name: Run cross-Spark build test
4757
run: python project/tests/test_cross_spark_publish.py
@@ -54,4 +64,4 @@ jobs:
5464
~/.sbt
5565
~/.ivy2
5666
~/.cache/coursier
57-
key: delta-sbt-cache-cross-spark
67+
key: delta-sbt-cache-cross-spark-${{ hashFiles('project/scripts/setup_unitycatalog_main.sh') }}

build.sbt

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -797,6 +797,12 @@ val unityCatalogVersion: String = sys.props.getOrElse(
797797
if (useDefaultUnityCatalogReleaseVersion) defaultUnityCatalogReleaseVersion
798798
else unityCatalogReleaseVersion.getOrElse(pinnedUnityCatalogVersion))
799799

800+
// UC publishes its Spark connector per Spark major.minor (e.g. unitycatalog-spark_4.1). This
801+
// is the artifact name without the Scala suffix - sbt's `%%` appends `_2.13` for dep
802+
// resolution; the canary check below appends `_2.13` explicitly for the Ivy/Maven path.
803+
val unityCatalogSparkArtifactName: String =
804+
s"unitycatalog-spark_${CrossSparkVersions.getSparkVersionSpec().shortVersion}"
805+
800806
/**
801807
* Returns true when `current` is at least `target`. Numeric segments only; suffix after
802808
* the first `-` (e.g. `-SNAPSHOT-abc1234`) is stripped before comparison.
@@ -842,7 +848,12 @@ def publishPinnedUnityCatalog(log: sbt.util.Logger, canary: java.io.File): Unit
842848
val procLogger = ProcessLogger(
843849
line => log.info(s"[UC setup] $line"),
844850
line => log.warn(s"[UC setup] $line"))
845-
val exit = Process(Seq("bash", unityCatalogSetupScript)).!(procLogger)
851+
// SPARK_VERSION tells the script which Spark variant to build (forwarded to UC's sbt as
852+
// -DsparkVersion).
853+
val exit = Process(
854+
Seq("bash", unityCatalogSetupScript),
855+
None,
856+
"SPARK_VERSION" -> CrossSparkVersions.getSparkVersionSpec().shortVersion).!(procLogger)
846857
if (exit != 0) {
847858
sys.error(
848859
s"[UC] $unityCatalogSetupScript exited with code $exit. Run it manually to see full output.")
@@ -864,13 +875,17 @@ Global / ensurePinnedUnityCatalog := {
864875
sys.props.contains("unityCatalogVersion")
865876
if (unityCatalogReleaseVersion.isEmpty && !usingReleasedVersion) {
866877
val home = file(sys.props("user.home"))
878+
// Canary on the spark artifact, not client/server: those are Spark-version-independent and
879+
// would short-circuit the trigger when only the active Spark version changed, leaving the
880+
// needed unitycatalog-spark_${X.Y}_2.13 unpublished.
881+
val sparkArtifact = s"${unityCatalogSparkArtifactName}_2.13"
867882
// Check both layouts: a restored sbt cache can pre-populate ivy alone, leaving m2 empty -
868883
// checking only ivy would silently skip the slow publish and break mvn-based consumers.
869884
val ivy2Canary = home / ".ivy2" / "local" / "io.unitycatalog" /
870-
"unitycatalog-client" / unityCatalogVersion / "ivys" / "ivy.xml"
885+
sparkArtifact / unityCatalogVersion / "ivys" / "ivy.xml"
871886
val m2Canary = home / ".m2" / "repository" / "io" / "unitycatalog" /
872-
"unitycatalog-client" / unityCatalogVersion /
873-
s"unitycatalog-client-$unityCatalogVersion.pom"
887+
sparkArtifact / unityCatalogVersion /
888+
s"$sparkArtifact-$unityCatalogVersion.pom"
874889
if (!ivy2Canary.exists || !m2Canary.exists) {
875890
publishPinnedUnityCatalog(log, ivy2Canary)
876891
}

project/scripts/setup_unitycatalog_main.sh

Lines changed: 38 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -42,10 +42,17 @@
4242
# 4. Open a focused PR.
4343
#
4444
# Environment overrides:
45-
# UC_DIR directory to clone into (default: /tmp/unitycatalog)
46-
# UC_REPO git remote URL (default: upstream unitycatalog)
47-
# UC_REF must be `main` or UC_PIN_SHA (default: UC_PIN_SHA below)
48-
# UC_FORCE set to "1" to rebuild even when the Ivy artifact exists
45+
# UC_DIR directory to clone into (default: /tmp/unitycatalog)
46+
# UC_REPO git remote URL (default: upstream unitycatalog)
47+
# UC_REF must be `main` or UC_PIN_SHA (default: UC_PIN_SHA below)
48+
# UC_FORCE set to "1" to rebuild even when the Ivy artifact exists
49+
#   SPARK_VERSION  Spark major.minor UC should build for, used verbatim - pass 4.1, not 4.1.0 (default: 4.1)
50+
# Forwarded as -DsparkVersion to UC's sbt; also determines the published artifact
51+
# name (unitycatalog-spark_${X.Y}_2.13). Delta's build.sbt sets this from
52+
# CrossSparkVersions when invoking the script; matrix CI workflows set it from
53+
# `matrix.spark_version`. Default matches UC's own default in
54+
# project/spark-versions.json - workflows that don't care which Spark variant UC
55+
# builds (kernel/flink/etc.) inherit it.
4956
#
5057
# UC_REF is restricted to exactly two values by design: the pinned SHA (the normal case) or
5158
# `main` (for the floating-main canary flow). Any other value is rejected. CI should never set
@@ -65,6 +72,7 @@ UC_DIR="${UC_DIR:-/tmp/unitycatalog}"
6572
UC_REPO="${UC_REPO:-https://github.com/unitycatalog/unitycatalog.git}"
6673
UC_REF="${UC_REF:-$UC_PIN_SHA}"
6774
UC_FORCE="${UC_FORCE:-0}"
75+
SPARK_VERSION="${SPARK_VERSION:-4.1}"
6876

6977
# Enforce the two-value contract. Anything else is either a typo or a misuse and would bypass the
7078
# safety check below.
@@ -92,11 +100,13 @@ fi
92100
# Canonical Ivy + Maven artifact paths. Delta depends on all three UC modules; sbt resolves from
93101
# ~/.ivy2/local, mvn (kernel-examples integration tests) resolves from ~/.m2/repository. If any
94102
# is missing in either layout we must re-publish.
103+
# UC publishes its Spark connector under a per-Spark-version coordinate
104+
# (e.g. unitycatalog-spark_4.1_2.13). The suffix tracks SPARK_VERSION so the
105+
# canary check matches whatever variant we tell UC to build below.
95106
IVY_LOCAL="$HOME/.ivy2/local/io.unitycatalog"
96107
IVY_CANARY_CLIENT="$IVY_LOCAL/unitycatalog-client/$UC_VERSION/ivys/ivy.xml"
97108
IVY_CANARY_SERVER="$IVY_LOCAL/unitycatalog-server/$UC_VERSION/ivys/ivy.xml"
98-
SPARK_MAJOR_MINOR="${SPARK_MAJOR_MINOR:-$(echo "${SPARK_VERSION:-4.1}" | cut -d. -f1,2)}"
99-
UC_SPARK_ARTIFACT="unitycatalog-spark_${SPARK_MAJOR_MINOR}_2.13"
109+
UC_SPARK_ARTIFACT="unitycatalog-spark_${SPARK_VERSION}_2.13"
100110
IVY_CANARY_SPARK="$IVY_LOCAL/$UC_SPARK_ARTIFACT/$UC_VERSION/ivys/ivy.xml"
101111
IVY_CANARY_HADOOP="$IVY_LOCAL/unitycatalog-hadoop/$UC_VERSION/ivys/ivy.xml"
102112
M2_LOCAL="$HOME/.m2/repository/io/unitycatalog"
@@ -162,12 +172,28 @@ fi
162172
# coordinate. Applied as a persistent setting so it sticks across the two sbt invocations below.
163173
SET_VERSION_CMD="set ThisBuild / version := \"$UC_VERSION\""
164174

175+
# Force publishLocal / publishM2 to overwrite existing artifacts. UC artifacts at the same
176+
# coordinate can be left behind from a prior run (e.g. cross-Spark publish re-invokes this
177+
# script for a different sparkVersion while client/server/hadoop are already in ~/.ivy2/local
178+
# and ~/.m2 from the first invocation). publishLocalConfiguration / publishM2Configuration are
179+
# task settings scoped per-project (ThisBuild / Global don't propagate), so we set them on each
180+
# project we publish. Both configs need overriding: publishLocal uses the former, publishM2
181+
# uses the latter.
182+
SET_OVERWRITE_CMDS=()
183+
for p in client server hadoop spark; do
184+
SET_OVERWRITE_CMDS+=(
185+
"set $p / publishLocalConfiguration := ($p / publishLocalConfiguration).value.withOverwrite(true)"
186+
"set $p / publishM2Configuration := ($p / publishM2Configuration).value.withOverwrite(true)"
187+
)
188+
done
189+
165190
echo ">>> Building and publishing UC client + server to local Maven repo"
166191
# Clear stale UC artifacts — GHA cache may restore jars from a prior run at the same coordinate,
167192
# and SBT's publishM2 refuses to overwrite (ThisBuild / publishM2Configuration is ignored).
168193
rm -rf "$HOME/.ivy2/local/io.unitycatalog" "$HOME/.m2/repository/io/unitycatalog"
169194
./build/sbt \
170195
"$SET_VERSION_CMD" \
196+
"${SET_OVERWRITE_CMDS[@]}" \
171197
"set client / Compile / packageDoc / publishArtifact := false" \
172198
clean \
173199
client/generate \
@@ -179,23 +205,24 @@ rm -rf "$HOME/.ivy2/local/io.unitycatalog" "$HOME/.m2/repository/io/unitycatalog
179205
hadoop/publishM2
180206

181207
# Publish the Spark connector for the caller's Spark version. Each CI matrix cell passes its own
182-
# SPARK_MAJOR_MINOR; when auto-triggered by ensurePinnedUnityCatalog (no env), defaults above.
183-
echo ">>> Publishing UC spark connector for Spark $SPARK_MAJOR_MINOR"
208+
# SPARK_VERSION; ensurePinnedUnityCatalog passes it too. When unset, the default above applies.
209+
echo ">>> Publishing UC spark connector for Spark $SPARK_VERSION"
184210
for attempt in 1 2 3; do
185211
if ./build/sbt \
186-
-DsparkVersion="$SPARK_MAJOR_MINOR" \
212+
-DsparkVersion="$SPARK_VERSION" \
187213
-DskipDeltaSpark=true \
188214
"$SET_VERSION_CMD" \
215+
"${SET_OVERWRITE_CMDS[@]}" \
189216
"set client / Compile / packageDoc / publishArtifact := false" \
190217
spark/publishLocal \
191218
spark/publishM2; then
192219
break
193220
fi
194221
if [[ "$attempt" -eq 3 ]]; then
195-
echo ">>> spark/publishM2 (Spark $SPARK_MAJOR_MINOR) failed after 3 attempts"
222+
echo ">>> spark/publishM2 (Spark $SPARK_VERSION) failed after 3 attempts"
196223
exit 1
197224
fi
198-
echo ">>> spark/publishM2 (Spark $SPARK_MAJOR_MINOR) failed on attempt $attempt; retrying..."
225+
echo ">>> spark/publishM2 (Spark $SPARK_VERSION) failed on attempt $attempt; retrying..."
199226
sleep 5
200227
done
201228

0 commit comments

Comments
 (0)