Commit 28e8908

Upgrade Spark 4.0

1 parent fa33ea0

File tree

7 files changed: +57 −2 lines

assembly/pom.xml

Lines changed: 15 additions & 0 deletions

@@ -146,6 +146,21 @@
       <artifactId>bcprov-jdk18on</artifactId>
       <scope>${hadoop.deps.scope}</scope>
     </dependency>
+
+    <!--
+      OpenLineage and Kafka dependencies for Affirm custom distribution.
+      These are bundled into /opt/spark/jars/ so all Spark applications have lineage tracking.
+    -->
+    <dependency>
+      <groupId>io.openlineage</groupId>
+      <artifactId>openlineage-spark_${scala.binary.version}</artifactId>
+      <version>${openlineage.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.kafka</groupId>
+      <artifactId>kafka-clients</artifactId>
+      <version>${kafka.version}</version>
+    </dependency>
   </dependencies>

   <build>
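
Since the OpenLineage listener and the Kafka client now ship in /opt/spark/jars/, applications only need configuration to start emitting lineage. Below is a minimal PySpark sketch assuming OpenLineage's Kafka transport; the topic, broker, and namespace values are hypothetical placeholders, and the exact config key spellings should be checked against the bundled OpenLineage version.

from pyspark.sql import SparkSession

# Sketch only: the listener jar is already on the classpath in this
# distribution, so no --packages flag is needed. Topic, broker, and
# namespace below are hypothetical placeholders.
spark = (
    SparkSession.builder
    .appName("lineage-demo")
    .config("spark.extraListeners",
            "io.openlineage.spark.agent.OpenLineageSparkListener")
    .config("spark.openlineage.transport.type", "kafka")
    .config("spark.openlineage.transport.topicName", "lineage-events")
    .config("spark.openlineage.transport.properties.bootstrap.servers", "broker:9092")
    .config("spark.openlineage.namespace", "affirm-spark")
    .getOrCreate()
)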

pom.xml

Lines changed: 28 additions & 0 deletions

@@ -102,10 +102,13 @@
     <module>examples</module>
     <module>repl</module>
     <module>launcher</module>
+    <!-- Disable Kafka connector modules because we are not using Spark with Kafka at Affirm -->
+    <!--
     <module>connector/kafka-0-10-token-provider</module>
     <module>connector/kafka-0-10</module>
     <module>connector/kafka-0-10-assembly</module>
     <module>connector/kafka-0-10-sql</module>
+    -->
     <module>connector/avro</module>
     <module>connector/protobuf</module>
     <!-- See additional modules enabled by profiles below -->
@@ -122,6 +125,7 @@
     <asm.version>9.7.1</asm.version>
     <slf4j.version>2.0.16</slf4j.version>
     <log4j.version>2.24.3</log4j.version>
+    <iceberg.version>1.10.1</iceberg.version>
     <!-- make sure to update IsolatedClientLoader whenever this version is changed -->
     <hadoop.version>3.4.1</hadoop.version>
     <!-- SPARK-41247: When updating `protobuf.version`, also need to update `protoVersion` in `SparkBuild.scala` -->
@@ -135,6 +139,8 @@
     <hive.version>2.3.10</hive.version>
     <!-- note that this should be compatible with Kafka brokers version 0.10 and up -->
     <kafka.version>3.9.0</kafka.version>
+    <!-- OpenLineage for Spark lineage tracking - bundled in Affirm custom distribution -->
+    <openlineage.version>1.38.0</openlineage.version>
     <!-- After 10.17.1.0, the minimum required version is JDK19 -->
     <derby.version>10.16.1.1</derby.version>
     <parquet.version>1.15.2</parquet.version>
@@ -450,6 +456,28 @@
       <artifactId>jupiter-interface</artifactId>
       <scope>test</scope>
     </dependency>
+    <dependency>
+      <groupId>com.mysql</groupId>
+      <artifactId>mysql-connector-j</artifactId>
+      <version>9.2.0</version>
+      <scope>runtime</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.iceberg</groupId>
+      <artifactId>iceberg-spark-runtime-4.0_2.13</artifactId>
+      <version>${iceberg.version}</version>
+      <scope>runtime</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.iceberg</groupId>
+      <artifactId>iceberg-aws-bundle</artifactId>
+      <version>${iceberg.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-aws</artifactId>
+      <version>${hadoop.version}</version>
+    </dependency>
   </dependencies>
   <dependencyManagement>
     <dependencies>
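
With the Iceberg runtime, the Iceberg AWS bundle, hadoop-aws, and the MySQL JDBC driver now on the default classpath, jobs can read Iceberg tables on S3 and JDBC sources without any --packages flag. A sketch assuming a Hadoop-type Iceberg catalog; the catalog name, bucket, table, and hostnames are hypothetical placeholders.

from pyspark.sql import SparkSession

# Sketch only: the jars ship with this build; names below are placeholders.
spark = (
    SparkSession.builder
    .appName("iceberg-demo")
    .config("spark.sql.extensions",
            "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    .config("spark.sql.catalog.lake", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.lake.type", "hadoop")
    .config("spark.sql.catalog.lake.warehouse", "s3a://example-bucket/warehouse")
    .getOrCreate()
)

# Iceberg table on S3; the s3a:// filesystem comes from the bundled hadoop-aws.
spark.sql("SELECT * FROM lake.db.events LIMIT 10").show()

# MySQL over JDBC via the bundled mysql-connector-j driver.
orders = (
    spark.read.format("jdbc")
    .option("url", "jdbc:mysql://mysql.example.internal:3306/appdb")
    .option("dbtable", "orders")
    .option("driver", "com.mysql.cj.jdbc.Driver")
    .load()
)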

python/.gitignore

Lines changed: 1 addition & 0 deletions

@@ -5,5 +5,6 @@ pyspark_client.egg-info
 pyspark_connect.egg-info
 build/
 dist/
+venv/
 ./setup.py
 ./setup.cfg

python/MANIFEST.in

Lines changed: 1 addition & 0 deletions

@@ -19,6 +19,7 @@
 recursive-include pyspark *.pyi py.typed *.json
 recursive-include deps/jars *.jar
 graft deps/bin
+graft deps/k8s
 recursive-include deps/sbin spark-config.sh spark-daemon.sh start-history-server.sh stop-history-server.sh
 recursive-include deps/data *.data *.txt
 recursive-include deps/licenses *.txt

python/packaging/classic/setup.py

Lines changed: 9 additions & 0 deletions

@@ -105,13 +105,15 @@
 USER_SCRIPTS_PATH = os.path.join(SPARK_HOME, "sbin")
 DATA_PATH = os.path.join(SPARK_HOME, "data")
 LICENSES_PATH = os.path.join(SPARK_HOME, "licenses")
+DOCKER_PATH = os.path.join(SPARK_HOME, "resource-managers/kubernetes/docker/src/main/dockerfiles/spark")

 SCRIPTS_TARGET = os.path.join(TEMP_PATH, "bin")
 USER_SCRIPTS_TARGET = os.path.join(TEMP_PATH, "sbin")
 JARS_TARGET = os.path.join(TEMP_PATH, "jars")
 EXAMPLES_TARGET = os.path.join(TEMP_PATH, "examples")
 DATA_TARGET = os.path.join(TEMP_PATH, "data")
 LICENSES_TARGET = os.path.join(TEMP_PATH, "licenses")
+DOCKER_TARGET = os.path.join(TEMP_PATH, "k8s")

 # Check and see if we are under the spark path in which case we need to build the symlink farm.
 # This is important because we only want to build the symlink farm while under Spark otherwise we
@@ -221,6 +223,7 @@ def run(self):
             os.symlink(EXAMPLES_PATH, EXAMPLES_TARGET)
             os.symlink(DATA_PATH, DATA_TARGET)
             os.symlink(LICENSES_PATH, LICENSES_TARGET)
+            os.symlink(DOCKER_PATH, DOCKER_TARGET)
         else:
             # For windows fall back to the slower copytree
             copytree(JARS_PATH, JARS_TARGET)
@@ -229,6 +232,7 @@ def run(self):
             copytree(EXAMPLES_PATH, EXAMPLES_TARGET)
             copytree(DATA_PATH, DATA_TARGET)
             copytree(LICENSES_PATH, LICENSES_TARGET)
+            copytree(DOCKER_PATH, DOCKER_TARGET)
         else:
             # If we are not inside of SPARK_HOME verify we have the required symlink farm
             if not os.path.exists(JARS_TARGET):
@@ -296,6 +300,7 @@ def run(self):
         "pyspark.streaming",
         "pyspark.bin",
         "pyspark.sbin",
+        "pyspark.k8s",
         "pyspark.jars",
         "pyspark.pandas",
         "pyspark.pandas.data_type_ops",
@@ -321,6 +326,7 @@ def run(self):
         "pyspark.jars": "deps/jars",
         "pyspark.bin": "deps/bin",
         "pyspark.sbin": "deps/sbin",
+        "pyspark.k8s": "deps/k8s",
         "pyspark.python.lib": "lib",
         "pyspark.data": "deps/data",
         "pyspark.licenses": "deps/licenses",
@@ -329,6 +335,7 @@ def run(self):
     package_data={
         "pyspark.jars": ["*.jar"],
         "pyspark.bin": ["*"],
+        "pyspark.k8s": ["*"],
         "pyspark.sbin": [
             "spark-config.sh",
             "spark-daemon.sh",
@@ -398,11 +405,13 @@ def run(self):
             os.remove(os.path.join(TEMP_PATH, "examples"))
             os.remove(os.path.join(TEMP_PATH, "data"))
             os.remove(os.path.join(TEMP_PATH, "licenses"))
+            os.remove(os.path.join(TEMP_PATH, "k8s"))
         else:
             rmtree(os.path.join(TEMP_PATH, "jars"))
             rmtree(os.path.join(TEMP_PATH, "bin"))
             rmtree(os.path.join(TEMP_PATH, "sbin"))
             rmtree(os.path.join(TEMP_PATH, "examples"))
             rmtree(os.path.join(TEMP_PATH, "data"))
             rmtree(os.path.join(TEMP_PATH, "licenses"))
+            rmtree(os.path.join(TEMP_PATH, "k8s"))
         os.rmdir(TEMP_PATH)
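
The net effect of these packaging changes is that an installed wheel carries the Spark Kubernetes Dockerfiles under pyspark/k8s, a directory upstream PySpark wheels do not ship. A sketch of how a build script might locate them from the installed package:

import os
import pyspark

# pyspark/k8s is this distribution's addition; per the package_dir mapping
# above it mirrors resource-managers/kubernetes/docker/src/main/dockerfiles/spark.
docker_dir = os.path.join(os.path.dirname(pyspark.__file__), "k8s")
print(sorted(os.listdir(docker_dir)))  # e.g. Dockerfile, decom.sh, entrypoint.sh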

python/pyspark/version.py

Lines changed: 1 addition & 1 deletion

@@ -16,4 +16,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-__version__: str = "4.0.0"
+__version__: str = "2815!4.0.0+affirm.dev1"
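
The new version string is valid PEP 440: "2815!" is a version epoch, which makes this build sort ahead of any upstream 4.0.x release, and "+affirm.dev1" is a local version label. A quick check with the packaging library:

from packaging.version import Version

v = Version("2815!4.0.0+affirm.dev1")
print(v.epoch)   # 2815
print(v.public)  # 2815!4.0.0 (the local label is excluded from the public version)
print(v.local)   # affirm.dev1
print(v > Version("4.0.0"))  # True: a nonzero epoch outranks the default epoch 0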

resource-managers/kubernetes/docker/src/main/dockerfiles/spark/entrypoint.sh

Lines changed: 2 additions & 1 deletion

@@ -115,4 +115,5 @@ case "$1" in
 esac

 # Execute the container CMD under tini for better hygiene
-exec /usr/bin/tini -s -- "${CMD[@]}"
+export SPARK_VERSION="4.0.0"
+exec /tini -s -- "${CMD[@]}"
