Skip to content

Commit 20e809f

Browse files
committed
TPCDS Iceberg benchmarks
Signed-off-by: vara-bonthu <vara.bonthu@gmail.com>
1 parent 7462369 commit 20e809f

File tree

8 files changed

+1313
-3
lines changed

8 files changed

+1313
-3
lines changed

data-stacks/spark-on-eks/benchmarks/benchmark-testdata-generation/tpcds-benchmark-data-generation-3t.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ spec:
2121
mainApplicationFile: local:///opt/spark/examples/jars/eks-spark-benchmark-assembly-1.0.jar
2222
arguments:
2323
# TPC-DS data location
24-
- "s3a://spark-on-eks-spark-logs-20251124184754110300000008/TPCDS-TEST-3TB"
24+
- "s3a://${S3_BUCKET}/TPCDS-TEST-3TB"
2525
# Path to kit in the docker image
2626
- "/opt/tpcds-kit/tools"
2727
# Data Format
@@ -46,7 +46,7 @@ spec:
4646

4747
# Spark Event logs
4848
"spark.eventLog.enabled": "true"
49-
"spark.eventLog.dir": "s3a://spark-on-eks-spark-logs-20251124184754110300000008/spark-event-logs"
49+
"spark.eventLog.dir": "s3a://${S3_BUCKET}/spark-event-logs"
5050
"spark.eventLog.rolling.enabled": "true"
5151
"spark.eventLog.rolling.maxFileSize": "64m"
5252

Lines changed: 196 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,196 @@
# ============================================================
# DataFusion Comet + Apache Iceberg + TPC-DS v4.0 benchmark image
# Target: AWS Graviton4 (r8g) — linux/arm64
#
# Bundles:
#   - Spark 3.5.8 + Scala 2.12 + Java 17
#   - DataFusion Comet 0.14.0
#   - Apache Iceberg 1.10.1 (spark-runtime + aws-bundle)
#   - Hadoop 3.4.3 + AWS SDK v2 (for S3A)
#   - TPC-DS toolkit v4.0.0 (dsdgen/dsqgen)
#   - spark-sql-perf (support-tpcds-v4.0 branch)
#   - spark-k8s-benchmarks (KubedAI/spark-k8s-benchmarks)
#   - Parquet-to-Iceberg conversion script (/opt/scripts/)
#   - UID 185 non-root Spark user
#
# Purpose: Run TPC-DS v4 benchmarks with Iceberg tables to enable
# Dynamic Partition Pruning (DPP) — addressing poor performance on
# partition-sensitive queries (q14, q23, q24, q39, q47, q57) when
# reading raw Parquet files without catalog statistics.
# ============================================================
# WARNING:
# This Dockerfile is provided for sample and demonstration purposes only.
# It is NOT intended for production use.
# ============================================================

# ============================================================
### Stage 1 – Build spark-sql-perf (TPC-DS v4 support)
# ============================================================
FROM hseeberger/scala-sbt:11.0.13_1.5.5_2.12.15 AS spark_sql_perf_builder
ARG SCALA_VERSION=2.12.15
ARG SPARK_VERSION=3.5.8
ARG SBT_VERSION=1.9.7

# Clone the support-tpcds-v4.0 fork (shallow — only the tip is needed)
RUN git clone --depth 1 --branch support-iceberg-tpcds-v4.0 \
      https://github.com/KubedAI/spark-sql-perf.git /build/spark-sql-perf
WORKDIR /build/spark-sql-perf

# Replace the upstream plugin set — only sbt-assembly is needed
RUN rm -f project/plugins.sbt && \
    mkdir -p project && \
    echo 'addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "2.2.0")' > project/plugins.sbt

# Pin the sbt version the launcher uses (independent of the image's sbt 1.5.5)
RUN echo "sbt.version=${SBT_VERSION}" > project/build.properties

# Replace the upstream build.sbt with a minimal one. The heredoc delimiter is
# unquoted on purpose so ${SCALA_VERSION}/${SPARK_VERSION} expand at build time.
RUN rm -f build.sbt && \
    cat > build.sbt <<SBT
name := "spark-sql-perf"
organization := "com.databricks"
scalaVersion := "${SCALA_VERSION}"
crossScalaVersions := Seq("${SCALA_VERSION}")

val sparkVer = "${SPARK_VERSION}"

libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-sql" % sparkVer % "provided",
  "org.apache.spark" %% "spark-hive" % sparkVer % "provided",
  "org.apache.spark" %% "spark-mllib" % sparkVer % "provided",
  "com.github.scopt" %% "scopt" % "3.7.1",
  "com.twitter" %% "util-jvm" % "6.45.0" % "provided",
  "org.scalatest" %% "scalatest" % "3.0.5" % Test,
  "org.yaml" % "snakeyaml" % "1.23"
)
SBT

# Restrict sbt to Maven Central plus the scala-sbt.org plugin Ivy repo
# (needed for sbt-assembly) — the historical Typesafe/Bintray resolvers
# referenced by older builds are dead. Delimiter quoted: no expansion wanted.
RUN mkdir -p ~/.sbt && \
    cat > ~/.sbt/repositories <<'EOF'
[repositories]
local
maven-central: https://repo1.maven.org/maven2/
typesafe-ivy-releases: https://repo.scala-sbt.org/scalasbt/sbt-plugin-releases/, [organization]/[module]/(scala_[scalaVersion]/)(sbt_[sbtVersion]/)[revision]/[type]s/[artifact](-[classifier]).[ext]
EOF

# -Dsbt.override.build.repos=true forces the repositories file above
RUN sbt -Dsbt.override.build.repos=true -batch clean package

# ============================================================
### Stage 2 – Build spark-k8s-benchmarks
# ============================================================
FROM hseeberger/scala-sbt:11.0.13_1.5.5_2.12.15 AS benchmark_builder
# NOTE(review): these ARGs are not referenced anywhere in this stage; they are
# kept for --build-arg CLI compatibility. Defaults aligned with Stage 1 and
# the base image (Scala 2.12.15) — this previously advertised 2.12.18.
ARG SCALA_VERSION=2.12.15
ARG SPARK_VERSION=3.5.8
ARG SBT_VERSION=1.9.9

# Clone the benchmark driver (shallow — only the tip is needed)
RUN git clone --depth 1 --branch support-iceberg-tpcds-v4.0 \
      https://github.com/KubedAI/spark-k8s-benchmarks.git /build/spark-k8s-benchmarks
WORKDIR /build/spark-k8s-benchmarks

# Stage 1's spark-sql-perf jar becomes an unmanaged dependency in lib/
RUN mkdir -p lib
COPY --from=spark_sql_perf_builder /build/spark-sql-perf/target/scala-2.12/*.jar lib/

# Restrict sbt to Maven Central plus the scala-sbt.org plugin Ivy repo
# (same resolver setup as Stage 1). Delimiter quoted: no expansion wanted.
RUN mkdir -p ~/.sbt && \
    cat > ~/.sbt/repositories <<'EOF'
[repositories]
local
maven-central: https://repo1.maven.org/maven2/
typesafe-ivy-releases: https://repo.scala-sbt.org/scalasbt/sbt-plugin-releases/, [organization]/[module]/(scala_[scalaVersion]/)(sbt_[sbtVersion]/)[revision]/[type]s/[artifact](-[classifier]).[ext]
EOF

# Build the fat jar (assembly) that Stage 3 ships
RUN sbt -Dsbt.override.build.repos=true -batch clean assembly

# ============================================================
### Stage 3 – Runtime image (Graviton4 / ARM64)
# ============================================================
FROM apache/spark:3.5.8-scala2.12-java17-python3-ubuntu

ARG SPARK_VERSION=3.5.8
ARG HADOOP_VERSION=3.4.3
ARG AWS_SDK_VERSION=2.30.31
ARG COMET_VERSION=0.14.0
ARG ICEBERG_VERSION=1.10.1
ARG SCALA_BINARY=2.12
ARG SPARK_UID=185
ARG SPARK_GID=185

ENV SPARK_HOME=/opt/spark
ENV PATH=$SPARK_HOME/bin:$SPARK_HOME/sbin:$PATH
# JDK path from the apache/spark base image (Eclipse Temurin)
ENV JAVA_HOME=/opt/java/openjdk

USER root
WORKDIR ${SPARK_HOME}

# System dependencies for TPC-DS toolkit compilation.
# DEBIAN_FRONTEND is set per-command rather than via ENV so the
# noninteractive flag does not leak into the runtime environment.
RUN DEBIAN_FRONTEND=noninteractive apt-get update && \
    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
      bison \
      ca-certificates \
      curl \
      flex \
      gcc \
      git \
      make \
      wget \
    && rm -rf /var/lib/apt/lists/*

# Non-root Spark user — UID 185 matches the Kubernetes runAsUser convention.
# "|| true" tolerates the base image already defining the user/group.
RUN (groupadd -g ${SPARK_GID} spark 2>/dev/null || true) && \
    (useradd -m -u ${SPARK_UID} -g ${SPARK_GID} -s /bin/bash spark 2>/dev/null || true)

# ── Hadoop upgrade to 3.4.3 ──────────────────────────────────────────────────
# Remove all bundled Hadoop + legacy AWS SDK v1 jars to prevent version
# conflicts, then fetch replacements straight into Spark's classpath.
# WORKDIR (not `cd`) scopes the downloads to ${SPARK_HOME}/jars.
# TODO(review): downloads are not checksum-verified — acceptable for this
# demo image, pin digests before any production use.
WORKDIR ${SPARK_HOME}/jars
RUN { find . -maxdepth 1 -type f -name 'hadoop-*.jar' -delete || true; } && \
    { find . -maxdepth 1 -type f -name 'aws-java-sdk-*.jar' -delete || true; } && \
    # Core Hadoop 3.4.3
    wget -q https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-common/${HADOOP_VERSION}/hadoop-common-${HADOOP_VERSION}.jar && \
    wget -q https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-auth/${HADOOP_VERSION}/hadoop-auth-${HADOOP_VERSION}.jar && \
    wget -q https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-client-api/${HADOOP_VERSION}/hadoop-client-api-${HADOOP_VERSION}.jar && \
    wget -q https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-client-runtime/${HADOOP_VERSION}/hadoop-client-runtime-${HADOOP_VERSION}.jar && \
    # S3A connector + AWS SDK v2 standalone bundle
    wget -q https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar && \
    wget -q https://repo1.maven.org/maven2/software/amazon/awssdk/bundle/${AWS_SDK_VERSION}/bundle-${AWS_SDK_VERSION}.jar && \
    # commons-configuration2 required by Hadoop 3.4.x
    wget -q https://repo1.maven.org/maven2/org/apache/commons/commons-configuration2/2.9.0/commons-configuration2-2.9.0.jar && \
    # Spark cloud committers
    wget -q https://repo1.maven.org/maven2/org/apache/spark/spark-hadoop-cloud_${SCALA_BINARY}/${SPARK_VERSION}/spark-hadoop-cloud_${SCALA_BINARY}-${SPARK_VERSION}.jar && \
    # XML deps for jackson-dataformat-xml
    wget -q https://repo1.maven.org/maven2/com/fasterxml/woodstox/woodstox-core/6.5.1/woodstox-core-6.5.1.jar && \
    wget -q https://repo1.maven.org/maven2/org/codehaus/woodstox/stax2-api/4.2.2/stax2-api-4.2.2.jar
WORKDIR ${SPARK_HOME}

# ── DataFusion Comet 0.14.0 ──────────────────────────────────────────────────
# Native accelerator plugin for Spark 3.5
RUN wget -q \
      https://repo1.maven.org/maven2/org/apache/datafusion/comet-spark-spark3.5_${SCALA_BINARY}/${COMET_VERSION}/comet-spark-spark3.5_${SCALA_BINARY}-${COMET_VERSION}.jar \
      -P ${SPARK_HOME}/jars/

# ── Apache Iceberg 1.10.1 ────────────────────────────────────────────────────
# iceberg-spark-runtime: Spark/Iceberg integration (catalog, DPP, read/write)
# iceberg-aws-bundle:    Glue catalog implementation + S3FileIO
RUN wget -q \
      https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime-3.5_${SCALA_BINARY}/${ICEBERG_VERSION}/iceberg-spark-runtime-3.5_${SCALA_BINARY}-${ICEBERG_VERSION}.jar \
      -P ${SPARK_HOME}/jars/ && \
    wget -q \
      https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-aws-bundle/${ICEBERG_VERSION}/iceberg-aws-bundle-${ICEBERG_VERSION}.jar \
      -P ${SPARK_HOME}/jars/

# ── TPC-DS toolkit v4.0.0 ────────────────────────────────────────────────────
# Compile dsdgen/dsqgen from source; `make -C` avoids an in-shell `cd`.
# -fcommon is required for the toolkit's legacy C under modern gcc defaults.
RUN git clone -b v4.0.0 https://github.com/heyujiao99/tpcds-kit.git /opt/tpcds-kit && \
    make -C /opt/tpcds-kit/tools OS=LINUX CFLAGS="-D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE -DYYDEBUG -DLINUX -g -Wall -fcommon" && \
    install -m 0755 /opt/tpcds-kit/tools/dsdgen /opt/tpcds-kit/tools/dsqgen /usr/local/bin

# ── Benchmark JARs from build stages ─────────────────────────────────────────
COPY --from=spark_sql_perf_builder /build/spark-sql-perf/target/scala-2.12/*.jar \
     ${SPARK_HOME}/jars/
COPY --from=benchmark_builder /build/spark-k8s-benchmarks/target/scala-2.12/*.jar \
     ${SPARK_HOME}/jars/

# ── Parquet-to-Iceberg conversion script ─────────────────────────────────────
RUN mkdir -p /opt/scripts
COPY convert_parquet_to_iceberg.py /opt/scripts/convert_parquet_to_iceberg.py

# ── Permissions ──────────────────────────────────────────────────────────────
# NOTE(review): recursive chown duplicates the chowned bytes in a new layer;
# tolerated for a demo image — prefer COPY --chown / targeted chown otherwise.
RUN chown -R spark:spark ${SPARK_HOME} && \
    chown -R spark:spark /opt/tpcds-kit && \
    chown -R spark:spark /opt/scripts

USER ${SPARK_UID}
WORKDIR ${SPARK_HOME}

# /opt/entrypoint.sh is provided by the apache/spark base image
ENTRYPOINT ["/opt/entrypoint.sh"]

0 commit comments

Comments
 (0)