# syntax=docker/dockerfile:1
# ============================================================
# DataFusion Comet + Apache Iceberg + TPC-DS v4.0 benchmark image
# Target: AWS Graviton4 (r8g) — linux/arm64
#
# Bundles:
# - Spark 3.5.8 + Scala 2.12 + Java 17
# - DataFusion Comet 0.14.0
# - Apache Iceberg 1.10.1 (spark-runtime + aws-bundle)
# - Hadoop 3.4.3 + AWS SDK v2 (for S3A)
# - TPC-DS toolkit v4.0.0 (dsdgen/dsqgen)
# - spark-sql-perf (support-tpcds-v4.0 branch)
# - spark-k8s-benchmarks (KubedAI/spark-k8s-benchmarks)
# - Parquet-to-Iceberg conversion script (/opt/scripts/)
# - UID 185 non-root Spark user
#
# Purpose: Run TPC-DS v4 benchmarks with Iceberg tables to enable
# Dynamic Partition Pruning (DPP) — addressing poor performance on
# partition-sensitive queries (q14, q23, q24, q39, q47, q57) when
# reading raw Parquet files without catalog statistics.
#
# NOTE: the `# syntax` directive above pins the BuildKit frontend;
# this file uses RUN here-documents, which require dockerfile:1.4+.
# ============================================================
# WARNING:
# This Dockerfile is provided for sample and demonstration purposes only.
# It is NOT intended for production use.
# ============================================================
# ============================================================
### Stage 1 – Build spark-sql-perf (TPC-DS v4 support)
# ============================================================
FROM hseeberger/scala-sbt:11.0.13_1.5.5_2.12.15 AS spark_sql_perf_builder
ARG SCALA_VERSION=2.12.15
ARG SPARK_VERSION=3.5.8
ARG SBT_VERSION=1.9.7

# Clone the support-tpcds-v4.0 fork. --depth 1 + --branch keeps the layer
# small: only the tip of the branch is needed to build.
RUN git clone --depth 1 --branch support-iceberg-tpcds-v4.0 \
    https://github.com/KubedAI/spark-sql-perf.git /build/spark-sql-perf
WORKDIR /build/spark-sql-perf

# Replace the repo's build definition in ONE atomic layer:
#   - project/plugins.sbt      : only sbt-assembly is needed
#   - project/build.properties : pin the sbt launcher version
#   - build.sbt                : minimal module pinned to our Spark/Scala
# The <<SBT delimiter is deliberately UNQUOTED so BuildKit expands
# ${SCALA_VERSION} / ${SPARK_VERSION} into the generated build.sbt.
RUN rm -f project/plugins.sbt build.sbt && \
    mkdir -p project && \
    echo 'addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "2.2.0")' > project/plugins.sbt && \
    echo "sbt.version=${SBT_VERSION}" > project/build.properties && \
    cat > build.sbt <<SBT
name := "spark-sql-perf"
organization := "com.databricks"
scalaVersion := "${SCALA_VERSION}"
crossScalaVersions := Seq("${SCALA_VERSION}")

val sparkVer = "${SPARK_VERSION}"

libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-sql" % sparkVer % "provided",
  "org.apache.spark" %% "spark-hive" % sparkVer % "provided",
  "org.apache.spark" %% "spark-mllib" % sparkVer % "provided",
  "com.github.scopt" %% "scopt" % "3.7.1",
  "com.twitter" %% "util-jvm" % "6.45.0" % "provided",
  "org.scalatest" %% "scalatest" % "3.0.5" % Test,
  "org.yaml" % "snakeyaml" % "1.23"
)
SBT

# Resolve only from Maven Central (+ the sbt plugin releases repo) — the
# legacy Typesafe/Bintray Ivy repos are dead and break resolution.
# <<'EOF' is QUOTED on purpose: the Ivy patterns ([organization], …) must
# be written verbatim, not expanded by BuildKit.
RUN mkdir -p ~/.sbt && \
    cat > ~/.sbt/repositories <<'EOF'
[repositories]
local
maven-central: https://repo1.maven.org/maven2/
typesafe-ivy-releases: https://repo.scala-sbt.org/scalasbt/sbt-plugin-releases/, [organization]/[module]/(scala_[scalaVersion]/)(sbt_[sbtVersion]/)[revision]/[type]s/[artifact](-[classifier]).[ext]
EOF

# -Dsbt.override.build.repos=true forces sbt to use ~/.sbt/repositories only.
RUN sbt -Dsbt.override.build.repos=true -batch clean package
| 78 | + |
# ============================================================
### Stage 2 – Build spark-k8s-benchmarks
# ============================================================
FROM hseeberger/scala-sbt:11.0.13_1.5.5_2.12.15 AS benchmark_builder
# Keep toolchain ARG defaults consistent with the base image tag and
# Stage 1 (the previous 2.12.18 default disagreed with the 2.12.15
# toolchain actually installed in the builder image).
ARG SCALA_VERSION=2.12.15
ARG SPARK_VERSION=3.5.8
ARG SBT_VERSION=1.9.9

# Shallow clone of the benchmark harness, pinned to the Iceberg/TPC-DS v4 branch.
RUN git clone --depth 1 --branch support-iceberg-tpcds-v4.0 \
    https://github.com/KubedAI/spark-k8s-benchmarks.git /build/spark-k8s-benchmarks
WORKDIR /build/spark-k8s-benchmarks

# Unmanaged dependency: the spark-sql-perf jar produced by Stage 1 is
# dropped into lib/, where sbt picks it up automatically.
RUN mkdir -p lib
COPY --from=spark_sql_perf_builder /build/spark-sql-perf/target/scala-2.12/*.jar lib/

# Resolve only from Maven Central (+ the sbt plugin releases repo) — the
# legacy Typesafe/Bintray Ivy repos are dead and break resolution.
# <<'EOF' is QUOTED: the Ivy patterns must be written verbatim.
RUN mkdir -p ~/.sbt && \
    cat > ~/.sbt/repositories <<'EOF'
[repositories]
local
maven-central: https://repo1.maven.org/maven2/
typesafe-ivy-releases: https://repo.scala-sbt.org/scalasbt/sbt-plugin-releases/, [organization]/[module]/(scala_[scalaVersion]/)(sbt_[sbtVersion]/)[revision]/[type]s/[artifact](-[classifier]).[ext]
EOF

# Build the fat jar; -Dsbt.override.build.repos=true forces the repo list above.
RUN sbt -Dsbt.override.build.repos=true -batch clean assembly
| 103 | + |
# ============================================================
### Stage 3 – Runtime image (Graviton4 / ARM64)
# ============================================================
FROM apache/spark:3.5.8-scala2.12-java17-python3-ubuntu

ARG SPARK_VERSION=3.5.8
ARG HADOOP_VERSION=3.4.3
ARG AWS_SDK_VERSION=2.30.31
ARG COMET_VERSION=0.14.0
ARG ICEBERG_VERSION=1.10.1
ARG SCALA_BINARY=2.12
ARG SPARK_UID=185
ARG SPARK_GID=185

# Build-time only: silence interactive apt prompts. Declared as ARG (not
# ENV) so the setting does NOT persist into the runtime container
# environment; ARGs are visible to RUN within this stage.
ARG DEBIAN_FRONTEND=noninteractive

ENV SPARK_HOME=/opt/spark
ENV PATH=$SPARK_HOME/bin:$SPARK_HOME/sbin:$PATH
# JDK path from apache/spark base image (Eclipse Temurin)
ENV JAVA_HOME=/opt/java/openjdk

USER root
WORKDIR ${SPARK_HOME}

# System dependencies for TPC-DS toolkit compilation plus fetch helpers.
# NOTE(review): the compilers remain in the final image — acceptable for a
# demo image; a production image would build dsdgen/dsqgen in its own stage.
RUN apt-get update && apt-get install -y --no-install-recommends \
    bison ca-certificates curl flex gcc git make wget && \
    rm -rf /var/lib/apt/lists/*

# Non-root Spark user — UID 185 matches the Kubernetes runAsUser convention.
# `|| true` tolerates the group/user already existing in the base image.
RUN (groupadd -g ${SPARK_GID} spark 2>/dev/null || true) && \
    (useradd -m -u ${SPARK_UID} -g ${SPARK_GID} -s /bin/bash spark 2>/dev/null || true)

# ── Hadoop upgrade to 3.4.3 ──────────────────────────────────────────────────
# Remove all bundled Hadoop + legacy AWS SDK v1 jars FIRST so only one
# Hadoop version is ever on the classpath, then fetch the replacements.
# NOTE(review): jars are fetched without checksum verification — fine for a
# demo; pin digests (or build via a dependency manager) for production.
RUN cd ${SPARK_HOME}/jars && \
    find . -maxdepth 1 -type f -name 'hadoop-*.jar' -delete || true && \
    find . -maxdepth 1 -type f -name 'aws-java-sdk-*.jar' -delete || true && \
    # Core Hadoop 3.4.3
    wget -q https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-common/${HADOOP_VERSION}/hadoop-common-${HADOOP_VERSION}.jar && \
    wget -q https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-auth/${HADOOP_VERSION}/hadoop-auth-${HADOOP_VERSION}.jar && \
    wget -q https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-client-api/${HADOOP_VERSION}/hadoop-client-api-${HADOOP_VERSION}.jar && \
    wget -q https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-client-runtime/${HADOOP_VERSION}/hadoop-client-runtime-${HADOOP_VERSION}.jar && \
    # S3A connector + AWS SDK v2 standalone bundle
    wget -q https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar && \
    wget -q https://repo1.maven.org/maven2/software/amazon/awssdk/bundle/${AWS_SDK_VERSION}/bundle-${AWS_SDK_VERSION}.jar && \
    # commons-configuration2 required by Hadoop 3.4.x
    wget -q https://repo1.maven.org/maven2/org/apache/commons/commons-configuration2/2.9.0/commons-configuration2-2.9.0.jar && \
    # Spark cloud committers
    wget -q https://repo1.maven.org/maven2/org/apache/spark/spark-hadoop-cloud_${SCALA_BINARY}/${SPARK_VERSION}/spark-hadoop-cloud_${SCALA_BINARY}-${SPARK_VERSION}.jar && \
    # XML deps for jackson-dataformat-xml
    wget -q https://repo1.maven.org/maven2/com/fasterxml/woodstox/woodstox-core/6.5.1/woodstox-core-6.5.1.jar && \
    wget -q https://repo1.maven.org/maven2/org/codehaus/woodstox/stax2-api/4.2.2/stax2-api-4.2.2.jar

# ── DataFusion Comet 0.14.0 ──────────────────────────────────────────────────
# Native accelerated Spark plugin; single runtime jar on the classpath.
RUN wget -q \
    https://repo1.maven.org/maven2/org/apache/datafusion/comet-spark-spark3.5_${SCALA_BINARY}/${COMET_VERSION}/comet-spark-spark3.5_${SCALA_BINARY}-${COMET_VERSION}.jar \
    -P ${SPARK_HOME}/jars/

# ── Apache Iceberg 1.10.1 ────────────────────────────────────────────────────
# iceberg-spark-runtime: Spark/Iceberg integration (catalog, DPP, read/write)
# iceberg-aws-bundle: Glue catalog implementation + S3FileIO
RUN wget -q \
    https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime-3.5_${SCALA_BINARY}/${ICEBERG_VERSION}/iceberg-spark-runtime-3.5_${SCALA_BINARY}-${ICEBERG_VERSION}.jar \
    -P ${SPARK_HOME}/jars/ && \
    wget -q \
    https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-aws-bundle/${ICEBERG_VERSION}/iceberg-aws-bundle-${ICEBERG_VERSION}.jar \
    -P ${SPARK_HOME}/jars/

# ── TPC-DS toolkit v4.0.0 ────────────────────────────────────────────────────
# Shallow, tag-pinned clone; dsdgen/dsqgen compiled and installed on PATH.
# -fcommon is required to build the old C sources with modern gcc defaults.
RUN git clone --depth 1 -b v4.0.0 https://github.com/heyujiao99/tpcds-kit.git /opt/tpcds-kit && \
    cd /opt/tpcds-kit/tools && \
    make OS=LINUX CFLAGS="-D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE -DYYDEBUG -DLINUX -g -Wall -fcommon" && \
    install -m 0755 dsdgen dsqgen /usr/local/bin

# ── Benchmark JARs from build stages ─────────────────────────────────────────
# COPY --chown sets ownership at copy time — no extra chown layer needed.
COPY --from=spark_sql_perf_builder --chown=${SPARK_UID}:${SPARK_GID} \
    /build/spark-sql-perf/target/scala-2.12/*.jar ${SPARK_HOME}/jars/
COPY --from=benchmark_builder --chown=${SPARK_UID}:${SPARK_GID} \
    /build/spark-k8s-benchmarks/target/scala-2.12/*.jar ${SPARK_HOME}/jars/

# ── Parquet-to-Iceberg conversion script ─────────────────────────────────────
# COPY creates /opt/scripts (with the requested ownership) automatically.
COPY --chown=${SPARK_UID}:${SPARK_GID} \
    convert_parquet_to_iceberg.py /opt/scripts/convert_parquet_to_iceberg.py

# ── Permissions ──────────────────────────────────────────────────────────────
# Chown only the small trees created above. A blanket
# `chown -R spark:spark ${SPARK_HOME}` would duplicate everything under
# SPARK_HOME into a new layer; the wget-fetched jars stay root-owned but
# world-readable, which is sufficient for the spark user to load them.
# NOTE(review): if a workload writes under ${SPARK_HOME}, restore a
# targeted chown for that subdirectory only.
RUN chown -R ${SPARK_UID}:${SPARK_GID} /opt/tpcds-kit /opt/scripts

# Run as the non-root spark user from here on.
USER ${SPARK_UID}
WORKDIR ${SPARK_HOME}

# Exec-form entrypoint provided by the apache/spark base image.
ENTRYPOINT ["/opt/entrypoint.sh"]