Skip to content

Commit f3ed63d

Browse files
authored
reduce image size, consolidate iceberg file location (#1038)
1 parent 20e809f commit f3ed63d

File tree

5 files changed

+70
-77
lines changed

5 files changed

+70
-77
lines changed

data-stacks/spark-on-eks/benchmarks/tpcds-iceberg-benchmarks/Dockerfile-comet-iceberg

Lines changed: 29 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,20 @@ EOF
102102
RUN sbt -Dsbt.override.build.repos=true -batch clean assembly
103103

104104
# ============================================================
105-
### Stage 3 – Runtime image (Graviton4 / ARM64)
105+
### Stage 3 – Build TPC-DS toolkit
106+
# ============================================================
107+
FROM ubuntu:22.04 AS tpcds_builder
108+
109+
RUN apt-get update && apt-get install -y --no-install-recommends \
110+
gcc libc6-dev make flex bison git ca-certificates && \
111+
rm -rf /var/lib/apt/lists/*
112+
113+
RUN git clone -b v4.0.0 https://github.com/heyujiao99/tpcds-kit.git /opt/tpcds-kit && \
114+
cd /opt/tpcds-kit/tools && \
115+
make OS=LINUX CFLAGS="-D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE -DYYDEBUG -DLINUX -g -Wall -fcommon"
116+
117+
# ============================================================
118+
### Stage 4 – Final runtime image (Graviton4 / ARM64)
106119
# ============================================================
107120
FROM apache/spark:3.5.8-scala2.12-java17-python3-ubuntu
108121

@@ -115,26 +128,19 @@ ARG SCALA_BINARY=2.12
115128
ARG SPARK_UID=185
116129
ARG SPARK_GID=185
117130

118-
ENV DEBIAN_FRONTEND=noninteractive
119131
ENV SPARK_HOME=/opt/spark
120132
ENV PATH=$SPARK_HOME/bin:$SPARK_HOME/sbin:$PATH
121-
# JDK path from apache/spark base image (Eclipse Temurin)
122133
ENV JAVA_HOME=/opt/java/openjdk
123134

124135
USER root
125136
WORKDIR ${SPARK_HOME}
126137

127-
# System dependencies for TPC-DS toolkit compilation
128-
RUN apt-get update && apt-get install -y --no-install-recommends \
129-
gcc make flex bison git wget curl ca-certificates && \
130-
rm -rf /var/lib/apt/lists/*
131-
132138
# Non-root Spark user — UID 185 matches Kubernetes runAsUser convention
133139
RUN (groupadd -g ${SPARK_GID} spark 2>/dev/null || true) && \
134140
(useradd -m -u ${SPARK_UID} -g ${SPARK_GID} -s /bin/bash spark 2>/dev/null || true)
135141

136-
# ── Hadoop upgrade to 3.4.3 ──────────────────────────────────────────────────
137-
# Remove all bundled Hadoop + legacy AWS SDK jars to prevent version conflicts
142+
# ── Hadoop upgrade + dependency JARs ─────────────────────────────────────────
143+
# Remove bundled Hadoop + legacy AWS SDK jars, then download replacements
138144
RUN cd ${SPARK_HOME}/jars && \
139145
find . -maxdepth 1 -type f -name 'hadoop-*.jar' -delete || true && \
140146
find . -maxdepth 1 -type f -name 'aws-java-sdk-*.jar' -delete || true && \
@@ -152,28 +158,17 @@ RUN cd ${SPARK_HOME}/jars && \
152158
wget -q https://repo1.maven.org/maven2/org/apache/spark/spark-hadoop-cloud_${SCALA_BINARY}/${SPARK_VERSION}/spark-hadoop-cloud_${SCALA_BINARY}-${SPARK_VERSION}.jar && \
153159
# XML deps for jackson-dataformat-xml
154160
wget -q https://repo1.maven.org/maven2/com/fasterxml/woodstox/woodstox-core/6.5.1/woodstox-core-6.5.1.jar && \
155-
wget -q https://repo1.maven.org/maven2/org/codehaus/woodstox/stax2-api/4.2.2/stax2-api-4.2.2.jar
156-
157-
# ── DataFusion Comet 0.14.0 ──────────────────────────────────────────────────
158-
RUN wget -q \
159-
https://repo1.maven.org/maven2/org/apache/datafusion/comet-spark-spark3.5_${SCALA_BINARY}/${COMET_VERSION}/comet-spark-spark3.5_${SCALA_BINARY}-${COMET_VERSION}.jar \
160-
-P ${SPARK_HOME}/jars/
161-
162-
# ── Apache Iceberg 1.10.1 ────────────────────────────────────────────────────
163-
# iceberg-spark-runtime: Spark/Iceberg integration (catalog, DPP, read/write)
164-
# iceberg-aws-bundle: Glue catalog implementation + S3FileIO
165-
RUN wget -q \
166-
https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime-3.5_${SCALA_BINARY}/${ICEBERG_VERSION}/iceberg-spark-runtime-3.5_${SCALA_BINARY}-${ICEBERG_VERSION}.jar \
167-
-P ${SPARK_HOME}/jars/ && \
168-
wget -q \
169-
https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-aws-bundle/${ICEBERG_VERSION}/iceberg-aws-bundle-${ICEBERG_VERSION}.jar \
170-
-P ${SPARK_HOME}/jars/
171-
172-
# ── TPC-DS toolkit v4.0.0 ────────────────────────────────────────────────────
173-
RUN git clone -b v4.0.0 https://github.com/heyujiao99/tpcds-kit.git /opt/tpcds-kit && \
174-
cd /opt/tpcds-kit/tools && \
175-
make OS=LINUX CFLAGS="-D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE -DYYDEBUG -DLINUX -g -Wall -fcommon" && \
176-
install -m 0755 dsdgen dsqgen /usr/local/bin
161+
wget -q https://repo1.maven.org/maven2/org/codehaus/woodstox/stax2-api/4.2.2/stax2-api-4.2.2.jar && \
162+
# DataFusion Comet
163+
wget -q https://repo1.maven.org/maven2/org/apache/datafusion/comet-spark-spark3.5_${SCALA_BINARY}/${COMET_VERSION}/comet-spark-spark3.5_${SCALA_BINARY}-${COMET_VERSION}.jar && \
164+
# Apache Iceberg (spark-runtime + aws-bundle)
165+
wget -q https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime-3.5_${SCALA_BINARY}/${ICEBERG_VERSION}/iceberg-spark-runtime-3.5_${SCALA_BINARY}-${ICEBERG_VERSION}.jar && \
166+
wget -q https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-aws-bundle/${ICEBERG_VERSION}/iceberg-aws-bundle-${ICEBERG_VERSION}.jar
167+
168+
# ── TPC-DS toolkit binaries ──────────────────────────────────────────────────
169+
COPY --from=tpcds_builder /opt/tpcds-kit /opt/tpcds-kit
170+
COPY --from=tpcds_builder /opt/tpcds-kit/tools/dsdgen /usr/local/bin/dsdgen
171+
COPY --from=tpcds_builder /opt/tpcds-kit/tools/dsqgen /usr/local/bin/dsqgen
177172

178173
# ── Benchmark JARs from build stages ─────────────────────────────────────────
179174
COPY --from=spark_sql_perf_builder /build/spark-sql-perf/target/scala-2.12/*.jar \
@@ -182,8 +177,7 @@ COPY --from=benchmark_builder /build/spark-k8s-benchmarks/target/scala-2.12/*.ja
182177
${SPARK_HOME}/jars/
183178

184179
# ── Parquet-to-Iceberg conversion script ─────────────────────────────────────
185-
RUN mkdir -p /opt/scripts
186-
COPY convert_parquet_to_iceberg.py /opt/scripts/convert_parquet_to_iceberg.py
180+
COPY --chown=spark:spark convert_parquet_to_iceberg.py /opt/scripts/convert_parquet_to_iceberg.py
187181

188182
# ── Permissions ──────────────────────────────────────────────────────────────
189183
RUN chown -R spark:spark ${SPARK_HOME} && \
@@ -193,4 +187,4 @@ RUN chown -R spark:spark ${SPARK_HOME} && \
193187
USER ${SPARK_UID}
194188
WORKDIR ${SPARK_HOME}
195189

196-
ENTRYPOINT ["/opt/entrypoint.sh"]
190+
ENTRYPOINT ["/opt/entrypoint.sh"]

data-stacks/spark-on-eks/benchmarks/tpcds-iceberg-benchmarks/convert_parquet_to_iceberg.py

Lines changed: 28 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -13,20 +13,20 @@
1313
1414
Usage (via SparkApplication YAML — all args come from SparkApplication.spec.arguments):
1515
spark-submit convert_parquet_to_iceberg.py \\
16-
--src-bucket <S3_BUCKET> \\
17-
--src-prefix TPCDS-TEST-3TB \\
18-
--glue-db tpcds_3tb \\
19-
--warehouse s3a://<S3_BUCKET>/TPCDS-TEST-3TB-ICEBERG \\
20-
--region us-west-2 \\
21-
[--table <single_table>] # optional: convert one table only
22-
23-
S3 layout (three dedicated prefixes — no cross-contamination between benchmark runs):
24-
Parquet-only data : s3://<bucket>/TPCDS-TEST-{N}TB/{table}/ (--src-prefix)
25-
Iceberg data files: s3://<bucket>/TPCDS-TEST-{N}TB-ICEBERG-DATA/{table}/ (--data-path)
26-
Iceberg metadata : s3://<bucket>/TPCDS-TEST-{N}TB-ICEBERG-WH/ (--warehouse)
16+
--src-bucket <S3_BUCKET> \\
17+
--src-prefix TPCDS-TEST-3TB \\
18+
--glue-db tpcds_3tb \\
19+
--iceberg-prefix TPCDS-TEST-3TB-ICEBERG \\
20+
--region us-west-2 \\
21+
[--table <single_table>] # optional: convert one table only
22+
23+
S3 layout (single Iceberg prefix with data/ and metadata/ subdirectories):
24+
Parquet-only data : s3://<bucket>/TPCDS-TEST-{N}TB/{table}/ (--src-prefix)
25+
Iceberg data files: s3://<bucket>/TPCDS-TEST-{N}TB-ICEBERG/data/{table}/ (derived)
26+
Iceberg metadata : s3://<bucket>/TPCDS-TEST-{N}TB-ICEBERG/metadata/ (derived)
2727
2828
To run at a different scale (3TB, 10TB), change --src-prefix, --glue-db,
29-
--warehouse, and --data-path in the SparkApplication YAML — no image rebuild needed.
29+
and --iceberg-prefix in the SparkApplication YAML — no image rebuild needed.
3030
3131
The job is idempotent — it checks existing Glue tables and skips any already converted.
3232
@@ -81,15 +81,14 @@
8181
def parse_args():
8282
parser = argparse.ArgumentParser(description="Convert TPC-DS Parquet → Iceberg (Glue catalog)")
8383
# ── Scale-specific args — only these change per benchmark scale in the YAML ─
84-
# Scale | --src-prefix | --glue-db | --warehouse | --data-path
85-
# 1TB | TPCDS-TEST-1TB | tpcds_1tb | s3a://<b>/TPCDS-TEST-1TB-ICEBERG-WH | s3a://<b>/TPCDS-TEST-1TB-ICEBERG-DATA
86-
# 3TB | TPCDS-TEST-3TB | tpcds_3tb | s3a://<b>/TPCDS-TEST-3TB-ICEBERG-WH | s3a://<b>/TPCDS-TEST-3TB-ICEBERG-DATA
87-
# 10TB | TPCDS-TEST-10TB | tpcds_10tb | s3a://<b>/TPCDS-TEST-10TB-ICEBERG-WH | s3a://<b>/TPCDS-TEST-10TB-ICEBERG-DATA
88-
parser.add_argument("--src-bucket", required=True, help="Source S3 bucket name (no s3a:// prefix)")
89-
parser.add_argument("--src-prefix", required=True, help="S3 prefix for source Parquet, e.g. TPCDS-TEST-3TB")
90-
parser.add_argument("--glue-db", required=True, help="Target Glue database name, e.g. tpcds_3tb")
91-
parser.add_argument("--warehouse", required=True, help="Iceberg metadata warehouse, e.g. s3a://<bucket>/TPCDS-TEST-3TB-ICEBERG-WH")
92-
parser.add_argument("--data-path", required=True, help="Iceberg data file root, e.g. s3a://<bucket>/TPCDS-TEST-3TB-ICEBERG-DATA")
84+
# Scale | --src-prefix | --glue-db | --iceberg-prefix
85+
# 1TB | TPCDS-TEST-1TB | tpcds_1tb | TPCDS-TEST-1TB-ICEBERG
86+
# 3TB | TPCDS-TEST-3TB | tpcds_3tb | TPCDS-TEST-3TB-ICEBERG
87+
# 10TB | TPCDS-TEST-10TB | tpcds_10tb | TPCDS-TEST-10TB-ICEBERG
88+
parser.add_argument("--src-bucket", required=True, help="Source S3 bucket name (no s3a:// prefix)")
89+
parser.add_argument("--src-prefix", required=True, help="S3 prefix for source Parquet, e.g. TPCDS-TEST-3TB")
90+
parser.add_argument("--glue-db", required=True, help="Target Glue database name, e.g. tpcds_3tb")
91+
parser.add_argument("--iceberg-prefix", required=True, help="S3 prefix for Iceberg output, e.g. TPCDS-TEST-3TB-ICEBERG")
9392
# ── Fixed args ───────────────────────────────────────────────────────────────
9493
parser.add_argument("--region", default="us-west-2", help="AWS region for Glue")
9594
parser.add_argument("--table", default=None, help="Convert a single table only (optional)")
@@ -115,10 +114,10 @@ def convert_table(spark, table_name, partition_col, src_path, glue_db, data_path
115114
CTAS: read Parquet from src_path, write as partitioned Iceberg table in Glue.
116115
Uses CREATE OR REPLACE so re-running is safe (drops old Iceberg snapshot).
117116
118-
Three fully separate S3 prefixes (set via YAML args — no image rebuild needed):
119-
src_path → --src-prefix (raw Parquet, read-only source)
120-
data_path → --data-path (Iceberg data files, e.g. TPCDS-TEST-3TB-ICEBERG-DATA)
121-
warehouse → --warehouse (Iceberg metadata/snapshots, e.g. TPCDS-TEST-3TB-ICEBERG-WH)
117+
S3 layout (derived from --iceberg-prefix):
118+
src_path → --src-prefix (raw Parquet, read-only source)
119+
data_path → <iceberg-prefix>/data (Iceberg data files)
120+
warehouse → <iceberg-prefix>/metadata (Iceberg metadata/snapshots)
122121
"""
123122
full_table = f"glue_catalog.{glue_db}.{table_name}"
124123

@@ -152,6 +151,8 @@ def main():
152151
args = parse_args()
153152

154153
src_base = f"s3a://{args.src_bucket}/{args.src_prefix}"
154+
warehouse = f"s3a://{args.src_bucket}/{args.iceberg_prefix}/metadata"
155+
data_path = f"s3a://{args.src_bucket}/{args.iceberg_prefix}/data"
155156

156157
spark = (
157158
SparkSession.builder
@@ -163,7 +164,7 @@ def main():
163164
"org.apache.iceberg.spark.SparkCatalog")
164165
.config("spark.sql.catalog.glue_catalog.catalog-impl",
165166
"org.apache.iceberg.aws.glue.GlueCatalog")
166-
.config("spark.sql.catalog.glue_catalog.warehouse", args.warehouse)
167+
.config("spark.sql.catalog.glue_catalog.warehouse", warehouse)
167168
.config("spark.sql.catalog.glue_catalog.io-impl",
168169
"org.apache.iceberg.aws.s3.S3FileIO")
169170
.config("spark.sql.catalog.glue_catalog.glue.region", args.region)
@@ -204,7 +205,7 @@ def main():
204205

205206
src_path = f"{src_base}/{table_name}"
206207
try:
207-
convert_table(spark, table_name, partition_col, src_path, args.glue_db, args.data_path)
208+
convert_table(spark, table_name, partition_col, src_path, args.glue_db, data_path)
208209
converted += 1
209210
except Exception as exc:
210211
print(f"[fail] {table_name}: {exc}")

data-stacks/spark-on-eks/benchmarks/tpcds-iceberg-benchmarks/tpcds-benchmark-iceberg-comet.yaml

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,8 @@
99
# 3. Image: public.ecr.aws/data-on-eks/spark-benchmark:3.5.8-tpcds4
1010
#
1111
# S3 layout:
12-
# Iceberg data : s3://${S3_BUCKET}/TPCDS-TEST-${SCALE}TB-ICEBERG-DATA/
13-
# Iceberg metadata : s3://${S3_BUCKET}/TPCDS-TEST-${SCALE}TB-ICEBERG-WH/
12+
# Iceberg data : s3://${S3_BUCKET}/TPCDS-TEST-${SCALE}TB-ICEBERG/data/
13+
# Iceberg metadata : s3://${S3_BUCKET}/TPCDS-TEST-${SCALE}TB-ICEBERG/metadata/
1414
# Results : s3://${S3_BUCKET}/TPCDS-TEST-${SCALE}TB-RESULT-COMET-ICEBERG-R8G/
1515
#
1616
# Usage:
@@ -29,15 +29,15 @@ metadata:
2929
spec:
3030
type: Scala
3131
mode: cluster
32-
image: "${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com/spark-benchmark-native:3.5.8-tpcds4"
32+
image: "public.ecr.aws/data-on-eks/spark-benchmark:3.5.8-tpcds4"
3333
imagePullPolicy: Always
3434
sparkVersion: "3.5.8"
3535
mainClass: com.k8s.spark.benchmark.BenchmarkSQL
3636
mainApplicationFile: "local:///opt/spark/jars/spark-k8s-benchmarks-assembly-1.0.0.jar"
3737
arguments:
3838
# arg[0] tpcdsDataDir — Iceberg data files root (used for TPCDSTables init;
3939
# format=iceberg bypasses path-based table creation, uses icebergDatabase instead)
40-
- "s3a://${S3_BUCKET}/TPCDS-TEST-${SCALE}TB-ICEBERG-DATA"
40+
- "s3a://${S3_BUCKET}/TPCDS-TEST-${SCALE}TB-ICEBERG/data"
4141
# arg[1] Results location
4242
- "s3a://${S3_BUCKET}/TPCDS-TEST-${SCALE}TB-RESULT-COMET-ICEBERG-R8G"
4343
# arg[2] Path to TPC-DS kit (dsdgen/dsqgen) in the image
@@ -169,7 +169,7 @@ spec:
169169
# Primary catalog — glue_catalog.tpcds_${SCALE}tb.<table>
170170
"spark.sql.catalog.glue_catalog": "org.apache.iceberg.spark.SparkCatalog"
171171
"spark.sql.catalog.glue_catalog.catalog-impl": "org.apache.iceberg.aws.glue.GlueCatalog"
172-
"spark.sql.catalog.glue_catalog.warehouse": "s3a://${S3_BUCKET}/TPCDS-TEST-${SCALE}TB-ICEBERG-WH"
172+
"spark.sql.catalog.glue_catalog.warehouse": "s3a://${S3_BUCKET}/TPCDS-TEST-${SCALE}TB-ICEBERG/metadata"
173173
"spark.sql.catalog.glue_catalog.io-impl": "org.apache.iceberg.aws.s3.S3FileIO"
174174
"spark.sql.catalog.glue_catalog.glue.region": "${AWS_REGION}"
175175
# SparkSessionCatalog v2 bridge — allows DPP to work with default catalog tables

data-stacks/spark-on-eks/benchmarks/tpcds-iceberg-benchmarks/tpcds-data-generation-3tb.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ metadata:
3232
spec:
3333
type: Scala
3434
mode: cluster
35-
image: "${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com/spark-benchmark-native:3.5.8-tpcds4"
35+
image: "public.ecr.aws/data-on-eks/spark-benchmark:3.5.8-tpcds4"
3636
imagePullPolicy: Always
3737
sparkVersion: "3.5.8"
3838
mainClass: com.k8s.spark.benchmark.DataGeneration

data-stacks/spark-on-eks/benchmarks/tpcds-iceberg-benchmarks/tpcds-parquet-to-iceberg.yaml

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,8 @@
1212
#
1313
# Scale-derived paths (auto-resolved from SCALE):
1414
# Source Parquet : s3://${S3_BUCKET}/TPCDS-TEST-${SCALE}TB/
15-
# Iceberg data : s3://${S3_BUCKET}/TPCDS-TEST-${SCALE}TB-ICEBERG-DATA/
16-
# Iceberg metadata: s3://${S3_BUCKET}/TPCDS-TEST-${SCALE}TB-ICEBERG-WH/
15+
# Iceberg data : s3://${S3_BUCKET}/TPCDS-TEST-${SCALE}TB-ICEBERG/data/
16+
# Iceberg metadata: s3://${S3_BUCKET}/TPCDS-TEST-${SCALE}TB-ICEBERG/metadata/
1717
# Glue database : tpcds_${SCALE}tb
1818
#
1919
# Usage:
@@ -41,7 +41,7 @@ metadata:
4141
spec:
4242
type: Python
4343
mode: cluster
44-
image: "${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com/spark-benchmark-native:3.5.8-tpcds4"
44+
image: "public.ecr.aws/data-on-eks/spark-benchmark:3.5.8-tpcds4"
4545
imagePullPolicy: Always
4646
sparkVersion: "3.5.8"
4747
mainApplicationFile: "local:///opt/scripts/convert_parquet_to_iceberg.py"
@@ -52,10 +52,8 @@ spec:
5252
- "TPCDS-TEST-${SCALE}TB"
5353
- "--glue-db"
5454
- "tpcds_${SCALE}tb"
55-
- "--warehouse"
56-
- "s3a://${S3_BUCKET}/TPCDS-TEST-${SCALE}TB-ICEBERG-WH"
57-
- "--data-path"
58-
- "s3a://${S3_BUCKET}/TPCDS-TEST-${SCALE}TB-ICEBERG-DATA"
55+
- "--iceberg-prefix"
56+
- "TPCDS-TEST-${SCALE}TB-ICEBERG"
5957
- "--region"
6058
- "${AWS_REGION}"
6159
restartPolicy:
@@ -135,7 +133,7 @@ spec:
135133
"spark.sql.extensions": "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions"
136134
"spark.sql.catalog.glue_catalog": "org.apache.iceberg.spark.SparkCatalog"
137135
"spark.sql.catalog.glue_catalog.catalog-impl": "org.apache.iceberg.aws.glue.GlueCatalog"
138-
"spark.sql.catalog.glue_catalog.warehouse": "s3a://${S3_BUCKET}/TPCDS-TEST-${SCALE}TB-ICEBERG-WH"
136+
"spark.sql.catalog.glue_catalog.warehouse": "s3a://${S3_BUCKET}/TPCDS-TEST-${SCALE}TB-ICEBERG/metadata"
139137
"spark.sql.catalog.glue_catalog.io-impl": "org.apache.iceberg.aws.s3.S3FileIO"
140138
"spark.sql.catalog.glue_catalog.glue.region": "${AWS_REGION}"
141139
"spark.sql.catalog.spark_catalog": "org.apache.iceberg.spark.SparkSessionCatalog"
@@ -165,7 +163,7 @@ spec:
165163
# shuffle.partitions: set high enough to avoid OOM on large fact tables;
166164
# AQE will auto-coalesce small partitions at runtime.
167165
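# Rule of thumb: ~800 × SCALE (1TB→800, 3TB→2400, 10TB→8000). This value is a literal, not derived from ${SCALE} — update it by hand when running at a different scale.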
168-
"spark.sql.shuffle.partitions": "800"
166+
"spark.sql.shuffle.partitions": "2400"
169167
# Large Iceberg writes — no per-file record cap (let Iceberg manage file sizing)
170168
"spark.sql.files.maxRecordsPerFile": "0"
171169

0 commit comments

Comments
 (0)