Skip to content

Commit d5d01c9

Browse files
authored
[4.0preview][Kernel][Parquet Writer] Fix an issue with writing decimal as binary (#3233) (#3235)
## Description The number of bytes needed to calculate the max buffer size needed when writing the decimal type to Parquet is off by one. Resolved #3152 ## How was this patch tested? Added unit tests that read and write decimals with various precision and scales.
1 parent 8428e88 commit d5d01c9

File tree

8 files changed

+90
-6
lines changed

8 files changed

+90
-6
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
{"commitInfo":{"timestamp":1717778521300,"operation":"WRITE","operationParameters":{"mode":"ErrorIfExists","partitionBy":"[]"},"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"3","numOutputBytes":"9126"},"engineInfo":"Apache-Spark/3.5.0 Delta-Lake/3.3.0-SNAPSHOT","txnId":"5e3bfa16-cf0f-4d40-ad7d-b6426a6b4b7a"}}
2+
{"metaData":{"id":"7f750aff-9bf2-4e52-bfce-39811932da26","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"decimal_4_0\",\"type\":\"decimal(4,0)\",\"nullable\":true,\"metadata\":{}},{\"name\":\"decimal_7_0\",\"type\":\"decimal(7,0)\",\"nullable\":true,\"metadata\":{}},{\"name\":\"decimal_7_6\",\"type\":\"decimal(7,6)\",\"nullable\":true,\"metadata\":{}},{\"name\":\"decimal_12_0\",\"type\":\"decimal(12,0)\",\"nullable\":true,\"metadata\":{}},{\"name\":\"decimal_12_6\",\"type\":\"decimal(12,6)\",\"nullable\":true,\"metadata\":{}},{\"name\":\"decimal_15_0\",\"type\":\"decimal(15,0)\",\"nullable\":true,\"metadata\":{}},{\"name\":\"decimal_15_6\",\"type\":\"decimal(15,6)\",\"nullable\":true,\"metadata\":{}},{\"name\":\"decimal_15_12\",\"type\":\"decimal(15,12)\",\"nullable\":true,\"metadata\":{}},{\"name\":\"decimal_18_0\",\"type\":\"decimal(18,0)\",\"nullable\":true,\"metadata\":{}},{\"name\":\"decimal_18_6\",\"type\":\"decimal(18,6)\",\"nullable\":true,\"metadata\":{}},{\"name\":\"decimal_18_12\",\"type\":\"decimal(18,12)\",\"nullable\":true,\"metadata\":{}},{\"name\":\"decimal_25_0\",\"type\":\"decimal(25,0)\",\"nullable\":true,\"metadata\":{}},{\"name\":\"decimal_25_6\",\"type\":\"decimal(25,6)\",\"nullable\":true,\"metadata\":{}},{\"name\":\"decimal_25_12\",\"type\":\"decimal(25,12)\",\"nullable\":true,\"metadata\":{}},{\"name\":\"decimal_25_18\",\"type\":\"decimal(25,18)\",\"nullable\":true,\"metadata\":{}},{\"name\":\"decimal_25_24\",\"type\":\"decimal(25,24)\",\"nullable\":true,\"metadata\":{}},{\"name\":\"decimal_35_0\",\"type\":\"decimal(35,0)\",\"nullable\":true,\"metadata\":{}},{\"name\":\"decimal_35_6\",\"type\":\"decimal(35,6)\",\"nullable\":true,\"metadata\":{}},{\"name\":\"decimal_35_12\",\"type\":\"decimal(35,12)\",\"nullable\":true,\"metadata\":{}},{\"name\":\"decimal_35_18\",\"type\":\"decimal(35,18)\",\"nullable\":true,\"metadata\":{}},{\"name\":\"decimal_35_24\",\"type\":\"decimal(35,24)\",\"nullable\":true,\"metadata\":{}},{\"name\":\"decimal_35_30\",\"type\":\"decimal(35,30)\",\"nullable\":true,\"metadata\":{}},{\"name\":\"decimal_38_0\",\"type\":\"decimal(38,0)\",\"nullable\":true,\"metadata\":{}},{\"name\":\"decimal_38_6\",\"type\":\"decimal(38,6)\",\"nullable\":true,\"metadata\":{}},{\"name\":\"decimal_38_12\",\"type\":\"decimal(38,12)\",\"nullable\":true,\"metadata\":{}},{\"name\":\"decimal_38_18\",\"type\":\"decimal(38,18)\",\"nullable\":true,\"metadata\":{}},{\"name\":\"decimal_38_24\",\"type\":\"decimal(38,24)\",\"nullable\":true,\"metadata\":{}},{\"name\":\"decimal_38_30\",\"type\":\"decimal(38,30)\",\"nullable\":true,\"metadata\":{}},{\"name\":\"decimal_38_36\",\"type\":\"decimal(38,36)\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1717778519308}}
3+
{"protocol":{"minReaderVersion":1,"minWriterVersion":2}}
4+
{"add":{"path":"part-00000-bb4b3e59-ddb9-4d26-beaf-de9554e14517-c000.snappy.parquet","partitionValues":{},"size":9126,"modificationTime":1717778521237,"dataChange":true,"stats":"{\"numRecords\":3,\"minValues\":{\"decimal_4_0\":-13,\"decimal_7_0\":0,\"decimal_12_0\":0,\"decimal_12_6\":-0.000098,\"decimal_15_0\":-157,\"decimal_15_6\":-3.346000,\"decimal_15_12\":-0.002162000000,\"decimal_18_0\":0,\"decimal_18_6\":-22641.000000,\"decimal_18_12\":-5.190000000000,\"decimal_25_0\":0,\"decimal_25_6\":-0.000013,\"decimal_25_12\":-3.1661E-8,\"decimal_25_18\":-24199.000000000000000000,\"decimal_35_0\":0,\"decimal_35_6\":-0.000161,\"decimal_35_12\":-2.59176E-7,\"decimal_35_18\":-1.36744000E-10,\"decimal_35_24\":-22827907.000000000000000000000000,\"decimal_35_30\":-32805.309000000000000000000000000000,\"decimal_38_0\":-17,\"decimal_38_6\":-0.027994,\"decimal_38_12\":-0.000024695819,\"decimal_38_18\":-4.614771000E-9,\"decimal_38_24\":-9.718032000000E-12,\"decimal_38_30\":-2.6626087000000000E-14,\"decimal_38_36\":-2.9546424000000000000E-17},\"maxValues\":{\"decimal_4_0\":4,\"decimal_7_0\":0,\"decimal_12_0\":0,\"decimal_12_6\":0.000062,\"decimal_15_0\":481,\"decimal_15_6\":3.302000,\"decimal_15_12\":0.001469000000,\"decimal_18_0\":0,\"decimal_18_6\":7998.000000,\"decimal_18_12\":10.994000000000,\"decimal_25_0\":0,\"decimal_25_6\":0.000021,\"decimal_25_12\":5.925E-9,\"decimal_25_18\":234942.000000000000000000,\"decimal_35_0\":0,\"decimal_35_6\":0.000161,\"decimal_35_12\":1.65519E-7,\"decimal_35_18\":1.52896000E-10,\"decimal_35_24\":14797356.000000000000000000000000,\"decimal_35_30\":8083.687000000000000000000000000000,\"decimal_38_0\":26,\"decimal_38_6\":0.021882,\"decimal_38_12\":0.000032950993,\"decimal_38_18\":1.2783803000E-8,\"decimal_38_24\":2.395564000000E-12,\"decimal_38_30\":2.9414203000000000E-14,\"decimal_38_36\":3.241836000000000000E-18},\"nullCount\":{\"decimal_4_0\":1,\"decimal_7_0\":1,\"decimal_7_6\":3,\"decimal_12_0\":1,\"decimal_12_6\":1,\"decimal_15_0\":1,\"decimal_15_6\":1,\"decimal_15_12\":1,\"decimal_18_0\":1,\"decimal_18_6\":1,\"decimal_18_12\":1,\"decimal_25_0\":1,\"decimal_25_6\":1,\"decimal_25_12\":1,\"decimal_25_18\":1,\"decimal_25_24\":3,\"decimal_35_0\":1,\"decimal_35_6\":1,\"decimal_35_12\":1,\"decimal_35_18\":1,\"decimal_35_24\":1,\"decimal_35_30\":1,\"decimal_38_0\":1,\"decimal_38_6\":1,\"decimal_38_12\":1,\"decimal_38_18\":1,\"decimal_38_24\":1,\"decimal_38_30\":1,\"decimal_38_36\":1}}"}}

connectors/golden-tables/src/test/scala/io/delta/golden/GoldenTables.scala

+50-2
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,13 @@
1717
package io.delta.golden
1818

1919
import java.io.File
20-
import java.math.{BigDecimal => JBigDecimal}
20+
import java.math.{BigInteger, BigDecimal => JBigDecimal}
2121
import java.sql.Timestamp
2222
import java.time.ZoneOffset.UTC
2323
import java.time.LocalDateTime
24-
import java.util.{Locale, TimeZone}
24+
import java.util.{Locale, Random, TimeZone}
2525

26+
import scala.collection.mutable.ArrayBuffer
2627
import scala.concurrent.duration._
2728
import scala.language.implicitConversions
2829

@@ -1235,6 +1236,53 @@ class GoldenTables extends QueryTest with SharedSparkSession {
12351236
}
12361237
}
12371238

1239+
generateGoldenTable("decimal-various-scale-precision") { tablePath =>
1240+
val fields = ArrayBuffer[StructField]()
1241+
Seq(0, 4, 7, 12, 15, 18, 25, 35, 38).foreach { precision =>
1242+
Seq.range(start = 0, end = precision, step = 6).foreach { scale =>
1243+
fields.append(
1244+
StructField(s"decimal_${precision}_${scale}", DecimalType(precision, scale)))
1245+
}
1246+
}
1247+
1248+
val schema = StructType(fields)
1249+
1250+
val random = new Random(27 /* seed */)
1251+
def generateRandomBigDecimal(precision: Int, scale: Int): JBigDecimal = {
1252+
// Generate a random BigInteger with the specified precision
1253+
val unscaledValue = new BigInteger(precision, random)
1254+
1255+
// Create a BigDecimal with the unscaled value and the specified scale
1256+
new JBigDecimal(unscaledValue, scale)
1257+
}
1258+
1259+
val rows = ArrayBuffer[Row]()
1260+
Seq.range(start = 0, end = 3).foreach { i =>
1261+
val rowValues = ArrayBuffer[BigDecimal]()
1262+
Seq(0, 4, 7, 12, 15, 18, 25, 35, 38).foreach { precision =>
1263+
Seq.range(start = 0, end = precision, step = 3).foreach { scale =>
1264+
i match {
1265+
case 0 =>
1266+
rowValues.append(null)
1267+
case 1 =>
1268+
// Generate a positive random BigDecimal with the specified precision and scale
1269+
rowValues.append(generateRandomBigDecimal(precision, scale))
1270+
case 2 =>
1271+
// Generate a negative random BigDecimal with the specified precision and scale
1272+
rowValues.append(generateRandomBigDecimal(precision, scale).negate())
1273+
}
1274+
}
1275+
}
1276+
rows.append(Row(rowValues: _*))
1277+
}
1278+
1279+
spark.createDataFrame(spark.sparkContext.parallelize(rows), schema)
1280+
.repartition(1)
1281+
.write
1282+
.format("delta")
1283+
.save(tablePath)
1284+
}
1285+
12381286
for (parquetFormat <- Seq("v1", "v2")) {
12391287
// PARQUET_1_0 doesn't support dictionary encoding for FIXED_LEN_BYTE_ARRAY (only PARQUET_2_0)
12401288
generateGoldenTable(s"parquet-decimal-dictionaries-$parquetFormat") { tablePath =>

kernel/kernel-defaults/src/main/java/io/delta/kernel/defaults/internal/parquet/ParquetSchemaUtils.java

+9-4
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,10 @@
2020
import static java.lang.String.format;
2121

2222
import org.apache.parquet.schema.*;
23+
import org.apache.parquet.schema.LogicalTypeAnnotation.DecimalLogicalTypeAnnotation;
2324
import org.apache.parquet.schema.Type.Repetition;
2425
import static org.apache.parquet.schema.LogicalTypeAnnotation.TimeUnit.MICROS;
26+
import static org.apache.parquet.schema.LogicalTypeAnnotation.decimalType;
2527
import static org.apache.parquet.schema.LogicalTypeAnnotation.timestampType;
2628
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.*;
2729
import static org.apache.parquet.schema.Type.Repetition.OPTIONAL;
@@ -54,7 +56,7 @@ class ParquetSchemaUtils {
5456

5557
static {
5658
List<Integer> maxBytesPerPrecision = new ArrayList<>();
57-
for (int i = 1; i <= 38; i++) {
59+
for (int i = 0; i <= 38; i++) {
5860
int numBytes = 1;
5961
while (Math.pow(2.0, 8 * numBytes - 1) < Math.pow(10.0, i)) {
6062
numBytes += 1;
@@ -205,18 +207,21 @@ private static Type toParquetType(
205207
DecimalType decimalType = (DecimalType) dataType;
206208
int precision = decimalType.getPrecision();
207209
int scale = decimalType.getScale();
210+
// DecimalType constructor already has checks to make sure the precision and scale are
211+
// within the valid range. No need to check them again.
208212

213+
DecimalLogicalTypeAnnotation decimalAnnotation = decimalType(scale, precision);
209214
if (precision <= DECIMAL_MAX_DIGITS_IN_INT) {
210215
type = primitive(INT32, repetition)
211-
.as(LogicalTypeAnnotation.decimalType(scale, precision))
216+
.as(decimalAnnotation)
212217
.named(name);
213218
} else if (precision <= DECIMAL_MAX_DIGITS_IN_LONG) {
214219
type = primitive(INT64, repetition)
215-
.as(LogicalTypeAnnotation.decimalType(scale, precision))
220+
.as(decimalAnnotation)
216221
.named(name);
217222
} else {
218223
type = primitive(FIXED_LEN_BYTE_ARRAY, repetition)
219-
.as(LogicalTypeAnnotation.decimalType(scale, precision))
224+
.as(decimalAnnotation)
220225
.length(MAX_BYTES_PER_PRECISION.get(precision))
221226
.named(name);
222227
}

kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/DeltaTableReadsSuite.scala

+12
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,18 @@ class DeltaTableReadsSuite extends AnyFunSuite with TestUtils {
184184
}
185185
}
186186

187+
test(s"end to end: reading decimal-various-scale-precision") {
188+
val tablePath = goldenTablePath("decimal-various-scale-precision")
189+
val expResults = spark.sql(s"SELECT * FROM delta.`$tablePath`")
190+
.collect()
191+
.map(TestRow(_))
192+
193+
checkTable(
194+
path = goldenTablePath("decimal-various-scale-precision"),
195+
expectedAnswer = expResults
196+
)
197+
}
198+
187199
//////////////////////////////////////////////////////////////////////////////////
188200
// Table/Snapshot tests
189201
//////////////////////////////////////////////////////////////////////////////////

kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/internal/parquet/ParquetFileWriterSuite.scala

+15
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,21 @@ class ParquetFileWriterSuite extends AnyFunSuite
153153
),
154154
4 // how many columns have the stats collected from given list above
155155
)
156+
},
157+
// Decimal types with various precision and scales
158+
Seq((10000, 1)).map {
159+
case (targetFileSize, expParquetFileCount) =>
160+
(
161+
"write decimal various scales and precision (with stats)", // test name
162+
"decimal-various-scale-precision",
163+
targetFileSize,
164+
expParquetFileCount,
165+
3, /* expected number of rows written to Parquet files */
166+
Option.empty[Predicate], // predicate for filtering what rows to write to parquet files
167+
leafLevelPrimitiveColumns(
168+
Seq.empty, tableSchema(goldenTablePath("decimal-various-scale-precision"))),
169+
29 // how many columns have the stats collected from given list above
170+
)
156171
}
157172
).flatten.foreach {
158173
case (name, input, fileSize, expFileCount, expRowCount, predicate, statsCols, expStatsColCnt) =>

0 commit comments

Comments
 (0)