Open
Description
Environment
Delta-rs version:
0.24.0
Binding:
Python
Environment:
- Cloud provider: N/A
- OS: MacOS 15.2
- Other:
Bug
What happened:
When table stats are calculated, only the first 32 columns are included. However, if a column has nested fields, these fields count towards the 32 limit.
Additionally, stats are not calculated for a column that is a list of structs (though that may be intended).
What you expected to happen:
That a nested column counts only once towards the limit, and that table stats are calculated for the first 32 columns at the root level.
How to reproduce it:
# Minimal reproduction: write a one-row Delta table whose second column is a
# list of structs with 31 fields, then read it back filtering on "year".
import polars as pl
from deltalake import DeltaTable, write_deltalake
from polars import Int64, List, Schema, String, Struct

# Nested struct fields are named "2" through "32": the first four ("2".."5")
# are Int64 and the remaining 27 ("6".."32") are String.
nested_fields = {
    str(n): (Int64 if n <= 5 else String) for n in range(2, 33)
}

schema = Schema(
    {
        "1": String,
        "nested": List(Struct(nested_fields)),
        "year": Int64,
        "month": Int64,
        "day": Int64,
    }
)

# A single row; the nested list is left empty.
rows = {
    "1": ["foo"],
    "nested": [[]],
    "year": [2024],
    "month": [12],
    "day": [1],
}
df = pl.DataFrame(rows, schema=schema)

write_deltalake(
    "my_temp_table",
    df.to_arrow(),
    mode="overwrite",
    schema_mode="overwrite",
)

# Read the table back through a pyarrow dataset and filter on a root-level
# column that comes after the nested column in the schema.
ds = DeltaTable("my_temp_table").to_pyarrow_dataset()
year_is_2024 = pl.col("year") == 2024
result = pl.scan_pyarrow_dataset(ds).filter(year_is_2024).collect()
print(result)
# shape: (0, 5)
# ┌─────┬──────────────────┬──────┬───────┬─────┐
# │ 1 ┆ nested ┆ year ┆ month ┆ day │
# │ --- ┆ --- ┆ --- ┆ --- ┆ --- │
# │ str ┆ list[struct[31]] ┆ i64 ┆ i64 ┆ i64 │
# ╞═════╪══════════════════╪══════╪═══════╪═════╡
# └─────┴──────────────────┴──────┴───────┴─────┘
The resulting transaction file is shown below. The interesting part is the `stats` field of the `add` entry: it contains values only for column `1`, and nothing for `year`, `month`, or `day`.
{
"protocol": {
"minReaderVersion": 1,
"minWriterVersion": 2
}
}
{
"metaData": {
"id": "3b36e7e3-abd2-4eac-a39a-f6fbfd5f39c6",
"name": null,
"description": null,
"format": {
"provider": "parquet",
"options": {}
},
"schemaString": "{\"type\":\"struct\",\"fields\":[{\"name\":\"1\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"nested\",\"type\":{\"type\":\"array\",\"elementType\":{\"type\":\"struct\",\"fields\":[{\"name\":\"2\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"3\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"4\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"5\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"6\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"7\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"8\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"9\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"10\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"11\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"12\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"13\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"14\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"15\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"16\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"17\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"18\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"19\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"20\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"21\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"22\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"23\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"24\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"25\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"26\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"27\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"28\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"29\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"30\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"31\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"32\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]},\"containsNull\":true},\"nullable\":true,\"metadata\":{}},{\"name\":\"year\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"month\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"day\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}}]}",
"partitionColumns": [],
"createdTime": 1738246259519,
"configuration": {}
}
}
{
"add": {
"path": "part-00001-f9991ba8-46d0-4fa5-bae2-6ff3bfed4c56-c000.snappy.parquet",
"partitionValues": {},
"size": 8860,
"modificationTime": 1738246259565,
"dataChange": true,
"stats": "{\"numRecords\":1,\"minValues\":{\"1\":\"foo\"},\"maxValues\":{\"1\":\"foo\"},\"nullCount\":{\"1\":0}}",
"tags": null,
"deletionVector": null,
"baseRowId": null,
"defaultRowCommitVersion": null,
"clusteringProvider": null
}
}
{
"commitInfo": {
"timestamp": 1738246259569,
"operation": "WRITE",
"operationParameters": {
"mode": "Overwrite"
},
"operationMetrics": {
"execution_time_ms": 52,
"num_added_files": 1,
"num_added_rows": 1,
"num_partitions": 0,
"num_removed_files": 0
},
"clientVersion": "delta-rs.0.23.1"
}
}
More details:
Slack conversation:
https://delta-users.slack.com/archives/C013LCAEB98/p1738184140820519