
Conversation

@niltecedu
Contributor

Description

This PR adds per-column Parquet encoding support, similar to what pyarrow currently offers. Since the pyarrow engine is being deprecated, this is the best way to resolve the issue. The encoding options follow what the Parquet spec defines:

https://parquet.apache.org/docs/file-format/data-pages/encodings/

Thanks to the ghost/deleted user who carried out most of the work, although it never got merged in.

Related Issue(s)

- closes #3319

Documentation

This is implementing

https://parquet.apache.org/docs/file-format/data-pages/encodings/

into ColumnProperties; as part of this it also adds an Encoding enum.
It uses the same format as the other ColumnProperties arguments. PLAIN_DICTIONARY and RLE_DICTIONARY could not be implemented due to some issues and will be looked into later on.
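
For illustration, here is a rough sketch of what the new Encoding enum looks like conceptually; the member names follow the Parquet spec page above, and the exact merged definition may differ in detail:

```python
# Illustrative sketch only; the merged enum in python/deltalake/writer/properties.py
# may differ. Values mirror the Parquet encoding names from the spec page above.
from enum import Enum

class Encoding(str, Enum):
    PLAIN = "PLAIN"
    RLE = "RLE"
    DELTA_BINARY_PACKED = "DELTA_BINARY_PACKED"
    DELTA_LENGTH_BYTE_ARRAY = "DELTA_LENGTH_BYTE_ARRAY"
    DELTA_BYTE_ARRAY = "DELTA_BYTE_ARRAY"
    BYTE_STREAM_SPLIT = "BYTE_STREAM_SPLIT"
    # PLAIN_DICTIONARY / RLE_DICTIONARY are deliberately absent; dictionary
    # encoding is controlled through ColumnProperties(dictionary_enabled=...).
```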

Here is a code snippet showing how to use the new per-column encodings with Delta tables and how to compare against the existing pyarrow implementation:

```python
import os
import pandas as pd
import numpy as np

from deltalake import write_deltalake, WriterProperties, ColumnProperties,DeltaTable
from pathlib import Path

import pyarrow.parquet as pq
import pyarrow as pa

# Make some fake time series data
TOTAL_ROWS = 10_000_000
timestamps = pd.date_range(start=pd.Timestamp.now(), periods=TOTAL_ROWS, freq="5ms")
timeline = np.linspace(0, len(timestamps), len(timestamps))
print("Generating data...")
pat = pa.Table.from_pandas(
    pd.DataFrame(
        {
            # timestamp (auto-generated)
            "timestamp": timestamps,
            # Timeseries data as float32
            "timeseries_data": (10 * np.sin(2 * np.pi * 50 * timeline)).astype(
                np.float32
            ),
            "timeseries_int_data": (1000 * np.sin(2 * np.pi * 50 * timeline)).astype(
                np.int32
            ),
            # 1 minute partitions
            "partition_label": timestamps.strftime("%H%M"),
        }
    )
)
print("Data generated.")
output_path_normal = "example_deltalake"
write_deltalake(
    output_path_normal,
    data=pat,
    partition_by=["partition_label"],
    # Compression enabled for an equivalent comparison; with dictionary encoding enabled this produces a larger file
    writer_properties=WriterProperties(
        compression="ZSTD",
        compression_level=1,
        default_column_properties=ColumnProperties(dictionary_enabled=True),
    ),
    mode="overwrite",
    # Can't specify per-column encoding
)
print("Wrote normal delta table.")



output_path_encoded = "encoded_example_deltalake"
write_deltalake(
    output_path_encoded,
    data=pat,
    partition_by=["partition_label"],
    # Same compression settings; dictionary disabled and per-column encodings set below
    writer_properties=WriterProperties(
        compression="ZSTD",
        compression_level=1,
        column_properties={
            "timestamp": ColumnProperties(dictionary_enabled=False,encoding="DELTA_BINARY_PACKED"),
            "timeseries_data": ColumnProperties(dictionary_enabled=False,encoding="BYTE_STREAM_SPLIT"),
            "timeseries_int_data": ColumnProperties(dictionary_enabled=False,encoding="DELTA_BINARY_PACKED"),
            "partition_label": ColumnProperties(dictionary_enabled=False,encoding="DELTA_BINARY_PACKED"),
        },
    ),
    mode="overwrite",
    # Per-column encodings specified above via column_properties
)
print("Wrote encoded delta table.")

output_path_default_encoded = "example_pyarrow_delta_default_encoding"
pq.write_to_dataset(
    pat,
    output_path_default_encoded,
    partition_cols=["partition_label"],
    use_dictionary=False,
    use_byte_stream_split=True,
    compression="ZSTD",
    compression_level=1,
)


output_path_delta_specifc_encoded = "example_pyarrow_delta_specifc_col_encoding"
pq.write_to_dataset(
    pat,
    output_path_delta_specifc_encoded,
    partition_cols=["partition_label"],
    # Ability to specify column encodings here
    use_dictionary=False,
    use_byte_stream_split=False,
    column_encoding={
        "timestamp": "DELTA_BINARY_PACKED",
        "timeseries_data": "BYTE_STREAM_SPLIT",
        "timeseries_int_data": "DELTA_BINARY_PACKED",
        "partition_label": "DELTA_BINARY_PACKED",
    },
    compression="ZSTD",
    compression_level=1,
)
print("Wrote delta table with pyarrow column encodings.")


def get_folder_size(folder):
    return ByteSize(
        sum(file.stat().st_size for file in Path(folder).rglob("*"))
    ).megabytes


class ByteSize(int):
    _KB = 1024
    _suffixes = "B", "KB", "MB", "GB", "PB"

    def __new__(cls, *args, **kwargs):
        return super().__new__(cls, *args, **kwargs)

    def __init__(self, *args, **kwargs):
        self.bytes = self.B = int(self)
        self.kilobytes = self.KB = self / self._KB**1
        self.megabytes = self.MB = self / self._KB**2
        self.gigabytes = self.GB = self / self._KB**3
        self.terabytes = self.TB = self / self._KB**4
        *suffixes, last = self._suffixes
        suffix = next(
            (suffix for suffix in suffixes if 1 < getattr(self, suffix) < self._KB),
            last,
        )
        self.readable = suffix, getattr(self, suffix)

        super().__init__()

    def __str__(self):
        return self.__format__(".2f")

print(DeltaTable(output_path_encoded).to_pandas())

print(f"The File size of delta table is {get_folder_size(output_path_normal)} MB")
print(f"The File size of delta table with parquet encoding is {get_folder_size(output_path_default_encoded)} MB")
print(
    f"The File size of delta table with pyarrow default column encodings is {get_folder_size(output_path_default_encoded)} MB"
)
print(
    f"The File size of delta table with pyarrow specific column encodings is {get_folder_size(output_path_delta_specifc_encoded)} MB"
)


print("Deleting the folders now...")
import shutil

shutil.rmtree(output_path_normal)
shutil.rmtree(output_path_encoded)
shutil.rmtree(output_path_default_encoded)
shutil.rmtree(output_path_delta_specifc_encoded)
print("Deleted the folders.")

I ran this script in my dev environment (Ubuntu 24.04 on WSL2, Python 3.12.3).

The output came to:

The File size of delta table is 122.21567821502686 MB
The File size of delta table with parquet encoding is 25.041666984558105 MB
The File size of delta table with pyarrow default column encodings is 25.041666984558105 MB
The File size of delta table with pyarrow specific column encodings is 20.985719680786133 MB

That is roughly a 75% improvement with mixed data. If you decide to optimise further, you can go down to 95-98% (if the data is all INT).
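
As a quick back-of-the-envelope check on that figure, using the sizes printed above:

```python
# Sizes in MB taken from the run above
normal_mb = 122.21567821502686
encoded_mb = 25.041666984558105
print(f"reduction: {1 - encoded_mb / normal_mb:.1%}")  # ~79.5%, i.e. roughly 75-80%
```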

Copilot AI review requested due to automatic review settings September 5, 2025 16:04
@github-actions github-actions bot added the binding/python Issues for the Python package label Sep 5, 2025
Contributor

Copilot AI left a comment

Pull Request Overview

This PR adds per-column Parquet encoding support to Delta tables, allowing users to specify different encoding algorithms for individual columns to optimize file size and performance. This feature mirrors capabilities previously available only in PyArrow and addresses the deprecation of the PyArrow engine.

  • Introduces an Encoding enum with support for various Parquet encoding algorithms
  • Adds encoding parameter to ColumnProperties with validation logic
  • Implements Rust-side encoding configuration in the writer properties

Reviewed Changes

Copilot reviewed 3 out of 3 changed files in this pull request and generated 4 comments.

| File | Description |
| --- | --- |
| python/deltalake/writer/properties.py | Adds Encoding enum and encoding parameter to ColumnProperties with validation |
| python/src/lib.rs | Implements Rust-side encoding configuration and dictionary disabling logic |
| python/tests/test_writerproperties.py | Adds comprehensive tests for encoding functionality and edge cases |


@codecov

codecov bot commented Sep 5, 2025

Codecov Report

❌ Patch coverage is 0% with 7 lines in your changes missing coverage. Please review.
✅ Project coverage is 75.53%. Comparing base (538a90c) to head (dd97fe1).
⚠️ Report is 1 commit behind head on main.

Files with missing lines Patch % Lines
python/src/lib.rs 0.00% 7 Missing ⚠️
Additional details and impacted files
@@            Coverage Diff             @@
##             main    #3737      +/-   ##
==========================================
- Coverage   75.55%   75.53%   -0.02%     
==========================================
  Files         145      145              
  Lines       44424    44431       +7     
  Branches    44424    44431       +7     
==========================================
- Hits        33565    33563       -2     
- Misses       9215     9224       +9     
  Partials     1644     1644              

☔ View full report in Codecov by Sentry.

@ion-elgreco ion-elgreco self-assigned this Sep 5, 2025
@niltecedu
Contributor Author

Just saw the actions failing and the commits not being signed off; the comment in src.rs is causing codecov to fail. I will get it fixed.

@ion-elgreco Regarding one of the tests for the writer properties, I can skip them, and skip pandas and pyarrow. They were just included to check edge cases for my sanity; should I remove those?

@rtyler
Member

rtyler commented Sep 6, 2025

Thanks for taking the time to move this forward @niltecedu! Don't mind the robot who is clearly annoying 😒

For my own understanding, what do people typically want per-column parquet encodings for? I have not encountered them "in the wild" yet.

@niltecedu
Contributor Author

niltecedu commented Sep 6, 2025

Hey @rtyler, yeah, understood about the robot xD, still need to follow the proper rules. The other person with the deleted account had already done 90% of the work.

Regarding per-column encoding, not a lot of people use it generally, at least not in my company, because they don't know about it. We are using it for archive data storage and open data sharing, where the data is expected to be in the region of 300-400 TiB (across different tables), so compressing it efficiently with Parquet encodings helps a lot. One of our major tables is in the same format as the example I showed; that alone would be a 75% saving on our archive data with little effort. We are currently trying to find a way to share 300 billion rows of data feasibly, and this helps with storage and network bandwidth costs for both the public users and us.

The main thing is compression: if you set up the encodings properly, compression can go wild. Even zstd at compression level 1 compresses heavily. For reference, these are the results I got from a dummy delta table produced by the code above:

raw: 13.87 MiB
lz4_raw without col encoding: 13.914 MiB (0% compression)
zstd 1 without col encoding: 13.7502 MiB (1% compression)

zstd 1 with col encoding: 3.4277 MiB (75.28% compression)
lz4_raw with col encoding: 3.4298 MiB (75.27% compression)

zstd 9 without col encoding: 10.19310 MiB (26.50% compression)
zstd 22 without col encoding: 9.7050 MiB (30.02% compression)

zstd 9 with col encoding: 3.15104 MiB (77.28% compression)
zstd 22 with col encoding: 2.9701 MiB (78.58% compression)

So that is roughly a 64% extra saving for the cheapest col-encoding result (lz4_raw) versus the best non-col-encoding result (zstd 22), and only a 3% difference between zstd 1 and zstd 22 because of it. We have done something similar with netcdf files and blosc2 compression, which saves us 70% more data while just using lz4.
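
For clarity, the 64% figure follows directly from the numbers above:

```python
# Extra saving of col-encoded lz4_raw vs non-col-encoded zstd 22 (sizes in MiB from above)
zstd22_no_encoding = 9.7050
lz4_with_encoding = 3.4298
print(f"extra saving: {1 - lz4_with_encoding / zstd22_no_encoding:.1%}")  # ~64.7%
```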

For us, extra stuff like H3 columns, which are all int64, sees huge space savings, which leads to faster read times as well. So overall it is just a one-time setting that can avoid a lot of issues.

niltecedu and others added 4 commits September 6, 2025 23:53
…s and update ColumnProperties to use them

Signed-off-by: Nilesh Zagade <[email protected]>
Signed-off-by: Nilesh Zagade <[email protected]>
Co-authored-by: Copilot <[email protected]>
Signed-off-by: Nilesh Zagade <[email protected]>
@niltecedu niltecedu force-pushed the feat/per-col-pq-encoding branch from c47c348 to 15d9df0 Compare September 6, 2025 22:55
… for writer properties encoding to not fail on non pyarrow test

Signed-off-by: Nilesh Zagade <[email protected]>
@niltecedu
Contributor Author

Hey guys, I have fixed the other issues regarding the no-pyarrow tests and signed-off commits; however, I cannot figure out how to fix the code coverage. I am very bad with Rust, so I do not know where I would put the Rust tests for this.

"""

PLAIN = "PLAIN"
PLAIN_DICTIONARY = "PLAIN_DICTIONARY" # Deprecated
Collaborator

Since it's deprecated, better not to include the option at all.

Then you don't have to do the if/else checks later.

Contributor Author

Got it, will remove

Collaborator

Let's simply put this in a gist for future reference; I don't think we will need this after it is merged.

Member

Would this be useful to slap into a doc file?

Contributor Author

@niltecedu niltecedu Sep 7, 2025

This was not meant to be committed, sorry, I will remove it. I will add a small snippet into the writer properties docs or usage as an example later on.

…m and update ColumnProperties validation to past test_minimal issues on py3.9

Signed-off-by: Nilesh Zagade <[email protected]>
"""
Encoding types for Parquet columns.
https://parquet.apache.org/docs/file-format/data-pages/encodings/
Dictionary encodings (PLAIN_DICTIONARY and RLE_DICTIONARY) are enabled via the `dictionary_enabled` flag
Collaborator

How can you choose between RLE and PLAIN then?

Contributor Author

@niltecedu niltecedu Sep 8, 2025

PLAIN and RLE can still be enabled; however, PLAIN_DICTIONARY and RLE_DICTIONARY are classed together, and based on the parquet documentation it looks like RLE_DICTIONARY replaced the older PLAIN_DICTIONARY.

If you have dictionary_enabled, it takes care of those two; any other encoding you set via the encoding param.
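
In code, that boils down to something like the following usage sketch (based on this discussion; the exact semantics are whatever the merged writer does):

```python
from deltalake import ColumnProperties

# Dictionary-encoded column: the writer handles the dictionary encoding
# (RLE_DICTIONARY in the current Parquet format) on its own.
dict_col = ColumnProperties(dictionary_enabled=True)

# Non-dictionary columns: disable the dictionary and pick an explicit encoding.
plain_col = ColumnProperties(dictionary_enabled=False, encoding="PLAIN")
delta_col = ColumnProperties(dictionary_enabled=False, encoding="DELTA_BINARY_PACKED")
```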

Contributor Author

The current behaviour is also the same as Spark's:
https://spark.apache.org/docs/latest/sql-data-sources-load-save-functions.html

@ion-elgreco
Collaborator

@niltecedu great work!

@ion-elgreco ion-elgreco enabled auto-merge (squash) September 11, 2025 18:44
@ion-elgreco ion-elgreco merged commit 80d3bb9 into delta-io:main Sep 11, 2025
28 of 29 checks passed
@niltecedu
Contributor Author

Thanks! @ion-elgreco

fvaleye pushed a commit to fvaleye/delta-rs that referenced this pull request Sep 12, 2025

Development

Successfully merging this pull request may close these issues.

Allow specifying per-column encoding when writing delta lake tables
