Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 19 additions & 1 deletion python/cudf/cudf/tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2019-2024, NVIDIA CORPORATION.
# Copyright (c) 2019-2025, NVIDIA CORPORATION.

import itertools
import os
Expand Down Expand Up @@ -176,3 +176,21 @@ def pytest_runtest_makereport(item, call):
# Set a report attribute for each phase of a call, which can
# be "setup", "call", "teardown"
setattr(item, "report", {rep.when: rep})


@pytest.fixture(
    params=[
        {
            "LIBCUDF_HOST_DECOMPRESSION": "OFF",
            "LIBCUDF_NVCOMP_POLICY": "ALWAYS",
        },
        {
            "LIBCUDF_HOST_DECOMPRESSION": "OFF",
            "LIBCUDF_NVCOMP_POLICY": "OFF",
        },
        {
            "LIBCUDF_HOST_DECOMPRESSION": "ON",
        },
    ],
)
def set_decomp_env_vars(monkeypatch, request):
    """Run the requesting test once per decompression environment setup.

    Every parametrization is a mapping of environment variable names to the
    values they should hold while the test executes. The variables are set
    through a ``monkeypatch`` context that stays open for the duration of
    the test and is exited afterwards.
    """
    with monkeypatch.context() as patcher:
        for name, setting in request.param.items():
            patcher.setenv(name, setting)
        # Hand control to the test while the patched environment is active
        yield
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2021-2024, NVIDIA CORPORATION.
# Copyright (c) 2021-2025, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -212,7 +212,7 @@ def test_can_parse_no_schema():

@pytest.mark.parametrize("rows", [0, 1, 10, 1000])
@pytest.mark.parametrize("codec", ["null", "deflate", "snappy"])
def test_avro_compression(rows, codec):
def test_avro_decompression(set_decomp_env_vars, rows, codec):
schema = {
"name": "root",
"type": "record",
Expand Down
61 changes: 60 additions & 1 deletion python/cudf/cudf/tests/test_orc.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import os
import random
from io import BytesIO
from string import ascii_lowercase
from string import ascii_letters, ascii_lowercase

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -65,6 +65,53 @@ def _make_path_or_buf(src):
yield _make_path_or_buf


@pytest.fixture(scope="module")
def non_nested_pdf():
    """Build a pandas DataFrame of random data with flat (non-nested) columns.

    The frame has 12345 rows and one ``col_<dtype>`` column for each of
    bool, int8, int16, int32, int64, float32, float64, datetime64[ns] and
    str. The random generator is created with a fixed seed.

    Returns
    -------
    pandas.DataFrame
        The generated frame; it is shared by all tests in the module, so
        tests should treat it as read-only.
    """
    rng = np.random.default_rng(seed=0)
    nrows = 12345

    # Types that can be produced by casting random integers directly
    castable_types = [
        "bool",
        "int8",
        "int16",
        "int32",
        "int64",
        "float32",
        "float64",
    ]
    test_pdf = pd.DataFrame(
        {
            f"col_{typ}": rng.integers(0, nrows, nrows).astype(typ)
            for typ in castable_types
        },
    )

    # Timestamps: draw all raw int64 values in a single vectorized call.
    # Floor division keeps the upper bound an exact integer instead of a
    # rounded float.
    ns_divisor = 1000
    max_timestamp = 0x7FFFFFFFFFFFFFFF // ns_divisor
    test_pdf["col_datetime64[ns]"] = pd.Series(
        rng.integers(0, max_timestamp, nrows).astype("datetime64[ns]")
    )

    # Create non-numeric str data: one random ASCII letter per row
    letter_positions = rng.integers(0, len(ascii_letters), nrows)
    test_pdf["col_str"] = pd.Series(
        [ascii_letters[pos] for pos in letter_positions], dtype="str"
    )

    return test_pdf


@pytest.mark.filterwarnings("ignore:Using CPU")
@pytest.mark.parametrize("engine", ["pyarrow", "cudf"])
@pytest.mark.parametrize("use_index", [False, True])
Expand Down Expand Up @@ -1995,3 +2042,15 @@ def test_orc_reader_desynced_timestamp(datadir, inputfile):
got = cudf.read_orc(path)

assert_frame_equal(cudf.from_pandas(expect), got)


@pytest.mark.parametrize("compression", ["LZ4", "SNAPPY", "ZLIB", "ZSTD"])
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could not include Brotli; it seems pandas does not support it on the writer side.

def test_orc_decompression(set_decomp_env_vars, compression, non_nested_pdf):
    """Round-trip a pandas-written, compressed ORC buffer through cuDF.

    The pandas frame is written to an in-memory ORC buffer using the given
    ``compression`` codec, read back with ``cudf.read_orc`` and compared
    against the original frame. ``set_decomp_env_vars`` is requested only
    for its environment setup.
    """
    # Serialize the pandas DataFrame as ORC with the requested codec
    orc_buffer = BytesIO()
    non_nested_pdf.to_orc(
        orc_buffer, engine_kwargs={"compression": compression}
    )

    # Decode the ORC data with cuDF and compare with the pandas source
    assert_eq(non_nested_pdf, cudf.read_orc(orc_buffer))
20 changes: 20 additions & 0 deletions python/cudf/cudf/tests/test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -4457,3 +4457,23 @@ def test_parquet_reader_empty_compressed_page(datadir):

df = cudf.DataFrame({"value": cudf.Series([None], dtype="float32")})
assert_eq(cudf.read_parquet(fname), df)


@pytest.fixture(params=[12345], scope="module")
def my_pdf(request):
    """Module-scoped frame obtained from ``build_pdf`` for the param 12345."""
    # NOTE(review): the meaning of the second positional argument (True) is
    # defined by build_pdf, which is outside this view -- confirm there.
    pdf = build_pdf(request, True)
    return pdf


@pytest.mark.parametrize("compression", ["brotli", "gzip", "snappy", "zstd"])
def test_parquet_decompression(set_decomp_env_vars, my_pdf, compression):
    """Round-trip a pandas-written, compressed Parquet buffer through cuDF.

    The frame (minus its categorical column) is written to an in-memory
    Parquet buffer with the given ``compression`` codec, read back with
    ``cudf.read_parquet`` and compared against what was written.
    ``set_decomp_env_vars`` is requested only for its environment setup.
    """
    # pandas returns category objects whereas cuDF returns hashes, so the
    # categorical column is left out of the comparison
    pandas_df = my_pdf.drop(columns=["col_category"])

    # Serialize the pandas DataFrame as Parquet with the requested codec
    parquet_buffer = BytesIO()
    pandas_df.to_parquet(parquet_buffer, compression=compression)

    # Decode the Parquet data with cuDF and compare with what was written
    assert_eq(pandas_df, cudf.read_parquet(parquet_buffer))
Loading