rapidsai · rapids-bot · May 13, 2025 · May 7, 2025 · May 7, 2025 · May 7, 2025
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2024, NVIDIA CORPORATION.
+# Copyright (c) 2019-2025, NVIDIA CORPORATION.
 
 import itertools
 import os
@@ -176,3 +176,21 @@ def pytest_runtest_makereport(item, call):
     # Set a report attribute for each phase of a call, which can
     # be "setup", "call", "teardown"
     setattr(item, "report", {rep.when: rep})
+
+
+@pytest.fixture(
+    params=[  # each dict is one set of env vars a test is run under
+        {
+            "LIBCUDF_HOST_DECOMPRESSION": "OFF",
+            "LIBCUDF_NVCOMP_POLICY": "ALWAYS",
+        },
+        {"LIBCUDF_HOST_DECOMPRESSION": "OFF", "LIBCUDF_NVCOMP_POLICY": "OFF"},
+        {"LIBCUDF_HOST_DECOMPRESSION": "ON"},
+    ],
+)
+def set_decomp_env_vars(monkeypatch, request):
+    env_vars = request.param  # the dict for the current parametrization
+    with monkeypatch.context() as m:
+        for key, value in env_vars.items():
+            m.setenv(key, value)
+        yield  # test runs here; leaving the context undoes the setenv calls
@@ -1,4 +1,4 @@
-# Copyright (c) 2021-2024, NVIDIA CORPORATION.
+# Copyright (c) 2021-2025, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -212,7 +212,7 @@ def test_can_parse_no_schema():
 
 @pytest.mark.parametrize("rows", [0, 1, 10, 1000])
 @pytest.mark.parametrize("codec", ["null", "deflate", "snappy"])
-def test_avro_compression(rows, codec):
+def test_avro_decompression(set_decomp_env_vars, rows, codec):
     schema = {
         "name": "root",
         "type": "record",

@@ -5,7 +5,7 @@
 import os
 import random
 from io import BytesIO
-from string import ascii_lowercase
+from string import ascii_letters, ascii_lowercase
 
 import numpy as np
 import pandas as pd
@@ -65,6 +65,53 @@ def _make_path_or_buf(src):
     yield _make_path_or_buf
 
 
+@pytest.fixture(scope="module")
+def non_nested_pdf():
+    rng = np.random.default_rng(seed=0)  # fixed seed: reproducible data
+    types = [
+        "bool",
+        "int8",
+        "int16",
+        "int32",
+        "int64",
+        "float32",
+        "float64",
+        "datetime64[ns]",
+        "str",
+    ]
+    nrows = 12345
+
+    # One "col_<typ>" column per type: random ints in [0, nrows) cast to typ
+    test_pdf = pd.DataFrame(
+        {
+            f"col_{typ}": rng.integers(0, nrows, nrows).astype(typ)
+            for typ in types
+        },
+    )
+
+    for t in [  # regenerates columns that were already created above
+        {
+            "name": "datetime64[ns]",
+            "nsDivisor": 1000,
+            "dayModulus": 86400000000,  # NOTE(review): never read below
+        },
+    ]:
+        data = [  # upper bound: int64 max / nsDivisor ("/" yields a float)
+            rng.integers(0, (0x7FFFFFFFFFFFFFFF / t["nsDivisor"]))
+            for i in range(nrows)
+        ]
+
+        test_pdf["col_" + t["name"]] = pd.Series(
+            np.asarray(data, dtype=t["name"])
+        )
+
+    # Replace col_str with random single ASCII letters (non-numeric strings)
+    data = [ascii_letters[rng.integers(0, 52)] for i in range(nrows)]
+    test_pdf["col_str"] = pd.Series(data, dtype="str")
+
+    return test_pdf
+
+
 @pytest.mark.filterwarnings("ignore:Using CPU")
 @pytest.mark.parametrize("engine", ["pyarrow", "cudf"])
 @pytest.mark.parametrize("use_index", [False, True])
@@ -1995,3 +2042,15 @@ def test_orc_reader_desynced_timestamp(datadir, inputfile):
     got = cudf.read_orc(path)
 
     assert_frame_equal(cudf.from_pandas(expect), got)
+
+
+@pytest.mark.parametrize("compression", ["LZ4", "SNAPPY", "ZLIB", "ZSTD"])
+def test_orc_decompression(set_decomp_env_vars, compression, non_nested_pdf):
+    # Write the DataFrame to an in-memory ORC file with the given codec
+    buffer = BytesIO()
+    non_nested_pdf.to_orc(buffer, engine_kwargs={"compression": compression})
+
+    # Read the ORC data back into a cuDF DataFrame
+    got = cudf.read_orc(buffer)
+
+    assert_eq(non_nested_pdf, got)
@@ -4457,3 +4457,23 @@ def test_parquet_reader_empty_compressed_page(datadir):
 
     df = cudf.DataFrame({"value": cudf.Series([None], dtype="float32")})
     assert_eq(cudf.read_parquet(fname), df)
+
+
+@pytest.fixture(params=[12345], scope="module")
+def my_pdf(request):  # NOTE(review): 12345 is presumably a row count; confirm
+    return build_pdf(request, True)  # build_pdf: defined outside this hunk
+
+
+@pytest.mark.parametrize("compression", ["brotli", "gzip", "snappy", "zstd"])
+def test_parquet_decompression(set_decomp_env_vars, my_pdf, compression):
+    # PANDAS returns category objects whereas cuDF returns hashes
+    expect = my_pdf.drop(columns=["col_category"])
+
+    # Write the DataFrame to an in-memory Parquet file with the given codec
+    buffer = BytesIO()
+    expect.to_parquet(buffer, compression=compression)
+
+    # Read the Parquet data back into a cuDF DataFrame
+    got = cudf.read_parquet(buffer)
+
+    assert_eq(expect, got)