Commit 11bbbed

Bugfix: If OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG passed then explicitly override content encoding (#505)
1 parent 6f6ff83 commit 11bbbed
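
In practice, the fix means a caller can force s3_file_to_parquet to treat a mislabeled Parquet file as identity-encoded by passing OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG through the PyArrow kwargs provider. A minimal usage sketch mirroring the new tests below; the S3 URL is hypothetical, and the deltacat.types.media import path for ContentType/ContentEncoding is an assumption:

from deltacat.types.media import ContentEncoding, ContentType  # assumed import path
from deltacat.utils.pyarrow import (
    OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG,
    s3_file_to_parquet,
)

# The provider merges the override into the reader kwargs; s3_file_to_parquet
# pops it back out before the remaining kwargs reach ParquetFile.
pa_kwargs_provider = lambda content_type, kwargs: {
    "reader_type": "pyarrow",
    OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG: ContentEncoding.IDENTITY.value,
    **kwargs,
}

parquet_file = s3_file_to_parquet(
    "s3://my-bucket/mislabeled.parquet",  # hypothetical URL
    ContentType.PARQUET.value,
    ContentEncoding.GZIP.value,  # stored metadata says GZIP; override treats it as IDENTITY
    ["n_legs", "animal"],  # column arguments, passed positionally as in the new tests
    ["n_legs"],
    pa_read_func_kwargs_provider=pa_kwargs_provider,
)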

File tree

2 files changed: +117 -9 lines changed

deltacat/tests/utils/test_pyarrow.py (+106 -4)
@@ -2,9 +2,12 @@
 from deltacat.utils.pyarrow import (
     s3_partial_parquet_file_to_table,
     pyarrow_read_csv,
+    ContentTypeValidationError,
     content_type_to_reader_kwargs,
     _add_column_kwargs,
+    logger,
     s3_file_to_table,
+    s3_file_to_parquet,
     ReadKwargsProviderPyArrowSchemaOverride,
     RAISE_ON_EMPTY_CSV_KWARG,
     RAISE_ON_DECIMAL_OVERFLOW,
@@ -435,7 +438,7 @@ def test_read_csv_when_decimal_precision_overflows_and_raise_kwarg_specified(sel
             pa.lib.ArrowInvalid,
             lambda: pyarrow_read_csv(
                 OVERFLOWING_DECIMAL_PRECISION_UTSV_PATH,
-                **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True}
+                **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True},
             ),
         )
 
@@ -479,7 +482,7 @@ def test_read_csv_when_decimal_scale_overflows_and_raise_kwarg_specified(self):
             pa.lib.ArrowInvalid,
             lambda: pyarrow_read_csv(
                 OVERFLOWING_DECIMAL_SCALE_UTSV_PATH,
-                **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True}
+                **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True},
             ),
         )
 
@@ -590,7 +593,7 @@ def test_read_csv_when_decimal_scale_overflows_with_decimal256_and_raise_on_over
             pa.lib.ArrowNotImplementedError,
             lambda: pyarrow_read_csv(
                 OVERFLOWING_DECIMAL_SCALE_UTSV_PATH,
-                **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True}
+                **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True},
             ),
         )
 
@@ -818,8 +821,11 @@ def test_s3_file_to_table_when_utsv_gzip_and_content_type_overridden(self):
         schema = pa.schema(
             [("is_active", pa.string()), ("ship_datetime_utc", pa.timestamp("us"))]
         )
-
         # OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG has no effect on uTSV files
+        pa_kwargs_provider = lambda content_type, kwargs: {
+            "reader_type": "pyarrow",
+            **kwargs,
+        }
         pa_kwargs_provider = lambda content_type, kwargs: {
             "reader_type": "pyarrow",
             OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG: ContentEncoding.IDENTITY.value,
@@ -864,3 +870,99 @@ def test_s3_file_to_table_when_parquet_gzip_and_encoding_overridden(self):
         schema = result.schema
         schema_index = schema.get_field_index("n_legs")
         self.assertEqual(schema.field(schema_index).type, "int64")
+
+
+class TestS3FileToParquet(TestCase):
+    def test_s3_file_to_parquet_sanity(self):
+        test_s3_url = PARQUET_FILE_PATH
+        test_content_type = ContentType.PARQUET.value
+        test_content_encoding = ContentEncoding.IDENTITY.value
+        pa_kwargs_provider = lambda content_type, kwargs: {
+            "reader_type": "pyarrow",
+            **kwargs,
+        }
+        with self.assertLogs(logger=logger.name, level="DEBUG") as cm:
+            result_parquet_file: ParquetFile = s3_file_to_parquet(
+                test_s3_url,
+                test_content_type,
+                test_content_encoding,
+                ["n_legs", "animal"],
+                ["n_legs"],
+                pa_read_func_kwargs_provider=pa_kwargs_provider,
+            )
+        log_message_log_args = cm.records[0].getMessage()
+        log_message_presanitize_kwargs = cm.records[1].getMessage()
+        self.assertIn(
+            f"Reading {test_s3_url} to PyArrow ParquetFile. Content type: {test_content_type}. Encoding: {test_content_encoding}",
+            log_message_log_args,
+        )
+        self.assertIn("{'reader_type': 'pyarrow'}", log_message_presanitize_kwargs)
+        for index, field in enumerate(result_parquet_file.schema_arrow):
+            self.assertEqual(
+                field.name, result_parquet_file.schema_arrow.field(index).name
+            )
+        self.assertEqual(result_parquet_file.schema_arrow.field(0).type, "int64")
+
+    def test_s3_file_to_parquet_when_parquet_gzip_encoding_and_overridden_returns_success(
+        self,
+    ):
+        test_s3_url = PARQUET_FILE_PATH
+        test_content_type = ContentType.PARQUET.value
+        test_content_encoding = ContentEncoding.GZIP.value
+        pa_kwargs_provider = lambda content_type, kwargs: {
+            "reader_type": "pyarrow",
+            OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG: ContentEncoding.IDENTITY.value,
+            **kwargs,
+        }
+        with self.assertLogs(logger=logger.name, level="DEBUG") as cm:
+            result_parquet_file: ParquetFile = s3_file_to_parquet(
+                test_s3_url,
+                test_content_type,
+                test_content_encoding,
+                ["n_legs", "animal"],
+                ["n_legs"],
+                pa_read_func_kwargs_provider=pa_kwargs_provider,
+            )
+        log_message_log_args = cm.records[0].getMessage()
+        log_message_log_new_content_encoding = cm.records[1].getMessage()
+        log_message_presanitize_kwargs = cm.records[2].getMessage()
+        self.assertIn(
+            f"Reading {test_s3_url} to PyArrow ParquetFile. Content type: {test_content_type}. Encoding: {test_content_encoding}",
+            log_message_log_args,
+        )
+        self.assertIn(
+            f"Overriding {test_s3_url} content encoding from {ContentEncoding.GZIP.value} to {ContentEncoding.IDENTITY.value}",
+            log_message_log_new_content_encoding,
+        )
+        self.assertIn("{'reader_type': 'pyarrow'}", log_message_presanitize_kwargs)
+        for index, field in enumerate(result_parquet_file.schema_arrow):
+            self.assertEqual(
+                field.name, result_parquet_file.schema_arrow.field(index).name
+            )
+        self.assertEqual(result_parquet_file.schema_arrow.field(0).type, "int64")
+
+    def test_s3_file_to_parquet_when_parquet_gzip_encoding_not_overridden_throws_error(
+        self,
+    ):
+        test_s3_url = PARQUET_FILE_PATH
+        test_content_type = ContentType.PARQUET.value
+        test_content_encoding = ContentEncoding.GZIP.value
+        pa_kwargs_provider = lambda content_type, kwargs: {
+            "reader_type": "pyarrow",
+            **kwargs,
+        }
+        with self.assertRaises(ContentTypeValidationError):
+            with self.assertLogs(logger=logger.name, level="DEBUG") as cm:
+                s3_file_to_parquet(
+                    test_s3_url,
+                    test_content_type,
+                    test_content_encoding,
+                    ["n_legs", "animal"],
+                    ["n_legs"],
+                    pa_read_func_kwargs_provider=pa_kwargs_provider,
+                )
+        log_message_log_args = cm.records[0].getMessage()
+        self.assertIn(
+            f"Reading {test_s3_url} to PyArrow ParquetFile. Content type: {test_content_type}. Encoding: {test_content_encoding}",
+            log_message_log_args,
+        )
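
The new tests assert on the module's DEBUG output via unittest's assertLogs context manager, which captures records in emission order and lets each be rendered with getMessage(). A self-contained sketch of the capture pattern, with a hypothetical logger name standing in for deltacat's module logger:

import logging
import unittest

logger = logging.getLogger("example.pyarrow")  # hypothetical stand-in logger


class LogCaptureExample(unittest.TestCase):
    def test_captures_debug_records_in_order(self):
        with self.assertLogs(logger=logger.name, level="DEBUG") as cm:
            logger.debug("Reading s3://bucket/key to PyArrow ParquetFile.")
            logger.debug("Pre-sanitize kwargs: %s", {"reader_type": "pyarrow"})
        # cm.records holds LogRecord objects in emission order; getMessage()
        # renders each record's args into the final message string.
        self.assertIn("ParquetFile", cm.records[0].getMessage())
        self.assertIn("reader_type", cm.records[1].getMessage())


if __name__ == "__main__":
    unittest.main()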

deltacat/utils/pyarrow.py (+11 -5)
@@ -617,7 +617,18 @@ def s3_file_to_parquet(
         f"Reading {s3_url} to PyArrow ParquetFile. "
         f"Content type: {content_type}. Encoding: {content_encoding}"
     )
+    kwargs = {}
+    if pa_read_func_kwargs_provider:
+        kwargs = pa_read_func_kwargs_provider(content_type, kwargs)
 
+    if OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG in kwargs:
+        new_content_encoding = kwargs.pop(OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG)
+        if content_type == ContentType.PARQUET.value:
+            logger.debug(
+                f"Overriding {s3_url} content encoding from {content_encoding} "
+                f"to {new_content_encoding}"
+            )
+            content_encoding = new_content_encoding
     if (
         content_type != ContentType.PARQUET.value
         or content_encoding != ContentEncoding.IDENTITY
@@ -630,15 +641,10 @@
     if s3_client_kwargs is None:
         s3_client_kwargs = {}
 
-    kwargs = {}
-
     if s3_url.startswith("s3://"):
         s3_file_system = create_s3_file_system(s3_client_kwargs)
         kwargs["filesystem"] = s3_file_system
 
-    if pa_read_func_kwargs_provider:
-        kwargs = pa_read_func_kwargs_provider(content_type, kwargs)
-
     logger.debug(f"Pre-sanitize kwargs for {s3_url}: {kwargs}")
 
     kwargs = sanitize_kwargs_to_callable(ParquetFile.__init__, kwargs)