 from deltacat.utils.pyarrow import (
     s3_partial_parquet_file_to_table,
     pyarrow_read_csv,
+    ContentTypeValidationError,
     content_type_to_reader_kwargs,
     _add_column_kwargs,
+    logger,
     s3_file_to_table,
+    s3_file_to_parquet,
     ReadKwargsProviderPyArrowSchemaOverride,
     RAISE_ON_EMPTY_CSV_KWARG,
     RAISE_ON_DECIMAL_OVERFLOW,
@@ -435,7 +438,7 @@ def test_read_csv_when_decimal_precision_overflows_and_raise_kwarg_specified(sel
             pa.lib.ArrowInvalid,
             lambda: pyarrow_read_csv(
                 OVERFLOWING_DECIMAL_PRECISION_UTSV_PATH,
-                **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True}
+                **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True},
             ),
         )
 
@@ -479,7 +482,7 @@ def test_read_csv_when_decimal_scale_overflows_and_raise_kwarg_specified(self):
             pa.lib.ArrowInvalid,
             lambda: pyarrow_read_csv(
                 OVERFLOWING_DECIMAL_SCALE_UTSV_PATH,
-                **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True}
+                **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True},
             ),
         )
 
@@ -590,7 +593,7 @@ def test_read_csv_when_decimal_scale_overflows_with_decimal256_and_raise_on_over
             pa.lib.ArrowNotImplementedError,
             lambda: pyarrow_read_csv(
                 OVERFLOWING_DECIMAL_SCALE_UTSV_PATH,
-                **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True}
+                **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True},
             ),
         )
 
@@ -818,8 +821,11 @@ def test_s3_file_to_table_when_utsv_gzip_and_content_type_overridden(self):
         schema = pa.schema(
             [("is_active", pa.string()), ("ship_datetime_utc", pa.timestamp("us"))]
         )
-
         # OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG has no effect on uTSV files
+        pa_kwargs_provider = lambda content_type, kwargs: {
+            "reader_type": "pyarrow",
+            **kwargs,
+        }
         pa_kwargs_provider = lambda content_type, kwargs: {
             "reader_type": "pyarrow",
             OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG: ContentEncoding.IDENTITY.value,
@@ -864,3 +870,99 @@ def test_s3_file_to_table_when_parquet_gzip_and_encoding_overridden(self):
         schema = result.schema
         schema_index = schema.get_field_index("n_legs")
         self.assertEqual(schema.field(schema_index).type, "int64")
+
+
+class TestS3FileToParquet(TestCase):
+    def test_s3_file_to_parquet_sanity(self):
+        test_s3_url = PARQUET_FILE_PATH
+        test_content_type = ContentType.PARQUET.value
+        test_content_encoding = ContentEncoding.IDENTITY.value
+        pa_kwargs_provider = lambda content_type, kwargs: {
+            "reader_type": "pyarrow",
+            **kwargs,
+        }
+        with self.assertLogs(logger=logger.name, level="DEBUG") as cm:
+            result_parquet_file: ParquetFile = s3_file_to_parquet(
+                test_s3_url,
+                test_content_type,
+                test_content_encoding,
+                ["n_legs", "animal"],
+                ["n_legs"],
+                pa_read_func_kwargs_provider=pa_kwargs_provider,
+            )
+        log_message_log_args = cm.records[0].getMessage()
+        log_message_presanitize_kwargs = cm.records[1].getMessage()
+        self.assertIn(
+            f"Reading {test_s3_url} to PyArrow ParquetFile. Content type: {test_content_type}. Encoding: {test_content_encoding}",
+            log_message_log_args,
+        )
+        self.assertIn("{'reader_type': 'pyarrow'}", log_message_presanitize_kwargs)
+        for index, field in enumerate(result_parquet_file.schema_arrow):
+            self.assertEqual(
+                field.name, result_parquet_file.schema_arrow.field(index).name
+            )
+        self.assertEqual(result_parquet_file.schema_arrow.field(0).type, "int64")
+
+    def test_s3_file_to_parquet_when_parquet_gzip_encoding_and_overridden_returns_success(
+        self,
+    ):
+        test_s3_url = PARQUET_FILE_PATH
+        test_content_type = ContentType.PARQUET.value
+        test_content_encoding = ContentEncoding.GZIP.value
+        pa_kwargs_provider = lambda content_type, kwargs: {
+            "reader_type": "pyarrow",
+            OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG: ContentEncoding.IDENTITY.value,
+            **kwargs,
+        }
+        with self.assertLogs(logger=logger.name, level="DEBUG") as cm:
+            result_parquet_file: ParquetFile = s3_file_to_parquet(
+                test_s3_url,
+                test_content_type,
+                test_content_encoding,
+                ["n_legs", "animal"],
+                ["n_legs"],
+                pa_read_func_kwargs_provider=pa_kwargs_provider,
+            )
+        log_message_log_args = cm.records[0].getMessage()
+        log_message_log_new_content_encoding = cm.records[1].getMessage()
+        log_message_presanitize_kwargs = cm.records[2].getMessage()
+        self.assertIn(
+            f"Reading {test_s3_url} to PyArrow ParquetFile. Content type: {test_content_type}. Encoding: {test_content_encoding}",
+            log_message_log_args,
+        )
+        self.assertIn(
+            f"Overriding {test_s3_url} content encoding from {ContentEncoding.GZIP.value} to {ContentEncoding.IDENTITY.value}",
+            log_message_log_new_content_encoding,
+        )
+        self.assertIn("{'reader_type': 'pyarrow'}", log_message_presanitize_kwargs)
+        for index, field in enumerate(result_parquet_file.schema_arrow):
+            self.assertEqual(
+                field.name, result_parquet_file.schema_arrow.field(index).name
+            )
+        self.assertEqual(result_parquet_file.schema_arrow.field(0).type, "int64")
+
+    def test_s3_file_to_parquet_when_parquet_gzip_encoding_not_overridden_throws_error(
+        self,
+    ):
+        test_s3_url = PARQUET_FILE_PATH
+        test_content_type = ContentType.PARQUET.value
+        test_content_encoding = ContentEncoding.GZIP.value
+        pa_kwargs_provider = lambda content_type, kwargs: {
+            "reader_type": "pyarrow",
+            **kwargs,
+        }
+        with self.assertRaises(ContentTypeValidationError):
+            with self.assertLogs(logger=logger.name, level="DEBUG") as cm:
+                s3_file_to_parquet(
+                    test_s3_url,
+                    test_content_type,
+                    test_content_encoding,
+                    ["n_legs", "animal"],
+                    ["n_legs"],
+                    pa_read_func_kwargs_provider=pa_kwargs_provider,
+                )
+        log_message_log_args = cm.records[0].getMessage()
+        self.assertIn(
+            f"Reading {test_s3_url} to PyArrow ParquetFile. Content type: {test_content_type}. Encoding: {test_content_encoding}",
+            log_message_log_args,
+        )
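
For reference, a minimal usage sketch of the call pattern these new tests exercise: reading a GZIP-labelled parquet object as identity-encoded by passing OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG through the kwargs provider. The import paths for ContentType/ContentEncoding and the override constant, and the S3 URL, are assumptions not shown in this diff; the s3_file_to_parquet argument order follows the tests above.

```python
# Sketch only; import locations and the S3 URL are assumed, not taken from this diff.
from deltacat.types.media import ContentEncoding, ContentType  # assumed module path
from deltacat.utils.pyarrow import (  # constant location assumed
    OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG,
    s3_file_to_parquet,
)

# Tell the parquet reader to treat a GZIP-labelled object as identity-encoded,
# mirroring the "gzip encoding overridden" test case above.
pa_kwargs_provider = lambda content_type, kwargs: {
    "reader_type": "pyarrow",
    OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG: ContentEncoding.IDENTITY.value,
    **kwargs,
}

parquet_file = s3_file_to_parquet(
    "s3://example-bucket/example.parquet",  # hypothetical S3 URL
    ContentType.PARQUET.value,
    ContentEncoding.GZIP.value,
    ["n_legs", "animal"],  # column names, as in the tests
    ["n_legs"],  # columns to include
    pa_read_func_kwargs_provider=pa_kwargs_provider,
)
print(parquet_file.schema_arrow)
```

Without the override kwarg, the same call is expected to raise ContentTypeValidationError, which is what the final test asserts.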