Commit bf9076c

[Converter][Test]Add working end-to-end converter compute test with Minio S3 endpoint (#494)
* [Converter][Test]Add working end-to-end converter compute test with Minio S3 endpoint
* address comments

Co-authored-by: Miranda <[email protected]>
1 parent 618d98f commit bf9076c

13 files changed: +424 −20 lines

Makefile

+2
@@ -44,6 +44,7 @@ test-integration: install
 	docker-compose -f dev/iceberg-integration/docker-compose-integration.yml up -d
 	sleep 3
 	docker-compose -f dev/iceberg-integration/docker-compose-integration.yml exec -T spark-iceberg ipython ./provision.py
+	export SPARK_LOCAL_IP="127.0.0.1"
 	venv/bin/python -m pytest deltacat/tests/integ -v -m integration
 
 test-converter:
@@ -52,6 +53,7 @@ test-converter:
 	docker-compose -f dev/iceberg-integration/docker-compose-integration.yml up -d
 	sleep 3
 	docker-compose -f dev/iceberg-integration/docker-compose-integration.yml exec -T spark-iceberg ipython ./provision.py
+	export SPARK_LOCAL_IP="127.0.0.1"
 	venv/bin/python -m pytest deltacat/tests/compute/converter -vv
 
 test-integration-rebuild:
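
Note: exporting SPARK_LOCAL_IP="127.0.0.1" pins Spark's driver address to loopback so the PySpark sessions spun up by these test targets bind consistently on local machines. A minimal Python sketch of the same idea, assuming a locally built SparkSession; the builder settings below are illustrative and not taken from this repo:

import os
from pyspark.sql import SparkSession

# Mirrors `export SPARK_LOCAL_IP="127.0.0.1"` from the Makefile targets above;
# must be set before the JVM-backed session is created.
os.environ.setdefault("SPARK_LOCAL_IP", "127.0.0.1")

spark = (
    SparkSession.builder.master("local[1]")   # local mode, illustrative
    .appName("converter-local-smoke")         # hypothetical app name
    .getOrCreate()
)
spark.stop()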

deltacat/compute/converter/converter_session.py

+10 −2

@@ -14,6 +14,7 @@
 from deltacat.compute.converter.model.converter_session_params import (
     ConverterSessionParams,
 )
+
 from deltacat.compute.converter.constants import DEFAULT_MAX_PARALLEL_DATA_FILE_DOWNLOAD
 from deltacat.compute.converter.steps.convert import convert
 from deltacat.compute.converter.model.convert_input import ConvertInput
@@ -23,6 +24,7 @@
 )
 from deltacat.compute.converter.utils.converter_session_utils import (
     check_data_files_sequence_number,
+    construct_iceberg_table_prefix,
 )
 from deltacat.compute.converter.pyiceberg.replace_snapshot import (
     commit_overwrite_snapshot,
@@ -70,7 +72,13 @@ def converter_session(params: ConverterSessionParams, **kwargs):
     )
 
     iceberg_warehouse_bucket_name = params.iceberg_warehouse_bucket_name
-    print(f"iceberg_warehouse_bucket_name:{iceberg_warehouse_bucket_name}")
+    iceberg_namespace = params.iceberg_namespace
+    iceberg_table_warehouse_prefix = construct_iceberg_table_prefix(
+        iceberg_warehouse_bucket_name=iceberg_warehouse_bucket_name,
+        table_name=table_name,
+        iceberg_namespace=iceberg_namespace,
+    )
+    logger.info(f"iceberg_warehouse_bucket_name:{iceberg_warehouse_bucket_name}")
     merge_keys = params.merge_keys
     # Using table identifier fields as merge keys if merge keys not provided
     if not merge_keys:
@@ -105,7 +113,7 @@ def convert_input_provider(index, item):
         "convert_input": ConvertInput.of(
             files_for_each_bucket=item,
             convert_task_index=index,
-            iceberg_warehouse_bucket_name=iceberg_warehouse_bucket_name,
+            iceberg_table_warehouse_prefix=iceberg_table_warehouse_prefix,
             identifier_fields=identifier_fields,
             compact_small_files=compact_small_files,
             position_delete_for_multiple_data_files=position_delete_for_multiple_data_files,
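
Note: with iceberg_namespace now a required session parameter, the warehouse prefix handed to each convert task is scoped to <bucket>/<namespace>/<table>/data instead of the bucket root. A minimal calling sketch, assuming only the parameter names visible in this diff; the catalog, table name, and bucket values are placeholders, and any additional required keys defined elsewhere in ConverterSessionParams would also need to be supplied:

from deltacat.compute.converter.converter_session import converter_session
from deltacat.compute.converter.model.converter_session_params import (
    ConverterSessionParams,
)

params = ConverterSessionParams.of(
    {
        "catalog": catalog,                            # assumption: a pyiceberg Catalog loaded elsewhere
        "iceberg_table_name": "default.orders",        # hypothetical table
        "iceberg_warehouse_bucket_name": "warehouse",  # hypothetical bucket
        "iceberg_namespace": "default",
        "merge_keys": ["primary_key"],                 # optional; identifier fields are used if omitted
    }
)
converter_session(params=params)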

deltacat/compute/converter/model/convert_input.py

+11 −4

@@ -7,23 +7,26 @@ class ConvertInput(Dict):
     def of(
         files_for_each_bucket,
         convert_task_index,
-        iceberg_warehouse_bucket_name,
+        iceberg_table_warehouse_prefix,
         identifier_fields,
         compact_small_files,
         position_delete_for_multiple_data_files,
         max_parallel_data_file_download,
+        s3_file_system,
     ) -> ConvertInput:
 
         result = ConvertInput()
         result["files_for_each_bucket"] = files_for_each_bucket
         result["convert_task_index"] = convert_task_index
         result["identifier_fields"] = identifier_fields
-        result["iceberg_warehouse_bucket_name"] = iceberg_warehouse_bucket_name
+        result["iceberg_table_warehouse_prefix"] = iceberg_table_warehouse_prefix
         result["compact_small_files"] = compact_small_files
         result[
             "position_delete_for_multiple_data_files"
         ] = position_delete_for_multiple_data_files
         result["max_parallel_data_file_download"] = max_parallel_data_file_download
+        result["s3_file_system"] = s3_file_system
+
         return result
 
     @property
@@ -39,8 +42,8 @@ def convert_task_index(self) -> int:
         return self["convert_task_index"]
 
     @property
-    def iceberg_warehouse_bucket_name(self) -> str:
-        return self["iceberg_warehouse_bucket_name"]
+    def iceberg_table_warehouse_prefix(self) -> str:
+        return self["iceberg_table_warehouse_prefix"]
 
     @property
     def compact_small_files(self) -> bool:
@@ -53,3 +56,7 @@ def position_delete_for_multiple_data_files(self) -> bool:
     @property
     def max_parallel_data_file_download(self) -> int:
         return self["max_parallel_data_file_download"]
+
+    @property
+    def s3_file_system(self):
+        return self["s3_file_system"]
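
Note: ConvertInput stays a plain dict with a factory and property accessors; this change swaps the bucket name for a fully qualified table prefix and threads an s3_file_system handle through to the upload path. A small construction sketch under those assumptions (all values below are placeholders):

from deltacat.compute.converter.model.convert_input import ConvertInput

partition_value = "pk_bucket=1"                      # placeholder for the partition Record
data_files, equality_deletes, position_deletes = [], [], []

convert_input = ConvertInput.of(
    files_for_each_bucket=(partition_value, (data_files, equality_deletes, position_deletes)),
    convert_task_index=0,
    iceberg_table_warehouse_prefix="warehouse/default/orders/data",  # placeholder prefix
    identifier_fields=["primary_key"],
    compact_small_files=False,
    position_delete_for_multiple_data_files=True,
    max_parallel_data_file_download=10,
    s3_file_system=None,  # None falls back to get_s3_file_system() inside upload_table_with_retry
)
assert convert_input.iceberg_table_warehouse_prefix == "warehouse/default/orders/data"
assert convert_input.s3_file_system is None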

deltacat/compute/converter/model/converter_session_params.py

+7
@@ -18,6 +18,9 @@ def of(params: Optional[Dict]) -> ConverterSessionParams:
         assert (
             params.get("iceberg_warehouse_bucket_name") is not None
         ), "iceberg_warehouse_bucket_name is a required arg"
+        assert (
+            params.get("iceberg_namespace") is not None
+        ), "iceberg_namespace is a required arg"
         result = ConverterSessionParams(params)
 
         result.compact_small_files = params.get("compact_small_files", False)
@@ -44,6 +47,10 @@ def iceberg_table_name(self) -> str:
     def iceberg_warehouse_bucket_name(self) -> str:
         return self["iceberg_warehouse_bucket_name"]
 
+    @property
+    def iceberg_namespace(self) -> str:
+        return self["iceberg_namespace"]
+
     @property
     def compact_small_files(self) -> bool:
         return self["compact_small_files"]
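
Note: iceberg_namespace is validated the same way as iceberg_warehouse_bucket_name, so a missing namespace now fails fast when the params object is built. A sketch of the accessors, assuming any keys asserted earlier in of() (not shown in this hunk) are also supplied in practice:

from deltacat.compute.converter.model.converter_session_params import (
    ConverterSessionParams,
)

raw = {
    # Plus whatever keys of() validates before this hunk (omitted here for brevity).
    "iceberg_warehouse_bucket_name": "warehouse",   # placeholder bucket
    "iceberg_namespace": "default",
}
params = ConverterSessionParams.of(raw)
assert params.iceberg_namespace == "default"
assert params.compact_small_files is False   # default applied in of()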

deltacat/compute/converter/pyiceberg/replace_snapshot.py

+1 −1

@@ -12,7 +12,7 @@
 )
 import itertools
 from pyiceberg.utils.concurrent import ExecutorFactory
-from pyiceberg.table import UpdateSnapshot, _SnapshotProducer
+from pyiceberg.table.update.snapshot import UpdateSnapshot, _SnapshotProducer
 
 
 class _ReplaceFiles(_SnapshotProducer["_ReplaceFiles"]):
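
Note: UpdateSnapshot and _SnapshotProducer moved to pyiceberg.table.update.snapshot in newer pyiceberg releases, which is what this one-line change tracks. If code ever needed to tolerate both layouts, a guarded import is one option; this is a sketch, not what the repo does:

try:
    # Layout used by this commit (newer pyiceberg releases)
    from pyiceberg.table.update.snapshot import UpdateSnapshot, _SnapshotProducer
except ImportError:
    # Older layout where the same names were importable from pyiceberg.table
    from pyiceberg.table import UpdateSnapshot, _SnapshotProducer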

deltacat/compute/converter/steps/convert.py

+26 −9

@@ -8,6 +8,9 @@
 import logging
 from deltacat.compute.converter.model.convert_input import ConvertInput
 from deltacat.compute.converter.utils.s3u import upload_table_with_retry
+from deltacat.compute.converter.utils.converter_session_utils import (
+    partition_value_record_to_partition_value_string,
+)
 from deltacat import logs
 
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
@@ -17,14 +20,14 @@
 def convert(convert_input: ConvertInput):
     files_for_each_bucket = convert_input.files_for_each_bucket
     convert_task_index = convert_input.convert_task_index
-    iceberg_warehouse_bucket_name = convert_input.iceberg_warehouse_bucket_name
+    iceberg_table_warehouse_prefix = convert_input.iceberg_table_warehouse_prefix
     identifier_fields = convert_input.identifier_fields
     compact_small_files = convert_input.compact_small_files
     position_delete_for_multiple_data_files = (
         convert_input.position_delete_for_multiple_data_files
     )
     max_parallel_data_file_download = convert_input.max_parallel_data_file_download
-
+    s3_file_system = convert_input.s3_file_system
     if not position_delete_for_multiple_data_files:
         raise NotImplementedError(
             f"Distributed file level position delete compute is not supported yet"
@@ -34,16 +37,23 @@ def convert(convert_input: ConvertInput):
 
     logger.info(f"Starting convert task index: {convert_task_index}")
     data_files, equality_delete_files, position_delete_files = files_for_each_bucket[1]
+    partition_value_str = partition_value_record_to_partition_value_string(
+        files_for_each_bucket[0]
+    )
     partition_value = files_for_each_bucket[0]
+    iceberg_table_warehouse_prefix_with_partition = (
+        f"{iceberg_table_warehouse_prefix}/{partition_value_str}"
+    )
     (
         to_be_deleted_files_list,
         to_be_added_files_list,
     ) = compute_pos_delete_with_limited_parallelism(
         data_files_list=data_files,
         identifier_columns=identifier_fields,
         equality_delete_files_list=equality_delete_files,
-        iceberg_warehouse_bucket_name=iceberg_warehouse_bucket_name,
+        iceberg_table_warehouse_prefix_with_partition=iceberg_table_warehouse_prefix_with_partition,
         max_parallel_data_file_download=max_parallel_data_file_download,
+        s3_file_system=s3_file_system,
     )
     to_be_delete_files_dict = defaultdict()
     to_be_delete_files_dict[partition_value] = to_be_deleted_files_list
@@ -68,7 +78,9 @@ def filter_rows_to_be_deleted(
         f"length_pos_delete_table, {len(positional_delete_table)}, length_data_table:{len(data_file_table)}"
     )
     if positional_delete_table:
-        positional_delete_table = positional_delete_table.drop(["primarykey"])
+        # TODO: Add support for multiple identify columns
+        identifier_column = identifier_columns[0]
+        positional_delete_table = positional_delete_table.drop([identifier_column])
     if len(positional_delete_table) == len(data_file_table):
         return True, None
     return False, positional_delete_table
@@ -78,7 +90,8 @@ def compute_pos_delete(
     equality_delete_table,
     data_file_table,
     identifier_columns,
-    iceberg_warehouse_bucket_name,
+    iceberg_table_warehouse_prefix_with_partition,
+    s3_file_system,
 ):
     delete_whole_file, new_position_delete_table = filter_rows_to_be_deleted(
         data_file_table=data_file_table,
@@ -89,7 +102,10 @@ def compute_pos_delete(
     logger.info(f"compute_pos_delete_table:{new_position_delete_table.to_pydict()}")
     if new_position_delete_table:
         new_pos_delete_s3_link = upload_table_with_retry(
-            new_position_delete_table, iceberg_warehouse_bucket_name, {}
+            table=new_position_delete_table,
+            s3_url_prefix=iceberg_table_warehouse_prefix_with_partition,
+            s3_table_writer_kwargs={},
+            s3_file_system=s3_file_system,
         )
     return delete_whole_file, new_pos_delete_s3_link
 
@@ -126,8 +142,9 @@ def compute_pos_delete_with_limited_parallelism(
     data_files_list,
     identifier_columns,
     equality_delete_files_list,
-    iceberg_warehouse_bucket_name,
+    iceberg_table_warehouse_prefix_with_partition,
     max_parallel_data_file_download,
+    s3_file_system,
 ):
     to_be_deleted_file_list = []
     to_be_added_pos_delete_file_list = []
@@ -144,8 +161,9 @@ def compute_pos_delete_with_limited_parallelism(
         delete_whole_file, new_pos_delete_s3_link = compute_pos_delete(
             equality_delete_table=equality_delete_table,
             data_file_table=data_table,
-            iceberg_warehouse_bucket_name=iceberg_warehouse_bucket_name,
+            iceberg_table_warehouse_prefix_with_partition=iceberg_table_warehouse_prefix_with_partition,
             identifier_columns=identifier_columns,
+            s3_file_system=s3_file_system,
         )
         if delete_whole_file:
             to_be_deleted_file_list.extend(data_files)
@@ -182,7 +200,6 @@ def download_parquet_with_daft_hash_applied(
         io_config=io_config,
         coerce_int96_timestamp_unit=coerce_int96_timestamp_unit,
     )
-    logger.info(f"debug_identify_columns:{identify_columns}")
     df = df.select(daft.col(identify_columns[0]).hash())
     arrow_table = df.to_arrow()
     return arrow_table
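
Note: the position-delete table now drops the actual identifier column (the first entry of identifier_fields) rather than a hard-coded "primarykey" column, and the resulting file is uploaded under a partition-scoped prefix. A small pyarrow sketch of the drop step with made-up column names:

import pyarrow as pa

identifier_columns = ["primary_key"]          # hypothetical identifier field
positional_delete_table = pa.table(
    {
        "primary_key": [1, 2, 3],
        "file_path": ["s3://data/a.parquet"] * 3,
        "pos": [0, 5, 9],
    }
)
# Same pattern as filter_rows_to_be_deleted: only the first identifier column is handled for now
identifier_column = identifier_columns[0]
positional_delete_table = positional_delete_table.drop([identifier_column])
print(positional_delete_table.column_names)   # ['file_path', 'pos']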

deltacat/compute/converter/utils/converter_session_utils.py

+12
@@ -48,3 +48,15 @@ def append_larger_sequence_number_data_files(data_files_list):
             sublist_file_list.append(file)
         result.append(sublist_file_list)
     return result
+
+
+def construct_iceberg_table_prefix(
+    iceberg_warehouse_bucket_name, table_name, iceberg_namespace
+):
+    return f"{iceberg_warehouse_bucket_name}/{iceberg_namespace}/{table_name}/data"
+
+
+def partition_value_record_to_partition_value_string(partition):
+    # Get string representation of partition value out of Record[partition_value]
+    partition_value_str = partition.__repr__().split("[", 1)[1].split("]")[0]
+    return partition_value_str
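
Note: construct_iceberg_table_prefix joins the bucket, namespace, and table name under a trailing /data segment, and partition_value_record_to_partition_value_string pulls the text between the first pair of brackets of a Record's repr. A quick illustration; FakeRecord is a stand-in for a pyiceberg partition Record whose repr looks like Record[5]:

from deltacat.compute.converter.utils.converter_session_utils import (
    construct_iceberg_table_prefix,
    partition_value_record_to_partition_value_string,
)

prefix = construct_iceberg_table_prefix(
    iceberg_warehouse_bucket_name="warehouse",
    table_name="orders",
    iceberg_namespace="default",
)
print(prefix)  # warehouse/default/orders/data


class FakeRecord:
    # Assumption: mimics the repr of a pyiceberg partition Record
    def __repr__(self):
        return "Record[5]"


print(partition_value_record_to_partition_value_string(FakeRecord()))  # 5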

deltacat/compute/converter/utils/iceberg_columns.py

+17 −1

@@ -1,16 +1,27 @@
 import pyarrow as pa
 from typing import Union
 
+# Refer to: https://iceberg.apache.org/spec/#reserved-field-ids for reserved field ids
+ICEBERG_RESERVED_FIELD_ID_FOR_FILE_PATH_COLUMN = 2147483546
+
+# Refer to: https://iceberg.apache.org/spec/#reserved-field-ids for reserved field ids
+ICEBERG_RESERVED_FIELD_ID_FOR_POS_COLUMN = 2147483545
+
 
 def _get_iceberg_col_name(suffix):
-    return f"{suffix}"
+    return suffix
 
 
 _ORDERED_RECORD_IDX_COLUMN_NAME = _get_iceberg_col_name("pos")
 _ORDERED_RECORD_IDX_COLUMN_TYPE = pa.int64()
+_ORDERED_RECORD_IDX_FIELD_METADATA = {
+    b"PARQUET:field_id": f"{ICEBERG_RESERVED_FIELD_ID_FOR_POS_COLUMN}"
+}
 _ORDERED_RECORD_IDX_COLUMN_FIELD = pa.field(
     _ORDERED_RECORD_IDX_COLUMN_NAME,
     _ORDERED_RECORD_IDX_COLUMN_TYPE,
+    metadata=_ORDERED_RECORD_IDX_FIELD_METADATA,
+    nullable=False,
 )
 
 
@@ -32,7 +43,12 @@ def append_record_idx_col(table: pa.Table, ordered_record_indices) -> pa.Table:
 
 _FILE_PATH_COLUMN_NAME = _get_iceberg_col_name("file_path")
 _FILE_PATH_COLUMN_TYPE = pa.string()
+_FILE_PATH_FIELD_METADATA = {
+    b"PARQUET:field_id": f"{ICEBERG_RESERVED_FIELD_ID_FOR_FILE_PATH_COLUMN}"
+}
 _FILE_PATH_COLUMN_FIELD = pa.field(
     _FILE_PATH_COLUMN_NAME,
     _FILE_PATH_COLUMN_TYPE,
+    metadata=_FILE_PATH_FIELD_METADATA,
+    nullable=False,
 )
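
Note: tagging the pos and file_path columns with the reserved Iceberg field IDs (2147483545 and 2147483546) through PARQUET:field_id metadata lets the Parquet files written for position deletes carry the schema the Iceberg spec expects, and nullable=False matches the spec's requirement that both columns be required. A standalone pyarrow sketch of the same field construction, independent of the module's private names:

import pyarrow as pa

# Reserved field ids from https://iceberg.apache.org/spec/#reserved-field-ids
file_path_field = pa.field(
    "file_path",
    pa.string(),
    nullable=False,
    metadata={b"PARQUET:field_id": b"2147483546"},
)
pos_field = pa.field(
    "pos",
    pa.int64(),
    nullable=False,
    metadata={b"PARQUET:field_id": b"2147483545"},
)
schema = pa.schema([file_path_field, pos_field])
print(schema.field("pos").metadata)  # {b'PARQUET:field_id': b'2147483545'}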

deltacat/compute/converter/utils/s3u.py

+6 −2

@@ -57,6 +57,7 @@ def upload_table_with_retry(
     s3_table_writer_kwargs: Optional[Dict[str, Any]],
     content_type: ContentType = ContentType.PARQUET,
     max_records_per_file: Optional[int] = 4000000,
+    s3_file_system=None,
     **s3_client_kwargs,
 ) -> List[str]:
     """
@@ -72,9 +73,12 @@ def upload_table_with_retry(
     if s3_table_writer_kwargs is None:
         s3_table_writer_kwargs = {}
 
-    s3_file_system = get_s3_file_system(content_type=content_type)
+    if not s3_file_system:
+        s3_file_system = get_s3_file_system(content_type=content_type)
     capture_object = CapturedBlockWritePaths()
-    block_write_path_provider = UuidBlockWritePathProvider(capture_object)
+    block_write_path_provider = UuidBlockWritePathProvider(
+        capture_object=capture_object
+    )
     s3_table_writer_func = get_table_writer(table)
     table_record_count = get_table_length(table)
     if max_records_per_file is None or not table_record_count:
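
Note: accepting an injected s3_file_system is what lets the MinIO-backed converter test point writes at a local endpoint instead of real S3, while the default path still falls back to get_s3_file_system(). A sketch of building such a filesystem with pyarrow; the endpoint and credentials below are illustrative values for a local MinIO container, not taken from this diff:

import pyarrow.fs as pafs

minio_fs = pafs.S3FileSystem(
    access_key="admin",             # assumption: MinIO test credentials
    secret_key="password",
    endpoint_override="http://localhost:9000",
)

# Then passed through instead of the default filesystem, e.g.:
# upload_table_with_retry(
#     table=arrow_table,
#     s3_url_prefix="warehouse/default/orders/data/pk_bucket=1",
#     s3_table_writer_kwargs={},
#     s3_file_system=minio_fs,
# )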

deltacat/tests/compute/converter/conftest.py

+8
@@ -1,6 +1,7 @@
 import pytest
 from pyspark.sql import SparkSession
 import os
+import ray
 from pyiceberg.catalog import Catalog, load_catalog
 
 
@@ -70,3 +71,10 @@ def session_catalog() -> Catalog:
             "s3.secret-access-key": "password",
         },
     )
+
+
+@pytest.fixture(autouse=True, scope="module")
+def setup_ray_cluster():
+    ray.init(local_mode=True, ignore_reinit_error=True)
+    yield
+    ray.shutdown()
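
Note: the autouse, module-scoped fixture stands up a local-mode Ray runtime before the converter tests run and tears it down afterwards, so Ray tasks launched by converter_session execute in-process without a separate cluster. A minimal sketch of a test relying on such a fixture being in scope via conftest.py (the test body is illustrative):

import ray


@ray.remote
def echo(value):
    return value


def test_ray_task_runs_locally():
    # No ray.init() here: the autouse setup_ray_cluster fixture has already initialized Ray.
    assert ray.get(echo.remote("ok")) == "ok"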
