address comments

yiqinzhu · yiqinzhu · commit 2e0d530ece01 · 2025-03-13T12:37:33.000-07:00
diff --git a/deltacat/compute/converter/steps/convert.py b/deltacat/compute/converter/steps/convert.py
@@ -8,6 +8,9 @@
 import logging
 from deltacat.compute.converter.model.convert_input import ConvertInput
 from deltacat.compute.converter.utils.s3u import upload_table_with_retry
+from deltacat.compute.converter.utils.converter_session_utils import (
+    partition_value_record_to_partition_value_string,
+)
 from deltacat import logs
 
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
@@ -34,12 +37,8 @@ def convert(convert_input: ConvertInput):
 
     logger.info(f"Starting convert task index: {convert_task_index}")
     data_files, equality_delete_files, position_delete_files = files_for_each_bucket[1]
-    # Get string representation of partition value out of Record[partition_value]
-    partition_value_str = (
-        files_for_each_bucket[0].__repr__().split("[", 1)[1].split("]")[0]
-    )
-    partition_value_str = (
-        files_for_each_bucket[0].__repr__().split("[", 1)[1].split("]")[0]
+    partition_value_str = partition_value_record_to_partition_value_string(
+        files_for_each_bucket[0]
     )
     partition_value = files_for_each_bucket[0]
     iceberg_table_warehouse_prefix_with_partition = (
@@ -81,7 +80,7 @@ def filter_rows_to_be_deleted(
     if positional_delete_table:
         # TODO: Add support for multiple identify columns
         identifier_column = identifier_columns[0]
-        positional_delete_table = positional_delete_table.drop(identifier_column)
+        positional_delete_table = positional_delete_table.drop([identifier_column])
     if len(positional_delete_table) == len(data_file_table):
         return True, None
     return False, positional_delete_table
diff --git a/deltacat/compute/converter/utils/converter_session_utils.py b/deltacat/compute/converter/utils/converter_session_utils.py
@@ -54,3 +54,9 @@ def construct_iceberg_table_prefix(
     iceberg_warehouse_bucket_name, table_name, iceberg_namespace
 ):
     return f"{iceberg_warehouse_bucket_name}/{iceberg_namespace}/{table_name}/data"
+
+
+def partition_value_record_to_partition_value_string(partition):
+    # repr is expected to look like Record[partition_value]; keep the text inside [ ]
+    partition_value_str = partition.__repr__().split("[", 1)[1].split("]")[0]
+    return partition_value_str
diff --git a/deltacat/compute/converter/utils/iceberg_columns.py b/deltacat/compute/converter/utils/iceberg_columns.py
@@ -1,14 +1,22 @@
 import pyarrow as pa
 from typing import Union
 
+# Refer to: https://iceberg.apache.org/spec/#reserved-field-ids for reserved field ids
+ICEBERG_RESERVED_FIELD_ID_FOR_FILE_PATH_COLUMN = 2147483546
+
+# Refer to: https://iceberg.apache.org/spec/#reserved-field-ids for reserved field ids
+ICEBERG_RESERVED_FIELD_ID_FOR_POS_COLUMN = 2147483545
+
 
 def _get_iceberg_col_name(suffix):
     return suffix
 
 
 _ORDERED_RECORD_IDX_COLUMN_NAME = _get_iceberg_col_name("pos")
 _ORDERED_RECORD_IDX_COLUMN_TYPE = pa.int64()
-_ORDERED_RECORD_IDX_FIELD_METADATA = {b"PARQUET:field_id": "2147483545"}
+_ORDERED_RECORD_IDX_FIELD_METADATA = {
+    b"PARQUET:field_id": f"{ICEBERG_RESERVED_FIELD_ID_FOR_POS_COLUMN}"
+}
 _ORDERED_RECORD_IDX_COLUMN_FIELD = pa.field(
     _ORDERED_RECORD_IDX_COLUMN_NAME,
     _ORDERED_RECORD_IDX_COLUMN_TYPE,
@@ -35,7 +43,9 @@ def append_record_idx_col(table: pa.Table, ordered_record_indices) -> pa.Table:
 
 _FILE_PATH_COLUMN_NAME = _get_iceberg_col_name("file_path")
 _FILE_PATH_COLUMN_TYPE = pa.string()
-_FILE_PATH_FIELD_METADATA = {b"PARQUET:field_id": "2147483546"}
+_FILE_PATH_FIELD_METADATA = {
+    b"PARQUET:field_id": f"{ICEBERG_RESERVED_FIELD_ID_FOR_FILE_PATH_COLUMN}"
+}
 _FILE_PATH_COLUMN_FIELD = pa.field(
     _FILE_PATH_COLUMN_NAME,
     _FILE_PATH_COLUMN_TYPE,