 import logging
 from deltacat.compute.converter.model.convert_input import ConvertInput
 from deltacat.compute.converter.utils.s3u import upload_table_with_retry
+from deltacat.compute.converter.utils.converter_session_utils import (
+    partition_value_record_to_partition_value_string,
+)
 from deltacat import logs
 
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
 
 def convert(convert_input: ConvertInput):
     files_for_each_bucket = convert_input.files_for_each_bucket
     convert_task_index = convert_input.convert_task_index
-    iceberg_warehouse_bucket_name = convert_input.iceberg_warehouse_bucket_name
+    iceberg_table_warehouse_prefix = convert_input.iceberg_table_warehouse_prefix
     identifier_fields = convert_input.identifier_fields
     compact_small_files = convert_input.compact_small_files
     position_delete_for_multiple_data_files = (
         convert_input.position_delete_for_multiple_data_files
     )
     max_parallel_data_file_download = convert_input.max_parallel_data_file_download
-
+    s3_file_system = convert_input.s3_file_system
     if not position_delete_for_multiple_data_files:
         raise NotImplementedError(
             f"Distributed file level position delete compute is not supported yet"
@@ -34,16 +37,23 @@ def convert(convert_input: ConvertInput):
 
     logger.info(f"Starting convert task index: {convert_task_index}")
     data_files, equality_delete_files, position_delete_files = files_for_each_bucket[1]
+    partition_value_str = partition_value_record_to_partition_value_string(
+        files_for_each_bucket[0]
+    )
     partition_value = files_for_each_bucket[0]
+    iceberg_table_warehouse_prefix_with_partition = (
+        f"{iceberg_table_warehouse_prefix}/{partition_value_str}"
+    )
     (
         to_be_deleted_files_list,
         to_be_added_files_list,
     ) = compute_pos_delete_with_limited_parallelism(
         data_files_list=data_files,
         identifier_columns=identifier_fields,
         equality_delete_files_list=equality_delete_files,
-        iceberg_warehouse_bucket_name=iceberg_warehouse_bucket_name,
+        iceberg_table_warehouse_prefix_with_partition=iceberg_table_warehouse_prefix_with_partition,
         max_parallel_data_file_download=max_parallel_data_file_download,
+        s3_file_system=s3_file_system,
     )
     to_be_delete_files_dict = defaultdict()
     to_be_delete_files_dict[partition_value] = to_be_deleted_files_list
@@ -68,7 +78,9 @@ def filter_rows_to_be_deleted(
         f"length_pos_delete_table, {len(positional_delete_table)}, length_data_table:{len(data_file_table)}"
     )
     if positional_delete_table:
-        positional_delete_table = positional_delete_table.drop(["primarykey"])
+        # TODO: Add support for multiple identify columns
+        identifier_column = identifier_columns[0]
+        positional_delete_table = positional_delete_table.drop([identifier_column])
     if len(positional_delete_table) == len(data_file_table):
         return True, None
     return False, positional_delete_table
@@ -78,7 +90,8 @@ def compute_pos_delete(
     equality_delete_table,
     data_file_table,
     identifier_columns,
-    iceberg_warehouse_bucket_name,
+    iceberg_table_warehouse_prefix_with_partition,
+    s3_file_system,
 ):
     delete_whole_file, new_position_delete_table = filter_rows_to_be_deleted(
         data_file_table=data_file_table,
@@ -89,7 +102,10 @@ def compute_pos_delete(
     logger.info(f"compute_pos_delete_table:{new_position_delete_table.to_pydict()}")
     if new_position_delete_table:
         new_pos_delete_s3_link = upload_table_with_retry(
-            new_position_delete_table, iceberg_warehouse_bucket_name, {}
+            table=new_position_delete_table,
+            s3_url_prefix=iceberg_table_warehouse_prefix_with_partition,
+            s3_table_writer_kwargs={},
+            s3_file_system=s3_file_system,
         )
     return delete_whole_file, new_pos_delete_s3_link
 
@@ -126,8 +142,9 @@ def compute_pos_delete_with_limited_parallelism(
     data_files_list,
     identifier_columns,
     equality_delete_files_list,
-    iceberg_warehouse_bucket_name,
+    iceberg_table_warehouse_prefix_with_partition,
     max_parallel_data_file_download,
+    s3_file_system,
 ):
     to_be_deleted_file_list = []
     to_be_added_pos_delete_file_list = []
@@ -144,8 +161,9 @@ def compute_pos_delete_with_limited_parallelism(
         delete_whole_file, new_pos_delete_s3_link = compute_pos_delete(
             equality_delete_table=equality_delete_table,
             data_file_table=data_table,
-            iceberg_warehouse_bucket_name=iceberg_warehouse_bucket_name,
+            iceberg_table_warehouse_prefix_with_partition=iceberg_table_warehouse_prefix_with_partition,
             identifier_columns=identifier_columns,
+            s3_file_system=s3_file_system,
         )
         if delete_whole_file:
             to_be_deleted_file_list.extend(data_files)
@@ -182,7 +200,6 @@ def download_parquet_with_daft_hash_applied(
         io_config=io_config,
         coerce_int96_timestamp_unit=coerce_int96_timestamp_unit,
     )
-    logger.info(f"debug_identify_columns:{identify_columns}")
     df = df.select(daft.col(identify_columns[0]).hash())
     arrow_table = df.to_arrow()
     return arrow_table
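Taken together, the changes above replace the bare `iceberg_warehouse_bucket_name` with a partition-scoped write prefix and thread an explicit `s3_file_system` through the call chain. The sketch below illustrates only the intent of the new prefix composition in `convert()`: the prefix and partition record are made-up example values, and the helper shown is a stand-in for `partition_value_record_to_partition_value_string`, whose exact output format is not visible in this diff.

```python
# Illustrative only: mirrors the f-string composition added in convert(),
# with a hypothetical partition record and a stand-in string helper.
iceberg_table_warehouse_prefix = "s3://my-warehouse/db/table/data"  # assumed example value


def partition_value_to_string(partition_record: dict) -> str:
    # Stand-in for partition_value_record_to_partition_value_string();
    # the real helper's formatting may differ.
    return "/".join(f"{k}={v}" for k, v in partition_record.items())


partition_value = {"region": "us-east-1", "day": "2024-01-01"}  # hypothetical partition
partition_value_str = partition_value_to_string(partition_value)

# Same composition as the line introduced in the diff:
iceberg_table_warehouse_prefix_with_partition = (
    f"{iceberg_table_warehouse_prefix}/{partition_value_str}"
)
print(iceberg_table_warehouse_prefix_with_partition)
# -> s3://my-warehouse/db/table/data/region=us-east-1/day=2024-01-01
```

Scoping uploads under this per-partition prefix keeps the position-delete files written by `upload_table_with_retry` grouped by the partition they belong to, rather than landing directly under the warehouse bucket root.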