)
import logging
from deltacat import logs
- from collections import defaultdict
from deltacat.compute.converter.model.converter_session_params import (
    ConverterSessionParams,
)

+
from deltacat.compute.converter.constants import DEFAULT_MAX_PARALLEL_DATA_FILE_DOWNLOAD
from deltacat.compute.converter.steps.convert import convert
from deltacat.compute.converter.model.convert_input import ConvertInput
    parquet_files_dict_to_iceberg_data_files,
)
from deltacat.compute.converter.utils.converter_session_utils import (
-     check_data_files_sequence_number,
    construct_iceberg_table_prefix,
)
from deltacat.compute.converter.pyiceberg.replace_snapshot import (
    commit_overwrite_snapshot,
+     commit_append_snapshot,
)
from deltacat.compute.converter.pyiceberg.catalog import load_table
+ from deltacat.compute.converter.utils.converter_session_utils import (
+     group_all_files_to_each_bucket,
+ )

logger = logs.configure_deltacat_logger(logging.getLogger(__name__))

@@ -44,33 +47,15 @@ def converter_session(params: ConverterSessionParams, **kwargs):
    catalog = params.catalog
    table_name = params.iceberg_table_name
    iceberg_table = load_table(catalog, table_name)
+     enforce_primary_key_uniqueness = params.enforce_primary_key_uniqueness
    data_file_dict, equality_delete_dict, pos_delete_dict = fetch_all_bucket_files(
        iceberg_table
    )
-
-     # files_for_each_bucket contains the following files list:
-     # {partition_value: [(equality_delete_files_list, data_files_list, pos_delete_files_list)]
-     files_for_each_bucket = defaultdict(tuple)
-     for k, v in data_file_dict.items():
-         logger.info(f"data_file: k, v:{k, v}")
-     for k, v in equality_delete_dict.items():
-         logger.info(f"equality_delete_file: k, v:{k, v}")
-     for partition_value, equality_delete_file_list in equality_delete_dict.items():
-         (
-             result_equality_delete_file,
-             result_data_file,
-         ) = check_data_files_sequence_number(
-             data_files_list=data_file_dict[partition_value],
-             equality_delete_files_list=equality_delete_dict[partition_value],
-         )
-         logger.info(f"result_data_file:{result_data_file}")
-         logger.info(f"result_equality_delete_file:{result_equality_delete_file}")
-         files_for_each_bucket[partition_value] = (
-             result_data_file,
-             result_equality_delete_file,
-             [],
-         )
-
+     convert_input_files_for_all_buckets = group_all_files_to_each_bucket(
+         data_file_dict=data_file_dict,
+         equality_delete_dict=equality_delete_dict,
+         pos_delete_dict=pos_delete_dict,
+     )
    iceberg_warehouse_bucket_name = params.iceberg_warehouse_bucket_name
    iceberg_namespace = params.iceberg_namespace
    iceberg_table_warehouse_prefix = construct_iceberg_table_prefix(
@@ -116,6 +101,7 @@ def convert_input_provider(index, item):
            iceberg_table_warehouse_prefix=iceberg_table_warehouse_prefix,
            identifier_fields=identifier_fields,
            compact_small_files=compact_small_files,
+             enforce_primary_key_uniqueness=enforce_primary_key_uniqueness,
            position_delete_for_multiple_data_files=position_delete_for_multiple_data_files,
            max_parallel_data_file_download=max_parallel_data_file_download,
        )
@@ -125,7 +111,7 @@ def convert_input_provider(index, item):
    # Assuming that the memory consumed by each bucket doesn't exceed one node's memory limit.
    # TODO: Add split mechanism to split large buckets
    convert_tasks_pending = invoke_parallel(
-         items=files_for_each_bucket.items(),
+         items=convert_input_files_for_all_buckets.items(),
        ray_task=convert,
        max_parallelism=task_max_parallelism,
        options_provider=convert_options_provider,
@@ -143,9 +129,16 @@ def convert_input_provider(index, item):
        table_metadata=iceberg_table.metadata,
        files_dict_list=to_be_added_files_dict_list,
    )
-     commit_overwrite_snapshot(
-         iceberg_table=iceberg_table,
-         # equality_delete_files + data file that all rows are deleted
-         to_be_deleted_files_list=to_be_deleted_files_list[0],
-         new_position_delete_files=new_position_delete_files,
-     )
+     logger.debug(f"to_be_deleted_files_list: {to_be_deleted_files_list}")
+     if not to_be_deleted_files_list:
+         commit_append_snapshot(
+             iceberg_table=iceberg_table,
+             new_position_delete_files=new_position_delete_files,
+         )
+     else:
+         commit_overwrite_snapshot(
+             iceberg_table=iceberg_table,
+             # equality_delete_files + data files whose rows are all deleted
+             to_be_deleted_files_list=to_be_deleted_files_list,
+             new_position_delete_files=new_position_delete_files,
+         )
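
To make the refactor above easier to follow, here is a hedged sketch of what the new group_all_files_to_each_bucket helper is assumed to do, inferred only from the keyword arguments at its call site in this diff and from the per-bucket tuple layout described in the removed inline comment; the real implementation lives in deltacat.compute.converter.utils.converter_session_utils and its return shape may differ.

from collections import defaultdict

# Hypothetical illustration only; the actual helper in
# deltacat.compute.converter.utils.converter_session_utils may differ.
def group_all_files_to_each_bucket(data_file_dict, equality_delete_dict, pos_delete_dict):
    # Group every file type for the same partition value (bucket) into one tuple,
    # so each bucket can be handed to a single convert task.
    files_for_each_bucket = defaultdict(tuple)
    for partition_value, data_files in data_file_dict.items():
        files_for_each_bucket[partition_value] = (
            data_files,
            equality_delete_dict.get(partition_value, []),
            pos_delete_dict.get(partition_value, []),
        )
    return files_for_each_bucket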