21
21
import java .util .ArrayDeque ;
22
22
import java .util .Collection ;
23
23
import java .util .Deque ;
24
+ import java .util .List ;
24
25
import java .util .Map ;
26
+ import java .util .Objects ;
25
27
import java .util .Set ;
26
28
import java .util .stream .Collectors ;
27
29
import org .apache .iceberg .ManifestGroup .CreateTasksFunction ;
28
30
import org .apache .iceberg .ManifestGroup .TaskContext ;
29
31
import org .apache .iceberg .io .CloseableIterable ;
30
- import org .apache .iceberg .relocated .com .google .common .collect .FluentIterable ;
31
- import org .apache .iceberg .relocated .com .google .common .collect .ImmutableList ;
32
+ import org .apache .iceberg .relocated .com .google .common .collect .Lists ;
32
33
import org .apache .iceberg .relocated .com .google .common .collect .Maps ;
34
+ import org .apache .iceberg .relocated .com .google .common .collect .Sets ;
33
35
import org .apache .iceberg .util .SnapshotUtil ;
34
36
import org .apache .iceberg .util .TableScanUtil ;
35
37
@@ -63,21 +65,27 @@ protected CloseableIterable<ChangelogScanTask> doPlanFiles(
63
65
return CloseableIterable .empty ();
64
66
}
65
67
66
- Set <Long > changelogSnapshotIds = toSnapshotIds (changelogSnapshots );
67
-
68
- Set <ManifestFile > newDataManifests =
69
- FluentIterable .from (changelogSnapshots )
70
- .transformAndConcat (snapshot -> snapshot .dataManifests (table ().io ()))
71
- .filter (manifest -> changelogSnapshotIds .contains (manifest .snapshotId ()))
72
- .toSet ();
68
+ Set <ManifestFile > newDataManifests = Sets .newHashSet ();
69
+ Set <ManifestFile > newDeleteManifests = Sets .newHashSet ();
70
+ Map <Long , Snapshot > addedToChangedSnapshots = Maps .newHashMap ();
71
+ for (Snapshot snapshot : changelogSnapshots ) {
72
+ List <ManifestFile > dataManifests = snapshot .dataManifests (table ().io ());
73
+ for (ManifestFile manifest : dataManifests ) {
74
+ if (!newDataManifests .contains (manifest )) {
75
+ addedToChangedSnapshots .put (manifest .snapshotId (), snapshot );
76
+ newDataManifests .add (manifest );
77
+ }
78
+ }
79
+ newDeleteManifests .addAll (snapshot .deleteManifests (table ().io ()));
80
+ }
73
81
74
82
ManifestGroup manifestGroup =
75
- new ManifestGroup (table ().io (), newDataManifests , ImmutableList . of () )
83
+ new ManifestGroup (table ().io (), newDataManifests , newDeleteManifests )
76
84
.specsById (table ().specs ())
77
85
.caseSensitive (isCaseSensitive ())
78
86
.select (scanColumns ())
79
87
.filterData (filter ())
80
- .filterManifestEntries (entry -> changelogSnapshotIds . contains (entry .snapshotId ()))
88
+ .filterManifestEntries (entry -> addedToChangedSnapshots . containsKey (entry .snapshotId ()))
81
89
.ignoreExisting ()
82
90
.columnsToKeepStats (columnsToKeepStats ());
83
91
@@ -89,7 +97,8 @@ protected CloseableIterable<ChangelogScanTask> doPlanFiles(
89
97
manifestGroup = manifestGroup .planWith (planExecutor ());
90
98
}
91
99
92
- return manifestGroup .plan (new CreateDataFileChangeTasks (changelogSnapshots ));
100
+ return manifestGroup .plan (
101
+ new CreateDataFileChangeTasks (changelogSnapshots , addedToChangedSnapshots ));
93
102
}
94
103
95
104
@ Override
@@ -105,11 +114,6 @@ private Deque<Snapshot> orderedChangelogSnapshots(Long fromIdExcl, long toIdIncl
105
114
106
115
for (Snapshot snapshot : SnapshotUtil .ancestorsBetween (table (), toIdIncl , fromIdExcl )) {
107
116
if (!snapshot .operation ().equals (DataOperations .REPLACE )) {
108
- if (!snapshot .deleteManifests (table ().io ()).isEmpty ()) {
109
- throw new UnsupportedOperationException (
110
- "Delete files are currently not supported in changelog scans" );
111
- }
112
-
113
117
changelogSnapshots .addFirst (snapshot );
114
118
}
115
119
}
@@ -134,50 +138,81 @@ private static Map<Long, Integer> computeSnapshotOrdinals(Deque<Snapshot> snapsh
134
138
}
135
139
136
140
private static class CreateDataFileChangeTasks implements CreateTasksFunction <ChangelogScanTask > {
137
- private static final DeleteFile [] NO_DELETES = new DeleteFile [0 ];
138
141
139
142
private final Map <Long , Integer > snapshotOrdinals ;
143
+ private final Map <Long , Snapshot > addedToChangedSnapshots ;
140
144
141
- CreateDataFileChangeTasks (Deque <Snapshot > snapshots ) {
145
+ CreateDataFileChangeTasks (
146
+ Deque <Snapshot > snapshots , Map <Long , Snapshot > addedToChangedSnapshots ) {
142
147
this .snapshotOrdinals = computeSnapshotOrdinals (snapshots );
148
+ this .addedToChangedSnapshots = addedToChangedSnapshots ;
143
149
}
144
150
145
151
@ Override
146
152
public CloseableIterable <ChangelogScanTask > apply (
147
153
CloseableIterable <ManifestEntry <DataFile >> entries , TaskContext context ) {
148
154
149
- return CloseableIterable .transform (
150
- entries ,
151
- entry -> {
152
- long commitSnapshotId = entry .snapshotId ();
153
- int changeOrdinal = snapshotOrdinals .get (commitSnapshotId );
154
- DataFile dataFile = entry .file ().copy (context .shouldKeepStats ());
155
-
156
- switch (entry .status ()) {
157
- case ADDED :
158
- return new BaseAddedRowsScanTask (
159
- changeOrdinal ,
160
- commitSnapshotId ,
161
- dataFile ,
162
- NO_DELETES ,
163
- context .schemaAsString (),
164
- context .specAsString (),
165
- context .residuals ());
166
-
167
- case DELETED :
168
- return new BaseDeletedDataFileScanTask (
169
- changeOrdinal ,
170
- commitSnapshotId ,
171
- dataFile ,
172
- NO_DELETES ,
173
- context .schemaAsString (),
174
- context .specAsString (),
175
- context .residuals ());
176
-
177
- default :
178
- throw new IllegalArgumentException ("Unexpected entry status: " + entry .status ());
179
- }
180
- });
155
+ return CloseableIterable .filter (
156
+ CloseableIterable .transform (
157
+ entries ,
158
+ entry -> {
159
+ long snapshotId = entry .snapshotId ();
160
+ Snapshot snapshot = addedToChangedSnapshots .get (snapshotId );
161
+ long commitSnapshotId = snapshot .snapshotId ();
162
+ int changeOrdinal = snapshotOrdinals .get (snapshot .snapshotId ());
163
+ DataFile dataFile = entry .file ().copy (context .shouldKeepStats ());
164
+ DeleteFile [] deleteFiles = context .deletes ().forDataFile (dataFile );
165
+ List <DeleteFile > addedDeletes = Lists .newArrayList ();
166
+ List <DeleteFile > existingDeletes = Lists .newArrayList ();
167
+ for (DeleteFile file : deleteFiles ) {
168
+ if (file .dataSequenceNumber () == snapshot .sequenceNumber ()) {
169
+ addedDeletes .add (file );
170
+ } else {
171
+ existingDeletes .add (file );
172
+ }
173
+ }
174
+
175
+ switch (entry .status ()) {
176
+ case ADDED :
177
+ if (snapshotId == commitSnapshotId ) {
178
+ return new BaseAddedRowsScanTask (
179
+ changeOrdinal ,
180
+ commitSnapshotId ,
181
+ dataFile ,
182
+ addedDeletes .toArray (new DeleteFile [0 ]),
183
+ context .schemaAsString (),
184
+ context .specAsString (),
185
+ context .residuals ());
186
+ } else if (deleteFiles .length > 0 ) {
187
+ return new BaseDeletedRowsScanTask (
188
+ changeOrdinal ,
189
+ commitSnapshotId ,
190
+ dataFile ,
191
+ addedDeletes .toArray (new DeleteFile [0 ]),
192
+ existingDeletes .toArray (new DeleteFile [0 ]),
193
+ context .schemaAsString (),
194
+ context .specAsString (),
195
+ context .residuals ());
196
+ } else {
197
+ return null ;
198
+ }
199
+
200
+ case DELETED :
201
+ return new BaseDeletedDataFileScanTask (
202
+ changeOrdinal ,
203
+ commitSnapshotId ,
204
+ dataFile ,
205
+ existingDeletes .toArray (new DeleteFile [0 ]),
206
+ context .schemaAsString (),
207
+ context .specAsString (),
208
+ context .residuals ());
209
+
210
+ default :
211
+ throw new IllegalArgumentException (
212
+ "Unexpected entry status: " + entry .status ());
213
+ }
214
+ }),
215
+ Objects ::nonNull );
181
216
}
182
217
}
183
218
}
0 commit comments