3737import org .apache .paimon .types .DataField ;
3838import org .apache .paimon .types .RowType ;
3939import org .apache .paimon .utils .FileStorePathFactory ;
40+ import org .apache .paimon .utils .Pair ;
4041import org .apache .paimon .utils .StatsCollectorFactories ;
4142
4243import java .io .IOException ;
@@ -122,7 +123,7 @@ private KeyValueDataFileWriter createDataFileWriter(
122123 new KeyValueThinSerializer (keyType , valueType )::toRow ,
123124 keyType ,
124125 valueType ,
125- type -> formatContext .statsProducer (level , options , type ),
126+ formatContext .statsProducer (level , options ),
126127 schemaId ,
127128 level ,
128129 formatContext .compression (level ),
@@ -137,7 +138,7 @@ private KeyValueDataFileWriter createDataFileWriter(
137138 new KeyValueSerializer (keyType , valueType )::toRow ,
138139 keyType ,
139140 valueType ,
140- type -> formatContext .statsProducer (level , options , type ),
141+ formatContext .statsProducer (level , options ),
141142 schemaId ,
142143 level ,
143144 formatContext .compression (level ),
@@ -233,12 +234,19 @@ private static class WriteFormatContext {
233234 private final IntFunction <String > level2Compress ;
234235 private final IntFunction <String > level2Stats ;
235236
236- private final Map <String , IntFunction <SimpleStatsExtractor >> format2Extractor ;
237+ private final Map <Pair <String , String >, SimpleStatsExtractor > formatStats2Extractor ;
238+ private final Map <String , SimpleColStatsCollector .Factory []> statsMode2AvroStats ;
237239 private final Map <String , DataFilePathFactory > format2PathFactory ;
240+ private final Map <String , FileFormat > formatFactory ;
238241 private final Map <String , FormatWriterFactory > format2WriterFactory ;
239242
243+ private final BinaryRow partition ;
244+ private final int bucket ;
240245 private final RowType keyType ;
241246 private final RowType valueType ;
247+ private final RowType writeRowType ;
248+ private final Map <String , FileStorePathFactory > parentFactories ;
249+ private final CoreOptions options ;
242250 private final boolean thinModeEnabled ;
243251
244252 private WriteFormatContext (
@@ -249,11 +257,15 @@ private WriteFormatContext(
249257 FileFormat defaultFormat ,
250258 Map <String , FileStorePathFactory > parentFactories ,
251259 CoreOptions options ) {
260+ this .partition = partition ;
261+ this .bucket = bucket ;
252262 this .keyType = keyType ;
253263 this .valueType = valueType ;
264+ this .parentFactories = parentFactories ;
265+ this .options = options ;
254266 this .thinModeEnabled =
255267 options .dataFileThinMode () && supportsThinMode (keyType , valueType );
256- RowType writeRowType =
268+ this . writeRowType =
257269 KeyValue .schema (thinModeEnabled ? RowType .of () : keyType , valueType );
258270 Map <Integer , String > fileFormatPerLevel = options .fileFormatPerLevel ();
259271 this .level2Format =
@@ -270,33 +282,11 @@ private WriteFormatContext(
270282 Map <Integer , String > statsModePerLevel = options .statsModePerLevel ();
271283 this .level2Stats = level -> statsModePerLevel .getOrDefault (level , statsMode );
272284
273- this .format2Extractor = new HashMap <>();
285+ this .formatStats2Extractor = new HashMap <>();
286+ this .statsMode2AvroStats = new HashMap <>();
274287 this .format2PathFactory = new HashMap <>();
275288 this .format2WriterFactory = new HashMap <>();
276- for (String format : parentFactories .keySet ()) {
277- format2PathFactory .put (
278- format ,
279- parentFactories .get (format ).createDataFilePathFactory (partition , bucket ));
280-
281- FileFormat fileFormat =
282- FileFormat .fromIdentifier (format , options .toConfiguration ());
283- IntFunction <SimpleStatsExtractor > extractor =
284- level -> {
285- SimpleColStatsCollector .Factory [] statsFactories =
286- StatsCollectorFactories .createStatsFactories (
287- level2Stats .apply (level ),
288- options ,
289- writeRowType .getFieldNames (),
290- thinModeEnabled
291- ? keyType .getFieldNames ()
292- : Collections .emptyList ());
293- return fileFormat
294- .createStatsExtractor (writeRowType , statsFactories )
295- .orElse (null );
296- };
297- format2Extractor .put (format , extractor );
298- format2WriterFactory .put (format , fileFormat .createWriterFactory (writeRowType ));
299- }
289+ this .formatFactory = new HashMap <>();
300290 }
301291
302292 private boolean supportsThinMode (RowType keyType , RowType valueType ) {
@@ -314,34 +304,64 @@ private boolean supportsThinMode(RowType keyType, RowType valueType) {
314304 return true ;
315305 }
316306
317- private SimpleStatsProducer statsProducer (
318- int level , CoreOptions options , RowType writeRowType ) {
307+ private SimpleStatsProducer statsProducer (int level , CoreOptions options ) {
319308 String format = level2Format .apply (level );
320309 String statsMode = level2Stats .apply (level );
321310 if (format .equals ("avro" )) {
322311 // In avro format, minValue, maxValue, and nullCount are not counted, so use
323312 // SimpleStatsExtractor to collect stats
324- SimpleColStatsCollector .Factory [] collectors =
325- StatsCollectorFactories .createStatsFactoriesForAvro (
326- statsMode , options , writeRowType .getFieldNames ());
327- SimpleStatsCollector collector = new SimpleStatsCollector (writeRowType , collectors );
313+ SimpleColStatsCollector .Factory [] factories =
314+ statsMode2AvroStats .computeIfAbsent (
315+ statsMode ,
316+ key ->
317+ StatsCollectorFactories .createStatsFactoriesForAvro (
318+ statsMode , options , writeRowType .getFieldNames ()));
319+ SimpleStatsCollector collector = new SimpleStatsCollector (writeRowType , factories );
328320 return SimpleStatsProducer .fromCollector (collector );
329321 }
322+
330323 SimpleStatsExtractor extractor =
331- format2Extractor .get (level2Format .apply (level )).apply (level );
324+ formatStats2Extractor .computeIfAbsent (
325+ Pair .of (format , statsMode ),
326+ key -> {
327+ SimpleColStatsCollector .Factory [] statsFactories =
328+ StatsCollectorFactories .createStatsFactories (
329+ statsMode ,
330+ options ,
331+ writeRowType .getFieldNames (),
332+ thinModeEnabled
333+ ? keyType .getFieldNames ()
334+ : Collections .emptyList ());
335+ return fileFormat (format )
336+ .createStatsExtractor (writeRowType , statsFactories )
337+ .orElse (null );
338+ });
332339 return SimpleStatsProducer .fromExtractor (extractor );
333340 }
334341
335342 private DataFilePathFactory pathFactory (int level ) {
336- return format2PathFactory .get (level2Format .apply (level ));
343+ String format = level2Format .apply (level );
344+ return format2PathFactory .computeIfAbsent (
345+ format ,
346+ key ->
347+ parentFactories
348+ .get (format )
349+ .createDataFilePathFactory (partition , bucket ));
337350 }
338351
339352 private FormatWriterFactory writerFactory (int level ) {
340- return format2WriterFactory .get (level2Format .apply (level ));
353+ return format2WriterFactory .computeIfAbsent (
354+ level2Format .apply (level ),
355+ format -> fileFormat (format ).createWriterFactory (writeRowType ));
341356 }
342357
343358 private String compression (int level ) {
344359 return level2Compress .apply (level );
345360 }
361+
362+ private FileFormat fileFormat (String format ) {
363+ return formatFactory .computeIfAbsent (
364+ format , k -> FileFormat .fromIdentifier (format , options .toConfiguration ()));
365+ }
346366 }
347367}
0 commit comments