4949import org .apache .fluss .server .kv .rocksdb .RocksDBKv ;
5050import org .apache .fluss .server .kv .rocksdb .RocksDBKvBuilder ;
5151import org .apache .fluss .server .kv .rocksdb .RocksDBResourceContainer ;
52+ import org .apache .fluss .server .kv .rowmerger .DefaultRowMerger ;
5253import org .apache .fluss .server .kv .rowmerger .RowMerger ;
5354import org .apache .fluss .server .kv .snapshot .KvFileHandleAndLocalPath ;
5455import org .apache .fluss .server .kv .snapshot .KvSnapshotDataUploader ;
@@ -113,6 +114,9 @@ public final class KvTablet {
113114
114115 private final SchemaGetter schemaGetter ;
115116
117+ // whether to skip UPDATE_BEFORE (-U) changelog records, so updates only emit UPDATE_AFTER (+U)
118+ private final boolean ignoreUpdateBefore ;
119+
116120 /**
117121 * The kv data in pre-write buffer whose log offset is less than the flushedLogOffset has been
118122 * flushed into kv.
@@ -136,7 +140,8 @@ private KvTablet(
136140 KvFormat kvFormat ,
137141 RowMerger rowMerger ,
138142 ArrowCompressionInfo arrowCompressionInfo ,
139- SchemaGetter schemaGetter ) {
143+ SchemaGetter schemaGetter ,
144+ boolean ignoreUpdateBefore ) {
140145 this .physicalPath = physicalPath ;
141146 this .tableBucket = tableBucket ;
142147 this .logTablet = logTablet ;
@@ -151,6 +156,7 @@ private KvTablet(
151156 this .rowMerger = rowMerger ;
152157 this .arrowCompressionInfo = arrowCompressionInfo ;
153158 this .schemaGetter = schemaGetter ;
159+ this .ignoreUpdateBefore = ignoreUpdateBefore ;
154160 }
155161
156162 public static KvTablet create (
@@ -163,7 +169,8 @@ public static KvTablet create(
163169 KvFormat kvFormat ,
164170 RowMerger rowMerger ,
165171 ArrowCompressionInfo arrowCompressionInfo ,
166- SchemaGetter schemaGetter )
172+ SchemaGetter schemaGetter ,
173+ boolean ignoreUpdateBefore )
167174 throws IOException {
168175 Tuple2 <PhysicalTablePath , TableBucket > tablePathAndBucket =
169176 FlussPaths .parseTabletDir (kvTabletDir );
@@ -179,7 +186,8 @@ public static KvTablet create(
179186 kvFormat ,
180187 rowMerger ,
181188 arrowCompressionInfo ,
182- schemaGetter );
189+ schemaGetter ,
190+ ignoreUpdateBefore );
183191 }
184192
185193 public static KvTablet create (
@@ -194,7 +202,8 @@ public static KvTablet create(
194202 KvFormat kvFormat ,
195203 RowMerger rowMerger ,
196204 ArrowCompressionInfo arrowCompressionInfo ,
197- SchemaGetter schemaGetter )
205+ SchemaGetter schemaGetter ,
206+ boolean ignoreUpdateBefore )
198207 throws IOException {
199208 RocksDBKv kv = buildRocksDBKv (serverConf , kvTabletDir );
200209 return new KvTablet (
@@ -211,7 +220,8 @@ public static KvTablet create(
211220 kvFormat ,
212221 rowMerger ,
213222 arrowCompressionInfo ,
214- schemaGetter );
223+ schemaGetter ,
224+ ignoreUpdateBefore );
215225 }
216226
217227 private static RocksDBKv buildRocksDBKv (Configuration configuration , File kvDir )
@@ -345,52 +355,89 @@ public LogAppendInfo putAsLeader(KvRecordBatch kvRecords, @Nullable int[] target
345355 latestSchemaRow .replaceRow (oldValue .row ));
346356 kvPreWriteBuffer .delete (key , logOffset ++);
347357 } else {
348- // otherwise, it's a partial update, should produce -U,+U
349- walBuilder .append (
350- ChangeType .UPDATE_BEFORE ,
351- latestSchemaRow .replaceRow (oldValue .row ));
352- walBuilder .append (
353- ChangeType .UPDATE_AFTER ,
354- latestSchemaRow .replaceRow (newValue .row ));
355- kvPreWriteBuffer .put (
356- key , newValue .encodeValue (), logOffset + 1 );
357- logOffset += 2 ;
358+ // otherwise, it's a partial update
359+ if (ignoreUpdateBefore ) {
360+ // only produce +U
361+ walBuilder .append (
362+ ChangeType .UPDATE_AFTER ,
363+ latestSchemaRow .replaceRow (newValue .row ));
364+ kvPreWriteBuffer .put (
365+ key , newValue .encodeValue (), logOffset );
366+ logOffset ++;
367+ } else {
368+ // produce -U, +U
369+ walBuilder .append (
370+ ChangeType .UPDATE_BEFORE ,
371+ latestSchemaRow .replaceRow (oldValue .row ));
372+ walBuilder .append (
373+ ChangeType .UPDATE_AFTER ,
374+ latestSchemaRow .replaceRow (newValue .row ));
375+ kvPreWriteBuffer .put (
376+ key , newValue .encodeValue (), logOffset + 1 );
377+ logOffset += 2 ;
378+ }
358379 }
359380 }
360381 } else {
361382 // upsert operation
362- byte [] oldValueBytes = getFromBufferOrKv (key );
363- // it's update
364- if (oldValueBytes != null ) {
365- BinaryValue oldValue = valueDecoder .decodeValue (oldValueBytes );
366- BinaryValue newValue =
367- currentMerger .merge (oldValue , currentValue );
368- if (newValue == oldValue ) {
369- // newValue is the same to oldValue, means nothing
370- // happens (no update/delete), and input should be ignored
371- continue ;
372- }
373-
374- walBuilder .append (
375- ChangeType .UPDATE_BEFORE ,
376- latestSchemaRow .replaceRow (oldValue .row ));
383+ // Optimization: when ignoring UPDATE_BEFORE and merger is
384+ // DefaultRowMerger (full update, not partial update), we can skip
385+ // fetching old value for better performance since it always returns
386+ // new value.
387+ if (ignoreUpdateBefore
388+ && currentMerger instanceof DefaultRowMerger ) {
389+ // Fast path: directly produce +U without fetching old value
377390 walBuilder .append (
378391 ChangeType .UPDATE_AFTER ,
379- latestSchemaRow .replaceRow (newValue .row ));
380- // logOffset is for -U, logOffset + 1 is for +U, we need to use
381- // the log offset for +U
382- kvPreWriteBuffer .put (
383- key , newValue .encodeValue (), logOffset + 1 );
384- logOffset += 2 ;
385- } else {
386- // it's insert
387- // TODO: we should add guarantees that all non-specified columns
388- // of the input row are set to null.
389- walBuilder .append (
390- ChangeType .INSERT ,
391392 latestSchemaRow .replaceRow (currentValue .row ));
392393 kvPreWriteBuffer .put (
393394 key , currentValue .encodeValue (), logOffset ++);
395+ } else {
396+ byte [] oldValueBytes = getFromBufferOrKv (key );
397+ // it's update
398+ if (oldValueBytes != null ) {
399+ BinaryValue oldValue =
400+ valueDecoder .decodeValue (oldValueBytes );
401+ BinaryValue newValue =
402+ currentMerger .merge (oldValue , currentValue );
403+ if (newValue == oldValue ) {
404+ // newValue is the same to oldValue, means nothing
405+ // happens (no update/delete), and input should be
406+ // ignored
407+ continue ;
408+ }
409+
410+ if (ignoreUpdateBefore ) {
411+ // only produce +U when ignoring UPDATE_BEFORE
412+ walBuilder .append (
413+ ChangeType .UPDATE_AFTER ,
414+ latestSchemaRow .replaceRow (newValue .row ));
415+ kvPreWriteBuffer .put (
416+ key , newValue .encodeValue (), logOffset );
417+ logOffset ++;
418+ } else {
419+ walBuilder .append (
420+ ChangeType .UPDATE_BEFORE ,
421+ latestSchemaRow .replaceRow (oldValue .row ));
422+ walBuilder .append (
423+ ChangeType .UPDATE_AFTER ,
424+ latestSchemaRow .replaceRow (newValue .row ));
425+ // logOffset is for -U, logOffset + 1 is for +U, we need
426+ // to use the log offset for +U
427+ kvPreWriteBuffer .put (
428+ key , newValue .encodeValue (), logOffset + 1 );
429+ logOffset += 2 ;
430+ }
431+ } else {
432+ // it's insert
433+ // TODO: we should add guarantees that all non-specified
434+ // columns of the input row are set to null.
435+ walBuilder .append (
436+ ChangeType .INSERT ,
437+ latestSchemaRow .replaceRow (currentValue .row ));
438+ kvPreWriteBuffer .put (
439+ key , currentValue .encodeValue (), logOffset ++);
440+ }
394441 }
395442 }
396443 }
0 commit comments