Commit 0f2c95a

[server] Optimize log recovery time
1 parent a07e563 commit 0f2c95a

5 files changed: +489 -2 lines changed

fluss-server/src/main/java/org/apache/fluss/server/log/LogLoader.java

Lines changed: 136 additions & 0 deletions
@@ -19,9 +19,11 @@
 
 import org.apache.fluss.config.ConfigOptions;
 import org.apache.fluss.config.Configuration;
+import org.apache.fluss.exception.InvalidOffsetException;
 import org.apache.fluss.exception.LogSegmentOffsetOverflowException;
 import org.apache.fluss.exception.LogStorageException;
 import org.apache.fluss.metadata.LogFormat;
+import org.apache.fluss.server.exception.CorruptIndexException;
 import org.apache.fluss.utils.FlussPaths;
 import org.apache.fluss.utils.types.Tuple2;
 
@@ -31,8 +33,13 @@
 import java.io.File;
 import java.io.IOException;
 import java.nio.file.Files;
+import java.nio.file.NoSuchFileException;
+import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Comparator;
+import java.util.Iterator;
+import java.util.List;
+import java.util.stream.Collectors;
 
 /* This file is based on source code of Apache Kafka Project (https://kafka.apache.org/), licensed by the Apache
  * Software Foundation (ASF) under the Apache License, Version 2.0. See the NOTICE file distributed with this work for
@@ -129,6 +136,61 @@ public LoadedLogOffsets load() throws IOException {
      * overflow
      */
     private Tuple2<Long, Long> recoverLog() throws IOException {
+        if (!isCleanShutdown) {
+            List<LogSegment> unflushed =
+                    logSegments.values(recoveryPointCheckpoint, Long.MAX_VALUE);
+            int numUnflushed = unflushed.size();
+            Iterator<LogSegment> unflushedIter = unflushed.iterator();
+            boolean truncated = false;
+            int numFlushed = 1;
+
+            while (unflushedIter.hasNext() && !truncated) {
+                LogSegment segment = unflushedIter.next();
+                LOG.info(
+                        "Recovering unflushed segment {}. {}/{} recovered for bucket {}",
+                        segment.getBaseOffset(),
+                        numFlushed,
+                        numUnflushed,
+                        logSegments.getTableBucket());
+
+                try {
+                    segment.sanityCheck();
+                } catch (NoSuchFileException | CorruptIndexException e) {
+                    LOG.warn(
+                            "Found invalid index file corresponding to log file {} for bucket {}, "
+                                    + "recovering segment and rebuilding index files...",
+                            segment.getFileLogRecords().file().getAbsoluteFile(),
+                            logSegments.getTableBucket(),
+                            e);
+
+                    int truncatedBytes = -1;
+                    try {
+                        truncatedBytes = recoverSegment(segment);
+                    } catch (InvalidOffsetException invalidOffsetException) {
+                        long startOffset = segment.getBaseOffset();
+                        LOG.warn(
+                                "Found invalid offset during recovery for bucket {}. Deleting the corrupt segment "
+                                        + "and creating an empty one with starting offset {}",
+                                logSegments.getTableBucket(),
+                                startOffset);
+                        truncatedBytes = segment.truncateTo(startOffset);
+                    }
+
+                    if (truncatedBytes > 0) {
+                        // we had an invalid message, delete all remaining log
+                        LOG.warn(
+                                "Corruption found in segment {} for bucket {}, truncating to offset {}",
+                                segment.getBaseOffset(),
+                                logSegments.getTableBucket(),
+                                segment.readNextOffset());
+                        removeAndDeleteSegments(unflushedIter);
+                        truncated = true;
+                    }
+                }
+                numFlushed += 1;
+            }
+        }
+
         // TODO truncate log to recover maybe unflush segments.
         if (logSegments.isEmpty()) {
             logSegments.add(LogSegment.open(logTabletDir, 0L, conf, logFormat));
@@ -137,6 +199,80 @@ private Tuple2<Long, Long> recoverLog() throws IOException {
         return Tuple2.of(recoveryPointCheckpoint, logEndOffset);
     }
 
+    /**
+     * This method deletes the given log segments and the associated writer snapshots.
+     *
+     * <p>This method does not need to convert IOException to {@link LogStorageException} because
+     * it is either called before all logs are loaded or the immediate caller will catch and
+     * handle IOException.
+     *
+     * @param segmentsToDelete The log segments to schedule for deletion
+     */
+    private void removeAndDeleteSegments(Iterator<LogSegment> segmentsToDelete) {
+        if (segmentsToDelete.hasNext()) {
+            List<LogSegment> toDelete = new ArrayList<>();
+            segmentsToDelete.forEachRemaining(toDelete::add);
+
+            LOG.info(
+                    "Deleting segments for bucket {} as part of log recovery: {}",
+                    logSegments.getTableBucket(),
+                    toDelete.stream().map(LogSegment::toString).collect(Collectors.joining(",")));
+            toDelete.forEach(segment -> logSegments.remove(segment.getBaseOffset()));
+
+            try {
+                LocalLog.deleteSegmentFiles(
+                        toDelete, LocalLog.SegmentDeletionReason.LOG_TRUNCATION);
+            } catch (IOException e) {
+                LOG.error(
+                        "Failed to delete truncated segments {} for bucket {}",
+                        toDelete,
+                        logSegments.getTableBucket(),
+                        e);
+            }
+
+            try {
+                LogTablet.deleteWriterSnapshots(toDelete, writerStateManager);
+            } catch (IOException e) {
+                LOG.error(
+                        "Failed to delete truncated writer snapshots {} for bucket {}",
+                        toDelete,
+                        logSegments.getTableBucket(),
+                        e);
+            }
+        }
+    }
+
+    /**
+     * Just recovers the given segment, without adding it to the provided segments.
+     *
+     * @param segment Segment to recover
+     * @return The number of bytes truncated from the segment
+     * @throws LogSegmentOffsetOverflowException if the segment contains messages that cause index
+     *     offset overflow
+     */
+    private int recoverSegment(LogSegment segment) throws IOException {
+        WriterStateManager writerStateManager =
+                new WriterStateManager(
+                        logSegments.getTableBucket(),
+                        logTabletDir,
+                        this.writerStateManager.writerExpirationMs());
+        // TODO: Here, we use 0 as the logStartOffset passed into rebuildWriterState. The reason is
+        // that the current implementation of logStartOffset in Fluss is not yet fully refined, and
+        // there may be cases where logStartOffset is not updated. As a result, logStartOffset is
+        // not yet reliable. Once the issue with correctly updating logStartOffset is resolved in
+        // issue https://github.com/apache/fluss/issues/744, we can use logStartOffset here.
+        // Additionally, using 0 versus using logStartOffset does not affect correctness; they both
+        // can restore the complete WriterState. The only difference is that using logStartOffset
+        // can potentially skip over more segments.
+        LogTablet.rebuildWriterState(
+                writerStateManager, logSegments, 0, segment.getBaseOffset(), false);
+        int bytesTruncated = segment.recover();
+        // once we have recovered the segment's data, take a snapshot to ensure that we won't
+        // need to reload the same segment again while recovering another segment.
+        writerStateManager.takeSnapshot();
+        return bytesTruncated;
+    }
+
     /** Loads segments from disk into the provided segments. */
     private void loadSegmentFiles() throws IOException {
         File[] sortedFiles = logTabletDir.listFiles();
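Taken together, the new branch in recoverLog implements a check-then-repair loop: after an unclean shutdown, each unflushed segment first gets a cheap sanityCheck, the expensive recoverSegment path runs only when an index file is missing or corrupt, and the first segment that actually loses bytes truncates everything after it. The following minimal, self-contained sketch shows that control flow; the Segment class and its recover() behavior here are hypothetical stand-ins, not the Fluss APIs.

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

public class RecoveryLoopSketch {

    // Hypothetical stand-in for a log segment; not the Fluss LogSegment API.
    static class Segment {
        final long baseOffset;
        final boolean indexCorrupt;   // would sanityCheck() fail?
        final int invalidTailBytes;   // bytes recover() would lop off

        Segment(long baseOffset, boolean indexCorrupt, int invalidTailBytes) {
            this.baseOffset = baseOffset;
            this.indexCorrupt = indexCorrupt;
            this.invalidTailBytes = invalidTailBytes;
        }

        void sanityCheck() throws Exception {
            if (indexCorrupt) {
                throw new Exception("corrupt index for segment " + baseOffset);
            }
        }

        // Rebuilds the indexes and returns how many trailing bytes were invalid.
        int recover() {
            return invalidTailBytes;
        }
    }

    public static void main(String[] args) {
        List<Segment> unflushed = new ArrayList<>();
        unflushed.add(new Segment(0, false, 0));   // clean: sanity check passes, no recovery
        unflushed.add(new Segment(100, true, 42)); // corrupt tail: triggers truncation
        unflushed.add(new Segment(200, false, 0)); // after the corruption: must be deleted

        Iterator<Segment> iter = unflushed.iterator();
        boolean truncated = false;
        while (iter.hasNext() && !truncated) {
            Segment segment = iter.next();
            try {
                segment.sanityCheck(); // cheap fast path for clean segments
            } catch (Exception e) {
                int truncatedBytes = segment.recover(); // expensive: rescan the log
                if (truncatedBytes > 0) {
                    // Offsets past this point can no longer be trusted, so every
                    // remaining segment is dropped through the same iterator.
                    List<Segment> toDelete = new ArrayList<>();
                    iter.forEachRemaining(toDelete::add);
                    toDelete.forEach(s -> System.out.println("deleting segment " + s.baseOffset));
                    truncated = true;
                }
            }
        }
    }
}

If this reading is right, the time saving in the commit title comes from the fast path: segments whose index files pass sanityCheck() skip the full recover() rescan entirely, so only genuinely damaged segments pay the rebuild cost.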

fluss-server/src/main/java/org/apache/fluss/server/log/LogSegment.java

Lines changed: 20 additions & 1 deletion
@@ -44,6 +44,7 @@
 
 import java.io.File;
 import java.io.IOException;
+import java.nio.file.NoSuchFileException;
 import java.util.Optional;
 
 import static org.apache.fluss.record.LogRecordBatchFormat.V0_RECORD_BATCH_HEADER_SIZE;
@@ -172,6 +173,24 @@ public void resizeIndexes(int size) throws IOException {
         timeIndex().resize(size);
     }
 
+    public void sanityCheck() throws IOException {
+        if (!lazyOffsetIndex.file().exists()) {
+            throw new NoSuchFileException(
+                    "Offset index file "
+                            + lazyOffsetIndex.file().getAbsolutePath()
+                            + " does not exist.");
+        }
+        lazyOffsetIndex.get().sanityCheck();
+
+        if (!lazyTimeIndex.file().exists()) {
+            throw new NoSuchFileException(
+                    "Time index file "
+                            + lazyTimeIndex.file().getAbsolutePath()
+                            + " does not exist.");
+        }
+        lazyTimeIndex.get().sanityCheck();
+    }
+
     /**
      * The maximum timestamp we see so far.
      *
@@ -284,7 +303,7 @@ public boolean deleted() {
      * Run recovery on the given segment. This will rebuild the index from the log file and lop off
      * any invalid bytes from the end of the log and index.
      */
-    public int recover() throws Exception {
+    public int recover() throws IOException {
         offsetIndex().reset();
         timeIndex().reset();
         int validBytes = 0;
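Note how sanityCheck distinguishes two failure modes: a missing index file (NoSuchFileException) and a present-but-inconsistent one (CorruptIndexException, raised by each index's own sanityCheck). The caller in recoverLog treats them identically, since either way the on-disk indexes cannot be trusted and must be rebuilt from the log. A minimal sketch of that dispatch, with a hypothetical checkAndMaybeRebuild helper and stand-in types rather than the real Fluss classes:

import java.io.IOException;
import java.nio.file.NoSuchFileException;

public class IndexCheckSketch {

    // Hypothetical stand-in for Fluss's CorruptIndexException.
    static class CorruptIndexException extends IOException {
        CorruptIndexException(String message) {
            super(message);
        }
    }

    // Hypothetical minimal segment surface; not the Fluss LogSegment API.
    interface Segment {
        void sanityCheck() throws IOException; // NoSuchFileException or CorruptIndexException
        int recover() throws IOException;      // rebuilds indexes, returns truncated bytes
    }

    // Returns bytes truncated, or 0 when the existing indexes were already valid.
    static int checkAndMaybeRebuild(Segment segment) throws IOException {
        try {
            segment.sanityCheck();
            return 0; // fast path: indexes are fine, no rescan needed
        } catch (NoSuchFileException | CorruptIndexException e) {
            // Missing and corrupt indexes are equivalent here: rebuild from the log.
            return segment.recover();
        }
    }
}

Narrowing recover() from throws Exception to throws IOException (last hunk above) is what makes this kind of focused multi-catch practical for callers; a blanket catch (Exception e) would also swallow programming errors.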

fluss-server/src/main/java/org/apache/fluss/server/log/LogTablet.java

Lines changed: 1 addition & 1 deletion
@@ -1283,7 +1283,7 @@ private static void loadWritersFromRecords(
         loadedWriters.values().forEach(writerStateManager::update);
     }
 
-    private static void deleteWriterSnapshots(
+    public static void deleteWriterSnapshots(
            List<LogSegment> segments, WriterStateManager writerStateManager) throws IOException {
         for (LogSegment segment : segments) {
             writerStateManager.removeAndDeleteSnapshot(segment.getBaseOffset());

fluss-server/src/main/java/org/apache/fluss/server/log/WriterStateManager.java

Lines changed: 4 additions & 0 deletions
@@ -99,6 +99,10 @@ public WriterStateManager(TableBucket tableBucket, File logTabletDir, int writer
         this.snapshots = loadSnapshots();
     }
 
+    public int writerExpirationMs() {
+        return writerExpirationMs;
+    }
+
     public int writerIdCount() {
         return writerIdCount;
     }
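The new writerExpirationMs() accessor exists so that LogLoader.recoverSegment (above) can build a scratch WriterStateManager with the same expiration setting. Per the comment in recoverSegment, the snapshot taken after each recovered segment ensures later rebuildWriterState calls never reload an already-recovered segment; the toy model below illustrates that effect with hypothetical names and offsets, not the real rebuild logic.

import java.util.List;
import java.util.stream.Collectors;

public class SnapshotRebuildSketch {

    // Base offsets of segments that would need rescanning to rebuild writer
    // state up to (but not including) the target segment.
    static List<Long> segmentsToScan(List<Long> baseOffsets, long snapshotOffset, long target) {
        return baseOffsets.stream()
                .filter(b -> b > snapshotOffset && b < target)
                .collect(Collectors.toList());
    }

    public static void main(String[] args) {
        List<Long> bases = List.of(0L, 100L, 200L, 300L);

        // Without intermediate snapshots, recovering the last segment would
        // rescan every earlier one: prints [0, 100, 200]
        System.out.println(segmentsToScan(bases, -1L, 300L));

        // Taking a snapshot after each recovered segment means the next
        // rebuild starts from the latest snapshot and rescans nothing.
        long snapshot = -1L;
        for (long base : bases) {
            System.out.println("segment " + base + " rescans " + segmentsToScan(bases, snapshot, base));
            snapshot = base; // models writerStateManager.takeSnapshot()
        }
    }
}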
