 
 import org.apache.fluss.config.ConfigOptions;
 import org.apache.fluss.config.Configuration;
+import org.apache.fluss.exception.InvalidOffsetException;
 import org.apache.fluss.exception.LogSegmentOffsetOverflowException;
 import org.apache.fluss.exception.LogStorageException;
 import org.apache.fluss.metadata.LogFormat;
 
 import java.io.File;
 import java.io.IOException;
 import java.nio.file.Files;
+import java.nio.file.NoSuchFileException;
+import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Comparator;
+import java.util.Iterator;
+import java.util.List;
+import java.util.stream.Collectors;
 
 /* This file is based on source code of Apache Kafka Project (https://kafka.apache.org/), licensed by the Apache
  * Software Foundation (ASF) under the Apache License, Version 2.0. See the NOTICE file distributed with this work for
@@ -117,6 +123,37 @@ public LoadedLogOffsets load() throws IOException {
                 nextOffset, activeSegment.getBaseOffset(), activeSegment.getSizeInBytes()));
     }
 
+    /**
+     * Recovers the given segment, without adding it to the provided segments.
+     *
+     * @param segment segment to recover
+     * @return the number of bytes truncated from the segment
+     * @throws LogSegmentOffsetOverflowException if the segment contains messages that cause index
+     *     offset overflow
+     */
+    private int recoverSegment(LogSegment segment) throws IOException {
+        WriterStateManager writerStateManager =
+                new WriterStateManager(
+                        logSegments.getTableBucket(),
+                        logTabletDir,
+                        this.writerStateManager.writerExpirationMs());
+        // TODO: we pass 0 as the logStartOffset into rebuildWriterState because the current
+        // implementation of logStartOffset in Fluss is not yet fully refined: there are cases
+        // where logStartOffset is not updated, so it is not yet reliable. Once updating
+        // logStartOffset correctly is resolved in https://github.com/apache/fluss/issues/744,
+        // we can use logStartOffset here. Using 0 instead of logStartOffset does not affect
+        // correctness; both restore the complete WriterState. The only difference is that
+        // logStartOffset can potentially skip over more segments.
+        LogTablet.rebuildWriterState(
+                writerStateManager, logSegments, 0, segment.getBaseOffset(), false);
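+        // recover() rebuilds the segment's index files from its log records and truncates any
+        // bytes after the last valid batch, returning the number of bytes removed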
+        int bytesTruncated = segment.recover();
+        // once we have recovered the segment's data, take a snapshot to ensure that we won't
+        // need to reload the same segment again while recovering another segment.
+        writerStateManager.takeSnapshot();
+        return bytesTruncated;
+    }
+
     /**
      * Recover the log segments (if there was an unclean shutdown). Ensures there is at least one
      * active segment, and returns the updated recovery point and next offset after recovery.
@@ -129,14 +166,106 @@ public LoadedLogOffsets load() throws IOException {
      * overflow
      */
     private Tuple2<Long, Long> recoverLog() throws IOException {
-        // TODO truncate log to recover maybe unflush segments.
+        if (!isCleanShutdown) {
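+            // only segments at or beyond the recovery point checkpoint can hold unflushed data;
+            // everything before the checkpoint was flushed to disk before the shutdown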
+            List<LogSegment> unflushed =
+                    logSegments.values(recoveryPointCheckpoint, Long.MAX_VALUE);
+            int numUnflushed = unflushed.size();
+            Iterator<LogSegment> unflushedIter = unflushed.iterator();
+            boolean truncated = false;
+            int numFlushed = 1;
+
+            while (unflushedIter.hasNext() && !truncated) {
+                LogSegment segment = unflushedIter.next();
+                LOG.info(
+                        "Recovering unflushed segment {}. {}/{} recovered for bucket {}",
+                        segment.getBaseOffset(),
+                        numFlushed,
+                        numUnflushed,
+                        logSegments.getTableBucket());
+
+                int truncatedBytes = -1;
+                try {
+                    truncatedBytes = recoverSegment(segment);
+                } catch (InvalidOffsetException e) {
+                    long startOffset = segment.getBaseOffset();
+                    LOG.warn(
+                            "Found invalid offset during recovery for bucket {}. Deleting the corrupt segment "
+                                    + "and creating an empty one with starting offset {}",
+                            logSegments.getTableBucket(),
+                            startOffset);
+                    truncatedBytes = segment.truncateTo(startOffset);
+                }
+
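+                // a positive return value means corrupt trailing bytes were chopped off this
+                // segment, so nothing at higher offsets can be trusted either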
+                if (truncatedBytes > 0) {
+                    // we had an invalid message, delete all remaining log segments
+                    LOG.warn(
+                            "Corruption found in segment {} for bucket {}, truncating to offset {}",
+                            segment.getBaseOffset(),
+                            logSegments.getTableBucket(),
+                            segment.readNextOffset());
+                    removeAndDeleteSegments(unflushedIter);
+                    truncated = true;
+                } else {
+                    numFlushed += 1;
+                }
+            }
+        }
+
         if (logSegments.isEmpty()) {
+            // TODO: use logStartOffset if issue https://github.com/apache/fluss/issues/744 ready
             logSegments.add(LogSegment.open(logTabletDir, 0L, conf, logFormat));
         }
         long logEndOffset = logSegments.lastSegment().get().readNextOffset();
-        return Tuple2.of(recoveryPointCheckpoint, logEndOffset);
+        // after truncation the recovery point must not point past the log end offset
+        return Tuple2.of(Math.min(recoveryPointCheckpoint, logEndOffset), logEndOffset);
     }
 
+    /**
+     * Deletes the given log segments and the associated writer snapshots.
+     *
+     * <p>This method does not need to convert IOException to {@link LogStorageException} because
+     * it is either called before all logs are loaded or the immediate caller will catch and
+     * handle IOException.
+     *
+     * @param segmentsToDelete the log segments to schedule for deletion
+     */
+    private void removeAndDeleteSegments(Iterator<LogSegment> segmentsToDelete) {
+        if (segmentsToDelete.hasNext()) {
+            List<LogSegment> toDelete = new ArrayList<>();
+            segmentsToDelete.forEachRemaining(toDelete::add);
+
+            LOG.info(
+                    "Deleting segments for bucket {} as part of log recovery: {}",
+                    logSegments.getTableBucket(),
+                    toDelete.stream().map(LogSegment::toString).collect(Collectors.joining(",")));
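+            // drop the segments from the in-memory view first so readers cannot observe them
+            // while the underlying files are being deleted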
+            toDelete.forEach(segment -> logSegments.remove(segment.getBaseOffset()));
+
+            try {
+                LocalLog.deleteSegmentFiles(
+                        toDelete, LocalLog.SegmentDeletionReason.LOG_TRUNCATION);
+            } catch (IOException e) {
+                LOG.error(
+                        "Failed to delete truncated segments {} for bucket {}",
+                        toDelete,
+                        logSegments.getTableBucket(),
+                        e);
+            }
+
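+            // also remove the writer snapshots taken at these segments' offsets so that stale
+            // writer state is not picked up on a later restart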
+            try {
+                LogTablet.deleteWriterSnapshots(toDelete, writerStateManager);
+            } catch (IOException e) {
+                LOG.error(
+                        "Failed to delete truncated writer snapshots {} for bucket {}",
+                        toDelete,
+                        logSegments.getTableBucket(),
+                        e);
+            }
+        }
+    }
+
     /** Loads segments from disk into the provided segments. */
     private void loadSegmentFiles() throws IOException {
         File[] sortedFiles = logTabletDir.listFiles();
@@ -156,8 +285,28 @@ private void loadSegmentFiles() throws IOException {
                 }
             } else if (LocalLog.isLogFile(file)) {
                 long baseOffset = FlussPaths.offsetFromFile(file);
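+                // remember whether the time index file is about to be created lazily by open();
+                // a newly created (empty) time index must not be flagged as corruption below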
+                boolean timeIndexFileNewlyCreated =
+                        !FlussPaths.timeIndexFile(logTabletDir, baseOffset).exists();
                 LogSegment segment =
                         LogSegment.open(logTabletDir, baseOffset, conf, true, 0, logFormat);
+
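+                // sanityCheck() verifies the segment's index files; a missing offset index
+                // surfaces as a NoSuchFileException and triggers an index rebuild below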
+                try {
+                    segment.sanityCheck(timeIndexFileNewlyCreated);
+                } catch (NoSuchFileException e) {
+                    if (isCleanShutdown || segment.getBaseOffset() < recoveryPointCheckpoint) {
+                        LOG.error(
+                                "Could not find offset index file corresponding to log file {} "
+                                        + "for bucket {}, recovering segment and rebuilding index files...",
+                                segment.getFileLogRecords().file().getAbsoluteFile(),
+                                logSegments.getTableBucket());
+                    }
+                    recoverSegment(segment);
+                }
                 logSegments.add(segment);
             }
         }