Skip to content

Commit af1f98c

Browse files
authored
HDDS-10822. Tool to omit raft log in OM. (apache#8154)
1 parent 3201ca4 commit af1f98c

File tree

2 files changed

+219
-1
lines changed

2 files changed

+219
-1
lines changed
Lines changed: 217 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,217 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package org.apache.hadoop.ozone.repair.om;
19+
20+
import static org.apache.hadoop.ozone.protocol.proto.OzoneManagerProtocolProtos.Type.EchoRPC;
21+
22+
import java.io.File;
23+
import java.io.IOException;
24+
import java.nio.ByteBuffer;
25+
import java.nio.file.Files;
26+
import java.nio.file.StandardCopyOption;
27+
import java.util.regex.Matcher;
28+
import java.util.regex.Pattern;
29+
import org.apache.hadoop.hdds.cli.HddsVersionProvider;
30+
import org.apache.hadoop.ozone.om.helpers.OMRatisHelper;
31+
import org.apache.hadoop.ozone.protocol.proto.OzoneManagerProtocolProtos;
32+
import org.apache.hadoop.ozone.repair.RepairTool;
33+
import org.apache.ratis.proto.RaftProtos;
34+
import org.apache.ratis.server.RaftServerConfigKeys;
35+
import org.apache.ratis.server.raftlog.segmented.LogSegment;
36+
import org.apache.ratis.server.raftlog.segmented.LogSegmentPath;
37+
import org.apache.ratis.server.raftlog.segmented.SegmentedRaftLogOutputStream;
38+
import org.apache.ratis.util.SizeInBytes;
39+
import picocli.CommandLine;
40+
41+
/**
42+
* Tool to omit a raft log in a ratis segment file.
43+
*/
44+
@CommandLine.Command(
45+
name = "skip-ratis-transaction",
46+
aliases = "srt",
47+
description = "CLI to omit a raft log in a ratis segment file. The raft log at the index specified " +
48+
"is replaced with an EchoOM command (which is a dummy command). It is an offline command " +
49+
"i.e., doesn't require OM to be running. " +
50+
"The command should be run for the same transaction on all 3 OMs only when all the OMs are crashing " +
51+
"while applying the same transaction. If only one OM is crashing and the " +
52+
"other OMs have executed the log successfully, then the DB should be manually copied " +
53+
"from one of the good OMs to the crashing OM instead.",
54+
mixinStandardHelpOptions = true,
55+
versionProvider = HddsVersionProvider.class
56+
)
57+
public class OMRatisLogRepair extends RepairTool {
58+
59+
@CommandLine.ArgGroup(multiplicity = "1")
60+
private ExclusiveArguments exclusiveArguments;
61+
62+
@CommandLine.Option(names = {"-b", "--backup"},
63+
required = true,
64+
description = "Directory to put the backup of the original repaired segment file before the repair.")
65+
private File backupDir;
66+
67+
@CommandLine.Option(names = {"--index"},
68+
required = true,
69+
description = "Index of the failing transaction that should be removed")
70+
private long index;
71+
72+
private SegmentedRaftLogOutputStream outputStream = null;
73+
74+
@Override
75+
public void execute() throws Exception {
76+
77+
if (exclusiveArguments.logDir != null) {
78+
exclusiveArguments.segmentFile = findSegmentFileContainingIndex();
79+
}
80+
81+
if (exclusiveArguments.segmentFile.getParentFile().toPath().equals(backupDir.toPath())) {
82+
throw new IOException("Backup directory cannot be same as segment file's parent directory.");
83+
}
84+
85+
LogSegmentPath pi = LogSegmentPath.matchLogSegment(this.exclusiveArguments.segmentFile.toPath());
86+
if (pi == null) {
87+
throw new IOException("Invalid Segment File");
88+
}
89+
90+
if (!exclusiveArguments.segmentFile.exists()) {
91+
throw new IOException("Error: Source segment file \"" + exclusiveArguments.segmentFile + "\" does not exist.");
92+
}
93+
if (!backupDir.exists()) {
94+
info("BackupDir \"" + backupDir + "\" does not exist. Creating the directory path.");
95+
if (!isDryRun()) {
96+
Files.createDirectories(backupDir.toPath());
97+
}
98+
}
99+
100+
File backupPath = new File(backupDir, exclusiveArguments.segmentFile.getName());
101+
if (backupPath.exists()) {
102+
throw new IOException("Error: Backup file for segment file \"" + exclusiveArguments.segmentFile +
103+
"\" already exists. Either delete the old backup or provide a different directory to take the backup.");
104+
}
105+
info("Taking back up of Raft Log file: " + this.exclusiveArguments.segmentFile.getAbsolutePath() +
106+
" to location: " + backupPath);
107+
if (!isDryRun()) {
108+
Files.copy(exclusiveArguments.segmentFile.toPath(), backupPath.toPath());
109+
}
110+
info("File backed-up successfully!");
111+
112+
File outputFile = null;
113+
if (!isDryRun()) {
114+
outputFile = File.createTempFile("srt-output", null, backupDir);
115+
outputFile.deleteOnExit();
116+
}
117+
info("Created temporary output file: " + (outputFile == null ? "<None>" : outputFile.toPath()));
118+
119+
info("Processing Raft Log file: " + this.exclusiveArguments.segmentFile.getAbsolutePath() + " size:" +
120+
this.exclusiveArguments.segmentFile.length());
121+
122+
if (!isDryRun()) {
123+
outputStream = new SegmentedRaftLogOutputStream(outputFile, false,
124+
1024, 1024, ByteBuffer.allocateDirect(SizeInBytes.valueOf("8MB").getSizeInt()));
125+
}
126+
127+
int entryCount = LogSegment.readSegmentFile(exclusiveArguments.segmentFile, pi.getStartEnd(),
128+
SizeInBytes.valueOf("32MB"), RaftServerConfigKeys.Log.CorruptionPolicy.EXCEPTION, null, this::processLogEntry);
129+
if (!isDryRun()) {
130+
outputStream.flush();
131+
outputStream.close();
132+
Files.move(outputFile.toPath(), exclusiveArguments.segmentFile.toPath(),
133+
StandardCopyOption.REPLACE_EXISTING, StandardCopyOption.ATOMIC_MOVE);
134+
}
135+
info("Finished processing all the entries (" + entryCount + " logs) from the segment file.");
136+
info("Moved temporary output file to correct raft log location : " + exclusiveArguments.segmentFile.toPath());
137+
138+
}
139+
140+
private void processLogEntry(RaftProtos.LogEntryProto proto) {
141+
try {
142+
RaftProtos.LogEntryProto newLogEntry = proto.getIndex() != index ? proto : getOmEchoLogEntry(proto);
143+
if (!isDryRun()) {
144+
outputStream.write(newLogEntry);
145+
outputStream.flush();
146+
}
147+
} catch (IOException ex) {
148+
throw new RuntimeException("Error while processing logEntry: (" + proto.getIndex() + "). Exception: " + ex);
149+
}
150+
}
151+
152+
private RaftProtos.LogEntryProto getOmEchoLogEntry(RaftProtos.LogEntryProto proto) throws IOException {
153+
OzoneManagerProtocolProtos.OMRequest.Builder newRequest = OzoneManagerProtocolProtos.OMRequest.newBuilder()
154+
.setCmdType(EchoRPC)
155+
.setClientId("skip-ratis-transaction-repair-tool")
156+
.setEchoRPCRequest(OzoneManagerProtocolProtos.EchoRPCRequest.newBuilder().build());
157+
RaftProtos.StateMachineLogEntryProto.Builder entry = proto.getStateMachineLogEntry().toBuilder()
158+
.setLogData(OMRatisHelper.convertRequestToByteString(newRequest.build()));
159+
OzoneManagerProtocolProtos.OMRequest oldRequest = OMRatisHelper
160+
.convertByteStringToOMRequest(proto.getStateMachineLogEntry().getLogData());
161+
info("Replacing {" + oldRequest.toString().replace("\n", " ")
162+
+ "} with EchoRPC command at index " + proto.getIndex());
163+
return proto.toBuilder()
164+
.setStateMachineLogEntry(entry)
165+
.build();
166+
}
167+
168+
private File findSegmentFileContainingIndex() {
169+
if (!exclusiveArguments.logDir.exists() || !exclusiveArguments.logDir.isDirectory()) {
170+
throw new IllegalArgumentException("Invalid log directory: " + exclusiveArguments.logDir);
171+
}
172+
173+
// Pattern to match Ratis log files: log_<start>-<end> or log_inprogress_<start>
174+
Pattern pattern = Pattern.compile("log(?:_inprogress)?_(\\d+)(?:-(\\d+))?");
175+
176+
File[] segmentFiles = exclusiveArguments.logDir.listFiles();
177+
if (segmentFiles == null) {
178+
throw new IllegalArgumentException("Invalid log directory: " + exclusiveArguments.logDir +
179+
". No segment files present.");
180+
}
181+
182+
for (File file : segmentFiles) {
183+
Matcher matcher = pattern.matcher(file.getName());
184+
if (matcher.matches()) {
185+
long start = Long.parseLong(matcher.group(1));
186+
String endStr = matcher.group(2);
187+
188+
// If it's an in-progress file, assume it contains all entries from start onwards
189+
if (endStr == null) {
190+
if (index >= start) {
191+
info("Segment file \"" + file + "\" contains the index (" + index + ").");
192+
return file;
193+
}
194+
} else {
195+
long end = Long.parseLong(endStr);
196+
if (index >= start && index <= end) {
197+
info("Segment file \"" + file + "\" contains the index (" + index + ").");
198+
return file;
199+
}
200+
}
201+
}
202+
}
203+
204+
throw new IllegalArgumentException("Invalid index (" + index
205+
+ ") for log directory: \"" + exclusiveArguments.logDir + "\". None of the segment files have the index.");
206+
}
207+
208+
private static final class ExclusiveArguments {
209+
@CommandLine.Option(names = {"-s", "--segment-path"},
210+
description = "Path of the input segment file")
211+
private File segmentFile;
212+
213+
@CommandLine.Option(names = {"-d", "--ratis-log-dir"},
214+
description = "Path of the ratis log directory")
215+
private File logDir;
216+
}
217+
}

hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/om/OMRepair.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,8 @@
3232
SnapshotRepair.class,
3333
TransactionInfoRepair.class,
3434
QuotaRepair.class,
35-
CompactOMDB.class
35+
CompactOMDB.class,
36+
OMRatisLogRepair.class
3637
},
3738
description = "Operational tool to repair OM.")
3839
@MetaInfServices(RepairSubcommand.class)

0 commit comments

Comments
 (0)