Skip to content

Commit 9bcc95a

Browse files
committed
Add corruption guards against hardware bit flips
Summary: Investigation of a production SST corruption revealed that a single hardware bit flip (bit 32) in a value Slice's size_ field during compaction caused a 5.6MB value to be treated as 4.3GB. The corruption was silent because: - BlockBuilder's varint encoding uses static_cast<uint32_t>(value.size()), which truncated the corrupted size back to the correct value - buffer_.append(value.data(), value.size()) used the full 64-bit corrupted size, appending 4GB of adjacent heap memory into the block - The block checksum was computed over the corrupted data, so it matched - paranoid_file_checks was disabled This change adds 2 layers of defense: 1. **Value size uint32 truncation guard**: In BlockBasedTableBuilder::Add(), detect when value.size() exceeds uint32_t range before passing it to BlockBuilder where the varint encoding would silently truncate it. Returns Status::Corruption so the flush/compaction can abort gracefully without crashing the process. This applies to both flush and compaction paths since all key-value pairs flow through BlockBasedTableBuilder::Add(). 2. **Compaction output/input size ratio check**: New option max_compaction_output_to_input_ratio (default: 10, 0 to disable) in MutableCFOptions. After compaction, if total output size exceeds this ratio times total input size, return Status::Corruption. This catches cases where corrupted values inflate the output file far beyond what input data justifies. This check applies to compaction only (not flush), since flush writes from a memtable and does not have input files for comparison. However, flush output is still protected by guard #1, the uint32 truncation check. Additionally, both the flush loop (db/builder.cc) and the compaction output loop (db/compaction/compaction_outputs.cc) now check builder->status() after each Add() call. Previously, a corruption error set inside the builder was only surfaced when Finish() was called, causing the loop to continue iterating through potentially millions of keys doing wasted work. 
Test Plan: Unit test
1 parent 4c89ff1 commit 9bcc95a

20 files changed

+368
-3
lines changed

db/builder.cc

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -263,6 +263,10 @@ Status BuildTable(
263263
break;
264264
}
265265
builder->Add(key_after_flush, value_after_flush);
266+
if (!builder->status().ok()) {
267+
s = builder->status();
268+
break;
269+
}
266270

267271
if (flush_stats) {
268272
flush_stats->num_output_records++;

db/c_test.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2557,6 +2557,10 @@ int main(int argc, char** argv) {
25572557
CheckNoError(err);
25582558

25592559
rocksdb_cuckoo_options_destroy(cuckoo_options);
2560+
2561+
// Reset table factory back to block-based so subsequent test phases
2562+
// (e.g., transactions) don't inherit the cuckoo table factory.
2563+
rocksdb_options_set_block_based_table_factory(options, table_options);
25602564
}
25612565

25622566
StartPhase("options");

db/compaction/compaction_job.cc

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -887,6 +887,42 @@ Status CompactionJob::VerifyOutputFiles() {
887887
VerifyOutputFlags verify_output_flags =
888888
compact_->compaction->mutable_cf_options().verify_output_flags;
889889

890+
// Check compaction output/input size ratio. A corrupted value (e.g., from
891+
// a hardware bit flip in Slice::size_) can inflate the output file far
892+
// beyond what the input data justifies.
893+
const uint64_t max_ratio = compact_->compaction->mutable_cf_options()
894+
.max_compaction_output_to_input_ratio;
895+
if (max_ratio > 0) {
896+
uint64_t total_input_bytes =
897+
compact_->compaction->CalculateTotalInputSize();
898+
uint64_t total_output_bytes = 0;
899+
for (const auto& state : compact_->sub_compact_states) {
900+
for (const auto& output : state.GetOutputs()) {
901+
total_output_bytes += output.meta.fd.file_size;
902+
}
903+
}
904+
// Only apply the ratio check when input is large enough for the ratio
905+
// to be meaningful. Small inputs (e.g., < 1MB) can legitimately produce
906+
// larger output due to block overhead, index/filter blocks, and metadata.
907+
static constexpr uint64_t kMinInputForRatioCheck = 1u << 20; // 1MB
908+
if (total_input_bytes >= kMinInputForRatioCheck &&
909+
total_output_bytes > total_input_bytes * max_ratio) {
910+
ROCKS_LOG_ERROR(
911+
db_options_.info_log,
912+
"[%s] [JOB %d] Compaction output size anomaly: "
913+
"output=%" PRIu64 " input=%" PRIu64 " ratio=%.1f max_ratio=%" PRIu64
914+
". Possible data corruption (e.g., hardware bit flip).",
915+
cfd->GetName().c_str(), job_id_, total_output_bytes,
916+
total_input_bytes,
917+
static_cast<double>(total_output_bytes) / total_input_bytes,
918+
max_ratio);
919+
return Status::Corruption(
920+
"Compaction output size (" + std::to_string(total_output_bytes) +
921+
") exceeds " + std::to_string(max_ratio) + "x input size (" +
922+
std::to_string(total_input_bytes) + "). Possible data corruption.");
923+
}
924+
}
925+
890926
// For backward compatibility
891927
if (paranoid_file_checks_) {
892928
verify_output_flags |= VerifyOutputFlags::kVerifyIteration;

db/compaction/compaction_outputs.cc

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -421,6 +421,9 @@ Status CompactionOutputs::AddToOutput(
421421
return s;
422422
}
423423
builder_->Add(key, value);
424+
if (!builder_->status().ok()) {
425+
return builder_->status();
426+
}
424427

425428
stats_.num_output_records++;
426429
current_output_file_size_ = builder_->EstimatedFileSize();

db/db_compaction_test.cc

Lines changed: 205 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11663,6 +11663,211 @@ TEST_F(DBCompactionTest, PeriodicTask) {
1166311663
ASSERT_EQ(listener->num_periodic_compactions, 1);
1166411664
Close();
1166511665
}
11666+
// End-to-end test: verify that a transient hardware bit flip in a value
11667+
// Slice's size_ during compaction is detected, the first compaction attempt
11668+
// fails gracefully (without making DB read-only), and the retry succeeds
11669+
// because the transient error does not recur.
11670+
TEST_F(DBCompactionTest, ValueSliceBitFlipDuringCompaction) {
11671+
Options options = CurrentOptions();
11672+
options.compression = kNoCompression;
11673+
options.num_levels = 3;
11674+
options.level0_file_num_compaction_trigger = 3;
11675+
// Disable auto compaction so we can trigger it manually.
11676+
options.disable_auto_compactions = true;
11677+
DestroyAndReopen(options);
11678+
11679+
// Write 3 L0 files with overlapping key ranges.
11680+
for (int file = 0; file < 3; file++) {
11681+
for (int i = 0; i < 10; i++) {
11682+
ASSERT_OK(Put(Key(i),
11683+
"value_" + std::to_string(file) + "_" + std::to_string(i)));
11684+
}
11685+
ASSERT_OK(Flush());
11686+
}
11687+
ASSERT_EQ("3", FilesPerLevel());
11688+
11689+
// Verify all data is readable before compaction.
11690+
for (int i = 0; i < 10; i++) {
11691+
ASSERT_EQ("value_2_" + std::to_string(i), Get(Key(i)));
11692+
}
11693+
11694+
// Set up SyncPoint to flip bit 32 on the 5th key's value during compaction.
11695+
// Uses atomic to be safe and only corrupts once (simulating transient error).
11696+
std::atomic<int> add_count{0};
11697+
std::atomic<bool> corrupted{false};
11698+
SyncPoint::GetInstance()->SetCallBack(
11699+
"BlockBasedTableBuilder::Add:TamperWithValue", [&](void* arg) {
11700+
int count = add_count.fetch_add(1);
11701+
if (count == 4 && !corrupted.load()) {
11702+
corrupted.store(true);
11703+
Slice* v = static_cast<Slice*>(arg);
11704+
v->size_ |= size_t{1} << 32;
11705+
}
11706+
});
11707+
SyncPoint::GetInstance()->EnableProcessing();
11708+
11709+
// First compaction attempt: fails due to bit flip, but treated as transient.
11710+
Status s = dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
11711+
ASSERT_TRUE(s.IsCorruption()) << s.ToString();
11712+
11713+
// Disable the corruption injection before testing DB availability.
11714+
SyncPoint::GetInstance()->DisableProcessing();
11715+
SyncPoint::GetInstance()->ClearAllCallBacks();
11716+
11717+
// DB should NOT be in a fatal error state — transient corruption allows
11718+
// retry. Reads should still work.
11719+
for (int i = 0; i < 10; i++) {
11720+
ASSERT_EQ("value_2_" + std::to_string(i), Get(Key(i)));
11721+
}
11722+
11723+
// Writes should still work (DB is not read-only).
11724+
ASSERT_OK(Put(Key(100), "new_value"));
11725+
ASSERT_OK(Flush());
11726+
11727+
// Second compaction attempt: succeeds because the bit flip was transient.
11728+
s = dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
11729+
ASSERT_OK(s) << s.ToString();
11730+
11731+
// All data should be readable after successful retry.
11732+
for (int i = 0; i < 10; i++) {
11733+
ASSERT_EQ("value_2_" + std::to_string(i), Get(Key(i)));
11734+
}
11735+
ASSERT_EQ("new_value", Get(Key(100)));
11736+
11737+
// Verify the retry counter was reset: new writes, flushes, and compactions
11738+
// should all succeed, proving the DB is fully recovered.
11739+
for (int i = 200; i < 210; i++) {
11740+
ASSERT_OK(Put(Key(i), "post_recovery_" + std::to_string(i)));
11741+
}
11742+
ASSERT_OK(Flush());
11743+
s = dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
11744+
ASSERT_OK(s) << s.ToString();
11745+
11746+
// All data (old and new) should be readable.
11747+
for (int i = 0; i < 10; i++) {
11748+
ASSERT_EQ("value_2_" + std::to_string(i), Get(Key(i)));
11749+
}
11750+
for (int i = 200; i < 210; i++) {
11751+
ASSERT_EQ("post_recovery_" + std::to_string(i), Get(Key(i)));
11752+
}
11753+
}
11754+
11755+
// End-to-end test: verify that a persistent (non-transient) bit flip causes
11756+
// the DB to become read-only after the retry fails. The first compaction
11757+
// attempt is retried, but the second attempt also hits the corruption
11758+
// (because we keep the SyncPoint active), so the DB escalates to fatal.
11759+
TEST_F(DBCompactionTest, PersistentBitFlipMakesDBReadOnly) {
11760+
Options options = CurrentOptions();
11761+
options.compression = kNoCompression;
11762+
options.num_levels = 3;
11763+
options.level0_file_num_compaction_trigger = 3;
11764+
options.disable_auto_compactions = true;
11765+
DestroyAndReopen(options);
11766+
11767+
for (int file = 0; file < 3; file++) {
11768+
for (int i = 0; i < 10; i++) {
11769+
ASSERT_OK(Put(Key(i),
11770+
"value_" + std::to_string(file) + "_" + std::to_string(i)));
11771+
}
11772+
ASSERT_OK(Flush());
11773+
}
11774+
ASSERT_EQ("3", FilesPerLevel());
11775+
11776+
// Set up SyncPoint to ALWAYS corrupt (simulating persistent hardware error).
11777+
// Every compaction attempt will hit this corruption.
11778+
std::atomic<int> add_count{0};
11779+
SyncPoint::GetInstance()->SetCallBack(
11780+
"BlockBasedTableBuilder::Add:TamperWithValue", [&](void* arg) {
11781+
int count = add_count.fetch_add(1);
11782+
// Corrupt the 5th key in every compaction attempt
11783+
if (count % 10 == 4) {
11784+
Slice* v = static_cast<Slice*>(arg);
11785+
v->size_ |= size_t{1} << 32;
11786+
}
11787+
});
11788+
SyncPoint::GetInstance()->EnableProcessing();
11789+
11790+
// First compaction attempt: fails, but treated as transient (retry allowed).
11791+
Status s = dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
11792+
ASSERT_TRUE(s.IsCorruption()) << s.ToString();
11793+
11794+
// Reads should still work after first failure (no BG error set).
11795+
for (int i = 0; i < 10; i++) {
11796+
ASSERT_EQ("value_2_" + std::to_string(i), Get(Key(i)));
11797+
}
11798+
11799+
// Writes should still work after first failure.
11800+
ASSERT_OK(Put(Key(200), "still_writable"));
11801+
11802+
// Second compaction attempt: also fails. This time it should escalate
11803+
// to a fatal BG error because the corruption is persistent.
11804+
s = dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
11805+
ASSERT_TRUE(s.IsCorruption()) << s.ToString();
11806+
11807+
SyncPoint::GetInstance()->DisableProcessing();
11808+
SyncPoint::GetInstance()->ClearAllCallBacks();
11809+
11810+
// Reads of existing data should still work even after fatal error.
11811+
for (int i = 0; i < 10; i++) {
11812+
ASSERT_EQ("value_2_" + std::to_string(i), Get(Key(i)));
11813+
}
11814+
11815+
// Writes should now FAIL because the DB is in a fatal error state.
11816+
s = Put(Key(300), "should_fail");
11817+
ASSERT_TRUE(!s.ok()) << "Put should fail after persistent corruption: "
11818+
<< s.ToString();
11819+
}
11820+
11821+
// End-to-end test: verify that the max_compaction_output_to_input_ratio
11822+
// check detects grossly inflated compaction output and aborts the compaction.
11823+
TEST_F(DBCompactionTest, CompactionOutputToInputRatioCheck) {
11824+
Options options = CurrentOptions();
11825+
options.compression = kNoCompression;
11826+
options.num_levels = 3;
11827+
options.level0_file_num_compaction_trigger = 3;
11828+
options.disable_auto_compactions = true;
11829+
// Set a tight ratio to make it easy to trigger.
11830+
options.max_compaction_output_to_input_ratio = 2;
11831+
DestroyAndReopen(options);
11832+
11833+
// Write 3 L0 files large enough to exceed the 1MB minimum threshold
11834+
// for the ratio check.
11835+
std::string value_1kb(1024, 'x');
11836+
for (int file = 0; file < 3; file++) {
11837+
for (int i = 0; i < 500; i++) {
11838+
ASSERT_OK(Put(Key(i), value_1kb));
11839+
}
11840+
ASSERT_OK(Flush());
11841+
}
11842+
ASSERT_EQ("3", FilesPerLevel());
11843+
11844+
// Set up SyncPoint to inflate every value to 2MB during compaction.
11845+
// This will make the output file much larger than input.
11846+
std::string large_buf(2 << 20, 'x'); // 2MB buffer
11847+
11848+
int add_count = 0;
11849+
SyncPoint::GetInstance()->SetCallBack(
11850+
"BlockBasedTableBuilder::Add:TamperWithValue", [&](void* arg) {
11851+
add_count++;
11852+
Slice* v = static_cast<Slice*>(arg);
11853+
v->data_ = large_buf.data();
11854+
v->size_ = large_buf.size();
11855+
});
11856+
SyncPoint::GetInstance()->EnableProcessing();
11857+
11858+
// Trigger compaction. The output should be much larger than input,
11859+
// exceeding the 2x ratio limit.
11860+
Status s = dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
11861+
11862+
SyncPoint::GetInstance()->DisableProcessing();
11863+
SyncPoint::GetInstance()->ClearAllCallBacks();
11864+
11865+
// The compaction should have failed due to the output/input ratio check.
11866+
ASSERT_TRUE(s.IsCorruption()) << s.ToString();
11867+
ASSERT_TRUE(s.ToString().find("exceeds") != std::string::npos)
11868+
<< s.ToString();
11869+
}
11870+
1166611871
} // namespace ROCKSDB_NAMESPACE
1166711872

1166811873
int main(int argc, char** argv) {

db/db_impl/db_impl.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3051,6 +3051,17 @@ class DBImpl : public DB {
30513051
// stores the number of compactions are currently running
30523052
int num_running_compactions_ = 0;
30533053

3054+
// Count consecutive transient in-memory corruption errors (e.g., hardware
3055+
// bit flips). On first occurrence, allow retry. On second, escalate to fatal.
3056+
int transient_corruption_retry_count_ = 0;
3057+
3058+
// Returns true if the error is a transient data corruption that should be
3059+
// retried rather than escalated to a fatal BG error. On first occurrence,
3060+
// returns true and increments the counter. On second consecutive occurrence,
3061+
// returns false (escalate). Resets counter on non-transient errors.
3062+
// Requires: db_mutex_ held.
3063+
bool ShouldRetryTransientCorruption(const Status& s, const char* context);
3064+
30543065
// number of background memtable flush jobs, submitted to the HIGH pool
30553066
int bg_flush_scheduled_ = 0;
30563067

db/db_impl/db_impl_compaction_flush.cc

Lines changed: 40 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,31 @@ IOStatus DBImpl::SyncClosedWals(const WriteOptions& write_options,
142142
return io_s;
143143
}
144144

145+
bool DBImpl::ShouldRetryTransientCorruption(const Status& s,
146+
const char* context) {
147+
mutex_.AssertHeld();
148+
if (!s.IsCorruption() || s.subcode() != Status::kTransientDataCorruption) {
149+
// Not a transient corruption — reset counter and don't retry.
150+
transient_corruption_retry_count_ = 0;
151+
return false;
152+
}
153+
if (transient_corruption_retry_count_ == 0) {
154+
// First occurrence — allow retry.
155+
transient_corruption_retry_count_++;
156+
ROCKS_LOG_WARN(immutable_db_options_.info_log,
157+
"[%s] Transient data corruption detected, will retry: %s",
158+
context, s.ToString().c_str());
159+
return true;
160+
}
161+
// Second consecutive occurrence — escalate to fatal.
162+
ROCKS_LOG_ERROR(immutable_db_options_.info_log,
163+
"[%s] Transient data corruption persisted after retry, "
164+
"escalating to fatal error: %s",
165+
context, s.ToString().c_str());
166+
transient_corruption_retry_count_ = 0;
167+
return false;
168+
}
169+
145170
Status DBImpl::FlushMemTableToOutputFile(
146171
ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
147172
bool* made_progress, JobContext* job_context, FlushReason flush_reason,
@@ -330,7 +355,10 @@ Status DBImpl::FlushMemTableToOutputFile(
330355

331356
if (!s.ok() && !s.IsShutdownInProgress() && !s.IsColumnFamilyDropped() &&
332357
!skip_set_bg_error) {
333-
if (log_io_s.ok()) {
358+
if (ShouldRetryTransientCorruption(s, cfd->GetName().c_str())) {
359+
// Transient corruption: memtable has been rolled back and
360+
// imm_flush_needed is set, so the flush will be rescheduled.
361+
} else if (log_io_s.ok()) {
334362
// Error while writing to MANIFEST.
335363
// In fact, versions_->io_status() can also be the result of renaming
336364
// CURRENT file. With current code, it's just difficult to tell. So just
@@ -1710,7 +1738,8 @@ Status DBImpl::CompactFilesImpl(
17101738
mutex_.Lock();
17111739

17121740
if (status.ok()) {
1713-
// Done
1741+
// Done. Reset transient corruption counter on success.
1742+
transient_corruption_retry_count_ = 0;
17141743
} else if (status.IsColumnFamilyDropped() || status.IsShutdownInProgress()) {
17151744
// Ignore compaction errors found during shutting down
17161745
} else if (status.IsManualCompactionPaused()) {
@@ -1724,6 +1753,9 @@ Status DBImpl::CompactFilesImpl(
17241753
ROCKS_LOG_INFO(
17251754
immutable_db_options_.info_log, "[%s] [JOB %d] Compaction aborted",
17261755
c->column_family_data()->GetName().c_str(), job_context->job_id);
1756+
} else if (ShouldRetryTransientCorruption(
1757+
status, c->column_family_data()->GetName().c_str())) {
1758+
// Transient corruption: input files intact, output not installed.
17271759
} else {
17281760
ROCKS_LOG_WARN(immutable_db_options_.info_log,
17291761
"[%s] [JOB %d] Compaction error: %s",
@@ -4464,9 +4496,14 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
44644496

44654497
if (status.ok() || status.IsCompactionTooLarge() ||
44664498
status.IsManualCompactionPaused() || status.IsCompactionAborted()) {
4467-
// Done
4499+
// Done. Reset transient corruption counter on success.
4500+
if (status.ok()) {
4501+
transient_corruption_retry_count_ = 0;
4502+
}
44684503
} else if (status.IsColumnFamilyDropped() || status.IsShutdownInProgress()) {
44694504
// Ignore compaction errors found during shutting down
4505+
} else if (ShouldRetryTransientCorruption(status, "BackgroundCompaction")) {
4506+
// Transient corruption: input files intact, output not installed.
44704507
} else {
44714508
ROCKS_LOG_WARN(immutable_db_options_.info_log, "Compaction error: %s",
44724509
status.ToString().c_str());

db_stress_tool/db_stress_common.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -326,6 +326,7 @@ DECLARE_int32(approximate_size_one_in);
326326
DECLARE_bool(best_efforts_recovery);
327327
DECLARE_bool(skip_verifydb);
328328
DECLARE_bool(paranoid_file_checks);
329+
DECLARE_uint64(max_compaction_output_to_input_ratio);
329330
DECLARE_uint64(batch_protection_bytes_per_key);
330331
DECLARE_uint32(memtable_protection_bytes_per_key);
331332
DECLARE_uint32(block_protection_bytes_per_key);

0 commit comments

Comments
 (0)