Skip to content

Commit 9bcc95a

Browse files
committed
Add corruption guards against hardware bit flips
Summary: Investigation of a production SST corruption revealed that a single hardware bit flip (bit 32) in a value Slice's size_ field during compaction caused a 5.6MB value to be treated as 4.3GB. The corruption was silent because: - BlockBuilder's varint encoding uses static_cast<uint32_t>(value.size()), which truncated the corrupted size back to the correct value - buffer_.append(value.data(), value.size()) used the full 64-bit corrupted size, appending 4GB of adjacent heap memory into the block - The block checksum was computed over the corrupted data, so it matched - paranoid_file_checks was disabled This change adds 2 layers of defense: 1. **Value size uint32 truncation guard**: In BlockBasedTableBuilder::Add(), detect when value.size() exceeds uint32_t range before passing it to BlockBuilder where the varint encoding would silently truncate it. Returns Status::Corruption so the flush/compaction can abort gracefully without crashing the process. This applies to both flush and compaction paths since all key-value pairs flow through BlockBasedTableBuilder::Add(). 2. **Compaction output/input size ratio check**: New option max_compaction_output_to_input_ratio (default: 10, 0 to disable) in MutableCFOptions. After compaction, if total output size exceeds this ratio times total input size, return Status::Corruption. This catches cases where corrupted values inflate the output file far beyond what input data justifies. This check applies to compaction only (not flush), since flush writes from a memtable and does not have input files for comparison. However, flush output is still protected by guard #1, the uint32 truncation check. Additionally, both the flush loop (db/builder.cc) and the compaction output loop (db/compaction/compaction_outputs.cc) now check builder->status() after each Add() call. Previously, a corruption error set inside the builder was only surfaced when Finish() was called, causing the loop to continue iterating through potentially millions of keys doing wasted work. 
Test Plan: Unit test
1 parent 4c89ff1 commit 9bcc95a

20 files changed

+368
-3
lines changed

db/builder.cc

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -263,6 +263,10 @@ Status BuildTable(
263263
break;
264264
}
265265
builder->Add(key_after_flush, value_after_flush);
266+
if (!builder->status().ok()) {
267+
s = builder->status();
268+
break;
269+
}
266270

267271
if (flush_stats) {
268272
flush_stats->num_output_records++;

db/c_test.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2557,6 +2557,10 @@ int main(int argc, char** argv) {
25572557
CheckNoError(err);
25582558

25592559
rocksdb_cuckoo_options_destroy(cuckoo_options);
2560+
2561+
// Reset table factory back to block-based so subsequent test phases
2562+
// (e.g., transactions) don't inherit the cuckoo table factory.
2563+
rocksdb_options_set_block_based_table_factory(options, table_options);
25602564
}
25612565

25622566
StartPhase("options");

db/compaction/compaction_job.cc

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -887,6 +887,42 @@ Status CompactionJob::VerifyOutputFiles() {
887887
VerifyOutputFlags verify_output_flags =
888888
compact_->compaction->mutable_cf_options().verify_output_flags;
889889

890+
// Check compaction output/input size ratio. A corrupted value (e.g., from
891+
// a hardware bit flip in Slice::size_) can inflate the output file far
892+
// beyond what the input data justifies.
893+
const uint64_t max_ratio = compact_->compaction->mutable_cf_options()
894+
.max_compaction_output_to_input_ratio;
895+
if (max_ratio > 0) {
896+
uint64_t total_input_bytes =
897+
compact_->compaction->CalculateTotalInputSize();
898+
uint64_t total_output_bytes = 0;
899+
for (const auto& state : compact_->sub_compact_states) {
900+
for (const auto& output : state.GetOutputs()) {
901+
total_output_bytes += output.meta.fd.file_size;
902+
}
903+
}
904+
// Only apply the ratio check when input is large enough for the ratio
905+
// to be meaningful. Small inputs (e.g., < 1MB) can legitimately produce
906+
// larger output due to block overhead, index/filter blocks, and metadata.
907+
static constexpr uint64_t kMinInputForRatioCheck = 1u << 20; // 1MB
908+
if (total_input_bytes >= kMinInputForRatioCheck &&
909+
total_output_bytes > total_input_bytes * max_ratio) {
910+
ROCKS_LOG_ERROR(
911+
db_options_.info_log,
912+
"[%s] [JOB %d] Compaction output size anomaly: "
913+
"output=%" PRIu64 " input=%" PRIu64 " ratio=%.1f max_ratio=%" PRIu64
914+
". Possible data corruption (e.g., hardware bit flip).",
915+
cfd->GetName().c_str(), job_id_, total_output_bytes,
916+
total_input_bytes,
917+
static_cast<double>(total_output_bytes) / total_input_bytes,
918+
max_ratio);
919+
return Status::Corruption(
920+
"Compaction output size (" + std::to_string(total_output_bytes) +
921+
") exceeds " + std::to_string(max_ratio) + "x input size (" +
922+
std::to_string(total_input_bytes) + "). Possible data corruption.");
923+
}
924+
}
925+
890926
// For backward compatibility
891927
if (paranoid_file_checks_) {
892928
verify_output_flags |= VerifyOutputFlags::kVerifyIteration;

db/compaction/compaction_outputs.cc

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -421,6 +421,9 @@ Status CompactionOutputs::AddToOutput(
421421
return s;
422422
}
423423
builder_->Add(key, value);
424+
if (!builder_->status().ok()) {
425+
return builder_->status();
426+
}
424427

425428
stats_.num_output_records++;
426429
current_output_file_size_ = builder_->EstimatedFileSize();

db/db_compaction_test.cc

Lines changed: 205 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11663,6 +11663,211 @@ TEST_F(DBCompactionTest, PeriodicTask) {
1166311663
ASSERT_EQ(listener->num_periodic_compactions, 1);
1166411664
Close();
1166511665
}
11666+
// End-to-end test: verify that a transient hardware bit flip in a value
11667+
// Slice's size_ during compaction is detected, the first compaction attempt
11668+
// fails gracefully (without making DB read-only), and the retry succeeds
11669+
// because the transient error does not recur.
11670+
TEST_F(DBCompactionTest, ValueSliceBitFlipDuringCompaction) {
11671+
Options options = CurrentOptions();
11672+
options.compression = kNoCompression;
11673+
options.num_levels = 3;
11674+
options.level0_file_num_compaction_trigger = 3;
11675+
// Disable auto compaction so we can trigger it manually.
11676+
options.disable_auto_compactions = true;
11677+
DestroyAndReopen(options);
11678+
11679+
// Write 3 L0 files with overlapping key ranges.
11680+
for (int file = 0; file < 3; file++) {
11681+
for (int i = 0; i < 10; i++) {
11682+
ASSERT_OK(Put(Key(i),
11683+
"value_" + std::to_string(file) + "_" + std::to_string(i)));
11684+
}
11685+
ASSERT_OK(Flush());
11686+
}
11687+
ASSERT_EQ("3", FilesPerLevel());
11688+
11689+
// Verify all data is readable before compaction.
11690+
for (int i = 0; i < 10; i++) {
11691+
ASSERT_EQ("value_2_" + std::to_string(i), Get(Key(i)));
11692+
}
11693+
11694+
// Set up SyncPoint to flip bit 32 on the 5th key's value during compaction.
11695+
// Uses atomic to be safe and only corrupts once (simulating transient error).
11696+
std::atomic<int> add_count{0};
11697+
std::atomic<bool> corrupted{false};
11698+
SyncPoint::GetInstance()->SetCallBack(
11699+
"BlockBasedTableBuilder::Add:TamperWithValue", [&](void* arg) {
11700+
int count = add_count.fetch_add(1);
11701+
if (count == 4 && !corrupted.load()) {
11702+
corrupted.store(true);
11703+
Slice* v = static_cast<Slice*>(arg);
11704+
v->size_ |= size_t{1} << 32;
11705+
}
11706+
});
11707+
SyncPoint::GetInstance()->EnableProcessing();
11708+
11709+
// First compaction attempt: fails due to bit flip, but treated as transient.
11710+
Status s = dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
11711+
ASSERT_TRUE(s.IsCorruption()) << s.ToString();
11712+
11713+
// Disable the corruption injection before testing DB availability.
11714+
SyncPoint::GetInstance()->DisableProcessing();
11715+
SyncPoint::GetInstance()->ClearAllCallBacks();
11716+
11717+
// DB should NOT be in a fatal error state — transient corruption allows
11718+
// retry. Reads should still work.
11719+
for (int i = 0; i < 10; i++) {
11720+
ASSERT_EQ("value_2_" + std::to_string(i), Get(Key(i)));
11721+
}
11722+
11723+
// Writes should still work (DB is not read-only).
11724+
ASSERT_OK(Put(Key(100), "new_value"));
11725+
ASSERT_OK(Flush());
11726+
11727+
// Second compaction attempt: succeeds because the bit flip was transient.
11728+
s = dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
11729+
ASSERT_OK(s) << s.ToString();
11730+
11731+
// All data should be readable after successful retry.
11732+
for (int i = 0; i < 10; i++) {
11733+
ASSERT_EQ("value_2_" + std::to_string(i), Get(Key(i)));
11734+
}
11735+
ASSERT_EQ("new_value", Get(Key(100)));
11736+
11737+
// Verify the retry counter was reset: new writes, flushes, and compactions
11738+
// should all succeed, proving the DB is fully recovered.
11739+
for (int i = 200; i < 210; i++) {
11740+
ASSERT_OK(Put(Key(i), "post_recovery_" + std::to_string(i)));
11741+
}
11742+
ASSERT_OK(Flush());
11743+
s = dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
11744+
ASSERT_OK(s) << s.ToString();
11745+
11746+
// All data (old and new) should be readable.
11747+
for (int i = 0; i < 10; i++) {
11748+
ASSERT_EQ("value_2_" + std::to_string(i), Get(Key(i)));
11749+
}
11750+
for (int i = 200; i < 210; i++) {
11751+
ASSERT_EQ("post_recovery_" + std::to_string(i), Get(Key(i)));
11752+
}
11753+
}
11754+
11755+
// End-to-end test: verify that a persistent (non-transient) bit flip causes
11756+
// the DB to become read-only after the retry fails. The first compaction
11757+
// attempt is retried, but the second attempt also hits the corruption
11758+
// (because we keep the SyncPoint active), so the DB escalates to fatal.
11759+
TEST_F(DBCompactionTest, PersistentBitFlipMakesDBReadOnly) {
11760+
Options options = CurrentOptions();
11761+
options.compression = kNoCompression;
11762+
options.num_levels = 3;
11763+
options.level0_file_num_compaction_trigger = 3;
11764+
options.disable_auto_compactions = true;
11765+
DestroyAndReopen(options);
11766+
11767+
for (int file = 0; file < 3; file++) {
11768+
for (int i = 0; i < 10; i++) {
11769+
ASSERT_OK(Put(Key(i),
11770+
"value_" + std::to_string(file) + "_" + std::to_string(i)));
11771+
}
11772+
ASSERT_OK(Flush());
11773+
}
11774+
ASSERT_EQ("3", FilesPerLevel());
11775+
11776+
// Set up SyncPoint to ALWAYS corrupt (simulating persistent hardware error).
11777+
// Every compaction attempt will hit this corruption.
11778+
std::atomic<int> add_count{0};
11779+
SyncPoint::GetInstance()->SetCallBack(
11780+
"BlockBasedTableBuilder::Add:TamperWithValue", [&](void* arg) {
11781+
int count = add_count.fetch_add(1);
11782+
// Corrupt the 5th key in every compaction attempt
11783+
if (count % 10 == 4) {
11784+
Slice* v = static_cast<Slice*>(arg);
11785+
v->size_ |= size_t{1} << 32;
11786+
}
11787+
});
11788+
SyncPoint::GetInstance()->EnableProcessing();
11789+
11790+
// First compaction attempt: fails, but treated as transient (retry allowed).
11791+
Status s = dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
11792+
ASSERT_TRUE(s.IsCorruption()) << s.ToString();
11793+
11794+
// Reads should still work after first failure (no BG error set).
11795+
for (int i = 0; i < 10; i++) {
11796+
ASSERT_EQ("value_2_" + std::to_string(i), Get(Key(i)));
11797+
}
11798+
11799+
// Writes should still work after first failure.
11800+
ASSERT_OK(Put(Key(200), "still_writable"));
11801+
11802+
// Second compaction attempt: also fails. This time it should escalate
11803+
// to a fatal BG error because the corruption is persistent.
11804+
s = dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
11805+
ASSERT_TRUE(s.IsCorruption()) << s.ToString();
11806+
11807+
SyncPoint::GetInstance()->DisableProcessing();
11808+
SyncPoint::GetInstance()->ClearAllCallBacks();
11809+
11810+
// Reads of existing data should still work even after fatal error.
11811+
for (int i = 0; i < 10; i++) {
11812+
ASSERT_EQ("value_2_" + std::to_string(i), Get(Key(i)));
11813+
}
11814+
11815+
// Writes should now FAIL because the DB is in a fatal error state.
11816+
s = Put(Key(300), "should_fail");
11817+
ASSERT_TRUE(!s.ok()) << "Put should fail after persistent corruption: "
11818+
<< s.ToString();
11819+
}
11820+
11821+
// End-to-end test: verify that the max_compaction_output_to_input_ratio
11822+
// check detects grossly inflated compaction output and aborts the compaction.
11823+
TEST_F(DBCompactionTest, CompactionOutputToInputRatioCheck) {
11824+
Options options = CurrentOptions();
11825+
options.compression = kNoCompression;
11826+
options.num_levels = 3;
11827+
options.level0_file_num_compaction_trigger = 3;
11828+
options.disable_auto_compactions = true;
11829+
// Set a tight ratio to make it easy to trigger.
11830+
options.max_compaction_output_to_input_ratio = 2;
11831+
DestroyAndReopen(options);
11832+
11833+
// Write 3 L0 files large enough to exceed the 1MB minimum threshold
11834+
// for the ratio check.
11835+
std::string value_1kb(1024, 'x');
11836+
for (int file = 0; file < 3; file++) {
11837+
for (int i = 0; i < 500; i++) {
11838+
ASSERT_OK(Put(Key(i), value_1kb));
11839+
}
11840+
ASSERT_OK(Flush());
11841+
}
11842+
ASSERT_EQ("3", FilesPerLevel());
11843+
11844+
// Set up SyncPoint to inflate every value to 2MB during compaction.
11845+
// This will make the output file much larger than input.
11846+
std::string large_buf(2 << 20, 'x'); // 2MB buffer
11847+
11848+
int add_count = 0;
11849+
SyncPoint::GetInstance()->SetCallBack(
11850+
"BlockBasedTableBuilder::Add:TamperWithValue", [&](void* arg) {
11851+
add_count++;
11852+
Slice* v = static_cast<Slice*>(arg);
11853+
v->data_ = large_buf.data();
11854+
v->size_ = large_buf.size();
11855+
});
11856+
SyncPoint::GetInstance()->EnableProcessing();
11857+
11858+
// Trigger compaction. The output should be much larger than input,
11859+
// exceeding the 2x ratio limit.
11860+
Status s = dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
11861+
11862+
SyncPoint::GetInstance()->DisableProcessing();
11863+
SyncPoint::GetInstance()->ClearAllCallBacks();
11864+
11865+
// The compaction should have failed due to the output/input ratio check.
11866+
ASSERT_TRUE(s.IsCorruption()) << s.ToString();
11867+
ASSERT_TRUE(s.ToString().find("exceeds") != std::string::npos)
11868+
<< s.ToString();
11869+
}
11870+
1166611871
} // namespace ROCKSDB_NAMESPACE
1166711872

1166811873
int main(int argc, char** argv) {

db/db_impl/db_impl.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3051,6 +3051,17 @@ class DBImpl : public DB {
30513051
// stores the number of compactions are currently running
30523052
int num_running_compactions_ = 0;
30533053

3054+
// Count consecutive transient in-memory corruption errors (e.g., hardware
3055+
// bit flips). On first occurrence, allow retry. On second, escalate to fatal.
3056+
int transient_corruption_retry_count_ = 0;
3057+
3058+
// Returns true if the error is a transient data corruption that should be
3059+
// retried rather than escalated to a fatal BG error. On first occurrence,
3060+
// returns true and increments the counter. On second consecutive occurrence,
3061+
// returns false (escalate). Resets counter on non-transient errors.
3062+
// Requires: db_mutex_ held.
3063+
bool ShouldRetryTransientCorruption(const Status& s, const char* context);
3064+
30543065
// number of background memtable flush jobs, submitted to the HIGH pool
30553066
int bg_flush_scheduled_ = 0;
30563067

db/db_impl/db_impl_compaction_flush.cc

Lines changed: 40 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,31 @@ IOStatus DBImpl::SyncClosedWals(const WriteOptions& write_options,
142142
return io_s;
143143
}
144144

145+
bool DBImpl::ShouldRetryTransientCorruption(const Status& s,
146+
const char* context) {
147+
mutex_.AssertHeld();
148+
if (!s.IsCorruption() || s.subcode() != Status::kTransientDataCorruption) {
149+
// Not a transient corruption — reset counter and don't retry.
150+
transient_corruption_retry_count_ = 0;
151+
return false;
152+
}
153+
if (transient_corruption_retry_count_ == 0) {
154+
// First occurrence — allow retry.
155+
transient_corruption_retry_count_++;
156+
ROCKS_LOG_WARN(immutable_db_options_.info_log,
157+
"[%s] Transient data corruption detected, will retry: %s",
158+
context, s.ToString().c_str());
159+
return true;
160+
}
161+
// Second consecutive occurrence — escalate to fatal.
162+
ROCKS_LOG_ERROR(immutable_db_options_.info_log,
163+
"[%s] Transient data corruption persisted after retry, "
164+
"escalating to fatal error: %s",
165+
context, s.ToString().c_str());
166+
transient_corruption_retry_count_ = 0;
167+
return false;
168+
}
169+
145170
Status DBImpl::FlushMemTableToOutputFile(
146171
ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
147172
bool* made_progress, JobContext* job_context, FlushReason flush_reason,
@@ -330,7 +355,10 @@ Status DBImpl::FlushMemTableToOutputFile(
330355

331356
if (!s.ok() && !s.IsShutdownInProgress() && !s.IsColumnFamilyDropped() &&
332357
!skip_set_bg_error) {
333-
if (log_io_s.ok()) {
358+
if (ShouldRetryTransientCorruption(s, cfd->GetName().c_str())) {
359+
// Transient corruption: memtable has been rolled back and
360+
// imm_flush_needed is set, so the flush will be rescheduled.
361+
} else if (log_io_s.ok()) {
334362
// Error while writing to MANIFEST.
335363
// In fact, versions_->io_status() can also be the result of renaming
336364
// CURRENT file. With current code, it's just difficult to tell. So just
@@ -1710,7 +1738,8 @@ Status DBImpl::CompactFilesImpl(
17101738
mutex_.Lock();
17111739

17121740
if (status.ok()) {
1713-
// Done
1741+
// Done. Reset transient corruption counter on success.
1742+
transient_corruption_retry_count_ = 0;
17141743
} else if (status.IsColumnFamilyDropped() || status.IsShutdownInProgress()) {
17151744
// Ignore compaction errors found during shutting down
17161745
} else if (status.IsManualCompactionPaused()) {
@@ -1724,6 +1753,9 @@ Status DBImpl::CompactFilesImpl(
17241753
ROCKS_LOG_INFO(
17251754
immutable_db_options_.info_log, "[%s] [JOB %d] Compaction aborted",
17261755
c->column_family_data()->GetName().c_str(), job_context->job_id);
1756+
} else if (ShouldRetryTransientCorruption(
1757+
status, c->column_family_data()->GetName().c_str())) {
1758+
// Transient corruption: input files intact, output not installed.
17271759
} else {
17281760
ROCKS_LOG_WARN(immutable_db_options_.info_log,
17291761
"[%s] [JOB %d] Compaction error: %s",
@@ -4464,9 +4496,14 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
44644496

44654497
if (status.ok() || status.IsCompactionTooLarge() ||
44664498
status.IsManualCompactionPaused() || status.IsCompactionAborted()) {
4467-
// Done
4499+
// Done. Reset transient corruption counter on success.
4500+
if (status.ok()) {
4501+
transient_corruption_retry_count_ = 0;
4502+
}
44684503
} else if (status.IsColumnFamilyDropped() || status.IsShutdownInProgress()) {
44694504
// Ignore compaction errors found during shutting down
4505+
} else if (ShouldRetryTransientCorruption(status, "BackgroundCompaction")) {
4506+
// Transient corruption: input files intact, output not installed.
44704507
} else {
44714508
ROCKS_LOG_WARN(immutable_db_options_.info_log, "Compaction error: %s",
44724509
status.ToString().c_str());

db_stress_tool/db_stress_common.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -326,6 +326,7 @@ DECLARE_int32(approximate_size_one_in);
326326
DECLARE_bool(best_efforts_recovery);
327327
DECLARE_bool(skip_verifydb);
328328
DECLARE_bool(paranoid_file_checks);
329+
DECLARE_uint64(max_compaction_output_to_input_ratio);
329330
DECLARE_uint64(batch_protection_bytes_per_key);
330331
DECLARE_uint32(memtable_protection_bytes_per_key);
331332
DECLARE_uint32(block_protection_bytes_per_key);

0 commit comments

Comments
 (0)