Skip to content

Commit 98a80e9

Browse files
Little-Wallace authored and tabokie committed
Do not hold mutex when write keys if not necessary (facebook#7516)
Summary: RocksDB acquires the global mutex of the DB instance every time the user calls `Write`. When RocksDB schedules a lot of compaction jobs, they compete with the write thread for the mutex, which hurts write performance. I want to use log_write_mutex to replace the global mutex in most cases, so that we do not acquire it in the write thread unless a write-stall event or a write-buffer-full event occurs. Pull Request resolved: facebook#7516 Test Plan: 1. make check 2. CI 3. COMPILE_WITH_TSAN=1 make db_stress make crash_test make crash_test_with_multiops_wp_txn make crash_test_with_multiops_wc_txn make crash_test_with_atomic_flush Reviewed By: siying Differential Revision: D36908702 Pulled By: riversand963 fbshipit-source-id: 59b13881f4f5c0a58fd3ca79128a396d9cd98efe Signed-off-by: tabokie <[email protected]>
1 parent 1022df7 commit 98a80e9

17 files changed

+357
-220
lines changed

HISTORY.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121

2222
## Behavior Changes
2323
* For track_and_verify_wals_in_manifest, revert to the original behavior before #10087: syncing of live WAL file is not tracked, and we track only the synced sizes of **closed** WALs. (PR #10330).
24+
* DB::Write does not hold the global `mutex_` if this DB instance does not need to switch the WAL and memtable (#7516).
2425

2526
## 6.29.5 (03/29/2022)
2627
### Bug Fixes

db/db_compaction_test.cc

Lines changed: 3 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -5333,18 +5333,10 @@ TEST_P(DBCompactionTestWithParam, FixFileIngestionCompactionDeadlock) {
53335333
for (int j = 0; j != kNumKeysPerFile; ++j) {
53345334
ASSERT_OK(Put(Key(j), rnd.RandomString(990)));
53355335
}
5336-
if (0 == i) {
5337-
// When we reach here, the memtables have kNumKeysPerFile keys. Note that
5338-
// flush is not yet triggered. We need to write an extra key so that the
5339-
// write path will call PreprocessWrite and flush the previous key-value
5340-
// pairs to e flushed. After that, there will be the newest key in the
5341-
// memtable, and a bunch of L0 files. Since there is already one key in
5342-
// the memtable, then for i = 1, 2, ..., we do not have to write this
5343-
// extra key to trigger flush.
5344-
ASSERT_OK(Put("", ""));
5336+
if (i > 0) {
5337+
ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
5338+
ASSERT_EQ(NumTableFilesAtLevel(0 /*level*/, 0 /*cf*/), i);
53455339
}
5346-
ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
5347-
ASSERT_EQ(NumTableFilesAtLevel(0 /*level*/, 0 /*cf*/), i + 1);
53485340
}
53495341
// When we reach this point, there will be level0_stop_writes_trigger L0
53505342
// files and one extra key (99) in memory, which overlaps with the external

db/db_impl/db_impl.cc

Lines changed: 54 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,7 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname,
181181
log_dir_synced_(false),
182182
log_empty_(true),
183183
persist_stats_cf_handle_(nullptr),
184-
log_sync_cv_(&mutex_),
184+
log_sync_cv_(&log_write_mutex_),
185185
total_log_size_(0),
186186
is_snapshot_supported_(true),
187187
write_buffer_manager_(immutable_db_options_.write_buffer_manager.get()),
@@ -271,6 +271,8 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname,
271271
mutable_db_options_.Dump(immutable_db_options_.info_log.get());
272272
DumpSupportInfo(immutable_db_options_.info_log.get());
273273

274+
max_total_wal_size_.store(mutable_db_options_.max_total_wal_size,
275+
std::memory_order_relaxed);
274276
if (write_buffer_manager_) {
275277
wbm_stall_.reset(new WBMStallInterface());
276278
}
@@ -641,26 +643,28 @@ Status DBImpl::CloseHelper() {
641643
job_context.Clean();
642644
mutex_.Lock();
643645
}
644-
645-
for (auto l : logs_to_free_) {
646-
delete l;
647-
}
648-
for (auto& log : logs_) {
649-
uint64_t log_number = log.writer->get_log_number();
650-
Status s = log.ClearWriter();
651-
if (!s.ok()) {
652-
ROCKS_LOG_WARN(
653-
immutable_db_options_.info_log,
654-
"Unable to Sync WAL file %s with error -- %s",
655-
LogFileName(immutable_db_options_.GetWalDir(), log_number).c_str(),
656-
s.ToString().c_str());
657-
// Retain the first error
658-
if (ret.ok()) {
659-
ret = s;
646+
{
647+
InstrumentedMutexLock lock(&log_write_mutex_);
648+
for (auto l : logs_to_free_) {
649+
delete l;
650+
}
651+
for (auto& log : logs_) {
652+
uint64_t log_number = log.writer->get_log_number();
653+
Status s = log.ClearWriter();
654+
if (!s.ok()) {
655+
ROCKS_LOG_WARN(
656+
immutable_db_options_.info_log,
657+
"Unable to Sync WAL file %s with error -- %s",
658+
LogFileName(immutable_db_options_.GetWalDir(), log_number).c_str(),
659+
s.ToString().c_str());
660+
// Retain the first error
661+
if (ret.ok()) {
662+
ret = s;
663+
}
660664
}
661665
}
666+
logs_.clear();
662667
}
663-
logs_.clear();
664668

665669
// Table cache may have table handles holding blocks from the block cache.
666670
// We need to release them before the block cache is destroyed. The block
@@ -1046,6 +1050,7 @@ Status DBImpl::TablesRangeTombstoneSummary(ColumnFamilyHandle* column_family,
10461050
}
10471051

10481052
void DBImpl::ScheduleBgLogWriterClose(JobContext* job_context) {
1053+
mutex_.AssertHeld();
10491054
if (!job_context->logs_to_free.empty()) {
10501055
for (auto l : job_context->logs_to_free) {
10511056
AddToLogsToFreeQueue(l);
@@ -1225,6 +1230,11 @@ Status DBImpl::SetDBOptions(
12251230
new_options.stats_persist_period_sec);
12261231
mutex_.Lock();
12271232
}
1233+
if (new_options.max_total_wal_size !=
1234+
mutable_db_options_.max_total_wal_size) {
1235+
max_total_wal_size_.store(new_options.max_total_wal_size,
1236+
std::memory_order_release);
1237+
}
12281238
write_controller_.set_max_delayed_write_rate(
12291239
new_options.delayed_write_rate);
12301240
table_cache_.get()->SetCapacity(new_options.max_open_files == -1
@@ -1345,7 +1355,7 @@ Status DBImpl::SyncWAL() {
13451355
uint64_t current_log_number;
13461356

13471357
{
1348-
InstrumentedMutexLock l(&mutex_);
1358+
InstrumentedMutexLock l(&log_write_mutex_);
13491359
assert(!logs_.empty());
13501360

13511361
// This SyncWAL() call only cares about logs up to this number.
@@ -1402,19 +1412,37 @@ Status DBImpl::SyncWAL() {
14021412
TEST_SYNC_POINT("DBWALTest::SyncWALNotWaitWrite:2");
14031413

14041414
TEST_SYNC_POINT("DBImpl::SyncWAL:BeforeMarkLogsSynced:1");
1415+
VersionEdit synced_wals;
14051416
{
1406-
InstrumentedMutexLock l(&mutex_);
1417+
InstrumentedMutexLock l(&log_write_mutex_);
14071418
if (status.ok()) {
1408-
status = MarkLogsSynced(current_log_number, need_log_dir_sync);
1419+
MarkLogsSynced(current_log_number, need_log_dir_sync, &synced_wals);
14091420
} else {
14101421
MarkLogsNotSynced(current_log_number);
14111422
}
14121423
}
1424+
if (status.ok() && synced_wals.IsWalAddition()) {
1425+
InstrumentedMutexLock l(&mutex_);
1426+
status = ApplyWALToManifest(&synced_wals);
1427+
}
1428+
14131429
TEST_SYNC_POINT("DBImpl::SyncWAL:BeforeMarkLogsSynced:2");
14141430

14151431
return status;
14161432
}
14171433

1434+
Status DBImpl::ApplyWALToManifest(VersionEdit* synced_wals) {
1435+
// not empty, write to MANIFEST.
1436+
mutex_.AssertHeld();
1437+
Status status =
1438+
versions_->LogAndApplyToDefaultColumnFamily(synced_wals, &mutex_);
1439+
if (!status.ok() && versions_->io_status().IsIOError()) {
1440+
status = error_handler_.SetBGError(versions_->io_status(),
1441+
BackgroundErrorReason::kManifestWrite);
1442+
}
1443+
return status;
1444+
}
1445+
14181446
Status DBImpl::LockWAL() {
14191447
log_write_mutex_.Lock();
14201448
auto cur_log_writer = logs_.back().writer;
@@ -1434,24 +1462,22 @@ Status DBImpl::UnlockWAL() {
14341462
return Status::OK();
14351463
}
14361464

1437-
Status DBImpl::MarkLogsSynced(uint64_t up_to, bool synced_dir) {
1438-
mutex_.AssertHeld();
1465+
void DBImpl::MarkLogsSynced(uint64_t up_to, bool synced_dir,
1466+
VersionEdit* synced_wals) {
1467+
log_write_mutex_.AssertHeld();
14391468
if (synced_dir && logfile_number_ == up_to) {
14401469
log_dir_synced_ = true;
14411470
}
1442-
VersionEdit synced_wals;
14431471
for (auto it = logs_.begin(); it != logs_.end() && it->number <= up_to;) {
14441472
auto& wal = *it;
14451473
assert(wal.IsSyncing());
14461474

14471475
if (logs_.size() > 1) {
14481476
if (immutable_db_options_.track_and_verify_wals_in_manifest &&
14491477
wal.GetPreSyncSize() > 0) {
1450-
synced_wals.AddWal(wal.number, WalMetadata(wal.GetPreSyncSize()));
1478+
synced_wals->AddWal(wal.number, WalMetadata(wal.GetPreSyncSize()));
14511479
}
14521480
logs_to_free_.push_back(wal.ReleaseWriter());
1453-
// To modify logs_ both mutex_ and log_write_mutex_ must be held
1454-
InstrumentedMutexLock l(&log_write_mutex_);
14551481
it = logs_.erase(it);
14561482
} else {
14571483
wal.FinishSync();
@@ -1460,22 +1486,11 @@ Status DBImpl::MarkLogsSynced(uint64_t up_to, bool synced_dir) {
14601486
}
14611487
assert(logs_.empty() || logs_[0].number > up_to ||
14621488
(logs_.size() == 1 && !logs_[0].IsSyncing()));
1463-
1464-
Status s;
1465-
if (synced_wals.IsWalAddition()) {
1466-
// not empty, write to MANIFEST.
1467-
s = versions_->LogAndApplyToDefaultColumnFamily(&synced_wals, &mutex_);
1468-
if (!s.ok() && versions_->io_status().IsIOError()) {
1469-
s = error_handler_.SetBGError(versions_->io_status(),
1470-
BackgroundErrorReason::kManifestWrite);
1471-
}
1472-
}
14731489
log_sync_cv_.SignalAll();
1474-
return s;
14751490
}
14761491

14771492
void DBImpl::MarkLogsNotSynced(uint64_t up_to) {
1478-
mutex_.AssertHeld();
1493+
log_write_mutex_.AssertHeld();
14791494
for (auto it = logs_.begin(); it != logs_.end() && it->number <= up_to;
14801495
++it) {
14811496
auto& wal = *it;

0 commit comments

Comments
 (0)