Skip to content

Commit 9f690ec

Browse files
committed
Fix a deadlock in CompactRange()
Summary: The way DBImpl::TEST_CompactRange() throttles down the number of bg compactions can cause it to deadlock when CompactRange() is called concurrently from multiple threads. Imagine a following scenario with only two threads (max_background_compactions is 10 and bg_compaction_scheduled_ is initially 0): 1. Thread #1 increments bg_compaction_scheduled_ (to LargeNumber), sets bg_compaction_scheduled_ to 9 (newvalue), schedules the compaction (bg_compaction_scheduled_ is now 10) and waits for it to complete. 2. Thread #2 calls TEST_CompactRange(), increments bg_compaction_scheduled_ (now LargeNumber + 10) and waits on a cv for bg_compaction_scheduled_ to drop to LargeNumber. 3. BG thread completes the first manual compaction, decrements bg_compaction_scheduled_ and wakes up all threads waiting on bg_cv_. Thread #1 runs, increments bg_compaction_scheduled_ by LargeNumber again (now 2*LargeNumber + 9). Since that's more than LargeNumber + newvalue, thread #2 also goes to sleep (waiting on bg_cv_), without resetting bg_compaction_scheduled_. This diff attempts to address the problem by introducing a new counter bg_manual_only_ (when positive, MaybeScheduleFlushOrCompaction() will only schedule manual compactions). Test Plan: I could pretty much consistently reproduce the deadlock with a program that calls CompactRange(nullptr, nullptr) immediately after Write() from multiple threads. This no longer happens with this patch. Tests (make check) pass. Reviewers: dhruba, igor, sdong, haobo Reviewed By: igor CC: leveldb Differential Revision: https://reviews.facebook.net/D14799
1 parent c370f55 commit 9f690ec

File tree

2 files changed

+44
-34
lines changed

2 files changed

+44
-34
lines changed

db/db_impl.cc

+38-33
Original file line numberDiff line numberDiff line change
@@ -244,6 +244,7 @@ DBImpl::DBImpl(const Options& options, const std::string& dbname)
244244
super_version_(nullptr),
245245
tmp_batch_(),
246246
bg_compaction_scheduled_(0),
247+
bg_manual_only_(0),
247248
bg_flush_scheduled_(0),
248249
bg_logstats_scheduled_(false),
249250
manual_compaction_(nullptr),
@@ -1600,45 +1601,44 @@ void DBImpl::TEST_CompactRange(int level, const Slice* begin,const Slice* end) {
16001601

16011602
MutexLock l(&mutex_);
16021603

1603-
// When a manual compaction arrives, temporarily throttle down
1604-
// the number of background compaction threads to 1. This is
1605-
// needed to ensure that this manual compaction can compact
1606-
// any range of keys/files. We artificialy increase
1607-
// bg_compaction_scheduled_ by a large number, this causes
1608-
// the system to have a single background thread. Now,
1609-
// this manual compaction can progress without stomping
1610-
// on any other concurrent compactions.
1611-
const int LargeNumber = 10000000;
1612-
const int newvalue = options_.max_background_compactions-1;
1613-
bg_compaction_scheduled_ += LargeNumber;
1614-
while (bg_compaction_scheduled_ > LargeNumber) {
1615-
Log(options_.info_log, "Manual compaction request waiting for background threads to fall below 1");
1604+
// When a manual compaction arrives, temporarily disable scheduling of
1605+
// non-manual compactions and wait until the number of scheduled compaction
1606+
// jobs drops to zero. This is needed to ensure that this manual compaction
1607+
// can compact any range of keys/files.
1608+
//
1609+
// bg_manual_only_ is non-zero when at least one thread is inside
1610+
// TEST_CompactRange(), i.e. during that time no other compaction will
1611+
// get scheduled (see MaybeScheduleFlushOrCompaction).
1612+
//
1613+
// Note that the following loop doesn't stop more that one thread calling
1614+
// TEST_CompactRange() from getting to the second while loop below.
1615+
// However, only one of them will actually schedule compaction, while
1616+
// others will wait on a condition variable until it completes.
1617+
1618+
++bg_manual_only_;
1619+
while (bg_compaction_scheduled_ > 0) {
1620+
Log(options_.info_log,
1621+
"Manual compaction waiting for all other scheduled background "
1622+
"compactions to finish");
16161623
bg_cv_.Wait();
16171624
}
1625+
16181626
Log(options_.info_log, "Manual compaction starting");
16191627

1620-
while (!manual.done) {
1621-
while (manual_compaction_ != nullptr) {
1622-
bg_cv_.Wait();
1623-
}
1624-
manual_compaction_ = &manual;
1625-
if (bg_compaction_scheduled_ == LargeNumber) {
1626-
bg_compaction_scheduled_ = newvalue;
1627-
}
1628-
MaybeScheduleFlushOrCompaction();
1629-
while (manual_compaction_ == &manual) {
1628+
while (!manual.done && !shutting_down_.Acquire_Load() && bg_error_.ok()) {
1629+
assert(bg_manual_only_ > 0);
1630+
if (manual_compaction_ != nullptr) {
1631+
// Running either this or some other manual compaction
16301632
bg_cv_.Wait();
1633+
} else {
1634+
manual_compaction_ = &manual;
1635+
MaybeScheduleFlushOrCompaction();
16311636
}
16321637
}
1633-
assert(!manual.in_progress);
16341638

1635-
// wait till there are no background threads scheduled
1636-
bg_compaction_scheduled_ += LargeNumber;
1637-
while (bg_compaction_scheduled_ > LargeNumber + newvalue) {
1638-
Log(options_.info_log, "Manual compaction resetting background threads");
1639-
bg_cv_.Wait();
1640-
}
1641-
bg_compaction_scheduled_ = 0;
1639+
assert(!manual.in_progress);
1640+
assert(bg_manual_only_ > 0);
1641+
--bg_manual_only_;
16421642
}
16431643

16441644
Status DBImpl::FlushMemTable(const FlushOptions& options) {
@@ -1703,11 +1703,16 @@ void DBImpl::MaybeScheduleFlushOrCompaction() {
17031703
env_->Schedule(&DBImpl::BGWorkFlush, this, Env::Priority::HIGH);
17041704
}
17051705

1706+
// Schedule BGWorkCompaction if there's a compaction pending (or a memtable
1707+
// flush, but the HIGH pool is not enabled). Do it only if
1708+
// max_background_compactions hasn't been reached and, in case
1709+
// bg_manual_only_ > 0, if it's a manual compaction.
17061710
if ((manual_compaction_ ||
17071711
versions_->NeedsCompaction() ||
17081712
(is_flush_pending && (options_.max_background_flushes <= 0))) &&
1709-
bg_compaction_scheduled_ < options_.max_background_compactions) {
1710-
// compaction needed, or memtable flush needed but HIGH pool not enabled.
1713+
bg_compaction_scheduled_ < options_.max_background_compactions &&
1714+
(!bg_manual_only_ || manual_compaction_)) {
1715+
17111716
bg_compaction_scheduled_++;
17121717
env_->Schedule(&DBImpl::BGWorkCompaction, this, Env::Priority::LOW);
17131718
}

db/db_impl.h

+6-1
Original file line numberDiff line numberDiff line change
@@ -388,9 +388,14 @@ class DBImpl : public DB {
388388
// part of ongoing compactions.
389389
std::set<uint64_t> pending_outputs_;
390390

391-
// count how many background compaction been scheduled or is running?
391+
// count how many background compactions are running or have been scheduled
392392
int bg_compaction_scheduled_;
393393

394+
// If non-zero, MaybeScheduleFlushOrCompaction() will only schedule manual
395+
// compactions (if manual_compaction_ is not null). This mechanism enables
396+
// manual compactions to wait until all other compactions are finished.
397+
int bg_manual_only_;
398+
394399
// number of background memtable flush jobs, submitted to the HIGH pool
395400
int bg_flush_scheduled_;
396401

0 commit comments

Comments
 (0)