Skip to content

Commit ae25742

Browse files
committed
Fix race condition in manifest roll
Summary: When the manifest is getting rolled the following happens: 1) manifest_file_number_ is assigned to a new manifest number (even though the old one is still current) 2) mutex is unlocked 3) SetCurrentFile() creates temporary file manifest_file_number_.dbtmp 4) SetCurrentFile() renames manifest_file_number_.dbtmp to CURRENT 5) mutex is locked If FindObsoleteFiles happens between (3) and (4) it will: 1) Delete manifest_file_number_.dbtmp (because it's not in pending_outputs_) 2) Delete old manifest (because the manifest_file_number_ already points to a new one) I introduce the concept of prev_manifest_file_number_ that will avoid the race condition. However, we should discuss the future of MANIFEST file rolling. We found some race conditions with it last week and who knows how many more are there. Nobody is using it in production because we don't trust the implementation. Should we even support it? Test Plan: make check Reviewers: ljin, dhruba, haobo, sdong Reviewed By: haobo CC: leveldb Differential Revision: https://reviews.facebook.net/D16929
1 parent 5601bc4 commit ae25742

File tree

4 files changed

+52
-42
lines changed

4 files changed

+52
-42
lines changed

db/db_impl.cc

+14-14
Original file line numberDiff line numberDiff line change
@@ -601,6 +601,8 @@ void DBImpl::FindObsoleteFiles(DeletionState& deletion_state,
601601

602602
// store the current filenum, lognum, etc
603603
deletion_state.manifest_file_number = versions_->ManifestFileNumber();
604+
deletion_state.pending_manifest_file_number =
605+
versions_->PendingManifestFileNumber();
604606
deletion_state.log_number = versions_->LogNumber();
605607
deletion_state.prev_log_number = versions_->PrevLogNumber();
606608

@@ -651,12 +653,10 @@ void DBImpl::PurgeObsoleteFiles(DeletionState& state) {
651653
return;
652654
}
653655

654-
655656
// Now, convert live list to an unordered set, WITHOUT mutex held;
656657
// set is slow.
657-
std::unordered_set<uint64_t> sst_live(
658-
state.sst_live.begin(), state.sst_live.end()
659-
);
658+
std::unordered_set<uint64_t> sst_live(state.sst_live.begin(),
659+
state.sst_live.end());
660660

661661
auto& candidate_files = state.candidate_files;
662662
candidate_files.reserve(
@@ -674,19 +674,15 @@ void DBImpl::PurgeObsoleteFiles(DeletionState& state) {
674674

675675
for (auto file_num : state.log_delete_files) {
676676
if (file_num > 0) {
677-
candidate_files.push_back(
678-
LogFileName(kDumbDbName, file_num).substr(1)
679-
);
677+
candidate_files.push_back(LogFileName(kDumbDbName, file_num).substr(1));
680678
}
681679
}
682680

683681
// dedup state.candidate_files so we don't try to delete the same
684682
// file twice
685683
sort(candidate_files.begin(), candidate_files.end());
686-
candidate_files.erase(
687-
unique(candidate_files.begin(), candidate_files.end()),
688-
candidate_files.end()
689-
);
684+
candidate_files.erase(unique(candidate_files.begin(), candidate_files.end()),
685+
candidate_files.end());
690686

691687
std::vector<std::string> old_info_log_files;
692688

@@ -706,16 +702,20 @@ void DBImpl::PurgeObsoleteFiles(DeletionState& state) {
706702
break;
707703
case kDescriptorFile:
708704
// Keep my manifest file, and any newer incarnations'
709-
// (in case there is a race that allows other incarnations)
705+
// (can happen during manifest roll)
710706
keep = (number >= state.manifest_file_number);
711707
break;
712708
case kTableFile:
713709
keep = (sst_live.find(number) != sst_live.end());
714710
break;
715711
case kTempFile:
716712
// Any temp files that are currently being written to must
717-
// be recorded in pending_outputs_, which is inserted into "live"
718-
keep = (sst_live.find(number) != sst_live.end());
713+
// be recorded in pending_outputs_, which is inserted into "live".
714+
// Also, SetCurrentFile creates a temp file when writing out new
715+
// manifest, which is equal to state.pending_manifest_file_number. We
716+
// should not delete that file
717+
keep = (sst_live.find(number) != sst_live.end()) ||
718+
(number == state.pending_manifest_file_number);
719719
break;
720720
case kInfoLogFile:
721721
keep = true;

db/db_impl.h

+3-1
Original file line numberDiff line numberDiff line change
@@ -230,10 +230,12 @@ class DBImpl : public DB {
230230

231231
// the current manifest_file_number, log_number and prev_log_number
232232
// that corresponds to the set of files in 'live'.
233-
uint64_t manifest_file_number, log_number, prev_log_number;
233+
uint64_t manifest_file_number, pending_manifest_file_number, log_number,
234+
prev_log_number;
234235

235236
explicit DeletionState(bool create_superversion = false) {
236237
manifest_file_number = 0;
238+
pending_manifest_file_number = 0;
237239
log_number = 0;
238240
prev_log_number = 0;
239241
new_superversion =

db/version_set.cc

+28-26
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,10 @@
77
// Use of this source code is governed by a BSD-style license that can be
88
// found in the LICENSE file. See the AUTHORS file for names of contributors.
99

10+
#define __STDC_FORMAT_MACROS
1011
#include "db/version_set.h"
1112

13+
#include <inttypes.h>
1214
#include <algorithm>
1315
#include <climits>
1416
#include <stdio.h>
@@ -1435,6 +1437,7 @@ VersionSet::VersionSet(const std::string& dbname, const Options* options,
14351437
icmp_(*cmp),
14361438
next_file_number_(2),
14371439
manifest_file_number_(0), // Filled by Recover()
1440+
pending_manifest_file_number_(0),
14381441
last_sequence_(0),
14391442
log_number_(0),
14401443
prev_log_number_(0),
@@ -1527,22 +1530,17 @@ Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu,
15271530

15281531
// Initialize new descriptor log file if necessary by creating
15291532
// a temporary file that contains a snapshot of the current version.
1530-
std::string new_manifest_filename;
15311533
uint64_t new_manifest_file_size = 0;
15321534
Status s;
1533-
// we will need this if we are creating new manifest
1534-
uint64_t old_manifest_file_number = manifest_file_number_;
15351535

1536-
// No need to perform this check if a new Manifest is being created anyways.
1536+
assert(pending_manifest_file_number_ == 0);
15371537
if (!descriptor_log_ ||
15381538
manifest_file_size_ > options_->max_manifest_file_size) {
1539+
pending_manifest_file_number_ = NewFileNumber();
1540+
batch_edits.back()->SetNextFile(next_file_number_);
15391541
new_descriptor_log = true;
1540-
manifest_file_number_ = NewFileNumber(); // Change manifest file no.
1541-
}
1542-
1543-
if (new_descriptor_log) {
1544-
new_manifest_filename = DescriptorFileName(dbname_, manifest_file_number_);
1545-
edit->SetNextFile(next_file_number_);
1542+
} else {
1543+
pending_manifest_file_number_ = manifest_file_number_;
15461544
}
15471545

15481546
// Unlock during expensive operations. New writes cannot get here
@@ -1562,10 +1560,11 @@ Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu,
15621560

15631561
// This is fine because everything inside of this block is serialized --
15641562
// only one thread can be here at the same time
1565-
if (!new_manifest_filename.empty()) {
1563+
if (new_descriptor_log) {
15661564
unique_ptr<WritableFile> descriptor_file;
1567-
s = env_->NewWritableFile(new_manifest_filename, &descriptor_file,
1568-
storage_options_.AdaptForLogWrite());
1565+
s = env_->NewWritableFile(
1566+
DescriptorFileName(dbname_, pending_manifest_file_number_),
1567+
&descriptor_file, storage_options_.AdaptForLogWrite());
15691568
if (s.ok()) {
15701569
descriptor_log_.reset(new log::Writer(std::move(descriptor_file)));
15711570
s = WriteSnapshot(descriptor_log_.get());
@@ -1604,7 +1603,7 @@ Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu,
16041603
for (auto& e : batch_edits) {
16051604
std::string record;
16061605
e->EncodeTo(&record);
1607-
if (!ManifestContains(record)) {
1606+
if (!ManifestContains(pending_manifest_file_number_, record)) {
16081607
all_records_in = false;
16091608
break;
16101609
}
@@ -1621,17 +1620,16 @@ Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu,
16211620

16221621
// If we just created a new descriptor file, install it by writing a
16231622
// new CURRENT file that points to it.
1624-
if (s.ok() && !new_manifest_filename.empty()) {
1625-
s = SetCurrentFile(env_, dbname_, manifest_file_number_);
1626-
if (s.ok() && old_manifest_file_number < manifest_file_number_) {
1623+
if (s.ok() && new_descriptor_log) {
1624+
s = SetCurrentFile(env_, dbname_, pending_manifest_file_number_);
1625+
if (s.ok() && pending_manifest_file_number_ > manifest_file_number_) {
16271626
// delete old manifest file
16281627
Log(options_->info_log,
1629-
"Deleting manifest %lu current manifest %lu\n",
1630-
(unsigned long)old_manifest_file_number,
1631-
(unsigned long)manifest_file_number_);
1628+
"Deleting manifest %" PRIu64 " current manifest %" PRIu64 "\n",
1629+
manifest_file_number_, pending_manifest_file_number_);
16321630
// we don't care about an error here, PurgeObsoleteFiles will take care
16331631
// of it later
1634-
env_->DeleteFile(DescriptorFileName(dbname_, old_manifest_file_number));
1632+
env_->DeleteFile(DescriptorFileName(dbname_, manifest_file_number_));
16351633
}
16361634
if (!options_->disableDataSync && db_directory != nullptr) {
16371635
db_directory->Fsync();
@@ -1649,23 +1647,25 @@ Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu,
16491647

16501648
// Install the new version
16511649
if (s.ok()) {
1650+
manifest_file_number_ = pending_manifest_file_number_;
16521651
manifest_file_size_ = new_manifest_file_size;
16531652
AppendVersion(v);
16541653
if (max_log_number_in_batch != 0) {
16551654
assert(log_number_ < max_log_number_in_batch);
16561655
log_number_ = max_log_number_in_batch;
16571656
}
16581657
prev_log_number_ = edit->prev_log_number_;
1659-
16601658
} else {
16611659
Log(options_->info_log, "Error in committing version %lu",
16621660
(unsigned long)v->GetVersionNumber());
16631661
delete v;
1664-
if (!new_manifest_filename.empty()) {
1662+
if (new_descriptor_log) {
16651663
descriptor_log_.reset();
1666-
env_->DeleteFile(new_manifest_filename);
1664+
env_->DeleteFile(
1665+
DescriptorFileName(dbname_, pending_manifest_file_number_));
16671666
}
16681667
}
1668+
pending_manifest_file_number_ = 0;
16691669

16701670
// wake up all the waiting writers
16711671
while (true) {
@@ -2103,8 +2103,10 @@ Status VersionSet::WriteSnapshot(log::Writer* log) {
21032103

21042104
// Opens the mainfest file and reads all records
21052105
// till it finds the record we are looking for.
2106-
bool VersionSet::ManifestContains(const std::string& record) const {
2107-
std::string fname = DescriptorFileName(dbname_, manifest_file_number_);
2106+
bool VersionSet::ManifestContains(uint64_t manifest_file_number,
2107+
const std::string& record) const {
2108+
std::string fname =
2109+
DescriptorFileName(dbname_, manifest_file_number);
21082110
Log(options_->info_log, "ManifestContains: checking %s\n", fname.c_str());
21092111
unique_ptr<SequentialFile> file;
21102112
Status s = env_->NewSequentialFile(fname, &file, storage_options_);

db/version_set.h

+7-1
Original file line numberDiff line numberDiff line change
@@ -327,6 +327,10 @@ class VersionSet {
327327
// Return the current manifest file number
328328
uint64_t ManifestFileNumber() const { return manifest_file_number_; }
329329

330+
uint64_t PendingManifestFileNumber() const {
331+
return pending_manifest_file_number_;
332+
}
333+
330334
// Allocate and return a new file number
331335
uint64_t NewFileNumber() { return next_file_number_++; }
332336

@@ -436,7 +440,8 @@ class VersionSet {
436440

437441
void AppendVersion(Version* v);
438442

439-
bool ManifestContains(const std::string& record) const;
443+
bool ManifestContains(uint64_t manifest_file_number,
444+
const std::string& record) const;
440445

441446
Env* const env_;
442447
const std::string dbname_;
@@ -445,6 +450,7 @@ class VersionSet {
445450
const InternalKeyComparator icmp_;
446451
uint64_t next_file_number_;
447452
uint64_t manifest_file_number_;
453+
uint64_t pending_manifest_file_number_;
448454
std::atomic<uint64_t> last_sequence_;
449455
uint64_t log_number_;
450456
uint64_t prev_log_number_; // 0 or backing store for memtable being compacted

0 commit comments

Comments
 (0)