Skip to content

Commit d343c3f

Browse files
Improve db recovery
Summary: Avoid creating unnecessary SST files while opening the DB.
Test Plan: make all check
Reviewers: sdong, igor
Reviewed By: igor
Subscribers: zagfox, yhchiang, ljin, leveldb
Differential Revision: https://reviews.facebook.net/D20661
1 parent 6bb7e3e commit d343c3f

File tree

3 files changed

+243
-103
lines changed

3 files changed

+243
-103
lines changed

db/db_impl.cc

+113-98
Original file line numberDiff line numberDiff line change
@@ -1219,14 +1219,16 @@ Status DBImpl::Recover(
12191219
"flag but a log file already exists");
12201220
}
12211221

1222-
// Recover in the order in which the logs were generated
1223-
std::sort(logs.begin(), logs.end());
1224-
for (const auto& log : logs) {
1225-
// The previous incarnation may not have written any MANIFEST
1226-
// records after allocating this log number. So we manually
1227-
// update the file number allocation counter in VersionSet.
1228-
versions_->MarkFileNumberUsed(log);
1229-
s = RecoverLogFile(log, &max_sequence, read_only);
1222+
if (!logs.empty()) {
1223+
// Recover in the order in which the logs were generated
1224+
std::sort(logs.begin(), logs.end());
1225+
s = RecoverLogFiles(logs, &max_sequence, read_only);
1226+
if (!s.ok()) {
1227+
// Clear memtables if recovery failed
1228+
for (auto cfd : *versions_->GetColumnFamilySet()) {
1229+
cfd->CreateNewMemtable();
1230+
}
1231+
}
12301232
}
12311233
SetTickerCount(stats_, SEQUENCE_NUMBER, versions_->LastSequence());
12321234
}
@@ -1239,8 +1241,9 @@ Status DBImpl::Recover(
12391241
return s;
12401242
}
12411243

1242-
Status DBImpl::RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence,
1243-
bool read_only) {
1244+
// REQUIRES: log_numbers are sorted in ascending order
1245+
Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
1246+
SequenceNumber* max_sequence, bool read_only) {
12441247
struct LogReporter : public log::Reader::Reporter {
12451248
Env* env;
12461249
Logger* info_log;
@@ -1256,7 +1259,7 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence,
12561259
};
12571260

12581261
mutex_.AssertHeld();
1259-
1262+
Status status;
12601263
std::unordered_map<int, VersionEdit> version_edits;
12611264
// no need to refcount because iteration is under mutex
12621265
for (auto cfd : *versions_->GetColumnFamilySet()) {
@@ -1265,102 +1268,113 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence,
12651268
version_edits.insert({cfd->GetID(), edit});
12661269
}
12671270

1268-
// Open the log file
1269-
std::string fname = LogFileName(db_options_.wal_dir, log_number);
1270-
unique_ptr<SequentialFile> file;
1271-
Status status = env_->NewSequentialFile(fname, &file, env_options_);
1272-
if (!status.ok()) {
1273-
MaybeIgnoreError(&status);
1274-
return status;
1275-
}
1276-
1277-
// Create the log reader.
1278-
LogReporter reporter;
1279-
reporter.env = env_;
1280-
reporter.info_log = db_options_.info_log.get();
1281-
reporter.fname = fname.c_str();
1282-
reporter.status = (db_options_.paranoid_checks &&
1283-
!db_options_.skip_log_error_on_recovery ? &status
1284-
: nullptr);
1285-
// We intentionally make log::Reader do checksumming even if
1286-
// paranoid_checks==false so that corruptions cause entire commits
1287-
// to be skipped instead of propagating bad information (like overly
1288-
// large sequence numbers).
1289-
log::Reader reader(std::move(file), &reporter, true/*checksum*/,
1290-
0/*initial_offset*/);
1291-
Log(db_options_.info_log, "Recovering log #%" PRIu64 "", log_number);
1292-
1293-
// Read all the records and add to a memtable
1294-
std::string scratch;
1295-
Slice record;
1296-
WriteBatch batch;
1297-
while (reader.ReadRecord(&record, &scratch)) {
1298-
if (record.size() < 12) {
1299-
reporter.Corruption(record.size(),
1300-
Status::Corruption("log record too small"));
1301-
continue;
1271+
for (auto log_number : log_numbers) {
1272+
// The previous incarnation may not have written any MANIFEST
1273+
// records after allocating this log number. So we manually
1274+
// update the file number allocation counter in VersionSet.
1275+
versions_->MarkFileNumberUsed(log_number);
1276+
// Open the log file
1277+
std::string fname = LogFileName(db_options_.wal_dir, log_number);
1278+
unique_ptr<SequentialFile> file;
1279+
status = env_->NewSequentialFile(fname, &file, env_options_);
1280+
if (!status.ok()) {
1281+
MaybeIgnoreError(&status);
1282+
if (!status.ok()) {
1283+
return status;
1284+
} else {
1285+
// Fail with one log file, but that's ok.
1286+
// Try next one.
1287+
continue;
1288+
}
13021289
}
1303-
WriteBatchInternal::SetContents(&batch, record);
13041290

1305-
// If column family was not found, it might mean that the WAL write
1306-
// batch references to the column family that was dropped after the
1307-
// insert. We don't want to fail the whole write batch in that case -- we
1308-
// just ignore the update. That's why we set ignore missing column families
1309-
// to true
1310-
status = WriteBatchInternal::InsertInto(
1311-
&batch, column_family_memtables_.get(),
1312-
true /* ignore missing column families */, log_number);
1291+
// Create the log reader.
1292+
LogReporter reporter;
1293+
reporter.env = env_;
1294+
reporter.info_log = db_options_.info_log.get();
1295+
reporter.fname = fname.c_str();
1296+
reporter.status =
1297+
(db_options_.paranoid_checks && !db_options_.skip_log_error_on_recovery
1298+
? &status
1299+
: nullptr);
1300+
// We intentionally make log::Reader do checksumming even if
1301+
// paranoid_checks==false so that corruptions cause entire commits
1302+
// to be skipped instead of propagating bad information (like overly
1303+
// large sequence numbers).
1304+
log::Reader reader(std::move(file), &reporter, true /*checksum*/,
1305+
0 /*initial_offset*/);
1306+
Log(db_options_.info_log, "Recovering log #%" PRIu64 "", log_number);
1307+
1308+
// Read all the records and add to a memtable
1309+
std::string scratch;
1310+
Slice record;
1311+
WriteBatch batch;
1312+
while (reader.ReadRecord(&record, &scratch)) {
1313+
if (record.size() < 12) {
1314+
reporter.Corruption(record.size(),
1315+
Status::Corruption("log record too small"));
1316+
continue;
1317+
}
1318+
WriteBatchInternal::SetContents(&batch, record);
13131319

1314-
MaybeIgnoreError(&status);
1315-
if (!status.ok()) {
1316-
return status;
1317-
}
1318-
const SequenceNumber last_seq =
1319-
WriteBatchInternal::Sequence(&batch) +
1320-
WriteBatchInternal::Count(&batch) - 1;
1321-
if (last_seq > *max_sequence) {
1322-
*max_sequence = last_seq;
1323-
}
1320+
// If column family was not found, it might mean that the WAL write
1321+
// batch refers to a column family that was dropped after the
1322+
// insert. We don't want to fail the whole write batch in that case --
1323+
// we just ignore the update.
1324+
// That's why we set ignore missing column families to true
1325+
status = WriteBatchInternal::InsertInto(
1326+
&batch, column_family_memtables_.get(), true, log_number);
13241327

1325-
if (!read_only) {
1326-
// no need to refcount since client still doesn't have access
1327-
// to the DB and can not drop column families while we iterate
1328-
for (auto cfd : *versions_->GetColumnFamilySet()) {
1329-
if (cfd->mem()->ShouldFlush()) {
1330-
// If this asserts, it means that InsertInto failed in
1331-
// filtering updates to already-flushed column families
1332-
assert(cfd->GetLogNumber() <= log_number);
1333-
auto iter = version_edits.find(cfd->GetID());
1334-
assert(iter != version_edits.end());
1335-
VersionEdit* edit = &iter->second;
1336-
status = WriteLevel0TableForRecovery(cfd, cfd->mem(), edit);
1337-
// we still want to clear the memtable, even if the recovery failed
1338-
cfd->CreateNewMemtable();
1339-
if (!status.ok()) {
1340-
// Reflect errors immediately so that conditions like full
1341-
// file-systems cause the DB::Open() to fail.
1342-
return status;
1328+
MaybeIgnoreError(&status);
1329+
if (!status.ok()) {
1330+
return status;
1331+
}
1332+
const SequenceNumber last_seq = WriteBatchInternal::Sequence(&batch) +
1333+
WriteBatchInternal::Count(&batch) - 1;
1334+
if (last_seq > *max_sequence) {
1335+
*max_sequence = last_seq;
1336+
}
1337+
1338+
if (!read_only) {
1339+
// no need to refcount since client still doesn't have access
1340+
// to the DB and can not drop column families while we iterate
1341+
for (auto cfd : *versions_->GetColumnFamilySet()) {
1342+
if (cfd->mem()->ShouldFlush()) {
1343+
// If this asserts, it means that InsertInto failed in
1344+
// filtering updates to already-flushed column families
1345+
assert(cfd->GetLogNumber() <= log_number);
1346+
auto iter = version_edits.find(cfd->GetID());
1347+
assert(iter != version_edits.end());
1348+
VersionEdit* edit = &iter->second;
1349+
status = WriteLevel0TableForRecovery(cfd, cfd->mem(), edit);
1350+
if (!status.ok()) {
1351+
// Reflect errors immediately so that conditions like full
1352+
// file-systems cause the DB::Open() to fail.
1353+
return status;
1354+
}
1355+
cfd->CreateNewMemtable();
13431356
}
13441357
}
13451358
}
13461359
}
1347-
}
13481360

1349-
if (versions_->LastSequence() < *max_sequence) {
1350-
versions_->SetLastSequence(*max_sequence);
1361+
if (versions_->LastSequence() < *max_sequence) {
1362+
versions_->SetLastSequence(*max_sequence);
1363+
}
13511364
}
13521365

13531366
if (!read_only) {
13541367
// no need to refcount since client still doesn't have access
13551368
// to the DB and can not drop column families while we iterate
1369+
auto max_log_number = log_numbers.back();
13561370
for (auto cfd : *versions_->GetColumnFamilySet()) {
13571371
auto iter = version_edits.find(cfd->GetID());
13581372
assert(iter != version_edits.end());
13591373
VersionEdit* edit = &iter->second;
13601374

1361-
if (cfd->GetLogNumber() > log_number) {
1375+
if (cfd->GetLogNumber() > max_log_number) {
13621376
// Column family cfd has already flushed the data
1363-
// from log_number. Memtable has to be empty because
1377+
// from all logs. Memtable has to be empty because
13641378
// we filter the updates based on log_number
13651379
// (in WriteBatch::InsertInto)
13661380
assert(cfd->mem()->GetFirstSequenceNumber() == 0);
@@ -1371,28 +1385,29 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence,
13711385
// flush the final memtable (if non-empty)
13721386
if (cfd->mem()->GetFirstSequenceNumber() != 0) {
13731387
status = WriteLevel0TableForRecovery(cfd, cfd->mem(), edit);
1374-
}
1375-
// we still want to clear the memtable, even if the recovery failed
1376-
cfd->CreateNewMemtable();
1377-
if (!status.ok()) {
1378-
return status;
1388+
if (!status.ok()) {
1389+
// Recovery failed
1390+
break;
1391+
}
1392+
cfd->CreateNewMemtable();
13791393
}
13801394

13811395
// write MANIFEST with update
1382-
// writing log number in the manifest means that any log file
1396+
// writing log_number in the manifest means that any log file
13831397
// with number strictly less than (log_number + 1) is already
13841398
// recovered and should be ignored on next reincarnation.
1385-
// Since we already recovered log_number, we want all logs
1386-
// with numbers `<= log_number` (includes this one) to be ignored
1387-
edit->SetLogNumber(log_number + 1);
1399+
// Since we already recovered max_log_number, we want all logs
1400+
// with numbers `<= max_log_number` (includes this one) to be ignored
1401+
edit->SetLogNumber(max_log_number + 1);
13881402
// we must mark the next log number as used, even though it's
13891403
// not actually used. that is because VersionSet assumes
13901404
// VersionSet::next_file_number_ always to be strictly greater than any
13911405
// log number
1392-
versions_->MarkFileNumberUsed(log_number + 1);
1406+
versions_->MarkFileNumberUsed(max_log_number + 1);
13931407
status = versions_->LogAndApply(cfd, edit, &mutex_);
13941408
if (!status.ok()) {
1395-
return status;
1409+
// Recovery failed
1410+
break;
13961411
}
13971412
}
13981413
}

db/db_impl.h

+3-2
Original file line numberDiff line numberDiff line change
@@ -344,8 +344,9 @@ class DBImpl : public DB {
344344
DeletionState& deletion_state,
345345
LogBuffer* log_buffer);
346346

347-
Status RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence,
348-
bool read_only);
347+
// REQUIRES: log_numbers are sorted in ascending order
348+
Status RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
349+
SequenceNumber* max_sequence, bool read_only);
349350

350351
// The following two methods are used to flush a memtable to
351352
// storage. The first one is used at database RecoveryTime (when the

0 commit comments

Comments
 (0)