@@ -1219,14 +1219,16 @@ Status DBImpl::Recover(
1219
1219
" flag but a log file already exists" );
1220
1220
}
1221
1221
1222
- // Recover in the order in which the logs were generated
1223
- std::sort (logs.begin (), logs.end ());
1224
- for (const auto & log : logs) {
1225
- // The previous incarnation may not have written any MANIFEST
1226
- // records after allocating this log number. So we manually
1227
- // update the file number allocation counter in VersionSet.
1228
- versions_->MarkFileNumberUsed (log );
1229
- s = RecoverLogFile (log , &max_sequence, read_only);
1222
+ if (!logs.empty ()) {
1223
+ // Recover in the order in which the logs were generated
1224
+ std::sort (logs.begin (), logs.end ());
1225
+ s = RecoverLogFiles (logs, &max_sequence, read_only);
1226
+ if (!s.ok ()) {
1227
+ // Clear memtables if recovery failed
1228
+ for (auto cfd : *versions_->GetColumnFamilySet ()) {
1229
+ cfd->CreateNewMemtable ();
1230
+ }
1231
+ }
1230
1232
}
1231
1233
SetTickerCount (stats_, SEQUENCE_NUMBER, versions_->LastSequence ());
1232
1234
}
@@ -1239,8 +1241,9 @@ Status DBImpl::Recover(
1239
1241
return s;
1240
1242
}
1241
1243
1242
- Status DBImpl::RecoverLogFile (uint64_t log_number, SequenceNumber* max_sequence,
1243
- bool read_only) {
1244
+ // REQUIRES: log_numbers are sorted in ascending order
1245
+ Status DBImpl::RecoverLogFiles (const std::vector<uint64_t >& log_numbers,
1246
+ SequenceNumber* max_sequence, bool read_only) {
1244
1247
struct LogReporter : public log ::Reader::Reporter {
1245
1248
Env* env;
1246
1249
Logger* info_log;
@@ -1256,7 +1259,7 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence,
1256
1259
};
1257
1260
1258
1261
mutex_.AssertHeld ();
1259
-
1262
+ Status status;
1260
1263
std::unordered_map<int , VersionEdit> version_edits;
1261
1264
// no need to refcount because iteration is under mutex
1262
1265
for (auto cfd : *versions_->GetColumnFamilySet ()) {
@@ -1265,102 +1268,113 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence,
1265
1268
version_edits.insert ({cfd->GetID (), edit});
1266
1269
}
1267
1270
1268
- // Open the log file
1269
- std::string fname = LogFileName (db_options_.wal_dir , log_number);
1270
- unique_ptr<SequentialFile> file;
1271
- Status status = env_->NewSequentialFile (fname, &file, env_options_);
1272
- if (!status.ok ()) {
1273
- MaybeIgnoreError (&status);
1274
- return status;
1275
- }
1276
-
1277
- // Create the log reader.
1278
- LogReporter reporter;
1279
- reporter.env = env_;
1280
- reporter.info_log = db_options_.info_log .get ();
1281
- reporter.fname = fname.c_str ();
1282
- reporter.status = (db_options_.paranoid_checks &&
1283
- !db_options_.skip_log_error_on_recovery ? &status
1284
- : nullptr );
1285
- // We intentially make log::Reader do checksumming even if
1286
- // paranoid_checks==false so that corruptions cause entire commits
1287
- // to be skipped instead of propagating bad information (like overly
1288
- // large sequence numbers).
1289
- log ::Reader reader (std::move (file), &reporter, true /* checksum*/ ,
1290
- 0 /* initial_offset*/ );
1291
- Log (db_options_.info_log , " Recovering log #%" PRIu64 " " , log_number);
1292
-
1293
- // Read all the records and add to a memtable
1294
- std::string scratch;
1295
- Slice record;
1296
- WriteBatch batch;
1297
- while (reader.ReadRecord (&record, &scratch)) {
1298
- if (record.size () < 12 ) {
1299
- reporter.Corruption (record.size (),
1300
- Status::Corruption (" log record too small" ));
1301
- continue ;
1271
+ for (auto log_number : log_numbers) {
1272
+ // The previous incarnation may not have written any MANIFEST
1273
+ // records after allocating this log number. So we manually
1274
+ // update the file number allocation counter in VersionSet.
1275
+ versions_->MarkFileNumberUsed (log_number);
1276
+ // Open the log file
1277
+ std::string fname = LogFileName (db_options_.wal_dir , log_number);
1278
+ unique_ptr<SequentialFile> file;
1279
+ status = env_->NewSequentialFile (fname, &file, env_options_);
1280
+ if (!status.ok ()) {
1281
+ MaybeIgnoreError (&status);
1282
+ if (!status.ok ()) {
1283
+ return status;
1284
+ } else {
1285
+ // Fail with one log file, but that's ok.
1286
+ // Try next one.
1287
+ continue ;
1288
+ }
1302
1289
}
1303
- WriteBatchInternal::SetContents (&batch, record);
1304
1290
1305
- // If column family was not found, it might mean that the WAL write
1306
- // batch references to the column family that was dropped after the
1307
- // insert. We don't want to fail the whole write batch in that case -- we
1308
- // just ignore the update. That's why we set ignore missing column families
1309
- // to true
1310
- status = WriteBatchInternal::InsertInto (
1311
- &batch, column_family_memtables_.get (),
1312
- true /* ignore missing column families */ , log_number);
1291
+ // Create the log reader.
1292
+ LogReporter reporter;
1293
+ reporter.env = env_;
1294
+ reporter.info_log = db_options_.info_log .get ();
1295
+ reporter.fname = fname.c_str ();
1296
+ reporter.status =
1297
+ (db_options_.paranoid_checks && !db_options_.skip_log_error_on_recovery
1298
+ ? &status
1299
+ : nullptr );
1300
+ // We intentionally make log::Reader do checksumming even if
1301
+ // paranoid_checks==false so that corruptions cause entire commits
1302
+ // to be skipped instead of propagating bad information (like overly
1303
+ // large sequence numbers).
1304
+ log ::Reader reader (std::move (file), &reporter, true /* checksum*/ ,
1305
+ 0 /* initial_offset*/ );
1306
+ Log (db_options_.info_log , " Recovering log #%" PRIu64 " " , log_number);
1307
+
1308
+ // Read all the records and add to a memtable
1309
+ std::string scratch;
1310
+ Slice record;
1311
+ WriteBatch batch;
1312
+ while (reader.ReadRecord (&record, &scratch)) {
1313
+ if (record.size () < 12 ) {
1314
+ reporter.Corruption (record.size (),
1315
+ Status::Corruption (" log record too small" ));
1316
+ continue ;
1317
+ }
1318
+ WriteBatchInternal::SetContents (&batch, record);
1313
1319
1314
- MaybeIgnoreError (&status);
1315
- if (!status.ok ()) {
1316
- return status;
1317
- }
1318
- const SequenceNumber last_seq =
1319
- WriteBatchInternal::Sequence (&batch) +
1320
- WriteBatchInternal::Count (&batch) - 1 ;
1321
- if (last_seq > *max_sequence) {
1322
- *max_sequence = last_seq;
1323
- }
1320
+ // If column family was not found, it might mean that the WAL write
1321
+ // batch references to the column family that was dropped after the
1322
+ // insert. We don't want to fail the whole write batch in that case --
1323
+ // we just ignore the update.
1324
+ // That's why we set ignore missing column families to true
1325
+ status = WriteBatchInternal::InsertInto (
1326
+ &batch, column_family_memtables_.get (), true , log_number);
1324
1327
1325
- if (!read_only) {
1326
- // no need to refcount since client still doesn't have access
1327
- // to the DB and can not drop column families while we iterate
1328
- for (auto cfd : *versions_->GetColumnFamilySet ()) {
1329
- if (cfd->mem ()->ShouldFlush ()) {
1330
- // If this asserts, it means that InsertInto failed in
1331
- // filtering updates to already-flushed column families
1332
- assert (cfd->GetLogNumber () <= log_number);
1333
- auto iter = version_edits.find (cfd->GetID ());
1334
- assert (iter != version_edits.end ());
1335
- VersionEdit* edit = &iter->second ;
1336
- status = WriteLevel0TableForRecovery (cfd, cfd->mem (), edit);
1337
- // we still want to clear the memtable, even if the recovery failed
1338
- cfd->CreateNewMemtable ();
1339
- if (!status.ok ()) {
1340
- // Reflect errors immediately so that conditions like full
1341
- // file-systems cause the DB::Open() to fail.
1342
- return status;
1328
+ MaybeIgnoreError (&status);
1329
+ if (!status.ok ()) {
1330
+ return status;
1331
+ }
1332
+ const SequenceNumber last_seq = WriteBatchInternal::Sequence (&batch) +
1333
+ WriteBatchInternal::Count (&batch) - 1 ;
1334
+ if (last_seq > *max_sequence) {
1335
+ *max_sequence = last_seq;
1336
+ }
1337
+
1338
+ if (!read_only) {
1339
+ // no need to refcount since client still doesn't have access
1340
+ // to the DB and can not drop column families while we iterate
1341
+ for (auto cfd : *versions_->GetColumnFamilySet ()) {
1342
+ if (cfd->mem ()->ShouldFlush ()) {
1343
+ // If this asserts, it means that InsertInto failed in
1344
+ // filtering updates to already-flushed column families
1345
+ assert (cfd->GetLogNumber () <= log_number);
1346
+ auto iter = version_edits.find (cfd->GetID ());
1347
+ assert (iter != version_edits.end ());
1348
+ VersionEdit* edit = &iter->second ;
1349
+ status = WriteLevel0TableForRecovery (cfd, cfd->mem (), edit);
1350
+ if (!status.ok ()) {
1351
+ // Reflect errors immediately so that conditions like full
1352
+ // file-systems cause the DB::Open() to fail.
1353
+ return status;
1354
+ }
1355
+ cfd->CreateNewMemtable ();
1343
1356
}
1344
1357
}
1345
1358
}
1346
1359
}
1347
- }
1348
1360
1349
- if (versions_->LastSequence () < *max_sequence) {
1350
- versions_->SetLastSequence (*max_sequence);
1361
+ if (versions_->LastSequence () < *max_sequence) {
1362
+ versions_->SetLastSequence (*max_sequence);
1363
+ }
1351
1364
}
1352
1365
1353
1366
if (!read_only) {
1354
1367
// no need to refcount since client still doesn't have access
1355
1368
// to the DB and can not drop column families while we iterate
1369
+ auto max_log_number = log_numbers.back ();
1356
1370
for (auto cfd : *versions_->GetColumnFamilySet ()) {
1357
1371
auto iter = version_edits.find (cfd->GetID ());
1358
1372
assert (iter != version_edits.end ());
1359
1373
VersionEdit* edit = &iter->second ;
1360
1374
1361
- if (cfd->GetLogNumber () > log_number ) {
1375
+ if (cfd->GetLogNumber () > max_log_number ) {
1362
1376
// Column family cfd has already flushed the data
1363
- // from log_number . Memtable has to be empty because
1377
+ // from all logs . Memtable has to be empty because
1364
1378
// we filter the updates based on log_number
1365
1379
// (in WriteBatch::InsertInto)
1366
1380
assert (cfd->mem ()->GetFirstSequenceNumber () == 0 );
@@ -1371,28 +1385,29 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence,
1371
1385
// flush the final memtable (if non-empty)
1372
1386
if (cfd->mem ()->GetFirstSequenceNumber () != 0 ) {
1373
1387
status = WriteLevel0TableForRecovery (cfd, cfd->mem (), edit);
1374
- }
1375
- // we still want to clear the memtable, even if the recovery failed
1376
- cfd-> CreateNewMemtable () ;
1377
- if (!status. ok ()) {
1378
- return status ;
1388
+ if (!status. ok ()) {
1389
+ // Recovery failed
1390
+ break ;
1391
+ }
1392
+ cfd-> CreateNewMemtable () ;
1379
1393
}
1380
1394
1381
1395
// write MANIFEST with update
1382
- // writing log number in the manifest means that any log file
1396
+ // writing log_number in the manifest means that any log file
1383
1397
// with number strongly less than (log_number + 1) is already
1384
1398
// recovered and should be ignored on next reincarnation.
1385
- // Since we already recovered log_number , we want all logs
1386
- // with numbers `<= log_number ` (includes this one) to be ignored
1387
- edit->SetLogNumber (log_number + 1 );
1399
+ // Since we already recovered max_log_number , we want all logs
1400
+ // with numbers `<= max_log_number ` (includes this one) to be ignored
1401
+ edit->SetLogNumber (max_log_number + 1 );
1388
1402
// we must mark the next log number as used, even though it's
1389
1403
// not actually used. that is because VersionSet assumes
1390
1404
// VersionSet::next_file_number_ always to be strictly greater than any
1391
1405
// log number
1392
- versions_->MarkFileNumberUsed (log_number + 1 );
1406
+ versions_->MarkFileNumberUsed (max_log_number + 1 );
1393
1407
status = versions_->LogAndApply (cfd, edit, &mutex_);
1394
1408
if (!status.ok ()) {
1395
- return status;
1409
+ // Recovery failed
1410
+ break ;
1396
1411
}
1397
1412
}
1398
1413
}
0 commit comments