Skip to content

Commit 58ca641

Browse files
committed
Make Log::Reader more robust
Summary: This diff does two things: (1) Log::Reader does not report a corruption when the last record in a log or manifest file is truncated (meaning that log writer died in the middle of the write). Inherited the code from LevelDB: https://code.google.com/p/leveldb/source/detail?r=269fc6ca9416129248db5ca57050cd5d39d177c8# (2) Turn off mmap writes for all writes to log and manifest files (2) is necessary because if we use mmap writes, the last record is not truncated, but is actually filled with zeros, making checksum fail. It is hard to recover from checksum failing. Test Plan: Added unit tests from LevelDB Actually recovered a "corrupted" MANIFEST file. Reviewers: dhruba, haobo Reviewed By: haobo CC: leveldb Differential Revision: https://reviews.facebook.net/D16119
1 parent a77527f commit 58ca641

File tree

8 files changed

+74
-26
lines changed

8 files changed

+74
-26
lines changed

HISTORY.md

+5
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,11 @@
99
* Added is_manual_compaction to CompactionFilter::Context
1010
* Added "virtual void WaitForJoin() = 0" in class Env
1111

12+
### New Features
13+
* If we find one truncated record at the end of the MANIFEST or WAL files,
14+
we will ignore it. We assume that writers of these records were interrupted
15+
and that we can safely ignore it.
16+
1217
## 2.7.0 (01/28/2014)
1318

1419
### Public API changes

db/db_impl.cc

+6-10
Original file line numberDiff line numberDiff line change
@@ -441,7 +441,8 @@ Status DBImpl::NewDB() {
441441

442442
const std::string manifest = DescriptorFileName(dbname_, 1);
443443
unique_ptr<WritableFile> file;
444-
Status s = env_->NewWritableFile(manifest, &file, storage_options_);
444+
Status s = env_->NewWritableFile(manifest, &file,
445+
storage_options_.AdaptForLogWrite());
445446
if (!s.ok()) {
446447
return s;
447448
}
@@ -3524,11 +3525,9 @@ Status DBImpl::MakeRoomForWrite(bool force,
35243525
SuperVersion* new_superversion = nullptr;
35253526
mutex_.Unlock();
35263527
{
3527-
EnvOptions soptions(storage_options_);
3528-
soptions.use_mmap_writes = false;
35293528
DelayLoggingAndReset();
35303529
s = env_->NewWritableFile(LogFileName(options_.wal_dir, new_log_number),
3531-
&lfile, soptions);
3530+
&lfile, storage_options_.AdaptForLogWrite());
35323531
if (s.ok()) {
35333532
// Our final size should be less than write_buffer_size
35343533
// (compression, etc) but err on the side of caution.
@@ -3784,7 +3783,6 @@ DB::~DB() { }
37843783

37853784
Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) {
37863785
*dbptr = nullptr;
3787-
EnvOptions soptions(options);
37883786

37893787
if (options.block_cache != nullptr && options.no_block_cache) {
37903788
return Status::InvalidArgument(
@@ -3808,12 +3806,10 @@ Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) {
38083806
if (s.ok()) {
38093807
uint64_t new_log_number = impl->versions_->NewFileNumber();
38103808
unique_ptr<WritableFile> lfile;
3811-
soptions.use_mmap_writes = false;
3809+
EnvOptions soptions(options);
38123810
s = impl->options_.env->NewWritableFile(
3813-
LogFileName(impl->options_.wal_dir, new_log_number),
3814-
&lfile,
3815-
soptions
3816-
);
3811+
LogFileName(impl->options_.wal_dir, new_log_number), &lfile,
3812+
soptions.AdaptForLogWrite());
38173813
if (s.ok()) {
38183814
lfile->SetPreallocationBlockSize(1.1 * impl->options_.write_buffer_size);
38193815
VersionEdit edit;

db/log_reader.cc

+17-8
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,9 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch) {
140140

141141
case kEof:
142142
if (in_fragmented_record) {
143-
ReportCorruption(scratch->size(), "partial record without end(3)");
143+
// This can be caused by the writer dying immediately after
144+
// writing a physical record but before completing the next; don't
145+
// treat it as a corruption, just ignore the entire logical record.
144146
scratch->clear();
145147
}
146148
return false;
@@ -264,13 +266,12 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result) {
264266
eof_offset_ = buffer_.size();
265267
}
266268
continue;
267-
} else if (buffer_.size() == 0) {
268-
// End of file
269-
return kEof;
270269
} else {
271-
size_t drop_size = buffer_.size();
270+
// Note that if buffer_ is non-empty, we have a truncated header at the
271+
// end of the file, which can be caused by the writer crashing in the
272+
// middle of writing the header. Instead of considering this an error,
273+
// just report EOF.
272274
buffer_.clear();
273-
ReportCorruption(drop_size, "truncated record at end of file");
274275
return kEof;
275276
}
276277
}
@@ -284,14 +285,22 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result) {
284285
if (kHeaderSize + length > buffer_.size()) {
285286
size_t drop_size = buffer_.size();
286287
buffer_.clear();
287-
ReportCorruption(drop_size, "bad record length");
288-
return kBadRecord;
288+
if (!eof_) {
289+
ReportCorruption(drop_size, "bad record length");
290+
return kBadRecord;
291+
}
292+
// If the end of the file has been reached without reading |length| bytes
293+
// of payload, assume the writer died in the middle of writing the record.
294+
// Don't report a corruption.
295+
return kEof;
289296
}
290297

291298
if (type == kZeroType && length == 0) {
292299
// Skip zero length record without reporting any drops since
293300
// such records are produced by the mmap based writing code in
294301
// env_posix.cc that preallocates file regions.
302+
// NOTE: this should never happen in DB written by new RocksDB versions,
303+
// since we turn off mmap writes to manifest and log files
295304
buffer_.clear();
296305
return kBadRecord;
297306
}

db/log_test.cc

+35-5
Original file line numberDiff line numberDiff line change
@@ -446,20 +446,32 @@ TEST(LogTest, BadRecordType) {
446446
ASSERT_EQ("OK", MatchError("unknown record type"));
447447
}
448448

449-
TEST(LogTest, TruncatedTrailingRecord) {
449+
TEST(LogTest, TruncatedTrailingRecordIsIgnored) {
450450
Write("foo");
451451
ShrinkSize(4); // Drop all payload as well as a header byte
452452
ASSERT_EQ("EOF", Read());
453-
ASSERT_EQ((unsigned int)(kHeaderSize - 1), DroppedBytes());
454-
ASSERT_EQ("OK", MatchError("truncated record at end of file"));
453+
// Truncated last record is ignored, not treated as an error
454+
ASSERT_EQ(0, DroppedBytes());
455+
ASSERT_EQ("", ReportMessage());
455456
}
456457

457458
TEST(LogTest, BadLength) {
459+
const int kPayloadSize = kBlockSize - kHeaderSize;
460+
Write(BigString("bar", kPayloadSize));
461+
Write("foo");
462+
// Least significant size byte is stored in header[4].
463+
IncrementByte(4, 1);
464+
ASSERT_EQ("foo", Read());
465+
ASSERT_EQ(kBlockSize, DroppedBytes());
466+
ASSERT_EQ("OK", MatchError("bad record length"));
467+
}
468+
469+
TEST(LogTest, BadLengthAtEndIsIgnored) {
458470
Write("foo");
459471
ShrinkSize(1);
460472
ASSERT_EQ("EOF", Read());
461-
ASSERT_EQ((unsigned int)(kHeaderSize + 2), DroppedBytes());
462-
ASSERT_EQ("OK", MatchError("bad record length"));
473+
ASSERT_EQ(0, DroppedBytes());
474+
ASSERT_EQ("", ReportMessage());
463475
}
464476

465477
TEST(LogTest, ChecksumMismatch) {
@@ -510,6 +522,24 @@ TEST(LogTest, UnexpectedFirstType) {
510522
ASSERT_EQ("OK", MatchError("partial record without end"));
511523
}
512524

525+
TEST(LogTest, MissingLastIsIgnored) {
526+
Write(BigString("bar", kBlockSize));
527+
// Remove the LAST block, including header.
528+
ShrinkSize(14);
529+
ASSERT_EQ("EOF", Read());
530+
ASSERT_EQ("", ReportMessage());
531+
ASSERT_EQ(0, DroppedBytes());
532+
}
533+
534+
TEST(LogTest, PartialLastIsIgnored) {
535+
Write(BigString("bar", kBlockSize));
536+
// Cause a bad record length in the LAST block.
537+
ShrinkSize(1);
538+
ASSERT_EQ("EOF", Read());
539+
ASSERT_EQ("", ReportMessage());
540+
ASSERT_EQ(0, DroppedBytes());
541+
}
542+
513543
TEST(LogTest, ErrorJoinsRecords) {
514544
// Consider two fragmented records:
515545
// first(R1) last(R1) first(R2) last(R2)

db/repair.cc

+2-2
Original file line numberDiff line numberDiff line change
@@ -241,7 +241,6 @@ class Repairer {
241241
}
242242

243243
void ExtractMetaData() {
244-
std::vector<TableInfo> kept;
245244
for (size_t i = 0; i < table_numbers_.size(); i++) {
246245
TableInfo t;
247246
t.meta.number = table_numbers_[i];
@@ -307,7 +306,8 @@ class Repairer {
307306
Status WriteDescriptor() {
308307
std::string tmp = TempFileName(dbname_, 1);
309308
unique_ptr<WritableFile> file;
310-
Status status = env_->NewWritableFile(tmp, &file, storage_options_);
309+
Status status =
310+
env_->NewWritableFile(tmp, &file, storage_options_.AdaptForLogWrite());
311311
if (!status.ok()) {
312312
return status;
313313
}

db/version_set.cc

+1-1
Original file line numberDiff line numberDiff line change
@@ -1555,7 +1555,7 @@ Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu,
15551555
unique_ptr<WritableFile> descriptor_file;
15561556
s = env_->NewWritableFile(new_manifest_filename,
15571557
&descriptor_file,
1558-
storage_options_);
1558+
storage_options_.AdaptForLogWrite());
15591559
if (s.ok()) {
15601560
descriptor_log_.reset(new log::Writer(std::move(descriptor_file)));
15611561
s = WriteSnapshot(descriptor_log_.get());

include/rocksdb/env.h

+2
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,8 @@ struct EnvOptions {
4949
// construct from Options
5050
explicit EnvOptions(const Options& options);
5151

52+
EnvOptions AdaptForLogWrite() const;
53+
5254
// If true, then allow caching of data in environment buffers
5355
bool use_os_buffer = true;
5456

util/env.cc

+6
Original file line numberDiff line numberDiff line change
@@ -237,6 +237,12 @@ void AssignEnvOptions(EnvOptions* env_options, const Options& options) {
237237

238238
}
239239

240+
EnvOptions EnvOptions::AdaptForLogWrite() const {
241+
EnvOptions adapted = *this;
242+
adapted.use_mmap_writes = false;
243+
return adapted;
244+
}
245+
240246
EnvOptions::EnvOptions(const Options& options) {
241247
AssignEnvOptions(this, options);
242248
}

0 commit comments

Comments
 (0)