Skip to content

Commit 3d33da7

Browse files
committed
Fix UnmarkEOF for partial blocks
Summary: Blocks in the transaction log are a fixed size, but the last block in the transaction log file is usually a partial block. When a new record is added after the reader hit the end of the file, a new physical record will be appended to the last block. ReadPhysicalRecord can only read full blocks and assumes that the file position indicator is aligned to the start of a block. If the reader is forced to read further by simply clearing the EOF flag, ReadPhysicalRecord will read a full block starting from somewhere in the middle of a real block, causing it to lose alignment and to have a partial physical record at the end of the read buffer. This will result in length mismatches and checksum failures. When the log file is tailed for replication this will cause the log iterator to become invalid, necessitating the creation of a new iterator which will have to read the log file from scratch. This diff fixes this issue by reading the remaining portion of the last block we read from. This is done when the reader is forced to read further (UnmarkEOF is called). Test Plan: - Added unit tests - Stress test (with replication). Check dbdir/LOG file for corruptions. - Test on test tier Reviewers: emayanke, haobo, dhruba Reviewed By: haobo CC: vamsi, sheki, dhruba, kailiu, igor Differential Revision: https://reviews.facebook.net/D15249
1 parent 832158e commit 3d33da7

File tree

3 files changed

+232
-29
lines changed

3 files changed

+232
-29
lines changed

db/log_reader.cc

+68-2
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@ Reader::Reader(unique_ptr<SequentialFile>&& file, Reporter* reporter,
2828
backing_store_(new char[kBlockSize]),
2929
buffer_(),
3030
eof_(false),
31+
read_error_(false),
32+
eof_offset_(0),
3133
last_record_offset_(0),
3234
end_of_buffer_offset_(0),
3335
initial_offset_(initial_offset) {
@@ -170,6 +172,69 @@ uint64_t Reader::LastRecordOffset() {
170172
return last_record_offset_;
171173
}
172174

175+
void Reader::UnmarkEOF() {
176+
if (read_error_) {
177+
return;
178+
}
179+
180+
eof_ = false;
181+
182+
if (eof_offset_ == 0) {
183+
return;
184+
}
185+
186+
// If the EOF was in the middle of a block (a partial block was read) we have
187+
// to read the rest of the block as ReadPhysicalRecord can only read full
188+
// blocks and expects the file position indicator to be aligned to the start
189+
// of a block.
190+
//
191+
// consumed_bytes + buffer_size() + remaining == kBlockSize
192+
193+
size_t consumed_bytes = eof_offset_ - buffer_.size();
194+
size_t remaining = kBlockSize - eof_offset_;
195+
196+
// backing_store_ is used to concatenate what is left in buffer_ and
197+
// the remainder of the block. If buffer_ already uses backing_store_,
198+
// we just append the new data.
199+
if (buffer_.data() != backing_store_ + consumed_bytes) {
200+
// Buffer_ does not use backing_store_ for storage.
201+
// Copy what is left in buffer_ to backing_store.
202+
memmove(backing_store_ + consumed_bytes, buffer_.data(), buffer_.size());
203+
}
204+
205+
Slice read_buffer;
206+
Status status = file_->Read(remaining, &read_buffer,
207+
backing_store_ + eof_offset_);
208+
209+
size_t added = read_buffer.size();
210+
end_of_buffer_offset_ += added;
211+
212+
if (!status.ok()) {
213+
if (added > 0) {
214+
ReportDrop(added, status);
215+
}
216+
217+
read_error_ = true;
218+
return;
219+
}
220+
221+
if (read_buffer.data() != backing_store_ + eof_offset_) {
222+
// Read did not write to backing_store_
223+
memmove(backing_store_ + eof_offset_, read_buffer.data(),
224+
read_buffer.size());
225+
}
226+
227+
buffer_ = Slice(backing_store_ + consumed_bytes,
228+
eof_offset_ + added - consumed_bytes);
229+
230+
if (added < remaining) {
231+
eof_ = true;
232+
eof_offset_ += added;
233+
} else {
234+
eof_offset_ = 0;
235+
}
236+
}
237+
173238
void Reader::ReportCorruption(size_t bytes, const char* reason) {
174239
ReportDrop(bytes, Status::Corruption(reason));
175240
}
@@ -184,18 +249,19 @@ void Reader::ReportDrop(size_t bytes, const Status& reason) {
184249
unsigned int Reader::ReadPhysicalRecord(Slice* result) {
185250
while (true) {
186251
if (buffer_.size() < (size_t)kHeaderSize) {
187-
if (!eof_) {
252+
if (!eof_ && !read_error_) {
188253
// Last read was a full read, so this is a trailer to skip
189254
buffer_.clear();
190255
Status status = file_->Read(kBlockSize, &buffer_, backing_store_);
191256
end_of_buffer_offset_ += buffer_.size();
192257
if (!status.ok()) {
193258
buffer_.clear();
194259
ReportDrop(kBlockSize, status);
195-
eof_ = true;
260+
read_error_ = true;
196261
return kEof;
197262
} else if (buffer_.size() < (size_t)kBlockSize) {
198263
eof_ = true;
264+
eof_offset_ = buffer_.size();
199265
}
200266
continue;
201267
} else if (buffer_.size() == 0) {

db/log_reader.h

+9-3
Original file line numberDiff line numberDiff line change
@@ -69,9 +69,10 @@ class Reader {
6969

7070
// when we know more data has been written to the file. we can use this
7171
// function to force the reader to look again in the file.
72-
void UnmarkEOF() {
73-
eof_ = false;
74-
}
72+
// Also aligns the file position indicator to the start of the next block
73+
// by reading the rest of the data from the EOF position to the end of the
74+
// block that was partially read.
75+
void UnmarkEOF();
7576

7677
SequentialFile* file() { return file_.get(); }
7778

@@ -82,6 +83,11 @@ class Reader {
8283
char* const backing_store_;
8384
Slice buffer_;
8485
bool eof_; // Last Read() indicated EOF by returning < kBlockSize
86+
bool read_error_; // Error occurred while reading from file
87+
88+
// Offset of the file position indicator within the last block when an
89+
// EOF was detected.
90+
size_t eof_offset_;
8591

8692
// Offset of the last record returned by ReadRecord.
8793
uint64_t last_record_offset_;

0 commit comments

Comments
 (0)