Skip to content

Commit f26cb0f

Browse files
committed
Optimize fallocation
Summary: Based on my recent findings (posted in our internal group), calling fallocate without the KEEP_SIZE flag gives superior fdatasync() performance in append-only workloads. This diff adds an option that lets the user preallocate without the KEEP_SIZE flag, improving sync performance by up to 2x-3x. At one point we also just called posix_fallocate instead of fallocate, which isn't very fast: http://code.woboq.org/userspace/glibc/sysdeps/posix/posix_fallocate.c.html (tl;dr: it manually writes out zero bytes to allocate storage). This diff also fixes that by calling fallocate first and falling back to posix_fallocate only if fallocate fails (e.g. because it is not supported). Test Plan: make check Reviewers: dhruba, sdong, haobo, ljin Reviewed By: dhruba CC: leveldb Differential Revision: https://reviews.facebook.net/D16761
1 parent ae25742 commit f26cb0f

File tree

6 files changed

+86
-37
lines changed

6 files changed

+86
-37
lines changed

db/db_impl.cc

+5-4
Original file line numberDiff line numberDiff line change
@@ -456,8 +456,8 @@ Status DBImpl::NewDB() {
456456

457457
const std::string manifest = DescriptorFileName(dbname_, 1);
458458
unique_ptr<WritableFile> file;
459-
Status s = env_->NewWritableFile(manifest, &file,
460-
storage_options_.AdaptForLogWrite());
459+
Status s = env_->NewWritableFile(
460+
manifest, &file, env_->OptimizeForManifestWrite(storage_options_));
461461
if (!s.ok()) {
462462
return s;
463463
}
@@ -3626,7 +3626,8 @@ Status DBImpl::MakeRoomForWrite(bool force,
36263626
{
36273627
DelayLoggingAndReset();
36283628
s = env_->NewWritableFile(LogFileName(options_.wal_dir, new_log_number),
3629-
&lfile, storage_options_.AdaptForLogWrite());
3629+
&lfile,
3630+
env_->OptimizeForLogWrite(storage_options_));
36303631
if (s.ok()) {
36313632
// Our final size should be less than write_buffer_size
36323633
// (compression, etc) but err on the side of caution.
@@ -3912,7 +3913,7 @@ Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) {
39123913
EnvOptions soptions(options);
39133914
s = impl->options_.env->NewWritableFile(
39143915
LogFileName(impl->options_.wal_dir, new_log_number), &lfile,
3915-
soptions.AdaptForLogWrite());
3916+
impl->options_.env->OptimizeForLogWrite(soptions));
39163917
if (s.ok()) {
39173918
lfile->SetPreallocationBlockSize(1.1 * impl->options_.write_buffer_size);
39183919
VersionEdit edit;

db/repair.cc

+2-2
Original file line numberDiff line numberDiff line change
@@ -306,8 +306,8 @@ class Repairer {
306306
Status WriteDescriptor() {
307307
std::string tmp = TempFileName(dbname_, 1);
308308
unique_ptr<WritableFile> file;
309-
Status status =
310-
env_->NewWritableFile(tmp, &file, storage_options_.AdaptForLogWrite());
309+
Status status = env_->NewWritableFile(
310+
tmp, &file, env_->OptimizeForManifestWrite(storage_options_));
311311
if (!status.ok()) {
312312
return status;
313313
}

db/version_set.cc

+1-1
Original file line numberDiff line numberDiff line change
@@ -1564,7 +1564,7 @@ Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu,
15641564
unique_ptr<WritableFile> descriptor_file;
15651565
s = env_->NewWritableFile(
15661566
DescriptorFileName(dbname_, pending_manifest_file_number_),
1567-
&descriptor_file, storage_options_.AdaptForLogWrite());
1567+
&descriptor_file, env_->OptimizeForManifestWrite(storage_options_));
15681568
if (s.ok()) {
15691569
descriptor_log_.reset(new log::Writer(std::move(descriptor_file)));
15701570
s = WriteSnapshot(descriptor_log_.get());

include/rocksdb/env.h

+19-3
Original file line numberDiff line numberDiff line change
@@ -49,8 +49,6 @@ struct EnvOptions {
4949
// construct from Options
5050
explicit EnvOptions(const Options& options);
5151

52-
EnvOptions AdaptForLogWrite() const;
53-
5452
// If true, then allow caching of data in environment buffers
5553
bool use_os_buffer = true;
5654

@@ -61,13 +59,21 @@ struct EnvOptions {
6159
bool use_mmap_writes = true;
6260

6361
// If true, set the FD_CLOEXEC on open fd.
64-
bool set_fd_cloexec= true;
62+
bool set_fd_cloexec = true;
6563

6664
// Allows OS to incrementally sync files to disk while they are being
6765
// written, in the background. Issue one request for every bytes_per_sync
6866
// written. 0 turns it off.
6967
// Default: 0
7068
uint64_t bytes_per_sync = 0;
69+
70+
// If true, we will preallocate the file with FALLOC_FL_KEEP_SIZE flag, which
71+
// means that file size won't change as part of preallocation.
72+
// If false, preallocation will also change the file size. This option will
73+
// improve the performance in workloads where you sync the data on every
74+
// write. By default, we set it to true for MANIFEST writes and false for
75+
// WAL writes
76+
bool fallocate_with_keep_size = true;
7177
};
7278

7379
class Env {
@@ -260,6 +266,16 @@ class Env {
260266
// Generates a unique id that can be used to identify a db
261267
virtual std::string GenerateUniqueId();
262268

269+
// OptimizeForLogWrite will create a new EnvOptions object that is a copy of
270+
// the EnvOptions in the parameters, but is optimized for writing log files.
271+
// Default implementation returns the copy of the same object.
272+
virtual EnvOptions OptimizeForLogWrite(const EnvOptions& env_options) const;
273+
// OptimizeForManifestWrite will create a new EnvOptions object that is a copy
274+
// of the EnvOptions in the parameters, but is optimized for writing manifest
275+
// files. Default implementation returns the copy of the same object.
276+
virtual EnvOptions OptimizeForManifestWrite(const EnvOptions& env_options)
277+
const;
278+
263279
private:
264280
// No copying allowed
265281
Env(const Env&);

util/env.cc

+6-4
Original file line numberDiff line numberDiff line change
@@ -241,10 +241,12 @@ void AssignEnvOptions(EnvOptions* env_options, const Options& options) {
241241

242242
}
243243

244-
EnvOptions EnvOptions::AdaptForLogWrite() const {
245-
EnvOptions adapted = *this;
246-
adapted.use_mmap_writes = false;
247-
return adapted;
244+
// Base-class default for log files: applies no log-specific tuning and hands
// the caller's options back as an unmodified copy.
EnvOptions Env::OptimizeForLogWrite(const EnvOptions& env_options) const {
  EnvOptions unchanged(env_options);
  return unchanged;
}
247+
248+
// Base-class default for manifest files: applies no manifest-specific tuning
// and hands the caller's options back as an unmodified copy.
EnvOptions Env::OptimizeForManifestWrite(const EnvOptions& env_options) const {
  EnvOptions unchanged(env_options);
  return unchanged;
}
249251

250252
EnvOptions::EnvOptions(const Options& options) {

util/env_posix.cc

+53-23
Original file line numberDiff line numberDiff line change
@@ -354,9 +354,9 @@ class PosixMmapFile : public WritableFile {
354354
char* dst_; // Where to write next (in range [base_,limit_])
355355
char* last_sync_; // Where have we synced up to
356356
uint64_t file_offset_; // Offset of base_ in file
357-
358357
// Have we done an munmap of unsynced data?
359358
bool pending_sync_;
359+
bool fallocate_with_keep_size_;
360360

361361
// Roundup x to a multiple of y
362362
static size_t Roundup(size_t x, size_t y) {
@@ -399,7 +399,12 @@ class PosixMmapFile : public WritableFile {
399399
assert(base_ == nullptr);
400400

401401
TEST_KILL_RANDOM(rocksdb_kill_odds);
402-
int alloc_status = posix_fallocate(fd_, file_offset_, map_size_);
402+
// we can't fallocate with FALLOC_FL_KEEP_SIZE here
403+
int alloc_status = fallocate(fd_, 0, file_offset_, map_size_);
404+
if (alloc_status != 0) {
405+
// fallback to posix_fallocate
406+
alloc_status = posix_fallocate(fd_, file_offset_, map_size_);
407+
}
403408
if (alloc_status != 0) {
404409
return Status::IOError("Error allocating space to file : " + filename_ +
405410
"Error : " + strerror(alloc_status));
@@ -436,7 +441,8 @@ class PosixMmapFile : public WritableFile {
436441
dst_(nullptr),
437442
last_sync_(nullptr),
438443
file_offset_(0),
439-
pending_sync_(false) {
444+
pending_sync_(false),
445+
fallocate_with_keep_size_(options.fallocate_with_keep_size) {
440446
assert((page_size & (page_size - 1)) == 0);
441447
assert(options.use_mmap_writes);
442448
}
@@ -584,7 +590,9 @@ class PosixMmapFile : public WritableFile {
584590
#ifdef ROCKSDB_FALLOCATE_PRESENT
585591
virtual Status Allocate(off_t offset, off_t len) {
586592
TEST_KILL_RANDOM(rocksdb_kill_odds);
587-
if (!fallocate(fd_, FALLOC_FL_KEEP_SIZE, offset, len)) {
593+
int alloc_status = fallocate(
594+
fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0, offset, len);
595+
if (alloc_status == 0) {
588596
return Status::OK();
589597
} else {
590598
return IOError(filename_, errno);
@@ -606,20 +614,22 @@ class PosixWritableFile : public WritableFile {
606614
bool pending_fsync_;
607615
uint64_t last_sync_size_;
608616
uint64_t bytes_per_sync_;
617+
bool fallocate_with_keep_size_;
609618

610619
public:
611620
PosixWritableFile(const std::string& fname, int fd, size_t capacity,
612-
const EnvOptions& options) :
613-
filename_(fname),
614-
fd_(fd),
615-
cursize_(0),
616-
capacity_(capacity),
617-
buf_(new char[capacity]),
618-
filesize_(0),
619-
pending_sync_(false),
620-
pending_fsync_(false),
621-
last_sync_size_(0),
622-
bytes_per_sync_(options.bytes_per_sync) {
621+
const EnvOptions& options)
622+
: filename_(fname),
623+
fd_(fd),
624+
cursize_(0),
625+
capacity_(capacity),
626+
buf_(new char[capacity]),
627+
filesize_(0),
628+
pending_sync_(false),
629+
pending_fsync_(false),
630+
last_sync_size_(0),
631+
bytes_per_sync_(options.bytes_per_sync),
632+
fallocate_with_keep_size_(options.fallocate_with_keep_size) {
623633
assert(!options.use_mmap_writes);
624634
}
625635

@@ -771,7 +781,9 @@ class PosixWritableFile : public WritableFile {
771781
#ifdef ROCKSDB_FALLOCATE_PRESENT
772782
virtual Status Allocate(off_t offset, off_t len) {
773783
TEST_KILL_RANDOM(rocksdb_kill_odds);
774-
if (!fallocate(fd_, FALLOC_FL_KEEP_SIZE, offset, len)) {
784+
int alloc_status = fallocate(
785+
fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0, offset, len);
786+
if (alloc_status == 0) {
775787
return Status::OK();
776788
} else {
777789
return IOError(filename_, errno);
@@ -797,14 +809,15 @@ class PosixRandomRWFile : public RandomRWFile {
797809
int fd_;
798810
bool pending_sync_;
799811
bool pending_fsync_;
812+
bool fallocate_with_keep_size_;
800813

801814
public:
802-
PosixRandomRWFile(const std::string& fname, int fd,
803-
const EnvOptions& options) :
804-
filename_(fname),
805-
fd_(fd),
806-
pending_sync_(false),
807-
pending_fsync_(false) {
815+
PosixRandomRWFile(const std::string& fname, int fd, const EnvOptions& options)
816+
: filename_(fname),
817+
fd_(fd),
818+
pending_sync_(false),
819+
pending_fsync_(false),
820+
fallocate_with_keep_size_(options.fallocate_with_keep_size) {
808821
assert(!options.use_mmap_writes && !options.use_mmap_reads);
809822
}
810823

@@ -874,7 +887,10 @@ class PosixRandomRWFile : public RandomRWFile {
874887

875888
#ifdef ROCKSDB_FALLOCATE_PRESENT
876889
virtual Status Allocate(off_t offset, off_t len) {
877-
if (!fallocate(fd_, FALLOC_FL_KEEP_SIZE, offset, len)) {
890+
TEST_KILL_RANDOM(rocksdb_kill_odds);
891+
int alloc_status = fallocate(
892+
fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0, offset, len);
893+
if (alloc_status == 0) {
878894
return Status::OK();
879895
} else {
880896
return IOError(filename_, errno);
@@ -1332,6 +1348,20 @@ class PosixEnv : public Env {
13321348
return dummy;
13331349
}
13341350

1351+
// Returns a copy of env_options tuned for write-ahead-log (WAL) files.
//  - use_mmap_writes is turned off for the log file.
//  - fallocate_with_keep_size is cleared, so Allocate() calls fallocate()
//    without FALLOC_FL_KEEP_SIZE and preallocation also extends the file
//    size. EnvOptions documents false as the default for WAL writes; it is
//    the setting that speeds up fdatasync() in append-only workloads.
EnvOptions OptimizeForLogWrite(const EnvOptions& env_options) const {
  EnvOptions optimized = env_options;
  optimized.use_mmap_writes = false;
  optimized.fallocate_with_keep_size = false;
  return optimized;
}
1357+
1358+
// Builds the EnvOptions used for MANIFEST files: starts from the caller's
// options, forces fallocate_with_keep_size on (preallocation is requested
// with FALLOC_FL_KEEP_SIZE) and turns mmap writes off.
EnvOptions OptimizeForManifestWrite(const EnvOptions& env_options) const {
  EnvOptions manifest_options(env_options);
  manifest_options.fallocate_with_keep_size = true;
  manifest_options.use_mmap_writes = false;
  return manifest_options;
}
1364+
13351365
private:
13361366
bool checkedDiskForMmap_;
13371367
bool forceMmapOff; // do we override Env options?

0 commit comments

Comments
 (0)