Skip to content

Commit 6973bb1

Browse files
committed
MakeRoomForWrite() support for column families
Summary: Making room for a write will be the hardest part of the column family implementation. For now, I just iterate through all column families and run MakeRoomForWrite() for each one.
Test Plan: `make check` does not complain.
Reviewers: dhruba, haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D15597
1 parent c37e7de commit 6973bb1

6 files changed

+76
-71
lines changed

db/column_family.cc

+10-4
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,7 @@
1717

1818
namespace rocksdb {
1919

20-
SuperVersion::SuperVersion(const int num_memtables) {
21-
to_delete.resize(num_memtables);
22-
}
20+
SuperVersion::SuperVersion() {}
2321

2422
SuperVersion::~SuperVersion() {
2523
for (auto td : to_delete) {
@@ -71,7 +69,8 @@ ColumnFamilyData::ColumnFamilyData(uint32_t id, const std::string& name,
7169
imm_(options.min_write_buffer_number_to_merge),
7270
super_version_(nullptr),
7371
super_version_number_(0),
74-
log_number_(0) {}
72+
log_number_(0),
73+
need_slowdown_for_num_level0_files_(false) {}
7574

7675
ColumnFamilyData::~ColumnFamilyData() {
7776
if (super_version_ != nullptr) {
@@ -95,6 +94,13 @@ ColumnFamilyData::~ColumnFamilyData() {
9594
}
9695
}
9796

97+
void ColumnFamilyData::SetCurrent(Version* current) {
98+
current_ = current;
99+
need_slowdown_for_num_level0_files_ =
100+
(options_.level0_slowdown_writes_trigger >= 0 &&
101+
current_->NumLevelFiles(0) >= options_.level0_slowdown_writes_trigger);
102+
}
103+
98104
void ColumnFamilyData::CreateNewMemtable() {
99105
assert(current_ != nullptr);
100106
if (mem_ != nullptr) {

db/column_family.h

+12-2
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ struct SuperVersion {
3636
std::vector<MemTable*> to_delete;
3737

3838
// should be called outside the mutex
39-
explicit SuperVersion(const int num_memtables = 0);
39+
SuperVersion();
4040
~SuperVersion();
4141
SuperVersion* Ref();
4242
// Returns true if this was the last reference and caller should
@@ -72,7 +72,7 @@ class ColumnFamilyData {
7272
Version* current() { return current_; }
7373
Version* dummy_versions() { return dummy_versions_; }
7474
void SetMemtable(MemTable* new_mem) { mem_ = new_mem; }
75-
void SetCurrent(Version* current) { current_ = current; }
75+
void SetCurrent(Version* current);
7676
void CreateNewMemtable();
7777

7878
SuperVersion* GetSuperVersion() const { return super_version_; }
@@ -85,6 +85,12 @@ class ColumnFamilyData {
8585
// the clients to allocate SuperVersion outside of mutex.
8686
SuperVersion* InstallSuperVersion(SuperVersion* new_superversion);
8787

88+
// A flag indicating whether writes need to slow down because there are
89+
// too many level-0 files.
90+
bool NeedSlowdownForNumLevel0Files() const {
91+
return need_slowdown_for_num_level0_files_;
92+
}
93+
8894
private:
8995
uint32_t id_;
9096
const std::string name_;
@@ -105,6 +111,10 @@ class ColumnFamilyData {
105111
// Column Family. All earlier log files must be ignored and not
106112
// recovered from
107113
uint64_t log_number_;
114+
115+
// A flag indicating whether we should delay writes because
116+
// we have too many level 0 files
117+
bool need_slowdown_for_num_level0_files_;
108118
};
109119

110120
// Thread safe only for reading without a writer. All access should be

db/db_impl.cc

+49-42
Original file line numberDiff line numberDiff line change
@@ -1298,8 +1298,7 @@ Status DBImpl::ReFitLevel(int level, int target_level) {
12981298
assert(level < NumberLevels());
12991299

13001300
SuperVersion* superversion_to_free = nullptr;
1301-
SuperVersion* new_superversion =
1302-
new SuperVersion(options_.max_write_buffer_number);
1301+
SuperVersion* new_superversion = new SuperVersion();
13031302

13041303
mutex_.Lock();
13051304

@@ -2949,6 +2948,13 @@ std::vector<Status> DBImpl::MultiGet(
29492948
return statList;
29502949
}
29512950

2951+
// TODO(icanadi) creating column family while writing will cause a data race.
2952+
// In write code path, we iterate through column families and call
2953+
// MakeRoomForWrite() for each. MakeRoomForWrite() can unlock the mutex
2954+
// and wait (delay the write). If we create or drop a column family when
2955+
// that mutex is unlocked for delay, that's bad.
2956+
// Solution TODO: enable iteration by chaining column families in
2957+
// circular linked lists
29522958
Status DBImpl::CreateColumnFamily(const ColumnFamilyOptions& options,
29532959
const std::string& column_family_name,
29542960
ColumnFamilyHandle* handle) {
@@ -3106,9 +3112,14 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
31063112
RecordTick(options_.statistics.get(), WRITE_DONE_BY_SELF, 1);
31073113
}
31083114

3109-
// May temporarily unlock and wait.
3110-
SuperVersion* superversion_to_free = nullptr;
3111-
Status status = MakeRoomForWrite(my_batch == nullptr, &superversion_to_free);
3115+
Status status;
3116+
for (auto cfd : *versions_->GetColumnFamilySet()) {
3117+
// May temporarily unlock and wait.
3118+
status = MakeRoomForWrite(cfd, my_batch == nullptr);
3119+
if (!status.ok()) {
3120+
break;
3121+
}
3122+
}
31123123
uint64_t last_sequence = versions_->LastSequence();
31133124
Writer* last_writer = &w;
31143125
if (status.ok() && my_batch != nullptr) { // nullptr batch is for compactions
@@ -3209,7 +3220,6 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
32093220
writers_.front()->cv.Signal();
32103221
}
32113222
mutex_.Unlock();
3212-
delete superversion_to_free;
32133223
return status;
32143224
}
32153225

@@ -3295,8 +3305,7 @@ uint64_t DBImpl::SlowdownAmount(int n, double bottom, double top) {
32953305

32963306
// REQUIRES: mutex_ is held
32973307
// REQUIRES: this thread is currently at the front of the writer queue
3298-
Status DBImpl::MakeRoomForWrite(bool force,
3299-
SuperVersion** superversion_to_free) {
3308+
Status DBImpl::MakeRoomForWrite(ColumnFamilyData* cfd, bool force) {
33003309
mutex_.AssertHeld();
33013310
assert(!writers_.empty());
33023311
bool allow_delay = !force;
@@ -3305,24 +3314,23 @@ Status DBImpl::MakeRoomForWrite(bool force,
33053314
uint64_t rate_limit_delay_millis = 0;
33063315
Status s;
33073316
double score;
3308-
*superversion_to_free = nullptr;
33093317

33103318
while (true) {
33113319
if (!bg_error_.ok()) {
33123320
// Yield previous error
33133321
s = bg_error_;
33143322
break;
3315-
} else if (allow_delay && versions_->NeedSlowdownForNumLevel0Files()) {
3323+
} else if (allow_delay && cfd->NeedSlowdownForNumLevel0Files()) {
33163324
// We are getting close to hitting a hard limit on the number of
33173325
// L0 files. Rather than delaying a single write by several
33183326
// seconds when we hit the hard limit, start delaying each
33193327
// individual write by 0-1ms to reduce latency variance. Also,
33203328
// this delay hands over some CPU to the compaction thread in
33213329
// case it is sharing the same core as the writer.
33223330
uint64_t slowdown =
3323-
SlowdownAmount(default_cfd_->current()->NumLevelFiles(0),
3324-
options_.level0_slowdown_writes_trigger,
3325-
options_.level0_stop_writes_trigger);
3331+
SlowdownAmount(cfd->current()->NumLevelFiles(0),
3332+
cfd->options()->level0_slowdown_writes_trigger,
3333+
cfd->options()->level0_stop_writes_trigger);
33263334
mutex_.Unlock();
33273335
uint64_t delayed;
33283336
{
@@ -3335,32 +3343,32 @@ Status DBImpl::MakeRoomForWrite(bool force,
33353343
allow_delay = false; // Do not delay a single write more than once
33363344
mutex_.Lock();
33373345
delayed_writes_++;
3338-
} else if (!force && (default_cfd_->mem()->ApproximateMemoryUsage() <=
3339-
options_.write_buffer_size)) {
3346+
} else if (!force && (cfd->mem()->ApproximateMemoryUsage() <=
3347+
cfd->options()->write_buffer_size)) {
33403348
// There is room in current memtable
33413349
if (allow_delay) {
33423350
DelayLoggingAndReset();
33433351
}
33443352
break;
3345-
} else if (default_cfd_->imm()->size() ==
3346-
options_.max_write_buffer_number - 1) {
3353+
} else if (cfd->imm()->size() ==
3354+
cfd->options()->max_write_buffer_number - 1) {
33473355
// We have filled up the current memtable, but the previous
33483356
// ones are still being compacted, so we wait.
33493357
DelayLoggingAndReset();
33503358
Log(options_.info_log, "wait for memtable compaction...\n");
33513359
uint64_t stall;
33523360
{
33533361
StopWatch sw(env_, options_.statistics.get(),
3354-
STALL_MEMTABLE_COMPACTION_COUNT);
3362+
STALL_MEMTABLE_COMPACTION_COUNT);
33553363
bg_cv_.Wait();
33563364
stall = sw.ElapsedMicros();
33573365
}
33583366
RecordTick(options_.statistics.get(),
33593367
STALL_MEMTABLE_COMPACTION_MICROS, stall);
33603368
internal_stats_.RecordWriteStall(InternalStats::MEMTABLE_COMPACTION,
33613369
stall);
3362-
} else if (default_cfd_->current()->NumLevelFiles(0) >=
3363-
options_.level0_stop_writes_trigger) {
3370+
} else if (cfd->current()->NumLevelFiles(0) >=
3371+
cfd->options()->level0_stop_writes_trigger) {
33643372
// There are too many level-0 files.
33653373
DelayLoggingAndReset();
33663374
Log(options_.info_log, "wait for fewer level0 files...\n");
@@ -3374,10 +3382,10 @@ Status DBImpl::MakeRoomForWrite(bool force,
33743382
RecordTick(options_.statistics.get(), STALL_L0_NUM_FILES_MICROS, stall);
33753383
internal_stats_.RecordWriteStall(InternalStats::LEVEL0_NUM_FILES, stall);
33763384
} else if (allow_hard_rate_limit_delay && options_.hard_rate_limit > 1.0 &&
3377-
(score = default_cfd_->current()->MaxCompactionScore()) >
3378-
options_.hard_rate_limit) {
3385+
(score = cfd->current()->MaxCompactionScore()) >
3386+
cfd->options()->hard_rate_limit) {
33793387
// Delay a write when the compaction score for any level is too large.
3380-
int max_level = default_cfd_->current()->MaxCompactionScoreLevel();
3388+
int max_level = cfd->current()->MaxCompactionScoreLevel();
33813389
mutex_.Unlock();
33823390
uint64_t delayed;
33833391
{
@@ -3392,26 +3400,25 @@ Status DBImpl::MakeRoomForWrite(bool force,
33923400
rate_limit_delay_millis += rate_limit;
33933401
RecordTick(options_.statistics.get(),
33943402
RATE_LIMIT_DELAY_MILLIS, rate_limit);
3395-
if (options_.rate_limit_delay_max_milliseconds > 0 &&
3403+
if (cfd->options()->rate_limit_delay_max_milliseconds > 0 &&
33963404
rate_limit_delay_millis >=
3397-
(unsigned)options_.rate_limit_delay_max_milliseconds) {
3405+
(unsigned)cfd->options()->rate_limit_delay_max_milliseconds) {
33983406
allow_hard_rate_limit_delay = false;
33993407
}
34003408
mutex_.Lock();
3401-
} else if (allow_soft_rate_limit_delay && options_.soft_rate_limit > 0.0 &&
3402-
(score = default_cfd_->current()->MaxCompactionScore()) >
3403-
options_.soft_rate_limit) {
3409+
} else if (allow_soft_rate_limit_delay &&
3410+
cfd->options()->soft_rate_limit > 0.0 &&
3411+
(score = cfd->current()->MaxCompactionScore()) >
3412+
cfd->options()->soft_rate_limit) {
34043413
// Delay a write when the compaction score for any level is too large.
34053414
// TODO: add statistics
34063415
mutex_.Unlock();
34073416
{
34083417
StopWatch sw(env_, options_.statistics.get(),
34093418
SOFT_RATE_LIMIT_DELAY_COUNT);
3410-
env_->SleepForMicroseconds(SlowdownAmount(
3411-
score,
3412-
options_.soft_rate_limit,
3413-
options_.hard_rate_limit)
3414-
);
3419+
env_->SleepForMicroseconds(
3420+
SlowdownAmount(score, cfd->options()->soft_rate_limit,
3421+
cfd->options()->hard_rate_limit));
34153422
rate_limit_delay_millis += sw.ElapsedMicros();
34163423
}
34173424
allow_soft_rate_limit_delay = false;
@@ -3436,9 +3443,10 @@ Status DBImpl::MakeRoomForWrite(bool force,
34363443
if (s.ok()) {
34373444
// Our final size should be less than write_buffer_size
34383445
// (compression, etc) but err on the side of caution.
3439-
lfile->SetPreallocationBlockSize(1.1 * options_.write_buffer_size);
3440-
memtmp = new MemTable(internal_comparator_, options_);
3441-
new_superversion = new SuperVersion(options_.max_write_buffer_number);
3446+
lfile->SetPreallocationBlockSize(1.1 *
3447+
cfd->options()->write_buffer_size);
3448+
memtmp = new MemTable(internal_comparator_, *cfd->options());
3449+
new_superversion = new SuperVersion();
34423450
}
34433451
}
34443452
mutex_.Lock();
@@ -3450,20 +3458,19 @@ Status DBImpl::MakeRoomForWrite(bool force,
34503458
}
34513459
logfile_number_ = new_log_number;
34523460
log_.reset(new log::Writer(std::move(lfile)));
3453-
default_cfd_->mem()->SetNextLogNumber(logfile_number_);
3454-
default_cfd_->imm()->Add(default_cfd_->mem());
3461+
cfd->mem()->SetNextLogNumber(logfile_number_);
3462+
cfd->imm()->Add(cfd->mem());
34553463
if (force) {
3456-
default_cfd_->imm()->FlushRequested();
3464+
cfd->imm()->FlushRequested();
34573465
}
34583466
memtmp->Ref();
34593467
memtmp->SetLogNumber(logfile_number_);
3460-
default_cfd_->SetMemtable(memtmp);
3468+
cfd->SetMemtable(memtmp);
34613469
Log(options_.info_log, "New memtable created with log file: #%lu\n",
34623470
(unsigned long)logfile_number_);
34633471
force = false; // Do not force another compaction if have room
34643472
MaybeScheduleFlushOrCompaction();
3465-
*superversion_to_free =
3466-
default_cfd_->InstallSuperVersion(new_superversion);
3473+
delete cfd->InstallSuperVersion(new_superversion);
34673474
}
34683475
}
34693476
return s;

db/db_impl.h

+5-9
Original file line numberDiff line numberDiff line change
@@ -201,9 +201,9 @@ class DBImpl : public DB {
201201
// a list of memtables to be free
202202
std::vector<MemTable *> memtables_to_free;
203203

204-
SuperVersion* superversion_to_free; // if nullptr nothing to free
204+
SuperVersion* superversion_to_free; // if nullptr nothing to free
205205

206-
SuperVersion* new_superversion; // if nullptr no new superversion
206+
SuperVersion* new_superversion; // if nullptr no new superversion
207207

208208
// the current manifest_file_number, log_number and prev_log_number
209209
// that corresponds to the set of files in 'live'.
@@ -216,8 +216,7 @@ class DBImpl : public DB {
216216
prev_log_number = 0;
217217
memtables_to_free.reserve(num_memtables);
218218
superversion_to_free = nullptr;
219-
new_superversion =
220-
create_superversion ? new SuperVersion(num_memtables) : nullptr;
219+
new_superversion = create_superversion ? new SuperVersion() : nullptr;
221220
}
222221

223222
~DeletionState() {
@@ -303,11 +302,8 @@ class DBImpl : public DB {
303302
uint64_t* filenumber);
304303

305304
uint64_t SlowdownAmount(int n, double bottom, double top);
306-
// MakeRoomForWrite will return superversion_to_free through an arugment,
307-
// which the caller needs to delete. We do it because caller can delete
308-
// the superversion outside of mutex
309-
Status MakeRoomForWrite(bool force /* compact even if there is room? */,
310-
SuperVersion** superversion_to_free);
305+
Status MakeRoomForWrite(ColumnFamilyData* cfd,
306+
bool force /* flush even if there is room? */);
311307
void BuildBatchGroup(Writer** last_writer,
312308
autovector<WriteBatch*>* write_batch_group);
313309

db/version_set.cc

-4
Original file line numberDiff line numberDiff line change
@@ -1377,7 +1377,6 @@ VersionSet::VersionSet(const std::string& dbname, const Options* options,
13771377
log_number_(0),
13781378
prev_log_number_(0),
13791379
num_levels_(options_->num_levels),
1380-
need_slowdown_for_num_level0_files_(false),
13811380
current_version_number_(0),
13821381
manifest_file_size_(0),
13831382
storage_options_(storage_options),
@@ -1413,9 +1412,6 @@ void VersionSet::AppendVersion(ColumnFamilyData* column_family_data,
14131412
current->Unref();
14141413
}
14151414
column_family_data->SetCurrent(v);
1416-
need_slowdown_for_num_level0_files_ =
1417-
(options_->level0_slowdown_writes_trigger >= 0 &&
1418-
v->NumLevelFiles(0) >= options_->level0_slowdown_writes_trigger);
14191415
v->Ref();
14201416

14211417
// Append to linked list

db/version_set.h

-10
Original file line numberDiff line numberDiff line change
@@ -315,12 +315,6 @@ class VersionSet {
315315
const EnvOptions& storage_options,
316316
int new_levels);
317317

318-
// A Flag indicating whether write needs to slowdown because of there are
319-
// too many number of level0 files.
320-
bool NeedSlowdownForNumLevel0Files() const {
321-
return need_slowdown_for_num_level0_files_;
322-
}
323-
324318
// Return the current manifest file number
325319
uint64_t ManifestFileNumber() const { return manifest_file_number_; }
326320

@@ -482,10 +476,6 @@ class VersionSet {
482476
// Opened lazily
483477
unique_ptr<log::Writer> descriptor_log_;
484478

485-
// A flag indicating whether we should delay writes because
486-
// we have too many level 0 files
487-
bool need_slowdown_for_num_level0_files_;
488-
489479
// An object that keeps all the compaction stats
490480
// and picks the next compaction
491481
std::unique_ptr<CompactionPicker> compaction_picker_;

0 commit comments

Comments
 (0)