Skip to content

Commit a2bb7c3

Browse files
committed
Push- instead of pull-model for managing Write stalls
Summary: Introducing WriteController, which is the source of truth about per-DB write delays. Let's define a DB epoch as a period during which there are no flushes or compactions (i.e. a new epoch starts when a flush or compaction finishes). Each epoch can either: * proceed with all writes without delay * delay all writes by a fixed time * stop all writes. The mode is recomputed at each epoch change (flush, compaction), rather than on every write (which is currently the case). When we have a lot of column families, our current pull behavior adds a big overhead, since we need to loop over every column family for every write. With the new push model, the overhead on the Write code path is minimal. This is just the start. The next step is to also take care of stalls introduced by slow memtable flushes. The final goal is to eliminate the function MakeRoomForWrite(), which currently needs to be called for every column family by every write. Test Plan: make check for now. I'll add some unit tests later. Also, perf test. Reviewers: dhruba, yhchiang, MarkCallaghan, sdong, ljin Reviewed By: ljin Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D22791
1 parent 0af157f commit a2bb7c3

15 files changed

+321
-258
lines changed

HISTORY.md

+4-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
# Rocksdb Change Log
22

3-
## Unreleased
3+
## Unreleased (will be released with 3.6)
4+
5+
### Behavior changes
6+
* We have refactored our system of stalling writes. The meanings of all stall-related statistics have changed: instead of counting stalls per write, we now count stalls per epoch, where an epoch is the period between two consecutive flushes or compactions. You'll find more information in our Tuning Perf Guide once we release RocksDB 3.6.
47

58
----- Past Releases -----
69

Makefile

+5-1
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,8 @@ TESTS = \
112112
version_edit_test \
113113
version_set_test \
114114
file_indexer_test \
115-
write_batch_test\
115+
write_batch_test \
116+
write_controller_test\
116117
deletefile_test \
117118
table_test \
118119
thread_local_test \
@@ -427,6 +428,9 @@ reduce_levels_test: tools/reduce_levels_test.o $(LIBOBJECTS) $(TESTHARNESS)
427428
write_batch_test: db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS)
428429
$(CXX) db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
429430

431+
write_controller_test: db/write_controller_test.o $(LIBOBJECTS) $(TESTHARNESS)
432+
$(CXX) db/write_controller_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
433+
430434
merge_test: db/merge_test.o $(LIBOBJECTS) $(TESTHARNESS)
431435
$(CXX) db/merge_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
432436

db/column_family.cc

+96-34
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,11 @@
99

1010
#include "db/column_family.h"
1111

12+
#ifndef __STDC_FORMAT_MACROS
13+
#define __STDC_FORMAT_MACROS
14+
#endif
15+
16+
#include <inttypes.h>
1217
#include <vector>
1318
#include <string>
1419
#include <algorithm>
@@ -19,11 +24,42 @@
1924
#include "db/internal_stats.h"
2025
#include "db/compaction_picker.h"
2126
#include "db/table_properties_collector.h"
27+
#include "db/write_controller.h"
2228
#include "util/autovector.h"
2329
#include "util/hash_skiplist_rep.h"
2430

2531
namespace rocksdb {
2632

33+
namespace {
34+
// Computes the amount of time, in microseconds, by which a write should be
// delayed, given a pressure value `n` (e.g. the number of level-0 files or a
// compaction score) and the [bottom, top) range over which the delay ramps up:
//    if n < bottom, return 0;
//    if n >= top, return 1000;
//    otherwise, let r = (n - bottom) /
//                       (top - bottom)
//    and return max(r^2 * 1000, 100), truncated to whole microseconds.
// The goal of this formula is to gradually increase the rate at which writes
// are slowed. We also tried linear delay (r * 1000), but it seemed to do
// slightly worse. There is no other particular reason for choosing quadratic.
//
// `n` is a double so that fractional inputs such as compaction scores are not
// truncated before the ratio is computed; integer inputs convert exactly.
uint64_t SlowdownAmount(double n, double bottom, double top) {
  const uint64_t kMaxDelayMicros = 1000;
  // Once we are inside the ramp we always delay by at least this much, so a
  // value sitting exactly on `bottom` still produces a noticeable stall.
  const double kMinRampDelayMicros = 100.0;

  if (n >= top) {
    return kMaxDelayMicros;
  }
  if (n < bottom) {
    return 0;
  }

  // If we are here, we know that:
  //   bottom <= n < top
  // since the previous two conditions are false. In particular top > bottom,
  // so the division below is well defined and 0 <= how_much < 1.
  const double how_much = (n - bottom) / (top - bottom);
  const uint64_t delay = static_cast<uint64_t>(
      std::max(how_much * how_much * kMaxDelayMicros, kMinRampDelayMicros));
  assert(delay <= kMaxDelayMicros);
  return delay;
}
61+
} // namespace
62+
2763
ColumnFamilyHandleImpl::ColumnFamilyHandleImpl(ColumnFamilyData* cfd,
2864
DBImpl* db, port::Mutex* mutex)
2965
: cfd_(cfd), db_(db), mutex_(mutex) {
@@ -197,7 +233,6 @@ ColumnFamilyData::ColumnFamilyData(uint32_t id, const std::string& name,
197233
next_(nullptr),
198234
prev_(nullptr),
199235
log_number_(0),
200-
need_slowdown_for_num_level0_files_(false),
201236
column_family_set_(column_family_set) {
202237
Ref();
203238

@@ -278,44 +313,70 @@ ColumnFamilyData::~ColumnFamilyData() {
278313
}
279314

280315
void ColumnFamilyData::RecalculateWriteStallConditions() {
281-
need_wait_for_num_memtables_ =
282-
(imm()->size() == options()->max_write_buffer_number - 1);
283-
284316
if (current_ != nullptr) {
285-
need_wait_for_num_level0_files_ =
286-
(current_->NumLevelFiles(0) >= options()->level0_stop_writes_trigger);
287-
} else {
288-
need_wait_for_num_level0_files_ = false;
289-
}
290-
291-
RecalculateWriteStallRateLimitsConditions();
292-
}
293-
294-
void ColumnFamilyData::RecalculateWriteStallRateLimitsConditions() {
295-
if (current_ != nullptr) {
296-
exceeds_hard_rate_limit_ =
297-
(options()->hard_rate_limit > 1.0 &&
298-
current_->MaxCompactionScore() > options()->hard_rate_limit);
299-
300-
exceeds_soft_rate_limit_ =
301-
(options()->soft_rate_limit > 0.0 &&
302-
current_->MaxCompactionScore() > options()->soft_rate_limit);
303-
} else {
304-
exceeds_hard_rate_limit_ = false;
305-
exceeds_soft_rate_limit_ = false;
317+
const double score = current_->MaxCompactionScore();
318+
const int max_level = current_->MaxCompactionScoreLevel();
319+
320+
auto write_controller = column_family_set_->write_controller_;
321+
322+
if (imm()->size() == options_.max_write_buffer_number) {
323+
write_controller_token_ = write_controller->GetStopToken();
324+
internal_stats_->AddCFStats(InternalStats::MEMTABLE_COMPACTION, 1);
325+
Log(options_.info_log,
326+
"[%s] Stopping writes because we have %d immutable memtables "
327+
"(waiting for flush)",
328+
name_.c_str(), imm()->size());
329+
} else if (options_.level0_slowdown_writes_trigger >= 0 &&
330+
current_->NumLevelFiles(0) >=
331+
options_.level0_slowdown_writes_trigger) {
332+
uint64_t slowdown = SlowdownAmount(
333+
current_->NumLevelFiles(0), options_.level0_slowdown_writes_trigger,
334+
options_.level0_stop_writes_trigger);
335+
write_controller_token_ = write_controller->GetDelayToken(slowdown);
336+
internal_stats_->AddCFStats(InternalStats::LEVEL0_SLOWDOWN, slowdown);
337+
Log(options_.info_log,
338+
"[%s] Stalling writes because we have %d level-0 files (%" PRIu64
339+
"us)",
340+
name_.c_str(), current_->NumLevelFiles(0), slowdown);
341+
} else if (current_->NumLevelFiles(0) >=
342+
options_.level0_stop_writes_trigger) {
343+
write_controller_token_ = write_controller->GetStopToken();
344+
internal_stats_->AddCFStats(InternalStats::LEVEL0_NUM_FILES, 1);
345+
Log(options_.info_log,
346+
"[%s] Stopping writes because we have %d level-0 files",
347+
name_.c_str(), current_->NumLevelFiles(0));
348+
} else if (options_.hard_rate_limit > 1.0 &&
349+
score > options_.hard_rate_limit) {
350+
uint64_t kHardLimitSlowdown = 1000;
351+
write_controller_token_ =
352+
write_controller->GetDelayToken(kHardLimitSlowdown);
353+
internal_stats_->RecordLevelNSlowdown(max_level, kHardLimitSlowdown,
354+
false);
355+
Log(options_.info_log,
356+
"[%s] Stalling writes because we hit hard limit on level %d. "
357+
"(%" PRIu64 "us)",
358+
name_.c_str(), max_level, kHardLimitSlowdown);
359+
} else if (options_.soft_rate_limit > 0.0 &&
360+
score > options_.soft_rate_limit) {
361+
uint64_t slowdown = SlowdownAmount(score, options_.soft_rate_limit,
362+
options_.hard_rate_limit);
363+
write_controller_token_ = write_controller->GetDelayToken(slowdown);
364+
internal_stats_->RecordLevelNSlowdown(max_level, slowdown, true);
365+
Log(options_.info_log,
366+
"[%s] Stalling writes because we hit soft limit on level %d (%" PRIu64
367+
"us)",
368+
name_.c_str(), max_level, slowdown);
369+
} else {
370+
write_controller_token_.reset();
371+
}
306372
}
307373
}
308374

309375
const EnvOptions* ColumnFamilyData::soptions() const {
310376
return &(column_family_set_->env_options_);
311377
}
312378

313-
void ColumnFamilyData::SetCurrent(Version* current) {
314-
current_ = current;
315-
need_slowdown_for_num_level0_files_ =
316-
(options_.level0_slowdown_writes_trigger >= 0 &&
317-
current_->NumLevelFiles(0) >= options_.level0_slowdown_writes_trigger);
318-
}
379+
// Records `current` as this column family's current version. Only the pointer
// is stored; no write-stall state is recomputed here.
void ColumnFamilyData::SetCurrent(Version* current) {
  current_ = current;
}
319380

320381
void ColumnFamilyData::CreateNewMemtable() {
321382
assert(current_ != nullptr);
@@ -328,7 +389,6 @@ void ColumnFamilyData::CreateNewMemtable() {
328389

329390
Compaction* ColumnFamilyData::PickCompaction(LogBuffer* log_buffer) {
330391
auto result = compaction_picker_->PickCompaction(current_, log_buffer);
331-
RecalculateWriteStallRateLimitsConditions();
332392
return result;
333393
}
334394

@@ -464,16 +524,18 @@ void ColumnFamilyData::ResetThreadLocalSuperVersions() {
464524
ColumnFamilySet::ColumnFamilySet(const std::string& dbname,
465525
const DBOptions* db_options,
466526
const EnvOptions& env_options,
467-
Cache* table_cache)
527+
Cache* table_cache,
528+
WriteController* write_controller)
468529
: max_column_family_(0),
469530
dummy_cfd_(new ColumnFamilyData(0, "", nullptr, nullptr,
470531
ColumnFamilyOptions(), db_options,
471-
env_options_, nullptr)),
532+
env_options, nullptr)),
472533
default_cfd_cache_(nullptr),
473534
db_name_(dbname),
474535
db_options_(db_options),
475536
env_options_(env_options),
476537
table_cache_(table_cache),
538+
write_controller_(write_controller),
477539
spin_lock_(ATOMIC_FLAG_INIT) {
478540
// initialize linked list
479541
dummy_cfd_->prev_ = dummy_cfd_;

db/column_family.h

+8-46
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
#include "rocksdb/env.h"
2020
#include "db/memtable_list.h"
2121
#include "db/write_batch_internal.h"
22+
#include "db/write_controller.h"
2223
#include "db/table_cache.h"
2324
#include "util/thread_local.h"
2425

@@ -156,6 +157,7 @@ class ColumnFamilyData {
156157
// can't drop default CF
157158
assert(id_ != 0);
158159
dropped_ = true;
160+
write_controller_token_.reset();
159161
}
160162
bool IsDropped() const { return dropped_; }
161163

@@ -225,35 +227,12 @@ class ColumnFamilyData {
225227

226228
void ResetThreadLocalSuperVersions();
227229

228-
// A Flag indicating whether write needs to slowdown because of there are
229-
// too many number of level0 files.
230-
bool NeedSlowdownForNumLevel0Files() const {
231-
return need_slowdown_for_num_level0_files_;
232-
}
233-
234-
bool NeedWaitForNumLevel0Files() const {
235-
return need_wait_for_num_level0_files_;
236-
}
237-
238-
bool NeedWaitForNumMemtables() const {
239-
return need_wait_for_num_memtables_;
240-
}
241-
242-
bool ExceedsSoftRateLimit() const {
243-
return exceeds_soft_rate_limit_;
244-
}
245-
246-
bool ExceedsHardRateLimit() const {
247-
return exceeds_hard_rate_limit_;
248-
}
249-
250230
private:
251231
friend class ColumnFamilySet;
252232
ColumnFamilyData(uint32_t id, const std::string& name,
253233
Version* dummy_versions, Cache* table_cache,
254234
const ColumnFamilyOptions& options,
255-
const DBOptions* db_options,
256-
const EnvOptions& env_options,
235+
const DBOptions* db_options, const EnvOptions& env_options,
257236
ColumnFamilySet* column_family_set);
258237

259238
// Recalculate some small conditions, which are changed only during
@@ -262,7 +241,6 @@ class ColumnFamilyData {
262241
// DBImpl::MakeRoomForWrite function to decide, if it need to make
263242
// a write stall
264243
void RecalculateWriteStallConditions();
265-
void RecalculateWriteStallRateLimitsConditions();
266244

267245
uint32_t id_;
268246
const std::string name_;
@@ -304,31 +282,13 @@ class ColumnFamilyData {
304282
// recovered from
305283
uint64_t log_number_;
306284

307-
// A flag indicating whether we should delay writes because
308-
// we have too many level 0 files
309-
bool need_slowdown_for_num_level0_files_;
310-
311-
// These 4 variables are updated only after compaction,
312-
// adding new memtable, flushing memtables to files
313-
// and/or add recalculation of compaction score.
314-
// That's why theirs values are cached in ColumnFamilyData.
315-
// Recalculation is made by RecalculateWriteStallConditions and
316-
// RecalculateWriteStallRateLimitsConditions function. They are used
317-
// in DBImpl::MakeRoomForWrite function to decide, if it need
318-
// to sleep during write operation
319-
bool need_wait_for_num_memtables_;
320-
321-
bool need_wait_for_num_level0_files_;
322-
323-
bool exceeds_hard_rate_limit_;
324-
325-
bool exceeds_soft_rate_limit_;
326-
327285
// An object that keeps all the compaction stats
328286
// and picks the next compaction
329287
std::unique_ptr<CompactionPicker> compaction_picker_;
330288

331289
ColumnFamilySet* column_family_set_;
290+
291+
std::unique_ptr<WriteControllerToken> write_controller_token_;
332292
};
333293

334294
// ColumnFamilySet has interesting thread-safety requirements
@@ -370,7 +330,8 @@ class ColumnFamilySet {
370330
};
371331

372332
ColumnFamilySet(const std::string& dbname, const DBOptions* db_options,
373-
const EnvOptions& env_options, Cache* table_cache);
333+
const EnvOptions& env_options, Cache* table_cache,
334+
WriteController* write_controller);
374335
~ColumnFamilySet();
375336

376337
ColumnFamilyData* GetDefault() const;
@@ -425,6 +386,7 @@ class ColumnFamilySet {
425386
const DBOptions* const db_options_;
426387
const EnvOptions env_options_;
427388
Cache* table_cache_;
389+
WriteController* write_controller_;
428390
std::atomic_flag spin_lock_;
429391
};
430392

0 commit comments

Comments
 (0)