Skip to content

Commit dee91c2

Browse files
committed
WriteThread
Summary: This diff just moves the write thread control out of DBImpl. I need this because I plan to control column family data concurrency by accessing some data only from the write thread. That way, we won't have to lock our accesses to the column family hash table (the mapping from IDs to CFDs). Test Plan: make check. Reviewers: sdong, yhchiang, ljin. Reviewed By: ljin. Subscribers: leveldb. Differential Revision: https://reviews.facebook.net/D23301
1 parent 540a257 commit dee91c2

File tree

5 files changed

+246
-220
lines changed

5 files changed

+246
-220
lines changed

db/db_impl.cc

+12-166
Original file line number | Diff line number | Diff line change
@@ -1915,14 +1915,6 @@ Status DBImpl::RunManualCompaction(ColumnFamilyData* cfd, int input_level,
19151915

19161916
Status DBImpl::FlushMemTable(ColumnFamilyData* cfd,
19171917
const FlushOptions& options) {
1918-
Writer w(&mutex_);
1919-
w.batch = nullptr;
1920-
w.sync = false;
1921-
w.disableWAL = false;
1922-
w.in_batch_group = false;
1923-
w.done = false;
1924-
w.timeout_hint_us = kNoTimeOut;
1925-
19261918
Status s;
19271919
{
19281920
WriteContext context;
@@ -1933,7 +1925,8 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd,
19331925
return Status::OK();
19341926
}
19351927

1936-
s = BeginWrite(&w, 0);
1928+
WriteThread::Writer w(&mutex_);
1929+
s = write_thread_.EnterWriteThread(&w, 0);
19371930
assert(s.ok() && !w.done); // No timeout and nobody should do our job
19381931

19391932
// SetNewMemtableAndNewLogFile() will release and reacquire mutex
@@ -1942,12 +1935,9 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd,
19421935
cfd->imm()->FlushRequested();
19431936
MaybeScheduleFlushOrCompaction();
19441937

1945-
assert(!writers_.empty());
1946-
assert(writers_.front() == &w);
1947-
EndWrite(&w, &w, s);
1938+
write_thread_.ExitWriteThread(&w, &w, s);
19481939
}
19491940

1950-
19511941
if (s.ok() && options.wait) {
19521942
// Wait until the compaction completes
19531943
s = WaitForFlushMemTable(cfd);
@@ -3652,13 +3642,6 @@ Status DBImpl::DropColumnFamily(ColumnFamilyHandle* column_family) {
36523642
edit.DropColumnFamily();
36533643
edit.SetColumnFamily(cfd->GetID());
36543644

3655-
Writer w(&mutex_);
3656-
w.batch = nullptr;
3657-
w.sync = false;
3658-
w.disableWAL = false;
3659-
w.in_batch_group = false;
3660-
w.done = false;
3661-
w.timeout_hint_us = kNoTimeOut;
36623645

36633646
Status s;
36643647
{
@@ -3668,10 +3651,11 @@ Status DBImpl::DropColumnFamily(ColumnFamilyHandle* column_family) {
36683651
}
36693652
if (s.ok()) {
36703653
// we drop column family from a single write thread
3671-
s = BeginWrite(&w, 0);
3654+
WriteThread::Writer w(&mutex_);
3655+
s = write_thread_.EnterWriteThread(&w, 0);
36723656
assert(s.ok() && !w.done); // No timeout and nobody should do our job
36733657
s = versions_->LogAndApply(cfd, &edit, &mutex_);
3674-
EndWrite(&w, &w, s);
3658+
write_thread_.ExitWriteThread(&w, &w, s);
36753659
}
36763660
}
36773661

@@ -3891,88 +3875,12 @@ Status DBImpl::Delete(const WriteOptions& options,
38913875
return DB::Delete(options, column_family, key);
38923876
}
38933877

3894-
// REQUIRES: mutex_ is held
3895-
Status DBImpl::BeginWrite(Writer* w, uint64_t expiration_time) {
3896-
// the following code block pushes the current writer "w" into the writer
3897-
// queue "writers_" and waits until one of the following conditions is met:
3898-
// 1. the job of "w" has been done by some other writers.
3899-
// 2. "w" becomes the first writer in "writers_"
3900-
// 3. "w" timed-out.
3901-
mutex_.AssertHeld();
3902-
writers_.push_back(w);
3903-
3904-
bool timed_out = false;
3905-
while (!w->done && w != writers_.front()) {
3906-
if (expiration_time == 0) {
3907-
w->cv.Wait();
3908-
} else if (w->cv.TimedWait(expiration_time)) {
3909-
if (w->in_batch_group) {
3910-
// then it means the front writer is currently doing the
3911-
// write on behalf of this "timed-out" writer. Then it
3912-
// should wait until the write completes.
3913-
expiration_time = 0;
3914-
} else {
3915-
timed_out = true;
3916-
break;
3917-
}
3918-
}
3919-
}
3920-
3921-
if (timed_out) {
3922-
#ifndef NDEBUG
3923-
bool found = false;
3924-
#endif
3925-
for (auto iter = writers_.begin(); iter != writers_.end(); iter++) {
3926-
if (*iter == w) {
3927-
writers_.erase(iter);
3928-
#ifndef NDEBUG
3929-
found = true;
3930-
#endif
3931-
break;
3932-
}
3933-
}
3934-
#ifndef NDEBUG
3935-
assert(found);
3936-
#endif
3937-
// writers_.front() might still be in cond_wait without a time-out.
3938-
// As a result, we need to signal it to wake it up. Otherwise no
3939-
// one else will wake it up, and RocksDB will hang.
3940-
if (!writers_.empty()) {
3941-
writers_.front()->cv.Signal();
3942-
}
3943-
return Status::TimedOut();
3944-
}
3945-
return Status::OK();
3946-
}
3947-
3948-
// REQUIRES: mutex_ is held
3949-
void DBImpl::EndWrite(Writer* w, Writer* last_writer, Status status) {
3950-
// Pop out the current writer and all writers being pushed before the
3951-
// current writer from the writer queue.
3952-
mutex_.AssertHeld();
3953-
while (!writers_.empty()) {
3954-
Writer* ready = writers_.front();
3955-
writers_.pop_front();
3956-
if (ready != w) {
3957-
ready->status = status;
3958-
ready->done = true;
3959-
ready->cv.Signal();
3960-
}
3961-
if (ready == last_writer) break;
3962-
}
3963-
3964-
// Notify new head of write queue
3965-
if (!writers_.empty()) {
3966-
writers_.front()->cv.Signal();
3967-
}
3968-
}
3969-
39703878
Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
39713879
if (my_batch == nullptr) {
39723880
return Status::Corruption("Batch is nullptr!");
39733881
}
39743882
PERF_TIMER_GUARD(write_pre_and_post_process_time);
3975-
Writer w(&mutex_);
3883+
WriteThread::Writer w(&mutex_);
39763884
w.batch = my_batch;
39773885
w.sync = options.sync;
39783886
w.disableWAL = options.disableWAL;
@@ -3983,7 +3891,7 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
39833891
uint64_t expiration_time = 0;
39843892
bool has_timeout = false;
39853893
if (w.timeout_hint_us == 0) {
3986-
w.timeout_hint_us = kNoTimeOut;
3894+
w.timeout_hint_us = WriteThread::kNoTimeOut;
39873895
} else {
39883896
expiration_time = env_->NowMicros() + w.timeout_hint_us;
39893897
has_timeout = true;
@@ -3996,7 +3904,7 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
39963904

39973905
WriteContext context;
39983906
mutex_.Lock();
3999-
Status status = BeginWrite(&w, expiration_time);
3907+
Status status = write_thread_.EnterWriteThread(&w, expiration_time);
40003908
assert(status.ok() || status.IsTimedOut());
40013909
if (status.IsTimedOut()) {
40023910
mutex_.Unlock();
@@ -4066,10 +3974,10 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
40663974
}
40673975

40683976
uint64_t last_sequence = versions_->LastSequence();
4069-
Writer* last_writer = &w;
3977+
WriteThread::Writer* last_writer = &w;
40703978
if (status.ok()) {
40713979
autovector<WriteBatch*> write_batch_group;
4072-
BuildBatchGroup(&last_writer, &write_batch_group);
3980+
write_thread_.BuildBatchGroup(&last_writer, &write_batch_group);
40733981

40743982
// Add to log and apply to memtable. We can release the lock
40753983
// during this phase since &w is currently responsible for logging
@@ -4161,7 +4069,7 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
41614069
bg_error_ = status; // stop compaction & fail any further writes
41624070
}
41634071

4164-
EndWrite(&w, last_writer, status);
4072+
write_thread_.ExitWriteThread(&w, last_writer, status);
41654073
mutex_.Unlock();
41664074

41674075
if (status.IsTimedOut()) {
@@ -4171,68 +4079,6 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
41714079
return status;
41724080
}
41734081

4174-
// This function will be called only when the first writer succeeds.
4175-
// All writers in the to-be-built batch group will be processed.
4176-
//
4177-
// REQUIRES: Writer list must be non-empty
4178-
// REQUIRES: First writer must have a non-nullptr batch
4179-
void DBImpl::BuildBatchGroup(Writer** last_writer,
4180-
autovector<WriteBatch*>* write_batch_group) {
4181-
assert(!writers_.empty());
4182-
Writer* first = writers_.front();
4183-
assert(first->batch != nullptr);
4184-
4185-
size_t size = WriteBatchInternal::ByteSize(first->batch);
4186-
write_batch_group->push_back(first->batch);
4187-
4188-
// Allow the group to grow up to a maximum size, but if the
4189-
// original write is small, limit the growth so we do not slow
4190-
// down the small write too much.
4191-
size_t max_size = 1 << 20;
4192-
if (size <= (128<<10)) {
4193-
max_size = size + (128<<10);
4194-
}
4195-
4196-
*last_writer = first;
4197-
std::deque<Writer*>::iterator iter = writers_.begin();
4198-
++iter; // Advance past "first"
4199-
for (; iter != writers_.end(); ++iter) {
4200-
Writer* w = *iter;
4201-
if (w->sync && !first->sync) {
4202-
// Do not include a sync write into a batch handled by a non-sync write.
4203-
break;
4204-
}
4205-
4206-
if (!w->disableWAL && first->disableWAL) {
4207-
// Do not include a write that needs WAL into a batch that has
4208-
// WAL disabled.
4209-
break;
4210-
}
4211-
4212-
if (w->timeout_hint_us < first->timeout_hint_us) {
4213-
// Do not include those writes with shorter timeout. Otherwise, we might
4214-
// execute a write that should instead be aborted because of timeout.
4215-
break;
4216-
}
4217-
4218-
if (w->batch == nullptr) {
4219-
// Do not include those writes with nullptr batch. Those are not writes,
4220-
// those are something else. They want to be alone
4221-
break;
4222-
}
4223-
4224-
size += WriteBatchInternal::ByteSize(w->batch);
4225-
if (size > max_size) {
4226-
// Do not make batch too big
4227-
break;
4228-
}
4229-
4230-
write_batch_group->push_back(w->batch);
4231-
w->in_batch_group = true;
4232-
*last_writer = w;
4233-
}
4234-
}
4235-
42364082
// REQUIRES: mutex_ is held
42374083
// REQUIRES: this thread is currently at the front of the writer queue
42384084
void DBImpl::DelayWrite(uint64_t expiration_time) {

db/db_impl.h

+3-44
Original file line number | Diff line number | Diff line change
@@ -34,6 +34,7 @@
3434
#include "db/internal_stats.h"
3535
#include "db/write_controller.h"
3636
#include "db/flush_scheduler.h"
37+
#include "db/write_thread.h"
3738

3839
namespace rocksdb {
3940

@@ -359,44 +360,6 @@ class DBImpl : public DB {
359360
Status WriteLevel0Table(ColumnFamilyData* cfd, autovector<MemTable*>& mems,
360361
VersionEdit* edit, uint64_t* filenumber,
361362
LogBuffer* log_buffer);
362-
// Information kept for every waiting writer
363-
struct Writer {
364-
Status status;
365-
WriteBatch* batch;
366-
bool sync;
367-
bool disableWAL;
368-
bool in_batch_group;
369-
bool done;
370-
uint64_t timeout_hint_us;
371-
port::CondVar cv;
372-
373-
explicit Writer(port::Mutex* mu) : cv(mu) {}
374-
};
375-
376-
// Before applying write operation (such as DBImpl::Write, DBImpl::Flush)
377-
// thread should grab the mutex_ and be the first on writers queue.
378-
// BeginWrite is used for it.
379-
// Be aware! Writer's job can be done by another thread (see DBImpl::Write
380-
// for examples), so check it via w.done before applying changes.
381-
//
382-
// Writer* w: writer to be placed in the queue
383-
// uint64_t expiration_time: maximum time to be in the queue
384-
// See also: EndWrite
385-
Status BeginWrite(Writer* w, uint64_t expiration_time);
386-
387-
// After doing write job, we need to remove already used writers from
388-
// writers_ queue and notify head of the queue about it.
389-
// EndWrite is used for this.
390-
//
391-
// Writer* w: Writer, that was added by BeginWrite function
392-
// Writer* last_writer: Since we can join a few Writers (as DBImpl::Write
393-
// does)
394-
// we should pass last_writer as a parameter to
395-
// EndWrite
396-
// (if you don't touch other writers, just pass w)
397-
// Status status: Status of write operation
398-
// See also: BeginWrite
399-
void EndWrite(Writer* w, Writer* last_writer, Status status);
400363

401364
void DelayWrite(uint64_t expiration_time);
402365

@@ -405,9 +368,6 @@ class DBImpl : public DB {
405368
Status SetNewMemtableAndNewLogFile(ColumnFamilyData* cfd,
406369
WriteContext* context);
407370

408-
void BuildBatchGroup(Writer** last_writer,
409-
autovector<WriteBatch*>* write_batch_group);
410-
411371
// Force current memtable contents to be flushed.
412372
Status FlushMemTable(ColumnFamilyData* cfd, const FlushOptions& options);
413373

@@ -552,8 +512,8 @@ class DBImpl : public DB {
552512

553513
std::unique_ptr<Directory> db_directory_;
554514

555-
// Queue of writers.
556-
std::deque<Writer*> writers_;
515+
WriteThread write_thread_;
516+
557517
WriteBatch tmp_batch_;
558518

559519
WriteController write_controller_;
@@ -627,7 +587,6 @@ class DBImpl : public DB {
627587
bool flush_on_destroy_; // Used when disableWAL is true.
628588

629589
static const int KEEP_LOG_FILE_NUM = 1000;
630-
static const uint64_t kNoTimeOut = std::numeric_limits<uint64_t>::max();
631590
std::string db_absolute_path_;
632591

633592
// The options to access storage files

db/db_impl_debug.cc

+4-10
Original file line number | Diff line number | Diff line change
@@ -140,21 +140,15 @@ void DBImpl::TEST_UnlockMutex() {
140140
}
141141

142142
void* DBImpl::TEST_BeginWrite() {
143-
auto w = new Writer(&mutex_);
144-
w->batch = nullptr;
145-
w->sync = false;
146-
w->disableWAL = false;
147-
w->in_batch_group = false;
148-
w->done = false;
149-
w->timeout_hint_us = kNoTimeOut;
150-
Status s = BeginWrite(w, 0);
143+
auto w = new WriteThread::Writer(&mutex_);
144+
Status s = write_thread_.EnterWriteThread(w, 0);
151145
assert(s.ok() && !w->done); // No timeout and nobody should do our job
152146
return reinterpret_cast<void*>(w);
153147
}
154148

155149
void DBImpl::TEST_EndWrite(void* w) {
156-
auto writer = reinterpret_cast<Writer*>(w);
157-
EndWrite(writer, writer, Status::OK());
150+
auto writer = reinterpret_cast<WriteThread::Writer*>(w);
151+
write_thread_.ExitWriteThread(writer, writer, Status::OK());
158152
delete writer;
159153
}
160154

0 commit comments

Comments
 (0)