Skip to content

Commit 3615f53

Browse files
committed
Enable flushing memtables from arbitrary column families
Summary: Removed default_cfd_ from all flush code paths. This means we can now flush memtables from arbitrary column families!
Test Plan: Added a new unit test.
Reviewers: dhruba, haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D15789
1 parent 9ca638a commit 3615f53

File tree

5 files changed

+162
-82
lines changed

5 files changed

+162
-82
lines changed

db/column_family_test.cc

+37
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,9 @@ class ColumnFamilyTest {
6666
Status Merge(int cf, const string& key, const string& value) {
6767
return db_->Merge(WriteOptions(), handles_[cf], Slice(key), Slice(value));
6868
}
69+
Status Flush(int cf) {
70+
return db_->Flush(FlushOptions(), handles_[cf]);
71+
}
6972

7073
string Get(int cf, const string& key) {
7174
ReadOptions options;
@@ -238,6 +241,40 @@ TEST(ColumnFamilyTest, IgnoreRecoveredLog) {
238241
}
239242
}
240243

244+
TEST(ColumnFamilyTest, FlushTest) {
245+
ASSERT_OK(Open({"default"}));
246+
CreateColumnFamilies({"one", "two"});
247+
Close();
248+
ASSERT_OK(Open({"default", "one", "two"}));
249+
ASSERT_OK(Put(0, "foo", "v1"));
250+
ASSERT_OK(Put(0, "bar", "v2"));
251+
ASSERT_OK(Put(1, "mirko", "v3"));
252+
ASSERT_OK(Put(0, "foo", "v2"));
253+
ASSERT_OK(Put(2, "fodor", "v5"));
254+
for (int i = 0; i < 3; ++i) {
255+
Flush(i);
256+
}
257+
Close();
258+
ASSERT_OK(Open({"default", "one", "two"}));
259+
260+
for (int iter = 0; iter <= 2; ++iter) {
261+
ASSERT_EQ("v2", Get(0, "foo"));
262+
ASSERT_EQ("v2", Get(0, "bar"));
263+
ASSERT_EQ("v3", Get(1, "mirko"));
264+
ASSERT_EQ("v5", Get(2, "fodor"));
265+
ASSERT_EQ("NOT_FOUND", Get(0, "fodor"));
266+
ASSERT_EQ("NOT_FOUND", Get(1, "fodor"));
267+
ASSERT_EQ("NOT_FOUND", Get(2, "foo"));
268+
if (iter <= 1) {
269+
// reopen
270+
Close();
271+
ASSERT_OK(Open({"default", "one", "two"}));
272+
}
273+
}
274+
Close();
275+
}
276+
277+
241278
} // namespace rocksdb
242279

243280
int main(int argc, char** argv) {

db/db_impl.cc

+100-51
Original file line numberDiff line numberDiff line change
@@ -317,8 +317,12 @@ DBImpl::DBImpl(const Options& options, const std::string& dbname)
317317

318318
DBImpl::~DBImpl() {
319319
// Wait for background work to finish
320-
if (flush_on_destroy_ && default_cfd_->mem()->GetFirstSequenceNumber() != 0) {
321-
FlushMemTable(FlushOptions());
320+
if (flush_on_destroy_) {
321+
for (auto cfd : *versions_->GetColumnFamilySet()) {
322+
if (cfd->mem()->GetFirstSequenceNumber() != 0) {
323+
FlushMemTable(cfd, FlushOptions());
324+
}
325+
}
322326
}
323327
mutex_.Lock();
324328
shutting_down_.Release_Store(this); // Any non-nullptr value is ok
@@ -979,6 +983,9 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence,
979983
for (auto cfd : *versions_->GetColumnFamilySet()) {
980984
if (cfd->mem()->ApproximateMemoryUsage() >
981985
cfd->options()->write_buffer_size) {
986+
// If this asserts, it means that ColumnFamilyMemTablesImpl failed in
987+
// filtering updates to already-flushed column families
988+
assert(cfd->GetLogNumber() <= log_number);
982989
auto iter = version_edits.find(cfd->GetID());
983990
assert(iter != version_edits.end());
984991
VersionEdit* edit = &iter->second;
@@ -1001,8 +1008,20 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence,
10011008
assert(iter != version_edits.end());
10021009
VersionEdit* edit = &iter->second;
10031010

1004-
// flush the final memtable
1005-
status = WriteLevel0TableForRecovery(cfd->mem(), edit);
1011+
if (cfd->GetLogNumber() > log_number) {
1012+
// Column family cfd has already flushed the data
1013+
// from log_number. Memtable has to be empty because
1014+
// we filter the updates based on log_number
1015+
// (in ColumnFamilyMemTablesImpl)
1016+
assert(cfd->mem()->GetFirstSequenceNumber() == 0);
1017+
assert(edit->NumEntries() == 0);
1018+
continue;
1019+
}
1020+
1021+
// flush the final memtable (if non-empty)
1022+
if (cfd->mem()->GetFirstSequenceNumber() != 0) {
1023+
status = WriteLevel0TableForRecovery(cfd->mem(), edit);
1024+
}
10061025
// we still want to clear the memtable, even if the recovery failed
10071026
cfd->CreateNewMemtable();
10081027
if (!status.ok()) {
@@ -1016,6 +1035,12 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence,
10161035
// Since we already recovered log_number, we want all logs
10171036
// with numbers `<= log_number` (includes this one) to be ignored
10181037
edit->SetLogNumber(log_number + 1);
1038+
// we must mark the next log number as used, even though it's
1039+
// not actually used. that is because VersionSet assumes
1040+
// VersionSet::next_file_number_ always to be strictly greater than any
1041+
// log
1042+
// number
1043+
versions_->MarkFileNumberUsed(log_number + 1);
10191044
status = versions_->LogAndApply(cfd, edit, &mutex_);
10201045
if (!status.ok()) {
10211046
return status;
@@ -1077,8 +1102,8 @@ Status DBImpl::WriteLevel0TableForRecovery(MemTable* mem, VersionEdit* edit) {
10771102
return s;
10781103
}
10791104

1080-
1081-
Status DBImpl::WriteLevel0Table(std::vector<MemTable*> &mems, VersionEdit* edit,
1105+
Status DBImpl::WriteLevel0Table(ColumnFamilyData* cfd,
1106+
std::vector<MemTable*>& mems, VersionEdit* edit,
10821107
uint64_t* filenumber) {
10831108
mutex_.AssertHeld();
10841109
const uint64_t start_micros = env_->NowMicros();
@@ -1090,7 +1115,7 @@ Status DBImpl::WriteLevel0Table(std::vector<MemTable*> &mems, VersionEdit* edit,
10901115
const SequenceNumber newest_snapshot = snapshots_.GetNewest();
10911116
const SequenceNumber earliest_seqno_in_memtable =
10921117
mems[0]->GetFirstSequenceNumber();
1093-
Version* base = default_cfd_->current();
1118+
Version* base = cfd->current();
10941119
base->Ref(); // it is likely that we do not need this reference
10951120
Status s;
10961121
{
@@ -1127,7 +1152,7 @@ Status DBImpl::WriteLevel0Table(std::vector<MemTable*> &mems, VersionEdit* edit,
11271152

11281153

11291154
// re-acquire the most current version
1130-
base = default_cfd_->current();
1155+
base = cfd->current();
11311156

11321157
// There could be multiple threads writing to its own level-0 file.
11331158
// The pending_outputs cannot be cleared here, otherwise this newly
@@ -1149,7 +1174,7 @@ Status DBImpl::WriteLevel0Table(std::vector<MemTable*> &mems, VersionEdit* edit,
11491174
// threads could be concurrently producing compacted files for
11501175
// that key range.
11511176
if (base != nullptr && options_.max_background_compactions <= 1 &&
1152-
options_.compaction_style == kCompactionStyleLevel) {
1177+
cfd->options()->compaction_style == kCompactionStyleLevel) {
11531178
level = base->PickLevelForMemTableOutput(min_user_key, max_user_key);
11541179
}
11551180
edit->AddFile(level, meta.number, meta.file_size,
@@ -1165,20 +1190,21 @@ Status DBImpl::WriteLevel0Table(std::vector<MemTable*> &mems, VersionEdit* edit,
11651190
return s;
11661191
}
11671192

1168-
Status DBImpl::FlushMemTableToOutputFile(bool* madeProgress,
1193+
Status DBImpl::FlushMemTableToOutputFile(ColumnFamilyData* cfd,
1194+
bool* madeProgress,
11691195
DeletionState& deletion_state) {
11701196
mutex_.AssertHeld();
1171-
assert(default_cfd_->imm()->size() != 0);
1197+
assert(cfd->imm()->size() != 0);
11721198

1173-
if (!default_cfd_->imm()->IsFlushPending()) {
1199+
if (!cfd->imm()->IsFlushPending()) {
11741200
Log(options_.info_log, "FlushMemTableToOutputFile already in progress");
11751201
return Status::IOError("FlushMemTableToOutputFile already in progress");
11761202
}
11771203

11781204
// Save the contents of the earliest memtable as a new Table
11791205
uint64_t file_number;
11801206
std::vector<MemTable*> mems;
1181-
default_cfd_->imm()->PickMemtablesToFlush(&mems);
1207+
cfd->imm()->PickMemtablesToFlush(&mems);
11821208
if (mems.empty()) {
11831209
Log(options_.info_log, "Nothing in memstore to flush");
11841210
return Status::IOError("Nothing in memstore to flush");
@@ -1193,17 +1219,16 @@ Status DBImpl::FlushMemTableToOutputFile(bool* madeProgress,
11931219
edit->SetPrevLogNumber(0);
11941220
// SetLogNumber(log_num) indicates logs with number smaller than log_num
11951221
// will no longer be picked up for recovery.
1196-
edit->SetLogNumber(
1197-
mems.back()->GetNextLogNumber()
1198-
);
1222+
edit->SetLogNumber(mems.back()->GetNextLogNumber());
1223+
edit->SetColumnFamily(cfd->GetID());
11991224

12001225
std::vector<uint64_t> logs_to_delete;
12011226
for (auto mem : mems) {
12021227
logs_to_delete.push_back(mem->GetLogNumber());
12031228
}
12041229

12051230
// This will release and re-acquire the mutex.
1206-
Status s = WriteLevel0Table(mems, edit, &file_number);
1231+
Status s = WriteLevel0Table(cfd, mems, edit, &file_number);
12071232

12081233
if (s.ok() && shutting_down_.Acquire_Load()) {
12091234
s = Status::IOError(
@@ -1212,13 +1237,13 @@ Status DBImpl::FlushMemTableToOutputFile(bool* madeProgress,
12121237
}
12131238

12141239
// Replace immutable memtable with the generated Table
1215-
s = default_cfd_->imm()->InstallMemtableFlushResults(
1216-
default_cfd_, mems, versions_.get(), s, &mutex_, options_.info_log.get(),
1240+
s = cfd->imm()->InstallMemtableFlushResults(
1241+
cfd, mems, versions_.get(), s, &mutex_, options_.info_log.get(),
12171242
file_number, pending_outputs_, &deletion_state.memtables_to_free,
12181243
db_directory_.get());
12191244

12201245
if (s.ok()) {
1221-
InstallSuperVersion(default_cfd_, deletion_state);
1246+
InstallSuperVersion(cfd, deletion_state);
12221247
if (madeProgress) {
12231248
*madeProgress = 1;
12241249
}
@@ -1239,7 +1264,7 @@ Status DBImpl::FlushMemTableToOutputFile(bool* madeProgress,
12391264
Status DBImpl::CompactRange(const ColumnFamilyHandle& column_family,
12401265
const Slice* begin, const Slice* end,
12411266
bool reduce_level, int target_level) {
1242-
Status s = FlushMemTable(FlushOptions());
1267+
Status s = FlushMemTable(default_cfd_, FlushOptions());
12431268
if (!s.ok()) {
12441269
LogFlush(options_.info_log);
12451270
return s;
@@ -1382,8 +1407,12 @@ uint64_t DBImpl::CurrentVersionNumber() const {
13821407

13831408
Status DBImpl::Flush(const FlushOptions& options,
13841409
const ColumnFamilyHandle& column_family) {
1385-
Status status = FlushMemTable(options);
1386-
return status;
1410+
mutex_.Lock();
1411+
auto cfd = versions_->GetColumnFamilySet()->GetColumnFamily(column_family.id);
1412+
mutex_.Unlock();
1413+
assert(cfd != nullptr);
1414+
1415+
return FlushMemTable(cfd, options);
13871416
}
13881417

13891418
SequenceNumber DBImpl::GetLatestSequenceNumber() const {
@@ -1657,35 +1686,36 @@ Status DBImpl::TEST_CompactRange(int level,
16571686
return RunManualCompaction(level, output_level, begin, end);
16581687
}
16591688

1660-
Status DBImpl::FlushMemTable(const FlushOptions& options) {
1689+
Status DBImpl::FlushMemTable(ColumnFamilyData* cfd,
1690+
const FlushOptions& options) {
16611691
// nullptr batch means just wait for earlier writes to be done
16621692
Status s = Write(WriteOptions(), nullptr);
16631693
if (s.ok() && options.wait) {
16641694
// Wait until the compaction completes
1665-
s = WaitForFlushMemTable();
1695+
s = WaitForFlushMemTable(cfd);
16661696
}
16671697
return s;
16681698
}
16691699

1670-
Status DBImpl::WaitForFlushMemTable() {
1700+
Status DBImpl::WaitForFlushMemTable(ColumnFamilyData* cfd) {
16711701
Status s;
16721702
// Wait until the compaction completes
16731703
MutexLock l(&mutex_);
1674-
while (default_cfd_->imm()->size() > 0 && bg_error_.ok()) {
1704+
while (cfd->imm()->size() > 0 && bg_error_.ok()) {
16751705
bg_cv_.Wait();
16761706
}
1677-
if (default_cfd_->imm()->size() != 0) {
1707+
if (!bg_error_.ok()) {
16781708
s = bg_error_;
16791709
}
16801710
return s;
16811711
}
16821712

16831713
Status DBImpl::TEST_FlushMemTable() {
1684-
return FlushMemTable(FlushOptions());
1714+
return FlushMemTable(default_cfd_, FlushOptions());
16851715
}
16861716

16871717
Status DBImpl::TEST_WaitForFlushMemTable() {
1688-
return WaitForFlushMemTable();
1718+
return WaitForFlushMemTable(default_cfd_);
16891719
}
16901720

16911721
Status DBImpl::TEST_WaitForCompact() {
@@ -1710,19 +1740,31 @@ void DBImpl::MaybeScheduleFlushOrCompaction() {
17101740
} else if (shutting_down_.Acquire_Load()) {
17111741
// DB is being deleted; no more background compactions
17121742
} else {
1713-
bool is_flush_pending = default_cfd_->imm()->IsFlushPending();
1743+
bool is_flush_pending = false;
1744+
for (auto cfd : *versions_->GetColumnFamilySet()) {
1745+
if (cfd->imm()->IsFlushPending()) {
1746+
is_flush_pending = true;
1747+
}
1748+
}
17141749
if (is_flush_pending &&
17151750
(bg_flush_scheduled_ < options_.max_background_flushes)) {
17161751
// memtable flush needed
17171752
bg_flush_scheduled_++;
17181753
env_->Schedule(&DBImpl::BGWorkFlush, this, Env::Priority::HIGH);
17191754
}
1755+
bool is_compaction_needed = false;
1756+
for (auto cfd : *versions_->GetColumnFamilySet()) {
1757+
if (cfd->current()->NeedsCompaction()) {
1758+
is_compaction_needed = true;
1759+
break;
1760+
}
1761+
}
17201762

17211763
// Schedule BGWorkCompaction if there's a compaction pending (or a memtable
17221764
// flush, but the HIGH pool is not enabled). Do it only if
17231765
// max_background_compactions hasn't been reached and, in case
17241766
// bg_manual_only_ > 0, if it's a manual compaction.
1725-
if ((manual_compaction_ || default_cfd_->current()->NeedsCompaction() ||
1767+
if ((manual_compaction_ || is_compaction_needed ||
17261768
(is_flush_pending && (options_.max_background_flushes <= 0))) &&
17271769
bg_compaction_scheduled_ < options_.max_background_compactions &&
17281770
(!bg_manual_only_ || manual_compaction_)) {
@@ -1744,11 +1786,14 @@ void DBImpl::BGWorkCompaction(void* db) {
17441786
Status DBImpl::BackgroundFlush(bool* madeProgress,
17451787
DeletionState& deletion_state) {
17461788
Status stat;
1747-
while (stat.ok() && default_cfd_->imm()->IsFlushPending()) {
1748-
Log(options_.info_log,
1749-
"BackgroundCallFlush doing FlushMemTableToOutputFile, flush slots available %d",
1750-
options_.max_background_flushes - bg_flush_scheduled_);
1751-
stat = FlushMemTableToOutputFile(madeProgress, deletion_state);
1789+
for (auto cfd : *versions_->GetColumnFamilySet()) {
1790+
while (stat.ok() && cfd->imm()->IsFlushPending()) {
1791+
Log(options_.info_log,
1792+
"BackgroundCallFlush doing FlushMemTableToOutputFile with column "
1793+
"family %u, flush slots available %d",
1794+
cfd->GetID(), options_.max_background_flushes - bg_flush_scheduled_);
1795+
stat = FlushMemTableToOutputFile(cfd, madeProgress, deletion_state);
1796+
}
17521797
}
17531798
return stat;
17541799
}
@@ -1871,20 +1916,24 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress,
18711916
}
18721917

18731918
// TODO: remove memtable flush from formal compaction
1874-
while (default_cfd_->imm()->IsFlushPending()) {
1875-
Log(options_.info_log,
1876-
"BackgroundCompaction doing FlushMemTableToOutputFile, compaction slots "
1877-
"available %d",
1878-
options_.max_background_compactions - bg_compaction_scheduled_);
1879-
Status stat = FlushMemTableToOutputFile(madeProgress, deletion_state);
1880-
if (!stat.ok()) {
1881-
if (is_manual) {
1882-
manual_compaction_->status = stat;
1883-
manual_compaction_->done = true;
1884-
manual_compaction_->in_progress = false;
1885-
manual_compaction_ = nullptr;
1919+
for (auto cfd : *versions_->GetColumnFamilySet()) {
1920+
while (cfd->imm()->IsFlushPending()) {
1921+
Log(options_.info_log,
1922+
"BackgroundCompaction doing FlushMemTableToOutputFile with column "
1923+
"family %d, compaction slots available %d",
1924+
cfd->GetID(),
1925+
options_.max_background_compactions - bg_compaction_scheduled_);
1926+
Status stat =
1927+
FlushMemTableToOutputFile(cfd, madeProgress, deletion_state);
1928+
if (!stat.ok()) {
1929+
if (is_manual) {
1930+
manual_compaction_->status = stat;
1931+
manual_compaction_->done = true;
1932+
manual_compaction_->in_progress = false;
1933+
manual_compaction_ = nullptr;
1934+
}
1935+
return stat;
18861936
}
1887-
return stat;
18881937
}
18891938
}
18901939

@@ -2285,7 +2334,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact,
22852334
LogFlush(options_.info_log);
22862335
mutex_.Lock();
22872336
if (default_cfd_->imm()->IsFlushPending()) {
2288-
FlushMemTableToOutputFile(nullptr, deletion_state);
2337+
FlushMemTableToOutputFile(default_cfd_, nullptr, deletion_state);
22892338
bg_cv_.SignalAll(); // Wakeup MakeRoomForWrite() if necessary
22902339
}
22912340
mutex_.Unlock();

0 commit comments

Comments
 (0)