@@ -317,8 +317,12 @@ DBImpl::DBImpl(const Options& options, const std::string& dbname)
317
317
318
318
DBImpl::~DBImpl () {
319
319
// Wait for background work to finish
320
- if (flush_on_destroy_ && default_cfd_->mem ()->GetFirstSequenceNumber () != 0 ) {
321
- FlushMemTable (FlushOptions ());
320
+ if (flush_on_destroy_) {
321
+ for (auto cfd : *versions_->GetColumnFamilySet ()) {
322
+ if (cfd->mem ()->GetFirstSequenceNumber () != 0 ) {
323
+ FlushMemTable (cfd, FlushOptions ());
324
+ }
325
+ }
322
326
}
323
327
mutex_.Lock ();
324
328
shutting_down_.Release_Store (this ); // Any non-nullptr value is ok
@@ -979,6 +983,9 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence,
979
983
for (auto cfd : *versions_->GetColumnFamilySet ()) {
980
984
if (cfd->mem ()->ApproximateMemoryUsage () >
981
985
cfd->options ()->write_buffer_size ) {
986
+ // If this asserts, it means that ColumnFamilyMemTablesImpl failed in
987
+ // filtering updates to already-flushed column families
988
+ assert (cfd->GetLogNumber () <= log_number);
982
989
auto iter = version_edits.find (cfd->GetID ());
983
990
assert (iter != version_edits.end ());
984
991
VersionEdit* edit = &iter->second ;
@@ -1001,8 +1008,20 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence,
1001
1008
assert (iter != version_edits.end ());
1002
1009
VersionEdit* edit = &iter->second ;
1003
1010
1004
- // flush the final memtable
1005
- status = WriteLevel0TableForRecovery (cfd->mem (), edit);
1011
+ if (cfd->GetLogNumber () > log_number) {
1012
+ // Column family cfd has already flushed the data
1013
+ // from log_number. Memtable has to be empty because
1014
+ // we filter the updates based on log_number
1015
+ // (in ColumnFamilyMemTablesImpl)
1016
+ assert (cfd->mem ()->GetFirstSequenceNumber () == 0 );
1017
+ assert (edit->NumEntries () == 0 );
1018
+ continue ;
1019
+ }
1020
+
1021
+ // flush the final memtable (if non-empty)
1022
+ if (cfd->mem ()->GetFirstSequenceNumber () != 0 ) {
1023
+ status = WriteLevel0TableForRecovery (cfd->mem (), edit);
1024
+ }
1006
1025
// we still want to clear the memtable, even if the recovery failed
1007
1026
cfd->CreateNewMemtable ();
1008
1027
if (!status.ok ()) {
@@ -1016,6 +1035,12 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence,
1016
1035
// Since we already recovered log_number, we want all logs
1017
1036
// with numbers `<= log_number` (includes this one) to be ignored
1018
1037
edit->SetLogNumber (log_number + 1 );
1038
+ // we must mark the next log number as used, even though it's
1039
+ // not actually used. that is because VersionSet assumes
1040
+ // VersionSet::next_file_number_ always to be strictly greater than any
1041
+ // log
1042
+ // number
1043
+ versions_->MarkFileNumberUsed (log_number + 1 );
1019
1044
status = versions_->LogAndApply (cfd, edit, &mutex_);
1020
1045
if (!status.ok ()) {
1021
1046
return status;
@@ -1077,8 +1102,8 @@ Status DBImpl::WriteLevel0TableForRecovery(MemTable* mem, VersionEdit* edit) {
1077
1102
return s;
1078
1103
}
1079
1104
1080
-
1081
- Status DBImpl::WriteLevel0Table ( std::vector<MemTable*> & mems, VersionEdit* edit,
1105
+ Status DBImpl::WriteLevel0Table (ColumnFamilyData* cfd,
1106
+ std::vector<MemTable*>& mems, VersionEdit* edit,
1082
1107
uint64_t * filenumber) {
1083
1108
mutex_.AssertHeld ();
1084
1109
const uint64_t start_micros = env_->NowMicros ();
@@ -1090,7 +1115,7 @@ Status DBImpl::WriteLevel0Table(std::vector<MemTable*> &mems, VersionEdit* edit,
1090
1115
const SequenceNumber newest_snapshot = snapshots_.GetNewest ();
1091
1116
const SequenceNumber earliest_seqno_in_memtable =
1092
1117
mems[0 ]->GetFirstSequenceNumber ();
1093
- Version* base = default_cfd_ ->current ();
1118
+ Version* base = cfd ->current ();
1094
1119
base->Ref (); // it is likely that we do not need this reference
1095
1120
Status s;
1096
1121
{
@@ -1127,7 +1152,7 @@ Status DBImpl::WriteLevel0Table(std::vector<MemTable*> &mems, VersionEdit* edit,
1127
1152
1128
1153
1129
1154
// re-acquire the most current version
1130
- base = default_cfd_ ->current ();
1155
+ base = cfd ->current ();
1131
1156
1132
1157
// There could be multiple threads writing to its own level-0 file.
1133
1158
// The pending_outputs cannot be cleared here, otherwise this newly
@@ -1149,7 +1174,7 @@ Status DBImpl::WriteLevel0Table(std::vector<MemTable*> &mems, VersionEdit* edit,
1149
1174
// threads could be concurrently producing compacted files for
1150
1175
// that key range.
1151
1176
if (base != nullptr && options_.max_background_compactions <= 1 &&
1152
- options_. compaction_style == kCompactionStyleLevel ) {
1177
+ cfd-> options ()-> compaction_style == kCompactionStyleLevel ) {
1153
1178
level = base->PickLevelForMemTableOutput (min_user_key, max_user_key);
1154
1179
}
1155
1180
edit->AddFile (level, meta.number , meta.file_size ,
@@ -1165,20 +1190,21 @@ Status DBImpl::WriteLevel0Table(std::vector<MemTable*> &mems, VersionEdit* edit,
1165
1190
return s;
1166
1191
}
1167
1192
1168
- Status DBImpl::FlushMemTableToOutputFile (bool * madeProgress,
1193
+ Status DBImpl::FlushMemTableToOutputFile (ColumnFamilyData* cfd,
1194
+ bool * madeProgress,
1169
1195
DeletionState& deletion_state) {
1170
1196
mutex_.AssertHeld ();
1171
- assert (default_cfd_ ->imm ()->size () != 0 );
1197
+ assert (cfd ->imm ()->size () != 0 );
1172
1198
1173
- if (!default_cfd_ ->imm ()->IsFlushPending ()) {
1199
+ if (!cfd ->imm ()->IsFlushPending ()) {
1174
1200
Log (options_.info_log , " FlushMemTableToOutputFile already in progress" );
1175
1201
return Status::IOError (" FlushMemTableToOutputFile already in progress" );
1176
1202
}
1177
1203
1178
1204
// Save the contents of the earliest memtable as a new Table
1179
1205
uint64_t file_number;
1180
1206
std::vector<MemTable*> mems;
1181
- default_cfd_ ->imm ()->PickMemtablesToFlush (&mems);
1207
+ cfd ->imm ()->PickMemtablesToFlush (&mems);
1182
1208
if (mems.empty ()) {
1183
1209
Log (options_.info_log , " Nothing in memstore to flush" );
1184
1210
return Status::IOError (" Nothing in memstore to flush" );
@@ -1193,17 +1219,16 @@ Status DBImpl::FlushMemTableToOutputFile(bool* madeProgress,
1193
1219
edit->SetPrevLogNumber (0 );
1194
1220
// SetLogNumber(log_num) indicates logs with number smaller than log_num
1195
1221
// will no longer be picked up for recovery.
1196
- edit->SetLogNumber (
1197
- mems.back ()->GetNextLogNumber ()
1198
- );
1222
+ edit->SetLogNumber (mems.back ()->GetNextLogNumber ());
1223
+ edit->SetColumnFamily (cfd->GetID ());
1199
1224
1200
1225
std::vector<uint64_t > logs_to_delete;
1201
1226
for (auto mem : mems) {
1202
1227
logs_to_delete.push_back (mem->GetLogNumber ());
1203
1228
}
1204
1229
1205
1230
// This will release and re-acquire the mutex.
1206
- Status s = WriteLevel0Table (mems, edit, &file_number);
1231
+ Status s = WriteLevel0Table (cfd, mems, edit, &file_number);
1207
1232
1208
1233
if (s.ok () && shutting_down_.Acquire_Load ()) {
1209
1234
s = Status::IOError (
@@ -1212,13 +1237,13 @@ Status DBImpl::FlushMemTableToOutputFile(bool* madeProgress,
1212
1237
}
1213
1238
1214
1239
// Replace immutable memtable with the generated Table
1215
- s = default_cfd_ ->imm ()->InstallMemtableFlushResults (
1216
- default_cfd_ , mems, versions_.get (), s, &mutex_, options_.info_log .get (),
1240
+ s = cfd ->imm ()->InstallMemtableFlushResults (
1241
+ cfd , mems, versions_.get (), s, &mutex_, options_.info_log .get (),
1217
1242
file_number, pending_outputs_, &deletion_state.memtables_to_free ,
1218
1243
db_directory_.get ());
1219
1244
1220
1245
if (s.ok ()) {
1221
- InstallSuperVersion (default_cfd_ , deletion_state);
1246
+ InstallSuperVersion (cfd , deletion_state);
1222
1247
if (madeProgress) {
1223
1248
*madeProgress = 1 ;
1224
1249
}
@@ -1239,7 +1264,7 @@ Status DBImpl::FlushMemTableToOutputFile(bool* madeProgress,
1239
1264
Status DBImpl::CompactRange (const ColumnFamilyHandle& column_family,
1240
1265
const Slice* begin, const Slice* end,
1241
1266
bool reduce_level, int target_level) {
1242
- Status s = FlushMemTable (FlushOptions ());
1267
+ Status s = FlushMemTable (default_cfd_, FlushOptions ());
1243
1268
if (!s.ok ()) {
1244
1269
LogFlush (options_.info_log );
1245
1270
return s;
@@ -1382,8 +1407,12 @@ uint64_t DBImpl::CurrentVersionNumber() const {
1382
1407
1383
1408
// Flushes the column family named by column_family.id.
// This span is a diff: lines prefixed with a minus sign are the removed
// implementation, lines prefixed with a plus sign are the new one.
// New behavior: look up the ColumnFamilyData for column_family.id while
// holding mutex_, release the mutex, then forward that pointer and `options`
// to FlushMemTable() and return its Status.
Status DBImpl::Flush (const FlushOptions& options,
1384
1409
const ColumnFamilyHandle& column_family) {
1385
- Status status = FlushMemTable (options);
1386
- return status;
1410
+ mutex_.Lock ();
1411
+ auto cfd = versions_->GetColumnFamilySet ()->GetColumnFamily (column_family.id );
1412
+ mutex_.Unlock ();
1413
// NOTE(review): the lookup result is checked only by assert, which is
// compiled out under NDEBUG; a null cfd would then be handed to
// FlushMemTable(). Confirm GetColumnFamily() cannot return nullptr for a
// caller-supplied id, or consider returning an error Status instead.
+ assert (cfd != nullptr );
1414
+
1415
+ return FlushMemTable (cfd, options);
1387
1416
}
1388
1417
1389
1418
SequenceNumber DBImpl::GetLatestSequenceNumber () const {
@@ -1657,35 +1686,36 @@ Status DBImpl::TEST_CompactRange(int level,
1657
1686
return RunManualCompaction (level, output_level, begin, end);
1658
1687
}
1659
1688
1660
// Calls Write() with a nullptr batch and, when that succeeds and options.wait
// is set, blocks in WaitForFlushMemTable(). Returns the Status of Write(), or
// the Status of the wait when the wait is performed.
// The added signature takes the target ColumnFamilyData* cfd; in the code
// visible here cfd is used only for the wait and is not passed to Write().
- Status DBImpl::FlushMemTable (const FlushOptions& options) {
1689
+ Status DBImpl::FlushMemTable (ColumnFamilyData* cfd,
1690
+ const FlushOptions& options) {
1661
1691
// nullptr batch means just wait for earlier writes to be done
1662
1692
Status s = Write (WriteOptions (), nullptr );
1663
1693
if (s.ok () && options.wait ) {
1664
1694
// Wait until the compaction completes
1665
- s = WaitForFlushMemTable ();
1695
+ s = WaitForFlushMemTable (cfd );
1666
1696
}
1667
1697
return s;
1668
1698
}
1669
1699
1670
// Blocks on bg_cv_, with mutex_ held through a MutexLock, for as long as the
// column family's immutable memtable list is non-empty and bg_error_ is OK.
// Returns bg_error_ when it is non-OK after the loop, otherwise the
// default-constructed Status.
// The added signature waits on the caller-supplied cfd instead of
// default_cfd_.
- Status DBImpl::WaitForFlushMemTable () {
1700
+ Status DBImpl::WaitForFlushMemTable (ColumnFamilyData* cfd ) {
1671
1701
Status s;
1672
1702
// Wait until the immutable memtable list is empty or a background error is set
1673
1703
MutexLock l (&mutex_);
1674
- while (default_cfd_ ->imm ()->size () > 0 && bg_error_.ok ()) {
1704
+ while (cfd ->imm ()->size () > 0 && bg_error_.ok ()) {
1675
1705
bg_cv_.Wait ();
1676
1706
}
1677
// The post-loop test changes from checking that the list is still non-empty
// to checking bg_error_ directly, the other condition that ends the loop.
- if (default_cfd_-> imm ()-> size () != 0 ) {
1707
+ if (!bg_error_. ok () ) {
1678
1708
s = bg_error_;
1679
1709
}
1680
1710
return s;
1681
1711
}
1682
1712
1683
1713
// Test entry point: forwards to FlushMemTable() with default-constructed
// FlushOptions. The added line passes default_cfd_ explicitly as the column
// family, matching the new FlushMemTable(cfd, options) signature.
Status DBImpl::TEST_FlushMemTable () {
1684
- return FlushMemTable (FlushOptions ());
1714
+ return FlushMemTable (default_cfd_, FlushOptions ());
1685
1715
}
1686
1716
1687
1717
// Test entry point: forwards to WaitForFlushMemTable() and returns its
// Status. The added line passes default_cfd_ explicitly, matching the new
// WaitForFlushMemTable(cfd) signature.
Status DBImpl::TEST_WaitForFlushMemTable () {
1688
- return WaitForFlushMemTable ();
1718
+ return WaitForFlushMemTable (default_cfd_ );
1689
1719
}
1690
1720
1691
1721
Status DBImpl::TEST_WaitForCompact () {
@@ -1710,19 +1740,31 @@ void DBImpl::MaybeScheduleFlushOrCompaction() {
1710
1740
} else if (shutting_down_.Acquire_Load ()) {
1711
1741
// DB is being deleted; no more background compactions
1712
1742
} else {
1713
- bool is_flush_pending = default_cfd_->imm ()->IsFlushPending ();
1743
+ bool is_flush_pending = false ;
1744
+ for (auto cfd : *versions_->GetColumnFamilySet ()) {
1745
+ if (cfd->imm ()->IsFlushPending ()) {
1746
+ is_flush_pending = true ;
1747
+ }
1748
+ }
1714
1749
if (is_flush_pending &&
1715
1750
(bg_flush_scheduled_ < options_.max_background_flushes )) {
1716
1751
// memtable flush needed
1717
1752
bg_flush_scheduled_++;
1718
1753
env_->Schedule (&DBImpl::BGWorkFlush, this , Env::Priority::HIGH);
1719
1754
}
1755
+ bool is_compaction_needed = false ;
1756
+ for (auto cfd : *versions_->GetColumnFamilySet ()) {
1757
+ if (cfd->current ()->NeedsCompaction ()) {
1758
+ is_compaction_needed = true ;
1759
+ break ;
1760
+ }
1761
+ }
1720
1762
1721
1763
// Schedule BGWorkCompaction if there's a compaction pending (or a memtable
1722
1764
// flush, but the HIGH pool is not enabled). Do it only if
1723
1765
// max_background_compactions hasn't been reached and, in case
1724
1766
// bg_manual_only_ > 0, if it's a manual compaction.
1725
- if ((manual_compaction_ || default_cfd_-> current ()-> NeedsCompaction () ||
1767
+ if ((manual_compaction_ || is_compaction_needed ||
1726
1768
(is_flush_pending && (options_.max_background_flushes <= 0 ))) &&
1727
1769
bg_compaction_scheduled_ < options_.max_background_compactions &&
1728
1770
(!bg_manual_only_ || manual_compaction_)) {
@@ -1744,11 +1786,14 @@ void DBImpl::BGWorkCompaction(void* db) {
1744
1786
// Flushes immutable memtables that report IsFlushPending().
// madeProgress and deletion_state are forwarded unchanged to
// FlushMemTableToOutputFile(). Returns the first non-OK Status produced by
// that call, otherwise the default-constructed Status.
// The removed lines looped over default_cfd_ only; the added lines repeat the
// same loop for every column family in versions_->GetColumnFamilySet().
Status DBImpl::BackgroundFlush (bool * madeProgress,
1745
1787
DeletionState& deletion_state) {
1746
1788
Status stat;
1747
- while (stat.ok () && default_cfd_->imm ()->IsFlushPending ()) {
1748
- Log (options_.info_log ,
1749
- " BackgroundCallFlush doing FlushMemTableToOutputFile, flush slots available %d" ,
1750
- options_.max_background_flushes - bg_flush_scheduled_);
1751
- stat = FlushMemTableToOutputFile (madeProgress, deletion_state);
1789
+ for (auto cfd : *versions_->GetColumnFamilySet ()) {
1790
// stat.ok() is part of the inner condition, so after one failed flush the
// remaining column families are still visited but nothing more is flushed.
+ while (stat.ok () && cfd->imm ()->IsFlushPending ()) {
1791
+ Log (options_.info_log ,
1792
+ " BackgroundCallFlush doing FlushMemTableToOutputFile with column "
1793
+ " family %u, flush slots available %d" ,
1794
+ cfd->GetID (), options_.max_background_flushes - bg_flush_scheduled_);
1795
+ stat = FlushMemTableToOutputFile (cfd, madeProgress, deletion_state);
1796
+ }
1752
1797
}
1753
1798
return stat;
1754
1799
}
@@ -1871,20 +1916,24 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress,
1871
1916
}
1872
1917
1873
1918
// TODO: remove memtable flush from formal compaction
1874
- while (default_cfd_->imm ()->IsFlushPending ()) {
1875
- Log (options_.info_log ,
1876
- " BackgroundCompaction doing FlushMemTableToOutputFile, compaction slots "
1877
- " available %d" ,
1878
- options_.max_background_compactions - bg_compaction_scheduled_);
1879
- Status stat = FlushMemTableToOutputFile (madeProgress, deletion_state);
1880
- if (!stat.ok ()) {
1881
- if (is_manual) {
1882
- manual_compaction_->status = stat;
1883
- manual_compaction_->done = true ;
1884
- manual_compaction_->in_progress = false ;
1885
- manual_compaction_ = nullptr ;
1919
+ for (auto cfd : *versions_->GetColumnFamilySet ()) {
1920
+ while (cfd->imm ()->IsFlushPending ()) {
1921
+ Log (options_.info_log ,
1922
+ " BackgroundCompaction doing FlushMemTableToOutputFile with column "
1923
+ " family %d, compaction slots available %d" ,
1924
+ cfd->GetID (),
1925
+ options_.max_background_compactions - bg_compaction_scheduled_);
1926
+ Status stat =
1927
+ FlushMemTableToOutputFile (cfd, madeProgress, deletion_state);
1928
+ if (!stat.ok ()) {
1929
+ if (is_manual) {
1930
+ manual_compaction_->status = stat;
1931
+ manual_compaction_->done = true ;
1932
+ manual_compaction_->in_progress = false ;
1933
+ manual_compaction_ = nullptr ;
1934
+ }
1935
+ return stat;
1886
1936
}
1887
- return stat;
1888
1937
}
1889
1938
}
1890
1939
@@ -2285,7 +2334,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact,
2285
2334
LogFlush (options_.info_log );
2286
2335
mutex_.Lock ();
2287
2336
if (default_cfd_->imm ()->IsFlushPending ()) {
2288
- FlushMemTableToOutputFile (nullptr , deletion_state);
2337
+ FlushMemTableToOutputFile (default_cfd_, nullptr , deletion_state);
2289
2338
bg_cv_.SignalAll (); // Wakeup MakeRoomForWrite() if necessary
2290
2339
}
2291
2340
mutex_.Unlock ();
0 commit comments