Skip to content

Commit f24a3ee

Browse files
committed
Read from and write to different column families
Summary: This one is big. It adds the ability to write to and read from different column families (see the unit test). It also supports recovery of different column families from the log, which was the hardest part to reason about. We need to make sure to never delete a log file that has unflushed data from any column family. To support that, I added another concept, versions_->MinLogNumber(). Test Plan: Added a unit test in column_family_test. Reviewers: dhruba, haobo, sdong, kailiu. CC: leveldb. Differential Revision: https://reviews.facebook.net/D15537
1 parent c1071ed commit f24a3ee

9 files changed

+400
-74
lines changed

db/column_family.cc

+16-1
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,8 @@ ColumnFamilyData::ColumnFamilyData(uint32_t id, const std::string& name,
6969
options(options),
7070
mem(nullptr),
7171
imm(options.min_write_buffer_number_to_merge),
72-
super_version(nullptr) {}
72+
super_version(nullptr),
73+
log_number(0) {}
7374

7475
ColumnFamilyData::~ColumnFamilyData() {
7576
if (super_version != nullptr) {
@@ -167,4 +168,18 @@ void ColumnFamilySet::DropColumnFamily(uint32_t id) {
167168
column_family_data_.erase(cfd);
168169
}
169170

171+
// Looks up the column family with the given id in column_family_set_ and
// returns its memtable (cfd->mem), or nullptr when log_number_ is non-zero and
// smaller than the column family's log_number.
// The column family must exist; a missing one is only caught by an assert.
MemTable* ColumnFamilyMemTablesImpl::GetMemTable(uint32_t column_family_id) {
172+
auto cfd = column_family_set_->GetColumnFamily(column_family_id);
173+
// TODO(icanadi): this should not be asserting. Rather, it should somehow
174+
// return Corruption status back to the Iterator. This will require
175+
// API change in WriteBatch::Handler, which is a public API
176+
assert(cfd != nullptr);
177+
178+
// log_number_ == 0 disables the check; otherwise the memtable is handed out
// only when log_number_ is at least the column family's log_number.
if (log_number_ == 0 || log_number_ >= cfd->log_number) {
179+
return cfd->mem;
180+
} else {
181+
return nullptr;
182+
}
183+
}
184+
170185
} // namespace rocksdb

db/column_family.h

+26
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515

1616
#include "rocksdb/options.h"
1717
#include "db/memtablelist.h"
18+
#include "db/write_batch_internal.h"
1819

1920
namespace rocksdb {
2021

@@ -63,6 +64,11 @@ struct ColumnFamilyData {
6364
MemTableList imm;
6465
SuperVersion* super_version;
6566

67+
// This is the earliest log file number that contains data from this
68+
// Column Family. All earlier log files must be ignored and not
69+
// recovered from
70+
uint64_t log_number;
71+
6672
ColumnFamilyData(uint32_t id, const std::string& name,
6773
Version* dummy_versions, const ColumnFamilyOptions& options);
6874
~ColumnFamilyData();
@@ -122,4 +128,24 @@ class ColumnFamilySet {
122128
uint32_t max_column_family_;
123129
};
124130

131+
// ColumnFamilyMemTables implementation that maps a column family id to its
// memtable via a ColumnFamilySet, with an optional filter on a log number
// (see SetLogNumber).
class ColumnFamilyMemTablesImpl : public ColumnFamilyMemTables {
132+
public:
133+
// Stores the given set pointer and starts with log_number_ == 0.
explicit ColumnFamilyMemTablesImpl(ColumnFamilySet* column_family_set)
134+
: column_family_set_(column_family_set), log_number_(0) {}
135+
136+
// If column_family_data->log_number is bigger than log_number,
137+
// the memtable will not be returned.
138+
// If log_number == 0, the memtable will be always returned
139+
void SetLogNumber(uint64_t log_number) { log_number_ = log_number; }
140+
141+
// Returns the column family's memtable if log_number == 0 || log_number >=
142+
// column_family_data->log_number.
143+
// If column family doesn't exist, it asserts
144+
virtual MemTable* GetMemTable(uint32_t column_family_id) override;
145+
146+
private:
147+
ColumnFamilySet* column_family_set_;
148+
// Value last passed to SetLogNumber(); 0 until then.
uint64_t log_number_;
149+
};
150+
125151
} // namespace rocksdb

db/column_family_test.cc

+169-6
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,10 @@
88
// found in the LICENSE file. See the AUTHORS file for names of contributors.
99

1010
#include "db/db_impl.h"
11+
#include "rocksdb/env.h"
1112
#include "rocksdb/db.h"
1213
#include "util/testharness.h"
14+
#include "utilities/merge_operators.h"
1315

1416
#include <algorithm>
1517
#include <vector>
@@ -22,10 +24,10 @@ using namespace std;
2224
class ColumnFamilyTest {
2325
public:
2426
ColumnFamilyTest() {
27+
env_ = Env::Default();
2528
dbname_ = test::TmpDir() + "/column_family_test";
2629
db_options_.create_if_missing = true;
27-
options_.create_if_missing = true;
28-
DestroyDB(dbname_, options_);
30+
DestroyDB(dbname_, Options(db_options_, column_family_options_));
2931
}
3032

3133
void Close() {
@@ -37,18 +39,77 @@ class ColumnFamilyTest {
3739
vector<ColumnFamilyDescriptor> column_families;
3840
for (auto x : cf) {
3941
column_families.push_back(
40-
ColumnFamilyDescriptor(x, ColumnFamilyOptions()));
42+
ColumnFamilyDescriptor(x, column_family_options_));
4143
}
42-
vector <ColumnFamilyHandle> handles;
4344
return DB::OpenWithColumnFamilies(db_options_, dbname_, column_families,
44-
&handles, &db_);
45+
&handles_, &db_);
4546
}
4647

47-
Options options_;
48+
// Deletes the DB object, resets db_ to nullptr, and asserts that DestroyDB()
// on dbname_ succeeds with the fixture's current options.
// NOTE(review): the constructor visible in this diff does not initialize db_,
// and IgnoreRecoveredLog calls Destroy() before any Open() — confirm db_ is
// null or valid at that point, otherwise this delete is undefined behavior.
void Destroy() {
49+
delete db_;
50+
db_ = nullptr;
51+
ASSERT_OK(DestroyDB(dbname_, Options(db_options_, column_family_options_)));
52+
}
53+
54+
// Creates one column family per name in cfs using column_family_options_ and
// appends the resulting handles to handles_, asserting each creation succeeds.
void CreateColumnFamilies(const vector<string>& cfs) {
55+
// Index of the first new handle slot.
// NOTE(review): handles_.size() is narrowed to int here.
int cfi = handles_.size();
56+
// Grow handles_ up front so each new handle can be written in place.
handles_.resize(cfi + cfs.size());
57+
// NOTE(review): `auto cf` copies each name; a const reference would avoid it.
for (auto cf : cfs) {
58+
ASSERT_OK(db_->CreateColumnFamily(column_family_options_, cf,
59+
&handles_[cfi++]));
60+
}
61+
}
62+
63+
// Writes key -> value with default WriteOptions into the column family whose
// handle is handles_[cf], and returns the resulting Status.
// cf is an index into handles_; it is not bounds-checked here.
Status Put(int cf, const string& key, const string& value) {
64+
return db_->Put(WriteOptions(), handles_[cf], Slice(key), Slice(value));
65+
}
66+
// Issues a Merge of value for key with default WriteOptions against the column
// family whose handle is handles_[cf], and returns the resulting Status.
// cf is an index into handles_; it is not bounds-checked here.
Status Merge(int cf, const string& key, const string& value) {
67+
return db_->Merge(WriteOptions(), handles_[cf], Slice(key), Slice(value));
68+
}
69+
70+
// Reads key from the column family whose handle is handles_[cf], with
// verify_checksums enabled. Returns the stored value on success, the literal
// "NOT_FOUND" when the status reports not-found, and the status string for any
// other error, so tests can compare the result against an expected string.
string Get(int cf, const string& key) {
71+
ReadOptions options;
72+
options.verify_checksums = true;
73+
string result;
74+
Status s = db_->Get(options, handles_[cf], Slice(key), &result);
75+
if (s.IsNotFound()) {
76+
result = "NOT_FOUND";
77+
} else if (!s.ok()) {
78+
// Any other failure is surfaced as its textual description.
result = s.ToString();
79+
}
80+
return result;
81+
}
82+
83+
// Copies `size` bytes from the file at `source` to a newly opened writable
// file at `destination` through env_, in chunks of at most 4096 bytes.
// A size of 0 (the default) means the whole source file, as reported by
// GetFileSize(). Every env/file operation is wrapped in ASSERT_OK.
void CopyFile(const string& source, const string& destination,
84+
uint64_t size = 0) {
85+
const EnvOptions soptions;
86+
unique_ptr<SequentialFile> srcfile;
87+
ASSERT_OK(env_->NewSequentialFile(source, &srcfile, soptions));
88+
unique_ptr<WritableFile> destfile;
89+
ASSERT_OK(env_->NewWritableFile(destination, &destfile, soptions));
90+
91+
if (size == 0) {
92+
// default argument means copy everything
93+
ASSERT_OK(env_->GetFileSize(source, &size));
94+
}
95+
96+
char buffer[4096];
97+
Slice slice;
98+
// NOTE(review): progress depends on slice.size(); if Read() ever yields an
// empty slice while size > 0 (e.g. size exceeds the file length), this loop
// would not terminate — confirm callers never request more than is available.
while (size > 0) {
99+
// Bytes to request this round: the remaining size, capped at the buffer size.
uint64_t one = min(uint64_t(sizeof(buffer)), size);
100+
ASSERT_OK(srcfile->Read(one, &slice, buffer));
101+
ASSERT_OK(destfile->Append(slice));
102+
size -= slice.size();
103+
}
104+
ASSERT_OK(destfile->Close());
105+
}
106+
107+
vector<ColumnFamilyHandle> handles_;
48108
ColumnFamilyOptions column_family_options_;
49109
DBOptions db_options_;
50110
string dbname_;
51111
DB* db_;
112+
Env* env_;
52113
};
53114

54115
TEST(ColumnFamilyTest, AddDrop) {
@@ -74,6 +135,108 @@ TEST(ColumnFamilyTest, AddDrop) {
74135
ASSERT_TRUE(families == vector<string>({"default", "four", "one", "three"}));
75136
}
76137

138+
// Writes keys into three column families ("default", "one", "two") and checks
// that each key is readable only from the family it was written to, both
// before and after closing and reopening the DB.
TEST(ColumnFamilyTest, ReadWrite) {
139+
ASSERT_OK(Open({"default"}));
140+
CreateColumnFamilies({"one", "two"});
141+
Close();
142+
ASSERT_OK(Open({"default", "one", "two"}));
143+
// Indices 0/1/2 select entries of handles_ (see Put/Get).
ASSERT_OK(Put(0, "foo", "v1"));
144+
ASSERT_OK(Put(0, "bar", "v2"));
145+
ASSERT_OK(Put(1, "mirko", "v3"));
146+
// Second write to "foo"; the reads below expect this value ("v2").
ASSERT_OK(Put(0, "foo", "v2"));
147+
ASSERT_OK(Put(2, "fodor", "v5"));
148+
149+
// Four verification passes: the DB is closed and reopened after passes 0 and
// 1, so the data is checked before any reopen and after each of two reopens.
for (int iter = 0; iter <= 3; ++iter) {
150+
ASSERT_EQ("v2", Get(0, "foo"));
151+
ASSERT_EQ("v2", Get(0, "bar"));
152+
ASSERT_EQ("v3", Get(1, "mirko"));
153+
ASSERT_EQ("v5", Get(2, "fodor"));
154+
// Keys must not be visible through a different column family.
ASSERT_EQ("NOT_FOUND", Get(0, "fodor"));
155+
ASSERT_EQ("NOT_FOUND", Get(1, "fodor"));
156+
ASSERT_EQ("NOT_FOUND", Get(2, "foo"));
157+
if (iter <= 1) {
158+
// reopen
159+
Close();
160+
ASSERT_OK(Open({"default", "one", "two"}));
161+
}
162+
}
163+
Close();
164+
}
165+
166+
// Checks that log files which were already recovered are ignored when they
// show up again in the WAL directory: merges are written to three column
// families, the log files are backed up, and after a first successful reopen
// the backed-up logs are copied back before a second reopen. The expected
// values must be identical both times.
TEST(ColumnFamilyTest, IgnoreRecoveredLog) {
167+
string backup_logs = dbname_ + "/backup_logs";
168+
169+
// delete old files in backup_logs directory
// NOTE(review): the results of CreateDirIfMissing/GetChildren/DeleteFile are
// not checked here, unlike the ASSERT_OK-wrapped env_ calls in CopyFile.
170+
env_->CreateDirIfMissing(backup_logs);
171+
vector<string> old_files;
172+
env_->GetChildren(backup_logs, &old_files);
173+
for (auto& file : old_files) {
174+
if (file != "." && file != "..") {
175+
env_->DeleteFile(backup_logs + "/" + file);
176+
}
177+
}
178+
179+
// Presumably a merge operator that adds 64-bit integers (per its name); the
// expected values asserted below are consistent with summing the operands.
column_family_options_.merge_operator =
180+
MergeOperators::CreateUInt64AddOperator();
181+
db_options_.wal_dir = dbname_ + "/logs";
182+
Destroy();
183+
ASSERT_OK(Open({"default"}));
184+
CreateColumnFamilies({"cf1", "cf2"});
185+
186+
// fill up the DB
187+
string one, two, three;
188+
PutFixed64(&one, 1);
189+
PutFixed64(&two, 2);
190+
PutFixed64(&three, 3);
191+
ASSERT_OK(Merge(0, "foo", one));
192+
ASSERT_OK(Merge(1, "mirko", one));
193+
ASSERT_OK(Merge(0, "foo", one));
194+
ASSERT_OK(Merge(2, "bla", one));
195+
ASSERT_OK(Merge(2, "fodor", one));
196+
ASSERT_OK(Merge(0, "bar", one));
197+
ASSERT_OK(Merge(2, "bla", one));
198+
ASSERT_OK(Merge(1, "mirko", two));
199+
ASSERT_OK(Merge(1, "franjo", one));
200+
201+
// copy the logs to backup
// NOTE(review): the result of GetChildren is not checked.
202+
vector<string> logs;
203+
env_->GetChildren(db_options_.wal_dir, &logs);
204+
for (auto& log : logs) {
205+
if (log != ".." && log != ".") {
206+
CopyFile(db_options_.wal_dir + "/" + log, backup_logs + "/" + log);
207+
}
208+
}
209+
210+
// recover the DB
211+
Close();
212+
213+
// 1. check consistency
214+
// 2. copy the logs from backup back to WAL dir. if the recovery happened
215+
// again on the same log files, this would lead to incorrect results
216+
// due to applying merge operator twice
217+
// 3. check consistency
218+
for (int iter = 0; iter < 2; ++iter) {
219+
// assert consistency
220+
ASSERT_OK(Open({"default", "cf1", "cf2"}));
221+
ASSERT_EQ(two, Get(0, "foo"));
222+
ASSERT_EQ(one, Get(0, "bar"));
223+
ASSERT_EQ(three, Get(1, "mirko"));
224+
ASSERT_EQ(one, Get(1, "franjo"));
225+
ASSERT_EQ(one, Get(2, "fodor"));
226+
ASSERT_EQ(two, Get(2, "bla"));
227+
Close();
228+
229+
// Only after the first pass: restore the backed-up logs so the second Open()
// sees the same log files again.
if (iter == 0) {
230+
// copy the logs from backup back to wal dir
231+
for (auto& log : logs) {
232+
if (log != ".." && log != ".") {
233+
CopyFile(backup_logs + "/" + log, db_options_.wal_dir + "/" + log);
234+
}
235+
}
236+
}
237+
}
238+
}
239+
77240
} // namespace rocksdb
78241

79242
int main(int argc, char** argv) {

0 commit comments

Comments
 (0)