Skip to content

Commit 9674c11

Browse files
committed
Integrating Cuckoo Hash SST Table format into RocksDB
Summary: Contains the following changes: - Implementation of cuckoo_table_factory - Adding cuckoo table into AdaptiveTableFactory - Adding cuckoo_table_db_test, along the lines of plain_table_db_test - Minor fixes to Reader: When a key is found in the table, return the key found instead of the search key. - Minor fixes to Builder: Add table properties that are required by Version::UpdateTemporaryStats() during a Get operation. Don't define curr_node as a reference variable, as the memory locations may get reassigned during a tree.push_back operation, leading to invalid memory access. Test Plan: cuckoo_table_reader_test --enable_perf cuckoo_table_builder_test cuckoo_table_db_test make check all make valgrind_check make asan_check Reviewers: sdong, igor, yhchiang, ljin Reviewed By: ljin Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D21219
1 parent 37c6740 commit 9674c11

11 files changed

+455
-31
lines changed

Makefile

+6-2
Original file line number | Diff line number | Diff line change
@@ -117,9 +117,10 @@ TESTS = \
117117
thread_local_test \
118118
geodb_test \
119119
rate_limiter_test \
120-
cuckoo_table_builder_test \
121120
options_test \
122-
cuckoo_table_reader_test
121+
cuckoo_table_builder_test \
122+
cuckoo_table_reader_test \
123+
cuckoo_table_db_test
123124

124125
TOOLS = \
125126
sst_dump \
@@ -430,6 +431,9 @@ cuckoo_table_builder_test: table/cuckoo_table_builder_test.o $(LIBOBJECTS) $(TES
430431
cuckoo_table_reader_test: table/cuckoo_table_reader_test.o $(LIBOBJECTS) $(TESTHARNESS) $(BENCHHARNESS)
431432
$(CXX) table/cuckoo_table_reader_test.o $(LIBOBJECTS) $(TESTHARNESS) $(BENCHHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
432433

434+
cuckoo_table_db_test: db/cuckoo_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS)
435+
$(CXX) db/cuckoo_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
436+
433437
options_test: util/options_test.o $(LIBOBJECTS) $(TESTHARNESS)
434438
$(CXX) util/options_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
435439

db/cuckoo_table_db_test.cc

+291
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,291 @@
1+
// Copyright (c) 2014, Facebook, Inc. All rights reserved.
2+
// This source code is licensed under the BSD-style license found in the
3+
// LICENSE file in the root directory of this source tree. An additional grant
4+
// of patent rights can be found in the PATENTS file in the same directory.
5+
6+
#include "db/db_impl.h"
7+
#include "rocksdb/db.h"
8+
#include "rocksdb/env.h"
9+
#include "table/meta_blocks.h"
10+
#include "table/cuckoo_table_factory.h"
11+
#include "table/cuckoo_table_reader.h"
12+
#include "util/testharness.h"
13+
#include "util/testutil.h"
14+
15+
namespace rocksdb {
16+
17+
class CuckooTableDBTest {
18+
private:
19+
std::string dbname_;
20+
Env* env_;
21+
DB* db_;
22+
23+
public:
24+
CuckooTableDBTest() : env_(Env::Default()) {
25+
dbname_ = test::TmpDir() + "/cuckoo_table_db_test";
26+
ASSERT_OK(DestroyDB(dbname_, Options()));
27+
db_ = nullptr;
28+
Reopen();
29+
}
30+
31+
~CuckooTableDBTest() {
32+
delete db_;
33+
ASSERT_OK(DestroyDB(dbname_, Options()));
34+
}
35+
36+
Options CurrentOptions() {
37+
Options options;
38+
options.table_factory.reset(NewCuckooTableFactory());
39+
options.memtable_factory.reset(NewHashLinkListRepFactory(4, 0, 3, true));
40+
options.allow_mmap_reads = true;
41+
options.create_if_missing = true;
42+
options.max_mem_compaction_level = 0;
43+
return options;
44+
}
45+
46+
DBImpl* dbfull() {
47+
return reinterpret_cast<DBImpl*>(db_);
48+
}
49+
50+
// The following util methods are copied from plain_table_db_test.
51+
void Reopen(Options* options = nullptr) {
52+
delete db_;
53+
db_ = nullptr;
54+
Options opts;
55+
if (options != nullptr) {
56+
opts = *options;
57+
} else {
58+
opts = CurrentOptions();
59+
opts.create_if_missing = true;
60+
}
61+
ASSERT_OK(DB::Open(opts, dbname_, &db_));
62+
}
63+
64+
Status Put(const Slice& k, const Slice& v) {
65+
return db_->Put(WriteOptions(), k, v);
66+
}
67+
68+
Status Delete(const std::string& k) {
69+
return db_->Delete(WriteOptions(), k);
70+
}
71+
72+
std::string Get(const std::string& k) {
73+
ReadOptions options;
74+
std::string result;
75+
Status s = db_->Get(options, k, &result);
76+
if (s.IsNotFound()) {
77+
result = "NOT_FOUND";
78+
} else if (!s.ok()) {
79+
result = s.ToString();
80+
}
81+
return result;
82+
}
83+
84+
int NumTableFilesAtLevel(int level) {
85+
std::string property;
86+
ASSERT_TRUE(
87+
db_->GetProperty("rocksdb.num-files-at-level" + NumberToString(level),
88+
&property));
89+
return atoi(property.c_str());
90+
}
91+
92+
// Return spread of files per level
93+
std::string FilesPerLevel() {
94+
std::string result;
95+
int last_non_zero_offset = 0;
96+
for (int level = 0; level < db_->NumberLevels(); level++) {
97+
int f = NumTableFilesAtLevel(level);
98+
char buf[100];
99+
snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
100+
result += buf;
101+
if (f > 0) {
102+
last_non_zero_offset = result.size();
103+
}
104+
}
105+
result.resize(last_non_zero_offset);
106+
return result;
107+
}
108+
};
109+
110+
TEST(CuckooTableDBTest, Flush) {
111+
// Try with empty DB first.
112+
ASSERT_TRUE(dbfull() != nullptr);
113+
ASSERT_EQ("NOT_FOUND", Get("key2"));
114+
115+
// Add some values to db.
116+
Options options = CurrentOptions();
117+
Reopen(&options);
118+
119+
ASSERT_OK(Put("key1", "v1"));
120+
ASSERT_OK(Put("key2", "v2"));
121+
ASSERT_OK(Put("key3", "v3"));
122+
dbfull()->TEST_FlushMemTable();
123+
124+
TablePropertiesCollection ptc;
125+
reinterpret_cast<DB*>(dbfull())->GetPropertiesOfAllTables(&ptc);
126+
ASSERT_EQ(1U, ptc.size());
127+
ASSERT_EQ(3, ptc.begin()->second->num_entries);
128+
ASSERT_EQ("1", FilesPerLevel());
129+
130+
ASSERT_EQ("v1", Get("key1"));
131+
ASSERT_EQ("v2", Get("key2"));
132+
ASSERT_EQ("v3", Get("key3"));
133+
ASSERT_EQ("NOT_FOUND", Get("key4"));
134+
ASSERT_EQ("Invalid argument: Length of key is invalid.", Get("somelongkey"));
135+
ASSERT_EQ("Invalid argument: Length of key is invalid.", Get("s"));
136+
137+
// Now add more keys and flush.
138+
ASSERT_OK(Put("key4", "v4"));
139+
ASSERT_OK(Put("key5", "v5"));
140+
ASSERT_OK(Put("key6", "v6"));
141+
dbfull()->TEST_FlushMemTable();
142+
143+
reinterpret_cast<DB*>(dbfull())->GetPropertiesOfAllTables(&ptc);
144+
ASSERT_EQ(2U, ptc.size());
145+
auto row = ptc.begin();
146+
ASSERT_EQ(3, row->second->num_entries);
147+
ASSERT_EQ(3, (++row)->second->num_entries);
148+
ASSERT_EQ("2", FilesPerLevel());
149+
ASSERT_EQ("v1", Get("key1"));
150+
ASSERT_EQ("v2", Get("key2"));
151+
ASSERT_EQ("v3", Get("key3"));
152+
ASSERT_EQ("v4", Get("key4"));
153+
ASSERT_EQ("v5", Get("key5"));
154+
ASSERT_EQ("v6", Get("key6"));
155+
156+
ASSERT_OK(Delete("key6"));
157+
ASSERT_OK(Delete("key5"));
158+
ASSERT_OK(Delete("key4"));
159+
dbfull()->TEST_FlushMemTable();
160+
reinterpret_cast<DB*>(dbfull())->GetPropertiesOfAllTables(&ptc);
161+
ASSERT_EQ(3U, ptc.size());
162+
row = ptc.begin();
163+
ASSERT_EQ(3, row->second->num_entries);
164+
ASSERT_EQ(3, (++row)->second->num_entries);
165+
ASSERT_EQ(3, (++row)->second->num_entries);
166+
ASSERT_EQ("3", FilesPerLevel());
167+
ASSERT_EQ("v1", Get("key1"));
168+
ASSERT_EQ("v2", Get("key2"));
169+
ASSERT_EQ("v3", Get("key3"));
170+
ASSERT_EQ("NOT_FOUND", Get("key4"));
171+
ASSERT_EQ("NOT_FOUND", Get("key5"));
172+
ASSERT_EQ("NOT_FOUND", Get("key6"));
173+
}
174+
175+
TEST(CuckooTableDBTest, FlushWithDuplicateKeys) {
176+
Options options = CurrentOptions();
177+
Reopen(&options);
178+
ASSERT_OK(Put("key1", "v1"));
179+
ASSERT_OK(Put("key2", "v2"));
180+
ASSERT_OK(Put("key1", "v3")); // Duplicate
181+
dbfull()->TEST_FlushMemTable();
182+
183+
TablePropertiesCollection ptc;
184+
reinterpret_cast<DB*>(dbfull())->GetPropertiesOfAllTables(&ptc);
185+
ASSERT_EQ(1U, ptc.size());
186+
ASSERT_EQ(2, ptc.begin()->second->num_entries);
187+
ASSERT_EQ("1", FilesPerLevel());
188+
ASSERT_EQ("v3", Get("key1"));
189+
ASSERT_EQ("v2", Get("key2"));
190+
}
191+
192+
namespace {
193+
static std::string Key(int i) {
194+
char buf[100];
195+
snprintf(buf, sizeof(buf), "key_______%06d", i);
196+
return std::string(buf);
197+
}
198+
}
199+
200+
TEST(CuckooTableDBTest, CompactionTrigger) {
201+
Options options = CurrentOptions();
202+
options.write_buffer_size = 100 << 10; // 100KB
203+
options.level0_file_num_compaction_trigger = 2;
204+
Reopen(&options);
205+
206+
// Write 11 values, each 10016 B
207+
for (int idx = 0; idx < 11; ++idx) {
208+
ASSERT_OK(Put(Key(idx), std::string(10000, 'a' + idx)));
209+
}
210+
dbfull()->TEST_WaitForFlushMemTable();
211+
ASSERT_EQ("1", FilesPerLevel());
212+
213+
// Generate one more file in level-0, and should trigger level-0 compaction
214+
for (int idx = 11; idx < 22; ++idx) {
215+
ASSERT_OK(Put(Key(idx), std::string(10000, 'a' + idx)));
216+
}
217+
dbfull()->TEST_WaitForFlushMemTable();
218+
dbfull()->TEST_CompactRange(0, nullptr, nullptr);
219+
220+
ASSERT_EQ("0,2", FilesPerLevel());
221+
for (int idx = 0; idx < 22; ++idx) {
222+
ASSERT_EQ(std::string(10000, 'a' + idx), Get(Key(idx)));
223+
}
224+
}
225+
226+
TEST(CuckooTableDBTest, SameKeyInsertedInTwoDifferentFilesAndCompacted) {
227+
// Insert same key twice so that they go to different SST files. Then wait for
228+
// compaction and check if the latest value is stored and old value removed.
229+
Options options = CurrentOptions();
230+
options.write_buffer_size = 100 << 10; // 100KB
231+
options.level0_file_num_compaction_trigger = 2;
232+
Reopen(&options);
233+
234+
// Write 11 values, each 10016 B
235+
for (int idx = 0; idx < 11; ++idx) {
236+
ASSERT_OK(Put(Key(idx), std::string(10000, 'a')));
237+
}
238+
dbfull()->TEST_WaitForFlushMemTable();
239+
ASSERT_EQ("1", FilesPerLevel());
240+
241+
// Generate one more file in level-0, and should trigger level-0 compaction
242+
for (int idx = 0; idx < 11; ++idx) {
243+
ASSERT_OK(Put(Key(idx), std::string(10000, 'a' + idx)));
244+
}
245+
dbfull()->TEST_WaitForFlushMemTable();
246+
dbfull()->TEST_CompactRange(0, nullptr, nullptr);
247+
248+
ASSERT_EQ("0,1", FilesPerLevel());
249+
for (int idx = 0; idx < 11; ++idx) {
250+
ASSERT_EQ(std::string(10000, 'a' + idx), Get(Key(idx)));
251+
}
252+
}
253+
254+
TEST(CuckooTableDBTest, AdaptiveTable) {
255+
Options options = CurrentOptions();
256+
257+
// Write some keys using cuckoo table.
258+
options.table_factory.reset(NewCuckooTableFactory());
259+
Reopen(&options);
260+
261+
ASSERT_OK(Put("key1", "v1"));
262+
ASSERT_OK(Put("key2", "v2"));
263+
ASSERT_OK(Put("key3", "v3"));
264+
dbfull()->TEST_FlushMemTable();
265+
266+
// Write some keys using plain table.
267+
options.create_if_missing = false;
268+
options.table_factory.reset(NewPlainTableFactory());
269+
Reopen(&options);
270+
ASSERT_OK(Put("key4", "v4"));
271+
ASSERT_OK(Put("key1", "v5"));
272+
dbfull()->TEST_FlushMemTable();
273+
274+
// Write some keys using block based table.
275+
std::shared_ptr<TableFactory> block_based_factory(
276+
NewBlockBasedTableFactory());
277+
options.table_factory.reset(NewAdaptiveTableFactory(block_based_factory));
278+
Reopen(&options);
279+
ASSERT_OK(Put("key5", "v6"));
280+
ASSERT_OK(Put("key2", "v7"));
281+
dbfull()->TEST_FlushMemTable();
282+
283+
ASSERT_EQ("v5", Get("key1"));
284+
ASSERT_EQ("v7", Get("key2"));
285+
ASSERT_EQ("v3", Get("key3"));
286+
ASSERT_EQ("v4", Get("key4"));
287+
ASSERT_EQ("v6", Get("key5"));
288+
}
289+
} // namespace rocksdb
290+
291+
int main(int argc, char** argv) { return rocksdb::test::RunAllTests(); }

include/rocksdb/table.h

+5-1
Original file line number | Diff line number | Diff line change
@@ -192,6 +192,9 @@ struct CuckooTablePropertyNames {
192192
static const std::string kIsLastLevel;
193193
};
194194

195+
extern TableFactory* NewCuckooTableFactory(double hash_table_ratio = 0.9,
196+
uint32_t max_search_depth = 100);
197+
195198
#endif // ROCKSDB_LITE
196199

197200
// A base class for table factories.
@@ -263,7 +266,8 @@ class TableFactory {
263266
extern TableFactory* NewAdaptiveTableFactory(
264267
std::shared_ptr<TableFactory> table_factory_to_write = nullptr,
265268
std::shared_ptr<TableFactory> block_based_table_factory = nullptr,
266-
std::shared_ptr<TableFactory> plain_table_factory = nullptr);
269+
std::shared_ptr<TableFactory> plain_table_factory = nullptr,
270+
std::shared_ptr<TableFactory> cuckoo_table_factory = nullptr);
267271

268272
#endif // ROCKSDB_LITE
269273

table/adaptive_table_factory.cc

+15-5
Original file line number | Diff line number | Diff line change
@@ -12,10 +12,12 @@ namespace rocksdb {
1212
AdaptiveTableFactory::AdaptiveTableFactory(
1313
std::shared_ptr<TableFactory> table_factory_to_write,
1414
std::shared_ptr<TableFactory> block_based_table_factory,
15-
std::shared_ptr<TableFactory> plain_table_factory)
15+
std::shared_ptr<TableFactory> plain_table_factory,
16+
std::shared_ptr<TableFactory> cuckoo_table_factory)
1617
: table_factory_to_write_(table_factory_to_write),
1718
block_based_table_factory_(block_based_table_factory),
18-
plain_table_factory_(plain_table_factory) {
19+
plain_table_factory_(plain_table_factory),
20+
cuckoo_table_factory_(cuckoo_table_factory) {
1921
if (!table_factory_to_write_) {
2022
table_factory_to_write_ = block_based_table_factory_;
2123
}
@@ -25,12 +27,16 @@ AdaptiveTableFactory::AdaptiveTableFactory(
2527
if (!block_based_table_factory_) {
2628
block_based_table_factory_.reset(NewBlockBasedTableFactory());
2729
}
30+
if (!cuckoo_table_factory_) {
31+
cuckoo_table_factory_.reset(NewCuckooTableFactory());
32+
}
2833
}
2934

3035
extern const uint64_t kPlainTableMagicNumber;
3136
extern const uint64_t kLegacyPlainTableMagicNumber;
3237
extern const uint64_t kBlockBasedTableMagicNumber;
3338
extern const uint64_t kLegacyBlockBasedTableMagicNumber;
39+
extern const uint64_t kCuckooTableMagicNumber;
3440

3541
Status AdaptiveTableFactory::NewTableReader(
3642
const Options& options, const EnvOptions& soptions,
@@ -49,6 +55,9 @@ Status AdaptiveTableFactory::NewTableReader(
4955
footer.table_magic_number() == kLegacyBlockBasedTableMagicNumber) {
5056
return block_based_table_factory_->NewTableReader(
5157
options, soptions, icomp, std::move(file), file_size, table);
58+
} else if (footer.table_magic_number() == kCuckooTableMagicNumber) {
59+
return cuckoo_table_factory_->NewTableReader(
60+
options, soptions, icomp, std::move(file), file_size, table);
5261
} else {
5362
return Status::NotSupported("Unidentified table format");
5463
}
@@ -64,9 +73,10 @@ TableBuilder* AdaptiveTableFactory::NewTableBuilder(
6473
extern TableFactory* NewAdaptiveTableFactory(
6574
std::shared_ptr<TableFactory> table_factory_to_write,
6675
std::shared_ptr<TableFactory> block_based_table_factory,
67-
std::shared_ptr<TableFactory> plain_table_factory) {
68-
return new AdaptiveTableFactory(
69-
table_factory_to_write, block_based_table_factory, plain_table_factory);
76+
std::shared_ptr<TableFactory> plain_table_factory,
77+
std::shared_ptr<TableFactory> cuckoo_table_factory) {
78+
return new AdaptiveTableFactory(table_factory_to_write,
79+
block_based_table_factory, plain_table_factory, cuckoo_table_factory);
7080
}
7181

7282
} // namespace rocksdb

0 commit comments

Comments
 (0)