Skip to content

Commit abaf262

Browse files
committed
[RocksDB] [Performance Branch] Some Changes to PlainTable format
Summary: Some changes to the PlainTable format: (1) support variable key length; (2) use a user-defined slice transformer to extract prefixes; (3) run some test cases against PlainTable in db_test and table_test. Test Plan: test db_test. Reviewers: haobo, kailiu. CC: dhruba, igor, leveldb, nkg-. Differential Revision: https://reviews.facebook.net/D14457
1 parent 28c24de commit abaf262

15 files changed

+716
-358
lines changed

db/db_test.cc

+52-19
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
#include "rocksdb/env.h"
2424
#include "rocksdb/table.h"
2525
#include "rocksdb/perf_context.h"
26+
#include "rocksdb/plain_table_factory.h"
2627
#include "util/hash.h"
2728
#include "util/logging.h"
2829
#include "util/mutexlock.h"
@@ -244,6 +245,8 @@ class DBTest {
244245
// Sequence of option configurations to try
245246
enum OptionConfig {
246247
kDefault,
248+
kPlainTableFirstBytePrefix,
249+
kPlainTableAllBytesPrefix,
247250
kVectorRep,
248251
kMergePut,
249252
kFilter,
@@ -275,7 +278,8 @@ class DBTest {
275278
kNoSkip = 0,
276279
kSkipDeletesFilterFirst = 1,
277280
kSkipUniversalCompaction = 2,
278-
kSkipMergePut = 4
281+
kSkipMergePut = 4,
282+
kSkipPlainTable = 8
279283
};
280284

281285
DBTest() : option_config_(kDefault),
@@ -297,20 +301,27 @@ class DBTest {
297301
// Switch to a fresh database with the next option configuration to
298302
// test. Return false if there are no more configurations to test.
299303
bool ChangeOptions(int skip_mask = kNoSkip) {
300-
option_config_++;
301-
302304
// skip some options
303-
if (skip_mask & kSkipDeletesFilterFirst &&
304-
option_config_ == kDeletesFilterFirst) {
305-
option_config_++;
306-
}
307-
if (skip_mask & kSkipUniversalCompaction &&
308-
option_config_ == kUniversalCompaction) {
309-
option_config_++;
310-
}
311-
if (skip_mask & kSkipMergePut && option_config_ == kMergePut) {
312-
option_config_++;
305+
for(option_config_++; option_config_ < kEnd; option_config_++) {
306+
if ((skip_mask & kSkipDeletesFilterFirst) &&
307+
option_config_ == kDeletesFilterFirst) {
308+
continue;
309+
}
310+
if ((skip_mask & kSkipUniversalCompaction) &&
311+
option_config_ == kUniversalCompaction) {
312+
continue;
313+
}
314+
if ((skip_mask & kSkipMergePut) && option_config_ == kMergePut) {
315+
continue;
316+
}
317+
if ((skip_mask & kSkipPlainTable)
318+
&& (option_config_ == kPlainTableAllBytesPrefix
319+
|| option_config_ == kPlainTableFirstBytePrefix)) {
320+
continue;
321+
}
322+
break;
313323
}
324+
314325
if (option_config_ >= kEnd) {
315326
Destroy(&last_options_);
316327
return false;
@@ -343,6 +354,18 @@ class DBTest {
343354
options.memtable_factory.reset(
344355
NewHashSkipListRepFactory(NewFixedPrefixTransform(1)));
345356
break;
357+
case kPlainTableFirstBytePrefix:
358+
options.table_factory.reset(new PlainTableFactory());
359+
options.prefix_extractor = NewFixedPrefixTransform(1);
360+
options.allow_mmap_reads = true;
361+
options.max_sequential_skip_in_iterations = 999999;
362+
break;
363+
case kPlainTableAllBytesPrefix:
364+
options.table_factory.reset(new PlainTableFactory());
365+
options.prefix_extractor = NewNoopTransform();
366+
options.allow_mmap_reads = true;
367+
options.max_sequential_skip_in_iterations = 999999;
368+
break;
346369
case kMergePut:
347370
options.merge_operator = MergeOperators::CreatePutOperator();
348371
break;
@@ -1009,7 +1032,10 @@ TEST(DBTest, KeyMayExist) {
10091032
options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD));
10101033

10111034
delete options.filter_policy;
1012-
} while (ChangeOptions());
1035+
1036+
// The KeyMayExist function only checks data in block caches, which are not used
1037+
// by plain table format.
1038+
} while (ChangeOptions(kSkipPlainTable));
10131039
}
10141040

10151041
TEST(DBTest, NonBlockingIteration) {
@@ -1073,7 +1099,9 @@ TEST(DBTest, NonBlockingIteration) {
10731099
options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD));
10741100
delete iter;
10751101

1076-
} while (ChangeOptions());
1102+
// This test verifies block cache behavior, which is not used by plain
1103+
// table format.
1104+
} while (ChangeOptions(kSkipPlainTable));
10771105
}
10781106

10791107
// A delete is skipped for key if KeyMayExist(key) returns False
@@ -2932,7 +2960,8 @@ TEST(DBTest, ApproximateSizes) {
29322960
ASSERT_EQ(NumTableFilesAtLevel(0), 0);
29332961
ASSERT_GT(NumTableFilesAtLevel(1), 0);
29342962
}
2935-
} while (ChangeOptions(kSkipUniversalCompaction));
2963+
// ApproximateOffsetOf() is not yet implemented in plain table format.
2964+
} while (ChangeOptions(kSkipUniversalCompaction | kSkipPlainTable));
29362965
}
29372966

29382967
TEST(DBTest, ApproximateSizes_MixOfSmallAndLarge) {
@@ -2970,7 +2999,8 @@ TEST(DBTest, ApproximateSizes_MixOfSmallAndLarge) {
29702999

29713000
dbfull()->TEST_CompactRange(0, nullptr, nullptr);
29723001
}
2973-
} while (ChangeOptions());
3002+
// ApproximateOffsetOf() is not yet implemented in plain table format.
3003+
} while (ChangeOptions(kSkipPlainTable));
29743004
}
29753005

29763006
TEST(DBTest, IteratorPinsRef) {
@@ -3054,7 +3084,9 @@ TEST(DBTest, HiddenValuesAreRemoved) {
30543084
ASSERT_EQ(AllEntriesFor("foo"), "[ tiny ]");
30553085

30563086
ASSERT_TRUE(Between(Size("", "pastfoo"), 0, 1000));
3057-
} while (ChangeOptions(kSkipUniversalCompaction));
3087+
// ApproximateOffsetOf() is not yet implemented in plain table format,
3088+
// which is used by Size().
3089+
} while (ChangeOptions(kSkipUniversalCompaction | kSkipPlainTable));
30583090
}
30593091

30603092
TEST(DBTest, CompactBetweenSnapshots) {
@@ -4626,7 +4658,8 @@ TEST(DBTest, Randomized) {
46264658
// TODO(sanjay): Test Get() works
46274659
int p = rnd.Uniform(100);
46284660
int minimum = 0;
4629-
if (option_config_ == kHashSkipList) {
4661+
if (option_config_ == kHashSkipList ||
4662+
option_config_ == kPlainTableFirstBytePrefix) {
46304663
minimum = 1;
46314664
}
46324665
if (p < 45) { // Put

db/plain_table_db_test.cc

+2-1
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,8 @@ class PlainTableDBTest {
5959
// Return the current option configuration.
6060
Options CurrentOptions() {
6161
Options options;
62-
options.table_factory.reset(new PlainTableFactory(16, 8, 2, 0.8));
62+
options.table_factory.reset(new PlainTableFactory(16, 2, 0.8));
63+
options.prefix_extractor = NewFixedPrefixTransform(8);
6364
options.allow_mmap_reads = true;
6465
return options;
6566
}

include/rocksdb/plain_table_factory.h

+21-24
Original file line numberDiff line numberDiff line change
@@ -23,41 +23,37 @@ class TableBuilder;
2323

2424
// IndexedTable requires fixed length key, configured as a constructor
2525
// parameter of the factory class. Output file format:
26-
// +-------------+
27-
// | version |
28-
// +-------------+------------------------------+ <= key1 offset
29-
// | key1 | value_size (4 bytes) | |
30-
// +----------------------------------------+ |
26+
// +-------------+-----------------+
27+
// | version | user_key_length |
28+
// +------------++------------------------------+ <= key1 offset
29+
// | [key_size] | key1 | value_size | |
30+
// +------------+-------------+-------------+ |
3131
// | value1 |
3232
// | |
3333
// +----------------------------------------+---+ <= key2 offset
34-
// | key2 | value_size (4 bytes) | |
35-
// +----------------------------------------+ |
34+
// | [key_size] | key2 | value_size | |
35+
// +------------+-------------+-------------+ |
3636
// | value2 |
3737
// | |
3838
// | ...... |
39-
// +-----------------+--------------------------+ <= index_block_offset
40-
// | key1 | key1 offset (8 bytes) |
4139
// +-----------------+--------------------------+
42-
// | key2 | key2 offset (8 bytes) |
43-
// +-----------------+--------------------------+
44-
// | key3 | key3 offset (8 bytes) |
45-
// +-----------------+--------------------------+
46-
// | ...... |
47-
// +-----------------+------------+-------------+
40+
// If user_key_length = kVariableLength, the key is variable length, and
41+
// there will be an extra field for key size encoded before every key.
4842
class PlainTableFactory: public TableFactory {
4943
public:
5044
~PlainTableFactory() {
5145
}
52-
// user_key_size is the length of the user key. key_prefix_len is the
53-
// length of the prefix used for in-memory indexes. bloom_num_bits is
46+
// user_key_len is the length of the user key. If it is set to be
47+
// kVariableLength, then it means variable length. Otherwise, all the
48+
// keys need to have the fixed length of this value. bloom_num_bits is
5449
// number of bits used for bloom filter per key. hash_table_ratio is
55-
// the desired ultilization of the hash table used for prefix hashing.
50+
// the desired utilization of the hash table used for prefix hashing.
5651
// hash_table_ratio = number of prefixes / #buckets in the hash table
57-
PlainTableFactory(int user_key_size, int key_prefix_len,
58-
int bloom_num_bits = 0, double hash_table_ratio = 0.75) :
59-
user_key_size_(user_key_size), key_prefix_len_(key_prefix_len),
60-
bloom_num_bits_(bloom_num_bits), hash_table_ratio_(hash_table_ratio) {
52+
explicit PlainTableFactory(uint32_t user_key_len = kVariableLength,
53+
int bloom_num_bits = 0,
54+
double hash_table_ratio = 0.75) :
55+
user_key_len_(user_key_len), bloom_num_bits_(bloom_num_bits),
56+
hash_table_ratio_(hash_table_ratio) {
6157
}
6258
const char* Name() const override {
6359
return "PlainTable";
@@ -70,9 +66,10 @@ class PlainTableFactory: public TableFactory {
7066
TableBuilder* GetTableBuilder(const Options& options, WritableFile* file,
7167
CompressionType compression_type) const
7268
override;
69+
70+
static const uint32_t kVariableLength = 0;
7371
private:
74-
int user_key_size_;
75-
int key_prefix_len_;
72+
uint32_t user_key_len_;
7673
int bloom_num_bits_;
7774
double hash_table_ratio_;
7875
};

include/rocksdb/table_properties.h

+6
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,10 @@ struct TableProperties {
3737
uint64_t num_data_blocks = 0;
3838
// the number of entries in this table
3939
uint64_t num_entries = 0;
40+
// format version, reserved for backward compatibility
41+
uint64_t format_version = 0;
42+
// If 0, key is variable length. Otherwise number of bytes for each key.
43+
uint64_t fixed_key_len = 0;
4044

4145
// The name of the filter policy used in this table.
4246
// If no filter policy is used, `filter_policy_name` will be an empty string.
@@ -61,6 +65,8 @@ struct TablePropertiesNames {
6165
static const std::string kRawValueSize;
6266
static const std::string kNumDataBlocks;
6367
static const std::string kNumEntries;
68+
static const std::string kFormatVersion;
69+
static const std::string kFixedKeyLen;
6470
static const std::string kFilterPolicy;
6571
};
6672

table/meta_blocks.cc

+4
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,8 @@ void PropertyBlockBuilder::AddTableProperty(const TableProperties& props) {
6767
Add(TablePropertiesNames::kNumEntries, props.num_entries);
6868
Add(TablePropertiesNames::kNumDataBlocks, props.num_data_blocks);
6969
Add(TablePropertiesNames::kFilterSize, props.filter_size);
70+
Add(TablePropertiesNames::kFormatVersion, props.format_version);
71+
Add(TablePropertiesNames::kFixedKeyLen, props.fixed_key_len);
7072

7173
if (!props.filter_policy_name.empty()) {
7274
Add(TablePropertiesNames::kFilterPolicy,
@@ -175,6 +177,8 @@ Status ReadProperties(
175177
{ TablePropertiesNames::kNumDataBlocks,
176178
&table_properties->num_data_blocks },
177179
{ TablePropertiesNames::kNumEntries, &table_properties->num_entries },
180+
{ TablePropertiesNames::kFormatVersion, &table_properties->format_version },
181+
{ TablePropertiesNames::kFixedKeyLen, &table_properties->fixed_key_len },
178182
};
179183

180184
std::string last_key;

table/plain_table_builder.cc

+23-14
Original file line numberDiff line numberDiff line change
@@ -50,38 +50,47 @@ extern const uint64_t kPlainTableMagicNumber = 0x4f3418eb7a8f13b8ull;
5050

5151
PlainTableBuilder::PlainTableBuilder(const Options& options,
5252
WritableFile* file,
53-
int user_key_size, int key_prefix_len) :
54-
options_(options), file_(file), user_key_size_(user_key_size) {
55-
std::string version;
56-
PutFixed32(&version, 1 | 0x80000000);
57-
file_->Append(Slice(version));
58-
offset_ = 4;
53+
uint32_t user_key_len) :
54+
options_(options), file_(file), user_key_len_(user_key_len) {
55+
properties_.fixed_key_len = user_key_len;
5956

6057
// for plain table, we put all the data in a big chuck.
6158
properties_.num_data_blocks = 1;
6259
// emphasize that currently plain table doesn't have persistent index or
6360
// filter block.
6461
properties_.index_size = 0;
6562
properties_.filter_size = 0;
63+
properties_.format_version = 0;
6664
}
6765

6866
PlainTableBuilder::~PlainTableBuilder() {
6967
}
7068

7169
void PlainTableBuilder::Add(const Slice& key, const Slice& value) {
72-
assert((int) key.size() == GetInternalKeyLength());
70+
assert(user_key_len_ == 0 || key.size() == user_key_len_ + 8);
71+
72+
if (!IsFixedLength()) {
73+
// Write key length
74+
int key_size = key.size();
75+
key_size_str_.clear();
76+
PutVarint32(&key_size_str_, key_size);
77+
file_->Append(key_size_str_);
78+
offset_ += key_size_str_.length();
79+
}
7380

74-
// Write key-value pair
81+
// Write key
7582
file_->Append(key);
76-
offset_ += GetInternalKeyLength();
83+
offset_ += key.size();
7784

78-
std::string size;
85+
// Write value length
86+
value_size_str_.clear();
7987
int value_size = value.size();
80-
PutVarint32(&size, value_size);
81-
Slice sizeSlice(size);
82-
file_->Append(sizeSlice);
88+
PutVarint32(&value_size_str_, value_size);
89+
file_->Append(value_size_str_);
90+
91+
// Write value
8392
file_->Append(value);
84-
offset_ += value_size + size.length();
93+
offset_ += value_size + value_size_str_.length();
8594

8695
properties_.num_entries++;
8796
properties_.raw_key_size += key.size();

table/plain_table_builder.h

+7-4
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ class PlainTableBuilder: public TableBuilder {
2727
// will be part of level specified by 'level'. A value of -1 means
2828
// that the caller does not know which level the output file will reside.
2929
PlainTableBuilder(const Options& options, WritableFile* file,
30-
int user_key_size, int key_prefix_len);
30+
uint32_t user_key_size);
3131

3232
// REQUIRES: Either Finish() or Abandon() has been called.
3333
~PlainTableBuilder();
@@ -66,11 +66,14 @@ class PlainTableBuilder: public TableBuilder {
6666
Status status_;
6767
TableProperties properties_;
6868

69-
const size_t user_key_size_;
69+
const size_t user_key_len_;
7070
bool closed_ = false; // Either Finish() or Abandon() has been called.
7171

72-
int GetInternalKeyLength() {
73-
return user_key_size_ + 8;
72+
std::string key_size_str_;
73+
std::string value_size_str_;
74+
75+
bool IsFixedLength() const {
76+
return user_key_len_ > 0;
7477
}
7578

7679
// No copying allowed

table/plain_table_factory.cc

+2-3
Original file line numberDiff line numberDiff line change
@@ -19,13 +19,12 @@ Status PlainTableFactory::GetTableReader(const Options& options,
1919
unique_ptr<TableReader>* table)
2020
const {
2121
return PlainTableReader::Open(options, soptions, std::move(file), file_size,
22-
table, user_key_size_, key_prefix_len_,
23-
bloom_num_bits_, hash_table_ratio_);
22+
table, bloom_num_bits_, hash_table_ratio_);
2423
}
2524

2625
TableBuilder* PlainTableFactory::GetTableBuilder(
2726
const Options& options, WritableFile* file,
2827
CompressionType compression_type) const {
29-
return new PlainTableBuilder(options, file, user_key_size_, key_prefix_len_);
28+
return new PlainTableBuilder(options, file, user_key_len_);
3029
}
3130
} // namespace rocksdb

0 commit comments

Comments
 (0)