Skip to content

Commit 4a7c747

Browse files
committed
Revert "Revert "Allow allocating dynamic bloom, plain table indexes and hash linked list from huge page TLB""
And make the default 0 for hash linked list memtable. This reverts commit d69dc64.
1 parent d56959a commit 4a7c747

18 files changed

+516
-374
lines changed

db/db_test.cc

+1-1
Original file line numberDiff line numberDiff line change
@@ -481,7 +481,7 @@ class DBTest {
481481
break;
482482
case kHashLinkList:
483483
options.prefix_extractor.reset(NewFixedPrefixTransform(1));
484-
options.memtable_factory.reset(NewHashLinkListRepFactory(4));
484+
options.memtable_factory.reset(NewHashLinkListRepFactory(4, 0));
485485
break;
486486
case kHashCuckoo:
487487
options.memtable_factory.reset(

db/memtable.cc

+4-3
Original file line numberDiff line numberDiff line change
@@ -52,9 +52,10 @@ MemTable::MemTable(const InternalKeyComparator& cmp, const Options& options)
5252
// gone wrong already.
5353
assert(!should_flush_);
5454
if (prefix_extractor_ && options.memtable_prefix_bloom_bits > 0) {
55-
prefix_bloom_.reset(new DynamicBloom(options.memtable_prefix_bloom_bits,
56-
options.bloom_locality,
57-
options.memtable_prefix_bloom_probes));
55+
prefix_bloom_.reset(new DynamicBloom(
56+
options.memtable_prefix_bloom_bits, options.bloom_locality,
57+
options.memtable_prefix_bloom_probes, nullptr,
58+
options.memtable_prefix_bloom_huge_page_tlb_size));
5859
}
5960
}
6061

db/plain_table_db_test.cc

+325-306
Large diffs are not rendered by default.

db/prefix_test.cc

+8
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ DEFINE_int64(min_write_buffer_number_to_merge, 1, "");
3030
DEFINE_int32(skiplist_height, 4, "");
3131
DEFINE_int32(memtable_prefix_bloom_bits, 10000000, "");
3232
DEFINE_int32(memtable_prefix_bloom_probes, 10, "");
33+
DEFINE_int32(memtable_prefix_bloom_huge_page_tlb_size, 2 * 1024 * 1024, "");
3334
DEFINE_int32(value_size, 40, "");
3435

3536
// Path to the database on file system
@@ -148,6 +149,8 @@ class PrefixTest {
148149

149150
options.memtable_prefix_bloom_bits = FLAGS_memtable_prefix_bloom_bits;
150151
options.memtable_prefix_bloom_probes = FLAGS_memtable_prefix_bloom_probes;
152+
options.memtable_prefix_bloom_huge_page_tlb_size =
153+
FLAGS_memtable_prefix_bloom_huge_page_tlb_size;
151154

152155
Status s = DB::Open(options, kDbName, &db);
153156
ASSERT_OK(s);
@@ -172,6 +175,10 @@ class PrefixTest {
172175
options.memtable_factory.reset(
173176
NewHashLinkListRepFactory(bucket_count));
174177
return true;
178+
case kHashLinkListHugePageTlb:
179+
options.memtable_factory.reset(
180+
NewHashLinkListRepFactory(bucket_count, 2 * 1024 * 1024));
181+
return true;
175182
default:
176183
return false;
177184
}
@@ -190,6 +197,7 @@ class PrefixTest {
190197
kBegin,
191198
kHashSkipList,
192199
kHashLinkList,
200+
kHashLinkListHugePageTlb,
193201
kEnd
194202
};
195203
int option_config_;

include/rocksdb/memtablerep.h

+7-2
Original file line numberDiff line numberDiff line change
@@ -223,9 +223,14 @@ extern MemTableRepFactory* NewHashSkipListRepFactory(
223223
// The factory is to create memtables with a hashed linked list:
224224
// it contains a fixed array of buckets, each pointing to a sorted single
225225
// linked list (null if the bucket is empty).
226-
// bucket_count: number of fixed array buckets
226+
// @bucket_count: number of fixed array buckets
227+
// @huge_page_tlb_size: if <=0, allocate the hash table bytes from malloc.
228+
// Otherwise from huge page TLB. The user needs to reserve
229+
// huge pages for it to be allocated, like:
230+
// sysctl -w vm.nr_hugepages=20
231+
// See linux doc Documentation/vm/hugetlbpage.txt
227232
extern MemTableRepFactory* NewHashLinkListRepFactory(
228-
size_t bucket_count = 50000);
233+
size_t bucket_count = 50000, size_t huge_page_tlb_size = 0);
229234

230235
// This factory creates a cuckoo-hashing based mem-table representation.
231236
// Cuckoo-hash is a closed-hash strategy, in which all key/value pairs

include/rocksdb/options.h

+8
Original file line numberDiff line numberDiff line change
@@ -498,6 +498,14 @@ struct ColumnFamilyOptions {
498498
// number of hash probes per key
499499
uint32_t memtable_prefix_bloom_probes;
500500

501+
// Page size for huge page TLB for bloom in memtable. If <=0, do not allocate
502+
// from huge page TLB but from malloc.
503+
// Need to reserve huge pages for it to be allocated. For example:
504+
// sysctl -w vm.nr_hugepages=20
505+
// See linux doc Documentation/vm/hugetlbpage.txt
506+
507+
size_t memtable_prefix_bloom_huge_page_tlb_size;
508+
501509
// Control locality of bloom filter probes to improve cache miss rate.
502510
// This option only applies to memtable prefix bloom and plaintable
503511
// prefix bloom. It essentially limits the max number of cache lines each

include/rocksdb/table.h

+15-2
Original file line numberDiff line numberDiff line change
@@ -107,12 +107,19 @@ extern TableFactory* NewBlockBasedTableFactory(
107107
// in the hash table
108108
// @index_sparseness: inside each prefix, need to build one index record for how
109109
// many keys for binary search inside each hash bucket.
110+
// @huge_page_tlb_size: if <=0, allocate hash indexes and blooms from malloc.
111+
// Otherwise from huge page TLB. The user needs to reserve
112+
// huge pages for it to be allocated, like:
113+
// sysctl -w vm.nr_hugepages=20
114+
// See linux doc Documentation/vm/hugetlbpage.txt
115+
110116
const uint32_t kPlainTableVariableLength = 0;
111117
extern TableFactory* NewPlainTableFactory(uint32_t user_key_len =
112118
kPlainTableVariableLength,
113119
int bloom_bits_per_prefix = 10,
114120
double hash_table_ratio = 0.75,
115-
size_t index_sparseness = 16);
121+
size_t index_sparseness = 16,
122+
size_t huge_page_tlb_size = 0);
116123

117124
// -- Plain Table
118125
// This factory of plain table ignores Options.prefix_extractor and assumes no
@@ -126,9 +133,15 @@ extern TableFactory* NewPlainTableFactory(uint32_t user_key_len =
126133
// disable it by passing a zero.
127134
// @index_sparseness: need to build one index record for how many keys for
128135
// binary search.
136+
// @huge_page_tlb_size: if <=0, allocate hash indexes and blooms from malloc.
137+
// Otherwise from huge page TLB. The user needs to reserve
138+
// huge pages for it to be allocated, like:
139+
// sysctl -w vm.nr_hugepages=20
140+
// See linux doc Documentation/vm/hugetlbpage.txt
129141
extern TableFactory* NewTotalOrderPlainTableFactory(
130142
uint32_t user_key_len = kPlainTableVariableLength,
131-
int bloom_bits_per_key = 0, size_t index_sparseness = 16);
143+
int bloom_bits_per_key = 0, size_t index_sparseness = 16,
144+
size_t huge_page_tlb_size = 0);
132145

133146
#endif // ROCKSDB_LITE
134147

table/plain_table_factory.cc

+9-5
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,8 @@ Status PlainTableFactory::NewTableReader(const Options& options,
2222
unique_ptr<TableReader>* table) const {
2323
return PlainTableReader::Open(options, soptions, icomp, std::move(file),
2424
file_size, table, bloom_bits_per_key_,
25-
hash_table_ratio_, index_sparseness_);
25+
hash_table_ratio_, index_sparseness_,
26+
huge_page_tlb_size_);
2627
}
2728

2829
TableBuilder* PlainTableFactory::NewTableBuilder(
@@ -34,16 +35,19 @@ TableBuilder* PlainTableFactory::NewTableBuilder(
3435
extern TableFactory* NewPlainTableFactory(uint32_t user_key_len,
3536
int bloom_bits_per_key,
3637
double hash_table_ratio,
37-
size_t index_sparseness) {
38+
size_t index_sparseness,
39+
size_t huge_page_tlb_size) {
3840
return new PlainTableFactory(user_key_len, bloom_bits_per_key,
39-
hash_table_ratio, index_sparseness);
41+
hash_table_ratio, index_sparseness,
42+
huge_page_tlb_size);
4043
}
4144

4245
extern TableFactory* NewTotalOrderPlainTableFactory(uint32_t user_key_len,
4346
int bloom_bits_per_key,
44-
size_t index_sparseness) {
47+
size_t index_sparseness,
48+
size_t huge_page_tlb_size) {
4549
return new PlainTableFactory(user_key_len, bloom_bits_per_key, 0,
46-
index_sparseness);
50+
index_sparseness, huge_page_tlb_size);
4751
}
4852

4953
} // namespace rocksdb

table/plain_table_factory.h

+8-2
Original file line numberDiff line numberDiff line change
@@ -56,14 +56,19 @@ class PlainTableFactory : public TableFactory {
5656
// inside the same prefix. It will be the maximum number of linear search
5757
// required after hash and binary search.
5858
// index_sparseness = 0 means index for every key.
59+
// huge_page_tlb_size determines whether to allocate hash indexes from huge
60+
// page TLB and the page size if allocating from there. See comments of
61+
// Arena::AllocateAligned() for details.
5962
explicit PlainTableFactory(uint32_t user_key_len = kPlainTableVariableLength,
6063
int bloom_bits_per_key = 0,
6164
double hash_table_ratio = 0.75,
62-
size_t index_sparseness = 16)
65+
size_t index_sparseness = 16,
66+
size_t huge_page_tlb_size = 2 * 1024 * 1024)
6367
: user_key_len_(user_key_len),
6468
bloom_bits_per_key_(bloom_bits_per_key),
6569
hash_table_ratio_(hash_table_ratio),
66-
index_sparseness_(index_sparseness) {}
70+
index_sparseness_(index_sparseness),
71+
huge_page_tlb_size_(huge_page_tlb_size) {}
6772
const char* Name() const override { return "PlainTable"; }
6873
Status NewTableReader(const Options& options, const EnvOptions& soptions,
6974
const InternalKeyComparator& internal_comparator,
@@ -82,6 +87,7 @@ class PlainTableFactory : public TableFactory {
8287
int bloom_bits_per_key_;
8388
double hash_table_ratio_;
8489
size_t index_sparseness_;
90+
size_t huge_page_tlb_size_;
8591
};
8692

8793
} // namespace rocksdb

table/plain_table_reader.cc

+26-15
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
#include "table/two_level_iterator.h"
2525
#include "table/plain_table_factory.h"
2626

27+
#include "util/arena.h"
2728
#include "util/coding.h"
2829
#include "util/dynamic_bloom.h"
2930
#include "util/hash.h"
@@ -95,7 +96,8 @@ PlainTableReader::PlainTableReader(
9596
const Options& options, unique_ptr<RandomAccessFile>&& file,
9697
const EnvOptions& storage_options, const InternalKeyComparator& icomparator,
9798
uint64_t file_size, int bloom_bits_per_key, double hash_table_ratio,
98-
size_t index_sparseness, const TableProperties* table_properties)
99+
size_t index_sparseness, const TableProperties* table_properties,
100+
size_t huge_page_tlb_size)
99101
: options_(options),
100102
soptions_(storage_options),
101103
file_(std::move(file)),
@@ -106,19 +108,23 @@ PlainTableReader::PlainTableReader(
106108
kIndexIntervalForSamePrefixKeys(index_sparseness),
107109
table_properties_(nullptr),
108110
data_end_offset_(table_properties->data_size),
109-
user_key_len_(table_properties->fixed_key_len) {
111+
user_key_len_(table_properties->fixed_key_len),
112+
huge_page_tlb_size_(huge_page_tlb_size) {
110113
assert(kHashTableRatio >= 0.0);
111114
}
112115

113116
PlainTableReader::~PlainTableReader() {
114117
}
115118

116-
Status PlainTableReader::Open(
117-
const Options& options, const EnvOptions& soptions,
118-
const InternalKeyComparator& internal_comparator,
119-
unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
120-
unique_ptr<TableReader>* table_reader, const int bloom_bits_per_key,
121-
double hash_table_ratio, size_t index_sparseness) {
119+
Status PlainTableReader::Open(const Options& options,
120+
const EnvOptions& soptions,
121+
const InternalKeyComparator& internal_comparator,
122+
unique_ptr<RandomAccessFile>&& file,
123+
uint64_t file_size,
124+
unique_ptr<TableReader>* table_reader,
125+
const int bloom_bits_per_key,
126+
double hash_table_ratio, size_t index_sparseness,
127+
size_t huge_page_tlb_size) {
122128
assert(options.allow_mmap_reads);
123129

124130
if (file_size > kMaxFileSize) {
@@ -134,7 +140,8 @@ Status PlainTableReader::Open(
134140

135141
std::unique_ptr<PlainTableReader> new_reader(new PlainTableReader(
136142
options, std::move(file), soptions, internal_comparator, file_size,
137-
bloom_bits_per_key, hash_table_ratio, index_sparseness, props));
143+
bloom_bits_per_key, hash_table_ratio, index_sparseness, props,
144+
huge_page_tlb_size));
138145

139146
// -- Populate Index
140147
s = new_reader->PopulateIndex(props);
@@ -261,12 +268,11 @@ Status PlainTableReader::PopulateIndexRecordList(IndexRecordList* record_list,
261268
}
262269

263270
void PlainTableReader::AllocateIndexAndBloom(int num_prefixes) {
264-
index_.reset();
265-
266271
if (options_.prefix_extractor.get() != nullptr) {
267272
uint32_t bloom_total_bits = num_prefixes * kBloomBitsPerKey;
268273
if (bloom_total_bits > 0) {
269-
bloom_.reset(new DynamicBloom(bloom_total_bits, options_.bloom_locality));
274+
bloom_.reset(new DynamicBloom(bloom_total_bits, options_.bloom_locality,
275+
6, nullptr, huge_page_tlb_size_));
270276
}
271277
}
272278

@@ -278,7 +284,6 @@ void PlainTableReader::AllocateIndexAndBloom(int num_prefixes) {
278284
double hash_table_size_multipier = 1.0 / kHashTableRatio;
279285
index_size_ = num_prefixes * hash_table_size_multipier + 1;
280286
}
281-
index_.reset(new uint32_t[index_size_]);
282287
}
283288

284289
size_t PlainTableReader::BucketizeIndexesAndFillBloom(
@@ -322,7 +327,12 @@ void PlainTableReader::FillIndexes(
322327
const std::vector<uint32_t>& entries_per_bucket) {
323328
Log(options_.info_log, "Reserving %zu bytes for plain table's sub_index",
324329
kSubIndexSize);
325-
sub_index_.reset(new char[kSubIndexSize]);
330+
auto total_allocate_size = sizeof(uint32_t) * index_size_ + kSubIndexSize;
331+
char* allocated =
332+
arena_.AllocateAligned(total_allocate_size, huge_page_tlb_size_);
333+
index_ = reinterpret_cast<uint32_t*>(allocated);
334+
sub_index_ = allocated + sizeof(uint32_t) * index_size_;
335+
326336
size_t sub_index_offset = 0;
327337
for (int i = 0; i < index_size_; i++) {
328338
uint32_t num_keys_for_bucket = entries_per_bucket[i];
@@ -387,7 +397,8 @@ Status PlainTableReader::PopulateIndex(TableProperties* props) {
387397
if (IsTotalOrderMode()) {
388398
uint32_t num_bloom_bits = table_properties_->num_entries * kBloomBitsPerKey;
389399
if (num_bloom_bits > 0) {
390-
bloom_.reset(new DynamicBloom(num_bloom_bits, options_.bloom_locality));
400+
bloom_.reset(new DynamicBloom(num_bloom_bits, options_.bloom_locality, 6,
401+
nullptr, huge_page_tlb_size_));
391402
}
392403
}
393404

table/plain_table_reader.h

+8-4
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
#include "rocksdb/table_properties.h"
2020
#include "table/table_reader.h"
2121
#include "table/plain_table_factory.h"
22+
#include "util/arena.h"
2223

2324
namespace rocksdb {
2425

@@ -52,7 +53,7 @@ class PlainTableReader: public TableReader {
5253
unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
5354
unique_ptr<TableReader>* table,
5455
const int bloom_bits_per_key, double hash_table_ratio,
55-
size_t index_sparseness);
56+
size_t index_sparseness, size_t huge_page_tlb_size);
5657

5758
Iterator* NewIterator(const ReadOptions&);
5859

@@ -74,7 +75,8 @@ class PlainTableReader: public TableReader {
7475
const InternalKeyComparator& internal_comparator,
7576
uint64_t file_size, int bloom_num_bits,
7677
double hash_table_ratio, size_t index_sparseness,
77-
const TableProperties* table_properties);
78+
const TableProperties* table_properties,
79+
size_t huge_page_tlb_size);
7880
virtual ~PlainTableReader();
7981

8082
protected:
@@ -136,9 +138,9 @@ class PlainTableReader: public TableReader {
136138
// For more details about the in-memory index, please refer to:
137139
// https://github.com/facebook/rocksdb/wiki/PlainTable-Format
138140
// #wiki-in-memory-index-format
139-
std::unique_ptr<uint32_t[]> index_;
141+
uint32_t* index_;
140142
int index_size_ = 0;
141-
std::unique_ptr<char[]> sub_index_;
143+
char* sub_index_;
142144

143145
Options options_;
144146
const EnvOptions& soptions_;
@@ -159,13 +161,15 @@ class PlainTableReader: public TableReader {
159161
const size_t kIndexIntervalForSamePrefixKeys = 16;
160162
// Bloom filter is used to rule out non-existent key
161163
unique_ptr<DynamicBloom> bloom_;
164+
Arena arena_;
162165

163166
std::shared_ptr<const TableProperties> table_properties_;
164167
// data_start_offset_ and data_end_offset_ defines the range of the
165168
// sst file that stores data.
166169
const uint32_t data_start_offset_ = 0;
167170
const uint32_t data_end_offset_;
168171
const size_t user_key_len_;
172+
const size_t huge_page_tlb_size_;
169173

170174
static const size_t kNumInternalBytes = 8;
171175
static const uint32_t kSubIndexMask = 0x80000000;

0 commit comments

Comments
 (0)