Skip to content

Commit 94997ea

Browse files
author
Lei Jin
committed
reduce memory usage of cuckoo table builder
Summary: The builder currently buffers all key/value pairs as a vector of pair<string, string>. That uses too much memory because of the per-object std::string overhead: it was not able to fit 1B key/value pairs (12 bytes total per pair) in 100GB of RAM. Switch to a single plain string that stores the key/value sequence back to back, which uses only 12GB of RAM as a result. Test Plan: db_bench Reviewers: igor, sdong, yhchiang Reviewed By: sdong Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D23763
1 parent c627595 commit 94997ea

File tree

2 files changed

+56
-35
lines changed

2 files changed

+56
-35
lines changed

table/cuckoo_table_builder.cc

+43-33
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,9 @@ CuckooTableBuilder::CuckooTableBuilder(
6060
hash_table_size_(use_module_hash ? 0 : 2),
6161
is_last_level_file_(false),
6262
has_seen_first_key_(false),
63+
key_size_(0),
64+
value_size_(0),
65+
num_entries_(0),
6366
ucomp_(user_comparator),
6467
use_module_hash_(use_module_hash),
6568
identity_as_first_hash_(identity_as_first_hash),
@@ -72,7 +75,7 @@ CuckooTableBuilder::CuckooTableBuilder(
7275
}
7376

7477
void CuckooTableBuilder::Add(const Slice& key, const Slice& value) {
75-
if (kvs_.size() >= kMaxVectorIdx - 1) {
78+
if (num_entries_ >= kMaxVectorIdx - 1) {
7679
status_ = Status::NotSupported("Number of keys in a file must be < 2^32-1");
7780
return;
7881
}
@@ -90,15 +93,18 @@ void CuckooTableBuilder::Add(const Slice& key, const Slice& value) {
9093
has_seen_first_key_ = true;
9194
smallest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size());
9295
largest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size());
96+
key_size_ = is_last_level_file_ ? ikey.user_key.size() : key.size();
97+
value_size_ = value.size();
9398
}
9499
// If even one sequence number is non-zero, this is not a last-level file.
95100
assert(!is_last_level_file_ || ikey.sequence == 0);
96101
if (is_last_level_file_) {
97-
kvs_.emplace_back(std::make_pair(
98-
ikey.user_key.ToString(), value.ToString()));
102+
kvs_.append(ikey.user_key.data(), ikey.user_key.size());
99103
} else {
100-
kvs_.emplace_back(std::make_pair(key.ToString(), value.ToString()));
104+
kvs_.append(key.data(), key.size());
101105
}
106+
kvs_.append(value.data(), value.size());
107+
++num_entries_;
102108

103109
// In order to fill the empty buckets in the hash table, we identify a
104110
// key which is not used so far (unused_user_key). We determine this by
@@ -111,21 +117,32 @@ void CuckooTableBuilder::Add(const Slice& key, const Slice& value) {
111117
largest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size());
112118
}
113119
if (!use_module_hash_) {
114-
if (hash_table_size_ < kvs_.size() / max_hash_table_ratio_) {
120+
if (hash_table_size_ < num_entries_ / max_hash_table_ratio_) {
115121
hash_table_size_ *= 2;
116122
}
117123
}
118124
}
119125

126+
Slice CuckooTableBuilder::GetKey(uint64_t idx) const {
127+
return Slice(&kvs_[idx * (key_size_ + value_size_)], key_size_);
128+
}
129+
130+
Slice CuckooTableBuilder::GetUserKey(uint64_t idx) const {
131+
return is_last_level_file_ ? GetKey(idx) : ExtractUserKey(GetKey(idx));
132+
}
133+
134+
Slice CuckooTableBuilder::GetValue(uint64_t idx) const {
135+
return Slice(&kvs_[idx * (key_size_ + value_size_) + key_size_], value_size_);
136+
}
137+
120138
Status CuckooTableBuilder::MakeHashTable(std::vector<CuckooBucket>* buckets) {
121139
buckets->resize(hash_table_size_ + cuckoo_block_size_ - 1);
122140
uint64_t make_space_for_key_call_id = 0;
123-
for (uint32_t vector_idx = 0; vector_idx < kvs_.size(); vector_idx++) {
141+
for (uint32_t vector_idx = 0; vector_idx < num_entries_; vector_idx++) {
124142
uint64_t bucket_id;
125143
bool bucket_found = false;
126144
autovector<uint64_t> hash_vals;
127-
Slice user_key = is_last_level_file_ ? kvs_[vector_idx].first :
128-
ExtractUserKey(kvs_[vector_idx].first);
145+
Slice user_key = GetUserKey(vector_idx);
129146
for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_ && !bucket_found;
130147
++hash_cnt) {
131148
uint64_t hash_val = CuckooHash(user_key, hash_cnt, use_module_hash_,
@@ -140,10 +157,8 @@ Status CuckooTableBuilder::MakeHashTable(std::vector<CuckooBucket>* buckets) {
140157
bucket_found = true;
141158
break;
142159
} else {
143-
if (ucomp_->Compare(user_key, is_last_level_file_
144-
? Slice(kvs_[(*buckets)[hash_val].vector_idx].first)
145-
: ExtractUserKey(
146-
kvs_[(*buckets)[hash_val].vector_idx].first)) == 0) {
160+
if (ucomp_->Compare(user_key,
161+
GetUserKey((*buckets)[hash_val].vector_idx)) == 0) {
147162
return Status::NotSupported("Same key is being inserted again.");
148163
}
149164
hash_vals.push_back(hash_val);
@@ -183,10 +198,10 @@ Status CuckooTableBuilder::Finish() {
183198
std::vector<CuckooBucket> buckets;
184199
Status s;
185200
std::string unused_bucket;
186-
if (!kvs_.empty()) {
201+
if (num_entries_ > 0) {
187202
// Calculate the real hash size if module hash is enabled.
188203
if (use_module_hash_) {
189-
hash_table_size_ = kvs_.size() / max_hash_table_ratio_;
204+
hash_table_size_ = num_entries_ / max_hash_table_ratio_;
190205
}
191206
s = MakeHashTable(&buckets);
192207
if (!s.ok()) {
@@ -224,14 +239,13 @@ Status CuckooTableBuilder::Finish() {
224239
AppendInternalKey(&unused_bucket, ikey);
225240
}
226241
}
227-
properties_.num_entries = kvs_.size();
228-
properties_.fixed_key_len = unused_bucket.size();
229-
uint32_t value_length = kvs_.empty() ? 0 : kvs_[0].second.size();
230-
uint32_t bucket_size = value_length + properties_.fixed_key_len;
242+
properties_.num_entries = num_entries_;
243+
properties_.fixed_key_len = key_size_;
231244
properties_.user_collected_properties[
232245
CuckooTablePropertyNames::kValueLength].assign(
233-
reinterpret_cast<const char*>(&value_length), sizeof(value_length));
246+
reinterpret_cast<const char*>(&value_size_), sizeof(value_size_));
234247

248+
uint64_t bucket_size = key_size_ + value_size_;
235249
unused_bucket.resize(bucket_size, 'a');
236250
// Write the table.
237251
uint32_t num_added = 0;
@@ -240,9 +254,9 @@ Status CuckooTableBuilder::Finish() {
240254
s = file_->Append(Slice(unused_bucket));
241255
} else {
242256
++num_added;
243-
s = file_->Append(kvs_[bucket.vector_idx].first);
257+
s = file_->Append(GetKey(bucket.vector_idx));
244258
if (s.ok()) {
245-
s = file_->Append(kvs_[bucket.vector_idx].second);
259+
s = file_->Append(GetValue(bucket.vector_idx));
246260
}
247261
}
248262
if (!s.ok()) {
@@ -251,7 +265,7 @@ Status CuckooTableBuilder::Finish() {
251265
}
252266
assert(num_added == NumEntries());
253267
properties_.raw_key_size = num_added * properties_.fixed_key_len;
254-
properties_.raw_value_size = num_added * value_length;
268+
properties_.raw_value_size = num_added * value_size_;
255269

256270
uint64_t offset = buckets.size() * bucket_size;
257271
properties_.data_size = offset;
@@ -330,31 +344,29 @@ void CuckooTableBuilder::Abandon() {
330344
}
331345

332346
uint64_t CuckooTableBuilder::NumEntries() const {
333-
return kvs_.size();
347+
return num_entries_;
334348
}
335349

336350
uint64_t CuckooTableBuilder::FileSize() const {
337351
if (closed_) {
338352
return file_->GetFileSize();
339-
} else if (kvs_.size() == 0) {
353+
} else if (num_entries_ == 0) {
340354
return 0;
341355
}
342356

343357
if (use_module_hash_) {
344-
return (kvs_[0].first.size() + kvs_[0].second.size()) * kvs_.size() /
345-
max_hash_table_ratio_;
358+
return (key_size_ + value_size_) * num_entries_ / max_hash_table_ratio_;
346359
} else {
347360
// Account for buckets being a power of two.
348361
// As elements are added, file size remains constant for a while and
349362
// doubles its size. Since compaction algorithm stops adding elements
350363
// only after it exceeds the file limit, we account for the extra element
351364
// being added here.
352365
uint64_t expected_hash_table_size = hash_table_size_;
353-
if (expected_hash_table_size < (kvs_.size() + 1) / max_hash_table_ratio_) {
366+
if (expected_hash_table_size < (num_entries_ + 1) / max_hash_table_ratio_) {
354367
expected_hash_table_size *= 2;
355368
}
356-
return (kvs_[0].first.size() + kvs_[0].second.size()) *
357-
expected_hash_table_size - 1;
369+
return (key_size_ + value_size_) * expected_hash_table_size - 1;
358370
}
359371
}
360372

@@ -390,7 +402,7 @@ bool CuckooTableBuilder::MakeSpaceForKey(
390402
// of the method. We store this number into the nodes that we explore in
391403
// current method call.
392404
// It is unlikely for the increment operation to overflow because the maximum
393-
// no. of times this will be called is <= max_num_hash_func_ + kvs_.size().
405+
// no. of times this will be called is <= max_num_hash_func_ + num_entries_.
394406
for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_; ++hash_cnt) {
395407
uint64_t bucket_id = hash_vals[hash_cnt];
396408
(*buckets)[bucket_id].make_space_for_key_call_id =
@@ -408,9 +420,7 @@ bool CuckooTableBuilder::MakeSpaceForKey(
408420
CuckooBucket& curr_bucket = (*buckets)[curr_node.bucket_id];
409421
for (uint32_t hash_cnt = 0;
410422
hash_cnt < num_hash_func_ && !null_found; ++hash_cnt) {
411-
uint64_t child_bucket_id = CuckooHash(
412-
(is_last_level_file_ ? kvs_[curr_bucket.vector_idx].first :
413-
ExtractUserKey(Slice(kvs_[curr_bucket.vector_idx].first))),
423+
uint64_t child_bucket_id = CuckooHash(GetUserKey(curr_bucket.vector_idx),
414424
hash_cnt, use_module_hash_, hash_table_size_, identity_as_first_hash_,
415425
get_slice_hash_);
416426
// Iterate inside Cuckoo Block.

table/cuckoo_table_builder.h

+13-2
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,10 @@ class CuckooTableBuilder: public TableBuilder {
7575
uint64_t* bucket_id);
7676
Status MakeHashTable(std::vector<CuckooBucket>* buckets);
7777

78+
inline Slice GetKey(uint64_t idx) const;
79+
inline Slice GetUserKey(uint64_t idx) const;
80+
inline Slice GetValue(uint64_t idx) const;
81+
7882
uint32_t num_hash_func_;
7983
WritableFile* file_;
8084
const double max_hash_table_ratio_;
@@ -83,10 +87,17 @@ class CuckooTableBuilder: public TableBuilder {
8387
const uint32_t cuckoo_block_size_;
8488
uint64_t hash_table_size_;
8589
bool is_last_level_file_;
90+
bool has_seen_first_key_;
91+
uint64_t key_size_;
92+
uint64_t value_size_;
93+
// A list of fixed-size key-value pairs concatenated into a single string.
94+
// Use GetKey(), GetUserKey(), and GetValue() to retrieve a specific
95+
// key / value given an index.
96+
std::string kvs_;
97+
// Number of key-value pairs stored in kvs_
98+
uint64_t num_entries_;
8699
Status status_;
87-
std::vector<std::pair<std::string, std::string>> kvs_;
88100
TableProperties properties_;
89-
bool has_seen_first_key_;
90101
const Comparator* ucomp_;
91102
bool use_module_hash_;
92103
bool identity_as_first_hash_;

0 commit comments

Comments
 (0)