Skip to content

Commit 3304266

Browse files
committed
Reduce malloc of iterators in Get() code paths
Summary: This patch optimizes the Get() code paths by avoiding malloc of iterators. Iterator creation is moved into the mem table rep implementations, which invoke a callback whenever a key is found. This follows the same practice as in the (SST) table readers. In a db_bench readrandom run following a writeseq (no compression, single thread, tmpfs), throughput improved to 144958 from 139027, about 3%. Test Plan: make all check Reviewers: dhruba, haobo, igor Reviewed By: haobo CC: leveldb, yhchiang Differential Revision: https://reviews.facebook.net/D14685
1 parent d4b789f commit 3304266

10 files changed

+229
-99
lines changed

db/memtable.cc

+136-96
Original file line numberDiff line numberDiff line change
@@ -207,116 +207,147 @@ void MemTable::Add(SequenceNumber s, ValueType type,
207207
}
208208
}
209209

210+
// Callback from MemTable::Get()
211+
namespace {
212+
213+
struct Saver {
214+
Status* status;
215+
const LookupKey* key;
216+
bool* found_final_value; // Is value set correctly? Used by KeyMayExist
217+
bool* merge_in_progress;
218+
std::string* value;
219+
const MergeOperator* merge_operator;
220+
// the merge operations encountered;
221+
MergeContext* merge_context;
222+
MemTable* mem;
223+
Logger* logger;
224+
Statistics* statistics;
225+
bool inplace_update_support;
226+
};
227+
} // namespace
228+
229+
static bool SaveValue(void* arg, const char* entry) {
230+
Saver* s = reinterpret_cast<Saver*>(arg);
231+
MergeContext* merge_context = s->merge_context;
232+
const MergeOperator* merge_operator = s->merge_operator;
233+
234+
assert(s != nullptr && merge_context != nullptr);
235+
236+
// entry format is:
237+
// klength varint32
238+
// userkey char[klength-8]
239+
// tag uint64
240+
// vlength varint32
241+
// value char[vlength]
242+
// Check that it belongs to same user key. We do not check the
243+
// sequence number since the Seek() call above should have skipped
244+
// all entries with overly large sequence numbers.
245+
uint32_t key_length;
246+
const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
247+
if (s->mem->GetInternalKeyComparator().user_comparator()->Compare(
248+
Slice(key_ptr, key_length - 8), s->key->user_key()) == 0) {
249+
// Correct user key
250+
const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
251+
switch (static_cast<ValueType>(tag & 0xff)) {
252+
case kTypeValue: {
253+
if (s->inplace_update_support) {
254+
s->mem->GetLock(s->key->user_key())->ReadLock();
255+
}
256+
Slice v = GetLengthPrefixedSlice(key_ptr + key_length);
257+
*(s->status) = Status::OK();
258+
if (*(s->merge_in_progress)) {
259+
assert(merge_operator);
260+
if (!merge_operator->FullMerge(s->key->user_key(), &v,
261+
merge_context->GetOperands(), s->value,
262+
s->logger)) {
263+
RecordTick(s->statistics, NUMBER_MERGE_FAILURES);
264+
*(s->status) =
265+
Status::Corruption("Error: Could not perform merge.");
266+
}
267+
} else {
268+
s->value->assign(v.data(), v.size());
269+
}
270+
if (s->inplace_update_support) {
271+
s->mem->GetLock(s->key->user_key())->Unlock();
272+
}
273+
*(s->found_final_value) = true;
274+
return false;
275+
}
276+
case kTypeDeletion: {
277+
if (*(s->merge_in_progress)) {
278+
assert(merge_operator);
279+
*(s->status) = Status::OK();
280+
if (!merge_operator->FullMerge(s->key->user_key(), nullptr,
281+
merge_context->GetOperands(), s->value,
282+
s->logger)) {
283+
RecordTick(s->statistics, NUMBER_MERGE_FAILURES);
284+
*(s->status) =
285+
Status::Corruption("Error: Could not perform merge.");
286+
}
287+
} else {
288+
*(s->status) = Status::NotFound();
289+
}
290+
*(s->found_final_value) = true;
291+
return false;
292+
}
293+
case kTypeMerge: {
294+
std::string merge_result; // temporary area for merge results later
295+
Slice v = GetLengthPrefixedSlice(key_ptr + key_length);
296+
*(s->merge_in_progress) = true;
297+
merge_context->PushOperand(v);
298+
while (merge_context->GetNumOperands() >= 2) {
299+
// Attempt an associative merge. (Returns true if successful)
300+
if (merge_operator->PartialMerge(
301+
s->key->user_key(), merge_context->GetOperand(0),
302+
merge_context->GetOperand(1), &merge_result, s->logger)) {
303+
merge_context->PushPartialMergeResult(merge_result);
304+
} else {
305+
// Stack the operands because the user's operator can't merge them associatively
306+
break;
307+
}
308+
}
309+
return true;
310+
}
311+
default:
312+
assert(false);
313+
return true;
314+
}
315+
}
316+
317+
// s->state could be Corrupt, merge or notfound
318+
return false;
319+
}
320+
210321
bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
211322
MergeContext& merge_context, const Options& options) {
212323
StopWatchNano memtable_get_timer(options.env, false);
213324
StartPerfTimer(&memtable_get_timer);
214325

215-
Slice mem_key = key.memtable_key();
216326
Slice user_key = key.user_key();
327+
bool found_final_value = false;
328+
bool merge_in_progress = s->IsMergeInProgress();
217329

218-
std::unique_ptr<MemTableRep::Iterator> iter;
219330
if (prefix_bloom_ &&
220331
!prefix_bloom_->MayContain(prefix_extractor_->Transform(user_key))) {
221332
// iter is null if prefix bloom says the key does not exist
222333
} else {
223-
iter.reset(table_->GetIterator(user_key));
224-
iter->Seek(key.internal_key(), mem_key.data());
225-
}
226-
227-
bool merge_in_progress = s->IsMergeInProgress();
228-
auto merge_operator = options.merge_operator.get();
229-
auto logger = options.info_log;
230-
std::string merge_result;
231-
232-
bool found_final_value = false;
233-
for (; !found_final_value && iter && iter->Valid(); iter->Next()) {
234-
// entry format is:
235-
// klength varint32
236-
// userkey char[klength-8]
237-
// tag uint64
238-
// vlength varint32
239-
// value char[vlength]
240-
// Check that it belongs to same user key. We do not check the
241-
// sequence number since the Seek() call above should have skipped
242-
// all entries with overly large sequence numbers.
243-
const char* entry = iter->key();
244-
uint32_t key_length = 0;
245-
const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
246-
if (comparator_.comparator.user_comparator()->Compare(
247-
Slice(key_ptr, key_length - 8), key.user_key()) == 0) {
248-
// Correct user key
249-
const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
250-
switch (static_cast<ValueType>(tag & 0xff)) {
251-
case kTypeValue: {
252-
if (options.inplace_update_support) {
253-
GetLock(key.user_key())->ReadLock();
254-
}
255-
Slice v = GetLengthPrefixedSlice(key_ptr + key_length);
256-
*s = Status::OK();
257-
if (merge_in_progress) {
258-
assert(merge_operator);
259-
if (!merge_operator->FullMerge(key.user_key(), &v,
260-
merge_context.GetOperands(), value,
261-
logger.get())) {
262-
RecordTick(options.statistics.get(), NUMBER_MERGE_FAILURES);
263-
*s = Status::Corruption("Error: Could not perform merge.");
264-
}
265-
} else {
266-
value->assign(v.data(), v.size());
267-
}
268-
if (options.inplace_update_support) {
269-
GetLock(key.user_key())->Unlock();
270-
}
271-
found_final_value = true;
272-
break;
273-
}
274-
case kTypeDeletion: {
275-
if (merge_in_progress) {
276-
assert(merge_operator);
277-
*s = Status::OK();
278-
if (!merge_operator->FullMerge(key.user_key(), nullptr,
279-
merge_context.GetOperands(), value,
280-
logger.get())) {
281-
RecordTick(options.statistics.get(), NUMBER_MERGE_FAILURES);
282-
*s = Status::Corruption("Error: Could not perform merge.");
283-
}
284-
} else {
285-
*s = Status::NotFound();
286-
}
287-
found_final_value = true;
288-
break;
289-
}
290-
case kTypeMerge: {
291-
Slice v = GetLengthPrefixedSlice(key_ptr + key_length);
292-
merge_in_progress = true;
293-
merge_context.PushOperand(v);
294-
while(merge_context.GetNumOperands() >= 2) {
295-
// Attempt to associative merge. (Returns true if successful)
296-
if (merge_operator->PartialMerge(key.user_key(),
297-
merge_context.GetOperand(0),
298-
merge_context.GetOperand(1),
299-
&merge_result, logger.get())) {
300-
merge_context.PushPartialMergeResult(merge_result);
301-
} else {
302-
// Stack them because user can't associative merge
303-
break;
304-
}
305-
}
306-
break;
307-
}
308-
default:
309-
assert(false);
310-
break;
311-
}
312-
} else {
313-
// exit loop if user key does not match
314-
break;
315-
}
334+
Saver saver;
335+
saver.status = s;
336+
saver.found_final_value = &found_final_value;
337+
saver.merge_in_progress = &merge_in_progress;
338+
saver.key = &key;
339+
saver.value = value;
340+
saver.status = s;
341+
saver.mem = this;
342+
saver.merge_context = &merge_context;
343+
saver.merge_operator = options.merge_operator.get();
344+
saver.logger = options.info_log.get();
345+
saver.inplace_update_support = options.inplace_update_support;
346+
saver.statistics = options.statistics.get();
347+
table_->Get(key, &saver, SaveValue);
316348
}
317349

318350
// No change to value, since we have not yet found a Put/Delete
319-
320351
if (!found_final_value && merge_in_progress) {
321352
*s = Status::MergeInProgress("");
322353
}
@@ -488,4 +519,13 @@ size_t MemTable::CountSuccessiveMergeEntries(const LookupKey& key) {
488519
return num_successive_merges;
489520
}
490521

522+
void MemTableRep::Get(const LookupKey& k, void* callback_args,
523+
bool (*callback_func)(void* arg, const char* entry)) {
524+
auto iter = GetIterator(k.user_key());
525+
for (iter->Seek(k.internal_key(), k.memtable_key().data());
526+
iter->Valid() && callback_func(callback_args, iter->key());
527+
iter->Next()) {
528+
}
529+
}
530+
491531
} // namespace rocksdb

db/memtable.h

+7-3
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,13 @@ class MemTable {
154154
// Notify the underlying storage that no more items will be added
155155
void MarkImmutable() { table_->MarkReadOnly(); }
156156

157+
// Get the lock associated for the key
158+
port::RWMutex* GetLock(const Slice& key);
159+
160+
const InternalKeyComparator& GetInternalKeyComparator() const {
161+
return comparator_.comparator;
162+
}
163+
157164
private:
158165
friend class MemTableIterator;
159166
friend class MemTableBackwardIterator;
@@ -190,9 +197,6 @@ class MemTable {
190197
MemTable(const MemTable&);
191198
void operator=(const MemTable&);
192199

193-
// Get the lock associated for the key
194-
port::RWMutex* GetLock(const Slice& key);
195-
196200
const SliceTransform* const prefix_extractor_;
197201
std::unique_ptr<DynamicBloom> prefix_bloom_;
198202
};

db/skiplist.h

+1
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
#pragma once
3434
#include <assert.h>
3535
#include <stdlib.h>
36+
#include "util/arena.h"
3637
#include "port/port.h"
3738
#include "util/arena.h"
3839
#include "util/random.h"

db/version_set.h

+1
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ class TableCache;
4343
class Version;
4444
class VersionSet;
4545
class MergeContext;
46+
class LookupKey;
4647

4748
// Return the smallest index i such that files[i]->largest >= key.
4849
// Return files.size() if there is no such file.

include/rocksdb/db.h

+1
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ struct ReadOptions;
3131
struct WriteOptions;
3232
struct FlushOptions;
3333
class WriteBatch;
34+
class Env;
3435

3536
// Metadata associated with each SST file.
3637
struct LiveFileMetaData {

include/rocksdb/memtablerep.h

+15
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
namespace rocksdb {
4242

4343
class Arena;
44+
class LookupKey;
4445
class Slice;
4546
class SliceTransform;
4647

@@ -74,6 +75,20 @@ class MemTableRep {
7475
// nothing.
7576
virtual void MarkReadOnly() { }
7677

78+
// Look up the key in the mem table. Starting from the first key in the mem
79+
// table whose user_key matches the given k, call callback_func() with
80+
// callback_args forwarded as the first parameter and the mem table key as
81+
// the second parameter. If the callback returns false, iteration stops;
82+
// otherwise it proceeds to the next key.
83+
// It is safe for Get() to stop either after having visited all of the
84+
// potential entries for k.user_key(), or before that.
85+
//
86+
// Default:
87+
// The default implementation constructs an iterator dynamically, seeks to
88+
// the key, and invokes the callback on each entry found.
89+
virtual void Get(const LookupKey& k, void* callback_args,
90+
bool (*callback_func)(void* arg, const char* entry));
91+
7792
// Report an approximation of how much memory has been used other than memory
7893
// that was allocated through the arena.
7994
virtual size_t ApproximateMemoryUsage() = 0;

util/hash_linklist_rep.cc

+17
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,10 @@ class HashLinkListRep : public MemTableRep {
6464

6565
virtual size_t ApproximateMemoryUsage() override;
6666

67+
virtual void Get(const LookupKey& k, void* callback_args,
68+
bool (*callback_func)(void* arg,
69+
const char* entry)) override;
70+
6771
virtual ~HashLinkListRep();
6872

6973
virtual MemTableRep::Iterator* GetIterator() override;
@@ -398,6 +402,19 @@ size_t HashLinkListRep::ApproximateMemoryUsage() {
398402
return 0;
399403
}
400404

405+
void HashLinkListRep::Get(const LookupKey& k, void* callback_args,
406+
bool (*callback_func)(void* arg, const char* entry)) {
407+
auto transformed = transform_->Transform(k.user_key());
408+
auto bucket = GetBucket(transformed);
409+
if (bucket != nullptr) {
410+
Iterator iter(this, bucket);
411+
for (iter.Seek(k.internal_key(), nullptr);
412+
iter.Valid() && callback_func(callback_args, iter.key());
413+
iter.Next()) {
414+
}
415+
}
416+
}
417+
401418
MemTableRep::Iterator* HashLinkListRep::GetIterator() {
402419
auto list = new FullList(compare_, arena_);
403420
for (size_t i = 0; i < bucket_size_; ++i) {

util/hash_skiplist_rep.cc

+17
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,10 @@ class HashSkipListRep : public MemTableRep {
3131

3232
virtual size_t ApproximateMemoryUsage() override;
3333

34+
virtual void Get(const LookupKey& k, void* callback_args,
35+
bool (*callback_func)(void* arg,
36+
const char* entry)) override;
37+
3438
virtual ~HashSkipListRep();
3539

3640
virtual MemTableRep::Iterator* GetIterator() override;
@@ -271,6 +275,19 @@ size_t HashSkipListRep::ApproximateMemoryUsage() {
271275
return sizeof(buckets_);
272276
}
273277

278+
void HashSkipListRep::Get(const LookupKey& k, void* callback_args,
279+
bool (*callback_func)(void* arg, const char* entry)) {
280+
auto transformed = transform_->Transform(k.user_key());
281+
auto bucket = GetBucket(transformed);
282+
if (bucket != nullptr) {
283+
Bucket::Iterator iter(bucket);
284+
for (iter.Seek(k.memtable_key().data());
285+
iter.Valid() && callback_func(callback_args, iter.key());
286+
iter.Next()) {
287+
}
288+
}
289+
}
290+
274291
MemTableRep::Iterator* HashSkipListRep::GetIterator() {
275292
auto list = new Bucket(compare_, arena_);
276293
for (size_t i = 0; i < bucket_size_; ++i) {

0 commit comments

Comments
 (0)