Skip to content

Commit 906f3dc

Browse files
committed
Add a hash-index component for block
Summary: This is the key component extracted from diff https://reviews.facebook.net/D14271. I separated it into a dedicated patch to make the review easier. Test Plan: Added a unit test, which passes. Reviewers: haobo, sdong, dhruba CC: leveldb Differential Revision: https://reviews.facebook.net/D16245
1 parent 6b9da48 commit 906f3dc

8 files changed

+319
-14
lines changed

Makefile

+4
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ VALGRIND_OPTS = --error-exitcode=$(VALGRIND_ERROR) --leak-check=full
5555

5656
TESTS = \
5757
db_test \
58+
block_hash_index_test \
5859
autovector_test \
5960
table_properties_collector_test \
6061
arena_test \
@@ -227,6 +228,9 @@ $(LIBRARY): $(LIBOBJECTS)
227228
db_bench: db/db_bench.o $(LIBOBJECTS) $(TESTUTIL)
228229
$(CXX) db/db_bench.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
229230

231+
block_hash_index_test: table/block_hash_index_test.o $(LIBOBJECTS) $(TESTHARNESS)
232+
$(CXX) table/block_hash_index_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
233+
230234
db_stress: tools/db_stress.o $(LIBOBJECTS) $(TESTUTIL)
231235
$(CXX) tools/db_stress.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
232236

db/memtable.cc

+2-10
Original file line numberDiff line numberDiff line change
@@ -26,15 +26,6 @@
2626
#include "util/statistics.h"
2727
#include "util/stop_watch.h"
2828

29-
namespace std {
30-
template <>
31-
struct hash<rocksdb::Slice> {
32-
size_t operator()(const rocksdb::Slice& slice) const {
33-
return MurmurHash(slice.data(), slice.size(), 0);
34-
}
35-
};
36-
}
37-
3829
namespace rocksdb {
3930

4031
MemTable::MemTable(const InternalKeyComparator& cmp, const Options& options)
@@ -167,7 +158,8 @@ Iterator* MemTable::NewIterator(const ReadOptions& options) {
167158
}
168159

169160
port::RWMutex* MemTable::GetLock(const Slice& key) {
170-
return &locks_[std::hash<Slice>()(key) % locks_.size()];
161+
static murmur_hash hash;
162+
return &locks_[hash(key) % locks_.size()];
171163
}
172164

173165
void MemTable::Add(SequenceNumber s, ValueType type,

table/block_hash_index.cc

+112
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
2+
// This source code is licensed under the BSD-style license found in the
3+
// LICENSE file in the root directory of this source tree. An additional grant
4+
// of patent rights can be found in the PATENTS file in the same directory.
5+
6+
#include <algorithm>
7+
8+
#include "table/block_hash_index.h"
9+
#include "rocksdb/comparator.h"
10+
#include "rocksdb/iterator.h"
11+
#include "rocksdb/slice_transform.h"
12+
13+
namespace rocksdb {
14+
15+
BlockHashIndex* CreateBlockHashIndex(Iterator* index_iter, Iterator* data_iter,
16+
const uint32_t num_restarts,
17+
const Comparator* comparator,
18+
const SliceTransform* hash_key_extractor) {
19+
assert(hash_key_extractor);
20+
auto hash_index = new BlockHashIndex(hash_key_extractor);
21+
uint64_t current_restart_index = 0;
22+
23+
std::string pending_entry_prefix;
24+
// pending_block_num == 0 also implies there is no entry inserted at all.
25+
uint32_t pending_block_num = 0;
26+
uint32_t pending_entry_index = 0;
27+
28+
// scan all the entries and create a hash index based on their prefixes.
29+
data_iter->SeekToFirst();
30+
for (index_iter->SeekToFirst();
31+
index_iter->Valid() && current_restart_index < num_restarts;
32+
index_iter->Next()) {
33+
Slice last_key_in_block = index_iter->key();
34+
assert(data_iter->Valid() && data_iter->status().ok());
35+
36+
// scan through all entries within a data block.
37+
while (data_iter->Valid() &&
38+
comparator->Compare(data_iter->key(), last_key_in_block) <= 0) {
39+
auto key_prefix = hash_key_extractor->Transform(data_iter->key());
40+
bool is_first_entry = pending_block_num == 0;
41+
42+
// Keys may share the prefix
43+
if (is_first_entry || pending_entry_prefix != key_prefix) {
44+
if (!is_first_entry) {
45+
bool succeeded = hash_index->Add(
46+
pending_entry_prefix, pending_entry_index, pending_block_num);
47+
if (!succeeded) {
48+
delete hash_index;
49+
return nullptr;
50+
}
51+
}
52+
53+
// update the status.
54+
// needs a hard copy otherwise the underlying data changes all the time.
55+
pending_entry_prefix = key_prefix.ToString();
56+
pending_block_num = 1;
57+
pending_entry_index = current_restart_index;
58+
} else {
59+
// the block count increments when keys that share the prefix reside in
60+
// different data blocks.
61+
auto last_restart_index = pending_entry_index + pending_block_num - 1;
62+
assert(last_restart_index <= current_restart_index);
63+
if (last_restart_index != current_restart_index) {
64+
++pending_block_num;
65+
}
66+
}
67+
data_iter->Next();
68+
}
69+
70+
++current_restart_index;
71+
}
72+
73+
// make sure all entries have been scanned.
74+
assert(!index_iter->Valid());
75+
assert(!data_iter->Valid());
76+
77+
if (pending_block_num > 0) {
78+
auto succeeded = hash_index->Add(pending_entry_prefix, pending_entry_index,
79+
pending_block_num);
80+
if (!succeeded) {
81+
delete hash_index;
82+
return nullptr;
83+
}
84+
}
85+
86+
return hash_index;
87+
}
88+
89+
bool BlockHashIndex::Add(const Slice& prefix, uint32_t restart_index,
90+
uint32_t num_blocks) {
91+
auto prefix_ptr = arena_.Allocate(prefix.size());
92+
std::copy(prefix.data() /* begin */, prefix.data() + prefix.size() /* end */,
93+
prefix_ptr /* destination */);
94+
auto result =
95+
restart_indices_.insert({Slice(prefix_ptr, prefix.size()),
96+
RestartIndex(restart_index, num_blocks)});
97+
return result.second;
98+
}
99+
100+
const BlockHashIndex::RestartIndex* BlockHashIndex::GetRestartIndex(
101+
const Slice& key) {
102+
auto key_prefix = hash_key_extractor_->Transform(key);
103+
104+
auto pos = restart_indices_.find(key_prefix);
105+
if (pos == restart_indices_.end()) {
106+
return nullptr;
107+
}
108+
109+
return &pos->second;
110+
}
111+
112+
} // namespace rocksdb

table/block_hash_index.h

+72
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
2+
// This source code is licensed under the BSD-style license found in the
3+
// LICENSE file in the root directory of this source tree. An additional grant
4+
// of patent rights can be found in the PATENTS file in the same directory.
5+
#pragma once
6+
7+
#include <string>
8+
#include <unordered_map>
9+
10+
#include "util/arena.h"
11+
#include "util/murmurhash.h"
12+
13+
namespace rocksdb {
14+
15+
class Comparator;
16+
class Iterator;
17+
class Slice;
18+
class SliceTransform;
19+
20+
// Build a hash-based index to speed up the lookup for "index block".
21+
// BlockHashIndex accepts a key and, if found, returns its restart index within
22+
// that index block.
23+
class BlockHashIndex {
24+
public:
25+
// Represents a restart index in the index block's restart array.
26+
struct RestartIndex {
27+
explicit RestartIndex(uint32_t first_index, uint32_t num_blocks = 1)
28+
: first_index(first_index), num_blocks(num_blocks) {}
29+
30+
// For a given prefix, what is the restart index for the first data block
31+
// that contains it.
32+
uint32_t first_index = 0;
33+
34+
// How many data blocks contain this prefix?
35+
uint32_t num_blocks = 1;
36+
};
37+
38+
explicit BlockHashIndex(const SliceTransform* hash_key_extractor)
39+
: hash_key_extractor_(hash_key_extractor) {}
40+
41+
// Maps a key to its restart first_index.
42+
// Returns nullptr if the restart first_index is not found
43+
const RestartIndex* GetRestartIndex(const Slice& key);
44+
45+
bool Add(const Slice& key_prefix, uint32_t restart_index,
46+
uint32_t num_blocks);
47+
48+
size_t ApproximateMemoryUsage() const {
49+
return arena_.ApproximateMemoryUsage();
50+
}
51+
52+
private:
53+
const SliceTransform* hash_key_extractor_;
54+
std::unordered_map<Slice, RestartIndex, murmur_hash> restart_indices_;
55+
Arena arena_;
56+
};
57+
58+
// Create hash index by scanning the entries in index as well as the whole
59+
// dataset.
60+
// @params index_iter: an iterator with the pointer to the first entry in a
61+
// block.
62+
// @params data_iter: an iterator that can scan all the entries that reside in a
63+
// table.
64+
// @params num_restarts: used for correctness verification.
65+
// @params hash_key_extractor: extract the hashable part of a given key.
66+
// On error, nullptr will be returned.
67+
BlockHashIndex* CreateBlockHashIndex(Iterator* index_iter, Iterator* data_iter,
68+
const uint32_t num_restarts,
69+
const Comparator* comparator,
70+
const SliceTransform* hash_key_extractor);
71+
72+
} // namespace rocksdb

table/block_hash_index_test.cc

+117
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
2+
// This source code is licensed under the BSD-style license found in the
3+
// LICENSE file in the root directory of this source tree. An additional grant
4+
// of patent rights can be found in the PATENTS file in the same directory.
5+
6+
#include <map>
7+
#include <memory>
8+
#include <vector>
9+
10+
#include "rocksdb/comparator.h"
11+
#include "rocksdb/iterator.h"
12+
#include "rocksdb/slice_transform.h"
13+
#include "table/block_hash_index.h"
14+
#include "util/testharness.h"
15+
#include "util/testutil.h"
16+
17+
namespace rocksdb {
18+
19+
typedef std::map<std::string, std::string> Data;
20+
21+
class MapIterator : public Iterator {
22+
public:
23+
explicit MapIterator(const Data& data) : data_(data), pos_(data_.end()) {}
24+
25+
virtual bool Valid() const { return pos_ != data_.end(); }
26+
27+
virtual void SeekToFirst() { pos_ = data_.begin(); }
28+
29+
virtual void SeekToLast() {
30+
pos_ = data_.end();
31+
--pos_;
32+
}
33+
34+
virtual void Seek(const Slice& target) {
35+
pos_ = data_.find(target.ToString());
36+
}
37+
38+
virtual void Next() { ++pos_; }
39+
40+
virtual void Prev() { --pos_; }
41+
42+
virtual Slice key() const { return pos_->first; }
43+
44+
virtual Slice value() const { return pos_->second; }
45+
46+
virtual Status status() const { return Status::OK(); }
47+
48+
private:
49+
const Data& data_;
50+
Data::const_iterator pos_;
51+
};
52+
53+
class BlockTest {};
54+
55+
TEST(BlockTest, BasicTest) {
56+
const size_t keys_per_block = 4;
57+
const size_t prefix_size = 2;
58+
std::vector<std::string> keys = {/* block 1 */
59+
"0101", "0102", "0103", "0201",
60+
/* block 2 */
61+
"0202", "0203", "0301", "0401",
62+
/* block 3 */
63+
"0501", "0601", "0701", "0801",
64+
/* block 4 */
65+
"0802", "0803", "0804", "0805",
66+
/* block 5 */
67+
"0806", "0807", "0808", "0809", };
68+
69+
Data data_entries;
70+
for (const auto key : keys) {
71+
data_entries.insert({key, key});
72+
}
73+
74+
Data index_entries;
75+
for (size_t i = 3; i < keys.size(); i += keys_per_block) {
76+
// simply ignore the value part
77+
index_entries.insert({keys[i], ""});
78+
}
79+
80+
MapIterator data_iter(data_entries);
81+
MapIterator index_iter(index_entries);
82+
83+
auto prefix_extractor = NewFixedPrefixTransform(prefix_size);
84+
std::unique_ptr<BlockHashIndex> block_hash_index(
85+
CreateBlockHashIndex(&index_iter, &data_iter, index_entries.size(),
86+
BytewiseComparator(), prefix_extractor));
87+
88+
std::map<std::string, BlockHashIndex::RestartIndex> expected = {
89+
{"01xx", BlockHashIndex::RestartIndex(0, 1)},
90+
{"02yy", BlockHashIndex::RestartIndex(0, 2)},
91+
{"03zz", BlockHashIndex::RestartIndex(1, 1)},
92+
{"04pp", BlockHashIndex::RestartIndex(1, 1)},
93+
{"05ww", BlockHashIndex::RestartIndex(2, 1)},
94+
{"06xx", BlockHashIndex::RestartIndex(2, 1)},
95+
{"07pp", BlockHashIndex::RestartIndex(2, 1)},
96+
{"08xz", BlockHashIndex::RestartIndex(2, 3)}, };
97+
98+
const BlockHashIndex::RestartIndex* index = nullptr;
99+
// search existing prefixes
100+
for (const auto& item : expected) {
101+
index = block_hash_index->GetRestartIndex(item.first);
102+
ASSERT_TRUE(index != nullptr);
103+
ASSERT_EQ(item.second.first_index, index->first_index);
104+
ASSERT_EQ(item.second.num_blocks, index->num_blocks);
105+
}
106+
107+
// search non-existent prefixes
108+
ASSERT_TRUE(!block_hash_index->GetRestartIndex("00xx"));
109+
ASSERT_TRUE(!block_hash_index->GetRestartIndex("10yy"));
110+
ASSERT_TRUE(!block_hash_index->GetRestartIndex("20zz"));
111+
112+
delete prefix_extractor;
113+
}
114+
115+
} // namespace rocksdb
116+
117+
int main(int argc, char** argv) { return rocksdb::test::RunAllTests(); }

util/arena.h

+2-2
Original file line numberDiff line numberDiff line change
@@ -39,12 +39,12 @@ class Arena {
3939
// Returns an estimate of the total memory usage of data allocated
4040
// by the arena (exclude the space allocated but not yet used for future
4141
// allocations).
42-
const size_t ApproximateMemoryUsage() {
42+
size_t ApproximateMemoryUsage() const {
4343
return blocks_memory_ + blocks_.capacity() * sizeof(char*) -
4444
alloc_bytes_remaining_;
4545
}
4646

47-
const size_t MemoryAllocatedBytes() { return blocks_memory_; }
47+
size_t MemoryAllocatedBytes() const { return blocks_memory_; }
4848

4949
private:
5050
// Number of bytes allocated in one block

util/hash.cc

-1
Original file line numberDiff line numberDiff line change
@@ -46,5 +46,4 @@ uint32_t Hash(const char* data, size_t n, uint32_t seed) {
4646
return h;
4747
}
4848

49-
5049
} // namespace rocksdb

util/murmurhash.h

+10-1
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
*/
1212
#pragma once
1313
#include <stdint.h>
14+
#include "rocksdb/slice.h"
1415

1516
#if defined(__x86_64__)
1617
#define MURMUR_HASH MurmurHash64A
@@ -29,5 +30,13 @@ typedef unsigned int murmur_t;
2930
unsigned int MurmurHashNeutral2 ( const void * key, int len, unsigned int seed );
3031
#define MurmurHash MurmurHashNeutral2
3132
typedef unsigned int murmur_t;
32-
3333
#endif
34+
35+
// Allow slice to be hashable by murmur hash.
36+
namespace rocksdb {
37+
struct murmur_hash {
38+
size_t operator()(const Slice& slice) const {
39+
return MurmurHash(slice.data(), slice.size(), 0);
40+
}
41+
};
42+
} // rocksdb

0 commit comments

Comments
 (0)