Skip to content

Commit aa0ef66

Browse files
committed
[Performance Branch] If options.max_open_files is set to -1, cache table readers in FileMetaData for Get() and NewIterator()
Summary: In some use cases, table readers for all live files should always be cached. In that case, there is an opportunity to avoid the table cache look-up during Get() and NewIterator(). We define options.max_open_files = -1 to be the mode in which table readers for live files are always kept. In that mode, table readers are cached in FileMetaData (with a reference count held in the table cache), so that when executing table_cache.Get() and table_cache.NewIterator(), the LRU cache check can be bypassed to reduce latency. Test Plan: add a test case in db_test Reviewers: haobo, kailiu Reviewed By: haobo CC: dhruba, igor, leveldb Differential Revision: https://reviews.facebook.net/D15039
1 parent 5b5ab0c commit aa0ef66

10 files changed

+124
-58
lines changed

db/builder.cc

+1-2
Original file line numberDiff line numberDiff line change
@@ -204,8 +204,7 @@ Status BuildTable(const std::string& dbname,
204204
// Verify that the table is usable
205205
Iterator* it = table_cache->NewIterator(ReadOptions(),
206206
soptions,
207-
meta->number,
208-
meta->file_size);
207+
*meta);
209208
s = it->status();
210209
delete it;
211210
}

db/db_impl.cc

+15-6
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,10 @@ Options SanitizeOptions(const std::string& dbname,
126126
Options result = src;
127127
result.comparator = icmp;
128128
result.filter_policy = (src.filter_policy != nullptr) ? ipolicy : nullptr;
129-
ClipToRange(&result.max_open_files, 20, 1000000);
129+
// result.max_open_files == -1 means an "infinite" number of open files.
130+
if (result.max_open_files != -1) {
131+
ClipToRange(&result.max_open_files, 20, 1000000);
132+
}
130133
ClipToRange(&result.write_buffer_size, ((size_t)64)<<10,
131134
((size_t)64)<<30);
132135
ClipToRange(&result.block_size, 1<<10, 4<<20);
@@ -278,7 +281,10 @@ DBImpl::DBImpl(const Options& options, const std::string& dbname)
278281
}
279282

280283
// Reserve ten files or so for other uses and give the rest to TableCache.
281-
const int table_cache_size = options_.max_open_files - 10;
284+
// Use a large table cache size when max_open_files is set to "infinite" (-1).
285+
const int table_cache_size =
286+
(options_.max_open_files == -1) ?
287+
4194304 : options_.max_open_files - 10;
282288
table_cache_.reset(new TableCache(dbname_, &options_,
283289
storage_options_, table_cache_size));
284290
versions_.reset(new VersionSet(dbname_, &options_, storage_options_,
@@ -335,6 +341,9 @@ DBImpl::~DBImpl() {
335341
for (MemTable* m: to_delete) {
336342
delete m;
337343
}
344+
// versions need to be destroyed before table_cache since it can hold
345+
// references to table_cache.
346+
versions_.reset();
338347
LogFlush(options_.info_log);
339348
}
340349

@@ -2095,10 +2104,10 @@ Status DBImpl::FinishCompactionOutputFile(CompactionState* compact,
20952104

20962105
if (s.ok() && current_entries > 0) {
20972106
// Verify that the table is usable
2107+
FileMetaData meta(output_number, current_bytes);
20982108
Iterator* iter = table_cache_->NewIterator(ReadOptions(),
20992109
storage_options_,
2100-
output_number,
2101-
current_bytes);
2110+
meta);
21022111
s = iter->status();
21032112
delete iter;
21042113
if (s.ok()) {
@@ -3701,7 +3710,7 @@ Status DBImpl::DeleteFile(std::string name) {
37013710
}
37023711

37033712
int level;
3704-
FileMetaData metadata;
3713+
FileMetaData* metadata;
37053714
int maxlevel = NumberLevels();
37063715
VersionEdit edit(maxlevel);
37073716
DeletionState deletion_state(true);
@@ -3716,7 +3725,7 @@ Status DBImpl::DeleteFile(std::string name) {
37163725
assert((level > 0) && (level < maxlevel));
37173726

37183727
// If the file is being compacted no need to delete.
3719-
if (metadata.being_compacted) {
3728+
if (metadata->being_compacted) {
37203729
Log(options_.info_log,
37213730
"DeleteFile %s Skipped. File about to be compacted\n", name.c_str());
37223731
return Status::OK();

db/db_test.cc

+4
Original file line numberDiff line numberDiff line change
@@ -265,6 +265,7 @@ class DBTest {
265265
kHashSkipList,
266266
kUniversalCompaction,
267267
kCompressedBlockCache,
268+
kInfiniteMaxOpenFiles,
268269
kEnd
269270
};
270271
int option_config_;
@@ -415,6 +416,9 @@ class DBTest {
415416
case kCompressedBlockCache:
416417
options.block_cache_compressed = NewLRUCache(8*1024*1024);
417418
break;
419+
case kInfiniteMaxOpenFiles:
420+
options.max_open_files = -1;
421+
break;
418422
default:
419423
break;
420424
}

db/repair.cc

+2-1
Original file line numberDiff line numberDiff line change
@@ -265,8 +265,9 @@ class Repairer {
265265
int counter = 0;
266266
Status status = env_->GetFileSize(fname, &t->meta.file_size);
267267
if (status.ok()) {
268+
FileMetaData dummy_meta(t->meta.number, t->meta.file_size);
268269
Iterator* iter = table_cache_->NewIterator(
269-
ReadOptions(), storage_options_, t->meta.number, t->meta.file_size);
270+
ReadOptions(), storage_options_, dummy_meta);
270271
bool empty = true;
271272
ParsedInternalKey parsed;
272273
t->min_sequence = 0;

db/table_cache.cc

+33-21
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
#include "db/table_cache.h"
1111

1212
#include "db/filename.h"
13+
#include "db/version_edit.h"
1314

1415
#include "rocksdb/statistics.h"
1516
#include "rocksdb/table.h"
@@ -50,6 +51,14 @@ TableCache::TableCache(const std::string& dbname,
5051
TableCache::~TableCache() {
5152
}
5253

54+
TableReader* TableCache::GetTableReaderFromHandle(Cache::Handle* handle) {
55+
return reinterpret_cast<TableReader*>(cache_->Value(handle));
56+
}
57+
58+
void TableCache::ReleaseHandle(Cache::Handle* handle) {
59+
cache_->Release(handle);
60+
}
61+
5362
Status TableCache::FindTable(const EnvOptions& toptions,
5463
uint64_t file_number, uint64_t file_size,
5564
Cache::Handle** handle, bool* table_io,
@@ -94,25 +103,27 @@ Status TableCache::FindTable(const EnvOptions& toptions,
94103

95104
Iterator* TableCache::NewIterator(const ReadOptions& options,
96105
const EnvOptions& toptions,
97-
uint64_t file_number,
98-
uint64_t file_size,
106+
const FileMetaData& file_meta,
99107
TableReader** table_reader_ptr,
100108
bool for_compaction) {
101109
if (table_reader_ptr != nullptr) {
102110
*table_reader_ptr = nullptr;
103111
}
104-
105-
Cache::Handle* handle = nullptr;
106-
Status s = FindTable(toptions, file_number, file_size, &handle,
107-
nullptr, options.read_tier == kBlockCacheTier);
112+
Cache::Handle* handle = file_meta.table_reader_handle;
113+
Status s;
114+
if (!handle) {
115+
s = FindTable(toptions, file_meta.number, file_meta.file_size, &handle,
116+
nullptr, options.read_tier == kBlockCacheTier);
117+
}
108118
if (!s.ok()) {
109119
return NewErrorIterator(s);
110120
}
111121

112-
TableReader* table_reader =
113-
reinterpret_cast<TableReader*>(cache_->Value(handle));
122+
TableReader* table_reader = GetTableReaderFromHandle(handle);
114123
Iterator* result = table_reader->NewIterator(options);
115-
result->RegisterCleanup(&UnrefEntry, cache_.get(), handle);
124+
if (!file_meta.table_reader_handle) {
125+
result->RegisterCleanup(&UnrefEntry, cache_.get(), handle);
126+
}
116127
if (table_reader_ptr != nullptr) {
117128
*table_reader_ptr = table_reader;
118129
}
@@ -125,22 +136,24 @@ Iterator* TableCache::NewIterator(const ReadOptions& options,
125136
}
126137

127138
Status TableCache::Get(const ReadOptions& options,
128-
uint64_t file_number,
129-
uint64_t file_size,
139+
const FileMetaData& file_meta,
130140
const Slice& k,
131141
void* arg,
132142
bool (*saver)(void*, const Slice&, const Slice&, bool),
133143
bool* table_io,
134144
void (*mark_key_may_exist)(void*)) {
135-
Cache::Handle* handle = nullptr;
136-
Status s = FindTable(storage_options_, file_number, file_size,
137-
&handle, table_io,
138-
options.read_tier == kBlockCacheTier);
145+
Cache::Handle* handle = file_meta.table_reader_handle;
146+
Status s;
147+
if (!handle) {
148+
s = FindTable(storage_options_, file_meta.number, file_meta.file_size,
149+
&handle, table_io, options.read_tier == kBlockCacheTier);
150+
}
139151
if (s.ok()) {
140-
TableReader* t =
141-
reinterpret_cast<TableReader*>(cache_->Value(handle));
152+
TableReader* t = GetTableReaderFromHandle(handle);
142153
s = t->Get(options, k, arg, saver, mark_key_may_exist);
143-
cache_->Release(handle);
154+
if (!file_meta.table_reader_handle) {
155+
ReleaseHandle(handle);
156+
}
144157
} else if (options.read_tier && s.IsIncomplete()) {
145158
// Couldn't find Table in cache but treat as kFound if no_io set
146159
(*mark_key_may_exist)(arg);
@@ -159,10 +172,9 @@ bool TableCache::PrefixMayMatch(const ReadOptions& options,
159172
file_size, &handle, table_io);
160173
bool may_match = true;
161174
if (s.ok()) {
162-
TableReader* t =
163-
reinterpret_cast<TableReader*>(cache_->Value(handle));
175+
TableReader* t = GetTableReaderFromHandle(handle);
164176
may_match = t->PrefixMayMatch(internal_prefix);
165-
cache_->Release(handle);
177+
ReleaseHandle(handle);
166178
}
167179
return may_match;
168180
}

db/table_cache.h

+14-8
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
namespace rocksdb {
2222

2323
class Env;
24+
struct FileMetaData;
2425

2526
class TableCache {
2627
public:
@@ -37,17 +38,15 @@ class TableCache {
3738
// returned iterator is live.
3839
Iterator* NewIterator(const ReadOptions& options,
3940
const EnvOptions& toptions,
40-
uint64_t file_number,
41-
uint64_t file_size,
41+
const FileMetaData& file_meta,
4242
TableReader** table_reader_ptr = nullptr,
4343
bool for_compaction = false);
4444

4545
// If a seek to internal key "k" in specified file finds an entry,
4646
// call (*handle_result)(arg, found_key, found_value) repeatedly until
4747
// it returns false.
4848
Status Get(const ReadOptions& options,
49-
uint64_t file_number,
50-
uint64_t file_size,
49+
const FileMetaData& file_meta,
5150
const Slice& k,
5251
void* arg,
5352
bool (*handle_result)(void*, const Slice&, const Slice&, bool),
@@ -63,16 +62,23 @@ class TableCache {
6362
// Evict any entry for the specified file number
6463
void Evict(uint64_t file_number);
6564

65+
// Find table reader
66+
Status FindTable(const EnvOptions& toptions, uint64_t file_number,
67+
uint64_t file_size, Cache::Handle**, bool* table_io=nullptr,
68+
const bool no_io = false);
69+
70+
// Get TableReader from a cache handle.
71+
TableReader* GetTableReaderFromHandle(Cache::Handle* handle);
72+
73+
// Release the handle from a cache
74+
void ReleaseHandle(Cache::Handle* handle);
75+
6676
private:
6777
Env* const env_;
6878
const std::string dbname_;
6979
const Options* options_;
7080
const EnvOptions& storage_options_;
7181
std::shared_ptr<Cache> cache_;
72-
73-
Status FindTable(const EnvOptions& toptions, uint64_t file_number,
74-
uint64_t file_size, Cache::Handle**, bool* table_io=nullptr,
75-
const bool no_io = false);
7682
};
7783

7884
} // namespace rocksdb

db/version_edit.h

+9-2
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
#include <set>
1212
#include <utility>
1313
#include <vector>
14+
#include "rocksdb/cache.h"
1415
#include "db/dbformat.h"
1516

1617
namespace rocksdb {
@@ -28,8 +29,14 @@ struct FileMetaData {
2829
SequenceNumber smallest_seqno;// The smallest seqno in this file
2930
SequenceNumber largest_seqno; // The largest seqno in this file
3031

31-
FileMetaData() : refs(0), allowed_seeks(1 << 30), file_size(0),
32-
being_compacted(false) { }
32+
// Needs to be disposed when refs becomes 0.
33+
Cache::Handle* table_reader_handle;
34+
35+
FileMetaData(uint64_t number, uint64_t file_size) :
36+
refs(0), allowed_seeks(1 << 30), number(number), file_size(file_size),
37+
being_compacted(false), table_reader_handle(nullptr) {
38+
}
39+
FileMetaData() : FileMetaData(0, 0) { }
3340
};
3441

3542
class VersionEdit {

0 commit comments

Comments
 (0)