Skip to content

Commit 2386185

Browse files
author
Lei Jin
committed
Add ReadOptions.total_order_seek to allow a total-order seek on block-based tables when the hash index is enabled
Summary: as title
Test Plan: table_test
Reviewers: igor, yhchiang, sdong
Reviewed By: sdong
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D22239
1 parent a98badf commit 2386185

12 files changed

+161
-41
lines changed

db/db_impl.cc

+7-3
Original file line numberDiff line numberDiff line change
@@ -1406,7 +1406,9 @@ Status DBImpl::WriteLevel0TableForRecovery(ColumnFamilyData* cfd, MemTable* mem,
14061406
FileMetaData meta;
14071407
meta.fd = FileDescriptor(versions_->NewFileNumber(), 0, 0);
14081408
pending_outputs_[meta.fd.GetNumber()] = 0; // path 0 for level 0 file.
1409-
Iterator* iter = mem->NewIterator(ReadOptions(), true);
1409+
ReadOptions ro;
1410+
ro.total_order_seek = true;
1411+
Iterator* iter = mem->NewIterator(ro);
14101412
const SequenceNumber newest_snapshot = snapshots_.GetNewest();
14111413
const SequenceNumber earliest_seqno_in_memtable =
14121414
mem->GetFirstSequenceNumber();
@@ -1473,11 +1475,13 @@ Status DBImpl::WriteLevel0Table(ColumnFamilyData* cfd,
14731475
mutex_.Unlock();
14741476
log_buffer->FlushBufferToLog();
14751477
std::vector<Iterator*> memtables;
1478+
ReadOptions ro;
1479+
ro.total_order_seek = true;
14761480
for (MemTable* m : mems) {
14771481
Log(options_.info_log,
14781482
"[%s] Flushing memtable with next log file: %" PRIu64 "\n",
14791483
cfd->GetName().c_str(), m->GetNextLogNumber());
1480-
memtables.push_back(m->NewIterator(ReadOptions(), true));
1484+
memtables.push_back(m->NewIterator(ro));
14811485
}
14821486
Iterator* iter = NewMergingIterator(&cfd->internal_comparator(),
14831487
&memtables[0], memtables.size());
@@ -3300,7 +3304,7 @@ Iterator* DBImpl::NewInternalIterator(const ReadOptions& options,
33003304
MergeIteratorBuilder merge_iter_builder(&cfd->internal_comparator(), arena);
33013305
// Collect iterator for mutable mem
33023306
merge_iter_builder.AddIterator(
3303-
super_version->mem->NewIterator(options, false, arena));
3307+
super_version->mem->NewIterator(options, arena));
33043308
// Collect all needed child iterators for immutable memtables
33053309
super_version->imm->AddIterators(options, &merge_iter_builder);
33063310
// Collect iterators for files in L0 - Ln

db/memtable.cc

+6-7
Original file line numberDiff line numberDiff line change
@@ -174,13 +174,13 @@ const char* EncodeKey(std::string* scratch, const Slice& target) {
174174

175175
class MemTableIterator: public Iterator {
176176
public:
177-
MemTableIterator(const MemTable& mem, const ReadOptions& options,
178-
bool enforce_total_order, Arena* arena)
177+
MemTableIterator(
178+
const MemTable& mem, const ReadOptions& options, Arena* arena)
179179
: bloom_(nullptr),
180180
prefix_extractor_(mem.prefix_extractor_),
181181
valid_(false),
182182
arena_mode_(arena != nullptr) {
183-
if (prefix_extractor_ != nullptr && !enforce_total_order) {
183+
if (prefix_extractor_ != nullptr && !options.total_order_seek) {
184184
bloom_ = mem.prefix_bloom_.get();
185185
iter_ = mem.table_->GetDynamicPrefixIterator(arena);
186186
} else {
@@ -248,14 +248,13 @@ class MemTableIterator: public Iterator {
248248
void operator=(const MemTableIterator&);
249249
};
250250

251-
Iterator* MemTable::NewIterator(const ReadOptions& options,
252-
bool enforce_total_order, Arena* arena) {
251+
Iterator* MemTable::NewIterator(const ReadOptions& options, Arena* arena) {
253252
if (arena == nullptr) {
254-
return new MemTableIterator(*this, options, enforce_total_order, nullptr);
253+
return new MemTableIterator(*this, options, nullptr);
255254
} else {
256255
auto mem = arena->AllocateAligned(sizeof(MemTableIterator));
257256
return new (mem)
258-
MemTableIterator(*this, options, enforce_total_order, arena);
257+
MemTableIterator(*this, options, arena);
259258
}
260259
}
261260

db/memtable.h

-1
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,6 @@ class MemTable {
8282
// Calling ~Iterator of the iterator will destroy all the states but
8383
// those allocated in arena.
8484
Iterator* NewIterator(const ReadOptions& options,
85-
bool enforce_total_order = false,
8685
Arena* arena = nullptr);
8786

8887
// Add an entry into memtable that maps key to value at the

db/repair.cc

+2-1
Original file line numberDiff line numberDiff line change
@@ -237,7 +237,8 @@ class Repairer {
237237
FileMetaData meta;
238238
meta.fd = FileDescriptor(next_file_number_++, 0, 0);
239239
ReadOptions ro;
240-
Iterator* iter = mem->NewIterator(ro, true /* enforce_total_order */);
240+
ro.total_order_seek = true;
241+
Iterator* iter = mem->NewIterator(ro);
241242
status = BuildTable(dbname_, env_, options_, storage_options_, table_cache_,
242243
iter, &meta, icmp_, 0, 0, kNoCompression);
243244
delete iter;

include/rocksdb/options.h

+9-2
Original file line numberDiff line numberDiff line change
@@ -902,18 +902,25 @@ struct ReadOptions {
902902
// Not supported in ROCKSDB_LITE mode!
903903
bool tailing;
904904

905+
// Enable a total order seek regardless of index format (e.g. hash index)
906+
// used in the table. Some table formats (e.g. plain table) may not support
907+
// this option.
908+
bool total_order_seek;
909+
905910
ReadOptions()
906911
: verify_checksums(true),
907912
fill_cache(true),
908913
snapshot(nullptr),
909914
read_tier(kReadAllTier),
910-
tailing(false) {}
915+
tailing(false),
916+
total_order_seek(false) {}
911917
ReadOptions(bool cksum, bool cache)
912918
: verify_checksums(cksum),
913919
fill_cache(cache),
914920
snapshot(nullptr),
915921
read_tier(kReadAllTier),
916-
tailing(false) {}
922+
tailing(false),
923+
total_order_seek(false) {}
917924
};
918925

919926
// Options that control write operations

table/block.cc

+9-3
Original file line numberDiff line numberDiff line change
@@ -321,7 +321,8 @@ Block::~Block() {
321321
}
322322
}
323323

324-
Iterator* Block::NewIterator(const Comparator* cmp, BlockIter* iter) {
324+
Iterator* Block::NewIterator(
325+
const Comparator* cmp, BlockIter* iter, bool total_order_seek) {
325326
if (size_ < 2*sizeof(uint32_t)) {
326327
if (iter != nullptr) {
327328
iter->SetStatus(Status::Corruption("bad block contents"));
@@ -339,12 +340,17 @@ Iterator* Block::NewIterator(const Comparator* cmp, BlockIter* iter) {
339340
return NewEmptyIterator();
340341
}
341342
} else {
343+
BlockHashIndex* hash_index_ptr =
344+
total_order_seek ? nullptr : hash_index_.get();
345+
BlockPrefixIndex* prefix_index_ptr =
346+
total_order_seek ? nullptr : prefix_index_.get();
347+
342348
if (iter != nullptr) {
343349
iter->Initialize(cmp, data_, restart_offset_, num_restarts,
344-
hash_index_.get(), prefix_index_.get());
350+
hash_index_ptr, prefix_index_ptr);
345351
} else {
346352
iter = new BlockIter(cmp, data_, restart_offset_, num_restarts,
347-
hash_index_.get(), prefix_index_.get());
353+
hash_index_ptr, prefix_index_ptr);
348354
}
349355
}
350356

table/block.h

+5-1
Original file line numberDiff line numberDiff line change
@@ -45,8 +45,12 @@ class Block {
4545
//
4646
// If iter is null, return new Iterator
4747
// If iter is not null, update this one and return it as Iterator*
48+
//
49+
// If total_order_seek is true, hash_index_ and prefix_index_ are ignored.
50+
// This option only applies to index blocks. For data blocks, hash_index_
51+
// and prefix_index_ are null, so this option does not matter.
4852
Iterator* NewIterator(const Comparator* comparator,
49-
BlockIter* iter = nullptr);
53+
BlockIter* iter = nullptr, bool total_order_seek = true);
5054
void SetBlockHashIndex(BlockHashIndex* hash_index);
5155
void SetBlockPrefixIndex(BlockPrefixIndex* prefix_index);
5256

table/block_based_table_reader.cc

+15-9
Original file line numberDiff line numberDiff line change
@@ -137,7 +137,8 @@ class BlockBasedTable::IndexReader {
137137
// Create an iterator for index access.
138138
// An iter is passed in, if it is not null, update this one and return it
139139
// If it is null, create a new Iterator
140-
virtual Iterator* NewIterator(BlockIter* iter = nullptr) = 0;
140+
virtual Iterator* NewIterator(
141+
BlockIter* iter = nullptr, bool total_order_seek = true) = 0;
141142

142143
// The size of the index.
143144
virtual size_t size() const = 0;
@@ -174,8 +175,9 @@ class BinarySearchIndexReader : public IndexReader {
174175
return s;
175176
}
176177

177-
virtual Iterator* NewIterator(BlockIter* iter = nullptr) override {
178-
return index_block_->NewIterator(comparator_, iter);
178+
virtual Iterator* NewIterator(
179+
BlockIter* iter = nullptr, bool dont_care = true) override {
180+
return index_block_->NewIterator(comparator_, iter, true);
179181
}
180182

181183
virtual size_t size() const override { return index_block_->size(); }
@@ -295,8 +297,9 @@ class HashIndexReader : public IndexReader {
295297
return Status::OK();
296298
}
297299

298-
virtual Iterator* NewIterator(BlockIter* iter = nullptr) override {
299-
return index_block_->NewIterator(comparator_, iter);
300+
virtual Iterator* NewIterator(
301+
BlockIter* iter = nullptr, bool total_order_seek = true) override {
302+
return index_block_->NewIterator(comparator_, iter, total_order_seek);
300303
}
301304

302305
virtual size_t size() const override { return index_block_->size(); }
@@ -818,7 +821,8 @@ Iterator* BlockBasedTable::NewIndexIterator(const ReadOptions& read_options,
818821
BlockIter* input_iter) {
819822
// index reader has already been pre-populated.
820823
if (rep_->index_reader) {
821-
return rep_->index_reader->NewIterator(input_iter);
824+
return rep_->index_reader->NewIterator(
825+
input_iter, read_options.total_order_seek);
822826
}
823827

824828
bool no_io = read_options.read_tier == kBlockCacheTier;
@@ -866,10 +870,9 @@ Iterator* BlockBasedTable::NewIndexIterator(const ReadOptions& read_options,
866870
}
867871

868872
assert(cache_handle);
869-
Iterator* iter;
870-
iter = index_reader->NewIterator(input_iter);
873+
auto* iter = index_reader->NewIterator(
874+
input_iter, read_options.total_order_seek);
871875
iter->RegisterCleanup(&ReleaseCachedEntry, block_cache, cache_handle);
872-
873876
return iter;
874877
}
875878

@@ -988,6 +991,9 @@ class BlockBasedTable::BlockEntryIteratorState : public TwoLevelIteratorState {
988991
}
989992

990993
bool PrefixMayMatch(const Slice& internal_key) override {
994+
if (read_options_.total_order_seek) {
995+
return true;
996+
}
991997
return table_->PrefixMayMatch(internal_key);
992998
}
993999

table/block_test.cc

+1-1
Original file line numberDiff line numberDiff line change
@@ -172,7 +172,7 @@ void CheckBlockContents(BlockContents contents, const int max_key,
172172
}
173173

174174
std::unique_ptr<Iterator> hash_iter(
175-
reader1.NewIterator(BytewiseComparator()));
175+
reader1.NewIterator(BytewiseComparator(), nullptr, false));
176176

177177
std::unique_ptr<Iterator> regular_iter(
178178
reader2.NewIterator(BytewiseComparator()));

table/cuckoo_table_reader.cc

+9-2
Original file line numberDiff line numberDiff line change
@@ -271,10 +271,17 @@ Slice CuckooTableIterator::value() const {
271271
return curr_value_;
272272
}
273273

274-
Iterator* CuckooTableReader::NewIterator(const ReadOptions&, Arena* arena) {
274+
extern Iterator* NewErrorIterator(const Status& status, Arena* arena);
275+
276+
Iterator* CuckooTableReader::NewIterator(
277+
const ReadOptions& read_options, Arena* arena) {
275278
if (!status().ok()) {
276279
return NewErrorIterator(
277-
Status::Corruption("CuckooTableReader status is not okay."));
280+
Status::Corruption("CuckooTableReader status is not okay."), arena);
281+
}
282+
if (read_options.total_order_seek) {
283+
return NewErrorIterator(
284+
Status::InvalidArgument("total_order_seek is not supported."), arena);
278285
}
279286
CuckooTableIterator* iter;
280287
if (arena == nullptr) {

table/plain_table_reader.cc

+4
Original file line numberDiff line numberDiff line change
@@ -187,6 +187,10 @@ void PlainTableReader::SetupForCompaction() {
187187

188188
Iterator* PlainTableReader::NewIterator(const ReadOptions& options,
189189
Arena* arena) {
190+
if (options.total_order_seek && !IsTotalOrderMode()) {
191+
return NewErrorIterator(
192+
Status::InvalidArgument("total_order_seek not supported"), arena);
193+
}
190194
if (arena == nullptr) {
191195
return new PlainTableIterator(this, prefix_extractor_ != nullptr);
192196
} else {

0 commit comments

Comments
 (0)