Skip to content

Commit ff6ec0e

Browse files
committed
Optimize SpatialDB
Summary: Two changes: (1) use a hash-based index for the data column family; (2) use Get() instead of Iterator Seek() when the DB is opened read-only. Test Plan: added a read-only test to the unit tests. Reviewers: yinwang. Reviewed By: yinwang. Subscribers: leveldb. Differential Revision: https://reviews.facebook.net/D22323
1 parent 2386185 commit ff6ec0e

File tree

2 files changed

+219
-105
lines changed

2 files changed

+219
-105
lines changed

utilities/spatialdb/spatial_db.cc

+173-71
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616
#include <unordered_set>
1717

1818
#include "rocksdb/cache.h"
19+
#include "rocksdb/options.h"
20+
#include "rocksdb/slice_transform.h"
1921
#include "rocksdb/table.h"
2022
#include "rocksdb/db.h"
2123
#include "rocksdb/utilities/stackable_db.h"
@@ -244,13 +246,76 @@ std::string FeatureSet::DebugString() const {
244246
return out + "}";
245247
}
246248

249+
class ValueGetter {
250+
public:
251+
ValueGetter() {}
252+
virtual ~ValueGetter() {}
253+
254+
virtual bool Get(uint64_t id) = 0;
255+
virtual const Slice value() const = 0;
256+
257+
virtual Status status() const = 0;
258+
};
259+
260+
class ValueGetterFromDB : public ValueGetter {
261+
public:
262+
ValueGetterFromDB(DB* db, ColumnFamilyHandle* cf) : db_(db), cf_(cf) {}
263+
264+
virtual bool Get(uint64_t id) override {
265+
std::string encoded_id;
266+
PutFixed64BigEndian(&encoded_id, id);
267+
status_ = db_->Get(ReadOptions(), cf_, encoded_id, &value_);
268+
if (status_.IsNotFound()) {
269+
status_ = Status::Corruption("Index inconsistency");
270+
return false;
271+
}
272+
273+
return true;
274+
}
275+
276+
virtual const Slice value() const override { return value_; }
277+
278+
virtual Status status() const override { return status_; }
279+
280+
private:
281+
std::string value_;
282+
DB* db_;
283+
ColumnFamilyHandle* cf_;
284+
Status status_;
285+
};
286+
287+
class ValueGetterFromIterator : public ValueGetter {
288+
public:
289+
explicit ValueGetterFromIterator(Iterator* iterator) : iterator_(iterator) {}
290+
291+
virtual bool Get(uint64_t id) override {
292+
std::string encoded_id;
293+
PutFixed64BigEndian(&encoded_id, id);
294+
iterator_->Seek(encoded_id);
295+
296+
if (!iterator_->Valid() || iterator_->key() != Slice(encoded_id)) {
297+
status_ = Status::Corruption("Index inconsistency");
298+
return false;
299+
}
300+
301+
return true;
302+
}
303+
304+
virtual const Slice value() const override { return iterator_->value(); }
305+
306+
virtual Status status() const override { return status_; }
307+
308+
private:
309+
std::unique_ptr<Iterator> iterator_;
310+
Status status_;
311+
};
312+
247313
class SpatialIndexCursor : public Cursor {
248314
public:
249315
// tile_box is inclusive
250-
SpatialIndexCursor(Iterator* spatial_iterator, Iterator* data_iterator,
316+
SpatialIndexCursor(Iterator* spatial_iterator, ValueGetter* value_getter,
251317
const BoundingBox<uint64_t>& tile_bbox, uint32_t tile_bits)
252-
: data_iterator_(data_iterator),
253-
valid_(true) {
318+
: value_getter_(value_getter), valid_(true) {
254319
// calculate quad keys we'll need to query
255320
std::vector<uint64_t> quad_keys;
256321
quad_keys.reserve((tile_bbox.max_x - tile_bbox.min_x + 1) *
@@ -329,7 +394,7 @@ class SpatialIndexCursor : public Cursor {
329394
if (!status_.ok()) {
330395
return status_;
331396
}
332-
return data_iterator_->status();
397+
return value_getter_->status();
333398
}
334399

335400
private:
@@ -356,32 +421,23 @@ class SpatialIndexCursor : public Cursor {
356421
return true;
357422
}
358423

359-
// doesn't return anything, but sets valid_ and status_ on corruption
360424
void ExtractData() {
361425
assert(valid_);
362-
std::string encoded_id;
363-
PutFixed64BigEndian(&encoded_id, *primary_keys_iterator_);
426+
valid_ = value_getter_->Get(*primary_keys_iterator_);
364427

365-
data_iterator_->Seek(encoded_id);
366-
367-
if (!data_iterator_->Valid() ||
368-
data_iterator_->key() != Slice(encoded_id)) {
369-
status_ = Status::Corruption("Index inconsistency");
370-
valid_ = false;
371-
return;
428+
if (valid_) {
429+
Slice data = value_getter_->value();
430+
current_feature_set_.Clear();
431+
if (!GetLengthPrefixedSlice(&data, &current_blob_) ||
432+
!current_feature_set_.Deserialize(data)) {
433+
status_ = Status::Corruption("Primary key column family corruption");
434+
valid_ = false;
435+
}
372436
}
373437

374-
Slice data = data_iterator_->value();
375-
current_feature_set_.Clear();
376-
if (!GetLengthPrefixedSlice(&data, &current_blob_) ||
377-
!current_feature_set_.Deserialize(data)) {
378-
status_ = Status::Corruption("Primary key column family corruption");
379-
valid_ = false;
380-
return;
381-
}
382438
}
383439

384-
unique_ptr<Iterator> data_iterator_;
440+
unique_ptr<ValueGetter> value_getter_;
385441
bool valid_;
386442
Status status_;
387443

@@ -427,10 +483,11 @@ class SpatialDBImpl : public SpatialDB {
427483
DB* db, ColumnFamilyHandle* data_column_family,
428484
const std::vector<std::pair<SpatialIndexOptions, ColumnFamilyHandle*>>&
429485
spatial_indexes,
430-
uint64_t next_id)
486+
uint64_t next_id, bool read_only)
431487
: SpatialDB(db),
432488
data_column_family_(data_column_family),
433-
next_id_(next_id) {
489+
next_id_(next_id),
490+
read_only_(read_only) {
434491
for (const auto& index : spatial_indexes) {
435492
name_to_index_.insert(
436493
{index.first.name, IndexColumnFamily(index.first, index.second)});
@@ -521,17 +578,26 @@ class SpatialDBImpl : public SpatialDB {
521578
return new ErrorCursor(Status::InvalidArgument(
522579
"Spatial index " + spatial_index + " not found"));
523580
}
581+
const auto& si = itr->second.index;
582+
Iterator* spatial_iterator;
583+
ValueGetter* value_getter;
524584

525-
std::vector<Iterator*> iterators;
526-
Status s = NewIterators(read_options,
527-
{data_column_family_, itr->second.column_family},
528-
&iterators);
529-
if (!s.ok()) {
530-
return new ErrorCursor(s);
531-
}
585+
if (read_only_) {
586+
spatial_iterator = NewIterator(read_options, itr->second.column_family);
587+
value_getter = new ValueGetterFromDB(this, data_column_family_);
588+
} else {
589+
std::vector<Iterator*> iterators;
590+
Status s = NewIterators(read_options,
591+
{data_column_family_, itr->second.column_family},
592+
&iterators);
593+
if (!s.ok()) {
594+
return new ErrorCursor(s);
595+
}
532596

533-
const auto& si = itr->second.index;
534-
return new SpatialIndexCursor(iterators[1], iterators[0],
597+
spatial_iterator = iterators[1];
598+
value_getter = new ValueGetterFromIterator(iterators[0]);
599+
}
600+
return new SpatialIndexCursor(spatial_iterator, value_getter,
535601
GetTileBoundingBox(si, bbox), si.tile_bits);
536602
}
537603

@@ -548,31 +614,61 @@ class SpatialDBImpl : public SpatialDB {
548614
std::unordered_map<std::string, IndexColumnFamily> name_to_index_;
549615

550616
std::atomic<uint64_t> next_id_;
617+
bool read_only_;
551618
};
552619

553620
namespace {
554-
Options GetRocksDBOptionsFromOptions(const SpatialDBOptions& options) {
555-
Options rocksdb_options;
556-
rocksdb_options.IncreaseParallelism(options.num_threads);
557-
rocksdb_options.write_buffer_size = 256 * 1024 * 1024; // 256MB
558-
rocksdb_options.max_bytes_for_level_base = 1024 * 1024 * 1024; // 1 GB
621+
// Builds the DB-wide options for a SpatialDB from the user-facing
// SpatialDBOptions: parallelism follows options.num_threads, and data syncing
// is disabled when bulk loading.
DBOptions GetDBOptions(const SpatialDBOptions& options) {
  DBOptions result;
  result.IncreaseParallelism(options.num_threads);
  if (options.bulk_load) {
    result.disableDataSync = true;
  }
  return result;
}
629+
630+
ColumnFamilyOptions GetColumnFamilyOptions(const SpatialDBOptions& options,
631+
std::shared_ptr<Cache> block_cache) {
632+
ColumnFamilyOptions column_family_options;
633+
column_family_options.write_buffer_size = 256 * 1024 * 1024; // 256MB
634+
column_family_options.max_bytes_for_level_base = 1024 * 1024 * 1024; // 1 GB
559635
// only compress levels >= 1
560-
rocksdb_options.compression_per_level.resize(rocksdb_options.num_levels);
561-
for (int i = 0; i < rocksdb_options.num_levels; ++i) {
636+
column_family_options.compression_per_level.resize(
637+
column_family_options.num_levels);
638+
for (int i = 0; i < column_family_options.num_levels; ++i) {
562639
if (i == 0) {
563-
rocksdb_options.compression_per_level[i] = kNoCompression;
640+
column_family_options.compression_per_level[i] = kNoCompression;
564641
} else {
565-
rocksdb_options.compression_per_level[i] = kLZ4Compression;
642+
column_family_options.compression_per_level[i] = kLZ4Compression;
566643
}
567644
}
568645
BlockBasedTableOptions table_options;
569-
table_options.block_cache = NewLRUCache(options.cache_size);
570-
rocksdb_options.table_factory.reset(NewBlockBasedTableFactory(table_options));
646+
table_options.block_cache = block_cache;
647+
column_family_options.table_factory.reset(
648+
NewBlockBasedTableFactory(table_options));
571649
if (options.bulk_load) {
572-
rocksdb_options.PrepareForBulkLoad();
573-
}
574-
return rocksdb_options;
650+
column_family_options.level0_file_num_compaction_trigger = (1 << 30);
651+
column_family_options.level0_slowdown_writes_trigger = (1 << 30);
652+
column_family_options.level0_stop_writes_trigger = (1 << 30);
653+
column_family_options.disable_auto_compactions = true;
654+
column_family_options.source_compaction_factor = (1 << 30);
655+
column_family_options.num_levels = 2;
656+
column_family_options.target_file_size_base = 256 * 1024 * 1024;
657+
column_family_options.max_mem_compaction_level = 0;
658+
}
659+
return column_family_options;
660+
}
661+
662+
// Returns a copy of `options` tuned for the data column family: a no-op
// prefix extractor is installed and the table factory is replaced by a
// block-based one that uses a hash-search index and the given block cache.
// Note that the replacement table factory carries only these two table
// settings; anything configured on the previous factory is not copied over.
ColumnFamilyOptions OptimizeOptionsForDataColumnFamily(
    ColumnFamilyOptions options, std::shared_ptr<Cache> block_cache) {
  BlockBasedTableOptions table_options;
  table_options.index_type = BlockBasedTableOptions::kHashSearch;
  table_options.block_cache = block_cache;

  options.prefix_extractor.reset(NewNoopTransform());
  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
  return options;
}
671+
576672
} // namespace
577673

578674
class MetadataStorage {
@@ -618,26 +714,30 @@ class MetadataStorage {
618714
Status SpatialDB::Create(
619715
const SpatialDBOptions& options, const std::string& name,
620716
const std::vector<SpatialIndexOptions>& spatial_indexes) {
621-
Options rocksdb_options = GetRocksDBOptionsFromOptions(options);
622-
rocksdb_options.create_if_missing = true;
623-
rocksdb_options.create_missing_column_families = true;
624-
rocksdb_options.error_if_exists = true;
717+
DBOptions db_options = GetDBOptions(options);
718+
db_options.create_if_missing = true;
719+
db_options.create_missing_column_families = true;
720+
db_options.error_if_exists = true;
721+
722+
auto block_cache = NewLRUCache(options.cache_size);
723+
ColumnFamilyOptions column_family_options =
724+
GetColumnFamilyOptions(options, block_cache);
625725

626726
std::vector<ColumnFamilyDescriptor> column_families;
627727
column_families.push_back(ColumnFamilyDescriptor(
628-
kDefaultColumnFamilyName, ColumnFamilyOptions(rocksdb_options)));
629-
column_families.push_back(ColumnFamilyDescriptor(
630-
kMetadataColumnFamilyName, ColumnFamilyOptions(rocksdb_options)));
728+
kDefaultColumnFamilyName,
729+
OptimizeOptionsForDataColumnFamily(column_family_options, block_cache)));
730+
column_families.push_back(
731+
ColumnFamilyDescriptor(kMetadataColumnFamilyName, column_family_options));
631732

632733
for (const auto& index : spatial_indexes) {
633734
column_families.emplace_back(GetSpatialIndexColumnFamilyName(index.name),
634-
ColumnFamilyOptions(rocksdb_options));
735+
column_family_options);
635736
}
636737

637738
std::vector<ColumnFamilyHandle*> handles;
638739
DB* base_db;
639-
Status s = DB::Open(DBOptions(rocksdb_options), name, column_families,
640-
&handles, &base_db);
740+
Status s = DB::Open(db_options, name, column_families, &handles, &base_db);
641741
if (!s.ok()) {
642742
return s;
643743
}
@@ -659,13 +759,15 @@ Status SpatialDB::Create(
659759

660760
Status SpatialDB::Open(const SpatialDBOptions& options, const std::string& name,
661761
SpatialDB** db, bool read_only) {
662-
Options rocksdb_options = GetRocksDBOptionsFromOptions(options);
762+
DBOptions db_options = GetDBOptions(options);
763+
auto block_cache = NewLRUCache(options.cache_size);
764+
ColumnFamilyOptions column_family_options =
765+
GetColumnFamilyOptions(options, block_cache);
663766

664767
Status s;
665768
std::vector<std::string> existing_column_families;
666769
std::vector<std::string> spatial_indexes;
667-
s = DB::ListColumnFamilies(DBOptions(rocksdb_options), name,
668-
&existing_column_families);
770+
s = DB::ListColumnFamilies(db_options, name, &existing_column_families);
669771
if (!s.ok()) {
670772
return s;
671773
}
@@ -678,22 +780,22 @@ Status SpatialDB::Open(const SpatialDBOptions& options, const std::string& name,
678780

679781
std::vector<ColumnFamilyDescriptor> column_families;
680782
column_families.push_back(ColumnFamilyDescriptor(
681-
kDefaultColumnFamilyName, ColumnFamilyOptions(rocksdb_options)));
682-
column_families.push_back(ColumnFamilyDescriptor(
683-
kMetadataColumnFamilyName, ColumnFamilyOptions(rocksdb_options)));
783+
kDefaultColumnFamilyName,
784+
OptimizeOptionsForDataColumnFamily(column_family_options, block_cache)));
785+
column_families.push_back(
786+
ColumnFamilyDescriptor(kMetadataColumnFamilyName, column_family_options));
684787

685788
for (const auto& index : spatial_indexes) {
686789
column_families.emplace_back(GetSpatialIndexColumnFamilyName(index),
687-
ColumnFamilyOptions(rocksdb_options));
790+
column_family_options);
688791
}
689792
std::vector<ColumnFamilyHandle*> handles;
690793
DB* base_db;
691794
if (read_only) {
692-
s = DB::OpenForReadOnly(DBOptions(rocksdb_options), name, column_families,
693-
&handles, &base_db);
795+
s = DB::OpenForReadOnly(db_options, name, column_families, &handles,
796+
&base_db);
694797
} else {
695-
s = DB::Open(DBOptions(rocksdb_options), name, column_families, &handles,
696-
&base_db);
798+
s = DB::Open(db_options, name, column_families, &handles, &base_db);
697799
}
698800
if (!s.ok()) {
699801
return s;
@@ -730,13 +832,13 @@ Status SpatialDB::Open(const SpatialDBOptions& options, const std::string& name,
730832
for (auto h : handles) {
731833
delete h;
732834
}
733-
delete db;
835+
delete base_db;
734836
return s;
735837
}
736838

737839
// I don't need metadata column family any more, so delete it
738840
delete handles[1];
739-
*db = new SpatialDBImpl(base_db, handles[0], index_cf, next_id);
841+
*db = new SpatialDBImpl(base_db, handles[0], index_cf, next_id, read_only);
740842
return Status::OK();
741843
}
742844

0 commit comments

Comments
 (0)