
Commit b135d01

Siying Dong authored and committed
Allow users to profile a query and see bottleneck of the query
Summary: Provide a framework to profile a query in detail to figure out the latency bottleneck. Currently, in Get(), Put() and iterators, 2-3 simple timings are used. We can easily add more profiling counters to the framework later.

Test Plan: Enable this profiling in several existing tests.

Reviewers: haobo, dhruba, kailiu, emayanke, vamsi, igor

CC: leveldb

Differential Revision: https://reviews.facebook.net/D14001

Conflicts: table/merger.cc
1 parent 58e1956 commit b135d01

11 files changed: +207, -22 lines
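The new counters accumulate in a thread-local perf_context object that callers reset before a query and read afterwards. Below is a minimal usage sketch built only from the identifiers visible in this diff (SetPerfLevel, kEnableTime, kDisable, perf_context and its get_* fields); the rocksdb:: qualification, the header path, and the nanosecond units are assumptions rather than something this page confirms.

#include <cstdio>
#include <string>

#include "rocksdb/db.h"
#include "rocksdb/perf_context.h"  // SetPerfLevel(), thread-local perf_context

// Sketch: profile a single Get() and print where the time went.
void ProfileOneGet(rocksdb::DB* db, const rocksdb::Slice& key) {
  rocksdb::SetPerfLevel(rocksdb::kEnableTime);  // start collecting timings
  rocksdb::perf_context.Reset();                // drop counts from earlier queries

  std::string value;
  db->Get(rocksdb::ReadOptions(), key, &value);

  // Counters are cumulative since Reset(); the timers come from StopWatchNano,
  // so the values are assumed to be nanoseconds.
  std::fprintf(stderr, "snapshot: %llu, sst files: %llu, post-process: %llu\n",
               (unsigned long long)rocksdb::perf_context.get_snapshot_time,
               (unsigned long long)rocksdb::perf_context.get_from_output_files_time,
               (unsigned long long)rocksdb::perf_context.get_post_process_time);

  rocksdb::SetPerfLevel(rocksdb::kDisable);     // stop paying the timing overhead
}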

db/db_bench.cc (+7, -1)

@@ -22,6 +22,7 @@
 #include "rocksdb/memtablerep.h"
 #include "rocksdb/write_batch.h"
 #include "rocksdb/statistics.h"
+#include "rocksdb/perf_context.h"
 #include "port/port.h"
 #include "util/bit_set.h"
 #include "util/crc32c.h"
@@ -350,6 +351,8 @@ DEFINE_int64(stats_interval, 0, "Stats are reported every N operations when "
 DEFINE_int32(stats_per_interval, 0, "Reports additional stats per interval when"
 " this is greater than 0.");

+DEFINE_int32(perf_level, 0, "Level of perf collection");
+
 static bool ValidateRateLimit(const char* flagname, double value) {
 static constexpr double EPSILON = 1e-10;
 if ( value < -EPSILON ) {
@@ -689,6 +692,7 @@ struct SharedState {
 port::Mutex mu;
 port::CondVar cv;
 int total;
+int perf_level;

 // Each thread goes through the following states:
 // (1) initializing
@@ -700,7 +704,7 @@ struct SharedState {
 long num_done;
 bool start;

-SharedState() : cv(&mu) { }
+SharedState() : cv(&mu), perf_level(FLAGS_perf_level) { }
 };

 // Per-thread state for concurrent executions of the same benchmark.
@@ -810,6 +814,7 @@ class Benchmark {
 fprintf(stdout, "Memtablerep: vector\n");
 break;
 }
+fprintf(stdout, "Perf Level: %d\n", FLAGS_perf_level);

 PrintWarnings();
 fprintf(stdout, "------------------------------------------------\n");
@@ -1150,6 +1155,7 @@ class Benchmark {
 }
 }

+SetPerfLevel(static_cast<PerfLevel> (shared->perf_level));
 thread->stats.Start(thread->tid);
 (arg->bm->*(arg->method))(thread);
 thread->stats.Stop();
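Note on the db_bench wiring: the perf level looks like a per-thread setting, which is why the flag value rides through SharedState and SetPerfLevel() is called inside each worker thread rather than once in main(). A condensed, hypothetical mirror of that plumbing follows; the gflags include and the meaning of the integer-to-PerfLevel cast are assumptions, since the real enum values live in rocksdb/perf_context.h and are not shown on this page.

#include <gflags/gflags.h>
#include "rocksdb/perf_context.h"

DEFINE_int32(perf_level, 0, "Level of perf collection");  // same flag as in the diff

namespace {

struct SharedState {
  int perf_level;
  SharedState() : perf_level(FLAGS_perf_level) {}
};

void ThreadBody(SharedState* shared) {
  // Enabling collection here, in the worker, means every benchmark thread
  // gets its own counters instead of inheriting whatever main() set up.
  rocksdb::SetPerfLevel(static_cast<rocksdb::PerfLevel>(shared->perf_level));
  // ... run the benchmark method, then report rocksdb::perf_context ...
}

}  // anonymous namespace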

db/db_impl.cc (+31, -3)

@@ -1035,7 +1035,7 @@ Status DBImpl::WriteLevel0Table(std::vector<MemTable*> &mems, VersionEdit* edit,
 (unsigned long)m->GetLogNumber());
 list.push_back(m->NewIterator());
 }
-Iterator* iter = NewMergingIterator(&internal_comparator_, &list[0],
+Iterator* iter = NewMergingIterator(env_, &internal_comparator_, &list[0],
 list.size());
 const SequenceNumber newest_snapshot = snapshots_.GetNewest();
 const SequenceNumber earliest_seqno_in_memtable =
@@ -2519,7 +2519,7 @@ Iterator* DBImpl::NewInternalIterator(const ReadOptions& options,
 // Collect iterators for files in L0 - Ln
 versions_->current()->AddIterators(options, storage_options_, &list);
 Iterator* internal_iter =
-NewMergingIterator(&internal_comparator_, &list[0], list.size());
+NewMergingIterator(env_, &internal_comparator_, &list[0], list.size());
 versions_->current()->Ref();

 cleanup->mu = &mutex_;
@@ -2555,6 +2555,8 @@ Status DBImpl::GetImpl(const ReadOptions& options,
 Status s;

 StopWatch sw(env_, options_.statistics, DB_GET);
+StopWatchNano snapshot_timer(env_, false);
+StartPerfTimer(&snapshot_timer);
 SequenceNumber snapshot;
 mutex_.Lock();
 if (options.snapshot != nullptr) {
@@ -2583,15 +2585,23 @@ Status DBImpl::GetImpl(const ReadOptions& options,
 // s is both in/out. When in, s could either be OK or MergeInProgress.
 // merge_operands will contain the sequence of merges in the latter case.
 LookupKey lkey(key, snapshot);
+BumpPerfTime(&perf_context.get_snapshot_time, &snapshot_timer);
 if (mem->Get(lkey, value, &s, &merge_operands, options_)) {
 // Done
 } else if (imm.Get(lkey, value, &s, &merge_operands, options_)) {
 // Done
 } else {
+StopWatchNano from_files_timer(env_, false);
+StartPerfTimer(&from_files_timer);
+
 current->Get(options, lkey, value, &s, &merge_operands, &stats,
 options_, value_found);
 have_stat_update = true;
+BumpPerfTime(&perf_context.get_from_output_files_time, &from_files_timer);
 }
+
+StopWatchNano post_process_timer(env_, false);
+StartPerfTimer(&post_process_timer);
 mutex_.Lock();

 if (!options_.disable_seek_compaction &&
@@ -2607,6 +2617,8 @@ Status DBImpl::GetImpl(const ReadOptions& options,
 // Note, tickers are atomic now - no lock protection needed any more.
 RecordTick(options_.statistics, NUMBER_KEYS_READ);
 RecordTick(options_.statistics, BYTES_READ, value->size());
+BumpPerfTime(&perf_context.get_post_process_time, &post_process_timer);
+
 return s;
 }

@@ -2615,6 +2627,8 @@ std::vector<Status> DBImpl::MultiGet(const ReadOptions& options,
 std::vector<std::string>* values) {

 StopWatch sw(env_, options_.statistics, DB_MULTIGET);
+StopWatchNano snapshot_timer(env_, false);
+StartPerfTimer(&snapshot_timer);
 SequenceNumber snapshot;
 mutex_.Lock();
 if (options.snapshot != nullptr) {
@@ -2646,6 +2660,7 @@ std::vector<Status> DBImpl::MultiGet(const ReadOptions& options,

 // Keep track of bytes that we read for statistics-recording later
 uint64_t bytesRead = 0;
+BumpPerfTime(&perf_context.get_snapshot_time, &snapshot_timer);

 // For each of the given keys, apply the entire "get" process as follows:
 // First look in the memtable, then in the immutable memtable (if any).
@@ -2672,6 +2687,8 @@ std::vector<Status> DBImpl::MultiGet(const ReadOptions& options,
 }

 // Post processing (decrement reference counts and record statistics)
+StopWatchNano post_process_timer(env_, false);
+StartPerfTimer(&post_process_timer);
 mutex_.Lock();
 if (!options_.disable_seek_compaction &&
 have_stat_update && current->UpdateStats(stats)) {
@@ -2686,6 +2703,7 @@ std::vector<Status> DBImpl::MultiGet(const ReadOptions& options,
 RecordTick(options_.statistics, NUMBER_MULTIGET_CALLS);
 RecordTick(options_.statistics, NUMBER_MULTIGET_KEYS_READ, numKeys);
 RecordTick(options_.statistics, NUMBER_MULTIGET_BYTES_READ, bytesRead);
+BumpPerfTime(&perf_context.get_post_process_time, &post_process_timer);

 return statList;
 }
@@ -2754,6 +2772,8 @@ Status DBImpl::Delete(const WriteOptions& options, const Slice& key) {
 }

 Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
+StopWatchNano pre_post_process_timer(env_, false);
+StartPerfTimer(&pre_post_process_timer);
 Writer w(&mutex_);
 w.batch = my_batch;
 w.sync = options.sync;
@@ -2800,12 +2820,13 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
 if (options.disableWAL) {
 flush_on_destroy_ = true;
 }
+BumpPerfTime(&perf_context.write_pre_and_post_process_time,
+             &pre_post_process_timer);

 if (!options.disableWAL) {
 StopWatchNano timer(env_);
 StartPerfTimer(&timer);
 status = log_->AddRecord(WriteBatchInternal::Contents(updates));
-BumpPerfTime(&perf_context.wal_write_time, &timer);
 if (status.ok() && options.sync) {
 if (options_.use_fsync) {
 StopWatch(env_, options_.statistics, WAL_FILE_SYNC_MICROS);
@@ -2815,10 +2836,14 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
 status = log_->file()->Sync();
 }
 }
+BumpPerfTime(&perf_context.write_wal_time, &timer);
 }
 if (status.ok()) {
+StopWatchNano write_memtable_timer(env_, false);
+StartPerfTimer(&write_memtable_timer);
 status = WriteBatchInternal::InsertInto(updates, mem_, &options_, this,
 options_.filter_deletes);
+BumpPerfTime(&perf_context.write_memtable_time, &write_memtable_timer);
 if (!status.ok()) {
 // Panic for in-memory corruptions
 // Note that existing logic was not sound. Any partial failure writing
@@ -2828,6 +2853,7 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
 }
 SetTickerCount(options_.statistics, SEQUENCE_NUMBER, last_sequence);
 }
+StartPerfTimer(&pre_post_process_timer);
 LogFlush(options_.info_log);
 mutex_.Lock();
 if (status.ok()) {
@@ -2855,6 +2881,8 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
 if (!writers_.empty()) {
 writers_.front()->cv.Signal();
 }
+BumpPerfTime(&perf_context.write_pre_and_post_process_time,
+             &pre_post_process_timer);
 return status;
 }
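The instrumentation above follows one repeating pattern: construct a StopWatchNano that does not start automatically, call StartPerfTimer() right before the region of interest, and call BumpPerfTime() afterwards to add the elapsed time to a counter in perf_context, so that when profiling is disabled the helpers should cost almost nothing. Adding another measurement later would presumably look like the sketch below, where my_new_stage_time is an invented counter name and the helper/header locations are assumptions inferred from their use in this diff.

#include "rocksdb/env.h"
#include "rocksdb/perf_context.h"
#include "util/perf_context_imp.h"  // StartPerfTimer / BumpPerfTime (assumed path)
#include "util/stop_watch.h"        // StopWatchNano (assumed path)

// Hypothetical: time one more stage and charge it to a new PerfContext field.
// Assumes a uint64_t my_new_stage_time member has been added to PerfContext.
void DoSomeStage(rocksdb::Env* env) {
  rocksdb::StopWatchNano timer(env, false /* don't auto-start */);
  StartPerfTimer(&timer);  // no-op unless the current perf level enables timing

  // ... the work being measured ...

  // Adds the elapsed nanoseconds to the thread-local counter.
  BumpPerfTime(&rocksdb::perf_context.my_new_stage_time, &timer);
}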

db/db_iter.cc (+20, -2)

@@ -102,7 +102,8 @@ class DBIter: public Iterator {
 virtual void SeekToLast();

 private:
-void FindNextUserEntry(bool skipping);
+inline void FindNextUserEntry(bool skipping);
+void FindNextUserEntryInternal(bool skipping);
 void FindPrevUserEntry();
 bool ParseKey(ParsedInternalKey* key);
 void MergeValuesNewToOld();
@@ -191,7 +192,15 @@ void DBIter::Next() {
 //
 // NOTE: In between, saved_key_ can point to a user key that has
 // a delete marker
-void DBIter::FindNextUserEntry(bool skipping) {
+inline void DBIter::FindNextUserEntry(bool skipping) {
+StopWatchNano timer(env_, false);
+StartPerfTimer(&timer);
+FindNextUserEntryInternal(skipping);
+BumpPerfTime(&perf_context.find_next_user_entry_time, &timer);
+}
+
+// Actual implementation of DBIter::FindNextUserEntry()
+void DBIter::FindNextUserEntryInternal(bool skipping) {
 // Loop until we hit an acceptable entry to yield
 assert(iter_->Valid());
 assert(direction_ == kForward);
@@ -429,7 +438,10 @@ void DBIter::Seek(const Slice& target) {
 saved_key_.clear();
 AppendInternalKey(
 &saved_key_, ParsedInternalKey(target, sequence_, kValueTypeForSeek));
+StopWatchNano internal_seek_timer(env_, false);
+StartPerfTimer(&internal_seek_timer);
 iter_->Seek(saved_key_);
+BumpPerfTime(&perf_context.seek_internal_seek_time, &internal_seek_timer);
 if (iter_->Valid()) {
 direction_ = kForward;
 ClearSavedValue();
@@ -442,7 +454,10 @@ void DBIter::Seek(const Slice& target) {
 void DBIter::SeekToFirst() {
 direction_ = kForward;
 ClearSavedValue();
+StopWatchNano internal_seek_timer(env_, false);
+StartPerfTimer(&internal_seek_timer);
 iter_->SeekToFirst();
+BumpPerfTime(&perf_context.seek_internal_seek_time, &internal_seek_timer);
 if (iter_->Valid()) {
 FindNextUserEntry(false /* not skipping */);
 } else {
@@ -461,7 +476,10 @@ void DBIter::SeekToLast() {

 direction_ = kReverse;
 ClearSavedValue();
+StopWatchNano internal_seek_timer(env_, false);
+StartPerfTimer(&internal_seek_timer);
 iter_->SeekToLast();
+BumpPerfTime(&perf_context.seek_internal_seek_time, &internal_seek_timer);
 FindPrevUserEntry();
 }
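For iterators, the timing is split in two: seek_internal_seek_time covers the underlying internal iterator's Seek/SeekToFirst/SeekToLast, while find_next_user_entry_time covers the pass that skips deletions and older versions to reach the next visible user key, now isolated in FindNextUserEntryInternal behind an inline timing wrapper. A sketch of reading both counters around a Seek(), under the same assumptions as the earlier Get() example:

#include <cstdio>
#include "rocksdb/db.h"
#include "rocksdb/perf_context.h"

// Sketch: see whether a Seek() spends its time in the internal seek
// or in skipping tombstones/old versions afterwards.
void ProfileSeek(rocksdb::DB* db, const rocksdb::Slice& target) {
  rocksdb::SetPerfLevel(rocksdb::kEnableTime);
  rocksdb::Iterator* it = db->NewIterator(rocksdb::ReadOptions());

  rocksdb::perf_context.Reset();
  it->Seek(target);

  std::fprintf(stderr, "internal seek: %llu, find next user entry: %llu\n",
               (unsigned long long)rocksdb::perf_context.seek_internal_seek_time,
               (unsigned long long)rocksdb::perf_context.find_next_user_entry_time);

  delete it;
  rocksdb::SetPerfLevel(rocksdb::kDisable);
}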

db/db_test.cc (+41, -0)

@@ -22,6 +22,7 @@
 #include "rocksdb/compaction_filter.h"
 #include "rocksdb/env.h"
 #include "rocksdb/table.h"
+#include "rocksdb/perf_context.h"
 #include "util/hash.h"
 #include "util/logging.h"
 #include "util/mutexlock.h"
@@ -1215,7 +1216,13 @@ TEST(DBTest, IterMulti) {
 ASSERT_EQ(IterStatus(iter), "a->va");
 iter->Seek("ax");
 ASSERT_EQ(IterStatus(iter), "b->vb");
+
+SetPerfLevel(kEnableTime);
+perf_context.Reset();
 iter->Seek("b");
+ASSERT_TRUE((int) perf_context.seek_internal_seek_time > 0);
+ASSERT_TRUE((int) perf_context.find_next_user_entry_time > 0);
+SetPerfLevel(kDisable);
 ASSERT_EQ(IterStatus(iter), "b->vb");
 iter->Seek("z");
 ASSERT_EQ(IterStatus(iter), "(invalid)");
@@ -1230,7 +1237,12 @@ TEST(DBTest, IterMulti) {
 // Switch from forward to reverse
 iter->SeekToFirst();
 iter->Next();
+SetPerfLevel(kEnableTime);
+perf_context.Reset();
 iter->Next();
+ASSERT_EQ(0, (int) perf_context.seek_internal_seek_time);
+ASSERT_TRUE((int) perf_context.find_next_user_entry_time > 0);
+SetPerfLevel(kDisable);
 iter->Prev();
 ASSERT_EQ(IterStatus(iter), "b->vb");

@@ -1590,22 +1602,42 @@ TEST(DBTest, NumImmutableMemTable) {

 std::string big_value(1000000, 'x');
 std::string num;
+SetPerfLevel(kEnableTime);;

 ASSERT_OK(dbfull()->Put(writeOpt, "k1", big_value));
 ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num));
 ASSERT_EQ(num, "0");
+perf_context.Reset();
+Get("k1");
+ASSERT_EQ(1, (int) perf_context.get_from_memtable_count);

 ASSERT_OK(dbfull()->Put(writeOpt, "k2", big_value));
 ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num));
 ASSERT_EQ(num, "1");
+perf_context.Reset();
+Get("k1");
+ASSERT_EQ(2, (int) perf_context.get_from_memtable_count);
+perf_context.Reset();
+Get("k2");
+ASSERT_EQ(1, (int) perf_context.get_from_memtable_count);

 ASSERT_OK(dbfull()->Put(writeOpt, "k3", big_value));
 ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num));
 ASSERT_EQ(num, "2");
+perf_context.Reset();
+Get("k2");
+ASSERT_EQ(2, (int) perf_context.get_from_memtable_count);
+perf_context.Reset();
+Get("k3");
+ASSERT_EQ(1, (int) perf_context.get_from_memtable_count);
+perf_context.Reset();
+Get("k1");
+ASSERT_EQ(3, (int) perf_context.get_from_memtable_count);

 dbfull()->Flush(FlushOptions());
 ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num));
 ASSERT_EQ(num, "0");
+SetPerfLevel(kDisable);
 } while (ChangeCompactOptions());
 }

@@ -1614,11 +1646,16 @@ TEST(DBTest, FLUSH) {
 Options options = CurrentOptions();
 WriteOptions writeOpt = WriteOptions();
 writeOpt.disableWAL = true;
+SetPerfLevel(kEnableTime);;
 ASSERT_OK(dbfull()->Put(writeOpt, "foo", "v1"));
 // this will now also flush the last 2 writes
 dbfull()->Flush(FlushOptions());
 ASSERT_OK(dbfull()->Put(writeOpt, "bar", "v1"));

+perf_context.Reset();
+Get("foo");
+ASSERT_TRUE((int) perf_context.get_from_output_files_time > 0);
+
 Reopen();
 ASSERT_EQ("v1", Get("foo"));
 ASSERT_EQ("v1", Get("bar"));
@@ -1630,7 +1667,9 @@ TEST(DBTest, FLUSH) {

 Reopen();
 ASSERT_EQ("v2", Get("bar"));
+perf_context.Reset();
 ASSERT_EQ("v2", Get("foo"));
+ASSERT_TRUE((int) perf_context.get_from_output_files_time > 0);

 writeOpt.disableWAL = false;
 ASSERT_OK(dbfull()->Put(writeOpt, "bar", "v3"));
@@ -1642,6 +1681,8 @@ TEST(DBTest, FLUSH) {
 // has WAL enabled.
 ASSERT_EQ("v3", Get("foo"));
 ASSERT_EQ("v3", Get("bar"));
+
+SetPerfLevel(kDisable);
 } while (ChangeCompactOptions());
 }
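The tests all use the same reset-then-assert rhythm around a single operation. Outside of tests, that rhythm could be packaged into a small RAII helper like the hypothetical PerfScope below; it is not part of RocksDB and relies only on SetPerfLevel, perf_context.Reset() and the counters exercised in this commit.

#include <cstdio>
#include "rocksdb/perf_context.h"

// Hypothetical convenience wrapper, not part of RocksDB.
class PerfScope {
 public:
  explicit PerfScope(const char* label) : label_(label) {
    rocksdb::SetPerfLevel(rocksdb::kEnableTime);
    rocksdb::perf_context.Reset();
  }
  ~PerfScope() {
    std::fprintf(stderr, "%s: memtable lookups=%llu, sst time=%llu\n", label_,
                 (unsigned long long)rocksdb::perf_context.get_from_memtable_count,
                 (unsigned long long)rocksdb::perf_context.get_from_output_files_time);
    rocksdb::SetPerfLevel(rocksdb::kDisable);
  }

 private:
  const char* label_;
};

// Usage: { PerfScope scope("Get k1"); db->Get(rocksdb::ReadOptions(), "k1", &value); }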
